-rwxr-xr-x | build.py                                        | 116
-rw-r--r-- | gcc-4.9/gcc/cfgloop.h                            | 11
-rw-r--r-- | gcc-4.9/gcc/config/aarch64/aarch64-protos.h      | 13
-rw-r--r-- | gcc-4.9/gcc/config/aarch64/aarch64.c             | 1340
-rw-r--r-- | gcc-4.9/gcc/config/aarch64/aarch64.md            | 67
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/pr65447.c  | 54
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-10.c  | 21
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-3.c   | 2
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-4.c   | 2
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-8.c   | 62
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-9.c   | 21
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.dg/vect/pr48052.c      | 26
-rw-r--r-- | gcc-4.9/gcc/tree-chrec.c                         | 104
-rw-r--r-- | gcc-4.9/gcc/tree-chrec.h                         | 4
-rw-r--r-- | gcc-4.9/gcc/tree-scalar-evolution.c              | 138
-rw-r--r-- | gcc-4.9/gcc/tree-scalar-evolution.h              | 2
-rw-r--r-- | gcc-4.9/gcc/tree-ssa-loop-ivopts.c               | 436
-rw-r--r-- | gcc-4.9/gcc/tree-ssa-loop-niter.c                | 290
-rw-r--r-- | gcc-4.9/gcc/tree-ssa-loop-niter.h                | 1
19 files changed, 2157 insertions, 553 deletions
diff --git a/build.py b/build.py
--- a/build.py
+++ b/build.py
@@ -14,129 +14,51 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+"""Builds GCC for Android."""
 from __future__ import print_function
 
-import argparse
-import multiprocessing
 import os
-import subprocess
-import sys
+import site
+site.addsitedir(os.path.join(os.path.dirname(__file__), '../../ndk/build/lib'))
 
-def android_path(path=''):
-    top = os.getenv('ANDROID_BUILD_TOP', '')
-    return os.path.realpath(os.path.join(top, path))
+import build_support
 
 
-def get_default_package_dir():
-    return android_path('out/ndk')
-
-
-def get_default_host():
-    if sys.platform in ('linux', 'linux2'):
-        return 'linux'
-    elif sys.platform == 'darwin':
-        return 'darwin'
-    else:
-        raise RuntimeError('Unsupported host: {}'.format(sys.platform))
-
-
-ALL_TOOLCHAINS = (
-    'arm-linux-androideabi', 'aarch64-linux-android',
-    'mipsel-linux-android', 'mips64el-linux-android',
-    'x86', 'x86_64',
-)
-
-
-class ArgParser(argparse.ArgumentParser):
+class ArgParser(build_support.ArgParser):
     def __init__(self):
-        super(ArgParser, self).__init__(
-            description='Builds GCC for Android.')
+        super(ArgParser, self).__init__()
 
         self.add_argument(
-            '--host', choices=('darwin', 'linux', 'windows', 'windows64'),
-            help='Build binaries for given OS (e.g. linux).')
-        self.add_argument(
-            '--package-dir', help='Directory to place the packaged GCC.',
-            default=get_default_package_dir())
-        self.add_argument(
-            '--toolchain', choices=ALL_TOOLCHAINS,
+            '--toolchain', choices=build_support.ALL_TOOLCHAINS,
             help='Toolchain to build. Builds all if not present.')
 
 
-def toolchain_to_arch(toolchain):
-    return {
-        'arm-linux-androideabi': 'arm',
-        'aarch64-linux-android': 'arm64',
-        'mipsel-linux-android': 'mips',
-        'mips64el-linux-android': 'mips64',
-        'x86': 'x86',
-        'x86_64': 'x86_64',
-    }[toolchain]
-
-
-def default_api_level(arch):
-    if '64' in arch:
-        return 21
-    else:
-        return 9
-
-
-def sysroot_path(toolchain):
-    arch = toolchain_to_arch(toolchain)
-    version = default_api_level(arch)
-
-    prebuilt_ndk = 'prebuilts/ndk/current'
-    sysroot_subpath = 'platforms/android-{}/arch-{}'.format(version, arch)
-    return android_path(os.path.join(prebuilt_ndk, sysroot_subpath))
-
-
-def main():
-    args = ArgParser().parse_args()
-
-    if 'ANDROID_BUILD_TOP' not in os.environ:
-        top = os.path.join(os.path.dirname(__file__), '../..')
-        os.environ['ANDROID_BUILD_TOP'] = os.path.realpath(top)
-
-    os.chdir(android_path('toolchain/gcc'))
-
-    toolchain_path = android_path('toolchain')
-    ndk_path = android_path('ndk')
-
+def main(args):
     GCC_VERSION = '4.9'
 
-    jobs_arg = '-j{}'.format(multiprocessing.cpu_count() * 2)
-
-    package_dir_arg = '--package-dir={}'.format(args.package_dir)
-
-    toolchains = ALL_TOOLCHAINS
+    toolchains = build_support.ALL_TOOLCHAINS
     if args.toolchain is not None:
         toolchains = [args.toolchain]
 
-    host = args.host
-    if host is None:
-        host = get_default_host()
-
-    ndk_build_tools_path = android_path('ndk/build/tools')
-    build_env = dict(os.environ)
-    build_env['NDK_BUILDTOOLS_PATH'] = ndk_build_tools_path
-    build_env['ANDROID_NDK_ROOT'] = ndk_path
-
-    print('Building {} toolchains: {}'.format(host, ' '.join(toolchains)))
+    print('Building {} toolchains: {}'.format(args.host, ' '.join(toolchains)))
     for toolchain in toolchains:
         toolchain_name = '-'.join([toolchain, GCC_VERSION])
-        sysroot_arg = '--sysroot={}'.format(sysroot_path(toolchain))
+        sysroot_arg = '--sysroot={}'.format(
+            build_support.sysroot_path(toolchain))
         build_cmd = [
-            'bash', 'build-gcc.sh', toolchain_path, ndk_path, toolchain_name,
-            package_dir_arg, '--verbose', jobs_arg, sysroot_arg,
+            'bash', 'build-gcc.sh', build_support.toolchain_path(),
+            build_support.ndk_path(), toolchain_name, build_support.jobs_arg(),
+            sysroot_arg,
        ]
 
-        if host in ('windows', 'windows64'):
+        if args.host in ('windows', 'windows64'):
             build_cmd.append('--mingw')
 
-        if host != 'windows':
+        if args.host != 'windows':
             build_cmd.append('--try-64')
 
-        subprocess.check_call(build_cmd, env=build_env)
+        build_support.build(build_cmd, args)
 
 
 if __name__ == '__main__':
-    main()
+    build_support.run(main, ArgParser)
diff --git a/gcc-4.9/gcc/cfgloop.h b/gcc-4.9/gcc/cfgloop.h
index c7e417bf2..11bfc810f 100644
--- a/gcc-4.9/gcc/cfgloop.h
+++ b/gcc-4.9/gcc/cfgloop.h
@@ -100,6 +100,14 @@ enum loop_estimation
   EST_LAST
 };
 
+/* The structure describing non-overflow control induction variable for
+   loop's exit edge.  */
+struct GTY ((chain_next ("%h.next"))) control_iv {
+  tree base;
+  tree step;
+  struct control_iv *next;
+};
+
 /* Structure to hold information for each natural loop.  */
 struct GTY ((chain_next ("%h.next"))) loop {
   /* Index into loops array.  */
@@ -187,6 +195,9 @@ struct GTY ((chain_next ("%h.next"))) loop {
   /* Upper bound on number of iterations of a loop.  */
   struct nb_iter_bound *bounds;
 
+  /* Non-overflow control ivs of a loop.  */
+  struct control_iv *control_ivs;
+
   /* Head of the cyclic list of the exits of the loop.  */
   struct loop_exit *exits;
 
diff --git a/gcc-4.9/gcc/config/aarch64/aarch64-protos.h b/gcc-4.9/gcc/config/aarch64/aarch64-protos.h
index 8b0a70538..e78348ea0 100644
--- a/gcc-4.9/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc-4.9/gcc/config/aarch64/aarch64-protos.h
@@ -108,9 +108,22 @@ enum aarch64_symbol_type
    cost models and vectors for address cost calculations, register
    move costs and memory move costs.  */
 
+/* Scaled addressing modes can vary cost depending on the mode of the
+   value to be loaded/stored.  QImode values cannot use scaled
+   addressing modes.  */
+
+struct scale_addr_mode_cost
+{
+  const int hi;
+  const int si;
+  const int di;
+  const int ti;
+};
+
 /* Additional cost for addresses.  */
 struct cpu_addrcost_table
 {
+  const struct scale_addr_mode_cost addr_scale_costs;
   const int pre_modify;
   const int post_modify;
   const int register_offset;
diff --git a/gcc-4.9/gcc/config/aarch64/aarch64.c b/gcc-4.9/gcc/config/aarch64/aarch64.c
index f9e8c4067..09f2d2c5d 100644
--- a/gcc-4.9/gcc/config/aarch64/aarch64.c
+++ b/gcc-4.9/gcc/config/aarch64/aarch64.c
@@ -141,6 +141,7 @@ static bool aarch64_const_vec_all_same_int_p (rtx,
 static bool aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
                                                  const unsigned char *sel);
+static int aarch64_address_cost (rtx, enum machine_mode, addr_space_t, bool);
 
 /* The processor for which instructions should be scheduled.  */
 enum aarch64_processor aarch64_tune = cortexa53;
@@ -171,6 +172,15 @@ __extension__
 #endif
 static const struct cpu_addrcost_table generic_addrcost_table =
 {
+#if HAVE_DESIGNATED_INITIALIZERS
+  .addr_scale_costs =
+#endif
+    {
+      NAMED_PARAM (hi, 0),
+      NAMED_PARAM (si, 0),
+      NAMED_PARAM (di, 0),
+      NAMED_PARAM (ti, 0),
+    },
   NAMED_PARAM (pre_modify, 0),
   NAMED_PARAM (post_modify, 0),
   NAMED_PARAM (register_offset, 0),
@@ -181,6 +191,27 @@ static const struct cpu_addrcost_table generic_addrcost_table =
 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
 __extension__
 #endif
+static const struct cpu_addrcost_table cortexa57_addrcost_table =
+{
+#if HAVE_DESIGNATED_INITIALIZERS
+  .addr_scale_costs =
+#endif
+    {
+      NAMED_PARAM (hi, 1),
+      NAMED_PARAM (si, 0),
+      NAMED_PARAM (di, 0),
+      NAMED_PARAM (ti, 1),
+    },
+  NAMED_PARAM (pre_modify, 0),
+  NAMED_PARAM (post_modify, 0),
+  NAMED_PARAM (register_offset, 0),
+  NAMED_PARAM (register_extend, 0),
+  NAMED_PARAM (imm_offset, 0),
+};
+
+#if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
+__extension__
+#endif
 static const struct cpu_regmove_cost generic_regmove_cost =
 {
   NAMED_PARAM (GP2GP, 1),
@@ -214,6 +245,26 @@ static const struct cpu_vector_cost generic_vector_cost =
   NAMED_PARAM (cond_not_taken_branch_cost, 1)
 };
 
+/* Generic costs for vector insn classes.  */
+#if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
+__extension__
+#endif
+static const struct cpu_vector_cost cortexa57_vector_cost =
+{
+  NAMED_PARAM (scalar_stmt_cost, 1),
+  NAMED_PARAM (scalar_load_cost, 4),
+  NAMED_PARAM (scalar_store_cost, 1),
+  NAMED_PARAM (vec_stmt_cost, 3),
+  NAMED_PARAM (vec_to_scalar_cost, 8),
+  NAMED_PARAM (scalar_to_vec_cost, 8),
+  NAMED_PARAM (vec_align_load_cost, 5),
+  NAMED_PARAM (vec_unalign_load_cost, 5),
+  NAMED_PARAM (vec_unalign_store_cost, 1),
+  NAMED_PARAM (vec_store_cost, 1),
+  NAMED_PARAM (cond_taken_branch_cost, 1),
+  NAMED_PARAM (cond_not_taken_branch_cost, 1)
+};
+
 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
 __extension__
 #endif
@@ -240,9 +291,9 @@ static const struct tune_params cortexa53_tunings =
 static const struct tune_params cortexa57_tunings =
 {
   &cortexa57_extra_costs,
-  &generic_addrcost_table,
+  &cortexa57_addrcost_table,
   &generic_regmove_cost,
-  &generic_vector_cost,
+  &cortexa57_vector_cost,
   NAMED_PARAM (memmov_cost, 4),
   NAMED_PARAM (issue_rate, 3)
 };
@@ -446,7 +497,7 @@ aarch64_is_long_call_p (rtx sym)
    represent an expression that matches an extend operation.  The
    operands represent the parameters from
 
-   (extract (mult (reg) (mult_imm)) (extract_imm) (const_int 0)).  */
+   (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
 bool
 aarch64_is_extend_from_extract (enum machine_mode mode, rtx mult_imm,
                                 rtx extract_imm)
@@ -2463,12 +2514,22 @@ aarch64_final_eh_return_addr (void)
                                        - 2 * UNITS_PER_WORD));
 }
 
-/* Output code to build up a constant in a register.  */
-static void
-aarch64_build_constant (int regnum, HOST_WIDE_INT val)
+/* Possibly output code to build up a constant in a register.  For
+   the benefit of the costs infrastructure, returns the number of
+   instructions which would be emitted.  GENERATE inhibits or
+   enables code generation.  */
+
+static int
+aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
 {
+  int insns = 0;
+
   if (aarch64_bitmask_imm (val, DImode))
-    emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
+    {
+      if (generate)
+        emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
+      insns = 1;
+    }
   else
     {
       int i;
@@ -2499,15 +2560,19 @@ aarch64_build_constant (int regnum, HOST_WIDE_INT val)
          the same.  */
       if (ncount < zcount)
         {
-          emit_move_insn (gen_rtx_REG (Pmode, regnum),
-                          GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
+          if (generate)
+            emit_move_insn (gen_rtx_REG (Pmode, regnum),
+                            GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
           tval = 0xffff;
+          insns++;
         }
       else
         {
-          emit_move_insn (gen_rtx_REG (Pmode, regnum),
-                          GEN_INT (val & 0xffff));
+          if (generate)
+            emit_move_insn (gen_rtx_REG (Pmode, regnum),
+                            GEN_INT (val & 0xffff));
           tval = 0;
+          insns++;
         }
 
       val >>= 16;
@@ -2515,11 +2580,17 @@ aarch64_build_constant (int regnum, HOST_WIDE_INT val)
       for (i = 16; i < 64; i += 16)
         {
           if ((val & 0xffff) != tval)
-            emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
-                                       GEN_INT (i), GEN_INT (val & 0xffff)));
+            {
+              if (generate)
+                emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
+                                           GEN_INT (i),
+                                           GEN_INT (val & 0xffff)));
+              insns++;
+            }
           val >>= 16;
         }
     }
+  return insns;
 }
 
 static void
@@ -2534,7 +2605,7 @@ aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
 
   if (mdelta >= 4096 * 4096)
     {
-      aarch64_build_constant (scratchreg, delta);
+      (void) aarch64_build_constant (scratchreg, delta, true);
       emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
     }
   else if (mdelta > 0)
@@ -2608,7 +2679,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
         addr = plus_constant (Pmode, temp0, vcall_offset);
       else
         {
-          aarch64_build_constant (IP1_REGNUM, vcall_offset);
+          (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
         }
 
@@ -4471,12 +4542,12 @@ aarch64_strip_shift (rtx x)
   return x;
 }
 
-/* Helper function for rtx cost calculation.  Strip a shift or extend
+/* Helper function for rtx cost calculation.  Strip an extend
    expression from X.  Returns the inner operand if successful, or the
    original expression on failure.  We deal with a number of possible
    canonicalization variations here.  */
 static rtx
-aarch64_strip_shift_or_extend (rtx x)
+aarch64_strip_extend (rtx x)
 {
   rtx op = x;
 
@@ -4512,7 +4583,246 @@ aarch64_strip_shift_or_extend (rtx x)
   if (op != x)
     return op;
 
-  return aarch64_strip_shift (x);
+  return x;
+}
+
+/* Helper function for rtx cost calculation.  Calculate the cost of
+   a MULT, which may be part of a multiply-accumulate rtx.  Return
+   the calculated cost of the expression, recursing manually into
+   operands where needed.  */
+
+static int
+aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
+{
+  rtx op0, op1;
+  const struct cpu_cost_table *extra_cost
+    = aarch64_tune_params->insn_extra_cost;
+  int cost = 0;
+  bool maybe_fma = (outer == PLUS || outer == MINUS);
+  enum machine_mode mode = GET_MODE (x);
+
+  gcc_checking_assert (code == MULT);
+
+  op0 = XEXP (x, 0);
+  op1 = XEXP (x, 1);
+
+  if (VECTOR_MODE_P (mode))
+    mode = GET_MODE_INNER (mode);
+
+  /* Integer multiply/fma.  */
+  if (GET_MODE_CLASS (mode) == MODE_INT)
+    {
+      /* The multiply will be canonicalized as a shift, cost it as such.  */
+      if (CONST_INT_P (op1)
+          && exact_log2 (INTVAL (op1)) > 0)
+        {
+          if (speed)
+            {
+              if (maybe_fma)
+                /* ADD (shifted register).  */
+                cost += extra_cost->alu.arith_shift;
+              else
+                /* LSL (immediate).  */
+                cost += extra_cost->alu.shift;
+            }
+
+          cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
+
+          return cost;
+        }
+
+      /* Integer multiplies or FMAs have zero/sign extending variants.  */
+      if ((GET_CODE (op0) == ZERO_EXTEND
+           && GET_CODE (op1) == ZERO_EXTEND)
+          || (GET_CODE (op0) == SIGN_EXTEND
+              && GET_CODE (op1) == SIGN_EXTEND))
+        {
+          cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
+                  + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
+
+          if (speed)
+            {
+              if (maybe_fma)
+                /* MADD/SMADDL/UMADDL.  */
+                cost += extra_cost->mult[0].extend_add;
+              else
+                /* MUL/SMULL/UMULL.  */
+                cost += extra_cost->mult[0].extend;
+            }
+
+          return cost;
+        }
+
+      /* This is either an integer multiply or an FMA.  In both cases
+         we want to recurse and cost the operands.  */
+      cost += rtx_cost (op0, MULT, 0, speed)
+              + rtx_cost (op1, MULT, 1, speed);
+
+      if (speed)
+        {
+          if (maybe_fma)
+            /* MADD.  */
+            cost += extra_cost->mult[mode == DImode].add;
+          else
+            /* MUL.  */
+            cost += extra_cost->mult[mode == DImode].simple;
+        }
+
+      return cost;
+    }
+  else
+    {
+      if (speed)
+        {
+          /* Floating-point FMA can also support negations of the
+             operands.  */
+          if (GET_CODE (op0) == NEG)
+            {
+              maybe_fma = true;
+              op0 = XEXP (op0, 0);
+            }
+          if (GET_CODE (op1) == NEG)
+            {
+              maybe_fma = true;
+              op1 = XEXP (op1, 0);
+            }
+
+          if (maybe_fma)
+            /* FMADD/FNMADD/FNMSUB/FMSUB.  */
+            cost += extra_cost->fp[mode == DFmode].fma;
+          else
+            /* FMUL.  */
+            cost += extra_cost->fp[mode == DFmode].mult;
+        }
+
+      cost += rtx_cost (op0, MULT, 0, speed)
+              + rtx_cost (op1, MULT, 1, speed);
+      return cost;
+    }
+}
+
+static int
+aarch64_address_cost (rtx x,
+                      enum machine_mode mode,
+                      addr_space_t as ATTRIBUTE_UNUSED,
+                      bool speed)
+{
+  enum rtx_code c = GET_CODE (x);
+  const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
+  struct aarch64_address_info info;
+  int cost = 0;
+  info.shift = 0;
+
+  if (!aarch64_classify_address (&info, x, mode, c, false))
+    {
+      if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
+        {
+          /* This is a CONST or SYMBOL ref which will be split
+             in a different way depending on the code model in use.
+             Cost it through the generic infrastructure.  */
+          int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
+          /* Divide through by the cost of one instruction to
+             bring it to the same units as the address costs.  */
+          cost_symbol_ref /= COSTS_N_INSNS (1);
+          /* The cost is then the cost of preparing the address,
+             followed by an immediate (possibly 0) offset.  */
+          return cost_symbol_ref + addr_cost->imm_offset;
+        }
+      else
+        {
+          /* This is most likely a jump table from a case
+             statement.  */
+          return addr_cost->register_offset;
+        }
+    }
+
+  switch (info.type)
+    {
+      case ADDRESS_LO_SUM:
+      case ADDRESS_SYMBOLIC:
+      case ADDRESS_REG_IMM:
+        cost += addr_cost->imm_offset;
+        break;
+
+      case ADDRESS_REG_WB:
+        if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
+          cost += addr_cost->pre_modify;
+        else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
+          cost += addr_cost->post_modify;
+        else
+          gcc_unreachable ();
+
+        break;
+
+      case ADDRESS_REG_REG:
+        cost += addr_cost->register_offset;
+        break;
+
+      case ADDRESS_REG_UXTW:
+      case ADDRESS_REG_SXTW:
+        cost += addr_cost->register_extend;
+        break;
+
+      default:
+        gcc_unreachable ();
+    }
+
+
+  if (info.shift > 0)
+    {
+      /* For the sake of calculating the cost of the shifted register
+         component, we can treat same sized modes in the same way.  */
+      switch (GET_MODE_BITSIZE (mode))
+        {
+          case 16:
+            cost += addr_cost->addr_scale_costs.hi;
+            break;
+
+          case 32:
+            cost += addr_cost->addr_scale_costs.si;
+            break;
+
+          case 64:
+            cost += addr_cost->addr_scale_costs.di;
+            break;
+
+          /* We can't tell, or this is a 128-bit vector.  */
+          default:
+            cost += addr_cost->addr_scale_costs.ti;
+            break;
+        }
+    }
+
+  return cost;
+}
+
+/* Return true if the RTX X in mode MODE is a zero or sign extract
+   usable in an ADD or SUB (extended register) instruction.  */
+static bool
+aarch64_rtx_arith_op_extract_p (rtx x, enum machine_mode mode)
+{
+  /* Catch add with a sign extract.
+     This is add_<optab><mode>_multp2.  */
+  if (GET_CODE (x) == SIGN_EXTRACT
+      || GET_CODE (x) == ZERO_EXTRACT)
+    {
+      rtx op0 = XEXP (x, 0);
+      rtx op1 = XEXP (x, 1);
+      rtx op2 = XEXP (x, 2);
+
+      if (GET_CODE (op0) == MULT
+          && CONST_INT_P (op1)
+          && op2 == const0_rtx
+          && CONST_INT_P (XEXP (op0, 1))
+          && aarch64_is_extend_from_extract (mode,
+                                             XEXP (op0, 1),
+                                             op1))
+        {
+          return true;
+        }
+    }
+
+  return false;
 }
 
 /* Calculate the cost of calculating X, storing it in *COST.  Result
@@ -4521,13 +4831,31 @@ static bool
 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
 {
-  rtx op0, op1;
+  rtx op0, op1, op2;
   const struct cpu_cost_table *extra_cost
     = aarch64_tune_params->insn_extra_cost;
+  enum machine_mode mode = GET_MODE (x);
+
+  /* By default, assume that everything has equivalent cost to the
+     cheapest instruction.  Any additional costs are applied as a delta
+     above this default.  */
+  *cost = COSTS_N_INSNS (1);
+
+  /* TODO: The cost infrastructure currently does not handle
+     vector operations.  Assume that all vector operations
+     are equally expensive.  */
+  if (VECTOR_MODE_P (mode))
+    {
+      if (speed)
+        *cost += extra_cost->vect.alu;
+      return true;
+    }
 
   switch (code)
     {
     case SET:
+      /* The cost depends entirely on the operands to SET.  */
+      *cost = 0;
       op0 = SET_DEST (x);
       op1 = SET_SRC (x);
 
@@ -4535,25 +4863,47 @@ aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
        {
        case MEM:
          if (speed)
-           *cost += extra_cost->ldst.store;
+           {
+             rtx address = XEXP (op0, 0);
+             if (GET_MODE_CLASS (mode) == MODE_INT)
+               *cost += extra_cost->ldst.store;
+             else if (mode == SFmode)
+               *cost += extra_cost->ldst.storef;
+             else if (mode == DFmode)
+               *cost += extra_cost->ldst.stored;
+
+             *cost +=
+               COSTS_N_INSNS (aarch64_address_cost (address, mode,
+                                                    0, speed));
+           }
 
-         if (op1 != const0_rtx)
-           *cost += rtx_cost (op1, SET, 1, speed);
+         *cost += rtx_cost (op1, SET, 1, speed);
          return true;
 
        case SUBREG:
          if (! REG_P (SUBREG_REG (op0)))
            *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
+
          /* Fall through.  */
        case REG:
-         /* Cost is just the cost of the RHS of the set.  */
-         *cost += rtx_cost (op1, SET, 1, true);
+         /* const0_rtx is in general free, but we will use an
+            instruction to set a register to 0.  */
+         if (REG_P (op1) || op1 == const0_rtx)
+           {
+             /* The cost is 1 per register copied.  */
+             int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
+                             / UNITS_PER_WORD;
+             *cost = COSTS_N_INSNS (n_minus_1 + 1);
+           }
+         else
+           /* Cost is just the cost of the RHS of the set.  */
+           *cost += rtx_cost (op1, SET, 1, speed);
          return true;
 
-       case ZERO_EXTRACT:  /* Bit-field insertion.  */
+       case ZERO_EXTRACT:
        case SIGN_EXTRACT:
-         /* Strip any redundant widening of the RHS to meet the width of
-            the target.  */
+         /* Bit-field insertion.  Strip any redundant widening of
+            the RHS to meet the width of the target.  */
          if (GET_CODE (op1) == SUBREG)
            op1 = SUBREG_REG (op1);
          if ((GET_CODE (op1) == ZERO_EXTEND
@@ -4562,24 +4912,138 @@ aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
              && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
                  >= INTVAL (XEXP (op0, 1))))
            op1 = XEXP (op1, 0);
-         *cost += rtx_cost (op1, SET, 1, speed);
+
+         if (CONST_INT_P (op1))
+           {
+             /* MOV immediate is assumed to always be cheap.  */
+             *cost = COSTS_N_INSNS (1);
+           }
+         else
+           {
+             /* BFM.  */
+             if (speed)
+               *cost += extra_cost->alu.bfi;
+             *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
+           }
+
          return true;
 
        default:
+         /* We can't make sense of this, assume default cost.  */
+         *cost = COSTS_N_INSNS (1);
          break;
        }
      return false;
 
+    case CONST_INT:
+      /* If an instruction can incorporate a constant within the
+         instruction, the instruction's expression avoids calling
+         rtx_cost() on the constant.  If rtx_cost() is called on a
+         constant, then it is usually because the constant must be
+         moved into a register by one or more instructions.
+
+         The exception is constant 0, which can be expressed
+         as XZR/WZR and is therefore free.  The exception to this is
+         if we have (set (reg) (const0_rtx)) in which case we must cost
+         the move.  However, we can catch that when we cost the SET, so
+         we don't need to consider that here.  */
+      if (x == const0_rtx)
+        *cost = 0;
+      else
+        {
+          /* To an approximation, building any other constant is
+             proportionally expensive to the number of instructions
+             required to build that constant.  This is true whether we
+             are compiling for SPEED or otherwise.  */
+          *cost = COSTS_N_INSNS (aarch64_build_constant (0,
+                                                         INTVAL (x),
+                                                         false));
+        }
+      return true;
+
+    case CONST_DOUBLE:
+      if (speed)
+        {
+          /* mov[df,sf]_aarch64.  */
+          if (aarch64_float_const_representable_p (x))
+            /* FMOV (scalar immediate).  */
+            *cost += extra_cost->fp[mode == DFmode].fpconst;
+          else if (!aarch64_float_const_zero_rtx_p (x))
+            {
+              /* This will be a load from memory.  */
+              if (mode == DFmode)
+                *cost += extra_cost->ldst.loadd;
+              else
+                *cost += extra_cost->ldst.loadf;
+            }
+          else
+            /* Otherwise this is +0.0.  We get this using MOVI d0, #0
+               or MOV v0.s[0], wzr - neither of which are modeled by the
+               cost tables.  Just use the default cost.  */
+            {
+            }
+        }
+
+      return true;
+
     case MEM:
       if (speed)
-       *cost += extra_cost->ldst.load;
+        {
+          /* For loads we want the base cost of a load, plus an
+             approximation for the additional cost of the addressing
+             mode.  */
+          rtx address = XEXP (x, 0);
+          if (GET_MODE_CLASS (mode) == MODE_INT)
+            *cost += extra_cost->ldst.load;
+          else if (mode == SFmode)
+            *cost += extra_cost->ldst.loadf;
+          else if (mode == DFmode)
+            *cost += extra_cost->ldst.loadd;
+
+          *cost +=
+            COSTS_N_INSNS (aarch64_address_cost (address, mode,
+                                                 0, speed));
+        }
 
       return true;
 
     case NEG:
-      op0 = CONST0_RTX (GET_MODE (x));
-      op1 = XEXP (x, 0);
-      goto cost_minus;
+      op0 = XEXP (x, 0);
+
+      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
+        {
+          if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
+              || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
+            {
+              /* CSETM.  */
+              *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
+              return true;
+            }
+
+          /* Cost this as SUB wzr, X.  */
+          op0 = CONST0_RTX (GET_MODE (x));
+          op1 = XEXP (x, 0);
+          goto cost_minus;
+        }
+
+      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
+        {
+          /* Support (neg(fma...)) as a single instruction only if
+             sign of zeros is unimportant.  This matches the decision
+             making in aarch64.md.  */
+          if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
+            {
+              /* FNMADD.  */
+              *cost = rtx_cost (op0, NEG, 0, speed);
+              return true;
+            }
+          if (speed)
+            /* FNEG.  */
+            *cost += extra_cost->fp[mode == DFmode].neg;
+          return false;
+        }
+
+      return false;
 
     case COMPARE:
       op0 = XEXP (x, 0);
@@ -4592,94 +5056,207 @@ aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
          goto cost_logic;
        }
 
-      /* Comparisons can work if the order is swapped.
-        Canonicalization puts the more complex operation first, but
-        we want it in op1.  */
-      if (! (REG_P (op0)
-            || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
-       {
-         op0 = XEXP (x, 1);
-         op1 = XEXP (x, 0);
-       }
-      goto cost_minus;
+      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
+        {
+          /* TODO: A write to the CC flags possibly costs extra, this
+             needs encoding in the cost tables.  */
 
-    case MINUS:
-      op0 = XEXP (x, 0);
-      op1 = XEXP (x, 1);
+          /* CC_ZESWPmode supports zero extend for free.  */
+          if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
+            op0 = XEXP (op0, 0);
 
-    cost_minus:
-      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT
-         || (GET_MODE_CLASS (GET_MODE (x)) == MODE_CC
-             && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
-       {
-         if (op0 != const0_rtx)
-           *cost += rtx_cost (op0, MINUS, 0, speed);
+          /* ANDS.  */
+          if (GET_CODE (op0) == AND)
+            {
+              x = op0;
+              goto cost_logic;
+            }
 
-         if (CONST_INT_P (op1))
-           {
-             if (!aarch64_uimm12_shift (INTVAL (op1)))
-               *cost += rtx_cost (op1, MINUS, 1, speed);
-           }
-         else
-           {
-             op1 = aarch64_strip_shift_or_extend (op1);
-             *cost += rtx_cost (op1, MINUS, 1, speed);
-           }
-         return true;
-       }
+          if (GET_CODE (op0) == PLUS)
+            {
+              /* ADDS (and CMN alias).  */
+              x = op0;
+              goto cost_plus;
+            }
+
+          if (GET_CODE (op0) == MINUS)
+            {
+              /* SUBS.  */
+              x = op0;
+              goto cost_minus;
+            }
+
+          if (GET_CODE (op1) == NEG)
+            {
+              /* CMN.  */
+              if (speed)
+                *cost += extra_cost->alu.arith;
+
+              *cost += rtx_cost (op0, COMPARE, 0, speed);
+              *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
+              return true;
+            }
+
+          /* CMP.
+
+             Compare can freely swap the order of operands, and
+             canonicalization puts the more complex operation first.
+             But the integer MINUS logic expects the shift/extend
+             operation in op1.  */
+          if (! (REG_P (op0)
+                 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
+            {
+              op0 = XEXP (x, 1);
+              op1 = XEXP (x, 0);
+            }
+          goto cost_minus;
+        }
+
+      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
+        {
+          /* FCMP.  */
+          if (speed)
+            *cost += extra_cost->fp[mode == DFmode].compare;
+
+          if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
+            {
+              *cost += rtx_cost (op0, COMPARE, 0, speed);
+              /* FCMP supports constant 0.0 for no extra cost.  */
+              return true;
+            }
+          return false;
+        }
 
       return false;
 
+    case MINUS:
+      {
+        op0 = XEXP (x, 0);
+        op1 = XEXP (x, 1);
+
+cost_minus:
+        *cost += rtx_cost (op0, MINUS, 0, speed);
+
+        /* Detect valid immediates.  */
+        if ((GET_MODE_CLASS (mode) == MODE_INT
+             || (GET_MODE_CLASS (mode) == MODE_CC
+                 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
+            && CONST_INT_P (op1)
+            && aarch64_uimm12_shift (INTVAL (op1)))
+          {
+            if (speed)
+              /* SUB(S) (immediate).  */
+              *cost += extra_cost->alu.arith;
+            return true;
+          }
+
+        /* Look for SUB (extended register).  */
+        if (aarch64_rtx_arith_op_extract_p (op1, mode))
+          {
+            if (speed)
+              *cost += extra_cost->alu.arith_shift;
+
+            *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
+                               (enum rtx_code) GET_CODE (op1),
+                               0, speed);
+            return true;
+          }
+
+        rtx new_op1 = aarch64_strip_extend (op1);
+
+        /* Cost this as an FMA-alike operation.  */
+        if ((GET_CODE (new_op1) == MULT
+             || GET_CODE (new_op1) == ASHIFT)
+            && code != COMPARE)
+          {
+            *cost += aarch64_rtx_mult_cost (new_op1, MULT,
+                                            (enum rtx_code) code,
+                                            speed);
+            return true;
+          }
+
+        *cost += rtx_cost (new_op1, MINUS, 1, speed);
+
+        if (speed)
+          {
+            if (GET_MODE_CLASS (mode) == MODE_INT)
+              /* SUB(S).  */
+              *cost += extra_cost->alu.arith;
+            else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
+              /* FSUB.  */
+              *cost += extra_cost->fp[mode == DFmode].addsub;
+          }
+        return true;
+      }
+
     case PLUS:
-      op0 = XEXP (x, 0);
-      op1 = XEXP (x, 1);
+      {
+        rtx new_op0;
 
-      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
-       {
-         if (CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
-           {
-             *cost += rtx_cost (op0, PLUS, 0, speed);
-           }
-         else
-           {
-             rtx new_op0 = aarch64_strip_shift_or_extend (op0);
+        op0 = XEXP (x, 0);
+        op1 = XEXP (x, 1);
 
-             if (new_op0 == op0
-                 && GET_CODE (op0) == MULT)
-               {
-                 if ((GET_CODE (XEXP (op0, 0)) == ZERO_EXTEND
-                      && GET_CODE (XEXP (op0, 1)) == ZERO_EXTEND)
-                     || (GET_CODE (XEXP (op0, 0)) == SIGN_EXTEND
-                         && GET_CODE (XEXP (op0, 1)) == SIGN_EXTEND))
-                   {
-                     *cost += (rtx_cost (XEXP (XEXP (op0, 0), 0), MULT, 0,
-                                         speed)
-                               + rtx_cost (XEXP (XEXP (op0, 1), 0), MULT, 1,
-                                           speed)
-                               + rtx_cost (op1, PLUS, 1, speed));
-                     if (speed)
-                       *cost +=
-                         extra_cost->mult[GET_MODE (x) == DImode].extend_add;
-                     return true;
-                   }
+cost_plus:
+        if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
+            || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
+          {
+            /* CSINC.  */
+            *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
+            *cost += rtx_cost (op1, PLUS, 1, speed);
+            return true;
+          }
 
-                 *cost += (rtx_cost (XEXP (op0, 0), MULT, 0, speed)
-                           + rtx_cost (XEXP (op0, 1), MULT, 1, speed)
-                           + rtx_cost (op1, PLUS, 1, speed));
+        if (GET_MODE_CLASS (mode) == MODE_INT
+            && CONST_INT_P (op1)
+            && aarch64_uimm12_shift (INTVAL (op1)))
+          {
+            *cost += rtx_cost (op0, PLUS, 0, speed);
 
-                 if (speed)
-                   *cost += extra_cost->mult[GET_MODE (x) == DImode].add;
+            if (speed)
+              /* ADD (immediate).  */
+              *cost += extra_cost->alu.arith;
+            return true;
+          }
 
-                 return true;
-               }
+        *cost += rtx_cost (op1, PLUS, 1, speed);
 
-             *cost += (rtx_cost (new_op0, PLUS, 0, speed)
-                       + rtx_cost (op1, PLUS, 1, speed));
-           }
-         return true;
-       }
+        /* Look for ADD (extended register).  */
+        if (aarch64_rtx_arith_op_extract_p (op0, mode))
+          {
+            if (speed)
+              *cost += extra_cost->alu.arith_shift;
 
-      return false;
+            *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
+                               (enum rtx_code) GET_CODE (op0),
+                               0, speed);
+            return true;
+          }
+
+        /* Strip any extend, leave shifts behind as we will
+           cost them through mult_cost.  */
+        new_op0 = aarch64_strip_extend (op0);
+
+        if (GET_CODE (new_op0) == MULT
+            || GET_CODE (new_op0) == ASHIFT)
+          {
+            *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
+                                            speed);
+            return true;
+          }
+
+        *cost += rtx_cost (new_op0, PLUS, 0, speed);
+
+        if (speed)
+          {
+            if (GET_MODE_CLASS (mode) == MODE_INT)
+              /* ADD.  */
+              *cost += extra_cost->alu.arith;
+            else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
+              /* FADD.  */
+              *cost += extra_cost->fp[mode == DFmode].addsub;
+          }
+        return true;
+      }
 
     case IOR:
     case XOR:
@@ -4688,117 +5265,284 @@ aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
     case AND:
       op0 = XEXP (x, 0);
       op1 = XEXP (x, 1);
 
+      if (code == AND
+          && GET_CODE (op0) == MULT
+          && CONST_INT_P (XEXP (op0, 1))
+          && CONST_INT_P (op1)
+          && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
+                               INTVAL (op1)) != 0)
+        {
+          /* This is a UBFM/SBFM.  */
+          *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
+          if (speed)
+            *cost += extra_cost->alu.bfx;
+          return true;
+        }
+
       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
        {
+          /* We possibly get the immediate for free, this is not
+             modelled.  */
          if (CONST_INT_P (op1)
              && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
            {
-             *cost += rtx_cost (op0, AND, 0, speed);
+              *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
+
+              if (speed)
+                *cost += extra_cost->alu.logical;
+
+              return true;
            }
          else
            {
+              rtx new_op0 = op0;
+
+              /* Handle ORN, EON, or BIC.  */
              if (GET_CODE (op0) == NOT)
                op0 = XEXP (op0, 0);
-             op0 = aarch64_strip_shift (op0);
-             *cost += (rtx_cost (op0, AND, 0, speed)
-                       + rtx_cost (op1, AND, 1, speed));
+
+              new_op0 = aarch64_strip_shift (op0);
+
+              /* If we had a shift on op0 then this is a logical-shift-
+                 by-register/immediate operation.  Otherwise, this is just
+                 a logical operation.  */
+              if (speed)
+                {
+                  if (new_op0 != op0)
+                    {
+                      /* Shift by immediate.  */
+                      if (CONST_INT_P (XEXP (op0, 1)))
+                        *cost += extra_cost->alu.log_shift;
+                      else
+                        *cost += extra_cost->alu.log_shift_reg;
+                    }
+                  else
+                    *cost += extra_cost->alu.logical;
+                }
+
+              /* In both cases we want to cost both operands.  */
+              *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
+                       + rtx_cost (op1, (enum rtx_code) code, 1, speed);
+
+              return true;
            }
-         return true;
        }
       return false;
 
+    case NOT:
+      x = XEXP (x, 0);
+      op0 = aarch64_strip_shift (x);
+
+      /* MVN-shifted-reg.  */
+      if (op0 != x)
+        {
+          *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
+
+          if (speed)
+            *cost += extra_cost->alu.log_shift;
+
+          return true;
+        }
+      /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
+         Handle the second form here taking care that 'a' in the above can
+         be a shift.  */
+      else if (GET_CODE (op0) == XOR)
+        {
+          rtx newop0 = XEXP (op0, 0);
+          rtx newop1 = XEXP (op0, 1);
+          rtx op0_stripped = aarch64_strip_shift (newop0);
+
+          *cost += rtx_cost (newop1, (enum rtx_code) code, 1, speed)
+                   + rtx_cost (op0_stripped, XOR, 0, speed);
+
+          if (speed)
+            {
+              if (op0_stripped != newop0)
+                *cost += extra_cost->alu.log_shift;
+              else
+                *cost += extra_cost->alu.logical;
+            }
+
+          return true;
+        }
+      /* MVN.  */
+      if (speed)
+        *cost += extra_cost->alu.logical;
+
+      return false;
+
     case ZERO_EXTEND:
-      if ((GET_MODE (x) == DImode
-          && GET_MODE (XEXP (x, 0)) == SImode)
-         || GET_CODE (XEXP (x, 0)) == MEM)
+
+      op0 = XEXP (x, 0);
+      /* If a value is written in SI mode, then zero extended to DI
+         mode, the operation will in general be free as a write to
+         a 'w' register implicitly zeroes the upper bits of an 'x'
+         register.  However, if this is
+
+           (set (reg) (zero_extend (reg)))
+
+         we must cost the explicit register move.  */
+      if (mode == DImode
+          && GET_MODE (op0) == SImode
+          && outer == SET)
        {
-         *cost += rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
+          int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
+
+          if (!op_cost && speed)
+            /* MOV.  */
+            *cost += extra_cost->alu.extend;
+          else
+            /* Free, the cost is that of the SI mode operation.  */
+            *cost = op_cost;
+
+          return true;
+        }
+      else if (MEM_P (XEXP (x, 0)))
+        {
+          /* All loads can zero extend to any size for free.  */
+          *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
          return true;
        }
+
+      /* UXTB/UXTH.  */
+      if (speed)
+        *cost += extra_cost->alu.extend;
+
       return false;
 
     case SIGN_EXTEND:
-      if (GET_CODE (XEXP (x, 0)) == MEM)
+      if (MEM_P (XEXP (x, 0)))
        {
-         *cost += rtx_cost (XEXP (x, 0), SIGN_EXTEND, 0, speed);
+          /* LDRSH.  */
+          if (speed)
+            {
+              rtx address = XEXP (XEXP (x, 0), 0);
+              *cost += extra_cost->ldst.load_sign_extend;
+
+              *cost +=
+                COSTS_N_INSNS (aarch64_address_cost (address, mode,
+                                                     0, speed));
+            }
          return true;
        }
+
+      if (speed)
+        *cost += extra_cost->alu.extend;
      return false;
 
-    case ROTATE:
-      if (!CONST_INT_P (XEXP (x, 1)))
-       *cost += COSTS_N_INSNS (2);
-      /* Fall through.  */
-    case ROTATERT:
-    case LSHIFTRT:
     case ASHIFT:
-    case ASHIFTRT:
-
-      /* Shifting by a register often takes an extra cycle.  */
-      if (speed && !CONST_INT_P (XEXP (x, 1)))
-       *cost += extra_cost->alu.arith_shift_reg;
+      op0 = XEXP (x, 0);
+      op1 = XEXP (x, 1);
 
-      *cost += rtx_cost (XEXP (x, 0), ASHIFT, 0, speed);
-      return true;
+      if (CONST_INT_P (op1))
+        {
+          /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
+             aliases.  */
+          if (speed)
+            *cost += extra_cost->alu.shift;
 
-    case HIGH:
-      if (!CONSTANT_P (XEXP (x, 0)))
-       *cost += rtx_cost (XEXP (x, 0), HIGH, 0, speed);
-      return true;
+          /* We can incorporate zero/sign extend for free.  */
+          if (GET_CODE (op0) == ZERO_EXTEND
+              || GET_CODE (op0) == SIGN_EXTEND)
+            op0 = XEXP (op0, 0);
 
-    case LO_SUM:
-      if (!CONSTANT_P (XEXP (x, 1)))
-       *cost += rtx_cost (XEXP (x, 1), LO_SUM, 1, speed);
-      *cost += rtx_cost (XEXP (x, 0), LO_SUM, 0, speed);
-      return true;
+          *cost += rtx_cost (op0, ASHIFT, 0, speed);
+          return true;
+        }
+      else
+        {
+          /* LSLV.  */
+          if (speed)
+            *cost += extra_cost->alu.shift_reg;
 
-    case ZERO_EXTRACT:
-    case SIGN_EXTRACT:
-      *cost += rtx_cost (XEXP (x, 0), ZERO_EXTRACT, 0, speed);
-      return true;
+          return false;  /* All arguments need to be in registers.  */
+        }
 
-    case MULT:
+    case ROTATE:
+    case ROTATERT:
+    case LSHIFTRT:
+    case ASHIFTRT:
       op0 = XEXP (x, 0);
       op1 = XEXP (x, 1);
 
-      *cost = COSTS_N_INSNS (1);
-      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
+      if (CONST_INT_P (op1))
        {
-         if (CONST_INT_P (op1)
-             && exact_log2 (INTVAL (op1)) > 0)
-           {
-             *cost += rtx_cost (op0, ASHIFT, 0, speed);
-             return true;
-           }
+          /* ASR (immediate) and friends.  */
+          if (speed)
+            *cost += extra_cost->alu.shift;
 
-         if ((GET_CODE (op0) == ZERO_EXTEND
-              && GET_CODE (op1) == ZERO_EXTEND)
-             || (GET_CODE (op0) == SIGN_EXTEND
-                 && GET_CODE (op1) == SIGN_EXTEND))
-           {
-             *cost += (rtx_cost (XEXP (op0, 0), MULT, 0, speed)
-                       + rtx_cost (XEXP (op1, 0), MULT, 1, speed));
-             if (speed)
-               *cost += extra_cost->mult[GET_MODE (x) == DImode].extend;
-             return true;
-           }
+          *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
+          return true;
+        }
+      else
+        {
+          /* ASR (register) and friends.  */
          if (speed)
-           *cost += extra_cost->mult[GET_MODE (x) == DImode].simple;
+            *cost += extra_cost->alu.shift_reg;
+
+          return false;  /* All arguments need to be in registers.  */
        }
-      else if (speed)
+
+    case SYMBOL_REF:
+
+      if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
+        {
+          /* LDR.  */
+          if (speed)
+            *cost += extra_cost->ldst.load;
+        }
+      else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
+               || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
+        {
+          /* ADRP, followed by ADD.  */
+          *cost += COSTS_N_INSNS (1);
+          if (speed)
+            *cost += 2 * extra_cost->alu.arith;
+        }
+      else if (aarch64_cmodel == AARCH64_CMODEL_TINY
+               || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
+        {
+          /* ADR.  */
+          if (speed)
+            *cost += extra_cost->alu.arith;
+        }
+
+      if (flag_pic)
        {
-         if (GET_MODE (x) == DFmode)
-           *cost += extra_cost->fp[1].mult;
-         else if (GET_MODE (x) == SFmode)
-           *cost += extra_cost->fp[0].mult;
+          /* One extra load instruction, after accessing the GOT.  */
+          *cost += COSTS_N_INSNS (1);
+          if (speed)
+            *cost += extra_cost->ldst.load;
        }
+      return true;
 
-      return false;  /* All arguments need to be in registers.  */
+    case HIGH:
+    case LO_SUM:
+      /* ADRP/ADD (immediate).  */
+      if (speed)
+        *cost += extra_cost->alu.arith;
+      return true;
+
+    case ZERO_EXTRACT:
+    case SIGN_EXTRACT:
+      /* UBFX/SBFX.  */
+      if (speed)
+        *cost += extra_cost->alu.bfx;
+
+      /* We can trust that the immediates used will be correct (there
+         are no by-register forms), so we need only cost op0.  */
+      *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
+      return true;
+
+    case MULT:
+      *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
+      /* aarch64_rtx_mult_cost always handles recursion to its
+         operands.  */
+      return true;
 
     case MOD:
     case UMOD:
-      *cost = COSTS_N_INSNS (2);
       if (speed)
        {
          if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
@@ -4815,53 +5559,225 @@ aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
 
     case DIV:
     case UDIV:
-      *cost = COSTS_N_INSNS (1);
+    case SQRT:
      if (speed)
        {
-         if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
-           *cost += extra_cost->mult[GET_MODE (x) == DImode].idiv;
-         else if (GET_MODE (x) == DFmode)
-           *cost += extra_cost->fp[1].div;
-         else if (GET_MODE (x) == SFmode)
-           *cost += extra_cost->fp[0].div;
+          if (GET_MODE_CLASS (mode) == MODE_INT)
+            /* There is no integer SQRT, so only DIV and UDIV can get
+               here.  */
+            *cost += extra_cost->mult[mode == DImode].idiv;
+          else
+            *cost += extra_cost->fp[mode == DFmode].div;
        }
      return false;  /* All arguments need to be in registers.  */
 
-    default:
-      break;
-    }
-  return false;
-}
+    case IF_THEN_ELSE:
+      op2 = XEXP (x, 2);
+      op0 = XEXP (x, 0);
+      op1 = XEXP (x, 1);
 
-static int
-aarch64_address_cost (rtx x ATTRIBUTE_UNUSED,
-                     enum machine_mode mode ATTRIBUTE_UNUSED,
-                     addr_space_t as ATTRIBUTE_UNUSED, bool speed ATTRIBUTE_UNUSED)
-{
-  enum rtx_code c = GET_CODE (x);
-  const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
+      if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
+        {
+          /* Conditional branch.  */
+          if (GET_MODE_CLASS (GET_MODE (XEXP (op0, 0))) == MODE_CC)
+            return true;
+          else
+            {
+              if (GET_CODE (op0) == NE
+                  || GET_CODE (op0) == EQ)
+                {
+                  rtx inner = XEXP (op0, 0);
+                  rtx comparator = XEXP (op0, 1);
 
-  if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
-    return addr_cost->pre_modify;
+                  if (comparator == const0_rtx)
+                    {
+                      /* TBZ/TBNZ/CBZ/CBNZ.  */
+                      if (GET_CODE (inner) == ZERO_EXTRACT)
+                        /* TBZ/TBNZ.  */
+                        *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
+                                           0, speed);
+                      else
+                        /* CBZ/CBNZ.  */
+                        *cost += rtx_cost (inner, GET_CODE (op0), 0, speed);
 
-  if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
-    return addr_cost->post_modify;
+                      return true;
+                    }
+                }
+              else if (GET_CODE (op0) == LT
+                       || GET_CODE (op0) == GE)
+                {
+                  rtx comparator = XEXP (op0, 1);
 
-  if (c == PLUS)
-    {
-      if (GET_CODE (XEXP (x, 1)) == CONST_INT)
-       return addr_cost->imm_offset;
-      else if (GET_CODE (XEXP (x, 0)) == MULT
-              || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND
-              || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
-       return addr_cost->register_extend;
+                  /* TBZ/TBNZ.  */
+                  if (comparator == const0_rtx)
+                    return true;
+                }
+            }
+        }
+      else if (GET_MODE_CLASS (GET_MODE (XEXP (op0, 0))) == MODE_CC)
+        {
+          /* It's a conditional operation based on the status flags,
+             so it must be some flavor of CSEL.  */
+
+          /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
+          if (GET_CODE (op1) == NEG
+              || GET_CODE (op1) == NOT
+              || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
+            op1 = XEXP (op1, 0);
+
+          *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
+          *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
+          return true;
+        }
 
-      return addr_cost->register_offset;
-    }
-  else if (c == MEM || c == LABEL_REF || c == SYMBOL_REF)
-    return addr_cost->imm_offset;
+      /* We don't know what this is, cost all operands.  */
+      return false;
 
-  return 0;
+    case EQ:
+    case NE:
+    case GT:
+    case GTU:
+    case LT:
+    case LTU:
+    case GE:
+    case GEU:
+    case LE:
+    case LEU:
+
+      return false;  /* All arguments must be in registers.  */
+
+    case FMA:
+      op0 = XEXP (x, 0);
+      op1 = XEXP (x, 1);
+      op2 = XEXP (x, 2);
+
+      if (speed)
+        *cost += extra_cost->fp[mode == DFmode].fma;
+
+      /* FMSUB, FNMADD, and FNMSUB are free.  */
+      if (GET_CODE (op0) == NEG)
+        op0 = XEXP (op0, 0);
+
+      if (GET_CODE (op2) == NEG)
+        op2 = XEXP (op2, 0);
+
+      /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
+         and the by-element operand as operand 0.  */
+      if (GET_CODE (op1) == NEG)
+        op1 = XEXP (op1, 0);
+
+      /* Catch vector-by-element operations.  The by-element operand can
+         either be (vec_duplicate (vec_select (x))) or just
+         (vec_select (x)), depending on whether we are multiplying by
+         a vector or a scalar.
+
+         Canonicalization is not very good in these cases, FMA4 will put the
+         by-element operand as operand 0, FNMA4 will have it as operand 1.  */
+      if (GET_CODE (op0) == VEC_DUPLICATE)
+        op0 = XEXP (op0, 0);
+      else if (GET_CODE (op1) == VEC_DUPLICATE)
+        op1 = XEXP (op1, 0);
+
+      if (GET_CODE (op0) == VEC_SELECT)
+        op0 = XEXP (op0, 0);
+      else if (GET_CODE (op1) == VEC_SELECT)
+        op1 = XEXP (op1, 0);
+
+      /* If the remaining parameters are not registers,
+         get the cost to put them into registers.  */
+      *cost += rtx_cost (op0, FMA, 0, speed);
+      *cost += rtx_cost (op1, FMA, 1, speed);
+      *cost += rtx_cost (op2, FMA, 2, speed);
+      return true;
+
+    case FLOAT_EXTEND:
+      if (speed)
+        *cost += extra_cost->fp[mode == DFmode].widen;
+      return false;
+
+    case FLOAT_TRUNCATE:
+      if (speed)
+        *cost += extra_cost->fp[mode == DFmode].narrow;
+      return false;
+
+    case ABS:
+      if (GET_MODE_CLASS (mode) == MODE_FLOAT)
+        {
+          op0 = XEXP (x, 0);
+
+          /* FABD, which is analogous to FADD.  */
+          if (GET_CODE (op0) == MINUS)
+            {
+              *cost += rtx_cost (XEXP (op0, 0), MINUS, 0, speed)
+                       + rtx_cost (XEXP (op0, 1), MINUS, 1, speed);
+              if (speed)
+                *cost += extra_cost->fp[mode == DFmode].addsub;
+
+              return true;
+            }
+          /* Simple FABS is analogous to FNEG.  */
+          if (speed)
+            *cost += extra_cost->fp[mode == DFmode].neg;
+        }
+      else
+        {
+          /* Integer ABS will either be split to
+             two arithmetic instructions, or will be an ABS
+             (scalar), which we don't model.  */
+          *cost = COSTS_N_INSNS (2);
+          if (speed)
+            *cost += 2 * extra_cost->alu.arith;
+        }
+      return false;
+
+    case SMAX:
+    case SMIN:
+      if (speed)
+        {
+          /* FMAXNM/FMINNM/FMAX/FMIN.
+             TODO: This may not be accurate for all implementations, but
+             we do not model this in the cost tables.  */
+          *cost += extra_cost->fp[mode == DFmode].addsub;
+        }
+      return false;
+
+    case TRUNCATE:
+
+      /* Decompose <su>muldi3_highpart.  */
+      if (/* (truncate:DI  */
+          mode == DImode
+          /*   (lshiftrt:TI  */
+          && GET_MODE (XEXP (x, 0)) == TImode
+          && GET_CODE (XEXP (x, 0)) == LSHIFTRT
+          /*      (mult:TI  */
+          && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
+          /*        (ANY_EXTEND:TI (reg:DI))
+                    (ANY_EXTEND:TI (reg:DI)))  */
+          && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
+               && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
+              || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
+                  && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
+          && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
+          && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
+          /*     (const_int 64)  */
+          && CONST_INT_P (XEXP (XEXP (x, 0), 1))
+          && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
+        {
+          /* UMULH/SMULH.  */
+          if (speed)
+            *cost += extra_cost->mult[mode == DImode].extend;
+          *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
+                             MULT, 0, speed);
+          *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
+                             MULT, 1, speed);
+          return true;
+        }
+
+      /* Fall through.  */
+    default:
+      return true;
+    }
+  return false;
 }
 
 static int
@@ -4994,15 +5910,9 @@ aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
 
       /* Statements in an inner loop relative to the loop being
         vectorized are weighted more heavily.  The value here is
-        a function (linear for now) of the loop nest level.  */
+        arbitrary and could potentially be improved with analysis.  */
      if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
-       {
-         loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
-         struct loop *loop = LOOP_VINFO_LOOP (loop_info);
-         unsigned nest_level = loop_depth (loop);
-
-         count *= nest_level;
-       }
+       count *= 50;  /* FIXME  */
 
      retval = (unsigned) (count * stmt_cost);
      cost[where] += retval;
diff --git a/gcc-4.9/gcc/config/aarch64/aarch64.md b/gcc-4.9/gcc/config/aarch64/aarch64.md
index dc88f8b10..fe68bfea1 100644
--- a/gcc-4.9/gcc/config/aarch64/aarch64.md
+++ b/gcc-4.9/gcc/config/aarch64/aarch64.md
@@ -2571,6 +2571,32 @@
   [(set_attr "type" "logics_shift_imm")]
 )
 
+(define_insn "*eor_one_cmpl_<SHIFT:optab><mode>3_alt"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+        (not:GPI (xor:GPI
+                  (SHIFT:GPI
+                   (match_operand:GPI 1 "register_operand" "r")
+                   (match_operand:QI 2 "aarch64_shift_imm_<mode>" "n"))
+                  (match_operand:GPI 3 "register_operand" "r"))))]
+  ""
+  "eon\\t%<w>0, %<w>3, %<w>1, <SHIFT:shift> %2"
+  [(set_attr "type" "logic_shift_imm")]
+)
+
+;; Zero-extend version of the above.
+(define_insn "*eor_one_cmpl_<SHIFT:optab>sidi3_alt_ze"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+        (zero_extend:DI
+         (not:SI (xor:SI
+                  (SHIFT:SI
+                   (match_operand:SI 1 "register_operand" "r")
+                   (match_operand:QI 2 "aarch64_shift_imm_si" "n"))
+                  (match_operand:SI 3 "register_operand" "r")))))]
+  ""
+  "eon\\t%w0, %w3, %w1, <SHIFT:shift> %2"
+  [(set_attr "type" "logic_shift_imm")]
+)
+
 (define_insn "*and_one_cmpl_<SHIFT:optab><mode>3_compare0"
   [(set (reg:CC_NZ CC_REGNUM)
         (compare:CC_NZ
@@ -2771,32 +2797,33 @@
 
 ;; Logical left shift using SISD or Integer instruction
 (define_insn "*aarch64_ashl_sisd_or_int_<mode>3"
-  [(set (match_operand:GPI 0 "register_operand" "=w,w,r")
+  [(set (match_operand:GPI 0 "register_operand" "=r,w,w")
         (ashift:GPI
-          (match_operand:GPI 1 "register_operand" "w,w,r")
-          (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "Us<cmode>,w,rUs<cmode>")))]
+          (match_operand:GPI 1 "register_operand" "r,w,w")
+          (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "rUs<cmode>,Us<cmode>,w")))]
   ""
   "@
+   lsl\t%<w>0, %<w>1, %<w>2
    shl\t%<rtn>0<vas>, %<rtn>1<vas>, %2
-   ushl\t%<rtn>0<vas>, %<rtn>1<vas>, %<rtn>2<vas>
-   lsl\t%<w>0, %<w>1, %<w>2"
-  [(set_attr "simd" "yes,yes,no")
-   (set_attr "type" "neon_shift_imm<q>, neon_shift_reg<q>,shift_reg")]
+   ushl\t%<rtn>0<vas>, %<rtn>1<vas>, %<rtn>2<vas>"
+  [(set_attr "simd" "no,yes,yes")
+   (set_attr "type" "shift_reg,neon_shift_imm<q>, neon_shift_reg<q>")]
 )
 
 ;; Logical right shift using SISD or Integer instruction
 (define_insn "*aarch64_lshr_sisd_or_int_<mode>3"
-  [(set (match_operand:GPI 0 "register_operand" "=w,&w,r")
+  [(set (match_operand:GPI 0 "register_operand" "=r,w,&w,&w")
         (lshiftrt:GPI
-          (match_operand:GPI 1 "register_operand" "w,w,r")
-          (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "Us<cmode>,w,rUs<cmode>")))]
+          (match_operand:GPI 1 "register_operand" "r,w,w,w")
+          (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "rUs<cmode>,Us<cmode>,w,0")))]
   ""
   "@
+   lsr\t%<w>0, %<w>1, %<w>2
    ushr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
    #
-   lsr\t%<w>0, %<w>1, %<w>2"
-  [(set_attr "simd" "yes,yes,no")
-   (set_attr "type" "neon_shift_imm<q>,neon_shift_reg<q>,shift_reg")]
+   #"
+  [(set_attr "simd" "no,yes,yes,yes")
+   (set_attr "type" "shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")]
 )
 
 (define_split
@@ -2831,18 +2858,18 @@
 
 ;; Arithmetic right shift using SISD or Integer instruction
 (define_insn "*aarch64_ashr_sisd_or_int_<mode>3"
-  [(set (match_operand:GPI 0 "register_operand" "=w,&w,&w,r")
+  [(set (match_operand:GPI 0 "register_operand" "=r,w,&w,&w")
         (ashiftrt:GPI
-          (match_operand:GPI 1 "register_operand" "w,w,w,r")
-          (match_operand:QI 2 "aarch64_reg_or_shift_imm_di" "Us<cmode>,w,0,rUs<cmode>")))]
+          (match_operand:GPI 1 "register_operand" "r,w,w,w")
+          (match_operand:QI 2 "aarch64_reg_or_shift_imm_di" "rUs<cmode>,Us<cmode>,w,0")))]
   ""
   "@
+   asr\t%<w>0, %<w>1, %<w>2
    sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
    #
-   #
-   asr\t%<w>0, %<w>1, %<w>2"
-  [(set_attr "simd" "yes,yes,yes,no")
-   (set_attr "type" "neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>,shift_reg")]
+   #"
+  [(set_attr "simd" "no,yes,yes,yes")
+   (set_attr "type" "shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")]
 )
 
 (define_split
diff --git a/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/pr65447.c b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/pr65447.c
new file mode 100644
index 000000000..c5bddbfba
--- /dev/null
+++ b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/pr65447.c
@@ -0,0 +1,54 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-ivopts-details" } */
+
+void foo (double *p)
+{
+  int i;
+  for (i = -20000; i < 200000; i += 40)
+    {
+      p[i+0] = 1.0;
+      p[i+1] = 1.0;
+      p[i+2] = 1.0;
+      p[i+3] = 1.0;
+      p[i+4] = 1.0;
+      p[i+5] = 1.0;
+      p[i+6] = 1.0;
+      p[i+7] = 1.0;
+      p[i+8] = 1.0;
+      p[i+9] = 1.0;
+      p[i+10] = 1.0;
+      p[i+11] = 1.0;
+      p[i+12] = 1.0;
+      p[i+13] = 1.0;
+      p[i+14] = 1.0;
+      p[i+15] = 1.0;
+      p[i+16] = 1.0;
+      p[i+17] = 1.0;
+      p[i+18] = 1.0;
+      p[i+19] = 1.0;
+      p[i+20] = 1.0;
+      p[i+21] = 1.0;
+      p[i+22] = 1.0;
+      p[i+23] = 1.0;
+      p[i+24] = 1.0;
+      p[i+25] = 1.0;
+      p[i+26] = 1.0;
+      p[i+27] = 1.0;
+      p[i+28] = 1.0;
+      p[i+29] = 1.0;
+      p[i+30] = 1.0;
+      p[i+31] = 1.0;
+      p[i+32] = 1.0;
+      p[i+33] = 1.0;
+      p[i+34] = 1.0;
+      p[i+35] = 1.0;
+      p[i+36] = 1.0;
+      p[i+37] = 1.0;
+      p[i+38] = 1.0;
+      p[i+39] = 1.0;
+    }
+}
+
+/* We should group address type IV uses.  */
+/* { dg-final { scan-tree-dump-not "\\nuse 2\\n" "ivopts" } } */
+/* { dg-final { cleanup-tree-dump "ivopts" } } */
diff --git a/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-10.c b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-10.c
new file mode 100644
index 000000000..2e16c894b
--- /dev/null
+++ b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-10.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-ivopts-details" } */
+
+int *a;
+
+int
+foo (signed char s, signed char l)
+{
+  signed char i;
+  int sum = 0;
+
+  for (i = s; i < l; i++)
+    {
+      sum += a[i];
+    }
+
+  return sum;
+}
+
+/* Address of array reference is scev.  */
+/* { dg-final { scan-tree-dump-times "use \[0-9\]\n address" 1 "ivopts" } } */
diff --git a/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-3.c b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-3.c
index 5cac1cefb..28d5c9327 100644
--- a/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-3.c
+++ b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-3.c
@@ -14,5 +14,5 @@ f(int k)
     }
 }
 
-/* { dg-final { scan-tree-dump-times "&a" 1 "optimized" { xfail { lp64 || llp64 } } } } */
+/* { dg-final { scan-tree-dump-times "&a" 1 "optimized" } } */
 /* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-4.c b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-4.c
index 5f15d622d..6c1e530a9 100644
--- a/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-4.c
+++ b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-4.c
@@ -19,5 +19,5 @@ f(int k)
     }
 }
 
-/* { dg-final { scan-tree-dump-times "&a" 1 "optimized" { xfail { lp64 || llp64 } } } } */
+/* { dg-final { scan-tree-dump-times "&a" 1 "optimized" } } */
 /* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-8.c b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-8.c
new file mode 100644
index 000000000..766f674d5
--- /dev/null
+++ b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-8.c
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-ivopts-details" } */
+
+int *a;
+
+int
+foo1 (long long s, long long l)
+{
+  long long i;
+
+  for (i = s; i < l; i++)
+    {
+      a[(short)i] = 0;
+    }
+  return 0;
+}
+
+int
+foo2 (unsigned char s, unsigned char l, unsigned char c)
+{
+  unsigned char i, step = 1;
+  int sum = 0;
+
+  for (i = s; i < l; i++)
+    {
+      sum += a[c];
+      c += step;
+    }
+
+  return sum;
+}
+
+int
+foo3 (unsigned char s, unsigned char l, unsigned char c)
+{
+  unsigned char i;
+  int sum = 0;
+
+  for (i = s; i != l; i += c)
+    {
+      sum += a[i];
+    }
+
+  return sum;
+}
+
+int
+foo4 (unsigned char s, unsigned char l)
+{
+  unsigned char i;
+  int sum = 0;
+
+  for (i = s; i != l; i++)
+    {
+      sum += a[i];
+    }
+
+  return sum;
+}
+
+/* Address of array references are not scevs.  */
+/* { dg-final { scan-tree-dump-not "use \[0-9\]\n address" "ivopts" } } */
diff --git a/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-9.c b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-9.c
new file mode 100644
index 000000000..557e33850
--- /dev/null
+++ b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-9.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-ivopts-details" } */
+
+int *a;
+
+int
+foo (unsigned char s, unsigned char l)
+{
+  unsigned char i;
+  int sum = 0;
+
+  for (i = s; i < l; i += 1)
+    {
+      sum += a[i];
+    }
+
+  return sum;
+}
+
+/* Address of array reference is scev.  */
+/* { dg-final { scan-tree-dump-times "use \[0-9\]\n address" 1 "ivopts" } } */
diff --git a/gcc-4.9/gcc/testsuite/gcc.dg/vect/pr48052.c b/gcc-4.9/gcc/testsuite/gcc.dg/vect/pr48052.c
new file mode 100644
index 000000000..c822ebd41
--- /dev/null
+++ b/gcc-4.9/gcc/testsuite/gcc.dg/vect/pr48052.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99" } */
+
+int foo(int* A, int* B, unsigned start, unsigned BS)
+{
+  int s;
+  for (unsigned k = start; k < start + BS; k++)
+    {
+      s += A[k] * B[k];
+    }
+
+  return s;
+}
+
+int bar(int* A, int* B, unsigned BS)
+{
+  int s;
+  for (unsigned k = 0; k < BS; k++)
+    {
+      s += A[k] * B[k];
+    }
+
+  return s;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
diff --git a/gcc-4.9/gcc/tree-chrec.c b/gcc-4.9/gcc/tree-chrec.c
index b9350f015..3e34c65dc 100644
--- a/gcc-4.9/gcc/tree-chrec.c
+++ b/gcc-4.9/gcc/tree-chrec.c
@@ -1162,8 +1162,6 @@ nb_vars_in_chrec (tree chrec)
     }
 }
 
-static tree chrec_convert_1 (tree, tree, gimple, bool);
-
 /* Converts BASE and STEP of affine scev to TYPE.  LOOP is the loop whose iv
    the scev corresponds to.  AT_STMT is the statement at that the scev is
    evaluated.  USE_OVERFLOW_SEMANTICS is true if this function should assume
@@ -1238,8 +1236,7 @@ convert_affine_scev (struct loop *loop, tree type,
                                 use_overflow_semantics))
     return false;
 
-  new_base = chrec_convert_1 (type, *base, at_stmt,
-                              use_overflow_semantics);
+  new_base = chrec_convert (type, *base, at_stmt, use_overflow_semantics);
   /* The step must be sign extended, regardless of the signedness
      of CT and TYPE.  This only needs to be handled specially when
      CT is unsigned -- to avoid e.g. unsigned char [100, +, 255]
@@ -1250,10 +1247,11 @@ convert_affine_scev (struct loop *loop, tree type,
   if (TYPE_PRECISION (step_type) > TYPE_PRECISION (ct) && TYPE_UNSIGNED (ct))
     {
       tree signed_ct = build_nonstandard_integer_type (TYPE_PRECISION (ct), 0);
-      new_step = chrec_convert_1 (signed_ct, new_step, at_stmt,
-                                  use_overflow_semantics);
+      new_step = chrec_convert (signed_ct, new_step, at_stmt,
+                                use_overflow_semantics);
     }
-  new_step = chrec_convert_1 (step_type, new_step, at_stmt, use_overflow_semantics);
+  new_step = chrec_convert (step_type, new_step, at_stmt,
+                            use_overflow_semantics);
 
   if (automatically_generated_chrec_p (new_base)
       || automatically_generated_chrec_p (new_step))
@@ -1290,36 +1288,6 @@ chrec_convert_rhs (tree type, tree chrec, gimple at_stmt)
    determining a more accurate estimation of the number of iterations.
    By default AT_STMT could be safely set to NULL_TREE.
 
-   The following rule is always true: TREE_TYPE (chrec) ==
-   TREE_TYPE (CHREC_LEFT (chrec)) == TREE_TYPE (CHREC_RIGHT (chrec)).
- An example of what could happen when adding two chrecs and the type - of the CHREC_RIGHT is different than CHREC_LEFT is: - - {(uint) 0, +, (uchar) 10} + - {(uint) 0, +, (uchar) 250} - - that would produce a wrong result if CHREC_RIGHT is not (uint): - - {(uint) 0, +, (uchar) 4} - - instead of - - {(uint) 0, +, (uint) 260} -*/ - -tree -chrec_convert (tree type, tree chrec, gimple at_stmt) -{ - return chrec_convert_1 (type, chrec, at_stmt, true); -} - -/* Convert CHREC to TYPE. When the analyzer knows the context in - which the CHREC is built, it sets AT_STMT to the statement that - contains the definition of the analyzed variable, otherwise the - conversion is less accurate: the information is used for - determining a more accurate estimation of the number of iterations. - By default AT_STMT could be safely set to NULL_TREE. - USE_OVERFLOW_SEMANTICS is true if this function should assume that the rules for overflow of the given language apply (e.g., that signed arithmetics in C does not overflow) -- i.e., to use them to avoid unnecessary @@ -1404,15 +1372,53 @@ keep_cast: return res; } +/* Convert CHREC to TYPE. When the analyzer knows the context in + which the CHREC is built, it sets AT_STMT to the statement that + contains the definition of the analyzed variable, otherwise the + conversion is less accurate: the information is used for + determining a more accurate estimation of the number of iterations. + By default AT_STMT could be safely set to NULL_TREE. + + The following rule is always true: TREE_TYPE (chrec) == + TREE_TYPE (CHREC_LEFT (chrec)) == TREE_TYPE (CHREC_RIGHT (chrec)). + An example of what could happen when adding two chrecs and the type + of the CHREC_RIGHT is different from that of CHREC_LEFT is: + + {(uint) 0, +, (uchar) 10} + + {(uint) 0, +, (uchar) 250} + + that would produce a wrong result if CHREC_RIGHT is not (uint): + + {(uint) 0, +, (uchar) 4} + + instead of + + {(uint) 0, +, (uint) 260} + + USE_OVERFLOW_SEMANTICS is true if this function should assume that + the rules for overflow of the given language apply (e.g., that signed + arithmetics in C does not overflow) -- i.e., to use them to avoid unnecessary + tests, but also to enforce that the result follows them. */ + +tree +chrec_convert (tree type, tree chrec, gimple at_stmt, + bool use_overflow_semantics) +{ + return chrec_convert_1 (type, chrec, at_stmt, use_overflow_semantics); +} + /* Convert CHREC to TYPE, without regard to signed overflows. Returns the new chrec if something else than what chrec_convert would do happens, NULL_TREE - otherwise. */ + otherwise. This function sets the variable pointed to by FOLD_CONVERSIONS + to true if the resulting chrec may overflow.
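+ For instance (an illustrative case, not the only one): {(int) 0, +, (int) 1} is converted to short by converting base and step directly, giving {(short) 0, +, (short) 1}; this matches the original evolution only while the counter stays within the range of short, so when that cannot be shown the result may overflow.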
*/ tree -chrec_convert_aggressive (tree type, tree chrec) +chrec_convert_aggressive (tree type, tree chrec, bool *fold_conversions) { tree inner_type, left, right, lc, rc, rtype; + gcc_assert (fold_conversions != NULL); + if (automatically_generated_chrec_p (chrec) || TREE_CODE (chrec) != POLYNOMIAL_CHREC) return NULL_TREE; @@ -1421,17 +1427,33 @@ chrec_convert_aggressive (tree type, tree chrec) if (TYPE_PRECISION (type) > TYPE_PRECISION (inner_type)) return NULL_TREE; + if (useless_type_conversion_p (type, inner_type)) + return NULL_TREE; + + if (!*fold_conversions && evolution_function_is_affine_p (chrec)) + { + tree base, step; + struct loop *loop; + + loop = get_chrec_loop (chrec); + base = CHREC_LEFT (chrec); + step = CHREC_RIGHT (chrec); + if (convert_affine_scev (loop, type, &base, &step, NULL, true)) + return build_polynomial_chrec (loop->num, base, step); + } rtype = POINTER_TYPE_P (type) ? sizetype : type; left = CHREC_LEFT (chrec); right = CHREC_RIGHT (chrec); - lc = chrec_convert_aggressive (type, left); + lc = chrec_convert_aggressive (type, left, fold_conversions); if (!lc) lc = chrec_convert (type, left, NULL); - rc = chrec_convert_aggressive (rtype, right); + rc = chrec_convert_aggressive (rtype, right, fold_conversions); if (!rc) rc = chrec_convert (rtype, right, NULL); + *fold_conversions = true; + return build_polynomial_chrec (CHREC_VARIABLE (chrec), lc, rc); } diff --git a/gcc-4.9/gcc/tree-chrec.h b/gcc-4.9/gcc/tree-chrec.h index 90cc7a74b..ea4627751 100644 --- a/gcc-4.9/gcc/tree-chrec.h +++ b/gcc-4.9/gcc/tree-chrec.h @@ -59,9 +59,9 @@ enum ev_direction scev_direction (const_tree); extern tree chrec_fold_plus (tree, tree, tree); extern tree chrec_fold_minus (tree, tree, tree); extern tree chrec_fold_multiply (tree, tree, tree); -extern tree chrec_convert (tree, tree, gimple); +extern tree chrec_convert (tree, tree, gimple, bool = true); extern tree chrec_convert_rhs (tree, tree, gimple); -extern tree chrec_convert_aggressive (tree, tree); +extern tree chrec_convert_aggressive (tree, tree, bool *); /* Operations. */ extern tree chrec_apply (unsigned, tree, tree); diff --git a/gcc-4.9/gcc/tree-scalar-evolution.c b/gcc-4.9/gcc/tree-scalar-evolution.c index f1ddc24b1..218baa406 100644 --- a/gcc-4.9/gcc/tree-scalar-evolution.c +++ b/gcc-4.9/gcc/tree-scalar-evolution.c @@ -2116,7 +2116,7 @@ analyze_scalar_evolution_in_loop (struct loop *wrto_loop, struct loop *use_loop, /* We cannot just do tmp = analyze_scalar_evolution (use_loop, version); - ev = resolve_mixers (wrto_loop, tmp); + ev = resolve_mixers (wrto_loop, tmp, folded_casts); as resolve_mixers would query the scalar evolution with respect to wrto_loop. For example, in the situation described in the function @@ -2125,9 +2125,9 @@ analyze_scalar_evolution_in_loop (struct loop *wrto_loop, struct loop *use_loop, analyze_scalar_evolution (use_loop, version) = k2 - and resolve_mixers (loop1, k2) finds that the value of k2 in loop 1 - is 100, which is a wrong result, since we are interested in the - value in loop 3. + and resolve_mixers (loop1, k2, folded_casts) finds that the value of + k2 in loop 1 is 100, which is a wrong result, since we are interested + in the value in loop 3. Instead, we need to proceed from use_loop to wrto_loop loop by loop, each time checking that there is no evolution in the inner loop. 
*/ @@ -2137,10 +2137,7 @@ analyze_scalar_evolution_in_loop (struct loop *wrto_loop, struct loop *use_loop, while (1) { tmp = analyze_scalar_evolution (use_loop, ev); - ev = resolve_mixers (use_loop, tmp); - - if (folded_casts && tmp != ev) - *folded_casts = true; + ev = resolve_mixers (use_loop, tmp, folded_casts); if (use_loop == wrto_loop) return ev; @@ -2262,7 +2259,7 @@ loop_closed_phi_def (tree var) } static tree instantiate_scev_r (basic_block, struct loop *, struct loop *, - tree, bool, int); + tree, bool *, int); /* Analyze all the parameters of the chrec, between INSTANTIATE_BELOW and EVOLUTION_LOOP, that were left under a symbolic form. @@ -2271,9 +2268,10 @@ static tree instantiate_scev_r (basic_block, struct loop *, struct loop *, CACHE is the cache of already instantiated values. - FOLD_CONVERSIONS should be set to true when the conversions that - may wrap in signed/pointer type are folded, as long as the value of - the chrec is preserved. + The variable pointed to by FOLD_CONVERSIONS is set to true when + conversions that may wrap in signed/pointer types are folded, as long + as the value of the chrec is preserved. If FOLD_CONVERSIONS is NULL, + no such folding is done. SIZE_EXPR is used for computing the size of the expression to be instantiated, and to stop if it exceeds some limit. */ @@ -2282,7 +2280,7 @@ static tree instantiate_scev_name (basic_block instantiate_below, struct loop *evolution_loop, struct loop *inner_loop, tree chrec, - bool fold_conversions, + bool *fold_conversions, int size_expr) { tree res; @@ -2376,9 +2374,10 @@ instantiate_scev_name (basic_block instantiate_below, CACHE is the cache of already instantiated values. - FOLD_CONVERSIONS should be set to true when the conversions that - may wrap in signed/pointer type are folded, as long as the value of - the chrec is preserved. + The variable pointed to by FOLD_CONVERSIONS is set to true when + conversions that may wrap in signed/pointer types are folded, as long + as the value of the chrec is preserved. If FOLD_CONVERSIONS is NULL, + no such folding is done. SIZE_EXPR is used for computing the size of the expression to be instantiated, and to stop if it exceeds some limit. */ @@ -2386,7 +2385,7 @@ instantiate_scev_name (basic_block instantiate_below, static tree instantiate_scev_poly (basic_block instantiate_below, struct loop *evolution_loop, struct loop *, - tree chrec, bool fold_conversions, int size_expr) + tree chrec, bool *fold_conversions, int size_expr) { tree op1; tree op0 = instantiate_scev_r (instantiate_below, evolution_loop, @@ -2420,9 +2419,10 @@ instantiate_scev_poly (basic_block instantiate_below, CACHE is the cache of already instantiated values. - FOLD_CONVERSIONS should be set to true when the conversions that - may wrap in signed/pointer type are folded, as long as the value of - the chrec is preserved. + The variable pointed to by FOLD_CONVERSIONS is set to true when + conversions that may wrap in signed/pointer types are folded, as long + as the value of the chrec is preserved. If FOLD_CONVERSIONS is NULL, + no such folding is done. SIZE_EXPR is used for computing the size of the expression to be instantiated, and to stop if it exceeds some limit.
*/ @@ -2432,7 +2432,7 @@ instantiate_scev_binary (basic_block instantiate_below, struct loop *evolution_loop, struct loop *inner_loop, tree chrec, enum tree_code code, tree type, tree c0, tree c1, - bool fold_conversions, int size_expr) + bool *fold_conversions, int size_expr) { tree op1; tree op0 = instantiate_scev_r (instantiate_below, evolution_loop, inner_loop, @@ -2478,9 +2478,10 @@ instantiate_scev_binary (basic_block instantiate_below, CACHE is the cache of already instantiated values. - FOLD_CONVERSIONS should be set to true when the conversions that - may wrap in signed/pointer type are folded, as long as the value of - the chrec is preserved. + The variable pointed to by FOLD_CONVERSIONS is set to true when + conversions that may wrap in signed/pointer types are folded, as long + as the value of the chrec is preserved. If FOLD_CONVERSIONS is NULL, + no such folding is done. SIZE_EXPR is used for computing the size of the expression to be instantiated, and to stop if it exceeds some limit. */ @@ -2488,7 +2489,7 @@ static tree instantiate_array_ref (basic_block instantiate_below, struct loop *evolution_loop, struct loop *inner_loop, - tree chrec, bool fold_conversions, int size_expr) + tree chrec, bool *fold_conversions, int size_expr) { tree res; tree index = TREE_OPERAND (chrec, 1); @@ -2515,9 +2516,10 @@ instantiate_array_ref (basic_block instantiate_below, CACHE is the cache of already instantiated values. - FOLD_CONVERSIONS should be set to true when the conversions that - may wrap in signed/pointer type are folded, as long as the value of - the chrec is preserved. + The variable pointed to by FOLD_CONVERSIONS is set to true when + conversions that may wrap in signed/pointer types are folded, as long + as the value of the chrec is preserved. If FOLD_CONVERSIONS is NULL, + no such folding is done. SIZE_EXPR is used for computing the size of the expression to be instantiated, and to stop if it exceeds some limit. */ @@ -2526,7 +2528,7 @@ static tree instantiate_scev_convert (basic_block instantiate_below, struct loop *evolution_loop, struct loop *inner_loop, tree chrec, tree type, tree op, - bool fold_conversions, int size_expr) + bool *fold_conversions, int size_expr) { tree op0 = instantiate_scev_r (instantiate_below, evolution_loop, inner_loop, op, @@ -2537,19 +2539,21 @@ instantiate_scev_convert (basic_block instantiate_below, if (fold_conversions) { - tree tmp = chrec_convert_aggressive (type, op0); + tree tmp = chrec_convert_aggressive (type, op0, fold_conversions); if (tmp) return tmp; - } - if (chrec && op0 == op) - return chrec; + /* If we used chrec_convert_aggressive, we can no longer assume that + signed chrecs do not overflow, as chrec_convert does, so avoid + calling it in that case. */ + if (*fold_conversions) + { + if (chrec && op0 == op) + return chrec; - /* If we used chrec_convert_aggressive, we can no longer assume that - signed chrecs do not overflow, as chrec_convert does, so avoid - calling it in that case. */ - if (fold_conversions) - return fold_convert (type, op0); + return fold_convert (type, op0); + } + } return chrec_convert (type, op0, NULL); } @@ -2563,9 +2567,10 @@ instantiate_scev_convert (basic_block instantiate_below, CACHE is the cache of already instantiated values. - FOLD_CONVERSIONS should be set to true when the conversions that - may wrap in signed/pointer type are folded, as long as the value of - the chrec is preserved.
+ The variable pointed to by FOLD_CONVERSIONS is set to true when + conversions that may wrap in signed/pointer types are folded, as long + as the value of the chrec is preserved. If FOLD_CONVERSIONS is NULL, + no such folding is done. SIZE_EXPR is used for computing the size of the expression to be instantiated, and to stop if it exceeds some limit. */ @@ -2575,7 +2580,7 @@ instantiate_scev_not (basic_block instantiate_below, struct loop *evolution_loop, struct loop *inner_loop, tree chrec, enum tree_code code, tree type, tree op, - bool fold_conversions, int size_expr) + bool *fold_conversions, int size_expr) { tree op0 = instantiate_scev_r (instantiate_below, evolution_loop, inner_loop, op, @@ -2613,9 +2618,10 @@ instantiate_scev_not (basic_block instantiate_below, CACHE is the cache of already instantiated values. - FOLD_CONVERSIONS should be set to true when the conversions that - may wrap in signed/pointer type are folded, as long as the value of - the chrec is preserved. + The variable pointed to by FOLD_CONVERSIONS is set to true when + conversions that may wrap in signed/pointer types are folded, as long + as the value of the chrec is preserved. If FOLD_CONVERSIONS is NULL, + no such folding is done. SIZE_EXPR is used for computing the size of the expression to be instantiated, and to stop if it exceeds some limit. */ @@ -2624,7 +2630,7 @@ static tree instantiate_scev_3 (basic_block instantiate_below, struct loop *evolution_loop, struct loop *inner_loop, tree chrec, - bool fold_conversions, int size_expr) + bool *fold_conversions, int size_expr) { tree op1, op2; tree op0 = instantiate_scev_r (instantiate_below, evolution_loop, @@ -2661,9 +2667,10 @@ instantiate_scev_3 (basic_block instantiate_below, CACHE is the cache of already instantiated values. - FOLD_CONVERSIONS should be set to true when the conversions that - may wrap in signed/pointer type are folded, as long as the value of - the chrec is preserved. + The variable pointed to by FOLD_CONVERSIONS is set to true when + conversions that may wrap in signed/pointer types are folded, as long + as the value of the chrec is preserved. If FOLD_CONVERSIONS is NULL, + no such folding is done. SIZE_EXPR is used for computing the size of the expression to be instantiated, and to stop if it exceeds some limit. */ @@ -2672,7 +2679,7 @@ static tree instantiate_scev_2 (basic_block instantiate_below, struct loop *evolution_loop, struct loop *inner_loop, tree chrec, - bool fold_conversions, int size_expr) + bool *fold_conversions, int size_expr) { tree op1; tree op0 = instantiate_scev_r (instantiate_below, evolution_loop, @@ -2701,9 +2708,10 @@ instantiate_scev_2 (basic_block instantiate_below, CACHE is the cache of already instantiated values. - FOLD_CONVERSIONS should be set to true when the conversions that - may wrap in signed/pointer type are folded, as long as the value of - the chrec is preserved. + The variable pointed to by FOLD_CONVERSIONS is set to true when + conversions that may wrap in signed/pointer types are folded, as long + as the value of the chrec is preserved. If FOLD_CONVERSIONS is NULL, + no such folding is done. SIZE_EXPR is used for computing the size of the expression to be instantiated, and to stop if it exceeds some limit.
*/ @@ -2712,7 +2720,7 @@ static tree instantiate_scev_1 (basic_block instantiate_below, struct loop *evolution_loop, struct loop *inner_loop, tree chrec, - bool fold_conversions, int size_expr) + bool *fold_conversions, int size_expr) { tree op0 = instantiate_scev_r (instantiate_below, evolution_loop, inner_loop, TREE_OPERAND (chrec, 0), @@ -2734,9 +2742,10 @@ instantiate_scev_1 (basic_block instantiate_below, CACHE is the cache of already instantiated values. - FOLD_CONVERSIONS should be set to true when the conversions that - may wrap in signed/pointer type are folded, as long as the value of - the chrec is preserved. + The variable pointed to by FOLD_CONVERSIONS is set to true when + conversions that may wrap in signed/pointer types are folded, as long + as the value of the chrec is preserved. If FOLD_CONVERSIONS is NULL, + no such folding is done. SIZE_EXPR is used for computing the size of the expression to be instantiated, and to stop if it exceeds some limit. */ @@ -2745,7 +2754,7 @@ static tree instantiate_scev_r (basic_block instantiate_below, struct loop *evolution_loop, struct loop *inner_loop, tree chrec, - bool fold_conversions, int size_expr) + bool *fold_conversions, int size_expr) { /* Give up if the expression is larger than the MAX that we allow. */ if (size_expr++ > PARAM_VALUE (PARAM_SCEV_MAX_EXPR_SIZE)) @@ -2870,7 +2879,7 @@ instantiate_scev (basic_block instantiate_below, struct loop *evolution_loop, } res = instantiate_scev_r (instantiate_below, evolution_loop, - NULL, chrec, false, 0); + NULL, chrec, NULL, 0); if (destr) { @@ -2894,9 +2903,10 @@ instantiate_scev (basic_block instantiate_below, struct loop *evolution_loop, of an expression. */ tree -resolve_mixers (struct loop *loop, tree chrec) +resolve_mixers (struct loop *loop, tree chrec, bool *folded_casts) { bool destr = false; + bool fold_conversions = false; if (!global_cache) { global_cache = new instantiate_cache_type; @@ -2904,7 +2914,10 @@ resolve_mixers (struct loop *loop, tree chrec) } tree ret = instantiate_scev_r (block_before_loop (loop), loop, NULL, - chrec, true, 0); + chrec, &fold_conversions, 0); + + if (folded_casts && !*folded_casts) + *folded_casts = fold_conversions; if (destr) { @@ -3369,7 +3382,8 @@ scev_const_prop (void) && !INTEGRAL_TYPE_P (type)) continue; - ev = resolve_mixers (loop, analyze_scalar_evolution (loop, name)); + ev = resolve_mixers (loop, analyze_scalar_evolution (loop, name), + NULL); if (!is_gimple_min_invariant (ev) || !may_propagate_copy (name, ev)) continue; diff --git a/gcc-4.9/gcc/tree-scalar-evolution.h b/gcc-4.9/gcc/tree-scalar-evolution.h index 556997684..cb9af5122 100644 --- a/gcc-4.9/gcc/tree-scalar-evolution.h +++ b/gcc-4.9/gcc/tree-scalar-evolution.h @@ -31,7 +31,7 @@ extern void scev_reset_htab (void); extern void scev_finalize (void); extern tree analyze_scalar_evolution (struct loop *, tree); extern tree instantiate_scev (basic_block, struct loop *, tree); -extern tree resolve_mixers (struct loop *, tree); +extern tree resolve_mixers (struct loop *, tree, bool *); extern void gather_stats_on_scev_database (void); extern unsigned int scev_const_prop (void); extern bool expression_expensive_p (tree); diff --git a/gcc-4.9/gcc/tree-ssa-loop-ivopts.c b/gcc-4.9/gcc/tree-ssa-loop-ivopts.c index c5a5dd48a..0bf8e2aff 100644 --- a/gcc-4.9/gcc/tree-ssa-loop-ivopts.c +++ b/gcc-4.9/gcc/tree-ssa-loop-ivopts.c @@ -142,9 +142,10 @@ struct iv tree base_object; /* A memory object to that the induction variable points. */ tree step; /* Step of the iv (constant only).
*/ tree ssa_name; /* The ssa name with the value. */ + unsigned use_id; /* The identifier in the use if it is the case. */ bool biv_p; /* Is it a biv? */ bool have_use_for; /* Do we already have a use for it? */ - unsigned use_id; /* The identifier in the use if it is the case. */ + bool no_overflow; /* True if the iv doesn't overflow. */ }; /* Per-ssa version information (induction variable descriptions, etc.). */ @@ -197,6 +198,7 @@ struct cost_pair struct iv_use { unsigned id; /* The id of the use. */ + unsigned sub_id; /* The id of the sub use. */ enum use_type type; /* Type of the use. */ struct iv *iv; /* The induction variable it is based on. */ gimple stmt; /* Statement in that it occurs. */ @@ -210,6 +212,11 @@ struct iv_use struct iv_cand *selected; /* The selected candidate. */ + + struct iv_use *next; /* The next sub use. */ + tree addr_base; /* Base address with const offset stripped. */ + unsigned HOST_WIDE_INT addr_offset; + /* Const offset stripped from base address. */ }; /* The position where the iv is computed. */ @@ -522,7 +529,11 @@ dump_iv (FILE *file, struct iv *iv) void dump_use (FILE *file, struct iv_use *use) { - fprintf (file, "use %d\n", use->id); + fprintf (file, "use %d", use->id); + if (use->sub_id) + fprintf (file, ".%d", use->sub_id); + + fprintf (file, "\n"); switch (use->type) { @@ -571,8 +582,12 @@ dump_uses (FILE *file, struct ivopts_data *data) for (i = 0; i < n_iv_uses (data); i++) { use = iv_use (data, i); - - dump_use (file, use); + do + { + dump_use (file, use); + use = use->next; + } + while (use); fprintf (file, "\n"); } } @@ -929,10 +944,10 @@ determine_base_object (tree expr) } /* Allocates an induction variable with given initial value BASE and step STEP - for loop LOOP. */ + for loop LOOP. NO_OVERFLOW implies the iv doesn't overflow. */ static struct iv * -alloc_iv (tree base, tree step) +alloc_iv (tree base, tree step, bool no_overflow = false) { tree base_object = base; struct iv *iv = XCNEW (struct iv); @@ -963,21 +978,24 @@ alloc_iv (tree base, tree step) iv->have_use_for = false; iv->use_id = 0; iv->ssa_name = NULL_TREE; + iv->no_overflow = no_overflow; return iv; } -/* Sets STEP and BASE for induction variable IV. */ +/* Sets STEP and BASE for induction variable IV. NO_OVERFLOW implies the IV + doesn't overflow. */ static void -set_iv (struct ivopts_data *data, tree iv, tree base, tree step) +set_iv (struct ivopts_data *data, tree iv, tree base, tree step, + bool no_overflow) { struct version_info *info = name_info (data, iv); gcc_assert (!info->iv); bitmap_set_bit (data->relevant, SSA_NAME_VERSION (iv)); - info->iv = alloc_iv (base, step); + info->iv = alloc_iv (base, step, no_overflow); info->iv->ssa_name = iv; } @@ -999,37 +1017,19 @@ get_iv (struct ivopts_data *data, tree var) if (!bb || !flow_bb_inside_loop_p (data->current_loop, bb)) - set_iv (data, var, var, build_int_cst (type, 0)); + set_iv (data, var, var, build_int_cst (type, 0), true); } return name_info (data, var)->iv; } -/* Determines the step of a biv defined in PHI. Returns NULL if PHI does - not define a simple affine biv with nonzero step. */ - -static tree -determine_biv_step (gimple phi) -{ - struct loop *loop = gimple_bb (phi)->loop_father; - tree name = PHI_RESULT (phi); - affine_iv iv; - - if (virtual_operand_p (name)) - return NULL_TREE; - - if (!simple_iv (loop, loop, name, &iv, true)) - return NULL_TREE; - - return integer_zerop (iv.step) ? NULL_TREE : iv.step; -} - /* Finds basic ivs. 
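+ A basic iv here is a loop PHI whose evolution simple_iv can represent as {base, +, step} with an invariant, nonzero step; e.g. i in for (i = 0; i < n; i += 4).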
*/ static bool find_bivs (struct ivopts_data *data) { gimple phi; + affine_iv iv; tree step, type, base; bool found = false; struct loop *loop = data->current_loop; @@ -1042,10 +1042,16 @@ find_bivs (struct ivopts_data *data) if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (PHI_RESULT (phi))) continue; - step = determine_biv_step (phi); - if (!step) + if (virtual_operand_p (PHI_RESULT (phi))) continue; + if (!simple_iv (loop, loop, PHI_RESULT (phi), &iv, true)) + continue; + + if (integer_zerop (iv.step)) + continue; + + step = iv.step; base = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop)); base = expand_simple_operations (base); if (contains_abnormal_ssa_name_p (base) @@ -1062,7 +1068,7 @@ find_bivs (struct ivopts_data *data) step = fold_convert (type, step); } - set_iv (data, PHI_RESULT (phi), base, step); + set_iv (data, PHI_RESULT (phi), base, step, iv.no_overflow); found = true; } @@ -1158,7 +1164,7 @@ find_givs_in_stmt (struct ivopts_data *data, gimple stmt) if (!find_givs_in_stmt_scev (data, stmt, &iv)) return; - set_iv (data, gimple_assign_lhs (stmt), iv.base, iv.step); + set_iv (data, gimple_assign_lhs (stmt), iv.base, iv.step, iv.no_overflow); } /* Finds general ivs in basic block BB. */ @@ -1229,33 +1235,88 @@ find_induction_variables (struct ivopts_data *data) return true; } -/* Records a use of type USE_TYPE at *USE_P in STMT whose value is IV. */ +/* Records a use of type USE_TYPE at *USE_P in STMT whose value is IV. + For address type use, ADDR_BASE is the stripped IV base, ADDR_OFFSET + is the const offset stripped from IV base. For uses of other types, + ADDR_BASE and ADDR_OFFSET are zero by default. */ static struct iv_use * record_use (struct ivopts_data *data, tree *use_p, struct iv *iv, - gimple stmt, enum use_type use_type) + gimple stmt, enum use_type use_type, tree addr_base = NULL, + unsigned HOST_WIDE_INT addr_offset = 0) { struct iv_use *use = XCNEW (struct iv_use); use->id = n_iv_uses (data); + use->sub_id = 0; use->type = use_type; use->iv = iv; use->stmt = stmt; use->op_p = use_p; use->related_cands = BITMAP_ALLOC (NULL); + use->next = NULL; + use->addr_base = addr_base; + use->addr_offset = addr_offset; /* To avoid showing ssa name in the dumps, if it was not reset by the caller. */ iv->ssa_name = NULL_TREE; - if (dump_file && (dump_flags & TDF_DETAILS)) - dump_use (dump_file, use); - data->iv_uses.safe_push (use); return use; } +/* Records a sub use of type USE_TYPE at *USE_P in STMT whose value is IV. + The sub use is recorded under the one whose use id is ID_GROUP. */ + +static struct iv_use * +record_sub_use (struct ivopts_data *data, tree *use_p, + struct iv *iv, gimple stmt, enum use_type use_type, + tree addr_base, unsigned HOST_WIDE_INT addr_offset, + unsigned int id_group) +{ + struct iv_use *use = XCNEW (struct iv_use); + struct iv_use *group = iv_use (data, id_group); + + use->id = group->id; + use->sub_id = 0; + use->type = use_type; + use->iv = iv; + use->stmt = stmt; + use->op_p = use_p; + use->related_cands = NULL; + use->addr_base = addr_base; + use->addr_offset = addr_offset; + + /* Sub use list is maintained in offset ascending order. 
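+ E.g. a use with offset 8 is inserted between existing sub uses with offsets 0 and 16; when the new use becomes the head of the list, it also takes over the group's related_cands bitmap, since only the head of a group carries one.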
*/ + if (addr_offset <= group->addr_offset) + { + use->related_cands = group->related_cands; + group->related_cands = NULL; + use->next = group; + data->iv_uses[id_group] = use; + } + else + { + struct iv_use *pre; + do + { + pre = group; + group = group->next; + } + while (group && addr_offset > group->addr_offset); + use->next = pre->next; + pre->next = use; + } + + /* To avoid showing ssa name in the dumps, if it was not reset by the + caller. */ + iv->ssa_name = NULL_TREE; + + return use; +} + /* Checks whether OP is a loop-level invariant and if so, records it. NONLINEAR_USE is true if the invariant is used in a way we do not handle specially. */ @@ -1515,6 +1576,7 @@ idx_find_step (tree base, tree *idx, void *data) { struct ifs_ivopts_data *dta = (struct ifs_ivopts_data *) data; struct iv *iv; + bool use_overflow_semantics = false; tree step, iv_base, iv_step, lbound, off; struct loop *loop = dta->ivopts_data->current_loop; @@ -1574,9 +1636,12 @@ idx_find_step (tree base, tree *idx, void *data) iv_base = iv->base; iv_step = iv->step; + if (iv->no_overflow && nowrap_type_p (TREE_TYPE (iv_step))) + use_overflow_semantics = true; + if (!convert_affine_scev (dta->ivopts_data->current_loop, sizetype, &iv_base, &iv_step, dta->stmt, - false)) + use_overflow_semantics)) { /* The index might wrap. */ return false; @@ -1739,6 +1804,50 @@ may_be_nonaddressable_p (tree expr) return false; } +static tree +strip_offset (tree expr, unsigned HOST_WIDE_INT *offset); + +/* Record a use of type USE_TYPE at *USE_P in STMT whose value is IV. + If there is an existing use which has the same stripped iv base and + step, this function records this one as a sub use of that; otherwise + it is recorded as a normal one. */ + +static struct iv_use * +record_group_use (struct ivopts_data *data, tree *use_p, + struct iv *iv, gimple stmt, enum use_type use_type) +{ + unsigned int i; + struct iv_use *use; + tree addr_base; + unsigned HOST_WIDE_INT addr_offset; + + /* Sub uses are only supported for address type uses, that is, those + with a base object. */ + if (!iv->base_object) + return record_use (data, use_p, iv, stmt, use_type); + + addr_base = strip_offset (iv->base, &addr_offset); + for (i = 0; i < n_iv_uses (data); i++) + { + use = iv_use (data, i); + if (use->type != USE_ADDRESS || !use->iv->base_object) + continue; + + /* Check if it has the same stripped base and step. */ + if (operand_equal_p (iv->base_object, use->iv->base_object, 0) + && operand_equal_p (iv->step, use->iv->step, 0) + && operand_equal_p (addr_base, use->addr_base, 0)) + break; + } + + if (i == n_iv_uses (data)) + return record_use (data, use_p, iv, stmt, + use_type, addr_base, addr_offset); + else + return record_sub_use (data, use_p, iv, stmt, + use_type, addr_base, addr_offset, i); +} + /* Finds addresses in *OP_P inside STMT. */ static void @@ -1849,7 +1958,7 @@ find_interesting_uses_address (struct ivopts_data *data, gimple stmt, tree *op_p } civ = alloc_iv (base, step); - record_use (data, op_p, civ, stmt, USE_ADDRESS); + record_group_use (data, op_p, civ, stmt, USE_ADDRESS); return; fail: @@ -2035,6 +2144,172 @@ find_interesting_uses (struct ivopts_data *data) free (body); } +/* Compute maximum offset of [base + offset] addressing mode + for the memory reference represented by USE.
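+ The offset is found by probing candidate addresses with memory_address_addr_space_p and is cached per address space and memory mode; e.g. on a target such as AArch64, whose scaled 12-bit unsigned immediate form accepts DFmode offsets up to 32760, that is the sort of value recorded.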
*/ + +static HOST_WIDE_INT +compute_max_addr_offset (struct iv_use *use) +{ + int width; + rtx reg, addr; + HOST_WIDE_INT i, off; + unsigned list_index, num; + addr_space_t as; + machine_mode mem_mode, addr_mode; + static vec<HOST_WIDE_INT> max_offset_list; + + as = TYPE_ADDR_SPACE (TREE_TYPE (use->iv->base)); + mem_mode = TYPE_MODE (TREE_TYPE (*use->op_p)); + + num = max_offset_list.length (); + list_index = (unsigned) as * MAX_MACHINE_MODE + (unsigned) mem_mode; + if (list_index >= num) + { + max_offset_list.safe_grow (list_index + MAX_MACHINE_MODE); + for (; num < max_offset_list.length (); num++) + max_offset_list[num] = -1; + } + + off = max_offset_list[list_index]; + if (off != -1) + return off; + + addr_mode = targetm.addr_space.address_mode (as); + reg = gen_raw_REG (addr_mode, LAST_VIRTUAL_REGISTER + 1); + addr = gen_rtx_fmt_ee (PLUS, addr_mode, reg, NULL_RTX); + + width = GET_MODE_BITSIZE (addr_mode) - 1; + if (width > (HOST_BITS_PER_WIDE_INT - 1)) + width = HOST_BITS_PER_WIDE_INT - 1; + + for (i = width; i > 0; i--) + { + off = ((unsigned HOST_WIDE_INT) 1 << i) - 1; + XEXP (addr, 1) = gen_int_mode (off, addr_mode); + if (memory_address_addr_space_p (mem_mode, addr, as)) + break; + + /* For some strict-alignment targets, the offset must be naturally + aligned. Try an aligned offset if mem_mode is not QImode. */ + off = ((unsigned HOST_WIDE_INT) 1 << i); + if (off > GET_MODE_SIZE (mem_mode) && mem_mode != QImode) + { + off -= GET_MODE_SIZE (mem_mode); + XEXP (addr, 1) = gen_int_mode (off, addr_mode); + if (memory_address_addr_space_p (mem_mode, addr, as)) + break; + } + } + if (i == 0) + off = 0; + + max_offset_list[list_index] = off; + return off; +} + +/* Check if all small groups should be split. Return true if and + only if: + + 1) At least one group contains two uses with different offsets. + 2) No group contains more than two uses with different offsets. + + Return false otherwise. We want to split such groups because: + + 1) Small groups don't have much benefit and may interfere with + general candidate selection. + 2) The problem size with only small groups is usually small, and + the general algorithm can handle it well. + + TODO -- The above claim may not hold when auto increment is supported. */ + +static bool +split_all_small_groups (struct ivopts_data *data) +{ + bool split_p = false; + unsigned int i, n, distinct; + struct iv_use *pre, *use; + + n = n_iv_uses (data); + for (i = 0; i < n; i++) + { + use = iv_use (data, i); + if (!use->next) + continue; + + distinct = 1; + gcc_assert (use->type == USE_ADDRESS); + for (pre = use, use = use->next; use; pre = use, use = use->next) + { + if (pre->addr_offset != use->addr_offset) + distinct++; + + if (distinct > 2) + return false; + } + if (distinct == 2) + split_p = true; + } + + return split_p; +} + +/* For each group of address type uses, this function further groups + these uses according to the maximum offset supported by the target's + [base + offset] addressing mode. */ + +static void +group_address_uses (struct ivopts_data *data) +{ + HOST_WIDE_INT max_offset = -1; + unsigned int i, n, sub_id; + struct iv_use *pre, *use; + unsigned HOST_WIDE_INT addr_offset_first; + + /* Reset max offset to split all small groups.
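+ (With max_offset forced to 0 the fitting test below fails for any use whose offset differs from the first one's, so each distinct offset then starts a group of its own.)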
*/ + if (split_all_small_groups (data)) + max_offset = 0; + + n = n_iv_uses (data); + for (i = 0; i < n; i++) + { + use = iv_use (data, i); + if (!use->next) + continue; + + gcc_assert (use->type == USE_ADDRESS); + if (max_offset != 0) + max_offset = compute_max_addr_offset (use); + + while (use) + { + sub_id = 0; + addr_offset_first = use->addr_offset; + /* Only uses whose offset relative to the first use fits in the + offset part of the addressing mode can be grouped together. */ + for (pre = use, use = use->next; + use && (use->addr_offset - addr_offset_first + <= (unsigned HOST_WIDE_INT) max_offset); + pre = use, use = use->next) + { + use->id = pre->id; + use->sub_id = ++sub_id; + } + + /* Break the list and create a new group. */ + if (use) + { + pre->next = NULL; + use->id = n_iv_uses (data); + use->related_cands = BITMAP_ALLOC (NULL); + data->iv_uses.safe_push (use); + } + } + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + dump_uses (dump_file, data); +} + /* Strips constant offsets from EXPR and stores them to OFFSET. If INSIDE_ADDR is true, assume we are inside an address. If TOP_COMPREF is true, assume we are at the top-level of the processed address. */ @@ -2458,6 +2733,8 @@ static void add_candidate (struct ivopts_data *data, tree base, tree step, bool important, struct iv_use *use) { + gcc_assert (use == NULL || use->sub_id == 0); + if (ip_normal_pos (data->current_loop)) add_candidate_1 (data, base, step, important, IP_NORMAL, use, NULL); if (ip_end_pos (data->current_loop) @@ -2687,11 +2964,22 @@ new_cost (unsigned runtime, unsigned complexity) return cost; } +/* Returns true if COST is infinite. */ + +static bool +infinite_cost_p (comp_cost cost) +{ + return cost.cost == INFTY; +} + /* Adds costs COST1 and COST2. */ static comp_cost add_costs (comp_cost cost1, comp_cost cost2) { + if (infinite_cost_p (cost1) || infinite_cost_p (cost2)) + return infinite_cost; + cost1.cost += cost2.cost; cost1.complexity += cost2.complexity; @@ -2720,14 +3008,6 @@ compare_costs (comp_cost cost1, comp_cost cost2) return cost1.cost - cost2.cost; } -/* Returns true if COST is infinite. */ - -static bool -infinite_cost_p (comp_cost cost) -{ - return cost.cost == INFTY; -} - /* Sets cost of (USE, CANDIDATE) pair to COST and record that it depends on invariants DEPENDS_ON and that the value used in expressing it is VALUE, and in case of iv elimination the comparison operator is COMP. */ @@ -4204,7 +4484,15 @@ get_computation_cost_at (struct ivopts_data *data, cost.cost += add_cost (data->speed, TYPE_MODE (ctype)); } - if (inv_expr_id) + /* The set of invariants depended on by a sub use has already been + computed for the first use in the group.
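+ For a sub use the cost accumulated above is therefore reset to zero and the depends_on set is cleared: both are accounted only once per group, for its first use.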
*/ + if (use->sub_id) + { + cost.cost = 0; + if (depends_on && *depends_on) + bitmap_clear (*depends_on); + } + else if (inv_expr_id) { *inv_expr_id = get_loop_invariant_expr_id (data, ubase, cbase, ratio, address_p); @@ -4333,6 +4621,8 @@ determine_use_iv_cost_address (struct ivopts_data *data, bitmap depends_on; bool can_autoinc; int inv_expr_id = -1; + struct iv_use *sub_use; + comp_cost sub_cost; comp_cost cost = get_computation_cost (data, use, cand, true, &depends_on, &can_autoinc, &inv_expr_id); @@ -4346,6 +4636,15 @@ determine_use_iv_cost_address (struct ivopts_data *data, else if (cand->pos == IP_AFTER_USE || cand->pos == IP_BEFORE_USE) cost = infinite_cost; } + for (sub_use = use->next; + sub_use && !infinite_cost_p (cost); + sub_use = sub_use->next) + { + sub_cost = get_computation_cost (data, sub_use, cand, true, &depends_on, + &can_autoinc, &inv_expr_id); + cost = add_costs (cost, sub_cost); + } + set_use_iv_cost (data, use, cand, cost, depends_on, NULL_TREE, ERROR_MARK, inv_expr_id); @@ -6533,8 +6832,8 @@ adjust_iv_update_pos (struct iv_cand *cand, struct iv_use *use) /* Rewrites USE (address that is an iv) using candidate CAND. */ static void -rewrite_use_address (struct ivopts_data *data, - struct iv_use *use, struct iv_cand *cand) +rewrite_use_address_1 (struct ivopts_data *data, + struct iv_use *use, struct iv_cand *cand) { aff_tree aff; gimple_stmt_iterator bsi = gsi_for_stmt (use->stmt); @@ -6569,6 +6868,28 @@ rewrite_use_address (struct ivopts_data *data, *use->op_p = ref; } +/* Rewrites USE (address that is an iv) using candidate CAND. If it's the + first use of a group, rewrites sub uses in the group too. */ + +static void +rewrite_use_address (struct ivopts_data *data, + struct iv_use *use, struct iv_cand *cand) +{ + struct iv_use *next; + + gcc_assert (use->sub_id == 0); + rewrite_use_address_1 (data, use, cand); + update_stmt (use->stmt); + + for (next = use->next; next != NULL; next = next->next) + { + rewrite_use_address_1 (data, next, cand); + update_stmt (next->stmt); + } + + return; +} + /* Rewrites USE (the condition such that one of the arguments is an iv) using candidate CAND. */ @@ -6845,6 +7166,18 @@ free_loop_data (struct ivopts_data *data) for (i = 0; i < n_iv_uses (data); i++) { struct iv_use *use = iv_use (data, i); + struct iv_use *pre = use, *sub = use->next; + + while (sub) + { + gcc_assert (sub->related_cands == NULL); + gcc_assert (sub->n_map_members == 0 && sub->cost_map == NULL); + + free (sub->iv); + pre = sub; + sub = sub->next; + free (pre); + } free (use->iv); BITMAP_FREE (use->related_cands); @@ -6964,6 +7297,7 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop) /* Finds interesting uses (item 1). */ find_interesting_uses (data); + group_address_uses (data); if (n_iv_uses (data) > MAX_CONSIDERED_USES) goto finish; diff --git a/gcc-4.9/gcc/tree-ssa-loop-niter.c b/gcc-4.9/gcc/tree-ssa-loop-niter.c index 8fb72b6a5..cb9214fbb 100644 --- a/gcc-4.9/gcc/tree-ssa-loop-niter.c +++ b/gcc-4.9/gcc/tree-ssa-loop-niter.c @@ -22,6 +22,7 @@ along with GCC; see the file COPYING3. If not see #include "coretypes.h" #include "tm.h" #include "tree.h" +#include "stor-layout.h" #include "calls.h" #include "expr.h" #include "tm_p.h" @@ -55,7 +56,7 @@ along with GCC; see the file COPYING3. 
If not see #include "tree-pass.h" #include "stringpool.h" #include "tree-ssanames.h" - +#include "ggc.h" #define SWAP(X, Y) do { affine_iv *tmp = (X); (X) = (Y); (Y) = tmp; } while (0) @@ -1161,6 +1162,7 @@ number_of_iterations_lt (tree type, affine_iv *iv0, affine_iv *iv1, iv1->base, iv0->base); niter->niter = delta; niter->max = mpz_get_double_int (niter_type, bnds->up, false); + niter->control.no_overflow = true; return true; } @@ -1938,6 +1940,9 @@ number_of_iterations_exit (struct loop *loop, edge exit, return false; niter->assumptions = boolean_false_node; + niter->control.base = NULL_TREE; + niter->control.step = NULL_TREE; + niter->control.no_overflow = false; stmt = last_stmt (exit->src); if (!stmt || gimple_code (stmt) != GIMPLE_COND) return false; @@ -2714,6 +2719,29 @@ record_estimate (struct loop *loop, tree bound, double_int i_bound, record_niter_bound (loop, i_bound, realistic, upper); } +/* Records the control iv analyzed in NITER for LOOP if the iv is valid + and doesn't overflow. */ + +static void +record_control_iv (struct loop *loop, struct tree_niter_desc *niter) +{ + struct control_iv *iv; + + if (!niter->control.base || !niter->control.step) + return; + + if (!integer_onep (niter->assumptions) || !niter->control.no_overflow) + return; + + iv = ggc_alloc_control_iv (); + iv->base = niter->control.base; + iv->step = niter->control.step; + iv->next = loop->control_ivs; + loop->control_ivs = iv; + + return; +} + /* Record the estimate on number of iterations of LOOP based on the fact that the induction variable BASE + STEP * i evaluated in STMT does not wrap and its values belong to the range <LOW, HIGH>. REALISTIC is true if the @@ -3452,6 +3480,7 @@ estimate_numbers_of_iterations_loop (struct loop *loop) record_estimate (loop, niter, niter_desc.max, last_stmt (ex->src), true, ex == likely_exit, true); + record_control_iv (loop, &niter_desc); } exits.release (); @@ -3759,6 +3788,189 @@ nowrap_type_p (tree type) return false; } +/* Return true if we can prove LOOP is exited before evolution of induction + variable {BASE, STEP} overflows with respect to its type bound. */ + +static bool +loop_exits_before_overflow (tree base, tree step, + gimple at_stmt, struct loop *loop) +{ + double_int niter; + struct control_iv *civ; + struct nb_iter_bound *bound; + tree e, delta, step_abs, unsigned_base; + tree type = TREE_TYPE (step); + tree unsigned_type, valid_niter; + + /* Don't issue signed overflow warnings. */ + fold_defer_overflow_warnings (); + + /* Compute the number of iterations before we reach the bound of the + type, and verify that the loop is exited before this occurs.
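+ For a positive step that bound is (MAX - base) / step iterations, computed below in the unsigned version of the type (and symmetrically (base - MIN) / |step| for a negative step), giving valid_niter.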
*/ + unsigned_type = unsigned_type_for (type); + unsigned_base = fold_convert (unsigned_type, base); + + if (tree_int_cst_sign_bit (step)) + { + tree extreme = fold_convert (unsigned_type, + lower_bound_in_type (type, type)); + delta = fold_build2 (MINUS_EXPR, unsigned_type, unsigned_base, extreme); + step_abs = fold_build1 (NEGATE_EXPR, unsigned_type, + fold_convert (unsigned_type, step)); + } + else + { + tree extreme = fold_convert (unsigned_type, + upper_bound_in_type (type, type)); + delta = fold_build2 (MINUS_EXPR, unsigned_type, extreme, unsigned_base); + step_abs = fold_convert (unsigned_type, step); + } + + valid_niter = fold_build2 (FLOOR_DIV_EXPR, unsigned_type, delta, step_abs); + + estimate_numbers_of_iterations_loop (loop); + + if (max_loop_iterations (loop, &niter) + && double_int_fits_to_tree_p (TREE_TYPE (valid_niter), niter) + && (e = fold_binary (GT_EXPR, boolean_type_node, valid_niter, + double_int_to_tree (TREE_TYPE (valid_niter), + niter))) != NULL + && integer_nonzerop (e)) + { + fold_undefer_and_ignore_overflow_warnings (); + return true; + } + if (at_stmt) + for (bound = loop->bounds; bound; bound = bound->next) + { + if (n_of_executions_at_most (at_stmt, bound, valid_niter)) + { + fold_undefer_and_ignore_overflow_warnings (); + return true; + } + } + fold_undefer_and_ignore_overflow_warnings (); + + /* Try to prove loop is exited before {base, step} overflows with the + help of analyzed loop control IV. This is done only for IVs with + constant step because otherwise we don't have the information. */ + if (TREE_CODE (step) == INTEGER_CST) + for (civ = loop->control_ivs; civ; civ = civ->next) + { + enum tree_code code; + tree stepped, extreme, civ_type = TREE_TYPE (civ->step); + + /* Have to consider type difference because operand_equal_p ignores + that for constants. */ + if (TYPE_UNSIGNED (type) != TYPE_UNSIGNED (civ_type) + || element_precision (type) != element_precision (civ_type)) + continue; + + /* Only consider control IV with same step. */ + if (!operand_equal_p (step, civ->step, 0)) + continue; + + /* Done proving if this is a no-overflow control IV. */ + if (operand_equal_p (base, civ->base, 0)) + return true; + + /* If this is a before stepping control IV, in other words, we have + + {civ_base, step} = {base + step, step} + + Because civ {base + step, step} doesn't overflow during loop + iterations, {base, step} will not overflow if we can prove the + operation "base + step" does not overflow. Specifically, we try + to prove below conditions are satisfied: + + base <= UPPER_BOUND (type) - step ;;step > 0 + base >= LOWER_BOUND (type) - step ;;step < 0 + + by proving the reverse conditions are false using loop's initial + condition. */ + stepped = fold_build2 (PLUS_EXPR, TREE_TYPE (base), base, step); + if (operand_equal_p (stepped, civ->base, 0)) + { + if (tree_int_cst_sign_bit (step)) + { + code = LT_EXPR; + extreme = lower_bound_in_type (type, type); + } + else + { + code = GT_EXPR; + extreme = upper_bound_in_type (type, type); + } + extreme = fold_build2 (MINUS_EXPR, type, extreme, step); + e = fold_build2 (code, boolean_type_node, base, extreme); + e = simplify_using_initial_conditions (loop, e); + if (integer_zerop (e)) + return true; + + continue; + } + + /* Similar to above, only in this case we have: + + {civ_base, step} = {(signed T)((unsigned T)base + step), step} + && TREE_TYPE (civ_base) = signed T. 
+ + We prove that the condition below is satisfied: + + (signed T)((unsigned T)base + step) + == (signed T)(unsigned T)base + step + == base + step + + for exactly the same reason as above. This also proves + there is no overflow in the operation "base + step", and thus + none in the induction variable {base, step} during loop + iterations. + + This is necessary to handle cases like the one below: + + int foo (int *a, signed char s, signed char l) + { + signed char i; + for (i = s; i < l; i++) + a[i] = 0; + return 0; + } + + The variable I is first converted to type unsigned char, + incremented, then converted back to type signed char. */ + if (!CONVERT_EXPR_P (civ->base) || TREE_TYPE (civ->base) != type) + continue; + e = TREE_OPERAND (civ->base, 0); + if (TREE_CODE (e) != PLUS_EXPR + || TREE_CODE (TREE_OPERAND (e, 1)) != INTEGER_CST + || !operand_equal_p (step, + fold_convert (type, + TREE_OPERAND (e, 1)), 0)) + continue; + e = TREE_OPERAND (e, 0); + if (!CONVERT_EXPR_P (e) || !operand_equal_p (e, unsigned_base, 0)) + continue; + e = TREE_OPERAND (e, 0); + gcc_assert (operand_equal_p (e, base, 0)); + if (tree_int_cst_sign_bit (step)) + { + code = LT_EXPR; + extreme = lower_bound_in_type (type, type); + } + else + { + code = GT_EXPR; + extreme = upper_bound_in_type (type, type); + } + extreme = fold_build2 (MINUS_EXPR, type, extreme, step); + e = fold_build2 (code, boolean_type_node, base, extreme); + e = simplify_using_initial_conditions (loop, e); + if (integer_zerop (e)) + return true; + } + + return false; +} + /* Return false only when the induction variable BASE + STEP * I is known to not overflow: i.e. when the number of iterations is small enough with respect to the step and initial condition in order to @@ -3774,13 +3986,6 @@ scev_probably_wraps_p (tree base, tree step, gimple at_stmt, struct loop *loop, bool use_overflow_semantics) { - tree delta, step_abs; - tree unsigned_type, valid_niter; - tree type = TREE_TYPE (step); - tree e; - double_int niter; - struct nb_iter_bound *bound; - /* FIXME: We really need something like http://gcc.gnu.org/ml/gcc-patches/2005-06/msg02025.html. @@ -3814,56 +4019,8 @@ scev_probably_wraps_p (tree base, tree step, if (TREE_CODE (step) != INTEGER_CST) return true; - /* Don't issue signed overflow warnings. */ - fold_defer_overflow_warnings (); - - /* Otherwise, compute the number of iterations before we reach the - bound of the type, and verify that the loop is exited before this - occurs.
*/ - unsigned_type = unsigned_type_for (type); - base = fold_convert (unsigned_type, base); - - if (tree_int_cst_sign_bit (step)) - { - tree extreme = fold_convert (unsigned_type, - lower_bound_in_type (type, type)); - delta = fold_build2 (MINUS_EXPR, unsigned_type, base, extreme); - step_abs = fold_build1 (NEGATE_EXPR, unsigned_type, - fold_convert (unsigned_type, step)); - } - else - { - tree extreme = fold_convert (unsigned_type, - upper_bound_in_type (type, type)); - delta = fold_build2 (MINUS_EXPR, unsigned_type, extreme, base); - step_abs = fold_convert (unsigned_type, step); - } - - valid_niter = fold_build2 (FLOOR_DIV_EXPR, unsigned_type, delta, step_abs); - - estimate_numbers_of_iterations_loop (loop); - - if (max_loop_iterations (loop, &niter) - && double_int_fits_to_tree_p (TREE_TYPE (valid_niter), niter) - && (e = fold_binary (GT_EXPR, boolean_type_node, valid_niter, - double_int_to_tree (TREE_TYPE (valid_niter), - niter))) != NULL - && integer_nonzerop (e)) - { - fold_undefer_and_ignore_overflow_warnings (); - return false; - } - if (at_stmt) - for (bound = loop->bounds; bound; bound = bound->next) - { - if (n_of_executions_at_most (at_stmt, bound, valid_niter)) - { - fold_undefer_and_ignore_overflow_warnings (); - return false; - } - } - - fold_undefer_and_ignore_overflow_warnings (); + if (loop_exits_before_overflow (base, step, at_stmt, loop)) + return false; /* At this point we still don't have a proof that the iv does not overflow: give up. */ @@ -3875,17 +4032,26 @@ scev_probably_wraps_p (tree base, tree step, void free_numbers_of_iterations_estimates_loop (struct loop *loop) { - struct nb_iter_bound *bound, *next; + struct control_iv *civ; + struct nb_iter_bound *bound; loop->nb_iterations = NULL; loop->estimate_state = EST_NOT_COMPUTED; - for (bound = loop->bounds; bound; bound = next) + for (bound = loop->bounds; bound;) { - next = bound->next; + struct nb_iter_bound *next = bound->next; ggc_free (bound); + bound = next; } loop->bounds = NULL; + + for (civ = loop->control_ivs; civ;) + { + struct control_iv *next = civ->next; + ggc_free (civ); + civ = next; + } + loop->control_ivs = NULL; } /* Frees the information on upper bounds on numbers of iterations of loops. */ diff --git a/gcc-4.9/gcc/tree-ssa-loop-niter.h b/gcc-4.9/gcc/tree-ssa-loop-niter.h index df0d64d02..dd2535851 100644 --- a/gcc-4.9/gcc/tree-ssa-loop-niter.h +++ b/gcc-4.9/gcc/tree-ssa-loop-niter.h @@ -41,6 +41,7 @@ extern void estimate_numbers_of_iterations (void); extern bool stmt_dominates_stmt_p (gimple, gimple); extern bool nowrap_type_p (tree); extern bool scev_probably_wraps_p (tree, tree, gimple, struct loop *, bool); +extern void free_loop_control_ivs (struct loop *); extern void free_numbers_of_iterations_estimates_loop (struct loop *); extern void free_numbers_of_iterations_estimates (void); extern void substitute_in_loop_info (struct loop *, tree, tree);