-rwxr-xr-x | build.py                                        | 116
-rw-r--r-- | gcc-4.9/gcc/cfgloop.h                            | 11
-rw-r--r-- | gcc-4.9/gcc/config/aarch64/aarch64-protos.h      | 13
-rw-r--r-- | gcc-4.9/gcc/config/aarch64/aarch64.c             | 1340
-rw-r--r-- | gcc-4.9/gcc/config/aarch64/aarch64.md            | 67
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/pr65447.c  | 54
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-10.c  | 21
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-3.c   | 2
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-4.c   | 2
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-8.c   | 62
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-9.c   | 21
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.dg/vect/pr48052.c      | 26
-rw-r--r-- | gcc-4.9/gcc/tree-chrec.c                         | 104
-rw-r--r-- | gcc-4.9/gcc/tree-chrec.h                         | 4
-rw-r--r-- | gcc-4.9/gcc/tree-scalar-evolution.c              | 138
-rw-r--r-- | gcc-4.9/gcc/tree-scalar-evolution.h              | 2
-rw-r--r-- | gcc-4.9/gcc/tree-ssa-loop-ivopts.c               | 436
-rw-r--r-- | gcc-4.9/gcc/tree-ssa-loop-niter.c                | 290
-rw-r--r-- | gcc-4.9/gcc/tree-ssa-loop-niter.h                | 1
19 files changed, 2157 insertions, 553 deletions
diff --git a/build.py b/build.py
--- a/build.py
+++ b/build.py
@@ -14,129 +14,51 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+"""Builds GCC for Android."""
 from __future__ import print_function
 
-import argparse
-import multiprocessing
 import os
-import subprocess
-import sys
+import site
+site.addsitedir(os.path.join(os.path.dirname(__file__), '../../ndk/build/lib'))
 
-def android_path(path=''):
-    top = os.getenv('ANDROID_BUILD_TOP', '')
-    return os.path.realpath(os.path.join(top, path))
+import build_support
 
 
-def get_default_package_dir():
-    return android_path('out/ndk')
-
-
-def get_default_host():
-    if sys.platform in ('linux', 'linux2'):
-        return 'linux'
-    elif sys.platform == 'darwin':
-        return 'darwin'
-    else:
-        raise RuntimeError('Unsupported host: {}'.format(sys.platform))
-
-
-ALL_TOOLCHAINS = (
-    'arm-linux-androideabi', 'aarch64-linux-android',
-    'mipsel-linux-android', 'mips64el-linux-android',
-    'x86', 'x86_64',
-)
-
-
-class ArgParser(argparse.ArgumentParser):
+class ArgParser(build_support.ArgParser):
     def __init__(self):
-        super(ArgParser, self).__init__(
-            description='Builds GCC for Android.')
+        super(ArgParser, self).__init__()
 
         self.add_argument(
-            '--host', choices=('darwin', 'linux', 'windows', 'windows64'),
-            help='Build binaries for given OS (e.g. linux).')
-        self.add_argument(
-            '--package-dir', help='Directory to place the packaged GCC.',
-            default=get_default_package_dir())
-        self.add_argument(
-            '--toolchain', choices=ALL_TOOLCHAINS,
+            '--toolchain', choices=build_support.ALL_TOOLCHAINS,
             help='Toolchain to build. Builds all if not present.')
 
 
-def toolchain_to_arch(toolchain):
-    return {
-        'arm-linux-androideabi': 'arm',
-        'aarch64-linux-android': 'arm64',
-        'mipsel-linux-android': 'mips',
-        'mips64el-linux-android': 'mips64',
-        'x86': 'x86',
-        'x86_64': 'x86_64',
-    }[toolchain]
-
-
-def default_api_level(arch):
-    if '64' in arch:
-        return 21
-    else:
-        return 9
-
-
-def sysroot_path(toolchain):
-    arch = toolchain_to_arch(toolchain)
-    version = default_api_level(arch)
-
-    prebuilt_ndk = 'prebuilts/ndk/current'
-    sysroot_subpath = 'platforms/android-{}/arch-{}'.format(version, arch)
-    return android_path(os.path.join(prebuilt_ndk, sysroot_subpath))
-
-
-def main():
-    args = ArgParser().parse_args()
-
-    if 'ANDROID_BUILD_TOP' not in os.environ:
-        top = os.path.join(os.path.dirname(__file__), '../..')
-        os.environ['ANDROID_BUILD_TOP'] = os.path.realpath(top)
-
-    os.chdir(android_path('toolchain/gcc'))
-
-    toolchain_path = android_path('toolchain')
-    ndk_path = android_path('ndk')
-
+def main(args):
     GCC_VERSION = '4.9'
 
-    jobs_arg = '-j{}'.format(multiprocessing.cpu_count() * 2)
-
-    package_dir_arg = '--package-dir={}'.format(args.package_dir)
-
-    toolchains = ALL_TOOLCHAINS
+    toolchains = build_support.ALL_TOOLCHAINS
     if args.toolchain is not None:
         toolchains = [args.toolchain]
 
-    host = args.host
-    if host is None:
-        host = get_default_host()
-
-    ndk_build_tools_path = android_path('ndk/build/tools')
-    build_env = dict(os.environ)
-    build_env['NDK_BUILDTOOLS_PATH'] = ndk_build_tools_path
-    build_env['ANDROID_NDK_ROOT'] = ndk_path
-
-    print('Building {} toolchains: {}'.format(host, ' '.join(toolchains)))
+    print('Building {} toolchains: {}'.format(args.host, ' '.join(toolchains)))
     for toolchain in toolchains:
         toolchain_name = '-'.join([toolchain, GCC_VERSION])
-        sysroot_arg = '--sysroot={}'.format(sysroot_path(toolchain))
+        sysroot_arg = '--sysroot={}'.format(
+            build_support.sysroot_path(toolchain))
         build_cmd = [
-            'bash', 'build-gcc.sh', toolchain_path, ndk_path, toolchain_name,
-            package_dir_arg, '--verbose', jobs_arg, sysroot_arg,
+            'bash', 'build-gcc.sh', build_support.toolchain_path(),
+            build_support.ndk_path(), toolchain_name, build_support.jobs_arg(),
+            sysroot_arg,
        ]
 
-        if host in ('windows', 'windows64'):
+        if args.host in ('windows', 'windows64'):
             build_cmd.append('--mingw')
 
-        if host != 'windows':
+        if args.host != 'windows':
             build_cmd.append('--try-64')
 
-        subprocess.check_call(build_cmd, env=build_env)
+        build_support.build(build_cmd, args)
 
 
 if __name__ == '__main__':
-    main()
+    build_support.run(main, ArgParser)
diff --git a/gcc-4.9/gcc/cfgloop.h b/gcc-4.9/gcc/cfgloop.h
index c7e417bf2..11bfc810f 100644
--- a/gcc-4.9/gcc/cfgloop.h
+++ b/gcc-4.9/gcc/cfgloop.h
@@ -100,6 +100,14 @@ enum loop_estimation
   EST_LAST
 };
 
+/* The structure describing non-overflow control induction variable for
+   loop's exit edge.  */
+struct GTY ((chain_next ("%h.next"))) control_iv {
+  tree base;
+  tree step;
+  struct control_iv *next;
+};
+
 /* Structure to hold information for each natural loop.  */
 struct GTY ((chain_next ("%h.next"))) loop {
   /* Index into loops array.  */
@@ -187,6 +195,9 @@ struct GTY ((chain_next ("%h.next"))) loop {
   /* Upper bound on number of iterations of a loop.  */
   struct nb_iter_bound *bounds;
 
+  /* Non-overflow control ivs of a loop.  */
+  struct control_iv *control_ivs;
+
   /* Head of the cyclic list of the exits of the loop.  */
   struct loop_exit *exits;
 
diff --git a/gcc-4.9/gcc/config/aarch64/aarch64-protos.h b/gcc-4.9/gcc/config/aarch64/aarch64-protos.h
index 8b0a70538..e78348ea0 100644
--- a/gcc-4.9/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc-4.9/gcc/config/aarch64/aarch64-protos.h
@@ -108,9 +108,22 @@ enum aarch64_symbol_type
    cost models and vectors for address cost calculations, register
    move costs and memory move costs.  */
 
+/* Scaled addressing modes can vary cost depending on the mode of the
+   value to be loaded/stored.  QImode values cannot use scaled
+   addressing modes.  */
+
+struct scale_addr_mode_cost
+{
+  const int hi;
+  const int si;
+  const int di;
+  const int ti;
+};
+
 /* Additional cost for addresses.  */
 struct cpu_addrcost_table
 {
+  const struct scale_addr_mode_cost addr_scale_costs;
   const int pre_modify;
   const int post_modify;
   const int register_offset;
diff --git a/gcc-4.9/gcc/config/aarch64/aarch64.c b/gcc-4.9/gcc/config/aarch64/aarch64.c
index f9e8c4067..09f2d2c5d 100644
--- a/gcc-4.9/gcc/config/aarch64/aarch64.c
+++ b/gcc-4.9/gcc/config/aarch64/aarch64.c
@@ -141,6 +141,7 @@ static bool aarch64_const_vec_all_same_int_p (rtx,
 static bool aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
                                                  const unsigned char *sel);
+static int aarch64_address_cost (rtx, enum machine_mode, addr_space_t, bool);
 
 /* The processor for which instructions should be scheduled.  */
 enum aarch64_processor aarch64_tune = cortexa53;
@@ -171,6 +172,15 @@ __extension__
 #endif
 static const struct cpu_addrcost_table generic_addrcost_table =
 {
+#if HAVE_DESIGNATED_INITIALIZERS
+  .addr_scale_costs =
+#endif
+    {
+      NAMED_PARAM (hi, 0),
+      NAMED_PARAM (si, 0),
+      NAMED_PARAM (di, 0),
+      NAMED_PARAM (ti, 0),
+    },
   NAMED_PARAM (pre_modify, 0),
   NAMED_PARAM (post_modify, 0),
   NAMED_PARAM (register_offset, 0),
@@ -181,6 +191,27 @@ static const struct cpu_addrcost_table generic_addrcost_table =
 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
 __extension__
 #endif
+static const struct cpu_addrcost_table cortexa57_addrcost_table =
+{
+#if HAVE_DESIGNATED_INITIALIZERS
+  .addr_scale_costs =
+#endif
+    {
+      NAMED_PARAM (hi, 1),
+      NAMED_PARAM (si, 0),
+      NAMED_PARAM (di, 0),
+      NAMED_PARAM (ti, 1),
+    },
+  NAMED_PARAM (pre_modify, 0),
+  NAMED_PARAM (post_modify, 0),
+  NAMED_PARAM (register_offset, 0),
+  NAMED_PARAM (register_extend, 0),
+  NAMED_PARAM (imm_offset, 0),
+};
+
+#if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
+__extension__
+#endif
 static const struct cpu_regmove_cost generic_regmove_cost =
 {
   NAMED_PARAM (GP2GP, 1),
@@ -214,6 +245,26 @@ static const struct cpu_vector_cost generic_vector_cost =
   NAMED_PARAM (cond_not_taken_branch_cost, 1)
 };
 
+/* Generic costs for vector insn classes.  */
+#if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
+__extension__
+#endif
+static const struct cpu_vector_cost cortexa57_vector_cost =
+{
+  NAMED_PARAM (scalar_stmt_cost, 1),
+  NAMED_PARAM (scalar_load_cost, 4),
+  NAMED_PARAM (scalar_store_cost, 1),
+  NAMED_PARAM (vec_stmt_cost, 3),
+  NAMED_PARAM (vec_to_scalar_cost, 8),
+  NAMED_PARAM (scalar_to_vec_cost, 8),
+  NAMED_PARAM (vec_align_load_cost, 5),
+  NAMED_PARAM (vec_unalign_load_cost, 5),
+  NAMED_PARAM (vec_unalign_store_cost, 1),
+  NAMED_PARAM (vec_store_cost, 1),
+  NAMED_PARAM (cond_taken_branch_cost, 1),
+  NAMED_PARAM (cond_not_taken_branch_cost, 1)
+};
+
 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
 __extension__
 #endif
@@ -240,9 +291,9 @@ static const struct tune_params cortexa53_tunings =
 static const struct tune_params cortexa57_tunings =
 {
   &cortexa57_extra_costs,
-  &generic_addrcost_table,
+  &cortexa57_addrcost_table,
   &generic_regmove_cost,
-  &generic_vector_cost,
+  &cortexa57_vector_cost,
   NAMED_PARAM (memmov_cost, 4),
   NAMED_PARAM (issue_rate, 3)
 };
@@ -446,7 +497,7 @@ aarch64_is_long_call_p (rtx sym)
    represent an expression that matches an extend operation.  The
    operands represent the parameters from
 
-   (extract (mult (reg) (mult_imm)) (extract_imm) (const_int 0)).  */
+   (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
 bool
 aarch64_is_extend_from_extract (enum machine_mode mode, rtx mult_imm,
                                 rtx extract_imm)
@@ -2463,12 +2514,22 @@ aarch64_final_eh_return_addr (void)
                                        - 2 * UNITS_PER_WORD));
 }
 
-/* Output code to build up a constant in a register.  */
-static void
-aarch64_build_constant (int regnum, HOST_WIDE_INT val)
+/* Possibly output code to build up a constant in a register.  For
+   the benefit of the costs infrastructure, returns the number of
+   instructions which would be emitted.  GENERATE inhibits or
+   enables code generation.  */
+
+static int
+aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
 {
+  int insns = 0;
+
   if (aarch64_bitmask_imm (val, DImode))
-    emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
+    {
+      if (generate)
+        emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
+      insns = 1;
+    }
   else
     {
       int i;
@@ -2499,15 +2560,19 @@ aarch64_build_constant (int regnum, HOST_WIDE_INT val)
          the same.  */
       if (ncount < zcount)
         {
-          emit_move_insn (gen_rtx_REG (Pmode, regnum),
-                          GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
+          if (generate)
+            emit_move_insn (gen_rtx_REG (Pmode, regnum),
+                            GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
           tval = 0xffff;
+          insns++;
         }
       else
         {
-          emit_move_insn (gen_rtx_REG (Pmode, regnum),
-                          GEN_INT (val & 0xffff));
+          if (generate)
+            emit_move_insn (gen_rtx_REG (Pmode, regnum),
+                            GEN_INT (val & 0xffff));
           tval = 0;
+          insns++;
         }
 
       val >>= 16;
@@ -2515,11 +2580,17 @@ aarch64_build_constant (int regnum, HOST_WIDE_INT val)
       for (i = 16; i < 64; i += 16)
         {
           if ((val & 0xffff) != tval)
-            emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
-                                       GEN_INT (i), GEN_INT (val & 0xffff)));
+            {
+              if (generate)
+                emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
+                                           GEN_INT (i),
+                                           GEN_INT (val & 0xffff)));
+              insns++;
+            }
           val >>= 16;
         }
     }
+  return insns;
 }
 
 static void
@@ -2534,7 +2605,7 @@ aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
 
   if (mdelta >= 4096 * 4096)
     {
-      aarch64_build_constant (scratchreg, delta);
+      (void) aarch64_build_constant (scratchreg, delta, true);
       emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
     }
   else if (mdelta > 0)
@@ -2608,7 +2679,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
         addr = plus_constant (Pmode, temp0, vcall_offset);
       else
         {
-          aarch64_build_constant (IP1_REGNUM, vcall_offset);
+          (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
         }
 
@@ -4471,12 +4542,12 @@ aarch64_strip_shift (rtx x)
   return x;
 }
 
-/* Helper function for rtx cost calculation.  Strip a shift or extend
+/* Helper function for rtx cost calculation.  Strip an extend
    expression from X.  Returns the inner operand if successful, or the
    original expression on failure.  We deal with a number of possible
    canonicalization variations here.  */
 static rtx
-aarch64_strip_shift_or_extend (rtx x)
+aarch64_strip_extend (rtx x)
 {
   rtx op = x;
 
@@ -4512,7 +4583,246 @@ aarch64_strip_shift_or_extend (rtx x)
   if (op != x)
     return op;
 
-  return aarch64_strip_shift (x);
+  return x;
+}
+
+/* Helper function for rtx cost calculation.  Calculate the cost of
+   a MULT, which may be part of a multiply-accumulate rtx.  Return
+   the calculated cost of the expression, recursing manually into
+   operands where needed.  */
+
+static int
+aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
+{
+  rtx op0, op1;
+  const struct cpu_cost_table *extra_cost
+    = aarch64_tune_params->insn_extra_cost;
+  int cost = 0;
+  bool maybe_fma = (outer == PLUS || outer == MINUS);
+  enum machine_mode mode = GET_MODE (x);
+
+  gcc_checking_assert (code == MULT);
+
+  op0 = XEXP (x, 0);
+  op1 = XEXP (x, 1);
+
+  if (VECTOR_MODE_P (mode))
+    mode = GET_MODE_INNER (mode);
+
+  /* Integer multiply/fma.  */
+  if (GET_MODE_CLASS (mode) == MODE_INT)
+    {
+      /* The multiply will be canonicalized as a shift, cost it as such.  */
+      if (CONST_INT_P (op1)
+          && exact_log2 (INTVAL (op1)) > 0)
+        {
+          if (speed)
+            {
+              if (maybe_fma)
+                /* ADD (shifted register).  */
+                cost += extra_cost->alu.arith_shift;
+              else
+                /* LSL (immediate).  */
+                cost += extra_cost->alu.shift;
+            }
+
+          cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
+
+          return cost;
+        }
+
+      /* Integer multiplies or FMAs have zero/sign extending variants.  */
+      if ((GET_CODE (op0) == ZERO_EXTEND
+           && GET_CODE (op1) == ZERO_EXTEND)
+          || (GET_CODE (op0) == SIGN_EXTEND
+              && GET_CODE (op1) == SIGN_EXTEND))
+        {
+          cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
+                  + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
+
+          if (speed)
+            {
+              if (maybe_fma)
+                /* MADD/SMADDL/UMADDL.  */
+                cost += extra_cost->mult[0].extend_add;
+              else
+                /* MUL/SMULL/UMULL.  */
+                cost += extra_cost->mult[0].extend;
+            }
+
+          return cost;
+        }
+
+      /* This is either an integer multiply or an FMA.  In both cases
+         we want to recurse and cost the operands.  */
+      cost += rtx_cost (op0, MULT, 0, speed)
+              + rtx_cost (op1, MULT, 1, speed);
+
+      if (speed)
+        {
+          if (maybe_fma)
+            /* MADD.  */
+            cost += extra_cost->mult[mode == DImode].add;
+          else
+            /* MUL.  */
+            cost += extra_cost->mult[mode == DImode].simple;
+        }
+
+      return cost;
+    }
+  else
+    {
+      if (speed)
+        {
+          /* Floating-point FMA can also support negations of the
+             operands.  */
+          if (GET_CODE (op0) == NEG)
+            {
+              maybe_fma = true;
+              op0 = XEXP (op0, 0);
+            }
+          if (GET_CODE (op1) == NEG)
+            {
+              maybe_fma = true;
+              op1 = XEXP (op1, 0);
+            }
+
+          if (maybe_fma)
+            /* FMADD/FNMADD/FNMSUB/FMSUB.  */
+            cost += extra_cost->fp[mode == DFmode].fma;
+          else
+            /* FMUL.  */
+            cost += extra_cost->fp[mode == DFmode].mult;
+        }
+
+      cost += rtx_cost (op0, MULT, 0, speed)
+              + rtx_cost (op1, MULT, 1, speed);
+      return cost;
+    }
+}
+
+static int
+aarch64_address_cost (rtx x,
+                      enum machine_mode mode,
+                      addr_space_t as ATTRIBUTE_UNUSED,
+                      bool speed)
+{
+  enum rtx_code c = GET_CODE (x);
+  const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
+  struct aarch64_address_info info;
+  int cost = 0;
+  info.shift = 0;
+
+  if (!aarch64_classify_address (&info, x, mode, c, false))
+    {
+      if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
+        {
+          /* This is a CONST or SYMBOL ref which will be split
+             in a different way depending on the code model in use.
+             Cost it through the generic infrastructure.  */
+          int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
+          /* Divide through by the cost of one instruction to
+             bring it to the same units as the address costs.  */
+          cost_symbol_ref /= COSTS_N_INSNS (1);
+          /* The cost is then the cost of preparing the address,
+             followed by an immediate (possibly 0) offset.  */
+          return cost_symbol_ref + addr_cost->imm_offset;
+        }
+      else
+        {
+          /* This is most likely a jump table from a case
+             statement.  */
+          return addr_cost->register_offset;
+        }
+    }
+
+  switch (info.type)
+    {
+      case ADDRESS_LO_SUM:
+      case ADDRESS_SYMBOLIC:
+      case ADDRESS_REG_IMM:
+        cost += addr_cost->imm_offset;
+        break;
+
+      case ADDRESS_REG_WB:
+        if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
+          cost += addr_cost->pre_modify;
+        else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
+          cost += addr_cost->post_modify;
+        else
+          gcc_unreachable ();
+
+        break;
+
+      case ADDRESS_REG_REG:
+        cost += addr_cost->register_offset;
+        break;
+
+      case ADDRESS_REG_UXTW:
+      case ADDRESS_REG_SXTW:
+        cost += addr_cost->register_extend;
+        break;
+
+      default:
+        gcc_unreachable ();
+    }
+
+
+  if (info.shift > 0)
+    {
+      /* For the sake of calculating the cost of the shifted register
+         component, we can treat same sized modes in the same way.  */
+      switch (GET_MODE_BITSIZE (mode))
+        {
+          case 16:
+            cost += addr_cost->addr_scale_costs.hi;
+            break;
+
+          case 32:
+            cost += addr_cost->addr_scale_costs.si;
+            break;
+
+          case 64:
+            cost += addr_cost->addr_scale_costs.di;
+            break;
+
+          /* We can't tell, or this is a 128-bit vector.  */
+          default:
+            cost += addr_cost->addr_scale_costs.ti;
+            break;
+        }
+    }
+
+  return cost;
+}
+
+/* Return true if the RTX X in mode MODE is a zero or sign extract
+   usable in an ADD or SUB (extended register) instruction.  */
+static bool
+aarch64_rtx_arith_op_extract_p (rtx x, enum machine_mode mode)
+{
+  /* Catch add with a sign extract.
+     This is add_<optab><mode>_multp2.  */
+  if (GET_CODE (x) == SIGN_EXTRACT
+      || GET_CODE (x) == ZERO_EXTRACT)
+    {
+      rtx op0 = XEXP (x, 0);
+      rtx op1 = XEXP (x, 1);
+      rtx op2 = XEXP (x, 2);
+
+      if (GET_CODE (op0) == MULT
+          && CONST_INT_P (op1)
+          && op2 == const0_rtx
+          && CONST_INT_P (XEXP (op0, 1))
+          && aarch64_is_extend_from_extract (mode,
+                                             XEXP (op0, 1),
+                                             op1))
+        {
+          return true;
+        }
+    }
+
+  return false;
 }
 
 /* Calculate the cost of calculating X, storing it in *COST.  Result
@@ -4521,13 +4831,31 @@ static bool
 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
 {
-  rtx op0, op1;
+  rtx op0, op1, op2;
   const struct cpu_cost_table *extra_cost
     = aarch64_tune_params->insn_extra_cost;
+  enum machine_mode mode = GET_MODE (x);
+
+  /* By default, assume that everything has equivalent cost to the
+     cheapest instruction.  Any additional costs are applied as a delta
+     above this default.  */
+  *cost = COSTS_N_INSNS (1);
+
+  /* TODO: The cost infrastructure currently does not handle
+     vector operations.  Assume that all vector operations
+     are equally expensive.  */
+  if (VECTOR_MODE_P (mode))
+    {
+      if (speed)
+        *cost += extra_cost->vect.alu;
+      return true;
+    }
 
   switch (code)
     {
     case SET:
+      /* The cost depends entirely on the operands to SET.  */
+      *cost = 0;
       op0 = SET_DEST (x);
       op1 = SET_SRC (x);
 
@@ -4535,25 +4863,47 @@ aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
        {
        case MEM:
          if (speed)
-           *cost += extra_cost->ldst.store;
+           {
+             rtx address = XEXP (op0, 0);
+             if (GET_MODE_CLASS (mode) == MODE_INT)
+               *cost += extra_cost->ldst.store;
+             else if (mode == SFmode)
+               *cost += extra_cost->ldst.storef;
+             else if (mode == DFmode)
+               *cost += extra_cost->ldst.stored;
+
+             *cost +=
+               COSTS_N_INSNS (aarch64_address_cost (address, mode,
+                                                    0, speed));
+           }
 
-         if (op1 != const0_rtx)
-           *cost += rtx_cost (op1, SET, 1, speed);
+         *cost += rtx_cost (op1, SET, 1, speed);
          return true;
 
        case SUBREG:
          if (! REG_P (SUBREG_REG (op0)))
            *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
+
          /* Fall through.  */
        case REG:
-         /* Cost is just the cost of the RHS of the set.  */
-         *cost += rtx_cost (op1, SET, 1, true);
+         /* const0_rtx is in general free, but we will use an
+            instruction to set a register to 0.  */
+         if (REG_P (op1) || op1 == const0_rtx)
+           {
+             /* The cost is 1 per register copied.  */
+             int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
+                             / UNITS_PER_WORD;
+             *cost = COSTS_N_INSNS (n_minus_1 + 1);
+           }
+         else
+           /* Cost is just the cost of the RHS of the set.  */
+           *cost += rtx_cost (op1, SET, 1, speed);
          return true;
 
-       case ZERO_EXTRACT:  /* Bit-field insertion.  */
+       case ZERO_EXTRACT:
        case SIGN_EXTRACT:
-         /* Strip any redundant widening of the RHS to meet the width of
-            the target.  */
+         /* Bit-field insertion.  Strip any redundant widening of
+            the RHS to meet the width of the target.  */
          if (GET_CODE (op1) == SUBREG)
            op1 = SUBREG_REG (op1);
          if ((GET_CODE (op1) == ZERO_EXTEND
@@ -4562,24 +4912,138 @@ aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
              && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
                  >= INTVAL (XEXP (op0, 1))))
            op1 = XEXP (op1, 0);
-         *cost += rtx_cost (op1, SET, 1, speed);
+
+         if (CONST_INT_P (op1))
+           {
+             /* MOV immediate is assumed to always be cheap.  */
+             *cost = COSTS_N_INSNS (1);
+           }
+         else
+           {
+             /* BFM.  */
+             if (speed)
+               *cost += extra_cost->alu.bfi;
+             *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
+           }
+
          return true;
 
        default:
+         /* We can't make sense of this, assume default cost.  */
+         *cost = COSTS_N_INSNS (1);
          break;
        }
      return false;
 
+    case CONST_INT:
+      /* If an instruction can incorporate a constant within the
+         instruction, the instruction's expression avoids calling
+         rtx_cost() on the constant.  If rtx_cost() is called on a
+         constant, then it is usually because the constant must be
+         moved into a register by one or more instructions.
+
+         The exception is constant 0, which can be expressed
+         as XZR/WZR and is therefore free.  The exception to this is
+         if we have (set (reg) (const0_rtx)) in which case we must cost
+         the move.  However, we can catch that when we cost the SET, so
+         we don't need to consider that here.  */
+      if (x == const0_rtx)
+        *cost = 0;
+      else
+        {
+          /* To an approximation, building any other constant is
+             proportionally expensive to the number of instructions
+             required to build that constant.  This is true whether we
+             are compiling for SPEED or otherwise.  */
+          *cost = COSTS_N_INSNS (aarch64_build_constant (0,
+                                                         INTVAL (x),
+                                                         false));
+        }
+      return true;
+
+    case CONST_DOUBLE:
+      if (speed)
+        {
+          /* mov[df,sf]_aarch64.  */
+          if (aarch64_float_const_representable_p (x))
+            /* FMOV (scalar immediate).  */
+            *cost += extra_cost->fp[mode == DFmode].fpconst;
+          else if (!aarch64_float_const_zero_rtx_p (x))
+            {
+              /* This will be a load from memory.  */
+              if (mode == DFmode)
+                *cost += extra_cost->ldst.loadd;
+              else
+                *cost += extra_cost->ldst.loadf;
+            }
+          else
+            /* Otherwise this is +0.0.  We get this using MOVI d0, #0
+               or MOV v0.s[0], wzr - neither of which are modeled by the
+               cost tables.  Just use the default cost.  */
+            {
+            }
+        }
+
+      return true;
+
     case MEM:
       if (speed)
-       *cost += extra_cost->ldst.load;
+        {
+          /* For loads we want the base cost of a load, plus an
+             approximation for the additional cost of the addressing
+             mode.  */
+          rtx address = XEXP (x, 0);
+          if (GET_MODE_CLASS (mode) == MODE_INT)
+            *cost += extra_cost->ldst.load;
+          else if (mode == SFmode)
+            *cost += extra_cost->ldst.loadf;
+          else if (mode == DFmode)
+            *cost += extra_cost->ldst.loadd;
+
+          *cost +=
+            COSTS_N_INSNS (aarch64_address_cost (address, mode,
+                                                 0, speed));
+        }
 
       return true;
 
     case NEG:
-      op0 = CONST0_RTX (GET_MODE (x));
-      op1 = XEXP (x, 0);
-      goto cost_minus;
+      op0 = XEXP (x, 0);
+
+      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
+        {
+          if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
+              || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
+            {
+              /* CSETM.  */
+              *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
+              return true;
+            }
+
+          /* Cost this as SUB wzr, X.  */
+          op0 = CONST0_RTX (GET_MODE (x));
+          op1 = XEXP (x, 0);
+          goto cost_minus;
+        }
+
+      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
+        {
+          /* Support (neg(fma...)) as a single instruction only if
+             sign of zeros is unimportant.  This matches the decision
+             making in aarch64.md.  */
+          if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
+            {
+              /* FNMADD.  */
+              *cost = rtx_cost (op0, NEG, 0, speed);
+              return true;
+            }
+          if (speed)
+            /* FNEG.  */
+            *cost += extra_cost->fp[mode == DFmode].neg;
+          return false;
+        }
+
+      return false;
 
     case COMPARE:
       op0 = XEXP (x, 0);
@@ -4592,94 +5056,207 @@ aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
          goto cost_logic;
        }
 
-      /* Comparisons can work if the order is swapped.
-        Canonicalization puts the more complex operation first, but
-        we want it in op1.  */
-      if (! (REG_P (op0)
-            || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
-       {
-         op0 = XEXP (x, 1);
-         op1 = XEXP (x, 0);
-       }
-      goto cost_minus;
+      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
+        {
+          /* TODO: A write to the CC flags possibly costs extra, this
+             needs encoding in the cost tables.  */
 
-    case MINUS:
-      op0 = XEXP (x, 0);
-      op1 = XEXP (x, 1);
+          /* CC_ZESWPmode supports zero extend for free.  */
+          if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
+            op0 = XEXP (op0, 0);
 
-    cost_minus:
-      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT
-         || (GET_MODE_CLASS (GET_MODE (x)) == MODE_CC
-             && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
-       {
-         if (op0 != const0_rtx)
-           *cost += rtx_cost (op0, MINUS, 0, speed);
+          /* ANDS.  */
+          if (GET_CODE (op0) == AND)
+            {
+              x = op0;
+              goto cost_logic;
+            }
 
-         if (CONST_INT_P (op1))
-           {
-             if (!aarch64_uimm12_shift (INTVAL (op1)))
-               *cost += rtx_cost (op1, MINUS, 1, speed);
-           }
-         else
-           {
-             op1 = aarch64_strip_shift_or_extend (op1);
-             *cost += rtx_cost (op1, MINUS, 1, speed);
-           }
-         return true;
-       }
+          if (GET_CODE (op0) == PLUS)
+            {
+              /* ADDS (and CMN alias).  */
+              x = op0;
+              goto cost_plus;
+            }
+
+          if (GET_CODE (op0) == MINUS)
+            {
+              /* SUBS.  */
+              x = op0;
+              goto cost_minus;
+            }
+
+          if (GET_CODE (op1) == NEG)
+            {
+              /* CMN.  */
+              if (speed)
+                *cost += extra_cost->alu.arith;
+
+              *cost += rtx_cost (op0, COMPARE, 0, speed);
+              *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
+              return true;
+            }
+
+          /* CMP.
+
+             Compare can freely swap the order of operands, and
+             canonicalization puts the more complex operation first.
+             But the integer MINUS logic expects the shift/extend
+             operation in op1.  */
+          if (! (REG_P (op0)
+                 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
+            {
+              op0 = XEXP (x, 1);
+              op1 = XEXP (x, 0);
+            }
+          goto cost_minus;
+        }
+
+      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
+        {
+          /* FCMP.  */
+          if (speed)
+            *cost += extra_cost->fp[mode == DFmode].compare;
+
+          if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
+            {
+              *cost += rtx_cost (op0, COMPARE, 0, speed);
+              /* FCMP supports constant 0.0 for no extra cost.  */
+              return true;
+            }
+          return false;
+        }
 
       return false;
 
+    case MINUS:
+      {
+        op0 = XEXP (x, 0);
+        op1 = XEXP (x, 1);
+
+cost_minus:
+        *cost += rtx_cost (op0, MINUS, 0, speed);
+
+        /* Detect valid immediates.  */
+        if ((GET_MODE_CLASS (mode) == MODE_INT
+             || (GET_MODE_CLASS (mode) == MODE_CC
+                 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
+            && CONST_INT_P (op1)
+            && aarch64_uimm12_shift (INTVAL (op1)))
+          {
+            if (speed)
+              /* SUB(S) (immediate).  */
+              *cost += extra_cost->alu.arith;
+            return true;
+          }
+
+        /* Look for SUB (extended register).  */
+        if (aarch64_rtx_arith_op_extract_p (op1, mode))
+          {
+            if (speed)
+              *cost += extra_cost->alu.arith_shift;
+
+            *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
+                               (enum rtx_code) GET_CODE (op1),
+                               0, speed);
+            return true;
+          }
+
+        rtx new_op1 = aarch64_strip_extend (op1);
+
+        /* Cost this as an FMA-alike operation.  */
+        if ((GET_CODE (new_op1) == MULT
+             || GET_CODE (new_op1) == ASHIFT)
+            && code != COMPARE)
+          {
+            *cost += aarch64_rtx_mult_cost (new_op1, MULT,
+                                            (enum rtx_code) code,
+                                            speed);
+            return true;
+          }
+
+        *cost += rtx_cost (new_op1, MINUS, 1, speed);
+
+        if (speed)
+          {
+            if (GET_MODE_CLASS (mode) == MODE_INT)
+              /* SUB(S).  */
+              *cost += extra_cost->alu.arith;
+            else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
+              /* FSUB.  */
+              *cost += extra_cost->fp[mode == DFmode].addsub;
+          }
+        return true;
+      }
+
     case PLUS:
-      op0 = XEXP (x, 0);
-      op1 = XEXP (x, 1);
+      {
+        rtx new_op0;
 
-      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
-       {
-         if (CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
-           {
-             *cost += rtx_cost (op0, PLUS, 0, speed);
-           }
-         else
-           {
-             rtx new_op0 = aarch64_strip_shift_or_extend (op0);
+        op0 = XEXP (x, 0);
+        op1 = XEXP (x, 1);
 
-             if (new_op0 == op0
-                 && GET_CODE (op0) == MULT)
-               {
-                 if ((GET_CODE (XEXP (op0, 0)) == ZERO_EXTEND
-                      && GET_CODE (XEXP (op0, 1)) == ZERO_EXTEND)
-                     || (GET_CODE (XEXP (op0, 0)) == SIGN_EXTEND
-                         && GET_CODE (XEXP (op0, 1)) == SIGN_EXTEND))
-                   {
-                     *cost += (rtx_cost (XEXP (XEXP (op0, 0), 0), MULT, 0,
-                                         speed)
-                               + rtx_cost (XEXP (XEXP (op0, 1), 0), MULT, 1,
-                                           speed)
-                               + rtx_cost (op1, PLUS, 1, speed));
-                     if (speed)
-                       *cost +=
-                         extra_cost->mult[GET_MODE (x) == DImode].extend_add;
-                     return true;
-                   }
+cost_plus:
+        if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
+            || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
+          {
+            /* CSINC.  */
+            *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
+            *cost += rtx_cost (op1, PLUS, 1, speed);
+            return true;
+          }
 
-                 *cost += (rtx_cost (XEXP (op0, 0), MULT, 0, speed)
-                           + rtx_cost (XEXP (op0, 1), MULT, 1, speed)
-                           + rtx_cost (op1, PLUS, 1, speed));
+        if (GET_MODE_CLASS (mode) == MODE_INT
+            && CONST_INT_P (op1)
+            && aarch64_uimm12_shift (INTVAL (op1)))
+          {
+            *cost += rtx_cost (op0, PLUS, 0, speed);
 
-                 if (speed)
-                   *cost += extra_cost->mult[GET_MODE (x) == DImode].add;
+            if (speed)
+              /* ADD (immediate).  */
+              *cost += extra_cost->alu.arith;
+            return true;
+          }
 
-                 return true;
-               }
+        *cost += rtx_cost (op1, PLUS, 1, speed);
 
-             *cost += (rtx_cost (new_op0, PLUS, 0, speed)
-                       + rtx_cost (op1, PLUS, 1, speed));
-           }
-         return true;
-       }
+        /* Look for ADD (extended register).  */
+        if (aarch64_rtx_arith_op_extract_p (op0, mode))
+          {
+            if (speed)
+              *cost += extra_cost->alu.arith_shift;
 
-      return false;
+            *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
+                               (enum rtx_code) GET_CODE (op0),
+                               0, speed);
+            return true;
+          }
+
+        /* Strip any extend, leave shifts behind as we will
+           cost them through mult_cost.  */
+        new_op0 = aarch64_strip_extend (op0);
+
+        if (GET_CODE (new_op0) == MULT
+            || GET_CODE (new_op0) == ASHIFT)
+          {
+            *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
+                                            speed);
+            return true;
+          }
+
+        *cost += rtx_cost (new_op0, PLUS, 0, speed);
+
+        if (speed)
+          {
+            if (GET_MODE_CLASS (mode) == MODE_INT)
+              /* ADD.  */
+              *cost += extra_cost->alu.arith;
+            else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
+              /* FADD.  */
+              *cost += extra_cost->fp[mode == DFmode].addsub;
+          }
+        return true;
+      }
 
     case IOR:
     case XOR:
@@ -4688,117 +5265,284 @@ aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
     case AND:
       op0 = XEXP (x, 0);
       op1 = XEXP (x, 1);
 
+      if (code == AND
+          && GET_CODE (op0) == MULT
+          && CONST_INT_P (XEXP (op0, 1))
+          && CONST_INT_P (op1)
+          && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
+                               INTVAL (op1)) != 0)
+        {
+          /* This is a UBFM/SBFM.  */
+          *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
+          if (speed)
+            *cost += extra_cost->alu.bfx;
+          return true;
+        }
+
       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
        {
+          /* We possibly get the immediate for free, this is not
+             modelled.  */
          if (CONST_INT_P (op1)
              && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
            {
-             *cost += rtx_cost (op0, AND, 0, speed);
+              *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
+
+              if (speed)
+                *cost += extra_cost->alu.logical;
+
+              return true;
            }
          else
            {
+              rtx new_op0 = op0;
+
+              /* Handle ORN, EON, or BIC.  */
              if (GET_CODE (op0) == NOT)
                op0 = XEXP (op0, 0);
-             op0 = aarch64_strip_shift (op0);
-             *cost += (rtx_cost (op0, AND, 0, speed)
-                       + rtx_cost (op1, AND, 1, speed));
+
+              new_op0 = aarch64_strip_shift (op0);
+
+              /* If we had a shift on op0 then this is a logical-shift-
+                 by-register/immediate operation.  Otherwise, this is just
+                 a logical operation.  */
+              if (speed)
+                {
+                  if (new_op0 != op0)
+                    {
+                      /* Shift by immediate.  */
+                      if (CONST_INT_P (XEXP (op0, 1)))
+                        *cost += extra_cost->alu.log_shift;
+                      else
+                        *cost += extra_cost->alu.log_shift_reg;
+                    }
+                  else
+                    *cost += extra_cost->alu.logical;
+                }
+
+              /* In both cases we want to cost both operands.  */
+              *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
+                       + rtx_cost (op1, (enum rtx_code) code, 1, speed);
+
+              return true;
            }
-         return true;
        }
       return false;
 
+    case NOT:
+      x = XEXP (x, 0);
+      op0 = aarch64_strip_shift (x);
+
+      /* MVN-shifted-reg.  */
+      if (op0 != x)
+        {
+          *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
+
+          if (speed)
+            *cost += extra_cost->alu.log_shift;
+
+          return true;
+        }
+      /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
+         Handle the second form here taking care that 'a' in the above can
+         be a shift.  */
+      else if (GET_CODE (op0) == XOR)
+        {
+          rtx newop0 = XEXP (op0, 0);
+          rtx newop1 = XEXP (op0, 1);
+          rtx op0_stripped = aarch64_strip_shift (newop0);
+
+          *cost += rtx_cost (newop1, (enum rtx_code) code, 1, speed)
+                   + rtx_cost (op0_stripped, XOR, 0, speed);
+
+          if (speed)
+            {
+              if (op0_stripped != newop0)
+                *cost += extra_cost->alu.log_shift;
+              else
+                *cost += extra_cost->alu.logical;
+            }
+
+          return true;
+        }
+      /* MVN.  */
+      if (speed)
+        *cost += extra_cost->alu.logical;
+
+      return false;
+
     case ZERO_EXTEND:
-      if ((GET_MODE (x) == DImode
-          && GET_MODE (XEXP (x, 0)) == SImode)
-         || GET_CODE (XEXP (x, 0)) == MEM)
+
+      op0 = XEXP (x, 0);
+      /* If a value is written in SI mode, then zero extended to DI
+         mode, the operation will in general be free as a write to
+         a 'w' register implicitly zeroes the upper bits of an 'x'
+         register.  However, if this is
+
+           (set (reg) (zero_extend (reg)))
+
+         we must cost the explicit register move.  */
+      if (mode == DImode
+          && GET_MODE (op0) == SImode
+          && outer == SET)
        {
-         *cost += rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
+          int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
+
+          if (!op_cost && speed)
+            /* MOV.  */
+            *cost += extra_cost->alu.extend;
+          else
+            /* Free, the cost is that of the SI mode operation.  */
+            *cost = op_cost;
+
+          return true;
+        }
+      else if (MEM_P (XEXP (x, 0)))
+        {
+          /* All loads can zero extend to any size for free.  */
+          *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
          return true;
        }
+
+      /* UXTB/UXTH.  */
+      if (speed)
+        *cost += extra_cost->alu.extend;
+
       return false;
 
     case SIGN_EXTEND:
-      if (GET_CODE (XEXP (x, 0)) == MEM)
+      if (MEM_P (XEXP (x, 0)))
        {
-         *cost += rtx_cost (XEXP (x, 0), SIGN_EXTEND, 0, speed);
+          /* LDRSH.  */
+          if (speed)
+            {
+              rtx address = XEXP (XEXP (x, 0), 0);
+              *cost += extra_cost->ldst.load_sign_extend;
+
+              *cost +=
+                COSTS_N_INSNS (aarch64_address_cost (address, mode,
+                                                     0, speed));
+            }
          return true;
        }
+
+      if (speed)
+        *cost += extra_cost->alu.extend;
      return false;
 
-    case ROTATE:
-      if (!CONST_INT_P (XEXP (x, 1)))
-       *cost += COSTS_N_INSNS (2);
-      /* Fall through.  */
-    case ROTATERT:
-    case LSHIFTRT:
     case ASHIFT:
-    case ASHIFTRT:
-
-      /* Shifting by a register often takes an extra cycle.  */
-      if (speed && !CONST_INT_P (XEXP (x, 1)))
-       *cost += extra_cost->alu.arith_shift_reg;
+      op0 = XEXP (x, 0);
+      op1 = XEXP (x, 1);
 
-      *cost += rtx_cost (XEXP (x, 0), ASHIFT, 0, speed);
-      return true;
+      if (CONST_INT_P (op1))
+        {
+          /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
+             aliases.  */
+          if (speed)
+            *cost += extra_cost->alu.shift;
 
-    case HIGH:
-      if (!CONSTANT_P (XEXP (x, 0)))
-       *cost += rtx_cost (XEXP (x, 0), HIGH, 0, speed);
-      return true;
+          /* We can incorporate zero/sign extend for free.  */
+          if (GET_CODE (op0) == ZERO_EXTEND
+              || GET_CODE (op0) == SIGN_EXTEND)
+            op0 = XEXP (op0, 0);
 
-    case LO_SUM:
-      if (!CONSTANT_P (XEXP (x, 1)))
-       *cost += rtx_cost (XEXP (x, 1), LO_SUM, 1, speed);
-      *cost += rtx_cost (XEXP (x, 0), LO_SUM, 0, speed);
-      return true;
+          *cost += rtx_cost (op0, ASHIFT, 0, speed);
+          return true;
+        }
+      else
+        {
+          /* LSLV.  */
+          if (speed)
+            *cost += extra_cost->alu.shift_reg;
 
-    case ZERO_EXTRACT:
-    case SIGN_EXTRACT:
-      *cost += rtx_cost (XEXP (x, 0), ZERO_EXTRACT, 0, speed);
-      return true;
+          return false;  /* All arguments need to be in registers.  */
+        }
 
-    case MULT:
+    case ROTATE:
+    case ROTATERT:
+    case LSHIFTRT:
+    case ASHIFTRT:
       op0 = XEXP (x, 0);
       op1 = XEXP (x, 1);
 
-      *cost = COSTS_N_INSNS (1);
-      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
+      if (CONST_INT_P (op1))
        {
-         if (CONST_INT_P (op1)
-             && exact_log2 (INTVAL (op1)) > 0)
-           {
-             *cost += rtx_cost (op0, ASHIFT, 0, speed);
-             return true;
-           }
+          /* ASR (immediate) and friends.  */
+          if (speed)
+            *cost += extra_cost->alu.shift;
 
-         if ((GET_CODE (op0) == ZERO_EXTEND
-              && GET_CODE (op1) == ZERO_EXTEND)
-             || (GET_CODE (op0) == SIGN_EXTEND
-                 && GET_CODE (op1) == SIGN_EXTEND))
-           {
-             *cost += (rtx_cost (XEXP (op0, 0), MULT, 0, speed)
-                       + rtx_cost (XEXP (op1, 0), MULT, 1, speed));
-             if (speed)
-               *cost += extra_cost->mult[GET_MODE (x) == DImode].extend;
-             return true;
-           }
+          *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
+          return true;
+        }
+      else
+        {
+          /* ASR (register) and friends.  */
          if (speed)
-           *cost += extra_cost->mult[GET_MODE (x) == DImode].simple;
+            *cost += extra_cost->alu.shift_reg;
+
+          return false;  /* All arguments need to be in registers.  */
        }
-      else if (speed)
+
+    case SYMBOL_REF:
+
+      if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
+        {
+          /* LDR.  */
+          if (speed)
+            *cost += extra_cost->ldst.load;
+        }
+      else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
+               || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
+        {
+          /* ADRP, followed by ADD.  */
+          *cost += COSTS_N_INSNS (1);
+          if (speed)
+            *cost += 2 * extra_cost->alu.arith;
+        }
+      else if (aarch64_cmodel == AARCH64_CMODEL_TINY
+               || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
+        {
+          /* ADR.  */
+          if (speed)
+            *cost += extra_cost->alu.arith;
+        }
+
+      if (flag_pic)
        {
-         if (GET_MODE (x) == DFmode)
-           *cost += extra_cost->fp[1].mult;
-         else if (GET_MODE (x) == SFmode)
-           *cost += extra_cost->fp[0].mult;
+          /* One extra load instruction, after accessing the GOT.  */
+          *cost += COSTS_N_INSNS (1);
+          if (speed)
+            *cost += extra_cost->ldst.load;
        }
+      return true;
 
-      return false;  /* All arguments need to be in registers.  */
+    case HIGH:
+    case LO_SUM:
+      /* ADRP/ADD (immediate).  */
+      if (speed)
+        *cost += extra_cost->alu.arith;
+      return true;
+
+    case ZERO_EXTRACT:
+    case SIGN_EXTRACT:
+      /* UBFX/SBFX.  */
+      if (speed)
+        *cost += extra_cost->alu.bfx;
+
+      /* We can trust that the immediates used will be correct (there
+         are no by-register forms), so we need only cost op0.  */
+      *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
+      return true;
+
+    case MULT:
+      *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
+      /* aarch64_rtx_mult_cost always handles recursion to its
+         operands.  */
+      return true;
 
     case MOD:
     case UMOD:
-      *cost = COSTS_N_INSNS (2);
       if (speed)
        {
          if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
@@ -4815,53 +5559,225 @@ aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
 
     case DIV:
     case UDIV:
-      *cost = COSTS_N_INSNS (1);
+    case SQRT:
      if (speed)
        {
-         if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
-           *cost += extra_cost->mult[GET_MODE (x) == DImode].idiv;
-         else if (GET_MODE (x) == DFmode)
-           *cost += extra_cost->fp[1].div;
-         else if (GET_MODE (x) == SFmode)
-           *cost += extra_cost->fp[0].div;
+          if (GET_MODE_CLASS (mode) == MODE_INT)
+            /* There is no integer SQRT, so only DIV and UDIV can get
+               here.  */
+            *cost += extra_cost->mult[mode == DImode].idiv;
+          else
+            *cost += extra_cost->fp[mode == DFmode].div;
        }
      return false;  /* All arguments need to be in registers.  */
 
-    default:
-      break;
-    }
-  return false;
-}
+    case IF_THEN_ELSE:
+      op2 = XEXP (x, 2);
+      op0 = XEXP (x, 0);
+      op1 = XEXP (x, 1);
 
-static int
-aarch64_address_cost (rtx x ATTRIBUTE_UNUSED,
-                     enum machine_mode mode ATTRIBUTE_UNUSED,
-                     addr_space_t as ATTRIBUTE_UNUSED, bool speed ATTRIBUTE_UNUSED)
-{
-  enum rtx_code c = GET_CODE (x);
-  const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
+      if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
+        {
+          /* Conditional branch.  */
+          if (GET_MODE_CLASS (GET_MODE (XEXP (op0, 0))) == MODE_CC)
+            return true;
+          else
+            {
+              if (GET_CODE (op0) == NE
+                  || GET_CODE (op0) == EQ)
+                {
+                  rtx inner = XEXP (op0, 0);
+                  rtx comparator = XEXP (op0, 1);
 
-  if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
-    return addr_cost->pre_modify;
+                  if (comparator == const0_rtx)
+                    {
+                      /* TBZ/TBNZ/CBZ/CBNZ.  */
+                      if (GET_CODE (inner) == ZERO_EXTRACT)
+                        /* TBZ/TBNZ.  */
+                        *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
+                                           0, speed);
+                      else
+                        /* CBZ/CBNZ.  */
+                        *cost += rtx_cost (inner, GET_CODE (op0), 0, speed);
 
-  if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
-    return addr_cost->post_modify;
+                      return true;
+                    }
+                }
+              else if (GET_CODE (op0) == LT
+                       || GET_CODE (op0) == GE)
+                {
+                  rtx comparator = XEXP (op0, 1);
 
-  if (c == PLUS)
-    {
-      if (GET_CODE (XEXP (x, 1)) == CONST_INT)
-       return addr_cost->imm_offset;
-      else if (GET_CODE (XEXP (x, 0)) == MULT
-              || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND
-              || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
-       return addr_cost->register_extend;
+                  /* TBZ/TBNZ.  */
+                  if (comparator == const0_rtx)
+                    return true;
+                }
+            }
+        }
+      else if (GET_MODE_CLASS (GET_MODE (XEXP (op0, 0))) == MODE_CC)
+        {
+          /* It's a conditional operation based on the status flags,
+             so it must be some flavor of CSEL.  */
+
+          /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
+          if (GET_CODE (op1) == NEG
+              || GET_CODE (op1) == NOT
+              || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
+            op1 = XEXP (op1, 0);
+
+          *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
+          *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
+          return true;
+        }
 
-      return addr_cost->register_offset;
-    }
-  else if (c == MEM || c == LABEL_REF || c == SYMBOL_REF)
-    return addr_cost->imm_offset;
+      /* We don't know what this is, cost all operands.  */
+      return false;
 
-  return 0;
+    case EQ:
+    case NE:
+    case GT:
+    case GTU:
+    case LT:
+    case LTU:
+    case GE:
+    case GEU:
+    case LE:
+    case LEU:
+
+      return false;  /* All arguments must be in registers.  */
+
+    case FMA:
+      op0 = XEXP (x, 0);
+      op1 = XEXP (x, 1);
+      op2 = XEXP (x, 2);
+
+      if (speed)
+        *cost += extra_cost->fp[mode == DFmode].fma;
+
+      /* FMSUB, FNMADD, and FNMSUB are free.  */
+      if (GET_CODE (op0) == NEG)
+        op0 = XEXP (op0, 0);
+
+      if (GET_CODE (op2) == NEG)
+        op2 = XEXP (op2, 0);
+
+      /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
+         and the by-element operand as operand 0.  */
+      if (GET_CODE (op1) == NEG)
+        op1 = XEXP (op1, 0);
+
+      /* Catch vector-by-element operations.  The by-element operand can
+         either be (vec_duplicate (vec_select (x))) or just
+         (vec_select (x)), depending on whether we are multiplying by
+         a vector or a scalar.
+
+         Canonicalization is not very good in these cases, FMA4 will put the
+         by-element operand as operand 0, FNMA4 will have it as operand 1.  */
+      if (GET_CODE (op0) == VEC_DUPLICATE)
+        op0 = XEXP (op0, 0);
+      else if (GET_CODE (op1) == VEC_DUPLICATE)
+        op1 = XEXP (op1, 0);
+
+      if (GET_CODE (op0) == VEC_SELECT)
+        op0 = XEXP (op0, 0);
+      else if (GET_CODE (op1) == VEC_SELECT)
+        op1 = XEXP (op1, 0);
+
+      /* If the remaining parameters are not registers,
+         get the cost to put them into registers.  */
+      *cost += rtx_cost (op0, FMA, 0, speed);
+      *cost += rtx_cost (op1, FMA, 1, speed);
+      *cost += rtx_cost (op2, FMA, 2, speed);
+      return true;
+
+    case FLOAT_EXTEND:
+      if (speed)
+        *cost += extra_cost->fp[mode == DFmode].widen;
+      return false;
+
+    case FLOAT_TRUNCATE:
+      if (speed)
+        *cost += extra_cost->fp[mode == DFmode].narrow;
+      return false;
+
+    case ABS:
+      if (GET_MODE_CLASS (mode) == MODE_FLOAT)
+        {
+          op0 = XEXP (x, 0);
+
+          /* FABD, which is analogous to FADD.  */
+          if (GET_CODE (op0) == MINUS)
+            {
+              *cost += rtx_cost (XEXP (op0, 0), MINUS, 0, speed)
+                       + rtx_cost (XEXP (op0, 1), MINUS, 1, speed);
+              if (speed)
+                *cost += extra_cost->fp[mode == DFmode].addsub;
+
+              return true;
+            }
+          /* Simple FABS is analogous to FNEG.  */
+          if (speed)
+            *cost += extra_cost->fp[mode == DFmode].neg;
+        }
+      else
+        {
+          /* Integer ABS will either be split to
+             two arithmetic instructions, or will be an ABS
+             (scalar), which we don't model.  */
+          *cost = COSTS_N_INSNS (2);
+          if (speed)
+            *cost += 2 * extra_cost->alu.arith;
+        }
+      return false;
+
+    case SMAX:
+    case SMIN:
+      if (speed)
+        {
+          /* FMAXNM/FMINNM/FMAX/FMIN.
+             TODO: This may not be accurate for all implementations, but
+             we do not model this in the cost tables.  */
+          *cost += extra_cost->fp[mode == DFmode].addsub;
+        }
+      return false;
+
+    case TRUNCATE:
+
+      /* Decompose <su>muldi3_highpart.  */
+      if (/* (truncate:DI  */
+          mode == DImode
+          /*   (lshiftrt:TI  */
+          && GET_MODE (XEXP (x, 0)) == TImode
+          && GET_CODE (XEXP (x, 0)) == LSHIFTRT
+          /*      (mult:TI  */
+          && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
+          /*        (ANY_EXTEND:TI (reg:DI))
+                    (ANY_EXTEND:TI (reg:DI)))  */
+          && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
+               && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
+              || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
+                  && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
+          && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
+          && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
+          /*     (const_int 64)  */
+          && CONST_INT_P (XEXP (XEXP (x, 0), 1))
+          && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
+        {
+          /* UMULH/SMULH.  */
+          if (speed)
+            *cost += extra_cost->mult[mode == DImode].extend;
+          *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
+                             MULT, 0, speed);
+          *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
+                             MULT, 1, speed);
+          return true;
+        }
+
+      /* Fall through.  */
+    default:
+      return true;
+    }
+  return false;
 }
 
 static int
@@ -4994,15 +5910,9 @@ aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
 
       /* Statements in an inner loop relative to the loop being
         vectorized are weighted more heavily.  The value here is
-        a function (linear for now) of the loop nest level.  */
+        arbitrary and could potentially be improved with analysis.  */
      if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
-       {
-         loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
-         struct loop *loop = LOOP_VINFO_LOOP (loop_info);
-         unsigned nest_level = loop_depth (loop);
-
-         count *= nest_level;
-       }
+       count *= 50;  /* FIXME  */
 
      retval = (unsigned) (count * stmt_cost);
      cost[where] += retval;
diff --git a/gcc-4.9/gcc/config/aarch64/aarch64.md b/gcc-4.9/gcc/config/aarch64/aarch64.md
index dc88f8b10..fe68bfea1 100644
--- a/gcc-4.9/gcc/config/aarch64/aarch64.md
+++ b/gcc-4.9/gcc/config/aarch64/aarch64.md
@@ -2571,6 +2571,32 @@
   [(set_attr "type" "logics_shift_imm")]
 )
 
+(define_insn "*eor_one_cmpl_<SHIFT:optab><mode>3_alt"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+        (not:GPI (xor:GPI
+                  (SHIFT:GPI
+                   (match_operand:GPI 1 "register_operand" "r")
+                   (match_operand:QI 2 "aarch64_shift_imm_<mode>" "n"))
+                  (match_operand:GPI 3 "register_operand" "r"))))]
+  ""
+  "eon\\t%<w>0, %<w>3, %<w>1, <SHIFT:shift> %2"
+  [(set_attr "type" "logic_shift_imm")]
+)
+
+;; Zero-extend version of the above.
+(define_insn "*eor_one_cmpl_<SHIFT:optab>sidi3_alt_ze"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+        (zero_extend:DI
+         (not:SI (xor:SI
+                  (SHIFT:SI
+                   (match_operand:SI 1 "register_operand" "r")
+                   (match_operand:QI 2 "aarch64_shift_imm_si" "n"))
+                  (match_operand:SI 3 "register_operand" "r")))))]
+  ""
+  "eon\\t%w0, %w3, %w1, <SHIFT:shift> %2"
+  [(set_attr "type" "logic_shift_imm")]
+)
+
 (define_insn "*and_one_cmpl_<SHIFT:optab><mode>3_compare0"
   [(set (reg:CC_NZ CC_REGNUM)
         (compare:CC_NZ
@@ -2771,32 +2797,33 @@
 
 ;; Logical left shift using SISD or Integer instruction
 (define_insn "*aarch64_ashl_sisd_or_int_<mode>3"
-  [(set (match_operand:GPI 0 "register_operand" "=w,w,r")
+  [(set (match_operand:GPI 0 "register_operand" "=r,w,w")
         (ashift:GPI
-          (match_operand:GPI 1 "register_operand" "w,w,r")
-          (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "Us<cmode>,w,rUs<cmode>")))]
+          (match_operand:GPI 1 "register_operand" "r,w,w")
+          (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "rUs<cmode>,Us<cmode>,w")))]
   ""
   "@
+   lsl\t%<w>0, %<w>1, %<w>2
    shl\t%<rtn>0<vas>, %<rtn>1<vas>, %2
-   ushl\t%<rtn>0<vas>, %<rtn>1<vas>, %<rtn>2<vas>
-   lsl\t%<w>0, %<w>1, %<w>2"
-  [(set_attr "simd" "yes,yes,no")
-   (set_attr "type" "neon_shift_imm<q>, neon_shift_reg<q>,shift_reg")]
+   ushl\t%<rtn>0<vas>, %<rtn>1<vas>, %<rtn>2<vas>"
+  [(set_attr "simd" "no,yes,yes")
+   (set_attr "type" "shift_reg,neon_shift_imm<q>, neon_shift_reg<q>")]
 )
 
 ;; Logical right shift using SISD or Integer instruction
 (define_insn "*aarch64_lshr_sisd_or_int_<mode>3"
-  [(set (match_operand:GPI 0 "register_operand" "=w,&w,r")
+  [(set (match_operand:GPI 0 "register_operand" "=r,w,&w,&w")
         (lshiftrt:GPI
-          (match_operand:GPI 1 "register_operand" "w,w,r")
-          (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "Us<cmode>,w,rUs<cmode>")))]
+          (match_operand:GPI 1 "register_operand" "r,w,w,w")
+          (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "rUs<cmode>,Us<cmode>,w,0")))]
   ""
   "@
+   lsr\t%<w>0, %<w>1, %<w>2
    ushr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
    #
-   lsr\t%<w>0, %<w>1, %<w>2"
-  [(set_attr "simd" "yes,yes,no")
-   (set_attr "type" "neon_shift_imm<q>,neon_shift_reg<q>,shift_reg")]
+   #"
+  [(set_attr "simd" "no,yes,yes,yes")
+   (set_attr "type" "shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")]
 )
 
 (define_split
@@ -2831,18 +2858,18 @@
 
 ;; Arithmetic right shift using SISD or Integer instruction
 (define_insn "*aarch64_ashr_sisd_or_int_<mode>3"
-  [(set (match_operand:GPI 0 "register_operand" "=w,&w,&w,r")
+  [(set (match_operand:GPI 0 "register_operand" "=r,w,&w,&w")
         (ashiftrt:GPI
-          (match_operand:GPI 1 "register_operand" "w,w,w,r")
-          (match_operand:QI 2 "aarch64_reg_or_shift_imm_di" "Us<cmode>,w,0,rUs<cmode>")))]
+          (match_operand:GPI 1 "register_operand" "r,w,w,w")
+          (match_operand:QI 2 "aarch64_reg_or_shift_imm_di" "rUs<cmode>,Us<cmode>,w,0")))]
   ""
   "@
+   asr\t%<w>0, %<w>1, %<w>2
    sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
    #
-   #
-   asr\t%<w>0, %<w>1, %<w>2"
-  [(set_attr "simd" "yes,yes,yes,no")
-   (set_attr "type" "neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>,shift_reg")]
+   #"
+  [(set_attr "simd" "no,yes,yes,yes")
+   (set_attr "type" "shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")]
 )
 
 (define_split
diff --git a/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/pr65447.c b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/pr65447.c
new file mode 100644
index 000000000..c5bddbfba
--- /dev/null
+++ b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/pr65447.c
@@ -0,0 +1,54 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-ivopts-details" } */
+
+void foo (double *p)
+{
+  int i;
+  for (i = -20000; i < 200000; i += 40)
+    {
+      p[i+0] = 1.0;
+      p[i+1] = 1.0;
+      p[i+2] = 1.0;
+      p[i+3] = 1.0;
+      p[i+4] = 1.0;
+      p[i+5] = 1.0;
+      p[i+6] = 1.0;
+      p[i+7] = 1.0;
+      p[i+8] = 1.0;
+      p[i+9] = 1.0;
+      p[i+10] = 1.0;
+      p[i+11] = 1.0;
+      p[i+12] = 1.0;
+      p[i+13] = 1.0;
+      p[i+14] = 1.0;
+      p[i+15] = 1.0;
+      p[i+16] = 1.0;
+      p[i+17] = 1.0;
+      p[i+18] = 1.0;
+      p[i+19] = 1.0;
+      p[i+20] = 1.0;
+      p[i+21] = 1.0;
+      p[i+22] = 1.0;
+      p[i+23] = 1.0;
+      p[i+24] = 1.0;
+      p[i+25] = 1.0;
+      p[i+26] = 1.0;
+      p[i+27] = 1.0;
+      p[i+28] = 1.0;
+      p[i+29] = 1.0;
+      p[i+30] = 1.0;
+      p[i+31] = 1.0;
+      p[i+32] = 1.0;
+      p[i+33] = 1.0;
+      p[i+34] = 1.0;
+      p[i+35] = 1.0;
+      p[i+36] = 1.0;
+      p[i+37] = 1.0;
+      p[i+38] = 1.0;
+      p[i+39] = 1.0;
+    }
+}
+
+/* We should group address type IV uses.  */
+/* { dg-final { scan-tree-dump-not "\\nuse 2\\n" "ivopts" } } */
+/* { dg-final { cleanup-tree-dump "ivopts" } } */
diff --git a/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-10.c b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-10.c
new file mode 100644
index 000000000..2e16c894b
--- /dev/null
+++ b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-10.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-ivopts-details" } */
+
+int *a;
+
+int
+foo (signed char s, signed char l)
+{
+  signed char i;
+  int sum = 0;
+
+  for (i = s; i < l; i++)
+    {
+      sum += a[i];
+    }
+
+  return sum;
+}
+
+/* Address of array reference is scev.  */
+/* { dg-final { scan-tree-dump-times "use \[0-9\]\n address" 1 "ivopts" } } */
diff --git a/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-3.c b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-3.c
index 5cac1cefb..28d5c9327 100644
--- a/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-3.c
+++ b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-3.c
@@ -14,5 +14,5 @@ f(int k)
     }
 }
 
-/* { dg-final { scan-tree-dump-times "&a" 1 "optimized" { xfail { lp64 || llp64 } } } } */
+/* { dg-final { scan-tree-dump-times "&a" 1 "optimized" } } */
 /* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-4.c b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-4.c
index 5f15d622d..6c1e530a9 100644
--- a/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-4.c
+++ b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-4.c
@@ -19,5 +19,5 @@ f(int k)
     }
 }
 
-/* { dg-final { scan-tree-dump-times "&a" 1 "optimized" { xfail { lp64 || llp64 } } } } */
+/* { dg-final { scan-tree-dump-times "&a" 1 "optimized" } } */
 /* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-8.c b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-8.c
new file mode 100644
index 000000000..766f674d5
--- /dev/null
+++ b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-8.c
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-ivopts-details" } */
+
+int *a;
+
+int
+foo1 (long long s, long long l)
+{
+  long long i;
+
+  for (i = s; i < l; i++)
+    {
+      a[(short)i] = 0;
+    }
+  return 0;
+}
+
+int
+foo2 (unsigned char s, unsigned char l, unsigned char c)
+{
+  unsigned char i, step = 1;
+  int sum = 0;
+
+  for (i = s; i < l; i++)
+    {
+      sum += a[c];
+      c += step;
+    }
+
+  return sum;
+}
+
+int
+foo3 (unsigned char s, unsigned char l, unsigned char c)
+{
+  unsigned char i;
+  int sum = 0;
+
+  for (i = s; i != l; i += c)
+    {
+      sum += a[i];
+    }
+
+  return sum;
+}
+
+int
+foo4 (unsigned char s, unsigned char l)
+{
+  unsigned char i;
+  int sum = 0;
+
+  for (i = s; i != l; i++)
+    {
+      sum += a[i];
+    }
+
+  return sum;
+}
+
+/* Address of array references are not scevs.  */
+/* { dg-final { scan-tree-dump-not "use \[0-9\]\n address" "ivopts" } } */
diff --git a/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-9.c b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-9.c
new file mode 100644
index 000000000..557e33850
--- /dev/null
+++ b/gcc-4.9/gcc/testsuite/gcc.dg/tree-ssa/scev-9.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-ivopts-details" } */
+
+int *a;
+
+int
+foo (unsigned char s, unsigned char l)
+{
+  unsigned char i;
+  int sum = 0;
+
+  for (i = s; i < l; i += 1)
+    {
+      sum += a[i];
+    }
+
+  return sum;
+}
+
+/* Address of array reference is scev.  */
+/* { dg-final { scan-tree-dump-times "use \[0-9\]\n address" 1 "ivopts" } } */
diff --git a/gcc-4.9/gcc/testsuite/gcc.dg/vect/pr48052.c b/gcc-4.9/gcc/testsuite/gcc.dg/vect/pr48052.c
new file mode 100644
index 000000000..c822ebd41
--- /dev/null
+++ b/gcc-4.9/gcc/testsuite/gcc.dg/vect/pr48052.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99" } */
+
+int foo(int* A, int* B, unsigned start, unsigned BS)
+{
+  int s;
+  for (unsigned k = start; k < start + BS; k++)
+    {
+      s += A[k] * B[k];
+    }
+
+  return s;
+}
+
+int bar(int* A, int* B, unsigned BS)
+{
+  int s;
+  for (unsigned k = 0; k < BS; k++)
+    {
+      s += A[k] * B[k];
+    }
+
+  return s;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
diff --git a/gcc-4.9/gcc/tree-chrec.c b/gcc-4.9/gcc/tree-chrec.c
index b9350f015..3e34c65dc 100644
--- a/gcc-4.9/gcc/tree-chrec.c
+++ b/gcc-4.9/gcc/tree-chrec.c
@@ -1162,8 +1162,6 @@ nb_vars_in_chrec (tree chrec)
     }
 }
 
-static tree chrec_convert_1 (tree, tree, gimple, bool);
-
 /* Converts BASE and STEP of affine scev to TYPE.  LOOP is the loop whose iv
    the scev corresponds to.  AT_STMT is the statement at that the scev is
    evaluated.  USE_OVERFLOW_SEMANTICS is true if this function should assume
@@ -1238,8 +1236,7 @@ convert_affine_scev (struct loop *loop, tree type,
                                 use_overflow_semantics))
     return false;
 
-  new_base = chrec_convert_1 (type, *base, at_stmt,
-                              use_overflow_semantics);
+  new_base = chrec_convert (type, *base, at_stmt, use_overflow_semantics);
   /* The step must be sign extended, regardless of the signedness
      of CT and TYPE.  This only needs to be handled specially when
      CT is unsigned -- to avoid e.g. unsigned char [100, +, 255]
@@ -1250,10 +1247,11 @@ convert_affine_scev (struct loop *loop, tree type,
   if (TYPE_PRECISION (step_type) > TYPE_PRECISION (ct) && TYPE_UNSIGNED (ct))
     {
       tree signed_ct = build_nonstandard_integer_type (TYPE_PRECISION (ct), 0);
-      new_step = chrec_convert_1 (signed_ct, new_step, at_stmt,
-                                  use_overflow_semantics);
+      new_step = chrec_convert (signed_ct, new_step, at_stmt,
+                                use_overflow_semantics);
     }
-  new_step = chrec_convert_1 (step_type, new_step, at_stmt, use_overflow_semantics);
+  new_step = chrec_convert (step_type, new_step, at_stmt,
+                            use_overflow_semantics);
 
   if (automatically_generated_chrec_p (new_base)
       || automatically_generated_chrec_p (new_step))
@@ -1290,36 +1288,6 @@ chrec_convert_rhs (tree type, tree chrec, gimple at_stmt)
    determining a more accurate estimation of the number of iterations.
    By default AT_STMT could be safely set to NULL_TREE.
 
-   The following rule is always true: TREE_TYPE (chrec) ==
-   TREE_TYPE (CHREC_LEFT (chrec)) == TREE_TYPE (CHREC_RIGHT (chrec)).
- An example of what could happen when adding two chrecs and the type - of the CHREC_RIGHT is different than CHREC_LEFT is: - - {(uint) 0, +, (uchar) 10} + - {(uint) 0, +, (uchar) 250} - - that would produce a wrong result if CHREC_RIGHT is not (uint): - - {(uint) 0, +, (uchar) 4} - - instead of - - {(uint) 0, +, (uint) 260} -*/ - -tree -chrec_convert (tree type, tree chrec, gimple at_stmt) -{ - return chrec_convert_1 (type, chrec, at_stmt, true); -} - -/* Convert CHREC to TYPE. When the analyzer knows the context in - which the CHREC is built, it sets AT_STMT to the statement that - contains the definition of the analyzed variable, otherwise the - conversion is less accurate: the information is used for - determining a more accurate estimation of the number of iterations. - By default AT_STMT could be safely set to NULL_TREE. - USE_OVERFLOW_SEMANTICS is true if this function should assume that the rules for overflow of the given language apply (e.g., that signed arithmetics in C does not overflow) -- i.e., to use them to avoid unnecessary @@ -1404,15 +1372,53 @@ keep_cast: return res; } +/* Convert CHREC to TYPE. When the analyzer knows the context in + which the CHREC is built, it sets AT_STMT to the statement that + contains the definition of the analyzed variable, otherwise the + conversion is less accurate: the information is used for + determining a more accurate estimation of the number of iterations. + By default AT_STMT could be safely set to NULL_TREE. + + The following rule is always true: TREE_TYPE (chrec) == + TREE_TYPE (CHREC_LEFT (chrec)) == TREE_TYPE (CHREC_RIGHT (chrec)). + An example of what could happen when adding two chrecs and the type + of the CHREC_RIGHT is different from that of CHREC_LEFT is: + + {(uint) 0, +, (uchar) 10} + + {(uint) 0, +, (uchar) 250} + + that would produce a wrong result if CHREC_RIGHT is not (uint): + + {(uint) 0, +, (uchar) 4} + + instead of + + {(uint) 0, +, (uint) 260} + + USE_OVERFLOW_SEMANTICS is true if this function should assume that + the rules for overflow of the given language apply (e.g., that signed + arithmetics in C does not overflow) -- i.e., to use them to avoid unnecessary + tests, but also to enforce that the result follows them. */ + +tree +chrec_convert (tree type, tree chrec, gimple at_stmt, + bool use_overflow_semantics) +{ + return chrec_convert_1 (type, chrec, at_stmt, use_overflow_semantics); +} + /* Convert CHREC to TYPE, without regard to signed overflows. Returns the new chrec if something else than what chrec_convert would do happens, NULL_TREE - otherwise. */ + otherwise. This function sets the variable pointed to by FOLD_CONVERSIONS + to true if the resulting chrec may overflow.
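+ For instance (an illustrative case, not the only one): {(int) 0, +, (int) 1} is converted to short by converting base and step directly, giving {(short) 0, +, (short) 1}; this matches the original evolution only while the counter stays within the range of short, so when that cannot be shown the result may overflow.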
*/ tree -chrec_convert_aggressive (tree type, tree chrec) +chrec_convert_aggressive (tree type, tree chrec, bool *fold_conversions) { tree inner_type, left, right, lc, rc, rtype; + gcc_assert (fold_conversions != NULL); + if (automatically_generated_chrec_p (chrec) || TREE_CODE (chrec) != POLYNOMIAL_CHREC) return NULL_TREE; @@ -1421,17 +1427,33 @@ chrec_convert_aggressive (tree type, tree chrec) if (TYPE_PRECISION (type) > TYPE_PRECISION (inner_type)) return NULL_TREE; + if (useless_type_conversion_p (type, inner_type)) + return NULL_TREE; + + if (!*fold_conversions && evolution_function_is_affine_p (chrec)) + { + tree base, step; + struct loop *loop; + + loop = get_chrec_loop (chrec); + base = CHREC_LEFT (chrec); + step = CHREC_RIGHT (chrec); + if (convert_affine_scev (loop, type, &base, &step, NULL, true)) + return build_polynomial_chrec (loop->num, base, step); + } rtype = POINTER_TYPE_P (type) ? sizetype : type; left = CHREC_LEFT (chrec); right = CHREC_RIGHT (chrec); - lc = chrec_convert_aggressive (type, left); + lc = chrec_convert_aggressive (type, left, fold_conversions); if (!lc) lc = chrec_convert (type, left, NULL); - rc = chrec_convert_aggressive (rtype, right); + rc = chrec_convert_aggressive (rtype, right, fold_conversions); if (!rc) rc = chrec_convert (rtype, right, NULL); + *fold_conversions = true; + return build_polynomial_chrec (CHREC_VARIABLE (chrec), lc, rc); } diff --git a/gcc-4.9/gcc/tree-chrec.h b/gcc-4.9/gcc/tree-chrec.h index 90cc7a74b..ea4627751 100644 --- a/gcc-4.9/gcc/tree-chrec.h +++ b/gcc-4.9/gcc/tree-chrec.h @@ -59,9 +59,9 @@ enum ev_direction scev_direction (const_tree); extern tree chrec_fold_plus (tree, tree, tree); extern tree chrec_fold_minus (tree, tree, tree); extern tree chrec_fold_multiply (tree, tree, tree); -extern tree chrec_convert (tree, tree, gimple); +extern tree chrec_convert (tree, tree, gimple, bool = true); extern tree chrec_convert_rhs (tree, tree, gimple); -extern tree chrec_convert_aggressive (tree, tree); +extern tree chrec_convert_aggressive (tree, tree, bool *); /* Operations. */ extern tree chrec_apply (unsigned, tree, tree); diff --git a/gcc-4.9/gcc/tree-scalar-evolution.c b/gcc-4.9/gcc/tree-scalar-evolution.c index f1ddc24b1..218baa406 100644 --- a/gcc-4.9/gcc/tree-scalar-evolution.c +++ b/gcc-4.9/gcc/tree-scalar-evolution.c @@ -2116,7 +2116,7 @@ analyze_scalar_evolution_in_loop (struct loop *wrto_loop, struct loop *use_loop, /* We cannot just do tmp = analyze_scalar_evolution (use_loop, version); - ev = resolve_mixers (wrto_loop, tmp); + ev = resolve_mixers (wrto_loop, tmp, folded_casts); as resolve_mixers would query the scalar evolution with respect to wrto_loop. For example, in the situation described in the function @@ -2125,9 +2125,9 @@ analyze_scalar_evolution_in_loop (struct loop *wrto_loop, struct loop *use_loop, analyze_scalar_evolution (use_loop, version) = k2 - and resolve_mixers (loop1, k2) finds that the value of k2 in loop 1 - is 100, which is a wrong result, since we are interested in the - value in loop 3. + and resolve_mixers (loop1, k2, folded_casts) finds that the value of + k2 in loop 1 is 100, which is a wrong result, since we are interested + in the value in loop 3. Instead, we need to proceed from use_loop to wrto_loop loop by loop, each time checking that there is no evolution in the inner loop. 
*/ @@ -2137,10 +2137,7 @@ analyze_scalar_evolution_in_loop (struct loop *wrto_loop, struct loop *use_loop, while (1) { tmp = analyze_scalar_evolution (use_loop, ev); - ev = resolve_mixers (use_loop, tmp); - - if (folded_casts && tmp != ev) - *folded_casts = true; + ev = resolve_mixers (use_loop, tmp, folded_casts); if (use_loop == wrto_loop) return ev; @@ -2262,7 +2259,7 @@ loop_closed_phi_def (tree var) } static tree instantiate_scev_r (basic_block, struct loop *, struct loop *, - tree, bool, int); + tree, bool *, int); /* Analyze all the parameters of the chrec, between INSTANTIATE_BELOW and EVOLUTION_LOOP, that were left under a symbolic form. @@ -2271,9 +2268,10 @@ static tree instantiate_scev_r (basic_block, struct loop *, struct loop *, CACHE is the cache of already instantiated values. - FOLD_CONVERSIONS should be set to true when the conversions that - may wrap in signed/pointer type are folded, as long as the value of - the chrec is preserved. + The variable pointed to by FOLD_CONVERSIONS is set to true when + conversions that may wrap in signed/pointer types are folded, as long + as the value of the chrec is preserved. If FOLD_CONVERSIONS is NULL, + no such folding is done. SIZE_EXPR is used for computing the size of the expression to be instantiated, and to stop if it exceeds some limit. */ @@ -2282,7 +2280,7 @@ static tree instantiate_scev_name (basic_block instantiate_below, struct loop *evolution_loop, struct loop *inner_loop, tree chrec, - bool fold_conversions, + bool *fold_conversions, int size_expr) { tree res; @@ -2376,9 +2374,10 @@ instantiate_scev_name (basic_block instantiate_below, CACHE is the cache of already instantiated values. - FOLD_CONVERSIONS should be set to true when the conversions that - may wrap in signed/pointer type are folded, as long as the value of - the chrec is preserved. + The variable pointed to by FOLD_CONVERSIONS is set to true when + conversions that may wrap in signed/pointer types are folded, as long + as the value of the chrec is preserved. If FOLD_CONVERSIONS is NULL, + no such folding is done. SIZE_EXPR is used for computing the size of the expression to be instantiated, and to stop if it exceeds some limit. */ @@ -2386,7 +2385,7 @@ instantiate_scev_name (basic_block instantiate_below, static tree instantiate_scev_poly (basic_block instantiate_below, struct loop *evolution_loop, struct loop *, - tree chrec, bool fold_conversions, int size_expr) + tree chrec, bool *fold_conversions, int size_expr) { tree op1; tree op0 = instantiate_scev_r (instantiate_below, evolution_loop, @@ -2420,9 +2419,10 @@ instantiate_scev_poly (basic_block instantiate_below, CACHE is the cache of already instantiated values. - FOLD_CONVERSIONS should be set to true when the conversions that - may wrap in signed/pointer type are folded, as long as the value of - the chrec is preserved. + The variable pointed to by FOLD_CONVERSIONS is set to true when + conversions that may wrap in signed/pointer types are folded, as long + as the value of the chrec is preserved. If FOLD_CONVERSIONS is NULL, + no such folding is done. SIZE_EXPR is used for computing the size of the expression to be instantiated, and to stop if it exceeds some limit.
*/ @@ -2432,7 +2432,7 @@ instantiate_scev_binary (basic_block instantiate_below, struct loop *evolution_loop, struct loop *inner_loop, tree chrec, enum tree_code code, tree type, tree c0, tree c1, - bool fold_conversions, int size_expr) + bool *fold_conversions, int size_expr) { tree op1; tree op0 = instantiate_scev_r (instantiate_below, evolution_loop, inner_loop, @@ -2478,9 +2478,10 @@ instantiate_scev_binary (basic_block instantiate_below, CACHE is the cache of already instantiated values. - FOLD_CONVERSIONS should be set to true when the conversions that - may wrap in signed/pointer type are folded, as long as the value of - the chrec is preserved. + The variable pointed to by FOLD_CONVERSIONS is set to true when + conversions that may wrap in signed/pointer types are folded, as long + as the value of the chrec is preserved. If FOLD_CONVERSIONS is NULL, + no such folding is done. SIZE_EXPR is used for computing the size of the expression to be instantiated, and to stop if it exceeds some limit. */ @@ -2488,7 +2489,7 @@ static tree instantiate_array_ref (basic_block instantiate_below, struct loop *evolution_loop, struct loop *inner_loop, - tree chrec, bool fold_conversions, int size_expr) + tree chrec, bool *fold_conversions, int size_expr) { tree res; tree index = TREE_OPERAND (chrec, 1); @@ -2515,9 +2516,10 @@ instantiate_array_ref (basic_block instantiate_below, CACHE is the cache of already instantiated values. - FOLD_CONVERSIONS should be set to true when the conversions that - may wrap in signed/pointer type are folded, as long as the value of - the chrec is preserved. + The variable pointed to by FOLD_CONVERSIONS is set to true when + conversions that may wrap in signed/pointer types are folded, as long + as the value of the chrec is preserved. If FOLD_CONVERSIONS is NULL, + no such folding is done. SIZE_EXPR is used for computing the size of the expression to be instantiated, and to stop if it exceeds some limit. */ @@ -2526,7 +2528,7 @@ static tree instantiate_scev_convert (basic_block instantiate_below, struct loop *evolution_loop, struct loop *inner_loop, tree chrec, tree type, tree op, - bool fold_conversions, int size_expr) + bool *fold_conversions, int size_expr) { tree op0 = instantiate_scev_r (instantiate_below, evolution_loop, inner_loop, op, @@ -2537,19 +2539,21 @@ instantiate_scev_convert (basic_block instantiate_below, if (fold_conversions) { - tree tmp = chrec_convert_aggressive (type, op0); + tree tmp = chrec_convert_aggressive (type, op0, fold_conversions); if (tmp) return tmp; - } - if (chrec && op0 == op) - return chrec; + /* If we used chrec_convert_aggressive, we can no longer assume that + signed chrecs do not overflow, as chrec_convert does, so avoid + calling it in that case. */ + if (*fold_conversions) + { + if (chrec && op0 == op) + return chrec; - /* If we used chrec_convert_aggressive, we can no longer assume that - signed chrecs do not overflow, as chrec_convert does, so avoid - calling it in that case. */ - if (fold_conversions) - return fold_convert (type, op0); + return fold_convert (type, op0); + } + } return chrec_convert (type, op0, NULL); } @@ -2563,9 +2567,10 @@ instantiate_scev_convert (basic_block instantiate_below, CACHE is the cache of already instantiated values. - FOLD_CONVERSIONS should be set to true when the conversions that - may wrap in signed/pointer type are folded, as long as the value of - the chrec is preserved.
+ The variable pointed to by FOLD_CONVERSIONS is set to true when + conversions that may wrap in signed/pointer types are folded, as long + as the value of the chrec is preserved. If FOLD_CONVERSIONS is NULL, + no such folding is done. SIZE_EXPR is used for computing the size of the expression to be instantiated, and to stop if it exceeds some limit. */ @@ -2575,7 +2580,7 @@ instantiate_scev_not (basic_block instantiate_below, struct loop *evolution_loop, struct loop *inner_loop, tree chrec, enum tree_code code, tree type, tree op, - bool fold_conversions, int size_expr) + bool *fold_conversions, int size_expr) { tree op0 = instantiate_scev_r (instantiate_below, evolution_loop, inner_loop, op, @@ -2613,9 +2618,10 @@ instantiate_scev_not (basic_block instantiate_below, CACHE is the cache of already instantiated values. - FOLD_CONVERSIONS should be set to true when the conversions that - may wrap in signed/pointer type are folded, as long as the value of - the chrec is preserved. + The variable pointed to by FOLD_CONVERSIONS is set to true when + conversions that may wrap in signed/pointer types are folded, as long + as the value of the chrec is preserved. If FOLD_CONVERSIONS is NULL, + no such folding is done. SIZE_EXPR is used for computing the size of the expression to be instantiated, and to stop if it exceeds some limit. */ @@ -2624,7 +2630,7 @@ static tree instantiate_scev_3 (basic_block instantiate_below, struct loop *evolution_loop, struct loop *inner_loop, tree chrec, - bool fold_conversions, int size_expr) + bool *fold_conversions, int size_expr) { tree op1, op2; tree op0 = instantiate_scev_r (instantiate_below, evolution_loop, @@ -2661,9 +2667,10 @@ instantiate_scev_3 (basic_block instantiate_below, CACHE is the cache of already instantiated values. - FOLD_CONVERSIONS should be set to true when the conversions that - may wrap in signed/pointer type are folded, as long as the value of - the chrec is preserved. + The variable pointed to by FOLD_CONVERSIONS is set to true when + conversions that may wrap in signed/pointer types are folded, as long + as the value of the chrec is preserved. If FOLD_CONVERSIONS is NULL, + no such folding is done. SIZE_EXPR is used for computing the size of the expression to be instantiated, and to stop if it exceeds some limit. */ @@ -2672,7 +2679,7 @@ static tree instantiate_scev_2 (basic_block instantiate_below, struct loop *evolution_loop, struct loop *inner_loop, tree chrec, - bool fold_conversions, int size_expr) + bool *fold_conversions, int size_expr) { tree op1; tree op0 = instantiate_scev_r (instantiate_below, evolution_loop, @@ -2701,9 +2708,10 @@ instantiate_scev_2 (basic_block instantiate_below, CACHE is the cache of already instantiated values. - FOLD_CONVERSIONS should be set to true when the conversions that - may wrap in signed/pointer type are folded, as long as the value of - the chrec is preserved. + The variable pointed to by FOLD_CONVERSIONS is set to true when + conversions that may wrap in signed/pointer types are folded, as long + as the value of the chrec is preserved. If FOLD_CONVERSIONS is NULL, + no such folding is done. SIZE_EXPR is used for computing the size of the expression to be instantiated, and to stop if it exceeds some limit.
*/ @@ -2712,7 +2720,7 @@ static tree instantiate_scev_1 (basic_block instantiate_below, struct loop *evolution_loop, struct loop *inner_loop, tree chrec, - bool fold_conversions, int size_expr) + bool *fold_conversions, int size_expr) { tree op0 = instantiate_scev_r (instantiate_below, evolution_loop, inner_loop, TREE_OPERAND (chrec, 0), @@ -2734,9 +2742,10 @@ instantiate_scev_1 (basic_block instantiate_below, CACHE is the cache of already instantiated values. - FOLD_CONVERSIONS should be set to true when the conversions that - may wrap in signed/pointer type are folded, as long as the value of - the chrec is preserved. + The variable pointed to by FOLD_CONVERSIONS is set to true when + conversions that may wrap in signed/pointer types are folded, as long + as the value of the chrec is preserved. If FOLD_CONVERSIONS is NULL, + no such folding is done. SIZE_EXPR is used for computing the size of the expression to be instantiated, and to stop if it exceeds some limit. */ @@ -2745,7 +2754,7 @@ static tree instantiate_scev_r (basic_block instantiate_below, struct loop *evolution_loop, struct loop *inner_loop, tree chrec, - bool fold_conversions, int size_expr) + bool *fold_conversions, int size_expr) { /* Give up if the expression is larger than the MAX that we allow. */ if (size_expr++ > PARAM_VALUE (PARAM_SCEV_MAX_EXPR_SIZE)) @@ -2870,7 +2879,7 @@ instantiate_scev (basic_block instantiate_below, struct loop *evolution_loop, } res = instantiate_scev_r (instantiate_below, evolution_loop, - NULL, chrec, false, 0); + NULL, chrec, NULL, 0); if (destr) { @@ -2894,9 +2903,10 @@ instantiate_scev (basic_block instantiate_below, struct loop *evolution_loop, of an expression. */ tree -resolve_mixers (struct loop *loop, tree chrec) +resolve_mixers (struct loop *loop, tree chrec, bool *folded_casts) { bool destr = false; + bool fold_conversions = false; if (!global_cache) { global_cache = new instantiate_cache_type; @@ -2904,7 +2914,10 @@ resolve_mixers (struct loop *loop, tree chrec) } tree ret = instantiate_scev_r (block_before_loop (loop), loop, NULL, - chrec, true, 0); + chrec, &fold_conversions, 0); + + if (folded_casts && !*folded_casts) + *folded_casts = fold_conversions; if (destr) { @@ -3369,7 +3382,8 @@ scev_const_prop (void) && !INTEGRAL_TYPE_P (type)) continue; - ev = resolve_mixers (loop, analyze_scalar_evolution (loop, name)); + ev = resolve_mixers (loop, analyze_scalar_evolution (loop, name), + NULL); if (!is_gimple_min_invariant (ev) || !may_propagate_copy (name, ev)) continue; diff --git a/gcc-4.9/gcc/tree-scalar-evolution.h b/gcc-4.9/gcc/tree-scalar-evolution.h index 556997684..cb9af5122 100644 --- a/gcc-4.9/gcc/tree-scalar-evolution.h +++ b/gcc-4.9/gcc/tree-scalar-evolution.h @@ -31,7 +31,7 @@ extern void scev_reset_htab (void); extern void scev_finalize (void); extern tree analyze_scalar_evolution (struct loop *, tree); extern tree instantiate_scev (basic_block, struct loop *, tree); -extern tree resolve_mixers (struct loop *, tree); +extern tree resolve_mixers (struct loop *, tree, bool *); extern void gather_stats_on_scev_database (void); extern unsigned int scev_const_prop (void); extern bool expression_expensive_p (tree); diff --git a/gcc-4.9/gcc/tree-ssa-loop-ivopts.c b/gcc-4.9/gcc/tree-ssa-loop-ivopts.c index c5a5dd48a..0bf8e2aff 100644 --- a/gcc-4.9/gcc/tree-ssa-loop-ivopts.c +++ b/gcc-4.9/gcc/tree-ssa-loop-ivopts.c @@ -142,9 +142,10 @@ struct iv tree base_object; /* A memory object to that the induction variable points. */ tree step; /* Step of the iv (constant only).
*/ tree ssa_name; /* The ssa name with the value. */ + unsigned use_id; /* The identifier in the use if it is the case. */ bool biv_p; /* Is it a biv? */ bool have_use_for; /* Do we already have a use for it? */ - unsigned use_id; /* The identifier in the use if it is the case. */ + bool no_overflow; /* True if the iv doesn't overflow. */ }; /* Per-ssa version information (induction variable descriptions, etc.). */ @@ -197,6 +198,7 @@ struct cost_pair struct iv_use { unsigned id; /* The id of the use. */ + unsigned sub_id; /* The id of the sub use. */ enum use_type type; /* Type of the use. */ struct iv *iv; /* The induction variable it is based on. */ gimple stmt; /* Statement in that it occurs. */ @@ -210,6 +212,11 @@ struct iv_use struct iv_cand *selected; /* The selected candidate. */ + + struct iv_use *next; /* The next sub use. */ + tree addr_base; /* Base address with const offset stripped. */ + unsigned HOST_WIDE_INT addr_offset; + /* Const offset stripped from base address. */ }; /* The position where the iv is computed. */ @@ -522,7 +529,11 @@ dump_iv (FILE *file, struct iv *iv) void dump_use (FILE *file, struct iv_use *use) { - fprintf (file, "use %d\n", use->id); + fprintf (file, "use %d", use->id); + if (use->sub_id) + fprintf (file, ".%d", use->sub_id); + + fprintf (file, "\n"); switch (use->type) { @@ -571,8 +582,12 @@ dump_uses (FILE *file, struct ivopts_data *data) for (i = 0; i < n_iv_uses (data); i++) { use = iv_use (data, i); - - dump_use (file, use); + do + { + dump_use (file, use); + use = use->next; + } + while (use); fprintf (file, "\n"); } } @@ -929,10 +944,10 @@ determine_base_object (tree expr) } /* Allocates an induction variable with given initial value BASE and step STEP - for loop LOOP. */ + for loop LOOP. NO_OVERFLOW implies the iv doesn't overflow. */ static struct iv * -alloc_iv (tree base, tree step) +alloc_iv (tree base, tree step, bool no_overflow = false) { tree base_object = base; struct iv *iv = XCNEW (struct iv); @@ -963,21 +978,24 @@ alloc_iv (tree base, tree step) iv->have_use_for = false; iv->use_id = 0; iv->ssa_name = NULL_TREE; + iv->no_overflow = no_overflow; return iv; } -/* Sets STEP and BASE for induction variable IV. */ +/* Sets STEP and BASE for induction variable IV. NO_OVERFLOW implies the IV + doesn't overflow. */ static void -set_iv (struct ivopts_data *data, tree iv, tree base, tree step) +set_iv (struct ivopts_data *data, tree iv, tree base, tree step, + bool no_overflow) { struct version_info *info = name_info (data, iv); gcc_assert (!info->iv); bitmap_set_bit (data->relevant, SSA_NAME_VERSION (iv)); - info->iv = alloc_iv (base, step); + info->iv = alloc_iv (base, step, no_overflow); info->iv->ssa_name = iv; } @@ -999,37 +1017,19 @@ get_iv (struct ivopts_data *data, tree var) if (!bb || !flow_bb_inside_loop_p (data->current_loop, bb)) - set_iv (data, var, var, build_int_cst (type, 0)); + set_iv (data, var, var, build_int_cst (type, 0), true); } return name_info (data, var)->iv; } -/* Determines the step of a biv defined in PHI. Returns NULL if PHI does - not define a simple affine biv with nonzero step. */ - -static tree -determine_biv_step (gimple phi) -{ - struct loop *loop = gimple_bb (phi)->loop_father; - tree name = PHI_RESULT (phi); - affine_iv iv; - - if (virtual_operand_p (name)) - return NULL_TREE; - - if (!simple_iv (loop, loop, name, &iv, true)) - return NULL_TREE; - - return integer_zerop (iv.step) ? NULL_TREE : iv.step; -} - /* Finds basic ivs. 
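+ A basic iv here is a loop PHI whose evolution simple_iv can represent as {base, +, step} with an invariant, nonzero step; e.g. i in for (i = 0; i < n; i += 4).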
*/ static bool find_bivs (struct ivopts_data *data) { gimple phi; + affine_iv iv; tree step, type, base; bool found = false; struct loop *loop = data->current_loop; @@ -1042,10 +1042,16 @@ find_bivs (struct ivopts_data *data) if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (PHI_RESULT (phi))) continue; - step = determine_biv_step (phi); - if (!step) + if (virtual_operand_p (PHI_RESULT (phi))) continue; + if (!simple_iv (loop, loop, PHI_RESULT (phi), &iv, true)) + continue; + + if (integer_zerop (iv.step)) + continue; + + step = iv.step; base = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop)); base = expand_simple_operations (base); if (contains_abnormal_ssa_name_p (base) @@ -1062,7 +1068,7 @@ find_bivs (struct ivopts_data *data) step = fold_convert (type, step); } - set_iv (data, PHI_RESULT (phi), base, step); + set_iv (data, PHI_RESULT (phi), base, step, iv.no_overflow); found = true; } @@ -1158,7 +1164,7 @@ find_givs_in_stmt (struct ivopts_data *data, gimple stmt) if (!find_givs_in_stmt_scev (data, stmt, &iv)) return; - set_iv (data, gimple_assign_lhs (stmt), iv.base, iv.step); + set_iv (data, gimple_assign_lhs (stmt), iv.base, iv.step, iv.no_overflow); } /* Finds general ivs in basic block BB. */ @@ -1229,33 +1235,88 @@ find_induction_variables (struct ivopts_data *data) return true; } -/* Records a use of type USE_TYPE at *USE_P in STMT whose value is IV. */ +/* Records a use of type USE_TYPE at *USE_P in STMT whose value is IV. + For address type use, ADDR_BASE is the stripped IV base, ADDR_OFFSET + is the const offset stripped from IV base. For uses of other types, + ADDR_BASE and ADDR_OFFSET are zero by default. */ static struct iv_use * record_use (struct ivopts_data *data, tree *use_p, struct iv *iv, - gimple stmt, enum use_type use_type) + gimple stmt, enum use_type use_type, tree addr_base = NULL, + unsigned HOST_WIDE_INT addr_offset = 0) { struct iv_use *use = XCNEW (struct iv_use); use->id = n_iv_uses (data); + use->sub_id = 0; use->type = use_type; use->iv = iv; use->stmt = stmt; use->op_p = use_p; use->related_cands = BITMAP_ALLOC (NULL); + use->next = NULL; + use->addr_base = addr_base; + use->addr_offset = addr_offset; /* To avoid showing ssa name in the dumps, if it was not reset by the caller. */ iv->ssa_name = NULL_TREE; - if (dump_file && (dump_flags & TDF_DETAILS)) - dump_use (dump_file, use); - data->iv_uses.safe_push (use); return use; } +/* Records a sub use of type USE_TYPE at *USE_P in STMT whose value is IV. + The sub use is recorded under the one whose use id is ID_GROUP. */ + +static struct iv_use * +record_sub_use (struct ivopts_data *data, tree *use_p, + struct iv *iv, gimple stmt, enum use_type use_type, + tree addr_base, unsigned HOST_WIDE_INT addr_offset, + unsigned int id_group) +{ + struct iv_use *use = XCNEW (struct iv_use); + struct iv_use *group = iv_use (data, id_group); + + use->id = group->id; + use->sub_id = 0; + use->type = use_type; + use->iv = iv; + use->stmt = stmt; + use->op_p = use_p; + use->related_cands = NULL; + use->addr_base = addr_base; + use->addr_offset = addr_offset; + + /* Sub use list is maintained in offset ascending order. 
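+ E.g. a use with offset 8 is inserted between existing sub uses with offsets 0 and 16; when the new use becomes the head of the list, it also takes over the group's related_cands bitmap, since only the head of a group carries one.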
*/ + if (addr_offset <= group->addr_offset) + { + use->related_cands = group->related_cands; + group->related_cands = NULL; + use->next = group; + data->iv_uses[id_group] = use; + } + else + { + struct iv_use *pre; + do + { + pre = group; + group = group->next; + } + while (group && addr_offset > group->addr_offset); + use->next = pre->next; + pre->next = use; + } + + /* To avoid showing ssa name in the dumps, if it was not reset by the + caller. */ + iv->ssa_name = NULL_TREE; + + return use; +} + /* Checks whether OP is a loop-level invariant and if so, records it. NONLINEAR_USE is true if the invariant is used in a way we do not handle specially. */ @@ -1515,6 +1576,7 @@ idx_find_step (tree base, tree *idx, void *data) { struct ifs_ivopts_data *dta = (struct ifs_ivopts_data *) data; struct iv *iv; + bool use_overflow_semantics = false; tree step, iv_base, iv_step, lbound, off; struct loop *loop = dta->ivopts_data->current_loop; @@ -1574,9 +1636,12 @@ idx_find_step (tree base, tree *idx, void *data) iv_base = iv->base; iv_step = iv->step; + if (iv->no_overflow && nowrap_type_p (TREE_TYPE (iv_step))) + use_overflow_semantics = true; + if (!convert_affine_scev (dta->ivopts_data->current_loop, sizetype, &iv_base, &iv_step, dta->stmt, - false)) + use_overflow_semantics)) { /* The index might wrap. */ return false; @@ -1739,6 +1804,50 @@ may_be_nonaddressable_p (tree expr) return false; } +static tree +strip_offset (tree expr, unsigned HOST_WIDE_INT *offset); + +/* Record a use of type USE_TYPE at *USE_P in STMT whose value is IV. + If there is an existing use which has the same stripped iv base and + step, this function records this one as a sub use of that; otherwise + it is recorded as a normal one. */ + +static struct iv_use * +record_group_use (struct ivopts_data *data, tree *use_p, + struct iv *iv, gimple stmt, enum use_type use_type) +{ + unsigned int i; + struct iv_use *use; + tree addr_base; + unsigned HOST_WIDE_INT addr_offset; + + /* Sub uses are only supported for address type uses, that is, those + with a base object. */ + if (!iv->base_object) + return record_use (data, use_p, iv, stmt, use_type); + + addr_base = strip_offset (iv->base, &addr_offset); + for (i = 0; i < n_iv_uses (data); i++) + { + use = iv_use (data, i); + if (use->type != USE_ADDRESS || !use->iv->base_object) + continue; + + /* Check if it has the same stripped base and step. */ + if (operand_equal_p (iv->base_object, use->iv->base_object, 0) + && operand_equal_p (iv->step, use->iv->step, 0) + && operand_equal_p (addr_base, use->addr_base, 0)) + break; + } + + if (i == n_iv_uses (data)) + return record_use (data, use_p, iv, stmt, + use_type, addr_base, addr_offset); + else + return record_sub_use (data, use_p, iv, stmt, + use_type, addr_base, addr_offset, i); +} + /* Finds addresses in *OP_P inside STMT. */ static void @@ -1849,7 +1958,7 @@ find_interesting_uses_address (struct ivopts_data *data, gimple stmt, tree *op_p } civ = alloc_iv (base, step); - record_use (data, op_p, civ, stmt, USE_ADDRESS); + record_group_use (data, op_p, civ, stmt, USE_ADDRESS); return; fail: @@ -2035,6 +2144,172 @@ find_interesting_uses (struct ivopts_data *data) free (body); } +/* Compute maximum offset of [base + offset] addressing mode + for the memory reference represented by USE.
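+ The offset is found by probing candidate addresses with memory_address_addr_space_p and is cached per address space and memory mode; e.g. on a target such as AArch64, whose scaled 12-bit unsigned immediate form accepts DFmode offsets up to 32760, that is the sort of value recorded.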
*/ + +static HOST_WIDE_INT +compute_max_addr_offset (struct iv_use *use) +{ + int width; + rtx reg, addr; + HOST_WIDE_INT i, off; + unsigned list_index, num; + addr_space_t as; + machine_mode mem_mode, addr_mode; + static vec<HOST_WIDE_INT> max_offset_list; + + as = TYPE_ADDR_SPACE (TREE_TYPE (use->iv->base)); + mem_mode = TYPE_MODE (TREE_TYPE (*use->op_p)); + + num = max_offset_list.length (); + list_index = (unsigned) as * MAX_MACHINE_MODE + (unsigned) mem_mode; + if (list_index >= num) + { + max_offset_list.safe_grow (list_index + MAX_MACHINE_MODE); + for (; num < max_offset_list.length (); num++) + max_offset_list[num] = -1; + } + + off = max_offset_list[list_index]; + if (off != -1) + return off; + + addr_mode = targetm.addr_space.address_mode (as); + reg = gen_raw_REG (addr_mode, LAST_VIRTUAL_REGISTER + 1); + addr = gen_rtx_fmt_ee (PLUS, addr_mode, reg, NULL_RTX); + + width = GET_MODE_BITSIZE (addr_mode) - 1; + if (width > (HOST_BITS_PER_WIDE_INT - 1)) + width = HOST_BITS_PER_WIDE_INT - 1; + + for (i = width; i > 0; i--) + { + off = ((unsigned HOST_WIDE_INT) 1 << i) - 1; + XEXP (addr, 1) = gen_int_mode (off, addr_mode); + if (memory_address_addr_space_p (mem_mode, addr, as)) + break; + + /* For some strict-alignment targets, the offset must be naturally + aligned. Try an aligned offset if mem_mode is not QImode. */ + off = ((unsigned HOST_WIDE_INT) 1 << i); + if (off > GET_MODE_SIZE (mem_mode) && mem_mode != QImode) + { + off -= GET_MODE_SIZE (mem_mode); + XEXP (addr, 1) = gen_int_mode (off, addr_mode); + if (memory_address_addr_space_p (mem_mode, addr, as)) + break; + } + } + if (i == 0) + off = 0; + + max_offset_list[list_index] = off; + return off; +} + +/* Check if all small groups should be split. Return true if and + only if: + + 1) At least one group contains two uses with different offsets. + 2) No group contains more than two uses with different offsets. + + Return false otherwise. We want to split such groups because: + + 1) Small groups don't have much benefit and may interfere with + general candidate selection. + 2) The problem size with only small groups is usually small, and + the general algorithm can handle it well. + + TODO -- The above claim may not hold when auto increment is supported. */ + +static bool +split_all_small_groups (struct ivopts_data *data) +{ + bool split_p = false; + unsigned int i, n, distinct; + struct iv_use *pre, *use; + + n = n_iv_uses (data); + for (i = 0; i < n; i++) + { + use = iv_use (data, i); + if (!use->next) + continue; + + distinct = 1; + gcc_assert (use->type == USE_ADDRESS); + for (pre = use, use = use->next; use; pre = use, use = use->next) + { + if (pre->addr_offset != use->addr_offset) + distinct++; + + if (distinct > 2) + return false; + } + if (distinct == 2) + split_p = true; + } + + return split_p; +} + +/* For each group of address type uses, this function further groups + these uses according to the maximum offset supported by the target's + [base + offset] addressing mode. */ + +static void +group_address_uses (struct ivopts_data *data) +{ + HOST_WIDE_INT max_offset = -1; + unsigned int i, n, sub_id; + struct iv_use *pre, *use; + unsigned HOST_WIDE_INT addr_offset_first; + + /* Reset max offset to split all small groups.
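+ (With max_offset forced to 0 the fitting test below fails for any use whose offset differs from the first one's, so each distinct offset then starts a group of its own.)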
*/ + if (split_all_small_groups (data)) + max_offset = 0; + + n = n_iv_uses (data); + for (i = 0; i < n; i++) + { + use = iv_use (data, i); + if (!use->next) + continue; + + gcc_assert (use->type == USE_ADDRESS); + if (max_offset != 0) + max_offset = compute_max_addr_offset (use); + + while (use) + { + sub_id = 0; + addr_offset_first = use->addr_offset; + /* Only uses whose offset relative to the first use fits in the + offset part of the addressing mode can be grouped together. */ + for (pre = use, use = use->next; + use && (use->addr_offset - addr_offset_first + <= (unsigned HOST_WIDE_INT) max_offset); + pre = use, use = use->next) + { + use->id = pre->id; + use->sub_id = ++sub_id; + } + + /* Break the list and create a new group. */ + if (use) + { + pre->next = NULL; + use->id = n_iv_uses (data); + use->related_cands = BITMAP_ALLOC (NULL); + data->iv_uses.safe_push (use); + } + } + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + dump_uses (dump_file, data); +} + /* Strips constant offsets from EXPR and stores them to OFFSET. If INSIDE_ADDR is true, assume we are inside an address. If TOP_COMPREF is true, assume we are at the top-level of the processed address. */ @@ -2458,6 +2733,8 @@ static void add_candidate (struct ivopts_data *data, tree base, tree step, bool important, struct iv_use *use) { + gcc_assert (use == NULL || use->sub_id == 0); + if (ip_normal_pos (data->current_loop)) add_candidate_1 (data, base, step, important, IP_NORMAL, use, NULL); if (ip_end_pos (data->current_loop) @@ -2687,11 +2964,22 @@ new_cost (unsigned runtime, unsigned complexity) return cost; } +/* Returns true if COST is infinite. */ + +static bool +infinite_cost_p (comp_cost cost) +{ + return cost.cost == INFTY; +} + /* Adds costs COST1 and COST2. */ static comp_cost add_costs (comp_cost cost1, comp_cost cost2) { + if (infinite_cost_p (cost1) || infinite_cost_p (cost2)) + return infinite_cost; + cost1.cost += cost2.cost; cost1.complexity += cost2.complexity; @@ -2720,14 +3008,6 @@ compare_costs (comp_cost cost1, comp_cost cost2) return cost1.cost - cost2.cost; } -/* Returns true if COST is infinite. */ - -static bool -infinite_cost_p (comp_cost cost) -{ - return cost.cost == INFTY; -} - /* Sets cost of (USE, CANDIDATE) pair to COST and record that it depends on invariants DEPENDS_ON and that the value used in expressing it is VALUE, and in case of iv elimination the comparison operator is COMP. */ @@ -4204,7 +4484,15 @@ get_computation_cost_at (struct ivopts_data *data, cost.cost += add_cost (data->speed, TYPE_MODE (ctype)); } - if (inv_expr_id) + /* The set of invariants depended on by a sub use has already been + computed for the first use in the group.
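+ For a sub use the cost accumulated above is therefore reset to zero and the depends_on set is cleared: both are accounted only once per group, for its first use.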
*/ + if (use->sub_id) + { + cost.cost = 0; + if (depends_on && *depends_on) + bitmap_clear (*depends_on); + } + else if (inv_expr_id) { *inv_expr_id = get_loop_invariant_expr_id (data, ubase, cbase, ratio, address_p); @@ -4333,6 +4621,8 @@ determine_use_iv_cost_address (struct ivopts_data *data, bitmap depends_on; bool can_autoinc; int inv_expr_id = -1; + struct iv_use *sub_use; + comp_cost sub_cost; comp_cost cost = get_computation_cost (data, use, cand, true, &depends_on, &can_autoinc, &inv_expr_id); @@ -4346,6 +4636,15 @@ determine_use_iv_cost_address (struct ivopts_data *data, else if (cand->pos == IP_AFTER_USE || cand->pos == IP_BEFORE_USE) cost = infinite_cost; } + for (sub_use = use->next; + sub_use && !infinite_cost_p (cost); + sub_use = sub_use->next) + { + sub_cost = get_computation_cost (data, sub_use, cand, true, &depends_on, + &can_autoinc, &inv_expr_id); + cost = add_costs (cost, sub_cost); + } + set_use_iv_cost (data, use, cand, cost, depends_on, NULL_TREE, ERROR_MARK, inv_expr_id); @@ -6533,8 +6832,8 @@ adjust_iv_update_pos (struct iv_cand *cand, struct iv_use *use) /* Rewrites USE (address that is an iv) using candidate CAND. */ static void -rewrite_use_address (struct ivopts_data *data, - struct iv_use *use, struct iv_cand *cand) +rewrite_use_address_1 (struct ivopts_data *data, + struct iv_use *use, struct iv_cand *cand) { aff_tree aff; gimple_stmt_iterator bsi = gsi_for_stmt (use->stmt); @@ -6569,6 +6868,28 @@ rewrite_use_address (struct ivopts_data *data, *use->op_p = ref; } +/* Rewrites USE (address that is an iv) using candidate CAND. If it's the + first use of a group, rewrites sub uses in the group too. */ + +static void +rewrite_use_address (struct ivopts_data *data, + struct iv_use *use, struct iv_cand *cand) +{ + struct iv_use *next; + + gcc_assert (use->sub_id == 0); + rewrite_use_address_1 (data, use, cand); + update_stmt (use->stmt); + + for (next = use->next; next != NULL; next = next->next) + { + rewrite_use_address_1 (data, next, cand); + update_stmt (next->stmt); + } + + return; +} + /* Rewrites USE (the condition such that one of the arguments is an iv) using candidate CAND. */ @@ -6845,6 +7166,18 @@ free_loop_data (struct ivopts_data *data) for (i = 0; i < n_iv_uses (data); i++) { struct iv_use *use = iv_use (data, i); + struct iv_use *pre = use, *sub = use->next; + + while (sub) + { + gcc_assert (sub->related_cands == NULL); + gcc_assert (sub->n_map_members == 0 && sub->cost_map == NULL); + + free (sub->iv); + pre = sub; + sub = sub->next; + free (pre); + } free (use->iv); BITMAP_FREE (use->related_cands); @@ -6964,6 +7297,7 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop) /* Finds interesting uses (item 1). */ find_interesting_uses (data); + group_address_uses (data); if (n_iv_uses (data) > MAX_CONSIDERED_USES) goto finish; diff --git a/gcc-4.9/gcc/tree-ssa-loop-niter.c b/gcc-4.9/gcc/tree-ssa-loop-niter.c index 8fb72b6a5..cb9214fbb 100644 --- a/gcc-4.9/gcc/tree-ssa-loop-niter.c +++ b/gcc-4.9/gcc/tree-ssa-loop-niter.c @@ -22,6 +22,7 @@ along with GCC; see the file COPYING3. If not see #include "coretypes.h" #include "tm.h" #include "tree.h" +#include "stor-layout.h" #include "calls.h" #include "expr.h" #include "tm_p.h" @@ -55,7 +56,7 @@ along with GCC; see the file COPYING3. 
If not see #include "tree-pass.h" #include "stringpool.h" #include "tree-ssanames.h" - +#include "ggc.h" #define SWAP(X, Y) do { affine_iv *tmp = (X); (X) = (Y); (Y) = tmp; } while (0) @@ -1161,6 +1162,7 @@ number_of_iterations_lt (tree type, affine_iv *iv0, affine_iv *iv1, iv1->base, iv0->base); niter->niter = delta; niter->max = mpz_get_double_int (niter_type, bnds->up, false); + niter->control.no_overflow = true; return true; } @@ -1938,6 +1940,9 @@ number_of_iterations_exit (struct loop *loop, edge exit, return false; niter->assumptions = boolean_false_node; + niter->control.base = NULL_TREE; + niter->control.step = NULL_TREE; + niter->control.no_overflow = false; stmt = last_stmt (exit->src); if (!stmt || gimple_code (stmt) != GIMPLE_COND) return false; @@ -2714,6 +2719,29 @@ record_estimate (struct loop *loop, tree bound, double_int i_bound, record_niter_bound (loop, i_bound, realistic, upper); } +/* Records the control iv analyzed in NITER for LOOP if the iv is valid + and doesn't overflow. */ + +static void +record_control_iv (struct loop *loop, struct tree_niter_desc *niter) +{ + struct control_iv *iv; + + if (!niter->control.base || !niter->control.step) + return; + + if (!integer_onep (niter->assumptions) || !niter->control.no_overflow) + return; + + iv = ggc_alloc_control_iv (); + iv->base = niter->control.base; + iv->step = niter->control.step; + iv->next = loop->control_ivs; + loop->control_ivs = iv; + + return; +} + /* Record the estimate on number of iterations of LOOP based on the fact that the induction variable BASE + STEP * i evaluated in STMT does not wrap and its values belong to the range <LOW, HIGH>. REALISTIC is true if the @@ -3452,6 +3480,7 @@ estimate_numbers_of_iterations_loop (struct loop *loop) record_estimate (loop, niter, niter_desc.max, last_stmt (ex->src), true, ex == likely_exit, true); + record_control_iv (loop, &niter_desc); } exits.release (); @@ -3759,6 +3788,189 @@ nowrap_type_p (tree type) return false; } +/* Return true if we can prove LOOP is exited before evolution of induction + variable {BASE, STEP} overflows with respect to its type bound. */ + +static bool +loop_exits_before_overflow (tree base, tree step, + gimple at_stmt, struct loop *loop) +{ + double_int niter; + struct control_iv *civ; + struct nb_iter_bound *bound; + tree e, delta, step_abs, unsigned_base; + tree type = TREE_TYPE (step); + tree unsigned_type, valid_niter; + + /* Don't issue signed overflow warnings. */ + fold_defer_overflow_warnings (); + + /* Compute the number of iterations before we reach the bound of the + type, and verify that the loop is exited before this occurs.
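+ For a positive step that bound is (MAX - base) / step iterations, computed below in the unsigned version of the type (and symmetrically (base - MIN) / |step| for a negative step), giving valid_niter.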
*/ + unsigned_type = unsigned_type_for (type); + unsigned_base = fold_convert (unsigned_type, base); + + if (tree_int_cst_sign_bit (step)) + { + tree extreme = fold_convert (unsigned_type, + lower_bound_in_type (type, type)); + delta = fold_build2 (MINUS_EXPR, unsigned_type, unsigned_base, extreme); + step_abs = fold_build1 (NEGATE_EXPR, unsigned_type, + fold_convert (unsigned_type, step)); + } + else + { + tree extreme = fold_convert (unsigned_type, + upper_bound_in_type (type, type)); + delta = fold_build2 (MINUS_EXPR, unsigned_type, extreme, unsigned_base); + step_abs = fold_convert (unsigned_type, step); + } + + valid_niter = fold_build2 (FLOOR_DIV_EXPR, unsigned_type, delta, step_abs); + + estimate_numbers_of_iterations_loop (loop); + + if (max_loop_iterations (loop, &niter) + && double_int_fits_to_tree_p (TREE_TYPE (valid_niter), niter) + && (e = fold_binary (GT_EXPR, boolean_type_node, valid_niter, + double_int_to_tree (TREE_TYPE (valid_niter), + niter))) != NULL + && integer_nonzerop (e)) + { + fold_undefer_and_ignore_overflow_warnings (); + return true; + } + if (at_stmt) + for (bound = loop->bounds; bound; bound = bound->next) + { + if (n_of_executions_at_most (at_stmt, bound, valid_niter)) + { + fold_undefer_and_ignore_overflow_warnings (); + return true; + } + } + fold_undefer_and_ignore_overflow_warnings (); + + /* Try to prove loop is exited before {base, step} overflows with the + help of analyzed loop control IV. This is done only for IVs with + constant step because otherwise we don't have the information. */ + if (TREE_CODE (step) == INTEGER_CST) + for (civ = loop->control_ivs; civ; civ = civ->next) + { + enum tree_code code; + tree stepped, extreme, civ_type = TREE_TYPE (civ->step); + + /* Have to consider type difference because operand_equal_p ignores + that for constants. */ + if (TYPE_UNSIGNED (type) != TYPE_UNSIGNED (civ_type) + || element_precision (type) != element_precision (civ_type)) + continue; + + /* Only consider control IV with same step. */ + if (!operand_equal_p (step, civ->step, 0)) + continue; + + /* Done proving if this is a no-overflow control IV. */ + if (operand_equal_p (base, civ->base, 0)) + return true; + + /* If this is a before stepping control IV, in other words, we have + + {civ_base, step} = {base + step, step} + + Because civ {base + step, step} doesn't overflow during loop + iterations, {base, step} will not overflow if we can prove the + operation "base + step" does not overflow. Specifically, we try + to prove below conditions are satisfied: + + base <= UPPER_BOUND (type) - step ;;step > 0 + base >= LOWER_BOUND (type) - step ;;step < 0 + + by proving the reverse conditions are false using loop's initial + condition. */ + stepped = fold_build2 (PLUS_EXPR, TREE_TYPE (base), base, step); + if (operand_equal_p (stepped, civ->base, 0)) + { + if (tree_int_cst_sign_bit (step)) + { + code = LT_EXPR; + extreme = lower_bound_in_type (type, type); + } + else + { + code = GT_EXPR; + extreme = upper_bound_in_type (type, type); + } + extreme = fold_build2 (MINUS_EXPR, type, extreme, step); + e = fold_build2 (code, boolean_type_node, base, extreme); + e = simplify_using_initial_conditions (loop, e); + if (integer_zerop (e)) + return true; + + continue; + } + + /* Similar to above, only in this case we have: + + {civ_base, step} = {(signed T)((unsigned T)base + step), step} + && TREE_TYPE (civ_base) = signed T. 
+ + We prove that the condition below is satisfied: + + (signed T)((unsigned T)base + step) + == (signed T)(unsigned T)base + step + == base + step + + for exactly the same reason as above. This also proves + there is no overflow in the operation "base + step", and thus + none in the induction variable {base, step} during loop + iterations. + + This is necessary to handle cases like the one below: + + int foo (int *a, signed char s, signed char l) + { + signed char i; + for (i = s; i < l; i++) + a[i] = 0; + return 0; + } + + The variable I is first converted to type unsigned char, + incremented, then converted back to type signed char. */ + if (!CONVERT_EXPR_P (civ->base) || TREE_TYPE (civ->base) != type) + continue; + e = TREE_OPERAND (civ->base, 0); + if (TREE_CODE (e) != PLUS_EXPR + || TREE_CODE (TREE_OPERAND (e, 1)) != INTEGER_CST + || !operand_equal_p (step, + fold_convert (type, + TREE_OPERAND (e, 1)), 0)) + continue; + e = TREE_OPERAND (e, 0); + if (!CONVERT_EXPR_P (e) || !operand_equal_p (e, unsigned_base, 0)) + continue; + e = TREE_OPERAND (e, 0); + gcc_assert (operand_equal_p (e, base, 0)); + if (tree_int_cst_sign_bit (step)) + { + code = LT_EXPR; + extreme = lower_bound_in_type (type, type); + } + else + { + code = GT_EXPR; + extreme = upper_bound_in_type (type, type); + } + extreme = fold_build2 (MINUS_EXPR, type, extreme, step); + e = fold_build2 (code, boolean_type_node, base, extreme); + e = simplify_using_initial_conditions (loop, e); + if (integer_zerop (e)) + return true; + } + + return false; +} + /* Return false only when the induction variable BASE + STEP * I is known to not overflow: i.e. when the number of iterations is small enough with respect to the step and initial condition in order to @@ -3774,13 +3986,6 @@ scev_probably_wraps_p (tree base, tree step, gimple at_stmt, struct loop *loop, bool use_overflow_semantics) { - tree delta, step_abs; - tree unsigned_type, valid_niter; - tree type = TREE_TYPE (step); - tree e; - double_int niter; - struct nb_iter_bound *bound; - /* FIXME: We really need something like http://gcc.gnu.org/ml/gcc-patches/2005-06/msg02025.html. @@ -3814,56 +4019,8 @@ scev_probably_wraps_p (tree base, tree step, if (TREE_CODE (step) != INTEGER_CST) return true; - /* Don't issue signed overflow warnings. */ - fold_defer_overflow_warnings (); - - /* Otherwise, compute the number of iterations before we reach the - bound of the type, and verify that the loop is exited before this - occurs.
*/ - unsigned_type = unsigned_type_for (type); - base = fold_convert (unsigned_type, base); - - if (tree_int_cst_sign_bit (step)) - { - tree extreme = fold_convert (unsigned_type, - lower_bound_in_type (type, type)); - delta = fold_build2 (MINUS_EXPR, unsigned_type, base, extreme); - step_abs = fold_build1 (NEGATE_EXPR, unsigned_type, - fold_convert (unsigned_type, step)); - } - else - { - tree extreme = fold_convert (unsigned_type, - upper_bound_in_type (type, type)); - delta = fold_build2 (MINUS_EXPR, unsigned_type, extreme, base); - step_abs = fold_convert (unsigned_type, step); - } - - valid_niter = fold_build2 (FLOOR_DIV_EXPR, unsigned_type, delta, step_abs); - - estimate_numbers_of_iterations_loop (loop); - - if (max_loop_iterations (loop, &niter) - && double_int_fits_to_tree_p (TREE_TYPE (valid_niter), niter) - && (e = fold_binary (GT_EXPR, boolean_type_node, valid_niter, - double_int_to_tree (TREE_TYPE (valid_niter), - niter))) != NULL - && integer_nonzerop (e)) - { - fold_undefer_and_ignore_overflow_warnings (); - return false; - } - if (at_stmt) - for (bound = loop->bounds; bound; bound = bound->next) - { - if (n_of_executions_at_most (at_stmt, bound, valid_niter)) - { - fold_undefer_and_ignore_overflow_warnings (); - return false; - } - } - - fold_undefer_and_ignore_overflow_warnings (); + if (loop_exits_before_overflow (base, step, at_stmt, loop)) + return false; /* At this point we still don't have a proof that the iv does not overflow: give up. */ @@ -3875,17 +4032,26 @@ scev_probably_wraps_p (tree base, tree step, void free_numbers_of_iterations_estimates_loop (struct loop *loop) { - struct nb_iter_bound *bound, *next; + struct control_iv *civ; + struct nb_iter_bound *bound; loop->nb_iterations = NULL; loop->estimate_state = EST_NOT_COMPUTED; - for (bound = loop->bounds; bound; bound = next) + for (bound = loop->bounds; bound;) { - next = bound->next; + struct nb_iter_bound *next = bound->next; ggc_free (bound); + bound = next; } loop->bounds = NULL; + + for (civ = loop->control_ivs; civ;) + { + struct control_iv *next = civ->next; + ggc_free (civ); + civ = next; + } + loop->control_ivs = NULL; } /* Frees the information on upper bounds on numbers of iterations of loops. */ diff --git a/gcc-4.9/gcc/tree-ssa-loop-niter.h b/gcc-4.9/gcc/tree-ssa-loop-niter.h index df0d64d02..dd2535851 100644 --- a/gcc-4.9/gcc/tree-ssa-loop-niter.h +++ b/gcc-4.9/gcc/tree-ssa-loop-niter.h @@ -41,6 +41,7 @@ extern void estimate_numbers_of_iterations (void); extern bool stmt_dominates_stmt_p (gimple, gimple); extern bool nowrap_type_p (tree); extern bool scev_probably_wraps_p (tree, tree, gimple, struct loop *, bool); +extern void free_loop_control_ivs (struct loop *); extern void free_numbers_of_iterations_estimates_loop (struct loop *); extern void free_numbers_of_iterations_estimates (void); extern void substitute_in_loop_info (struct loop *, tree, tree);