diff options
Diffstat (limited to 'gcc-4.9/gcc')
-rw-r--r-- | gcc-4.9/gcc/cfgcleanup.c | 3 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/i386/gnu-user.h | 4 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/i386/gnu-user64.h | 5 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/i386/i386.c | 32 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/i386/i386.h | 4 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/i386/linux-common.h | 5 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/i386/predicates.md | 16 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/i386/sse.md | 29 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/i386/x86-tune.def | 9 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/mips/p5600.md | 159 | ||||
-rw-r--r-- | gcc-4.9/gcc/cse.c | 2 | ||||
-rw-r--r-- | gcc-4.9/gcc/emit-rtl.c | 4 | ||||
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.dg/vect/pr52252-ld.c | 30 | ||||
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-atom.c | 29 | ||||
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-core.c | 29 | ||||
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.target/i386/pr61403.c | 27 | ||||
-rw-r--r-- | gcc-4.9/gcc/tree-vect-data-refs.c | 524 | ||||
-rw-r--r-- | gcc-4.9/gcc/tree-vect-stmts.c | 9 |
18 files changed, 808 insertions(+), 112 deletions(-)
diff --git a/gcc-4.9/gcc/cfgcleanup.c b/gcc-4.9/gcc/cfgcleanup.c index 77196ee6b..de307da54 100644 --- a/gcc-4.9/gcc/cfgcleanup.c +++ b/gcc-4.9/gcc/cfgcleanup.c @@ -53,6 +53,7 @@ along with GCC; see the file COPYING3. If not see #include "df.h" #include "dce.h" #include "dbgcnt.h" +#include "emit-rtl.h" #define FORWARDER_BLOCK_P(BB) ((BB)->flags & BB_FORWARDER_BLOCK) @@ -882,7 +883,7 @@ merge_memattrs (rtx x, rtx y) if (GET_MODE (x) != GET_MODE (y)) return; - if (code == MEM && MEM_ATTRS (x) != MEM_ATTRS (y)) + if (code == MEM && !mem_attrs_eq_p (MEM_ATTRS (x), MEM_ATTRS (y))) { if (! MEM_ATTRS (x)) MEM_ATTRS (y) = 0; diff --git a/gcc-4.9/gcc/config/i386/gnu-user.h b/gcc-4.9/gcc/config/i386/gnu-user.h index 21b9e9692..d4a16e470 100644 --- a/gcc-4.9/gcc/config/i386/gnu-user.h +++ b/gcc-4.9/gcc/config/i386/gnu-user.h @@ -65,6 +65,10 @@ along with GCC; see the file COPYING3. If not see When the -shared link option is used a final link is not being done. */ +#undef ANDROID_TARGET_CC1_SPEC +#define ANDROID_TARGET_CC1_SPEC \ + " -mstackrealign -mssse3 -fno-short-enums " \ + #undef ASM_SPEC #define ASM_SPEC \ "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}} " \ diff --git a/gcc-4.9/gcc/config/i386/gnu-user64.h b/gcc-4.9/gcc/config/i386/gnu-user64.h index 1c72b41e4..0333b5c26 100644 --- a/gcc-4.9/gcc/config/i386/gnu-user64.h +++ b/gcc-4.9/gcc/config/i386/gnu-user64.h @@ -46,6 +46,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #define SPEC_X32 "mx32" #endif +#undef ANDROID_TARGET_CC1_SPEC +#define ANDROID_TARGET_CC1_SPEC \ + "%{m32:-mstackrealign -mssse3 -fno-short-enums}" \ + "%{!m32:-msse4.2 -mpopcnt}" + #undef ASM_SPEC #define ASM_SPEC "%{" SPEC_32 ":--32} \ %{" SPEC_64 ":--64} \ diff --git a/gcc-4.9/gcc/config/i386/i386.c b/gcc-4.9/gcc/config/i386/i386.c index 85ceff337..fcd5f0dd1 100644 --- a/gcc-4.9/gcc/config/i386/i386.c +++ b/gcc-4.9/gcc/config/i386/i386.c @@ -82,6 +82,7 @@ along with GCC; see the file COPYING3. 
If not see #include "context.h" #include "pass_manager.h" #include "target-globals.h" +#include "tree-vectorizer.h" static rtx legitimize_dllimport_symbol (rtx, bool); static rtx legitimize_pe_coff_extern_decl (rtx, bool); @@ -1739,7 +1740,7 @@ struct processor_costs slm_cost = { 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ 1, /* vec_stmt_cost. */ - 1, /* vec_to_scalar_cost. */ + 4, /* vec_to_scalar_cost. */ 1, /* scalar_to_vec_cost. */ 1, /* vec_align_load_cost. */ 2, /* vec_unalign_load_cost. */ @@ -1816,7 +1817,7 @@ struct processor_costs intel_cost = { 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ 1, /* vec_stmt_cost. */ - 1, /* vec_to_scalar_cost. */ + 4, /* vec_to_scalar_cost. */ 1, /* scalar_to_vec_cost. */ 1, /* vec_align_load_cost. */ 2, /* vec_unalign_load_cost. */ @@ -44301,7 +44302,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) gcc_unreachable (); case V8HImode: - if (TARGET_SSSE3) + if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) return expand_vec_perm_pshufb2 (d); else { @@ -44324,7 +44325,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) break; case V16QImode: - if (TARGET_SSSE3) + if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) return expand_vec_perm_pshufb2 (d); else { @@ -46496,6 +46497,16 @@ ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED, { int res = 1; + /* Vector part. */ + if (VECTOR_MODE_P (mode)) + { + if (TARGET_VECTOR_PARALLEL_EXECUTION) + return 2; + else + return 1; + } + + /* Scalar part. */ if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL) res = 2; else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL) @@ -46595,7 +46606,6 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind, { unsigned *cost = (unsigned *) data; unsigned retval = 0; - tree vectype = stmt_info ? 
stmt_vectype (stmt_info) : NULL_TREE; int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); @@ -46606,6 +46616,18 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind, count *= 50; /* FIXME. */ retval = (unsigned) (count * stmt_cost); + + /* We need to multiply all vector stmt cost by 1.7 (estimated cost) + for Silvermont as it has out of order integer pipeline and can execute + 2 scalar instruction per tick, but has in order SIMD pipeline. */ + if (TARGET_SILVERMONT || TARGET_INTEL) + if (stmt_info && stmt_info->stmt) + { + tree lhs_op = gimple_get_lhs (stmt_info->stmt); + if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE) + retval = (retval * 17) / 10; + } + cost[where] += retval; return retval; diff --git a/gcc-4.9/gcc/config/i386/i386.h b/gcc-4.9/gcc/config/i386/i386.h index b3b7c8d30..f6b169c24 100644 --- a/gcc-4.9/gcc/config/i386/i386.h +++ b/gcc-4.9/gcc/config/i386/i386.h @@ -425,6 +425,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_USE_VECTOR_FP_CONVERTS] #define TARGET_USE_VECTOR_CONVERTS \ ix86_tune_features[X86_TUNE_USE_VECTOR_CONVERTS] +#define TARGET_SLOW_PSHUFB \ + ix86_tune_features[X86_TUNE_SLOW_PSHUFB] +#define TARGET_VECTOR_PARALLEL_EXECUTION \ + ix86_tune_features[X86_TUNE_VECTOR_PARALLEL_EXECUTION] #define TARGET_FUSE_CMP_AND_BRANCH_32 \ ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32] #define TARGET_FUSE_CMP_AND_BRANCH_64 \ diff --git a/gcc-4.9/gcc/config/i386/linux-common.h b/gcc-4.9/gcc/config/i386/linux-common.h index 574f096e6..d980fb719 100644 --- a/gcc-4.9/gcc/config/i386/linux-common.h +++ b/gcc-4.9/gcc/config/i386/linux-common.h @@ -27,11 +27,6 @@ along with GCC; see the file COPYING3. 
If not see } \ while (0) -#undef ANDROID_TARGET_CC1_SPEC -#define ANDROID_TARGET_CC1_SPEC \ - "%{m32:-mstackrealign -mssse3 -fno-short-enums}" \ - "%{!m32:-msse4.2 -mpopcnt}" - #undef CC1_SPEC #define CC1_SPEC \ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ diff --git a/gcc-4.9/gcc/config/i386/predicates.md b/gcc-4.9/gcc/config/i386/predicates.md index 2ef138424..8266f3eaf 100644 --- a/gcc-4.9/gcc/config/i386/predicates.md +++ b/gcc-4.9/gcc/config/i386/predicates.md @@ -1417,6 +1417,22 @@ return true; }) +;; Return true if OP is a parallel for a palignr permute. +(define_predicate "palignr_operand" + (and (match_code "parallel") + (match_code "const_int" "a")) +{ + int elt = INTVAL (XVECEXP (op, 0, 0)); + int i, nelt = XVECLEN (op, 0); + + /* Check that an order in the permutation is suitable for palignr. + For example, {5 6 7 0 1 2 3 4} is "palignr 5, xmm, xmm". */ + for (i = 1; i < nelt; ++i) + if (INTVAL (XVECEXP (op, 0, i)) != ((elt + i) % nelt)) + return false; + return true; +}) + ;; Return true if OP is a proper third operand to vpblendw256. 
(define_predicate "avx2_pblendw_operand" (match_code "const_int") diff --git a/gcc-4.9/gcc/config/i386/sse.md b/gcc-4.9/gcc/config/i386/sse.md index 8d061da95..4aced2da9 100644 --- a/gcc-4.9/gcc/config/i386/sse.md +++ b/gcc-4.9/gcc/config/i386/sse.md @@ -14576,6 +14576,35 @@ (set_attr "prefix" "vex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "*ssse3_palignr<mode>_perm" + [(set (match_operand:V_128 0 "register_operand" "=x,x") + (vec_select:V_128 + (match_operand:V_128 1 "register_operand" "0,x") + (match_parallel 2 "palignr_operand" + [(match_operand 3 "const_int_operand" "n, n")])))] + "TARGET_SSSE3" +{ + enum machine_mode imode = GET_MODE_INNER (GET_MODE (operands[0])); + operands[2] = GEN_INT (INTVAL (operands[3]) * GET_MODE_SIZE (imode)); + + switch (which_alternative) + { + case 0: + return "palignr\t{%2, %1, %0|%0, %1, %2}"; + case 1: + return "vpalignr\t{%2, %1, %1, %0|%0, %1, %1, %2}"; + default: + gcc_unreachable (); + } +} + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sseishft") + (set_attr "atom_unit" "sishuf") + (set_attr "prefix_data16" "1,*") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "orig,vex")]) + (define_expand "avx_vinsertf128<mode>" [(match_operand:V_256 0 "register_operand") (match_operand:V_256 1 "register_operand") diff --git a/gcc-4.9/gcc/config/i386/x86-tune.def b/gcc-4.9/gcc/config/i386/x86-tune.def index c36174855..b7a703fa0 100644 --- a/gcc-4.9/gcc/config/i386/x86-tune.def +++ b/gcc-4.9/gcc/config/i386/x86-tune.def @@ -386,6 +386,15 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts", from integer to FP. */ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10) +/* X86_TUNE_SLOW_SHUFB: Indicates tunings with slow pshufb instruction. 
*/ +DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb", + m_BONNELL | m_SILVERMONT | m_INTEL) + +/* X86_TUNE_VECTOR_PARALLEL_EXECUTION: Indicates tunings with ability to + execute 2 or more vector instructions in parallel. */ +DEF_TUNE (X86_TUNE_VECTOR_PARALLEL_EXECUTION, "vec_parallel", + m_NEHALEM | m_SANDYBRIDGE | m_HASWELL) + /*****************************************************************************/ /* AVX instruction selection tuning (some of SSE flags affects AVX, too) */ /*****************************************************************************/ diff --git a/gcc-4.9/gcc/config/mips/p5600.md b/gcc-4.9/gcc/config/mips/p5600.md index 14d417fcc..d672dc401 100644 --- a/gcc-4.9/gcc/config/mips/p5600.md +++ b/gcc-4.9/gcc/config/mips/p5600.md @@ -18,14 +18,14 @@ ;; along with GCC; see the file COPYING3. If not see ;; <http://www.gnu.org/licenses/>. -(define_automaton "p5600_agen_pipe, p5600_alu_pipe, p5600_fpu_pipe") +(define_automaton "p5600_agen_alq_pipe, p5600_fpu_pipe") ;; The address generation queue (AGQ) has AL2, CTISTD and LDSTA pipes (define_cpu_unit "p5600_agq, p5600_al2, p5600_ctistd, p5600_ldsta, - p5600_gpdiv" "p5600_agen_pipe") + p5600_gpdiv" "p5600_agen_alq_pipe") ;; The arithmetic-logic-unit queue (ALQ) has ALU pipe -(define_cpu_unit "p5600_alq, p5600_alu" "p5600_alu_pipe") +(define_cpu_unit "p5600_alq, p5600_alu" "p5600_agen_alq_pipe") ;; The floating-point-unit queue (FPQ) has short and long pipes (define_cpu_unit "p5600_fpu_short, p5600_fpu_long" "p5600_fpu_pipe") @@ -53,92 +53,109 @@ ;; Arithmetic ;; add, hadd, sub, hsub, average, min, max, compare (define_insn_reservation "msa_short_int_add" 2 - (eq_attr "msa_execunit" "msa_eu_int_add") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_int_add")) "p5600_fpu_short, p5600_fpu_intadd") ;; Bitwise Instructions ;; and, or, xor, bit-clear, leading-bits-count, shift, shuffle (define_insn_reservation "msa_short_logic" 2 - (eq_attr "msa_execunit" "msa_eu_logic") + (and (eq_attr "cpu" 
"p5600") + (eq_attr "msa_execunit" "msa_eu_logic")) "p5600_fpu_short, p5600_fpu_logic_a") ;; move.v (define_insn_reservation "msa_short_logic_move_v" 2 - (and (eq_attr "type" "fmove") - (eq_attr "mode" "TI")) + (and (eq_attr "cpu" "p5600") + (and (eq_attr "type" "fmove") + (eq_attr "mode" "TI"))) "p5600_fpu_short, p5600_fpu_logic_a") ;; Float compare (define_insn_reservation "msa_short_cmp" 2 - (eq_attr "msa_execunit" "msa_eu_cmp") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_cmp")) "p5600_fpu_short, p5600_fpu_cmp") ;; Float exp2, min, max (define_insn_reservation "msa_short_float2" 2 - (eq_attr "msa_execunit" "msa_eu_float2") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_float2")) "p5600_fpu_short, p5600_fpu_float") ;; Vector sat (define_insn_reservation "msa_short_logic3" 3 - (eq_attr "msa_execunit" "msa_eu_logic3") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_logic3")) "p5600_fpu_short, p5600_fpu_logic_a, p5600_fpu_logic_b") ;; Vector copy, bz, bnz (define_insn_reservation "msa_short_store4" 4 - (eq_attr "msa_execunit" "msa_eu_store4") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_store4")) "p5600_fpu_short, p5600_fpu_store") ;; Vector load (define_insn_reservation "msa_long_load" 10 - (and (eq_attr "type" "fpload") - (eq_attr "mode" "TI")) + (and (eq_attr "cpu" "p5600") + (and (eq_attr "type" "fpload") + (eq_attr "mode" "TI"))) "p5600_fpu_long, p5600_fpu_load") ;; Vector store (define_insn_reservation "msa_short_store" 2 - (and (eq_attr "type" "fpstore") - (eq_attr "mode" "TI")) + (and (eq_attr "cpu" "p5600") + (and (eq_attr "type" "fpstore") + (eq_attr "mode" "TI"))) "p5600_fpu_short, p5600_fpu_store") ;; binsl, binsr, insert, vshf, sld (define_insn_reservation "msa_long_logic" 2 - (eq_attr "msa_execunit" "msa_eu_logic_l") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_logic_l")) "p5600_fpu_long, p5600_fpu_logic") ;; Float fclass, flog2 (define_insn_reservation 
"msa_long_float2" 2 - (eq_attr "msa_execunit" "msa_eu_float2_l") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_float2_l")) "p5600_fpu_long, p5600_fpu_float_a") ;; fadd, fsub (define_insn_reservation "msa_long_float4" 4 - (eq_attr "msa_execunit" "msa_eu_float4") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_float4")) "p5600_fpu_long, p5600_fpu_float_a, p5600_fpu_float_b") ;; fmul (define_insn_reservation "msa_long_float5" 5 - (eq_attr "msa_execunit" "msa_eu_float5") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_float5")) "p5600_fpu_long, p5600_fpu_float_a, p5600_fpu_float_b, p5600_fpu_float_c") ;; fmadd, fmsub (define_insn_reservation "msa_long_float8" 8 - (eq_attr "msa_execunit" "msa_eu_float8") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_float8")) "p5600_fpu_long, p5600_fpu_float_a, p5600_fpu_float_b, p5600_fpu_float_c, p5600_fpu_float_d") ;; Vector mul, dotp, madd, msub (define_insn_reservation "msa_long_mult" 5 - (eq_attr "msa_execunit" "msa_eu_mult") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_mult")) "p5600_fpu_long, p5600_fpu_mult") ;; fdiv, fmod (semi-pipelined) (define_insn_reservation "msa_long_fdiv" 10 - (eq_attr "msa_execunit" "msa_eu_fdiv") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_fdiv")) "p5600_fpu_long, nothing, nothing, p5600_fpu_fdiv*8") ;; div, mod (non-pipelined) (define_insn_reservation "msa_long_div" 10 - (eq_attr "msa_execunit" "msa_eu_div") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_div")) "p5600_fpu_long, p5600_fpu_div*9, p5600_fpu_div + p5600_fpu_logic_a") ;; @@ -147,52 +164,62 @@ ;; fadd, fsub (define_insn_reservation "p5600_fpu_fadd" 4 - (eq_attr "type" "fadd,fabs,fneg") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "fadd,fabs,fneg")) "p5600_fpu_long, p5600_fpu_apu") ;; fabs, fneg, fcmp (define_insn_reservation "p5600_fpu_fabs" 2 - (eq_attr "type" "fabs,fneg,fcmp,fmove") + (and (eq_attr "cpu" 
"p5600") + (eq_attr "type" "fabs,fneg,fcmp,fmove")) "p5600_fpu_short, p5600_fpu_apu") ;; fload (define_insn_reservation "p5600_fpu_fload" 8 - (eq_attr "type" "fpload,fpidxload") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "fpload,fpidxload")) "p5600_fpu_long, p5600_fpu_apu") ;; fstore (define_insn_reservation "p5600_fpu_fstore" 1 - (eq_attr "type" "fpstore,fpidxstore") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "fpstore,fpidxstore")) "p5600_fpu_short, p5600_fpu_apu") ;; fmadd (define_insn_reservation "p5600_fpu_fmadd" 9 - (eq_attr "type" "fmadd") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "fmadd")) "p5600_fpu_long, p5600_fpu_apu") ;; fmul (define_insn_reservation "p5600_fpu_fmul" 5 - (eq_attr "type" "fmul") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "fmul")) "p5600_fpu_long, p5600_fpu_apu") ;; fdiv, fsqrt (define_insn_reservation "p5600_fpu_div" 17 - (eq_attr "type" "fdiv,frdiv,fsqrt,frsqrt") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "fdiv,frdiv,fsqrt,frsqrt")) "p5600_fpu_long, p5600_fpu_apu*17") ;; fcvt (define_insn_reservation "p5600_fpu_fcvt" 4 - (eq_attr "type" "fcvt") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "fcvt")) "p5600_fpu_long, p5600_fpu_apu") ;; mtc (define_insn_reservation "p5600_fpu_fmtc" 7 - (eq_attr "type" "mtc") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "mtc")) "p5600_fpu_short, p5600_fpu_store") ;; mfc (define_insn_reservation "p5600_fpu_fmfc" 4 - (eq_attr "type" "mfc") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "mfc")) "p5600_fpu_short, p5600_fpu_store") ;; madd/msub feeding into the add source @@ -205,100 +232,120 @@ ;; and (define_insn_reservation "p5600_int_and" 1 - (eq_attr "move_type" "logical") + (and (eq_attr "cpu" "p5600") + (eq_attr "move_type" "logical")) "p5600_alq_alu") ;; lui (define_insn_reservation "p5600_int_lui" 1 - (eq_attr "move_type" "const") + (and (eq_attr "cpu" "p5600") + (eq_attr "move_type" "const")) "p5600_alq_alu") ;; Load lb, lbu, lh, lhu, lq, lw, lw_i2f, lwxs 
(define_insn_reservation "p5600_int_load" 4 - (eq_attr "move_type" "load") + (and (eq_attr "cpu" "p5600") + (eq_attr "move_type" "load")) "p5600_agq_ldsta") ;; store (define_insn_reservation "p5600_int_store" 3 - (eq_attr "move_type" "store") + (and (eq_attr "cpu" "p5600") + (eq_attr "move_type" "store")) "p5600_agq_ldsta") ;; andi, sll, srl, seb, seh (define_insn_reservation "p5600_int_arith_1" 1 - (eq_attr "move_type" "andi,sll0,signext") - "p5600_agq_al2 | p5600_alq_alu") + (and (eq_attr "cpu" "p5600") + (eq_attr "move_type" "andi,sll0,signext")) + "p5600_alq_alu | p5600_agq_al2") ;; addi, addiu, ori, xori, add, addu (define_insn_reservation "p5600_int_arith_2" 1 - (eq_attr "alu_type" "add,or,xor") - "p5600_agq_al2 | p5600_alq_alu") + (and (eq_attr "cpu" "p5600") + (eq_attr "alu_type" "add,or,xor")) + "p5600_alq_alu | p5600_agq_al2") ;; nor, sub (define_insn_reservation "p5600_int_arith_3" 1 - (eq_attr "alu_type" "nor,sub") + (and (eq_attr "cpu" "p5600") + (eq_attr "alu_type" "nor,sub")) "p5600_alq_alu") ;; srl, sra, rotr, slt, sllv, srlv (define_insn_reservation "p5600_int_arith_4" 1 - (eq_attr "type" "shift,slt,move") - "p5600_agq_al2 | p5600_alq_alu") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "shift,slt,move")) + "p5600_alq_alu | p5600_agq_al2") ;; nop (define_insn_reservation "p5600_int_nop" 0 - (eq_attr "type" "nop") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "nop")) "p5600_agq_al2") ;; clo, clz (define_insn_reservation "p5600_int_countbits" 1 - (eq_attr "type" "clz") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "clz")) "p5600_agq_al2") ;; Conditional moves (define_insn_reservation "p5600_int_condmove" 1 - (eq_attr "type" "condmove") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "condmove")) "p5600_agq_al2") ;; madd, msub (define_insn_reservation "p5600_dsp_mac" 5 - (eq_attr "type" "imadd") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "imadd")) "p5600_agq_al2") ;; mfhi/lo (define_insn_reservation "p5600_dsp_mfhilo" 1 - (eq_attr 
"type" "mfhi,mflo") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "mfhi,mflo")) "p5600_agq_al2") ;; mthi/lo (define_insn_reservation "p5600_dsp_mthilo" 5 - (eq_attr "type" "mthi,mtlo") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "mthi,mtlo")) "p5600_agq_al2") ;; mult, multu, mul (define_insn_reservation "p5600_dsp_mult" 5 - (eq_attr "type" "imul3,imul") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "imul3,imul")) "p5600_agq_al2") ;; branch and jump (define_insn_reservation "p5600_int_branch" 1 - (eq_attr "type" "branch,jump") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "branch,jump")) "p5600_agq_ctistd") ;; prefetch (define_insn_reservation "p5600_int_prefetch" 3 - (eq_attr "type" "prefetch,prefetchx") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "prefetch,prefetchx")) "p5600_agq_ldsta") ;; divide (define_insn_reservation "p5600_int_div" 8 - (eq_attr "type" "idiv") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "idiv")) "p5600_agq_al2+p5600_gpdiv*8") ;; arith (define_insn_reservation "p5600_int_arith_5" 2 - (eq_attr "type" "arith") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "arith")) "p5600_agq_al2") ;; call (define_insn_reservation "p5600_int_call" 2 - (eq_attr "jal" "indirect,direct") + (and (eq_attr "cpu" "p5600") + (eq_attr "jal" "indirect,direct")) "p5600_agq_ctistd") diff --git a/gcc-4.9/gcc/cse.c b/gcc-4.9/gcc/cse.c index b8223f7a3..ec9aff419 100644 --- a/gcc-4.9/gcc/cse.c +++ b/gcc-4.9/gcc/cse.c @@ -2680,7 +2680,7 @@ exp_equiv_p (const_rtx x, const_rtx y, int validate, bool for_gcse) But because really all MEM attributes should be the same for equivalent MEMs, we just use the invariant that MEMs that have the same attributes share the same mem_attrs data structure. 
*/ - if (MEM_ATTRS (x) != MEM_ATTRS (y)) + if (!mem_attrs_eq_p (MEM_ATTRS (x), MEM_ATTRS (y))) return 0; } break; diff --git a/gcc-4.9/gcc/emit-rtl.c b/gcc-4.9/gcc/emit-rtl.c index 89b676837..3041b9e7a 100644 --- a/gcc-4.9/gcc/emit-rtl.c +++ b/gcc-4.9/gcc/emit-rtl.c @@ -248,6 +248,10 @@ const_fixed_htab_eq (const void *x, const void *y) bool mem_attrs_eq_p (const struct mem_attrs *p, const struct mem_attrs *q) { + if (p == q) + return true; + if (!p || !q) + return false; return (p->alias == q->alias && p->offset_known_p == q->offset_known_p && (!p->offset_known_p || p->offset == q->offset) diff --git a/gcc-4.9/gcc/testsuite/gcc.dg/vect/pr52252-ld.c b/gcc-4.9/gcc/testsuite/gcc.dg/vect/pr52252-ld.c new file mode 100644 index 000000000..6e3cb52b8 --- /dev/null +++ b/gcc-4.9/gcc/testsuite/gcc.dg/vect/pr52252-ld.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -g -ftree-vectorize -mssse3 -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */ + +#define byte unsigned char + +void +matrix_mul (byte *in, byte *out, int size) +{ + int i; + for (i = 0; i < size; i++) + { + byte in0 = in[0]; + byte in1 = in[1]; + byte in2 = in[2]; + byte out0, out1, out2, out3; + out0 = in0 + in1; + out1 = in0 + in2; + out2 = in1 + in2; + out3 = in0 + in1 + in2; + out[0] = out0; + out[1] = out1; + out[2] = out2; + out[3] = out3; + in += 3; + out += 4; + } +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-atom.c b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-atom.c new file mode 100644 index 000000000..715b45943 --- /dev/null +++ b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-atom.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target ssse3 } */ +/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=slm" } */ +#define byte unsigned char + +void +matrix_mul (byte *in, byte *out, int size) +{ + int i; + for 
(i = 0; i < size; i++) + { + byte in0 = in[0]; + byte in1 = in[1]; + byte in2 = in[2]; + byte out0, out1, out2, out3; + out0 = in0 + in1; + out1 = in0 + in2; + out2 = in1 + in2; + out3 = in0 + in1 + in2; + out[0] = out0; + out[1] = out1; + out[2] = out2; + out[3] = out3; + in += 3; + out += 4; + } +} + +/* { dg-final { scan-assembler "palignr" } } */ diff --git a/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-core.c b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-core.c new file mode 100644 index 000000000..ac857a5fe --- /dev/null +++ b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-core.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target ssse3 } */ +/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=corei7" } */ +#define byte unsigned char + +void +matrix_mul (byte *in, byte *out, int size) +{ + int i; + for (i = 0; i < size; i++) + { + byte in0 = in[0]; + byte in1 = in[1]; + byte in2 = in[2]; + byte out0, out1, out2, out3; + out0 = in0 + in1; + out1 = in0 + in2; + out2 = in1 + in2; + out3 = in0 + in1 + in2; + out[0] = out0; + out[1] = out1; + out[2] = out2; + out[3] = out3; + in += 3; + out += 4; + } +} + +/* { dg-final { scan-assembler "pshufb" } } */ diff --git a/gcc-4.9/gcc/testsuite/gcc.target/i386/pr61403.c b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr61403.c new file mode 100644 index 000000000..84cc5c5c8 --- /dev/null +++ b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr61403.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target sse4 } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse4.2 -mtune=corei7" } */ + +#include <math.h> + +struct XYZ +{ + float x; + float y; + float z; +}; + +void +norm (struct XYZ *in, struct XYZ *out, int size) +{ + int i; + for (i = 0; i < size; ++i) + { + float n = sqrt (in[i].x * in[i].x + in[i].y * in[i].y + in[i].z * in[i].z); + out[i].x = in[i].x / n; + out[i].y = in[i].y / n; + out[i].z = in[i].z / n; + } +} + +/* { dg-final { scan-assembler "blend" } } */ diff --git 
a/gcc-4.9/gcc/tree-vect-data-refs.c b/gcc-4.9/gcc/tree-vect-data-refs.c index 6622bd84d..ab1197ec6 100644 --- a/gcc-4.9/gcc/tree-vect-data-refs.c +++ b/gcc-4.9/gcc/tree-vect-data-refs.c @@ -4815,36 +4815,76 @@ vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count) { enum machine_mode mode = TYPE_MODE (vectype); - /* vect_permute_load_chain requires the group size to be a power of two. */ - if (exact_log2 (count) == -1) + /* vect_permute_load_chain requires the group size to be equal to 3 or + be a power of two. */ + if (count != 3 && exact_log2 (count) == -1) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "the size of the group of accesses" - " is not a power of 2\n"); + "the size of the group of accesses" + " is not a power of 2 or not equal to 3\n"); return false; } /* Check that the permutation is supported. */ if (VECTOR_MODE_P (mode)) { - unsigned int i, nelt = GET_MODE_NUNITS (mode); + unsigned int i, j, nelt = GET_MODE_NUNITS (mode); unsigned char *sel = XALLOCAVEC (unsigned char, nelt); - for (i = 0; i < nelt; i++) - sel[i] = i * 2; - if (can_vec_perm_p (mode, false, sel)) + if (count == 3) { + unsigned int k; + for (k = 0; k < 3; k++) + { + for (i = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = 3 * i + k; + else + sel[i] = 0; + if (!can_vec_perm_p (mode, false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 3 loads is not supported by" + " target\n"); + return false; + } + for (i = 0, j = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = i; + else + sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); + if (!can_vec_perm_p (mode, false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 3 loads is not supported by" + " target\n"); + return false; + } + } + return true; + } + else + { + /* If length is not equal to 3 then only power of 2 is supported. 
*/ + gcc_assert (exact_log2 (count) != -1); for (i = 0; i < nelt; i++) - sel[i] = i * 2 + 1; + sel[i] = i * 2; if (can_vec_perm_p (mode, false, sel)) - return true; - } + { + for (i = 0; i < nelt; i++) + sel[i] = i * 2 + 1; + if (can_vec_perm_p (mode, false, sel)) + return true; + } + } } if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "extract even/odd not supported by target\n"); + "extract even/odd not supported by target\n"); return false; } @@ -4862,8 +4902,9 @@ vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) /* Function vect_permute_load_chain. Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be - a power of 2, generate extract_even/odd stmts to reorder the input data - correctly. Return the final references for loads in RESULT_CHAIN. + a power of 2 or equal to 3, generate extract_even/odd stmts to reorder + the input data correctly. Return the final references for loads in + RESULT_CHAIN. E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. The input is 4 vectors each containing 8 elements. 
We assign a number to each @@ -4944,6 +4985,7 @@ vect_permute_load_chain (vec<tree> dr_chain, { tree data_ref, first_vect, second_vect; tree perm_mask_even, perm_mask_odd; + tree perm3_mask_low, perm3_mask_high; gimple perm_stmt; tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); unsigned int i, j, log_length = exact_log2 (length); @@ -4954,44 +4996,437 @@ vect_permute_load_chain (vec<tree> dr_chain, memcpy (result_chain->address (), dr_chain.address (), length * sizeof (tree)); - for (i = 0; i < nelt; ++i) - sel[i] = i * 2; - perm_mask_even = vect_gen_perm_mask (vectype, sel); - gcc_assert (perm_mask_even != NULL); - - for (i = 0; i < nelt; ++i) - sel[i] = i * 2 + 1; - perm_mask_odd = vect_gen_perm_mask (vectype, sel); - gcc_assert (perm_mask_odd != NULL); - - for (i = 0; i < log_length; i++) + if (length == 3) { - for (j = 0; j < length; j += 2) - { - first_vect = dr_chain[j]; - second_vect = dr_chain[j+1]; + unsigned int k; - /* data_ref = permute_even (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); + for (k = 0; k < 3; k++) + { + for (i = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = 3 * i + k; + else + sel[i] = 0; + perm3_mask_low = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm3_mask_low != NULL); + + for (i = 0, j = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = i; + else + sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); + + perm3_mask_high = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm3_mask_high != NULL); + + first_vect = dr_chain[0]; + second_vect = dr_chain[1]; + + /* Create interleaving stmt (low part of): + low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, + ...}> */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_low"); perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, first_vect, second_vect, - perm_mask_even); + perm3_mask_low); vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[j/2] = 
data_ref; - /* data_ref = permute_odd (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); + /* Create interleaving stmt (high part of): + high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, + ...}> */ + first_vect = data_ref; + second_vect = dr_chain[2]; + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_high"); perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, first_vect, second_vect, - perm_mask_odd); + perm3_mask_high); vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[j/2+length/2] = data_ref; + (*result_chain)[k] = data_ref; + } + } + else + { + /* If length is not equal to 3 then only power of 2 is supported. */ + gcc_assert (exact_log2 (length) != -1); + + for (i = 0; i < nelt; ++i) + sel[i] = i * 2; + perm_mask_even = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm_mask_even != NULL); + + for (i = 0; i < nelt; ++i) + sel[i] = i * 2 + 1; + perm_mask_odd = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm_mask_odd != NULL); + + for (i = 0; i < log_length; i++) + { + for (j = 0; j < length; j += 2) + { + first_vect = dr_chain[j]; + second_vect = dr_chain[j+1]; + + /* data_ref = permute_even (first_data_ref, second_data_ref); */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + first_vect, second_vect, + perm_mask_even); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[j/2] = data_ref; + + /* data_ref = permute_odd (first_data_ref, second_data_ref); */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + first_vect, second_vect, + perm_mask_odd); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[j/2+length/2] = data_ref; + } + memcpy (dr_chain.address (), result_chain->address (), + length * sizeof (tree)); } - memcpy 
(dr_chain.address (), result_chain->address (), - length * sizeof (tree)); } } +/* Function vect_shift_permute_load_chain. + + Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate + sequence of stmts to reorder the input data accordingly. + Return the final references for loads in RESULT_CHAIN. + Return true if successed, false otherwise. + + E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8. + The input is 3 vectors each containing 8 elements. We assign a + number to each element, the input sequence is: + + 1st vec: 0 1 2 3 4 5 6 7 + 2nd vec: 8 9 10 11 12 13 14 15 + 3rd vec: 16 17 18 19 20 21 22 23 + + The output sequence should be: + + 1st vec: 0 3 6 9 12 15 18 21 + 2nd vec: 1 4 7 10 13 16 19 22 + 3rd vec: 2 5 8 11 14 17 20 23 + + We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output. + + First we shuffle all 3 vectors to get correct elements order: + + 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5) + 2nd vec: ( 8 11 14) ( 9 12 15) (10 13) + 3rd vec: (16 19 22) (17 20 23) (18 21) + + Next we unite and shift vector 3 times: + + 1st step: + shift right by 6 the concatenation of: + "1st vec" and "2nd vec" + ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13) + "2nd vec" and "3rd vec" + ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21) + "3rd vec" and "1st vec" + (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5) + | New vectors | + + So that now new vectors are: + + 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15) + 2nd vec: (10 13) (16 19 22) (17 20 23) + 3rd vec: (18 21) ( 0 3 6) ( 1 4 7) + + 2nd step: + shift right by 5 the concatenation of: + "1st vec" and "3rd vec" + ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7) + "2nd vec" and "1st vec" + (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15) + "3rd vec" and "2nd vec" + (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23) + | New vectors | + + So that now new vectors are: + + 1st vec: ( 9 12 15) (18 21) ( 0 3 6) + 2nd vec: (17 20 23) ( 2 
5) ( 8 11 14) + 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY + + 3rd step: + shift right by 5 the concatenation of: + "1st vec" and "1st vec" + ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6) + shift right by 3 the concatenation of: + "2nd vec" and "2nd vec" + (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14) + | New vectors | + + So that now all vectors are READY: + 1st vec: ( 0 3 6) ( 9 12 15) (18 21) + 2nd vec: ( 2 5) ( 8 11 14) (17 20 23) + 3rd vec: ( 1 4 7) (10 13) (16 19 22) + + This algorithm is faster than one in vect_permute_load_chain if: + 1. "shift of a concatination" is faster than general permutation. + This is usually so. + 2. The TARGET machine can't execute vector instructions in parallel. + This is because each step of the algorithm depends on previous. + The algorithm in vect_permute_load_chain is much more parallel. + + The algorithm is applicable only for LOAD CHAIN LENGTH less than VF. +*/ + +static bool +vect_shift_permute_load_chain (vec<tree> dr_chain, + unsigned int length, + gimple stmt, + gimple_stmt_iterator *gsi, + vec<tree> *result_chain) +{ + tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; + tree perm2_mask1, perm2_mask2, perm3_mask; + tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; + gimple perm_stmt; + + tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); + unsigned int i; + unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype); + unsigned char *sel = XALLOCAVEC (unsigned char, nelt); + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + + result_chain->quick_grow (length); + memcpy (result_chain->address (), dr_chain.address (), + length * sizeof (tree)); + + if (length == 2 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4) + { + for (i = 0; i < nelt / 2; ++i) + sel[i] = i * 2; + for (i = 0; i < nelt / 2; ++i) + sel[nelt / 2 + i] = i * 2 + 1; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if 
(dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 2 fields structure is not \ + supported by target\n"); + return false; + } + perm2_mask1 = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm2_mask1 != NULL); + + for (i = 0; i < nelt / 2; ++i) + sel[i] = i * 2 + 1; + for (i = 0; i < nelt / 2; ++i) + sel[nelt / 2 + i] = i * 2; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 2 fields structure is not \ + supported by target\n"); + return false; + } + perm2_mask2 = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm2_mask2 != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {4 5 6 7 8 9 10 11}. */ + for (i = 0; i < nelt; i++) + sel[i] = nelt / 2 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift1_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift1_mask != NULL); + + /* Generating permutation constant to select vector from 2. + For vector length 8 it is {0 1 2 3 12 13 14 15}. 
*/ + for (i = 0; i < nelt / 2; i++) + sel[i] = i; + for (i = nelt / 2; i < nelt; i++) + sel[i] = nelt + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "select is not supported by target\n"); + return false; + } + select_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (select_mask != NULL); + + first_vect = dr_chain[0]; + second_vect = dr_chain[1]; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + first_vect, first_vect, + perm2_mask1); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[0] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + second_vect, second_vect, + perm2_mask2); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[1] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[0], vect[1], + shift1_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[1] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_select"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[0], vect[1], + select_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[0] = data_ref; + + return true; + } + if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2) + { + unsigned int k = 0, l = 0; + + /* Generating permutation constant to get all elements in rigth order. + For vector length 8 it is {0 3 6 1 4 7 2 5}. 
*/ + for (i = 0; i < nelt; i++) + { + if (3 * k + (l % 3) >= nelt) + { + k = 0; + l += (3 - (nelt % 3)); + } + sel[i] = 3 * k + (l % 3); + k++; + } + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 3 fields structure is not \ + supported by target\n"); + return false; + } + perm3_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm3_mask != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {6 7 8 9 10 11 12 13}. */ + for (i = 0; i < nelt; i++) + sel[i] = 2 * (nelt / 3) + (nelt % 3) + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift1_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift1_mask != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {5 6 7 8 9 10 11 12}. */ + for (i = 0; i < nelt; i++) + sel[i] = 2 * (nelt / 3) + 1 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift2_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift2_mask != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {3 4 5 6 7 8 9 10}. */ + for (i = 0; i < nelt; i++) + sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift3_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift3_mask != NULL); + + /* Generating permutation constant to shift all elements. 
+ For vector length 8 it is {5 6 7 8 9 10 11 12}. */ + for (i = 0; i < nelt; i++) + sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift4_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift4_mask != NULL); + + for (k = 0; k < 3; k++) + { + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + dr_chain[k], dr_chain[k], + perm3_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[k] = data_ref; + } + + for (k = 0; k < 3; k++) + { + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[k % 3], + vect[(k + 1) % 3], + shift1_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect_shift[k] = data_ref; + } + + for (k = 0; k < 3; k++) + { + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect_shift[(4 - k) % 3], + vect_shift[(3 - k) % 3], + shift2_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[k] = data_ref; + } + + (*result_chain)[3 - (nelt % 3)] = vect[2]; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[0], vect[0], + shift3_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[nelt % 3] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[1], vect[1], + shift4_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[0] = data_ref; + return true; + } + return false; +} /* Function vect_transform_grouped_load. 
@@ -5004,13 +5439,22 @@ void vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size, gimple_stmt_iterator *gsi) { + enum machine_mode mode; vec<tree> result_chain = vNULL; /* DR_CHAIN contains input data-refs that are a part of the interleaving. RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted vectors, that are ready for vector computation. */ result_chain.create (size); - vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain); + + /* If reassociation width for vector type is 2 or greater target machine can + execute 2 or more vector instructions in parallel. Otherwise try to + get chain for loads group using vect_shift_permute_load_chain. */ + mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt))); + if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1 + || !vect_shift_permute_load_chain (dr_chain, size, stmt, + gsi, &result_chain)) + vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain); vect_record_grouped_load_vectors (stmt, result_chain); result_chain.release (); } diff --git a/gcc-4.9/gcc/tree-vect-stmts.c b/gcc-4.9/gcc/tree-vect-stmts.c index 1a51d6d7b..b87c14345 100644 --- a/gcc-4.9/gcc/tree-vect-stmts.c +++ b/gcc-4.9/gcc/tree-vect-stmts.c @@ -1091,10 +1091,11 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, include the cost of the permutes. */ if (!load_lanes_p && group_size > 1) { - /* Uses an even and odd extract operations for each needed permute. */ - int nstmts = ncopies * exact_log2 (group_size) * group_size; - inside_cost += record_stmt_cost (body_cost_vec, nstmts, vec_perm, - stmt_info, 0, vect_body); + /* Uses an even and odd extract operations or shuffle operations + for each needed permute. */ + int nstmts = ncopies * ceil_log2 (group_size) * group_size; + inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm, + stmt_info, 0, vect_body); if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, |