[4.8, 4.9] Backport of additional SLM tuning.

Six patches from trunk, reg-tested via 'make check': 2014-05-07 Evgeny Stupachenko <evstupac@gmail.com> * tree-vect-data-refs.c (vect_grouped_load_supported): New check for loads group of length 3. (vect_permute_load_chain): New permutations for loads group of length 3. * tree-vect-stmts.c (vect_model_load_cost): Change cost of vec_perm_shuffle for the new permutations. 2014-04-17 Evgeny Stupachenko <evstupac@gmail.com> * config/i386/i386.c (x86_add_stmt_cost): Fix vector cost model for Silvermont. 2014-04-17 Evgeny Stupachenko <evstupac@gmail.com> * config/i386/x86-tune.def (TARGET_SLOW_PSHUFB): New tune definition. * config/i386/i386.h (TARGET_SLOW_PSHUFB): New tune flag. * config/i386/i386.c (expand_vec_perm_even_odd_1): Avoid byte shuffles for TARGET_SLOW_PSHUFB 2014-04-17 Evgeny Stupachenko <evstupac@gmail.com> * config/i386/i386.c (slm_cost): Adjust vec_to_scalar_cost. * config/i386/i386.c (intel_cost): Ditto. 2014-06-18 Evgeny Stupachenko <evstupac@gmail.com> * config/i386/i386.c (ix86_reassociation_width): Add alternative for vector case. * config/i386/i386.h (TARGET_VECTOR_PARALLEL_EXECUTION): New. * config/i386/x86-tune.def (X86_TUNE_VECTOR_PARALLEL_EXECUTION): New. * tree-vect-data-refs.c (vect_shift_permute_load_chain): New. Introduces alternative way of loads group permutaions. (vect_transform_grouped_load): Try alternative way of permutations. 2014-06-05 Evgeny Stupachenko <evstupac@gmail.com> * config/i386/sse.md (*ssse3_palignr<mode>_perm): New. * config/i386/predicates.md (palignr_operand): New. Indicates if permutation is suitable for palignr instruction. Change-Id: I5e505735ce3dc0ec3c2a1151713a119b24d712fe Signed-off-by: Alexander Ivchenko <alexander.ivchenko@intel.com>
author: Alexander Ivchenko <alexander.ivchenko@intel.com> 2014-07-11 15:24:10 +0400
committer: Alexander Ivchenko <alexander.ivchenko@intel.com> 2014-08-06 16:24:16 +0400
commit: 55f9fbb03d0413cb8fe74e5ec5d6c2dd4280933e (patch)
tree: a276531909449c8ed589df86ad3cfdd3048b7400 /gcc-4.9/gcc/config
parent: 38a8aecfb882072900434499696b5c32a2274515 (diff)
download: toolchain_gcc-55f9fbb03d0413cb8fe74e5ec5d6c2dd4280933e.tar.gz
toolchain_gcc-55f9fbb03d0413cb8fe74e5ec5d6c2dd4280933e.tar.bz2
toolchain_gcc-55f9fbb03d0413cb8fe74e5ec5d6c2dd4280933e.zip
5 files changed, 85 insertions, 5 deletions
diff --git a/gcc-4.9/gcc/config/i386/i386.c b/gcc-4.9/gcc/config/i386/i386.c
index df504335e..4016b6052 100644
--- a/gcc-4.9/gcc/config/i386/i386.c
+++ b/gcc-4.9/gcc/config/i386/i386.c
@@ -82,6 +82,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "context.h"
 #include "pass_manager.h"
 #include "target-globals.h"
+#include "tree-vectorizer.h"
 
 static rtx legitimize_dllimport_symbol (rtx, bool);
 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
@@ -1739,7 +1740,7 @@ struct processor_costs slm_cost = {
   1,					/* scalar load_cost.  */
   1,					/* scalar_store_cost.  */
   1,					/* vec_stmt_cost.  */
-  1,					/* vec_to_scalar_cost.  */
+  4,					/* vec_to_scalar_cost.  */
   1,					/* scalar_to_vec_cost.  */
   1,					/* vec_align_load_cost.  */
   2,					/* vec_unalign_load_cost.  */
@@ -1816,7 +1817,7 @@ struct processor_costs intel_cost = {
   1,					/* scalar load_cost.  */
   1,					/* scalar_store_cost.  */
   1,					/* vec_stmt_cost.  */
-  1,					/* vec_to_scalar_cost.  */
+  4,					/* vec_to_scalar_cost.  */
   1,					/* scalar_to_vec_cost.  */
   1,					/* vec_align_load_cost.  */
   2,					/* vec_unalign_load_cost.  */
@@ -44300,7 +44301,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
       gcc_unreachable ();
 
     case V8HImode:
-      if (TARGET_SSSE3)
+      if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
 	return expand_vec_perm_pshufb2 (d);
       else
 	{
@@ -44323,7 +44324,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
       break;
 
     case V16QImode:
-      if (TARGET_SSSE3)
+      if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
 	return expand_vec_perm_pshufb2 (d);
       else
 	{
@@ -46493,6 +46494,16 @@ ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
 {
   int res = 1;
 
+  /* Vector part.  */
+  if (VECTOR_MODE_P (mode))
+    {
+      if (TARGET_VECTOR_PARALLEL_EXECUTION)
+	return 2;
+      else
+	return 1;
+    }
+
+  /* Scalar part.  */
   if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
     res = 2;
   else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
@@ -46592,7 +46603,6 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
 {
   unsigned *cost = (unsigned *) data;
   unsigned retval = 0;
-
   tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
   int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
 
@@ -46603,6 +46613,18 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
     count *= 50;  /* FIXME.  */
 
   retval = (unsigned) (count * stmt_cost);
+
+  /* We need to multiply all vector stmt cost by 1.7 (estimated cost)
+     for Silvermont as it has out of order integer pipeline and can execute
+     2 scalar instruction per tick, but has in order SIMD pipeline.  */
+  if (TARGET_SILVERMONT || TARGET_INTEL)
+    if (stmt_info && stmt_info->stmt)
+      {
+	tree lhs_op = gimple_get_lhs (stmt_info->stmt);
+	if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
+	  retval = (retval * 17) / 10;
+      }
+
   cost[where] += retval;
 
   return retval;
diff --git a/gcc-4.9/gcc/config/i386/i386.h b/gcc-4.9/gcc/config/i386/i386.h
index 51659deb3..fb527411a 100644
--- a/gcc-4.9/gcc/config/i386/i386.h
+++ b/gcc-4.9/gcc/config/i386/i386.h
@@ -425,6 +425,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 	ix86_tune_features[X86_TUNE_USE_VECTOR_FP_CONVERTS]
 #define TARGET_USE_VECTOR_CONVERTS \
 	ix86_tune_features[X86_TUNE_USE_VECTOR_CONVERTS]
+#define TARGET_SLOW_PSHUFB \
+	ix86_tune_features[X86_TUNE_SLOW_PSHUFB]
+#define TARGET_VECTOR_PARALLEL_EXECUTION \
+	ix86_tune_features[X86_TUNE_VECTOR_PARALLEL_EXECUTION]
 #define TARGET_FUSE_CMP_AND_BRANCH_32 \
 	ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32]
 #define TARGET_FUSE_CMP_AND_BRANCH_64 \
diff --git a/gcc-4.9/gcc/config/i386/predicates.md b/gcc-4.9/gcc/config/i386/predicates.md
index 2ef138424..8266f3eaf 100644
--- a/gcc-4.9/gcc/config/i386/predicates.md
+++ b/gcc-4.9/gcc/config/i386/predicates.md
@@ -1417,6 +1417,22 @@
   return true;
 })
 
+;; Return true if OP is a parallel for a palignr permute.
+(define_predicate "palignr_operand"
+  (and (match_code "parallel")
+       (match_code "const_int" "a"))
+{
+  int elt = INTVAL (XVECEXP (op, 0, 0));
+  int i, nelt = XVECLEN (op, 0);
+
+  /* Check that an order in the permutation is suitable for palignr.
+     For example, {5 6 7 0 1 2 3 4} is "palignr 5, xmm, xmm".  */
+  for (i = 1; i < nelt; ++i)
+    if (INTVAL (XVECEXP (op, 0, i)) != ((elt + i) % nelt))
+      return false;
+  return true;
+})
+
 ;; Return true if OP is a proper third operand to vpblendw256.
 (define_predicate "avx2_pblendw_operand"
   (match_code "const_int")
diff --git a/gcc-4.9/gcc/config/i386/sse.md b/gcc-4.9/gcc/config/i386/sse.md
index 27ade1964..26332d2b5 100644
--- a/gcc-4.9/gcc/config/i386/sse.md
+++ b/gcc-4.9/gcc/config/i386/sse.md
@@ -14574,6 +14574,35 @@
    (set_attr "prefix" "vex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn "*ssse3_palignr<mode>_perm"
+  [(set (match_operand:V_128 0 "register_operand" "=x,x")
+      (vec_select:V_128
+	(match_operand:V_128 1 "register_operand" "0,x")
+	(match_parallel 2 "palignr_operand"
+	  [(match_operand 3 "const_int_operand" "n, n")])))]
+  "TARGET_SSSE3"
+{
+  enum machine_mode imode = GET_MODE_INNER (GET_MODE (operands[0]));
+  operands[2] = GEN_INT (INTVAL (operands[3]) * GET_MODE_SIZE (imode));
+
+  switch (which_alternative)
+    {
+    case 0:
+      return "palignr\t{%2, %1, %0|%0, %1, %2}";
+    case 1:
+      return "vpalignr\t{%2, %1, %1, %0|%0, %1, %1, %2}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseishft")
+   (set_attr "atom_unit" "sishuf")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "orig,vex")])
+
 (define_expand "avx_vinsertf128<mode>"
   [(match_operand:V_256 0 "register_operand")
    (match_operand:V_256 1 "register_operand")
diff --git a/gcc-4.9/gcc/config/i386/x86-tune.def b/gcc-4.9/gcc/config/i386/x86-tune.def
index 839910267..cb44dc312 100644
--- a/gcc-4.9/gcc/config/i386/x86-tune.def
+++ b/gcc-4.9/gcc/config/i386/x86-tune.def
@@ -386,6 +386,15 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
    from integer to FP. */
 DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
 
+/* X86_TUNE_SLOW_SHUFB: Indicates tunings with slow pshufb instruction.  */
+DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
+          m_BONNELL | m_SILVERMONT | m_INTEL)
+
+/* X86_TUNE_VECTOR_PARALLEL_EXECUTION: Indicates tunings with ability to
+   execute 2 or more vector instructions in parallel.  */
+DEF_TUNE (X86_TUNE_VECTOR_PARALLEL_EXECUTION, "vec_parallel",
+          m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
+
 /*****************************************************************************/
 /* AVX instruction selection tuning (some of SSE flags affects AVX, too)     */
 /*****************************************************************************/
author	Alexander Ivchenko <alexander.ivchenko@intel.com>	2014-07-11 15:24:10 +0400
committer	Alexander Ivchenko <alexander.ivchenko@intel.com>	2014-08-06 16:24:16 +0400
commit	55f9fbb03d0413cb8fe74e5ec5d6c2dd4280933e (patch)
tree	a276531909449c8ed589df86ad3cfdd3048b7400 /gcc-4.9/gcc/config
parent	38a8aecfb882072900434499696b5c32a2274515 (diff)
download	toolchain_gcc-55f9fbb03d0413cb8fe74e5ec5d6c2dd4280933e.tar.gz toolchain_gcc-55f9fbb03d0413cb8fe74e5ec5d6c2dd4280933e.tar.bz2 toolchain_gcc-55f9fbb03d0413cb8fe74e5ec5d6c2dd4280933e.zip