diff options
author | Andrew Hsieh <andrewhsieh@google.com> | 2014-09-03 15:53:59 +0000 |
---|---|---|
committer | Gerrit Code Review <noreply-gerritcodereview@google.com> | 2014-09-03 15:54:00 +0000 |
commit | 54737a0f41ce0bf1e8f7f8d6d8e3170ef51e07c2 (patch) | |
tree | afd56565c7170158a14ce5c2fa2a1c8752cf5cb2 /gcc-4.9/gcc/config | |
parent | ecf187208f8ec68c49823080179fa1b5f448a35d (diff) | |
parent | 55f9fbb03d0413cb8fe74e5ec5d6c2dd4280933e (diff) | |
download | toolchain_gcc-54737a0f41ce0bf1e8f7f8d6d8e3170ef51e07c2.tar.gz toolchain_gcc-54737a0f41ce0bf1e8f7f8d6d8e3170ef51e07c2.tar.bz2 toolchain_gcc-54737a0f41ce0bf1e8f7f8d6d8e3170ef51e07c2.zip |
Merge "[4.8, 4.9] Backport of additional SLM tuning."
Diffstat (limited to 'gcc-4.9/gcc/config')
-rw-r--r-- | gcc-4.9/gcc/config/i386/i386.c | 32 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/i386/i386.h | 4 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/i386/predicates.md | 16 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/i386/sse.md | 29 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/i386/x86-tune.def | 9 |
5 files changed, 85 insertions, 5 deletions
diff --git a/gcc-4.9/gcc/config/i386/i386.c b/gcc-4.9/gcc/config/i386/i386.c index 70f18ad7b..8a1fbde2c 100644 --- a/gcc-4.9/gcc/config/i386/i386.c +++ b/gcc-4.9/gcc/config/i386/i386.c @@ -82,6 +82,7 @@ along with GCC; see the file COPYING3. If not see #include "context.h" #include "pass_manager.h" #include "target-globals.h" +#include "tree-vectorizer.h" static rtx legitimize_dllimport_symbol (rtx, bool); static rtx legitimize_pe_coff_extern_decl (rtx, bool); @@ -1739,7 +1740,7 @@ struct processor_costs slm_cost = { 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ 1, /* vec_stmt_cost. */ - 1, /* vec_to_scalar_cost. */ + 4, /* vec_to_scalar_cost. */ 1, /* scalar_to_vec_cost. */ 1, /* vec_align_load_cost. */ 2, /* vec_unalign_load_cost. */ @@ -1816,7 +1817,7 @@ struct processor_costs intel_cost = { 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ 1, /* vec_stmt_cost. */ - 1, /* vec_to_scalar_cost. */ + 4, /* vec_to_scalar_cost. */ 1, /* scalar_to_vec_cost. */ 1, /* vec_align_load_cost. */ 2, /* vec_unalign_load_cost. */ @@ -44300,7 +44301,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) gcc_unreachable (); case V8HImode: - if (TARGET_SSSE3) + if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) return expand_vec_perm_pshufb2 (d); else { @@ -44323,7 +44324,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) break; case V16QImode: - if (TARGET_SSSE3) + if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) return expand_vec_perm_pshufb2 (d); else { @@ -46493,6 +46494,16 @@ ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED, { int res = 1; + /* Vector part. */ + if (VECTOR_MODE_P (mode)) + { + if (TARGET_VECTOR_PARALLEL_EXECUTION) + return 2; + else + return 1; + } + + /* Scalar part. */ if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL) res = 2; else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL) @@ -46592,7 +46603,6 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind, { unsigned *cost = (unsigned *) data; unsigned retval = 0; - tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE; int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); @@ -46603,6 +46613,18 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind, count *= 50; /* FIXME. */ retval = (unsigned) (count * stmt_cost); + + /* We need to multiply all vector stmt cost by 1.7 (estimated cost) + for Silvermont as it has out of order integer pipeline and can execute + 2 scalar instruction per tick, but has in order SIMD pipeline. */ + if (TARGET_SILVERMONT || TARGET_INTEL) + if (stmt_info && stmt_info->stmt) + { + tree lhs_op = gimple_get_lhs (stmt_info->stmt); + if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE) + retval = (retval * 17) / 10; + } + cost[where] += retval; return retval; diff --git a/gcc-4.9/gcc/config/i386/i386.h b/gcc-4.9/gcc/config/i386/i386.h index 51659deb3..fb527411a 100644 --- a/gcc-4.9/gcc/config/i386/i386.h +++ b/gcc-4.9/gcc/config/i386/i386.h @@ -425,6 +425,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_USE_VECTOR_FP_CONVERTS] #define TARGET_USE_VECTOR_CONVERTS \ ix86_tune_features[X86_TUNE_USE_VECTOR_CONVERTS] +#define TARGET_SLOW_PSHUFB \ + ix86_tune_features[X86_TUNE_SLOW_PSHUFB] +#define TARGET_VECTOR_PARALLEL_EXECUTION \ + ix86_tune_features[X86_TUNE_VECTOR_PARALLEL_EXECUTION] #define TARGET_FUSE_CMP_AND_BRANCH_32 \ ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32] #define TARGET_FUSE_CMP_AND_BRANCH_64 \ diff --git a/gcc-4.9/gcc/config/i386/predicates.md b/gcc-4.9/gcc/config/i386/predicates.md index 2ef138424..8266f3eaf 100644 --- a/gcc-4.9/gcc/config/i386/predicates.md +++ b/gcc-4.9/gcc/config/i386/predicates.md @@ -1417,6 +1417,22 @@ return true; }) +;; Return true if OP is a parallel for a palignr permute. +(define_predicate "palignr_operand" + (and (match_code "parallel") + (match_code "const_int" "a")) +{ + int elt = INTVAL (XVECEXP (op, 0, 0)); + int i, nelt = XVECLEN (op, 0); + + /* Check that an order in the permutation is suitable for palignr. + For example, {5 6 7 0 1 2 3 4} is "palignr 5, xmm, xmm". */ + for (i = 1; i < nelt; ++i) + if (INTVAL (XVECEXP (op, 0, i)) != ((elt + i) % nelt)) + return false; + return true; +}) + ;; Return true if OP is a proper third operand to vpblendw256. (define_predicate "avx2_pblendw_operand" (match_code "const_int") diff --git a/gcc-4.9/gcc/config/i386/sse.md b/gcc-4.9/gcc/config/i386/sse.md index b60a8226d..57e2daa22 100644 --- a/gcc-4.9/gcc/config/i386/sse.md +++ b/gcc-4.9/gcc/config/i386/sse.md @@ -14576,6 +14576,35 @@ (set_attr "prefix" "vex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "*ssse3_palignr<mode>_perm" + [(set (match_operand:V_128 0 "register_operand" "=x,x") + (vec_select:V_128 + (match_operand:V_128 1 "register_operand" "0,x") + (match_parallel 2 "palignr_operand" + [(match_operand 3 "const_int_operand" "n, n")])))] + "TARGET_SSSE3" +{ + enum machine_mode imode = GET_MODE_INNER (GET_MODE (operands[0])); + operands[2] = GEN_INT (INTVAL (operands[3]) * GET_MODE_SIZE (imode)); + + switch (which_alternative) + { + case 0: + return "palignr\t{%2, %1, %0|%0, %1, %2}"; + case 1: + return "vpalignr\t{%2, %1, %1, %0|%0, %1, %1, %2}"; + default: + gcc_unreachable (); + } +} + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sseishft") + (set_attr "atom_unit" "sishuf") + (set_attr "prefix_data16" "1,*") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "orig,vex")]) + (define_expand "avx_vinsertf128<mode>" [(match_operand:V_256 0 "register_operand") (match_operand:V_256 1 "register_operand") diff --git a/gcc-4.9/gcc/config/i386/x86-tune.def b/gcc-4.9/gcc/config/i386/x86-tune.def index 839910267..cb44dc312 100644 --- a/gcc-4.9/gcc/config/i386/x86-tune.def +++ b/gcc-4.9/gcc/config/i386/x86-tune.def @@ -386,6 +386,15 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts", from integer to FP. */ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10) +/* X86_TUNE_SLOW_SHUFB: Indicates tunings with slow pshufb instruction. */ +DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb", + m_BONNELL | m_SILVERMONT | m_INTEL) + +/* X86_TUNE_VECTOR_PARALLEL_EXECUTION: Indicates tunings with ability to + execute 2 or more vector instructions in parallel. */ +DEF_TUNE (X86_TUNE_VECTOR_PARALLEL_EXECUTION, "vec_parallel", + m_NEHALEM | m_SANDYBRIDGE | m_HASWELL) + /*****************************************************************************/ /* AVX instruction selection tuning (some of SSE flags affects AVX, too) */ /*****************************************************************************/ |