diff options
Diffstat (limited to 'gcc-4.9/gcc')
-rw-r--r-- | gcc-4.9/gcc/cfgcleanup.c | 3 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/i386/gnu-user.h | 4 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/i386/gnu-user64.h | 5 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/i386/i386.c | 32 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/i386/i386.h | 4 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/i386/linux-common.h | 5 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/i386/predicates.md | 16 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/i386/sse.md | 29 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/i386/x86-tune.def | 9 | ||||
-rw-r--r-- | gcc-4.9/gcc/config/mips/p5600.md | 159 | ||||
-rw-r--r-- | gcc-4.9/gcc/cse.c | 2 | ||||
-rw-r--r-- | gcc-4.9/gcc/emit-rtl.c | 4 | ||||
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.dg/vect/pr52252-ld.c | 30 | ||||
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-atom.c | 29 | ||||
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-core.c | 29 | ||||
-rw-r--r-- | gcc-4.9/gcc/testsuite/gcc.target/i386/pr61403.c | 27 | ||||
-rw-r--r-- | gcc-4.9/gcc/tree-vect-data-refs.c | 524 | ||||
-rw-r--r-- | gcc-4.9/gcc/tree-vect-stmts.c | 9 |
18 files changed, 808 insertions(+), 112 deletions(-)
diff --git a/gcc-4.9/gcc/cfgcleanup.c b/gcc-4.9/gcc/cfgcleanup.c index 77196ee6b..de307da54 100644 --- a/gcc-4.9/gcc/cfgcleanup.c +++ b/gcc-4.9/gcc/cfgcleanup.c @@ -53,6 +53,7 @@ along with GCC; see the file COPYING3. If not see #include "df.h" #include "dce.h" #include "dbgcnt.h" +#include "emit-rtl.h" #define FORWARDER_BLOCK_P(BB) ((BB)->flags & BB_FORWARDER_BLOCK) @@ -882,7 +883,7 @@ merge_memattrs (rtx x, rtx y) if (GET_MODE (x) != GET_MODE (y)) return; - if (code == MEM && MEM_ATTRS (x) != MEM_ATTRS (y)) + if (code == MEM && !mem_attrs_eq_p (MEM_ATTRS (x), MEM_ATTRS (y))) { if (! MEM_ATTRS (x)) MEM_ATTRS (y) = 0; diff --git a/gcc-4.9/gcc/config/i386/gnu-user.h b/gcc-4.9/gcc/config/i386/gnu-user.h index 21b9e9692..d4a16e470 100644 --- a/gcc-4.9/gcc/config/i386/gnu-user.h +++ b/gcc-4.9/gcc/config/i386/gnu-user.h @@ -65,6 +65,10 @@ along with GCC; see the file COPYING3. If not see When the -shared link option is used a final link is not being done. */ +#undef ANDROID_TARGET_CC1_SPEC +#define ANDROID_TARGET_CC1_SPEC \ + " -mstackrealign -mssse3 -fno-short-enums " \ + #undef ASM_SPEC #define ASM_SPEC \ "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}} " \ diff --git a/gcc-4.9/gcc/config/i386/gnu-user64.h b/gcc-4.9/gcc/config/i386/gnu-user64.h index 1c72b41e4..0333b5c26 100644 --- a/gcc-4.9/gcc/config/i386/gnu-user64.h +++ b/gcc-4.9/gcc/config/i386/gnu-user64.h @@ -46,6 +46,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #define SPEC_X32 "mx32" #endif +#undef ANDROID_TARGET_CC1_SPEC +#define ANDROID_TARGET_CC1_SPEC \ + "%{m32:-mstackrealign -mssse3 -fno-short-enums}" \ + "%{!m32:-msse4.2 -mpopcnt}" + #undef ASM_SPEC #define ASM_SPEC "%{" SPEC_32 ":--32} \ %{" SPEC_64 ":--64} \ diff --git a/gcc-4.9/gcc/config/i386/i386.c b/gcc-4.9/gcc/config/i386/i386.c index 85ceff337..fcd5f0dd1 100644 --- a/gcc-4.9/gcc/config/i386/i386.c +++ b/gcc-4.9/gcc/config/i386/i386.c @@ -82,6 +82,7 @@ along with GCC; see the file COPYING3. 
If not see #include "context.h" #include "pass_manager.h" #include "target-globals.h" +#include "tree-vectorizer.h" static rtx legitimize_dllimport_symbol (rtx, bool); static rtx legitimize_pe_coff_extern_decl (rtx, bool); @@ -1739,7 +1740,7 @@ struct processor_costs slm_cost = { 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ 1, /* vec_stmt_cost. */ - 1, /* vec_to_scalar_cost. */ + 4, /* vec_to_scalar_cost. */ 1, /* scalar_to_vec_cost. */ 1, /* vec_align_load_cost. */ 2, /* vec_unalign_load_cost. */ @@ -1816,7 +1817,7 @@ struct processor_costs intel_cost = { 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ 1, /* vec_stmt_cost. */ - 1, /* vec_to_scalar_cost. */ + 4, /* vec_to_scalar_cost. */ 1, /* scalar_to_vec_cost. */ 1, /* vec_align_load_cost. */ 2, /* vec_unalign_load_cost. */ @@ -44301,7 +44302,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) gcc_unreachable (); case V8HImode: - if (TARGET_SSSE3) + if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) return expand_vec_perm_pshufb2 (d); else { @@ -44324,7 +44325,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) break; case V16QImode: - if (TARGET_SSSE3) + if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) return expand_vec_perm_pshufb2 (d); else { @@ -46496,6 +46497,16 @@ ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED, { int res = 1; + /* Vector part. */ + if (VECTOR_MODE_P (mode)) + { + if (TARGET_VECTOR_PARALLEL_EXECUTION) + return 2; + else + return 1; + } + + /* Scalar part. */ if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL) res = 2; else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL) @@ -46595,7 +46606,6 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind, { unsigned *cost = (unsigned *) data; unsigned retval = 0; - tree vectype = stmt_info ? 
stmt_vectype (stmt_info) : NULL_TREE; int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); @@ -46606,6 +46616,18 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind, count *= 50; /* FIXME. */ retval = (unsigned) (count * stmt_cost); + + /* We need to multiply all vector stmt cost by 1.7 (estimated cost) + for Silvermont as it has out of order integer pipeline and can execute + 2 scalar instruction per tick, but has in order SIMD pipeline. */ + if (TARGET_SILVERMONT || TARGET_INTEL) + if (stmt_info && stmt_info->stmt) + { + tree lhs_op = gimple_get_lhs (stmt_info->stmt); + if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE) + retval = (retval * 17) / 10; + } + cost[where] += retval; return retval; diff --git a/gcc-4.9/gcc/config/i386/i386.h b/gcc-4.9/gcc/config/i386/i386.h index b3b7c8d30..f6b169c24 100644 --- a/gcc-4.9/gcc/config/i386/i386.h +++ b/gcc-4.9/gcc/config/i386/i386.h @@ -425,6 +425,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_USE_VECTOR_FP_CONVERTS] #define TARGET_USE_VECTOR_CONVERTS \ ix86_tune_features[X86_TUNE_USE_VECTOR_CONVERTS] +#define TARGET_SLOW_PSHUFB \ + ix86_tune_features[X86_TUNE_SLOW_PSHUFB] +#define TARGET_VECTOR_PARALLEL_EXECUTION \ + ix86_tune_features[X86_TUNE_VECTOR_PARALLEL_EXECUTION] #define TARGET_FUSE_CMP_AND_BRANCH_32 \ ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32] #define TARGET_FUSE_CMP_AND_BRANCH_64 \ diff --git a/gcc-4.9/gcc/config/i386/linux-common.h b/gcc-4.9/gcc/config/i386/linux-common.h index 574f096e6..d980fb719 100644 --- a/gcc-4.9/gcc/config/i386/linux-common.h +++ b/gcc-4.9/gcc/config/i386/linux-common.h @@ -27,11 +27,6 @@ along with GCC; see the file COPYING3. 
If not see } \ while (0) -#undef ANDROID_TARGET_CC1_SPEC -#define ANDROID_TARGET_CC1_SPEC \ - "%{m32:-mstackrealign -mssse3 -fno-short-enums}" \ - "%{!m32:-msse4.2 -mpopcnt}" - #undef CC1_SPEC #define CC1_SPEC \ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ diff --git a/gcc-4.9/gcc/config/i386/predicates.md b/gcc-4.9/gcc/config/i386/predicates.md index 2ef138424..8266f3eaf 100644 --- a/gcc-4.9/gcc/config/i386/predicates.md +++ b/gcc-4.9/gcc/config/i386/predicates.md @@ -1417,6 +1417,22 @@ return true; }) +;; Return true if OP is a parallel for a palignr permute. +(define_predicate "palignr_operand" + (and (match_code "parallel") + (match_code "const_int" "a")) +{ + int elt = INTVAL (XVECEXP (op, 0, 0)); + int i, nelt = XVECLEN (op, 0); + + /* Check that an order in the permutation is suitable for palignr. + For example, {5 6 7 0 1 2 3 4} is "palignr 5, xmm, xmm". */ + for (i = 1; i < nelt; ++i) + if (INTVAL (XVECEXP (op, 0, i)) != ((elt + i) % nelt)) + return false; + return true; +}) + ;; Return true if OP is a proper third operand to vpblendw256. 
(define_predicate "avx2_pblendw_operand" (match_code "const_int") diff --git a/gcc-4.9/gcc/config/i386/sse.md b/gcc-4.9/gcc/config/i386/sse.md index 8d061da95..4aced2da9 100644 --- a/gcc-4.9/gcc/config/i386/sse.md +++ b/gcc-4.9/gcc/config/i386/sse.md @@ -14576,6 +14576,35 @@ (set_attr "prefix" "vex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "*ssse3_palignr<mode>_perm" + [(set (match_operand:V_128 0 "register_operand" "=x,x") + (vec_select:V_128 + (match_operand:V_128 1 "register_operand" "0,x") + (match_parallel 2 "palignr_operand" + [(match_operand 3 "const_int_operand" "n, n")])))] + "TARGET_SSSE3" +{ + enum machine_mode imode = GET_MODE_INNER (GET_MODE (operands[0])); + operands[2] = GEN_INT (INTVAL (operands[3]) * GET_MODE_SIZE (imode)); + + switch (which_alternative) + { + case 0: + return "palignr\t{%2, %1, %0|%0, %1, %2}"; + case 1: + return "vpalignr\t{%2, %1, %1, %0|%0, %1, %1, %2}"; + default: + gcc_unreachable (); + } +} + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sseishft") + (set_attr "atom_unit" "sishuf") + (set_attr "prefix_data16" "1,*") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "orig,vex")]) + (define_expand "avx_vinsertf128<mode>" [(match_operand:V_256 0 "register_operand") (match_operand:V_256 1 "register_operand") diff --git a/gcc-4.9/gcc/config/i386/x86-tune.def b/gcc-4.9/gcc/config/i386/x86-tune.def index c36174855..b7a703fa0 100644 --- a/gcc-4.9/gcc/config/i386/x86-tune.def +++ b/gcc-4.9/gcc/config/i386/x86-tune.def @@ -386,6 +386,15 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts", from integer to FP. */ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10) +/* X86_TUNE_SLOW_SHUFB: Indicates tunings with slow pshufb instruction. 
*/ +DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb", + m_BONNELL | m_SILVERMONT | m_INTEL) + +/* X86_TUNE_VECTOR_PARALLEL_EXECUTION: Indicates tunings with ability to + execute 2 or more vector instructions in parallel. */ +DEF_TUNE (X86_TUNE_VECTOR_PARALLEL_EXECUTION, "vec_parallel", + m_NEHALEM | m_SANDYBRIDGE | m_HASWELL) + /*****************************************************************************/ /* AVX instruction selection tuning (some of SSE flags affects AVX, too) */ /*****************************************************************************/ diff --git a/gcc-4.9/gcc/config/mips/p5600.md b/gcc-4.9/gcc/config/mips/p5600.md index 14d417fcc..d672dc401 100644 --- a/gcc-4.9/gcc/config/mips/p5600.md +++ b/gcc-4.9/gcc/config/mips/p5600.md @@ -18,14 +18,14 @@ ;; along with GCC; see the file COPYING3. If not see ;; <http://www.gnu.org/licenses/>. -(define_automaton "p5600_agen_pipe, p5600_alu_pipe, p5600_fpu_pipe") +(define_automaton "p5600_agen_alq_pipe, p5600_fpu_pipe") ;; The address generation queue (AGQ) has AL2, CTISTD and LDSTA pipes (define_cpu_unit "p5600_agq, p5600_al2, p5600_ctistd, p5600_ldsta, - p5600_gpdiv" "p5600_agen_pipe") + p5600_gpdiv" "p5600_agen_alq_pipe") ;; The arithmetic-logic-unit queue (ALQ) has ALU pipe -(define_cpu_unit "p5600_alq, p5600_alu" "p5600_alu_pipe") +(define_cpu_unit "p5600_alq, p5600_alu" "p5600_agen_alq_pipe") ;; The floating-point-unit queue (FPQ) has short and long pipes (define_cpu_unit "p5600_fpu_short, p5600_fpu_long" "p5600_fpu_pipe") @@ -53,92 +53,109 @@ ;; Arithmetic ;; add, hadd, sub, hsub, average, min, max, compare (define_insn_reservation "msa_short_int_add" 2 - (eq_attr "msa_execunit" "msa_eu_int_add") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_int_add")) "p5600_fpu_short, p5600_fpu_intadd") ;; Bitwise Instructions ;; and, or, xor, bit-clear, leading-bits-count, shift, shuffle (define_insn_reservation "msa_short_logic" 2 - (eq_attr "msa_execunit" "msa_eu_logic") + (and (eq_attr "cpu" 
"p5600") + (eq_attr "msa_execunit" "msa_eu_logic")) "p5600_fpu_short, p5600_fpu_logic_a") ;; move.v (define_insn_reservation "msa_short_logic_move_v" 2 - (and (eq_attr "type" "fmove") - (eq_attr "mode" "TI")) + (and (eq_attr "cpu" "p5600") + (and (eq_attr "type" "fmove") + (eq_attr "mode" "TI"))) "p5600_fpu_short, p5600_fpu_logic_a") ;; Float compare (define_insn_reservation "msa_short_cmp" 2 - (eq_attr "msa_execunit" "msa_eu_cmp") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_cmp")) "p5600_fpu_short, p5600_fpu_cmp") ;; Float exp2, min, max (define_insn_reservation "msa_short_float2" 2 - (eq_attr "msa_execunit" "msa_eu_float2") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_float2")) "p5600_fpu_short, p5600_fpu_float") ;; Vector sat (define_insn_reservation "msa_short_logic3" 3 - (eq_attr "msa_execunit" "msa_eu_logic3") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_logic3")) "p5600_fpu_short, p5600_fpu_logic_a, p5600_fpu_logic_b") ;; Vector copy, bz, bnz (define_insn_reservation "msa_short_store4" 4 - (eq_attr "msa_execunit" "msa_eu_store4") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_store4")) "p5600_fpu_short, p5600_fpu_store") ;; Vector load (define_insn_reservation "msa_long_load" 10 - (and (eq_attr "type" "fpload") - (eq_attr "mode" "TI")) + (and (eq_attr "cpu" "p5600") + (and (eq_attr "type" "fpload") + (eq_attr "mode" "TI"))) "p5600_fpu_long, p5600_fpu_load") ;; Vector store (define_insn_reservation "msa_short_store" 2 - (and (eq_attr "type" "fpstore") - (eq_attr "mode" "TI")) + (and (eq_attr "cpu" "p5600") + (and (eq_attr "type" "fpstore") + (eq_attr "mode" "TI"))) "p5600_fpu_short, p5600_fpu_store") ;; binsl, binsr, insert, vshf, sld (define_insn_reservation "msa_long_logic" 2 - (eq_attr "msa_execunit" "msa_eu_logic_l") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_logic_l")) "p5600_fpu_long, p5600_fpu_logic") ;; Float fclass, flog2 (define_insn_reservation 
"msa_long_float2" 2 - (eq_attr "msa_execunit" "msa_eu_float2_l") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_float2_l")) "p5600_fpu_long, p5600_fpu_float_a") ;; fadd, fsub (define_insn_reservation "msa_long_float4" 4 - (eq_attr "msa_execunit" "msa_eu_float4") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_float4")) "p5600_fpu_long, p5600_fpu_float_a, p5600_fpu_float_b") ;; fmul (define_insn_reservation "msa_long_float5" 5 - (eq_attr "msa_execunit" "msa_eu_float5") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_float5")) "p5600_fpu_long, p5600_fpu_float_a, p5600_fpu_float_b, p5600_fpu_float_c") ;; fmadd, fmsub (define_insn_reservation "msa_long_float8" 8 - (eq_attr "msa_execunit" "msa_eu_float8") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_float8")) "p5600_fpu_long, p5600_fpu_float_a, p5600_fpu_float_b, p5600_fpu_float_c, p5600_fpu_float_d") ;; Vector mul, dotp, madd, msub (define_insn_reservation "msa_long_mult" 5 - (eq_attr "msa_execunit" "msa_eu_mult") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_mult")) "p5600_fpu_long, p5600_fpu_mult") ;; fdiv, fmod (semi-pipelined) (define_insn_reservation "msa_long_fdiv" 10 - (eq_attr "msa_execunit" "msa_eu_fdiv") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_fdiv")) "p5600_fpu_long, nothing, nothing, p5600_fpu_fdiv*8") ;; div, mod (non-pipelined) (define_insn_reservation "msa_long_div" 10 - (eq_attr "msa_execunit" "msa_eu_div") + (and (eq_attr "cpu" "p5600") + (eq_attr "msa_execunit" "msa_eu_div")) "p5600_fpu_long, p5600_fpu_div*9, p5600_fpu_div + p5600_fpu_logic_a") ;; @@ -147,52 +164,62 @@ ;; fadd, fsub (define_insn_reservation "p5600_fpu_fadd" 4 - (eq_attr "type" "fadd,fabs,fneg") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "fadd,fabs,fneg")) "p5600_fpu_long, p5600_fpu_apu") ;; fabs, fneg, fcmp (define_insn_reservation "p5600_fpu_fabs" 2 - (eq_attr "type" "fabs,fneg,fcmp,fmove") + (and (eq_attr "cpu" 
"p5600") + (eq_attr "type" "fabs,fneg,fcmp,fmove")) "p5600_fpu_short, p5600_fpu_apu") ;; fload (define_insn_reservation "p5600_fpu_fload" 8 - (eq_attr "type" "fpload,fpidxload") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "fpload,fpidxload")) "p5600_fpu_long, p5600_fpu_apu") ;; fstore (define_insn_reservation "p5600_fpu_fstore" 1 - (eq_attr "type" "fpstore,fpidxstore") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "fpstore,fpidxstore")) "p5600_fpu_short, p5600_fpu_apu") ;; fmadd (define_insn_reservation "p5600_fpu_fmadd" 9 - (eq_attr "type" "fmadd") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "fmadd")) "p5600_fpu_long, p5600_fpu_apu") ;; fmul (define_insn_reservation "p5600_fpu_fmul" 5 - (eq_attr "type" "fmul") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "fmul")) "p5600_fpu_long, p5600_fpu_apu") ;; fdiv, fsqrt (define_insn_reservation "p5600_fpu_div" 17 - (eq_attr "type" "fdiv,frdiv,fsqrt,frsqrt") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "fdiv,frdiv,fsqrt,frsqrt")) "p5600_fpu_long, p5600_fpu_apu*17") ;; fcvt (define_insn_reservation "p5600_fpu_fcvt" 4 - (eq_attr "type" "fcvt") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "fcvt")) "p5600_fpu_long, p5600_fpu_apu") ;; mtc (define_insn_reservation "p5600_fpu_fmtc" 7 - (eq_attr "type" "mtc") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "mtc")) "p5600_fpu_short, p5600_fpu_store") ;; mfc (define_insn_reservation "p5600_fpu_fmfc" 4 - (eq_attr "type" "mfc") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "mfc")) "p5600_fpu_short, p5600_fpu_store") ;; madd/msub feeding into the add source @@ -205,100 +232,120 @@ ;; and (define_insn_reservation "p5600_int_and" 1 - (eq_attr "move_type" "logical") + (and (eq_attr "cpu" "p5600") + (eq_attr "move_type" "logical")) "p5600_alq_alu") ;; lui (define_insn_reservation "p5600_int_lui" 1 - (eq_attr "move_type" "const") + (and (eq_attr "cpu" "p5600") + (eq_attr "move_type" "const")) "p5600_alq_alu") ;; Load lb, lbu, lh, lhu, lq, lw, lw_i2f, lwxs 
(define_insn_reservation "p5600_int_load" 4 - (eq_attr "move_type" "load") + (and (eq_attr "cpu" "p5600") + (eq_attr "move_type" "load")) "p5600_agq_ldsta") ;; store (define_insn_reservation "p5600_int_store" 3 - (eq_attr "move_type" "store") + (and (eq_attr "cpu" "p5600") + (eq_attr "move_type" "store")) "p5600_agq_ldsta") ;; andi, sll, srl, seb, seh (define_insn_reservation "p5600_int_arith_1" 1 - (eq_attr "move_type" "andi,sll0,signext") - "p5600_agq_al2 | p5600_alq_alu") + (and (eq_attr "cpu" "p5600") + (eq_attr "move_type" "andi,sll0,signext")) + "p5600_alq_alu | p5600_agq_al2") ;; addi, addiu, ori, xori, add, addu (define_insn_reservation "p5600_int_arith_2" 1 - (eq_attr "alu_type" "add,or,xor") - "p5600_agq_al2 | p5600_alq_alu") + (and (eq_attr "cpu" "p5600") + (eq_attr "alu_type" "add,or,xor")) + "p5600_alq_alu | p5600_agq_al2") ;; nor, sub (define_insn_reservation "p5600_int_arith_3" 1 - (eq_attr "alu_type" "nor,sub") + (and (eq_attr "cpu" "p5600") + (eq_attr "alu_type" "nor,sub")) "p5600_alq_alu") ;; srl, sra, rotr, slt, sllv, srlv (define_insn_reservation "p5600_int_arith_4" 1 - (eq_attr "type" "shift,slt,move") - "p5600_agq_al2 | p5600_alq_alu") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "shift,slt,move")) + "p5600_alq_alu | p5600_agq_al2") ;; nop (define_insn_reservation "p5600_int_nop" 0 - (eq_attr "type" "nop") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "nop")) "p5600_agq_al2") ;; clo, clz (define_insn_reservation "p5600_int_countbits" 1 - (eq_attr "type" "clz") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "clz")) "p5600_agq_al2") ;; Conditional moves (define_insn_reservation "p5600_int_condmove" 1 - (eq_attr "type" "condmove") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "condmove")) "p5600_agq_al2") ;; madd, msub (define_insn_reservation "p5600_dsp_mac" 5 - (eq_attr "type" "imadd") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "imadd")) "p5600_agq_al2") ;; mfhi/lo (define_insn_reservation "p5600_dsp_mfhilo" 1 - (eq_attr 
"type" "mfhi,mflo") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "mfhi,mflo")) "p5600_agq_al2") ;; mthi/lo (define_insn_reservation "p5600_dsp_mthilo" 5 - (eq_attr "type" "mthi,mtlo") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "mthi,mtlo")) "p5600_agq_al2") ;; mult, multu, mul (define_insn_reservation "p5600_dsp_mult" 5 - (eq_attr "type" "imul3,imul") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "imul3,imul")) "p5600_agq_al2") ;; branch and jump (define_insn_reservation "p5600_int_branch" 1 - (eq_attr "type" "branch,jump") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "branch,jump")) "p5600_agq_ctistd") ;; prefetch (define_insn_reservation "p5600_int_prefetch" 3 - (eq_attr "type" "prefetch,prefetchx") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "prefetch,prefetchx")) "p5600_agq_ldsta") ;; divide (define_insn_reservation "p5600_int_div" 8 - (eq_attr "type" "idiv") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "idiv")) "p5600_agq_al2+p5600_gpdiv*8") ;; arith (define_insn_reservation "p5600_int_arith_5" 2 - (eq_attr "type" "arith") + (and (eq_attr "cpu" "p5600") + (eq_attr "type" "arith")) "p5600_agq_al2") ;; call (define_insn_reservation "p5600_int_call" 2 - (eq_attr "jal" "indirect,direct") + (and (eq_attr "cpu" "p5600") + (eq_attr "jal" "indirect,direct")) "p5600_agq_ctistd") diff --git a/gcc-4.9/gcc/cse.c b/gcc-4.9/gcc/cse.c index b8223f7a3..ec9aff419 100644 --- a/gcc-4.9/gcc/cse.c +++ b/gcc-4.9/gcc/cse.c @@ -2680,7 +2680,7 @@ exp_equiv_p (const_rtx x, const_rtx y, int validate, bool for_gcse) But because really all MEM attributes should be the same for equivalent MEMs, we just use the invariant that MEMs that have the same attributes share the same mem_attrs data structure. 
*/ - if (MEM_ATTRS (x) != MEM_ATTRS (y)) + if (!mem_attrs_eq_p (MEM_ATTRS (x), MEM_ATTRS (y))) return 0; } break; diff --git a/gcc-4.9/gcc/emit-rtl.c b/gcc-4.9/gcc/emit-rtl.c index 89b676837..3041b9e7a 100644 --- a/gcc-4.9/gcc/emit-rtl.c +++ b/gcc-4.9/gcc/emit-rtl.c @@ -248,6 +248,10 @@ const_fixed_htab_eq (const void *x, const void *y) bool mem_attrs_eq_p (const struct mem_attrs *p, const struct mem_attrs *q) { + if (p == q) + return true; + if (!p || !q) + return false; return (p->alias == q->alias && p->offset_known_p == q->offset_known_p && (!p->offset_known_p || p->offset == q->offset) diff --git a/gcc-4.9/gcc/testsuite/gcc.dg/vect/pr52252-ld.c b/gcc-4.9/gcc/testsuite/gcc.dg/vect/pr52252-ld.c new file mode 100644 index 000000000..6e3cb52b8 --- /dev/null +++ b/gcc-4.9/gcc/testsuite/gcc.dg/vect/pr52252-ld.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -g -ftree-vectorize -mssse3 -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */ + +#define byte unsigned char + +void +matrix_mul (byte *in, byte *out, int size) +{ + int i; + for (i = 0; i < size; i++) + { + byte in0 = in[0]; + byte in1 = in[1]; + byte in2 = in[2]; + byte out0, out1, out2, out3; + out0 = in0 + in1; + out1 = in0 + in2; + out2 = in1 + in2; + out3 = in0 + in1 + in2; + out[0] = out0; + out[1] = out1; + out[2] = out2; + out[3] = out3; + in += 3; + out += 4; + } +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-atom.c b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-atom.c new file mode 100644 index 000000000..715b45943 --- /dev/null +++ b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-atom.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target ssse3 } */ +/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=slm" } */ +#define byte unsigned char + +void +matrix_mul (byte *in, byte *out, int size) +{ + int i; + for 
(i = 0; i < size; i++) + { + byte in0 = in[0]; + byte in1 = in[1]; + byte in2 = in[2]; + byte out0, out1, out2, out3; + out0 = in0 + in1; + out1 = in0 + in2; + out2 = in1 + in2; + out3 = in0 + in1 + in2; + out[0] = out0; + out[1] = out1; + out[2] = out2; + out[3] = out3; + in += 3; + out += 4; + } +} + +/* { dg-final { scan-assembler "palignr" } } */ diff --git a/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-core.c b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-core.c new file mode 100644 index 000000000..ac857a5fe --- /dev/null +++ b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-core.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target ssse3 } */ +/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=corei7" } */ +#define byte unsigned char + +void +matrix_mul (byte *in, byte *out, int size) +{ + int i; + for (i = 0; i < size; i++) + { + byte in0 = in[0]; + byte in1 = in[1]; + byte in2 = in[2]; + byte out0, out1, out2, out3; + out0 = in0 + in1; + out1 = in0 + in2; + out2 = in1 + in2; + out3 = in0 + in1 + in2; + out[0] = out0; + out[1] = out1; + out[2] = out2; + out[3] = out3; + in += 3; + out += 4; + } +} + +/* { dg-final { scan-assembler "pshufb" } } */ diff --git a/gcc-4.9/gcc/testsuite/gcc.target/i386/pr61403.c b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr61403.c new file mode 100644 index 000000000..84cc5c5c8 --- /dev/null +++ b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr61403.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target sse4 } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse4.2 -mtune=corei7" } */ + +#include <math.h> + +struct XYZ +{ + float x; + float y; + float z; +}; + +void +norm (struct XYZ *in, struct XYZ *out, int size) +{ + int i; + for (i = 0; i < size; ++i) + { + float n = sqrt (in[i].x * in[i].x + in[i].y * in[i].y + in[i].z * in[i].z); + out[i].x = in[i].x / n; + out[i].y = in[i].y / n; + out[i].z = in[i].z / n; + } +} + +/* { dg-final { scan-assembler "blend" } } */ diff --git 
a/gcc-4.9/gcc/tree-vect-data-refs.c b/gcc-4.9/gcc/tree-vect-data-refs.c index 6622bd84d..ab1197ec6 100644 --- a/gcc-4.9/gcc/tree-vect-data-refs.c +++ b/gcc-4.9/gcc/tree-vect-data-refs.c @@ -4815,36 +4815,76 @@ vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count) { enum machine_mode mode = TYPE_MODE (vectype); - /* vect_permute_load_chain requires the group size to be a power of two. */ - if (exact_log2 (count) == -1) + /* vect_permute_load_chain requires the group size to be equal to 3 or + be a power of two. */ + if (count != 3 && exact_log2 (count) == -1) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "the size of the group of accesses" - " is not a power of 2\n"); + "the size of the group of accesses" + " is not a power of 2 or not equal to 3\n"); return false; } /* Check that the permutation is supported. */ if (VECTOR_MODE_P (mode)) { - unsigned int i, nelt = GET_MODE_NUNITS (mode); + unsigned int i, j, nelt = GET_MODE_NUNITS (mode); unsigned char *sel = XALLOCAVEC (unsigned char, nelt); - for (i = 0; i < nelt; i++) - sel[i] = i * 2; - if (can_vec_perm_p (mode, false, sel)) + if (count == 3) { + unsigned int k; + for (k = 0; k < 3; k++) + { + for (i = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = 3 * i + k; + else + sel[i] = 0; + if (!can_vec_perm_p (mode, false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 3 loads is not supported by" + " target\n"); + return false; + } + for (i = 0, j = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = i; + else + sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); + if (!can_vec_perm_p (mode, false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 3 loads is not supported by" + " target\n"); + return false; + } + } + return true; + } + else + { + /* If length is not equal to 3 then only power of 2 is supported. 
*/ + gcc_assert (exact_log2 (count) != -1); for (i = 0; i < nelt; i++) - sel[i] = i * 2 + 1; + sel[i] = i * 2; if (can_vec_perm_p (mode, false, sel)) - return true; - } + { + for (i = 0; i < nelt; i++) + sel[i] = i * 2 + 1; + if (can_vec_perm_p (mode, false, sel)) + return true; + } + } } if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "extract even/odd not supported by target\n"); + "extract even/odd not supported by target\n"); return false; } @@ -4862,8 +4902,9 @@ vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) /* Function vect_permute_load_chain. Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be - a power of 2, generate extract_even/odd stmts to reorder the input data - correctly. Return the final references for loads in RESULT_CHAIN. + a power of 2 or equal to 3, generate extract_even/odd stmts to reorder + the input data correctly. Return the final references for loads in + RESULT_CHAIN. E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. The input is 4 vectors each containing 8 elements. 
We assign a number to each @@ -4944,6 +4985,7 @@ vect_permute_load_chain (vec<tree> dr_chain, { tree data_ref, first_vect, second_vect; tree perm_mask_even, perm_mask_odd; + tree perm3_mask_low, perm3_mask_high; gimple perm_stmt; tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); unsigned int i, j, log_length = exact_log2 (length); @@ -4954,44 +4996,437 @@ vect_permute_load_chain (vec<tree> dr_chain, memcpy (result_chain->address (), dr_chain.address (), length * sizeof (tree)); - for (i = 0; i < nelt; ++i) - sel[i] = i * 2; - perm_mask_even = vect_gen_perm_mask (vectype, sel); - gcc_assert (perm_mask_even != NULL); - - for (i = 0; i < nelt; ++i) - sel[i] = i * 2 + 1; - perm_mask_odd = vect_gen_perm_mask (vectype, sel); - gcc_assert (perm_mask_odd != NULL); - - for (i = 0; i < log_length; i++) + if (length == 3) { - for (j = 0; j < length; j += 2) - { - first_vect = dr_chain[j]; - second_vect = dr_chain[j+1]; + unsigned int k; - /* data_ref = permute_even (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); + for (k = 0; k < 3; k++) + { + for (i = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = 3 * i + k; + else + sel[i] = 0; + perm3_mask_low = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm3_mask_low != NULL); + + for (i = 0, j = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = i; + else + sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); + + perm3_mask_high = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm3_mask_high != NULL); + + first_vect = dr_chain[0]; + second_vect = dr_chain[1]; + + /* Create interleaving stmt (low part of): + low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, + ...}> */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_low"); perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, first_vect, second_vect, - perm_mask_even); + perm3_mask_low); vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[j/2] = 
data_ref; - /* data_ref = permute_odd (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); + /* Create interleaving stmt (high part of): + high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, + ...}> */ + first_vect = data_ref; + second_vect = dr_chain[2]; + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_high"); perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, first_vect, second_vect, - perm_mask_odd); + perm3_mask_high); vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[j/2+length/2] = data_ref; + (*result_chain)[k] = data_ref; + } + } + else + { + /* If length is not equal to 3 then only power of 2 is supported. */ + gcc_assert (exact_log2 (length) != -1); + + for (i = 0; i < nelt; ++i) + sel[i] = i * 2; + perm_mask_even = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm_mask_even != NULL); + + for (i = 0; i < nelt; ++i) + sel[i] = i * 2 + 1; + perm_mask_odd = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm_mask_odd != NULL); + + for (i = 0; i < log_length; i++) + { + for (j = 0; j < length; j += 2) + { + first_vect = dr_chain[j]; + second_vect = dr_chain[j+1]; + + /* data_ref = permute_even (first_data_ref, second_data_ref); */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + first_vect, second_vect, + perm_mask_even); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[j/2] = data_ref; + + /* data_ref = permute_odd (first_data_ref, second_data_ref); */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + first_vect, second_vect, + perm_mask_odd); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[j/2+length/2] = data_ref; + } + memcpy (dr_chain.address (), result_chain->address (), + length * sizeof (tree)); } - memcpy 
(dr_chain.address (), result_chain->address (), - length * sizeof (tree)); } } +/* Function vect_shift_permute_load_chain. + + Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate + sequence of stmts to reorder the input data accordingly. + Return the final references for loads in RESULT_CHAIN. + Return true if successed, false otherwise. + + E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8. + The input is 3 vectors each containing 8 elements. We assign a + number to each element, the input sequence is: + + 1st vec: 0 1 2 3 4 5 6 7 + 2nd vec: 8 9 10 11 12 13 14 15 + 3rd vec: 16 17 18 19 20 21 22 23 + + The output sequence should be: + + 1st vec: 0 3 6 9 12 15 18 21 + 2nd vec: 1 4 7 10 13 16 19 22 + 3rd vec: 2 5 8 11 14 17 20 23 + + We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output. + + First we shuffle all 3 vectors to get correct elements order: + + 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5) + 2nd vec: ( 8 11 14) ( 9 12 15) (10 13) + 3rd vec: (16 19 22) (17 20 23) (18 21) + + Next we unite and shift vector 3 times: + + 1st step: + shift right by 6 the concatenation of: + "1st vec" and "2nd vec" + ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13) + "2nd vec" and "3rd vec" + ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21) + "3rd vec" and "1st vec" + (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5) + | New vectors | + + So that now new vectors are: + + 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15) + 2nd vec: (10 13) (16 19 22) (17 20 23) + 3rd vec: (18 21) ( 0 3 6) ( 1 4 7) + + 2nd step: + shift right by 5 the concatenation of: + "1st vec" and "3rd vec" + ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7) + "2nd vec" and "1st vec" + (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15) + "3rd vec" and "2nd vec" + (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23) + | New vectors | + + So that now new vectors are: + + 1st vec: ( 9 12 15) (18 21) ( 0 3 6) + 2nd vec: (17 20 23) ( 2 
5) ( 8 11 14) + 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY + + 3rd step: + shift right by 5 the concatenation of: + "1st vec" and "1st vec" + ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6) + shift right by 3 the concatenation of: + "2nd vec" and "2nd vec" + (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14) + | New vectors | + + So that now all vectors are READY: + 1st vec: ( 0 3 6) ( 9 12 15) (18 21) + 2nd vec: ( 2 5) ( 8 11 14) (17 20 23) + 3rd vec: ( 1 4 7) (10 13) (16 19 22) + + This algorithm is faster than one in vect_permute_load_chain if: + 1. "shift of a concatination" is faster than general permutation. + This is usually so. + 2. The TARGET machine can't execute vector instructions in parallel. + This is because each step of the algorithm depends on previous. + The algorithm in vect_permute_load_chain is much more parallel. + + The algorithm is applicable only for LOAD CHAIN LENGTH less than VF. +*/ + +static bool +vect_shift_permute_load_chain (vec<tree> dr_chain, + unsigned int length, + gimple stmt, + gimple_stmt_iterator *gsi, + vec<tree> *result_chain) +{ + tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; + tree perm2_mask1, perm2_mask2, perm3_mask; + tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; + gimple perm_stmt; + + tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); + unsigned int i; + unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype); + unsigned char *sel = XALLOCAVEC (unsigned char, nelt); + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + + result_chain->quick_grow (length); + memcpy (result_chain->address (), dr_chain.address (), + length * sizeof (tree)); + + if (length == 2 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4) + { + for (i = 0; i < nelt / 2; ++i) + sel[i] = i * 2; + for (i = 0; i < nelt / 2; ++i) + sel[nelt / 2 + i] = i * 2 + 1; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if 
(dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 2 fields structure is not \ + supported by target\n"); + return false; + } + perm2_mask1 = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm2_mask1 != NULL); + + for (i = 0; i < nelt / 2; ++i) + sel[i] = i * 2 + 1; + for (i = 0; i < nelt / 2; ++i) + sel[nelt / 2 + i] = i * 2; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 2 fields structure is not \ + supported by target\n"); + return false; + } + perm2_mask2 = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm2_mask2 != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {4 5 6 7 8 9 10 11}. */ + for (i = 0; i < nelt; i++) + sel[i] = nelt / 2 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift1_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift1_mask != NULL); + + /* Generating permutation constant to select vector from 2. + For vector length 8 it is {0 1 2 3 12 13 14 15}. 
*/ + for (i = 0; i < nelt / 2; i++) + sel[i] = i; + for (i = nelt / 2; i < nelt; i++) + sel[i] = nelt + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "select is not supported by target\n"); + return false; + } + select_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (select_mask != NULL); + + first_vect = dr_chain[0]; + second_vect = dr_chain[1]; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + first_vect, first_vect, + perm2_mask1); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[0] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + second_vect, second_vect, + perm2_mask2); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[1] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[0], vect[1], + shift1_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[1] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_select"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[0], vect[1], + select_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[0] = data_ref; + + return true; + } + if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2) + { + unsigned int k = 0, l = 0; + + /* Generating permutation constant to get all elements in rigth order. + For vector length 8 it is {0 3 6 1 4 7 2 5}. 
*/ + for (i = 0; i < nelt; i++) + { + if (3 * k + (l % 3) >= nelt) + { + k = 0; + l += (3 - (nelt % 3)); + } + sel[i] = 3 * k + (l % 3); + k++; + } + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 3 fields structure is not \ + supported by target\n"); + return false; + } + perm3_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm3_mask != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {6 7 8 9 10 11 12 13}. */ + for (i = 0; i < nelt; i++) + sel[i] = 2 * (nelt / 3) + (nelt % 3) + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift1_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift1_mask != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {5 6 7 8 9 10 11 12}. */ + for (i = 0; i < nelt; i++) + sel[i] = 2 * (nelt / 3) + 1 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift2_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift2_mask != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {3 4 5 6 7 8 9 10}. */ + for (i = 0; i < nelt; i++) + sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift3_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift3_mask != NULL); + + /* Generating permutation constant to shift all elements. 
+ For vector length 8 it is {5 6 7 8 9 10 11 12}. */ + for (i = 0; i < nelt; i++) + sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift4_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift4_mask != NULL); + + for (k = 0; k < 3; k++) + { + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + dr_chain[k], dr_chain[k], + perm3_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[k] = data_ref; + } + + for (k = 0; k < 3; k++) + { + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[k % 3], + vect[(k + 1) % 3], + shift1_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect_shift[k] = data_ref; + } + + for (k = 0; k < 3; k++) + { + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect_shift[(4 - k) % 3], + vect_shift[(3 - k) % 3], + shift2_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[k] = data_ref; + } + + (*result_chain)[3 - (nelt % 3)] = vect[2]; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[0], vect[0], + shift3_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[nelt % 3] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[1], vect[1], + shift4_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[0] = data_ref; + return true; + } + return false; +} /* Function vect_transform_grouped_load. 
@@ -5004,13 +5439,22 @@ void vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size, gimple_stmt_iterator *gsi) { + enum machine_mode mode; vec<tree> result_chain = vNULL; /* DR_CHAIN contains input data-refs that are a part of the interleaving. RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted vectors, that are ready for vector computation. */ result_chain.create (size); - vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain); + + /* If reassociation width for vector type is 2 or greater target machine can + execute 2 or more vector instructions in parallel. Otherwise try to + get chain for loads group using vect_shift_permute_load_chain. */ + mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt))); + if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1 + || !vect_shift_permute_load_chain (dr_chain, size, stmt, + gsi, &result_chain)) + vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain); vect_record_grouped_load_vectors (stmt, result_chain); result_chain.release (); } diff --git a/gcc-4.9/gcc/tree-vect-stmts.c b/gcc-4.9/gcc/tree-vect-stmts.c index 1a51d6d7b..b87c14345 100644 --- a/gcc-4.9/gcc/tree-vect-stmts.c +++ b/gcc-4.9/gcc/tree-vect-stmts.c @@ -1091,10 +1091,11 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, include the cost of the permutes. */ if (!load_lanes_p && group_size > 1) { - /* Uses an even and odd extract operations for each needed permute. */ - int nstmts = ncopies * exact_log2 (group_size) * group_size; - inside_cost += record_stmt_cost (body_cost_vec, nstmts, vec_perm, - stmt_info, 0, vect_body); + /* Uses an even and odd extract operations or shuffle operations + for each needed permute. */ + int nstmts = ncopies * ceil_log2 (group_size) * group_size; + inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm, + stmt_info, 0, vect_body); if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, |