aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--gcc-4.8/gcc/config/i386/i386.c14
-rw-r--r--gcc-4.8/gcc/config/i386/i386.h3
-rw-r--r--gcc-4.8/gcc/config/i386/predicates.md16
-rw-r--r--gcc-4.8/gcc/config/i386/sse.md29
-rw-r--r--gcc-4.8/gcc/tree-vect-data-refs.c527
-rw-r--r--gcc-4.8/gcc/tree-vect-stmts.c9
-rw-r--r--gcc-4.9/gcc/config/i386/i386.c32
-rw-r--r--gcc-4.9/gcc/config/i386/i386.h4
-rw-r--r--gcc-4.9/gcc/config/i386/predicates.md16
-rw-r--r--gcc-4.9/gcc/config/i386/sse.md29
-rw-r--r--gcc-4.9/gcc/config/i386/x86-tune.def9
-rw-r--r--gcc-4.9/gcc/testsuite/gcc.dg/vect/pr52252-ld.c30
-rw-r--r--gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-atom.c29
-rw-r--r--gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-core.c29
-rw-r--r--gcc-4.9/gcc/testsuite/gcc.target/i386/pr61403.c27
-rw-r--r--gcc-4.9/gcc/tree-vect-data-refs.c524
-rw-r--r--gcc-4.9/gcc/tree-vect-stmts.c9
17 files changed, 1240 insertions, 96 deletions
diff --git a/gcc-4.8/gcc/config/i386/i386.c b/gcc-4.8/gcc/config/i386/i386.c
index c958228..e54013a 100644
--- a/gcc-4.8/gcc/config/i386/i386.c
+++ b/gcc-4.8/gcc/config/i386/i386.c
@@ -2097,6 +2097,10 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
during reassociation of fp computation. */
m_ATOM | m_SLM | m_HASWELL,
+ /* X86_TUNE_VECTOR_PARALLEL_EXECUTION: Indicates tunings with ability to
+ execute 2 or more vector instructions in parallel. */
+ m_COREI7 | m_HASWELL,
+
/* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
regs instead of memory. */
m_CORE_ALL,
@@ -42496,6 +42500,16 @@ ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
{
int res = 1;
+ /* Vector part. */
+ if (VECTOR_MODE_P (mode))
+ {
+ if (TARGET_VECTOR_PARALLEL_EXECUTION)
+ return 2;
+ else
+ return 1;
+ }
+
+ /* Scalar part. */
if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
res = 2;
else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
diff --git a/gcc-4.8/gcc/config/i386/i386.h b/gcc-4.8/gcc/config/i386/i386.h
index ac28ec1..1552779 100644
--- a/gcc-4.8/gcc/config/i386/i386.h
+++ b/gcc-4.8/gcc/config/i386/i386.h
@@ -330,6 +330,7 @@ enum ix86_tune_indices {
X86_TUNE_AVX128_OPTIMAL,
X86_TUNE_REASSOC_INT_TO_PARALLEL,
X86_TUNE_REASSOC_FP_TO_PARALLEL,
+ X86_TUNE_VECTOR_PARALLEL_EXECUTION,
X86_TUNE_GENERAL_REGS_SSE_SPILL,
X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE,
X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS,
@@ -436,6 +437,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_REASSOC_INT_TO_PARALLEL]
#define TARGET_REASSOC_FP_TO_PARALLEL \
ix86_tune_features[X86_TUNE_REASSOC_FP_TO_PARALLEL]
+#define TARGET_VECTOR_PARALLEL_EXECUTION \
+ ix86_tune_features[X86_TUNE_VECTOR_PARALLEL_EXECUTION]
#define TARGET_GENERAL_REGS_SSE_SPILL \
ix86_tune_features[X86_TUNE_GENERAL_REGS_SSE_SPILL]
#define TARGET_AVOID_MEM_OPND_FOR_CMOVE \
diff --git a/gcc-4.8/gcc/config/i386/predicates.md b/gcc-4.8/gcc/config/i386/predicates.md
index 0e52c85..5575239 100644
--- a/gcc-4.8/gcc/config/i386/predicates.md
+++ b/gcc-4.8/gcc/config/i386/predicates.md
@@ -1259,6 +1259,22 @@
return true;
})
+;; Return true if OP is a parallel for a palignr permute.
+(define_predicate "palignr_operand"
+ (and (match_code "parallel")
+ (match_code "const_int" "a"))
+{
+ int elt = INTVAL (XVECEXP (op, 0, 0));
+ int i, nelt = XVECLEN (op, 0);
+
+ /* Check that an order in the permutation is suitable for palignr.
+ For example, {5 6 7 0 1 2 3 4} is "palignr 5, xmm, xmm". */
+ for (i = 1; i < nelt; ++i)
+ if (INTVAL (XVECEXP (op, 0, i)) != ((elt + i) % nelt))
+ return false;
+ return true;
+})
+
;; Return true if OP is a proper third operand to vpblendw256.
(define_predicate "avx2_pblendw_operand"
(match_code "const_int")
diff --git a/gcc-4.8/gcc/config/i386/sse.md b/gcc-4.8/gcc/config/i386/sse.md
index 8b44e1d..7ff8f1e 100644
--- a/gcc-4.8/gcc/config/i386/sse.md
+++ b/gcc-4.8/gcc/config/i386/sse.md
@@ -10901,6 +10901,35 @@
(set_attr "prefix" "vex")
(set_attr "mode" "<sseinsnmode>")])
+(define_insn "*ssse3_palignr<mode>_perm"
+ [(set (match_operand:V_128 0 "register_operand" "=x,x")
+ (vec_select:V_128
+ (match_operand:V_128 1 "register_operand" "0,x")
+ (match_parallel 2 "palignr_operand"
+ [(match_operand 3 "const_int_operand" "n, n")])))]
+ "TARGET_SSSE3"
+{
+ enum machine_mode imode = GET_MODE_INNER (GET_MODE (operands[0]));
+ operands[2] = GEN_INT (INTVAL (operands[3]) * GET_MODE_SIZE (imode));
+
+ switch (which_alternative)
+ {
+ case 0:
+ return "palignr\t{%2, %1, %0|%0, %1, %2}";
+ case 1:
+ return "vpalignr\t{%2, %1, %1, %0|%0, %1, %1, %2}";
+ default:
+ gcc_unreachable ();
+ }
+}
+ [(set_attr "isa" "noavx,avx")
+ (set_attr "type" "sseishft")
+ (set_attr "atom_unit" "sishuf")
+ (set_attr "prefix_data16" "1,*")
+ (set_attr "prefix_extra" "1")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "orig,vex")])
+
(define_expand "avx_vinsertf128<mode>"
[(match_operand:V_256 0 "register_operand")
(match_operand:V_256 1 "register_operand")
diff --git a/gcc-4.8/gcc/tree-vect-data-refs.c b/gcc-4.8/gcc/tree-vect-data-refs.c
index d8d1435..6aba52c 100644
--- a/gcc-4.8/gcc/tree-vect-data-refs.c
+++ b/gcc-4.8/gcc/tree-vect-data-refs.c
@@ -4555,42 +4555,79 @@ vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
enum machine_mode mode = TYPE_MODE (vectype);
- /* vect_permute_load_chain requires the group size to be a power of two. */
- if (exact_log2 (count) == -1)
+ /* vect_permute_load_chain requires the group size to be equal to 3 or
+ be a power of two. */
+ if (count != 3 && exact_log2 (count) == -1)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "the size of the group of accesses"
- " is not a power of 2");
+ "the size of the group of accesses"
+ " is not a power of 2 or not equal to 3\n");
return false;
}
/* Check that the permutation is supported. */
if (VECTOR_MODE_P (mode))
{
- unsigned int i, nelt = GET_MODE_NUNITS (mode);
+ unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
- for (i = 0; i < nelt; i++)
- sel[i] = i * 2;
- if (can_vec_perm_p (mode, false, sel))
+ if (count == 3)
{
+ unsigned int k;
+ for (k = 0; k < 3; k++)
+ {
+ for (i = 0; i < nelt; i++)
+ if (3 * i + k < 2 * nelt)
+ sel[i] = 3 * i + k;
+ else
+ sel[i] = 0;
+ if (!can_vec_perm_p (mode, false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shuffle of 3 loads is not supported by"
+ " target\n");
+ return false;
+ }
+ for (i = 0, j = 0; i < nelt; i++)
+ if (3 * i + k < 2 * nelt)
+ sel[i] = i;
+ else
+ sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
+ if (!can_vec_perm_p (mode, false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shuffle of 3 loads is not supported by"
+ " target\n");
+ return false;
+ }
+ }
+ return true;
+ }
+ else
+ {
+ /* If length is not equal to 3 then only power of 2 is supported. */
+ gcc_assert (exact_log2 (count) != -1);
for (i = 0; i < nelt; i++)
- sel[i] = i * 2 + 1;
+ sel[i] = i * 2;
if (can_vec_perm_p (mode, false, sel))
- return true;
- }
+ {
+ for (i = 0; i < nelt; i++)
+ sel[i] = i * 2 + 1;
+ if (can_vec_perm_p (mode, false, sel))
+ return true;
+ }
+ }
}
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "extract even/odd not supported by target");
+ "extract even/odd not supported by target\n");
return false;
}
-/* Return TRUE if vec_load_lanes is available for COUNT vectors of
- type VECTYPE. */
-
bool
vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
@@ -4602,8 +4639,9 @@ vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
/* Function vect_permute_load_chain.
Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
- a power of 2, generate extract_even/odd stmts to reorder the input data
- correctly. Return the final references for loads in RESULT_CHAIN.
+ a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
+ the input data correctly. Return the final references for loads in
+ RESULT_CHAIN.
E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
The input is 4 vectors each containing 8 elements. We assign a number to each
@@ -4684,6 +4722,7 @@ vect_permute_load_chain (vec<tree> dr_chain,
{
tree data_ref, first_vect, second_vect;
tree perm_mask_even, perm_mask_odd;
+ tree perm3_mask_low, perm3_mask_high;
gimple perm_stmt;
tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
unsigned int i, j, log_length = exact_log2 (length);
@@ -4694,44 +4733,437 @@ vect_permute_load_chain (vec<tree> dr_chain,
memcpy (result_chain->address (), dr_chain.address (),
length * sizeof (tree));
- for (i = 0; i < nelt; ++i)
- sel[i] = i * 2;
- perm_mask_even = vect_gen_perm_mask (vectype, sel);
- gcc_assert (perm_mask_even != NULL);
-
- for (i = 0; i < nelt; ++i)
- sel[i] = i * 2 + 1;
- perm_mask_odd = vect_gen_perm_mask (vectype, sel);
- gcc_assert (perm_mask_odd != NULL);
-
- for (i = 0; i < log_length; i++)
+ if (length == 3)
{
- for (j = 0; j < length; j += 2)
- {
- first_vect = dr_chain[j];
- second_vect = dr_chain[j+1];
+ unsigned int k;
- /* data_ref = permute_even (first_data_ref, second_data_ref); */
- data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
+ for (k = 0; k < 3; k++)
+ {
+ for (i = 0; i < nelt; i++)
+ if (3 * i + k < 2 * nelt)
+ sel[i] = 3 * i + k;
+ else
+ sel[i] = 0;
+ perm3_mask_low = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm3_mask_low != NULL);
+
+ for (i = 0, j = 0; i < nelt; i++)
+ if (3 * i + k < 2 * nelt)
+ sel[i] = i;
+ else
+ sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
+
+ perm3_mask_high = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm3_mask_high != NULL);
+
+ first_vect = dr_chain[0];
+ second_vect = dr_chain[1];
+
+ /* Create interleaving stmt (low part of):
+ low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
+ ...}> */
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_low");
perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
first_vect, second_vect,
- perm_mask_even);
+ perm3_mask_low);
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
- (*result_chain)[j/2] = data_ref;
- /* data_ref = permute_odd (first_data_ref, second_data_ref); */
- data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
+ /* Create interleaving stmt (high part of):
+ high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
+ ...}> */
+ first_vect = data_ref;
+ second_vect = dr_chain[2];
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_high");
perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
first_vect, second_vect,
- perm_mask_odd);
+ perm3_mask_high);
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
- (*result_chain)[j/2+length/2] = data_ref;
+ (*result_chain)[k] = data_ref;
+ }
+ }
+ else
+ {
+ /* If length is not equal to 3 then only power of 2 is supported. */
+ gcc_assert (exact_log2 (length) != -1);
+
+ for (i = 0; i < nelt; ++i)
+ sel[i] = i * 2;
+ perm_mask_even = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm_mask_even != NULL);
+
+ for (i = 0; i < nelt; ++i)
+ sel[i] = i * 2 + 1;
+ perm_mask_odd = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm_mask_odd != NULL);
+
+ for (i = 0; i < log_length; i++)
+ {
+ for (j = 0; j < length; j += 2)
+ {
+ first_vect = dr_chain[j];
+ second_vect = dr_chain[j+1];
+
+ /* data_ref = permute_even (first_data_ref, second_data_ref); */
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ first_vect, second_vect,
+ perm_mask_even);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[j/2] = data_ref;
+
+ /* data_ref = permute_odd (first_data_ref, second_data_ref); */
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ first_vect, second_vect,
+ perm_mask_odd);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[j/2+length/2] = data_ref;
+ }
+ memcpy (dr_chain.address (), result_chain->address (),
+ length * sizeof (tree));
}
- memcpy (dr_chain.address (), result_chain->address (),
- length * sizeof (tree));
}
}
+/* Function vect_shift_permute_load_chain.
+
+ Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
+ sequence of stmts to reorder the input data accordingly.
+ Return the final references for loads in RESULT_CHAIN.
+ Return true if successed, false otherwise.
+
+ E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
+ The input is 3 vectors each containing 8 elements. We assign a
+ number to each element, the input sequence is:
+
+ 1st vec: 0 1 2 3 4 5 6 7
+ 2nd vec: 8 9 10 11 12 13 14 15
+ 3rd vec: 16 17 18 19 20 21 22 23
+
+ The output sequence should be:
+
+ 1st vec: 0 3 6 9 12 15 18 21
+ 2nd vec: 1 4 7 10 13 16 19 22
+ 3rd vec: 2 5 8 11 14 17 20 23
+
+ We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
+
+ First we shuffle all 3 vectors to get correct elements order:
+
+ 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
+ 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
+ 3rd vec: (16 19 22) (17 20 23) (18 21)
+
+ Next we unite and shift vector 3 times:
+
+ 1st step:
+ shift right by 6 the concatenation of:
+ "1st vec" and "2nd vec"
+ ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
+ "2nd vec" and "3rd vec"
+ ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
+ "3rd vec" and "1st vec"
+ (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
+ | New vectors |
+
+ So that now new vectors are:
+
+ 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
+ 2nd vec: (10 13) (16 19 22) (17 20 23)
+ 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
+
+ 2nd step:
+ shift right by 5 the concatenation of:
+ "1st vec" and "3rd vec"
+ ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
+ "2nd vec" and "1st vec"
+ (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
+ "3rd vec" and "2nd vec"
+ (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
+ | New vectors |
+
+ So that now new vectors are:
+
+ 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
+ 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
+ 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
+
+ 3rd step:
+ shift right by 5 the concatenation of:
+ "1st vec" and "1st vec"
+ ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
+ shift right by 3 the concatenation of:
+ "2nd vec" and "2nd vec"
+ (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
+ | New vectors |
+
+ So that now all vectors are READY:
+ 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
+ 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
+ 3rd vec: ( 1 4 7) (10 13) (16 19 22)
+
+ This algorithm is faster than one in vect_permute_load_chain if:
+ 1. "shift of a concatination" is faster than general permutation.
+ This is usually so.
+ 2. The TARGET machine can't execute vector instructions in parallel.
+ This is because each step of the algorithm depends on previous.
+ The algorithm in vect_permute_load_chain is much more parallel.
+
+ The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
+*/
+
+static bool
+vect_shift_permute_load_chain (vec<tree> dr_chain,
+ unsigned int length,
+ gimple stmt,
+ gimple_stmt_iterator *gsi,
+ vec<tree> *result_chain)
+{
+ tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
+ tree perm2_mask1, perm2_mask2, perm3_mask;
+ tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
+ gimple perm_stmt;
+
+ tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
+ unsigned int i;
+ unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
+ unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+
+ result_chain->quick_grow (length);
+ memcpy (result_chain->address (), dr_chain.address (),
+ length * sizeof (tree));
+
+ if (length == 2 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
+ {
+ for (i = 0; i < nelt / 2; ++i)
+ sel[i] = i * 2;
+ for (i = 0; i < nelt / 2; ++i)
+ sel[nelt / 2 + i] = i * 2 + 1;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shuffle of 2 fields structure is not \
+ supported by target\n");
+ return false;
+ }
+ perm2_mask1 = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm2_mask1 != NULL);
+
+ for (i = 0; i < nelt / 2; ++i)
+ sel[i] = i * 2 + 1;
+ for (i = 0; i < nelt / 2; ++i)
+ sel[nelt / 2 + i] = i * 2;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shuffle of 2 fields structure is not \
+ supported by target\n");
+ return false;
+ }
+ perm2_mask2 = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm2_mask2 != NULL);
+
+ /* Generating permutation constant to shift all elements.
+ For vector length 8 it is {4 5 6 7 8 9 10 11}. */
+ for (i = 0; i < nelt; i++)
+ sel[i] = nelt / 2 + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shift permutation is not supported by target\n");
+ return false;
+ }
+ shift1_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (shift1_mask != NULL);
+
+ /* Generating permutation constant to select vector from 2.
+ For vector length 8 it is {0 1 2 3 12 13 14 15}. */
+ for (i = 0; i < nelt / 2; i++)
+ sel[i] = i;
+ for (i = nelt / 2; i < nelt; i++)
+ sel[i] = nelt + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "select is not supported by target\n");
+ return false;
+ }
+ select_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (select_mask != NULL);
+
+ first_vect = dr_chain[0];
+ second_vect = dr_chain[1];
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ first_vect, first_vect,
+ perm2_mask1);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ vect[0] = data_ref;
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ second_vect, second_vect,
+ perm2_mask2);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ vect[1] = data_ref;
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ vect[0], vect[1],
+ shift1_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[1] = data_ref;
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ vect[0], vect[1],
+ select_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[0] = data_ref;
+
+ return true;
+ }
+ if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
+ {
+ unsigned int k = 0, l = 0;
+
+ /* Generating permutation constant to get all elements in rigth order.
+ For vector length 8 it is {0 3 6 1 4 7 2 5}. */
+ for (i = 0; i < nelt; i++)
+ {
+ if (3 * k + (l % 3) >= nelt)
+ {
+ k = 0;
+ l += (3 - (nelt % 3));
+ }
+ sel[i] = 3 * k + (l % 3);
+ k++;
+ }
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shuffle of 3 fields structure is not \
+ supported by target\n");
+ return false;
+ }
+ perm3_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm3_mask != NULL);
+
+ /* Generating permutation constant to shift all elements.
+ For vector length 8 it is {6 7 8 9 10 11 12 13}. */
+ for (i = 0; i < nelt; i++)
+ sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shift permutation is not supported by target\n");
+ return false;
+ }
+ shift1_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (shift1_mask != NULL);
+
+ /* Generating permutation constant to shift all elements.
+ For vector length 8 it is {5 6 7 8 9 10 11 12}. */
+ for (i = 0; i < nelt; i++)
+ sel[i] = 2 * (nelt / 3) + 1 + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shift permutation is not supported by target\n");
+ return false;
+ }
+ shift2_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (shift2_mask != NULL);
+
+ /* Generating permutation constant to shift all elements.
+ For vector length 8 it is {3 4 5 6 7 8 9 10}. */
+ for (i = 0; i < nelt; i++)
+ sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shift permutation is not supported by target\n");
+ return false;
+ }
+ shift3_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (shift3_mask != NULL);
+
+ /* Generating permutation constant to shift all elements.
+ For vector length 8 it is {5 6 7 8 9 10 11 12}. */
+ for (i = 0; i < nelt; i++)
+ sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shift permutation is not supported by target\n");
+ return false;
+ }
+ shift4_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (shift4_mask != NULL);
+
+ for (k = 0; k < 3; k++)
+ {
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ dr_chain[k], dr_chain[k],
+ perm3_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ vect[k] = data_ref;
+ }
+
+ for (k = 0; k < 3; k++)
+ {
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ vect[k % 3],
+ vect[(k + 1) % 3],
+ shift1_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ vect_shift[k] = data_ref;
+ }
+
+ for (k = 0; k < 3; k++)
+ {
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ vect_shift[(4 - k) % 3],
+ vect_shift[(3 - k) % 3],
+ shift2_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ vect[k] = data_ref;
+ }
+
+ (*result_chain)[3 - (nelt % 3)] = vect[2];
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ vect[0], vect[0],
+ shift3_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[nelt % 3] = data_ref;
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ vect[1], vect[1],
+ shift4_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[0] = data_ref;
+ return true;
+ }
+ return false;
+}
/* Function vect_transform_grouped_load.
@@ -4744,13 +5176,22 @@ void
vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
gimple_stmt_iterator *gsi)
{
+ enum machine_mode mode;
vec<tree> result_chain = vNULL;
/* DR_CHAIN contains input data-refs that are a part of the interleaving.
RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
vectors, that are ready for vector computation. */
result_chain.create (size);
- vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
+
+ /* If reassociation width for vector type is 2 or greater target machine can
+ execute 2 or more vector instructions in parallel. Otherwise try to
+ get chain for loads group using vect_shift_permute_load_chain. */
+ mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
+ if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
+ || !vect_shift_permute_load_chain (dr_chain, size, stmt,
+ gsi, &result_chain))
+ vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
vect_record_grouped_load_vectors (stmt, result_chain);
result_chain.release ();
}
diff --git a/gcc-4.8/gcc/tree-vect-stmts.c b/gcc-4.8/gcc/tree-vect-stmts.c
index 361c312..ff19b04 100644
--- a/gcc-4.8/gcc/tree-vect-stmts.c
+++ b/gcc-4.8/gcc/tree-vect-stmts.c
@@ -1070,10 +1070,11 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
include the cost of the permutes. */
if (!load_lanes_p && group_size > 1)
{
- /* Uses an even and odd extract operations for each needed permute. */
- int nstmts = ncopies * exact_log2 (group_size) * group_size;
- inside_cost += record_stmt_cost (body_cost_vec, nstmts, vec_perm,
- stmt_info, 0, vect_body);
+ /* Uses an even and odd extract operations or shuffle operations
+ for each needed permute. */
+ int nstmts = ncopies * ceil_log2 (group_size) * group_size;
+ inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm,
+ stmt_info, 0, vect_body);
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
diff --git a/gcc-4.9/gcc/config/i386/i386.c b/gcc-4.9/gcc/config/i386/i386.c
index df50433..4016b60 100644
--- a/gcc-4.9/gcc/config/i386/i386.c
+++ b/gcc-4.9/gcc/config/i386/i386.c
@@ -82,6 +82,7 @@ along with GCC; see the file COPYING3. If not see
#include "context.h"
#include "pass_manager.h"
#include "target-globals.h"
+#include "tree-vectorizer.h"
static rtx legitimize_dllimport_symbol (rtx, bool);
static rtx legitimize_pe_coff_extern_decl (rtx, bool);
@@ -1739,7 +1740,7 @@ struct processor_costs slm_cost = {
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
+ 4, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
@@ -1816,7 +1817,7 @@ struct processor_costs intel_cost = {
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
+ 4, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
@@ -44300,7 +44301,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
gcc_unreachable ();
case V8HImode:
- if (TARGET_SSSE3)
+ if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
return expand_vec_perm_pshufb2 (d);
else
{
@@ -44323,7 +44324,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
break;
case V16QImode:
- if (TARGET_SSSE3)
+ if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
return expand_vec_perm_pshufb2 (d);
else
{
@@ -46493,6 +46494,16 @@ ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
{
int res = 1;
+ /* Vector part. */
+ if (VECTOR_MODE_P (mode))
+ {
+ if (TARGET_VECTOR_PARALLEL_EXECUTION)
+ return 2;
+ else
+ return 1;
+ }
+
+ /* Scalar part. */
if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
res = 2;
else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
@@ -46592,7 +46603,6 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
{
unsigned *cost = (unsigned *) data;
unsigned retval = 0;
-
tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
@@ -46603,6 +46613,18 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
count *= 50; /* FIXME. */
retval = (unsigned) (count * stmt_cost);
+
+ /* We need to multiply all vector stmt cost by 1.7 (estimated cost)
+ for Silvermont as it has out of order integer pipeline and can execute
+ 2 scalar instruction per tick, but has in order SIMD pipeline. */
+ if (TARGET_SILVERMONT || TARGET_INTEL)
+ if (stmt_info && stmt_info->stmt)
+ {
+ tree lhs_op = gimple_get_lhs (stmt_info->stmt);
+ if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
+ retval = (retval * 17) / 10;
+ }
+
cost[where] += retval;
return retval;
diff --git a/gcc-4.9/gcc/config/i386/i386.h b/gcc-4.9/gcc/config/i386/i386.h
index 51659de..fb52741 100644
--- a/gcc-4.9/gcc/config/i386/i386.h
+++ b/gcc-4.9/gcc/config/i386/i386.h
@@ -425,6 +425,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_USE_VECTOR_FP_CONVERTS]
#define TARGET_USE_VECTOR_CONVERTS \
ix86_tune_features[X86_TUNE_USE_VECTOR_CONVERTS]
+#define TARGET_SLOW_PSHUFB \
+ ix86_tune_features[X86_TUNE_SLOW_PSHUFB]
+#define TARGET_VECTOR_PARALLEL_EXECUTION \
+ ix86_tune_features[X86_TUNE_VECTOR_PARALLEL_EXECUTION]
#define TARGET_FUSE_CMP_AND_BRANCH_32 \
ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32]
#define TARGET_FUSE_CMP_AND_BRANCH_64 \
diff --git a/gcc-4.9/gcc/config/i386/predicates.md b/gcc-4.9/gcc/config/i386/predicates.md
index 2ef1384..8266f3e 100644
--- a/gcc-4.9/gcc/config/i386/predicates.md
+++ b/gcc-4.9/gcc/config/i386/predicates.md
@@ -1417,6 +1417,22 @@
return true;
})
+;; Return true if OP is a parallel for a palignr permute.
+(define_predicate "palignr_operand"
+ (and (match_code "parallel")
+ (match_code "const_int" "a"))
+{
+ int elt = INTVAL (XVECEXP (op, 0, 0));
+ int i, nelt = XVECLEN (op, 0);
+
+ /* Check that an order in the permutation is suitable for palignr.
+ For example, {5 6 7 0 1 2 3 4} is "palignr 5, xmm, xmm". */
+ for (i = 1; i < nelt; ++i)
+ if (INTVAL (XVECEXP (op, 0, i)) != ((elt + i) % nelt))
+ return false;
+ return true;
+})
+
;; Return true if OP is a proper third operand to vpblendw256.
(define_predicate "avx2_pblendw_operand"
(match_code "const_int")
diff --git a/gcc-4.9/gcc/config/i386/sse.md b/gcc-4.9/gcc/config/i386/sse.md
index 27ade19..26332d2 100644
--- a/gcc-4.9/gcc/config/i386/sse.md
+++ b/gcc-4.9/gcc/config/i386/sse.md
@@ -14574,6 +14574,35 @@
(set_attr "prefix" "vex")
(set_attr "mode" "<sseinsnmode>")])
+(define_insn "*ssse3_palignr<mode>_perm"
+ [(set (match_operand:V_128 0 "register_operand" "=x,x")
+ (vec_select:V_128
+ (match_operand:V_128 1 "register_operand" "0,x")
+ (match_parallel 2 "palignr_operand"
+ [(match_operand 3 "const_int_operand" "n, n")])))]
+ "TARGET_SSSE3"
+{
+ enum machine_mode imode = GET_MODE_INNER (GET_MODE (operands[0]));
+ operands[2] = GEN_INT (INTVAL (operands[3]) * GET_MODE_SIZE (imode));
+
+ switch (which_alternative)
+ {
+ case 0:
+ return "palignr\t{%2, %1, %0|%0, %1, %2}";
+ case 1:
+ return "vpalignr\t{%2, %1, %1, %0|%0, %1, %1, %2}";
+ default:
+ gcc_unreachable ();
+ }
+}
+ [(set_attr "isa" "noavx,avx")
+ (set_attr "type" "sseishft")
+ (set_attr "atom_unit" "sishuf")
+ (set_attr "prefix_data16" "1,*")
+ (set_attr "prefix_extra" "1")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "orig,vex")])
+
(define_expand "avx_vinsertf128<mode>"
[(match_operand:V_256 0 "register_operand")
(match_operand:V_256 1 "register_operand")
diff --git a/gcc-4.9/gcc/config/i386/x86-tune.def b/gcc-4.9/gcc/config/i386/x86-tune.def
index 8399102..cb44dc3 100644
--- a/gcc-4.9/gcc/config/i386/x86-tune.def
+++ b/gcc-4.9/gcc/config/i386/x86-tune.def
@@ -386,6 +386,15 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
from integer to FP. */
DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
+/* X86_TUNE_SLOW_SHUFB: Indicates tunings with slow pshufb instruction. */
+DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
+ m_BONNELL | m_SILVERMONT | m_INTEL)
+
+/* X86_TUNE_VECTOR_PARALLEL_EXECUTION: Indicates tunings with ability to
+ execute 2 or more vector instructions in parallel. */
+DEF_TUNE (X86_TUNE_VECTOR_PARALLEL_EXECUTION, "vec_parallel",
+ m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
+
/*****************************************************************************/
/* AVX instruction selection tuning (some of SSE flags affects AVX, too) */
/*****************************************************************************/
diff --git a/gcc-4.9/gcc/testsuite/gcc.dg/vect/pr52252-ld.c b/gcc-4.9/gcc/testsuite/gcc.dg/vect/pr52252-ld.c
new file mode 100644
index 0000000..6e3cb52
--- /dev/null
+++ b/gcc-4.9/gcc/testsuite/gcc.dg/vect/pr52252-ld.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -g -ftree-vectorize -mssse3 -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */
+
+#define byte unsigned char
+
+void
+matrix_mul (byte *in, byte *out, int size)
+{
+ int i;
+ for (i = 0; i < size; i++)
+ {
+ byte in0 = in[0];
+ byte in1 = in[1];
+ byte in2 = in[2];
+ byte out0, out1, out2, out3;
+ out0 = in0 + in1;
+ out1 = in0 + in2;
+ out2 = in1 + in2;
+ out3 = in0 + in1 + in2;
+ out[0] = out0;
+ out[1] = out1;
+ out[2] = out2;
+ out[3] = out3;
+ in += 3;
+ out += 4;
+ }
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-atom.c b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-atom.c
new file mode 100644
index 0000000..715b459
--- /dev/null
+++ b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-atom.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target ssse3 } */
+/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=slm" } */
+#define byte unsigned char
+
+void
+matrix_mul (byte *in, byte *out, int size)
+{
+ int i;
+ for (i = 0; i < size; i++)
+ {
+ byte in0 = in[0];
+ byte in1 = in[1];
+ byte in2 = in[2];
+ byte out0, out1, out2, out3;
+ out0 = in0 + in1;
+ out1 = in0 + in2;
+ out2 = in1 + in2;
+ out3 = in0 + in1 + in2;
+ out[0] = out0;
+ out[1] = out1;
+ out[2] = out2;
+ out[3] = out3;
+ in += 3;
+ out += 4;
+ }
+}
+
+/* { dg-final { scan-assembler "palignr" } } */
diff --git a/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-core.c b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-core.c
new file mode 100644
index 0000000..ac857a5
--- /dev/null
+++ b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-core.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target ssse3 } */
+/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=corei7" } */
+#define byte unsigned char
+
+void
+matrix_mul (byte *in, byte *out, int size)
+{
+ int i;
+ for (i = 0; i < size; i++)
+ {
+ byte in0 = in[0];
+ byte in1 = in[1];
+ byte in2 = in[2];
+ byte out0, out1, out2, out3;
+ out0 = in0 + in1;
+ out1 = in0 + in2;
+ out2 = in1 + in2;
+ out3 = in0 + in1 + in2;
+ out[0] = out0;
+ out[1] = out1;
+ out[2] = out2;
+ out[3] = out3;
+ in += 3;
+ out += 4;
+ }
+}
+
+/* { dg-final { scan-assembler "pshufb" } } */
diff --git a/gcc-4.9/gcc/testsuite/gcc.target/i386/pr61403.c b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr61403.c
new file mode 100644
index 0000000..84cc5c5
--- /dev/null
+++ b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr61403.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse4.2 -mtune=corei7" } */
+
+#include <math.h>
+
+struct XYZ
+{
+ float x;
+ float y;
+ float z;
+};
+
+void
+norm (struct XYZ *in, struct XYZ *out, int size)
+{
+ int i;
+ for (i = 0; i < size; ++i)
+ {
+ float n = sqrt (in[i].x * in[i].x + in[i].y * in[i].y + in[i].z * in[i].z);
+ out[i].x = in[i].x / n;
+ out[i].y = in[i].y / n;
+ out[i].z = in[i].z / n;
+ }
+}
+
+/* { dg-final { scan-assembler "blend" } } */
diff --git a/gcc-4.9/gcc/tree-vect-data-refs.c b/gcc-4.9/gcc/tree-vect-data-refs.c
index 6622bd8..ab1197e 100644
--- a/gcc-4.9/gcc/tree-vect-data-refs.c
+++ b/gcc-4.9/gcc/tree-vect-data-refs.c
@@ -4815,36 +4815,76 @@ vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
enum machine_mode mode = TYPE_MODE (vectype);
- /* vect_permute_load_chain requires the group size to be a power of two. */
- if (exact_log2 (count) == -1)
+ /* vect_permute_load_chain requires the group size to be equal to 3 or
+ be a power of two. */
+ if (count != 3 && exact_log2 (count) == -1)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "the size of the group of accesses"
- " is not a power of 2\n");
+ "the size of the group of accesses"
+ " is not a power of 2 or not equal to 3\n");
return false;
}
/* Check that the permutation is supported. */
if (VECTOR_MODE_P (mode))
{
- unsigned int i, nelt = GET_MODE_NUNITS (mode);
+ unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
- for (i = 0; i < nelt; i++)
- sel[i] = i * 2;
- if (can_vec_perm_p (mode, false, sel))
+ if (count == 3)
{
+ unsigned int k;
+ for (k = 0; k < 3; k++)
+ {
+ for (i = 0; i < nelt; i++)
+ if (3 * i + k < 2 * nelt)
+ sel[i] = 3 * i + k;
+ else
+ sel[i] = 0;
+ if (!can_vec_perm_p (mode, false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shuffle of 3 loads is not supported by"
+ " target\n");
+ return false;
+ }
+ for (i = 0, j = 0; i < nelt; i++)
+ if (3 * i + k < 2 * nelt)
+ sel[i] = i;
+ else
+ sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
+ if (!can_vec_perm_p (mode, false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shuffle of 3 loads is not supported by"
+ " target\n");
+ return false;
+ }
+ }
+ return true;
+ }
+ else
+ {
+ /* If length is not equal to 3 then only power of 2 is supported. */
+ gcc_assert (exact_log2 (count) != -1);
for (i = 0; i < nelt; i++)
- sel[i] = i * 2 + 1;
+ sel[i] = i * 2;
if (can_vec_perm_p (mode, false, sel))
- return true;
- }
+ {
+ for (i = 0; i < nelt; i++)
+ sel[i] = i * 2 + 1;
+ if (can_vec_perm_p (mode, false, sel))
+ return true;
+ }
+ }
}
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "extract even/odd not supported by target\n");
+ "extract even/odd not supported by target\n");
return false;
}
@@ -4862,8 +4902,9 @@ vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
/* Function vect_permute_load_chain.
Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
- a power of 2, generate extract_even/odd stmts to reorder the input data
- correctly. Return the final references for loads in RESULT_CHAIN.
+ a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
+ the input data correctly. Return the final references for loads in
+ RESULT_CHAIN.
E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
The input is 4 vectors each containing 8 elements. We assign a number to each
@@ -4944,6 +4985,7 @@ vect_permute_load_chain (vec<tree> dr_chain,
{
tree data_ref, first_vect, second_vect;
tree perm_mask_even, perm_mask_odd;
+ tree perm3_mask_low, perm3_mask_high;
gimple perm_stmt;
tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
unsigned int i, j, log_length = exact_log2 (length);
@@ -4954,44 +4996,437 @@ vect_permute_load_chain (vec<tree> dr_chain,
memcpy (result_chain->address (), dr_chain.address (),
length * sizeof (tree));
- for (i = 0; i < nelt; ++i)
- sel[i] = i * 2;
- perm_mask_even = vect_gen_perm_mask (vectype, sel);
- gcc_assert (perm_mask_even != NULL);
-
- for (i = 0; i < nelt; ++i)
- sel[i] = i * 2 + 1;
- perm_mask_odd = vect_gen_perm_mask (vectype, sel);
- gcc_assert (perm_mask_odd != NULL);
-
- for (i = 0; i < log_length; i++)
+ if (length == 3)
{
- for (j = 0; j < length; j += 2)
- {
- first_vect = dr_chain[j];
- second_vect = dr_chain[j+1];
+ unsigned int k;
- /* data_ref = permute_even (first_data_ref, second_data_ref); */
- data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
+ for (k = 0; k < 3; k++)
+ {
+ for (i = 0; i < nelt; i++)
+ if (3 * i + k < 2 * nelt)
+ sel[i] = 3 * i + k;
+ else
+ sel[i] = 0;
+ perm3_mask_low = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm3_mask_low != NULL);
+
+ for (i = 0, j = 0; i < nelt; i++)
+ if (3 * i + k < 2 * nelt)
+ sel[i] = i;
+ else
+ sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
+
+ perm3_mask_high = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm3_mask_high != NULL);
+
+ first_vect = dr_chain[0];
+ second_vect = dr_chain[1];
+
+ /* Create interleaving stmt (low part of):
+ low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
+ ...}> */
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_low");
perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
first_vect, second_vect,
- perm_mask_even);
+ perm3_mask_low);
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
- (*result_chain)[j/2] = data_ref;
- /* data_ref = permute_odd (first_data_ref, second_data_ref); */
- data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
+ /* Create interleaving stmt (high part of):
+ high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
+ ...}> */
+ first_vect = data_ref;
+ second_vect = dr_chain[2];
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_high");
perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
first_vect, second_vect,
- perm_mask_odd);
+ perm3_mask_high);
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
- (*result_chain)[j/2+length/2] = data_ref;
+ (*result_chain)[k] = data_ref;
+ }
+ }
+ else
+ {
+ /* If length is not equal to 3 then only power of 2 is supported. */
+ gcc_assert (exact_log2 (length) != -1);
+
+ for (i = 0; i < nelt; ++i)
+ sel[i] = i * 2;
+ perm_mask_even = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm_mask_even != NULL);
+
+ for (i = 0; i < nelt; ++i)
+ sel[i] = i * 2 + 1;
+ perm_mask_odd = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm_mask_odd != NULL);
+
+ for (i = 0; i < log_length; i++)
+ {
+ for (j = 0; j < length; j += 2)
+ {
+ first_vect = dr_chain[j];
+ second_vect = dr_chain[j+1];
+
+ /* data_ref = permute_even (first_data_ref, second_data_ref); */
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ first_vect, second_vect,
+ perm_mask_even);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[j/2] = data_ref;
+
+ /* data_ref = permute_odd (first_data_ref, second_data_ref); */
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ first_vect, second_vect,
+ perm_mask_odd);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[j/2+length/2] = data_ref;
+ }
+ memcpy (dr_chain.address (), result_chain->address (),
+ length * sizeof (tree));
}
- memcpy (dr_chain.address (), result_chain->address (),
- length * sizeof (tree));
}
}
+/* Function vect_shift_permute_load_chain.
+
+ Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
+ sequence of stmts to reorder the input data accordingly.
+ Return the final references for loads in RESULT_CHAIN.
+ Return true if successed, false otherwise.
+
+ E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
+ The input is 3 vectors each containing 8 elements. We assign a
+ number to each element, the input sequence is:
+
+ 1st vec: 0 1 2 3 4 5 6 7
+ 2nd vec: 8 9 10 11 12 13 14 15
+ 3rd vec: 16 17 18 19 20 21 22 23
+
+ The output sequence should be:
+
+ 1st vec: 0 3 6 9 12 15 18 21
+ 2nd vec: 1 4 7 10 13 16 19 22
+ 3rd vec: 2 5 8 11 14 17 20 23
+
+ We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
+
+ First we shuffle all 3 vectors to get correct elements order:
+
+ 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
+ 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
+ 3rd vec: (16 19 22) (17 20 23) (18 21)
+
+ Next we unite and shift vector 3 times:
+
+ 1st step:
+ shift right by 6 the concatenation of:
+ "1st vec" and "2nd vec"
+ ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
+ "2nd vec" and "3rd vec"
+ ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
+ "3rd vec" and "1st vec"
+ (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
+ | New vectors |
+
+ So that now new vectors are:
+
+ 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
+ 2nd vec: (10 13) (16 19 22) (17 20 23)
+ 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
+
+ 2nd step:
+ shift right by 5 the concatenation of:
+ "1st vec" and "3rd vec"
+ ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
+ "2nd vec" and "1st vec"
+ (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
+ "3rd vec" and "2nd vec"
+ (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
+ | New vectors |
+
+ So that now new vectors are:
+
+ 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
+ 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
+ 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
+
+ 3rd step:
+ shift right by 5 the concatenation of:
+ "1st vec" and "1st vec"
+ ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
+ shift right by 3 the concatenation of:
+ "2nd vec" and "2nd vec"
+ (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
+ | New vectors |
+
+ So that now all vectors are READY:
+ 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
+ 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
+ 3rd vec: ( 1 4 7) (10 13) (16 19 22)
+
+ This algorithm is faster than one in vect_permute_load_chain if:
+ 1. "shift of a concatination" is faster than general permutation.
+ This is usually so.
+ 2. The TARGET machine can't execute vector instructions in parallel.
+ This is because each step of the algorithm depends on previous.
+ The algorithm in vect_permute_load_chain is much more parallel.
+
+ The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
+*/
+
+static bool
+vect_shift_permute_load_chain (vec<tree> dr_chain,
+ unsigned int length,
+ gimple stmt,
+ gimple_stmt_iterator *gsi,
+ vec<tree> *result_chain)
+{
+ tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
+ tree perm2_mask1, perm2_mask2, perm3_mask;
+ tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
+ gimple perm_stmt;
+
+ tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
+ unsigned int i;
+ unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
+ unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+
+ result_chain->quick_grow (length);
+ memcpy (result_chain->address (), dr_chain.address (),
+ length * sizeof (tree));
+
+ if (length == 2 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
+ {
+ for (i = 0; i < nelt / 2; ++i)
+ sel[i] = i * 2;
+ for (i = 0; i < nelt / 2; ++i)
+ sel[nelt / 2 + i] = i * 2 + 1;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shuffle of 2 fields structure is not \
+ supported by target\n");
+ return false;
+ }
+ perm2_mask1 = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm2_mask1 != NULL);
+
+ for (i = 0; i < nelt / 2; ++i)
+ sel[i] = i * 2 + 1;
+ for (i = 0; i < nelt / 2; ++i)
+ sel[nelt / 2 + i] = i * 2;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shuffle of 2 fields structure is not \
+ supported by target\n");
+ return false;
+ }
+ perm2_mask2 = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm2_mask2 != NULL);
+
+ /* Generating permutation constant to shift all elements.
+ For vector length 8 it is {4 5 6 7 8 9 10 11}. */
+ for (i = 0; i < nelt; i++)
+ sel[i] = nelt / 2 + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shift permutation is not supported by target\n");
+ return false;
+ }
+ shift1_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (shift1_mask != NULL);
+
+ /* Generating permutation constant to select vector from 2.
+ For vector length 8 it is {0 1 2 3 12 13 14 15}. */
+ for (i = 0; i < nelt / 2; i++)
+ sel[i] = i;
+ for (i = nelt / 2; i < nelt; i++)
+ sel[i] = nelt + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "select is not supported by target\n");
+ return false;
+ }
+ select_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (select_mask != NULL);
+
+ first_vect = dr_chain[0];
+ second_vect = dr_chain[1];
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ first_vect, first_vect,
+ perm2_mask1);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ vect[0] = data_ref;
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ second_vect, second_vect,
+ perm2_mask2);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ vect[1] = data_ref;
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ vect[0], vect[1],
+ shift1_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[1] = data_ref;
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ vect[0], vect[1],
+ select_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[0] = data_ref;
+
+ return true;
+ }
+ if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
+ {
+ unsigned int k = 0, l = 0;
+
+ /* Generating permutation constant to get all elements in rigth order.
+ For vector length 8 it is {0 3 6 1 4 7 2 5}. */
+ for (i = 0; i < nelt; i++)
+ {
+ if (3 * k + (l % 3) >= nelt)
+ {
+ k = 0;
+ l += (3 - (nelt % 3));
+ }
+ sel[i] = 3 * k + (l % 3);
+ k++;
+ }
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shuffle of 3 fields structure is not \
+ supported by target\n");
+ return false;
+ }
+ perm3_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm3_mask != NULL);
+
+ /* Generating permutation constant to shift all elements.
+ For vector length 8 it is {6 7 8 9 10 11 12 13}. */
+ for (i = 0; i < nelt; i++)
+ sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shift permutation is not supported by target\n");
+ return false;
+ }
+ shift1_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (shift1_mask != NULL);
+
+ /* Generating permutation constant to shift all elements.
+ For vector length 8 it is {5 6 7 8 9 10 11 12}. */
+ for (i = 0; i < nelt; i++)
+ sel[i] = 2 * (nelt / 3) + 1 + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shift permutation is not supported by target\n");
+ return false;
+ }
+ shift2_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (shift2_mask != NULL);
+
+ /* Generating permutation constant to shift all elements.
+ For vector length 8 it is {3 4 5 6 7 8 9 10}. */
+ for (i = 0; i < nelt; i++)
+ sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shift permutation is not supported by target\n");
+ return false;
+ }
+ shift3_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (shift3_mask != NULL);
+
+ /* Generating permutation constant to shift all elements.
+ For vector length 8 it is {5 6 7 8 9 10 11 12}. */
+ for (i = 0; i < nelt; i++)
+ sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shift permutation is not supported by target\n");
+ return false;
+ }
+ shift4_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (shift4_mask != NULL);
+
+ for (k = 0; k < 3; k++)
+ {
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ dr_chain[k], dr_chain[k],
+ perm3_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ vect[k] = data_ref;
+ }
+
+ for (k = 0; k < 3; k++)
+ {
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ vect[k % 3],
+ vect[(k + 1) % 3],
+ shift1_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ vect_shift[k] = data_ref;
+ }
+
+ for (k = 0; k < 3; k++)
+ {
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ vect_shift[(4 - k) % 3],
+ vect_shift[(3 - k) % 3],
+ shift2_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ vect[k] = data_ref;
+ }
+
+ (*result_chain)[3 - (nelt % 3)] = vect[2];
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ vect[0], vect[0],
+ shift3_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[nelt % 3] = data_ref;
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ vect[1], vect[1],
+ shift4_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[0] = data_ref;
+ return true;
+ }
+ return false;
+}
/* Function vect_transform_grouped_load.
@@ -5004,13 +5439,22 @@ void
vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
gimple_stmt_iterator *gsi)
{
+ enum machine_mode mode;
vec<tree> result_chain = vNULL;
/* DR_CHAIN contains input data-refs that are a part of the interleaving.
RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
vectors, that are ready for vector computation. */
result_chain.create (size);
- vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
+
+ /* If reassociation width for vector type is 2 or greater target machine can
+ execute 2 or more vector instructions in parallel. Otherwise try to
+ get chain for loads group using vect_shift_permute_load_chain. */
+ mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
+ if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
+ || !vect_shift_permute_load_chain (dr_chain, size, stmt,
+ gsi, &result_chain))
+ vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
vect_record_grouped_load_vectors (stmt, result_chain);
result_chain.release ();
}
diff --git a/gcc-4.9/gcc/tree-vect-stmts.c b/gcc-4.9/gcc/tree-vect-stmts.c
index 1a51d6d..b87c143 100644
--- a/gcc-4.9/gcc/tree-vect-stmts.c
+++ b/gcc-4.9/gcc/tree-vect-stmts.c
@@ -1091,10 +1091,11 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
include the cost of the permutes. */
if (!load_lanes_p && group_size > 1)
{
- /* Uses an even and odd extract operations for each needed permute. */
- int nstmts = ncopies * exact_log2 (group_size) * group_size;
- inside_cost += record_stmt_cost (body_cost_vec, nstmts, vec_perm,
- stmt_info, 0, vect_body);
+ /* Uses an even and odd extract operations or shuffle operations
+ for each needed permute. */
+ int nstmts = ncopies * ceil_log2 (group_size) * group_size;
+ inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm,
+ stmt_info, 0, vect_body);
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,