From 55f9fbb03d0413cb8fe74e5ec5d6c2dd4280933e Mon Sep 17 00:00:00 2001 From: Alexander Ivchenko Date: Fri, 11 Jul 2014 15:24:10 +0400 Subject: [4.8, 4.9] Backport of additional SLM tuning. Six patches from trunk, reg-tested via 'make check': 2014-05-07 Evgeny Stupachenko * tree-vect-data-refs.c (vect_grouped_load_supported): New check for loads group of length 3. (vect_permute_load_chain): New permutations for loads group of length 3. * tree-vect-stmts.c (vect_model_load_cost): Change cost of vec_perm_shuffle for the new permutations. 2014-04-17 Evgeny Stupachenko * config/i386/i386.c (x86_add_stmt_cost): Fix vector cost model for Silvermont. 2014-04-17 Evgeny Stupachenko * config/i386/x86-tune.def (TARGET_SLOW_PSHUFB): New tune definition. * config/i386/i386.h (TARGET_SLOW_PSHUFB): New tune flag. * config/i386/i386.c (expand_vec_perm_even_odd_1): Avoid byte shuffles for TARGET_SLOW_PSHUFB 2014-04-17 Evgeny Stupachenko * config/i386/i386.c (slm_cost): Adjust vec_to_scalar_cost. * config/i386/i386.c (intel_cost): Ditto. 2014-06-18 Evgeny Stupachenko * config/i386/i386.c (ix86_reassociation_width): Add alternative for vector case. * config/i386/i386.h (TARGET_VECTOR_PARALLEL_EXECUTION): New. * config/i386/x86-tune.def (X86_TUNE_VECTOR_PARALLEL_EXECUTION): New. * tree-vect-data-refs.c (vect_shift_permute_load_chain): New. Introduces alternative way of loads group permutaions. (vect_transform_grouped_load): Try alternative way of permutations. 2014-06-05 Evgeny Stupachenko * config/i386/sse.md (*ssse3_palignr_perm): New. * config/i386/predicates.md (palignr_operand): New. Indicates if permutation is suitable for palignr instruction. Change-Id: I5e505735ce3dc0ec3c2a1151713a119b24d712fe Signed-off-by: Alexander Ivchenko --- gcc-4.8/gcc/config/i386/i386.c | 14 + gcc-4.8/gcc/config/i386/i386.h | 3 + gcc-4.8/gcc/config/i386/predicates.md | 16 + gcc-4.8/gcc/config/i386/sse.md | 29 ++ gcc-4.8/gcc/tree-vect-data-refs.c | 527 +++++++++++++++++++-- gcc-4.8/gcc/tree-vect-stmts.c | 9 +- gcc-4.9/gcc/config/i386/i386.c | 32 +- gcc-4.9/gcc/config/i386/i386.h | 4 + gcc-4.9/gcc/config/i386/predicates.md | 16 + gcc-4.9/gcc/config/i386/sse.md | 29 ++ gcc-4.9/gcc/config/i386/x86-tune.def | 9 + gcc-4.9/gcc/testsuite/gcc.dg/vect/pr52252-ld.c | 30 ++ .../gcc/testsuite/gcc.target/i386/pr52252-atom.c | 29 ++ .../gcc/testsuite/gcc.target/i386/pr52252-core.c | 29 ++ gcc-4.9/gcc/testsuite/gcc.target/i386/pr61403.c | 27 ++ gcc-4.9/gcc/tree-vect-data-refs.c | 524 ++++++++++++++++++-- gcc-4.9/gcc/tree-vect-stmts.c | 9 +- 17 files changed, 1240 insertions(+), 96 deletions(-) create mode 100644 gcc-4.9/gcc/testsuite/gcc.dg/vect/pr52252-ld.c create mode 100644 gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-atom.c create mode 100644 gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-core.c create mode 100644 gcc-4.9/gcc/testsuite/gcc.target/i386/pr61403.c diff --git a/gcc-4.8/gcc/config/i386/i386.c b/gcc-4.8/gcc/config/i386/i386.c index c95822851..e54013a20 100644 --- a/gcc-4.8/gcc/config/i386/i386.c +++ b/gcc-4.8/gcc/config/i386/i386.c @@ -2097,6 +2097,10 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { during reassociation of fp computation. */ m_ATOM | m_SLM | m_HASWELL, + /* X86_TUNE_VECTOR_PARALLEL_EXECUTION: Indicates tunings with ability to + execute 2 or more vector instructions in parallel. */ + m_COREI7 | m_HASWELL, + /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE regs instead of memory. 
*/ m_CORE_ALL, @@ -42496,6 +42500,16 @@ ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED, { int res = 1; + /* Vector part. */ + if (VECTOR_MODE_P (mode)) + { + if (TARGET_VECTOR_PARALLEL_EXECUTION) + return 2; + else + return 1; + } + + /* Scalar part. */ if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL) res = 2; else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL) diff --git a/gcc-4.8/gcc/config/i386/i386.h b/gcc-4.8/gcc/config/i386/i386.h index ac28ec1aa..155277924 100644 --- a/gcc-4.8/gcc/config/i386/i386.h +++ b/gcc-4.8/gcc/config/i386/i386.h @@ -330,6 +330,7 @@ enum ix86_tune_indices { X86_TUNE_AVX128_OPTIMAL, X86_TUNE_REASSOC_INT_TO_PARALLEL, X86_TUNE_REASSOC_FP_TO_PARALLEL, + X86_TUNE_VECTOR_PARALLEL_EXECUTION, X86_TUNE_GENERAL_REGS_SSE_SPILL, X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, @@ -436,6 +437,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_REASSOC_INT_TO_PARALLEL] #define TARGET_REASSOC_FP_TO_PARALLEL \ ix86_tune_features[X86_TUNE_REASSOC_FP_TO_PARALLEL] +#define TARGET_VECTOR_PARALLEL_EXECUTION \ + ix86_tune_features[X86_TUNE_VECTOR_PARALLEL_EXECUTION] #define TARGET_GENERAL_REGS_SSE_SPILL \ ix86_tune_features[X86_TUNE_GENERAL_REGS_SSE_SPILL] #define TARGET_AVOID_MEM_OPND_FOR_CMOVE \ diff --git a/gcc-4.8/gcc/config/i386/predicates.md b/gcc-4.8/gcc/config/i386/predicates.md index 0e52c8518..5575239d8 100644 --- a/gcc-4.8/gcc/config/i386/predicates.md +++ b/gcc-4.8/gcc/config/i386/predicates.md @@ -1259,6 +1259,22 @@ return true; }) +;; Return true if OP is a parallel for a palignr permute. +(define_predicate "palignr_operand" + (and (match_code "parallel") + (match_code "const_int" "a")) +{ + int elt = INTVAL (XVECEXP (op, 0, 0)); + int i, nelt = XVECLEN (op, 0); + + /* Check that an order in the permutation is suitable for palignr. + For example, {5 6 7 0 1 2 3 4} is "palignr 5, xmm, xmm". */ + for (i = 1; i < nelt; ++i) + if (INTVAL (XVECEXP (op, 0, i)) != ((elt + i) % nelt)) + return false; + return true; +}) + ;; Return true if OP is a proper third operand to vpblendw256. 
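
Note on the new predicate: palignr_operand accepts only selections that are a plain rotation of the lane indices. A minimal standalone sketch of that test (plain C, hypothetical helper name, not part of the patch):

/* Return 1 iff INDICES is a rotation of 0..NELT-1, i.e.
   indices[i] == (indices[0] + i) % nelt for every i.  Such a selection
   can be emitted as a single (v)palignr, e.g. {5 6 7 0 1 2 3 4}.  */
static int
is_rotation_selection (const int *indices, int nelt)
{
  int start = indices[0];
  int i;
  for (i = 1; i < nelt; ++i)
    if (indices[i] != (start + i) % nelt)
      return 0;
  return 1;
}
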
(define_predicate "avx2_pblendw_operand" (match_code "const_int") diff --git a/gcc-4.8/gcc/config/i386/sse.md b/gcc-4.8/gcc/config/i386/sse.md index 8b44e1d9e..7ff8f1e43 100644 --- a/gcc-4.8/gcc/config/i386/sse.md +++ b/gcc-4.8/gcc/config/i386/sse.md @@ -10901,6 +10901,35 @@ (set_attr "prefix" "vex") (set_attr "mode" "")]) +(define_insn "*ssse3_palignr_perm" + [(set (match_operand:V_128 0 "register_operand" "=x,x") + (vec_select:V_128 + (match_operand:V_128 1 "register_operand" "0,x") + (match_parallel 2 "palignr_operand" + [(match_operand 3 "const_int_operand" "n, n")])))] + "TARGET_SSSE3" +{ + enum machine_mode imode = GET_MODE_INNER (GET_MODE (operands[0])); + operands[2] = GEN_INT (INTVAL (operands[3]) * GET_MODE_SIZE (imode)); + + switch (which_alternative) + { + case 0: + return "palignr\t{%2, %1, %0|%0, %1, %2}"; + case 1: + return "vpalignr\t{%2, %1, %1, %0|%0, %1, %1, %2}"; + default: + gcc_unreachable (); + } +} + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sseishft") + (set_attr "atom_unit" "sishuf") + (set_attr "prefix_data16" "1,*") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "orig,vex")]) + (define_expand "avx_vinsertf128" [(match_operand:V_256 0 "register_operand") (match_operand:V_256 1 "register_operand") diff --git a/gcc-4.8/gcc/tree-vect-data-refs.c b/gcc-4.8/gcc/tree-vect-data-refs.c index d8d1435c3..6aba52c99 100644 --- a/gcc-4.8/gcc/tree-vect-data-refs.c +++ b/gcc-4.8/gcc/tree-vect-data-refs.c @@ -4555,42 +4555,79 @@ vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count) { enum machine_mode mode = TYPE_MODE (vectype); - /* vect_permute_load_chain requires the group size to be a power of two. */ - if (exact_log2 (count) == -1) + /* vect_permute_load_chain requires the group size to be equal to 3 or + be a power of two. */ + if (count != 3 && exact_log2 (count) == -1) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "the size of the group of accesses" - " is not a power of 2"); + "the size of the group of accesses" + " is not a power of 2 or not equal to 3\n"); return false; } /* Check that the permutation is supported. */ if (VECTOR_MODE_P (mode)) { - unsigned int i, nelt = GET_MODE_NUNITS (mode); + unsigned int i, j, nelt = GET_MODE_NUNITS (mode); unsigned char *sel = XALLOCAVEC (unsigned char, nelt); - for (i = 0; i < nelt; i++) - sel[i] = i * 2; - if (can_vec_perm_p (mode, false, sel)) + if (count == 3) { + unsigned int k; + for (k = 0; k < 3; k++) + { + for (i = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = 3 * i + k; + else + sel[i] = 0; + if (!can_vec_perm_p (mode, false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 3 loads is not supported by" + " target\n"); + return false; + } + for (i = 0, j = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = i; + else + sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); + if (!can_vec_perm_p (mode, false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 3 loads is not supported by" + " target\n"); + return false; + } + } + return true; + } + else + { + /* If length is not equal to 3 then only power of 2 is supported. 
*/ + gcc_assert (exact_log2 (count) != -1); for (i = 0; i < nelt; i++) - sel[i] = i * 2 + 1; + sel[i] = i * 2; if (can_vec_perm_p (mode, false, sel)) - return true; - } + { + for (i = 0; i < nelt; i++) + sel[i] = i * 2 + 1; + if (can_vec_perm_p (mode, false, sel)) + return true; + } + } } if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "extract even/odd not supported by target"); + "extract even/odd not supported by target\n"); return false; } -/* Return TRUE if vec_load_lanes is available for COUNT vectors of - type VECTYPE. */ - bool vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) { @@ -4602,8 +4639,9 @@ vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) /* Function vect_permute_load_chain. Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be - a power of 2, generate extract_even/odd stmts to reorder the input data - correctly. Return the final references for loads in RESULT_CHAIN. + a power of 2 or equal to 3, generate extract_even/odd stmts to reorder + the input data correctly. Return the final references for loads in + RESULT_CHAIN. E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. The input is 4 vectors each containing 8 elements. We assign a number to each @@ -4684,6 +4722,7 @@ vect_permute_load_chain (vec dr_chain, { tree data_ref, first_vect, second_vect; tree perm_mask_even, perm_mask_odd; + tree perm3_mask_low, perm3_mask_high; gimple perm_stmt; tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); unsigned int i, j, log_length = exact_log2 (length); @@ -4694,44 +4733,437 @@ vect_permute_load_chain (vec dr_chain, memcpy (result_chain->address (), dr_chain.address (), length * sizeof (tree)); - for (i = 0; i < nelt; ++i) - sel[i] = i * 2; - perm_mask_even = vect_gen_perm_mask (vectype, sel); - gcc_assert (perm_mask_even != NULL); - - for (i = 0; i < nelt; ++i) - sel[i] = i * 2 + 1; - perm_mask_odd = vect_gen_perm_mask (vectype, sel); - gcc_assert (perm_mask_odd != NULL); - - for (i = 0; i < log_length; i++) + if (length == 3) { - for (j = 0; j < length; j += 2) - { - first_vect = dr_chain[j]; - second_vect = dr_chain[j+1]; + unsigned int k; - /* data_ref = permute_even (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); + for (k = 0; k < 3; k++) + { + for (i = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = 3 * i + k; + else + sel[i] = 0; + perm3_mask_low = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm3_mask_low != NULL); + + for (i = 0, j = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = i; + else + sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); + + perm3_mask_high = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm3_mask_high != NULL); + + first_vect = dr_chain[0]; + second_vect = dr_chain[1]; + + /* Create interleaving stmt (low part of): + low = VEC_PERM_EXPR */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_low"); perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, first_vect, second_vect, - perm_mask_even); + perm3_mask_low); vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[j/2] = data_ref; - /* data_ref = permute_odd (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); + /* Create interleaving stmt (high part of): + high = VEC_PERM_EXPR */ + first_vect = data_ref; + second_vect = dr_chain[2]; + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_high"); perm_stmt = 
gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, first_vect, second_vect, - perm_mask_odd); + perm3_mask_high); vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[j/2+length/2] = data_ref; + (*result_chain)[k] = data_ref; + } + } + else + { + /* If length is not equal to 3 then only power of 2 is supported. */ + gcc_assert (exact_log2 (length) != -1); + + for (i = 0; i < nelt; ++i) + sel[i] = i * 2; + perm_mask_even = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm_mask_even != NULL); + + for (i = 0; i < nelt; ++i) + sel[i] = i * 2 + 1; + perm_mask_odd = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm_mask_odd != NULL); + + for (i = 0; i < log_length; i++) + { + for (j = 0; j < length; j += 2) + { + first_vect = dr_chain[j]; + second_vect = dr_chain[j+1]; + + /* data_ref = permute_even (first_data_ref, second_data_ref); */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + first_vect, second_vect, + perm_mask_even); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[j/2] = data_ref; + + /* data_ref = permute_odd (first_data_ref, second_data_ref); */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + first_vect, second_vect, + perm_mask_odd); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[j/2+length/2] = data_ref; + } + memcpy (dr_chain.address (), result_chain->address (), + length * sizeof (tree)); } - memcpy (dr_chain.address (), result_chain->address (), - length * sizeof (tree)); } } +/* Function vect_shift_permute_load_chain. + + Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate + sequence of stmts to reorder the input data accordingly. + Return the final references for loads in RESULT_CHAIN. + Return true if successed, false otherwise. + + E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8. + The input is 3 vectors each containing 8 elements. We assign a + number to each element, the input sequence is: + + 1st vec: 0 1 2 3 4 5 6 7 + 2nd vec: 8 9 10 11 12 13 14 15 + 3rd vec: 16 17 18 19 20 21 22 23 + + The output sequence should be: + + 1st vec: 0 3 6 9 12 15 18 21 + 2nd vec: 1 4 7 10 13 16 19 22 + 3rd vec: 2 5 8 11 14 17 20 23 + + We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output. 
+ + First we shuffle all 3 vectors to get correct elements order: + + 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5) + 2nd vec: ( 8 11 14) ( 9 12 15) (10 13) + 3rd vec: (16 19 22) (17 20 23) (18 21) + + Next we unite and shift vector 3 times: + + 1st step: + shift right by 6 the concatenation of: + "1st vec" and "2nd vec" + ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13) + "2nd vec" and "3rd vec" + ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21) + "3rd vec" and "1st vec" + (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5) + | New vectors | + + So that now new vectors are: + + 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15) + 2nd vec: (10 13) (16 19 22) (17 20 23) + 3rd vec: (18 21) ( 0 3 6) ( 1 4 7) + + 2nd step: + shift right by 5 the concatenation of: + "1st vec" and "3rd vec" + ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7) + "2nd vec" and "1st vec" + (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15) + "3rd vec" and "2nd vec" + (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23) + | New vectors | + + So that now new vectors are: + + 1st vec: ( 9 12 15) (18 21) ( 0 3 6) + 2nd vec: (17 20 23) ( 2 5) ( 8 11 14) + 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY + + 3rd step: + shift right by 5 the concatenation of: + "1st vec" and "1st vec" + ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6) + shift right by 3 the concatenation of: + "2nd vec" and "2nd vec" + (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14) + | New vectors | + + So that now all vectors are READY: + 1st vec: ( 0 3 6) ( 9 12 15) (18 21) + 2nd vec: ( 2 5) ( 8 11 14) (17 20 23) + 3rd vec: ( 1 4 7) (10 13) (16 19 22) + + This algorithm is faster than one in vect_permute_load_chain if: + 1. "shift of a concatination" is faster than general permutation. + This is usually so. + 2. The TARGET machine can't execute vector instructions in parallel. + This is because each step of the algorithm depends on previous. + The algorithm in vect_permute_load_chain is much more parallel. + + The algorithm is applicable only for LOAD CHAIN LENGTH less than VF. 
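
The "shift of a concatenation" this scheme relies on is what the SSSE3 palignr instruction provides (compare the *ssse3_palignr_perm pattern added above). A hedged intrinsics sketch, assuming SSSE3 is available and using a hypothetical helper name, of the first step's shift right by 6 elements on two 8 x 16-bit vectors:

#include <tmmintrin.h>   /* SSSE3 */

/* result[i] = concat (lo, hi)[i + 6], i.e. the VEC_PERM mask
   { 6 7 8 9 10 11 12 13 } on two 8 x 16-bit operands; the palignr
   immediate counts bytes, hence 6 * 2 = 12.  */
static inline __m128i
shift_concat_right_6x16 (__m128i lo, __m128i hi)
{
  return _mm_alignr_epi8 (hi, lo, 12);
}
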
+*/ + +static bool +vect_shift_permute_load_chain (vec dr_chain, + unsigned int length, + gimple stmt, + gimple_stmt_iterator *gsi, + vec *result_chain) +{ + tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; + tree perm2_mask1, perm2_mask2, perm3_mask; + tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; + gimple perm_stmt; + + tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); + unsigned int i; + unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype); + unsigned char *sel = XALLOCAVEC (unsigned char, nelt); + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + + result_chain->quick_grow (length); + memcpy (result_chain->address (), dr_chain.address (), + length * sizeof (tree)); + + if (length == 2 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4) + { + for (i = 0; i < nelt / 2; ++i) + sel[i] = i * 2; + for (i = 0; i < nelt / 2; ++i) + sel[nelt / 2 + i] = i * 2 + 1; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 2 fields structure is not \ + supported by target\n"); + return false; + } + perm2_mask1 = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm2_mask1 != NULL); + + for (i = 0; i < nelt / 2; ++i) + sel[i] = i * 2 + 1; + for (i = 0; i < nelt / 2; ++i) + sel[nelt / 2 + i] = i * 2; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 2 fields structure is not \ + supported by target\n"); + return false; + } + perm2_mask2 = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm2_mask2 != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {4 5 6 7 8 9 10 11}. */ + for (i = 0; i < nelt; i++) + sel[i] = nelt / 2 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift1_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift1_mask != NULL); + + /* Generating permutation constant to select vector from 2. + For vector length 8 it is {0 1 2 3 12 13 14 15}. 
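
As a cross-check of the length == 2 branch (including the select mask built just below), the following reference model, a hypothetical plain-C program that is not part of the patch, emulates the four VEC_PERM_EXPRs on lane indices for an 8-lane vector of interleaved pairs:

#include <stdio.h>

/* Two-operand permute on 8 lanes: indices 0..7 pick from A, 8..15 from B.  */
static void
vec_perm8 (const int *a, const int *b, const unsigned char *sel, int *out)
{
  int i;
  for (i = 0; i < 8; i++)
    out[i] = sel[i] < 8 ? a[sel[i]] : b[sel[i] - 8];
}

int
main (void)
{
  /* Interleaved input: x0 y0 x1 y1 ... x7 y7, numbered 0..15.  */
  int v0[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  int v1[8] = { 8, 9, 10, 11, 12, 13, 14, 15 };
  unsigned char perm2_mask1[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
  unsigned char perm2_mask2[8] = { 1, 3, 5, 7, 0, 2, 4, 6 };
  unsigned char shift1_mask[8] = { 4, 5, 6, 7, 8, 9, 10, 11 };
  unsigned char select_mask[8] = { 0, 1, 2, 3, 12, 13, 14, 15 };
  int t0[8], t1[8], even[8], odd[8];
  int i;

  vec_perm8 (v0, v0, perm2_mask1, t0);   /* x0 x1 x2 x3 y0 y1 y2 y3 */
  vec_perm8 (v1, v1, perm2_mask2, t1);   /* y4 y5 y6 y7 x4 x5 x6 x7 */
  vec_perm8 (t0, t1, shift1_mask, odd);  /* y0 .. y7  -> result[1]  */
  vec_perm8 (t0, t1, select_mask, even); /* x0 .. x7  -> result[0]  */

  for (i = 0; i < 8; i++)
    printf ("%d %d\n", even[i], odd[i]); /* prints 0 1 / 2 3 / ... / 14 15 */
  return 0;
}
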
*/ + for (i = 0; i < nelt / 2; i++) + sel[i] = i; + for (i = nelt / 2; i < nelt; i++) + sel[i] = nelt + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "select is not supported by target\n"); + return false; + } + select_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (select_mask != NULL); + + first_vect = dr_chain[0]; + second_vect = dr_chain[1]; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + first_vect, first_vect, + perm2_mask1); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[0] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + second_vect, second_vect, + perm2_mask2); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[1] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[0], vect[1], + shift1_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[1] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_select"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[0], vect[1], + select_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[0] = data_ref; + + return true; + } + if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2) + { + unsigned int k = 0, l = 0; + + /* Generating permutation constant to get all elements in rigth order. + For vector length 8 it is {0 3 6 1 4 7 2 5}. */ + for (i = 0; i < nelt; i++) + { + if (3 * k + (l % 3) >= nelt) + { + k = 0; + l += (3 - (nelt % 3)); + } + sel[i] = 3 * k + (l % 3); + k++; + } + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 3 fields structure is not \ + supported by target\n"); + return false; + } + perm3_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm3_mask != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {6 7 8 9 10 11 12 13}. */ + for (i = 0; i < nelt; i++) + sel[i] = 2 * (nelt / 3) + (nelt % 3) + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift1_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift1_mask != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {5 6 7 8 9 10 11 12}. */ + for (i = 0; i < nelt; i++) + sel[i] = 2 * (nelt / 3) + 1 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift2_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift2_mask != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {3 4 5 6 7 8 9 10}. 
*/ + for (i = 0; i < nelt; i++) + sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift3_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift3_mask != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {5 6 7 8 9 10 11 12}. */ + for (i = 0; i < nelt; i++) + sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift4_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift4_mask != NULL); + + for (k = 0; k < 3; k++) + { + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + dr_chain[k], dr_chain[k], + perm3_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[k] = data_ref; + } + + for (k = 0; k < 3; k++) + { + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[k % 3], + vect[(k + 1) % 3], + shift1_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect_shift[k] = data_ref; + } + + for (k = 0; k < 3; k++) + { + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect_shift[(4 - k) % 3], + vect_shift[(3 - k) % 3], + shift2_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[k] = data_ref; + } + + (*result_chain)[3 - (nelt % 3)] = vect[2]; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[0], vect[0], + shift3_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[nelt % 3] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[1], vect[1], + shift4_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[0] = data_ref; + return true; + } + return false; +} /* Function vect_transform_grouped_load. @@ -4744,13 +5176,22 @@ void vect_transform_grouped_load (gimple stmt, vec dr_chain, int size, gimple_stmt_iterator *gsi) { + enum machine_mode mode; vec result_chain = vNULL; /* DR_CHAIN contains input data-refs that are a part of the interleaving. RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted vectors, that are ready for vector computation. */ result_chain.create (size); - vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain); + + /* If reassociation width for vector type is 2 or greater target machine can + execute 2 or more vector instructions in parallel. Otherwise try to + get chain for loads group using vect_shift_permute_load_chain. 
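
For perspective, rough permute-statement counts for a group of three loads, derived from the code above (illustrative, not part of the patch):

/* vect_permute_load_chain, length == 3:
     3 iterations * 2 two-operand permutes            =  6 stmts
   vect_shift_permute_load_chain, length == 3:
     3 single-register shuffles + (3 * 3 - 1) shifts  = 11 stmts
   The shift-based chain is longer and serial, but its shifts map to
   cheap palignr-style operations; it is tried only when the target
   reports a vector reassociation width of 1 (see the dispatch below).  */
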
*/ + mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt))); + if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1 + || !vect_shift_permute_load_chain (dr_chain, size, stmt, + gsi, &result_chain)) + vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain); vect_record_grouped_load_vectors (stmt, result_chain); result_chain.release (); } diff --git a/gcc-4.8/gcc/tree-vect-stmts.c b/gcc-4.8/gcc/tree-vect-stmts.c index 361c312d8..ff19b04f7 100644 --- a/gcc-4.8/gcc/tree-vect-stmts.c +++ b/gcc-4.8/gcc/tree-vect-stmts.c @@ -1070,10 +1070,11 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, include the cost of the permutes. */ if (!load_lanes_p && group_size > 1) { - /* Uses an even and odd extract operations for each needed permute. */ - int nstmts = ncopies * exact_log2 (group_size) * group_size; - inside_cost += record_stmt_cost (body_cost_vec, nstmts, vec_perm, - stmt_info, 0, vect_body); + /* Uses an even and odd extract operations or shuffle operations + for each needed permute. */ + int nstmts = ncopies * ceil_log2 (group_size) * group_size; + inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm, + stmt_info, 0, vect_body); if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, diff --git a/gcc-4.9/gcc/config/i386/i386.c b/gcc-4.9/gcc/config/i386/i386.c index df504335e..4016b6052 100644 --- a/gcc-4.9/gcc/config/i386/i386.c +++ b/gcc-4.9/gcc/config/i386/i386.c @@ -82,6 +82,7 @@ along with GCC; see the file COPYING3. If not see #include "context.h" #include "pass_manager.h" #include "target-globals.h" +#include "tree-vectorizer.h" static rtx legitimize_dllimport_symbol (rtx, bool); static rtx legitimize_pe_coff_extern_decl (rtx, bool); @@ -1739,7 +1740,7 @@ struct processor_costs slm_cost = { 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ 1, /* vec_stmt_cost. */ - 1, /* vec_to_scalar_cost. */ + 4, /* vec_to_scalar_cost. */ 1, /* scalar_to_vec_cost. */ 1, /* vec_align_load_cost. */ 2, /* vec_unalign_load_cost. */ @@ -1816,7 +1817,7 @@ struct processor_costs intel_cost = { 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ 1, /* vec_stmt_cost. */ - 1, /* vec_to_scalar_cost. */ + 4, /* vec_to_scalar_cost. */ 1, /* scalar_to_vec_cost. */ 1, /* vec_align_load_cost. */ 2, /* vec_unalign_load_cost. */ @@ -44300,7 +44301,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) gcc_unreachable (); case V8HImode: - if (TARGET_SSSE3) + if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) return expand_vec_perm_pshufb2 (d); else { @@ -44323,7 +44324,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) break; case V16QImode: - if (TARGET_SSSE3) + if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) return expand_vec_perm_pshufb2 (d); else { @@ -46493,6 +46494,16 @@ ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED, { int res = 1; + /* Vector part. */ + if (VECTOR_MODE_P (mode)) + { + if (TARGET_VECTOR_PARALLEL_EXECUTION) + return 2; + else + return 1; + } + + /* Scalar part. */ if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL) res = 2; else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL) @@ -46592,7 +46603,6 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind, { unsigned *cost = (unsigned *) data; unsigned retval = 0; - tree vectype = stmt_info ? 
stmt_vectype (stmt_info) : NULL_TREE; int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); @@ -46603,6 +46613,18 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind, count *= 50; /* FIXME. */ retval = (unsigned) (count * stmt_cost); + + /* We need to multiply all vector stmt cost by 1.7 (estimated cost) + for Silvermont as it has out of order integer pipeline and can execute + 2 scalar instruction per tick, but has in order SIMD pipeline. */ + if (TARGET_SILVERMONT || TARGET_INTEL) + if (stmt_info && stmt_info->stmt) + { + tree lhs_op = gimple_get_lhs (stmt_info->stmt); + if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE) + retval = (retval * 17) / 10; + } + cost[where] += retval; return retval; diff --git a/gcc-4.9/gcc/config/i386/i386.h b/gcc-4.9/gcc/config/i386/i386.h index 51659deb3..fb527411a 100644 --- a/gcc-4.9/gcc/config/i386/i386.h +++ b/gcc-4.9/gcc/config/i386/i386.h @@ -425,6 +425,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_USE_VECTOR_FP_CONVERTS] #define TARGET_USE_VECTOR_CONVERTS \ ix86_tune_features[X86_TUNE_USE_VECTOR_CONVERTS] +#define TARGET_SLOW_PSHUFB \ + ix86_tune_features[X86_TUNE_SLOW_PSHUFB] +#define TARGET_VECTOR_PARALLEL_EXECUTION \ + ix86_tune_features[X86_TUNE_VECTOR_PARALLEL_EXECUTION] #define TARGET_FUSE_CMP_AND_BRANCH_32 \ ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32] #define TARGET_FUSE_CMP_AND_BRANCH_64 \ diff --git a/gcc-4.9/gcc/config/i386/predicates.md b/gcc-4.9/gcc/config/i386/predicates.md index 2ef138424..8266f3eaf 100644 --- a/gcc-4.9/gcc/config/i386/predicates.md +++ b/gcc-4.9/gcc/config/i386/predicates.md @@ -1417,6 +1417,22 @@ return true; }) +;; Return true if OP is a parallel for a palignr permute. +(define_predicate "palignr_operand" + (and (match_code "parallel") + (match_code "const_int" "a")) +{ + int elt = INTVAL (XVECEXP (op, 0, 0)); + int i, nelt = XVECLEN (op, 0); + + /* Check that an order in the permutation is suitable for palignr. + For example, {5 6 7 0 1 2 3 4} is "palignr 5, xmm, xmm". */ + for (i = 1; i < nelt; ++i) + if (INTVAL (XVECEXP (op, 0, i)) != ((elt + i) % nelt)) + return false; + return true; +}) + ;; Return true if OP is a proper third operand to vpblendw256. 
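
Returning to the ix86_add_stmt_cost change above: the estimated 1.7x Silvermont penalty for integer vector statements is applied with integer arithmetic. A worked instance (illustrative only):

/* count = 4 integer vector stmts, stmt_cost = 1:
     retval = 4 * 1 = 4;   retval = (4 * 17) / 10 = 6,
   i.e. roughly a 1.7x penalty for the in-order SIMD pipeline.  */
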
(define_predicate "avx2_pblendw_operand" (match_code "const_int") diff --git a/gcc-4.9/gcc/config/i386/sse.md b/gcc-4.9/gcc/config/i386/sse.md index 27ade1964..26332d2b5 100644 --- a/gcc-4.9/gcc/config/i386/sse.md +++ b/gcc-4.9/gcc/config/i386/sse.md @@ -14574,6 +14574,35 @@ (set_attr "prefix" "vex") (set_attr "mode" "")]) +(define_insn "*ssse3_palignr_perm" + [(set (match_operand:V_128 0 "register_operand" "=x,x") + (vec_select:V_128 + (match_operand:V_128 1 "register_operand" "0,x") + (match_parallel 2 "palignr_operand" + [(match_operand 3 "const_int_operand" "n, n")])))] + "TARGET_SSSE3" +{ + enum machine_mode imode = GET_MODE_INNER (GET_MODE (operands[0])); + operands[2] = GEN_INT (INTVAL (operands[3]) * GET_MODE_SIZE (imode)); + + switch (which_alternative) + { + case 0: + return "palignr\t{%2, %1, %0|%0, %1, %2}"; + case 1: + return "vpalignr\t{%2, %1, %1, %0|%0, %1, %1, %2}"; + default: + gcc_unreachable (); + } +} + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sseishft") + (set_attr "atom_unit" "sishuf") + (set_attr "prefix_data16" "1,*") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "orig,vex")]) + (define_expand "avx_vinsertf128" [(match_operand:V_256 0 "register_operand") (match_operand:V_256 1 "register_operand") diff --git a/gcc-4.9/gcc/config/i386/x86-tune.def b/gcc-4.9/gcc/config/i386/x86-tune.def index 839910267..cb44dc312 100644 --- a/gcc-4.9/gcc/config/i386/x86-tune.def +++ b/gcc-4.9/gcc/config/i386/x86-tune.def @@ -386,6 +386,15 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts", from integer to FP. */ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10) +/* X86_TUNE_SLOW_SHUFB: Indicates tunings with slow pshufb instruction. */ +DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb", + m_BONNELL | m_SILVERMONT | m_INTEL) + +/* X86_TUNE_VECTOR_PARALLEL_EXECUTION: Indicates tunings with ability to + execute 2 or more vector instructions in parallel. 
*/ +DEF_TUNE (X86_TUNE_VECTOR_PARALLEL_EXECUTION, "vec_parallel", + m_NEHALEM | m_SANDYBRIDGE | m_HASWELL) + /*****************************************************************************/ /* AVX instruction selection tuning (some of SSE flags affects AVX, too) */ /*****************************************************************************/ diff --git a/gcc-4.9/gcc/testsuite/gcc.dg/vect/pr52252-ld.c b/gcc-4.9/gcc/testsuite/gcc.dg/vect/pr52252-ld.c new file mode 100644 index 000000000..6e3cb52b8 --- /dev/null +++ b/gcc-4.9/gcc/testsuite/gcc.dg/vect/pr52252-ld.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -g -ftree-vectorize -mssse3 -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */ + +#define byte unsigned char + +void +matrix_mul (byte *in, byte *out, int size) +{ + int i; + for (i = 0; i < size; i++) + { + byte in0 = in[0]; + byte in1 = in[1]; + byte in2 = in[2]; + byte out0, out1, out2, out3; + out0 = in0 + in1; + out1 = in0 + in2; + out2 = in1 + in2; + out3 = in0 + in1 + in2; + out[0] = out0; + out[1] = out1; + out[2] = out2; + out[3] = out3; + in += 3; + out += 4; + } +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-atom.c b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-atom.c new file mode 100644 index 000000000..715b45943 --- /dev/null +++ b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-atom.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target ssse3 } */ +/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=slm" } */ +#define byte unsigned char + +void +matrix_mul (byte *in, byte *out, int size) +{ + int i; + for (i = 0; i < size; i++) + { + byte in0 = in[0]; + byte in1 = in[1]; + byte in2 = in[2]; + byte out0, out1, out2, out3; + out0 = in0 + in1; + out1 = in0 + in2; + out2 = in1 + in2; + out3 = in0 + in1 + in2; + out[0] = out0; + out[1] = out1; + out[2] = out2; + out[3] = out3; + in += 3; + out += 4; + } +} + +/* { dg-final { scan-assembler "palignr" } } */ diff --git a/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-core.c b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-core.c new file mode 100644 index 000000000..ac857a5fe --- /dev/null +++ b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr52252-core.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target ssse3 } */ +/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=corei7" } */ +#define byte unsigned char + +void +matrix_mul (byte *in, byte *out, int size) +{ + int i; + for (i = 0; i < size; i++) + { + byte in0 = in[0]; + byte in1 = in[1]; + byte in2 = in[2]; + byte out0, out1, out2, out3; + out0 = in0 + in1; + out1 = in0 + in2; + out2 = in1 + in2; + out3 = in0 + in1 + in2; + out[0] = out0; + out[1] = out1; + out[2] = out2; + out[3] = out3; + in += 3; + out += 4; + } +} + +/* { dg-final { scan-assembler "pshufb" } } */ diff --git a/gcc-4.9/gcc/testsuite/gcc.target/i386/pr61403.c b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr61403.c new file mode 100644 index 000000000..84cc5c5c8 --- /dev/null +++ b/gcc-4.9/gcc/testsuite/gcc.target/i386/pr61403.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target sse4 } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse4.2 -mtune=corei7" } */ + +#include + +struct XYZ +{ + float x; + float y; + float z; +}; + +void +norm (struct XYZ *in, struct XYZ *out, int size) +{ + int i; + for (i = 0; i < size; ++i) + { + float n = sqrt (in[i].x * in[i].x + in[i].y * 
in[i].y + in[i].z * in[i].z); + out[i].x = in[i].x / n; + out[i].y = in[i].y / n; + out[i].z = in[i].z / n; + } +} + +/* { dg-final { scan-assembler "blend" } } */ diff --git a/gcc-4.9/gcc/tree-vect-data-refs.c b/gcc-4.9/gcc/tree-vect-data-refs.c index 6622bd84d..ab1197ec6 100644 --- a/gcc-4.9/gcc/tree-vect-data-refs.c +++ b/gcc-4.9/gcc/tree-vect-data-refs.c @@ -4815,36 +4815,76 @@ vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count) { enum machine_mode mode = TYPE_MODE (vectype); - /* vect_permute_load_chain requires the group size to be a power of two. */ - if (exact_log2 (count) == -1) + /* vect_permute_load_chain requires the group size to be equal to 3 or + be a power of two. */ + if (count != 3 && exact_log2 (count) == -1) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "the size of the group of accesses" - " is not a power of 2\n"); + "the size of the group of accesses" + " is not a power of 2 or not equal to 3\n"); return false; } /* Check that the permutation is supported. */ if (VECTOR_MODE_P (mode)) { - unsigned int i, nelt = GET_MODE_NUNITS (mode); + unsigned int i, j, nelt = GET_MODE_NUNITS (mode); unsigned char *sel = XALLOCAVEC (unsigned char, nelt); - for (i = 0; i < nelt; i++) - sel[i] = i * 2; - if (can_vec_perm_p (mode, false, sel)) + if (count == 3) { + unsigned int k; + for (k = 0; k < 3; k++) + { + for (i = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = 3 * i + k; + else + sel[i] = 0; + if (!can_vec_perm_p (mode, false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 3 loads is not supported by" + " target\n"); + return false; + } + for (i = 0, j = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = i; + else + sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); + if (!can_vec_perm_p (mode, false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 3 loads is not supported by" + " target\n"); + return false; + } + } + return true; + } + else + { + /* If length is not equal to 3 then only power of 2 is supported. */ + gcc_assert (exact_log2 (count) != -1); for (i = 0; i < nelt; i++) - sel[i] = i * 2 + 1; + sel[i] = i * 2; if (can_vec_perm_p (mode, false, sel)) - return true; - } + { + for (i = 0; i < nelt; i++) + sel[i] = i * 2 + 1; + if (can_vec_perm_p (mode, false, sel)) + return true; + } + } } if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "extract even/odd not supported by target\n"); + "extract even/odd not supported by target\n"); return false; } @@ -4862,8 +4902,9 @@ vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) /* Function vect_permute_load_chain. Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be - a power of 2, generate extract_even/odd stmts to reorder the input data - correctly. Return the final references for loads in RESULT_CHAIN. + a power of 2 or equal to 3, generate extract_even/odd stmts to reorder + the input data correctly. Return the final references for loads in + RESULT_CHAIN. E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. The input is 4 vectors each containing 8 elements. 
We assign a number to each @@ -4944,6 +4985,7 @@ vect_permute_load_chain (vec dr_chain, { tree data_ref, first_vect, second_vect; tree perm_mask_even, perm_mask_odd; + tree perm3_mask_low, perm3_mask_high; gimple perm_stmt; tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); unsigned int i, j, log_length = exact_log2 (length); @@ -4954,44 +4996,437 @@ vect_permute_load_chain (vec dr_chain, memcpy (result_chain->address (), dr_chain.address (), length * sizeof (tree)); - for (i = 0; i < nelt; ++i) - sel[i] = i * 2; - perm_mask_even = vect_gen_perm_mask (vectype, sel); - gcc_assert (perm_mask_even != NULL); - - for (i = 0; i < nelt; ++i) - sel[i] = i * 2 + 1; - perm_mask_odd = vect_gen_perm_mask (vectype, sel); - gcc_assert (perm_mask_odd != NULL); - - for (i = 0; i < log_length; i++) + if (length == 3) { - for (j = 0; j < length; j += 2) - { - first_vect = dr_chain[j]; - second_vect = dr_chain[j+1]; + unsigned int k; - /* data_ref = permute_even (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); + for (k = 0; k < 3; k++) + { + for (i = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = 3 * i + k; + else + sel[i] = 0; + perm3_mask_low = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm3_mask_low != NULL); + + for (i = 0, j = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = i; + else + sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); + + perm3_mask_high = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm3_mask_high != NULL); + + first_vect = dr_chain[0]; + second_vect = dr_chain[1]; + + /* Create interleaving stmt (low part of): + low = VEC_PERM_EXPR */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_low"); perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, first_vect, second_vect, - perm_mask_even); + perm3_mask_low); vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[j/2] = data_ref; - /* data_ref = permute_odd (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); + /* Create interleaving stmt (high part of): + high = VEC_PERM_EXPR */ + first_vect = data_ref; + second_vect = dr_chain[2]; + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_high"); perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, first_vect, second_vect, - perm_mask_odd); + perm3_mask_high); vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[j/2+length/2] = data_ref; + (*result_chain)[k] = data_ref; + } + } + else + { + /* If length is not equal to 3 then only power of 2 is supported. 
*/ + gcc_assert (exact_log2 (length) != -1); + + for (i = 0; i < nelt; ++i) + sel[i] = i * 2; + perm_mask_even = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm_mask_even != NULL); + + for (i = 0; i < nelt; ++i) + sel[i] = i * 2 + 1; + perm_mask_odd = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm_mask_odd != NULL); + + for (i = 0; i < log_length; i++) + { + for (j = 0; j < length; j += 2) + { + first_vect = dr_chain[j]; + second_vect = dr_chain[j+1]; + + /* data_ref = permute_even (first_data_ref, second_data_ref); */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + first_vect, second_vect, + perm_mask_even); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[j/2] = data_ref; + + /* data_ref = permute_odd (first_data_ref, second_data_ref); */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + first_vect, second_vect, + perm_mask_odd); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[j/2+length/2] = data_ref; + } + memcpy (dr_chain.address (), result_chain->address (), + length * sizeof (tree)); } - memcpy (dr_chain.address (), result_chain->address (), - length * sizeof (tree)); } } +/* Function vect_shift_permute_load_chain. + + Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate + sequence of stmts to reorder the input data accordingly. + Return the final references for loads in RESULT_CHAIN. + Return true if successed, false otherwise. + + E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8. + The input is 3 vectors each containing 8 elements. We assign a + number to each element, the input sequence is: + + 1st vec: 0 1 2 3 4 5 6 7 + 2nd vec: 8 9 10 11 12 13 14 15 + 3rd vec: 16 17 18 19 20 21 22 23 + + The output sequence should be: + + 1st vec: 0 3 6 9 12 15 18 21 + 2nd vec: 1 4 7 10 13 16 19 22 + 3rd vec: 2 5 8 11 14 17 20 23 + + We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output. 
+ + First we shuffle all 3 vectors to get correct elements order: + + 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5) + 2nd vec: ( 8 11 14) ( 9 12 15) (10 13) + 3rd vec: (16 19 22) (17 20 23) (18 21) + + Next we unite and shift vector 3 times: + + 1st step: + shift right by 6 the concatenation of: + "1st vec" and "2nd vec" + ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13) + "2nd vec" and "3rd vec" + ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21) + "3rd vec" and "1st vec" + (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5) + | New vectors | + + So that now new vectors are: + + 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15) + 2nd vec: (10 13) (16 19 22) (17 20 23) + 3rd vec: (18 21) ( 0 3 6) ( 1 4 7) + + 2nd step: + shift right by 5 the concatenation of: + "1st vec" and "3rd vec" + ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7) + "2nd vec" and "1st vec" + (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15) + "3rd vec" and "2nd vec" + (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23) + | New vectors | + + So that now new vectors are: + + 1st vec: ( 9 12 15) (18 21) ( 0 3 6) + 2nd vec: (17 20 23) ( 2 5) ( 8 11 14) + 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY + + 3rd step: + shift right by 5 the concatenation of: + "1st vec" and "1st vec" + ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6) + shift right by 3 the concatenation of: + "2nd vec" and "2nd vec" + (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14) + | New vectors | + + So that now all vectors are READY: + 1st vec: ( 0 3 6) ( 9 12 15) (18 21) + 2nd vec: ( 2 5) ( 8 11 14) (17 20 23) + 3rd vec: ( 1 4 7) (10 13) (16 19 22) + + This algorithm is faster than one in vect_permute_load_chain if: + 1. "shift of a concatination" is faster than general permutation. + This is usually so. + 2. The TARGET machine can't execute vector instructions in parallel. + This is because each step of the algorithm depends on previous. + The algorithm in vect_permute_load_chain is much more parallel. + + The algorithm is applicable only for LOAD CHAIN LENGTH less than VF. 
+*/ + +static bool +vect_shift_permute_load_chain (vec dr_chain, + unsigned int length, + gimple stmt, + gimple_stmt_iterator *gsi, + vec *result_chain) +{ + tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; + tree perm2_mask1, perm2_mask2, perm3_mask; + tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; + gimple perm_stmt; + + tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); + unsigned int i; + unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype); + unsigned char *sel = XALLOCAVEC (unsigned char, nelt); + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + + result_chain->quick_grow (length); + memcpy (result_chain->address (), dr_chain.address (), + length * sizeof (tree)); + + if (length == 2 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4) + { + for (i = 0; i < nelt / 2; ++i) + sel[i] = i * 2; + for (i = 0; i < nelt / 2; ++i) + sel[nelt / 2 + i] = i * 2 + 1; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 2 fields structure is not \ + supported by target\n"); + return false; + } + perm2_mask1 = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm2_mask1 != NULL); + + for (i = 0; i < nelt / 2; ++i) + sel[i] = i * 2 + 1; + for (i = 0; i < nelt / 2; ++i) + sel[nelt / 2 + i] = i * 2; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 2 fields structure is not \ + supported by target\n"); + return false; + } + perm2_mask2 = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm2_mask2 != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {4 5 6 7 8 9 10 11}. */ + for (i = 0; i < nelt; i++) + sel[i] = nelt / 2 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift1_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift1_mask != NULL); + + /* Generating permutation constant to select vector from 2. + For vector length 8 it is {0 1 2 3 12 13 14 15}. 
*/ + for (i = 0; i < nelt / 2; i++) + sel[i] = i; + for (i = nelt / 2; i < nelt; i++) + sel[i] = nelt + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "select is not supported by target\n"); + return false; + } + select_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (select_mask != NULL); + + first_vect = dr_chain[0]; + second_vect = dr_chain[1]; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + first_vect, first_vect, + perm2_mask1); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[0] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + second_vect, second_vect, + perm2_mask2); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[1] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[0], vect[1], + shift1_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[1] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_select"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[0], vect[1], + select_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[0] = data_ref; + + return true; + } + if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2) + { + unsigned int k = 0, l = 0; + + /* Generating permutation constant to get all elements in rigth order. + For vector length 8 it is {0 3 6 1 4 7 2 5}. */ + for (i = 0; i < nelt; i++) + { + if (3 * k + (l % 3) >= nelt) + { + k = 0; + l += (3 - (nelt % 3)); + } + sel[i] = 3 * k + (l % 3); + k++; + } + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 3 fields structure is not \ + supported by target\n"); + return false; + } + perm3_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm3_mask != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {6 7 8 9 10 11 12 13}. */ + for (i = 0; i < nelt; i++) + sel[i] = 2 * (nelt / 3) + (nelt % 3) + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift1_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift1_mask != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {5 6 7 8 9 10 11 12}. */ + for (i = 0; i < nelt; i++) + sel[i] = 2 * (nelt / 3) + 1 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift2_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift2_mask != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {3 4 5 6 7 8 9 10}. 
*/ + for (i = 0; i < nelt; i++) + sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift3_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift3_mask != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {5 6 7 8 9 10 11 12}. */ + for (i = 0; i < nelt; i++) + sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift4_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift4_mask != NULL); + + for (k = 0; k < 3; k++) + { + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + dr_chain[k], dr_chain[k], + perm3_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[k] = data_ref; + } + + for (k = 0; k < 3; k++) + { + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[k % 3], + vect[(k + 1) % 3], + shift1_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect_shift[k] = data_ref; + } + + for (k = 0; k < 3; k++) + { + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect_shift[(4 - k) % 3], + vect_shift[(3 - k) % 3], + shift2_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[k] = data_ref; + } + + (*result_chain)[3 - (nelt % 3)] = vect[2]; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[0], vect[0], + shift3_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[nelt % 3] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[1], vect[1], + shift4_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[0] = data_ref; + return true; + } + return false; +} /* Function vect_transform_grouped_load. @@ -5004,13 +5439,22 @@ void vect_transform_grouped_load (gimple stmt, vec dr_chain, int size, gimple_stmt_iterator *gsi) { + enum machine_mode mode; vec result_chain = vNULL; /* DR_CHAIN contains input data-refs that are a part of the interleaving. RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted vectors, that are ready for vector computation. */ result_chain.create (size); - vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain); + + /* If reassociation width for vector type is 2 or greater target machine can + execute 2 or more vector instructions in parallel. Otherwise try to + get chain for loads group using vect_shift_permute_load_chain. 
*/ + mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt))); + if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1 + || !vect_shift_permute_load_chain (dr_chain, size, stmt, + gsi, &result_chain)) + vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain); vect_record_grouped_load_vectors (stmt, result_chain); result_chain.release (); } diff --git a/gcc-4.9/gcc/tree-vect-stmts.c b/gcc-4.9/gcc/tree-vect-stmts.c index 1a51d6d7b..b87c14345 100644 --- a/gcc-4.9/gcc/tree-vect-stmts.c +++ b/gcc-4.9/gcc/tree-vect-stmts.c @@ -1091,10 +1091,11 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, include the cost of the permutes. */ if (!load_lanes_p && group_size > 1) { - /* Uses an even and odd extract operations for each needed permute. */ - int nstmts = ncopies * exact_log2 (group_size) * group_size; - inside_cost += record_stmt_cost (body_cost_vec, nstmts, vec_perm, - stmt_info, 0, vect_body); + /* Uses an even and odd extract operations or shuffle operations + for each needed permute. */ + int nstmts = ncopies * ceil_log2 (group_size) * group_size; + inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm, + stmt_info, 0, vect_body); if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, -- cgit v1.2.3
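
A short worked example of the vect_model_load_cost change (illustrative): for ncopies = 1 and group_size = 3 the old formula used exact_log2, which is -1 for non-powers of two, whereas ceil_log2 gives a usable estimate:

/* old: nstmts = 1 * exact_log2 (3) * 3          -- exact_log2 (3) == -1,
        so groups of 3 could not be costed (they were rejected earlier);
   new: nstmts = 1 * ceil_log2 (3) * 3 = 2 * 3 = 6 permute stmts,
        matching the six VEC_PERM_EXPRs vect_permute_load_chain emits
        for a group of three.  */
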