diff options
author | Alexander Ivchenko <alexander.ivchenko@intel.com> | 2015-03-16 10:30:57 +0300 |
---|---|---|
committer | Alexander Ivchenko <alexander.ivchenko@intel.com> | 2015-03-17 13:03:08 +0300 |
commit | 3951a3654b8197466bee3e6732b3bc94e4018f68 (patch) | |
tree | 5f71295bf4a3df8c9d8187ae983591466ff82a86 /gcc-4.9/gcc/config | |
parent | 8075018d7ad15059179e6ff7d0dd12071e1749b9 (diff) | |
download | toolchain_gcc-3951a3654b8197466bee3e6732b3bc94e4018f68.tar.gz toolchain_gcc-3951a3654b8197466bee3e6732b3bc94e4018f68.tar.bz2 toolchain_gcc-3951a3654b8197466bee3e6732b3bc94e4018f68.zip |
[4.9] Several improvements in code generation for x86. Backport from trunk.
2014-11-21 Evgeny Stupachenko <evstupac@gmail.com>
PR target/60451
* config/i386/i386.c (expand_vec_perm_even_odd_pack): New.
(expand_vec_perm_even_odd_1): Add new expand for V8HI mode,
replace for V16QI, V16HI and V32QI modes.
(ix86_expand_vec_perm_const_1): Add new expand.
2014-06-11 Evgeny Stupachenko <evstupac@gmail.com>
* tree-vect-data-refs.c (vect_grouped_store_supported): New
check for stores group of length 3.
(vect_permute_store_chain): New permutations for stores group of
length 3.
* tree-vect-stmts.c (vect_model_store_cost): Change cost
of vec_perm_shuffle for the new permutations.
2014-11-28 Evgeny Stupachenko <evstupac@gmail.com>
* tree-vect-data-refs.c (vect_transform_grouped_load): Limit shift
permutations to loads group of size 3.
2014-12-18 Bin Cheng <bin.cheng@arm.com>
PR tree-optimization/62178
* tree-ssa-loop-ivopts.c (cheaper_cost_with_cand): New function.
(iv_ca_replace): New function.
(try_improve_iv_set): New parameter try_replace_p.
Break local optimal fixed-point by calling iv_ca_replace.
(find_optimal_iv_set_1): Pass new argument to try_improve_iv_set.
Change-Id: I5dca8236d3807cedc5e09d7eda65f0ccec9f5cb2
Signed-off-by: Alexander Ivchenko <alexander.ivchenko@intel.com>
Diffstat (limited to 'gcc-4.9/gcc/config')
-rw-r--r-- | gcc-4.9/gcc/config/i386/i386.c | 153 |
1 files changed, 129 insertions, 24 deletions
diff --git a/gcc-4.9/gcc/config/i386/i386.c b/gcc-4.9/gcc/config/i386/i386.c
index fcd5f0dd1..a598b8eef 100644
--- a/gcc-4.9/gcc/config/i386/i386.c
+++ b/gcc-4.9/gcc/config/i386/i386.c
@@ -44231,6 +44231,127 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
+   and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
+   with two "and" and "pack" or two "shift" and "pack" insns.  We should
+   have already failed all two instruction sequences.  */
+
+static bool
+expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
+{
+  rtx op, dop0, dop1, t, rperm[16];
+  unsigned i, odd, c, s, nelt = d->nelt;
+  bool end_perm = false;
+  machine_mode half_mode;
+  rtx (*gen_and) (rtx, rtx, rtx);
+  rtx (*gen_pack) (rtx, rtx, rtx);
+  rtx (*gen_shift) (rtx, rtx, rtx);
+
+  if (d->one_operand_p)
+    return false;
+
+  switch (d->vmode)
+    {
+    case V8HImode:
+      /* Required for "pack".  */
+      if (!TARGET_SSE4_1)
+        return false;
+      c = 0xffff;
+      s = 16;
+      half_mode = V4SImode;
+      gen_and = gen_andv4si3;
+      gen_pack = gen_sse4_1_packusdw;
+      gen_shift = gen_lshrv4si3;
+      break;
+    case V16QImode:
+      /* No check as all instructions are SSE2.  */
+      c = 0xff;
+      s = 8;
+      half_mode = V8HImode;
+      gen_and = gen_andv8hi3;
+      gen_pack = gen_sse2_packuswb;
+      gen_shift = gen_lshrv8hi3;
+      break;
+    case V16HImode:
+      if (!TARGET_AVX2)
+        return false;
+      c = 0xffff;
+      s = 16;
+      half_mode = V8SImode;
+      gen_and = gen_andv8si3;
+      gen_pack = gen_avx2_packusdw;
+      gen_shift = gen_lshrv8si3;
+      end_perm = true;
+      break;
+    case V32QImode:
+      if (!TARGET_AVX2)
+        return false;
+      c = 0xff;
+      s = 8;
+      half_mode = V16HImode;
+      gen_and = gen_andv16hi3;
+      gen_pack = gen_avx2_packuswb;
+      gen_shift = gen_lshrv16hi3;
+      end_perm = true;
+      break;
+    default:
+      /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
+         general shuffles.  */
+      return false;
+    }
+
+  /* Check that permutation is even or odd.  */
+  odd = d->perm[0];
+  if (odd > 1)
+    return false;
+
+  for (i = 1; i < nelt; ++i)
+    if (d->perm[i] != 2 * i + odd)
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  dop0 = gen_reg_rtx (half_mode);
+  dop1 = gen_reg_rtx (half_mode);
+  if (odd == 0)
+    {
+      for (i = 0; i < nelt / 2; i++)
+        rperm[i] = GEN_INT (c);
+      t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
+      t = force_reg (half_mode, t);
+      emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
+      emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+    }
+  else
+    {
+      emit_insn (gen_shift (dop0,
+                            gen_lowpart (half_mode, d->op0),
+                            GEN_INT (s)));
+      emit_insn (gen_shift (dop1,
+                            gen_lowpart (half_mode, d->op1),
+                            GEN_INT (s)));
+    }
+  /* In AVX2 for 256 bit case we need to permute pack result.  */
+  if (TARGET_AVX2 && end_perm)
+    {
+      op = gen_reg_rtx (d->vmode);
+      t = gen_reg_rtx (V4DImode);
+      emit_insn (gen_pack (op, dop0, dop1));
+      emit_insn (gen_avx2_permv4di_1 (t,
+                                      gen_lowpart (V4DImode, op),
+                                      const0_rtx,
+                                      const2_rtx,
+                                      const1_rtx,
+                                      GEN_INT (3)));
+      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
+    }
+  else
+    emit_insn (gen_pack (d->target, dop0, dop1));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
    and extract-odd permutations.  */
@@ -44302,7 +44423,9 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
       gcc_unreachable ();
 
     case V8HImode:
-      if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
+      if (TARGET_SSE4_1)
+        return expand_vec_perm_even_odd_pack (d);
+      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
         return expand_vec_perm_pshufb2 (d);
       else
         {
@@ -44325,32 +44448,11 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
       break;
 
     case V16QImode:
-      if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
-        return expand_vec_perm_pshufb2 (d);
-      else
-        {
-          if (d->testing_p)
-            break;
-          t1 = gen_reg_rtx (V16QImode);
-          t2 = gen_reg_rtx (V16QImode);
-          t3 = gen_reg_rtx (V16QImode);
-          emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
-          emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
-          emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
-          emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
-          emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
-          emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
-          if (odd)
-            t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
-          else
-            t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
-          emit_insn (t3);
-        }
-      break;
+      return expand_vec_perm_even_odd_pack (d);
 
     case V16HImode:
     case V32QImode:
-      return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
+      return expand_vec_perm_even_odd_pack (d);
 
     case V4DImode:
       if (!TARGET_AVX2)
@@ -44720,6 +44822,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 
   /* Try sequences of three instructions.  */
 
+  if (expand_vec_perm_even_odd_pack (d))
+    return true;
+
   if (expand_vec_perm_2vperm2f128_vshuf (d))
     return true;