From e3cc64dec20832769406aa38cde83c7dd4194bf4 Mon Sep 17 00:00:00 2001
From: Ben Cheng
Date: Tue, 22 Apr 2014 13:33:12 -0700
Subject: [4.9] GCC 4.9.0 official release refresh

Change-Id: Ic99a7da8b44b789a48aeec93b33e93944d6e6767
---
 gcc-4.9/gcc/config/rs6000/altivec.h          |   6 ++
 gcc-4.9/gcc/config/rs6000/altivec.md         |  12 +++
 gcc-4.9/gcc/config/rs6000/constraints.md     |   5 +
 gcc-4.9/gcc/config/rs6000/predicates.md      |   8 ++
 gcc-4.9/gcc/config/rs6000/rs6000-builtin.def |   2 +
 gcc-4.9/gcc/config/rs6000/rs6000-c.c         |   6 ++
 gcc-4.9/gcc/config/rs6000/rs6000.c           |  32 ++++--
 gcc-4.9/gcc/config/rs6000/rs6000.h           |   4 +
 gcc-4.9/gcc/config/rs6000/rs6000.md          |   7 +-
 gcc-4.9/gcc/config/rs6000/vsx.md             | 145 ++++++++++++++++++++-------
 10 files changed, 183 insertions(+), 44 deletions(-)

(limited to 'gcc-4.9/gcc/config/rs6000')

diff --git a/gcc-4.9/gcc/config/rs6000/altivec.h b/gcc-4.9/gcc/config/rs6000/altivec.h
index 49c250c84..129cf6fa1 100644
--- a/gcc-4.9/gcc/config/rs6000/altivec.h
+++ b/gcc-4.9/gcc/config/rs6000/altivec.h
@@ -319,6 +319,11 @@
#define vec_sqrt __builtin_vec_sqrt
#define vec_vsx_ld __builtin_vec_vsx_ld
#define vec_vsx_st __builtin_vec_vsx_st
+
+/* Note, xxsldwi and xxpermdi were added as __builtin_vsx_<xxx> functions
+ instead of __builtin_vec_<xxx>.  */
+#define vec_xxsldwi __builtin_vsx_xxsldwi
+#define vec_xxpermdi __builtin_vsx_xxpermdi
#endif

#ifdef _ARCH_PWR8
@@ -329,6 +334,7 @@
#define vec_vaddcuq __builtin_vec_vaddcuq
#define vec_vaddudm __builtin_vec_vaddudm
#define vec_vadduqm __builtin_vec_vadduqm
+#define vec_vbpermq __builtin_vec_vbpermq
#define vec_vclz __builtin_vec_vclz
#define vec_vclzb __builtin_vec_vclzb
#define vec_vclzd __builtin_vec_vclzd
diff --git a/gcc-4.9/gcc/config/rs6000/altivec.md b/gcc-4.9/gcc/config/rs6000/altivec.md
index faa88d007..674cb40bf 100644
--- a/gcc-4.9/gcc/config/rs6000/altivec.md
+++ b/gcc-4.9/gcc/config/rs6000/altivec.md
@@ -142,6 +142,7 @@
UNSPEC_VSUBCUQ
UNSPEC_VSUBEUQM
UNSPEC_VSUBECUQ
+ UNSPEC_VBPERMQ
])

(define_c_enum "unspecv"
@@ -3322,3 +3323,14 @@
[(set_attr "length" "4")
(set_attr "type" "vecsimple")])

+;; We use V2DI as the output type to simplify converting the permute
+;; bits into an integer
+(define_insn "altivec_vbpermq"
+ [(set (match_operand:V2DI 0 "register_operand" "=v")
+ (unspec:V2DI [(match_operand:V16QI 1 "register_operand" "v")
+ (match_operand:V16QI 2 "register_operand" "v")]
+ UNSPEC_VBPERMQ))]
+ "TARGET_P8_VECTOR"
+ "vbpermq %0,%1,%2"
+ [(set_attr "length" "4")
+ (set_attr "type" "vecsimple")])
diff --git a/gcc-4.9/gcc/config/rs6000/constraints.md b/gcc-4.9/gcc/config/rs6000/constraints.md
index 50fb101e8..9d6a3bbe7 100644
--- a/gcc-4.9/gcc/config/rs6000/constraints.md
+++ b/gcc-4.9/gcc/config/rs6000/constraints.md
@@ -106,6 +106,11 @@
(define_register_constraint "wz" "rs6000_constraints[RS6000_CONSTRAINT_wz]"
"Floating point register if the LFIWZX instruction is enabled or NO_REGS.")

+(define_constraint "wD"
+ "Int constant that is the element number of the 64-bit scalar in a vector."
+ (and (match_code "const_int") + (match_test "TARGET_VSX && (ival == VECTOR_ELEMENT_SCALAR_64BIT)"))) + ;; Lq/stq validates the address for load/store quad (define_memory_constraint "wQ" "Memory operand suitable for the load/store quad instructions" diff --git a/gcc-4.9/gcc/config/rs6000/predicates.md b/gcc-4.9/gcc/config/rs6000/predicates.md index 7b1121ddb..28f4f5d98 100644 --- a/gcc-4.9/gcc/config/rs6000/predicates.md +++ b/gcc-4.9/gcc/config/rs6000/predicates.md @@ -981,6 +981,14 @@ (ior (match_operand 0 "zero_fp_constant") (match_operand 0 "reg_or_mem_operand"))) +;; Return 1 if the operand is a CONST_INT and it is the element for 64-bit +;; data types inside of a vector that scalar instructions operate on +(define_predicate "vsx_scalar_64bit" + (match_code "const_int") +{ + return (INTVAL (op) == VECTOR_ELEMENT_SCALAR_64BIT); +}) + ;; Return 1 if the operand is a general register or memory operand without ;; pre_inc or pre_dec or pre_modify, which produces invalid form of PowerPC ;; lwa instruction. diff --git a/gcc-4.9/gcc/config/rs6000/rs6000-builtin.def b/gcc-4.9/gcc/config/rs6000/rs6000-builtin.def index 9226035a3..83351691f 100644 --- a/gcc-4.9/gcc/config/rs6000/rs6000-builtin.def +++ b/gcc-4.9/gcc/config/rs6000/rs6000-builtin.def @@ -1374,6 +1374,7 @@ BU_P8V_AV_2 (VMINUD, "vminud", CONST, uminv2di3) BU_P8V_AV_2 (VMAXUD, "vmaxud", CONST, umaxv2di3) BU_P8V_AV_2 (VMRGEW, "vmrgew", CONST, p8_vmrgew) BU_P8V_AV_2 (VMRGOW, "vmrgow", CONST, p8_vmrgow) +BU_P8V_AV_2 (VBPERMQ, "vbpermq", CONST, altivec_vbpermq) BU_P8V_AV_2 (VPKUDUM, "vpkudum", CONST, altivec_vpkudum) BU_P8V_AV_2 (VPKSDSS, "vpksdss", CONST, altivec_vpksdss) BU_P8V_AV_2 (VPKUDUS, "vpkudus", CONST, altivec_vpkudus) @@ -1448,6 +1449,7 @@ BU_P8V_OVERLOAD_2 (ORC, "orc") BU_P8V_OVERLOAD_2 (VADDCUQ, "vaddcuq") BU_P8V_OVERLOAD_2 (VADDUDM, "vaddudm") BU_P8V_OVERLOAD_2 (VADDUQM, "vadduqm") +BU_P8V_OVERLOAD_2 (VBPERMQ, "vbpermq") BU_P8V_OVERLOAD_2 (VMAXSD, "vmaxsd") BU_P8V_OVERLOAD_2 (VMAXUD, "vmaxud") BU_P8V_OVERLOAD_2 (VMINSD, "vminsd") diff --git a/gcc-4.9/gcc/config/rs6000/rs6000-c.c b/gcc-4.9/gcc/config/rs6000/rs6000-c.c index 0f1dafc5a..46c4a9d8c 100644 --- a/gcc-4.9/gcc/config/rs6000/rs6000-c.c +++ b/gcc-4.9/gcc/config/rs6000/rs6000-c.c @@ -3778,6 +3778,12 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI, 0 }, + { P8V_BUILTIN_VEC_VBPERMQ, P8V_BUILTIN_VBPERMQ, + RS6000_BTI_V2DI, RS6000_BTI_V16QI, RS6000_BTI_V16QI, 0 }, + { P8V_BUILTIN_VEC_VBPERMQ, P8V_BUILTIN_VBPERMQ, + RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V16QI, + RS6000_BTI_unsigned_V16QI, 0 }, + { P8V_BUILTIN_VEC_VCLZ, P8V_BUILTIN_VCLZB, RS6000_BTI_V16QI, RS6000_BTI_V16QI, 0, 0 }, { P8V_BUILTIN_VEC_VCLZ, P8V_BUILTIN_VCLZB, diff --git a/gcc-4.9/gcc/config/rs6000/rs6000.c b/gcc-4.9/gcc/config/rs6000/rs6000.c index fc837352c..494efc562 100644 --- a/gcc-4.9/gcc/config/rs6000/rs6000.c +++ b/gcc-4.9/gcc/config/rs6000/rs6000.c @@ -2310,6 +2310,10 @@ rs6000_debug_reg_global (void) (int)END_BUILTINS); fprintf (stderr, DEBUG_FMT_D, "Number of rs6000 builtins", (int)RS6000_BUILTIN_COUNT); + + if (TARGET_VSX) + fprintf (stderr, DEBUG_FMT_D, "VSX easy 64-bit scalar element", + (int)VECTOR_ELEMENT_SCALAR_64BIT); } @@ -5632,11 +5636,15 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt) UNSPEC_VPERM); else { - /* Invert selector. */ + /* Invert selector. We prefer to generate VNAND on P8 so + that future fusion opportunities can kick in, but must + generate VNOR elsewhere. 
*/ rtx notx = gen_rtx_NOT (V16QImode, force_reg (V16QImode, x)); - rtx andx = gen_rtx_AND (V16QImode, notx, notx); + rtx iorx = (TARGET_P8_VECTOR + ? gen_rtx_IOR (V16QImode, notx, notx) + : gen_rtx_AND (V16QImode, notx, notx)); rtx tmp = gen_reg_rtx (V16QImode); - emit_move_insn (tmp, andx); + emit_insn (gen_rtx_SET (VOIDmode, tmp, iorx)); /* Permute with operands reversed and adjusted selector. */ x = gen_rtx_UNSPEC (mode, gen_rtvec (3, reg, target, tmp), @@ -30209,12 +30217,12 @@ altivec_expand_vec_perm_const_le (rtx operands[4]) /* Similarly to altivec_expand_vec_perm_const_le, we must adjust the permute control vector. But here it's not a constant, so we must - generate a vector NOR to do the adjustment. */ + generate a vector NAND or NOR to do the adjustment. */ void altivec_expand_vec_perm_le (rtx operands[4]) { - rtx notx, andx, unspec; + rtx notx, iorx, unspec; rtx target = operands[0]; rtx op0 = operands[1]; rtx op1 = operands[2]; @@ -30233,10 +30241,13 @@ altivec_expand_vec_perm_le (rtx operands[4]) if (!REG_P (target)) tmp = gen_reg_rtx (mode); - /* Invert the selector with a VNOR. */ + /* Invert the selector with a VNAND if available, else a VNOR. + The VNAND is preferred for future fusion opportunities. */ notx = gen_rtx_NOT (V16QImode, sel); - andx = gen_rtx_AND (V16QImode, notx, notx); - emit_move_insn (norreg, andx); + iorx = (TARGET_P8_VECTOR + ? gen_rtx_IOR (V16QImode, notx, notx) + : gen_rtx_AND (V16QImode, notx, notx)); + emit_insn (gen_rtx_SET (VOIDmode, norreg, iorx)); /* Permute with operands reversed and adjusted selector. */ unspec = gen_rtx_UNSPEC (mode, gen_rtvec (3, op1, op0, norreg), @@ -32519,6 +32530,11 @@ fusion_gpr_load_p (rtx *operands, bool peep2_p) if (!peep2_reg_dead_p (2, addis_reg)) return false; + + /* If the target register being loaded is the stack pointer, we must + avoid loading any other value into it, even temporarily. */ + if (REG_P (target) && REGNO (target) == STACK_POINTER_REGNUM) + return false; } base_reg = XEXP (addr, 0); diff --git a/gcc-4.9/gcc/config/rs6000/rs6000.h b/gcc-4.9/gcc/config/rs6000/rs6000.h index a6afb6c37..9ec3647fe 100644 --- a/gcc-4.9/gcc/config/rs6000/rs6000.h +++ b/gcc-4.9/gcc/config/rs6000/rs6000.h @@ -477,6 +477,10 @@ extern int rs6000_vector_align[]; #define VECTOR_ELT_ORDER_BIG \ (BYTES_BIG_ENDIAN || (rs6000_altivec_element_order == 2)) +/* Element number of the 64-bit value in a 128-bit vector that can be accessed + with scalar instructions. */ +#define VECTOR_ELEMENT_SCALAR_64BIT ((BYTES_BIG_ENDIAN) ? 0 : 1) + /* Alignment options for fields in structures for sub-targets following AIX-like ABI. ALIGN_POWER word-aligns FP doubles (default AIX ABI). diff --git a/gcc-4.9/gcc/config/rs6000/rs6000.md b/gcc-4.9/gcc/config/rs6000/rs6000.md index 4bab9591e..64c9e7c10 100644 --- a/gcc-4.9/gcc/config/rs6000/rs6000.md +++ b/gcc-4.9/gcc/config/rs6000/rs6000.md @@ -10028,13 +10028,16 @@ rtx op0 = operands[0]; rtx op1 = operands[1]; rtx op2 = operands[2]; - rtx op0_di = simplify_gen_subreg (DImode, op0, SFmode, 0); + /* Also use the destination register to hold the unconverted DImode value. + This is conceptually a separate value from OP0, so we use gen_rtx_REG + rather than simplify_gen_subreg. */ + rtx op0_di = gen_rtx_REG (DImode, REGNO (op0)); rtx op1_di = simplify_gen_subreg (DImode, op1, SFmode, 0); /* Move SF value to upper 32-bits for xscvspdpn. 
*/
emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
emit_move_insn (op0_di, op2);
- emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
+ emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0_di));
DONE;
}
[(set_attr "length" "8")
diff --git a/gcc-4.9/gcc/config/rs6000/vsx.md b/gcc-4.9/gcc/config/rs6000/vsx.md
index 93c8c3b29..d83cdc3df 100644
--- a/gcc-4.9/gcc/config/rs6000/vsx.md
+++ b/gcc-4.9/gcc/config/rs6000/vsx.md
@@ -1223,7 +1223,7 @@
;; Used by direct move to move a SFmode value from GPR to VSX register
(define_insn "vsx_xscvspdpn_directmove"
[(set (match_operand:SF 0 "vsx_register_operand" "=wa")
- (unspec:SF [(match_operand:SF 1 "vsx_register_operand" "wa")]
+ (unspec:SF [(match_operand:DI 1 "vsx_register_operand" "wa")]
UNSPEC_VSX_CVSPDPN))]
"TARGET_XSCVSPDPN"
"xscvspdpn %x0,%x1"
@@ -1531,52 +1531,129 @@
[(set_attr "type" "vecperm")])

;; Extract a DF/DI element from V2DF/V2DI
-(define_insn "vsx_extract_<mode>"
- [(set (match_operand:<VS_scalar> 0 "vsx_register_operand" "=ws,d,?wa")
- (vec_select:<VS_scalar> (match_operand:VSX_D 1 "vsx_register_operand" "wd,wd,wa")
+(define_expand "vsx_extract_<mode>"
+ [(set (match_operand:<VS_scalar> 0 "register_operand" "")
+ (vec_select:<VS_scalar> (match_operand:VSX_D 1 "register_operand" "")
(parallel
- [(match_operand:QI 2 "u5bit_cint_operand" "i,i,i")])))]
+ [(match_operand:QI 2 "u5bit_cint_operand" "")])))]
"VECTOR_MEM_VSX_P (<MODE>mode)"
+ "")
+
+;; Optimize cases where we can do a simple or direct move,
+;; or see if we can avoid doing the move at all.
+(define_insn "*vsx_extract_<mode>_internal1"
+ [(set (match_operand:<VS_scalar> 0 "register_operand" "=d,ws,?wa,r")
+ (vec_select:<VS_scalar>
+ (match_operand:VSX_D 1 "register_operand" "d,wd,wa,wm")
+ (parallel
+ [(match_operand:QI 2 "vsx_scalar_64bit" "wD,wD,wD,wD")])))]
+ "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+{
+ int op0_regno = REGNO (operands[0]);
+ int op1_regno = REGNO (operands[1]);
+
+ if (op0_regno == op1_regno)
+ return "nop";
+
+ if (INT_REGNO_P (op0_regno))
+ return "mfvsrd %0,%x1";
+
+ if (FP_REGNO_P (op0_regno) && FP_REGNO_P (op1_regno))
+ return "fmr %0,%1";
+
+ return "xxlor %x0,%x1,%x1";
+}
+ [(set_attr "type" "fp,vecsimple,vecsimple,mftgpr")
+ (set_attr "length" "4")])
+
+(define_insn "*vsx_extract_<mode>_internal2"
+ [(set (match_operand:<VS_scalar> 0 "vsx_register_operand" "=d,ws,ws,?wa")
+ (vec_select:<VS_scalar>
+ (match_operand:VSX_D 1 "vsx_register_operand" "d,wd,wd,wa")
+ (parallel [(match_operand:QI 2 "u5bit_cint_operand" "wD,wD,i,i")])))]
+ "VECTOR_MEM_VSX_P (<MODE>mode)
+ && (!TARGET_POWERPC64 || !TARGET_DIRECT_MOVE
+ || INTVAL (operands[2]) != VECTOR_ELEMENT_SCALAR_64BIT)"
{
int fldDM;
gcc_assert (UINTVAL (operands[2]) <= 1);
+
+ if (INTVAL (operands[2]) == VECTOR_ELEMENT_SCALAR_64BIT)
+ {
+ int op0_regno = REGNO (operands[0]);
+ int op1_regno = REGNO (operands[1]);
+
+ if (op0_regno == op1_regno)
+ return "nop";
+
+ if (FP_REGNO_P (op0_regno) && FP_REGNO_P (op1_regno))
+ return "fmr %0,%1";
+
+ return "xxlor %x0,%x1,%x1";
+ }
+
fldDM = INTVAL (operands[2]) << 1;
if (!BYTES_BIG_ENDIAN)
fldDM = 3 - fldDM;
operands[3] = GEN_INT (fldDM);
- return \"xxpermdi %x0,%x1,%x1,%3\";
+ return "xxpermdi %x0,%x1,%x1,%3";
}
- [(set_attr "type" "vecperm")])
+ [(set_attr "type" "fp,vecsimple,vecperm,vecperm")
+ (set_attr "length" "4")])

-;; Optimize extracting element 0 from memory
-(define_insn "*vsx_extract_<mode>_zero"
- [(set (match_operand:<VS_scalar> 0 "vsx_register_operand" "=ws,d,?wa")
+;; Optimize extracting a single scalar element from memory if the scalar is in
+;; the correct location to use a single load.
+(define_insn "*vsx_extract_<mode>_load"
+ [(set (match_operand:<VS_scalar> 0 "register_operand" "=d,wv,wr")
(vec_select:<VS_scalar>
- (match_operand:VSX_D 1 "indexed_or_indirect_operand" "Z,Z,Z")
- (parallel [(const_int 0)])))]
- "VECTOR_MEM_VSX_P (<MODE>mode) && WORDS_BIG_ENDIAN"
- "lxsd%U1x %x0,%y1"
- [(set (attr "type")
- (if_then_else
- (match_test "update_indexed_address_mem (operands[1], VOIDmode)")
- (const_string "fpload_ux")
- (const_string "fpload")))
- (set_attr "length" "4")])
-
-;; Optimize extracting element 1 from memory for little endian
-(define_insn "*vsx_extract_<mode>_one_le"
- [(set (match_operand:<VS_scalar> 0 "vsx_register_operand" "=ws,d,?wa")
+ (match_operand:VSX_D 1 "memory_operand" "m,Z,m")
+ (parallel [(match_operand:QI 2 "vsx_scalar_64bit" "wD,wD,wD")])))]
+ "VECTOR_MEM_VSX_P (<MODE>mode)"
+ "@
+ lfd%U1%X1 %0,%1
+ lxsd%U1x %x0,%y1
+ ld%U1%X1 %0,%1"
+ [(set_attr_alternative "type"
+ [(if_then_else
+ (match_test "update_indexed_address_mem (operands[1], VOIDmode)")
+ (const_string "fpload_ux")
+ (if_then_else
+ (match_test "update_address_mem (operands[1], VOIDmode)")
+ (const_string "fpload_u")
+ (const_string "fpload")))
+ (const_string "fpload")
+ (if_then_else
+ (match_test "update_indexed_address_mem (operands[1], VOIDmode)")
+ (const_string "load_ux")
+ (if_then_else
+ (match_test "update_address_mem (operands[1], VOIDmode)")
+ (const_string "load_u")
+ (const_string "load")))])
+ (set_attr "length" "4")])
+
+;; Optimize storing a single scalar element to memory when it is in the
+;; right location to use a single store.
+(define_insn "*vsx_extract_<mode>_store"
+ [(set (match_operand:<VS_scalar> 0 "memory_operand" "=m,Z,?Z")
(vec_select:<VS_scalar>
- (match_operand:VSX_D 1 "indexed_or_indirect_operand" "Z,Z,Z")
- (parallel [(const_int 1)])))]
- "VECTOR_MEM_VSX_P (<MODE>mode) && !WORDS_BIG_ENDIAN"
- "lxsd%U1x %x0,%y1"
- [(set (attr "type")
- (if_then_else
- (match_test "update_indexed_address_mem (operands[1], VOIDmode)")
- (const_string "fpload_ux")
- (const_string "fpload")))
- (set_attr "length" "4")])
+ (match_operand:VSX_D 1 "register_operand" "d,wd,wa")
+ (parallel [(match_operand:QI 2 "vsx_scalar_64bit" "wD,wD,wD")])))]
+ "VECTOR_MEM_VSX_P (<MODE>mode)"
+ "@
+ stfd%U0%X0 %1,%0
+ stxsd%U0x %x1,%y0
+ stxsd%U0x %x1,%y0"
+ [(set_attr_alternative "type"
+ [(if_then_else
+ (match_test "update_indexed_address_mem (operands[0], VOIDmode)")
+ (const_string "fpstore_ux")
+ (if_then_else
+ (match_test "update_address_mem (operands[0], VOIDmode)")
+ (const_string "fpstore_u")
+ (const_string "fpstore")))
+ (const_string "fpstore")
+ (const_string "fpstore")])
+ (set_attr "length" "4")])

;; Extract a SF element from V4SF
(define_insn_and_split "vsx_extract_v4sf"
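
For reference, a minimal usage sketch of the builtins this change wires up
(not part of the patch itself). It assumes a power8 toolchain invoked with
something like -mcpu=power8 -maltivec -mvsx; the operand types follow the
overload table added to rs6000-c.c, and the function names are illustrative
only:

    #include <altivec.h>

    /* vec_vbpermq gathers one bit from SRC for each byte of INDEX.  The
       result comes back as V2DI (vector unsigned long long) to simplify
       converting the permute bits into an integer, per altivec.md.  */
    vector unsigned long long
    gather_bits (vector unsigned char src, vector unsigned char index)
    {
      return vec_vbpermq (src, index);
    }

    /* vec_xxpermdi selects 64-bit doublewords from its two inputs; a
       selector of 2 (binary 10, assuming the instruction's big-endian
       doubleword numbering) swaps the two halves of V.  */
    vector double
    swap_halves (vector double v)
    {
      return vec_xxpermdi (v, v, 2);
    }

    /* With the new vsx_extract_<mode> patterns, extracting the element
       that already sits in the scalar slot (element 0 big endian,
       element 1 little endian, per VECTOR_ELEMENT_SCALAR_64BIT) should
       become a single fmr/mfvsrd or lfd/ld instead of an xxpermdi.  */
    double
    first_element (vector double v)
    {
      return v[0];  /* GNU C vector subscript; which subscript maps to
                       the scalar slot is endian-dependent.  */
    }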