Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 890
1 file changed, 541 insertions, 349 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 4ab92add25..1c3b9ae790 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -605,10 +605,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); // We don't support sin/cos/fmod - setOperationAction(ISD::FSIN , MVT::f64, Expand); - setOperationAction(ISD::FCOS , MVT::f64, Expand); - setOperationAction(ISD::FSIN , MVT::f32, Expand); - setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Expand FP immediates into loads from the stack, except for the special // cases we handle. @@ -633,8 +635,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); // We don't support sin/cos/fmod - setOperationAction(ISD::FSIN , MVT::f32, Expand); - setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Special cases we handle for FP constants. addLegalFPImmediate(APFloat(+0.0f)); // xorps @@ -644,8 +647,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS if (!TM.Options.UnsafeFPMath) { - setOperationAction(ISD::FSIN , MVT::f64 , Expand); - setOperationAction(ISD::FCOS , MVT::f64 , Expand); + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); } } else if (!TM.Options.UseSoftFloat) { // f32 and f64 in x87. 
@@ -659,10 +663,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); if (!TM.Options.UnsafeFPMath) { - setOperationAction(ISD::FSIN , MVT::f32 , Expand); - setOperationAction(ISD::FSIN , MVT::f64 , Expand); - setOperationAction(ISD::FCOS , MVT::f32 , Expand); - setOperationAction(ISD::FCOS , MVT::f64 , Expand); + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); } addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 @@ -699,8 +705,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } if (!TM.Options.UnsafeFPMath) { - setOperationAction(ISD::FSIN , MVT::f80 , Expand); - setOperationAction(ISD::FCOS , MVT::f80 , Expand); + setOperationAction(ISD::FSIN , MVT::f80, Expand); + setOperationAction(ISD::FCOS , MVT::f80, Expand); + setOperationAction(ISD::FSINCOS, MVT::f80, Expand); } setOperationAction(ISD::FFLOOR, MVT::f80, Expand); @@ -748,7 +755,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FMA, VT, Expand); setOperationAction(ISD::FPOWI, VT, Expand); @@ -1047,6 +1056,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SRA, MVT::v4i32, Custom); } + setOperationAction(ISD::SDIV, MVT::v8i16, Custom); + setOperationAction(ISD::SDIV, MVT::v4i32, Custom); } if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) { @@ -1111,6 +1122,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SRA, MVT::v16i16, Custom); setOperationAction(ISD::SRA, MVT::v32i8, Custom); + setOperationAction(ISD::SDIV, MVT::v16i16, Custom); + setOperationAction(ISD::SETCC, MVT::v32i8, Custom); setOperationAction(ISD::SETCC, MVT::v16i16, Custom); setOperationAction(ISD::SETCC, MVT::v8i32, Custom); @@ -1166,6 +1179,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SHL, MVT::v8i32, Legal); setOperationAction(ISD::SRA, MVT::v8i32, Legal); + + setOperationAction(ISD::SDIV, MVT::v8i32, Custom); } else { setOperationAction(ISD::ADD, MVT::v4i64, Custom); setOperationAction(ISD::ADD, MVT::v8i32, Custom); @@ -1275,6 +1290,19 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setLibcallName(RTLIB::SRA_I128, 0); } + // Combine sin / cos into one node or libcall if possible. + if (Subtarget->hasSinCos()) { + setLibcallName(RTLIB::SINCOS_F32, "sincosf"); + setLibcallName(RTLIB::SINCOS_F64, "sincos"); + if (Subtarget->isTargetDarwin()) { + // For MacOSX, we don't want to the normal expansion of a libcall to + // sincos. We want to issue a libcall to __sincos_stret to avoid memory + // traffic. 
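The hunks above teach the target about ISD::FSINCOS: when the C library provides sincos/sincosf the two trig calls can be merged into one libcall, and the Darwin-only custom lowering that follows targets __sincos_stret so that, per the comment, the results do not have to round-trip through memory. A minimal source-level illustration of what the combined call buys (plain C++, not LLVM code; assumes a glibc/Darwin-style sincos is available, matching the libcall names registered above):

    // Illustrative only, not LLVM code. Assumes the platform declares sincos().
    #define _GNU_SOURCE
    #include <math.h>

    void polar_to_cartesian(double r, double theta, double *x, double *y) {
      double s, c;
      sincos(theta, &s, &c);   // one libcall instead of sin(theta) plus cos(theta)
      *x = r * c;
      *y = r * s;
    }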
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom); + setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + } + } + // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); @@ -1295,6 +1323,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::SETCC); @@ -1306,17 +1335,17 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // On Darwin, -Os means optimize for size without hurting performance, // do not reduce the limit. - maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores - maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; - maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores - maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; - maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores - maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores + MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; + MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores + MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores + MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; setPrefLoopAlignment(4); // 2^4 bytes. - benefitFromCodePlacementOpt = true; + BenefitFromCodePlacementOpt = true; // Predictable cmov don't hurt on atom because it's in-order. - predictableSelectIsExpensive = !Subtarget->isAtom(); + PredictableSelectIsExpensive = !Subtarget->isAtom(); setPrefFunctionAlignment(4); // 2^4 bytes. } @@ -1562,14 +1591,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); - // Add the regs to the liveout set for the function. - MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); - for (unsigned i = 0; i != RVLocs.size(); ++i) - if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) - MRI.addLiveOut(RVLocs[i].getLocReg()); - SDValue Flag; - SmallVector<SDValue, 6> RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) // Operand #1 = Bytes To Pop @@ -1638,12 +1660,13 @@ X86TargetLowering::LowerReturn(SDValue Chain, Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } - // The x86-64 ABI for returning structs by value requires that we copy - // the sret argument into %rax for the return. We saved the argument into - // a virtual register in the entry block, so now we copy the value out - // and into %rax. + // The x86-64 ABIs require that for returning structs by value we copy + // the sret argument into %rax/%eax (depending on ABI) for the return. + // We saved the argument into a virtual register in the entry block, + // so now we copy the value out and into %rax/%eax. 
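For context on the %rax/%eax wording above: the x86-64 ABIs require the callee to hand the hidden sret pointer back in the return register, and the x32/ILP32 variant uses %eax where the LP64 ABI uses %rax, which is what the RetValReg selection in the next hunk implements. A rough illustration of the kind of function this affects (ordinary C++, not LLVM code):

    // Illustrative only. A struct too large to return in registers is returned
    // through a hidden pointer argument; the ABI additionally requires that
    // pointer to be placed in %rax (%eax under x32) before returning.
    struct Big { long v[8]; };

    Big makeBig(long seed) {
      Big b;
      for (int i = 0; i < 8; ++i)
        b.v[i] = seed + i;
      return b;   // store through the sret pointer, then return that pointer
                  // in %rax/%eax
    }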
if (Subtarget->is64Bit() && DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { MachineFunction &MF = DAG.getMachineFunction(); @@ -1653,11 +1676,12 @@ X86TargetLowering::LowerReturn(SDValue Chain, "SRetReturnReg should have been set in LowerFormalArguments()."); SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); - Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); + unsigned RetValReg = Subtarget->isTarget64BitILP32() ? X86::EAX : X86::RAX; + Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); Flag = Chain.getValue(1); - // RAX now acts like a return value. - MRI.addLiveOut(X86::RAX); + // RAX/EAX now acts like a return value. + RetOps.push_back(DAG.getRegister(RetValReg, MVT::i64)); } RetOps[0] = Chain; // Update chain. @@ -2009,14 +2033,16 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, InVals.push_back(ArgValue); } - // The x86-64 ABI for returning structs by value requires that we copy - // the sret argument into %rax for the return. Save the argument into - // a virtual register so that we can access it from the return points. + // The x86-64 ABIs require that for returning structs by value we copy + // the sret argument into %rax/%eax (depending on ABI) for the return. + // Save the argument into a virtual register so that we can access it + // from the return points. if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); unsigned Reg = FuncInfo->getSRetReturnReg(); if (!Reg) { - Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); + MVT PtrTy = getPointerTy(); + Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); FuncInfo->setSRetReturnReg(Reg); } SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); @@ -2630,8 +2656,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // This isn't right, although it's probably harmless on x86; liveouts // should be computed from returns not tail calls. Consider a void // function making a tail call to a function returning int. - return DAG.getNode(X86ISD::TC_RETURN, dl, - NodeTys, &Ops[0], Ops.size()); + return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); } Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); @@ -2789,7 +2814,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, - SelectionDAG& DAG) const { + SelectionDAG &DAG) const { if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) return false; @@ -2828,7 +2853,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // An stdcall caller is expected to clean up its arguments; the callee // isn't going to do that. - if (!CCMatch && CallerCC==CallingConv::X86_StdCall) + if (!CCMatch && CallerCC == CallingConv::X86_StdCall) return false; // Do not sibcall optimize vararg calls unless all arguments are passed via @@ -2948,9 +2973,15 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // callee-saved registers are restored. These happen to be the same // registers used to pass 'inreg' arguments so watch out for those. 
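The check that follows tightens 32-bit sibling-call eligibility: EAX, ECX and EDX are the registers that may carry 'inreg' arguments, and under PIC one of them is also needed to compute the callee's address, so the tolerated count drops by one. A condensed sketch of that budget (hypothetical helper name, not the real LLVM code):

    // Hypothetical condensation of the check below; the name is illustrative.
    // The tail call is rejected once inreg arguments occupy every register
    // still available for forming the call target under the current reloc model.
    bool inRegArgsFitForSibCall(unsigned NumInRegArgs, bool IsPIC) {
      const unsigned MaxInRegs = IsPIC ? 2 : 3;   // EAX/ECX/EDX minus a PIC scratch
      return NumInRegArgs < MaxInRegs;
    }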
if (!Subtarget->is64Bit() && - !isa<GlobalAddressSDNode>(Callee) && - !isa<ExternalSymbolSDNode>(Callee)) { + ((!isa<GlobalAddressSDNode>(Callee) && + !isa<ExternalSymbolSDNode>(Callee)) || + getTargetMachine().getRelocationModel() == Reloc::PIC_)) { unsigned NumInRegs = 0; + // In PIC we need an extra register to formulate the address computation + // for the callee. + unsigned MaxInRegs = + (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 2 : 3; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (!VA.isRegLoc()) @@ -2959,7 +2990,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, switch (Reg) { default: break; case X86::EAX: case X86::EDX: case X86::ECX: - if (++NumInRegs == 3) + if (++NumInRegs == MaxInRegs) return false; break; } @@ -2995,7 +3026,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::SHUFP: - case X86ISD::PALIGN: + case X86ISD::PALIGNR: case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: case X86ISD::MOVHLPS: @@ -3045,7 +3076,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); - case X86ISD::PALIGN: + case X86ISD::PALIGNR: case X86ISD::SHUFP: case X86ISD::VPERM2X128: return DAG.getNode(Opc, dl, VT, V1, V2, @@ -3355,8 +3386,8 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { /// is suitable for input to PALIGNR. static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT, const X86Subtarget *Subtarget) { - if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) || - (VT.getSizeInBits() == 256 && !Subtarget->hasInt256())) + if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) || + (VT.is256BitVector() && !Subtarget->hasInt256())) return false; unsigned NumElts = VT.getVectorNumElements(); @@ -3445,7 +3476,7 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, /// reverse of what x86 shuffles want. static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256, bool Commuted = false) { - if (!HasFp256 && VT.getSizeInBits() == 256) + if (!HasFp256 && VT.is256BitVector()) return false; unsigned NumElems = VT.getVectorNumElements(); @@ -3580,7 +3611,7 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) { static SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getValueType(0).getSimpleVT(); DebugLoc dl = SVOp->getDebugLoc(); if (VT != MVT::v8i32 && VT != MVT::v8f32) @@ -3630,7 +3661,7 @@ static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT, assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); - if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && + if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; @@ -3669,7 +3700,7 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT, assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); - if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && + if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; @@ -3700,14 +3731,14 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT, /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. 
vector_shuffle v, undef, /// <0, 0, 1, 1> -static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, - bool HasInt256) { +static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { unsigned NumElts = VT.getVectorNumElements(); + bool Is256BitVec = VT.is256BitVector(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); - if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && + if (Is256BitVec && NumElts != 4 && NumElts != 8 && (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; @@ -3715,7 +3746,7 @@ static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, // FIXME: Need a better way to get rid of this, there's no latency difference // between UNPCKLPD and MOVDDUP, the later should always be checked first and // the former later. We should also remove the "_undef" special mask. - if (NumElts == 4 && VT.getSizeInBits() == 256) + if (NumElts == 4 && Is256BitVec) return false; // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -3749,7 +3780,7 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); - if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && + if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; @@ -3831,7 +3862,7 @@ static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) { /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions. static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getValueType(0).getSimpleVT(); unsigned HalfSize = VT.getVectorNumElements()/2; @@ -3865,7 +3896,7 @@ static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) { unsigned NumElts = VT.getVectorNumElements(); // Only match 256-bit with 32/64-bit types - if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8)) + if (!VT.is256BitVector() || (NumElts != 4 && NumElts != 8)) return false; unsigned NumLanes = VT.getSizeInBits()/128; @@ -3921,8 +3952,8 @@ static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT, unsigned NumElems = VT.getVectorNumElements(); - if ((VT.getSizeInBits() == 128 && NumElems != 4) || - (VT.getSizeInBits() == 256 && NumElems != 8)) + if ((VT.is128BitVector() && NumElems != 4) || + (VT.is256BitVector() && NumElems != 8)) return false; // "i+1" is the value the indexed mask element must have @@ -3944,8 +3975,8 @@ static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT, unsigned NumElems = VT.getVectorNumElements(); - if ((VT.getSizeInBits() == 128 && NumElems != 4) || - (VT.getSizeInBits() == 256 && NumElems != 8)) + if ((VT.is128BitVector() && NumElems != 4) || + (VT.is256BitVector() && NumElems != 8)) return false; // "i" is the value the indexed mask element must have @@ -4005,9 +4036,8 @@ bool X86::isVEXTRACTF128Index(SDNode *N) { uint64_t Index = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); - unsigned VL = N->getValueType(0).getVectorNumElements(); - unsigned VBits = N->getValueType(0).getSizeInBits(); - unsigned ElSize = VBits / VL; + MVT VT = N->getValueType(0).getSimpleVT(); + unsigned ElSize = VT.getVectorElementType().getSizeInBits(); bool Result = (Index * ElSize) % 128 == 0; return Result; @@ -4024,9 +4054,8 @@ bool 
X86::isVINSERTF128Index(SDNode *N) { uint64_t Index = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); - unsigned VL = N->getValueType(0).getVectorNumElements(); - unsigned VBits = N->getValueType(0).getSizeInBits(); - unsigned ElSize = VBits / VL; + MVT VT = N->getValueType(0).getSimpleVT(); + unsigned ElSize = VT.getVectorElementType().getSizeInBits(); bool Result = (Index * ElSize) % 128 == 0; return Result; @@ -4036,7 +4065,7 @@ bool X86::isVINSERTF128Index(SDNode *N) { /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. /// Handles 128-bit and 256-bit. static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { - EVT VT = N->getValueType(0); + MVT VT = N->getValueType(0).getSimpleVT(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for PSHUF/SHUFP"); @@ -4066,7 +4095,7 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { - EVT VT = N->getValueType(0); + MVT VT = N->getValueType(0).getSimpleVT(); assert((VT == MVT::v8i16 || VT == MVT::v16i16) && "Unsupported vector type for PSHUFHW"); @@ -4090,7 +4119,7 @@ static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { - EVT VT = N->getValueType(0); + MVT VT = N->getValueType(0).getSimpleVT(); assert((VT == MVT::v8i16 || VT == MVT::v16i16) && "Unsupported vector type for PSHUFHW"); @@ -4114,7 +4143,7 @@ static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getValueType(0).getSimpleVT(); unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3; unsigned NumElts = VT.getVectorNumElements(); @@ -4145,8 +4174,8 @@ unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { uint64_t Index = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); - EVT VecVT = N->getOperand(0).getValueType(); - EVT ElVT = VecVT.getVectorElementType(); + MVT VecVT = N->getOperand(0).getValueType().getSimpleVT(); + MVT ElVT = VecVT.getVectorElementType(); unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); return Index / NumElemsPerChunk; @@ -4162,8 +4191,8 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { uint64_t Index = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); - EVT VecVT = N->getValueType(0); - EVT ElVT = VecVT.getVectorElementType(); + MVT VecVT = N->getValueType(0).getSimpleVT(); + MVT ElVT = VecVT.getVectorElementType(); unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); return Index / NumElemsPerChunk; @@ -4173,7 +4202,7 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { /// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions. /// Handles 256-bit. 
static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) { - EVT VT = N->getValueType(0); + MVT VT = N->getValueType(0).getSimpleVT(); unsigned NumElts = VT.getVectorNumElements(); @@ -4193,17 +4222,18 @@ static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) { /// isZeroNode - Returns true if Elt is a constant zero or a floating point /// constant +0.0. bool X86::isZeroNode(SDValue Elt) { - return ((isa<ConstantSDNode>(Elt) && - cast<ConstantSDNode>(Elt)->isNullValue()) || - (isa<ConstantFPSDNode>(Elt) && - cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt)) + return CN->isNullValue(); + if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt)) + return CFP->getValueAPF().isPosZero(); + return false; } /// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in /// their permute mask. static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getValueType(0).getSimpleVT(); unsigned NumElems = VT.getVectorNumElements(); SmallVector<int, 8> MaskVec; @@ -4352,12 +4382,11 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) { static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); - unsigned Size = VT.getSizeInBits(); // Always build SSE zero vectors as <4 x i32> bitcasted // to their dest type. This ensures they get CSE'd. SDValue Vec; - if (Size == 128) { // SSE + if (VT.is128BitVector()) { // SSE if (Subtarget->hasSSE2()) { // SSE2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); @@ -4365,7 +4394,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); } - } else if (Size == 256) { // AVX + } else if (VT.is256BitVector()) { // AVX if (Subtarget->hasInt256()) { // AVX2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; @@ -4387,14 +4416,13 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. /// Then bitcast to their original type, ensuring they get CSE'd. 
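getZeroVector and getOnesVector above keep the canonical constant idioms: zeros are always built as a <4 x i32> (or <8 x i32>) pattern that instruction selection turns into a pxor/xorps-style register clear, all-ones vectors get matched to pcmpeqd (as a later comment in this diff notes), and the result is bitcast to the requested type so identical constants CSE. A small intrinsics sketch of the same two idioms (SSE2, not LLVM code):

    // Illustrative SSE2 intrinsics, not LLVM code.
    #include <emmintrin.h>

    __m128i zeroVec() {
      return _mm_setzero_si128();      // typically selects to pxor/xorps reg,reg
    }

    __m128i onesVec() {
      __m128i x = _mm_setzero_si128();
      return _mm_cmpeq_epi32(x, x);    // x == x in every lane, so all bits set
    }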
-static SDValue getOnesVector(EVT VT, bool HasInt256, SelectionDAG &DAG, +static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); - unsigned Size = VT.getSizeInBits(); SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); SDValue Vec; - if (Size == 256) { + if (VT.is256BitVector()) { if (HasInt256) { // AVX2 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); @@ -4402,7 +4430,7 @@ static SDValue getOnesVector(EVT VT, bool HasInt256, SelectionDAG &DAG, Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); } - } else if (Size == 128) { + } else if (VT.is128BitVector()) { Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } else llvm_unreachable("Unexpected vector type"); @@ -4481,14 +4509,13 @@ static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { EVT VT = V.getValueType(); DebugLoc dl = V.getDebugLoc(); - unsigned Size = VT.getSizeInBits(); - if (Size == 128) { + if (VT.is128BitVector()) { V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V); int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32), &SplatMask[0]); - } else if (Size == 256) { + } else if (VT.is256BitVector()) { // To use VPERMILPS to splat scalars, the second half of indicies must // refer to the higher part, which is a duplication of the lower one, // because VPERMILPS can only handle in-lane permutations. @@ -4512,14 +4539,14 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { int EltNo = SV->getSplatIndex(); int NumElems = SrcVT.getVectorNumElements(); - unsigned Size = SrcVT.getSizeInBits(); + bool Is256BitVec = SrcVT.is256BitVector(); - assert(((Size == 128 && NumElems > 4) || Size == 256) && - "Unknown how to promote splat for type"); + assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) && + "Unknown how to promote splat for type"); // Extract the 128-bit part containing the splat element and update // the splat element index when it refers to the higher register. - if (Size == 256) { + if (Is256BitVec) { V1 = Extract128BitVector(V1, EltNo, DAG, dl); if (EltNo >= NumElems/2) EltNo -= NumElems/2; @@ -4536,7 +4563,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { // Recreate the 256-bit vector and place the same 128-bit vector // into the low and high part. 
This is necessary because we want // to use VPERM* to shuffle the vectors - if (Size == 256) { + if (Is256BitVec) { V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1); } @@ -4588,6 +4615,10 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, case X86ISD::MOVLHPS: DecodeMOVLHPSMask(NumElems, Mask); break; + case X86ISD::PALIGNR: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + break; case X86ISD::PSHUFD: case X86ISD::VPERMILP: ImmN = N->getOperand(N->getNumOperands()-1); @@ -4631,7 +4662,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, case X86ISD::MOVLPS: case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: - case X86ISD::PALIGN: // Not yet implemented return false; default: llvm_unreachable("unknown target shuffle node"); @@ -5099,7 +5129,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { if (!Subtarget->hasFp256()) return SDValue(); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); DebugLoc dl = Op.getDebugLoc(); assert((VT.is128BitVector() || VT.is256BitVector()) && @@ -5297,8 +5327,8 @@ SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); - EVT VT = Op.getValueType(); - EVT ExtVT = VT.getVectorElementType(); + MVT VT = Op.getValueType().getSimpleVT(); + MVT ExtVT = VT.getVectorElementType(); unsigned NumElems = Op.getNumOperands(); // Vectors containing all zeros can be matched by pxor and xorps later @@ -5314,7 +5344,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // Vectors containing all ones can be matched by pcmpeqd on 128-bit width // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use // vpcmpeqd on 256-bit vectors. - if (ISD::isBuildVectorAllOnes(Op.getNode())) { + if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256())) return Op; @@ -5629,7 +5659,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // to create 256-bit vectors from two other 128-bit ones. static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); - EVT ResVT = Op.getValueType(); + MVT ResVT = Op.getValueType().getSimpleVT(); assert(ResVT.is256BitVector() && "Value type must be 256-bit wide"); @@ -5655,8 +5685,8 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); - EVT VT = SVOp->getValueType(0); - EVT EltVT = VT.getVectorElementType(); + MVT VT = SVOp->getValueType(0).getSimpleVT(); + MVT EltVT = VT.getVectorElementType(); unsigned NumElems = VT.getVectorNumElements(); if (!Subtarget->hasSSE41() || EltVT == MVT::i8) @@ -5667,41 +5697,40 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, // Check the mask for BLEND and build the value. unsigned MaskValue = 0; // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. - unsigned NumLanes = (NumElems-1)/8 + 1; + unsigned NumLanes = (NumElems-1)/8 + 1; unsigned NumElemsInLane = NumElems / NumLanes; // Blend for v16i16 should be symetric for the both lanes. for (unsigned i = 0; i < NumElemsInLane; ++i) { - int SndLaneEltIdx = (NumLanes == 2) ? + int SndLaneEltIdx = (NumLanes == 2) ? 
SVOp->getMaskElt(i + NumElemsInLane) : -1; int EltIdx = SVOp->getMaskElt(i); - if ((EltIdx == -1 || EltIdx == (int)i) && - (SndLaneEltIdx == -1 || SndLaneEltIdx == (int)(i + NumElemsInLane))) + if ((EltIdx < 0 || EltIdx == (int)i) && + (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane))) continue; - if (((unsigned)EltIdx == (i + NumElems)) && - (SndLaneEltIdx == -1 || + if (((unsigned)EltIdx == (i + NumElems)) && + (SndLaneEltIdx < 0 || (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane)) MaskValue |= (1<<i); - else + else return SDValue(); } // Convert i32 vectors to floating point if it is not AVX2. // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. - EVT BlendVT = VT; + MVT BlendVT = VT; if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) { - BlendVT = EVT::getVectorVT(*DAG.getContext(), - EVT::getFloatingPointVT(EltVT.getSizeInBits()), - NumElems); + BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()), + NumElems); V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1); V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2); } - - SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2, - DAG.getConstant(MaskValue, MVT::i32)); + + SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2, + DAG.getConstant(MaskValue, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, VT, Ret); } @@ -5836,6 +5865,11 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, } } + // Promote splats to a larger type which usually leads to more efficient code. + // FIXME: Is this true if pshufb is available? + if (SVOp->isSplat()) + return PromoteSplat(SVOp, DAG); + // If we have SSSE3, and all words of the result are from 1 input vector, // case 2 is generated, otherwise case 3 is generated. If no SSSE3 // is present, fall back to case 4. @@ -5851,7 +5885,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, int EltIdx = MaskVals[i] * 2; int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx; int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1; - pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8)); } V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); @@ -5969,6 +6003,11 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, DebugLoc dl = SVOp->getDebugLoc(); ArrayRef<int> MaskVals = SVOp->getMask(); + // Promote splats to a larger type which usually leads to more efficient code. + // FIXME: Is this true if pshufb is available? + if (SVOp->isSplat()) + return PromoteSplat(SVOp, DAG); + // If we have SSSE3, case 1 is generated when all result bytes come from // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is // present, fall back to case 3. 
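The SSSE3 paths here drive PSHUFB with a byte mask in which 0x80 marks a lane that should become zero; that is how a two-input word or byte shuffle gets folded into one or two table lookups (plus an OR in the two-input case). A small intrinsics sketch of the mask convention (SSSE3, not LLVM code):

    // Illustrative SSSE3 intrinsics, not LLVM code. Each control byte selects a
    // source byte; a control byte with its high bit set (0x80) produces zero,
    // the same convention used when building pshufbMask above.
    #include <tmmintrin.h>

    __m128i pick_even_bytes(__m128i v) {
      const __m128i ctrl = _mm_setr_epi8(
          0, 2, 4, 6, 8, 10, 12, 14,
          (char)0x80, (char)0x80, (char)0x80, (char)0x80,
          (char)0x80, (char)0x80, (char)0x80, (char)0x80);
      return _mm_shuffle_epi8(v, ctrl);   // low half: even bytes, high half: zero
    }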
@@ -6087,7 +6126,7 @@ static SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getValueType(0).getSimpleVT(); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); @@ -6134,8 +6173,9 @@ SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> static SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, - SelectionDAG &DAG, DebugLoc dl) { + SelectionDAG &DAG) { MVT VT = SVOp->getValueType(0).getSimpleVT(); + DebugLoc dl = SVOp->getDebugLoc(); unsigned NumElems = VT.getVectorNumElements(); MVT NewVT; unsigned Scale; @@ -6171,7 +6211,7 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, /// getVZextMovL - Return a zero-extending vector move low node. /// -static SDValue getVZextMovL(EVT VT, EVT OpVT, +static SDValue getVZextMovL(MVT VT, EVT OpVT, SDValue SrcOp, SelectionDAG &DAG, const X86Subtarget *Subtarget, DebugLoc dl) { if (VT == MVT::v2f64 || VT == MVT::v4f32) { @@ -6213,14 +6253,14 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { if (NewOp.getNode()) return NewOp; - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getValueType(0).getSimpleVT(); unsigned NumElems = VT.getVectorNumElements(); unsigned NumLaneElems = NumElems / 2; DebugLoc dl = SVOp->getDebugLoc(); - MVT EltVT = VT.getVectorElementType().getSimpleVT(); - EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); + MVT EltVT = VT.getVectorElementType(); + MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); SDValue Output[2]; SmallVector<int, 16> Mask; @@ -6325,7 +6365,7 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getValueType(0).getSimpleVT(); assert(VT.is128BitVector() && "Unsupported vector size"); @@ -6579,7 +6619,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { // Reduce a vector shuffle to zext. SDValue -X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { +X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { // PMOVZX is only available from SSE41. if (!Subtarget->hasSSE41()) return SDValue(); @@ -6623,9 +6663,10 @@ X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } + LLVMContext *Context = DAG.getContext(); unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift; - EVT NeVT = EVT::getIntegerVT(*DAG.getContext(), NBits); - EVT NVT = EVT::getVectorVT(*DAG.getContext(), NeVT, NumElems >> Shift); + EVT NeVT = EVT::getIntegerVT(*Context, NBits); + EVT NVT = EVT::getVectorVT(*Context, NeVT, NumElems >> Shift); if (!isTypeLegal(NVT)) return SDValue(); @@ -6644,8 +6685,21 @@ X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { // If it's foldable, i.e. normal load with single use, we will let code // selection to fold it. Otherwise, we will short the conversion sequence. if (CIdx && CIdx->getZExtValue() == 0 && - (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) + (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) { + if (V.getValueSizeInBits() > V1.getValueSizeInBits()) { + // The "ext_vec_elt" node is wider than the result node. + // In this case we should extract subvector from V. 
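LowerVectorIntExtend above recognizes shuffles that merely spread elements out with zero or undef lanes in between and rewrites them as a vector zero-extension, which SSE4.1 can issue directly as a PMOVZX instead of an interleave with a zero register; that is also why the routine bails out without SSE4.1. A rough intrinsics illustration of the equivalence (not LLVM code):

    // Illustrative intrinsics, not LLVM code: two ways to widen the low eight
    // bytes of a vector to 16-bit lanes. The interleave-with-zero shuffle and
    // the dedicated SSE4.1 pmovzxbw produce the same result; the latter avoids
    // materializing the zero vector.
    #include <smmintrin.h>

    __m128i widen_via_shuffle(__m128i v) {
      return _mm_unpacklo_epi8(v, _mm_setzero_si128());   // punpcklbw v, 0
    }

    __m128i widen_via_pmovzx(__m128i v) {
      return _mm_cvtepu8_epi16(v);                        // pmovzxbw
    }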
+ // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)). + unsigned Ratio = V.getValueSizeInBits() / V1.getValueSizeInBits(); + EVT FullVT = V.getValueType(); + EVT SubVecVT = EVT::getVectorVT(*Context, + FullVT.getVectorElementType(), + FullVT.getVectorNumElements()/Ratio); + V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V, + DAG.getIntPtrConstant(0)); + } V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V); + } } return DAG.getNode(ISD::BITCAST, DL, VT, @@ -6655,7 +6709,7 @@ X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); DebugLoc dl = Op.getDebugLoc(); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); @@ -6665,25 +6719,14 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { // Handle splat operations if (SVOp->isSplat()) { - unsigned NumElem = VT.getVectorNumElements(); - int Size = VT.getSizeInBits(); - // Use vbroadcast whenever the splat comes from a foldable load SDValue Broadcast = LowerVectorBroadcast(Op, DAG); if (Broadcast.getNode()) return Broadcast; - - // Handle splats by matching through known shuffle masks - if ((Size == 128 && NumElem <= 4) || - (Size == 256 && NumElem <= 8)) - return SDValue(); - - // All remaning splats are promoted to target supported vector shuffles. - return PromoteSplat(SVOp, DAG); } // Check integer expanding shuffles. - SDValue NewOp = lowerVectorIntExtend(Op, DAG); + SDValue NewOp = LowerVectorIntExtend(Op, DAG); if (NewOp.getNode()) return NewOp; @@ -6691,7 +6734,7 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { // do it! if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 || VT == MVT::v32i8) { - SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); + SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); } else if ((VT == MVT::v4i32 || @@ -6699,18 +6742,18 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { // FIXME: Figure out a cleaner way to do this. // Try to make use of movq to zero out the top part. 
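The "movq to zero out the top part" remark above relies on register-to-register MOVQ writing only the low 64 bits and clearing the upper 64, so a shuffle that keeps the low quadword and zeroes the rest costs a single instruction. Intrinsics illustration (SSE2, not LLVM code):

    // Illustrative SSE2 intrinsics, not LLVM code.
    #include <emmintrin.h>

    __m128i keep_low_qword(__m128i v) {
      return _mm_move_epi64(v);   // movq: low 64 bits kept, upper 64 bits cleared
    }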
if (ISD::isBuildVectorAllZeros(V2.getNode())) { - SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); + SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) { - EVT NewVT = NewOp.getValueType(); + MVT NewVT = NewOp.getValueType().getSimpleVT(); if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT, true, false)) return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget, dl); } } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { - SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); + SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) { - EVT NewVT = NewOp.getValueType(); + MVT NewVT = NewOp.getValueType().getSimpleVT(); if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT)) return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget, dl); @@ -6725,7 +6768,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); DebugLoc dl = Op.getDebugLoc(); unsigned NumElems = VT.getVectorNumElements(); bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; @@ -6816,7 +6859,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (isShift && ShVal.hasOneUse()) { // If the shifted value has multiple uses, it may be cheaper to use // v_set0 + movlhps or movhlps, etc. - EVT EltVT = VT.getVectorElementType(); + MVT EltVT = VT.getVectorElementType(); ShAmt *= EltVT.getSizeInBits(); return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); } @@ -6855,7 +6898,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (isShift) { // No better options. Use a vshldq / vsrldq. - EVT EltVT = VT.getVectorElementType(); + MVT EltVT = VT.getVectorElementType(); ShAmt *= EltVT.getSizeInBits(); return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); } @@ -6926,7 +6969,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // nodes, and remove one by one until they don't return Op anymore. if (isPALIGNRMask(M, VT, Subtarget)) - return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, + return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2, getShufflePALIGNRImmediate(SVOp), DAG); @@ -7035,13 +7078,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue -X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); +static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getValueType().getSimpleVT(); DebugLoc dl = Op.getDebugLoc(); - if (!Op.getOperand(0).getValueType().is128BitVector()) + if (!Op.getOperand(0).getValueType().getSimpleVT().is128BitVector()) return SDValue(); if (VT.getSizeInBits() == 8) { @@ -7106,7 +7147,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, return SDValue(); SDValue Vec = Op.getOperand(0); - EVT VecVT = Vec.getValueType(); + MVT VecVT = Vec.getValueType().getSimpleVT(); // If this is a 256-bit vector result, first extract the 128-bit vector and // then extract the element from the 128-bit vector. @@ -7133,7 +7174,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, return Res; } - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); DebugLoc dl = Op.getDebugLoc(); // TODO: handle v16i8. 
if (VT.getSizeInBits() == 16) { @@ -7146,7 +7187,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, MVT::v4i32, Vec), Op.getOperand(1))); // Transform it so it match pextrw which produces a 32-bit result. - EVT EltVT = MVT::i32; + MVT EltVT = MVT::i32; SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, Op.getOperand(0), Op.getOperand(1)); SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, @@ -7161,7 +7202,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // SHUFPS the element to the lowest double word, then movss. int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 }; - EVT VVT = Op.getOperand(0).getValueType(); + MVT VVT = Op.getOperand(0).getValueType().getSimpleVT(); SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), DAG.getUNDEF(VVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, @@ -7180,7 +7221,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // Note if the lower 64 bits of the result of the UNPCKHPD is then stored // to a f64mem, the whole operation is folded into a single MOVHPDmr. int Mask[2] = { 1, -1 }; - EVT VVT = Op.getOperand(0).getValueType(); + MVT VVT = Op.getOperand(0).getValueType().getSimpleVT(); SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), DAG.getUNDEF(VVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, @@ -7190,11 +7231,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, return SDValue(); } -SDValue -X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - EVT EltVT = VT.getVectorElementType(); +static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getValueType().getSimpleVT(); + MVT EltVT = VT.getVectorElementType(); DebugLoc dl = Op.getDebugLoc(); SDValue N0 = Op.getOperand(0); @@ -7247,8 +7286,8 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - EVT EltVT = VT.getVectorElementType(); + MVT VT = Op.getValueType().getSimpleVT(); + MVT EltVT = VT.getVectorElementType(); DebugLoc dl = Op.getDebugLoc(); SDValue N0 = Op.getOperand(0); @@ -7296,7 +7335,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { LLVMContext *Context = DAG.getContext(); DebugLoc dl = Op.getDebugLoc(); - EVT OpVT = Op.getValueType(); + MVT OpVT = Op.getValueType().getSimpleVT(); // If this is a 256-bit vector result, first insert into a 128-bit // vector and then insert into the 256-bit vector. @@ -7511,8 +7550,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, - int64_t Offset, - SelectionDAG &DAG) const { + int64_t Offset, SelectionDAG &DAG) const { // Create the TargetGlobalAddress node, folding in the constant // offset if it is legal. 
unsigned char OpFlags = @@ -7732,7 +7770,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { case TLSModel::LocalExec: return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, Subtarget->is64Bit(), - getTargetMachine().getRelocationModel() == Reloc::PIC_); + getTargetMachine().getRelocationModel() == Reloc::PIC_); } llvm_unreachable("Unknown TLS model."); } @@ -8015,9 +8053,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SmallVector<Constant*,2> CV1; CV1.push_back( - ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); + ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, + APInt(64, 0x4330000000000000ULL)))); CV1.push_back( - ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); + ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, + APInt(64, 0x4530000000000000ULL)))); Constant *C1 = ConstantVector::get(CV1); SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); @@ -8111,7 +8151,8 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, SVT == MVT::v8i8 || SVT == MVT::v8i16) && "Custom UINT_TO_FP is not supported!"); - EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, SVT.getVectorNumElements()); + EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + SVT.getVectorNumElements()); return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); } @@ -8204,8 +8245,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); } -std::pair<SDValue,SDValue> X86TargetLowering:: -FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const { +std::pair<SDValue,SDValue> +X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, + bool IsSigned, bool IsReplace) const { DebugLoc DL = Op.getDebugLoc(); EVT DstTy = Op.getValueType(); @@ -8299,9 +8341,9 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) co static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget) { - EVT VT = Op->getValueType(0); + MVT VT = Op->getValueType(0).getSimpleVT(); SDValue In = Op->getOperand(0); - EVT InVT = In.getValueType(); + MVT InVT = In.getValueType().getSimpleVT(); DebugLoc dl = Op->getDebugLoc(); // Optimize vectors in AVX mode: @@ -8330,7 +8372,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? 
ZeroVec : Undef); - EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + MVT HVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements()/2); OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); @@ -8352,9 +8394,9 @@ SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op, SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const { DebugLoc DL = Op.getDebugLoc(); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); SDValue In = Op.getOperand(0); - EVT SVT = In.getValueType(); + MVT SVT = In.getValueType().getSimpleVT(); if (Subtarget->hasFp256()) { SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); @@ -8382,11 +8424,11 @@ SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op, return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi); } -SDValue X86TargetLowering::lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { DebugLoc DL = Op.getDebugLoc(); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); SDValue In = Op.getOperand(0); - EVT SVT = In.getValueType(); + MVT SVT = In.getValueType().getSimpleVT(); if ((VT == MVT::v4i32) && (SVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. @@ -8501,9 +8543,10 @@ SDValue X86TargetLowering::lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType().isVector()) { - if (Op.getValueType() == MVT::v8i16) - return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), Op.getValueType(), + MVT VT = Op.getValueType().getSimpleVT(); + if (VT.isVector()) { + if (VT == MVT::v8i16) + return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), VT, DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(), MVT::v8i32, Op.getOperand(0))); return SDValue(); @@ -8542,12 +8585,11 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, return FIST; } -SDValue X86TargetLowering::lowerFP_EXTEND(SDValue Op, - SelectionDAG &DAG) const { +static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { DebugLoc DL = Op.getDebugLoc(); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); SDValue In = Op.getOperand(0); - EVT SVT = In.getValueType(); + MVT SVT = In.getValueType().getSimpleVT(); assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); @@ -8559,8 +8601,8 @@ SDValue X86TargetLowering::lowerFP_EXTEND(SDValue Op, SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); DebugLoc dl = Op.getDebugLoc(); - EVT VT = Op.getValueType(); - EVT EltVT = VT; + MVT VT = Op.getValueType().getSimpleVT(); + MVT EltVT = VT; unsigned NumElts = VT == MVT::f64 ? 
2 : 4; if (VT.isVector()) { EltVT = VT.getVectorElementType(); @@ -8568,9 +8610,11 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { } Constant *C; if (EltVT == MVT::f64) - C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); + C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, + APInt(64, ~(1ULL << 63)))); else - C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); + C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle, + APInt(32, ~(1U << 31)))); C = ConstantVector::getSplat(NumElts, C); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); @@ -8591,8 +8635,8 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); DebugLoc dl = Op.getDebugLoc(); - EVT VT = Op.getValueType(); - EVT EltVT = VT; + MVT VT = Op.getValueType().getSimpleVT(); + MVT EltVT = VT; unsigned NumElts = VT == MVT::f64 ? 2 : 4; if (VT.isVector()) { EltVT = VT.getVectorElementType(); @@ -8600,9 +8644,11 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { } Constant *C; if (EltVT == MVT::f64) - C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); + C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, + APInt(64, 1ULL << 63))); else - C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); + C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle, + APInt(32, 1U << 31))); C = ConstantVector::getSplat(NumElts, C); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); @@ -8626,8 +8672,8 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); DebugLoc dl = Op.getDebugLoc(); - EVT VT = Op.getValueType(); - EVT SrcVT = Op1.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); + MVT SrcVT = Op1.getValueType().getSimpleVT(); // If second operand is smaller, extend it first. if (SrcVT.bitsLT(VT)) { @@ -8646,13 +8692,15 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // First get the sign bit of second operand. 
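LowerFABS, LowerFNEG and LowerFCOPYSIGN above all work by loading a bit-mask constant from the constant pool and applying AND, XOR or OR to the raw floating-point bits; the APFloat changes in these hunks only make the IEEE semantics (IEEEdouble/IEEEsingle) explicit when those masks are built. The underlying bit tricks in plain scalar C++, for reference (not LLVM code):

    // Illustrative scalar C++, not LLVM code: the bit patterns the constant-pool
    // values above encode. FABS clears the sign bit, FNEG flips it, FCOPYSIGN
    // splices one value's sign onto another value's magnitude.
    #include <cstdint>
    #include <cstring>

    static uint64_t toBits(double d) { uint64_t b; std::memcpy(&b, &d, 8); return b; }
    static double fromBits(uint64_t b) { double d; std::memcpy(&d, &b, 8); return d; }

    double fabsBits(double x) { return fromBits(toBits(x) & ~(1ULL << 63)); }
    double fnegBits(double x) { return fromBits(toBits(x) ^ (1ULL << 63)); }
    double copysignBits(double mag, double sgn) {
      return fromBits((toBits(mag) & ~(1ULL << 63)) | (toBits(sgn) & (1ULL << 63)));
    }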
SmallVector<Constant*,4> CV; if (SrcVT == MVT::f64) { - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); + const fltSemantics &Sem = APFloat::IEEEdouble; + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63)))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0)))); } else { - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); + const fltSemantics &Sem = APFloat::IEEEsingle; + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31)))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); } Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); @@ -8675,13 +8723,17 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // Clear first operand sign bit. CV.clear(); if (VT == MVT::f64) { - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); + const fltSemantics &Sem = APFloat::IEEEdouble; + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, + APInt(64, ~(1ULL << 63))))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0)))); } else { - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); + const fltSemantics &Sem = APFloat::IEEEsingle; + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, + APInt(32, ~(1U << 31))))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); } C = ConstantVector::get(CV); CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); @@ -8697,7 +8749,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); DebugLoc dl = Op.getDebugLoc(); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, @@ -8707,7 +8759,8 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { // LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able. 
// -SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, + SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); if (!Subtarget->hasSSE41()) @@ -9139,65 +9192,10 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, return SDValue(); } -SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { - - if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG); - - assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); - SDValue Op0 = Op.getOperand(0); - SDValue Op1 = Op.getOperand(1); - DebugLoc dl = Op.getDebugLoc(); - ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); - - // Optimize to BT if possible. - // Lower (X & (1 << N)) == 0 to BT(X, N). - // Lower ((X >>u N) & 1) != 0 to BT(X, N). - // Lower ((X >>s N) & 1) != 0 to BT(X, N). - if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && - Op1.getOpcode() == ISD::Constant && - cast<ConstantSDNode>(Op1)->isNullValue() && - (CC == ISD::SETEQ || CC == ISD::SETNE)) { - SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); - if (NewSetCC.getNode()) - return NewSetCC; - } - - // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of - // these. - if (Op1.getOpcode() == ISD::Constant && - (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || - cast<ConstantSDNode>(Op1)->isNullValue()) && - (CC == ISD::SETEQ || CC == ISD::SETNE)) { - - // If the input is a setcc, then reuse the input setcc or use a new one with - // the inverted condition. - if (Op0.getOpcode() == X86ISD::SETCC) { - X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); - bool Invert = (CC == ISD::SETNE) ^ - cast<ConstantSDNode>(Op1)->isNullValue(); - if (!Invert) return Op0; - - CCode = X86::GetOppositeBranchCondition(CCode); - return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); - } - } - - bool isFP = Op1.getValueType().isFloatingPoint(); - unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); - if (X86CC == X86::COND_INVALID) - return SDValue(); - - SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); - EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); - return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86CC, MVT::i8), EFLAGS); -} - // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128 // ones, and then concatenate the result back. 
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && "Unsupported value type for operation"); @@ -9217,26 +9215,27 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); // Issue the operation on the smaller types and concatenate the result back - MVT EltVT = VT.getVectorElementType().getSimpleVT(); - EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + MVT EltVT = VT.getVectorElementType(); + MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); } -SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDValue Cond; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); - bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); + bool isFP = Op.getOperand(1).getValueType().getSimpleVT().isFloatingPoint(); DebugLoc dl = Op.getDebugLoc(); if (isFP) { #ifndef NDEBUG - EVT EltVT = Op0.getValueType().getVectorElementType(); + MVT EltVT = Op0.getValueType().getVectorElementType().getSimpleVT(); assert(EltVT == MVT::f32 || EltVT == MVT::f64); #endif @@ -9377,6 +9376,63 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { return Result; } +SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { + + MVT VT = Op.getValueType().getSimpleVT(); + + if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); + + assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + DebugLoc dl = Op.getDebugLoc(); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + + // Optimize to BT if possible. + // Lower (X & (1 << N)) == 0 to BT(X, N). + // Lower ((X >>u N) & 1) != 0 to BT(X, N). + // Lower ((X >>s N) & 1) != 0 to BT(X, N). + if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && + Op1.getOpcode() == ISD::Constant && + cast<ConstantSDNode>(Op1)->isNullValue() && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); + if (NewSetCC.getNode()) + return NewSetCC; + } + + // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of + // these. + if (Op1.getOpcode() == ISD::Constant && + (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || + cast<ConstantSDNode>(Op1)->isNullValue()) && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + + // If the input is a setcc, then reuse the input setcc or use a new one with + // the inverted condition. 
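Lower256IntVSETCC runs the compare on each 128-bit half and concatenates the two results. The same shape, expressed with intrinsics instead of DAG nodes (an illustrative sketch for an SSE2 target, not code from the patch):

  #include <cstdint>
  #include <cstdio>
  #include <emmintrin.h>

  int main() {
    alignas(16) int32_t A[8] = {1, 5, -3, 7, 0, 9, 2, -8};
    alignas(16) int32_t B[8] = {2, 4, -3, 6, 1, 9, 1, -9};
    alignas(16) int32_t Res[8];

    // Compare the low and high 128-bit halves independently (PCMPGTD)...
    __m128i Lo = _mm_cmpgt_epi32(_mm_load_si128((const __m128i *)&A[0]),
                                 _mm_load_si128((const __m128i *)&B[0]));
    __m128i Hi = _mm_cmpgt_epi32(_mm_load_si128((const __m128i *)&A[4]),
                                 _mm_load_si128((const __m128i *)&B[4]));
    // ...and the pair of halves is the 256-bit result (CONCAT_VECTORS).
    _mm_store_si128((__m128i *)&Res[0], Lo);
    _mm_store_si128((__m128i *)&Res[4], Hi);

    for (int I = 0; I != 8; ++I)
      std::printf("%d > %d -> %s\n", A[I], B[I], Res[I] ? "all-ones" : "zero");
    return 0;
  }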
+ if (Op0.getOpcode() == X86ISD::SETCC) { + X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); + bool Invert = (CC == ISD::SETNE) ^ + cast<ConstantSDNode>(Op1)->isNullValue(); + if (!Invert) return Op0; + + CCode = X86::GetOppositeBranchCondition(CCode); + return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); + } + } + + bool isFP = Op1.getValueType().getSimpleVT().isFloatingPoint(); + unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); + if (X86CC == X86::COND_INVALID) + return SDValue(); + + SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); + EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); + return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86CC, MVT::i8), EFLAGS); +} + // isX86LogicalCmp - Return true if opcode is a X86 logical comparison. static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getNode()->getOpcode(); @@ -9499,7 +9555,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = Cond.getOperand(1); unsigned Opc = Cmp.getOpcode(); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && @@ -9610,9 +9666,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op->getValueType(0); + MVT VT = Op->getValueType(0).getSimpleVT(); SDValue In = Op->getOperand(0); - EVT InVT = In.getValueType(); + MVT InVT = In.getValueType().getSimpleVT(); DebugLoc dl = Op->getDebugLoc(); if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && @@ -9646,7 +9702,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op, SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]); - EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), + MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), VT.getVectorNumElements()/2); OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); @@ -10155,7 +10211,7 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } -// getTargetVShiftNOde - Handle vector element shifts where the shift amount +// getTargetVShiftNode - Handle vector element shifts where the shift amount // may or may not be a constant. Takes immediate version of shift as input. static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, SDValue SrcOp, SDValue ShAmt, @@ -11377,13 +11433,55 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); } +SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + EVT EltTy = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + SDValue N0 = Op.getOperand(0); + DebugLoc dl = Op.getDebugLoc(); + + // Lower sdiv X, pow2-const. + BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1)); + if (!C) + return SDValue(); + + APInt SplatValue, SplatUndef; + unsigned MinSplatBits; + bool HasAnyUndefs; + if (!C->isConstantSplat(SplatValue, SplatUndef, MinSplatBits, HasAnyUndefs)) + return SDValue(); + + if ((SplatValue != 0) && + (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) { + unsigned lg2 = SplatValue.countTrailingZeros(); + // Splat the sign bit. 
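The sequence this LowerSDIV code emits for a splat divisor of +/-2^k continues just below: splat the sign, add a 2^k - 1 bias for negative inputs, shift right arithmetically, then subtract from zero if the divisor was negative. A scalar sketch that can be checked against ordinary truncating division (illustrative only; assumes arithmetic right shifts on signed values):

  #include <cassert>
  #include <cstdint>

  static int32_t sdivPow2(int32_t N0, unsigned Lg2, bool NegativeDivisor) {
    int32_t  Sgn  = N0 >> 31;                     // splat the sign bit (VSRAI)
    uint32_t Bias = (uint32_t)Sgn >> (32 - Lg2);  // (N0 < 0) ? 2^Lg2 - 1 : 0 (VSRLI)
    int32_t  Sra  = (N0 + (int32_t)Bias) >> Lg2;  // ADD, then VSRAI by Lg2
    return NegativeDivisor ? -Sra : Sra;          // SUB from zero when needed
  }

  int main() {
    for (int32_t X : {37, -37, 0, 1, -1, 1000000, -999999})
      for (unsigned Lg2 = 1; Lg2 <= 8; ++Lg2) {
        int32_t D = 1 << Lg2;
        assert(sdivPow2(X, Lg2, false) == X / D);
        assert(sdivPow2(X, Lg2, true)  == X / -D);
      }
    return 0;
  }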
+ SDValue Sz = DAG.getConstant(EltTy.getSizeInBits()-1, MVT::i32); + SDValue SGN = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, N0, Sz, DAG); + // Add (N0 < 0) ? abs2 - 1 : 0; + SDValue Amt = DAG.getConstant(EltTy.getSizeInBits() - lg2, MVT::i32); + SDValue SRL = getTargetVShiftNode(X86ISD::VSRLI, dl, VT, SGN, Amt, DAG); + SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL); + SDValue Lg2Amt = DAG.getConstant(lg2, MVT::i32); + SDValue SRA = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, ADD, Lg2Amt, DAG); + + // If we're dividing by a positive value, we're done. Otherwise, we must + // negate the result. + if (SplatValue.isNonNegative()) + return SRA; + + SmallVector<SDValue, 16> V(NumElts, DAG.getConstant(0, EltTy)); + SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts); + return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA); + } + return SDValue(); +} + SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); - LLVMContext *Context = DAG.getContext(); if (!Subtarget->hasSSE2()) return SDValue(); @@ -11500,17 +11598,9 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { // Lower SHL with variable shift amount. if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { - Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1), - DAG.getConstant(23, MVT::i32)); - - const uint32_t CV[] = { 0x3f800000U, 0x3f800000U, 0x3f800000U, 0x3f800000U}; - Constant *C = ConstantDataVector::get(*Context, CV); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); - SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); + Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT)); - Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); + Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT)); Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); return DAG.getNode(ISD::MUL, dl, VT, Op, R); @@ -11519,8 +11609,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); // a = a << 5; - Op = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, Op.getOperand(1), - DAG.getConstant(5, MVT::i32)); + Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT)); Op = DAG.getNode(ISD::BITCAST, dl, VT, Op); // Turn 'a' into a mask suitable for VSELECT @@ -11952,6 +12041,43 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { Op.getOperand(1), Op.getOperand(2)); } +SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit()); + + // For MacOSX, we want to call an alternative entry point: __sincos_stret, + // which returns the values in two XMM registers. + DebugLoc dl = Op.getDebugLoc(); + SDValue Arg = Op.getOperand(0); + EVT ArgVT = Arg.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + + ArgListTy Args; + ArgListEntry Entry; + + Entry.Node = Arg; + Entry.Ty = ArgTy; + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + + // Only optimize x86_64 for now. i386 is a bit messy. For f32, + // the small struct {f32, f32} is returned in (eax, edx). For f64, + // the results are returned via SRet in memory. + const char *LibcallName = (ArgVT == MVT::f64) + ? 
"__sincos_stret" : "__sincosf_stret"; + SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); + + StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL); + TargetLowering:: + CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, + false, false, false, false, 0, + CallingConv::C, /*isTaillCall=*/false, + /*doesNotRet=*/false, /*isReturnValueUsed*/true, + Callee, Args, DAG, dl); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); + return CallResult.first; +} + /// LowerOperation - Provide custom lowering hooks for some operations. /// SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { @@ -11981,13 +12107,13 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); - case ISD::TRUNCATE: return lowerTRUNCATE(Op, DAG); + case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG); case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG); case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); - case ISD::FP_EXTEND: return lowerFP_EXTEND(Op, DAG); + case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::FABS: return LowerFABS(Op, DAG); case ISD::FNEG: return LowerFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); @@ -12033,6 +12159,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ADD: return LowerADD(Op, DAG); case ISD::SUB: return LowerSUB(Op, DAG); + case ISD::SDIV: return LowerSDIV(Op, DAG); + case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); } } @@ -12372,7 +12500,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::PTEST: return "X86ISD::PTEST"; case X86ISD::TESTP: return "X86ISD::TESTP"; - case X86ISD::PALIGN: return "X86ISD::PALIGN"; + case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; @@ -12783,7 +12911,7 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, MachineFunction::iterator I = MBB; ++I; - assert(MI->getNumOperands() <= X86::AddrNumOperands + 2 && + assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 && "Unexpected number of operands"); assert(MI->hasOneMemOperand() && @@ -13015,7 +13143,7 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, MachineFunction::iterator I = MBB; ++I; - assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 && + assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 && "Unexpected number of operands"); assert(MI->hasOneMemOperand() && @@ -15246,13 +15374,9 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) { APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue(); - if (CondRHS.getConstantOperandVal(0) == -A-1) { - SmallVector<SDValue, 32> V(VT.getVectorNumElements(), - DAG.getConstant(-A, VT.getScalarType())); + if (CondRHS.getConstantOperandVal(0) == -A-1) return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, - 
DAG.getNode(ISD::BUILD_VECTOR, DL, VT, - V.data(), V.size())); - } + DAG.getConstant(-A, VT)); } // Another special case: If C was a sign bit, the sub has been @@ -15552,7 +15676,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, ConstantSDNode *CmpAgainst = 0; if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) && - dyn_cast<ConstantSDNode>(Cond.getOperand(0)) == 0) { + !isa<ConstantSDNode>(Cond.getOperand(0))) { if (CC == X86::COND_NE && CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) { @@ -15832,8 +15956,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, if (VT == MVT::f32 || VT == MVT::f64) { bool ExpectingFlags = false; // Check for any users that want flags: - for (SDNode::use_iterator UI = N->use_begin(), - UE = N->use_end(); + for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); !ExpectingFlags && UI != UE; ++UI) switch (UI->getOpcode()) { default: @@ -15920,7 +16043,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); - if (VT.getSizeInBits() != 256) + if (!VT.is256BitVector()) return SDValue(); assert((N->getOpcode() == ISD::ANY_EXTEND || @@ -15929,7 +16052,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, SDValue Narrow = N->getOperand(0); EVT NarrowVT = Narrow->getValueType(0); - if (NarrowVT.getSizeInBits() != 128) + if (!NarrowVT.is128BitVector()) return SDValue(); if (Narrow->getOpcode() != ISD::XOR && @@ -16125,11 +16248,6 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, DebugLoc DL = N->getDebugLoc(); - // We are going to replace the AND, OR, NAND with either BLEND - // or PSIGN, which only look at the MSB. The VSRAI instruction - // does not affect the highest bit, so we can get rid of it. - Mask = Mask.getOperand(0); - // Now we know we at least have a plendvb with the mask val. See if // we can form a psignb/w/d. // psign = x.type == y.type == mask.type && y = sub(0, x); @@ -16138,7 +16256,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && "Unsupported VT for PSIGN"); - Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask); + Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0)); return DAG.getNode(ISD::BITCAST, DL, VT, Mask); } // PBLENDVB only available on SSE 4.1 @@ -16296,8 +16414,42 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, EVT MemVT = Ld->getMemoryVT(); DebugLoc dl = Ld->getDebugLoc(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned RegSz = RegVT.getSizeInBits(); ISD::LoadExtType Ext = Ld->getExtensionType(); + unsigned Alignment = Ld->getAlignment(); + bool IsAligned = Alignment == 0 || Alignment == MemVT.getSizeInBits()/8; + + // On Sandybridge unaligned 256bit loads are inefficient. 
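The combine added below splits an unaligned 256-bit load on AVX-only (pre-AVX2) targets into two 128-bit halves, and the store combine a little further down is its mirror image. The resulting access pattern, shown as an intrinsics sketch rather than DAG nodes (illustrative only; assumes an AVX-capable compiler, e.g. -mavx):

  #include <cstring>
  #include <immintrin.h>

  // Load 32 unaligned bytes as two 16-byte halves and reassemble a YMM value.
  static __m256i loadSplit(const void *P) {
    const char *C = static_cast<const char *>(P);
    __m128i Lo = _mm_loadu_si128(reinterpret_cast<const __m128i *>(C));
    __m128i Hi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(C + 16));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(Lo), Hi, 1);
  }

  // Store a YMM value as two 16-byte halves.
  static void storeSplit(void *P, __m256i V) {
    char *C = static_cast<char *>(P);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(C), _mm256_castsi256_si128(V));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(C + 16),
                     _mm256_extractf128_si256(V, 1));
  }

  int main() {
    char Src[33], Dst[33] = {};
    for (int I = 0; I != 33; ++I) Src[I] = char(I);
    // Offset 1 forces a genuinely unaligned 32-byte access.
    storeSplit(Dst + 1, loadSplit(Src + 1));
    return std::memcmp(Src + 1, Dst + 1, 32) != 0;
  }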
+ if (RegVT.is256BitVector() && !Subtarget->hasInt256() && + !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { + unsigned NumElems = RegVT.getVectorNumElements(); + if (NumElems < 2) + return SDValue(); + + SDValue Ptr = Ld->getBasePtr(); + SDValue Increment = DAG.getConstant(16, TLI.getPointerTy()); + + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), + NumElems/2); + SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, + Ld->getPointerInfo(), Ld->isVolatile(), + Ld->isNonTemporal(), Ld->isInvariant(), + Alignment); + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, + Ld->getPointerInfo(), Ld->isVolatile(), + Ld->isNonTemporal(), Ld->isInvariant(), + std::max(Alignment/2U, 1U)); + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + Load1.getValue(1), + Load2.getValue(1)); + + SDValue NewVec = DAG.getUNDEF(RegVT); + NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl); + NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl); + return DCI.CombineTo(N, NewVec, TF, true); + } // If this is a vector EXT Load then attempt to optimize it using a // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the @@ -16312,7 +16464,6 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, assert(MemVT.isVector() && "Must load a vector from memory"); unsigned NumElems = RegVT.getVectorNumElements(); - unsigned RegSz = RegVT.getSizeInBits(); unsigned MemSz = MemVT.getSizeInBits(); assert(RegSz > MemSz && "Register size must be greater than the mem size"); @@ -16356,8 +16507,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, // Represent the data using the same element type that is stored in // memory. In practice, we ''widen'' MemVT. - EVT WideVecVT = - EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), + EVT WideVecVT = + EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), loadRegZize/MemVT.getScalarType().getSizeInBits()); assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && @@ -16426,10 +16577,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, // Build the arithmetic shift. unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - MemVT.getVectorElementType().getSizeInBits(); - SmallVector<SDValue, 8> C(NumElems, - DAG.getConstant(Amt, RegVT.getScalarType())); - SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, RegVT, &C[0], C.size()); - Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, BV); + Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, + DAG.getConstant(Amt, RegVT)); return DCI.CombineTo(N, Shuff, TF, true); } @@ -16462,16 +16611,21 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, DebugLoc dl = St->getDebugLoc(); SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned Alignment = St->getAlignment(); + bool IsAligned = Alignment == 0 || Alignment == VT.getSizeInBits()/8; // If we are saving a concatenation of two XMM registers, perform two stores. // On Sandy Bridge, 256-bit memory operations are executed by two // 128-bit ports. However, on Haswell it is better to issue a single 256-bit // memory operation. 
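Returning briefly to the LowerFSINCOS hunk earlier in this diff: the value of the stret-style entry point is that both results come back in registers instead of through a pointer or sret slot (the code above notes they arrive in two XMM registers on x86-64). A minimal stand-in with the same shape (illustrative only; sincosStret is a local helper here, not the libsystem __sincos_stret symbol):

  #include <cmath>
  #include <cstdio>

  struct SinCos { double Sin, Cos; };

  // A small struct of two doubles comes back in a register pair on x86-64,
  // so the caller gets both values without a round trip through memory.
  static SinCos sincosStret(double X) { return {std::sin(X), std::cos(X)}; }

  int main() {
    SinCos R = sincosStret(1.0);
    std::printf("sin=%f cos=%f\n", R.Sin, R.Cos);
    return 0;
  }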
if (VT.is256BitVector() && !Subtarget->hasInt256() && - StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS && - StoredVal.getNumOperands() == 2) { - SDValue Value0 = StoredVal.getOperand(0); - SDValue Value1 = StoredVal.getOperand(1); + StVT == VT && !IsAligned) { + unsigned NumElems = VT.getVectorNumElements(); + if (NumElems < 2) + return SDValue(); + + SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl); + SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl); SDValue Stride = DAG.getConstant(16, TLI.getPointerTy()); SDValue Ptr0 = St->getBasePtr(); @@ -16479,10 +16633,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), St->getAlignment()); + St->isNonTemporal(), Alignment); SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), St->getAlignment()); + St->isNonTemporal(), + std::max(Alignment/2U, 1U)); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); } @@ -16917,6 +17072,41 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (!VT.isVector()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT ExtraVT = cast<VTSDNode>(N1)->getVT(); + DebugLoc dl = N->getDebugLoc(); + + // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the + // both SSE and AVX2 since there is no sign-extended shift right + // operation on a vector with 64-bit elements. + //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) -> + // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT))) + if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND || + N0.getOpcode() == ISD::SIGN_EXTEND)) { + SDValue N00 = N0.getOperand(0); + + // EXTLOAD has a better solution on AVX2, + // it may be replaced with X86ISD::VSEXT node. + if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256()) + if (!ISD::isNormalLoad(N00.getNode())) + return SDValue(); + + if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { + SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, + N00, N1); + return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); + } + } + return SDValue(); +} + static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { @@ -17002,7 +17192,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, } } - if (VT.isVector() && VT.getSizeInBits() == 256) { + if (VT.is256BitVector()) { SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); if (R.getNode()) return R; @@ -17037,8 +17227,8 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// Helper function of PerformSETCCCombine. It is to materialize "setb reg" -// as "sbb reg,reg", since it can be extended without zext and produces +// Helper function of PerformSETCCCombine. It is to materialize "setb reg" +// as "sbb reg,reg", since it can be extended without zext and produces // an all-ones bit which is more useful than 0/1 in some cases. 
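The comment just above is the heart of MaterializeSETB, which follows: producing an unsigned less-than as sbb reg,reg gives an all-ones value rather than 0/1, and a full-width mask can feed AND/OR style selects with no extra extension. A scalar sketch of the idea (illustrative only; relies on ordinary unsigned wraparound):

  #include <cassert>
  #include <cstdint>

  // 0xFFFFFFFF when X < Y (unsigned), 0 otherwise: the SETB-as-SBB result.
  static uint32_t belowMask(uint32_t X, uint32_t Y) {
    return 0u - static_cast<uint32_t>(X < Y);
  }

  int main() {
    uint32_t M1 = belowMask(7, 42);   // 7 < 42  -> all ones
    uint32_t M2 = belowMask(42, 7);   // 42 < 7  -> zero
    assert(M1 == 0xFFFFFFFFu && M2 == 0u);
    // Branchless min(x, y) built directly from the mask.
    assert(((7u & M1) | (42u & ~M1)) == 7u);
    assert(((42u & M2) | (7u & ~M2)) == 7u);
    return 0;
  }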
static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) { return DAG.getNode(ISD::AND, DL, MVT::i8, @@ -17056,13 +17246,13 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, SDValue EFLAGS = N->getOperand(1); if (CC == X86::COND_A) { - // Try to convert COND_A into COND_B in an attempt to facilitate + // Try to convert COND_A into COND_B in an attempt to facilitate // materializing "setb reg". // // Do not flip "e > c", where "c" is a constant, because Cmp instruction // cannot take an immediate as its first operand. // - if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && EFLAGS.getValueType().isInteger() && !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { SDValue NewSub = DAG.getNode(X86ISD::SUB, EFLAGS.getDebugLoc(), @@ -17270,7 +17460,8 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, if (In.getOpcode() != X86ISD::VZEXT) return SDValue(); - return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0), In.getOperand(0)); + return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0), + In.getOperand(0)); } SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, @@ -17308,13 +17499,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); + case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget); case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget); case ISD::SETCC: return PerformISDSETCCCombine(N, DAG); case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget); case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget); case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles - case X86ISD::PALIGN: + case X86ISD::PALIGNR: case X86ISD::UNPCKH: case X86ISD::UNPCKL: case X86ISD::MOVHLPS: @@ -17497,7 +17689,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { AsmPieces.clear(); const std::string &ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); - std::sort(AsmPieces.begin(), AsmPieces.end()); + array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (AsmPieces.size() == 4 && AsmPieces[0] == "~{cc}" && AsmPieces[1] == "~{dirflag}" && @@ -17515,7 +17707,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { AsmPieces.clear(); const std::string &ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); - std::sort(AsmPieces.begin(), AsmPieces.end()); + array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (AsmPieces.size() == 4 && AsmPieces[0] == "~{cc}" && AsmPieces[1] == "~{dirflag}" && @@ -17995,7 +18187,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // really want an 8-bit or 32-bit register, map to the appropriate register // class and return the appropriate register. 
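The constraint-string splitting and sorting above is how ExpandInlineAsm recognizes trivial single-instruction asm blocks, a hand-written byte swap being the classic case, so they can be replaced by the backend's own lowering when the clobber list matches. A runnable example of that kind of asm (GNU syntax, x86 only, illustrative; whether a given toolchain actually folds it is an optimization detail):

  #include <cassert>
  #include <cstdint>

  static uint32_t bswapAsm(uint32_t X) {
    // The "cc" clobber plus the compiler's default x86 clobbers yields the
    // sorted ~{cc},~{dirflag},~{flags},~{fpsr} list checked above.
    asm("bswap %0" : "+r"(X) : : "cc");
    return X;
  }

  int main() {
    assert(bswapAsm(0x11223344u) == 0x44332211u);
    return 0;
  }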
   if (Res.second == &X86::GR16RegClass) {
-    if (VT == MVT::i8) {
+    if (VT == MVT::i8 || VT == MVT::i1) {
       unsigned DestReg = 0;
       switch (Res.first) {
       default: break;
@@ -18008,7 +18200,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
         Res.first = DestReg;
         Res.second = &X86::GR8RegClass;
       }
-    } else if (VT == MVT::i32) {
+    } else if (VT == MVT::i32 || VT == MVT::f32) {
       unsigned DestReg = 0;
       switch (Res.first) {
       default: break;
@@ -18025,7 +18217,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
         Res.first = DestReg;
         Res.second = &X86::GR32RegClass;
       }
-    } else if (VT == MVT::i64) {
+    } else if (VT == MVT::i64 || VT == MVT::f64) {
       unsigned DestReg = 0;
       switch (Res.first) {
       default: break;
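The width fix-up shown here is what lets an inline-asm operand tied to a 16-bit register name serve 8-, 32-, and 64-bit (and now i1, f32, f64) values. A related effect is visible from user code: with an "a" constraint, the operand's type decides whether AL, EAX, or RAX is actually used (GNU inline asm, x86 only, illustrative):

  #include <cassert>
  #include <cstdint>

  int main() {
    uint8_t  B = 0x5A;
    uint32_t W = 0x12345678u;
    asm("incb %0" : "+a"(B));   // 8-bit operand in the A register: AL
    asm("incl %0" : "+a"(W));   // 32-bit operand in the A register: EAX
    assert(B == 0x5B && W == 0x12345679u);
    return 0;
  }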