Diffstat (limited to 'lib/Transforms/Vectorize')
-rw-r--r--  lib/Transforms/Vectorize/BBVectorize.cpp    |  94
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp  | 443
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp  | 257
3 files changed, 413 insertions, 381 deletions
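For context before the hunks: the BBVectorize half of this change adds a new intrinsic category (bswap/ctpop/ctlz/cttz) behind a VectorizeConfig knob, alongside the extra floating-point intrinsics folded into VectorizeMath. Below is a minimal sketch of driving the pass through that interface; it assumes the vectorizeBasicBlock entry point declared in llvm/Transforms/Vectorize.h at the time of this patch, with the caller's Pass supplying the analyses BBVectorize queries (DominatorTree, ScalarEvolution, TargetTransformInfo):

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/Pass.h"
    #include "llvm/Transforms/Vectorize.h"

    using namespace llvm;

    // Sketch only (not part of the patch): run BBVectorize on one block with
    // the bit-manipulation category introduced by this change switched on.
    static bool vectorizeWithBitManip(Pass *P, BasicBlock &BB) {
      VectorizeConfig C;                    // defaults mirror the cl::opt flags
      C.VectorizeMath = true;               // round/copysign/ceil/... now covered
      C.VectorizeBitManipulations = true;   // bswap/ctpop/ctlz/cttz
      return vectorizeBasicBlock(P, BB, C); // true if BB was changed
    }

Setting the fields explicitly as above overrides the command-line defaults, since VectorizeConfig's constructor (see the end of the BBVectorize.cpp hunks) initializes each flag from its corresponding -bb-vectorize-no-* option.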
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index 71350e7b39..28ec83bf86 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -15,7 +15,6 @@ //===----------------------------------------------------------------------===// #define BBV_NAME "bb-vectorize" -#define DEBUG_TYPE BBV_NAME #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -50,6 +49,8 @@ #include <algorithm> using namespace llvm; +#define DEBUG_TYPE BBV_NAME + static cl::opt<bool> IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false), cl::Hidden, cl::desc("Ignore target information")); @@ -122,6 +123,10 @@ NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize floating-point math intrinsics")); static cl::opt<bool> + NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize BitManipulation intrinsics")); + +static cl::opt<bool> NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); @@ -202,8 +207,8 @@ namespace { DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); SE = &P->getAnalysis<ScalarEvolution>(); DataLayoutPass *DLP = P->getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : 0; - TTI = IgnoreTargetInfo ? 0 : &P->getAnalysis<TargetTransformInfo>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; + TTI = IgnoreTargetInfo ? nullptr : &P->getAnalysis<TargetTransformInfo>(); } typedef std::pair<Value *, Value *> ValuePair; @@ -279,7 +284,7 @@ namespace { bool trackUsesOfI(DenseSet<Value *> &Users, AliasSetTracker &WriteSet, Instruction *I, Instruction *J, bool UpdateUsers = true, - DenseSet<ValuePair> *LoadMoveSetPairs = 0); + DenseSet<ValuePair> *LoadMoveSetPairs = nullptr); void computePairsConnectedTo( DenseMap<Value *, std::vector<Value *> > &CandidatePairs, @@ -292,8 +297,8 @@ namespace { bool pairsConflict(ValuePair P, ValuePair Q, DenseSet<ValuePair> &PairableInstUsers, DenseMap<ValuePair, std::vector<ValuePair> > - *PairableInstUserMap = 0, - DenseSet<VPPair> *PairableInstUserPairSet = 0); + *PairableInstUserMap = nullptr, + DenseSet<VPPair> *PairableInstUserPairSet = nullptr); bool pairWillFormCycle(ValuePair P, DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers, @@ -438,8 +443,8 @@ namespace { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); SE = &getAnalysis<ScalarEvolution>(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : 0; - TTI = IgnoreTargetInfo ? 0 : &getAnalysis<TargetTransformInfo>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; + TTI = IgnoreTargetInfo ? 
nullptr : &getAnalysis<TargetTransformInfo>(); return vectorizeBB(BB); } @@ -674,7 +679,20 @@ namespace { case Intrinsic::exp: case Intrinsic::exp2: case Intrinsic::pow: + case Intrinsic::round: + case Intrinsic::copysign: + case Intrinsic::ceil: + case Intrinsic::nearbyint: + case Intrinsic::rint: + case Intrinsic::trunc: + case Intrinsic::floor: + case Intrinsic::fabs: return Config.VectorizeMath; + case Intrinsic::bswap: + case Intrinsic::ctpop: + case Intrinsic::ctlz: + case Intrinsic::cttz: + return Config.VectorizeBitManipulations; case Intrinsic::fma: case Intrinsic::fmuladd: return Config.VectorizeFMA; @@ -878,7 +896,7 @@ namespace { } // We can't vectorize memory operations without target data - if (DL == 0 && IsSimpleLoadStore) + if (!DL && IsSimpleLoadStore) return false; Type *T1, *T2; @@ -915,7 +933,7 @@ namespace { if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy()) return false; - if ((!Config.VectorizePointers || DL == 0) && + if ((!Config.VectorizePointers || !DL) && (T1->getScalarType()->isPointerTy() || T2->getScalarType()->isPointerTy())) return false; @@ -1049,7 +1067,7 @@ namespace { (isa<ConstantVector>(JOp) || isa<ConstantDataVector>(JOp))) { Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; Constant *SplatValue = cast<Constant>(IOp)->getSplatValue(); - if (SplatValue != NULL && + if (SplatValue != nullptr && SplatValue == cast<Constant>(JOp)->getSplatValue()) Op2VK = TargetTransformInfo::OK_UniformConstantValue; } @@ -1079,13 +1097,14 @@ namespace { CostSavings = ICost + JCost - VCost; } - // The powi intrinsic is special because only the first argument is - // vectorized, the second arguments must be equal. + // The powi,ctlz,cttz intrinsics are special because only the first + // argument is vectorized, the second arguments must be equal. CallInst *CI = dyn_cast<CallInst>(I); Function *FI; if (CI && (FI = CI->getCalledFunction())) { Intrinsic::ID IID = (Intrinsic::ID) FI->getIntrinsicID(); - if (IID == Intrinsic::powi) { + if (IID == Intrinsic::powi || IID == Intrinsic::ctlz || + IID == Intrinsic::cttz) { Value *A1I = CI->getArgOperand(1), *A1J = cast<CallInst>(J)->getArgOperand(1); const SCEV *A1ISCEV = SE->getSCEV(A1I), @@ -1109,7 +1128,8 @@ namespace { assert(CI->getNumArgOperands() == CJ->getNumArgOperands() && "Intrinsic argument counts differ"); for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { - if (IID == Intrinsic::powi && i == 1) + if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz || + IID == Intrinsic::cttz) && i == 1) Tys.push_back(CI->getArgOperand(i)->getType()); else Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(), @@ -1665,8 +1685,9 @@ namespace { C2->first.second == C->first.first || C2->first.second == C->first.second || pairsConflict(C2->first, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : 0, - UseCycleCheck ? &PairableInstUserPairSet : 0)) { + UseCycleCheck ? &PairableInstUserMap : nullptr, + UseCycleCheck ? &PairableInstUserPairSet + : nullptr)) { if (C2->second >= C->second) { CanAdd = false; break; @@ -1686,8 +1707,9 @@ namespace { T->second == C->first.first || T->second == C->first.second || pairsConflict(*T, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : 0, - UseCycleCheck ? &PairableInstUserPairSet : 0)) { + UseCycleCheck ? &PairableInstUserMap : nullptr, + UseCycleCheck ? 
&PairableInstUserPairSet + : nullptr)) { CanAdd = false; break; } @@ -1704,8 +1726,9 @@ namespace { C2->first.second == C->first.first || C2->first.second == C->first.second || pairsConflict(C2->first, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : 0, - UseCycleCheck ? &PairableInstUserPairSet : 0)) { + UseCycleCheck ? &PairableInstUserMap : nullptr, + UseCycleCheck ? &PairableInstUserPairSet + : nullptr)) { CanAdd = false; break; } @@ -1720,8 +1743,9 @@ namespace { ChosenPairs.begin(), E2 = ChosenPairs.end(); C2 != E2; ++C2) { if (pairsConflict(*C2, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : 0, - UseCycleCheck ? &PairableInstUserPairSet : 0)) { + UseCycleCheck ? &PairableInstUserMap : nullptr, + UseCycleCheck ? &PairableInstUserPairSet + : nullptr)) { CanAdd = false; break; } @@ -1802,8 +1826,8 @@ namespace { for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(), E = ChosenPairs.end(); C != E; ++C) { if (pairsConflict(*C, IJ, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : 0, - UseCycleCheck ? &PairableInstUserPairSet : 0)) { + UseCycleCheck ? &PairableInstUserMap : nullptr, + UseCycleCheck ? &PairableInstUserPairSet : nullptr)) { DoesConflict = true; break; } @@ -2373,7 +2397,7 @@ namespace { } while ((LIENext = dyn_cast<InsertElementInst>(LIENext->getOperand(0)))); - LIENext = 0; + LIENext = nullptr; Value *LIEPrev = UndefValue::get(ArgTypeH); for (unsigned i = 0; i < numElemL; ++i) { if (isa<UndefValue>(VectElemts[i])) continue; @@ -2441,14 +2465,14 @@ namespace { if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) { // We can have at most two unique vector inputs. bool CanUseInputs = true; - Value *I1, *I2 = 0; + Value *I1, *I2 = nullptr; if (LEE) { I1 = LEE->getOperand(0); } else { I1 = LSV->getOperand(0); I2 = LSV->getOperand(1); if (I2 == I1 || isa<UndefValue>(I2)) - I2 = 0; + I2 = nullptr; } if (HEE) { @@ -2764,10 +2788,11 @@ namespace { ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType); continue; - } else if (IID == Intrinsic::powi && o == 1) { - // The second argument of powi is a single integer and we've already - // checked that both arguments are equal. As a result, we just keep - // I's second argument. + } else if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz || + IID == Intrinsic::cttz) && o == 1) { + // The second argument of powi/ctlz/cttz is a single integer/constant + // and we've already checked that both arguments are equal. + // As a result, we just keep I's second argument. 
ReplacedOperands[o] = I->getOperand(o); continue; } @@ -2952,7 +2977,7 @@ namespace { switch (Kind) { default: - K->setMetadata(Kind, 0); // Remove unknown metadata + K->setMetadata(Kind, nullptr); // Remove unknown metadata break; case LLVMContext::MD_tbaa: K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD)); @@ -3123,7 +3148,7 @@ namespace { // Instruction insertion point: Instruction *InsertionPt = K; - Instruction *K1 = 0, *K2 = 0; + Instruction *K1 = nullptr, *K2 = nullptr; replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2); // The use dag of the first original instruction must be moved to after @@ -3213,6 +3238,7 @@ VectorizeConfig::VectorizeConfig() { VectorizePointers = !::NoPointers; VectorizeCasts = !::NoCasts; VectorizeMath = !::NoMath; + VectorizeBitManipulations = !::NoBitManipulation; VectorizeFMA = !::NoFMA; VectorizeSelect = !::NoSelect; VectorizeCmp = !::NoCmp; diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 9a98c44e4e..34d8a1053f 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -42,9 +42,6 @@ // //===----------------------------------------------------------------------===// -#define LV_NAME "loop-vectorize" -#define DEBUG_TYPE LV_NAME - #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/EquivalenceClasses.h" @@ -54,6 +51,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BlockFrequencyInfo.h" @@ -67,7 +65,9 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -85,16 +85,23 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/VectorUtils.h" #include <algorithm> #include <map> +#include <tuple> using namespace llvm; using namespace llvm::PatternMatch; +#define LV_NAME "loop-vectorize" +#define DEBUG_TYPE LV_NAME + +STATISTIC(LoopsVectorized, "Number of loops vectorized"); +STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); + static cl::opt<unsigned> VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect.")); @@ -223,8 +230,9 @@ public: const TargetLibraryInfo *TLI, unsigned VecWidth, unsigned UnrollFactor) : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI), - VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(0), - OldInduction(0), WidenMap(UnrollFactor), Legal(0) {} + VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), + Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor), + Legal(nullptr) {} // Perform the actual loop widening (vectorization). 
void vectorize(LoopVectorizationLegality *L) { @@ -469,6 +477,24 @@ static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { B.SetCurrentDebugLocation(DebugLoc()); } +#ifndef NDEBUG +/// \return string containing a file name and a line # for the given loop. +static std::string getDebugLocString(const Loop *L) { + std::string Result; + if (L) { + raw_string_ostream OS(Result); + const DebugLoc LoopDbgLoc = L->getStartLoc(); + if (!LoopDbgLoc.isUnknown()) + LoopDbgLoc.print(L->getHeader()->getContext(), OS); + else + // Just print the module name. + OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); + OS.flush(); + } + return Result; +} +#endif + /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and /// to what vectorization factor. /// This class does not look at the profitability of vectorization, only the @@ -491,8 +517,8 @@ public: LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL, DominatorTree *DT, TargetLibraryInfo *TLI) : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL), - DT(DT), TLI(TLI), Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false), - MaxSafeDepDistBytes(-1U) {} + DT(DT), TLI(TLI), Induction(nullptr), WidestIndTy(nullptr), + HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {} /// This enum represents the kinds of reductions that we support. enum ReductionKind { @@ -530,7 +556,7 @@ public: /// This struct holds information about reduction variables. struct ReductionDescriptor { - ReductionDescriptor() : StartValue(0), LoopExitInstr(0), + ReductionDescriptor() : StartValue(nullptr), LoopExitInstr(nullptr), Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {} ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K, @@ -602,7 +628,7 @@ public: /// A struct for saving information about induction variables. struct InductionInfo { InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} - InductionInfo() : StartValue(0), IK(IK_NoInduction) {} + InductionInfo() : StartValue(nullptr), IK(IK_NoInduction) {} /// Start value. TrackingVH<Value> StartValue; /// Induction kind. @@ -789,7 +815,8 @@ public: /// then this vectorization factor will be selected if vectorization is /// possible. VectorizationFactor selectVectorizationFactor(bool OptForSize, - unsigned UserVF); + unsigned UserVF, + bool ForceVectorization); /// \return The size (in bits) of the widest type in the code that /// needs to be vectorized. We ignore values that remain scalar such as @@ -856,35 +883,32 @@ private: /// Utility class for getting and setting loop vectorizer hints in the form /// of loop metadata. -struct LoopVectorizeHints { - /// Vectorization width. - unsigned Width; - /// Vectorization unroll factor. - unsigned Unroll; - /// Vectorization forced (-1 not selected, 0 force disabled, 1 force enabled) - int Force; +class LoopVectorizeHints { +public: + enum ForceKind { + FK_Undefined = -1, ///< Not selected. + FK_Disabled = 0, ///< Forcing disabled. + FK_Enabled = 1, ///< Forcing enabled. + }; LoopVectorizeHints(const Loop *L, bool DisableUnrolling) - : Width(VectorizationFactor) - , Unroll(DisableUnrolling ? 1 : VectorizationUnroll) - , Force(-1) - , LoopID(L->getLoopID()) { + : Width(VectorizationFactor), + Unroll(DisableUnrolling), + Force(FK_Undefined), + LoopID(L->getLoopID()) { getHints(L); - // The command line options override any loop metadata except for when - // width == 1 which is used to indicate the loop is already vectorized. 
- if (VectorizationFactor.getNumOccurrences() > 0 && Width != 1) - Width = VectorizationFactor; + // force-vector-unroll overrides DisableUnrolling. if (VectorizationUnroll.getNumOccurrences() > 0) Unroll = VectorizationUnroll; - DEBUG(if (DisableUnrolling && Unroll == 1) - dbgs() << "LV: Unrolling disabled by the pass manager\n"); + DEBUG(if (DisableUnrolling && Unroll == 1) dbgs() + << "LV: Unrolling disabled by the pass manager\n"); } /// Return the loop vectorizer metadata prefix. static StringRef Prefix() { return "llvm.vectorizer."; } - MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) { + MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) const { SmallVector<Value*, 2> Vals; Vals.push_back(MDString::get(Context, Name)); Vals.push_back(ConstantInt::get(Type::getInt32Ty(Context), V)); @@ -918,9 +942,12 @@ struct LoopVectorizeHints { LoopID = NewLoopID; } -private: - MDNode *LoopID; + unsigned getWidth() const { return Width; } + unsigned getUnroll() const { return Unroll; } + enum ForceKind getForce() const { return Force; } + MDNode *getLoopID() const { return LoopID; } +private: /// Find hints specified in the loop metadata. void getHints(const Loop *L) { if (!LoopID) @@ -931,7 +958,7 @@ private: assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - const MDString *S = 0; + const MDString *S = nullptr; SmallVector<Value*, 4> Args; // The expected hint is either a MDString or a MDNode with the first @@ -980,13 +1007,23 @@ private: DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n"); } else if (Hint == "enable") { if (C->getBitWidth() == 1) - Force = Val; + Force = Val == 1 ? LoopVectorizeHints::FK_Enabled + : LoopVectorizeHints::FK_Disabled; else DEBUG(dbgs() << "LV: ignoring invalid enable hint metadata\n"); } else { DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint << '\n'); } } + + /// Vectorization width. + unsigned Width; + /// Vectorization unroll factor. + unsigned Unroll; + /// Vectorization forced + enum ForceKind Force; + + MDNode *LoopID; }; static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) { @@ -1024,7 +1061,7 @@ struct LoopVectorize : public FunctionPass { bool runOnFunction(Function &F) override { SE = &getAnalysis<ScalarEvolution>(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? &DLP->getDataLayout() : nullptr; LI = &getAnalysis<LoopInfo>(); TTI = &getAnalysis<TargetTransformInfo>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -1041,8 +1078,9 @@ struct LoopVectorize : public FunctionPass { if (!TTI->getNumberOfRegisters(true)) return false; - if (DL == NULL) { - DEBUG(dbgs() << "LV: Not vectorizing: Missing data layout\n"); + if (!DL) { + DEBUG(dbgs() << "\nLV: Not vectorizing " << F.getName() + << ": Missing data layout\n"); return false; } @@ -1054,6 +1092,8 @@ struct LoopVectorize : public FunctionPass { for (Loop *L : *LI) addInnerLoop(*L, Worklist); + LoopsAnalyzed += Worklist.size(); + // Now walk the identified inner loops. 
bool Changed = false; while (!Worklist.empty()) @@ -1065,26 +1105,56 @@ struct LoopVectorize : public FunctionPass { bool processLoop(Loop *L) { assert(L->empty() && "Only process inner loops."); - DEBUG(dbgs() << "LV: Checking a loop in \"" << - L->getHeader()->getParent()->getName() << "\"\n"); + +#ifndef NDEBUG + const std::string DebugLocStr = getDebugLocString(L); +#endif /* NDEBUG */ + + DEBUG(dbgs() << "\nLV: Checking a loop in \"" + << L->getHeader()->getParent()->getName() << "\" from " + << DebugLocStr << "\n"); LoopVectorizeHints Hints(L, DisableUnrolling); - if (Hints.Force == 0) { + DEBUG(dbgs() << "LV: Loop hints:" + << " force=" + << (Hints.getForce() == LoopVectorizeHints::FK_Disabled + ? "disabled" + : (Hints.getForce() == LoopVectorizeHints::FK_Enabled + ? "enabled" + : "?")) << " width=" << Hints.getWidth() + << " unroll=" << Hints.getUnroll() << "\n"); + + if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) { DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); return false; } - if (!AlwaysVectorize && Hints.Force != 1) { + if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) { DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); return false; } - if (Hints.Width == 1 && Hints.Unroll == 1) { + if (Hints.getWidth() == 1 && Hints.getUnroll() == 1) { DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); return false; } + // Check the loop for a trip count threshold: + // do not vectorize loops with a tiny trip count. + BasicBlock *Latch = L->getLoopLatch(); + const unsigned TC = SE->getSmallConstantTripCount(L, Latch); + if (TC > 0u && TC < TinyTripCountVectorThreshold) { + DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " + << "This loop is not worth vectorizing."); + if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) + DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); + else { + DEBUG(dbgs() << "\n"); + return false; + } + } + // Check if it is legal to vectorize the loop. LoopVectorizationLegality LVL(L, SE, DL, DT, TLI); if (!LVL.canVectorize()) { @@ -1098,8 +1168,8 @@ struct LoopVectorize : public FunctionPass { // Check the function attributes to find out if this function should be // optimized for size. Function *F = L->getHeader()->getParent(); - bool OptForSize = - Hints.Force != 1 && F->hasFnAttribute(Attribute::OptimizeForSize); + bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled && + F->hasFnAttribute(Attribute::OptimizeForSize); // Compute the weighted frequency of this loop being executed and see if it // is less than 20% of the function entry baseline frequency. Note that we @@ -1108,7 +1178,8 @@ struct LoopVectorize : public FunctionPass { // exactly what block frequency models. if (LoopVectorizeWithBlockFrequency) { BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader()); - if (Hints.Force != 1 && LoopEntryFreq < ColdEntryFreq) + if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && + LoopEntryFreq < ColdEntryFreq) OptForSize = true; } @@ -1123,14 +1194,17 @@ struct LoopVectorize : public FunctionPass { } // Select the optimal vectorization factor. - LoopVectorizationCostModel::VectorizationFactor VF; - VF = CM.selectVectorizationFactor(OptForSize, Hints.Width); + const LoopVectorizationCostModel::VectorizationFactor VF = + CM.selectVectorizationFactor(OptForSize, Hints.getWidth(), + Hints.getForce() == + LoopVectorizeHints::FK_Enabled); + // Select the unroll factor. 
- unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width, - VF.Cost); + const unsigned UF = + CM.selectUnrollFactor(OptForSize, Hints.getUnroll(), VF.Width, VF.Cost); - DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<< - F->getParent()->getModuleIdentifier() << '\n'); + DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " + << DebugLocStr << '\n'); DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n'); if (VF.Width == 1) { @@ -1138,6 +1212,13 @@ struct LoopVectorize : public FunctionPass { if (UF == 1) return false; DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n"); + + // Report the unrolling decision. + emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), + Twine("unrolled with interleaving factor " + + Twine(UF) + + " (vectorization not beneficial)")); + // We decided not to vectorize, but we may want to unroll. InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF); Unroller.vectorize(&LVL); @@ -1145,6 +1226,13 @@ struct LoopVectorize : public FunctionPass { // If we decided that it is *legal* to vectorize the loop then do it. InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF); LB.vectorize(&LVL); + ++LoopsVectorized; + + // Report the vectorization decision. + emitOptimizationRemark( + F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), + Twine("vectorized loop (vectorization factor: ") + Twine(VF.Width) + + ", unrolling interleave factor: " + Twine(UF) + ")"); } // Mark the loop as already vectorized to avoid vectorizing again. @@ -1188,7 +1276,7 @@ static Value *stripIntegerCast(Value *V) { /// \p Ptr. static const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE, ValueToValueMap &PtrToStride, - Value *Ptr, Value *OrigPtr = 0) { + Value *Ptr, Value *OrigPtr = nullptr) { const SCEV *OrigSCEV = SE->getSCEV(Ptr); @@ -1355,7 +1443,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // We can emit wide load/stores only if the last non-zero index is the // induction variable. - const SCEV *Last = 0; + const SCEV *Last = nullptr; if (!Strides.count(Gep)) Last = SE->getSCEV(Gep->getOperand(InductionOperand)); else { @@ -1604,17 +1692,17 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - Value *UndefVec = IsVoidRetTy ? 0 : + Value *UndefVec = IsVoidRetTy ? nullptr : UndefValue::get(VectorType::get(Instr->getType(), VF)); // Create a new entry in the WidenMap and initialize it to Undef or Null. VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); Instruction *InsertPt = Builder.GetInsertPoint(); BasicBlock *IfBlock = Builder.GetInsertBlock(); - BasicBlock *CondBlock = 0; + BasicBlock *CondBlock = nullptr; VectorParts Cond; - Loop *VectorLp = 0; + Loop *VectorLp = nullptr; if (IfPredicateStore) { assert(Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks"); @@ -1630,7 +1718,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic for (unsigned Width = 0; Width < VF; ++Width) { // Start if-block. 
- Value *Cmp = 0; + Value *Cmp = nullptr; if (IfPredicateStore) { Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width)); Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); @@ -1681,21 +1769,21 @@ static Instruction *getFirstInst(Instruction *FirstInst, Value *V, if (FirstInst) return FirstInst; if (Instruction *I = dyn_cast<Instruction>(V)) - return I->getParent() == Loc->getParent() ? I : 0; - return 0; + return I->getParent() == Loc->getParent() ? I : nullptr; + return nullptr; } std::pair<Instruction *, Instruction *> InnerLoopVectorizer::addStrideCheck(Instruction *Loc) { - Instruction *tnullptr = 0; + Instruction *tnullptr = nullptr; if (!Legal->mustCheckStrides()) return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr); IRBuilder<> ChkBuilder(Loc); // Emit checks. - Value *Check = 0; - Instruction *FirstInst = 0; + Value *Check = nullptr; + Instruction *FirstInst = nullptr; for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(), SE = Legal->strides_end(); SI != SE; ++SI) { @@ -1727,7 +1815,7 @@ InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) { LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = Legal->getRuntimePointerCheck(); - Instruction *tnullptr = 0; + Instruction *tnullptr = nullptr; if (!PtrRtCheck->Need) return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr); @@ -1737,7 +1825,7 @@ InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) { LLVMContext &Ctx = Loc->getContext(); SCEVExpander Exp(*SE, "induction"); - Instruction *FirstInst = 0; + Instruction *FirstInst = nullptr; for (unsigned i = 0; i < NumPointers; ++i) { Value *Ptr = PtrRtCheck->Pointers[i]; @@ -1764,7 +1852,7 @@ InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) { IRBuilder<> ChkBuilder(Loc); // Our instructions might fold to a constant. - Value *MemoryRuntimeCheck = 0; + Value *MemoryRuntimeCheck = nullptr; for (unsigned i = 0; i < NumPointers; ++i) { for (unsigned j = i+1; j < NumPointers; ++j) { // No need to check if two readonly pointers intersect. @@ -2028,7 +2116,7 @@ void InnerLoopVectorizer::createEmptyLoop() { // start value. // This variable saves the new starting index for the scalar loop. - PHINode *ResumeIndex = 0; + PHINode *ResumeIndex = nullptr; LoopVectorizationLegality::InductionList::iterator I, E; LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); // Set builder to point to last bypass block. @@ -2044,9 +2132,9 @@ void InnerLoopVectorizer::createEmptyLoop() { // truncated version for the scalar loop. PHINode *TruncResumeVal = (OrigPhi == OldInduction) ? 
PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", - MiddleBlock->getTerminator()) : 0; + MiddleBlock->getTerminator()) : nullptr; - Value *EndValue = 0; + Value *EndValue = nullptr; switch (II.IK) { case LoopVectorizationLegality::IK_NoInduction: llvm_unreachable("Unknown induction"); @@ -2209,148 +2297,6 @@ LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) { } } -static Intrinsic::ID checkUnaryFloatSignature(const CallInst &I, - Intrinsic::ID ValidIntrinsicID) { - if (I.getNumArgOperands() != 1 || - !I.getArgOperand(0)->getType()->isFloatingPointTy() || - I.getType() != I.getArgOperand(0)->getType() || - !I.onlyReadsMemory()) - return Intrinsic::not_intrinsic; - - return ValidIntrinsicID; -} - -static Intrinsic::ID checkBinaryFloatSignature(const CallInst &I, - Intrinsic::ID ValidIntrinsicID) { - if (I.getNumArgOperands() != 2 || - !I.getArgOperand(0)->getType()->isFloatingPointTy() || - !I.getArgOperand(1)->getType()->isFloatingPointTy() || - I.getType() != I.getArgOperand(0)->getType() || - I.getType() != I.getArgOperand(1)->getType() || - !I.onlyReadsMemory()) - return Intrinsic::not_intrinsic; - - return ValidIntrinsicID; -} - - -static Intrinsic::ID -getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { - // If we have an intrinsic call, check if it is trivially vectorizable. - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) { - switch (II->getIntrinsicID()) { - case Intrinsic::sqrt: - case Intrinsic::sin: - case Intrinsic::cos: - case Intrinsic::exp: - case Intrinsic::exp2: - case Intrinsic::log: - case Intrinsic::log10: - case Intrinsic::log2: - case Intrinsic::fabs: - case Intrinsic::copysign: - case Intrinsic::floor: - case Intrinsic::ceil: - case Intrinsic::trunc: - case Intrinsic::rint: - case Intrinsic::nearbyint: - case Intrinsic::round: - case Intrinsic::pow: - case Intrinsic::fma: - case Intrinsic::fmuladd: - case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: - return II->getIntrinsicID(); - default: - return Intrinsic::not_intrinsic; - } - } - - if (!TLI) - return Intrinsic::not_intrinsic; - - LibFunc::Func Func; - Function *F = CI->getCalledFunction(); - // We're going to make assumptions on the semantics of the functions, check - // that the target knows that it's available in this environment and it does - // not have local linkage. - if (!F || F->hasLocalLinkage() || !TLI->getLibFunc(F->getName(), Func)) - return Intrinsic::not_intrinsic; - - // Otherwise check if we have a call to a function that can be turned into a - // vector intrinsic. 
- switch (Func) { - default: - break; - case LibFunc::sin: - case LibFunc::sinf: - case LibFunc::sinl: - return checkUnaryFloatSignature(*CI, Intrinsic::sin); - case LibFunc::cos: - case LibFunc::cosf: - case LibFunc::cosl: - return checkUnaryFloatSignature(*CI, Intrinsic::cos); - case LibFunc::exp: - case LibFunc::expf: - case LibFunc::expl: - return checkUnaryFloatSignature(*CI, Intrinsic::exp); - case LibFunc::exp2: - case LibFunc::exp2f: - case LibFunc::exp2l: - return checkUnaryFloatSignature(*CI, Intrinsic::exp2); - case LibFunc::log: - case LibFunc::logf: - case LibFunc::logl: - return checkUnaryFloatSignature(*CI, Intrinsic::log); - case LibFunc::log10: - case LibFunc::log10f: - case LibFunc::log10l: - return checkUnaryFloatSignature(*CI, Intrinsic::log10); - case LibFunc::log2: - case LibFunc::log2f: - case LibFunc::log2l: - return checkUnaryFloatSignature(*CI, Intrinsic::log2); - case LibFunc::fabs: - case LibFunc::fabsf: - case LibFunc::fabsl: - return checkUnaryFloatSignature(*CI, Intrinsic::fabs); - case LibFunc::copysign: - case LibFunc::copysignf: - case LibFunc::copysignl: - return checkBinaryFloatSignature(*CI, Intrinsic::copysign); - case LibFunc::floor: - case LibFunc::floorf: - case LibFunc::floorl: - return checkUnaryFloatSignature(*CI, Intrinsic::floor); - case LibFunc::ceil: - case LibFunc::ceilf: - case LibFunc::ceill: - return checkUnaryFloatSignature(*CI, Intrinsic::ceil); - case LibFunc::trunc: - case LibFunc::truncf: - case LibFunc::truncl: - return checkUnaryFloatSignature(*CI, Intrinsic::trunc); - case LibFunc::rint: - case LibFunc::rintf: - case LibFunc::rintl: - return checkUnaryFloatSignature(*CI, Intrinsic::rint); - case LibFunc::nearbyint: - case LibFunc::nearbyintf: - case LibFunc::nearbyintl: - return checkUnaryFloatSignature(*CI, Intrinsic::nearbyint); - case LibFunc::round: - case LibFunc::roundf: - case LibFunc::roundl: - return checkUnaryFloatSignature(*CI, Intrinsic::round); - case LibFunc::pow: - case LibFunc::powf: - case LibFunc::powl: - return checkBinaryFloatSignature(*CI, Intrinsic::pow); - } - - return Intrinsic::not_intrinsic; -} - /// This function translates the reduction kind to an LLVM binary operator. static unsigned getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { @@ -2651,7 +2597,7 @@ void InnerLoopVectorizer::vectorizeLoop() { assert(isPowerOf2_32(VF) && "Reduction emission only supported for pow2 vectors!"); Value *TmpVec = ReducedPartRdx; - SmallVector<Constant*, 32> ShuffleMask(VF, 0); + SmallVector<Constant*, 32> ShuffleMask(VF, nullptr); for (unsigned i = VF; i != 1; i >>= 1) { // Move the upper half of the vector to the lower half. for (unsigned j = 0; j != i/2; ++j) @@ -3049,7 +2995,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { VectorParts &A = getVectorValue(it->getOperand(0)); VectorParts &B = getVectorValue(it->getOperand(1)); for (unsigned Part = 0; Part < UF; ++Part) { - Value *C = 0; + Value *C = nullptr; if (FCmp) C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); else @@ -3275,15 +3221,6 @@ bool LoopVectorizationLegality::canVectorize() { return false; } - // Do not loop-vectorize loops with a tiny trip count. - BasicBlock *Latch = TheLoop->getLoopLatch(); - unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch); - if (TC > 0u && TC < TinyTripCountVectorThreshold) { - DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" << - "This loop is not worth vectorizing.\n"); - return false; - } - // Check if we can vectorize the instructions and CFG in this loop. if (!canVectorizeInstrs()) { DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); @@ -3536,14 +3473,14 @@ static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, ///\brief Look for a cast use of the passed value. static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) { - Value *UniqueCast = 0; + Value *UniqueCast = nullptr; for (User *U : Ptr->users()) { CastInst *CI = dyn_cast<CastInst>(U); if (CI && CI->getType() == Ty) { if (!UniqueCast) UniqueCast = CI; else - return 0; + return nullptr; } } return UniqueCast; @@ -3556,7 +3493,7 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, const DataLayout *DL, Loop *Lp) { const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); if (!PtrTy || PtrTy->isAggregateType()) - return 0; + return nullptr; // Try to remove a gep instruction to make the pointer (actually index at this // point) easier analyzable. If OrigPtr is equal to Ptr we are analzying the @@ -3576,11 +3513,11 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V); if (!S) - return 0; + return nullptr; V = S->getStepRecurrence(*SE); if (!V) - return 0; + return nullptr; // Strip off the size of access multiplication if we are still analyzing the // pointer. @@ -3588,24 +3525,24 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, DL->getTypeAllocSize(PtrTy->getElementType()); if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) { if (M->getOperand(0)->getSCEVType() != scConstant) - return 0; + return nullptr; const APInt &APStepVal = cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue(); // Huge step value - give up. if (APStepVal.getBitWidth() > 64) - return 0; + return nullptr; int64_t StepVal = APStepVal.getSExtValue(); if (PtrAccessSize != StepVal) - return 0; + return nullptr; V = M->getOperand(1); } } // Strip off casts. - Type *StripedOffRecurrenceCast = 0; + Type *StripedOffRecurrenceCast = nullptr; if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) { StripedOffRecurrenceCast = C->getType(); V = C->getOperand(); @@ -3614,11 +3551,11 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, // Look for the loop invariant symbolic value. const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V); if (!U) - return 0; + return nullptr; Value *Stride = U->getValue(); if (!Lp->isLoopInvariant(Stride)) - return 0; + return nullptr; // If we have stripped off the recurrence cast we have to make sure that we // return the value that is used in this loop so that we can replace it later. @@ -3629,7 +3566,7 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, } void LoopVectorizationLegality::collectStridedAcccess(Value *MemAccess) { - Value *Ptr = 0; + Value *Ptr = nullptr; if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess)) Ptr = LI->getPointerOperand(); else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess)) @@ -4628,7 +4565,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // We only allow for a single reduction value to be used outside the loop. // This includes users of the reduction, variables (which form a cycle // which ends in the phi node). - Instruction *ExitInstruction = 0; + Instruction *ExitInstruction = nullptr; // Indicates that we found a reduction operation in our scan. 
bool FoundReduxOp = false; @@ -4642,7 +4579,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // the number of instruction we saw from the recognized min/max pattern, // to make sure we only see exactly the two instructions. unsigned NumCmpSelectPatternInst = 0; - ReductionInstDesc ReduxDesc(false, 0); + ReductionInstDesc ReduxDesc(false, nullptr); SmallPtrSet<Instruction *, 8> VisitedInsts; SmallVector<Instruction *, 8> Worklist; @@ -4725,7 +4662,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // being used. In this case the user uses the value of the previous // iteration, in which case we would loose "VF-1" iterations of the // reduction operation if we vectorize. - if (ExitInstruction != 0 || Cur == Phi) + if (ExitInstruction != nullptr || Cur == Phi) return false; // The instruction used by an outside user must be the last instruction @@ -4741,7 +4678,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // Process instructions only once (termination). Each reduction cycle // value must only be used once, except by phi nodes and min/max // reductions which are represented as a cmp followed by a select. - ReductionInstDesc IgnoredVal(false, 0); + ReductionInstDesc IgnoredVal(false, nullptr); if (VisitedInsts.insert(UI)) { if (isa<PHINode>(UI)) PHIs.push_back(UI); @@ -4795,8 +4732,8 @@ LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) && "Expect a select instruction"); - Instruction *Cmp = 0; - SelectInst *Select = 0; + Instruction *Cmp = nullptr; + SelectInst *Select = nullptr; // We must handle the select(cmp()) as a single instruction. Advance to the // select. @@ -4982,7 +4919,8 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, LoopVectorizationCostModel::VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, - unsigned UserVF) { + unsigned UserVF, + bool ForceVectorization) { // Width 1 means no vectorize VectorizationFactor Factor = { 1U, 0U }; if (OptForSize && Legal->getRuntimePointerCheck()->Need) { @@ -5052,8 +4990,18 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, } float Cost = expectedCost(1); +#ifndef NDEBUG + const float ScalarCost = Cost; +#endif /* NDEBUG */ unsigned Width = 1; - DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n"); + DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); + + // Ignore scalar width, because the user explicitly wants vectorization. 
+ if (ForceVectorization && VF > 1) { + Width = 2; + Cost = expectedCost(Width) / (float)Width; + } + for (unsigned i=2; i <= VF; i*=2) { // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of @@ -5067,7 +5015,10 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, } } - DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n"); + DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() + << "LV: Vectorization seems to be not beneficial, " + << "but was forced by a user.\n"); + DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n"); Factor.Width = Width; Factor.Cost = Width * Cost; return Factor; @@ -5516,7 +5467,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { Op2VK = TargetTransformInfo::OK_UniformConstantValue; else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) { Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; - if (cast<Constant>(Op2)->getSplatValue() != NULL) + if (cast<Constant>(Op2)->getSplatValue() != nullptr) Op2VK = TargetTransformInfo::OK_UniformConstantValue; } @@ -5730,17 +5681,17 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - Value *UndefVec = IsVoidRetTy ? 0 : + Value *UndefVec = IsVoidRetTy ? nullptr : UndefValue::get(Instr->getType()); // Create a new entry in the WidenMap and initialize it to Undef or Null. VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); Instruction *InsertPt = Builder.GetInsertPoint(); BasicBlock *IfBlock = Builder.GetInsertBlock(); - BasicBlock *CondBlock = 0; + BasicBlock *CondBlock = nullptr; VectorParts Cond; - Loop *VectorLp = 0; + Loop *VectorLp = nullptr; if (IfPredicateStore) { assert(Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks"); @@ -5755,7 +5706,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, // For each scalar that we create: // Start an "if (pred) a[i] = ..." block. - Value *Cmp = 0; + Value *Cmp = nullptr; if (IfPredicateStore) { if (Cond[Part]->getType()->isVectorTy()) Cond[Part] = diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index ee322279df..e13ba956c3 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15,9 +15,6 @@ // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. 
// //===----------------------------------------------------------------------===// -#define SV_NAME "slp-vectorizer" -#define DEBUG_TYPE "SLP" - #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" @@ -34,6 +31,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/IR/NoFolder.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/Verifier.h" @@ -41,11 +39,15 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/VectorUtils.h" #include <algorithm> #include <map> using namespace llvm; +#define SV_NAME "slp-vectorizer" +#define DEBUG_TYPE "SLP" + static cl::opt<int> SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " @@ -72,8 +74,6 @@ struct BlockNumbering { BlockNumbering(BasicBlock *Bb) : BB(Bb), Valid(false) {} - BlockNumbering() : BB(0), Valid(false) {} - void numberInstructions() { unsigned Loc = 0; InstrIdx.clear(); @@ -120,15 +120,15 @@ private: static BasicBlock *getSameBlock(ArrayRef<Value *> VL) { Instruction *I0 = dyn_cast<Instruction>(VL[0]); if (!I0) - return 0; + return nullptr; BasicBlock *BB = I0->getParent(); for (int i = 1, e = VL.size(); i < e; i++) { Instruction *I = dyn_cast<Instruction>(VL[i]); if (!I) - return 0; + return nullptr; if (BB != I->getParent()) - return 0; + return nullptr; } return BB; } @@ -180,7 +180,7 @@ static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) { switch (Kind) { default: - MD = 0; // Remove unknown metadata + MD = nullptr; // Remove unknown metadata break; case LLVMContext::MD_tbaa: MD = MDNode::getMostGenericTBAA(MD, IMD); @@ -201,7 +201,7 @@ static Type* getSameType(ArrayRef<Value *> VL) { Type *Ty = VL[0]->getType(); for (int i = 1, e = VL.size(); i < e; i++) if (VL[i]->getType() != Ty) - return 0; + return nullptr; return Ty; } @@ -345,17 +345,10 @@ public: typedef SmallVector<StoreInst *, 8> StoreList; BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl, - TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li, - DominatorTree *Dt) : - F(Func), SE(Se), DL(Dl), TTI(Tti), AA(Aa), LI(Li), DT(Dt), - Builder(Se->getContext()) { - // Setup the block numbering utility for all of the blocks in the - // function. - for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { - BasicBlock *BB = it; - BlocksNumbers[BB] = BlockNumbering(BB); - } - } + TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, + LoopInfo *Li, DominatorTree *Dt) + : F(Func), SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), + Builder(Se->getContext()) {} /// \brief Vectorize the tree that starts with the elements in \p VL. /// Returns the vectorized root. @@ -365,13 +358,13 @@ public: /// A negative number means that this is profitable. int getTreeCost(); - /// Construct a vectorizable tree that starts at \p Roots and is possibly - /// used by a reduction of \p RdxOps. - void buildTree(ArrayRef<Value *> Roots, ValueSet *RdxOps = 0); + /// Construct a vectorizable tree that starts at \p Roots, ignoring users for + /// the purpose of scheduling and extraction in the \p UserIgnoreLst. + void buildTree(ArrayRef<Value *> Roots, + ArrayRef<Value *> UserIgnoreLst = None); /// Clear the internal data structures that are created by 'buildTree'. 
void deleteTree() { - RdxOps = 0; VectorizableTree.clear(); ScalarToTreeEntry.clear(); MustGather.clear(); @@ -446,7 +439,7 @@ private: bool isFullyVectorizableTinyTree(); struct TreeEntry { - TreeEntry() : Scalars(), VectorizedValue(0), LastScalarIndex(0), + TreeEntry() : Scalars(), VectorizedValue(nullptr), LastScalarIndex(0), NeedToGather(0) {} /// \returns true if the scalars in VL are equal to this entry. @@ -527,14 +520,22 @@ private: /// Numbers instructions in different blocks. DenseMap<BasicBlock *, BlockNumbering> BlocksNumbers; - /// Reduction operators. - ValueSet *RdxOps; + /// \brief Get the corresponding instruction numbering list for a given + /// BasicBlock. The list is allocated lazily. + BlockNumbering &getBlockNumbering(BasicBlock *BB) { + auto I = BlocksNumbers.insert(std::make_pair(BB, BlockNumbering(BB))); + return I.first->second; + } + + /// List of users to ignore during scheduling and that don't need extracting. + ArrayRef<Value *> UserIgnoreList; // Analysis and block reference. Function *F; ScalarEvolution *SE; const DataLayout *DL; TargetTransformInfo *TTI; + TargetLibraryInfo *TLI; AliasAnalysis *AA; LoopInfo *LI; DominatorTree *DT; @@ -542,9 +543,10 @@ private: IRBuilder<> Builder; }; -void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) { +void BoUpSLP::buildTree(ArrayRef<Value *> Roots, + ArrayRef<Value *> UserIgnoreLst) { deleteTree(); - RdxOps = Rdx; + UserIgnoreList = UserIgnoreLst; if (!getSameType(Roots)) return; buildTree_rec(Roots, 0); @@ -576,8 +578,9 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) { if (!UserInst) continue; - // Ignore uses that are part of the reduction. - if (Rdx && std::find(Rdx->begin(), Rdx->end(), UserInst) != Rdx->end()) + // Ignore users in the user ignore list. + if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) != + UserIgnoreList.end()) continue; DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " << @@ -708,12 +711,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { continue; } - // This user is part of the reduction. - if (RdxOps && RdxOps->count(UI)) + // Ignore users in the user ignore list. + if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UI) != + UserIgnoreList.end()) continue; // Make sure that we can schedule this unknown user. - BlockNumbering &BN = BlocksNumbers[BB]; + BlockNumbering &BN = getBlockNumbering(BB); int UserIndex = BN.getIndex(UI); if (UserIndex < MyLastIndex) { @@ -948,32 +952,36 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { } case Instruction::Call: { // Check if the calls are all to the same vectorizable intrinsic. 
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(VL[0]); - if (II==NULL) { + CallInst *CI = cast<CallInst>(VL[0]); + // Check if this is an Intrinsic call or something that can be + // represented by an intrinsic call + Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); + if (!isTriviallyVectorizable(ID)) { newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } - Function *Int = II->getCalledFunction(); + Function *Int = CI->getCalledFunction(); for (unsigned i = 1, e = VL.size(); i != e; ++i) { - IntrinsicInst *II2 = dyn_cast<IntrinsicInst>(VL[i]); - if (!II2 || II2->getCalledFunction() != Int) { + CallInst *CI2 = dyn_cast<CallInst>(VL[i]); + if (!CI2 || CI2->getCalledFunction() != Int || + getIntrinsicIDForCall(CI2, TLI) != ID) { newTreeEntry(VL, false); - DEBUG(dbgs() << "SLP: mismatched calls:" << *II << "!=" << *VL[i] + DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; } } newTreeEntry(VL, true); - for (unsigned i = 0, e = II->getNumArgOperands(); i != e; ++i) { + for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. for (unsigned j = 0; j < VL.size(); ++j) { - IntrinsicInst *II2 = dyn_cast<IntrinsicInst>(VL[j]); - Operands.push_back(II2->getArgOperand(i)); + CallInst *CI2 = dyn_cast<CallInst>(VL[j]); + Operands.push_back(CI2->getArgOperand(i)); } buildTree_rec(Operands, Depth + 1); } @@ -1090,7 +1098,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. - ConstantInt *CInt = NULL; + ConstantInt *CInt = nullptr; for (unsigned i = 0; i < VL.size(); ++i) { const Instruction *I = cast<Instruction>(VL[i]); if (!isa<ConstantInt>(I->getOperand(1))) { @@ -1129,12 +1137,11 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } case Instruction::Call: { CallInst *CI = cast<CallInst>(VL0); - IntrinsicInst *II = cast<IntrinsicInst>(CI); - Intrinsic::ID ID = II->getIntrinsicID(); + Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); // Calculate the cost of the scalar and vector calls. 
SmallVector<Type*, 4> ScalarTys, VecTys; - for (unsigned op = 0, opc = II->getNumArgOperands(); op!= opc; ++op) { + for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) { ScalarTys.push_back(CI->getArgOperand(op)->getType()); VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(), VecTy->getNumElements())); @@ -1147,7 +1154,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost << " (" << VecCallCost << "-" << ScalarCallCost << ")" - << " for " << *II << "\n"); + << " for " << *CI << "\n"); return VecCallCost - ScalarCallCost; } @@ -1244,7 +1251,7 @@ Value *BoUpSLP::getPointerOperand(Value *I) { return LI->getPointerOperand(); if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand(); - return 0; + return nullptr; } unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { @@ -1318,13 +1325,13 @@ Value *BoUpSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) { if (!A.Ptr || !B.Ptr || AA->alias(A, B)) return I; } - return 0; + return nullptr; } int BoUpSLP::getLastIndex(ArrayRef<Value *> VL) { BasicBlock *BB = cast<Instruction>(VL[0])->getParent(); - assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); - BlockNumbering &BN = BlocksNumbers[BB]; + assert(BB == getSameBlock(VL) && "Invalid block"); + BlockNumbering &BN = getBlockNumbering(BB); int MaxIdx = BN.getIndex(BB->getFirstNonPHI()); for (unsigned i = 0, e = VL.size(); i < e; ++i) @@ -1334,8 +1341,8 @@ int BoUpSLP::getLastIndex(ArrayRef<Value *> VL) { Instruction *BoUpSLP::getLastInstruction(ArrayRef<Value *> VL) { BasicBlock *BB = cast<Instruction>(VL[0])->getParent(); - assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); - BlockNumbering &BN = BlocksNumbers[BB]; + assert(BB == getSameBlock(VL) && "Invalid block"); + BlockNumbering &BN = getBlockNumbering(BB); int MaxIdx = BN.getIndex(cast<Instruction>(VL[0])); for (unsigned i = 1, e = VL.size(); i < e; ++i) @@ -1394,7 +1401,7 @@ Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const { if (En->isSame(VL) && En->VectorizedValue) return En->VectorizedValue; } - return 0; + return nullptr; } Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { @@ -1615,6 +1622,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { VecTy->getPointerTo(AS)); unsigned Alignment = LI->getAlignment(); LI = Builder.CreateLoad(VecPtr); + if (!Alignment) + Alignment = DL->getABITypeAlignment(LI->getPointerOperand()->getType()); LI->setAlignment(Alignment); E->VectorizedValue = LI; return propagateMetadata(LI, E->Scalars); @@ -1634,13 +1643,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo(AS)); StoreInst *S = Builder.CreateStore(VecValue, VecPtr); + if (!Alignment) + Alignment = DL->getABITypeAlignment(SI->getPointerOperand()->getType()); S->setAlignment(Alignment); E->VectorizedValue = S; return propagateMetadata(S, E->Scalars); } case Instruction::Call: { CallInst *CI = cast<CallInst>(VL0); - setInsertPointAfterBundle(E->Scalars); std::vector<Value *> OpVecs; for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { @@ -1656,8 +1666,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Module *M = F->getParent(); - IntrinsicInst *II = cast<IntrinsicInst>(CI); - Intrinsic::ID ID = II->getIntrinsicID(); + Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) }; Function *CF = Intrinsic::getDeclaration(M, ID, Tys); 
Value *V = Builder.CreateCall(CF, OpVecs); @@ -1667,7 +1676,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { default: llvm_unreachable("unknown inst"); } - return 0; + return nullptr; } Value *BoUpSLP::vectorizeTree() { @@ -1746,8 +1755,9 @@ Value *BoUpSLP::vectorizeTree() { DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); assert((ScalarToTreeEntry.count(U) || - // It is legal to replace the reduction users by undef. - (RdxOps && RdxOps->count(U))) && + // It is legal to replace users in the ignorelist by undef. + (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) != + UserIgnoreList.end())) && "Replacing out-of-tree value with undef"); } #endif @@ -1759,9 +1769,9 @@ Value *BoUpSLP::vectorizeTree() { } } - for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { - BlocksNumbers[it].forget(); - } + for (auto &BN : BlocksNumbers) + BN.second.forget(); + Builder.ClearInsertionPoint(); return VectorizableTree[0].VectorizedValue; @@ -1802,11 +1812,19 @@ void BoUpSLP::optimizeGatherSequence() { Insert->moveBefore(PreHeader->getTerminator()); } + // Make a list of all reachable blocks in our CSE queue. + SmallVector<const DomTreeNode *, 8> CSEWorkList; + CSEWorkList.reserve(CSEBlocks.size()); + for (BasicBlock *BB : CSEBlocks) + if (DomTreeNode *N = DT->getNode(BB)) { + assert(DT->isReachableFromEntry(N)); + CSEWorkList.push_back(N); + } + // Sort blocks by domination. This ensures we visit a block after all blocks // dominating it are visited. - SmallVector<BasicBlock *, 8> CSEWorkList(CSEBlocks.begin(), CSEBlocks.end()); std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), - [this](const BasicBlock *A, const BasicBlock *B) { + [this](const DomTreeNode *A, const DomTreeNode *B) { return DT->properlyDominates(A, B); }); @@ -1814,12 +1832,10 @@ void BoUpSLP::optimizeGatherSequence() { // instructions. TODO: We can further optimize this scan if we split the // instructions into different buckets based on the insert lane. SmallVector<Instruction *, 16> Visited; - for (SmallVectorImpl<BasicBlock *>::iterator I = CSEWorkList.begin(), - E = CSEWorkList.end(); - I != E; ++I) { + for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && "Worklist not sorted properly!"); - BasicBlock *BB = *I; + BasicBlock *BB = (*I)->getBlock(); // For all instructions in blocks containing gather sequences: for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { Instruction *In = it++; @@ -1835,7 +1851,7 @@ void BoUpSLP::optimizeGatherSequence() { DT->dominates((*v)->getParent(), In->getParent())) { In->replaceAllUsesWith(*v); In->eraseFromParent(); - In = 0; + In = nullptr; break; } } @@ -1864,6 +1880,7 @@ struct SLPVectorizer : public FunctionPass { ScalarEvolution *SE; const DataLayout *DL; TargetTransformInfo *TTI; + TargetLibraryInfo *TLI; AliasAnalysis *AA; LoopInfo *LI; DominatorTree *DT; @@ -1874,8 +1891,9 @@ struct SLPVectorizer : public FunctionPass { SE = &getAnalysis<ScalarEvolution>(); DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? 
@@ -1864,6 +1880,7 @@ struct SLPVectorizer : public FunctionPass {
   ScalarEvolution *SE;
   const DataLayout *DL;
   TargetTransformInfo *TTI;
+  TargetLibraryInfo *TLI;
   AliasAnalysis *AA;
   LoopInfo *LI;
   DominatorTree *DT;
@@ -1874,8 +1891,9 @@
 
     SE = &getAnalysis<ScalarEvolution>();
     DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-    DL = DLP ? &DLP->getDataLayout() : 0;
+    DL = DLP ? &DLP->getDataLayout() : nullptr;
     TTI = &getAnalysis<TargetTransformInfo>();
+    TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
    AA = &getAnalysis<AliasAnalysis>();
     LI = &getAnalysis<LoopInfo>();
     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -1900,8 +1918,8 @@ struct SLPVectorizer : public FunctionPass {
     DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
 
     // Use the bottom up slp vectorizer to construct chains that start with
-    // he store instructions.
-    BoUpSLP R(&F, SE, DL, TTI, AA, LI, DT);
+    // store instructions.
+    BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT);
 
     // Scan the blocks in the function in post order.
     for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
@@ -1951,8 +1969,11 @@ private:
   bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
 
   /// \brief Try to vectorize a list of operands.
+  /// \param BuildVector A list of users to ignore for the purpose of
+  ///                    scheduling and that don't need extracting.
   /// \returns true if a value was vectorized.
-  bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R);
+  bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
+                          ArrayRef<Value *> BuildVector = None);
 
   /// \brief Try to vectorize a chain that may start at the operands of \p V.
   bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
@@ -2106,7 +2127,7 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
     // Check that the pointer points to scalars.
     Type *Ty = SI->getValueOperand()->getType();
     if (Ty->isAggregateType() || Ty->isVectorTy())
-      return 0;
+      continue;
 
     // Find the base pointer.
     Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
@@ -2125,7 +2146,8 @@ bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
   return tryToVectorizeList(VL, R);
 }
 
-bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
+bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
+                                       ArrayRef<Value *> BuildVector) {
   if (VL.size() < 2)
     return false;
@@ -2153,7 +2175,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
 
   bool Changed = false;
 
-  // Keep track of values that were delete by vectorizing in the loop below.
+  // Keep track of values that were deleted by vectorizing in the loop below.
   SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
 
   for (unsigned i = 0, e = VL.size(); i < e; ++i) {
@@ -2175,13 +2197,38 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
                  << "\n");
     ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
 
-    R.buildTree(Ops);
+    ArrayRef<Value *> BuildVectorSlice;
+    if (!BuildVector.empty())
+      BuildVectorSlice = BuildVector.slice(i, OpsWidth);
+
+    R.buildTree(Ops, BuildVectorSlice);
     int Cost = R.getTreeCost();
 
     if (Cost < -SLPCostThreshold) {
       DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
-      R.vectorizeTree();
-
+      Value *VectorizedRoot = R.vectorizeTree();
+
+      // Reconstruct the build vector by extracting the vectorized root. This
+      // way we handle the case where some elements of the vector are undefined.
+      //   (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
+      if (!BuildVectorSlice.empty()) {
+        // The insert point is the last build vector instruction. The vectorized
+        // root will precede it. This guarantees that we get an instruction. The
+        // vectorized tree could have been constant folded.
+        Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
+        unsigned VecIdx = 0;
+        for (auto &V : BuildVectorSlice) {
+          IRBuilder<true, NoFolder> Builder(
+              ++BasicBlock::iterator(InsertAfter));
+          InsertElementInst *IE = cast<InsertElementInst>(V);
+          Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
+              VectorizedRoot, Builder.getInt32(VecIdx++)));
+          IE->setOperand(1, Extract);
+          IE->removeFromParent();
+          IE->insertAfter(Extract);
+          InsertAfter = IE;
+        }
+      }
       // Move to the next bundle.
       i += VF - 1;
       Changed = true;
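One non-obvious choice in the hunk above is IRBuilder<true, NoFolder>; the reasoning below is inferred from the surrounding casts, not stated in the patch. If the vectorized tree was constant folded, VectorizedRoot is a Constant, and a default IRBuilder<> would fold CreateExtractElement down to a Constant as well, breaking both the cast<Instruction> and the insertAfter chaining. NoFolder guarantees a real instruction. A fragment under those assumptions, with Root, InsertAfter and Lane as placeholder names:

    // NoFolder always materializes an ExtractElementInst, even when Root is a
    // Constant, so the cast below and the later repositioning stay valid.
    IRBuilder<true, NoFolder> Builder(++BasicBlock::iterator(InsertAfter));
    Value *E = Builder.CreateExtractElement(Root, Builder.getInt32(Lane));
    Instruction *Extract = cast<Instruction>(E);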
@@ -2290,7 +2337,7 @@ static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
 ///   *p =
 ///
 class HorizontalReduction {
-  SmallPtrSet<Value *, 16> ReductionOps;
+  SmallVector<Value *, 16> ReductionOps;
   SmallVector<Value *, 32> ReducedVals;
 
   BinaryOperator *ReductionRoot;
@@ -2308,7 +2355,7 @@ class HorizontalReduction {
 
 public:
   HorizontalReduction()
-    : ReductionRoot(0), ReductionPHI(0), ReductionOpcode(0),
+    : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
       ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
 
   /// \brief Try to find a reduction tree.
@@ -2323,10 +2370,10 @@ public:
     // In such a case start looking for a tree rooted in the first '+'.
     if (Phi) {
       if (B->getOperand(0) == Phi) {
-        Phi = 0;
+        Phi = nullptr;
         B = dyn_cast<BinaryOperator>(B->getOperand(1));
       } else if (B->getOperand(1) == Phi) {
-        Phi = 0;
+        Phi = nullptr;
         B = dyn_cast<BinaryOperator>(B->getOperand(0));
       }
     }
@@ -2384,7 +2431,7 @@ public:
         // We need to be able to reassociate the adds.
         if (!TreeN->isAssociative())
           return false;
-        ReductionOps.insert(TreeN);
+        ReductionOps.push_back(TreeN);
       }
       // Retract.
       Stack.pop_back();
@@ -2412,7 +2459,7 @@ public:
     if (NumReducedVals < ReduxWidth)
       return false;
 
-    Value *VectorizedTree = 0;
+    Value *VectorizedTree = nullptr;
     IRBuilder<> Builder(ReductionRoot);
     FastMathFlags Unsafe;
     Unsafe.setUnsafeAlgebra();
@@ -2421,7 +2468,7 @@
 
     for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
       ArrayRef<Value *> ValsToReduce(&ReducedVals[i], ReduxWidth);
-      V.buildTree(ValsToReduce, &ReductionOps);
+      V.buildTree(ValsToReduce, ReductionOps);
 
       // Estimate cost.
       int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
@@ -2455,13 +2502,13 @@ public:
       }
       // Update users.
       if (ReductionPHI) {
-        assert(ReductionRoot != NULL && "Need a reduction operation");
+        assert(ReductionRoot && "Need a reduction operation");
         ReductionRoot->setOperand(0, VectorizedTree);
         ReductionRoot->setOperand(1, ReductionPHI);
       } else
         ReductionRoot->replaceAllUsesWith(VectorizedTree);
     }
-    return VectorizedTree != 0;
+    return VectorizedTree != nullptr;
   }
 
 private:
@@ -2540,13 +2587,16 @@
 ///
 /// Returns true if it matches
 ///
-static bool findBuildVector(InsertElementInst *IE,
-                            SmallVectorImpl<Value *> &Ops) {
-  if (!isa<UndefValue>(IE->getOperand(0)))
+static bool findBuildVector(InsertElementInst *FirstInsertElem,
+                            SmallVectorImpl<Value *> &BuildVector,
+                            SmallVectorImpl<Value *> &BuildVectorOpds) {
+  if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
     return false;
 
+  InsertElementInst *IE = FirstInsertElem;
   while (true) {
-    Ops.push_back(IE->getOperand(1));
+    BuildVector.push_back(IE);
+    BuildVectorOpds.push_back(IE->getOperand(1));
 
     if (IE->use_empty())
       return false;
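For orientation, the shape the new findBuildVector signature walks is an insertelement chain that starts at undef and fills one scalar lane per link. A hypothetical IRBuilder helper that emits exactly this shape (illustrative, not part of the patch):

    // Emits the chain findBuildVector matches:
    //   %v0 = insertelement <N x T> undef, T %s0, i32 0
    //   %v1 = insertelement <N x T> %v0,   T %s1, i32 1
    //   ...
    static Value *emitBuildVector(IRBuilder<> &B, ArrayRef<Value *> Scalars) {
      VectorType *VecTy = VectorType::get(Scalars[0]->getType(), Scalars.size());
      Value *Vec = UndefValue::get(VecTy);
      for (unsigned i = 0, e = Scalars.size(); i != e; ++i)
        Vec = B.CreateInsertElement(Vec, Scalars[i], B.getInt32(i));
      return Vec;
    }

On such a chain, BuildVector collects the insertelement instructions themselves and BuildVectorOpds their scalar operands, one per lane and in matching order.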
@@ -2641,7 +2691,8 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       Value *Rdx =
           (P->getIncomingBlock(0) == BB
                ? (P->getIncomingValue(0))
-               : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : 0));
+               : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
+                                               : nullptr));
       // Check if this is a Binary Operator.
       BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
       if (!BI)
@@ -2680,7 +2731,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       if (BinaryOperator *BinOp =
               dyn_cast<BinaryOperator>(SI->getValueOperand())) {
         HorizontalReduction HorRdx;
-        if (((HorRdx.matchAssociativeReduction(0, BinOp, DL) &&
+        if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) &&
               HorRdx.tryToReduce(R, TTI)) ||
              tryToVectorize(BinOp, R))) {
           Changed = true;
@@ -2716,12 +2767,16 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
     }
 
     // Try to vectorize trees that start at insertelement instructions.
-    if (InsertElementInst *IE = dyn_cast<InsertElementInst>(it)) {
-      SmallVector<Value *, 8> Ops;
-      if (!findBuildVector(IE, Ops))
+    if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
+      SmallVector<Value *, 16> BuildVector;
+      SmallVector<Value *, 16> BuildVectorOpds;
+      if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
         continue;
 
-      if (tryToVectorizeList(Ops, R)) {
+      // Vectorize starting with the build vector operands ignoring the
+      // BuildVector instructions for the purpose of scheduling and user
+      // extraction.
+      if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
         Changed = true;
         it = BB->begin();
         e = BB->end();
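Tying the last hunk together, a worked example of the two parallel lists for a two-lane chain (value names invented for illustration):

    // Given:
    //   %i0 = insertelement <2 x float> undef, float %a, i32 0
    //   %i1 = insertelement <2 x float> %i0,   float %b, i32 1
    // findBuildVector(%i0, BuildVector, BuildVectorOpds) fills
    //   BuildVector     = { %i0, %i1 }  // users ignored during scheduling
    //   BuildVectorOpds = { %a,  %b  }  // the scalars that get vectorized
    // and tryToVectorizeList(BuildVectorOpds, R, BuildVector) then rewrites
    // %i0 and %i1 to read lanes of the vectorized root, as in the earlier hunk.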