-rw-r--r--   lib/Transforms/IPO/PassManagerBuilder.cpp |   2
-rw-r--r--   lib/Transforms/Scalar/SROA.cpp            | 227
-rw-r--r--   test/Transforms/SROA/fca.ll               |  49
3 files changed, 270 insertions, 8 deletions
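Background, not part of the patch itself: "FCA" throughout this commit stands for "first-class aggregate" — a struct or array value loaded or stored as a single unit. A minimal sketch of the IR pattern the new rewriter targets, mirroring @test0 in the added test (names here are illustrative):

; An aggregate (FCA) load and store of a two-field struct.
%v = load { i32, i32 }* %p
store { i32, i32 } %v, { i32, i32 }* %q

The new AggLoadStoreRewriter pre-splits such operations into per-field scalar loads and stores so that the partitioning logic in the diff below only ever sees single-value types.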
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index c81b333813..105b2886d9 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -41,7 +41,7 @@ UseGVNAfterVectorization("use-gvn-after-vectorization",
   cl::desc("Run GVN instead of Early CSE after vectorization passes"));
 
 static cl::opt<bool> UseNewSROA("use-new-sroa",
-                                cl::init(false), cl::Hidden,
+                                cl::init(true), cl::Hidden,
                                 cl::desc("Enable the new, experimental SROA pass"));
 
 PassManagerBuilder::PassManagerBuilder() {
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index a7d8ee7e68..cdbd9f9f23 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -569,14 +569,19 @@ private:
   }
 
   bool visitLoadInst(LoadInst &LI) {
+    assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
+           "All simple FCA loads should have been pre-split");
     return handleLoadOrStore(LI.getType(), LI, Offset);
   }
 
   bool visitStoreInst(StoreInst &SI) {
-    if (SI.getOperand(0) == *U)
+    Value *ValOp = SI.getValueOperand();
+    if (ValOp == *U)
       return markAsEscaping(SI);
 
-    return handleLoadOrStore(SI.getOperand(0)->getType(), SI, Offset);
+    assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
+           "All simple FCA stores should have been pre-split");
+    return handleLoadOrStore(ValOp->getType(), SI, Offset);
   }
@@ -1051,10 +1056,10 @@ AllocaPartitioning::AllocaPartitioning(const TargetData &TD, AllocaInst &AI)
 Type *AllocaPartitioning::getCommonType(iterator I) const {
   Type *Ty = 0;
   for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) {
-    if (isa<MemIntrinsic>(*UI->User))
+    if (isa<IntrinsicInst>(*UI->User))
       continue;
     if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset)
-      break;
+      return 0;
 
     Type *UserTy = 0;
     if (LoadInst *LI = dyn_cast<LoadInst>(&*UI->User)) {
@@ -2409,6 +2414,208 @@ private:
 };
 }
 
+namespace {
+/// \brief Visitor to rewrite aggregate loads and stores as scalar.
+///
+/// This pass aggressively rewrites all aggregate loads and stores on
+/// a particular pointer (or any pointer derived from it which we can identify)
+/// with scalar loads and stores.
+class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
+  // Befriend the base class so it can delegate to private visit methods.
+  friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;
+
+  const TargetData &TD;
+
+  /// Queue of pointer uses to analyze and potentially rewrite.
+  SmallVector<Use *, 8> Queue;
+
+  /// Set to prevent us from cycling with phi nodes and loops.
+  SmallPtrSet<User *, 8> Visited;
+
+  /// The current pointer use being rewritten. This is used to dig up the used
+  /// value (as opposed to the user).
+  Use *U;
+
+public:
+  AggLoadStoreRewriter(const TargetData &TD) : TD(TD) {}
+
+  /// Rewrite loads and stores through a pointer and all pointers derived from
+  /// it.
+  bool rewrite(Instruction &I) {
+    DEBUG(dbgs() << "  Rewriting FCA loads and stores...\n");
+    enqueueUsers(I);
+    bool Changed = false;
+    while (!Queue.empty()) {
+      U = Queue.pop_back_val();
+      Changed |= visit(cast<Instruction>(U->getUser()));
+    }
+    return Changed;
+  }
+
+private:
+  /// Enqueue all the users of the given instruction for further processing.
+  /// This uses a set to de-duplicate users.
+  void enqueueUsers(Instruction &I) {
+    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
+         ++UI)
+      if (Visited.insert(*UI))
+        Queue.push_back(&UI.getUse());
+  }
+
+  // Conservative default is to not rewrite anything.
+  bool visitInstruction(Instruction &I) { return false; }
+
+  /// \brief Generic recursive split emission routine.
+  ///
+  /// This method recursively splits an aggregate op (load or store) into
+  /// scalar or vector ops. It splits recursively until it hits a single value
+  /// and emits that single value operation via the template argument.
+  ///
+  /// The logic of this routine relies on GEPs and insertvalue and extractvalue
+  /// all operating with the same fundamental index list, merely formatted
+  /// differently (GEPs need actual values).
+  ///
+  /// \param IRB The builder used to form new instructions.
+  /// \param Ty The type being split recursively into smaller ops.
+  /// \param Agg The aggregate value being built up or stored, depending on
+  ///        whether this is splitting a load or a store respectively.
+  /// \param Ptr The base pointer of the original op, used as a base for GEPing
+  ///        the split operations.
+  /// \param Indices The indices to be used with insert- or extractvalue to
+  ///        select the appropriate value within the aggregate for \p Ty.
+  /// \param GEPIndices The indices to a GEP instruction which will move \p Ptr
+  ///        to the correct slot within the aggregate for \p Ty.
+  template <void (AggLoadStoreRewriter::*emitFunc)(
+                IRBuilder<> &IRB, Type *Ty, Value *&Agg, Value *Ptr,
+                ArrayRef<unsigned> Indices, ArrayRef<Value *> GEPIndices,
+                const Twine &Name)>
+  void emitSplitOps(IRBuilder<> &IRB, Type *Ty, Value *&Agg, Value *Ptr,
+                    SmallVectorImpl<unsigned> &Indices,
+                    SmallVectorImpl<Value *> &GEPIndices,
+                    const Twine &Name) {
+    if (Ty->isSingleValueType())
+      return (this->*emitFunc)(IRB, Ty, Agg, Ptr, Indices, GEPIndices, Name);
+
+    if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+      unsigned OldSize = Indices.size();
+      (void)OldSize;
+      for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
+           ++Idx) {
+        assert(Indices.size() == OldSize && "Did not return to the old size");
+        Indices.push_back(Idx);
+        GEPIndices.push_back(IRB.getInt32(Idx));
+        emitSplitOps<emitFunc>(IRB, ATy->getElementType(), Agg, Ptr,
+                               Indices, GEPIndices, Name + "." + Twine(Idx));
+        GEPIndices.pop_back();
+        Indices.pop_back();
+      }
+      return;
+    }
+
+    if (StructType *STy = dyn_cast<StructType>(Ty)) {
+      unsigned OldSize = Indices.size();
+      (void)OldSize;
+      for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
+           ++Idx) {
+        assert(Indices.size() == OldSize && "Did not return to the old size");
+        Indices.push_back(Idx);
+        GEPIndices.push_back(IRB.getInt32(Idx));
+        emitSplitOps<emitFunc>(IRB, STy->getElementType(Idx), Agg, Ptr,
+                               Indices, GEPIndices, Name + "." + Twine(Idx));
+        GEPIndices.pop_back();
+        Indices.pop_back();
+      }
+      return;
+    }
+
+    llvm_unreachable("Only arrays and structs are aggregate loadable types");
+  }
+
+  /// Emit a leaf load of a single value. This is called at the leaves of the
+  /// recursive emission to actually load values.
+  void emitLoad(IRBuilder<> &IRB, Type *Ty, Value *&Agg, Value *Ptr,
+                ArrayRef<unsigned> Indices, ArrayRef<Value *> GEPIndices,
+                const Twine &Name) {
+    assert(Ty->isSingleValueType());
+    // Load the single value and insert it using the indices.
+    Value *Load = IRB.CreateLoad(IRB.CreateInBoundsGEP(Ptr, GEPIndices,
+                                                       Name + ".gep"),
+                                 Name + ".load");
+    Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
+    DEBUG(dbgs() << "          to: " << *Load << "\n");
+  }
+
+  bool visitLoadInst(LoadInst &LI) {
+    assert(LI.getPointerOperand() == *U);
+    if (!LI.isSimple() || LI.getType()->isSingleValueType())
+      return false;
+
+    // We have an aggregate being loaded, split it apart.
+    DEBUG(dbgs() << "    original: " << LI << "\n");
+    IRBuilder<> IRB(&LI);
+    Value *V = UndefValue::get(LI.getType());
+    SmallVector<unsigned, 4> Indices;
+    SmallVector<Value *, 4> GEPIndices;
+    GEPIndices.push_back(IRB.getInt32(0));
+    emitSplitOps<&AggLoadStoreRewriter::emitLoad>(
+        IRB, LI.getType(), V, *U, Indices, GEPIndices, LI.getName() + ".fca");
+    LI.replaceAllUsesWith(V);
+    LI.eraseFromParent();
+    return true;
+  }
+
+  /// Emit a leaf store of a single value. This is called at the leaves of the
+  /// recursive emission to actually produce stores.
+  void emitStore(IRBuilder<> &IRB, Type *Ty, Value *&Agg, Value *Ptr,
+                 ArrayRef<unsigned> Indices, ArrayRef<Value *> GEPIndices,
+                 const Twine &Name) {
+    assert(Ty->isSingleValueType());
+    // Extract the single value and store it using the indices.
+    Value *Store = IRB.CreateStore(
+        IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
+        IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"));
+    DEBUG(dbgs() << "          to: " << *Store << "\n");
+  }
+
+  bool visitStoreInst(StoreInst &SI) {
+    if (!SI.isSimple() || SI.getPointerOperand() != *U)
+      return false;
+    Value *V = SI.getValueOperand();
+    if (V->getType()->isSingleValueType())
+      return false;
+
+    // We have an aggregate being stored, split it apart.
+    DEBUG(dbgs() << "    original: " << SI << "\n");
+    IRBuilder<> IRB(&SI);
+    SmallVector<unsigned, 4> Indices;
+    SmallVector<Value *, 4> GEPIndices;
+    GEPIndices.push_back(IRB.getInt32(0));
+    emitSplitOps<&AggLoadStoreRewriter::emitStore>(
+        IRB, V->getType(), V, *U, Indices, GEPIndices, V->getName() + ".fca");
+    SI.eraseFromParent();
+    return true;
+  }
+
+  bool visitBitCastInst(BitCastInst &BC) {
+    enqueueUsers(BC);
+    return false;
+  }
+
+  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+    enqueueUsers(GEPI);
+    return false;
+  }
+
+  bool visitPHINode(PHINode &PN) {
+    enqueueUsers(PN);
+    return false;
+  }
+
+  bool visitSelectInst(SelectInst &SI) {
+    enqueueUsers(SI);
+    return false;
+  }
+};
+}
+
 /// \brief Try to find a partition of the aggregate type passed in for a given
 /// offset and size.
 ///
@@ -2637,18 +2844,24 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
     return false;
   }
 
+  bool Changed = false;
+
+  // First, split any FCA loads and stores touching this alloca to promote
+  // better splitting and promotion opportunities.
+  AggLoadStoreRewriter AggRewriter(*TD);
+  Changed |= AggRewriter.rewrite(AI);
+
   // Build the partition set using a recursive instruction-visiting builder.
   AllocaPartitioning P(*TD, AI);
   DEBUG(P.print(dbgs()));
   if (P.isEscaped())
-    return false;
+    return Changed;
 
   // No partitions to split. Leave the dead alloca for a later pass to clean up.
   if (P.begin() == P.end())
-    return false;
+    return Changed;
 
   // Delete all the dead users of this alloca before splitting and rewriting it.
-  bool Changed = false;
   for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(),
                                               DE = P.dead_user_end();
        DI != DE; ++DI) {
diff --git a/test/Transforms/SROA/fca.ll b/test/Transforms/SROA/fca.ll
new file mode 100644
index 0000000000..6ddaed2f30
--- /dev/null
+++ b/test/Transforms/SROA/fca.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+; RUN: opt < %s -sroa -force-ssa-updater -S | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+define { i32, i32 } @test0(i32 %x, i32 %y) {
+; CHECK: @test0
+; CHECK-NOT: alloca
+; CHECK: insertvalue { i32, i32 }
+; CHECK: insertvalue { i32, i32 }
+; CHECK: ret { i32, i32 }
+
+entry:
+  %a = alloca { i32, i32 }
+
+  store { i32, i32 } undef, { i32, i32 }* %a
+
+  %gep1 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 0
+  store i32 %x, i32* %gep1
+  %gep2 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 1
+  store i32 %y, i32* %gep2
+
+  %result = load { i32, i32 }* %a
+  ret { i32, i32 } %result
+}
+
+define { i32, i32 } @test1(i32 %x, i32 %y) {
+; FIXME: This may be too conservative. Duncan argues that we are allowed to
+; split the volatile load and store here but must produce volatile scalar loads
+; and stores from them.
+; CHECK: @test1
+; CHECK: alloca
+; CHECK: alloca
+; CHECK: load volatile { i32, i32 }*
+; CHECK: store volatile { i32, i32 }
+; CHECK: ret { i32, i32 }
+
+entry:
+  %a = alloca { i32, i32 }
+  %b = alloca { i32, i32 }
+
+  %gep1 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 0
+  store i32 %x, i32* %gep1
+  %gep2 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 1
+  store i32 %y, i32* %gep2
+
+  %result = load volatile { i32, i32 }* %a
+  store volatile { i32, i32 } %result, { i32, i32 }* %b
+  ret { i32, i32 } %result
+}
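For reference, here is roughly the scalarized IR the new AggLoadStoreRewriter leaves behind for the aggregate load in @test0 above, before the rest of SROA promotes the alloca away. This is a hand-written sketch following the Name + ".gep" / ".load" / ".insert" naming scheme in emitLoad; the value names are illustrative, not captured pass output:

; Sketch: %result = load { i32, i32 }* %a, pre-split into scalar loads
; feeding an insertvalue chain. All names below are hypothetical.
%result.fca.0.gep = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 0
%result.fca.0.load = load i32* %result.fca.0.gep
%result.fca.0.insert = insertvalue { i32, i32 } undef, i32 %result.fca.0.load, 0
%result.fca.1.gep = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 1
%result.fca.1.load = load i32* %result.fca.1.gep
%result.fca.1.insert = insertvalue { i32, i32 } %result.fca.0.insert, i32 %result.fca.1.load, 1
ret { i32, i32 } %result.fca.1.insert

Once the FCA load is split this way, partitioning and promotion can eliminate the alloca entirely, which is what the CHECK-NOT: alloca line in @test0 verifies.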