-rw-r--r--   lib/Transforms/IPO/PassManagerBuilder.cpp |   2
-rw-r--r--   lib/Transforms/Scalar/SROA.cpp            | 227
-rw-r--r--   test/Transforms/SROA/fca.ll               |  49
3 files changed, 270 insertions, 8 deletions
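Background, not part of the patch itself: "FCA" throughout this commit stands for "first-class aggregate" — a struct or array value loaded or stored as a single unit. A minimal sketch of the IR pattern the new rewriter targets, mirroring @test0 in the added test (names here are illustrative):

; An aggregate (FCA) load and store of a two-field struct.
%v = load { i32, i32 }* %p
store { i32, i32 } %v, { i32, i32 }* %q

The new AggLoadStoreRewriter pre-splits such operations into per-field scalar loads and stores so that the partitioning logic in the diff below only ever sees single-value types.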
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index c81b333813..105b2886d9 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -41,7 +41,7 @@ UseGVNAfterVectorization("use-gvn-after-vectorization",
   cl::desc("Run GVN instead of Early CSE after vectorization passes"));
 
 static cl::opt<bool> UseNewSROA("use-new-sroa",
-                                cl::init(false), cl::Hidden,
+                                cl::init(true), cl::Hidden,
                                 cl::desc("Enable the new, experimental SROA pass"));
 
 PassManagerBuilder::PassManagerBuilder() {
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index a7d8ee7e68..cdbd9f9f23 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -569,14 +569,19 @@ private:
   }
 
   bool visitLoadInst(LoadInst &LI) {
+    assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
+           "All simple FCA loads should have been pre-split");
     return handleLoadOrStore(LI.getType(), LI, Offset);
   }
 
   bool visitStoreInst(StoreInst &SI) {
-    if (SI.getOperand(0) == *U)
+    Value *ValOp = SI.getValueOperand();
+    if (ValOp == *U)
       return markAsEscaping(SI);
 
-    return handleLoadOrStore(SI.getOperand(0)->getType(), SI, Offset);
+    assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
+           "All simple FCA stores should have been pre-split");
+    return handleLoadOrStore(ValOp->getType(), SI, Offset);
   }
@@ -1051,10 +1056,10 @@ AllocaPartitioning::AllocaPartitioning(const TargetData &TD, AllocaInst &AI)
 Type *AllocaPartitioning::getCommonType(iterator I) const {
   Type *Ty = 0;
   for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) {
-    if (isa<MemIntrinsic>(*UI->User))
+    if (isa<IntrinsicInst>(*UI->User))
       continue;
     if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset)
-      break;
+      return 0;
 
     Type *UserTy = 0;
     if (LoadInst *LI = dyn_cast<LoadInst>(&*UI->User)) {
@@ -2409,6 +2414,208 @@ private:
 };
 }
 
+namespace {
+/// \brief Visitor to rewrite aggregate loads and stores as scalar.
+///
+/// This pass aggressively rewrites all aggregate loads and stores on
+/// a particular pointer (or any pointer derived from it which we can identify)
+/// with scalar loads and stores.
+class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
+  // Befriend the base class so it can delegate to private visit methods.
+  friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;
+
+  const TargetData &TD;
+
+  /// Queue of pointer uses to analyze and potentially rewrite.
+  SmallVector<Use *, 8> Queue;
+
+  /// Set to prevent us from cycling with phi nodes and loops.
+  SmallPtrSet<User *, 8> Visited;
+
+  /// The current pointer use being rewritten. This is used to dig up the used
+  /// value (as opposed to the user).
+  Use *U;
+
+public:
+  AggLoadStoreRewriter(const TargetData &TD) : TD(TD) {}
+
+  /// Rewrite loads and stores through a pointer and all pointers derived from
+  /// it.
+  bool rewrite(Instruction &I) {
+    DEBUG(dbgs() << "  Rewriting FCA loads and stores...\n");
+    enqueueUsers(I);
+    bool Changed = false;
+    while (!Queue.empty()) {
+      U = Queue.pop_back_val();
+      Changed |= visit(cast<Instruction>(U->getUser()));
+    }
+    return Changed;
+  }
+
+private:
+  /// Enqueue all the users of the given instruction for further processing.
+  /// This uses a set to de-duplicate users.
+  void enqueueUsers(Instruction &I) {
+    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
+         ++UI)
+      if (Visited.insert(*UI))
+        Queue.push_back(&UI.getUse());
+  }
+
+  // Conservative default is to not rewrite anything.
+  bool visitInstruction(Instruction &I) { return false; }
+
+  /// \brief Generic recursive split emission routine.
+  ///
+  /// This method recursively splits an aggregate op (load or store) into
+  /// scalar or vector ops. It splits recursively until it hits a single value
+  /// and emits that single value operation via the template argument.
+  ///
+  /// The logic of this routine relies on GEPs and insertvalue and extractvalue
+  /// all operating with the same fundamental index list, merely formatted
+  /// differently (GEPs need actual values).
+  ///
+  /// \param IRB The builder used to form new instructions.
+  /// \param Ty The type being split recursively into smaller ops.
+  /// \param Agg The aggregate value being built up or stored, depending on
+  ///        whether this is splitting a load or a store respectively.
+  /// \param Ptr The base pointer of the original op, used as a base for GEPing
+  ///        the split operations.
+  /// \param Indices The indices to be used with insert- or extractvalue to
+  ///        select the appropriate value within the aggregate for \p Ty.
+  /// \param GEPIndices The indices to a GEP instruction which will move \p Ptr
+  ///        to the correct slot within the aggregate for \p Ty.
+  template <void (AggLoadStoreRewriter::*emitFunc)(
+                IRBuilder<> &IRB, Type *Ty, Value *&Agg, Value *Ptr,
+                ArrayRef<unsigned> Indices, ArrayRef<Value *> GEPIndices,
+                const Twine &Name)>
+  void emitSplitOps(IRBuilder<> &IRB, Type *Ty, Value *&Agg, Value *Ptr,
+                    SmallVectorImpl<unsigned> &Indices,
+                    SmallVectorImpl<Value *> &GEPIndices,
+                    const Twine &Name) {
+    if (Ty->isSingleValueType())
+      return (this->*emitFunc)(IRB, Ty, Agg, Ptr, Indices, GEPIndices, Name);
+
+    if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+      unsigned OldSize = Indices.size();
+      (void)OldSize;
+      for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
+           ++Idx) {
+        assert(Indices.size() == OldSize && "Did not return to the old size");
+        Indices.push_back(Idx);
+        GEPIndices.push_back(IRB.getInt32(Idx));
+        emitSplitOps<emitFunc>(IRB, ATy->getElementType(), Agg, Ptr,
+                               Indices, GEPIndices, Name + "." + Twine(Idx));
+        GEPIndices.pop_back();
+        Indices.pop_back();
+      }
+      return;
+    }
+
+    if (StructType *STy = dyn_cast<StructType>(Ty)) {
+      unsigned OldSize = Indices.size();
+      (void)OldSize;
+      for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
+           ++Idx) {
+        assert(Indices.size() == OldSize && "Did not return to the old size");
+        Indices.push_back(Idx);
+        GEPIndices.push_back(IRB.getInt32(Idx));
+        emitSplitOps<emitFunc>(IRB, STy->getElementType(Idx), Agg, Ptr,
+                               Indices, GEPIndices, Name + "." + Twine(Idx));
+        GEPIndices.pop_back();
+        Indices.pop_back();
+      }
+      return;
+    }
+
+    llvm_unreachable("Only arrays and structs are aggregate loadable types");
+  }
+
+  /// Emit a leaf load of a single value. This is called at the leaves of the
+  /// recursive emission to actually load values.
+  void emitLoad(IRBuilder<> &IRB, Type *Ty, Value *&Agg, Value *Ptr,
+                ArrayRef<unsigned> Indices, ArrayRef<Value *> GEPIndices,
+                const Twine &Name) {
+    assert(Ty->isSingleValueType());
+    // Load the single value and insert it using the indices.
+    Value *Load = IRB.CreateLoad(IRB.CreateInBoundsGEP(Ptr, GEPIndices,
+                                                       Name + ".gep"),
+                                 Name + ".load");
+    Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
+    DEBUG(dbgs() << "          to: " << *Load << "\n");
+  }
+
+  bool visitLoadInst(LoadInst &LI) {
+    assert(LI.getPointerOperand() == *U);
+    if (!LI.isSimple() || LI.getType()->isSingleValueType())
+      return false;
+
+    // We have an aggregate being loaded, split it apart.
+    DEBUG(dbgs() << "    original: " << LI << "\n");
+    IRBuilder<> IRB(&LI);
+    Value *V = UndefValue::get(LI.getType());
+    SmallVector<unsigned, 4> Indices;
+    SmallVector<Value *, 4> GEPIndices;
+    GEPIndices.push_back(IRB.getInt32(0));
+    emitSplitOps<&AggLoadStoreRewriter::emitLoad>(
+        IRB, LI.getType(), V, *U, Indices, GEPIndices, LI.getName() + ".fca");
+    LI.replaceAllUsesWith(V);
+    LI.eraseFromParent();
+    return true;
+  }
+
+  /// Emit a leaf store of a single value. This is called at the leaves of the
+  /// recursive emission to actually produce stores.
+  void emitStore(IRBuilder<> &IRB, Type *Ty, Value *&Agg, Value *Ptr,
+                 ArrayRef<unsigned> Indices, ArrayRef<Value *> GEPIndices,
+                 const Twine &Name) {
+    assert(Ty->isSingleValueType());
+    // Extract the single value and store it using the indices.
+    Value *Store = IRB.CreateStore(
+        IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
+        IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"));
+    DEBUG(dbgs() << "          to: " << *Store << "\n");
+  }
+
+  bool visitStoreInst(StoreInst &SI) {
+    if (!SI.isSimple() || SI.getPointerOperand() != *U)
+      return false;
+    Value *V = SI.getValueOperand();
+    if (V->getType()->isSingleValueType())
+      return false;
+
+    // We have an aggregate being stored, split it apart.
+    DEBUG(dbgs() << "    original: " << SI << "\n");
+    IRBuilder<> IRB(&SI);
+    SmallVector<unsigned, 4> Indices;
+    SmallVector<Value *, 4> GEPIndices;
+    GEPIndices.push_back(IRB.getInt32(0));
+    emitSplitOps<&AggLoadStoreRewriter::emitStore>(
+        IRB, V->getType(), V, *U, Indices, GEPIndices, V->getName() + ".fca");
+    SI.eraseFromParent();
+    return true;
+  }
+
+  bool visitBitCastInst(BitCastInst &BC) {
+    enqueueUsers(BC);
+    return false;
+  }
+
+  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+    enqueueUsers(GEPI);
+    return false;
+  }
+
+  bool visitPHINode(PHINode &PN) {
+    enqueueUsers(PN);
+    return false;
+  }
+
+  bool visitSelectInst(SelectInst &SI) {
+    enqueueUsers(SI);
+    return false;
+  }
+};
+}
+
 /// \brief Try to find a partition of the aggregate type passed in for a given
 /// offset and size.
 ///
@@ -2637,18 +2844,24 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
     return false;
   }
 
+  bool Changed = false;
+
+  // First, split any FCA loads and stores touching this alloca to promote
+  // better splitting and promotion opportunities.
+  AggLoadStoreRewriter AggRewriter(*TD);
+  Changed |= AggRewriter.rewrite(AI);
+
   // Build the partition set using a recursive instruction-visiting builder.
   AllocaPartitioning P(*TD, AI);
   DEBUG(P.print(dbgs()));
   if (P.isEscaped())
-    return false;
+    return Changed;
 
   // No partitions to split. Leave the dead alloca for a later pass to clean up.
   if (P.begin() == P.end())
-    return false;
+    return Changed;
 
   // Delete all the dead users of this alloca before splitting and rewriting it.
-  bool Changed = false;
   for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(),
                                               DE = P.dead_user_end();
        DI != DE; ++DI) {
diff --git a/test/Transforms/SROA/fca.ll b/test/Transforms/SROA/fca.ll
new file mode 100644
index 0000000000..6ddaed2f30
--- /dev/null
+++ b/test/Transforms/SROA/fca.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+; RUN: opt < %s -sroa -force-ssa-updater -S | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+define { i32, i32 } @test0(i32 %x, i32 %y) {
+; CHECK: @test0
+; CHECK-NOT: alloca
+; CHECK: insertvalue { i32, i32 }
+; CHECK: insertvalue { i32, i32 }
+; CHECK: ret { i32, i32 }
+
+entry:
+  %a = alloca { i32, i32 }
+
+  store { i32, i32 } undef, { i32, i32 }* %a
+
+  %gep1 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 0
+  store i32 %x, i32* %gep1
+  %gep2 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 1
+  store i32 %y, i32* %gep2
+
+  %result = load { i32, i32 }* %a
+  ret { i32, i32 } %result
+}
+
+define { i32, i32 } @test1(i32 %x, i32 %y) {
+; FIXME: This may be too conservative. Duncan argues that we are allowed to
+; split the volatile load and store here but must produce volatile scalar loads
+; and stores from them.
+; CHECK: @test1
+; CHECK: alloca
+; CHECK: alloca
+; CHECK: load volatile { i32, i32 }*
+; CHECK: store volatile { i32, i32 }
+; CHECK: ret { i32, i32 }
+
+entry:
+  %a = alloca { i32, i32 }
+  %b = alloca { i32, i32 }
+
+  %gep1 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 0
+  store i32 %x, i32* %gep1
+  %gep2 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 1
+  store i32 %y, i32* %gep2
+
+  %result = load volatile { i32, i32 }* %a
+  store volatile { i32, i32 } %result, { i32, i32 }* %b
+  ret { i32, i32 } %result
+}
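For reference, here is roughly the scalarized IR the new AggLoadStoreRewriter leaves behind for the aggregate load in @test0 above, before the rest of SROA promotes the alloca away. This is a hand-written sketch following the Name + ".gep" / ".load" / ".insert" naming scheme in emitLoad; the value names are illustrative, not captured pass output:

; Sketch: %result = load { i32, i32 }* %a, pre-split into scalar loads
; feeding an insertvalue chain. All names below are hypothetical.
%result.fca.0.gep = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 0
%result.fca.0.load = load i32* %result.fca.0.gep
%result.fca.0.insert = insertvalue { i32, i32 } undef, i32 %result.fca.0.load, 0
%result.fca.1.gep = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 1
%result.fca.1.load = load i32* %result.fca.1.gep
%result.fca.1.insert = insertvalue { i32, i32 } %result.fca.0.insert, i32 %result.fca.1.load, 1
ret { i32, i32 } %result.fca.1.insert

Once the FCA load is split this way, partitioning and promotion can eliminate the alloca entirely, which is what the CHECK-NOT: alloca line in @test0 verifies.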