19 files changed, 279 insertions, 215 deletions
diff --git a/test/CodeGen/ARM/arm-negative-stride.ll b/test/CodeGen/ARM/arm-negative-stride.ll
index 72ec8efcc4..52ab8717c1 100644
--- a/test/CodeGen/ARM/arm-negative-stride.ll
+++ b/test/CodeGen/ARM/arm-negative-stride.ll
@@ -1,7 +1,32 @@
 ; RUN: llc < %s -march=arm | FileCheck %s
 
+; This loop is rewritten with an indvar which counts down, which
+; frees up a register from holding the trip count.
+
 define void @test(i32* %P, i32 %A, i32 %i) nounwind {
 entry:
+; CHECK: str r1, [{{r.*}}, +{{r.*}}, lsl #2]
+        icmp eq i32 %i, 0               ; <i1>:0 [#uses=1]
+        br i1 %0, label %return, label %bb
+
+bb:             ; preds = %bb, %entry
+        %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %bb ]          ; <i32> [#uses=2]
+        %i_addr.09.0 = sub i32 %i, %indvar              ; <i32> [#uses=1]
+        %tmp2 = getelementptr i32* %P, i32 %i_addr.09.0         ; <i32*> [#uses=1]
+        store i32 %A, i32* %tmp2
+        %indvar.next = add i32 %indvar, 1               ; <i32> [#uses=2]
+        icmp eq i32 %indvar.next, %i            ; <i1>:1 [#uses=1]
+        br i1 %1, label %return, label %bb
+
+return:         ; preds = %bb, %entry
+        ret void
+}
+
+; This loop has a non-address use of the count-up indvar, so
+; it'll remain. Now the original store uses a negative-stride address.
+
+define void @test_with_forced_iv(i32* %P, i32 %A, i32 %i) nounwind {
+entry:
 ; CHECK: str r1, [{{r.*}}, -{{r.*}}, lsl #2]
         icmp eq i32 %i, 0               ; <i1>:0 [#uses=1]
         br i1 %0, label %return, label %bb
@@ -11,6 +36,7 @@ bb:             ; preds = %bb, %entry
         %i_addr.09.0 = sub i32 %i, %indvar              ; <i32> [#uses=1]
         %tmp2 = getelementptr i32* %P, i32 %i_addr.09.0         ; <i32*> [#uses=1]
         store i32 %A, i32* %tmp2
+        store i32 %indvar, i32* null
         %indvar.next = add i32 %indvar, 1               ; <i32> [#uses=2]
         icmp eq i32 %indvar.next, %i            ; <i1>:1 [#uses=1]
         br i1 %1, label %return, label %bb
diff --git a/test/CodeGen/ARM/lsr-code-insertion.ll b/test/CodeGen/ARM/lsr-code-insertion.ll
index 507ec2c7bd..1bbb96deee 100644
--- a/test/CodeGen/ARM/lsr-code-insertion.ll
+++ b/test/CodeGen/ARM/lsr-code-insertion.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -stats |& grep {40.*Number of machine instrs printed}
-; RUN: llc < %s -stats |& grep {.*Number of re-materialization}
+; RUN: llc < %s -stats |& grep {39.*Number of machine instrs printed}
+; RUN: llc < %s -stats |& not grep {.*Number of re-materialization}
 ; This test really wants to check that the resultant "cond_true" block only 
 ; has a single store in it, and that cond_true55 only has code to materialize 
 ; the constant and do a store.  We do *not* want something like this:
diff --git a/test/CodeGen/ARM/remat.ll b/test/CodeGen/ARM/remat.ll
index 9565c8bca6..9072bcb762 100644
--- a/test/CodeGen/ARM/remat.ll
+++ b/test/CodeGen/ARM/remat.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -mtriple=arm-apple-darwin 
-; RUN: llc < %s -mtriple=arm-apple-darwin -stats -info-output-file - | grep "Number of re-materialization" | grep 3
+; RUN: llc < %s -mtriple=arm-apple-darwin -stats -info-output-file - | not grep "Number of re-materialization"
 
 	%struct.CONTENTBOX = type { i32, i32, i32, i32, i32 }
 	%struct.LOCBOX = type { i32, i32, i32, i32 }
diff --git a/test/CodeGen/Thumb2/lsr-deficiency.ll b/test/CodeGen/Thumb2/lsr-deficiency.ll
index 7b1b57a786..ac2cd34e4b 100644
--- a/test/CodeGen/Thumb2/lsr-deficiency.ll
+++ b/test/CodeGen/Thumb2/lsr-deficiency.ll
@@ -1,25 +1,29 @@
 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -relocation-model=pic | FileCheck %s
 ; rdar://7387640
 
-; FIXME: We still need to rewrite array reference iv of stride -4 with loop
-; count iv of stride -1.
+; This now reduces to a single induction variable.
+
+; TODO: It still gets a GPR shuffle at the end of the loop.
+; This is because something in instruction selection has decided
+; that comparing the pre-incremented value with zero is better
+; than comparing the post-incremented value with -4.
 
 @G = external global i32                          ; <i32*> [#uses=2]
 @array = external global i32*                     ; <i32**> [#uses=1]
 
 define arm_apcscc void @t() nounwind optsize {
 ; CHECK: t:
-; CHECK: mov.w r2, #4000
-; CHECK: movw r3, #1001
+; CHECK: mov.w r2, #1000
 entry:
   %.pre = load i32* @G, align 4                   ; <i32> [#uses=1]
   br label %bb
 
 bb:                                               ; preds = %bb, %entry
 ; CHECK: LBB1_1:
-; CHECK: subs r3, #1
-; CHECK: cmp r3, #0
-; CHECK: sub.w r2, r2, #4
+; CHECK: cmp r2, #0
+; CHECK: sub.w r9, r2, #1
+; CHECK: mov r2, r9
+
   %0 = phi i32 [ %.pre, %entry ], [ %3, %bb ]     ; <i32> [#uses=1]
   %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %bb ] ; <i32> [#uses=2]
   %tmp5 = sub i32 1000, %indvar                   ; <i32> [#uses=1]
diff --git a/test/CodeGen/Thumb2/thumb2-ifcvt1.ll b/test/CodeGen/Thumb2/thumb2-ifcvt1.ll
index 71199abc57..1d267565e0 100644
--- a/test/CodeGen/Thumb2/thumb2-ifcvt1.ll
+++ b/test/CodeGen/Thumb2/thumb2-ifcvt1.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin | FileCheck %s
 
-define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) {
+define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
 ; CHECK: t1:
 ; CHECK: it ne
 ; CHECK: cmpne
@@ -20,12 +20,12 @@ cond_next:
 }
 
 ; FIXME: Check for # of unconditional branch after adding branch folding post ifcvt.
-define i32 @t2(i32 %a, i32 %b) {
+define i32 @t2(i32 %a, i32 %b) nounwind {
 entry:
 ; CHECK: t2:
-; CHECK: ite le
-; CHECK: suble
+; CHECK: ite gt
 ; CHECK: subgt
+; CHECK: suble
 	%tmp1434 = icmp eq i32 %a, %b		; <i1> [#uses=1]
 	br i1 %tmp1434, label %bb17, label %bb.outer
 
@@ -60,14 +60,14 @@ bb17:		; preds = %cond_false, %cond_true, %entry
 
 @x = external global i32*		; <i32**> [#uses=1]
 
-define void @foo(i32 %a) {
+define void @foo(i32 %a) nounwind {
 entry:
 	%tmp = load i32** @x		; <i32*> [#uses=1]
 	store i32 %a, i32* %tmp
 	ret void
 }
 
-define void @t3(i32 %a, i32 %b) {
+define void @t3(i32 %a, i32 %b) nounwind {
 entry:
 ; CHECK: t3:
 ; CHECK: it lt
diff --git a/test/CodeGen/X86/2006-05-11-InstrSched.ll b/test/CodeGen/X86/2006-05-11-InstrSched.ll
index bdbe713a29..56d6aa960e 100644
--- a/test/CodeGen/X86/2006-05-11-InstrSched.ll
+++ b/test/CodeGen/X86/2006-05-11-InstrSched.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse2 -stats -realign-stack=0 |&\
-; RUN:     grep {asm-printer} | grep 31
+; RUN:     grep {asm-printer} | grep 34
 
 target datalayout = "e-p:32:32"
 define void @foo(i32* %mc, i32* %bp, i32* %ms, i32* %xmb, i32* %mpp, i32* %tpmm, i32* %ip, i32* %tpim, i32* %dpp, i32* %tpdm, i32* %bpi, i32 %M) nounwind {
@@ -40,7 +40,7 @@ cond_true:		; preds = %cond_true, %entry
 	%tmp137.upgrd.7 = bitcast i32* %tmp137 to <2 x i64>*		; <<2 x i64>*> [#uses=1]
 	store <2 x i64> %tmp131, <2 x i64>* %tmp137.upgrd.7
 	%tmp147 = add nsw i32 %tmp.10, 8		; <i32> [#uses=1]
-	%tmp.upgrd.8 = icmp slt i32 %tmp147, %M		; <i1> [#uses=1]
+	%tmp.upgrd.8 = icmp ne i32 %tmp147, %M		; <i1> [#uses=1]
 	%indvar.next = add i32 %indvar, 1		; <i32> [#uses=1]
 	br i1 %tmp.upgrd.8, label %cond_true, label %return
 
diff --git a/test/CodeGen/X86/2007-08-13-SpillerReuse.ll b/test/CodeGen/X86/2007-08-13-SpillerReuse.ll
deleted file mode 100644
index d6ea5109d1..0000000000
--- a/test/CodeGen/X86/2007-08-13-SpillerReuse.ll
+++ /dev/null
@@ -1,102 +0,0 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin | grep "48(%esp)" | count 5
-
-	%struct..0anon = type { i32 }
-	%struct.rtvec_def = type { i32, [1 x %struct..0anon] }
-	%struct.rtx_def = type { i16, i8, i8, [1 x %struct..0anon] }
-@rtx_format = external global [116 x i8*]		; <[116 x i8*]*> [#uses=1]
-@rtx_length = external global [117 x i32]		; <[117 x i32]*> [#uses=1]
-
-declare %struct.rtx_def* @fixup_memory_subreg(%struct.rtx_def*, %struct.rtx_def*, i32)
-
-define %struct.rtx_def* @walk_fixup_memory_subreg(%struct.rtx_def* %x, %struct.rtx_def* %insn) {
-entry:
-	%tmp2 = icmp eq %struct.rtx_def* %x, null		; <i1> [#uses=1]
-	br i1 %tmp2, label %UnifiedReturnBlock, label %cond_next
-
-cond_next:		; preds = %entry
-	%tmp6 = getelementptr %struct.rtx_def* %x, i32 0, i32 0		; <i16*> [#uses=1]
-	%tmp7 = load i16* %tmp6		; <i16> [#uses=2]
-	%tmp78 = zext i16 %tmp7 to i32		; <i32> [#uses=2]
-	%tmp10 = icmp eq i16 %tmp7, 54		; <i1> [#uses=1]
-	br i1 %tmp10, label %cond_true13, label %cond_next32
-
-cond_true13:		; preds = %cond_next
-	%tmp15 = getelementptr %struct.rtx_def* %x, i32 0, i32 3		; <[1 x %struct..0anon]*> [#uses=1]
-	%tmp1718 = bitcast [1 x %struct..0anon]* %tmp15 to %struct.rtx_def**		; <%struct.rtx_def**> [#uses=1]
-	%tmp19 = load %struct.rtx_def** %tmp1718		; <%struct.rtx_def*> [#uses=1]
-	%tmp20 = getelementptr %struct.rtx_def* %tmp19, i32 0, i32 0		; <i16*> [#uses=1]
-	%tmp21 = load i16* %tmp20		; <i16> [#uses=1]
-	%tmp22 = icmp eq i16 %tmp21, 57		; <i1> [#uses=1]
-	br i1 %tmp22, label %cond_true25, label %cond_next32
-
-cond_true25:		; preds = %cond_true13
-	%tmp29 = tail call %struct.rtx_def* @fixup_memory_subreg( %struct.rtx_def* %x, %struct.rtx_def* %insn, i32 1 )		; <%struct.rtx_def*> [#uses=1]
-	ret %struct.rtx_def* %tmp29
-
-cond_next32:		; preds = %cond_true13, %cond_next
-	%tmp34 = getelementptr [116 x i8*]* @rtx_format, i32 0, i32 %tmp78		; <i8**> [#uses=1]
-	%tmp35 = load i8** %tmp34, align 4		; <i8*> [#uses=1]
-	%tmp37 = getelementptr [117 x i32]* @rtx_length, i32 0, i32 %tmp78		; <i32*> [#uses=1]
-	%tmp38 = load i32* %tmp37, align 4		; <i32> [#uses=1]
-	%i.011 = add i32 %tmp38, -1		; <i32> [#uses=2]
-	%tmp12513 = icmp sgt i32 %i.011, -1		; <i1> [#uses=1]
-	br i1 %tmp12513, label %bb, label %UnifiedReturnBlock
-
-bb:		; preds = %bb123, %cond_next32
-	%indvar = phi i32 [ %indvar.next26, %bb123 ], [ 0, %cond_next32 ]		; <i32> [#uses=2]
-	%i.01.0 = sub i32 %i.011, %indvar		; <i32> [#uses=5]
-	%tmp42 = getelementptr i8* %tmp35, i32 %i.01.0		; <i8*> [#uses=2]
-	%tmp43 = load i8* %tmp42		; <i8> [#uses=1]
-	switch i8 %tmp43, label %bb123 [
-		 i8 101, label %cond_true47
-		 i8 69, label %bb105.preheader
-	]
-
-cond_true47:		; preds = %bb
-	%tmp52 = getelementptr %struct.rtx_def* %x, i32 0, i32 3, i32 %i.01.0		; <%struct..0anon*> [#uses=1]
-	%tmp5354 = bitcast %struct..0anon* %tmp52 to %struct.rtx_def**		; <%struct.rtx_def**> [#uses=1]
-	%tmp55 = load %struct.rtx_def** %tmp5354		; <%struct.rtx_def*> [#uses=1]
-	%tmp58 = tail call  %struct.rtx_def* @walk_fixup_memory_subreg( %struct.rtx_def* %tmp55, %struct.rtx_def* %insn )		; <%struct.rtx_def*> [#uses=1]
-	%tmp62 = getelementptr %struct.rtx_def* %x, i32 0, i32 3, i32 %i.01.0, i32 0		; <i32*> [#uses=1]
-	%tmp58.c = ptrtoint %struct.rtx_def* %tmp58 to i32		; <i32> [#uses=1]
-	store i32 %tmp58.c, i32* %tmp62
-	%tmp6816 = load i8* %tmp42		; <i8> [#uses=1]
-	%tmp6917 = icmp eq i8 %tmp6816, 69		; <i1> [#uses=1]
-	br i1 %tmp6917, label %bb105.preheader, label %bb123
-
-bb105.preheader:		; preds = %cond_true47, %bb
-	%tmp11020 = getelementptr %struct.rtx_def* %x, i32 0, i32 3, i32 %i.01.0		; <%struct..0anon*> [#uses=1]
-	%tmp11111221 = bitcast %struct..0anon* %tmp11020 to %struct.rtvec_def**		; <%struct.rtvec_def**> [#uses=3]
-	%tmp11322 = load %struct.rtvec_def** %tmp11111221		; <%struct.rtvec_def*> [#uses=1]
-	%tmp11423 = getelementptr %struct.rtvec_def* %tmp11322, i32 0, i32 0		; <i32*> [#uses=1]
-	%tmp11524 = load i32* %tmp11423		; <i32> [#uses=1]
-	%tmp11625 = icmp eq i32 %tmp11524, 0		; <i1> [#uses=1]
-	br i1 %tmp11625, label %bb123, label %bb73
-
-bb73:		; preds = %bb73, %bb105.preheader
-	%j.019 = phi i32 [ %tmp104, %bb73 ], [ 0, %bb105.preheader ]		; <i32> [#uses=3]
-	%tmp81 = load %struct.rtvec_def** %tmp11111221		; <%struct.rtvec_def*> [#uses=2]
-	%tmp92 = getelementptr %struct.rtvec_def* %tmp81, i32 0, i32 1, i32 %j.019		; <%struct..0anon*> [#uses=1]
-	%tmp9394 = bitcast %struct..0anon* %tmp92 to %struct.rtx_def**		; <%struct.rtx_def**> [#uses=1]
-	%tmp95 = load %struct.rtx_def** %tmp9394		; <%struct.rtx_def*> [#uses=1]
-	%tmp98 = tail call  %struct.rtx_def* @walk_fixup_memory_subreg( %struct.rtx_def* %tmp95, %struct.rtx_def* %insn )		; <%struct.rtx_def*> [#uses=1]
-	%tmp101 = getelementptr %struct.rtvec_def* %tmp81, i32 0, i32 1, i32 %j.019, i32 0		; <i32*> [#uses=1]
-	%tmp98.c = ptrtoint %struct.rtx_def* %tmp98 to i32		; <i32> [#uses=1]
-	store i32 %tmp98.c, i32* %tmp101
-	%tmp104 = add i32 %j.019, 1		; <i32> [#uses=2]
-	%tmp113 = load %struct.rtvec_def** %tmp11111221		; <%struct.rtvec_def*> [#uses=1]
-	%tmp114 = getelementptr %struct.rtvec_def* %tmp113, i32 0, i32 0		; <i32*> [#uses=1]
-	%tmp115 = load i32* %tmp114		; <i32> [#uses=1]
-	%tmp116 = icmp ult i32 %tmp104, %tmp115		; <i1> [#uses=1]
-	br i1 %tmp116, label %bb73, label %bb123
-
-bb123:		; preds = %bb73, %bb105.preheader, %cond_true47, %bb
-	%i.0 = add i32 %i.01.0, -1		; <i32> [#uses=1]
-	%tmp125 = icmp sgt i32 %i.0, -1		; <i1> [#uses=1]
-	%indvar.next26 = add i32 %indvar, 1		; <i32> [#uses=1]
-	br i1 %tmp125, label %bb, label %UnifiedReturnBlock
-
-UnifiedReturnBlock:		; preds = %bb123, %cond_next32, %entry
-	%UnifiedRetVal = phi %struct.rtx_def* [ null, %entry ], [ %x, %cond_next32 ], [ %x, %bb123 ]		; <%struct.rtx_def*> [#uses=1]
-	ret %struct.rtx_def* %UnifiedRetVal
-}
diff --git a/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll b/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
index 721d4c945b..8e315f4d80 100644
--- a/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
+++ b/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
@@ -35,7 +35,7 @@ cond_next36.i:		; preds = %cond_next.i
 bb.i28.i:		; preds = %bb.i28.i, %cond_next36.i
 ; CHECK: %bb.i28.i
 ; CHECK: addl $2
-; CHECK: addl $2
+; CHECK: addl $-2
 	%j.0.reg2mem.0.i16.i = phi i32 [ 0, %cond_next36.i ], [ %indvar.next39.i, %bb.i28.i ]		; <i32> [#uses=2]
 	%din_addr.1.reg2mem.0.i17.i = phi double [ 0.000000e+00, %cond_next36.i ], [ %tmp16.i25.i, %bb.i28.i ]		; <double> [#uses=1]
 	%tmp1.i18.i = fptosi double %din_addr.1.reg2mem.0.i17.i to i32		; <i32> [#uses=2]
diff --git a/test/CodeGen/X86/2009-09-10-SpillComments.ll b/test/CodeGen/X86/2009-09-10-SpillComments.ll
index 1dd9990e71..f9ca861c55 100644
--- a/test/CodeGen/X86/2009-09-10-SpillComments.ll
+++ b/test/CodeGen/X86/2009-09-10-SpillComments.ll
@@ -1,5 +1,11 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux | FileCheck %s
 
+; This test shouldn't require spills: nothing may reference %rsp between the frame setup and teardown.
+
+; CHECK: subq  $8, %rsp
+; CHECK-NOT: %rsp
+; CHECK: addq  $8, %rsp
+
 	%struct..0anon = type { i32 }
 	%struct.rtvec_def = type { i32, [1 x %struct..0anon] }
 	%struct.rtx_def = type { i16, i8, i8, [1 x %struct..0anon] }
@@ -10,9 +16,6 @@ declare %struct.rtx_def* @fixup_memory_subreg(%struct.rtx_def*, %struct.rtx_def*
 
 define %struct.rtx_def* @walk_fixup_memory_subreg(%struct.rtx_def* %x, %struct.rtx_def* %insn) {
 entry:
-; CHECK: Spill
-; CHECK: Folded Spill
-; CHECK: Reload
 	%tmp2 = icmp eq %struct.rtx_def* %x, null		; <i1> [#uses=1]
 	br i1 %tmp2, label %UnifiedReturnBlock, label %cond_next
 
diff --git a/test/CodeGen/X86/full-lsr.ll b/test/CodeGen/X86/full-lsr.ll
index 68575bc401..3bd58b65be 100644
--- a/test/CodeGen/X86/full-lsr.ll
+++ b/test/CodeGen/X86/full-lsr.ll
@@ -1,6 +1,12 @@
-; RUN: llc < %s -march=x86 -enable-full-lsr >%t
-; RUN: grep {addl	\\\$4,} %t | count 3
-; RUN: not grep {,%} %t
+; RUN: llc < %s -march=x86 >%t
+
+; TODO: Enhance full lsr mode to get this:
+; RUNX: grep {addl	\\\$4,} %t | count 3
+; RUNX: not grep {,%} %t
+
+; For now, it should find this, which is still pretty good:
+; RUN: not grep {addl	\\\$4,} %t
+; RUN: grep {,%} %t | count 6
 
 define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind {
 entry:
diff --git a/test/CodeGen/X86/iv-users-in-other-loops.ll b/test/CodeGen/X86/iv-users-in-other-loops.ll
index c695c29e06..0410bc0d9a 100644
--- a/test/CodeGen/X86/iv-users-in-other-loops.ll
+++ b/test/CodeGen/X86/iv-users-in-other-loops.ll
@@ -1,11 +1,11 @@
 ; RUN: llc < %s -march=x86-64 -o %t
-; RUN: grep inc %t | count 1
+; RUN: not grep inc %t
 ; RUN: grep dec %t | count 2
-; RUN: grep addq %t | count 13
+; RUN: grep addq %t | count 10
 ; RUN: not grep addb %t
 ; RUN: grep leaq %t | count 9
-; RUN: grep leal %t | count 3
-; RUN: grep movq %t | count 5
+; RUN: grep leal %t | count 2
+; RUN: grep movq %t | count 10
 
 ; IV users in each of the loops from other loops shouldn't cause LSR
 ; to insert new induction variables. Previously it would create a
diff --git a/test/CodeGen/X86/loop-strength-reduce4.ll b/test/CodeGen/X86/loop-strength-reduce4.ll
index 07e46eca75..6c0eb8c0df 100644
--- a/test/CodeGen/X86/loop-strength-reduce4.ll
+++ b/test/CodeGen/X86/loop-strength-reduce4.ll
@@ -1,5 +1,19 @@
-; RUN: llc < %s -march=x86 | grep cmp | grep 64
-; RUN: llc < %s -march=x86 | not grep inc
+; RUN: llc < %s -march=x86 -relocation-model=static -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=STATIC
+; RUN: llc < %s -march=x86 -relocation-model=pic | FileCheck %s -check-prefix=PIC
+
+; By starting the IV at -64 instead of 0, a cmp is eliminated,
+; as the flags from the add can be used directly.
+
+; STATIC: movl    $-64, %ecx
+
+; STATIC: movl    %eax, _state+76(%ecx)
+; STATIC: addl    $16, %ecx
+; STATIC: jne
+
+; In PIC mode the symbol can't be folded, so the change-compare-stride
+; trick applies.
+
+; PIC: cmpl $64
 
 @state = external global [0 x i32]		; <[0 x i32]*> [#uses=4]
 @S = external global [0 x i32]		; <[0 x i32]*> [#uses=4]
diff --git a/test/CodeGen/X86/loop-strength-reduce8.ll b/test/CodeGen/X86/loop-strength-reduce8.ll
index e14cd8a99e..6b2247d1d6 100644
--- a/test/CodeGen/X86/loop-strength-reduce8.ll
+++ b/test/CodeGen/X86/loop-strength-reduce8.ll
@@ -1,4 +1,10 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | grep leal | not grep 16
+; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
+
+; CHECK: leal 16(%eax), %edx
+; CHECK: align
+; CHECK: addl    $4, %edx
+; CHECK: decl    %ecx
+; CHECK: jne     LBB1_2
 
 	%struct.CUMULATIVE_ARGS = type { i32, i32, i32, i32, i32, i32, i32 }
 	%struct.bitmap_element = type { %struct.bitmap_element*, %struct.bitmap_element*, i32, [2 x i64] }
diff --git a/test/CodeGen/X86/lsr-reuse.ll b/test/CodeGen/X86/lsr-reuse.ll
new file mode 100644
index 0000000000..a1919bab38
--- /dev/null
+++ b/test/CodeGen/X86/lsr-reuse.ll
@@ -0,0 +1,159 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+target datalayout = "e-p:64:64:64"
+target triple = "x86_64-unknown-unknown"
+
+; Full strength reduction reduces register pressure from 5 to 4 here.
+
+; CHECK: full_me:
+; CHECK: movsd   (%rsi), %xmm0
+; CHECK: mulsd   (%rdx), %xmm0
+; CHECK: movsd   %xmm0, (%rdi)
+; CHECK: addq    $8, %rsi
+; CHECK: addq    $8, %rdx
+; CHECK: addq    $8, %rdi
+; CHECK: decq    %rcx
+; CHECK: jne
+
+define void @full_me(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind { ; A[i] = B[i] * C[i] for i in [0, n)
+entry:
+  %t0 = icmp sgt i64 %n, 0                          ; guard: the loop body runs only when n > 0
+  br i1 %t0, label %loop, label %return
+
+loop:
+  %i = phi i64 [ %i.next, %loop ], [ 0, %entry ]    ; counting IV: starts at 0, steps by 1
+  %Ai = getelementptr inbounds double* %A, i64 %i   ; &A[i]
+  %Bi = getelementptr inbounds double* %B, i64 %i   ; &B[i]
+  %Ci = getelementptr inbounds double* %C, i64 %i   ; &C[i]
+  %t1 = load double* %Bi
+  %t2 = load double* %Ci
+  %m = fmul double %t1, %t2
+  store double %m, double* %Ai
+  %i.next = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n               ; %i is used only for addressing and this exit test
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  ret void
+}
+
+; In this test, the counting IV exit value is used, so full strength reduction
+; would not reduce register pressure. IndVarSimplify ought to simplify such
+; cases away, but it's useful here to verify that LSR's register pressure
+; heuristics are working as expected.
+
+; CHECK: count_me_0:
+; CHECK: movsd   (%rsi,%rax,8), %xmm0
+; CHECK: mulsd   (%rdx,%rax,8), %xmm0
+; CHECK: movsd   %xmm0, (%rdi,%rax,8)
+; CHECK: incq    %rax
+; CHECK: cmpq    %rax, %rcx
+; CHECK: jne
+
+define i64 @count_me_0(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind { ; same loop as @full_me, but returns the IV exit value
+entry:
+  %t0 = icmp sgt i64 %n, 0                          ; guard: the loop body runs only when n > 0
+  br i1 %t0, label %loop, label %return
+
+loop:
+  %i = phi i64 [ %i.next, %loop ], [ 0, %entry ]    ; counting IV: starts at 0, steps by 1
+  %Ai = getelementptr inbounds double* %A, i64 %i   ; &A[i]
+  %Bi = getelementptr inbounds double* %B, i64 %i   ; &B[i]
+  %Ci = getelementptr inbounds double* %C, i64 %i   ; &C[i]
+  %t1 = load double* %Bi
+  %t2 = load double* %Ci
+  %m = fmul double %t1, %t2
+  store double %m, double* %Ai
+  %i.next = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  %q = phi i64 [ 0, %entry ], [ %i.next, %loop ]    ; 0 if the loop was skipped, else the final %i.next (live-out use of the IV)
+  ret i64 %q
+}
+
+; In this test, the trip count value is used, so full strength reduction
+; would not reduce register pressure.
+; (though it would reduce register pressure inside the loop...)
+
+; CHECK: count_me_1:
+; CHECK: movsd   (%rsi,%rax,8), %xmm0
+; CHECK: mulsd   (%rdx,%rax,8), %xmm0
+; CHECK: movsd   %xmm0, (%rdi,%rax,8)
+; CHECK: incq    %rax
+; CHECK: cmpq    %rax, %rcx
+; CHECK: jne
+
+define i64 @count_me_1(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind { ; same loop as @full_me, but returns the trip count %n
+entry:
+  %t0 = icmp sgt i64 %n, 0                          ; guard: the loop body runs only when n > 0
+  br i1 %t0, label %loop, label %return
+
+loop:
+  %i = phi i64 [ %i.next, %loop ], [ 0, %entry ]    ; counting IV: starts at 0, steps by 1
+  %Ai = getelementptr inbounds double* %A, i64 %i   ; &A[i]
+  %Bi = getelementptr inbounds double* %B, i64 %i   ; &B[i]
+  %Ci = getelementptr inbounds double* %C, i64 %i   ; &C[i]
+  %t1 = load double* %Bi
+  %t2 = load double* %Ci
+  %m = fmul double %t1, %t2
+  store double %m, double* %Ai
+  %i.next = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  %q = phi i64 [ 0, %entry ], [ %n, %loop ]         ; 0 if the loop was skipped, else %n (keeps %n live past the loop)
+  ret i64 %q
+}
+
+; This should be fully strength-reduced to reduce register pressure, however
+; the current heuristics get distracted by all the reuse with the stride-1
+; induction variable first.
+
+; But even so, be clever and start the stride-1 variable at a non-zero value
+; to eliminate an in-loop immediate value.
+
+; CHECK: count_me_2:
+; CHECK: movl    $5, %eax
+; CHECK: align
+; CHECK: BB4_1:
+; CHECK: movsd   (%rdi,%rax,8), %xmm0
+; CHECK: addsd   (%rsi,%rax,8), %xmm0
+; CHECK: movsd   %xmm0, (%rdx,%rax,8)
+; CHECK: movsd   40(%rdi,%rax,8), %xmm0
+; CHECK: addsd   40(%rsi,%rax,8), %xmm0
+; CHECK: movsd   %xmm0, 40(%rdx,%rax,8)
+; CHECK: incq    %rax
+; CHECK: cmpq    $5005, %rax
+; CHECK: jne
+
+define void @count_me_2(double* nocapture %A, double* nocapture %B, double* nocapture %C) nounwind { ; C[i+5] = A[i+5] + B[i+5] and C[i+10] = A[i+10] + B[i+10] for i in [0, 5000)
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]    ; counting IV: starts at 0, steps by 1
+  %i5 = add i64 %i, 5                               ; first index: i + 5
+  %Ai = getelementptr double* %A, i64 %i5           ; &A[i+5]
+  %t2 = load double* %Ai
+  %Bi = getelementptr double* %B, i64 %i5           ; &B[i+5]
+  %t4 = load double* %Bi
+  %t5 = fadd double %t2, %t4
+  %Ci = getelementptr double* %C, i64 %i5           ; &C[i+5]
+  store double %t5, double* %Ci
+  %i10 = add i64 %i, 10                             ; second index: i + 10, i.e. 5 elements (40 bytes) past the first
+  %Ai10 = getelementptr double* %A, i64 %i10        ; &A[i+10]
+  %t9 = load double* %Ai10
+  %Bi10 = getelementptr double* %B, i64 %i10        ; &B[i+10]
+  %t11 = load double* %Bi10
+  %t12 = fadd double %t9, %t11
+  %Ci10 = getelementptr double* %C, i64 %i10        ; &C[i+10]
+  store double %t12, double* %Ci10
+  %i.next = add i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, 5000             ; constant trip count of 5000 iterations
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  ret void
+}
diff --git a/test/CodeGen/X86/masked-iv-safe.ll b/test/CodeGen/X86/masked-iv-safe.ll
index bc493bd8f7..7111d687ed 100644
--- a/test/CodeGen/X86/masked-iv-safe.ll
+++ b/test/CodeGen/X86/masked-iv-safe.ll
@@ -4,9 +4,9 @@
 ; RUN: not grep sar %t
 ; RUN: not grep shl %t
 ; RUN: grep add %t | count 2
-; RUN: grep inc %t | count 4
+; RUN: grep inc %t | count 3
 ; RUN: grep dec %t | count 2
-; RUN: grep lea %t | count 2
+; RUN: grep lea %t | count 3
 
 ; Optimize away zext-inreg and sext-inreg on the loop induction
 ; variable using trip-count information.
@@ -127,6 +127,9 @@ return:
 	ret void
 }
 
+; TODO: If we could handle all the loads and stores as post-inc users, we could
+; use {-1,+,1} in the induction variable register, and we'd get another inc,
+; one fewer add, and a comparison with zero.
 define void @another_count_up(double* %d, i64 %n) nounwind {
 entry:
 	br label %loop
diff --git a/test/CodeGen/X86/pr3495-2.ll b/test/CodeGen/X86/pr3495-2.ll
index 1372a1522b..71aa5a0488 100644
--- a/test/CodeGen/X86/pr3495-2.ll
+++ b/test/CodeGen/X86/pr3495-2.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -march=x86 -relocation-model=pic -disable-fp-elim -stats |& grep {Number of reloads omited}
 
+target datalayout = "e-p:32:32:32"
 target triple = "i386-apple-darwin9.6"
 	%struct.constraintVCGType = type { i32, i32, i32, i32 }
 	%struct.nodeVCGType = type { %struct.constraintVCGType*, i32, i32, i32, %struct.constraintVCGType*, i32, i32, i32 }
diff --git a/test/CodeGen/X86/remat-mov-0.ll b/test/CodeGen/X86/remat-mov-0.ll
index c4f768ca52..5fb445c935 100644
--- a/test/CodeGen/X86/remat-mov-0.ll
+++ b/test/CodeGen/X86/remat-mov-0.ll
@@ -1,13 +1,33 @@
-; RUN: llc < %s -march=x86-64 | grep {xorl	%edi, %edi} | count 4
+; RUN: llc < %s -march=x86-64 | FileCheck %s
 
 ; CodeGen should remat the zero instead of spilling it.
 
 declare void @foo(i64 %p)
 
+; CHECK: bar:
+; CHECK: xorl %edi, %edi
+; CHECK: xorl %edi, %edi
 define void @bar() nounwind {
   call void @foo(i64 0)
   call void @foo(i64 0)
-  call void @foo(i64 0)
-  call void @foo(i64 0)
   ret void
 }
+
+; CHECK: bat:
+; CHECK: movq $-1, %rdi
+; CHECK: movq $-1, %rdi
+define void @bat() nounwind { ; passes the constant -1 to @foo twice
+  call void @foo(i64 -1)      ; the CHECK lines expect a separate movq $-1, %rdi for each call
+  call void @foo(i64 -1)
+  ret void
+}
+
+; CHECK: bau:
+; CHECK: movl $1, %edi
+; CHECK: movl $1, %edi
+define void @bau() nounwind { ; passes the constant 1 to @foo twice
+  call void @foo(i64 1)       ; the CHECK lines expect a separate movl $1, %edi for each call
+  call void @foo(i64 1)
+  ret void
+}
+
diff --git a/test/CodeGen/X86/remat-mov-1.ll b/test/CodeGen/X86/remat-mov-1.ll
deleted file mode 100644
index d71b7a5b91..0000000000
--- a/test/CodeGen/X86/remat-mov-1.ll
+++ /dev/null
@@ -1,40 +0,0 @@
-; RUN: llc < %s -march=x86 | grep -- -1 | grep mov | count 2
-
-	%struct.FILE = type { i8*, i32, i32, i16, i16, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, %struct.__sFILEX*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 }
-	%struct.ImgT = type { i8, i8*, i8*, %struct.FILE*, i32, i32, i32, i32, i8*, double*, float*, float*, float*, i32*, double, double, i32*, double*, i32*, i32* }
-	%struct._CompT = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, i8, %struct._PixT*, %struct._CompT*, i8, %struct._CompT* }
-	%struct._PixT = type { i32, i32, %struct._PixT* }
-	%struct.__sFILEX = type opaque
-	%struct.__sbuf = type { i8*, i32 }
-
-declare fastcc void @MergeComponents(%struct._CompT*, %struct._CompT*, %struct._CompT*, %struct._CompT**, %struct.ImgT*) nounwind 
-
-define fastcc void @MergeToLeft(%struct._CompT* %comp, %struct._CompT** %head, %struct.ImgT* %img) nounwind  {
-entry:
-	br label %bb208
-
-bb105:		; preds = %bb200
-	br i1 false, label %bb197, label %bb149
-
-bb149:		; preds = %bb105
-	%tmp151 = getelementptr %struct._CompT* %comp, i32 0, i32 0		; <i32*> [#uses=1]
-	br label %bb193
-
-bb193:		; preds = %bb184, %bb149
-	%tmp196 = load i32* %tmp151, align 4		; <i32> [#uses=1]
-	br label %bb197
-
-bb197:		; preds = %bb193, %bb105
-	%last_comp.0 = phi i32 [ %tmp196, %bb193 ], [ 0, %bb105 ]		; <i32> [#uses=0]
-	%indvar.next = add i32 %indvar, 1		; <i32> [#uses=1]
-	br label %bb200
-
-bb200:		; preds = %bb208, %bb197
-	%indvar = phi i32 [ 0, %bb208 ], [ %indvar.next, %bb197 ]		; <i32> [#uses=2]
-	%xm.0 = sub i32 %indvar, 0		; <i32> [#uses=1]
-	%tmp202 = icmp slt i32 %xm.0, 1		; <i1> [#uses=1]
-	br i1 %tmp202, label %bb105, label %bb208
-
-bb208:		; preds = %bb200, %entry
-	br label %bb200
-}
diff --git a/test/CodeGen/X86/subreg-to-reg-5.ll b/test/CodeGen/X86/subreg-to-reg-5.ll
deleted file mode 100644
index ba4c307d10..0000000000
--- a/test/CodeGen/X86/subreg-to-reg-5.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; RUN: llc < %s -march=x86-64 > %t
-; RUN: grep addl %t
-; RUN: not egrep {movl|movq} %t
-
-define float @foo(float* %B) nounwind {
-entry:
-	br label %bb2
-
-bb2:		; preds = %bb3, %entry
-	%B_addr.0.rec = phi i64 [ %indvar.next154, %bb3 ], [ 0, %entry ]		; <i64> [#uses=2]
-        %z = icmp slt i64 %B_addr.0.rec, 20000
-	br i1 %z, label %bb3, label %bb4
-
-bb3:		; preds = %bb2
-	%indvar.next154 = add i64 %B_addr.0.rec, 1		; <i64> [#uses=1]
-	br label %bb2
-
-bb4:		; preds = %bb2
-	%B_addr.0 = getelementptr float* %B, i64 %B_addr.0.rec		; <float*> [#uses=1]
-	%t1 = ptrtoint float* %B_addr.0 to i64		; <i64> [#uses=1]
-	%t2 = and i64 %t1, 4294967295		; <i64> [#uses=1]
-	%t3 = icmp eq i64 %t2, 0		; <i1> [#uses=1]
-	br i1 %t3, label %bb5, label %bb10.preheader
-
-bb10.preheader:		; preds = %bb4
-	br label %bb9
-
-bb5:		; preds = %bb4
-	ret float 7.0
-
-bb9:		; preds = %bb10.preheader
-	%t5 = getelementptr float* %B, i64 0		; <float*> [#uses=1]
-	%t7 = load float* %t5		; <float> [#uses=1]
-	ret float %t7
-}