summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorStephen Hines <srhines@google.com>2014-11-13 15:45:27 -0800
committerStephen Hines <srhines@google.com>2014-11-18 14:34:46 -0800
commitb9675775b030b187b8528cba2d8e0e5c0a7bf8f7 (patch)
tree2cd4f9b5c15c3fe5e97a230502a60aa65977e429
parent4283f579c424f07bc07c7f075398053eed3f8281 (diff)
downloadandroid_frameworks_rs-b9675775b030b187b8528cba2d8e0e5c0a7bf8f7.tar.gz
android_frameworks_rs-b9675775b030b187b8528cba2d8e0e5c0a7bf8f7.tar.bz2
android_frameworks_rs-b9675775b030b187b8528cba2d8e0e5c0a7bf8f7.zip
Fix implementation of vload with [u]long3/4 and double3/4.
Bug: 18380209

These implementations were accidentally attempting to return <3 x i64> or <4 x i64>, or the double equivalents. The ABI requires that this be converted into a stack return instead, so we transform our hand-written bitcode to do exactly that.

Change-Id: I2be489b23bf639b16d8762a11a8430f40ea5b16c
-rw-r--r--  driver/runtime/ll32/allocation.ll | 45
1 file changed, 27 insertions(+), 18 deletions(-)
diff --git a/driver/runtime/ll32/allocation.ll b/driver/runtime/ll32/allocation.ll
index d0b3932b..21d7cac5 100644
--- a/driver/runtime/ll32/allocation.ll
+++ b/driver/runtime/ll32/allocation.ll
@@ -650,17 +650,20 @@ define void @rsGetElementAtImpl_double4(<4 x double>* noalias nocapture sret %ag
}
-define <4 x i64> @__rsAllocationVLoadXImpl_long4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+define void @__rsAllocationVLoadXImpl_long4(<4 x i64>* noalias nocapture sret %agg.result, [1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #1 {
%1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #2
%2 = bitcast i8* %1 to <4 x i64>*
%3 = load <4 x i64>* %2, align 8
- ret <4 x i64> %3
+ store <4 x i64> %3, <4 x i64>* %agg.result, align 32, !tbaa !52
+ ret void
}
-define <3 x i64> @__rsAllocationVLoadXImpl_long3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+define void @__rsAllocationVLoadXImpl_long3(<3 x i64>* noalias nocapture sret %agg.result, [1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #1 {
%1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #2
- %2 = bitcast i8* %1 to <3 x i64>*
- %3 = load <3 x i64>* %2, align 8
- ret <3 x i64> %3
+ %2 = bitcast i8* %1 to <4 x i64>*
+ %3 = load <4 x i64>* %2, align 8
+ %4 = bitcast <3 x i64>* %agg.result to <4 x i64>*
+ store <4 x i64> %3, <4 x i64>* %4, align 32, !tbaa !47
+ ret void
}
define <2 x i64> @__rsAllocationVLoadXImpl_long2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
%1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #2
@@ -669,17 +672,20 @@ define <2 x i64> @__rsAllocationVLoadXImpl_long2([1 x i32] %a.coerce, i32 %x, i3
ret <2 x i64> %3
}
-define <4 x i64> @__rsAllocationVLoadXImpl_ulong4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+define void @__rsAllocationVLoadXImpl_ulong4(<4 x i64>* noalias nocapture sret %agg.result, [1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #1 {
%1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #2
%2 = bitcast i8* %1 to <4 x i64>*
%3 = load <4 x i64>* %2, align 8
- ret <4 x i64> %3
+ store <4 x i64> %3, <4 x i64>* %agg.result, align 32, !tbaa !48
+ ret void
}
-define <3 x i64> @__rsAllocationVLoadXImpl_ulong3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+define void @__rsAllocationVLoadXImpl_ulong3(<3 x i64>* noalias nocapture sret %agg.result, [1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #1 {
%1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #2
- %2 = bitcast i8* %1 to <3 x i64>*
- %3 = load <3 x i64>* %2, align 8
- ret <3 x i64> %3
+ %2 = bitcast i8* %1 to <4 x i64>*
+ %3 = load <4 x i64>* %2, align 8
+ %4 = bitcast <3 x i64>* %agg.result to <4 x i64>*
+ store <4 x i64> %3, <4 x i64>* %4, align 32, !tbaa !51
+ ret void
}
define <2 x i64> @__rsAllocationVLoadXImpl_ulong2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
%1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #2
@@ -821,17 +827,20 @@ define <2 x float> @__rsAllocationVLoadXImpl_float2([1 x i32] %a.coerce, i32 %x,
ret <2 x float> %3
}
-define <4 x double> @__rsAllocationVLoadXImpl_double4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+define void @__rsAllocationVLoadXImpl_double4(<4 x double>* noalias nocapture sret %agg.result, [1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #1 {
%1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #2
%2 = bitcast i8* %1 to <4 x double>*
%3 = load <4 x double>* %2, align 8
- ret <4 x double> %3
+ store <4 x double> %3, <4 x double>* %agg.result, align 32, !tbaa !60
+ ret void
}
-define <3 x double> @__rsAllocationVLoadXImpl_double3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+define void @__rsAllocationVLoadXImpl_double3(<3 x double>* noalias nocapture sret %agg.result, [1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #1 {
%1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #2
- %2 = bitcast i8* %1 to <3 x double>*
- %3 = load <3 x double>* %2, align 8
- ret <3 x double> %3
+ %2 = bitcast i8* %1 to <4 x double>*
+ %3 = load <4 x double>* %2, align 8
+ %4 = bitcast <3 x double>* %agg.result to <4 x double>*
+ store <4 x double> %3, <4 x double>* %4, align 32, !tbaa !59
+ ret void
}
define <2 x double> @__rsAllocationVLoadXImpl_double2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
%1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #2