diff options
| author | Vassilis Laganakos <vasileios.laganakos@arm.com> | 2013-01-25 17:43:18 +0000 |
|---|---|---|
| committer | Chirayu Desai <cdesai@cyanogenmod.org> | 2013-02-06 10:22:01 +0530 |
| commit | 9e0106667f8f19b2eeb3e0907b66cf4c1f1e1061 (patch) | |
| tree | 25ca7751fcb53c110704b3274184ad3ed056d288 | |
| parent | f5cf4a541655eccb902925f38607dbcb0687ca13 (diff) | |
| download | android_frameworks_rs-9e0106667f8f19b2eeb3e0907b66cf4c1f1e1061.tar.gz android_frameworks_rs-9e0106667f8f19b2eeb3e0907b66cf4c1f1e1061.tar.bz2 android_frameworks_rs-9e0106667f8f19b2eeb3e0907b66cf4c1f1e1061.zip | |
YUV(NV21) to RGBA function NEON optimizations.
perf was used to measure the performance gains, and the average of 3 runs
showed that:
Old NEON:
Decodes 7200 1920x1080 frames in 92,372,429,902.33 cycles (~12.83E6 / frame)
(stddev 3.22E-3)
New NEON:
Decodes 7200 1920x1080 frames in 66,456,635,523.00 cycles ( ~9.23E6 / frame)
(stddev 5.16E-5)
This is about 35% faster.
This code was wrapped in a C framework that was developed to obtain these
measurements, and has not been tested within RenderScript.
Change-Id: I4f4e25b968f858f9fca973b36c105c715d90acbf
Signed-off-by: David Butcher <David.Butcher@arm.com>
| -rw-r--r-- | driver/rsdIntrinsics_Convolve.S | 151 |
1 files changed, 80 insertions, 71 deletions
diff --git a/driver/rsdIntrinsics_Convolve.S b/driver/rsdIntrinsics_Convolve.S index beb7d141..0bea299a 100644 --- a/driver/rsdIntrinsics_Convolve.S +++ b/driver/rsdIntrinsics_Convolve.S @@ -382,85 +382,94 @@ ENTRY(rsdIntrinsicBlurHF_K) END(rsdIntrinsicBlurHF_K) /* + Function called with the following arguments: dst, Y, vu, len, YuvCoeff r0 = dst r1 = Y r2 = VU r3 = length (pixels / 8) - r4 = sp, params + ---- Args below will be in the stack ---- + sp = YuvCoeff This function converts 8 pixels per iteration */ ENTRY(rsdIntrinsicYuv_K) - push {r4-r8, r10, r11, lr} - vpush {q4-q7} - - ldr r4, [sp, #32+64] - vld1.16 {q2}, [r4]! // mults - vld1.16 {q3}, [r4]! // y offset - vld1.16 {q4}, [r4]! // 128 - vdup.8 d3, d5[1] - -1: - vld1.8 {d10}, [r1]! - vld1.8 {d12}, [r2]! - vmovl.u8 q5, d10 // Y at .16 - vmovl.u8 q6, d12 // vu at .16 - - vsub.i16 q5, q5, q3 - vsub.i16 q6, q6, q4 - vtrn.16 d12, d13 // d12 = u, d13 = v - vmov q7, q6 - vtrn.16 d12, d14 - vtrn.32 d12, d14 - vtrn.16 d13, d15 - vtrn.32 d13, d15 - - vmull.s16 q8, d10, d4[0] - vmull.s16 q11, d11, d4[0] - vmov q9, q8 - vmov q10, q8 - vmov q12, q11 - vmov q13, q11 - - vmlal.s16 q8, d12, d4[1] - vmlal.s16 q9, d12, d5[0] - vmlal.s16 q10, d13, d4[3] - vmlal.s16 q9, d13, d4[2] - - vmlal.s16 q11, d14, d4[1] - vmlal.s16 q12, d14, d5[0] - vmlal.s16 q13, d15, d4[3] - vmlal.s16 q12, d15, d4[2] - - - vshrn.i32 d16, q8, #8 - vshrn.i32 d18, q9, #8 - vshrn.i32 d20, q10, #8 - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! - vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]! - vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]! - vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]! - - vshrn.i32 d16, q11, #8 - vshrn.i32 d18, q12, #8 - vshrn.i32 d20, q13, #8 - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! - vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]! - vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]! - vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]! 
- - subs r3, r3, #1 - bne 1b - - vpop {q4-q7} - pop {r4-r8, r10, r11, lr} - bx lr + push {r4, r5, lr} @ preserve clobbered int registers + vpush {Q4-Q7} @ preserve Vregisters we clobber + + mov r5, #16 @ Integer 16 in r5; used as an incrementing value + + ldr r4, [sp, #64+12] @ load the coeffs address in memory in r4 (16*4 + 4*3) + vld1.16 {Q2}, [r4]! @ load the multipliers from the coeffs matrix (r4) in Q2 + vld1.8 {d6[]}, [r4], r5 @ load y offset 16 from the coeffs matrix (r4) in d6 + vld1.8 {d8[]}, [r4], r5 @ load V and U offset of 128 from the coeffs matrix (r4) in d8 + + mov r4, #8 @ Integer 8 in r4; used as an incrementing value + + vdup.8 d3, d5[1] @ d3 = 255 (alpha) from the multipliers line in + @ the coeffs matrix (Q2) + + 1: + vld1.8 {d10}, [r1]! @ get Y (r1->Y) + vld2.8 {d12, d14}, [r2], r4 @ split V from U (r2 -> VU) and increase pointer by 8 (in r4) + pld [r1, #64] @ preloading data from address y(r1) + 64 for subsequent loops + pld [r2, #64] @ preloading data from address vu(r2) + 64 for subsequent loops + + vsubl.u8 Q5, d10, d6 @ Y to 16 bit - 16 (in 16bit) (n to n+7) + vmull.s16 Q8, d10, d4[0] @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit) + vmull.s16 Q11, d11, d4[0] @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit) + + vsubl.u8 Q5, d12, d8 @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3) + vsubl.u8 Q6, d14, d8 @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3) + vmov.u16 d11, d10 @ Copying V to d11 + vmov.u16 d13, d12 @ Copying U to d13 + vzip.u16 d10, d11 @ Q5 = V (n,n n+1, n+1) V(n+2, n+2, n+3, n+3) + vzip.u16 d12, d13 @ Q5 = U (n,n n+1, n+1) U(n+2, n+2, n+3, n+3) + + + vmov Q9, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9 + vmov Q10, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10 + vmov Q12, Q11 @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q12 + vmov Q13, Q11 @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q13 + + @ R G B + @ Pixel(0-3) Q8, Q9, Q10 + @ Pixel(4-7) Q11, Q12, Q13 + @ + + @ Pixel(0-3) + vmlal.s16 Q8, d10, d4[1] @ R : Q8 = Q8(Y-16) + (V-128) * 409 + vmlal.s16 Q9, d10, d5[0] @ G : 
Q9 = Q9(Y-16) + (V-128) * (-208) + vmlal.s16 Q9, d12, d4[2] @ + (U-128) * (-100) + vmlal.s16 Q10, d12, d4[3] @ B : Q10 = Q10(Y-16) + (U-128) * 516 + + @ Pixel(4-7) + vmlal.s16 Q11, d11, d4[1] @ R : Q11 = Q11(Y-16) + (V-128) * 409 + vmlal.s16 Q12, d11, d5[0] @ G : Q12 = Q12(Y-16) + (V-128) * (-208) + vmlal.s16 Q12, d13, d4[2] @ + (U-128) * (-100) + vmlal.s16 Q13, d13, d4[3] @ B : Q13 = Q13(Y-16) + (U-128) * 516 + + @ Pixel(0-3) + vrshrn.i32 d16, Q8, #8 @ d16 : R shifted right by 8 rounded'n narrowed to 16bit + vrshrn.i32 d18, Q9, #8 @ d18 : G shifted right by 8 rounded'n narrowed to 16bit + vrshrn.i32 d20, Q10, #8 @ d20 : B shifted right by 8 rounded'n narrowed to 16bit + + @ Pixel(4-7) + vrshrn.i32 d17, Q11, #8 @ d17 : R shifted right by 8 rounded'n narrowed to 16bit + vrshrn.i32 d19, Q12, #8 @ d19 : G shifted right by 8 rounded'n narrowed to 16bit + vrshrn.i32 d21, Q13, #8 @ d21 : B shifted right by 8 rounded'n narrowed to 16bit + + vqmovun.s16 d0, Q8 @ r = d0 (saturated, unsigned and narrowed to 8bit) + vqmovun.s16 d1, Q9 @ g = d1 (saturated, unsigned and narrowed to 8bit) + vqmovun.s16 d2, Q10 @ b = d2 (saturated, unsigned and narrowed to 8bit) + + subs r3, r3, #1 @ Checking length (r3) + vst4.8 {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0) + + bne 1b @ if not done with length, loop + + vpop {Q4-Q7} @ Restore Vregisters + pop {r4, r5, lr} @ Restore int registers + bx lr END(rsdIntrinsicYuv_K) /* Convolve 5x5 */ |
