diff options
| author | Vassilis Laganakos <vasileios.laganakos@arm.com> | 2013-01-25 17:43:18 +0000 |
|---|---|---|
| committer | Chirayu Desai <cdesai@cyanogenmod.org> | 2013-02-06 10:22:01 +0530 |
| commit | 9e0106667f8f19b2eeb3e0907b66cf4c1f1e1061 (patch) | |
| tree | 25ca7751fcb53c110704b3274184ad3ed056d288 | |
| parent | f5cf4a541655eccb902925f38607dbcb0687ca13 (diff) | |
| download | android_frameworks_rs-9e0106667f8f19b2eeb3e0907b66cf4c1f1e1061.tar.gz android_frameworks_rs-9e0106667f8f19b2eeb3e0907b66cf4c1f1e1061.tar.bz2 android_frameworks_rs-9e0106667f8f19b2eeb3e0907b66cf4c1f1e1061.zip | |
YUV(NV21) to RGBA function NEON optimizations.
perf was used to measure the performance gains, and the average of 3 runs
showed that:
Old NEON:
Decodes 7200 1920x1080 frames in 92,372,429,902.33 cycles (~12.83E6 / frame)
(stddev 3.22E-3)
New NEON:
Decodes 7200 1920x1080 frames in 66,456,635,523.00 cycles ( ~9.23E6 / frame)
(stddev 5.16E-5)
This is about 35% faster.
This code was wrapped in a C framework that was developed to obtain these
measurements, and has not been tested within RenderScript.
Change-Id: I4f4e25b968f858f9fca973b36c105c715d90acbf
Signed-off-by: David Butcher <David.Butcher@arm.com>
| -rw-r--r-- | driver/rsdIntrinsics_Convolve.S | 151 |
1 files changed, 80 insertions, 71 deletions
diff --git a/driver/rsdIntrinsics_Convolve.S b/driver/rsdIntrinsics_Convolve.S index beb7d141..0bea299a 100644 --- a/driver/rsdIntrinsics_Convolve.S +++ b/driver/rsdIntrinsics_Convolve.S @@ -382,85 +382,94 @@ ENTRY(rsdIntrinsicBlurHF_K) END(rsdIntrinsicBlurHF_K) /* + Function called with the following arguments: dst, Y, vu, len, YuvCoeff r0 = dst r1 = Y r2 = VU r3 = length (pixels / 8) - r4 = sp, params + ---- Args below will be in the stack ---- + sp = YuvCoeff This function converts 8 pixels per iteration */ ENTRY(rsdIntrinsicYuv_K) - push {r4-r8, r10, r11, lr} - vpush {q4-q7} - - ldr r4, [sp, #32+64] - vld1.16 {q2}, [r4]! // mults - vld1.16 {q3}, [r4]! // y offset - vld1.16 {q4}, [r4]! // 128 - vdup.8 d3, d5[1] - -1: - vld1.8 {d10}, [r1]! - vld1.8 {d12}, [r2]! - vmovl.u8 q5, d10 // Y at .16 - vmovl.u8 q6, d12 // vu at .16 - - vsub.i16 q5, q5, q3 - vsub.i16 q6, q6, q4 - vtrn.16 d12, d13 // d12 = u, d13 = v - vmov q7, q6 - vtrn.16 d12, d14 - vtrn.32 d12, d14 - vtrn.16 d13, d15 - vtrn.32 d13, d15 - - vmull.s16 q8, d10, d4[0] - vmull.s16 q11, d11, d4[0] - vmov q9, q8 - vmov q10, q8 - vmov q12, q11 - vmov q13, q11 - - vmlal.s16 q8, d12, d4[1] - vmlal.s16 q9, d12, d5[0] - vmlal.s16 q10, d13, d4[3] - vmlal.s16 q9, d13, d4[2] - - vmlal.s16 q11, d14, d4[1] - vmlal.s16 q12, d14, d5[0] - vmlal.s16 q13, d15, d4[3] - vmlal.s16 q12, d15, d4[2] - - - vshrn.i32 d16, q8, #8 - vshrn.i32 d18, q9, #8 - vshrn.i32 d20, q10, #8 - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! - vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]! - vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]! - vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]! - - vshrn.i32 d16, q11, #8 - vshrn.i32 d18, q12, #8 - vshrn.i32 d20, q13, #8 - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! - vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]! - vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]! - vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]! 
- - subs r3, r3, #1 - bne 1b - - vpop {q4-q7} - pop {r4-r8, r10, r11, lr} - bx lr + push {r4, r5, lr} @ preserve clobbered int registers + vpush {Q4-Q7} @ preserve Vregisters we clobber + + mov r5, #16 @ Integer 16 in r5; used as an incrementing value + + ldr r4, [sp, #64+12] @ load the coeffs address in memory in r4 (16*4 + 4*3) + vld1.16 {Q2}, [r4]! @ load the multipliers from the coeffs matrix (r4) in Q2 + vld1.8 {d6[]}, [r4], r5 @ load y offset 16 from the coeffs matrix (r4) in d6 + vld1.8 {d8[]}, [r4], r5 @ load V and U offset of 128 from the coeffs matrix (r4) in d8 + + mov r4, #8 @ Integer 8 in r4; used as an incrementing value + + vdup.8 d3, d5[1] @ d3 = 255 (alpha) from the multipliers line in + @ the coeffs matrix (Q2) + + 1: + vld1.8 {d10}, [r1]! @ get Y (r1->Y) + vld2.8 {d12, d14}, [r2], r4 @ split V from U (r2 -> VU) and increase pointer by 8 (in r4) + pld [r1, #64] @ preloading data from address y(r1) + 64 for subsequent loops + pld [r2, #64] @ preloading data from address vu(r2) + 64 for subsequent loops + + vsubl.u8 Q5, d10, d6 @ Y to 16 bit - 16 (in 16bit) (n to n+7) + vmull.s16 Q8, d10, d4[0] @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit) + vmull.s16 Q11, d11, d4[0] @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit) + + vsubl.u8 Q5, d12, d8 @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3) + vsubl.u8 Q6, d14, d8 @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3) + vmov.u16 d11, d10 @ Copying V to d11 + vmov.u16 d13, d12 @ Copying U to d13 + vzip.u16 d10, d11 @ Q5 = V (n,n n+1, n+1) V(n+2, n+2, n+3, n+3) + vzip.u16 d12, d13 @ Q5 = U (n,n n+1, n+1) U(n+2, n+2, n+3, n+3) + + + vmov Q9, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9 + vmov Q10, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10 + vmov Q12, Q11 @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q12 + vmov Q13, Q11 @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q13 + + @ R G B + @ Pixel(0-3) Q8, Q9, Q10 + @ Pixel(4-7) Q11, Q12, Q13 + @ + + @ Pixel(0-3) + vmlal.s16 Q8, d10, d4[1] @ R : Q8 = Q8(Y-16) + (V-128) * 409 + vmlal.s16 Q9, d10, d5[0] @ G : 
Q9 = Q9(Y-16) + (V-128) * (-208) + vmlal.s16 Q9, d12, d4[2] @ + (U-128) * (-100) + vmlal.s16 Q10, d12, d4[3] @ B : Q10 = Q10(Y-16) + (U-128) * 516 + + @ Pixel(4-7) + vmlal.s16 Q11, d11, d4[1] @ R : Q11 = Q11(Y-16) + (V-128) * 409 + vmlal.s16 Q12, d11, d5[0] @ G : Q12 = Q12(Y-16) + (V-128) * (-208) + vmlal.s16 Q12, d13, d4[2] @ + (U-128) * (-100) + vmlal.s16 Q13, d13, d4[3] @ B : Q13 = Q13(Y-16) + (U-128) * 516 + + @ Pixel(0-3) + vrshrn.i32 d16, Q8, #8 @ d16 : R shifted right by 8 rounded'n narrowed to 16bit + vrshrn.i32 d18, Q9, #8 @ d18 : G shifted right by 8 rounded'n narrowed to 16bit + vrshrn.i32 d20, Q10, #8 @ d20 : B shifted right by 8 rounded'n narrowed to 16bit + + @ Pixel(4-7) + vrshrn.i32 d17, Q11, #8 @ d17 : R shifted right by 8 rounded'n narrowed to 16bit + vrshrn.i32 d19, Q12, #8 @ d19 : G shifted right by 8 rounded'n narrowed to 16bit + vrshrn.i32 d21, Q13, #8 @ d21 : B shifted right by 8 rounded'n narrowed to 16bit + + vqmovun.s16 d0, Q8 @ r = d0 (saturated, unsigned and narrowed to 8bit) + vqmovun.s16 d1, Q9 @ g = d1 (saturated, unsigned and narrowed to 8bit) + vqmovun.s16 d2, Q10 @ b = d2 (saturated, unsigned and narrowed to 8bit) + + subs r3, r3, #1 @ Checking length (r3) + vst4.8 {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0) + + bne 1b @ if not done with length, loop + + vpop {Q4-Q7} @ Restore Vregisters + pop {r4, r5, lr} @ Restore int registers + bx lr END(rsdIntrinsicYuv_K) /* Convolve 5x5 */ |
