diff options
author | Naveen Kumar Ponnusamy <naveenkumar.p@ittiam.com> | 2014-06-10 12:14:27 -0700 |
---|---|---|
committer | Lajos Molnar <lajos@google.com> | 2014-07-12 15:09:24 -0700 |
commit | 9cbd70a2930875be59d7df68136ac9a1a949a13d (patch) | |
tree | 6d9957d14352fc77e2323f90b49387e577f1ade2 /decoder/arm64 | |
parent | 707042fda96ebede81408b854385173483798bcd (diff) | |
download | android_external_libhevc-9cbd70a2930875be59d7df68136ac9a1a949a13d.tar.gz android_external_libhevc-9cbd70a2930875be59d7df68136ac9a1a949a13d.tar.bz2 android_external_libhevc-9cbd70a2930875be59d7df68136ac9a1a949a13d.zip |
Reduced stack operations in arm64 assembly
Change-Id: Ia19a99001fef37334f18521dd8f8710907fe370d
Diffstat (limited to 'decoder/arm64')
-rw-r--r-- | decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s | 65 | ||||
-rw-r--r-- | decoder/arm64/ihevcd_itrans_recon_dc_luma.s | 20 |
2 files changed, 45 insertions, 40 deletions
diff --git a/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s index 485ee66..a6041f5 100644 --- a/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s +++ b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s @@ -91,7 +91,10 @@ ihevcd_fmt_conv_420sp_to_rgba8888_av8: //// push the registers on the stack // STMFD sp!,{x4-x12,x14} - push_v_regs + + stp d12,d14,[sp,#-16]! + stp d8,d15,[sp,#-16]! // Storing d15 alone using { sub sp,sp,#8; str d15,[sp] } causes a bus error, presumably because sp must stay 16-byte aligned whenever it is used to access memory. + // d8 is used as a dummy register and stored along with d15 using stp, which keeps the adjustment at 16 bytes. d8 is not used in the function. stp x19, x20,[sp,#-16]! @@ -194,8 +197,8 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP: prfm PLDL1KEEP,[x1] ////NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS - sMULL v8.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B - sMULL2 v10.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B + sMULL v5.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B + sMULL2 v7.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B sMULL v20.4s, v6.4h, v0.4h[0] ////(V-128)*C1 FOR R sMULL2 v22.4s, v6.8h, v0.4h[0] ////(V-128)*C1 FOR R @@ -206,13 +209,13 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP: sMLAL2 v14.4s, v6.8h, v0.4h[2] ////Q7 = (U-128)*C2 + (V-128)*C3 ////NARROW RIGHT SHIFT BY 13 FOR R&B - sqshrn v8.4h, v8.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES - sqshrn2 v8.8h, v10.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES + sqshrn v5.4h, v5.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES + sqshrn2 v5.8h, v7.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES ////Q4 - WEIGHT FOR B ////NARROW RIGHT SHIFT BY 13 FOR R&B - sqshrn v10.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES - sqshrn2 v10.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES + sqshrn v7.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES + sqshrn2 v7.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES ////Q5 - WEIGHT FOR R ////NARROW RIGHT SHIFT BY 13 FOR G @@ -220,12 +223,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP: sqshrn2 v12.8h, v14.4s,#13 
////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES ////Q6 - WEIGHT FOR G - UADDW v14.8h, v8.8h , v30.8b ////Q7 - HAS Y + B - UADDW v16.8h, v10.8h , v30.8b ////Q8 - HAS Y + R + UADDW v14.8h, v5.8h , v30.8b ////Q7 - HAS Y + B + UADDW v16.8h, v7.8h , v30.8b ////Q8 - HAS Y + R UADDW v18.8h, v12.8h , v30.8b ////Q9 - HAS Y + G - UADDW v20.8h, v8.8h , v31.8b ////Q10 - HAS Y + B - UADDW v22.8h, v10.8h , v31.8b ////Q11 - HAS Y + R + UADDW v20.8h, v5.8h , v31.8b ////Q10 - HAS Y + B + UADDW v22.8h, v7.8h , v31.8b ////Q11 - HAS Y + R UADDW v24.8h, v12.8h , v31.8b ////Q12 - HAS Y + G sqxtun v14.8b, v14.8h @@ -276,12 +279,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP: ////D14-D20 - TOALLY HAVE 16 VALUES ////WE NEED TO SHIFT R,G,B VALUES TO GET 5BIT,6BIT AND 5BIT COMBINATIONS - UADDW v14.8h, v8.8h , v28.8b ////Q7 - HAS Y + B - UADDW v16.8h, v10.8h , v28.8b ////Q2 - HAS Y + R + UADDW v14.8h, v5.8h , v28.8b ////Q7 - HAS Y + B + UADDW v16.8h, v7.8h , v28.8b ////Q2 - HAS Y + R UADDW v18.8h, v12.8h , v28.8b ////Q3 - HAS Y + G - UADDW v20.8h, v8.8h , v29.8b ////Q10 - HAS Y + B - UADDW v22.8h, v10.8h , v29.8b ////Q11 - HAS Y + R + UADDW v20.8h, v5.8h , v29.8b ////Q10 - HAS Y + B + UADDW v22.8h, v7.8h , v29.8b ////Q11 - HAS Y + R UADDW v24.8h, v12.8h , v29.8b ////Q12 - HAS Y + G ////COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME @@ -357,8 +360,8 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP: ////NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS - sMULL v8.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B - sMULL2 v10.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B + sMULL v5.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B + sMULL2 v7.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B sMULL v20.4s, v6.4h, v0.4h[0] ////(V-128)*C1 FOR R sMULL2 v22.4s, v6.8h, v0.4h[0] ////(V-128)*C1 FOR R @@ -369,13 +372,13 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP: sMLAL2 v14.4s, v6.8h, v0.4h[2] ////Q7 = (U-128)*C2 + (V-128)*C3 ////NARROW RIGHT SHIFT BY 13 FOR R&B - sqshrn v8.4h, v8.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT 
VALUES - sqshrn2 v8.8h, v10.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES + sqshrn v5.4h, v5.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES + sqshrn2 v5.8h, v7.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES ////Q4 - WEIGHT FOR B ////NARROW RIGHT SHIFT BY 13 FOR R&B - sqshrn v10.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES - sqshrn2 v10.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES + sqshrn v7.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES + sqshrn2 v7.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES ////Q5 - WEIGHT FOR R ////NARROW RIGHT SHIFT BY 13 FOR G @@ -383,12 +386,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP: sqshrn2 v12.8h, v14.4s,#13 ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES ////Q6 - WEIGHT FOR G - UADDW v14.8h, v8.8h , v30.8b ////Q7 - HAS Y + B - UADDW v16.8h, v10.8h , v30.8b ////Q8 - HAS Y + R + UADDW v14.8h, v5.8h , v30.8b ////Q7 - HAS Y + B + UADDW v16.8h, v7.8h , v30.8b ////Q8 - HAS Y + R UADDW v18.8h, v12.8h , v30.8b ////Q9 - HAS Y + G - UADDW v20.8h, v8.8h , v31.8b ////Q10 - HAS Y + B - UADDW v22.8h, v10.8h , v31.8b ////Q11 - HAS Y + R + UADDW v20.8h, v5.8h , v31.8b ////Q10 - HAS Y + B + UADDW v22.8h, v7.8h , v31.8b ////Q11 - HAS Y + R UADDW v24.8h, v12.8h , v31.8b ////Q12 - HAS Y + G sqxtun v14.8b, v14.8h @@ -439,12 +442,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP: ////D14-D20 - TOALLY HAVE 16 VALUES ////WE NEED TO SHIFT R,G,B VALUES TO GET 5BIT,6BIT AND 5BIT COMBINATIONS - UADDW v14.8h, v8.8h , v28.8b ////Q7 - HAS Y + B - UADDW v16.8h, v10.8h , v28.8b ////Q2 - HAS Y + R + UADDW v14.8h, v5.8h , v28.8b ////Q7 - HAS Y + B + UADDW v16.8h, v7.8h , v28.8b ////Q2 - HAS Y + R UADDW v18.8h, v12.8h , v28.8b ////Q3 - HAS Y + G - UADDW v20.8h, v8.8h , v29.8b ////Q10 - HAS Y + B - UADDW v22.8h, v10.8h , v29.8b ////Q11 - HAS Y + R + UADDW v20.8h, v5.8h , v29.8b ////Q10 - HAS Y + B + UADDW v22.8h, v7.8h , v29.8b ////Q11 - HAS Y + R UADDW v24.8h, v12.8h , v29.8b ////Q12 - HAS Y + G sqxtun v14.8b, v14.8h 
@@ -513,7 +516,9 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP: ////POP THE REGISTERS // LDMFD sp!,{x4-x12,PC} ldp x19, x20,[sp],#16 - pop_v_regs + ldp d8,d15,[sp],#16 // Loading d15 alone using { ldr d15,[sp]; add sp,sp,#8 } causes a bus error, presumably because sp must stay 16-byte aligned whenever it is used to access memory. + // d8 is used as a dummy register and loaded along with d15 using ldp, which keeps the adjustment at 16 bytes. d8 is not used in the function. + ldp d12,d14,[sp],#16 ret diff --git a/decoder/arm64/ihevcd_itrans_recon_dc_luma.s b/decoder/arm64/ihevcd_itrans_recon_dc_luma.s index 279888b..edc70e7 100644 --- a/decoder/arm64/ihevcd_itrans_recon_dc_luma.s +++ b/decoder/arm64/ihevcd_itrans_recon_dc_luma.s @@ -59,7 +59,7 @@ ihevcd_itrans_recon_dc_luma_av8: - push_v_regs + stp x19, x20,[sp,#-16]! sxth x5,w5 @@ -120,8 +120,8 @@ col_loop: ld1 {v6.8b},[x7],x2 ld1 {v7.8b},[x7],x2 - ld1 {v8.8b},[x7],x2 - ld1 {v9.8b},[x7] + ld1 {v1.8b},[x7],x2 + ld1 {v17.8b},[x7] add x0,x0,#8 @@ -132,8 +132,8 @@ col_loop: uaddw v24.8h, v0.8h , v5.8b uaddw v22.8h, v0.8h , v6.8b uaddw v20.8h, v0.8h , v7.8b - uaddw v18.8h, v0.8h , v8.8b - uaddw v16.8h, v0.8h , v9.8b + uaddw v18.8h, v0.8h , v1.8b + uaddw v16.8h, v0.8h , v17.8b mov x11,x1 sqxtun v2.8b, v30.8h @@ -142,8 +142,8 @@ col_loop: sqxtun v5.8b, v24.8h sqxtun v6.8b, v22.8h sqxtun v7.8b, v20.8h - sqxtun v8.8b, v18.8h - sqxtun v9.8b, v16.8h + sqxtun v1.8b, v18.8h + sqxtun v17.8b, v16.8h st1 {v2.2s},[x11],x3 @@ -152,8 +152,8 @@ col_loop: st1 {v5.2s},[x11],x3 st1 {v6.2s},[x11],x3 st1 {v7.2s},[x11],x3 - st1 {v8.2s},[x11],x3 - st1 {v9.2s},[x11] + st1 {v1.2s},[x11],x3 + st1 {v17.2s},[x11] add x1,x1,#8 @@ -206,7 +206,7 @@ col_loop_4: end_loops: ldp x19, x20,[sp],#16 - pop_v_regs + ret |