summary | refs | log | tree | commit | diff | stats
path: root/decoder/arm64
diff options
context:
space:
mode:
author: Naveen Kumar Ponnusamy <naveenkumar.p@ittiam.com> 2014-06-10 12:14:27 -0700
committer: Lajos Molnar <lajos@google.com> 2014-07-12 15:09:24 -0700
commit: 9cbd70a2930875be59d7df68136ac9a1a949a13d (patch)
tree: 6d9957d14352fc77e2323f90b49387e577f1ade2 /decoder/arm64
parent: 707042fda96ebede81408b854385173483798bcd (diff)
download: android_external_libhevc-9cbd70a2930875be59d7df68136ac9a1a949a13d.tar.gz
android_external_libhevc-9cbd70a2930875be59d7df68136ac9a1a949a13d.tar.bz2
android_external_libhevc-9cbd70a2930875be59d7df68136ac9a1a949a13d.zip
Reduced stack operations in arm64 assembly
Change-Id: Ia19a99001fef37334f18521dd8f8710907fe370d
Diffstat (limited to 'decoder/arm64')
-rw-r--r-- decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s 65
-rw-r--r-- decoder/arm64/ihevcd_itrans_recon_dc_luma.s 20
2 files changed, 45 insertions, 40 deletions
diff --git a/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
index 485ee66..a6041f5 100644
--- a/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
+++ b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
@@ -91,7 +91,10 @@ ihevcd_fmt_conv_420sp_to_rgba8888_av8:
//// push the registers on the stack
// STMFD sp!,{x4-x12,x14}
- push_v_regs
+
+ stp d12,d14,[sp,#-16]!
+ stp d8,d15,[sp,#-16]! // Storing d15 using { sub sp,sp,#8; str d15,[sp] } is giving bus error.
+ // d8 is used as dummy register and stored along with d15 using stp. d8 is not used in the function.
stp x19, x20,[sp,#-16]!
@@ -194,8 +197,8 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
prfm PLDL1KEEP,[x1]
////NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS
- sMULL v8.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
- sMULL2 v10.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
+ sMULL v5.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
+ sMULL2 v7.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
sMULL v20.4s, v6.4h, v0.4h[0] ////(V-128)*C1 FOR R
sMULL2 v22.4s, v6.8h, v0.4h[0] ////(V-128)*C1 FOR R
@@ -206,13 +209,13 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
sMLAL2 v14.4s, v6.8h, v0.4h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
////NARROW RIGHT SHIFT BY 13 FOR R&B
- sqshrn v8.4h, v8.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
- sqshrn2 v8.8h, v10.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
+ sqshrn v5.4h, v5.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
+ sqshrn2 v5.8h, v7.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
////Q4 - WEIGHT FOR B
////NARROW RIGHT SHIFT BY 13 FOR R&B
- sqshrn v10.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
- sqshrn2 v10.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
+ sqshrn v7.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
+ sqshrn2 v7.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
////Q5 - WEIGHT FOR R
////NARROW RIGHT SHIFT BY 13 FOR G
@@ -220,12 +223,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
sqshrn2 v12.8h, v14.4s,#13 ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
////Q6 - WEIGHT FOR G
- UADDW v14.8h, v8.8h , v30.8b ////Q7 - HAS Y + B
- UADDW v16.8h, v10.8h , v30.8b ////Q8 - HAS Y + R
+ UADDW v14.8h, v5.8h , v30.8b ////Q7 - HAS Y + B
+ UADDW v16.8h, v7.8h , v30.8b ////Q8 - HAS Y + R
UADDW v18.8h, v12.8h , v30.8b ////Q9 - HAS Y + G
- UADDW v20.8h, v8.8h , v31.8b ////Q10 - HAS Y + B
- UADDW v22.8h, v10.8h , v31.8b ////Q11 - HAS Y + R
+ UADDW v20.8h, v5.8h , v31.8b ////Q10 - HAS Y + B
+ UADDW v22.8h, v7.8h , v31.8b ////Q11 - HAS Y + R
UADDW v24.8h, v12.8h , v31.8b ////Q12 - HAS Y + G
sqxtun v14.8b, v14.8h
@@ -276,12 +279,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
////D14-D20 - TOALLY HAVE 16 VALUES
////WE NEED TO SHIFT R,G,B VALUES TO GET 5BIT,6BIT AND 5BIT COMBINATIONS
- UADDW v14.8h, v8.8h , v28.8b ////Q7 - HAS Y + B
- UADDW v16.8h, v10.8h , v28.8b ////Q2 - HAS Y + R
+ UADDW v14.8h, v5.8h , v28.8b ////Q7 - HAS Y + B
+ UADDW v16.8h, v7.8h , v28.8b ////Q2 - HAS Y + R
UADDW v18.8h, v12.8h , v28.8b ////Q3 - HAS Y + G
- UADDW v20.8h, v8.8h , v29.8b ////Q10 - HAS Y + B
- UADDW v22.8h, v10.8h , v29.8b ////Q11 - HAS Y + R
+ UADDW v20.8h, v5.8h , v29.8b ////Q10 - HAS Y + B
+ UADDW v22.8h, v7.8h , v29.8b ////Q11 - HAS Y + R
UADDW v24.8h, v12.8h , v29.8b ////Q12 - HAS Y + G
////COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
@@ -357,8 +360,8 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
////NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS
- sMULL v8.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
- sMULL2 v10.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
+ sMULL v5.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
+ sMULL2 v7.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
sMULL v20.4s, v6.4h, v0.4h[0] ////(V-128)*C1 FOR R
sMULL2 v22.4s, v6.8h, v0.4h[0] ////(V-128)*C1 FOR R
@@ -369,13 +372,13 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
sMLAL2 v14.4s, v6.8h, v0.4h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
////NARROW RIGHT SHIFT BY 13 FOR R&B
- sqshrn v8.4h, v8.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
- sqshrn2 v8.8h, v10.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
+ sqshrn v5.4h, v5.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
+ sqshrn2 v5.8h, v7.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
////Q4 - WEIGHT FOR B
////NARROW RIGHT SHIFT BY 13 FOR R&B
- sqshrn v10.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
- sqshrn2 v10.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
+ sqshrn v7.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
+ sqshrn2 v7.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
////Q5 - WEIGHT FOR R
////NARROW RIGHT SHIFT BY 13 FOR G
@@ -383,12 +386,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
sqshrn2 v12.8h, v14.4s,#13 ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
////Q6 - WEIGHT FOR G
- UADDW v14.8h, v8.8h , v30.8b ////Q7 - HAS Y + B
- UADDW v16.8h, v10.8h , v30.8b ////Q8 - HAS Y + R
+ UADDW v14.8h, v5.8h , v30.8b ////Q7 - HAS Y + B
+ UADDW v16.8h, v7.8h , v30.8b ////Q8 - HAS Y + R
UADDW v18.8h, v12.8h , v30.8b ////Q9 - HAS Y + G
- UADDW v20.8h, v8.8h , v31.8b ////Q10 - HAS Y + B
- UADDW v22.8h, v10.8h , v31.8b ////Q11 - HAS Y + R
+ UADDW v20.8h, v5.8h , v31.8b ////Q10 - HAS Y + B
+ UADDW v22.8h, v7.8h , v31.8b ////Q11 - HAS Y + R
UADDW v24.8h, v12.8h , v31.8b ////Q12 - HAS Y + G
sqxtun v14.8b, v14.8h
@@ -439,12 +442,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
////D14-D20 - TOALLY HAVE 16 VALUES
////WE NEED TO SHIFT R,G,B VALUES TO GET 5BIT,6BIT AND 5BIT COMBINATIONS
- UADDW v14.8h, v8.8h , v28.8b ////Q7 - HAS Y + B
- UADDW v16.8h, v10.8h , v28.8b ////Q2 - HAS Y + R
+ UADDW v14.8h, v5.8h , v28.8b ////Q7 - HAS Y + B
+ UADDW v16.8h, v7.8h , v28.8b ////Q2 - HAS Y + R
UADDW v18.8h, v12.8h , v28.8b ////Q3 - HAS Y + G
- UADDW v20.8h, v8.8h , v29.8b ////Q10 - HAS Y + B
- UADDW v22.8h, v10.8h , v29.8b ////Q11 - HAS Y + R
+ UADDW v20.8h, v5.8h , v29.8b ////Q10 - HAS Y + B
+ UADDW v22.8h, v7.8h , v29.8b ////Q11 - HAS Y + R
UADDW v24.8h, v12.8h , v29.8b ////Q12 - HAS Y + G
sqxtun v14.8b, v14.8h
@@ -513,7 +516,9 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
////POP THE REGISTERS
// LDMFD sp!,{x4-x12,PC}
ldp x19, x20,[sp],#16
- pop_v_regs
+ ldp d8,d15,[sp],#16 // Loading d15 using { ldr d15,[sp]; add sp,sp,#8 } is giving bus error.
+ // d8 is used as dummy register and loaded along with d15 using ldp. d8 is not used in the function.
+ ldp d12,d14,[sp],#16
ret
diff --git a/decoder/arm64/ihevcd_itrans_recon_dc_luma.s b/decoder/arm64/ihevcd_itrans_recon_dc_luma.s
index 279888b..edc70e7 100644
--- a/decoder/arm64/ihevcd_itrans_recon_dc_luma.s
+++ b/decoder/arm64/ihevcd_itrans_recon_dc_luma.s
@@ -59,7 +59,7 @@ ihevcd_itrans_recon_dc_luma_av8:
- push_v_regs
+
stp x19, x20,[sp,#-16]!
sxth x5,w5
@@ -120,8 +120,8 @@ col_loop:
ld1 {v6.8b},[x7],x2
ld1 {v7.8b},[x7],x2
- ld1 {v8.8b},[x7],x2
- ld1 {v9.8b},[x7]
+ ld1 {v1.8b},[x7],x2
+ ld1 {v17.8b},[x7]
add x0,x0,#8
@@ -132,8 +132,8 @@ col_loop:
uaddw v24.8h, v0.8h , v5.8b
uaddw v22.8h, v0.8h , v6.8b
uaddw v20.8h, v0.8h , v7.8b
- uaddw v18.8h, v0.8h , v8.8b
- uaddw v16.8h, v0.8h , v9.8b
+ uaddw v18.8h, v0.8h , v1.8b
+ uaddw v16.8h, v0.8h , v17.8b
mov x11,x1
sqxtun v2.8b, v30.8h
@@ -142,8 +142,8 @@ col_loop:
sqxtun v5.8b, v24.8h
sqxtun v6.8b, v22.8h
sqxtun v7.8b, v20.8h
- sqxtun v8.8b, v18.8h
- sqxtun v9.8b, v16.8h
+ sqxtun v1.8b, v18.8h
+ sqxtun v17.8b, v16.8h
st1 {v2.2s},[x11],x3
@@ -152,8 +152,8 @@ col_loop:
st1 {v5.2s},[x11],x3
st1 {v6.2s},[x11],x3
st1 {v7.2s},[x11],x3
- st1 {v8.2s},[x11],x3
- st1 {v9.2s},[x11]
+ st1 {v1.2s},[x11],x3
+ st1 {v17.2s},[x11]
add x1,x1,#8
@@ -206,7 +206,7 @@ col_loop_4:
end_loops:
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret