summary | refs | log | tree | commit | diff | stats
path: root/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
diff options
context:
space:
mode:
Diffstat (limited to 'decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s')
-rw-r--r-- decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s 65
1 files changed, 35 insertions, 30 deletions
diff --git a/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
index 485ee66..a6041f5 100644
--- a/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
+++ b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
@@ -91,7 +91,10 @@ ihevcd_fmt_conv_420sp_to_rgba8888_av8:
//// push the registers on the stack
// STMFD sp!,{x4-x12,x14}
- push_v_regs
+
+ stp d12,d14,[sp,#-16]!
+ stp d8,d15,[sp,#-16]! // Storing d15 using { sub sp,sp,#8; str d15,[sp] } is giving bus error.
+ // d8 is used as a dummy register and stored along with d15 using stp, so that sp moves by 16 bytes and stays 16-byte aligned. d8 is not used in the function.
stp x19, x20,[sp,#-16]!
@@ -194,8 +197,8 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
prfm PLDL1KEEP,[x1]
////NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS
- sMULL v8.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
- sMULL2 v10.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
+ sMULL v5.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
+ sMULL2 v7.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
sMULL v20.4s, v6.4h, v0.4h[0] ////(V-128)*C1 FOR R
sMULL2 v22.4s, v6.8h, v0.4h[0] ////(V-128)*C1 FOR R
@@ -206,13 +209,13 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
sMLAL2 v14.4s, v6.8h, v0.4h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
////NARROW RIGHT SHIFT BY 13 FOR R&B
- sqshrn v8.4h, v8.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
- sqshrn2 v8.8h, v10.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
+ sqshrn v5.4h, v5.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
+ sqshrn2 v5.8h, v7.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
////Q4 - WEIGHT FOR B
////NARROW RIGHT SHIFT BY 13 FOR R&B
- sqshrn v10.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
- sqshrn2 v10.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
+ sqshrn v7.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
+ sqshrn2 v7.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
////Q5 - WEIGHT FOR R
////NARROW RIGHT SHIFT BY 13 FOR G
@@ -220,12 +223,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
sqshrn2 v12.8h, v14.4s,#13 ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
////Q6 - WEIGHT FOR G
- UADDW v14.8h, v8.8h , v30.8b ////Q7 - HAS Y + B
- UADDW v16.8h, v10.8h , v30.8b ////Q8 - HAS Y + R
+ UADDW v14.8h, v5.8h , v30.8b ////Q7 - HAS Y + B
+ UADDW v16.8h, v7.8h , v30.8b ////Q8 - HAS Y + R
UADDW v18.8h, v12.8h , v30.8b ////Q9 - HAS Y + G
- UADDW v20.8h, v8.8h , v31.8b ////Q10 - HAS Y + B
- UADDW v22.8h, v10.8h , v31.8b ////Q11 - HAS Y + R
+ UADDW v20.8h, v5.8h , v31.8b ////Q10 - HAS Y + B
+ UADDW v22.8h, v7.8h , v31.8b ////Q11 - HAS Y + R
UADDW v24.8h, v12.8h , v31.8b ////Q12 - HAS Y + G
sqxtun v14.8b, v14.8h
@@ -276,12 +279,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
////D14-D20 - TOALLY HAVE 16 VALUES
////WE NEED TO SHIFT R,G,B VALUES TO GET 5BIT,6BIT AND 5BIT COMBINATIONS
- UADDW v14.8h, v8.8h , v28.8b ////Q7 - HAS Y + B
- UADDW v16.8h, v10.8h , v28.8b ////Q2 - HAS Y + R
+ UADDW v14.8h, v5.8h , v28.8b ////Q7 - HAS Y + B
+ UADDW v16.8h, v7.8h , v28.8b ////Q2 - HAS Y + R
UADDW v18.8h, v12.8h , v28.8b ////Q3 - HAS Y + G
- UADDW v20.8h, v8.8h , v29.8b ////Q10 - HAS Y + B
- UADDW v22.8h, v10.8h , v29.8b ////Q11 - HAS Y + R
+ UADDW v20.8h, v5.8h , v29.8b ////Q10 - HAS Y + B
+ UADDW v22.8h, v7.8h , v29.8b ////Q11 - HAS Y + R
UADDW v24.8h, v12.8h , v29.8b ////Q12 - HAS Y + G
////COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
@@ -357,8 +360,8 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
////NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS
- sMULL v8.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
- sMULL2 v10.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
+ sMULL v5.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
+ sMULL2 v7.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
sMULL v20.4s, v6.4h, v0.4h[0] ////(V-128)*C1 FOR R
sMULL2 v22.4s, v6.8h, v0.4h[0] ////(V-128)*C1 FOR R
@@ -369,13 +372,13 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
sMLAL2 v14.4s, v6.8h, v0.4h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
////NARROW RIGHT SHIFT BY 13 FOR R&B
- sqshrn v8.4h, v8.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
- sqshrn2 v8.8h, v10.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
+ sqshrn v5.4h, v5.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
+ sqshrn2 v5.8h, v7.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
////Q4 - WEIGHT FOR B
////NARROW RIGHT SHIFT BY 13 FOR R&B
- sqshrn v10.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
- sqshrn2 v10.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
+ sqshrn v7.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
+ sqshrn2 v7.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
////Q5 - WEIGHT FOR R
////NARROW RIGHT SHIFT BY 13 FOR G
@@ -383,12 +386,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
sqshrn2 v12.8h, v14.4s,#13 ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
////Q6 - WEIGHT FOR G
- UADDW v14.8h, v8.8h , v30.8b ////Q7 - HAS Y + B
- UADDW v16.8h, v10.8h , v30.8b ////Q8 - HAS Y + R
+ UADDW v14.8h, v5.8h , v30.8b ////Q7 - HAS Y + B
+ UADDW v16.8h, v7.8h , v30.8b ////Q8 - HAS Y + R
UADDW v18.8h, v12.8h , v30.8b ////Q9 - HAS Y + G
- UADDW v20.8h, v8.8h , v31.8b ////Q10 - HAS Y + B
- UADDW v22.8h, v10.8h , v31.8b ////Q11 - HAS Y + R
+ UADDW v20.8h, v5.8h , v31.8b ////Q10 - HAS Y + B
+ UADDW v22.8h, v7.8h , v31.8b ////Q11 - HAS Y + R
UADDW v24.8h, v12.8h , v31.8b ////Q12 - HAS Y + G
sqxtun v14.8b, v14.8h
@@ -439,12 +442,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
////D14-D20 - TOALLY HAVE 16 VALUES
////WE NEED TO SHIFT R,G,B VALUES TO GET 5BIT,6BIT AND 5BIT COMBINATIONS
- UADDW v14.8h, v8.8h , v28.8b ////Q7 - HAS Y + B
- UADDW v16.8h, v10.8h , v28.8b ////Q2 - HAS Y + R
+ UADDW v14.8h, v5.8h , v28.8b ////Q7 - HAS Y + B
+ UADDW v16.8h, v7.8h , v28.8b ////Q2 - HAS Y + R
UADDW v18.8h, v12.8h , v28.8b ////Q3 - HAS Y + G
- UADDW v20.8h, v8.8h , v29.8b ////Q10 - HAS Y + B
- UADDW v22.8h, v10.8h , v29.8b ////Q11 - HAS Y + R
+ UADDW v20.8h, v5.8h , v29.8b ////Q10 - HAS Y + B
+ UADDW v22.8h, v7.8h , v29.8b ////Q11 - HAS Y + R
UADDW v24.8h, v12.8h , v29.8b ////Q12 - HAS Y + G
sqxtun v14.8b, v14.8h
@@ -513,7 +516,9 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
////POP THE REGISTERS
// LDMFD sp!,{x4-x12,PC}
ldp x19, x20,[sp],#16
- pop_v_regs
+ ldp d8,d15,[sp],#16 // Loading d15 using { ldr d15,[sp]; add sp,sp,#8 } is giving bus error.
+ // d8 is used as a dummy register and loaded along with d15 using ldp, so that sp moves by 16 bytes and stays 16-byte aligned. d8 is not used in the function.
+ ldp d12,d14,[sp],#16
ret