1 files changed, 17 insertions, 12 deletions
diff --git a/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s b/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
index fe7ac11..9b59d58 100644
--- a/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
+++ b/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
@@ -104,7 +104,10 @@
 ihevc_intra_pred_luma_mode_19_to_25_av8:
 
     // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
-    push_v_regs
+
+    stp         d9,d10,[sp,#-16]!           //save d9,d10: sp -= 16, then store the pair at [sp]
+    stp         d12,d13,[sp,#-16]!          //save d12,d13; d8 and d11 are not pushed by this sequence
+    stp         d14,d15,[sp,#-16]!          //save d14,d15; NOTE(review): confirm v8/v11 are never written in this function
     stp         x19, x20,[sp,#-16]!
 
     adrp        x7,  :got:gai4_ihevc_ang_table
@@ -267,7 +270,7 @@ prologue:
 
     add         x10,x8,x9                   //(i row)*pu1_ref[ref_main_idx]
 
-    ld1         {v8.8b},[x10],x11           //(i row)ref_main_idx
+    ld1         {v23.8b},[x10],x11          //(i row)ref_main_idx
     sbfx        x9,x14,#8,#8
 
     ld1         {v9.8b},[x10]               //(i row)ref_main_idx_1
@@ -278,7 +281,7 @@ prologue:
     add         x10,x8,x9                   //(iii)*pu1_ref[ref_main_idx]
 
     ld1         {v12.8b},[x12],x11          //(ii)ref_main_idx
-    umull       v10.8h, v8.8b, v30.8b       //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+    umull       v10.8h, v23.8b, v30.8b      //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
 
     ld1         {v13.8b},[x12]              //(ii)ref_main_idx_1
     umlal       v10.8h, v9.8b, v31.8b       //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
@@ -316,7 +319,7 @@ prologue:
     dup         v29.8b, v4.8b[5]            //(vi)
     add         x10,x8,x9                   //(v)*pu1_ref[ref_main_idx]
 
-    ld1         {v8.8b},[x10],x11           //(v)ref_main_idx
+    ld1         {v23.8b},[x10],x11          //(v)ref_main_idx
     sub         v24.8b,  v1.8b ,  v25.8b    //(iv)32-fract(dup_const_32_fract)
 
     umull       v22.8h, v20.8b, v24.8b      //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
@@ -336,7 +339,7 @@ prologue:
     add         x10,x8,x9                   //(vii)*pu1_ref[ref_main_idx]
 
     ld1         {v12.8b},[x12],x11          //(vi)ref_main_idx
-    umull       v10.8h, v8.8b, v30.8b       //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+    umull       v10.8h, v23.8b, v30.8b      //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
 
     ld1         {v13.8b},[x12]              //(vi)ref_main_idx_1
     umlal       v10.8h, v9.8b, v31.8b       //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
@@ -392,7 +395,7 @@ kernel_8_rows:
     subs        x4,x4,#8
     sbfx        x9,x14,#8,#8
 
-    ld1         {v8.8b},[x10],x11           //(i)ref_main_idx
+    ld1         {v23.8b},[x10],x11          //(i)ref_main_idx
     sub         v24.8b,  v1.8b ,  v25.8b    //(viii)32-fract(dup_const_32_fract)
 
     add         x20,x6,#8                   //increment the row value
@@ -416,7 +419,7 @@ kernel_8_rows:
     add         x10,x8,x9                   //(iii)*pu1_ref[ref_main_idx]
 
     ld1         {v12.8b},[x12],x11          //(ii)ref_main_idx
-    umull       v10.8h, v8.8b, v30.8b       //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+    umull       v10.8h, v23.8b, v30.8b      //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
 
     ld1         {v13.8b},[x12]              //(ii)ref_main_idx_1
     umlal       v10.8h, v9.8b, v31.8b       //(i)vmull_u8(ref_main_idx_1, dup_const_fract)
@@ -477,7 +480,7 @@ kernel_8_rows:
     dup         v25.8b, v4.8b[7]            //(viii)
     rshrn       v18.8b, v18.8h,#5           //(iii)shift_res = vrshrn_n_u16(add_res, 5)
 
-    ld1         {v8.8b},[x10],x11           //(v)ref_main_idx
+    ld1         {v23.8b},[x10],x11          //(v)ref_main_idx
     and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
 
     ld1         {v9.8b},[x10]               //(v)ref_main_idx_1
@@ -493,7 +496,7 @@ kernel_8_rows:
     sub         v30.8b,  v1.8b ,  v31.8b    //(v)32-fract(dup_const_32_fract)
 
     ld1         {v13.8b},[x12]              //(vi)ref_main_idx_1
-    umull       v10.8h, v8.8b, v30.8b       //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+    umull       v10.8h, v23.8b, v30.8b      //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
 
     umov        w14, v3.2s[0]               //(i)extract idx to the r register
     sxtw        x14,w14
@@ -592,7 +595,7 @@ core_loop_4:
     dup         v7.8b,w4                    //dup_const_32_fract
     umlal       v4.8h, v3.8b, v0.8b         //vmull_u8(ref_main_idx_1, dup_const_fract)
 
-    ld1         {v8.s}[0],[x10]             //ref_main_idx
+    ld1         {v23.s}[0],[x10]            //ref_main_idx
     add         x8,x8,#1
 
     ld1         {v9.s}[0],[x11]             //ref_main_idx_1
@@ -607,7 +610,7 @@ core_loop_4:
     add         x11,x10,#1                  //pu1_ref_main_idx_1 += 1
 
     dup         v12.8b,w5                   //dup_const_fract
-    umull       v10.8h, v8.8b, v7.8b        //vmull_u8(ref_main_idx, dup_const_32_fract)
+    umull       v10.8h, v23.8b, v7.8b       //vmull_u8(ref_main_idx, dup_const_32_fract)
 
     sub         x20,x5,#32
     neg         x4, x20
@@ -655,7 +658,9 @@ end_loops:
     add         sp, sp, #132
     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
     ldp         x19, x20,[sp],#16
-    pop_v_regs
+    ldp         d14,d15,[sp],#16            //restore d14,d15 from [sp], then sp += 16
+    ldp         d12,d13,[sp],#16            //restore d12,d13
+    ldp         d9,d10,[sp],#16             //restore d9,d10; pops run in the reverse order of the entry stp pushes
     ret