| author | Naveen Kumar Ponnusamy <naveenkumar.p@ittiam.com> | 2014-06-10 12:14:27 -0700 |
| --- | --- | --- |
| committer | Lajos Molnar <lajos@google.com> | 2014-07-12 15:09:24 -0700 |
| commit | 9cbd70a2930875be59d7df68136ac9a1a949a13d (patch) | |
| tree | 6d9957d14352fc77e2323f90b49387e577f1ade2 | |
| parent | 707042fda96ebede81408b854385173483798bcd (diff) | |
| download | android_external_libhevc-9cbd70a2930875be59d7df68136ac9a1a949a13d.tar.gz android_external_libhevc-9cbd70a2930875be59d7df68136ac9a1a949a13d.tar.bz2 android_external_libhevc-9cbd70a2930875be59d7df68136ac9a1a949a13d.zip | |
Reduced stack operations in arm64 assembly
Change-Id: Ia19a99001fef37334f18521dd8f8710907fe370d
46 files changed, 2398 insertions(+), 2331 deletions(-)
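The patch makes two systematic changes across these files: the blanket `push_v_regs`/`pop_v_regs` macros are replaced with explicit `stp`/`ldp` pairs that save only the registers each function actually clobbers, and scratch values are moved out of `v8`-`v15` (whose low halves, `d8`-`d15`, are callee-saved under AAPCS64) into caller-saved registers such as `v7`, `v16`-`v31`, so most functions no longer need any SIMD saves at all. Below is a minimal sketch of the resulting prologue/epilogue pattern; the label is illustrative, not from the source. It also shows why the in-diff comments resort to a dummy `d8`: AArch64 requires `sp` to stay 16-byte aligned when it is used as a base address, so a lone 8-byte `str d9,[sp]` after `sub sp,sp,#8` faults, which is consistent with the bus errors the comments report, while a paired `stp` preserves the alignment.

```asm
// Hypothetical leaf function using the save/restore pattern this patch adopts.
example_leaf_av8:
    stp     d14,d15,[sp,#-16]!  // pre-index by 16 keeps sp 16-byte aligned
    stp     d12,d13,[sp,#-16]!
    stp     d10,d11,[sp,#-16]!
    stp     d8,d9,[sp,#-16]!    // d8 saved only as a pairing dummy for d9;
                                // "sub sp,sp,#8; str d9,[sp]" would leave sp
                                // 8-byte aligned and raise a bus error
    stp     x19,x20,[sp,#-16]!  // callee-saved GPRs the function uses

    // ... body: scratch lives in v0-v7/v16-v31, so no further SIMD spills ...

    ldp     x19,x20,[sp],#16    // restore in reverse (LIFO) order
    ldp     d8,d9,[sp],#16
    ldp     d10,d11,[sp],#16
    ldp     d12,d13,[sp],#16
    ldp     d14,d15,[sp],#16
    ret
```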
diff --git a/common/arm64/ihevc_deblk_luma_horz.s b/common/arm64/ihevc_deblk_luma_horz.s
index a5c314d..f6989e9 100644
--- a/common/arm64/ihevc_deblk_luma_horz.s
+++ b/common/arm64/ihevc_deblk_luma_horz.s
@@ -50,7 +50,8 @@ ihevc_deblk_luma_horz_av8:
     // stmfd sp!, {x3-x12,x14}
     sxtw x5,w5
     sxtw x6,w6
-    stp d8,d9,[sp,#-16]!
+    stp d8,d9,[sp,#-16]!    // Storing d9 using { sub sp,sp,#8; str d9,[sp] } is giving bus error.
+                            // d8 is used as dummy register and stored along with d9 using stp. d8 is not used in the function.
     stp d10,d11,[sp,#-16]!
     stp d12,d13,[sp,#-16]!
     stp d14,d15,[sp,#-16]!
@@ -212,11 +213,11 @@ l1.1564:
     neg x19, x1
     ldrb w7,[x0,x19]        // has the -1 value
     dup v22.2s,w2           // -4 value
-    uaddw v8.8h, v6.8h , v27.8b
+    uaddw v7.8h, v6.8h , v27.8b
     ldrb w3,[x0,#0]         // x4 has the 0 value
     uqadd v16.8b, v27.8b , v1.8b
     and x2,x2,#0xff
-    mul v12.8h, v8.8h, v0.4h[0]
+    mul v12.8h, v7.8h, v0.4h[0]
     ldr w8, [x0,x10]        // has the 3 value
     uaddl v10.8h, v24.8b , v28.8b
     subs x2,x2,x7
@@ -233,7 +234,7 @@ l1.1564:
     cmp x8,x5,asr #3
     bge l1.1840
-    uaddw v14.8h, v8.8h , v28.8b
+    uaddw v14.8h, v7.8h , v28.8b
     subs x7,x3,x7
     umax v4.8b, v18.8b , v31.8b
     csneg x7,x7,x7,pl
@@ -285,13 +286,13 @@ l1.1564:
     subs x2,x2,x7
     umax v3.8b, v18.8b , v31.8b
     csneg x2,x2,x2,pl
-    uaddw v8.8h, v6.8h , v26.8b
+    uaddw v7.8h, v6.8h , v26.8b
     add x8,x8,x2
     uqadd v30.8b, v25.8b , v1.8b
     cmp x8,x5,asr #3
     uqsub v31.8b, v25.8b , v1.8b
     bge l1.1840
-    mul v12.8h, v8.8h, v0.4h[0]
+    mul v12.8h, v7.8h, v0.4h[0]
     subs x7,x3,x7
     uqadd v16.8b, v24.8b , v1.8b
     csneg x7,x7,x7,pl
@@ -303,7 +304,7 @@ l1.1564:
     add x10, x10,#1
     rshrn v20.8b, v12.8h,#3
     cmp x7,x10,asr #1
-    uaddw v14.8h, v8.8h , v23.8b
+    uaddw v14.8h, v7.8h , v23.8b
     bge l1.1840
     umin v18.8b, v20.8b , v30.8b
     mov x2,#2
@@ -397,7 +398,7 @@ end_dep_deq_decision_horz:
     cmp x2,#1
     uqsub v31.8b, v23.8b , v1.8b
     beq l1.2408
-    uaddl v8.8h, v23.8b , v22.8b
+    uaddl v7.8h, v23.8b , v22.8b
     cmp x5,#1
     bne strong_filtering_p
@@ -412,10 +413,10 @@ strong_filtering_q:
 strong_filtering_p:
     umax v5.8b, v18.8b , v17.8b
     mov x12,x0
-    mul v8.8h, v8.8h, v0.4h[0]
+    mul v7.8h, v7.8h, v0.4h[0]
     sub x20,x1,#0
     neg x11, x20
-    add v16.8h, v8.8h , v14.8h
+    add v16.8h, v7.8h , v14.8h
     add x12,x12,x11
     rshrn v19.8b, v16.8h,#3
     st1 {v2.s}[0],[x12],x11
@@ -431,7 +432,8 @@ l1.2404:
     ldp d14,d15,[sp],#16
     ldp d12,d13,[sp],#16
     ldp d10,d11,[sp],#16
-    ldp d8,d9,[sp],#16
+    ldp d8,d9,[sp],#16      // Loading d9 using { ldr d9,[sp]; add sp,sp,#8 } is giving bus error.
+                            // d8 is used as dummy register and loaded along with d9 using ldp. d8 is not used in the function.
     ret                     // x4=flag p
@@ -486,8 +488,8 @@ l1.2408:
     srshr v10.8h, v10.8h,#4    // delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4@
-    abs v8.8h, v10.8h
-    xtn v9.8b, v8.8h
+    abs v7.8h, v10.8h
+    xtn v9.8b, v7.8h        // storing the absolute values of delta in d9
     sqxtn v10.8b, v10.8h
@@ -495,16 +497,16 @@ l1.2408:
     smin v11.8b, v10.8b , v30.8b
-    smax v8.8b, v31.8b , v11.8b    // d8 has the value delta = clip3(delta, -tc, tc)//
+    smax v7.8b, v31.8b , v11.8b    // d8 has the value delta = clip3(delta, -tc, tc)//
     uxtl v6.8h, v25.8b
-    saddw v4.8h, v6.8h , v8.8b
+    saddw v4.8h, v6.8h , v7.8b
     sqxtun v12.8b, v4.8h
     uxtl v6.8h, v26.8b
-    ssubw v4.8h, v6.8h , v8.8b
+    ssubw v4.8h, v6.8h , v7.8b
     sqxtun v13.8b, v4.8h
@@ -525,7 +527,7 @@ l1.2408:
     uaddl v14.8h, v23.8b , v25.8b
     rshrn v14.8b, v14.8h,#1
     usubl v14.8h, v14.8b , v24.8b
-    saddw v14.8h, v14.8h , v8.8b
+    saddw v14.8h, v14.8h , v7.8b
     sqshrn v14.8b, v14.8h,#1
     smin v15.8b, v14.8b , v0.8b
     smax v14.8b, v1.8b , v15.8b
@@ -558,7 +560,7 @@ l1.2724:
     uaddl v14.8h, v26.8b , v28.8b
     rshrn v14.8b, v14.8h,#1
     usubl v14.8h, v14.8b , v27.8b
-    ssubw v14.8h, v14.8h , v8.8b
+    ssubw v14.8h, v14.8h , v7.8b
     sqshrn v14.8b, v14.8h,#1
     smin v15.8b, v14.8b , v0.8b
     smax v14.8b, v1.8b , v15.8b
@@ -580,7 +582,8 @@ l1.2852:
     ldp d14,d15,[sp],#16
     ldp d12,d13,[sp],#16
     ldp d10,d11,[sp],#16
-    ldp d8,d9,[sp],#16
+    ldp d8,d9,[sp],#16      // Loading d9 using { ldr d9,[sp]; add sp,sp,#8 } is giving bus error.
+                            // d8 is used as dummy register and loaded along with d9 using ldp. d8 is not used in the function.
     ret
diff --git a/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s b/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s
index e479651..180e5f5 100644
--- a/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s
+++ b/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s
@@ -104,7 +104,7 @@ ihevc_inter_pred_chroma_copy_w16out_av8:
     // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments
-    push_v_regs
+    stp x19, x20,[sp,#-16]!
mov x15,x4 // pi1_coeff @@ -172,7 +172,7 @@ end_inner_loop_wd_4: end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret @@ -219,14 +219,14 @@ core_loop_wd_8: prolog: add x6,x0,x2 //pu1_src_tmp += src_strd add x10,x1,x5 - ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp) - ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) - uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp) - uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp) - uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp) + ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) + uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp) subs x4,x4,#8 //wd decrements by 8 shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6) shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6) @@ -235,10 +235,10 @@ prolog: add x20,x0,x8 csel x0, x20, x0,le add x6,x0,x2 //pu1_src_tmp += src_strd - ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp) - ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp) + ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp) st1 {v0.8h},[x1],#16 //vst1q_s16(pi2_dst_tmp, tmp) add x20,x1,x11,lsl #1 @@ -256,15 +256,15 @@ prolog: outer_loop_wd_8: st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) + uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp) st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp) - uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp) subs x4,x4,#8 //wd decrements by 8 add x20,x0,x8 @@ -272,16 +272,16 @@ outer_loop_wd_8: add x6,x0,x2 //pu1_src_tmp += src_strd - ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp) + ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp) shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6) - ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp) shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6) - ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp) shl v4.8h, v20.8h,#6 //vshlq_n_s16(tmp, 6) - ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp) add x10,x1,x5 shl v6.8h, v22.8h,#6 //vshlq_n_s16(tmp, 6) @@ -298,15 +298,15 @@ outer_loop_wd_8: epilog: st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) + uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp) st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp) - 
uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp) //add x6,x0,x2 //pu1_src_tmp += src_strd shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6) @@ -325,10 +325,10 @@ epilog_end: core_loop_wd_8_ht_2: add x6,x0,x2 //pu1_src_tmp += src_strd add x10,x1,x5 - ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp) - ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) - uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp) + ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) + uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp) subs x12,x12,#8 //wd decrements by 8 shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6) shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6) @@ -338,7 +338,7 @@ core_loop_wd_8_ht_2: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_inter_pred_chroma_horz.s b/common/arm64/ihevc_inter_pred_chroma_horz.s index cf4f0f9..513a362 100644 --- a/common/arm64/ihevc_inter_pred_chroma_horz.s +++ b/common/arm64/ihevc_inter_pred_chroma_horz.s @@ -105,7 +105,12 @@ ihevc_inter_pred_chroma_horz_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d9,d10,[sp,#-16]! + stp d11,d12,[sp,#-16]! + stp d13,d14,[sp,#-16]! + stp d8,d15,[sp,#-16]! // Storing d15 using { sub sp,sp,#8; str d15,[sp] } is giving bus error. + // d8 is used as dummy register and stored along with d15 using stp. d8 is not used in the function. stp x19, x20,[sp,#-16]! mov x15,x4 // pi1_coeff @@ -184,7 +189,7 @@ outer_loop_16: add x19,x4,#8 umull v30.8h, v2.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// - ld1 { v8.2s},[x4],x11 //vector load pu1_src + ld1 { v29.2s},[x4],x11 //vector load pu1_src ld1 { v9.2s},[x19],x11 //vector load pu1_src umlsl v30.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// @@ -239,7 +244,7 @@ inner_loop_16: csel x12, x20, x12,eq add x20,x12,x2 csel x4, x20, x4,eq - umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v22.8h, v29.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// @@ -282,7 +287,7 @@ inner_loop_16: umlal v20.8h, v13.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// add x19,x4,#8 - ld1 { v8.2s},[x4],x11 //vector load pu1_src + ld1 { v29.2s},[x4],x11 //vector load pu1_src ld1 { v9.2s},[x19],x11 //vector load pu1_src umlsl v20.8h, v15.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// @@ -351,7 +356,7 @@ epilog: - umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v22.8h, v29.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// subs x10,x10,#16 //decrement the wd loop umlal v22.8h, v12.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// add x20,x12,x8 @@ -383,7 +388,7 @@ epilog: add x19,x4,#8 - ld1 { v8.2s},[x4],x11 //vector load pu1_src + ld1 { v29.2s},[x4],x11 //vector load pu1_src ld1 { v9.2s},[x19],x11 //vector load pu1_src umlsl v30.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// ld1 { v10.2s},[x4],x11 //vector load pu1_src @@ -418,7 +423,7 @@ epilog_end: umull v22.8h, v10.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// - umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v22.8h, v29.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// umlal v22.8h, v12.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// umlsl v22.8h, v14.8b, v27.8b //mul_res = 
vmlal_u8(src[0_1], coeffabs_1)// @@ -478,12 +483,12 @@ inner_loop_8: ld1 {v3.2s},[x12],x11 //vector load pu1_src //vext.u8 d2,d0,d1,#2 //vector extract of src[0_2] - umull v8.8h, v1.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// - umlsl v8.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umull v29.8h, v1.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// + umlsl v29.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// //vext.u8 d4,d0,d1,#4 //vector extract of src[0_4] //vext.u8 d6,d0,d1,#6 //vector extract of src[0_6] - umlal v8.8h, v2.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// - umlsl v8.8h, v3.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// + umlal v29.8h, v2.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// + umlsl v29.8h, v3.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// ld1 {v4.2s},[x4],x11 //vector load pu1_src ld1 {v5.2s},[x4],x11 //vector load pu1_src @@ -495,11 +500,11 @@ inner_loop_8: umlsl v10.8h, v4.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// //vext.u8 d16,d12,d13,#4 //vector extract of src[0_4] //vext.u8 d18,d12,d13,#6 //vector extract of src[0_6] - sqrshrun v8.8b, v8.8h,#6 //right shift and saturating narrow result 1 + sqrshrun v29.8b, v29.8h,#6 //right shift and saturating narrow result 1 umlal v10.8h, v6.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// umlsl v10.8h, v7.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// - st1 {v8.8b},[x1],#8 //store the result pu1_dst + st1 {v29.8b},[x1],#8 //store the result pu1_dst sqrshrun v10.8b, v10.8h,#6 //right shift and saturating narrow result 2 subs x7,x7,#8 //decrement the wd loop @@ -545,17 +550,17 @@ inner_loop_ht_4: //sub x12, x12, #6 //(2) ld1 {v14.2s},[x12],x11 //(3)vector load pu1_src - umull v8.8h, v1.8b, v25.8b //(1)mul_res = vmull_u8(src[0_3], coeffabs_3)// + umull v29.8h, v1.8b, v25.8b //(1)mul_res = vmull_u8(src[0_3], coeffabs_3)// ld1 {v15.2s},[x12],x11 //(3)vector load pu1_src - umlsl v8.8h, v0.8b, v24.8b //(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v29.8h, v0.8b, v24.8b //(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)// ld1 {v16.2s},[x12],x11 //(3)vector load pu1_src - umlal v8.8h, v2.8b, v26.8b //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)// + umlal v29.8h, v2.8b, v26.8b //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)// //ld1 {v17.2s},[x12],x2 //(3)vector load pu1_src ld1 {v17.2s},[x12],x8 //(3)vector load pu1_src - umlsl v8.8h, v3.8b, v27.8b //(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)// + umlsl v29.8h, v3.8b, v27.8b //(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)// //sub x12, x12, #6 //(3) umull v10.8h, v5.8b, v25.8b //(2)mul_res = vmull_u8(src[0_3], coeffabs_3)// @@ -570,7 +575,7 @@ inner_loop_ht_4: umlsl v10.8h, v7.8b, v27.8b //(2)mul_res = vmlal_u8(src[0_1], coeffabs_1)// ld1 {v21.2s},[x12],x2 //(4)vector load pu1_src - sqrshrun v8.8b, v8.8h,#6 //(1)right shift and saturating narrow result 1 + sqrshrun v29.8b, v29.8h,#6 //(1)right shift and saturating narrow result 1 add x9,x9,#8 //(core loop) @@ -595,7 +600,7 @@ core_loop: //sub x12, x12, #6 //(1_1) - st1 {v8.8b},[x4],x3 //(1)store the result pu1_dst + st1 {v29.8b},[x4],x3 //(1)store the result pu1_dst sqrshrun v10.8b, v10.8h,#6 //(2)right shift and saturating narrow result 2 ld1 {v4.2s},[x12],x11 //(2_1)vector load pu1_src @@ -617,17 +622,17 @@ core_loop: sqrshrun v12.8b, v12.8h,#6 //(3)right shift and saturating narrow result 1 ld1 {v14.2s},[x12],x11 //(3_1)vector load pu1_src - umull v8.8h, v1.8b, v25.8b //(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)// 
+ umull v29.8h, v1.8b, v25.8b //(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)// ld1 {v15.2s},[x12],x11 //(3_1)vector load pu1_src - umlsl v8.8h, v0.8b, v24.8b //(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v29.8h, v0.8b, v24.8b //(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)// ld1 {v16.2s},[x12],x11 //(3_1)vector load pu1_src - umlal v8.8h, v2.8b, v26.8b //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)// + umlal v29.8h, v2.8b, v26.8b //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)// //ld1 {v17.2s},[x12],x2 //(3_1)vector load pu1_src ld1 {v17.2s},[x12],x8 //(3_1)vector load pu1_src - umlsl v8.8h, v3.8b, v27.8b //(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)// + umlsl v29.8h, v3.8b, v27.8b //(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)// //sub x12, x12, #6 //(3_1) @@ -653,7 +658,7 @@ core_loop: subs x7,x7,#8 //(core loop) st1 {v22.8b},[x4], x3 //(4)store the result pu1_dst - sqrshrun v8.8b, v8.8h,#6 //(1_1)right shift and saturating narrow result 1 + sqrshrun v29.8b, v29.8h,#6 //(1_1)right shift and saturating narrow result 1 mov x4, x1 //(core loop) @@ -668,7 +673,7 @@ epilogue: umlsl v12.8h, v17.8b, v27.8b //(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)// - st1 {v8.8b},[x4],x3 //(1)store the result pu1_dst + st1 {v29.8b},[x4],x3 //(1)store the result pu1_dst sqrshrun v10.8b, v10.8h,#6 //(2)right shift and saturating narrow result 2 umull v22.8h, v19.8b, v25.8b //(4)mul_res = vmull_u8(src[0_3], coeffabs_3)// @@ -735,16 +740,16 @@ inner_loop_4: zip1 v3.2s, v23.2s, v19.2s zip2 v7.2s, v23.2s, v19.2s - umull v8.8h, v1.8b, v25.8b //arithmetic operations for ii iteration in the same time - umlsl v8.8h, v0.8b, v24.8b - umlal v8.8h, v2.8b, v26.8b - umlsl v8.8h, v3.8b, v27.8b + umull v29.8h, v1.8b, v25.8b //arithmetic operations for ii iteration in the same time + umlsl v29.8h, v0.8b, v24.8b + umlal v29.8h, v2.8b, v26.8b + umlsl v29.8h, v3.8b, v27.8b - sqrshrun v8.8b, v8.8h,#6 //narrow right shift and saturating the result - st1 {v8.s}[0],[x1],#4 //store the i iteration result which is in upper part of the register + sqrshrun v29.8b, v29.8h,#6 //narrow right shift and saturating the result + st1 {v29.s}[0],[x1],#4 //store the i iteration result which is in upper part of the register subs x7,x7,#4 //decrement the wd by 4 - st1 {v8.s}[1],[x6],#4 //store the ii iteration result which is in lower part of the register + st1 {v29.s}[1],[x6],#4 //store the ii iteration result which is in lower part of the register bgt inner_loop_4 @@ -759,7 +764,11 @@ end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d8,d15,[sp],#16 // Loading d15 using { ldr d15,[sp]; add sp,sp,#8 } is giving bus error. + // d8 is used as dummy register and loaded along with d15 using ldp. d8 is not used in the function. + ldp d13,d14,[sp],#16 + ldp d11,d12,[sp],#16 + ldp d9,d10,[sp],#16 ret diff --git a/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s b/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s index a35fdaa..efc09f9 100644 --- a/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s +++ b/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s @@ -104,7 +104,10 @@ ihevc_inter_pred_chroma_horz_w16out_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! stp x19, x20,[sp,#-16]! 
mov x15,x4 // pi1_coeff @@ -201,8 +204,8 @@ outer_loop_16: add x19,x4,#8 umull v30.8h, v2.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// - ld1 { v8.2s},[x4],x11 //vector load pu1_src - ld1 { v9.2s},[x19],x11 //vector load pu1_src + ld1 { v29.2s},[x4],x11 //vector load pu1_src + ld1 { v31.2s},[x19],x11 //vector load pu1_src umlsl v30.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// @@ -261,7 +264,7 @@ inner_loop_16: st1 { v30.8h}, [x1],#16 - umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v22.8h, v29.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// @@ -284,15 +287,15 @@ inner_loop_16: umull v20.8h, v11.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// st1 { v28.8h}, [x1],x8 - umlsl v20.8h, v9.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v20.8h, v31.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// ld1 { v6.2s},[x12],x9 //vector load pu1_src ld1 { v7.2s},[x19],x9 //vector load pu1_src umlal v20.8h, v13.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// add x19,x4,#8 - ld1 { v8.2s},[x4],x11 //vector load pu1_src - ld1 { v9.2s},[x19],x11 //vector load pu1_src + ld1 { v29.2s},[x4],x11 //vector load pu1_src + ld1 { v31.2s},[x19],x11 //vector load pu1_src umlsl v20.8h, v15.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// @@ -346,7 +349,7 @@ epilog: - umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v22.8h, v29.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// subs x10,x10,#16 //decrement the wd loop umlal v22.8h, v12.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// // add x20,x12,x2,lsl #1 @@ -365,7 +368,7 @@ epilog: ld1 { v0.2s},[x12],x11 //vector load pu1_src ld1 { v1.2s},[x19],x11 //vector load pu1_src - umlsl v20.8h, v9.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v20.8h, v31.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// ld1 { v2.2s},[x12],x11 //vector load pu1_src ld1 { v3.2s},[x19],x11 //vector load pu1_src @@ -381,8 +384,8 @@ epilog: umull v30.8h, v2.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// add x19,x4,#8 - ld1 { v8.2s},[x4],x11 //vector load pu1_src - ld1 { v9.2s},[x19],x11 //vector load pu1_src + ld1 { v29.2s},[x4],x11 //vector load pu1_src + ld1 { v31.2s},[x19],x11 //vector load pu1_src umlsl v30.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// ld1 { v10.2s},[x4],x11 //vector load pu1_src @@ -410,13 +413,13 @@ epilog: epilog_end: umull v22.8h, v10.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// - umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v22.8h, v29.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// umlal v22.8h, v12.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// umlsl v22.8h, v14.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// umull v20.8h, v11.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// - umlsl v20.8h, v9.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v20.8h, v31.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// umlal v20.8h, v13.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// umlsl v20.8h, v15.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// @@ -463,12 +466,12 @@ inner_loop_8: //vext.u8 d2,d0,d1,#2 //vector extract of src[0_2] - umull v8.8h, v1.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// - umlsl v8.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umull v29.8h, v1.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// + umlsl v29.8h, 
v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// //vext.u8 d4,d0,d1,#4 //vector extract of src[0_4] //vext.u8 d6,d0,d1,#6 //vector extract of src[0_6] - umlal v8.8h, v2.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// - umlsl v8.8h, v3.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// + umlal v29.8h, v2.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// + umlsl v29.8h, v3.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// //ld1 {v12.2s, v13.2s},[x4],x11 //vector load pu1_src + src_strd ld1 {v4.2s},[x4],x11 //vector load pu1_src @@ -483,7 +486,7 @@ inner_loop_8: umlal v10.8h, v6.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// umlsl v10.8h, v7.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// - st1 {v8.8h}, [x1],#16 + st1 {v29.8h}, [x1],#16 subs x10,x10,#8 //decrement the wd loop st1 {v10.8h},[x6],#16 //store the result pu1_dst @@ -530,16 +533,16 @@ inner_loop_ht_4: ld1 {v7.2s},[x12],x0 //(2)vector load pu1_src ld1 {v14.2s},[x12],x11 //(3)vector load pu1_src - umull v8.8h, v1.8b, v25.8b //(1)mul_res = vmull_u8(src[0_3], coeffabs_3)// + umull v29.8h, v1.8b, v25.8b //(1)mul_res = vmull_u8(src[0_3], coeffabs_3)// ld1 {v15.2s},[x12],x11 //(3)vector load pu1_src - umlsl v8.8h, v0.8b, v24.8b //(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v29.8h, v0.8b, v24.8b //(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)// ld1 {v16.2s},[x12],x11 //(3)vector load pu1_src - umlal v8.8h, v2.8b, v26.8b //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)// + umlal v29.8h, v2.8b, v26.8b //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)// ld1 {v17.2s},[x12],x0 //(3)vector load pu1_src - umlsl v8.8h, v3.8b, v27.8b //(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)// + umlsl v29.8h, v3.8b, v27.8b //(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)// ld1 {v18.2s},[x12],x11 //(4)vector load pu1_src umull v10.8h, v5.8b, v25.8b //(2)mul_res = vmull_u8(src[0_3], coeffabs_3)// @@ -559,7 +562,7 @@ inner_loop_ht_4: beq epilogue core_loop: - st1 {v8.8h},[x4],x8 //(1)store the result pu1_dst + st1 {v29.8h},[x4],x8 //(1)store the result pu1_dst mov x12,x9 ld1 {v0.2s},[x12],x11 //(1_1)vector load pu1_src @@ -593,16 +596,16 @@ core_loop: add x1,x1,#16 //(core loop) ld1 {v14.2s},[x12],x11 //(3_1)vector load pu1_src - umull v8.8h, v1.8b, v25.8b //(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)// + umull v29.8h, v1.8b, v25.8b //(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)// ld1 {v15.2s},[x12],x11 //(3_1)vector load pu1_src - umlsl v8.8h, v0.8b, v24.8b //(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v29.8h, v0.8b, v24.8b //(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)// ld1 {v16.2s},[x12],x11 //(3_1)vector load pu1_src - umlal v8.8h, v2.8b, v26.8b //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)// + umlal v29.8h, v2.8b, v26.8b //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)// ld1 {v17.2s},[x12],x0 //(3_1)vector load pu1_src - umlsl v8.8h, v3.8b, v27.8b //(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)// + umlsl v29.8h, v3.8b, v27.8b //(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)// st1 {v22.8h}, [x4], x8 //(4)store the result pu1_dst subs x10,x10,#8 //(core loop) @@ -634,7 +637,7 @@ epilogue: umlsl v12.8h, v17.8b, v27.8b //(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)// - st1 {v8.8h},[x4], x8 //(1)store the result pu1_dst + st1 {v29.8h},[x4], x8 //(1)store the result pu1_dst umull v22.8h, v19.8b, v25.8b //(4)mul_res = vmull_u8(src[0_3], coeffabs_3)// umlsl v22.8h, v18.8b, v24.8b //(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)// @@ -720,15 +723,15 @@ inner_loop_4: zip2 v7.2s, v23.2s, v19.2s //**** addn 
ends - umull v8.8h, v1.8b, v25.8b //arithmetic operations for ii iteration in the same time - umlsl v8.8h, v0.8b, v24.8b - umlal v8.8h, v2.8b, v26.8b - umlsl v8.8h, v3.8b, v27.8b + umull v29.8h, v1.8b, v25.8b //arithmetic operations for ii iteration in the same time + umlsl v29.8h, v0.8b, v24.8b + umlal v29.8h, v2.8b, v26.8b + umlsl v29.8h, v3.8b, v27.8b - st1 {v8.d}[0],[x1],#8 //store the i iteration result which is in upper part of the register + st1 {v29.d}[0],[x1],#8 //store the i iteration result which is in upper part of the register subs x10,x10,#4 //decrement the wd by 4 - st1 {v8.d}[1],[x6],#8 //store the ii iteration result which is in lower part of the register + st1 {v29.d}[1],[x6],#8 //store the ii iteration result which is in lower part of the register bgt inner_loop_4 @@ -763,12 +766,12 @@ loop_residue: //vext.u8 d6,d0,d1,#6 //vector extract of src[0_6] //umlal v8.8h, v4.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// //umlsl v8.8h, v6.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// - umull v8.8h, v21.8b, v25.8b - umlsl v8.8h, v20.8b, v24.8b - umlal v8.8h, v22.8b, v26.8b - umlsl v8.8h, v23.8b, v27.8b + umull v29.8h, v21.8b, v25.8b + umlsl v29.8h, v20.8b, v24.8b + umlal v29.8h, v22.8b, v26.8b + umlsl v29.8h, v23.8b, v27.8b - st1 {v8.1d},[x1] //store the result pu1_dst + st1 {v29.1d},[x1] //store the result pu1_dst subs x10,x10,#4 //decrement the wd loop add x1,x1,#8 //pi2_dst + 8 @@ -788,7 +791,9 @@ end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 ret diff --git a/common/arm64/ihevc_inter_pred_chroma_vert.s b/common/arm64/ihevc_inter_pred_chroma_vert.s index 2de789f..3d61f6c 100644 --- a/common/arm64/ihevc_inter_pred_chroma_vert.s +++ b/common/arm64/ihevc_inter_pred_chroma_vert.s @@ -104,7 +104,7 @@ ihevc_inter_pred_chroma_vert_av8: // stmfd sp!,{x4-x12,x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! 
mov x15,x4 // pi1_coeff @@ -142,21 +142,21 @@ ihevc_inter_pred_chroma_vert_av8: inner_loop_ht_2: //called when wd is multiple of 4 and ht is 4,2 add x6,x0,x2 //pu1_src +src_strd - ld1 {v9.8b},[x6],x2 //loads pu1_src + ld1 {v17.8b},[x6],x2 //loads pu1_src subs x5,x5,#8 //2wd - 8 ld1 {v5.8b},[x0],#8 //loads src - umull v6.8h, v9.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1) + umull v6.8h, v17.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1) ld1 {v4.8b},[x6],x2 //loads incremented src umlsl v6.8h, v5.8b, v0.8b //vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0) - ld1 {v8.8b},[x6],x2 //loads incremented src + ld1 {v16.8b},[x6],x2 //loads incremented src umlal v6.8h, v4.8b, v2.8b //vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2) umull v4.8h, v4.8b, v1.8b - umlsl v6.8h, v8.8b, v3.8b - umlsl v4.8h, v9.8b, v0.8b - ld1 {v10.8b},[x6] //loads the incremented src - umlal v4.8h, v8.8b, v2.8b + umlsl v6.8h, v16.8b, v3.8b + umlsl v4.8h, v17.8b, v0.8b + ld1 {v18.8b},[x6] //loads the incremented src + umlal v4.8h, v16.8b, v2.8b sqrshrun v6.8b, v6.8h,#6 //shifts right - umlsl v4.8h, v10.8b, v3.8b + umlsl v4.8h, v18.8b, v3.8b add x6,x1,x3 //pu1_dst + dst_strd sqrshrun v4.8b, v4.8h,#6 //shifts right st1 {v6.8b},[x1],#8 //stores the loaded value @@ -240,7 +240,7 @@ prolog: add x7,x1,x3 //pu1_dst umlal v30.8h, v6.8b, v2.8b umlsl v30.8h, v7.8b, v3.8b - ld1 {v8.8b},[x6],x2 //load and increment + ld1 {v16.8b},[x6],x2 //load and increment umull v28.8h, v6.8b, v1.8b //mul_res 2 add x20,x0,x9 //pu1_dst += 4*dst_strd - 2*wd @@ -249,30 +249,30 @@ prolog: bic x20,x10,#7 //x5 ->wd csel x5, x20, x5,le umlal v28.8h, v7.8b, v2.8b - ld1 {v9.8b},[x6],x2 - umlsl v28.8h, v8.8b, v3.8b + ld1 {v17.8b},[x6],x2 + umlsl v28.8h, v16.8b, v3.8b sqrshrun v30.8b, v30.8h,#6 - ld1 {v10.8b},[x6],x2 + ld1 {v18.8b},[x6],x2 umull v26.8h, v7.8b, v1.8b add x6,x0,x2 //pu1_src + src_strd umlsl v26.8h, v6.8b, v0.8b st1 {v30.8b},[x1],#8 //stores the loaded value - umlal v26.8h, v8.8b, v2.8b + umlal v26.8h, v16.8b, v2.8b ld1 {v4.8b},[x0],#8 //loads the source - umlsl v26.8h, v9.8b, v3.8b + umlsl v26.8h, v17.8b, v3.8b sqrshrun v28.8b, v28.8h,#6 add x20,x1,x8 //pu1_src += 4*src_strd - 2*wd csel x1, x20, x1,le - umull v24.8h, v8.8b, v1.8b + umull v24.8h, v16.8b, v1.8b ld1 {v5.8b},[x6],x2 //loads pu1_src umlsl v24.8h, v7.8b, v0.8b subs x12,x12,#4 ld1 {v6.8b},[x6],x2 //load and increment - umlal v24.8h, v9.8b, v2.8b + umlal v24.8h, v17.8b, v2.8b ld1 {v7.8b},[x6],x2 //load and increment - umlsl v24.8h, v10.8b, v3.8b + umlsl v24.8h, v18.8b, v3.8b lsl x11,x2,#2 st1 {v28.8b},[x7],x3 //stores the loaded value @@ -299,7 +299,7 @@ kernel_8: st1 {v26.8b},[x7],x3 //stores the loaded value sqrshrun v24.8b, v24.8h,#6 - ld1 {v8.8b},[x6],x2 //load and increment + ld1 {v16.8b},[x6],x2 //load and increment umull v28.8h, v6.8b, v1.8b //mul_res 2 bic x20,x10,#7 //x5 ->wd @@ -309,11 +309,11 @@ kernel_8: umlal v28.8h, v7.8b, v2.8b - ld1 {v9.8b},[x6],x2 + ld1 {v17.8b},[x6],x2 sqrshrun v30.8b, v30.8h,#6 - umlsl v28.8h, v8.8b, v3.8b - ld1 {v10.8b},[x6],x2 + umlsl v28.8h, v16.8b, v3.8b + ld1 {v18.8b},[x6],x2 add x7,x1,x3 //pu1_dst umull v26.8h, v7.8b, v1.8b add x6,x0,x2 //pu1_src + src_strd @@ -325,16 +325,16 @@ kernel_8: umlsl v26.8h, v6.8b, v0.8b ld1 {v4.8b},[x0],#8 //loads the source - umlal v26.8h, v8.8b, v2.8b + umlal v26.8h, v16.8b, v2.8b st1 {v30.8b},[x1],#8 //stores the loaded value - umlsl v26.8h, v9.8b, v3.8b + umlsl v26.8h, v17.8b, v3.8b ld1 {v5.8b},[x6],x2 //loads pu1_src add x11,x11,x2 sqrshrun v28.8b, 
v28.8h,#6 - umull v24.8h, v8.8b, v1.8b + umull v24.8h, v16.8b, v1.8b ld1 {v6.8b},[x6],x2 //load and increment add x20,x1,x8 //pu1_src += 4*src_strd - 2*wd csel x1, x20, x1,le @@ -348,10 +348,10 @@ kernel_8: umlsl v24.8h, v7.8b, v0.8b subs x12,x12,#4 - umlal v24.8h, v9.8b, v2.8b + umlal v24.8h, v17.8b, v2.8b ld1 {v7.8b},[x6],x2 //load and increment - umlsl v24.8h, v10.8b, v3.8b + umlsl v24.8h, v18.8b, v3.8b st1 {v28.8b},[x7],x3 //stores the loaded value sqrshrun v26.8b, v26.8h,#6 @@ -366,39 +366,39 @@ epilog: st1 {v26.8b},[x7],x3 //stores the loaded value sqrshrun v24.8b, v24.8h,#6 - ld1 {v8.8b},[x6],x2 //load and increment + ld1 {v16.8b},[x6],x2 //load and increment umull v28.8h, v6.8b, v1.8b //mul_res 2 umlsl v28.8h, v5.8b, v0.8b umlal v28.8h, v7.8b, v2.8b - umlsl v28.8h, v8.8b, v3.8b + umlsl v28.8h, v16.8b, v3.8b st1 {v24.8b},[x7],x3 //stores the loaded value sqrshrun v30.8b, v30.8h,#6 - ld1 {v9.8b},[x6],x2 + ld1 {v17.8b},[x6],x2 umull v26.8h, v7.8b, v1.8b add x7,x1,x3 //pu1_dst umlsl v26.8h, v6.8b, v0.8b st1 {v30.8b},[x1],#8 //stores the loaded value sqrshrun v28.8b, v28.8h,#6 - umlal v26.8h, v8.8b, v2.8b - ld1 {v10.8b},[x6],x2 - umlsl v26.8h, v9.8b, v3.8b + umlal v26.8h, v16.8b, v2.8b + ld1 {v18.8b},[x6],x2 + umlsl v26.8h, v17.8b, v3.8b - umull v24.8h, v8.8b, v1.8b + umull v24.8h, v16.8b, v1.8b sqrshrun v26.8b, v26.8h,#6 st1 {v28.8b},[x7],x3 //stores the loaded value umlsl v24.8h, v7.8b, v0.8b - umlal v24.8h, v9.8b, v2.8b + umlal v24.8h, v17.8b, v2.8b st1 {v26.8b},[x7],x3 //stores the loaded value - umlsl v24.8h, v10.8b, v3.8b + umlsl v24.8h, v18.8b, v3.8b sqrshrun v24.8b, v24.8h,#6 st1 {v24.8b},[x7],x3 //stores the loaded value end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s index 55e7f54..e8f17cc 100644 --- a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s +++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s @@ -104,7 +104,7 @@ ihevc_inter_pred_chroma_vert_w16inp_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! 
mov x15,x4 // pi1_coeff @@ -120,10 +120,10 @@ ihevc_inter_pred_chroma_vert_w16inp_av8: sxtl v0.8h, v0.8b //long the value tst x6,#3 //checks wd == 2 - dup v12.4h, v0.4h[0] //coeff_0 - dup v13.4h, v0.4h[1] //coeff_1 - dup v14.4h, v0.4h[2] //coeff_2 - dup v15.4h, v0.4h[3] //coeff_3 + dup v16.4h, v0.4h[0] //coeff_0 + dup v17.4h, v0.4h[1] //coeff_1 + dup v18.4h, v0.4h[2] //coeff_2 + dup v19.4h, v0.4h[3] //coeff_3 bgt core_loop_ht_2 //jumps to loop handles wd 2 @@ -141,22 +141,22 @@ core_loop_ht_2: inner_loop_ht_2: add x0,x4,x2 //increments pi2_src ld1 {v0.4h},[x4],#8 //loads pu1_src - smull v0.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0) + smull v0.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0) subs x12,x12,#8 //2wd + 8 ld1 {v2.4h},[x0],x2 //loads pi2_src - smull v8.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v7.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) ld1 {v3.4h},[x0],x2 //loads pi2_src - smlal v0.4s, v2.4h, v13.4h + smlal v0.4s, v2.4h, v17.4h ld1 {v6.4h},[x0],x2 - smlal v8.4s, v3.4h, v13.4h + smlal v7.4s, v3.4h, v17.4h ld1 {v2.4h},[x0] add x7,x1,x3 //pu1_dst + dst_strd - smlal v0.4s, v3.4h, v14.4h - smlal v8.4s, v6.4h, v14.4h - smlal v0.4s, v6.4h, v15.4h - smlal v8.4s, v2.4h, v15.4h + smlal v0.4s, v3.4h, v18.4h + smlal v7.4s, v6.4h, v18.4h + smlal v0.4s, v6.4h, v19.4h + smlal v7.4s, v2.4h, v19.4h sqshrn v0.4h, v0.4s,#6 //right shift - sqshrn v30.4h, v8.4s,#6 //right shift + sqshrn v30.4h, v7.4s,#6 //right shift sqrshrun v0.8b, v0.8h,#6 //rounding shift sqrshrun v30.8b, v30.8h,#6 //rounding shift st1 {v0.s}[0],[x1],#4 //stores the loaded value @@ -189,45 +189,45 @@ prolog: ld1 {v1.4h},[x0],x2 //loads pi2_src subs x11,x11,#4 ld1 {v2.4h},[x0],x2 //loads pi2_src - smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0) + smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0) ld1 {v3.4h},[x0],x2 - smlal v30.4s, v1.4h, v13.4h - smlal v30.4s, v2.4h, v14.4h + smlal v30.4s, v1.4h, v17.4h + smlal v30.4s, v2.4h, v18.4h add x9,x1,x3 //pu1_dst + dst_strd - smlal v30.4s, v3.4h, v15.4h + smlal v30.4s, v3.4h, v19.4h ld1 {v4.4h},[x0],x2 - smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) add x20,x4,x8 csel x4, x20, x4,le - smlal v28.4s, v2.4h, v13.4h + smlal v28.4s, v2.4h, v17.4h ld1 {v5.4h},[x0],x2 - smlal v28.4s, v3.4h, v14.4h + smlal v28.4s, v3.4h, v18.4h ld1 {v6.4h},[x0],x2 - smlal v28.4s, v4.4h, v15.4h + smlal v28.4s, v4.4h, v19.4h lsl x20,x6,#1 csel x11, x20, x11,le sqshrn v30.4h, v30.4s,#6 //right shift - smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) add x0,x4,x2 - smlal v26.4s, v3.4h, v13.4h - smlal v26.4s, v4.4h, v14.4h + smlal v26.4s, v3.4h, v17.4h + smlal v26.4s, v4.4h, v18.4h ld1 {v0.4h},[x4],#8 //loads pu1_src - smlal v26.4s, v5.4h, v15.4h + smlal v26.4s, v5.4h, v19.4h sqrshrun v30.8b, v30.8h,#6 //rounding shift sqshrn v28.4h, v28.4s,#6 //right shift ld1 {v1.4h},[x0],x2 //loads pi2_src - smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) st1 {v30.s}[0],[x1],#4 //stores the loaded value - smlal v24.4s, v4.4h, v13.4h + smlal v24.4s, v4.4h, v17.4h ld1 {v2.4h},[x0],x2 //loads pi2_src - smlal v24.4s, v5.4h, v14.4h + smlal v24.4s, v5.4h, v18.4h ld1 {v3.4h},[x0],x2 - smlal v24.4s, v6.4h, v15.4h + smlal v24.4s, v6.4h, v19.4h add x20,x1,x14 csel x1, x20, x1,le @@ -238,21 +238,21 @@ prolog: beq epilog //jumps to epilog kernel_4: - smull v30.4s, v0.4h, v12.4h 
//vmull_s16(src_tmp1, coeff_0) + smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0) subs x11,x11,#4 - smlal v30.4s, v1.4h, v13.4h + smlal v30.4s, v1.4h, v17.4h st1 {v28.s}[0],[x9],x3 //stores the loaded value - smlal v30.4s, v2.4h, v14.4h - smlal v30.4s, v3.4h, v15.4h + smlal v30.4s, v2.4h, v18.4h + smlal v30.4s, v3.4h, v19.4h sqshrn v24.4h, v24.4s,#6 //right shift sqrshrun v26.8b, v26.8h,#6 //rounding shift ld1 {v4.4h},[x0],x2 - smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) - smlal v28.4s, v2.4h, v13.4h - smlal v28.4s, v3.4h, v14.4h - smlal v28.4s, v4.4h, v15.4h + smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) + smlal v28.4s, v2.4h, v17.4h + smlal v28.4s, v3.4h, v18.4h + smlal v28.4s, v4.4h, v19.4h st1 {v26.s}[0],[x9],x3 //stores the loaded value add x20,x4,x8 csel x4, x20, x4,le @@ -263,28 +263,28 @@ kernel_4: sqrshrun v24.8b, v24.8h,#6 //rounding shift ld1 {v5.4h},[x0],x2 - smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) ld1 {v6.4h},[x0],x2 - smlal v26.4s, v3.4h, v13.4h + smlal v26.4s, v3.4h, v17.4h st1 {v24.s}[0],[x9] //stores the loaded value add x0,x4,x2 - smlal v26.4s, v4.4h, v14.4h + smlal v26.4s, v4.4h, v18.4h ld1 {v0.4h},[x4],#8 //loads pu1_src - smlal v26.4s, v5.4h, v15.4h + smlal v26.4s, v5.4h, v19.4h sqshrn v28.4h, v28.4s,#6 //right shift sqrshrun v30.8b, v30.8h,#6 //rounding shift ld1 {v1.4h},[x0],x2 //loads pi2_src - smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) add x9,x1,x3 //pu1_dst + dst_strd ld1 {v2.4h},[x0],x2 //loads pi2_src - smlal v24.4s, v4.4h, v13.4h + smlal v24.4s, v4.4h, v17.4h ld1 {v3.4h},[x0],x2 - smlal v24.4s, v5.4h, v14.4h + smlal v24.4s, v5.4h, v18.4h st1 {v30.s}[0],[x1],#4 //stores the loaded value - smlal v24.4s, v6.4h, v15.4h + smlal v24.4s, v6.4h, v19.4h sqshrn v26.4h, v26.4s,#6 //right shift sqrshrun v28.8b, v28.8h,#6 //rounding shift @@ -296,41 +296,41 @@ kernel_4: bgt kernel_4 //jumps to kernel_4 epilog: - smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0) + smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0) st1 {v28.s}[0],[x9],x3 //stores the loaded value - smlal v30.4s, v1.4h, v13.4h - smlal v30.4s, v2.4h, v14.4h - smlal v30.4s, v3.4h, v15.4h + smlal v30.4s, v1.4h, v17.4h + smlal v30.4s, v2.4h, v18.4h + smlal v30.4s, v3.4h, v19.4h sqshrn v24.4h, v24.4s,#6 //right shift sqrshrun v26.8b, v26.8h,#6 //rounding shift - smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) ld1 {v4.4h},[x0],x2 - smlal v28.4s, v2.4h, v13.4h + smlal v28.4s, v2.4h, v17.4h st1 {v26.s}[0],[x9],x3 //stores the loaded value - smlal v28.4s, v3.4h, v14.4h - smlal v28.4s, v4.4h, v15.4h + smlal v28.4s, v3.4h, v18.4h + smlal v28.4s, v4.4h, v19.4h sqshrn v30.4h, v30.4s,#6 //right shift sqrshrun v24.8b, v24.8h,#6 //rounding shift - smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) ld1 {v5.4h},[x0],x2 - smlal v26.4s, v3.4h, v13.4h - smlal v26.4s, v4.4h, v14.4h - smlal v26.4s, v5.4h, v15.4h + smlal v26.4s, v3.4h, v17.4h + smlal v26.4s, v4.4h, v18.4h + smlal v26.4s, v5.4h, v19.4h sqshrn v28.4h, v28.4s,#6 //right shift sqrshrun v30.8b, v30.8h,#6 //rounding shift st1 {v24.s}[0],[x9] //stores the loaded value - smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) - smlal v24.4s, v4.4h, v13.4h + smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) + smlal 
v24.4s, v4.4h, v17.4h add x9,x1,x3 //pu1_dst + dst_strd ld1 {v6.4h},[x0],x2 - smlal v24.4s, v5.4h, v14.4h - smlal v24.4s, v6.4h, v15.4h + smlal v24.4s, v5.4h, v18.4h + smlal v24.4s, v6.4h, v19.4h st1 {v30.s}[0],[x1],#4 //stores the loaded value sqrshrun v28.8b, v28.8h,#6 //rounding shift @@ -348,7 +348,7 @@ epilog: end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s index b6d0eb2..5aaabe6 100644 --- a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s +++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s @@ -104,7 +104,7 @@ ihevc_inter_pred_chroma_vert_w16inp_w16out_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! mov x15,x4 // pi1_coeff @@ -120,10 +120,10 @@ ihevc_inter_pred_chroma_vert_w16inp_w16out_av8: sxtl v0.8h, v0.8b //long the value tst x6,#3 //checks wd == 2 - dup v12.4h, v0.4h[0] //coeff_0 - dup v13.4h, v0.4h[1] //coeff_1 - dup v14.4h, v0.4h[2] //coeff_2 - dup v15.4h, v0.4h[3] //coeff_3 + dup v16.4h, v0.4h[0] //coeff_0 + dup v17.4h, v0.4h[1] //coeff_1 + dup v18.4h, v0.4h[2] //coeff_2 + dup v19.4h, v0.4h[3] //coeff_3 bgt core_loop_ht_2 //jumps to loop handles wd 2 @@ -141,22 +141,22 @@ core_loop_ht_2: inner_loop_ht_2: add x0,x4,x2 //increments pi2_src ld1 {v0.4h},[x4],#8 //loads pu1_src - smull v0.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0) + smull v0.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0) subs x12,x12,#8 //2wd + 8 ld1 {v2.4h},[x0],x2 //loads pi2_src - smull v8.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v7.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) ld1 {v3.4h},[x0],x2 //loads pi2_src - smlal v0.4s, v2.4h, v13.4h + smlal v0.4s, v2.4h, v17.4h ld1 {v6.4h},[x0],x2 - smlal v8.4s, v3.4h, v13.4h + smlal v7.4s, v3.4h, v17.4h ld1 {v2.4h},[x0] add x7,x1,x3 //pu1_dst + dst_strd - smlal v0.4s, v3.4h, v14.4h - smlal v8.4s, v6.4h, v14.4h - smlal v0.4s, v6.4h, v15.4h - smlal v8.4s, v2.4h, v15.4h + smlal v0.4s, v3.4h, v18.4h + smlal v7.4s, v6.4h, v18.4h + smlal v0.4s, v6.4h, v19.4h + smlal v7.4s, v2.4h, v19.4h sqshrn v0.4h, v0.4s,#6 //right shift - sqshrn v30.4h, v8.4s,#6 //right shift + sqshrn v30.4h, v7.4s,#6 //right shift st1 {v0.2s},[x1],#8 //stores the loaded value st1 {v30.2s},[x7] //stores the loaded value bgt inner_loop_ht_2 //inner loop -again @@ -188,44 +188,44 @@ prolog: ld1 {v1.4h},[x0],x2 //loads pi2_src subs x11,x11,#4 ld1 {v2.4h},[x0],x2 //loads pi2_src - smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0) + smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0) ld1 {v3.4h},[x0],x2 - smlal v30.4s, v1.4h, v13.4h - smlal v30.4s, v2.4h, v14.4h + smlal v30.4s, v1.4h, v17.4h + smlal v30.4s, v2.4h, v18.4h add x9,x1,x3 //pu1_dst + dst_strd - smlal v30.4s, v3.4h, v15.4h + smlal v30.4s, v3.4h, v19.4h ld1 {v4.4h},[x0],x2 - smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) add x20,x4,x8 csel x4, x20, x4,le lsl x20,x6,#1 csel x11, x20, x11,le - smlal v28.4s, v2.4h, v13.4h - smlal v28.4s, v3.4h, v14.4h + smlal v28.4s, v2.4h, v17.4h + smlal v28.4s, v3.4h, v18.4h ld1 {v5.4h},[x0],x2 - smlal v28.4s, v4.4h, v15.4h + smlal v28.4s, v4.4h, v19.4h sqshrn v30.4h, v30.4s,#6 //right shift ld1 {v6.4h},[x0],x2 - smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) - smlal v26.4s, v3.4h, v13.4h - smlal v26.4s, v4.4h, 
v14.4h + smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) + smlal v26.4s, v3.4h, v17.4h + smlal v26.4s, v4.4h, v18.4h add x0,x4,x2 ld1 {v0.4h},[x4],#8 //loads pu1_src - smlal v26.4s, v5.4h, v15.4h + smlal v26.4s, v5.4h, v19.4h sqshrn v28.4h, v28.4s,#6 //right shift ld1 {v1.4h},[x0],x2 //loads pi2_src - smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) st1 {v30.2s},[x1],#8 //stores the loaded value - smlal v24.4s, v4.4h, v13.4h + smlal v24.4s, v4.4h, v17.4h ld1 {v2.4h},[x0],x2 //loads pi2_src - smlal v24.4s, v5.4h, v14.4h + smlal v24.4s, v5.4h, v18.4h ld1 {v3.4h},[x0],x2 - smlal v24.4s, v6.4h, v15.4h + smlal v24.4s, v6.4h, v19.4h add x20,x1,x14,lsl #1 csel x1, x20, x1,le @@ -235,20 +235,20 @@ prolog: beq epilog //jumps to epilog kernel_4: - smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0) + smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0) subs x11,x11,#4 - smlal v30.4s, v1.4h, v13.4h + smlal v30.4s, v1.4h, v17.4h st1 {v28.2s},[x9],x3 //stores the loaded value - smlal v30.4s, v2.4h, v14.4h - smlal v30.4s, v3.4h, v15.4h + smlal v30.4s, v2.4h, v18.4h + smlal v30.4s, v3.4h, v19.4h sqshrn v24.4h, v24.4s,#6 //right shift ld1 {v4.4h},[x0],x2 - smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) - smlal v28.4s, v2.4h, v13.4h - smlal v28.4s, v3.4h, v14.4h - smlal v28.4s, v4.4h, v15.4h + smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) + smlal v28.4s, v2.4h, v17.4h + smlal v28.4s, v3.4h, v18.4h + smlal v28.4s, v4.4h, v19.4h st1 {v26.2s},[x9],x3 //stores the loaded value add x20,x4,x8 csel x4, x20, x4,le @@ -258,27 +258,27 @@ kernel_4: sqshrn v30.4h, v30.4s,#6 //right shift ld1 {v5.4h},[x0],x2 - smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) ld1 {v6.4h},[x0],x2 - smlal v26.4s, v3.4h, v13.4h + smlal v26.4s, v3.4h, v17.4h st1 {v24.2s},[x9] //stores the loaded value add x0,x4,x2 - smlal v26.4s, v4.4h, v14.4h + smlal v26.4s, v4.4h, v18.4h ld1 {v0.4h},[x4],#8 //loads pu1_src - smlal v26.4s, v5.4h, v15.4h + smlal v26.4s, v5.4h, v19.4h sqshrn v28.4h, v28.4s,#6 //right shift ld1 {v1.4h},[x0],x2 //loads pi2_src - smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) ld1 {v2.4h},[x0],x2 //loads pi2_src - smlal v24.4s, v4.4h, v13.4h + smlal v24.4s, v4.4h, v17.4h add x9,x1,x3 //pu1_dst + dst_strd ld1 {v3.4h},[x0],x2 - smlal v24.4s, v5.4h, v14.4h + smlal v24.4s, v5.4h, v18.4h st1 {v30.2s},[x1],#8 //stores the loaded value - smlal v24.4s, v6.4h, v15.4h + smlal v24.4s, v6.4h, v19.4h sqshrn v26.4h, v26.4s,#6 //right shift add x20,x1,x14,lsl #1 @@ -289,38 +289,38 @@ kernel_4: bgt kernel_4 //jumps to kernel_4 epilog: - smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0) + smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0) st1 {v28.2s},[x9],x3 //stores the loaded value - smlal v30.4s, v1.4h, v13.4h - smlal v30.4s, v2.4h, v14.4h - smlal v30.4s, v3.4h, v15.4h + smlal v30.4s, v1.4h, v17.4h + smlal v30.4s, v2.4h, v18.4h + smlal v30.4s, v3.4h, v19.4h sqshrn v24.4h, v24.4s,#6 //right shift - smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) ld1 {v4.4h},[x0],x2 - smlal v28.4s, v2.4h, v13.4h + smlal v28.4s, v2.4h, v17.4h st1 {v26.2s},[x9],x3 //stores the loaded value - smlal v28.4s, v3.4h, v14.4h - smlal v28.4s, v4.4h, v15.4h + smlal v28.4s, v3.4h, v18.4h + smlal v28.4s, v4.4h, v19.4h sqshrn 
v30.4h, v30.4s,#6 //right shift - smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) ld1 {v5.4h},[x0],x2 - smlal v26.4s, v3.4h, v13.4h - smlal v26.4s, v4.4h, v14.4h - smlal v26.4s, v5.4h, v15.4h + smlal v26.4s, v3.4h, v17.4h + smlal v26.4s, v4.4h, v18.4h + smlal v26.4s, v5.4h, v19.4h sqshrn v28.4h, v28.4s,#6 //right shift st1 {v24.2s},[x9] //stores the loaded value - smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) - smlal v24.4s, v4.4h, v13.4h + smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) + smlal v24.4s, v4.4h, v17.4h add x9,x1,x3 //pu1_dst + dst_strd ld1 {v6.4h},[x0],x2 - smlal v24.4s, v5.4h, v14.4h - smlal v24.4s, v6.4h, v15.4h + smlal v24.4s, v5.4h, v18.4h + smlal v24.4s, v6.4h, v19.4h st1 {v30.2s},[x1],#8 //stores the loaded value sqshrn v26.4h, v26.4s,#6 //right shift @@ -335,7 +335,7 @@ epilog: end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s index 9f5687f..ec946eb 100644 --- a/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s +++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s @@ -105,7 +105,7 @@ ihevc_inter_pred_chroma_vert_w16out_av8: // stmfd sp!,{x4-x12,x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! mov x15,x4 // pi1_coeff @@ -145,20 +145,20 @@ ihevc_inter_pred_chroma_vert_w16out_av8: inner_loop_ht_2: //called when wd is multiple of 4 and ht is 4,2 add x6,x0,x2 //pu1_src +src_strd - ld1 {v9.8b},[x6],x2 //loads pu1_src + ld1 {v17.8b},[x6],x2 //loads pu1_src subs x5,x5,#8 //2wd - 8 ld1 {v5.8b},[x0],#8 //loads src - umull v6.8h, v9.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1) + umull v6.8h, v17.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1) ld1 {v4.8b},[x6],x2 //loads incremented src umlsl v6.8h, v5.8b, v0.8b //vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0) - ld1 {v8.8b},[x6],x2 //loads incremented src + ld1 {v16.8b},[x6],x2 //loads incremented src umlal v6.8h, v4.8b, v2.8b //vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2) umull v4.8h, v4.8b, v1.8b - ld1 {v10.8b},[x6] //loads the incremented src - umlsl v6.8h, v8.8b, v3.8b - umlsl v4.8h, v9.8b, v0.8b - umlal v4.8h, v8.8b, v2.8b - umlsl v4.8h, v10.8b, v3.8b + ld1 {v18.8b},[x6] //loads the incremented src + umlsl v6.8h, v16.8b, v3.8b + umlsl v4.8h, v17.8b, v0.8b + umlal v4.8h, v16.8b, v2.8b + umlsl v4.8h, v18.8b, v3.8b add x6,x1,x3 //pu1_dst + dst_strd st1 { v6.8h},[x1],#16 //stores the loaded value @@ -241,7 +241,7 @@ prolog: add x7,x1,x3 //pu1_dst umlal v30.8h, v6.8b, v2.8b umlsl v30.8h, v7.8b, v3.8b - ld1 {v8.8b},[x6],x2 //load and increment + ld1 {v16.8b},[x6],x2 //load and increment umull v28.8h, v6.8b, v1.8b //mul_res 2 add x20,x0,x9 //pu1_dst += 4*dst_strd - 2*wd @@ -250,28 +250,28 @@ prolog: bic x20,x10,#7 //x5 ->wd csel x5, x20, x5,le umlal v28.8h, v7.8b, v2.8b - ld1 {v9.8b},[x6],x2 - umlsl v28.8h, v8.8b, v3.8b + ld1 {v17.8b},[x6],x2 + umlsl v28.8h, v16.8b, v3.8b - ld1 {v10.8b},[x6],x2 + ld1 {v18.8b},[x6],x2 umull v26.8h, v7.8b, v1.8b add x6,x0,x2 //pu1_src + src_strd umlsl v26.8h, v6.8b, v0.8b st1 { v30.16b},[x1],#16 //stores the loaded value - umlal v26.8h, v8.8b, v2.8b + umlal v26.8h, v16.8b, v2.8b ld1 {v4.8b},[x0],#8 //loads the source - umlsl v26.8h, v9.8b, v3.8b + umlsl v26.8h, v17.8b, v3.8b add x20,x1,x8 //pu1_src += 4*src_strd - 2*wd csel 
x1, x20, x1,le - umull v24.8h, v8.8b, v1.8b + umull v24.8h, v16.8b, v1.8b ld1 {v5.8b},[x6],x2 //loads pu1_src umlsl v24.8h, v7.8b, v0.8b subs x12,x12,#4 ld1 {v6.8b},[x6],x2 //load and increment - umlal v24.8h, v9.8b, v2.8b + umlal v24.8h, v17.8b, v2.8b ld1 {v7.8b},[x6],x2 //load and increment - umlsl v24.8h, v10.8b, v3.8b + umlsl v24.8h, v18.8b, v3.8b sub x20,x2,x2,lsl #3 neg x11, x20 add x14,x2,x2,lsl #1 @@ -296,7 +296,7 @@ kernel_8: umlsl v30.8h, v7.8b, v3.8b st1 { v26.16b},[x7],x3 //stores the loaded value - ld1 {v8.8b},[x6],x2 //load and increment + ld1 {v16.8b},[x6],x2 //load and increment umull v28.8h, v6.8b, v1.8b //mul_res 2 bic x20,x10,#7 //x5 ->wd @@ -305,10 +305,10 @@ kernel_8: st1 { v24.16b},[x7],x3 //stores the loaded value umlal v28.8h, v7.8b, v2.8b - ld1 {v9.8b},[x6],x2 + ld1 {v17.8b},[x6],x2 - umlsl v28.8h, v8.8b, v3.8b - ld1 {v10.8b},[x6],x2 + umlsl v28.8h, v16.8b, v3.8b + ld1 {v18.8b},[x6],x2 add x7,x1,x3 //pu1_dst umull v26.8h, v7.8b, v1.8b add x6,x0,x2 //pu1_src + src_strd @@ -319,13 +319,13 @@ kernel_8: ld1 {v4.8b},[x0],#8 //loads the source add x11,x11,x2 - umlal v26.8h, v8.8b, v2.8b + umlal v26.8h, v16.8b, v2.8b st1 { v30.16b},[x1],#16 //stores the loaded value - umlsl v26.8h, v9.8b, v3.8b + umlsl v26.8h, v17.8b, v3.8b ld1 {v5.8b},[x6],x2 //loads pu1_src - umull v24.8h, v8.8b, v1.8b + umull v24.8h, v16.8b, v1.8b ld1 {v6.8b},[x6],x2 //load and increment add x20,x1,x8 //pu1_src += 4*src_strd - 2*wd csel x1, x20, x1,le @@ -341,10 +341,10 @@ kernel_8: subs x12,x12,#4 - umlal v24.8h, v9.8b, v2.8b + umlal v24.8h, v17.8b, v2.8b ld1 {v7.8b},[x6],x2 //load and increment - umlsl v24.8h, v10.8b, v3.8b + umlsl v24.8h, v18.8b, v3.8b st1 { v28.16b},[x7],x3 //stores the loaded value bgt kernel_8 //jumps to kernel_8 @@ -357,35 +357,35 @@ epilog: umlsl v30.8h, v7.8b, v3.8b st1 { v26.16b},[x7],x3 //stores the loaded value - ld1 {v8.8b},[x6],x2 //load and increment + ld1 {v16.8b},[x6],x2 //load and increment umull v28.8h, v6.8b, v1.8b //mul_res 2 umlsl v28.8h, v5.8b, v0.8b umlal v28.8h, v7.8b, v2.8b - umlsl v28.8h, v8.8b, v3.8b + umlsl v28.8h, v16.8b, v3.8b st1 { v24.16b},[x7],x3 //stores the loaded value - ld1 {v9.8b},[x6],x2 + ld1 {v17.8b},[x6],x2 umull v26.8h, v7.8b, v1.8b add x7,x1,x3 //pu1_dst umlsl v26.8h, v6.8b, v0.8b st1 { v30.16b},[x1],#16 //stores the loaded value - umlal v26.8h, v8.8b, v2.8b - ld1 {v10.8b},[x6],x2 - umlsl v26.8h, v9.8b, v3.8b + umlal v26.8h, v16.8b, v2.8b + ld1 {v18.8b},[x6],x2 + umlsl v26.8h, v17.8b, v3.8b - umull v24.8h, v8.8b, v1.8b + umull v24.8h, v16.8b, v1.8b st1 { v28.16b},[x7],x3 //stores the loaded value umlsl v24.8h, v7.8b, v0.8b - umlal v24.8h, v9.8b, v2.8b + umlal v24.8h, v17.8b, v2.8b st1 { v26.16b},[x7],x3 //stores the loaded value - umlsl v24.8h, v10.8b, v3.8b + umlsl v24.8h, v18.8b, v3.8b st1 { v24.16b},[x7],x3 //stores the loaded value end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_inter_pred_filters_luma_vert.s b/common/arm64/ihevc_inter_pred_filters_luma_vert.s index 48dc30f..bd8b3c4 100644 --- a/common/arm64/ihevc_inter_pred_filters_luma_vert.s +++ b/common/arm64/ihevc_inter_pred_filters_luma_vert.s @@ -115,7 +115,7 @@ ihevc_inter_pred_luma_vert_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! 
mov x15,x4 // pi1_coeff @@ -161,87 +161,87 @@ prolog: ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)// subs x4,x4,#8 ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// + umull v19.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// + umlsl v19.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// + umlsl v19.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// + umlal v19.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// + umlal v19.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// + umlsl v19.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// + umlal v19.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// + umlsl v19.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// + umull v20.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// add x20,x0,x8 csel x0, x20, x0,le - umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// + umlsl v20.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// bic x20,x5,#7 //x5 ->wd csel x4, x20, x4,le - umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// + umlsl v20.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// prfm PLDL1KEEP,[x3] - umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// + umlal v20.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// add x20,x3, x2 prfm PLDL1KEEP,[x20] - umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// + umlal v20.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// add x20,x3, x2, lsl #1 prfm PLDL1KEEP,[x20] - umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// + umlsl v20.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// add x3, x3, x2 - umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// + umlal v20.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// add x20,x3, x2, lsl #1 prfm PLDL1KEEP,[x20] - umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// + umlsl v20.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, 
src_tmp1, coeffabs_7)// add x3,x0,x2 //pu1_src_tmp += src_strd// - sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + sqrshrun v19.8b, v19.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// ld1 {v1.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umull v12.8h, v3.8b, v23.8b + umull v21.8h, v3.8b, v23.8b ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlsl v12.8h, v2.8b, v22.8b + umlsl v21.8h, v2.8b, v22.8b ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlsl v12.8h, v4.8b, v24.8b - umlal v12.8h, v5.8b, v25.8b - umlal v12.8h, v6.8b, v26.8b - umlsl v12.8h, v7.8b, v27.8b - umlal v12.8h, v16.8b, v28.8b - umlsl v12.8h, v17.8b, v29.8b + umlsl v21.8h, v4.8b, v24.8b + umlal v21.8h, v5.8b, v25.8b + umlal v21.8h, v6.8b, v26.8b + umlsl v21.8h, v7.8b, v27.8b + umlal v21.8h, v16.8b, v28.8b + umlsl v21.8h, v17.8b, v29.8b add x14,x1,x6 - st1 {v8.8b},[x1],#8 //vst1_u8(pu1_dst,sto_res)// - sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + st1 {v19.8b},[x1],#8 //vst1_u8(pu1_dst,sto_res)// + sqrshrun v20.8b, v20.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// add x20,x1,x9 csel x1, x20, x1,le - umull v14.8h, v4.8b, v23.8b + umull v30.8h, v4.8b, v23.8b subs x7,x7,#4 - umlsl v14.8h, v3.8b, v22.8b - umlsl v14.8h, v5.8b, v24.8b - umlal v14.8h, v6.8b, v25.8b + umlsl v30.8h, v3.8b, v22.8b + umlsl v30.8h, v5.8b, v24.8b + umlal v30.8h, v6.8b, v25.8b ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - umlal v14.8h, v7.8b, v26.8b + umlal v30.8h, v7.8b, v26.8b ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlsl v14.8h, v16.8b, v27.8b + umlsl v30.8h, v16.8b, v27.8b ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlal v14.8h, v17.8b, v28.8b + umlal v30.8h, v17.8b, v28.8b ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlsl v14.8h, v18.8b, v29.8b + umlsl v30.8h, v18.8b, v29.8b ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - st1 {v10.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// - sqrshrun v12.8b, v12.8h,#6 + st1 {v20.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// + sqrshrun v21.8b, v21.8h,#6 blt epilog_end //jumps to epilog_end @@ -250,111 +250,111 @@ prolog: kernel_8: subs x4,x4,#8 - umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// + umull v19.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// add x20,x0,x8 csel x0, x20, x0,le - umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// + umlsl v19.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// bic x20,x5,#7 //x5 ->wd csel x4, x20, x4,le - umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// + umlsl v19.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// + umlal v19.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// + umlal v19.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// + umlsl v19.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// - umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// + umlal v19.8h, v6.8b, v28.8b 
//mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// - umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// - st1 {v12.8b},[x14],x6 + umlsl v19.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// + st1 {v21.8b},[x14],x6 // and x11, x0, #31 - sqrshrun v14.8b, v14.8h,#6 + sqrshrun v30.8b, v30.8h,#6 add x3,x0,x2 //pu1_src_tmp += src_strd// - umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// + umull v20.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// + umlsl v20.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// - umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// + umlsl v20.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// ld1 {v1.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// + umlal v20.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// - st1 {v14.8b},[x14],x6 - umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// + st1 {v30.8b},[x14],x6 + umlal v20.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// add x14,x1,#0 - umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// + umlsl v20.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// add x1, x1, #8 - umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// + umlal v20.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// - umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// + umlsl v20.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// add x20,x1,x9 csel x1, x20, x1,le - sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + sqrshrun v19.8b, v19.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// // cmp x11, x10 - umull v12.8h, v3.8b, v23.8b + umull v21.8h, v3.8b, v23.8b add x10, x3, x2, lsl #3 // 10*strd - 8+2 - umlsl v12.8h, v2.8b, v22.8b + umlsl v21.8h, v2.8b, v22.8b add x10, x10, x2 // 11*strd - umlsl v12.8h, v4.8b, v24.8b + umlsl v21.8h, v4.8b, v24.8b ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlal v12.8h, v5.8b, v25.8b + umlal v21.8h, v5.8b, v25.8b - umlal v12.8h, v6.8b, v26.8b - st1 {v8.8b},[x14],x6 //vst1_u8(pu1_dst,sto_res)// + umlal v21.8h, v6.8b, v26.8b + st1 {v19.8b},[x14],x6 //vst1_u8(pu1_dst,sto_res)// prfm PLDL1KEEP,[x10] //11+ 0 - umlsl v12.8h, v7.8b, v27.8b + umlsl v21.8h, v7.8b, v27.8b add x20,x10, x2 prfm PLDL1KEEP,[x20] //11+ 1*strd - umlal v12.8h, v16.8b, v28.8b + umlal v21.8h, v16.8b, v28.8b add x20,x10, x2, lsl #1 prfm PLDL1KEEP,[x20] //11+ 2*strd - umlsl v12.8h, v17.8b, v29.8b + umlsl v21.8h, v17.8b, v29.8b add x10, x10, x2 //12*strd - sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + sqrshrun v20.8b, v20.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// add x20,x10, x2, lsl #1 prfm PLDL1KEEP,[x20] //11+ 3*strd - umull v14.8h, v4.8b, v23.8b + umull v30.8h, v4.8b, v23.8b // mov x10, x11 - umlsl v14.8h, v3.8b, v22.8b + umlsl v30.8h, v3.8b, v22.8b subs x7,x7,#4 - umlsl v14.8h, v5.8b, v24.8b + umlsl v30.8h, v5.8b, v24.8b - umlal v14.8h, v6.8b, v25.8b + umlal v30.8h, v6.8b, v25.8b ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - umlal v14.8h, v7.8b, v26.8b + 
umlal v30.8h, v7.8b, v26.8b ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlsl v14.8h, v16.8b, v27.8b + umlsl v30.8h, v16.8b, v27.8b ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlal v14.8h, v17.8b, v28.8b + umlal v30.8h, v17.8b, v28.8b ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlsl v14.8h, v18.8b, v29.8b + umlsl v30.8h, v18.8b, v29.8b ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - sqrshrun v12.8b, v12.8h,#6 - st1 {v10.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// + sqrshrun v21.8b, v21.8h,#6 + st1 {v20.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// @@ -362,62 +362,62 @@ kernel_8: epilog: - umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// - umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// - umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// - umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// - umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// - umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// - umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// - umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// - st1 {v12.8b},[x14],x6 + umull v19.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// + umlsl v19.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// + umlsl v19.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// + umlal v19.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// + umlal v19.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// + umlsl v19.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// + umlal v19.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// + umlsl v19.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// + st1 {v21.8b},[x14],x6 - sqrshrun v14.8b, v14.8h,#6 + sqrshrun v30.8b, v30.8h,#6 ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// - umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// - umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// - umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// - umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// - umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// - umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// - umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// - st1 {v14.8b},[x14],x6 - - sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + umull v20.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// + umlsl v20.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// + umlsl v20.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// + umlal v20.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// + umlal v20.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// + umlsl v20.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// + umlal v20.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// + umlsl v20.8h, v16.8b, v29.8b 
//mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// + st1 {v30.8b},[x14],x6 + + sqrshrun v19.8b, v19.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umull v12.8h, v3.8b, v23.8b - umlsl v12.8h, v2.8b, v22.8b - umlsl v12.8h, v4.8b, v24.8b - umlal v12.8h, v5.8b, v25.8b - umlal v12.8h, v6.8b, v26.8b - umlsl v12.8h, v7.8b, v27.8b - umlal v12.8h, v16.8b, v28.8b - umlsl v12.8h, v17.8b, v29.8b + umull v21.8h, v3.8b, v23.8b + umlsl v21.8h, v2.8b, v22.8b + umlsl v21.8h, v4.8b, v24.8b + umlal v21.8h, v5.8b, v25.8b + umlal v21.8h, v6.8b, v26.8b + umlsl v21.8h, v7.8b, v27.8b + umlal v21.8h, v16.8b, v28.8b + umlsl v21.8h, v17.8b, v29.8b add x14,x1,x6 - st1 {v8.8b},[x1],#8 //vst1_u8(pu1_dst,sto_res)// - sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + st1 {v19.8b},[x1],#8 //vst1_u8(pu1_dst,sto_res)// + sqrshrun v20.8b, v20.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umull v14.8h, v4.8b, v23.8b - umlsl v14.8h, v3.8b, v22.8b - umlsl v14.8h, v5.8b, v24.8b - umlal v14.8h, v6.8b, v25.8b - umlal v14.8h, v7.8b, v26.8b - umlsl v14.8h, v16.8b, v27.8b - umlal v14.8h, v17.8b, v28.8b - umlsl v14.8h, v18.8b, v29.8b - - st1 {v10.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// - sqrshrun v12.8b, v12.8h,#6 + umull v30.8h, v4.8b, v23.8b + umlsl v30.8h, v3.8b, v22.8b + umlsl v30.8h, v5.8b, v24.8b + umlal v30.8h, v6.8b, v25.8b + umlal v30.8h, v7.8b, v26.8b + umlsl v30.8h, v16.8b, v27.8b + umlal v30.8h, v17.8b, v28.8b + umlsl v30.8h, v18.8b, v29.8b + + st1 {v20.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// + sqrshrun v21.8b, v21.8h,#6 epilog_end: - st1 {v12.8b},[x14],x6 - sqrshrun v14.8b, v14.8h,#6 + st1 {v21.8b},[x14],x6 + sqrshrun v30.8b, v30.8h,#6 - st1 {v14.8b},[x14],x6 + st1 {v30.8b},[x14],x6 end_loops: @@ -427,7 +427,7 @@ end_loops: // ldmeqfd sp!,{x4-x12,x15} //reload the registers from sp bne lbl409 ldp x19, x20,[sp], #16 - pop_v_regs + ret lbl409: mov x5, #4 @@ -465,34 +465,34 @@ inner_loop_wd_4: ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)// umlsl v0.8h, v6.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)// - umull v8.8h, v7.8b, v23.8b + umull v19.8h, v7.8b, v23.8b dup v4.2s, v7.2s[1] //src_tmp1 = vdup_lane_u32(src_tmp4, 1)// umull v2.8h, v7.8b, v25.8b //mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)// ld1 {v4.s}[1],[x3],x2 //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)// - umlsl v8.8h, v6.8b, v22.8b + umlsl v19.8h, v6.8b, v22.8b umlal v0.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)// dup v5.2s, v4.2s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)// - umlsl v8.8h, v4.8b, v24.8b + umlsl v19.8h, v4.8b, v24.8b ld1 {v5.s}[1],[x3],x2 //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)// umlsl v2.8h, v5.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)// dup v6.2s, v5.2s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)// - umlal v8.8h, v5.8b, v25.8b + umlal v19.8h, v5.8b, v25.8b ld1 {v6.s}[1],[x3],x2 //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)// umlal v0.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)// dup v7.2s, v6.2s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)// - umlal v8.8h, v6.8b, v26.8b + umlal v19.8h, v6.8b, v26.8b ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)// umlsl v2.8h, 
v7.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)// dup v4.2s, v7.2s[1] add v0.8h, v0.8h , v2.8h //mul_res1 = vaddq_u16(mul_res1, mul_res2)// - umlsl v8.8h, v7.8b, v27.8b + umlsl v19.8h, v7.8b, v27.8b ld1 {v4.s}[1],[x3],x2 - umlal v8.8h, v4.8b, v28.8b + umlal v19.8h, v4.8b, v28.8b dup v5.2s, v4.2s[1] sqrshrun v0.8b, v0.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// @@ -500,13 +500,13 @@ inner_loop_wd_4: add x3,x1,x6 st1 {v0.s}[0],[x1] //vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)// - umlsl v8.8h, v5.8b, v29.8b + umlsl v19.8h, v5.8b, v29.8b st1 {v0.s}[1],[x3],x6 //vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)// - sqrshrun v8.8b, v8.8h,#6 + sqrshrun v19.8b, v19.8h,#6 - st1 {v8.s}[0],[x3],x6 + st1 {v19.s}[0],[x3],x6 add x1,x1,#4 - st1 {v8.s}[1],[x3] + st1 {v19.s}[1],[x3] bgt inner_loop_wd_4 end_inner_loop_wd_4: @@ -517,6 +517,6 @@ end_inner_loop_wd_4: // ldmfd sp!, {x4-x12, x15} //reload the registers from sp ldp x19, x20,[sp], #16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s index 64a00b2..cd8addf 100644 --- a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s +++ b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s @@ -106,7 +106,7 @@ ihevc_inter_pred_luma_vert_w16inp_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! mov x15,x4 // pi1_coeff @@ -152,70 +152,70 @@ prolog: ld1 {v0.4h},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)// subs x4,x4,#4 ld1 {v2.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - smull v8.4s, v1.4h, v23.4h //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// + smull v19.4s, v1.4h, v23.4h //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// ld1 {v3.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - smlal v8.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)// + smlal v19.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)// ld1 {v4.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - smlal v8.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)// + smlal v19.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)// ld1 {v5.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - smlal v8.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// + smlal v19.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// ld1 {v6.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - smlal v8.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// + smlal v19.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// ld1 {v7.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - smlal v8.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)// - smlal v8.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// - smlal v8.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)// + smlal v19.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)// + smlal v19.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// + smlal v19.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)// ld1 {v16.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - smull v10.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// + smull v20.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// add x20,x0,x8,lsl #0 csel x0, 
x20, x0,le - smlal v10.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)// + smlal v20.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)// csel x4, x5, x4,le //x5 ->wd - smlal v10.4s, v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)// + smlal v20.4s, v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)// ld1 {v17.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - smlal v10.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// + smlal v20.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// ld1 {v18.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - smlal v10.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// + smlal v20.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// add x3,x0,x2 //pu1_src_tmp += src_strd// - smlal v10.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)// - smlal v10.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// - smlal v10.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)// - sqshrn v8.4h, v8.4s,#6 + smlal v20.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)// + smlal v20.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// + smlal v20.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)// + sqshrn v19.4h, v19.4s,#6 ld1 {v1.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - smull v12.4s, v3.4h, v23.4h + smull v21.4s, v3.4h, v23.4h ld1 {v0.4h},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)// - smlal v12.4s, v2.4h, v22.4h + smlal v21.4s, v2.4h, v22.4h ld1 {v2.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - smlal v12.4s, v4.4h, v24.4h - smlal v12.4s, v5.4h, v25.4h - smlal v12.4s, v6.4h, v26.4h - smlal v12.4s, v7.4h, v27.4h - smlal v12.4s, v16.4h, v28.4h - smlal v12.4s, v17.4h, v29.4h + smlal v21.4s, v4.4h, v24.4h + smlal v21.4s, v5.4h, v25.4h + smlal v21.4s, v6.4h, v26.4h + smlal v21.4s, v7.4h, v27.4h + smlal v21.4s, v16.4h, v28.4h + smlal v21.4s, v17.4h, v29.4h add x14,x1,x6 - sqshrn v10.4h, v10.4s,#6 - sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + sqshrn v20.4h, v20.4s,#6 + sqrshrun v19.8b, v19.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// - smull v14.4s, v4.4h, v23.4h - smlal v14.4s, v3.4h, v22.4h - smlal v14.4s, v5.4h, v24.4h - smlal v14.4s, v6.4h, v25.4h + smull v30.4s, v4.4h, v23.4h + smlal v30.4s, v3.4h, v22.4h + smlal v30.4s, v5.4h, v24.4h + smlal v30.4s, v6.4h, v25.4h ld1 {v3.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - smlal v14.4s, v7.4h, v26.4h + smlal v30.4s, v7.4h, v26.4h ld1 {v4.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - smlal v14.4s, v16.4h, v27.4h + smlal v30.4s, v16.4h, v27.4h ld1 {v5.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - smlal v14.4s, v17.4h, v28.4h + smlal v30.4s, v17.4h, v28.4h ld1 {v6.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - smlal v14.4s, v18.4h, v29.4h + smlal v30.4s, v18.4h, v29.4h ld1 {v7.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - st1 {v8.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)// - sqshrn v12.4h, v12.4s,#6 - sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + st1 {v19.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)// + sqshrn v21.4h, v21.4s,#6 + sqrshrun v20.8b, v20.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// add x20,x1,x9 csel x1, x20, x1,le @@ -226,164 +226,164 @@ prolog: kernel_8: - smull v8.4s, v1.4h, v23.4h //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// + smull v19.4s, v1.4h, v23.4h 
//mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// subs x4,x4,#4 - smlal v8.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)// + smlal v19.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)// add x20,x0,x8,lsl #0 csel x0, x20, x0,le - smlal v8.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)// - smlal v8.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// - smlal v8.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// - smlal v8.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)// - smlal v8.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// - smlal v8.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)// - st1 {v10.s}[0],[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// - - sqshrn v14.4h, v14.4s,#6 - sqrshrun v12.8b, v12.8h,#6 + smlal v19.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)// + smlal v19.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// + smlal v19.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// + smlal v19.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)// + smlal v19.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// + smlal v19.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)// + st1 {v20.s}[0],[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// + + sqshrn v30.4h, v30.4s,#6 + sqrshrun v21.8b, v21.8h,#6 ld1 {v16.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - smull v10.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// - smlal v10.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)// - smlal v10.4s, v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)// - smlal v10.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// - smlal v10.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// - smlal v10.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)// - st1 {v12.s}[0],[x14],x6 + smull v20.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// + smlal v20.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)// + smlal v20.4s, v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)// + smlal v20.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// + smlal v20.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// + smlal v20.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)// + st1 {v21.s}[0],[x14],x6 - smlal v10.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// + smlal v20.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// ld1 {v17.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - smlal v10.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)// + smlal v20.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)// - sqshrn v8.4h, v8.4s,#6 - sqrshrun v14.8b, v14.8h,#6 + sqshrn v19.4h, v19.4s,#6 + sqrshrun v30.8b, v30.8h,#6 - smull v12.4s, v3.4h, v23.4h + smull v21.4s, v3.4h, v23.4h csel x4, x5, x4,le //x5 ->wd - smlal v12.4s, v2.4h, v22.4h + smlal v21.4s, v2.4h, v22.4h ld1 {v18.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - smlal v12.4s, v4.4h, v24.4h + smlal v21.4s, v4.4h, v24.4h add x3,x0,x2 //pu1_src_tmp += src_strd// - smlal v12.4s, v5.4h, v25.4h + smlal v21.4s, v5.4h, v25.4h - smlal v12.4s, 
v6.4h, v26.4h - st1 {v14.s}[0],[x14],x6 + smlal v21.4s, v6.4h, v26.4h + st1 {v30.s}[0],[x14],x6 - smlal v12.4s, v7.4h, v27.4h + smlal v21.4s, v7.4h, v27.4h ld1 {v1.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - smlal v12.4s, v16.4h, v28.4h + smlal v21.4s, v16.4h, v28.4h add x14,x1,x6 - smlal v12.4s, v17.4h, v29.4h + smlal v21.4s, v17.4h, v29.4h ld1 {v0.4h},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)// - sqshrn v10.4h, v10.4s,#6 - sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + sqshrn v20.4h, v20.4s,#6 + sqrshrun v19.8b, v19.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// ld1 {v2.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - smull v14.4s, v4.4h, v23.4h - smlal v14.4s, v3.4h, v22.4h - smlal v14.4s, v5.4h, v24.4h + smull v30.4s, v4.4h, v23.4h + smlal v30.4s, v3.4h, v22.4h + smlal v30.4s, v5.4h, v24.4h ld1 {v3.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - smlal v14.4s, v6.4h, v25.4h + smlal v30.4s, v6.4h, v25.4h ld1 {v4.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - smlal v14.4s, v7.4h, v26.4h + smlal v30.4s, v7.4h, v26.4h ld1 {v5.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - smlal v14.4s, v16.4h, v27.4h + smlal v30.4s, v16.4h, v27.4h ld1 {v6.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - smlal v14.4s, v17.4h, v28.4h + smlal v30.4s, v17.4h, v28.4h ld1 {v7.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - smlal v14.4s, v18.4h, v29.4h - st1 {v8.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)// + smlal v30.4s, v18.4h, v29.4h + st1 {v19.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)// - sqshrn v12.4h, v12.4s,#6 + sqshrn v21.4h, v21.4s,#6 add x20,x1,x9 csel x1, x20, x1,le - sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + sqrshrun v20.8b, v20.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// subs x7,x7,#4 bgt kernel_8 //jumps to kernel_8 epilog: - smull v8.4s, v1.4h, v23.4h //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// - smlal v8.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)// - smlal v8.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)// - smlal v8.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// - smlal v8.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// - smlal v8.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)// - smlal v8.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// - smlal v8.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)// - st1 {v10.s}[0],[x14],x6 + smull v19.4s, v1.4h, v23.4h //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// + smlal v19.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)// + smlal v19.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)// + smlal v19.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// + smlal v19.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// + smlal v19.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)// + smlal v19.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// + smlal v19.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)// + st1 {v20.s}[0],[x14],x6 - sqshrn v14.4h, v14.4s,#6 - sqrshrun v12.8b, v12.8h,#6 + sqshrn v30.4h, v30.4s,#6 + sqrshrun v21.8b, v21.8h,#6 ld1 {v16.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - smull v10.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// - smlal v10.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)// - smlal v10.4s, 
v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)// - smlal v10.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// - smlal v10.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// - smlal v10.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)// - smlal v10.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// - smlal v10.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)// - st1 {v12.s}[0],[x14],x6 - - sqshrn v8.4h, v8.4s,#6 - sqrshrun v14.8b, v14.8h,#6 + smull v20.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// + smlal v20.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)// + smlal v20.4s, v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)// + smlal v20.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// + smlal v20.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// + smlal v20.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)// + smlal v20.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// + smlal v20.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)// + st1 {v21.s}[0],[x14],x6 + + sqshrn v19.4h, v19.4s,#6 + sqrshrun v30.8b, v30.8h,#6 ld1 {v17.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - smull v12.4s, v3.4h, v23.4h - smlal v12.4s, v2.4h, v22.4h - smlal v12.4s, v4.4h, v24.4h - smlal v12.4s, v5.4h, v25.4h - smlal v12.4s, v6.4h, v26.4h - smlal v12.4s, v7.4h, v27.4h - smlal v12.4s, v16.4h, v28.4h - smlal v12.4s, v17.4h, v29.4h - st1 {v14.s}[0],[x14],x6 - sqshrn v10.4h, v10.4s,#6 - sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + smull v21.4s, v3.4h, v23.4h + smlal v21.4s, v2.4h, v22.4h + smlal v21.4s, v4.4h, v24.4h + smlal v21.4s, v5.4h, v25.4h + smlal v21.4s, v6.4h, v26.4h + smlal v21.4s, v7.4h, v27.4h + smlal v21.4s, v16.4h, v28.4h + smlal v21.4s, v17.4h, v29.4h + st1 {v30.s}[0],[x14],x6 + sqshrn v20.4h, v20.4s,#6 + sqrshrun v19.8b, v19.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// ld1 {v18.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - smull v14.4s, v4.4h, v23.4h - smlal v14.4s, v3.4h, v22.4h - smlal v14.4s, v5.4h, v24.4h - smlal v14.4s, v6.4h, v25.4h - smlal v14.4s, v7.4h, v26.4h - smlal v14.4s, v16.4h, v27.4h - smlal v14.4s, v17.4h, v28.4h - smlal v14.4s, v18.4h, v29.4h - sqshrn v12.4h, v12.4s,#6 - sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + smull v30.4s, v4.4h, v23.4h + smlal v30.4s, v3.4h, v22.4h + smlal v30.4s, v5.4h, v24.4h + smlal v30.4s, v6.4h, v25.4h + smlal v30.4s, v7.4h, v26.4h + smlal v30.4s, v16.4h, v27.4h + smlal v30.4s, v17.4h, v28.4h + smlal v30.4s, v18.4h, v29.4h + sqshrn v21.4h, v21.4s,#6 + sqrshrun v20.8b, v20.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// add x14,x1,x6 - st1 {v8.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)// + st1 {v19.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)// epilog_end: - st1 {v10.s}[0],[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// - sqrshrun v12.8b, v12.8h,#6 + st1 {v20.s}[0],[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// + sqrshrun v21.8b, v21.8h,#6 - st1 {v12.s}[0],[x14],x6 - sqshrn v14.4h, v14.4s,#6 - sqrshrun v14.8b, v14.8h,#6 + st1 {v21.s}[0],[x14],x6 + sqshrn v30.4h, v30.4s,#6 + sqrshrun v30.8b, v30.8h,#6 - st1 {v14.s}[0],[x14],x6 + st1 {v30.s}[0],[x14],x6 end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp], #16 - pop_v_regs + ret diff --git 
a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s index da316ae..ca48db5 100644 --- a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s +++ b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s @@ -70,7 +70,7 @@ ihevc_inter_pred_luma_vert_w16out_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! mov x15,x4 // pi1_coeff @@ -118,83 +118,83 @@ prolog_16out: ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)// subs x4,x4,#8 ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// + umull v19.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// + umlsl v19.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// + umlsl v19.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// + umlal v19.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// + umlal v19.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// + umlsl v19.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// + umlal v19.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// + umlsl v19.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// add x20,x0,x8 csel x0, x20, x0,le - umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// + umull v20.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// bic x20,x5,#7 //x5 ->wd csel x4, x20, x4,le - umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// + umlsl v20.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// + umlsl v20.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// add x20,x20,x3 prfm PLDL1KEEP,[x20] - umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// + umlal v20.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// add x20,x3, x2 prfm PLDL1KEEP,[x20] - umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// + umlal v20.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// add x20,x3, x2, lsl #1 prfm PLDL1KEEP,[x20] - umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// + umlsl 
v20.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// add x3, x3, x2 - umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// + umlal v20.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// add x20,x3, x2, lsl #1 prfm PLDL1KEEP,[x20] - umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// + umlsl v20.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// add x3,x0,x2 //pu1_src_tmp += src_strd// - umull v12.8h, v3.8b, v23.8b + umull v21.8h, v3.8b, v23.8b ld1 {v1.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlsl v12.8h, v2.8b, v22.8b + umlsl v21.8h, v2.8b, v22.8b ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlsl v12.8h, v4.8b, v24.8b + umlsl v21.8h, v4.8b, v24.8b ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlal v12.8h, v5.8b, v25.8b - umlal v12.8h, v6.8b, v26.8b - umlsl v12.8h, v7.8b, v27.8b - umlal v12.8h, v16.8b, v28.8b - umlsl v12.8h, v17.8b, v29.8b + umlal v21.8h, v5.8b, v25.8b + umlal v21.8h, v6.8b, v26.8b + umlsl v21.8h, v7.8b, v27.8b + umlal v21.8h, v16.8b, v28.8b + umlsl v21.8h, v17.8b, v29.8b add x14,x1,x6 - st1 {v8.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)// + st1 {v19.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)// //vqrshrun.s16 d10,q5,#6 //sto_res = vqmovun_s16(sto_res_tmp)// add x20,x1,x9,lsl #1 csel x1, x20, x1,le - umull v14.8h, v4.8b, v23.8b + umull v30.8h, v4.8b, v23.8b subs x7,x7,#4 - umlsl v14.8h, v3.8b, v22.8b - umlsl v14.8h, v5.8b, v24.8b - umlal v14.8h, v6.8b, v25.8b + umlsl v30.8h, v3.8b, v22.8b + umlsl v30.8h, v5.8b, v24.8b + umlal v30.8h, v6.8b, v25.8b ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - umlal v14.8h, v7.8b, v26.8b + umlal v30.8h, v7.8b, v26.8b ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlsl v14.8h, v16.8b, v27.8b + umlsl v30.8h, v16.8b, v27.8b ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlal v14.8h, v17.8b, v28.8b + umlal v30.8h, v17.8b, v28.8b ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlsl v14.8h, v18.8b, v29.8b + umlsl v30.8h, v18.8b, v29.8b ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - st1 {v10.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// + st1 {v20.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// //vqrshrun.s16 d12,q6,#6 @@ -204,170 +204,170 @@ prolog_16out: kernel_8_16out: subs x4,x4,#8 - umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// + umull v19.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// add x20,x0,x8 csel x0, x20, x0,le - umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// + umlsl v19.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// + umlsl v19.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// + umlal v19.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// bic x20,x5,#7 //x5 ->wd csel x4, x20, x4,le - umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// + umlal v19.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, 
coeffabs_5)// + umlsl v19.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// - st1 {v12.16b},[x14],x6 - umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// + st1 {v21.16b},[x14],x6 + umlal v19.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// add x3,x0,x2 //pu1_src_tmp += src_strd// - umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// + umlsl v19.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// // and x11, x0, #31 - umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// + umull v20.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// - st1 {v14.16b},[x14],x6 - umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// + st1 {v30.16b},[x14],x6 + umlsl v20.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// add x14,x1,x6 - umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// + umlsl v20.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// + umlal v20.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// ld1 {v1.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// + umlal v20.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// - st1 {v8.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)// - umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// + st1 {v19.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)// + umlsl v20.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// add x20,x1,x9,lsl #1 csel x1, x20, x1,le - umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// + umlal v20.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// // cmp x11, x10 - umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// + umlsl v20.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// add x10, x3, x2, lsl #3 // 10*strd - 8+2 - umull v12.8h, v3.8b, v23.8b + umull v21.8h, v3.8b, v23.8b add x10, x10, x2 // 11*strd - umlsl v12.8h, v2.8b, v22.8b + umlsl v21.8h, v2.8b, v22.8b add x20,x20,x10 prfm PLDL1KEEP,[x20] //11+ 0 - umlsl v12.8h, v4.8b, v24.8b + umlsl v21.8h, v4.8b, v24.8b add x20,x10, x2 prfm PLDL1KEEP,[x20] //11+ 1*strd - umlal v12.8h, v5.8b, v25.8b + umlal v21.8h, v5.8b, v25.8b add x20,x10, x2, lsl #1 prfm PLDL1KEEP,[x20] //11+ 2*strd - umlal v12.8h, v6.8b, v26.8b + umlal v21.8h, v6.8b, v26.8b add x10, x10, x2 //12*strd - umlsl v12.8h, v7.8b, v27.8b + umlsl v21.8h, v7.8b, v27.8b add x20,x10, x2, lsl #1 prfm PLDL1KEEP,[x20] //11+ 3*strd - umlal v12.8h, v16.8b, v28.8b + umlal v21.8h, v16.8b, v28.8b // mov x10, x11 - umlsl v12.8h, v17.8b, v29.8b + umlsl v21.8h, v17.8b, v29.8b ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umull v14.8h, v4.8b, v23.8b + umull v30.8h, v4.8b, v23.8b subs x7,x7,#4 - umlsl v14.8h, v3.8b, v22.8b + umlsl v30.8h, v3.8b, v22.8b - st1 {v10.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// - umlsl v14.8h, v5.8b, v24.8b + st1 {v20.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// + umlsl v30.8h, v5.8b, v24.8b ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - umlal v14.8h, v6.8b, v25.8b + umlal v30.8h, v6.8b, v25.8b ld1 
{v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlal v14.8h, v7.8b, v26.8b + umlal v30.8h, v7.8b, v26.8b ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlsl v14.8h, v16.8b, v27.8b + umlsl v30.8h, v16.8b, v27.8b ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlal v14.8h, v17.8b, v28.8b + umlal v30.8h, v17.8b, v28.8b ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - umlsl v14.8h, v18.8b, v29.8b + umlsl v30.8h, v18.8b, v29.8b bgt kernel_8_16out //jumps to kernel_8 epilog_16out: - umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// - umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// - umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// - umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// - umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// - umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// - umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// - umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// - st1 {v12.16b},[x14],x6 + umull v19.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// + umlsl v19.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// + umlsl v19.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// + umlal v19.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// + umlal v19.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// + umlsl v19.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// + umlal v19.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// + umlsl v19.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// + st1 {v21.16b},[x14],x6 //vqrshrun.s16 d14,q7,#6 ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// - umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// - umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// - umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// - umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// - umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// - umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// - umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// - st1 {v14.16b},[x14],x6 + umull v20.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// + umlsl v20.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// + umlsl v20.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// + umlal v20.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// + umlal v20.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// + umlsl v20.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// + umlal v20.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// + umlsl v20.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// + st1 {v30.16b},[x14],x6 //vqrshrun.s16 d8,q4,#6 //sto_res = vqmovun_s16(sto_res_tmp)// ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umull v12.8h, v3.8b, 
v23.8b - umlsl v12.8h, v2.8b, v22.8b - umlsl v12.8h, v4.8b, v24.8b - umlal v12.8h, v5.8b, v25.8b - umlal v12.8h, v6.8b, v26.8b - umlsl v12.8h, v7.8b, v27.8b - umlal v12.8h, v16.8b, v28.8b - umlsl v12.8h, v17.8b, v29.8b + umull v21.8h, v3.8b, v23.8b + umlsl v21.8h, v2.8b, v22.8b + umlsl v21.8h, v4.8b, v24.8b + umlal v21.8h, v5.8b, v25.8b + umlal v21.8h, v6.8b, v26.8b + umlsl v21.8h, v7.8b, v27.8b + umlal v21.8h, v16.8b, v28.8b + umlsl v21.8h, v17.8b, v29.8b add x14,x1,x6 - st1 {v8.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)// + st1 {v19.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)// //vqrshrun.s16 d10,q5,#6 //sto_res = vqmovun_s16(sto_res_tmp)// ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umull v14.8h, v4.8b, v23.8b - umlsl v14.8h, v3.8b, v22.8b - umlsl v14.8h, v5.8b, v24.8b - umlal v14.8h, v6.8b, v25.8b - umlal v14.8h, v7.8b, v26.8b - umlsl v14.8h, v16.8b, v27.8b - umlal v14.8h, v17.8b, v28.8b - umlsl v14.8h, v18.8b, v29.8b - - st1 {v10.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// + umull v30.8h, v4.8b, v23.8b + umlsl v30.8h, v3.8b, v22.8b + umlsl v30.8h, v5.8b, v24.8b + umlal v30.8h, v6.8b, v25.8b + umlal v30.8h, v7.8b, v26.8b + umlsl v30.8h, v16.8b, v27.8b + umlal v30.8h, v17.8b, v28.8b + umlsl v30.8h, v18.8b, v29.8b + + st1 {v20.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// //vqrshrun.s16 d12,q6,#6 epilog_end_16out: - st1 {v12.16b},[x14],x6 + st1 {v21.16b},[x14],x6 //vqrshrun.s16 d14,q7,#6 - st1 {v14.16b},[x14],x6 + st1 {v30.16b},[x14],x6 end_loops_16out: @@ -377,7 +377,7 @@ end_loops_16out: // ldmeqfd sp!,{x4-x12,x15} //reload the registers from sp bne lbl355 ldp x19, x20,[sp], #16 - pop_v_regs + ret lbl355: mov x5, #4 @@ -418,34 +418,34 @@ inner_loop_wd_4_16out: ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)// umlsl v0.8h, v6.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)// - umull v8.8h, v7.8b, v23.8b + umull v19.8h, v7.8b, v23.8b dup v4.2s, v7.2s[1] //src_tmp1 = vdup_lane_u32(src_tmp4, 1)// umull v2.8h, v7.8b, v25.8b //mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)// ld1 {v4.s}[1],[x3],x2 //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)// - umlsl v8.8h, v6.8b, v22.8b + umlsl v19.8h, v6.8b, v22.8b umlal v0.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)// dup v5.2s, v4.2s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)// - umlsl v8.8h, v4.8b, v24.8b + umlsl v19.8h, v4.8b, v24.8b ld1 {v5.s}[1],[x3],x2 //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)// umlsl v2.8h, v5.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)// dup v6.2s, v5.2s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)// - umlal v8.8h, v5.8b, v25.8b + umlal v19.8h, v5.8b, v25.8b ld1 {v6.s}[1],[x3],x2 //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)// umlal v0.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)// dup v7.2s, v6.2s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)// - umlal v8.8h, v6.8b, v26.8b + umlal v19.8h, v6.8b, v26.8b ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)// umlsl v2.8h, v7.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)// dup v4.2s, v7.2s[1] add v0.8h, v0.8h , v2.8h //mul_res1 = vaddq_u16(mul_res1, mul_res2)// - umlsl v8.8h, v7.8b, v27.8b + umlsl v19.8h, v7.8b, v27.8b ld1 {v4.s}[1],[x3],x2 - umlal v8.8h, v4.8b, v28.8b + umlal v19.8h, v4.8b, v28.8b dup v5.2s, 
v4.2s[1] //vqrshrun.s16 d0,q0,#6 //sto_res = vqmovun_s16(sto_res_tmp)// @@ -453,13 +453,13 @@ inner_loop_wd_4_16out: add x3,x1,x6 st1 {v0.d}[0],[x1],#8 //vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)// - umlsl v8.8h, v5.8b, v29.8b + umlsl v19.8h, v5.8b, v29.8b st1 {v0.d}[1],[x3],x6 //vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)// //vqrshrun.s16 d8,q4,#6 - st1 {v8.d}[0],[x3],x6 + st1 {v19.d}[0],[x3],x6 //add x1,x1,#4 - st1 {v8.d}[1],[x3] + st1 {v19.d}[1],[x3] bgt inner_loop_wd_4_16out end_inner_loop_wd_4_16out: @@ -470,7 +470,7 @@ end_inner_loop_wd_4_16out: // ldmfd sp!, {x4-x12, x15} //reload the registers from sp ldp x19, x20,[sp], #16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_inter_pred_luma_copy_w16out.s b/common/arm64/ihevc_inter_pred_luma_copy_w16out.s index 86ffdba..b5498cf 100644 --- a/common/arm64/ihevc_inter_pred_luma_copy_w16out.s +++ b/common/arm64/ihevc_inter_pred_luma_copy_w16out.s @@ -84,7 +84,7 @@ ihevc_inter_pred_luma_copy_w16out_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! mov x15,x4 // pi1_coeff @@ -138,7 +138,7 @@ end_inner_loop_wd_4: end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp], #16 - pop_v_regs + ret @@ -159,14 +159,14 @@ core_loop_wd_8: prolog: add x6,x0,x2 //pu1_src_tmp += src_strd add x10,x1,x5 - ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp) - ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) - uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp) - uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp) - uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp) + ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) + uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp) subs x4,x4,#8 //wd decrements by 8 shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6) shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6) @@ -175,10 +175,10 @@ prolog: add x20,x0,x8 csel x0, x20, x0,le add x6,x0,x2 //pu1_src_tmp += src_strd - ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp) - ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp) + ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp) st1 {v0.8h},[x1],#16 //vst1q_s16(pi2_dst_tmp, tmp) add x20,x1,x11,lsl #1 @@ -196,15 +196,15 @@ prolog: outer_loop_wd_8: st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) + uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp) st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp) - uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp) subs x4,x4,#8 //wd decrements by 8 add x20,x0,x8 @@ -212,16 
+212,16 @@ outer_loop_wd_8:
add x6,x0,x2 //pu1_src_tmp += src_strd
- ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
+ ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6)
- ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6)
- ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
shl v4.8h, v20.8h,#6 //vshlq_n_s16(tmp, 6)
- ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
add x10,x1,x5
shl v6.8h, v22.8h,#6 //vshlq_n_s16(tmp, 6)
@@ -238,15 +238,15 @@ epilog:
st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
- uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
+ uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
- uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
- uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
- uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
//add x6,x0,x2 //pu1_src_tmp += src_strd
shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6)
@@ -264,7 +264,7 @@ epilog_end:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp], #16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s b/common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s
index b94ec3c..7147200 100644
--- a/common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s
+++ b/common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s
@@ -114,7 +114,7 @@ ihevc_inter_pred_luma_vert_w16inp_w16out_av8:
//stmfd sp!, {r4-r12, r14} //stack stores the values of the arguments
- push_v_regs
+ stp x19,x20,[sp, #-16]!
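The ihevc_inter_pred_luma_vert_w16inp_w16out hunks that follow are the same mechanical rename: the four software-pipelined accumulators move from v8/v10/v12/v14 to v19/v20/v21/v31 while the instruction sequence itself is untouched. Condensed to a single output row, the pattern each prolog/kernel_8/epilog hunk repeats looks like the sketch below (for orientation only; as in the surrounding code, v0-v7 hold eight consecutive input rows, v22-v29 the filter taps, and v30 the offset, and the comments paraphrase the originals):

    smull v19.4s, v1.4h, v23.4h    // acc  = row1 * coeffabs_1
    smlal v19.4s, v0.4h, v22.4h    // acc += row0 * coeffabs_0
    smlal v19.4s, v2.4h, v24.4h    // acc += row2 * coeffabs_2
    smlal v19.4s, v3.4h, v25.4h    // acc += row3 * coeffabs_3
    smlal v19.4s, v4.4h, v26.4h    // acc += row4 * coeffabs_4
    smlal v19.4s, v5.4h, v27.4h    // acc += row5 * coeffabs_5
    smlal v19.4s, v6.4h, v28.4h    // acc += row6 * coeffabs_6
    smlal v19.4s, v7.4h, v29.4h    // acc += row7 * coeffabs_7
    sub   v19.4s, v19.4s, v30.4s   // subtract the w16inp offset
    shrn  v19.4h, v19.4s, #6       // >> 6 and narrow to the 16-bit output

v19 is caller-saved, so no d-register spill is needed around the call, which is the point of the commit.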
mov x15,x4 // pi1_coeff
@@ -163,71 +163,71 @@ prolog:
ld1 {v0.4h},[x0], #8 //src_tmp1 = ld1_u8(pu1_src_tmp)//
subs x4,x4,#4
ld1 {v2.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smull v8.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
+ smull v19.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
ld1 {v3.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
- smlal v8.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+ smlal v19.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
ld1 {v4.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- smlal v8.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+ smlal v19.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
ld1 {v5.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
- smlal v8.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ smlal v19.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
ld1 {v6.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smlal v8.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ smlal v19.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
ld1 {v7.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
- smlal v8.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
- smlal v8.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
- smlal v8.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+ smlal v19.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+ smlal v19.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ smlal v19.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
ld1 {v16.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- smull v10.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
+ smull v20.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
add x20,x0,x8,lsl #0
csel x0,x20,x0,le
- smlal v10.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+ smlal v20.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
csel x4,x5,x4,le
- smlal v10.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+ smlal v20.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
ld1 {v17.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
- smlal v10.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ smlal v20.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
ld1 {v18.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smlal v10.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ smlal v20.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
add x3,x0,x2 //pu1_src_tmp += src_strd//
- smlal v10.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
- smlal v10.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
- smlal v10.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
- sub v8.4s, v8.4s, v30.4s
+ smlal v20.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+ smlal v20.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ smlal v20.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+ sub v19.4s, v19.4s, v30.4s
ld1 {v1.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smull v12.4s,v3.4h,v23.4h
+ smull v21.4s,v3.4h,v23.4h
ld1 {v0.4h},[x0],#8 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- smlal v12.4s,v2.4h,v22.4h
+ smlal v21.4s,v2.4h,v22.4h
ld1 {v2.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smlal v12.4s,v4.4h,v24.4h
- smlal v12.4s,v5.4h,v25.4h
- smlal v12.4s,v6.4h,v26.4h
- smlal v12.4s,v7.4h,v27.4h
- smlal v12.4s,v16.4h,v28.4h
- smlal v12.4s,v17.4h,v29.4h
+ smlal v21.4s,v4.4h,v24.4h
+ smlal v21.4s,v5.4h,v25.4h
+ smlal v21.4s,v6.4h,v26.4h
+ smlal v21.4s,v7.4h,v27.4h
+ smlal v21.4s,v16.4h,v28.4h
+ smlal v21.4s,v17.4h,v29.4h
add x14,x1,x6
- sub v10.4s, v10.4s, v30.4s
- shrn v8.4h, v8.4s, #6
+ sub v20.4s, v20.4s, v30.4s
+ shrn v19.4h, v19.4s, #6
//vqrshrun d8,q4,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
- smull v14.4s,v4.4h,v23.4h
- smlal v14.4s,v3.4h,v22.4h
- smlal v14.4s,v5.4h,v24.4h
- smlal v14.4s,v6.4h,v25.4h
+ smull v31.4s,v4.4h,v23.4h
+ smlal v31.4s,v3.4h,v22.4h
+ smlal v31.4s,v5.4h,v24.4h
+ smlal v31.4s,v6.4h,v25.4h
ld1 {v3.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v7.4h,v26.4h
+ smlal v31.4s,v7.4h,v26.4h
ld1 {v4.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v16.4h,v27.4h
+ smlal v31.4s,v16.4h,v27.4h
ld1 {v5.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v17.4h,v28.4h
+ smlal v31.4s,v17.4h,v28.4h
ld1 {v6.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v18.4h,v29.4h
+ smlal v31.4s,v18.4h,v29.4h
ld1 {v7.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
- st1 {v8.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)//
- sub v12.4s, v12.4s, v30.4s
- shrn v10.4h, v10.4s, #6
+ st1 {v19.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)//
+ sub v21.4s, v21.4s, v30.4s
+ shrn v20.4h, v20.4s, #6
//vqrshrun d10,q5,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
add x20, x1, x9
csel x1, x20, x1, le
@@ -240,87 +240,87 @@ kernel_8:
- smull v8.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
+ smull v19.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
subs x4,x4,#4
- smlal v8.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+ smlal v19.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
add x20,x0,x8,lsl #0
csel x0,x20,x0,le
- smlal v8.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
- smlal v8.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
- smlal v8.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
- smlal v8.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
- smlal v8.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
- smlal v8.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
- st1 {v10.2s},[x14],x6 //st1_u8(pu1_dst_tmp,sto_res)//
-
- sub v14.4S, v14.4s, v30.4s
- shrn v12.4h, v12.4s, #6
+ smlal v19.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+ smlal v19.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ smlal v19.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ smlal v19.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+ smlal v19.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ smlal v19.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+ st1 {v20.2s},[x14],x6 //st1_u8(pu1_dst_tmp,sto_res)//
+
+ sub v31.4S, v31.4s, v30.4s
+ shrn v21.4h, v21.4s, #6
//vqrshrun d12,q6,#6
ld1 {v16.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- smull v10.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
- smlal v10.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
- smlal v10.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
- smlal v10.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
- smlal v10.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
- smlal v10.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
- st1 {v12.2s},[x14],x6
+ smull v20.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
+ smlal v20.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+ smlal v20.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+ smlal v20.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ smlal v20.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ smlal v20.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+ st1 {v21.2s},[x14],x6
- smlal v10.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ smlal v20.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
ld1 {v17.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
- smlal v10.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+ smlal v20.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
- sub v8.4s, v8.4s, v30.4s
- shrn v14.4h, v14.4s, #6
+ sub v19.4s, v19.4s, v30.4s
+ shrn v31.4h, v31.4s, #6
//vqrshrun d14,q7,#6
- smull v12.4s,v3.4h,v23.4h
+ smull v21.4s,v3.4h,v23.4h
csel x4,x5,x4,le
- smlal v12.4s,v2.4h,v22.4h
+ smlal v21.4s,v2.4h,v22.4h
ld1 {v18.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smlal v12.4s,v4.4h,v24.4h
+ smlal v21.4s,v4.4h,v24.4h
add x3,x0,x2 //pu1_src_tmp += src_strd//
- smlal v12.4s,v5.4h,v25.4h
+ smlal v21.4s,v5.4h,v25.4h
- smlal v12.4s,v6.4h,v26.4h
- st1 {v14.2s},[x14],x6
+ smlal v21.4s,v6.4h,v26.4h
+ st1 {v31.2s},[x14],x6
- smlal v12.4s,v7.4h,v27.4h
+ smlal v21.4s,v7.4h,v27.4h
ld1 {v1.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
- smlal v12.4s,v16.4h,v28.4h
+ smlal v21.4s,v16.4h,v28.4h
add x14,x1,x6
- smlal v12.4s,v17.4h,v29.4h
+ smlal v21.4s,v17.4h,v29.4h
ld1 {v0.4h},[x0],#8 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- sub v10.4s, v10.4s, v30.4s
- shrn v8.4h, v8.4s, #6
+ sub v20.4s, v20.4s, v30.4s
+ shrn v19.4h, v19.4s, #6
//vqrshrun d8,q4,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
ld1 {v2.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smull v14.4s,v4.4h,v23.4h
- smlal v14.4s,v3.4h,v22.4h
- smlal v14.4s,v5.4h,v24.4h
+ smull v31.4s,v4.4h,v23.4h
+ smlal v31.4s,v3.4h,v22.4h
+ smlal v31.4s,v5.4h,v24.4h
ld1 {v3.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v6.4h,v25.4h
+ smlal v31.4s,v6.4h,v25.4h
ld1 {v4.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v7.4h,v26.4h
+ smlal v31.4s,v7.4h,v26.4h
ld1 {v5.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v16.4h,v27.4h
+ smlal v31.4s,v16.4h,v27.4h
ld1 {v6.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v17.4h,v28.4h
+ smlal v31.4s,v17.4h,v28.4h
ld1 {v7.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v18.4h,v29.4h
- st1 {v8.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)//
+ smlal v31.4s,v18.4h,v29.4h
+ st1 {v19.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)//
- sub v12.4s, v12.4s, v30.4s
- shrn v10.4h, v10.4s, #6
+ sub v21.4s, v21.4s, v30.4s
+ shrn v20.4h, v20.4s, #6
add x20, x1, x9
csel x1, x20, x1, le
@@ -331,83 +331,83 @@ epilog:
- smull v8.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
- smlal v8.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1,
coeffabs_0)// - smlal v8.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)// - smlal v8.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)// - smlal v8.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)// - smlal v8.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)// - smlal v8.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)// - smlal v8.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)// - st1 {v10.2s},[x14],x6 - - sub v14.4s, v14.4s, v30.4s - shrn v12.4h, v12.4s, #6 + smull v19.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)// + smlal v19.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)// + smlal v19.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)// + smlal v19.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)// + smlal v19.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)// + smlal v19.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)// + smlal v19.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)// + smlal v19.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)// + st1 {v20.2s},[x14],x6 + + sub v31.4s, v31.4s, v30.4s + shrn v21.4h, v21.4s, #6 //vqrshrun d12,q6,#6 ld1 {v16.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)// - smull v10.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)// - smlal v10.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)// - smlal v10.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)// - smlal v10.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)// - smlal v10.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)// - smlal v10.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)// - smlal v10.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)// - smlal v10.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)// - st1 {v12.2s},[x14],x6 - - sub v8.4s, v8.4s, v30.4s - shrn v14.4h, v14.4s, #6 + smull v20.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)// + smlal v20.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)// + smlal v20.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)// + smlal v20.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)// + smlal v20.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)// + smlal v20.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)// + smlal v20.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)// + smlal v20.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)// + st1 {v21.2s},[x14],x6 + + sub v19.4s, v19.4s, v30.4s + shrn v31.4h, v31.4s, #6 //vqrshrun d14,q7,#6 ld1 {v17.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)// - smull v12.4s,v3.4h,v23.4h - smlal v12.4s,v2.4h,v22.4h - smlal v12.4s,v4.4h,v24.4h - smlal v12.4s,v5.4h,v25.4h - smlal v12.4s,v6.4h,v26.4h - smlal v12.4s,v7.4h,v27.4h - smlal v12.4s,v16.4h,v28.4h - smlal v12.4s,v17.4h,v29.4h - st1 {v14.2s},[x14],x6 - sub v10.4s, v10.4s, v30.4s - shrn v8.4h, v8.4s, #6 + smull v21.4s,v3.4h,v23.4h + smlal v21.4s,v2.4h,v22.4h + smlal v21.4s,v4.4h,v24.4h + smlal v21.4s,v5.4h,v25.4h + smlal v21.4s,v6.4h,v26.4h + smlal v21.4s,v7.4h,v27.4h + smlal v21.4s,v16.4h,v28.4h + smlal v21.4s,v17.4h,v29.4h + st1 {v31.2s},[x14],x6 + sub v20.4s, v20.4s, 
v30.4s + shrn v19.4h, v19.4s, #6 //vqrshrun d8,q4,#6 //sto_res = vqmovun_s16(sto_res_tmp)// ld1 {v18.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)// - smull v14.4s,v4.4h,v23.4h - smlal v14.4s,v3.4h,v22.4h - smlal v14.4s,v5.4h,v24.4h - smlal v14.4s,v6.4h,v25.4h - smlal v14.4s,v7.4h,v26.4h - smlal v14.4s,v16.4h,v27.4h - smlal v14.4s,v17.4h,v28.4h - smlal v14.4s,v18.4h,v29.4h - sub v12.4s, v12.4s, v30.4s - shrn v10.4h, v10.4s, #6 + smull v31.4s,v4.4h,v23.4h + smlal v31.4s,v3.4h,v22.4h + smlal v31.4s,v5.4h,v24.4h + smlal v31.4s,v6.4h,v25.4h + smlal v31.4s,v7.4h,v26.4h + smlal v31.4s,v16.4h,v27.4h + smlal v31.4s,v17.4h,v28.4h + smlal v31.4s,v18.4h,v29.4h + sub v21.4s, v21.4s, v30.4s + shrn v20.4h, v20.4s, #6 //vqrshrun d10,q5,#6 //sto_res = vqmovun_s16(sto_res_tmp)// add x14,x1,x6 - st1 {v8.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)// + st1 {v19.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)// epilog_end: - st1 {v10.2s},[x14],x6 //st1_u8(pu1_dst_tmp,sto_res)// - shrn v12.4h, v12.4s, #6 + st1 {v20.2s},[x14],x6 //st1_u8(pu1_dst_tmp,sto_res)// + shrn v21.4h, v21.4s, #6 //vqrshrun d12,q6,#6 - st1 {v12.2s},[x14],x6 - sub v14.4s, v14.4s, v30.4s - shrn v14.4h, v14.4s, #6 + st1 {v21.2s},[x14],x6 + sub v31.4s, v31.4s, v30.4s + shrn v31.4h, v31.4s, #6 //vqrshrun d14,q7,#6 - st1 {v14.2s},[x14],x6 + st1 {v31.2s},[x14],x6 end_loops: //ldmfd sp!,{r4-r12,r15} //reload the registers from sp ldp x19, x20,[sp], #16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_intra_pred_chroma_horz.s b/common/arm64/ihevc_intra_pred_chroma_horz.s index da41e59..8de655c 100644 --- a/common/arm64/ihevc_intra_pred_chroma_horz.s +++ b/common/arm64/ihevc_intra_pred_chroma_horz.s @@ -96,7 +96,7 @@ ihevc_intra_pred_chroma_horz_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! lsl x6,x4,#2 //four_nt @@ -117,7 +117,7 @@ ihevc_intra_pred_chroma_horz_av8: core_loop_16: ld1 { v0.8h},[x12] //load 16 values. d1[7] will have the 1st value. sub x12,x12,#16 - ld1 { v10.8h},[x12] //load 16 values. d1[7] will have the 1st value. + ld1 { v18.8h},[x12] //load 16 values. d1[7] will have the 1st value. dup v2.8h, v0.4h[7] //duplicate the i value. 
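 // Background on why these renames cut stack traffic (AAPCS64 fact, stated here
 // rather than in the patch): the low 64 bits of v8-v15 (d8-d15) are callee-saved,
 // so any use forces stp/ldp pairs in the prologue and epilogue, while v0-v7 and
 // v16-v31 are caller-saved and free to clobber. Moving a scratch value out of
 // v8, as in the hunk above (syntax copied from this file's convention):
    dup v8.8h, v0.4h[4]             // old: touches d8, which must be spilled on entry
    dup v1.8h, v0.4h[4]             // new: v1 is caller-saved, no spill required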
@@ -126,7 +126,7 @@ core_loop_16: st1 { v2.8h},[x2],x3 //store in 1st row 0-16 columns st1 { v2.8h},[x9],x3 //store in 1st row 16-32 columns - dup v8.8h, v0.4h[4] + dup v1.8h, v0.4h[4] st1 { v4.8h},[x2],x3 st1 { v4.8h},[x9],x3 @@ -135,47 +135,47 @@ core_loop_16: st1 { v6.8h},[x9],x3 dup v4.8h, v0.4h[2] - st1 { v8.8h},[x2],x3 - st1 { v8.8h},[x9],x3 + st1 { v1.8h},[x2],x3 + st1 { v1.8h},[x9],x3 dup v6.8h, v0.4h[1] st1 { v2.8h},[x2],x3 st1 { v2.8h},[x9],x3 - dup v8.8h, v0.4h[0] + dup v1.8h, v0.4h[0] st1 { v4.8h},[x2],x3 st1 { v4.8h},[x9],x3 - dup v2.8h, v10.4h[7] + dup v2.8h, v18.4h[7] st1 { v6.8h},[x2],x3 st1 { v6.8h},[x9],x3 - dup v4.8h, v10.4h[6] - st1 { v8.8h},[x2],x3 - st1 { v8.8h},[x9],x3 + dup v4.8h, v18.4h[6] + st1 { v1.8h},[x2],x3 + st1 { v1.8h},[x9],x3 - dup v6.8h, v10.4h[5] + dup v6.8h, v18.4h[5] st1 { v2.8h},[x2],x3 st1 { v2.8h},[x9],x3 - dup v8.8h, v10.4h[4] + dup v1.8h, v18.4h[4] st1 { v4.8h},[x2],x3 st1 { v4.8h},[x9],x3 - dup v2.8h, v10.4h[3] + dup v2.8h, v18.4h[3] st1 { v6.8h},[x2],x3 st1 { v6.8h},[x9],x3 - dup v4.8h, v10.4h[2] - st1 { v8.8h},[x2],x3 - st1 { v8.8h},[x9],x3 + dup v4.8h, v18.4h[2] + st1 { v1.8h},[x2],x3 + st1 { v1.8h},[x9],x3 - dup v6.8h, v10.4h[1] + dup v6.8h, v18.4h[1] st1 { v2.8h},[x2],x3 st1 { v2.8h},[x9],x3 sub x12,x12,#16 //move to 16th value pointer - dup v8.8h, v10.4h[0] + dup v1.8h, v18.4h[0] st1 { v4.8h},[x2],x3 st1 { v4.8h},[x9],x3 @@ -183,12 +183,12 @@ core_loop_16: st1 { v6.8h},[x2],x3 st1 { v6.8h},[x9],x3 - st1 { v8.8h},[x2],x3 - st1 { v8.8h},[x9],x3 + st1 { v1.8h},[x2],x3 + st1 { v1.8h},[x9],x3 bgt core_loop_16 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret b endloop @@ -203,7 +203,7 @@ core_loop_8: sub x12,x12,#16 // ld1 { v30.16b},[x12] - dup v10.8h, v0.4h[7] + dup v18.8h, v0.4h[7] //vmovl.u8 q13,d26 dup v2.8h, v0.4h[6] @@ -215,18 +215,18 @@ core_loop_8: dup v6.8h, v0.4h[4] //vqadd.s16 q11,q13,q12 - dup v8.8h, v0.4h[3] + dup v1.8h, v0.4h[3] //vqmovun.s16 d22,q11 - st1 { v10.8h},[x2],x3 + st1 { v18.8h},[x2],x3 - dup v10.8h, v0.4h[2] + dup v18.8h, v0.4h[2] //vsubl.u8 q12,d31,d28 - dup v12.8h, v0.4h[1] + dup v19.8h, v0.4h[1] //vshr.s16 q12,q12,#1 - dup v14.8h, v0.4h[0] + dup v20.8h, v0.4h[0] //vqadd.s16 q11,q13,q12 dup v16.8h, v0.4h[3] @@ -238,14 +238,14 @@ core_loop_8: st1 { v4.8h},[x2],x3 st1 { v6.8h},[x2],x3 - st1 { v8.8h},[x2],x3 - st1 { v10.8h},[x2],x3 + st1 { v1.8h},[x2],x3 + st1 { v18.8h},[x2],x3 //vdup.8 q1,d0[2] - st1 { v12.8h},[x2],x3 + st1 { v19.8h},[x2],x3 //vdup.8 q2,d0[1] - st1 { v14.8h},[x2],x3 + st1 { v20.8h},[x2],x3 //vdup.8 q3,d0[0] //vst1.8 {q7},[x2],x3 @@ -269,7 +269,7 @@ core_loop_8: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret b endloop @@ -305,11 +305,11 @@ core_loop_4: st1 {v6.8b},[x2],x3 st1 {v3.8b},[x2],x3 - dup v8.4h, v0.4h[1] + dup v1.4h, v0.4h[1] st1 {v4.8b},[x2],x3 st1 {v5.8b},[x2],x3 - dup v9.4h, v0.4h[0] + dup v17.4h, v0.4h[0] //vst1.8 {d6},[x2],x3 //vst1.8 {d7},[x2],x3 @@ -317,7 +317,7 @@ core_loop_4: //vst1.8 {d9},[x2],x3 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret b endloop @@ -352,7 +352,7 @@ core_loop_4: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret endloop: diff --git a/common/arm64/ihevc_intra_pred_chroma_mode_18_34.s b/common/arm64/ihevc_intra_pred_chroma_mode_18_34.s index 52fc702..aacb35e 100644 --- a/common/arm64/ihevc_intra_pred_chroma_mode_18_34.s +++ 
b/common/arm64/ihevc_intra_pred_chroma_mode_18_34.s @@ -105,7 +105,7 @@ ihevc_intra_pred_chroma_mode_18_34_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! @@ -141,14 +141,14 @@ kernel: st1 {v4.8b, v5.8b},[x10],x3 ld1 {v6.8b, v7.8b},[x8],x6 st1 {v6.8b, v7.8b},[x10],x3 - ld1 {v8.8b, v9.8b},[x8],x6 - st1 {v8.8b, v9.8b},[x10],x3 - ld1 {v10.8b, v11.8b},[x8],x6 - st1 {v10.8b, v11.8b},[x10],x3 - ld1 {v12.8b, v13.8b},[x8],x6 - st1 {v12.8b, v13.8b},[x10],x3 - ld1 {v14.8b, v15.8b},[x8],x6 - st1 {v14.8b, v15.8b},[x10],x3 + ld1 {v16.8b, v17.8b},[x8],x6 + st1 {v16.8b, v17.8b},[x10],x3 + ld1 {v18.8b, v19.8b},[x8],x6 + st1 {v18.8b, v19.8b},[x10],x3 + ld1 {v20.8b, v21.8b},[x8],x6 + st1 {v20.8b, v21.8b},[x10],x3 + ld1 {v22.8b, v23.8b},[x8],x6 + st1 {v22.8b, v23.8b},[x10],x3 subs x12,x12,#8 bne kernel @@ -188,7 +188,7 @@ mode2_4: end_func: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s b/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s index 1df4ad0..b22d182 100644 --- a/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s +++ b/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s @@ -95,7 +95,10 @@ ihevc_intra_pred_chroma_mode_27_to_33_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d9,d10,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! stp x19, x20,[sp,#-16]! adrp x6, :got:gai4_ihevc_ang_table //loads word32 gai4_ihevc_ang_table[35] @@ -151,7 +154,7 @@ prologue: add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx] asr x14,x14,#8 //(ii)shift by 8 - ld1 {v8.8b},[x10],x11 //(i row)ref_main_idx + ld1 {v23.8b},[x10],x11 //(i row)ref_main_idx and x9,x14,#0xff //(ii)get the last byte asr x14,x14,#8 //(iii) @@ -163,7 +166,7 @@ prologue: add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v13.8b},[x12] //(ii)ref_main_idx_1 umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract) @@ -202,7 +205,7 @@ prologue: dup v29.8b, v4.8b[5] //(vi) add x10,x8,x9 //(v)*pu1_ref[ref_main_idx] - ld1 {v8.8b},[x10],x11 //(v)ref_main_idx + ld1 {v23.8b},[x10],x11 //(v)ref_main_idx sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract) asr x14,x14,#8 //(vi) @@ -224,7 +227,7 @@ prologue: add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v13.8b},[x12] //(vi)ref_main_idx_1 umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract) @@ -281,7 +284,7 @@ kernel_8_rows: dup v31.8b, v4.8b[0] subs x4,x4,#8 - ld1 {v8.8b},[x10],x11 //(i)ref_main_idx + ld1 {v23.8b},[x10],x11 //(i)ref_main_idx sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract) and x9,x14,#0xff //(ii) add x20,x6,#8 //increment the row value @@ -304,7 +307,7 @@ kernel_8_rows: add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract) asr x14,x14,#8 //(iv) ld1 {v13.8b},[x12] //(ii)ref_main_idx_1 @@ -362,7 +365,7 
@@ kernel_8_rows: rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5) add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx] - ld1 {v8.8b},[x10],x11 //(v)ref_main_idx + ld1 {v23.8b},[x10],x11 //(v)ref_main_idx and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31)) asr x14,x14,#8 //(vii) @@ -379,7 +382,7 @@ kernel_8_rows: add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx] ld1 {v13.8b},[x12] //(vi)ref_main_idx_1 - umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) and x9,x14,#0xff //(viii) smov x14, v3.2s[0] //(i)extract idx to the r register @@ -479,7 +482,7 @@ core_loop_4: dup v7.8b,w4 //dup_const_32_fract umlal v4.8h, v3.8b, v0.8b //vmull_u8(ref_main_idx_1, dup_const_fract) - ld1 {v8.8b},[x10] //ref_main_idx + ld1 {v23.8b},[x10] //ref_main_idx add x8,x8,#1 ld1 {v9.8b},[x11] //ref_main_idx_1 @@ -495,7 +498,7 @@ core_loop_4: add x11,x10,#2 //pu1_ref_main_idx_1 += 1 dup v12.8b,w5 //dup_const_fract - umull v10.8h, v8.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract) sub x20,x5,#32 neg x4, x20 @@ -543,7 +546,9 @@ core_loop_4: end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d9,d10,[sp],#16 ret diff --git a/common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s b/common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s index 3c8746c..bf026a3 100644 --- a/common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s +++ b/common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s @@ -104,7 +104,10 @@ ihevc_intra_pred_chroma_mode_3_to_9_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d13,d14,[sp,#-16]! + stp d8,d15,[sp,#-16]! // Storing d15 using { sub sp,sp,#8; str d15,[sp] } is giving bus error. + // d8 is used as dummy register and stored along with d15 using stp. d8 is not used in the function. stp x19, x20,[sp,#-16]! 
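 // On the bus-error remark above: the error itself is the commit's own finding;
 // the likely mechanism (an inference, not stated in the patch) is AArch64's
 // stack-alignment check, which faults any sp-based access while sp is not
 // 16-byte aligned. A lone 8-byte push therefore traps:
 //     sub sp, sp, #8
 //     str d15, [sp]               // sp only 8-byte aligned here -> alignment fault
 // Pairing the live register with the unused d8 keeps sp 16-byte aligned:
    stp d8, d15, [sp, #-16]!        // d8 is a dummy; only d15 needs preserving
    ldp d8, d15, [sp], #16          // matching pop in the epilogue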
adrp x7, :got:gai4_ihevc_ang_table @@ -157,8 +160,8 @@ prologue_8_16_32: movi v28.8b, #32 - sqxtn v8.8b, v22.8h - shl v8.8b, v8.8b,#1 // 2 * idx + sqxtn v2.8b, v22.8h + shl v2.8b, v2.8b,#1 // 2 * idx and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0 movi v29.8b, #2 //contains #2 for adding to get ref_main_idx + 1 @@ -167,58 +170,58 @@ prologue_8_16_32: dup v27.4h,w0 mov x0,#0 - movi v9.8b, #22 //row 0 to 7 + movi v3.8b, #22 //row 0 to 7 - sub v8.8b, v8.8b , v27.8b //ref_main_idx (sub row) - sub v8.8b, v26.8b , v8.8b //ref_main_idx (row 0) - add v8.8b, v8.8b , v9.8b //to compensate the pu1_src idx incremented by 8 - sub v9.8b, v8.8b , v29.8b //ref_main_idx + 1 (row 0) - tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0) + sub v2.8b, v2.8b , v27.8b //ref_main_idx (sub row) + sub v2.8b, v26.8b , v2.8b //ref_main_idx (row 0) + add v2.8b, v2.8b , v3.8b //to compensate the pu1_src idx incremented by 8 + sub v3.8b, v2.8b , v29.8b //ref_main_idx + 1 (row 0) + tbl v25.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 0) sub v7.8b, v28.8b , v6.8b //32-fract - tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0) - sub v4.8b, v8.8b , v29.8b //ref_main_idx (row 1) - sub v5.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 1) + tbl v13.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 0) + sub v4.8b, v2.8b , v29.8b //ref_main_idx (row 1) + sub v5.8b, v3.8b , v29.8b //ref_main_idx + 1 (row 1) movi v29.8b, #4 tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1) - umull v24.8h, v12.8b, v7.8b //mul (row 0) + umull v24.8h, v25.8b, v7.8b //mul (row 0) umlal v24.8h, v13.8b, v6.8b //mul (row 0) tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1) - sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 2) - sub v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 2) + sub v2.8b, v2.8b , v29.8b //ref_main_idx (row 2) + sub v3.8b, v3.8b , v29.8b //ref_main_idx + 1 (row 2) rshrn v24.8b, v24.8h,#5 //round shft (row 0) - tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2) + tbl v14.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 2) umull v22.8h, v16.8b, v7.8b //mul (row 1) umlal v22.8h, v17.8b, v6.8b //mul (row 1) - tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2) + tbl v15.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 2) sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 3) sub v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 3) st1 {v24.8b},[x2], x3 //st (row 0) rshrn v22.8b, v22.8h,#5 //round shft (row 1) - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3) + tbl v19.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3) umull v20.8h, v14.8b, v7.8b //mul (row 2) umlal v20.8h, v15.8b, v6.8b //mul (row 2) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3) - sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 4) - sub v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 4) + tbl v23.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3) + sub v2.8b, v2.8b , v29.8b //ref_main_idx (row 4) + sub v3.8b, v3.8b , v29.8b //ref_main_idx + 1 (row 4) st1 {v22.8b},[x2], x3 //st (row 1) rshrn v20.8b, v20.8h,#5 //round shft (row 2) - tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4) - umull v18.8h, v10.8b, v7.8b //mul (row 3) - umlal v18.8h, v11.8b, v6.8b //mul (row 3) + tbl v25.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 4) + umull v18.8h, v19.8b, v7.8b //mul (row 3) + umlal v18.8h, v23.8b, 
v6.8b //mul (row 3) - tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4) + tbl v13.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 4) sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 5) sub v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 5) @@ -226,36 +229,36 @@ prologue_8_16_32: rshrn v18.8b, v18.8h,#5 //round shft (row 3) tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5) - umull v24.8h, v12.8b, v7.8b //mul (row 4) + umull v24.8h, v25.8b, v7.8b //mul (row 4) umlal v24.8h, v13.8b, v6.8b //mul (row 4) tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5) - sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 6) - sub v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 6) + sub v2.8b, v2.8b , v29.8b //ref_main_idx (row 6) + sub v3.8b, v3.8b , v29.8b //ref_main_idx + 1 (row 6) st1 {v18.8b},[x2], x3 //st (row 3) cmp x4,#4 beq end_func rshrn v24.8b, v24.8h,#5 //round shft (row 4) - tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6) + tbl v14.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 6) umull v22.8h, v16.8b, v7.8b //mul (row 5) umlal v22.8h, v17.8b, v6.8b //mul (row 5) - tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6) + tbl v15.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 6) sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 7) sub v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 7) st1 {v24.8b},[x2], x3 //st (row 4) rshrn v22.8b, v22.8h,#5 //round shft (row 5) - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) + tbl v19.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) umull v20.8h, v14.8b, v7.8b //mul (row 6) umlal v20.8h, v15.8b, v6.8b //mul (row 6) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + tbl v23.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) + umull v18.8h, v19.8b, v7.8b //mul (row 7) + umlal v18.8h, v23.8b, v6.8b //mul (row 7) st1 {v22.8b},[x2], x3 //st (row 5) rshrn v20.8b, v20.8h,#5 //round shft (row 6) @@ -289,11 +292,11 @@ lbl284: csel x0, x20, x0,le ld1 {v31.8b},[x14],#8 - smull v12.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col) - xtn v10.8b, v12.8h - sshr v12.8h, v12.8h,#5 - sqxtn v11.8b, v12.8h - shl v11.8b, v11.8b,#1 + smull v25.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col) + xtn v19.8b, v25.8h + sshr v25.8h, v25.8h,#5 + sqxtn v23.8b, v25.8h + shl v23.8b, v23.8b,#1 mov x5, #0x302 //idx value for v is +1 of u dup v27.4h,w5 //row value inc or reset accordingly ldr w9, [x8] //loads index value @@ -305,25 +308,25 @@ lbl284: dup v26.8b,w9 mov x5,x2 - sub v11.8b, v11.8b , v27.8b //ref_main_idx (sub row) + sub v23.8b, v23.8b , v27.8b //ref_main_idx (sub row) kernel_8_16_32: movi v29.8b, #2 //contains #2 for adding to get ref_main_idx + 1 - sub v8.8b, v26.8b , v11.8b //ref_main_idx - mov v26.8b, v10.8b + sub v2.8b, v26.8b , v23.8b //ref_main_idx + mov v26.8b, v19.8b subs x11, x11, #8 sub x6, x1, x9 - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) - add v8.8b, v8.8b , v16.8b //to compensate the pu1_src idx incremented by 8 + tbl v19.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) + add v2.8b, v2.8b , v16.8b //to compensate the pu1_src idx incremented by 8 umull v20.8h, v14.8b, v7.8b //mul (row 6) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx - 1 (row 7) + tbl v23.8b, { v0.16b, v1.16b}, v5.8b //load from 
ref_main_idx - 1 (row 7) umlal v20.8h, v15.8b, v6.8b //mul (row 6) add x20, x0, #8 csel x0, x20, x0,le - sub v9.8b, v8.8b , v29.8b //ref_main_idx - 2 + sub v3.8b, v2.8b , v29.8b //ref_main_idx - 2 add x20, x8, #4 csel x8, x20, x8,gt @@ -339,15 +342,15 @@ lbl326: mov x9,#0x302 dup v27.4h,w9 //row value inc or reset accordingly - sub v4.8b, v8.8b , v29.8b //ref_main_idx (row 1) + sub v4.8b, v2.8b , v29.8b //ref_main_idx (row 1) - sub v5.8b, v9.8b , v29.8b //ref_main_idx - 1 (row 1) - tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0) + sub v5.8b, v3.8b , v29.8b //ref_main_idx - 1 (row 1) + tbl v25.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 0) movi v29.8b, #31 //contains #2 for adding to get ref_main_idx + 1 - umull v18.8h, v10.8b, v7.8b //mul (row 7) - tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + umull v18.8h, v19.8b, v7.8b //mul (row 7) + tbl v13.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 0) + umlal v18.8h, v23.8b, v6.8b //mul (row 7) ld1 {v31.8b},[x14],#8 and v6.8b, v29.8b , v26.8b //fract values in d1/ idx values in d0 @@ -361,14 +364,14 @@ lbl326: st1 {v22.8b},[x5], x3 //(from previous loop)st (row 5) rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6) - sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 2) - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1) - sub v9.8b, v9.8b , v29.8b //ref_main_idx - 1 (row 2) + sub v2.8b, v2.8b , v29.8b //ref_main_idx (row 2) + tbl v19.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1) + sub v3.8b, v3.8b , v29.8b //ref_main_idx - 1 (row 2) lsl x9, x9, #1 sub v7.8b, v28.8b , v6.8b //32-fract - umull v24.8h, v12.8b, v7.8b //mul (row 0) + umull v24.8h, v25.8b, v7.8b //mul (row 0) tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1) umlal v24.8h, v13.8b, v6.8b //mul (row 0) @@ -376,22 +379,22 @@ lbl326: rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7) sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 3) - tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2) + tbl v14.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 2) sub v5.8b, v5.8b , v29.8b //ref_main_idx - 1 (row 3) - umull v22.8h, v10.8b, v7.8b //mul (row 1) - tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2) + umull v22.8h, v19.8b, v7.8b //mul (row 1) + tbl v15.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 2) umlal v22.8h, v17.8b, v6.8b //mul (row 1) rshrn v24.8b, v24.8h,#5 //round shft (row 0) st1 {v18.8b},[x5], x3 //(from previous loop)st (row 7) - sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 4) - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3) - sub v9.8b, v9.8b , v29.8b //ref_main_idx - 1 (row 4) + sub v2.8b, v2.8b , v29.8b //ref_main_idx (row 4) + tbl v19.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3) + sub v3.8b, v3.8b , v29.8b //ref_main_idx - 1 (row 4) umull v20.8h, v14.8b, v7.8b //mul (row 2) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3) + tbl v23.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3) umlal v20.8h, v15.8b, v6.8b //mul (row 2) add x5,x2,x3,lsl#2 @@ -402,26 +405,26 @@ lbl326: rshrn v22.8b, v22.8h,#5 //round shft (row 1) sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 5) - tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4) + tbl v25.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 4) sub v5.8b, v5.8b , 
v29.8b //ref_main_idx - 1 (row 5) - umull v18.8h, v10.8b, v7.8b //mul (row 3) - tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4) - umlal v18.8h, v11.8b, v6.8b //mul (row 3) + umull v18.8h, v19.8b, v7.8b //mul (row 3) + tbl v13.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 4) + umlal v18.8h, v23.8b, v6.8b //mul (row 3) st1 {v22.8b},[x2], x3 //st (row 1) rshrn v20.8b, v20.8h,#5 //round shft (row 2) - xtn v10.8b, v14.8h + xtn v19.8b, v14.8h sshr v14.8h, v14.8h,#5 - sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 6) + sub v2.8b, v2.8b , v29.8b //ref_main_idx (row 6) tbl v21.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5) - sub v9.8b, v9.8b , v29.8b //ref_main_idx - 1 (row 6) + sub v3.8b, v3.8b , v29.8b //ref_main_idx - 1 (row 6) - umull v24.8h, v12.8b, v7.8b //mul (row 4) + umull v24.8h, v25.8b, v7.8b //mul (row 4) tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5) - sqxtn v11.8b, v14.8h + sqxtn v23.8b, v14.8h st1 {v20.8b},[x2], x3 //st (row 2) umlal v24.8h, v13.8b, v6.8b //mul (row 4) @@ -430,15 +433,15 @@ lbl326: dup v26.8b,w9 sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 7) - tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6) + tbl v14.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 6) sub v5.8b, v5.8b , v29.8b //ref_main_idx - 1 (row 7) mov x6, #22 //to compensate the 2*row value - shl v11.8b, v11.8b,#1 + shl v23.8b, v23.8b,#1 sub x6, x6, x0, lsl #1 umull v22.8h, v21.8b, v7.8b //mul (row 5) - tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6) + tbl v15.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 6) umlal v22.8h, v17.8b, v6.8b //mul (row 5) st1 {v18.8b},[x2], x3 //st (row 3) @@ -451,7 +454,7 @@ lbl326: sub x20, x2, x4 csel x2, x20, x2,le - sub v11.8b, v11.8b , v27.8b //ref_main_idx (add row) + sub v23.8b, v23.8b , v27.8b //ref_main_idx (add row) sub x20,x2,#8 csel x2, x20, x2,le @@ -460,17 +463,17 @@ lbl326: bne kernel_8_16_32 epil_8_16_32: - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) + tbl v19.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) umull v20.8h, v14.8b, v7.8b //mul (row 6) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) + tbl v23.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) umlal v20.8h, v15.8b, v6.8b //mul (row 6) st1 {v24.8b},[x5], x3 //st (row 4) rshrn v24.8b, v22.8h,#5 //round shft (row 5) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + umull v18.8h, v19.8b, v7.8b //mul (row 7) + umlal v18.8h, v23.8b, v6.8b //mul (row 7) st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5) rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6) @@ -481,9 +484,11 @@ epil_8_16_32: st1 {v18.8b},[x5], x3 //st (row 7) end_func: - // ldmfd sp!,{x4-x12,x15} //reload the registers from sp + // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d8,d15,[sp],#16 // Loading d15 using { ldr d15,[sp]; add sp,sp,#8 } is giving bus error. + // d8 is used as dummy register and loaded along with d15 using ldp. d8 is not used in the function. 
+ ldp d13,d14,[sp],#16 ret diff --git a/common/arm64/ihevc_intra_pred_chroma_planar.s b/common/arm64/ihevc_intra_pred_chroma_planar.s index ac6b362..65c4c56 100644 --- a/common/arm64/ihevc_intra_pred_chroma_planar.s +++ b/common/arm64/ihevc_intra_pred_chroma_planar.s @@ -106,7 +106,11 @@ ihevc_intra_pred_chroma_planar_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d8,d14,[sp,#-16]! // Storing d14 using { sub sp,sp,#8; str d14,[sp] } is giving bus error. + // d8 is used as dummy register and stored along with d14 using stp. d8 is not used in the function. stp x19, x20,[sp,#-16]! adrp x11, :got:gau1_ihevc_planar_factor //loads table of coeffs @@ -165,13 +169,13 @@ ihevc_intra_pred_chroma_planar_av8: mov x10,x6 tf_sz_8_16: ld1 {v10.8b, v11.8b}, [x14],#16 //load src[2nt+1+col] - ld1 {v8.8b},[x12],#8 - mov v9.8b, v8.8b - zip1 v29.8b, v8.8b, v9.8b - zip2 v9.8b, v8.8b, v9.8b - mov v8.d[0], v29.d[0] - sub v30.8b, v2.8b , v8.8b //[nt-1-col] - sub v31.8b, v2.8b , v9.8b + ld1 {v17.8b},[x12],#8 + mov v25.8b, v17.8b + zip1 v29.8b, v17.8b, v25.8b + zip2 v25.8b, v17.8b, v25.8b + mov v17.d[0], v29.d[0] + sub v30.8b, v2.8b , v17.8b //[nt-1-col] + sub v31.8b, v2.8b , v25.8b @@ -185,7 +189,7 @@ loop_sz_8_16: sxtw x11,w11 umlal v12.8h, v6.8b, v10.8b //(nt-1-row) * src[2nt+1+col] dup v4.4h,w7 //src[2nt-1-row] - umlal v12.8h, v8.8b, v1.8b //(col+1) * src[3nt+1] + umlal v12.8h, v17.8b, v1.8b //(col+1) * src[3nt+1] dup v3.4h,w11 //src[2nt-1-row] umlal v12.8h, v30.8b, v4.8b //(nt-1-col) * src[2nt-1-row] @@ -200,14 +204,14 @@ loop_sz_8_16: umlal v28.8h, v31.8b, v4.8b sub v19.8b, v6.8b , v7.8b //[nt-1-row]-- - umlal v28.8h, v9.8b, v1.8b + umlal v28.8h, v25.8b, v1.8b dup v4.4h,w7 //src[2nt-1-row] umull v26.8h, v18.8b, v0.8b //(row+1) * src[nt-1] add v12.8h, v12.8h , v16.8h //add (nt) umlal v26.8h, v19.8b, v10.8b //(nt-1-row) * src[2nt+1+col] sshl v12.8h, v12.8h, v14.8h //shr - umlal v26.8h, v8.8b, v1.8b //(col+1) * src[3nt+1] + umlal v26.8h, v17.8b, v1.8b //(col+1) * src[3nt+1] add v28.8h, v28.8h , v16.8h umlal v26.8h, v30.8b, v3.8b //(nt-1-col) * src[2nt-1-row] sshl v28.8h, v28.8h, v14.8h @@ -220,7 +224,7 @@ loop_sz_8_16: add v5.8b, v18.8b , v7.8b //row++ [(row+1)++] umlal v24.8h, v19.8b, v11.8b sub v6.8b, v19.8b , v7.8b //[nt-1-row]-- - umlal v24.8h, v9.8b, v1.8b + umlal v24.8h, v25.8b, v1.8b xtn v12.8b, v12.8h umlal v24.8h, v31.8b, v3.8b xtn v13.8b, v28.8h @@ -233,7 +237,7 @@ loop_sz_8_16: sshl v26.8h, v26.8h, v14.8h //shr umlal v22.8h, v6.8b, v10.8b //(nt-1-row) * src[2nt+1+col] st1 {v12.2s, v13.2s}, [x2], x3 - umlal v22.8h, v8.8b, v1.8b //(col+1) * src[3nt+1] + umlal v22.8h, v17.8b, v1.8b //(col+1) * src[3nt+1] add v24.8h, v24.8h , v16.8h umlal v22.8h, v30.8b, v4.8b //(nt-1-col) * src[2nt-1-row] sshl v24.8h, v24.8h, v14.8h @@ -246,7 +250,7 @@ loop_sz_8_16: ldr w11, [x6], #-2 //src[2nt-1-row] (dec to take into account row) sxtw x11,w11 - umlal v20.8h, v9.8b, v1.8b + umlal v20.8h, v25.8b, v1.8b dup v3.4h,w11 //src[2nt-1-row] add v22.8h, v22.8h , v16.8h //add (nt) @@ -255,7 +259,7 @@ loop_sz_8_16: umlal v12.8h, v19.8b, v10.8b //(nt-1-row) * src[2nt+1+col] xtn v27.8b, v24.8h - umlal v12.8h, v8.8b, v1.8b //(col+1) * src[3nt+1] + umlal v12.8h, v17.8b, v1.8b //(col+1) * src[3nt+1] sshl v22.8h, v22.8h, v14.8h //shr umlal v12.8h, v30.8b, v3.8b //(nt-1-col) * src[2nt-1-row] @@ -268,7 +272,7 @@ loop_sz_8_16: add v5.8b, v18.8b , v7.8b //row++ [(row+1)++] sub v6.8b, v19.8b , v7.8b //[nt-1-row]-- - umlal v28.8h, v9.8b, v1.8b + 
umlal v28.8h, v25.8b, v1.8b umlal v28.8h, v31.8b, v3.8b sshl v20.8h, v20.8h, v14.8h @@ -319,13 +323,13 @@ loop_sz_8_16: add x2,x2,#16 ld1 {v10.8b, v11.8b}, [x14],#16 //load src[2nt+1+col] - ld1 {v8.8b},[x12],#8 - mov v9.8b, v8.8b - zip1 v29.8b, v8.8b, v9.8b - zip2 v9.8b, v8.8b, v9.8b - mov v8.d[0], v29.d[0] - sub v30.8b, v2.8b , v8.8b //[nt-1-col] - sub v31.8b, v2.8b , v9.8b + ld1 {v17.8b},[x12],#8 + mov v25.8b, v17.8b + zip1 v29.8b, v17.8b, v25.8b + zip2 v25.8b, v17.8b, v25.8b + mov v17.d[0], v29.d[0] + sub v30.8b, v2.8b , v17.8b //[nt-1-col] + sub v31.8b, v2.8b , v25.8b beq loop_sz_8_16 @@ -333,23 +337,23 @@ loop_sz_8_16: tf_sz_4: ld1 {v10.8b},[x14] //load src[2nt+1+col] - ld1 {v8.8b},[x12], x10 //load 8 coeffs [col+1] - mov v9.8b, v8.8b - zip1 v29.8b, v8.8b, v9.8b - zip2 v9.8b, v8.8b, v9.8b - mov v8.d[0], v29.d[0] + ld1 {v17.8b},[x12], x10 //load 8 coeffs [col+1] + mov v25.8b, v17.8b + zip1 v29.8b, v17.8b, v25.8b + zip2 v25.8b, v17.8b, v25.8b + mov v17.d[0], v29.d[0] loop_sz_4: //mov x10, #4 @reduce inc to #4 for 4x4 ldr w7, [x6], #-2 //src[2nt-1-row] (dec to take into account row) sxtw x7,w7 dup v4.4h,w7 //src[2nt-1-row] - sub v9.8b, v2.8b , v8.8b //[nt-1-col] + sub v25.8b, v2.8b , v17.8b //[nt-1-col] umull v12.8h, v5.8b, v0.8b //(row+1) * src[nt-1] umlal v12.8h, v6.8b, v10.8b //(nt-1-row) * src[2nt+1+col] - umlal v12.8h, v8.8b, v1.8b //(col+1) * src[3nt+1] - umlal v12.8h, v9.8b, v4.8b //(nt-1-col) * src[2nt-1-row] + umlal v12.8h, v17.8b, v1.8b //(col+1) * src[3nt+1] + umlal v12.8h, v25.8b, v4.8b //(nt-1-col) * src[2nt-1-row] // vadd.i16 q6, q6, q8 @add (nt) // vshl.s16 q6, q6, q7 @shr // vmovn.i16 d12, q6 @@ -364,9 +368,12 @@ loop_sz_4: bne loop_sz_4 end_loop: - // ldmfd sp!,{x4-x12,x15} //reload the registers from sp + // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d8,d14,[sp],#16 // Loading d14 using { ldr d14,[sp]; add sp,sp,#8 } is giving bus error. + // d8 is used as dummy register and loaded along with d14 using ldp. d8 is not used in the function. + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 ret diff --git a/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s b/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s index e9f83ff..5d65e63 100644 --- a/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s +++ b/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s @@ -105,7 +105,9 @@ ihevc_intra_pred_chroma_mode_11_to_17_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! stp x19, x20,[sp,#-16]! 
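 // When several pairs are pushed, the epilogue must pop them in reverse (LIFO)
 // order, as the restores later in this file do. A sketch of that discipline,
 // mirroring the stp sequence in the prologue above:
    stp d12, d13, [sp, #-16]!       // pushed first...
    stp d14, d15, [sp, #-16]!       // ...pushed second
    stp x19, x20, [sp, #-16]!       // ...pushed last
    ldp x19, x20, [sp], #16         // so it is popped first
    ldp d14, d15, [sp], #16
    ldp d12, d13, [sp], #16         // and the first push is popped last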
adrp x7, :got:gai4_ihevc_ang_table @@ -279,8 +281,8 @@ prologue_8_16_32: // mov x0, #32 movi v28.8b, #32 - sqxtn v8.8b, v22.8h - shl v8.8b, v8.8b,#1 // 2 * idx + sqxtn v19.8b, v22.8h + shl v19.8b, v19.8b,#1 // 2 * idx and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0 @@ -292,15 +294,15 @@ prologue_8_16_32: add v27.8b, v27.8b , v29.8b mov x0,#0 - add v8.8b, v8.8b , v27.8b //ref_main_idx (add row) - sub v8.8b, v8.8b , v26.8b //ref_main_idx (row 0) - add v9.8b, v8.8b , v29.8b //ref_main_idx + 1 (row 0) - tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0) + add v19.8b, v19.8b , v27.8b //ref_main_idx (add row) + sub v19.8b, v19.8b , v26.8b //ref_main_idx (row 0) + add v21.8b, v19.8b , v29.8b //ref_main_idx + 1 (row 0) + tbl v12.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 0) sub v7.8b, v28.8b , v6.8b //32-fract - tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0) - add v4.8b, v8.8b , v29.8b //ref_main_idx (row 1) - add v5.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 1) + tbl v13.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 0) + add v4.8b, v19.8b , v29.8b //ref_main_idx (row 1) + add v5.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 1) // mov x0, #4 @ 2 *(row * 2 ) movi v29.8b, #4 @@ -310,38 +312,38 @@ prologue_8_16_32: umlal v24.8h, v13.8b, v6.8b //mul (row 0) tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1) - add v8.8b, v8.8b , v29.8b //ref_main_idx (row 2) - add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 2) + add v19.8b, v19.8b , v29.8b //ref_main_idx (row 2) + add v21.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 2) rshrn v24.8b, v24.8h,#5 //round shft (row 0) - tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2) + tbl v14.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 2) umull v22.8h, v16.8b, v7.8b //mul (row 1) umlal v22.8h, v17.8b, v6.8b //mul (row 1) - tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2) + tbl v15.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 2) add v4.8b, v4.8b , v29.8b //ref_main_idx (row 3) add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 3) st1 {v24.8b},[x2], x3 //st (row 0) rshrn v22.8b, v22.8h,#5 //round shft (row 1) - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3) + tbl v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3) umull v20.8h, v14.8b, v7.8b //mul (row 2) umlal v20.8h, v15.8b, v6.8b //mul (row 2) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3) - add v8.8b, v8.8b , v29.8b //ref_main_idx (row 4) - add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 4) + tbl v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3) + add v19.8b, v19.8b , v29.8b //ref_main_idx (row 4) + add v21.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 4) st1 {v22.8b},[x2], x3 //st (row 1) rshrn v20.8b, v20.8h,#5 //round shft (row 2) - tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4) - umull v18.8h, v10.8b, v7.8b //mul (row 3) - umlal v18.8h, v11.8b, v6.8b //mul (row 3) + tbl v12.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 4) + umull v18.8h, v23.8b, v7.8b //mul (row 3) + umlal v18.8h, v25.8b, v6.8b //mul (row 3) - tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4) + tbl v13.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 4) add v4.8b, v4.8b , v29.8b //ref_main_idx (row 5) add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 5) @@ -353,32 +355,32 @@ prologue_8_16_32: 
umlal v24.8h, v13.8b, v6.8b //mul (row 4) tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5) - add v8.8b, v8.8b , v29.8b //ref_main_idx (row 6) - add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 6) + add v19.8b, v19.8b , v29.8b //ref_main_idx (row 6) + add v21.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 6) st1 {v18.8b},[x2], x3 //st (row 3) cmp x4,#4 beq end_func rshrn v24.8b, v24.8h,#5 //round shft (row 4) - tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6) + tbl v14.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 6) umull v22.8h, v16.8b, v7.8b //mul (row 5) umlal v22.8h, v17.8b, v6.8b //mul (row 5) - tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6) + tbl v15.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 6) add v4.8b, v4.8b , v29.8b //ref_main_idx (row 7) add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 7) st1 {v24.8b},[x2], x3 //st (row 4) rshrn v22.8b, v22.8h,#5 //round shft (row 5) - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) + tbl v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) umull v20.8h, v14.8b, v7.8b //mul (row 6) umlal v20.8h, v15.8b, v6.8b //mul (row 6) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + tbl v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + umlal v18.8h, v25.8b, v6.8b //mul (row 7) st1 {v22.8b},[x2], x3 //st (row 5) rshrn v20.8b, v20.8h,#5 //round shft (row 6) @@ -413,10 +415,10 @@ lbl400: ld1 {v31.8b},[x14],#8 smull v12.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col) - xtn v10.8b, v12.8h + xtn v23.8b, v12.8h sshr v12.8h, v12.8h,#5 - sqxtn v11.8b, v12.8h - shl v11.8b, v11.8b,#1 + sqxtn v25.8b, v12.8h + shl v25.8b, v25.8b,#1 orr x5,x0,x0, lsl#8 add x5, x5,#0x002 add x5, x5,#0x300 @@ -427,7 +429,7 @@ lbl400: add x9, x9, x0, lsl #1 // sub x9, x9, #1 dup v26.8b,w9 - add v8.8b, v27.8b , v11.8b //ref_main_idx (add row) + add v19.8b, v27.8b , v25.8b //ref_main_idx (add row) mov x5,x2 // sub x4,x4,#8 @@ -435,16 +437,16 @@ lbl400: kernel_8_16_32: movi v29.8b, #2 //contains #2 for adding to get ref_main_idx + 1 - sub v8.8b, v8.8b , v26.8b //ref_main_idx - mov v26.8b, v10.8b + sub v19.8b, v19.8b , v26.8b //ref_main_idx + mov v26.8b, v23.8b subs x11, x11, #8 add x6, x1, x9 - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) - add v9.8b, v29.8b , v8.8b //ref_main_idx + 1 + tbl v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) + add v21.8b, v29.8b , v19.8b //ref_main_idx + 1 umull v20.8h, v14.8b, v7.8b //mul (row 6) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) + tbl v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) umlal v20.8h, v15.8b, v6.8b //mul (row 6) add x20, x0, #8 @@ -468,15 +470,15 @@ kernel_8_16_32: ldr x14, [x14, #:got_lo12:col_for_intra_chroma] lbl452: - add v4.8b, v29.8b , v8.8b //ref_main_idx (row 1) - tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0) - add v5.8b, v29.8b , v9.8b //ref_main_idx + 1 (row 1) + add v4.8b, v29.8b , v19.8b //ref_main_idx (row 1) + tbl v12.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 0) + add v5.8b, v29.8b , v21.8b //ref_main_idx + 1 (row 1) movi v29.8b, #31 //contains #2 for adding to get ref_main_idx + 1 - umull v18.8h, v10.8b, v7.8b //mul (row 7) - tbl v13.8b, { v0.16b, v1.16b}, v9.8b 
//load from ref_main_idx + 1 (row 0) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + tbl v13.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 0) + umlal v18.8h, v25.8b, v6.8b //mul (row 7) ld1 {v31.8b},[x14],#8 and v6.8b, v29.8b , v26.8b //fract values in d1/ idx values in d0 @@ -486,9 +488,9 @@ lbl452: st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5) rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6) - add v8.8b, v29.8b , v8.8b //ref_main_idx (row 2) + add v19.8b, v29.8b , v19.8b //ref_main_idx (row 2) tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1) - add v9.8b, v29.8b , v9.8b //ref_main_idx + 1 (row 2) + add v21.8b, v29.8b , v21.8b //ref_main_idx + 1 (row 2) lsl x20, x4, #1 csel x11,x20,x11,le @@ -505,22 +507,22 @@ lbl452: rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7) add v4.8b, v4.8b , v29.8b //ref_main_idx (row 3) - tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2) + tbl v14.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 2) add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 3) umull v22.8h, v16.8b, v7.8b //mul (row 1) - tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2) + tbl v15.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 2) umlal v22.8h, v17.8b, v6.8b //mul (row 1) rshrn v24.8b, v24.8h,#5 //round shft (row 0) st1 {v18.8b},[x5], x3 //(from previous loop)st (row 7) - add v8.8b, v8.8b , v29.8b //ref_main_idx (row 4) - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3) - add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 4) + add v19.8b, v19.8b , v29.8b //ref_main_idx (row 4) + tbl v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3) + add v21.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 4) umull v20.8h, v14.8b, v7.8b //mul (row 2) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3) + tbl v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3) umlal v20.8h, v15.8b, v6.8b //mul (row 2) smull v14.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col) @@ -532,22 +534,22 @@ lbl452: rshrn v22.8b, v22.8h,#5 //round shft (row 1) add v4.8b, v4.8b , v29.8b //ref_main_idx (row 5) - tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4) + tbl v12.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 4) add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 5) - umull v18.8h, v10.8b, v7.8b //mul (row 3) - tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4) - umlal v18.8h, v11.8b, v6.8b //mul (row 3) + umull v18.8h, v23.8b, v7.8b //mul (row 3) + tbl v13.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 4) + umlal v18.8h, v25.8b, v6.8b //mul (row 3) st1 {v22.8b},[x2], x3 //st (row 1) rshrn v20.8b, v20.8h,#5 //round shft (row 2) - xtn v10.8b, v14.8h + xtn v23.8b, v14.8h sshr v14.8h, v14.8h,#5 - add v8.8b, v8.8b , v29.8b //ref_main_idx (row 6) + add v19.8b, v19.8b , v29.8b //ref_main_idx (row 6) tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5) - add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 6) + add v21.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 6) umull v24.8h, v12.8b, v7.8b //mul (row 4) tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5) @@ -557,19 +559,19 @@ lbl452: rshrn v18.8b, v18.8h,#5 //round shft (row 3) // sub x9, x9, #1 - sqxtn v11.8b, v14.8h + sqxtn v25.8b, v14.8h add v4.8b, v4.8b , v29.8b //ref_main_idx (row 7) - tbl v14.8b, { 
v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6) + tbl v14.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 6) add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 7) - shl v11.8b, v11.8b,#1 + shl v25.8b, v25.8b,#1 umull v22.8h, v16.8b, v7.8b //mul (row 5) - tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6) + tbl v15.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 6) umlal v22.8h, v17.8b, v6.8b //mul (row 5) - add v8.8b, v27.8b , v11.8b //ref_main_idx (add row) + add v19.8b, v27.8b , v25.8b //ref_main_idx (add row) dup v26.8b,w9 st1 {v18.8b},[x2], x3 //st (row 3) @@ -589,17 +591,17 @@ lbl452: bne kernel_8_16_32 epil_8_16_32: - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) + tbl v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) umull v20.8h, v14.8b, v7.8b //mul (row 6) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) + tbl v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) umlal v20.8h, v15.8b, v6.8b //mul (row 6) st1 {v24.8b},[x5], x3 //st (row 4) rshrn v24.8b, v22.8h,#5 //round shft (row 5) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + umlal v18.8h, v25.8b, v6.8b //mul (row 7) st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5) rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6) @@ -613,7 +615,8 @@ end_func: add sp, sp, #132 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 ret diff --git a/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s b/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s index 3af2da7..261c591 100644 --- a/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s +++ b/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s @@ -102,8 +102,11 @@ ihevc_intra_pred_chroma_mode_19_to_25_av8: - // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments + + stp d12,d13,[sp,#-16]! + stp d8,d14,[sp,#-16]! // Storing d14 using { sub sp,sp,#8; str d14,[sp] } is giving bus error. + // d8 is used as dummy register and stored along with d14 using stp. d8 is not used in the function. stp x19, x20,[sp,#-16]! 
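 // The renamed accumulators in the hunks below all feed the same 2-tap angular
 // interpolation, out[i] = (ref[idx]*(32 - fract) + ref[idx+1]*fract + 16) >> 5;
 // in the renamed form it reads (register roles taken from the hunk comments):
    umull v23.8h, v7.8b,  v30.8b    // acc  = ref_main_idx   * (32 - fract)
    umlal v23.8h, v19.8b, v31.8b    // acc += ref_main_idx_1 * fract
    rshrn v23.8b, v23.8h, #5        // rounding narrowing shift supplies the +16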
adrp x7, :got:gai4_ihevc_ang_table @@ -264,10 +267,10 @@ prologue: add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx] - ld1 {v8.8b},[x10],x11 //(i row)ref_main_idx + ld1 {v7.8b},[x10],x11 //(i row)ref_main_idx sbfx x9,x14,#8,#8 - ld1 {v9.8b},[x10] //(i row)ref_main_idx_1 + ld1 {v19.8b},[x10] //(i row)ref_main_idx_1 add x12,x8,x9 //(ii)*pu1_ref[ref_main_idx] sbfx x9,x14,#16,#8 @@ -275,10 +278,10 @@ prologue: add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v23.8h, v7.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v13.8b},[x12] //(ii)ref_main_idx_1 - umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract) + umlal v23.8h, v19.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract) dup v27.8b, v4.8b[2] //(iii) sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract) @@ -292,7 +295,7 @@ prologue: umlal v14.8h, v13.8b, v29.8b //(ii)vmull_u8(ref_main_idx_1, dup_const_fract) ld1 {v17.8b},[x10] //(iii)ref_main_idx_1 - rshrn v10.8b, v10.8h,#5 //(i row)shift_res = vrshrn_n_u16(add_res, 5) + rshrn v23.8b, v23.8h,#5 //(i row)shift_res = vrshrn_n_u16(add_res, 5) ld1 {v20.8b},[x12],x11 //(iv)ref_main_idx sub v26.8b, v1.8b , v27.8b //(iii)32-fract(dup_const_32_fract) @@ -306,20 +309,20 @@ prologue: umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract) // lsl x14,x14,#1 - st1 {v10.8b},[x2],#8 //(i row) + st1 {v23.8b},[x2],#8 //(i row) rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5) sbfx x9,x14,#0,#8 dup v29.8b, v4.8b[5] //(vi) add x10,x8,x9 //(v)*pu1_ref[ref_main_idx] - ld1 {v8.8b},[x10],x11 //(v)ref_main_idx + ld1 {v7.8b},[x10],x11 //(v)ref_main_idx sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract) umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract) sbfx x9,x14,#8,#8 - ld1 {v9.8b},[x10] //(v)ref_main_idx_1 + ld1 {v19.8b},[x10] //(v)ref_main_idx_1 umlal v22.8h, v21.8b, v25.8b //(iv)vmull_u8(ref_main_idx_1, dup_const_fract) st1 {v14.8b},[x0],x3 //(ii) @@ -333,10 +336,10 @@ prologue: add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v23.8h, v7.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v13.8b},[x12] //(vi)ref_main_idx_1 - umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract) + umlal v23.8h, v19.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract) st1 {v18.8b},[x0],x3 //(iii) rshrn v22.8b, v22.8h,#5 //(iv)shift_res = vrshrn_n_u16(add_res, 5) @@ -358,7 +361,7 @@ prologue: cmp x4,#8 // go to end if 4x4 beq end_loops - rshrn v10.8b, v10.8h,#5 //(v)shift_res = vrshrn_n_u16(add_res, 5) + rshrn v23.8b, v23.8h,#5 //(v)shift_res = vrshrn_n_u16(add_res, 5) ld1 {v20.8b},[x12],x11 //(viii)ref_main_idx sub v26.8b, v1.8b , v27.8b //(vii)32-fract(dup_const_32_fract) @@ -372,7 +375,7 @@ prologue: sub x20,x4,#8 csel x4, x20, x4,gt - st1 {v10.8b},[x0],x3 //(v) + st1 {v23.8b},[x0],x3 //(v) rshrn v14.8b, v14.8h,#5 //(vi)shift_res = vrshrn_n_u16(add_res, 5) beq epilogue @@ -393,14 +396,14 @@ kernel_8_rows: subs x4,x4,#8 sbfx x9,x14,#8,#8 - ld1 {v8.8b},[x10],x11 //(i)ref_main_idx + ld1 {v7.8b},[x10],x11 //(i)ref_main_idx sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract) add x20,x6,#8 //increment the row value csel x6, x20, x6,le add x12,x8,x9 //(ii)*pu1_ref[ref_main_idx] - ld1 {v9.8b},[x10] 
//(i)ref_main_idx_1 + ld1 {v19.8b},[x10] //(i)ref_main_idx_1 umull v22.8h, v20.8b, v24.8b //(viii)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v5.8b},[x6] //loads the row value @@ -417,10 +420,10 @@ kernel_8_rows: add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v23.8h, v7.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v13.8b},[x12] //(ii)ref_main_idx_1 - umlal v10.8h, v9.8b, v31.8b //(i)vmull_u8(ref_main_idx_1, dup_const_fract) + umlal v23.8h, v19.8b, v31.8b //(i)vmull_u8(ref_main_idx_1, dup_const_fract) sbfx x9,x14,#24,#8 csel x4, x5, x4,le //reload nt @@ -439,7 +442,7 @@ kernel_8_rows: umlal v14.8h, v13.8b, v29.8b //(ii)vmull_u8(ref_main_idx_1, dup_const_fract) ld1 {v17.8b},[x10] //(iii)ref_main_idx_1 - rshrn v10.8b, v10.8h,#5 //(i)shift_res = vrshrn_n_u16(add_res, 5) + rshrn v23.8b, v23.8h,#5 //(i)shift_res = vrshrn_n_u16(add_res, 5) dup v25.8b, v4.8b[3] //(iv) smull v2.8h, v5.8b, v0.8b //pos = ((row + 1) * intra_pred_ang) @@ -463,7 +466,7 @@ kernel_8_rows: add x10,x8,x9 //(v)*pu1_ref[ref_main_idx] sbfx x9,x14,#8,#8 - st1 {v10.8b},[x2],#8 //(i) + st1 {v23.8b},[x2],#8 //(i) sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract) dup v29.8b, v4.8b[5] //(vi) @@ -478,10 +481,10 @@ kernel_8_rows: dup v25.8b, v4.8b[7] //(viii) rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5) - ld1 {v8.8b},[x10],x11 //(v)ref_main_idx + ld1 {v7.8b},[x10],x11 //(v)ref_main_idx and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31)) - ld1 {v9.8b},[x10] //(v)ref_main_idx_1 + ld1 {v19.8b},[x10] //(v)ref_main_idx_1 shrn v3.8b, v2.8h,#5 //idx = pos >> 5 st1 {v14.8b},[x0],x3 //(ii) @@ -496,10 +499,10 @@ kernel_8_rows: shl v3.8b, v3.8b,#1 ld1 {v13.8b},[x12] //(vi)ref_main_idx_1 - umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v23.8h, v7.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) smov x14, v3.2s[0] //(i)extract idx to the r register - umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract) + umlal v23.8h, v19.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract) add x12,x8,x9 //(viii)*pu1_ref[ref_main_idx] csel x8, x1, x8,le //reload the source to pu1_src+2nt @@ -514,7 +517,7 @@ kernel_8_rows: umlal v14.8h, v13.8b, v29.8b //(vi)vmull_u8(ref_main_idx_1, dup_const_fract) ld1 {v20.8b},[x12],x11 //(viii)ref_main_idx - rshrn v10.8b, v10.8h,#5 //(v)shift_res = vrshrn_n_u16(add_res, 5) + rshrn v23.8b, v23.8h,#5 //(v)shift_res = vrshrn_n_u16(add_res, 5) ld1 {v21.8b},[x12] //(viii)ref_main_idx_1 sub v26.8b, v1.8b , v27.8b //(vii)32-fract(dup_const_32_fract) @@ -529,7 +532,7 @@ kernel_8_rows: st1 {v22.8b},[x0],x3 //(iv) umull v18.8h, v16.8b, v26.8b //(vii)vmull_u8(ref_main_idx, dup_const_32_fract) - st1 {v10.8b},[x0],x3 //(v) + st1 {v23.8b},[x0],x3 //(v) umlal v18.8h, v17.8b, v27.8b //(vii)vmull_u8(ref_main_idx_1, dup_const_fract) add x20,x2,x12 //increment the dst pointer to 8*dst_strd - nt @@ -563,9 +566,11 @@ core_loop_4: end_loops: add sp, sp, #132 - // ldmfd sp!,{x4-x12,x15} //reload the registers from sp + // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d8,d14,[sp],#16 // Loading d14 using { ldr d14,[sp]; add sp,sp,#8 } is giving bus error. + // d8 is used as dummy register and loaded along with d14 using ldp. d8 is not used in the function. 
+ ldp d12,d13,[sp],#16 ret diff --git a/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s b/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s index 1502ad6..66f4699 100644 --- a/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s +++ b/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s @@ -105,7 +105,9 @@ ihevc_intra_pred_luma_mode_11_to_17_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! stp x19, x20,[sp,#-16]! adrp x7, :got:gai4_ihevc_ang_table @@ -287,60 +289,60 @@ prologue_8_16_32: mov x0, #32 dup v28.8b,w0 - sqxtn v8.8b, v22.8h + sqxtn v19.8b, v22.8h and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0 mov x0, #1 dup v27.8b,w0 //row value inc or reset accordingly - add v8.8b, v8.8b , v27.8b //ref_main_idx (add row) - sub v8.8b, v8.8b , v26.8b //ref_main_idx (row 0) - add v9.8b, v8.8b , v2.8b //ref_main_idx + 1 (row 0) - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0) + add v19.8b, v19.8b , v27.8b //ref_main_idx (add row) + sub v19.8b, v19.8b , v26.8b //ref_main_idx (row 0) + add v21.8b, v19.8b , v2.8b //ref_main_idx + 1 (row 0) + tbl v12.8b, {v0.16b},v19.8b //load from ref_main_idx (row 0) sub v7.8b, v28.8b , v6.8b //32-fract - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0) - add v4.8b, v8.8b , v2.8b //ref_main_idx (row 1) - add v5.8b, v9.8b , v2.8b //ref_main_idx + 1 (row 1) + tbl v13.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 0) + add v4.8b, v19.8b , v2.8b //ref_main_idx (row 1) + add v5.8b, v21.8b , v2.8b //ref_main_idx + 1 (row 1) tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1) umull v24.8h, v12.8b, v7.8b //mul (row 0) umlal v24.8h, v13.8b, v6.8b //mul (row 0) tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 1) - add v8.8b, v8.8b , v3.8b //ref_main_idx (row 2) - add v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 2) + add v19.8b, v19.8b , v3.8b //ref_main_idx (row 2) + add v21.8b, v21.8b , v3.8b //ref_main_idx + 1 (row 2) rshrn v24.8b, v24.8h,#5 //round shft (row 0) - tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2) + tbl v14.8b, {v0.16b},v19.8b //load from ref_main_idx (row 2) umull v22.8h, v16.8b, v7.8b //mul (row 1) umlal v22.8h, v17.8b, v6.8b //mul (row 1) - tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2) + tbl v15.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 2) add v4.8b, v4.8b , v3.8b //ref_main_idx (row 3) add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 3) st1 {v24.8b},[x2], x3 //st (row 0) rshrn v22.8b, v22.8h,#5 //round shft (row 1) - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3) + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3) umull v20.8h, v14.8b, v7.8b //mul (row 2) umlal v20.8h, v15.8b, v6.8b //mul (row 2) - tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3) - add v8.8b, v8.8b , v3.8b //ref_main_idx (row 4) - add v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 4) + tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3) + add v19.8b, v19.8b , v3.8b //ref_main_idx (row 4) + add v21.8b, v21.8b , v3.8b //ref_main_idx + 1 (row 4) st1 {v22.8b},[x2], x3 //st (row 1) rshrn v20.8b, v20.8h,#5 //round shft (row 2) - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 4) - umull v18.8h, v10.8b, v7.8b //mul (row 3) - umlal v18.8h, v11.8b, v6.8b //mul (row 3) + tbl v12.8b, {v0.16b},v19.8b //load from ref_main_idx (row 4) + umull v18.8h, v23.8b, v7.8b //mul (row 3) + umlal v18.8h, v25.8b, v6.8b //mul 
(row 3) - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 4) + tbl v13.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 4) add v4.8b, v4.8b , v3.8b //ref_main_idx (row 5) add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 5) @@ -352,30 +354,30 @@ prologue_8_16_32: umlal v24.8h, v13.8b, v6.8b //mul (row 4) tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 5) - add v8.8b, v8.8b , v3.8b //ref_main_idx (row 6) - add v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 6) + add v19.8b, v19.8b , v3.8b //ref_main_idx (row 6) + add v21.8b, v21.8b , v3.8b //ref_main_idx + 1 (row 6) st1 {v18.8b},[x2], x3 //st (row 3) rshrn v24.8b, v24.8h,#5 //round shft (row 4) - tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 6) + tbl v14.8b, {v0.16b},v19.8b //load from ref_main_idx (row 6) umull v22.8h, v16.8b, v7.8b //mul (row 5) umlal v22.8h, v17.8b, v6.8b //mul (row 5) - tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 6) + tbl v15.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 6) add v4.8b, v4.8b , v3.8b //ref_main_idx (row 7) add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 7) st1 {v24.8b},[x2], x3 //st (row 4) rshrn v22.8b, v22.8h,#5 //round shft (row 5) - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) umull v20.8h, v14.8b, v7.8b //mul (row 6) umlal v20.8h, v15.8b, v6.8b //mul (row 6) - tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + umlal v18.8h, v25.8b, v6.8b //mul (row 7) st1 {v22.8b},[x2], x3 //st (row 5) rshrn v20.8b, v20.8h,#5 //round shft (row 6) @@ -410,31 +412,31 @@ lbl390: mov x5,x2 ld1 {v31.8b},[x14],#8 smull v12.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col) - xtn v10.8b, v12.8h + xtn v23.8b, v12.8h sshr v12.8h, v12.8h,#5 - sqxtn v11.8b, v12.8h + sqxtn v25.8b, v12.8h dup v27.8b,w0 //row value inc or reset accordingly ldr w9, [x8] sxtw x9,w9 add x9, x0, x9 sub x9, x9, #1 dup v26.8b,w9 - add v8.8b, v27.8b , v11.8b //ref_main_idx (add row) + add v19.8b, v27.8b , v25.8b //ref_main_idx (add row) sub x4,x4,#8 kernel_8_16_32: - sub v8.8b, v8.8b , v26.8b //ref_main_idx - mov v26.8b, v10.8b + sub v19.8b, v19.8b , v26.8b //ref_main_idx + mov v26.8b, v23.8b subs x11, x11, #8 add x6, x1, x9 - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) - add v9.8b, v2.8b , v8.8b //ref_main_idx + 1 + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) + add v21.8b, v2.8b , v19.8b //ref_main_idx + 1 umull v20.8h, v14.8b, v7.8b //mul (row 6) - tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7) + tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7) umlal v20.8h, v15.8b, v6.8b //mul (row 6) add x20, x0, #8 @@ -453,14 +455,14 @@ lbl429: csel x8, x12, x8,le dup v27.8b,w0 //row value inc or reset accordingly - add v4.8b, v2.8b , v8.8b //ref_main_idx (row 1) - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0) - add v5.8b, v2.8b , v9.8b //ref_main_idx + 1 (row 1) + add v4.8b, v2.8b , v19.8b //ref_main_idx (row 1) + tbl v12.8b, {v0.16b},v19.8b //load from ref_main_idx (row 0) + add v5.8b, v2.8b , v21.8b //ref_main_idx + 1 (row 1) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + tbl v13.8b, 
{v0.16b},v21.8b //load from ref_main_idx + 1 (row 0) + umlal v18.8h, v25.8b, v6.8b //mul (row 7) ld1 {v31.8b},[x14],#8 and v6.8b, v29.8b , v26.8b //fract values in d1/ idx values in d0 @@ -468,9 +470,9 @@ lbl429: st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5) rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6) - add v8.8b, v3.8b , v8.8b //ref_main_idx (row 2) + add v19.8b, v3.8b , v19.8b //ref_main_idx (row 2) tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1) - add v9.8b, v3.8b , v9.8b //ref_main_idx + 1 (row 2) + add v21.8b, v3.8b , v21.8b //ref_main_idx + 1 (row 2) add x20, x4, #8 csel x11, x20, x11,le @@ -486,22 +488,22 @@ lbl429: rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7) add v4.8b, v4.8b , v3.8b //ref_main_idx (row 3) - tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2) + tbl v14.8b, {v0.16b},v19.8b //load from ref_main_idx (row 2) add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 3) umull v22.8h, v16.8b, v7.8b //mul (row 1) - tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2) + tbl v15.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 2) umlal v22.8h, v17.8b, v6.8b //mul (row 1) rshrn v24.8b, v24.8h,#5 //round shft (row 0) st1 {v18.8b},[x5], x3 //(from previous loop)st (row 7) - add v8.8b, v8.8b , v3.8b //ref_main_idx (row 4) - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3) - add v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 4) + add v19.8b, v19.8b , v3.8b //ref_main_idx (row 4) + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3) + add v21.8b, v21.8b , v3.8b //ref_main_idx + 1 (row 4) umull v20.8h, v14.8b, v7.8b //mul (row 2) - tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3) + tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3) umlal v20.8h, v15.8b, v6.8b //mul (row 2) smull v14.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col) @@ -513,22 +515,22 @@ lbl429: rshrn v22.8b, v22.8h,#5 //round shft (row 1) add v4.8b, v4.8b , v3.8b //ref_main_idx (row 5) - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 4) + tbl v12.8b, {v0.16b},v19.8b //load from ref_main_idx (row 4) add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 5) - umull v18.8h, v10.8b, v7.8b //mul (row 3) - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 4) - umlal v18.8h, v11.8b, v6.8b //mul (row 3) + umull v18.8h, v23.8b, v7.8b //mul (row 3) + tbl v13.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 4) + umlal v18.8h, v25.8b, v6.8b //mul (row 3) st1 {v22.8b},[x2], x3 //st (row 1) rshrn v20.8b, v20.8h,#5 //round shft (row 2) - xtn v10.8b, v14.8h + xtn v23.8b, v14.8h sshr v14.8h, v14.8h,#5 - add v8.8b, v8.8b , v3.8b //ref_main_idx (row 6) + add v19.8b, v19.8b , v3.8b //ref_main_idx (row 6) tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 5) - add v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 6) + add v21.8b, v21.8b , v3.8b //ref_main_idx + 1 (row 6) umull v24.8h, v12.8b, v7.8b //mul (row 4) tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 5) @@ -538,17 +540,17 @@ lbl429: rshrn v18.8b, v18.8h,#5 //round shft (row 3) sub x9, x9, #1 - sqxtn v11.8b, v14.8h + sqxtn v25.8b, v14.8h add v4.8b, v4.8b , v3.8b //ref_main_idx (row 7) - tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 6) + tbl v14.8b, {v0.16b},v19.8b //load from ref_main_idx (row 6) add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 7) umull v22.8h, v16.8b, v7.8b //mul (row 5) - tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 6) + tbl v15.8b, {v0.16b},v21.8b //load from ref_main_idx 
+ 1 (row 6) umlal v22.8h, v17.8b, v6.8b //mul (row 5) - add v8.8b, v27.8b , v11.8b //ref_main_idx (add row) + add v19.8b, v27.8b , v25.8b //ref_main_idx (add row) dup v26.8b,w9 st1 {v18.8b},[x2], x3 //st (row 3) @@ -566,17 +568,17 @@ lbl429: bne kernel_8_16_32 epil_8_16_32: - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) umull v20.8h, v14.8b, v7.8b //mul (row 6) - tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7) + tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7) umlal v20.8h, v15.8b, v6.8b //mul (row 6) st1 {v24.8b},[x5], x3 //st (row 4) rshrn v24.8b, v22.8h,#5 //round shft (row 5) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + umlal v18.8h, v25.8b, v6.8b //mul (row 7) st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5) rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6) @@ -628,38 +630,38 @@ sz_4_proc: dup v28.8b,w1 sshr v22.8h, v22.8h,#5 - sqxtn v8.8b, v22.8h + sqxtn v19.8b, v22.8h and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0 sub v7.8b, v28.8b , v6.8b //32-fract - add v8.8b, v8.8b , v2.8b //ref_main_idx (add 1) - sub v8.8b, v8.8b , v26.8b //ref_main_idx - add v9.8b, v8.8b , v2.8b //ref_main_idx + 1 + add v19.8b, v19.8b , v2.8b //ref_main_idx (add 1) + sub v19.8b, v19.8b , v26.8b //ref_main_idx + add v21.8b, v19.8b , v2.8b //ref_main_idx + 1 - add v4.8b, v8.8b , v2.8b //row 1 ref_main_idx - add v5.8b, v9.8b , v2.8b + add v4.8b, v19.8b , v2.8b //row 1 ref_main_idx + add v5.8b, v21.8b , v2.8b - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0) - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0) + tbl v12.8b, {v0.16b},v19.8b //load from ref_main_idx (row 0) + tbl v13.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 0) umull v24.8h, v12.8b, v7.8b //mul (row 0) tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1) umlal v24.8h, v13.8b, v6.8b //mul (row 0) - add v8.8b, v8.8b , v3.8b //idx (row 2) + add v19.8b, v19.8b , v3.8b //idx (row 2) tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 1) - add v9.8b, v9.8b , v3.8b //idx+1 (row 2) + add v21.8b, v21.8b , v3.8b //idx+1 (row 2) umull v22.8h, v16.8b, v7.8b //mul (row 1) - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2) + tbl v12.8b, {v0.16b},v19.8b //load from ref_main_idx (row 2) umlal v22.8h, v17.8b, v6.8b //mul (row 1) rshrn v24.8b, v24.8h,#5 //round shift (row 0) add v4.8b, v4.8b , v3.8b //idx (row 3) - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2) + tbl v13.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 2) add v5.8b, v5.8b , v3.8b //idx+1 (row 3) umull v20.8h, v12.8b, v7.8b //mul (row 2) @@ -687,7 +689,8 @@ end_func: add sp, sp, #132 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 ret diff --git a/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s b/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s index fe7ac11..9b59d58 100644 --- a/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s +++ b/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s @@ -104,7 +104,10 @@ ihevc_intra_pred_luma_mode_19_to_25_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d9,d10,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! stp x19, x20,[sp,#-16]! 
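Under AAPCS64, only v8-v15 are callee-saved, and only their low 64 bits (d8-d15) have to survive a call; v0-v7 and v16-v31 are scratch. After the renames, this function still touches registers in the d9/d10, d12/d13 and d14/d15 pairs, so the prologue spills exactly those three pairs, and the epilogue at the end of this hunk pops them in the reverse (LIFO) order:

    stp     d9,  d10, [sp, #-16]!    // prologue: spill the pairs still in use
    stp     d12, d13, [sp, #-16]!
    stp     d14, d15, [sp, #-16]!
    stp     x19, x20, [sp, #-16]!
    // ... function body ...
    ldp     x19, x20, [sp], #16      // epilogue: pop in reverse order
    ldp     d14, d15, [sp], #16
    ldp     d12, d13, [sp], #16
    ldp     d9,  d10, [sp], #16
    ret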
adrp x7, :got:gai4_ihevc_ang_table @@ -267,7 +270,7 @@ prologue: add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx] - ld1 {v8.8b},[x10],x11 //(i row)ref_main_idx + ld1 {v23.8b},[x10],x11 //(i row)ref_main_idx sbfx x9,x14,#8,#8 ld1 {v9.8b},[x10] //(i row)ref_main_idx_1 @@ -278,7 +281,7 @@ prologue: add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v13.8b},[x12] //(ii)ref_main_idx_1 umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract) @@ -316,7 +319,7 @@ prologue: dup v29.8b, v4.8b[5] //(vi) add x10,x8,x9 //(v)*pu1_ref[ref_main_idx] - ld1 {v8.8b},[x10],x11 //(v)ref_main_idx + ld1 {v23.8b},[x10],x11 //(v)ref_main_idx sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract) umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract) @@ -336,7 +339,7 @@ prologue: add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v13.8b},[x12] //(vi)ref_main_idx_1 umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract) @@ -392,7 +395,7 @@ kernel_8_rows: subs x4,x4,#8 sbfx x9,x14,#8,#8 - ld1 {v8.8b},[x10],x11 //(i)ref_main_idx + ld1 {v23.8b},[x10],x11 //(i)ref_main_idx sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract) add x20,x6,#8 //increment the row value @@ -416,7 +419,7 @@ kernel_8_rows: add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v13.8b},[x12] //(ii)ref_main_idx_1 umlal v10.8h, v9.8b, v31.8b //(i)vmull_u8(ref_main_idx_1, dup_const_fract) @@ -477,7 +480,7 @@ kernel_8_rows: dup v25.8b, v4.8b[7] //(viii) rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5) - ld1 {v8.8b},[x10],x11 //(v)ref_main_idx + ld1 {v23.8b},[x10],x11 //(v)ref_main_idx and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31)) ld1 {v9.8b},[x10] //(v)ref_main_idx_1 @@ -493,7 +496,7 @@ kernel_8_rows: sub v30.8b, v1.8b , v31.8b //(v)32-fract(dup_const_32_fract) ld1 {v13.8b},[x12] //(vi)ref_main_idx_1 - umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) umov w14, v3.2s[0] //(i)extract idx to the r register sxtw x14,w14 @@ -592,7 +595,7 @@ core_loop_4: dup v7.8b,w4 //dup_const_32_fract umlal v4.8h, v3.8b, v0.8b //vmull_u8(ref_main_idx_1, dup_const_fract) - ld1 {v8.s}[0],[x10] //ref_main_idx + ld1 {v23.s}[0],[x10] //ref_main_idx add x8,x8,#1 ld1 {v9.s}[0],[x11] //ref_main_idx_1 @@ -607,7 +610,7 @@ core_loop_4: add x11,x10,#1 //pu1_ref_main_idx_1 += 1 dup v12.8b,w5 //dup_const_fract - umull v10.8h, v8.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract) sub x20,x5,#32 neg x4, x20 @@ -655,7 +658,9 @@ end_loops: add sp, sp, #132 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d9,d10,[sp],#16 ret diff --git a/common/arm64/ihevc_intra_pred_luma_dc.s 
b/common/arm64/ihevc_intra_pred_luma_dc.s index 7683266..e4fdb5d 100644 --- a/common/arm64/ihevc_intra_pred_luma_dc.s +++ b/common/arm64/ihevc_intra_pred_luma_dc.s @@ -104,7 +104,7 @@ ihevc_intra_pred_luma_dc_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! @@ -128,14 +128,14 @@ ihevc_intra_pred_luma_dc_av8: add x8, x7, #1 //&src[2nt+1] mvn x5, x5 add x5, x5, #1 - dup v8.2s,w5 + dup v7.2s,w5 ldrb w14, [x8] sxtw x14,w14 - shl d8, d8,#32 + shl d7, d7,#32 sub x9, x7, #1 //&src[2nt-1] - sshr d8, d8,#32 + sshr d7, d7,#32 mov x7, x8 //x7 also stores 2nt+1 @@ -192,7 +192,7 @@ core_loop_add: epil_add_loop: - sshl d9, d6, d8 //(dc_val) shr by log2nt+1 + sshl d18, d6, d7 //(dc_val) shr by log2nt+1 cmp x4, #32 mov v28.s[0], w14 @@ -200,25 +200,25 @@ epil_add_loop: mov x20,#128 csel x6, x20, x6,eq - dup v16.8b, v9.8b[0] //dc_val - shl d13, d9,#1 //2*dc + dup v16.8b, v18.8b[0] //dc_val + shl d25, d18,#1 //2*dc beq prologue_cpy_32 - add d14, d13 , d28 //src[2nt+1]+2+src[2nt-1]+2dc_val + add d27, d25 , d28 //src[2nt+1]+2+src[2nt-1]+2dc_val mov x20,#0 csel x6, x20, x6,ne //nt - ushr v15.4h, v14.4h,#2 //final dst[0]'s value in d15[0] + ushr v29.4h, v27.4h,#2 //final dst[0]'s value in d15[0] csel x10, x4, x10,ne - add d11, d13 , d9 //3*dc + add d23, d25 , d18 //3*dc sub x12, x3, x3, lsl #3 //-7*strd - add d11, d11 , d17 //3*dc + 2 + add d23, d23 , d17 //3*dc + 2 add x12, x12, #8 //offset after one 8x8 block (-7*strd + 8) - dup v24.8h, v11.4h[0] //3*dc + 2 (moved to all lanes) + dup v24.8h, v23.4h[0] //3*dc + 2 (moved to all lanes) sub x0, x3, x4 //strd - nt prologue_col: @@ -248,7 +248,7 @@ prologue_col: movi d19, #0x00000000000000ff // sqshrun v3.8b, v22.8h,#2 //rows shx2 movn (prol) - bsl v19.8b, v15.8b , v2.8b //first row with dst[0] + bsl v19.8b, v29.8b , v2.8b //first row with dst[0] add v26.8h, v26.8h , v24.8h //col 8::15 add 3dc+2 (prol extra) rev64 v3.8b, v3.8b @@ -445,23 +445,23 @@ dc_4: mov v28.s[1], w5 //src[2nt+1]+2+src[2nt-1] moved to d28 add d6, d6 , d5 //accumulate all inp into d6 (end for nt==8) - sshl d9, d6, d8 //(dc_val) shr by log2nt+1 + sshl d18, d6, d7 //(dc_val) shr by log2nt+1 mov x8, x7 //&src[2nt+1] - shl d13, d9,#1 //2*dc + shl d25, d18,#1 //2*dc sub x9, x9, #3 //&src[2nt-1-row] - dup v16.8b, v9.8b[0] //dc_val - add d14, d13 , d28 //src[2nt+1]+2+src[2nt-1]+2dc_val + dup v16.8b, v18.8b[0] //dc_val + add d27, d25 , d28 //src[2nt+1]+2+src[2nt-1]+2dc_val - ushr v15.4h, v14.4h,#2 //final dst[0]'s value in d15[0] + ushr v29.4h, v27.4h,#2 //final dst[0]'s value in d15[0] sub x12, x3, x3, lsl #2 //-3*strd - add d11, d13 , d9 //3*dc + add d23, d25 , d18 //3*dc - add d11, d11 , d17 //3*dc + 2 + add d23, d23 , d17 //3*dc + 2 add x12, x12, #4 //offset after one 4x4 block (-3*strd + 4) - dup v24.8h, v11.4h[0] //3*dc + 2 (moved to all lanes) + dup v24.8h, v23.4h[0] //3*dc + 2 (moved to all lanes) sub x0, x3, x4 //strd - nt @@ -482,7 +482,7 @@ dc_4: sqshrun v3.8b, v22.8h,#2 //rows shx2 movn (prol) - bsl v19.8b, v15.8b , v2.8b //first row with dst[0] + bsl v19.8b, v29.8b , v2.8b //first row with dst[0] rev64 v3.8b, v3.8b @@ -510,7 +510,7 @@ epilogue_end: end_func: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_intra_pred_luma_horz.s b/common/arm64/ihevc_intra_pred_luma_horz.s index 551fd77..95452e4 100644 --- a/common/arm64/ihevc_intra_pred_luma_horz.s +++ b/common/arm64/ihevc_intra_pred_luma_horz.s @@ -97,7 +97,7 @@ ihevc_intra_pred_luma_horz_av8: 
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! //ldr x5,[sp,#44] @loads mode @@ -126,7 +126,7 @@ core_loop_32: st1 { v2.16b},[x2],x3 //store in 1st row 0-16 columns st1 { v2.16b},[x9],x3 //store in 1st row 16-32 columns - dup v8.16b, v0.16b[12] + dup v1.16b, v0.16b[12] st1 { v4.16b},[x2],x3 st1 { v4.16b},[x9],x3 @@ -135,14 +135,14 @@ core_loop_32: st1 { v6.16b},[x9],x3 dup v4.16b, v0.16b[10] - st1 { v8.16b},[x2],x3 - st1 { v8.16b},[x9],x3 + st1 { v1.16b},[x2],x3 + st1 { v1.16b},[x9],x3 dup v6.16b, v0.16b[9] st1 { v2.16b},[x2],x3 st1 { v2.16b},[x9],x3 - dup v8.16b, v0.16b[8] + dup v1.16b, v0.16b[8] st1 { v4.16b},[x2],x3 st1 { v4.16b},[x9],x3 @@ -151,14 +151,14 @@ core_loop_32: st1 { v6.16b},[x9],x3 dup v4.16b, v0.8b[6] - st1 { v8.16b},[x2],x3 - st1 { v8.16b},[x9],x3 + st1 { v1.16b},[x2],x3 + st1 { v1.16b},[x9],x3 dup v6.16b, v0.8b[5] st1 { v2.16b},[x2],x3 st1 { v2.16b},[x9],x3 - dup v8.16b, v0.8b[4] + dup v1.16b, v0.8b[4] st1 { v4.16b},[x2],x3 st1 { v4.16b},[x9],x3 @@ -167,15 +167,15 @@ core_loop_32: st1 { v6.16b},[x9],x3 dup v4.16b, v0.8b[2] - st1 { v8.16b},[x2],x3 - st1 { v8.16b},[x9],x3 + st1 { v1.16b},[x2],x3 + st1 { v1.16b},[x9],x3 dup v6.16b, v0.8b[1] st1 { v2.16b},[x2],x3 st1 { v2.16b},[x9],x3 sub x12,x12,#16 //move to 16th value pointer - dup v8.16b, v0.8b[0] + dup v1.16b, v0.8b[0] st1 { v4.16b},[x2],x3 st1 { v4.16b},[x9],x3 @@ -183,12 +183,12 @@ core_loop_32: st1 { v6.16b},[x2],x3 st1 { v6.16b},[x9],x3 - st1 { v8.16b},[x2],x3 - st1 { v8.16b},[x9],x3 + st1 { v1.16b},[x2],x3 + st1 { v1.16b},[x9],x3 bgt core_loop_32 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret b end_func @@ -214,18 +214,18 @@ core_loop_16: dup v6.16b, v0.16b[12] sqadd v22.8h, v26.8h , v24.8h - dup v8.16b, v0.16b[11] + dup v1.16b, v0.16b[11] sqxtun v22.8b, v22.8h st1 {v22.8b},[x2],#8 - dup v10.16b, v0.16b[10] + dup v18.16b, v0.16b[10] usubl v24.8h, v31.8b, v28.8b - dup v12.16b, v0.16b[9] + dup v19.16b, v0.16b[9] sshr v24.8h, v24.8h,#1 - dup v14.16b, v0.16b[8] + dup v20.16b, v0.16b[8] sqadd v22.8h, v26.8h , v24.8h dup v16.16b, v0.8b[7] @@ -238,37 +238,37 @@ core_loop_16: st1 { v4.16b},[x2],x3 st1 { v6.16b},[x2],x3 - st1 { v8.16b},[x2],x3 + st1 { v1.16b},[x2],x3 dup v2.16b, v0.8b[6] - st1 { v10.16b},[x2],x3 + st1 { v18.16b},[x2],x3 dup v4.16b, v0.8b[5] - st1 { v12.16b},[x2],x3 + st1 { v19.16b},[x2],x3 dup v6.16b, v0.8b[4] - st1 { v14.16b},[x2],x3 + st1 { v20.16b},[x2],x3 - dup v8.16b, v0.8b[3] + dup v1.16b, v0.8b[3] st1 { v16.16b},[x2],x3 - dup v10.16b, v0.8b[2] + dup v18.16b, v0.8b[2] st1 { v2.16b},[x2],x3 - dup v12.16b, v0.8b[1] + dup v19.16b, v0.8b[1] st1 { v4.16b},[x2],x3 - dup v14.16b, v0.8b[0] + dup v20.16b, v0.8b[0] st1 { v6.16b},[x2],x3 - st1 { v8.16b},[x2],x3 - st1 { v10.16b},[x2],x3 - st1 { v12.16b},[x2],x3 - st1 { v14.16b},[x2],x3 + st1 { v1.16b},[x2],x3 + st1 { v18.16b},[x2],x3 + st1 { v19.16b},[x2],x3 + st1 { v20.16b},[x2],x3 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret b end_func @@ -302,19 +302,19 @@ core_loop_8: st1 {v22.8b},[x2],x3 st1 {v3.8b},[x2],x3 - dup v8.8b, v0.8b[1] + dup v1.8b, v0.8b[1] st1 {v4.8b},[x2],x3 st1 {v5.8b},[x2],x3 - dup v9.8b, v0.8b[0] + dup v17.8b, v0.8b[0] st1 {v6.8b},[x2],x3 st1 {v7.8b},[x2],x3 - st1 {v8.8b},[x2],x3 - st1 {v9.8b},[x2],x3 + st1 {v1.8b},[x2],x3 + st1 {v17.8b},[x2],x3 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret b end_func @@ -349,7 +349,7 @@ 
core_loop_4: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret end_func: diff --git a/common/arm64/ihevc_intra_pred_luma_mode2.s b/common/arm64/ihevc_intra_pred_luma_mode2.s index 5d7a3c5..598ce5a 100644 --- a/common/arm64/ihevc_intra_pred_luma_mode2.s +++ b/common/arm64/ihevc_intra_pred_luma_mode2.s @@ -105,7 +105,7 @@ ihevc_intra_pred_luma_mode2_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! mov x8,#-2 @@ -138,20 +138,20 @@ prologue_cpy_32: ld1 {v7.8b},[x10],x8 add x7,x6,x3 - rev64 v8.8b, v0.8b - rev64 v9.8b, v1.8b + rev64 v16.8b, v0.8b + rev64 v17.8b, v1.8b lsl x5, x3, #2 - rev64 v10.8b, v2.8b - rev64 v11.8b, v3.8b + rev64 v18.8b, v2.8b + rev64 v19.8b, v3.8b add x9,x7,x3 - rev64 v12.8b, v4.8b + rev64 v20.8b, v4.8b subs x1,x1,#8 - rev64 v13.8b, v5.8b - rev64 v14.8b, v6.8b - rev64 v15.8b, v7.8b + rev64 v21.8b, v5.8b + rev64 v22.8b, v6.8b + rev64 v23.8b, v7.8b add x14,x9,x3 beq epilogue_mode2 @@ -160,24 +160,24 @@ prologue_cpy_32: kernel_mode2: - st1 {v8.8b},[x6],x5 - st1 {v9.8b},[x7],x5 + st1 {v16.8b},[x6],x5 + st1 {v17.8b},[x7],x5 subs x11,x11,#8 - st1 {v10.8b},[x9],x5 + st1 {v18.8b},[x9],x5 add x20,x2,#8 csel x2, x20, x2,gt - st1 {v11.8b},[x14],x5 - st1 {v12.8b},[x6],x5 + st1 {v19.8b},[x14],x5 + st1 {v20.8b},[x6],x5 csel x11, x4, x11,le - st1 {v13.8b},[x7],x5 - st1 {v14.8b},[x9],x5 + st1 {v21.8b},[x7],x5 + st1 {v22.8b},[x9],x5 add x20, x2, x3, lsl #2 csel x2, x20, x2,le - st1 {v15.8b},[x14],x5 + st1 {v23.8b},[x14],x5 ld1 {v0.8b},[x0],x8 sub x14,x4,#8 @@ -201,42 +201,42 @@ kernel_mode2: add x20, x0, x4 csel x0, x20, x0,le - rev64 v8.8b, v0.8b + rev64 v16.8b, v0.8b add x7, x6, x3 - rev64 v9.8b, v1.8b + rev64 v17.8b, v1.8b sub x20, x0, #8 csel x0, x20, x0,le - rev64 v10.8b, v2.8b + rev64 v18.8b, v2.8b csel x12, x4, x12,le - rev64 v11.8b, v3.8b + rev64 v19.8b, v3.8b add x9, x7, x3 - rev64 v12.8b, v4.8b + rev64 v20.8b, v4.8b add x10,x0,#-1 - rev64 v13.8b, v5.8b + rev64 v21.8b, v5.8b subs x1, x1, #8 - rev64 v14.8b, v6.8b + rev64 v22.8b, v6.8b add x14, x9, x3 - rev64 v15.8b, v7.8b + rev64 v23.8b, v7.8b bne kernel_mode2 epilogue_mode2: - st1 {v8.8b},[x6],x5 - st1 {v9.8b},[x7],x5 - st1 {v10.8b},[x9],x5 - st1 {v11.8b},[x14],x5 - st1 {v12.8b},[x6],x5 - st1 {v13.8b},[x7],x5 - st1 {v14.8b},[x9],x5 - st1 {v15.8b},[x14],x5 + st1 {v16.8b},[x6],x5 + st1 {v17.8b},[x7],x5 + st1 {v18.8b},[x9],x5 + st1 {v19.8b},[x14],x5 + st1 {v20.8b},[x6],x5 + st1 {v21.8b},[x7],x5 + st1 {v22.8b},[x9],x5 + st1 {v23.8b},[x14],x5 b end_func @@ -269,7 +269,7 @@ mode2_4: end_func: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s b/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s index 79964f7..58b2d37 100644 --- a/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s +++ b/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s @@ -100,7 +100,10 @@ ihevc_intra_pred_luma_mode_27_to_33_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d9,d10,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! stp x19, x20,[sp,#-16]! 
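Every angular kernel in these files evaluates the same two-tap interpolation, dst[col] = (ref[idx]*(32 - fract) + ref[idx+1]*fract + 16) >> 5, and the renames only change which vector registers carry the operands. A distilled single-row sketch using this function's (i)-slot registers (addressing simplified; the real code software-pipelines eight such rows):

    dup     v31.8b, v4.8b[0]         // fract, replicated across lanes
    sub     v30.8b, v1.8b, v31.8b    // 32 - fract   (v1 holds the constant 32)
    ld1     {v23.8b}, [x10], x11     // ref_main[idx]      (v8 before this patch)
    ld1     {v9.8b},  [x10]          // ref_main[idx + 1]
    umull   v10.8h, v23.8b, v30.8b   // ref[idx]     * (32 - fract)
    umlal   v10.8h, v9.8b,  v31.8b   // + ref[idx+1] * fract
    rshrn   v10.8b, v10.8h, #5       // (sum + 16) >> 5, narrow back to 8 bit

Note that v9 and v10 remain in the kernel, which is why d9 and d10 must still be spilled above; only v8, replaced by the caller-saved v23, drops out of the prologue.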
adrp x6, :got:gai4_ihevc_ang_table //loads word32 gai4_ihevc_ang_table[35] @@ -156,7 +159,7 @@ prologue: add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx] asr x14,x14,#8 //(ii)shift by 8 - ld1 {v8.8b},[x10],x11 //(i row)ref_main_idx + ld1 {v23.8b},[x10],x11 //(i row)ref_main_idx and x9,x14,#0xff //(ii)get the last byte asr x14,x14,#8 //(iii) @@ -168,7 +171,7 @@ prologue: add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v13.8b},[x12] //(ii)ref_main_idx_1 umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract) @@ -207,7 +210,7 @@ prologue: dup v29.8b, v4.8b[5] //(vi) add x10,x8,x9 //(v)*pu1_ref[ref_main_idx] - ld1 {v8.8b},[x10],x11 //(v)ref_main_idx + ld1 {v23.8b},[x10],x11 //(v)ref_main_idx sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract) asr x14,x14,#8 //(vi) @@ -229,7 +232,7 @@ prologue: add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v13.8b},[x12] //(vi)ref_main_idx_1 umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract) @@ -286,7 +289,7 @@ kernel_8_rows: dup v31.8b, v4.8b[0] subs x4,x4,#8 - ld1 {v8.8b},[x10],x11 //(i)ref_main_idx + ld1 {v23.8b},[x10],x11 //(i)ref_main_idx sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract) and x9,x14,#0xff //(ii) add x20,x6,#8 //increment the row value @@ -309,7 +312,7 @@ kernel_8_rows: add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract) asr x14,x14,#8 //(iv) ld1 {v13.8b},[x12] //(ii)ref_main_idx_1 @@ -368,7 +371,7 @@ kernel_8_rows: rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5) asr x14,x14,#8 //(vii) - ld1 {v8.8b},[x10],x11 //(v)ref_main_idx + ld1 {v23.8b},[x10],x11 //(v)ref_main_idx and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31)) and x9,x14,#0xff //(vii) @@ -385,7 +388,7 @@ kernel_8_rows: and x9,x14,#0xff //(viii) ld1 {v13.8b},[x12] //(vi)ref_main_idx_1 - umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) umov w14, v3.2s[0] //(i)extract idx to the r register sxtw x14,w14 @@ -484,7 +487,7 @@ core_loop_4: dup v7.8b,w4 //dup_const_32_fract umlal v4.8h, v3.8b, v0.8b //vmull_u8(ref_main_idx_1, dup_const_fract) - ld1 {v8.s}[0],[x10] //ref_main_idx + ld1 {v23.s}[0],[x10] //ref_main_idx add x8,x8,#1 ld1 {v9.s}[0],[x11] //ref_main_idx_1 @@ -500,7 +503,7 @@ core_loop_4: add x11,x10,#1 //pu1_ref_main_idx_1 += 1 dup v12.8b,w5 //dup_const_fract - umull v10.8h, v8.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract) sub x20,x5,#32 neg x4, x20 @@ -548,7 +551,9 @@ core_loop_4: end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d9,d10,[sp],#16 ret diff --git a/common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s b/common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s index b6e8601..56d2f6b 100644 --- 
a/common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s +++ b/common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s @@ -106,7 +106,9 @@ ihevc_intra_pred_luma_mode_3_to_9_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! stp x19, x20,[sp,#-16]! adrp x7, :got:gai4_ihevc_ang_table @@ -165,7 +167,7 @@ prologue_8_16_32: movi v28.8b, #32 - sqxtn v8.8b, v22.8h + sqxtn v1.8b, v22.8h and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0 @@ -173,54 +175,54 @@ prologue_8_16_32: movi v27.8b, #7 //row 0 to 7 - sub v8.8b, v8.8b , v2.8b //ref_main_idx (sub row) - sub v8.8b, v26.8b , v8.8b //ref_main_idx (row 0) - add v8.8b, v8.8b , v27.8b //t0 compensate the pu1_src idx incremented by 8 - sub v9.8b, v8.8b , v2.8b //ref_main_idx + 1 (row 0) - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0) + sub v1.8b, v1.8b , v2.8b //ref_main_idx (sub row) + sub v1.8b, v26.8b , v1.8b //ref_main_idx (row 0) + add v1.8b, v1.8b , v27.8b //t0 compensate the pu1_src idx incremented by 8 + sub v19.8b, v1.8b , v2.8b //ref_main_idx + 1 (row 0) + tbl v12.8b, {v0.16b},v1.8b //load from ref_main_idx (row 0) sub v7.8b, v28.8b , v6.8b //32-fract - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0) - sub v4.8b, v8.8b , v2.8b //ref_main_idx (row 1) - sub v5.8b, v9.8b , v2.8b //ref_main_idx + 1 (row 1) + tbl v13.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 0) + sub v4.8b, v1.8b , v2.8b //ref_main_idx (row 1) + sub v5.8b, v19.8b , v2.8b //ref_main_idx + 1 (row 1) tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1) umull v24.8h, v12.8b, v7.8b //mul (row 0) umlal v24.8h, v13.8b, v6.8b //mul (row 0) tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 1) - sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 2) - sub v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 2) + sub v1.8b, v1.8b , v3.8b //ref_main_idx (row 2) + sub v19.8b, v19.8b , v3.8b //ref_main_idx + 1 (row 2) rshrn v24.8b, v24.8h,#5 //round shft (row 0) - tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2) + tbl v14.8b, {v0.16b},v1.8b //load from ref_main_idx (row 2) umull v22.8h, v16.8b, v7.8b //mul (row 1) umlal v22.8h, v17.8b, v6.8b //mul (row 1) - tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2) + tbl v15.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 2) sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 3) sub v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 3) st1 {v24.8b},[x2], x3 //st (row 0) rshrn v22.8b, v22.8h,#5 //round shft (row 1) - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3) + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3) umull v20.8h, v14.8b, v7.8b //mul (row 2) umlal v20.8h, v15.8b, v6.8b //mul (row 2) - tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3) - sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 4) - sub v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 4) + tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3) + sub v1.8b, v1.8b , v3.8b //ref_main_idx (row 4) + sub v19.8b, v19.8b , v3.8b //ref_main_idx + 1 (row 4) st1 {v22.8b},[x2], x3 //st (row 1) rshrn v20.8b, v20.8h,#5 //round shft (row 2) - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 4) - umull v18.8h, v10.8b, v7.8b //mul (row 3) - umlal v18.8h, v11.8b, v6.8b //mul (row 3) + tbl v12.8b, {v0.16b},v1.8b //load from ref_main_idx (row 4) + umull v18.8h, v23.8b, v7.8b //mul (row 3) + umlal v18.8h, v25.8b, v6.8b //mul (row 3) - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 4) + tbl 
v13.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 4) sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 5) sub v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 5) @@ -232,30 +234,30 @@ prologue_8_16_32: umlal v24.8h, v13.8b, v6.8b //mul (row 4) tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 5) - sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 6) - sub v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 6) + sub v1.8b, v1.8b , v3.8b //ref_main_idx (row 6) + sub v19.8b, v19.8b , v3.8b //ref_main_idx + 1 (row 6) st1 {v18.8b},[x2], x3 //st (row 3) rshrn v24.8b, v24.8h,#5 //round shft (row 4) - tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 6) + tbl v14.8b, {v0.16b},v1.8b //load from ref_main_idx (row 6) umull v22.8h, v16.8b, v7.8b //mul (row 5) umlal v22.8h, v17.8b, v6.8b //mul (row 5) - tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 6) + tbl v15.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 6) sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 7) sub v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 7) st1 {v24.8b},[x2], x3 //st (row 4) rshrn v22.8b, v22.8h,#5 //round shft (row 5) - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) umull v20.8h, v14.8b, v7.8b //mul (row 6) umlal v20.8h, v15.8b, v6.8b //mul (row 6) - tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + umlal v18.8h, v25.8b, v6.8b //mul (row 7) st1 {v22.8b},[x2], x3 //st (row 5) rshrn v20.8b, v20.8h,#5 //round shft (row 6) @@ -290,9 +292,9 @@ lbl284: mov x5,x2 ld1 {v31.8b},[x14],#8 smull v12.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col) - xtn v10.8b, v12.8h + xtn v23.8b, v12.8h sshr v12.8h, v12.8h,#5 - sqxtn v11.8b, v12.8h + sqxtn v25.8b, v12.8h ldr w9, [x8] sxtw x9,w9 add x9, x0, x9 @@ -304,19 +306,19 @@ lbl284: kernel_8_16_32: - sub v8.8b, v26.8b , v11.8b //ref_main_idx - mov v26.8b, v10.8b + sub v1.8b, v26.8b , v25.8b //ref_main_idx + mov v26.8b, v23.8b subs x11, x11, #8 sub x6, x1, x9 - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) - add v8.8b, v8.8b , v16.8b //to compensate the pu1_src idx incremented by 8 + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) + add v1.8b, v1.8b , v16.8b //to compensate the pu1_src idx incremented by 8 umull v20.8h, v14.8b, v7.8b //mul (row 6) - tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx - 1 (row 7) + tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx - 1 (row 7) umlal v20.8h, v15.8b, v6.8b //mul (row 6) - sub v9.8b, v8.8b , v2.8b //ref_main_idx - 1 + sub v19.8b, v1.8b , v2.8b //ref_main_idx - 1 add x20, x0, #8 csel x0, x20, x0,le add x20, x8, #4 @@ -333,14 +335,14 @@ lbl323: csel x8, x12, x8,le dup v27.8b,w0 //row value inc or reset accordingly - sub v4.8b, v8.8b , v2.8b //ref_main_idx (row 1) - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0) - sub v5.8b, v9.8b , v2.8b //ref_main_idx - 1 (row 1) + sub v4.8b, v1.8b , v2.8b //ref_main_idx (row 1) + tbl v12.8b, {v0.16b},v1.8b //load from ref_main_idx (row 0) + sub v5.8b, v19.8b , v2.8b //ref_main_idx - 1 (row 1) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + tbl v13.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 0) + umlal v18.8h, 
v25.8b, v6.8b //mul (row 7) ld1 {v31.8b},[x14],#8 and v6.8b, v29.8b , v26.8b //fract values in d1/ idx values in d0 @@ -348,9 +350,9 @@ lbl323: st1 {v22.8b},[x5], x3 //(from previous loop)st (row 5) rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6) - sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 2) - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1) - sub v9.8b, v9.8b , v3.8b //ref_main_idx - 1 (row 2) + sub v1.8b, v1.8b , v3.8b //ref_main_idx (row 2) + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1) + sub v19.8b, v19.8b , v3.8b //ref_main_idx - 1 (row 2) add x20, x4, #8 csel x11, x20, x11,le @@ -366,22 +368,22 @@ lbl323: rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7) sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 3) - tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2) + tbl v14.8b, {v0.16b},v1.8b //load from ref_main_idx (row 2) sub v5.8b, v5.8b , v3.8b //ref_main_idx - 1 (row 3) - umull v22.8h, v10.8b, v7.8b //mul (row 1) - tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2) + umull v22.8h, v23.8b, v7.8b //mul (row 1) + tbl v15.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 2) umlal v22.8h, v17.8b, v6.8b //mul (row 1) rshrn v24.8b, v24.8h,#5 //round shft (row 0) st1 {v18.8b},[x5], x3 //(from previous loop)st (row 7) - sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 4) - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3) - sub v9.8b, v9.8b , v3.8b //ref_main_idx - 1 (row 4) + sub v1.8b, v1.8b , v3.8b //ref_main_idx (row 4) + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3) + sub v19.8b, v19.8b , v3.8b //ref_main_idx - 1 (row 4) umull v20.8h, v14.8b, v7.8b //mul (row 2) - tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3) + tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3) umlal v20.8h, v15.8b, v6.8b //mul (row 2) smull v14.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col) @@ -392,22 +394,22 @@ lbl323: rshrn v22.8b, v22.8h,#5 //round shft (row 1) sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 5) - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 4) + tbl v12.8b, {v0.16b},v1.8b //load from ref_main_idx (row 4) sub v5.8b, v5.8b , v3.8b //ref_main_idx - 1 (row 5) - umull v18.8h, v10.8b, v7.8b //mul (row 3) - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 4) - umlal v18.8h, v11.8b, v6.8b //mul (row 3) + umull v18.8h, v23.8b, v7.8b //mul (row 3) + tbl v13.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 4) + umlal v18.8h, v25.8b, v6.8b //mul (row 3) st1 {v22.8b},[x2], x3 //st (row 1) rshrn v20.8b, v20.8h,#5 //round shft (row 2) - xtn v10.8b, v14.8h + xtn v23.8b, v14.8h sshr v14.8h, v14.8h,#5 - sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 6) + sub v1.8b, v1.8b , v3.8b //ref_main_idx (row 6) tbl v21.8b, {v0.16b},v4.8b //load from ref_main_idx (row 5) - sub v9.8b, v9.8b , v3.8b //ref_main_idx - 1 (row 6) + sub v19.8b, v19.8b , v3.8b //ref_main_idx - 1 (row 6) umull v24.8h, v12.8b, v7.8b //mul (row 4) tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 5) @@ -417,24 +419,24 @@ lbl323: rshrn v18.8b, v18.8h,#5 //round shft (row 3) sub x9, x9, #1 - sqxtn v11.8b, v14.8h + sqxtn v25.8b, v14.8h sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 7) - tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 6) + tbl v14.8b, {v0.16b},v1.8b //load from ref_main_idx (row 6) sub v5.8b, v5.8b , v3.8b //ref_main_idx - 1 (row 7) umull v22.8h, v21.8b, v7.8b //mul (row 5) - tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 6) + tbl v15.8b, 
{v0.16b},v19.8b //load from ref_main_idx + 1 (row 6) umlal v22.8h, v17.8b, v6.8b //mul (row 5) - add v11.8b, v27.8b , v11.8b //ref_main_idx (add row) + add v25.8b, v27.8b , v25.8b //ref_main_idx (add row) dup v26.8b,w9 st1 {v18.8b},[x2], x3 //st (row 3) rshrn v24.8b, v24.8h,#5 //round shft (row 4) add x2, x2, x3, lsl #2 - sub v11.8b, v11.8b , v2.8b //ref_main_idx -1 (sub 1) + sub v25.8b, v25.8b , v2.8b //ref_main_idx -1 (sub 1) add x20, x7, x2 csel x2, x20, x2,gt @@ -446,17 +448,17 @@ lbl323: bne kernel_8_16_32 epil_8_16_32: - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) umull v20.8h, v14.8b, v7.8b //mul (row 6) - tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7) + tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7) umlal v20.8h, v15.8b, v6.8b //mul (row 6) st1 {v24.8b},[x5], x3 //st (row 4) rshrn v24.8b, v22.8h,#5 //round shft (row 5) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + umlal v18.8h, v25.8b, v6.8b //mul (row 7) st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5) rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6) @@ -499,40 +501,40 @@ sz_4_proc: movi v28.8b, #32 sshr v22.8h, v22.8h,#5 - sqxtn v8.8b, v22.8h + sqxtn v1.8b, v22.8h and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0 sub v7.8b, v28.8b , v6.8b //32-fract movi v27.8b, #7 //row 0 to 7(row-1) - sub v8.8b, v8.8b , v2.8b //ref_main_idx (add 1) - sub v8.8b, v26.8b , v8.8b //ref_main_idx - add v8.8b, v8.8b , v27.8b //t0 compensate the pu1_src idx incremented by 8 - sub v9.8b, v8.8b , v2.8b //ref_main_idx - 1 + sub v1.8b, v1.8b , v2.8b //ref_main_idx (add 1) + sub v1.8b, v26.8b , v1.8b //ref_main_idx + add v1.8b, v1.8b , v27.8b //t0 compensate the pu1_src idx incremented by 8 + sub v19.8b, v1.8b , v2.8b //ref_main_idx - 1 - sub v4.8b, v8.8b , v2.8b //row 1 ref_main_idx - sub v5.8b, v9.8b , v2.8b + sub v4.8b, v1.8b , v2.8b //row 1 ref_main_idx + sub v5.8b, v19.8b , v2.8b - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0) - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0) + tbl v12.8b, {v0.16b},v1.8b //load from ref_main_idx (row 0) + tbl v13.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 0) umull v24.8h, v12.8b, v7.8b //mul (row 0) tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1) umlal v24.8h, v13.8b, v6.8b //mul (row 0) - sub v8.8b, v8.8b , v3.8b //idx (row 2) + sub v1.8b, v1.8b , v3.8b //idx (row 2) tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 1) - sub v9.8b, v9.8b , v3.8b //idx+1 (row 2) + sub v19.8b, v19.8b , v3.8b //idx+1 (row 2) umull v22.8h, v16.8b, v7.8b //mul (row 1) - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2) + tbl v12.8b, {v0.16b},v1.8b //load from ref_main_idx (row 2) umlal v22.8h, v17.8b, v6.8b //mul (row 1) rshrn v24.8b, v24.8h,#5 //round shift (row 0) sub v4.8b, v4.8b , v3.8b //idx (row 3) - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2) + tbl v13.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 2) sub v5.8b, v5.8b , v3.8b //idx+1 (row 3) umull v20.8h, v12.8b, v7.8b //mul (row 2) @@ -559,7 +561,8 @@ sz_4_proc: end_func: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 ret diff --git a/common/arm64/ihevc_intra_pred_luma_planar.s b/common/arm64/ihevc_intra_pred_luma_planar.s index d2f27a2..ba04f42 100644 --- 
a/common/arm64/ihevc_intra_pred_luma_planar.s +++ b/common/arm64/ihevc_intra_pred_luma_planar.s @@ -107,7 +107,7 @@ ihevc_intra_pred_luma_planar_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! adrp x11, :got:gau1_ihevc_planar_factor //loads table of coeffs @@ -116,8 +116,8 @@ ihevc_intra_pred_luma_planar_av8: clz w5,w4 sub x20, x5, #32 neg x5, x20 - dup v14.8h,w5 - neg v14.8h, v14.8h //shr value (so vneg) + dup v29.8h,w5 + neg v29.8h, v29.8h //shr value (so vneg) dup v2.8b,w4 //nt dup v16.8h,w4 //nt @@ -175,22 +175,22 @@ tf_sz_8_16_32: col_loop_8_16_32: - ld1 {v8.8b},[x12] //(1-8)load 8 coeffs [col+1] - dup v12.8h,w4 //(1) + ld1 {v17.8b},[x12] //(1-8)load 8 coeffs [col+1] + dup v27.8h,w4 //(1) ld1 {v4.8b},[x6] //(1-8)src[2nt-1-row] - sub v9.8b, v2.8b , v8.8b //(1-8)[nt-1-col] + sub v19.8b, v2.8b , v17.8b //(1-8)[nt-1-col] - umlal v12.8h, v5.8b, v0.8b //(1)(row+1) * src[nt-1] + umlal v27.8h, v5.8b, v0.8b //(1)(row+1) * src[nt-1] ld1 {v3.8b},[x14] //(1-8)load 8 src[2nt+1+col] - umlal v12.8h, v8.8b, v1.8b //(1)(col+1) * src[3nt+1] + umlal v27.8h, v17.8b, v1.8b //(1)(col+1) * src[3nt+1] dup v20.8b, v4.8b[7] //(1) - umlal v12.8h, v6.8b, v3.8b //(1)(nt-1-row) * src[2nt+1+col] + umlal v27.8h, v6.8b, v3.8b //(1)(nt-1-row) * src[2nt+1+col] dup v21.8b, v4.8b[6] //(2) - umlal v12.8h, v9.8b, v20.8b //(1)(nt-1-col) * src[2nt-1-row] + umlal v27.8h, v19.8b, v20.8b //(1)(nt-1-col) * src[2nt-1-row] dup v30.8h,w4 //(2) add v5.8b, v5.8b , v7.8b //(1) @@ -201,46 +201,46 @@ col_loop_8_16_32: umlal v30.8h, v5.8b, v0.8b //(2) dup v28.8h,w4 //(3) - umlal v30.8h, v8.8b, v1.8b //(2) + umlal v30.8h, v17.8b, v1.8b //(2) umlal v30.8h, v6.8b, v3.8b //(2) - umlal v30.8h, v9.8b, v21.8b //(2) + umlal v30.8h, v19.8b, v21.8b //(2) - sshl v12.8h, v12.8h, v14.8h //(1)shr + sshl v27.8h, v27.8h, v29.8h //(1)shr add v5.8b, v5.8b , v7.8b //(2) sub v6.8b, v6.8b , v7.8b //(2) - xtn v12.8b, v12.8h //(1) + xtn v27.8b, v27.8h //(1) umlal v28.8h, v5.8b, v0.8b //(3) dup v23.8b, v4.8b[4] //(4) - umlal v28.8h, v8.8b, v1.8b //(3) + umlal v28.8h, v17.8b, v1.8b //(3) - dup v10.8h,w4 //(4) + dup v25.8h,w4 //(4) umlal v28.8h, v6.8b, v3.8b //(3) - st1 {v12.8b},[x2], x3 //(1)str 8 values - umlal v28.8h, v9.8b, v22.8b //(3) + st1 {v27.8b},[x2], x3 //(1)str 8 values + umlal v28.8h, v19.8b, v22.8b //(3) - sshl v30.8h, v30.8h, v14.8h //(2)shr + sshl v30.8h, v30.8h, v29.8h //(2)shr add v5.8b, v5.8b , v7.8b //(3) sub v6.8b, v6.8b , v7.8b //(3) xtn v30.8b, v30.8h //(2) - umlal v10.8h, v5.8b, v0.8b //(4) + umlal v25.8h, v5.8b, v0.8b //(4) dup v20.8b, v4.8b[3] //(5) - umlal v10.8h, v8.8b, v1.8b //(4) + umlal v25.8h, v17.8b, v1.8b //(4) dup v16.8h,w4 //(5) - umlal v10.8h, v6.8b, v3.8b //(4) + umlal v25.8h, v6.8b, v3.8b //(4) st1 {v30.8b},[x2], x3 //(2)str 8 values - umlal v10.8h, v9.8b, v23.8b //(4) + umlal v25.8h, v19.8b, v23.8b //(4) - sshl v28.8h, v28.8h, v14.8h //(3)shr + sshl v28.8h, v28.8h, v29.8h //(3)shr add v5.8b, v5.8b , v7.8b //(4) sub v6.8b, v6.8b , v7.8b //(4) @@ -249,31 +249,31 @@ col_loop_8_16_32: umlal v16.8h, v5.8b, v0.8b //(5) dup v21.8b, v4.8b[2] //(6) - umlal v16.8h, v8.8b, v1.8b //(5) + umlal v16.8h, v17.8b, v1.8b //(5) dup v18.8h,w4 //(6) umlal v16.8h, v6.8b, v3.8b //(5) st1 {v28.8b},[x2], x3 //(3)str 8 values - umlal v16.8h, v9.8b, v20.8b //(5) + umlal v16.8h, v19.8b, v20.8b //(5) - sshl v10.8h, v10.8h, v14.8h //(4)shr + sshl v25.8h, v25.8h, v29.8h //(4)shr add v5.8b, v5.8b , v7.8b //(5) sub v6.8b, v6.8b , v7.8b //(5) - xtn v10.8b, v10.8h //(4) + xtn v25.8b, v25.8h 
//(4) umlal v18.8h, v5.8b, v0.8b //(6) dup v22.8b, v4.8b[1] //(7) - umlal v18.8h, v8.8b, v1.8b //(6) + umlal v18.8h, v17.8b, v1.8b //(6) dup v26.8h,w4 //(7) umlal v18.8h, v6.8b, v3.8b //(6) - st1 {v10.8b},[x2], x3 //(4)str 8 values - umlal v18.8h, v9.8b, v21.8b //(6) + st1 {v25.8b},[x2], x3 //(4)str 8 values + umlal v18.8h, v19.8b, v21.8b //(6) - sshl v16.8h, v16.8h, v14.8h //(5)shr + sshl v16.8h, v16.8h, v29.8h //(5)shr add v5.8b, v5.8b , v7.8b //(6) sub v6.8b, v6.8b , v7.8b //(6) @@ -282,15 +282,15 @@ col_loop_8_16_32: umlal v26.8h, v5.8b, v0.8b //(7) dup v23.8b, v4.8b[0] //(8) - umlal v26.8h, v8.8b, v1.8b //(7) + umlal v26.8h, v17.8b, v1.8b //(7) dup v24.8h,w4 //(8) umlal v26.8h, v6.8b, v3.8b //(7) st1 {v16.8b},[x2], x3 //(5)str 8 values - umlal v26.8h, v9.8b, v22.8b //(7) + umlal v26.8h, v19.8b, v22.8b //(7) - sshl v18.8h, v18.8h, v14.8h //(6)shr + sshl v18.8h, v18.8h, v29.8h //(6)shr add v5.8b, v5.8b , v7.8b //(7) sub v6.8b, v6.8b , v7.8b //(7) @@ -299,14 +299,14 @@ col_loop_8_16_32: umlal v24.8h, v5.8b, v0.8b //(8) - umlal v24.8h, v8.8b, v1.8b //(8) + umlal v24.8h, v17.8b, v1.8b //(8) umlal v24.8h, v6.8b, v3.8b //(8) st1 {v18.8b},[x2], x3 //(6)str 8 values - umlal v24.8h, v9.8b, v23.8b //(8) + umlal v24.8h, v19.8b, v23.8b //(8) - sshl v26.8h, v26.8h, v14.8h //(7)shr + sshl v26.8h, v26.8h, v29.8h //(7)shr subs x7, x7, #8 @@ -322,7 +322,7 @@ col_loop_8_16_32: csel x12, x20, x12,le csel x14, x0, x14,le //x14 reset - ld1 {v8.8b},[x12] //(1n)(1-8)load 8 coeffs [col+1] + ld1 {v17.8b},[x12] //(1n)(1-8)load 8 coeffs [col+1] sub x20, x6, #8 //for next set of rows csel x6, x20, x6,le @@ -330,12 +330,12 @@ col_loop_8_16_32: add x20, x5, #8 csel x5, x20, x5,le - dup v12.8h,w4 //(1n)(1) + dup v27.8h,w4 //(1n)(1) ld1 {v5.8b},[x5] ld1 {v4.8b},[x6] //(1n)(1-8)src[2nt-1-row] - sub v9.8b, v2.8b , v8.8b //(1n)(1-8)[nt-1-col] + sub v19.8b, v2.8b , v17.8b //(1n)(1-8)[nt-1-col] dup v20.8b, v4.8b[7] //(1n)(1) sub v6.8b, v2.8b , v5.8b @@ -345,19 +345,19 @@ col_loop_8_16_32: kernel_plnr: cmp x1, #0 // (cond loop) - sshl v24.8h, v24.8h, v14.8h //(8)shr + sshl v24.8h, v24.8h, v29.8h //(8)shr xtn v26.8b, v26.8h //(7) - umlal v12.8h, v5.8b, v0.8b //(1)(row+1) * src[nt-1] + umlal v27.8h, v5.8b, v0.8b //(1)(row+1) * src[nt-1] xtn v24.8b, v24.8h //(8) - umlal v12.8h, v8.8b, v1.8b //(1)(col+1) * src[3nt+1] + umlal v27.8h, v17.8b, v1.8b //(1)(col+1) * src[3nt+1] dup v21.8b, v4.8b[6] //(2) - umlal v12.8h, v6.8b, v3.8b //(1)(nt-1-row) * src[2nt+1+col] + umlal v27.8h, v6.8b, v3.8b //(1)(nt-1-row) * src[2nt+1+col] dup v30.8h,w4 //(2) - umlal v12.8h, v9.8b, v20.8b //(1)(nt-1-col) * src[2nt-1-row] + umlal v27.8h, v19.8b, v20.8b //(1)(nt-1-col) * src[2nt-1-row] st1 {v26.8b},[x2], x3 //(7)str 8 values add v5.8b, v5.8b , v7.8b //(1) @@ -371,15 +371,15 @@ kernel_plnr: sub x20, x2, x10 //else go to next set of rows, dst - (nt-8) (cond loop) csel x2, x20, x2,le - umlal v30.8h, v8.8b, v1.8b //(2) + umlal v30.8h, v17.8b, v1.8b //(2) dup v22.8b, v4.8b[5] //(3) umlal v30.8h, v6.8b, v3.8b //(2) dup v28.8h,w4 //(3) - umlal v30.8h, v9.8b, v21.8b //(2) + umlal v30.8h, v19.8b, v21.8b //(2) - sshl v12.8h, v12.8h, v14.8h //(1)shr + sshl v27.8h, v27.8h, v29.8h //(1)shr add v5.8b, v5.8b , v7.8b //(2) csel x1, x4, x1,le //nt reloaded (refresh the value) (cond loop) @@ -387,37 +387,37 @@ kernel_plnr: sub v6.8b, v6.8b , v7.8b //(2) subs x1, x1, #8 //row counter (loop) - xtn v12.8b, v12.8h //(1) + xtn v27.8b, v27.8h //(1) umlal v28.8h, v5.8b, v0.8b //(3) dup v23.8b, v4.8b[4] //(4) - umlal v28.8h, v8.8b, v1.8b //(3) + umlal v28.8h, v17.8b, v1.8b 
//(3) - dup v10.8h,w4 //(4) + dup v25.8h,w4 //(4) umlal v28.8h, v6.8b, v3.8b //(3) - st1 {v12.8b},[x2], x3 //(1)str 8 values - umlal v28.8h, v9.8b, v22.8b //(3) + st1 {v27.8b},[x2], x3 //(1)str 8 values + umlal v28.8h, v19.8b, v22.8b //(3) - sshl v30.8h, v30.8h, v14.8h //(2)shr + sshl v30.8h, v30.8h, v29.8h //(2)shr add v5.8b, v5.8b , v7.8b //(3) sub v6.8b, v6.8b , v7.8b //(3) xtn v30.8b, v30.8h //(2) - umlal v10.8h, v5.8b, v0.8b //(4) + umlal v25.8h, v5.8b, v0.8b //(4) dup v20.8b, v4.8b[3] //(5) - umlal v10.8h, v8.8b, v1.8b //(4) + umlal v25.8h, v17.8b, v1.8b //(4) dup v16.8h,w4 //(5) - umlal v10.8h, v6.8b, v3.8b //(4) + umlal v25.8h, v6.8b, v3.8b //(4) st1 {v30.8b},[x2], x3 //(2)str 8 values - umlal v10.8h, v9.8b, v23.8b //(4) + umlal v25.8h, v19.8b, v23.8b //(4) - sshl v28.8h, v28.8h, v14.8h //(3)shr + sshl v28.8h, v28.8h, v29.8h //(3)shr add v5.8b, v5.8b , v7.8b //(4) @@ -427,17 +427,17 @@ kernel_plnr: umlal v16.8h, v5.8b, v0.8b //(5) dup v21.8b, v4.8b[2] //(6) - umlal v16.8h, v8.8b, v1.8b //(5) + umlal v16.8h, v17.8b, v1.8b //(5) dup v18.8h,w4 //(6) umlal v16.8h, v6.8b, v3.8b //(5) st1 {v28.8b},[x2], x3 //(3)str 8 values - umlal v16.8h, v9.8b, v20.8b //(5) + umlal v16.8h, v19.8b, v20.8b //(5) add x20, x11, #1 //x12 reset (cond loop) csel x12, x20, x12,le - sshl v10.8h, v10.8h, v14.8h //(4)shr + sshl v25.8h, v25.8h, v29.8h //(4)shr add x20, x12, #8 //col inc (cond loop) csel x12, x20, x12,gt @@ -447,20 +447,20 @@ kernel_plnr: csel x14, x20, x14,gt sub v6.8b, v6.8b , v7.8b //(5) - xtn v10.8b, v10.8h //(4) + xtn v25.8b, v25.8h //(4) umlal v18.8h, v5.8b, v0.8b //(6) dup v22.8b, v4.8b[1] //(7) - umlal v18.8h, v8.8b, v1.8b //(6) + umlal v18.8h, v17.8b, v1.8b //(6) dup v26.8h,w4 //(7) umlal v18.8h, v6.8b, v3.8b //(6) - st1 {v10.8b},[x2], x3 //(4)str 8 values - umlal v18.8h, v9.8b, v21.8b //(6) + st1 {v25.8b},[x2], x3 //(4)str 8 values + umlal v18.8h, v19.8b, v21.8b //(6) csel x14, x0, x14,le //x14 reset (cond loop) - sshl v16.8h, v16.8h, v14.8h //(5)shr + sshl v16.8h, v16.8h, v29.8h //(5)shr sub x20, x6, #8 //for next set of rows (cond loop) csel x6, x20, x6,le @@ -474,16 +474,16 @@ kernel_plnr: umlal v26.8h, v5.8b, v0.8b //(7) dup v23.8b, v4.8b[0] //(8) - umlal v26.8h, v8.8b, v1.8b //(7) + umlal v26.8h, v17.8b, v1.8b //(7) dup v24.8h,w4 //(8) umlal v26.8h, v6.8b, v3.8b //(7) st1 {v16.8b},[x2], x3 //(5)str 8 values - umlal v26.8h, v9.8b, v22.8b //(7) + umlal v26.8h, v19.8b, v22.8b //(7) ld1 {v4.8b},[x6] //(1n)(1-8)src[2nt-1-row] - sshl v18.8h, v18.8h, v14.8h //(6)shr + sshl v18.8h, v18.8h, v29.8h //(6)shr add v5.8b, v5.8b , v7.8b //(7) @@ -493,24 +493,24 @@ kernel_plnr: umlal v24.8h, v5.8b, v0.8b //(8) ld1 {v5.8b},[x5] //(row+1 value) - umlal v24.8h, v8.8b, v1.8b //(8) + umlal v24.8h, v17.8b, v1.8b //(8) dup v20.8b, v4.8b[7] //(1n)(1) umlal v24.8h, v6.8b, v3.8b //(8) st1 {v18.8b},[x2], x3 //(6)str 8 values - umlal v24.8h, v9.8b, v23.8b //(8) + umlal v24.8h, v19.8b, v23.8b //(8) - ld1 {v8.8b},[x12] //(1n)(1-8)load 8 coeffs [col+1] + ld1 {v17.8b},[x12] //(1n)(1-8)load 8 coeffs [col+1] sub v6.8b, v2.8b , v5.8b //(nt-1-row) value subs x7, x7, #8 //col counter ld1 {v3.8b},[x14] //(1n)(1-8)load 8 src[2nt+1+col] - sshl v26.8h, v26.8h, v14.8h //(7)shr + sshl v26.8h, v26.8h, v29.8h //(7)shr - dup v12.8h,w4 //(1n)(1) - sub v9.8b, v2.8b , v8.8b //(1n)(1-8)[nt-1-col] + dup v27.8h,w4 //(1n)(1) + sub v19.8b, v2.8b , v17.8b //(1n)(1-8)[nt-1-col] bne kernel_plnr @@ -519,7 +519,7 @@ epilog: xtn v26.8b, v26.8h //(7) st1 {v26.8b},[x2], x3 //(7)str 8 values - sshl v24.8h, v24.8h, v14.8h //(8)shr + sshl v24.8h, 
v24.8h, v29.8h //(8)shr xtn v24.8b, v24.8h //(8) st1 {v24.8b},[x2], x3 //(8)str 8 values @@ -528,25 +528,25 @@ epilog: beq end_loop tf_sz_4: - ld1 {v10.8b},[x14] //load src[2nt+1+col] - ld1 {v8.8b},[x12], x10 //load 8 coeffs [col+1] + ld1 {v25.8b},[x14] //load src[2nt+1+col] + ld1 {v17.8b},[x12], x10 //load 8 coeffs [col+1] loop_sz_4: mov x10, #4 //reduce inc to #4 for 4x4 ldr w7, [x6], #-1 //src[2nt-1-row] (dec to take into account row) sxtw x7,w7 dup v4.8b,w7 //src[2nt-1-row] - sub v9.8b, v2.8b , v8.8b //[nt-1-col] + sub v19.8b, v2.8b , v17.8b //[nt-1-col] - umull v12.8h, v5.8b, v0.8b //(row+1) * src[nt-1] - umlal v12.8h, v6.8b, v10.8b //(nt-1-row) * src[2nt+1+col] - umlal v12.8h, v8.8b, v1.8b //(col+1) * src[3nt+1] - umlal v12.8h, v9.8b, v4.8b //(nt-1-col) * src[2nt-1-row] + umull v27.8h, v5.8b, v0.8b //(row+1) * src[nt-1] + umlal v27.8h, v6.8b, v25.8b //(nt-1-row) * src[2nt+1+col] + umlal v27.8h, v17.8b, v1.8b //(col+1) * src[3nt+1] + umlal v27.8h, v19.8b, v4.8b //(nt-1-col) * src[2nt-1-row] // vadd.i16 q6, q6, q8 @add (nt) // vshl.s16 q6, q6, q7 @shr // vmovn.i16 d12, q6 - rshrn v12.8b, v12.8h,#3 - st1 {v12.s}[0],[x2], x3 + rshrn v27.8b, v27.8h,#3 + st1 {v27.s}[0],[x2], x3 add v5.8b, v5.8b , v7.8b //row++ [(row+1)++] sub v6.8b, v6.8b , v7.8b //[nt-1-row]-- @@ -557,7 +557,7 @@ loop_sz_4: end_loop: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_intra_pred_luma_vert.s b/common/arm64/ihevc_intra_pred_luma_vert.s index 56a20a0..c67f721 100644 --- a/common/arm64/ihevc_intra_pred_luma_vert.s +++ b/common/arm64/ihevc_intra_pred_luma_vert.s @@ -101,7 +101,7 @@ ihevc_intra_pred_luma_ver_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! 
lsl x5, x4, #1 //2nt @@ -207,7 +207,7 @@ blk_16: sqadd v0.8h, v0.8h , v30.8h sqadd v28.8h, v28.8h , v30.8h - movi d10, #0x00000000000000ff + movi d3, #0x00000000000000ff //vaddl.s8 q1, d25, d27 sqxtun v24.8b, v28.8h @@ -218,13 +218,13 @@ blk_16: rev64 v24.16b, v24.16b mov v25.d[0], v24.d[1] - mov v11.d[0],v17.d[0] + mov v4.d[0],v17.d[0] bsl v18.8b, v24.8b , v16.8b //only select row values from q12(predpixel) - bsl v10.8b, v25.8b , v16.8b + bsl v3.8b, v25.8b , v16.8b - movi d8, #0x00000000000000ff - mov v9.d[0],v17.d[0] + movi d1, #0x00000000000000ff + mov v2.d[0],v17.d[0] movi d6, #0x00000000000000ff mov v7.d[0],v17.d[0] @@ -232,14 +232,14 @@ blk_16: st1 {v18.8b, v19.8b}, [x2], x3 sshr d24, d24,#8 - st1 {v10.8b, v11.8b}, [x5], x3 + st1 {v3.8b, v4.8b}, [x5], x3 sshr d25, d25,#8 - bsl v8.8b, v24.8b , v16.8b + bsl v1.8b, v24.8b , v16.8b bsl v6.8b, v25.8b , v16.8b - st1 {v8.8b, v9.8b}, [x2], x3 + st1 {v1.8b, v2.8b}, [x2], x3 sshr d24, d24,#8 st1 {v6.8b, v7.8b}, [x5], x3 @@ -250,34 +250,34 @@ blk_16: movi d18, #0x00000000000000ff //vmov.i64 d19, d17 - movi d10, #0x00000000000000ff + movi d3, #0x00000000000000ff //vmov.i64 d11, d17 loop_16: - movi d8, #0x00000000000000ff + movi d1, #0x00000000000000ff movi d6, #0x00000000000000ff bsl v18.8b, v24.8b , v16.8b //only select row values from q12(predpixel) - bsl v10.8b, v25.8b , v16.8b + bsl v3.8b, v25.8b , v16.8b st1 {v18.8b, v19.8b}, [x2], x3 sshr d24, d24,#8 - st1 {v10.8b, v11.8b}, [x5], x3 + st1 {v3.8b, v4.8b}, [x5], x3 sshr d25, d25,#8 movi d18, #0x00000000000000ff - movi d10, #0x00000000000000ff + movi d3, #0x00000000000000ff - bsl v8.8b, v24.8b , v16.8b + bsl v1.8b, v24.8b , v16.8b bsl v6.8b, v25.8b , v16.8b - st1 {v8.8b, v9.8b}, [x2], x3 + st1 {v1.8b, v2.8b}, [x2], x3 sshr d24, d24,#8 st1 {v6.8b, v7.8b}, [x5], x3 @@ -287,23 +287,23 @@ loop_16: bne loop_16 - movi d8, #0x00000000000000ff + movi d1, #0x00000000000000ff movi d6, #0x00000000000000ff bsl v18.8b, v24.8b , v16.8b //only select row values from q12(predpixel) - bsl v10.8b, v25.8b , v16.8b + bsl v3.8b, v25.8b , v16.8b st1 {v18.8b, v19.8b}, [x2], x3 sshr d24, d24,#8 - st1 {v10.8b, v11.8b}, [x5], x3 + st1 {v3.8b, v4.8b}, [x5], x3 sshr d25, d25,#8 - bsl v8.8b, v24.8b , v16.8b + bsl v1.8b, v24.8b , v16.8b bsl v6.8b, v25.8b , v16.8b - st1 {v8.8b, v9.8b}, [x2], x3 + st1 {v1.8b, v2.8b}, [x2], x3 st1 {v6.8b, v7.8b}, [x5], x3 @@ -311,10 +311,10 @@ loop_16: blk_4_8: - movi d11, #0x00000000000000ff + movi d4, #0x00000000000000ff add x6, x0, x5 //&src[2nt] - movi d10, #0x00000000000000ff + movi d3, #0x00000000000000ff ldrb w11, [x6], #1 //src[2nt] sxtw x11,w11 @@ -363,19 +363,19 @@ blk_4_8: movi d19, #0x00000000000000ff - bsl v10.8b, v24.8b , v16.8b + bsl v3.8b, v24.8b , v16.8b - st1 {v10.8b},[x2], x3 + st1 {v3.8b},[x2], x3 sshr d24, d24,#8 - movi d10, #0x00000000000000ff + movi d3, #0x00000000000000ff - bsl v11.8b, v24.8b , v16.8b + bsl v4.8b, v24.8b , v16.8b - st1 {v11.8b},[x2], x3 + st1 {v4.8b},[x2], x3 sshr d24, d24,#8 - movi d11, #0x00000000000000ff + movi d4, #0x00000000000000ff bsl v18.8b, v24.8b , v16.8b //only select row values from q12(predpixel) @@ -387,14 +387,14 @@ blk_4_8: st1 {v19.8b},[x2], x3 sshr d24, d24,#8 - bsl v10.8b, v24.8b , v16.8b + bsl v3.8b, v24.8b , v16.8b - st1 {v10.8b},[x2], x3 + st1 {v3.8b},[x2], x3 sshr d24, d24,#8 - bsl v11.8b, v24.8b , v16.8b + bsl v4.8b, v24.8b , v16.8b - st1 {v11.8b},[x2], x3 + st1 {v4.8b},[x2], x3 sshr d24, d24,#8 b end_func @@ -411,19 +411,19 @@ blk_4: st1 {v19.s}[0],[x2], x3 sshr d24, d24,#8 - bsl v10.8b, v24.8b , v16.8b + bsl v3.8b, v24.8b , 
v16.8b - st1 {v10.s}[0],[x2], x3 + st1 {v3.s}[0],[x2], x3 sshr d24, d24,#8 - bsl v11.8b, v24.8b , v16.8b - st1 {v11.s}[0],[x2], x3 + bsl v4.8b, v24.8b , v16.8b + st1 {v4.s}[0],[x2], x3 end_func: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_itrans_recon_4x4.s b/common/arm64/ihevc_itrans_recon_4x4.s index b18fb89..1f2c904 100644 --- a/common/arm64/ihevc_itrans_recon_4x4.s +++ b/common/arm64/ihevc_itrans_recon_4x4.s @@ -119,7 +119,7 @@ ihevc_itrans_recon_4x4_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! adrp x8, :got:g_ai2_ihevc_trans_4_transpose @@ -142,21 +142,21 @@ ihevc_itrans_recon_4x4_av8: // first stage computation starts smull v6.4s, v1.4h, v4.4h[1] //83 * pi2_src[1] smlal v6.4s, v3.4h, v4.4h[3] //o[0] = 83 * pi2_src[1] + 36 * pi2_src[3] - smull v8.4s, v1.4h, v4.4h[3] //36 * pi2_src[1] + smull v5.4s, v1.4h, v4.4h[3] //36 * pi2_src[1] ld1 {v22.s}[0],[x2],x5 - smlsl v8.4s, v3.4h, v4.4h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3] + smlsl v5.4s, v3.4h, v4.4h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3] - saddl v10.4s, v0.4h, v2.4h //pi2_src[0] + pi2_src[2] - ssubl v12.4s, v0.4h, v2.4h //pi2_src[0] - pi2_src[2] - shl v10.4s, v10.4s,#6 //e[0] = 64*(pi2_src[0] + pi2_src[2]) - shl v12.4s, v12.4s,#6 //e[1] = 64*(pi2_src[0] - pi2_src[2]) + saddl v7.4s, v0.4h, v2.4h //pi2_src[0] + pi2_src[2] + ssubl v17.4s, v0.4h, v2.4h //pi2_src[0] - pi2_src[2] + shl v7.4s, v7.4s,#6 //e[0] = 64*(pi2_src[0] + pi2_src[2]) + shl v17.4s, v17.4s,#6 //e[1] = 64*(pi2_src[0] - pi2_src[2]) - add v14.4s, v10.4s , v6.4s //((e[0] + o[0] ) - add v16.4s, v12.4s , v8.4s //((e[1] + o[1]) - sub v18.4s, v12.4s , v8.4s //((e[1] - o[1]) - sub v20.4s, v10.4s , v6.4s //((e[0] - o[0]) + add v19.4s, v7.4s , v6.4s //((e[0] + o[0] ) + add v16.4s, v17.4s , v5.4s //((e[1] + o[1]) + sub v18.4s, v17.4s , v5.4s //((e[1] - o[1]) + sub v20.4s, v7.4s , v6.4s //((e[0] - o[0]) - sqrshrn v28.4h, v14.4s,#shift_stage1_idct //pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift) ) + sqrshrn v28.4h, v19.4s,#shift_stage1_idct //pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift) ) sqrshrn v29.4h, v16.4s,#shift_stage1_idct //pi2_out[1] = clip_s16((e[1] + o[1] + add)>>shift) ) sqrshrn v30.4h, v18.4s,#shift_stage1_idct //pi2_out[2] = clip_s16((e[0] - o[0] + add)>>shift) ) sqrshrn v31.4h, v20.4s,#shift_stage1_idct //pi2_out[3] = clip_s16((e[0] - o[0] + add)>>shift) ) @@ -176,22 +176,22 @@ ihevc_itrans_recon_4x4_av8: smull v6.4s, v1.4h, v4.4h[1] //83 * pi2_src[1] ld1 {v22.s}[1],[x2],x5 smlal v6.4s, v3.4h, v4.4h[3] //o[0] = 83 * pi2_src[1] + 36 * pi2_src[3] - smull v8.4s, v1.4h, v4.4h[3] //36 * pi2_src[1] - smlsl v8.4s, v3.4h, v4.4h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3] + smull v5.4s, v1.4h, v4.4h[3] //36 * pi2_src[1] + smlsl v5.4s, v3.4h, v4.4h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3] ld1 {v23.s}[0],[x2],x5 - saddl v10.4s, v0.4h, v2.4h //pi2_src[0] + pi2_src[2] - ssubl v12.4s, v0.4h, v2.4h //pi2_src[0] - pi2_src[2] - shl v10.4s, v10.4s,#6 //e[0] = 64*(pi2_src[0] + pi2_src[2]) - shl v12.4s, v12.4s,#6 //e[1] = 64*(pi2_src[0] - pi2_src[2]) + saddl v7.4s, v0.4h, v2.4h //pi2_src[0] + pi2_src[2] + ssubl v17.4s, v0.4h, v2.4h //pi2_src[0] - pi2_src[2] + shl v7.4s, v7.4s,#6 //e[0] = 64*(pi2_src[0] + pi2_src[2]) + shl v17.4s, v17.4s,#6 //e[1] = 64*(pi2_src[0] - pi2_src[2]) - add v14.4s, v10.4s , v6.4s //((e[0] + o[0] ) - add v16.4s, v12.4s , v8.4s //((e[1] + o[1]) - sub v18.4s, v12.4s , v8.4s //((e[1] - 
o[1]) - sub v20.4s, v10.4s , v6.4s //((e[0] - o[0]) + add v19.4s, v7.4s , v6.4s //((e[0] + o[0] ) + add v16.4s, v17.4s , v5.4s //((e[1] + o[1]) + sub v18.4s, v17.4s , v5.4s //((e[1] - o[1]) + sub v20.4s, v7.4s , v6.4s //((e[0] - o[0]) - sqrshrn v28.4h, v14.4s,#shift_stage2_idct //pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift) ) + sqrshrn v28.4h, v19.4s,#shift_stage2_idct //pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift) ) sqrshrn v29.4h, v16.4s,#shift_stage2_idct //pi2_out[1] = clip_s16((e[1] + o[1] + add)>>shift) ) sqrshrn v30.4h, v18.4s,#shift_stage2_idct //pi2_out[2] = clip_s16((e[0] - o[0] + add)>>shift) ) sqrshrn v31.4h, v20.4s,#shift_stage2_idct //pi2_out[3] = clip_s16((e[0] - o[0] + add)>>shift) ) @@ -228,7 +228,7 @@ ihevc_itrans_recon_4x4_av8: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_itrans_recon_4x4_ttype1.s b/common/arm64/ihevc_itrans_recon_4x4_ttype1.s index fa04b8e..da04c5e 100644 --- a/common/arm64/ihevc_itrans_recon_4x4_ttype1.s +++ b/common/arm64/ihevc_itrans_recon_4x4_ttype1.s @@ -118,7 +118,7 @@ ihevc_itrans_recon_4x4_ttype1_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! add x4,x4,x4 // src_strd in terms of word16 @@ -142,33 +142,33 @@ ihevc_itrans_recon_4x4_ttype1_av8: smlal v6.4s, v3.4h, v4.4h[1] //74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3] smlal v6.4s, v2.4h, v4.4h[3] //pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3] - smull v8.4s, v1.4h, v4.4h[2] //74 * pi2_src[1] - smlal v8.4s, v0.4h, v4.4h[1] //74 * pi2_src[1] + 55 * pi2_src[0] - smlsl v8.4s, v2.4h, v4.4h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - smlsl v8.4s, v3.4h, v4.4h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3]) + smull v5.4s, v1.4h, v4.4h[2] //74 * pi2_src[1] + smlal v5.4s, v0.4h, v4.4h[1] //74 * pi2_src[1] + 55 * pi2_src[0] + smlsl v5.4s, v2.4h, v4.4h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] + smlsl v5.4s, v3.4h, v4.4h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3]) - smull v10.4s, v0.4h, v4.4h[2] // 74 * pi2_src[0] - smlsl v10.4s, v2.4h, v4.4h[2] // 74 * pi2_src[0] - 74 * pi2_src[2] - smlal v10.4s, v3.4h, v4.4h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3] + smull v7.4s, v0.4h, v4.4h[2] // 74 * pi2_src[0] + smlsl v7.4s, v2.4h, v4.4h[2] // 74 * pi2_src[0] - 74 * pi2_src[2] + smlal v7.4s, v3.4h, v4.4h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3] - smull v12.4s, v2.4h, v4.4h[1] // 55 * pi2_src[2] - smlsl v12.4s, v1.4h, v4.4h[2] // 55 * pi2_src[2] - 74 * pi2_src[1] - smlsl v12.4s, v3.4h, v4.4h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] - smlal v12.4s, v0.4h, v4.4h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] + smull v20.4s, v2.4h, v4.4h[1] // 55 * pi2_src[2] + smlsl v20.4s, v1.4h, v4.4h[2] // 55 * pi2_src[2] - 74 * pi2_src[1] + smlsl v20.4s, v3.4h, v4.4h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] + smlal v20.4s, v0.4h, v4.4h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] sqrshrn v28.4h, v6.4s,#shift_stage1_idct // (pi2_out[0] + rounding ) >> shift_stage1_idct - sqrshrn v29.4h, v8.4s,#shift_stage1_idct // (pi2_out[1] + rounding ) >> shift_stage1_idct - sqrshrn v30.4h, v10.4s,#shift_stage1_idct // (pi2_out[2] + rounding ) >> 
shift_stage1_idct - sqrshrn v31.4h, v12.4s,#shift_stage1_idct // (pi2_out[3] + rounding ) >> shift_stage1_idct + sqrshrn v29.4h, v5.4s,#shift_stage1_idct // (pi2_out[1] + rounding ) >> shift_stage1_idct + sqrshrn v30.4h, v7.4s,#shift_stage1_idct // (pi2_out[2] + rounding ) >> shift_stage1_idct + sqrshrn v31.4h, v20.4s,#shift_stage1_idct // (pi2_out[3] + rounding ) >> shift_stage1_idct ld1 {v18.s}[0],[x2],x5 trn1 v24.4h, v28.4h, v29.4h trn2 v25.4h, v28.4h, v29.4h trn1 v26.4h, v30.4h, v31.4h trn2 v27.4h, v30.4h, v31.4h - trn1 v14.2s, v24.2s, v26.2s + trn1 v21.2s, v24.2s, v26.2s trn2 v16.2s, v24.2s, v26.2s - trn1 v15.2s, v25.2s, v27.2s + trn1 v22.2s, v25.2s, v27.2s trn2 v17.2s, v25.2s, v27.2s // output in d14,d15,d16,d17 // first stage computation ends @@ -180,30 +180,30 @@ ihevc_itrans_recon_4x4_ttype1_av8: // d16 - d2 // d17 - d3 ld1 {v18.s}[1],[x2],x5 - smull v6.4s, v15.4h, v4.4h[2] //74 * pi2_src[1] - smlal v6.4s, v14.4h, v4.4h[0] //74 * pi2_src[1] + 29 * pi2_src[0] + smull v6.4s, v22.4h, v4.4h[2] //74 * pi2_src[1] + smlal v6.4s, v21.4h, v4.4h[0] //74 * pi2_src[1] + 29 * pi2_src[0] smlal v6.4s, v17.4h, v4.4h[1] //74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3] smlal v6.4s, v16.4h, v4.4h[3] //pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3] - smull v8.4s, v15.4h, v4.4h[2] //74 * pi2_src[1] - smlal v8.4s, v14.4h, v4.4h[1] //74 * pi2_src[1] + 55 * pi2_src[0] - smlsl v8.4s, v16.4h, v4.4h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - smlsl v8.4s, v17.4h, v4.4h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3]) + smull v5.4s, v22.4h, v4.4h[2] //74 * pi2_src[1] + smlal v5.4s, v21.4h, v4.4h[1] //74 * pi2_src[1] + 55 * pi2_src[0] + smlsl v5.4s, v16.4h, v4.4h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] + smlsl v5.4s, v17.4h, v4.4h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3]) - smull v10.4s, v14.4h, v4.4h[2] // 74 * pi2_src[0] - smlsl v10.4s, v16.4h, v4.4h[2] // 74 * pi2_src[0] - 74 * pi2_src[2] - smlal v10.4s, v17.4h, v4.4h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3] + smull v7.4s, v21.4h, v4.4h[2] // 74 * pi2_src[0] + smlsl v7.4s, v16.4h, v4.4h[2] // 74 * pi2_src[0] - 74 * pi2_src[2] + smlal v7.4s, v17.4h, v4.4h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3] ld1 {v19.s}[0],[x2],x5 - smull v12.4s, v16.4h, v4.4h[1] // 55 * pi2_src[2] - smlsl v12.4s, v15.4h, v4.4h[2] // - 74 * pi2_src[1] + 55 * pi2_src[2] - smlsl v12.4s, v17.4h, v4.4h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] - smlal v12.4s, v14.4h, v4.4h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] + smull v20.4s, v16.4h, v4.4h[1] // 55 * pi2_src[2] + smlsl v20.4s, v22.4h, v4.4h[2] // - 74 * pi2_src[1] + 55 * pi2_src[2] + smlsl v20.4s, v17.4h, v4.4h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] + smlal v20.4s, v21.4h, v4.4h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] sqrshrn v28.4h, v6.4s,#shift_stage2_idct // (pi2_out[0] + rounding ) >> shift_stage1_idct - sqrshrn v29.4h, v8.4s,#shift_stage2_idct // (pi2_out[1] + rounding ) >> shift_stage1_idct - sqrshrn v30.4h, v10.4s,#shift_stage2_idct // (pi2_out[2] + rounding ) >> shift_stage1_idct - sqrshrn v31.4h, v12.4s,#shift_stage2_idct // (pi2_out[3] + rounding ) >> shift_stage1_idct + sqrshrn v29.4h, v5.4s,#shift_stage2_idct // (pi2_out[1] + rounding ) >> shift_stage1_idct + sqrshrn v30.4h, 
v7.4s,#shift_stage2_idct // (pi2_out[2] + rounding ) >> shift_stage1_idct + sqrshrn v31.4h, v20.4s,#shift_stage2_idct // (pi2_out[3] + rounding ) >> shift_stage1_idct ld1 {v19.s}[1],[x2],x5 trn1 v24.4h, v28.4h, v29.4h trn2 v25.4h, v28.4h, v29.4h @@ -233,7 +233,7 @@ ihevc_itrans_recon_4x4_ttype1_av8: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_neon_macros.s b/common/arm64/ihevc_neon_macros.s index 09a1de9..c5e65e5 100644 --- a/common/arm64/ihevc_neon_macros.s +++ b/common/arm64/ihevc_neon_macros.s @@ -47,4 +47,3 @@ ldp d10,d11,[sp],#16 ldp d8,d9,[sp],#16 .endm - diff --git a/common/arm64/ihevc_sao_band_offset_luma.s b/common/arm64/ihevc_sao_band_offset_luma.s index 099d581..779ee69 100644 --- a/common/arm64/ihevc_sao_band_offset_luma.s +++ b/common/arm64/ihevc_sao_band_offset_luma.s @@ -76,7 +76,10 @@ ihevc_sao_band_offset_luma_av8: LDR w8,[sp] //Loads ht - push_v_regs + + stp d13,d14,[sp,#-16]! + stp d8,d15,[sp,#-16]! // Storing d15 using { sub sp,sp,#8; str d15,[sp] } is giving bus error. + // d8 is used as dummy register and stored along with d15 using stp. d8 is not used in the function. stp x19, x20,[sp,#-16]! MOV x9,x8 //Move the ht to x9 for loop counter @@ -127,7 +130,7 @@ SRC_TOP_LOOP: //wd is always multiple of 8 ADD v7.8b, v3.8b , v31.8b //band_table.val[2] = vadd_u8(band_table.val[2], band_pos) dup v27.8b, v30.8b[3] //vdup_n_u8(pi1_sao_offset[3]) - ADD v8.8b, v4.8b , v31.8b //band_table.val[3] = vadd_u8(band_table.val[3], band_pos) + ADD v21.8b, v4.8b , v31.8b //band_table.val[3] = vadd_u8(band_table.val[3], band_pos) dup v26.8b, v30.8b[4] //vdup_n_u8(pi1_sao_offset[4]) ADD v1.8b, v5.8b , v29.8b //band_table.val[0] = vadd_u8(band_table.val[0], vdup_n_u8(pi1_sao_offset[1])) @@ -138,52 +141,52 @@ SRC_TOP_LOOP: //wd is always multiple of 8 CMP x5,#28 ADD v3.8b, v7.8b , v27.8b //band_table.val[2] = vadd_u8(band_table.val[2], vdup_n_u8(pi1_sao_offset[3])) - ADD v4.8b, v8.8b , v26.8b //band_table.val[3] = vadd_u8(band_table.val[3], vdup_n_u8(pi1_sao_offset[4])) + ADD v4.8b, v21.8b , v26.8b //band_table.val[3] = vadd_u8(band_table.val[3], vdup_n_u8(pi1_sao_offset[4])) BLT SAO_BAND_POS_0 SAO_BAND_POS_28: //case 28 - cmhs v12.8b, v29.8b , v4.8b //vcle_u8(band_table.val[3], vdup_n_u8(16)) + cmhs v25.8b, v29.8b , v4.8b //vcle_u8(band_table.val[3], vdup_n_u8(16)) BNE SAO_BAND_POS_29 - ORR v4.8b, v4.8b , v12.8b //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp) + ORR v4.8b, v4.8b , v25.8b //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp) B SWITCH_BREAK SAO_BAND_POS_29: //case 29 CMP x5,#29 - cmhs v11.8b, v29.8b , v3.8b //vcle_u8(band_table.val[2], vdup_n_u8(16)) + cmhs v24.8b, v29.8b , v3.8b //vcle_u8(band_table.val[2], vdup_n_u8(16)) BNE SAO_BAND_POS_30 - ORR v3.8b, v3.8b , v11.8b //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp) + ORR v3.8b, v3.8b , v24.8b //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp) - AND v4.8b, v4.8b , v12.8b //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp) + AND v4.8b, v4.8b , v25.8b //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp) B SWITCH_BREAK SAO_BAND_POS_30: //case 30 CMP x5,#30 - cmhs v10.8b, v29.8b , v2.8b //vcle_u8(band_table.val[1], vdup_n_u8(16)) + cmhs v23.8b, v29.8b , v2.8b //vcle_u8(band_table.val[1], vdup_n_u8(16)) BNE SAO_BAND_POS_31 - ORR v2.8b, v2.8b , v10.8b //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp) + ORR v2.8b, v2.8b , v23.8b //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp) - AND v3.8b, 
v3.8b , v11.8b //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp) + AND v3.8b, v3.8b , v24.8b //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp) B SWITCH_BREAK SAO_BAND_POS_31: //case 31 CMP x5,#31 BNE SWITCH_BREAK - cmhs v9.8b, v29.8b , v1.8b //vcle_u8(band_table.val[0], vdup_n_u8(16)) - ORR v1.8b, v1.8b , v9.8b //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp) + cmhs v22.8b, v29.8b , v1.8b //vcle_u8(band_table.val[0], vdup_n_u8(16)) + ORR v1.8b, v1.8b , v22.8b //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp) - AND v2.8b, v2.8b , v10.8b //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp) + AND v2.8b, v2.8b , v23.8b //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp) SAO_BAND_POS_0: CMP x5,#0 //case 0 BNE SWITCH_BREAK - cmhs v9.8b, v29.8b , v1.8b //vcle_u8(band_table.val[0], vdup_n_u8(16)) - AND v1.8b, v1.8b , v9.8b //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp) + cmhs v22.8b, v29.8b , v1.8b //vcle_u8(band_table.val[0], vdup_n_u8(16)) + AND v1.8b, v1.8b , v22.8b //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp) SWITCH_BREAK: @@ -236,9 +239,11 @@ HEIGHT_LOOP: ADD x0,x0,#8 BNE SWITCH_BREAK_1 - // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP + // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP ldp x19, x20,[sp], #16 - pop_v_regs + ldp d8,d15,[sp],#16 // Loading d15 using { ldr d15,[sp]; add sp,sp,#8 } is giving bus error. + // d8 is used as dummy register and loaded along with d15 using ldp. d8 is not used in the function. + ldp d13,d14,[sp],#16 ret diff --git a/common/arm64/ihevc_sao_edge_offset_class0.s b/common/arm64/ihevc_sao_edge_offset_class0.s index f7d6621..91146e8 100644 --- a/common/arm64/ihevc_sao_edge_offset_class0.s +++ b/common/arm64/ihevc_sao_edge_offset_class0.s @@ -78,7 +78,7 @@ ihevc_sao_edge_offset_class0_av8: LDR x10,[sp,#16] //Loads ht AND x10,x10,0xFFFFFFFF // Since argument is passed as WORD32, Using only lower half of x10 - push_v_regs + stp x19, x20,[sp,#-16]! 
movi v2.16b, #2 //const_2 = vdupq_n_s8(2) @@ -93,15 +93,15 @@ ihevc_sao_edge_offset_class0_av8: ADRP x14, :got:gi1_table_edge_idx //table pointer LDR x14, [x14, #:got_lo12:gi1_table_edge_idx] - movi v8.16b, #0xFF //au1_mask = vdupq_n_s8(-1) + movi v3.16b, #0xFF //au1_mask = vdupq_n_s8(-1) STRB w12,[x4] //*pu1_src_top_left = pu1_src_top[wd - 1] MOV x6,x0 //pu1_src_org - LD1 {v10.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) + LD1 {v5.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) SUB x4,x10,#1 //(ht - 1) MOV x12,x9 //Move wd to x12 for loop count - LD1 {v11.8b},[x8] //offset_tbl = vld1_s8(pi1_sao_offset) + LD1 {v7.8b},[x8] //offset_tbl = vld1_s8(pi1_sao_offset) mul x4, x4, x1 //(ht - 1) * src_strd ADD x4,x4,x0 //pu1_src[(ht - 1) * src_strd] @@ -123,18 +123,18 @@ WIDTH_LOOP_16: CMP x8,x9 //if(col == wd) BNE AU1_MASK_FF //jump to else part LDRB w12,[x7] //pu1_avail[0] - mov v8.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) + mov v3.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) B SKIP_AU1_MASK_FF //Skip the else part AU1_MASK_FF: MOV x12,#0xFF //move -1 to x12 - mov v8.8b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v3.8b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) SKIP_AU1_MASK_FF: CMP x8,#16 //If col == 16 BNE SKIP_MASKING_IF_NOT16 //If not skip masking LDRB w12,[x7,#1] //pu1_avail[1] - mov v8.b[15], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v3.b[15], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_MASKING_IF_NOT16: MOV x12,x0 //pu1_src_cpy = pu1_src @@ -142,24 +142,24 @@ SKIP_MASKING_IF_NOT16: PU1_SRC_LOOP: LDRB w11,[x2] //load pu1_src_left since ht - row =0 when it comes first pu1_src_left is incremented later - LD1 {v12.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy) + LD1 {v17.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy) SUB x5,x9,x8 //wd - col SUB x14,x10,x4 //ht - row - mov v14.8b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) + mov v21.8b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) mul x14, x14, x1 //(ht - row) * src_strd LD1 {v26.16b},[x12] //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy) - EXT v14.16b, v14.16b , v12.16b,#15 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15) + EXT v21.16b, v21.16b , v17.16b,#15 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15) ADD x5,x14,x5 //(ht - row) * src_strd + (wd - col) LDRB w11,[x2, #1] //II Iteration load pu1_src_left since ht - row + 1 =1 - cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v16.16b, v17.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) LDRB w14,[x6,x5] //pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)] SUB x4,x4,#1 mov v28.8b[15], w11 //II Iteration vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) - cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v18.16b, v21.16b , v17.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) SUB x12,x12,x1 //Decrement the pu1_src pointer by src_strd SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) @@ -170,11 +170,11 @@ PU1_SRC_LOOP: SUB x5,x9,x8 //II wd - col ADD x12,x12,x1 //Increment the pu1_src pointer by src_strd - mov v14.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) + mov v21.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) cmhi v30.16b, v26.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) LDRB w11,[x12,#16] //II 
pu1_src_cpy[16] - EXT v14.16b, v12.16b , v14.16b,#1 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1) + EXT v21.16b, v17.16b , v21.16b,#1 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1) SUB x14,x10,x4 //II ht - row cmhi v0.16b, v28.16b , v26.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) @@ -182,59 +182,59 @@ PU1_SRC_LOOP: SUB x12,x12,x1 //Decrement the pu1_src pointer by src_strd mul x14, x14, x1 //II (ht - row) * src_strd - cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v16.16b, v17.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) ADD x5,x14,x5 //II (ht - row) * src_strd + (wd - col) - cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v18.16b, v21.16b , v17.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) EXT v28.16b, v26.16b , v28.16b,#1 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1) LDRB w14,[x6,x5] //II pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)] SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) SUBS x4,x4,#1 //Decrement row by 1 - ADD v14.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left) + ADD v21.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left) STRB w14,[x2],#1 //II pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)] - ADD v14.16b, v14.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right) - Uxtl v18.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + ADD v21.16b, v21.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right) + Uxtl v18.8h, v17.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SUB v20.16b, v0.16b , v30.16b //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - TBL v14.16b, {v10.16b},v14.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + TBL v21.16b, {v5.16b},v21.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) cmhi v30.16b, v26.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) cmhi v0.16b, v28.16b , v26.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) // TBL v15.8b, {v10.16b},v15.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) SUB v22.16b, v0.16b , v30.16b //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - AND v14.16b, v14.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) - TBL v16.16b, {v11.16b},v14.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + AND v21.16b, v21.16b , v3.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + TBL v16.16b, {v7.16b},v21.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) Uxtl v0.8h, v26.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) ADD v28.16b, v2.16b , v20.16b //II edge_idx = vaddq_s8(const_2, sign_left) ADD v28.16b, v28.16b , v22.16b //II edge_idx = vaddq_s8(edge_idx, sign_right) SADDW v18.8h, v18.8h , v16.8b - TBL v28.16b, {v10.16b},v28.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + TBL v28.16b, {v5.16b},v28.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) SMAX v18.8h, v18.8h , v4.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) // TBL v29.8b, {v10.16b},v29.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) UMIN v18.8h, v18.8h , v6.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - AND v28.16b, v28.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) + AND v28.16b, v28.16b , v3.16b //II edge_idx = vandq_s8(edge_idx, 
au1_mask) // TBL v17.8b, {v11.16b},v15.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - Uxtl2 v14.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) - TBL v30.16b, {v11.16b},v28.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - SADDW2 v14.8h, v14.8h , v16.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) + Uxtl2 v21.8h, v17.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + TBL v30.16b, {v7.16b},v28.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + SADDW2 v21.8h, v21.8h , v16.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) - SMAX v14.8h, v14.8h , v4.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) + SMAX v21.8h, v21.8h , v4.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) // TBL v31.8b, {v11.16b},v29.8b //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - UMIN v14.8h, v14.8h , v6.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) + UMIN v21.8h, v21.8h , v6.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) xtn v18.8b, v18.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) SADDW v0.8h, v0.8h , v30.8b - xtn v19.8b, v14.8h //vmovn_s16(pi2_tmp_cur_row.val[1]) + xtn v19.8b, v21.8h //vmovn_s16(pi2_tmp_cur_row.val[1]) SMAX v0.8h, v0.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) Uxtl2 v28.8h, v26.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) @@ -271,52 +271,52 @@ WIDTH_RESIDUE: CMP x8,x9 //if(wd_rem == wd) BNE AU1_MASK_FF_RESIDUE //jump to else part LDRB w12,[x7] //pu1_avail[0] - mov v8.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) + mov v3.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) B SKIP_AU1_MASK_FF_RESIDUE //Skip the else part AU1_MASK_FF_RESIDUE: MOV x12,#0xFF //move -s to x12 - mov v8.8b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v3.8b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) SKIP_AU1_MASK_FF_RESIDUE: LDRB w11,[x7,#1] //pu1_avail[1] SUB x5,x9,#1 //wd - 1 MOV x4,x10 //move ht to x4 for loop count - mov v8.8b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v3.8b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) MOV x12,x0 //pu1_src_cpy = pu1_src PU1_SRC_LOOP_RESIDUE: - LD1 {v12.16b},[x12] //pu1_cur_row = vld1q_u8(pu1_src_cpy) + LD1 {v17.16b},[x12] //pu1_cur_row = vld1q_u8(pu1_src_cpy) LDRB w11,[x2] //load pu1_src_left - mov v14.8b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) - EXT v14.16b, v14.16b , v12.16b,#15 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15) + mov v21.8b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) + EXT v21.16b, v21.16b , v17.16b,#15 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15) - cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) - cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v16.16b, v17.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v18.16b, v21.16b , v17.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) LDRB w11,[x12,#16] //pu1_src_cpy[16] - mov v14.8b[0], w11 //pu1_cur_row_tmp = 
vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) - EXT v14.16b, v12.16b , v14.16b,#1 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1) + mov v21.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) + EXT v21.16b, v17.16b , v21.16b,#1 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1) - cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) - cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v16.16b, v17.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v18.16b, v21.16b , v17.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) ADD v24.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left) ADD v24.16b, v24.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right) - TBL v24.16b, {v10.16b},v24.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + TBL v24.16b, {v5.16b},v24.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) // TBL v25.8b, {v10.16b},v25.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - AND v24.16b, v24.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + AND v24.16b, v24.16b , v3.16b //edge_idx = vandq_s8(edge_idx, au1_mask) NEG v20.16b, v22.16b //sign_left = vnegq_s8(sign_right) EXT v20.16b, v20.16b , v22.16b,#15 //sign_left = vextq_s8(sign_left, sign_left, 15) - TBL v26.8b, {v11.16b},v24.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + TBL v26.8b, {v7.16b},v24.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + Uxtl v28.8h, v17.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SADDW v28.8h, v28.8h , v26.8b SMAX v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v28.8h, v28.8h , v6.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) @@ -337,7 +337,7 @@ PU1_SRC_LOOP_RESIDUE: END_LOOPS: // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP ldp x19, x20,[sp], #16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_sao_edge_offset_class0_chroma.s b/common/arm64/ihevc_sao_edge_offset_class0_chroma.s index d854c62..c6be41a 100644 --- a/common/arm64/ihevc_sao_edge_offset_class0_chroma.s +++ b/common/arm64/ihevc_sao_edge_offset_class0_chroma.s @@ -74,7 +74,7 @@ ihevc_sao_edge_offset_class0_chroma_av8: ldr w10,[sp,#16] ldr w11,[sp,#24] - push_v_regs + // STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments stp x19, x20,[sp,#-16]! 
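
The push_v_regs / pop_v_regs replacement in the hunk above is the recurring pattern of this change. Those macros (defined in ihevc_neon_macros.s) spill and reload all of d8-d15 because AAPCS64 requires a callee to preserve the low 64 bits of v8-v15, whereas v0-v7 and v16-v31 are caller-saved. Once every live value is renamed onto caller-saved registers (v8->v3, v10->v5, v11->v7, v12->v19, v14->v21 in this file), the function no longer touches d8-d15 and the SIMD spills can be dropped, leaving only the x19/x20 pair. The sketch below, with hypothetical function names and not taken from the patch, shows the two prologue/epilogue shapes involved. It also illustrates the bus-error comments attached to the d9/d15 stores elsewhere in this change: with SP alignment checking enabled, as it normally is under Android/Linux, AArch64 faults on any load or store that uses a misaligned SP as its base register, so { sub sp,sp,#8; str d9,[sp] } traps, and pairing the live register with an unused one in a single 16-byte stp/ldp keeps SP aligned.

        .text
        .global spill_simd_pair_sketch      // hypothetical name, illustration only
    spill_simd_pair_sketch:
        stp     d8, d9, [sp, #-16]!         // 16-byte pre-decrement keeps SP aligned;
                                            // d8 is a dummy when only d9 is live
        // ... body may clobber v8-v15, since they were saved above ...
        ldp     d8, d9, [sp], #16
        ret

        .global no_simd_spill_sketch        // hypothetical name, illustration only
    no_simd_spill_sketch:
        stp     x19, x20, [sp, #-16]!       // only general-purpose temporaries spilled
        // ... body keeps SIMD work in v0-v7 and v16-v31 (caller-saved),
        //     so no d-register save/restore is needed ...
        ldp     x19, x20, [sp], #16
        ret

Under those assumptions, each function that stops using d8-d15 drops up to four stp/ldp pairs from each of its prologue and epilogue, which is the stack-operation reduction the commit message describes.
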
@@ -111,15 +111,15 @@ ihevc_sao_edge_offset_class0_chroma_av8: ADRP x14, :got:gi1_table_edge_idx //table pointer LDR x14, [x14, #:got_lo12:gi1_table_edge_idx] - movi v8.16b, #0xFF //au1_mask = vdupq_n_s8(-1) + movi v3.16b, #0xFF //au1_mask = vdupq_n_s8(-1) mul x4, x4, x1 //(ht - 1) * src_strd MOV x5, x23 //Loads pi1_sao_offset_v - LD1 {v11.8b},[x8] //offset_tbl = vld1_s8(pi1_sao_offset_u) + LD1 {v7.8b},[x8] //offset_tbl = vld1_s8(pi1_sao_offset_u) ADD x4,x4,x0 //pu1_src[(ht - 1) * src_strd] MOV x6,x0 //pu1_src_org - LD1 {v10.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) + LD1 {v5.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) MOV x12,x9 //Move wd to x12 for loop count SRC_TOP_LOOP: //wd is always multiple of 8 @@ -141,20 +141,20 @@ WIDTH_LOOP_16: CMP x8,x9 //if(col == wd) BNE AU1_MASK_FF //jump to else part LDRB w12,[x7] //pu1_avail[0] - mov v8.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) - mov v8.8b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 1) + mov v3.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) + mov v3.8b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 1) B SKIP_AU1_MASK_FF //Skip the else part AU1_MASK_FF: MOV x12,#-1 //move -1 to x12 - mov v8.4h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v3.4h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) SKIP_AU1_MASK_FF: CMP x8,#16 //If col == 16 BNE SKIP_MASKING_IF_NOT16 //If not skip masking LDRB w12,[x7,#1] //pu1_avail[1] - mov v8.8b[14], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14) - mov v8.8b[15], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v3.8b[14], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14) + mov v3.8b[15], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_MASKING_IF_NOT16: MOV x12,x0 //pu1_src_cpy = pu1_src @@ -162,27 +162,27 @@ SKIP_MASKING_IF_NOT16: PU1_SRC_LOOP: LDRH w11,[x2] //load pu1_src_left since ht - row =0 when it comes first pu1_src_left is incremented later - LD1 {v12.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy) + LD1 {v19.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy) //LD1 {v13.8b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy) //SUB x12, x12,#8 SUB x5,x9,x8 //wd - col SUB x14,x10,x4 //ht - row - mov v14.4h[7], w11 //vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15) + mov v21.4h[7], w11 //vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15) mul x14, x14, x1 //(ht - row) * src_strd LD1 {v30.16b},[x12] //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy) //LD1 {v31.8b},[x12] //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy) //SUB x12, x12,#8 - EXT v14.16b, v14.16b , v12.16b,#14 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14) + EXT v21.16b, v21.16b , v19.16b,#14 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14) SUB x12,x12,x1 LDRH w11,[x2,#2] //II load pu1_src_left since ht - row =0 - cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) ADD x5,x14,x5 //(ht - row) * src_strd + (wd - col) mov v28.4h[7], w11 //II vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15) - cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) LDRH w14,[x6,x5] //pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)] SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) @@ -191,7 +191,7 @@ PU1_SRC_LOOP: LDRB w11,[x12,#16] //pu1_src_cpy[16] 
EXT v28.16b, v28.16b , v30.16b,#14 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14) - mov v14.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) + mov v21.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) LDRB w11,[x12,#17] //pu1_src_cpy[17] @@ -199,62 +199,62 @@ PU1_SRC_LOOP: STRH w14,[x2],#2 //pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)] ADD x12,x12,x1 - mov v14.8b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) + mov v21.8b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) LDRB w11,[x12,#16] //II pu1_src_cpy[16] - EXT v14.16b, v12.16b , v14.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2) + EXT v21.16b, v19.16b , v21.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2) mov v28.8b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) LDRB w11,[x12,#17] //II pu1_src_cpy[17] - cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) SUB x12,x12,x1 - cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) mov v28.8b[1], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) EXT v28.16b, v30.16b , v28.16b,#2 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2) - ADD v14.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left) + ADD v21.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left) - mov v10.d[1],v10.d[0] - ADD v14.16b, v14.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right) - TBL v14.16b, {v10.16b},v14.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + mov v5.d[1],v5.d[0] + ADD v21.16b, v21.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right) + TBL v21.16b, {v5.16b},v21.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) SUB v20.16b, v24.16b , v26.16b //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) // TBL v15.8b, {v10.16b},v15.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) cmhi v24.16b, v28.16b , v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) - AND v14.16b, v14.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) - mov v15.d[0],v14.d[1] - UZP1 v1.8b, v14.8b, v15.8b - UZP2 v15.8b, v14.8b, v15.8b - mov v14.8b, v1.8b + AND v21.16b, v21.16b , v3.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + mov v23.d[0],v21.d[1] + UZP1 v1.8b, v21.8b, v23.8b + UZP2 v23.8b, v21.8b, v23.8b + mov v21.8b, v1.8b //mov v11.d[1],v0.d[0] //mov v14.d[1],v15.d[0] SUB v22.16b, v24.16b , v26.16b //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - TBL v16.8b, {v11.16b},v14.8b //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx)) + TBL v16.8b, {v7.16b},v21.8b //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx)) ADD v24.16b, v2.16b , v20.16b //II edge_idx = vaddq_s8(const_2, sign_left) - Uxtl v18.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - TBL v17.8b, {v0.16b},v15.8b + Uxtl v18.8h, v19.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + TBL v17.8b, {v0.16b},v23.8b ADD v24.16b, v24.16b , 
v22.16b //II edge_idx = vaddq_s8(edge_idx, sign_right) //mov v17.d[0],v16.d[1] ZIP1 v1.8b, v16.8b, v17.8b ZIP2 v17.8b, v16.8b, v17.8b mov v16.8b, v1.8b - TBL v24.16b, {v10.16b},v24.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - Uxtl2 v12.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + TBL v24.16b, {v5.16b},v24.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + Uxtl2 v19.8h, v19.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) //mov v16.d[1],v17.d[0] SADDW v18.8h, v18.8h , v16.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) //TBL v25.8b, {v10.16b},v25.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) SMAX v18.8h, v18.8h , v4.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) - AND v24.16b, v24.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) + AND v24.16b, v24.16b , v3.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) mov v25.d[0],v24.d[1] UMIN v18.8h, v18.8h , v6.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) UZP1 v1.8b, v24.8b, v25.8b @@ -262,16 +262,16 @@ PU1_SRC_LOOP: mov v24.8b, v1.8b //mov v24.d[1],v25.d[0] - SADDW v12.8h, v12.8h , v17.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) - TBL v26.8b, {v11.16b},v24.8b //II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx)) - SMAX v12.8h, v12.8h , v4.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) + SADDW v19.8h, v19.8h , v17.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) + TBL v26.8b, {v7.16b},v24.8b //II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx)) + SMAX v19.8h, v19.8h , v4.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) - UMIN v12.8h, v12.8h , v6.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) + UMIN v19.8h, v19.8h , v6.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) TBL v27.8b, {v0.16b},v25.8b //II - xtn v14.8b, v18.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) + xtn v21.8b, v18.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) //mov v27.d[0],v26.d[1] - xtn v15.8b, v12.8h //vmovn_s16(pi2_tmp_cur_row.val[1]) + xtn v23.8b, v19.8h //vmovn_s16(pi2_tmp_cur_row.val[1]) ZIP1 v1.8b, v26.8b, v27.8b ZIP2 v27.8b, v26.8b, v27.8b //II mov v26.8b, v1.8b @@ -295,7 +295,9 @@ PU1_SRC_LOOP: Uxtl2 v30.8h, v30.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) SADDW v30.8h, v30.8h , v27.8b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) - ST1 {v14.8b, v15.8b},[x12],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) + ST1 {v21.8b},[x12],#8 //vst1q_u8(pu1_src_cpy, pu1_cur_row) + ST1 {v23.8b},[x12],x1 + SUB x12,x12,#8 SMAX v30.8h, v30.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) SUBS x4,x4,#1 //Decrement row by 1 @@ -326,107 +328,107 @@ WIDTH_RESIDUE: CMP x8,x9 //if(wd_rem == wd) BNE AU1_MASK_FF_RESIDUE //jump to else part LDRB w12,[x7] //pu1_avail[0] - mov v8.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) - mov v8.8b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) + mov v3.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) + mov v3.8b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) B SKIP_AU1_MASK_FF_RESIDUE //Skip the else part 
AU1_MASK_FF_RESIDUE: MOV x12,#-1 //move -1 to x12 - mov v8.4h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v3.4h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) SKIP_AU1_MASK_FF_RESIDUE: LDRB w12,[x7,#1] //pu1_avail[1] - mov v8.8b[6], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) - mov v8.8b[7], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v3.8b[6], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v3.8b[7], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) MOV x12,x0 //pu1_src_cpy = pu1_src MOV x4,x10 //move ht to x4 for loop count PU1_SRC_LOOP_RESIDUE: LDRH w11,[x2] //load pu1_src_left - LD1 {v12.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy) + LD1 {v19.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy) //LD1 {v13.8b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy) //SUB x12, x12,#8 SUB x5,x9,#2 //wd - 2 SUB x14,x10,x4 //(ht - row) - mov v14.4h[7], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) + mov v21.4h[7], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) LSL x14,x14,#1 //(ht - row) * 2 LD1 {v30.16b},[x12] //II pu1_cur_row = vld1q_u8(pu1_src_cpy) //LD1 {v31.8b},[x12] //II pu1_cur_row = vld1q_u8(pu1_src_cpy) //SUB x12, x12,#8 - EXT v14.16b, v14.16b , v12.16b,#14 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15) + EXT v21.16b, v21.16b , v19.16b,#14 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15) SUB x12,x12,x1 LDRH w11,[x2,#2] //II load pu1_src_left - cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) mul x14, x14, x1 //(ht - row) * 2 * src_strd - cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) mov v28.4h[7], w11 //II vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) LDRB w11,[x12,#16] //pu1_src_cpy[16] SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) ADD x5,x14,x5 //(ht - row) * 2 * src_strd + (wd - 2) - mov v14.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) + mov v21.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) EXT v28.16b, v28.16b , v30.16b,#14 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15) LDRB w11,[x12,#17] //pu1_src_cpy[17] cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) LDRH w14,[x6, x5] //pu1_src_org[(ht - row) * 2* src_strd + (wd - 2)] - mov v14.8b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) + mov v21.8b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) cmhi v24.16b, v28.16b , v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) ADD x12,x12,x1 STRH w14,[x2],#2 //pu1_src_left[(ht - row) * 2] = au1_src_left_tmp[(ht - row) * 2] - EXT v14.16b, v12.16b , v14.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1) + EXT v21.16b, v19.16b , v21.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1) LDRB w11,[x12,#16] //II pu1_src_cpy[16] - cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) mov v28.8b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) LDRB w11,[x12,#17] //II pu1_src_cpy[17] - cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi 
v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) SUB x4,x4,#1 //II Decrement row by 1 SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) mov v28.8b[1], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) SUB x12,x12,x1 - ADD v14.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left) + ADD v21.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left) EXT v28.16b, v30.16b , v28.16b,#2 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1) - ADD v14.16b, v14.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right) + ADD v21.16b, v21.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right) SUB v20.16b, v24.16b , v26.16b //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - TBL v14.16b, {v10.16b},v14.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + TBL v21.16b, {v5.16b},v21.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) cmhi v24.16b, v28.16b , v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) //TBL v15.8b, {v10.16b},v15.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) SUB v22.16b, v24.16b , v26.16b //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - AND v14.16b, v14.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) - mov v15.d[0],v14.d[1] - UZP1 v1.8b, v14.8b, v15.8b - UZP2 v15.8b, v14.8b, v15.8b - mov v14.8b, v1.8b + AND v21.16b, v21.16b , v3.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + mov v23.d[0],v21.d[1] + UZP1 v1.8b, v21.8b, v23.8b + UZP2 v23.8b, v21.8b, v23.8b + mov v21.8b, v1.8b ADD v28.16b, v2.16b , v20.16b //II edge_idx = vaddq_s8(const_2, sign_left) - TBL v16.8b, {v11.16b},v14.8b //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx)) + TBL v16.8b, {v7.16b},v21.8b //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx)) ADD v28.16b, v28.16b , v22.16b //II edge_idx = vaddq_s8(edge_idx, sign_right) - Uxtl v18.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - TBL v17.8b, {v0.16b},v15.8b + Uxtl v18.8h, v19.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + TBL v17.8b, {v0.16b},v23.8b Uxtl v24.8h, v30.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) ZIP1 v1.8b, v16.8b, v17.8b ZIP2 v17.8b, v16.8b, v17.8b mov v16.8b, v1.8b - TBL v28.16b, {v10.16b},v28.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + TBL v28.16b, {v5.16b},v28.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) SADDW v18.8h, v18.8h , v16.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v18.8h, v18.8h , v4.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) @@ -434,7 +436,7 @@ PU1_SRC_LOOP_RESIDUE: UMIN v18.8h, v18.8h , v6.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) xtn v18.8b, v18.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) - AND v28.16b, v28.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) + AND v28.16b, v28.16b , v3.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) mov v29.d[0],v28.d[1] SUB x5,x9,#2 //II wd - 2 UZP1 v1.8b, v28.8b, v29.8b @@ -443,7 +445,7 @@ PU1_SRC_LOOP_RESIDUE: SUB x14,x10,x4 //II (ht - row) LSL x14,x14,#1 //II (ht - row) * 2 - TBL v26.8b, {v11.16b},v28.8b //II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx)) + TBL v26.8b, {v7.16b},v28.8b //II offset = vtbl1_s8(offset_tbl_u, 
vget_low_s8(edge_idx)) mul x14, x14, x1 //II (ht - row) * 2 * src_strd ADD x5,x14,x5 //II (ht - row) * 2 * src_strd + (wd - 2) @@ -474,7 +476,7 @@ END_LOOPS: ldp x23, x24,[sp],#16 ldp x21, x22,[sp],#16 ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_sao_edge_offset_class1.s b/common/arm64/ihevc_sao_edge_offset_class1.s index 8ed6169..515b349 100644 --- a/common/arm64/ihevc_sao_edge_offset_class1.s +++ b/common/arm64/ihevc_sao_edge_offset_class1.s @@ -76,7 +76,7 @@ ihevc_sao_edge_offset_class1_av8: LDR w7,[sp,#8] //Loads wd LDR w8,[sp,#16] //Loads ht - push_v_regs + stp x19, x20,[sp,#-16]! SUB x9,x7,#1 //wd - 1 @@ -128,16 +128,16 @@ WIDTH_LOOP_16: MOV x10,x0 //*pu1_src - LD1 {v8.16b},[x9],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd) - LD1 {v10.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v1.16b},[x9],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd) + LD1 {v3.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src) LD1 {v30.16b},[x12],#16 //vld1q_u8(pu1_src[(ht - 1) * src_strd]) - cmhi v12.16b, v10.16b , v8.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v5.16b, v3.16b , v1.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) ST1 { v30.16b},[x3],#16 //vst1q_u8(pu1_src_top[col]) - cmhi v14.16b, v8.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v1.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v16.16b, v14.16b , v12.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v16.16b, v17.16b , v5.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) MOV x11,x8 //move ht to x11 for loop count PU1_SRC_LOOP: @@ -145,59 +145,59 @@ PU1_SRC_LOOP: LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) ADD x6,x10,x1 //II Iteration *pu1_src + src_strd - cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) LD1 {v30.16b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) - cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row) SUB x10,x10,x1 - SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v20.16b, v17.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) Uxtl v26.8h, v18.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - ADD v12.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v5.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) Uxtl2 v28.8h, v18.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) - ADD v12.16b, v12.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down) + ADD v5.16b, v5.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down) cmhi v22.16b, v18.16b , v30.16b //II vcgtq_u8(pu1_cur_row, pu1_top_row) NEG v16.16b, v20.16b //sign_up = vnegq_s8(sign_down) - TBL v12.16b, {v6.16b},v12.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + TBL v5.16b, {v6.16b},v5.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) cmhi v24.16b, v30.16b , v18.16b //II vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v8.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v1.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) // TBL v13.8b, {v6.16b},v13.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) ADD v22.16b, v0.16b , v16.16b //II edge_idx = vaddq_s8(const_2, sign_up) - NEG 
v16.16b, v8.16b //II sign_up = vnegq_s8(sign_down) - TBL v12.16b, {v7.16b},v12.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - ADD v22.16b, v22.16b , v8.16b //II edge_idx = vaddq_s8(edge_idx, sign_down) + NEG v16.16b, v1.16b //II sign_up = vnegq_s8(sign_down) + TBL v5.16b, {v7.16b},v5.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + ADD v22.16b, v22.16b , v1.16b //II edge_idx = vaddq_s8(edge_idx, sign_down) - Uxtl v20.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v20.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) TBL v22.16b, {v6.16b},v22.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - SADDW v20.8h, v20.8h , v12.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) + SADDW v20.8h, v20.8h , v5.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) // TBL v23.8b, {v6.16b},v23.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) UMIN v20.8h, v20.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - Uxtl2 v8.8h, v10.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v1.8h, v3.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) // TBL v13.8b, {v7.16b},v13.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - mov v10.16b, v30.16b //II pu1_cur_row = pu1_next_row + mov v3.16b, v30.16b //II pu1_cur_row = pu1_next_row - SADDW2 v8.8h, v8.8h , v12.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) + SADDW2 v1.8h, v1.8h , v5.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) TBL v24.16b, {v7.16b},v22.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - SMAX v8.8h, v8.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) + SMAX v1.8h, v1.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) - UMIN v8.8h, v8.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) + UMIN v1.8h, v1.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) // TBL v25.8b, {v7.16b},v23.8b //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) xtn v20.8b, v20.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) SADDW v26.8h, v26.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) - xtn2 v20.16b, v8.8h //vmovn_s16(pi2_tmp_cur_row.val[1]) + xtn2 v20.16b, v1.8h //vmovn_s16(pi2_tmp_cur_row.val[1]) SADDW2 v28.8h, v28.8h , v24.16b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) @@ -220,9 +220,9 @@ PU1_SRC_LOOP: ADD x10,x10,x1 //*pu1_src + src_strd LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) - cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) - cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + SUB v20.16b, v17.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 
SUB x10,x10,x1 ADD v22.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) @@ -231,13 +231,13 @@ PU1_SRC_LOOP: // TBL v23.8b, {v6.16b},v23.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) TBL v24.16b, {v7.16b},v22.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - Uxtl v26.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v26.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SADDW v26.8h, v26.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) // TBL v25.8b, {v7.16b},v23.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - Uxtl2 v28.8h, v10.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v28.8h, v3.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) SADDW2 v28.8h, v28.8h , v24.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) @@ -248,7 +248,7 @@ PU1_SRC_LOOP: ST1 { v30.16b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) PU1_SRC_LOOP_END: - mov v10.16b, v18.16b //pu1_cur_row = pu1_next_row + mov v3.16b, v18.16b //pu1_cur_row = pu1_next_row SUBS x7,x7,#16 //Decrement the wd loop count by 16 CMP x7,#8 //Check whether residue remains BEQ WIDTH_RESIDUE //If residue remains jump to residue loop @@ -264,15 +264,15 @@ WIDTH_RESIDUE: csel x9, x3, x9,NE //*pu1_src_top MOV x10,x0 - LD1 {v8.16b},[x9],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd) - LD1 {v10.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v1.16b},[x9],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd) + LD1 {v3.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src) LD1 {v30.8b},[x12] //vld1_u8(pu1_src[(ht - 1) * src_strd]) ST1 {v30.8b},[x3] //vst1_u8(pu1_src_top[col]) - cmhi v12.16b, v10.16b , v8.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) - cmhi v14.16b, v8.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v16.16b, v14.16b , v12.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v5.16b, v3.16b , v1.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v1.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + SUB v16.16b, v17.16b , v5.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) MOV x11,x8 //move ht to x11 for loop count PU1_SRC_LOOP_RESIDUE: @@ -280,33 +280,33 @@ PU1_SRC_LOOP_RESIDUE: LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) ADD x6,x10,x1 //II Iteration *pu1_src + src_strd - cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row) + cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row) LD1 {v30.16b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) - cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row) + cmhi v17.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row) SUB x10,x10,x1 - SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v20.16b, v17.16b , v5.16b 
//sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) Uxtl v26.8h, v18.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - ADD v12.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v5.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) cmhi v22.16b, v18.16b , v30.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row) - ADD v12.16b, v12.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down) + ADD v5.16b, v5.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down) cmhi v24.16b, v30.16b , v18.16b //II vcltq_u8(pu1_cur_row, pu1_next_row) NEG v16.16b, v20.16b //sign_up = vnegq_s8(sign_down) - TBL v12.8b, {v6.16b},v12.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + TBL v5.8b, {v6.16b},v5.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) SUB v20.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) ADD v22.16b, v0.16b , v16.16b //II edge_idx = vaddq_s8(const_2, sign_up) - TBL v12.8b, {v7.16b},v12.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + TBL v5.8b, {v7.16b},v5.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) NEG v16.16b, v20.16b //II sign_up = vnegq_s8(sign_down) ADD v22.16b, v22.16b , v20.16b //II edge_idx = vaddq_s8(edge_idx, sign_down) - Uxtl v20.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v20.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - SADDW v20.8h, v20.8h , v12.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) + SADDW v20.8h, v20.8h , v5.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) TBL v22.8b, {v6.16b},v22.8b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) @@ -318,7 +318,7 @@ PU1_SRC_LOOP_RESIDUE: SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - mov v10.16b, v30.16b //II pu1_cur_row = pu1_next_row + mov v3.16b, v30.16b //II pu1_cur_row = pu1_next_row ST1 {v20.8b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) xtn v30.8b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[0]) @@ -332,9 +332,9 @@ PU1_SRC_LOOP_RESIDUE: ADD x10,x10,x1 //*pu1_src + src_strd LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) - cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row) - cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row) - SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row) + cmhi v17.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row) + SUB v20.16b, v17.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) SUB x10,x10,x1 ADD v22.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) @@ -342,7 +342,7 @@ PU1_SRC_LOOP_RESIDUE: TBL v22.8b, {v6.16b},v22.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) TBL v24.8b, {v7.16b},v22.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - Uxtl v26.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v26.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SADDW v26.8h, v26.8h , 
v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) @@ -354,7 +354,7 @@ PU1_SRC_LOOP_RESIDUE: END_LOOPS: // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP ldp x19, x20,[sp], #16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_sao_edge_offset_class1_chroma.s b/common/arm64/ihevc_sao_edge_offset_class1_chroma.s index 4baa5bf..894e702 100644 --- a/common/arm64/ihevc_sao_edge_offset_class1_chroma.s +++ b/common/arm64/ihevc_sao_edge_offset_class1_chroma.s @@ -76,7 +76,7 @@ ihevc_sao_edge_offset_class1_chroma_av8: ldr w11,[sp,#24] - push_v_regs + // STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments stp x19, x20,[sp,#-16]! stp x21, x22,[sp,#-16]! @@ -135,7 +135,7 @@ SRC_LEFT_LOOP: LDR x14, [x14, #:got_lo12:gi1_table_edge_idx] LD1 {v6.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) LD1 {v7.8b},[x6] //offset_tbl_u = vld1_s8(pi1_sao_offset_u) - LD1 {v8.8b},[x7] //offset_tbl_v = vld1_s8(pi1_sao_offset_v) + LD1 {v1.8b},[x7] //offset_tbl_v = vld1_s8(pi1_sao_offset_v) CMP x8,#16 //Compare wd with 16 BLT WIDTH_RESIDUE //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case @@ -151,17 +151,17 @@ WIDTH_LOOP_16: LD1 {v28.16b},[x11],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd) //LD1 {v29.8b},[x11],#8 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd) - LD1 {v10.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v3.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src) //LD1 {v11.8b},[x0],#8 //pu1_cur_row = vld1q_u8(pu1_src) LD1 {v30.16b},[x12],#16 //vld1q_u8(pu1_src[(ht - 1) * src_strd]) //LD1 {v31.8b},[x12],#8 //vld1q_u8(pu1_src[(ht - 1) * src_strd]) - cmhi v12.16b, v10.16b , v28.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v5.16b, v3.16b , v28.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) ST1 { v30.16b},[x3],#16 //vst1q_u8(pu1_src_top[col]) - cmhi v14.16b, v28.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v19.16b, v28.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v16.16b, v14.16b , v12.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v16.16b, v19.16b , v5.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) MOV x11,x9 //move ht to x11 for loop count PU1_SRC_LOOP: @@ -172,47 +172,47 @@ PU1_SRC_LOOP: ADD x6,x10,x1 //II Iteration *pu1_src + src_strd //mov v19.d[0],v18.d[1] - cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) LD1 {v30.16b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) //LD1 {v31.8b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) //SUB x6, x6,#8 - cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v19.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row) SUB x10,x10,x1 - SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v20.16b, v19.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) Uxtl v26.8h, v18.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - ADD v12.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v5.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) Uxtl2 v28.8h, v18.16b //II pi2_tmp_cur_row.val[1] = 
vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) - ADD v12.16b, v12.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down) + ADD v5.16b, v5.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down) cmhi v22.16b, v18.16b , v30.16b //II vcgtq_u8(pu1_cur_row, pu1_top_row) mov v16.d[1],v16.d[0] NEG v16.16b, v20.16b //sign_up = vnegq_s8(sign_down) - TBL v12.16b, {v6.16b},v12.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + TBL v5.16b, {v6.16b},v5.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) cmhi v24.16b, v30.16b , v18.16b //II vcltq_u8(pu1_cur_row, pu1_top_row) SUB v28.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) //TBL v13.8b, {v6.16b},v13.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) ADD v22.16b, v0.16b , v16.16b //II edge_idx = vaddq_s8(const_2, sign_up) - mov v13.d[0], v12.d[1] - UZP1 v27.8b, v12.8b, v13.8b - UZP2 v13.8b, v12.8b, v13.8b - mov v12.8b,v27.8b + mov v17.d[0], v5.d[1] + UZP1 v27.8b, v5.8b, v17.8b + UZP2 v17.8b, v5.8b, v17.8b + mov v5.8b,v27.8b NEG v16.16b, v28.16b //II sign_up = vnegq_s8(sign_down) - TBL v12.8b, {v7.16b},v12.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + TBL v5.8b, {v7.16b},v5.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) ADD v22.16b, v22.16b , v28.16b //II edge_idx = vaddq_s8(edge_idx, sign_down) - Uxtl v20.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - TBL v13.8b, {v8.16b},v13.8b - ZIP1 v27.8b, v12.8b, v13.8b - ZIP2 v13.8b, v12.8b, v13.8b - mov v12.8b,v27.8b + Uxtl v20.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + TBL v17.8b, {v1.16b},v17.8b + ZIP1 v27.8b, v5.8b, v17.8b + ZIP2 v17.8b, v5.8b, v17.8b + mov v5.8b,v27.8b - SADDW v20.8h, v20.8h , v12.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) + SADDW v20.8h, v20.8h , v5.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) TBL v22.16b, {v6.16b},v22.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) @@ -224,15 +224,15 @@ PU1_SRC_LOOP: UZP2 v23.8b, v22.8b, v23.8b mov v22.8b,v27.8b - Uxtl2 v28.8h, v10.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v28.8h, v3.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) //VTBL.8 D13,D7,D13 @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - mov v10.16b, v30.16b //II pu1_cur_row = pu1_next_row + mov v3.16b, v30.16b //II pu1_cur_row = pu1_next_row - SADDW v28.8h, v28.8h , v13.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) + SADDW v28.8h, v28.8h , v17.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) TBL v24.8b, {v7.16b},v22.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) - TBL v25.8b, {v8.16b},v23.8b + TBL v25.8b, {v1.16b},v23.8b ZIP1 v27.8b, v24.8b, v25.8b ZIP2 v25.8b, v24.8b, v25.8b mov v24.8b,v27.8b @@ -270,9 +270,9 @@ PU1_SRC_LOOP: LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) //LD1 {v19.8b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) //SUB x10, x10,#8 - cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) - cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v20.16b, v14.16b , v12.16b //sign_down = 
vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v19.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + SUB v20.16b, v19.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) SUB x10,x10,x1 ADD v22.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) @@ -285,19 +285,19 @@ PU1_SRC_LOOP: UZP2 v23.8b, v22.8b, v23.8b mov v22.8b,v27.8b TBL v24.8b, {v7.16b},v22.8b - TBL v25.8b, {v8.16b},v23.8b + TBL v25.8b, {v1.16b},v23.8b ZIP1 v27.8b, v24.8b, v25.8b ZIP2 v25.8b, v24.8b, v25.8b mov v24.8b,v27.8b //VTBL.8 D24,D7,D22 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - Uxtl v26.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v26.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SADDW v26.8h, v26.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) //VTBL.8 D25,D7,D23 @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - Uxtl2 v28.8h, v10.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v28.8h, v3.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) SADDW v28.8h, v28.8h , v25.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) @@ -308,7 +308,7 @@ PU1_SRC_LOOP: ST1 { v30.16b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) PU1_SRC_LOOP_END: - mov v10.16b, v18.16b //pu1_cur_row = pu1_next_row + mov v3.16b, v18.16b //pu1_cur_row = pu1_next_row SUBS x8,x8,#16 //Decrement the wd loop count by 16 CMP x8,#8 //Check whether residue remains BEQ WIDTH_RESIDUE //If residue remains jump to residue loop @@ -326,15 +326,15 @@ WIDTH_RESIDUE: LD1 {v28.16b},[x11] //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd) //LD1 {v29.8b},[x11],#8 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd) - LD1 {v10.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v3.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //LD1 {v11.8b},[x0],#8 //pu1_cur_row = vld1q_u8(pu1_src) LD1 {v30.8b},[x12] //vld1_u8(pu1_src[(ht - 1) * src_strd]) ST1 {v30.8b},[x3] //vst1_u8(pu1_src_top[col]) - cmhi v12.16b, v10.16b , v28.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) - cmhi v14.16b, v28.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v16.16b, v14.16b , v12.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v5.16b, v3.16b , v28.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v19.16b, v28.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + SUB v16.16b, v19.16b , v5.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) MOV x11,x9 //move ht to x11 for loop count PU1_SRC_LOOP_RESIDUE: @@ -344,46 +344,46 @@ PU1_SRC_LOOP_RESIDUE: //SUB x10, x10,#8 ADD x6,x10,x1 //II Iteration *pu1_src + src_strd - cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row) + cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row) LD1 {v30.16b},[x6] //II 
pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) //LD1 {v31.8b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) //SUB x6, x6,#8 - cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row) + cmhi v19.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row) SUB x10,x10,x1 - SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v20.16b, v19.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) Uxtl v26.8h, v18.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - ADD v12.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v5.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) cmhi v22.16b, v18.16b , v30.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row) - ADD v12.16b, v12.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down) + ADD v5.16b, v5.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down) cmhi v24.16b, v30.16b , v18.16b //II vcltq_u8(pu1_cur_row, pu1_next_row) NEG v16.16b, v20.16b //sign_up = vnegq_s8(sign_down) - TBL v12.8b, {v6.16b},v12.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + TBL v5.8b, {v6.16b},v5.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) SUB v20.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - UZP1 v27.8b, v12.8b, v13.8b - UZP2 v13.8b, v12.8b, v13.8b - mov v12.8b,v27.8b + UZP1 v27.8b, v5.8b, v17.8b + UZP2 v17.8b, v5.8b, v17.8b + mov v5.8b,v27.8b ADD v22.16b, v0.16b , v16.16b //II edge_idx = vaddq_s8(const_2, sign_up) - TBL v12.8b, {v7.16b},v12.8b + TBL v5.8b, {v7.16b},v5.8b NEG v16.16b, v20.16b //II sign_up = vnegq_s8(sign_down) - TBL v13.8b, {v8.16b},v13.8b - ZIP1 v27.8b, v12.8b, v13.8b - ZIP2 v13.8b, v12.8b, v13.8b - mov v12.8b,v27.8b + TBL v17.8b, {v1.16b},v17.8b + ZIP1 v27.8b, v5.8b, v17.8b + ZIP2 v17.8b, v5.8b, v17.8b + mov v5.8b,v27.8b //VTBL.8 D12,D7,D12 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) ADD v22.16b, v22.16b , v20.16b //II edge_idx = vaddq_s8(edge_idx, sign_down) - Uxtl v20.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v20.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - SADDW v20.8h, v20.8h , v12.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) + SADDW v20.8h, v20.8h , v5.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) TBL v22.8b, {v6.16b},v22.8b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) @@ -395,7 +395,7 @@ PU1_SRC_LOOP_RESIDUE: TBL v24.8b, {v7.16b},v22.8b xtn v20.8b, v20.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) - TBL v25.8b, {v8.16b},v23.8b + TBL v25.8b, {v1.16b},v23.8b ZIP1 v27.8b, v24.8b, v25.8b ZIP2 v25.8b, v24.8b, v25.8b mov v24.8b,v27.8b @@ -405,7 +405,7 @@ PU1_SRC_LOOP_RESIDUE: SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - mov v10.16b, v30.16b //II pu1_cur_row = pu1_next_row + mov v3.16b, v30.16b //II pu1_cur_row = pu1_next_row ST1 {v20.8b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) xtn v30.8b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[0]) @@ -421,9 +421,9 @@ PU1_SRC_LOOP_RESIDUE: LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) //LD1 
{v19.8b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
//SUB x10, x10,#8
- cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
- cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
- SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
+ cmhi v19.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
+ SUB v20.16b, v19.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
SUB x10,x10,x1
ADD v22.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
@@ -435,13 +435,13 @@ PU1_SRC_LOOP_RESIDUE:
mov v22.8b,v27.8b
TBL v24.8b, {v7.16b},v22.8b
- TBL v25.8b, {v8.16b},v23.8b
+ TBL v25.8b, {v1.16b},v23.8b
ZIP1 v27.8b, v24.8b, v25.8b
ZIP2 v25.8b, v24.8b, v25.8b
mov v24.8b,v27.8b
//VTBL.8 D24,D7,D22 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
- Uxtl v26.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ Uxtl v26.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
SADDW v26.8h, v26.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
@@ -456,7 +456,7 @@ END_LOOPS:
ldp x23, x24,[sp],#16
ldp x21, x22,[sp],#16
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_sao_edge_offset_class2.s b/common/arm64/ihevc_sao_edge_offset_class2.s
index 3350e5c..31852f3 100644
--- a/common/arm64/ihevc_sao_edge_offset_class2.s
+++ b/common/arm64/ihevc_sao_edge_offset_class2.s
@@ -79,7 +79,7 @@ ihevc_sao_edge_offset_class2_av8:
MOV x16,x7 // wd
MOV x17,x8 // ht
- push_v_regs
+ stp x19, x20,[sp,#-16]!
stp x21, x22,[sp,#-16]!
stp x23, x24,[sp,#-16]!
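(The prologue/epilogue hunks above and below all apply the same mechanical pattern: the push_v_regs/pop_v_regs macros, which saved and restored SIMD registers wholesale, are replaced by stp/ldp of only the general-purpose callee-saved pairs each function actually touches, and the bare "+" lines mark where pop_v_regs used to be. Because the vector temporaries are renamed into caller-saved registers, nothing in v8-v15 — whose lower halves d8-d15 are callee-saved under AAPCS64 — has to be stacked. A condensed sketch of the resulting frame handling; the label and the exact register pairs are illustrative:)

ihevc_example_av8:                  // illustrative label
    stp x19, x20,[sp,#-16]!         // spill only the GPR pairs the function uses
    stp x21, x22,[sp,#-16]!
    //  ... body: SIMD temporaries live in caller-saved v0-v7 / v16-v31,
    //  so d8-d15 never need a stack slot
    ldp x21, x22,[sp],#16           // restore in reverse order
    ldp x19, x20,[sp],#16
    ret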
@@ -218,7 +218,7 @@ PU1_AVAIL: csel x12, x20, x12,EQ MOV x6,x7 //move wd to x6 loop_count - movi v8.16b, #0xFF //au1_mask = vdupq_n_s8(-1) + movi v1.16b, #0xFF //au1_mask = vdupq_n_s8(-1) ADD x20,x14,#1 //pu1_src_left_cpy += 1 csel x14, x20, x14,EQ @@ -239,11 +239,11 @@ WIDTH_LOOP_16: MOV x20,#-1 csel x8, x20, x8,NE //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0) CMP x6,#16 //if(col == 16) BNE SKIP_AU1_MASK_VAL LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_AU1_MASK_VAL: LDRB w11,[x5,#2] //pu1_avail[2] @@ -255,23 +255,23 @@ SKIP_AU1_MASK_VAL: SUB x8,x8,#1 //pu1_src_top_cpy - 1 || pu1_src - src_strd - 1 MOV x7,x16 //Loads wd - LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1) + LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1) ADD x3,x3,#16 ADD x5,sp,#0x42 //*au1_src_left_tmp - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) MOV x4,x17 //Loads ht SUB x7,x7,x6 //(wd - col) - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) MOV x8,x19 //Loads *pu1_src ADD x7,x7,#15 //15 + (wd - col) - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)] SUB x5,x5,#1 - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) AU1_SRC_LEFT_LOOP: LDRB w8,[x7] //load the value and increment by src_strd @@ -307,36 +307,36 @@ SIGN_UP_CHANGE: csel x4, x20, x4,LT //I MOV x20,#1 csel x4, x20, x4,GT //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]) - mov v14.8b[0], w4 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) + mov v17.8b[0], w4 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) SIGN_UP_CHANGE_DONE: - cmhi v10.16b, v12.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - ADD v24.16b, v0.16b , v14.16b //I edge_idx = vaddq_s8(const_2, sign_up) + cmhi v3.16b, v5.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + ADD v24.16b, v0.16b , v17.16b //I edge_idx = vaddq_s8(const_2, sign_up) - cmhi v18.16b, v18.16b , v12.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp) - SUB v10.16b, v18.16b , v10.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v18.16b, v18.16b , v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + SUB v3.16b, v18.16b , v3.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v24.16b, v24.16b , v10.16b //I edge_idx = vaddq_s8(edge_idx, sign_down) + ADD v24.16b, v24.16b , v3.16b //I edge_idx = vaddq_s8(edge_idx, sign_down) TBL v18.16b, {v6.16b},v24.16b //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) // TBL v19.8b, {v6.16b},v25.8b //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - AND v18.16b, v18.16b , v8.16b //I edge_idx = vandq_s8(edge_idx, au1_mask) + AND v18.16b, v18.16b , v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask) - 
NEG v14.16b, v10.16b //I sign_up = vnegq_s8(sign_down) - TBL v10.16b, {v7.16b},v18.16b //I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - EXT v14.16b, v14.16b , v14.16b,#15 //I sign_up = vextq_s8(sign_up, sign_up, 15) + NEG v17.16b, v3.16b //I sign_up = vnegq_s8(sign_down) + TBL v3.16b, {v7.16b},v18.16b //I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + EXT v17.16b, v17.16b , v17.16b,#15 //I sign_up = vextq_s8(sign_up, sign_up, 15) - Uxtl v20.8h, v12.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v20.8h, v5.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) // TBL v11.8b, {v7.16b},v19.8b //I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - SADDW v20.8h, v20.8h , v10.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) + SADDW v20.8h, v20.8h , v3.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v20.8h, v20.8h , v2.8h //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) - Uxtl2 v22.8h, v12.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v22.8h, v5.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) UMIN v20.8h, v20.8h , v4.8h //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - mov v12.16b, v16.16b //I pu1_cur_row = pu1_next_row + mov v5.16b, v16.16b //I pu1_cur_row = pu1_next_row - SADDW2 v22.8h, v22.8h , v10.16b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) + SADDW2 v22.8h, v22.8h , v3.16b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) xtn v20.8b, v20.8h //I vmovn_s16(pi2_tmp_cur_row.val[0]) SMAX v22.8h, v22.8h , v2.8h //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) @@ -374,12 +374,12 @@ PU1_SRC_LOOP: EXT v18.16b, v30.16b , v18.16b,#1 //III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1) LDRB w2,[x0,x1] //III pu1_src_cpy[0] - cmhi v24.16b, v12.16b , v22.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v24.16b, v5.16b , v22.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) SUB x5,x12,x7 //III ht_tmp - row movn x20,#0 csel x4, x20, x4,LT //II - cmhi v22.16b, v22.16b , v12.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v22.16b , v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp) ADD x5,x14,x5 //III pu1_src_left_cpy[ht_tmp - row] MOV x20,#1 @@ -389,52 +389,52 @@ PU1_SRC_LOOP: LDRB w5,[x5] //III load the value SUBS x2,x2,x5 //III pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row] - mov v14.8b[0], w4 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) + mov v17.8b[0], w4 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) movn x20,#0 csel x2, x20, x2,LT //III - cmhi v10.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v3.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) MOV x20,#1 csel x2, x20, x2,GT //III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]) - ADD v22.16b, v0.16b , v14.16b //II edge_idx = vaddq_s8(const_2, sign_up) + ADD v22.16b, v0.16b , v17.16b //II edge_idx = vaddq_s8(const_2, sign_up) ADD v22.16b, v22.16b , v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down) cmhi v18.16b, v18.16b , v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp) TBL 
v22.16b, {v6.16b},v22.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - NEG v14.16b, v24.16b //II sign_up = vnegq_s8(sign_down) + NEG v17.16b, v24.16b //II sign_up = vnegq_s8(sign_down) - SUB v10.16b, v18.16b , v10.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v3.16b, v18.16b , v3.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) // TBL v23.8b, {v6.16b},v23.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - EXT v14.16b, v14.16b , v14.16b,#15 //II sign_up = vextq_s8(sign_up, sign_up, 15) + EXT v17.16b, v17.16b , v17.16b,#15 //II sign_up = vextq_s8(sign_up, sign_up, 15) - AND v22.16b, v22.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) - mov v14.8b[0], w2 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) + AND v22.16b, v22.16b , v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) + mov v17.8b[0], w2 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) - ADD v18.16b, v0.16b , v14.16b //III edge_idx = vaddq_s8(const_2, sign_up) + ADD v18.16b, v0.16b , v17.16b //III edge_idx = vaddq_s8(const_2, sign_up) TBL v24.16b, {v7.16b},v22.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - ADD v18.16b, v18.16b , v10.16b //III edge_idx = vaddq_s8(edge_idx, sign_down) + ADD v18.16b, v18.16b , v3.16b //III edge_idx = vaddq_s8(edge_idx, sign_down) - Uxtl v26.8h, v12.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v26.8h, v5.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) TBL v18.16b, {v6.16b},v18.16b //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - NEG v14.16b, v10.16b //III sign_up = vnegq_s8(sign_down) + NEG v17.16b, v3.16b //III sign_up = vnegq_s8(sign_down) SADDW v26.8h, v26.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) // TBL v19.8b, {v6.16b},v19.8b //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - EXT v14.16b, v14.16b , v14.16b,#15 //III sign_up = vextq_s8(sign_up, sign_up, 15) + EXT v17.16b, v17.16b , v17.16b,#15 //III sign_up = vextq_s8(sign_up, sign_up, 15) - AND v18.16b, v18.16b , v8.16b //III edge_idx = vandq_s8(edge_idx, au1_mask) + AND v18.16b, v18.16b , v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask) Uxtl v20.8h, v16.8b //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) - TBL v10.16b, {v7.16b},v18.16b //III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - SADDW v20.8h, v20.8h , v10.8b //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) + TBL v3.16b, {v7.16b},v18.16b //III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + SADDW v20.8h, v20.8h , v3.8b //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) // TBL v25.8b, {v7.16b},v23.8b //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) SMAX v20.8h, v20.8h , v2.8h //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) - Uxtl2 v28.8h, v12.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v28.8h, v5.16b //II pi2_tmp_cur_row.val[1] = 
vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) UMIN v20.8h, v20.8h , v4.8h //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) SADDW2 v28.8h, v28.8h , v24.16b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) @@ -444,11 +444,11 @@ PU1_SRC_LOOP: UMIN v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) Uxtl2 v18.8h, v16.16b //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) - mov v12.16b, v30.16b //III pu1_cur_row = pu1_next_row + mov v5.16b, v30.16b //III pu1_cur_row = pu1_next_row xtn v26.8b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[0]) xtn2 v26.16b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[1]) - SADDW2 v18.8h, v18.8h , v10.16b //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) + SADDW2 v18.8h, v18.8h , v3.16b //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) SMAX v18.8h, v18.8h , v2.8h //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) xtn v20.8b, v20.8h //III vmovn_s16(pi2_tmp_cur_row.val[0]) @@ -480,45 +480,45 @@ PU1_SRC_LOOP: EXT v18.16b, v16.16b , v18.16b,#1 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1) SUBS x4,x2,x5 //pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row] - cmhi v10.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v3.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) movn x20,#0 csel x4, x20, x4,LT MOV x20,#1 csel x4, x20, x4,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]) - cmhi v18.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v18.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) - mov v14.8b[0], w4 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) - SUB v10.16b, v18.16b , v10.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + mov v17.8b[0], w4 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) + SUB v3.16b, v18.16b , v3.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v18.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) - ADD v18.16b, v18.16b , v10.16b //edge_idx = vaddq_s8(edge_idx, sign_down) + ADD v18.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v18.16b, v18.16b , v3.16b //edge_idx = vaddq_s8(edge_idx, sign_down) TBL v18.16b, {v6.16b},v18.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - NEG v14.16b, v10.16b //sign_up = vnegq_s8(sign_down) + NEG v17.16b, v3.16b //sign_up = vnegq_s8(sign_down) // TBL v19.8b, {v6.16b},v19.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - EXT v14.16b, v14.16b , v14.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15) + EXT v17.16b, v17.16b , v17.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15) - AND v18.16b, v18.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + AND v18.16b, v18.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) - TBL v10.16b, {v7.16b},v18.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + TBL v3.16b, {v7.16b},v18.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - Uxtl v20.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v20.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) // TBL 
v11.8b, {v7.16b},v19.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - SADDW v20.8h, v20.8h , v10.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) + SADDW v20.8h, v20.8h , v3.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) - Uxtl2 v12.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v5.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) UMIN v20.8h, v20.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - SADDW2 v12.8h, v12.8h , v10.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) + SADDW2 v5.8h, v5.8h , v3.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) - SMAX v12.8h, v12.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) + SMAX v5.8h, v5.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) xtn v20.8b, v20.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) - UMIN v12.8h, v12.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) - xtn2 v20.16b, v12.8h //vmovn_s16(pi2_tmp_cur_row.val[1]) + UMIN v5.8h, v5.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) + xtn2 v20.16b, v5.8h //vmovn_s16(pi2_tmp_cur_row.val[1]) INNER_LOOP_DONE: @@ -556,11 +556,11 @@ WD_16_HT_4_LOOP: MOV x20,#-1 csel x8, x20, x8,NE //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0) CMP x6,#16 //if(col == 16) BNE SKIP_AU1_MASK_VAL_WD_16_HT_4 LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_AU1_MASK_VAL_WD_16_HT_4: LDRB w8,[x5,#2] //pu1_avail[2] @@ -572,23 +572,23 @@ SKIP_AU1_MASK_VAL_WD_16_HT_4: SUB x8,x8,#1 //pu1_src_top_cpy - 1 || pu1_src - src_strd - 1 MOV x7,x16 //Loads wd - LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1) + LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1) ADD x3,x3,#16 ADD x5,sp,#0x42 //*au1_src_left_tmp - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) MOV x4,x17 //Loads ht SUB x7,x7,x6 //(wd - col) - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) MOV x8,x19 //Loads *pu1_src ADD x7,x7,#15 //15 + (wd - col) - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)] SUB x5,x5,#1 - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) AU1_SRC_LEFT_LOOP_WD_16_HT_4: LDRB w8,[x7] //load the value and increment by src_strd @@ -626,31 +626,31 @@ SIGN_UP_CHANGE_WD_16_HT_4: csel x8, x20, x8,LT MOV x20,#1 csel x8, x20, 
x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]) - mov v14.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) + mov v17.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) SIGN_UP_CHANGE_DONE_WD_16_HT_4: - cmhi v20.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v22.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) SUB v24.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down) TBL v26.16b, {v6.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) // TBL v27.8b, {v6.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) - NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down) - EXT v14.16b, v14.16b , v14.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15) + NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down) + EXT v17.16b, v17.16b , v17.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15) TBL v24.16b, {v7.16b},v26.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) // TBL v25.8b, {v7.16b},v27.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - Uxtl2 v30.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v30.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) SADDW2 v30.8h, v30.8h , v24.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) SMAX v30.8h, v30.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) UMIN v30.8h, v30.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) @@ -660,7 +660,7 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4: ST1 { v28.16b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) - mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row + mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1 BNE PU1_SRC_LOOP_WD_16_HT_4 //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4 @@ -689,14 +689,14 @@ WIDTH_RESIDUE: MOV x20,#-1 csel x8, x20, x8,NE - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.8b[7], w8 //au1_mask = 
vsetq_lane_s8(pu1_avail[1], au1_mask, 15) PU1_AVAIL_2_RESIDUE: LDRB w11,[x5,#2] //pu1_avail[2] - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) CMP x11,#0 SUB x20,x0,x1 //pu1_src - src_strd @@ -706,19 +706,19 @@ PU1_AVAIL_2_RESIDUE: SUB x8,x8,#1 ADD x5,sp,#0x42 //*au1_src_left_tmp - LD1 {v10.16b},[x8],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1) + LD1 {v3.16b},[x8],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1) MOV x7,x16 //Loads wd MOV x4,x17 //Loads ht - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) SUB x7,x7,#1 //(wd - 1) MOV x8,x19 //Loads *pu1_src - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) SUB x5,x5,#1 ADD x7,x8,x7 //pu1_src[0 * src_strd + (wd - 1)] - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) AU1_SRC_LEFT_LOOP_RESIDUE: @@ -759,25 +759,25 @@ SIGN_UP_CHANGE_RESIDUE: csel x8, x20, x8,LT MOV x20,#1 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]) - mov v14.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) + mov v17.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) SIGN_UP_CHANGE_DONE_RESIDUE: - cmhi v20.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v22.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) SUB v24.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down) TBL v26.16b, {v6.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) // TBL v27.8b, {v6.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) - NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down) - EXT v14.16b, v14.16b , v14.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15) + NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down) + EXT v17.16b, v17.16b , v17.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15) TBL v24.8b, {v7.16b},v26.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) @@ -785,7 +785,7 @@ SIGN_UP_CHANGE_DONE_RESIDUE: xtn v30.8b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) ST1 {v30.8b},[x0],x1 //vst1q_u8(pu1_src_cpy, 
pu1_cur_row) - mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row + mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row SUBS x7,x7,#1 BNE PU1_SRC_LOOP_RESIDUE @@ -839,7 +839,7 @@ END_LOOPS: ldp x23, x24,[sp],#16 ldp x21, x22,[sp],#16 ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_sao_edge_offset_class2_chroma.s b/common/arm64/ihevc_sao_edge_offset_class2_chroma.s index 2fa7c22..8e286b4 100644 --- a/common/arm64/ihevc_sao_edge_offset_class2_chroma.s +++ b/common/arm64/ihevc_sao_edge_offset_class2_chroma.s @@ -76,7 +76,7 @@ ihevc_sao_edge_offset_class2_chroma_av8: ldr x9,[sp,#8] ldr w10,[sp,#16] ldr w11,[sp,#24] - push_v_regs + // STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments @@ -322,7 +322,7 @@ PU1_AVAIL_3_LOOP: LDR x2, [x2, #:got_lo12:gi1_table_edge_idx] MOV x6,x7 //move wd to x6 loop_count - movi v8.16b, #0XFF //au1_mask = vdupq_n_s8(-1) + movi v1.16b, #0XFF //au1_mask = vdupq_n_s8(-1) CMP x7,#16 //Compare wd with 16 BLT WIDTH_RESIDUE //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case @@ -338,19 +338,19 @@ WIDTH_LOOP_16: MOV x20,#-1 csel x8, x20, x8,NE - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) CMP x6,#16 //if(col == 16) - mov v8.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) BNE SKIP_AU1_MASK_VAL LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) - mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_AU1_MASK_VAL: LDRB w9,[x5,#2] //pu1_avail[2] - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //SUB x0, x0,#8 CMP x9,#0 @@ -366,17 +366,17 @@ SKIP_AU1_MASK_VAL: ADD x3,x3,#16 ADD x5,sp,#0x4B //*au1_src_left_tmp - LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2) + LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2) //LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2) //SUB x8, x8,#8 SUB x7,x7,x6 //(wd - col) ADD x7,x7,#14 //15 + (wd - col) - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) mov x8, x26 //Loads *pu1_src ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)] - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) AU1_SRC_LEFT_LOOP: LDRH w8,[x7] //load the value and increment by src_strd @@ -388,7 +388,7 @@ AU1_SRC_LEFT_LOOP: BNE AU1_SRC_LEFT_LOOP ADD x8,x0,x1 //I *pu1_src + src_strd - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) MOV x7,x12 //row count, move ht_tmp to x7 LD1 {v16.16b},[x8] //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) @@ -430,35 +430,35 @@ AU1_SRC_LEFT_LOOP: csel x8, x20, x8,GT //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) CMP x4,#0 //I - mov v14.8b[0], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp 
- 1 - row) * 2]), sign_up, 0) + mov v17.8b[0], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) movn x20,#0 csel x4, x20, x4,LT //I MOV x20,#1 csel x4, x20, x4,GT //I SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) - mov v14.8b[1], w4 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) + mov v17.8b[1], w4 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) SIGN_UP_CHANGE_DONE: LD1 {v30.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) - cmhi v20.16b, v12.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v20.16b, v5.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v22.16b, v18.16b , v12.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v18.16b , v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp) SUB v22.16b, v22.16b , v20.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v18.16b, v0.16b , v14.16b //I edge_idx = vaddq_s8(const_2, sign_up) + ADD v18.16b, v0.16b , v17.16b //I edge_idx = vaddq_s8(const_2, sign_up) ADD v18.16b, v18.16b , v22.16b //I edge_idx = vaddq_s8(edge_idx, sign_down) TBL v18.16b, {v30.16b},v18.16b //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - NEG v14.16b, v22.16b //I sign_up = vnegq_s8(sign_down) + NEG v17.16b, v22.16b //I sign_up = vnegq_s8(sign_down) //TBL v19.8b, {v30.16b},v19.8b //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - EXT v14.16b, v14.16b , v14.16b,#14 //I sign_up = vextq_s8(sign_up, sign_up, 14) + EXT v17.16b, v17.16b , v17.16b,#14 //I sign_up = vextq_s8(sign_up, sign_up, 14) - Uxtl v20.8h, v12.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - AND v22.16b, v18.16b , v8.16b //I edge_idx = vandq_s8(edge_idx, au1_mask) + Uxtl v20.8h, v5.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + AND v22.16b, v18.16b , v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask) mov v23.d[0],v22.d[1] - Uxtl2 v18.8h, v12.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v18.8h, v5.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) UZP1 v31.8b, v22.8b, v23.8b UZP2 v23.8b, v22.8b, v23.8b //I mov v22.8b,v31.8b @@ -469,7 +469,7 @@ SIGN_UP_CHANGE_DONE: ZIP2 v23.8b, v22.8b, v23.8b //I mov v22.8b,v31.8b - mov v12.16b, v16.16b //I pu1_cur_row = pu1_next_row + mov v5.16b, v16.16b //I pu1_cur_row = pu1_next_row SADDW v20.8h, v20.8h , v22.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v20.8h, v20.8h , v2.8h //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) @@ -521,17 +521,17 @@ PU1_SRC_LOOP: movn x20,#0 csel x8, x20, x8,LT //II - cmhi v22.16b, v12.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v5.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) MOV x20,#1 csel x8, x20, x8,GT //II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) sub x13,x9,#1 LDRB w5,[x13] //II load the value - mov v14.8b[0], w8 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) + mov v17.8b[0], w8 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) SUB x7,x7,#1 //II Decrement the ht_tmp loop count by 1 SUB x11,x11,x5 //II pu1_src_cpy[1] - 
pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1] - cmhi v24.16b, v28.16b , v12.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v24.16b, v28.16b , v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp) CMP x11,#0 //II movn x20,#0 @@ -545,11 +545,11 @@ PU1_SRC_LOOP: SUB x5,x12,x7 //III ht_tmp - row ADD x10,x0,x1 - mov v14.8b[1], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) + mov v17.8b[1], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) LSL x5,x5,#1 //III (ht_tmp - row) * 2 ADD x9,x14,x5 //III pu1_src_left_cpy[(ht_tmp - row) * 2] - ADD v26.16b, v0.16b , v14.16b //II edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //II edge_idx = vaddq_s8(const_2, sign_up) LDRB w10,[x10,#1] //III pu1_src_cpy[0] sub x13,x9,#2 @@ -562,24 +562,24 @@ PU1_SRC_LOOP: sub x13,x9,#1 LDRB w9,[x13] //III load the value TBL v26.16b, {v22.16b},v26.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - NEG v14.16b, v24.16b //II sign_up = vnegq_s8(sign_down) + NEG v17.16b, v24.16b //II sign_up = vnegq_s8(sign_down) movn x20,#0 csel x4, x20, x4,LT //III SUB x10,x10,x9 //III pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1] //TBL v27.8b, {v22.16b},v27.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - EXT v14.16b, v14.16b , v14.16b,#14 //II sign_up = vextq_s8(sign_up, sign_up, 14) + EXT v17.16b, v17.16b , v17.16b,#14 //II sign_up = vextq_s8(sign_up, sign_up, 14) MOV x20,#1 csel x4, x20, x4,GT //III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) - AND v26.16b, v26.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) + AND v26.16b, v26.16b , v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) CMP x10,#0 //III mov v27.d[0],v26.d[1] UZP1 v31.8b, v26.8b, v27.8b UZP2 v27.8b, v26.8b, v27.8b //II mov v26.8b,v31.8b - mov v14.8b[0], w4 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) + mov v17.8b[0], w4 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) movn x20,#0 csel x10, x20, x10,LT //III @@ -592,13 +592,13 @@ PU1_SRC_LOOP: TBL v25.8b, {v7.16b},v27.8b //II SUB v22.16b, v22.16b , v20.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - mov v14.8b[1], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) + mov v17.8b[1], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) ZIP1 v31.8b, v24.8b, v25.8b ZIP2 v25.8b, v24.8b, v25.8b //II mov v24.8b,v31.8b - Uxtl v28.8h, v12.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - ADD v18.16b, v0.16b , v14.16b //III edge_idx = vaddq_s8(const_2, sign_up) + Uxtl v28.8h, v5.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + ADD v18.16b, v0.16b , v17.16b //III edge_idx = vaddq_s8(const_2, sign_up) LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) SADDW v28.8h, v28.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) @@ -608,13 +608,13 @@ PU1_SRC_LOOP: UMIN v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) TBL v18.16b, {v20.16b},v18.16b //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - NEG v14.16b, v22.16b 
//III sign_up = vnegq_s8(sign_down) + NEG v17.16b, v22.16b //III sign_up = vnegq_s8(sign_down) //TBL v19.8b, {v20.16b},v19.8b //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - EXT v14.16b, v14.16b , v14.16b,#14 //III sign_up = vextq_s8(sign_up, sign_up, 14) + EXT v17.16b, v17.16b , v17.16b,#14 //III sign_up = vextq_s8(sign_up, sign_up, 14) - Uxtl2 v26.8h, v12.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) - AND v18.16b, v18.16b , v8.16b //III edge_idx = vandq_s8(edge_idx, au1_mask) + Uxtl2 v26.8h, v5.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + AND v18.16b, v18.16b , v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask) mov v19.d[0],v18.d[1] UZP1 v31.8b, v18.8b, v19.8b @@ -623,7 +623,7 @@ PU1_SRC_LOOP: TBL v22.8b, {v6.16b},v18.8b //III SADDW v26.8h, v26.8h , v25.8b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) - mov v12.16b, v30.16b //III pu1_cur_row = pu1_next_row + mov v5.16b, v30.16b //III pu1_cur_row = pu1_next_row TBL v23.8b, {v7.16b},v19.8b //III SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) @@ -686,35 +686,35 @@ PU1_SRC_LOOP: LD1 {v30.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) LDRB w11,[x0,#1] //pu1_src_cpy[0] - mov v14.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) + mov v17.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) sub x13,x9,#1 LDRB w5,[x13] //load the value SUB x4,x11,x5 //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1] - cmhi v22.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) CMP x4,#0 movn x20,#0 csel x4, x20, x4,LT - cmhi v24.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v24.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) MOV x20,#1 csel x4, x20, x4,GT //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) - mov v14.8b[1], w4 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) + mov v17.8b[1], w4 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down) mov v30.d[1],v30.d[0] TBL v26.16b, {v30.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) //TBL v27.8b, {v30.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - Uxtl v20.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + Uxtl v20.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) mov v27.d[0],v26.d[1] - Uxtl2 v18.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v18.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) UZP1 v31.8b, v26.8b, v27.8b UZP2 v27.8b, 
v26.8b, v27.8b mov v26.8b,v31.8b @@ -771,14 +771,14 @@ WD_16_HT_4_LOOP: MOV x20,#-1 csel x8, x20, x8,NE - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) - mov v8.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) CMP x6,#16 //if(col == 16) BNE SKIP_AU1_MASK_VAL_WD_16_HT_4 LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) - mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_AU1_MASK_VAL_WD_16_HT_4: LDRB w8,[x5,#2] //pu1_avail[2] @@ -788,7 +788,7 @@ SKIP_AU1_MASK_VAL_WD_16_HT_4: csel x8, x20, x8,EQ csel x8, x3, x8,NE //pu1_src_top_cpy SUB x8,x8,#2 //pu1_src - src_strd - 2 - LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2) + LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2) //LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2) //SUB x8, x8,#8 @@ -809,13 +809,13 @@ AU1_SRC_LEFT_LOOP_WD_16_HT_4: SUBS x4,x4,#1 //decrement the loop count BNE AU1_SRC_LEFT_LOOP_WD_16_HT_4 - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //SUB x0, x0,#8 - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) movi v18.16b, #0 MOV x7,x12 //row count, move ht_tmp to x7 @@ -851,7 +851,7 @@ SIGN_UP_CHANGE_WD_16_HT_4: csel x8, x20, x8,LT MOV x20,#1 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) - mov v14.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) + mov v17.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) LDRB w8,[x0,#1] //pu1_src_cpy[0] sub x13,x9,#1 @@ -862,25 +862,25 @@ SIGN_UP_CHANGE_WD_16_HT_4: csel x8, x20, x8,LT MOV x20,#1 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) - mov v14.8b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) + mov v17.8b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) SIGN_UP_CHANGE_DONE_WD_16_HT_4: - cmhi v22.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v24.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v24.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //edge_idx = 
vaddq_s8(const_2, sign_up) ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down) LD1 {v22.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) TBL v26.16b, {v22.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) //TBL v27.8b, {v22.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) mov v27.d[0],v26.d[1] - NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down) - EXT v14.16b, v14.16b , v14.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14) + NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down) + EXT v17.16b, v17.16b , v17.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14) UZP1 v31.8b, v26.8b, v27.8b UZP2 v27.8b, v26.8b, v27.8b @@ -891,12 +891,12 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4: ZIP2 v25.8b, v24.8b, v25.8b mov v24.8b,v31.8b - Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - Uxtl2 v26.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v26.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) SADDW v26.8h, v26.8h , v25.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) @@ -906,7 +906,7 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4: ST1 { v28.16b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) - mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row + mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1 BNE PU1_SRC_LOOP_WD_16_HT_4 //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4 @@ -936,12 +936,12 @@ WIDTH_RESIDUE: MOV x20,#-1 csel x8, x20, x8,NE - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) - mov v8.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.8b[6], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) - mov v8.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.8b[6], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) LDRB w8,[x5,#2] //pu1_avail[2] CMP x8,#0 @@ -950,7 +950,7 @@ WIDTH_RESIDUE: csel x8, x20, x8,EQ csel x8, x3, x8,NE SUB x8,x8,#2 //pu1_src - src_strd - 2 - LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) + LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) //LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) //SUB x8, x8,#8 @@ -968,13 +968,13 @@ AU1_SRC_LEFT_LOOP_RESIDUE: SUBS x4,x4,#1 //decrement the loop count BNE AU1_SRC_LEFT_LOOP_RESIDUE - LD1 
{v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //SUB x0, x0,#8 - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) MOV x7,x12 //row count, move ht_tmp to x7 PU1_SRC_LOOP_RESIDUE: @@ -1009,7 +1009,7 @@ SIGN_UP_CHANGE_RESIDUE: csel x8, x20, x8,LT MOV x20,#1 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) - mov v14.8b[0], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) + mov v17.8b[0], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) LDRB w8,[x0,#1] //pu1_src_cpy[0] sub x13,x9,#1 @@ -1020,14 +1020,14 @@ SIGN_UP_CHANGE_RESIDUE: csel x8, x20, x8,LT MOV x20,#1 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) - mov v14.8b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) + mov v17.8b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) SIGN_UP_CHANGE_DONE_RESIDUE: - cmhi v22.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v24.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v24.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down) LD1 {v22.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) @@ -1035,11 +1035,11 @@ SIGN_UP_CHANGE_DONE_RESIDUE: TBL v26.16b, {v22.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) //TBL v27.8b, {v22.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) mov v27.d[0],v26.d[1] - NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down) - EXT v14.16b, v14.16b , v14.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14) + NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down) + EXT v17.16b, v17.16b , v17.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14) UZP1 v31.8b, v26.8b, v27.8b UZP2 v27.8b, v26.8b, v27.8b @@ -1050,7 +1050,7 @@ SIGN_UP_CHANGE_DONE_RESIDUE: ZIP2 v25.8b, v24.8b, v25.8b mov v24.8b,v31.8b - Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = 
vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) @@ -1059,7 +1059,7 @@ SIGN_UP_CHANGE_DONE_RESIDUE: ST1 {v28.8b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) - mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row + mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1 BNE PU1_SRC_LOOP_RESIDUE //If not equal jump to PU1_SRC_LOOP @@ -1113,7 +1113,7 @@ END_LOOPS: ldp x23, x24,[sp],#16 ldp x21, x22,[sp],#16 ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_sao_edge_offset_class3.s b/common/arm64/ihevc_sao_edge_offset_class3.s index 6c47abe..f393753 100644 --- a/common/arm64/ihevc_sao_edge_offset_class3.s +++ b/common/arm64/ihevc_sao_edge_offset_class3.s @@ -70,7 +70,6 @@ ihevc_sao_edge_offset_class3_av8: // STMFD sp!,{x4-x12,x14} //stack stores the values of the arguments - push_v_regs stp x19, x20,[sp,#-16]! stp x21, x22,[sp,#-16]! stp x23, x24,[sp,#-16]! @@ -85,9 +84,9 @@ ihevc_sao_edge_offset_class3_av8: MOV x5,x7 //Loads pu1_avail - LDR x6,[sp,#112] //Loads pi1_sao_offset - LDR w7,[sp,#120] //Loads wd - LDR w8,[sp,#128] //Loads ht + LDR x6,[sp,#48] //Loads pi1_sao_offset + LDR w7,[sp,#56] //Loads wd + LDR w8,[sp,#64] //Loads ht MOV x16,x7 // wd MOV x17,x8 // ht @@ -226,7 +225,7 @@ PU1_AVAIL_3_LOOP: ADRP x6, :got:gi1_table_edge_idx //table pointer LDR x6, [x6, #:got_lo12:gi1_table_edge_idx] - movi v8.16b, #0xFF //au1_mask = vdupq_n_s8(-1) + movi v1.16b, #0xFF //au1_mask = vdupq_n_s8(-1) ADD x20,x14,#1 //pu1_src_left_cpy += 1 csel x14, x20, x14,EQ @@ -248,12 +247,12 @@ WIDTH_LOOP_16: csel w8,w20,w8,EQ MOV x20,#-1 csel x8, x20, x8,NE - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) CMP x6,#16 //if(col == 16) BNE SKIP_AU1_MASK_VAL LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_AU1_MASK_VAL: LDRB w8,[x5,#2] //pu1_avail[2] @@ -270,15 +269,15 @@ SKIP_AU1_MASK_VAL: ADD x8,x8,#1 //pu1_src - src_strd + 1 SUB x7,x7,x6 //(wd - col) - LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1) + LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1) ADD x3,x3,#16 MOV x8,x19 //Loads *pu1_src - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) ADD x7,x7,#15 //15 + (wd - col) ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)] - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) SUB x5,x5,#1 AU1_SRC_LEFT_LOOP: @@ -289,10 +288,10 @@ AU1_SRC_LEFT_LOOP: BNE AU1_SRC_LEFT_LOOP movi v18.16b, #0 - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) ADD x8,x0,x1 //I *pu1_src + src_strd - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) MOV x7,x12 //row count, move ht_tmp to x7 SUB x5,x12,x7 //I ht_tmp - row @@ -321,35 +320,35 @@ SIGN_UP_CHANGE: csel x8, x20, x8,LT //I MOV x20,#1 csel x8, x20, x8,GT //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]) - mov v14.16b[15], w8 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) + mov v17.16b[15], w8 //I sign_up = 
vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) SIGN_UP_CHANGE_DONE: - cmhi v10.16b, v12.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v18.16b, v18.16b , v12.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp) - SUB v10.16b, v18.16b , v10.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v3.16b, v5.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v18.16b, v18.16b , v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + SUB v3.16b, v18.16b , v3.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v18.16b, v0.16b , v14.16b //I edge_idx = vaddq_s8(const_2, sign_up) - ADD v18.16b, v18.16b , v10.16b //I edge_idx = vaddq_s8(edge_idx, sign_down) + ADD v18.16b, v0.16b , v17.16b //I edge_idx = vaddq_s8(const_2, sign_up) + ADD v18.16b, v18.16b , v3.16b //I edge_idx = vaddq_s8(edge_idx, sign_down) TBL v18.16b, {v6.16b},v18.16b //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - NEG v14.16b, v10.16b //I sign_up = vnegq_s8(sign_down) + NEG v17.16b, v3.16b //I sign_up = vnegq_s8(sign_down) - EXT v14.16b, v14.16b , v14.16b,#1 //I sign_up = vextq_s8(sign_up, sign_up, 1) + EXT v17.16b, v17.16b , v17.16b,#1 //I sign_up = vextq_s8(sign_up, sign_up, 1) // TBL v19.8b, {v6.16b},v19.8b //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - Uxtl v20.8h, v12.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - AND v18.16b, v18.16b , v8.16b //I edge_idx = vandq_s8(edge_idx, au1_mask) + Uxtl v20.8h, v5.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + AND v18.16b, v18.16b , v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask) - TBL v10.16b, {v7.16b},v18.16b //I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + TBL v3.16b, {v7.16b},v18.16b //I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - Uxtl2 v22.8h, v12.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) - SADDW v20.8h, v20.8h , v10.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) + Uxtl2 v22.8h, v5.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + SADDW v20.8h, v20.8h , v3.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v20.8h, v20.8h , v2.8h //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) // TBL v11.8b, {v7.16b},v19.8b //I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) UMIN v20.8h, v20.8h , v4.8h //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - mov v12.16b, v16.16b - SADDW2 v22.8h, v22.8h , v10.16b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) + mov v5.16b, v16.16b + SADDW2 v22.8h, v22.8h , v3.16b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) SMAX v22.8h, v22.8h , v2.8h //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) UMIN v22.8h, v22.8h , v4.8h //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) @@ -388,7 +387,7 @@ PU1_SRC_LOOP: csel x11, x20, x11,GT //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]) ADD x8,x14,x5 //III pu1_src_left_cpy[ht_tmp - row] - mov v14.8b[15], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) + mov v17.8b[15], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), 
sign_up, 15) CMP x7,#1 //III BNE NEXT_ROW_ELSE_2 //III @@ -400,11 +399,11 @@ PU1_SRC_LOOP: NEXT_ROW_ELSE_2: LDRB w8,[x8,#1] //III - cmhi v24.16b, v12.16b , v18.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v24.16b, v5.16b , v18.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) ADD x5,x0,x1 LDRB w2,[x5,#15] //III pu1_src_cpy[15] - cmhi v26.16b, v18.16b , v12.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v26.16b, v18.16b , v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp) LDRB w5,[x0,#16] //III load the value SUB x2,x2,x5 //III pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd] @@ -418,51 +417,51 @@ NEXT_ROW_ELSE_2: csel x2, x20, x2,GT //III SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]) SUB x7,x7,#1 //III Decrement the ht_tmp loop count by 1 - ADD v26.16b, v0.16b , v14.16b //II edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //II edge_idx = vaddq_s8(const_2, sign_up) - NEG v14.16b, v24.16b //II sign_up = vnegq_s8(sign_down) + NEG v17.16b, v24.16b //II sign_up = vnegq_s8(sign_down) EXT v18.16b, v18.16b , v30.16b,#15 //III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15) ADD v26.16b, v26.16b , v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down) - EXT v14.16b, v14.16b , v14.16b,#1 //II sign_up = vextq_s8(sign_up, sign_up, 1) + EXT v17.16b, v17.16b , v17.16b,#1 //II sign_up = vextq_s8(sign_up, sign_up, 1) TBL v26.16b, {v6.16b},v26.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - cmhi v10.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v3.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - mov v14.16b[15], w2 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) + mov v17.16b[15], w2 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) // TBL v27.8b, {v6.16b},v27.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) cmhi v18.16b, v18.16b , v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp) - Uxtl v28.8h, v12.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - AND v26.16b, v26.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) + Uxtl v28.8h, v5.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + AND v26.16b, v26.16b , v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) - SUB v10.16b, v18.16b , v10.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v3.16b, v18.16b , v3.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) TBL v24.16b, {v7.16b},v26.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - ADD v18.16b, v0.16b , v14.16b //III edge_idx = vaddq_s8(const_2, sign_up) + ADD v18.16b, v0.16b , v17.16b //III edge_idx = vaddq_s8(const_2, sign_up) - ADD v18.16b, v18.16b , v10.16b //III edge_idx = vaddq_s8(edge_idx, sign_down) + ADD v18.16b, v18.16b , v3.16b //III edge_idx = vaddq_s8(edge_idx, sign_down) // TBL v25.8b, {v7.16b},v27.8b //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - NEG v14.16b, v10.16b //III sign_up = vnegq_s8(sign_down) + NEG v17.16b, v3.16b //III sign_up = vnegq_s8(sign_down) SADDW v28.8h, v28.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) TBL v18.16b, {v6.16b},v18.16b //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) SMAX v28.8h, v28.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) - EXT v14.16b, v14.16b , v14.16b,#1 //III sign_up = vextq_s8(sign_up, 
sign_up, 1) + EXT v17.16b, v17.16b , v17.16b,#1 //III sign_up = vextq_s8(sign_up, sign_up, 1) // TBL v19.8b, {v6.16b},v19.8b //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) UMIN v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - Uxtl2 v26.8h, v12.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) - AND v18.16b, v18.16b , v8.16b //III edge_idx = vandq_s8(edge_idx, au1_mask) + Uxtl2 v26.8h, v5.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + AND v18.16b, v18.16b , v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask) SADDW2 v26.8h, v26.8h , v24.16b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) - TBL v10.16b, {v7.16b},v18.16b //III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + TBL v3.16b, {v7.16b},v18.16b //III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) Uxtl v20.8h, v16.8b //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) - SADDW v20.8h, v20.8h , v10.8b //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) + SADDW v20.8h, v20.8h , v3.8b //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) // TBL v11.8b, {v7.16b},v19.8b //III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) SMAX v20.8h, v20.8h , v2.8h //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) @@ -470,12 +469,12 @@ NEXT_ROW_ELSE_2: UMIN v20.8h, v20.8h , v4.8h //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) xtn v28.8b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[0]) - SADDW2 v22.8h, v22.8h , v10.16b //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) + SADDW2 v22.8h, v22.8h , v3.16b //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) xtn2 v28.16b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[1]) SMAX v22.8h, v22.8h , v2.8h //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) - mov v12.16b, v30.16b //II pu1_cur_row = pu1_next_row + mov v5.16b, v30.16b //II pu1_cur_row = pu1_next_row UMIN v22.8h, v22.8h , v4.8h //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) CMP x7,#1 //III @@ -516,25 +515,25 @@ NEXT_ROW_POINTER_ASSIGNED_3: csel x8, x20, x8,LT ST1 { v20.16b},[x0],x1 //III vst1q_u8(pu1_src_cpy, pu1_cur_row) - cmhi v24.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v24.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) MOV x20,#1 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]) - cmhi v26.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v26.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) - mov v14.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) + mov v17.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) SUB v24.16b, v26.16b , v24.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - Uxtl v20.8h, v12.8b 
//pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) + Uxtl v20.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) - Uxtl2 v22.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v22.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down) TBL v26.16b, {v6.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) // TBL v27.8b, {v6.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) TBL v24.16b, {v7.16b},v26.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) @@ -583,12 +582,12 @@ WD_16_HT_4_LOOP: csel w8,w20,w8,EQ MOV x20,#-1 csel x8, x20, x8,NE - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) CMP x6,#16 //if(col == 16) BNE SKIP_AU1_MASK_VAL_WD_16_HT_4 LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_AU1_MASK_VAL_WD_16_HT_4: LDRB w8,[x5,#2] //pu1_avail[2] @@ -598,7 +597,7 @@ SKIP_AU1_MASK_VAL_WD_16_HT_4: csel x8, x20, x8,EQ csel x8, x3, x8,NE ADD x8,x8,#1 //pu1_src - src_strd + 1 - LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1) + LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1) ADD x3,x3,#16 ADD x5,sp,#0x42 //*au1_src_left_tmp @@ -617,11 +616,11 @@ AU1_SRC_LEFT_LOOP_WD_16_HT_4: SUBS x4,x4,#1 //decrement the loop count BNE AU1_SRC_LEFT_LOOP_WD_16_HT_4 - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) movi v18.16b, #0 MOV x7,x12 //row count, move ht_tmp to x7 @@ -665,31 +664,31 @@ SIGN_UP_CHANGE_WD_16_HT_4: csel x8, x20, x8,LT MOV x20,#1 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]) - mov v14.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) + mov v17.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) SIGN_UP_CHANGE_DONE_WD_16_HT_4: - cmhi v20.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v22.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) SUB v24.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, 
sign_up) ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down) TBL v26.16b, {v6.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) // TBL v27.8b, {v6.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) - NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down) - EXT v14.16b, v14.16b , v14.16b,#1 //sign_up = vextq_s8(sign_up, sign_up, 1) + NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down) + EXT v17.16b, v17.16b , v17.16b,#1 //sign_up = vextq_s8(sign_up, sign_up, 1) TBL v24.16b, {v7.16b},v26.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) // TBL v25.8b, {v7.16b},v27.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - Uxtl2 v30.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v30.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) SADDW2 v30.8h, v30.8h , v24.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) SMAX v30.8h, v30.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) UMIN v30.8h, v30.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) @@ -699,7 +698,7 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4: ST1 { v28.16b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) - mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row + mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1 BNE PU1_SRC_LOOP_WD_16_HT_4 //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4 @@ -726,10 +725,10 @@ WIDTH_RESIDUE: MOV x20,#-1 csel x8, x20, x8,NE - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) PU1_AVAIL_2_RESIDUE: LDRB w8,[x5,#2] //pu1_avail[2] @@ -739,7 +738,7 @@ PU1_AVAIL_2_RESIDUE: csel x8, x20, x8,EQ csel x8, x3, x8,NE ADD x8,x8,#1 //pu1_src - src_strd + 1 - LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1) + LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1) ADD x5,sp,#0x42 //*au1_src_left_tmp @@ -757,11 +756,11 @@ AU1_SRC_LEFT_LOOP_RESIDUE: SUBS x4,x4,#1 //decrement the loop count BNE AU1_SRC_LEFT_LOOP_RESIDUE - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, 
pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) MOV x7,x12 //row count, move ht_tmp to x7 PU1_SRC_LOOP_RESIDUE: @@ -805,25 +804,25 @@ SIGN_UP_CHANGE_RESIDUE: csel x8, x20, x8,LT MOV x20,#1 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]) - mov v14.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) + mov v17.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) SIGN_UP_CHANGE_DONE_RESIDUE: - cmhi v20.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v22.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) SUB v24.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down) TBL v26.16b, {v6.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) // TBL v27.8b, {v6.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) - NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down) - EXT v14.16b, v14.16b , v14.16b,#1 //sign_up = vextq_s8(sign_up, sign_up, 1) + NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down) + EXT v17.16b, v17.16b , v17.16b,#1 //sign_up = vextq_s8(sign_up, sign_up, 1) TBL v24.8b, {v7.16b},v26.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) @@ -831,7 +830,7 @@ SIGN_UP_CHANGE_DONE_RESIDUE: xtn v30.8b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) ST1 {v30.8b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) - mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row + mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row SUBS x7,x7,#1 BNE PU1_SRC_LOOP_RESIDUE @@ -880,7 +879,6 @@ END_LOOPS: ldp x23, x24,[sp], #16 ldp x21, x22,[sp], #16 ldp x19, x20,[sp], #16 - pop_v_regs ret diff --git a/common/arm64/ihevc_sao_edge_offset_class3_chroma.s b/common/arm64/ihevc_sao_edge_offset_class3_chroma.s index cf25102..5c444c0 100644 --- a/common/arm64/ihevc_sao_edge_offset_class3_chroma.s +++ b/common/arm64/ihevc_sao_edge_offset_class3_chroma.s @@ -77,7 +77,7 @@ ihevc_sao_edge_offset_class3_chroma_av8: ldr w10,[sp,#16] ldr w11,[sp,#24] - push_v_regs + // STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments stp x19, x20,[sp,#-16]! stp x21, x22,[sp,#-16]! 
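The SAO edge-offset hunks above and below all touch the same per-vector classification kernel; the renames (v8→v1, v10→v3, v12→v5, v14→v17) relabel its operands without changing the logic. A minimal sketch of that kernel, distilled from the hunks themselves — vector numbers are illustrative, and the EXT immediate encodes the diagonal step and chroma interleave (#14 in the chroma hunks at the top of this page, #1 in ihevc_sao_edge_offset_class3.s, #2 in ihevc_sao_edge_offset_class3_chroma.s):

    // Per byte: edge_idx = 2 + SIGN(cur - neighbour_up) + SIGN(cur - neighbour_down),
    // remapped through gi1_table_edge_idx and masked by column availability.
    cmhi    v20.16b, v5.16b , v18.16b       // cmp_gt = vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    cmhi    v22.16b, v18.16b , v5.16b       // cmp_lt = vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB     v22.16b, v22.16b , v20.16b      // sign_down = cmp_lt - cmp_gt (-1, 0 or +1 per byte)
    ADD     v18.16b, v0.16b , v17.16b       // edge_idx = const_2 + sign_up
    ADD     v18.16b, v18.16b , v22.16b      // edge_idx += sign_down
    TBL     v18.16b, {v30.16b},v18.16b      // edge_idx = vtbl1_s8(edge_idx_tbl, edge_idx)
    AND     v18.16b, v18.16b , v1.16b       // edge_idx &= au1_mask
    NEG     v17.16b, v22.16b                // sign_up for the next row = -sign_down
    EXT     v17.16b, v17.16b , v17.16b,#14  // rotate sign_up by the class-dependent diagonal step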
@@ -310,7 +310,7 @@ PU1_AVAIL_2_LOOP_END: LDR x2, [x2, #:got_lo12:gi1_table_edge_idx] //VLD1.8 D6,[x6] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx) - movi v8.16b, #0xFF //au1_mask = vdupq_n_s8(-1) + movi v1.16b, #0xFF //au1_mask = vdupq_n_s8(-1) MOV x6,x7 //move wd to x6 loop_count CMP x7,#16 //Compare wd with 16 @@ -328,20 +328,20 @@ WIDTH_LOOP_16: MOV x20,#-1 csel x8, x20, x8,NE - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) LDRB w11,[x5,#2] //pu1_avail[2] CMP x6,#16 //if(col == 16) - mov v8.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) BNE SKIP_AU1_MASK_VAL LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) - mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_AU1_MASK_VAL: CMP x11,#0 - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //SUB x0, x0,#8 ADD x5,sp,#0x4B //*au1_src_left_tmp @@ -352,21 +352,21 @@ SKIP_AU1_MASK_VAL: csel x8, x3, x8,NE ADD x8,x8,#2 //pu1_src - src_strd + 2 - LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) + LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) //LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) //SUB x8, x8,#8 ADD x3,x3,#16 mov w4, w25 //Loads ht - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) mov w7, w24 //Loads wd SUB x7,x7,x6 //(wd - col) - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) ADD x7,x7,#14 //15 + (wd - col) mov x8, x26 //Loads *pu1_src - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)] AU1_SRC_LEFT_LOOP: @@ -418,29 +418,29 @@ AU1_SRC_LEFT_LOOP: movn x20,#0 csel x9, x20, x9,LT //I - mov v14.16b[14], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) + mov v17.16b[14], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) MOV x20,#1 csel x9, x20, x9,GT //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] - mov v14.16b[15], w9 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) + mov v17.16b[15], w9 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) SIGN_UP_CHANGE_DONE: LD1 {v28.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) - cmhi v20.16b, v12.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v20.16b, v5.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v22.16b, v18.16b , v12.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v18.16b , v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp) SUB v22.16b, v22.16b , v20.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v18.16b, v0.16b , v14.16b //I edge_idx = vaddq_s8(const_2, sign_up) + ADD v18.16b, v0.16b , v17.16b //I edge_idx = 
vaddq_s8(const_2, sign_up) ADD v18.16b, v18.16b , v22.16b //I edge_idx = vaddq_s8(edge_idx, sign_down) TBL v18.16b, {v28.16b},v18.16b //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - NEG v14.16b, v22.16b //I sign_up = vnegq_s8(sign_down) + NEG v17.16b, v22.16b //I sign_up = vnegq_s8(sign_down) //TBL v19.8b, {v28.16b},v19.8b //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - EXT v14.16b, v14.16b , v14.16b,#2 //I sign_up = vextq_s8(sign_up, sign_up, 2) + EXT v17.16b, v17.16b , v17.16b,#2 //I sign_up = vextq_s8(sign_up, sign_up, 2) - Uxtl v20.8h, v12.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - AND v18.16b, v18.16b , v8.16b //I edge_idx = vandq_s8(edge_idx, au1_mask) + Uxtl v20.8h, v5.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + AND v18.16b, v18.16b , v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask) mov v19.d[0],v18.d[1] UZP1 v31.8b, v18.8b, v19.8b @@ -452,13 +452,13 @@ SIGN_UP_CHANGE_DONE: ZIP2 v23.8b, v22.8b, v23.8b //I mov v22.8b,v31.8b - Uxtl2 v18.8h, v12.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v18.8h, v5.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) SADDW v20.8h, v20.8h , v22.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v20.8h, v20.8h , v2.8h //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v20.8h, v20.8h , v4.8h //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - mov v12.16b, v16.16b //I pu1_cur_row = pu1_next_row + mov v5.16b, v16.16b //I pu1_cur_row = pu1_next_row SADDW v18.8h, v18.8h , v23.8b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) SUB x7,x7,#1 //I Decrement the ht_tmp loop count by 1 @@ -507,18 +507,18 @@ PU1_SRC_LOOP: csel x10, x20, x10,GT //II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]) CMP x8,#0 //II - mov v14.8b[14], w10 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) + mov v17.8b[14], w10 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) movn x20,#0 csel x8, x20, x8,LT //II MOV x20,#1 csel x8, x20, x8,GT //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] SUB x10,x12,x7 //III ht_tmp - row - mov v14.8b[15], w8 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) + mov v17.8b[15], w8 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) ADD x11,x14,x10,LSL #1 //III pu1_src_left_cpy[(ht_tmp - row) * 2] CMP x7,#1 //III - cmhi v22.16b, v12.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v5.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) BNE NEXT_ROW_POINTER_ASSIGNED_2 //III mov x5, x21 //III Loads pu1_avail @@ -529,7 +529,7 @@ PU1_SRC_LOOP: NEXT_ROW_POINTER_ASSIGNED_2: LDRH w5,[x11,#2] //III - cmhi v24.16b, v28.16b , v12.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v24.16b, v28.16b , v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp) ADD x11,x0,x1 //III LDRB w9,[x11,#14] //III pu1_src_cpy[14] @@ -545,7 +545,7 @@ NEXT_ROW_POINTER_ASSIGNED_2: SUB x10,x8,x10 //III pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] CMP x9,#0 //III - ADD v26.16b, v0.16b , v14.16b //II edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //II edge_idx = vaddq_s8(const_2, sign_up) 
movn x20,#0 csel x9, x20, x9,LT //III @@ -554,22 +554,22 @@ NEXT_ROW_POINTER_ASSIGNED_2: ADD v26.16b, v26.16b , v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down) CMP x10,#0 //III - NEG v14.16b, v24.16b //II sign_up = vnegq_s8(sign_down) + NEG v17.16b, v24.16b //II sign_up = vnegq_s8(sign_down) TBL v26.16b, {v21.16b},v26.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) movn x20,#0 csel x10, x20, x10,LT //III MOV x20,#1 csel x10, x20, x10,GT //III SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] - EXT v14.16b, v14.16b , v14.16b,#2 //II sign_up = vextq_s8(sign_up, sign_up, 2) + EXT v17.16b, v17.16b , v17.16b,#2 //II sign_up = vextq_s8(sign_up, sign_up, 2) //TBL v27.8b, {v21.16b},v27.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) cmhi v22.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - mov v14.16b[14], w9 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) - AND v26.16b, v26.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) + mov v17.16b[14], w9 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) + AND v26.16b, v26.16b , v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) mov v27.d[0],v26.d[1] - mov v14.16b[15], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) + mov v17.16b[15], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) UZP1 v31.8b, v26.8b, v27.8b UZP2 v27.8b, v26.8b, v27.8b //II mov v26.8b,v31.8b @@ -578,7 +578,7 @@ NEXT_ROW_POINTER_ASSIGNED_2: TBL v24.8b, {v6.16b},v26.8b //II SUB v22.16b, v20.16b , v22.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v18.16b, v0.16b , v14.16b //III edge_idx = vaddq_s8(const_2, sign_up) + ADD v18.16b, v0.16b , v17.16b //III edge_idx = vaddq_s8(const_2, sign_up) TBL v25.8b, {v7.16b},v27.8b //II ADD v18.16b, v18.16b , v22.16b //III edge_idx = vaddq_s8(edge_idx, sign_down) @@ -587,16 +587,16 @@ NEXT_ROW_POINTER_ASSIGNED_2: ZIP2 v25.8b, v24.8b, v25.8b //II mov v24.8b,v31.8b - Uxtl v28.8h, v12.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v28.8h, v5.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) TBL v18.16b, {v20.16b},v18.16b //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - NEG v14.16b, v22.16b //III sign_up = vnegq_s8(sign_down) + NEG v17.16b, v22.16b //III sign_up = vnegq_s8(sign_down) SADDW v28.8h, v28.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) //TBL v19.8b, {v20.16b},v19.8b //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - EXT v14.16b, v14.16b , v14.16b,#2 //III sign_up = vextq_s8(sign_up, sign_up, 2) + EXT v17.16b, v17.16b , v17.16b,#2 //III sign_up = vextq_s8(sign_up, sign_up, 2) - Uxtl2 v26.8h, v12.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) - AND v18.16b, v18.16b , v8.16b //III edge_idx = vandq_s8(edge_idx, au1_mask) + Uxtl2 v26.8h, v5.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + AND v18.16b, v18.16b , v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask) mov v19.d[0],v18.d[1] Uxtl v20.8h, v16.8b //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) @@ -620,7 +620,7 @@ NEXT_ROW_POINTER_ASSIGNED_2: xtn v28.8b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[0]) SADDW v20.8h, v20.8h , v22.8b //III 
pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) - mov v12.16b, v30.16b //III pu1_cur_row = pu1_next_row + mov v5.16b, v30.16b //III pu1_cur_row = pu1_next_row UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) SUB x7,x7,#1 //III Decrement the ht_tmp loop count by 1 @@ -682,27 +682,27 @@ NEXT_ROW_POINTER_ASSIGNED_3: csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]) CMP x10,#0 - mov v14.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) + mov v17.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) movn x20,#0 csel x10, x20, x10,LT MOV x20,#1 csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] - mov v14.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) - cmhi v20.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + mov v17.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) + cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v22.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) SUB v22.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v18.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v18.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) ADD v18.16b, v18.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_down) TBL v18.16b, {v28.16b},v18.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) //TBL v19.8b, {v28.16b},v19.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - AND v18.16b, v18.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + AND v18.16b, v18.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) mov v19.d[0],v18.d[1] - Uxtl v20.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v20.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) UZP1 v31.8b, v18.8b, v19.8b UZP2 v19.8b, v18.8b, v19.8b mov v18.8b,v31.8b @@ -710,7 +710,7 @@ NEXT_ROW_POINTER_ASSIGNED_3: TBL v22.8b, {v6.16b},v18.8b TBL v23.8b, {v7.16b},v19.8b - Uxtl2 v18.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v18.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) ZIP1 v31.8b, v22.8b, v23.8b ZIP2 v23.8b, v22.8b, v23.8b mov v22.8b,v31.8b @@ -762,15 +762,15 @@ WD_16_HT_4_LOOP: csel w8,w20,w8,EQ MOV x20,#-1 csel x8, x20, x8,NE - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) CMP x6,#16 //if(col == 16) - mov v8.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) BNE SKIP_AU1_MASK_VAL_WD_16_HT_4 LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) - mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_AU1_MASK_VAL_WD_16_HT_4: LDRB w11,[x5,#2] //pu1_avail[2] @@ -779,27 +779,27 @@ 
SKIP_AU1_MASK_VAL_WD_16_HT_4: CMP x11,#0 csel x8, x3, x8,NE - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //SUB x0, x0,#8 ADD x8,x8,#2 //pu1_src - src_strd + 2 ADD x3,x3,#16 - LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) + LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) //LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) //SUB x8, x8,#8 ADD x5,sp,#0x4B //*au1_src_left_tmp mov w4, w25 //Loads ht - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) mov w7, w24 //Loads wd SUB x7,x7,x6 //(wd - col) - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) ADD x7,x7,#14 //15 + (wd - col) mov x8, x26 //Loads *pu1_src - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)] AU1_SRC_LEFT_LOOP_WD_16_HT_4: @@ -864,33 +864,33 @@ SIGN_UP_CHANGE_WD_16_HT_4: csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]) CMP x10,#0 - mov v14.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) + mov v17.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) movn x20,#0 csel x10, x20, x10,LT MOV x20,#1 csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] - mov v14.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) + mov v17.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) SIGN_UP_CHANGE_DONE_WD_16_HT_4: LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) - cmhi v22.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v24.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v24.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down) mov v20.d[1],v20.d[0] - NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down) + NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down) TBL v26.16b, {v20.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) //TBL v27.8b, {v20.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - EXT v14.16b, v14.16b , v14.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 2) + EXT v17.16b, v17.16b , v17.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 2) - Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) mov v27.d[0],v26.d[1] UZP1 v31.8b, v26.8b, v27.8b @@ -902,13 +902,13 @@ 
SIGN_UP_CHANGE_DONE_WD_16_HT_4: ZIP2 v25.8b, v24.8b, v25.8b mov v24.8b,v31.8b - Uxtl2 v30.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v30.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row + mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row SADDW v30.8h, v30.8h , v25.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) SMAX v30.8h, v30.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) @@ -949,24 +949,24 @@ WIDTH_RESIDUE: LDRB w11,[x5,#1] //pu1_avail[1] LDRB w9,[x5,#2] //pu1_avail[2] - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) CMP x9,#0 SUB x20,x0,x1 //pu1_src - src_strd csel x10, x20, x10,EQ - mov v8.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) csel x10, x3, x10,NE ADD x10,x10,#2 //pu1_src - src_strd + 2 - mov v8.8b[6], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.8b[6], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) ADD x5,sp,#0x4B //*au1_src_left_tmp mov w4, w25 //Loads ht - mov v8.8b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.8b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) mov w7, w24 //Loads wd mov x8, x26 //Loads *pu1_src - LD1 {v10.16b},[x10] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) + LD1 {v3.16b},[x10] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) //LD1 {v11.8b},[x10] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) //SUB x10, x10,#8 SUB x7,x7,#2 //(wd - 2) @@ -980,15 +980,15 @@ AU1_SRC_LEFT_LOOP_RESIDUE: SUBS x4,x4,#1 //decrement the loop count BNE AU1_SRC_LEFT_LOOP_RESIDUE - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //SUB x0, x0,#8 movi v18.16b, #0 - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) MOV x7,x12 //row count, move ht_tmp to x7 PU1_SRC_LOOP_RESIDUE: @@ -1047,33 +1047,33 @@ SIGN_UP_CHANGE_RESIDUE: csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]) CMP x10,#0 - mov v14.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) + mov v17.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) movn x20,#0 csel x10, x20, x10,LT MOV x20,#1 csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] - mov v14.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) + mov 
SIGN_UP_CHANGE_DONE_RESIDUE:
    LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
-   cmhi v22.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+   cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
-   cmhi v24.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+   cmhi v24.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
-   ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+   ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
    ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
    mov v20.d[1],v20.d[0]
-   NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down)
+   NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down)
    TBL v26.16b, {v20.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    //TBL v27.8b, {v20.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
-   EXT v14.16b, v14.16b , v14.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 14)
+   EXT v17.16b, v17.16b , v17.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 14)
-   Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
-   AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+   Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+   AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
    mov v27.d[0],v26.d[1]
    UZP1 v31.8b, v26.8b, v27.8b
@@ -1085,7 +1085,7 @@ SIGN_UP_CHANGE_DONE_RESIDUE:
    ZIP2 v25.8b, v24.8b, v25.8b
    mov v24.8b,v31.8b
-   mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row
+   mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row
    SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
@@ -1148,7 +1148,7 @@ END_LOOPS:
    ldp x23, x24,[sp],#16
    ldp x21, x22,[sp],#16
    ldp x19, x20,[sp],#16
-   pop_v_regs
+
    ret
diff --git a/common/arm64/ihevc_weighted_pred_bi.s b/common/arm64/ihevc_weighted_pred_bi.s
index 6851cb4..c0508d8 100644
--- a/common/arm64/ihevc_weighted_pred_bi.s
+++ b/common/arm64/ihevc_weighted_pred_bi.s
@@ -161,7 +161,7 @@ ihevc_weighted_pred_bi_av8:
    sxtw x11,w11
    sxtw x12,w12
-   push_v_regs
+   stp x19, x20,[sp,#-16]!
    stp x21, x22,[sp,#-16]!
    stp x23, x24,[sp,#-16]!
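These prologue rewrites are the heart of the patch. Under AAPCS64, x19-x28 are callee-saved, while among the SIMD registers only the low 64 bits of v8-v15 must be preserved; v0-v7 and v16-v31 may be clobbered freely. The hunks that follow therefore remap every temporary off v8-v15 (here v8 to v5, v10 to v6, v12 to v17, v14 to v19), which lets push_v_regs/pop_v_regs disappear entirely and leaves only the general-purpose saves. A minimal sketch of the resulting save/restore pattern, assuming GNU as syntax and a hypothetical function name:

    // Leaf function that keeps its SIMD data out of v8-v15, so no
    // vector registers need to be saved at all (AAPCS64).
    .global sketch_av8
sketch_av8:
    stp x19, x20,[sp,#-16]!     // pre-indexed push; sp stays 16-byte aligned
    stp x21, x22,[sp,#-16]!
    // ... kernel body using x19-x22 plus caller-saved v-registers ...
    ldp x21, x22,[sp],#16       // post-indexed pops, in reverse order
    ldp x19, x20,[sp],#16
    ret

Each stp/ldp pair moves sp by exactly 16 bytes, so the stack pointer never loses the 16-byte alignment that AArch64 sp-relative accesses require (see the note at the end of this diff).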
@@ -221,64 +221,64 @@ core_loop:
    ld1 {v1.4h},[x1],#8 //load and increment the pi2_src2
    smull v4.4s, v0.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0)
    ld1 {v2.4h},[x6],x3 //load and increment the pi2_src_tmp1 ii iteration
-   smull v8.4s, v1.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
+   smull v5.4s, v1.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
    ld1 {v3.4h},[x8],x4 //load and increment the pi2_src_tmp1 ii iteration
-   add v4.4s, v4.4s , v8.4s //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)
+   add v4.4s, v4.4s , v5.4s //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)
    ld1 {v0.4h},[x6],x3 //load and increment the pi2_src1 iii iteration
-   smull v10.4s, v2.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration
+   smull v6.4s, v2.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration
    ld1 {v1.4h},[x8],x4 //load and increment the pi2_src2 iii iteration
    add v4.4s, v4.4s , v30.4s //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
-   smull v14.4s, v0.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration
+   smull v19.4s, v0.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration
    ld1 {v2.4h},[x6],x3 //load and increment the pi2_src_tmp1 iv iteration
-   smull v12.4s, v3.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
+   smull v17.4s, v3.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
    sshl v4.4s,v4.4s,v28.4s //vshlq_s32(i4_tmp1_t1, tmp_shift_t)
    ld1 {v3.4h},[x8],x4 //load and increment the pi2_src_tmp1 iv iteration
-   add v10.4s, v10.4s , v12.4s //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration
+   add v6.4s, v6.4s , v17.4s //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration
    sqxtun v4.4h, v4.4s //vqmovun_s32(sto_res_tmp1)
    smull v16.4s, v1.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration
-   add v10.4s, v10.4s , v30.4s //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
+   add v6.4s, v6.4s , v30.4s //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
    //mov v5, v4 //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
-   add v14.4s, v14.4s , v16.4s //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) iii iteration
+   add v19.4s, v19.4s , v16.4s //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) iii iteration
-   sshl v10.4s,v10.4s,v28.4s
+   sshl v6.4s,v6.4s,v28.4s
    //vshl.s32 q5,q5,q14 //vshlq_s32(i4_tmp2_t1, tmp_shift_t) ii iteration
    smull v18.4s, v2.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration
    uqxtn v4.8b,v4.8h //vqmovn.u16 d4,q2 //vqmovn_u16(sto_res_tmp3)
-   add v14.4s, v14.4s , v30.4s //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
+   add v19.4s, v19.4s , v30.4s //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
-   sqxtun v10.4h, v10.4s //vqmovun_s32(sto_res_tmp1) ii iteration
+   sqxtun v6.4h, v6.4s //vqmovun_s32(sto_res_tmp1) ii iteration
    smull v20.4s, v3.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration
-   sshl v14.4s,v14.4s,v28.4s
+   sshl v19.4s,v19.4s,v28.4s
    //vshl.s32 q7,q7,q14 //vshlq_s32(i4_tmp1_t1, tmp_shift_t) iii iteration
    //mov v11, v10 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration
    add v18.4s, v18.4s , v20.4s //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
-   sqxtun v14.4h, v14.4s //vqmovun_s32(sto_res_tmp1) iii iteration
+   sqxtun v19.4h, v19.4s //vqmovun_s32(sto_res_tmp1) iii iteration
    add v18.4s, v18.4s , v30.4s //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
    st1 {v4.s}[0],[x2],#4 //store pu1_dst i iteration
-   uqxtn v10.8b,v10.8h
+   uqxtn v6.8b,v6.8h
    //vqmovn.u16 d10,q5 //vqmovn_u16(sto_res_tmp3) ii iteration
    sshl v18.4s,v18.4s,v28.4s
    //vshl.s32 q9,q9,q14 //vshlq_s32(i4_tmp2_t1, tmp_shift_t) iv iteration
-   st1 {v10.s}[0],[x10],x5 //store pu1_dst ii iteration
+   st1 {v6.s}[0],[x10],x5 //store pu1_dst ii iteration
    //mov v15, v14 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
-   uqxtn v14.8b,v14.8h
+   uqxtn v19.8b,v19.8h
    //vqmovn.u16 d14,q7 //vqmovn_u16(sto_res_tmp3) iii iteration
    sqxtun v18.4h, v18.4s //vqmovun_s32(sto_res_tmp1) iv iteration
    //mov v19, v18 //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
-   st1 {v14.s}[0],[x10],x5 //store pu1_dst iii iteration
+   st1 {v19.s}[0],[x10],x5 //store pu1_dst iii iteration
    uqxtn v18.8b,v18.8h
    //vqmovn.u16 d18,q9 //vqmovn_u16(sto_res_tmp3) iv iteration
    subs x7,x7,#4 //decrement wd by 4 and check for 0
@@ -306,7 +306,7 @@ end_loops:
    ldp x23, x24,[sp],#16
    ldp x21, x22,[sp],#16
    ldp x19, x20,[sp],#16
-   pop_v_regs
+
    ret
diff --git a/common/arm64/ihevc_weighted_pred_bi_default.s b/common/arm64/ihevc_weighted_pred_bi_default.s
index 07fb4ce..d98e025 100644
--- a/common/arm64/ihevc_weighted_pred_bi_default.s
+++ b/common/arm64/ihevc_weighted_pred_bi_default.s
@@ -122,7 +122,7 @@ ihevc_weighted_pred_bi_default_av8:
    ldr w9,[sp,#8]
    // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
-   push_v_regs
+   stp x19, x20,[sp,#-16]!
    stp x21, x22,[sp,#-16]!
@@ -195,11 +195,11 @@ core_loop_4:
    ld1 {v6.4h},[x0],#8 //load and increment the pi2_src1
    add x14,x2,x5 //pu1_dst_tmp = pu1_dst + dst_strd
    ld1 {v7.4h},[x1],#8 //load and increment the pi2_src2
-   ld1 {v8.4h},[x11],x3 //load and increment the pi2_src1 ii iteration
+   ld1 {v1.4h},[x11],x3 //load and increment the pi2_src1 ii iteration
    sqadd v18.4h,v6.4h,v7.4h
    sqadd v18.4h,v18.4h,v0.4h //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
-   ld1 {v9.4h},[x12],x4 //load and increment the pi2_src2 ii iteration
-   sqadd v20.4h,v8.4h,v9.4h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+   ld1 {v3.4h},[x12],x4 //load and increment the pi2_src2 ii iteration
+   sqadd v20.4h,v1.4h,v3.4h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
    sqadd v19.4h,v20.4h,v0.4h //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
    mov v18.d[1],v19.d[0]
    sqshrun v20.8b, v18.8h,#7
@@ -250,11 +250,11 @@ core_loop_chroma_4x2:
    ld1 {v6.4h},[x0],#8 //load and increment the pi2_src1
    add x14,x2,x5 //pu1_dst_tmp = pu1_dst + dst_strd
    ld1 {v7.4h},[x1],#8 //load and increment the pi2_src2
-   ld1 {v8.4h},[x11],x3 //load and increment the pi2_src1 ii iteration
+   ld1 {v1.4h},[x11],x3 //load and increment the pi2_src1 ii iteration
    sqadd v18.4h,v6.4h,v7.4h
    sqadd v18.4h,v18.4h,v0.4h //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
-   ld1 {v9.4h},[x12],x4 //load and increment the pi2_src2 ii iteration
-   sqadd v20.4h,v8.4h,v9.4h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+   ld1 {v3.4h},[x12],x4 //load and increment the pi2_src2 ii iteration
+   sqadd v20.4h,v1.4h,v3.4h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
    sqadd v19.4h,v20.4h,v0.4h //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
    mov v18.d[1],v19.d[0]
    sqshrun v20.8b, v18.8h,#7
@@ -301,17 +301,17 @@ core_loop_8:
    ld1 { v18.8h},[x12],x4 //load and increment the pi2_src2 iii iteration
    sqadd v22.8h,v22.8h,v0.8h //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
    sqshrun v20.8b, v24.8h,#7
-   ld1 { v12.8h},[x11],x3 //load and increment the pi2_src1 iv iteration
+   ld1 { v17.8h},[x11],x3 //load and increment the pi2_src1 iv iteration
    sqadd v30.8h,v16.8h,v18.8h
    sqshrun v21.8b, v22.8h,#7
-   ld1 { v14.8h},[x12],x4 //load and increment the pi2_src2 iv iteration
+   ld1 { v29.8h},[x12],x4 //load and increment the pi2_src2 iv iteration
    sqadd v30.8h,v30.8h,v0.8h //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
    st1 {v20.2s},[x2],#8 //store pu1_dst i iteration
-   sqadd v8.8h,v12.8h,v14.8h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
+   sqadd v1.8h,v17.8h,v29.8h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
    st1 {v21.2s},[x14],x5 //store pu1_dst ii iteration
-   sqadd v8.8h,v8.8h,v0.8h
+   sqadd v1.8h,v1.8h,v0.8h
    sqshrun v30.8b, v30.8h,#7
-   sqshrun v31.8b, v8.8h,#7
+   sqshrun v31.8b, v1.8h,#7
    add x11,x0,x3 //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add x12,x1,x4 //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    st1 {v30.2s},[x14],x5 //store pu1_dst iii iteration
    //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteratio
@@ -413,40 +413,40 @@ prolog_16:
    ld1 { v2.8h},[x0],#16 //load and increment the pi2_src1
    ld1 { v4.8h},[x1],#16 //load and increment the pi2_src2
-   ld1 { v10.8h},[x0],x11 //load and increment the pi2_src1
-   ld1 { v12.8h},[x1],x11 //load and increment the pi2_src2
+   ld1 { v5.8h},[x0],x11 //load and increment the pi2_src1
+   ld1 { v17.8h},[x1],x11 //load and increment the pi2_src2
    ld1 { v6.8h},[x0],#16 //load and increment the pi2_src1 ii iteration
    subs x9,x9,#16
-   ld1 { v8.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
+   ld1 { v1.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
    sub x20,x8,#2
    csel x8, x20, x8,eq
    sqadd v22.8h,v2.8h,v4.8h
-   ld1 { v14.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
-   sqadd v28.8h,v10.8h,v12.8h
+   ld1 { v29.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
+   sqadd v28.8h,v5.8h,v17.8h
    ld1 { v16.8h},[x1],x12 //load and increment the pi2_src2 ii iteration
    add x20,x0,x7
    csel x0, x20, x0,eq
    add x20,x1,x7
    csel x1, x20, x1,eq
-   sqadd v24.8h,v6.8h,v8.8h
+   sqadd v24.8h,v6.8h,v1.8h
    ld1 { v2.8h},[x0],#16
-   sqadd v26.8h,v14.8h,v16.8h
+   sqadd v26.8h,v29.8h,v16.8h
    // if the input is chroma with 8x2 block size
    cmp x8,#0
    beq epilog_16
    ld1 { v4.8h},[x1],#16 //load and increment the pi2_src2
    sqadd v22.8h,v22.8h,v0.8h
-   ld1 { v10.8h},[x0],x11 //load and increment the pi2_src1
+   ld1 { v5.8h},[x0],x11 //load and increment the pi2_src1
    sqadd v28.8h,v28.8h,v0.8h
-   ld1 { v12.8h},[x1],x11 //load and increment the pi2_src2
+   ld1 { v17.8h},[x1],x11 //load and increment the pi2_src2
    sqadd v24.8h,v24.8h,v0.8h
    ld1 { v6.8h},[x0],#16 //load and increment the pi2_src1 ii iteration
    sqadd v30.8h,v26.8h,v0.8h
    sqshrun v20.8b, v22.8h,#7
-   ld1 { v8.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
+   ld1 { v1.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
    sqshrun v21.8b, v28.8h,#7
-   ld1 { v14.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
+   ld1 { v29.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
    sqshrun v26.8b, v24.8h,#7
    ld1 { v16.8h},[x1],x12 //load and increment the pi2_src2 ii iteration
    sqshrun v27.8b, v30.8h,#7
@@ -463,15 +463,15 @@ core_loop_16:
    mov v20.d[1],v21.d[0]
    mov v26.d[1],v27.d[0]
    st1 { v20.4s},[x2],x5
-   sqadd v28.8h,v10.8h,v12.8h
+   sqadd v28.8h,v5.8h,v17.8h
    st1 { v26.4s},[x2],x10
    add x20,x2,x14
    csel x2, x20, x2,eq
-   sqadd v24.8h,v6.8h,v8.8h
+   sqadd v24.8h,v6.8h,v1.8h
    subs x9,x9,#16
    add x20,x0,x7
    csel x0, x20, x0,eq
-   sqadd v26.8h,v14.8h,v16.8h
+   sqadd v26.8h,v29.8h,v16.8h
    add x20,x1,x7
    csel x1, x20, x1,eq
@@ -487,15 +487,15 @@ core_loop_16:
    sqadd v28.8h,v28.8h,v0.8h
    ld1 { v4.8h},[x1],#16 //load and increment the pi2_src2
    sqadd v24.8h,v24.8h,v0.8h
-   ld1 { v10.8h},[x0],x11 //load and increment the pi2_src1
+   ld1 { v5.8h},[x0],x11 //load and increment the pi2_src1
    sqadd v30.8h,v26.8h,v0.8h
-   ld1 { v12.8h},[x1],x11 //load and increment the pi2_src2
+   ld1 { v17.8h},[x1],x11 //load and increment the pi2_src2
    sqshrun v20.8b, v22.8h,#7
    ld1 { v6.8h},[x0],#16 //load and increment the pi2_src1 ii iteration
    sqshrun v21.8b, v28.8h,#7
-   ld1 { v8.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
+   ld1 { v1.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
    sqshrun v26.8b, v24.8h,#7
-   ld1 { v14.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
+   ld1 { v29.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
    sqshrun v27.8b, v30.8h,#7
    ld1 { v16.8h},[x1],x12 //load and increment the pi2_src2 ii iteration
@@ -533,7 +533,7 @@ end_loops:
    // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
    ldp x21, x22,[sp],#16
    ldp x19, x20,[sp],#16
-   pop_v_regs
+
    ret
diff --git a/common/arm64/ihevc_weighted_pred_uni.s b/common/arm64/ihevc_weighted_pred_uni.s
index d805230..5586679 100644
--- a/common/arm64/ihevc_weighted_pred_uni.s
+++ b/common/arm64/ihevc_weighted_pred_uni.s
@@ -129,7 +129,7 @@ ihevc_weighted_pred_uni_av8:
    ldr w9,[sp,#8]
    // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
-   push_v_regs
+   stp x19, x20,[sp,#-16]!
    stp x21, x22,[sp,#-16]!
@@ -175,37 +175,37 @@ core_loop:
    smull v4.4s, v1.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0)
    add v4.4s, v4.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t)
-   ld1 {v8.4h},[x5],x2 //load and increment the pi2_src iii iteration
+   ld1 {v3.4h},[x5],x2 //load and increment the pi2_src iii iteration
    smull v6.4s, v2.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) ii iteration
-   ld1 {v9.4h},[x5],x2 //load and increment the pi2_src_tmp iv iteration
+   ld1 {v5.4h},[x5],x2 //load and increment the pi2_src_tmp iv iteration
    sshl v4.4s,v4.4s,v28.4s
    //vshl.s32 q2,q2,q14 //vshlq_s32(i4_tmp1_t, tmp_shift_t)
    add v6.4s, v6.4s , v30.4s //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) ii iteration
-   smull v10.4s, v8.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration
+   smull v7.4s, v3.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration
    sqxtun v4.4h, v4.4s //vqmovun_s32(sto_res_tmp1)
-   add v10.4s, v10.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration
+   add v7.4s, v7.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration
    //mov v5, v4 //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
    sshl v6.4s,v6.4s,v28.4s
    //vshl.s32 q3,q3,q14 //vshlq_s32(i4_tmp2_t, tmp_shift_t) ii iteration
-   smull v12.4s, v9.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration
+   smull v16.4s, v5.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration
    uqxtn v4.8b, v4.8h //vqmovn_u16(sto_res_tmp3)
-   sshl v10.4s,v10.4s,v28.4s
+   sshl v7.4s,v7.4s,v28.4s
    //vshl.s32 q5,q5,q14 //vshlq_s32(i4_tmp1_t, tmp_shift_t) iii iteration
    sqxtun v6.4h, v6.4s //vqmovun_s32(sto_res_tmp1) ii iteration
-   add v12.4s, v12.4s , v30.4s //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) iv iteration
+   add v16.4s, v16.4s , v30.4s //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) iv iteration
    //mov v7, v6 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration
-   sqxtun v10.4h, v10.4s //vqmovun_s32(sto_res_tmp1) iii iteration
+   sqxtun v7.4h, v7.4s //vqmovun_s32(sto_res_tmp1) iii iteration
-   sshl v12.4s,v12.4s,v28.4s
+   sshl v16.4s,v16.4s,v28.4s
    //vshl.s32 q6,q6,q14 //vshlq_s32(i4_tmp2_t, tmp_shift_t) iv iteration
    st1 {v4.s}[0],[x1],#4 //store pu1_dst i iteration
    //mov v11, v10 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
@@ -213,15 +213,15 @@ core_loop:
    uqxtn v6.8b, v6.8h //vqmovn_u16(sto_res_tmp3) ii iteration
    st1 {v6.s}[0],[x6],x3 //store pu1_dst ii iteration
-   uqxtn v10.8b, v10.8h //vqmovn_u16(sto_res_tmp3) iii iteration
-   sqxtun v12.4h, v12.4s //vqmovun_s32(sto_res_tmp1) iv iteration
+   uqxtn v7.8b, v7.8h //vqmovn_u16(sto_res_tmp3) iii iteration
+   sqxtun v16.4h, v16.4s //vqmovun_s32(sto_res_tmp1) iv iteration
    //mov v13, v12 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iv iteration
-   st1 {v10.s}[0],[x6],x3 //store pu1_dst i iteration iii iteration
-   uqxtn v12.8b, v12.8h //vqmovn_u16(sto_res_tmp3) iv iteration
+   st1 {v7.s}[0],[x6],x3 //store pu1_dst i iteration iii iteration
+   uqxtn v16.8b, v16.8h //vqmovn_u16(sto_res_tmp3) iv iteration
    subs x9,x9,#4 //decrement wd by 4 and check for 0
-   st1 {v12.s}[0],[x6],x3 //store pu1_dst iv iteration
+   st1 {v16.s}[0],[x6],x3 //store pu1_dst iv iteration
    bgt core_loop //if greater than 0 repeat the core loop again
end_core_loop:
@@ -239,7 +239,7 @@ end_loops:
    // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
    ldp x21, x22,[sp],#16
    ldp x19, x20,[sp],#16
-   pop_v_regs
+
    ret
diff --git a/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
index 485ee66..a6041f5 100644
--- a/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
+++ b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
@@ -91,7 +91,10 @@ ihevcd_fmt_conv_420sp_to_rgba8888_av8:
    //// push the registers on the stack
    // STMFD sp!,{x4-x12,x14}
-   push_v_regs
+
+   stp d12,d14,[sp,#-16]!
+   stp d8,d15,[sp,#-16]! // Storing d15 using { sub sp,sp,#8; str d15,[sp] } is giving bus error.
+                         // d8 is used as dummy register and stored along with d15 using stp. d8 is not used in the function.
    stp x19, x20,[sp,#-16]!
@@ -194,8 +197,8 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
    prfm PLDL1KEEP,[x1]
    ////NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS
-   sMULL v8.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
-   sMULL2 v10.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
+   sMULL v5.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
+   sMULL2 v7.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
    sMULL v20.4s, v6.4h, v0.4h[0] ////(V-128)*C1 FOR R
    sMULL2 v22.4s, v6.8h, v0.4h[0] ////(V-128)*C1 FOR R
@@ -206,13 +209,13 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
    sMLAL2 v14.4s, v6.8h, v0.4h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
    ////NARROW RIGHT SHIFT BY 13 FOR R&B
-   sqshrn v8.4h, v8.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
-   sqshrn2 v8.8h, v10.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
+   sqshrn v5.4h, v5.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
+   sqshrn2 v5.8h, v7.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
    ////Q4 - WEIGHT FOR B
    ////NARROW RIGHT SHIFT BY 13 FOR R&B
-   sqshrn v10.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
-   sqshrn2 v10.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
+   sqshrn v7.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
+   sqshrn2 v7.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
    ////Q5 - WEIGHT FOR R
    ////NARROW RIGHT SHIFT BY 13 FOR G
@@ -220,12 +223,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
    sqshrn2 v12.8h, v14.4s,#13 ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
    ////Q6 - WEIGHT FOR G
-   UADDW v14.8h, v8.8h , v30.8b ////Q7 - HAS Y + B
-   UADDW v16.8h, v10.8h , v30.8b ////Q8 - HAS Y + R
+   UADDW v14.8h, v5.8h , v30.8b ////Q7 - HAS Y + B
+   UADDW v16.8h, v7.8h , v30.8b ////Q8 - HAS Y + R
    UADDW v18.8h, v12.8h , v30.8b ////Q9 - HAS Y + G
-   UADDW v20.8h, v8.8h , v31.8b ////Q10 - HAS Y + B
-   UADDW v22.8h, v10.8h , v31.8b ////Q11 - HAS Y + R
+   UADDW v20.8h, v5.8h , v31.8b ////Q10 - HAS Y + B
+   UADDW v22.8h, v7.8h , v31.8b ////Q11 - HAS Y + R
    UADDW v24.8h, v12.8h , v31.8b ////Q12 - HAS Y + G
    sqxtun v14.8b, v14.8h
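The renames in this converter touch a compact fixed-point idiom worth making explicit: the YUV-to-RGB coefficients are Q13 values, so each chroma weight is formed by a widening 16x16-to-32-bit multiply, narrowed back to 16 bits with a saturating right shift by 13, and only then added to the widened 8-bit luma row. A standalone sketch of that sequence, following the new register assignments above (written with the standard element syntax v0.h[3]; the sources spell the same operand v0.4h[3]):

    smull v5.4s, v4.4h, v0.h[3]      // low 4 lanes: (U-128)*C4 as 32-bit
    smull2 v7.4s, v4.8h, v0.h[3]     // high 4 lanes
    sqshrn v5.4h, v5.4s,#13          // saturating >>13 back to 16-bit (Q13)
    sqshrn2 v5.8h, v7.4s,#13
    uaddw v14.8h, v5.8h , v30.8b     // widen the 8-bit Y row and add: Y + B-weight
    sqxtun v14.8b, v14.8h            // saturate to the final unsigned 8-bit B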
@@ -276,12 +279,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
    ////D14-D20 - TOALLY HAVE 16 VALUES
    ////WE NEED TO SHIFT R,G,B VALUES TO GET 5BIT,6BIT AND 5BIT COMBINATIONS
-   UADDW v14.8h, v8.8h , v28.8b ////Q7 - HAS Y + B
-   UADDW v16.8h, v10.8h , v28.8b ////Q2 - HAS Y + R
+   UADDW v14.8h, v5.8h , v28.8b ////Q7 - HAS Y + B
+   UADDW v16.8h, v7.8h , v28.8b ////Q2 - HAS Y + R
    UADDW v18.8h, v12.8h , v28.8b ////Q3 - HAS Y + G
-   UADDW v20.8h, v8.8h , v29.8b ////Q10 - HAS Y + B
-   UADDW v22.8h, v10.8h , v29.8b ////Q11 - HAS Y + R
+   UADDW v20.8h, v5.8h , v29.8b ////Q10 - HAS Y + B
+   UADDW v22.8h, v7.8h , v29.8b ////Q11 - HAS Y + R
    UADDW v24.8h, v12.8h , v29.8b ////Q12 - HAS Y + G
    ////COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
@@ -357,8 +360,8 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
    ////NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS
-   sMULL v8.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
-   sMULL2 v10.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
+   sMULL v5.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
+   sMULL2 v7.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
    sMULL v20.4s, v6.4h, v0.4h[0] ////(V-128)*C1 FOR R
    sMULL2 v22.4s, v6.8h, v0.4h[0] ////(V-128)*C1 FOR R
@@ -369,13 +372,13 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
    sMLAL2 v14.4s, v6.8h, v0.4h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
    ////NARROW RIGHT SHIFT BY 13 FOR R&B
-   sqshrn v8.4h, v8.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
-   sqshrn2 v8.8h, v10.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
+   sqshrn v5.4h, v5.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
+   sqshrn2 v5.8h, v7.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
    ////Q4 - WEIGHT FOR B
    ////NARROW RIGHT SHIFT BY 13 FOR R&B
-   sqshrn v10.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
-   sqshrn2 v10.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
+   sqshrn v7.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
+   sqshrn2 v7.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
    ////Q5 - WEIGHT FOR R
    ////NARROW RIGHT SHIFT BY 13 FOR G
@@ -383,12 +386,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
    sqshrn2 v12.8h, v14.4s,#13 ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
    ////Q6 - WEIGHT FOR G
-   UADDW v14.8h, v8.8h , v30.8b ////Q7 - HAS Y + B
-   UADDW v16.8h, v10.8h , v30.8b ////Q8 - HAS Y + R
+   UADDW v14.8h, v5.8h , v30.8b ////Q7 - HAS Y + B
+   UADDW v16.8h, v7.8h , v30.8b ////Q8 - HAS Y + R
    UADDW v18.8h, v12.8h , v30.8b ////Q9 - HAS Y + G
-   UADDW v20.8h, v8.8h , v31.8b ////Q10 - HAS Y + B
-   UADDW v22.8h, v10.8h , v31.8b ////Q11 - HAS Y + R
+   UADDW v20.8h, v5.8h , v31.8b ////Q10 - HAS Y + B
+   UADDW v22.8h, v7.8h , v31.8b ////Q11 - HAS Y + R
    UADDW v24.8h, v12.8h , v31.8b ////Q12 - HAS Y + G
    sqxtun v14.8b, v14.8h
@@ -439,12 +442,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
    ////D14-D20 - TOALLY HAVE 16 VALUES
    ////WE NEED TO SHIFT R,G,B VALUES TO GET 5BIT,6BIT AND 5BIT COMBINATIONS
-   UADDW v14.8h, v8.8h , v28.8b ////Q7 - HAS Y + B
-   UADDW v16.8h, v10.8h , v28.8b ////Q2 - HAS Y + R
+   UADDW v14.8h, v5.8h , v28.8b ////Q7 - HAS Y + B
+   UADDW v16.8h, v7.8h , v28.8b ////Q2 - HAS Y + R
    UADDW v18.8h, v12.8h , v28.8b ////Q3 - HAS Y + G
-   UADDW v20.8h, v8.8h , v29.8b ////Q10 - HAS Y + B
-   UADDW v22.8h, v10.8h , v29.8b ////Q11 - HAS Y + R
+   UADDW v20.8h, v5.8h , v29.8b ////Q10 - HAS Y + B
+   UADDW v22.8h, v7.8h , v29.8b ////Q11 - HAS Y + R
    UADDW v24.8h, v12.8h , v29.8b ////Q12 - HAS Y + G
    sqxtun v14.8b, v14.8h
@@ -513,7 +516,9 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
    ////POP THE REGISTERS
    // LDMFD sp!,{x4-x12,PC}
    ldp x19, x20,[sp],#16
-   pop_v_regs
+   ldp d8,d15,[sp],#16 // Loading d15 using { ldr d15,[sp]; add sp,sp,#8 } is giving bus error.
+                       // d8 is used as dummy register and loaded along with d15 using ldp. d8 is not used in the function.
+   ldp d12,d14,[sp],#16
    ret
diff --git a/decoder/arm64/ihevcd_itrans_recon_dc_luma.s b/decoder/arm64/ihevcd_itrans_recon_dc_luma.s
index 279888b..edc70e7 100644
--- a/decoder/arm64/ihevcd_itrans_recon_dc_luma.s
+++ b/decoder/arm64/ihevcd_itrans_recon_dc_luma.s
@@ -59,7 +59,7 @@ ihevcd_itrans_recon_dc_luma_av8:
-   push_v_regs
+   stp x19, x20,[sp,#-16]!
    sxth x5,w5
@@ -120,8 +120,8 @@ col_loop:
    ld1 {v6.8b},[x7],x2
    ld1 {v7.8b},[x7],x2
-   ld1 {v8.8b},[x7],x2
-   ld1 {v9.8b},[x7]
+   ld1 {v1.8b},[x7],x2
+   ld1 {v17.8b},[x7]
    add x0,x0,#8
@@ -132,8 +132,8 @@ col_loop:
    uaddw v24.8h, v0.8h , v5.8b
    uaddw v22.8h, v0.8h , v6.8b
    uaddw v20.8h, v0.8h , v7.8b
-   uaddw v18.8h, v0.8h , v8.8b
-   uaddw v16.8h, v0.8h , v9.8b
+   uaddw v18.8h, v0.8h , v1.8b
+   uaddw v16.8h, v0.8h , v17.8b
    mov x11,x1
    sqxtun v2.8b, v30.8h
@@ -142,8 +142,8 @@ col_loop:
    sqxtun v5.8b, v24.8h
    sqxtun v6.8b, v22.8h
    sqxtun v7.8b, v20.8h
-   sqxtun v8.8b, v18.8h
-   sqxtun v9.8b, v16.8h
+   sqxtun v1.8b, v18.8h
+   sqxtun v17.8b, v16.8h
    st1 {v2.2s},[x11],x3
@@ -152,8 +152,8 @@ col_loop:
    st1 {v5.2s},[x11],x3
    st1 {v6.2s},[x11],x3
    st1 {v7.2s},[x11],x3
-   st1 {v8.2s},[x11],x3
-   st1 {v9.2s},[x11]
+   st1 {v1.2s},[x11],x3
+   st1 {v17.2s},[x11]
    add x1,x1,#8
@@ -206,7 +206,7 @@ col_loop_4:
end_loops:
    ldp x19, x20,[sp],#16
-   pop_v_regs
+
    ret
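A closing note on the bus-error comments in this patch: AArch64 sp-relative loads and stores require sp to be 16-byte aligned whenever stack alignment checking is enabled, which is the usual configuration on Android kernels. Saving one d-register with an 8-byte push therefore leaves sp misaligned and the very next sp-based access faults, so the fix is to pair the live register with an otherwise unused one and keep every sp adjustment a multiple of 16. A minimal sketch of the failing pattern and the workaround (the faulting sequence is shown only in comments):

    // sub sp, sp, #8         // sp now only 8-byte aligned
    // str d9,[sp]            // sp-relative store faults -> reported as a bus error
    stp d8, d9,[sp,#-16]!     // d8 is a dummy; one aligned 16-byte slot
    // ... function body ...
    ldp d8, d9,[sp],#16       // restore with a single aligned access
    ret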