| author | Naveen Kumar Ponnusamy <naveenkumar.p@ittiam.com> | 2014-06-10 12:14:27 -0700 |
| --- | --- | --- |
| committer | Lajos Molnar <lajos@google.com> | 2014-07-12 15:09:24 -0700 |
| commit | 9cbd70a2930875be59d7df68136ac9a1a949a13d (patch) | |
| tree | 6d9957d14352fc77e2323f90b49387e577f1ade2 | |
| parent | 707042fda96ebede81408b854385173483798bcd (diff) | |
| download | android_external_libhevc-9cbd70a2930875be59d7df68136ac9a1a949a13d.tar.gz android_external_libhevc-9cbd70a2930875be59d7df68136ac9a1a949a13d.tar.bz2 android_external_libhevc-9cbd70a2930875be59d7df68136ac9a1a949a13d.zip | |
Reduced stack operations in arm64 assembly
Change-Id: Ia19a99001fef37334f18521dd8f8710907fe370d
46 files changed, 2398 insertions(+), 2331 deletions(-)
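The patch makes two systematic changes across these files: the blanket `push_v_regs`/`pop_v_regs` macros are replaced with explicit `stp`/`ldp` pairs that save only the registers each function actually clobbers, and scratch values are moved out of `v8`-`v15` (whose low halves, `d8`-`d15`, are callee-saved under AAPCS64) into caller-saved registers such as `v7`, `v16`-`v31`, so most functions no longer need any SIMD saves at all. Below is a minimal sketch of the resulting prologue/epilogue pattern; the label is illustrative, not from the source. It also shows why the in-diff comments resort to a dummy `d8`: AArch64 requires `sp` to stay 16-byte aligned when it is used as a base address, so a lone 8-byte `str d9,[sp]` after `sub sp,sp,#8` faults, which is consistent with the bus errors the comments report, while a paired `stp` preserves the alignment.

```asm
// Hypothetical leaf function using the save/restore pattern this patch adopts.
example_leaf_av8:
    stp     d14,d15,[sp,#-16]!  // pre-index by 16 keeps sp 16-byte aligned
    stp     d12,d13,[sp,#-16]!
    stp     d10,d11,[sp,#-16]!
    stp     d8,d9,[sp,#-16]!    // d8 saved only as a pairing dummy for d9;
                                // "sub sp,sp,#8; str d9,[sp]" would leave sp
                                // 8-byte aligned and raise a bus error
    stp     x19,x20,[sp,#-16]!  // callee-saved GPRs the function uses

    // ... body: scratch lives in v0-v7/v16-v31, so no further SIMD spills ...

    ldp     x19,x20,[sp],#16    // restore in reverse (LIFO) order
    ldp     d8,d9,[sp],#16
    ldp     d10,d11,[sp],#16
    ldp     d12,d13,[sp],#16
    ldp     d14,d15,[sp],#16
    ret
```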
diff --git a/common/arm64/ihevc_deblk_luma_horz.s b/common/arm64/ihevc_deblk_luma_horz.s
index a5c314d..f6989e9 100644
--- a/common/arm64/ihevc_deblk_luma_horz.s
+++ b/common/arm64/ihevc_deblk_luma_horz.s
@@ -50,7 +50,8 @@ ihevc_deblk_luma_horz_av8:
     // stmfd sp!, {x3-x12,x14}
     sxtw x5,w5
     sxtw x6,w6
-    stp d8,d9,[sp,#-16]!
+    stp d8,d9,[sp,#-16]!    // Storing d9 using { sub sp,sp,#8; str d9,[sp] } is giving bus error.
+                            // d8 is used as dummy register and stored along with d9 using stp. d8 is not used in the function.
     stp d10,d11,[sp,#-16]!
     stp d12,d13,[sp,#-16]!
     stp d14,d15,[sp,#-16]!
@@ -212,11 +213,11 @@ l1.1564:
     neg x19, x1
     ldrb w7,[x0,x19]        // has the -1 value
     dup v22.2s,w2           // -4 value
-    uaddw v8.8h, v6.8h , v27.8b
+    uaddw v7.8h, v6.8h , v27.8b
     ldrb w3,[x0,#0]         // x4 has the 0 value
     uqadd v16.8b, v27.8b , v1.8b
     and x2,x2,#0xff
-    mul v12.8h, v8.8h, v0.4h[0]
+    mul v12.8h, v7.8h, v0.4h[0]
     ldr w8, [x0,x10]        // has the 3 value
     uaddl v10.8h, v24.8b , v28.8b
     subs x2,x2,x7
@@ -233,7 +234,7 @@ l1.1564:
     cmp x8,x5,asr #3
     bge l1.1840
-    uaddw v14.8h, v8.8h , v28.8b
+    uaddw v14.8h, v7.8h , v28.8b
     subs x7,x3,x7
     umax v4.8b, v18.8b , v31.8b
     csneg x7,x7,x7,pl
@@ -285,13 +286,13 @@ l1.1564:
     subs x2,x2,x7
     umax v3.8b, v18.8b , v31.8b
     csneg x2,x2,x2,pl
-    uaddw v8.8h, v6.8h , v26.8b
+    uaddw v7.8h, v6.8h , v26.8b
     add x8,x8,x2
     uqadd v30.8b, v25.8b , v1.8b
     cmp x8,x5,asr #3
     uqsub v31.8b, v25.8b , v1.8b
     bge l1.1840
-    mul v12.8h, v8.8h, v0.4h[0]
+    mul v12.8h, v7.8h, v0.4h[0]
     subs x7,x3,x7
     uqadd v16.8b, v24.8b , v1.8b
     csneg x7,x7,x7,pl
@@ -303,7 +304,7 @@ l1.1564:
     add x10, x10,#1
     rshrn v20.8b, v12.8h,#3
     cmp x7,x10,asr #1
-    uaddw v14.8h, v8.8h , v23.8b
+    uaddw v14.8h, v7.8h , v23.8b
     bge l1.1840
     umin v18.8b, v20.8b , v30.8b
     mov x2,#2
@@ -397,7 +398,7 @@ end_dep_deq_decision_horz:
     cmp x2,#1
     uqsub v31.8b, v23.8b , v1.8b
     beq l1.2408
-    uaddl v8.8h, v23.8b , v22.8b
+    uaddl v7.8h, v23.8b , v22.8b
     cmp x5,#1
     bne strong_filtering_p
@@ -412,10 +413,10 @@ strong_filtering_q:
 strong_filtering_p:
     umax v5.8b, v18.8b , v17.8b
     mov x12,x0
-    mul v8.8h, v8.8h, v0.4h[0]
+    mul v7.8h, v7.8h, v0.4h[0]
     sub x20,x1,#0
     neg x11, x20
-    add v16.8h, v8.8h , v14.8h
+    add v16.8h, v7.8h , v14.8h
     add x12,x12,x11
     rshrn v19.8b, v16.8h,#3
     st1 {v2.s}[0],[x12],x11
@@ -431,7 +432,8 @@ l1.2404:
     ldp d14,d15,[sp],#16
     ldp d12,d13,[sp],#16
     ldp d10,d11,[sp],#16
-    ldp d8,d9,[sp],#16
+    ldp d8,d9,[sp],#16      // Loading d9 using { ldr d9,[sp]; add sp,sp,#8 } is giving bus error.
+                            // d8 is used as dummy register and loaded along with d9 using ldp. d8 is not used in the function.
     ret                     // x4=flag p
@@ -486,8 +488,8 @@ l1.2408:
     srshr v10.8h, v10.8h,#4    // delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4@
-    abs v8.8h, v10.8h
-    xtn v9.8b, v8.8h
+    abs v7.8h, v10.8h
+    xtn v9.8b, v7.8h        // storing the absolute values of delta in d9
     sqxtn v10.8b, v10.8h
@@ -495,16 +497,16 @@ l1.2408:
     smin v11.8b, v10.8b , v30.8b
-    smax v8.8b, v31.8b , v11.8b    // d8 has the value delta = clip3(delta, -tc, tc)//
+    smax v7.8b, v31.8b , v11.8b    // d8 has the value delta = clip3(delta, -tc, tc)//
     uxtl v6.8h, v25.8b
-    saddw v4.8h, v6.8h , v8.8b
+    saddw v4.8h, v6.8h , v7.8b
     sqxtun v12.8b, v4.8h
     uxtl v6.8h, v26.8b
-    ssubw v4.8h, v6.8h , v8.8b
+    ssubw v4.8h, v6.8h , v7.8b
     sqxtun v13.8b, v4.8h
@@ -525,7 +527,7 @@ l1.2408:
     uaddl v14.8h, v23.8b , v25.8b
     rshrn v14.8b, v14.8h,#1
     usubl v14.8h, v14.8b , v24.8b
-    saddw v14.8h, v14.8h , v8.8b
+    saddw v14.8h, v14.8h , v7.8b
     sqshrn v14.8b, v14.8h,#1
     smin v15.8b, v14.8b , v0.8b
     smax v14.8b, v1.8b , v15.8b
@@ -558,7 +560,7 @@ l1.2724:
     uaddl v14.8h, v26.8b , v28.8b
     rshrn v14.8b, v14.8h,#1
     usubl v14.8h, v14.8b , v27.8b
-    ssubw v14.8h, v14.8h , v8.8b
+    ssubw v14.8h, v14.8h , v7.8b
     sqshrn v14.8b, v14.8h,#1
     smin v15.8b, v14.8b , v0.8b
     smax v14.8b, v1.8b , v15.8b
@@ -580,7 +582,8 @@ l1.2852:
     ldp d14,d15,[sp],#16
     ldp d12,d13,[sp],#16
     ldp d10,d11,[sp],#16
-    ldp d8,d9,[sp],#16
+    ldp d8,d9,[sp],#16      // Loading d9 using { ldr d9,[sp]; add sp,sp,#8 } is giving bus error.
+                            // d8 is used as dummy register and loaded along with d9 using ldp. d8 is not used in the function.
     ret
diff --git a/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s b/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s
index e479651..180e5f5 100644
--- a/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s
+++ b/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s
@@ -104,7 +104,7 @@ ihevc_inter_pred_chroma_copy_w16out_av8:
     // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments
-    push_v_regs
+    stp x19, x20,[sp,#-16]!
mov x15,x4 // pi1_coeff @@ -172,7 +172,7 @@ end_inner_loop_wd_4: end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret @@ -219,14 +219,14 @@ core_loop_wd_8: prolog: add x6,x0,x2 //pu1_src_tmp += src_strd add x10,x1,x5 - ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp) - ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) - uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp) - uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp) - uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp) + ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) + uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp) subs x4,x4,#8 //wd decrements by 8 shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6) shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6) @@ -235,10 +235,10 @@ prolog: add x20,x0,x8 csel x0, x20, x0,le add x6,x0,x2 //pu1_src_tmp += src_strd - ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp) - ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp) + ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp) st1 {v0.8h},[x1],#16 //vst1q_s16(pi2_dst_tmp, tmp) add x20,x1,x11,lsl #1 @@ -256,15 +256,15 @@ prolog: outer_loop_wd_8: st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) + uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp) st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp) - uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp) subs x4,x4,#8 //wd decrements by 8 add x20,x0,x8 @@ -272,16 +272,16 @@ outer_loop_wd_8: add x6,x0,x2 //pu1_src_tmp += src_strd - ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp) + ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp) shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6) - ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp) shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6) - ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp) shl v4.8h, v20.8h,#6 //vshlq_n_s16(tmp, 6) - ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp) add x10,x1,x5 shl v6.8h, v22.8h,#6 //vshlq_n_s16(tmp, 6) @@ -298,15 +298,15 @@ outer_loop_wd_8: epilog: st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) + uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp) st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp) - 
uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp) //add x6,x0,x2 //pu1_src_tmp += src_strd shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6) @@ -325,10 +325,10 @@ epilog_end: core_loop_wd_8_ht_2: add x6,x0,x2 //pu1_src_tmp += src_strd add x10,x1,x5 - ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp) - ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) - uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp) + ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) + uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp) subs x12,x12,#8 //wd decrements by 8 shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6) shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6) @@ -338,7 +338,7 @@ core_loop_wd_8_ht_2: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_inter_pred_chroma_horz.s b/common/arm64/ihevc_inter_pred_chroma_horz.s index cf4f0f9..513a362 100644 --- a/common/arm64/ihevc_inter_pred_chroma_horz.s +++ b/common/arm64/ihevc_inter_pred_chroma_horz.s @@ -105,7 +105,12 @@ ihevc_inter_pred_chroma_horz_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d9,d10,[sp,#-16]! + stp d11,d12,[sp,#-16]! + stp d13,d14,[sp,#-16]! + stp d8,d15,[sp,#-16]! // Storing d15 using { sub sp,sp,#8; str d15,[sp] } is giving bus error. + // d8 is used as dummy register and stored along with d15 using stp. d8 is not used in the function. stp x19, x20,[sp,#-16]! mov x15,x4 // pi1_coeff @@ -184,7 +189,7 @@ outer_loop_16: add x19,x4,#8 umull v30.8h, v2.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// - ld1 { v8.2s},[x4],x11 //vector load pu1_src + ld1 { v29.2s},[x4],x11 //vector load pu1_src ld1 { v9.2s},[x19],x11 //vector load pu1_src umlsl v30.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// @@ -239,7 +244,7 @@ inner_loop_16: csel x12, x20, x12,eq add x20,x12,x2 csel x4, x20, x4,eq - umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v22.8h, v29.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// @@ -282,7 +287,7 @@ inner_loop_16: umlal v20.8h, v13.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// add x19,x4,#8 - ld1 { v8.2s},[x4],x11 //vector load pu1_src + ld1 { v29.2s},[x4],x11 //vector load pu1_src ld1 { v9.2s},[x19],x11 //vector load pu1_src umlsl v20.8h, v15.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// @@ -351,7 +356,7 @@ epilog: - umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v22.8h, v29.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// subs x10,x10,#16 //decrement the wd loop umlal v22.8h, v12.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// add x20,x12,x8 @@ -383,7 +388,7 @@ epilog: add x19,x4,#8 - ld1 { v8.2s},[x4],x11 //vector load pu1_src + ld1 { v29.2s},[x4],x11 //vector load pu1_src ld1 { v9.2s},[x19],x11 //vector load pu1_src umlsl v30.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// ld1 { v10.2s},[x4],x11 //vector load pu1_src @@ -418,7 +423,7 @@ epilog_end: umull v22.8h, v10.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// - umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v22.8h, v29.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// umlal v22.8h, v12.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// umlsl v22.8h, v14.8b, v27.8b //mul_res = 
vmlal_u8(src[0_1], coeffabs_1)// @@ -478,12 +483,12 @@ inner_loop_8: ld1 {v3.2s},[x12],x11 //vector load pu1_src //vext.u8 d2,d0,d1,#2 //vector extract of src[0_2] - umull v8.8h, v1.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// - umlsl v8.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umull v29.8h, v1.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// + umlsl v29.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// //vext.u8 d4,d0,d1,#4 //vector extract of src[0_4] //vext.u8 d6,d0,d1,#6 //vector extract of src[0_6] - umlal v8.8h, v2.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// - umlsl v8.8h, v3.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// + umlal v29.8h, v2.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// + umlsl v29.8h, v3.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// ld1 {v4.2s},[x4],x11 //vector load pu1_src ld1 {v5.2s},[x4],x11 //vector load pu1_src @@ -495,11 +500,11 @@ inner_loop_8: umlsl v10.8h, v4.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// //vext.u8 d16,d12,d13,#4 //vector extract of src[0_4] //vext.u8 d18,d12,d13,#6 //vector extract of src[0_6] - sqrshrun v8.8b, v8.8h,#6 //right shift and saturating narrow result 1 + sqrshrun v29.8b, v29.8h,#6 //right shift and saturating narrow result 1 umlal v10.8h, v6.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// umlsl v10.8h, v7.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// - st1 {v8.8b},[x1],#8 //store the result pu1_dst + st1 {v29.8b},[x1],#8 //store the result pu1_dst sqrshrun v10.8b, v10.8h,#6 //right shift and saturating narrow result 2 subs x7,x7,#8 //decrement the wd loop @@ -545,17 +550,17 @@ inner_loop_ht_4: //sub x12, x12, #6 //(2) ld1 {v14.2s},[x12],x11 //(3)vector load pu1_src - umull v8.8h, v1.8b, v25.8b //(1)mul_res = vmull_u8(src[0_3], coeffabs_3)// + umull v29.8h, v1.8b, v25.8b //(1)mul_res = vmull_u8(src[0_3], coeffabs_3)// ld1 {v15.2s},[x12],x11 //(3)vector load pu1_src - umlsl v8.8h, v0.8b, v24.8b //(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v29.8h, v0.8b, v24.8b //(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)// ld1 {v16.2s},[x12],x11 //(3)vector load pu1_src - umlal v8.8h, v2.8b, v26.8b //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)// + umlal v29.8h, v2.8b, v26.8b //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)// //ld1 {v17.2s},[x12],x2 //(3)vector load pu1_src ld1 {v17.2s},[x12],x8 //(3)vector load pu1_src - umlsl v8.8h, v3.8b, v27.8b //(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)// + umlsl v29.8h, v3.8b, v27.8b //(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)// //sub x12, x12, #6 //(3) umull v10.8h, v5.8b, v25.8b //(2)mul_res = vmull_u8(src[0_3], coeffabs_3)// @@ -570,7 +575,7 @@ inner_loop_ht_4: umlsl v10.8h, v7.8b, v27.8b //(2)mul_res = vmlal_u8(src[0_1], coeffabs_1)// ld1 {v21.2s},[x12],x2 //(4)vector load pu1_src - sqrshrun v8.8b, v8.8h,#6 //(1)right shift and saturating narrow result 1 + sqrshrun v29.8b, v29.8h,#6 //(1)right shift and saturating narrow result 1 add x9,x9,#8 //(core loop) @@ -595,7 +600,7 @@ core_loop: //sub x12, x12, #6 //(1_1) - st1 {v8.8b},[x4],x3 //(1)store the result pu1_dst + st1 {v29.8b},[x4],x3 //(1)store the result pu1_dst sqrshrun v10.8b, v10.8h,#6 //(2)right shift and saturating narrow result 2 ld1 {v4.2s},[x12],x11 //(2_1)vector load pu1_src @@ -617,17 +622,17 @@ core_loop: sqrshrun v12.8b, v12.8h,#6 //(3)right shift and saturating narrow result 1 ld1 {v14.2s},[x12],x11 //(3_1)vector load pu1_src - umull v8.8h, v1.8b, v25.8b //(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)// 
+ umull v29.8h, v1.8b, v25.8b //(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)// ld1 {v15.2s},[x12],x11 //(3_1)vector load pu1_src - umlsl v8.8h, v0.8b, v24.8b //(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v29.8h, v0.8b, v24.8b //(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)// ld1 {v16.2s},[x12],x11 //(3_1)vector load pu1_src - umlal v8.8h, v2.8b, v26.8b //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)// + umlal v29.8h, v2.8b, v26.8b //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)// //ld1 {v17.2s},[x12],x2 //(3_1)vector load pu1_src ld1 {v17.2s},[x12],x8 //(3_1)vector load pu1_src - umlsl v8.8h, v3.8b, v27.8b //(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)// + umlsl v29.8h, v3.8b, v27.8b //(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)// //sub x12, x12, #6 //(3_1) @@ -653,7 +658,7 @@ core_loop: subs x7,x7,#8 //(core loop) st1 {v22.8b},[x4], x3 //(4)store the result pu1_dst - sqrshrun v8.8b, v8.8h,#6 //(1_1)right shift and saturating narrow result 1 + sqrshrun v29.8b, v29.8h,#6 //(1_1)right shift and saturating narrow result 1 mov x4, x1 //(core loop) @@ -668,7 +673,7 @@ epilogue: umlsl v12.8h, v17.8b, v27.8b //(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)// - st1 {v8.8b},[x4],x3 //(1)store the result pu1_dst + st1 {v29.8b},[x4],x3 //(1)store the result pu1_dst sqrshrun v10.8b, v10.8h,#6 //(2)right shift and saturating narrow result 2 umull v22.8h, v19.8b, v25.8b //(4)mul_res = vmull_u8(src[0_3], coeffabs_3)// @@ -735,16 +740,16 @@ inner_loop_4: zip1 v3.2s, v23.2s, v19.2s zip2 v7.2s, v23.2s, v19.2s - umull v8.8h, v1.8b, v25.8b //arithmetic operations for ii iteration in the same time - umlsl v8.8h, v0.8b, v24.8b - umlal v8.8h, v2.8b, v26.8b - umlsl v8.8h, v3.8b, v27.8b + umull v29.8h, v1.8b, v25.8b //arithmetic operations for ii iteration in the same time + umlsl v29.8h, v0.8b, v24.8b + umlal v29.8h, v2.8b, v26.8b + umlsl v29.8h, v3.8b, v27.8b - sqrshrun v8.8b, v8.8h,#6 //narrow right shift and saturating the result - st1 {v8.s}[0],[x1],#4 //store the i iteration result which is in upper part of the register + sqrshrun v29.8b, v29.8h,#6 //narrow right shift and saturating the result + st1 {v29.s}[0],[x1],#4 //store the i iteration result which is in upper part of the register subs x7,x7,#4 //decrement the wd by 4 - st1 {v8.s}[1],[x6],#4 //store the ii iteration result which is in lower part of the register + st1 {v29.s}[1],[x6],#4 //store the ii iteration result which is in lower part of the register bgt inner_loop_4 @@ -759,7 +764,11 @@ end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d8,d15,[sp],#16 // Loading d15 using { ldr d15,[sp]; add sp,sp,#8 } is giving bus error. + // d8 is used as dummy register and loaded along with d15 using ldp. d8 is not used in the function. + ldp d13,d14,[sp],#16 + ldp d11,d12,[sp],#16 + ldp d9,d10,[sp],#16 ret diff --git a/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s b/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s index a35fdaa..efc09f9 100644 --- a/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s +++ b/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s @@ -104,7 +104,10 @@ ihevc_inter_pred_chroma_horz_w16out_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! stp x19, x20,[sp,#-16]! 
mov x15,x4 // pi1_coeff @@ -201,8 +204,8 @@ outer_loop_16: add x19,x4,#8 umull v30.8h, v2.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// - ld1 { v8.2s},[x4],x11 //vector load pu1_src - ld1 { v9.2s},[x19],x11 //vector load pu1_src + ld1 { v29.2s},[x4],x11 //vector load pu1_src + ld1 { v31.2s},[x19],x11 //vector load pu1_src umlsl v30.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// @@ -261,7 +264,7 @@ inner_loop_16: st1 { v30.8h}, [x1],#16 - umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v22.8h, v29.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// @@ -284,15 +287,15 @@ inner_loop_16: umull v20.8h, v11.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// st1 { v28.8h}, [x1],x8 - umlsl v20.8h, v9.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v20.8h, v31.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// ld1 { v6.2s},[x12],x9 //vector load pu1_src ld1 { v7.2s},[x19],x9 //vector load pu1_src umlal v20.8h, v13.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// add x19,x4,#8 - ld1 { v8.2s},[x4],x11 //vector load pu1_src - ld1 { v9.2s},[x19],x11 //vector load pu1_src + ld1 { v29.2s},[x4],x11 //vector load pu1_src + ld1 { v31.2s},[x19],x11 //vector load pu1_src umlsl v20.8h, v15.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// @@ -346,7 +349,7 @@ epilog: - umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v22.8h, v29.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// subs x10,x10,#16 //decrement the wd loop umlal v22.8h, v12.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// // add x20,x12,x2,lsl #1 @@ -365,7 +368,7 @@ epilog: ld1 { v0.2s},[x12],x11 //vector load pu1_src ld1 { v1.2s},[x19],x11 //vector load pu1_src - umlsl v20.8h, v9.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v20.8h, v31.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// ld1 { v2.2s},[x12],x11 //vector load pu1_src ld1 { v3.2s},[x19],x11 //vector load pu1_src @@ -381,8 +384,8 @@ epilog: umull v30.8h, v2.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// add x19,x4,#8 - ld1 { v8.2s},[x4],x11 //vector load pu1_src - ld1 { v9.2s},[x19],x11 //vector load pu1_src + ld1 { v29.2s},[x4],x11 //vector load pu1_src + ld1 { v31.2s},[x19],x11 //vector load pu1_src umlsl v30.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// ld1 { v10.2s},[x4],x11 //vector load pu1_src @@ -410,13 +413,13 @@ epilog: epilog_end: umull v22.8h, v10.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// - umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v22.8h, v29.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// umlal v22.8h, v12.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// umlsl v22.8h, v14.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// umull v20.8h, v11.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// - umlsl v20.8h, v9.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v20.8h, v31.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// umlal v20.8h, v13.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// umlsl v20.8h, v15.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// @@ -463,12 +466,12 @@ inner_loop_8: //vext.u8 d2,d0,d1,#2 //vector extract of src[0_2] - umull v8.8h, v1.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// - umlsl v8.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umull v29.8h, v1.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// + umlsl v29.8h, 
v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// //vext.u8 d4,d0,d1,#4 //vector extract of src[0_4] //vext.u8 d6,d0,d1,#6 //vector extract of src[0_6] - umlal v8.8h, v2.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// - umlsl v8.8h, v3.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// + umlal v29.8h, v2.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// + umlsl v29.8h, v3.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// //ld1 {v12.2s, v13.2s},[x4],x11 //vector load pu1_src + src_strd ld1 {v4.2s},[x4],x11 //vector load pu1_src @@ -483,7 +486,7 @@ inner_loop_8: umlal v10.8h, v6.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// umlsl v10.8h, v7.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// - st1 {v8.8h}, [x1],#16 + st1 {v29.8h}, [x1],#16 subs x10,x10,#8 //decrement the wd loop st1 {v10.8h},[x6],#16 //store the result pu1_dst @@ -530,16 +533,16 @@ inner_loop_ht_4: ld1 {v7.2s},[x12],x0 //(2)vector load pu1_src ld1 {v14.2s},[x12],x11 //(3)vector load pu1_src - umull v8.8h, v1.8b, v25.8b //(1)mul_res = vmull_u8(src[0_3], coeffabs_3)// + umull v29.8h, v1.8b, v25.8b //(1)mul_res = vmull_u8(src[0_3], coeffabs_3)// ld1 {v15.2s},[x12],x11 //(3)vector load pu1_src - umlsl v8.8h, v0.8b, v24.8b //(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v29.8h, v0.8b, v24.8b //(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)// ld1 {v16.2s},[x12],x11 //(3)vector load pu1_src - umlal v8.8h, v2.8b, v26.8b //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)// + umlal v29.8h, v2.8b, v26.8b //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)// ld1 {v17.2s},[x12],x0 //(3)vector load pu1_src - umlsl v8.8h, v3.8b, v27.8b //(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)// + umlsl v29.8h, v3.8b, v27.8b //(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)// ld1 {v18.2s},[x12],x11 //(4)vector load pu1_src umull v10.8h, v5.8b, v25.8b //(2)mul_res = vmull_u8(src[0_3], coeffabs_3)// @@ -559,7 +562,7 @@ inner_loop_ht_4: beq epilogue core_loop: - st1 {v8.8h},[x4],x8 //(1)store the result pu1_dst + st1 {v29.8h},[x4],x8 //(1)store the result pu1_dst mov x12,x9 ld1 {v0.2s},[x12],x11 //(1_1)vector load pu1_src @@ -593,16 +596,16 @@ core_loop: add x1,x1,#16 //(core loop) ld1 {v14.2s},[x12],x11 //(3_1)vector load pu1_src - umull v8.8h, v1.8b, v25.8b //(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)// + umull v29.8h, v1.8b, v25.8b //(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)// ld1 {v15.2s},[x12],x11 //(3_1)vector load pu1_src - umlsl v8.8h, v0.8b, v24.8b //(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)// + umlsl v29.8h, v0.8b, v24.8b //(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)// ld1 {v16.2s},[x12],x11 //(3_1)vector load pu1_src - umlal v8.8h, v2.8b, v26.8b //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)// + umlal v29.8h, v2.8b, v26.8b //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)// ld1 {v17.2s},[x12],x0 //(3_1)vector load pu1_src - umlsl v8.8h, v3.8b, v27.8b //(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)// + umlsl v29.8h, v3.8b, v27.8b //(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)// st1 {v22.8h}, [x4], x8 //(4)store the result pu1_dst subs x10,x10,#8 //(core loop) @@ -634,7 +637,7 @@ epilogue: umlsl v12.8h, v17.8b, v27.8b //(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)// - st1 {v8.8h},[x4], x8 //(1)store the result pu1_dst + st1 {v29.8h},[x4], x8 //(1)store the result pu1_dst umull v22.8h, v19.8b, v25.8b //(4)mul_res = vmull_u8(src[0_3], coeffabs_3)// umlsl v22.8h, v18.8b, v24.8b //(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)// @@ -720,15 +723,15 @@ inner_loop_4: zip2 v7.2s, v23.2s, v19.2s //**** addn 
ends - umull v8.8h, v1.8b, v25.8b //arithmetic operations for ii iteration in the same time - umlsl v8.8h, v0.8b, v24.8b - umlal v8.8h, v2.8b, v26.8b - umlsl v8.8h, v3.8b, v27.8b + umull v29.8h, v1.8b, v25.8b //arithmetic operations for ii iteration in the same time + umlsl v29.8h, v0.8b, v24.8b + umlal v29.8h, v2.8b, v26.8b + umlsl v29.8h, v3.8b, v27.8b - st1 {v8.d}[0],[x1],#8 //store the i iteration result which is in upper part of the register + st1 {v29.d}[0],[x1],#8 //store the i iteration result which is in upper part of the register subs x10,x10,#4 //decrement the wd by 4 - st1 {v8.d}[1],[x6],#8 //store the ii iteration result which is in lower part of the register + st1 {v29.d}[1],[x6],#8 //store the ii iteration result which is in lower part of the register bgt inner_loop_4 @@ -763,12 +766,12 @@ loop_residue: //vext.u8 d6,d0,d1,#6 //vector extract of src[0_6] //umlal v8.8h, v4.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// //umlsl v8.8h, v6.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// - umull v8.8h, v21.8b, v25.8b - umlsl v8.8h, v20.8b, v24.8b - umlal v8.8h, v22.8b, v26.8b - umlsl v8.8h, v23.8b, v27.8b + umull v29.8h, v21.8b, v25.8b + umlsl v29.8h, v20.8b, v24.8b + umlal v29.8h, v22.8b, v26.8b + umlsl v29.8h, v23.8b, v27.8b - st1 {v8.1d},[x1] //store the result pu1_dst + st1 {v29.1d},[x1] //store the result pu1_dst subs x10,x10,#4 //decrement the wd loop add x1,x1,#8 //pi2_dst + 8 @@ -788,7 +791,9 @@ end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 ret diff --git a/common/arm64/ihevc_inter_pred_chroma_vert.s b/common/arm64/ihevc_inter_pred_chroma_vert.s index 2de789f..3d61f6c 100644 --- a/common/arm64/ihevc_inter_pred_chroma_vert.s +++ b/common/arm64/ihevc_inter_pred_chroma_vert.s @@ -104,7 +104,7 @@ ihevc_inter_pred_chroma_vert_av8: // stmfd sp!,{x4-x12,x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! 
mov x15,x4 // pi1_coeff @@ -142,21 +142,21 @@ ihevc_inter_pred_chroma_vert_av8: inner_loop_ht_2: //called when wd is multiple of 4 and ht is 4,2 add x6,x0,x2 //pu1_src +src_strd - ld1 {v9.8b},[x6],x2 //loads pu1_src + ld1 {v17.8b},[x6],x2 //loads pu1_src subs x5,x5,#8 //2wd - 8 ld1 {v5.8b},[x0],#8 //loads src - umull v6.8h, v9.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1) + umull v6.8h, v17.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1) ld1 {v4.8b},[x6],x2 //loads incremented src umlsl v6.8h, v5.8b, v0.8b //vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0) - ld1 {v8.8b},[x6],x2 //loads incremented src + ld1 {v16.8b},[x6],x2 //loads incremented src umlal v6.8h, v4.8b, v2.8b //vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2) umull v4.8h, v4.8b, v1.8b - umlsl v6.8h, v8.8b, v3.8b - umlsl v4.8h, v9.8b, v0.8b - ld1 {v10.8b},[x6] //loads the incremented src - umlal v4.8h, v8.8b, v2.8b + umlsl v6.8h, v16.8b, v3.8b + umlsl v4.8h, v17.8b, v0.8b + ld1 {v18.8b},[x6] //loads the incremented src + umlal v4.8h, v16.8b, v2.8b sqrshrun v6.8b, v6.8h,#6 //shifts right - umlsl v4.8h, v10.8b, v3.8b + umlsl v4.8h, v18.8b, v3.8b add x6,x1,x3 //pu1_dst + dst_strd sqrshrun v4.8b, v4.8h,#6 //shifts right st1 {v6.8b},[x1],#8 //stores the loaded value @@ -240,7 +240,7 @@ prolog: add x7,x1,x3 //pu1_dst umlal v30.8h, v6.8b, v2.8b umlsl v30.8h, v7.8b, v3.8b - ld1 {v8.8b},[x6],x2 //load and increment + ld1 {v16.8b},[x6],x2 //load and increment umull v28.8h, v6.8b, v1.8b //mul_res 2 add x20,x0,x9 //pu1_dst += 4*dst_strd - 2*wd @@ -249,30 +249,30 @@ prolog: bic x20,x10,#7 //x5 ->wd csel x5, x20, x5,le umlal v28.8h, v7.8b, v2.8b - ld1 {v9.8b},[x6],x2 - umlsl v28.8h, v8.8b, v3.8b + ld1 {v17.8b},[x6],x2 + umlsl v28.8h, v16.8b, v3.8b sqrshrun v30.8b, v30.8h,#6 - ld1 {v10.8b},[x6],x2 + ld1 {v18.8b},[x6],x2 umull v26.8h, v7.8b, v1.8b add x6,x0,x2 //pu1_src + src_strd umlsl v26.8h, v6.8b, v0.8b st1 {v30.8b},[x1],#8 //stores the loaded value - umlal v26.8h, v8.8b, v2.8b + umlal v26.8h, v16.8b, v2.8b ld1 {v4.8b},[x0],#8 //loads the source - umlsl v26.8h, v9.8b, v3.8b + umlsl v26.8h, v17.8b, v3.8b sqrshrun v28.8b, v28.8h,#6 add x20,x1,x8 //pu1_src += 4*src_strd - 2*wd csel x1, x20, x1,le - umull v24.8h, v8.8b, v1.8b + umull v24.8h, v16.8b, v1.8b ld1 {v5.8b},[x6],x2 //loads pu1_src umlsl v24.8h, v7.8b, v0.8b subs x12,x12,#4 ld1 {v6.8b},[x6],x2 //load and increment - umlal v24.8h, v9.8b, v2.8b + umlal v24.8h, v17.8b, v2.8b ld1 {v7.8b},[x6],x2 //load and increment - umlsl v24.8h, v10.8b, v3.8b + umlsl v24.8h, v18.8b, v3.8b lsl x11,x2,#2 st1 {v28.8b},[x7],x3 //stores the loaded value @@ -299,7 +299,7 @@ kernel_8: st1 {v26.8b},[x7],x3 //stores the loaded value sqrshrun v24.8b, v24.8h,#6 - ld1 {v8.8b},[x6],x2 //load and increment + ld1 {v16.8b},[x6],x2 //load and increment umull v28.8h, v6.8b, v1.8b //mul_res 2 bic x20,x10,#7 //x5 ->wd @@ -309,11 +309,11 @@ kernel_8: umlal v28.8h, v7.8b, v2.8b - ld1 {v9.8b},[x6],x2 + ld1 {v17.8b},[x6],x2 sqrshrun v30.8b, v30.8h,#6 - umlsl v28.8h, v8.8b, v3.8b - ld1 {v10.8b},[x6],x2 + umlsl v28.8h, v16.8b, v3.8b + ld1 {v18.8b},[x6],x2 add x7,x1,x3 //pu1_dst umull v26.8h, v7.8b, v1.8b add x6,x0,x2 //pu1_src + src_strd @@ -325,16 +325,16 @@ kernel_8: umlsl v26.8h, v6.8b, v0.8b ld1 {v4.8b},[x0],#8 //loads the source - umlal v26.8h, v8.8b, v2.8b + umlal v26.8h, v16.8b, v2.8b st1 {v30.8b},[x1],#8 //stores the loaded value - umlsl v26.8h, v9.8b, v3.8b + umlsl v26.8h, v17.8b, v3.8b ld1 {v5.8b},[x6],x2 //loads pu1_src add x11,x11,x2 sqrshrun v28.8b, 
v28.8h,#6 - umull v24.8h, v8.8b, v1.8b + umull v24.8h, v16.8b, v1.8b ld1 {v6.8b},[x6],x2 //load and increment add x20,x1,x8 //pu1_src += 4*src_strd - 2*wd csel x1, x20, x1,le @@ -348,10 +348,10 @@ kernel_8: umlsl v24.8h, v7.8b, v0.8b subs x12,x12,#4 - umlal v24.8h, v9.8b, v2.8b + umlal v24.8h, v17.8b, v2.8b ld1 {v7.8b},[x6],x2 //load and increment - umlsl v24.8h, v10.8b, v3.8b + umlsl v24.8h, v18.8b, v3.8b st1 {v28.8b},[x7],x3 //stores the loaded value sqrshrun v26.8b, v26.8h,#6 @@ -366,39 +366,39 @@ epilog: st1 {v26.8b},[x7],x3 //stores the loaded value sqrshrun v24.8b, v24.8h,#6 - ld1 {v8.8b},[x6],x2 //load and increment + ld1 {v16.8b},[x6],x2 //load and increment umull v28.8h, v6.8b, v1.8b //mul_res 2 umlsl v28.8h, v5.8b, v0.8b umlal v28.8h, v7.8b, v2.8b - umlsl v28.8h, v8.8b, v3.8b + umlsl v28.8h, v16.8b, v3.8b st1 {v24.8b},[x7],x3 //stores the loaded value sqrshrun v30.8b, v30.8h,#6 - ld1 {v9.8b},[x6],x2 + ld1 {v17.8b},[x6],x2 umull v26.8h, v7.8b, v1.8b add x7,x1,x3 //pu1_dst umlsl v26.8h, v6.8b, v0.8b st1 {v30.8b},[x1],#8 //stores the loaded value sqrshrun v28.8b, v28.8h,#6 - umlal v26.8h, v8.8b, v2.8b - ld1 {v10.8b},[x6],x2 - umlsl v26.8h, v9.8b, v3.8b + umlal v26.8h, v16.8b, v2.8b + ld1 {v18.8b},[x6],x2 + umlsl v26.8h, v17.8b, v3.8b - umull v24.8h, v8.8b, v1.8b + umull v24.8h, v16.8b, v1.8b sqrshrun v26.8b, v26.8h,#6 st1 {v28.8b},[x7],x3 //stores the loaded value umlsl v24.8h, v7.8b, v0.8b - umlal v24.8h, v9.8b, v2.8b + umlal v24.8h, v17.8b, v2.8b st1 {v26.8b},[x7],x3 //stores the loaded value - umlsl v24.8h, v10.8b, v3.8b + umlsl v24.8h, v18.8b, v3.8b sqrshrun v24.8b, v24.8h,#6 st1 {v24.8b},[x7],x3 //stores the loaded value end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s index 55e7f54..e8f17cc 100644 --- a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s +++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s @@ -104,7 +104,7 @@ ihevc_inter_pred_chroma_vert_w16inp_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! 
mov x15,x4 // pi1_coeff @@ -120,10 +120,10 @@ ihevc_inter_pred_chroma_vert_w16inp_av8: sxtl v0.8h, v0.8b //long the value tst x6,#3 //checks wd == 2 - dup v12.4h, v0.4h[0] //coeff_0 - dup v13.4h, v0.4h[1] //coeff_1 - dup v14.4h, v0.4h[2] //coeff_2 - dup v15.4h, v0.4h[3] //coeff_3 + dup v16.4h, v0.4h[0] //coeff_0 + dup v17.4h, v0.4h[1] //coeff_1 + dup v18.4h, v0.4h[2] //coeff_2 + dup v19.4h, v0.4h[3] //coeff_3 bgt core_loop_ht_2 //jumps to loop handles wd 2 @@ -141,22 +141,22 @@ core_loop_ht_2: inner_loop_ht_2: add x0,x4,x2 //increments pi2_src ld1 {v0.4h},[x4],#8 //loads pu1_src - smull v0.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0) + smull v0.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0) subs x12,x12,#8 //2wd + 8 ld1 {v2.4h},[x0],x2 //loads pi2_src - smull v8.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v7.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) ld1 {v3.4h},[x0],x2 //loads pi2_src - smlal v0.4s, v2.4h, v13.4h + smlal v0.4s, v2.4h, v17.4h ld1 {v6.4h},[x0],x2 - smlal v8.4s, v3.4h, v13.4h + smlal v7.4s, v3.4h, v17.4h ld1 {v2.4h},[x0] add x7,x1,x3 //pu1_dst + dst_strd - smlal v0.4s, v3.4h, v14.4h - smlal v8.4s, v6.4h, v14.4h - smlal v0.4s, v6.4h, v15.4h - smlal v8.4s, v2.4h, v15.4h + smlal v0.4s, v3.4h, v18.4h + smlal v7.4s, v6.4h, v18.4h + smlal v0.4s, v6.4h, v19.4h + smlal v7.4s, v2.4h, v19.4h sqshrn v0.4h, v0.4s,#6 //right shift - sqshrn v30.4h, v8.4s,#6 //right shift + sqshrn v30.4h, v7.4s,#6 //right shift sqrshrun v0.8b, v0.8h,#6 //rounding shift sqrshrun v30.8b, v30.8h,#6 //rounding shift st1 {v0.s}[0],[x1],#4 //stores the loaded value @@ -189,45 +189,45 @@ prolog: ld1 {v1.4h},[x0],x2 //loads pi2_src subs x11,x11,#4 ld1 {v2.4h},[x0],x2 //loads pi2_src - smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0) + smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0) ld1 {v3.4h},[x0],x2 - smlal v30.4s, v1.4h, v13.4h - smlal v30.4s, v2.4h, v14.4h + smlal v30.4s, v1.4h, v17.4h + smlal v30.4s, v2.4h, v18.4h add x9,x1,x3 //pu1_dst + dst_strd - smlal v30.4s, v3.4h, v15.4h + smlal v30.4s, v3.4h, v19.4h ld1 {v4.4h},[x0],x2 - smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) add x20,x4,x8 csel x4, x20, x4,le - smlal v28.4s, v2.4h, v13.4h + smlal v28.4s, v2.4h, v17.4h ld1 {v5.4h},[x0],x2 - smlal v28.4s, v3.4h, v14.4h + smlal v28.4s, v3.4h, v18.4h ld1 {v6.4h},[x0],x2 - smlal v28.4s, v4.4h, v15.4h + smlal v28.4s, v4.4h, v19.4h lsl x20,x6,#1 csel x11, x20, x11,le sqshrn v30.4h, v30.4s,#6 //right shift - smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) add x0,x4,x2 - smlal v26.4s, v3.4h, v13.4h - smlal v26.4s, v4.4h, v14.4h + smlal v26.4s, v3.4h, v17.4h + smlal v26.4s, v4.4h, v18.4h ld1 {v0.4h},[x4],#8 //loads pu1_src - smlal v26.4s, v5.4h, v15.4h + smlal v26.4s, v5.4h, v19.4h sqrshrun v30.8b, v30.8h,#6 //rounding shift sqshrn v28.4h, v28.4s,#6 //right shift ld1 {v1.4h},[x0],x2 //loads pi2_src - smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) st1 {v30.s}[0],[x1],#4 //stores the loaded value - smlal v24.4s, v4.4h, v13.4h + smlal v24.4s, v4.4h, v17.4h ld1 {v2.4h},[x0],x2 //loads pi2_src - smlal v24.4s, v5.4h, v14.4h + smlal v24.4s, v5.4h, v18.4h ld1 {v3.4h},[x0],x2 - smlal v24.4s, v6.4h, v15.4h + smlal v24.4s, v6.4h, v19.4h add x20,x1,x14 csel x1, x20, x1,le @@ -238,21 +238,21 @@ prolog: beq epilog //jumps to epilog kernel_4: - smull v30.4s, v0.4h, v12.4h 
//vmull_s16(src_tmp1, coeff_0) + smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0) subs x11,x11,#4 - smlal v30.4s, v1.4h, v13.4h + smlal v30.4s, v1.4h, v17.4h st1 {v28.s}[0],[x9],x3 //stores the loaded value - smlal v30.4s, v2.4h, v14.4h - smlal v30.4s, v3.4h, v15.4h + smlal v30.4s, v2.4h, v18.4h + smlal v30.4s, v3.4h, v19.4h sqshrn v24.4h, v24.4s,#6 //right shift sqrshrun v26.8b, v26.8h,#6 //rounding shift ld1 {v4.4h},[x0],x2 - smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) - smlal v28.4s, v2.4h, v13.4h - smlal v28.4s, v3.4h, v14.4h - smlal v28.4s, v4.4h, v15.4h + smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) + smlal v28.4s, v2.4h, v17.4h + smlal v28.4s, v3.4h, v18.4h + smlal v28.4s, v4.4h, v19.4h st1 {v26.s}[0],[x9],x3 //stores the loaded value add x20,x4,x8 csel x4, x20, x4,le @@ -263,28 +263,28 @@ kernel_4: sqrshrun v24.8b, v24.8h,#6 //rounding shift ld1 {v5.4h},[x0],x2 - smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) ld1 {v6.4h},[x0],x2 - smlal v26.4s, v3.4h, v13.4h + smlal v26.4s, v3.4h, v17.4h st1 {v24.s}[0],[x9] //stores the loaded value add x0,x4,x2 - smlal v26.4s, v4.4h, v14.4h + smlal v26.4s, v4.4h, v18.4h ld1 {v0.4h},[x4],#8 //loads pu1_src - smlal v26.4s, v5.4h, v15.4h + smlal v26.4s, v5.4h, v19.4h sqshrn v28.4h, v28.4s,#6 //right shift sqrshrun v30.8b, v30.8h,#6 //rounding shift ld1 {v1.4h},[x0],x2 //loads pi2_src - smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) add x9,x1,x3 //pu1_dst + dst_strd ld1 {v2.4h},[x0],x2 //loads pi2_src - smlal v24.4s, v4.4h, v13.4h + smlal v24.4s, v4.4h, v17.4h ld1 {v3.4h},[x0],x2 - smlal v24.4s, v5.4h, v14.4h + smlal v24.4s, v5.4h, v18.4h st1 {v30.s}[0],[x1],#4 //stores the loaded value - smlal v24.4s, v6.4h, v15.4h + smlal v24.4s, v6.4h, v19.4h sqshrn v26.4h, v26.4s,#6 //right shift sqrshrun v28.8b, v28.8h,#6 //rounding shift @@ -296,41 +296,41 @@ kernel_4: bgt kernel_4 //jumps to kernel_4 epilog: - smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0) + smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0) st1 {v28.s}[0],[x9],x3 //stores the loaded value - smlal v30.4s, v1.4h, v13.4h - smlal v30.4s, v2.4h, v14.4h - smlal v30.4s, v3.4h, v15.4h + smlal v30.4s, v1.4h, v17.4h + smlal v30.4s, v2.4h, v18.4h + smlal v30.4s, v3.4h, v19.4h sqshrn v24.4h, v24.4s,#6 //right shift sqrshrun v26.8b, v26.8h,#6 //rounding shift - smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) ld1 {v4.4h},[x0],x2 - smlal v28.4s, v2.4h, v13.4h + smlal v28.4s, v2.4h, v17.4h st1 {v26.s}[0],[x9],x3 //stores the loaded value - smlal v28.4s, v3.4h, v14.4h - smlal v28.4s, v4.4h, v15.4h + smlal v28.4s, v3.4h, v18.4h + smlal v28.4s, v4.4h, v19.4h sqshrn v30.4h, v30.4s,#6 //right shift sqrshrun v24.8b, v24.8h,#6 //rounding shift - smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) ld1 {v5.4h},[x0],x2 - smlal v26.4s, v3.4h, v13.4h - smlal v26.4s, v4.4h, v14.4h - smlal v26.4s, v5.4h, v15.4h + smlal v26.4s, v3.4h, v17.4h + smlal v26.4s, v4.4h, v18.4h + smlal v26.4s, v5.4h, v19.4h sqshrn v28.4h, v28.4s,#6 //right shift sqrshrun v30.8b, v30.8h,#6 //rounding shift st1 {v24.s}[0],[x9] //stores the loaded value - smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) - smlal v24.4s, v4.4h, v13.4h + smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) + smlal 
v24.4s, v4.4h, v17.4h add x9,x1,x3 //pu1_dst + dst_strd ld1 {v6.4h},[x0],x2 - smlal v24.4s, v5.4h, v14.4h - smlal v24.4s, v6.4h, v15.4h + smlal v24.4s, v5.4h, v18.4h + smlal v24.4s, v6.4h, v19.4h st1 {v30.s}[0],[x1],#4 //stores the loaded value sqrshrun v28.8b, v28.8h,#6 //rounding shift @@ -348,7 +348,7 @@ epilog: end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s index b6d0eb2..5aaabe6 100644 --- a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s +++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s @@ -104,7 +104,7 @@ ihevc_inter_pred_chroma_vert_w16inp_w16out_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! mov x15,x4 // pi1_coeff @@ -120,10 +120,10 @@ ihevc_inter_pred_chroma_vert_w16inp_w16out_av8: sxtl v0.8h, v0.8b //long the value tst x6,#3 //checks wd == 2 - dup v12.4h, v0.4h[0] //coeff_0 - dup v13.4h, v0.4h[1] //coeff_1 - dup v14.4h, v0.4h[2] //coeff_2 - dup v15.4h, v0.4h[3] //coeff_3 + dup v16.4h, v0.4h[0] //coeff_0 + dup v17.4h, v0.4h[1] //coeff_1 + dup v18.4h, v0.4h[2] //coeff_2 + dup v19.4h, v0.4h[3] //coeff_3 bgt core_loop_ht_2 //jumps to loop handles wd 2 @@ -141,22 +141,22 @@ core_loop_ht_2: inner_loop_ht_2: add x0,x4,x2 //increments pi2_src ld1 {v0.4h},[x4],#8 //loads pu1_src - smull v0.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0) + smull v0.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0) subs x12,x12,#8 //2wd + 8 ld1 {v2.4h},[x0],x2 //loads pi2_src - smull v8.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v7.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) ld1 {v3.4h},[x0],x2 //loads pi2_src - smlal v0.4s, v2.4h, v13.4h + smlal v0.4s, v2.4h, v17.4h ld1 {v6.4h},[x0],x2 - smlal v8.4s, v3.4h, v13.4h + smlal v7.4s, v3.4h, v17.4h ld1 {v2.4h},[x0] add x7,x1,x3 //pu1_dst + dst_strd - smlal v0.4s, v3.4h, v14.4h - smlal v8.4s, v6.4h, v14.4h - smlal v0.4s, v6.4h, v15.4h - smlal v8.4s, v2.4h, v15.4h + smlal v0.4s, v3.4h, v18.4h + smlal v7.4s, v6.4h, v18.4h + smlal v0.4s, v6.4h, v19.4h + smlal v7.4s, v2.4h, v19.4h sqshrn v0.4h, v0.4s,#6 //right shift - sqshrn v30.4h, v8.4s,#6 //right shift + sqshrn v30.4h, v7.4s,#6 //right shift st1 {v0.2s},[x1],#8 //stores the loaded value st1 {v30.2s},[x7] //stores the loaded value bgt inner_loop_ht_2 //inner loop -again @@ -188,44 +188,44 @@ prolog: ld1 {v1.4h},[x0],x2 //loads pi2_src subs x11,x11,#4 ld1 {v2.4h},[x0],x2 //loads pi2_src - smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0) + smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0) ld1 {v3.4h},[x0],x2 - smlal v30.4s, v1.4h, v13.4h - smlal v30.4s, v2.4h, v14.4h + smlal v30.4s, v1.4h, v17.4h + smlal v30.4s, v2.4h, v18.4h add x9,x1,x3 //pu1_dst + dst_strd - smlal v30.4s, v3.4h, v15.4h + smlal v30.4s, v3.4h, v19.4h ld1 {v4.4h},[x0],x2 - smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) add x20,x4,x8 csel x4, x20, x4,le lsl x20,x6,#1 csel x11, x20, x11,le - smlal v28.4s, v2.4h, v13.4h - smlal v28.4s, v3.4h, v14.4h + smlal v28.4s, v2.4h, v17.4h + smlal v28.4s, v3.4h, v18.4h ld1 {v5.4h},[x0],x2 - smlal v28.4s, v4.4h, v15.4h + smlal v28.4s, v4.4h, v19.4h sqshrn v30.4h, v30.4s,#6 //right shift ld1 {v6.4h},[x0],x2 - smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) - smlal v26.4s, v3.4h, v13.4h - smlal v26.4s, v4.4h, 
v14.4h + smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) + smlal v26.4s, v3.4h, v17.4h + smlal v26.4s, v4.4h, v18.4h add x0,x4,x2 ld1 {v0.4h},[x4],#8 //loads pu1_src - smlal v26.4s, v5.4h, v15.4h + smlal v26.4s, v5.4h, v19.4h sqshrn v28.4h, v28.4s,#6 //right shift ld1 {v1.4h},[x0],x2 //loads pi2_src - smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) st1 {v30.2s},[x1],#8 //stores the loaded value - smlal v24.4s, v4.4h, v13.4h + smlal v24.4s, v4.4h, v17.4h ld1 {v2.4h},[x0],x2 //loads pi2_src - smlal v24.4s, v5.4h, v14.4h + smlal v24.4s, v5.4h, v18.4h ld1 {v3.4h},[x0],x2 - smlal v24.4s, v6.4h, v15.4h + smlal v24.4s, v6.4h, v19.4h add x20,x1,x14,lsl #1 csel x1, x20, x1,le @@ -235,20 +235,20 @@ prolog: beq epilog //jumps to epilog kernel_4: - smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0) + smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0) subs x11,x11,#4 - smlal v30.4s, v1.4h, v13.4h + smlal v30.4s, v1.4h, v17.4h st1 {v28.2s},[x9],x3 //stores the loaded value - smlal v30.4s, v2.4h, v14.4h - smlal v30.4s, v3.4h, v15.4h + smlal v30.4s, v2.4h, v18.4h + smlal v30.4s, v3.4h, v19.4h sqshrn v24.4h, v24.4s,#6 //right shift ld1 {v4.4h},[x0],x2 - smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) - smlal v28.4s, v2.4h, v13.4h - smlal v28.4s, v3.4h, v14.4h - smlal v28.4s, v4.4h, v15.4h + smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) + smlal v28.4s, v2.4h, v17.4h + smlal v28.4s, v3.4h, v18.4h + smlal v28.4s, v4.4h, v19.4h st1 {v26.2s},[x9],x3 //stores the loaded value add x20,x4,x8 csel x4, x20, x4,le @@ -258,27 +258,27 @@ kernel_4: sqshrn v30.4h, v30.4s,#6 //right shift ld1 {v5.4h},[x0],x2 - smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) ld1 {v6.4h},[x0],x2 - smlal v26.4s, v3.4h, v13.4h + smlal v26.4s, v3.4h, v17.4h st1 {v24.2s},[x9] //stores the loaded value add x0,x4,x2 - smlal v26.4s, v4.4h, v14.4h + smlal v26.4s, v4.4h, v18.4h ld1 {v0.4h},[x4],#8 //loads pu1_src - smlal v26.4s, v5.4h, v15.4h + smlal v26.4s, v5.4h, v19.4h sqshrn v28.4h, v28.4s,#6 //right shift ld1 {v1.4h},[x0],x2 //loads pi2_src - smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) ld1 {v2.4h},[x0],x2 //loads pi2_src - smlal v24.4s, v4.4h, v13.4h + smlal v24.4s, v4.4h, v17.4h add x9,x1,x3 //pu1_dst + dst_strd ld1 {v3.4h},[x0],x2 - smlal v24.4s, v5.4h, v14.4h + smlal v24.4s, v5.4h, v18.4h st1 {v30.2s},[x1],#8 //stores the loaded value - smlal v24.4s, v6.4h, v15.4h + smlal v24.4s, v6.4h, v19.4h sqshrn v26.4h, v26.4s,#6 //right shift add x20,x1,x14,lsl #1 @@ -289,38 +289,38 @@ kernel_4: bgt kernel_4 //jumps to kernel_4 epilog: - smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0) + smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0) st1 {v28.2s},[x9],x3 //stores the loaded value - smlal v30.4s, v1.4h, v13.4h - smlal v30.4s, v2.4h, v14.4h - smlal v30.4s, v3.4h, v15.4h + smlal v30.4s, v1.4h, v17.4h + smlal v30.4s, v2.4h, v18.4h + smlal v30.4s, v3.4h, v19.4h sqshrn v24.4h, v24.4s,#6 //right shift - smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) ld1 {v4.4h},[x0],x2 - smlal v28.4s, v2.4h, v13.4h + smlal v28.4s, v2.4h, v17.4h st1 {v26.2s},[x9],x3 //stores the loaded value - smlal v28.4s, v3.4h, v14.4h - smlal v28.4s, v4.4h, v15.4h + smlal v28.4s, v3.4h, v18.4h + smlal v28.4s, v4.4h, v19.4h sqshrn 
v30.4h, v30.4s,#6 //right shift - smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) + smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) ld1 {v5.4h},[x0],x2 - smlal v26.4s, v3.4h, v13.4h - smlal v26.4s, v4.4h, v14.4h - smlal v26.4s, v5.4h, v15.4h + smlal v26.4s, v3.4h, v17.4h + smlal v26.4s, v4.4h, v18.4h + smlal v26.4s, v5.4h, v19.4h sqshrn v28.4h, v28.4s,#6 //right shift st1 {v24.2s},[x9] //stores the loaded value - smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0) - smlal v24.4s, v4.4h, v13.4h + smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0) + smlal v24.4s, v4.4h, v17.4h add x9,x1,x3 //pu1_dst + dst_strd ld1 {v6.4h},[x0],x2 - smlal v24.4s, v5.4h, v14.4h - smlal v24.4s, v6.4h, v15.4h + smlal v24.4s, v5.4h, v18.4h + smlal v24.4s, v6.4h, v19.4h st1 {v30.2s},[x1],#8 //stores the loaded value sqshrn v26.4h, v26.4s,#6 //right shift @@ -335,7 +335,7 @@ epilog: end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s index 9f5687f..ec946eb 100644 --- a/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s +++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s @@ -105,7 +105,7 @@ ihevc_inter_pred_chroma_vert_w16out_av8: // stmfd sp!,{x4-x12,x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! mov x15,x4 // pi1_coeff @@ -145,20 +145,20 @@ ihevc_inter_pred_chroma_vert_w16out_av8: inner_loop_ht_2: //called when wd is multiple of 4 and ht is 4,2 add x6,x0,x2 //pu1_src +src_strd - ld1 {v9.8b},[x6],x2 //loads pu1_src + ld1 {v17.8b},[x6],x2 //loads pu1_src subs x5,x5,#8 //2wd - 8 ld1 {v5.8b},[x0],#8 //loads src - umull v6.8h, v9.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1) + umull v6.8h, v17.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1) ld1 {v4.8b},[x6],x2 //loads incremented src umlsl v6.8h, v5.8b, v0.8b //vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0) - ld1 {v8.8b},[x6],x2 //loads incremented src + ld1 {v16.8b},[x6],x2 //loads incremented src umlal v6.8h, v4.8b, v2.8b //vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2) umull v4.8h, v4.8b, v1.8b - ld1 {v10.8b},[x6] //loads the incremented src - umlsl v6.8h, v8.8b, v3.8b - umlsl v4.8h, v9.8b, v0.8b - umlal v4.8h, v8.8b, v2.8b - umlsl v4.8h, v10.8b, v3.8b + ld1 {v18.8b},[x6] //loads the incremented src + umlsl v6.8h, v16.8b, v3.8b + umlsl v4.8h, v17.8b, v0.8b + umlal v4.8h, v16.8b, v2.8b + umlsl v4.8h, v18.8b, v3.8b add x6,x1,x3 //pu1_dst + dst_strd st1 { v6.8h},[x1],#16 //stores the loaded value @@ -241,7 +241,7 @@ prolog: add x7,x1,x3 //pu1_dst umlal v30.8h, v6.8b, v2.8b umlsl v30.8h, v7.8b, v3.8b - ld1 {v8.8b},[x6],x2 //load and increment + ld1 {v16.8b},[x6],x2 //load and increment umull v28.8h, v6.8b, v1.8b //mul_res 2 add x20,x0,x9 //pu1_dst += 4*dst_strd - 2*wd @@ -250,28 +250,28 @@ prolog: bic x20,x10,#7 //x5 ->wd csel x5, x20, x5,le umlal v28.8h, v7.8b, v2.8b - ld1 {v9.8b},[x6],x2 - umlsl v28.8h, v8.8b, v3.8b + ld1 {v17.8b},[x6],x2 + umlsl v28.8h, v16.8b, v3.8b - ld1 {v10.8b},[x6],x2 + ld1 {v18.8b},[x6],x2 umull v26.8h, v7.8b, v1.8b add x6,x0,x2 //pu1_src + src_strd umlsl v26.8h, v6.8b, v0.8b st1 { v30.16b},[x1],#16 //stores the loaded value - umlal v26.8h, v8.8b, v2.8b + umlal v26.8h, v16.8b, v2.8b ld1 {v4.8b},[x0],#8 //loads the source - umlsl v26.8h, v9.8b, v3.8b + umlsl v26.8h, v17.8b, v3.8b add x20,x1,x8 //pu1_src += 4*src_strd - 2*wd csel 
x1, x20, x1,le - umull v24.8h, v8.8b, v1.8b + umull v24.8h, v16.8b, v1.8b ld1 {v5.8b},[x6],x2 //loads pu1_src umlsl v24.8h, v7.8b, v0.8b subs x12,x12,#4 ld1 {v6.8b},[x6],x2 //load and increment - umlal v24.8h, v9.8b, v2.8b + umlal v24.8h, v17.8b, v2.8b ld1 {v7.8b},[x6],x2 //load and increment - umlsl v24.8h, v10.8b, v3.8b + umlsl v24.8h, v18.8b, v3.8b sub x20,x2,x2,lsl #3 neg x11, x20 add x14,x2,x2,lsl #1 @@ -296,7 +296,7 @@ kernel_8: umlsl v30.8h, v7.8b, v3.8b st1 { v26.16b},[x7],x3 //stores the loaded value - ld1 {v8.8b},[x6],x2 //load and increment + ld1 {v16.8b},[x6],x2 //load and increment umull v28.8h, v6.8b, v1.8b //mul_res 2 bic x20,x10,#7 //x5 ->wd @@ -305,10 +305,10 @@ kernel_8: st1 { v24.16b},[x7],x3 //stores the loaded value umlal v28.8h, v7.8b, v2.8b - ld1 {v9.8b},[x6],x2 + ld1 {v17.8b},[x6],x2 - umlsl v28.8h, v8.8b, v3.8b - ld1 {v10.8b},[x6],x2 + umlsl v28.8h, v16.8b, v3.8b + ld1 {v18.8b},[x6],x2 add x7,x1,x3 //pu1_dst umull v26.8h, v7.8b, v1.8b add x6,x0,x2 //pu1_src + src_strd @@ -319,13 +319,13 @@ kernel_8: ld1 {v4.8b},[x0],#8 //loads the source add x11,x11,x2 - umlal v26.8h, v8.8b, v2.8b + umlal v26.8h, v16.8b, v2.8b st1 { v30.16b},[x1],#16 //stores the loaded value - umlsl v26.8h, v9.8b, v3.8b + umlsl v26.8h, v17.8b, v3.8b ld1 {v5.8b},[x6],x2 //loads pu1_src - umull v24.8h, v8.8b, v1.8b + umull v24.8h, v16.8b, v1.8b ld1 {v6.8b},[x6],x2 //load and increment add x20,x1,x8 //pu1_src += 4*src_strd - 2*wd csel x1, x20, x1,le @@ -341,10 +341,10 @@ kernel_8: subs x12,x12,#4 - umlal v24.8h, v9.8b, v2.8b + umlal v24.8h, v17.8b, v2.8b ld1 {v7.8b},[x6],x2 //load and increment - umlsl v24.8h, v10.8b, v3.8b + umlsl v24.8h, v18.8b, v3.8b st1 { v28.16b},[x7],x3 //stores the loaded value bgt kernel_8 //jumps to kernel_8 @@ -357,35 +357,35 @@ epilog: umlsl v30.8h, v7.8b, v3.8b st1 { v26.16b},[x7],x3 //stores the loaded value - ld1 {v8.8b},[x6],x2 //load and increment + ld1 {v16.8b},[x6],x2 //load and increment umull v28.8h, v6.8b, v1.8b //mul_res 2 umlsl v28.8h, v5.8b, v0.8b umlal v28.8h, v7.8b, v2.8b - umlsl v28.8h, v8.8b, v3.8b + umlsl v28.8h, v16.8b, v3.8b st1 { v24.16b},[x7],x3 //stores the loaded value - ld1 {v9.8b},[x6],x2 + ld1 {v17.8b},[x6],x2 umull v26.8h, v7.8b, v1.8b add x7,x1,x3 //pu1_dst umlsl v26.8h, v6.8b, v0.8b st1 { v30.16b},[x1],#16 //stores the loaded value - umlal v26.8h, v8.8b, v2.8b - ld1 {v10.8b},[x6],x2 - umlsl v26.8h, v9.8b, v3.8b + umlal v26.8h, v16.8b, v2.8b + ld1 {v18.8b},[x6],x2 + umlsl v26.8h, v17.8b, v3.8b - umull v24.8h, v8.8b, v1.8b + umull v24.8h, v16.8b, v1.8b st1 { v28.16b},[x7],x3 //stores the loaded value umlsl v24.8h, v7.8b, v0.8b - umlal v24.8h, v9.8b, v2.8b + umlal v24.8h, v17.8b, v2.8b st1 { v26.16b},[x7],x3 //stores the loaded value - umlsl v24.8h, v10.8b, v3.8b + umlsl v24.8h, v18.8b, v3.8b st1 { v24.16b},[x7],x3 //stores the loaded value end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_inter_pred_filters_luma_vert.s b/common/arm64/ihevc_inter_pred_filters_luma_vert.s index 48dc30f..bd8b3c4 100644 --- a/common/arm64/ihevc_inter_pred_filters_luma_vert.s +++ b/common/arm64/ihevc_inter_pred_filters_luma_vert.s @@ -115,7 +115,7 @@ ihevc_inter_pred_luma_vert_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! 
mov x15,x4 // pi1_coeff @@ -161,87 +161,87 @@ prolog: ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)// subs x4,x4,#8 ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// + umull v19.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// + umlsl v19.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// + umlsl v19.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// + umlal v19.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// + umlal v19.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// + umlsl v19.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// + umlal v19.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// + umlsl v19.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// + umull v20.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// add x20,x0,x8 csel x0, x20, x0,le - umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// + umlsl v20.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// bic x20,x5,#7 //x5 ->wd csel x4, x20, x4,le - umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// + umlsl v20.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// prfm PLDL1KEEP,[x3] - umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// + umlal v20.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// add x20,x3, x2 prfm PLDL1KEEP,[x20] - umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// + umlal v20.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// add x20,x3, x2, lsl #1 prfm PLDL1KEEP,[x20] - umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// + umlsl v20.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// add x3, x3, x2 - umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// + umlal v20.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// add x20,x3, x2, lsl #1 prfm PLDL1KEEP,[x20] - umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// + umlsl v20.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, 
src_tmp1, coeffabs_7)// add x3,x0,x2 //pu1_src_tmp += src_strd// - sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + sqrshrun v19.8b, v19.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// ld1 {v1.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umull v12.8h, v3.8b, v23.8b + umull v21.8h, v3.8b, v23.8b ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlsl v12.8h, v2.8b, v22.8b + umlsl v21.8h, v2.8b, v22.8b ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlsl v12.8h, v4.8b, v24.8b - umlal v12.8h, v5.8b, v25.8b - umlal v12.8h, v6.8b, v26.8b - umlsl v12.8h, v7.8b, v27.8b - umlal v12.8h, v16.8b, v28.8b - umlsl v12.8h, v17.8b, v29.8b + umlsl v21.8h, v4.8b, v24.8b + umlal v21.8h, v5.8b, v25.8b + umlal v21.8h, v6.8b, v26.8b + umlsl v21.8h, v7.8b, v27.8b + umlal v21.8h, v16.8b, v28.8b + umlsl v21.8h, v17.8b, v29.8b add x14,x1,x6 - st1 {v8.8b},[x1],#8 //vst1_u8(pu1_dst,sto_res)// - sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + st1 {v19.8b},[x1],#8 //vst1_u8(pu1_dst,sto_res)// + sqrshrun v20.8b, v20.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// add x20,x1,x9 csel x1, x20, x1,le - umull v14.8h, v4.8b, v23.8b + umull v30.8h, v4.8b, v23.8b subs x7,x7,#4 - umlsl v14.8h, v3.8b, v22.8b - umlsl v14.8h, v5.8b, v24.8b - umlal v14.8h, v6.8b, v25.8b + umlsl v30.8h, v3.8b, v22.8b + umlsl v30.8h, v5.8b, v24.8b + umlal v30.8h, v6.8b, v25.8b ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - umlal v14.8h, v7.8b, v26.8b + umlal v30.8h, v7.8b, v26.8b ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlsl v14.8h, v16.8b, v27.8b + umlsl v30.8h, v16.8b, v27.8b ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlal v14.8h, v17.8b, v28.8b + umlal v30.8h, v17.8b, v28.8b ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlsl v14.8h, v18.8b, v29.8b + umlsl v30.8h, v18.8b, v29.8b ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - st1 {v10.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// - sqrshrun v12.8b, v12.8h,#6 + st1 {v20.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// + sqrshrun v21.8b, v21.8h,#6 blt epilog_end //jumps to epilog_end @@ -250,111 +250,111 @@ prolog: kernel_8: subs x4,x4,#8 - umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// + umull v19.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// add x20,x0,x8 csel x0, x20, x0,le - umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// + umlsl v19.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// bic x20,x5,#7 //x5 ->wd csel x4, x20, x4,le - umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// + umlsl v19.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// + umlal v19.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// + umlal v19.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// + umlsl v19.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// - umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// + umlal v19.8h, v6.8b, v28.8b 
//mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// - umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// - st1 {v12.8b},[x14],x6 + umlsl v19.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// + st1 {v21.8b},[x14],x6 // and x11, x0, #31 - sqrshrun v14.8b, v14.8h,#6 + sqrshrun v30.8b, v30.8h,#6 add x3,x0,x2 //pu1_src_tmp += src_strd// - umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// + umull v20.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// + umlsl v20.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// - umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// + umlsl v20.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// ld1 {v1.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// + umlal v20.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// - st1 {v14.8b},[x14],x6 - umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// + st1 {v30.8b},[x14],x6 + umlal v20.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// add x14,x1,#0 - umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// + umlsl v20.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// add x1, x1, #8 - umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// + umlal v20.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// - umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// + umlsl v20.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// add x20,x1,x9 csel x1, x20, x1,le - sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + sqrshrun v19.8b, v19.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// // cmp x11, x10 - umull v12.8h, v3.8b, v23.8b + umull v21.8h, v3.8b, v23.8b add x10, x3, x2, lsl #3 // 10*strd - 8+2 - umlsl v12.8h, v2.8b, v22.8b + umlsl v21.8h, v2.8b, v22.8b add x10, x10, x2 // 11*strd - umlsl v12.8h, v4.8b, v24.8b + umlsl v21.8h, v4.8b, v24.8b ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlal v12.8h, v5.8b, v25.8b + umlal v21.8h, v5.8b, v25.8b - umlal v12.8h, v6.8b, v26.8b - st1 {v8.8b},[x14],x6 //vst1_u8(pu1_dst,sto_res)// + umlal v21.8h, v6.8b, v26.8b + st1 {v19.8b},[x14],x6 //vst1_u8(pu1_dst,sto_res)// prfm PLDL1KEEP,[x10] //11+ 0 - umlsl v12.8h, v7.8b, v27.8b + umlsl v21.8h, v7.8b, v27.8b add x20,x10, x2 prfm PLDL1KEEP,[x20] //11+ 1*strd - umlal v12.8h, v16.8b, v28.8b + umlal v21.8h, v16.8b, v28.8b add x20,x10, x2, lsl #1 prfm PLDL1KEEP,[x20] //11+ 2*strd - umlsl v12.8h, v17.8b, v29.8b + umlsl v21.8h, v17.8b, v29.8b add x10, x10, x2 //12*strd - sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + sqrshrun v20.8b, v20.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// add x20,x10, x2, lsl #1 prfm PLDL1KEEP,[x20] //11+ 3*strd - umull v14.8h, v4.8b, v23.8b + umull v30.8h, v4.8b, v23.8b // mov x10, x11 - umlsl v14.8h, v3.8b, v22.8b + umlsl v30.8h, v3.8b, v22.8b subs x7,x7,#4 - umlsl v14.8h, v5.8b, v24.8b + umlsl v30.8h, v5.8b, v24.8b - umlal v14.8h, v6.8b, v25.8b + umlal v30.8h, v6.8b, v25.8b ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - umlal v14.8h, v7.8b, v26.8b + 
umlal v30.8h, v7.8b, v26.8b ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlsl v14.8h, v16.8b, v27.8b + umlsl v30.8h, v16.8b, v27.8b ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlal v14.8h, v17.8b, v28.8b + umlal v30.8h, v17.8b, v28.8b ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlsl v14.8h, v18.8b, v29.8b + umlsl v30.8h, v18.8b, v29.8b ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - sqrshrun v12.8b, v12.8h,#6 - st1 {v10.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// + sqrshrun v21.8b, v21.8h,#6 + st1 {v20.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// @@ -362,62 +362,62 @@ kernel_8: epilog: - umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// - umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// - umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// - umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// - umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// - umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// - umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// - umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// - st1 {v12.8b},[x14],x6 + umull v19.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// + umlsl v19.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// + umlsl v19.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// + umlal v19.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// + umlal v19.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// + umlsl v19.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// + umlal v19.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// + umlsl v19.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// + st1 {v21.8b},[x14],x6 - sqrshrun v14.8b, v14.8h,#6 + sqrshrun v30.8b, v30.8h,#6 ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// - umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// - umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// - umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// - umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// - umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// - umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// - umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// - st1 {v14.8b},[x14],x6 - - sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + umull v20.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// + umlsl v20.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// + umlsl v20.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// + umlal v20.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// + umlal v20.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// + umlsl v20.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// + umlal v20.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// + umlsl v20.8h, v16.8b, v29.8b 
//mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// + st1 {v30.8b},[x14],x6 + + sqrshrun v19.8b, v19.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umull v12.8h, v3.8b, v23.8b - umlsl v12.8h, v2.8b, v22.8b - umlsl v12.8h, v4.8b, v24.8b - umlal v12.8h, v5.8b, v25.8b - umlal v12.8h, v6.8b, v26.8b - umlsl v12.8h, v7.8b, v27.8b - umlal v12.8h, v16.8b, v28.8b - umlsl v12.8h, v17.8b, v29.8b + umull v21.8h, v3.8b, v23.8b + umlsl v21.8h, v2.8b, v22.8b + umlsl v21.8h, v4.8b, v24.8b + umlal v21.8h, v5.8b, v25.8b + umlal v21.8h, v6.8b, v26.8b + umlsl v21.8h, v7.8b, v27.8b + umlal v21.8h, v16.8b, v28.8b + umlsl v21.8h, v17.8b, v29.8b add x14,x1,x6 - st1 {v8.8b},[x1],#8 //vst1_u8(pu1_dst,sto_res)// - sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + st1 {v19.8b},[x1],#8 //vst1_u8(pu1_dst,sto_res)// + sqrshrun v20.8b, v20.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umull v14.8h, v4.8b, v23.8b - umlsl v14.8h, v3.8b, v22.8b - umlsl v14.8h, v5.8b, v24.8b - umlal v14.8h, v6.8b, v25.8b - umlal v14.8h, v7.8b, v26.8b - umlsl v14.8h, v16.8b, v27.8b - umlal v14.8h, v17.8b, v28.8b - umlsl v14.8h, v18.8b, v29.8b - - st1 {v10.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// - sqrshrun v12.8b, v12.8h,#6 + umull v30.8h, v4.8b, v23.8b + umlsl v30.8h, v3.8b, v22.8b + umlsl v30.8h, v5.8b, v24.8b + umlal v30.8h, v6.8b, v25.8b + umlal v30.8h, v7.8b, v26.8b + umlsl v30.8h, v16.8b, v27.8b + umlal v30.8h, v17.8b, v28.8b + umlsl v30.8h, v18.8b, v29.8b + + st1 {v20.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// + sqrshrun v21.8b, v21.8h,#6 epilog_end: - st1 {v12.8b},[x14],x6 - sqrshrun v14.8b, v14.8h,#6 + st1 {v21.8b},[x14],x6 + sqrshrun v30.8b, v30.8h,#6 - st1 {v14.8b},[x14],x6 + st1 {v30.8b},[x14],x6 end_loops: @@ -427,7 +427,7 @@ end_loops: // ldmeqfd sp!,{x4-x12,x15} //reload the registers from sp bne lbl409 ldp x19, x20,[sp], #16 - pop_v_regs + ret lbl409: mov x5, #4 @@ -465,34 +465,34 @@ inner_loop_wd_4: ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)// umlsl v0.8h, v6.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)// - umull v8.8h, v7.8b, v23.8b + umull v19.8h, v7.8b, v23.8b dup v4.2s, v7.2s[1] //src_tmp1 = vdup_lane_u32(src_tmp4, 1)// umull v2.8h, v7.8b, v25.8b //mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)// ld1 {v4.s}[1],[x3],x2 //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)// - umlsl v8.8h, v6.8b, v22.8b + umlsl v19.8h, v6.8b, v22.8b umlal v0.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)// dup v5.2s, v4.2s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)// - umlsl v8.8h, v4.8b, v24.8b + umlsl v19.8h, v4.8b, v24.8b ld1 {v5.s}[1],[x3],x2 //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)// umlsl v2.8h, v5.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)// dup v6.2s, v5.2s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)// - umlal v8.8h, v5.8b, v25.8b + umlal v19.8h, v5.8b, v25.8b ld1 {v6.s}[1],[x3],x2 //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)// umlal v0.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)// dup v7.2s, v6.2s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)// - umlal v8.8h, v6.8b, v26.8b + umlal v19.8h, v6.8b, v26.8b ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)// umlsl v2.8h, 
v7.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)// dup v4.2s, v7.2s[1] add v0.8h, v0.8h , v2.8h //mul_res1 = vaddq_u16(mul_res1, mul_res2)// - umlsl v8.8h, v7.8b, v27.8b + umlsl v19.8h, v7.8b, v27.8b ld1 {v4.s}[1],[x3],x2 - umlal v8.8h, v4.8b, v28.8b + umlal v19.8h, v4.8b, v28.8b dup v5.2s, v4.2s[1] sqrshrun v0.8b, v0.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// @@ -500,13 +500,13 @@ inner_loop_wd_4: add x3,x1,x6 st1 {v0.s}[0],[x1] //vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)// - umlsl v8.8h, v5.8b, v29.8b + umlsl v19.8h, v5.8b, v29.8b st1 {v0.s}[1],[x3],x6 //vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)// - sqrshrun v8.8b, v8.8h,#6 + sqrshrun v19.8b, v19.8h,#6 - st1 {v8.s}[0],[x3],x6 + st1 {v19.s}[0],[x3],x6 add x1,x1,#4 - st1 {v8.s}[1],[x3] + st1 {v19.s}[1],[x3] bgt inner_loop_wd_4 end_inner_loop_wd_4: @@ -517,6 +517,6 @@ end_inner_loop_wd_4: // ldmfd sp!, {x4-x12, x15} //reload the registers from sp ldp x19, x20,[sp], #16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s index 64a00b2..cd8addf 100644 --- a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s +++ b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s @@ -106,7 +106,7 @@ ihevc_inter_pred_luma_vert_w16inp_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! mov x15,x4 // pi1_coeff @@ -152,70 +152,70 @@ prolog: ld1 {v0.4h},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)// subs x4,x4,#4 ld1 {v2.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - smull v8.4s, v1.4h, v23.4h //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// + smull v19.4s, v1.4h, v23.4h //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// ld1 {v3.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - smlal v8.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)// + smlal v19.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)// ld1 {v4.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - smlal v8.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)// + smlal v19.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)// ld1 {v5.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - smlal v8.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// + smlal v19.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// ld1 {v6.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - smlal v8.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// + smlal v19.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// ld1 {v7.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - smlal v8.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)// - smlal v8.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// - smlal v8.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)// + smlal v19.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)// + smlal v19.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// + smlal v19.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)// ld1 {v16.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - smull v10.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// + smull v20.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// add x20,x0,x8,lsl #0 csel x0, 
x20, x0,le - smlal v10.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)// + smlal v20.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)// csel x4, x5, x4,le //x5 ->wd - smlal v10.4s, v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)// + smlal v20.4s, v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)// ld1 {v17.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - smlal v10.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// + smlal v20.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// ld1 {v18.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - smlal v10.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// + smlal v20.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// add x3,x0,x2 //pu1_src_tmp += src_strd// - smlal v10.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)// - smlal v10.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// - smlal v10.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)// - sqshrn v8.4h, v8.4s,#6 + smlal v20.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)// + smlal v20.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// + smlal v20.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)// + sqshrn v19.4h, v19.4s,#6 ld1 {v1.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - smull v12.4s, v3.4h, v23.4h + smull v21.4s, v3.4h, v23.4h ld1 {v0.4h},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)// - smlal v12.4s, v2.4h, v22.4h + smlal v21.4s, v2.4h, v22.4h ld1 {v2.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - smlal v12.4s, v4.4h, v24.4h - smlal v12.4s, v5.4h, v25.4h - smlal v12.4s, v6.4h, v26.4h - smlal v12.4s, v7.4h, v27.4h - smlal v12.4s, v16.4h, v28.4h - smlal v12.4s, v17.4h, v29.4h + smlal v21.4s, v4.4h, v24.4h + smlal v21.4s, v5.4h, v25.4h + smlal v21.4s, v6.4h, v26.4h + smlal v21.4s, v7.4h, v27.4h + smlal v21.4s, v16.4h, v28.4h + smlal v21.4s, v17.4h, v29.4h add x14,x1,x6 - sqshrn v10.4h, v10.4s,#6 - sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + sqshrn v20.4h, v20.4s,#6 + sqrshrun v19.8b, v19.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// - smull v14.4s, v4.4h, v23.4h - smlal v14.4s, v3.4h, v22.4h - smlal v14.4s, v5.4h, v24.4h - smlal v14.4s, v6.4h, v25.4h + smull v30.4s, v4.4h, v23.4h + smlal v30.4s, v3.4h, v22.4h + smlal v30.4s, v5.4h, v24.4h + smlal v30.4s, v6.4h, v25.4h ld1 {v3.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - smlal v14.4s, v7.4h, v26.4h + smlal v30.4s, v7.4h, v26.4h ld1 {v4.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - smlal v14.4s, v16.4h, v27.4h + smlal v30.4s, v16.4h, v27.4h ld1 {v5.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - smlal v14.4s, v17.4h, v28.4h + smlal v30.4s, v17.4h, v28.4h ld1 {v6.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - smlal v14.4s, v18.4h, v29.4h + smlal v30.4s, v18.4h, v29.4h ld1 {v7.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - st1 {v8.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)// - sqshrn v12.4h, v12.4s,#6 - sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + st1 {v19.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)// + sqshrn v21.4h, v21.4s,#6 + sqrshrun v20.8b, v20.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// add x20,x1,x9 csel x1, x20, x1,le @@ -226,164 +226,164 @@ prolog: kernel_8: - smull v8.4s, v1.4h, v23.4h //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// + smull v19.4s, v1.4h, v23.4h 
//mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// subs x4,x4,#4 - smlal v8.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)// + smlal v19.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)// add x20,x0,x8,lsl #0 csel x0, x20, x0,le - smlal v8.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)// - smlal v8.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// - smlal v8.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// - smlal v8.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)// - smlal v8.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// - smlal v8.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)// - st1 {v10.s}[0],[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// - - sqshrn v14.4h, v14.4s,#6 - sqrshrun v12.8b, v12.8h,#6 + smlal v19.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)// + smlal v19.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// + smlal v19.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// + smlal v19.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)// + smlal v19.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// + smlal v19.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)// + st1 {v20.s}[0],[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// + + sqshrn v30.4h, v30.4s,#6 + sqrshrun v21.8b, v21.8h,#6 ld1 {v16.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - smull v10.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// - smlal v10.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)// - smlal v10.4s, v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)// - smlal v10.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// - smlal v10.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// - smlal v10.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)// - st1 {v12.s}[0],[x14],x6 + smull v20.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// + smlal v20.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)// + smlal v20.4s, v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)// + smlal v20.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// + smlal v20.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// + smlal v20.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)// + st1 {v21.s}[0],[x14],x6 - smlal v10.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// + smlal v20.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// ld1 {v17.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - smlal v10.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)// + smlal v20.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)// - sqshrn v8.4h, v8.4s,#6 - sqrshrun v14.8b, v14.8h,#6 + sqshrn v19.4h, v19.4s,#6 + sqrshrun v30.8b, v30.8h,#6 - smull v12.4s, v3.4h, v23.4h + smull v21.4s, v3.4h, v23.4h csel x4, x5, x4,le //x5 ->wd - smlal v12.4s, v2.4h, v22.4h + smlal v21.4s, v2.4h, v22.4h ld1 {v18.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - smlal v12.4s, v4.4h, v24.4h + smlal v21.4s, v4.4h, v24.4h add x3,x0,x2 //pu1_src_tmp += src_strd// - smlal v12.4s, v5.4h, v25.4h + smlal v21.4s, v5.4h, v25.4h - smlal v12.4s, 
v6.4h, v26.4h - st1 {v14.s}[0],[x14],x6 + smlal v21.4s, v6.4h, v26.4h + st1 {v30.s}[0],[x14],x6 - smlal v12.4s, v7.4h, v27.4h + smlal v21.4s, v7.4h, v27.4h ld1 {v1.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - smlal v12.4s, v16.4h, v28.4h + smlal v21.4s, v16.4h, v28.4h add x14,x1,x6 - smlal v12.4s, v17.4h, v29.4h + smlal v21.4s, v17.4h, v29.4h ld1 {v0.4h},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)// - sqshrn v10.4h, v10.4s,#6 - sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + sqshrn v20.4h, v20.4s,#6 + sqrshrun v19.8b, v19.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// ld1 {v2.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - smull v14.4s, v4.4h, v23.4h - smlal v14.4s, v3.4h, v22.4h - smlal v14.4s, v5.4h, v24.4h + smull v30.4s, v4.4h, v23.4h + smlal v30.4s, v3.4h, v22.4h + smlal v30.4s, v5.4h, v24.4h ld1 {v3.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - smlal v14.4s, v6.4h, v25.4h + smlal v30.4s, v6.4h, v25.4h ld1 {v4.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - smlal v14.4s, v7.4h, v26.4h + smlal v30.4s, v7.4h, v26.4h ld1 {v5.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - smlal v14.4s, v16.4h, v27.4h + smlal v30.4s, v16.4h, v27.4h ld1 {v6.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - smlal v14.4s, v17.4h, v28.4h + smlal v30.4s, v17.4h, v28.4h ld1 {v7.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - smlal v14.4s, v18.4h, v29.4h - st1 {v8.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)// + smlal v30.4s, v18.4h, v29.4h + st1 {v19.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)// - sqshrn v12.4h, v12.4s,#6 + sqshrn v21.4h, v21.4s,#6 add x20,x1,x9 csel x1, x20, x1,le - sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + sqrshrun v20.8b, v20.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// subs x7,x7,#4 bgt kernel_8 //jumps to kernel_8 epilog: - smull v8.4s, v1.4h, v23.4h //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// - smlal v8.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)// - smlal v8.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)// - smlal v8.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// - smlal v8.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// - smlal v8.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)// - smlal v8.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// - smlal v8.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)// - st1 {v10.s}[0],[x14],x6 + smull v19.4s, v1.4h, v23.4h //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// + smlal v19.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)// + smlal v19.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)// + smlal v19.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// + smlal v19.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// + smlal v19.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)// + smlal v19.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// + smlal v19.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)// + st1 {v20.s}[0],[x14],x6 - sqshrn v14.4h, v14.4s,#6 - sqrshrun v12.8b, v12.8h,#6 + sqshrn v30.4h, v30.4s,#6 + sqrshrun v21.8b, v21.8h,#6 ld1 {v16.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - smull v10.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// - smlal v10.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)// - smlal v10.4s, 
v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)// - smlal v10.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// - smlal v10.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// - smlal v10.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)// - smlal v10.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// - smlal v10.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)// - st1 {v12.s}[0],[x14],x6 - - sqshrn v8.4h, v8.4s,#6 - sqrshrun v14.8b, v14.8h,#6 + smull v20.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// + smlal v20.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)// + smlal v20.4s, v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)// + smlal v20.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// + smlal v20.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// + smlal v20.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)// + smlal v20.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// + smlal v20.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)// + st1 {v21.s}[0],[x14],x6 + + sqshrn v19.4h, v19.4s,#6 + sqrshrun v30.8b, v30.8h,#6 ld1 {v17.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - smull v12.4s, v3.4h, v23.4h - smlal v12.4s, v2.4h, v22.4h - smlal v12.4s, v4.4h, v24.4h - smlal v12.4s, v5.4h, v25.4h - smlal v12.4s, v6.4h, v26.4h - smlal v12.4s, v7.4h, v27.4h - smlal v12.4s, v16.4h, v28.4h - smlal v12.4s, v17.4h, v29.4h - st1 {v14.s}[0],[x14],x6 - sqshrn v10.4h, v10.4s,#6 - sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + smull v21.4s, v3.4h, v23.4h + smlal v21.4s, v2.4h, v22.4h + smlal v21.4s, v4.4h, v24.4h + smlal v21.4s, v5.4h, v25.4h + smlal v21.4s, v6.4h, v26.4h + smlal v21.4s, v7.4h, v27.4h + smlal v21.4s, v16.4h, v28.4h + smlal v21.4s, v17.4h, v29.4h + st1 {v30.s}[0],[x14],x6 + sqshrn v20.4h, v20.4s,#6 + sqrshrun v19.8b, v19.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// ld1 {v18.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - smull v14.4s, v4.4h, v23.4h - smlal v14.4s, v3.4h, v22.4h - smlal v14.4s, v5.4h, v24.4h - smlal v14.4s, v6.4h, v25.4h - smlal v14.4s, v7.4h, v26.4h - smlal v14.4s, v16.4h, v27.4h - smlal v14.4s, v17.4h, v28.4h - smlal v14.4s, v18.4h, v29.4h - sqshrn v12.4h, v12.4s,#6 - sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// + smull v30.4s, v4.4h, v23.4h + smlal v30.4s, v3.4h, v22.4h + smlal v30.4s, v5.4h, v24.4h + smlal v30.4s, v6.4h, v25.4h + smlal v30.4s, v7.4h, v26.4h + smlal v30.4s, v16.4h, v27.4h + smlal v30.4s, v17.4h, v28.4h + smlal v30.4s, v18.4h, v29.4h + sqshrn v21.4h, v21.4s,#6 + sqrshrun v20.8b, v20.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// add x14,x1,x6 - st1 {v8.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)// + st1 {v19.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)// epilog_end: - st1 {v10.s}[0],[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// - sqrshrun v12.8b, v12.8h,#6 + st1 {v20.s}[0],[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// + sqrshrun v21.8b, v21.8h,#6 - st1 {v12.s}[0],[x14],x6 - sqshrn v14.4h, v14.4s,#6 - sqrshrun v14.8b, v14.8h,#6 + st1 {v21.s}[0],[x14],x6 + sqshrn v30.4h, v30.4s,#6 + sqrshrun v30.8b, v30.8h,#6 - st1 {v14.s}[0],[x14],x6 + st1 {v30.s}[0],[x14],x6 end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp], #16 - pop_v_regs + ret diff --git 
a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s index da316ae..ca48db5 100644 --- a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s +++ b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s @@ -70,7 +70,7 @@ ihevc_inter_pred_luma_vert_w16out_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! mov x15,x4 // pi1_coeff @@ -118,83 +118,83 @@ prolog_16out: ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)// subs x4,x4,#8 ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// + umull v19.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// + umlsl v19.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// + umlsl v19.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// + umlal v19.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// + umlal v19.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// + umlsl v19.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// + umlal v19.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// + umlsl v19.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// add x20,x0,x8 csel x0, x20, x0,le - umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// + umull v20.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// bic x20,x5,#7 //x5 ->wd csel x4, x20, x4,le - umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// + umlsl v20.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// + umlsl v20.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// add x20,x20,x3 prfm PLDL1KEEP,[x20] - umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// + umlal v20.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// add x20,x3, x2 prfm PLDL1KEEP,[x20] - umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// + umlal v20.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// add x20,x3, x2, lsl #1 prfm PLDL1KEEP,[x20] - umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// + umlsl 
v20.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// add x3, x3, x2 - umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// + umlal v20.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// add x20,x3, x2, lsl #1 prfm PLDL1KEEP,[x20] - umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// + umlsl v20.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// add x3,x0,x2 //pu1_src_tmp += src_strd// - umull v12.8h, v3.8b, v23.8b + umull v21.8h, v3.8b, v23.8b ld1 {v1.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlsl v12.8h, v2.8b, v22.8b + umlsl v21.8h, v2.8b, v22.8b ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlsl v12.8h, v4.8b, v24.8b + umlsl v21.8h, v4.8b, v24.8b ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlal v12.8h, v5.8b, v25.8b - umlal v12.8h, v6.8b, v26.8b - umlsl v12.8h, v7.8b, v27.8b - umlal v12.8h, v16.8b, v28.8b - umlsl v12.8h, v17.8b, v29.8b + umlal v21.8h, v5.8b, v25.8b + umlal v21.8h, v6.8b, v26.8b + umlsl v21.8h, v7.8b, v27.8b + umlal v21.8h, v16.8b, v28.8b + umlsl v21.8h, v17.8b, v29.8b add x14,x1,x6 - st1 {v8.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)// + st1 {v19.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)// //vqrshrun.s16 d10,q5,#6 //sto_res = vqmovun_s16(sto_res_tmp)// add x20,x1,x9,lsl #1 csel x1, x20, x1,le - umull v14.8h, v4.8b, v23.8b + umull v30.8h, v4.8b, v23.8b subs x7,x7,#4 - umlsl v14.8h, v3.8b, v22.8b - umlsl v14.8h, v5.8b, v24.8b - umlal v14.8h, v6.8b, v25.8b + umlsl v30.8h, v3.8b, v22.8b + umlsl v30.8h, v5.8b, v24.8b + umlal v30.8h, v6.8b, v25.8b ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - umlal v14.8h, v7.8b, v26.8b + umlal v30.8h, v7.8b, v26.8b ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlsl v14.8h, v16.8b, v27.8b + umlsl v30.8h, v16.8b, v27.8b ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlal v14.8h, v17.8b, v28.8b + umlal v30.8h, v17.8b, v28.8b ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlsl v14.8h, v18.8b, v29.8b + umlsl v30.8h, v18.8b, v29.8b ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - st1 {v10.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// + st1 {v20.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// //vqrshrun.s16 d12,q6,#6 @@ -204,170 +204,170 @@ prolog_16out: kernel_8_16out: subs x4,x4,#8 - umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// + umull v19.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// add x20,x0,x8 csel x0, x20, x0,le - umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// + umlsl v19.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// + umlsl v19.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// + umlal v19.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// bic x20,x5,#7 //x5 ->wd csel x4, x20, x4,le - umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// + umlal v19.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, 
coeffabs_5)// + umlsl v19.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// - st1 {v12.16b},[x14],x6 - umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// + st1 {v21.16b},[x14],x6 + umlal v19.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// add x3,x0,x2 //pu1_src_tmp += src_strd// - umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// + umlsl v19.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// // and x11, x0, #31 - umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// + umull v20.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// - st1 {v14.16b},[x14],x6 - umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// + st1 {v30.16b},[x14],x6 + umlsl v20.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// add x14,x1,x6 - umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// + umlsl v20.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// + umlal v20.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// ld1 {v1.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// + umlal v20.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// - st1 {v8.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)// - umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// + st1 {v19.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)// + umlsl v20.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// add x20,x1,x9,lsl #1 csel x1, x20, x1,le - umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// + umlal v20.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// // cmp x11, x10 - umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// + umlsl v20.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// add x10, x3, x2, lsl #3 // 10*strd - 8+2 - umull v12.8h, v3.8b, v23.8b + umull v21.8h, v3.8b, v23.8b add x10, x10, x2 // 11*strd - umlsl v12.8h, v2.8b, v22.8b + umlsl v21.8h, v2.8b, v22.8b add x20,x20,x10 prfm PLDL1KEEP,[x20] //11+ 0 - umlsl v12.8h, v4.8b, v24.8b + umlsl v21.8h, v4.8b, v24.8b add x20,x10, x2 prfm PLDL1KEEP,[x20] //11+ 1*strd - umlal v12.8h, v5.8b, v25.8b + umlal v21.8h, v5.8b, v25.8b add x20,x10, x2, lsl #1 prfm PLDL1KEEP,[x20] //11+ 2*strd - umlal v12.8h, v6.8b, v26.8b + umlal v21.8h, v6.8b, v26.8b add x10, x10, x2 //12*strd - umlsl v12.8h, v7.8b, v27.8b + umlsl v21.8h, v7.8b, v27.8b add x20,x10, x2, lsl #1 prfm PLDL1KEEP,[x20] //11+ 3*strd - umlal v12.8h, v16.8b, v28.8b + umlal v21.8h, v16.8b, v28.8b // mov x10, x11 - umlsl v12.8h, v17.8b, v29.8b + umlsl v21.8h, v17.8b, v29.8b ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umull v14.8h, v4.8b, v23.8b + umull v30.8h, v4.8b, v23.8b subs x7,x7,#4 - umlsl v14.8h, v3.8b, v22.8b + umlsl v30.8h, v3.8b, v22.8b - st1 {v10.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// - umlsl v14.8h, v5.8b, v24.8b + st1 {v20.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// + umlsl v30.8h, v5.8b, v24.8b ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - umlal v14.8h, v6.8b, v25.8b + umlal v30.8h, v6.8b, v25.8b ld1 
{v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umlal v14.8h, v7.8b, v26.8b + umlal v30.8h, v7.8b, v26.8b ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umlsl v14.8h, v16.8b, v27.8b + umlsl v30.8h, v16.8b, v27.8b ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umlal v14.8h, v17.8b, v28.8b + umlal v30.8h, v17.8b, v28.8b ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)// - umlsl v14.8h, v18.8b, v29.8b + umlsl v30.8h, v18.8b, v29.8b bgt kernel_8_16out //jumps to kernel_8 epilog_16out: - umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// - umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// - umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// - umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// - umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// - umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// - umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// - umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// - st1 {v12.16b},[x14],x6 + umull v19.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)// + umlsl v19.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)// + umlsl v19.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)// + umlal v19.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)// + umlal v19.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)// + umlsl v19.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)// + umlal v19.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)// + umlsl v19.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)// + st1 {v21.16b},[x14],x6 //vqrshrun.s16 d14,q7,#6 ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)// - umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// - umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// - umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// - umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// - umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// - umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// - umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// - umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// - st1 {v14.16b},[x14],x6 + umull v20.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)// + umlsl v20.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)// + umlsl v20.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)// + umlal v20.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)// + umlal v20.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)// + umlsl v20.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)// + umlal v20.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)// + umlsl v20.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)// + st1 {v30.16b},[x14],x6 //vqrshrun.s16 d8,q4,#6 //sto_res = vqmovun_s16(sto_res_tmp)// ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)// - umull v12.8h, v3.8b, 
v23.8b - umlsl v12.8h, v2.8b, v22.8b - umlsl v12.8h, v4.8b, v24.8b - umlal v12.8h, v5.8b, v25.8b - umlal v12.8h, v6.8b, v26.8b - umlsl v12.8h, v7.8b, v27.8b - umlal v12.8h, v16.8b, v28.8b - umlsl v12.8h, v17.8b, v29.8b + umull v21.8h, v3.8b, v23.8b + umlsl v21.8h, v2.8b, v22.8b + umlsl v21.8h, v4.8b, v24.8b + umlal v21.8h, v5.8b, v25.8b + umlal v21.8h, v6.8b, v26.8b + umlsl v21.8h, v7.8b, v27.8b + umlal v21.8h, v16.8b, v28.8b + umlsl v21.8h, v17.8b, v29.8b add x14,x1,x6 - st1 {v8.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)// + st1 {v19.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)// //vqrshrun.s16 d10,q5,#6 //sto_res = vqmovun_s16(sto_res_tmp)// ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)// - umull v14.8h, v4.8b, v23.8b - umlsl v14.8h, v3.8b, v22.8b - umlsl v14.8h, v5.8b, v24.8b - umlal v14.8h, v6.8b, v25.8b - umlal v14.8h, v7.8b, v26.8b - umlsl v14.8h, v16.8b, v27.8b - umlal v14.8h, v17.8b, v28.8b - umlsl v14.8h, v18.8b, v29.8b - - st1 {v10.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// + umull v30.8h, v4.8b, v23.8b + umlsl v30.8h, v3.8b, v22.8b + umlsl v30.8h, v5.8b, v24.8b + umlal v30.8h, v6.8b, v25.8b + umlal v30.8h, v7.8b, v26.8b + umlsl v30.8h, v16.8b, v27.8b + umlal v30.8h, v17.8b, v28.8b + umlsl v30.8h, v18.8b, v29.8b + + st1 {v20.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)// //vqrshrun.s16 d12,q6,#6 epilog_end_16out: - st1 {v12.16b},[x14],x6 + st1 {v21.16b},[x14],x6 //vqrshrun.s16 d14,q7,#6 - st1 {v14.16b},[x14],x6 + st1 {v30.16b},[x14],x6 end_loops_16out: @@ -377,7 +377,7 @@ end_loops_16out: // ldmeqfd sp!,{x4-x12,x15} //reload the registers from sp bne lbl355 ldp x19, x20,[sp], #16 - pop_v_regs + ret lbl355: mov x5, #4 @@ -418,34 +418,34 @@ inner_loop_wd_4_16out: ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)// umlsl v0.8h, v6.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)// - umull v8.8h, v7.8b, v23.8b + umull v19.8h, v7.8b, v23.8b dup v4.2s, v7.2s[1] //src_tmp1 = vdup_lane_u32(src_tmp4, 1)// umull v2.8h, v7.8b, v25.8b //mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)// ld1 {v4.s}[1],[x3],x2 //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)// - umlsl v8.8h, v6.8b, v22.8b + umlsl v19.8h, v6.8b, v22.8b umlal v0.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)// dup v5.2s, v4.2s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)// - umlsl v8.8h, v4.8b, v24.8b + umlsl v19.8h, v4.8b, v24.8b ld1 {v5.s}[1],[x3],x2 //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)// umlsl v2.8h, v5.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)// dup v6.2s, v5.2s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)// - umlal v8.8h, v5.8b, v25.8b + umlal v19.8h, v5.8b, v25.8b ld1 {v6.s}[1],[x3],x2 //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)// umlal v0.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)// dup v7.2s, v6.2s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)// - umlal v8.8h, v6.8b, v26.8b + umlal v19.8h, v6.8b, v26.8b ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)// umlsl v2.8h, v7.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)// dup v4.2s, v7.2s[1] add v0.8h, v0.8h , v2.8h //mul_res1 = vaddq_u16(mul_res1, mul_res2)// - umlsl v8.8h, v7.8b, v27.8b + umlsl v19.8h, v7.8b, v27.8b ld1 {v4.s}[1],[x3],x2 - umlal v8.8h, v4.8b, v28.8b + umlal v19.8h, v4.8b, v28.8b dup v5.2s, 
v4.2s[1] //vqrshrun.s16 d0,q0,#6 //sto_res = vqmovun_s16(sto_res_tmp)// @@ -453,13 +453,13 @@ inner_loop_wd_4_16out: add x3,x1,x6 st1 {v0.d}[0],[x1],#8 //vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)// - umlsl v8.8h, v5.8b, v29.8b + umlsl v19.8h, v5.8b, v29.8b st1 {v0.d}[1],[x3],x6 //vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)// //vqrshrun.s16 d8,q4,#6 - st1 {v8.d}[0],[x3],x6 + st1 {v19.d}[0],[x3],x6 //add x1,x1,#4 - st1 {v8.d}[1],[x3] + st1 {v19.d}[1],[x3] bgt inner_loop_wd_4_16out end_inner_loop_wd_4_16out: @@ -470,7 +470,7 @@ end_inner_loop_wd_4_16out: // ldmfd sp!, {x4-x12, x15} //reload the registers from sp ldp x19, x20,[sp], #16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_inter_pred_luma_copy_w16out.s b/common/arm64/ihevc_inter_pred_luma_copy_w16out.s index 86ffdba..b5498cf 100644 --- a/common/arm64/ihevc_inter_pred_luma_copy_w16out.s +++ b/common/arm64/ihevc_inter_pred_luma_copy_w16out.s @@ -84,7 +84,7 @@ ihevc_inter_pred_luma_copy_w16out_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! mov x15,x4 // pi1_coeff @@ -138,7 +138,7 @@ end_inner_loop_wd_4: end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp], #16 - pop_v_regs + ret @@ -159,14 +159,14 @@ core_loop_wd_8: prolog: add x6,x0,x2 //pu1_src_tmp += src_strd add x10,x1,x5 - ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp) - ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) - uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp) - uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp) - uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp) + ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) + uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp) subs x4,x4,#8 //wd decrements by 8 shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6) shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6) @@ -175,10 +175,10 @@ prolog: add x20,x0,x8 csel x0, x20, x0,le add x6,x0,x2 //pu1_src_tmp += src_strd - ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp) - ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp) + ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp) st1 {v0.8h},[x1],#16 //vst1q_s16(pi2_dst_tmp, tmp) add x20,x1,x11,lsl #1 @@ -196,15 +196,15 @@ prolog: outer_loop_wd_8: st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) + uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp) st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp) - uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp) subs x4,x4,#8 //wd decrements by 8 add x20,x0,x8 @@ -212,16 
+212,16 @@ outer_loop_wd_8:
add x6,x0,x2 //pu1_src_tmp += src_strd
- ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
+ ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6)
- ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6)
- ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
shl v4.8h, v20.8h,#6 //vshlq_n_s16(tmp, 6)
- ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
add x10,x1,x5
shl v6.8h, v22.8h,#6 //vshlq_n_s16(tmp, 6)
@@ -238,15 +238,15 @@ epilog:
st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
- uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
+ uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
- uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
- uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
- uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
//add x6,x0,x2 //pu1_src_tmp += src_strd
shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6)
@@ -264,7 +264,7 @@ epilog_end:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp], #16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s b/common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s
index b94ec3c..7147200 100644
--- a/common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s
+++ b/common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s
@@ -114,7 +114,7 @@ ihevc_inter_pred_luma_vert_w16inp_w16out_av8:
//stmfd sp!, {r4-r12, r14} //stack stores the values of the arguments
- push_v_regs
+ stp x19,x20,[sp, #-16]!
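The ihevc_inter_pred_luma_vert_w16inp_w16out hunks that follow are the same mechanical rename: the four software-pipelined accumulators move from v8/v10/v12/v14 to v19/v20/v21/v31 while the instruction sequence itself is untouched. Condensed to a single output row, the pattern each prolog/kernel_8/epilog hunk repeats looks like the sketch below (for orientation only; as in the surrounding code, v0-v7 hold eight consecutive input rows, v22-v29 the filter taps, and v30 the offset, and the comments paraphrase the originals):

    smull v19.4s, v1.4h, v23.4h    // acc  = row1 * coeffabs_1
    smlal v19.4s, v0.4h, v22.4h    // acc += row0 * coeffabs_0
    smlal v19.4s, v2.4h, v24.4h    // acc += row2 * coeffabs_2
    smlal v19.4s, v3.4h, v25.4h    // acc += row3 * coeffabs_3
    smlal v19.4s, v4.4h, v26.4h    // acc += row4 * coeffabs_4
    smlal v19.4s, v5.4h, v27.4h    // acc += row5 * coeffabs_5
    smlal v19.4s, v6.4h, v28.4h    // acc += row6 * coeffabs_6
    smlal v19.4s, v7.4h, v29.4h    // acc += row7 * coeffabs_7
    sub   v19.4s, v19.4s, v30.4s   // subtract the w16inp offset
    shrn  v19.4h, v19.4s, #6       // >> 6 and narrow to the 16-bit output

v19 is caller-saved, so no d-register spill is needed around the call, which is the point of the commit.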
mov x15,x4 // pi1_coeff
@@ -163,71 +163,71 @@ prolog:
ld1 {v0.4h},[x0], #8 //src_tmp1 = ld1_u8(pu1_src_tmp)//
subs x4,x4,#4
ld1 {v2.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smull v8.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
+ smull v19.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
ld1 {v3.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
- smlal v8.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+ smlal v19.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
ld1 {v4.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- smlal v8.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+ smlal v19.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
ld1 {v5.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
- smlal v8.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ smlal v19.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
ld1 {v6.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smlal v8.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ smlal v19.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
ld1 {v7.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
- smlal v8.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
- smlal v8.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
- smlal v8.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+ smlal v19.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+ smlal v19.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ smlal v19.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
ld1 {v16.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- smull v10.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
+ smull v20.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
add x20,x0,x8,lsl #0
csel x0,x20,x0,le
- smlal v10.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+ smlal v20.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
csel x4,x5,x4,le
- smlal v10.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+ smlal v20.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
ld1 {v17.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
- smlal v10.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ smlal v20.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
ld1 {v18.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smlal v10.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ smlal v20.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
add x3,x0,x2 //pu1_src_tmp += src_strd//
- smlal v10.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
- smlal v10.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
- smlal v10.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
- sub v8.4s, v8.4s, v30.4s
+ smlal v20.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+ smlal v20.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ smlal v20.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+ sub v19.4s, v19.4s, v30.4s
ld1 {v1.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smull v12.4s,v3.4h,v23.4h
+ smull v21.4s,v3.4h,v23.4h
ld1 {v0.4h},[x0],#8 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- smlal v12.4s,v2.4h,v22.4h
+ smlal v21.4s,v2.4h,v22.4h
ld1 {v2.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smlal v12.4s,v4.4h,v24.4h
- smlal v12.4s,v5.4h,v25.4h
- smlal v12.4s,v6.4h,v26.4h
- smlal v12.4s,v7.4h,v27.4h
- smlal v12.4s,v16.4h,v28.4h
- smlal v12.4s,v17.4h,v29.4h
+ smlal v21.4s,v4.4h,v24.4h
+ smlal v21.4s,v5.4h,v25.4h
+ smlal v21.4s,v6.4h,v26.4h
+ smlal v21.4s,v7.4h,v27.4h
+ smlal v21.4s,v16.4h,v28.4h
+ smlal v21.4s,v17.4h,v29.4h
add x14,x1,x6
- sub v10.4s, v10.4s, v30.4s
- shrn v8.4h, v8.4s, #6
+ sub v20.4s, v20.4s, v30.4s
+ shrn v19.4h, v19.4s, #6
//vqrshrun d8,q4,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
- smull v14.4s,v4.4h,v23.4h
- smlal v14.4s,v3.4h,v22.4h
- smlal v14.4s,v5.4h,v24.4h
- smlal v14.4s,v6.4h,v25.4h
+ smull v31.4s,v4.4h,v23.4h
+ smlal v31.4s,v3.4h,v22.4h
+ smlal v31.4s,v5.4h,v24.4h
+ smlal v31.4s,v6.4h,v25.4h
ld1 {v3.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v7.4h,v26.4h
+ smlal v31.4s,v7.4h,v26.4h
ld1 {v4.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v16.4h,v27.4h
+ smlal v31.4s,v16.4h,v27.4h
ld1 {v5.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v17.4h,v28.4h
+ smlal v31.4s,v17.4h,v28.4h
ld1 {v6.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v18.4h,v29.4h
+ smlal v31.4s,v18.4h,v29.4h
ld1 {v7.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
- st1 {v8.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)//
- sub v12.4s, v12.4s, v30.4s
- shrn v10.4h, v10.4s, #6
+ st1 {v19.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)//
+ sub v21.4s, v21.4s, v30.4s
+ shrn v20.4h, v20.4s, #6
//vqrshrun d10,q5,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
add x20, x1, x9
csel x1, x20, x1, le
@@ -240,87 +240,87 @@ kernel_8:
- smull v8.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
+ smull v19.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
subs x4,x4,#4
- smlal v8.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+ smlal v19.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
add x20,x0,x8,lsl #0
csel x0,x20,x0,le
- smlal v8.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
- smlal v8.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
- smlal v8.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
- smlal v8.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
- smlal v8.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
- smlal v8.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
- st1 {v10.2s},[x14],x6 //st1_u8(pu1_dst_tmp,sto_res)//
-
- sub v14.4S, v14.4s, v30.4s
- shrn v12.4h, v12.4s, #6
+ smlal v19.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+ smlal v19.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ smlal v19.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ smlal v19.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+ smlal v19.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ smlal v19.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+ st1 {v20.2s},[x14],x6 //st1_u8(pu1_dst_tmp,sto_res)//
+
+ sub v31.4S, v31.4s, v30.4s
+ shrn v21.4h, v21.4s, #6
//vqrshrun d12,q6,#6
ld1 {v16.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- smull v10.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
- smlal v10.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
- smlal v10.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
- smlal v10.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
- smlal v10.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
- smlal v10.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
- st1 {v12.2s},[x14],x6
+ smull v20.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
+ smlal v20.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+ smlal v20.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+ smlal v20.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ smlal v20.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ smlal v20.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+ st1 {v21.2s},[x14],x6
- smlal v10.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ smlal v20.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
ld1 {v17.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
- smlal v10.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+ smlal v20.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
- sub v8.4s, v8.4s, v30.4s
- shrn v14.4h, v14.4s, #6
+ sub v19.4s, v19.4s, v30.4s
+ shrn v31.4h, v31.4s, #6
//vqrshrun d14,q7,#6
- smull v12.4s,v3.4h,v23.4h
+ smull v21.4s,v3.4h,v23.4h
csel x4,x5,x4,le
- smlal v12.4s,v2.4h,v22.4h
+ smlal v21.4s,v2.4h,v22.4h
ld1 {v18.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smlal v12.4s,v4.4h,v24.4h
+ smlal v21.4s,v4.4h,v24.4h
add x3,x0,x2 //pu1_src_tmp += src_strd//
- smlal v12.4s,v5.4h,v25.4h
+ smlal v21.4s,v5.4h,v25.4h
- smlal v12.4s,v6.4h,v26.4h
- st1 {v14.2s},[x14],x6
+ smlal v21.4s,v6.4h,v26.4h
+ st1 {v31.2s},[x14],x6
- smlal v12.4s,v7.4h,v27.4h
+ smlal v21.4s,v7.4h,v27.4h
ld1 {v1.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
- smlal v12.4s,v16.4h,v28.4h
+ smlal v21.4s,v16.4h,v28.4h
add x14,x1,x6
- smlal v12.4s,v17.4h,v29.4h
+ smlal v21.4s,v17.4h,v29.4h
ld1 {v0.4h},[x0],#8 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- sub v10.4s, v10.4s, v30.4s
- shrn v8.4h, v8.4s, #6
+ sub v20.4s, v20.4s, v30.4s
+ shrn v19.4h, v19.4s, #6
//vqrshrun d8,q4,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
ld1 {v2.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smull v14.4s,v4.4h,v23.4h
- smlal v14.4s,v3.4h,v22.4h
- smlal v14.4s,v5.4h,v24.4h
+ smull v31.4s,v4.4h,v23.4h
+ smlal v31.4s,v3.4h,v22.4h
+ smlal v31.4s,v5.4h,v24.4h
ld1 {v3.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v6.4h,v25.4h
+ smlal v31.4s,v6.4h,v25.4h
ld1 {v4.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v7.4h,v26.4h
+ smlal v31.4s,v7.4h,v26.4h
ld1 {v5.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v16.4h,v27.4h
+ smlal v31.4s,v16.4h,v27.4h
ld1 {v6.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v17.4h,v28.4h
+ smlal v31.4s,v17.4h,v28.4h
ld1 {v7.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v18.4h,v29.4h
- st1 {v8.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)//
+ smlal v31.4s,v18.4h,v29.4h
+ st1 {v19.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)//
- sub v12.4s, v12.4s, v30.4s
- shrn v10.4h, v10.4s, #6
+ sub v21.4s, v21.4s, v30.4s
+ shrn v20.4h, v20.4s, #6
add x20, x1, x9
csel x1, x20, x1, le
@@ -331,83 +331,83 @@ epilog:
- smull v8.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
- smlal v8.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1,
coeffabs_0)// - smlal v8.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)// - smlal v8.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)// - smlal v8.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)// - smlal v8.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)// - smlal v8.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)// - smlal v8.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)// - st1 {v10.2s},[x14],x6 - - sub v14.4s, v14.4s, v30.4s - shrn v12.4h, v12.4s, #6 + smull v19.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)// + smlal v19.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)// + smlal v19.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)// + smlal v19.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)// + smlal v19.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)// + smlal v19.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)// + smlal v19.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)// + smlal v19.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)// + st1 {v20.2s},[x14],x6 + + sub v31.4s, v31.4s, v30.4s + shrn v21.4h, v21.4s, #6 //vqrshrun d12,q6,#6 ld1 {v16.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)// - smull v10.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)// - smlal v10.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)// - smlal v10.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)// - smlal v10.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)// - smlal v10.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)// - smlal v10.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)// - smlal v10.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)// - smlal v10.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)// - st1 {v12.2s},[x14],x6 - - sub v8.4s, v8.4s, v30.4s - shrn v14.4h, v14.4s, #6 + smull v20.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)// + smlal v20.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)// + smlal v20.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)// + smlal v20.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)// + smlal v20.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)// + smlal v20.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)// + smlal v20.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)// + smlal v20.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)// + st1 {v21.2s},[x14],x6 + + sub v19.4s, v19.4s, v30.4s + shrn v31.4h, v31.4s, #6 //vqrshrun d14,q7,#6 ld1 {v17.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)// - smull v12.4s,v3.4h,v23.4h - smlal v12.4s,v2.4h,v22.4h - smlal v12.4s,v4.4h,v24.4h - smlal v12.4s,v5.4h,v25.4h - smlal v12.4s,v6.4h,v26.4h - smlal v12.4s,v7.4h,v27.4h - smlal v12.4s,v16.4h,v28.4h - smlal v12.4s,v17.4h,v29.4h - st1 {v14.2s},[x14],x6 - sub v10.4s, v10.4s, v30.4s - shrn v8.4h, v8.4s, #6 + smull v21.4s,v3.4h,v23.4h + smlal v21.4s,v2.4h,v22.4h + smlal v21.4s,v4.4h,v24.4h + smlal v21.4s,v5.4h,v25.4h + smlal v21.4s,v6.4h,v26.4h + smlal v21.4s,v7.4h,v27.4h + smlal v21.4s,v16.4h,v28.4h + smlal v21.4s,v17.4h,v29.4h + st1 {v31.2s},[x14],x6 + sub v20.4s, v20.4s, 
v30.4s + shrn v19.4h, v19.4s, #6 //vqrshrun d8,q4,#6 //sto_res = vqmovun_s16(sto_res_tmp)// ld1 {v18.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)// - smull v14.4s,v4.4h,v23.4h - smlal v14.4s,v3.4h,v22.4h - smlal v14.4s,v5.4h,v24.4h - smlal v14.4s,v6.4h,v25.4h - smlal v14.4s,v7.4h,v26.4h - smlal v14.4s,v16.4h,v27.4h - smlal v14.4s,v17.4h,v28.4h - smlal v14.4s,v18.4h,v29.4h - sub v12.4s, v12.4s, v30.4s - shrn v10.4h, v10.4s, #6 + smull v31.4s,v4.4h,v23.4h + smlal v31.4s,v3.4h,v22.4h + smlal v31.4s,v5.4h,v24.4h + smlal v31.4s,v6.4h,v25.4h + smlal v31.4s,v7.4h,v26.4h + smlal v31.4s,v16.4h,v27.4h + smlal v31.4s,v17.4h,v28.4h + smlal v31.4s,v18.4h,v29.4h + sub v21.4s, v21.4s, v30.4s + shrn v20.4h, v20.4s, #6 //vqrshrun d10,q5,#6 //sto_res = vqmovun_s16(sto_res_tmp)// add x14,x1,x6 - st1 {v8.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)// + st1 {v19.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)// epilog_end: - st1 {v10.2s},[x14],x6 //st1_u8(pu1_dst_tmp,sto_res)// - shrn v12.4h, v12.4s, #6 + st1 {v20.2s},[x14],x6 //st1_u8(pu1_dst_tmp,sto_res)// + shrn v21.4h, v21.4s, #6 //vqrshrun d12,q6,#6 - st1 {v12.2s},[x14],x6 - sub v14.4s, v14.4s, v30.4s - shrn v14.4h, v14.4s, #6 + st1 {v21.2s},[x14],x6 + sub v31.4s, v31.4s, v30.4s + shrn v31.4h, v31.4s, #6 //vqrshrun d14,q7,#6 - st1 {v14.2s},[x14],x6 + st1 {v31.2s},[x14],x6 end_loops: //ldmfd sp!,{r4-r12,r15} //reload the registers from sp ldp x19, x20,[sp], #16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_intra_pred_chroma_horz.s b/common/arm64/ihevc_intra_pred_chroma_horz.s index da41e59..8de655c 100644 --- a/common/arm64/ihevc_intra_pred_chroma_horz.s +++ b/common/arm64/ihevc_intra_pred_chroma_horz.s @@ -96,7 +96,7 @@ ihevc_intra_pred_chroma_horz_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! lsl x6,x4,#2 //four_nt @@ -117,7 +117,7 @@ ihevc_intra_pred_chroma_horz_av8: core_loop_16: ld1 { v0.8h},[x12] //load 16 values. d1[7] will have the 1st value. sub x12,x12,#16 - ld1 { v10.8h},[x12] //load 16 values. d1[7] will have the 1st value. + ld1 { v18.8h},[x12] //load 16 values. d1[7] will have the 1st value. dup v2.8h, v0.4h[7] //duplicate the i value. 
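 // Background on why these renames cut stack traffic (AAPCS64 fact, stated here
 // rather than in the patch): the low 64 bits of v8-v15 (d8-d15) are callee-saved,
 // so any use forces stp/ldp pairs in the prologue and epilogue, while v0-v7 and
 // v16-v31 are caller-saved and free to clobber. Moving a scratch value out of
 // v8, as in the hunk above (syntax copied from this file's convention):
    dup v8.8h, v0.4h[4]             // old: touches d8, which must be spilled on entry
    dup v1.8h, v0.4h[4]             // new: v1 is caller-saved, no spill required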
@@ -126,7 +126,7 @@ core_loop_16: st1 { v2.8h},[x2],x3 //store in 1st row 0-16 columns st1 { v2.8h},[x9],x3 //store in 1st row 16-32 columns - dup v8.8h, v0.4h[4] + dup v1.8h, v0.4h[4] st1 { v4.8h},[x2],x3 st1 { v4.8h},[x9],x3 @@ -135,47 +135,47 @@ core_loop_16: st1 { v6.8h},[x9],x3 dup v4.8h, v0.4h[2] - st1 { v8.8h},[x2],x3 - st1 { v8.8h},[x9],x3 + st1 { v1.8h},[x2],x3 + st1 { v1.8h},[x9],x3 dup v6.8h, v0.4h[1] st1 { v2.8h},[x2],x3 st1 { v2.8h},[x9],x3 - dup v8.8h, v0.4h[0] + dup v1.8h, v0.4h[0] st1 { v4.8h},[x2],x3 st1 { v4.8h},[x9],x3 - dup v2.8h, v10.4h[7] + dup v2.8h, v18.4h[7] st1 { v6.8h},[x2],x3 st1 { v6.8h},[x9],x3 - dup v4.8h, v10.4h[6] - st1 { v8.8h},[x2],x3 - st1 { v8.8h},[x9],x3 + dup v4.8h, v18.4h[6] + st1 { v1.8h},[x2],x3 + st1 { v1.8h},[x9],x3 - dup v6.8h, v10.4h[5] + dup v6.8h, v18.4h[5] st1 { v2.8h},[x2],x3 st1 { v2.8h},[x9],x3 - dup v8.8h, v10.4h[4] + dup v1.8h, v18.4h[4] st1 { v4.8h},[x2],x3 st1 { v4.8h},[x9],x3 - dup v2.8h, v10.4h[3] + dup v2.8h, v18.4h[3] st1 { v6.8h},[x2],x3 st1 { v6.8h},[x9],x3 - dup v4.8h, v10.4h[2] - st1 { v8.8h},[x2],x3 - st1 { v8.8h},[x9],x3 + dup v4.8h, v18.4h[2] + st1 { v1.8h},[x2],x3 + st1 { v1.8h},[x9],x3 - dup v6.8h, v10.4h[1] + dup v6.8h, v18.4h[1] st1 { v2.8h},[x2],x3 st1 { v2.8h},[x9],x3 sub x12,x12,#16 //move to 16th value pointer - dup v8.8h, v10.4h[0] + dup v1.8h, v18.4h[0] st1 { v4.8h},[x2],x3 st1 { v4.8h},[x9],x3 @@ -183,12 +183,12 @@ core_loop_16: st1 { v6.8h},[x2],x3 st1 { v6.8h},[x9],x3 - st1 { v8.8h},[x2],x3 - st1 { v8.8h},[x9],x3 + st1 { v1.8h},[x2],x3 + st1 { v1.8h},[x9],x3 bgt core_loop_16 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret b endloop @@ -203,7 +203,7 @@ core_loop_8: sub x12,x12,#16 // ld1 { v30.16b},[x12] - dup v10.8h, v0.4h[7] + dup v18.8h, v0.4h[7] //vmovl.u8 q13,d26 dup v2.8h, v0.4h[6] @@ -215,18 +215,18 @@ core_loop_8: dup v6.8h, v0.4h[4] //vqadd.s16 q11,q13,q12 - dup v8.8h, v0.4h[3] + dup v1.8h, v0.4h[3] //vqmovun.s16 d22,q11 - st1 { v10.8h},[x2],x3 + st1 { v18.8h},[x2],x3 - dup v10.8h, v0.4h[2] + dup v18.8h, v0.4h[2] //vsubl.u8 q12,d31,d28 - dup v12.8h, v0.4h[1] + dup v19.8h, v0.4h[1] //vshr.s16 q12,q12,#1 - dup v14.8h, v0.4h[0] + dup v20.8h, v0.4h[0] //vqadd.s16 q11,q13,q12 dup v16.8h, v0.4h[3] @@ -238,14 +238,14 @@ core_loop_8: st1 { v4.8h},[x2],x3 st1 { v6.8h},[x2],x3 - st1 { v8.8h},[x2],x3 - st1 { v10.8h},[x2],x3 + st1 { v1.8h},[x2],x3 + st1 { v18.8h},[x2],x3 //vdup.8 q1,d0[2] - st1 { v12.8h},[x2],x3 + st1 { v19.8h},[x2],x3 //vdup.8 q2,d0[1] - st1 { v14.8h},[x2],x3 + st1 { v20.8h},[x2],x3 //vdup.8 q3,d0[0] //vst1.8 {q7},[x2],x3 @@ -269,7 +269,7 @@ core_loop_8: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret b endloop @@ -305,11 +305,11 @@ core_loop_4: st1 {v6.8b},[x2],x3 st1 {v3.8b},[x2],x3 - dup v8.4h, v0.4h[1] + dup v1.4h, v0.4h[1] st1 {v4.8b},[x2],x3 st1 {v5.8b},[x2],x3 - dup v9.4h, v0.4h[0] + dup v17.4h, v0.4h[0] //vst1.8 {d6},[x2],x3 //vst1.8 {d7},[x2],x3 @@ -317,7 +317,7 @@ core_loop_4: //vst1.8 {d9},[x2],x3 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret b endloop @@ -352,7 +352,7 @@ core_loop_4: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret endloop: diff --git a/common/arm64/ihevc_intra_pred_chroma_mode_18_34.s b/common/arm64/ihevc_intra_pred_chroma_mode_18_34.s index 52fc702..aacb35e 100644 --- a/common/arm64/ihevc_intra_pred_chroma_mode_18_34.s +++ 
b/common/arm64/ihevc_intra_pred_chroma_mode_18_34.s @@ -105,7 +105,7 @@ ihevc_intra_pred_chroma_mode_18_34_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! @@ -141,14 +141,14 @@ kernel: st1 {v4.8b, v5.8b},[x10],x3 ld1 {v6.8b, v7.8b},[x8],x6 st1 {v6.8b, v7.8b},[x10],x3 - ld1 {v8.8b, v9.8b},[x8],x6 - st1 {v8.8b, v9.8b},[x10],x3 - ld1 {v10.8b, v11.8b},[x8],x6 - st1 {v10.8b, v11.8b},[x10],x3 - ld1 {v12.8b, v13.8b},[x8],x6 - st1 {v12.8b, v13.8b},[x10],x3 - ld1 {v14.8b, v15.8b},[x8],x6 - st1 {v14.8b, v15.8b},[x10],x3 + ld1 {v16.8b, v17.8b},[x8],x6 + st1 {v16.8b, v17.8b},[x10],x3 + ld1 {v18.8b, v19.8b},[x8],x6 + st1 {v18.8b, v19.8b},[x10],x3 + ld1 {v20.8b, v21.8b},[x8],x6 + st1 {v20.8b, v21.8b},[x10],x3 + ld1 {v22.8b, v23.8b},[x8],x6 + st1 {v22.8b, v23.8b},[x10],x3 subs x12,x12,#8 bne kernel @@ -188,7 +188,7 @@ mode2_4: end_func: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s b/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s index 1df4ad0..b22d182 100644 --- a/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s +++ b/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s @@ -95,7 +95,10 @@ ihevc_intra_pred_chroma_mode_27_to_33_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d9,d10,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! stp x19, x20,[sp,#-16]! adrp x6, :got:gai4_ihevc_ang_table //loads word32 gai4_ihevc_ang_table[35] @@ -151,7 +154,7 @@ prologue: add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx] asr x14,x14,#8 //(ii)shift by 8 - ld1 {v8.8b},[x10],x11 //(i row)ref_main_idx + ld1 {v23.8b},[x10],x11 //(i row)ref_main_idx and x9,x14,#0xff //(ii)get the last byte asr x14,x14,#8 //(iii) @@ -163,7 +166,7 @@ prologue: add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v13.8b},[x12] //(ii)ref_main_idx_1 umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract) @@ -202,7 +205,7 @@ prologue: dup v29.8b, v4.8b[5] //(vi) add x10,x8,x9 //(v)*pu1_ref[ref_main_idx] - ld1 {v8.8b},[x10],x11 //(v)ref_main_idx + ld1 {v23.8b},[x10],x11 //(v)ref_main_idx sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract) asr x14,x14,#8 //(vi) @@ -224,7 +227,7 @@ prologue: add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v13.8b},[x12] //(vi)ref_main_idx_1 umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract) @@ -281,7 +284,7 @@ kernel_8_rows: dup v31.8b, v4.8b[0] subs x4,x4,#8 - ld1 {v8.8b},[x10],x11 //(i)ref_main_idx + ld1 {v23.8b},[x10],x11 //(i)ref_main_idx sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract) and x9,x14,#0xff //(ii) add x20,x6,#8 //increment the row value @@ -304,7 +307,7 @@ kernel_8_rows: add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract) asr x14,x14,#8 //(iv) ld1 {v13.8b},[x12] //(ii)ref_main_idx_1 @@ -362,7 +365,7 
@@ kernel_8_rows: rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5) add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx] - ld1 {v8.8b},[x10],x11 //(v)ref_main_idx + ld1 {v23.8b},[x10],x11 //(v)ref_main_idx and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31)) asr x14,x14,#8 //(vii) @@ -379,7 +382,7 @@ kernel_8_rows: add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx] ld1 {v13.8b},[x12] //(vi)ref_main_idx_1 - umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) and x9,x14,#0xff //(viii) smov x14, v3.2s[0] //(i)extract idx to the r register @@ -479,7 +482,7 @@ core_loop_4: dup v7.8b,w4 //dup_const_32_fract umlal v4.8h, v3.8b, v0.8b //vmull_u8(ref_main_idx_1, dup_const_fract) - ld1 {v8.8b},[x10] //ref_main_idx + ld1 {v23.8b},[x10] //ref_main_idx add x8,x8,#1 ld1 {v9.8b},[x11] //ref_main_idx_1 @@ -495,7 +498,7 @@ core_loop_4: add x11,x10,#2 //pu1_ref_main_idx_1 += 1 dup v12.8b,w5 //dup_const_fract - umull v10.8h, v8.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract) sub x20,x5,#32 neg x4, x20 @@ -543,7 +546,9 @@ core_loop_4: end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d9,d10,[sp],#16 ret diff --git a/common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s b/common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s index 3c8746c..bf026a3 100644 --- a/common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s +++ b/common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s @@ -104,7 +104,10 @@ ihevc_intra_pred_chroma_mode_3_to_9_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d13,d14,[sp,#-16]! + stp d8,d15,[sp,#-16]! // Storing d15 using { sub sp,sp,#8; str d15,[sp] } is giving bus error. + // d8 is used as dummy register and stored along with d15 using stp. d8 is not used in the function. stp x19, x20,[sp,#-16]! 
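 // On the bus-error remark above: the error itself is the commit's own finding;
 // the likely mechanism (an inference, not stated in the patch) is AArch64's
 // stack-alignment check, which faults any sp-based access while sp is not
 // 16-byte aligned. A lone 8-byte push therefore traps:
 //     sub sp, sp, #8
 //     str d15, [sp]               // sp only 8-byte aligned here -> alignment fault
 // Pairing the live register with the unused d8 keeps sp 16-byte aligned:
    stp d8, d15, [sp, #-16]!        // d8 is a dummy; only d15 needs preserving
    ldp d8, d15, [sp], #16          // matching pop in the epilogue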
adrp x7, :got:gai4_ihevc_ang_table @@ -157,8 +160,8 @@ prologue_8_16_32: movi v28.8b, #32 - sqxtn v8.8b, v22.8h - shl v8.8b, v8.8b,#1 // 2 * idx + sqxtn v2.8b, v22.8h + shl v2.8b, v2.8b,#1 // 2 * idx and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0 movi v29.8b, #2 //contains #2 for adding to get ref_main_idx + 1 @@ -167,58 +170,58 @@ prologue_8_16_32: dup v27.4h,w0 mov x0,#0 - movi v9.8b, #22 //row 0 to 7 + movi v3.8b, #22 //row 0 to 7 - sub v8.8b, v8.8b , v27.8b //ref_main_idx (sub row) - sub v8.8b, v26.8b , v8.8b //ref_main_idx (row 0) - add v8.8b, v8.8b , v9.8b //to compensate the pu1_src idx incremented by 8 - sub v9.8b, v8.8b , v29.8b //ref_main_idx + 1 (row 0) - tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0) + sub v2.8b, v2.8b , v27.8b //ref_main_idx (sub row) + sub v2.8b, v26.8b , v2.8b //ref_main_idx (row 0) + add v2.8b, v2.8b , v3.8b //to compensate the pu1_src idx incremented by 8 + sub v3.8b, v2.8b , v29.8b //ref_main_idx + 1 (row 0) + tbl v25.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 0) sub v7.8b, v28.8b , v6.8b //32-fract - tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0) - sub v4.8b, v8.8b , v29.8b //ref_main_idx (row 1) - sub v5.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 1) + tbl v13.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 0) + sub v4.8b, v2.8b , v29.8b //ref_main_idx (row 1) + sub v5.8b, v3.8b , v29.8b //ref_main_idx + 1 (row 1) movi v29.8b, #4 tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1) - umull v24.8h, v12.8b, v7.8b //mul (row 0) + umull v24.8h, v25.8b, v7.8b //mul (row 0) umlal v24.8h, v13.8b, v6.8b //mul (row 0) tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1) - sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 2) - sub v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 2) + sub v2.8b, v2.8b , v29.8b //ref_main_idx (row 2) + sub v3.8b, v3.8b , v29.8b //ref_main_idx + 1 (row 2) rshrn v24.8b, v24.8h,#5 //round shft (row 0) - tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2) + tbl v14.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 2) umull v22.8h, v16.8b, v7.8b //mul (row 1) umlal v22.8h, v17.8b, v6.8b //mul (row 1) - tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2) + tbl v15.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 2) sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 3) sub v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 3) st1 {v24.8b},[x2], x3 //st (row 0) rshrn v22.8b, v22.8h,#5 //round shft (row 1) - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3) + tbl v19.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3) umull v20.8h, v14.8b, v7.8b //mul (row 2) umlal v20.8h, v15.8b, v6.8b //mul (row 2) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3) - sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 4) - sub v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 4) + tbl v23.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3) + sub v2.8b, v2.8b , v29.8b //ref_main_idx (row 4) + sub v3.8b, v3.8b , v29.8b //ref_main_idx + 1 (row 4) st1 {v22.8b},[x2], x3 //st (row 1) rshrn v20.8b, v20.8h,#5 //round shft (row 2) - tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4) - umull v18.8h, v10.8b, v7.8b //mul (row 3) - umlal v18.8h, v11.8b, v6.8b //mul (row 3) + tbl v25.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 4) + umull v18.8h, v19.8b, v7.8b //mul (row 3) + umlal v18.8h, v23.8b, 
v6.8b //mul (row 3) - tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4) + tbl v13.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 4) sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 5) sub v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 5) @@ -226,36 +229,36 @@ prologue_8_16_32: rshrn v18.8b, v18.8h,#5 //round shft (row 3) tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5) - umull v24.8h, v12.8b, v7.8b //mul (row 4) + umull v24.8h, v25.8b, v7.8b //mul (row 4) umlal v24.8h, v13.8b, v6.8b //mul (row 4) tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5) - sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 6) - sub v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 6) + sub v2.8b, v2.8b , v29.8b //ref_main_idx (row 6) + sub v3.8b, v3.8b , v29.8b //ref_main_idx + 1 (row 6) st1 {v18.8b},[x2], x3 //st (row 3) cmp x4,#4 beq end_func rshrn v24.8b, v24.8h,#5 //round shft (row 4) - tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6) + tbl v14.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 6) umull v22.8h, v16.8b, v7.8b //mul (row 5) umlal v22.8h, v17.8b, v6.8b //mul (row 5) - tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6) + tbl v15.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 6) sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 7) sub v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 7) st1 {v24.8b},[x2], x3 //st (row 4) rshrn v22.8b, v22.8h,#5 //round shft (row 5) - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) + tbl v19.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) umull v20.8h, v14.8b, v7.8b //mul (row 6) umlal v20.8h, v15.8b, v6.8b //mul (row 6) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + tbl v23.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) + umull v18.8h, v19.8b, v7.8b //mul (row 7) + umlal v18.8h, v23.8b, v6.8b //mul (row 7) st1 {v22.8b},[x2], x3 //st (row 5) rshrn v20.8b, v20.8h,#5 //round shft (row 6) @@ -289,11 +292,11 @@ lbl284: csel x0, x20, x0,le ld1 {v31.8b},[x14],#8 - smull v12.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col) - xtn v10.8b, v12.8h - sshr v12.8h, v12.8h,#5 - sqxtn v11.8b, v12.8h - shl v11.8b, v11.8b,#1 + smull v25.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col) + xtn v19.8b, v25.8h + sshr v25.8h, v25.8h,#5 + sqxtn v23.8b, v25.8h + shl v23.8b, v23.8b,#1 mov x5, #0x302 //idx value for v is +1 of u dup v27.4h,w5 //row value inc or reset accordingly ldr w9, [x8] //loads index value @@ -305,25 +308,25 @@ lbl284: dup v26.8b,w9 mov x5,x2 - sub v11.8b, v11.8b , v27.8b //ref_main_idx (sub row) + sub v23.8b, v23.8b , v27.8b //ref_main_idx (sub row) kernel_8_16_32: movi v29.8b, #2 //contains #2 for adding to get ref_main_idx + 1 - sub v8.8b, v26.8b , v11.8b //ref_main_idx - mov v26.8b, v10.8b + sub v2.8b, v26.8b , v23.8b //ref_main_idx + mov v26.8b, v19.8b subs x11, x11, #8 sub x6, x1, x9 - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) - add v8.8b, v8.8b , v16.8b //to compensate the pu1_src idx incremented by 8 + tbl v19.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) + add v2.8b, v2.8b , v16.8b //to compensate the pu1_src idx incremented by 8 umull v20.8h, v14.8b, v7.8b //mul (row 6) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx - 1 (row 7) + tbl v23.8b, { v0.16b, v1.16b}, v5.8b //load from 
ref_main_idx - 1 (row 7) umlal v20.8h, v15.8b, v6.8b //mul (row 6) add x20, x0, #8 csel x0, x20, x0,le - sub v9.8b, v8.8b , v29.8b //ref_main_idx - 2 + sub v3.8b, v2.8b , v29.8b //ref_main_idx - 2 add x20, x8, #4 csel x8, x20, x8,gt @@ -339,15 +342,15 @@ lbl326: mov x9,#0x302 dup v27.4h,w9 //row value inc or reset accordingly - sub v4.8b, v8.8b , v29.8b //ref_main_idx (row 1) + sub v4.8b, v2.8b , v29.8b //ref_main_idx (row 1) - sub v5.8b, v9.8b , v29.8b //ref_main_idx - 1 (row 1) - tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0) + sub v5.8b, v3.8b , v29.8b //ref_main_idx - 1 (row 1) + tbl v25.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 0) movi v29.8b, #31 //contains #2 for adding to get ref_main_idx + 1 - umull v18.8h, v10.8b, v7.8b //mul (row 7) - tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + umull v18.8h, v19.8b, v7.8b //mul (row 7) + tbl v13.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 0) + umlal v18.8h, v23.8b, v6.8b //mul (row 7) ld1 {v31.8b},[x14],#8 and v6.8b, v29.8b , v26.8b //fract values in d1/ idx values in d0 @@ -361,14 +364,14 @@ lbl326: st1 {v22.8b},[x5], x3 //(from previous loop)st (row 5) rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6) - sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 2) - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1) - sub v9.8b, v9.8b , v29.8b //ref_main_idx - 1 (row 2) + sub v2.8b, v2.8b , v29.8b //ref_main_idx (row 2) + tbl v19.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1) + sub v3.8b, v3.8b , v29.8b //ref_main_idx - 1 (row 2) lsl x9, x9, #1 sub v7.8b, v28.8b , v6.8b //32-fract - umull v24.8h, v12.8b, v7.8b //mul (row 0) + umull v24.8h, v25.8b, v7.8b //mul (row 0) tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1) umlal v24.8h, v13.8b, v6.8b //mul (row 0) @@ -376,22 +379,22 @@ lbl326: rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7) sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 3) - tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2) + tbl v14.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 2) sub v5.8b, v5.8b , v29.8b //ref_main_idx - 1 (row 3) - umull v22.8h, v10.8b, v7.8b //mul (row 1) - tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2) + umull v22.8h, v19.8b, v7.8b //mul (row 1) + tbl v15.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 2) umlal v22.8h, v17.8b, v6.8b //mul (row 1) rshrn v24.8b, v24.8h,#5 //round shft (row 0) st1 {v18.8b},[x5], x3 //(from previous loop)st (row 7) - sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 4) - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3) - sub v9.8b, v9.8b , v29.8b //ref_main_idx - 1 (row 4) + sub v2.8b, v2.8b , v29.8b //ref_main_idx (row 4) + tbl v19.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3) + sub v3.8b, v3.8b , v29.8b //ref_main_idx - 1 (row 4) umull v20.8h, v14.8b, v7.8b //mul (row 2) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3) + tbl v23.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3) umlal v20.8h, v15.8b, v6.8b //mul (row 2) add x5,x2,x3,lsl#2 @@ -402,26 +405,26 @@ lbl326: rshrn v22.8b, v22.8h,#5 //round shft (row 1) sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 5) - tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4) + tbl v25.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 4) sub v5.8b, v5.8b , 
v29.8b //ref_main_idx - 1 (row 5) - umull v18.8h, v10.8b, v7.8b //mul (row 3) - tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4) - umlal v18.8h, v11.8b, v6.8b //mul (row 3) + umull v18.8h, v19.8b, v7.8b //mul (row 3) + tbl v13.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 4) + umlal v18.8h, v23.8b, v6.8b //mul (row 3) st1 {v22.8b},[x2], x3 //st (row 1) rshrn v20.8b, v20.8h,#5 //round shft (row 2) - xtn v10.8b, v14.8h + xtn v19.8b, v14.8h sshr v14.8h, v14.8h,#5 - sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 6) + sub v2.8b, v2.8b , v29.8b //ref_main_idx (row 6) tbl v21.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5) - sub v9.8b, v9.8b , v29.8b //ref_main_idx - 1 (row 6) + sub v3.8b, v3.8b , v29.8b //ref_main_idx - 1 (row 6) - umull v24.8h, v12.8b, v7.8b //mul (row 4) + umull v24.8h, v25.8b, v7.8b //mul (row 4) tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5) - sqxtn v11.8b, v14.8h + sqxtn v23.8b, v14.8h st1 {v20.8b},[x2], x3 //st (row 2) umlal v24.8h, v13.8b, v6.8b //mul (row 4) @@ -430,15 +433,15 @@ lbl326: dup v26.8b,w9 sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 7) - tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6) + tbl v14.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 6) sub v5.8b, v5.8b , v29.8b //ref_main_idx - 1 (row 7) mov x6, #22 //to compensate the 2*row value - shl v11.8b, v11.8b,#1 + shl v23.8b, v23.8b,#1 sub x6, x6, x0, lsl #1 umull v22.8h, v21.8b, v7.8b //mul (row 5) - tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6) + tbl v15.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 6) umlal v22.8h, v17.8b, v6.8b //mul (row 5) st1 {v18.8b},[x2], x3 //st (row 3) @@ -451,7 +454,7 @@ lbl326: sub x20, x2, x4 csel x2, x20, x2,le - sub v11.8b, v11.8b , v27.8b //ref_main_idx (add row) + sub v23.8b, v23.8b , v27.8b //ref_main_idx (add row) sub x20,x2,#8 csel x2, x20, x2,le @@ -460,17 +463,17 @@ lbl326: bne kernel_8_16_32 epil_8_16_32: - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) + tbl v19.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) umull v20.8h, v14.8b, v7.8b //mul (row 6) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) + tbl v23.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) umlal v20.8h, v15.8b, v6.8b //mul (row 6) st1 {v24.8b},[x5], x3 //st (row 4) rshrn v24.8b, v22.8h,#5 //round shft (row 5) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + umull v18.8h, v19.8b, v7.8b //mul (row 7) + umlal v18.8h, v23.8b, v6.8b //mul (row 7) st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5) rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6) @@ -481,9 +484,11 @@ epil_8_16_32: st1 {v18.8b},[x5], x3 //st (row 7) end_func: - // ldmfd sp!,{x4-x12,x15} //reload the registers from sp + // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d8,d15,[sp],#16 // Loading d15 using { ldr d15,[sp]; add sp,sp,#8 } is giving bus error. + // d8 is used as dummy register and loaded along with d15 using ldp. d8 is not used in the function. 
+ ldp d13,d14,[sp],#16 ret diff --git a/common/arm64/ihevc_intra_pred_chroma_planar.s b/common/arm64/ihevc_intra_pred_chroma_planar.s index ac6b362..65c4c56 100644 --- a/common/arm64/ihevc_intra_pred_chroma_planar.s +++ b/common/arm64/ihevc_intra_pred_chroma_planar.s @@ -106,7 +106,11 @@ ihevc_intra_pred_chroma_planar_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d8,d14,[sp,#-16]! // Storing d14 using { sub sp,sp,#8; str d14,[sp] } is giving bus error. + // d8 is used as dummy register and stored along with d14 using stp. d8 is not used in the function. stp x19, x20,[sp,#-16]! adrp x11, :got:gau1_ihevc_planar_factor //loads table of coeffs @@ -165,13 +169,13 @@ ihevc_intra_pred_chroma_planar_av8: mov x10,x6 tf_sz_8_16: ld1 {v10.8b, v11.8b}, [x14],#16 //load src[2nt+1+col] - ld1 {v8.8b},[x12],#8 - mov v9.8b, v8.8b - zip1 v29.8b, v8.8b, v9.8b - zip2 v9.8b, v8.8b, v9.8b - mov v8.d[0], v29.d[0] - sub v30.8b, v2.8b , v8.8b //[nt-1-col] - sub v31.8b, v2.8b , v9.8b + ld1 {v17.8b},[x12],#8 + mov v25.8b, v17.8b + zip1 v29.8b, v17.8b, v25.8b + zip2 v25.8b, v17.8b, v25.8b + mov v17.d[0], v29.d[0] + sub v30.8b, v2.8b , v17.8b //[nt-1-col] + sub v31.8b, v2.8b , v25.8b @@ -185,7 +189,7 @@ loop_sz_8_16: sxtw x11,w11 umlal v12.8h, v6.8b, v10.8b //(nt-1-row) * src[2nt+1+col] dup v4.4h,w7 //src[2nt-1-row] - umlal v12.8h, v8.8b, v1.8b //(col+1) * src[3nt+1] + umlal v12.8h, v17.8b, v1.8b //(col+1) * src[3nt+1] dup v3.4h,w11 //src[2nt-1-row] umlal v12.8h, v30.8b, v4.8b //(nt-1-col) * src[2nt-1-row] @@ -200,14 +204,14 @@ loop_sz_8_16: umlal v28.8h, v31.8b, v4.8b sub v19.8b, v6.8b , v7.8b //[nt-1-row]-- - umlal v28.8h, v9.8b, v1.8b + umlal v28.8h, v25.8b, v1.8b dup v4.4h,w7 //src[2nt-1-row] umull v26.8h, v18.8b, v0.8b //(row+1) * src[nt-1] add v12.8h, v12.8h , v16.8h //add (nt) umlal v26.8h, v19.8b, v10.8b //(nt-1-row) * src[2nt+1+col] sshl v12.8h, v12.8h, v14.8h //shr - umlal v26.8h, v8.8b, v1.8b //(col+1) * src[3nt+1] + umlal v26.8h, v17.8b, v1.8b //(col+1) * src[3nt+1] add v28.8h, v28.8h , v16.8h umlal v26.8h, v30.8b, v3.8b //(nt-1-col) * src[2nt-1-row] sshl v28.8h, v28.8h, v14.8h @@ -220,7 +224,7 @@ loop_sz_8_16: add v5.8b, v18.8b , v7.8b //row++ [(row+1)++] umlal v24.8h, v19.8b, v11.8b sub v6.8b, v19.8b , v7.8b //[nt-1-row]-- - umlal v24.8h, v9.8b, v1.8b + umlal v24.8h, v25.8b, v1.8b xtn v12.8b, v12.8h umlal v24.8h, v31.8b, v3.8b xtn v13.8b, v28.8h @@ -233,7 +237,7 @@ loop_sz_8_16: sshl v26.8h, v26.8h, v14.8h //shr umlal v22.8h, v6.8b, v10.8b //(nt-1-row) * src[2nt+1+col] st1 {v12.2s, v13.2s}, [x2], x3 - umlal v22.8h, v8.8b, v1.8b //(col+1) * src[3nt+1] + umlal v22.8h, v17.8b, v1.8b //(col+1) * src[3nt+1] add v24.8h, v24.8h , v16.8h umlal v22.8h, v30.8b, v4.8b //(nt-1-col) * src[2nt-1-row] sshl v24.8h, v24.8h, v14.8h @@ -246,7 +250,7 @@ loop_sz_8_16: ldr w11, [x6], #-2 //src[2nt-1-row] (dec to take into account row) sxtw x11,w11 - umlal v20.8h, v9.8b, v1.8b + umlal v20.8h, v25.8b, v1.8b dup v3.4h,w11 //src[2nt-1-row] add v22.8h, v22.8h , v16.8h //add (nt) @@ -255,7 +259,7 @@ loop_sz_8_16: umlal v12.8h, v19.8b, v10.8b //(nt-1-row) * src[2nt+1+col] xtn v27.8b, v24.8h - umlal v12.8h, v8.8b, v1.8b //(col+1) * src[3nt+1] + umlal v12.8h, v17.8b, v1.8b //(col+1) * src[3nt+1] sshl v22.8h, v22.8h, v14.8h //shr umlal v12.8h, v30.8b, v3.8b //(nt-1-col) * src[2nt-1-row] @@ -268,7 +272,7 @@ loop_sz_8_16: add v5.8b, v18.8b , v7.8b //row++ [(row+1)++] sub v6.8b, v19.8b , v7.8b //[nt-1-row]-- - umlal v28.8h, v9.8b, v1.8b + 
umlal v28.8h, v25.8b, v1.8b umlal v28.8h, v31.8b, v3.8b sshl v20.8h, v20.8h, v14.8h @@ -319,13 +323,13 @@ loop_sz_8_16: add x2,x2,#16 ld1 {v10.8b, v11.8b}, [x14],#16 //load src[2nt+1+col] - ld1 {v8.8b},[x12],#8 - mov v9.8b, v8.8b - zip1 v29.8b, v8.8b, v9.8b - zip2 v9.8b, v8.8b, v9.8b - mov v8.d[0], v29.d[0] - sub v30.8b, v2.8b , v8.8b //[nt-1-col] - sub v31.8b, v2.8b , v9.8b + ld1 {v17.8b},[x12],#8 + mov v25.8b, v17.8b + zip1 v29.8b, v17.8b, v25.8b + zip2 v25.8b, v17.8b, v25.8b + mov v17.d[0], v29.d[0] + sub v30.8b, v2.8b , v17.8b //[nt-1-col] + sub v31.8b, v2.8b , v25.8b beq loop_sz_8_16 @@ -333,23 +337,23 @@ loop_sz_8_16: tf_sz_4: ld1 {v10.8b},[x14] //load src[2nt+1+col] - ld1 {v8.8b},[x12], x10 //load 8 coeffs [col+1] - mov v9.8b, v8.8b - zip1 v29.8b, v8.8b, v9.8b - zip2 v9.8b, v8.8b, v9.8b - mov v8.d[0], v29.d[0] + ld1 {v17.8b},[x12], x10 //load 8 coeffs [col+1] + mov v25.8b, v17.8b + zip1 v29.8b, v17.8b, v25.8b + zip2 v25.8b, v17.8b, v25.8b + mov v17.d[0], v29.d[0] loop_sz_4: //mov x10, #4 @reduce inc to #4 for 4x4 ldr w7, [x6], #-2 //src[2nt-1-row] (dec to take into account row) sxtw x7,w7 dup v4.4h,w7 //src[2nt-1-row] - sub v9.8b, v2.8b , v8.8b //[nt-1-col] + sub v25.8b, v2.8b , v17.8b //[nt-1-col] umull v12.8h, v5.8b, v0.8b //(row+1) * src[nt-1] umlal v12.8h, v6.8b, v10.8b //(nt-1-row) * src[2nt+1+col] - umlal v12.8h, v8.8b, v1.8b //(col+1) * src[3nt+1] - umlal v12.8h, v9.8b, v4.8b //(nt-1-col) * src[2nt-1-row] + umlal v12.8h, v17.8b, v1.8b //(col+1) * src[3nt+1] + umlal v12.8h, v25.8b, v4.8b //(nt-1-col) * src[2nt-1-row] // vadd.i16 q6, q6, q8 @add (nt) // vshl.s16 q6, q6, q7 @shr // vmovn.i16 d12, q6 @@ -364,9 +368,12 @@ loop_sz_4: bne loop_sz_4 end_loop: - // ldmfd sp!,{x4-x12,x15} //reload the registers from sp + // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d8,d14,[sp],#16 // Loading d14 using { ldr d14,[sp]; add sp,sp,#8 } is giving bus error. + // d8 is used as dummy register and loaded along with d14 using ldp. d8 is not used in the function. + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 ret diff --git a/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s b/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s index e9f83ff..5d65e63 100644 --- a/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s +++ b/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s @@ -105,7 +105,9 @@ ihevc_intra_pred_chroma_mode_11_to_17_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! stp x19, x20,[sp,#-16]! 
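 // When several pairs are pushed, the epilogue must pop them in reverse (LIFO)
 // order, as the restores later in this file do. A sketch of that discipline,
 // mirroring the stp sequence in the prologue above:
    stp d12, d13, [sp, #-16]!       // pushed first...
    stp d14, d15, [sp, #-16]!       // ...pushed second
    stp x19, x20, [sp, #-16]!       // ...pushed last
    ldp x19, x20, [sp], #16         // so it is popped first
    ldp d14, d15, [sp], #16
    ldp d12, d13, [sp], #16         // and the first push is popped last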
adrp x7, :got:gai4_ihevc_ang_table @@ -279,8 +281,8 @@ prologue_8_16_32: // mov x0, #32 movi v28.8b, #32 - sqxtn v8.8b, v22.8h - shl v8.8b, v8.8b,#1 // 2 * idx + sqxtn v19.8b, v22.8h + shl v19.8b, v19.8b,#1 // 2 * idx and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0 @@ -292,15 +294,15 @@ prologue_8_16_32: add v27.8b, v27.8b , v29.8b mov x0,#0 - add v8.8b, v8.8b , v27.8b //ref_main_idx (add row) - sub v8.8b, v8.8b , v26.8b //ref_main_idx (row 0) - add v9.8b, v8.8b , v29.8b //ref_main_idx + 1 (row 0) - tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0) + add v19.8b, v19.8b , v27.8b //ref_main_idx (add row) + sub v19.8b, v19.8b , v26.8b //ref_main_idx (row 0) + add v21.8b, v19.8b , v29.8b //ref_main_idx + 1 (row 0) + tbl v12.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 0) sub v7.8b, v28.8b , v6.8b //32-fract - tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0) - add v4.8b, v8.8b , v29.8b //ref_main_idx (row 1) - add v5.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 1) + tbl v13.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 0) + add v4.8b, v19.8b , v29.8b //ref_main_idx (row 1) + add v5.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 1) // mov x0, #4 @ 2 *(row * 2 ) movi v29.8b, #4 @@ -310,38 +312,38 @@ prologue_8_16_32: umlal v24.8h, v13.8b, v6.8b //mul (row 0) tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1) - add v8.8b, v8.8b , v29.8b //ref_main_idx (row 2) - add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 2) + add v19.8b, v19.8b , v29.8b //ref_main_idx (row 2) + add v21.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 2) rshrn v24.8b, v24.8h,#5 //round shft (row 0) - tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2) + tbl v14.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 2) umull v22.8h, v16.8b, v7.8b //mul (row 1) umlal v22.8h, v17.8b, v6.8b //mul (row 1) - tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2) + tbl v15.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 2) add v4.8b, v4.8b , v29.8b //ref_main_idx (row 3) add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 3) st1 {v24.8b},[x2], x3 //st (row 0) rshrn v22.8b, v22.8h,#5 //round shft (row 1) - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3) + tbl v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3) umull v20.8h, v14.8b, v7.8b //mul (row 2) umlal v20.8h, v15.8b, v6.8b //mul (row 2) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3) - add v8.8b, v8.8b , v29.8b //ref_main_idx (row 4) - add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 4) + tbl v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3) + add v19.8b, v19.8b , v29.8b //ref_main_idx (row 4) + add v21.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 4) st1 {v22.8b},[x2], x3 //st (row 1) rshrn v20.8b, v20.8h,#5 //round shft (row 2) - tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4) - umull v18.8h, v10.8b, v7.8b //mul (row 3) - umlal v18.8h, v11.8b, v6.8b //mul (row 3) + tbl v12.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 4) + umull v18.8h, v23.8b, v7.8b //mul (row 3) + umlal v18.8h, v25.8b, v6.8b //mul (row 3) - tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4) + tbl v13.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 4) add v4.8b, v4.8b , v29.8b //ref_main_idx (row 5) add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 5) @@ -353,32 +355,32 @@ prologue_8_16_32: 
umlal v24.8h, v13.8b, v6.8b //mul (row 4) tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5) - add v8.8b, v8.8b , v29.8b //ref_main_idx (row 6) - add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 6) + add v19.8b, v19.8b , v29.8b //ref_main_idx (row 6) + add v21.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 6) st1 {v18.8b},[x2], x3 //st (row 3) cmp x4,#4 beq end_func rshrn v24.8b, v24.8h,#5 //round shft (row 4) - tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6) + tbl v14.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 6) umull v22.8h, v16.8b, v7.8b //mul (row 5) umlal v22.8h, v17.8b, v6.8b //mul (row 5) - tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6) + tbl v15.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 6) add v4.8b, v4.8b , v29.8b //ref_main_idx (row 7) add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 7) st1 {v24.8b},[x2], x3 //st (row 4) rshrn v22.8b, v22.8h,#5 //round shft (row 5) - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) + tbl v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) umull v20.8h, v14.8b, v7.8b //mul (row 6) umlal v20.8h, v15.8b, v6.8b //mul (row 6) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + tbl v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + umlal v18.8h, v25.8b, v6.8b //mul (row 7) st1 {v22.8b},[x2], x3 //st (row 5) rshrn v20.8b, v20.8h,#5 //round shft (row 6) @@ -413,10 +415,10 @@ lbl400: ld1 {v31.8b},[x14],#8 smull v12.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col) - xtn v10.8b, v12.8h + xtn v23.8b, v12.8h sshr v12.8h, v12.8h,#5 - sqxtn v11.8b, v12.8h - shl v11.8b, v11.8b,#1 + sqxtn v25.8b, v12.8h + shl v25.8b, v25.8b,#1 orr x5,x0,x0, lsl#8 add x5, x5,#0x002 add x5, x5,#0x300 @@ -427,7 +429,7 @@ lbl400: add x9, x9, x0, lsl #1 // sub x9, x9, #1 dup v26.8b,w9 - add v8.8b, v27.8b , v11.8b //ref_main_idx (add row) + add v19.8b, v27.8b , v25.8b //ref_main_idx (add row) mov x5,x2 // sub x4,x4,#8 @@ -435,16 +437,16 @@ lbl400: kernel_8_16_32: movi v29.8b, #2 //contains #2 for adding to get ref_main_idx + 1 - sub v8.8b, v8.8b , v26.8b //ref_main_idx - mov v26.8b, v10.8b + sub v19.8b, v19.8b , v26.8b //ref_main_idx + mov v26.8b, v23.8b subs x11, x11, #8 add x6, x1, x9 - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) - add v9.8b, v29.8b , v8.8b //ref_main_idx + 1 + tbl v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) + add v21.8b, v29.8b , v19.8b //ref_main_idx + 1 umull v20.8h, v14.8b, v7.8b //mul (row 6) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) + tbl v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) umlal v20.8h, v15.8b, v6.8b //mul (row 6) add x20, x0, #8 @@ -468,15 +470,15 @@ kernel_8_16_32: ldr x14, [x14, #:got_lo12:col_for_intra_chroma] lbl452: - add v4.8b, v29.8b , v8.8b //ref_main_idx (row 1) - tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0) - add v5.8b, v29.8b , v9.8b //ref_main_idx + 1 (row 1) + add v4.8b, v29.8b , v19.8b //ref_main_idx (row 1) + tbl v12.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 0) + add v5.8b, v29.8b , v21.8b //ref_main_idx + 1 (row 1) movi v29.8b, #31 //contains #2 for adding to get ref_main_idx + 1 - umull v18.8h, v10.8b, v7.8b //mul (row 7) - tbl v13.8b, { v0.16b, v1.16b}, v9.8b 
//load from ref_main_idx + 1 (row 0) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + tbl v13.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 0) + umlal v18.8h, v25.8b, v6.8b //mul (row 7) ld1 {v31.8b},[x14],#8 and v6.8b, v29.8b , v26.8b //fract values in d1/ idx values in d0 @@ -486,9 +488,9 @@ lbl452: st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5) rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6) - add v8.8b, v29.8b , v8.8b //ref_main_idx (row 2) + add v19.8b, v29.8b , v19.8b //ref_main_idx (row 2) tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1) - add v9.8b, v29.8b , v9.8b //ref_main_idx + 1 (row 2) + add v21.8b, v29.8b , v21.8b //ref_main_idx + 1 (row 2) lsl x20, x4, #1 csel x11,x20,x11,le @@ -505,22 +507,22 @@ lbl452: rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7) add v4.8b, v4.8b , v29.8b //ref_main_idx (row 3) - tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2) + tbl v14.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 2) add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 3) umull v22.8h, v16.8b, v7.8b //mul (row 1) - tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2) + tbl v15.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 2) umlal v22.8h, v17.8b, v6.8b //mul (row 1) rshrn v24.8b, v24.8h,#5 //round shft (row 0) st1 {v18.8b},[x5], x3 //(from previous loop)st (row 7) - add v8.8b, v8.8b , v29.8b //ref_main_idx (row 4) - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3) - add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 4) + add v19.8b, v19.8b , v29.8b //ref_main_idx (row 4) + tbl v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3) + add v21.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 4) umull v20.8h, v14.8b, v7.8b //mul (row 2) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3) + tbl v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3) umlal v20.8h, v15.8b, v6.8b //mul (row 2) smull v14.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col) @@ -532,22 +534,22 @@ lbl452: rshrn v22.8b, v22.8h,#5 //round shft (row 1) add v4.8b, v4.8b , v29.8b //ref_main_idx (row 5) - tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4) + tbl v12.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 4) add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 5) - umull v18.8h, v10.8b, v7.8b //mul (row 3) - tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4) - umlal v18.8h, v11.8b, v6.8b //mul (row 3) + umull v18.8h, v23.8b, v7.8b //mul (row 3) + tbl v13.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 4) + umlal v18.8h, v25.8b, v6.8b //mul (row 3) st1 {v22.8b},[x2], x3 //st (row 1) rshrn v20.8b, v20.8h,#5 //round shft (row 2) - xtn v10.8b, v14.8h + xtn v23.8b, v14.8h sshr v14.8h, v14.8h,#5 - add v8.8b, v8.8b , v29.8b //ref_main_idx (row 6) + add v19.8b, v19.8b , v29.8b //ref_main_idx (row 6) tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5) - add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 6) + add v21.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 6) umull v24.8h, v12.8b, v7.8b //mul (row 4) tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5) @@ -557,19 +559,19 @@ lbl452: rshrn v18.8b, v18.8h,#5 //round shft (row 3) // sub x9, x9, #1 - sqxtn v11.8b, v14.8h + sqxtn v25.8b, v14.8h add v4.8b, v4.8b , v29.8b //ref_main_idx (row 7) - tbl v14.8b, { 
v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6) + tbl v14.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 6) add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 7) - shl v11.8b, v11.8b,#1 + shl v25.8b, v25.8b,#1 umull v22.8h, v16.8b, v7.8b //mul (row 5) - tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6) + tbl v15.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 6) umlal v22.8h, v17.8b, v6.8b //mul (row 5) - add v8.8b, v27.8b , v11.8b //ref_main_idx (add row) + add v19.8b, v27.8b , v25.8b //ref_main_idx (add row) dup v26.8b,w9 st1 {v18.8b},[x2], x3 //st (row 3) @@ -589,17 +591,17 @@ lbl452: bne kernel_8_16_32 epil_8_16_32: - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) + tbl v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) umull v20.8h, v14.8b, v7.8b //mul (row 6) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) + tbl v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) umlal v20.8h, v15.8b, v6.8b //mul (row 6) st1 {v24.8b},[x5], x3 //st (row 4) rshrn v24.8b, v22.8h,#5 //round shft (row 5) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + umlal v18.8h, v25.8b, v6.8b //mul (row 7) st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5) rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6) @@ -613,7 +615,8 @@ end_func: add sp, sp, #132 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 ret diff --git a/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s b/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s index 3af2da7..261c591 100644 --- a/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s +++ b/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s @@ -102,8 +102,11 @@ ihevc_intra_pred_chroma_mode_19_to_25_av8: - // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments + + stp d12,d13,[sp,#-16]! + stp d8,d14,[sp,#-16]! // Storing d14 using { sub sp,sp,#8; str d14,[sp] } is giving bus error. + // d8 is used as dummy register and stored along with d14 using stp. d8 is not used in the function. stp x19, x20,[sp,#-16]! 
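 // The renamed accumulators in the hunks below all feed the same 2-tap angular
 // interpolation, out[i] = (ref[idx]*(32 - fract) + ref[idx+1]*fract + 16) >> 5;
 // in the renamed form it reads (register roles taken from the hunk comments):
    umull v23.8h, v7.8b,  v30.8b    // acc  = ref_main_idx   * (32 - fract)
    umlal v23.8h, v19.8b, v31.8b    // acc += ref_main_idx_1 * fract
    rshrn v23.8b, v23.8h, #5        // rounding narrowing shift supplies the +16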
adrp x7, :got:gai4_ihevc_ang_table @@ -264,10 +267,10 @@ prologue: add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx] - ld1 {v8.8b},[x10],x11 //(i row)ref_main_idx + ld1 {v7.8b},[x10],x11 //(i row)ref_main_idx sbfx x9,x14,#8,#8 - ld1 {v9.8b},[x10] //(i row)ref_main_idx_1 + ld1 {v19.8b},[x10] //(i row)ref_main_idx_1 add x12,x8,x9 //(ii)*pu1_ref[ref_main_idx] sbfx x9,x14,#16,#8 @@ -275,10 +278,10 @@ prologue: add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v23.8h, v7.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v13.8b},[x12] //(ii)ref_main_idx_1 - umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract) + umlal v23.8h, v19.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract) dup v27.8b, v4.8b[2] //(iii) sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract) @@ -292,7 +295,7 @@ prologue: umlal v14.8h, v13.8b, v29.8b //(ii)vmull_u8(ref_main_idx_1, dup_const_fract) ld1 {v17.8b},[x10] //(iii)ref_main_idx_1 - rshrn v10.8b, v10.8h,#5 //(i row)shift_res = vrshrn_n_u16(add_res, 5) + rshrn v23.8b, v23.8h,#5 //(i row)shift_res = vrshrn_n_u16(add_res, 5) ld1 {v20.8b},[x12],x11 //(iv)ref_main_idx sub v26.8b, v1.8b , v27.8b //(iii)32-fract(dup_const_32_fract) @@ -306,20 +309,20 @@ prologue: umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract) // lsl x14,x14,#1 - st1 {v10.8b},[x2],#8 //(i row) + st1 {v23.8b},[x2],#8 //(i row) rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5) sbfx x9,x14,#0,#8 dup v29.8b, v4.8b[5] //(vi) add x10,x8,x9 //(v)*pu1_ref[ref_main_idx] - ld1 {v8.8b},[x10],x11 //(v)ref_main_idx + ld1 {v7.8b},[x10],x11 //(v)ref_main_idx sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract) umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract) sbfx x9,x14,#8,#8 - ld1 {v9.8b},[x10] //(v)ref_main_idx_1 + ld1 {v19.8b},[x10] //(v)ref_main_idx_1 umlal v22.8h, v21.8b, v25.8b //(iv)vmull_u8(ref_main_idx_1, dup_const_fract) st1 {v14.8b},[x0],x3 //(ii) @@ -333,10 +336,10 @@ prologue: add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v23.8h, v7.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v13.8b},[x12] //(vi)ref_main_idx_1 - umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract) + umlal v23.8h, v19.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract) st1 {v18.8b},[x0],x3 //(iii) rshrn v22.8b, v22.8h,#5 //(iv)shift_res = vrshrn_n_u16(add_res, 5) @@ -358,7 +361,7 @@ prologue: cmp x4,#8 // go to end if 4x4 beq end_loops - rshrn v10.8b, v10.8h,#5 //(v)shift_res = vrshrn_n_u16(add_res, 5) + rshrn v23.8b, v23.8h,#5 //(v)shift_res = vrshrn_n_u16(add_res, 5) ld1 {v20.8b},[x12],x11 //(viii)ref_main_idx sub v26.8b, v1.8b , v27.8b //(vii)32-fract(dup_const_32_fract) @@ -372,7 +375,7 @@ prologue: sub x20,x4,#8 csel x4, x20, x4,gt - st1 {v10.8b},[x0],x3 //(v) + st1 {v23.8b},[x0],x3 //(v) rshrn v14.8b, v14.8h,#5 //(vi)shift_res = vrshrn_n_u16(add_res, 5) beq epilogue @@ -393,14 +396,14 @@ kernel_8_rows: subs x4,x4,#8 sbfx x9,x14,#8,#8 - ld1 {v8.8b},[x10],x11 //(i)ref_main_idx + ld1 {v7.8b},[x10],x11 //(i)ref_main_idx sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract) add x20,x6,#8 //increment the row value csel x6, x20, x6,le add x12,x8,x9 //(ii)*pu1_ref[ref_main_idx] - ld1 {v9.8b},[x10] 
//(i)ref_main_idx_1 + ld1 {v19.8b},[x10] //(i)ref_main_idx_1 umull v22.8h, v20.8b, v24.8b //(viii)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v5.8b},[x6] //loads the row value @@ -417,10 +420,10 @@ kernel_8_rows: add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v23.8h, v7.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v13.8b},[x12] //(ii)ref_main_idx_1 - umlal v10.8h, v9.8b, v31.8b //(i)vmull_u8(ref_main_idx_1, dup_const_fract) + umlal v23.8h, v19.8b, v31.8b //(i)vmull_u8(ref_main_idx_1, dup_const_fract) sbfx x9,x14,#24,#8 csel x4, x5, x4,le //reload nt @@ -439,7 +442,7 @@ kernel_8_rows: umlal v14.8h, v13.8b, v29.8b //(ii)vmull_u8(ref_main_idx_1, dup_const_fract) ld1 {v17.8b},[x10] //(iii)ref_main_idx_1 - rshrn v10.8b, v10.8h,#5 //(i)shift_res = vrshrn_n_u16(add_res, 5) + rshrn v23.8b, v23.8h,#5 //(i)shift_res = vrshrn_n_u16(add_res, 5) dup v25.8b, v4.8b[3] //(iv) smull v2.8h, v5.8b, v0.8b //pos = ((row + 1) * intra_pred_ang) @@ -463,7 +466,7 @@ kernel_8_rows: add x10,x8,x9 //(v)*pu1_ref[ref_main_idx] sbfx x9,x14,#8,#8 - st1 {v10.8b},[x2],#8 //(i) + st1 {v23.8b},[x2],#8 //(i) sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract) dup v29.8b, v4.8b[5] //(vi) @@ -478,10 +481,10 @@ kernel_8_rows: dup v25.8b, v4.8b[7] //(viii) rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5) - ld1 {v8.8b},[x10],x11 //(v)ref_main_idx + ld1 {v7.8b},[x10],x11 //(v)ref_main_idx and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31)) - ld1 {v9.8b},[x10] //(v)ref_main_idx_1 + ld1 {v19.8b},[x10] //(v)ref_main_idx_1 shrn v3.8b, v2.8h,#5 //idx = pos >> 5 st1 {v14.8b},[x0],x3 //(ii) @@ -496,10 +499,10 @@ kernel_8_rows: shl v3.8b, v3.8b,#1 ld1 {v13.8b},[x12] //(vi)ref_main_idx_1 - umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v23.8h, v7.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) smov x14, v3.2s[0] //(i)extract idx to the r register - umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract) + umlal v23.8h, v19.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract) add x12,x8,x9 //(viii)*pu1_ref[ref_main_idx] csel x8, x1, x8,le //reload the source to pu1_src+2nt @@ -514,7 +517,7 @@ kernel_8_rows: umlal v14.8h, v13.8b, v29.8b //(vi)vmull_u8(ref_main_idx_1, dup_const_fract) ld1 {v20.8b},[x12],x11 //(viii)ref_main_idx - rshrn v10.8b, v10.8h,#5 //(v)shift_res = vrshrn_n_u16(add_res, 5) + rshrn v23.8b, v23.8h,#5 //(v)shift_res = vrshrn_n_u16(add_res, 5) ld1 {v21.8b},[x12] //(viii)ref_main_idx_1 sub v26.8b, v1.8b , v27.8b //(vii)32-fract(dup_const_32_fract) @@ -529,7 +532,7 @@ kernel_8_rows: st1 {v22.8b},[x0],x3 //(iv) umull v18.8h, v16.8b, v26.8b //(vii)vmull_u8(ref_main_idx, dup_const_32_fract) - st1 {v10.8b},[x0],x3 //(v) + st1 {v23.8b},[x0],x3 //(v) umlal v18.8h, v17.8b, v27.8b //(vii)vmull_u8(ref_main_idx_1, dup_const_fract) add x20,x2,x12 //increment the dst pointer to 8*dst_strd - nt @@ -563,9 +566,11 @@ core_loop_4: end_loops: add sp, sp, #132 - // ldmfd sp!,{x4-x12,x15} //reload the registers from sp + // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d8,d14,[sp],#16 // Loading d14 using { ldr d14,[sp]; add sp,sp,#8 } is giving bus error. + // d8 is used as dummy register and loaded along with d14 using ldp. d8 is not used in the function. 
+ ldp d12,d13,[sp],#16 ret diff --git a/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s b/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s index 1502ad6..66f4699 100644 --- a/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s +++ b/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s @@ -105,7 +105,9 @@ ihevc_intra_pred_luma_mode_11_to_17_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! stp x19, x20,[sp,#-16]! adrp x7, :got:gai4_ihevc_ang_table @@ -287,60 +289,60 @@ prologue_8_16_32: mov x0, #32 dup v28.8b,w0 - sqxtn v8.8b, v22.8h + sqxtn v19.8b, v22.8h and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0 mov x0, #1 dup v27.8b,w0 //row value inc or reset accordingly - add v8.8b, v8.8b , v27.8b //ref_main_idx (add row) - sub v8.8b, v8.8b , v26.8b //ref_main_idx (row 0) - add v9.8b, v8.8b , v2.8b //ref_main_idx + 1 (row 0) - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0) + add v19.8b, v19.8b , v27.8b //ref_main_idx (add row) + sub v19.8b, v19.8b , v26.8b //ref_main_idx (row 0) + add v21.8b, v19.8b , v2.8b //ref_main_idx + 1 (row 0) + tbl v12.8b, {v0.16b},v19.8b //load from ref_main_idx (row 0) sub v7.8b, v28.8b , v6.8b //32-fract - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0) - add v4.8b, v8.8b , v2.8b //ref_main_idx (row 1) - add v5.8b, v9.8b , v2.8b //ref_main_idx + 1 (row 1) + tbl v13.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 0) + add v4.8b, v19.8b , v2.8b //ref_main_idx (row 1) + add v5.8b, v21.8b , v2.8b //ref_main_idx + 1 (row 1) tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1) umull v24.8h, v12.8b, v7.8b //mul (row 0) umlal v24.8h, v13.8b, v6.8b //mul (row 0) tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 1) - add v8.8b, v8.8b , v3.8b //ref_main_idx (row 2) - add v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 2) + add v19.8b, v19.8b , v3.8b //ref_main_idx (row 2) + add v21.8b, v21.8b , v3.8b //ref_main_idx + 1 (row 2) rshrn v24.8b, v24.8h,#5 //round shft (row 0) - tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2) + tbl v14.8b, {v0.16b},v19.8b //load from ref_main_idx (row 2) umull v22.8h, v16.8b, v7.8b //mul (row 1) umlal v22.8h, v17.8b, v6.8b //mul (row 1) - tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2) + tbl v15.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 2) add v4.8b, v4.8b , v3.8b //ref_main_idx (row 3) add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 3) st1 {v24.8b},[x2], x3 //st (row 0) rshrn v22.8b, v22.8h,#5 //round shft (row 1) - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3) + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3) umull v20.8h, v14.8b, v7.8b //mul (row 2) umlal v20.8h, v15.8b, v6.8b //mul (row 2) - tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3) - add v8.8b, v8.8b , v3.8b //ref_main_idx (row 4) - add v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 4) + tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3) + add v19.8b, v19.8b , v3.8b //ref_main_idx (row 4) + add v21.8b, v21.8b , v3.8b //ref_main_idx + 1 (row 4) st1 {v22.8b},[x2], x3 //st (row 1) rshrn v20.8b, v20.8h,#5 //round shft (row 2) - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 4) - umull v18.8h, v10.8b, v7.8b //mul (row 3) - umlal v18.8h, v11.8b, v6.8b //mul (row 3) + tbl v12.8b, {v0.16b},v19.8b //load from ref_main_idx (row 4) + umull v18.8h, v23.8b, v7.8b //mul (row 3) + umlal v18.8h, v25.8b, v6.8b //mul 
(row 3) - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 4) + tbl v13.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 4) add v4.8b, v4.8b , v3.8b //ref_main_idx (row 5) add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 5) @@ -352,30 +354,30 @@ prologue_8_16_32: umlal v24.8h, v13.8b, v6.8b //mul (row 4) tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 5) - add v8.8b, v8.8b , v3.8b //ref_main_idx (row 6) - add v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 6) + add v19.8b, v19.8b , v3.8b //ref_main_idx (row 6) + add v21.8b, v21.8b , v3.8b //ref_main_idx + 1 (row 6) st1 {v18.8b},[x2], x3 //st (row 3) rshrn v24.8b, v24.8h,#5 //round shft (row 4) - tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 6) + tbl v14.8b, {v0.16b},v19.8b //load from ref_main_idx (row 6) umull v22.8h, v16.8b, v7.8b //mul (row 5) umlal v22.8h, v17.8b, v6.8b //mul (row 5) - tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 6) + tbl v15.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 6) add v4.8b, v4.8b , v3.8b //ref_main_idx (row 7) add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 7) st1 {v24.8b},[x2], x3 //st (row 4) rshrn v22.8b, v22.8h,#5 //round shft (row 5) - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) umull v20.8h, v14.8b, v7.8b //mul (row 6) umlal v20.8h, v15.8b, v6.8b //mul (row 6) - tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + umlal v18.8h, v25.8b, v6.8b //mul (row 7) st1 {v22.8b},[x2], x3 //st (row 5) rshrn v20.8b, v20.8h,#5 //round shft (row 6) @@ -410,31 +412,31 @@ lbl390: mov x5,x2 ld1 {v31.8b},[x14],#8 smull v12.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col) - xtn v10.8b, v12.8h + xtn v23.8b, v12.8h sshr v12.8h, v12.8h,#5 - sqxtn v11.8b, v12.8h + sqxtn v25.8b, v12.8h dup v27.8b,w0 //row value inc or reset accordingly ldr w9, [x8] sxtw x9,w9 add x9, x0, x9 sub x9, x9, #1 dup v26.8b,w9 - add v8.8b, v27.8b , v11.8b //ref_main_idx (add row) + add v19.8b, v27.8b , v25.8b //ref_main_idx (add row) sub x4,x4,#8 kernel_8_16_32: - sub v8.8b, v8.8b , v26.8b //ref_main_idx - mov v26.8b, v10.8b + sub v19.8b, v19.8b , v26.8b //ref_main_idx + mov v26.8b, v23.8b subs x11, x11, #8 add x6, x1, x9 - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) - add v9.8b, v2.8b , v8.8b //ref_main_idx + 1 + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) + add v21.8b, v2.8b , v19.8b //ref_main_idx + 1 umull v20.8h, v14.8b, v7.8b //mul (row 6) - tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7) + tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7) umlal v20.8h, v15.8b, v6.8b //mul (row 6) add x20, x0, #8 @@ -453,14 +455,14 @@ lbl429: csel x8, x12, x8,le dup v27.8b,w0 //row value inc or reset accordingly - add v4.8b, v2.8b , v8.8b //ref_main_idx (row 1) - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0) - add v5.8b, v2.8b , v9.8b //ref_main_idx + 1 (row 1) + add v4.8b, v2.8b , v19.8b //ref_main_idx (row 1) + tbl v12.8b, {v0.16b},v19.8b //load from ref_main_idx (row 0) + add v5.8b, v2.8b , v21.8b //ref_main_idx + 1 (row 1) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + tbl v13.8b, 
{v0.16b},v21.8b //load from ref_main_idx + 1 (row 0) + umlal v18.8h, v25.8b, v6.8b //mul (row 7) ld1 {v31.8b},[x14],#8 and v6.8b, v29.8b , v26.8b //fract values in d1/ idx values in d0 @@ -468,9 +470,9 @@ lbl429: st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5) rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6) - add v8.8b, v3.8b , v8.8b //ref_main_idx (row 2) + add v19.8b, v3.8b , v19.8b //ref_main_idx (row 2) tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1) - add v9.8b, v3.8b , v9.8b //ref_main_idx + 1 (row 2) + add v21.8b, v3.8b , v21.8b //ref_main_idx + 1 (row 2) add x20, x4, #8 csel x11, x20, x11,le @@ -486,22 +488,22 @@ lbl429: rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7) add v4.8b, v4.8b , v3.8b //ref_main_idx (row 3) - tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2) + tbl v14.8b, {v0.16b},v19.8b //load from ref_main_idx (row 2) add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 3) umull v22.8h, v16.8b, v7.8b //mul (row 1) - tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2) + tbl v15.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 2) umlal v22.8h, v17.8b, v6.8b //mul (row 1) rshrn v24.8b, v24.8h,#5 //round shft (row 0) st1 {v18.8b},[x5], x3 //(from previous loop)st (row 7) - add v8.8b, v8.8b , v3.8b //ref_main_idx (row 4) - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3) - add v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 4) + add v19.8b, v19.8b , v3.8b //ref_main_idx (row 4) + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3) + add v21.8b, v21.8b , v3.8b //ref_main_idx + 1 (row 4) umull v20.8h, v14.8b, v7.8b //mul (row 2) - tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3) + tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3) umlal v20.8h, v15.8b, v6.8b //mul (row 2) smull v14.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col) @@ -513,22 +515,22 @@ lbl429: rshrn v22.8b, v22.8h,#5 //round shft (row 1) add v4.8b, v4.8b , v3.8b //ref_main_idx (row 5) - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 4) + tbl v12.8b, {v0.16b},v19.8b //load from ref_main_idx (row 4) add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 5) - umull v18.8h, v10.8b, v7.8b //mul (row 3) - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 4) - umlal v18.8h, v11.8b, v6.8b //mul (row 3) + umull v18.8h, v23.8b, v7.8b //mul (row 3) + tbl v13.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 4) + umlal v18.8h, v25.8b, v6.8b //mul (row 3) st1 {v22.8b},[x2], x3 //st (row 1) rshrn v20.8b, v20.8h,#5 //round shft (row 2) - xtn v10.8b, v14.8h + xtn v23.8b, v14.8h sshr v14.8h, v14.8h,#5 - add v8.8b, v8.8b , v3.8b //ref_main_idx (row 6) + add v19.8b, v19.8b , v3.8b //ref_main_idx (row 6) tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 5) - add v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 6) + add v21.8b, v21.8b , v3.8b //ref_main_idx + 1 (row 6) umull v24.8h, v12.8b, v7.8b //mul (row 4) tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 5) @@ -538,17 +540,17 @@ lbl429: rshrn v18.8b, v18.8h,#5 //round shft (row 3) sub x9, x9, #1 - sqxtn v11.8b, v14.8h + sqxtn v25.8b, v14.8h add v4.8b, v4.8b , v3.8b //ref_main_idx (row 7) - tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 6) + tbl v14.8b, {v0.16b},v19.8b //load from ref_main_idx (row 6) add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 7) umull v22.8h, v16.8b, v7.8b //mul (row 5) - tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 6) + tbl v15.8b, {v0.16b},v21.8b //load from ref_main_idx 
+ 1 (row 6) umlal v22.8h, v17.8b, v6.8b //mul (row 5) - add v8.8b, v27.8b , v11.8b //ref_main_idx (add row) + add v19.8b, v27.8b , v25.8b //ref_main_idx (add row) dup v26.8b,w9 st1 {v18.8b},[x2], x3 //st (row 3) @@ -566,17 +568,17 @@ lbl429: bne kernel_8_16_32 epil_8_16_32: - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) umull v20.8h, v14.8b, v7.8b //mul (row 6) - tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7) + tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7) umlal v20.8h, v15.8b, v6.8b //mul (row 6) st1 {v24.8b},[x5], x3 //st (row 4) rshrn v24.8b, v22.8h,#5 //round shft (row 5) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + umlal v18.8h, v25.8b, v6.8b //mul (row 7) st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5) rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6) @@ -628,38 +630,38 @@ sz_4_proc: dup v28.8b,w1 sshr v22.8h, v22.8h,#5 - sqxtn v8.8b, v22.8h + sqxtn v19.8b, v22.8h and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0 sub v7.8b, v28.8b , v6.8b //32-fract - add v8.8b, v8.8b , v2.8b //ref_main_idx (add 1) - sub v8.8b, v8.8b , v26.8b //ref_main_idx - add v9.8b, v8.8b , v2.8b //ref_main_idx + 1 + add v19.8b, v19.8b , v2.8b //ref_main_idx (add 1) + sub v19.8b, v19.8b , v26.8b //ref_main_idx + add v21.8b, v19.8b , v2.8b //ref_main_idx + 1 - add v4.8b, v8.8b , v2.8b //row 1 ref_main_idx - add v5.8b, v9.8b , v2.8b + add v4.8b, v19.8b , v2.8b //row 1 ref_main_idx + add v5.8b, v21.8b , v2.8b - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0) - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0) + tbl v12.8b, {v0.16b},v19.8b //load from ref_main_idx (row 0) + tbl v13.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 0) umull v24.8h, v12.8b, v7.8b //mul (row 0) tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1) umlal v24.8h, v13.8b, v6.8b //mul (row 0) - add v8.8b, v8.8b , v3.8b //idx (row 2) + add v19.8b, v19.8b , v3.8b //idx (row 2) tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 1) - add v9.8b, v9.8b , v3.8b //idx+1 (row 2) + add v21.8b, v21.8b , v3.8b //idx+1 (row 2) umull v22.8h, v16.8b, v7.8b //mul (row 1) - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2) + tbl v12.8b, {v0.16b},v19.8b //load from ref_main_idx (row 2) umlal v22.8h, v17.8b, v6.8b //mul (row 1) rshrn v24.8b, v24.8h,#5 //round shift (row 0) add v4.8b, v4.8b , v3.8b //idx (row 3) - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2) + tbl v13.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 2) add v5.8b, v5.8b , v3.8b //idx+1 (row 3) umull v20.8h, v12.8b, v7.8b //mul (row 2) @@ -687,7 +689,8 @@ end_func: add sp, sp, #132 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 ret diff --git a/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s b/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s index fe7ac11..9b59d58 100644 --- a/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s +++ b/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s @@ -104,7 +104,10 @@ ihevc_intra_pred_luma_mode_19_to_25_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d9,d10,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! stp x19, x20,[sp,#-16]! 
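Under AAPCS64, only v8-v15 are callee-saved, and only their low 64 bits (d8-d15) have to survive a call; v0-v7 and v16-v31 are scratch. After the renames, this function still touches registers in the d9/d10, d12/d13 and d14/d15 pairs, so the prologue spills exactly those three pairs, and the epilogue at the end of this hunk pops them in the reverse (LIFO) order:

    stp     d9,  d10, [sp, #-16]!    // prologue: spill the pairs still in use
    stp     d12, d13, [sp, #-16]!
    stp     d14, d15, [sp, #-16]!
    stp     x19, x20, [sp, #-16]!
    // ... function body ...
    ldp     x19, x20, [sp], #16      // epilogue: pop in reverse order
    ldp     d14, d15, [sp], #16
    ldp     d12, d13, [sp], #16
    ldp     d9,  d10, [sp], #16
    ret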
adrp x7, :got:gai4_ihevc_ang_table @@ -267,7 +270,7 @@ prologue: add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx] - ld1 {v8.8b},[x10],x11 //(i row)ref_main_idx + ld1 {v23.8b},[x10],x11 //(i row)ref_main_idx sbfx x9,x14,#8,#8 ld1 {v9.8b},[x10] //(i row)ref_main_idx_1 @@ -278,7 +281,7 @@ prologue: add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v13.8b},[x12] //(ii)ref_main_idx_1 umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract) @@ -316,7 +319,7 @@ prologue: dup v29.8b, v4.8b[5] //(vi) add x10,x8,x9 //(v)*pu1_ref[ref_main_idx] - ld1 {v8.8b},[x10],x11 //(v)ref_main_idx + ld1 {v23.8b},[x10],x11 //(v)ref_main_idx sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract) umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract) @@ -336,7 +339,7 @@ prologue: add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v13.8b},[x12] //(vi)ref_main_idx_1 umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract) @@ -392,7 +395,7 @@ kernel_8_rows: subs x4,x4,#8 sbfx x9,x14,#8,#8 - ld1 {v8.8b},[x10],x11 //(i)ref_main_idx + ld1 {v23.8b},[x10],x11 //(i)ref_main_idx sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract) add x20,x6,#8 //increment the row value @@ -416,7 +419,7 @@ kernel_8_rows: add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v13.8b},[x12] //(ii)ref_main_idx_1 umlal v10.8h, v9.8b, v31.8b //(i)vmull_u8(ref_main_idx_1, dup_const_fract) @@ -477,7 +480,7 @@ kernel_8_rows: dup v25.8b, v4.8b[7] //(viii) rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5) - ld1 {v8.8b},[x10],x11 //(v)ref_main_idx + ld1 {v23.8b},[x10],x11 //(v)ref_main_idx and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31)) ld1 {v9.8b},[x10] //(v)ref_main_idx_1 @@ -493,7 +496,7 @@ kernel_8_rows: sub v30.8b, v1.8b , v31.8b //(v)32-fract(dup_const_32_fract) ld1 {v13.8b},[x12] //(vi)ref_main_idx_1 - umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) umov w14, v3.2s[0] //(i)extract idx to the r register sxtw x14,w14 @@ -592,7 +595,7 @@ core_loop_4: dup v7.8b,w4 //dup_const_32_fract umlal v4.8h, v3.8b, v0.8b //vmull_u8(ref_main_idx_1, dup_const_fract) - ld1 {v8.s}[0],[x10] //ref_main_idx + ld1 {v23.s}[0],[x10] //ref_main_idx add x8,x8,#1 ld1 {v9.s}[0],[x11] //ref_main_idx_1 @@ -607,7 +610,7 @@ core_loop_4: add x11,x10,#1 //pu1_ref_main_idx_1 += 1 dup v12.8b,w5 //dup_const_fract - umull v10.8h, v8.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract) sub x20,x5,#32 neg x4, x20 @@ -655,7 +658,9 @@ end_loops: add sp, sp, #132 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d9,d10,[sp],#16 ret diff --git a/common/arm64/ihevc_intra_pred_luma_dc.s 
b/common/arm64/ihevc_intra_pred_luma_dc.s index 7683266..e4fdb5d 100644 --- a/common/arm64/ihevc_intra_pred_luma_dc.s +++ b/common/arm64/ihevc_intra_pred_luma_dc.s @@ -104,7 +104,7 @@ ihevc_intra_pred_luma_dc_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! @@ -128,14 +128,14 @@ ihevc_intra_pred_luma_dc_av8: add x8, x7, #1 //&src[2nt+1] mvn x5, x5 add x5, x5, #1 - dup v8.2s,w5 + dup v7.2s,w5 ldrb w14, [x8] sxtw x14,w14 - shl d8, d8,#32 + shl d7, d7,#32 sub x9, x7, #1 //&src[2nt-1] - sshr d8, d8,#32 + sshr d7, d7,#32 mov x7, x8 //x7 also stores 2nt+1 @@ -192,7 +192,7 @@ core_loop_add: epil_add_loop: - sshl d9, d6, d8 //(dc_val) shr by log2nt+1 + sshl d18, d6, d7 //(dc_val) shr by log2nt+1 cmp x4, #32 mov v28.s[0], w14 @@ -200,25 +200,25 @@ epil_add_loop: mov x20,#128 csel x6, x20, x6,eq - dup v16.8b, v9.8b[0] //dc_val - shl d13, d9,#1 //2*dc + dup v16.8b, v18.8b[0] //dc_val + shl d25, d18,#1 //2*dc beq prologue_cpy_32 - add d14, d13 , d28 //src[2nt+1]+2+src[2nt-1]+2dc_val + add d27, d25 , d28 //src[2nt+1]+2+src[2nt-1]+2dc_val mov x20,#0 csel x6, x20, x6,ne //nt - ushr v15.4h, v14.4h,#2 //final dst[0]'s value in d15[0] + ushr v29.4h, v27.4h,#2 //final dst[0]'s value in d15[0] csel x10, x4, x10,ne - add d11, d13 , d9 //3*dc + add d23, d25 , d18 //3*dc sub x12, x3, x3, lsl #3 //-7*strd - add d11, d11 , d17 //3*dc + 2 + add d23, d23 , d17 //3*dc + 2 add x12, x12, #8 //offset after one 8x8 block (-7*strd + 8) - dup v24.8h, v11.4h[0] //3*dc + 2 (moved to all lanes) + dup v24.8h, v23.4h[0] //3*dc + 2 (moved to all lanes) sub x0, x3, x4 //strd - nt prologue_col: @@ -248,7 +248,7 @@ prologue_col: movi d19, #0x00000000000000ff // sqshrun v3.8b, v22.8h,#2 //rows shx2 movn (prol) - bsl v19.8b, v15.8b , v2.8b //first row with dst[0] + bsl v19.8b, v29.8b , v2.8b //first row with dst[0] add v26.8h, v26.8h , v24.8h //col 8::15 add 3dc+2 (prol extra) rev64 v3.8b, v3.8b @@ -445,23 +445,23 @@ dc_4: mov v28.s[1], w5 //src[2nt+1]+2+src[2nt-1] moved to d28 add d6, d6 , d5 //accumulate all inp into d6 (end for nt==8) - sshl d9, d6, d8 //(dc_val) shr by log2nt+1 + sshl d18, d6, d7 //(dc_val) shr by log2nt+1 mov x8, x7 //&src[2nt+1] - shl d13, d9,#1 //2*dc + shl d25, d18,#1 //2*dc sub x9, x9, #3 //&src[2nt-1-row] - dup v16.8b, v9.8b[0] //dc_val - add d14, d13 , d28 //src[2nt+1]+2+src[2nt-1]+2dc_val + dup v16.8b, v18.8b[0] //dc_val + add d27, d25 , d28 //src[2nt+1]+2+src[2nt-1]+2dc_val - ushr v15.4h, v14.4h,#2 //final dst[0]'s value in d15[0] + ushr v29.4h, v27.4h,#2 //final dst[0]'s value in d15[0] sub x12, x3, x3, lsl #2 //-3*strd - add d11, d13 , d9 //3*dc + add d23, d25 , d18 //3*dc - add d11, d11 , d17 //3*dc + 2 + add d23, d23 , d17 //3*dc + 2 add x12, x12, #4 //offset after one 4x4 block (-3*strd + 4) - dup v24.8h, v11.4h[0] //3*dc + 2 (moved to all lanes) + dup v24.8h, v23.4h[0] //3*dc + 2 (moved to all lanes) sub x0, x3, x4 //strd - nt @@ -482,7 +482,7 @@ dc_4: sqshrun v3.8b, v22.8h,#2 //rows shx2 movn (prol) - bsl v19.8b, v15.8b , v2.8b //first row with dst[0] + bsl v19.8b, v29.8b , v2.8b //first row with dst[0] rev64 v3.8b, v3.8b @@ -510,7 +510,7 @@ epilogue_end: end_func: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_intra_pred_luma_horz.s b/common/arm64/ihevc_intra_pred_luma_horz.s index 551fd77..95452e4 100644 --- a/common/arm64/ihevc_intra_pred_luma_horz.s +++ b/common/arm64/ihevc_intra_pred_luma_horz.s @@ -97,7 +97,7 @@ ihevc_intra_pred_luma_horz_av8: 
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! //ldr x5,[sp,#44] @loads mode @@ -126,7 +126,7 @@ core_loop_32: st1 { v2.16b},[x2],x3 //store in 1st row 0-16 columns st1 { v2.16b},[x9],x3 //store in 1st row 16-32 columns - dup v8.16b, v0.16b[12] + dup v1.16b, v0.16b[12] st1 { v4.16b},[x2],x3 st1 { v4.16b},[x9],x3 @@ -135,14 +135,14 @@ core_loop_32: st1 { v6.16b},[x9],x3 dup v4.16b, v0.16b[10] - st1 { v8.16b},[x2],x3 - st1 { v8.16b},[x9],x3 + st1 { v1.16b},[x2],x3 + st1 { v1.16b},[x9],x3 dup v6.16b, v0.16b[9] st1 { v2.16b},[x2],x3 st1 { v2.16b},[x9],x3 - dup v8.16b, v0.16b[8] + dup v1.16b, v0.16b[8] st1 { v4.16b},[x2],x3 st1 { v4.16b},[x9],x3 @@ -151,14 +151,14 @@ core_loop_32: st1 { v6.16b},[x9],x3 dup v4.16b, v0.8b[6] - st1 { v8.16b},[x2],x3 - st1 { v8.16b},[x9],x3 + st1 { v1.16b},[x2],x3 + st1 { v1.16b},[x9],x3 dup v6.16b, v0.8b[5] st1 { v2.16b},[x2],x3 st1 { v2.16b},[x9],x3 - dup v8.16b, v0.8b[4] + dup v1.16b, v0.8b[4] st1 { v4.16b},[x2],x3 st1 { v4.16b},[x9],x3 @@ -167,15 +167,15 @@ core_loop_32: st1 { v6.16b},[x9],x3 dup v4.16b, v0.8b[2] - st1 { v8.16b},[x2],x3 - st1 { v8.16b},[x9],x3 + st1 { v1.16b},[x2],x3 + st1 { v1.16b},[x9],x3 dup v6.16b, v0.8b[1] st1 { v2.16b},[x2],x3 st1 { v2.16b},[x9],x3 sub x12,x12,#16 //move to 16th value pointer - dup v8.16b, v0.8b[0] + dup v1.16b, v0.8b[0] st1 { v4.16b},[x2],x3 st1 { v4.16b},[x9],x3 @@ -183,12 +183,12 @@ core_loop_32: st1 { v6.16b},[x2],x3 st1 { v6.16b},[x9],x3 - st1 { v8.16b},[x2],x3 - st1 { v8.16b},[x9],x3 + st1 { v1.16b},[x2],x3 + st1 { v1.16b},[x9],x3 bgt core_loop_32 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret b end_func @@ -214,18 +214,18 @@ core_loop_16: dup v6.16b, v0.16b[12] sqadd v22.8h, v26.8h , v24.8h - dup v8.16b, v0.16b[11] + dup v1.16b, v0.16b[11] sqxtun v22.8b, v22.8h st1 {v22.8b},[x2],#8 - dup v10.16b, v0.16b[10] + dup v18.16b, v0.16b[10] usubl v24.8h, v31.8b, v28.8b - dup v12.16b, v0.16b[9] + dup v19.16b, v0.16b[9] sshr v24.8h, v24.8h,#1 - dup v14.16b, v0.16b[8] + dup v20.16b, v0.16b[8] sqadd v22.8h, v26.8h , v24.8h dup v16.16b, v0.8b[7] @@ -238,37 +238,37 @@ core_loop_16: st1 { v4.16b},[x2],x3 st1 { v6.16b},[x2],x3 - st1 { v8.16b},[x2],x3 + st1 { v1.16b},[x2],x3 dup v2.16b, v0.8b[6] - st1 { v10.16b},[x2],x3 + st1 { v18.16b},[x2],x3 dup v4.16b, v0.8b[5] - st1 { v12.16b},[x2],x3 + st1 { v19.16b},[x2],x3 dup v6.16b, v0.8b[4] - st1 { v14.16b},[x2],x3 + st1 { v20.16b},[x2],x3 - dup v8.16b, v0.8b[3] + dup v1.16b, v0.8b[3] st1 { v16.16b},[x2],x3 - dup v10.16b, v0.8b[2] + dup v18.16b, v0.8b[2] st1 { v2.16b},[x2],x3 - dup v12.16b, v0.8b[1] + dup v19.16b, v0.8b[1] st1 { v4.16b},[x2],x3 - dup v14.16b, v0.8b[0] + dup v20.16b, v0.8b[0] st1 { v6.16b},[x2],x3 - st1 { v8.16b},[x2],x3 - st1 { v10.16b},[x2],x3 - st1 { v12.16b},[x2],x3 - st1 { v14.16b},[x2],x3 + st1 { v1.16b},[x2],x3 + st1 { v18.16b},[x2],x3 + st1 { v19.16b},[x2],x3 + st1 { v20.16b},[x2],x3 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret b end_func @@ -302,19 +302,19 @@ core_loop_8: st1 {v22.8b},[x2],x3 st1 {v3.8b},[x2],x3 - dup v8.8b, v0.8b[1] + dup v1.8b, v0.8b[1] st1 {v4.8b},[x2],x3 st1 {v5.8b},[x2],x3 - dup v9.8b, v0.8b[0] + dup v17.8b, v0.8b[0] st1 {v6.8b},[x2],x3 st1 {v7.8b},[x2],x3 - st1 {v8.8b},[x2],x3 - st1 {v9.8b},[x2],x3 + st1 {v1.8b},[x2],x3 + st1 {v17.8b},[x2],x3 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret b end_func @@ -349,7 +349,7 @@ 
core_loop_4: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret end_func: diff --git a/common/arm64/ihevc_intra_pred_luma_mode2.s b/common/arm64/ihevc_intra_pred_luma_mode2.s index 5d7a3c5..598ce5a 100644 --- a/common/arm64/ihevc_intra_pred_luma_mode2.s +++ b/common/arm64/ihevc_intra_pred_luma_mode2.s @@ -105,7 +105,7 @@ ihevc_intra_pred_luma_mode2_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! mov x8,#-2 @@ -138,20 +138,20 @@ prologue_cpy_32: ld1 {v7.8b},[x10],x8 add x7,x6,x3 - rev64 v8.8b, v0.8b - rev64 v9.8b, v1.8b + rev64 v16.8b, v0.8b + rev64 v17.8b, v1.8b lsl x5, x3, #2 - rev64 v10.8b, v2.8b - rev64 v11.8b, v3.8b + rev64 v18.8b, v2.8b + rev64 v19.8b, v3.8b add x9,x7,x3 - rev64 v12.8b, v4.8b + rev64 v20.8b, v4.8b subs x1,x1,#8 - rev64 v13.8b, v5.8b - rev64 v14.8b, v6.8b - rev64 v15.8b, v7.8b + rev64 v21.8b, v5.8b + rev64 v22.8b, v6.8b + rev64 v23.8b, v7.8b add x14,x9,x3 beq epilogue_mode2 @@ -160,24 +160,24 @@ prologue_cpy_32: kernel_mode2: - st1 {v8.8b},[x6],x5 - st1 {v9.8b},[x7],x5 + st1 {v16.8b},[x6],x5 + st1 {v17.8b},[x7],x5 subs x11,x11,#8 - st1 {v10.8b},[x9],x5 + st1 {v18.8b},[x9],x5 add x20,x2,#8 csel x2, x20, x2,gt - st1 {v11.8b},[x14],x5 - st1 {v12.8b},[x6],x5 + st1 {v19.8b},[x14],x5 + st1 {v20.8b},[x6],x5 csel x11, x4, x11,le - st1 {v13.8b},[x7],x5 - st1 {v14.8b},[x9],x5 + st1 {v21.8b},[x7],x5 + st1 {v22.8b},[x9],x5 add x20, x2, x3, lsl #2 csel x2, x20, x2,le - st1 {v15.8b},[x14],x5 + st1 {v23.8b},[x14],x5 ld1 {v0.8b},[x0],x8 sub x14,x4,#8 @@ -201,42 +201,42 @@ kernel_mode2: add x20, x0, x4 csel x0, x20, x0,le - rev64 v8.8b, v0.8b + rev64 v16.8b, v0.8b add x7, x6, x3 - rev64 v9.8b, v1.8b + rev64 v17.8b, v1.8b sub x20, x0, #8 csel x0, x20, x0,le - rev64 v10.8b, v2.8b + rev64 v18.8b, v2.8b csel x12, x4, x12,le - rev64 v11.8b, v3.8b + rev64 v19.8b, v3.8b add x9, x7, x3 - rev64 v12.8b, v4.8b + rev64 v20.8b, v4.8b add x10,x0,#-1 - rev64 v13.8b, v5.8b + rev64 v21.8b, v5.8b subs x1, x1, #8 - rev64 v14.8b, v6.8b + rev64 v22.8b, v6.8b add x14, x9, x3 - rev64 v15.8b, v7.8b + rev64 v23.8b, v7.8b bne kernel_mode2 epilogue_mode2: - st1 {v8.8b},[x6],x5 - st1 {v9.8b},[x7],x5 - st1 {v10.8b},[x9],x5 - st1 {v11.8b},[x14],x5 - st1 {v12.8b},[x6],x5 - st1 {v13.8b},[x7],x5 - st1 {v14.8b},[x9],x5 - st1 {v15.8b},[x14],x5 + st1 {v16.8b},[x6],x5 + st1 {v17.8b},[x7],x5 + st1 {v18.8b},[x9],x5 + st1 {v19.8b},[x14],x5 + st1 {v20.8b},[x6],x5 + st1 {v21.8b},[x7],x5 + st1 {v22.8b},[x9],x5 + st1 {v23.8b},[x14],x5 b end_func @@ -269,7 +269,7 @@ mode2_4: end_func: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s b/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s index 79964f7..58b2d37 100644 --- a/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s +++ b/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s @@ -100,7 +100,10 @@ ihevc_intra_pred_luma_mode_27_to_33_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d9,d10,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! stp x19, x20,[sp,#-16]! 
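Every angular kernel in these files evaluates the same two-tap interpolation, dst[col] = (ref[idx]*(32 - fract) + ref[idx+1]*fract + 16) >> 5, and the renames only change which vector registers carry the operands. A distilled single-row sketch using this function's (i)-slot registers (addressing simplified; the real code software-pipelines eight such rows):

    dup     v31.8b, v4.8b[0]         // fract, replicated across lanes
    sub     v30.8b, v1.8b, v31.8b    // 32 - fract   (v1 holds the constant 32)
    ld1     {v23.8b}, [x10], x11     // ref_main[idx]      (v8 before this patch)
    ld1     {v9.8b},  [x10]          // ref_main[idx + 1]
    umull   v10.8h, v23.8b, v30.8b   // ref[idx]     * (32 - fract)
    umlal   v10.8h, v9.8b,  v31.8b   // + ref[idx+1] * fract
    rshrn   v10.8b, v10.8h, #5       // (sum + 16) >> 5, narrow back to 8 bit

Note that v9 and v10 remain in the kernel, which is why d9 and d10 must still be spilled above; only v8, replaced by the caller-saved v23, drops out of the prologue.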
adrp x6, :got:gai4_ihevc_ang_table //loads word32 gai4_ihevc_ang_table[35] @@ -156,7 +159,7 @@ prologue: add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx] asr x14,x14,#8 //(ii)shift by 8 - ld1 {v8.8b},[x10],x11 //(i row)ref_main_idx + ld1 {v23.8b},[x10],x11 //(i row)ref_main_idx and x9,x14,#0xff //(ii)get the last byte asr x14,x14,#8 //(iii) @@ -168,7 +171,7 @@ prologue: add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v13.8b},[x12] //(ii)ref_main_idx_1 umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract) @@ -207,7 +210,7 @@ prologue: dup v29.8b, v4.8b[5] //(vi) add x10,x8,x9 //(v)*pu1_ref[ref_main_idx] - ld1 {v8.8b},[x10],x11 //(v)ref_main_idx + ld1 {v23.8b},[x10],x11 //(v)ref_main_idx sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract) asr x14,x14,#8 //(vi) @@ -229,7 +232,7 @@ prologue: add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) ld1 {v13.8b},[x12] //(vi)ref_main_idx_1 umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract) @@ -286,7 +289,7 @@ kernel_8_rows: dup v31.8b, v4.8b[0] subs x4,x4,#8 - ld1 {v8.8b},[x10],x11 //(i)ref_main_idx + ld1 {v23.8b},[x10],x11 //(i)ref_main_idx sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract) and x9,x14,#0xff //(ii) add x20,x6,#8 //increment the row value @@ -309,7 +312,7 @@ kernel_8_rows: add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx] ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx - umull v10.8h, v8.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract) asr x14,x14,#8 //(iv) ld1 {v13.8b},[x12] //(ii)ref_main_idx_1 @@ -368,7 +371,7 @@ kernel_8_rows: rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5) asr x14,x14,#8 //(vii) - ld1 {v8.8b},[x10],x11 //(v)ref_main_idx + ld1 {v23.8b},[x10],x11 //(v)ref_main_idx and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31)) and x9,x14,#0xff //(vii) @@ -385,7 +388,7 @@ kernel_8_rows: and x9,x14,#0xff //(viii) ld1 {v13.8b},[x12] //(vi)ref_main_idx_1 - umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) umov w14, v3.2s[0] //(i)extract idx to the r register sxtw x14,w14 @@ -484,7 +487,7 @@ core_loop_4: dup v7.8b,w4 //dup_const_32_fract umlal v4.8h, v3.8b, v0.8b //vmull_u8(ref_main_idx_1, dup_const_fract) - ld1 {v8.s}[0],[x10] //ref_main_idx + ld1 {v23.s}[0],[x10] //ref_main_idx add x8,x8,#1 ld1 {v9.s}[0],[x11] //ref_main_idx_1 @@ -500,7 +503,7 @@ core_loop_4: add x11,x10,#1 //pu1_ref_main_idx_1 += 1 dup v12.8b,w5 //dup_const_fract - umull v10.8h, v8.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract) + umull v10.8h, v23.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract) sub x20,x5,#32 neg x4, x20 @@ -548,7 +551,9 @@ core_loop_4: end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d9,d10,[sp],#16 ret diff --git a/common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s b/common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s index b6e8601..56d2f6b 100644 --- 
a/common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s +++ b/common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s @@ -106,7 +106,9 @@ ihevc_intra_pred_luma_mode_3_to_9_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! stp x19, x20,[sp,#-16]! adrp x7, :got:gai4_ihevc_ang_table @@ -165,7 +167,7 @@ prologue_8_16_32: movi v28.8b, #32 - sqxtn v8.8b, v22.8h + sqxtn v1.8b, v22.8h and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0 @@ -173,54 +175,54 @@ prologue_8_16_32: movi v27.8b, #7 //row 0 to 7 - sub v8.8b, v8.8b , v2.8b //ref_main_idx (sub row) - sub v8.8b, v26.8b , v8.8b //ref_main_idx (row 0) - add v8.8b, v8.8b , v27.8b //t0 compensate the pu1_src idx incremented by 8 - sub v9.8b, v8.8b , v2.8b //ref_main_idx + 1 (row 0) - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0) + sub v1.8b, v1.8b , v2.8b //ref_main_idx (sub row) + sub v1.8b, v26.8b , v1.8b //ref_main_idx (row 0) + add v1.8b, v1.8b , v27.8b //t0 compensate the pu1_src idx incremented by 8 + sub v19.8b, v1.8b , v2.8b //ref_main_idx + 1 (row 0) + tbl v12.8b, {v0.16b},v1.8b //load from ref_main_idx (row 0) sub v7.8b, v28.8b , v6.8b //32-fract - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0) - sub v4.8b, v8.8b , v2.8b //ref_main_idx (row 1) - sub v5.8b, v9.8b , v2.8b //ref_main_idx + 1 (row 1) + tbl v13.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 0) + sub v4.8b, v1.8b , v2.8b //ref_main_idx (row 1) + sub v5.8b, v19.8b , v2.8b //ref_main_idx + 1 (row 1) tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1) umull v24.8h, v12.8b, v7.8b //mul (row 0) umlal v24.8h, v13.8b, v6.8b //mul (row 0) tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 1) - sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 2) - sub v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 2) + sub v1.8b, v1.8b , v3.8b //ref_main_idx (row 2) + sub v19.8b, v19.8b , v3.8b //ref_main_idx + 1 (row 2) rshrn v24.8b, v24.8h,#5 //round shft (row 0) - tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2) + tbl v14.8b, {v0.16b},v1.8b //load from ref_main_idx (row 2) umull v22.8h, v16.8b, v7.8b //mul (row 1) umlal v22.8h, v17.8b, v6.8b //mul (row 1) - tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2) + tbl v15.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 2) sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 3) sub v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 3) st1 {v24.8b},[x2], x3 //st (row 0) rshrn v22.8b, v22.8h,#5 //round shft (row 1) - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3) + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3) umull v20.8h, v14.8b, v7.8b //mul (row 2) umlal v20.8h, v15.8b, v6.8b //mul (row 2) - tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3) - sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 4) - sub v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 4) + tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3) + sub v1.8b, v1.8b , v3.8b //ref_main_idx (row 4) + sub v19.8b, v19.8b , v3.8b //ref_main_idx + 1 (row 4) st1 {v22.8b},[x2], x3 //st (row 1) rshrn v20.8b, v20.8h,#5 //round shft (row 2) - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 4) - umull v18.8h, v10.8b, v7.8b //mul (row 3) - umlal v18.8h, v11.8b, v6.8b //mul (row 3) + tbl v12.8b, {v0.16b},v1.8b //load from ref_main_idx (row 4) + umull v18.8h, v23.8b, v7.8b //mul (row 3) + umlal v18.8h, v25.8b, v6.8b //mul (row 3) - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 4) + tbl 
v13.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 4) sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 5) sub v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 5) @@ -232,30 +234,30 @@ prologue_8_16_32: umlal v24.8h, v13.8b, v6.8b //mul (row 4) tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 5) - sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 6) - sub v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 6) + sub v1.8b, v1.8b , v3.8b //ref_main_idx (row 6) + sub v19.8b, v19.8b , v3.8b //ref_main_idx + 1 (row 6) st1 {v18.8b},[x2], x3 //st (row 3) rshrn v24.8b, v24.8h,#5 //round shft (row 4) - tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 6) + tbl v14.8b, {v0.16b},v1.8b //load from ref_main_idx (row 6) umull v22.8h, v16.8b, v7.8b //mul (row 5) umlal v22.8h, v17.8b, v6.8b //mul (row 5) - tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 6) + tbl v15.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 6) sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 7) sub v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 7) st1 {v24.8b},[x2], x3 //st (row 4) rshrn v22.8b, v22.8h,#5 //round shft (row 5) - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) umull v20.8h, v14.8b, v7.8b //mul (row 6) umlal v20.8h, v15.8b, v6.8b //mul (row 6) - tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + umlal v18.8h, v25.8b, v6.8b //mul (row 7) st1 {v22.8b},[x2], x3 //st (row 5) rshrn v20.8b, v20.8h,#5 //round shft (row 6) @@ -290,9 +292,9 @@ lbl284: mov x5,x2 ld1 {v31.8b},[x14],#8 smull v12.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col) - xtn v10.8b, v12.8h + xtn v23.8b, v12.8h sshr v12.8h, v12.8h,#5 - sqxtn v11.8b, v12.8h + sqxtn v25.8b, v12.8h ldr w9, [x8] sxtw x9,w9 add x9, x0, x9 @@ -304,19 +306,19 @@ lbl284: kernel_8_16_32: - sub v8.8b, v26.8b , v11.8b //ref_main_idx - mov v26.8b, v10.8b + sub v1.8b, v26.8b , v25.8b //ref_main_idx + mov v26.8b, v23.8b subs x11, x11, #8 sub x6, x1, x9 - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) - add v8.8b, v8.8b , v16.8b //to compensate the pu1_src idx incremented by 8 + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) + add v1.8b, v1.8b , v16.8b //to compensate the pu1_src idx incremented by 8 umull v20.8h, v14.8b, v7.8b //mul (row 6) - tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx - 1 (row 7) + tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx - 1 (row 7) umlal v20.8h, v15.8b, v6.8b //mul (row 6) - sub v9.8b, v8.8b , v2.8b //ref_main_idx - 1 + sub v19.8b, v1.8b , v2.8b //ref_main_idx - 1 add x20, x0, #8 csel x0, x20, x0,le add x20, x8, #4 @@ -333,14 +335,14 @@ lbl323: csel x8, x12, x8,le dup v27.8b,w0 //row value inc or reset accordingly - sub v4.8b, v8.8b , v2.8b //ref_main_idx (row 1) - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0) - sub v5.8b, v9.8b , v2.8b //ref_main_idx - 1 (row 1) + sub v4.8b, v1.8b , v2.8b //ref_main_idx (row 1) + tbl v12.8b, {v0.16b},v1.8b //load from ref_main_idx (row 0) + sub v5.8b, v19.8b , v2.8b //ref_main_idx - 1 (row 1) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + tbl v13.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 0) + umlal v18.8h, 
v25.8b, v6.8b //mul (row 7) ld1 {v31.8b},[x14],#8 and v6.8b, v29.8b , v26.8b //fract values in d1/ idx values in d0 @@ -348,9 +350,9 @@ lbl323: st1 {v22.8b},[x5], x3 //(from previous loop)st (row 5) rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6) - sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 2) - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1) - sub v9.8b, v9.8b , v3.8b //ref_main_idx - 1 (row 2) + sub v1.8b, v1.8b , v3.8b //ref_main_idx (row 2) + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1) + sub v19.8b, v19.8b , v3.8b //ref_main_idx - 1 (row 2) add x20, x4, #8 csel x11, x20, x11,le @@ -366,22 +368,22 @@ lbl323: rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7) sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 3) - tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2) + tbl v14.8b, {v0.16b},v1.8b //load from ref_main_idx (row 2) sub v5.8b, v5.8b , v3.8b //ref_main_idx - 1 (row 3) - umull v22.8h, v10.8b, v7.8b //mul (row 1) - tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2) + umull v22.8h, v23.8b, v7.8b //mul (row 1) + tbl v15.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 2) umlal v22.8h, v17.8b, v6.8b //mul (row 1) rshrn v24.8b, v24.8h,#5 //round shft (row 0) st1 {v18.8b},[x5], x3 //(from previous loop)st (row 7) - sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 4) - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3) - sub v9.8b, v9.8b , v3.8b //ref_main_idx - 1 (row 4) + sub v1.8b, v1.8b , v3.8b //ref_main_idx (row 4) + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3) + sub v19.8b, v19.8b , v3.8b //ref_main_idx - 1 (row 4) umull v20.8h, v14.8b, v7.8b //mul (row 2) - tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3) + tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3) umlal v20.8h, v15.8b, v6.8b //mul (row 2) smull v14.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col) @@ -392,22 +394,22 @@ lbl323: rshrn v22.8b, v22.8h,#5 //round shft (row 1) sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 5) - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 4) + tbl v12.8b, {v0.16b},v1.8b //load from ref_main_idx (row 4) sub v5.8b, v5.8b , v3.8b //ref_main_idx - 1 (row 5) - umull v18.8h, v10.8b, v7.8b //mul (row 3) - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 4) - umlal v18.8h, v11.8b, v6.8b //mul (row 3) + umull v18.8h, v23.8b, v7.8b //mul (row 3) + tbl v13.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 4) + umlal v18.8h, v25.8b, v6.8b //mul (row 3) st1 {v22.8b},[x2], x3 //st (row 1) rshrn v20.8b, v20.8h,#5 //round shft (row 2) - xtn v10.8b, v14.8h + xtn v23.8b, v14.8h sshr v14.8h, v14.8h,#5 - sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 6) + sub v1.8b, v1.8b , v3.8b //ref_main_idx (row 6) tbl v21.8b, {v0.16b},v4.8b //load from ref_main_idx (row 5) - sub v9.8b, v9.8b , v3.8b //ref_main_idx - 1 (row 6) + sub v19.8b, v19.8b , v3.8b //ref_main_idx - 1 (row 6) umull v24.8h, v12.8b, v7.8b //mul (row 4) tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 5) @@ -417,24 +419,24 @@ lbl323: rshrn v18.8b, v18.8h,#5 //round shft (row 3) sub x9, x9, #1 - sqxtn v11.8b, v14.8h + sqxtn v25.8b, v14.8h sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 7) - tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 6) + tbl v14.8b, {v0.16b},v1.8b //load from ref_main_idx (row 6) sub v5.8b, v5.8b , v3.8b //ref_main_idx - 1 (row 7) umull v22.8h, v21.8b, v7.8b //mul (row 5) - tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 6) + tbl v15.8b, 
{v0.16b},v19.8b //load from ref_main_idx + 1 (row 6) umlal v22.8h, v17.8b, v6.8b //mul (row 5) - add v11.8b, v27.8b , v11.8b //ref_main_idx (add row) + add v25.8b, v27.8b , v25.8b //ref_main_idx (add row) dup v26.8b,w9 st1 {v18.8b},[x2], x3 //st (row 3) rshrn v24.8b, v24.8h,#5 //round shft (row 4) add x2, x2, x3, lsl #2 - sub v11.8b, v11.8b , v2.8b //ref_main_idx -1 (sub 1) + sub v25.8b, v25.8b , v2.8b //ref_main_idx -1 (sub 1) add x20, x7, x2 csel x2, x20, x2,gt @@ -446,17 +448,17 @@ lbl323: bne kernel_8_16_32 epil_8_16_32: - tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) + tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7) umull v20.8h, v14.8b, v7.8b //mul (row 6) - tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7) + tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7) umlal v20.8h, v15.8b, v6.8b //mul (row 6) st1 {v24.8b},[x5], x3 //st (row 4) rshrn v24.8b, v22.8h,#5 //round shft (row 5) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + umlal v18.8h, v25.8b, v6.8b //mul (row 7) st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5) rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6) @@ -499,40 +501,40 @@ sz_4_proc: movi v28.8b, #32 sshr v22.8h, v22.8h,#5 - sqxtn v8.8b, v22.8h + sqxtn v1.8b, v22.8h and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0 sub v7.8b, v28.8b , v6.8b //32-fract movi v27.8b, #7 //row 0 to 7(row-1) - sub v8.8b, v8.8b , v2.8b //ref_main_idx (add 1) - sub v8.8b, v26.8b , v8.8b //ref_main_idx - add v8.8b, v8.8b , v27.8b //t0 compensate the pu1_src idx incremented by 8 - sub v9.8b, v8.8b , v2.8b //ref_main_idx - 1 + sub v1.8b, v1.8b , v2.8b //ref_main_idx (add 1) + sub v1.8b, v26.8b , v1.8b //ref_main_idx + add v1.8b, v1.8b , v27.8b //t0 compensate the pu1_src idx incremented by 8 + sub v19.8b, v1.8b , v2.8b //ref_main_idx - 1 - sub v4.8b, v8.8b , v2.8b //row 1 ref_main_idx - sub v5.8b, v9.8b , v2.8b + sub v4.8b, v1.8b , v2.8b //row 1 ref_main_idx + sub v5.8b, v19.8b , v2.8b - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0) - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0) + tbl v12.8b, {v0.16b},v1.8b //load from ref_main_idx (row 0) + tbl v13.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 0) umull v24.8h, v12.8b, v7.8b //mul (row 0) tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1) umlal v24.8h, v13.8b, v6.8b //mul (row 0) - sub v8.8b, v8.8b , v3.8b //idx (row 2) + sub v1.8b, v1.8b , v3.8b //idx (row 2) tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 1) - sub v9.8b, v9.8b , v3.8b //idx+1 (row 2) + sub v19.8b, v19.8b , v3.8b //idx+1 (row 2) umull v22.8h, v16.8b, v7.8b //mul (row 1) - tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2) + tbl v12.8b, {v0.16b},v1.8b //load from ref_main_idx (row 2) umlal v22.8h, v17.8b, v6.8b //mul (row 1) rshrn v24.8b, v24.8h,#5 //round shift (row 0) sub v4.8b, v4.8b , v3.8b //idx (row 3) - tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2) + tbl v13.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 2) sub v5.8b, v5.8b , v3.8b //idx+1 (row 3) umull v20.8h, v12.8b, v7.8b //mul (row 2) @@ -559,7 +561,8 @@ sz_4_proc: end_func: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 ret diff --git a/common/arm64/ihevc_intra_pred_luma_planar.s b/common/arm64/ihevc_intra_pred_luma_planar.s index d2f27a2..ba04f42 100644 --- 
a/common/arm64/ihevc_intra_pred_luma_planar.s +++ b/common/arm64/ihevc_intra_pred_luma_planar.s @@ -107,7 +107,7 @@ ihevc_intra_pred_luma_planar_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! adrp x11, :got:gau1_ihevc_planar_factor //loads table of coeffs @@ -116,8 +116,8 @@ ihevc_intra_pred_luma_planar_av8: clz w5,w4 sub x20, x5, #32 neg x5, x20 - dup v14.8h,w5 - neg v14.8h, v14.8h //shr value (so vneg) + dup v29.8h,w5 + neg v29.8h, v29.8h //shr value (so vneg) dup v2.8b,w4 //nt dup v16.8h,w4 //nt @@ -175,22 +175,22 @@ tf_sz_8_16_32: col_loop_8_16_32: - ld1 {v8.8b},[x12] //(1-8)load 8 coeffs [col+1] - dup v12.8h,w4 //(1) + ld1 {v17.8b},[x12] //(1-8)load 8 coeffs [col+1] + dup v27.8h,w4 //(1) ld1 {v4.8b},[x6] //(1-8)src[2nt-1-row] - sub v9.8b, v2.8b , v8.8b //(1-8)[nt-1-col] + sub v19.8b, v2.8b , v17.8b //(1-8)[nt-1-col] - umlal v12.8h, v5.8b, v0.8b //(1)(row+1) * src[nt-1] + umlal v27.8h, v5.8b, v0.8b //(1)(row+1) * src[nt-1] ld1 {v3.8b},[x14] //(1-8)load 8 src[2nt+1+col] - umlal v12.8h, v8.8b, v1.8b //(1)(col+1) * src[3nt+1] + umlal v27.8h, v17.8b, v1.8b //(1)(col+1) * src[3nt+1] dup v20.8b, v4.8b[7] //(1) - umlal v12.8h, v6.8b, v3.8b //(1)(nt-1-row) * src[2nt+1+col] + umlal v27.8h, v6.8b, v3.8b //(1)(nt-1-row) * src[2nt+1+col] dup v21.8b, v4.8b[6] //(2) - umlal v12.8h, v9.8b, v20.8b //(1)(nt-1-col) * src[2nt-1-row] + umlal v27.8h, v19.8b, v20.8b //(1)(nt-1-col) * src[2nt-1-row] dup v30.8h,w4 //(2) add v5.8b, v5.8b , v7.8b //(1) @@ -201,46 +201,46 @@ col_loop_8_16_32: umlal v30.8h, v5.8b, v0.8b //(2) dup v28.8h,w4 //(3) - umlal v30.8h, v8.8b, v1.8b //(2) + umlal v30.8h, v17.8b, v1.8b //(2) umlal v30.8h, v6.8b, v3.8b //(2) - umlal v30.8h, v9.8b, v21.8b //(2) + umlal v30.8h, v19.8b, v21.8b //(2) - sshl v12.8h, v12.8h, v14.8h //(1)shr + sshl v27.8h, v27.8h, v29.8h //(1)shr add v5.8b, v5.8b , v7.8b //(2) sub v6.8b, v6.8b , v7.8b //(2) - xtn v12.8b, v12.8h //(1) + xtn v27.8b, v27.8h //(1) umlal v28.8h, v5.8b, v0.8b //(3) dup v23.8b, v4.8b[4] //(4) - umlal v28.8h, v8.8b, v1.8b //(3) + umlal v28.8h, v17.8b, v1.8b //(3) - dup v10.8h,w4 //(4) + dup v25.8h,w4 //(4) umlal v28.8h, v6.8b, v3.8b //(3) - st1 {v12.8b},[x2], x3 //(1)str 8 values - umlal v28.8h, v9.8b, v22.8b //(3) + st1 {v27.8b},[x2], x3 //(1)str 8 values + umlal v28.8h, v19.8b, v22.8b //(3) - sshl v30.8h, v30.8h, v14.8h //(2)shr + sshl v30.8h, v30.8h, v29.8h //(2)shr add v5.8b, v5.8b , v7.8b //(3) sub v6.8b, v6.8b , v7.8b //(3) xtn v30.8b, v30.8h //(2) - umlal v10.8h, v5.8b, v0.8b //(4) + umlal v25.8h, v5.8b, v0.8b //(4) dup v20.8b, v4.8b[3] //(5) - umlal v10.8h, v8.8b, v1.8b //(4) + umlal v25.8h, v17.8b, v1.8b //(4) dup v16.8h,w4 //(5) - umlal v10.8h, v6.8b, v3.8b //(4) + umlal v25.8h, v6.8b, v3.8b //(4) st1 {v30.8b},[x2], x3 //(2)str 8 values - umlal v10.8h, v9.8b, v23.8b //(4) + umlal v25.8h, v19.8b, v23.8b //(4) - sshl v28.8h, v28.8h, v14.8h //(3)shr + sshl v28.8h, v28.8h, v29.8h //(3)shr add v5.8b, v5.8b , v7.8b //(4) sub v6.8b, v6.8b , v7.8b //(4) @@ -249,31 +249,31 @@ col_loop_8_16_32: umlal v16.8h, v5.8b, v0.8b //(5) dup v21.8b, v4.8b[2] //(6) - umlal v16.8h, v8.8b, v1.8b //(5) + umlal v16.8h, v17.8b, v1.8b //(5) dup v18.8h,w4 //(6) umlal v16.8h, v6.8b, v3.8b //(5) st1 {v28.8b},[x2], x3 //(3)str 8 values - umlal v16.8h, v9.8b, v20.8b //(5) + umlal v16.8h, v19.8b, v20.8b //(5) - sshl v10.8h, v10.8h, v14.8h //(4)shr + sshl v25.8h, v25.8h, v29.8h //(4)shr add v5.8b, v5.8b , v7.8b //(5) sub v6.8b, v6.8b , v7.8b //(5) - xtn v10.8b, v10.8h //(4) + xtn v25.8b, v25.8h 
//(4) umlal v18.8h, v5.8b, v0.8b //(6) dup v22.8b, v4.8b[1] //(7) - umlal v18.8h, v8.8b, v1.8b //(6) + umlal v18.8h, v17.8b, v1.8b //(6) dup v26.8h,w4 //(7) umlal v18.8h, v6.8b, v3.8b //(6) - st1 {v10.8b},[x2], x3 //(4)str 8 values - umlal v18.8h, v9.8b, v21.8b //(6) + st1 {v25.8b},[x2], x3 //(4)str 8 values + umlal v18.8h, v19.8b, v21.8b //(6) - sshl v16.8h, v16.8h, v14.8h //(5)shr + sshl v16.8h, v16.8h, v29.8h //(5)shr add v5.8b, v5.8b , v7.8b //(6) sub v6.8b, v6.8b , v7.8b //(6) @@ -282,15 +282,15 @@ col_loop_8_16_32: umlal v26.8h, v5.8b, v0.8b //(7) dup v23.8b, v4.8b[0] //(8) - umlal v26.8h, v8.8b, v1.8b //(7) + umlal v26.8h, v17.8b, v1.8b //(7) dup v24.8h,w4 //(8) umlal v26.8h, v6.8b, v3.8b //(7) st1 {v16.8b},[x2], x3 //(5)str 8 values - umlal v26.8h, v9.8b, v22.8b //(7) + umlal v26.8h, v19.8b, v22.8b //(7) - sshl v18.8h, v18.8h, v14.8h //(6)shr + sshl v18.8h, v18.8h, v29.8h //(6)shr add v5.8b, v5.8b , v7.8b //(7) sub v6.8b, v6.8b , v7.8b //(7) @@ -299,14 +299,14 @@ col_loop_8_16_32: umlal v24.8h, v5.8b, v0.8b //(8) - umlal v24.8h, v8.8b, v1.8b //(8) + umlal v24.8h, v17.8b, v1.8b //(8) umlal v24.8h, v6.8b, v3.8b //(8) st1 {v18.8b},[x2], x3 //(6)str 8 values - umlal v24.8h, v9.8b, v23.8b //(8) + umlal v24.8h, v19.8b, v23.8b //(8) - sshl v26.8h, v26.8h, v14.8h //(7)shr + sshl v26.8h, v26.8h, v29.8h //(7)shr subs x7, x7, #8 @@ -322,7 +322,7 @@ col_loop_8_16_32: csel x12, x20, x12,le csel x14, x0, x14,le //x14 reset - ld1 {v8.8b},[x12] //(1n)(1-8)load 8 coeffs [col+1] + ld1 {v17.8b},[x12] //(1n)(1-8)load 8 coeffs [col+1] sub x20, x6, #8 //for next set of rows csel x6, x20, x6,le @@ -330,12 +330,12 @@ col_loop_8_16_32: add x20, x5, #8 csel x5, x20, x5,le - dup v12.8h,w4 //(1n)(1) + dup v27.8h,w4 //(1n)(1) ld1 {v5.8b},[x5] ld1 {v4.8b},[x6] //(1n)(1-8)src[2nt-1-row] - sub v9.8b, v2.8b , v8.8b //(1n)(1-8)[nt-1-col] + sub v19.8b, v2.8b , v17.8b //(1n)(1-8)[nt-1-col] dup v20.8b, v4.8b[7] //(1n)(1) sub v6.8b, v2.8b , v5.8b @@ -345,19 +345,19 @@ col_loop_8_16_32: kernel_plnr: cmp x1, #0 // (cond loop) - sshl v24.8h, v24.8h, v14.8h //(8)shr + sshl v24.8h, v24.8h, v29.8h //(8)shr xtn v26.8b, v26.8h //(7) - umlal v12.8h, v5.8b, v0.8b //(1)(row+1) * src[nt-1] + umlal v27.8h, v5.8b, v0.8b //(1)(row+1) * src[nt-1] xtn v24.8b, v24.8h //(8) - umlal v12.8h, v8.8b, v1.8b //(1)(col+1) * src[3nt+1] + umlal v27.8h, v17.8b, v1.8b //(1)(col+1) * src[3nt+1] dup v21.8b, v4.8b[6] //(2) - umlal v12.8h, v6.8b, v3.8b //(1)(nt-1-row) * src[2nt+1+col] + umlal v27.8h, v6.8b, v3.8b //(1)(nt-1-row) * src[2nt+1+col] dup v30.8h,w4 //(2) - umlal v12.8h, v9.8b, v20.8b //(1)(nt-1-col) * src[2nt-1-row] + umlal v27.8h, v19.8b, v20.8b //(1)(nt-1-col) * src[2nt-1-row] st1 {v26.8b},[x2], x3 //(7)str 8 values add v5.8b, v5.8b , v7.8b //(1) @@ -371,15 +371,15 @@ kernel_plnr: sub x20, x2, x10 //else go to next set of rows, dst - (nt-8) (cond loop) csel x2, x20, x2,le - umlal v30.8h, v8.8b, v1.8b //(2) + umlal v30.8h, v17.8b, v1.8b //(2) dup v22.8b, v4.8b[5] //(3) umlal v30.8h, v6.8b, v3.8b //(2) dup v28.8h,w4 //(3) - umlal v30.8h, v9.8b, v21.8b //(2) + umlal v30.8h, v19.8b, v21.8b //(2) - sshl v12.8h, v12.8h, v14.8h //(1)shr + sshl v27.8h, v27.8h, v29.8h //(1)shr add v5.8b, v5.8b , v7.8b //(2) csel x1, x4, x1,le //nt reloaded (refresh the value) (cond loop) @@ -387,37 +387,37 @@ kernel_plnr: sub v6.8b, v6.8b , v7.8b //(2) subs x1, x1, #8 //row counter (loop) - xtn v12.8b, v12.8h //(1) + xtn v27.8b, v27.8h //(1) umlal v28.8h, v5.8b, v0.8b //(3) dup v23.8b, v4.8b[4] //(4) - umlal v28.8h, v8.8b, v1.8b //(3) + umlal v28.8h, v17.8b, v1.8b 
//(3) - dup v10.8h,w4 //(4) + dup v25.8h,w4 //(4) umlal v28.8h, v6.8b, v3.8b //(3) - st1 {v12.8b},[x2], x3 //(1)str 8 values - umlal v28.8h, v9.8b, v22.8b //(3) + st1 {v27.8b},[x2], x3 //(1)str 8 values + umlal v28.8h, v19.8b, v22.8b //(3) - sshl v30.8h, v30.8h, v14.8h //(2)shr + sshl v30.8h, v30.8h, v29.8h //(2)shr add v5.8b, v5.8b , v7.8b //(3) sub v6.8b, v6.8b , v7.8b //(3) xtn v30.8b, v30.8h //(2) - umlal v10.8h, v5.8b, v0.8b //(4) + umlal v25.8h, v5.8b, v0.8b //(4) dup v20.8b, v4.8b[3] //(5) - umlal v10.8h, v8.8b, v1.8b //(4) + umlal v25.8h, v17.8b, v1.8b //(4) dup v16.8h,w4 //(5) - umlal v10.8h, v6.8b, v3.8b //(4) + umlal v25.8h, v6.8b, v3.8b //(4) st1 {v30.8b},[x2], x3 //(2)str 8 values - umlal v10.8h, v9.8b, v23.8b //(4) + umlal v25.8h, v19.8b, v23.8b //(4) - sshl v28.8h, v28.8h, v14.8h //(3)shr + sshl v28.8h, v28.8h, v29.8h //(3)shr add v5.8b, v5.8b , v7.8b //(4) @@ -427,17 +427,17 @@ kernel_plnr: umlal v16.8h, v5.8b, v0.8b //(5) dup v21.8b, v4.8b[2] //(6) - umlal v16.8h, v8.8b, v1.8b //(5) + umlal v16.8h, v17.8b, v1.8b //(5) dup v18.8h,w4 //(6) umlal v16.8h, v6.8b, v3.8b //(5) st1 {v28.8b},[x2], x3 //(3)str 8 values - umlal v16.8h, v9.8b, v20.8b //(5) + umlal v16.8h, v19.8b, v20.8b //(5) add x20, x11, #1 //x12 reset (cond loop) csel x12, x20, x12,le - sshl v10.8h, v10.8h, v14.8h //(4)shr + sshl v25.8h, v25.8h, v29.8h //(4)shr add x20, x12, #8 //col inc (cond loop) csel x12, x20, x12,gt @@ -447,20 +447,20 @@ kernel_plnr: csel x14, x20, x14,gt sub v6.8b, v6.8b , v7.8b //(5) - xtn v10.8b, v10.8h //(4) + xtn v25.8b, v25.8h //(4) umlal v18.8h, v5.8b, v0.8b //(6) dup v22.8b, v4.8b[1] //(7) - umlal v18.8h, v8.8b, v1.8b //(6) + umlal v18.8h, v17.8b, v1.8b //(6) dup v26.8h,w4 //(7) umlal v18.8h, v6.8b, v3.8b //(6) - st1 {v10.8b},[x2], x3 //(4)str 8 values - umlal v18.8h, v9.8b, v21.8b //(6) + st1 {v25.8b},[x2], x3 //(4)str 8 values + umlal v18.8h, v19.8b, v21.8b //(6) csel x14, x0, x14,le //x14 reset (cond loop) - sshl v16.8h, v16.8h, v14.8h //(5)shr + sshl v16.8h, v16.8h, v29.8h //(5)shr sub x20, x6, #8 //for next set of rows (cond loop) csel x6, x20, x6,le @@ -474,16 +474,16 @@ kernel_plnr: umlal v26.8h, v5.8b, v0.8b //(7) dup v23.8b, v4.8b[0] //(8) - umlal v26.8h, v8.8b, v1.8b //(7) + umlal v26.8h, v17.8b, v1.8b //(7) dup v24.8h,w4 //(8) umlal v26.8h, v6.8b, v3.8b //(7) st1 {v16.8b},[x2], x3 //(5)str 8 values - umlal v26.8h, v9.8b, v22.8b //(7) + umlal v26.8h, v19.8b, v22.8b //(7) ld1 {v4.8b},[x6] //(1n)(1-8)src[2nt-1-row] - sshl v18.8h, v18.8h, v14.8h //(6)shr + sshl v18.8h, v18.8h, v29.8h //(6)shr add v5.8b, v5.8b , v7.8b //(7) @@ -493,24 +493,24 @@ kernel_plnr: umlal v24.8h, v5.8b, v0.8b //(8) ld1 {v5.8b},[x5] //(row+1 value) - umlal v24.8h, v8.8b, v1.8b //(8) + umlal v24.8h, v17.8b, v1.8b //(8) dup v20.8b, v4.8b[7] //(1n)(1) umlal v24.8h, v6.8b, v3.8b //(8) st1 {v18.8b},[x2], x3 //(6)str 8 values - umlal v24.8h, v9.8b, v23.8b //(8) + umlal v24.8h, v19.8b, v23.8b //(8) - ld1 {v8.8b},[x12] //(1n)(1-8)load 8 coeffs [col+1] + ld1 {v17.8b},[x12] //(1n)(1-8)load 8 coeffs [col+1] sub v6.8b, v2.8b , v5.8b //(nt-1-row) value subs x7, x7, #8 //col counter ld1 {v3.8b},[x14] //(1n)(1-8)load 8 src[2nt+1+col] - sshl v26.8h, v26.8h, v14.8h //(7)shr + sshl v26.8h, v26.8h, v29.8h //(7)shr - dup v12.8h,w4 //(1n)(1) - sub v9.8b, v2.8b , v8.8b //(1n)(1-8)[nt-1-col] + dup v27.8h,w4 //(1n)(1) + sub v19.8b, v2.8b , v17.8b //(1n)(1-8)[nt-1-col] bne kernel_plnr @@ -519,7 +519,7 @@ epilog: xtn v26.8b, v26.8h //(7) st1 {v26.8b},[x2], x3 //(7)str 8 values - sshl v24.8h, v24.8h, v14.8h //(8)shr + sshl v24.8h, 
v24.8h, v29.8h //(8)shr xtn v24.8b, v24.8h //(8) st1 {v24.8b},[x2], x3 //(8)str 8 values @@ -528,25 +528,25 @@ epilog: beq end_loop tf_sz_4: - ld1 {v10.8b},[x14] //load src[2nt+1+col] - ld1 {v8.8b},[x12], x10 //load 8 coeffs [col+1] + ld1 {v25.8b},[x14] //load src[2nt+1+col] + ld1 {v17.8b},[x12], x10 //load 8 coeffs [col+1] loop_sz_4: mov x10, #4 //reduce inc to #4 for 4x4 ldr w7, [x6], #-1 //src[2nt-1-row] (dec to take into account row) sxtw x7,w7 dup v4.8b,w7 //src[2nt-1-row] - sub v9.8b, v2.8b , v8.8b //[nt-1-col] + sub v19.8b, v2.8b , v17.8b //[nt-1-col] - umull v12.8h, v5.8b, v0.8b //(row+1) * src[nt-1] - umlal v12.8h, v6.8b, v10.8b //(nt-1-row) * src[2nt+1+col] - umlal v12.8h, v8.8b, v1.8b //(col+1) * src[3nt+1] - umlal v12.8h, v9.8b, v4.8b //(nt-1-col) * src[2nt-1-row] + umull v27.8h, v5.8b, v0.8b //(row+1) * src[nt-1] + umlal v27.8h, v6.8b, v25.8b //(nt-1-row) * src[2nt+1+col] + umlal v27.8h, v17.8b, v1.8b //(col+1) * src[3nt+1] + umlal v27.8h, v19.8b, v4.8b //(nt-1-col) * src[2nt-1-row] // vadd.i16 q6, q6, q8 @add (nt) // vshl.s16 q6, q6, q7 @shr // vmovn.i16 d12, q6 - rshrn v12.8b, v12.8h,#3 - st1 {v12.s}[0],[x2], x3 + rshrn v27.8b, v27.8h,#3 + st1 {v27.s}[0],[x2], x3 add v5.8b, v5.8b , v7.8b //row++ [(row+1)++] sub v6.8b, v6.8b , v7.8b //[nt-1-row]-- @@ -557,7 +557,7 @@ loop_sz_4: end_loop: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_intra_pred_luma_vert.s b/common/arm64/ihevc_intra_pred_luma_vert.s index 56a20a0..c67f721 100644 --- a/common/arm64/ihevc_intra_pred_luma_vert.s +++ b/common/arm64/ihevc_intra_pred_luma_vert.s @@ -101,7 +101,7 @@ ihevc_intra_pred_luma_ver_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! 
lsl x5, x4, #1 //2nt @@ -207,7 +207,7 @@ blk_16: sqadd v0.8h, v0.8h , v30.8h sqadd v28.8h, v28.8h , v30.8h - movi d10, #0x00000000000000ff + movi d3, #0x00000000000000ff //vaddl.s8 q1, d25, d27 sqxtun v24.8b, v28.8h @@ -218,13 +218,13 @@ blk_16: rev64 v24.16b, v24.16b mov v25.d[0], v24.d[1] - mov v11.d[0],v17.d[0] + mov v4.d[0],v17.d[0] bsl v18.8b, v24.8b , v16.8b //only select row values from q12(predpixel) - bsl v10.8b, v25.8b , v16.8b + bsl v3.8b, v25.8b , v16.8b - movi d8, #0x00000000000000ff - mov v9.d[0],v17.d[0] + movi d1, #0x00000000000000ff + mov v2.d[0],v17.d[0] movi d6, #0x00000000000000ff mov v7.d[0],v17.d[0] @@ -232,14 +232,14 @@ blk_16: st1 {v18.8b, v19.8b}, [x2], x3 sshr d24, d24,#8 - st1 {v10.8b, v11.8b}, [x5], x3 + st1 {v3.8b, v4.8b}, [x5], x3 sshr d25, d25,#8 - bsl v8.8b, v24.8b , v16.8b + bsl v1.8b, v24.8b , v16.8b bsl v6.8b, v25.8b , v16.8b - st1 {v8.8b, v9.8b}, [x2], x3 + st1 {v1.8b, v2.8b}, [x2], x3 sshr d24, d24,#8 st1 {v6.8b, v7.8b}, [x5], x3 @@ -250,34 +250,34 @@ blk_16: movi d18, #0x00000000000000ff //vmov.i64 d19, d17 - movi d10, #0x00000000000000ff + movi d3, #0x00000000000000ff //vmov.i64 d11, d17 loop_16: - movi d8, #0x00000000000000ff + movi d1, #0x00000000000000ff movi d6, #0x00000000000000ff bsl v18.8b, v24.8b , v16.8b //only select row values from q12(predpixel) - bsl v10.8b, v25.8b , v16.8b + bsl v3.8b, v25.8b , v16.8b st1 {v18.8b, v19.8b}, [x2], x3 sshr d24, d24,#8 - st1 {v10.8b, v11.8b}, [x5], x3 + st1 {v3.8b, v4.8b}, [x5], x3 sshr d25, d25,#8 movi d18, #0x00000000000000ff - movi d10, #0x00000000000000ff + movi d3, #0x00000000000000ff - bsl v8.8b, v24.8b , v16.8b + bsl v1.8b, v24.8b , v16.8b bsl v6.8b, v25.8b , v16.8b - st1 {v8.8b, v9.8b}, [x2], x3 + st1 {v1.8b, v2.8b}, [x2], x3 sshr d24, d24,#8 st1 {v6.8b, v7.8b}, [x5], x3 @@ -287,23 +287,23 @@ loop_16: bne loop_16 - movi d8, #0x00000000000000ff + movi d1, #0x00000000000000ff movi d6, #0x00000000000000ff bsl v18.8b, v24.8b , v16.8b //only select row values from q12(predpixel) - bsl v10.8b, v25.8b , v16.8b + bsl v3.8b, v25.8b , v16.8b st1 {v18.8b, v19.8b}, [x2], x3 sshr d24, d24,#8 - st1 {v10.8b, v11.8b}, [x5], x3 + st1 {v3.8b, v4.8b}, [x5], x3 sshr d25, d25,#8 - bsl v8.8b, v24.8b , v16.8b + bsl v1.8b, v24.8b , v16.8b bsl v6.8b, v25.8b , v16.8b - st1 {v8.8b, v9.8b}, [x2], x3 + st1 {v1.8b, v2.8b}, [x2], x3 st1 {v6.8b, v7.8b}, [x5], x3 @@ -311,10 +311,10 @@ loop_16: blk_4_8: - movi d11, #0x00000000000000ff + movi d4, #0x00000000000000ff add x6, x0, x5 //&src[2nt] - movi d10, #0x00000000000000ff + movi d3, #0x00000000000000ff ldrb w11, [x6], #1 //src[2nt] sxtw x11,w11 @@ -363,19 +363,19 @@ blk_4_8: movi d19, #0x00000000000000ff - bsl v10.8b, v24.8b , v16.8b + bsl v3.8b, v24.8b , v16.8b - st1 {v10.8b},[x2], x3 + st1 {v3.8b},[x2], x3 sshr d24, d24,#8 - movi d10, #0x00000000000000ff + movi d3, #0x00000000000000ff - bsl v11.8b, v24.8b , v16.8b + bsl v4.8b, v24.8b , v16.8b - st1 {v11.8b},[x2], x3 + st1 {v4.8b},[x2], x3 sshr d24, d24,#8 - movi d11, #0x00000000000000ff + movi d4, #0x00000000000000ff bsl v18.8b, v24.8b , v16.8b //only select row values from q12(predpixel) @@ -387,14 +387,14 @@ blk_4_8: st1 {v19.8b},[x2], x3 sshr d24, d24,#8 - bsl v10.8b, v24.8b , v16.8b + bsl v3.8b, v24.8b , v16.8b - st1 {v10.8b},[x2], x3 + st1 {v3.8b},[x2], x3 sshr d24, d24,#8 - bsl v11.8b, v24.8b , v16.8b + bsl v4.8b, v24.8b , v16.8b - st1 {v11.8b},[x2], x3 + st1 {v4.8b},[x2], x3 sshr d24, d24,#8 b end_func @@ -411,19 +411,19 @@ blk_4: st1 {v19.s}[0],[x2], x3 sshr d24, d24,#8 - bsl v10.8b, v24.8b , v16.8b + bsl v3.8b, v24.8b , 
v16.8b - st1 {v10.s}[0],[x2], x3 + st1 {v3.s}[0],[x2], x3 sshr d24, d24,#8 - bsl v11.8b, v24.8b , v16.8b - st1 {v11.s}[0],[x2], x3 + bsl v4.8b, v24.8b , v16.8b + st1 {v4.s}[0],[x2], x3 end_func: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_itrans_recon_4x4.s b/common/arm64/ihevc_itrans_recon_4x4.s index b18fb89..1f2c904 100644 --- a/common/arm64/ihevc_itrans_recon_4x4.s +++ b/common/arm64/ihevc_itrans_recon_4x4.s @@ -119,7 +119,7 @@ ihevc_itrans_recon_4x4_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! adrp x8, :got:g_ai2_ihevc_trans_4_transpose @@ -142,21 +142,21 @@ ihevc_itrans_recon_4x4_av8: // first stage computation starts smull v6.4s, v1.4h, v4.4h[1] //83 * pi2_src[1] smlal v6.4s, v3.4h, v4.4h[3] //o[0] = 83 * pi2_src[1] + 36 * pi2_src[3] - smull v8.4s, v1.4h, v4.4h[3] //36 * pi2_src[1] + smull v5.4s, v1.4h, v4.4h[3] //36 * pi2_src[1] ld1 {v22.s}[0],[x2],x5 - smlsl v8.4s, v3.4h, v4.4h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3] + smlsl v5.4s, v3.4h, v4.4h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3] - saddl v10.4s, v0.4h, v2.4h //pi2_src[0] + pi2_src[2] - ssubl v12.4s, v0.4h, v2.4h //pi2_src[0] - pi2_src[2] - shl v10.4s, v10.4s,#6 //e[0] = 64*(pi2_src[0] + pi2_src[2]) - shl v12.4s, v12.4s,#6 //e[1] = 64*(pi2_src[0] - pi2_src[2]) + saddl v7.4s, v0.4h, v2.4h //pi2_src[0] + pi2_src[2] + ssubl v17.4s, v0.4h, v2.4h //pi2_src[0] - pi2_src[2] + shl v7.4s, v7.4s,#6 //e[0] = 64*(pi2_src[0] + pi2_src[2]) + shl v17.4s, v17.4s,#6 //e[1] = 64*(pi2_src[0] - pi2_src[2]) - add v14.4s, v10.4s , v6.4s //((e[0] + o[0] ) - add v16.4s, v12.4s , v8.4s //((e[1] + o[1]) - sub v18.4s, v12.4s , v8.4s //((e[1] - o[1]) - sub v20.4s, v10.4s , v6.4s //((e[0] - o[0]) + add v19.4s, v7.4s , v6.4s //((e[0] + o[0] ) + add v16.4s, v17.4s , v5.4s //((e[1] + o[1]) + sub v18.4s, v17.4s , v5.4s //((e[1] - o[1]) + sub v20.4s, v7.4s , v6.4s //((e[0] - o[0]) - sqrshrn v28.4h, v14.4s,#shift_stage1_idct //pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift) ) + sqrshrn v28.4h, v19.4s,#shift_stage1_idct //pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift) ) sqrshrn v29.4h, v16.4s,#shift_stage1_idct //pi2_out[1] = clip_s16((e[1] + o[1] + add)>>shift) ) sqrshrn v30.4h, v18.4s,#shift_stage1_idct //pi2_out[2] = clip_s16((e[0] - o[0] + add)>>shift) ) sqrshrn v31.4h, v20.4s,#shift_stage1_idct //pi2_out[3] = clip_s16((e[0] - o[0] + add)>>shift) ) @@ -176,22 +176,22 @@ ihevc_itrans_recon_4x4_av8: smull v6.4s, v1.4h, v4.4h[1] //83 * pi2_src[1] ld1 {v22.s}[1],[x2],x5 smlal v6.4s, v3.4h, v4.4h[3] //o[0] = 83 * pi2_src[1] + 36 * pi2_src[3] - smull v8.4s, v1.4h, v4.4h[3] //36 * pi2_src[1] - smlsl v8.4s, v3.4h, v4.4h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3] + smull v5.4s, v1.4h, v4.4h[3] //36 * pi2_src[1] + smlsl v5.4s, v3.4h, v4.4h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3] ld1 {v23.s}[0],[x2],x5 - saddl v10.4s, v0.4h, v2.4h //pi2_src[0] + pi2_src[2] - ssubl v12.4s, v0.4h, v2.4h //pi2_src[0] - pi2_src[2] - shl v10.4s, v10.4s,#6 //e[0] = 64*(pi2_src[0] + pi2_src[2]) - shl v12.4s, v12.4s,#6 //e[1] = 64*(pi2_src[0] - pi2_src[2]) + saddl v7.4s, v0.4h, v2.4h //pi2_src[0] + pi2_src[2] + ssubl v17.4s, v0.4h, v2.4h //pi2_src[0] - pi2_src[2] + shl v7.4s, v7.4s,#6 //e[0] = 64*(pi2_src[0] + pi2_src[2]) + shl v17.4s, v17.4s,#6 //e[1] = 64*(pi2_src[0] - pi2_src[2]) - add v14.4s, v10.4s , v6.4s //((e[0] + o[0] ) - add v16.4s, v12.4s , v8.4s //((e[1] + o[1]) - sub v18.4s, v12.4s , v8.4s //((e[1] - 
o[1]) - sub v20.4s, v10.4s , v6.4s //((e[0] - o[0]) + add v19.4s, v7.4s , v6.4s //((e[0] + o[0] ) + add v16.4s, v17.4s , v5.4s //((e[1] + o[1]) + sub v18.4s, v17.4s , v5.4s //((e[1] - o[1]) + sub v20.4s, v7.4s , v6.4s //((e[0] - o[0]) - sqrshrn v28.4h, v14.4s,#shift_stage2_idct //pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift) ) + sqrshrn v28.4h, v19.4s,#shift_stage2_idct //pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift) ) sqrshrn v29.4h, v16.4s,#shift_stage2_idct //pi2_out[1] = clip_s16((e[1] + o[1] + add)>>shift) ) sqrshrn v30.4h, v18.4s,#shift_stage2_idct //pi2_out[2] = clip_s16((e[0] - o[0] + add)>>shift) ) sqrshrn v31.4h, v20.4s,#shift_stage2_idct //pi2_out[3] = clip_s16((e[0] - o[0] + add)>>shift) ) @@ -228,7 +228,7 @@ ihevc_itrans_recon_4x4_av8: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_itrans_recon_4x4_ttype1.s b/common/arm64/ihevc_itrans_recon_4x4_ttype1.s index fa04b8e..da04c5e 100644 --- a/common/arm64/ihevc_itrans_recon_4x4_ttype1.s +++ b/common/arm64/ihevc_itrans_recon_4x4_ttype1.s @@ -118,7 +118,7 @@ ihevc_itrans_recon_4x4_ttype1_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! add x4,x4,x4 // src_strd in terms of word16 @@ -142,33 +142,33 @@ ihevc_itrans_recon_4x4_ttype1_av8: smlal v6.4s, v3.4h, v4.4h[1] //74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3] smlal v6.4s, v2.4h, v4.4h[3] //pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3] - smull v8.4s, v1.4h, v4.4h[2] //74 * pi2_src[1] - smlal v8.4s, v0.4h, v4.4h[1] //74 * pi2_src[1] + 55 * pi2_src[0] - smlsl v8.4s, v2.4h, v4.4h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - smlsl v8.4s, v3.4h, v4.4h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3]) + smull v5.4s, v1.4h, v4.4h[2] //74 * pi2_src[1] + smlal v5.4s, v0.4h, v4.4h[1] //74 * pi2_src[1] + 55 * pi2_src[0] + smlsl v5.4s, v2.4h, v4.4h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] + smlsl v5.4s, v3.4h, v4.4h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3]) - smull v10.4s, v0.4h, v4.4h[2] // 74 * pi2_src[0] - smlsl v10.4s, v2.4h, v4.4h[2] // 74 * pi2_src[0] - 74 * pi2_src[2] - smlal v10.4s, v3.4h, v4.4h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3] + smull v7.4s, v0.4h, v4.4h[2] // 74 * pi2_src[0] + smlsl v7.4s, v2.4h, v4.4h[2] // 74 * pi2_src[0] - 74 * pi2_src[2] + smlal v7.4s, v3.4h, v4.4h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3] - smull v12.4s, v2.4h, v4.4h[1] // 55 * pi2_src[2] - smlsl v12.4s, v1.4h, v4.4h[2] // 55 * pi2_src[2] - 74 * pi2_src[1] - smlsl v12.4s, v3.4h, v4.4h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] - smlal v12.4s, v0.4h, v4.4h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] + smull v20.4s, v2.4h, v4.4h[1] // 55 * pi2_src[2] + smlsl v20.4s, v1.4h, v4.4h[2] // 55 * pi2_src[2] - 74 * pi2_src[1] + smlsl v20.4s, v3.4h, v4.4h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] + smlal v20.4s, v0.4h, v4.4h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] sqrshrn v28.4h, v6.4s,#shift_stage1_idct // (pi2_out[0] + rounding ) >> shift_stage1_idct - sqrshrn v29.4h, v8.4s,#shift_stage1_idct // (pi2_out[1] + rounding ) >> shift_stage1_idct - sqrshrn v30.4h, v10.4s,#shift_stage1_idct // (pi2_out[2] + rounding ) >> 
shift_stage1_idct - sqrshrn v31.4h, v12.4s,#shift_stage1_idct // (pi2_out[3] + rounding ) >> shift_stage1_idct + sqrshrn v29.4h, v5.4s,#shift_stage1_idct // (pi2_out[1] + rounding ) >> shift_stage1_idct + sqrshrn v30.4h, v7.4s,#shift_stage1_idct // (pi2_out[2] + rounding ) >> shift_stage1_idct + sqrshrn v31.4h, v20.4s,#shift_stage1_idct // (pi2_out[3] + rounding ) >> shift_stage1_idct ld1 {v18.s}[0],[x2],x5 trn1 v24.4h, v28.4h, v29.4h trn2 v25.4h, v28.4h, v29.4h trn1 v26.4h, v30.4h, v31.4h trn2 v27.4h, v30.4h, v31.4h - trn1 v14.2s, v24.2s, v26.2s + trn1 v21.2s, v24.2s, v26.2s trn2 v16.2s, v24.2s, v26.2s - trn1 v15.2s, v25.2s, v27.2s + trn1 v22.2s, v25.2s, v27.2s trn2 v17.2s, v25.2s, v27.2s // output in d14,d15,d16,d17 // first stage computation ends @@ -180,30 +180,30 @@ ihevc_itrans_recon_4x4_ttype1_av8: // d16 - d2 // d17 - d3 ld1 {v18.s}[1],[x2],x5 - smull v6.4s, v15.4h, v4.4h[2] //74 * pi2_src[1] - smlal v6.4s, v14.4h, v4.4h[0] //74 * pi2_src[1] + 29 * pi2_src[0] + smull v6.4s, v22.4h, v4.4h[2] //74 * pi2_src[1] + smlal v6.4s, v21.4h, v4.4h[0] //74 * pi2_src[1] + 29 * pi2_src[0] smlal v6.4s, v17.4h, v4.4h[1] //74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3] smlal v6.4s, v16.4h, v4.4h[3] //pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3] - smull v8.4s, v15.4h, v4.4h[2] //74 * pi2_src[1] - smlal v8.4s, v14.4h, v4.4h[1] //74 * pi2_src[1] + 55 * pi2_src[0] - smlsl v8.4s, v16.4h, v4.4h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - smlsl v8.4s, v17.4h, v4.4h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3]) + smull v5.4s, v22.4h, v4.4h[2] //74 * pi2_src[1] + smlal v5.4s, v21.4h, v4.4h[1] //74 * pi2_src[1] + 55 * pi2_src[0] + smlsl v5.4s, v16.4h, v4.4h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] + smlsl v5.4s, v17.4h, v4.4h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3]) - smull v10.4s, v14.4h, v4.4h[2] // 74 * pi2_src[0] - smlsl v10.4s, v16.4h, v4.4h[2] // 74 * pi2_src[0] - 74 * pi2_src[2] - smlal v10.4s, v17.4h, v4.4h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3] + smull v7.4s, v21.4h, v4.4h[2] // 74 * pi2_src[0] + smlsl v7.4s, v16.4h, v4.4h[2] // 74 * pi2_src[0] - 74 * pi2_src[2] + smlal v7.4s, v17.4h, v4.4h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3] ld1 {v19.s}[0],[x2],x5 - smull v12.4s, v16.4h, v4.4h[1] // 55 * pi2_src[2] - smlsl v12.4s, v15.4h, v4.4h[2] // - 74 * pi2_src[1] + 55 * pi2_src[2] - smlsl v12.4s, v17.4h, v4.4h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] - smlal v12.4s, v14.4h, v4.4h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] + smull v20.4s, v16.4h, v4.4h[1] // 55 * pi2_src[2] + smlsl v20.4s, v22.4h, v4.4h[2] // - 74 * pi2_src[1] + 55 * pi2_src[2] + smlsl v20.4s, v17.4h, v4.4h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] + smlal v20.4s, v21.4h, v4.4h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] sqrshrn v28.4h, v6.4s,#shift_stage2_idct // (pi2_out[0] + rounding ) >> shift_stage1_idct - sqrshrn v29.4h, v8.4s,#shift_stage2_idct // (pi2_out[1] + rounding ) >> shift_stage1_idct - sqrshrn v30.4h, v10.4s,#shift_stage2_idct // (pi2_out[2] + rounding ) >> shift_stage1_idct - sqrshrn v31.4h, v12.4s,#shift_stage2_idct // (pi2_out[3] + rounding ) >> shift_stage1_idct + sqrshrn v29.4h, v5.4s,#shift_stage2_idct // (pi2_out[1] + rounding ) >> shift_stage1_idct + sqrshrn v30.4h, 
v7.4s,#shift_stage2_idct // (pi2_out[2] + rounding ) >> shift_stage1_idct + sqrshrn v31.4h, v20.4s,#shift_stage2_idct // (pi2_out[3] + rounding ) >> shift_stage1_idct ld1 {v19.s}[1],[x2],x5 trn1 v24.4h, v28.4h, v29.4h trn2 v25.4h, v28.4h, v29.4h @@ -233,7 +233,7 @@ ihevc_itrans_recon_4x4_ttype1_av8: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_neon_macros.s b/common/arm64/ihevc_neon_macros.s index 09a1de9..c5e65e5 100644 --- a/common/arm64/ihevc_neon_macros.s +++ b/common/arm64/ihevc_neon_macros.s @@ -47,4 +47,3 @@ ldp d10,d11,[sp],#16 ldp d8,d9,[sp],#16 .endm - diff --git a/common/arm64/ihevc_sao_band_offset_luma.s b/common/arm64/ihevc_sao_band_offset_luma.s index 099d581..779ee69 100644 --- a/common/arm64/ihevc_sao_band_offset_luma.s +++ b/common/arm64/ihevc_sao_band_offset_luma.s @@ -76,7 +76,10 @@ ihevc_sao_band_offset_luma_av8: LDR w8,[sp] //Loads ht - push_v_regs + + stp d13,d14,[sp,#-16]! + stp d8,d15,[sp,#-16]! // Storing d15 using { sub sp,sp,#8; str d15,[sp] } is giving bus error. + // d8 is used as dummy register and stored along with d15 using stp. d8 is not used in the function. stp x19, x20,[sp,#-16]! MOV x9,x8 //Move the ht to x9 for loop counter @@ -127,7 +130,7 @@ SRC_TOP_LOOP: //wd is always multiple of 8 ADD v7.8b, v3.8b , v31.8b //band_table.val[2] = vadd_u8(band_table.val[2], band_pos) dup v27.8b, v30.8b[3] //vdup_n_u8(pi1_sao_offset[3]) - ADD v8.8b, v4.8b , v31.8b //band_table.val[3] = vadd_u8(band_table.val[3], band_pos) + ADD v21.8b, v4.8b , v31.8b //band_table.val[3] = vadd_u8(band_table.val[3], band_pos) dup v26.8b, v30.8b[4] //vdup_n_u8(pi1_sao_offset[4]) ADD v1.8b, v5.8b , v29.8b //band_table.val[0] = vadd_u8(band_table.val[0], vdup_n_u8(pi1_sao_offset[1])) @@ -138,52 +141,52 @@ SRC_TOP_LOOP: //wd is always multiple of 8 CMP x5,#28 ADD v3.8b, v7.8b , v27.8b //band_table.val[2] = vadd_u8(band_table.val[2], vdup_n_u8(pi1_sao_offset[3])) - ADD v4.8b, v8.8b , v26.8b //band_table.val[3] = vadd_u8(band_table.val[3], vdup_n_u8(pi1_sao_offset[4])) + ADD v4.8b, v21.8b , v26.8b //band_table.val[3] = vadd_u8(band_table.val[3], vdup_n_u8(pi1_sao_offset[4])) BLT SAO_BAND_POS_0 SAO_BAND_POS_28: //case 28 - cmhs v12.8b, v29.8b , v4.8b //vcle_u8(band_table.val[3], vdup_n_u8(16)) + cmhs v25.8b, v29.8b , v4.8b //vcle_u8(band_table.val[3], vdup_n_u8(16)) BNE SAO_BAND_POS_29 - ORR v4.8b, v4.8b , v12.8b //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp) + ORR v4.8b, v4.8b , v25.8b //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp) B SWITCH_BREAK SAO_BAND_POS_29: //case 29 CMP x5,#29 - cmhs v11.8b, v29.8b , v3.8b //vcle_u8(band_table.val[2], vdup_n_u8(16)) + cmhs v24.8b, v29.8b , v3.8b //vcle_u8(band_table.val[2], vdup_n_u8(16)) BNE SAO_BAND_POS_30 - ORR v3.8b, v3.8b , v11.8b //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp) + ORR v3.8b, v3.8b , v24.8b //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp) - AND v4.8b, v4.8b , v12.8b //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp) + AND v4.8b, v4.8b , v25.8b //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp) B SWITCH_BREAK SAO_BAND_POS_30: //case 30 CMP x5,#30 - cmhs v10.8b, v29.8b , v2.8b //vcle_u8(band_table.val[1], vdup_n_u8(16)) + cmhs v23.8b, v29.8b , v2.8b //vcle_u8(band_table.val[1], vdup_n_u8(16)) BNE SAO_BAND_POS_31 - ORR v2.8b, v2.8b , v10.8b //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp) + ORR v2.8b, v2.8b , v23.8b //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp) - AND v3.8b, 
v3.8b , v11.8b //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp) + AND v3.8b, v3.8b , v24.8b //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp) B SWITCH_BREAK SAO_BAND_POS_31: //case 31 CMP x5,#31 BNE SWITCH_BREAK - cmhs v9.8b, v29.8b , v1.8b //vcle_u8(band_table.val[0], vdup_n_u8(16)) - ORR v1.8b, v1.8b , v9.8b //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp) + cmhs v22.8b, v29.8b , v1.8b //vcle_u8(band_table.val[0], vdup_n_u8(16)) + ORR v1.8b, v1.8b , v22.8b //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp) - AND v2.8b, v2.8b , v10.8b //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp) + AND v2.8b, v2.8b , v23.8b //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp) SAO_BAND_POS_0: CMP x5,#0 //case 0 BNE SWITCH_BREAK - cmhs v9.8b, v29.8b , v1.8b //vcle_u8(band_table.val[0], vdup_n_u8(16)) - AND v1.8b, v1.8b , v9.8b //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp) + cmhs v22.8b, v29.8b , v1.8b //vcle_u8(band_table.val[0], vdup_n_u8(16)) + AND v1.8b, v1.8b , v22.8b //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp) SWITCH_BREAK: @@ -236,9 +239,11 @@ HEIGHT_LOOP: ADD x0,x0,#8 BNE SWITCH_BREAK_1 - // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP + // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP ldp x19, x20,[sp], #16 - pop_v_regs + ldp d8,d15,[sp],#16 // Loading d15 using { ldr d15,[sp]; add sp,sp,#8 } is giving bus error. + // d8 is used as dummy register and loaded along with d15 using ldp. d8 is not used in the function. + ldp d13,d14,[sp],#16 ret diff --git a/common/arm64/ihevc_sao_edge_offset_class0.s b/common/arm64/ihevc_sao_edge_offset_class0.s index f7d6621..91146e8 100644 --- a/common/arm64/ihevc_sao_edge_offset_class0.s +++ b/common/arm64/ihevc_sao_edge_offset_class0.s @@ -78,7 +78,7 @@ ihevc_sao_edge_offset_class0_av8: LDR x10,[sp,#16] //Loads ht AND x10,x10,0xFFFFFFFF // Since argument is passed as WORD32, Using only lower half of x10 - push_v_regs + stp x19, x20,[sp,#-16]! 
movi v2.16b, #2 //const_2 = vdupq_n_s8(2) @@ -93,15 +93,15 @@ ihevc_sao_edge_offset_class0_av8: ADRP x14, :got:gi1_table_edge_idx //table pointer LDR x14, [x14, #:got_lo12:gi1_table_edge_idx] - movi v8.16b, #0xFF //au1_mask = vdupq_n_s8(-1) + movi v3.16b, #0xFF //au1_mask = vdupq_n_s8(-1) STRB w12,[x4] //*pu1_src_top_left = pu1_src_top[wd - 1] MOV x6,x0 //pu1_src_org - LD1 {v10.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) + LD1 {v5.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) SUB x4,x10,#1 //(ht - 1) MOV x12,x9 //Move wd to x12 for loop count - LD1 {v11.8b},[x8] //offset_tbl = vld1_s8(pi1_sao_offset) + LD1 {v7.8b},[x8] //offset_tbl = vld1_s8(pi1_sao_offset) mul x4, x4, x1 //(ht - 1) * src_strd ADD x4,x4,x0 //pu1_src[(ht - 1) * src_strd] @@ -123,18 +123,18 @@ WIDTH_LOOP_16: CMP x8,x9 //if(col == wd) BNE AU1_MASK_FF //jump to else part LDRB w12,[x7] //pu1_avail[0] - mov v8.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) + mov v3.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) B SKIP_AU1_MASK_FF //Skip the else part AU1_MASK_FF: MOV x12,#0xFF //move -1 to x12 - mov v8.8b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v3.8b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) SKIP_AU1_MASK_FF: CMP x8,#16 //If col == 16 BNE SKIP_MASKING_IF_NOT16 //If not skip masking LDRB w12,[x7,#1] //pu1_avail[1] - mov v8.b[15], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v3.b[15], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_MASKING_IF_NOT16: MOV x12,x0 //pu1_src_cpy = pu1_src @@ -142,24 +142,24 @@ SKIP_MASKING_IF_NOT16: PU1_SRC_LOOP: LDRB w11,[x2] //load pu1_src_left since ht - row =0 when it comes first pu1_src_left is incremented later - LD1 {v12.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy) + LD1 {v17.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy) SUB x5,x9,x8 //wd - col SUB x14,x10,x4 //ht - row - mov v14.8b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) + mov v21.8b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) mul x14, x14, x1 //(ht - row) * src_strd LD1 {v26.16b},[x12] //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy) - EXT v14.16b, v14.16b , v12.16b,#15 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15) + EXT v21.16b, v21.16b , v17.16b,#15 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15) ADD x5,x14,x5 //(ht - row) * src_strd + (wd - col) LDRB w11,[x2, #1] //II Iteration load pu1_src_left since ht - row + 1 =1 - cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v16.16b, v17.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) LDRB w14,[x6,x5] //pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)] SUB x4,x4,#1 mov v28.8b[15], w11 //II Iteration vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) - cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v18.16b, v21.16b , v17.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) SUB x12,x12,x1 //Decrement the pu1_src pointer by src_strd SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) @@ -170,11 +170,11 @@ PU1_SRC_LOOP: SUB x5,x9,x8 //II wd - col ADD x12,x12,x1 //Increment the pu1_src pointer by src_strd - mov v14.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) + mov v21.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) cmhi v30.16b, v26.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) LDRB w11,[x12,#16] //II 
pu1_src_cpy[16] - EXT v14.16b, v12.16b , v14.16b,#1 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1) + EXT v21.16b, v17.16b , v21.16b,#1 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1) SUB x14,x10,x4 //II ht - row cmhi v0.16b, v28.16b , v26.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) @@ -182,59 +182,59 @@ PU1_SRC_LOOP: SUB x12,x12,x1 //Decrement the pu1_src pointer by src_strd mul x14, x14, x1 //II (ht - row) * src_strd - cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v16.16b, v17.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) ADD x5,x14,x5 //II (ht - row) * src_strd + (wd - col) - cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v18.16b, v21.16b , v17.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) EXT v28.16b, v26.16b , v28.16b,#1 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1) LDRB w14,[x6,x5] //II pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)] SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) SUBS x4,x4,#1 //Decrement row by 1 - ADD v14.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left) + ADD v21.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left) STRB w14,[x2],#1 //II pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)] - ADD v14.16b, v14.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right) - Uxtl v18.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + ADD v21.16b, v21.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right) + Uxtl v18.8h, v17.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SUB v20.16b, v0.16b , v30.16b //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - TBL v14.16b, {v10.16b},v14.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + TBL v21.16b, {v5.16b},v21.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) cmhi v30.16b, v26.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) cmhi v0.16b, v28.16b , v26.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) // TBL v15.8b, {v10.16b},v15.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) SUB v22.16b, v0.16b , v30.16b //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - AND v14.16b, v14.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) - TBL v16.16b, {v11.16b},v14.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + AND v21.16b, v21.16b , v3.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + TBL v16.16b, {v7.16b},v21.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) Uxtl v0.8h, v26.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) ADD v28.16b, v2.16b , v20.16b //II edge_idx = vaddq_s8(const_2, sign_left) ADD v28.16b, v28.16b , v22.16b //II edge_idx = vaddq_s8(edge_idx, sign_right) SADDW v18.8h, v18.8h , v16.8b - TBL v28.16b, {v10.16b},v28.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + TBL v28.16b, {v5.16b},v28.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) SMAX v18.8h, v18.8h , v4.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) // TBL v29.8b, {v10.16b},v29.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) UMIN v18.8h, v18.8h , v6.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - AND v28.16b, v28.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) + AND v28.16b, v28.16b , v3.16b //II edge_idx = vandq_s8(edge_idx, 
au1_mask) // TBL v17.8b, {v11.16b},v15.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - Uxtl2 v14.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) - TBL v30.16b, {v11.16b},v28.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - SADDW2 v14.8h, v14.8h , v16.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) + Uxtl2 v21.8h, v17.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + TBL v30.16b, {v7.16b},v28.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + SADDW2 v21.8h, v21.8h , v16.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) - SMAX v14.8h, v14.8h , v4.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) + SMAX v21.8h, v21.8h , v4.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) // TBL v31.8b, {v11.16b},v29.8b //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - UMIN v14.8h, v14.8h , v6.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) + UMIN v21.8h, v21.8h , v6.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) xtn v18.8b, v18.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) SADDW v0.8h, v0.8h , v30.8b - xtn v19.8b, v14.8h //vmovn_s16(pi2_tmp_cur_row.val[1]) + xtn v19.8b, v21.8h //vmovn_s16(pi2_tmp_cur_row.val[1]) SMAX v0.8h, v0.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) Uxtl2 v28.8h, v26.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) @@ -271,52 +271,52 @@ WIDTH_RESIDUE: CMP x8,x9 //if(wd_rem == wd) BNE AU1_MASK_FF_RESIDUE //jump to else part LDRB w12,[x7] //pu1_avail[0] - mov v8.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) + mov v3.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) B SKIP_AU1_MASK_FF_RESIDUE //Skip the else part AU1_MASK_FF_RESIDUE: MOV x12,#0xFF //move -s to x12 - mov v8.8b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v3.8b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) SKIP_AU1_MASK_FF_RESIDUE: LDRB w11,[x7,#1] //pu1_avail[1] SUB x5,x9,#1 //wd - 1 MOV x4,x10 //move ht to x4 for loop count - mov v8.8b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v3.8b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) MOV x12,x0 //pu1_src_cpy = pu1_src PU1_SRC_LOOP_RESIDUE: - LD1 {v12.16b},[x12] //pu1_cur_row = vld1q_u8(pu1_src_cpy) + LD1 {v17.16b},[x12] //pu1_cur_row = vld1q_u8(pu1_src_cpy) LDRB w11,[x2] //load pu1_src_left - mov v14.8b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) - EXT v14.16b, v14.16b , v12.16b,#15 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15) + mov v21.8b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) + EXT v21.16b, v21.16b , v17.16b,#15 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15) - cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) - cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v16.16b, v17.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v18.16b, v21.16b , v17.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) LDRB w11,[x12,#16] //pu1_src_cpy[16] - mov v14.8b[0], w11 //pu1_cur_row_tmp = 
vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) - EXT v14.16b, v12.16b , v14.16b,#1 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1) + mov v21.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) + EXT v21.16b, v17.16b , v21.16b,#1 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1) - cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) - cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v16.16b, v17.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v18.16b, v21.16b , v17.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) ADD v24.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left) ADD v24.16b, v24.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right) - TBL v24.16b, {v10.16b},v24.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + TBL v24.16b, {v5.16b},v24.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) // TBL v25.8b, {v10.16b},v25.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - AND v24.16b, v24.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + AND v24.16b, v24.16b , v3.16b //edge_idx = vandq_s8(edge_idx, au1_mask) NEG v20.16b, v22.16b //sign_left = vnegq_s8(sign_right) EXT v20.16b, v20.16b , v22.16b,#15 //sign_left = vextq_s8(sign_left, sign_left, 15) - TBL v26.8b, {v11.16b},v24.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + TBL v26.8b, {v7.16b},v24.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + Uxtl v28.8h, v17.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SADDW v28.8h, v28.8h , v26.8b SMAX v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v28.8h, v28.8h , v6.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) @@ -337,7 +337,7 @@ PU1_SRC_LOOP_RESIDUE: END_LOOPS: // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP ldp x19, x20,[sp], #16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_sao_edge_offset_class0_chroma.s b/common/arm64/ihevc_sao_edge_offset_class0_chroma.s index d854c62..c6be41a 100644 --- a/common/arm64/ihevc_sao_edge_offset_class0_chroma.s +++ b/common/arm64/ihevc_sao_edge_offset_class0_chroma.s @@ -74,7 +74,7 @@ ihevc_sao_edge_offset_class0_chroma_av8: ldr w10,[sp,#16] ldr w11,[sp,#24] - push_v_regs + // STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments stp x19, x20,[sp,#-16]! 
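
The push_v_regs / pop_v_regs replacement in the hunk above is the recurring pattern of this change. Those macros (defined in ihevc_neon_macros.s) spill and reload all of d8-d15 because AAPCS64 requires a callee to preserve the low 64 bits of v8-v15, whereas v0-v7 and v16-v31 are caller-saved. Once every live value is renamed onto caller-saved registers (v8->v3, v10->v5, v11->v7, v12->v19, v14->v21 in this file), the function no longer touches d8-d15 and the SIMD spills can be dropped, leaving only the x19/x20 pair. The sketch below, with hypothetical function names and not taken from the patch, shows the two prologue/epilogue shapes involved. It also illustrates the bus-error comments attached to the d9/d15 stores elsewhere in this change: with SP alignment checking enabled, as it normally is under Android/Linux, AArch64 faults on any load or store that uses a misaligned SP as its base register, so { sub sp,sp,#8; str d9,[sp] } traps, and pairing the live register with an unused one in a single 16-byte stp/ldp keeps SP aligned.

        .text
        .global spill_simd_pair_sketch      // hypothetical name, illustration only
    spill_simd_pair_sketch:
        stp     d8, d9, [sp, #-16]!         // 16-byte pre-decrement keeps SP aligned;
                                            // d8 is a dummy when only d9 is live
        // ... body may clobber v8-v15, since they were saved above ...
        ldp     d8, d9, [sp], #16
        ret

        .global no_simd_spill_sketch        // hypothetical name, illustration only
    no_simd_spill_sketch:
        stp     x19, x20, [sp, #-16]!       // only general-purpose temporaries spilled
        // ... body keeps SIMD work in v0-v7 and v16-v31 (caller-saved),
        //     so no d-register save/restore is needed ...
        ldp     x19, x20, [sp], #16
        ret

Under those assumptions, each function that stops using d8-d15 drops up to four stp/ldp pairs from each of its prologue and epilogue, which is the stack-operation reduction the commit message describes.
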
@@ -111,15 +111,15 @@ ihevc_sao_edge_offset_class0_chroma_av8: ADRP x14, :got:gi1_table_edge_idx //table pointer LDR x14, [x14, #:got_lo12:gi1_table_edge_idx] - movi v8.16b, #0xFF //au1_mask = vdupq_n_s8(-1) + movi v3.16b, #0xFF //au1_mask = vdupq_n_s8(-1) mul x4, x4, x1 //(ht - 1) * src_strd MOV x5, x23 //Loads pi1_sao_offset_v - LD1 {v11.8b},[x8] //offset_tbl = vld1_s8(pi1_sao_offset_u) + LD1 {v7.8b},[x8] //offset_tbl = vld1_s8(pi1_sao_offset_u) ADD x4,x4,x0 //pu1_src[(ht - 1) * src_strd] MOV x6,x0 //pu1_src_org - LD1 {v10.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) + LD1 {v5.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) MOV x12,x9 //Move wd to x12 for loop count SRC_TOP_LOOP: //wd is always multiple of 8 @@ -141,20 +141,20 @@ WIDTH_LOOP_16: CMP x8,x9 //if(col == wd) BNE AU1_MASK_FF //jump to else part LDRB w12,[x7] //pu1_avail[0] - mov v8.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) - mov v8.8b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 1) + mov v3.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) + mov v3.8b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 1) B SKIP_AU1_MASK_FF //Skip the else part AU1_MASK_FF: MOV x12,#-1 //move -1 to x12 - mov v8.4h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v3.4h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) SKIP_AU1_MASK_FF: CMP x8,#16 //If col == 16 BNE SKIP_MASKING_IF_NOT16 //If not skip masking LDRB w12,[x7,#1] //pu1_avail[1] - mov v8.8b[14], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14) - mov v8.8b[15], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v3.8b[14], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14) + mov v3.8b[15], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_MASKING_IF_NOT16: MOV x12,x0 //pu1_src_cpy = pu1_src @@ -162,27 +162,27 @@ SKIP_MASKING_IF_NOT16: PU1_SRC_LOOP: LDRH w11,[x2] //load pu1_src_left since ht - row =0 when it comes first pu1_src_left is incremented later - LD1 {v12.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy) + LD1 {v19.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy) //LD1 {v13.8b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy) //SUB x12, x12,#8 SUB x5,x9,x8 //wd - col SUB x14,x10,x4 //ht - row - mov v14.4h[7], w11 //vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15) + mov v21.4h[7], w11 //vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15) mul x14, x14, x1 //(ht - row) * src_strd LD1 {v30.16b},[x12] //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy) //LD1 {v31.8b},[x12] //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy) //SUB x12, x12,#8 - EXT v14.16b, v14.16b , v12.16b,#14 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14) + EXT v21.16b, v21.16b , v19.16b,#14 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14) SUB x12,x12,x1 LDRH w11,[x2,#2] //II load pu1_src_left since ht - row =0 - cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) ADD x5,x14,x5 //(ht - row) * src_strd + (wd - col) mov v28.4h[7], w11 //II vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15) - cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) LDRH w14,[x6,x5] //pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)] SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) @@ -191,7 +191,7 @@ PU1_SRC_LOOP: LDRB w11,[x12,#16] //pu1_src_cpy[16] 
EXT v28.16b, v28.16b , v30.16b,#14 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14) - mov v14.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) + mov v21.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) LDRB w11,[x12,#17] //pu1_src_cpy[17] @@ -199,62 +199,62 @@ PU1_SRC_LOOP: STRH w14,[x2],#2 //pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)] ADD x12,x12,x1 - mov v14.8b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) + mov v21.8b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) LDRB w11,[x12,#16] //II pu1_src_cpy[16] - EXT v14.16b, v12.16b , v14.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2) + EXT v21.16b, v19.16b , v21.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2) mov v28.8b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) LDRB w11,[x12,#17] //II pu1_src_cpy[17] - cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) SUB x12,x12,x1 - cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) mov v28.8b[1], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) EXT v28.16b, v30.16b , v28.16b,#2 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2) - ADD v14.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left) + ADD v21.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left) - mov v10.d[1],v10.d[0] - ADD v14.16b, v14.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right) - TBL v14.16b, {v10.16b},v14.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + mov v5.d[1],v5.d[0] + ADD v21.16b, v21.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right) + TBL v21.16b, {v5.16b},v21.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) SUB v20.16b, v24.16b , v26.16b //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) // TBL v15.8b, {v10.16b},v15.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) cmhi v24.16b, v28.16b , v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) - AND v14.16b, v14.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) - mov v15.d[0],v14.d[1] - UZP1 v1.8b, v14.8b, v15.8b - UZP2 v15.8b, v14.8b, v15.8b - mov v14.8b, v1.8b + AND v21.16b, v21.16b , v3.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + mov v23.d[0],v21.d[1] + UZP1 v1.8b, v21.8b, v23.8b + UZP2 v23.8b, v21.8b, v23.8b + mov v21.8b, v1.8b //mov v11.d[1],v0.d[0] //mov v14.d[1],v15.d[0] SUB v22.16b, v24.16b , v26.16b //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - TBL v16.8b, {v11.16b},v14.8b //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx)) + TBL v16.8b, {v7.16b},v21.8b //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx)) ADD v24.16b, v2.16b , v20.16b //II edge_idx = vaddq_s8(const_2, sign_left) - Uxtl v18.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - TBL v17.8b, {v0.16b},v15.8b + Uxtl v18.8h, v19.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + TBL v17.8b, {v0.16b},v23.8b ADD v24.16b, v24.16b , 
v22.16b //II edge_idx = vaddq_s8(edge_idx, sign_right) //mov v17.d[0],v16.d[1] ZIP1 v1.8b, v16.8b, v17.8b ZIP2 v17.8b, v16.8b, v17.8b mov v16.8b, v1.8b - TBL v24.16b, {v10.16b},v24.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - Uxtl2 v12.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + TBL v24.16b, {v5.16b},v24.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + Uxtl2 v19.8h, v19.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) //mov v16.d[1],v17.d[0] SADDW v18.8h, v18.8h , v16.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) //TBL v25.8b, {v10.16b},v25.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) SMAX v18.8h, v18.8h , v4.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) - AND v24.16b, v24.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) + AND v24.16b, v24.16b , v3.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) mov v25.d[0],v24.d[1] UMIN v18.8h, v18.8h , v6.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) UZP1 v1.8b, v24.8b, v25.8b @@ -262,16 +262,16 @@ PU1_SRC_LOOP: mov v24.8b, v1.8b //mov v24.d[1],v25.d[0] - SADDW v12.8h, v12.8h , v17.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) - TBL v26.8b, {v11.16b},v24.8b //II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx)) - SMAX v12.8h, v12.8h , v4.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) + SADDW v19.8h, v19.8h , v17.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) + TBL v26.8b, {v7.16b},v24.8b //II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx)) + SMAX v19.8h, v19.8h , v4.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) - UMIN v12.8h, v12.8h , v6.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) + UMIN v19.8h, v19.8h , v6.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) TBL v27.8b, {v0.16b},v25.8b //II - xtn v14.8b, v18.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) + xtn v21.8b, v18.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) //mov v27.d[0],v26.d[1] - xtn v15.8b, v12.8h //vmovn_s16(pi2_tmp_cur_row.val[1]) + xtn v23.8b, v19.8h //vmovn_s16(pi2_tmp_cur_row.val[1]) ZIP1 v1.8b, v26.8b, v27.8b ZIP2 v27.8b, v26.8b, v27.8b //II mov v26.8b, v1.8b @@ -295,7 +295,9 @@ PU1_SRC_LOOP: Uxtl2 v30.8h, v30.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) SADDW v30.8h, v30.8h , v27.8b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) - ST1 {v14.8b, v15.8b},[x12],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) + ST1 {v21.8b},[x12],#8 //vst1q_u8(pu1_src_cpy, pu1_cur_row) + ST1 {v23.8b},[x12],x1 + SUB x12,x12,#8 SMAX v30.8h, v30.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) SUBS x4,x4,#1 //Decrement row by 1 @@ -326,107 +328,107 @@ WIDTH_RESIDUE: CMP x8,x9 //if(wd_rem == wd) BNE AU1_MASK_FF_RESIDUE //jump to else part LDRB w12,[x7] //pu1_avail[0] - mov v8.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) - mov v8.8b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) + mov v3.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) + mov v3.8b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) B SKIP_AU1_MASK_FF_RESIDUE //Skip the else part 
AU1_MASK_FF_RESIDUE: MOV x12,#-1 //move -1 to x12 - mov v8.4h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v3.4h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) SKIP_AU1_MASK_FF_RESIDUE: LDRB w12,[x7,#1] //pu1_avail[1] - mov v8.8b[6], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) - mov v8.8b[7], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v3.8b[6], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v3.8b[7], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) MOV x12,x0 //pu1_src_cpy = pu1_src MOV x4,x10 //move ht to x4 for loop count PU1_SRC_LOOP_RESIDUE: LDRH w11,[x2] //load pu1_src_left - LD1 {v12.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy) + LD1 {v19.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy) //LD1 {v13.8b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy) //SUB x12, x12,#8 SUB x5,x9,#2 //wd - 2 SUB x14,x10,x4 //(ht - row) - mov v14.4h[7], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) + mov v21.4h[7], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) LSL x14,x14,#1 //(ht - row) * 2 LD1 {v30.16b},[x12] //II pu1_cur_row = vld1q_u8(pu1_src_cpy) //LD1 {v31.8b},[x12] //II pu1_cur_row = vld1q_u8(pu1_src_cpy) //SUB x12, x12,#8 - EXT v14.16b, v14.16b , v12.16b,#14 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15) + EXT v21.16b, v21.16b , v19.16b,#14 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15) SUB x12,x12,x1 LDRH w11,[x2,#2] //II load pu1_src_left - cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) mul x14, x14, x1 //(ht - row) * 2 * src_strd - cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) mov v28.4h[7], w11 //II vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) LDRB w11,[x12,#16] //pu1_src_cpy[16] SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) ADD x5,x14,x5 //(ht - row) * 2 * src_strd + (wd - 2) - mov v14.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) + mov v21.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) EXT v28.16b, v28.16b , v30.16b,#14 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15) LDRB w11,[x12,#17] //pu1_src_cpy[17] cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) LDRH w14,[x6, x5] //pu1_src_org[(ht - row) * 2* src_strd + (wd - 2)] - mov v14.8b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) + mov v21.8b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) cmhi v24.16b, v28.16b , v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) ADD x12,x12,x1 STRH w14,[x2],#2 //pu1_src_left[(ht - row) * 2] = au1_src_left_tmp[(ht - row) * 2] - EXT v14.16b, v12.16b , v14.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1) + EXT v21.16b, v19.16b , v21.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1) LDRB w11,[x12,#16] //II pu1_src_cpy[16] - cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) mov v28.8b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) LDRB w11,[x12,#17] //II pu1_src_cpy[17] - cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) + cmhi 
v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) SUB x4,x4,#1 //II Decrement row by 1 SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) mov v28.8b[1], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) SUB x12,x12,x1 - ADD v14.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left) + ADD v21.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left) EXT v28.16b, v30.16b , v28.16b,#2 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1) - ADD v14.16b, v14.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right) + ADD v21.16b, v21.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right) SUB v20.16b, v24.16b , v26.16b //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - TBL v14.16b, {v10.16b},v14.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + TBL v21.16b, {v5.16b},v21.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) cmhi v24.16b, v28.16b , v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) //TBL v15.8b, {v10.16b},v15.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) SUB v22.16b, v24.16b , v26.16b //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - AND v14.16b, v14.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) - mov v15.d[0],v14.d[1] - UZP1 v1.8b, v14.8b, v15.8b - UZP2 v15.8b, v14.8b, v15.8b - mov v14.8b, v1.8b + AND v21.16b, v21.16b , v3.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + mov v23.d[0],v21.d[1] + UZP1 v1.8b, v21.8b, v23.8b + UZP2 v23.8b, v21.8b, v23.8b + mov v21.8b, v1.8b ADD v28.16b, v2.16b , v20.16b //II edge_idx = vaddq_s8(const_2, sign_left) - TBL v16.8b, {v11.16b},v14.8b //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx)) + TBL v16.8b, {v7.16b},v21.8b //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx)) ADD v28.16b, v28.16b , v22.16b //II edge_idx = vaddq_s8(edge_idx, sign_right) - Uxtl v18.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - TBL v17.8b, {v0.16b},v15.8b + Uxtl v18.8h, v19.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + TBL v17.8b, {v0.16b},v23.8b Uxtl v24.8h, v30.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) ZIP1 v1.8b, v16.8b, v17.8b ZIP2 v17.8b, v16.8b, v17.8b mov v16.8b, v1.8b - TBL v28.16b, {v10.16b},v28.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + TBL v28.16b, {v5.16b},v28.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) SADDW v18.8h, v18.8h , v16.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v18.8h, v18.8h , v4.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) @@ -434,7 +436,7 @@ PU1_SRC_LOOP_RESIDUE: UMIN v18.8h, v18.8h , v6.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) xtn v18.8b, v18.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) - AND v28.16b, v28.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) + AND v28.16b, v28.16b , v3.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) mov v29.d[0],v28.d[1] SUB x5,x9,#2 //II wd - 2 UZP1 v1.8b, v28.8b, v29.8b @@ -443,7 +445,7 @@ PU1_SRC_LOOP_RESIDUE: SUB x14,x10,x4 //II (ht - row) LSL x14,x14,#1 //II (ht - row) * 2 - TBL v26.8b, {v11.16b},v28.8b //II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx)) + TBL v26.8b, {v7.16b},v28.8b //II offset = vtbl1_s8(offset_tbl_u, 
vget_low_s8(edge_idx)) mul x14, x14, x1 //II (ht - row) * 2 * src_strd ADD x5,x14,x5 //II (ht - row) * 2 * src_strd + (wd - 2) @@ -474,7 +476,7 @@ END_LOOPS: ldp x23, x24,[sp],#16 ldp x21, x22,[sp],#16 ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_sao_edge_offset_class1.s b/common/arm64/ihevc_sao_edge_offset_class1.s index 8ed6169..515b349 100644 --- a/common/arm64/ihevc_sao_edge_offset_class1.s +++ b/common/arm64/ihevc_sao_edge_offset_class1.s @@ -76,7 +76,7 @@ ihevc_sao_edge_offset_class1_av8: LDR w7,[sp,#8] //Loads wd LDR w8,[sp,#16] //Loads ht - push_v_regs + stp x19, x20,[sp,#-16]! SUB x9,x7,#1 //wd - 1 @@ -128,16 +128,16 @@ WIDTH_LOOP_16: MOV x10,x0 //*pu1_src - LD1 {v8.16b},[x9],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd) - LD1 {v10.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v1.16b},[x9],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd) + LD1 {v3.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src) LD1 {v30.16b},[x12],#16 //vld1q_u8(pu1_src[(ht - 1) * src_strd]) - cmhi v12.16b, v10.16b , v8.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v5.16b, v3.16b , v1.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) ST1 { v30.16b},[x3],#16 //vst1q_u8(pu1_src_top[col]) - cmhi v14.16b, v8.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v1.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v16.16b, v14.16b , v12.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v16.16b, v17.16b , v5.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) MOV x11,x8 //move ht to x11 for loop count PU1_SRC_LOOP: @@ -145,59 +145,59 @@ PU1_SRC_LOOP: LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) ADD x6,x10,x1 //II Iteration *pu1_src + src_strd - cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) LD1 {v30.16b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) - cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row) SUB x10,x10,x1 - SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v20.16b, v17.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) Uxtl v26.8h, v18.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - ADD v12.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v5.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) Uxtl2 v28.8h, v18.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) - ADD v12.16b, v12.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down) + ADD v5.16b, v5.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down) cmhi v22.16b, v18.16b , v30.16b //II vcgtq_u8(pu1_cur_row, pu1_top_row) NEG v16.16b, v20.16b //sign_up = vnegq_s8(sign_down) - TBL v12.16b, {v6.16b},v12.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + TBL v5.16b, {v6.16b},v5.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) cmhi v24.16b, v30.16b , v18.16b //II vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v8.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v1.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) // TBL v13.8b, {v6.16b},v13.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) ADD v22.16b, v0.16b , v16.16b //II edge_idx = vaddq_s8(const_2, sign_up) - NEG 
v16.16b, v8.16b //II sign_up = vnegq_s8(sign_down) - TBL v12.16b, {v7.16b},v12.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - ADD v22.16b, v22.16b , v8.16b //II edge_idx = vaddq_s8(edge_idx, sign_down) + NEG v16.16b, v1.16b //II sign_up = vnegq_s8(sign_down) + TBL v5.16b, {v7.16b},v5.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + ADD v22.16b, v22.16b , v1.16b //II edge_idx = vaddq_s8(edge_idx, sign_down) - Uxtl v20.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v20.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) TBL v22.16b, {v6.16b},v22.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - SADDW v20.8h, v20.8h , v12.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) + SADDW v20.8h, v20.8h , v5.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) // TBL v23.8b, {v6.16b},v23.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) UMIN v20.8h, v20.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - Uxtl2 v8.8h, v10.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v1.8h, v3.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) // TBL v13.8b, {v7.16b},v13.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - mov v10.16b, v30.16b //II pu1_cur_row = pu1_next_row + mov v3.16b, v30.16b //II pu1_cur_row = pu1_next_row - SADDW2 v8.8h, v8.8h , v12.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) + SADDW2 v1.8h, v1.8h , v5.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) TBL v24.16b, {v7.16b},v22.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - SMAX v8.8h, v8.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) + SMAX v1.8h, v1.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) - UMIN v8.8h, v8.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) + UMIN v1.8h, v1.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) // TBL v25.8b, {v7.16b},v23.8b //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) xtn v20.8b, v20.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) SADDW v26.8h, v26.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) - xtn2 v20.16b, v8.8h //vmovn_s16(pi2_tmp_cur_row.val[1]) + xtn2 v20.16b, v1.8h //vmovn_s16(pi2_tmp_cur_row.val[1]) SADDW2 v28.8h, v28.8h , v24.16b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) @@ -220,9 +220,9 @@ PU1_SRC_LOOP: ADD x10,x10,x1 //*pu1_src + src_strd LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) - cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) - cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + SUB v20.16b, v17.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 
SUB x10,x10,x1 ADD v22.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) @@ -231,13 +231,13 @@ PU1_SRC_LOOP: // TBL v23.8b, {v6.16b},v23.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) TBL v24.16b, {v7.16b},v22.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - Uxtl v26.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v26.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SADDW v26.8h, v26.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) // TBL v25.8b, {v7.16b},v23.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - Uxtl2 v28.8h, v10.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v28.8h, v3.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) SADDW2 v28.8h, v28.8h , v24.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) @@ -248,7 +248,7 @@ PU1_SRC_LOOP: ST1 { v30.16b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) PU1_SRC_LOOP_END: - mov v10.16b, v18.16b //pu1_cur_row = pu1_next_row + mov v3.16b, v18.16b //pu1_cur_row = pu1_next_row SUBS x7,x7,#16 //Decrement the wd loop count by 16 CMP x7,#8 //Check whether residue remains BEQ WIDTH_RESIDUE //If residue remains jump to residue loop @@ -264,15 +264,15 @@ WIDTH_RESIDUE: csel x9, x3, x9,NE //*pu1_src_top MOV x10,x0 - LD1 {v8.16b},[x9],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd) - LD1 {v10.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v1.16b},[x9],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd) + LD1 {v3.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src) LD1 {v30.8b},[x12] //vld1_u8(pu1_src[(ht - 1) * src_strd]) ST1 {v30.8b},[x3] //vst1_u8(pu1_src_top[col]) - cmhi v12.16b, v10.16b , v8.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) - cmhi v14.16b, v8.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v16.16b, v14.16b , v12.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v5.16b, v3.16b , v1.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v1.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + SUB v16.16b, v17.16b , v5.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) MOV x11,x8 //move ht to x11 for loop count PU1_SRC_LOOP_RESIDUE: @@ -280,33 +280,33 @@ PU1_SRC_LOOP_RESIDUE: LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) ADD x6,x10,x1 //II Iteration *pu1_src + src_strd - cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row) + cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row) LD1 {v30.16b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) - cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row) + cmhi v17.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row) SUB x10,x10,x1 - SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v20.16b, v17.16b , v5.16b 
//sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) Uxtl v26.8h, v18.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - ADD v12.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v5.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) cmhi v22.16b, v18.16b , v30.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row) - ADD v12.16b, v12.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down) + ADD v5.16b, v5.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down) cmhi v24.16b, v30.16b , v18.16b //II vcltq_u8(pu1_cur_row, pu1_next_row) NEG v16.16b, v20.16b //sign_up = vnegq_s8(sign_down) - TBL v12.8b, {v6.16b},v12.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + TBL v5.8b, {v6.16b},v5.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) SUB v20.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) ADD v22.16b, v0.16b , v16.16b //II edge_idx = vaddq_s8(const_2, sign_up) - TBL v12.8b, {v7.16b},v12.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + TBL v5.8b, {v7.16b},v5.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) NEG v16.16b, v20.16b //II sign_up = vnegq_s8(sign_down) ADD v22.16b, v22.16b , v20.16b //II edge_idx = vaddq_s8(edge_idx, sign_down) - Uxtl v20.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v20.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - SADDW v20.8h, v20.8h , v12.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) + SADDW v20.8h, v20.8h , v5.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) TBL v22.8b, {v6.16b},v22.8b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) @@ -318,7 +318,7 @@ PU1_SRC_LOOP_RESIDUE: SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - mov v10.16b, v30.16b //II pu1_cur_row = pu1_next_row + mov v3.16b, v30.16b //II pu1_cur_row = pu1_next_row ST1 {v20.8b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) xtn v30.8b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[0]) @@ -332,9 +332,9 @@ PU1_SRC_LOOP_RESIDUE: ADD x10,x10,x1 //*pu1_src + src_strd LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) - cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row) - cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row) - SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row) + cmhi v17.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row) + SUB v20.16b, v17.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) SUB x10,x10,x1 ADD v22.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) @@ -342,7 +342,7 @@ PU1_SRC_LOOP_RESIDUE: TBL v22.8b, {v6.16b},v22.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) TBL v24.8b, {v7.16b},v22.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - Uxtl v26.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v26.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SADDW v26.8h, v26.8h , 
v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) @@ -354,7 +354,7 @@ PU1_SRC_LOOP_RESIDUE: END_LOOPS: // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP ldp x19, x20,[sp], #16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_sao_edge_offset_class1_chroma.s b/common/arm64/ihevc_sao_edge_offset_class1_chroma.s index 4baa5bf..894e702 100644 --- a/common/arm64/ihevc_sao_edge_offset_class1_chroma.s +++ b/common/arm64/ihevc_sao_edge_offset_class1_chroma.s @@ -76,7 +76,7 @@ ihevc_sao_edge_offset_class1_chroma_av8: ldr w11,[sp,#24] - push_v_regs + // STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments stp x19, x20,[sp,#-16]! stp x21, x22,[sp,#-16]! @@ -135,7 +135,7 @@ SRC_LEFT_LOOP: LDR x14, [x14, #:got_lo12:gi1_table_edge_idx] LD1 {v6.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) LD1 {v7.8b},[x6] //offset_tbl_u = vld1_s8(pi1_sao_offset_u) - LD1 {v8.8b},[x7] //offset_tbl_v = vld1_s8(pi1_sao_offset_v) + LD1 {v1.8b},[x7] //offset_tbl_v = vld1_s8(pi1_sao_offset_v) CMP x8,#16 //Compare wd with 16 BLT WIDTH_RESIDUE //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case @@ -151,17 +151,17 @@ WIDTH_LOOP_16: LD1 {v28.16b},[x11],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd) //LD1 {v29.8b},[x11],#8 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd) - LD1 {v10.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v3.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src) //LD1 {v11.8b},[x0],#8 //pu1_cur_row = vld1q_u8(pu1_src) LD1 {v30.16b},[x12],#16 //vld1q_u8(pu1_src[(ht - 1) * src_strd]) //LD1 {v31.8b},[x12],#8 //vld1q_u8(pu1_src[(ht - 1) * src_strd]) - cmhi v12.16b, v10.16b , v28.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v5.16b, v3.16b , v28.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) ST1 { v30.16b},[x3],#16 //vst1q_u8(pu1_src_top[col]) - cmhi v14.16b, v28.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v19.16b, v28.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v16.16b, v14.16b , v12.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v16.16b, v19.16b , v5.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) MOV x11,x9 //move ht to x11 for loop count PU1_SRC_LOOP: @@ -172,47 +172,47 @@ PU1_SRC_LOOP: ADD x6,x10,x1 //II Iteration *pu1_src + src_strd //mov v19.d[0],v18.d[1] - cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) LD1 {v30.16b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) //LD1 {v31.8b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) //SUB x6, x6,#8 - cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v19.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row) SUB x10,x10,x1 - SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v20.16b, v19.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) Uxtl v26.8h, v18.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - ADD v12.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v5.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) Uxtl2 v28.8h, v18.16b //II pi2_tmp_cur_row.val[1] = 
vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) - ADD v12.16b, v12.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down) + ADD v5.16b, v5.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down) cmhi v22.16b, v18.16b , v30.16b //II vcgtq_u8(pu1_cur_row, pu1_top_row) mov v16.d[1],v16.d[0] NEG v16.16b, v20.16b //sign_up = vnegq_s8(sign_down) - TBL v12.16b, {v6.16b},v12.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + TBL v5.16b, {v6.16b},v5.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) cmhi v24.16b, v30.16b , v18.16b //II vcltq_u8(pu1_cur_row, pu1_top_row) SUB v28.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) //TBL v13.8b, {v6.16b},v13.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) ADD v22.16b, v0.16b , v16.16b //II edge_idx = vaddq_s8(const_2, sign_up) - mov v13.d[0], v12.d[1] - UZP1 v27.8b, v12.8b, v13.8b - UZP2 v13.8b, v12.8b, v13.8b - mov v12.8b,v27.8b + mov v17.d[0], v5.d[1] + UZP1 v27.8b, v5.8b, v17.8b + UZP2 v17.8b, v5.8b, v17.8b + mov v5.8b,v27.8b NEG v16.16b, v28.16b //II sign_up = vnegq_s8(sign_down) - TBL v12.8b, {v7.16b},v12.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + TBL v5.8b, {v7.16b},v5.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) ADD v22.16b, v22.16b , v28.16b //II edge_idx = vaddq_s8(edge_idx, sign_down) - Uxtl v20.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - TBL v13.8b, {v8.16b},v13.8b - ZIP1 v27.8b, v12.8b, v13.8b - ZIP2 v13.8b, v12.8b, v13.8b - mov v12.8b,v27.8b + Uxtl v20.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + TBL v17.8b, {v1.16b},v17.8b + ZIP1 v27.8b, v5.8b, v17.8b + ZIP2 v17.8b, v5.8b, v17.8b + mov v5.8b,v27.8b - SADDW v20.8h, v20.8h , v12.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) + SADDW v20.8h, v20.8h , v5.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) TBL v22.16b, {v6.16b},v22.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) @@ -224,15 +224,15 @@ PU1_SRC_LOOP: UZP2 v23.8b, v22.8b, v23.8b mov v22.8b,v27.8b - Uxtl2 v28.8h, v10.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v28.8h, v3.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) //VTBL.8 D13,D7,D13 @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - mov v10.16b, v30.16b //II pu1_cur_row = pu1_next_row + mov v3.16b, v30.16b //II pu1_cur_row = pu1_next_row - SADDW v28.8h, v28.8h , v13.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) + SADDW v28.8h, v28.8h , v17.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) TBL v24.8b, {v7.16b},v22.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) - TBL v25.8b, {v8.16b},v23.8b + TBL v25.8b, {v1.16b},v23.8b ZIP1 v27.8b, v24.8b, v25.8b ZIP2 v25.8b, v24.8b, v25.8b mov v24.8b,v27.8b @@ -270,9 +270,9 @@ PU1_SRC_LOOP: LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) //LD1 {v19.8b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) //SUB x10, x10,#8 - cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) - cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v20.16b, v14.16b , v12.16b //sign_down = 
vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v19.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + SUB v20.16b, v19.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) SUB x10,x10,x1 ADD v22.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) @@ -285,19 +285,19 @@ PU1_SRC_LOOP: UZP2 v23.8b, v22.8b, v23.8b mov v22.8b,v27.8b TBL v24.8b, {v7.16b},v22.8b - TBL v25.8b, {v8.16b},v23.8b + TBL v25.8b, {v1.16b},v23.8b ZIP1 v27.8b, v24.8b, v25.8b ZIP2 v25.8b, v24.8b, v25.8b mov v24.8b,v27.8b //VTBL.8 D24,D7,D22 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - Uxtl v26.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v26.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SADDW v26.8h, v26.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) //VTBL.8 D25,D7,D23 @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - Uxtl2 v28.8h, v10.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v28.8h, v3.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) SADDW v28.8h, v28.8h , v25.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) @@ -308,7 +308,7 @@ PU1_SRC_LOOP: ST1 { v30.16b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) PU1_SRC_LOOP_END: - mov v10.16b, v18.16b //pu1_cur_row = pu1_next_row + mov v3.16b, v18.16b //pu1_cur_row = pu1_next_row SUBS x8,x8,#16 //Decrement the wd loop count by 16 CMP x8,#8 //Check whether residue remains BEQ WIDTH_RESIDUE //If residue remains jump to residue loop @@ -326,15 +326,15 @@ WIDTH_RESIDUE: LD1 {v28.16b},[x11] //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd) //LD1 {v29.8b},[x11],#8 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd) - LD1 {v10.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v3.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //LD1 {v11.8b},[x0],#8 //pu1_cur_row = vld1q_u8(pu1_src) LD1 {v30.8b},[x12] //vld1_u8(pu1_src[(ht - 1) * src_strd]) ST1 {v30.8b},[x3] //vst1_u8(pu1_src_top[col]) - cmhi v12.16b, v10.16b , v28.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) - cmhi v14.16b, v28.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v16.16b, v14.16b , v12.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v5.16b, v3.16b , v28.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v19.16b, v28.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + SUB v16.16b, v19.16b , v5.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) MOV x11,x9 //move ht to x11 for loop count PU1_SRC_LOOP_RESIDUE: @@ -344,46 +344,46 @@ PU1_SRC_LOOP_RESIDUE: //SUB x10, x10,#8 ADD x6,x10,x1 //II Iteration *pu1_src + src_strd - cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row) + cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row) LD1 {v30.16b},[x6] //II 
pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) //LD1 {v31.8b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) //SUB x6, x6,#8 - cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row) + cmhi v19.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row) SUB x10,x10,x1 - SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v20.16b, v19.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) Uxtl v26.8h, v18.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - ADD v12.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v5.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) cmhi v22.16b, v18.16b , v30.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row) - ADD v12.16b, v12.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down) + ADD v5.16b, v5.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down) cmhi v24.16b, v30.16b , v18.16b //II vcltq_u8(pu1_cur_row, pu1_next_row) NEG v16.16b, v20.16b //sign_up = vnegq_s8(sign_down) - TBL v12.8b, {v6.16b},v12.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) + TBL v5.8b, {v6.16b},v5.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) SUB v20.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - UZP1 v27.8b, v12.8b, v13.8b - UZP2 v13.8b, v12.8b, v13.8b - mov v12.8b,v27.8b + UZP1 v27.8b, v5.8b, v17.8b + UZP2 v17.8b, v5.8b, v17.8b + mov v5.8b,v27.8b ADD v22.16b, v0.16b , v16.16b //II edge_idx = vaddq_s8(const_2, sign_up) - TBL v12.8b, {v7.16b},v12.8b + TBL v5.8b, {v7.16b},v5.8b NEG v16.16b, v20.16b //II sign_up = vnegq_s8(sign_down) - TBL v13.8b, {v8.16b},v13.8b - ZIP1 v27.8b, v12.8b, v13.8b - ZIP2 v13.8b, v12.8b, v13.8b - mov v12.8b,v27.8b + TBL v17.8b, {v1.16b},v17.8b + ZIP1 v27.8b, v5.8b, v17.8b + ZIP2 v17.8b, v5.8b, v17.8b + mov v5.8b,v27.8b //VTBL.8 D12,D7,D12 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) ADD v22.16b, v22.16b , v20.16b //II edge_idx = vaddq_s8(edge_idx, sign_down) - Uxtl v20.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v20.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - SADDW v20.8h, v20.8h , v12.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) + SADDW v20.8h, v20.8h , v5.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) TBL v22.8b, {v6.16b},v22.8b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) @@ -395,7 +395,7 @@ PU1_SRC_LOOP_RESIDUE: TBL v24.8b, {v7.16b},v22.8b xtn v20.8b, v20.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) - TBL v25.8b, {v8.16b},v23.8b + TBL v25.8b, {v1.16b},v23.8b ZIP1 v27.8b, v24.8b, v25.8b ZIP2 v25.8b, v24.8b, v25.8b mov v24.8b,v27.8b @@ -405,7 +405,7 @@ PU1_SRC_LOOP_RESIDUE: SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - mov v10.16b, v30.16b //II pu1_cur_row = pu1_next_row + mov v3.16b, v30.16b //II pu1_cur_row = pu1_next_row ST1 {v20.8b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) xtn v30.8b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[0]) @@ -421,9 +421,9 @@ PU1_SRC_LOOP_RESIDUE: LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) //LD1 
{v19.8b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
//SUB x10, x10,#8
- cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
- cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
- SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
+ cmhi v19.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
+ SUB v20.16b, v19.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
SUB x10,x10,x1
ADD v22.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
@@ -435,13 +435,13 @@ PU1_SRC_LOOP_RESIDUE:
mov v22.8b,v27.8b
TBL v24.8b, {v7.16b},v22.8b
- TBL v25.8b, {v8.16b},v23.8b
+ TBL v25.8b, {v1.16b},v23.8b
ZIP1 v27.8b, v24.8b, v25.8b
ZIP2 v25.8b, v24.8b, v25.8b
mov v24.8b,v27.8b
//VTBL.8 D24,D7,D22 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
- Uxtl v26.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ Uxtl v26.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
SADDW v26.8h, v26.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
@@ -456,7 +456,7 @@ END_LOOPS:
ldp x23, x24,[sp],#16
ldp x21, x22,[sp],#16
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_sao_edge_offset_class2.s b/common/arm64/ihevc_sao_edge_offset_class2.s
index 3350e5c..31852f3 100644
--- a/common/arm64/ihevc_sao_edge_offset_class2.s
+++ b/common/arm64/ihevc_sao_edge_offset_class2.s
@@ -79,7 +79,7 @@ ihevc_sao_edge_offset_class2_av8:
MOV x16,x7 // wd
MOV x17,x8 // ht
- push_v_regs
+ stp x19, x20,[sp,#-16]!
stp x21, x22,[sp,#-16]!
stp x23, x24,[sp,#-16]!
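(The prologue/epilogue hunks above and below all apply the same mechanical pattern: the push_v_regs/pop_v_regs macros, which saved and restored SIMD registers wholesale, are replaced by stp/ldp of only the general-purpose callee-saved pairs each function actually touches, and the bare "+" lines mark where pop_v_regs used to be. Because the vector temporaries are renamed into caller-saved registers, nothing in v8-v15 — whose lower halves d8-d15 are callee-saved under AAPCS64 — has to be stacked. A condensed sketch of the resulting frame handling; the label and the exact register pairs are illustrative:)

ihevc_example_av8:                  // illustrative label
    stp x19, x20,[sp,#-16]!         // spill only the GPR pairs the function uses
    stp x21, x22,[sp,#-16]!
    //  ... body: SIMD temporaries live in caller-saved v0-v7 / v16-v31,
    //  so d8-d15 never need a stack slot
    ldp x21, x22,[sp],#16           // restore in reverse order
    ldp x19, x20,[sp],#16
    ret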
@@ -218,7 +218,7 @@ PU1_AVAIL: csel x12, x20, x12,EQ MOV x6,x7 //move wd to x6 loop_count - movi v8.16b, #0xFF //au1_mask = vdupq_n_s8(-1) + movi v1.16b, #0xFF //au1_mask = vdupq_n_s8(-1) ADD x20,x14,#1 //pu1_src_left_cpy += 1 csel x14, x20, x14,EQ @@ -239,11 +239,11 @@ WIDTH_LOOP_16: MOV x20,#-1 csel x8, x20, x8,NE //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0) CMP x6,#16 //if(col == 16) BNE SKIP_AU1_MASK_VAL LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_AU1_MASK_VAL: LDRB w11,[x5,#2] //pu1_avail[2] @@ -255,23 +255,23 @@ SKIP_AU1_MASK_VAL: SUB x8,x8,#1 //pu1_src_top_cpy - 1 || pu1_src - src_strd - 1 MOV x7,x16 //Loads wd - LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1) + LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1) ADD x3,x3,#16 ADD x5,sp,#0x42 //*au1_src_left_tmp - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) MOV x4,x17 //Loads ht SUB x7,x7,x6 //(wd - col) - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) MOV x8,x19 //Loads *pu1_src ADD x7,x7,#15 //15 + (wd - col) - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)] SUB x5,x5,#1 - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) AU1_SRC_LEFT_LOOP: LDRB w8,[x7] //load the value and increment by src_strd @@ -307,36 +307,36 @@ SIGN_UP_CHANGE: csel x4, x20, x4,LT //I MOV x20,#1 csel x4, x20, x4,GT //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]) - mov v14.8b[0], w4 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) + mov v17.8b[0], w4 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) SIGN_UP_CHANGE_DONE: - cmhi v10.16b, v12.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - ADD v24.16b, v0.16b , v14.16b //I edge_idx = vaddq_s8(const_2, sign_up) + cmhi v3.16b, v5.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + ADD v24.16b, v0.16b , v17.16b //I edge_idx = vaddq_s8(const_2, sign_up) - cmhi v18.16b, v18.16b , v12.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp) - SUB v10.16b, v18.16b , v10.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v18.16b, v18.16b , v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + SUB v3.16b, v18.16b , v3.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v24.16b, v24.16b , v10.16b //I edge_idx = vaddq_s8(edge_idx, sign_down) + ADD v24.16b, v24.16b , v3.16b //I edge_idx = vaddq_s8(edge_idx, sign_down) TBL v18.16b, {v6.16b},v24.16b //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) // TBL v19.8b, {v6.16b},v25.8b //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - AND v18.16b, v18.16b , v8.16b //I edge_idx = vandq_s8(edge_idx, au1_mask) + AND v18.16b, v18.16b , v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask) - 
NEG v14.16b, v10.16b //I sign_up = vnegq_s8(sign_down) - TBL v10.16b, {v7.16b},v18.16b //I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - EXT v14.16b, v14.16b , v14.16b,#15 //I sign_up = vextq_s8(sign_up, sign_up, 15) + NEG v17.16b, v3.16b //I sign_up = vnegq_s8(sign_down) + TBL v3.16b, {v7.16b},v18.16b //I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + EXT v17.16b, v17.16b , v17.16b,#15 //I sign_up = vextq_s8(sign_up, sign_up, 15) - Uxtl v20.8h, v12.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v20.8h, v5.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) // TBL v11.8b, {v7.16b},v19.8b //I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - SADDW v20.8h, v20.8h , v10.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) + SADDW v20.8h, v20.8h , v3.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v20.8h, v20.8h , v2.8h //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) - Uxtl2 v22.8h, v12.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v22.8h, v5.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) UMIN v20.8h, v20.8h , v4.8h //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - mov v12.16b, v16.16b //I pu1_cur_row = pu1_next_row + mov v5.16b, v16.16b //I pu1_cur_row = pu1_next_row - SADDW2 v22.8h, v22.8h , v10.16b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) + SADDW2 v22.8h, v22.8h , v3.16b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) xtn v20.8b, v20.8h //I vmovn_s16(pi2_tmp_cur_row.val[0]) SMAX v22.8h, v22.8h , v2.8h //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) @@ -374,12 +374,12 @@ PU1_SRC_LOOP: EXT v18.16b, v30.16b , v18.16b,#1 //III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1) LDRB w2,[x0,x1] //III pu1_src_cpy[0] - cmhi v24.16b, v12.16b , v22.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v24.16b, v5.16b , v22.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) SUB x5,x12,x7 //III ht_tmp - row movn x20,#0 csel x4, x20, x4,LT //II - cmhi v22.16b, v22.16b , v12.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v22.16b , v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp) ADD x5,x14,x5 //III pu1_src_left_cpy[ht_tmp - row] MOV x20,#1 @@ -389,52 +389,52 @@ PU1_SRC_LOOP: LDRB w5,[x5] //III load the value SUBS x2,x2,x5 //III pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row] - mov v14.8b[0], w4 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) + mov v17.8b[0], w4 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) movn x20,#0 csel x2, x20, x2,LT //III - cmhi v10.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v3.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) MOV x20,#1 csel x2, x20, x2,GT //III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]) - ADD v22.16b, v0.16b , v14.16b //II edge_idx = vaddq_s8(const_2, sign_up) + ADD v22.16b, v0.16b , v17.16b //II edge_idx = vaddq_s8(const_2, sign_up) ADD v22.16b, v22.16b , v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down) cmhi v18.16b, v18.16b , v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp) TBL 
v22.16b, {v6.16b},v22.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - NEG v14.16b, v24.16b //II sign_up = vnegq_s8(sign_down) + NEG v17.16b, v24.16b //II sign_up = vnegq_s8(sign_down) - SUB v10.16b, v18.16b , v10.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v3.16b, v18.16b , v3.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) // TBL v23.8b, {v6.16b},v23.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - EXT v14.16b, v14.16b , v14.16b,#15 //II sign_up = vextq_s8(sign_up, sign_up, 15) + EXT v17.16b, v17.16b , v17.16b,#15 //II sign_up = vextq_s8(sign_up, sign_up, 15) - AND v22.16b, v22.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) - mov v14.8b[0], w2 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) + AND v22.16b, v22.16b , v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) + mov v17.8b[0], w2 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) - ADD v18.16b, v0.16b , v14.16b //III edge_idx = vaddq_s8(const_2, sign_up) + ADD v18.16b, v0.16b , v17.16b //III edge_idx = vaddq_s8(const_2, sign_up) TBL v24.16b, {v7.16b},v22.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - ADD v18.16b, v18.16b , v10.16b //III edge_idx = vaddq_s8(edge_idx, sign_down) + ADD v18.16b, v18.16b , v3.16b //III edge_idx = vaddq_s8(edge_idx, sign_down) - Uxtl v26.8h, v12.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v26.8h, v5.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) TBL v18.16b, {v6.16b},v18.16b //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - NEG v14.16b, v10.16b //III sign_up = vnegq_s8(sign_down) + NEG v17.16b, v3.16b //III sign_up = vnegq_s8(sign_down) SADDW v26.8h, v26.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) // TBL v19.8b, {v6.16b},v19.8b //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - EXT v14.16b, v14.16b , v14.16b,#15 //III sign_up = vextq_s8(sign_up, sign_up, 15) + EXT v17.16b, v17.16b , v17.16b,#15 //III sign_up = vextq_s8(sign_up, sign_up, 15) - AND v18.16b, v18.16b , v8.16b //III edge_idx = vandq_s8(edge_idx, au1_mask) + AND v18.16b, v18.16b , v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask) Uxtl v20.8h, v16.8b //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) - TBL v10.16b, {v7.16b},v18.16b //III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - SADDW v20.8h, v20.8h , v10.8b //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) + TBL v3.16b, {v7.16b},v18.16b //III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + SADDW v20.8h, v20.8h , v3.8b //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) // TBL v25.8b, {v7.16b},v23.8b //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) SMAX v20.8h, v20.8h , v2.8h //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) - Uxtl2 v28.8h, v12.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v28.8h, v5.16b //II pi2_tmp_cur_row.val[1] = 
vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) UMIN v20.8h, v20.8h , v4.8h //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) SADDW2 v28.8h, v28.8h , v24.16b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) @@ -444,11 +444,11 @@ PU1_SRC_LOOP: UMIN v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) Uxtl2 v18.8h, v16.16b //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) - mov v12.16b, v30.16b //III pu1_cur_row = pu1_next_row + mov v5.16b, v30.16b //III pu1_cur_row = pu1_next_row xtn v26.8b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[0]) xtn2 v26.16b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[1]) - SADDW2 v18.8h, v18.8h , v10.16b //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) + SADDW2 v18.8h, v18.8h , v3.16b //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) SMAX v18.8h, v18.8h , v2.8h //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) xtn v20.8b, v20.8h //III vmovn_s16(pi2_tmp_cur_row.val[0]) @@ -480,45 +480,45 @@ PU1_SRC_LOOP: EXT v18.16b, v16.16b , v18.16b,#1 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1) SUBS x4,x2,x5 //pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row] - cmhi v10.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v3.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) movn x20,#0 csel x4, x20, x4,LT MOV x20,#1 csel x4, x20, x4,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]) - cmhi v18.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v18.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) - mov v14.8b[0], w4 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) - SUB v10.16b, v18.16b , v10.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + mov v17.8b[0], w4 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) + SUB v3.16b, v18.16b , v3.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v18.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) - ADD v18.16b, v18.16b , v10.16b //edge_idx = vaddq_s8(edge_idx, sign_down) + ADD v18.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v18.16b, v18.16b , v3.16b //edge_idx = vaddq_s8(edge_idx, sign_down) TBL v18.16b, {v6.16b},v18.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - NEG v14.16b, v10.16b //sign_up = vnegq_s8(sign_down) + NEG v17.16b, v3.16b //sign_up = vnegq_s8(sign_down) // TBL v19.8b, {v6.16b},v19.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - EXT v14.16b, v14.16b , v14.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15) + EXT v17.16b, v17.16b , v17.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15) - AND v18.16b, v18.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + AND v18.16b, v18.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) - TBL v10.16b, {v7.16b},v18.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + TBL v3.16b, {v7.16b},v18.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - Uxtl v20.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v20.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) // TBL 
v11.8b, {v7.16b},v19.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - SADDW v20.8h, v20.8h , v10.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) + SADDW v20.8h, v20.8h , v3.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) - Uxtl2 v12.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v5.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) UMIN v20.8h, v20.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - SADDW2 v12.8h, v12.8h , v10.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) + SADDW2 v5.8h, v5.8h , v3.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) - SMAX v12.8h, v12.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) + SMAX v5.8h, v5.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) xtn v20.8b, v20.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) - UMIN v12.8h, v12.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) - xtn2 v20.16b, v12.8h //vmovn_s16(pi2_tmp_cur_row.val[1]) + UMIN v5.8h, v5.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) + xtn2 v20.16b, v5.8h //vmovn_s16(pi2_tmp_cur_row.val[1]) INNER_LOOP_DONE: @@ -556,11 +556,11 @@ WD_16_HT_4_LOOP: MOV x20,#-1 csel x8, x20, x8,NE //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0) CMP x6,#16 //if(col == 16) BNE SKIP_AU1_MASK_VAL_WD_16_HT_4 LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_AU1_MASK_VAL_WD_16_HT_4: LDRB w8,[x5,#2] //pu1_avail[2] @@ -572,23 +572,23 @@ SKIP_AU1_MASK_VAL_WD_16_HT_4: SUB x8,x8,#1 //pu1_src_top_cpy - 1 || pu1_src - src_strd - 1 MOV x7,x16 //Loads wd - LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1) + LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1) ADD x3,x3,#16 ADD x5,sp,#0x42 //*au1_src_left_tmp - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) MOV x4,x17 //Loads ht SUB x7,x7,x6 //(wd - col) - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) MOV x8,x19 //Loads *pu1_src ADD x7,x7,#15 //15 + (wd - col) - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)] SUB x5,x5,#1 - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) AU1_SRC_LEFT_LOOP_WD_16_HT_4: LDRB w8,[x7] //load the value and increment by src_strd @@ -626,31 +626,31 @@ SIGN_UP_CHANGE_WD_16_HT_4: csel x8, x20, x8,LT MOV x20,#1 csel x8, x20, 
x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]) - mov v14.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) + mov v17.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) SIGN_UP_CHANGE_DONE_WD_16_HT_4: - cmhi v20.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v22.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) SUB v24.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down) TBL v26.16b, {v6.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) // TBL v27.8b, {v6.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) - NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down) - EXT v14.16b, v14.16b , v14.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15) + NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down) + EXT v17.16b, v17.16b , v17.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15) TBL v24.16b, {v7.16b},v26.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) // TBL v25.8b, {v7.16b},v27.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - Uxtl2 v30.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v30.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) SADDW2 v30.8h, v30.8h , v24.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) SMAX v30.8h, v30.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) UMIN v30.8h, v30.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) @@ -660,7 +660,7 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4: ST1 { v28.16b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) - mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row + mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1 BNE PU1_SRC_LOOP_WD_16_HT_4 //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4 @@ -689,14 +689,14 @@ WIDTH_RESIDUE: MOV x20,#-1 csel x8, x20, x8,NE - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.8b[7], w8 //au1_mask = 
vsetq_lane_s8(pu1_avail[1], au1_mask, 15) PU1_AVAIL_2_RESIDUE: LDRB w11,[x5,#2] //pu1_avail[2] - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) CMP x11,#0 SUB x20,x0,x1 //pu1_src - src_strd @@ -706,19 +706,19 @@ PU1_AVAIL_2_RESIDUE: SUB x8,x8,#1 ADD x5,sp,#0x42 //*au1_src_left_tmp - LD1 {v10.16b},[x8],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1) + LD1 {v3.16b},[x8],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1) MOV x7,x16 //Loads wd MOV x4,x17 //Loads ht - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) SUB x7,x7,#1 //(wd - 1) MOV x8,x19 //Loads *pu1_src - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) SUB x5,x5,#1 ADD x7,x8,x7 //pu1_src[0 * src_strd + (wd - 1)] - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) AU1_SRC_LEFT_LOOP_RESIDUE: @@ -759,25 +759,25 @@ SIGN_UP_CHANGE_RESIDUE: csel x8, x20, x8,LT MOV x20,#1 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]) - mov v14.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) + mov v17.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) SIGN_UP_CHANGE_DONE_RESIDUE: - cmhi v20.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v22.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) SUB v24.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down) TBL v26.16b, {v6.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) // TBL v27.8b, {v6.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) - NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down) - EXT v14.16b, v14.16b , v14.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15) + NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down) + EXT v17.16b, v17.16b , v17.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15) TBL v24.8b, {v7.16b},v26.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) @@ -785,7 +785,7 @@ SIGN_UP_CHANGE_DONE_RESIDUE: xtn v30.8b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) ST1 {v30.8b},[x0],x1 //vst1q_u8(pu1_src_cpy, 
pu1_cur_row) - mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row + mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row SUBS x7,x7,#1 BNE PU1_SRC_LOOP_RESIDUE @@ -839,7 +839,7 @@ END_LOOPS: ldp x23, x24,[sp],#16 ldp x21, x22,[sp],#16 ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_sao_edge_offset_class2_chroma.s b/common/arm64/ihevc_sao_edge_offset_class2_chroma.s index 2fa7c22..8e286b4 100644 --- a/common/arm64/ihevc_sao_edge_offset_class2_chroma.s +++ b/common/arm64/ihevc_sao_edge_offset_class2_chroma.s @@ -76,7 +76,7 @@ ihevc_sao_edge_offset_class2_chroma_av8: ldr x9,[sp,#8] ldr w10,[sp,#16] ldr w11,[sp,#24] - push_v_regs + // STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments @@ -322,7 +322,7 @@ PU1_AVAIL_3_LOOP: LDR x2, [x2, #:got_lo12:gi1_table_edge_idx] MOV x6,x7 //move wd to x6 loop_count - movi v8.16b, #0XFF //au1_mask = vdupq_n_s8(-1) + movi v1.16b, #0XFF //au1_mask = vdupq_n_s8(-1) CMP x7,#16 //Compare wd with 16 BLT WIDTH_RESIDUE //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case @@ -338,19 +338,19 @@ WIDTH_LOOP_16: MOV x20,#-1 csel x8, x20, x8,NE - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) CMP x6,#16 //if(col == 16) - mov v8.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) BNE SKIP_AU1_MASK_VAL LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) - mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_AU1_MASK_VAL: LDRB w9,[x5,#2] //pu1_avail[2] - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //SUB x0, x0,#8 CMP x9,#0 @@ -366,17 +366,17 @@ SKIP_AU1_MASK_VAL: ADD x3,x3,#16 ADD x5,sp,#0x4B //*au1_src_left_tmp - LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2) + LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2) //LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2) //SUB x8, x8,#8 SUB x7,x7,x6 //(wd - col) ADD x7,x7,#14 //15 + (wd - col) - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) mov x8, x26 //Loads *pu1_src ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)] - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) AU1_SRC_LEFT_LOOP: LDRH w8,[x7] //load the value and increment by src_strd @@ -388,7 +388,7 @@ AU1_SRC_LEFT_LOOP: BNE AU1_SRC_LEFT_LOOP ADD x8,x0,x1 //I *pu1_src + src_strd - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) MOV x7,x12 //row count, move ht_tmp to x7 LD1 {v16.16b},[x8] //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) @@ -430,35 +430,35 @@ AU1_SRC_LEFT_LOOP: csel x8, x20, x8,GT //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) CMP x4,#0 //I - mov v14.8b[0], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp 
- 1 - row) * 2]), sign_up, 0) + mov v17.8b[0], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) movn x20,#0 csel x4, x20, x4,LT //I MOV x20,#1 csel x4, x20, x4,GT //I SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) - mov v14.8b[1], w4 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) + mov v17.8b[1], w4 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) SIGN_UP_CHANGE_DONE: LD1 {v30.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) - cmhi v20.16b, v12.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v20.16b, v5.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v22.16b, v18.16b , v12.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v18.16b , v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp) SUB v22.16b, v22.16b , v20.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v18.16b, v0.16b , v14.16b //I edge_idx = vaddq_s8(const_2, sign_up) + ADD v18.16b, v0.16b , v17.16b //I edge_idx = vaddq_s8(const_2, sign_up) ADD v18.16b, v18.16b , v22.16b //I edge_idx = vaddq_s8(edge_idx, sign_down) TBL v18.16b, {v30.16b},v18.16b //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - NEG v14.16b, v22.16b //I sign_up = vnegq_s8(sign_down) + NEG v17.16b, v22.16b //I sign_up = vnegq_s8(sign_down) //TBL v19.8b, {v30.16b},v19.8b //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - EXT v14.16b, v14.16b , v14.16b,#14 //I sign_up = vextq_s8(sign_up, sign_up, 14) + EXT v17.16b, v17.16b , v17.16b,#14 //I sign_up = vextq_s8(sign_up, sign_up, 14) - Uxtl v20.8h, v12.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - AND v22.16b, v18.16b , v8.16b //I edge_idx = vandq_s8(edge_idx, au1_mask) + Uxtl v20.8h, v5.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + AND v22.16b, v18.16b , v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask) mov v23.d[0],v22.d[1] - Uxtl2 v18.8h, v12.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v18.8h, v5.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) UZP1 v31.8b, v22.8b, v23.8b UZP2 v23.8b, v22.8b, v23.8b //I mov v22.8b,v31.8b @@ -469,7 +469,7 @@ SIGN_UP_CHANGE_DONE: ZIP2 v23.8b, v22.8b, v23.8b //I mov v22.8b,v31.8b - mov v12.16b, v16.16b //I pu1_cur_row = pu1_next_row + mov v5.16b, v16.16b //I pu1_cur_row = pu1_next_row SADDW v20.8h, v20.8h , v22.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v20.8h, v20.8h , v2.8h //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) @@ -521,17 +521,17 @@ PU1_SRC_LOOP: movn x20,#0 csel x8, x20, x8,LT //II - cmhi v22.16b, v12.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v5.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) MOV x20,#1 csel x8, x20, x8,GT //II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) sub x13,x9,#1 LDRB w5,[x13] //II load the value - mov v14.8b[0], w8 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) + mov v17.8b[0], w8 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) SUB x7,x7,#1 //II Decrement the ht_tmp loop count by 1 SUB x11,x11,x5 //II pu1_src_cpy[1] - 
pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1] - cmhi v24.16b, v28.16b , v12.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v24.16b, v28.16b , v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp) CMP x11,#0 //II movn x20,#0 @@ -545,11 +545,11 @@ PU1_SRC_LOOP: SUB x5,x12,x7 //III ht_tmp - row ADD x10,x0,x1 - mov v14.8b[1], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) + mov v17.8b[1], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) LSL x5,x5,#1 //III (ht_tmp - row) * 2 ADD x9,x14,x5 //III pu1_src_left_cpy[(ht_tmp - row) * 2] - ADD v26.16b, v0.16b , v14.16b //II edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //II edge_idx = vaddq_s8(const_2, sign_up) LDRB w10,[x10,#1] //III pu1_src_cpy[0] sub x13,x9,#2 @@ -562,24 +562,24 @@ PU1_SRC_LOOP: sub x13,x9,#1 LDRB w9,[x13] //III load the value TBL v26.16b, {v22.16b},v26.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - NEG v14.16b, v24.16b //II sign_up = vnegq_s8(sign_down) + NEG v17.16b, v24.16b //II sign_up = vnegq_s8(sign_down) movn x20,#0 csel x4, x20, x4,LT //III SUB x10,x10,x9 //III pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1] //TBL v27.8b, {v22.16b},v27.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - EXT v14.16b, v14.16b , v14.16b,#14 //II sign_up = vextq_s8(sign_up, sign_up, 14) + EXT v17.16b, v17.16b , v17.16b,#14 //II sign_up = vextq_s8(sign_up, sign_up, 14) MOV x20,#1 csel x4, x20, x4,GT //III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) - AND v26.16b, v26.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) + AND v26.16b, v26.16b , v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) CMP x10,#0 //III mov v27.d[0],v26.d[1] UZP1 v31.8b, v26.8b, v27.8b UZP2 v27.8b, v26.8b, v27.8b //II mov v26.8b,v31.8b - mov v14.8b[0], w4 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) + mov v17.8b[0], w4 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) movn x20,#0 csel x10, x20, x10,LT //III @@ -592,13 +592,13 @@ PU1_SRC_LOOP: TBL v25.8b, {v7.16b},v27.8b //II SUB v22.16b, v22.16b , v20.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - mov v14.8b[1], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) + mov v17.8b[1], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) ZIP1 v31.8b, v24.8b, v25.8b ZIP2 v25.8b, v24.8b, v25.8b //II mov v24.8b,v31.8b - Uxtl v28.8h, v12.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - ADD v18.16b, v0.16b , v14.16b //III edge_idx = vaddq_s8(const_2, sign_up) + Uxtl v28.8h, v5.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + ADD v18.16b, v0.16b , v17.16b //III edge_idx = vaddq_s8(const_2, sign_up) LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) SADDW v28.8h, v28.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) @@ -608,13 +608,13 @@ PU1_SRC_LOOP: UMIN v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) TBL v18.16b, {v20.16b},v18.16b //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - NEG v14.16b, v22.16b 
//III sign_up = vnegq_s8(sign_down) + NEG v17.16b, v22.16b //III sign_up = vnegq_s8(sign_down) //TBL v19.8b, {v20.16b},v19.8b //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - EXT v14.16b, v14.16b , v14.16b,#14 //III sign_up = vextq_s8(sign_up, sign_up, 14) + EXT v17.16b, v17.16b , v17.16b,#14 //III sign_up = vextq_s8(sign_up, sign_up, 14) - Uxtl2 v26.8h, v12.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) - AND v18.16b, v18.16b , v8.16b //III edge_idx = vandq_s8(edge_idx, au1_mask) + Uxtl2 v26.8h, v5.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + AND v18.16b, v18.16b , v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask) mov v19.d[0],v18.d[1] UZP1 v31.8b, v18.8b, v19.8b @@ -623,7 +623,7 @@ PU1_SRC_LOOP: TBL v22.8b, {v6.16b},v18.8b //III SADDW v26.8h, v26.8h , v25.8b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) - mov v12.16b, v30.16b //III pu1_cur_row = pu1_next_row + mov v5.16b, v30.16b //III pu1_cur_row = pu1_next_row TBL v23.8b, {v7.16b},v19.8b //III SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) @@ -686,35 +686,35 @@ PU1_SRC_LOOP: LD1 {v30.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) LDRB w11,[x0,#1] //pu1_src_cpy[0] - mov v14.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) + mov v17.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) sub x13,x9,#1 LDRB w5,[x13] //load the value SUB x4,x11,x5 //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1] - cmhi v22.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) CMP x4,#0 movn x20,#0 csel x4, x20, x4,LT - cmhi v24.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v24.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) MOV x20,#1 csel x4, x20, x4,GT //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) - mov v14.8b[1], w4 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) + mov v17.8b[1], w4 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down) mov v30.d[1],v30.d[0] TBL v26.16b, {v30.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) //TBL v27.8b, {v30.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - Uxtl v20.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + Uxtl v20.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) mov v27.d[0],v26.d[1] - Uxtl2 v18.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v18.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) UZP1 v31.8b, v26.8b, v27.8b UZP2 v27.8b, 
v26.8b, v27.8b mov v26.8b,v31.8b @@ -771,14 +771,14 @@ WD_16_HT_4_LOOP: MOV x20,#-1 csel x8, x20, x8,NE - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) - mov v8.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) CMP x6,#16 //if(col == 16) BNE SKIP_AU1_MASK_VAL_WD_16_HT_4 LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) - mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_AU1_MASK_VAL_WD_16_HT_4: LDRB w8,[x5,#2] //pu1_avail[2] @@ -788,7 +788,7 @@ SKIP_AU1_MASK_VAL_WD_16_HT_4: csel x8, x20, x8,EQ csel x8, x3, x8,NE //pu1_src_top_cpy SUB x8,x8,#2 //pu1_src - src_strd - 2 - LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2) + LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2) //LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2) //SUB x8, x8,#8 @@ -809,13 +809,13 @@ AU1_SRC_LEFT_LOOP_WD_16_HT_4: SUBS x4,x4,#1 //decrement the loop count BNE AU1_SRC_LEFT_LOOP_WD_16_HT_4 - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //SUB x0, x0,#8 - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) movi v18.16b, #0 MOV x7,x12 //row count, move ht_tmp to x7 @@ -851,7 +851,7 @@ SIGN_UP_CHANGE_WD_16_HT_4: csel x8, x20, x8,LT MOV x20,#1 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) - mov v14.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) + mov v17.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) LDRB w8,[x0,#1] //pu1_src_cpy[0] sub x13,x9,#1 @@ -862,25 +862,25 @@ SIGN_UP_CHANGE_WD_16_HT_4: csel x8, x20, x8,LT MOV x20,#1 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) - mov v14.8b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) + mov v17.8b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) SIGN_UP_CHANGE_DONE_WD_16_HT_4: - cmhi v22.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v24.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v24.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //edge_idx = 
vaddq_s8(const_2, sign_up) ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down) LD1 {v22.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) TBL v26.16b, {v22.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) //TBL v27.8b, {v22.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) mov v27.d[0],v26.d[1] - NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down) - EXT v14.16b, v14.16b , v14.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14) + NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down) + EXT v17.16b, v17.16b , v17.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14) UZP1 v31.8b, v26.8b, v27.8b UZP2 v27.8b, v26.8b, v27.8b @@ -891,12 +891,12 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4: ZIP2 v25.8b, v24.8b, v25.8b mov v24.8b,v31.8b - Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - Uxtl2 v26.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v26.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) SADDW v26.8h, v26.8h , v25.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) @@ -906,7 +906,7 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4: ST1 { v28.16b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) - mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row + mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1 BNE PU1_SRC_LOOP_WD_16_HT_4 //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4 @@ -936,12 +936,12 @@ WIDTH_RESIDUE: MOV x20,#-1 csel x8, x20, x8,NE - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) - mov v8.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.8b[6], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) - mov v8.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.8b[6], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) LDRB w8,[x5,#2] //pu1_avail[2] CMP x8,#0 @@ -950,7 +950,7 @@ WIDTH_RESIDUE: csel x8, x20, x8,EQ csel x8, x3, x8,NE SUB x8,x8,#2 //pu1_src - src_strd - 2 - LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) + LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) //LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) //SUB x8, x8,#8 @@ -968,13 +968,13 @@ AU1_SRC_LEFT_LOOP_RESIDUE: SUBS x4,x4,#1 //decrement the loop count BNE AU1_SRC_LEFT_LOOP_RESIDUE - LD1 
{v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //SUB x0, x0,#8 - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) MOV x7,x12 //row count, move ht_tmp to x7 PU1_SRC_LOOP_RESIDUE: @@ -1009,7 +1009,7 @@ SIGN_UP_CHANGE_RESIDUE: csel x8, x20, x8,LT MOV x20,#1 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) - mov v14.8b[0], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) + mov v17.8b[0], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) LDRB w8,[x0,#1] //pu1_src_cpy[0] sub x13,x9,#1 @@ -1020,14 +1020,14 @@ SIGN_UP_CHANGE_RESIDUE: csel x8, x20, x8,LT MOV x20,#1 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) - mov v14.8b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) + mov v17.8b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) SIGN_UP_CHANGE_DONE_RESIDUE: - cmhi v22.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v24.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v24.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down) LD1 {v22.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) @@ -1035,11 +1035,11 @@ SIGN_UP_CHANGE_DONE_RESIDUE: TBL v26.16b, {v22.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) //TBL v27.8b, {v22.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) mov v27.d[0],v26.d[1] - NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down) - EXT v14.16b, v14.16b , v14.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14) + NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down) + EXT v17.16b, v17.16b , v17.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14) UZP1 v31.8b, v26.8b, v27.8b UZP2 v27.8b, v26.8b, v27.8b @@ -1050,7 +1050,7 @@ SIGN_UP_CHANGE_DONE_RESIDUE: ZIP2 v25.8b, v24.8b, v25.8b mov v24.8b,v31.8b - Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = 
vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) @@ -1059,7 +1059,7 @@ SIGN_UP_CHANGE_DONE_RESIDUE: ST1 {v28.8b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) - mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row + mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1 BNE PU1_SRC_LOOP_RESIDUE //If not equal jump to PU1_SRC_LOOP @@ -1113,7 +1113,7 @@ END_LOOPS: ldp x23, x24,[sp],#16 ldp x21, x22,[sp],#16 ldp x19, x20,[sp],#16 - pop_v_regs + ret diff --git a/common/arm64/ihevc_sao_edge_offset_class3.s b/common/arm64/ihevc_sao_edge_offset_class3.s index 6c47abe..f393753 100644 --- a/common/arm64/ihevc_sao_edge_offset_class3.s +++ b/common/arm64/ihevc_sao_edge_offset_class3.s @@ -70,7 +70,6 @@ ihevc_sao_edge_offset_class3_av8: // STMFD sp!,{x4-x12,x14} //stack stores the values of the arguments - push_v_regs stp x19, x20,[sp,#-16]! stp x21, x22,[sp,#-16]! stp x23, x24,[sp,#-16]! @@ -85,9 +84,9 @@ ihevc_sao_edge_offset_class3_av8: MOV x5,x7 //Loads pu1_avail - LDR x6,[sp,#112] //Loads pi1_sao_offset - LDR w7,[sp,#120] //Loads wd - LDR w8,[sp,#128] //Loads ht + LDR x6,[sp,#48] //Loads pi1_sao_offset + LDR w7,[sp,#56] //Loads wd + LDR w8,[sp,#64] //Loads ht MOV x16,x7 // wd MOV x17,x8 // ht @@ -226,7 +225,7 @@ PU1_AVAIL_3_LOOP: ADRP x6, :got:gi1_table_edge_idx //table pointer LDR x6, [x6, #:got_lo12:gi1_table_edge_idx] - movi v8.16b, #0xFF //au1_mask = vdupq_n_s8(-1) + movi v1.16b, #0xFF //au1_mask = vdupq_n_s8(-1) ADD x20,x14,#1 //pu1_src_left_cpy += 1 csel x14, x20, x14,EQ @@ -248,12 +247,12 @@ WIDTH_LOOP_16: csel w8,w20,w8,EQ MOV x20,#-1 csel x8, x20, x8,NE - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) CMP x6,#16 //if(col == 16) BNE SKIP_AU1_MASK_VAL LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_AU1_MASK_VAL: LDRB w8,[x5,#2] //pu1_avail[2] @@ -270,15 +269,15 @@ SKIP_AU1_MASK_VAL: ADD x8,x8,#1 //pu1_src - src_strd + 1 SUB x7,x7,x6 //(wd - col) - LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1) + LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1) ADD x3,x3,#16 MOV x8,x19 //Loads *pu1_src - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) ADD x7,x7,#15 //15 + (wd - col) ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)] - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) SUB x5,x5,#1 AU1_SRC_LEFT_LOOP: @@ -289,10 +288,10 @@ AU1_SRC_LEFT_LOOP: BNE AU1_SRC_LEFT_LOOP movi v18.16b, #0 - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) ADD x8,x0,x1 //I *pu1_src + src_strd - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) MOV x7,x12 //row count, move ht_tmp to x7 SUB x5,x12,x7 //I ht_tmp - row @@ -321,35 +320,35 @@ SIGN_UP_CHANGE: csel x8, x20, x8,LT //I MOV x20,#1 csel x8, x20, x8,GT //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]) - mov v14.16b[15], w8 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) + mov v17.16b[15], w8 //I sign_up = 
vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) SIGN_UP_CHANGE_DONE: - cmhi v10.16b, v12.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v18.16b, v18.16b , v12.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp) - SUB v10.16b, v18.16b , v10.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v3.16b, v5.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v18.16b, v18.16b , v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + SUB v3.16b, v18.16b , v3.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v18.16b, v0.16b , v14.16b //I edge_idx = vaddq_s8(const_2, sign_up) - ADD v18.16b, v18.16b , v10.16b //I edge_idx = vaddq_s8(edge_idx, sign_down) + ADD v18.16b, v0.16b , v17.16b //I edge_idx = vaddq_s8(const_2, sign_up) + ADD v18.16b, v18.16b , v3.16b //I edge_idx = vaddq_s8(edge_idx, sign_down) TBL v18.16b, {v6.16b},v18.16b //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - NEG v14.16b, v10.16b //I sign_up = vnegq_s8(sign_down) + NEG v17.16b, v3.16b //I sign_up = vnegq_s8(sign_down) - EXT v14.16b, v14.16b , v14.16b,#1 //I sign_up = vextq_s8(sign_up, sign_up, 1) + EXT v17.16b, v17.16b , v17.16b,#1 //I sign_up = vextq_s8(sign_up, sign_up, 1) // TBL v19.8b, {v6.16b},v19.8b //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - Uxtl v20.8h, v12.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - AND v18.16b, v18.16b , v8.16b //I edge_idx = vandq_s8(edge_idx, au1_mask) + Uxtl v20.8h, v5.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + AND v18.16b, v18.16b , v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask) - TBL v10.16b, {v7.16b},v18.16b //I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + TBL v3.16b, {v7.16b},v18.16b //I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - Uxtl2 v22.8h, v12.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) - SADDW v20.8h, v20.8h , v10.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) + Uxtl2 v22.8h, v5.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + SADDW v20.8h, v20.8h , v3.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v20.8h, v20.8h , v2.8h //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) // TBL v11.8b, {v7.16b},v19.8b //I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) UMIN v20.8h, v20.8h , v4.8h //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - mov v12.16b, v16.16b - SADDW2 v22.8h, v22.8h , v10.16b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) + mov v5.16b, v16.16b + SADDW2 v22.8h, v22.8h , v3.16b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) SMAX v22.8h, v22.8h , v2.8h //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) UMIN v22.8h, v22.8h , v4.8h //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) @@ -388,7 +387,7 @@ PU1_SRC_LOOP: csel x11, x20, x11,GT //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]) ADD x8,x14,x5 //III pu1_src_left_cpy[ht_tmp - row] - mov v14.8b[15], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) + mov v17.8b[15], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), 
sign_up, 15) CMP x7,#1 //III BNE NEXT_ROW_ELSE_2 //III @@ -400,11 +399,11 @@ PU1_SRC_LOOP: NEXT_ROW_ELSE_2: LDRB w8,[x8,#1] //III - cmhi v24.16b, v12.16b , v18.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v24.16b, v5.16b , v18.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) ADD x5,x0,x1 LDRB w2,[x5,#15] //III pu1_src_cpy[15] - cmhi v26.16b, v18.16b , v12.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v26.16b, v18.16b , v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp) LDRB w5,[x0,#16] //III load the value SUB x2,x2,x5 //III pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd] @@ -418,51 +417,51 @@ NEXT_ROW_ELSE_2: csel x2, x20, x2,GT //III SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]) SUB x7,x7,#1 //III Decrement the ht_tmp loop count by 1 - ADD v26.16b, v0.16b , v14.16b //II edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //II edge_idx = vaddq_s8(const_2, sign_up) - NEG v14.16b, v24.16b //II sign_up = vnegq_s8(sign_down) + NEG v17.16b, v24.16b //II sign_up = vnegq_s8(sign_down) EXT v18.16b, v18.16b , v30.16b,#15 //III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15) ADD v26.16b, v26.16b , v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down) - EXT v14.16b, v14.16b , v14.16b,#1 //II sign_up = vextq_s8(sign_up, sign_up, 1) + EXT v17.16b, v17.16b , v17.16b,#1 //II sign_up = vextq_s8(sign_up, sign_up, 1) TBL v26.16b, {v6.16b},v26.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - cmhi v10.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v3.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - mov v14.16b[15], w2 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) + mov v17.16b[15], w2 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) // TBL v27.8b, {v6.16b},v27.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) cmhi v18.16b, v18.16b , v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp) - Uxtl v28.8h, v12.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - AND v26.16b, v26.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) + Uxtl v28.8h, v5.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + AND v26.16b, v26.16b , v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) - SUB v10.16b, v18.16b , v10.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v3.16b, v18.16b , v3.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) TBL v24.16b, {v7.16b},v26.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - ADD v18.16b, v0.16b , v14.16b //III edge_idx = vaddq_s8(const_2, sign_up) + ADD v18.16b, v0.16b , v17.16b //III edge_idx = vaddq_s8(const_2, sign_up) - ADD v18.16b, v18.16b , v10.16b //III edge_idx = vaddq_s8(edge_idx, sign_down) + ADD v18.16b, v18.16b , v3.16b //III edge_idx = vaddq_s8(edge_idx, sign_down) // TBL v25.8b, {v7.16b},v27.8b //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - NEG v14.16b, v10.16b //III sign_up = vnegq_s8(sign_down) + NEG v17.16b, v3.16b //III sign_up = vnegq_s8(sign_down) SADDW v28.8h, v28.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) TBL v18.16b, {v6.16b},v18.16b //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) SMAX v28.8h, v28.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) - EXT v14.16b, v14.16b , v14.16b,#1 //III sign_up = vextq_s8(sign_up, 
sign_up, 1) + EXT v17.16b, v17.16b , v17.16b,#1 //III sign_up = vextq_s8(sign_up, sign_up, 1) // TBL v19.8b, {v6.16b},v19.8b //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) UMIN v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - Uxtl2 v26.8h, v12.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) - AND v18.16b, v18.16b , v8.16b //III edge_idx = vandq_s8(edge_idx, au1_mask) + Uxtl2 v26.8h, v5.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + AND v18.16b, v18.16b , v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask) SADDW2 v26.8h, v26.8h , v24.16b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) - TBL v10.16b, {v7.16b},v18.16b //III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) + TBL v3.16b, {v7.16b},v18.16b //III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) Uxtl v20.8h, v16.8b //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) - SADDW v20.8h, v20.8h , v10.8b //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) + SADDW v20.8h, v20.8h , v3.8b //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) // TBL v11.8b, {v7.16b},v19.8b //III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) SMAX v20.8h, v20.8h , v2.8h //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) @@ -470,12 +469,12 @@ NEXT_ROW_ELSE_2: UMIN v20.8h, v20.8h , v4.8h //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) xtn v28.8b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[0]) - SADDW2 v22.8h, v22.8h , v10.16b //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) + SADDW2 v22.8h, v22.8h , v3.16b //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) xtn2 v28.16b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[1]) SMAX v22.8h, v22.8h , v2.8h //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) - mov v12.16b, v30.16b //II pu1_cur_row = pu1_next_row + mov v5.16b, v30.16b //II pu1_cur_row = pu1_next_row UMIN v22.8h, v22.8h , v4.8h //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) CMP x7,#1 //III @@ -516,25 +515,25 @@ NEXT_ROW_POINTER_ASSIGNED_3: csel x8, x20, x8,LT ST1 { v20.16b},[x0],x1 //III vst1q_u8(pu1_src_cpy, pu1_cur_row) - cmhi v24.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v24.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) MOV x20,#1 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]) - cmhi v26.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v26.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) - mov v14.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) + mov v17.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) SUB v24.16b, v26.16b , v24.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - Uxtl v20.8h, v12.8b 
//pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) + Uxtl v20.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) - Uxtl2 v22.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v22.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down) TBL v26.16b, {v6.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) // TBL v27.8b, {v6.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) TBL v24.16b, {v7.16b},v26.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) @@ -583,12 +582,12 @@ WD_16_HT_4_LOOP: csel w8,w20,w8,EQ MOV x20,#-1 csel x8, x20, x8,NE - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) CMP x6,#16 //if(col == 16) BNE SKIP_AU1_MASK_VAL_WD_16_HT_4 LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_AU1_MASK_VAL_WD_16_HT_4: LDRB w8,[x5,#2] //pu1_avail[2] @@ -598,7 +597,7 @@ SKIP_AU1_MASK_VAL_WD_16_HT_4: csel x8, x20, x8,EQ csel x8, x3, x8,NE ADD x8,x8,#1 //pu1_src - src_strd + 1 - LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1) + LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1) ADD x3,x3,#16 ADD x5,sp,#0x42 //*au1_src_left_tmp @@ -617,11 +616,11 @@ AU1_SRC_LEFT_LOOP_WD_16_HT_4: SUBS x4,x4,#1 //decrement the loop count BNE AU1_SRC_LEFT_LOOP_WD_16_HT_4 - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) movi v18.16b, #0 MOV x7,x12 //row count, move ht_tmp to x7 @@ -665,31 +664,31 @@ SIGN_UP_CHANGE_WD_16_HT_4: csel x8, x20, x8,LT MOV x20,#1 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]) - mov v14.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) + mov v17.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) SIGN_UP_CHANGE_DONE_WD_16_HT_4: - cmhi v20.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v22.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) SUB v24.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, 
sign_up) ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down) TBL v26.16b, {v6.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) // TBL v27.8b, {v6.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) - NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down) - EXT v14.16b, v14.16b , v14.16b,#1 //sign_up = vextq_s8(sign_up, sign_up, 1) + NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down) + EXT v17.16b, v17.16b , v17.16b,#1 //sign_up = vextq_s8(sign_up, sign_up, 1) TBL v24.16b, {v7.16b},v26.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) // TBL v25.8b, {v7.16b},v27.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) - Uxtl2 v30.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v30.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) SADDW2 v30.8h, v30.8h , v24.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) SMAX v30.8h, v30.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) UMIN v30.8h, v30.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) @@ -699,7 +698,7 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4: ST1 { v28.16b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) - mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row + mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1 BNE PU1_SRC_LOOP_WD_16_HT_4 //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4 @@ -726,10 +725,10 @@ WIDTH_RESIDUE: MOV x20,#-1 csel x8, x20, x8,NE - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) PU1_AVAIL_2_RESIDUE: LDRB w8,[x5,#2] //pu1_avail[2] @@ -739,7 +738,7 @@ PU1_AVAIL_2_RESIDUE: csel x8, x20, x8,EQ csel x8, x3, x8,NE ADD x8,x8,#1 //pu1_src - src_strd + 1 - LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1) + LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1) ADD x5,sp,#0x42 //*au1_src_left_tmp @@ -757,11 +756,11 @@ AU1_SRC_LEFT_LOOP_RESIDUE: SUBS x4,x4,#1 //decrement the loop count BNE AU1_SRC_LEFT_LOOP_RESIDUE - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, 
pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) MOV x7,x12 //row count, move ht_tmp to x7 PU1_SRC_LOOP_RESIDUE: @@ -805,25 +804,25 @@ SIGN_UP_CHANGE_RESIDUE: csel x8, x20, x8,LT MOV x20,#1 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]) - mov v14.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) + mov v17.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15) SIGN_UP_CHANGE_DONE_RESIDUE: - cmhi v20.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v22.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) SUB v24.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down) TBL v26.16b, {v6.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) // TBL v27.8b, {v6.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) - NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down) - EXT v14.16b, v14.16b , v14.16b,#1 //sign_up = vextq_s8(sign_up, sign_up, 1) + NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down) + EXT v17.16b, v17.16b , v17.16b,#1 //sign_up = vextq_s8(sign_up, sign_up, 1) TBL v24.8b, {v7.16b},v26.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) - Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) @@ -831,7 +830,7 @@ SIGN_UP_CHANGE_DONE_RESIDUE: xtn v30.8b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) ST1 {v30.8b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) - mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row + mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row SUBS x7,x7,#1 BNE PU1_SRC_LOOP_RESIDUE @@ -880,7 +879,6 @@ END_LOOPS: ldp x23, x24,[sp], #16 ldp x21, x22,[sp], #16 ldp x19, x20,[sp], #16 - pop_v_regs ret diff --git a/common/arm64/ihevc_sao_edge_offset_class3_chroma.s b/common/arm64/ihevc_sao_edge_offset_class3_chroma.s index cf25102..5c444c0 100644 --- a/common/arm64/ihevc_sao_edge_offset_class3_chroma.s +++ b/common/arm64/ihevc_sao_edge_offset_class3_chroma.s @@ -77,7 +77,7 @@ ihevc_sao_edge_offset_class3_chroma_av8: ldr w10,[sp,#16] ldr w11,[sp,#24] - push_v_regs + // STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments stp x19, x20,[sp,#-16]! stp x21, x22,[sp,#-16]! 
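The SAO edge-offset hunks above and below all touch the same per-vector classification kernel; the renames (v8→v1, v10→v3, v12→v5, v14→v17) relabel its operands without changing the logic. A minimal sketch of that kernel, distilled from the hunks themselves — vector numbers are illustrative, and the EXT immediate encodes the diagonal step and chroma interleave (#14 in the chroma hunks at the top of this page, #1 in ihevc_sao_edge_offset_class3.s, #2 in ihevc_sao_edge_offset_class3_chroma.s):

    // Per byte: edge_idx = 2 + SIGN(cur - neighbour_up) + SIGN(cur - neighbour_down),
    // remapped through gi1_table_edge_idx and masked by column availability.
    cmhi    v20.16b, v5.16b , v18.16b       // cmp_gt = vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    cmhi    v22.16b, v18.16b , v5.16b       // cmp_lt = vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB     v22.16b, v22.16b , v20.16b      // sign_down = cmp_lt - cmp_gt (-1, 0 or +1 per byte)
    ADD     v18.16b, v0.16b , v17.16b       // edge_idx = const_2 + sign_up
    ADD     v18.16b, v18.16b , v22.16b      // edge_idx += sign_down
    TBL     v18.16b, {v30.16b},v18.16b      // edge_idx = vtbl1_s8(edge_idx_tbl, edge_idx)
    AND     v18.16b, v18.16b , v1.16b       // edge_idx &= au1_mask
    NEG     v17.16b, v22.16b                // sign_up for the next row = -sign_down
    EXT     v17.16b, v17.16b , v17.16b,#14  // rotate sign_up by the class-dependent diagonal step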
@@ -310,7 +310,7 @@ PU1_AVAIL_2_LOOP_END: LDR x2, [x2, #:got_lo12:gi1_table_edge_idx] //VLD1.8 D6,[x6] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx) - movi v8.16b, #0xFF //au1_mask = vdupq_n_s8(-1) + movi v1.16b, #0xFF //au1_mask = vdupq_n_s8(-1) MOV x6,x7 //move wd to x6 loop_count CMP x7,#16 //Compare wd with 16 @@ -328,20 +328,20 @@ WIDTH_LOOP_16: MOV x20,#-1 csel x8, x20, x8,NE - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) LDRB w11,[x5,#2] //pu1_avail[2] CMP x6,#16 //if(col == 16) - mov v8.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) BNE SKIP_AU1_MASK_VAL LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) - mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_AU1_MASK_VAL: CMP x11,#0 - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //SUB x0, x0,#8 ADD x5,sp,#0x4B //*au1_src_left_tmp @@ -352,21 +352,21 @@ SKIP_AU1_MASK_VAL: csel x8, x3, x8,NE ADD x8,x8,#2 //pu1_src - src_strd + 2 - LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) + LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) //LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) //SUB x8, x8,#8 ADD x3,x3,#16 mov w4, w25 //Loads ht - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) mov w7, w24 //Loads wd SUB x7,x7,x6 //(wd - col) - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) ADD x7,x7,#14 //15 + (wd - col) mov x8, x26 //Loads *pu1_src - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)] AU1_SRC_LEFT_LOOP: @@ -418,29 +418,29 @@ AU1_SRC_LEFT_LOOP: movn x20,#0 csel x9, x20, x9,LT //I - mov v14.16b[14], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) + mov v17.16b[14], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) MOV x20,#1 csel x9, x20, x9,GT //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] - mov v14.16b[15], w9 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) + mov v17.16b[15], w9 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) SIGN_UP_CHANGE_DONE: LD1 {v28.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) - cmhi v20.16b, v12.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v20.16b, v5.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v22.16b, v18.16b , v12.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v18.16b , v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp) SUB v22.16b, v22.16b , v20.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v18.16b, v0.16b , v14.16b //I edge_idx = vaddq_s8(const_2, sign_up) + ADD v18.16b, v0.16b , v17.16b //I edge_idx = 
vaddq_s8(const_2, sign_up) ADD v18.16b, v18.16b , v22.16b //I edge_idx = vaddq_s8(edge_idx, sign_down) TBL v18.16b, {v28.16b},v18.16b //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - NEG v14.16b, v22.16b //I sign_up = vnegq_s8(sign_down) + NEG v17.16b, v22.16b //I sign_up = vnegq_s8(sign_down) //TBL v19.8b, {v28.16b},v19.8b //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - EXT v14.16b, v14.16b , v14.16b,#2 //I sign_up = vextq_s8(sign_up, sign_up, 2) + EXT v17.16b, v17.16b , v17.16b,#2 //I sign_up = vextq_s8(sign_up, sign_up, 2) - Uxtl v20.8h, v12.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - AND v18.16b, v18.16b , v8.16b //I edge_idx = vandq_s8(edge_idx, au1_mask) + Uxtl v20.8h, v5.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + AND v18.16b, v18.16b , v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask) mov v19.d[0],v18.d[1] UZP1 v31.8b, v18.8b, v19.8b @@ -452,13 +452,13 @@ SIGN_UP_CHANGE_DONE: ZIP2 v23.8b, v22.8b, v23.8b //I mov v22.8b,v31.8b - Uxtl2 v18.8h, v12.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v18.8h, v5.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) SADDW v20.8h, v20.8h , v22.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v20.8h, v20.8h , v2.8h //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v20.8h, v20.8h , v4.8h //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - mov v12.16b, v16.16b //I pu1_cur_row = pu1_next_row + mov v5.16b, v16.16b //I pu1_cur_row = pu1_next_row SADDW v18.8h, v18.8h , v23.8b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) SUB x7,x7,#1 //I Decrement the ht_tmp loop count by 1 @@ -507,18 +507,18 @@ PU1_SRC_LOOP: csel x10, x20, x10,GT //II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]) CMP x8,#0 //II - mov v14.8b[14], w10 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) + mov v17.8b[14], w10 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) movn x20,#0 csel x8, x20, x8,LT //II MOV x20,#1 csel x8, x20, x8,GT //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] SUB x10,x12,x7 //III ht_tmp - row - mov v14.8b[15], w8 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) + mov v17.8b[15], w8 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) ADD x11,x14,x10,LSL #1 //III pu1_src_left_cpy[(ht_tmp - row) * 2] CMP x7,#1 //III - cmhi v22.16b, v12.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v5.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) BNE NEXT_ROW_POINTER_ASSIGNED_2 //III mov x5, x21 //III Loads pu1_avail @@ -529,7 +529,7 @@ PU1_SRC_LOOP: NEXT_ROW_POINTER_ASSIGNED_2: LDRH w5,[x11,#2] //III - cmhi v24.16b, v28.16b , v12.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v24.16b, v28.16b , v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp) ADD x11,x0,x1 //III LDRB w9,[x11,#14] //III pu1_src_cpy[14] @@ -545,7 +545,7 @@ NEXT_ROW_POINTER_ASSIGNED_2: SUB x10,x8,x10 //III pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] CMP x9,#0 //III - ADD v26.16b, v0.16b , v14.16b //II edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //II edge_idx = vaddq_s8(const_2, sign_up) 
movn x20,#0 csel x9, x20, x9,LT //III @@ -554,22 +554,22 @@ NEXT_ROW_POINTER_ASSIGNED_2: ADD v26.16b, v26.16b , v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down) CMP x10,#0 //III - NEG v14.16b, v24.16b //II sign_up = vnegq_s8(sign_down) + NEG v17.16b, v24.16b //II sign_up = vnegq_s8(sign_down) TBL v26.16b, {v21.16b},v26.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) movn x20,#0 csel x10, x20, x10,LT //III MOV x20,#1 csel x10, x20, x10,GT //III SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] - EXT v14.16b, v14.16b , v14.16b,#2 //II sign_up = vextq_s8(sign_up, sign_up, 2) + EXT v17.16b, v17.16b , v17.16b,#2 //II sign_up = vextq_s8(sign_up, sign_up, 2) //TBL v27.8b, {v21.16b},v27.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) cmhi v22.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - mov v14.16b[14], w9 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) - AND v26.16b, v26.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) + mov v17.16b[14], w9 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) + AND v26.16b, v26.16b , v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) mov v27.d[0],v26.d[1] - mov v14.16b[15], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) + mov v17.16b[15], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) UZP1 v31.8b, v26.8b, v27.8b UZP2 v27.8b, v26.8b, v27.8b //II mov v26.8b,v31.8b @@ -578,7 +578,7 @@ NEXT_ROW_POINTER_ASSIGNED_2: TBL v24.8b, {v6.16b},v26.8b //II SUB v22.16b, v20.16b , v22.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v18.16b, v0.16b , v14.16b //III edge_idx = vaddq_s8(const_2, sign_up) + ADD v18.16b, v0.16b , v17.16b //III edge_idx = vaddq_s8(const_2, sign_up) TBL v25.8b, {v7.16b},v27.8b //II ADD v18.16b, v18.16b , v22.16b //III edge_idx = vaddq_s8(edge_idx, sign_down) @@ -587,16 +587,16 @@ NEXT_ROW_POINTER_ASSIGNED_2: ZIP2 v25.8b, v24.8b, v25.8b //II mov v24.8b,v31.8b - Uxtl v28.8h, v12.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v28.8h, v5.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) TBL v18.16b, {v20.16b},v18.16b //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) - NEG v14.16b, v22.16b //III sign_up = vnegq_s8(sign_down) + NEG v17.16b, v22.16b //III sign_up = vnegq_s8(sign_down) SADDW v28.8h, v28.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) //TBL v19.8b, {v20.16b},v19.8b //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - EXT v14.16b, v14.16b , v14.16b,#2 //III sign_up = vextq_s8(sign_up, sign_up, 2) + EXT v17.16b, v17.16b , v17.16b,#2 //III sign_up = vextq_s8(sign_up, sign_up, 2) - Uxtl2 v26.8h, v12.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) - AND v18.16b, v18.16b , v8.16b //III edge_idx = vandq_s8(edge_idx, au1_mask) + Uxtl2 v26.8h, v5.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + AND v18.16b, v18.16b , v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask) mov v19.d[0],v18.d[1] Uxtl v20.8h, v16.8b //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) @@ -620,7 +620,7 @@ NEXT_ROW_POINTER_ASSIGNED_2: xtn v28.8b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[0]) SADDW v20.8h, v20.8h , v22.8b //III 
pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) - mov v12.16b, v30.16b //III pu1_cur_row = pu1_next_row + mov v5.16b, v30.16b //III pu1_cur_row = pu1_next_row UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) SUB x7,x7,#1 //III Decrement the ht_tmp loop count by 1 @@ -682,27 +682,27 @@ NEXT_ROW_POINTER_ASSIGNED_3: csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]) CMP x10,#0 - mov v14.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) + mov v17.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) movn x20,#0 csel x10, x20, x10,LT MOV x20,#1 csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] - mov v14.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) - cmhi v20.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + mov v17.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) + cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v22.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) SUB v22.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v18.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v18.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) ADD v18.16b, v18.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_down) TBL v18.16b, {v28.16b},v18.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) //TBL v19.8b, {v28.16b},v19.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - AND v18.16b, v18.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + AND v18.16b, v18.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) mov v19.d[0],v18.d[1] - Uxtl v20.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + Uxtl v20.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) UZP1 v31.8b, v18.8b, v19.8b UZP2 v19.8b, v18.8b, v19.8b mov v18.8b,v31.8b @@ -710,7 +710,7 @@ NEXT_ROW_POINTER_ASSIGNED_3: TBL v22.8b, {v6.16b},v18.8b TBL v23.8b, {v7.16b},v19.8b - Uxtl2 v18.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v18.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) ZIP1 v31.8b, v22.8b, v23.8b ZIP2 v23.8b, v22.8b, v23.8b mov v22.8b,v31.8b @@ -762,15 +762,15 @@ WD_16_HT_4_LOOP: csel w8,w20,w8,EQ MOV x20,#-1 csel x8, x20, x8,NE - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) CMP x6,#16 //if(col == 16) - mov v8.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) BNE SKIP_AU1_MASK_VAL_WD_16_HT_4 LDRB w8,[x5,#1] //pu1_avail[1] - mov v8.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) - mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_AU1_MASK_VAL_WD_16_HT_4: LDRB w11,[x5,#2] //pu1_avail[2] @@ -779,27 +779,27 @@ 
SKIP_AU1_MASK_VAL_WD_16_HT_4: CMP x11,#0 csel x8, x3, x8,NE - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //SUB x0, x0,#8 ADD x8,x8,#2 //pu1_src - src_strd + 2 ADD x3,x3,#16 - LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) + LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) //LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) //SUB x8, x8,#8 ADD x5,sp,#0x4B //*au1_src_left_tmp mov w4, w25 //Loads ht - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) mov w7, w24 //Loads wd SUB x7,x7,x6 //(wd - col) - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) ADD x7,x7,#14 //15 + (wd - col) mov x8, x26 //Loads *pu1_src - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)] AU1_SRC_LEFT_LOOP_WD_16_HT_4: @@ -864,33 +864,33 @@ SIGN_UP_CHANGE_WD_16_HT_4: csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]) CMP x10,#0 - mov v14.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) + mov v17.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) movn x20,#0 csel x10, x20, x10,LT MOV x20,#1 csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] - mov v14.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) + mov v17.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) SIGN_UP_CHANGE_DONE_WD_16_HT_4: LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) - cmhi v22.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) - cmhi v24.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) + cmhi v24.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up) + ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down) mov v20.d[1],v20.d[0] - NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down) + NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down) TBL v26.16b, {v20.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) //TBL v27.8b, {v20.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) - EXT v14.16b, v14.16b , v14.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 2) + EXT v17.16b, v17.16b , v17.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 2) - Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) - AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask) + Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) + AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) mov v27.d[0],v26.d[1] UZP1 v31.8b, v26.8b, v27.8b @@ -902,13 +902,13 @@ 
SIGN_UP_CHANGE_DONE_WD_16_HT_4: ZIP2 v25.8b, v24.8b, v25.8b mov v24.8b,v31.8b - Uxtl2 v30.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) + Uxtl2 v30.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) - mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row + mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row SADDW v30.8h, v30.8h , v25.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) SMAX v30.8h, v30.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) @@ -949,24 +949,24 @@ WIDTH_RESIDUE: LDRB w11,[x5,#1] //pu1_avail[1] LDRB w9,[x5,#2] //pu1_avail[2] - mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) CMP x9,#0 SUB x20,x0,x1 //pu1_src - src_strd csel x10, x20, x10,EQ - mov v8.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) csel x10, x3, x10,NE ADD x10,x10,#2 //pu1_src - src_strd + 2 - mov v8.8b[6], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.8b[6], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) ADD x5,sp,#0x4B //*au1_src_left_tmp mov w4, w25 //Loads ht - mov v8.8b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.8b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) mov w7, w24 //Loads wd mov x8, x26 //Loads *pu1_src - LD1 {v10.16b},[x10] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) + LD1 {v3.16b},[x10] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) //LD1 {v11.8b},[x10] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) //SUB x10, x10,#8 SUB x7,x7,#2 //(wd - 2) @@ -980,15 +980,15 @@ AU1_SRC_LEFT_LOOP_RESIDUE: SUBS x4,x4,#1 //decrement the loop count BNE AU1_SRC_LEFT_LOOP_RESIDUE - LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) + LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) //SUB x0, x0,#8 movi v18.16b, #0 - cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) + cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) - cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row) - SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) + cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) + SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) MOV x7,x12 //row count, move ht_tmp to x7 PU1_SRC_LOOP_RESIDUE: @@ -1047,33 +1047,33 @@ SIGN_UP_CHANGE_RESIDUE: csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]) CMP x10,#0 - mov v14.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) + mov v17.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) movn x20,#0 csel x10, x20, x10,LT MOV x20,#1 csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] - mov v14.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) + mov 
SIGN_UP_CHANGE_DONE_RESIDUE:
    LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
-   cmhi v22.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+   cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
-   cmhi v24.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+   cmhi v24.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
-   ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+   ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
    ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
    mov v20.d[1],v20.d[0]
-   NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down)
+   NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down)
    TBL v26.16b, {v20.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    //TBL v27.8b, {v20.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
-   EXT v14.16b, v14.16b , v14.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 14)
+   EXT v17.16b, v17.16b , v17.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 14)
-   Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
-   AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+   Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+   AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
    mov v27.d[0],v26.d[1]
    UZP1 v31.8b, v26.8b, v27.8b
@@ -1085,7 +1085,7 @@ SIGN_UP_CHANGE_DONE_RESIDUE:
    ZIP2 v25.8b, v24.8b, v25.8b
    mov v24.8b,v31.8b
-   mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row
+   mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row
    SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
@@ -1148,7 +1148,7 @@ END_LOOPS:
    ldp x23, x24,[sp],#16
    ldp x21, x22,[sp],#16
    ldp x19, x20,[sp],#16
-   pop_v_regs
+
    ret
diff --git a/common/arm64/ihevc_weighted_pred_bi.s b/common/arm64/ihevc_weighted_pred_bi.s
index 6851cb4..c0508d8 100644
--- a/common/arm64/ihevc_weighted_pred_bi.s
+++ b/common/arm64/ihevc_weighted_pred_bi.s
@@ -161,7 +161,7 @@ ihevc_weighted_pred_bi_av8:
    sxtw x11,w11
    sxtw x12,w12
-   push_v_regs
+   stp x19, x20,[sp,#-16]!
    stp x21, x22,[sp,#-16]!
    stp x23, x24,[sp,#-16]!
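These prologue rewrites are the heart of the patch. Under AAPCS64, x19-x28 are callee-saved, while among the SIMD registers only the low 64 bits of v8-v15 must be preserved; v0-v7 and v16-v31 may be clobbered freely. The hunks that follow therefore remap every temporary off v8-v15 (here v8 to v5, v10 to v6, v12 to v17, v14 to v19), which lets push_v_regs/pop_v_regs disappear entirely and leaves only the general-purpose saves. A minimal sketch of the resulting save/restore pattern, assuming GNU as syntax and a hypothetical function name:

    // Leaf function that keeps its SIMD data out of v8-v15, so no
    // vector registers need to be saved at all (AAPCS64).
    .global sketch_av8
sketch_av8:
    stp x19, x20,[sp,#-16]!     // pre-indexed push; sp stays 16-byte aligned
    stp x21, x22,[sp,#-16]!
    // ... kernel body using x19-x22 plus caller-saved v-registers ...
    ldp x21, x22,[sp],#16       // post-indexed pops, in reverse order
    ldp x19, x20,[sp],#16
    ret

Each stp/ldp pair moves sp by exactly 16 bytes, so the stack pointer never loses the 16-byte alignment that AArch64 sp-relative accesses require (see the note at the end of this diff).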
@@ -221,64 +221,64 @@ core_loop:
    ld1 {v1.4h},[x1],#8 //load and increment the pi2_src2
    smull v4.4s, v0.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0)
    ld1 {v2.4h},[x6],x3 //load and increment the pi2_src_tmp1 ii iteration
-   smull v8.4s, v1.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
+   smull v5.4s, v1.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
    ld1 {v3.4h},[x8],x4 //load and increment the pi2_src_tmp1 ii iteration
-   add v4.4s, v4.4s , v8.4s //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)
+   add v4.4s, v4.4s , v5.4s //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)
    ld1 {v0.4h},[x6],x3 //load and increment the pi2_src1 iii iteration
-   smull v10.4s, v2.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration
+   smull v6.4s, v2.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration
    ld1 {v1.4h},[x8],x4 //load and increment the pi2_src2 iii iteration
    add v4.4s, v4.4s , v30.4s //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
-   smull v14.4s, v0.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration
+   smull v19.4s, v0.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration
    ld1 {v2.4h},[x6],x3 //load and increment the pi2_src_tmp1 iv iteration
-   smull v12.4s, v3.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
+   smull v17.4s, v3.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
    sshl v4.4s,v4.4s,v28.4s //vshlq_s32(i4_tmp1_t1, tmp_shift_t)
    ld1 {v3.4h},[x8],x4 //load and increment the pi2_src_tmp1 iv iteration
-   add v10.4s, v10.4s , v12.4s //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration
+   add v6.4s, v6.4s , v17.4s //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration
    sqxtun v4.4h, v4.4s //vqmovun_s32(sto_res_tmp1)
    smull v16.4s, v1.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration
-   add v10.4s, v10.4s , v30.4s //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
+   add v6.4s, v6.4s , v30.4s //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
    //mov v5, v4 //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
-   add v14.4s, v14.4s , v16.4s //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) iii iteration
+   add v19.4s, v19.4s , v16.4s //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) iii iteration
-   sshl v10.4s,v10.4s,v28.4s
+   sshl v6.4s,v6.4s,v28.4s
    //vshl.s32 q5,q5,q14 //vshlq_s32(i4_tmp2_t1, tmp_shift_t) ii iteration
    smull v18.4s, v2.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration
    uqxtn v4.8b,v4.8h //vqmovn.u16 d4,q2 //vqmovn_u16(sto_res_tmp3)
-   add v14.4s, v14.4s , v30.4s //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
+   add v19.4s, v19.4s , v30.4s //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
-   sqxtun v10.4h, v10.4s //vqmovun_s32(sto_res_tmp1) ii iteration
+   sqxtun v6.4h, v6.4s //vqmovun_s32(sto_res_tmp1) ii iteration
    smull v20.4s, v3.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration
-   sshl v14.4s,v14.4s,v28.4s
+   sshl v19.4s,v19.4s,v28.4s
    //vshl.s32 q7,q7,q14 //vshlq_s32(i4_tmp1_t1, tmp_shift_t) iii iteration
    //mov v11, v10 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration
    add v18.4s, v18.4s , v20.4s //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
-   sqxtun v14.4h, v14.4s //vqmovun_s32(sto_res_tmp1) iii iteration
+   sqxtun v19.4h, v19.4s //vqmovun_s32(sto_res_tmp1) iii iteration
    add v18.4s, v18.4s , v30.4s //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
    st1 {v4.s}[0],[x2],#4 //store pu1_dst i iteration
-   uqxtn v10.8b,v10.8h
+   uqxtn v6.8b,v6.8h
    //vqmovn.u16 d10,q5 //vqmovn_u16(sto_res_tmp3) ii iteration
    sshl v18.4s,v18.4s,v28.4s
    //vshl.s32 q9,q9,q14 //vshlq_s32(i4_tmp2_t1, tmp_shift_t) iv iteration
-   st1 {v10.s}[0],[x10],x5 //store pu1_dst ii iteration
+   st1 {v6.s}[0],[x10],x5 //store pu1_dst ii iteration
    //mov v15, v14 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
-   uqxtn v14.8b,v14.8h
+   uqxtn v19.8b,v19.8h
    //vqmovn.u16 d14,q7 //vqmovn_u16(sto_res_tmp3) iii iteration
    sqxtun v18.4h, v18.4s //vqmovun_s32(sto_res_tmp1) iv iteration
    //mov v19, v18 //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
-   st1 {v14.s}[0],[x10],x5 //store pu1_dst iii iteration
+   st1 {v19.s}[0],[x10],x5 //store pu1_dst iii iteration
    uqxtn v18.8b,v18.8h
    //vqmovn.u16 d18,q9 //vqmovn_u16(sto_res_tmp3) iv iteration
    subs x7,x7,#4 //decrement wd by 4 and check for 0
@@ -306,7 +306,7 @@ end_loops:
    ldp x23, x24,[sp],#16
    ldp x21, x22,[sp],#16
    ldp x19, x20,[sp],#16
-   pop_v_regs
+
    ret
diff --git a/common/arm64/ihevc_weighted_pred_bi_default.s b/common/arm64/ihevc_weighted_pred_bi_default.s
index 07fb4ce..d98e025 100644
--- a/common/arm64/ihevc_weighted_pred_bi_default.s
+++ b/common/arm64/ihevc_weighted_pred_bi_default.s
@@ -122,7 +122,7 @@ ihevc_weighted_pred_bi_default_av8:
    ldr w9,[sp,#8]
    // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
-   push_v_regs
+   stp x19, x20,[sp,#-16]!
    stp x21, x22,[sp,#-16]!
@@ -195,11 +195,11 @@ core_loop_4:
    ld1 {v6.4h},[x0],#8 //load and increment the pi2_src1
    add x14,x2,x5 //pu1_dst_tmp = pu1_dst + dst_strd
    ld1 {v7.4h},[x1],#8 //load and increment the pi2_src2
-   ld1 {v8.4h},[x11],x3 //load and increment the pi2_src1 ii iteration
+   ld1 {v1.4h},[x11],x3 //load and increment the pi2_src1 ii iteration
    sqadd v18.4h,v6.4h,v7.4h
    sqadd v18.4h,v18.4h,v0.4h //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
-   ld1 {v9.4h},[x12],x4 //load and increment the pi2_src2 ii iteration
-   sqadd v20.4h,v8.4h,v9.4h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+   ld1 {v3.4h},[x12],x4 //load and increment the pi2_src2 ii iteration
+   sqadd v20.4h,v1.4h,v3.4h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
    sqadd v19.4h,v20.4h,v0.4h //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
    mov v18.d[1],v19.d[0]
    sqshrun v20.8b, v18.8h,#7
@@ -250,11 +250,11 @@ core_loop_chroma_4x2:
    ld1 {v6.4h},[x0],#8 //load and increment the pi2_src1
    add x14,x2,x5 //pu1_dst_tmp = pu1_dst + dst_strd
    ld1 {v7.4h},[x1],#8 //load and increment the pi2_src2
-   ld1 {v8.4h},[x11],x3 //load and increment the pi2_src1 ii iteration
+   ld1 {v1.4h},[x11],x3 //load and increment the pi2_src1 ii iteration
    sqadd v18.4h,v6.4h,v7.4h
    sqadd v18.4h,v18.4h,v0.4h //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
-   ld1 {v9.4h},[x12],x4 //load and increment the pi2_src2 ii iteration
-   sqadd v20.4h,v8.4h,v9.4h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+   ld1 {v3.4h},[x12],x4 //load and increment the pi2_src2 ii iteration
+   sqadd v20.4h,v1.4h,v3.4h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
    sqadd v19.4h,v20.4h,v0.4h //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
    mov v18.d[1],v19.d[0]
    sqshrun v20.8b, v18.8h,#7
@@ -301,17 +301,17 @@ core_loop_8:
    ld1 { v18.8h},[x12],x4 //load and increment the pi2_src2 iii iteration
    sqadd v22.8h,v22.8h,v0.8h //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
    sqshrun v20.8b, v24.8h,#7
-   ld1 { v12.8h},[x11],x3 //load and increment the pi2_src1 iv iteration
+   ld1 { v17.8h},[x11],x3 //load and increment the pi2_src1 iv iteration
    sqadd v30.8h,v16.8h,v18.8h
    sqshrun v21.8b, v22.8h,#7
-   ld1 { v14.8h},[x12],x4 //load and increment the pi2_src2 iv iteration
+   ld1 { v29.8h},[x12],x4 //load and increment the pi2_src2 iv iteration
    sqadd v30.8h,v30.8h,v0.8h //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
    st1 {v20.2s},[x2],#8 //store pu1_dst i iteration
-   sqadd v8.8h,v12.8h,v14.8h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
+   sqadd v1.8h,v17.8h,v29.8h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
    st1 {v21.2s},[x14],x5 //store pu1_dst ii iteration
-   sqadd v8.8h,v8.8h,v0.8h
+   sqadd v1.8h,v1.8h,v0.8h
    sqshrun v30.8b, v30.8h,#7
-   sqshrun v31.8b, v8.8h,#7
+   sqshrun v31.8b, v1.8h,#7
    add x11,x0,x3 //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add x12,x1,x4 //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    st1 {v30.2s},[x14],x5 //store pu1_dst iii iteration
    //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteratio
@@ -413,40 +413,40 @@ prolog_16:
    ld1 { v2.8h},[x0],#16 //load and increment the pi2_src1
    ld1 { v4.8h},[x1],#16 //load and increment the pi2_src2
-   ld1 { v10.8h},[x0],x11 //load and increment the pi2_src1
-   ld1 { v12.8h},[x1],x11 //load and increment the pi2_src2
+   ld1 { v5.8h},[x0],x11 //load and increment the pi2_src1
+   ld1 { v17.8h},[x1],x11 //load and increment the pi2_src2
    ld1 { v6.8h},[x0],#16 //load and increment the pi2_src1 ii iteration
    subs x9,x9,#16
-   ld1 { v8.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
+   ld1 { v1.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
    sub x20,x8,#2
    csel x8, x20, x8,eq
    sqadd v22.8h,v2.8h,v4.8h
-   ld1 { v14.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
-   sqadd v28.8h,v10.8h,v12.8h
+   ld1 { v29.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
+   sqadd v28.8h,v5.8h,v17.8h
    ld1 { v16.8h},[x1],x12 //load and increment the pi2_src2 ii iteration
    add x20,x0,x7
    csel x0, x20, x0,eq
    add x20,x1,x7
    csel x1, x20, x1,eq
-   sqadd v24.8h,v6.8h,v8.8h
+   sqadd v24.8h,v6.8h,v1.8h
    ld1 { v2.8h},[x0],#16
-   sqadd v26.8h,v14.8h,v16.8h
+   sqadd v26.8h,v29.8h,v16.8h
    // if the input is chroma with 8x2 block size
    cmp x8,#0
    beq epilog_16
    ld1 { v4.8h},[x1],#16 //load and increment the pi2_src2
    sqadd v22.8h,v22.8h,v0.8h
-   ld1 { v10.8h},[x0],x11 //load and increment the pi2_src1
+   ld1 { v5.8h},[x0],x11 //load and increment the pi2_src1
    sqadd v28.8h,v28.8h,v0.8h
-   ld1 { v12.8h},[x1],x11 //load and increment the pi2_src2
+   ld1 { v17.8h},[x1],x11 //load and increment the pi2_src2
    sqadd v24.8h,v24.8h,v0.8h
    ld1 { v6.8h},[x0],#16 //load and increment the pi2_src1 ii iteration
    sqadd v30.8h,v26.8h,v0.8h
    sqshrun v20.8b, v22.8h,#7
-   ld1 { v8.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
+   ld1 { v1.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
    sqshrun v21.8b, v28.8h,#7
-   ld1 { v14.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
+   ld1 { v29.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
    sqshrun v26.8b, v24.8h,#7
    ld1 { v16.8h},[x1],x12 //load and increment the pi2_src2 ii iteration
    sqshrun v27.8b, v30.8h,#7
@@ -463,15 +463,15 @@ core_loop_16:
    mov v20.d[1],v21.d[0]
    mov v26.d[1],v27.d[0]
    st1 { v20.4s},[x2],x5
-   sqadd v28.8h,v10.8h,v12.8h
+   sqadd v28.8h,v5.8h,v17.8h
    st1 { v26.4s},[x2],x10
    add x20,x2,x14
    csel x2, x20, x2,eq
-   sqadd v24.8h,v6.8h,v8.8h
+   sqadd v24.8h,v6.8h,v1.8h
    subs x9,x9,#16
    add x20,x0,x7
    csel x0, x20, x0,eq
-   sqadd v26.8h,v14.8h,v16.8h
+   sqadd v26.8h,v29.8h,v16.8h
    add x20,x1,x7
    csel x1, x20, x1,eq
@@ -487,15 +487,15 @@ core_loop_16:
    sqadd v28.8h,v28.8h,v0.8h
    ld1 { v4.8h},[x1],#16 //load and increment the pi2_src2
    sqadd v24.8h,v24.8h,v0.8h
-   ld1 { v10.8h},[x0],x11 //load and increment the pi2_src1
+   ld1 { v5.8h},[x0],x11 //load and increment the pi2_src1
    sqadd v30.8h,v26.8h,v0.8h
-   ld1 { v12.8h},[x1],x11 //load and increment the pi2_src2
+   ld1 { v17.8h},[x1],x11 //load and increment the pi2_src2
    sqshrun v20.8b, v22.8h,#7
    ld1 { v6.8h},[x0],#16 //load and increment the pi2_src1 ii iteration
    sqshrun v21.8b, v28.8h,#7
-   ld1 { v8.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
+   ld1 { v1.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
    sqshrun v26.8b, v24.8h,#7
-   ld1 { v14.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
+   ld1 { v29.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
    sqshrun v27.8b, v30.8h,#7
    ld1 { v16.8h},[x1],x12 //load and increment the pi2_src2 ii iteration
@@ -533,7 +533,7 @@ end_loops:
    // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
    ldp x21, x22,[sp],#16
    ldp x19, x20,[sp],#16
-   pop_v_regs
+
    ret
diff --git a/common/arm64/ihevc_weighted_pred_uni.s b/common/arm64/ihevc_weighted_pred_uni.s
index d805230..5586679 100644
--- a/common/arm64/ihevc_weighted_pred_uni.s
+++ b/common/arm64/ihevc_weighted_pred_uni.s
@@ -129,7 +129,7 @@ ihevc_weighted_pred_uni_av8:
    ldr w9,[sp,#8]
    // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
-   push_v_regs
+   stp x19, x20,[sp,#-16]!
    stp x21, x22,[sp,#-16]!
@@ -175,37 +175,37 @@ core_loop:
    smull v4.4s, v1.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0)
    add v4.4s, v4.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t)
-   ld1 {v8.4h},[x5],x2 //load and increment the pi2_src iii iteration
+   ld1 {v3.4h},[x5],x2 //load and increment the pi2_src iii iteration
    smull v6.4s, v2.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) ii iteration
-   ld1 {v9.4h},[x5],x2 //load and increment the pi2_src_tmp iv iteration
+   ld1 {v5.4h},[x5],x2 //load and increment the pi2_src_tmp iv iteration
    sshl v4.4s,v4.4s,v28.4s
    //vshl.s32 q2,q2,q14 //vshlq_s32(i4_tmp1_t, tmp_shift_t)
    add v6.4s, v6.4s , v30.4s //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) ii iteration
-   smull v10.4s, v8.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration
+   smull v7.4s, v3.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration
    sqxtun v4.4h, v4.4s //vqmovun_s32(sto_res_tmp1)
-   add v10.4s, v10.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration
+   add v7.4s, v7.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration
    //mov v5, v4 //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
    sshl v6.4s,v6.4s,v28.4s
    //vshl.s32 q3,q3,q14 //vshlq_s32(i4_tmp2_t, tmp_shift_t) ii iteration
-   smull v12.4s, v9.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration
+   smull v16.4s, v5.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration
    uqxtn v4.8b, v4.8h //vqmovn_u16(sto_res_tmp3)
-   sshl v10.4s,v10.4s,v28.4s
+   sshl v7.4s,v7.4s,v28.4s
    //vshl.s32 q5,q5,q14 //vshlq_s32(i4_tmp1_t, tmp_shift_t) iii iteration
    sqxtun v6.4h, v6.4s //vqmovun_s32(sto_res_tmp1) ii iteration
-   add v12.4s, v12.4s , v30.4s //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) iv iteration
+   add v16.4s, v16.4s , v30.4s //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) iv iteration
    //mov v7, v6 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration
-   sqxtun v10.4h, v10.4s //vqmovun_s32(sto_res_tmp1) iii iteration
+   sqxtun v7.4h, v7.4s //vqmovun_s32(sto_res_tmp1) iii iteration
-   sshl v12.4s,v12.4s,v28.4s
+   sshl v16.4s,v16.4s,v28.4s
    //vshl.s32 q6,q6,q14 //vshlq_s32(i4_tmp2_t, tmp_shift_t) iv iteration
    st1 {v4.s}[0],[x1],#4 //store pu1_dst i iteration
    //mov v11, v10 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
@@ -213,15 +213,15 @@ core_loop:
    uqxtn v6.8b, v6.8h //vqmovn_u16(sto_res_tmp3) ii iteration
    st1 {v6.s}[0],[x6],x3 //store pu1_dst ii iteration
-   uqxtn v10.8b, v10.8h //vqmovn_u16(sto_res_tmp3) iii iteration
-   sqxtun v12.4h, v12.4s //vqmovun_s32(sto_res_tmp1) iv iteration
+   uqxtn v7.8b, v7.8h //vqmovn_u16(sto_res_tmp3) iii iteration
+   sqxtun v16.4h, v16.4s //vqmovun_s32(sto_res_tmp1) iv iteration
    //mov v13, v12 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iv iteration
-   st1 {v10.s}[0],[x6],x3 //store pu1_dst i iteration iii iteration
-   uqxtn v12.8b, v12.8h //vqmovn_u16(sto_res_tmp3) iv iteration
+   st1 {v7.s}[0],[x6],x3 //store pu1_dst i iteration iii iteration
+   uqxtn v16.8b, v16.8h //vqmovn_u16(sto_res_tmp3) iv iteration
    subs x9,x9,#4 //decrement wd by 4 and check for 0
-   st1 {v12.s}[0],[x6],x3 //store pu1_dst iv iteration
+   st1 {v16.s}[0],[x6],x3 //store pu1_dst iv iteration
    bgt core_loop //if greater than 0 repeat the core loop again
end_core_loop:
@@ -239,7 +239,7 @@ end_loops:
    // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
    ldp x21, x22,[sp],#16
    ldp x19, x20,[sp],#16
-   pop_v_regs
+
    ret
diff --git a/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
index 485ee66..a6041f5 100644
--- a/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
+++ b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
@@ -91,7 +91,10 @@ ihevcd_fmt_conv_420sp_to_rgba8888_av8:
    //// push the registers on the stack
    // STMFD sp!,{x4-x12,x14}
-   push_v_regs
+
+   stp d12,d14,[sp,#-16]!
+   stp d8,d15,[sp,#-16]! // Storing d15 using { sub sp,sp,#8; str d15,[sp] } is giving bus error.
+                         // d8 is used as dummy register and stored along with d15 using stp. d8 is not used in the function.
    stp x19, x20,[sp,#-16]!
@@ -194,8 +197,8 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
    prfm PLDL1KEEP,[x1]
    ////NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS
-   sMULL v8.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
-   sMULL2 v10.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
+   sMULL v5.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
+   sMULL2 v7.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
    sMULL v20.4s, v6.4h, v0.4h[0] ////(V-128)*C1 FOR R
    sMULL2 v22.4s, v6.8h, v0.4h[0] ////(V-128)*C1 FOR R
@@ -206,13 +209,13 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
    sMLAL2 v14.4s, v6.8h, v0.4h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
    ////NARROW RIGHT SHIFT BY 13 FOR R&B
-   sqshrn v8.4h, v8.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
-   sqshrn2 v8.8h, v10.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
+   sqshrn v5.4h, v5.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
+   sqshrn2 v5.8h, v7.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
    ////Q4 - WEIGHT FOR B
    ////NARROW RIGHT SHIFT BY 13 FOR R&B
-   sqshrn v10.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
-   sqshrn2 v10.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
+   sqshrn v7.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
+   sqshrn2 v7.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
    ////Q5 - WEIGHT FOR R
    ////NARROW RIGHT SHIFT BY 13 FOR G
@@ -220,12 +223,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
    sqshrn2 v12.8h, v14.4s,#13 ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
    ////Q6 - WEIGHT FOR G
-   UADDW v14.8h, v8.8h , v30.8b ////Q7 - HAS Y + B
-   UADDW v16.8h, v10.8h , v30.8b ////Q8 - HAS Y + R
+   UADDW v14.8h, v5.8h , v30.8b ////Q7 - HAS Y + B
+   UADDW v16.8h, v7.8h , v30.8b ////Q8 - HAS Y + R
    UADDW v18.8h, v12.8h , v30.8b ////Q9 - HAS Y + G
-   UADDW v20.8h, v8.8h , v31.8b ////Q10 - HAS Y + B
-   UADDW v22.8h, v10.8h , v31.8b ////Q11 - HAS Y + R
+   UADDW v20.8h, v5.8h , v31.8b ////Q10 - HAS Y + B
+   UADDW v22.8h, v7.8h , v31.8b ////Q11 - HAS Y + R
    UADDW v24.8h, v12.8h , v31.8b ////Q12 - HAS Y + G
    sqxtun v14.8b, v14.8h
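The renames in this converter touch a compact fixed-point idiom worth making explicit: the YUV-to-RGB coefficients are Q13 values, so each chroma weight is formed by a widening 16x16-to-32-bit multiply, narrowed back to 16 bits with a saturating right shift by 13, and only then added to the widened 8-bit luma row. A standalone sketch of that sequence, following the new register assignments above (written with the standard element syntax v0.h[3]; the sources spell the same operand v0.4h[3]):

    smull v5.4s, v4.4h, v0.h[3]      // low 4 lanes: (U-128)*C4 as 32-bit
    smull2 v7.4s, v4.8h, v0.h[3]     // high 4 lanes
    sqshrn v5.4h, v5.4s,#13          // saturating >>13 back to 16-bit (Q13)
    sqshrn2 v5.8h, v7.4s,#13
    uaddw v14.8h, v5.8h , v30.8b     // widen the 8-bit Y row and add: Y + B-weight
    sqxtun v14.8b, v14.8h            // saturate to the final unsigned 8-bit B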
@@ -276,12 +279,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
    ////D14-D20 - TOALLY HAVE 16 VALUES
    ////WE NEED TO SHIFT R,G,B VALUES TO GET 5BIT,6BIT AND 5BIT COMBINATIONS
-   UADDW v14.8h, v8.8h , v28.8b ////Q7 - HAS Y + B
-   UADDW v16.8h, v10.8h , v28.8b ////Q2 - HAS Y + R
+   UADDW v14.8h, v5.8h , v28.8b ////Q7 - HAS Y + B
+   UADDW v16.8h, v7.8h , v28.8b ////Q2 - HAS Y + R
    UADDW v18.8h, v12.8h , v28.8b ////Q3 - HAS Y + G
-   UADDW v20.8h, v8.8h , v29.8b ////Q10 - HAS Y + B
-   UADDW v22.8h, v10.8h , v29.8b ////Q11 - HAS Y + R
+   UADDW v20.8h, v5.8h , v29.8b ////Q10 - HAS Y + B
+   UADDW v22.8h, v7.8h , v29.8b ////Q11 - HAS Y + R
    UADDW v24.8h, v12.8h , v29.8b ////Q12 - HAS Y + G
    ////COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
@@ -357,8 +360,8 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
    ////NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS
-   sMULL v8.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
-   sMULL2 v10.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
+   sMULL v5.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
+   sMULL2 v7.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
    sMULL v20.4s, v6.4h, v0.4h[0] ////(V-128)*C1 FOR R
    sMULL2 v22.4s, v6.8h, v0.4h[0] ////(V-128)*C1 FOR R
@@ -369,13 +372,13 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
    sMLAL2 v14.4s, v6.8h, v0.4h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
    ////NARROW RIGHT SHIFT BY 13 FOR R&B
-   sqshrn v8.4h, v8.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
-   sqshrn2 v8.8h, v10.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
+   sqshrn v5.4h, v5.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
+   sqshrn2 v5.8h, v7.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
    ////Q4 - WEIGHT FOR B
    ////NARROW RIGHT SHIFT BY 13 FOR R&B
-   sqshrn v10.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
-   sqshrn2 v10.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
+   sqshrn v7.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
+   sqshrn2 v7.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
    ////Q5 - WEIGHT FOR R
    ////NARROW RIGHT SHIFT BY 13 FOR G
@@ -383,12 +386,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
    sqshrn2 v12.8h, v14.4s,#13 ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
    ////Q6 - WEIGHT FOR G
-   UADDW v14.8h, v8.8h , v30.8b ////Q7 - HAS Y + B
-   UADDW v16.8h, v10.8h , v30.8b ////Q8 - HAS Y + R
+   UADDW v14.8h, v5.8h , v30.8b ////Q7 - HAS Y + B
+   UADDW v16.8h, v7.8h , v30.8b ////Q8 - HAS Y + R
    UADDW v18.8h, v12.8h , v30.8b ////Q9 - HAS Y + G
-   UADDW v20.8h, v8.8h , v31.8b ////Q10 - HAS Y + B
-   UADDW v22.8h, v10.8h , v31.8b ////Q11 - HAS Y + R
+   UADDW v20.8h, v5.8h , v31.8b ////Q10 - HAS Y + B
+   UADDW v22.8h, v7.8h , v31.8b ////Q11 - HAS Y + R
    UADDW v24.8h, v12.8h , v31.8b ////Q12 - HAS Y + G
    sqxtun v14.8b, v14.8h
@@ -439,12 +442,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
    ////D14-D20 - TOALLY HAVE 16 VALUES
    ////WE NEED TO SHIFT R,G,B VALUES TO GET 5BIT,6BIT AND 5BIT COMBINATIONS
-   UADDW v14.8h, v8.8h , v28.8b ////Q7 - HAS Y + B
-   UADDW v16.8h, v10.8h , v28.8b ////Q2 - HAS Y + R
+   UADDW v14.8h, v5.8h , v28.8b ////Q7 - HAS Y + B
+   UADDW v16.8h, v7.8h , v28.8b ////Q2 - HAS Y + R
    UADDW v18.8h, v12.8h , v28.8b ////Q3 - HAS Y + G
-   UADDW v20.8h, v8.8h , v29.8b ////Q10 - HAS Y + B
-   UADDW v22.8h, v10.8h , v29.8b ////Q11 - HAS Y + R
+   UADDW v20.8h, v5.8h , v29.8b ////Q10 - HAS Y + B
+   UADDW v22.8h, v7.8h , v29.8b ////Q11 - HAS Y + R
    UADDW v24.8h, v12.8h , v29.8b ////Q12 - HAS Y + G
    sqxtun v14.8b, v14.8h
@@ -513,7 +516,9 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
    ////POP THE REGISTERS
    // LDMFD sp!,{x4-x12,PC}
    ldp x19, x20,[sp],#16
-   pop_v_regs
+   ldp d8,d15,[sp],#16 // Loading d15 using { ldr d15,[sp]; add sp,sp,#8 } is giving bus error.
+                       // d8 is used as dummy register and loaded along with d15 using ldp. d8 is not used in the function.
+   ldp d12,d14,[sp],#16
    ret
diff --git a/decoder/arm64/ihevcd_itrans_recon_dc_luma.s b/decoder/arm64/ihevcd_itrans_recon_dc_luma.s
index 279888b..edc70e7 100644
--- a/decoder/arm64/ihevcd_itrans_recon_dc_luma.s
+++ b/decoder/arm64/ihevcd_itrans_recon_dc_luma.s
@@ -59,7 +59,7 @@ ihevcd_itrans_recon_dc_luma_av8:
-   push_v_regs
+   stp x19, x20,[sp,#-16]!
    sxth x5,w5
@@ -120,8 +120,8 @@ col_loop:
    ld1 {v6.8b},[x7],x2
    ld1 {v7.8b},[x7],x2
-   ld1 {v8.8b},[x7],x2
-   ld1 {v9.8b},[x7]
+   ld1 {v1.8b},[x7],x2
+   ld1 {v17.8b},[x7]
    add x0,x0,#8
@@ -132,8 +132,8 @@ col_loop:
    uaddw v24.8h, v0.8h , v5.8b
    uaddw v22.8h, v0.8h , v6.8b
    uaddw v20.8h, v0.8h , v7.8b
-   uaddw v18.8h, v0.8h , v8.8b
-   uaddw v16.8h, v0.8h , v9.8b
+   uaddw v18.8h, v0.8h , v1.8b
+   uaddw v16.8h, v0.8h , v17.8b
    mov x11,x1
    sqxtun v2.8b, v30.8h
@@ -142,8 +142,8 @@ col_loop:
    sqxtun v5.8b, v24.8h
    sqxtun v6.8b, v22.8h
    sqxtun v7.8b, v20.8h
-   sqxtun v8.8b, v18.8h
-   sqxtun v9.8b, v16.8h
+   sqxtun v1.8b, v18.8h
+   sqxtun v17.8b, v16.8h
    st1 {v2.2s},[x11],x3
@@ -152,8 +152,8 @@ col_loop:
    st1 {v5.2s},[x11],x3
    st1 {v6.2s},[x11],x3
    st1 {v7.2s},[x11],x3
-   st1 {v8.2s},[x11],x3
-   st1 {v9.2s},[x11]
+   st1 {v1.2s},[x11],x3
+   st1 {v17.2s},[x11]
    add x1,x1,#8
@@ -206,7 +206,7 @@ col_loop_4:
end_loops:
    ldp x19, x20,[sp],#16
-   pop_v_regs
+
    ret
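A closing note on the bus-error comments in this patch: AArch64 sp-relative loads and stores require sp to be 16-byte aligned whenever stack alignment checking is enabled, which is the usual configuration on Android kernels. Saving one d-register with an 8-byte push therefore leaves sp misaligned and the very next sp-based access faults, so the fix is to pair the live register with an otherwise unused one and keep every sp adjustment a multiple of 16. A minimal sketch of the failing pattern and the workaround (the faulting sequence is shown only in comments):

    // sub sp, sp, #8         // sp now only 8-byte aligned
    // str d9,[sp]            // sp-relative store faults -> reported as a bus error
    stp d8, d9,[sp,#-16]!     // d8 is a dummy; one aligned 16-byte slot
    // ... function body ...
    ldp d8, d9,[sp],#16       // restore with a single aligned access
    ret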