diff options
Diffstat (limited to 'common/arm64/ihevc_inter_pred_chroma_copy_w16out.s')
-rw-r--r-- | common/arm64/ihevc_inter_pred_chroma_copy_w16out.s | 62 |
1 files changed, 31 insertions, 31 deletions
diff --git a/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s b/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s index e479651..180e5f5 100644 --- a/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s +++ b/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s @@ -104,7 +104,7 @@ ihevc_inter_pred_chroma_copy_w16out_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! mov x15,x4 // pi1_coeff @@ -172,7 +172,7 @@ end_inner_loop_wd_4: end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret @@ -219,14 +219,14 @@ core_loop_wd_8: prolog: add x6,x0,x2 //pu1_src_tmp += src_strd add x10,x1,x5 - ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp) - ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) - uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp) - uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp) - uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp) + ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) + uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp) subs x4,x4,#8 //wd decrements by 8 shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6) shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6) @@ -235,10 +235,10 @@ prolog: add x20,x0,x8 csel x0, x20, x0,le add x6,x0,x2 //pu1_src_tmp += src_strd - ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp) - ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp) + ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp) st1 {v0.8h},[x1],#16 //vst1q_s16(pi2_dst_tmp, tmp) add x20,x1,x11,lsl #1 @@ -256,15 +256,15 @@ prolog: outer_loop_wd_8: st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) + uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp) st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp) - uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp) subs x4,x4,#8 //wd decrements by 8 add x20,x0,x8 @@ -272,16 +272,16 @@ outer_loop_wd_8: add x6,x0,x2 //pu1_src_tmp += src_strd - ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp) + ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp) shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6) - ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp) shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6) - ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp) shl v4.8h, v20.8h,#6 //vshlq_n_s16(tmp, 6) - ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp) add x10,x1,x5 shl v6.8h, v22.8h,#6 //vshlq_n_s16(tmp, 6) @@ -298,15 +298,15 @@ outer_loop_wd_8: epilog: st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) + uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp) st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) - uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp) - uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp) //add x6,x0,x2 //pu1_src_tmp += src_strd shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6) @@ -325,10 +325,10 @@ epilog_end: core_loop_wd_8_ht_2: add x6,x0,x2 //pu1_src_tmp += src_strd add x10,x1,x5 - ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp) - ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp) - uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) - uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp) + ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp) + ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp) + uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) + uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp) subs x12,x12,#8 //wd decrements by 8 shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6) shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6) @@ -338,7 +338,7 @@ core_loop_wd_8_ht_2: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ret |