diff options
Diffstat (limited to 'common/arm64/ihevc_weighted_pred_uni.s')
-rw-r--r-- | common/arm64/ihevc_weighted_pred_uni.s | 32 |
1 files changed, 16 insertions, 16 deletions
diff --git a/common/arm64/ihevc_weighted_pred_uni.s b/common/arm64/ihevc_weighted_pred_uni.s index d805230..5586679 100644 --- a/common/arm64/ihevc_weighted_pred_uni.s +++ b/common/arm64/ihevc_weighted_pred_uni.s @@ -129,7 +129,7 @@ ihevc_weighted_pred_uni_av8: ldr w9,[sp,#8] // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + stp x19, x20,[sp,#-16]! stp x21, x22,[sp,#-16]! @@ -175,37 +175,37 @@ core_loop: smull v4.4s, v1.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0) add v4.4s, v4.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) - ld1 {v8.4h},[x5],x2 //load and increment the pi2_src iii iteration + ld1 {v3.4h},[x5],x2 //load and increment the pi2_src iii iteration smull v6.4s, v2.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) ii iteration - ld1 {v9.4h},[x5],x2 //load and increment the pi2_src_tmp iv iteration + ld1 {v5.4h},[x5],x2 //load and increment the pi2_src_tmp iv iteration sshl v4.4s,v4.4s,v28.4s //vshl.s32 q2,q2,q14 //vshlq_s32(i4_tmp1_t, tmp_shift_t) add v6.4s, v6.4s , v30.4s //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) ii iteration - smull v10.4s, v8.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration + smull v7.4s, v3.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration sqxtun v4.4h, v4.4s //vqmovun_s32(sto_res_tmp1) - add v10.4s, v10.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration + add v7.4s, v7.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration //mov v5, v4 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) sshl v6.4s,v6.4s,v28.4s //vshl.s32 q3,q3,q14 //vshlq_s32(i4_tmp2_t, tmp_shift_t) ii iteration - smull v12.4s, v9.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration + smull v16.4s, v5.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration uqxtn v4.8b, v4.8h //vqmovn_u16(sto_res_tmp3) - sshl v10.4s,v10.4s,v28.4s + sshl v7.4s,v7.4s,v28.4s //vshl.s32 q5,q5,q14 //vshlq_s32(i4_tmp1_t, tmp_shift_t) iii iteration sqxtun v6.4h, v6.4s //vqmovun_s32(sto_res_tmp1) ii iteration - add v12.4s, v12.4s , v30.4s //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) iv iteration + add v16.4s, v16.4s , v30.4s //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) iv iteration //mov v7, v6 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration - sqxtun v10.4h, v10.4s //vqmovun_s32(sto_res_tmp1) iii iteration + sqxtun v7.4h, v7.4s //vqmovun_s32(sto_res_tmp1) iii iteration - sshl v12.4s,v12.4s,v28.4s + sshl v16.4s,v16.4s,v28.4s //vshl.s32 q6,q6,q14 //vshlq_s32(i4_tmp2_t, tmp_shift_t) iv iteration st1 {v4.s}[0],[x1],#4 //store pu1_dst i iteration //mov v11, v10 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration @@ -213,15 +213,15 @@ core_loop: uqxtn v6.8b, v6.8h //vqmovn_u16(sto_res_tmp3) ii iteration st1 {v6.s}[0],[x6],x3 //store pu1_dst ii iteration - uqxtn v10.8b, v10.8h //vqmovn_u16(sto_res_tmp3) iii iteration - sqxtun v12.4h, v12.4s //vqmovun_s32(sto_res_tmp1) iv iteration + uqxtn v7.8b, v7.8h //vqmovn_u16(sto_res_tmp3) iii iteration + sqxtun v16.4h, v16.4s //vqmovun_s32(sto_res_tmp1) iv iteration //mov v13, v12 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iv iteration - st1 {v10.s}[0],[x6],x3 //store pu1_dst i iteration iii iteration - uqxtn v12.8b, v12.8h //vqmovn_u16(sto_res_tmp3) iv iteration + st1 {v7.s}[0],[x6],x3 //store pu1_dst i iteration iii iteration + uqxtn v16.8b, v16.8h //vqmovn_u16(sto_res_tmp3) iv iteration subs x9,x9,#4 //decrement wd by 4 and check for 0 - st1 {v12.s}[0],[x6],x3 //store pu1_dst iv iteration + st1 {v16.s}[0],[x6],x3 //store pu1_dst iv iteration bgt core_loop //if greater than 0 repeat the core loop again end_core_loop: @@ -239,7 +239,7 @@ end_loops: // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x21, x22,[sp],#16 ldp x19, x20,[sp],#16 - pop_v_regs + ret |