summary refs log tree commit diff stats
path: root/common/arm64/ihevc_weighted_pred_uni.s
diff options
context:
space:
mode:
Diffstat (limited to 'common/arm64/ihevc_weighted_pred_uni.s')
-rw-r--r-- common/arm64/ihevc_weighted_pred_uni.s 32
1 files changed, 16 insertions, 16 deletions
diff --git a/common/arm64/ihevc_weighted_pred_uni.s b/common/arm64/ihevc_weighted_pred_uni.s
index d805230..5586679 100644
--- a/common/arm64/ihevc_weighted_pred_uni.s
+++ b/common/arm64/ihevc_weighted_pred_uni.s
@@ -129,7 +129,7 @@ ihevc_weighted_pred_uni_av8:
ldr w9,[sp,#8]
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
stp x21, x22,[sp,#-16]!
@@ -175,37 +175,37 @@ core_loop:
smull v4.4s, v1.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0)
add v4.4s, v4.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t)
- ld1 {v8.4h},[x5],x2 //load and increment the pi2_src iii iteration
+ ld1 {v3.4h},[x5],x2 //load and increment the pi2_src iii iteration
smull v6.4s, v2.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) ii iteration
- ld1 {v9.4h},[x5],x2 //load and increment the pi2_src_tmp iv iteration
+ ld1 {v5.4h},[x5],x2 //load and increment the pi2_src_tmp iv iteration
sshl v4.4s,v4.4s,v28.4s
//vshl.s32 q2,q2,q14 //vshlq_s32(i4_tmp1_t, tmp_shift_t)
add v6.4s, v6.4s , v30.4s //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) ii iteration
- smull v10.4s, v8.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration
+ smull v7.4s, v3.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration
sqxtun v4.4h, v4.4s //vqmovun_s32(sto_res_tmp1)
- add v10.4s, v10.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration
+ add v7.4s, v7.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration
//mov v5, v4 //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
sshl v6.4s,v6.4s,v28.4s
//vshl.s32 q3,q3,q14 //vshlq_s32(i4_tmp2_t, tmp_shift_t) ii iteration
- smull v12.4s, v9.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration
+ smull v16.4s, v5.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration
uqxtn v4.8b, v4.8h //vqmovn_u16(sto_res_tmp3)
- sshl v10.4s,v10.4s,v28.4s
+ sshl v7.4s,v7.4s,v28.4s
//vshl.s32 q5,q5,q14 //vshlq_s32(i4_tmp1_t, tmp_shift_t) iii iteration
sqxtun v6.4h, v6.4s //vqmovun_s32(sto_res_tmp1) ii iteration
- add v12.4s, v12.4s , v30.4s //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) iv iteration
+ add v16.4s, v16.4s , v30.4s //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) iv iteration
//mov v7, v6 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration
- sqxtun v10.4h, v10.4s //vqmovun_s32(sto_res_tmp1) iii iteration
+ sqxtun v7.4h, v7.4s //vqmovun_s32(sto_res_tmp1) iii iteration
- sshl v12.4s,v12.4s,v28.4s
+ sshl v16.4s,v16.4s,v28.4s
//vshl.s32 q6,q6,q14 //vshlq_s32(i4_tmp2_t, tmp_shift_t) iv iteration
st1 {v4.s}[0],[x1],#4 //store pu1_dst i iteration
//mov v11, v10 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
@@ -213,15 +213,15 @@ core_loop:
uqxtn v6.8b, v6.8h //vqmovn_u16(sto_res_tmp3) ii iteration
st1 {v6.s}[0],[x6],x3 //store pu1_dst ii iteration
- uqxtn v10.8b, v10.8h //vqmovn_u16(sto_res_tmp3) iii iteration
- sqxtun v12.4h, v12.4s //vqmovun_s32(sto_res_tmp1) iv iteration
+ uqxtn v7.8b, v7.8h //vqmovn_u16(sto_res_tmp3) iii iteration
+ sqxtun v16.4h, v16.4s //vqmovun_s32(sto_res_tmp1) iv iteration
//mov v13, v12 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iv iteration
- st1 {v10.s}[0],[x6],x3 //store pu1_dst i iteration iii iteration
- uqxtn v12.8b, v12.8h //vqmovn_u16(sto_res_tmp3) iv iteration
+ st1 {v7.s}[0],[x6],x3 //store pu1_dst i iteration iii iteration
+ uqxtn v16.8b, v16.8h //vqmovn_u16(sto_res_tmp3) iv iteration
subs x9,x9,#4 //decrement wd by 4 and check for 0
- st1 {v12.s}[0],[x6],x3 //store pu1_dst iv iteration
+ st1 {v16.s}[0],[x6],x3 //store pu1_dst iv iteration
bgt core_loop //if greater than 0 repeat the core loop again
end_core_loop:
@@ -239,7 +239,7 @@ end_loops:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x21, x22,[sp],#16
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret