summary refs log tree commit diff stats
path: root/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
diff options
context:
space:
mode:
Diffstat (limited to 'common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s')
-rw-r--r-- common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s 126
1 files changed, 63 insertions, 63 deletions
diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
index b6d0eb2..5aaabe6 100644
--- a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
+++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
@@ -104,7 +104,7 @@
ihevc_inter_pred_chroma_vert_w16inp_w16out_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
mov x15,x4 // pi1_coeff
@@ -120,10 +120,10 @@ ihevc_inter_pred_chroma_vert_w16inp_w16out_av8:
sxtl v0.8h, v0.8b //long the value
tst x6,#3 //checks wd == 2
- dup v12.4h, v0.4h[0] //coeff_0
- dup v13.4h, v0.4h[1] //coeff_1
- dup v14.4h, v0.4h[2] //coeff_2
- dup v15.4h, v0.4h[3] //coeff_3
+ dup v16.4h, v0.4h[0] //coeff_0
+ dup v17.4h, v0.4h[1] //coeff_1
+ dup v18.4h, v0.4h[2] //coeff_2
+ dup v19.4h, v0.4h[3] //coeff_3
bgt core_loop_ht_2 //jumps to loop handles wd 2
@@ -141,22 +141,22 @@ core_loop_ht_2:
inner_loop_ht_2:
add x0,x4,x2 //increments pi2_src
ld1 {v0.4h},[x4],#8 //loads pu1_src
- smull v0.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0)
+ smull v0.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0)
subs x12,x12,#8 //2wd + 8
ld1 {v2.4h},[x0],x2 //loads pi2_src
- smull v8.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v7.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
ld1 {v3.4h},[x0],x2 //loads pi2_src
- smlal v0.4s, v2.4h, v13.4h
+ smlal v0.4s, v2.4h, v17.4h
ld1 {v6.4h},[x0],x2
- smlal v8.4s, v3.4h, v13.4h
+ smlal v7.4s, v3.4h, v17.4h
ld1 {v2.4h},[x0]
add x7,x1,x3 //pu1_dst + dst_strd
- smlal v0.4s, v3.4h, v14.4h
- smlal v8.4s, v6.4h, v14.4h
- smlal v0.4s, v6.4h, v15.4h
- smlal v8.4s, v2.4h, v15.4h
+ smlal v0.4s, v3.4h, v18.4h
+ smlal v7.4s, v6.4h, v18.4h
+ smlal v0.4s, v6.4h, v19.4h
+ smlal v7.4s, v2.4h, v19.4h
sqshrn v0.4h, v0.4s,#6 //right shift
- sqshrn v30.4h, v8.4s,#6 //right shift
+ sqshrn v30.4h, v7.4s,#6 //right shift
st1 {v0.2s},[x1],#8 //stores the loaded value
st1 {v30.2s},[x7] //stores the loaded value
bgt inner_loop_ht_2 //inner loop -again
@@ -188,44 +188,44 @@ prolog:
ld1 {v1.4h},[x0],x2 //loads pi2_src
subs x11,x11,#4
ld1 {v2.4h},[x0],x2 //loads pi2_src
- smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0)
+ smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0)
ld1 {v3.4h},[x0],x2
- smlal v30.4s, v1.4h, v13.4h
- smlal v30.4s, v2.4h, v14.4h
+ smlal v30.4s, v1.4h, v17.4h
+ smlal v30.4s, v2.4h, v18.4h
add x9,x1,x3 //pu1_dst + dst_strd
- smlal v30.4s, v3.4h, v15.4h
+ smlal v30.4s, v3.4h, v19.4h
ld1 {v4.4h},[x0],x2
- smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
add x20,x4,x8
csel x4, x20, x4,le
lsl x20,x6,#1
csel x11, x20, x11,le
- smlal v28.4s, v2.4h, v13.4h
- smlal v28.4s, v3.4h, v14.4h
+ smlal v28.4s, v2.4h, v17.4h
+ smlal v28.4s, v3.4h, v18.4h
ld1 {v5.4h},[x0],x2
- smlal v28.4s, v4.4h, v15.4h
+ smlal v28.4s, v4.4h, v19.4h
sqshrn v30.4h, v30.4s,#6 //right shift
ld1 {v6.4h},[x0],x2
- smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
- smlal v26.4s, v3.4h, v13.4h
- smlal v26.4s, v4.4h, v14.4h
+ smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
+ smlal v26.4s, v3.4h, v17.4h
+ smlal v26.4s, v4.4h, v18.4h
add x0,x4,x2
ld1 {v0.4h},[x4],#8 //loads pu1_src
- smlal v26.4s, v5.4h, v15.4h
+ smlal v26.4s, v5.4h, v19.4h
sqshrn v28.4h, v28.4s,#6 //right shift
ld1 {v1.4h},[x0],x2 //loads pi2_src
- smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
st1 {v30.2s},[x1],#8 //stores the loaded value
- smlal v24.4s, v4.4h, v13.4h
+ smlal v24.4s, v4.4h, v17.4h
ld1 {v2.4h},[x0],x2 //loads pi2_src
- smlal v24.4s, v5.4h, v14.4h
+ smlal v24.4s, v5.4h, v18.4h
ld1 {v3.4h},[x0],x2
- smlal v24.4s, v6.4h, v15.4h
+ smlal v24.4s, v6.4h, v19.4h
add x20,x1,x14,lsl #1
csel x1, x20, x1,le
@@ -235,20 +235,20 @@ prolog:
beq epilog //jumps to epilog
kernel_4:
- smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0)
+ smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0)
subs x11,x11,#4
- smlal v30.4s, v1.4h, v13.4h
+ smlal v30.4s, v1.4h, v17.4h
st1 {v28.2s},[x9],x3 //stores the loaded value
- smlal v30.4s, v2.4h, v14.4h
- smlal v30.4s, v3.4h, v15.4h
+ smlal v30.4s, v2.4h, v18.4h
+ smlal v30.4s, v3.4h, v19.4h
sqshrn v24.4h, v24.4s,#6 //right shift
ld1 {v4.4h},[x0],x2
- smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
- smlal v28.4s, v2.4h, v13.4h
- smlal v28.4s, v3.4h, v14.4h
- smlal v28.4s, v4.4h, v15.4h
+ smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
+ smlal v28.4s, v2.4h, v17.4h
+ smlal v28.4s, v3.4h, v18.4h
+ smlal v28.4s, v4.4h, v19.4h
st1 {v26.2s},[x9],x3 //stores the loaded value
add x20,x4,x8
csel x4, x20, x4,le
@@ -258,27 +258,27 @@ kernel_4:
sqshrn v30.4h, v30.4s,#6 //right shift
ld1 {v5.4h},[x0],x2
- smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
ld1 {v6.4h},[x0],x2
- smlal v26.4s, v3.4h, v13.4h
+ smlal v26.4s, v3.4h, v17.4h
st1 {v24.2s},[x9] //stores the loaded value
add x0,x4,x2
- smlal v26.4s, v4.4h, v14.4h
+ smlal v26.4s, v4.4h, v18.4h
ld1 {v0.4h},[x4],#8 //loads pu1_src
- smlal v26.4s, v5.4h, v15.4h
+ smlal v26.4s, v5.4h, v19.4h
sqshrn v28.4h, v28.4s,#6 //right shift
ld1 {v1.4h},[x0],x2 //loads pi2_src
- smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
ld1 {v2.4h},[x0],x2 //loads pi2_src
- smlal v24.4s, v4.4h, v13.4h
+ smlal v24.4s, v4.4h, v17.4h
add x9,x1,x3 //pu1_dst + dst_strd
ld1 {v3.4h},[x0],x2
- smlal v24.4s, v5.4h, v14.4h
+ smlal v24.4s, v5.4h, v18.4h
st1 {v30.2s},[x1],#8 //stores the loaded value
- smlal v24.4s, v6.4h, v15.4h
+ smlal v24.4s, v6.4h, v19.4h
sqshrn v26.4h, v26.4s,#6 //right shift
add x20,x1,x14,lsl #1
@@ -289,38 +289,38 @@ kernel_4:
bgt kernel_4 //jumps to kernel_4
epilog:
- smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0)
+ smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0)
st1 {v28.2s},[x9],x3 //stores the loaded value
- smlal v30.4s, v1.4h, v13.4h
- smlal v30.4s, v2.4h, v14.4h
- smlal v30.4s, v3.4h, v15.4h
+ smlal v30.4s, v1.4h, v17.4h
+ smlal v30.4s, v2.4h, v18.4h
+ smlal v30.4s, v3.4h, v19.4h
sqshrn v24.4h, v24.4s,#6 //right shift
- smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
ld1 {v4.4h},[x0],x2
- smlal v28.4s, v2.4h, v13.4h
+ smlal v28.4s, v2.4h, v17.4h
st1 {v26.2s},[x9],x3 //stores the loaded value
- smlal v28.4s, v3.4h, v14.4h
- smlal v28.4s, v4.4h, v15.4h
+ smlal v28.4s, v3.4h, v18.4h
+ smlal v28.4s, v4.4h, v19.4h
sqshrn v30.4h, v30.4s,#6 //right shift
- smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
ld1 {v5.4h},[x0],x2
- smlal v26.4s, v3.4h, v13.4h
- smlal v26.4s, v4.4h, v14.4h
- smlal v26.4s, v5.4h, v15.4h
+ smlal v26.4s, v3.4h, v17.4h
+ smlal v26.4s, v4.4h, v18.4h
+ smlal v26.4s, v5.4h, v19.4h
sqshrn v28.4h, v28.4s,#6 //right shift
st1 {v24.2s},[x9] //stores the loaded value
- smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
- smlal v24.4s, v4.4h, v13.4h
+ smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
+ smlal v24.4s, v4.4h, v17.4h
add x9,x1,x3 //pu1_dst + dst_strd
ld1 {v6.4h},[x0],x2
- smlal v24.4s, v5.4h, v14.4h
- smlal v24.4s, v6.4h, v15.4h
+ smlal v24.4s, v5.4h, v18.4h
+ smlal v24.4s, v6.4h, v19.4h
st1 {v30.2s},[x1],#8 //stores the loaded value
sqshrn v26.4h, v26.4s,#6 //right shift
@@ -335,7 +335,7 @@ epilog:
end_loops:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret