summary | refs | log | tree | commit | diff | stats
path: root/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s
diff options
context:
space:
mode:
Diffstat (limited to 'common/arm64/ihevc_inter_pred_chroma_vert_w16out.s')
-rw-r--r-- common/arm64/ihevc_inter_pred_chroma_vert_w16out.s 74
1 file changed, 37 insertions, 37 deletions
diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s
index 9f5687f..ec946eb 100644
--- a/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s
+++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s
@@ -105,7 +105,7 @@
ihevc_inter_pred_chroma_vert_w16out_av8:
// stmfd sp!,{x4-x12,x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
mov x15,x4 // pi1_coeff
@@ -145,20 +145,20 @@ ihevc_inter_pred_chroma_vert_w16out_av8:
inner_loop_ht_2: //called when wd is multiple of 4 and ht is 4,2
add x6,x0,x2 //pu1_src +src_strd
- ld1 {v9.8b},[x6],x2 //loads pu1_src
+ ld1 {v17.8b},[x6],x2 //loads pu1_src
subs x5,x5,#8 //2wd - 8
ld1 {v5.8b},[x0],#8 //loads src
- umull v6.8h, v9.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
+ umull v6.8h, v17.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
ld1 {v4.8b},[x6],x2 //loads incremented src
umlsl v6.8h, v5.8b, v0.8b //vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)
- ld1 {v8.8b},[x6],x2 //loads incremented src
+ ld1 {v16.8b},[x6],x2 //loads incremented src
umlal v6.8h, v4.8b, v2.8b //vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)
umull v4.8h, v4.8b, v1.8b
- ld1 {v10.8b},[x6] //loads the incremented src
- umlsl v6.8h, v8.8b, v3.8b
- umlsl v4.8h, v9.8b, v0.8b
- umlal v4.8h, v8.8b, v2.8b
- umlsl v4.8h, v10.8b, v3.8b
+ ld1 {v18.8b},[x6] //loads the incremented src
+ umlsl v6.8h, v16.8b, v3.8b
+ umlsl v4.8h, v17.8b, v0.8b
+ umlal v4.8h, v16.8b, v2.8b
+ umlsl v4.8h, v18.8b, v3.8b
add x6,x1,x3 //pu1_dst + dst_strd
st1 { v6.8h},[x1],#16 //stores the loaded value
@@ -241,7 +241,7 @@ prolog:
add x7,x1,x3 //pu1_dst
umlal v30.8h, v6.8b, v2.8b
umlsl v30.8h, v7.8b, v3.8b
- ld1 {v8.8b},[x6],x2 //load and increment
+ ld1 {v16.8b},[x6],x2 //load and increment
umull v28.8h, v6.8b, v1.8b //mul_res 2
add x20,x0,x9 //pu1_dst += 4*dst_strd - 2*wd
@@ -250,28 +250,28 @@ prolog:
bic x20,x10,#7 //x5 ->wd
csel x5, x20, x5,le
umlal v28.8h, v7.8b, v2.8b
- ld1 {v9.8b},[x6],x2
- umlsl v28.8h, v8.8b, v3.8b
+ ld1 {v17.8b},[x6],x2
+ umlsl v28.8h, v16.8b, v3.8b
- ld1 {v10.8b},[x6],x2
+ ld1 {v18.8b},[x6],x2
umull v26.8h, v7.8b, v1.8b
add x6,x0,x2 //pu1_src + src_strd
umlsl v26.8h, v6.8b, v0.8b
st1 { v30.16b},[x1],#16 //stores the loaded value
- umlal v26.8h, v8.8b, v2.8b
+ umlal v26.8h, v16.8b, v2.8b
ld1 {v4.8b},[x0],#8 //loads the source
- umlsl v26.8h, v9.8b, v3.8b
+ umlsl v26.8h, v17.8b, v3.8b
add x20,x1,x8 //pu1_src += 4*src_strd - 2*wd
csel x1, x20, x1,le
- umull v24.8h, v8.8b, v1.8b
+ umull v24.8h, v16.8b, v1.8b
ld1 {v5.8b},[x6],x2 //loads pu1_src
umlsl v24.8h, v7.8b, v0.8b
subs x12,x12,#4
ld1 {v6.8b},[x6],x2 //load and increment
- umlal v24.8h, v9.8b, v2.8b
+ umlal v24.8h, v17.8b, v2.8b
ld1 {v7.8b},[x6],x2 //load and increment
- umlsl v24.8h, v10.8b, v3.8b
+ umlsl v24.8h, v18.8b, v3.8b
sub x20,x2,x2,lsl #3
neg x11, x20
add x14,x2,x2,lsl #1
@@ -296,7 +296,7 @@ kernel_8:
umlsl v30.8h, v7.8b, v3.8b
st1 { v26.16b},[x7],x3 //stores the loaded value
- ld1 {v8.8b},[x6],x2 //load and increment
+ ld1 {v16.8b},[x6],x2 //load and increment
umull v28.8h, v6.8b, v1.8b //mul_res 2
bic x20,x10,#7 //x5 ->wd
@@ -305,10 +305,10 @@ kernel_8:
st1 { v24.16b},[x7],x3 //stores the loaded value
umlal v28.8h, v7.8b, v2.8b
- ld1 {v9.8b},[x6],x2
+ ld1 {v17.8b},[x6],x2
- umlsl v28.8h, v8.8b, v3.8b
- ld1 {v10.8b},[x6],x2
+ umlsl v28.8h, v16.8b, v3.8b
+ ld1 {v18.8b},[x6],x2
add x7,x1,x3 //pu1_dst
umull v26.8h, v7.8b, v1.8b
add x6,x0,x2 //pu1_src + src_strd
@@ -319,13 +319,13 @@ kernel_8:
ld1 {v4.8b},[x0],#8 //loads the source
add x11,x11,x2
- umlal v26.8h, v8.8b, v2.8b
+ umlal v26.8h, v16.8b, v2.8b
st1 { v30.16b},[x1],#16 //stores the loaded value
- umlsl v26.8h, v9.8b, v3.8b
+ umlsl v26.8h, v17.8b, v3.8b
ld1 {v5.8b},[x6],x2 //loads pu1_src
- umull v24.8h, v8.8b, v1.8b
+ umull v24.8h, v16.8b, v1.8b
ld1 {v6.8b},[x6],x2 //load and increment
add x20,x1,x8 //pu1_src += 4*src_strd - 2*wd
csel x1, x20, x1,le
@@ -341,10 +341,10 @@ kernel_8:
subs x12,x12,#4
- umlal v24.8h, v9.8b, v2.8b
+ umlal v24.8h, v17.8b, v2.8b
ld1 {v7.8b},[x6],x2 //load and increment
- umlsl v24.8h, v10.8b, v3.8b
+ umlsl v24.8h, v18.8b, v3.8b
st1 { v28.16b},[x7],x3 //stores the loaded value
bgt kernel_8 //jumps to kernel_8
@@ -357,35 +357,35 @@ epilog:
umlsl v30.8h, v7.8b, v3.8b
st1 { v26.16b},[x7],x3 //stores the loaded value
- ld1 {v8.8b},[x6],x2 //load and increment
+ ld1 {v16.8b},[x6],x2 //load and increment
umull v28.8h, v6.8b, v1.8b //mul_res 2
umlsl v28.8h, v5.8b, v0.8b
umlal v28.8h, v7.8b, v2.8b
- umlsl v28.8h, v8.8b, v3.8b
+ umlsl v28.8h, v16.8b, v3.8b
st1 { v24.16b},[x7],x3 //stores the loaded value
- ld1 {v9.8b},[x6],x2
+ ld1 {v17.8b},[x6],x2
umull v26.8h, v7.8b, v1.8b
add x7,x1,x3 //pu1_dst
umlsl v26.8h, v6.8b, v0.8b
st1 { v30.16b},[x1],#16 //stores the loaded value
- umlal v26.8h, v8.8b, v2.8b
- ld1 {v10.8b},[x6],x2
- umlsl v26.8h, v9.8b, v3.8b
+ umlal v26.8h, v16.8b, v2.8b
+ ld1 {v18.8b},[x6],x2
+ umlsl v26.8h, v17.8b, v3.8b
- umull v24.8h, v8.8b, v1.8b
+ umull v24.8h, v16.8b, v1.8b
st1 { v28.16b},[x7],x3 //stores the loaded value
umlsl v24.8h, v7.8b, v0.8b
- umlal v24.8h, v9.8b, v2.8b
+ umlal v24.8h, v17.8b, v2.8b
st1 { v26.16b},[x7],x3 //stores the loaded value
- umlsl v24.8h, v10.8b, v3.8b
+ umlsl v24.8h, v18.8b, v3.8b
st1 { v24.16b},[x7],x3 //stores the loaded value
end_loops:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret