author     Naveen Kumar Ponnusamy <naveenkumar.p@ittiam.com>   2014-06-10 12:14:27 -0700
committer  Lajos Molnar <lajos@google.com>                     2014-07-12 15:09:24 -0700
commit     9cbd70a2930875be59d7df68136ac9a1a949a13d (patch)
tree       6d9957d14352fc77e2323f90b49387e577f1ade2
parent     707042fda96ebede81408b854385173483798bcd (diff)
Reduced stack operations in arm64 assembly
Change-Id: Ia19a99001fef37334f18521dd8f8710907fe370d
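Sketch of the pattern this change adopts (not part of the commit): instead of the blanket push_v_regs/pop_v_regs macros, which spill all of d8-d15, each routine now saves only the callee-saved SIMD registers it actually uses with targeted stp/ldp pairs, and temporaries are renamed away from v8-v15 where possible. The symbol name and body below are hypothetical, and the alignment rationale is an assumption; the commit's comments only report that a lone 8-byte str of d9 raised a bus error, which is consistent with AArch64's requirement that sp stay 16-byte aligned when used as a base register.

    // Hypothetical stand-alone illustration of the paired-spill pattern
    // used in ihevc_deblk_luma_horz.s below.
        .text
        .global sketch_save_d9            // hypothetical symbol name
    sketch_save_d9:
        // A lone 8-byte spill { sub sp,sp,#8; str d9,[sp] } leaves sp
        // misaligned and, per the comments in this commit, gives a bus
        // error, so the otherwise unused d8 is stored as filler next to d9.
        stp     d8, d9, [sp, #-16]!       // one aligned 16-byte push
        fmov    d9, #1.0                  // ... body clobbering d9, not d8 ...
        ldp     d8, d9, [sp], #16         // matching aligned pop
        ret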
 common/arm64/ihevc_deblk_luma_horz.s | 41
 common/arm64/ihevc_inter_pred_chroma_copy_w16out.s | 62
 common/arm64/ihevc_inter_pred_chroma_horz.s | 75
 common/arm64/ihevc_inter_pred_chroma_horz_w16out.s | 85
 common/arm64/ihevc_inter_pred_chroma_vert.s | 74
 common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s | 126
 common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s | 126
 common/arm64/ihevc_inter_pred_chroma_vert_w16out.s | 74
 common/arm64/ihevc_inter_pred_filters_luma_vert.s | 272
 common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s | 272
 common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s | 244
 common/arm64/ihevc_inter_pred_luma_copy_w16out.s | 54
 common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s | 274
 common/arm64/ihevc_intra_pred_chroma_horz.s | 72
 common/arm64/ihevc_intra_pred_chroma_mode_18_34.s | 20
 common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s | 29
 common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s | 169
 common/arm64/ihevc_intra_pred_chroma_planar.s | 73
 common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s | 143
 common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s | 61
 common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s | 161
 common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s | 29
 common/arm64/ihevc_intra_pred_luma_dc.s | 46
 common/arm64/ihevc_intra_pred_luma_horz.s | 74
 common/arm64/ihevc_intra_pred_luma_mode2.s | 68
 common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s | 29
 common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s | 171
 common/arm64/ihevc_intra_pred_luma_planar.s | 176
 common/arm64/ihevc_intra_pred_luma_vert.s | 76
 common/arm64/ihevc_itrans_recon_4x4.s | 48
 common/arm64/ihevc_itrans_recon_4x4_ttype1.s | 68
 common/arm64/ihevc_neon_macros.s | 1
 common/arm64/ihevc_sao_band_offset_luma.s | 41
 common/arm64/ihevc_sao_edge_offset_class0.s | 94
 common/arm64/ihevc_sao_edge_offset_class0_chroma.s | 144
 common/arm64/ihevc_sao_edge_offset_class1.s | 100
 common/arm64/ihevc_sao_edge_offset_class1_chroma.s | 124
 common/arm64/ihevc_sao_edge_offset_class2.s | 192
 common/arm64/ihevc_sao_edge_offset_class2_chroma.s | 174
 common/arm64/ihevc_sao_edge_offset_class3.s | 176
 common/arm64/ihevc_sao_edge_offset_class3_chroma.s | 174
 common/arm64/ihevc_weighted_pred_bi.s | 38
 common/arm64/ihevc_weighted_pred_bi_default.s | 62
 common/arm64/ihevc_weighted_pred_uni.s | 32
 decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s | 65
 decoder/arm64/ihevcd_itrans_recon_dc_luma.s | 20
 46 files changed, 2398 insertions(+), 2331 deletions(-)
diff --git a/common/arm64/ihevc_deblk_luma_horz.s b/common/arm64/ihevc_deblk_luma_horz.s
index a5c314d..f6989e9 100644
--- a/common/arm64/ihevc_deblk_luma_horz.s
+++ b/common/arm64/ihevc_deblk_luma_horz.s
@@ -50,7 +50,8 @@ ihevc_deblk_luma_horz_av8:
// stmfd sp!, {x3-x12,x14}
sxtw x5,w5
sxtw x6,w6
- stp d8,d9,[sp,#-16]!
+ stp d8,d9,[sp,#-16]! // Storing d9 using { sub sp,sp,#8; str d9,[sp] } is giving bus error.
+ // d8 is used as dummy register and stored along with d9 using stp. d8 is not used in the function.
stp d10,d11,[sp,#-16]!
stp d12,d13,[sp,#-16]!
stp d14,d15,[sp,#-16]!
@@ -212,11 +213,11 @@ l1.1564:
neg x19, x1
ldrb w7,[x0,x19] // has the -1 value
dup v22.2s,w2 // -4 value
- uaddw v8.8h, v6.8h , v27.8b
+ uaddw v7.8h, v6.8h , v27.8b
ldrb w3,[x0,#0] // x4 has the 0 value
uqadd v16.8b, v27.8b , v1.8b
and x2,x2,#0xff
- mul v12.8h, v8.8h, v0.4h[0]
+ mul v12.8h, v7.8h, v0.4h[0]
ldr w8, [x0,x10] // has the 3 value
uaddl v10.8h, v24.8b , v28.8b
subs x2,x2,x7
@@ -233,7 +234,7 @@ l1.1564:
cmp x8,x5,asr #3
bge l1.1840
- uaddw v14.8h, v8.8h , v28.8b
+ uaddw v14.8h, v7.8h , v28.8b
subs x7,x3,x7
umax v4.8b, v18.8b , v31.8b
csneg x7,x7,x7,pl
@@ -285,13 +286,13 @@ l1.1564:
subs x2,x2,x7
umax v3.8b, v18.8b , v31.8b
csneg x2,x2,x2,pl
- uaddw v8.8h, v6.8h , v26.8b
+ uaddw v7.8h, v6.8h , v26.8b
add x8,x8,x2
uqadd v30.8b, v25.8b , v1.8b
cmp x8,x5,asr #3
uqsub v31.8b, v25.8b , v1.8b
bge l1.1840
- mul v12.8h, v8.8h, v0.4h[0]
+ mul v12.8h, v7.8h, v0.4h[0]
subs x7,x3,x7
uqadd v16.8b, v24.8b , v1.8b
csneg x7,x7,x7,pl
@@ -303,7 +304,7 @@ l1.1564:
add x10, x10,#1
rshrn v20.8b, v12.8h,#3
cmp x7,x10,asr #1
- uaddw v14.8h, v8.8h , v23.8b
+ uaddw v14.8h, v7.8h , v23.8b
bge l1.1840
umin v18.8b, v20.8b , v30.8b
mov x2,#2
@@ -397,7 +398,7 @@ end_dep_deq_decision_horz:
cmp x2,#1
uqsub v31.8b, v23.8b , v1.8b
beq l1.2408
- uaddl v8.8h, v23.8b , v22.8b
+ uaddl v7.8h, v23.8b , v22.8b
cmp x5,#1
bne strong_filtering_p
@@ -412,10 +413,10 @@ strong_filtering_q:
strong_filtering_p:
umax v5.8b, v18.8b , v17.8b
mov x12,x0
- mul v8.8h, v8.8h, v0.4h[0]
+ mul v7.8h, v7.8h, v0.4h[0]
sub x20,x1,#0
neg x11, x20
- add v16.8h, v8.8h , v14.8h
+ add v16.8h, v7.8h , v14.8h
add x12,x12,x11
rshrn v19.8b, v16.8h,#3
st1 {v2.s}[0],[x12],x11
@@ -431,7 +432,8 @@ l1.2404:
ldp d14,d15,[sp],#16
ldp d12,d13,[sp],#16
ldp d10,d11,[sp],#16
- ldp d8,d9,[sp],#16
+ ldp d8,d9,[sp],#16 // Loading d9 using { ldr d9,[sp]; add sp,sp,#8 } is giving bus error.
+ // d8 is used as dummy register and loaded along with d9 using ldp. d8 is not used in the function.
ret
// x4=flag p
@@ -486,8 +488,8 @@ l1.2408:
srshr v10.8h, v10.8h,#4
// delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4@
- abs v8.8h, v10.8h
- xtn v9.8b, v8.8h
+ abs v7.8h, v10.8h
+ xtn v9.8b, v7.8h
// storing the absolute values of delta in d9
sqxtn v10.8b, v10.8h
@@ -495,16 +497,16 @@ l1.2408:
smin v11.8b, v10.8b , v30.8b
- smax v8.8b, v31.8b , v11.8b // d8 has the value delta = clip3(delta, -tc, tc)//
+ smax v7.8b, v31.8b , v11.8b // d8 has the value delta = clip3(delta, -tc, tc)//
uxtl v6.8h, v25.8b
- saddw v4.8h, v6.8h , v8.8b
+ saddw v4.8h, v6.8h , v7.8b
sqxtun v12.8b, v4.8h
uxtl v6.8h, v26.8b
- ssubw v4.8h, v6.8h , v8.8b
+ ssubw v4.8h, v6.8h , v7.8b
sqxtun v13.8b, v4.8h
@@ -525,7 +527,7 @@ l1.2408:
uaddl v14.8h, v23.8b , v25.8b
rshrn v14.8b, v14.8h,#1
usubl v14.8h, v14.8b , v24.8b
- saddw v14.8h, v14.8h , v8.8b
+ saddw v14.8h, v14.8h , v7.8b
sqshrn v14.8b, v14.8h,#1
smin v15.8b, v14.8b , v0.8b
smax v14.8b, v1.8b , v15.8b
@@ -558,7 +560,7 @@ l1.2724:
uaddl v14.8h, v26.8b , v28.8b
rshrn v14.8b, v14.8h,#1
usubl v14.8h, v14.8b , v27.8b
- ssubw v14.8h, v14.8h , v8.8b
+ ssubw v14.8h, v14.8h , v7.8b
sqshrn v14.8b, v14.8h,#1
smin v15.8b, v14.8b , v0.8b
smax v14.8b, v1.8b , v15.8b
@@ -580,7 +582,8 @@ l1.2852:
ldp d14,d15,[sp],#16
ldp d12,d13,[sp],#16
ldp d10,d11,[sp],#16
- ldp d8,d9,[sp],#16
+ ldp d8,d9,[sp],#16 // Loading d9 using { ldr d9,[sp]; add sp,sp,#8 } is giving bus error.
+ // d8 is used as dummy register and loaded along with d9 using ldp. d8 is not used in the function.
ret
diff --git a/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s b/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s
index e479651..180e5f5 100644
--- a/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s
+++ b/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s
@@ -104,7 +104,7 @@
ihevc_inter_pred_chroma_copy_w16out_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
mov x15,x4 // pi1_coeff
@@ -172,7 +172,7 @@ end_inner_loop_wd_4:
end_loops:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
@@ -219,14 +219,14 @@ core_loop_wd_8:
prolog:
add x6,x0,x2 //pu1_src_tmp += src_strd
add x10,x1,x5
- ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
- ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
- ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
- ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
- uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
- uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
- uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
- uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
+ ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
+ uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
subs x4,x4,#8 //wd decrements by 8
shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6)
shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6)
@@ -235,10 +235,10 @@ prolog:
add x20,x0,x8
csel x0, x20, x0,le
add x6,x0,x2 //pu1_src_tmp += src_strd
- ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
- ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
- ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
- ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
+ ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
st1 {v0.8h},[x1],#16 //vst1q_s16(pi2_dst_tmp, tmp)
add x20,x1,x11,lsl #1
@@ -256,15 +256,15 @@ prolog:
outer_loop_wd_8:
st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
- uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
+ uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
- uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
- uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
- uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
subs x4,x4,#8 //wd decrements by 8
add x20,x0,x8
@@ -272,16 +272,16 @@ outer_loop_wd_8:
add x6,x0,x2 //pu1_src_tmp += src_strd
- ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
+ ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6)
- ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6)
- ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
shl v4.8h, v20.8h,#6 //vshlq_n_s16(tmp, 6)
- ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
add x10,x1,x5
shl v6.8h, v22.8h,#6 //vshlq_n_s16(tmp, 6)
@@ -298,15 +298,15 @@ outer_loop_wd_8:
epilog:
st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
- uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
+ uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
- uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
- uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
- uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
//add x6,x0,x2 //pu1_src_tmp += src_strd
shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6)
@@ -325,10 +325,10 @@ epilog_end:
core_loop_wd_8_ht_2:
add x6,x0,x2 //pu1_src_tmp += src_strd
add x10,x1,x5
- ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
- ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
- uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
- uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
+ ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
+ uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
subs x12,x12,#8 //wd decrements by 8
shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6)
shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6)
@@ -338,7 +338,7 @@ core_loop_wd_8_ht_2:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_inter_pred_chroma_horz.s b/common/arm64/ihevc_inter_pred_chroma_horz.s
index cf4f0f9..513a362 100644
--- a/common/arm64/ihevc_inter_pred_chroma_horz.s
+++ b/common/arm64/ihevc_inter_pred_chroma_horz.s
@@ -105,7 +105,12 @@
ihevc_inter_pred_chroma_horz_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
+ stp d9,d10,[sp,#-16]!
+ stp d11,d12,[sp,#-16]!
+ stp d13,d14,[sp,#-16]!
+ stp d8,d15,[sp,#-16]! // Storing d15 using { sub sp,sp,#8; str d15,[sp] } is giving bus error.
+ // d8 is used as dummy register and stored along with d15 using stp. d8 is not used in the function.
stp x19, x20,[sp,#-16]!
mov x15,x4 // pi1_coeff
@@ -184,7 +189,7 @@ outer_loop_16:
add x19,x4,#8
umull v30.8h, v2.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
- ld1 { v8.2s},[x4],x11 //vector load pu1_src
+ ld1 { v29.2s},[x4],x11 //vector load pu1_src
ld1 { v9.2s},[x19],x11 //vector load pu1_src
umlsl v30.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
@@ -239,7 +244,7 @@ inner_loop_16:
csel x12, x20, x12,eq
add x20,x12,x2
csel x4, x20, x4,eq
- umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlsl v22.8h, v29.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
@@ -282,7 +287,7 @@ inner_loop_16:
umlal v20.8h, v13.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
add x19,x4,#8
- ld1 { v8.2s},[x4],x11 //vector load pu1_src
+ ld1 { v29.2s},[x4],x11 //vector load pu1_src
ld1 { v9.2s},[x19],x11 //vector load pu1_src
umlsl v20.8h, v15.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
@@ -351,7 +356,7 @@ epilog:
- umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlsl v22.8h, v29.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
subs x10,x10,#16 //decrement the wd loop
umlal v22.8h, v12.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
add x20,x12,x8
@@ -383,7 +388,7 @@ epilog:
add x19,x4,#8
- ld1 { v8.2s},[x4],x11 //vector load pu1_src
+ ld1 { v29.2s},[x4],x11 //vector load pu1_src
ld1 { v9.2s},[x19],x11 //vector load pu1_src
umlsl v30.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
ld1 { v10.2s},[x4],x11 //vector load pu1_src
@@ -418,7 +423,7 @@ epilog_end:
umull v22.8h, v10.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
- umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlsl v22.8h, v29.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
umlal v22.8h, v12.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
umlsl v22.8h, v14.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
@@ -478,12 +483,12 @@ inner_loop_8:
ld1 {v3.2s},[x12],x11 //vector load pu1_src
//vext.u8 d2,d0,d1,#2 //vector extract of src[0_2]
- umull v8.8h, v1.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
- umlsl v8.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umull v29.8h, v1.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ umlsl v29.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
//vext.u8 d4,d0,d1,#4 //vector extract of src[0_4]
//vext.u8 d6,d0,d1,#6 //vector extract of src[0_6]
- umlal v8.8h, v2.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
- umlsl v8.8h, v3.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+ umlal v29.8h, v2.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ umlsl v29.8h, v3.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
ld1 {v4.2s},[x4],x11 //vector load pu1_src
ld1 {v5.2s},[x4],x11 //vector load pu1_src
@@ -495,11 +500,11 @@ inner_loop_8:
umlsl v10.8h, v4.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
//vext.u8 d16,d12,d13,#4 //vector extract of src[0_4]
//vext.u8 d18,d12,d13,#6 //vector extract of src[0_6]
- sqrshrun v8.8b, v8.8h,#6 //right shift and saturating narrow result 1
+ sqrshrun v29.8b, v29.8h,#6 //right shift and saturating narrow result 1
umlal v10.8h, v6.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
umlsl v10.8h, v7.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
- st1 {v8.8b},[x1],#8 //store the result pu1_dst
+ st1 {v29.8b},[x1],#8 //store the result pu1_dst
sqrshrun v10.8b, v10.8h,#6 //right shift and saturating narrow result 2
subs x7,x7,#8 //decrement the wd loop
@@ -545,17 +550,17 @@ inner_loop_ht_4:
//sub x12, x12, #6 //(2)
ld1 {v14.2s},[x12],x11 //(3)vector load pu1_src
- umull v8.8h, v1.8b, v25.8b //(1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ umull v29.8h, v1.8b, v25.8b //(1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
ld1 {v15.2s},[x12],x11 //(3)vector load pu1_src
- umlsl v8.8h, v0.8b, v24.8b //(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlsl v29.8h, v0.8b, v24.8b //(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
ld1 {v16.2s},[x12],x11 //(3)vector load pu1_src
- umlal v8.8h, v2.8b, v26.8b //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ umlal v29.8h, v2.8b, v26.8b //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
//ld1 {v17.2s},[x12],x2 //(3)vector load pu1_src
ld1 {v17.2s},[x12],x8 //(3)vector load pu1_src
- umlsl v8.8h, v3.8b, v27.8b //(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+ umlsl v29.8h, v3.8b, v27.8b //(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
//sub x12, x12, #6 //(3)
umull v10.8h, v5.8b, v25.8b //(2)mul_res = vmull_u8(src[0_3], coeffabs_3)//
@@ -570,7 +575,7 @@ inner_loop_ht_4:
umlsl v10.8h, v7.8b, v27.8b //(2)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
ld1 {v21.2s},[x12],x2 //(4)vector load pu1_src
- sqrshrun v8.8b, v8.8h,#6 //(1)right shift and saturating narrow result 1
+ sqrshrun v29.8b, v29.8h,#6 //(1)right shift and saturating narrow result 1
add x9,x9,#8 //(core loop)
@@ -595,7 +600,7 @@ core_loop:
//sub x12, x12, #6 //(1_1)
- st1 {v8.8b},[x4],x3 //(1)store the result pu1_dst
+ st1 {v29.8b},[x4],x3 //(1)store the result pu1_dst
sqrshrun v10.8b, v10.8h,#6 //(2)right shift and saturating narrow result 2
ld1 {v4.2s},[x12],x11 //(2_1)vector load pu1_src
@@ -617,17 +622,17 @@ core_loop:
sqrshrun v12.8b, v12.8h,#6 //(3)right shift and saturating narrow result 1
ld1 {v14.2s},[x12],x11 //(3_1)vector load pu1_src
- umull v8.8h, v1.8b, v25.8b //(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ umull v29.8h, v1.8b, v25.8b //(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
ld1 {v15.2s},[x12],x11 //(3_1)vector load pu1_src
- umlsl v8.8h, v0.8b, v24.8b //(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlsl v29.8h, v0.8b, v24.8b //(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
ld1 {v16.2s},[x12],x11 //(3_1)vector load pu1_src
- umlal v8.8h, v2.8b, v26.8b //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ umlal v29.8h, v2.8b, v26.8b //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
//ld1 {v17.2s},[x12],x2 //(3_1)vector load pu1_src
ld1 {v17.2s},[x12],x8 //(3_1)vector load pu1_src
- umlsl v8.8h, v3.8b, v27.8b //(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+ umlsl v29.8h, v3.8b, v27.8b //(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
//sub x12, x12, #6 //(3_1)
@@ -653,7 +658,7 @@ core_loop:
subs x7,x7,#8 //(core loop)
st1 {v22.8b},[x4], x3 //(4)store the result pu1_dst
- sqrshrun v8.8b, v8.8h,#6 //(1_1)right shift and saturating narrow result 1
+ sqrshrun v29.8b, v29.8h,#6 //(1_1)right shift and saturating narrow result 1
mov x4, x1 //(core loop)
@@ -668,7 +673,7 @@ epilogue:
umlsl v12.8h, v17.8b, v27.8b //(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
- st1 {v8.8b},[x4],x3 //(1)store the result pu1_dst
+ st1 {v29.8b},[x4],x3 //(1)store the result pu1_dst
sqrshrun v10.8b, v10.8h,#6 //(2)right shift and saturating narrow result 2
umull v22.8h, v19.8b, v25.8b //(4)mul_res = vmull_u8(src[0_3], coeffabs_3)//
@@ -735,16 +740,16 @@ inner_loop_4:
zip1 v3.2s, v23.2s, v19.2s
zip2 v7.2s, v23.2s, v19.2s
- umull v8.8h, v1.8b, v25.8b //arithmetic operations for ii iteration in the same time
- umlsl v8.8h, v0.8b, v24.8b
- umlal v8.8h, v2.8b, v26.8b
- umlsl v8.8h, v3.8b, v27.8b
+ umull v29.8h, v1.8b, v25.8b //arithmetic operations for ii iteration in the same time
+ umlsl v29.8h, v0.8b, v24.8b
+ umlal v29.8h, v2.8b, v26.8b
+ umlsl v29.8h, v3.8b, v27.8b
- sqrshrun v8.8b, v8.8h,#6 //narrow right shift and saturating the result
- st1 {v8.s}[0],[x1],#4 //store the i iteration result which is in upper part of the register
+ sqrshrun v29.8b, v29.8h,#6 //narrow right shift and saturating the result
+ st1 {v29.s}[0],[x1],#4 //store the i iteration result which is in upper part of the register
subs x7,x7,#4 //decrement the wd by 4
- st1 {v8.s}[1],[x6],#4 //store the ii iteration result which is in lower part of the register
+ st1 {v29.s}[1],[x6],#4 //store the ii iteration result which is in lower part of the register
bgt inner_loop_4
@@ -759,7 +764,11 @@ end_loops:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+ ldp d8,d15,[sp],#16 // Loading d15 using { ldr d15,[sp]; add sp,sp,#8 } is giving bus error.
+ // d8 is used as dummy register and loaded along with d15 using ldp. d8 is not used in the function.
+ ldp d13,d14,[sp],#16
+ ldp d11,d12,[sp],#16
+ ldp d9,d10,[sp],#16
ret
diff --git a/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s b/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s
index a35fdaa..efc09f9 100644
--- a/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s
+++ b/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s
@@ -104,7 +104,10 @@
ihevc_inter_pred_chroma_horz_w16out_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
stp x19, x20,[sp,#-16]!
mov x15,x4 // pi1_coeff
@@ -201,8 +204,8 @@ outer_loop_16:
add x19,x4,#8
umull v30.8h, v2.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
- ld1 { v8.2s},[x4],x11 //vector load pu1_src
- ld1 { v9.2s},[x19],x11 //vector load pu1_src
+ ld1 { v29.2s},[x4],x11 //vector load pu1_src
+ ld1 { v31.2s},[x19],x11 //vector load pu1_src
umlsl v30.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
@@ -261,7 +264,7 @@ inner_loop_16:
st1 { v30.8h}, [x1],#16
- umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlsl v22.8h, v29.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
@@ -284,15 +287,15 @@ inner_loop_16:
umull v20.8h, v11.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
st1 { v28.8h}, [x1],x8
- umlsl v20.8h, v9.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlsl v20.8h, v31.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
ld1 { v6.2s},[x12],x9 //vector load pu1_src
ld1 { v7.2s},[x19],x9 //vector load pu1_src
umlal v20.8h, v13.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
add x19,x4,#8
- ld1 { v8.2s},[x4],x11 //vector load pu1_src
- ld1 { v9.2s},[x19],x11 //vector load pu1_src
+ ld1 { v29.2s},[x4],x11 //vector load pu1_src
+ ld1 { v31.2s},[x19],x11 //vector load pu1_src
umlsl v20.8h, v15.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
@@ -346,7 +349,7 @@ epilog:
- umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlsl v22.8h, v29.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
subs x10,x10,#16 //decrement the wd loop
umlal v22.8h, v12.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
// add x20,x12,x2,lsl #1
@@ -365,7 +368,7 @@ epilog:
ld1 { v0.2s},[x12],x11 //vector load pu1_src
ld1 { v1.2s},[x19],x11 //vector load pu1_src
- umlsl v20.8h, v9.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlsl v20.8h, v31.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
ld1 { v2.2s},[x12],x11 //vector load pu1_src
ld1 { v3.2s},[x19],x11 //vector load pu1_src
@@ -381,8 +384,8 @@ epilog:
umull v30.8h, v2.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
add x19,x4,#8
- ld1 { v8.2s},[x4],x11 //vector load pu1_src
- ld1 { v9.2s},[x19],x11 //vector load pu1_src
+ ld1 { v29.2s},[x4],x11 //vector load pu1_src
+ ld1 { v31.2s},[x19],x11 //vector load pu1_src
umlsl v30.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
ld1 { v10.2s},[x4],x11 //vector load pu1_src
@@ -410,13 +413,13 @@ epilog:
epilog_end:
umull v22.8h, v10.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
- umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlsl v22.8h, v29.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
umlal v22.8h, v12.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
umlsl v22.8h, v14.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
umull v20.8h, v11.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
- umlsl v20.8h, v9.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlsl v20.8h, v31.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
umlal v20.8h, v13.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
umlsl v20.8h, v15.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
@@ -463,12 +466,12 @@ inner_loop_8:
//vext.u8 d2,d0,d1,#2 //vector extract of src[0_2]
- umull v8.8h, v1.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
- umlsl v8.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umull v29.8h, v1.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ umlsl v29.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
//vext.u8 d4,d0,d1,#4 //vector extract of src[0_4]
//vext.u8 d6,d0,d1,#6 //vector extract of src[0_6]
- umlal v8.8h, v2.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
- umlsl v8.8h, v3.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+ umlal v29.8h, v2.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ umlsl v29.8h, v3.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
//ld1 {v12.2s, v13.2s},[x4],x11 //vector load pu1_src + src_strd
ld1 {v4.2s},[x4],x11 //vector load pu1_src
@@ -483,7 +486,7 @@ inner_loop_8:
umlal v10.8h, v6.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
umlsl v10.8h, v7.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
- st1 {v8.8h}, [x1],#16
+ st1 {v29.8h}, [x1],#16
subs x10,x10,#8 //decrement the wd loop
st1 {v10.8h},[x6],#16 //store the result pu1_dst
@@ -530,16 +533,16 @@ inner_loop_ht_4:
ld1 {v7.2s},[x12],x0 //(2)vector load pu1_src
ld1 {v14.2s},[x12],x11 //(3)vector load pu1_src
- umull v8.8h, v1.8b, v25.8b //(1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ umull v29.8h, v1.8b, v25.8b //(1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
ld1 {v15.2s},[x12],x11 //(3)vector load pu1_src
- umlsl v8.8h, v0.8b, v24.8b //(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlsl v29.8h, v0.8b, v24.8b //(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
ld1 {v16.2s},[x12],x11 //(3)vector load pu1_src
- umlal v8.8h, v2.8b, v26.8b //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ umlal v29.8h, v2.8b, v26.8b //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
ld1 {v17.2s},[x12],x0 //(3)vector load pu1_src
- umlsl v8.8h, v3.8b, v27.8b //(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+ umlsl v29.8h, v3.8b, v27.8b //(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
ld1 {v18.2s},[x12],x11 //(4)vector load pu1_src
umull v10.8h, v5.8b, v25.8b //(2)mul_res = vmull_u8(src[0_3], coeffabs_3)//
@@ -559,7 +562,7 @@ inner_loop_ht_4:
beq epilogue
core_loop:
- st1 {v8.8h},[x4],x8 //(1)store the result pu1_dst
+ st1 {v29.8h},[x4],x8 //(1)store the result pu1_dst
mov x12,x9
ld1 {v0.2s},[x12],x11 //(1_1)vector load pu1_src
@@ -593,16 +596,16 @@ core_loop:
add x1,x1,#16 //(core loop)
ld1 {v14.2s},[x12],x11 //(3_1)vector load pu1_src
- umull v8.8h, v1.8b, v25.8b //(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ umull v29.8h, v1.8b, v25.8b //(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
ld1 {v15.2s},[x12],x11 //(3_1)vector load pu1_src
- umlsl v8.8h, v0.8b, v24.8b //(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlsl v29.8h, v0.8b, v24.8b //(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
ld1 {v16.2s},[x12],x11 //(3_1)vector load pu1_src
- umlal v8.8h, v2.8b, v26.8b //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ umlal v29.8h, v2.8b, v26.8b //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
ld1 {v17.2s},[x12],x0 //(3_1)vector load pu1_src
- umlsl v8.8h, v3.8b, v27.8b //(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+ umlsl v29.8h, v3.8b, v27.8b //(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
st1 {v22.8h}, [x4], x8 //(4)store the result pu1_dst
subs x10,x10,#8 //(core loop)
@@ -634,7 +637,7 @@ epilogue:
umlsl v12.8h, v17.8b, v27.8b //(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
- st1 {v8.8h},[x4], x8 //(1)store the result pu1_dst
+ st1 {v29.8h},[x4], x8 //(1)store the result pu1_dst
umull v22.8h, v19.8b, v25.8b //(4)mul_res = vmull_u8(src[0_3], coeffabs_3)//
umlsl v22.8h, v18.8b, v24.8b //(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
@@ -720,15 +723,15 @@ inner_loop_4:
zip2 v7.2s, v23.2s, v19.2s
//**** addn ends
- umull v8.8h, v1.8b, v25.8b //arithmetic operations for ii iteration in the same time
- umlsl v8.8h, v0.8b, v24.8b
- umlal v8.8h, v2.8b, v26.8b
- umlsl v8.8h, v3.8b, v27.8b
+ umull v29.8h, v1.8b, v25.8b //arithmetic operations for ii iteration in the same time
+ umlsl v29.8h, v0.8b, v24.8b
+ umlal v29.8h, v2.8b, v26.8b
+ umlsl v29.8h, v3.8b, v27.8b
- st1 {v8.d}[0],[x1],#8 //store the i iteration result which is in upper part of the register
+ st1 {v29.d}[0],[x1],#8 //store the i iteration result which is in upper part of the register
subs x10,x10,#4 //decrement the wd by 4
- st1 {v8.d}[1],[x6],#8 //store the ii iteration result which is in lower part of the register
+ st1 {v29.d}[1],[x6],#8 //store the ii iteration result which is in lower part of the register
bgt inner_loop_4
@@ -763,12 +766,12 @@ loop_residue:
//vext.u8 d6,d0,d1,#6 //vector extract of src[0_6]
//umlal v8.8h, v4.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
//umlsl v8.8h, v6.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
- umull v8.8h, v21.8b, v25.8b
- umlsl v8.8h, v20.8b, v24.8b
- umlal v8.8h, v22.8b, v26.8b
- umlsl v8.8h, v23.8b, v27.8b
+ umull v29.8h, v21.8b, v25.8b
+ umlsl v29.8h, v20.8b, v24.8b
+ umlal v29.8h, v22.8b, v26.8b
+ umlsl v29.8h, v23.8b, v27.8b
- st1 {v8.1d},[x1] //store the result pu1_dst
+ st1 {v29.1d},[x1] //store the result pu1_dst
subs x10,x10,#4 //decrement the wd loop
add x1,x1,#8 //pi2_dst + 8
@@ -788,7 +791,9 @@ end_loops:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
ret
diff --git a/common/arm64/ihevc_inter_pred_chroma_vert.s b/common/arm64/ihevc_inter_pred_chroma_vert.s
index 2de789f..3d61f6c 100644
--- a/common/arm64/ihevc_inter_pred_chroma_vert.s
+++ b/common/arm64/ihevc_inter_pred_chroma_vert.s
@@ -104,7 +104,7 @@
ihevc_inter_pred_chroma_vert_av8:
// stmfd sp!,{x4-x12,x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
mov x15,x4 // pi1_coeff
@@ -142,21 +142,21 @@ ihevc_inter_pred_chroma_vert_av8:
inner_loop_ht_2: //called when wd is multiple of 4 and ht is 4,2
add x6,x0,x2 //pu1_src +src_strd
- ld1 {v9.8b},[x6],x2 //loads pu1_src
+ ld1 {v17.8b},[x6],x2 //loads pu1_src
subs x5,x5,#8 //2wd - 8
ld1 {v5.8b},[x0],#8 //loads src
- umull v6.8h, v9.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
+ umull v6.8h, v17.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
ld1 {v4.8b},[x6],x2 //loads incremented src
umlsl v6.8h, v5.8b, v0.8b //vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)
- ld1 {v8.8b},[x6],x2 //loads incremented src
+ ld1 {v16.8b},[x6],x2 //loads incremented src
umlal v6.8h, v4.8b, v2.8b //vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)
umull v4.8h, v4.8b, v1.8b
- umlsl v6.8h, v8.8b, v3.8b
- umlsl v4.8h, v9.8b, v0.8b
- ld1 {v10.8b},[x6] //loads the incremented src
- umlal v4.8h, v8.8b, v2.8b
+ umlsl v6.8h, v16.8b, v3.8b
+ umlsl v4.8h, v17.8b, v0.8b
+ ld1 {v18.8b},[x6] //loads the incremented src
+ umlal v4.8h, v16.8b, v2.8b
sqrshrun v6.8b, v6.8h,#6 //shifts right
- umlsl v4.8h, v10.8b, v3.8b
+ umlsl v4.8h, v18.8b, v3.8b
add x6,x1,x3 //pu1_dst + dst_strd
sqrshrun v4.8b, v4.8h,#6 //shifts right
st1 {v6.8b},[x1],#8 //stores the loaded value
@@ -240,7 +240,7 @@ prolog:
add x7,x1,x3 //pu1_dst
umlal v30.8h, v6.8b, v2.8b
umlsl v30.8h, v7.8b, v3.8b
- ld1 {v8.8b},[x6],x2 //load and increment
+ ld1 {v16.8b},[x6],x2 //load and increment
umull v28.8h, v6.8b, v1.8b //mul_res 2
add x20,x0,x9 //pu1_dst += 4*dst_strd - 2*wd
@@ -249,30 +249,30 @@ prolog:
bic x20,x10,#7 //x5 ->wd
csel x5, x20, x5,le
umlal v28.8h, v7.8b, v2.8b
- ld1 {v9.8b},[x6],x2
- umlsl v28.8h, v8.8b, v3.8b
+ ld1 {v17.8b},[x6],x2
+ umlsl v28.8h, v16.8b, v3.8b
sqrshrun v30.8b, v30.8h,#6
- ld1 {v10.8b},[x6],x2
+ ld1 {v18.8b},[x6],x2
umull v26.8h, v7.8b, v1.8b
add x6,x0,x2 //pu1_src + src_strd
umlsl v26.8h, v6.8b, v0.8b
st1 {v30.8b},[x1],#8 //stores the loaded value
- umlal v26.8h, v8.8b, v2.8b
+ umlal v26.8h, v16.8b, v2.8b
ld1 {v4.8b},[x0],#8 //loads the source
- umlsl v26.8h, v9.8b, v3.8b
+ umlsl v26.8h, v17.8b, v3.8b
sqrshrun v28.8b, v28.8h,#6
add x20,x1,x8 //pu1_src += 4*src_strd - 2*wd
csel x1, x20, x1,le
- umull v24.8h, v8.8b, v1.8b
+ umull v24.8h, v16.8b, v1.8b
ld1 {v5.8b},[x6],x2 //loads pu1_src
umlsl v24.8h, v7.8b, v0.8b
subs x12,x12,#4
ld1 {v6.8b},[x6],x2 //load and increment
- umlal v24.8h, v9.8b, v2.8b
+ umlal v24.8h, v17.8b, v2.8b
ld1 {v7.8b},[x6],x2 //load and increment
- umlsl v24.8h, v10.8b, v3.8b
+ umlsl v24.8h, v18.8b, v3.8b
lsl x11,x2,#2
st1 {v28.8b},[x7],x3 //stores the loaded value
@@ -299,7 +299,7 @@ kernel_8:
st1 {v26.8b},[x7],x3 //stores the loaded value
sqrshrun v24.8b, v24.8h,#6
- ld1 {v8.8b},[x6],x2 //load and increment
+ ld1 {v16.8b},[x6],x2 //load and increment
umull v28.8h, v6.8b, v1.8b //mul_res 2
bic x20,x10,#7 //x5 ->wd
@@ -309,11 +309,11 @@ kernel_8:
umlal v28.8h, v7.8b, v2.8b
- ld1 {v9.8b},[x6],x2
+ ld1 {v17.8b},[x6],x2
sqrshrun v30.8b, v30.8h,#6
- umlsl v28.8h, v8.8b, v3.8b
- ld1 {v10.8b},[x6],x2
+ umlsl v28.8h, v16.8b, v3.8b
+ ld1 {v18.8b},[x6],x2
add x7,x1,x3 //pu1_dst
umull v26.8h, v7.8b, v1.8b
add x6,x0,x2 //pu1_src + src_strd
@@ -325,16 +325,16 @@ kernel_8:
umlsl v26.8h, v6.8b, v0.8b
ld1 {v4.8b},[x0],#8 //loads the source
- umlal v26.8h, v8.8b, v2.8b
+ umlal v26.8h, v16.8b, v2.8b
st1 {v30.8b},[x1],#8 //stores the loaded value
- umlsl v26.8h, v9.8b, v3.8b
+ umlsl v26.8h, v17.8b, v3.8b
ld1 {v5.8b},[x6],x2 //loads pu1_src
add x11,x11,x2
sqrshrun v28.8b, v28.8h,#6
- umull v24.8h, v8.8b, v1.8b
+ umull v24.8h, v16.8b, v1.8b
ld1 {v6.8b},[x6],x2 //load and increment
add x20,x1,x8 //pu1_src += 4*src_strd - 2*wd
csel x1, x20, x1,le
@@ -348,10 +348,10 @@ kernel_8:
umlsl v24.8h, v7.8b, v0.8b
subs x12,x12,#4
- umlal v24.8h, v9.8b, v2.8b
+ umlal v24.8h, v17.8b, v2.8b
ld1 {v7.8b},[x6],x2 //load and increment
- umlsl v24.8h, v10.8b, v3.8b
+ umlsl v24.8h, v18.8b, v3.8b
st1 {v28.8b},[x7],x3 //stores the loaded value
sqrshrun v26.8b, v26.8h,#6
@@ -366,39 +366,39 @@ epilog:
st1 {v26.8b},[x7],x3 //stores the loaded value
sqrshrun v24.8b, v24.8h,#6
- ld1 {v8.8b},[x6],x2 //load and increment
+ ld1 {v16.8b},[x6],x2 //load and increment
umull v28.8h, v6.8b, v1.8b //mul_res 2
umlsl v28.8h, v5.8b, v0.8b
umlal v28.8h, v7.8b, v2.8b
- umlsl v28.8h, v8.8b, v3.8b
+ umlsl v28.8h, v16.8b, v3.8b
st1 {v24.8b},[x7],x3 //stores the loaded value
sqrshrun v30.8b, v30.8h,#6
- ld1 {v9.8b},[x6],x2
+ ld1 {v17.8b},[x6],x2
umull v26.8h, v7.8b, v1.8b
add x7,x1,x3 //pu1_dst
umlsl v26.8h, v6.8b, v0.8b
st1 {v30.8b},[x1],#8 //stores the loaded value
sqrshrun v28.8b, v28.8h,#6
- umlal v26.8h, v8.8b, v2.8b
- ld1 {v10.8b},[x6],x2
- umlsl v26.8h, v9.8b, v3.8b
+ umlal v26.8h, v16.8b, v2.8b
+ ld1 {v18.8b},[x6],x2
+ umlsl v26.8h, v17.8b, v3.8b
- umull v24.8h, v8.8b, v1.8b
+ umull v24.8h, v16.8b, v1.8b
sqrshrun v26.8b, v26.8h,#6
st1 {v28.8b},[x7],x3 //stores the loaded value
umlsl v24.8h, v7.8b, v0.8b
- umlal v24.8h, v9.8b, v2.8b
+ umlal v24.8h, v17.8b, v2.8b
st1 {v26.8b},[x7],x3 //stores the loaded value
- umlsl v24.8h, v10.8b, v3.8b
+ umlsl v24.8h, v18.8b, v3.8b
sqrshrun v24.8b, v24.8h,#6
st1 {v24.8b},[x7],x3 //stores the loaded value
end_loops:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s
index 55e7f54..e8f17cc 100644
--- a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s
+++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s
@@ -104,7 +104,7 @@
ihevc_inter_pred_chroma_vert_w16inp_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
mov x15,x4 // pi1_coeff
@@ -120,10 +120,10 @@ ihevc_inter_pred_chroma_vert_w16inp_av8:
sxtl v0.8h, v0.8b //long the value
tst x6,#3 //checks wd == 2
- dup v12.4h, v0.4h[0] //coeff_0
- dup v13.4h, v0.4h[1] //coeff_1
- dup v14.4h, v0.4h[2] //coeff_2
- dup v15.4h, v0.4h[3] //coeff_3
+ dup v16.4h, v0.4h[0] //coeff_0
+ dup v17.4h, v0.4h[1] //coeff_1
+ dup v18.4h, v0.4h[2] //coeff_2
+ dup v19.4h, v0.4h[3] //coeff_3
bgt core_loop_ht_2 //jumps to loop handles wd 2
@@ -141,22 +141,22 @@ core_loop_ht_2:
inner_loop_ht_2:
add x0,x4,x2 //increments pi2_src
ld1 {v0.4h},[x4],#8 //loads pu1_src
- smull v0.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0)
+ smull v0.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0)
subs x12,x12,#8 //2wd + 8
ld1 {v2.4h},[x0],x2 //loads pi2_src
- smull v8.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v7.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
ld1 {v3.4h},[x0],x2 //loads pi2_src
- smlal v0.4s, v2.4h, v13.4h
+ smlal v0.4s, v2.4h, v17.4h
ld1 {v6.4h},[x0],x2
- smlal v8.4s, v3.4h, v13.4h
+ smlal v7.4s, v3.4h, v17.4h
ld1 {v2.4h},[x0]
add x7,x1,x3 //pu1_dst + dst_strd
- smlal v0.4s, v3.4h, v14.4h
- smlal v8.4s, v6.4h, v14.4h
- smlal v0.4s, v6.4h, v15.4h
- smlal v8.4s, v2.4h, v15.4h
+ smlal v0.4s, v3.4h, v18.4h
+ smlal v7.4s, v6.4h, v18.4h
+ smlal v0.4s, v6.4h, v19.4h
+ smlal v7.4s, v2.4h, v19.4h
sqshrn v0.4h, v0.4s,#6 //right shift
- sqshrn v30.4h, v8.4s,#6 //right shift
+ sqshrn v30.4h, v7.4s,#6 //right shift
sqrshrun v0.8b, v0.8h,#6 //rounding shift
sqrshrun v30.8b, v30.8h,#6 //rounding shift
st1 {v0.s}[0],[x1],#4 //stores the loaded value
@@ -189,45 +189,45 @@ prolog:
ld1 {v1.4h},[x0],x2 //loads pi2_src
subs x11,x11,#4
ld1 {v2.4h},[x0],x2 //loads pi2_src
- smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0)
+ smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0)
ld1 {v3.4h},[x0],x2
- smlal v30.4s, v1.4h, v13.4h
- smlal v30.4s, v2.4h, v14.4h
+ smlal v30.4s, v1.4h, v17.4h
+ smlal v30.4s, v2.4h, v18.4h
add x9,x1,x3 //pu1_dst + dst_strd
- smlal v30.4s, v3.4h, v15.4h
+ smlal v30.4s, v3.4h, v19.4h
ld1 {v4.4h},[x0],x2
- smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
add x20,x4,x8
csel x4, x20, x4,le
- smlal v28.4s, v2.4h, v13.4h
+ smlal v28.4s, v2.4h, v17.4h
ld1 {v5.4h},[x0],x2
- smlal v28.4s, v3.4h, v14.4h
+ smlal v28.4s, v3.4h, v18.4h
ld1 {v6.4h},[x0],x2
- smlal v28.4s, v4.4h, v15.4h
+ smlal v28.4s, v4.4h, v19.4h
lsl x20,x6,#1
csel x11, x20, x11,le
sqshrn v30.4h, v30.4s,#6 //right shift
- smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
add x0,x4,x2
- smlal v26.4s, v3.4h, v13.4h
- smlal v26.4s, v4.4h, v14.4h
+ smlal v26.4s, v3.4h, v17.4h
+ smlal v26.4s, v4.4h, v18.4h
ld1 {v0.4h},[x4],#8 //loads pu1_src
- smlal v26.4s, v5.4h, v15.4h
+ smlal v26.4s, v5.4h, v19.4h
sqrshrun v30.8b, v30.8h,#6 //rounding shift
sqshrn v28.4h, v28.4s,#6 //right shift
ld1 {v1.4h},[x0],x2 //loads pi2_src
- smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
st1 {v30.s}[0],[x1],#4 //stores the loaded value
- smlal v24.4s, v4.4h, v13.4h
+ smlal v24.4s, v4.4h, v17.4h
ld1 {v2.4h},[x0],x2 //loads pi2_src
- smlal v24.4s, v5.4h, v14.4h
+ smlal v24.4s, v5.4h, v18.4h
ld1 {v3.4h},[x0],x2
- smlal v24.4s, v6.4h, v15.4h
+ smlal v24.4s, v6.4h, v19.4h
add x20,x1,x14
csel x1, x20, x1,le
@@ -238,21 +238,21 @@ prolog:
beq epilog //jumps to epilog
kernel_4:
- smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0)
+ smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0)
subs x11,x11,#4
- smlal v30.4s, v1.4h, v13.4h
+ smlal v30.4s, v1.4h, v17.4h
st1 {v28.s}[0],[x9],x3 //stores the loaded value
- smlal v30.4s, v2.4h, v14.4h
- smlal v30.4s, v3.4h, v15.4h
+ smlal v30.4s, v2.4h, v18.4h
+ smlal v30.4s, v3.4h, v19.4h
sqshrn v24.4h, v24.4s,#6 //right shift
sqrshrun v26.8b, v26.8h,#6 //rounding shift
ld1 {v4.4h},[x0],x2
- smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
- smlal v28.4s, v2.4h, v13.4h
- smlal v28.4s, v3.4h, v14.4h
- smlal v28.4s, v4.4h, v15.4h
+ smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
+ smlal v28.4s, v2.4h, v17.4h
+ smlal v28.4s, v3.4h, v18.4h
+ smlal v28.4s, v4.4h, v19.4h
st1 {v26.s}[0],[x9],x3 //stores the loaded value
add x20,x4,x8
csel x4, x20, x4,le
@@ -263,28 +263,28 @@ kernel_4:
sqrshrun v24.8b, v24.8h,#6 //rounding shift
ld1 {v5.4h},[x0],x2
- smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
ld1 {v6.4h},[x0],x2
- smlal v26.4s, v3.4h, v13.4h
+ smlal v26.4s, v3.4h, v17.4h
st1 {v24.s}[0],[x9] //stores the loaded value
add x0,x4,x2
- smlal v26.4s, v4.4h, v14.4h
+ smlal v26.4s, v4.4h, v18.4h
ld1 {v0.4h},[x4],#8 //loads pu1_src
- smlal v26.4s, v5.4h, v15.4h
+ smlal v26.4s, v5.4h, v19.4h
sqshrn v28.4h, v28.4s,#6 //right shift
sqrshrun v30.8b, v30.8h,#6 //rounding shift
ld1 {v1.4h},[x0],x2 //loads pi2_src
- smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
add x9,x1,x3 //pu1_dst + dst_strd
ld1 {v2.4h},[x0],x2 //loads pi2_src
- smlal v24.4s, v4.4h, v13.4h
+ smlal v24.4s, v4.4h, v17.4h
ld1 {v3.4h},[x0],x2
- smlal v24.4s, v5.4h, v14.4h
+ smlal v24.4s, v5.4h, v18.4h
st1 {v30.s}[0],[x1],#4 //stores the loaded value
- smlal v24.4s, v6.4h, v15.4h
+ smlal v24.4s, v6.4h, v19.4h
sqshrn v26.4h, v26.4s,#6 //right shift
sqrshrun v28.8b, v28.8h,#6 //rounding shift
@@ -296,41 +296,41 @@ kernel_4:
bgt kernel_4 //jumps to kernel_4
epilog:
- smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0)
+ smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0)
st1 {v28.s}[0],[x9],x3 //stores the loaded value
- smlal v30.4s, v1.4h, v13.4h
- smlal v30.4s, v2.4h, v14.4h
- smlal v30.4s, v3.4h, v15.4h
+ smlal v30.4s, v1.4h, v17.4h
+ smlal v30.4s, v2.4h, v18.4h
+ smlal v30.4s, v3.4h, v19.4h
sqshrn v24.4h, v24.4s,#6 //right shift
sqrshrun v26.8b, v26.8h,#6 //rounding shift
- smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
ld1 {v4.4h},[x0],x2
- smlal v28.4s, v2.4h, v13.4h
+ smlal v28.4s, v2.4h, v17.4h
st1 {v26.s}[0],[x9],x3 //stores the loaded value
- smlal v28.4s, v3.4h, v14.4h
- smlal v28.4s, v4.4h, v15.4h
+ smlal v28.4s, v3.4h, v18.4h
+ smlal v28.4s, v4.4h, v19.4h
sqshrn v30.4h, v30.4s,#6 //right shift
sqrshrun v24.8b, v24.8h,#6 //rounding shift
- smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
ld1 {v5.4h},[x0],x2
- smlal v26.4s, v3.4h, v13.4h
- smlal v26.4s, v4.4h, v14.4h
- smlal v26.4s, v5.4h, v15.4h
+ smlal v26.4s, v3.4h, v17.4h
+ smlal v26.4s, v4.4h, v18.4h
+ smlal v26.4s, v5.4h, v19.4h
sqshrn v28.4h, v28.4s,#6 //right shift
sqrshrun v30.8b, v30.8h,#6 //rounding shift
st1 {v24.s}[0],[x9] //stores the loaded value
- smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
- smlal v24.4s, v4.4h, v13.4h
+ smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
+ smlal v24.4s, v4.4h, v17.4h
add x9,x1,x3 //pu1_dst + dst_strd
ld1 {v6.4h},[x0],x2
- smlal v24.4s, v5.4h, v14.4h
- smlal v24.4s, v6.4h, v15.4h
+ smlal v24.4s, v5.4h, v18.4h
+ smlal v24.4s, v6.4h, v19.4h
st1 {v30.s}[0],[x1],#4 //stores the loaded value
sqrshrun v28.8b, v28.8h,#6 //rounding shift
@@ -348,7 +348,7 @@ epilog:
end_loops:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
index b6d0eb2..5aaabe6 100644
--- a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
+++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
@@ -104,7 +104,7 @@
ihevc_inter_pred_chroma_vert_w16inp_w16out_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
mov x15,x4 // pi1_coeff
@@ -120,10 +120,10 @@ ihevc_inter_pred_chroma_vert_w16inp_w16out_av8:
sxtl v0.8h, v0.8b //long the value
tst x6,#3 //checks wd == 2
- dup v12.4h, v0.4h[0] //coeff_0
- dup v13.4h, v0.4h[1] //coeff_1
- dup v14.4h, v0.4h[2] //coeff_2
- dup v15.4h, v0.4h[3] //coeff_3
+ dup v16.4h, v0.4h[0] //coeff_0
+ dup v17.4h, v0.4h[1] //coeff_1
+ dup v18.4h, v0.4h[2] //coeff_2
+ dup v19.4h, v0.4h[3] //coeff_3
bgt core_loop_ht_2 //jumps to loop handles wd 2
@@ -141,22 +141,22 @@ core_loop_ht_2:
inner_loop_ht_2:
add x0,x4,x2 //increments pi2_src
ld1 {v0.4h},[x4],#8 //loads pu1_src
- smull v0.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0)
+ smull v0.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0)
subs x12,x12,#8 //2wd + 8
ld1 {v2.4h},[x0],x2 //loads pi2_src
- smull v8.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v7.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
ld1 {v3.4h},[x0],x2 //loads pi2_src
- smlal v0.4s, v2.4h, v13.4h
+ smlal v0.4s, v2.4h, v17.4h
ld1 {v6.4h},[x0],x2
- smlal v8.4s, v3.4h, v13.4h
+ smlal v7.4s, v3.4h, v17.4h
ld1 {v2.4h},[x0]
add x7,x1,x3 //pu1_dst + dst_strd
- smlal v0.4s, v3.4h, v14.4h
- smlal v8.4s, v6.4h, v14.4h
- smlal v0.4s, v6.4h, v15.4h
- smlal v8.4s, v2.4h, v15.4h
+ smlal v0.4s, v3.4h, v18.4h
+ smlal v7.4s, v6.4h, v18.4h
+ smlal v0.4s, v6.4h, v19.4h
+ smlal v7.4s, v2.4h, v19.4h
sqshrn v0.4h, v0.4s,#6 //right shift
- sqshrn v30.4h, v8.4s,#6 //right shift
+ sqshrn v30.4h, v7.4s,#6 //right shift
st1 {v0.2s},[x1],#8 //stores the loaded value
st1 {v30.2s},[x7] //stores the loaded value
bgt inner_loop_ht_2 //inner loop -again
@@ -188,44 +188,44 @@ prolog:
ld1 {v1.4h},[x0],x2 //loads pi2_src
subs x11,x11,#4
ld1 {v2.4h},[x0],x2 //loads pi2_src
- smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0)
+ smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0)
ld1 {v3.4h},[x0],x2
- smlal v30.4s, v1.4h, v13.4h
- smlal v30.4s, v2.4h, v14.4h
+ smlal v30.4s, v1.4h, v17.4h
+ smlal v30.4s, v2.4h, v18.4h
add x9,x1,x3 //pu1_dst + dst_strd
- smlal v30.4s, v3.4h, v15.4h
+ smlal v30.4s, v3.4h, v19.4h
ld1 {v4.4h},[x0],x2
- smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
add x20,x4,x8
csel x4, x20, x4,le
lsl x20,x6,#1
csel x11, x20, x11,le
- smlal v28.4s, v2.4h, v13.4h
- smlal v28.4s, v3.4h, v14.4h
+ smlal v28.4s, v2.4h, v17.4h
+ smlal v28.4s, v3.4h, v18.4h
ld1 {v5.4h},[x0],x2
- smlal v28.4s, v4.4h, v15.4h
+ smlal v28.4s, v4.4h, v19.4h
sqshrn v30.4h, v30.4s,#6 //right shift
ld1 {v6.4h},[x0],x2
- smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
- smlal v26.4s, v3.4h, v13.4h
- smlal v26.4s, v4.4h, v14.4h
+ smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
+ smlal v26.4s, v3.4h, v17.4h
+ smlal v26.4s, v4.4h, v18.4h
add x0,x4,x2
ld1 {v0.4h},[x4],#8 //loads pu1_src
- smlal v26.4s, v5.4h, v15.4h
+ smlal v26.4s, v5.4h, v19.4h
sqshrn v28.4h, v28.4s,#6 //right shift
ld1 {v1.4h},[x0],x2 //loads pi2_src
- smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
st1 {v30.2s},[x1],#8 //stores the loaded value
- smlal v24.4s, v4.4h, v13.4h
+ smlal v24.4s, v4.4h, v17.4h
ld1 {v2.4h},[x0],x2 //loads pi2_src
- smlal v24.4s, v5.4h, v14.4h
+ smlal v24.4s, v5.4h, v18.4h
ld1 {v3.4h},[x0],x2
- smlal v24.4s, v6.4h, v15.4h
+ smlal v24.4s, v6.4h, v19.4h
add x20,x1,x14,lsl #1
csel x1, x20, x1,le
@@ -235,20 +235,20 @@ prolog:
beq epilog //jumps to epilog
kernel_4:
- smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0)
+ smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0)
subs x11,x11,#4
- smlal v30.4s, v1.4h, v13.4h
+ smlal v30.4s, v1.4h, v17.4h
st1 {v28.2s},[x9],x3 //stores the loaded value
- smlal v30.4s, v2.4h, v14.4h
- smlal v30.4s, v3.4h, v15.4h
+ smlal v30.4s, v2.4h, v18.4h
+ smlal v30.4s, v3.4h, v19.4h
sqshrn v24.4h, v24.4s,#6 //right shift
ld1 {v4.4h},[x0],x2
- smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
- smlal v28.4s, v2.4h, v13.4h
- smlal v28.4s, v3.4h, v14.4h
- smlal v28.4s, v4.4h, v15.4h
+ smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
+ smlal v28.4s, v2.4h, v17.4h
+ smlal v28.4s, v3.4h, v18.4h
+ smlal v28.4s, v4.4h, v19.4h
st1 {v26.2s},[x9],x3 //stores the loaded value
add x20,x4,x8
csel x4, x20, x4,le
@@ -258,27 +258,27 @@ kernel_4:
sqshrn v30.4h, v30.4s,#6 //right shift
ld1 {v5.4h},[x0],x2
- smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
ld1 {v6.4h},[x0],x2
- smlal v26.4s, v3.4h, v13.4h
+ smlal v26.4s, v3.4h, v17.4h
st1 {v24.2s},[x9] //stores the loaded value
add x0,x4,x2
- smlal v26.4s, v4.4h, v14.4h
+ smlal v26.4s, v4.4h, v18.4h
ld1 {v0.4h},[x4],#8 //loads pu1_src
- smlal v26.4s, v5.4h, v15.4h
+ smlal v26.4s, v5.4h, v19.4h
sqshrn v28.4h, v28.4s,#6 //right shift
ld1 {v1.4h},[x0],x2 //loads pi2_src
- smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
ld1 {v2.4h},[x0],x2 //loads pi2_src
- smlal v24.4s, v4.4h, v13.4h
+ smlal v24.4s, v4.4h, v17.4h
add x9,x1,x3 //pu1_dst + dst_strd
ld1 {v3.4h},[x0],x2
- smlal v24.4s, v5.4h, v14.4h
+ smlal v24.4s, v5.4h, v18.4h
st1 {v30.2s},[x1],#8 //stores the loaded value
- smlal v24.4s, v6.4h, v15.4h
+ smlal v24.4s, v6.4h, v19.4h
sqshrn v26.4h, v26.4s,#6 //right shift
add x20,x1,x14,lsl #1
@@ -289,38 +289,38 @@ kernel_4:
bgt kernel_4 //jumps to kernel_4
epilog:
- smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0)
+ smull v30.4s, v0.4h, v16.4h //vmull_s16(src_tmp1, coeff_0)
st1 {v28.2s},[x9],x3 //stores the loaded value
- smlal v30.4s, v1.4h, v13.4h
- smlal v30.4s, v2.4h, v14.4h
- smlal v30.4s, v3.4h, v15.4h
+ smlal v30.4s, v1.4h, v17.4h
+ smlal v30.4s, v2.4h, v18.4h
+ smlal v30.4s, v3.4h, v19.4h
sqshrn v24.4h, v24.4s,#6 //right shift
- smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v28.4s, v1.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
ld1 {v4.4h},[x0],x2
- smlal v28.4s, v2.4h, v13.4h
+ smlal v28.4s, v2.4h, v17.4h
st1 {v26.2s},[x9],x3 //stores the loaded value
- smlal v28.4s, v3.4h, v14.4h
- smlal v28.4s, v4.4h, v15.4h
+ smlal v28.4s, v3.4h, v18.4h
+ smlal v28.4s, v4.4h, v19.4h
sqshrn v30.4h, v30.4s,#6 //right shift
- smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smull v26.4s, v2.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
ld1 {v5.4h},[x0],x2
- smlal v26.4s, v3.4h, v13.4h
- smlal v26.4s, v4.4h, v14.4h
- smlal v26.4s, v5.4h, v15.4h
+ smlal v26.4s, v3.4h, v17.4h
+ smlal v26.4s, v4.4h, v18.4h
+ smlal v26.4s, v5.4h, v19.4h
sqshrn v28.4h, v28.4s,#6 //right shift
st1 {v24.2s},[x9] //stores the loaded value
- smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
- smlal v24.4s, v4.4h, v13.4h
+ smull v24.4s, v3.4h, v16.4h //vmull_s16(src_tmp2, coeff_0)
+ smlal v24.4s, v4.4h, v17.4h
add x9,x1,x3 //pu1_dst + dst_strd
ld1 {v6.4h},[x0],x2
- smlal v24.4s, v5.4h, v14.4h
- smlal v24.4s, v6.4h, v15.4h
+ smlal v24.4s, v5.4h, v18.4h
+ smlal v24.4s, v6.4h, v19.4h
st1 {v30.2s},[x1],#8 //stores the loaded value
sqshrn v26.4h, v26.4s,#6 //right shift
@@ -335,7 +335,7 @@ epilog:
end_loops:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s
index 9f5687f..ec946eb 100644
--- a/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s
+++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s
@@ -105,7 +105,7 @@
ihevc_inter_pred_chroma_vert_w16out_av8:
// stmfd sp!,{x4-x12,x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
mov x15,x4 // pi1_coeff
@@ -145,20 +145,20 @@ ihevc_inter_pred_chroma_vert_w16out_av8:
inner_loop_ht_2: //called when wd is multiple of 4 and ht is 4,2
add x6,x0,x2 //pu1_src +src_strd
- ld1 {v9.8b},[x6],x2 //loads pu1_src
+ ld1 {v17.8b},[x6],x2 //loads pu1_src
subs x5,x5,#8 //2wd - 8
ld1 {v5.8b},[x0],#8 //loads src
- umull v6.8h, v9.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
+ umull v6.8h, v17.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
ld1 {v4.8b},[x6],x2 //loads incremented src
umlsl v6.8h, v5.8b, v0.8b //vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)
- ld1 {v8.8b},[x6],x2 //loads incremented src
+ ld1 {v16.8b},[x6],x2 //loads incremented src
umlal v6.8h, v4.8b, v2.8b //vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)
umull v4.8h, v4.8b, v1.8b
- ld1 {v10.8b},[x6] //loads the incremented src
- umlsl v6.8h, v8.8b, v3.8b
- umlsl v4.8h, v9.8b, v0.8b
- umlal v4.8h, v8.8b, v2.8b
- umlsl v4.8h, v10.8b, v3.8b
+ ld1 {v18.8b},[x6] //loads the incremented src
+ umlsl v6.8h, v16.8b, v3.8b
+ umlsl v4.8h, v17.8b, v0.8b
+ umlal v4.8h, v16.8b, v2.8b
+ umlsl v4.8h, v18.8b, v3.8b
add x6,x1,x3 //pu1_dst + dst_strd
st1 { v6.8h},[x1],#16 //stores the loaded value
@@ -241,7 +241,7 @@ prolog:
add x7,x1,x3 //pu1_dst
umlal v30.8h, v6.8b, v2.8b
umlsl v30.8h, v7.8b, v3.8b
- ld1 {v8.8b},[x6],x2 //load and increment
+ ld1 {v16.8b},[x6],x2 //load and increment
umull v28.8h, v6.8b, v1.8b //mul_res 2
add x20,x0,x9 //pu1_dst += 4*dst_strd - 2*wd
@@ -250,28 +250,28 @@ prolog:
bic x20,x10,#7 //x5 ->wd
csel x5, x20, x5,le
umlal v28.8h, v7.8b, v2.8b
- ld1 {v9.8b},[x6],x2
- umlsl v28.8h, v8.8b, v3.8b
+ ld1 {v17.8b},[x6],x2
+ umlsl v28.8h, v16.8b, v3.8b
- ld1 {v10.8b},[x6],x2
+ ld1 {v18.8b},[x6],x2
umull v26.8h, v7.8b, v1.8b
add x6,x0,x2 //pu1_src + src_strd
umlsl v26.8h, v6.8b, v0.8b
st1 { v30.16b},[x1],#16 //stores the loaded value
- umlal v26.8h, v8.8b, v2.8b
+ umlal v26.8h, v16.8b, v2.8b
ld1 {v4.8b},[x0],#8 //loads the source
- umlsl v26.8h, v9.8b, v3.8b
+ umlsl v26.8h, v17.8b, v3.8b
add x20,x1,x8 //pu1_src += 4*src_strd - 2*wd
csel x1, x20, x1,le
- umull v24.8h, v8.8b, v1.8b
+ umull v24.8h, v16.8b, v1.8b
ld1 {v5.8b},[x6],x2 //loads pu1_src
umlsl v24.8h, v7.8b, v0.8b
subs x12,x12,#4
ld1 {v6.8b},[x6],x2 //load and increment
- umlal v24.8h, v9.8b, v2.8b
+ umlal v24.8h, v17.8b, v2.8b
ld1 {v7.8b},[x6],x2 //load and increment
- umlsl v24.8h, v10.8b, v3.8b
+ umlsl v24.8h, v18.8b, v3.8b
sub x20,x2,x2,lsl #3
neg x11, x20
add x14,x2,x2,lsl #1
@@ -296,7 +296,7 @@ kernel_8:
umlsl v30.8h, v7.8b, v3.8b
st1 { v26.16b},[x7],x3 //stores the loaded value
- ld1 {v8.8b},[x6],x2 //load and increment
+ ld1 {v16.8b},[x6],x2 //load and increment
umull v28.8h, v6.8b, v1.8b //mul_res 2
bic x20,x10,#7 //x5 ->wd
@@ -305,10 +305,10 @@ kernel_8:
st1 { v24.16b},[x7],x3 //stores the loaded value
umlal v28.8h, v7.8b, v2.8b
- ld1 {v9.8b},[x6],x2
+ ld1 {v17.8b},[x6],x2
- umlsl v28.8h, v8.8b, v3.8b
- ld1 {v10.8b},[x6],x2
+ umlsl v28.8h, v16.8b, v3.8b
+ ld1 {v18.8b},[x6],x2
add x7,x1,x3 //pu1_dst
umull v26.8h, v7.8b, v1.8b
add x6,x0,x2 //pu1_src + src_strd
@@ -319,13 +319,13 @@ kernel_8:
ld1 {v4.8b},[x0],#8 //loads the source
add x11,x11,x2
- umlal v26.8h, v8.8b, v2.8b
+ umlal v26.8h, v16.8b, v2.8b
st1 { v30.16b},[x1],#16 //stores the loaded value
- umlsl v26.8h, v9.8b, v3.8b
+ umlsl v26.8h, v17.8b, v3.8b
ld1 {v5.8b},[x6],x2 //loads pu1_src
- umull v24.8h, v8.8b, v1.8b
+ umull v24.8h, v16.8b, v1.8b
ld1 {v6.8b},[x6],x2 //load and increment
add x20,x1,x8 //pu1_src += 4*src_strd - 2*wd
csel x1, x20, x1,le
@@ -341,10 +341,10 @@ kernel_8:
subs x12,x12,#4
- umlal v24.8h, v9.8b, v2.8b
+ umlal v24.8h, v17.8b, v2.8b
ld1 {v7.8b},[x6],x2 //load and increment
- umlsl v24.8h, v10.8b, v3.8b
+ umlsl v24.8h, v18.8b, v3.8b
st1 { v28.16b},[x7],x3 //stores the loaded value
bgt kernel_8 //jumps to kernel_8
@@ -357,35 +357,35 @@ epilog:
umlsl v30.8h, v7.8b, v3.8b
st1 { v26.16b},[x7],x3 //stores the loaded value
- ld1 {v8.8b},[x6],x2 //load and increment
+ ld1 {v16.8b},[x6],x2 //load and increment
umull v28.8h, v6.8b, v1.8b //mul_res 2
umlsl v28.8h, v5.8b, v0.8b
umlal v28.8h, v7.8b, v2.8b
- umlsl v28.8h, v8.8b, v3.8b
+ umlsl v28.8h, v16.8b, v3.8b
st1 { v24.16b},[x7],x3 //stores the loaded value
- ld1 {v9.8b},[x6],x2
+ ld1 {v17.8b},[x6],x2
umull v26.8h, v7.8b, v1.8b
add x7,x1,x3 //pu1_dst
umlsl v26.8h, v6.8b, v0.8b
st1 { v30.16b},[x1],#16 //stores the loaded value
- umlal v26.8h, v8.8b, v2.8b
- ld1 {v10.8b},[x6],x2
- umlsl v26.8h, v9.8b, v3.8b
+ umlal v26.8h, v16.8b, v2.8b
+ ld1 {v18.8b},[x6],x2
+ umlsl v26.8h, v17.8b, v3.8b
- umull v24.8h, v8.8b, v1.8b
+ umull v24.8h, v16.8b, v1.8b
st1 { v28.16b},[x7],x3 //stores the loaded value
umlsl v24.8h, v7.8b, v0.8b
- umlal v24.8h, v9.8b, v2.8b
+ umlal v24.8h, v17.8b, v2.8b
st1 { v26.16b},[x7],x3 //stores the loaded value
- umlsl v24.8h, v10.8b, v3.8b
+ umlsl v24.8h, v18.8b, v3.8b
st1 { v24.16b},[x7],x3 //stores the loaded value
end_loops:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
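
The push_v_regs / pop_v_regs deletions above, and in every file that follows, rest on the same register remapping: the accumulators that used to live in v8-v15 are moved to caller-saved registers, so nothing callee-saved is clobbered and the prologue/epilogue spill becomes unnecessary. For orientation only (the real expansion lives in ihevc_neon_macros.s and its exact layout may differ), the removed macros amount to something like:

    // Hypothetical expansion of the removed macros; the stack layout here is
    // an assumption, but the cost is the point: AAPCS64 obliges a callee to
    // preserve the low 64 bits of v8-v15, so any kernel touching them must
    // spill and reload them on every call.
    .macro push_v_regs
        stp     d8, d9, [sp, #-64]!
        stp     d10, d11, [sp, #16]
        stp     d12, d13, [sp, #32]
        stp     d14, d15, [sp, #48]
    .endm
    .macro pop_v_regs
        ldp     d14, d15, [sp, #48]
        ldp     d12, d13, [sp, #32]
        ldp     d10, d11, [sp, #16]
        ldp     d8, d9, [sp], #64
    .endm

Renaming the working set to v16-v21 and v30 (and, in the copy kernels, to the otherwise free v1-v7) keeps every live value in caller-saved registers, which is why each deleted macro is replaced by a blank line rather than by any substitute code.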
diff --git a/common/arm64/ihevc_inter_pred_filters_luma_vert.s b/common/arm64/ihevc_inter_pred_filters_luma_vert.s
index 48dc30f..bd8b3c4 100644
--- a/common/arm64/ihevc_inter_pred_filters_luma_vert.s
+++ b/common/arm64/ihevc_inter_pred_filters_luma_vert.s
@@ -115,7 +115,7 @@
ihevc_inter_pred_luma_vert_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
mov x15,x4 // pi1_coeff
@@ -161,87 +161,87 @@ prolog:
ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)//
subs x4,x4,#8
ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+ umull v19.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
- umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
+ umlsl v19.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
+ umlsl v19.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ umlal v19.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ umlal v19.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
- umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
+ umlsl v19.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ umlal v19.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
+ umlsl v19.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+ umull v20.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
add x20,x0,x8
csel x0, x20, x0,le
- umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
+ umlsl v20.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
bic x20,x5,#7 //x5 ->wd
csel x4, x20, x4,le
- umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
+ umlsl v20.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
prfm PLDL1KEEP,[x3]
- umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ umlal v20.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
add x20,x3, x2
prfm PLDL1KEEP,[x20]
- umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ umlal v20.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
add x20,x3, x2, lsl #1
prfm PLDL1KEEP,[x20]
- umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
+ umlsl v20.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
add x3, x3, x2
- umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ umlal v20.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
add x20,x3, x2, lsl #1
prfm PLDL1KEEP,[x20]
- umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
+ umlsl v20.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
add x3,x0,x2 //pu1_src_tmp += src_strd//
- sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+ sqrshrun v19.8b, v19.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
ld1 {v1.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- umull v12.8h, v3.8b, v23.8b
+ umull v21.8h, v3.8b, v23.8b
ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- umlsl v12.8h, v2.8b, v22.8b
+ umlsl v21.8h, v2.8b, v22.8b
ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- umlsl v12.8h, v4.8b, v24.8b
- umlal v12.8h, v5.8b, v25.8b
- umlal v12.8h, v6.8b, v26.8b
- umlsl v12.8h, v7.8b, v27.8b
- umlal v12.8h, v16.8b, v28.8b
- umlsl v12.8h, v17.8b, v29.8b
+ umlsl v21.8h, v4.8b, v24.8b
+ umlal v21.8h, v5.8b, v25.8b
+ umlal v21.8h, v6.8b, v26.8b
+ umlsl v21.8h, v7.8b, v27.8b
+ umlal v21.8h, v16.8b, v28.8b
+ umlsl v21.8h, v17.8b, v29.8b
add x14,x1,x6
- st1 {v8.8b},[x1],#8 //vst1_u8(pu1_dst,sto_res)//
- sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+ st1 {v19.8b},[x1],#8 //vst1_u8(pu1_dst,sto_res)//
+ sqrshrun v20.8b, v20.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
add x20,x1,x9
csel x1, x20, x1,le
- umull v14.8h, v4.8b, v23.8b
+ umull v30.8h, v4.8b, v23.8b
subs x7,x7,#4
- umlsl v14.8h, v3.8b, v22.8b
- umlsl v14.8h, v5.8b, v24.8b
- umlal v14.8h, v6.8b, v25.8b
+ umlsl v30.8h, v3.8b, v22.8b
+ umlsl v30.8h, v5.8b, v24.8b
+ umlal v30.8h, v6.8b, v25.8b
ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
- umlal v14.8h, v7.8b, v26.8b
+ umlal v30.8h, v7.8b, v26.8b
ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- umlsl v14.8h, v16.8b, v27.8b
+ umlsl v30.8h, v16.8b, v27.8b
ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- umlal v14.8h, v17.8b, v28.8b
+ umlal v30.8h, v17.8b, v28.8b
ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- umlsl v14.8h, v18.8b, v29.8b
+ umlsl v30.8h, v18.8b, v29.8b
ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
- st1 {v10.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
- sqrshrun v12.8b, v12.8h,#6
+ st1 {v20.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
+ sqrshrun v21.8b, v21.8h,#6
blt epilog_end //jumps to epilog_end
@@ -250,111 +250,111 @@ prolog:
kernel_8:
subs x4,x4,#8
- umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+ umull v19.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
add x20,x0,x8
csel x0, x20, x0,le
- umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
+ umlsl v19.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
bic x20,x5,#7 //x5 ->wd
csel x4, x20, x4,le
- umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
+ umlsl v19.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ umlal v19.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ umlal v19.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
+ umlsl v19.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
- umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ umlal v19.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
- umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
- st1 {v12.8b},[x14],x6
+ umlsl v19.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
+ st1 {v21.8b},[x14],x6
// and x11, x0, #31
- sqrshrun v14.8b, v14.8h,#6
+ sqrshrun v30.8b, v30.8h,#6
add x3,x0,x2 //pu1_src_tmp += src_strd//
- umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+ umull v20.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
+ umlsl v20.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
- umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
+ umlsl v20.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
ld1 {v1.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ umlal v20.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
- st1 {v14.8b},[x14],x6
- umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ st1 {v30.8b},[x14],x6
+ umlal v20.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
add x14,x1,#0
- umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
+ umlsl v20.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
add x1, x1, #8
- umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ umlal v20.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
- umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
+ umlsl v20.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
add x20,x1,x9
csel x1, x20, x1,le
- sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+ sqrshrun v19.8b, v19.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
// cmp x11, x10
- umull v12.8h, v3.8b, v23.8b
+ umull v21.8h, v3.8b, v23.8b
add x10, x3, x2, lsl #3 // 10*strd - 8+2
- umlsl v12.8h, v2.8b, v22.8b
+ umlsl v21.8h, v2.8b, v22.8b
add x10, x10, x2 // 11*strd
- umlsl v12.8h, v4.8b, v24.8b
+ umlsl v21.8h, v4.8b, v24.8b
ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- umlal v12.8h, v5.8b, v25.8b
+ umlal v21.8h, v5.8b, v25.8b
- umlal v12.8h, v6.8b, v26.8b
- st1 {v8.8b},[x14],x6 //vst1_u8(pu1_dst,sto_res)//
+ umlal v21.8h, v6.8b, v26.8b
+ st1 {v19.8b},[x14],x6 //vst1_u8(pu1_dst,sto_res)//
prfm PLDL1KEEP,[x10] //11+ 0
- umlsl v12.8h, v7.8b, v27.8b
+ umlsl v21.8h, v7.8b, v27.8b
add x20,x10, x2
prfm PLDL1KEEP,[x20] //11+ 1*strd
- umlal v12.8h, v16.8b, v28.8b
+ umlal v21.8h, v16.8b, v28.8b
add x20,x10, x2, lsl #1
prfm PLDL1KEEP,[x20] //11+ 2*strd
- umlsl v12.8h, v17.8b, v29.8b
+ umlsl v21.8h, v17.8b, v29.8b
add x10, x10, x2 //12*strd
- sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+ sqrshrun v20.8b, v20.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
add x20,x10, x2, lsl #1
prfm PLDL1KEEP,[x20] //11+ 3*strd
- umull v14.8h, v4.8b, v23.8b
+ umull v30.8h, v4.8b, v23.8b
// mov x10, x11
- umlsl v14.8h, v3.8b, v22.8b
+ umlsl v30.8h, v3.8b, v22.8b
subs x7,x7,#4
- umlsl v14.8h, v5.8b, v24.8b
+ umlsl v30.8h, v5.8b, v24.8b
- umlal v14.8h, v6.8b, v25.8b
+ umlal v30.8h, v6.8b, v25.8b
ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
- umlal v14.8h, v7.8b, v26.8b
+ umlal v30.8h, v7.8b, v26.8b
ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- umlsl v14.8h, v16.8b, v27.8b
+ umlsl v30.8h, v16.8b, v27.8b
ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- umlal v14.8h, v17.8b, v28.8b
+ umlal v30.8h, v17.8b, v28.8b
ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- umlsl v14.8h, v18.8b, v29.8b
+ umlsl v30.8h, v18.8b, v29.8b
ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
- sqrshrun v12.8b, v12.8h,#6
- st1 {v10.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
+ sqrshrun v21.8b, v21.8h,#6
+ st1 {v20.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
@@ -362,62 +362,62 @@ kernel_8:
epilog:
- umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
- umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
- umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
- umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
- umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
- umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
- umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
- umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
- st1 {v12.8b},[x14],x6
+ umull v19.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+ umlsl v19.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
+ umlsl v19.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
+ umlal v19.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ umlal v19.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ umlsl v19.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
+ umlal v19.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ umlsl v19.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
+ st1 {v21.8b},[x14],x6
- sqrshrun v14.8b, v14.8h,#6
+ sqrshrun v30.8b, v30.8h,#6
ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
- umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
- umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
- umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
- umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
- umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
- umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
- umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
- st1 {v14.8b},[x14],x6
-
- sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+ umull v20.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+ umlsl v20.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
+ umlsl v20.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
+ umlal v20.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ umlal v20.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ umlsl v20.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
+ umlal v20.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ umlsl v20.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
+ st1 {v30.8b},[x14],x6
+
+ sqrshrun v19.8b, v19.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- umull v12.8h, v3.8b, v23.8b
- umlsl v12.8h, v2.8b, v22.8b
- umlsl v12.8h, v4.8b, v24.8b
- umlal v12.8h, v5.8b, v25.8b
- umlal v12.8h, v6.8b, v26.8b
- umlsl v12.8h, v7.8b, v27.8b
- umlal v12.8h, v16.8b, v28.8b
- umlsl v12.8h, v17.8b, v29.8b
+ umull v21.8h, v3.8b, v23.8b
+ umlsl v21.8h, v2.8b, v22.8b
+ umlsl v21.8h, v4.8b, v24.8b
+ umlal v21.8h, v5.8b, v25.8b
+ umlal v21.8h, v6.8b, v26.8b
+ umlsl v21.8h, v7.8b, v27.8b
+ umlal v21.8h, v16.8b, v28.8b
+ umlsl v21.8h, v17.8b, v29.8b
add x14,x1,x6
- st1 {v8.8b},[x1],#8 //vst1_u8(pu1_dst,sto_res)//
- sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+ st1 {v19.8b},[x1],#8 //vst1_u8(pu1_dst,sto_res)//
+ sqrshrun v20.8b, v20.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- umull v14.8h, v4.8b, v23.8b
- umlsl v14.8h, v3.8b, v22.8b
- umlsl v14.8h, v5.8b, v24.8b
- umlal v14.8h, v6.8b, v25.8b
- umlal v14.8h, v7.8b, v26.8b
- umlsl v14.8h, v16.8b, v27.8b
- umlal v14.8h, v17.8b, v28.8b
- umlsl v14.8h, v18.8b, v29.8b
-
- st1 {v10.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
- sqrshrun v12.8b, v12.8h,#6
+ umull v30.8h, v4.8b, v23.8b
+ umlsl v30.8h, v3.8b, v22.8b
+ umlsl v30.8h, v5.8b, v24.8b
+ umlal v30.8h, v6.8b, v25.8b
+ umlal v30.8h, v7.8b, v26.8b
+ umlsl v30.8h, v16.8b, v27.8b
+ umlal v30.8h, v17.8b, v28.8b
+ umlsl v30.8h, v18.8b, v29.8b
+
+ st1 {v20.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
+ sqrshrun v21.8b, v21.8h,#6
epilog_end:
- st1 {v12.8b},[x14],x6
- sqrshrun v14.8b, v14.8h,#6
+ st1 {v21.8b},[x14],x6
+ sqrshrun v30.8b, v30.8h,#6
- st1 {v14.8b},[x14],x6
+ st1 {v30.8b},[x14],x6
end_loops:
@@ -427,7 +427,7 @@ end_loops:
// ldmeqfd sp!,{x4-x12,x15} //reload the registers from sp
bne lbl409
ldp x19, x20,[sp], #16
- pop_v_regs
+
ret
lbl409:
mov x5, #4
@@ -465,34 +465,34 @@ inner_loop_wd_4:
ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)//
umlsl v0.8h, v6.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)//
- umull v8.8h, v7.8b, v23.8b
+ umull v19.8h, v7.8b, v23.8b
dup v4.2s, v7.2s[1] //src_tmp1 = vdup_lane_u32(src_tmp4, 1)//
umull v2.8h, v7.8b, v25.8b //mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)//
ld1 {v4.s}[1],[x3],x2 //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)//
- umlsl v8.8h, v6.8b, v22.8b
+ umlsl v19.8h, v6.8b, v22.8b
umlal v0.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)//
dup v5.2s, v4.2s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)//
- umlsl v8.8h, v4.8b, v24.8b
+ umlsl v19.8h, v4.8b, v24.8b
ld1 {v5.s}[1],[x3],x2 //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)//
umlsl v2.8h, v5.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)//
dup v6.2s, v5.2s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)//
- umlal v8.8h, v5.8b, v25.8b
+ umlal v19.8h, v5.8b, v25.8b
ld1 {v6.s}[1],[x3],x2 //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)//
umlal v0.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)//
dup v7.2s, v6.2s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)//
- umlal v8.8h, v6.8b, v26.8b
+ umlal v19.8h, v6.8b, v26.8b
ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)//
umlsl v2.8h, v7.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)//
dup v4.2s, v7.2s[1]
add v0.8h, v0.8h , v2.8h //mul_res1 = vaddq_u16(mul_res1, mul_res2)//
- umlsl v8.8h, v7.8b, v27.8b
+ umlsl v19.8h, v7.8b, v27.8b
ld1 {v4.s}[1],[x3],x2
- umlal v8.8h, v4.8b, v28.8b
+ umlal v19.8h, v4.8b, v28.8b
dup v5.2s, v4.2s[1]
sqrshrun v0.8b, v0.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
@@ -500,13 +500,13 @@ inner_loop_wd_4:
add x3,x1,x6
st1 {v0.s}[0],[x1] //vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)//
- umlsl v8.8h, v5.8b, v29.8b
+ umlsl v19.8h, v5.8b, v29.8b
st1 {v0.s}[1],[x3],x6 //vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)//
- sqrshrun v8.8b, v8.8h,#6
+ sqrshrun v19.8b, v19.8h,#6
- st1 {v8.s}[0],[x3],x6
+ st1 {v19.s}[0],[x3],x6
add x1,x1,#4
- st1 {v8.s}[1],[x3]
+ st1 {v19.s}[1],[x3]
bgt inner_loop_wd_4
end_inner_loop_wd_4:
@@ -517,6 +517,6 @@ end_inner_loop_wd_4:
// ldmfd sp!, {x4-x12, x15} //reload the registers from sp
ldp x19, x20,[sp], #16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s
index 64a00b2..cd8addf 100644
--- a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s
+++ b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s
@@ -106,7 +106,7 @@
ihevc_inter_pred_luma_vert_w16inp_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
mov x15,x4 // pi1_coeff
@@ -152,70 +152,70 @@ prolog:
ld1 {v0.4h},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)//
subs x4,x4,#4
ld1 {v2.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- smull v8.4s, v1.4h, v23.4h //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+ smull v19.4s, v1.4h, v23.4h //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
ld1 {v3.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
- smlal v8.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+ smlal v19.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)//
ld1 {v4.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- smlal v8.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+ smlal v19.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)//
ld1 {v5.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- smlal v8.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ smlal v19.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
ld1 {v6.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- smlal v8.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ smlal v19.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
ld1 {v7.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
- smlal v8.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)//
- smlal v8.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
- smlal v8.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+ smlal v19.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+ smlal v19.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ smlal v19.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)//
ld1 {v16.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- smull v10.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+ smull v20.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
add x20,x0,x8,lsl #0
csel x0, x20, x0,le
- smlal v10.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+ smlal v20.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)//
csel x4, x5, x4,le //x5 ->wd
- smlal v10.4s, v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+ smlal v20.4s, v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)//
ld1 {v17.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- smlal v10.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ smlal v20.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
ld1 {v18.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- smlal v10.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ smlal v20.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
add x3,x0,x2 //pu1_src_tmp += src_strd//
- smlal v10.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)//
- smlal v10.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
- smlal v10.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)//
- sqshrn v8.4h, v8.4s,#6
+ smlal v20.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+ smlal v20.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ smlal v20.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+ sqshrn v19.4h, v19.4s,#6
ld1 {v1.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- smull v12.4s, v3.4h, v23.4h
+ smull v21.4s, v3.4h, v23.4h
ld1 {v0.4h},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- smlal v12.4s, v2.4h, v22.4h
+ smlal v21.4s, v2.4h, v22.4h
ld1 {v2.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- smlal v12.4s, v4.4h, v24.4h
- smlal v12.4s, v5.4h, v25.4h
- smlal v12.4s, v6.4h, v26.4h
- smlal v12.4s, v7.4h, v27.4h
- smlal v12.4s, v16.4h, v28.4h
- smlal v12.4s, v17.4h, v29.4h
+ smlal v21.4s, v4.4h, v24.4h
+ smlal v21.4s, v5.4h, v25.4h
+ smlal v21.4s, v6.4h, v26.4h
+ smlal v21.4s, v7.4h, v27.4h
+ smlal v21.4s, v16.4h, v28.4h
+ smlal v21.4s, v17.4h, v29.4h
add x14,x1,x6
- sqshrn v10.4h, v10.4s,#6
- sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+ sqshrn v20.4h, v20.4s,#6
+ sqrshrun v19.8b, v19.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
- smull v14.4s, v4.4h, v23.4h
- smlal v14.4s, v3.4h, v22.4h
- smlal v14.4s, v5.4h, v24.4h
- smlal v14.4s, v6.4h, v25.4h
+ smull v30.4s, v4.4h, v23.4h
+ smlal v30.4s, v3.4h, v22.4h
+ smlal v30.4s, v5.4h, v24.4h
+ smlal v30.4s, v6.4h, v25.4h
ld1 {v3.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
- smlal v14.4s, v7.4h, v26.4h
+ smlal v30.4s, v7.4h, v26.4h
ld1 {v4.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- smlal v14.4s, v16.4h, v27.4h
+ smlal v30.4s, v16.4h, v27.4h
ld1 {v5.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- smlal v14.4s, v17.4h, v28.4h
+ smlal v30.4s, v17.4h, v28.4h
ld1 {v6.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- smlal v14.4s, v18.4h, v29.4h
+ smlal v30.4s, v18.4h, v29.4h
ld1 {v7.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
- st1 {v8.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)//
- sqshrn v12.4h, v12.4s,#6
- sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+ st1 {v19.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)//
+ sqshrn v21.4h, v21.4s,#6
+ sqrshrun v20.8b, v20.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
add x20,x1,x9
csel x1, x20, x1,le
@@ -226,164 +226,164 @@ prolog:
kernel_8:
- smull v8.4s, v1.4h, v23.4h //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+ smull v19.4s, v1.4h, v23.4h //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
subs x4,x4,#4
- smlal v8.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+ smlal v19.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)//
add x20,x0,x8,lsl #0
csel x0, x20, x0,le
- smlal v8.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)//
- smlal v8.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
- smlal v8.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
- smlal v8.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)//
- smlal v8.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
- smlal v8.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)//
- st1 {v10.s}[0],[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
-
- sqshrn v14.4h, v14.4s,#6
- sqrshrun v12.8b, v12.8h,#6
+ smlal v19.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+ smlal v19.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ smlal v19.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ smlal v19.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+ smlal v19.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ smlal v19.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+ st1 {v20.s}[0],[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
+
+ sqshrn v30.4h, v30.4s,#6
+ sqrshrun v21.8b, v21.8h,#6
ld1 {v16.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- smull v10.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
- smlal v10.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)//
- smlal v10.4s, v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)//
- smlal v10.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
- smlal v10.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
- smlal v10.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)//
- st1 {v12.s}[0],[x14],x6
+ smull v20.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+ smlal v20.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+ smlal v20.4s, v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+ smlal v20.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ smlal v20.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ smlal v20.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+ st1 {v21.s}[0],[x14],x6
- smlal v10.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ smlal v20.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
ld1 {v17.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- smlal v10.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+ smlal v20.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)//
- sqshrn v8.4h, v8.4s,#6
- sqrshrun v14.8b, v14.8h,#6
+ sqshrn v19.4h, v19.4s,#6
+ sqrshrun v30.8b, v30.8h,#6
- smull v12.4s, v3.4h, v23.4h
+ smull v21.4s, v3.4h, v23.4h
csel x4, x5, x4,le //x5 ->wd
- smlal v12.4s, v2.4h, v22.4h
+ smlal v21.4s, v2.4h, v22.4h
ld1 {v18.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- smlal v12.4s, v4.4h, v24.4h
+ smlal v21.4s, v4.4h, v24.4h
add x3,x0,x2 //pu1_src_tmp += src_strd//
- smlal v12.4s, v5.4h, v25.4h
+ smlal v21.4s, v5.4h, v25.4h
- smlal v12.4s, v6.4h, v26.4h
- st1 {v14.s}[0],[x14],x6
+ smlal v21.4s, v6.4h, v26.4h
+ st1 {v30.s}[0],[x14],x6
- smlal v12.4s, v7.4h, v27.4h
+ smlal v21.4s, v7.4h, v27.4h
ld1 {v1.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- smlal v12.4s, v16.4h, v28.4h
+ smlal v21.4s, v16.4h, v28.4h
add x14,x1,x6
- smlal v12.4s, v17.4h, v29.4h
+ smlal v21.4s, v17.4h, v29.4h
ld1 {v0.4h},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- sqshrn v10.4h, v10.4s,#6
- sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+ sqshrn v20.4h, v20.4s,#6
+ sqrshrun v19.8b, v19.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
ld1 {v2.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- smull v14.4s, v4.4h, v23.4h
- smlal v14.4s, v3.4h, v22.4h
- smlal v14.4s, v5.4h, v24.4h
+ smull v30.4s, v4.4h, v23.4h
+ smlal v30.4s, v3.4h, v22.4h
+ smlal v30.4s, v5.4h, v24.4h
ld1 {v3.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
- smlal v14.4s, v6.4h, v25.4h
+ smlal v30.4s, v6.4h, v25.4h
ld1 {v4.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- smlal v14.4s, v7.4h, v26.4h
+ smlal v30.4s, v7.4h, v26.4h
ld1 {v5.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- smlal v14.4s, v16.4h, v27.4h
+ smlal v30.4s, v16.4h, v27.4h
ld1 {v6.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- smlal v14.4s, v17.4h, v28.4h
+ smlal v30.4s, v17.4h, v28.4h
ld1 {v7.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
- smlal v14.4s, v18.4h, v29.4h
- st1 {v8.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)//
+ smlal v30.4s, v18.4h, v29.4h
+ st1 {v19.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)//
- sqshrn v12.4h, v12.4s,#6
+ sqshrn v21.4h, v21.4s,#6
add x20,x1,x9
csel x1, x20, x1,le
- sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+ sqrshrun v20.8b, v20.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
subs x7,x7,#4
bgt kernel_8 //jumps to kernel_8
epilog:
- smull v8.4s, v1.4h, v23.4h //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
- smlal v8.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)//
- smlal v8.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)//
- smlal v8.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
- smlal v8.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
- smlal v8.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)//
- smlal v8.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
- smlal v8.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)//
- st1 {v10.s}[0],[x14],x6
+ smull v19.4s, v1.4h, v23.4h //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+ smlal v19.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+ smlal v19.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+ smlal v19.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ smlal v19.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ smlal v19.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+ smlal v19.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ smlal v19.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+ st1 {v20.s}[0],[x14],x6
- sqshrn v14.4h, v14.4s,#6
- sqrshrun v12.8b, v12.8h,#6
+ sqshrn v30.4h, v30.4s,#6
+ sqrshrun v21.8b, v21.8h,#6
ld1 {v16.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- smull v10.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
- smlal v10.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)//
- smlal v10.4s, v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)//
- smlal v10.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
- smlal v10.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
- smlal v10.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)//
- smlal v10.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
- smlal v10.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)//
- st1 {v12.s}[0],[x14],x6
-
- sqshrn v8.4h, v8.4s,#6
- sqrshrun v14.8b, v14.8h,#6
+ smull v20.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+ smlal v20.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+ smlal v20.4s, v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+ smlal v20.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ smlal v20.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ smlal v20.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+ smlal v20.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ smlal v20.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+ st1 {v21.s}[0],[x14],x6
+
+ sqshrn v19.4h, v19.4s,#6
+ sqrshrun v30.8b, v30.8h,#6
ld1 {v17.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- smull v12.4s, v3.4h, v23.4h
- smlal v12.4s, v2.4h, v22.4h
- smlal v12.4s, v4.4h, v24.4h
- smlal v12.4s, v5.4h, v25.4h
- smlal v12.4s, v6.4h, v26.4h
- smlal v12.4s, v7.4h, v27.4h
- smlal v12.4s, v16.4h, v28.4h
- smlal v12.4s, v17.4h, v29.4h
- st1 {v14.s}[0],[x14],x6
- sqshrn v10.4h, v10.4s,#6
- sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+ smull v21.4s, v3.4h, v23.4h
+ smlal v21.4s, v2.4h, v22.4h
+ smlal v21.4s, v4.4h, v24.4h
+ smlal v21.4s, v5.4h, v25.4h
+ smlal v21.4s, v6.4h, v26.4h
+ smlal v21.4s, v7.4h, v27.4h
+ smlal v21.4s, v16.4h, v28.4h
+ smlal v21.4s, v17.4h, v29.4h
+ st1 {v30.s}[0],[x14],x6
+ sqshrn v20.4h, v20.4s,#6
+ sqrshrun v19.8b, v19.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
ld1 {v18.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- smull v14.4s, v4.4h, v23.4h
- smlal v14.4s, v3.4h, v22.4h
- smlal v14.4s, v5.4h, v24.4h
- smlal v14.4s, v6.4h, v25.4h
- smlal v14.4s, v7.4h, v26.4h
- smlal v14.4s, v16.4h, v27.4h
- smlal v14.4s, v17.4h, v28.4h
- smlal v14.4s, v18.4h, v29.4h
- sqshrn v12.4h, v12.4s,#6
- sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+ smull v30.4s, v4.4h, v23.4h
+ smlal v30.4s, v3.4h, v22.4h
+ smlal v30.4s, v5.4h, v24.4h
+ smlal v30.4s, v6.4h, v25.4h
+ smlal v30.4s, v7.4h, v26.4h
+ smlal v30.4s, v16.4h, v27.4h
+ smlal v30.4s, v17.4h, v28.4h
+ smlal v30.4s, v18.4h, v29.4h
+ sqshrn v21.4h, v21.4s,#6
+ sqrshrun v20.8b, v20.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
add x14,x1,x6
- st1 {v8.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)//
+ st1 {v19.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)//
epilog_end:
- st1 {v10.s}[0],[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
- sqrshrun v12.8b, v12.8h,#6
+ st1 {v20.s}[0],[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
+ sqrshrun v21.8b, v21.8h,#6
- st1 {v12.s}[0],[x14],x6
- sqshrn v14.4h, v14.4s,#6
- sqrshrun v14.8b, v14.8h,#6
+ st1 {v21.s}[0],[x14],x6
+ sqshrn v30.4h, v30.4s,#6
+ sqrshrun v30.8b, v30.8h,#6
- st1 {v14.s}[0],[x14],x6
+ st1 {v30.s}[0],[x14],x6
end_loops:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp], #16
- pop_v_regs
+
ret
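
Aside from the relabelling of v8/v10/v12/v14 to v19/v20/v21/v30, the w16inp path above is unchanged: its inputs are 16-bit intermediate samples (typically the output of a horizontal w16out pass), so the 8-tap sums are accumulated in 32-bit lanes and then narrowed twice. A minimal sketch of that narrowing, using the same registers and shifts as the hunks above (the six elided smlal lines follow the same pattern):

    smull    v19.4s, v1.4h, v23.4h      // 16-bit taps accumulate in 32-bit lanes
    smlal    v19.4s, v0.4h, v22.4h      // ... six more smlal for the remaining taps
    sqshrn   v19.4h, v19.4s, #6         // undo the scaling carried by the w16inp input
    sqrshrun v19.8b, v19.8h, #6         // round and saturate down to the u8 output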
diff --git a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s
index da316ae..ca48db5 100644
--- a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s
+++ b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s
@@ -70,7 +70,7 @@
ihevc_inter_pred_luma_vert_w16out_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
mov x15,x4 // pi1_coeff
@@ -118,83 +118,83 @@ prolog_16out:
ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)//
subs x4,x4,#8
ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+ umull v19.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
- umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
+ umlsl v19.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
+ umlsl v19.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ umlal v19.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ umlal v19.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
- umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
+ umlsl v19.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ umlal v19.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
+ umlsl v19.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
add x20,x0,x8
csel x0, x20, x0,le
- umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+ umull v20.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
bic x20,x5,#7 //x5 ->wd
csel x4, x20, x4,le
- umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
+ umlsl v20.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
+ umlsl v20.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
add x20,x20,x3
prfm PLDL1KEEP,[x20]
- umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ umlal v20.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
add x20,x3, x2
prfm PLDL1KEEP,[x20]
- umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ umlal v20.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
add x20,x3, x2, lsl #1
prfm PLDL1KEEP,[x20]
- umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
+ umlsl v20.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
add x3, x3, x2
- umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ umlal v20.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
add x20,x3, x2, lsl #1
prfm PLDL1KEEP,[x20]
- umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
+ umlsl v20.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
add x3,x0,x2 //pu1_src_tmp += src_strd//
- umull v12.8h, v3.8b, v23.8b
+ umull v21.8h, v3.8b, v23.8b
ld1 {v1.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- umlsl v12.8h, v2.8b, v22.8b
+ umlsl v21.8h, v2.8b, v22.8b
ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- umlsl v12.8h, v4.8b, v24.8b
+ umlsl v21.8h, v4.8b, v24.8b
ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- umlal v12.8h, v5.8b, v25.8b
- umlal v12.8h, v6.8b, v26.8b
- umlsl v12.8h, v7.8b, v27.8b
- umlal v12.8h, v16.8b, v28.8b
- umlsl v12.8h, v17.8b, v29.8b
+ umlal v21.8h, v5.8b, v25.8b
+ umlal v21.8h, v6.8b, v26.8b
+ umlsl v21.8h, v7.8b, v27.8b
+ umlal v21.8h, v16.8b, v28.8b
+ umlsl v21.8h, v17.8b, v29.8b
add x14,x1,x6
- st1 {v8.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)//
+ st1 {v19.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)//
//vqrshrun.s16 d10,q5,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
add x20,x1,x9,lsl #1
csel x1, x20, x1,le
- umull v14.8h, v4.8b, v23.8b
+ umull v30.8h, v4.8b, v23.8b
subs x7,x7,#4
- umlsl v14.8h, v3.8b, v22.8b
- umlsl v14.8h, v5.8b, v24.8b
- umlal v14.8h, v6.8b, v25.8b
+ umlsl v30.8h, v3.8b, v22.8b
+ umlsl v30.8h, v5.8b, v24.8b
+ umlal v30.8h, v6.8b, v25.8b
ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
- umlal v14.8h, v7.8b, v26.8b
+ umlal v30.8h, v7.8b, v26.8b
ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- umlsl v14.8h, v16.8b, v27.8b
+ umlsl v30.8h, v16.8b, v27.8b
ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- umlal v14.8h, v17.8b, v28.8b
+ umlal v30.8h, v17.8b, v28.8b
ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- umlsl v14.8h, v18.8b, v29.8b
+ umlsl v30.8h, v18.8b, v29.8b
ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
- st1 {v10.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
+ st1 {v20.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
//vqrshrun.s16 d12,q6,#6
@@ -204,170 +204,170 @@ prolog_16out:
kernel_8_16out:
subs x4,x4,#8
- umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+ umull v19.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
add x20,x0,x8
csel x0, x20, x0,le
- umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
+ umlsl v19.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
+ umlsl v19.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ umlal v19.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
bic x20,x5,#7 //x5 ->wd
csel x4, x20, x4,le
- umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ umlal v19.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
+ umlsl v19.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
- st1 {v12.16b},[x14],x6
- umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ st1 {v21.16b},[x14],x6
+ umlal v19.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
add x3,x0,x2 //pu1_src_tmp += src_strd//
- umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
+ umlsl v19.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
// and x11, x0, #31
- umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+ umull v20.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
- st1 {v14.16b},[x14],x6
- umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
+ st1 {v30.16b},[x14],x6
+ umlsl v20.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
add x14,x1,x6
- umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
+ umlsl v20.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ umlal v20.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
ld1 {v1.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ umlal v20.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
- st1 {v8.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)//
- umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
+ st1 {v19.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)//
+ umlsl v20.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
add x20,x1,x9,lsl #1
csel x1, x20, x1,le
- umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ umlal v20.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
// cmp x11, x10
- umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
+ umlsl v20.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
add x10, x3, x2, lsl #3 // 10*strd - 8+2
- umull v12.8h, v3.8b, v23.8b
+ umull v21.8h, v3.8b, v23.8b
add x10, x10, x2 // 11*strd
- umlsl v12.8h, v2.8b, v22.8b
+ umlsl v21.8h, v2.8b, v22.8b
add x20,x20,x10
prfm PLDL1KEEP,[x20] //11+ 0
- umlsl v12.8h, v4.8b, v24.8b
+ umlsl v21.8h, v4.8b, v24.8b
add x20,x10, x2
prfm PLDL1KEEP,[x20] //11+ 1*strd
- umlal v12.8h, v5.8b, v25.8b
+ umlal v21.8h, v5.8b, v25.8b
add x20,x10, x2, lsl #1
prfm PLDL1KEEP,[x20] //11+ 2*strd
- umlal v12.8h, v6.8b, v26.8b
+ umlal v21.8h, v6.8b, v26.8b
add x10, x10, x2 //12*strd
- umlsl v12.8h, v7.8b, v27.8b
+ umlsl v21.8h, v7.8b, v27.8b
add x20,x10, x2, lsl #1
prfm PLDL1KEEP,[x20] //11+ 3*strd
- umlal v12.8h, v16.8b, v28.8b
+ umlal v21.8h, v16.8b, v28.8b
// mov x10, x11
- umlsl v12.8h, v17.8b, v29.8b
+ umlsl v21.8h, v17.8b, v29.8b
ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- umull v14.8h, v4.8b, v23.8b
+ umull v30.8h, v4.8b, v23.8b
subs x7,x7,#4
- umlsl v14.8h, v3.8b, v22.8b
+ umlsl v30.8h, v3.8b, v22.8b
- st1 {v10.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
- umlsl v14.8h, v5.8b, v24.8b
+ st1 {v20.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
+ umlsl v30.8h, v5.8b, v24.8b
ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
- umlal v14.8h, v6.8b, v25.8b
+ umlal v30.8h, v6.8b, v25.8b
ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- umlal v14.8h, v7.8b, v26.8b
+ umlal v30.8h, v7.8b, v26.8b
ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- umlsl v14.8h, v16.8b, v27.8b
+ umlsl v30.8h, v16.8b, v27.8b
ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- umlal v14.8h, v17.8b, v28.8b
+ umlal v30.8h, v17.8b, v28.8b
ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
- umlsl v14.8h, v18.8b, v29.8b
+ umlsl v30.8h, v18.8b, v29.8b
bgt kernel_8_16out //jumps to kernel_8
epilog_16out:
- umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
- umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
- umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
- umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
- umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
- umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
- umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
- umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
- st1 {v12.16b},[x14],x6
+ umull v19.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+ umlsl v19.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
+ umlsl v19.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
+ umlal v19.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ umlal v19.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ umlsl v19.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
+ umlal v19.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ umlsl v19.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
+ st1 {v21.16b},[x14],x6
//vqrshrun.s16 d14,q7,#6
ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
- umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
- umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
- umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
- umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
- umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
- umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
- umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
- umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
- st1 {v14.16b},[x14],x6
+ umull v20.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+ umlsl v20.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
+ umlsl v20.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
+ umlal v20.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ umlal v20.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ umlsl v20.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
+ umlal v20.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ umlsl v20.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
+ st1 {v30.16b},[x14],x6
//vqrshrun.s16 d8,q4,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
- umull v12.8h, v3.8b, v23.8b
- umlsl v12.8h, v2.8b, v22.8b
- umlsl v12.8h, v4.8b, v24.8b
- umlal v12.8h, v5.8b, v25.8b
- umlal v12.8h, v6.8b, v26.8b
- umlsl v12.8h, v7.8b, v27.8b
- umlal v12.8h, v16.8b, v28.8b
- umlsl v12.8h, v17.8b, v29.8b
+ umull v21.8h, v3.8b, v23.8b
+ umlsl v21.8h, v2.8b, v22.8b
+ umlsl v21.8h, v4.8b, v24.8b
+ umlal v21.8h, v5.8b, v25.8b
+ umlal v21.8h, v6.8b, v26.8b
+ umlsl v21.8h, v7.8b, v27.8b
+ umlal v21.8h, v16.8b, v28.8b
+ umlsl v21.8h, v17.8b, v29.8b
add x14,x1,x6
- st1 {v8.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)//
+ st1 {v19.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)//
//vqrshrun.s16 d10,q5,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
- umull v14.8h, v4.8b, v23.8b
- umlsl v14.8h, v3.8b, v22.8b
- umlsl v14.8h, v5.8b, v24.8b
- umlal v14.8h, v6.8b, v25.8b
- umlal v14.8h, v7.8b, v26.8b
- umlsl v14.8h, v16.8b, v27.8b
- umlal v14.8h, v17.8b, v28.8b
- umlsl v14.8h, v18.8b, v29.8b
-
- st1 {v10.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
+ umull v30.8h, v4.8b, v23.8b
+ umlsl v30.8h, v3.8b, v22.8b
+ umlsl v30.8h, v5.8b, v24.8b
+ umlal v30.8h, v6.8b, v25.8b
+ umlal v30.8h, v7.8b, v26.8b
+ umlsl v30.8h, v16.8b, v27.8b
+ umlal v30.8h, v17.8b, v28.8b
+ umlsl v30.8h, v18.8b, v29.8b
+
+ st1 {v20.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
//vqrshrun.s16 d12,q6,#6
epilog_end_16out:
- st1 {v12.16b},[x14],x6
+ st1 {v21.16b},[x14],x6
//vqrshrun.s16 d14,q7,#6
- st1 {v14.16b},[x14],x6
+ st1 {v30.16b},[x14],x6
end_loops_16out:
@@ -377,7 +377,7 @@ end_loops_16out:
// ldmeqfd sp!,{x4-x12,x15} //reload the registers from sp
bne lbl355
ldp x19, x20,[sp], #16
- pop_v_regs
+
ret
lbl355:
mov x5, #4
@@ -418,34 +418,34 @@ inner_loop_wd_4_16out:
ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)//
umlsl v0.8h, v6.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)//
- umull v8.8h, v7.8b, v23.8b
+ umull v19.8h, v7.8b, v23.8b
dup v4.2s, v7.2s[1] //src_tmp1 = vdup_lane_u32(src_tmp4, 1)//
umull v2.8h, v7.8b, v25.8b //mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)//
ld1 {v4.s}[1],[x3],x2 //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)//
- umlsl v8.8h, v6.8b, v22.8b
+ umlsl v19.8h, v6.8b, v22.8b
umlal v0.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)//
dup v5.2s, v4.2s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)//
- umlsl v8.8h, v4.8b, v24.8b
+ umlsl v19.8h, v4.8b, v24.8b
ld1 {v5.s}[1],[x3],x2 //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)//
umlsl v2.8h, v5.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)//
dup v6.2s, v5.2s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)//
- umlal v8.8h, v5.8b, v25.8b
+ umlal v19.8h, v5.8b, v25.8b
ld1 {v6.s}[1],[x3],x2 //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)//
umlal v0.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)//
dup v7.2s, v6.2s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)//
- umlal v8.8h, v6.8b, v26.8b
+ umlal v19.8h, v6.8b, v26.8b
ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)//
umlsl v2.8h, v7.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)//
dup v4.2s, v7.2s[1]
add v0.8h, v0.8h , v2.8h //mul_res1 = vaddq_u16(mul_res1, mul_res2)//
- umlsl v8.8h, v7.8b, v27.8b
+ umlsl v19.8h, v7.8b, v27.8b
ld1 {v4.s}[1],[x3],x2
- umlal v8.8h, v4.8b, v28.8b
+ umlal v19.8h, v4.8b, v28.8b
dup v5.2s, v4.2s[1]
//vqrshrun.s16 d0,q0,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
@@ -453,13 +453,13 @@ inner_loop_wd_4_16out:
add x3,x1,x6
st1 {v0.d}[0],[x1],#8 //vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)//
- umlsl v8.8h, v5.8b, v29.8b
+ umlsl v19.8h, v5.8b, v29.8b
st1 {v0.d}[1],[x3],x6 //vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)//
//vqrshrun.s16 d8,q4,#6
- st1 {v8.d}[0],[x3],x6
+ st1 {v19.d}[0],[x3],x6
//add x1,x1,#4
- st1 {v8.d}[1],[x3]
+ st1 {v19.d}[1],[x3]
bgt inner_loop_wd_4_16out
end_inner_loop_wd_4_16out:
@@ -470,7 +470,7 @@ end_inner_loop_wd_4_16out:
// ldmfd sp!, {x4-x12, x15} //reload the registers from sp
ldp x19, x20,[sp], #16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_inter_pred_luma_copy_w16out.s b/common/arm64/ihevc_inter_pred_luma_copy_w16out.s
index 86ffdba..b5498cf 100644
--- a/common/arm64/ihevc_inter_pred_luma_copy_w16out.s
+++ b/common/arm64/ihevc_inter_pred_luma_copy_w16out.s
@@ -84,7 +84,7 @@
ihevc_inter_pred_luma_copy_w16out_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
mov x15,x4 // pi1_coeff
@@ -138,7 +138,7 @@ end_inner_loop_wd_4:
end_loops:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp], #16
- pop_v_regs
+
ret
@@ -159,14 +159,14 @@ core_loop_wd_8:
prolog:
add x6,x0,x2 //pu1_src_tmp += src_strd
add x10,x1,x5
- ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
- ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
- ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
- ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
- uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
- uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
- uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
- uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
+ ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
+ uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
subs x4,x4,#8 //wd decrements by 8
shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6)
shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6)
@@ -175,10 +175,10 @@ prolog:
add x20,x0,x8
csel x0, x20, x0,le
add x6,x0,x2 //pu1_src_tmp += src_strd
- ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
- ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
- ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
- ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
+ ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
st1 {v0.8h},[x1],#16 //vst1q_s16(pi2_dst_tmp, tmp)
add x20,x1,x11,lsl #1
@@ -196,15 +196,15 @@ prolog:
outer_loop_wd_8:
st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
- uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
+ uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
- uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
- uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
- uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
subs x4,x4,#8 //wd decrements by 8
add x20,x0,x8
@@ -212,16 +212,16 @@ outer_loop_wd_8:
add x6,x0,x2 //pu1_src_tmp += src_strd
- ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
+ ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6)
- ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6)
- ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
shl v4.8h, v20.8h,#6 //vshlq_n_s16(tmp, 6)
- ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
add x10,x1,x5
shl v6.8h, v22.8h,#6 //vshlq_n_s16(tmp, 6)
@@ -238,15 +238,15 @@ outer_loop_wd_8:
epilog:
st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
- uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
+ uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
- uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
- uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
- uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
//add x6,x0,x2 //pu1_src_tmp += src_strd
shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6)
@@ -264,7 +264,7 @@ epilog_end:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp], #16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s b/common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s
index b94ec3c..7147200 100644
--- a/common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s
+++ b/common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s
@@ -114,7 +114,7 @@
ihevc_inter_pred_luma_vert_w16inp_w16out_av8:
//stmfd sp!, {r4-r12, r14} //stack stores the values of the arguments
- push_v_regs
+
stp x19,x20,[sp, #-16]!
mov x15,x4 // pi1_coeff
@@ -163,71 +163,71 @@ prolog:
ld1 {v0.4h},[x0], #8 //src_tmp1 = ld1_u8(pu1_src_tmp)//
subs x4,x4,#4
ld1 {v2.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smull v8.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
+ smull v19.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
ld1 {v3.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
- smlal v8.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+ smlal v19.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
ld1 {v4.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- smlal v8.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+ smlal v19.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
ld1 {v5.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
- smlal v8.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ smlal v19.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
ld1 {v6.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smlal v8.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ smlal v19.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
ld1 {v7.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
- smlal v8.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
- smlal v8.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
- smlal v8.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+ smlal v19.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+ smlal v19.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ smlal v19.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
ld1 {v16.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- smull v10.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
+ smull v20.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
add x20,x0,x8,lsl #0
csel x0,x20,x0,le
- smlal v10.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+ smlal v20.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
csel x4,x5,x4,le
- smlal v10.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+ smlal v20.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
ld1 {v17.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
- smlal v10.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ smlal v20.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
ld1 {v18.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smlal v10.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ smlal v20.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
add x3,x0,x2 //pu1_src_tmp += src_strd//
- smlal v10.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
- smlal v10.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
- smlal v10.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
- sub v8.4s, v8.4s, v30.4s
+ smlal v20.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+ smlal v20.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ smlal v20.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+ sub v19.4s, v19.4s, v30.4s
ld1 {v1.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smull v12.4s,v3.4h,v23.4h
+ smull v21.4s,v3.4h,v23.4h
ld1 {v0.4h},[x0],#8 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- smlal v12.4s,v2.4h,v22.4h
+ smlal v21.4s,v2.4h,v22.4h
ld1 {v2.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smlal v12.4s,v4.4h,v24.4h
- smlal v12.4s,v5.4h,v25.4h
- smlal v12.4s,v6.4h,v26.4h
- smlal v12.4s,v7.4h,v27.4h
- smlal v12.4s,v16.4h,v28.4h
- smlal v12.4s,v17.4h,v29.4h
+ smlal v21.4s,v4.4h,v24.4h
+ smlal v21.4s,v5.4h,v25.4h
+ smlal v21.4s,v6.4h,v26.4h
+ smlal v21.4s,v7.4h,v27.4h
+ smlal v21.4s,v16.4h,v28.4h
+ smlal v21.4s,v17.4h,v29.4h
add x14,x1,x6
- sub v10.4s, v10.4s, v30.4s
- shrn v8.4h, v8.4s, #6
+ sub v20.4s, v20.4s, v30.4s
+ shrn v19.4h, v19.4s, #6
//vqrshrun d8,q4,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
- smull v14.4s,v4.4h,v23.4h
- smlal v14.4s,v3.4h,v22.4h
- smlal v14.4s,v5.4h,v24.4h
- smlal v14.4s,v6.4h,v25.4h
+ smull v31.4s,v4.4h,v23.4h
+ smlal v31.4s,v3.4h,v22.4h
+ smlal v31.4s,v5.4h,v24.4h
+ smlal v31.4s,v6.4h,v25.4h
ld1 {v3.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v7.4h,v26.4h
+ smlal v31.4s,v7.4h,v26.4h
ld1 {v4.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v16.4h,v27.4h
+ smlal v31.4s,v16.4h,v27.4h
ld1 {v5.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v17.4h,v28.4h
+ smlal v31.4s,v17.4h,v28.4h
ld1 {v6.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v18.4h,v29.4h
+ smlal v31.4s,v18.4h,v29.4h
ld1 {v7.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
- st1 {v8.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)//
- sub v12.4s, v12.4s, v30.4s
- shrn v10.4h, v10.4s, #6
+ st1 {v19.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)//
+ sub v21.4s, v21.4s, v30.4s
+ shrn v20.4h, v20.4s, #6
//vqrshrun d10,q5,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
add x20, x1, x9
csel x1, x20, x1, le
@@ -240,87 +240,87 @@ prolog:
kernel_8:
- smull v8.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
+ smull v19.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
subs x4,x4,#4
- smlal v8.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+ smlal v19.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
add x20,x0,x8,lsl #0
csel x0,x20,x0,le
- smlal v8.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
- smlal v8.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
- smlal v8.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
- smlal v8.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
- smlal v8.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
- smlal v8.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
- st1 {v10.2s},[x14],x6 //st1_u8(pu1_dst_tmp,sto_res)//
-
- sub v14.4S, v14.4s, v30.4s
- shrn v12.4h, v12.4s, #6
+ smlal v19.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+ smlal v19.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ smlal v19.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ smlal v19.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+ smlal v19.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ smlal v19.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+ st1 {v20.2s},[x14],x6 //st1_u8(pu1_dst_tmp,sto_res)//
+
+ sub v31.4S, v31.4s, v30.4s
+ shrn v21.4h, v21.4s, #6
//vqrshrun d12,q6,#6
ld1 {v16.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- smull v10.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
- smlal v10.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
- smlal v10.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
- smlal v10.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
- smlal v10.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
- smlal v10.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
- st1 {v12.2s},[x14],x6
+ smull v20.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
+ smlal v20.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+ smlal v20.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+ smlal v20.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ smlal v20.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ smlal v20.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+ st1 {v21.2s},[x14],x6
- smlal v10.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ smlal v20.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
ld1 {v17.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
- smlal v10.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+ smlal v20.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
- sub v8.4s, v8.4s, v30.4s
- shrn v14.4h, v14.4s, #6
+ sub v19.4s, v19.4s, v30.4s
+ shrn v31.4h, v31.4s, #6
//vqrshrun d14,q7,#6
- smull v12.4s,v3.4h,v23.4h
+ smull v21.4s,v3.4h,v23.4h
csel x4,x5,x4,le
- smlal v12.4s,v2.4h,v22.4h
+ smlal v21.4s,v2.4h,v22.4h
ld1 {v18.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smlal v12.4s,v4.4h,v24.4h
+ smlal v21.4s,v4.4h,v24.4h
add x3,x0,x2 //pu1_src_tmp += src_strd//
- smlal v12.4s,v5.4h,v25.4h
+ smlal v21.4s,v5.4h,v25.4h
- smlal v12.4s,v6.4h,v26.4h
- st1 {v14.2s},[x14],x6
+ smlal v21.4s,v6.4h,v26.4h
+ st1 {v31.2s},[x14],x6
- smlal v12.4s,v7.4h,v27.4h
+ smlal v21.4s,v7.4h,v27.4h
ld1 {v1.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
- smlal v12.4s,v16.4h,v28.4h
+ smlal v21.4s,v16.4h,v28.4h
add x14,x1,x6
- smlal v12.4s,v17.4h,v29.4h
+ smlal v21.4s,v17.4h,v29.4h
ld1 {v0.4h},[x0],#8 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- sub v10.4s, v10.4s, v30.4s
- shrn v8.4h, v8.4s, #6
+ sub v20.4s, v20.4s, v30.4s
+ shrn v19.4h, v19.4s, #6
//vqrshrun d8,q4,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
ld1 {v2.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smull v14.4s,v4.4h,v23.4h
- smlal v14.4s,v3.4h,v22.4h
- smlal v14.4s,v5.4h,v24.4h
+ smull v31.4s,v4.4h,v23.4h
+ smlal v31.4s,v3.4h,v22.4h
+ smlal v31.4s,v5.4h,v24.4h
ld1 {v3.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v6.4h,v25.4h
+ smlal v31.4s,v6.4h,v25.4h
ld1 {v4.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v7.4h,v26.4h
+ smlal v31.4s,v7.4h,v26.4h
ld1 {v5.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v16.4h,v27.4h
+ smlal v31.4s,v16.4h,v27.4h
ld1 {v6.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v17.4h,v28.4h
+ smlal v31.4s,v17.4h,v28.4h
ld1 {v7.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
- smlal v14.4s,v18.4h,v29.4h
- st1 {v8.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)//
+ smlal v31.4s,v18.4h,v29.4h
+ st1 {v19.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)//
- sub v12.4s, v12.4s, v30.4s
- shrn v10.4h, v10.4s, #6
+ sub v21.4s, v21.4s, v30.4s
+ shrn v20.4h, v20.4s, #6
add x20, x1, x9
csel x1, x20, x1, le
@@ -331,83 +331,83 @@ kernel_8:
epilog:
- smull v8.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
- smlal v8.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
- smlal v8.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
- smlal v8.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
- smlal v8.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
- smlal v8.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
- smlal v8.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
- smlal v8.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
- st1 {v10.2s},[x14],x6
-
- sub v14.4s, v14.4s, v30.4s
- shrn v12.4h, v12.4s, #6
+ smull v19.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
+ smlal v19.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+ smlal v19.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+ smlal v19.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ smlal v19.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ smlal v19.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+ smlal v19.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ smlal v19.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+ st1 {v20.2s},[x14],x6
+
+ sub v31.4s, v31.4s, v30.4s
+ shrn v21.4h, v21.4s, #6
//vqrshrun d12,q6,#6
ld1 {v16.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
- smull v10.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
- smlal v10.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
- smlal v10.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
- smlal v10.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
- smlal v10.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
- smlal v10.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
- smlal v10.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
- smlal v10.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
- st1 {v12.2s},[x14],x6
-
- sub v8.4s, v8.4s, v30.4s
- shrn v14.4h, v14.4s, #6
+ smull v20.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
+ smlal v20.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+ smlal v20.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+ smlal v20.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ smlal v20.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ smlal v20.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+ smlal v20.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ smlal v20.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+ st1 {v21.2s},[x14],x6
+
+ sub v19.4s, v19.4s, v30.4s
+ shrn v31.4h, v31.4s, #6
//vqrshrun d14,q7,#6
ld1 {v17.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
- smull v12.4s,v3.4h,v23.4h
- smlal v12.4s,v2.4h,v22.4h
- smlal v12.4s,v4.4h,v24.4h
- smlal v12.4s,v5.4h,v25.4h
- smlal v12.4s,v6.4h,v26.4h
- smlal v12.4s,v7.4h,v27.4h
- smlal v12.4s,v16.4h,v28.4h
- smlal v12.4s,v17.4h,v29.4h
- st1 {v14.2s},[x14],x6
- sub v10.4s, v10.4s, v30.4s
- shrn v8.4h, v8.4s, #6
+ smull v21.4s,v3.4h,v23.4h
+ smlal v21.4s,v2.4h,v22.4h
+ smlal v21.4s,v4.4h,v24.4h
+ smlal v21.4s,v5.4h,v25.4h
+ smlal v21.4s,v6.4h,v26.4h
+ smlal v21.4s,v7.4h,v27.4h
+ smlal v21.4s,v16.4h,v28.4h
+ smlal v21.4s,v17.4h,v29.4h
+ st1 {v31.2s},[x14],x6
+ sub v20.4s, v20.4s, v30.4s
+ shrn v19.4h, v19.4s, #6
//vqrshrun d8,q4,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
ld1 {v18.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
- smull v14.4s,v4.4h,v23.4h
- smlal v14.4s,v3.4h,v22.4h
- smlal v14.4s,v5.4h,v24.4h
- smlal v14.4s,v6.4h,v25.4h
- smlal v14.4s,v7.4h,v26.4h
- smlal v14.4s,v16.4h,v27.4h
- smlal v14.4s,v17.4h,v28.4h
- smlal v14.4s,v18.4h,v29.4h
- sub v12.4s, v12.4s, v30.4s
- shrn v10.4h, v10.4s, #6
+ smull v31.4s,v4.4h,v23.4h
+ smlal v31.4s,v3.4h,v22.4h
+ smlal v31.4s,v5.4h,v24.4h
+ smlal v31.4s,v6.4h,v25.4h
+ smlal v31.4s,v7.4h,v26.4h
+ smlal v31.4s,v16.4h,v27.4h
+ smlal v31.4s,v17.4h,v28.4h
+ smlal v31.4s,v18.4h,v29.4h
+ sub v21.4s, v21.4s, v30.4s
+ shrn v20.4h, v20.4s, #6
//vqrshrun d10,q5,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
add x14,x1,x6
- st1 {v8.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)//
+ st1 {v19.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)//
epilog_end:
- st1 {v10.2s},[x14],x6 //st1_u8(pu1_dst_tmp,sto_res)//
- shrn v12.4h, v12.4s, #6
+ st1 {v20.2s},[x14],x6 //st1_u8(pu1_dst_tmp,sto_res)//
+ shrn v21.4h, v21.4s, #6
//vqrshrun d12,q6,#6
- st1 {v12.2s},[x14],x6
- sub v14.4s, v14.4s, v30.4s
- shrn v14.4h, v14.4s, #6
+ st1 {v21.2s},[x14],x6
+ sub v31.4s, v31.4s, v30.4s
+ shrn v31.4h, v31.4s, #6
//vqrshrun d14,q7,#6
- st1 {v14.2s},[x14],x6
+ st1 {v31.2s},[x14],x6
end_loops:
//ldmfd sp!,{r4-r12,r15} //reload the registers from sp
ldp x19, x20,[sp], #16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_intra_pred_chroma_horz.s b/common/arm64/ihevc_intra_pred_chroma_horz.s
index da41e59..8de655c 100644
--- a/common/arm64/ihevc_intra_pred_chroma_horz.s
+++ b/common/arm64/ihevc_intra_pred_chroma_horz.s
@@ -96,7 +96,7 @@
ihevc_intra_pred_chroma_horz_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
lsl x6,x4,#2 //four_nt
@@ -117,7 +117,7 @@ ihevc_intra_pred_chroma_horz_av8:
core_loop_16:
ld1 { v0.8h},[x12] //load 16 values. d1[7] will have the 1st value.
sub x12,x12,#16
- ld1 { v10.8h},[x12] //load 16 values. d1[7] will have the 1st value.
+ ld1 { v18.8h},[x12] //load 16 values. d1[7] will have the 1st value.
dup v2.8h, v0.4h[7] //duplicate the i value.
@@ -126,7 +126,7 @@ core_loop_16:
st1 { v2.8h},[x2],x3 //store in 1st row 0-16 columns
st1 { v2.8h},[x9],x3 //store in 1st row 16-32 columns
- dup v8.8h, v0.4h[4]
+ dup v1.8h, v0.4h[4]
st1 { v4.8h},[x2],x3
st1 { v4.8h},[x9],x3
@@ -135,47 +135,47 @@ core_loop_16:
st1 { v6.8h},[x9],x3
dup v4.8h, v0.4h[2]
- st1 { v8.8h},[x2],x3
- st1 { v8.8h},[x9],x3
+ st1 { v1.8h},[x2],x3
+ st1 { v1.8h},[x9],x3
dup v6.8h, v0.4h[1]
st1 { v2.8h},[x2],x3
st1 { v2.8h},[x9],x3
- dup v8.8h, v0.4h[0]
+ dup v1.8h, v0.4h[0]
st1 { v4.8h},[x2],x3
st1 { v4.8h},[x9],x3
- dup v2.8h, v10.4h[7]
+ dup v2.8h, v18.4h[7]
st1 { v6.8h},[x2],x3
st1 { v6.8h},[x9],x3
- dup v4.8h, v10.4h[6]
- st1 { v8.8h},[x2],x3
- st1 { v8.8h},[x9],x3
+ dup v4.8h, v18.4h[6]
+ st1 { v1.8h},[x2],x3
+ st1 { v1.8h},[x9],x3
- dup v6.8h, v10.4h[5]
+ dup v6.8h, v18.4h[5]
st1 { v2.8h},[x2],x3
st1 { v2.8h},[x9],x3
- dup v8.8h, v10.4h[4]
+ dup v1.8h, v18.4h[4]
st1 { v4.8h},[x2],x3
st1 { v4.8h},[x9],x3
- dup v2.8h, v10.4h[3]
+ dup v2.8h, v18.4h[3]
st1 { v6.8h},[x2],x3
st1 { v6.8h},[x9],x3
- dup v4.8h, v10.4h[2]
- st1 { v8.8h},[x2],x3
- st1 { v8.8h},[x9],x3
+ dup v4.8h, v18.4h[2]
+ st1 { v1.8h},[x2],x3
+ st1 { v1.8h},[x9],x3
- dup v6.8h, v10.4h[1]
+ dup v6.8h, v18.4h[1]
st1 { v2.8h},[x2],x3
st1 { v2.8h},[x9],x3
sub x12,x12,#16 //move to 16th value pointer
- dup v8.8h, v10.4h[0]
+ dup v1.8h, v18.4h[0]
st1 { v4.8h},[x2],x3
st1 { v4.8h},[x9],x3
@@ -183,12 +183,12 @@ core_loop_16:
st1 { v6.8h},[x2],x3
st1 { v6.8h},[x9],x3
- st1 { v8.8h},[x2],x3
- st1 { v8.8h},[x9],x3
+ st1 { v1.8h},[x2],x3
+ st1 { v1.8h},[x9],x3
bgt core_loop_16
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
b endloop
@@ -203,7 +203,7 @@ core_loop_8:
sub x12,x12,#16
// ld1 { v30.16b},[x12]
- dup v10.8h, v0.4h[7]
+ dup v18.8h, v0.4h[7]
//vmovl.u8 q13,d26
dup v2.8h, v0.4h[6]
@@ -215,18 +215,18 @@ core_loop_8:
dup v6.8h, v0.4h[4]
//vqadd.s16 q11,q13,q12
- dup v8.8h, v0.4h[3]
+ dup v1.8h, v0.4h[3]
//vqmovun.s16 d22,q11
- st1 { v10.8h},[x2],x3
+ st1 { v18.8h},[x2],x3
- dup v10.8h, v0.4h[2]
+ dup v18.8h, v0.4h[2]
//vsubl.u8 q12,d31,d28
- dup v12.8h, v0.4h[1]
+ dup v19.8h, v0.4h[1]
//vshr.s16 q12,q12,#1
- dup v14.8h, v0.4h[0]
+ dup v20.8h, v0.4h[0]
//vqadd.s16 q11,q13,q12
dup v16.8h, v0.4h[3]
@@ -238,14 +238,14 @@ core_loop_8:
st1 { v4.8h},[x2],x3
st1 { v6.8h},[x2],x3
- st1 { v8.8h},[x2],x3
- st1 { v10.8h},[x2],x3
+ st1 { v1.8h},[x2],x3
+ st1 { v18.8h},[x2],x3
//vdup.8 q1,d0[2]
- st1 { v12.8h},[x2],x3
+ st1 { v19.8h},[x2],x3
//vdup.8 q2,d0[1]
- st1 { v14.8h},[x2],x3
+ st1 { v20.8h},[x2],x3
//vdup.8 q3,d0[0]
//vst1.8 {q7},[x2],x3
@@ -269,7 +269,7 @@ core_loop_8:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
b endloop
@@ -305,11 +305,11 @@ core_loop_4:
st1 {v6.8b},[x2],x3
st1 {v3.8b},[x2],x3
- dup v8.4h, v0.4h[1]
+ dup v1.4h, v0.4h[1]
st1 {v4.8b},[x2],x3
st1 {v5.8b},[x2],x3
- dup v9.4h, v0.4h[0]
+ dup v17.4h, v0.4h[0]
//vst1.8 {d6},[x2],x3
//vst1.8 {d7},[x2],x3
@@ -317,7 +317,7 @@ core_loop_4:
//vst1.8 {d9},[x2],x3
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
b endloop
@@ -352,7 +352,7 @@ core_loop_4:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
endloop:
diff --git a/common/arm64/ihevc_intra_pred_chroma_mode_18_34.s b/common/arm64/ihevc_intra_pred_chroma_mode_18_34.s
index 52fc702..aacb35e 100644
--- a/common/arm64/ihevc_intra_pred_chroma_mode_18_34.s
+++ b/common/arm64/ihevc_intra_pred_chroma_mode_18_34.s
@@ -105,7 +105,7 @@
ihevc_intra_pred_chroma_mode_18_34_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
@@ -141,14 +141,14 @@ kernel:
st1 {v4.8b, v5.8b},[x10],x3
ld1 {v6.8b, v7.8b},[x8],x6
st1 {v6.8b, v7.8b},[x10],x3
- ld1 {v8.8b, v9.8b},[x8],x6
- st1 {v8.8b, v9.8b},[x10],x3
- ld1 {v10.8b, v11.8b},[x8],x6
- st1 {v10.8b, v11.8b},[x10],x3
- ld1 {v12.8b, v13.8b},[x8],x6
- st1 {v12.8b, v13.8b},[x10],x3
- ld1 {v14.8b, v15.8b},[x8],x6
- st1 {v14.8b, v15.8b},[x10],x3
+ ld1 {v16.8b, v17.8b},[x8],x6
+ st1 {v16.8b, v17.8b},[x10],x3
+ ld1 {v18.8b, v19.8b},[x8],x6
+ st1 {v18.8b, v19.8b},[x10],x3
+ ld1 {v20.8b, v21.8b},[x8],x6
+ st1 {v20.8b, v21.8b},[x10],x3
+ ld1 {v22.8b, v23.8b},[x8],x6
+ st1 {v22.8b, v23.8b},[x10],x3
subs x12,x12,#8
bne kernel
@@ -188,7 +188,7 @@ mode2_4:
end_func:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s b/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s
index 1df4ad0..b22d182 100644
--- a/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s
+++ b/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s
@@ -95,7 +95,10 @@
ihevc_intra_pred_chroma_mode_27_to_33_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
+ stp d9,d10,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
stp x19, x20,[sp,#-16]!
adrp x6, :got:gai4_ihevc_ang_table //loads word32 gai4_ihevc_ang_table[35]
@@ -151,7 +154,7 @@ prologue:
add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx]
asr x14,x14,#8 //(ii)shift by 8
- ld1 {v8.8b},[x10],x11 //(i row)ref_main_idx
+ ld1 {v23.8b},[x10],x11 //(i row)ref_main_idx
and x9,x14,#0xff //(ii)get the last byte
asr x14,x14,#8 //(iii)
@@ -163,7 +166,7 @@ prologue:
add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx]
ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx
- umull v10.8h, v8.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v10.8h, v23.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
ld1 {v13.8b},[x12] //(ii)ref_main_idx_1
umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
@@ -202,7 +205,7 @@ prologue:
dup v29.8b, v4.8b[5] //(vi)
add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
- ld1 {v8.8b},[x10],x11 //(v)ref_main_idx
+ ld1 {v23.8b},[x10],x11 //(v)ref_main_idx
sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract)
asr x14,x14,#8 //(vi)
@@ -224,7 +227,7 @@ prologue:
add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx]
ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx
- umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
@@ -281,7 +284,7 @@ kernel_8_rows:
dup v31.8b, v4.8b[0]
subs x4,x4,#8
- ld1 {v8.8b},[x10],x11 //(i)ref_main_idx
+ ld1 {v23.8b},[x10],x11 //(i)ref_main_idx
sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract)
and x9,x14,#0xff //(ii)
add x20,x6,#8 //increment the row value
@@ -304,7 +307,7 @@ kernel_8_rows:
add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx]
ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx
- umull v10.8h, v8.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v10.8h, v23.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
asr x14,x14,#8 //(iv)
ld1 {v13.8b},[x12] //(ii)ref_main_idx_1
@@ -362,7 +365,7 @@ kernel_8_rows:
rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx]
- ld1 {v8.8b},[x10],x11 //(v)ref_main_idx
+ ld1 {v23.8b},[x10],x11 //(v)ref_main_idx
and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
asr x14,x14,#8 //(vii)
@@ -379,7 +382,7 @@ kernel_8_rows:
add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx]
ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
- umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
and x9,x14,#0xff //(viii)
smov x14, v3.2s[0] //(i)extract idx to the r register
@@ -479,7 +482,7 @@ core_loop_4:
dup v7.8b,w4 //dup_const_32_fract
umlal v4.8h, v3.8b, v0.8b //vmull_u8(ref_main_idx_1, dup_const_fract)
- ld1 {v8.8b},[x10] //ref_main_idx
+ ld1 {v23.8b},[x10] //ref_main_idx
add x8,x8,#1
ld1 {v9.8b},[x11] //ref_main_idx_1
@@ -495,7 +498,7 @@ core_loop_4:
add x11,x10,#2 //pu1_ref_main_idx_1 += 1
dup v12.8b,w5 //dup_const_fract
- umull v10.8h, v8.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v10.8h, v23.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract)
sub x20,x5,#32
neg x4, x20
@@ -543,7 +546,9 @@ core_loop_4:
end_loops:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d9,d10,[sp],#16
ret
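
Unlike the kernels earlier in this patch, this routine still clobbers some callee-saved SIMD registers (the visible hunks keep v9, v10, v12 and v13 live, and the new prologue also covers d14/d15, presumably because the body uses them in hunks not shown here). The blanket save/restore is therefore replaced by targeted pairs rather than removed outright. Shown together, since the diff splits them across the prologue and epilogue hunks above, the added sequences are:

    stp     d9,  d10, [sp, #-16]!   // save only the callee-saved halves this
    stp     d12, d13, [sp, #-16]!   // routine still overwrites
    stp     d14, d15, [sp, #-16]!
    //      ... existing x19/x20 save, function body, x19/x20 restore ...
    ldp     d14, d15, [sp], #16     // pop in the reverse order of the pushes
    ldp     d12, d13, [sp], #16
    ldp     d9,  d10, [sp], #16

AAPCS64 only requires that d8-d15 hold their values across the call; it does not prescribe how they are spilled, so any pairing that stores and reloads the same registers is fine, and each stp/ldp fills one aligned 16-byte slot.
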
diff --git a/common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s b/common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s
index 3c8746c..bf026a3 100644
--- a/common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s
+++ b/common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s
@@ -104,7 +104,10 @@
ihevc_intra_pred_chroma_mode_3_to_9_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
+ stp d13,d14,[sp,#-16]!
+ stp d8,d15,[sp,#-16]! // Storing d15 alone via { sub sp,sp,#8; str d15,[sp] } gives a bus error.
+ // d8 is used as a dummy register and stored along with d15 using stp; d8 is not otherwise used in the function.
stp x19, x20,[sp,#-16]!
adrp x7, :got:gai4_ihevc_ang_table
@@ -157,8 +160,8 @@ prologue_8_16_32:
movi v28.8b, #32
- sqxtn v8.8b, v22.8h
- shl v8.8b, v8.8b,#1 // 2 * idx
+ sqxtn v2.8b, v22.8h
+ shl v2.8b, v2.8b,#1 // 2 * idx
and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0
movi v29.8b, #2 //contains #2 for adding to get ref_main_idx + 1
@@ -167,58 +170,58 @@ prologue_8_16_32:
dup v27.4h,w0
mov x0,#0
- movi v9.8b, #22 //row 0 to 7
+ movi v3.8b, #22 //row 0 to 7
- sub v8.8b, v8.8b , v27.8b //ref_main_idx (sub row)
- sub v8.8b, v26.8b , v8.8b //ref_main_idx (row 0)
- add v8.8b, v8.8b , v9.8b //to compensate the pu1_src idx incremented by 8
- sub v9.8b, v8.8b , v29.8b //ref_main_idx + 1 (row 0)
- tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0)
+ sub v2.8b, v2.8b , v27.8b //ref_main_idx (sub row)
+ sub v2.8b, v26.8b , v2.8b //ref_main_idx (row 0)
+ add v2.8b, v2.8b , v3.8b //to compensate the pu1_src idx incremented by 8
+ sub v3.8b, v2.8b , v29.8b //ref_main_idx + 1 (row 0)
+ tbl v25.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 0)
sub v7.8b, v28.8b , v6.8b //32-fract
- tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0)
- sub v4.8b, v8.8b , v29.8b //ref_main_idx (row 1)
- sub v5.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 1)
+ tbl v13.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 0)
+ sub v4.8b, v2.8b , v29.8b //ref_main_idx (row 1)
+ sub v5.8b, v3.8b , v29.8b //ref_main_idx + 1 (row 1)
movi v29.8b, #4
tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1)
- umull v24.8h, v12.8b, v7.8b //mul (row 0)
+ umull v24.8h, v25.8b, v7.8b //mul (row 0)
umlal v24.8h, v13.8b, v6.8b //mul (row 0)
tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1)
- sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 2)
- sub v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 2)
+ sub v2.8b, v2.8b , v29.8b //ref_main_idx (row 2)
+ sub v3.8b, v3.8b , v29.8b //ref_main_idx + 1 (row 2)
rshrn v24.8b, v24.8h,#5 //round shft (row 0)
- tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2)
+ tbl v14.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 2)
umull v22.8h, v16.8b, v7.8b //mul (row 1)
umlal v22.8h, v17.8b, v6.8b //mul (row 1)
- tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2)
+ tbl v15.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 2)
sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 3)
sub v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 3)
st1 {v24.8b},[x2], x3 //st (row 0)
rshrn v22.8b, v22.8h,#5 //round shft (row 1)
- tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
+ tbl v19.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
umull v20.8h, v14.8b, v7.8b //mul (row 2)
umlal v20.8h, v15.8b, v6.8b //mul (row 2)
- tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
- sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 4)
- sub v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 4)
+ tbl v23.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
+ sub v2.8b, v2.8b , v29.8b //ref_main_idx (row 4)
+ sub v3.8b, v3.8b , v29.8b //ref_main_idx + 1 (row 4)
st1 {v22.8b},[x2], x3 //st (row 1)
rshrn v20.8b, v20.8h,#5 //round shft (row 2)
- tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4)
- umull v18.8h, v10.8b, v7.8b //mul (row 3)
- umlal v18.8h, v11.8b, v6.8b //mul (row 3)
+ tbl v25.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 4)
+ umull v18.8h, v19.8b, v7.8b //mul (row 3)
+ umlal v18.8h, v23.8b, v6.8b //mul (row 3)
- tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4)
+ tbl v13.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 4)
sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 5)
sub v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 5)
@@ -226,36 +229,36 @@ prologue_8_16_32:
rshrn v18.8b, v18.8h,#5 //round shft (row 3)
tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5)
- umull v24.8h, v12.8b, v7.8b //mul (row 4)
+ umull v24.8h, v25.8b, v7.8b //mul (row 4)
umlal v24.8h, v13.8b, v6.8b //mul (row 4)
tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5)
- sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 6)
- sub v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 6)
+ sub v2.8b, v2.8b , v29.8b //ref_main_idx (row 6)
+ sub v3.8b, v3.8b , v29.8b //ref_main_idx + 1 (row 6)
st1 {v18.8b},[x2], x3 //st (row 3)
cmp x4,#4
beq end_func
rshrn v24.8b, v24.8h,#5 //round shft (row 4)
- tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6)
+ tbl v14.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 6)
umull v22.8h, v16.8b, v7.8b //mul (row 5)
umlal v22.8h, v17.8b, v6.8b //mul (row 5)
- tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6)
+ tbl v15.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 6)
sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 7)
sub v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 7)
st1 {v24.8b},[x2], x3 //st (row 4)
rshrn v22.8b, v22.8h,#5 //round shft (row 5)
- tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
+ tbl v19.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
umull v20.8h, v14.8b, v7.8b //mul (row 6)
umlal v20.8h, v15.8b, v6.8b //mul (row 6)
- tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
- umull v18.8h, v10.8b, v7.8b //mul (row 7)
- umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+ tbl v23.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
+ umull v18.8h, v19.8b, v7.8b //mul (row 7)
+ umlal v18.8h, v23.8b, v6.8b //mul (row 7)
st1 {v22.8b},[x2], x3 //st (row 5)
rshrn v20.8b, v20.8h,#5 //round shft (row 6)
@@ -289,11 +292,11 @@ lbl284:
csel x0, x20, x0,le
ld1 {v31.8b},[x14],#8
- smull v12.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
- xtn v10.8b, v12.8h
- sshr v12.8h, v12.8h,#5
- sqxtn v11.8b, v12.8h
- shl v11.8b, v11.8b,#1
+ smull v25.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
+ xtn v19.8b, v25.8h
+ sshr v25.8h, v25.8h,#5
+ sqxtn v23.8b, v25.8h
+ shl v23.8b, v23.8b,#1
mov x5, #0x302 //idx value for v is +1 of u
dup v27.4h,w5 //row value inc or reset accordingly
ldr w9, [x8] //loads index value
@@ -305,25 +308,25 @@ lbl284:
dup v26.8b,w9
mov x5,x2
- sub v11.8b, v11.8b , v27.8b //ref_main_idx (sub row)
+ sub v23.8b, v23.8b , v27.8b //ref_main_idx (sub row)
kernel_8_16_32:
movi v29.8b, #2 //contains #2 for adding to get ref_main_idx + 1
- sub v8.8b, v26.8b , v11.8b //ref_main_idx
- mov v26.8b, v10.8b
+ sub v2.8b, v26.8b , v23.8b //ref_main_idx
+ mov v26.8b, v19.8b
subs x11, x11, #8
sub x6, x1, x9
- tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
- add v8.8b, v8.8b , v16.8b //to compensate the pu1_src idx incremented by 8
+ tbl v19.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
+ add v2.8b, v2.8b , v16.8b //to compensate the pu1_src idx incremented by 8
umull v20.8h, v14.8b, v7.8b //mul (row 6)
- tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx - 1 (row 7)
+ tbl v23.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx - 1 (row 7)
umlal v20.8h, v15.8b, v6.8b //mul (row 6)
add x20, x0, #8
csel x0, x20, x0,le
- sub v9.8b, v8.8b , v29.8b //ref_main_idx - 2
+ sub v3.8b, v2.8b , v29.8b //ref_main_idx - 2
add x20, x8, #4
csel x8, x20, x8,gt
@@ -339,15 +342,15 @@ lbl326:
mov x9,#0x302
dup v27.4h,w9 //row value inc or reset accordingly
- sub v4.8b, v8.8b , v29.8b //ref_main_idx (row 1)
+ sub v4.8b, v2.8b , v29.8b //ref_main_idx (row 1)
- sub v5.8b, v9.8b , v29.8b //ref_main_idx - 1 (row 1)
- tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0)
+ sub v5.8b, v3.8b , v29.8b //ref_main_idx - 1 (row 1)
+ tbl v25.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 0)
movi v29.8b, #31 //contains #2 for adding to get ref_main_idx + 1
- umull v18.8h, v10.8b, v7.8b //mul (row 7)
- tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0)
- umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+ umull v18.8h, v19.8b, v7.8b //mul (row 7)
+ tbl v13.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 0)
+ umlal v18.8h, v23.8b, v6.8b //mul (row 7)
ld1 {v31.8b},[x14],#8
and v6.8b, v29.8b , v26.8b //fract values in d1/ idx values in d0
@@ -361,14 +364,14 @@ lbl326:
st1 {v22.8b},[x5], x3 //(from previous loop)st (row 5)
rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6)
- sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 2)
- tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1)
- sub v9.8b, v9.8b , v29.8b //ref_main_idx - 1 (row 2)
+ sub v2.8b, v2.8b , v29.8b //ref_main_idx (row 2)
+ tbl v19.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1)
+ sub v3.8b, v3.8b , v29.8b //ref_main_idx - 1 (row 2)
lsl x9, x9, #1
sub v7.8b, v28.8b , v6.8b //32-fract
- umull v24.8h, v12.8b, v7.8b //mul (row 0)
+ umull v24.8h, v25.8b, v7.8b //mul (row 0)
tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1)
umlal v24.8h, v13.8b, v6.8b //mul (row 0)
@@ -376,22 +379,22 @@ lbl326:
rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7)
sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 3)
- tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2)
+ tbl v14.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 2)
sub v5.8b, v5.8b , v29.8b //ref_main_idx - 1 (row 3)
- umull v22.8h, v10.8b, v7.8b //mul (row 1)
- tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2)
+ umull v22.8h, v19.8b, v7.8b //mul (row 1)
+ tbl v15.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 2)
umlal v22.8h, v17.8b, v6.8b //mul (row 1)
rshrn v24.8b, v24.8h,#5 //round shft (row 0)
st1 {v18.8b},[x5], x3 //(from previous loop)st (row 7)
- sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 4)
- tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
- sub v9.8b, v9.8b , v29.8b //ref_main_idx - 1 (row 4)
+ sub v2.8b, v2.8b , v29.8b //ref_main_idx (row 4)
+ tbl v19.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
+ sub v3.8b, v3.8b , v29.8b //ref_main_idx - 1 (row 4)
umull v20.8h, v14.8b, v7.8b //mul (row 2)
- tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
+ tbl v23.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
umlal v20.8h, v15.8b, v6.8b //mul (row 2)
add x5,x2,x3,lsl#2
@@ -402,26 +405,26 @@ lbl326:
rshrn v22.8b, v22.8h,#5 //round shft (row 1)
sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 5)
- tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4)
+ tbl v25.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 4)
sub v5.8b, v5.8b , v29.8b //ref_main_idx - 1 (row 5)
- umull v18.8h, v10.8b, v7.8b //mul (row 3)
- tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4)
- umlal v18.8h, v11.8b, v6.8b //mul (row 3)
+ umull v18.8h, v19.8b, v7.8b //mul (row 3)
+ tbl v13.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 4)
+ umlal v18.8h, v23.8b, v6.8b //mul (row 3)
st1 {v22.8b},[x2], x3 //st (row 1)
rshrn v20.8b, v20.8h,#5 //round shft (row 2)
- xtn v10.8b, v14.8h
+ xtn v19.8b, v14.8h
sshr v14.8h, v14.8h,#5
- sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 6)
+ sub v2.8b, v2.8b , v29.8b //ref_main_idx (row 6)
tbl v21.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5)
- sub v9.8b, v9.8b , v29.8b //ref_main_idx - 1 (row 6)
+ sub v3.8b, v3.8b , v29.8b //ref_main_idx - 1 (row 6)
- umull v24.8h, v12.8b, v7.8b //mul (row 4)
+ umull v24.8h, v25.8b, v7.8b //mul (row 4)
tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5)
- sqxtn v11.8b, v14.8h
+ sqxtn v23.8b, v14.8h
st1 {v20.8b},[x2], x3 //st (row 2)
umlal v24.8h, v13.8b, v6.8b //mul (row 4)
@@ -430,15 +433,15 @@ lbl326:
dup v26.8b,w9
sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 7)
- tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6)
+ tbl v14.8b, { v0.16b, v1.16b}, v2.8b //load from ref_main_idx (row 6)
sub v5.8b, v5.8b , v29.8b //ref_main_idx - 1 (row 7)
mov x6, #22 //to compensate the 2*row value
- shl v11.8b, v11.8b,#1
+ shl v23.8b, v23.8b,#1
sub x6, x6, x0, lsl #1
umull v22.8h, v21.8b, v7.8b //mul (row 5)
- tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6)
+ tbl v15.8b, { v0.16b, v1.16b}, v3.8b //load from ref_main_idx + 1 (row 6)
umlal v22.8h, v17.8b, v6.8b //mul (row 5)
st1 {v18.8b},[x2], x3 //st (row 3)
@@ -451,7 +454,7 @@ lbl326:
sub x20, x2, x4
csel x2, x20, x2,le
- sub v11.8b, v11.8b , v27.8b //ref_main_idx (add row)
+ sub v23.8b, v23.8b , v27.8b //ref_main_idx (add row)
sub x20,x2,#8
csel x2, x20, x2,le
@@ -460,17 +463,17 @@ lbl326:
bne kernel_8_16_32
epil_8_16_32:
- tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
+ tbl v19.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
umull v20.8h, v14.8b, v7.8b //mul (row 6)
- tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
+ tbl v23.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
umlal v20.8h, v15.8b, v6.8b //mul (row 6)
st1 {v24.8b},[x5], x3 //st (row 4)
rshrn v24.8b, v22.8h,#5 //round shft (row 5)
- umull v18.8h, v10.8b, v7.8b //mul (row 7)
- umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+ umull v18.8h, v19.8b, v7.8b //mul (row 7)
+ umlal v18.8h, v23.8b, v6.8b //mul (row 7)
st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5)
rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6)
@@ -481,9 +484,11 @@ epil_8_16_32:
st1 {v18.8b},[x5], x3 //st (row 7)
end_func:
- // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+ ldp d8,d15,[sp],#16 // Loading d15 alone via { ldr d15,[sp]; add sp,sp,#8 } gives a bus error.
+ // d8 is used as a dummy register and loaded along with d15 using ldp; d8 is not otherwise used in the function.
+ ldp d13,d14,[sp],#16
ret
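
The d8/d15 pairing added above is a stack-alignment workaround, as the comments note. On AArch64 the architecture can enforce (and Linux enables) alignment checking on sp-based accesses, so a load or store that uses sp as its base register faults, reported as a bus error, when sp is not 16-byte aligned; "sub sp,sp,#8; str d15,[sp]" leaves sp only 8-byte aligned and therefore traps. A minimal sketch of the pattern this file now uses, with d8 acting purely as filler:

    // sp stays 16-byte aligned because stp/ldp move it by 16 at a time
    stp     d8, d15, [sp, #-16]!    // d8 is an unused placeholder; d15 is the
                                    // callee-saved half left without a partner
    //      ... body clobbering v15 ...
    ldp     d8, d15, [sp], #16      // the value reloaded into d8 is irrelevant

Reserving a full 16-byte slot and storing the single register at a fixed offset would keep sp aligned just as well; the stp/ldp form used here has the same stack cost and needs one instruction instead of two on each side.
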
diff --git a/common/arm64/ihevc_intra_pred_chroma_planar.s b/common/arm64/ihevc_intra_pred_chroma_planar.s
index ac6b362..65c4c56 100644
--- a/common/arm64/ihevc_intra_pred_chroma_planar.s
+++ b/common/arm64/ihevc_intra_pred_chroma_planar.s
@@ -106,7 +106,11 @@
ihevc_intra_pred_chroma_planar_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d8,d14,[sp,#-16]! // Storing d14 alone via { sub sp,sp,#8; str d14,[sp] } gives a bus error.
+ // d8 is used as a dummy register and stored along with d14 using stp; d8 is not otherwise used in the function.
stp x19, x20,[sp,#-16]!
adrp x11, :got:gau1_ihevc_planar_factor //loads table of coeffs
@@ -165,13 +169,13 @@ ihevc_intra_pred_chroma_planar_av8:
mov x10,x6
tf_sz_8_16:
ld1 {v10.8b, v11.8b}, [x14],#16 //load src[2nt+1+col]
- ld1 {v8.8b},[x12],#8
- mov v9.8b, v8.8b
- zip1 v29.8b, v8.8b, v9.8b
- zip2 v9.8b, v8.8b, v9.8b
- mov v8.d[0], v29.d[0]
- sub v30.8b, v2.8b , v8.8b //[nt-1-col]
- sub v31.8b, v2.8b , v9.8b
+ ld1 {v17.8b},[x12],#8
+ mov v25.8b, v17.8b
+ zip1 v29.8b, v17.8b, v25.8b
+ zip2 v25.8b, v17.8b, v25.8b
+ mov v17.d[0], v29.d[0]
+ sub v30.8b, v2.8b , v17.8b //[nt-1-col]
+ sub v31.8b, v2.8b , v25.8b
@@ -185,7 +189,7 @@ loop_sz_8_16:
sxtw x11,w11
umlal v12.8h, v6.8b, v10.8b //(nt-1-row) * src[2nt+1+col]
dup v4.4h,w7 //src[2nt-1-row]
- umlal v12.8h, v8.8b, v1.8b //(col+1) * src[3nt+1]
+ umlal v12.8h, v17.8b, v1.8b //(col+1) * src[3nt+1]
dup v3.4h,w11 //src[2nt-1-row]
umlal v12.8h, v30.8b, v4.8b //(nt-1-col) * src[2nt-1-row]
@@ -200,14 +204,14 @@ loop_sz_8_16:
umlal v28.8h, v31.8b, v4.8b
sub v19.8b, v6.8b , v7.8b //[nt-1-row]--
- umlal v28.8h, v9.8b, v1.8b
+ umlal v28.8h, v25.8b, v1.8b
dup v4.4h,w7 //src[2nt-1-row]
umull v26.8h, v18.8b, v0.8b //(row+1) * src[nt-1]
add v12.8h, v12.8h , v16.8h //add (nt)
umlal v26.8h, v19.8b, v10.8b //(nt-1-row) * src[2nt+1+col]
sshl v12.8h, v12.8h, v14.8h //shr
- umlal v26.8h, v8.8b, v1.8b //(col+1) * src[3nt+1]
+ umlal v26.8h, v17.8b, v1.8b //(col+1) * src[3nt+1]
add v28.8h, v28.8h , v16.8h
umlal v26.8h, v30.8b, v3.8b //(nt-1-col) * src[2nt-1-row]
sshl v28.8h, v28.8h, v14.8h
@@ -220,7 +224,7 @@ loop_sz_8_16:
add v5.8b, v18.8b , v7.8b //row++ [(row+1)++]
umlal v24.8h, v19.8b, v11.8b
sub v6.8b, v19.8b , v7.8b //[nt-1-row]--
- umlal v24.8h, v9.8b, v1.8b
+ umlal v24.8h, v25.8b, v1.8b
xtn v12.8b, v12.8h
umlal v24.8h, v31.8b, v3.8b
xtn v13.8b, v28.8h
@@ -233,7 +237,7 @@ loop_sz_8_16:
sshl v26.8h, v26.8h, v14.8h //shr
umlal v22.8h, v6.8b, v10.8b //(nt-1-row) * src[2nt+1+col]
st1 {v12.2s, v13.2s}, [x2], x3
- umlal v22.8h, v8.8b, v1.8b //(col+1) * src[3nt+1]
+ umlal v22.8h, v17.8b, v1.8b //(col+1) * src[3nt+1]
add v24.8h, v24.8h , v16.8h
umlal v22.8h, v30.8b, v4.8b //(nt-1-col) * src[2nt-1-row]
sshl v24.8h, v24.8h, v14.8h
@@ -246,7 +250,7 @@ loop_sz_8_16:
ldr w11, [x6], #-2 //src[2nt-1-row] (dec to take into account row)
sxtw x11,w11
- umlal v20.8h, v9.8b, v1.8b
+ umlal v20.8h, v25.8b, v1.8b
dup v3.4h,w11 //src[2nt-1-row]
add v22.8h, v22.8h , v16.8h //add (nt)
@@ -255,7 +259,7 @@ loop_sz_8_16:
umlal v12.8h, v19.8b, v10.8b //(nt-1-row) * src[2nt+1+col]
xtn v27.8b, v24.8h
- umlal v12.8h, v8.8b, v1.8b //(col+1) * src[3nt+1]
+ umlal v12.8h, v17.8b, v1.8b //(col+1) * src[3nt+1]
sshl v22.8h, v22.8h, v14.8h //shr
umlal v12.8h, v30.8b, v3.8b //(nt-1-col) * src[2nt-1-row]
@@ -268,7 +272,7 @@ loop_sz_8_16:
add v5.8b, v18.8b , v7.8b //row++ [(row+1)++]
sub v6.8b, v19.8b , v7.8b //[nt-1-row]--
- umlal v28.8h, v9.8b, v1.8b
+ umlal v28.8h, v25.8b, v1.8b
umlal v28.8h, v31.8b, v3.8b
sshl v20.8h, v20.8h, v14.8h
@@ -319,13 +323,13 @@ loop_sz_8_16:
add x2,x2,#16
ld1 {v10.8b, v11.8b}, [x14],#16 //load src[2nt+1+col]
- ld1 {v8.8b},[x12],#8
- mov v9.8b, v8.8b
- zip1 v29.8b, v8.8b, v9.8b
- zip2 v9.8b, v8.8b, v9.8b
- mov v8.d[0], v29.d[0]
- sub v30.8b, v2.8b , v8.8b //[nt-1-col]
- sub v31.8b, v2.8b , v9.8b
+ ld1 {v17.8b},[x12],#8
+ mov v25.8b, v17.8b
+ zip1 v29.8b, v17.8b, v25.8b
+ zip2 v25.8b, v17.8b, v25.8b
+ mov v17.d[0], v29.d[0]
+ sub v30.8b, v2.8b , v17.8b //[nt-1-col]
+ sub v31.8b, v2.8b , v25.8b
beq loop_sz_8_16
@@ -333,23 +337,23 @@ loop_sz_8_16:
tf_sz_4:
ld1 {v10.8b},[x14] //load src[2nt+1+col]
- ld1 {v8.8b},[x12], x10 //load 8 coeffs [col+1]
- mov v9.8b, v8.8b
- zip1 v29.8b, v8.8b, v9.8b
- zip2 v9.8b, v8.8b, v9.8b
- mov v8.d[0], v29.d[0]
+ ld1 {v17.8b},[x12], x10 //load 8 coeffs [col+1]
+ mov v25.8b, v17.8b
+ zip1 v29.8b, v17.8b, v25.8b
+ zip2 v25.8b, v17.8b, v25.8b
+ mov v17.d[0], v29.d[0]
loop_sz_4:
//mov x10, #4 @reduce inc to #4 for 4x4
ldr w7, [x6], #-2 //src[2nt-1-row] (dec to take into account row)
sxtw x7,w7
dup v4.4h,w7 //src[2nt-1-row]
- sub v9.8b, v2.8b , v8.8b //[nt-1-col]
+ sub v25.8b, v2.8b , v17.8b //[nt-1-col]
umull v12.8h, v5.8b, v0.8b //(row+1) * src[nt-1]
umlal v12.8h, v6.8b, v10.8b //(nt-1-row) * src[2nt+1+col]
- umlal v12.8h, v8.8b, v1.8b //(col+1) * src[3nt+1]
- umlal v12.8h, v9.8b, v4.8b //(nt-1-col) * src[2nt-1-row]
+ umlal v12.8h, v17.8b, v1.8b //(col+1) * src[3nt+1]
+ umlal v12.8h, v25.8b, v4.8b //(nt-1-col) * src[2nt-1-row]
// vadd.i16 q6, q6, q8 @add (nt)
// vshl.s16 q6, q6, q7 @shr
// vmovn.i16 d12, q6
@@ -364,9 +368,12 @@ loop_sz_4:
bne loop_sz_4
end_loop:
- // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+ ldp d8,d14,[sp],#16 // Loading d14 alone via { ldr d14,[sp]; add sp,sp,#8 } gives a bus error.
+ // d8 is used as a dummy register and loaded along with d14 using ldp; d8 is not otherwise used in the function.
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
ret
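
The planar routine above applies the same trick: d10/d11 and d12/d13 pair naturally, which leaves d14 without a partner, so d8 is again stored and reloaded alongside it purely to keep a full stp/ldp pair and 16-byte stack alignment (see the sketch after the mode_3_to_9 diff above).
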
diff --git a/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s b/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
index e9f83ff..5d65e63 100644
--- a/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
+++ b/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
@@ -105,7 +105,9 @@
ihevc_intra_pred_chroma_mode_11_to_17_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
stp x19, x20,[sp,#-16]!
adrp x7, :got:gai4_ihevc_ang_table
@@ -279,8 +281,8 @@ prologue_8_16_32:
// mov x0, #32
movi v28.8b, #32
- sqxtn v8.8b, v22.8h
- shl v8.8b, v8.8b,#1 // 2 * idx
+ sqxtn v19.8b, v22.8h
+ shl v19.8b, v19.8b,#1 // 2 * idx
and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0
@@ -292,15 +294,15 @@ prologue_8_16_32:
add v27.8b, v27.8b , v29.8b
mov x0,#0
- add v8.8b, v8.8b , v27.8b //ref_main_idx (add row)
- sub v8.8b, v8.8b , v26.8b //ref_main_idx (row 0)
- add v9.8b, v8.8b , v29.8b //ref_main_idx + 1 (row 0)
- tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0)
+ add v19.8b, v19.8b , v27.8b //ref_main_idx (add row)
+ sub v19.8b, v19.8b , v26.8b //ref_main_idx (row 0)
+ add v21.8b, v19.8b , v29.8b //ref_main_idx + 1 (row 0)
+ tbl v12.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 0)
sub v7.8b, v28.8b , v6.8b //32-fract
- tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0)
- add v4.8b, v8.8b , v29.8b //ref_main_idx (row 1)
- add v5.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 1)
+ tbl v13.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 0)
+ add v4.8b, v19.8b , v29.8b //ref_main_idx (row 1)
+ add v5.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 1)
// mov x0, #4 @ 2 *(row * 2 )
movi v29.8b, #4
@@ -310,38 +312,38 @@ prologue_8_16_32:
umlal v24.8h, v13.8b, v6.8b //mul (row 0)
tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1)
- add v8.8b, v8.8b , v29.8b //ref_main_idx (row 2)
- add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 2)
+ add v19.8b, v19.8b , v29.8b //ref_main_idx (row 2)
+ add v21.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 2)
rshrn v24.8b, v24.8h,#5 //round shft (row 0)
- tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2)
+ tbl v14.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 2)
umull v22.8h, v16.8b, v7.8b //mul (row 1)
umlal v22.8h, v17.8b, v6.8b //mul (row 1)
- tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2)
+ tbl v15.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 2)
add v4.8b, v4.8b , v29.8b //ref_main_idx (row 3)
add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 3)
st1 {v24.8b},[x2], x3 //st (row 0)
rshrn v22.8b, v22.8h,#5 //round shft (row 1)
- tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
+ tbl v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
umull v20.8h, v14.8b, v7.8b //mul (row 2)
umlal v20.8h, v15.8b, v6.8b //mul (row 2)
- tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
- add v8.8b, v8.8b , v29.8b //ref_main_idx (row 4)
- add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 4)
+ tbl v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
+ add v19.8b, v19.8b , v29.8b //ref_main_idx (row 4)
+ add v21.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 4)
st1 {v22.8b},[x2], x3 //st (row 1)
rshrn v20.8b, v20.8h,#5 //round shft (row 2)
- tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4)
- umull v18.8h, v10.8b, v7.8b //mul (row 3)
- umlal v18.8h, v11.8b, v6.8b //mul (row 3)
+ tbl v12.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 4)
+ umull v18.8h, v23.8b, v7.8b //mul (row 3)
+ umlal v18.8h, v25.8b, v6.8b //mul (row 3)
- tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4)
+ tbl v13.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 4)
add v4.8b, v4.8b , v29.8b //ref_main_idx (row 5)
add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 5)
@@ -353,32 +355,32 @@ prologue_8_16_32:
umlal v24.8h, v13.8b, v6.8b //mul (row 4)
tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5)
- add v8.8b, v8.8b , v29.8b //ref_main_idx (row 6)
- add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 6)
+ add v19.8b, v19.8b , v29.8b //ref_main_idx (row 6)
+ add v21.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 6)
st1 {v18.8b},[x2], x3 //st (row 3)
cmp x4,#4
beq end_func
rshrn v24.8b, v24.8h,#5 //round shft (row 4)
- tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6)
+ tbl v14.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 6)
umull v22.8h, v16.8b, v7.8b //mul (row 5)
umlal v22.8h, v17.8b, v6.8b //mul (row 5)
- tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6)
+ tbl v15.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 6)
add v4.8b, v4.8b , v29.8b //ref_main_idx (row 7)
add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 7)
st1 {v24.8b},[x2], x3 //st (row 4)
rshrn v22.8b, v22.8h,#5 //round shft (row 5)
- tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
+ tbl v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
umull v20.8h, v14.8b, v7.8b //mul (row 6)
umlal v20.8h, v15.8b, v6.8b //mul (row 6)
- tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
- umull v18.8h, v10.8b, v7.8b //mul (row 7)
- umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+ tbl v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
+ umull v18.8h, v23.8b, v7.8b //mul (row 7)
+ umlal v18.8h, v25.8b, v6.8b //mul (row 7)
st1 {v22.8b},[x2], x3 //st (row 5)
rshrn v20.8b, v20.8h,#5 //round shft (row 6)
@@ -413,10 +415,10 @@ lbl400:
ld1 {v31.8b},[x14],#8
smull v12.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
- xtn v10.8b, v12.8h
+ xtn v23.8b, v12.8h
sshr v12.8h, v12.8h,#5
- sqxtn v11.8b, v12.8h
- shl v11.8b, v11.8b,#1
+ sqxtn v25.8b, v12.8h
+ shl v25.8b, v25.8b,#1
orr x5,x0,x0, lsl#8
add x5, x5,#0x002
add x5, x5,#0x300
@@ -427,7 +429,7 @@ lbl400:
add x9, x9, x0, lsl #1
// sub x9, x9, #1
dup v26.8b,w9
- add v8.8b, v27.8b , v11.8b //ref_main_idx (add row)
+ add v19.8b, v27.8b , v25.8b //ref_main_idx (add row)
mov x5,x2
// sub x4,x4,#8
@@ -435,16 +437,16 @@ lbl400:
kernel_8_16_32:
movi v29.8b, #2 //contains #2 for adding to get ref_main_idx + 1
- sub v8.8b, v8.8b , v26.8b //ref_main_idx
- mov v26.8b, v10.8b
+ sub v19.8b, v19.8b , v26.8b //ref_main_idx
+ mov v26.8b, v23.8b
subs x11, x11, #8
add x6, x1, x9
- tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
- add v9.8b, v29.8b , v8.8b //ref_main_idx + 1
+ tbl v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
+ add v21.8b, v29.8b , v19.8b //ref_main_idx + 1
umull v20.8h, v14.8b, v7.8b //mul (row 6)
- tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
+ tbl v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
umlal v20.8h, v15.8b, v6.8b //mul (row 6)
add x20, x0, #8
@@ -468,15 +470,15 @@ kernel_8_16_32:
ldr x14, [x14, #:got_lo12:col_for_intra_chroma]
lbl452:
- add v4.8b, v29.8b , v8.8b //ref_main_idx (row 1)
- tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0)
- add v5.8b, v29.8b , v9.8b //ref_main_idx + 1 (row 1)
+ add v4.8b, v29.8b , v19.8b //ref_main_idx (row 1)
+ tbl v12.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 0)
+ add v5.8b, v29.8b , v21.8b //ref_main_idx + 1 (row 1)
movi v29.8b, #31 //contains #2 for adding to get ref_main_idx + 1
- umull v18.8h, v10.8b, v7.8b //mul (row 7)
- tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0)
- umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+ umull v18.8h, v23.8b, v7.8b //mul (row 7)
+ tbl v13.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 0)
+ umlal v18.8h, v25.8b, v6.8b //mul (row 7)
ld1 {v31.8b},[x14],#8
and v6.8b, v29.8b , v26.8b //fract values in d1/ idx values in d0
@@ -486,9 +488,9 @@ lbl452:
st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5)
rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6)
- add v8.8b, v29.8b , v8.8b //ref_main_idx (row 2)
+ add v19.8b, v29.8b , v19.8b //ref_main_idx (row 2)
tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1)
- add v9.8b, v29.8b , v9.8b //ref_main_idx + 1 (row 2)
+ add v21.8b, v29.8b , v21.8b //ref_main_idx + 1 (row 2)
lsl x20, x4, #1
csel x11,x20,x11,le
@@ -505,22 +507,22 @@ lbl452:
rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7)
add v4.8b, v4.8b , v29.8b //ref_main_idx (row 3)
- tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2)
+ tbl v14.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 2)
add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 3)
umull v22.8h, v16.8b, v7.8b //mul (row 1)
- tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2)
+ tbl v15.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 2)
umlal v22.8h, v17.8b, v6.8b //mul (row 1)
rshrn v24.8b, v24.8h,#5 //round shft (row 0)
st1 {v18.8b},[x5], x3 //(from previous loop)st (row 7)
- add v8.8b, v8.8b , v29.8b //ref_main_idx (row 4)
- tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
- add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 4)
+ add v19.8b, v19.8b , v29.8b //ref_main_idx (row 4)
+ tbl v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
+ add v21.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 4)
umull v20.8h, v14.8b, v7.8b //mul (row 2)
- tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
+ tbl v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
umlal v20.8h, v15.8b, v6.8b //mul (row 2)
smull v14.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
@@ -532,22 +534,22 @@ lbl452:
rshrn v22.8b, v22.8h,#5 //round shft (row 1)
add v4.8b, v4.8b , v29.8b //ref_main_idx (row 5)
- tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4)
+ tbl v12.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 4)
add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 5)
- umull v18.8h, v10.8b, v7.8b //mul (row 3)
- tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4)
- umlal v18.8h, v11.8b, v6.8b //mul (row 3)
+ umull v18.8h, v23.8b, v7.8b //mul (row 3)
+ tbl v13.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 4)
+ umlal v18.8h, v25.8b, v6.8b //mul (row 3)
st1 {v22.8b},[x2], x3 //st (row 1)
rshrn v20.8b, v20.8h,#5 //round shft (row 2)
- xtn v10.8b, v14.8h
+ xtn v23.8b, v14.8h
sshr v14.8h, v14.8h,#5
- add v8.8b, v8.8b , v29.8b //ref_main_idx (row 6)
+ add v19.8b, v19.8b , v29.8b //ref_main_idx (row 6)
tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5)
- add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 6)
+ add v21.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 6)
umull v24.8h, v12.8b, v7.8b //mul (row 4)
tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5)
@@ -557,19 +559,19 @@ lbl452:
rshrn v18.8b, v18.8h,#5 //round shft (row 3)
// sub x9, x9, #1
- sqxtn v11.8b, v14.8h
+ sqxtn v25.8b, v14.8h
add v4.8b, v4.8b , v29.8b //ref_main_idx (row 7)
- tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6)
+ tbl v14.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 6)
add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 7)
- shl v11.8b, v11.8b,#1
+ shl v25.8b, v25.8b,#1
umull v22.8h, v16.8b, v7.8b //mul (row 5)
- tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6)
+ tbl v15.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 6)
umlal v22.8h, v17.8b, v6.8b //mul (row 5)
- add v8.8b, v27.8b , v11.8b //ref_main_idx (add row)
+ add v19.8b, v27.8b , v25.8b //ref_main_idx (add row)
dup v26.8b,w9
st1 {v18.8b},[x2], x3 //st (row 3)
@@ -589,17 +591,17 @@ lbl452:
bne kernel_8_16_32
epil_8_16_32:
- tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
+ tbl v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
umull v20.8h, v14.8b, v7.8b //mul (row 6)
- tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
+ tbl v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
umlal v20.8h, v15.8b, v6.8b //mul (row 6)
st1 {v24.8b},[x5], x3 //st (row 4)
rshrn v24.8b, v22.8h,#5 //round shft (row 5)
- umull v18.8h, v10.8b, v7.8b //mul (row 7)
- umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+ umull v18.8h, v23.8b, v7.8b //mul (row 7)
+ umlal v18.8h, v25.8b, v6.8b //mul (row 7)
st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5)
rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6)
@@ -613,7 +615,8 @@ end_func:
add sp, sp, #132
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
ret
diff --git a/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s b/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
index 3af2da7..261c591 100644
--- a/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
+++ b/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
@@ -102,8 +102,11 @@
ihevc_intra_pred_chroma_mode_19_to_25_av8:
- // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+
+ stp d12,d13,[sp,#-16]!
+ stp d8,d14,[sp,#-16]! // Storing d14 using { sub sp,sp,#8; str d14,[sp] } gives a bus error.
+ // d8 is used as a dummy register and stored along with d14 using stp; d8 is not otherwise used in the function.
stp x19, x20,[sp,#-16]!
adrp x7, :got:gai4_ihevc_ang_table
@@ -264,10 +267,10 @@ prologue:
add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx]
- ld1 {v8.8b},[x10],x11 //(i row)ref_main_idx
+ ld1 {v7.8b},[x10],x11 //(i row)ref_main_idx
sbfx x9,x14,#8,#8
- ld1 {v9.8b},[x10] //(i row)ref_main_idx_1
+ ld1 {v19.8b},[x10] //(i row)ref_main_idx_1
add x12,x8,x9 //(ii)*pu1_ref[ref_main_idx]
sbfx x9,x14,#16,#8
@@ -275,10 +278,10 @@ prologue:
add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx]
ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx
- umull v10.8h, v8.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v23.8h, v7.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
ld1 {v13.8b},[x12] //(ii)ref_main_idx_1
- umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
+ umlal v23.8h, v19.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
dup v27.8b, v4.8b[2] //(iii)
sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract)
@@ -292,7 +295,7 @@ prologue:
umlal v14.8h, v13.8b, v29.8b //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
ld1 {v17.8b},[x10] //(iii)ref_main_idx_1
- rshrn v10.8b, v10.8h,#5 //(i row)shift_res = vrshrn_n_u16(add_res, 5)
+ rshrn v23.8b, v23.8h,#5 //(i row)shift_res = vrshrn_n_u16(add_res, 5)
ld1 {v20.8b},[x12],x11 //(iv)ref_main_idx
sub v26.8b, v1.8b , v27.8b //(iii)32-fract(dup_const_32_fract)
@@ -306,20 +309,20 @@ prologue:
umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
// lsl x14,x14,#1
- st1 {v10.8b},[x2],#8 //(i row)
+ st1 {v23.8b},[x2],#8 //(i row)
rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5)
sbfx x9,x14,#0,#8
dup v29.8b, v4.8b[5] //(vi)
add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
- ld1 {v8.8b},[x10],x11 //(v)ref_main_idx
+ ld1 {v7.8b},[x10],x11 //(v)ref_main_idx
sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract)
umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
sbfx x9,x14,#8,#8
- ld1 {v9.8b},[x10] //(v)ref_main_idx_1
+ ld1 {v19.8b},[x10] //(v)ref_main_idx_1
umlal v22.8h, v21.8b, v25.8b //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
st1 {v14.8b},[x0],x3 //(ii)
@@ -333,10 +336,10 @@ prologue:
add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx]
ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx
- umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v23.8h, v7.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
- umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+ umlal v23.8h, v19.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
st1 {v18.8b},[x0],x3 //(iii)
rshrn v22.8b, v22.8h,#5 //(iv)shift_res = vrshrn_n_u16(add_res, 5)
@@ -358,7 +361,7 @@ prologue:
cmp x4,#8 // go to end if 4x4
beq end_loops
- rshrn v10.8b, v10.8h,#5 //(v)shift_res = vrshrn_n_u16(add_res, 5)
+ rshrn v23.8b, v23.8h,#5 //(v)shift_res = vrshrn_n_u16(add_res, 5)
ld1 {v20.8b},[x12],x11 //(viii)ref_main_idx
sub v26.8b, v1.8b , v27.8b //(vii)32-fract(dup_const_32_fract)
@@ -372,7 +375,7 @@ prologue:
sub x20,x4,#8
csel x4, x20, x4,gt
- st1 {v10.8b},[x0],x3 //(v)
+ st1 {v23.8b},[x0],x3 //(v)
rshrn v14.8b, v14.8h,#5 //(vi)shift_res = vrshrn_n_u16(add_res, 5)
beq epilogue
@@ -393,14 +396,14 @@ kernel_8_rows:
subs x4,x4,#8
sbfx x9,x14,#8,#8
- ld1 {v8.8b},[x10],x11 //(i)ref_main_idx
+ ld1 {v7.8b},[x10],x11 //(i)ref_main_idx
sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract)
add x20,x6,#8 //increment the row value
csel x6, x20, x6,le
add x12,x8,x9 //(ii)*pu1_ref[ref_main_idx]
- ld1 {v9.8b},[x10] //(i)ref_main_idx_1
+ ld1 {v19.8b},[x10] //(i)ref_main_idx_1
umull v22.8h, v20.8b, v24.8b //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
ld1 {v5.8b},[x6] //loads the row value
@@ -417,10 +420,10 @@ kernel_8_rows:
add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx]
ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx
- umull v10.8h, v8.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v23.8h, v7.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
ld1 {v13.8b},[x12] //(ii)ref_main_idx_1
- umlal v10.8h, v9.8b, v31.8b //(i)vmull_u8(ref_main_idx_1, dup_const_fract)
+ umlal v23.8h, v19.8b, v31.8b //(i)vmull_u8(ref_main_idx_1, dup_const_fract)
sbfx x9,x14,#24,#8
csel x4, x5, x4,le //reload nt
@@ -439,7 +442,7 @@ kernel_8_rows:
umlal v14.8h, v13.8b, v29.8b //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
ld1 {v17.8b},[x10] //(iii)ref_main_idx_1
- rshrn v10.8b, v10.8h,#5 //(i)shift_res = vrshrn_n_u16(add_res, 5)
+ rshrn v23.8b, v23.8h,#5 //(i)shift_res = vrshrn_n_u16(add_res, 5)
dup v25.8b, v4.8b[3] //(iv)
smull v2.8h, v5.8b, v0.8b //pos = ((row + 1) * intra_pred_ang)
@@ -463,7 +466,7 @@ kernel_8_rows:
add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
sbfx x9,x14,#8,#8
- st1 {v10.8b},[x2],#8 //(i)
+ st1 {v23.8b},[x2],#8 //(i)
sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract)
dup v29.8b, v4.8b[5] //(vi)
@@ -478,10 +481,10 @@ kernel_8_rows:
dup v25.8b, v4.8b[7] //(viii)
rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
- ld1 {v8.8b},[x10],x11 //(v)ref_main_idx
+ ld1 {v7.8b},[x10],x11 //(v)ref_main_idx
and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
- ld1 {v9.8b},[x10] //(v)ref_main_idx_1
+ ld1 {v19.8b},[x10] //(v)ref_main_idx_1
shrn v3.8b, v2.8h,#5 //idx = pos >> 5
st1 {v14.8b},[x0],x3 //(ii)
@@ -496,10 +499,10 @@ kernel_8_rows:
shl v3.8b, v3.8b,#1
ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
- umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v23.8h, v7.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
smov x14, v3.2s[0] //(i)extract idx to the r register
- umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+ umlal v23.8h, v19.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
add x12,x8,x9 //(viii)*pu1_ref[ref_main_idx]
csel x8, x1, x8,le //reload the source to pu1_src+2nt
@@ -514,7 +517,7 @@ kernel_8_rows:
umlal v14.8h, v13.8b, v29.8b //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
ld1 {v20.8b},[x12],x11 //(viii)ref_main_idx
- rshrn v10.8b, v10.8h,#5 //(v)shift_res = vrshrn_n_u16(add_res, 5)
+ rshrn v23.8b, v23.8h,#5 //(v)shift_res = vrshrn_n_u16(add_res, 5)
ld1 {v21.8b},[x12] //(viii)ref_main_idx_1
sub v26.8b, v1.8b , v27.8b //(vii)32-fract(dup_const_32_fract)
@@ -529,7 +532,7 @@ kernel_8_rows:
st1 {v22.8b},[x0],x3 //(iv)
umull v18.8h, v16.8b, v26.8b //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
- st1 {v10.8b},[x0],x3 //(v)
+ st1 {v23.8b},[x0],x3 //(v)
umlal v18.8h, v17.8b, v27.8b //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
add x20,x2,x12 //increment the dst pointer to 8*dst_strd - nt
@@ -563,9 +566,11 @@ core_loop_4:
end_loops:
add sp, sp, #132
- // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+ ldp d8,d14,[sp],#16 // Loading d14 using { ldr d14,[sp]; add sp,sp,#8 } gives a bus error.
+ // d8 is used as a dummy register and loaded along with d14 using ldp; d8 is not otherwise used in the function.
+ ldp d12,d13,[sp],#16
ret
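
(Editorial note, not part of the patch.) The d8/d14 pairing in the hunks above works around the AArch64 sp-alignment check, which is enabled on typical Android/Linux kernels and requires sp to be 16-byte aligned whenever it is used as a base register; pushing a lone 8-byte d register with { sub sp,sp,#8; str d14,[sp] } misaligns sp and faults, which is the bus error the comments describe. A minimal sketch of the pattern, assuming a function that only needs to preserve d14 (the label is illustrative, not from this patch):

save_d14_sketch:
    stp d8, d14, [sp,#-16]!    // d8 is only a filler; the 16-byte pre-decrement keeps sp aligned
    // ... body that clobbers d14 ...
    ldp d8, d14, [sp], #16     // matching 16-byte post-increment restore
    ret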
diff --git a/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s b/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s
index 1502ad6..66f4699 100644
--- a/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s
+++ b/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s
@@ -105,7 +105,9 @@
ihevc_intra_pred_luma_mode_11_to_17_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
stp x19, x20,[sp,#-16]!
adrp x7, :got:gai4_ihevc_ang_table
@@ -287,60 +289,60 @@ prologue_8_16_32:
mov x0, #32
dup v28.8b,w0
- sqxtn v8.8b, v22.8h
+ sqxtn v19.8b, v22.8h
and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0
mov x0, #1
dup v27.8b,w0 //row value inc or reset accordingly
- add v8.8b, v8.8b , v27.8b //ref_main_idx (add row)
- sub v8.8b, v8.8b , v26.8b //ref_main_idx (row 0)
- add v9.8b, v8.8b , v2.8b //ref_main_idx + 1 (row 0)
- tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0)
+ add v19.8b, v19.8b , v27.8b //ref_main_idx (add row)
+ sub v19.8b, v19.8b , v26.8b //ref_main_idx (row 0)
+ add v21.8b, v19.8b , v2.8b //ref_main_idx + 1 (row 0)
+ tbl v12.8b, {v0.16b},v19.8b //load from ref_main_idx (row 0)
sub v7.8b, v28.8b , v6.8b //32-fract
- tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0)
- add v4.8b, v8.8b , v2.8b //ref_main_idx (row 1)
- add v5.8b, v9.8b , v2.8b //ref_main_idx + 1 (row 1)
+ tbl v13.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 0)
+ add v4.8b, v19.8b , v2.8b //ref_main_idx (row 1)
+ add v5.8b, v21.8b , v2.8b //ref_main_idx + 1 (row 1)
tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1)
umull v24.8h, v12.8b, v7.8b //mul (row 0)
umlal v24.8h, v13.8b, v6.8b //mul (row 0)
tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 1)
- add v8.8b, v8.8b , v3.8b //ref_main_idx (row 2)
- add v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 2)
+ add v19.8b, v19.8b , v3.8b //ref_main_idx (row 2)
+ add v21.8b, v21.8b , v3.8b //ref_main_idx + 1 (row 2)
rshrn v24.8b, v24.8h,#5 //round shft (row 0)
- tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2)
+ tbl v14.8b, {v0.16b},v19.8b //load from ref_main_idx (row 2)
umull v22.8h, v16.8b, v7.8b //mul (row 1)
umlal v22.8h, v17.8b, v6.8b //mul (row 1)
- tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2)
+ tbl v15.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 2)
add v4.8b, v4.8b , v3.8b //ref_main_idx (row 3)
add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 3)
st1 {v24.8b},[x2], x3 //st (row 0)
rshrn v22.8b, v22.8h,#5 //round shft (row 1)
- tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3)
+ tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3)
umull v20.8h, v14.8b, v7.8b //mul (row 2)
umlal v20.8h, v15.8b, v6.8b //mul (row 2)
- tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3)
- add v8.8b, v8.8b , v3.8b //ref_main_idx (row 4)
- add v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 4)
+ tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3)
+ add v19.8b, v19.8b , v3.8b //ref_main_idx (row 4)
+ add v21.8b, v21.8b , v3.8b //ref_main_idx + 1 (row 4)
st1 {v22.8b},[x2], x3 //st (row 1)
rshrn v20.8b, v20.8h,#5 //round shft (row 2)
- tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 4)
- umull v18.8h, v10.8b, v7.8b //mul (row 3)
- umlal v18.8h, v11.8b, v6.8b //mul (row 3)
+ tbl v12.8b, {v0.16b},v19.8b //load from ref_main_idx (row 4)
+ umull v18.8h, v23.8b, v7.8b //mul (row 3)
+ umlal v18.8h, v25.8b, v6.8b //mul (row 3)
- tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 4)
+ tbl v13.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 4)
add v4.8b, v4.8b , v3.8b //ref_main_idx (row 5)
add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 5)
@@ -352,30 +354,30 @@ prologue_8_16_32:
umlal v24.8h, v13.8b, v6.8b //mul (row 4)
tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 5)
- add v8.8b, v8.8b , v3.8b //ref_main_idx (row 6)
- add v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 6)
+ add v19.8b, v19.8b , v3.8b //ref_main_idx (row 6)
+ add v21.8b, v21.8b , v3.8b //ref_main_idx + 1 (row 6)
st1 {v18.8b},[x2], x3 //st (row 3)
rshrn v24.8b, v24.8h,#5 //round shft (row 4)
- tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 6)
+ tbl v14.8b, {v0.16b},v19.8b //load from ref_main_idx (row 6)
umull v22.8h, v16.8b, v7.8b //mul (row 5)
umlal v22.8h, v17.8b, v6.8b //mul (row 5)
- tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 6)
+ tbl v15.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 6)
add v4.8b, v4.8b , v3.8b //ref_main_idx (row 7)
add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 7)
st1 {v24.8b},[x2], x3 //st (row 4)
rshrn v22.8b, v22.8h,#5 //round shft (row 5)
- tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7)
+ tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7)
umull v20.8h, v14.8b, v7.8b //mul (row 6)
umlal v20.8h, v15.8b, v6.8b //mul (row 6)
- tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7)
- umull v18.8h, v10.8b, v7.8b //mul (row 7)
- umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+ tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7)
+ umull v18.8h, v23.8b, v7.8b //mul (row 7)
+ umlal v18.8h, v25.8b, v6.8b //mul (row 7)
st1 {v22.8b},[x2], x3 //st (row 5)
rshrn v20.8b, v20.8h,#5 //round shft (row 6)
@@ -410,31 +412,31 @@ lbl390:
mov x5,x2
ld1 {v31.8b},[x14],#8
smull v12.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
- xtn v10.8b, v12.8h
+ xtn v23.8b, v12.8h
sshr v12.8h, v12.8h,#5
- sqxtn v11.8b, v12.8h
+ sqxtn v25.8b, v12.8h
dup v27.8b,w0 //row value inc or reset accordingly
ldr w9, [x8]
sxtw x9,w9
add x9, x0, x9
sub x9, x9, #1
dup v26.8b,w9
- add v8.8b, v27.8b , v11.8b //ref_main_idx (add row)
+ add v19.8b, v27.8b , v25.8b //ref_main_idx (add row)
sub x4,x4,#8
kernel_8_16_32:
- sub v8.8b, v8.8b , v26.8b //ref_main_idx
- mov v26.8b, v10.8b
+ sub v19.8b, v19.8b , v26.8b //ref_main_idx
+ mov v26.8b, v23.8b
subs x11, x11, #8
add x6, x1, x9
- tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7)
- add v9.8b, v2.8b , v8.8b //ref_main_idx + 1
+ tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7)
+ add v21.8b, v2.8b , v19.8b //ref_main_idx + 1
umull v20.8h, v14.8b, v7.8b //mul (row 6)
- tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7)
+ tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7)
umlal v20.8h, v15.8b, v6.8b //mul (row 6)
add x20, x0, #8
@@ -453,14 +455,14 @@ lbl429:
csel x8, x12, x8,le
dup v27.8b,w0 //row value inc or reset accordingly
- add v4.8b, v2.8b , v8.8b //ref_main_idx (row 1)
- tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0)
- add v5.8b, v2.8b , v9.8b //ref_main_idx + 1 (row 1)
+ add v4.8b, v2.8b , v19.8b //ref_main_idx (row 1)
+ tbl v12.8b, {v0.16b},v19.8b //load from ref_main_idx (row 0)
+ add v5.8b, v2.8b , v21.8b //ref_main_idx + 1 (row 1)
- umull v18.8h, v10.8b, v7.8b //mul (row 7)
- tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0)
- umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+ umull v18.8h, v23.8b, v7.8b //mul (row 7)
+ tbl v13.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 0)
+ umlal v18.8h, v25.8b, v6.8b //mul (row 7)
ld1 {v31.8b},[x14],#8
and v6.8b, v29.8b , v26.8b //fract values in d1/ idx values in d0
@@ -468,9 +470,9 @@ lbl429:
st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5)
rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6)
- add v8.8b, v3.8b , v8.8b //ref_main_idx (row 2)
+ add v19.8b, v3.8b , v19.8b //ref_main_idx (row 2)
tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1)
- add v9.8b, v3.8b , v9.8b //ref_main_idx + 1 (row 2)
+ add v21.8b, v3.8b , v21.8b //ref_main_idx + 1 (row 2)
add x20, x4, #8
csel x11, x20, x11,le
@@ -486,22 +488,22 @@ lbl429:
rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7)
add v4.8b, v4.8b , v3.8b //ref_main_idx (row 3)
- tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2)
+ tbl v14.8b, {v0.16b},v19.8b //load from ref_main_idx (row 2)
add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 3)
umull v22.8h, v16.8b, v7.8b //mul (row 1)
- tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2)
+ tbl v15.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 2)
umlal v22.8h, v17.8b, v6.8b //mul (row 1)
rshrn v24.8b, v24.8h,#5 //round shft (row 0)
st1 {v18.8b},[x5], x3 //(from previous loop)st (row 7)
- add v8.8b, v8.8b , v3.8b //ref_main_idx (row 4)
- tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3)
- add v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 4)
+ add v19.8b, v19.8b , v3.8b //ref_main_idx (row 4)
+ tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3)
+ add v21.8b, v21.8b , v3.8b //ref_main_idx + 1 (row 4)
umull v20.8h, v14.8b, v7.8b //mul (row 2)
- tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3)
+ tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3)
umlal v20.8h, v15.8b, v6.8b //mul (row 2)
smull v14.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
@@ -513,22 +515,22 @@ lbl429:
rshrn v22.8b, v22.8h,#5 //round shft (row 1)
add v4.8b, v4.8b , v3.8b //ref_main_idx (row 5)
- tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 4)
+ tbl v12.8b, {v0.16b},v19.8b //load from ref_main_idx (row 4)
add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 5)
- umull v18.8h, v10.8b, v7.8b //mul (row 3)
- tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 4)
- umlal v18.8h, v11.8b, v6.8b //mul (row 3)
+ umull v18.8h, v23.8b, v7.8b //mul (row 3)
+ tbl v13.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 4)
+ umlal v18.8h, v25.8b, v6.8b //mul (row 3)
st1 {v22.8b},[x2], x3 //st (row 1)
rshrn v20.8b, v20.8h,#5 //round shft (row 2)
- xtn v10.8b, v14.8h
+ xtn v23.8b, v14.8h
sshr v14.8h, v14.8h,#5
- add v8.8b, v8.8b , v3.8b //ref_main_idx (row 6)
+ add v19.8b, v19.8b , v3.8b //ref_main_idx (row 6)
tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 5)
- add v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 6)
+ add v21.8b, v21.8b , v3.8b //ref_main_idx + 1 (row 6)
umull v24.8h, v12.8b, v7.8b //mul (row 4)
tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 5)
@@ -538,17 +540,17 @@ lbl429:
rshrn v18.8b, v18.8h,#5 //round shft (row 3)
sub x9, x9, #1
- sqxtn v11.8b, v14.8h
+ sqxtn v25.8b, v14.8h
add v4.8b, v4.8b , v3.8b //ref_main_idx (row 7)
- tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 6)
+ tbl v14.8b, {v0.16b},v19.8b //load from ref_main_idx (row 6)
add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 7)
umull v22.8h, v16.8b, v7.8b //mul (row 5)
- tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 6)
+ tbl v15.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 6)
umlal v22.8h, v17.8b, v6.8b //mul (row 5)
- add v8.8b, v27.8b , v11.8b //ref_main_idx (add row)
+ add v19.8b, v27.8b , v25.8b //ref_main_idx (add row)
dup v26.8b,w9
st1 {v18.8b},[x2], x3 //st (row 3)
@@ -566,17 +568,17 @@ lbl429:
bne kernel_8_16_32
epil_8_16_32:
- tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7)
+ tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7)
umull v20.8h, v14.8b, v7.8b //mul (row 6)
- tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7)
+ tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7)
umlal v20.8h, v15.8b, v6.8b //mul (row 6)
st1 {v24.8b},[x5], x3 //st (row 4)
rshrn v24.8b, v22.8h,#5 //round shft (row 5)
- umull v18.8h, v10.8b, v7.8b //mul (row 7)
- umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+ umull v18.8h, v23.8b, v7.8b //mul (row 7)
+ umlal v18.8h, v25.8b, v6.8b //mul (row 7)
st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5)
rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6)
@@ -628,38 +630,38 @@ sz_4_proc:
dup v28.8b,w1
sshr v22.8h, v22.8h,#5
- sqxtn v8.8b, v22.8h
+ sqxtn v19.8b, v22.8h
and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0
sub v7.8b, v28.8b , v6.8b //32-fract
- add v8.8b, v8.8b , v2.8b //ref_main_idx (add 1)
- sub v8.8b, v8.8b , v26.8b //ref_main_idx
- add v9.8b, v8.8b , v2.8b //ref_main_idx + 1
+ add v19.8b, v19.8b , v2.8b //ref_main_idx (add 1)
+ sub v19.8b, v19.8b , v26.8b //ref_main_idx
+ add v21.8b, v19.8b , v2.8b //ref_main_idx + 1
- add v4.8b, v8.8b , v2.8b //row 1 ref_main_idx
- add v5.8b, v9.8b , v2.8b
+ add v4.8b, v19.8b , v2.8b //row 1 ref_main_idx
+ add v5.8b, v21.8b , v2.8b
- tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0)
- tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0)
+ tbl v12.8b, {v0.16b},v19.8b //load from ref_main_idx (row 0)
+ tbl v13.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 0)
umull v24.8h, v12.8b, v7.8b //mul (row 0)
tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1)
umlal v24.8h, v13.8b, v6.8b //mul (row 0)
- add v8.8b, v8.8b , v3.8b //idx (row 2)
+ add v19.8b, v19.8b , v3.8b //idx (row 2)
tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 1)
- add v9.8b, v9.8b , v3.8b //idx+1 (row 2)
+ add v21.8b, v21.8b , v3.8b //idx+1 (row 2)
umull v22.8h, v16.8b, v7.8b //mul (row 1)
- tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2)
+ tbl v12.8b, {v0.16b},v19.8b //load from ref_main_idx (row 2)
umlal v22.8h, v17.8b, v6.8b //mul (row 1)
rshrn v24.8b, v24.8h,#5 //round shift (row 0)
add v4.8b, v4.8b , v3.8b //idx (row 3)
- tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2)
+ tbl v13.8b, {v0.16b},v21.8b //load from ref_main_idx + 1 (row 2)
add v5.8b, v5.8b , v3.8b //idx+1 (row 3)
umull v20.8h, v12.8b, v7.8b //mul (row 2)
@@ -687,7 +689,8 @@ end_func:
add sp, sp, #132
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
ret
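
(Background note, not part of the patch.) Under the AArch64 procedure-call standard only the low 64 bits of v8-v15 (d8-d15) must be preserved by a callee; v0-v7 and v16-v31 are caller-saved. That is why the hunks above rename scratch vectors such as v8-v11 to v19/v21/v23/v25: once a function no longer touches v8-v15, or touches only a few of them, the blanket push_v_regs/pop_v_regs pair can be dropped or replaced by one or two stp/ldp pairs. A minimal sketch, assuming a leaf routine whose temporaries fit entirely in caller-saved registers (the label is illustrative):

scaled_copy_sketch:                // hypothetical leaf function
    ld1   {v0.8b}, [x0]            // v0-v7 are caller-saved: no save needed
    umull v20.8h, v0.8b, v0.8b     // scratch kept in v16-v31, also caller-saved
    st1   {v20.8h}, [x1]
    ret                            // stack never touched for SIMD registers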
diff --git a/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s b/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
index fe7ac11..9b59d58 100644
--- a/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
+++ b/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
@@ -104,7 +104,10 @@
ihevc_intra_pred_luma_mode_19_to_25_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
+ stp d9,d10,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
stp x19, x20,[sp,#-16]!
adrp x7, :got:gai4_ihevc_ang_table
@@ -267,7 +270,7 @@ prologue:
add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx]
- ld1 {v8.8b},[x10],x11 //(i row)ref_main_idx
+ ld1 {v23.8b},[x10],x11 //(i row)ref_main_idx
sbfx x9,x14,#8,#8
ld1 {v9.8b},[x10] //(i row)ref_main_idx_1
@@ -278,7 +281,7 @@ prologue:
add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx]
ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx
- umull v10.8h, v8.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v10.8h, v23.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
ld1 {v13.8b},[x12] //(ii)ref_main_idx_1
umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
@@ -316,7 +319,7 @@ prologue:
dup v29.8b, v4.8b[5] //(vi)
add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
- ld1 {v8.8b},[x10],x11 //(v)ref_main_idx
+ ld1 {v23.8b},[x10],x11 //(v)ref_main_idx
sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract)
umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
@@ -336,7 +339,7 @@ prologue:
add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx]
ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx
- umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
@@ -392,7 +395,7 @@ kernel_8_rows:
subs x4,x4,#8
sbfx x9,x14,#8,#8
- ld1 {v8.8b},[x10],x11 //(i)ref_main_idx
+ ld1 {v23.8b},[x10],x11 //(i)ref_main_idx
sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract)
add x20,x6,#8 //increment the row value
@@ -416,7 +419,7 @@ kernel_8_rows:
add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx]
ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx
- umull v10.8h, v8.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v10.8h, v23.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
ld1 {v13.8b},[x12] //(ii)ref_main_idx_1
umlal v10.8h, v9.8b, v31.8b //(i)vmull_u8(ref_main_idx_1, dup_const_fract)
@@ -477,7 +480,7 @@ kernel_8_rows:
dup v25.8b, v4.8b[7] //(viii)
rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
- ld1 {v8.8b},[x10],x11 //(v)ref_main_idx
+ ld1 {v23.8b},[x10],x11 //(v)ref_main_idx
and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
ld1 {v9.8b},[x10] //(v)ref_main_idx_1
@@ -493,7 +496,7 @@ kernel_8_rows:
sub v30.8b, v1.8b , v31.8b //(v)32-fract(dup_const_32_fract)
ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
- umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
umov w14, v3.2s[0] //(i)extract idx to the r register
sxtw x14,w14
@@ -592,7 +595,7 @@ core_loop_4:
dup v7.8b,w4 //dup_const_32_fract
umlal v4.8h, v3.8b, v0.8b //vmull_u8(ref_main_idx_1, dup_const_fract)
- ld1 {v8.s}[0],[x10] //ref_main_idx
+ ld1 {v23.s}[0],[x10] //ref_main_idx
add x8,x8,#1
ld1 {v9.s}[0],[x11] //ref_main_idx_1
@@ -607,7 +610,7 @@ core_loop_4:
add x11,x10,#1 //pu1_ref_main_idx_1 += 1
dup v12.8b,w5 //dup_const_fract
- umull v10.8h, v8.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v10.8h, v23.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract)
sub x20,x5,#32
neg x4, x20
@@ -655,7 +658,9 @@ end_loops:
add sp, sp, #132
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d9,d10,[sp],#16
ret
diff --git a/common/arm64/ihevc_intra_pred_luma_dc.s b/common/arm64/ihevc_intra_pred_luma_dc.s
index 7683266..e4fdb5d 100644
--- a/common/arm64/ihevc_intra_pred_luma_dc.s
+++ b/common/arm64/ihevc_intra_pred_luma_dc.s
@@ -104,7 +104,7 @@
ihevc_intra_pred_luma_dc_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
@@ -128,14 +128,14 @@ ihevc_intra_pred_luma_dc_av8:
add x8, x7, #1 //&src[2nt+1]
mvn x5, x5
add x5, x5, #1
- dup v8.2s,w5
+ dup v7.2s,w5
ldrb w14, [x8]
sxtw x14,w14
- shl d8, d8,#32
+ shl d7, d7,#32
sub x9, x7, #1 //&src[2nt-1]
- sshr d8, d8,#32
+ sshr d7, d7,#32
mov x7, x8 //x7 also stores 2nt+1
@@ -192,7 +192,7 @@ core_loop_add:
epil_add_loop:
- sshl d9, d6, d8 //(dc_val) shr by log2nt+1
+ sshl d18, d6, d7 //(dc_val) shr by log2nt+1
cmp x4, #32
mov v28.s[0], w14
@@ -200,25 +200,25 @@ epil_add_loop:
mov x20,#128
csel x6, x20, x6,eq
- dup v16.8b, v9.8b[0] //dc_val
- shl d13, d9,#1 //2*dc
+ dup v16.8b, v18.8b[0] //dc_val
+ shl d25, d18,#1 //2*dc
beq prologue_cpy_32
- add d14, d13 , d28 //src[2nt+1]+2+src[2nt-1]+2dc_val
+ add d27, d25 , d28 //src[2nt+1]+2+src[2nt-1]+2dc_val
mov x20,#0
csel x6, x20, x6,ne //nt
- ushr v15.4h, v14.4h,#2 //final dst[0]'s value in d15[0]
+ ushr v29.4h, v27.4h,#2 //final dst[0]'s value in d29[0]
csel x10, x4, x10,ne
- add d11, d13 , d9 //3*dc
+ add d23, d25 , d18 //3*dc
sub x12, x3, x3, lsl #3 //-7*strd
- add d11, d11 , d17 //3*dc + 2
+ add d23, d23 , d17 //3*dc + 2
add x12, x12, #8 //offset after one 8x8 block (-7*strd + 8)
- dup v24.8h, v11.4h[0] //3*dc + 2 (moved to all lanes)
+ dup v24.8h, v23.4h[0] //3*dc + 2 (moved to all lanes)
sub x0, x3, x4 //strd - nt
prologue_col:
@@ -248,7 +248,7 @@ prologue_col:
movi d19, #0x00000000000000ff //
sqshrun v3.8b, v22.8h,#2 //rows shx2 movn (prol)
- bsl v19.8b, v15.8b , v2.8b //first row with dst[0]
+ bsl v19.8b, v29.8b , v2.8b //first row with dst[0]
add v26.8h, v26.8h , v24.8h //col 8::15 add 3dc+2 (prol extra)
rev64 v3.8b, v3.8b
@@ -445,23 +445,23 @@ dc_4:
mov v28.s[1], w5 //src[2nt+1]+2+src[2nt-1] moved to d28
add d6, d6 , d5 //accumulate all inp into d6 (end for nt==8)
- sshl d9, d6, d8 //(dc_val) shr by log2nt+1
+ sshl d18, d6, d7 //(dc_val) shr by log2nt+1
mov x8, x7 //&src[2nt+1]
- shl d13, d9,#1 //2*dc
+ shl d25, d18,#1 //2*dc
sub x9, x9, #3 //&src[2nt-1-row]
- dup v16.8b, v9.8b[0] //dc_val
- add d14, d13 , d28 //src[2nt+1]+2+src[2nt-1]+2dc_val
+ dup v16.8b, v18.8b[0] //dc_val
+ add d27, d25 , d28 //src[2nt+1]+2+src[2nt-1]+2dc_val
- ushr v15.4h, v14.4h,#2 //final dst[0]'s value in d15[0]
+ ushr v29.4h, v27.4h,#2 //final dst[0]'s value in d29[0]
sub x12, x3, x3, lsl #2 //-3*strd
- add d11, d13 , d9 //3*dc
+ add d23, d25 , d18 //3*dc
- add d11, d11 , d17 //3*dc + 2
+ add d23, d23 , d17 //3*dc + 2
add x12, x12, #4 //offset after one 4x4 block (-3*strd + 4)
- dup v24.8h, v11.4h[0] //3*dc + 2 (moved to all lanes)
+ dup v24.8h, v23.4h[0] //3*dc + 2 (moved to all lanes)
sub x0, x3, x4 //strd - nt
@@ -482,7 +482,7 @@ dc_4:
sqshrun v3.8b, v22.8h,#2 //rows shx2 movn (prol)
- bsl v19.8b, v15.8b , v2.8b //first row with dst[0]
+ bsl v19.8b, v29.8b , v2.8b //first row with dst[0]
rev64 v3.8b, v3.8b
@@ -510,7 +510,7 @@ epilogue_end:
end_func:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_intra_pred_luma_horz.s b/common/arm64/ihevc_intra_pred_luma_horz.s
index 551fd77..95452e4 100644
--- a/common/arm64/ihevc_intra_pred_luma_horz.s
+++ b/common/arm64/ihevc_intra_pred_luma_horz.s
@@ -97,7 +97,7 @@
ihevc_intra_pred_luma_horz_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
//ldr x5,[sp,#44] @loads mode
@@ -126,7 +126,7 @@ core_loop_32:
st1 { v2.16b},[x2],x3 //store in 1st row 0-16 columns
st1 { v2.16b},[x9],x3 //store in 1st row 16-32 columns
- dup v8.16b, v0.16b[12]
+ dup v1.16b, v0.16b[12]
st1 { v4.16b},[x2],x3
st1 { v4.16b},[x9],x3
@@ -135,14 +135,14 @@ core_loop_32:
st1 { v6.16b},[x9],x3
dup v4.16b, v0.16b[10]
- st1 { v8.16b},[x2],x3
- st1 { v8.16b},[x9],x3
+ st1 { v1.16b},[x2],x3
+ st1 { v1.16b},[x9],x3
dup v6.16b, v0.16b[9]
st1 { v2.16b},[x2],x3
st1 { v2.16b},[x9],x3
- dup v8.16b, v0.16b[8]
+ dup v1.16b, v0.16b[8]
st1 { v4.16b},[x2],x3
st1 { v4.16b},[x9],x3
@@ -151,14 +151,14 @@ core_loop_32:
st1 { v6.16b},[x9],x3
dup v4.16b, v0.8b[6]
- st1 { v8.16b},[x2],x3
- st1 { v8.16b},[x9],x3
+ st1 { v1.16b},[x2],x3
+ st1 { v1.16b},[x9],x3
dup v6.16b, v0.8b[5]
st1 { v2.16b},[x2],x3
st1 { v2.16b},[x9],x3
- dup v8.16b, v0.8b[4]
+ dup v1.16b, v0.8b[4]
st1 { v4.16b},[x2],x3
st1 { v4.16b},[x9],x3
@@ -167,15 +167,15 @@ core_loop_32:
st1 { v6.16b},[x9],x3
dup v4.16b, v0.8b[2]
- st1 { v8.16b},[x2],x3
- st1 { v8.16b},[x9],x3
+ st1 { v1.16b},[x2],x3
+ st1 { v1.16b},[x9],x3
dup v6.16b, v0.8b[1]
st1 { v2.16b},[x2],x3
st1 { v2.16b},[x9],x3
sub x12,x12,#16 //move to 16th value pointer
- dup v8.16b, v0.8b[0]
+ dup v1.16b, v0.8b[0]
st1 { v4.16b},[x2],x3
st1 { v4.16b},[x9],x3
@@ -183,12 +183,12 @@ core_loop_32:
st1 { v6.16b},[x2],x3
st1 { v6.16b},[x9],x3
- st1 { v8.16b},[x2],x3
- st1 { v8.16b},[x9],x3
+ st1 { v1.16b},[x2],x3
+ st1 { v1.16b},[x9],x3
bgt core_loop_32
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
b end_func
@@ -214,18 +214,18 @@ core_loop_16:
dup v6.16b, v0.16b[12]
sqadd v22.8h, v26.8h , v24.8h
- dup v8.16b, v0.16b[11]
+ dup v1.16b, v0.16b[11]
sqxtun v22.8b, v22.8h
st1 {v22.8b},[x2],#8
- dup v10.16b, v0.16b[10]
+ dup v18.16b, v0.16b[10]
usubl v24.8h, v31.8b, v28.8b
- dup v12.16b, v0.16b[9]
+ dup v19.16b, v0.16b[9]
sshr v24.8h, v24.8h,#1
- dup v14.16b, v0.16b[8]
+ dup v20.16b, v0.16b[8]
sqadd v22.8h, v26.8h , v24.8h
dup v16.16b, v0.8b[7]
@@ -238,37 +238,37 @@ core_loop_16:
st1 { v4.16b},[x2],x3
st1 { v6.16b},[x2],x3
- st1 { v8.16b},[x2],x3
+ st1 { v1.16b},[x2],x3
dup v2.16b, v0.8b[6]
- st1 { v10.16b},[x2],x3
+ st1 { v18.16b},[x2],x3
dup v4.16b, v0.8b[5]
- st1 { v12.16b},[x2],x3
+ st1 { v19.16b},[x2],x3
dup v6.16b, v0.8b[4]
- st1 { v14.16b},[x2],x3
+ st1 { v20.16b},[x2],x3
- dup v8.16b, v0.8b[3]
+ dup v1.16b, v0.8b[3]
st1 { v16.16b},[x2],x3
- dup v10.16b, v0.8b[2]
+ dup v18.16b, v0.8b[2]
st1 { v2.16b},[x2],x3
- dup v12.16b, v0.8b[1]
+ dup v19.16b, v0.8b[1]
st1 { v4.16b},[x2],x3
- dup v14.16b, v0.8b[0]
+ dup v20.16b, v0.8b[0]
st1 { v6.16b},[x2],x3
- st1 { v8.16b},[x2],x3
- st1 { v10.16b},[x2],x3
- st1 { v12.16b},[x2],x3
- st1 { v14.16b},[x2],x3
+ st1 { v1.16b},[x2],x3
+ st1 { v18.16b},[x2],x3
+ st1 { v19.16b},[x2],x3
+ st1 { v20.16b},[x2],x3
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
b end_func
@@ -302,19 +302,19 @@ core_loop_8:
st1 {v22.8b},[x2],x3
st1 {v3.8b},[x2],x3
- dup v8.8b, v0.8b[1]
+ dup v1.8b, v0.8b[1]
st1 {v4.8b},[x2],x3
st1 {v5.8b},[x2],x3
- dup v9.8b, v0.8b[0]
+ dup v17.8b, v0.8b[0]
st1 {v6.8b},[x2],x3
st1 {v7.8b},[x2],x3
- st1 {v8.8b},[x2],x3
- st1 {v9.8b},[x2],x3
+ st1 {v1.8b},[x2],x3
+ st1 {v17.8b},[x2],x3
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
b end_func
@@ -349,7 +349,7 @@ core_loop_4:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
end_func:
diff --git a/common/arm64/ihevc_intra_pred_luma_mode2.s b/common/arm64/ihevc_intra_pred_luma_mode2.s
index 5d7a3c5..598ce5a 100644
--- a/common/arm64/ihevc_intra_pred_luma_mode2.s
+++ b/common/arm64/ihevc_intra_pred_luma_mode2.s
@@ -105,7 +105,7 @@
ihevc_intra_pred_luma_mode2_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
mov x8,#-2
@@ -138,20 +138,20 @@ prologue_cpy_32:
ld1 {v7.8b},[x10],x8
add x7,x6,x3
- rev64 v8.8b, v0.8b
- rev64 v9.8b, v1.8b
+ rev64 v16.8b, v0.8b
+ rev64 v17.8b, v1.8b
lsl x5, x3, #2
- rev64 v10.8b, v2.8b
- rev64 v11.8b, v3.8b
+ rev64 v18.8b, v2.8b
+ rev64 v19.8b, v3.8b
add x9,x7,x3
- rev64 v12.8b, v4.8b
+ rev64 v20.8b, v4.8b
subs x1,x1,#8
- rev64 v13.8b, v5.8b
- rev64 v14.8b, v6.8b
- rev64 v15.8b, v7.8b
+ rev64 v21.8b, v5.8b
+ rev64 v22.8b, v6.8b
+ rev64 v23.8b, v7.8b
add x14,x9,x3
beq epilogue_mode2
@@ -160,24 +160,24 @@ prologue_cpy_32:
kernel_mode2:
- st1 {v8.8b},[x6],x5
- st1 {v9.8b},[x7],x5
+ st1 {v16.8b},[x6],x5
+ st1 {v17.8b},[x7],x5
subs x11,x11,#8
- st1 {v10.8b},[x9],x5
+ st1 {v18.8b},[x9],x5
add x20,x2,#8
csel x2, x20, x2,gt
- st1 {v11.8b},[x14],x5
- st1 {v12.8b},[x6],x5
+ st1 {v19.8b},[x14],x5
+ st1 {v20.8b},[x6],x5
csel x11, x4, x11,le
- st1 {v13.8b},[x7],x5
- st1 {v14.8b},[x9],x5
+ st1 {v21.8b},[x7],x5
+ st1 {v22.8b},[x9],x5
add x20, x2, x3, lsl #2
csel x2, x20, x2,le
- st1 {v15.8b},[x14],x5
+ st1 {v23.8b},[x14],x5
ld1 {v0.8b},[x0],x8
sub x14,x4,#8
@@ -201,42 +201,42 @@ kernel_mode2:
add x20, x0, x4
csel x0, x20, x0,le
- rev64 v8.8b, v0.8b
+ rev64 v16.8b, v0.8b
add x7, x6, x3
- rev64 v9.8b, v1.8b
+ rev64 v17.8b, v1.8b
sub x20, x0, #8
csel x0, x20, x0,le
- rev64 v10.8b, v2.8b
+ rev64 v18.8b, v2.8b
csel x12, x4, x12,le
- rev64 v11.8b, v3.8b
+ rev64 v19.8b, v3.8b
add x9, x7, x3
- rev64 v12.8b, v4.8b
+ rev64 v20.8b, v4.8b
add x10,x0,#-1
- rev64 v13.8b, v5.8b
+ rev64 v21.8b, v5.8b
subs x1, x1, #8
- rev64 v14.8b, v6.8b
+ rev64 v22.8b, v6.8b
add x14, x9, x3
- rev64 v15.8b, v7.8b
+ rev64 v23.8b, v7.8b
bne kernel_mode2
epilogue_mode2:
- st1 {v8.8b},[x6],x5
- st1 {v9.8b},[x7],x5
- st1 {v10.8b},[x9],x5
- st1 {v11.8b},[x14],x5
- st1 {v12.8b},[x6],x5
- st1 {v13.8b},[x7],x5
- st1 {v14.8b},[x9],x5
- st1 {v15.8b},[x14],x5
+ st1 {v16.8b},[x6],x5
+ st1 {v17.8b},[x7],x5
+ st1 {v18.8b},[x9],x5
+ st1 {v19.8b},[x14],x5
+ st1 {v20.8b},[x6],x5
+ st1 {v21.8b},[x7],x5
+ st1 {v22.8b},[x9],x5
+ st1 {v23.8b},[x14],x5
b end_func
@@ -269,7 +269,7 @@ mode2_4:
end_func:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s b/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s
index 79964f7..58b2d37 100644
--- a/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s
+++ b/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s
@@ -100,7 +100,10 @@
ihevc_intra_pred_luma_mode_27_to_33_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
+ stp d9,d10,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
stp x19, x20,[sp,#-16]!
adrp x6, :got:gai4_ihevc_ang_table //loads word32 gai4_ihevc_ang_table[35]
@@ -156,7 +159,7 @@ prologue:
add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx]
asr x14,x14,#8 //(ii)shift by 8
- ld1 {v8.8b},[x10],x11 //(i row)ref_main_idx
+ ld1 {v23.8b},[x10],x11 //(i row)ref_main_idx
and x9,x14,#0xff //(ii)get the last byte
asr x14,x14,#8 //(iii)
@@ -168,7 +171,7 @@ prologue:
add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx]
ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx
- umull v10.8h, v8.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v10.8h, v23.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
ld1 {v13.8b},[x12] //(ii)ref_main_idx_1
umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
@@ -207,7 +210,7 @@ prologue:
dup v29.8b, v4.8b[5] //(vi)
add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
- ld1 {v8.8b},[x10],x11 //(v)ref_main_idx
+ ld1 {v23.8b},[x10],x11 //(v)ref_main_idx
sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract)
asr x14,x14,#8 //(vi)
@@ -229,7 +232,7 @@ prologue:
add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx]
ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx
- umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
@@ -286,7 +289,7 @@ kernel_8_rows:
dup v31.8b, v4.8b[0]
subs x4,x4,#8
- ld1 {v8.8b},[x10],x11 //(i)ref_main_idx
+ ld1 {v23.8b},[x10],x11 //(i)ref_main_idx
sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract)
and x9,x14,#0xff //(ii)
add x20,x6,#8 //increment the row value
@@ -309,7 +312,7 @@ kernel_8_rows:
add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx]
ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx
- umull v10.8h, v8.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v10.8h, v23.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
asr x14,x14,#8 //(iv)
ld1 {v13.8b},[x12] //(ii)ref_main_idx_1
@@ -368,7 +371,7 @@ kernel_8_rows:
rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
asr x14,x14,#8 //(vii)
- ld1 {v8.8b},[x10],x11 //(v)ref_main_idx
+ ld1 {v23.8b},[x10],x11 //(v)ref_main_idx
and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
and x9,x14,#0xff //(vii)
@@ -385,7 +388,7 @@ kernel_8_rows:
and x9,x14,#0xff //(viii)
ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
- umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
umov w14, v3.2s[0] //(i)extract idx to the r register
sxtw x14,w14
@@ -484,7 +487,7 @@ core_loop_4:
dup v7.8b,w4 //dup_const_32_fract
umlal v4.8h, v3.8b, v0.8b //vmull_u8(ref_main_idx_1, dup_const_fract)
- ld1 {v8.s}[0],[x10] //ref_main_idx
+ ld1 {v23.s}[0],[x10] //ref_main_idx
add x8,x8,#1
ld1 {v9.s}[0],[x11] //ref_main_idx_1
@@ -500,7 +503,7 @@ core_loop_4:
add x11,x10,#1 //pu1_ref_main_idx_1 += 1
dup v12.8b,w5 //dup_const_fract
- umull v10.8h, v8.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v10.8h, v23.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract)
sub x20,x5,#32
neg x4, x20
@@ -548,7 +551,9 @@ core_loop_4:
end_loops:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d9,d10,[sp],#16
ret
diff --git a/common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s b/common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s
index b6e8601..56d2f6b 100644
--- a/common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s
+++ b/common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s
@@ -106,7 +106,9 @@
ihevc_intra_pred_luma_mode_3_to_9_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
stp x19, x20,[sp,#-16]!
adrp x7, :got:gai4_ihevc_ang_table
@@ -165,7 +167,7 @@ prologue_8_16_32:
movi v28.8b, #32
- sqxtn v8.8b, v22.8h
+ sqxtn v1.8b, v22.8h
and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0
@@ -173,54 +175,54 @@ prologue_8_16_32:
movi v27.8b, #7 //row 0 to 7
- sub v8.8b, v8.8b , v2.8b //ref_main_idx (sub row)
- sub v8.8b, v26.8b , v8.8b //ref_main_idx (row 0)
- add v8.8b, v8.8b , v27.8b //t0 compensate the pu1_src idx incremented by 8
- sub v9.8b, v8.8b , v2.8b //ref_main_idx + 1 (row 0)
- tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0)
+ sub v1.8b, v1.8b , v2.8b //ref_main_idx (sub row)
+ sub v1.8b, v26.8b , v1.8b //ref_main_idx (row 0)
+ add v1.8b, v1.8b , v27.8b //to compensate the pu1_src idx incremented by 8
+ sub v19.8b, v1.8b , v2.8b //ref_main_idx + 1 (row 0)
+ tbl v12.8b, {v0.16b},v1.8b //load from ref_main_idx (row 0)
sub v7.8b, v28.8b , v6.8b //32-fract
- tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0)
- sub v4.8b, v8.8b , v2.8b //ref_main_idx (row 1)
- sub v5.8b, v9.8b , v2.8b //ref_main_idx + 1 (row 1)
+ tbl v13.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 0)
+ sub v4.8b, v1.8b , v2.8b //ref_main_idx (row 1)
+ sub v5.8b, v19.8b , v2.8b //ref_main_idx + 1 (row 1)
tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1)
umull v24.8h, v12.8b, v7.8b //mul (row 0)
umlal v24.8h, v13.8b, v6.8b //mul (row 0)
tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 1)
- sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 2)
- sub v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 2)
+ sub v1.8b, v1.8b , v3.8b //ref_main_idx (row 2)
+ sub v19.8b, v19.8b , v3.8b //ref_main_idx + 1 (row 2)
rshrn v24.8b, v24.8h,#5 //round shft (row 0)
- tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2)
+ tbl v14.8b, {v0.16b},v1.8b //load from ref_main_idx (row 2)
umull v22.8h, v16.8b, v7.8b //mul (row 1)
umlal v22.8h, v17.8b, v6.8b //mul (row 1)
- tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2)
+ tbl v15.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 2)
sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 3)
sub v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 3)
st1 {v24.8b},[x2], x3 //st (row 0)
rshrn v22.8b, v22.8h,#5 //round shft (row 1)
- tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3)
+ tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3)
umull v20.8h, v14.8b, v7.8b //mul (row 2)
umlal v20.8h, v15.8b, v6.8b //mul (row 2)
- tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3)
- sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 4)
- sub v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 4)
+ tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3)
+ sub v1.8b, v1.8b , v3.8b //ref_main_idx (row 4)
+ sub v19.8b, v19.8b , v3.8b //ref_main_idx + 1 (row 4)
st1 {v22.8b},[x2], x3 //st (row 1)
rshrn v20.8b, v20.8h,#5 //round shft (row 2)
- tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 4)
- umull v18.8h, v10.8b, v7.8b //mul (row 3)
- umlal v18.8h, v11.8b, v6.8b //mul (row 3)
+ tbl v12.8b, {v0.16b},v1.8b //load from ref_main_idx (row 4)
+ umull v18.8h, v23.8b, v7.8b //mul (row 3)
+ umlal v18.8h, v25.8b, v6.8b //mul (row 3)
- tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 4)
+ tbl v13.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 4)
sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 5)
sub v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 5)
@@ -232,30 +234,30 @@ prologue_8_16_32:
umlal v24.8h, v13.8b, v6.8b //mul (row 4)
tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 5)
- sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 6)
- sub v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 6)
+ sub v1.8b, v1.8b , v3.8b //ref_main_idx (row 6)
+ sub v19.8b, v19.8b , v3.8b //ref_main_idx + 1 (row 6)
st1 {v18.8b},[x2], x3 //st (row 3)
rshrn v24.8b, v24.8h,#5 //round shft (row 4)
- tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 6)
+ tbl v14.8b, {v0.16b},v1.8b //load from ref_main_idx (row 6)
umull v22.8h, v16.8b, v7.8b //mul (row 5)
umlal v22.8h, v17.8b, v6.8b //mul (row 5)
- tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 6)
+ tbl v15.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 6)
sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 7)
sub v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 7)
st1 {v24.8b},[x2], x3 //st (row 4)
rshrn v22.8b, v22.8h,#5 //round shft (row 5)
- tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7)
+ tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7)
umull v20.8h, v14.8b, v7.8b //mul (row 6)
umlal v20.8h, v15.8b, v6.8b //mul (row 6)
- tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7)
- umull v18.8h, v10.8b, v7.8b //mul (row 7)
- umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+ tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7)
+ umull v18.8h, v23.8b, v7.8b //mul (row 7)
+ umlal v18.8h, v25.8b, v6.8b //mul (row 7)
st1 {v22.8b},[x2], x3 //st (row 5)
rshrn v20.8b, v20.8h,#5 //round shft (row 6)
@@ -290,9 +292,9 @@ lbl284:
mov x5,x2
ld1 {v31.8b},[x14],#8
smull v12.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
- xtn v10.8b, v12.8h
+ xtn v23.8b, v12.8h
sshr v12.8h, v12.8h,#5
- sqxtn v11.8b, v12.8h
+ sqxtn v25.8b, v12.8h
ldr w9, [x8]
sxtw x9,w9
add x9, x0, x9
@@ -304,19 +306,19 @@ lbl284:
kernel_8_16_32:
- sub v8.8b, v26.8b , v11.8b //ref_main_idx
- mov v26.8b, v10.8b
+ sub v1.8b, v26.8b , v25.8b //ref_main_idx
+ mov v26.8b, v23.8b
subs x11, x11, #8
sub x6, x1, x9
- tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7)
- add v8.8b, v8.8b , v16.8b //to compensate the pu1_src idx incremented by 8
+ tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7)
+ add v1.8b, v1.8b , v16.8b //to compensate the pu1_src idx incremented by 8
umull v20.8h, v14.8b, v7.8b //mul (row 6)
- tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx - 1 (row 7)
+ tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx - 1 (row 7)
umlal v20.8h, v15.8b, v6.8b //mul (row 6)
- sub v9.8b, v8.8b , v2.8b //ref_main_idx - 1
+ sub v19.8b, v1.8b , v2.8b //ref_main_idx - 1
add x20, x0, #8
csel x0, x20, x0,le
add x20, x8, #4
@@ -333,14 +335,14 @@ lbl323:
csel x8, x12, x8,le
dup v27.8b,w0 //row value inc or reset accordingly
- sub v4.8b, v8.8b , v2.8b //ref_main_idx (row 1)
- tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0)
- sub v5.8b, v9.8b , v2.8b //ref_main_idx - 1 (row 1)
+ sub v4.8b, v1.8b , v2.8b //ref_main_idx (row 1)
+ tbl v12.8b, {v0.16b},v1.8b //load from ref_main_idx (row 0)
+ sub v5.8b, v19.8b , v2.8b //ref_main_idx - 1 (row 1)
- umull v18.8h, v10.8b, v7.8b //mul (row 7)
- tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0)
- umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+ umull v18.8h, v23.8b, v7.8b //mul (row 7)
+ tbl v13.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 0)
+ umlal v18.8h, v25.8b, v6.8b //mul (row 7)
ld1 {v31.8b},[x14],#8
and v6.8b, v29.8b , v26.8b //fract values in d1/ idx values in d0
@@ -348,9 +350,9 @@ lbl323:
st1 {v22.8b},[x5], x3 //(from previous loop)st (row 5)
rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6)
- sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 2)
- tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1)
- sub v9.8b, v9.8b , v3.8b //ref_main_idx - 1 (row 2)
+ sub v1.8b, v1.8b , v3.8b //ref_main_idx (row 2)
+ tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1)
+ sub v19.8b, v19.8b , v3.8b //ref_main_idx - 1 (row 2)
add x20, x4, #8
csel x11, x20, x11,le
@@ -366,22 +368,22 @@ lbl323:
rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7)
sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 3)
- tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2)
+ tbl v14.8b, {v0.16b},v1.8b //load from ref_main_idx (row 2)
sub v5.8b, v5.8b , v3.8b //ref_main_idx - 1 (row 3)
- umull v22.8h, v10.8b, v7.8b //mul (row 1)
- tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2)
+ umull v22.8h, v23.8b, v7.8b //mul (row 1)
+ tbl v15.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 2)
umlal v22.8h, v17.8b, v6.8b //mul (row 1)
rshrn v24.8b, v24.8h,#5 //round shft (row 0)
st1 {v18.8b},[x5], x3 //(from previous loop)st (row 7)
- sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 4)
- tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3)
- sub v9.8b, v9.8b , v3.8b //ref_main_idx - 1 (row 4)
+ sub v1.8b, v1.8b , v3.8b //ref_main_idx (row 4)
+ tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3)
+ sub v19.8b, v19.8b , v3.8b //ref_main_idx - 1 (row 4)
umull v20.8h, v14.8b, v7.8b //mul (row 2)
- tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3)
+ tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3)
umlal v20.8h, v15.8b, v6.8b //mul (row 2)
smull v14.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
@@ -392,22 +394,22 @@ lbl323:
rshrn v22.8b, v22.8h,#5 //round shft (row 1)
sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 5)
- tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 4)
+ tbl v12.8b, {v0.16b},v1.8b //load from ref_main_idx (row 4)
sub v5.8b, v5.8b , v3.8b //ref_main_idx - 1 (row 5)
- umull v18.8h, v10.8b, v7.8b //mul (row 3)
- tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 4)
- umlal v18.8h, v11.8b, v6.8b //mul (row 3)
+ umull v18.8h, v23.8b, v7.8b //mul (row 3)
+ tbl v13.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 4)
+ umlal v18.8h, v25.8b, v6.8b //mul (row 3)
st1 {v22.8b},[x2], x3 //st (row 1)
rshrn v20.8b, v20.8h,#5 //round shft (row 2)
- xtn v10.8b, v14.8h
+ xtn v23.8b, v14.8h
sshr v14.8h, v14.8h,#5
- sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 6)
+ sub v1.8b, v1.8b , v3.8b //ref_main_idx (row 6)
tbl v21.8b, {v0.16b},v4.8b //load from ref_main_idx (row 5)
- sub v9.8b, v9.8b , v3.8b //ref_main_idx - 1 (row 6)
+ sub v19.8b, v19.8b , v3.8b //ref_main_idx - 1 (row 6)
umull v24.8h, v12.8b, v7.8b //mul (row 4)
tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 5)
@@ -417,24 +419,24 @@ lbl323:
rshrn v18.8b, v18.8h,#5 //round shft (row 3)
sub x9, x9, #1
- sqxtn v11.8b, v14.8h
+ sqxtn v25.8b, v14.8h
sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 7)
- tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 6)
+ tbl v14.8b, {v0.16b},v1.8b //load from ref_main_idx (row 6)
sub v5.8b, v5.8b , v3.8b //ref_main_idx - 1 (row 7)
umull v22.8h, v21.8b, v7.8b //mul (row 5)
- tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 6)
+ tbl v15.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 6)
umlal v22.8h, v17.8b, v6.8b //mul (row 5)
- add v11.8b, v27.8b , v11.8b //ref_main_idx (add row)
+ add v25.8b, v27.8b , v25.8b //ref_main_idx (add row)
dup v26.8b,w9
st1 {v18.8b},[x2], x3 //st (row 3)
rshrn v24.8b, v24.8h,#5 //round shft (row 4)
add x2, x2, x3, lsl #2
- sub v11.8b, v11.8b , v2.8b //ref_main_idx -1 (sub 1)
+ sub v25.8b, v25.8b , v2.8b //ref_main_idx -1 (sub 1)
add x20, x7, x2
csel x2, x20, x2,gt
@@ -446,17 +448,17 @@ lbl323:
bne kernel_8_16_32
epil_8_16_32:
- tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7)
+ tbl v23.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7)
umull v20.8h, v14.8b, v7.8b //mul (row 6)
- tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7)
+ tbl v25.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7)
umlal v20.8h, v15.8b, v6.8b //mul (row 6)
st1 {v24.8b},[x5], x3 //st (row 4)
rshrn v24.8b, v22.8h,#5 //round shft (row 5)
- umull v18.8h, v10.8b, v7.8b //mul (row 7)
- umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+ umull v18.8h, v23.8b, v7.8b //mul (row 7)
+ umlal v18.8h, v25.8b, v6.8b //mul (row 7)
st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5)
rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6)
@@ -499,40 +501,40 @@ sz_4_proc:
movi v28.8b, #32
sshr v22.8h, v22.8h,#5
- sqxtn v8.8b, v22.8h
+ sqxtn v1.8b, v22.8h
and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0
sub v7.8b, v28.8b , v6.8b //32-fract
movi v27.8b, #7 //row 0 to 7(row-1)
- sub v8.8b, v8.8b , v2.8b //ref_main_idx (add 1)
- sub v8.8b, v26.8b , v8.8b //ref_main_idx
- add v8.8b, v8.8b , v27.8b //t0 compensate the pu1_src idx incremented by 8
- sub v9.8b, v8.8b , v2.8b //ref_main_idx - 1
+ sub v1.8b, v1.8b , v2.8b //ref_main_idx (add 1)
+ sub v1.8b, v26.8b , v1.8b //ref_main_idx
+ add v1.8b, v1.8b , v27.8b //to compensate the pu1_src idx incremented by 8
+ sub v19.8b, v1.8b , v2.8b //ref_main_idx - 1
- sub v4.8b, v8.8b , v2.8b //row 1 ref_main_idx
- sub v5.8b, v9.8b , v2.8b
+ sub v4.8b, v1.8b , v2.8b //row 1 ref_main_idx
+ sub v5.8b, v19.8b , v2.8b
- tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0)
- tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0)
+ tbl v12.8b, {v0.16b},v1.8b //load from ref_main_idx (row 0)
+ tbl v13.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 0)
umull v24.8h, v12.8b, v7.8b //mul (row 0)
tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1)
umlal v24.8h, v13.8b, v6.8b //mul (row 0)
- sub v8.8b, v8.8b , v3.8b //idx (row 2)
+ sub v1.8b, v1.8b , v3.8b //idx (row 2)
tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 1)
- sub v9.8b, v9.8b , v3.8b //idx+1 (row 2)
+ sub v19.8b, v19.8b , v3.8b //idx+1 (row 2)
umull v22.8h, v16.8b, v7.8b //mul (row 1)
- tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2)
+ tbl v12.8b, {v0.16b},v1.8b //load from ref_main_idx (row 2)
umlal v22.8h, v17.8b, v6.8b //mul (row 1)
rshrn v24.8b, v24.8h,#5 //round shift (row 0)
sub v4.8b, v4.8b , v3.8b //idx (row 3)
- tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2)
+ tbl v13.8b, {v0.16b},v19.8b //load from ref_main_idx + 1 (row 2)
sub v5.8b, v5.8b , v3.8b //idx+1 (row 3)
umull v20.8h, v12.8b, v7.8b //mul (row 2)
@@ -559,7 +561,8 @@ sz_4_proc:
end_func:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
ret
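
The epilogue above restores only d12-d15 because the rewritten body still keeps v12-v15 live, and AAPCS64 requires a callee to preserve (the low 64 bits of) v8-v15 only. A minimal sketch of the matching prologue/epilogue pair this implies; the prologue is not shown in the hunk, so its exact form and placement here are an assumption:

    // hypothetical prologue: save only the callee-saved D pairs the body clobbers
    stp     d12, d13, [sp, #-16]!
    stp     d14, d15, [sp, #-16]!
    stp     x19, x20, [sp, #-16]!
    // ... function body: v12-v15 stay live, v8-v11 uses renamed away ...
    // epilogue in LIFO order, as in the hunk above
    ldp     x19, x20, [sp], #16
    ldp     d14, d15, [sp], #16
    ldp     d12, d13, [sp], #16
    ret
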
diff --git a/common/arm64/ihevc_intra_pred_luma_planar.s b/common/arm64/ihevc_intra_pred_luma_planar.s
index d2f27a2..ba04f42 100644
--- a/common/arm64/ihevc_intra_pred_luma_planar.s
+++ b/common/arm64/ihevc_intra_pred_luma_planar.s
@@ -107,7 +107,7 @@
ihevc_intra_pred_luma_planar_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
adrp x11, :got:gau1_ihevc_planar_factor //loads table of coeffs
@@ -116,8 +116,8 @@ ihevc_intra_pred_luma_planar_av8:
clz w5,w4
sub x20, x5, #32
neg x5, x20
- dup v14.8h,w5
- neg v14.8h, v14.8h //shr value (so vneg)
+ dup v29.8h,w5
+ neg v29.8h, v29.8h //shr value (so vneg)
dup v2.8b,w4 //nt
dup v16.8h,w4 //nt
@@ -175,22 +175,22 @@ tf_sz_8_16_32:
col_loop_8_16_32:
- ld1 {v8.8b},[x12] //(1-8)load 8 coeffs [col+1]
- dup v12.8h,w4 //(1)
+ ld1 {v17.8b},[x12] //(1-8)load 8 coeffs [col+1]
+ dup v27.8h,w4 //(1)
ld1 {v4.8b},[x6] //(1-8)src[2nt-1-row]
- sub v9.8b, v2.8b , v8.8b //(1-8)[nt-1-col]
+ sub v19.8b, v2.8b , v17.8b //(1-8)[nt-1-col]
- umlal v12.8h, v5.8b, v0.8b //(1)(row+1) * src[nt-1]
+ umlal v27.8h, v5.8b, v0.8b //(1)(row+1) * src[nt-1]
ld1 {v3.8b},[x14] //(1-8)load 8 src[2nt+1+col]
- umlal v12.8h, v8.8b, v1.8b //(1)(col+1) * src[3nt+1]
+ umlal v27.8h, v17.8b, v1.8b //(1)(col+1) * src[3nt+1]
dup v20.8b, v4.8b[7] //(1)
- umlal v12.8h, v6.8b, v3.8b //(1)(nt-1-row) * src[2nt+1+col]
+ umlal v27.8h, v6.8b, v3.8b //(1)(nt-1-row) * src[2nt+1+col]
dup v21.8b, v4.8b[6] //(2)
- umlal v12.8h, v9.8b, v20.8b //(1)(nt-1-col) * src[2nt-1-row]
+ umlal v27.8h, v19.8b, v20.8b //(1)(nt-1-col) * src[2nt-1-row]
dup v30.8h,w4 //(2)
add v5.8b, v5.8b , v7.8b //(1)
@@ -201,46 +201,46 @@ col_loop_8_16_32:
umlal v30.8h, v5.8b, v0.8b //(2)
dup v28.8h,w4 //(3)
- umlal v30.8h, v8.8b, v1.8b //(2)
+ umlal v30.8h, v17.8b, v1.8b //(2)
umlal v30.8h, v6.8b, v3.8b //(2)
- umlal v30.8h, v9.8b, v21.8b //(2)
+ umlal v30.8h, v19.8b, v21.8b //(2)
- sshl v12.8h, v12.8h, v14.8h //(1)shr
+ sshl v27.8h, v27.8h, v29.8h //(1)shr
add v5.8b, v5.8b , v7.8b //(2)
sub v6.8b, v6.8b , v7.8b //(2)
- xtn v12.8b, v12.8h //(1)
+ xtn v27.8b, v27.8h //(1)
umlal v28.8h, v5.8b, v0.8b //(3)
dup v23.8b, v4.8b[4] //(4)
- umlal v28.8h, v8.8b, v1.8b //(3)
+ umlal v28.8h, v17.8b, v1.8b //(3)
- dup v10.8h,w4 //(4)
+ dup v25.8h,w4 //(4)
umlal v28.8h, v6.8b, v3.8b //(3)
- st1 {v12.8b},[x2], x3 //(1)str 8 values
- umlal v28.8h, v9.8b, v22.8b //(3)
+ st1 {v27.8b},[x2], x3 //(1)str 8 values
+ umlal v28.8h, v19.8b, v22.8b //(3)
- sshl v30.8h, v30.8h, v14.8h //(2)shr
+ sshl v30.8h, v30.8h, v29.8h //(2)shr
add v5.8b, v5.8b , v7.8b //(3)
sub v6.8b, v6.8b , v7.8b //(3)
xtn v30.8b, v30.8h //(2)
- umlal v10.8h, v5.8b, v0.8b //(4)
+ umlal v25.8h, v5.8b, v0.8b //(4)
dup v20.8b, v4.8b[3] //(5)
- umlal v10.8h, v8.8b, v1.8b //(4)
+ umlal v25.8h, v17.8b, v1.8b //(4)
dup v16.8h,w4 //(5)
- umlal v10.8h, v6.8b, v3.8b //(4)
+ umlal v25.8h, v6.8b, v3.8b //(4)
st1 {v30.8b},[x2], x3 //(2)str 8 values
- umlal v10.8h, v9.8b, v23.8b //(4)
+ umlal v25.8h, v19.8b, v23.8b //(4)
- sshl v28.8h, v28.8h, v14.8h //(3)shr
+ sshl v28.8h, v28.8h, v29.8h //(3)shr
add v5.8b, v5.8b , v7.8b //(4)
sub v6.8b, v6.8b , v7.8b //(4)
@@ -249,31 +249,31 @@ col_loop_8_16_32:
umlal v16.8h, v5.8b, v0.8b //(5)
dup v21.8b, v4.8b[2] //(6)
- umlal v16.8h, v8.8b, v1.8b //(5)
+ umlal v16.8h, v17.8b, v1.8b //(5)
dup v18.8h,w4 //(6)
umlal v16.8h, v6.8b, v3.8b //(5)
st1 {v28.8b},[x2], x3 //(3)str 8 values
- umlal v16.8h, v9.8b, v20.8b //(5)
+ umlal v16.8h, v19.8b, v20.8b //(5)
- sshl v10.8h, v10.8h, v14.8h //(4)shr
+ sshl v25.8h, v25.8h, v29.8h //(4)shr
add v5.8b, v5.8b , v7.8b //(5)
sub v6.8b, v6.8b , v7.8b //(5)
- xtn v10.8b, v10.8h //(4)
+ xtn v25.8b, v25.8h //(4)
umlal v18.8h, v5.8b, v0.8b //(6)
dup v22.8b, v4.8b[1] //(7)
- umlal v18.8h, v8.8b, v1.8b //(6)
+ umlal v18.8h, v17.8b, v1.8b //(6)
dup v26.8h,w4 //(7)
umlal v18.8h, v6.8b, v3.8b //(6)
- st1 {v10.8b},[x2], x3 //(4)str 8 values
- umlal v18.8h, v9.8b, v21.8b //(6)
+ st1 {v25.8b},[x2], x3 //(4)str 8 values
+ umlal v18.8h, v19.8b, v21.8b //(6)
- sshl v16.8h, v16.8h, v14.8h //(5)shr
+ sshl v16.8h, v16.8h, v29.8h //(5)shr
add v5.8b, v5.8b , v7.8b //(6)
sub v6.8b, v6.8b , v7.8b //(6)
@@ -282,15 +282,15 @@ col_loop_8_16_32:
umlal v26.8h, v5.8b, v0.8b //(7)
dup v23.8b, v4.8b[0] //(8)
- umlal v26.8h, v8.8b, v1.8b //(7)
+ umlal v26.8h, v17.8b, v1.8b //(7)
dup v24.8h,w4 //(8)
umlal v26.8h, v6.8b, v3.8b //(7)
st1 {v16.8b},[x2], x3 //(5)str 8 values
- umlal v26.8h, v9.8b, v22.8b //(7)
+ umlal v26.8h, v19.8b, v22.8b //(7)
- sshl v18.8h, v18.8h, v14.8h //(6)shr
+ sshl v18.8h, v18.8h, v29.8h //(6)shr
add v5.8b, v5.8b , v7.8b //(7)
sub v6.8b, v6.8b , v7.8b //(7)
@@ -299,14 +299,14 @@ col_loop_8_16_32:
umlal v24.8h, v5.8b, v0.8b //(8)
- umlal v24.8h, v8.8b, v1.8b //(8)
+ umlal v24.8h, v17.8b, v1.8b //(8)
umlal v24.8h, v6.8b, v3.8b //(8)
st1 {v18.8b},[x2], x3 //(6)str 8 values
- umlal v24.8h, v9.8b, v23.8b //(8)
+ umlal v24.8h, v19.8b, v23.8b //(8)
- sshl v26.8h, v26.8h, v14.8h //(7)shr
+ sshl v26.8h, v26.8h, v29.8h //(7)shr
subs x7, x7, #8
@@ -322,7 +322,7 @@ col_loop_8_16_32:
csel x12, x20, x12,le
csel x14, x0, x14,le //x14 reset
- ld1 {v8.8b},[x12] //(1n)(1-8)load 8 coeffs [col+1]
+ ld1 {v17.8b},[x12] //(1n)(1-8)load 8 coeffs [col+1]
sub x20, x6, #8 //for next set of rows
csel x6, x20, x6,le
@@ -330,12 +330,12 @@ col_loop_8_16_32:
add x20, x5, #8
csel x5, x20, x5,le
- dup v12.8h,w4 //(1n)(1)
+ dup v27.8h,w4 //(1n)(1)
ld1 {v5.8b},[x5]
ld1 {v4.8b},[x6] //(1n)(1-8)src[2nt-1-row]
- sub v9.8b, v2.8b , v8.8b //(1n)(1-8)[nt-1-col]
+ sub v19.8b, v2.8b , v17.8b //(1n)(1-8)[nt-1-col]
dup v20.8b, v4.8b[7] //(1n)(1)
sub v6.8b, v2.8b , v5.8b
@@ -345,19 +345,19 @@ col_loop_8_16_32:
kernel_plnr:
cmp x1, #0 // (cond loop)
- sshl v24.8h, v24.8h, v14.8h //(8)shr
+ sshl v24.8h, v24.8h, v29.8h //(8)shr
xtn v26.8b, v26.8h //(7)
- umlal v12.8h, v5.8b, v0.8b //(1)(row+1) * src[nt-1]
+ umlal v27.8h, v5.8b, v0.8b //(1)(row+1) * src[nt-1]
xtn v24.8b, v24.8h //(8)
- umlal v12.8h, v8.8b, v1.8b //(1)(col+1) * src[3nt+1]
+ umlal v27.8h, v17.8b, v1.8b //(1)(col+1) * src[3nt+1]
dup v21.8b, v4.8b[6] //(2)
- umlal v12.8h, v6.8b, v3.8b //(1)(nt-1-row) * src[2nt+1+col]
+ umlal v27.8h, v6.8b, v3.8b //(1)(nt-1-row) * src[2nt+1+col]
dup v30.8h,w4 //(2)
- umlal v12.8h, v9.8b, v20.8b //(1)(nt-1-col) * src[2nt-1-row]
+ umlal v27.8h, v19.8b, v20.8b //(1)(nt-1-col) * src[2nt-1-row]
st1 {v26.8b},[x2], x3 //(7)str 8 values
add v5.8b, v5.8b , v7.8b //(1)
@@ -371,15 +371,15 @@ kernel_plnr:
sub x20, x2, x10 //else go to next set of rows, dst - (nt-8) (cond loop)
csel x2, x20, x2,le
- umlal v30.8h, v8.8b, v1.8b //(2)
+ umlal v30.8h, v17.8b, v1.8b //(2)
dup v22.8b, v4.8b[5] //(3)
umlal v30.8h, v6.8b, v3.8b //(2)
dup v28.8h,w4 //(3)
- umlal v30.8h, v9.8b, v21.8b //(2)
+ umlal v30.8h, v19.8b, v21.8b //(2)
- sshl v12.8h, v12.8h, v14.8h //(1)shr
+ sshl v27.8h, v27.8h, v29.8h //(1)shr
add v5.8b, v5.8b , v7.8b //(2)
csel x1, x4, x1,le //nt reloaded (refresh the value) (cond loop)
@@ -387,37 +387,37 @@ kernel_plnr:
sub v6.8b, v6.8b , v7.8b //(2)
subs x1, x1, #8 //row counter (loop)
- xtn v12.8b, v12.8h //(1)
+ xtn v27.8b, v27.8h //(1)
umlal v28.8h, v5.8b, v0.8b //(3)
dup v23.8b, v4.8b[4] //(4)
- umlal v28.8h, v8.8b, v1.8b //(3)
+ umlal v28.8h, v17.8b, v1.8b //(3)
- dup v10.8h,w4 //(4)
+ dup v25.8h,w4 //(4)
umlal v28.8h, v6.8b, v3.8b //(3)
- st1 {v12.8b},[x2], x3 //(1)str 8 values
- umlal v28.8h, v9.8b, v22.8b //(3)
+ st1 {v27.8b},[x2], x3 //(1)str 8 values
+ umlal v28.8h, v19.8b, v22.8b //(3)
- sshl v30.8h, v30.8h, v14.8h //(2)shr
+ sshl v30.8h, v30.8h, v29.8h //(2)shr
add v5.8b, v5.8b , v7.8b //(3)
sub v6.8b, v6.8b , v7.8b //(3)
xtn v30.8b, v30.8h //(2)
- umlal v10.8h, v5.8b, v0.8b //(4)
+ umlal v25.8h, v5.8b, v0.8b //(4)
dup v20.8b, v4.8b[3] //(5)
- umlal v10.8h, v8.8b, v1.8b //(4)
+ umlal v25.8h, v17.8b, v1.8b //(4)
dup v16.8h,w4 //(5)
- umlal v10.8h, v6.8b, v3.8b //(4)
+ umlal v25.8h, v6.8b, v3.8b //(4)
st1 {v30.8b},[x2], x3 //(2)str 8 values
- umlal v10.8h, v9.8b, v23.8b //(4)
+ umlal v25.8h, v19.8b, v23.8b //(4)
- sshl v28.8h, v28.8h, v14.8h //(3)shr
+ sshl v28.8h, v28.8h, v29.8h //(3)shr
add v5.8b, v5.8b , v7.8b //(4)
@@ -427,17 +427,17 @@ kernel_plnr:
umlal v16.8h, v5.8b, v0.8b //(5)
dup v21.8b, v4.8b[2] //(6)
- umlal v16.8h, v8.8b, v1.8b //(5)
+ umlal v16.8h, v17.8b, v1.8b //(5)
dup v18.8h,w4 //(6)
umlal v16.8h, v6.8b, v3.8b //(5)
st1 {v28.8b},[x2], x3 //(3)str 8 values
- umlal v16.8h, v9.8b, v20.8b //(5)
+ umlal v16.8h, v19.8b, v20.8b //(5)
add x20, x11, #1 //x12 reset (cond loop)
csel x12, x20, x12,le
- sshl v10.8h, v10.8h, v14.8h //(4)shr
+ sshl v25.8h, v25.8h, v29.8h //(4)shr
add x20, x12, #8 //col inc (cond loop)
csel x12, x20, x12,gt
@@ -447,20 +447,20 @@ kernel_plnr:
csel x14, x20, x14,gt
sub v6.8b, v6.8b , v7.8b //(5)
- xtn v10.8b, v10.8h //(4)
+ xtn v25.8b, v25.8h //(4)
umlal v18.8h, v5.8b, v0.8b //(6)
dup v22.8b, v4.8b[1] //(7)
- umlal v18.8h, v8.8b, v1.8b //(6)
+ umlal v18.8h, v17.8b, v1.8b //(6)
dup v26.8h,w4 //(7)
umlal v18.8h, v6.8b, v3.8b //(6)
- st1 {v10.8b},[x2], x3 //(4)str 8 values
- umlal v18.8h, v9.8b, v21.8b //(6)
+ st1 {v25.8b},[x2], x3 //(4)str 8 values
+ umlal v18.8h, v19.8b, v21.8b //(6)
csel x14, x0, x14,le //x14 reset (cond loop)
- sshl v16.8h, v16.8h, v14.8h //(5)shr
+ sshl v16.8h, v16.8h, v29.8h //(5)shr
sub x20, x6, #8 //for next set of rows (cond loop)
csel x6, x20, x6,le
@@ -474,16 +474,16 @@ kernel_plnr:
umlal v26.8h, v5.8b, v0.8b //(7)
dup v23.8b, v4.8b[0] //(8)
- umlal v26.8h, v8.8b, v1.8b //(7)
+ umlal v26.8h, v17.8b, v1.8b //(7)
dup v24.8h,w4 //(8)
umlal v26.8h, v6.8b, v3.8b //(7)
st1 {v16.8b},[x2], x3 //(5)str 8 values
- umlal v26.8h, v9.8b, v22.8b //(7)
+ umlal v26.8h, v19.8b, v22.8b //(7)
ld1 {v4.8b},[x6] //(1n)(1-8)src[2nt-1-row]
- sshl v18.8h, v18.8h, v14.8h //(6)shr
+ sshl v18.8h, v18.8h, v29.8h //(6)shr
add v5.8b, v5.8b , v7.8b //(7)
@@ -493,24 +493,24 @@ kernel_plnr:
umlal v24.8h, v5.8b, v0.8b //(8)
ld1 {v5.8b},[x5] //(row+1 value)
- umlal v24.8h, v8.8b, v1.8b //(8)
+ umlal v24.8h, v17.8b, v1.8b //(8)
dup v20.8b, v4.8b[7] //(1n)(1)
umlal v24.8h, v6.8b, v3.8b //(8)
st1 {v18.8b},[x2], x3 //(6)str 8 values
- umlal v24.8h, v9.8b, v23.8b //(8)
+ umlal v24.8h, v19.8b, v23.8b //(8)
- ld1 {v8.8b},[x12] //(1n)(1-8)load 8 coeffs [col+1]
+ ld1 {v17.8b},[x12] //(1n)(1-8)load 8 coeffs [col+1]
sub v6.8b, v2.8b , v5.8b //(nt-1-row) value
subs x7, x7, #8 //col counter
ld1 {v3.8b},[x14] //(1n)(1-8)load 8 src[2nt+1+col]
- sshl v26.8h, v26.8h, v14.8h //(7)shr
+ sshl v26.8h, v26.8h, v29.8h //(7)shr
- dup v12.8h,w4 //(1n)(1)
- sub v9.8b, v2.8b , v8.8b //(1n)(1-8)[nt-1-col]
+ dup v27.8h,w4 //(1n)(1)
+ sub v19.8b, v2.8b , v17.8b //(1n)(1-8)[nt-1-col]
bne kernel_plnr
@@ -519,7 +519,7 @@ epilog:
xtn v26.8b, v26.8h //(7)
st1 {v26.8b},[x2], x3 //(7)str 8 values
- sshl v24.8h, v24.8h, v14.8h //(8)shr
+ sshl v24.8h, v24.8h, v29.8h //(8)shr
xtn v24.8b, v24.8h //(8)
st1 {v24.8b},[x2], x3 //(8)str 8 values
@@ -528,25 +528,25 @@ epilog:
beq end_loop
tf_sz_4:
- ld1 {v10.8b},[x14] //load src[2nt+1+col]
- ld1 {v8.8b},[x12], x10 //load 8 coeffs [col+1]
+ ld1 {v25.8b},[x14] //load src[2nt+1+col]
+ ld1 {v17.8b},[x12], x10 //load 8 coeffs [col+1]
loop_sz_4:
mov x10, #4 //reduce inc to #4 for 4x4
ldr w7, [x6], #-1 //src[2nt-1-row] (dec to take into account row)
sxtw x7,w7
dup v4.8b,w7 //src[2nt-1-row]
- sub v9.8b, v2.8b , v8.8b //[nt-1-col]
+ sub v19.8b, v2.8b , v17.8b //[nt-1-col]
- umull v12.8h, v5.8b, v0.8b //(row+1) * src[nt-1]
- umlal v12.8h, v6.8b, v10.8b //(nt-1-row) * src[2nt+1+col]
- umlal v12.8h, v8.8b, v1.8b //(col+1) * src[3nt+1]
- umlal v12.8h, v9.8b, v4.8b //(nt-1-col) * src[2nt-1-row]
+ umull v27.8h, v5.8b, v0.8b //(row+1) * src[nt-1]
+ umlal v27.8h, v6.8b, v25.8b //(nt-1-row) * src[2nt+1+col]
+ umlal v27.8h, v17.8b, v1.8b //(col+1) * src[3nt+1]
+ umlal v27.8h, v19.8b, v4.8b //(nt-1-col) * src[2nt-1-row]
// vadd.i16 q6, q6, q8 @add (nt)
// vshl.s16 q6, q6, q7 @shr
// vmovn.i16 d12, q6
- rshrn v12.8b, v12.8h,#3
- st1 {v12.s}[0],[x2], x3
+ rshrn v27.8b, v27.8h,#3
+ st1 {v27.s}[0],[x2], x3
add v5.8b, v5.8b , v7.8b //row++ [(row+1)++]
sub v6.8b, v6.8b , v7.8b //[nt-1-row]--
@@ -557,7 +557,7 @@ loop_sz_4:
end_loop:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
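
In the planar kernel the rewrite renames every v8-v15 use it had (v8->v17, v9->v19, v10->v25, v12->v27, v14->v29), so no callee-saved SIMD register is touched and both push_v_regs and pop_v_regs disappear. A sketch of the resulting frame, assuming x19/x20 remain the only callee-saved state the function clobbers:

ihevc_intra_pred_luma_planar_av8:
    stp     x19, x20, [sp, #-16]!      // only GPR pair still needs saving
    // ... body now uses only v0-v7 and v16-v31 ...
    ldp     x19, x20, [sp], #16
    ret
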
diff --git a/common/arm64/ihevc_intra_pred_luma_vert.s b/common/arm64/ihevc_intra_pred_luma_vert.s
index 56a20a0..c67f721 100644
--- a/common/arm64/ihevc_intra_pred_luma_vert.s
+++ b/common/arm64/ihevc_intra_pred_luma_vert.s
@@ -101,7 +101,7 @@
ihevc_intra_pred_luma_ver_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
lsl x5, x4, #1 //2nt
@@ -207,7 +207,7 @@ blk_16:
sqadd v0.8h, v0.8h , v30.8h
sqadd v28.8h, v28.8h , v30.8h
- movi d10, #0x00000000000000ff
+ movi d3, #0x00000000000000ff
//vaddl.s8 q1, d25, d27
sqxtun v24.8b, v28.8h
@@ -218,13 +218,13 @@ blk_16:
rev64 v24.16b, v24.16b
mov v25.d[0], v24.d[1]
- mov v11.d[0],v17.d[0]
+ mov v4.d[0],v17.d[0]
bsl v18.8b, v24.8b , v16.8b //only select row values from q12(predpixel)
- bsl v10.8b, v25.8b , v16.8b
+ bsl v3.8b, v25.8b , v16.8b
- movi d8, #0x00000000000000ff
- mov v9.d[0],v17.d[0]
+ movi d1, #0x00000000000000ff
+ mov v2.d[0],v17.d[0]
movi d6, #0x00000000000000ff
mov v7.d[0],v17.d[0]
@@ -232,14 +232,14 @@ blk_16:
st1 {v18.8b, v19.8b}, [x2], x3
sshr d24, d24,#8
- st1 {v10.8b, v11.8b}, [x5], x3
+ st1 {v3.8b, v4.8b}, [x5], x3
sshr d25, d25,#8
- bsl v8.8b, v24.8b , v16.8b
+ bsl v1.8b, v24.8b , v16.8b
bsl v6.8b, v25.8b , v16.8b
- st1 {v8.8b, v9.8b}, [x2], x3
+ st1 {v1.8b, v2.8b}, [x2], x3
sshr d24, d24,#8
st1 {v6.8b, v7.8b}, [x5], x3
@@ -250,34 +250,34 @@ blk_16:
movi d18, #0x00000000000000ff
//vmov.i64 d19, d17
- movi d10, #0x00000000000000ff
+ movi d3, #0x00000000000000ff
//vmov.i64 d11, d17
loop_16:
- movi d8, #0x00000000000000ff
+ movi d1, #0x00000000000000ff
movi d6, #0x00000000000000ff
bsl v18.8b, v24.8b , v16.8b //only select row values from q12(predpixel)
- bsl v10.8b, v25.8b , v16.8b
+ bsl v3.8b, v25.8b , v16.8b
st1 {v18.8b, v19.8b}, [x2], x3
sshr d24, d24,#8
- st1 {v10.8b, v11.8b}, [x5], x3
+ st1 {v3.8b, v4.8b}, [x5], x3
sshr d25, d25,#8
movi d18, #0x00000000000000ff
- movi d10, #0x00000000000000ff
+ movi d3, #0x00000000000000ff
- bsl v8.8b, v24.8b , v16.8b
+ bsl v1.8b, v24.8b , v16.8b
bsl v6.8b, v25.8b , v16.8b
- st1 {v8.8b, v9.8b}, [x2], x3
+ st1 {v1.8b, v2.8b}, [x2], x3
sshr d24, d24,#8
st1 {v6.8b, v7.8b}, [x5], x3
@@ -287,23 +287,23 @@ loop_16:
bne loop_16
- movi d8, #0x00000000000000ff
+ movi d1, #0x00000000000000ff
movi d6, #0x00000000000000ff
bsl v18.8b, v24.8b , v16.8b //only select row values from q12(predpixel)
- bsl v10.8b, v25.8b , v16.8b
+ bsl v3.8b, v25.8b , v16.8b
st1 {v18.8b, v19.8b}, [x2], x3
sshr d24, d24,#8
- st1 {v10.8b, v11.8b}, [x5], x3
+ st1 {v3.8b, v4.8b}, [x5], x3
sshr d25, d25,#8
- bsl v8.8b, v24.8b , v16.8b
+ bsl v1.8b, v24.8b , v16.8b
bsl v6.8b, v25.8b , v16.8b
- st1 {v8.8b, v9.8b}, [x2], x3
+ st1 {v1.8b, v2.8b}, [x2], x3
st1 {v6.8b, v7.8b}, [x5], x3
@@ -311,10 +311,10 @@ loop_16:
blk_4_8:
- movi d11, #0x00000000000000ff
+ movi d4, #0x00000000000000ff
add x6, x0, x5 //&src[2nt]
- movi d10, #0x00000000000000ff
+ movi d3, #0x00000000000000ff
ldrb w11, [x6], #1 //src[2nt]
sxtw x11,w11
@@ -363,19 +363,19 @@ blk_4_8:
movi d19, #0x00000000000000ff
- bsl v10.8b, v24.8b , v16.8b
+ bsl v3.8b, v24.8b , v16.8b
- st1 {v10.8b},[x2], x3
+ st1 {v3.8b},[x2], x3
sshr d24, d24,#8
- movi d10, #0x00000000000000ff
+ movi d3, #0x00000000000000ff
- bsl v11.8b, v24.8b , v16.8b
+ bsl v4.8b, v24.8b , v16.8b
- st1 {v11.8b},[x2], x3
+ st1 {v4.8b},[x2], x3
sshr d24, d24,#8
- movi d11, #0x00000000000000ff
+ movi d4, #0x00000000000000ff
bsl v18.8b, v24.8b , v16.8b //only select row values from q12(predpixel)
@@ -387,14 +387,14 @@ blk_4_8:
st1 {v19.8b},[x2], x3
sshr d24, d24,#8
- bsl v10.8b, v24.8b , v16.8b
+ bsl v3.8b, v24.8b , v16.8b
- st1 {v10.8b},[x2], x3
+ st1 {v3.8b},[x2], x3
sshr d24, d24,#8
- bsl v11.8b, v24.8b , v16.8b
+ bsl v4.8b, v24.8b , v16.8b
- st1 {v11.8b},[x2], x3
+ st1 {v4.8b},[x2], x3
sshr d24, d24,#8
b end_func
@@ -411,19 +411,19 @@ blk_4:
st1 {v19.s}[0],[x2], x3
sshr d24, d24,#8
- bsl v10.8b, v24.8b , v16.8b
+ bsl v3.8b, v24.8b , v16.8b
- st1 {v10.s}[0],[x2], x3
+ st1 {v3.s}[0],[x2], x3
sshr d24, d24,#8
- bsl v11.8b, v24.8b , v16.8b
- st1 {v11.s}[0],[x2], x3
+ bsl v4.8b, v24.8b , v16.8b
+ st1 {v4.s}[0],[x2], x3
end_func:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_itrans_recon_4x4.s b/common/arm64/ihevc_itrans_recon_4x4.s
index b18fb89..1f2c904 100644
--- a/common/arm64/ihevc_itrans_recon_4x4.s
+++ b/common/arm64/ihevc_itrans_recon_4x4.s
@@ -119,7 +119,7 @@
ihevc_itrans_recon_4x4_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
adrp x8, :got:g_ai2_ihevc_trans_4_transpose
@@ -142,21 +142,21 @@ ihevc_itrans_recon_4x4_av8:
// first stage computation starts
smull v6.4s, v1.4h, v4.4h[1] //83 * pi2_src[1]
smlal v6.4s, v3.4h, v4.4h[3] //o[0] = 83 * pi2_src[1] + 36 * pi2_src[3]
- smull v8.4s, v1.4h, v4.4h[3] //36 * pi2_src[1]
+ smull v5.4s, v1.4h, v4.4h[3] //36 * pi2_src[1]
ld1 {v22.s}[0],[x2],x5
- smlsl v8.4s, v3.4h, v4.4h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3]
+ smlsl v5.4s, v3.4h, v4.4h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3]
- saddl v10.4s, v0.4h, v2.4h //pi2_src[0] + pi2_src[2]
- ssubl v12.4s, v0.4h, v2.4h //pi2_src[0] - pi2_src[2]
- shl v10.4s, v10.4s,#6 //e[0] = 64*(pi2_src[0] + pi2_src[2])
- shl v12.4s, v12.4s,#6 //e[1] = 64*(pi2_src[0] - pi2_src[2])
+ saddl v7.4s, v0.4h, v2.4h //pi2_src[0] + pi2_src[2]
+ ssubl v17.4s, v0.4h, v2.4h //pi2_src[0] - pi2_src[2]
+ shl v7.4s, v7.4s,#6 //e[0] = 64*(pi2_src[0] + pi2_src[2])
+ shl v17.4s, v17.4s,#6 //e[1] = 64*(pi2_src[0] - pi2_src[2])
- add v14.4s, v10.4s , v6.4s //((e[0] + o[0] )
- add v16.4s, v12.4s , v8.4s //((e[1] + o[1])
- sub v18.4s, v12.4s , v8.4s //((e[1] - o[1])
- sub v20.4s, v10.4s , v6.4s //((e[0] - o[0])
+ add v19.4s, v7.4s , v6.4s //((e[0] + o[0] )
+ add v16.4s, v17.4s , v5.4s //((e[1] + o[1])
+ sub v18.4s, v17.4s , v5.4s //((e[1] - o[1])
+ sub v20.4s, v7.4s , v6.4s //((e[0] - o[0])
- sqrshrn v28.4h, v14.4s,#shift_stage1_idct //pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift) )
+ sqrshrn v28.4h, v19.4s,#shift_stage1_idct //pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift) )
sqrshrn v29.4h, v16.4s,#shift_stage1_idct //pi2_out[1] = clip_s16((e[1] + o[1] + add)>>shift) )
sqrshrn v30.4h, v18.4s,#shift_stage1_idct //pi2_out[2] = clip_s16((e[0] - o[0] + add)>>shift) )
sqrshrn v31.4h, v20.4s,#shift_stage1_idct //pi2_out[3] = clip_s16((e[0] - o[0] + add)>>shift) )
@@ -176,22 +176,22 @@ ihevc_itrans_recon_4x4_av8:
smull v6.4s, v1.4h, v4.4h[1] //83 * pi2_src[1]
ld1 {v22.s}[1],[x2],x5
smlal v6.4s, v3.4h, v4.4h[3] //o[0] = 83 * pi2_src[1] + 36 * pi2_src[3]
- smull v8.4s, v1.4h, v4.4h[3] //36 * pi2_src[1]
- smlsl v8.4s, v3.4h, v4.4h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3]
+ smull v5.4s, v1.4h, v4.4h[3] //36 * pi2_src[1]
+ smlsl v5.4s, v3.4h, v4.4h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3]
ld1 {v23.s}[0],[x2],x5
- saddl v10.4s, v0.4h, v2.4h //pi2_src[0] + pi2_src[2]
- ssubl v12.4s, v0.4h, v2.4h //pi2_src[0] - pi2_src[2]
- shl v10.4s, v10.4s,#6 //e[0] = 64*(pi2_src[0] + pi2_src[2])
- shl v12.4s, v12.4s,#6 //e[1] = 64*(pi2_src[0] - pi2_src[2])
+ saddl v7.4s, v0.4h, v2.4h //pi2_src[0] + pi2_src[2]
+ ssubl v17.4s, v0.4h, v2.4h //pi2_src[0] - pi2_src[2]
+ shl v7.4s, v7.4s,#6 //e[0] = 64*(pi2_src[0] + pi2_src[2])
+ shl v17.4s, v17.4s,#6 //e[1] = 64*(pi2_src[0] - pi2_src[2])
- add v14.4s, v10.4s , v6.4s //((e[0] + o[0] )
- add v16.4s, v12.4s , v8.4s //((e[1] + o[1])
- sub v18.4s, v12.4s , v8.4s //((e[1] - o[1])
- sub v20.4s, v10.4s , v6.4s //((e[0] - o[0])
+ add v19.4s, v7.4s , v6.4s //((e[0] + o[0] )
+ add v16.4s, v17.4s , v5.4s //((e[1] + o[1])
+ sub v18.4s, v17.4s , v5.4s //((e[1] - o[1])
+ sub v20.4s, v7.4s , v6.4s //((e[0] - o[0])
- sqrshrn v28.4h, v14.4s,#shift_stage2_idct //pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift) )
+ sqrshrn v28.4h, v19.4s,#shift_stage2_idct //pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift) )
sqrshrn v29.4h, v16.4s,#shift_stage2_idct //pi2_out[1] = clip_s16((e[1] + o[1] + add)>>shift) )
sqrshrn v30.4h, v18.4s,#shift_stage2_idct //pi2_out[2] = clip_s16((e[0] - o[0] + add)>>shift) )
sqrshrn v31.4h, v20.4s,#shift_stage2_idct //pi2_out[3] = clip_s16((e[0] - o[0] + add)>>shift) )
@@ -228,7 +228,7 @@ ihevc_itrans_recon_4x4_av8:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
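
For reference, the first stage these hunks re-register is a partial butterfly; the following restates what the smull/smlal, saddl/ssubl and sqrshrn sequence computes (the rounding term is the one folded into sqrshrn), and is a summary rather than text from the patch:

    e_0 = 64\,(s_0 + s_2), \qquad e_1 = 64\,(s_0 - s_2)
    o_0 = 83\,s_1 + 36\,s_3, \qquad o_1 = 36\,s_1 - 83\,s_3
    \mathrm{out}_0 = \mathrm{clip}_{s16}\big((e_0 + o_0 + r) \gg \mathrm{shift}\big), \quad
    \mathrm{out}_1 = \mathrm{clip}_{s16}\big((e_1 + o_1 + r) \gg \mathrm{shift}\big)
    \mathrm{out}_2 = \mathrm{clip}_{s16}\big((e_1 - o_1 + r) \gg \mathrm{shift}\big), \quad
    \mathrm{out}_3 = \mathrm{clip}_{s16}\big((e_0 - o_0 + r) \gg \mathrm{shift}\big)

with s_k = pi2_src[k] and r = 2^{shift-1}.
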
diff --git a/common/arm64/ihevc_itrans_recon_4x4_ttype1.s b/common/arm64/ihevc_itrans_recon_4x4_ttype1.s
index fa04b8e..da04c5e 100644
--- a/common/arm64/ihevc_itrans_recon_4x4_ttype1.s
+++ b/common/arm64/ihevc_itrans_recon_4x4_ttype1.s
@@ -118,7 +118,7 @@
ihevc_itrans_recon_4x4_ttype1_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
add x4,x4,x4 // src_strd in terms of word16
@@ -142,33 +142,33 @@ ihevc_itrans_recon_4x4_ttype1_av8:
smlal v6.4s, v3.4h, v4.4h[1] //74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3]
smlal v6.4s, v2.4h, v4.4h[3] //pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3]
- smull v8.4s, v1.4h, v4.4h[2] //74 * pi2_src[1]
- smlal v8.4s, v0.4h, v4.4h[1] //74 * pi2_src[1] + 55 * pi2_src[0]
- smlsl v8.4s, v2.4h, v4.4h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2]
- smlsl v8.4s, v3.4h, v4.4h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3])
+ smull v5.4s, v1.4h, v4.4h[2] //74 * pi2_src[1]
+ smlal v5.4s, v0.4h, v4.4h[1] //74 * pi2_src[1] + 55 * pi2_src[0]
+ smlsl v5.4s, v2.4h, v4.4h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2]
+ smlsl v5.4s, v3.4h, v4.4h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3])
- smull v10.4s, v0.4h, v4.4h[2] // 74 * pi2_src[0]
- smlsl v10.4s, v2.4h, v4.4h[2] // 74 * pi2_src[0] - 74 * pi2_src[2]
- smlal v10.4s, v3.4h, v4.4h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3]
+ smull v7.4s, v0.4h, v4.4h[2] // 74 * pi2_src[0]
+ smlsl v7.4s, v2.4h, v4.4h[2] // 74 * pi2_src[0] - 74 * pi2_src[2]
+ smlal v7.4s, v3.4h, v4.4h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3]
- smull v12.4s, v2.4h, v4.4h[1] // 55 * pi2_src[2]
- smlsl v12.4s, v1.4h, v4.4h[2] // 55 * pi2_src[2] - 74 * pi2_src[1]
- smlsl v12.4s, v3.4h, v4.4h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
- smlal v12.4s, v0.4h, v4.4h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
+ smull v20.4s, v2.4h, v4.4h[1] // 55 * pi2_src[2]
+ smlsl v20.4s, v1.4h, v4.4h[2] // 55 * pi2_src[2] - 74 * pi2_src[1]
+ smlsl v20.4s, v3.4h, v4.4h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
+ smlal v20.4s, v0.4h, v4.4h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
sqrshrn v28.4h, v6.4s,#shift_stage1_idct // (pi2_out[0] + rounding ) >> shift_stage1_idct
- sqrshrn v29.4h, v8.4s,#shift_stage1_idct // (pi2_out[1] + rounding ) >> shift_stage1_idct
- sqrshrn v30.4h, v10.4s,#shift_stage1_idct // (pi2_out[2] + rounding ) >> shift_stage1_idct
- sqrshrn v31.4h, v12.4s,#shift_stage1_idct // (pi2_out[3] + rounding ) >> shift_stage1_idct
+ sqrshrn v29.4h, v5.4s,#shift_stage1_idct // (pi2_out[1] + rounding ) >> shift_stage1_idct
+ sqrshrn v30.4h, v7.4s,#shift_stage1_idct // (pi2_out[2] + rounding ) >> shift_stage1_idct
+ sqrshrn v31.4h, v20.4s,#shift_stage1_idct // (pi2_out[3] + rounding ) >> shift_stage1_idct
ld1 {v18.s}[0],[x2],x5
trn1 v24.4h, v28.4h, v29.4h
trn2 v25.4h, v28.4h, v29.4h
trn1 v26.4h, v30.4h, v31.4h
trn2 v27.4h, v30.4h, v31.4h
- trn1 v14.2s, v24.2s, v26.2s
+ trn1 v21.2s, v24.2s, v26.2s
trn2 v16.2s, v24.2s, v26.2s
- trn1 v15.2s, v25.2s, v27.2s
+ trn1 v22.2s, v25.2s, v27.2s
trn2 v17.2s, v25.2s, v27.2s
// output in d14,d15,d16,d17
// first stage computation ends
@@ -180,30 +180,30 @@ ihevc_itrans_recon_4x4_ttype1_av8:
// d16 - d2
// d17 - d3
ld1 {v18.s}[1],[x2],x5
- smull v6.4s, v15.4h, v4.4h[2] //74 * pi2_src[1]
- smlal v6.4s, v14.4h, v4.4h[0] //74 * pi2_src[1] + 29 * pi2_src[0]
+ smull v6.4s, v22.4h, v4.4h[2] //74 * pi2_src[1]
+ smlal v6.4s, v21.4h, v4.4h[0] //74 * pi2_src[1] + 29 * pi2_src[0]
smlal v6.4s, v17.4h, v4.4h[1] //74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3]
smlal v6.4s, v16.4h, v4.4h[3] //pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3]
- smull v8.4s, v15.4h, v4.4h[2] //74 * pi2_src[1]
- smlal v8.4s, v14.4h, v4.4h[1] //74 * pi2_src[1] + 55 * pi2_src[0]
- smlsl v8.4s, v16.4h, v4.4h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2]
- smlsl v8.4s, v17.4h, v4.4h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3])
+ smull v5.4s, v22.4h, v4.4h[2] //74 * pi2_src[1]
+ smlal v5.4s, v21.4h, v4.4h[1] //74 * pi2_src[1] + 55 * pi2_src[0]
+ smlsl v5.4s, v16.4h, v4.4h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2]
+ smlsl v5.4s, v17.4h, v4.4h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3])
- smull v10.4s, v14.4h, v4.4h[2] // 74 * pi2_src[0]
- smlsl v10.4s, v16.4h, v4.4h[2] // 74 * pi2_src[0] - 74 * pi2_src[2]
- smlal v10.4s, v17.4h, v4.4h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3]
+ smull v7.4s, v21.4h, v4.4h[2] // 74 * pi2_src[0]
+ smlsl v7.4s, v16.4h, v4.4h[2] // 74 * pi2_src[0] - 74 * pi2_src[2]
+ smlal v7.4s, v17.4h, v4.4h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3]
ld1 {v19.s}[0],[x2],x5
- smull v12.4s, v16.4h, v4.4h[1] // 55 * pi2_src[2]
- smlsl v12.4s, v15.4h, v4.4h[2] // - 74 * pi2_src[1] + 55 * pi2_src[2]
- smlsl v12.4s, v17.4h, v4.4h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
- smlal v12.4s, v14.4h, v4.4h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
+ smull v20.4s, v16.4h, v4.4h[1] // 55 * pi2_src[2]
+ smlsl v20.4s, v22.4h, v4.4h[2] // - 74 * pi2_src[1] + 55 * pi2_src[2]
+ smlsl v20.4s, v17.4h, v4.4h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
+ smlal v20.4s, v21.4h, v4.4h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
sqrshrn v28.4h, v6.4s,#shift_stage2_idct // (pi2_out[0] + rounding ) >> shift_stage1_idct
- sqrshrn v29.4h, v8.4s,#shift_stage2_idct // (pi2_out[1] + rounding ) >> shift_stage1_idct
- sqrshrn v30.4h, v10.4s,#shift_stage2_idct // (pi2_out[2] + rounding ) >> shift_stage1_idct
- sqrshrn v31.4h, v12.4s,#shift_stage2_idct // (pi2_out[3] + rounding ) >> shift_stage1_idct
+ sqrshrn v29.4h, v5.4s,#shift_stage2_idct // (pi2_out[1] + rounding ) >> shift_stage1_idct
+ sqrshrn v30.4h, v7.4s,#shift_stage2_idct // (pi2_out[2] + rounding ) >> shift_stage1_idct
+ sqrshrn v31.4h, v20.4s,#shift_stage2_idct // (pi2_out[3] + rounding ) >> shift_stage1_idct
ld1 {v19.s}[1],[x2],x5
trn1 v24.4h, v28.4h, v29.4h
trn2 v25.4h, v28.4h, v29.4h
@@ -233,7 +233,7 @@ ihevc_itrans_recon_4x4_ttype1_av8:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_neon_macros.s b/common/arm64/ihevc_neon_macros.s
index 09a1de9..c5e65e5 100644
--- a/common/arm64/ihevc_neon_macros.s
+++ b/common/arm64/ihevc_neon_macros.s
@@ -47,4 +47,3 @@
ldp d10,d11,[sp],#16
ldp d8,d9,[sp],#16
.endm
-
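
Only the tail of pop_v_regs is visible in this hunk; for context, a sketch of what the push_v_regs/pop_v_regs pair presumably expands to, consistent with the two ldp lines above but reconstructed rather than quoted from the file:

.macro push_v_regs
    stp     d8,  d9,  [sp, #-16]!
    stp     d10, d11, [sp, #-16]!
    stp     d12, d13, [sp, #-16]!
    stp     d14, d15, [sp, #-16]!
.endm

.macro pop_v_regs
    ldp     d14, d15, [sp], #16
    ldp     d12, d13, [sp], #16
    ldp     d10, d11, [sp], #16
    ldp     d8,  d9,  [sp], #16
.endm

The per-file edits above and below replace this blanket save/restore with either nothing (when v8-v15 are renamed away) or a targeted stp/ldp of just the D registers a function still clobbers.
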
diff --git a/common/arm64/ihevc_sao_band_offset_luma.s b/common/arm64/ihevc_sao_band_offset_luma.s
index 099d581..779ee69 100644
--- a/common/arm64/ihevc_sao_band_offset_luma.s
+++ b/common/arm64/ihevc_sao_band_offset_luma.s
@@ -76,7 +76,10 @@ ihevc_sao_band_offset_luma_av8:
LDR w8,[sp] //Loads ht
- push_v_regs
+
+ stp d13,d14,[sp,#-16]!
+ stp d8,d15,[sp,#-16]! // Storing d15 using { sub sp,sp,#8; str d15,[sp] } is giving bus error.
+ // d8 is used as dummy register and stored along with d15 using stp. d8 is not used in the function.
stp x19, x20,[sp,#-16]!
MOV x9,x8 //Move the ht to x9 for loop counter
@@ -127,7 +130,7 @@ SRC_TOP_LOOP: //wd is always multiple of 8
ADD v7.8b, v3.8b , v31.8b //band_table.val[2] = vadd_u8(band_table.val[2], band_pos)
dup v27.8b, v30.8b[3] //vdup_n_u8(pi1_sao_offset[3])
- ADD v8.8b, v4.8b , v31.8b //band_table.val[3] = vadd_u8(band_table.val[3], band_pos)
+ ADD v21.8b, v4.8b , v31.8b //band_table.val[3] = vadd_u8(band_table.val[3], band_pos)
dup v26.8b, v30.8b[4] //vdup_n_u8(pi1_sao_offset[4])
ADD v1.8b, v5.8b , v29.8b //band_table.val[0] = vadd_u8(band_table.val[0], vdup_n_u8(pi1_sao_offset[1]))
@@ -138,52 +141,52 @@ SRC_TOP_LOOP: //wd is always multiple of 8
CMP x5,#28
ADD v3.8b, v7.8b , v27.8b //band_table.val[2] = vadd_u8(band_table.val[2], vdup_n_u8(pi1_sao_offset[3]))
- ADD v4.8b, v8.8b , v26.8b //band_table.val[3] = vadd_u8(band_table.val[3], vdup_n_u8(pi1_sao_offset[4]))
+ ADD v4.8b, v21.8b , v26.8b //band_table.val[3] = vadd_u8(band_table.val[3], vdup_n_u8(pi1_sao_offset[4]))
BLT SAO_BAND_POS_0
SAO_BAND_POS_28: //case 28
- cmhs v12.8b, v29.8b , v4.8b //vcle_u8(band_table.val[3], vdup_n_u8(16))
+ cmhs v25.8b, v29.8b , v4.8b //vcle_u8(band_table.val[3], vdup_n_u8(16))
BNE SAO_BAND_POS_29
- ORR v4.8b, v4.8b , v12.8b //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
+ ORR v4.8b, v4.8b , v25.8b //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
B SWITCH_BREAK
SAO_BAND_POS_29: //case 29
CMP x5,#29
- cmhs v11.8b, v29.8b , v3.8b //vcle_u8(band_table.val[2], vdup_n_u8(16))
+ cmhs v24.8b, v29.8b , v3.8b //vcle_u8(band_table.val[2], vdup_n_u8(16))
BNE SAO_BAND_POS_30
- ORR v3.8b, v3.8b , v11.8b //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
+ ORR v3.8b, v3.8b , v24.8b //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
- AND v4.8b, v4.8b , v12.8b //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
+ AND v4.8b, v4.8b , v25.8b //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
B SWITCH_BREAK
SAO_BAND_POS_30: //case 30
CMP x5,#30
- cmhs v10.8b, v29.8b , v2.8b //vcle_u8(band_table.val[1], vdup_n_u8(16))
+ cmhs v23.8b, v29.8b , v2.8b //vcle_u8(band_table.val[1], vdup_n_u8(16))
BNE SAO_BAND_POS_31
- ORR v2.8b, v2.8b , v10.8b //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
+ ORR v2.8b, v2.8b , v23.8b //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
- AND v3.8b, v3.8b , v11.8b //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
+ AND v3.8b, v3.8b , v24.8b //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
B SWITCH_BREAK
SAO_BAND_POS_31: //case 31
CMP x5,#31
BNE SWITCH_BREAK
- cmhs v9.8b, v29.8b , v1.8b //vcle_u8(band_table.val[0], vdup_n_u8(16))
- ORR v1.8b, v1.8b , v9.8b //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
+ cmhs v22.8b, v29.8b , v1.8b //vcle_u8(band_table.val[0], vdup_n_u8(16))
+ ORR v1.8b, v1.8b , v22.8b //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
- AND v2.8b, v2.8b , v10.8b //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
+ AND v2.8b, v2.8b , v23.8b //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
SAO_BAND_POS_0:
CMP x5,#0 //case 0
BNE SWITCH_BREAK
- cmhs v9.8b, v29.8b , v1.8b //vcle_u8(band_table.val[0], vdup_n_u8(16))
- AND v1.8b, v1.8b , v9.8b //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
+ cmhs v22.8b, v29.8b , v1.8b //vcle_u8(band_table.val[0], vdup_n_u8(16))
+ AND v1.8b, v1.8b , v22.8b //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
SWITCH_BREAK:
@@ -236,9 +239,11 @@ HEIGHT_LOOP:
ADD x0,x0,#8
BNE SWITCH_BREAK_1
- // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP
+ // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP
ldp x19, x20,[sp], #16
- pop_v_regs
+ ldp d8,d15,[sp],#16 // Loading d15 using { ldr d15,[sp]; add sp,sp,#8 } is giving bus error.
+ // d8 is used as dummy register and loaded along with d15 using ldp. d8 is not used in the function.
+ ldp d13,d14,[sp],#16
ret
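
The comments in this function explain why d15 is paired with the otherwise unused d8: an 8-byte push leaves sp only 8-byte aligned, and AArch64 sp-relative accesses fault when hardware stack-alignment checking is enabled, which is the likely source of the bus error mentioned. A sketch contrasting the two forms; the faulting variant is the one quoted in the comment, not code from the patch:

    // reported to fault: sp drops to 8-byte alignment
    //     sub     sp, sp, #8
    //     str     d15, [sp]

    // workaround used above: keep every adjustment at 16 bytes by pairing
    // d15 with d8, which this function does not otherwise use
    stp     d13, d14, [sp, #-16]!
    stp     d8,  d15, [sp, #-16]!      // d8 is only a dummy partner
    // ... SAO band offset body ...
    ldp     d8,  d15, [sp], #16
    ldp     d13, d14, [sp], #16
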
diff --git a/common/arm64/ihevc_sao_edge_offset_class0.s b/common/arm64/ihevc_sao_edge_offset_class0.s
index f7d6621..91146e8 100644
--- a/common/arm64/ihevc_sao_edge_offset_class0.s
+++ b/common/arm64/ihevc_sao_edge_offset_class0.s
@@ -78,7 +78,7 @@ ihevc_sao_edge_offset_class0_av8:
LDR x10,[sp,#16] //Loads ht
AND x10,x10,0xFFFFFFFF // Since argument is passed as WORD32, Using only lower half of x10
- push_v_regs
+
stp x19, x20,[sp,#-16]!
movi v2.16b, #2 //const_2 = vdupq_n_s8(2)
@@ -93,15 +93,15 @@ ihevc_sao_edge_offset_class0_av8:
ADRP x14, :got:gi1_table_edge_idx //table pointer
LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]
- movi v8.16b, #0xFF //au1_mask = vdupq_n_s8(-1)
+ movi v3.16b, #0xFF //au1_mask = vdupq_n_s8(-1)
STRB w12,[x4] //*pu1_src_top_left = pu1_src_top[wd - 1]
MOV x6,x0 //pu1_src_org
- LD1 {v10.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ LD1 {v5.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
SUB x4,x10,#1 //(ht - 1)
MOV x12,x9 //Move wd to x12 for loop count
- LD1 {v11.8b},[x8] //offset_tbl = vld1_s8(pi1_sao_offset)
+ LD1 {v7.8b},[x8] //offset_tbl = vld1_s8(pi1_sao_offset)
mul x4, x4, x1 //(ht - 1) * src_strd
ADD x4,x4,x0 //pu1_src[(ht - 1) * src_strd]
@@ -123,18 +123,18 @@ WIDTH_LOOP_16:
CMP x8,x9 //if(col == wd)
BNE AU1_MASK_FF //jump to else part
LDRB w12,[x7] //pu1_avail[0]
- mov v8.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+ mov v3.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
B SKIP_AU1_MASK_FF //Skip the else part
AU1_MASK_FF:
MOV x12,#0xFF //move -1 to x12
- mov v8.8b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v3.8b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
SKIP_AU1_MASK_FF:
CMP x8,#16 //If col == 16
BNE SKIP_MASKING_IF_NOT16 //If not skip masking
LDRB w12,[x7,#1] //pu1_avail[1]
- mov v8.b[15], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v3.b[15], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
SKIP_MASKING_IF_NOT16:
MOV x12,x0 //pu1_src_cpy = pu1_src
@@ -142,24 +142,24 @@ SKIP_MASKING_IF_NOT16:
PU1_SRC_LOOP:
LDRB w11,[x2] //load pu1_src_left since ht - row =0 when it comes first pu1_src_left is incremented later
- LD1 {v12.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ LD1 {v17.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy)
SUB x5,x9,x8 //wd - col
SUB x14,x10,x4 //ht - row
- mov v14.8b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
+ mov v21.8b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
mul x14, x14, x1 //(ht - row) * src_strd
LD1 {v26.16b},[x12] //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
- EXT v14.16b, v14.16b , v12.16b,#15 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
+ EXT v21.16b, v21.16b , v17.16b,#15 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
ADD x5,x14,x5 //(ht - row) * src_strd + (wd - col)
LDRB w11,[x2, #1] //II Iteration load pu1_src_left since ht - row + 1 =1
- cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ cmhi v16.16b, v17.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
LDRB w14,[x6,x5] //pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]
SUB x4,x4,#1
mov v28.8b[15], w11 //II Iteration vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
- cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ cmhi v18.16b, v21.16b , v17.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
SUB x12,x12,x1 //Decrement the pu1_src pointer by src_strd
SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
@@ -170,11 +170,11 @@ PU1_SRC_LOOP:
SUB x5,x9,x8 //II wd - col
ADD x12,x12,x1 //Increment the pu1_src pointer by src_strd
- mov v14.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+ mov v21.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
cmhi v30.16b, v26.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
LDRB w11,[x12,#16] //II pu1_src_cpy[16]
- EXT v14.16b, v12.16b , v14.16b,#1 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
+ EXT v21.16b, v17.16b , v21.16b,#1 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
SUB x14,x10,x4 //II ht - row
cmhi v0.16b, v28.16b , v26.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
@@ -182,59 +182,59 @@ PU1_SRC_LOOP:
SUB x12,x12,x1 //Decrement the pu1_src pointer by src_strd
mul x14, x14, x1 //II (ht - row) * src_strd
- cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ cmhi v16.16b, v17.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
ADD x5,x14,x5 //II (ht - row) * src_strd + (wd - col)
- cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ cmhi v18.16b, v21.16b , v17.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
EXT v28.16b, v26.16b , v28.16b,#1 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
LDRB w14,[x6,x5] //II pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]
SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
SUBS x4,x4,#1 //Decrement row by 1
- ADD v14.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left)
+ ADD v21.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left)
STRB w14,[x2],#1 //II pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
- ADD v14.16b, v14.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right)
- Uxtl v18.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ ADD v21.16b, v21.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right)
+ Uxtl v18.8h, v17.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
SUB v20.16b, v0.16b , v30.16b //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- TBL v14.16b, {v10.16b},v14.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ TBL v21.16b, {v5.16b},v21.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
cmhi v30.16b, v26.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
cmhi v0.16b, v28.16b , v26.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
// TBL v15.8b, {v10.16b},v15.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
SUB v22.16b, v0.16b , v30.16b //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- AND v14.16b, v14.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
- TBL v16.16b, {v11.16b},v14.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ AND v21.16b, v21.16b , v3.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ TBL v16.16b, {v7.16b},v21.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
Uxtl v0.8h, v26.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
ADD v28.16b, v2.16b , v20.16b //II edge_idx = vaddq_s8(const_2, sign_left)
ADD v28.16b, v28.16b , v22.16b //II edge_idx = vaddq_s8(edge_idx, sign_right)
SADDW v18.8h, v18.8h , v16.8b
- TBL v28.16b, {v10.16b},v28.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ TBL v28.16b, {v5.16b},v28.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
SMAX v18.8h, v18.8h , v4.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
// TBL v29.8b, {v10.16b},v29.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
UMIN v18.8h, v18.8h , v6.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
- AND v28.16b, v28.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+ AND v28.16b, v28.16b , v3.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
// TBL v17.8b, {v11.16b},v15.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
- Uxtl2 v14.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
- TBL v30.16b, {v11.16b},v28.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
- SADDW2 v14.8h, v14.8h , v16.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ Uxtl2 v21.8h, v17.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ TBL v30.16b, {v7.16b},v28.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ SADDW2 v21.8h, v21.8h , v16.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
- SMAX v14.8h, v14.8h , v4.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ SMAX v21.8h, v21.8h , v4.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
// TBL v31.8b, {v11.16b},v29.8b //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
- UMIN v14.8h, v14.8h , v6.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ UMIN v21.8h, v21.8h , v6.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
xtn v18.8b, v18.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
SADDW v0.8h, v0.8h , v30.8b
- xtn v19.8b, v14.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
+ xtn v19.8b, v21.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
SMAX v0.8h, v0.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
Uxtl2 v28.8h, v26.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
@@ -271,52 +271,52 @@ WIDTH_RESIDUE:
CMP x8,x9 //if(wd_rem == wd)
BNE AU1_MASK_FF_RESIDUE //jump to else part
LDRB w12,[x7] //pu1_avail[0]
- mov v8.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+ mov v3.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
B SKIP_AU1_MASK_FF_RESIDUE //Skip the else part
AU1_MASK_FF_RESIDUE:
MOV x12,#0xFF //move -s to x12
- mov v8.8b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v3.8b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
SKIP_AU1_MASK_FF_RESIDUE:
LDRB w11,[x7,#1] //pu1_avail[1]
SUB x5,x9,#1 //wd - 1
MOV x4,x10 //move ht to x4 for loop count
- mov v8.8b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v3.8b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
MOV x12,x0 //pu1_src_cpy = pu1_src
PU1_SRC_LOOP_RESIDUE:
- LD1 {v12.16b},[x12] //pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ LD1 {v17.16b},[x12] //pu1_cur_row = vld1q_u8(pu1_src_cpy)
LDRB w11,[x2] //load pu1_src_left
- mov v14.8b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
- EXT v14.16b, v14.16b , v12.16b,#15 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
+ mov v21.8b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
+ EXT v21.16b, v21.16b , v17.16b,#15 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
- cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
- cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ cmhi v16.16b, v17.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ cmhi v18.16b, v21.16b , v17.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
LDRB w11,[x12,#16] //pu1_src_cpy[16]
- mov v14.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
- EXT v14.16b, v12.16b , v14.16b,#1 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
+ mov v21.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+ EXT v21.16b, v17.16b , v21.16b,#1 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
- cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
- cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ cmhi v16.16b, v17.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ cmhi v18.16b, v21.16b , v17.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
ADD v24.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left)
ADD v24.16b, v24.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right)
- TBL v24.16b, {v10.16b},v24.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ TBL v24.16b, {v5.16b},v24.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
// TBL v25.8b, {v10.16b},v25.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- AND v24.16b, v24.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ AND v24.16b, v24.16b , v3.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
NEG v20.16b, v22.16b //sign_left = vnegq_s8(sign_right)
EXT v20.16b, v20.16b , v22.16b,#15 //sign_left = vextq_s8(sign_left, sign_left, 15)
- TBL v26.8b, {v11.16b},v24.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
- Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ TBL v26.8b, {v7.16b},v24.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ Uxtl v28.8h, v17.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
SADDW v28.8h, v28.8h , v26.8b
SMAX v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
UMIN v28.8h, v28.8h , v6.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
@@ -337,7 +337,7 @@ PU1_SRC_LOOP_RESIDUE:
END_LOOPS:
// LDMFD sp!,{x4-x12,x15} //Reload the registers from SP
ldp x19, x20,[sp], #16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_sao_edge_offset_class0_chroma.s b/common/arm64/ihevc_sao_edge_offset_class0_chroma.s
index d854c62..c6be41a 100644
--- a/common/arm64/ihevc_sao_edge_offset_class0_chroma.s
+++ b/common/arm64/ihevc_sao_edge_offset_class0_chroma.s
@@ -74,7 +74,7 @@ ihevc_sao_edge_offset_class0_chroma_av8:
ldr w10,[sp,#16]
ldr w11,[sp,#24]
- push_v_regs
+
// STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments
stp x19, x20,[sp,#-16]!
@@ -111,15 +111,15 @@ ihevc_sao_edge_offset_class0_chroma_av8:
ADRP x14, :got:gi1_table_edge_idx //table pointer
LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]
- movi v8.16b, #0xFF //au1_mask = vdupq_n_s8(-1)
+ movi v3.16b, #0xFF //au1_mask = vdupq_n_s8(-1)
mul x4, x4, x1 //(ht - 1) * src_strd
MOV x5, x23 //Loads pi1_sao_offset_v
- LD1 {v11.8b},[x8] //offset_tbl = vld1_s8(pi1_sao_offset_u)
+ LD1 {v7.8b},[x8] //offset_tbl = vld1_s8(pi1_sao_offset_u)
ADD x4,x4,x0 //pu1_src[(ht - 1) * src_strd]
MOV x6,x0 //pu1_src_org
- LD1 {v10.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ LD1 {v5.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
MOV x12,x9 //Move wd to x12 for loop count
SRC_TOP_LOOP: //wd is always multiple of 8
@@ -141,20 +141,20 @@ WIDTH_LOOP_16:
CMP x8,x9 //if(col == wd)
BNE AU1_MASK_FF //jump to else part
LDRB w12,[x7] //pu1_avail[0]
- mov v8.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
- mov v8.8b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 1)
+ mov v3.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+ mov v3.8b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 1)
B SKIP_AU1_MASK_FF //Skip the else part
AU1_MASK_FF:
MOV x12,#-1 //move -1 to x12
- mov v8.4h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v3.4h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
SKIP_AU1_MASK_FF:
CMP x8,#16 //If col == 16
BNE SKIP_MASKING_IF_NOT16 //If not skip masking
LDRB w12,[x7,#1] //pu1_avail[1]
- mov v8.8b[14], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
- mov v8.8b[15], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v3.8b[14], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+ mov v3.8b[15], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
SKIP_MASKING_IF_NOT16:
MOV x12,x0 //pu1_src_cpy = pu1_src
@@ -162,27 +162,27 @@ SKIP_MASKING_IF_NOT16:
PU1_SRC_LOOP:
LDRH w11,[x2] //load pu1_src_left since ht - row =0 when it comes first pu1_src_left is incremented later
- LD1 {v12.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ LD1 {v19.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy)
//LD1 {v13.8b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy)
//SUB x12, x12,#8
SUB x5,x9,x8 //wd - col
SUB x14,x10,x4 //ht - row
- mov v14.4h[7], w11 //vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
+ mov v21.4h[7], w11 //vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
mul x14, x14, x1 //(ht - row) * src_strd
LD1 {v30.16b},[x12] //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
//LD1 {v31.8b},[x12] //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
//SUB x12, x12,#8
- EXT v14.16b, v14.16b , v12.16b,#14 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
+ EXT v21.16b, v21.16b , v19.16b,#14 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
SUB x12,x12,x1
LDRH w11,[x2,#2] //II load pu1_src_left since ht - row =0
- cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
ADD x5,x14,x5 //(ht - row) * src_strd + (wd - col)
mov v28.4h[7], w11 //II vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
- cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
LDRH w14,[x6,x5] //pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)]
SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
@@ -191,7 +191,7 @@ PU1_SRC_LOOP:
LDRB w11,[x12,#16] //pu1_src_cpy[16]
EXT v28.16b, v28.16b , v30.16b,#14 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
- mov v14.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+ mov v21.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
LDRB w11,[x12,#17] //pu1_src_cpy[17]
@@ -199,62 +199,62 @@ PU1_SRC_LOOP:
STRH w14,[x2],#2 //pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
ADD x12,x12,x1
- mov v14.8b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+ mov v21.8b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
LDRB w11,[x12,#16] //II pu1_src_cpy[16]
- EXT v14.16b, v12.16b , v14.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
+ EXT v21.16b, v19.16b , v21.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
mov v28.8b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
LDRB w11,[x12,#17] //II pu1_src_cpy[17]
- cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
SUB x12,x12,x1
- cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
mov v28.8b[1], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
EXT v28.16b, v30.16b , v28.16b,#2 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
- ADD v14.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left)
+ ADD v21.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left)
- mov v10.d[1],v10.d[0]
- ADD v14.16b, v14.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right)
- TBL v14.16b, {v10.16b},v14.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ mov v5.d[1],v5.d[0]
+ ADD v21.16b, v21.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right)
+ TBL v21.16b, {v5.16b},v21.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
SUB v20.16b, v24.16b , v26.16b //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
// TBL v15.8b, {v10.16b},v15.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
cmhi v24.16b, v28.16b , v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
- AND v14.16b, v14.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
- mov v15.d[0],v14.d[1]
- UZP1 v1.8b, v14.8b, v15.8b
- UZP2 v15.8b, v14.8b, v15.8b
- mov v14.8b, v1.8b
+ AND v21.16b, v21.16b , v3.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ mov v23.d[0],v21.d[1]
+ UZP1 v1.8b, v21.8b, v23.8b
+ UZP2 v23.8b, v21.8b, v23.8b
+ mov v21.8b, v1.8b
//mov v11.d[1],v0.d[0]
//mov v14.d[1],v15.d[0]
SUB v22.16b, v24.16b , v26.16b //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- TBL v16.8b, {v11.16b},v14.8b //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
+ TBL v16.8b, {v7.16b},v21.8b //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
ADD v24.16b, v2.16b , v20.16b //II edge_idx = vaddq_s8(const_2, sign_left)
- Uxtl v18.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
- TBL v17.8b, {v0.16b},v15.8b
+ Uxtl v18.8h, v19.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ TBL v17.8b, {v0.16b},v23.8b
ADD v24.16b, v24.16b , v22.16b //II edge_idx = vaddq_s8(edge_idx, sign_right)
//mov v17.d[0],v16.d[1]
ZIP1 v1.8b, v16.8b, v17.8b
ZIP2 v17.8b, v16.8b, v17.8b
mov v16.8b, v1.8b
- TBL v24.16b, {v10.16b},v24.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
- Uxtl2 v12.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ TBL v24.16b, {v5.16b},v24.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ Uxtl2 v19.8h, v19.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
//mov v16.d[1],v17.d[0]
SADDW v18.8h, v18.8h , v16.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
//TBL v25.8b, {v10.16b},v25.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
SMAX v18.8h, v18.8h , v4.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
- AND v24.16b, v24.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+ AND v24.16b, v24.16b , v3.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
mov v25.d[0],v24.d[1]
UMIN v18.8h, v18.8h , v6.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
UZP1 v1.8b, v24.8b, v25.8b
@@ -262,16 +262,16 @@ PU1_SRC_LOOP:
mov v24.8b, v1.8b
//mov v24.d[1],v25.d[0]
- SADDW v12.8h, v12.8h , v17.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
- TBL v26.8b, {v11.16b},v24.8b //II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
- SMAX v12.8h, v12.8h , v4.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ SADDW v19.8h, v19.8h , v17.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ TBL v26.8b, {v7.16b},v24.8b //II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
+ SMAX v19.8h, v19.8h , v4.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
- UMIN v12.8h, v12.8h , v6.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ UMIN v19.8h, v19.8h , v6.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
TBL v27.8b, {v0.16b},v25.8b //II
- xtn v14.8b, v18.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+ xtn v21.8b, v18.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
//mov v27.d[0],v26.d[1]
- xtn v15.8b, v12.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
+ xtn v23.8b, v19.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
ZIP1 v1.8b, v26.8b, v27.8b
ZIP2 v27.8b, v26.8b, v27.8b //II
mov v26.8b, v1.8b
@@ -295,7 +295,9 @@ PU1_SRC_LOOP:
Uxtl2 v30.8h, v30.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
SADDW v30.8h, v30.8h , v27.8b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
- ST1 {v14.8b, v15.8b},[x12],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ ST1 {v21.8b},[x12],#8 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ ST1 {v23.8b},[x12],x1
+ SUB x12,x12,#8
SMAX v30.8h, v30.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
SUBS x4,x4,#1 //Decrement row by 1
@@ -326,107 +328,107 @@ WIDTH_RESIDUE:
CMP x8,x9 //if(wd_rem == wd)
BNE AU1_MASK_FF_RESIDUE //jump to else part
LDRB w12,[x7] //pu1_avail[0]
- mov v8.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
- mov v8.8b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+ mov v3.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+ mov v3.8b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
B SKIP_AU1_MASK_FF_RESIDUE //Skip the else part
AU1_MASK_FF_RESIDUE:
MOV x12,#-1 //move -1 to x12
- mov v8.4h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v3.4h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
SKIP_AU1_MASK_FF_RESIDUE:
LDRB w12,[x7,#1] //pu1_avail[1]
- mov v8.8b[6], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
- mov v8.8b[7], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v3.8b[6], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v3.8b[7], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
MOV x12,x0 //pu1_src_cpy = pu1_src
MOV x4,x10 //move ht to x4 for loop count
PU1_SRC_LOOP_RESIDUE:
LDRH w11,[x2] //load pu1_src_left
- LD1 {v12.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ LD1 {v19.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy)
//LD1 {v13.8b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy)
//SUB x12, x12,#8
SUB x5,x9,#2 //wd - 2
SUB x14,x10,x4 //(ht - row)
- mov v14.4h[7], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
+ mov v21.4h[7], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
LSL x14,x14,#1 //(ht - row) * 2
LD1 {v30.16b},[x12] //II pu1_cur_row = vld1q_u8(pu1_src_cpy)
//LD1 {v31.8b},[x12] //II pu1_cur_row = vld1q_u8(pu1_src_cpy)
//SUB x12, x12,#8
- EXT v14.16b, v14.16b , v12.16b,#14 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
+ EXT v21.16b, v21.16b , v19.16b,#14 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
SUB x12,x12,x1
LDRH w11,[x2,#2] //II load pu1_src_left
- cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
mul x14, x14, x1 //(ht - row) * 2 * src_strd
- cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
mov v28.4h[7], w11 //II vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
LDRB w11,[x12,#16] //pu1_src_cpy[16]
SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
ADD x5,x14,x5 //(ht - row) * 2 * src_strd + (wd - 2)
- mov v14.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+ mov v21.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
EXT v28.16b, v28.16b , v30.16b,#14 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
LDRB w11,[x12,#17] //pu1_src_cpy[17]
cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
LDRH w14,[x6, x5] //pu1_src_org[(ht - row) * 2* src_strd + (wd - 2)]
- mov v14.8b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+ mov v21.8b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
cmhi v24.16b, v28.16b , v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
ADD x12,x12,x1
STRH w14,[x2],#2 //pu1_src_left[(ht - row) * 2] = au1_src_left_tmp[(ht - row) * 2]
- EXT v14.16b, v12.16b , v14.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
+ EXT v21.16b, v19.16b , v21.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
LDRB w11,[x12,#16] //II pu1_src_cpy[16]
- cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
mov v28.8b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
LDRB w11,[x12,#17] //II pu1_src_cpy[17]
- cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
SUB x4,x4,#1 //II Decrement row by 1
SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
mov v28.8b[1], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
SUB x12,x12,x1
- ADD v14.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left)
+ ADD v21.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left)
EXT v28.16b, v30.16b , v28.16b,#2 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
- ADD v14.16b, v14.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right)
+ ADD v21.16b, v21.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right)
SUB v20.16b, v24.16b , v26.16b //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- TBL v14.16b, {v10.16b},v14.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ TBL v21.16b, {v5.16b},v21.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
cmhi v24.16b, v28.16b , v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
//TBL v15.8b, {v10.16b},v15.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
SUB v22.16b, v24.16b , v26.16b //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- AND v14.16b, v14.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
- mov v15.d[0],v14.d[1]
- UZP1 v1.8b, v14.8b, v15.8b
- UZP2 v15.8b, v14.8b, v15.8b
- mov v14.8b, v1.8b
+ AND v21.16b, v21.16b , v3.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ mov v23.d[0],v21.d[1]
+ UZP1 v1.8b, v21.8b, v23.8b
+ UZP2 v23.8b, v21.8b, v23.8b
+ mov v21.8b, v1.8b
ADD v28.16b, v2.16b , v20.16b //II edge_idx = vaddq_s8(const_2, sign_left)
- TBL v16.8b, {v11.16b},v14.8b //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
+ TBL v16.8b, {v7.16b},v21.8b //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
ADD v28.16b, v28.16b , v22.16b //II edge_idx = vaddq_s8(edge_idx, sign_right)
- Uxtl v18.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
- TBL v17.8b, {v0.16b},v15.8b
+ Uxtl v18.8h, v19.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ TBL v17.8b, {v0.16b},v23.8b
Uxtl v24.8h, v30.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
ZIP1 v1.8b, v16.8b, v17.8b
ZIP2 v17.8b, v16.8b, v17.8b
mov v16.8b, v1.8b
- TBL v28.16b, {v10.16b},v28.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ TBL v28.16b, {v5.16b},v28.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
SADDW v18.8h, v18.8h , v16.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v18.8h, v18.8h , v4.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
@@ -434,7 +436,7 @@ PU1_SRC_LOOP_RESIDUE:
UMIN v18.8h, v18.8h , v6.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
xtn v18.8b, v18.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
- AND v28.16b, v28.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+ AND v28.16b, v28.16b , v3.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
mov v29.d[0],v28.d[1]
SUB x5,x9,#2 //II wd - 2
UZP1 v1.8b, v28.8b, v29.8b
@@ -443,7 +445,7 @@ PU1_SRC_LOOP_RESIDUE:
SUB x14,x10,x4 //II (ht - row)
LSL x14,x14,#1 //II (ht - row) * 2
- TBL v26.8b, {v11.16b},v28.8b //II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
+ TBL v26.8b, {v7.16b},v28.8b //II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
mul x14, x14, x1 //II (ht - row) * 2 * src_strd
ADD x5,x14,x5 //II (ht - row) * 2 * src_strd + (wd - 2)
@@ -474,7 +476,7 @@ END_LOOPS:
ldp x23, x24,[sp],#16
ldp x21, x22,[sp],#16
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
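
The hunks in this and the following files apply one pattern: NEON temporaries formerly kept in v8-v15, whose low 64 bits (d8-d15) are callee-saved under the AArch64 procedure call standard, are remapped onto caller-saved registers (v1, v3, v5, v17, v19, v21, v23), so the push_v_regs/pop_v_regs save/restore around each function can be dropped. Where a renamed pair is no longer register-consecutive (v21/v23 replacing v14/v15), the combined ST1 store is split into two single-register stores with the pointer rewound by 8. As an illustration only (the macro bodies below are an assumption, not quoted from this patch), the stack traffic being removed is of this form:

    // assumed sketch of the v-register save/restore that the remapping avoids;
    // only d8-d15 carry a callee-save obligation, so code confined to
    // v0-v7 and v16-v31 needs neither macro
    .macro push_v_regs
        stp     d8, d9,   [sp, #-16]!
        stp     d10, d11, [sp, #-16]!
        stp     d12, d13, [sp, #-16]!
        stp     d14, d15, [sp, #-16]!
    .endm
    .macro pop_v_regs
        ldp     d14, d15, [sp], #16
        ldp     d12, d13, [sp], #16
        ldp     d10, d11, [sp], #16
        ldp     d8, d9,   [sp], #16
    .endm
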
diff --git a/common/arm64/ihevc_sao_edge_offset_class1.s b/common/arm64/ihevc_sao_edge_offset_class1.s
index 8ed6169..515b349 100644
--- a/common/arm64/ihevc_sao_edge_offset_class1.s
+++ b/common/arm64/ihevc_sao_edge_offset_class1.s
@@ -76,7 +76,7 @@ ihevc_sao_edge_offset_class1_av8:
LDR w7,[sp,#8] //Loads wd
LDR w8,[sp,#16] //Loads ht
- push_v_regs
+
stp x19, x20,[sp,#-16]!
SUB x9,x7,#1 //wd - 1
@@ -128,16 +128,16 @@ WIDTH_LOOP_16:
MOV x10,x0 //*pu1_src
- LD1 {v8.16b},[x9],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
- LD1 {v10.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src)
+ LD1 {v1.16b},[x9],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+ LD1 {v3.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src)
LD1 {v30.16b},[x12],#16 //vld1q_u8(pu1_src[(ht - 1) * src_strd])
- cmhi v12.16b, v10.16b , v8.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v5.16b, v3.16b , v1.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
ST1 { v30.16b},[x3],#16 //vst1q_u8(pu1_src_top[col])
- cmhi v14.16b, v8.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v17.16b, v1.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
- SUB v16.16b, v14.16b , v12.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB v16.16b, v17.16b , v5.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
MOV x11,x8 //move ht to x11 for loop count
PU1_SRC_LOOP:
@@ -145,59 +145,59 @@ PU1_SRC_LOOP:
LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
ADD x6,x10,x1 //II Iteration *pu1_src + src_strd
- cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
LD1 {v30.16b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
- cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v17.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
SUB x10,x10,x1
- SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB v20.16b, v17.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
Uxtl v26.8h, v18.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
- ADD v12.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v5.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
Uxtl2 v28.8h, v18.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
- ADD v12.16b, v12.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+ ADD v5.16b, v5.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
cmhi v22.16b, v18.16b , v30.16b //II vcgtq_u8(pu1_cur_row, pu1_top_row)
NEG v16.16b, v20.16b //sign_up = vnegq_s8(sign_down)
- TBL v12.16b, {v6.16b},v12.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ TBL v5.16b, {v6.16b},v5.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
cmhi v24.16b, v30.16b , v18.16b //II vcltq_u8(pu1_cur_row, pu1_top_row)
- SUB v8.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB v1.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
// TBL v13.8b, {v6.16b},v13.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
ADD v22.16b, v0.16b , v16.16b //II edge_idx = vaddq_s8(const_2, sign_up)
- NEG v16.16b, v8.16b //II sign_up = vnegq_s8(sign_down)
- TBL v12.16b, {v7.16b},v12.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
- ADD v22.16b, v22.16b , v8.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
+ NEG v16.16b, v1.16b //II sign_up = vnegq_s8(sign_down)
+ TBL v5.16b, {v7.16b},v5.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ ADD v22.16b, v22.16b , v1.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
- Uxtl v20.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ Uxtl v20.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
TBL v22.16b, {v6.16b},v22.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
- SADDW v20.8h, v20.8h , v12.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SADDW v20.8h, v20.8h , v5.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
// TBL v23.8b, {v6.16b},v23.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
UMIN v20.8h, v20.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
- Uxtl2 v8.8h, v10.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ Uxtl2 v1.8h, v3.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
// TBL v13.8b, {v7.16b},v13.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
- mov v10.16b, v30.16b //II pu1_cur_row = pu1_next_row
+ mov v3.16b, v30.16b //II pu1_cur_row = pu1_next_row
- SADDW2 v8.8h, v8.8h , v12.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ SADDW2 v1.8h, v1.8h , v5.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
TBL v24.16b, {v7.16b},v22.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
- SMAX v8.8h, v8.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ SMAX v1.8h, v1.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
- UMIN v8.8h, v8.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ UMIN v1.8h, v1.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
// TBL v25.8b, {v7.16b},v23.8b //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
xtn v20.8b, v20.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
SADDW v26.8h, v26.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
- xtn2 v20.16b, v8.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
+ xtn2 v20.16b, v1.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
SADDW2 v28.8h, v28.8h , v24.16b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
@@ -220,9 +220,9 @@ PU1_SRC_LOOP:
ADD x10,x10,x1 //*pu1_src + src_strd
LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
- cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
- cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
- SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v17.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ SUB v20.16b, v17.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
SUB x10,x10,x1
ADD v22.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
@@ -231,13 +231,13 @@ PU1_SRC_LOOP:
// TBL v23.8b, {v6.16b},v23.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
TBL v24.16b, {v7.16b},v22.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
- Uxtl v26.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ Uxtl v26.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
SADDW v26.8h, v26.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
// TBL v25.8b, {v7.16b},v23.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
- Uxtl2 v28.8h, v10.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ Uxtl2 v28.8h, v3.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
SADDW2 v28.8h, v28.8h , v24.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
@@ -248,7 +248,7 @@ PU1_SRC_LOOP:
ST1 { v30.16b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
PU1_SRC_LOOP_END:
- mov v10.16b, v18.16b //pu1_cur_row = pu1_next_row
+ mov v3.16b, v18.16b //pu1_cur_row = pu1_next_row
SUBS x7,x7,#16 //Decrement the wd loop count by 16
CMP x7,#8 //Check whether residue remains
BEQ WIDTH_RESIDUE //If residue remains jump to residue loop
@@ -264,15 +264,15 @@ WIDTH_RESIDUE:
csel x9, x3, x9,NE //*pu1_src_top
MOV x10,x0
- LD1 {v8.16b},[x9],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
- LD1 {v10.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src)
+ LD1 {v1.16b},[x9],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+ LD1 {v3.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src)
LD1 {v30.8b},[x12] //vld1_u8(pu1_src[(ht - 1) * src_strd])
ST1 {v30.8b},[x3] //vst1_u8(pu1_src_top[col])
- cmhi v12.16b, v10.16b , v8.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
- cmhi v14.16b, v8.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
- SUB v16.16b, v14.16b , v12.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ cmhi v5.16b, v3.16b , v1.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v17.16b, v1.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ SUB v16.16b, v17.16b , v5.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
MOV x11,x8 //move ht to x11 for loop count
PU1_SRC_LOOP_RESIDUE:
@@ -280,33 +280,33 @@ PU1_SRC_LOOP_RESIDUE:
LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
ADD x6,x10,x1 //II Iteration *pu1_src + src_strd
- cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
+ cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
LD1 {v30.16b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
- cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
+ cmhi v17.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
SUB x10,x10,x1
- SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB v20.16b, v17.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
Uxtl v26.8h, v18.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
- ADD v12.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v5.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
cmhi v22.16b, v18.16b , v30.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row)
- ADD v12.16b, v12.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+ ADD v5.16b, v5.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
cmhi v24.16b, v30.16b , v18.16b //II vcltq_u8(pu1_cur_row, pu1_next_row)
NEG v16.16b, v20.16b //sign_up = vnegq_s8(sign_down)
- TBL v12.8b, {v6.16b},v12.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ TBL v5.8b, {v6.16b},v5.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
SUB v20.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
ADD v22.16b, v0.16b , v16.16b //II edge_idx = vaddq_s8(const_2, sign_up)
- TBL v12.8b, {v7.16b},v12.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ TBL v5.8b, {v7.16b},v5.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
NEG v16.16b, v20.16b //II sign_up = vnegq_s8(sign_down)
ADD v22.16b, v22.16b , v20.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
- Uxtl v20.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ Uxtl v20.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
- SADDW v20.8h, v20.8h , v12.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SADDW v20.8h, v20.8h , v5.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
TBL v22.8b, {v6.16b},v22.8b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
@@ -318,7 +318,7 @@ PU1_SRC_LOOP_RESIDUE:
SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
- mov v10.16b, v30.16b //II pu1_cur_row = pu1_next_row
+ mov v3.16b, v30.16b //II pu1_cur_row = pu1_next_row
ST1 {v20.8b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
xtn v30.8b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[0])
@@ -332,9 +332,9 @@ PU1_SRC_LOOP_RESIDUE:
ADD x10,x10,x1 //*pu1_src + src_strd
LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
- cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
- cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
- SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
+ cmhi v17.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
+ SUB v20.16b, v17.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
SUB x10,x10,x1
ADD v22.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
@@ -342,7 +342,7 @@ PU1_SRC_LOOP_RESIDUE:
TBL v22.8b, {v6.16b},v22.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
TBL v24.8b, {v7.16b},v22.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
- Uxtl v26.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ Uxtl v26.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
SADDW v26.8h, v26.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
@@ -354,7 +354,7 @@ PU1_SRC_LOOP_RESIDUE:
END_LOOPS:
// LDMFD sp!,{x4-x12,x15} //Reload the registers from SP
ldp x19, x20,[sp], #16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_sao_edge_offset_class1_chroma.s b/common/arm64/ihevc_sao_edge_offset_class1_chroma.s
index 4baa5bf..894e702 100644
--- a/common/arm64/ihevc_sao_edge_offset_class1_chroma.s
+++ b/common/arm64/ihevc_sao_edge_offset_class1_chroma.s
@@ -76,7 +76,7 @@ ihevc_sao_edge_offset_class1_chroma_av8:
ldr w11,[sp,#24]
- push_v_regs
+
// STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments
stp x19, x20,[sp,#-16]!
stp x21, x22,[sp,#-16]!
@@ -135,7 +135,7 @@ SRC_LEFT_LOOP:
LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]
LD1 {v6.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
LD1 {v7.8b},[x6] //offset_tbl_u = vld1_s8(pi1_sao_offset_u)
- LD1 {v8.8b},[x7] //offset_tbl_v = vld1_s8(pi1_sao_offset_v)
+ LD1 {v1.8b},[x7] //offset_tbl_v = vld1_s8(pi1_sao_offset_v)
CMP x8,#16 //Compare wd with 16
BLT WIDTH_RESIDUE //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
@@ -151,17 +151,17 @@ WIDTH_LOOP_16:
LD1 {v28.16b},[x11],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
//LD1 {v29.8b},[x11],#8 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
- LD1 {v10.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src)
+ LD1 {v3.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src)
//LD1 {v11.8b},[x0],#8 //pu1_cur_row = vld1q_u8(pu1_src)
LD1 {v30.16b},[x12],#16 //vld1q_u8(pu1_src[(ht - 1) * src_strd])
//LD1 {v31.8b},[x12],#8 //vld1q_u8(pu1_src[(ht - 1) * src_strd])
- cmhi v12.16b, v10.16b , v28.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v5.16b, v3.16b , v28.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
ST1 { v30.16b},[x3],#16 //vst1q_u8(pu1_src_top[col])
- cmhi v14.16b, v28.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v19.16b, v28.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
- SUB v16.16b, v14.16b , v12.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB v16.16b, v19.16b , v5.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
MOV x11,x9 //move ht to x11 for loop count
PU1_SRC_LOOP:
@@ -172,47 +172,47 @@ PU1_SRC_LOOP:
ADD x6,x10,x1 //II Iteration *pu1_src + src_strd
//mov v19.d[0],v18.d[1]
- cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
LD1 {v30.16b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
//LD1 {v31.8b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
//SUB x6, x6,#8
- cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v19.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
SUB x10,x10,x1
- SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB v20.16b, v19.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
Uxtl v26.8h, v18.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
- ADD v12.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v5.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
Uxtl2 v28.8h, v18.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
- ADD v12.16b, v12.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+ ADD v5.16b, v5.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
cmhi v22.16b, v18.16b , v30.16b //II vcgtq_u8(pu1_cur_row, pu1_top_row)
mov v16.d[1],v16.d[0]
NEG v16.16b, v20.16b //sign_up = vnegq_s8(sign_down)
- TBL v12.16b, {v6.16b},v12.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ TBL v5.16b, {v6.16b},v5.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
cmhi v24.16b, v30.16b , v18.16b //II vcltq_u8(pu1_cur_row, pu1_top_row)
SUB v28.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
//TBL v13.8b, {v6.16b},v13.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
ADD v22.16b, v0.16b , v16.16b //II edge_idx = vaddq_s8(const_2, sign_up)
- mov v13.d[0], v12.d[1]
- UZP1 v27.8b, v12.8b, v13.8b
- UZP2 v13.8b, v12.8b, v13.8b
- mov v12.8b,v27.8b
+ mov v17.d[0], v5.d[1]
+ UZP1 v27.8b, v5.8b, v17.8b
+ UZP2 v17.8b, v5.8b, v17.8b
+ mov v5.8b,v27.8b
NEG v16.16b, v28.16b //II sign_up = vnegq_s8(sign_down)
- TBL v12.8b, {v7.16b},v12.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ TBL v5.8b, {v7.16b},v5.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
ADD v22.16b, v22.16b , v28.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
- Uxtl v20.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
- TBL v13.8b, {v8.16b},v13.8b
- ZIP1 v27.8b, v12.8b, v13.8b
- ZIP2 v13.8b, v12.8b, v13.8b
- mov v12.8b,v27.8b
+ Uxtl v20.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ TBL v17.8b, {v1.16b},v17.8b
+ ZIP1 v27.8b, v5.8b, v17.8b
+ ZIP2 v17.8b, v5.8b, v17.8b
+ mov v5.8b,v27.8b
- SADDW v20.8h, v20.8h , v12.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SADDW v20.8h, v20.8h , v5.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
TBL v22.16b, {v6.16b},v22.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
@@ -224,15 +224,15 @@ PU1_SRC_LOOP:
UZP2 v23.8b, v22.8b, v23.8b
mov v22.8b,v27.8b
- Uxtl2 v28.8h, v10.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ Uxtl2 v28.8h, v3.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
//VTBL.8 D13,D7,D13 @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
- mov v10.16b, v30.16b //II pu1_cur_row = pu1_next_row
+ mov v3.16b, v30.16b //II pu1_cur_row = pu1_next_row
- SADDW v28.8h, v28.8h , v13.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ SADDW v28.8h, v28.8h , v17.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
TBL v24.8b, {v7.16b},v22.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
- TBL v25.8b, {v8.16b},v23.8b
+ TBL v25.8b, {v1.16b},v23.8b
ZIP1 v27.8b, v24.8b, v25.8b
ZIP2 v25.8b, v24.8b, v25.8b
mov v24.8b,v27.8b
@@ -270,9 +270,9 @@ PU1_SRC_LOOP:
LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
//LD1 {v19.8b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
//SUB x10, x10,#8
- cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
- cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
- SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v19.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ SUB v20.16b, v19.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
SUB x10,x10,x1
ADD v22.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
@@ -285,19 +285,19 @@ PU1_SRC_LOOP:
UZP2 v23.8b, v22.8b, v23.8b
mov v22.8b,v27.8b
TBL v24.8b, {v7.16b},v22.8b
- TBL v25.8b, {v8.16b},v23.8b
+ TBL v25.8b, {v1.16b},v23.8b
ZIP1 v27.8b, v24.8b, v25.8b
ZIP2 v25.8b, v24.8b, v25.8b
mov v24.8b,v27.8b
//VTBL.8 D24,D7,D22 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
- Uxtl v26.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ Uxtl v26.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
SADDW v26.8h, v26.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
//VTBL.8 D25,D7,D23 @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
- Uxtl2 v28.8h, v10.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ Uxtl2 v28.8h, v3.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
SADDW v28.8h, v28.8h , v25.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
@@ -308,7 +308,7 @@ PU1_SRC_LOOP:
ST1 { v30.16b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
PU1_SRC_LOOP_END:
- mov v10.16b, v18.16b //pu1_cur_row = pu1_next_row
+ mov v3.16b, v18.16b //pu1_cur_row = pu1_next_row
SUBS x8,x8,#16 //Decrement the wd loop count by 16
CMP x8,#8 //Check whether residue remains
BEQ WIDTH_RESIDUE //If residue remains jump to residue loop
@@ -326,15 +326,15 @@ WIDTH_RESIDUE:
LD1 {v28.16b},[x11] //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
//LD1 {v29.8b},[x11],#8 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
- LD1 {v10.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ LD1 {v3.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
//LD1 {v11.8b},[x0],#8 //pu1_cur_row = vld1q_u8(pu1_src)
LD1 {v30.8b},[x12] //vld1_u8(pu1_src[(ht - 1) * src_strd])
ST1 {v30.8b},[x3] //vst1_u8(pu1_src_top[col])
- cmhi v12.16b, v10.16b , v28.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
- cmhi v14.16b, v28.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
- SUB v16.16b, v14.16b , v12.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ cmhi v5.16b, v3.16b , v28.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v19.16b, v28.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ SUB v16.16b, v19.16b , v5.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
MOV x11,x9 //move ht to x11 for loop count
PU1_SRC_LOOP_RESIDUE:
@@ -344,46 +344,46 @@ PU1_SRC_LOOP_RESIDUE:
//SUB x10, x10,#8
ADD x6,x10,x1 //II Iteration *pu1_src + src_strd
- cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
+ cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
LD1 {v30.16b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
//LD1 {v31.8b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
//SUB x6, x6,#8
- cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
+ cmhi v19.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
SUB x10,x10,x1
- SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB v20.16b, v19.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
Uxtl v26.8h, v18.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
- ADD v12.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v5.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
cmhi v22.16b, v18.16b , v30.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row)
- ADD v12.16b, v12.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+ ADD v5.16b, v5.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
cmhi v24.16b, v30.16b , v18.16b //II vcltq_u8(pu1_cur_row, pu1_next_row)
NEG v16.16b, v20.16b //sign_up = vnegq_s8(sign_down)
- TBL v12.8b, {v6.16b},v12.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ TBL v5.8b, {v6.16b},v5.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
SUB v20.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- UZP1 v27.8b, v12.8b, v13.8b
- UZP2 v13.8b, v12.8b, v13.8b
- mov v12.8b,v27.8b
+ UZP1 v27.8b, v5.8b, v17.8b
+ UZP2 v17.8b, v5.8b, v17.8b
+ mov v5.8b,v27.8b
ADD v22.16b, v0.16b , v16.16b //II edge_idx = vaddq_s8(const_2, sign_up)
- TBL v12.8b, {v7.16b},v12.8b
+ TBL v5.8b, {v7.16b},v5.8b
NEG v16.16b, v20.16b //II sign_up = vnegq_s8(sign_down)
- TBL v13.8b, {v8.16b},v13.8b
- ZIP1 v27.8b, v12.8b, v13.8b
- ZIP2 v13.8b, v12.8b, v13.8b
- mov v12.8b,v27.8b
+ TBL v17.8b, {v1.16b},v17.8b
+ ZIP1 v27.8b, v5.8b, v17.8b
+ ZIP2 v17.8b, v5.8b, v17.8b
+ mov v5.8b,v27.8b
//VTBL.8 D12,D7,D12 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
ADD v22.16b, v22.16b , v20.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
- Uxtl v20.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ Uxtl v20.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
- SADDW v20.8h, v20.8h , v12.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SADDW v20.8h, v20.8h , v5.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
TBL v22.8b, {v6.16b},v22.8b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
@@ -395,7 +395,7 @@ PU1_SRC_LOOP_RESIDUE:
TBL v24.8b, {v7.16b},v22.8b
xtn v20.8b, v20.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
- TBL v25.8b, {v8.16b},v23.8b
+ TBL v25.8b, {v1.16b},v23.8b
ZIP1 v27.8b, v24.8b, v25.8b
ZIP2 v25.8b, v24.8b, v25.8b
mov v24.8b,v27.8b
@@ -405,7 +405,7 @@ PU1_SRC_LOOP_RESIDUE:
SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
- mov v10.16b, v30.16b //II pu1_cur_row = pu1_next_row
+ mov v3.16b, v30.16b //II pu1_cur_row = pu1_next_row
ST1 {v20.8b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
xtn v30.8b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[0])
@@ -421,9 +421,9 @@ PU1_SRC_LOOP_RESIDUE:
LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
//LD1 {v19.8b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
//SUB x10, x10,#8
- cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
- cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
- SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
+ cmhi v19.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
+ SUB v20.16b, v19.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
SUB x10,x10,x1
ADD v22.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
@@ -435,13 +435,13 @@ PU1_SRC_LOOP_RESIDUE:
mov v22.8b,v27.8b
TBL v24.8b, {v7.16b},v22.8b
- TBL v25.8b, {v8.16b},v23.8b
+ TBL v25.8b, {v1.16b},v23.8b
ZIP1 v27.8b, v24.8b, v25.8b
ZIP2 v25.8b, v24.8b, v25.8b
mov v24.8b,v27.8b
//VTBL.8 D24,D7,D22 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
- Uxtl v26.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ Uxtl v26.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
SADDW v26.8h, v26.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
@@ -456,7 +456,7 @@ END_LOOPS:
ldp x23, x24,[sp],#16
ldp x21, x22,[sp],#16
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_sao_edge_offset_class2.s b/common/arm64/ihevc_sao_edge_offset_class2.s
index 3350e5c..31852f3 100644
--- a/common/arm64/ihevc_sao_edge_offset_class2.s
+++ b/common/arm64/ihevc_sao_edge_offset_class2.s
@@ -79,7 +79,7 @@ ihevc_sao_edge_offset_class2_av8:
MOV x16,x7 // wd
MOV x17,x8 // ht
- push_v_regs
+
stp x19, x20,[sp,#-16]!
stp x21, x22,[sp,#-16]!
stp x23, x24,[sp,#-16]!
@@ -218,7 +218,7 @@ PU1_AVAIL:
csel x12, x20, x12,EQ
MOV x6,x7 //move wd to x6 loop_count
- movi v8.16b, #0xFF //au1_mask = vdupq_n_s8(-1)
+ movi v1.16b, #0xFF //au1_mask = vdupq_n_s8(-1)
ADD x20,x14,#1 //pu1_src_left_cpy += 1
csel x14, x20, x14,EQ
@@ -239,11 +239,11 @@ WIDTH_LOOP_16:
MOV x20,#-1
csel x8, x20, x8,NE //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
- mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
+ mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
CMP x6,#16 //if(col == 16)
BNE SKIP_AU1_MASK_VAL
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
SKIP_AU1_MASK_VAL:
LDRB w11,[x5,#2] //pu1_avail[2]
@@ -255,23 +255,23 @@ SKIP_AU1_MASK_VAL:
SUB x8,x8,#1 //pu1_src_top_cpy - 1 || pu1_src - src_strd - 1
MOV x7,x16 //Loads wd
- LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
+ LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
ADD x3,x3,#16
ADD x5,sp,#0x42 //*au1_src_left_tmp
- LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
MOV x4,x17 //Loads ht
SUB x7,x7,x6 //(wd - col)
- cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
MOV x8,x19 //Loads *pu1_src
ADD x7,x7,#15 //15 + (wd - col)
- cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)]
SUB x5,x5,#1
- SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
AU1_SRC_LEFT_LOOP:
LDRB w8,[x7] //load the value and increment by src_strd
@@ -307,36 +307,36 @@ SIGN_UP_CHANGE:
csel x4, x20, x4,LT //I
MOV x20,#1
csel x4, x20, x4,GT //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
- mov v14.8b[0], w4 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+ mov v17.8b[0], w4 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
SIGN_UP_CHANGE_DONE:
- cmhi v10.16b, v12.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
- ADD v24.16b, v0.16b , v14.16b //I edge_idx = vaddq_s8(const_2, sign_up)
+ cmhi v3.16b, v5.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ ADD v24.16b, v0.16b , v17.16b //I edge_idx = vaddq_s8(const_2, sign_up)
- cmhi v18.16b, v18.16b , v12.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
- SUB v10.16b, v18.16b , v10.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ cmhi v18.16b, v18.16b , v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ SUB v3.16b, v18.16b , v3.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- ADD v24.16b, v24.16b , v10.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
+ ADD v24.16b, v24.16b , v3.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
TBL v18.16b, {v6.16b},v24.16b //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
// TBL v19.8b, {v6.16b},v25.8b //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- AND v18.16b, v18.16b , v8.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
+ AND v18.16b, v18.16b , v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
- NEG v14.16b, v10.16b //I sign_up = vnegq_s8(sign_down)
- TBL v10.16b, {v7.16b},v18.16b //I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
- EXT v14.16b, v14.16b , v14.16b,#15 //I sign_up = vextq_s8(sign_up, sign_up, 15)
+ NEG v17.16b, v3.16b //I sign_up = vnegq_s8(sign_down)
+ TBL v3.16b, {v7.16b},v18.16b //I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ EXT v17.16b, v17.16b , v17.16b,#15 //I sign_up = vextq_s8(sign_up, sign_up, 15)
- Uxtl v20.8h, v12.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ Uxtl v20.8h, v5.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
// TBL v11.8b, {v7.16b},v19.8b //I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
- SADDW v20.8h, v20.8h , v10.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SADDW v20.8h, v20.8h , v3.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v20.8h, v20.8h , v2.8h //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
- Uxtl2 v22.8h, v12.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ Uxtl2 v22.8h, v5.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
UMIN v20.8h, v20.8h , v4.8h //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
- mov v12.16b, v16.16b //I pu1_cur_row = pu1_next_row
+ mov v5.16b, v16.16b //I pu1_cur_row = pu1_next_row
- SADDW2 v22.8h, v22.8h , v10.16b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ SADDW2 v22.8h, v22.8h , v3.16b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
xtn v20.8b, v20.8h //I vmovn_s16(pi2_tmp_cur_row.val[0])
SMAX v22.8h, v22.8h , v2.8h //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
@@ -374,12 +374,12 @@ PU1_SRC_LOOP:
EXT v18.16b, v30.16b , v18.16b,#1 //III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
LDRB w2,[x0,x1] //III pu1_src_cpy[0]
- cmhi v24.16b, v12.16b , v22.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v24.16b, v5.16b , v22.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
SUB x5,x12,x7 //III ht_tmp - row
movn x20,#0
csel x4, x20, x4,LT //II
- cmhi v22.16b, v22.16b , v12.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v22.16b, v22.16b , v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
ADD x5,x14,x5 //III pu1_src_left_cpy[ht_tmp - row]
MOV x20,#1
@@ -389,52 +389,52 @@ PU1_SRC_LOOP:
LDRB w5,[x5] //III load the value
SUBS x2,x2,x5 //III pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
- mov v14.8b[0], w4 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+ mov v17.8b[0], w4 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
movn x20,#0
csel x2, x20, x2,LT //III
- cmhi v10.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v3.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
MOV x20,#1
csel x2, x20, x2,GT //III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
- ADD v22.16b, v0.16b , v14.16b //II edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v22.16b, v0.16b , v17.16b //II edge_idx = vaddq_s8(const_2, sign_up)
ADD v22.16b, v22.16b , v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
cmhi v18.16b, v18.16b , v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
TBL v22.16b, {v6.16b},v22.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
- NEG v14.16b, v24.16b //II sign_up = vnegq_s8(sign_down)
+ NEG v17.16b, v24.16b //II sign_up = vnegq_s8(sign_down)
- SUB v10.16b, v18.16b , v10.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB v3.16b, v18.16b , v3.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
// TBL v23.8b, {v6.16b},v23.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- EXT v14.16b, v14.16b , v14.16b,#15 //II sign_up = vextq_s8(sign_up, sign_up, 15)
+ EXT v17.16b, v17.16b , v17.16b,#15 //II sign_up = vextq_s8(sign_up, sign_up, 15)
- AND v22.16b, v22.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
- mov v14.8b[0], w2 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+ AND v22.16b, v22.16b , v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+ mov v17.8b[0], w2 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
- ADD v18.16b, v0.16b , v14.16b //III edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v18.16b, v0.16b , v17.16b //III edge_idx = vaddq_s8(const_2, sign_up)
TBL v24.16b, {v7.16b},v22.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
- ADD v18.16b, v18.16b , v10.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)
+ ADD v18.16b, v18.16b , v3.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)
- Uxtl v26.8h, v12.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ Uxtl v26.8h, v5.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
TBL v18.16b, {v6.16b},v18.16b //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
- NEG v14.16b, v10.16b //III sign_up = vnegq_s8(sign_down)
+ NEG v17.16b, v3.16b //III sign_up = vnegq_s8(sign_down)
SADDW v26.8h, v26.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
// TBL v19.8b, {v6.16b},v19.8b //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- EXT v14.16b, v14.16b , v14.16b,#15 //III sign_up = vextq_s8(sign_up, sign_up, 15)
+ EXT v17.16b, v17.16b , v17.16b,#15 //III sign_up = vextq_s8(sign_up, sign_up, 15)
- AND v18.16b, v18.16b , v8.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
+ AND v18.16b, v18.16b , v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
Uxtl v20.8h, v16.8b //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
- TBL v10.16b, {v7.16b},v18.16b //III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
- SADDW v20.8h, v20.8h , v10.8b //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ TBL v3.16b, {v7.16b},v18.16b //III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ SADDW v20.8h, v20.8h , v3.8b //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
// TBL v25.8b, {v7.16b},v23.8b //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
SMAX v20.8h, v20.8h , v2.8h //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
- Uxtl2 v28.8h, v12.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ Uxtl2 v28.8h, v5.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
UMIN v20.8h, v20.8h , v4.8h //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
SADDW2 v28.8h, v28.8h , v24.16b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
@@ -444,11 +444,11 @@ PU1_SRC_LOOP:
UMIN v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
Uxtl2 v18.8h, v16.16b //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
- mov v12.16b, v30.16b //III pu1_cur_row = pu1_next_row
+ mov v5.16b, v30.16b //III pu1_cur_row = pu1_next_row
xtn v26.8b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[0])
xtn2 v26.16b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[1])
- SADDW2 v18.8h, v18.8h , v10.16b //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ SADDW2 v18.8h, v18.8h , v3.16b //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
SMAX v18.8h, v18.8h , v2.8h //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
xtn v20.8b, v20.8h //III vmovn_s16(pi2_tmp_cur_row.val[0])
@@ -480,45 +480,45 @@ PU1_SRC_LOOP:
EXT v18.16b, v16.16b , v18.16b,#1 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
SUBS x4,x2,x5 //pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
- cmhi v10.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v3.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
movn x20,#0
csel x4, x20, x4,LT
MOV x20,#1
csel x4, x20, x4,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
- cmhi v18.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v18.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
- mov v14.8b[0], w4 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
- SUB v10.16b, v18.16b , v10.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ mov v17.8b[0], w4 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+ SUB v3.16b, v18.16b , v3.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- ADD v18.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
- ADD v18.16b, v18.16b , v10.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+ ADD v18.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v18.16b, v18.16b , v3.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
TBL v18.16b, {v6.16b},v18.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
- NEG v14.16b, v10.16b //sign_up = vnegq_s8(sign_down)
+ NEG v17.16b, v3.16b //sign_up = vnegq_s8(sign_down)
// TBL v19.8b, {v6.16b},v19.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- EXT v14.16b, v14.16b , v14.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15)
+ EXT v17.16b, v17.16b , v17.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15)
- AND v18.16b, v18.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ AND v18.16b, v18.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
- TBL v10.16b, {v7.16b},v18.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ TBL v3.16b, {v7.16b},v18.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
- Uxtl v20.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ Uxtl v20.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
// TBL v11.8b, {v7.16b},v19.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
- SADDW v20.8h, v20.8h , v10.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SADDW v20.8h, v20.8h , v3.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
- Uxtl2 v12.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ Uxtl2 v5.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
UMIN v20.8h, v20.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
- SADDW2 v12.8h, v12.8h , v10.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ SADDW2 v5.8h, v5.8h , v3.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
- SMAX v12.8h, v12.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ SMAX v5.8h, v5.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
xtn v20.8b, v20.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
- UMIN v12.8h, v12.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
- xtn2 v20.16b, v12.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
+ UMIN v5.8h, v5.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ xtn2 v20.16b, v5.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
INNER_LOOP_DONE:
@@ -556,11 +556,11 @@ WD_16_HT_4_LOOP:
MOV x20,#-1
csel x8, x20, x8,NE //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
- mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
+ mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
CMP x6,#16 //if(col == 16)
BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
SKIP_AU1_MASK_VAL_WD_16_HT_4:
LDRB w8,[x5,#2] //pu1_avail[2]
@@ -572,23 +572,23 @@ SKIP_AU1_MASK_VAL_WD_16_HT_4:
SUB x8,x8,#1 //pu1_src_top_cpy - 1 || pu1_src - src_strd - 1
MOV x7,x16 //Loads wd
- LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
+ LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
ADD x3,x3,#16
ADD x5,sp,#0x42 //*au1_src_left_tmp
- LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
MOV x4,x17 //Loads ht
SUB x7,x7,x6 //(wd - col)
- cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
MOV x8,x19 //Loads *pu1_src
ADD x7,x7,#15 //15 + (wd - col)
- cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)]
SUB x5,x5,#1
- SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
AU1_SRC_LEFT_LOOP_WD_16_HT_4:
LDRB w8,[x7] //load the value and increment by src_strd
@@ -626,31 +626,31 @@ SIGN_UP_CHANGE_WD_16_HT_4:
csel x8, x20, x8,LT
MOV x20,#1
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
- mov v14.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+ mov v17.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
SIGN_UP_CHANGE_DONE_WD_16_HT_4:
- cmhi v20.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
- cmhi v22.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v22.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
SUB v24.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
TBL v26.16b, {v6.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
// TBL v27.8b, {v6.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
- NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down)
- EXT v14.16b, v14.16b , v14.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15)
+ NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down)
+ EXT v17.16b, v17.16b , v17.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15)
TBL v24.16b, {v7.16b},v26.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
- Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
// TBL v25.8b, {v7.16b},v27.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
- Uxtl2 v30.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ Uxtl2 v30.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
SADDW2 v30.8h, v30.8h , v24.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
SMAX v30.8h, v30.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
UMIN v30.8h, v30.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
@@ -660,7 +660,7 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4:
ST1 { v28.16b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
- mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row
+ mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row
SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1
BNE PU1_SRC_LOOP_WD_16_HT_4 //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
@@ -689,14 +689,14 @@ WIDTH_RESIDUE:
MOV x20,#-1
csel x8, x20, x8,NE
- mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v8.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
PU1_AVAIL_2_RESIDUE:
LDRB w11,[x5,#2] //pu1_avail[2]
- LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
CMP x11,#0
SUB x20,x0,x1 //pu1_src - src_strd
@@ -706,19 +706,19 @@ PU1_AVAIL_2_RESIDUE:
SUB x8,x8,#1
ADD x5,sp,#0x42 //*au1_src_left_tmp
- LD1 {v10.16b},[x8],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
+ LD1 {v3.16b},[x8],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
MOV x7,x16 //Loads wd
MOV x4,x17 //Loads ht
- cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
SUB x7,x7,#1 //(wd - 1)
MOV x8,x19 //Loads *pu1_src
- cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
SUB x5,x5,#1
ADD x7,x8,x7 //pu1_src[0 * src_strd + (wd - 1)]
- SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
AU1_SRC_LEFT_LOOP_RESIDUE:
@@ -759,25 +759,25 @@ SIGN_UP_CHANGE_RESIDUE:
csel x8, x20, x8,LT
MOV x20,#1
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
- mov v14.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+ mov v17.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
SIGN_UP_CHANGE_DONE_RESIDUE:
- cmhi v20.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
- cmhi v22.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v22.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
SUB v24.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
TBL v26.16b, {v6.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
// TBL v27.8b, {v6.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
- NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down)
- EXT v14.16b, v14.16b , v14.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15)
+ NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down)
+ EXT v17.16b, v17.16b , v17.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15)
TBL v24.8b, {v7.16b},v26.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
- Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
@@ -785,7 +785,7 @@ SIGN_UP_CHANGE_DONE_RESIDUE:
xtn v30.8b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
ST1 {v30.8b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
- mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row
+ mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row
SUBS x7,x7,#1
BNE PU1_SRC_LOOP_RESIDUE
@@ -839,7 +839,7 @@ END_LOOPS:
ldp x23, x24,[sp],#16
ldp x21, x22,[sp],#16
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
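
The hunks above, and every remaining SAO file in this patch, follow the same pattern: the working set is moved off the callee-saved NEON registers v8-v15 onto caller-saved ones (v1, v3, v5, v17, ...), after which the push_v_regs/pop_v_regs prologue/epilogue pair can be dropped. Under AAPCS64 only the low 64 bits of v8-v15 have to survive a call, so a routine that never writes those registers owes the caller nothing. The macros are presumably defined in ihevc_neon_macros.s; the sketch below is an assumed, typical expansion, not a quote of that file:

    // Assumed expansion of the removed macros: four stp/ldp pairs
    // saving and restoring d8-d15, i.e. 64 bytes of stack traffic per call.
    .macro push_v_regs
        stp     d8,  d9,  [sp, #-16]!
        stp     d10, d11, [sp, #-16]!
        stp     d12, d13, [sp, #-16]!
        stp     d14, d15, [sp, #-16]!
    .endm
    .macro pop_v_regs
        ldp     d14, d15, [sp], #16
        ldp     d12, d13, [sp], #16
        ldp     d10, d11, [sp], #16
        ldp     d8,  d9,  [sp], #16
    .endm

Once no v8-v15 register is live in the function body, both macro invocations are dead weight, which is exactly what the removed push_v_regs / pop_v_regs lines take out.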
diff --git a/common/arm64/ihevc_sao_edge_offset_class2_chroma.s b/common/arm64/ihevc_sao_edge_offset_class2_chroma.s
index 2fa7c22..8e286b4 100644
--- a/common/arm64/ihevc_sao_edge_offset_class2_chroma.s
+++ b/common/arm64/ihevc_sao_edge_offset_class2_chroma.s
@@ -76,7 +76,7 @@ ihevc_sao_edge_offset_class2_chroma_av8:
ldr x9,[sp,#8]
ldr w10,[sp,#16]
ldr w11,[sp,#24]
- push_v_regs
+
// STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments
@@ -322,7 +322,7 @@ PU1_AVAIL_3_LOOP:
LDR x2, [x2, #:got_lo12:gi1_table_edge_idx]
MOV x6,x7 //move wd to x6 loop_count
- movi v8.16b, #0XFF //au1_mask = vdupq_n_s8(-1)
+ movi v1.16b, #0XFF //au1_mask = vdupq_n_s8(-1)
CMP x7,#16 //Compare wd with 16
BLT WIDTH_RESIDUE //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
@@ -338,19 +338,19 @@ WIDTH_LOOP_16:
MOV x20,#-1
csel x8, x20, x8,NE
- mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
CMP x6,#16 //if(col == 16)
- mov v8.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
BNE SKIP_AU1_MASK_VAL
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v8.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
- mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
SKIP_AU1_MASK_VAL:
LDRB w9,[x5,#2] //pu1_avail[2]
- LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
//LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
//SUB x0, x0,#8
CMP x9,#0
@@ -366,17 +366,17 @@ SKIP_AU1_MASK_VAL:
ADD x3,x3,#16
ADD x5,sp,#0x4B //*au1_src_left_tmp
- LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
+ LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
//LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
//SUB x8, x8,#8
SUB x7,x7,x6 //(wd - col)
ADD x7,x7,#14 //15 + (wd - col)
- cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
mov x8, x26 //Loads *pu1_src
ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)]
- cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
AU1_SRC_LEFT_LOOP:
LDRH w8,[x7] //load the value and increment by src_strd
@@ -388,7 +388,7 @@ AU1_SRC_LEFT_LOOP:
BNE AU1_SRC_LEFT_LOOP
ADD x8,x0,x1 //I *pu1_src + src_strd
- SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
MOV x7,x12 //row count, move ht_tmp to x7
LD1 {v16.16b},[x8] //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
@@ -430,35 +430,35 @@ AU1_SRC_LEFT_LOOP:
csel x8, x20, x8,GT //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
CMP x4,#0 //I
- mov v14.8b[0], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ mov v17.8b[0], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
movn x20,#0
csel x4, x20, x4,LT //I
MOV x20,#1
csel x4, x20, x4,GT //I SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
- mov v14.8b[1], w4 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ mov v17.8b[1], w4 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
SIGN_UP_CHANGE_DONE:
LD1 {v30.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
- cmhi v20.16b, v12.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v20.16b, v5.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
- cmhi v22.16b, v18.16b , v12.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v22.16b, v18.16b , v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
SUB v22.16b, v22.16b , v20.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- ADD v18.16b, v0.16b , v14.16b //I edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v18.16b, v0.16b , v17.16b //I edge_idx = vaddq_s8(const_2, sign_up)
ADD v18.16b, v18.16b , v22.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
TBL v18.16b, {v30.16b},v18.16b //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
- NEG v14.16b, v22.16b //I sign_up = vnegq_s8(sign_down)
+ NEG v17.16b, v22.16b //I sign_up = vnegq_s8(sign_down)
//TBL v19.8b, {v30.16b},v19.8b //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- EXT v14.16b, v14.16b , v14.16b,#14 //I sign_up = vextq_s8(sign_up, sign_up, 14)
+ EXT v17.16b, v17.16b , v17.16b,#14 //I sign_up = vextq_s8(sign_up, sign_up, 14)
- Uxtl v20.8h, v12.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
- AND v22.16b, v18.16b , v8.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
+ Uxtl v20.8h, v5.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ AND v22.16b, v18.16b , v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
mov v23.d[0],v22.d[1]
- Uxtl2 v18.8h, v12.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ Uxtl2 v18.8h, v5.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
UZP1 v31.8b, v22.8b, v23.8b
UZP2 v23.8b, v22.8b, v23.8b //I
mov v22.8b,v31.8b
@@ -469,7 +469,7 @@ SIGN_UP_CHANGE_DONE:
ZIP2 v23.8b, v22.8b, v23.8b //I
mov v22.8b,v31.8b
- mov v12.16b, v16.16b //I pu1_cur_row = pu1_next_row
+ mov v5.16b, v16.16b //I pu1_cur_row = pu1_next_row
SADDW v20.8h, v20.8h , v22.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v20.8h, v20.8h , v2.8h //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
@@ -521,17 +521,17 @@ PU1_SRC_LOOP:
movn x20,#0
csel x8, x20, x8,LT //II
- cmhi v22.16b, v12.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v22.16b, v5.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
MOV x20,#1
csel x8, x20, x8,GT //II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
sub x13,x9,#1
LDRB w5,[x13] //II load the value
- mov v14.8b[0], w8 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ mov v17.8b[0], w8 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
SUB x7,x7,#1 //II Decrement the ht_tmp loop count by 1
SUB x11,x11,x5 //II pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
- cmhi v24.16b, v28.16b , v12.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v24.16b, v28.16b , v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
CMP x11,#0 //II
movn x20,#0
@@ -545,11 +545,11 @@ PU1_SRC_LOOP:
SUB x5,x12,x7 //III ht_tmp - row
ADD x10,x0,x1
- mov v14.8b[1], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ mov v17.8b[1], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
LSL x5,x5,#1 //III (ht_tmp - row) * 2
ADD x9,x14,x5 //III pu1_src_left_cpy[(ht_tmp - row) * 2]
- ADD v26.16b, v0.16b , v14.16b //II edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v0.16b , v17.16b //II edge_idx = vaddq_s8(const_2, sign_up)
LDRB w10,[x10,#1] //III pu1_src_cpy[0]
sub x13,x9,#2
@@ -562,24 +562,24 @@ PU1_SRC_LOOP:
sub x13,x9,#1
LDRB w9,[x13] //III load the value
TBL v26.16b, {v22.16b},v26.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
- NEG v14.16b, v24.16b //II sign_up = vnegq_s8(sign_down)
+ NEG v17.16b, v24.16b //II sign_up = vnegq_s8(sign_down)
movn x20,#0
csel x4, x20, x4,LT //III
SUB x10,x10,x9 //III pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
//TBL v27.8b, {v22.16b},v27.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- EXT v14.16b, v14.16b , v14.16b,#14 //II sign_up = vextq_s8(sign_up, sign_up, 14)
+ EXT v17.16b, v17.16b , v17.16b,#14 //II sign_up = vextq_s8(sign_up, sign_up, 14)
MOV x20,#1
csel x4, x20, x4,GT //III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
- AND v26.16b, v26.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+ AND v26.16b, v26.16b , v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
CMP x10,#0 //III
mov v27.d[0],v26.d[1]
UZP1 v31.8b, v26.8b, v27.8b
UZP2 v27.8b, v26.8b, v27.8b //II
mov v26.8b,v31.8b
- mov v14.8b[0], w4 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ mov v17.8b[0], w4 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
movn x20,#0
csel x10, x20, x10,LT //III
@@ -592,13 +592,13 @@ PU1_SRC_LOOP:
TBL v25.8b, {v7.16b},v27.8b //II
SUB v22.16b, v22.16b , v20.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- mov v14.8b[1], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ mov v17.8b[1], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
ZIP1 v31.8b, v24.8b, v25.8b
ZIP2 v25.8b, v24.8b, v25.8b //II
mov v24.8b,v31.8b
- Uxtl v28.8h, v12.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
- ADD v18.16b, v0.16b , v14.16b //III edge_idx = vaddq_s8(const_2, sign_up)
+ Uxtl v28.8h, v5.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ ADD v18.16b, v0.16b , v17.16b //III edge_idx = vaddq_s8(const_2, sign_up)
LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
SADDW v28.8h, v28.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
@@ -608,13 +608,13 @@ PU1_SRC_LOOP:
UMIN v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
TBL v18.16b, {v20.16b},v18.16b //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
- NEG v14.16b, v22.16b //III sign_up = vnegq_s8(sign_down)
+ NEG v17.16b, v22.16b //III sign_up = vnegq_s8(sign_down)
//TBL v19.8b, {v20.16b},v19.8b //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- EXT v14.16b, v14.16b , v14.16b,#14 //III sign_up = vextq_s8(sign_up, sign_up, 14)
+ EXT v17.16b, v17.16b , v17.16b,#14 //III sign_up = vextq_s8(sign_up, sign_up, 14)
- Uxtl2 v26.8h, v12.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
- AND v18.16b, v18.16b , v8.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
+ Uxtl2 v26.8h, v5.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ AND v18.16b, v18.16b , v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
mov v19.d[0],v18.d[1]
UZP1 v31.8b, v18.8b, v19.8b
@@ -623,7 +623,7 @@ PU1_SRC_LOOP:
TBL v22.8b, {v6.16b},v18.8b //III
SADDW v26.8h, v26.8h , v25.8b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
- mov v12.16b, v30.16b //III pu1_cur_row = pu1_next_row
+ mov v5.16b, v30.16b //III pu1_cur_row = pu1_next_row
TBL v23.8b, {v7.16b},v19.8b //III
SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
@@ -686,35 +686,35 @@ PU1_SRC_LOOP:
LD1 {v30.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
LDRB w11,[x0,#1] //pu1_src_cpy[0]
- mov v14.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ mov v17.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
sub x13,x9,#1
LDRB w5,[x13] //load the value
SUB x4,x11,x5 //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
- cmhi v22.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
CMP x4,#0
movn x20,#0
csel x4, x20, x4,LT
- cmhi v24.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v24.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
MOV x20,#1
csel x4, x20, x4,GT //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
- mov v14.8b[1], w4 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ mov v17.8b[1], w4 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
mov v30.d[1],v30.d[0]
TBL v26.16b, {v30.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
//TBL v27.8b, {v30.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- Uxtl v20.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
- AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ Uxtl v20.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
mov v27.d[0],v26.d[1]
- Uxtl2 v18.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ Uxtl2 v18.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
UZP1 v31.8b, v26.8b, v27.8b
UZP2 v27.8b, v26.8b, v27.8b
mov v26.8b,v31.8b
@@ -771,14 +771,14 @@ WD_16_HT_4_LOOP:
MOV x20,#-1
csel x8, x20, x8,NE
- mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
- mov v8.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
CMP x6,#16 //if(col == 16)
BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v8.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
- mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
SKIP_AU1_MASK_VAL_WD_16_HT_4:
LDRB w8,[x5,#2] //pu1_avail[2]
@@ -788,7 +788,7 @@ SKIP_AU1_MASK_VAL_WD_16_HT_4:
csel x8, x20, x8,EQ
csel x8, x3, x8,NE //pu1_src_top_cpy
SUB x8,x8,#2 //pu1_src - src_strd - 2
- LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
+ LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
//LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
//SUB x8, x8,#8
@@ -809,13 +809,13 @@ AU1_SRC_LEFT_LOOP_WD_16_HT_4:
SUBS x4,x4,#1 //decrement the loop count
BNE AU1_SRC_LEFT_LOOP_WD_16_HT_4
- LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
//LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
//SUB x0, x0,#8
- cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
- cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
- SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
movi v18.16b, #0
MOV x7,x12 //row count, move ht_tmp to x7
@@ -851,7 +851,7 @@ SIGN_UP_CHANGE_WD_16_HT_4:
csel x8, x20, x8,LT
MOV x20,#1
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
- mov v14.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ mov v17.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
LDRB w8,[x0,#1] //pu1_src_cpy[0]
sub x13,x9,#1
@@ -862,25 +862,25 @@ SIGN_UP_CHANGE_WD_16_HT_4:
csel x8, x20, x8,LT
MOV x20,#1
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
- mov v14.8b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ mov v17.8b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
SIGN_UP_CHANGE_DONE_WD_16_HT_4:
- cmhi v22.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
- cmhi v24.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v24.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
LD1 {v22.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
TBL v26.16b, {v22.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
//TBL v27.8b, {v22.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
mov v27.d[0],v26.d[1]
- NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down)
- EXT v14.16b, v14.16b , v14.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14)
+ NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down)
+ EXT v17.16b, v17.16b , v17.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14)
UZP1 v31.8b, v26.8b, v27.8b
UZP2 v27.8b, v26.8b, v27.8b
@@ -891,12 +891,12 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4:
ZIP2 v25.8b, v24.8b, v25.8b
mov v24.8b,v31.8b
- Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
- Uxtl2 v26.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ Uxtl2 v26.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
SADDW v26.8h, v26.8h , v25.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
@@ -906,7 +906,7 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4:
ST1 { v28.16b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
- mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row
+ mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row
SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1
BNE PU1_SRC_LOOP_WD_16_HT_4 //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
@@ -936,12 +936,12 @@ WIDTH_RESIDUE:
MOV x20,#-1
csel x8, x20, x8,NE
- mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
- mov v8.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v8.8b[6], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
- mov v8.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.8b[6], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
LDRB w8,[x5,#2] //pu1_avail[2]
CMP x8,#0
@@ -950,7 +950,7 @@ WIDTH_RESIDUE:
csel x8, x20, x8,EQ
csel x8, x3, x8,NE
SUB x8,x8,#2 //pu1_src - src_strd - 2
- LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
+ LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
//LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
//SUB x8, x8,#8
@@ -968,13 +968,13 @@ AU1_SRC_LEFT_LOOP_RESIDUE:
SUBS x4,x4,#1 //decrement the loop count
BNE AU1_SRC_LEFT_LOOP_RESIDUE
- LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
//LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
//SUB x0, x0,#8
- cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
- cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
- SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
MOV x7,x12 //row count, move ht_tmp to x7
PU1_SRC_LOOP_RESIDUE:
@@ -1009,7 +1009,7 @@ SIGN_UP_CHANGE_RESIDUE:
csel x8, x20, x8,LT
MOV x20,#1
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
- mov v14.8b[0], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ mov v17.8b[0], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
LDRB w8,[x0,#1] //pu1_src_cpy[0]
sub x13,x9,#1
@@ -1020,14 +1020,14 @@ SIGN_UP_CHANGE_RESIDUE:
csel x8, x20, x8,LT
MOV x20,#1
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
- mov v14.8b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ mov v17.8b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
SIGN_UP_CHANGE_DONE_RESIDUE:
- cmhi v22.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
- cmhi v24.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v24.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
LD1 {v22.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
@@ -1035,11 +1035,11 @@ SIGN_UP_CHANGE_DONE_RESIDUE:
TBL v26.16b, {v22.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
//TBL v27.8b, {v22.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
mov v27.d[0],v26.d[1]
- NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down)
- EXT v14.16b, v14.16b , v14.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14)
+ NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down)
+ EXT v17.16b, v17.16b , v17.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14)
UZP1 v31.8b, v26.8b, v27.8b
UZP2 v27.8b, v26.8b, v27.8b
@@ -1050,7 +1050,7 @@ SIGN_UP_CHANGE_DONE_RESIDUE:
ZIP2 v25.8b, v24.8b, v25.8b
mov v24.8b,v31.8b
- Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
@@ -1059,7 +1059,7 @@ SIGN_UP_CHANGE_DONE_RESIDUE:
ST1 {v28.8b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
- mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row
+ mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row
SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1
BNE PU1_SRC_LOOP_RESIDUE //If not equal jump to PU1_SRC_LOOP
@@ -1113,7 +1113,7 @@ END_LOOPS:
ldp x23, x24,[sp],#16
ldp x21, x22,[sp],#16
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
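
The substitution in ihevc_sao_edge_offset_class2_chroma.s above is purely a rename: au1_mask moves from v8 to v1, pu1_top_row from v10 to v3, pu1_cur_row from v12 to v5 and sign_up from v14 to v17, with every operand width (.16b/.8b/.8h) and every lane index left as it was. Below is a minimal, self-contained sketch, not taken from the patch (the label and arguments are invented for illustration), of why a leaf routine that stays on caller-saved registers needs no vector save/restore at all:

    // Hypothetical leaf routine: all SIMD scratch lives in v0-v7/v16-v31,
    // which AAPCS64 lets a callee clobber freely, so the body can start
    // and end without touching the stack.
        .text
        .global mask_row            // x0 = src row, x1 = dst row (illustrative)
    mask_row:
        ld1     {v5.16b}, [x0]      // 16 pixels into a caller-saved register
        movi    v1.16b, #0xFF       // build a mask, as the SAO code does for au1_mask
        and     v5.16b, v5.16b, v1.16b
        st1     {v5.16b}, [x1]
        ret                         // no pop_v_regs needed: v8-v15 were never written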
diff --git a/common/arm64/ihevc_sao_edge_offset_class3.s b/common/arm64/ihevc_sao_edge_offset_class3.s
index 6c47abe..f393753 100644
--- a/common/arm64/ihevc_sao_edge_offset_class3.s
+++ b/common/arm64/ihevc_sao_edge_offset_class3.s
@@ -70,7 +70,6 @@ ihevc_sao_edge_offset_class3_av8:
// STMFD sp!,{x4-x12,x14} //stack stores the values of the arguments
- push_v_regs
stp x19, x20,[sp,#-16]!
stp x21, x22,[sp,#-16]!
stp x23, x24,[sp,#-16]!
@@ -85,9 +84,9 @@ ihevc_sao_edge_offset_class3_av8:
MOV x5,x7 //Loads pu1_avail
- LDR x6,[sp,#112] //Loads pi1_sao_offset
- LDR w7,[sp,#120] //Loads wd
- LDR w8,[sp,#128] //Loads ht
+ LDR x6,[sp,#48] //Loads pi1_sao_offset
+ LDR w7,[sp,#56] //Loads wd
+ LDR w8,[sp,#64] //Loads ht
MOV x16,x7 // wd
MOV x17,x8 // ht
@@ -226,7 +225,7 @@ PU1_AVAIL_3_LOOP:
ADRP x6, :got:gi1_table_edge_idx //table pointer
LDR x6, [x6, #:got_lo12:gi1_table_edge_idx]
- movi v8.16b, #0xFF //au1_mask = vdupq_n_s8(-1)
+ movi v1.16b, #0xFF //au1_mask = vdupq_n_s8(-1)
ADD x20,x14,#1 //pu1_src_left_cpy += 1
csel x14, x20, x14,EQ
@@ -248,12 +247,12 @@ WIDTH_LOOP_16:
csel w8,w20,w8,EQ
MOV x20,#-1
csel x8, x20, x8,NE
- mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
CMP x6,#16 //if(col == 16)
BNE SKIP_AU1_MASK_VAL
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
SKIP_AU1_MASK_VAL:
LDRB w8,[x5,#2] //pu1_avail[2]
@@ -270,15 +269,15 @@ SKIP_AU1_MASK_VAL:
ADD x8,x8,#1 //pu1_src - src_strd + 1
SUB x7,x7,x6 //(wd - col)
- LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+ LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
ADD x3,x3,#16
MOV x8,x19 //Loads *pu1_src
- LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
ADD x7,x7,#15 //15 + (wd - col)
ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)]
- cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
SUB x5,x5,#1
AU1_SRC_LEFT_LOOP:
@@ -289,10 +288,10 @@ AU1_SRC_LEFT_LOOP:
BNE AU1_SRC_LEFT_LOOP
movi v18.16b, #0
- cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
ADD x8,x0,x1 //I *pu1_src + src_strd
- SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
MOV x7,x12 //row count, move ht_tmp to x7
SUB x5,x12,x7 //I ht_tmp - row
@@ -321,35 +320,35 @@ SIGN_UP_CHANGE:
csel x8, x20, x8,LT //I
MOV x20,#1
csel x8, x20, x8,GT //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
- mov v14.16b[15], w8 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ mov v17.16b[15], w8 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
SIGN_UP_CHANGE_DONE:
- cmhi v10.16b, v12.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
- cmhi v18.16b, v18.16b , v12.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
- SUB v10.16b, v18.16b , v10.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ cmhi v3.16b, v5.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v18.16b, v18.16b , v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ SUB v3.16b, v18.16b , v3.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- ADD v18.16b, v0.16b , v14.16b //I edge_idx = vaddq_s8(const_2, sign_up)
- ADD v18.16b, v18.16b , v10.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
+ ADD v18.16b, v0.16b , v17.16b //I edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v18.16b, v18.16b , v3.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
TBL v18.16b, {v6.16b},v18.16b //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
- NEG v14.16b, v10.16b //I sign_up = vnegq_s8(sign_down)
+ NEG v17.16b, v3.16b //I sign_up = vnegq_s8(sign_down)
- EXT v14.16b, v14.16b , v14.16b,#1 //I sign_up = vextq_s8(sign_up, sign_up, 1)
+ EXT v17.16b, v17.16b , v17.16b,#1 //I sign_up = vextq_s8(sign_up, sign_up, 1)
// TBL v19.8b, {v6.16b},v19.8b //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- Uxtl v20.8h, v12.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
- AND v18.16b, v18.16b , v8.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
+ Uxtl v20.8h, v5.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ AND v18.16b, v18.16b , v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
- TBL v10.16b, {v7.16b},v18.16b //I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ TBL v3.16b, {v7.16b},v18.16b //I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
- Uxtl2 v22.8h, v12.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
- SADDW v20.8h, v20.8h , v10.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ Uxtl2 v22.8h, v5.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ SADDW v20.8h, v20.8h , v3.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v20.8h, v20.8h , v2.8h //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
// TBL v11.8b, {v7.16b},v19.8b //I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
UMIN v20.8h, v20.8h , v4.8h //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
- mov v12.16b, v16.16b
- SADDW2 v22.8h, v22.8h , v10.16b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ mov v5.16b, v16.16b
+ SADDW2 v22.8h, v22.8h , v3.16b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
SMAX v22.8h, v22.8h , v2.8h //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
UMIN v22.8h, v22.8h , v4.8h //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
@@ -388,7 +387,7 @@ PU1_SRC_LOOP:
csel x11, x20, x11,GT //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
ADD x8,x14,x5 //III pu1_src_left_cpy[ht_tmp - row]
- mov v14.8b[15], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ mov v17.8b[15], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
CMP x7,#1 //III
BNE NEXT_ROW_ELSE_2 //III
@@ -400,11 +399,11 @@ PU1_SRC_LOOP:
NEXT_ROW_ELSE_2:
LDRB w8,[x8,#1] //III
- cmhi v24.16b, v12.16b , v18.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v24.16b, v5.16b , v18.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
ADD x5,x0,x1
LDRB w2,[x5,#15] //III pu1_src_cpy[15]
- cmhi v26.16b, v18.16b , v12.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v26.16b, v18.16b , v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
LDRB w5,[x0,#16] //III load the value
SUB x2,x2,x5 //III pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
@@ -418,51 +417,51 @@ NEXT_ROW_ELSE_2:
csel x2, x20, x2,GT //III SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
SUB x7,x7,#1 //III Decrement the ht_tmp loop count by 1
- ADD v26.16b, v0.16b , v14.16b //II edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v0.16b , v17.16b //II edge_idx = vaddq_s8(const_2, sign_up)
- NEG v14.16b, v24.16b //II sign_up = vnegq_s8(sign_down)
+ NEG v17.16b, v24.16b //II sign_up = vnegq_s8(sign_down)
EXT v18.16b, v18.16b , v30.16b,#15 //III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
ADD v26.16b, v26.16b , v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
- EXT v14.16b, v14.16b , v14.16b,#1 //II sign_up = vextq_s8(sign_up, sign_up, 1)
+ EXT v17.16b, v17.16b , v17.16b,#1 //II sign_up = vextq_s8(sign_up, sign_up, 1)
TBL v26.16b, {v6.16b},v26.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
- cmhi v10.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v3.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
- mov v14.16b[15], w2 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ mov v17.16b[15], w2 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
// TBL v27.8b, {v6.16b},v27.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
cmhi v18.16b, v18.16b , v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
- Uxtl v28.8h, v12.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
- AND v26.16b, v26.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+ Uxtl v28.8h, v5.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ AND v26.16b, v26.16b , v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
- SUB v10.16b, v18.16b , v10.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB v3.16b, v18.16b , v3.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
TBL v24.16b, {v7.16b},v26.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
- ADD v18.16b, v0.16b , v14.16b //III edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v18.16b, v0.16b , v17.16b //III edge_idx = vaddq_s8(const_2, sign_up)
- ADD v18.16b, v18.16b , v10.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)
+ ADD v18.16b, v18.16b , v3.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)
// TBL v25.8b, {v7.16b},v27.8b //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
- NEG v14.16b, v10.16b //III sign_up = vnegq_s8(sign_down)
+ NEG v17.16b, v3.16b //III sign_up = vnegq_s8(sign_down)
SADDW v28.8h, v28.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
TBL v18.16b, {v6.16b},v18.16b //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
SMAX v28.8h, v28.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
- EXT v14.16b, v14.16b , v14.16b,#1 //III sign_up = vextq_s8(sign_up, sign_up, 1)
+ EXT v17.16b, v17.16b , v17.16b,#1 //III sign_up = vextq_s8(sign_up, sign_up, 1)
// TBL v19.8b, {v6.16b},v19.8b //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
UMIN v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
- Uxtl2 v26.8h, v12.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
- AND v18.16b, v18.16b , v8.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
+ Uxtl2 v26.8h, v5.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ AND v18.16b, v18.16b , v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
SADDW2 v26.8h, v26.8h , v24.16b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
- TBL v10.16b, {v7.16b},v18.16b //III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ TBL v3.16b, {v7.16b},v18.16b //III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
Uxtl v20.8h, v16.8b //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
- SADDW v20.8h, v20.8h , v10.8b //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SADDW v20.8h, v20.8h , v3.8b //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
// TBL v11.8b, {v7.16b},v19.8b //III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
SMAX v20.8h, v20.8h , v2.8h //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
@@ -470,12 +469,12 @@ NEXT_ROW_ELSE_2:
UMIN v20.8h, v20.8h , v4.8h //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
xtn v28.8b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[0])
- SADDW2 v22.8h, v22.8h , v10.16b //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ SADDW2 v22.8h, v22.8h , v3.16b //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
xtn2 v28.16b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[1])
SMAX v22.8h, v22.8h , v2.8h //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
- mov v12.16b, v30.16b //II pu1_cur_row = pu1_next_row
+ mov v5.16b, v30.16b //II pu1_cur_row = pu1_next_row
UMIN v22.8h, v22.8h , v4.8h //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
CMP x7,#1 //III
@@ -516,25 +515,25 @@ NEXT_ROW_POINTER_ASSIGNED_3:
csel x8, x20, x8,LT
ST1 { v20.16b},[x0],x1 //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
- cmhi v24.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v24.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
MOV x20,#1
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
- cmhi v26.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v26.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
- mov v14.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ mov v17.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
SUB v24.16b, v26.16b , v24.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- Uxtl v20.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
- ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ Uxtl v20.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
- Uxtl2 v22.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ Uxtl2 v22.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
TBL v26.16b, {v6.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
// TBL v27.8b, {v6.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
TBL v24.16b, {v7.16b},v26.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
@@ -583,12 +582,12 @@ WD_16_HT_4_LOOP:
csel w8,w20,w8,EQ
MOV x20,#-1
csel x8, x20, x8,NE
- mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
CMP x6,#16 //if(col == 16)
BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
SKIP_AU1_MASK_VAL_WD_16_HT_4:
LDRB w8,[x5,#2] //pu1_avail[2]
@@ -598,7 +597,7 @@ SKIP_AU1_MASK_VAL_WD_16_HT_4:
csel x8, x20, x8,EQ
csel x8, x3, x8,NE
ADD x8,x8,#1 //pu1_src - src_strd + 1
- LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+ LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
ADD x3,x3,#16
ADD x5,sp,#0x42 //*au1_src_left_tmp
@@ -617,11 +616,11 @@ AU1_SRC_LEFT_LOOP_WD_16_HT_4:
SUBS x4,x4,#1 //decrement the loop count
BNE AU1_SRC_LEFT_LOOP_WD_16_HT_4
- LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
- cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
- cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
- SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
movi v18.16b, #0
MOV x7,x12 //row count, move ht_tmp to x7
@@ -665,31 +664,31 @@ SIGN_UP_CHANGE_WD_16_HT_4:
csel x8, x20, x8,LT
MOV x20,#1
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
- mov v14.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ mov v17.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
SIGN_UP_CHANGE_DONE_WD_16_HT_4:
- cmhi v20.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
- cmhi v22.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v22.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
SUB v24.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
TBL v26.16b, {v6.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
// TBL v27.8b, {v6.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
- NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down)
- EXT v14.16b, v14.16b , v14.16b,#1 //sign_up = vextq_s8(sign_up, sign_up, 1)
+ NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down)
+ EXT v17.16b, v17.16b , v17.16b,#1 //sign_up = vextq_s8(sign_up, sign_up, 1)
TBL v24.16b, {v7.16b},v26.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
- Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
// TBL v25.8b, {v7.16b},v27.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
- Uxtl2 v30.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ Uxtl2 v30.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
SADDW2 v30.8h, v30.8h , v24.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
SMAX v30.8h, v30.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
UMIN v30.8h, v30.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
@@ -699,7 +698,7 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4:
ST1 { v28.16b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
- mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row
+ mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row
SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1
BNE PU1_SRC_LOOP_WD_16_HT_4 //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
@@ -726,10 +725,10 @@ WIDTH_RESIDUE:
MOV x20,#-1
csel x8, x20, x8,NE
- mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v8.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
PU1_AVAIL_2_RESIDUE:
LDRB w8,[x5,#2] //pu1_avail[2]
@@ -739,7 +738,7 @@ PU1_AVAIL_2_RESIDUE:
csel x8, x20, x8,EQ
csel x8, x3, x8,NE
ADD x8,x8,#1 //pu1_src - src_strd + 1
- LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+ LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
ADD x5,sp,#0x42 //*au1_src_left_tmp
@@ -757,11 +756,11 @@ AU1_SRC_LEFT_LOOP_RESIDUE:
SUBS x4,x4,#1 //decrement the loop count
BNE AU1_SRC_LEFT_LOOP_RESIDUE
- LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
- cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
- cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
- SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
MOV x7,x12 //row count, move ht_tmp to x7
PU1_SRC_LOOP_RESIDUE:
@@ -805,25 +804,25 @@ SIGN_UP_CHANGE_RESIDUE:
csel x8, x20, x8,LT
MOV x20,#1
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
- mov v14.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ mov v17.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
SIGN_UP_CHANGE_DONE_RESIDUE:
- cmhi v20.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
- cmhi v22.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v22.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
SUB v24.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
TBL v26.16b, {v6.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
// TBL v27.8b, {v6.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
- NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down)
- EXT v14.16b, v14.16b , v14.16b,#1 //sign_up = vextq_s8(sign_up, sign_up, 1)
+ NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down)
+ EXT v17.16b, v17.16b , v17.16b,#1 //sign_up = vextq_s8(sign_up, sign_up, 1)
TBL v24.8b, {v7.16b},v26.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
- Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
@@ -831,7 +830,7 @@ SIGN_UP_CHANGE_DONE_RESIDUE:
xtn v30.8b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
ST1 {v30.8b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
- mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row
+ mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row
SUBS x7,x7,#1
BNE PU1_SRC_LOOP_RESIDUE
@@ -880,7 +879,6 @@ END_LOOPS:
ldp x23, x24,[sp], #16
ldp x21, x22,[sp], #16
ldp x19, x20,[sp], #16
- pop_v_regs
ret
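
Beyond the register rename, the ihevc_sao_edge_offset_class3.s hunk above also rebases the SP-relative loads of the stack-passed arguments (#112/#120/#128 become #48/#56/#64). The 64-byte shift is consistent with push_v_regs having reserved four 16-byte stp pairs ahead of the GPR saves; with that area gone, only the x19-x24 pairs sit between SP and the caller's outgoing arguments. A rough sketch of the bookkeeping, with illustrative offsets:

    // Prologue after the patch (sketch): three GPR pairs = 48 bytes pushed.
        stp     x19, x20, [sp, #-16]!
        stp     x21, x22, [sp, #-16]!
        stp     x23, x24, [sp, #-16]!
        // The caller's first stack argument now sits 48 bytes up instead of 48 + 64:
        ldr     x6, [sp, #48]       // was [sp,#112] while d8-d15 were also saved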
diff --git a/common/arm64/ihevc_sao_edge_offset_class3_chroma.s b/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
index cf25102..5c444c0 100644
--- a/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
+++ b/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
@@ -77,7 +77,7 @@ ihevc_sao_edge_offset_class3_chroma_av8:
ldr w10,[sp,#16]
ldr w11,[sp,#24]
- push_v_regs
+
// STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments
stp x19, x20,[sp,#-16]!
stp x21, x22,[sp,#-16]!
@@ -310,7 +310,7 @@ PU1_AVAIL_2_LOOP_END:
LDR x2, [x2, #:got_lo12:gi1_table_edge_idx]
//VLD1.8 D6,[x6] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
- movi v8.16b, #0xFF //au1_mask = vdupq_n_s8(-1)
+ movi v1.16b, #0xFF //au1_mask = vdupq_n_s8(-1)
MOV x6,x7 //move wd to x6 loop_count
CMP x7,#16 //Compare wd with 16
@@ -328,20 +328,20 @@ WIDTH_LOOP_16:
MOV x20,#-1
csel x8, x20, x8,NE
- mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
LDRB w11,[x5,#2] //pu1_avail[2]
CMP x6,#16 //if(col == 16)
- mov v8.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
BNE SKIP_AU1_MASK_VAL
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v8.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
- mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
SKIP_AU1_MASK_VAL:
CMP x11,#0
- LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
//LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
//SUB x0, x0,#8
ADD x5,sp,#0x4B //*au1_src_left_tmp
@@ -352,21 +352,21 @@ SKIP_AU1_MASK_VAL:
csel x8, x3, x8,NE
ADD x8,x8,#2 //pu1_src - src_strd + 2
- LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+ LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
//LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
//SUB x8, x8,#8
ADD x3,x3,#16
mov w4, w25 //Loads ht
- cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
mov w7, w24 //Loads wd
SUB x7,x7,x6 //(wd - col)
- cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
ADD x7,x7,#14 //15 + (wd - col)
mov x8, x26 //Loads *pu1_src
- SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)]
AU1_SRC_LEFT_LOOP:
@@ -418,29 +418,29 @@ AU1_SRC_LEFT_LOOP:
movn x20,#0
csel x9, x20, x9,LT //I
- mov v14.16b[14], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
+ mov v17.16b[14], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
MOV x20,#1
csel x9, x20, x9,GT //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
- mov v14.16b[15], w9 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
+ mov v17.16b[15], w9 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
SIGN_UP_CHANGE_DONE:
LD1 {v28.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
- cmhi v20.16b, v12.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v20.16b, v5.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
- cmhi v22.16b, v18.16b , v12.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v22.16b, v18.16b , v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
SUB v22.16b, v22.16b , v20.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- ADD v18.16b, v0.16b , v14.16b //I edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v18.16b, v0.16b , v17.16b //I edge_idx = vaddq_s8(const_2, sign_up)
ADD v18.16b, v18.16b , v22.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
TBL v18.16b, {v28.16b},v18.16b //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
- NEG v14.16b, v22.16b //I sign_up = vnegq_s8(sign_down)
+ NEG v17.16b, v22.16b //I sign_up = vnegq_s8(sign_down)
//TBL v19.8b, {v28.16b},v19.8b //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- EXT v14.16b, v14.16b , v14.16b,#2 //I sign_up = vextq_s8(sign_up, sign_up, 2)
+ EXT v17.16b, v17.16b , v17.16b,#2 //I sign_up = vextq_s8(sign_up, sign_up, 2)
- Uxtl v20.8h, v12.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
- AND v18.16b, v18.16b , v8.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
+ Uxtl v20.8h, v5.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ AND v18.16b, v18.16b , v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
mov v19.d[0],v18.d[1]
UZP1 v31.8b, v18.8b, v19.8b
@@ -452,13 +452,13 @@ SIGN_UP_CHANGE_DONE:
ZIP2 v23.8b, v22.8b, v23.8b //I
mov v22.8b,v31.8b
- Uxtl2 v18.8h, v12.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ Uxtl2 v18.8h, v5.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
SADDW v20.8h, v20.8h , v22.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v20.8h, v20.8h , v2.8h //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
UMIN v20.8h, v20.8h , v4.8h //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
- mov v12.16b, v16.16b //I pu1_cur_row = pu1_next_row
+ mov v5.16b, v16.16b //I pu1_cur_row = pu1_next_row
SADDW v18.8h, v18.8h , v23.8b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
SUB x7,x7,#1 //I Decrement the ht_tmp loop count by 1
@@ -507,18 +507,18 @@ PU1_SRC_LOOP:
csel x10, x20, x10,GT //II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
CMP x8,#0 //II
- mov v14.8b[14], w10 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
+ mov v17.8b[14], w10 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
movn x20,#0
csel x8, x20, x8,LT //II
MOV x20,#1
csel x8, x20, x8,GT //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
SUB x10,x12,x7 //III ht_tmp - row
- mov v14.8b[15], w8 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
+ mov v17.8b[15], w8 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
ADD x11,x14,x10,LSL #1 //III pu1_src_left_cpy[(ht_tmp - row) * 2]
CMP x7,#1 //III
- cmhi v22.16b, v12.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v22.16b, v5.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
BNE NEXT_ROW_POINTER_ASSIGNED_2 //III
mov x5, x21 //III Loads pu1_avail
@@ -529,7 +529,7 @@ PU1_SRC_LOOP:
NEXT_ROW_POINTER_ASSIGNED_2:
LDRH w5,[x11,#2] //III
- cmhi v24.16b, v28.16b , v12.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v24.16b, v28.16b , v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
ADD x11,x0,x1 //III
LDRB w9,[x11,#14] //III pu1_src_cpy[14]
@@ -545,7 +545,7 @@ NEXT_ROW_POINTER_ASSIGNED_2:
SUB x10,x8,x10 //III pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
CMP x9,#0 //III
- ADD v26.16b, v0.16b , v14.16b //II edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v0.16b , v17.16b //II edge_idx = vaddq_s8(const_2, sign_up)
movn x20,#0
csel x9, x20, x9,LT //III
@@ -554,22 +554,22 @@ NEXT_ROW_POINTER_ASSIGNED_2:
ADD v26.16b, v26.16b , v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
CMP x10,#0 //III
- NEG v14.16b, v24.16b //II sign_up = vnegq_s8(sign_down)
+ NEG v17.16b, v24.16b //II sign_up = vnegq_s8(sign_down)
TBL v26.16b, {v21.16b},v26.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
movn x20,#0
csel x10, x20, x10,LT //III
MOV x20,#1
csel x10, x20, x10,GT //III SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
- EXT v14.16b, v14.16b , v14.16b,#2 //II sign_up = vextq_s8(sign_up, sign_up, 2)
+ EXT v17.16b, v17.16b , v17.16b,#2 //II sign_up = vextq_s8(sign_up, sign_up, 2)
//TBL v27.8b, {v21.16b},v27.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
cmhi v22.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
- mov v14.16b[14], w9 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
- AND v26.16b, v26.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+ mov v17.16b[14], w9 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
+ AND v26.16b, v26.16b , v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
mov v27.d[0],v26.d[1]
- mov v14.16b[15], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
+ mov v17.16b[15], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
UZP1 v31.8b, v26.8b, v27.8b
UZP2 v27.8b, v26.8b, v27.8b //II
mov v26.8b,v31.8b
@@ -578,7 +578,7 @@ NEXT_ROW_POINTER_ASSIGNED_2:
TBL v24.8b, {v6.16b},v26.8b //II
SUB v22.16b, v20.16b , v22.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- ADD v18.16b, v0.16b , v14.16b //III edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v18.16b, v0.16b , v17.16b //III edge_idx = vaddq_s8(const_2, sign_up)
TBL v25.8b, {v7.16b},v27.8b //II
ADD v18.16b, v18.16b , v22.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)
@@ -587,16 +587,16 @@ NEXT_ROW_POINTER_ASSIGNED_2:
ZIP2 v25.8b, v24.8b, v25.8b //II
mov v24.8b,v31.8b
- Uxtl v28.8h, v12.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ Uxtl v28.8h, v5.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
TBL v18.16b, {v20.16b},v18.16b //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
- NEG v14.16b, v22.16b //III sign_up = vnegq_s8(sign_down)
+ NEG v17.16b, v22.16b //III sign_up = vnegq_s8(sign_down)
SADDW v28.8h, v28.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
//TBL v19.8b, {v20.16b},v19.8b //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- EXT v14.16b, v14.16b , v14.16b,#2 //III sign_up = vextq_s8(sign_up, sign_up, 2)
+ EXT v17.16b, v17.16b , v17.16b,#2 //III sign_up = vextq_s8(sign_up, sign_up, 2)
- Uxtl2 v26.8h, v12.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
- AND v18.16b, v18.16b , v8.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
+ Uxtl2 v26.8h, v5.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ AND v18.16b, v18.16b , v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
mov v19.d[0],v18.d[1]
Uxtl v20.8h, v16.8b //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
@@ -620,7 +620,7 @@ NEXT_ROW_POINTER_ASSIGNED_2:
xtn v28.8b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[0])
SADDW v20.8h, v20.8h , v22.8b //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
- mov v12.16b, v30.16b //III pu1_cur_row = pu1_next_row
+ mov v5.16b, v30.16b //III pu1_cur_row = pu1_next_row
UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
SUB x7,x7,#1 //III Decrement the ht_tmp loop count by 1
@@ -682,27 +682,27 @@ NEXT_ROW_POINTER_ASSIGNED_3:
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
CMP x10,#0
- mov v14.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
+ mov v17.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
movn x20,#0
csel x10, x20, x10,LT
MOV x20,#1
csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
- mov v14.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
- cmhi v20.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ mov v17.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
+ cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
- cmhi v22.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v22.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
SUB v22.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- ADD v18.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v18.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
ADD v18.16b, v18.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
TBL v18.16b, {v28.16b},v18.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
//TBL v19.8b, {v28.16b},v19.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- AND v18.16b, v18.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ AND v18.16b, v18.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
mov v19.d[0],v18.d[1]
- Uxtl v20.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ Uxtl v20.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
UZP1 v31.8b, v18.8b, v19.8b
UZP2 v19.8b, v18.8b, v19.8b
mov v18.8b,v31.8b
@@ -710,7 +710,7 @@ NEXT_ROW_POINTER_ASSIGNED_3:
TBL v22.8b, {v6.16b},v18.8b
TBL v23.8b, {v7.16b},v19.8b
- Uxtl2 v18.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ Uxtl2 v18.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
ZIP1 v31.8b, v22.8b, v23.8b
ZIP2 v23.8b, v22.8b, v23.8b
mov v22.8b,v31.8b
@@ -762,15 +762,15 @@ WD_16_HT_4_LOOP:
csel w8,w20,w8,EQ
MOV x20,#-1
csel x8, x20, x8,NE
- mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
CMP x6,#16 //if(col == 16)
- mov v8.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v8.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
- mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
SKIP_AU1_MASK_VAL_WD_16_HT_4:
LDRB w11,[x5,#2] //pu1_avail[2]
@@ -779,27 +779,27 @@ SKIP_AU1_MASK_VAL_WD_16_HT_4:
CMP x11,#0
csel x8, x3, x8,NE
- LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
//LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
//SUB x0, x0,#8
ADD x8,x8,#2 //pu1_src - src_strd + 2
ADD x3,x3,#16
- LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+ LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
//LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
//SUB x8, x8,#8
ADD x5,sp,#0x4B //*au1_src_left_tmp
mov w4, w25 //Loads ht
- cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
mov w7, w24 //Loads wd
SUB x7,x7,x6 //(wd - col)
- cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
ADD x7,x7,#14 //15 + (wd - col)
mov x8, x26 //Loads *pu1_src
- SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)]
AU1_SRC_LEFT_LOOP_WD_16_HT_4:
@@ -864,33 +864,33 @@ SIGN_UP_CHANGE_WD_16_HT_4:
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
CMP x10,#0
- mov v14.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
+ mov v17.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
movn x20,#0
csel x10, x20, x10,LT
MOV x20,#1
csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
- mov v14.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
+ mov v17.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
SIGN_UP_CHANGE_DONE_WD_16_HT_4:
LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
- cmhi v22.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
- cmhi v24.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v24.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
mov v20.d[1],v20.d[0]
- NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down)
+ NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down)
TBL v26.16b, {v20.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
//TBL v27.8b, {v20.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- EXT v14.16b, v14.16b , v14.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 2)
+ EXT v17.16b, v17.16b , v17.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 2)
- Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
- AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
mov v27.d[0],v26.d[1]
UZP1 v31.8b, v26.8b, v27.8b
@@ -902,13 +902,13 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4:
ZIP2 v25.8b, v24.8b, v25.8b
mov v24.8b,v31.8b
- Uxtl2 v30.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ Uxtl2 v30.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
- mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row
+ mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row
SADDW v30.8h, v30.8h , v25.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
SMAX v30.8h, v30.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
@@ -949,24 +949,24 @@ WIDTH_RESIDUE:
LDRB w11,[x5,#1] //pu1_avail[1]
LDRB w9,[x5,#2] //pu1_avail[2]
- mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
CMP x9,#0
SUB x20,x0,x1 //pu1_src - src_strd
csel x10, x20, x10,EQ
- mov v8.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
csel x10, x3, x10,NE
ADD x10,x10,#2 //pu1_src - src_strd + 2
- mov v8.8b[6], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.8b[6], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
ADD x5,sp,#0x4B //*au1_src_left_tmp
mov w4, w25 //Loads ht
- mov v8.8b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.8b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
mov w7, w24 //Loads wd
mov x8, x26 //Loads *pu1_src
- LD1 {v10.16b},[x10] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+ LD1 {v3.16b},[x10] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
//LD1 {v11.8b},[x10] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
//SUB x10, x10,#8
SUB x7,x7,#2 //(wd - 2)
@@ -980,15 +980,15 @@ AU1_SRC_LEFT_LOOP_RESIDUE:
SUBS x4,x4,#1 //decrement the loop count
BNE AU1_SRC_LEFT_LOOP_RESIDUE
- LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
//LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
//SUB x0, x0,#8
movi v18.16b, #0
- cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
- cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
- SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
MOV x7,x12 //row count, move ht_tmp to x7
PU1_SRC_LOOP_RESIDUE:
@@ -1047,33 +1047,33 @@ SIGN_UP_CHANGE_RESIDUE:
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
CMP x10,#0
- mov v14.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
+ mov v17.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
movn x20,#0
csel x10, x20, x10,LT
MOV x20,#1
csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
- mov v14.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
+ mov v17.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
SIGN_UP_CHANGE_DONE_RESIDUE:
LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
- cmhi v22.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
- cmhi v24.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v24.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
mov v20.d[1],v20.d[0]
- NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down)
+ NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down)
TBL v26.16b, {v20.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
//TBL v27.8b, {v20.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
- EXT v14.16b, v14.16b , v14.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 14)
+ EXT v17.16b, v17.16b , v17.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 14)
- Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
- AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
mov v27.d[0],v26.d[1]
UZP1 v31.8b, v26.8b, v27.8b
@@ -1085,7 +1085,7 @@ SIGN_UP_CHANGE_DONE_RESIDUE:
ZIP2 v25.8b, v24.8b, v25.8b
mov v24.8b,v31.8b
- mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row
+ mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row
SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
@@ -1148,7 +1148,7 @@ END_LOOPS:
ldp x23, x24,[sp],#16
ldp x21, x22,[sp],#16
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_weighted_pred_bi.s b/common/arm64/ihevc_weighted_pred_bi.s
index 6851cb4..c0508d8 100644
--- a/common/arm64/ihevc_weighted_pred_bi.s
+++ b/common/arm64/ihevc_weighted_pred_bi.s
@@ -161,7 +161,7 @@ ihevc_weighted_pred_bi_av8:
sxtw x11,w11
sxtw x12,w12
- push_v_regs
+
stp x19, x20,[sp,#-16]!
stp x21, x22,[sp,#-16]!
stp x23, x24,[sp,#-16]!
@@ -221,64 +221,64 @@ core_loop:
ld1 {v1.4h},[x1],#8 //load and increment the pi2_src2
smull v4.4s, v0.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0)
ld1 {v2.4h},[x6],x3 //load and increment the pi2_src_tmp1 ii iteration
- smull v8.4s, v1.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
+ smull v5.4s, v1.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
ld1 {v3.4h},[x8],x4 //load and increment the pi2_src_tmp1 ii iteration
- add v4.4s, v4.4s , v8.4s //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)
+ add v4.4s, v4.4s , v5.4s //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)
ld1 {v0.4h},[x6],x3 //load and increment the pi2_src1 iii iteration
- smull v10.4s, v2.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration
+ smull v6.4s, v2.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration
ld1 {v1.4h},[x8],x4 //load and increment the pi2_src2 iii iteration
add v4.4s, v4.4s , v30.4s //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
- smull v14.4s, v0.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration
+ smull v19.4s, v0.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration
ld1 {v2.4h},[x6],x3 //load and increment the pi2_src_tmp1 iv iteration
- smull v12.4s, v3.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
+ smull v17.4s, v3.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
sshl v4.4s,v4.4s,v28.4s //vshlq_s32(i4_tmp1_t1, tmp_shift_t)
ld1 {v3.4h},[x8],x4 //load and increment the pi2_src_tmp1 iv iteration
- add v10.4s, v10.4s , v12.4s //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration
+ add v6.4s, v6.4s , v17.4s //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration
sqxtun v4.4h, v4.4s //vqmovun_s32(sto_res_tmp1)
smull v16.4s, v1.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration
- add v10.4s, v10.4s , v30.4s //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
+ add v6.4s, v6.4s , v30.4s //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
//mov v5, v4 //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
- add v14.4s, v14.4s , v16.4s //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) iii iteration
+ add v19.4s, v19.4s , v16.4s //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) iii iteration
- sshl v10.4s,v10.4s,v28.4s
+ sshl v6.4s,v6.4s,v28.4s
//vshl.s32 q5,q5,q14 //vshlq_s32(i4_tmp2_t1, tmp_shift_t) ii iteration
smull v18.4s, v2.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration
uqxtn v4.8b,v4.8h
//vqmovn.u16 d4,q2 //vqmovn_u16(sto_res_tmp3)
- add v14.4s, v14.4s , v30.4s //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
+ add v19.4s, v19.4s , v30.4s //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
- sqxtun v10.4h, v10.4s //vqmovun_s32(sto_res_tmp1) ii iteration
+ sqxtun v6.4h, v6.4s //vqmovun_s32(sto_res_tmp1) ii iteration
smull v20.4s, v3.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration
- sshl v14.4s,v14.4s,v28.4s
+ sshl v19.4s,v19.4s,v28.4s
//vshl.s32 q7,q7,q14 //vshlq_s32(i4_tmp1_t1, tmp_shift_t) iii iteration
//mov v11, v10 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration
add v18.4s, v18.4s , v20.4s //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
- sqxtun v14.4h, v14.4s //vqmovun_s32(sto_res_tmp1) iii iteration
+ sqxtun v19.4h, v19.4s //vqmovun_s32(sto_res_tmp1) iii iteration
add v18.4s, v18.4s , v30.4s //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
st1 {v4.s}[0],[x2],#4 //store pu1_dst i iteration
- uqxtn v10.8b,v10.8h
+ uqxtn v6.8b,v6.8h
//vqmovn.u16 d10,q5 //vqmovn_u16(sto_res_tmp3) ii iteration
sshl v18.4s,v18.4s,v28.4s
//vshl.s32 q9,q9,q14 //vshlq_s32(i4_tmp2_t1, tmp_shift_t) iv iteration
- st1 {v10.s}[0],[x10],x5 //store pu1_dst ii iteration
+ st1 {v6.s}[0],[x10],x5 //store pu1_dst ii iteration
//mov v15, v14 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
- uqxtn v14.8b,v14.8h
+ uqxtn v19.8b,v19.8h
//vqmovn.u16 d14,q7 //vqmovn_u16(sto_res_tmp3) iii iteration
sqxtun v18.4h, v18.4s //vqmovun_s32(sto_res_tmp1) iv iteration
//mov v19, v18 //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
- st1 {v14.s}[0],[x10],x5 //store pu1_dst iii iteration
+ st1 {v19.s}[0],[x10],x5 //store pu1_dst iii iteration
uqxtn v18.8b,v18.8h
//vqmovn.u16 d18,q9 //vqmovn_u16(sto_res_tmp3) iv iteration
subs x7,x7,#4 //decrement wd by 4 and check for 0
@@ -306,7 +306,7 @@ end_loops:
ldp x23, x24,[sp],#16
ldp x21, x22,[sp],#16
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_weighted_pred_bi_default.s b/common/arm64/ihevc_weighted_pred_bi_default.s
index 07fb4ce..d98e025 100644
--- a/common/arm64/ihevc_weighted_pred_bi_default.s
+++ b/common/arm64/ihevc_weighted_pred_bi_default.s
@@ -122,7 +122,7 @@ ihevc_weighted_pred_bi_default_av8:
ldr w9,[sp,#8]
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
stp x21, x22,[sp,#-16]!
@@ -195,11 +195,11 @@ core_loop_4:
ld1 {v6.4h},[x0],#8 //load and increment the pi2_src1
add x14,x2,x5 //pu1_dst_tmp = pu1_dst + dst_strd
ld1 {v7.4h},[x1],#8 //load and increment the pi2_src2
- ld1 {v8.4h},[x11],x3 //load and increment the pi2_src1 ii iteration
+ ld1 {v1.4h},[x11],x3 //load and increment the pi2_src1 ii iteration
sqadd v18.4h,v6.4h,v7.4h
sqadd v18.4h,v18.4h,v0.4h //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
- ld1 {v9.4h},[x12],x4 //load and increment the pi2_src2 ii iteration
- sqadd v20.4h,v8.4h,v9.4h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+ ld1 {v3.4h},[x12],x4 //load and increment the pi2_src2 ii iteration
+ sqadd v20.4h,v1.4h,v3.4h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
sqadd v19.4h,v20.4h,v0.4h //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
mov v18.d[1],v19.d[0]
sqshrun v20.8b, v18.8h,#7
@@ -250,11 +250,11 @@ core_loop_chroma_4x2:
ld1 {v6.4h},[x0],#8 //load and increment the pi2_src1
add x14,x2,x5 //pu1_dst_tmp = pu1_dst + dst_strd
ld1 {v7.4h},[x1],#8 //load and increment the pi2_src2
- ld1 {v8.4h},[x11],x3 //load and increment the pi2_src1 ii iteration
+ ld1 {v1.4h},[x11],x3 //load and increment the pi2_src1 ii iteration
sqadd v18.4h,v6.4h,v7.4h
sqadd v18.4h,v18.4h,v0.4h //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
- ld1 {v9.4h},[x12],x4 //load and increment the pi2_src2 ii iteration
- sqadd v20.4h,v8.4h,v9.4h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+ ld1 {v3.4h},[x12],x4 //load and increment the pi2_src2 ii iteration
+ sqadd v20.4h,v1.4h,v3.4h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
sqadd v19.4h,v20.4h,v0.4h //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
mov v18.d[1],v19.d[0]
sqshrun v20.8b, v18.8h,#7
@@ -301,17 +301,17 @@ core_loop_8:
ld1 { v18.8h},[x12],x4 //load and increment the pi2_src2 iii iteration
sqadd v22.8h,v22.8h,v0.8h //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
sqshrun v20.8b, v24.8h,#7
- ld1 { v12.8h},[x11],x3 //load and increment the pi2_src1 iv iteration
+ ld1 { v17.8h},[x11],x3 //load and increment the pi2_src1 iv iteration
sqadd v30.8h,v16.8h,v18.8h
sqshrun v21.8b, v22.8h,#7
- ld1 { v14.8h},[x12],x4 //load and increment the pi2_src2 iv iteration
+ ld1 { v29.8h},[x12],x4 //load and increment the pi2_src2 iv iteration
sqadd v30.8h,v30.8h,v0.8h //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
st1 {v20.2s},[x2],#8 //store pu1_dst i iteration
- sqadd v8.8h,v12.8h,v14.8h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
+ sqadd v1.8h,v17.8h,v29.8h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
st1 {v21.2s},[x14],x5 //store pu1_dst ii iteration
- sqadd v8.8h,v8.8h,v0.8h
+ sqadd v1.8h,v1.8h,v0.8h
sqshrun v30.8b, v30.8h,#7
- sqshrun v31.8b, v8.8h,#7
+ sqshrun v31.8b, v1.8h,#7
add x11,x0,x3 //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
add x12,x1,x4 //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
st1 {v30.2s},[x14],x5 //store pu1_dst iii iteration //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
@@ -413,40 +413,40 @@ prolog_16:
ld1 { v2.8h},[x0],#16 //load and increment the pi2_src1
ld1 { v4.8h},[x1],#16 //load and increment the pi2_src2
- ld1 { v10.8h},[x0],x11 //load and increment the pi2_src1
- ld1 { v12.8h},[x1],x11 //load and increment the pi2_src2
+ ld1 { v5.8h},[x0],x11 //load and increment the pi2_src1
+ ld1 { v17.8h},[x1],x11 //load and increment the pi2_src2
ld1 { v6.8h},[x0],#16 //load and increment the pi2_src1 ii iteration
subs x9,x9,#16
- ld1 { v8.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
+ ld1 { v1.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
sub x20,x8,#2
csel x8, x20, x8,eq
sqadd v22.8h,v2.8h,v4.8h
- ld1 { v14.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
- sqadd v28.8h,v10.8h,v12.8h
+ ld1 { v29.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
+ sqadd v28.8h,v5.8h,v17.8h
ld1 { v16.8h},[x1],x12 //load and increment the pi2_src2 ii iteration
add x20,x0,x7
csel x0, x20, x0,eq
add x20,x1,x7
csel x1, x20, x1,eq
- sqadd v24.8h,v6.8h,v8.8h
+ sqadd v24.8h,v6.8h,v1.8h
ld1 { v2.8h},[x0],#16
- sqadd v26.8h,v14.8h,v16.8h
+ sqadd v26.8h,v29.8h,v16.8h
// if the input is chroma with 8x2 block size
cmp x8,#0
beq epilog_16
ld1 { v4.8h},[x1],#16 //load and increment the pi2_src2
sqadd v22.8h,v22.8h,v0.8h
- ld1 { v10.8h},[x0],x11 //load and increment the pi2_src1
+ ld1 { v5.8h},[x0],x11 //load and increment the pi2_src1
sqadd v28.8h,v28.8h,v0.8h
- ld1 { v12.8h},[x1],x11 //load and increment the pi2_src2
+ ld1 { v17.8h},[x1],x11 //load and increment the pi2_src2
sqadd v24.8h,v24.8h,v0.8h
ld1 { v6.8h},[x0],#16 //load and increment the pi2_src1 ii iteration
sqadd v30.8h,v26.8h,v0.8h
sqshrun v20.8b, v22.8h,#7
- ld1 { v8.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
+ ld1 { v1.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
sqshrun v21.8b, v28.8h,#7
- ld1 { v14.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
+ ld1 { v29.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
sqshrun v26.8b, v24.8h,#7
ld1 { v16.8h},[x1],x12 //load and increment the pi2_src2 ii iteration
sqshrun v27.8b, v30.8h,#7
@@ -463,15 +463,15 @@ core_loop_16:
mov v20.d[1],v21.d[0]
mov v26.d[1],v27.d[0]
st1 { v20.4s},[x2],x5
- sqadd v28.8h,v10.8h,v12.8h
+ sqadd v28.8h,v5.8h,v17.8h
st1 { v26.4s},[x2],x10
add x20,x2,x14
csel x2, x20, x2,eq
- sqadd v24.8h,v6.8h,v8.8h
+ sqadd v24.8h,v6.8h,v1.8h
subs x9,x9,#16
add x20,x0,x7
csel x0, x20, x0,eq
- sqadd v26.8h,v14.8h,v16.8h
+ sqadd v26.8h,v29.8h,v16.8h
add x20,x1,x7
csel x1, x20, x1,eq
@@ -487,15 +487,15 @@ core_loop_16:
sqadd v28.8h,v28.8h,v0.8h
ld1 { v4.8h},[x1],#16 //load and increment the pi2_src2
sqadd v24.8h,v24.8h,v0.8h
- ld1 { v10.8h},[x0],x11 //load and increment the pi2_src1
+ ld1 { v5.8h},[x0],x11 //load and increment the pi2_src1
sqadd v30.8h,v26.8h,v0.8h
- ld1 { v12.8h},[x1],x11 //load and increment the pi2_src2
+ ld1 { v17.8h},[x1],x11 //load and increment the pi2_src2
sqshrun v20.8b, v22.8h,#7
ld1 { v6.8h},[x0],#16 //load and increment the pi2_src1 ii iteration
sqshrun v21.8b, v28.8h,#7
- ld1 { v8.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
+ ld1 { v1.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
sqshrun v26.8b, v24.8h,#7
- ld1 { v14.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
+ ld1 { v29.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
sqshrun v27.8b, v30.8h,#7
ld1 { v16.8h},[x1],x12 //load and increment the pi2_src2 ii iteration
@@ -533,7 +533,7 @@ end_loops:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x21, x22,[sp],#16
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
diff --git a/common/arm64/ihevc_weighted_pred_uni.s b/common/arm64/ihevc_weighted_pred_uni.s
index d805230..5586679 100644
--- a/common/arm64/ihevc_weighted_pred_uni.s
+++ b/common/arm64/ihevc_weighted_pred_uni.s
@@ -129,7 +129,7 @@ ihevc_weighted_pred_uni_av8:
ldr w9,[sp,#8]
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
stp x19, x20,[sp,#-16]!
stp x21, x22,[sp,#-16]!
@@ -175,37 +175,37 @@ core_loop:
smull v4.4s, v1.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0)
add v4.4s, v4.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t)
- ld1 {v8.4h},[x5],x2 //load and increment the pi2_src iii iteration
+ ld1 {v3.4h},[x5],x2 //load and increment the pi2_src iii iteration
smull v6.4s, v2.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) ii iteration
- ld1 {v9.4h},[x5],x2 //load and increment the pi2_src_tmp iv iteration
+ ld1 {v5.4h},[x5],x2 //load and increment the pi2_src_tmp iv iteration
sshl v4.4s,v4.4s,v28.4s
//vshl.s32 q2,q2,q14 //vshlq_s32(i4_tmp1_t, tmp_shift_t)
add v6.4s, v6.4s , v30.4s //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) ii iteration
- smull v10.4s, v8.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration
+ smull v7.4s, v3.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration
sqxtun v4.4h, v4.4s //vqmovun_s32(sto_res_tmp1)
- add v10.4s, v10.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration
+ add v7.4s, v7.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration
//mov v5, v4 //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
sshl v6.4s,v6.4s,v28.4s
//vshl.s32 q3,q3,q14 //vshlq_s32(i4_tmp2_t, tmp_shift_t) ii iteration
- smull v12.4s, v9.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration
+ smull v16.4s, v5.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration
uqxtn v4.8b, v4.8h //vqmovn_u16(sto_res_tmp3)
- sshl v10.4s,v10.4s,v28.4s
+ sshl v7.4s,v7.4s,v28.4s
//vshl.s32 q5,q5,q14 //vshlq_s32(i4_tmp1_t, tmp_shift_t) iii iteration
sqxtun v6.4h, v6.4s //vqmovun_s32(sto_res_tmp1) ii iteration
- add v12.4s, v12.4s , v30.4s //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) iv iteration
+ add v16.4s, v16.4s , v30.4s //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) iv iteration
//mov v7, v6 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration
- sqxtun v10.4h, v10.4s //vqmovun_s32(sto_res_tmp1) iii iteration
+ sqxtun v7.4h, v7.4s //vqmovun_s32(sto_res_tmp1) iii iteration
- sshl v12.4s,v12.4s,v28.4s
+ sshl v16.4s,v16.4s,v28.4s
//vshl.s32 q6,q6,q14 //vshlq_s32(i4_tmp2_t, tmp_shift_t) iv iteration
st1 {v4.s}[0],[x1],#4 //store pu1_dst i iteration
//mov v11, v10 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
@@ -213,15 +213,15 @@ core_loop:
uqxtn v6.8b, v6.8h //vqmovn_u16(sto_res_tmp3) ii iteration
st1 {v6.s}[0],[x6],x3 //store pu1_dst ii iteration
- uqxtn v10.8b, v10.8h //vqmovn_u16(sto_res_tmp3) iii iteration
- sqxtun v12.4h, v12.4s //vqmovun_s32(sto_res_tmp1) iv iteration
+ uqxtn v7.8b, v7.8h //vqmovn_u16(sto_res_tmp3) iii iteration
+ sqxtun v16.4h, v16.4s //vqmovun_s32(sto_res_tmp1) iv iteration
//mov v13, v12 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iv iteration
- st1 {v10.s}[0],[x6],x3 //store pu1_dst i iteration iii iteration
- uqxtn v12.8b, v12.8h //vqmovn_u16(sto_res_tmp3) iv iteration
+ st1 {v7.s}[0],[x6],x3 //store pu1_dst i iteration iii iteration
+ uqxtn v16.8b, v16.8h //vqmovn_u16(sto_res_tmp3) iv iteration
subs x9,x9,#4 //decrement wd by 4 and check for 0
- st1 {v12.s}[0],[x6],x3 //store pu1_dst iv iteration
+ st1 {v16.s}[0],[x6],x3 //store pu1_dst iv iteration
bgt core_loop //if greater than 0 repeat the core loop again
end_core_loop:
@@ -239,7 +239,7 @@ end_loops:
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x21, x22,[sp],#16
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret
diff --git a/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
index 485ee66..a6041f5 100644
--- a/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
+++ b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
@@ -91,7 +91,10 @@ ihevcd_fmt_conv_420sp_to_rgba8888_av8:
//// push the registers on the stack
// STMFD sp!,{x4-x12,x14}
- push_v_regs
+
+ stp d12,d14,[sp,#-16]!
+ stp d8,d15,[sp,#-16]! // Storing d15 using { sub sp,sp,#8; str d15,[sp] } is giving bus error.
+ // d8 is used as dummy register and stored along with d15 using stp. d8 is not used in the function.
stp x19, x20,[sp,#-16]!
@@ -194,8 +197,8 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
prfm PLDL1KEEP,[x1]
////NEED TO MULTIPLY WITH Q2,Q3 WITH COEFFICIENTS
- sMULL v8.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
- sMULL2 v10.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
+ sMULL v5.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
+ sMULL2 v7.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
sMULL v20.4s, v6.4h, v0.4h[0] ////(V-128)*C1 FOR R
sMULL2 v22.4s, v6.8h, v0.4h[0] ////(V-128)*C1 FOR R
@@ -206,13 +209,13 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
sMLAL2 v14.4s, v6.8h, v0.4h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
////NARROW RIGHT SHIFT BY 13 FOR R&B
- sqshrn v8.4h, v8.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
- sqshrn2 v8.8h, v10.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
+ sqshrn v5.4h, v5.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
+ sqshrn2 v5.8h, v7.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
////Q4 - WEIGHT FOR B
////NARROW RIGHT SHIFT BY 13 FOR R&B
- sqshrn v10.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
- sqshrn2 v10.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
+ sqshrn v7.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
+ sqshrn2 v7.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
////Q5 - WEIGHT FOR R
////NARROW RIGHT SHIFT BY 13 FOR G
@@ -220,12 +223,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
sqshrn2 v12.8h, v14.4s,#13 ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
////Q6 - WEIGHT FOR G
- UADDW v14.8h, v8.8h , v30.8b ////Q7 - HAS Y + B
- UADDW v16.8h, v10.8h , v30.8b ////Q8 - HAS Y + R
+ UADDW v14.8h, v5.8h , v30.8b ////Q7 - HAS Y + B
+ UADDW v16.8h, v7.8h , v30.8b ////Q8 - HAS Y + R
UADDW v18.8h, v12.8h , v30.8b ////Q9 - HAS Y + G
- UADDW v20.8h, v8.8h , v31.8b ////Q10 - HAS Y + B
- UADDW v22.8h, v10.8h , v31.8b ////Q11 - HAS Y + R
+ UADDW v20.8h, v5.8h , v31.8b ////Q10 - HAS Y + B
+ UADDW v22.8h, v7.8h , v31.8b ////Q11 - HAS Y + R
UADDW v24.8h, v12.8h , v31.8b ////Q12 - HAS Y + G
sqxtun v14.8b, v14.8h
@@ -276,12 +279,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
////D14-D20 - TOTALLY HAVE 16 VALUES
////WE NEED TO SHIFT R,G,B VALUES TO GET 5BIT,6BIT AND 5BIT COMBINATIONS
- UADDW v14.8h, v8.8h , v28.8b ////Q7 - HAS Y + B
- UADDW v16.8h, v10.8h , v28.8b ////Q2 - HAS Y + R
+ UADDW v14.8h, v5.8h , v28.8b ////Q7 - HAS Y + B
+ UADDW v16.8h, v7.8h , v28.8b ////Q2 - HAS Y + R
UADDW v18.8h, v12.8h , v28.8b ////Q3 - HAS Y + G
- UADDW v20.8h, v8.8h , v29.8b ////Q10 - HAS Y + B
- UADDW v22.8h, v10.8h , v29.8b ////Q11 - HAS Y + R
+ UADDW v20.8h, v5.8h , v29.8b ////Q10 - HAS Y + B
+ UADDW v22.8h, v7.8h , v29.8b ////Q11 - HAS Y + R
UADDW v24.8h, v12.8h , v29.8b ////Q12 - HAS Y + G
////COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
@@ -357,8 +360,8 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
////NEED TO MULTIPLY WITH Q2,Q3 WITH COEFFICIENTS
- sMULL v8.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
- sMULL2 v10.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
+ sMULL v5.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
+ sMULL2 v7.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
sMULL v20.4s, v6.4h, v0.4h[0] ////(V-128)*C1 FOR R
sMULL2 v22.4s, v6.8h, v0.4h[0] ////(V-128)*C1 FOR R
@@ -369,13 +372,13 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
sMLAL2 v14.4s, v6.8h, v0.4h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
////NARROW RIGHT SHIFT BY 13 FOR R&B
- sqshrn v8.4h, v8.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
- sqshrn2 v8.8h, v10.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
+ sqshrn v5.4h, v5.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
+ sqshrn2 v5.8h, v7.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
////Q4 - WEIGHT FOR B
////NARROW RIGHT SHIFT BY 13 FOR R&B
- sqshrn v10.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
- sqshrn2 v10.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
+ sqshrn v7.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
+ sqshrn2 v7.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
////Q5 - WEIGHT FOR R
////NARROW RIGHT SHIFT BY 13 FOR G
@@ -383,12 +386,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
sqshrn2 v12.8h, v14.4s,#13 ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
////Q6 - WEIGHT FOR G
- UADDW v14.8h, v8.8h , v30.8b ////Q7 - HAS Y + B
- UADDW v16.8h, v10.8h , v30.8b ////Q8 - HAS Y + R
+ UADDW v14.8h, v5.8h , v30.8b ////Q7 - HAS Y + B
+ UADDW v16.8h, v7.8h , v30.8b ////Q8 - HAS Y + R
UADDW v18.8h, v12.8h , v30.8b ////Q9 - HAS Y + G
- UADDW v20.8h, v8.8h , v31.8b ////Q10 - HAS Y + B
- UADDW v22.8h, v10.8h , v31.8b ////Q11 - HAS Y + R
+ UADDW v20.8h, v5.8h , v31.8b ////Q10 - HAS Y + B
+ UADDW v22.8h, v7.8h , v31.8b ////Q11 - HAS Y + R
UADDW v24.8h, v12.8h , v31.8b ////Q12 - HAS Y + G
sqxtun v14.8b, v14.8h
@@ -439,12 +442,12 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
////D14-D20 - TOTALLY HAVE 16 VALUES
////WE NEED TO SHIFT R,G,B VALUES TO GET 5BIT,6BIT AND 5BIT COMBINATIONS
- UADDW v14.8h, v8.8h , v28.8b ////Q7 - HAS Y + B
- UADDW v16.8h, v10.8h , v28.8b ////Q2 - HAS Y + R
+ UADDW v14.8h, v5.8h , v28.8b ////Q7 - HAS Y + B
+ UADDW v16.8h, v7.8h , v28.8b ////Q2 - HAS Y + R
UADDW v18.8h, v12.8h , v28.8b ////Q3 - HAS Y + G
- UADDW v20.8h, v8.8h , v29.8b ////Q10 - HAS Y + B
- UADDW v22.8h, v10.8h , v29.8b ////Q11 - HAS Y + R
+ UADDW v20.8h, v5.8h , v29.8b ////Q10 - HAS Y + B
+ UADDW v22.8h, v7.8h , v29.8b ////Q11 - HAS Y + R
UADDW v24.8h, v12.8h , v29.8b ////Q12 - HAS Y + G
sqxtun v14.8b, v14.8h
@@ -513,7 +516,9 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
////POP THE REGISTERS
// LDMFD sp!,{x4-x12,PC}
ldp x19, x20,[sp],#16
- pop_v_regs
+ ldp d8,d15,[sp],#16 // Loading d15 using { ldr d15,[sp]; add sp,sp,#8 } is giving bus error.
+ // d8 is used as dummy register and loaded along with d15 using ldp. d8 is not used in the function.
+ ldp d12,d14,[sp],#16
ret
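
ihevcd_fmt_conv_420sp_to_rgba8888.s is the one file in this section that cannot drop the SIMD saves entirely: it still needs d12, d14 and d15, so the generic macros are replaced with explicit stp/ldp pairs, and the in-line comments note that saving d15 on its own with { sub sp,sp,#8; str d15,[sp] } produced a bus error. A plausible reason (an assumption, not stated in the patch) is the AArch64 stack-pointer alignment check: SP must remain 16-byte aligned whenever it is used as the base of a load or store, so an 8-byte adjustment followed by an SP-relative store faults, while pairing d15 with the otherwise unused d8 keeps every adjustment a multiple of 16:

    // faults if SP alignment checking is enabled: SP is left only 8-byte aligned
    sub     sp, sp, #8
    str     d15, [sp]

    // workaround used above: a dummy register keeps SP 16-byte aligned
    stp     d8, d15, [sp, #-16]!
    //      ... function body ...
    ldp     d8, d15, [sp], #16
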
diff --git a/decoder/arm64/ihevcd_itrans_recon_dc_luma.s b/decoder/arm64/ihevcd_itrans_recon_dc_luma.s
index 279888b..edc70e7 100644
--- a/decoder/arm64/ihevcd_itrans_recon_dc_luma.s
+++ b/decoder/arm64/ihevcd_itrans_recon_dc_luma.s
@@ -59,7 +59,7 @@ ihevcd_itrans_recon_dc_luma_av8:
- push_v_regs
+
stp x19, x20,[sp,#-16]!
sxth x5,w5
@@ -120,8 +120,8 @@ col_loop:
ld1 {v6.8b},[x7],x2
ld1 {v7.8b},[x7],x2
- ld1 {v8.8b},[x7],x2
- ld1 {v9.8b},[x7]
+ ld1 {v1.8b},[x7],x2
+ ld1 {v17.8b},[x7]
add x0,x0,#8
@@ -132,8 +132,8 @@ col_loop:
uaddw v24.8h, v0.8h , v5.8b
uaddw v22.8h, v0.8h , v6.8b
uaddw v20.8h, v0.8h , v7.8b
- uaddw v18.8h, v0.8h , v8.8b
- uaddw v16.8h, v0.8h , v9.8b
+ uaddw v18.8h, v0.8h , v1.8b
+ uaddw v16.8h, v0.8h , v17.8b
mov x11,x1
sqxtun v2.8b, v30.8h
@@ -142,8 +142,8 @@ col_loop:
sqxtun v5.8b, v24.8h
sqxtun v6.8b, v22.8h
sqxtun v7.8b, v20.8h
- sqxtun v8.8b, v18.8h
- sqxtun v9.8b, v16.8h
+ sqxtun v1.8b, v18.8h
+ sqxtun v17.8b, v16.8h
st1 {v2.2s},[x11],x3
@@ -152,8 +152,8 @@ col_loop:
st1 {v5.2s},[x11],x3
st1 {v6.2s},[x11],x3
st1 {v7.2s},[x11],x3
- st1 {v8.2s},[x11],x3
- st1 {v9.2s},[x11]
+ st1 {v1.2s},[x11],x3
+ st1 {v17.2s},[x11]
add x1,x1,#8
@@ -206,7 +206,7 @@ col_loop_4:
end_loops:
ldp x19, x20,[sp],#16
- pop_v_regs
+
ret