summary refs log tree commit diff stats
path: root/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
diff options
context:
space:
mode:
Diffstat (limited to 'common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s')
-rw-r--r-- common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s 29
1 files changed, 17 insertions, 12 deletions
diff --git a/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s b/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
index fe7ac11..9b59d58 100644
--- a/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
+++ b/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
@@ -104,7 +104,10 @@
ihevc_intra_pred_luma_mode_19_to_25_av8:
// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
- push_v_regs
+
+ stp d9,d10,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
stp x19, x20,[sp,#-16]!
adrp x7, :got:gai4_ihevc_ang_table
@@ -267,7 +270,7 @@ prologue:
add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx]
- ld1 {v8.8b},[x10],x11 //(i row)ref_main_idx
+ ld1 {v23.8b},[x10],x11 //(i row)ref_main_idx
sbfx x9,x14,#8,#8
ld1 {v9.8b},[x10] //(i row)ref_main_idx_1
@@ -278,7 +281,7 @@ prologue:
add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx]
ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx
- umull v10.8h, v8.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v10.8h, v23.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
ld1 {v13.8b},[x12] //(ii)ref_main_idx_1
umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
@@ -316,7 +319,7 @@ prologue:
dup v29.8b, v4.8b[5] //(vi)
add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
- ld1 {v8.8b},[x10],x11 //(v)ref_main_idx
+ ld1 {v23.8b},[x10],x11 //(v)ref_main_idx
sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract)
umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
@@ -336,7 +339,7 @@ prologue:
add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx]
ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx
- umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
@@ -392,7 +395,7 @@ kernel_8_rows:
subs x4,x4,#8
sbfx x9,x14,#8,#8
- ld1 {v8.8b},[x10],x11 //(i)ref_main_idx
+ ld1 {v23.8b},[x10],x11 //(i)ref_main_idx
sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract)
add x20,x6,#8 //increment the row value
@@ -416,7 +419,7 @@ kernel_8_rows:
add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx]
ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx
- umull v10.8h, v8.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v10.8h, v23.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
ld1 {v13.8b},[x12] //(ii)ref_main_idx_1
umlal v10.8h, v9.8b, v31.8b //(i)vmull_u8(ref_main_idx_1, dup_const_fract)
@@ -477,7 +480,7 @@ kernel_8_rows:
dup v25.8b, v4.8b[7] //(viii)
rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
- ld1 {v8.8b},[x10],x11 //(v)ref_main_idx
+ ld1 {v23.8b},[x10],x11 //(v)ref_main_idx
and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
ld1 {v9.8b},[x10] //(v)ref_main_idx_1
@@ -493,7 +496,7 @@ kernel_8_rows:
sub v30.8b, v1.8b , v31.8b //(v)32-fract(dup_const_32_fract)
ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
- umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
umov w14, v3.2s[0] //(i)extract idx to the r register
sxtw x14,w14
@@ -592,7 +595,7 @@ core_loop_4:
dup v7.8b,w4 //dup_const_32_fract
umlal v4.8h, v3.8b, v0.8b //vmull_u8(ref_main_idx_1, dup_const_fract)
- ld1 {v8.s}[0],[x10] //ref_main_idx
+ ld1 {v23.s}[0],[x10] //ref_main_idx
add x8,x8,#1
ld1 {v9.s}[0],[x11] //ref_main_idx_1
@@ -607,7 +610,7 @@ core_loop_4:
add x11,x10,#1 //pu1_ref_main_idx_1 += 1
dup v12.8b,w5 //dup_const_fract
- umull v10.8h, v8.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract)
+ umull v10.8h, v23.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract)
sub x20,x5,#32
neg x4, x20
@@ -655,7 +658,9 @@ end_loops:
add sp, sp, #132
// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
ldp x19, x20,[sp],#16
- pop_v_regs
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d9,d10,[sp],#16
ret