summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author	Rakesh Kumar <rakesh.kumar@ittiam.com>	2017-10-09 17:14:42 +0530
committer	MSe <mse1969@posteo.de>	2018-01-10 20:56:47 +0100
commit53b4d3b3ff61f79a095bf762b460d674270597bc (patch)
tree0539daa1404cf5669fd2bd053a87460accf50d41
parent6a73e1e1443309687799cff7c958ee2386ca19ae (diff)
downloadandroid_external_libhevc-53b4d3b3ff61f79a095bf762b460d674270597bc.tar.gz
android_external_libhevc-53b4d3b3ff61f79a095bf762b460d674270597bc.tar.bz2
android_external_libhevc-53b4d3b3ff61f79a095bf762b460d674270597bc.zip
Add PUSH-POP of D registers in Arm Neon 32 bit functions

According to ARM calling conventions, D8-D15 are callee-saved registers and
hence have to be pushed before being used as scratch. Added push/pop in
inter_pred, intra_pred, deblk_luma, itrans, itrans_recon, sao and
weighted_pred ARM NEON 32 bit functions.

Bug: 68320413
Test: Tested hevcdec
Change-Id: I71f8868ac4205b0a3680d7ce5b82511653e9c747
(cherry picked from commit a47cb8865a33a87f163d87781f417884d30d46ed)
CVE-2017-13177
-rw-r--r--common/arm/ihevc_deblk_chroma_horz.s16
-rw-r--r--common/arm/ihevc_deblk_chroma_vert.s16
-rw-r--r--common/arm/ihevc_deblk_luma_horz.s25
-rw-r--r--common/arm/ihevc_deblk_luma_vert.s21
-rw-r--r--common/arm/ihevc_inter_pred_chroma_copy.s7
-rw-r--r--common/arm/ihevc_inter_pred_chroma_copy_w16out.s13
-rw-r--r--common/arm/ihevc_inter_pred_chroma_horz.s12
-rw-r--r--common/arm/ihevc_inter_pred_chroma_horz_w16out.s13
-rw-r--r--common/arm/ihevc_inter_pred_chroma_vert.s13
-rw-r--r--common/arm/ihevc_inter_pred_chroma_vert_w16inp.s13
-rw-r--r--common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s13
-rw-r--r--common/arm/ihevc_inter_pred_chroma_vert_w16out.s12
-rw-r--r--common/arm/ihevc_inter_pred_filters_luma_horz.s25
-rw-r--r--common/arm/ihevc_inter_pred_filters_luma_vert.s30
-rw-r--r--common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s12
-rw-r--r--common/arm/ihevc_inter_pred_luma_copy.s12
-rw-r--r--common/arm/ihevc_inter_pred_luma_copy_w16out.s11
-rw-r--r--common/arm/ihevc_inter_pred_luma_horz_w16out.s35
-rw-r--r--common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s12
-rw-r--r--common/arm/ihevc_intra_pred_chroma_dc.s4
-rw-r--r--common/arm/ihevc_intra_pred_chroma_horz.s9
-rw-r--r--common/arm/ihevc_intra_pred_chroma_mode2.s8
-rw-r--r--common/arm/ihevc_intra_pred_chroma_mode_18_34.s12
-rw-r--r--common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s9
-rw-r--r--common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s11
-rw-r--r--common/arm/ihevc_intra_pred_chroma_planar.s8
-rw-r--r--common/arm/ihevc_intra_pred_chroma_ver.s4
-rw-r--r--common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s13
-rw-r--r--common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s13
-rw-r--r--common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s14
-rw-r--r--common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s13
-rw-r--r--common/arm/ihevc_intra_pred_luma_dc.s9
-rw-r--r--common/arm/ihevc_intra_pred_luma_horz.s13
-rw-r--r--common/arm/ihevc_intra_pred_luma_mode2.s9
-rw-r--r--common/arm/ihevc_intra_pred_luma_mode_18_34.s7
-rw-r--r--common/arm/ihevc_intra_pred_luma_mode_27_to_33.s10
-rw-r--r--common/arm/ihevc_intra_pred_luma_mode_3_to_9.s12
-rw-r--r--common/arm/ihevc_intra_pred_luma_planar.s9
-rw-r--r--common/arm/ihevc_intra_pred_luma_vert.s9
-rw-r--r--common/arm/ihevc_itrans_recon_16x16.s25
-rw-r--r--common/arm/ihevc_itrans_recon_32x32.s27
-rw-r--r--common/arm/ihevc_itrans_recon_4x4.s15
-rw-r--r--common/arm/ihevc_itrans_recon_4x4_ttype1.s16
-rw-r--r--common/arm/ihevc_itrans_recon_8x8.s18
-rw-r--r--common/arm/ihevc_sao_band_offset_chroma.s24
-rw-r--r--common/arm/ihevc_sao_band_offset_luma.s18
-rw-r--r--common/arm/ihevc_sao_edge_offset_class0.s21
-rw-r--r--common/arm/ihevc_sao_edge_offset_class0_chroma.s24
-rw-r--r--common/arm/ihevc_sao_edge_offset_class1.s21
-rw-r--r--common/arm/ihevc_sao_edge_offset_class1_chroma.s24
-rw-r--r--common/arm/ihevc_sao_edge_offset_class2.s110
-rw-r--r--common/arm/ihevc_sao_edge_offset_class2_chroma.s116
-rw-r--r--common/arm/ihevc_sao_edge_offset_class3.s126
-rw-r--r--common/arm/ihevc_sao_edge_offset_class3_chroma.s127
-rw-r--r--common/arm/ihevc_weighted_pred_bi.s36
-rw-r--r--common/arm/ihevc_weighted_pred_bi_default.s22
-rw-r--r--common/arm/ihevc_weighted_pred_uni.s21
57 files changed, 835 insertions, 463 deletions
diff --git a/common/arm/ihevc_deblk_chroma_horz.s b/common/arm/ihevc_deblk_chroma_horz.s
index 34422ff..b0a79eb 100644
--- a/common/arm/ihevc_deblk_chroma_horz.s
+++ b/common/arm/ihevc_deblk_chroma_horz.s
@@ -36,6 +36,12 @@
@*
@*******************************************************************************/
+.equ qp_offset_u_offset, 40
+.equ qp_offset_v_offset, 44
+.equ tc_offset_div2_offset, 48
+.equ filter_p_offset, 52
+.equ filter_q_offset, 56
+
.text
.align 4
@@ -62,17 +68,17 @@ ihevc_deblk_chroma_horz_a9q:
add r6,r0,r1
add r1,r2,r3
vmovl.u8 q0,d0
- ldr r10,[sp,#0x28]
+ ldr r10,[sp,#qp_offset_u_offset]
vld1.8 {d2},[r12]
add r2,r1,#1
- ldr r4,[sp,#0x30]
+ ldr r4,[sp,#tc_offset_div2_offset]
vld1.8 {d4},[r5]
- ldr r8,[sp,#0x34]
+ ldr r8,[sp,#filter_p_offset]
vld1.8 {d16},[r6]
- ldr r9,[sp,#0x38]
+ ldr r9,[sp,#filter_q_offset]
adds r1,r10,r2,asr #1
vmovl.u8 q1,d2
- ldr r7,[sp,#0x2c]
+ ldr r7,[sp,#qp_offset_v_offset]
ldr r3,gai4_ihevc_qp_table_addr
ulbl1:
add r3, r3, pc
diff --git a/common/arm/ihevc_deblk_chroma_vert.s b/common/arm/ihevc_deblk_chroma_vert.s
index 4cb305f..3962b28 100644
--- a/common/arm/ihevc_deblk_chroma_vert.s
+++ b/common/arm/ihevc_deblk_chroma_vert.s
@@ -37,6 +37,12 @@
@*
@*******************************************************************************/
+.equ qp_offset_u_offset, 40
+.equ qp_offset_v_offset, 44
+.equ tc_offset_div2_offset, 48
+.equ filter_p_offset, 52
+.equ filter_q_offset, 56
+
.text
.align 4
@@ -63,19 +69,19 @@ ihevc_deblk_chroma_vert_a9q:
vld1.8 {d5},[r8],r1
add r2,r2,#1
vld1.8 {d17},[r8],r1
- ldr r7,[sp,#0x28]
+ ldr r7,[sp,#qp_offset_u_offset]
vld1.8 {d16},[r8],r1
- ldr r4,[sp,#0x38]
+ ldr r4,[sp,#filter_q_offset]
vld1.8 {d4},[r8]
- ldr r5,[sp,#0x30]
+ ldr r5,[sp,#tc_offset_div2_offset]
vtrn.8 d5,d17
adds r3,r7,r2,asr #1
vtrn.8 d16,d4
ldr r7,gai4_ihevc_qp_table_addr
ulbl1:
add r7,r7,pc
- ldr r12,[sp,#0x34]
- ldr r6,[sp,#0x2c]
+ ldr r12,[sp,#filter_p_offset]
+ ldr r6,[sp,#qp_offset_v_offset]
bmi l1.2944
cmp r3,#0x39
ldrle r3,[r7,r3,lsl #2]
diff --git a/common/arm/ihevc_deblk_luma_horz.s b/common/arm/ihevc_deblk_luma_horz.s
index b12ceb9..76660b3 100644
--- a/common/arm/ihevc_deblk_luma_horz.s
+++ b/common/arm/ihevc_deblk_luma_horz.s
@@ -36,6 +36,12 @@
@*
@*******************************************************************************/
+.equ qp_q_offset, 108
+.equ beta_offset_div2_offset, 112
+.equ tc_offset_div2_offset, 116
+.equ filter_p_offset, 120
+.equ filter_q_offset, 124
+
.text
.align 4
@@ -57,12 +63,14 @@ gai4_ihevc_beta_table_addr:
ihevc_deblk_luma_horz_a9q:
stmfd sp!, {r3-r12,lr}
- ldr r4,[sp,#0x2c]
- ldr r5,[sp,#0x30]
+ vpush {d8 - d15}
+
+ ldr r4,[sp,#qp_q_offset]
+ ldr r5,[sp,#beta_offset_div2_offset]
add r3,r3,r4
add r3,r3,#1
- ldr r6, [sp,#0x34]
+ ldr r6, [sp,#tc_offset_div2_offset]
asr r3,r3,#1
add r7,r3,r5,lsl #1
add r3,r3,r6,lsl #1
@@ -291,9 +299,9 @@ ulbl1:
vmin.u8 d18,d20,d30
mov r2,#2
vqadd.u8 d30,d23,d1
- ldr r4,[sp,#0x38] @ loading the filter_flag_p
+ ldr r4,[sp,#filter_p_offset] @ loading the filter_flag_p
vmax.u8 d2,d18,d31
- ldr r5,[sp,#0x3c] @ loading the filter_flag_q
+ ldr r5,[sp,#filter_q_offset] @ loading the filter_flag_q
vrshrn.i16 d21,q7,#2
b end_dep_deq_decision_horz
@ r2 has the value of de
@@ -308,8 +316,8 @@ l1.1840:
mov r2,#1
mov r11,r5
- ldr r4,[sp,#0x38] @ loading the filter_flag_p
- ldr r5,[sp,#0x3c] @ loading the filter_flag_q
+ ldr r4,[sp,#filter_p_offset] @ loading the filter_flag_p
+ ldr r5,[sp,#filter_q_offset] @ loading the filter_flag_q
cmp r6,#1
moveq r9,#0
@@ -397,6 +405,7 @@ strong_filtering_p:
vst1.32 d3[0],[r12]
l1.2404:
+ vpop {d8 - d15}
ldmfd sp!, {r3-r12,pc}
@ r4=flag p
@@ -537,6 +546,8 @@ l1.2852:
vbsl d19,d26,d13
vst1.32 {d19[0]},[r12],r1
vst1.32 {d18[0]},[r12]
+
+ vpop {d8 - d15}
ldmfd sp!, {r3-r12,r15}
diff --git a/common/arm/ihevc_deblk_luma_vert.s b/common/arm/ihevc_deblk_luma_vert.s
index ee247cc..91662c9 100644
--- a/common/arm/ihevc_deblk_luma_vert.s
+++ b/common/arm/ihevc_deblk_luma_vert.s
@@ -37,6 +37,12 @@
@*
@*******************************************************************************/
+.equ qp_q_offset, 44
+.equ beta_offset_div2_offset, 48
+.equ tc_offset_div2_offset, 52
+.equ filter_p_offset, 56
+.equ filter_q_offset, 60
+
.text
.align 4
@@ -60,12 +66,12 @@ gai4_ihevc_beta_table_addr:
ihevc_deblk_luma_vert_a9q:
push {r3-r12,lr}
- ldr r4,[sp,#0x2c]
- ldr r5,[sp,#0x30]
+ ldr r4,[sp,#qp_q_offset]
+ ldr r5,[sp,#beta_offset_div2_offset]
add r3,r3,r4
add r3,r3,#1
- ldr r6, [sp,#0x34]
+ ldr r6, [sp,#tc_offset_div2_offset]
asr r3,r3,#1
add r7,r3,r5,lsl #1
add r3,r3,r6,lsl #1
@@ -291,9 +297,9 @@ ulbl1:
vqadd.u8 d30,d6,d19
mov r2,#2
- ldr r4,[sp,#0x38] @ loading the filter_flag_p
+ ldr r4,[sp,#filter_p_offset] @ loading the filter_flag_p
vqsub.u8 d31,d6,d19
- ldr r5,[sp,#0x3c] @ loading the filter_flag_q
+ ldr r5,[sp,#filter_q_offset] @ loading the filter_flag_q
b end_dep_deq_decision
@ r2 has the value of de
@ r6 has teh value of tc
@@ -307,8 +313,8 @@ l1.336:
mov r2,#1
l1.424:
mov r11,r5
- ldr r4,[sp,#0x38] @ loading the filter_flag_p
- ldr r5,[sp,#0x3c] @ loading the filter_flag_q
+ ldr r4,[sp,#filter_p_offset] @ loading the filter_flag_p
+ ldr r5,[sp,#filter_q_offset] @ loading the filter_flag_q
cmp r6,#1
moveq r9,#0
@@ -532,7 +538,6 @@ l1.1212:
vst1.16 {d3[1]},[r12]
vst1.8 {d16[3]},[r3]
l1.1272:
- @ ldr r3,[sp,#0x38]
cmp r5,#0
beq l1.964
@ checks for the flag q
diff --git a/common/arm/ihevc_inter_pred_chroma_copy.s b/common/arm/ihevc_inter_pred_chroma_copy.s
index 0da34cc..1b38dbb 100644
--- a/common/arm/ihevc_inter_pred_chroma_copy.s
+++ b/common/arm/ihevc_inter_pred_chroma_copy.s
@@ -92,6 +92,9 @@
@ r5 => ht
@ r6 => wd
+.equ ht_offset, 44
+.equ wd_offset, 48
+
.text
.align 4
@@ -104,9 +107,9 @@
ihevc_inter_pred_chroma_copy_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- ldr r12,[sp,#48] @loads wd
+ ldr r12,[sp,#wd_offset] @loads wd
lsl r12,r12,#1
- ldr r7,[sp,#44] @loads ht
+ ldr r7,[sp,#ht_offset] @loads ht
cmp r7,#0 @checks ht == 0
ble end_loops
and r8,r7,#3 @check ht for mul of 2
diff --git a/common/arm/ihevc_inter_pred_chroma_copy_w16out.s b/common/arm/ihevc_inter_pred_chroma_copy_w16out.s
index a927fa7..4997b84 100644
--- a/common/arm/ihevc_inter_pred_chroma_copy_w16out.s
+++ b/common/arm/ihevc_inter_pred_chroma_copy_w16out.s
@@ -92,6 +92,11 @@
@r5 => ht
@r6 => wd
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
+
.text
.align 4
@@ -105,9 +110,11 @@
ihevc_inter_pred_chroma_copy_w16out_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- ldr r12,[sp,#48] @loads wd
+ vpush {d8 - d15}
+
+ ldr r12,[sp,#wd_offset] @loads wd
lsl r12,r12,#1 @2*wd
- ldr r7,[sp,#44] @loads ht
+ ldr r7,[sp,#ht_offset] @loads ht
cmp r7,#0 @ht condition(ht == 0)
ble end_loops @loop
and r8,r7,#3 @check ht for mul of 2
@@ -162,6 +169,7 @@ end_inner_loop_wd_4:
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
@@ -316,6 +324,7 @@ core_loop_wd_8_ht_2:
vst1.16 {d2,d3},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
bgt core_loop_wd_8_ht_2
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_chroma_horz.s b/common/arm/ihevc_inter_pred_chroma_horz.s
index 4781d3e..c69b417 100644
--- a/common/arm/ihevc_inter_pred_chroma_horz.s
+++ b/common/arm/ihevc_inter_pred_chroma_horz.s
@@ -93,6 +93,10 @@
@r2 => src_strd
@r3 => dst_strd
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
@@ -106,10 +110,11 @@
ihevc_inter_pred_chroma_horz_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4,[sp,#40] @loads pi1_coeff
- ldr r7,[sp,#44] @loads ht
- ldr r10,[sp,#48] @loads wd
+ ldr r4,[sp,#coeff_offset] @loads pi1_coeff
+ ldr r7,[sp,#ht_offset] @loads ht
+ ldr r10,[sp,#wd_offset] @loads wd
vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff)
subs r14,r7,#0 @checks for ht == 0
@@ -672,6 +677,7 @@ inner_loop_4:
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_chroma_horz_w16out.s b/common/arm/ihevc_inter_pred_chroma_horz_w16out.s
index f95937c..9c498e8 100644
--- a/common/arm/ihevc_inter_pred_chroma_horz_w16out.s
+++ b/common/arm/ihevc_inter_pred_chroma_horz_w16out.s
@@ -90,6 +90,9 @@
@r2 => src_strd
@r3 => dst_strd
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
.text
.align 4
@@ -105,10 +108,11 @@
ihevc_inter_pred_chroma_horz_w16out_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4,[sp,#40] @loads pi1_coeff
- ldr r6,[sp,#44] @loads ht
- ldr r10,[sp,#48] @loads wd
+ ldr r4,[sp,#coeff_offset] @loads pi1_coeff
+ ldr r6,[sp,#ht_offset] @loads ht
+ ldr r10,[sp,#wd_offset] @loads wd
vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff)
subs r14,r6,#0 @checks for ht == 0
@@ -362,7 +366,7 @@ epilog_end:
vst1.16 {q10},[r1],r6 @store the result pu1_dst
- ldr r6,[sp,#44] @loads ht
+ ldr r6,[sp,#ht_offset] @loads ht
and r7,r6,#1
@@ -710,6 +714,7 @@ loop_residue:
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_chroma_vert.s b/common/arm/ihevc_inter_pred_chroma_vert.s
index e786497..8b4e48b 100644
--- a/common/arm/ihevc_inter_pred_chroma_vert.s
+++ b/common/arm/ihevc_inter_pred_chroma_vert.s
@@ -92,6 +92,11 @@
@r1 => *pi2_dst
@r2 => src_strd
@r3 => dst_strd
+
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
@@ -105,11 +110,12 @@
ihevc_inter_pred_chroma_vert_a9q:
stmfd sp!,{r4-r12,r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4,[sp,#44] @loads ht
- ldr r12,[sp,#40] @loads pi1_coeff
+ ldr r4,[sp,#ht_offset] @loads ht
+ ldr r12,[sp,#coeff_offset] @loads pi1_coeff
cmp r4,#0 @checks ht == 0
- ldr r6,[sp,#48] @loads wd
+ ldr r6,[sp,#wd_offset] @loads wd
sub r0,r0,r2 @pu1_src - src_strd
vld1.8 {d0},[r12] @loads pi1_coeff
@@ -377,6 +383,7 @@ epilog:
vqrshrun.s16 d24,q12,#6
vst1.8 {d24},[r7],r3 @stores the loaded value
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_chroma_vert_w16inp.s b/common/arm/ihevc_inter_pred_chroma_vert_w16inp.s
index ba2ea8e..f9e513a 100644
--- a/common/arm/ihevc_inter_pred_chroma_vert_w16inp.s
+++ b/common/arm/ihevc_inter_pred_chroma_vert_w16inp.s
@@ -92,6 +92,11 @@
@r2 => src_strd
@r3 => dst_strd
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
+
.text
.align 4
@@ -105,11 +110,12 @@
ihevc_inter_pred_chroma_vert_w16inp_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4, [sp,#40] @loads pi1_coeff
- ldr r6, [sp,#48] @wd
+ ldr r4, [sp,#coeff_offset] @loads pi1_coeff
+ ldr r6, [sp,#wd_offset] @wd
lsl r2,r2,#1 @src_strd = 2* src_strd
- ldr r5,[sp,#44] @loads ht
+ ldr r5,[sp,#ht_offset] @loads ht
vld1.8 {d0},[r4] @loads pi1_coeff
sub r4,r0,r2 @pu1_src - src_strd
vmovl.s8 q0,d0 @long the value
@@ -335,6 +341,7 @@ epilog:
vst1.32 {d24[0]},[r9] @stores the loaded value
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s b/common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
index 00b3011..0c2ffbd 100644
--- a/common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
+++ b/common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
@@ -92,6 +92,11 @@
@r1 => *pi2_dst
@r2 => src_strd
@r3 => dst_strd
+
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
@@ -105,11 +110,12 @@
ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4, [sp,#40] @loads pi1_coeff
- ldr r6, [sp,#48] @wd
+ ldr r4, [sp,#coeff_offset] @loads pi1_coeff
+ ldr r6, [sp,#wd_offset] @wd
lsl r2,r2,#1 @src_strd = 2* src_strd
- ldr r5,[sp,#44] @loads ht
+ ldr r5,[sp,#ht_offset] @loads ht
vld1.8 {d0},[r4] @loads pi1_coeff
sub r4,r0,r2 @pu1_src - src_strd
vmovl.s8 q0,d0 @long the value
@@ -322,6 +328,7 @@ epilog:
vst1.32 {d24},[r9] @stores the loaded value
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_chroma_vert_w16out.s b/common/arm/ihevc_inter_pred_chroma_vert_w16out.s
index 6e6776c..84b0792 100644
--- a/common/arm/ihevc_inter_pred_chroma_vert_w16out.s
+++ b/common/arm/ihevc_inter_pred_chroma_vert_w16out.s
@@ -93,6 +93,10 @@
@r2 => src_strd
@r3 => dst_strd
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
@@ -106,11 +110,12 @@
ihevc_inter_pred_chroma_vert_w16out_a9q:
stmfd sp!,{r4-r12,r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4,[sp,#44] @loads ht
- ldr r12,[sp,#40] @loads pi1_coeff
+ ldr r4,[sp,#ht_offset] @loads ht
+ ldr r12,[sp,#coeff_offset] @loads pi1_coeff
cmp r4,#0 @checks ht == 0
- ldr r6,[sp,#48] @loads wd
+ ldr r6,[sp,#wd_offset] @loads wd
sub r0,r0,r2 @pu1_src - src_strd
vld1.8 {d0},[r12] @loads pi1_coeff
@@ -361,6 +366,7 @@ epilog:
vst1.8 {q12},[r7],r3 @stores the loaded value
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_filters_luma_horz.s b/common/arm/ihevc_inter_pred_filters_luma_horz.s
index 215f8fd..5559aa7 100644
--- a/common/arm/ihevc_inter_pred_filters_luma_horz.s
+++ b/common/arm/ihevc_inter_pred_filters_luma_horz.s
@@ -103,6 +103,11 @@
@ r5 => ht
@ r6 => wd
+
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
@@ -116,15 +121,15 @@
ihevc_inter_pred_luma_horz_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- @str r1,[sp,#-4]
- @ mov r7,#8192
+ vpush {d8 - d15}
+
+
start_loop_count:
- @ ldr r1,[sp,#-4]
- ldr r4,[sp,#40] @loads pi1_coeff
- ldr r8,[sp,#44] @loads ht
- ldr r10,[sp,#48] @loads wd
+ ldr r4,[sp,#coeff_offset] @loads pi1_coeff
+ ldr r8,[sp,#ht_offset] @loads ht
+ ldr r10,[sp,#wd_offset] @loads wd
vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff)
mov r11,#1
@@ -262,7 +267,8 @@ end_inner_loop_8:
- ldr r10,[sp,#48] @loads wd
+ ldr r10,[sp,#wd_offset] @loads wd
+
cmp r10,#12
beq outer_loop4_residual
@@ -270,6 +276,7 @@ end_inner_loop_8:
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
@@ -417,7 +424,7 @@ epilog_16:
ldr r7, [sp], #4
ldr r0, [sp], #4
- ldr r10,[sp,#48]
+ ldr r10,[sp,#wd_offset]
cmp r10,#24
beq outer_loop8_residual
@@ -426,6 +433,7 @@ epilog_16:
end_loops1:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
@@ -527,6 +535,7 @@ end_inner_loop_4:
@subs r7,r7,#1
@ bgt start_loop_count
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_filters_luma_vert.s b/common/arm/ihevc_inter_pred_filters_luma_vert.s
index f51d68c..3d9ab1c 100644
--- a/common/arm/ihevc_inter_pred_filters_luma_vert.s
+++ b/common/arm/ihevc_inter_pred_filters_luma_vert.s
@@ -103,6 +103,11 @@
@ r12 => *pi1_coeff
@ r5 => ht
@ r3 => wd
+
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
.syntax unified
@@ -116,15 +121,16 @@
ihevc_inter_pred_luma_vert_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r12,[sp,#40] @load pi1_coeff
+ ldr r12,[sp,#coeff_offset] @load pi1_coeff
mov r6,r3
- ldr r5,[sp,#48] @load wd
+ ldr r5,[sp,#wd_offset] @load wd
vld1.u8 {d0},[r12] @coeff = vld1_s8(pi1_coeff)
sub r12,r2,r2,lsl #2 @src_ctrd & pi1_coeff
vabs.s8 d0,d0 @vabs_s8(coeff)
add r0,r0,r12 @r0->pu1_src r12->pi1_coeff
- ldr r3,[sp,#44] @load ht
+ ldr r3,[sp,#ht_offset] @load ht
subs r7,r3,#0 @r3->ht
@ble end_loops @end loop jump
vdup.u8 d22,d0[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
@@ -407,7 +413,8 @@ end_loops:
ldr r1, [sp], #4
ldr r0, [sp], #4
- ldmfdeq sp!,{r4-r12,r15} @reload the registers from sp
+ beq end1
+
mov r5, #4
add r0, r0, #8
add r1, r1, #8
@@ -491,6 +498,8 @@ end_inner_loop_wd_4:
add r0,r0,r8
bgt outer_loop_wd_4
+end1:
+ vpop {d8 - d15}
ldmfd sp!, {r4-r12, r15} @reload the registers from sp
@@ -564,15 +573,16 @@ end_inner_loop_wd_4:
ihevc_inter_pred_luma_vert_w16out_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r12,[sp,#40] @load pi1_coeff
+ ldr r12,[sp,#coeff_offset] @load pi1_coeff
mov r6,r3
- ldr r5,[sp,#48] @load wd
+ ldr r5,[sp,#wd_offset] @load wd
vld1.u8 {d0},[r12] @coeff = vld1_s8(pi1_coeff)
sub r12,r2,r2,lsl #2 @src_ctrd & pi1_coeff
vabs.s8 d0,d0 @vabs_s8(coeff)
add r0,r0,r12 @r0->pu1_src r12->pi1_coeff
- ldr r3,[sp,#44] @load ht
+ ldr r3,[sp,#ht_offset] @load ht
subs r7,r3,#0 @r3->ht
@ble end_loops_16out @end loop jump
vdup.u8 d22,d0[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
@@ -848,7 +858,8 @@ end_loops_16out:
ldr r1, [sp], #4
ldr r0, [sp], #4
- ldmfdeq sp!,{r4-r12,r15} @reload the registers from sp
+ beq end2
+
mov r5, #4
add r0, r0, #8
add r1, r1, #16
@@ -934,7 +945,8 @@ end_inner_loop_wd_4_16out:
add r1,r1,r9,lsl #1
add r0,r0,r8
bgt outer_loop_wd_4_16out
-
+end2:
+ vpop {d8 - d15}
ldmfd sp!, {r4-r12, r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s b/common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s
index 4fbc5d1..9726710 100644
--- a/common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s
+++ b/common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s
@@ -94,6 +94,10 @@
@ word32 ht,
@ word32 wd )
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
@@ -107,16 +111,17 @@
ihevc_inter_pred_luma_vert_w16inp_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r12,[sp,#40] @load pi1_coeff
+ ldr r12,[sp,#coeff_offset] @load pi1_coeff
mov r6,r3
- ldr r5,[sp,#48] @load wd
+ ldr r5,[sp,#wd_offset] @load wd
vld1.8 {d0},[r12] @coeff = vld1_s8(pi1_coeff)
mov r2, r2, lsl #1
sub r12,r2,r2,lsl #2 @src_ctrd & pi1_coeff
@vabs.s8 d0,d0 @vabs_s8(coeff)
add r0,r0,r12 @r0->pu1_src r12->pi1_coeff
- ldr r3,[sp,#44] @load ht
+ ldr r3,[sp,#ht_offset] @load ht
subs r7,r3,#0 @r3->ht
@ble end_loops @end loop jump
vmovl.s8 q0,d0
@@ -370,6 +375,7 @@ epilog_end:
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_luma_copy.s b/common/arm/ihevc_inter_pred_luma_copy.s
index 8a61369..e4f5573 100644
--- a/common/arm/ihevc_inter_pred_luma_copy.s
+++ b/common/arm/ihevc_inter_pred_luma_copy.s
@@ -71,6 +71,10 @@
@ r7 => ht
@ r12 => wd
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
@@ -83,8 +87,9 @@
ihevc_inter_pred_luma_copy_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- ldr r12,[sp,#48] @loads wd
- ldr r7,[sp,#44] @loads ht
+ vpush {d8 - d15}
+ ldr r12,[sp,#wd_offset] @loads wd
+ ldr r7,[sp,#ht_offset] @loads ht
cmp r7,#0 @checks ht == 0
ble end_loops
tst r12,#15 @checks wd for multiples for 4 & 8
@@ -121,6 +126,7 @@ end_inner_loop_wd_4:
bgt outer_loop_wd_4
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
@@ -151,6 +157,7 @@ end_inner_loop_wd_8:
sub r1,r6,r11 @pu1_dst = pu1_dst_tmp
bgt outer_loop_wd_8
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
core_loop_wd_16:
@@ -180,6 +187,7 @@ end_inner_loop_wd_16:
sub r1,r6,r11 @pu1_dst = pu1_dst_tmp
bgt outer_loop_wd_16
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_luma_copy_w16out.s b/common/arm/ihevc_inter_pred_luma_copy_w16out.s
index 771bcb3..84dbbad 100644
--- a/common/arm/ihevc_inter_pred_luma_copy_w16out.s
+++ b/common/arm/ihevc_inter_pred_luma_copy_w16out.s
@@ -72,6 +72,10 @@
@ r7 => ht
@ r12 => wd
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
@@ -85,8 +89,9 @@
ihevc_inter_pred_luma_copy_w16out_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- ldr r12,[sp,#48] @loads wd
- ldr r7,[sp,#44] @loads ht
+ vpush {d8 - d15}
+ ldr r12,[sp,#wd_offset] @loads wd
+ ldr r7,[sp,#ht_offset] @loads ht
cmp r7,#0 @ht condition(ht == 0)
ble end_loops @loop
tst r12,#7 @conditional check for wd (multiples)
@@ -129,6 +134,7 @@ end_inner_loop_wd_4:
bgt outer_loop_wd_4
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
@@ -242,6 +248,7 @@ epilog_end:
vst1.16 {d6,d7},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_luma_horz_w16out.s b/common/arm/ihevc_inter_pred_luma_horz_w16out.s
index e8800e0..a60bb08 100644
--- a/common/arm/ihevc_inter_pred_luma_horz_w16out.s
+++ b/common/arm/ihevc_inter_pred_luma_horz_w16out.s
@@ -107,6 +107,11 @@
@r11 - #1
@r12 - src_ptr1
@r14 - loop_counter
+
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
.syntax unified
@@ -122,16 +127,16 @@ ihevc_inter_pred_luma_horz_w16out_a9q:
bic r14, #1 @ clearing bit[0], so that it goes back to mode
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- ldr r4,[sp,#40] @loads pi1_coeff
- ldr r7,[sp,#44] @loads ht
+ vpush {d8 - d15}
+ ldr r4,[sp,#coeff_offset] @loads pi1_coeff
+ ldr r7,[sp,#ht_offset] @loads ht
vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff)
sub r14,r7,#0 @checks for ht == 0
vabs.s8 d2,d0 @vabs_s8(coeff)
mov r11,#1
- @ble end_loops
- ldr r10,[sp,#48] @loads wd
+ ldr r10,[sp,#wd_offset] @loads wd
vdup.8 d24,d2[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
sub r12,r0,#3 @pu1_src - 3
vdup.8 d25,d2[1] @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
@@ -274,11 +279,10 @@ end_inner_loop_4:
height_residue_4:
- ldr r7,[sp,#44] @loads ht
+ ldr r7,[sp,#ht_offset] @loads ht
and r7,r7,#1 @calculating ht_residue ht_residue = (ht & 1)
cmp r7,#0
- @beq end_loops
- ldmfdeq sp!,{r4-r12,r15} @reload the registers from sp
+ beq end_loops
outer_loop_height_residue_4:
@@ -331,7 +335,7 @@ end_inner_loop_height_residue_4:
add r12,r12,r9 @increment the input pointer src_strd-wd
add r1,r1,r8 @increment the output pointer dst_strd-wd
bgt outer_loop_height_residue_4
-
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
outer_loop8_residual:
@@ -427,18 +431,18 @@ end_inner_loop_8:
- ldr r10,[sp,#48] @loads wd
+ ldr r10,[sp,#wd_offset] @loads wd
cmp r10,#12
beq outer_loop4_residual
- ldr r7,[sp,#44] @loads ht
+ ldr r7,[sp,#ht_offset] @loads ht
and r7,r7,#1
cmp r7,#1
beq height_residue_4
-@end_loops
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
@@ -452,7 +456,6 @@ outer_loop_16:
add r4,r12,r2 @pu1_src + src_strd
and r0, r12, #31
sub r5,r10,#0 @checks wd
- @ble end_loops1
pld [r12, r2, lsl #1]
vld1.u32 {q0},[r12],r11 @vector load pu1_src
pld [r4, r2, lsl #1]
@@ -580,17 +583,17 @@ epilog_16:
ldr r7, [sp], #4
ldr r0, [sp], #4
- ldr r10,[sp,#48]
+ ldr r10,[sp,#wd_offset]
cmp r10,#24
beq outer_loop8_residual
add r1,r6,r8,lsl #1
- ldr r7,[sp,#44] @loads ht
+ ldr r7,[sp,#ht_offset] @loads ht
and r7,r7,#1
cmp r7,#1
beq height_residue_4
-end_loops1:
-
+end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s b/common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s
index c6716fe..6e0f1ed 100644
--- a/common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s
+++ b/common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s
@@ -102,6 +102,10 @@
@ r5 => ht
@ r6 => wd
+.equ coeff_offset, 104
+.equ ht_offset, 108
+.equ wd_offset, 112
+
.text
.align 4
@@ -115,16 +119,17 @@
ihevc_inter_pred_luma_vert_w16inp_w16out_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r12,[sp,#40] @load pi1_coeff
+ ldr r12,[sp,#coeff_offset] @load pi1_coeff
mov r6,r3,lsl #1
- ldr r5,[sp,#48] @load wd
+ ldr r5,[sp,#wd_offset] @load wd
vld1.8 {d0},[r12] @coeff = vld1_s8(pi1_coeff)
mov r2, r2, lsl #1
sub r12,r2,r2,lsl #2 @src_ctrd & pi1_coeff
@vabs.s8 d0,d0 @vabs_s8(coeff)
add r0,r0,r12 @r0->pu1_src r12->pi1_coeff
- ldr r3,[sp,#44] @load ht
+ ldr r3,[sp,#ht_offset] @load ht
subs r7,r3,#0 @r3->ht
@ble end_loops @end loop jump
vmovl.s8 q0,d0
@@ -393,6 +398,7 @@ epilog_end:
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_chroma_dc.s b/common/arm/ihevc_intra_pred_chroma_dc.s
index 72d9730..6e5900a 100644
--- a/common/arm/ihevc_intra_pred_chroma_dc.s
+++ b/common/arm/ihevc_intra_pred_chroma_dc.s
@@ -92,6 +92,8 @@
@ mode
@ pi1_coeff
+.equ nt_offset, 40
+
.text
.align 4
@@ -106,7 +108,7 @@ ihevc_intra_pred_chroma_dc_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- ldr r4,[sp,#40] @loads nt
+ ldr r4,[sp,#nt_offset] @loads nt
mov r9, #0
vmov d17, r9, r9
diff --git a/common/arm/ihevc_intra_pred_chroma_horz.s b/common/arm/ihevc_intra_pred_chroma_horz.s
index 6089fd8..4512d72 100644
--- a/common/arm/ihevc_intra_pred_chroma_horz.s
+++ b/common/arm/ihevc_intra_pred_chroma_horz.s
@@ -84,6 +84,8 @@
@r2 => *pu1_dst
@r3 => dst_strd
+.equ nt_offset, 104
+
.text
.align 4
@@ -97,8 +99,9 @@
ihevc_intra_pred_chroma_horz_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4,[sp,#40] @loads nt
+ ldr r4,[sp,#nt_offset] @loads nt
lsl r6,r4,#2 @four_nt
@@ -187,6 +190,7 @@ core_loop_16:
vst1.16 {q4},[r2],r3
vst1.16 {q4},[r9],r3
bgt core_loop_16
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
b endloop
@@ -263,6 +267,7 @@ core_loop_8:
@vst1.8 {q5},[r2],r3
@vst1.8 {q6},[r2],r3
@vst1.8 {q7},[r2],r3
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
b endloop
@@ -308,6 +313,7 @@ core_loop_4:
@vst1.8 {d8},[r2],r3
@vst1.8 {d9},[r2],r3
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
b endloop
@@ -339,6 +345,7 @@ core_loop_4:
vst1.32 {d4[0]},[r2],r3
vst1.32 {d5[0]},[r2],r3
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
endloop:
diff --git a/common/arm/ihevc_intra_pred_chroma_mode2.s b/common/arm/ihevc_intra_pred_chroma_mode2.s
index cfa2ddb..013700d 100644
--- a/common/arm/ihevc_intra_pred_chroma_mode2.s
+++ b/common/arm/ihevc_intra_pred_chroma_mode2.s
@@ -87,11 +87,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #104
@ nt
@ mode
@ pi1_coeff
+.equ nt_offset, 104
+
.text
.align 4
@@ -105,8 +107,9 @@
ihevc_intra_pred_chroma_mode2_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4,[sp,#40] @loads nt
+ ldr r4,[sp,#nt_offset] @loads nt
mov r8,#-4
cmp r4,#4
@@ -290,6 +293,7 @@ mode2_4:
vst1.8 {d6},[r2],r3
end_func:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_chroma_mode_18_34.s b/common/arm/ihevc_intra_pred_chroma_mode_18_34.s
index b0dd1fa..6af6450 100644
--- a/common/arm/ihevc_intra_pred_chroma_mode_18_34.s
+++ b/common/arm/ihevc_intra_pred_chroma_mode_18_34.s
@@ -87,11 +87,14 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #104
@ nt
@ mode
@ pi1_coeff
+.equ nt_offset, 104
+.equ mode_offset, 108
+
.text
.align 4
@@ -105,10 +108,10 @@
ihevc_intra_pred_chroma_mode_18_34_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
-
- ldr r4,[sp,#40]
- ldr r5,[sp,#44]
+ ldr r4,[sp,#nt_offset]
+ ldr r5,[sp,#mode_offset]
cmp r4,#4
beq mode2_4
@@ -181,6 +184,7 @@ mode2_4:
vst1.32 {d0},[r2],r3
end_func:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s b/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s
index fb75e96..21b54da 100644
--- a/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s
+++ b/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s
@@ -81,6 +81,9 @@
@ word32 nt,
@ word32 mode)
+.equ nt_offset, 104
+.equ mode_offset, 108
+
.text
.align 4
@@ -103,9 +106,10 @@ gau1_ihevc_planar_factor_addr:
ihevc_intra_pred_chroma_mode_27_to_33_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4,[sp,#40] @loads nt
- ldr r5,[sp,#44] @loads mode
+ ldr r4,[sp,#nt_offset] @loads nt
+ ldr r5,[sp,#mode_offset] @loads mode
ldr r6,gai4_ihevc_ang_table_addr @loads word32 gai4_ihevc_ang_table[35]
ulbl1:
add r6,r6,pc
@@ -535,6 +539,7 @@ core_loop_4:
vst1.8 {d22},[r2],r3
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s b/common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s
index a5eb3ca..b7dcbfb 100644
--- a/common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s
+++ b/common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s
@@ -82,10 +82,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #104
@ nt
@ mode
+.equ nt_offset, 104
+.equ mode_offset, 108
+
.text
.align 4
@@ -123,13 +126,14 @@ col_for_intra_chroma_addr_3:
ihevc_intra_pred_chroma_mode_3_to_9_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4,[sp,#40] @loads nt
+ ldr r4,[sp,#nt_offset] @loads nt
ldr r7, gai4_ihevc_ang_table_addr
ulbl1:
add r7,r7,pc
- ldr r5,[sp,#44] @mode (3 to 9)
+ ldr r5,[sp,#mode_offset] @mode (3 to 9)
ldr r8, gai4_ihevc_inv_ang_table_addr
ulbl2:
add r8,r8,pc
@@ -486,6 +490,7 @@ epil_8_16_32:
vst1.8 d18, [r5], r3 @st (row 7)
end_func:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_chroma_planar.s b/common/arm/ihevc_intra_pred_chroma_planar.s
index 30b3144..7d03d55 100644
--- a/common/arm/ihevc_intra_pred_chroma_planar.s
+++ b/common/arm/ihevc_intra_pred_chroma_planar.s
@@ -87,11 +87,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #104
@ nt
@ mode
@ pi1_coeff
+.equ nt_offset, 104
+
.text
.align 4
@@ -109,8 +111,9 @@ gau1_ihevc_planar_factor_addr:
ihevc_intra_pred_chroma_planar_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4,[sp,#40] @loads nt
+ ldr r4,[sp,#nt_offset] @loads nt
ldr r11, gau1_ihevc_planar_factor_addr @loads table of coeffs
ulbl1:
add r11,r11,pc
@@ -353,6 +356,7 @@ loop_sz_4:
bne loop_sz_4
end_loop:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_chroma_ver.s b/common/arm/ihevc_intra_pred_chroma_ver.s
index b68a045..ce2ad73 100644
--- a/common/arm/ihevc_intra_pred_chroma_ver.s
+++ b/common/arm/ihevc_intra_pred_chroma_ver.s
@@ -87,6 +87,8 @@
@ nt
@ mode
+.equ nt_offset, 40
+
.text
.align 4
@@ -101,7 +103,7 @@ ihevc_intra_pred_chroma_ver_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- ldr r4,[sp,#40] @loads nt
+ ldr r4,[sp,#nt_offset] @loads nt
lsl r5, r4, #2 @4nt
diff --git a/common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s b/common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
index 6c882cf..8644cc8 100644
--- a/common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
+++ b/common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
@@ -84,10 +84,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #236
@ nt
@ mode
+.equ nt_offset, 236
+.equ mode_offset, 240
+
.text
.align 4
@@ -123,13 +126,15 @@ col_for_intra_chroma_addr_3:
ihevc_intra_pred_chroma_mode_11_to_17_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
+ sub sp, sp, #132 @ref_temp[2 * max_cu_size + 2]
- ldr r4,[sp,#40] @loads nt
+ ldr r4,[sp,#nt_offset] @loads nt
ldr r7, gai4_ihevc_ang_table_addr
ulbl1:
add r7,r7,pc
- ldr r5,[sp,#44] @mode (11 to 17)
+ ldr r5,[sp,#mode_offset] @mode (11 to 17)
ldr r8, gai4_ihevc_inv_ang_table_addr
ulbl2:
add r8,r8,pc
@@ -139,7 +144,6 @@ ulbl2:
sub r8, r8, #44
ldr r7, [r7] @intra_pred_ang
- sub sp, sp, #132 @ref_temp[2 * max_cu_size + 2]
ldr r8, [r8] @inv_ang
add r6, sp, r4, lsl #1 @ref_temp + 2 * nt
@@ -607,6 +611,7 @@ epil_8_16_32:
end_func:
add sp, sp, #132
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s b/common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
index 2ede914..a555646 100644
--- a/common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
+++ b/common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
@@ -84,10 +84,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #236
@ nt
@ mode
+.equ nt_offset, 236
+.equ mode_offset, 240
+
.text
.align 4
@@ -116,13 +119,15 @@ gai4_ihevc_ang_table_addr_2:
ihevc_intra_pred_chroma_mode_19_to_25_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
+ sub sp, sp, #132 @ref_temp[2 * max_cu_size + 2]
- ldr r4,[sp,#40] @loads nt
+ ldr r4,[sp,#nt_offset] @loads nt
ldr r7, gai4_ihevc_ang_table_addr_1
ulbl3:
add r7,r7,pc
- ldr r5,[sp,#44] @mode (19 to 25)
+ ldr r5,[sp,#mode_offset] @mode (19 to 25)
ldr r8, gai4_ihevc_inv_ang_table_addr
ulbl1:
add r8,r8,pc
@@ -132,7 +137,6 @@ ulbl1:
sub r8, r8, #48 @gai4_ihevc_inv_ang_table[mode - 12]
ldr r7, [r7] @intra_pred_ang
- sub sp, sp, #132 @ref_temp[2 * max_cu_size + 2]
ldr r8, [r8] @inv_ang
add r6, sp, r4 , lsl #1 @ref_temp + 2 * nt
@@ -562,6 +566,7 @@ core_loop_4:
end_loops:
add sp, sp, #132
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s b/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s
index ec38786..336af06 100644
--- a/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s
+++ b/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s
@@ -84,10 +84,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #236
@ nt
@ mode
+.equ nt_offset, 236
+.equ mode_offset, 240
+
.text
.align 4
@@ -129,13 +132,14 @@ col_for_intra_luma_addr_4:
ihevc_intra_pred_luma_mode_11_to_17_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
-
- ldr r4,[sp,#40] @loads nt
+ vpush {d8 - d15}
+ sub sp, sp, #132 @ref_temp[2 * max_cu_size + 1]
+ ldr r4,[sp,#nt_offset] @loads nt
ldr r7, gai4_ihevc_ang_table_addr
ulbl1:
add r7,r7,pc
- ldr r5,[sp,#44] @mode (11 to 17)
+ ldr r5,[sp,#mode_offset] @mode (11 to 17)
ldr r8, gai4_ihevc_inv_ang_table_addr
ulbl2:
add r8,r8,pc
@@ -145,7 +149,6 @@ ulbl2:
sub r8, r8, #44
ldr r7, [r7] @intra_pred_ang
- sub sp, sp, #132 @ref_temp[2 * max_cu_size + 1]
ldr r8, [r8] @inv_ang
add r6, sp, r4 @ref_temp + nt
@@ -684,6 +687,7 @@ ulbl4:
end_func:
add sp, sp, #132
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s b/common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s
index af342bf..32268a2 100644
--- a/common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s
+++ b/common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s
@@ -84,10 +84,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #236
@ nt
@ mode
+.equ nt_offset, 236
+.equ mode_offset, 240
+
.text
.align 4
@@ -116,13 +119,15 @@ gai4_ihevc_ang_table_addr_2:
ihevc_intra_pred_luma_mode_19_to_25_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
+ sub sp, sp, #132 @ref_temp[2 * max_cu_size + 1]
- ldr r4,[sp,#40] @loads nt
+ ldr r4,[sp,#nt_offset] @loads nt
ldr r7, gai4_ihevc_ang_table_addr_1
ulbl_1:
add r7,r7,pc
- ldr r5,[sp,#44] @mode (19 to 25)
+ ldr r5,[sp,#mode_offset] @mode (19 to 25)
ldr r8, gai4_ihevc_inv_ang_table_addr
ulbl1:
add r8,r8,pc
@@ -132,7 +137,6 @@ ulbl1:
sub r8, r8, #48 @gai4_ihevc_inv_ang_table[mode - 12]
ldr r7, [r7] @intra_pred_ang
- sub sp, sp, #132 @ref_temp[2 * max_cu_size + 1]
ldr r8, [r8] @inv_ang
add r6, sp, r4 @ref_temp + nt
@@ -644,6 +648,7 @@ core_loop_4:
end_loops:
add sp, sp, #132
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_luma_dc.s b/common/arm/ihevc_intra_pred_luma_dc.s
index f380d94..7d8cb91 100644
--- a/common/arm/ihevc_intra_pred_luma_dc.s
+++ b/common/arm/ihevc_intra_pred_luma_dc.s
@@ -87,11 +87,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #104
@ nt
@ mode
@ pi1_coeff
+.equ nt_offset, 104
+
.text
.align 4
@@ -105,8 +107,8 @@
ihevc_intra_pred_luma_dc_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
-
- ldr r4,[sp,#40] @loads nt
+ vpush {d8 - d15}
+ ldr r4,[sp,#nt_offset] @loads nt
@********** testing
@mov r6, #128
@@ -498,6 +500,7 @@ dc_4:
epilogue_end:
end_func:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_luma_horz.s b/common/arm/ihevc_intra_pred_luma_horz.s
index 581b673..2a44404 100644
--- a/common/arm/ihevc_intra_pred_luma_horz.s
+++ b/common/arm/ihevc_intra_pred_luma_horz.s
@@ -84,6 +84,8 @@
@r2 => *pu1_dst
@r3 => dst_strd
+.equ nt_offset, 104
+
.text
.align 4
@@ -97,9 +99,8 @@
ihevc_intra_pred_luma_horz_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
-
- ldr r4,[sp,#40] @loads nt
- @ldr r5,[sp,#44] @loads mode
+ vpush {d8 - d15}
+ ldr r4,[sp,#nt_offset] @loads nt
lsl r6,r4,#1 @two_nt
@@ -185,6 +186,7 @@ core_loop_32:
vst1.8 {q4},[r2],r3
vst1.8 {q4},[r9],r3
bgt core_loop_32
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
b end_func
@@ -258,7 +260,7 @@ core_loop_16:
vst1.8 {q5},[r2],r3
vst1.8 {q6},[r2],r3
vst1.8 {q7},[r2],r3
-
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
b end_func
@@ -301,6 +303,7 @@ core_loop_8:
vst1.8 {d8},[r2],r3
vst1.8 {d9},[r2],r3
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
b end_func
@@ -331,7 +334,7 @@ core_loop_4:
vst1.32 {d3[0]},[r2],r3
vst1.32 {d4[0]},[r2],r3
vst1.32 {d5[0]},[r2],r3
-
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
end_func:
diff --git a/common/arm/ihevc_intra_pred_luma_mode2.s b/common/arm/ihevc_intra_pred_luma_mode2.s
index cf7999b..935f02d 100644
--- a/common/arm/ihevc_intra_pred_luma_mode2.s
+++ b/common/arm/ihevc_intra_pred_luma_mode2.s
@@ -87,11 +87,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #104
@ nt
@ mode
@ pi1_coeff
+.equ nt_offset, 104
+
.text
.align 4
@@ -105,8 +107,8 @@
ihevc_intra_pred_luma_mode2_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
-
- ldr r4,[sp,#40] @loads nt
+ vpush {d8 - d15}
+ ldr r4,[sp,#nt_offset] @loads nt
mov r8,#-2
cmp r4,#4
@@ -260,6 +262,7 @@ mode2_4:
vst1.32 {d7[0]},[r7]
end_func:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_luma_mode_18_34.s b/common/arm/ihevc_intra_pred_luma_mode_18_34.s
index 438c0f5..9287371 100644
--- a/common/arm/ihevc_intra_pred_luma_mode_18_34.s
+++ b/common/arm/ihevc_intra_pred_luma_mode_18_34.s
@@ -92,6 +92,9 @@
@ mode
@ pi1_coeff
+.equ nt_offset, 40
+.equ mode_offset, 44
+
.text
.align 4
@@ -107,8 +110,8 @@ ihevc_intra_pred_luma_mode_18_34_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- ldr r4,[sp,#40]
- ldr r5,[sp,#44]
+ ldr r4,[sp,#nt_offset]
+ ldr r5,[sp,#mode_offset]
cmp r4,#4
beq mode2_4
diff --git a/common/arm/ihevc_intra_pred_luma_mode_27_to_33.s b/common/arm/ihevc_intra_pred_luma_mode_27_to_33.s
index 595d82a..9d95719 100644
--- a/common/arm/ihevc_intra_pred_luma_mode_27_to_33.s
+++ b/common/arm/ihevc_intra_pred_luma_mode_27_to_33.s
@@ -85,6 +85,9 @@
@r2 => *pu1_dst
@r3 => dst_strd
+.equ nt_offset, 104
+.equ mode_offset, 108
+
.text
.align 4
@@ -107,9 +110,9 @@ gau1_ihevc_planar_factor_addr:
ihevc_intra_pred_luma_mode_27_to_33_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
-
- ldr r4,[sp,#40] @loads nt
- ldr r5,[sp,#44] @loads mode
+ vpush {d8 - d15}
+ ldr r4,[sp,#nt_offset] @loads nt
+ ldr r5,[sp,#mode_offset] @loads mode
ldr r6,gai4_ihevc_ang_table_addr @loads word32 gai4_ihevc_ang_table[35]
ulbl1:
add r6,r6,pc
@@ -534,6 +537,7 @@ core_loop_4:
vst1.32 {d22[0]},[r2],r3
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_luma_mode_3_to_9.s b/common/arm/ihevc_intra_pred_luma_mode_3_to_9.s
index a8e93c8..e9c871c 100644
--- a/common/arm/ihevc_intra_pred_luma_mode_3_to_9.s
+++ b/common/arm/ihevc_intra_pred_luma_mode_3_to_9.s
@@ -84,10 +84,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #104
@ nt
@ mode
+.equ nt_offset, 104
+.equ mode_offset, 108
+
.text
.align 4
@@ -126,13 +129,13 @@ col_for_intra_luma_addr_3:
ihevc_intra_pred_luma_mode_3_to_9_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
-
- ldr r4,[sp,#40] @loads nt
+ vpush {d8 - d15}
+ ldr r4,[sp,#nt_offset] @loads nt
ldr r7, gai4_ihevc_ang_table_addr
ulbl1:
add r7,r7,pc
- ldr r5,[sp,#44] @mode (3 to 9)
+ ldr r5,[sp,#mode_offset] @mode (3 to 9)
ldr r8, gai4_ihevc_inv_ang_table_addr
ulbl2:
add r8,r8,pc
@@ -566,6 +569,7 @@ ulbl3_2:
vst1.32 d18[0], [r2], r3 @st (row 3)
end_func:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_luma_planar.s b/common/arm/ihevc_intra_pred_luma_planar.s
index 666798e..50b6b1b 100644
--- a/common/arm/ihevc_intra_pred_luma_planar.s
+++ b/common/arm/ihevc_intra_pred_luma_planar.s
@@ -87,11 +87,13 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #104
@ nt
@ mode
@ pi1_coeff
+.equ nt_offset, 104
+
.text
.align 4
@@ -114,8 +116,8 @@ gau1_ihevc_planar_factor_1_addr:
ihevc_intra_pred_luma_planar_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
-
- ldr r4,[sp,#40] @loads nt
+ vpush {d8 - d15}
+ ldr r4,[sp,#nt_offset] @loads nt
ldr r11, gau1_ihevc_planar_factor_addr @loads table of coeffs
ulbl1:
add r11,r11,pc
@@ -546,6 +548,7 @@ loop_sz_4:
bne loop_sz_4
end_loop:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_intra_pred_luma_vert.s b/common/arm/ihevc_intra_pred_luma_vert.s
index 5eeaeb3..9610773 100644
--- a/common/arm/ihevc_intra_pred_luma_vert.s
+++ b/common/arm/ihevc_intra_pred_luma_vert.s
@@ -84,10 +84,12 @@
@r2 => *pu1_dst
@r3 => dst_strd
-@stack contents from #40
+@stack contents from #104
@ nt
@ mode
+.equ nt_offset, 104
+
.text
.align 4
@@ -101,8 +103,8 @@
ihevc_intra_pred_luma_ver_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
-
- ldr r4,[sp,#40] @loads nt
+ vpush {d8 - d15}
+ ldr r4,[sp,#nt_offset] @loads nt
lsl r5, r4, #1 @2nt
@@ -417,5 +419,6 @@ blk_4:
end_func:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_itrans_recon_16x16.s b/common/arm/ihevc_itrans_recon_16x16.s
index 82055ad..198fd52 100644
--- a/common/arm/ihevc_itrans_recon_16x16.s
+++ b/common/arm/ihevc_itrans_recon_16x16.s
@@ -105,6 +105,12 @@
@ r12
@ r11
+.equ src_stride_offset, 104
+.equ pred_stride_offset, 108
+.equ out_stride_offset, 112
+.equ zero_cols_offset, 116
+.equ zero_rows_offset, 120
+
.text
.align 4
@@ -129,15 +135,10 @@ g_ai2_ihevc_trans_16_transpose_addr:
ihevc_itrans_recon_16x16_a9q:
stmfd sp!,{r4-r12,lr}
-@ add sp,sp,#40
-
-
-
-@ ldr r8,[sp,#4] @ prediction stride
-@ ldr r7,[sp,#8] @ destination stride
- ldr r6,[sp,#40] @ src stride
- ldr r12,[sp,#52]
- ldr r11,[sp,#56]
+ vpush {d8 - d15}
+ ldr r6,[sp,#src_stride_offset] @ src stride
+ ldr r12,[sp,#zero_cols_offset]
+ ldr r11,[sp,#zero_rows_offset]
@@ -661,8 +662,8 @@ skip_last12rows_kernel2:
mov r6,r7
- ldr r8,[sp,#44] @ prediction stride
- ldr r7,[sp,#48] @ destination stride
+ ldr r8,[sp,#pred_stride_offset] @ prediction stride
+ ldr r7,[sp,#out_stride_offset] @ destination stride
mov r10,#16
@@ -1126,7 +1127,7 @@ skip_last8rows_stage2_kernel2:
bne second_stage
-@ sub sp,sp,#40
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,pc}
diff --git a/common/arm/ihevc_itrans_recon_32x32.s b/common/arm/ihevc_itrans_recon_32x32.s
index eeb1d66..65b6ffd 100644
--- a/common/arm/ihevc_itrans_recon_32x32.s
+++ b/common/arm/ihevc_itrans_recon_32x32.s
@@ -124,6 +124,14 @@
@d5[2]= 43 d7[2]=9
@d5[3]= 38 d7[3]=4
+.equ pi2_src_offset, 64
+.equ pi2_tmp_offset, 68
+.equ src_strd_offset, 120
+.equ pred_strd_offset, 124
+.equ dst_strd_offset, 128
+.equ zero_cols_offset, 132
+.equ zero_rows_offset, 136
+
.text
.align 4
@@ -152,13 +160,11 @@ r9_addr: .word 0xffff0000
ihevc_itrans_recon_32x32_a9q:
stmfd sp!,{r0-r12,lr}
+ vpush {d8 - d15}
-
-@ldr r8,[sp,#56] @ prediction stride
-@ldr r7,[sp,#64] @ destination stride
- ldr r6,[sp,#56] @ src stride
- ldr r12,[sp,#68]
- ldr r11,[sp,#72]
+ ldr r6,[sp,#src_strd_offset] @ src stride
+ ldr r12,[sp,#zero_cols_offset]
+ ldr r11,[sp,#zero_rows_offset]
mov r6,r6,lsl #1 @ x sizeof(word16)
add r10,r6,r6, lsl #1 @ 3 rows
@@ -1493,10 +1499,10 @@ shift4:
bne dct_stage1
second_stage_dct:
@ mov r0,r1
- ldr r0,[sp]
- ldr r1,[sp,#4]
- ldr r8,[sp,#60] @ prediction stride
- ldr r7,[sp,#64] @ destination stride
+ ldr r0,[sp,#pi2_src_offset]
+ ldr r1,[sp,#pi2_tmp_offset]
+ ldr r8,[sp,#pred_strd_offset] @ prediction stride
+ ldr r7,[sp,#dst_strd_offset] @ destination stride
@ add r4,r2,r8, lsl #1 @ r4 = r2 + pred_strd * 2 => r4 points to 3rd row of pred data
@ add r5,r8,r8, lsl #1 @
@@ -2855,6 +2861,7 @@ prediction_buffer:
subs r14,r14,#1
bne dct_stage2
+ vpop {d8 - d15}
ldmfd sp!,{r0-r12,pc}
diff --git a/common/arm/ihevc_itrans_recon_4x4.s b/common/arm/ihevc_itrans_recon_4x4.s
index c955502..fb5796c 100644
--- a/common/arm/ihevc_itrans_recon_4x4.s
+++ b/common/arm/ihevc_itrans_recon_4x4.s
@@ -100,6 +100,10 @@
@ r6 => dst_strd
@ r7 => zero_cols
+.equ src_strd_offset, 104
+.equ pred_strd_offset, 108
+.equ dst_strd_offset, 112
+.equ zero_cols_offset, 116
.text
.align 4
@@ -122,17 +126,18 @@ g_ai2_ihevc_trans_4_transpose_addr:
ihevc_itrans_recon_4x4_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
ldr r8,g_ai2_ihevc_trans_4_transpose_addr
ulbl1:
add r8,r8,pc
- ldr r4,[sp,#40] @loading src_strd
- ldr r5,[sp,#44] @loading pred_strd
+ ldr r4,[sp,#src_strd_offset] @loading src_strd
+ ldr r5,[sp,#pred_strd_offset] @loading pred_strd
add r4,r4,r4 @ src_strd in terms of word16
- ldr r6,[sp,#48] @loading dst_strd
- ldr r7,[sp,#52] @loading zero_cols
+ ldr r6,[sp,#dst_strd_offset] @loading dst_strd
+ ldr r7,[sp,#zero_cols_offset] @loading zero_cols
add r9,r0,r4 @ pi2_src[0] + src_strd
@@ -223,7 +228,7 @@ ulbl1:
vst1.32 {d1[0]},[r3],r6
vst1.32 {d1[1]},[r3],r6
-
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_itrans_recon_4x4_ttype1.s b/common/arm/ihevc_itrans_recon_4x4_ttype1.s
index ab65dae..82ed8a0 100644
--- a/common/arm/ihevc_itrans_recon_4x4_ttype1.s
+++ b/common/arm/ihevc_itrans_recon_4x4_ttype1.s
@@ -103,6 +103,11 @@
@ r6 => dst_strd
@ r7 => zero_cols
+.equ src_strd_offset, 104
+.equ pred_strd_offset, 108
+.equ dst_strd_offset, 112
+.equ zero_cols_offset, 116
+
.text
.align 4
@@ -119,10 +124,12 @@
ihevc_itrans_recon_4x4_ttype1_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- ldr r4,[sp,#40] @loading src_strd
- ldr r5,[sp,#44] @loading pred_strd
- ldr r6,[sp,#48] @loading dst_strd
- ldr r7,[sp,#52] @loading zero_cols
+ vpush {d8 - d15}
+
+ ldr r4,[sp,#src_strd_offset] @loading src_strd
+ ldr r5,[sp,#pred_strd_offset] @loading pred_strd
+ ldr r6,[sp,#dst_strd_offset] @loading dst_strd
+ ldr r7,[sp,#zero_cols_offset] @loading zero_cols
add r4,r4,r4 @ src_strd in terms of word16
@@ -224,6 +231,7 @@ ihevc_itrans_recon_4x4_ttype1_a9q:
vst1.32 {d1[0]},[r3],r6
vst1.32 {d1[1]},[r3],r6
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_itrans_recon_8x8.s b/common/arm/ihevc_itrans_recon_8x8.s
index e9b53b4..94113d8 100644
--- a/common/arm/ihevc_itrans_recon_8x8.s
+++ b/common/arm/ihevc_itrans_recon_8x8.s
@@ -104,6 +104,11 @@
@ dst_strd
@ zero_cols
+.equ src_stride_offset, 104
+.equ pred_stride_offset, 108
+.equ out_stride_offset, 112
+.equ zero_cols_offset, 116
+.equ zero_rows_offset, 120
.text
@@ -151,12 +156,13 @@ ihevc_itrans_recon_8x8_a9q:
@// copy the input pointer to another register
@// step 1 : load all constants
stmfd sp!,{r4-r12,lr}
+ vpush {d8 - d15}
- ldr r8,[sp,#44] @ prediction stride
- ldr r7,[sp,#48] @ destination stride
- ldr r6,[sp, #40] @ src stride
- ldr r12,[sp,#52]
- ldr r11,[sp,#56]
+ ldr r8, [sp, #pred_stride_offset] @ prediction stride
+ ldr r7, [sp, #out_stride_offset] @ destination stride
+ ldr r6, [sp, #src_stride_offset] @ src stride
+ ldr r12, [sp, #zero_cols_offset]
+ ldr r11, [sp, #zero_rows_offset]
mov r6,r6,lsl #1 @ x sizeof(word16)
add r9,r0,r6, lsl #1 @ 2 rows
@@ -925,7 +931,7 @@ pred_buff_addition:
-
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,pc}
diff --git a/common/arm/ihevc_sao_band_offset_chroma.s b/common/arm/ihevc_sao_band_offset_chroma.s
index 32e149d..a9da725 100644
--- a/common/arm/ihevc_sao_band_offset_chroma.s
+++ b/common/arm/ihevc_sao_band_offset_chroma.s
@@ -61,6 +61,14 @@
@r9 => wd
@r10=> ht
+.equ pu1_src_top_left_offset, 104
+.equ sao_band_pos_u_offset, 108
+.equ sao_band_pos_v_offset, 112
+.equ pi1_sao_u_offset, 116
+.equ pi1_sao_v_offset, 120
+.equ wd_offset, 124
+.equ ht_offset, 128
+
.text
.p2align 2
@@ -76,10 +84,11 @@ gu1_table_band_idx_addr_2:
ihevc_sao_band_offset_chroma_a9q:
STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments
- LDR r4,[sp,#40] @Loads pu1_src_top_left
- LDR r10,[sp,#64] @Loads ht
+ vpush {d8 - d15}
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
+ LDR r10,[sp,#ht_offset] @Loads ht
- LDR r9,[sp,#60] @Loads wd
+ LDR r9,[sp,#wd_offset] @Loads wd
MOV r11,r10 @Move the ht to r9 for loop counter
ADD r12,r0,r9 @pu1_src[row * src_strd + (wd)]
@@ -94,7 +103,7 @@ SRC_LEFT_LOOP:
STRH r5,[r2],#2 @Store the value in pu1_src_left pointer
BNE SRC_LEFT_LOOP
- LDR r5,[sp,#44] @Loads sao_band_pos_u
+ LDR r5,[sp,#sao_band_pos_u_offset] @Loads sao_band_pos_u
VLD1.8 D1,[r14]! @band_table_u.val[0]
ADD r12,r3,r9 @pu1_src_top[wd]
@@ -104,7 +113,7 @@ SRC_LEFT_LOOP:
STRH r11,[r4] @store to pu1_src_top_left[0]
VLD1.8 D3,[r14]! @band_table_u.val[2]
- LDR r7,[sp,#52] @Loads pi1_sao_offset_u
+ LDR r7,[sp,#pi1_sao_u_offset] @Loads pi1_sao_offset_u
SUB r4,r10,#1 @ht-1
VDUP.8 D31,r6 @band_pos_u
@@ -147,7 +156,7 @@ ulbl2:
VLD1.8 D10,[r14]! @band_table_v.val[1]
VADD.I8 D3,D7,D27 @band_table_u.val[2] = vadd_u8(band_table_u.val[2], vdup_n_u8(pi1_sao_offset_u[3]))
- LDR r6,[sp,#48] @Loads sao_band_pos_v
+ LDR r6,[sp,#sao_band_pos_v_offset] @Loads sao_band_pos_v
VADD.I8 D4,D8,D26 @band_table_u.val[3] = vadd_u8(band_table_u.val[3], vdup_n_u8(pi1_sao_offset_u[4]))
LSL r11,r6,#3 @sao_band_pos_v
@@ -198,7 +207,7 @@ SAO_BAND_POS_U_0:
SWITCH_BREAK_U:
VDUP.8 D30,r11 @band_pos_v
- LDR r8,[sp,#56] @Loads pi1_sao_offset_v
+ LDR r8,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v
VLD1.8 D11,[r14]! @band_table_v.val[2]
VADD.I8 D13,D9,D30 @band_table_v.val[0] = vadd_u8(band_table_v.val[0], band_pos_v)
@@ -387,6 +396,7 @@ WIDTH_RESIDUE: @If width is not multiple of 16
BNE WIDTH_RESIDUE
END_LOOP:
+ vpop {d8 - d15}
LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
diff --git a/common/arm/ihevc_sao_band_offset_luma.s b/common/arm/ihevc_sao_band_offset_luma.s
index 3875377..66f2968 100644
--- a/common/arm/ihevc_sao_band_offset_luma.s
+++ b/common/arm/ihevc_sao_band_offset_luma.s
@@ -57,6 +57,12 @@
@r7 => wd
@r8 => ht
+.equ pu1_src_top_left_offset, 104
+.equ sao_band_pos_offset, 108
+.equ pi1_sao_offset, 112
+.equ wd_offset, 116
+.equ ht_offset, 120
+
.text
.p2align 2
@@ -69,15 +75,16 @@ gu1_table_band_idx_addr:
ihevc_sao_band_offset_luma_a9q:
STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- LDR r8,[sp,#56] @Loads ht
- LDR r7,[sp,#52] @Loads wd
+ LDR r8,[sp,#ht_offset] @Loads ht
+ LDR r7,[sp,#wd_offset] @Loads wd
MOV r9,r8 @Move the ht to r9 for loop counter
- LDR r5,[sp,#44] @Loads sao_band_pos
+ LDR r5,[sp,#sao_band_pos_offset] @Loads sao_band_pos
ADD r10,r0,r7 @pu1_src[row * src_strd + (wd)]
- LDR r4,[sp,#40] @Loads pu1_src_top_left
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
SUB r10,r10,#1 @wd-1
LDR r14, gu1_table_band_idx_addr
ulbl1:
@@ -91,7 +98,7 @@ SRC_LEFT_LOOP:
ADD r9,r3,r7 @pu1_src_top[wd]
VLD1.8 D1,[r14]! @band_table.val[0]
- LDR r6,[sp,#48] @Loads pi1_sao_offset
+ LDR r6,[sp,#pi1_sao_offset] @Loads pi1_sao_offset
LSL r11,r5,#3
VLD1.8 D2,[r14]! @band_table.val[1]
@@ -226,6 +233,7 @@ HEIGHT_LOOP:
ADD r0,r0,#8
BNE SWITCH_BREAK
+ vpop {d8 - d15}
LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
diff --git a/common/arm/ihevc_sao_edge_offset_class0.s b/common/arm/ihevc_sao_edge_offset_class0.s
index a9fe046..e4bb455 100644
--- a/common/arm/ihevc_sao_edge_offset_class0.s
+++ b/common/arm/ihevc_sao_edge_offset_class0.s
@@ -59,6 +59,14 @@
@r9 => wd
@r10=> ht
+.equ pu1_src_top_left_offset, 104
+.equ pu1_src_top_right_offset, 108
+.equ pu1_src_bot_left_offset, 112
+.equ pu1_avail_offset, 116
+.equ pi1_sao_offset, 120
+.equ wd_offset, 124
+.equ ht_offset, 128
+
.text
.p2align 2
@@ -72,23 +80,25 @@ ihevc_sao_edge_offset_class0_a9q:
STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments
- LDR r9,[sp,#60] @Loads wd
+ vpush {d8 - d15}
+
+ LDR r9,[sp,#wd_offset] @Loads wd
- LDR r4,[sp,#40] @Loads pu1_src_top_left
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
VMOV.I8 Q1,#2 @const_2 = vdupq_n_s8(2)
ADD r11,r3,r9 @pu1_src_top[wd]
- LDR r10,[sp,#64] @Loads ht
+ LDR r10,[sp,#ht_offset] @Loads ht
VMOV.I16 Q2,#0 @const_min_clip = vdupq_n_s16(0)
LDRB r12,[r11,#-1] @pu1_src_top[wd - 1]
- LDR r7,[sp,#52] @Loads pu1_avail
+ LDR r7,[sp,#pu1_avail_offset] @Loads pu1_avail
VMOV.I16 Q3,#255 @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
LDR r14, gi1_table_edge_idx_addr @table pointer
ulbl1:
add r14,r14,pc
- LDR r8,[sp,#56] @Loads pi1_sao_offset
+ LDR r8,[sp,#pi1_sao_offset] @Loads pi1_sao_offset
VMOV.S8 Q4,#0xFF @au1_mask = vdupq_n_s8(-1)
STRB r12,[r4] @*pu1_src_top_left = pu1_src_top[wd - 1]
@@ -337,6 +347,7 @@ PU1_SRC_LOOP_RESIDUE:
BNE PU1_SRC_LOOP_RESIDUE @If not equal jump to the pu1_src loop
END_LOOPS:
+ vpop {d8 - d15}
LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
diff --git a/common/arm/ihevc_sao_edge_offset_class0_chroma.s b/common/arm/ihevc_sao_edge_offset_class0_chroma.s
index 1dd56f6..e11cd4f 100644
--- a/common/arm/ihevc_sao_edge_offset_class0_chroma.s
+++ b/common/arm/ihevc_sao_edge_offset_class0_chroma.s
@@ -60,6 +60,15 @@
@r9 => wd
@r10=> ht
+.equ pu1_src_top_left_offset, 104
+.equ pu1_src_top_right_offset, 108
+.equ pu1_src_bot_left_offset, 112
+.equ pu1_avail_offset, 116
+.equ pi1_sao_u_offset, 120
+.equ pi1_sao_v_offset, 124
+.equ wd_offset, 128
+.equ ht_offset, 132
+
.text
.p2align 2
@@ -73,20 +82,22 @@ ihevc_sao_edge_offset_class0_chroma_a9q:
STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments
- LDR r9,[sp,#64] @Loads wd
+ vpush {d8 - d15}
+
+ LDR r9,[sp,#wd_offset] @Loads wd
- LDR r4,[sp,#40] @Loads pu1_src_top_left
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
ADD r11,r3,r9 @pu1_src_top[wd]
- LDR r10,[sp,#68] @Loads ht
+ LDR r10,[sp,#ht_offset] @Loads ht
VMOV.I8 Q1,#2 @const_2 = vdupq_n_s8(2)
LDRH r12,[r11,#-2] @pu1_src_top[wd - 1]
- LDR r7,[sp,#52] @Loads pu1_avail
+ LDR r7,[sp,#pu1_avail_offset] @Loads pu1_avail
VMOV.I16 Q2,#0 @const_min_clip = vdupq_n_s16(0)
STRH r12,[r4] @*pu1_src_top_left = pu1_src_top[wd - 1]
- LDR r8,[sp,#56] @Loads pi1_sao_offset_u
+ LDR r8,[sp,#pi1_sao_u_offset] @Loads pi1_sao_offset_u
VMOV.I16 Q3,#255 @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
SUB r4,r10,#1 @(ht - 1)
@@ -96,7 +107,7 @@ ulbl1:
VMOV.S8 Q4,#0xFF @au1_mask = vdupq_n_s8(-1)
MUL r4,r4,r1 @(ht - 1) * src_strd
- LDR r5,[sp,#60] @Loads pi1_sao_offset_v
+ LDR r5,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v
VLD1.8 D11,[r8] @offset_tbl = vld1_s8(pi1_sao_offset_u)
ADD r4,r4,r0 @pu1_src[(ht - 1) * src_strd]
@@ -423,6 +434,7 @@ PU1_SRC_LOOP_RESIDUE:
BNE PU1_SRC_LOOP_RESIDUE @If not equal jump to the pu1_src loop
END_LOOPS:
+ vpop {d8 - d15}
LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
diff --git a/common/arm/ihevc_sao_edge_offset_class1.s b/common/arm/ihevc_sao_edge_offset_class1.s
index aa1337f..029ac46 100644
--- a/common/arm/ihevc_sao_edge_offset_class1.s
+++ b/common/arm/ihevc_sao_edge_offset_class1.s
@@ -58,6 +58,14 @@
@r7 => wd
@r8 => ht
+.equ pu1_src_top_left_offset, 104
+.equ pu1_src_top_right_offset, 108
+.equ pu1_src_bot_left_offset, 112
+.equ pu1_avail_offset, 116
+.equ pi1_sao_offset, 120
+.equ wd_offset, 124
+.equ ht_offset, 128
+
.text
.p2align 2
@@ -71,11 +79,13 @@ ihevc_sao_edge_offset_class1_a9q:
STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments
- LDR r7,[sp,#60] @Loads wd
- LDR r4,[sp,#40] @Loads pu1_src_top_left
- LDR r5,[sp,#52] @Loads pu1_avail
- LDR r6,[sp,#56] @Loads pi1_sao_offset
- LDR r8,[sp,#64] @Loads ht
+ vpush {d8 - d15}
+
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
+ LDR r6,[sp,#pi1_sao_offset] @Loads pi1_sao_offset
+ LDR r8,[sp,#ht_offset] @Loads ht
SUB r9,r7,#1 @wd - 1
LDRB r10,[r3,r9] @pu1_src_top[wd - 1]
@@ -362,6 +372,7 @@ PU1_SRC_LOOP_RESIDUE:
VST1.8 {D30},[r10],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
END_LOOPS:
+ vpop {d8 - d15}
LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
diff --git a/common/arm/ihevc_sao_edge_offset_class1_chroma.s b/common/arm/ihevc_sao_edge_offset_class1_chroma.s
index 09d925f..b377220 100644
--- a/common/arm/ihevc_sao_edge_offset_class1_chroma.s
+++ b/common/arm/ihevc_sao_edge_offset_class1_chroma.s
@@ -60,6 +60,15 @@
@r8 => wd
@r9 => ht
+.equ pu1_src_top_left_offset, 104
+.equ pu1_src_top_right_offset, 108
+.equ pu1_src_bot_left_offset, 112
+.equ pu1_avail_offset, 116
+.equ pi1_sao_u_offset, 120
+.equ pi1_sao_v_offset, 124
+.equ wd_offset, 128
+.equ ht_offset, 132
+
.text
.p2align 2
@@ -73,13 +82,13 @@ ihevc_sao_edge_offset_class1_chroma_a9q:
STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments
- LDR r7,[sp,#60] @Loads wd
- LDR r4,[sp,#40] @Loads pu1_src_top_left
- LDR r5,[sp,#52] @Loads pu1_avail
- LDR r6,[sp,#56] @Loads pi1_sao_offset_u
- LDR r7,[sp,#60] @Loads pi1_sao_offset_v
- LDR r8,[sp,#64] @Loads wd
- LDR r9,[sp,#68] @Loads ht
+ vpush {d8 - d15}
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
+ LDR r6,[sp,#pi1_sao_u_offset] @Loads pi1_sao_offset_u
+ LDR r7,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v
+ LDR r8,[sp,#wd_offset] @Loads wd
+ LDR r9,[sp,#ht_offset] @Loads ht
SUB r10,r8,#2 @wd - 2
LDRH r11,[r3,r10] @pu1_src_top[wd - 2]
@@ -398,6 +407,7 @@ PU1_SRC_LOOP_RESIDUE:
VST1.8 {D30},[r10],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
END_LOOPS:
+ vpop {d8 - d15}
LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
diff --git a/common/arm/ihevc_sao_edge_offset_class2.s b/common/arm/ihevc_sao_edge_offset_class2.s
index 536f941..15d6efa 100644
--- a/common/arm/ihevc_sao_edge_offset_class2.s
+++ b/common/arm/ihevc_sao_edge_offset_class2.s
@@ -58,6 +58,14 @@
@r7 => wd
@r8=> ht
+.equ pu1_src_top_left_offset, 264
+.equ pu1_src_top_right_offset, 268
+.equ pu1_src_bot_left_offset, 272
+.equ pu1_avail_offset, 276
+.equ pi1_sao_offset, 280
+.equ wd_offset, 284
+.equ ht_offset, 288
+
.text
.syntax unified
.p2align 2
@@ -78,28 +86,29 @@ ihevc_sao_edge_offset_class2_a9q:
STMFD sp!,{r4-r12,r14} @stack stores the values of the arguments
- LDR r7,[sp,#0x3C] @Loads wd
+ vpush {d8 - d15}
+ SUB sp,sp,#160 @Decrement the stack pointer to store some temp arr values
- LDR r8,[sp,#0x40] @Loads ht
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r8,[sp,#ht_offset] @Loads ht
SUB r9,r7,#1 @wd - 1
- LDR r4,[sp,#0x28] @Loads pu1_src_top_left
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
LDRB r10,[r3,r9] @pu1_src_top[wd - 1]
- STR r0,[sp,#0x2C] @Store pu1_src in sp
+ STR r0,[sp,#152] @Store pu1_src in sp
MOV r9,r7 @Move width to r9 for loop count
- STR r2,[sp,#0x30] @Store pu1_src_left in sp
- LDR r5,[sp,#0x34] @Loads pu1_avail
- LDR r6,[sp,#0x38] @Loads pi1_sao_offset
- STR r3,[sp,#0x38] @Store pu1_src_top in sp
+ STR r2,[sp,#156] @Store pu1_src_left in sp
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
+ LDR r6,[sp,#pi1_sao_offset] @Loads pi1_sao_offset
+ STR r3,[sp,#148] @Store pu1_src_top in sp
- SUB sp,sp,#0x94 @Decrement the stack pointer to store some temp arr values
STRB r10,[sp] @u1_src_top_left_tmp = pu1_src_top[wd - 1]
SUB r10,r8,#1 @ht-1
MLA r11,r10,r1,r0 @pu1_src[(ht - 1) * src_strd + col]
- ADD r12,sp,#0x02 @temp array
+ ADD r12,sp,#2 @temp array
AU1_SRC_TOP_LOOP:
VLD1.8 D0,[r11]! @pu1_src[(ht - 1) * src_strd + col]
@@ -203,7 +212,7 @@ ulbl3:
VMOV.S8 Q4,#0xFF @au1_mask = vdupq_n_s8(-1)
ADDEQ r14,r14,#1 @pu1_src_left_cpy += 1
- STR r0,[sp,#0x90] @Store pu1_src in sp
+ STR r0,[sp,#144] @Store pu1_src in sp
CMP r7,#16 @Compare wd with 16
BLT WIDTH_RESIDUE @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
@@ -211,9 +220,9 @@ ulbl3:
BLE WD_16_HT_4_LOOP @If jump to WD_16_HT_4_LOOP
WIDTH_LOOP_16:
- LDR r7,[sp,#0xD0] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
CMP r6,r7 @col == wd
LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
@@ -232,21 +241,21 @@ SKIP_AU1_MASK_VAL:
MOVNE r8,r3 @pu1_src_top_cpy
SUB r8,r8,#1 @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1
- LDR r7,[sp,#0xD0] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
SUB r8,#8
ADD r3,r3,#16
- ADD r5,sp,#0x42 @*au1_src_left_tmp
+ ADD r5,sp,#66 @*au1_src_left_tmp
VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src)
SUB r0,#8
- LDR r4,[sp,#0xD4] @Loads ht
+ LDR r4,[sp,#ht_offset] @Loads ht
SUB r7,r7,r6 @(wd - col)
VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
- LDR r8,[sp,#0xC0] @Loads *pu1_src
+ LDR r8,[sp,#152] @Loads *pu1_src
ADD r7,r7,#15 @15 + (wd - col)
VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
@@ -263,7 +272,7 @@ AU1_SRC_LEFT_LOOP:
ADD r8,r0,r1 @I Iteration *pu1_src + src_strd
VMOV.I8 Q9,#0
- LDR r4,[sp,#0xC8] @I Loads pu1_avail
+ LDR r4,[sp,#pu1_avail_offset] @I Loads pu1_avail
MOV r7,r12 @row count, move ht_tmp to r7
VLD1.8 D16,[r8]! @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
@@ -498,11 +507,11 @@ PU1_SRC_LOOP:
INNER_LOOP_DONE:
- ADD r5,sp,#0x42 @*au1_src_left_tmp
+ ADD r5,sp,#66 @*au1_src_left_tmp
VST1.8 {Q10},[r0],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
- LDR r2,[sp,#0xC4] @Loads *pu1_src_left
+ LDR r2,[sp,#156] @Loads *pu1_src_left
- LDR r8,[sp,#0xD4] @Loads ht
+ LDR r8,[sp,#ht_offset] @Loads ht
SUB r5,r5,#1
SUB r2,r2,#1
@@ -515,8 +524,8 @@ SRC_LEFT_LOOP:
SUB r6,r6,#16 @Decrement the wd loop count by 16
CMP r6,#8 @Check whether residue remains
BLT RE_ASSINING_LOOP @Jump to re-assigning loop
- LDR r7,[sp,#0xD0] @Loads wd
- LDR r0,[sp,#0x90] @Loads *pu1_src
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r0,[sp,#144] @Loads *pu1_src
SUB r7,r7,r6
ADD r0,r0,r7
BGT WIDTH_LOOP_16 @If not equal jump to width_loop
@@ -524,8 +533,8 @@ SRC_LEFT_LOOP:
WD_16_HT_4_LOOP:
- LDR r7,[sp,#0xD0] @Loads wd
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
CMP r6,r7 @col == wd
LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
@@ -544,21 +553,21 @@ SKIP_AU1_MASK_VAL_WD_16_HT_4:
MOVNE r8,r3
SUB r8,r8,#1 @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1
- LDR r7,[sp,#0xD0] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
SUB r8,#8
ADD r3,r3,#16
- ADD r5,sp,#0x42 @*au1_src_left_tmp
+ ADD r5,sp,#66 @*au1_src_left_tmp
VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src)
SUB r0,#8
- LDR r4,[sp,#0xD4] @Loads ht
+ LDR r4,[sp,#ht_offset] @Loads ht
SUB r7,r7,r6 @(wd - col)
VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
- LDR r8,[sp,#0xC0] @Loads *pu1_src
+ LDR r8,[sp,#152] @Loads *pu1_src
ADD r7,r7,#15 @15 + (wd - col)
VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
@@ -588,7 +597,7 @@ PU1_SRC_LOOP_WD_16_HT_4:
CMP r7,r12
BLT SIGN_UP_CHANGE_WD_16_HT_4
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#2] @pu1_avail[2]
CMP r5,#0
BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4
@@ -639,9 +648,9 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4:
SUBS r7,r7,#1 @Decrement the ht_tmp loop count by 1
BNE PU1_SRC_LOOP_WD_16_HT_4 @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
- LDR r8,[sp,#0xD4] @Loads ht
- ADD r5,sp,#0x42 @*au1_src_left_tmp
- LDR r2,[sp,#0xC4] @Loads *pu1_src_left
+ LDR r8,[sp,#ht_offset] @Loads ht
+ ADD r5,sp,#66 @*au1_src_left_tmp
+ LDR r2,[sp,#156] @Loads *pu1_src_left
SUB r5,r5,#1
SUB r2,r2,#1
@@ -656,8 +665,8 @@ SRC_LEFT_LOOP_WD_16_HT_4:
WIDTH_RESIDUE:
- LDR r7,[sp,#0xD0] @Loads wd
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
CMP r6,r7 @wd_residue == wd
LDRBEQ r8,[r5] @pu1_avail[0]
@@ -679,16 +688,16 @@ PU1_AVAIL_2_RESIDUE:
SUB r8,r8,#1
- ADD r5,sp,#0x42 @*au1_src_left_tmp
+ ADD r5,sp,#66 @*au1_src_left_tmp
VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
VLD1.8 D11,[r8]! @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
- LDR r7,[sp,#0xD0] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
- LDR r4,[sp,#0xD4] @Loads ht
+ LDR r4,[sp,#ht_offset] @Loads ht
VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
SUB r7,r7,#1 @(wd - 1)
- LDR r8,[sp,#0xC0] @Loads *pu1_src
+ LDR r8,[sp,#152] @Loads *pu1_src
VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
SUB r5,r5,#1
@@ -718,7 +727,7 @@ PU1_SRC_LOOP_RESIDUE:
CMP r7,r12
BLT SIGN_UP_CHANGE_RESIDUE
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#2] @pu1_avail[2]
CMP r5,#0
BNE SIGN_UP_CHANGE_DONE_RESIDUE
@@ -762,10 +771,10 @@ SIGN_UP_CHANGE_DONE_RESIDUE:
SUBS r7,r7,#1
BNE PU1_SRC_LOOP_RESIDUE
- LDR r8,[sp,#0xD4] @Loads ht
- ADD r5,sp,#0x42 @*au1_src_left_tmp
+ LDR r8,[sp,#ht_offset] @Loads ht
+ ADD r5,sp,#66 @*au1_src_left_tmp
- LDR r2,[sp,#0xC4] @Loads *pu1_src_left
+ LDR r2,[sp,#156] @Loads *pu1_src_left
SUB r5,r5,#1
SUB r2,r2,#1
@@ -778,23 +787,23 @@ SRC_LEFT_LOOP_RESIDUE:
RE_ASSINING_LOOP:
- LDR r8,[sp,#0xD4] @Loads ht
- LDR r7,[sp,#0xD0] @Loads wd
+ LDR r8,[sp,#ht_offset] @Loads ht
+ LDR r7,[sp,#wd_offset] @Loads wd
- LDR r0,[sp,#0xC0] @Loads *pu1_src
+ LDR r0,[sp,#152] @Loads *pu1_src
SUB r8,r8,#1 @ht - 1
MLA r6,r8,r1,r7 @wd - 1 + (ht - 1) * src_strd
STRB r9,[r0] @pu1_src_org[0] = u1_pos_0_0_tmp
- LDR r4,[sp,#0xBC] @Loads pu1_src_top_left
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
ADD r6,r0,r6 @pu1_src[wd - 1 + (ht - 1) * src_strd]
- ADD r12,sp,#0x02
+ ADD r12,sp,#2
STRB r10,[r6,#-1] @pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp
LDRB r11,[sp] @load u1_src_top_left_tmp from stack pointer
- LDR r3,[sp,#0xCC] @Loads pu1_src_top
+ LDR r3,[sp,#148] @Loads pu1_src_top
STRB r11,[r4] @*pu1_src_top_left = u1_src_top_left_tmp
@@ -805,7 +814,8 @@ SRC_TOP_LOOP:
BNE SRC_TOP_LOOP
END_LOOPS:
- ADD sp,sp,#0x94
+ ADD sp,sp,#160
+ vpop {d8 - d15}
LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
diff --git a/common/arm/ihevc_sao_edge_offset_class2_chroma.s b/common/arm/ihevc_sao_edge_offset_class2_chroma.s
index 6a301cb..f7ab3f8 100644
--- a/common/arm/ihevc_sao_edge_offset_class2_chroma.s
+++ b/common/arm/ihevc_sao_edge_offset_class2_chroma.s
@@ -60,6 +60,15 @@
@r7 => wd
@r8=> ht
+.equ pu1_src_top_left_offset, 328
+.equ pu1_src_top_right_offset, 332
+.equ pu1_src_bot_left_offset, 336
+.equ pu1_avail_offset, 340
+.equ pi1_sao_u_offset, 344
+.equ pi1_sao_v_offset, 348
+.equ wd_offset, 352
+.equ ht_offset, 356
+
.text
.syntax unified
.p2align 2
@@ -86,23 +95,24 @@ ihevc_sao_edge_offset_class2_chroma_a9q:
STMFD sp!,{r4-r12,r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
+ SUB sp,sp,#224 @Decrement the stack pointer to store some temp arr values
- LDR r7,[sp,#0x40] @Loads wd
- LDR r8,[sp,#0x44] @Loads ht
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r8,[sp,#ht_offset] @Loads ht
SUB r9,r7,#2 @wd - 2
- LDR r4,[sp,#0x28] @Loads pu1_src_top_left
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
LDRH r10,[r3,r9] @pu1_src_top[wd - 2]
- STR r0,[sp,#0x2C] @Store pu1_src in sp
+ STR r0,[sp,#212] @Store pu1_src in sp
MOV r9,r7 @Move width to r9 for loop count
- STR r2,[sp,#0x30] @Store pu1_src_left in sp
- LDR r5,[sp,#0x34] @Loads pu1_avail
- LDR r6,[sp,#0x38] @Loads pi1_sao_offset_u
+ STR r2,[sp,#216] @Store pu1_src_left in sp
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
+ LDR r6,[sp,#pi1_sao_u_offset] @Loads pi1_sao_offset_u
- STR r3,[sp,#0x38] @Store pu1_src_top in sp
- SUB sp,sp,#0xD4 @Decrement the stack pointer to store some temp arr values
+ STR r3,[sp,#220] @Store pu1_src_top in sp
STRH r10,[sp] @u1_src_top_left_tmp = pu1_src_top[wd - 2]
SUB r10,r8,#1 @ht-1
@@ -178,7 +188,7 @@ ulbl2:
LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx]
CMP r12,#0 @0 != edge_idx
BEQ PU1_AVAIL_7_LOOP_U
- LDR r11,[sp,#0x110] @Loads pi1_sao_offset_v
+ LDR r11,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v
LDRSB r11,[r11,r12] @pi1_sao_offset_v[edge_idx]
ADD r10,r10,r11 @pu1_src[0] + pi1_sao_offset_v[edge_idx]
USAT r10,#8,r10 @u1_pos_0_0_tmp_v = CLIP3(pu1_src[0] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
@@ -253,7 +263,7 @@ ulbl4:
LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx]
CMP r12,#0
BEQ PU1_AVAIL_3_LOOP
- LDR r14,[sp,#0x110] @Loads pi1_sao_offset_v
+ LDR r14,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v
LDRSB r11,[r14,r12] @pi1_sao_offset_v[edge_idx]
ADD r9,r9,r11 @pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
USAT r9,#8,r9 @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
@@ -280,7 +290,7 @@ PU1_AVAIL_3_LOOP:
VLD1.8 D6,[r6] @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
SUBEQ r12,r12,#1 @ht_tmp--
- LDR r6,[sp,#0x110] @Loads pi1_sao_offset_v
+ LDR r6,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v
ADDEQ r14,r14,#2 @pu1_src_left_cpy += 2
STR r0,[sp,#2] @Store pu1_src in sp
@@ -298,8 +308,8 @@ ulbl5:
BLE WD_16_HT_4_LOOP @If jump to WD_16_HT_4_LOOP
WIDTH_LOOP_16:
- LDR r5,[sp,#0x108] @Loads pu1_avail
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
+ LDR r7,[sp,#wd_offset] @Loads wd
CMP r6,r7 @col == wd
LDRBEQ r8,[r5] @pu1_avail[0]
@@ -321,16 +331,16 @@ SKIP_AU1_MASK_VAL:
SUB r0,#8
CMP r9,#0
- LDR r4,[sp,#0x118] @Loads ht
+ LDR r4,[sp,#ht_offset] @Loads ht
SUBEQ r8,r0,r1 @pu1_src - src_strd
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
MOVNE r8,r3 @pu1_src_top_cpy
SUB r8,r8,#2 @pu1_src - src_strd - 2
ADD r3,r3,#16
- ADD r5,sp,#0x4B @*au1_src_left_tmp
+ ADD r5,sp,#75 @*au1_src_left_tmp
VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
SUB r8,#8
@@ -338,7 +348,7 @@ SKIP_AU1_MASK_VAL:
ADD r7,r7,#14 @15 + (wd - col)
VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
- LDR r8,[sp,#0x100] @Loads *pu1_src
+ LDR r8,[sp,#212] @Loads *pu1_src
ADD r7,r8,r7 @pu1_src[0 * src_strd + 15 + (wd - col)]
VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
@@ -364,7 +374,7 @@ AU1_SRC_LEFT_LOOP:
VMOV.I8 Q9,#0
LDRH r5,[r8] @I pu1_src_cpy[src_strd + 16]
- LDR r10,[sp,#0x108] @I Loads pu1_avail
+ LDR r10,[sp,#pu1_avail_offset] @I Loads pu1_avail
VMOV.16 D18[0],r5 @I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
LDRB r10,[r10,#2] @I pu1_avail[2]
@@ -654,11 +664,11 @@ PU1_SRC_LOOP:
INNER_LOOP_DONE:
- LDR r8,[sp,#0x118] @Loads ht
+ LDR r8,[sp,#ht_offset] @Loads ht
VMOVN.I16 D20,Q10 @vmovn_s16(pi2_tmp_cur_row.val[0])
- ADD r5,sp,#0x4B @*au1_src_left_tmp
+ ADD r5,sp,#75 @*au1_src_left_tmp
- LDR r11,[sp,#0x104] @Loads *pu1_src_left
+ LDR r11,[sp,#216] @Loads *pu1_src_left
VMOVN.I16 D21,Q9 @vmovn_s16(pi2_tmp_cur_row.val[1])
@@ -673,8 +683,8 @@ SRC_LEFT_LOOP:
CMP r6,#8 @Check whether residue remains
BLT RE_ASSINING_LOOP @Jump to re-assigning loop
- LDR r7,[sp,#0x114] @Loads wd
- LDR r0,[sp,#0x02] @Loads *pu1_src
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r0,[sp,#2] @Loads *pu1_src
SUB r7,r7,r6
ADD r0,r0,r7
BGT WIDTH_LOOP_16 @If not equal jump to width_loop
@@ -682,8 +692,8 @@ SRC_LEFT_LOOP:
WD_16_HT_4_LOOP:
- LDR r5,[sp,#0x108] @Loads pu1_avail
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
+ LDR r7,[sp,#wd_offset] @Loads wd
CMP r6,r7 @col == wd
LDRBEQ r8,[r5] @pu1_avail[0]
@@ -709,12 +719,12 @@ SKIP_AU1_MASK_VAL_WD_16_HT_4:
SUB r8,#8
ADD r3,r3,#16
- ADD r5,sp,#0x4B @*au1_src_left_tmp
- LDR r4,[sp,#0x118] @Loads ht
- LDR r7,[sp,#0x114] @Loads wd
+ ADD r5,sp,#75 @*au1_src_left_tmp
+ LDR r4,[sp,#ht_offset] @Loads ht
+ LDR r7,[sp,#wd_offset] @Loads wd
SUB r7,r7,r6 @(wd - col)
ADD r7,r7,#14 @15 + (wd - col)
- LDR r8,[sp,#0x100] @Loads *pu1_src
+ LDR r8,[sp,#212] @Loads *pu1_src
ADD r7,r8,r7 @pu1_src[0 * src_strd + 15 + (wd - col)]
AU1_SRC_LEFT_LOOP_WD_16_HT_4:
@@ -749,7 +759,7 @@ PU1_SRC_LOOP_WD_16_HT_4:
CMP r7,r12
BLT SIGN_UP_CHANGE_WD_16_HT_4
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#2] @pu1_avail[2]
CMP r5,#0
BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4
@@ -815,9 +825,9 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4:
SUBS r7,r7,#1 @Decrement the ht_tmp loop count by 1
BNE PU1_SRC_LOOP_WD_16_HT_4 @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
- LDR r8,[sp,#0x118] @Loads ht
- ADD r5,sp,#0x4B @*au1_src_left_tmp
- LDR r11,[sp,#0x104] @Loads *pu1_src_left
+ LDR r8,[sp,#ht_offset] @Loads ht
+ ADD r5,sp,#75 @*au1_src_left_tmp
+ LDR r11,[sp,#216] @Loads *pu1_src_left
SRC_LEFT_LOOP_WD_16_HT_4:
LDR r7,[r5],#4 @au1_src_left_tmp[row]
@@ -829,16 +839,16 @@ SRC_LEFT_LOOP_WD_16_HT_4:
SUBS r6,r6,#16 @Decrement the wd loop count by 16
BLE RE_ASSINING_LOOP @Jump to re-assigning loop
- LDR r7,[sp,#0x114] @Loads wd
- LDR r0,[sp,#0x02] @Loads *pu1_src
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r0,[sp,#2] @Loads *pu1_src
SUB r7,r7,r6
ADD r0,r0,r7
BGT WD_16_HT_4_LOOP
WIDTH_RESIDUE:
- LDR r7,[sp,#0x114] @Loads wd
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
CMP r6,r7 @wd_residue == wd
LDRBEQ r8,[r5] @pu1_avail[0]
@@ -860,10 +870,10 @@ WIDTH_RESIDUE:
VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
SUB r8,#8
- ADD r5,sp,#0x4B @*au1_src_left_tmp
- LDR r4,[sp,#0x118] @Loads ht
- LDR r7,[sp,#0x114] @Loads wd
- LDR r8,[sp,#0x100] @Loads *pu1_src
+ ADD r5,sp,#75 @*au1_src_left_tmp
+ LDR r4,[sp,#ht_offset] @Loads ht
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r8,[sp,#212] @Loads *pu1_src
SUB r7,r7,#2 @(wd - 2)
ADD r7,r8,r7 @pu1_src[0 * src_strd + (wd - 2)]
@@ -897,7 +907,7 @@ PU1_SRC_LOOP_RESIDUE:
CMP r7,r12
BLT SIGN_UP_CHANGE_RESIDUE
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#2] @pu1_avail[2]
CMP r5,#0
BNE SIGN_UP_CHANGE_DONE_RESIDUE
@@ -957,9 +967,9 @@ SIGN_UP_CHANGE_DONE_RESIDUE:
SUBS r7,r7,#1 @Decrement the ht_tmp loop count by 1
BNE PU1_SRC_LOOP_RESIDUE @If not equal jump to PU1_SRC_LOOP
- LDR r8,[sp,#0x118] @Loads ht
- LDR r11,[sp,#0x104] @Loads *pu1_src_left
- ADD r5,sp,#0x4B @*au1_src_left_tmp
+ LDR r8,[sp,#ht_offset] @Loads ht
+ LDR r11,[sp,#216] @Loads *pu1_src_left
+ ADD r5,sp,#75 @*au1_src_left_tmp
SRC_LEFT_LOOP_RESIDUE:
LDR r7,[r5],#4 @au1_src_left_tmp[row]
@@ -970,12 +980,12 @@ SRC_LEFT_LOOP_RESIDUE:
RE_ASSINING_LOOP:
- LDR r8,[sp,#0x118] @Loads ht
+ LDR r8,[sp,#ht_offset] @Loads ht
- LDR r0,[sp,#0x100] @Loads *pu1_src
+ LDR r0,[sp,#212] @Loads *pu1_src
SUB r8,r8,#1 @ht - 1
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
LDRH r9,[sp,#6]
MLA r6,r8,r1,r7 @wd - 2 + (ht - 1) * src_strd
@@ -987,10 +997,10 @@ RE_ASSINING_LOOP:
ADD r12,sp,#10
STRH r9,[r6,#-2] @pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u
- LDR r4,[sp,#0xFC] @Loads pu1_src_top_left
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
LDRH r10,[sp] @load u1_src_top_left_tmp from stack pointer
STRH r10,[r4] @*pu1_src_top_left = u1_src_top_left_tmp
- LDR r3,[sp,#0x10C] @Loads pu1_src_top
+ LDR r3,[sp,#220] @Loads pu1_src_top
SRC_TOP_LOOP:
VLD1.8 D0,[r12]! @pu1_src_top[col] = au1_src_top_tmp[col]
@@ -999,7 +1009,9 @@ SRC_TOP_LOOP:
BNE SRC_TOP_LOOP
END_LOOPS:
- ADD sp,sp,#0xD4
+ ADD sp,sp,#224
+
+ vpop {d8 - d15}
LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
diff --git a/common/arm/ihevc_sao_edge_offset_class3.s b/common/arm/ihevc_sao_edge_offset_class3.s
index f3482dc..fb3b05c 100644
--- a/common/arm/ihevc_sao_edge_offset_class3.s
+++ b/common/arm/ihevc_sao_edge_offset_class3.s
@@ -58,6 +58,14 @@
@r7 => wd
@r8=> ht
+.equ pu1_src_top_left_offset, 264
+.equ pu1_src_top_right_offset, 268
+.equ pu1_src_bot_left_offset, 272
+.equ pu1_avail_offset, 276
+.equ pi1_sao_offset, 280
+.equ wd_offset, 284
+.equ ht_offset, 288
+
.text
.syntax unified
.p2align 2
@@ -78,26 +86,27 @@ ihevc_sao_edge_offset_class3_a9q:
STMFD sp!,{r4-r12,r14} @stack stores the values of the arguments
- LDR r7,[sp,#0x3C] @Loads wd
+ vpush {d8 - d15}
+ SUB sp,sp,#160 @Decrement the stack pointer to store some temp arr values
+ LDR r7,[sp,#wd_offset] @Loads wd
- LDR r8,[sp,#0x40] @Loads ht
+ LDR r8,[sp,#ht_offset] @Loads ht
SUB r9,r7,#1 @wd - 1
- LDR r4,[sp,#0x28] @Loads pu1_src_top_left
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
LDRB r10,[r3,r9] @pu1_src_top[wd - 1]
MOV r9,r7 @Move width to r9 for loop count
- LDR r5,[sp,#0x34] @Loads pu1_avail
- LDR r6,[sp,#0x38] @Loads pi1_sao_offset
- STR r3,[sp,#0x38] @Store pu1_src_top in sp
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
+ LDR r6,[sp,#pi1_sao_offset] @Loads pi1_sao_offset
+ STR r3,[sp,#156] @Store pu1_src_top in sp
- SUB sp,sp,#0x94 @Decrement the stack pointer to store some temp arr values
STRB r10,[sp] @u1_src_top_left_tmp = pu1_src_top[wd - 1]
SUB r10,r8,#1 @ht-1
MLA r11,r10,r1,r0 @pu1_src[(ht - 1) * src_strd + col]
- ADD r12,sp,#0x02 @temp array
+ ADD r12,sp,#2 @temp array
AU1_SRC_TOP_LOOP:
VLD1.8 D0,[r11]! @pu1_src[(ht - 1) * src_strd + col]
@@ -112,7 +121,7 @@ PU1_AVAIL_5_LOOP:
LDRB r9,[r0,r10] @u1_pos_0_0_tmp = pu1_src[wd - 1]
BEQ PU1_AVAIL_6_LOOP
- LDR r11,[sp,#0xC0] @Load pu1_src_top_right from sp
+ LDR r11,[sp,#pu1_src_top_right_offset] @Load pu1_src_top_right from sp
SUB r10,r10,#1 @[wd - 1 - 1]
LDRB r11,[r11] @pu1_src_top_right[0]
@@ -147,13 +156,13 @@ PU1_AVAIL_6_LOOP:
SUB r11,r8,#1 @ht - 1
CMP r10,#0
- STR r0,[sp,#0xC0] @Store pu1_src in sp
+ STR r0,[sp,#148] @Store pu1_src in sp
MLA r12,r11,r1,r0 @pu1_src[(ht - 1) * src_strd]
LDRB r10,[r12] @u1_pos_wd_ht_tmp = pu1_src[(ht - 1) * src_strd]
BEQ PU1_AVAIL_3_LOOP
- LDR r14,[sp,#0xC4] @Load pu1_src_bot_left from sp
+ LDR r14,[sp,#pu1_src_bot_left_offset] @Load pu1_src_bot_left from sp
SUB r11,r12,r1 @pu1_src[(ht - 1) * src_strd) - src_strd]
LDRB r14,[r14] @Load pu1_src_bot_left[0]
@@ -186,7 +195,7 @@ ulbl2:
USAT r10,#8,r10 @u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
PU1_AVAIL_3_LOOP:
- STR r2,[sp,#0xC4] @Store pu1_src_left in sp
+ STR r2,[sp,#152] @Store pu1_src_left in sp
MOV r12,r8 @Move ht
MOV r14,r2 @Move pu1_src_left to pu1_src_left_cpy
@@ -211,7 +220,7 @@ ulbl3:
VMOV.S8 Q4,#0xFF @au1_mask = vdupq_n_s8(-1)
ADDEQ r14,r14,#1 @pu1_src_left_cpy += 1
- STR r0,[sp,#0x90] @Store pu1_src in sp
+ STR r0,[sp,#144] @Store pu1_src in sp
VLD1.8 D6,[r6] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
MOV r6,r7 @move wd to r6 loop_count
@@ -221,9 +230,9 @@ ulbl3:
BLE WD_16_HT_4_LOOP @If jump to WD_16_HT_4_LOOP
WIDTH_LOOP_16:
- LDR r7,[sp,#0xD0] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
CMP r6,r7 @col == wd
LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1
@@ -238,13 +247,13 @@ SKIP_AU1_MASK_VAL:
LDRB r8,[r5,#2] @pu1_avail[2]
CMP r8,#0
- LDR r4,[sp,#0xD4] @Loads ht
+ LDR r4,[sp,#ht_offset] @Loads ht
SUBEQ r8,r0,r1 @pu1_src - src_strd
MOVNE r8,r3
- ADD r5,sp,#0x42 @*au1_src_left_tmp
+ ADD r5,sp,#66 @*au1_src_left_tmp
- LDR r7,[sp,#0xD0] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
ADD r8,r8,#1 @pu1_src - src_strd + 1
SUB r7,r7,r6 @(wd - col)
@@ -253,7 +262,7 @@ SKIP_AU1_MASK_VAL:
SUB r8,#8
ADD r3,r3,#16
- LDR r8,[sp,#0xC0] @Loads *pu1_src
+ LDR r8,[sp,#148] @Loads *pu1_src
VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src)
SUB r0,#8
@@ -285,7 +294,7 @@ AU1_SRC_LEFT_LOOP:
ADD r8,r8,#1 @I pu1_src_left_cpy[ht_tmp - row + 1]
LDRB r8,[r8]
- LDR r5,[sp,#0xC8] @I Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @I Loads pu1_avail
VMOV.8 D19[7],r8 @I vsetq_lane_u8
LDRB r5,[r5,#2] @I pu1_avail[2]
@@ -375,7 +384,7 @@ PU1_SRC_LOOP:
CMP r7,#1 @III
BNE NEXT_ROW_ELSE_2 @III
- LDR r5,[sp,#0xC8] @III Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @III Loads pu1_avail
LDRB r5,[r5,#3] @III pu1_avail[3]
CMP r5,#0 @III
SUBNE r8,r2,#2 @III pu1_src_cpy[src_strd - 1]
@@ -465,7 +474,7 @@ NEXT_ROW_ELSE_2:
ADD r8,r0,r1,LSL #1 @*pu1_src + src_strd
VMOVN.I16 D20,Q10 @III vmovn_s16(pi2_tmp_cur_row.val[0])
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#3] @pu1_avail[3]
VMOVN.I16 D21,Q11 @III vmovn_s16(pi2_tmp_cur_row.val[1])
@@ -529,13 +538,13 @@ NEXT_ROW_POINTER_ASSIGNED_3:
INNER_LOOP_DONE:
VMOVN.I16 D20,Q10 @vmovn_s16(pi2_tmp_cur_row.val[0])
- LDR r8,[sp,#0xD4] @Loads ht
+ LDR r8,[sp,#ht_offset] @Loads ht
VMOVN.I16 D21,Q11 @vmovn_s16(pi2_tmp_cur_row.val[1])
- ADD r5,sp,#0x42 @*au1_src_left_tmp
+ ADD r5,sp,#66 @*au1_src_left_tmp
VST1.8 {Q10},[r0],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
- LDR r2,[sp,#0xC4] @Loads *pu1_src_left
+ LDR r2,[sp,#152] @Loads *pu1_src_left
SRC_LEFT_LOOP:
LDR r7,[r5],#4 @au1_src_left_tmp[row]
SUBS r8,r8,#4
@@ -545,8 +554,8 @@ SRC_LEFT_LOOP:
SUBS r6,r6,#16 @Decrement the wd loop count by 16
CMP r6,#8 @Check whether residue remains
BLT RE_ASSINING_LOOP @Jump to re-assigning loop
- LDR r7,[sp,#0xD0] @Loads wd
- LDR r0,[sp,#0x90] @Loads *pu1_src
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r0,[sp,#144] @Loads *pu1_src
SUB r7,r7,r6
ADD r0,r0,r7
BGT WIDTH_LOOP_16 @If not equal jump to width_loop
@@ -555,8 +564,8 @@ SRC_LEFT_LOOP:
WD_16_HT_4_LOOP:
- LDR r5,[sp,#0xC8] @Loads pu1_avail
- LDR r7,[sp,#0xD0] @Loads wd
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
+ LDR r7,[sp,#wd_offset] @Loads wd
CMP r6,r7 @col == wd
LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1
@@ -579,12 +588,12 @@ SKIP_AU1_MASK_VAL_WD_16_HT_4:
SUB r8,#8
ADD r3,r3,#16
- ADD r5,sp,#0x42 @*au1_src_left_tmp
- LDR r4,[sp,#0xD4] @Loads ht
- LDR r7,[sp,#0xD0] @Loads wd
+ ADD r5,sp,#66 @*au1_src_left_tmp
+ LDR r4,[sp,#ht_offset] @Loads ht
+ LDR r7,[sp,#wd_offset] @Loads wd
SUB r7,r7,r6 @(wd - col)
ADD r7,r7,#15 @15 + (wd - col)
- LDR r8,[sp,#0xC0] @Loads *pu1_src
+ LDR r8,[sp,#148] @Loads *pu1_src
ADD r7,r8,r7 @pu1_src[0 * src_strd + 15 + (wd - col)]
SUB r5,r5,#1
@@ -609,7 +618,7 @@ PU1_SRC_LOOP_WD_16_HT_4:
VLD1.8 D16,[r8]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
VLD1.8 D17,[r8] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
SUB r8,#8
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#3] @pu1_avail[3]
CMP r5,#0
BEQ NEXT_ROW_ELSE_WD_16_HT_4
@@ -628,7 +637,7 @@ NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
CMP r7,r12
BNE SIGN_UP_CHANGE_WD_16_HT_4
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#2] @pu1_avail[2]
CMP r5,#0
BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4
@@ -680,9 +689,9 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4:
SUBS r7,r7,#1 @Decrement the ht_tmp loop count by 1
BNE PU1_SRC_LOOP_WD_16_HT_4 @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
- LDR r8,[sp,#0xD4] @Loads ht
- ADD r5,sp,#0x42 @*au1_src_left_tmp
- LDR r2,[sp,#0xC4] @Loads *pu1_src_left
+ LDR r8,[sp,#ht_offset] @Loads ht
+ ADD r5,sp,#66 @*au1_src_left_tmp
+ LDR r2,[sp,#152] @Loads *pu1_src_left
SRC_LEFT_LOOP_WD_16_HT_4:
LDR r7,[r5],#4 @au1_src_left_tmp[row]
STR r7,[r2],#4 @pu1_src_left[row] = au1_src_left_tmp[row]
@@ -691,16 +700,16 @@ SRC_LEFT_LOOP_WD_16_HT_4:
SUBS r6,r6,#16 @Decrement the wd loop count by 16
BLE RE_ASSINING_LOOP @Jump to re-assigning loop
- LDR r7,[sp,#0xD0] @Loads wd
- LDR r0,[sp,#0x90] @Loads *pu1_src
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r0,[sp,#144] @Loads *pu1_src
SUB r7,r7,r6
ADD r0,r0,r7
BGT WD_16_HT_4_LOOP @If not equal jump to width_loop
WIDTH_RESIDUE:
- LDR r7,[sp,#0xD0] @Loads wd
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
CMP r6,r7 @wd_residue == wd
LDRBEQ r8,[r5] @pu1_avail[0]
@@ -722,10 +731,10 @@ PU1_AVAIL_2_RESIDUE:
SUB r8,#8
- ADD r5,sp,#0x42 @*au1_src_left_tmp
- LDR r4,[sp,#0xD4] @Loads ht
- LDR r7,[sp,#0xD0] @Loads wd
- LDR r8,[sp,#0xC0] @Loads *pu1_src
+ ADD r5,sp,#66 @*au1_src_left_tmp
+ LDR r4,[sp,#ht_offset] @Loads ht
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r8,[sp,#148] @Loads *pu1_src
SUB r7,r7,#1 @(wd - 1)
ADD r7,r8,r7 @pu1_src[0 * src_strd + (wd - 1)]
SUB r5,r5,#1
@@ -751,7 +760,7 @@ PU1_SRC_LOOP_RESIDUE:
VLD1.8 D16,[r8]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
VLD1.8 D17,[r8] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
SUB r8,#8
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#3] @pu1_avail[3]
CMP r5,#0
BEQ NEXT_ROW_ELSE_RESIDUE
@@ -770,7 +779,7 @@ NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
CMP r7,r12
BNE SIGN_UP_CHANGE_RESIDUE
- LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#2] @pu1_avail[2]
CMP r5,#0
BNE SIGN_UP_CHANGE_DONE_RESIDUE
@@ -814,9 +823,9 @@ SIGN_UP_CHANGE_DONE_RESIDUE:
SUBS r7,r7,#1
BNE PU1_SRC_LOOP_RESIDUE
- LDR r8,[sp,#0xD4] @Loads ht
- LDR r2,[sp,#0xC4] @Loads *pu1_src_left
- ADD r5,sp,#0x42 @*au1_src_left_tmp
+ LDR r8,[sp,#ht_offset] @Loads ht
+ LDR r2,[sp,#152] @Loads *pu1_src_left
+ ADD r5,sp,#66 @*au1_src_left_tmp
SRC_LEFT_LOOP_RESIDUE:
LDR r7,[r5],#4 @au1_src_left_tmp[row]
@@ -826,24 +835,24 @@ SRC_LEFT_LOOP_RESIDUE:
RE_ASSINING_LOOP:
- LDR r7,[sp,#0xD0] @Loads wd
- LDR r0,[sp,#0xC0] @Loads *pu1_src
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r0,[sp,#148] @Loads *pu1_src
- LDR r11,[sp,#0xD4] @Loads ht
+ LDR r11,[sp,#ht_offset] @Loads ht
ADD r8,r0,r7 @pu1_src[wd]
- LDR r4,[sp,#0xBC] @Loads pu1_src_top_left
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
SUB r11,r11,#1 @ht - 1
STRB r9,[r8,#-1] @pu1_src_org[wd - 1] = u1_pos_wd_0_tmp
MLA r6,r11,r1,r0 @pu1_src_org[(ht - 1) * src_strd]
LDRB r8,[sp] @load u1_src_top_left_tmp from stack pointer
- ADD r12,sp,#0x02
+ ADD r12,sp,#2
STRB r10,[r6] @pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp
STRB r8,[r4] @*pu1_src_top_left = u1_src_top_left_tmp
- LDR r3,[sp,#0xCC] @Loads pu1_src_top
+ LDR r3,[sp,#156] @Loads pu1_src_top
SRC_TOP_LOOP:
VLD1.8 D0,[r12]! @pu1_src_top[col] = au1_src_top_tmp[col]
@@ -852,7 +861,8 @@ SRC_TOP_LOOP:
BNE SRC_TOP_LOOP
END_LOOPS:
- ADD sp,sp,#0x94
+ ADD sp,sp,#160
+ vpop {d8 - d15}
LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
diff --git a/common/arm/ihevc_sao_edge_offset_class3_chroma.s b/common/arm/ihevc_sao_edge_offset_class3_chroma.s
index fe3b459..9f4eb62 100644
--- a/common/arm/ihevc_sao_edge_offset_class3_chroma.s
+++ b/common/arm/ihevc_sao_edge_offset_class3_chroma.s
@@ -60,6 +60,15 @@
@r7 => wd
@r8=> ht
+.equ pu1_src_top_left_offset, 328
+.equ pu1_src_top_right_offset, 332
+.equ pu1_src_bot_left_offset, 336
+.equ pu1_avail_offset, 340
+.equ pi1_sao_u_offset, 344
+.equ pi1_sao_v_offset, 348
+.equ wd_offset, 352
+.equ ht_offset, 356
+
.text
.syntax unified
.p2align 2
@@ -86,21 +95,22 @@ ihevc_sao_edge_offset_class3_chroma_a9q:
STMFD sp!,{r4-r12,r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
+ SUB sp,sp,#224 @Decrement the stack pointer to store some temp arr values
- LDR r7,[sp,#0x40] @Loads wd
- LDR r8,[sp,#0x44] @Loads ht
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r8,[sp,#ht_offset] @Loads ht
SUB r9,r7,#2 @wd - 2
- LDR r4,[sp,#0x28] @Loads pu1_src_top_left
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
LDRH r10,[r3,r9] @pu1_src_top[wd - 2]
MOV r9,r7 @Move width to r9 for loop count
- LDR r5,[sp,#0x34] @Loads pu1_avail
- LDR r6,[sp,#0x38] @Loads pi1_sao_offset_u
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
+ LDR r6,[sp,#pi1_sao_u_offset] @Loads pi1_sao_offset_u
- STR r3,[sp,#0x38] @Store pu1_src_top in sp
- SUB sp,sp,#0xD4 @Decrement the stack pointer to store some temp arr values
+ STR r3,[sp,#220] @Store pu1_src_top in sp
STRH r10,[sp] @u1_src_top_left_tmp = pu1_src_top[wd - 2]
SUB r10,r8,#1 @ht-1
@@ -122,7 +132,7 @@ PU1_AVAIL_5_LOOP_U:
LDRB r10,[r0,r11] @u1_pos_0_0_tmp_v = pu1_src[wd - 1]
BEQ PU1_AVAIL_6_LOOP_U
- LDR r11,[sp,#0x100] @Load pu1_src_top_right from sp
+ LDR r11,[sp,#pu1_src_top_right_offset] @Load pu1_src_top_right from sp
LDRB r11,[r11] @pu1_src_top_right[0]
SUB r12,r9,r11 @pu1_src[wd - 2] - pu1_src_top_right[0]
CMP r12,#0
@@ -150,7 +160,7 @@ ulbl1:
PU1_AVAIL_5_LOOP_V:
- LDR r11,[sp,#0x100] @Load pu1_src_top_right from sp
+ LDR r11,[sp,#pu1_src_top_right_offset] @Load pu1_src_top_right from sp
LDRB r11,[r11,#1] @pu1_src_top_right[1]
SUB r12,r10,r11 @pu1_src[wd - 1] - pu1_src_top_right[1]
CMP r12,#0
@@ -172,7 +182,7 @@ ulbl2:
LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx]
CMP r12,#0 @0 != edge_idx
BEQ PU1_AVAIL_6_LOOP_U
- LDR r11,[sp,#0x110] @Loads pi1_sao_offset_v
+ LDR r11,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v
LDRSB r11,[r11,r12] @pi1_sao_offset_v[edge_idx]
ADD r10,r10,r11 @pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx]
USAT r10,#8,r10 @u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
@@ -180,7 +190,7 @@ ulbl2:
PU1_AVAIL_6_LOOP_U:
STRB r9,[sp,#6]
STRB r10,[sp,#7]
- STR r0,[sp,#0x100] @Store pu1_src in sp
+ STR r0,[sp,#212] @Store pu1_src in sp
LDRB r10,[r5,#6] @pu1_avail[6]
CMP r10,#0
@@ -198,7 +208,7 @@ PU1_AVAIL_6_LOOP_U:
MVNLT r11,#0
MOVGT r11,#1 @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd])
- LDR r14,[sp,#0x104] @Load pu1_src_bot_left from sp
+ LDR r14,[sp,#pu1_src_bot_left_offset] @Load pu1_src_bot_left from sp
LDRB r14,[r14] @Load pu1_src_bot_left[0]
SUB r14,r10,r14 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
CMP r14,#0
@@ -228,7 +238,7 @@ PU1_AVAIL_6_LOOP_V:
MVNLT r11,#0
MOVGT r11,#1 @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd])
- LDR r14,[sp,#0x104] @Load pu1_src_bot_left from sp
+ LDR r14,[sp,#pu1_src_bot_left_offset] @Load pu1_src_bot_left from sp
LDRB r14,[r14,#1] @Load pu1_src_bot_left[1]
SUB r14,r9,r14 @pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]
CMP r14,#0
@@ -244,7 +254,7 @@ ulbl4:
LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx]
CMP r12,#0
BEQ PU1_AVAIL_3_LOOP
- LDR r14,[sp,#0x110] @Loads pi1_sao_offset_v
+ LDR r14,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v
LDRSB r11,[r14,r12] @pi1_sao_offset_v[edge_idx]
ADD r9,r9,r11 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
USAT r9,#8,r9 @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
@@ -252,7 +262,7 @@ ulbl4:
PU1_AVAIL_3_LOOP:
STRB r10,[sp,#8]
STRB r9,[sp,#9]
- STR r2,[sp,#0x104] @Store pu1_src_left in sp
+ STR r2,[sp,#216] @Store pu1_src_left in sp
MOV r12,r8 @Move ht
MOV r14,r2 @Move pu1_src_left to pu1_src_left_cpy
@@ -276,7 +286,7 @@ PU1_AVAIL_2_LOOP_END:
VMOV.I16 Q1,#0 @const_min_clip = vdupq_n_s16(0)
VMOV.I16 Q2,#255 @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
VLD1.8 D6,[r6] @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
- LDR r6,[sp,#0x110] @Loads pi1_sao_offset_v
+ LDR r6,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v
VLD1.8 D7,[r6] @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
LDR r2, gi1_table_edge_idx_addr_5 @table pointer
ulbl5:
@@ -291,9 +301,9 @@ ulbl5:
BLE WD_16_HT_4_LOOP @If jump to WD_16_HT_4_LOOP
WIDTH_LOOP_16:
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
CMP r6,r7 @col == wd
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1
@@ -314,7 +324,7 @@ SKIP_AU1_MASK_VAL:
VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src)
SUB r0,#8
- ADD r5,sp,#0x4B @*au1_src_left_tmp
+ ADD r5,sp,#75 @*au1_src_left_tmp
SUBEQ r8,r0,r1 @pu1_src - src_strd
VMOV.I8 Q9,#0
@@ -326,15 +336,15 @@ SKIP_AU1_MASK_VAL:
SUB r8,#8
ADD r3,r3,#16
- LDR r4,[sp,#0x118] @Loads ht
+ LDR r4,[sp,#ht_offset] @Loads ht
VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
SUB r7,r7,r6 @(wd - col)
VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
ADD r7,r7,#14 @15 + (wd - col)
- LDR r8,[sp,#0x100] @Loads *pu1_src
+ LDR r8,[sp,#212] @Loads *pu1_src
VSUB.U8 Q7,Q8,Q7 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
ADD r7,r8,r7 @pu1_src[0 * src_strd + 15 + (wd - col)]
@@ -359,7 +369,7 @@ AU1_SRC_LEFT_LOOP:
LDRH r5,[r8,#2] @I
VMOV.16 D19[3],r5 @I vsetq_lane_u8
- LDR r11,[sp,#0x108] @I Loads pu1_avail
+ LDR r11,[sp,#pu1_avail_offset] @I Loads pu1_avail
LDRB r11,[r11,#2] @I pu1_avail[2]
VEXT.8 Q9,Q9,Q8,#14 @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
@@ -477,7 +487,7 @@ PU1_SRC_LOOP:
VCGT.U8 Q11,Q6,Q14 @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
BNE NEXT_ROW_POINTER_ASSIGNED_2 @III
- LDR r5,[sp,#0x108] @III Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @III Loads pu1_avail
LDRB r5,[r5,#3] @III pu1_avail[3]
CMP r5,#0 @III
SUBNE r11,r4,#4 @III pu1_src[src_strd - 2]
@@ -597,7 +607,7 @@ NEXT_ROW_POINTER_ASSIGNED_2:
LDRB r9,[r0,#17] @load the value pu1_src_cpy[17 - src_strd]
BNE NEXT_ROW_POINTER_ASSIGNED_3
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#3] @pu1_avail[3]
CMP r5,#0
SUBNE r8,r11,#4 @pu1_src[src_strd - 2]
@@ -657,13 +667,13 @@ NEXT_ROW_POINTER_ASSIGNED_3:
INNER_LOOP_DONE:
- LDR r8,[sp,#0x118] @Loads ht
+ LDR r8,[sp,#ht_offset] @Loads ht
VMOVN.I16 D20,Q10 @III vmovn_s16(pi2_tmp_cur_row.val[0])
- ADD r5,sp,#0x4B @*au1_src_left_tmp
+ ADD r5,sp,#75 @*au1_src_left_tmp
LSL r8,r8,#1
VMOVN.I16 D21,Q9 @III vmovn_s16(pi2_tmp_cur_row.val[1])
- LDR r11,[sp,#0x104] @Loads *pu1_src_left
+ LDR r11,[sp,#216] @Loads *pu1_src_left
SRC_LEFT_LOOP:
LDR r7,[r5],#4 @au1_src_left_tmp[row]
@@ -676,7 +686,7 @@ SRC_LEFT_LOOP:
CMP r6,#8 @Check whether residue remains
BLT RE_ASSINING_LOOP @Jump to re-assigning loop
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
LDR r0,[sp,#0x02] @Loads *pu1_src
SUB r7,r7,r6
ADD r0,r0,r7
@@ -684,9 +694,9 @@ SRC_LEFT_LOOP:
BEQ WIDTH_RESIDUE @If residue remains jump to residue loop
WD_16_HT_4_LOOP:
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
CMP r6,r7 @col == wd
LDRBEQ r8,[r5] @pu1_avail[0]
@@ -716,17 +726,17 @@ SKIP_AU1_MASK_VAL_WD_16_HT_4:
VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
SUB r8,#8
- ADD r5,sp,#0x4B @*au1_src_left_tmp
+ ADD r5,sp,#75 @*au1_src_left_tmp
- LDR r4,[sp,#0x118] @Loads ht
+ LDR r4,[sp,#ht_offset] @Loads ht
VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
SUB r7,r7,r6 @(wd - col)
VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
ADD r7,r7,#14 @15 + (wd - col)
- LDR r8,[sp,#0x100] @Loads *pu1_src
+ LDR r8,[sp,#212] @Loads *pu1_src
VSUB.U8 Q7,Q8,Q7 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
ADD r7,r8,r7 @pu1_src[0 * src_strd + 15 + (wd - col)]
@@ -744,7 +754,7 @@ AU1_SRC_LEFT_LOOP_WD_16_HT_4:
PU1_SRC_LOOP_WD_16_HT_4:
ADD r9,r0,r1 @*pu1_src + src_strd
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
VLD1.8 D16,[r9]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
VLD1.8 D17,[r9] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
SUB r9,#8
@@ -766,7 +776,7 @@ NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
CMP r7,r12
BLT SIGN_UP_CHANGE_WD_16_HT_4
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#2] @pu1_avail[2]
CMP r5,#0
BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4
@@ -839,9 +849,9 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4:
VST1.8 {Q14},[r0],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
BNE PU1_SRC_LOOP_WD_16_HT_4 @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
- LDR r8,[sp,#0x118] @Loads ht
- ADD r5,sp,#0x4B @*au1_src_left_tmp
- LDR r11,[sp,#0x104] @Loads *pu1_src_left
+ LDR r8,[sp,#ht_offset] @Loads ht
+ ADD r5,sp,#75 @*au1_src_left_tmp
+ LDR r11,[sp,#216] @Loads *pu1_src_left
SRC_LEFT_LOOP_WD_16_HT_4:
LDR r7,[r5],#4 @au1_src_left_tmp[row]
@@ -851,16 +861,16 @@ SRC_LEFT_LOOP_WD_16_HT_4:
SUBS r6,r6,#16 @Decrement the wd loop count by 16
BLE RE_ASSINING_LOOP @Jump to re-assigning loop
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
LDR r0,[sp,#0x02] @Loads *pu1_src
SUB r7,r7,r6
ADD r0,r0,r7
BGT WD_16_HT_4_LOOP @If not equal jump to width_loop
WIDTH_RESIDUE:
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
CMP r6,r7 @wd_residue == wd
LDRBEQ r8,[r5] @pu1_avail[0]
@@ -878,13 +888,13 @@ WIDTH_RESIDUE:
ADD r10,r10,#2 @pu1_src - src_strd + 2
VMOV.8 d8[6],r11 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
- ADD r5,sp,#0x4B @*au1_src_left_tmp
+ ADD r5,sp,#75 @*au1_src_left_tmp
- LDR r4,[sp,#0x118] @Loads ht
+ LDR r4,[sp,#ht_offset] @Loads ht
VMOV.8 d8[7],r11 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
- LDR r7,[sp,#0x114] @Loads wd
+ LDR r7,[sp,#wd_offset] @Loads wd
- LDR r8,[sp,#0x100] @Loads *pu1_src
+ LDR r8,[sp,#212] @Loads *pu1_src
VLD1.8 D10,[r10]! @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
VLD1.8 D11,[r10] @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
SUB r10,#8
@@ -917,7 +927,7 @@ PU1_SRC_LOOP_RESIDUE:
VLD1.8 D16,[r9]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
VLD1.8 D17,[r9] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
SUB r9,#8
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#3] @pu1_avail[3]
ADD r8,r14,r11,LSL #1 @pu1_src_left_cpy[(ht_tmp - row) * 2]
@@ -940,7 +950,7 @@ NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
VEXT.8 Q9,Q9,Q8,#14 @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
BLT SIGN_UP_CHANGE_RESIDUE
- LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail
LDRB r5,[r5,#2] @pu1_avail[2]
CMP r5,#0
BNE SIGN_UP_CHANGE_DONE_RESIDUE
@@ -1007,10 +1017,10 @@ SIGN_UP_CHANGE_DONE_RESIDUE:
BNE PU1_SRC_LOOP_RESIDUE @If not equal jump to PU1_SRC_LOOP
- LDR r8,[sp,#0x118] @Loads ht
- ADD r5,sp,#0x4B @*au1_src_left_tmp
+ LDR r8,[sp,#ht_offset] @Loads ht
+ ADD r5,sp,#75 @*au1_src_left_tmp
- LDR r11,[sp,#0x104] @Loads *pu1_src_left
+ LDR r11,[sp,#216] @Loads *pu1_src_left
SRC_LEFT_LOOP_RESIDUE:
LDR r7,[r5],#4 @au1_src_left_tmp[row]
@@ -1020,10 +1030,10 @@ SRC_LEFT_LOOP_RESIDUE:
RE_ASSINING_LOOP:
- LDR r7,[sp,#0x114] @Loads wd
- LDR r8,[sp,#0x118] @Loads ht
+ LDR r7,[sp,#wd_offset] @Loads wd
+ LDR r8,[sp,#ht_offset] @Loads ht
- LDR r0,[sp,#0x100] @Loads *pu1_src
+ LDR r0,[sp,#212] @Loads *pu1_src
SUB r10,r7,#2 @wd - 2
LDRH r9,[sp,#6]
@@ -1032,7 +1042,7 @@ RE_ASSINING_LOOP:
STRH r9,[r0,r10] @pu1_src_org[0] = u1_pos_0_0_tmp
MLA r6,r8,r1,r0 @pu1_src[(ht - 1) * src_strd]
- LDR r4,[sp,#0xFC] @Loads pu1_src_top_left
+ LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
LDRH r9,[sp,#8]
ADD r12,sp,#10
@@ -1041,7 +1051,7 @@ RE_ASSINING_LOOP:
LDRH r10,[sp] @load u1_src_top_left_tmp from stack pointer
STRH r10,[r4] @*pu1_src_top_left = u1_src_top_left_tmp
- LDR r3,[sp,#0x10C] @Loads pu1_src_top
+ LDR r3,[sp,#220] @Loads pu1_src_top
SRC_TOP_LOOP:
VLD1.8 D0,[r12]! @pu1_src_top[col] = au1_src_top_tmp[col]
@@ -1050,7 +1060,8 @@ SRC_TOP_LOOP:
BNE SRC_TOP_LOOP
END_LOOPS:
- ADD sp,sp,#0xD4
+ ADD sp,sp,#224
+ vpop {d8 - d15}
LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
diff --git a/common/arm/ihevc_weighted_pred_bi.s b/common/arm/ihevc_weighted_pred_bi.s
index 5308423..8845b8b 100644
--- a/common/arm/ihevc_weighted_pred_bi.s
+++ b/common/arm/ihevc_weighted_pred_bi.s
@@ -134,6 +134,18 @@
@ r14 => ht
@ r7 => wd
+.equ src_strd2_offset, 104
+.equ dst_strd_offset, 108
+.equ wgt0_offset, 112
+.equ off0_offset, 116
+.equ wgt1_offset, 120
+.equ off1_offset, 124
+.equ shift_offset, 128
+.equ lvl_shift1_offset, 132
+.equ lvl_shift2_offset, 136
+.equ ht_offset, 140
+.equ wd_offset, 144
+
.text
.align 4
@@ -147,32 +159,33 @@
ihevc_weighted_pred_bi_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r6,[sp,#48] @load wgt0
- ldr r11,[sp,#68] @load lvl_shift1
- ldr r12,[sp,#72] @load lvl_shift2
+ ldr r6,[sp,#wgt0_offset] @load wgt0
+ ldr r11,[sp,#lvl_shift1_offset] @load lvl_shift1
+ ldr r12,[sp,#lvl_shift2_offset] @load lvl_shift2
vmov.s16 d7[0],r6 @moved for scalar multiplication
mul r4,r11,r6 @lvl_shift1 * wgt0
- ldr r8,[sp,#56] @load wgt1
- ldr r7,[sp,#52] @load off0
+ ldr r8,[sp,#wgt1_offset] @load wgt1
+ ldr r7,[sp,#off0_offset] @load off0
vmov.s16 d7[1],r8 @moved for scalar multiplication
mla r4,r12,r8,r4 @(lvl_shift1 * wgt0) + (lvl_shift2 * wgt1)
- ldr r9,[sp,#60] @load off1
+ ldr r9,[sp,#off1_offset] @load off1
add r5,r7,r9 @off0 + off1
- ldr r10,[sp,#64] @load shift
+ ldr r10,[sp,#shift_offset] @load shift
add r5,r5,#1 @off0 + off1 + 1
sub r14,r10,#1 @shift - 1
- ldr r7,[sp,#80] @load wd
+ ldr r7,[sp,#wd_offset] @load wd
lsl r5,r5,r14 @((off0 + off1 + 1) << (shift - 1))
vdup.u32 q14,r10 @vmovq_n_s32(0-shift)
add r4,r4,r5 @tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1))
vdup.u32 q15,r4 @vmovq_n_s32(tmp_lvl_shift)
vneg.s32 q14,q14
- ldr r4,[sp,#40] @load src_strd2
+ ldr r4,[sp,#src_strd2_offset] @load src_strd2
lsl r9,r7,#1
- ldr r5,[sp,#44] @load dst_strd
+ ldr r5,[sp,#dst_strd_offset] @load dst_strd
lsl r3,r3,#1
- ldr r14,[sp,#76] @load ht
+ ldr r14,[sp,#ht_offset] @load ht
lsl r4,r4,#1
cmp r14,#0 @check ht == 0
@@ -260,6 +273,7 @@ end_core_loop:
bgt core_loop @if ht is greater than 0 goto outer_loop
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_weighted_pred_bi_default.s b/common/arm/ihevc_weighted_pred_bi_default.s
index 6bdb8cc..5b369be 100644
--- a/common/arm/ihevc_weighted_pred_bi_default.s
+++ b/common/arm/ihevc_weighted_pred_bi_default.s
@@ -107,6 +107,14 @@
@ r7 => lvl_shift2
@ r8 => ht
@ r9 => wd
+
+.equ src_strd2_offset, 104
+.equ dst_strd_offset, 108
+.equ lvl_shift1_offset, 112
+.equ lvl_shift2_offset, 116
+.equ ht_offset, 120
+.equ wd_offset, 124
+
.text
.syntax unified
.align 4
@@ -121,14 +129,15 @@
ihevc_weighted_pred_bi_default_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
- ldr r4,[sp,#40] @load src_strd2
+ vpush {d8 - d15}
+ ldr r4,[sp,#src_strd2_offset] @load src_strd2
lsl r3,r3,#1
- ldr r5,[sp,#44] @load dst_strd
- ldr r6,[sp,#48] @load lvl_shift1
+ ldr r5,[sp,#dst_strd_offset] @load dst_strd
+ ldr r6,[sp,#lvl_shift1_offset] @load lvl_shift1
lsl r4,r4,#1
- ldr r7,[sp,#52] @load lvl_shift2
- ldr r8,[sp,#56] @load ht
- ldr r9,[sp,#60] @load wd
+ ldr r7,[sp,#lvl_shift2_offset] @load lvl_shift2
+ ldr r8,[sp,#ht_offset] @load ht
+ ldr r9,[sp,#wd_offset] @load wd
vdup.16 q2,r6 @lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1)
vdup.16 q3,r7 @lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2)
vmov.i16 q0,#0x40 @tmp_lvl_shift = 1 << (shift - 1)
@@ -488,6 +497,7 @@ end_core_loop_16:
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
diff --git a/common/arm/ihevc_weighted_pred_uni.s b/common/arm/ihevc_weighted_pred_uni.s
index e9b69c1..1f99ff8 100644
--- a/common/arm/ihevc_weighted_pred_uni.s
+++ b/common/arm/ihevc_weighted_pred_uni.s
@@ -112,6 +112,13 @@
@ r8 => ht
@ r9 => wd
+.equ wgt0_offset, 104
+.equ off0_offset, 108
+.equ shift_offset, 112
+.equ lvl_shift_offset, 116
+.equ ht_offset, 120
+.equ wd_offset, 124
+
.text
.align 4
@@ -125,16 +132,17 @@
ihevc_weighted_pred_uni_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vpush {d8 - d15}
- ldr r4,[sp,#40] @load wgt0
- ldr r7,[sp,#52] @load lvl_shift
+ ldr r4,[sp,#wgt0_offset] @load wgt0
+ ldr r7,[sp,#lvl_shift_offset] @load lvl_shift
mov r11,#1
- ldr r5,[sp,#44] @load off0
+ ldr r5,[sp,#off0_offset] @load off0
mul r10,r7,r4 @lvl_shift * wgt0
- ldr r6,[sp,#48] @load shift
- ldr r8,[sp,#56] @load ht
+ ldr r6,[sp,#shift_offset] @load shift
+ ldr r8,[sp,#ht_offset] @load ht
add r10,r10,r5,lsl r6 @lvl_shift * wgt0 + (off0 << shift)
- ldr r9,[sp,#60] @load wt
+ ldr r9,[sp,#wd_offset] @load wt
sub r12,r6,#1
vmov.s16 d0[0],r4 @moved for scalar multiplication
lsl r2,r2,#1
@@ -214,6 +222,7 @@ end_core_loop:
bgt core_loop @if ht is greater than 0 goto outer_loop
end_loops:
+ vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp