-rw-r--r--  common/arm/ihevc_inter_pred_filters_luma_vert.s | 6
-rw-r--r--  common/arm/ihevc_inter_pred_luma_horz_w16out.s | 6
-rw-r--r--  common/arm/ihevc_sao_edge_offset_class2.s | 7
-rw-r--r--  common/arm/ihevc_sao_edge_offset_class2_chroma.s | 9
-rw-r--r--  common/arm/ihevc_sao_edge_offset_class3.s | 11
-rw-r--r--  common/arm/ihevc_sao_edge_offset_class3_chroma.s | 7
-rw-r--r--  common/arm/ihevc_weighted_pred_bi_default.s | 3
-rw-r--r--  common/arm64/ihevc_deblk_luma_horz.s | 12
-rw-r--r--  common/arm64/ihevc_deblk_luma_vert.s | 16
-rw-r--r--  common/arm64/ihevc_inter_pred_chroma_horz.s | 8
-rw-r--r--  common/arm64/ihevc_inter_pred_chroma_horz_w16out.s | 8
-rw-r--r--  common/arm64/ihevc_inter_pred_chroma_vert.s | 14
-rw-r--r--  common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s | 8
-rw-r--r--  common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s | 8
-rw-r--r--  common/arm64/ihevc_inter_pred_chroma_vert_w16out.s | 14
-rw-r--r--  common/arm64/ihevc_inter_pred_filters_luma_horz.s | 16
-rw-r--r--  common/arm64/ihevc_inter_pred_filters_luma_vert.s | 34
-rw-r--r--  common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s | 16
-rw-r--r--  common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s | 34
-rw-r--r--  common/arm64/ihevc_inter_pred_luma_horz_w16out.s | 16
-rw-r--r--  common/arm64/ihevc_intra_pred_chroma_dc.s | 10
-rw-r--r--  common/arm64/ihevc_intra_pred_chroma_horz.s | 74
-rw-r--r--  common/arm64/ihevc_intra_pred_chroma_mode2.s | 4
-rw-r--r--  common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s | 42
-rw-r--r--  common/arm64/ihevc_intra_pred_chroma_ver.s | 2
-rw-r--r--  common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s | 2
-rw-r--r--  common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s | 44
-rw-r--r--  common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s | 2
-rw-r--r--  common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s | 44
-rw-r--r--  common/arm64/ihevc_intra_pred_luma_dc.s | 12
-rw-r--r--  common/arm64/ihevc_intra_pred_luma_horz.s | 88
-rw-r--r--  common/arm64/ihevc_intra_pred_luma_mode2.s | 6
-rw-r--r--  common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s | 42
-rw-r--r--  common/arm64/ihevc_intra_pred_luma_planar.s | 34
-rw-r--r--  common/arm64/ihevc_intra_pred_luma_vert.s | 6
-rw-r--r--  common/arm64/ihevc_itrans_recon_16x16.s | 512
-rw-r--r--  common/arm64/ihevc_itrans_recon_32x32.s | 2048
-rw-r--r--  common/arm64/ihevc_itrans_recon_4x4.s | 16
-rw-r--r--  common/arm64/ihevc_itrans_recon_4x4_ttype1.s | 72
-rw-r--r--  common/arm64/ihevc_itrans_recon_8x8.s | 242
-rw-r--r--  common/arm64/ihevc_mem_fns.s | 6
-rw-r--r--  common/arm64/ihevc_sao_band_offset_chroma.s | 16
-rw-r--r--  common/arm64/ihevc_sao_band_offset_luma.s | 8
-rw-r--r--  common/arm64/ihevc_sao_edge_offset_class0.s | 22
-rw-r--r--  common/arm64/ihevc_sao_edge_offset_class0_chroma.s | 44
-rw-r--r--  common/arm64/ihevc_sao_edge_offset_class2.s | 36
-rw-r--r--  common/arm64/ihevc_sao_edge_offset_class2_chroma.s | 62
-rw-r--r--  common/arm64/ihevc_sao_edge_offset_class3.s | 36
-rw-r--r--  common/arm64/ihevc_sao_edge_offset_class3_chroma.s | 62
-rw-r--r--  common/arm64/ihevc_weighted_pred_bi.s | 16
-rw-r--r--  common/arm64/ihevc_weighted_pred_uni.s | 10
-rw-r--r--  decoder.arm64.mk | 7
-rw-r--r--  decoder.mips64.mk | 8
-rw-r--r--  decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s | 40
54 files changed, 1970 insertions, 1958 deletions
diff --git a/common/arm/ihevc_inter_pred_filters_luma_vert.s b/common/arm/ihevc_inter_pred_filters_luma_vert.s
index 04942ae..f51d68c 100644
--- a/common/arm/ihevc_inter_pred_filters_luma_vert.s
+++ b/common/arm/ihevc_inter_pred_filters_luma_vert.s
@@ -105,7 +105,7 @@
@ r3 => wd
.text
.align 4
-
+.syntax unified
@@ -407,7 +407,7 @@ end_loops:
ldr r1, [sp], #4
ldr r0, [sp], #4
- ldmeqfd sp!,{r4-r12,r15} @reload the registers from sp
+ ldmfdeq sp!,{r4-r12,r15} @reload the registers from sp
mov r5, #4
add r0, r0, #8
add r1, r1, #8
@@ -848,7 +848,7 @@ end_loops_16out:
ldr r1, [sp], #4
ldr r0, [sp], #4
- ldmeqfd sp!,{r4-r12,r15} @reload the registers from sp
+ ldmfdeq sp!,{r4-r12,r15} @reload the registers from sp
mov r5, #4
add r0, r0, #8
add r1, r1, #16
diff --git a/common/arm/ihevc_inter_pred_luma_horz_w16out.s b/common/arm/ihevc_inter_pred_luma_horz_w16out.s
index b27b2e8..e8800e0 100644
--- a/common/arm/ihevc_inter_pred_luma_horz_w16out.s
+++ b/common/arm/ihevc_inter_pred_luma_horz_w16out.s
@@ -109,7 +109,7 @@
@r14 - loop_counter
.text
.align 4
-
+.syntax unified
@@ -277,8 +277,8 @@ height_residue_4:
ldr r7,[sp,#44] @loads ht
and r7,r7,#1 @calculating ht_residue ht_residue = (ht & 1)
cmp r7,#0
- @beq end_loops
- ldmeqfd sp!,{r4-r12,r15} @reload the registers from sp
+ @beq end_loops
+ ldmfdeq sp!,{r4-r12,r15} @reload the registers from sp
outer_loop_height_residue_4:
diff --git a/common/arm/ihevc_sao_edge_offset_class2.s b/common/arm/ihevc_sao_edge_offset_class2.s
index 33b4961..536f941 100644
--- a/common/arm/ihevc_sao_edge_offset_class2.s
+++ b/common/arm/ihevc_sao_edge_offset_class2.s
@@ -59,6 +59,7 @@
@r8=> ht
.text
+.syntax unified
.p2align 2
.extern gi1_table_edge_idx
@@ -214,7 +215,7 @@ WIDTH_LOOP_16:
LDR r5,[sp,#0xC8] @Loads pu1_avail
CMP r6,r7 @col == wd
- LDREQB r8,[r5] @pu1_avail[0]
+ LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
@@ -526,7 +527,7 @@ WD_16_HT_4_LOOP:
LDR r7,[sp,#0xD0] @Loads wd
LDR r5,[sp,#0xC8] @Loads pu1_avail
CMP r6,r7 @col == wd
- LDREQB r8,[r5] @pu1_avail[0]
+ LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
@@ -658,7 +659,7 @@ WIDTH_RESIDUE:
LDR r7,[sp,#0xD0] @Loads wd
LDR r5,[sp,#0xC8] @Loads pu1_avail
CMP r6,r7 @wd_residue == wd
- LDREQB r8,[r5] @pu1_avail[0]
+ LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1
VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
diff --git a/common/arm/ihevc_sao_edge_offset_class2_chroma.s b/common/arm/ihevc_sao_edge_offset_class2_chroma.s
index c6fb391..b74a8f6 100644
--- a/common/arm/ihevc_sao_edge_offset_class2_chroma.s
+++ b/common/arm/ihevc_sao_edge_offset_class2_chroma.s
@@ -61,6 +61,7 @@
@r8=> ht
.text
+.syntax unified
.p2align 2
.extern gi1_table_edge_idx
@@ -289,7 +290,7 @@ ulbl5:
add r2,r2,pc
MOV r6,r7 @move wd to r6 loop_count
- VMOV.S8 Q4,#0XFF @au1_mask = vdupq_n_s8(-1)
+ VMOV.S8 Q4,#0xFF @au1_mask = vdupq_n_s8(-1)
CMP r7,#16 @Compare wd with 16
BLT WIDTH_RESIDUE @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
@@ -300,7 +301,7 @@ WIDTH_LOOP_16:
LDR r5,[sp,#0x108] @Loads pu1_avail
LDR r7,[sp,#0x114] @Loads wd
CMP r6,r7 @col == wd
- LDREQB r8,[r5] @pu1_avail[0]
+ LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1
VMOV.8 D8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
@@ -684,7 +685,7 @@ WD_16_HT_4_LOOP:
LDR r5,[sp,#0x108] @Loads pu1_avail
LDR r7,[sp,#0x114] @Loads wd
CMP r6,r7 @col == wd
- LDREQB r8,[r5] @pu1_avail[0]
+ LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1
VMOV.8 D8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
@@ -835,7 +836,7 @@ WIDTH_RESIDUE:
LDR r7,[sp,#0x114] @Loads wd
LDR r5,[sp,#0x108] @Loads pu1_avail
CMP r6,r7 @wd_residue == wd
- LDREQB r8,[r5] @pu1_avail[0]
+ LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1
VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
diff --git a/common/arm/ihevc_sao_edge_offset_class3.s b/common/arm/ihevc_sao_edge_offset_class3.s
index 268d4d8..de09d6c 100644
--- a/common/arm/ihevc_sao_edge_offset_class3.s
+++ b/common/arm/ihevc_sao_edge_offset_class3.s
@@ -59,6 +59,7 @@
@r8=> ht
.text
+.syntax unified
.p2align 2
.extern gi1_table_edge_idx
@@ -224,7 +225,7 @@ WIDTH_LOOP_16:
LDR r5,[sp,#0xC8] @Loads pu1_avail
CMP r6,r7 @col == wd
- LDREQB r8,[r5] @pu1_avail[0]
+ LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1
VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
@@ -557,7 +558,7 @@ WD_16_HT_4_LOOP:
LDR r5,[sp,#0xC8] @Loads pu1_avail
LDR r7,[sp,#0xD0] @Loads wd
CMP r6,r7 @col == wd
- LDREQB r8,[r5] @pu1_avail[0]
+ LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1
VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
@@ -613,7 +614,7 @@ PU1_SRC_LOOP_WD_16_HT_4:
CMP r5,#0
BEQ NEXT_ROW_ELSE_WD_16_HT_4
CMP r7,#1
- LDREQB r8,[r8,#-1] @pu1_src_cpy[src_strd - 1]
+ LDRBEQ r8,[r8,#-1] @pu1_src_cpy[src_strd - 1]
BEQ NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
NEXT_ROW_ELSE_WD_16_HT_4:
SUB r5,r12,r7 @ht_tmp - row
@@ -697,7 +698,7 @@ WIDTH_RESIDUE:
LDR r7,[sp,#0xD0] @Loads wd
LDR r5,[sp,#0xC8] @Loads pu1_avail
CMP r6,r7 @wd_residue == wd
- LDREQB r8,[r5] @pu1_avail[0]
+ LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1
VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
@@ -751,7 +752,7 @@ PU1_SRC_LOOP_RESIDUE:
CMP r5,#0
BEQ NEXT_ROW_ELSE_RESIDUE
CMP r7,#1
- LDREQB r8,[r8,#-1] @pu1_src_cpy[src_strd - 1]
+ LDRBEQ r8,[r8,#-1] @pu1_src_cpy[src_strd - 1]
BEQ NEXT_ROW_POINTER_ASSIGNED_RESIDUE
NEXT_ROW_ELSE_RESIDUE:
SUB r5,r12,r7 @ht_tmp - row
diff --git a/common/arm/ihevc_sao_edge_offset_class3_chroma.s b/common/arm/ihevc_sao_edge_offset_class3_chroma.s
index 2ecabe9..6561a8a 100644
--- a/common/arm/ihevc_sao_edge_offset_class3_chroma.s
+++ b/common/arm/ihevc_sao_edge_offset_class3_chroma.s
@@ -61,6 +61,7 @@
@r8=> ht
.text
+.syntax unified
.p2align 2
.extern gi1_table_edge_idx
@@ -294,7 +295,7 @@ WIDTH_LOOP_16:
CMP r6,r7 @col == wd
LDR r5,[sp,#0x108] @Loads pu1_avail
- LDREQB r8,[r5] @pu1_avail[0]
+ LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1
VMOV.8 D8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
@@ -688,7 +689,7 @@ WD_16_HT_4_LOOP:
LDR r5,[sp,#0x108] @Loads pu1_avail
CMP r6,r7 @col == wd
- LDREQB r8,[r5] @pu1_avail[0]
+ LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1
VMOV.8 D8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
@@ -858,7 +859,7 @@ WIDTH_RESIDUE:
LDR r5,[sp,#0x108] @Loads pu1_avail
CMP r6,r7 @wd_residue == wd
- LDREQB r8,[r5] @pu1_avail[0]
+ LDRBEQ r8,[r5] @pu1_avail[0]
MOVNE r8,#-1
LDRB r11,[r5,#1] @pu1_avail[1]
diff --git a/common/arm/ihevc_weighted_pred_bi_default.s b/common/arm/ihevc_weighted_pred_bi_default.s
index b560c15..6bdb8cc 100644
--- a/common/arm/ihevc_weighted_pred_bi_default.s
+++ b/common/arm/ihevc_weighted_pred_bi_default.s
@@ -108,6 +108,7 @@
@ r8 => ht
@ r9 => wd
.text
+.syntax unified
.align 4
@@ -437,7 +438,7 @@ core_loop_16:
vqadd.s16 q13,q7,q8
addeq r1,r1,r7
- subeqs r8,r8,#2 @decrement the ht by 2
+ subseq r8,r8,#2 @decrement the ht by 2
beq epilog_16
diff --git a/common/arm64/ihevc_deblk_luma_horz.s b/common/arm64/ihevc_deblk_luma_horz.s
index f6989e9..db9e347 100644
--- a/common/arm64/ihevc_deblk_luma_horz.s
+++ b/common/arm64/ihevc_deblk_luma_horz.s
@@ -217,7 +217,7 @@ l1.1564:
ldrb w3,[x0,#0] // x4 has the 0 value
uqadd v16.8b, v27.8b , v1.8b
and x2,x2,#0xff
- mul v12.8h, v7.8h, v0.4h[0]
+ mul v12.8h, v7.8h, v0.h[0]
ldr w8, [x0,x10] // has the 3 value
uaddl v10.8h, v24.8b , v28.8b
subs x2,x2,x7
@@ -259,7 +259,7 @@ l1.1564:
ble l1.1840
add x10,x1,x1,lsl #1
- mul v16.8h, v16.8h, v0.4h[0]
+ mul v16.8h, v16.8h, v0.h[0]
add x4,x0,#3
@@ -292,7 +292,7 @@ l1.1564:
cmp x8,x5,asr #3
uqsub v31.8b, v25.8b , v1.8b
bge l1.1840
- mul v12.8h, v7.8h, v0.4h[0]
+ mul v12.8h, v7.8h, v0.h[0]
subs x7,x3,x7
uqadd v16.8b, v24.8b , v1.8b
csneg x7,x7,x7,pl
@@ -413,7 +413,7 @@ strong_filtering_q:
strong_filtering_p:
umax v5.8b, v18.8b , v17.8b
mov x12,x0
- mul v7.8h, v7.8h, v0.4h[0]
+ mul v7.8h, v7.8h, v0.h[0]
sub x20,x1,#0
neg x11, x20
add v16.8h, v7.8h , v14.8h
@@ -465,12 +465,12 @@ l1.2408:
usubl v10.8h, v26.8b , v25.8b
- mul v10.8h, v10.8h, v0.4h[0]
+ mul v10.8h, v10.8h, v0.h[0]
movi v0.4h, #0x3
usubl v12.8h, v27.8b , v24.8b
- mul v12.8h, v12.8h, v0.4h[0]
+ mul v12.8h, v12.8h, v0.h[0]
dup v30.8b,w6 // duplicating the +tc value
diff --git a/common/arm64/ihevc_deblk_luma_vert.s b/common/arm64/ihevc_deblk_luma_vert.s
index bc3cc6c..4379a69 100644
--- a/common/arm64/ihevc_deblk_luma_vert.s
+++ b/common/arm64/ihevc_deblk_luma_vert.s
@@ -146,17 +146,17 @@ l1.88:
add x14,x0,x14
sub x19,x14,#3
- dup v4.2s, v24.2s[1]
+ dup v4.2s, v24.s[1]
ldrb w2,[x19] // -2 value
- dup v7.2s, v2.2s[1]
+ dup v7.2s, v2.s[1]
ldrb w10,[x19,#1] // -2 value
- dup v3.2s, v2.2s[0]
+ dup v3.2s, v2.s[0]
ldrb w11,[x19,#2] // -1 value
- dup v5.2s, v1.2s[1]
+ dup v5.2s, v1.s[1]
ldrb w12,[x14,#0] // 0 value
- dup v6.2s, v1.2s[0]
+ dup v6.2s, v1.s[0]
ldrb w3,[x14,#1] // 1 value
- dup v2.2s, v0.2s[0]
+ dup v2.2s, v0.s[0]
ldrb w4,[x14,#2] // 2 value
@@ -191,7 +191,7 @@ l1.88:
cmp x11,x5
- dup v22.2s, v0.2s[1]
+ dup v22.2s, v0.s[1]
bge l1.964
// if(d < beta)
@@ -415,7 +415,7 @@ l1.780:
// x4 has the flag p
- dup v7.2s, v24.2s[0]
+ dup v7.2s, v24.s[0]
sub x3,x0,#1
uaddw v16.8h, v0.8h , v6.8b
add x7,x3,x1
diff --git a/common/arm64/ihevc_inter_pred_chroma_horz.s b/common/arm64/ihevc_inter_pred_chroma_horz.s
index 513a362..425ac41 100644
--- a/common/arm64/ihevc_inter_pred_chroma_horz.s
+++ b/common/arm64/ihevc_inter_pred_chroma_horz.s
@@ -128,16 +128,16 @@ ihevc_inter_pred_chroma_horz_av8:
mov x11,#2
ble end_loops
- dup v24.8b, v2.8b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
+ dup v24.8b, v2.b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
sub x12,x0,#2 //pu1_src - 2
- dup v25.8b, v2.8b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
+ dup v25.8b, v2.b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
add x4,x12,x2 //pu1_src_tmp2_8 = pu1_src + src_strd
- dup v26.8b, v2.8b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
+ dup v26.8b, v2.b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
tst x10,#3 //checks wd for multiples
lsl x5, x10, #1
- dup v27.8b, v2.8b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
+ dup v27.8b, v2.b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
bne outer_loop_4
cmp x10,#12
diff --git a/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s b/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s
index efc09f9..0f53c08 100644
--- a/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s
+++ b/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s
@@ -128,16 +128,16 @@ ihevc_inter_pred_chroma_horz_w16out_av8:
ble end_loops
- dup v24.8b, v2.8b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
+ dup v24.8b, v2.b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
sub x12,x0,#2 //pu1_src - 2
- dup v25.8b, v2.8b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
+ dup v25.8b, v2.b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
add x4,x12,x2 //pu1_src_tmp2_8 = pu1_src + src_strd
- dup v26.8b, v2.8b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
+ dup v26.8b, v2.b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
tst x10,#3 //checks wd for multiples of 4
lsl x5, x10, #1 //2wd
- dup v27.8b, v2.8b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
+ dup v27.8b, v2.b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
and x7,x14,#1 //added //calculating ht_residue ht_residue = (ht & 1)
sub x14,x14,x7 //added //decrement height by ht_residue(residue value is calculated outside)
diff --git a/common/arm64/ihevc_inter_pred_chroma_vert.s b/common/arm64/ihevc_inter_pred_chroma_vert.s
index 3d61f6c..dd1fba4 100644
--- a/common/arm64/ihevc_inter_pred_chroma_vert.s
+++ b/common/arm64/ihevc_inter_pred_chroma_vert.s
@@ -123,10 +123,10 @@ ihevc_inter_pred_chroma_vert_av8:
tst x6,#3 //checks (wd & 3)
abs v3.8b, v0.8b //vabs_s8(coeff)
lsl x10,x6,#1 //2*wd
- dup v0.8b, v3.8b[0] //coeffabs_0
- dup v1.8b, v3.8b[1] //coeffabs_1
- dup v2.8b, v3.8b[2] //coeffabs_2
- dup v3.8b, v3.8b[3] //coeffabs_3
+ dup v0.8b, v3.b[0] //coeffabs_0
+ dup v1.8b, v3.b[1] //coeffabs_1
+ dup v2.8b, v3.b[2] //coeffabs_2
+ dup v3.8b, v3.b[3] //coeffabs_3
bgt outer_loop_wd_2 //jumps to loop handling wd ==2
@@ -188,14 +188,14 @@ inner_loop_wd_2:
subs x12,x12,#4 //2wd - 4
add x0,x0,#4 //pu1_src + 4
ld1 {v6.s}[1],[x6],x2 //loads pu1_src_tmp
- dup v7.2s, v6.2s[1]
+ dup v7.2s, v6.s[1]
ld1 {v7.s}[1],[x6],x2 //loads pu1_src_tmp
umull v4.8h, v7.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
- dup v7.2s, v7.2s[1]
+ dup v7.2s, v7.s[1]
ld1 {v7.s}[1],[x6],x2
umlsl v4.8h, v6.8b, v0.8b
umlal v4.8h, v7.8b, v2.8b
- dup v7.2s, v7.2s[1]
+ dup v7.2s, v7.s[1]
ld1 {v7.s}[1],[x6]
add x6,x1,x3 //pu1_dst + dst_strd
umlsl v4.8h, v7.8b, v3.8b
diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s
index e8f17cc..e6cc617 100644
--- a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s
+++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s
@@ -120,10 +120,10 @@ ihevc_inter_pred_chroma_vert_w16inp_av8:
sxtl v0.8h, v0.8b //long the value
tst x6,#3 //checks wd == 2
- dup v16.4h, v0.4h[0] //coeff_0
- dup v17.4h, v0.4h[1] //coeff_1
- dup v18.4h, v0.4h[2] //coeff_2
- dup v19.4h, v0.4h[3] //coeff_3
+ dup v16.4h, v0.h[0] //coeff_0
+ dup v17.4h, v0.h[1] //coeff_1
+ dup v18.4h, v0.h[2] //coeff_2
+ dup v19.4h, v0.h[3] //coeff_3
bgt core_loop_ht_2 //jumps to loop handles wd 2
diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
index 5aaabe6..022f166 100644
--- a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
+++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
@@ -120,10 +120,10 @@ ihevc_inter_pred_chroma_vert_w16inp_w16out_av8:
sxtl v0.8h, v0.8b //long the value
tst x6,#3 //checks wd == 2
- dup v16.4h, v0.4h[0] //coeff_0
- dup v17.4h, v0.4h[1] //coeff_1
- dup v18.4h, v0.4h[2] //coeff_2
- dup v19.4h, v0.4h[3] //coeff_3
+ dup v16.4h, v0.h[0] //coeff_0
+ dup v17.4h, v0.h[1] //coeff_1
+ dup v18.4h, v0.h[2] //coeff_2
+ dup v19.4h, v0.h[3] //coeff_3
bgt core_loop_ht_2 //jumps to loop handles wd 2
diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s
index ec946eb..352214b 100644
--- a/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s
+++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s
@@ -125,10 +125,10 @@ ihevc_inter_pred_chroma_vert_w16out_av8:
tst x6,#3 //checks (wd & 3)
abs v3.8b, v0.8b //vabs_s8(coeff)
lsl x10,x6,#1 //2*wd
- dup v0.8b, v3.8b[0] //coeffabs_0
- dup v1.8b, v3.8b[1] //coeffabs_1
- dup v2.8b, v3.8b[2] //coeffabs_2
- dup v3.8b, v3.8b[3] //coeffabs_3
+ dup v0.8b, v3.b[0] //coeffabs_0
+ dup v1.8b, v3.b[1] //coeffabs_1
+ dup v2.8b, v3.b[2] //coeffabs_2
+ dup v3.8b, v3.b[3] //coeffabs_3
bgt outer_loop_wd_2 //jumps to loop handling wd ==2
@@ -189,14 +189,14 @@ inner_loop_wd_2:
subs x12,x12,#4 //2wd - 4
add x0,x0,#4 //pu1_src + 4
ld1 {v6.s}[1],[x6],x2 //loads pu1_src_tmp
- dup v7.2s, v6.2s[1]
+ dup v7.2s, v6.s[1]
ld1 {v7.s}[1],[x6],x2 //loads pu1_src_tmp
umull v4.8h, v7.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
- dup v7.2s, v7.2s[1]
+ dup v7.2s, v7.s[1]
ld1 {v7.s}[1],[x6],x2
umlsl v4.8h, v6.8b, v0.8b
umlal v4.8h, v7.8b, v2.8b
- dup v7.2s, v7.2s[1]
+ dup v7.2s, v7.s[1]
ld1 {v7.s}[1],[x6]
add x6,x1,x3,lsl #1 //pu1_dst + dst_strd
umlsl v4.8h, v7.8b, v3.8b
diff --git a/common/arm64/ihevc_inter_pred_filters_luma_horz.s b/common/arm64/ihevc_inter_pred_filters_luma_horz.s
index 1e246da..d4830d6 100644
--- a/common/arm64/ihevc_inter_pred_filters_luma_horz.s
+++ b/common/arm64/ihevc_inter_pred_filters_luma_horz.s
@@ -141,22 +141,22 @@ start_loop_count:
//ble end_loops
- dup v24.8b, v2.8b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
+ dup v24.8b, v2.b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
sub x12,x0,#3 //pu1_src - 3
- dup v25.8b, v2.8b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
+ dup v25.8b, v2.b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
add x4,x12,x2 //pu1_src_tmp2_8 = pu1_src + src_strd
- dup v26.8b, v2.8b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
+ dup v26.8b, v2.b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
sub x20,x10,x2,lsl #1 //2*src_strd - wd
neg x9, x20
- dup v27.8b, v2.8b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
+ dup v27.8b, v2.b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
sub x20,x10,x3,lsl #1 //2*dst_strd - wd
neg x8, x20
- dup v28.8b, v2.8b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)
+ dup v28.8b, v2.b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)
- dup v29.8b, v2.8b[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)
+ dup v29.8b, v2.b[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)
// tst x10,#7 //checks wd for multiples
- dup v30.8b, v2.8b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)
- dup v31.8b, v2.8b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)
+ dup v30.8b, v2.b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)
+ dup v31.8b, v2.b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)
mov x7,x1
diff --git a/common/arm64/ihevc_inter_pred_filters_luma_vert.s b/common/arm64/ihevc_inter_pred_filters_luma_vert.s
index bd8b3c4..f8b8031 100644
--- a/common/arm64/ihevc_inter_pred_filters_luma_vert.s
+++ b/common/arm64/ihevc_inter_pred_filters_luma_vert.s
@@ -132,15 +132,15 @@ ihevc_inter_pred_luma_vert_av8:
mov x3,x16 //load ht
subs x7,x3,#0 //x3->ht
//ble end_loops //end loop jump
- dup v22.8b, v0.8b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)//
+ dup v22.8b, v0.b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)//
cmp x5,#8
- dup v23.8b, v0.8b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)//
- dup v24.8b, v0.8b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)//
- dup v25.8b, v0.8b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)//
- dup v26.8b, v0.8b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)//
- dup v27.8b, v0.8b[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)//
- dup v28.8b, v0.8b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)//
- dup v29.8b, v0.8b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)//
+ dup v23.8b, v0.b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)//
+ dup v24.8b, v0.b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)//
+ dup v25.8b, v0.b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)//
+ dup v26.8b, v0.b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)//
+ dup v27.8b, v0.b[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)//
+ dup v28.8b, v0.b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)//
+ dup v29.8b, v0.b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)//
blt core_loop_wd_4 //core loop wd 4 jump
stp x0,x1, [sp, #-16]!
@@ -451,49 +451,49 @@ inner_loop_wd_4:
add x3,x0,x2
ld1 {v4.s}[1],[x3],x2 //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)//
subs x12,x12,#4
- dup v5.2s, v4.2s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)//
+ dup v5.2s, v4.s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)//
ld1 {v5.s}[1],[x3],x2 //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)//
ld1 {v4.s}[0],[x0] //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)//
umull v0.8h, v5.8b, v23.8b //mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)//
- dup v6.2s, v5.2s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)//
+ dup v6.2s, v5.s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)//
add x0,x0,#4
ld1 {v6.s}[1],[x3],x2 //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)//
umlsl v0.8h, v4.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)//
- dup v7.2s, v6.2s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)//
+ dup v7.2s, v6.s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)//
ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)//
umlsl v0.8h, v6.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)//
umull v19.8h, v7.8b, v23.8b
- dup v4.2s, v7.2s[1] //src_tmp1 = vdup_lane_u32(src_tmp4, 1)//
+ dup v4.2s, v7.s[1] //src_tmp1 = vdup_lane_u32(src_tmp4, 1)//
umull v2.8h, v7.8b, v25.8b //mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)//
ld1 {v4.s}[1],[x3],x2 //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)//
umlsl v19.8h, v6.8b, v22.8b
umlal v0.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)//
- dup v5.2s, v4.2s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)//
+ dup v5.2s, v4.s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)//
umlsl v19.8h, v4.8b, v24.8b
ld1 {v5.s}[1],[x3],x2 //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)//
umlsl v2.8h, v5.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)//
- dup v6.2s, v5.2s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)//
+ dup v6.2s, v5.s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)//
umlal v19.8h, v5.8b, v25.8b
ld1 {v6.s}[1],[x3],x2 //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)//
umlal v0.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)//
- dup v7.2s, v6.2s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)//
+ dup v7.2s, v6.s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)//
umlal v19.8h, v6.8b, v26.8b
ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)//
umlsl v2.8h, v7.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)//
- dup v4.2s, v7.2s[1]
+ dup v4.2s, v7.s[1]
add v0.8h, v0.8h , v2.8h //mul_res1 = vaddq_u16(mul_res1, mul_res2)//
umlsl v19.8h, v7.8b, v27.8b
ld1 {v4.s}[1],[x3],x2
umlal v19.8h, v4.8b, v28.8b
- dup v5.2s, v4.2s[1]
+ dup v5.2s, v4.s[1]
sqrshrun v0.8b, v0.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
ld1 {v5.s}[1],[x3]
diff --git a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s
index cd8addf..1c3807e 100644
--- a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s
+++ b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s
@@ -125,14 +125,14 @@ ihevc_inter_pred_luma_vert_w16inp_av8:
subs x7,x3,#0 //x3->ht
//ble end_loops //end loop jump
sxtl v0.8h, v0.8b
- dup v22.4h, v0.4h[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)//
- dup v23.4h, v0.4h[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)//
- dup v24.4h, v0.4h[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)//
- dup v25.4h, v0.4h[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)//
- dup v26.4h, v0.4h[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)//
- dup v27.4h, v0.4h[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)//
- dup v28.4h, v0.4h[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)//
- dup v29.4h, v0.4h[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)//
+ dup v22.4h, v0.h[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)//
+ dup v23.4h, v0.h[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)//
+ dup v24.4h, v0.h[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)//
+ dup v25.4h, v0.h[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)//
+ dup v26.4h, v0.h[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)//
+ dup v27.4h, v0.h[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)//
+ dup v28.4h, v0.h[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)//
+ dup v29.4h, v0.h[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)//
sub x20,x5,x6,lsl #2 //x6->dst_strd x5 ->wd
neg x9, x20
diff --git a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s
index ca48db5..79a1a9d 100644
--- a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s
+++ b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s
@@ -87,15 +87,15 @@ ihevc_inter_pred_luma_vert_w16out_av8:
mov x3,x16 //load ht
subs x7,x3,#0 //x3->ht
//ble end_loops_16out //end loop jump
- dup v22.8b, v0.8b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)//
+ dup v22.8b, v0.b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)//
cmp x5,#8
- dup v23.8b, v0.8b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)//
- dup v24.8b, v0.8b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)//
- dup v25.8b, v0.8b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)//
- dup v26.8b, v0.8b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)//
- dup v27.8b, v0.8b[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)//
- dup v28.8b, v0.8b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)//
- dup v29.8b, v0.8b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)//
+ dup v23.8b, v0.b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)//
+ dup v24.8b, v0.b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)//
+ dup v25.8b, v0.b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)//
+ dup v26.8b, v0.b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)//
+ dup v27.8b, v0.b[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)//
+ dup v28.8b, v0.b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)//
+ dup v29.8b, v0.b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)//
blt core_loop_wd_4_16out //core loop wd 4 jump
stp x0,x1, [sp, #-16]!
@@ -404,49 +404,49 @@ inner_loop_wd_4_16out:
add x3,x0,x2
ld1 {v4.s}[1],[x3],x2 //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)//
subs x12,x12,#4
- dup v5.2s, v4.2s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)//
+ dup v5.2s, v4.s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)//
ld1 {v5.s}[1],[x3],x2 //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)//
ld1 {v4.s}[0],[x0] //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)//
umull v0.8h, v5.8b, v23.8b //mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)//
- dup v6.2s, v5.2s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)//
+ dup v6.2s, v5.s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)//
add x0,x0,#4
ld1 {v6.s}[1],[x3],x2 //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)//
umlsl v0.8h, v4.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)//
- dup v7.2s, v6.2s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)//
+ dup v7.2s, v6.s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)//
ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)//
umlsl v0.8h, v6.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)//
umull v19.8h, v7.8b, v23.8b
- dup v4.2s, v7.2s[1] //src_tmp1 = vdup_lane_u32(src_tmp4, 1)//
+ dup v4.2s, v7.s[1] //src_tmp1 = vdup_lane_u32(src_tmp4, 1)//
umull v2.8h, v7.8b, v25.8b //mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)//
ld1 {v4.s}[1],[x3],x2 //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)//
umlsl v19.8h, v6.8b, v22.8b
umlal v0.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)//
- dup v5.2s, v4.2s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)//
+ dup v5.2s, v4.s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)//
umlsl v19.8h, v4.8b, v24.8b
ld1 {v5.s}[1],[x3],x2 //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)//
umlsl v2.8h, v5.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)//
- dup v6.2s, v5.2s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)//
+ dup v6.2s, v5.s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)//
umlal v19.8h, v5.8b, v25.8b
ld1 {v6.s}[1],[x3],x2 //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)//
umlal v0.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)//
- dup v7.2s, v6.2s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)//
+ dup v7.2s, v6.s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)//
umlal v19.8h, v6.8b, v26.8b
ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)//
umlsl v2.8h, v7.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)//
- dup v4.2s, v7.2s[1]
+ dup v4.2s, v7.s[1]
add v0.8h, v0.8h , v2.8h //mul_res1 = vaddq_u16(mul_res1, mul_res2)//
umlsl v19.8h, v7.8b, v27.8b
ld1 {v4.s}[1],[x3],x2
umlal v19.8h, v4.8b, v28.8b
- dup v5.2s, v4.2s[1]
+ dup v5.2s, v4.s[1]
//vqrshrun.s16 d0,q0,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
ld1 {v5.s}[1],[x3]
diff --git a/common/arm64/ihevc_inter_pred_luma_horz_w16out.s b/common/arm64/ihevc_inter_pred_luma_horz_w16out.s
index f7b6644..b39059b 100644
--- a/common/arm64/ihevc_inter_pred_luma_horz_w16out.s
+++ b/common/arm64/ihevc_inter_pred_luma_horz_w16out.s
@@ -133,23 +133,23 @@ ihevc_inter_pred_luma_horz_w16out_av8:
mov x15,#1
//ble end_loops
mov x14,x6 //loads wd
- dup v24.8b, v2.8b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
+ dup v24.8b, v2.b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
sub x16,x0,#3 //pu1_src - 3
- dup v25.8b, v2.8b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
+ dup v25.8b, v2.b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
add x8,x16,x2 //pu1_src_tmp2_8 = pu1_src + src_strd
- dup v26.8b, v2.8b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
+ dup v26.8b, v2.b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
sub x20,x14,x2,lsl #1 //2*src_strd - wd
neg x13, x20
- dup v27.8b, v2.8b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
+ dup v27.8b, v2.b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
sub x20,x14,x3 //dst_strd - wd
neg x12, x20
- dup v28.8b, v2.8b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)
+ dup v28.8b, v2.b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)
- dup v29.8b, v2.8b[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)
+ dup v29.8b, v2.b[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)
and x11,x19,#1 //calculating ht_residue ht_residue = (ht & 1)
- dup v30.8b, v2.8b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)
+ dup v30.8b, v2.b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)
sub x19,x19,x11 //decrement height by ht_residue(residue value is calculated outside)
- dup v31.8b, v2.8b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)
+ dup v31.8b, v2.b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)
cmp x11,#1
beq odd_height_decision
diff --git a/common/arm64/ihevc_intra_pred_chroma_dc.s b/common/arm64/ihevc_intra_pred_chroma_dc.s
index 2fdee98..a6969dd 100644
--- a/common/arm64/ihevc_intra_pred_chroma_dc.s
+++ b/common/arm64/ihevc_intra_pred_chroma_dc.s
@@ -180,8 +180,8 @@ core_loop_add:
epil_add_loop:
- smov x1, v18.2s[0]
- smov x11, v17.2s[0]
+ smov x1, v18.s[0]
+ smov x11, v17.s[0]
add x1,x1,x4
add x11,x11,x4
@@ -204,7 +204,7 @@ prologue_cpy_32:
beq epilogue_copy
st2 {v16.8b, v17.8b}, [x2],#16
- add x6, x6, #-16
+ sub x6, x6, #16
st2 {v16.8b, v17.8b}, [x5],#16
st2 {v16.8b, v17.8b}, [x8],#16
@@ -274,8 +274,8 @@ dc_4:
uadalp v17.1d, v3.2s
uadalp v18.1d, v2.2s
- smov x10, v17.2s[0]
- smov x11, v18.2s[0]
+ smov x10, v17.s[0]
+ smov x11, v18.s[0]
add x10,x10,x4
add x11,x11,x4
diff --git a/common/arm64/ihevc_intra_pred_chroma_horz.s b/common/arm64/ihevc_intra_pred_chroma_horz.s
index 8de655c..d2f3102 100644
--- a/common/arm64/ihevc_intra_pred_chroma_horz.s
+++ b/common/arm64/ihevc_intra_pred_chroma_horz.s
@@ -119,63 +119,63 @@ core_loop_16:
sub x12,x12,#16
ld1 { v18.8h},[x12] //load 16 values. d1[7] will have the 1st value.
- dup v2.8h, v0.4h[7] //duplicate the i value.
+ dup v2.8h, v0.h[7] //duplicate the i value.
- dup v4.8h, v0.4h[6] //duplicate the ii value.
- dup v6.8h, v0.4h[5] //duplicate the iii value.
+ dup v4.8h, v0.h[6] //duplicate the ii value.
+ dup v6.8h, v0.h[5] //duplicate the iii value.
st1 { v2.8h},[x2],x3 //store in 1st row 0-16 columns
st1 { v2.8h},[x9],x3 //store in 1st row 16-32 columns
- dup v1.8h, v0.4h[4]
+ dup v1.8h, v0.h[4]
st1 { v4.8h},[x2],x3
st1 { v4.8h},[x9],x3
- dup v2.8h, v0.4h[3]
+ dup v2.8h, v0.h[3]
st1 { v6.8h},[x2],x3
st1 { v6.8h},[x9],x3
- dup v4.8h, v0.4h[2]
+ dup v4.8h, v0.h[2]
st1 { v1.8h},[x2],x3
st1 { v1.8h},[x9],x3
- dup v6.8h, v0.4h[1]
+ dup v6.8h, v0.h[1]
st1 { v2.8h},[x2],x3
st1 { v2.8h},[x9],x3
- dup v1.8h, v0.4h[0]
+ dup v1.8h, v0.h[0]
st1 { v4.8h},[x2],x3
st1 { v4.8h},[x9],x3
- dup v2.8h, v18.4h[7]
+ dup v2.8h, v18.h[7]
st1 { v6.8h},[x2],x3
st1 { v6.8h},[x9],x3
- dup v4.8h, v18.4h[6]
+ dup v4.8h, v18.h[6]
st1 { v1.8h},[x2],x3
st1 { v1.8h},[x9],x3
- dup v6.8h, v18.4h[5]
+ dup v6.8h, v18.h[5]
st1 { v2.8h},[x2],x3
st1 { v2.8h},[x9],x3
- dup v1.8h, v18.4h[4]
+ dup v1.8h, v18.h[4]
st1 { v4.8h},[x2],x3
st1 { v4.8h},[x9],x3
- dup v2.8h, v18.4h[3]
+ dup v2.8h, v18.h[3]
st1 { v6.8h},[x2],x3
st1 { v6.8h},[x9],x3
- dup v4.8h, v18.4h[2]
+ dup v4.8h, v18.h[2]
st1 { v1.8h},[x2],x3
st1 { v1.8h},[x9],x3
- dup v6.8h, v18.4h[1]
+ dup v6.8h, v18.h[1]
st1 { v2.8h},[x2],x3
st1 { v2.8h},[x9],x3
sub x12,x12,#16 //move to 16th value pointer
- dup v1.8h, v18.4h[0]
+ dup v1.8h, v18.h[0]
st1 { v4.8h},[x2],x3
st1 { v4.8h},[x9],x3
@@ -203,33 +203,33 @@ core_loop_8:
sub x12,x12,#16
// ld1 { v30.16b},[x12]
- dup v18.8h, v0.4h[7]
+ dup v18.8h, v0.h[7]
//vmovl.u8 q13,d26
- dup v2.8h, v0.4h[6]
+ dup v2.8h, v0.h[6]
//vsubl.u8 q12,d30,d28
- dup v4.8h, v0.4h[5]
+ dup v4.8h, v0.h[5]
//vshr.s16 q12,q12,#1
- dup v6.8h, v0.4h[4]
+ dup v6.8h, v0.h[4]
//vqadd.s16 q11,q13,q12
- dup v1.8h, v0.4h[3]
+ dup v1.8h, v0.h[3]
//vqmovun.s16 d22,q11
st1 { v18.8h},[x2],x3
- dup v18.8h, v0.4h[2]
+ dup v18.8h, v0.h[2]
//vsubl.u8 q12,d31,d28
- dup v19.8h, v0.4h[1]
+ dup v19.8h, v0.h[1]
//vshr.s16 q12,q12,#1
- dup v20.8h, v0.4h[0]
+ dup v20.8h, v0.h[0]
//vqadd.s16 q11,q13,q12
- dup v16.8h, v0.4h[3]
+ dup v16.8h, v0.h[3]
//vqmovun.s16 d22,q11
st1 { v2.8h},[x2],x3
@@ -284,32 +284,32 @@ core_loop_4:
ld1 {v0.8b},[x12]
sub x12,x12,#8
ld1 {v30.8b},[x12]
- dup v26.4h, v0.4h[3]
+ dup v26.4h, v0.h[3]
dup v28.8b,w14
- dup v3.4h, v0.4h[2]
+ dup v3.4h, v0.h[2]
uxtl v26.8h, v26.8b
- dup v4.4h, v0.4h[1]
+ dup v4.4h, v0.h[1]
usubl v24.8h, v30.8b, v28.8b
- dup v5.4h, v0.4h[0]
+ dup v5.4h, v0.h[0]
sshr v24.8h, v24.8h,#1
- dup v6.4h, v0.4h[3]
+ dup v6.4h, v0.h[3]
sqadd v22.8h, v26.8h , v24.8h
- dup v7.4h, v0.4h[2]
+ dup v7.4h, v0.h[2]
sqxtun v22.8b, v22.8h
st1 {v6.8b},[x2],x3
st1 {v3.8b},[x2],x3
- dup v1.4h, v0.4h[1]
+ dup v1.4h, v0.h[1]
st1 {v4.8b},[x2],x3
st1 {v5.8b},[x2],x3
- dup v17.4h, v0.4h[0]
+ dup v17.4h, v0.h[0]
//vst1.8 {d6},[x2],x3
//vst1.8 {d7},[x2],x3
@@ -331,16 +331,16 @@ core_loop_4:
sub x12,x12,#5
ld1 {v0.8b},[x12]
dup v28.8b,w14
- dup v26.8b, v0.8b[3]
+ dup v26.8b, v0.b[3]
uxtl v26.8h, v26.8b
- dup v3.8b, v0.8b[2]
+ dup v3.8b, v0.b[2]
usubl v24.8h, v30.8b, v28.8b
- dup v4.8b, v0.8b[1]
+ dup v4.8b, v0.b[1]
sshr v24.8h, v24.8h,#1
- dup v5.8b, v0.8b[0]
+ dup v5.8b, v0.b[0]
sqadd v22.8h, v26.8h , v24.8h
sqxtun v22.8b, v22.8h
diff --git a/common/arm64/ihevc_intra_pred_chroma_mode2.s b/common/arm64/ihevc_intra_pred_chroma_mode2.s
index d2c0730..aec3da4 100644
--- a/common/arm64/ihevc_intra_pred_chroma_mode2.s
+++ b/common/arm64/ihevc_intra_pred_chroma_mode2.s
@@ -116,7 +116,7 @@ ihevc_intra_pred_chroma_mode2_av8:
add x0,x0,x4,lsl #2
sub x0,x0,#0x12 //src[1]
- add x10,x0,#-2
+ sub x10,x0,#2
prologue_cpy_32:
@@ -223,7 +223,7 @@ kernel_mode2:
rev64 v23.8b, v7.8b
rev64 v24.8b, v8.8b
- add x10,x0,#-2
+ sub x10,x0,#2
rev64 v25.8b, v9.8b
rev64 v26.8b, v10.8b
diff --git a/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s b/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s
index b22d182..3230136 100644
--- a/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s
+++ b/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s
@@ -142,13 +142,13 @@ prologue:
xtn v4.8b, v4.8h
shrn v5.8b, v2.8h,#5 //idx = pos >> 5
- dup v31.8b, v4.8b[0]
+ dup v31.8b, v4.b[0]
add x0,x2,x3
- smov x14, v5.2s[0] //(i row)extract idx to the r register
+ smov x14, v5.s[0] //(i row)extract idx to the r register
lsl x14,x14,#1
- dup v29.8b, v4.8b[1] //(ii)
+ dup v29.8b, v4.b[1] //(ii)
and x9,x14,#0xff //(i row) get the last byte
add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx]
@@ -172,11 +172,11 @@ prologue:
umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
asr x14,x14,#8 //(iv)
- dup v27.8b, v4.8b[2] //(iii)
+ dup v27.8b, v4.b[2] //(iii)
sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract)
and x9,x14,#0xff //(iv)
- dup v25.8b, v4.8b[3] //(iv)
+ dup v25.8b, v4.b[3] //(iv)
umull v14.8h, v12.8b, v28.8b //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx]
@@ -191,10 +191,10 @@ prologue:
ld1 {v21.8b},[x12] //(iv)ref_main_idx_1
- dup v31.8b, v4.8b[4] //(v)
+ dup v31.8b, v4.b[4] //(v)
umull v18.8h, v16.8b, v26.8b //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
- smov x14, v5.2s[1] //extract idx to the r register
+ smov x14, v5.s[1] //extract idx to the r register
umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
lsl x14,x14,#1
@@ -202,7 +202,7 @@ prologue:
rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5)
and x9,x14,#0xff //(v)
- dup v29.8b, v4.8b[5] //(vi)
+ dup v29.8b, v4.b[5] //(vi)
add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
ld1 {v23.8b},[x10],x11 //(v)ref_main_idx
@@ -219,7 +219,7 @@ prologue:
rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx]
- dup v27.8b, v4.8b[6] //(vii)
+ dup v27.8b, v4.b[6] //(vii)
asr x14,x14,#8 //(vii)
and x9,x14,#0xff //(vii)
@@ -236,7 +236,7 @@ prologue:
rshrn v22.8b, v22.8h,#5 //(iv)shift_res = vrshrn_n_u16(add_res, 5)
asr x14,x14,#8 //(viii)
- dup v25.8b, v4.8b[7] //(viii)
+ dup v25.8b, v4.b[7] //(viii)
and x9,x14,#0xff //(viii)
ld1 {v16.8b},[x10],x11 //(vii)ref_main_idx
@@ -274,14 +274,14 @@ prologue:
and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
xtn v4.8b, v4.8h
shrn v3.8b, v2.8h,#5 //idx = pos >> 5
- smov x14, v3.2s[0] //(i)extract idx to the r register
+ smov x14, v3.s[0] //(i)extract idx to the r register
lsl x14,x14,#1
and x9,x14,#0xff //(i)
add x10,x8,x9 //(i)*pu1_ref[ref_main_idx]
kernel_8_rows:
asr x14,x14,#8 //(ii)
- dup v31.8b, v4.8b[0]
+ dup v31.8b, v4.b[0]
subs x4,x4,#8
ld1 {v23.8b},[x10],x11 //(i)ref_main_idx
@@ -298,7 +298,7 @@ kernel_8_rows:
umlal v22.8h, v21.8b, v25.8b //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
asr x14,x14,#8 //(iii)
- dup v29.8b, v4.8b[1] //(ii)
+ dup v29.8b, v4.b[1] //(ii)
rshrn v18.8b, v18.8h,#5 //(vii)shift_res = vrshrn_n_u16(add_res, 5)
and x9,x14,#0xff //(iii)
@@ -314,10 +314,10 @@ kernel_8_rows:
umlal v10.8h, v9.8b, v31.8b //(i)vmull_u8(ref_main_idx_1, dup_const_fract)
and x9,x14,#0xff //(iv)
- smov x14, v3.2s[1] //extract idx to the r register
+ smov x14, v3.s[1] //extract idx to the r register
rshrn v22.8b, v22.8h,#5 //(viii)shift_res = vrshrn_n_u16(add_res, 5)
- dup v27.8b, v4.8b[2] //(iii)
+ dup v27.8b, v4.b[2] //(iii)
sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract)
csel x4, x5, x4,le //reload nt
@@ -331,7 +331,7 @@ kernel_8_rows:
ld1 {v17.8b},[x10] //(iii)ref_main_idx_1
rshrn v10.8b, v10.8h,#5 //(i)shift_res = vrshrn_n_u16(add_res, 5)
- dup v25.8b, v4.8b[3] //(iv)
+ dup v25.8b, v4.b[3] //(iv)
umull v2.8h, v5.8b, v0.8b //pos = ((row + 1) * intra_pred_ang)
st1 {v22.8b},[x0] //(viii)
@@ -345,7 +345,7 @@ kernel_8_rows:
umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
add x0,x2,x3
- dup v31.8b, v4.8b[4] //(v)
+ dup v31.8b, v4.b[4] //(v)
rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5)
and x9,x14,#0xff //(v)
@@ -353,15 +353,15 @@ kernel_8_rows:
sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract)
add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
- dup v29.8b, v4.8b[5] //(vi)
+ dup v29.8b, v4.b[5] //(vi)
umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
asr x14,x14,#8 //(vi)
- dup v27.8b, v4.8b[6] //(vii)
+ dup v27.8b, v4.b[6] //(vii)
umlal v22.8h, v21.8b, v25.8b //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
and x9,x14,#0xff //(vi)
- dup v25.8b, v4.8b[7] //(viii)
+ dup v25.8b, v4.b[7] //(viii)
rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx]
@@ -385,7 +385,7 @@ kernel_8_rows:
umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
and x9,x14,#0xff //(viii)
- smov x14, v3.2s[0] //(i)extract idx to the r register
+ smov x14, v3.s[0] //(i)extract idx to the r register
umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
add x12,x8,x9 //(viii)*pu1_ref[ref_main_idx]
diff --git a/common/arm64/ihevc_intra_pred_chroma_ver.s b/common/arm64/ihevc_intra_pred_chroma_ver.s
index 8d1daf7..451cae9 100644
--- a/common/arm64/ihevc_intra_pred_chroma_ver.s
+++ b/common/arm64/ihevc_intra_pred_chroma_ver.s
@@ -121,7 +121,7 @@ copy_16:
ld2 {v22.8b, v23.8b}, [x6] //16 loads (col 16:31)
lsl x11, x3, #2
- add x11, x11, #-16
+ sub x11, x11, #16
st2 {v20.8b, v21.8b}, [x2],#16
diff --git a/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s b/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
index 5d65e63..bfb92bc 100644
--- a/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
+++ b/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
@@ -207,7 +207,7 @@ end_loop_copy:
strh w11, [x6], #2
sxtw x11,w11
- cmp x9, #-1
+ cmn x9, #1
bge prologue_8_16_32
add x6, sp, x4, lsl #1 //ref_temp + 2 * nt
diff --git a/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s b/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
index 261c591..c7feebd 100644
--- a/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
+++ b/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
@@ -176,7 +176,7 @@ end_loop_copy:
strh w11, [x6]
sxtw x11,w11
- cmp x9, #-1
+ cmn x9, #1
bge linear_filtering
add x6, sp, x4 ,lsl #1 //ref_temp + 2 * nt
@@ -256,13 +256,13 @@ prologue:
shrn v5.8b, v2.8h,#5 //idx = pos >> 5
shl v5.8b, v5.8b,#1
- dup v31.8b, v4.8b[0]
+ dup v31.8b, v4.b[0]
add x0,x2,x3
- smov x14, v5.2s[0] //(i row)extract idx to the r register
+ smov x14, v5.s[0] //(i row)extract idx to the r register
// lsl x14,x14,#1
- dup v29.8b, v4.8b[1] //(ii)
+ dup v29.8b, v4.b[1] //(ii)
sbfx x9,x14,#0,#8
add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx]
@@ -283,11 +283,11 @@ prologue:
ld1 {v13.8b},[x12] //(ii)ref_main_idx_1
umlal v23.8h, v19.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
- dup v27.8b, v4.8b[2] //(iii)
+ dup v27.8b, v4.b[2] //(iii)
sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract)
sbfx x9,x14,#24,#8
- dup v25.8b, v4.8b[3] //(iv)
+ dup v25.8b, v4.b[3] //(iv)
umull v14.8h, v12.8b, v28.8b //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx]
@@ -302,10 +302,10 @@ prologue:
ld1 {v21.8b},[x12] //(iv)ref_main_idx_1
- dup v31.8b, v4.8b[4] //(v)
+ dup v31.8b, v4.b[4] //(v)
umull v18.8h, v16.8b, v26.8b //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
- smov x14, v5.2s[1] //extract idx to the r register
+ smov x14, v5.s[1] //extract idx to the r register
umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
// lsl x14,x14,#1
@@ -313,7 +313,7 @@ prologue:
rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5)
sbfx x9,x14,#0,#8
- dup v29.8b, v4.8b[5] //(vi)
+ dup v29.8b, v4.b[5] //(vi)
add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
ld1 {v7.8b},[x10],x11 //(v)ref_main_idx
@@ -329,7 +329,7 @@ prologue:
rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx]
- dup v27.8b, v4.8b[6] //(vii)
+ dup v27.8b, v4.b[6] //(vii)
sbfx x9,x14,#16,#8
sub v30.8b, v1.8b , v31.8b //(v)32-fract(dup_const_32_fract)
@@ -344,7 +344,7 @@ prologue:
st1 {v18.8b},[x0],x3 //(iii)
rshrn v22.8b, v22.8h,#5 //(iv)shift_res = vrshrn_n_u16(add_res, 5)
- dup v25.8b, v4.8b[7] //(viii)
+ dup v25.8b, v4.b[7] //(viii)
sbfx x9,x14,#24,#8
ld1 {v16.8b},[x10],x11 //(vii)ref_main_idx
@@ -386,13 +386,13 @@ prologue:
xtn v4.8b, v4.8h
shrn v3.8b, v2.8h,#5 //idx = pos >> 5
shl v3.8b, v3.8b,#1
- smov x14, v3.2s[0] //(i)extract idx to the r register
+ smov x14, v3.s[0] //(i)extract idx to the r register
// lsl x14,x14,#1
sbfx x9,x14,#0,#8
add x10,x8,x9 //(i)*pu1_ref[ref_main_idx]
kernel_8_rows:
- dup v31.8b, v4.8b[0]
+ dup v31.8b, v4.b[0]
subs x4,x4,#8
sbfx x9,x14,#8,#8
@@ -409,7 +409,7 @@ kernel_8_rows:
ld1 {v5.8b},[x6] //loads the row value
umlal v22.8h, v21.8b, v25.8b //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
- dup v29.8b, v4.8b[1] //(ii)
+ dup v29.8b, v4.b[1] //(ii)
rshrn v18.8b, v18.8h,#5 //(vii)shift_res = vrshrn_n_u16(add_res, 5)
sbfx x9,x14,#16,#8
@@ -428,10 +428,10 @@ kernel_8_rows:
sbfx x9,x14,#24,#8
csel x4, x5, x4,le //reload nt
- smov x14, v3.2s[1] //extract idx to the r register
+ smov x14, v3.s[1] //extract idx to the r register
rshrn v22.8b, v22.8h,#5 //(viii)shift_res = vrshrn_n_u16(add_res, 5)
- dup v27.8b, v4.8b[2] //(iii)
+ dup v27.8b, v4.b[2] //(iii)
sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract)
add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx]
@@ -444,7 +444,7 @@ kernel_8_rows:
ld1 {v17.8b},[x10] //(iii)ref_main_idx_1
rshrn v23.8b, v23.8h,#5 //(i)shift_res = vrshrn_n_u16(add_res, 5)
- dup v25.8b, v4.8b[3] //(iv)
+ dup v25.8b, v4.b[3] //(iv)
smull v2.8h, v5.8b, v0.8b //pos = ((row + 1) * intra_pred_ang)
st1 {v22.8b},[x0] //(viii)
@@ -460,7 +460,7 @@ kernel_8_rows:
sbfx x9,x14,#0,#8
add x0,x2,x3
- dup v31.8b, v4.8b[4] //(v)
+ dup v31.8b, v4.b[4] //(v)
rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5)
add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
@@ -469,16 +469,16 @@ kernel_8_rows:
st1 {v23.8b},[x2],#8 //(i)
sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract)
- dup v29.8b, v4.8b[5] //(vi)
+ dup v29.8b, v4.b[5] //(vi)
umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
- dup v27.8b, v4.8b[6] //(vii)
+ dup v27.8b, v4.b[6] //(vii)
umlal v22.8h, v21.8b, v25.8b //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx]
sbfx x9,x14,#16,#8
- dup v25.8b, v4.8b[7] //(viii)
+ dup v25.8b, v4.b[7] //(viii)
rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
ld1 {v7.8b},[x10],x11 //(v)ref_main_idx
@@ -501,7 +501,7 @@ kernel_8_rows:
ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
umull v23.8h, v7.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
- smov x14, v3.2s[0] //(i)extract idx to the r register
+ smov x14, v3.s[0] //(i)extract idx to the r register
umlal v23.8h, v19.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
add x12,x8,x9 //(viii)*pu1_ref[ref_main_idx]
diff --git a/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s b/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s
index 66f4699..dcc0fc7 100644
--- a/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s
+++ b/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s
@@ -214,7 +214,7 @@ end_loop_copy:
strb w11, [x6], #1
sxtw x11,w11
- cmp x9, #-1
+ cmn x9, #1
bge prologue_8_16_32
add x6, sp, x4 //ref_temp + nt
diff --git a/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s b/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
index 9b59d58..322e4c7 100644
--- a/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
+++ b/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
@@ -183,7 +183,7 @@ end_loop_copy:
strb w11, [x6]
sxtw x11,w11
- cmp x9, #-1
+ cmn x9, #1
bge linear_filtering
add x6, sp, x4 //ref_temp + nt
@@ -259,13 +259,13 @@ prologue:
xtn v4.8b, v4.8h
shrn v5.8b, v2.8h,#5 //idx = pos >> 5
- dup v31.8b, v4.8b[0]
+ dup v31.8b, v4.b[0]
add x0,x2,x3
- umov w14, v5.2s[0] //(i row)extract idx to the r register
+ umov w14, v5.s[0] //(i row)extract idx to the r register
sxtw x14,w14
- dup v29.8b, v4.8b[1] //(ii)
+ dup v29.8b, v4.b[1] //(ii)
sbfx x9,x14,#0,#8
add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx]
@@ -286,11 +286,11 @@ prologue:
ld1 {v13.8b},[x12] //(ii)ref_main_idx_1
umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
- dup v27.8b, v4.8b[2] //(iii)
+ dup v27.8b, v4.b[2] //(iii)
sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract)
sbfx x9,x14,#24,#8
- dup v25.8b, v4.8b[3] //(iv)
+ dup v25.8b, v4.b[3] //(iv)
umull v14.8h, v12.8b, v28.8b //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx]
@@ -305,10 +305,10 @@ prologue:
ld1 {v21.8b},[x12] //(iv)ref_main_idx_1
- dup v31.8b, v4.8b[4] //(v)
+ dup v31.8b, v4.b[4] //(v)
umull v18.8h, v16.8b, v26.8b //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
- umov w14, v5.2s[1] //extract idx to the r register
+ umov w14, v5.s[1] //extract idx to the r register
sxtw x14,w14
umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
@@ -316,7 +316,7 @@ prologue:
rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5)
sbfx x9,x14,#0,#8
- dup v29.8b, v4.8b[5] //(vi)
+ dup v29.8b, v4.b[5] //(vi)
add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
ld1 {v23.8b},[x10],x11 //(v)ref_main_idx
@@ -332,7 +332,7 @@ prologue:
rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx]
- dup v27.8b, v4.8b[6] //(vii)
+ dup v27.8b, v4.b[6] //(vii)
sbfx x9,x14,#16,#8
sub v30.8b, v1.8b , v31.8b //(v)32-fract(dup_const_32_fract)
@@ -347,7 +347,7 @@ prologue:
st1 {v18.8b},[x0],x3 //(iii)
rshrn v22.8b, v22.8h,#5 //(iv)shift_res = vrshrn_n_u16(add_res, 5)
- dup v25.8b, v4.8b[7] //(viii)
+ dup v25.8b, v4.b[7] //(viii)
sbfx x9,x14,#24,#8
ld1 {v16.8b},[x10],x11 //(vii)ref_main_idx
@@ -385,13 +385,13 @@ prologue:
and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
xtn v4.8b, v4.8h
shrn v3.8b, v2.8h,#5 //idx = pos >> 5
- umov w14, v3.2s[0] //(i)extract idx to the r register
+ umov w14, v3.s[0] //(i)extract idx to the r register
sxtw x14,w14
sbfx x9,x14,#0,#8
add x10,x8,x9 //(i)*pu1_ref[ref_main_idx]
kernel_8_rows:
- dup v31.8b, v4.8b[0]
+ dup v31.8b, v4.b[0]
subs x4,x4,#8
sbfx x9,x14,#8,#8
@@ -408,7 +408,7 @@ kernel_8_rows:
ld1 {v5.8b},[x6] //loads the row value
umlal v22.8h, v21.8b, v25.8b //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
- dup v29.8b, v4.8b[1] //(ii)
+ dup v29.8b, v4.b[1] //(ii)
rshrn v18.8b, v18.8h,#5 //(vii)shift_res = vrshrn_n_u16(add_res, 5)
sbfx x9,x14,#16,#8
@@ -427,11 +427,11 @@ kernel_8_rows:
sbfx x9,x14,#24,#8
csel x4, x5, x4,le //reload nt
- umov w14, v3.2s[1] //extract idx to the r register
+ umov w14, v3.s[1] //extract idx to the r register
sxtw x14,w14
rshrn v22.8b, v22.8h,#5 //(viii)shift_res = vrshrn_n_u16(add_res, 5)
- dup v27.8b, v4.8b[2] //(iii)
+ dup v27.8b, v4.b[2] //(iii)
sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract)
add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx]
@@ -444,7 +444,7 @@ kernel_8_rows:
ld1 {v17.8b},[x10] //(iii)ref_main_idx_1
rshrn v10.8b, v10.8h,#5 //(i)shift_res = vrshrn_n_u16(add_res, 5)
- dup v25.8b, v4.8b[3] //(iv)
+ dup v25.8b, v4.b[3] //(iv)
smull v2.8h, v5.8b, v0.8b //pos = ((row + 1) * intra_pred_ang)
st1 {v22.8b},[x0] //(viii)
@@ -459,7 +459,7 @@ kernel_8_rows:
sbfx x9,x14,#0,#8
add x0,x2,x3
- dup v31.8b, v4.8b[4] //(v)
+ dup v31.8b, v4.b[4] //(v)
rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5)
add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
@@ -468,16 +468,16 @@ kernel_8_rows:
st1 {v10.8b},[x2],#8 //(i)
sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract)
- dup v29.8b, v4.8b[5] //(vi)
+ dup v29.8b, v4.b[5] //(vi)
umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
- dup v27.8b, v4.8b[6] //(vii)
+ dup v27.8b, v4.b[6] //(vii)
umlal v22.8h, v21.8b, v25.8b //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx]
sbfx x9,x14,#16,#8
- dup v25.8b, v4.8b[7] //(viii)
+ dup v25.8b, v4.b[7] //(viii)
rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
ld1 {v23.8b},[x10],x11 //(v)ref_main_idx
@@ -498,7 +498,7 @@ kernel_8_rows:
ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
- umov w14, v3.2s[0] //(i)extract idx to the r register
+ umov w14, v3.s[0] //(i)extract idx to the r register
sxtw x14,w14
umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
diff --git a/common/arm64/ihevc_intra_pred_luma_dc.s b/common/arm64/ihevc_intra_pred_luma_dc.s
index e4fdb5d..fc86ffa 100644
--- a/common/arm64/ihevc_intra_pred_luma_dc.s
+++ b/common/arm64/ihevc_intra_pred_luma_dc.s
@@ -200,7 +200,7 @@ epil_add_loop:
mov x20,#128
csel x6, x20, x6,eq
- dup v16.8b, v18.8b[0] //dc_val
+ dup v16.8b, v18.b[0] //dc_val
shl d25, d18,#1 //2*dc
beq prologue_cpy_32
@@ -218,7 +218,7 @@ epil_add_loop:
add d23, d23 , d17 //3*dc + 2
add x12, x12, #8 //offset after one 8x8 block (-7*strd + 8)
- dup v24.8h, v23.4h[0] //3*dc + 2 (moved to all lanes)
+ dup v24.8h, v23.h[0] //3*dc + 2 (moved to all lanes)
sub x0, x3, x4 //strd - nt
prologue_col:
@@ -368,9 +368,9 @@ prologue_cpy_32:
add x5, x2, x3
add x8, x5, x3
add x10, x8, x3
- dup v20.16b, v16.8b[0]
+ dup v20.16b, v16.b[0]
lsl x6, x3, #2
- add x6, x6, #-16
+ sub x6, x6, #16
st1 {v20.16b}, [x2],#16
st1 {v20.16b}, [x5],#16
@@ -451,7 +451,7 @@ dc_4:
shl d25, d18,#1 //2*dc
sub x9, x9, #3 //&src[2nt-1-row]
- dup v16.8b, v18.8b[0] //dc_val
+ dup v16.8b, v18.b[0] //dc_val
add d27, d25 , d28 //src[2nt+1]+2+src[2nt-1]+2dc_val
ushr v29.4h, v27.4h,#2 //final dst[0]'s value in d15[0]
@@ -461,7 +461,7 @@ dc_4:
add d23, d23 , d17 //3*dc + 2
add x12, x12, #4 //offset after one 4x4 block (-3*strd + 4)
- dup v24.8h, v23.4h[0] //3*dc + 2 (moved to all lanes)
+ dup v24.8h, v23.h[0] //3*dc + 2 (moved to all lanes)
sub x0, x3, x4 //strd - nt
diff --git a/common/arm64/ihevc_intra_pred_luma_horz.s b/common/arm64/ihevc_intra_pred_luma_horz.s
index 95452e4..3d1f27f 100644
--- a/common/arm64/ihevc_intra_pred_luma_horz.s
+++ b/common/arm64/ihevc_intra_pred_luma_horz.s
@@ -119,63 +119,63 @@ ihevc_intra_pred_luma_horz_av8:
core_loop_32:
ld1 { v0.16b},[x12] //load 16 values. d1[7] will have the 1st value.
- dup v2.16b, v0.16b[15] //duplicate the i value.
+ dup v2.16b, v0.b[15] //duplicate the i value.
- dup v4.16b, v0.16b[14] //duplicate the ii value.
- dup v6.16b, v0.16b[13] //duplicate the iii value.
+ dup v4.16b, v0.b[14] //duplicate the ii value.
+ dup v6.16b, v0.b[13] //duplicate the iii value.
st1 { v2.16b},[x2],x3 //store in 1st row 0-16 columns
st1 { v2.16b},[x9],x3 //store in 1st row 16-32 columns
- dup v1.16b, v0.16b[12]
+ dup v1.16b, v0.b[12]
st1 { v4.16b},[x2],x3
st1 { v4.16b},[x9],x3
- dup v2.16b, v0.16b[11]
+ dup v2.16b, v0.b[11]
st1 { v6.16b},[x2],x3
st1 { v6.16b},[x9],x3
- dup v4.16b, v0.16b[10]
+ dup v4.16b, v0.b[10]
st1 { v1.16b},[x2],x3
st1 { v1.16b},[x9],x3
- dup v6.16b, v0.16b[9]
+ dup v6.16b, v0.b[9]
st1 { v2.16b},[x2],x3
st1 { v2.16b},[x9],x3
- dup v1.16b, v0.16b[8]
+ dup v1.16b, v0.b[8]
st1 { v4.16b},[x2],x3
st1 { v4.16b},[x9],x3
- dup v2.16b, v0.8b[7]
+ dup v2.16b, v0.b[7]
st1 { v6.16b},[x2],x3
st1 { v6.16b},[x9],x3
- dup v4.16b, v0.8b[6]
+ dup v4.16b, v0.b[6]
st1 { v1.16b},[x2],x3
st1 { v1.16b},[x9],x3
- dup v6.16b, v0.8b[5]
+ dup v6.16b, v0.b[5]
st1 { v2.16b},[x2],x3
st1 { v2.16b},[x9],x3
- dup v1.16b, v0.8b[4]
+ dup v1.16b, v0.b[4]
st1 { v4.16b},[x2],x3
st1 { v4.16b},[x9],x3
- dup v2.16b, v0.8b[3]
+ dup v2.16b, v0.b[3]
st1 { v6.16b},[x2],x3
st1 { v6.16b},[x9],x3
- dup v4.16b, v0.8b[2]
+ dup v4.16b, v0.b[2]
st1 { v1.16b},[x2],x3
st1 { v1.16b},[x9],x3
- dup v6.16b, v0.8b[1]
+ dup v6.16b, v0.b[1]
st1 { v2.16b},[x2],x3
st1 { v2.16b},[x9],x3
sub x12,x12,#16 //move to 16th value pointer
- dup v1.16b, v0.8b[0]
+ dup v1.16b, v0.b[0]
st1 { v4.16b},[x2],x3
st1 { v4.16b},[x9],x3
@@ -202,33 +202,33 @@ core_loop_16:
dup v28.8b,w14
sub x12,x12,#17
ld1 { v0.16b},[x12]
- dup v26.8b, v0.16b[15]
+ dup v26.8b, v0.b[15]
uxtl v26.8h, v26.8b
- dup v2.16b, v0.16b[14]
+ dup v2.16b, v0.b[14]
usubl v24.8h, v30.8b, v28.8b
- dup v4.16b, v0.16b[13]
+ dup v4.16b, v0.b[13]
sshr v24.8h, v24.8h,#1
- dup v6.16b, v0.16b[12]
+ dup v6.16b, v0.b[12]
sqadd v22.8h, v26.8h , v24.8h
- dup v1.16b, v0.16b[11]
+ dup v1.16b, v0.b[11]
sqxtun v22.8b, v22.8h
st1 {v22.8b},[x2],#8
- dup v18.16b, v0.16b[10]
+ dup v18.16b, v0.b[10]
usubl v24.8h, v31.8b, v28.8b
- dup v19.16b, v0.16b[9]
+ dup v19.16b, v0.b[9]
sshr v24.8h, v24.8h,#1
- dup v20.16b, v0.16b[8]
+ dup v20.16b, v0.b[8]
sqadd v22.8h, v26.8h , v24.8h
- dup v16.16b, v0.8b[7]
+ dup v16.16b, v0.b[7]
sqxtun v22.8b, v22.8h
st1 {v22.8b},[x2],x3
@@ -240,25 +240,25 @@ core_loop_16:
st1 { v6.16b},[x2],x3
st1 { v1.16b},[x2],x3
- dup v2.16b, v0.8b[6]
+ dup v2.16b, v0.b[6]
st1 { v18.16b},[x2],x3
- dup v4.16b, v0.8b[5]
+ dup v4.16b, v0.b[5]
st1 { v19.16b},[x2],x3
- dup v6.16b, v0.8b[4]
+ dup v6.16b, v0.b[4]
st1 { v20.16b},[x2],x3
- dup v1.16b, v0.8b[3]
+ dup v1.16b, v0.b[3]
st1 { v16.16b},[x2],x3
- dup v18.16b, v0.8b[2]
+ dup v18.16b, v0.b[2]
st1 { v2.16b},[x2],x3
- dup v19.16b, v0.8b[1]
+ dup v19.16b, v0.b[1]
st1 { v4.16b},[x2],x3
- dup v20.16b, v0.8b[0]
+ dup v20.16b, v0.b[0]
st1 { v6.16b},[x2],x3
st1 { v1.16b},[x2],x3
@@ -281,32 +281,32 @@ core_loop_8:
sub x12,x12,#9
ld1 {v0.8b},[x12]
- dup v26.8b, v0.8b[7]
+ dup v26.8b, v0.b[7]
dup v28.8b,w14
- dup v3.8b, v0.8b[6]
+ dup v3.8b, v0.b[6]
uxtl v26.8h, v26.8b
- dup v4.8b, v0.8b[5]
+ dup v4.8b, v0.b[5]
usubl v24.8h, v30.8b, v28.8b
- dup v5.8b, v0.8b[4]
+ dup v5.8b, v0.b[4]
sshr v24.8h, v24.8h,#1
- dup v6.8b, v0.8b[3]
+ dup v6.8b, v0.b[3]
sqadd v22.8h, v26.8h , v24.8h
- dup v7.8b, v0.8b[2]
+ dup v7.8b, v0.b[2]
sqxtun v22.8b, v22.8h
st1 {v22.8b},[x2],x3
st1 {v3.8b},[x2],x3
- dup v1.8b, v0.8b[1]
+ dup v1.8b, v0.b[1]
st1 {v4.8b},[x2],x3
st1 {v5.8b},[x2],x3
- dup v17.8b, v0.8b[0]
+ dup v17.8b, v0.b[0]
st1 {v6.8b},[x2],x3
st1 {v7.8b},[x2],x3
@@ -328,16 +328,16 @@ core_loop_4:
sub x12,x12,#5
ld1 {v0.8b},[x12]
dup v28.8b,w14
- dup v26.8b, v0.8b[3]
+ dup v26.8b, v0.b[3]
uxtl v26.8h, v26.8b
- dup v3.8b, v0.8b[2]
+ dup v3.8b, v0.b[2]
usubl v24.8h, v30.8b, v28.8b
- dup v4.8b, v0.8b[1]
+ dup v4.8b, v0.b[1]
sshr v24.8h, v24.8h,#1
- dup v5.8b, v0.8b[0]
+ dup v5.8b, v0.b[0]
sqadd v22.8h, v26.8h , v24.8h
sqxtun v22.8b, v22.8h
diff --git a/common/arm64/ihevc_intra_pred_luma_mode2.s b/common/arm64/ihevc_intra_pred_luma_mode2.s
index 598ce5a..6eec479 100644
--- a/common/arm64/ihevc_intra_pred_luma_mode2.s
+++ b/common/arm64/ihevc_intra_pred_luma_mode2.s
@@ -116,7 +116,7 @@ ihevc_intra_pred_luma_mode2_av8:
add x0,x0,x4,lsl #1
sub x0,x0,#9 //src[1]
- add x10,x0,#-1
+ sub x10,x0,#1
prologue_cpy_32:
@@ -215,7 +215,7 @@ kernel_mode2:
add x9, x7, x3
rev64 v20.8b, v4.8b
- add x10,x0,#-1
+ sub x10,x0,#1
rev64 v21.8b, v5.8b
subs x1, x1, #8
@@ -244,7 +244,7 @@ mode2_4:
mov x8,#-2
sub x0,x0,#1
- add x10,x0,#-1
+ sub x10,x0,#1
ld1 {v0.8b},[x0],x8
add x5,x2,x3
diff --git a/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s b/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s
index 58b2d37..dcc9e43 100644
--- a/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s
+++ b/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s
@@ -147,13 +147,13 @@ prologue:
xtn v4.8b, v4.8h
shrn v5.8b, v2.8h,#5 //idx = pos >> 5
- dup v31.8b, v4.8b[0]
+ dup v31.8b, v4.b[0]
add x0,x2,x3
- umov w14, v5.2s[0] //(i row)extract idx to the r register
+ umov w14, v5.s[0] //(i row)extract idx to the r register
sxtw x14,w14
- dup v29.8b, v4.8b[1] //(ii)
+ dup v29.8b, v4.b[1] //(ii)
and x9,x14,#0xff //(i row) get the last byte
add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx]
@@ -177,11 +177,11 @@ prologue:
umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
asr x14,x14,#8 //(iv)
- dup v27.8b, v4.8b[2] //(iii)
+ dup v27.8b, v4.b[2] //(iii)
sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract)
and x9,x14,#0xff //(iv)
- dup v25.8b, v4.8b[3] //(iv)
+ dup v25.8b, v4.b[3] //(iv)
umull v14.8h, v12.8b, v28.8b //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx]
@@ -196,10 +196,10 @@ prologue:
ld1 {v21.8b},[x12] //(iv)ref_main_idx_1
- dup v31.8b, v4.8b[4] //(v)
+ dup v31.8b, v4.b[4] //(v)
umull v18.8h, v16.8b, v26.8b //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
- umov w14, v5.2s[1] //extract idx to the r register
+ umov w14, v5.s[1] //extract idx to the r register
sxtw x14,w14
umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
@@ -207,7 +207,7 @@ prologue:
rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5)
and x9,x14,#0xff //(v)
- dup v29.8b, v4.8b[5] //(vi)
+ dup v29.8b, v4.b[5] //(vi)
add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
ld1 {v23.8b},[x10],x11 //(v)ref_main_idx
@@ -224,7 +224,7 @@ prologue:
rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx]
- dup v27.8b, v4.8b[6] //(vii)
+ dup v27.8b, v4.b[6] //(vii)
asr x14,x14,#8 //(vii)
and x9,x14,#0xff //(vii)
@@ -241,7 +241,7 @@ prologue:
rshrn v22.8b, v22.8h,#5 //(iv)shift_res = vrshrn_n_u16(add_res, 5)
asr x14,x14,#8 //(viii)
- dup v25.8b, v4.8b[7] //(viii)
+ dup v25.8b, v4.b[7] //(viii)
and x9,x14,#0xff //(viii)
ld1 {v16.8b},[x10],x11 //(vii)ref_main_idx
@@ -279,14 +279,14 @@ prologue:
and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
xtn v4.8b, v4.8h
shrn v3.8b, v2.8h,#5 //idx = pos >> 5
- umov w14, v3.2s[0] //(i)extract idx to the r register
+ umov w14, v3.s[0] //(i)extract idx to the r register
sxtw x14,w14
and x9,x14,#0xff //(i)
add x10,x8,x9 //(i)*pu1_ref[ref_main_idx]
kernel_8_rows:
asr x14,x14,#8 //(ii)
- dup v31.8b, v4.8b[0]
+ dup v31.8b, v4.b[0]
subs x4,x4,#8
ld1 {v23.8b},[x10],x11 //(i)ref_main_idx
@@ -303,7 +303,7 @@ kernel_8_rows:
umlal v22.8h, v21.8b, v25.8b //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
asr x14,x14,#8 //(iii)
- dup v29.8b, v4.8b[1] //(ii)
+ dup v29.8b, v4.b[1] //(ii)
rshrn v18.8b, v18.8h,#5 //(vii)shift_res = vrshrn_n_u16(add_res, 5)
and x9,x14,#0xff //(iii)
@@ -319,11 +319,11 @@ kernel_8_rows:
umlal v10.8h, v9.8b, v31.8b //(i)vmull_u8(ref_main_idx_1, dup_const_fract)
and x9,x14,#0xff //(iv)
- umov w14, v3.2s[1] //extract idx to the r register
+ umov w14, v3.s[1] //extract idx to the r register
sxtw x14,w14
rshrn v22.8b, v22.8h,#5 //(viii)shift_res = vrshrn_n_u16(add_res, 5)
- dup v27.8b, v4.8b[2] //(iii)
+ dup v27.8b, v4.b[2] //(iii)
sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract)
csel x4, x5, x4,le //reload nt
@@ -337,7 +337,7 @@ kernel_8_rows:
ld1 {v17.8b},[x10] //(iii)ref_main_idx_1
rshrn v10.8b, v10.8h,#5 //(i)shift_res = vrshrn_n_u16(add_res, 5)
- dup v25.8b, v4.8b[3] //(iv)
+ dup v25.8b, v4.b[3] //(iv)
umull v2.8h, v5.8b, v0.8b //pos = ((row + 1) * intra_pred_ang)
st1 {v22.8b},[x0] //(viii)
@@ -351,7 +351,7 @@ kernel_8_rows:
umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
and x9,x14,#0xff //(v)
- dup v31.8b, v4.8b[4] //(v)
+ dup v31.8b, v4.b[4] //(v)
rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5)
add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
@@ -359,15 +359,15 @@ kernel_8_rows:
sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract)
asr x14,x14,#8 //(vi)
- dup v29.8b, v4.8b[5] //(vi)
+ dup v29.8b, v4.b[5] //(vi)
umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
and x9,x14,#0xff //(vi)
- dup v27.8b, v4.8b[6] //(vii)
+ dup v27.8b, v4.b[6] //(vii)
umlal v22.8h, v21.8b, v25.8b //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx]
- dup v25.8b, v4.8b[7] //(viii)
+ dup v25.8b, v4.b[7] //(viii)
rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
asr x14,x14,#8 //(vii)
@@ -390,7 +390,7 @@ kernel_8_rows:
ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
- umov w14, v3.2s[0] //(i)extract idx to the r register
+ umov w14, v3.s[0] //(i)extract idx to the r register
sxtw x14,w14
umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
add x12,x8,x9 //(viii)*pu1_ref[ref_main_idx]
diff --git a/common/arm64/ihevc_intra_pred_luma_planar.s b/common/arm64/ihevc_intra_pred_luma_planar.s
index ba04f42..ec9d3ca 100644
--- a/common/arm64/ihevc_intra_pred_luma_planar.s
+++ b/common/arm64/ihevc_intra_pred_luma_planar.s
@@ -186,10 +186,10 @@ col_loop_8_16_32:
ld1 {v3.8b},[x14] //(1-8)load 8 src[2nt+1+col]
umlal v27.8h, v17.8b, v1.8b //(1)(col+1) * src[3nt+1]
- dup v20.8b, v4.8b[7] //(1)
+ dup v20.8b, v4.b[7] //(1)
umlal v27.8h, v6.8b, v3.8b //(1)(nt-1-row) * src[2nt+1+col]
- dup v21.8b, v4.8b[6] //(2)
+ dup v21.8b, v4.b[6] //(2)
umlal v27.8h, v19.8b, v20.8b //(1)(nt-1-col) * src[2nt-1-row]
dup v30.8h,w4 //(2)
@@ -197,7 +197,7 @@ col_loop_8_16_32:
sub v6.8b, v6.8b , v7.8b //(1)
- dup v22.8b, v4.8b[5] //(3)
+ dup v22.8b, v4.b[5] //(3)
umlal v30.8h, v5.8b, v0.8b //(2)
dup v28.8h,w4 //(3)
@@ -214,7 +214,7 @@ col_loop_8_16_32:
xtn v27.8b, v27.8h //(1)
umlal v28.8h, v5.8b, v0.8b //(3)
- dup v23.8b, v4.8b[4] //(4)
+ dup v23.8b, v4.b[4] //(4)
umlal v28.8h, v17.8b, v1.8b //(3)
dup v25.8h,w4 //(4)
@@ -231,7 +231,7 @@ col_loop_8_16_32:
xtn v30.8b, v30.8h //(2)
umlal v25.8h, v5.8b, v0.8b //(4)
- dup v20.8b, v4.8b[3] //(5)
+ dup v20.8b, v4.b[3] //(5)
umlal v25.8h, v17.8b, v1.8b //(4)
dup v16.8h,w4 //(5)
@@ -248,7 +248,7 @@ col_loop_8_16_32:
xtn v28.8b, v28.8h //(3)
umlal v16.8h, v5.8b, v0.8b //(5)
- dup v21.8b, v4.8b[2] //(6)
+ dup v21.8b, v4.b[2] //(6)
umlal v16.8h, v17.8b, v1.8b //(5)
dup v18.8h,w4 //(6)
@@ -264,7 +264,7 @@ col_loop_8_16_32:
xtn v25.8b, v25.8h //(4)
umlal v18.8h, v5.8b, v0.8b //(6)
- dup v22.8b, v4.8b[1] //(7)
+ dup v22.8b, v4.b[1] //(7)
umlal v18.8h, v17.8b, v1.8b //(6)
dup v26.8h,w4 //(7)
@@ -281,7 +281,7 @@ col_loop_8_16_32:
xtn v16.8b, v16.8h //(5)
umlal v26.8h, v5.8b, v0.8b //(7)
- dup v23.8b, v4.8b[0] //(8)
+ dup v23.8b, v4.b[0] //(8)
umlal v26.8h, v17.8b, v1.8b //(7)
dup v24.8h,w4 //(8)
@@ -337,7 +337,7 @@ col_loop_8_16_32:
ld1 {v4.8b},[x6] //(1n)(1-8)src[2nt-1-row]
sub v19.8b, v2.8b , v17.8b //(1n)(1-8)[nt-1-col]
- dup v20.8b, v4.8b[7] //(1n)(1)
+ dup v20.8b, v4.b[7] //(1n)(1)
sub v6.8b, v2.8b , v5.8b
beq epilog
@@ -353,7 +353,7 @@ kernel_plnr:
xtn v24.8b, v24.8h //(8)
umlal v27.8h, v17.8b, v1.8b //(1)(col+1) * src[3nt+1]
- dup v21.8b, v4.8b[6] //(2)
+ dup v21.8b, v4.b[6] //(2)
umlal v27.8h, v6.8b, v3.8b //(1)(nt-1-row) * src[2nt+1+col]
dup v30.8h,w4 //(2)
@@ -373,7 +373,7 @@ kernel_plnr:
csel x2, x20, x2,le
umlal v30.8h, v17.8b, v1.8b //(2)
- dup v22.8b, v4.8b[5] //(3)
+ dup v22.8b, v4.b[5] //(3)
umlal v30.8h, v6.8b, v3.8b //(2)
dup v28.8h,w4 //(3)
@@ -390,7 +390,7 @@ kernel_plnr:
xtn v27.8b, v27.8h //(1)
umlal v28.8h, v5.8b, v0.8b //(3)
- dup v23.8b, v4.8b[4] //(4)
+ dup v23.8b, v4.b[4] //(4)
umlal v28.8h, v17.8b, v1.8b //(3)
dup v25.8h,w4 //(4)
@@ -408,7 +408,7 @@ kernel_plnr:
xtn v30.8b, v30.8h //(2)
umlal v25.8h, v5.8b, v0.8b //(4)
- dup v20.8b, v4.8b[3] //(5)
+ dup v20.8b, v4.b[3] //(5)
umlal v25.8h, v17.8b, v1.8b //(4)
dup v16.8h,w4 //(5)
@@ -426,7 +426,7 @@ kernel_plnr:
xtn v28.8b, v28.8h //(3)
umlal v16.8h, v5.8b, v0.8b //(5)
- dup v21.8b, v4.8b[2] //(6)
+ dup v21.8b, v4.b[2] //(6)
umlal v16.8h, v17.8b, v1.8b //(5)
dup v18.8h,w4 //(6)
@@ -450,7 +450,7 @@ kernel_plnr:
xtn v25.8b, v25.8h //(4)
umlal v18.8h, v5.8b, v0.8b //(6)
- dup v22.8b, v4.8b[1] //(7)
+ dup v22.8b, v4.b[1] //(7)
umlal v18.8h, v17.8b, v1.8b //(6)
dup v26.8h,w4 //(7)
@@ -473,7 +473,7 @@ kernel_plnr:
xtn v16.8b, v16.8h //(5)
umlal v26.8h, v5.8b, v0.8b //(7)
- dup v23.8b, v4.8b[0] //(8)
+ dup v23.8b, v4.b[0] //(8)
umlal v26.8h, v17.8b, v1.8b //(7)
dup v24.8h,w4 //(8)
@@ -495,7 +495,7 @@ kernel_plnr:
ld1 {v5.8b},[x5] //(row+1 value)
umlal v24.8h, v17.8b, v1.8b //(8)
- dup v20.8b, v4.8b[7] //(1n)(1)
+ dup v20.8b, v4.b[7] //(1n)(1)
umlal v24.8h, v6.8b, v3.8b //(8)
st1 {v18.8b},[x2], x3 //(6)str 8 values
diff --git a/common/arm64/ihevc_intra_pred_luma_vert.s b/common/arm64/ihevc_intra_pred_luma_vert.s
index c67f721..a8b111e 100644
--- a/common/arm64/ihevc_intra_pred_luma_vert.s
+++ b/common/arm64/ihevc_intra_pred_luma_vert.s
@@ -122,7 +122,7 @@ copy_32:
ld1 {v22.8b, v23.8b}, [x6] //16 loads (col 16:31)
lsl x11, x3, #2
- add x11, x11, #-16
+ sub x11, x11, #16
st1 {v20.8b, v21.8b}, [x2],#16
st1 {v20.8b, v21.8b}, [x5],#16
st1 {v20.8b, v21.8b}, [x8],#16
@@ -183,7 +183,7 @@ blk_16:
sxtw x12,w12
ld1 {v16.8b, v17.8b}, [x6] //ld for repl to cols src[2nt+1+col(0:15)] (0 ignored for stores)
- add x6, x6, #-17 //subtract -9 to take it to src[2nt-1-row(15)]
+    sub x6, x6, #17            //subtract 17 to take it to src[2nt-1-row(15)]
dup v24.16b,w12 //src[2nt+1]
dup v30.8h,w12
@@ -323,7 +323,7 @@ blk_4_8:
sxtw x12,w12
ld1 {v16.8b},[x6] //ld for repl to cols src[2nt+1+col(0:3 or 0:7)](0 ignored for st)
- add x6, x6, #-9 //subtract -9 to take it to src[2nt-1-row(15)]
+    sub x6, x6, #9             //subtract 9 to take it to src[2nt-1-row(15)]
dup v24.8b,w12 //src[2nt+1]
dup v30.8h,w12
diff --git a/common/arm64/ihevc_itrans_recon_16x16.s b/common/arm64/ihevc_itrans_recon_16x16.s
index 90df840..fe76678 100644
--- a/common/arm64/ihevc_itrans_recon_16x16.s
+++ b/common/arm64/ihevc_itrans_recon_16x16.s
@@ -252,56 +252,56 @@ first_stage_top_four_bottom_four:
//d7=x3
skip_load4rows:
- smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v7.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v7.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v12.4s, v10.4h, v0.4h[0]
- smlal v12.4s, v11.4h, v0.4h[2]
- smull v14.4s, v10.4h, v0.4h[0]
- smlal v14.4s, v11.4h, v1.4h[2]
- smull v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v2.4h[2]
- smull v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v3.4h[2]
+ smull v12.4s, v10.4h, v0.h[0]
+ smlal v12.4s, v11.4h, v0.h[2]
+ smull v14.4s, v10.4h, v0.h[0]
+ smlal v14.4s, v11.4h, v1.h[2]
+ smull v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v2.h[2]
+ smull v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v3.h[2]
bge skip_last12rows_kernel1
- smlal v24.4s, v8.4h, v1.4h[1]
- smlal v26.4s, v8.4h, v3.4h[3]
- smlsl v28.4s, v8.4h, v1.4h[3]
- smlsl v30.4s, v8.4h, v0.4h[3]
+ smlal v24.4s, v8.4h, v1.h[1]
+ smlal v26.4s, v8.4h, v3.h[3]
+ smlsl v28.4s, v8.4h, v1.h[3]
+ smlsl v30.4s, v8.4h, v0.h[3]
- smlal v24.4s, v9.4h, v1.4h[3]
- smlsl v26.4s, v9.4h, v2.4h[3]
- smlsl v28.4s, v9.4h, v0.4h[3]
- smlal v30.4s, v9.4h, v3.4h[3]
+ smlal v24.4s, v9.4h, v1.h[3]
+ smlsl v26.4s, v9.4h, v2.h[3]
+ smlsl v28.4s, v9.4h, v0.h[3]
+ smlal v30.4s, v9.4h, v3.h[3]
- smlal v12.4s, v4.4h, v1.4h[0]
- smlal v12.4s, v5.4h, v1.4h[2]
- smlal v14.4s, v4.4h, v3.4h[0]
- smlsl v14.4s, v5.4h, v3.4h[2]
- smlsl v16.4s, v4.4h, v3.4h[0]
- smlsl v16.4s, v5.4h, v0.4h[2]
- smlsl v18.4s, v4.4h, v1.4h[0]
- smlsl v18.4s, v5.4h, v2.4h[2]
+ smlal v12.4s, v4.4h, v1.h[0]
+ smlal v12.4s, v5.4h, v1.h[2]
+ smlal v14.4s, v4.4h, v3.h[0]
+ smlsl v14.4s, v5.4h, v3.h[2]
+ smlsl v16.4s, v4.4h, v3.h[0]
+ smlsl v16.4s, v5.4h, v0.h[2]
+ smlsl v18.4s, v4.4h, v1.h[0]
+ smlsl v18.4s, v5.4h, v2.h[2]
//d0[0]= 64 d2[0]=64
//d0[1]= 90 d2[1]=57
@@ -328,57 +328,57 @@ skip_load4rows:
- smlal v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
- smlsl v26.4s, v6.4h, v1.4h[1] //// y1 * cos3(part of b1)
- smlsl v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
- smlal v30.4s, v6.4h, v0.4h[1] //// y1 * sin1(part of b3)
+ smlal v24.4s, v6.4h, v2.h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v6.4h, v1.h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v6.4h, v3.h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v6.4h, v0.h[1] //// y1 * sin1(part of b3)
- smlal v24.4s, v7.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v7.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v7.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v7.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v24.4s, v8.4h, v3.4h[1]
- smlsl v26.4s, v8.4h, v1.4h[3]
- smlal v28.4s, v8.4h, v0.4h[1]
- smlsl v30.4s, v8.4h, v1.4h[1]
+ smlal v24.4s, v8.4h, v3.h[1]
+ smlsl v26.4s, v8.4h, v1.h[3]
+ smlal v28.4s, v8.4h, v0.h[1]
+ smlsl v30.4s, v8.4h, v1.h[1]
- smlal v24.4s, v9.4h, v3.4h[3]
- smlsl v26.4s, v9.4h, v3.4h[1]
- smlal v28.4s, v9.4h, v2.4h[3]
- smlsl v30.4s, v9.4h, v2.4h[1]
+ smlal v24.4s, v9.4h, v3.h[3]
+ smlsl v26.4s, v9.4h, v3.h[1]
+ smlal v28.4s, v9.4h, v2.h[3]
+ smlsl v30.4s, v9.4h, v2.h[1]
- smlal v12.4s, v10.4h, v0.4h[0]
- smlal v12.4s, v11.4h, v2.4h[2]
- smlal v12.4s, v4.4h, v3.4h[0]
- smlal v12.4s, v5.4h, v3.4h[2]
+ smlal v12.4s, v10.4h, v0.h[0]
+ smlal v12.4s, v11.4h, v2.h[2]
+ smlal v12.4s, v4.4h, v3.h[0]
+ smlal v12.4s, v5.4h, v3.h[2]
- smlsl v14.4s, v10.4h, v0.4h[0]
- smlsl v14.4s, v11.4h, v0.4h[2]
- smlsl v14.4s, v4.4h, v1.4h[0]
- smlsl v14.4s, v5.4h, v2.4h[2]
+ smlsl v14.4s, v10.4h, v0.h[0]
+ smlsl v14.4s, v11.4h, v0.h[2]
+ smlsl v14.4s, v4.4h, v1.h[0]
+ smlsl v14.4s, v5.4h, v2.h[2]
- smlsl v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v3.4h[2]
- smlal v16.4s, v4.4h, v1.4h[0]
- smlal v16.4s, v5.4h, v1.4h[2]
+ smlsl v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v3.h[2]
+ smlal v16.4s, v4.4h, v1.h[0]
+ smlal v16.4s, v5.4h, v1.h[2]
- smlal v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v1.4h[2]
- smlsl v18.4s, v4.4h, v3.4h[0]
- smlsl v18.4s, v5.4h, v0.4h[2]
+ smlal v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v1.h[2]
+ smlsl v18.4s, v4.4h, v3.h[0]
+ smlsl v18.4s, v5.4h, v0.h[2]
skip_last12rows_kernel1:
add v20.4s, v12.4s , v24.4s
@@ -430,55 +430,55 @@ first_stage_middle_eight:
skip_stage1_kernel_load:
- smull v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v6.4h, v2.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v6.4h, v3.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v6.4h, v2.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v2.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v3.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v3.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v7.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v7.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v7.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v3.4h[2]
- smull v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v2.4h[2]
- smull v16.4s, v10.4h, v0.4h[0]
- smlsl v16.4s, v11.4h, v1.4h[2]
- smull v18.4s, v10.4h, v0.4h[0]
- smlsl v18.4s, v11.4h, v0.4h[2]
+ smull v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v3.h[2]
+ smull v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v2.h[2]
+ smull v16.4s, v10.4h, v0.h[0]
+ smlsl v16.4s, v11.4h, v1.h[2]
+ smull v18.4s, v10.4h, v0.h[0]
+ smlsl v18.4s, v11.4h, v0.h[2]
cmp x11,x7
bge skip_last12rows_kernel2
- smlsl v24.4s, v8.4h, v3.4h[1]
- smlal v26.4s, v8.4h, v2.4h[1]
- smlal v28.4s, v8.4h, v0.4h[1]
- smlal v30.4s, v8.4h, v2.4h[3]
+ smlsl v24.4s, v8.4h, v3.h[1]
+ smlal v26.4s, v8.4h, v2.h[1]
+ smlal v28.4s, v8.4h, v0.h[1]
+ smlal v30.4s, v8.4h, v2.h[3]
- smlal v24.4s, v9.4h, v0.4h[1]
- smlal v26.4s, v9.4h, v3.4h[1]
- smlsl v28.4s, v9.4h, v1.4h[1]
- smlsl v30.4s, v9.4h, v2.4h[1]
+ smlal v24.4s, v9.4h, v0.h[1]
+ smlal v26.4s, v9.4h, v3.h[1]
+ smlsl v28.4s, v9.4h, v1.h[1]
+ smlsl v30.4s, v9.4h, v2.h[1]
- smlsl v22.4s, v4.4h, v1.4h[0]
- smlal v22.4s, v5.4h, v2.4h[2]
- smlsl v20.4s, v4.4h, v3.4h[0]
- smlal v20.4s, v5.4h, v0.4h[2]
- smlal v16.4s, v4.4h, v3.4h[0]
- smlal v16.4s, v5.4h, v3.4h[2]
- smlal v18.4s, v4.4h, v1.4h[0]
- smlsl v18.4s, v5.4h, v1.4h[2]
+ smlsl v22.4s, v4.4h, v1.h[0]
+ smlal v22.4s, v5.4h, v2.h[2]
+ smlsl v20.4s, v4.4h, v3.h[0]
+ smlal v20.4s, v5.4h, v0.h[2]
+ smlal v16.4s, v4.4h, v3.h[0]
+ smlal v16.4s, v5.4h, v3.h[2]
+ smlal v18.4s, v4.4h, v1.h[0]
+ smlsl v18.4s, v5.4h, v1.h[2]
//d0[0]= 64 d2[0]=64
//d0[1]= 90 d2[1]=57
@@ -502,55 +502,55 @@ skip_stage1_kernel_load:
ld1 {v9.4h},[x9],x5
- smlsl v24.4s, v6.4h, v3.4h[3] //// y1 * cos1(part of b0)
- smlsl v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smlal v28.4s, v6.4h, v2.4h[3] //// y1 * sin3(part of b2)
- smlal v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smlsl v24.4s, v6.4h, v3.h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v6.4h, v2.h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v24.4s, v8.4h, v2.4h[3]
- smlal v26.4s, v8.4h, v3.4h[3]
- smlsl v28.4s, v8.4h, v2.4h[1]
- smlal v30.4s, v8.4h, v0.4h[3]
+ smlal v24.4s, v8.4h, v2.h[3]
+ smlal v26.4s, v8.4h, v3.h[3]
+ smlsl v28.4s, v8.4h, v2.h[1]
+ smlal v30.4s, v8.4h, v0.h[3]
- smlal v24.4s, v9.4h, v1.4h[3]
- smlsl v26.4s, v9.4h, v1.4h[1]
- smlal v28.4s, v9.4h, v0.4h[3]
- smlsl v30.4s, v9.4h, v0.4h[1]
+ smlal v24.4s, v9.4h, v1.h[3]
+ smlsl v26.4s, v9.4h, v1.h[1]
+ smlal v28.4s, v9.4h, v0.h[3]
+ smlsl v30.4s, v9.4h, v0.h[1]
- smlal v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v1.4h[2]
- smlsl v22.4s, v4.4h, v3.4h[0]
- smlal v22.4s, v5.4h, v0.4h[2]
+ smlal v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v1.h[2]
+ smlsl v22.4s, v4.4h, v3.h[0]
+ smlal v22.4s, v5.4h, v0.h[2]
- smlsl v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v3.4h[2]
- smlal v20.4s, v4.4h, v1.4h[0]
- smlsl v20.4s, v5.4h, v1.4h[2]
+ smlsl v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v3.h[2]
+ smlal v20.4s, v4.4h, v1.h[0]
+ smlsl v20.4s, v5.4h, v1.h[2]
- smlsl v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v0.4h[2]
- smlsl v16.4s, v4.4h, v1.4h[0]
- smlal v16.4s, v5.4h, v2.4h[2]
+ smlsl v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v0.h[2]
+ smlsl v16.4s, v4.4h, v1.h[0]
+ smlal v16.4s, v5.4h, v2.h[2]
- smlal v18.4s, v10.4h, v0.4h[0]
- smlsl v18.4s, v11.4h, v2.4h[2]
- smlal v18.4s, v4.4h, v3.4h[0]
- smlsl v18.4s, v5.4h, v3.4h[2]
+ smlal v18.4s, v10.4h, v0.h[0]
+ smlsl v18.4s, v11.4h, v2.h[2]
+ smlal v18.4s, v4.4h, v3.h[0]
+ smlsl v18.4s, v5.4h, v3.h[2]
skip_last12rows_kernel2:
@@ -755,48 +755,48 @@ second_stage:
second_stage_process:
- smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v7.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v7.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v12.4s, v10.4h, v0.4h[0]
- smlal v12.4s, v11.4h, v0.4h[2]
- smull v14.4s, v10.4h, v0.4h[0]
- smlal v14.4s, v11.4h, v1.4h[2]
- smull v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v2.4h[2]
- smull v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v3.4h[2]
+ smull v12.4s, v10.4h, v0.h[0]
+ smlal v12.4s, v11.4h, v0.h[2]
+ smull v14.4s, v10.4h, v0.h[0]
+ smlal v14.4s, v11.4h, v1.h[2]
+ smull v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v2.h[2]
+ smull v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v3.h[2]
bge skip_last8rows_stage2_kernel1
- smlal v24.4s, v8.4h, v1.4h[1]
- smlal v26.4s, v8.4h, v3.4h[3]
- smlsl v28.4s, v8.4h, v1.4h[3]
- smlsl v30.4s, v8.4h, v0.4h[3]
+ smlal v24.4s, v8.4h, v1.h[1]
+ smlal v26.4s, v8.4h, v3.h[3]
+ smlsl v28.4s, v8.4h, v1.h[3]
+ smlsl v30.4s, v8.4h, v0.h[3]
- smlal v24.4s, v9.4h, v1.4h[3]
- smlsl v26.4s, v9.4h, v2.4h[3]
- smlsl v28.4s, v9.4h, v0.4h[3]
- smlal v30.4s, v9.4h, v3.4h[3]
+ smlal v24.4s, v9.4h, v1.h[3]
+ smlsl v26.4s, v9.4h, v2.h[3]
+ smlsl v28.4s, v9.4h, v0.h[3]
+ smlal v30.4s, v9.4h, v3.h[3]
- smlal v12.4s, v4.4h, v1.4h[0]
- smlal v12.4s, v5.4h, v1.4h[2]
- smlal v14.4s, v4.4h, v3.4h[0]
- smlsl v14.4s, v5.4h, v3.4h[2]
- smlsl v16.4s, v4.4h, v3.4h[0]
- smlsl v16.4s, v5.4h, v0.4h[2]
- smlsl v18.4s, v4.4h, v1.4h[0]
- smlsl v18.4s, v5.4h, v2.4h[2]
+ smlal v12.4s, v4.4h, v1.h[0]
+ smlal v12.4s, v5.4h, v1.h[2]
+ smlal v14.4s, v4.4h, v3.h[0]
+ smlsl v14.4s, v5.4h, v3.h[2]
+ smlsl v16.4s, v4.4h, v3.h[0]
+ smlsl v16.4s, v5.4h, v0.h[2]
+ smlsl v18.4s, v4.4h, v1.h[0]
+ smlsl v18.4s, v5.4h, v2.h[2]
mov x19,#0xff00
cmp x12,x19
@@ -812,57 +812,57 @@ second_stage_process:
- smlal v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
- smlsl v26.4s, v6.4h, v1.4h[1] //// y1 * cos3(part of b1)
- smlsl v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
- smlal v30.4s, v6.4h, v0.4h[1] //// y1 * sin1(part of b3)
+ smlal v24.4s, v6.4h, v2.h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v6.4h, v1.h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v6.4h, v3.h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v6.4h, v0.h[1] //// y1 * sin1(part of b3)
- smlal v24.4s, v7.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v7.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v7.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v7.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v24.4s, v8.4h, v3.4h[1]
- smlsl v26.4s, v8.4h, v1.4h[3]
- smlal v28.4s, v8.4h, v0.4h[1]
- smlsl v30.4s, v8.4h, v1.4h[1]
+ smlal v24.4s, v8.4h, v3.h[1]
+ smlsl v26.4s, v8.4h, v1.h[3]
+ smlal v28.4s, v8.4h, v0.h[1]
+ smlsl v30.4s, v8.4h, v1.h[1]
- smlal v24.4s, v9.4h, v3.4h[3]
- smlsl v26.4s, v9.4h, v3.4h[1]
- smlal v28.4s, v9.4h, v2.4h[3]
- smlsl v30.4s, v9.4h, v2.4h[1]
+ smlal v24.4s, v9.4h, v3.h[3]
+ smlsl v26.4s, v9.4h, v3.h[1]
+ smlal v28.4s, v9.4h, v2.h[3]
+ smlsl v30.4s, v9.4h, v2.h[1]
- smlal v12.4s, v10.4h, v0.4h[0]
- smlal v12.4s, v11.4h, v2.4h[2]
- smlal v12.4s, v4.4h, v3.4h[0]
- smlal v12.4s, v5.4h, v3.4h[2]
+ smlal v12.4s, v10.4h, v0.h[0]
+ smlal v12.4s, v11.4h, v2.h[2]
+ smlal v12.4s, v4.4h, v3.h[0]
+ smlal v12.4s, v5.4h, v3.h[2]
- smlsl v14.4s, v10.4h, v0.4h[0]
- smlsl v14.4s, v11.4h, v0.4h[2]
- smlsl v14.4s, v4.4h, v1.4h[0]
- smlsl v14.4s, v5.4h, v2.4h[2]
+ smlsl v14.4s, v10.4h, v0.h[0]
+ smlsl v14.4s, v11.4h, v0.h[2]
+ smlsl v14.4s, v4.4h, v1.h[0]
+ smlsl v14.4s, v5.4h, v2.h[2]
- smlsl v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v3.4h[2]
- smlal v16.4s, v4.4h, v1.4h[0]
- smlal v16.4s, v5.4h, v1.4h[2]
+ smlsl v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v3.h[2]
+ smlal v16.4s, v4.4h, v1.h[0]
+ smlal v16.4s, v5.4h, v1.h[2]
- smlal v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v1.4h[2]
- smlsl v18.4s, v4.4h, v3.4h[0]
- smlsl v18.4s, v5.4h, v0.4h[2]
+ smlal v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v1.h[2]
+ smlsl v18.4s, v4.4h, v3.h[0]
+ smlsl v18.4s, v5.4h, v0.h[2]
@@ -914,25 +914,25 @@ skip_stage2_kernel_load:
st1 {v18.4h, v19.4h},[x1],#16
sub x1,x1,#32
- smull v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v6.4h, v2.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v6.4h, v3.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v6.4h, v2.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v2.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v3.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v3.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v7.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v7.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v7.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v3.4h[2]
- smull v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v2.4h[2]
- smull v16.4s, v10.4h, v0.4h[0]
- smlsl v16.4s, v11.4h, v1.4h[2]
- smull v18.4s, v10.4h, v0.4h[0]
- smlsl v18.4s, v11.4h, v0.4h[2]
+ smull v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v3.h[2]
+ smull v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v2.h[2]
+ smull v16.4s, v10.4h, v0.h[0]
+ smlsl v16.4s, v11.4h, v1.h[2]
+ smull v18.4s, v10.4h, v0.h[0]
+ smlsl v18.4s, v11.4h, v0.h[2]
@@ -940,27 +940,27 @@ skip_stage2_kernel_load:
bge skip_last8rows_stage2_kernel2
- smlsl v24.4s, v8.4h, v3.4h[1]
- smlal v26.4s, v8.4h, v2.4h[1]
- smlal v28.4s, v8.4h, v0.4h[1]
- smlal v30.4s, v8.4h, v2.4h[3]
+ smlsl v24.4s, v8.4h, v3.h[1]
+ smlal v26.4s, v8.4h, v2.h[1]
+ smlal v28.4s, v8.4h, v0.h[1]
+ smlal v30.4s, v8.4h, v2.h[3]
- smlal v24.4s, v9.4h, v0.4h[1]
- smlal v26.4s, v9.4h, v3.4h[1]
- smlsl v28.4s, v9.4h, v1.4h[1]
- smlsl v30.4s, v9.4h, v2.4h[1]
+ smlal v24.4s, v9.4h, v0.h[1]
+ smlal v26.4s, v9.4h, v3.h[1]
+ smlsl v28.4s, v9.4h, v1.h[1]
+ smlsl v30.4s, v9.4h, v2.h[1]
- smlsl v22.4s, v4.4h, v1.4h[0]
- smlal v22.4s, v5.4h, v2.4h[2]
- smlsl v20.4s, v4.4h, v3.4h[0]
- smlal v20.4s, v5.4h, v0.4h[2]
- smlal v16.4s, v4.4h, v3.4h[0]
- smlal v16.4s, v5.4h, v3.4h[2]
- smlal v18.4s, v4.4h, v1.4h[0]
- smlsl v18.4s, v5.4h, v1.4h[2]
+ smlsl v22.4s, v4.4h, v1.h[0]
+ smlal v22.4s, v5.4h, v2.h[2]
+ smlsl v20.4s, v4.4h, v3.h[0]
+ smlal v20.4s, v5.4h, v0.h[2]
+ smlal v16.4s, v4.4h, v3.h[0]
+ smlal v16.4s, v5.4h, v3.h[2]
+ smlal v18.4s, v4.4h, v1.h[0]
+ smlsl v18.4s, v5.4h, v1.h[2]
mov x19,#0xff00
cmp x12,x19
bge skip_last8rows_stage2_kernel2
@@ -970,55 +970,55 @@ skip_stage2_kernel_load:
ld1 {v4.4h, v5.4h},[x0],#16
ld1 {v8.4h, v9.4h},[x0],#16
- smlsl v24.4s, v6.4h, v3.4h[3] //// y1 * cos1(part of b0)
- smlsl v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smlal v28.4s, v6.4h, v2.4h[3] //// y1 * sin3(part of b2)
- smlal v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smlsl v24.4s, v6.4h, v3.h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v6.4h, v2.h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v24.4s, v8.4h, v2.4h[3]
- smlal v26.4s, v8.4h, v3.4h[3]
- smlsl v28.4s, v8.4h, v2.4h[1]
- smlal v30.4s, v8.4h, v0.4h[3]
+ smlal v24.4s, v8.4h, v2.h[3]
+ smlal v26.4s, v8.4h, v3.h[3]
+ smlsl v28.4s, v8.4h, v2.h[1]
+ smlal v30.4s, v8.4h, v0.h[3]
- smlal v24.4s, v9.4h, v1.4h[3]
- smlsl v26.4s, v9.4h, v1.4h[1]
- smlal v28.4s, v9.4h, v0.4h[3]
- smlsl v30.4s, v9.4h, v0.4h[1]
+ smlal v24.4s, v9.4h, v1.h[3]
+ smlsl v26.4s, v9.4h, v1.h[1]
+ smlal v28.4s, v9.4h, v0.h[3]
+ smlsl v30.4s, v9.4h, v0.h[1]
- smlal v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v1.4h[2]
- smlsl v22.4s, v4.4h, v3.4h[0]
- smlal v22.4s, v5.4h, v0.4h[2]
+ smlal v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v1.h[2]
+ smlsl v22.4s, v4.4h, v3.h[0]
+ smlal v22.4s, v5.4h, v0.h[2]
- smlsl v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v3.4h[2]
- smlal v20.4s, v4.4h, v1.4h[0]
- smlsl v20.4s, v5.4h, v1.4h[2]
+ smlsl v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v3.h[2]
+ smlal v20.4s, v4.4h, v1.h[0]
+ smlsl v20.4s, v5.4h, v1.h[2]
- smlsl v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v0.4h[2]
- smlsl v16.4s, v4.4h, v1.4h[0]
- smlal v16.4s, v5.4h, v2.4h[2]
+ smlsl v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v0.h[2]
+ smlsl v16.4s, v4.4h, v1.h[0]
+ smlal v16.4s, v5.4h, v2.h[2]
- smlal v18.4s, v10.4h, v0.4h[0]
- smlsl v18.4s, v11.4h, v2.4h[2]
- smlal v18.4s, v4.4h, v3.4h[0]
- smlsl v18.4s, v5.4h, v3.4h[2]
+ smlal v18.4s, v10.4h, v0.h[0]
+ smlsl v18.4s, v11.4h, v2.h[2]
+ smlal v18.4s, v4.4h, v3.h[0]
+ smlsl v18.4s, v5.4h, v3.h[2]
skip_last8rows_stage2_kernel2:
diff --git a/common/arm64/ihevc_itrans_recon_32x32.s b/common/arm64/ihevc_itrans_recon_32x32.s
index 6f40747..51646ac 100644
--- a/common/arm64/ihevc_itrans_recon_32x32.s
+++ b/common/arm64/ihevc_itrans_recon_32x32.s
@@ -213,32 +213,32 @@ stage1:
ld1 {v11.4h},[x0],x6
ld1 {v9.4h},[x0],x6
- smull v24.4s, v8.4h, v0.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v8.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v8.4h, v0.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v9.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v10.4h, v0.4h[0]
- smlal v20.4s, v11.4h, v0.4h[2]
+ smull v20.4s, v10.4h, v0.h[0]
+ smlal v20.4s, v11.4h, v0.h[2]
- smull v22.4s, v10.4h, v0.4h[0]
- smlal v22.4s, v11.4h, v1.4h[2]
+ smull v22.4s, v10.4h, v0.h[0]
+ smlal v22.4s, v11.4h, v1.h[2]
- smull v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v2.4h[2]
+ smull v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v2.h[2]
- smull v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v3.4h[2]
+ smull v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v3.h[2]
cmp x11,x10
bhs shift1
@@ -253,26 +253,26 @@ stage1:
- smlal v24.4s, v14.4h, v1.4h[1]
- smlal v26.4s, v14.4h, v3.4h[3]
- smlal v28.4s, v14.4h, v6.4h[1]
- smlsl v30.4s, v14.4h, v7.4h[1]
+ smlal v24.4s, v14.4h, v1.h[1]
+ smlal v26.4s, v14.4h, v3.h[3]
+ smlal v28.4s, v14.4h, v6.h[1]
+ smlsl v30.4s, v14.4h, v7.h[1]
- smlal v24.4s, v15.4h, v1.4h[3]
- smlal v26.4s, v15.4h, v5.4h[1]
- smlsl v28.4s, v15.4h, v7.4h[1]
- smlsl v30.4s, v15.4h, v3.4h[3]
+ smlal v24.4s, v15.4h, v1.h[3]
+ smlal v26.4s, v15.4h, v5.h[1]
+ smlsl v28.4s, v15.4h, v7.h[1]
+ smlsl v30.4s, v15.4h, v3.h[3]
- smlal v20.4s, v12.4h, v1.4h[0]
- smlal v20.4s, v13.4h, v1.4h[2]
- smlal v22.4s, v12.4h, v3.4h[0]
- smlal v22.4s, v13.4h, v4.4h[2]
- smlal v16.4s, v12.4h, v5.4h[0]
- smlal v16.4s, v13.4h, v7.4h[2]
- smlal v18.4s, v12.4h, v7.4h[0]
- smlsl v18.4s, v13.4h, v5.4h[2]
+ smlal v20.4s, v12.4h, v1.h[0]
+ smlal v20.4s, v13.4h, v1.h[2]
+ smlal v22.4s, v12.4h, v3.h[0]
+ smlal v22.4s, v13.4h, v4.h[2]
+ smlal v16.4s, v12.4h, v5.h[0]
+ smlal v16.4s, v13.4h, v7.h[2]
+ smlal v18.4s, v12.4h, v7.h[0]
+ smlsl v18.4s, v13.4h, v5.h[2]
cmp x11,x9
bhs shift1
@@ -283,32 +283,32 @@ stage1:
ld1 {v9.4h},[x0],x6
- smlal v24.4s, v8.4h, v2.4h[1] //// y1 * cos1(part of b0)
- smlal v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
- smlsl v28.4s, v8.4h, v4.4h[3] //// y1 * sin3(part of b2)
- smlsl v30.4s, v8.4h, v0.4h[1] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0)
+ smlal v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2)
+ smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v7.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v2.4h[0]
- smlal v20.4s, v11.4h, v2.4h[2]
+ smlal v20.4s, v10.4h, v2.h[0]
+ smlal v20.4s, v11.4h, v2.h[2]
- smlal v22.4s, v10.4h, v6.4h[0]
- smlal v22.4s, v11.4h, v7.4h[2]
+ smlal v22.4s, v10.4h, v6.h[0]
+ smlal v22.4s, v11.4h, v7.h[2]
- smlsl v16.4s, v10.4h, v6.4h[0]
- smlsl v16.4s, v11.4h, v3.4h[2]
+ smlsl v16.4s, v10.4h, v6.h[0]
+ smlsl v16.4s, v11.4h, v3.h[2]
- smlsl v18.4s, v10.4h, v2.4h[0]
- smlsl v18.4s, v11.4h, v1.4h[2]
+ smlsl v18.4s, v10.4h, v2.h[0]
+ smlsl v18.4s, v11.4h, v1.h[2]
cmp x11,x5
bhs shift1
@@ -327,26 +327,26 @@ stage1:
- smlal v24.4s, v14.4h, v3.4h[1]
- smlsl v26.4s, v14.4h, v6.4h[1]
- smlsl v28.4s, v14.4h, v0.4h[1]
- smlsl v30.4s, v14.4h, v6.4h[3]
+ smlal v24.4s, v14.4h, v3.h[1]
+ smlsl v26.4s, v14.4h, v6.h[1]
+ smlsl v28.4s, v14.4h, v0.h[1]
+ smlsl v30.4s, v14.4h, v6.h[3]
- smlal v24.4s, v15.4h, v3.4h[3]
- smlsl v26.4s, v15.4h, v4.4h[3]
- smlsl v28.4s, v15.4h, v2.4h[3]
- smlal v30.4s, v15.4h, v5.4h[3]
+ smlal v24.4s, v15.4h, v3.h[3]
+ smlsl v26.4s, v15.4h, v4.h[3]
+ smlsl v28.4s, v15.4h, v2.h[3]
+ smlal v30.4s, v15.4h, v5.h[3]
- smlal v20.4s, v12.4h, v3.4h[0]
- smlal v20.4s, v13.4h, v3.4h[2]
- smlsl v22.4s, v12.4h, v7.4h[0]
- smlsl v22.4s, v13.4h, v5.4h[2]
- smlsl v16.4s, v12.4h, v1.4h[0]
- smlsl v16.4s, v13.4h, v1.4h[2]
- smlsl v18.4s, v12.4h, v5.4h[0]
- smlal v18.4s, v13.4h, v7.4h[2]
+ smlal v20.4s, v12.4h, v3.h[0]
+ smlal v20.4s, v13.4h, v3.h[2]
+ smlsl v22.4s, v12.4h, v7.h[0]
+ smlsl v22.4s, v13.4h, v5.h[2]
+ smlsl v16.4s, v12.4h, v1.h[0]
+ smlsl v16.4s, v13.4h, v1.h[2]
+ smlsl v18.4s, v12.4h, v5.h[0]
+ smlal v18.4s, v13.4h, v7.h[2]
cmp x11,x7
bhs shift1
@@ -359,32 +359,32 @@ stage1:
- smlal v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v3.4h[1] //// y1 * cos3(part of b1)
- smlsl v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v2.4h[1] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v4.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v7.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v0.4h[0]
- smlal v20.4s, v11.4h, v4.4h[2]
+ smlal v20.4s, v10.4h, v0.h[0]
+ smlal v20.4s, v11.4h, v4.h[2]
- smlsl v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v2.4h[2]
+ smlsl v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v2.h[2]
- smlsl v16.4s, v10.4h, v0.4h[0]
- smlsl v16.4s, v11.4h, v6.4h[2]
+ smlsl v16.4s, v10.4h, v0.h[0]
+ smlsl v16.4s, v11.4h, v6.h[2]
- smlal v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v0.4h[2]
+ smlal v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v0.h[2]
@@ -396,26 +396,26 @@ stage1:
- smlal v24.4s, v14.4h, v5.4h[1]
- smlsl v26.4s, v14.4h, v0.4h[2]
- smlal v28.4s, v14.4h, v5.4h[3]
- smlal v30.4s, v14.4h, v4.4h[3]
+ smlal v24.4s, v14.4h, v5.h[1]
+ smlsl v26.4s, v14.4h, v0.h[2]
+ smlal v28.4s, v14.4h, v5.h[3]
+ smlal v30.4s, v14.4h, v4.h[3]
- smlal v24.4s, v15.4h, v5.4h[3]
- smlsl v26.4s, v15.4h, v1.4h[1]
- smlal v28.4s, v15.4h, v3.4h[1]
- smlsl v30.4s, v15.4h, v7.4h[3]
+ smlal v24.4s, v15.4h, v5.h[3]
+ smlsl v26.4s, v15.4h, v1.h[1]
+ smlal v28.4s, v15.4h, v3.h[1]
+ smlsl v30.4s, v15.4h, v7.h[3]
- smlal v20.4s, v12.4h, v5.4h[0]
- smlal v20.4s, v13.4h, v5.4h[2]
- smlsl v22.4s, v12.4h, v1.4h[0]
- smlsl v22.4s, v13.4h, v0.4h[2]
- smlal v16.4s, v12.4h, v7.4h[0]
- smlal v16.4s, v13.4h, v4.4h[2]
- smlal v18.4s, v12.4h, v3.4h[0]
- smlal v18.4s, v13.4h, v6.4h[2]
+ smlal v20.4s, v12.4h, v5.h[0]
+ smlal v20.4s, v13.4h, v5.h[2]
+ smlsl v22.4s, v12.4h, v1.h[0]
+ smlsl v22.4s, v13.4h, v0.h[2]
+ smlal v16.4s, v12.4h, v7.h[0]
+ smlal v16.4s, v13.4h, v4.h[2]
+ smlal v18.4s, v12.4h, v3.h[0]
+ smlal v18.4s, v13.4h, v6.h[2]
ld1 {v10.4h},[x0],x6
@@ -429,32 +429,32 @@ stage1:
- smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1)
- smlal v28.4s, v8.4h, v0.4h[1] //// y1 * sin3(part of b2)
- smlsl v30.4s, v8.4h, v4.4h[1] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v0.h[1] //// y1 * sin3(part of b2)
+ smlsl v30.4s, v8.4h, v4.h[1] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v9.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v0.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v6.4h[0]
- smlal v20.4s, v11.4h, v6.4h[2]
+ smlal v20.4s, v10.4h, v6.h[0]
+ smlal v20.4s, v11.4h, v6.h[2]
- smlsl v22.4s, v10.4h, v2.4h[0]
- smlsl v22.4s, v11.4h, v3.4h[2]
+ smlsl v22.4s, v10.4h, v2.h[0]
+ smlsl v22.4s, v11.4h, v3.h[2]
- smlal v16.4s, v10.4h, v2.4h[0]
- smlal v16.4s, v11.4h, v0.4h[2]
+ smlal v16.4s, v10.4h, v2.h[0]
+ smlal v16.4s, v11.4h, v0.h[2]
- smlsl v18.4s, v10.4h, v6.4h[0]
- smlsl v18.4s, v11.4h, v2.4h[2]
+ smlsl v18.4s, v10.4h, v6.h[0]
+ smlsl v18.4s, v11.4h, v2.h[2]
ld1 {v12.4h},[x0],x6
ld1 {v14.4h},[x0],x6
@@ -462,26 +462,26 @@ stage1:
ld1 {v15.4h},[x0],x6
- smlal v24.4s, v14.4h, v7.4h[1]
- smlsl v26.4s, v14.4h, v5.4h[3]
- smlal v28.4s, v14.4h, v4.4h[1]
- smlsl v30.4s, v14.4h, v2.4h[3]
+ smlal v24.4s, v14.4h, v7.h[1]
+ smlsl v26.4s, v14.4h, v5.h[3]
+ smlal v28.4s, v14.4h, v4.h[1]
+ smlsl v30.4s, v14.4h, v2.h[3]
- smlal v24.4s, v15.4h, v7.4h[3]
- smlsl v26.4s, v15.4h, v7.4h[1]
- smlal v28.4s, v15.4h, v6.4h[3]
- smlsl v30.4s, v15.4h, v6.4h[1]
+ smlal v24.4s, v15.4h, v7.h[3]
+ smlsl v26.4s, v15.4h, v7.h[1]
+ smlal v28.4s, v15.4h, v6.h[3]
+ smlsl v30.4s, v15.4h, v6.h[1]
- smlal v20.4s, v12.4h, v7.4h[0]
- smlal v20.4s, v13.4h, v7.4h[2]
- smlsl v22.4s, v12.4h, v5.4h[0]
- smlsl v22.4s, v13.4h, v6.4h[2]
- smlal v16.4s, v12.4h, v3.4h[0]
- smlal v16.4s, v13.4h, v5.4h[2]
- smlsl v18.4s, v12.4h, v1.4h[0]
- smlsl v18.4s, v13.4h, v4.4h[2]
+ smlal v20.4s, v12.4h, v7.h[0]
+ smlal v20.4s, v13.4h, v7.h[2]
+ smlsl v22.4s, v12.4h, v5.h[0]
+ smlsl v22.4s, v13.4h, v6.h[2]
+ smlal v16.4s, v12.4h, v3.h[0]
+ smlal v16.4s, v13.4h, v5.h[2]
+ smlsl v18.4s, v12.4h, v1.h[0]
+ smlsl v18.4s, v13.4h, v4.h[2]
@@ -574,32 +574,32 @@ shift1:
- smull v24.4s, v8.4h, v2.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v8.4h, v3.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v7.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v4.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v10.4h, v0.4h[0]
- smlal v20.4s, v11.4h, v4.4h[2]
+ smull v20.4s, v10.4h, v0.h[0]
+ smlal v20.4s, v11.4h, v4.h[2]
- smull v22.4s, v10.4h, v0.4h[0]
- smlal v22.4s, v11.4h, v5.4h[2]
+ smull v22.4s, v10.4h, v0.h[0]
+ smlal v22.4s, v11.4h, v5.h[2]
- smull v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v6.4h[2]
+ smull v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v6.h[2]
- smull v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v7.4h[2]
+ smull v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v7.h[2]
cmp x11,x10
bhs shift2
@@ -609,26 +609,26 @@ shift1:
ld1 {v15.4h},[x0],x6
- smlsl v24.4s, v14.4h, v4.4h[3]
- smlsl v26.4s, v14.4h, v2.4h[1]
- smlsl v28.4s, v14.4h, v0.4h[1]
- smlsl v30.4s, v14.4h, v2.4h[3]
+ smlsl v24.4s, v14.4h, v4.h[3]
+ smlsl v26.4s, v14.4h, v2.h[1]
+ smlsl v28.4s, v14.4h, v0.h[1]
+ smlsl v30.4s, v14.4h, v2.h[3]
- smlsl v24.4s, v15.4h, v0.4h[3]
- smlsl v26.4s, v15.4h, v3.4h[1]
- smlsl v28.4s, v15.4h, v6.4h[3]
- smlal v30.4s, v15.4h, v5.4h[3]
+ smlsl v24.4s, v15.4h, v0.h[3]
+ smlsl v26.4s, v15.4h, v3.h[1]
+ smlsl v28.4s, v15.4h, v6.h[3]
+ smlal v30.4s, v15.4h, v5.h[3]
- smlsl v20.4s, v12.4h, v7.4h[0]
- smlsl v20.4s, v13.4h, v2.4h[2]
- smlsl v22.4s, v12.4h, v5.4h[0]
- smlsl v22.4s, v13.4h, v0.4h[2]
- smlsl v16.4s, v12.4h, v3.4h[0]
- smlsl v16.4s, v13.4h, v3.4h[2]
- smlsl v18.4s, v12.4h, v1.4h[0]
- smlsl v18.4s, v13.4h, v6.4h[2]
+ smlsl v20.4s, v12.4h, v7.h[0]
+ smlsl v20.4s, v13.4h, v2.h[2]
+ smlsl v22.4s, v12.4h, v5.h[0]
+ smlsl v22.4s, v13.4h, v0.h[2]
+ smlsl v16.4s, v12.4h, v3.h[0]
+ smlsl v16.4s, v13.4h, v3.h[2]
+ smlsl v18.4s, v12.4h, v1.h[0]
+ smlsl v18.4s, v13.4h, v6.h[2]
cmp x11,x9
bhs shift2
@@ -645,32 +645,32 @@ shift1:
- smlsl v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
- smlal v26.4s, v8.4h, v7.4h[1] //// y1 * cos3(part of b1)
- smlal v28.4s, v8.4h, v2.4h[3] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smlsl v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
+ smlal v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v2.h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v7.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v6.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v6.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
- smlsl v20.4s, v10.4h, v2.4h[0]
- smlsl v20.4s, v11.4h, v6.4h[2]
+ smlsl v20.4s, v10.4h, v2.h[0]
+ smlsl v20.4s, v11.4h, v6.h[2]
- smlsl v22.4s, v10.4h, v6.4h[0]
- smlal v22.4s, v11.4h, v4.4h[2]
+ smlsl v22.4s, v10.4h, v6.h[0]
+ smlal v22.4s, v11.4h, v4.h[2]
- smlal v16.4s, v10.4h, v6.4h[0]
- smlal v16.4s, v11.4h, v0.4h[2]
+ smlal v16.4s, v10.4h, v6.h[0]
+ smlal v16.4s, v11.4h, v0.h[2]
- smlal v18.4s, v10.4h, v2.4h[0]
- smlal v18.4s, v11.4h, v5.4h[2]
+ smlal v18.4s, v10.4h, v2.h[0]
+ smlal v18.4s, v11.4h, v5.h[2]
cmp x11,x5
bhs shift2
@@ -685,26 +685,26 @@ shift1:
- smlal v24.4s, v14.4h, v2.4h[3]
- smlal v26.4s, v14.4h, v3.4h[3]
- smlsl v28.4s, v14.4h, v5.4h[3]
- smlsl v30.4s, v14.4h, v0.4h[3]
+ smlal v24.4s, v14.4h, v2.h[3]
+ smlal v26.4s, v14.4h, v3.h[3]
+ smlsl v28.4s, v14.4h, v5.h[3]
+ smlsl v30.4s, v14.4h, v0.h[3]
- smlal v24.4s, v15.4h, v1.4h[3]
- smlsl v26.4s, v15.4h, v6.4h[3]
- smlsl v28.4s, v15.4h, v0.4h[3]
- smlal v30.4s, v15.4h, v7.4h[3]
+ smlal v24.4s, v15.4h, v1.h[3]
+ smlsl v26.4s, v15.4h, v6.h[3]
+ smlsl v28.4s, v15.4h, v0.h[3]
+ smlal v30.4s, v15.4h, v7.h[3]
- smlal v20.4s, v12.4h, v5.4h[0]
- smlal v20.4s, v13.4h, v0.4h[2]
- smlal v22.4s, v12.4h, v1.4h[0]
- smlal v22.4s, v13.4h, v6.4h[2]
- smlal v16.4s, v12.4h, v7.4h[0]
- smlsl v16.4s, v13.4h, v2.4h[2]
- smlsl v18.4s, v12.4h, v3.4h[0]
- smlsl v18.4s, v13.4h, v4.4h[2]
+ smlal v20.4s, v12.4h, v5.h[0]
+ smlal v20.4s, v13.4h, v0.h[2]
+ smlal v22.4s, v12.4h, v1.h[0]
+ smlal v22.4s, v13.4h, v6.h[2]
+ smlal v16.4s, v12.4h, v7.h[0]
+ smlsl v16.4s, v13.4h, v2.h[2]
+ smlsl v18.4s, v12.4h, v3.h[0]
+ smlsl v18.4s, v13.4h, v4.h[2]
cmp x11,x7
@@ -722,32 +722,32 @@ shift1:
- smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v1.4h[1] //// y1 * cos3(part of b1)
- smlsl v28.4s, v8.4h, v7.4h[1] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v0.4h[3] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v1.h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v0.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v5.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v9.4h, v7.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v5.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v7.4h[2]
+ smlal v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v7.h[2]
- smlsl v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v1.4h[2]
+ smlsl v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v1.h[2]
- smlsl v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v5.4h[2]
+ smlsl v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v5.h[2]
- smlal v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v3.4h[2]
+ smlal v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v3.h[2]
@@ -757,26 +757,26 @@ shift1:
ld1 {v15.4h},[x0],x6
- smlsl v24.4s, v14.4h, v0.4h[1]
- smlal v26.4s, v14.4h, v6.4h[1]
- smlal v28.4s, v14.4h, v4.4h[1]
- smlsl v30.4s, v14.4h, v1.4h[1]
+ smlsl v24.4s, v14.4h, v0.h[1]
+ smlal v26.4s, v14.4h, v6.h[1]
+ smlal v28.4s, v14.4h, v4.h[1]
+ smlsl v30.4s, v14.4h, v1.h[1]
- smlsl v24.4s, v15.4h, v3.4h[3]
- smlal v26.4s, v15.4h, v0.4h[1]
- smlsl v28.4s, v15.4h, v5.4h[1]
- smlsl v30.4s, v15.4h, v6.4h[1]
+ smlsl v24.4s, v15.4h, v3.h[3]
+ smlal v26.4s, v15.4h, v0.h[1]
+ smlsl v28.4s, v15.4h, v5.h[1]
+ smlsl v30.4s, v15.4h, v6.h[1]
- smlsl v20.4s, v12.4h, v3.4h[0]
- smlsl v20.4s, v13.4h, v1.4h[2]
- smlsl v22.4s, v12.4h, v7.4h[0]
- smlal v22.4s, v13.4h, v3.4h[2]
- smlal v16.4s, v12.4h, v1.4h[0]
- smlal v16.4s, v13.4h, v7.4h[2]
- smlsl v18.4s, v12.4h, v5.4h[0]
- smlsl v18.4s, v13.4h, v2.4h[2]
+ smlsl v20.4s, v12.4h, v3.h[0]
+ smlsl v20.4s, v13.4h, v1.h[2]
+ smlsl v22.4s, v12.4h, v7.h[0]
+ smlal v22.4s, v13.4h, v3.h[2]
+ smlal v16.4s, v12.4h, v1.h[0]
+ smlal v16.4s, v13.4h, v7.h[2]
+ smlsl v18.4s, v12.4h, v5.h[0]
+ smlsl v18.4s, v13.4h, v2.h[2]
ld1 {v10.4h},[x0],x6
ld1 {v8.4h},[x0],x6
@@ -786,32 +786,32 @@ shift1:
- smlal v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
- smlal v26.4s, v8.4h, v4.4h[3] //// y1 * cos3(part of b1)
- smlsl v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v2.4h[1] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
+ smlal v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v3.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v7.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlsl v20.4s, v10.4h, v6.4h[0]
- smlal v20.4s, v11.4h, v5.4h[2]
+ smlsl v20.4s, v10.4h, v6.h[0]
+ smlal v20.4s, v11.4h, v5.h[2]
- smlal v22.4s, v10.4h, v2.4h[0]
- smlal v22.4s, v11.4h, v7.4h[2]
+ smlal v22.4s, v10.4h, v2.h[0]
+ smlal v22.4s, v11.4h, v7.h[2]
- smlsl v16.4s, v10.4h, v2.4h[0]
- smlsl v16.4s, v11.4h, v4.4h[2]
+ smlsl v16.4s, v10.4h, v2.h[0]
+ smlsl v16.4s, v11.4h, v4.h[2]
- smlal v18.4s, v10.4h, v6.4h[0]
- smlal v18.4s, v11.4h, v1.4h[2]
+ smlal v18.4s, v10.4h, v6.h[0]
+ smlal v18.4s, v11.4h, v1.h[2]
ld1 {v12.4h},[x0],x6
@@ -823,26 +823,26 @@ shift1:
- smlal v24.4s, v14.4h, v1.4h[1]
- smlsl v26.4s, v14.4h, v0.4h[3]
- smlal v28.4s, v14.4h, v1.4h[3]
- smlsl v30.4s, v14.4h, v3.4h[1]
+ smlal v24.4s, v14.4h, v1.h[1]
+ smlsl v26.4s, v14.4h, v0.h[3]
+ smlal v28.4s, v14.4h, v1.h[3]
+ smlsl v30.4s, v14.4h, v3.h[1]
- smlal v24.4s, v15.4h, v5.4h[3]
- smlsl v26.4s, v15.4h, v5.4h[1]
- smlal v28.4s, v15.4h, v4.4h[3]
- smlsl v30.4s, v15.4h, v4.4h[1]
+ smlal v24.4s, v15.4h, v5.h[3]
+ smlsl v26.4s, v15.4h, v5.h[1]
+ smlal v28.4s, v15.4h, v4.h[3]
+ smlsl v30.4s, v15.4h, v4.h[1]
- smlal v20.4s, v12.4h, v1.4h[0]
- smlal v20.4s, v13.4h, v3.4h[2]
- smlsl v22.4s, v12.4h, v3.4h[0]
- smlsl v22.4s, v13.4h, v2.4h[2]
- smlal v16.4s, v12.4h, v5.4h[0]
- smlal v16.4s, v13.4h, v1.4h[2]
- smlsl v18.4s, v12.4h, v7.4h[0]
- smlsl v18.4s, v13.4h, v0.4h[2]
+ smlal v20.4s, v12.4h, v1.h[0]
+ smlal v20.4s, v13.4h, v3.h[2]
+ smlsl v22.4s, v12.4h, v3.h[0]
+ smlsl v22.4s, v13.4h, v2.h[2]
+ smlal v16.4s, v12.4h, v5.h[0]
+ smlal v16.4s, v13.4h, v1.h[2]
+ smlsl v18.4s, v12.4h, v7.h[0]
+ smlsl v18.4s, v13.4h, v0.h[2]
shift2:
add v8.4s, v20.4s , v24.4s
@@ -914,32 +914,32 @@ shift2:
ld1 {v9.4h},[x0],x6
- smull v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v8.4h, v4.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v8.4h, v5.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v3.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v0.4h[2] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v0.h[2] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v7.4h[2]
+ smull v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v7.h[2]
- smull v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v6.4h[2]
+ smull v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v6.h[2]
- smull v16.4s, v10.4h, v0.4h[0]
- smlsl v16.4s, v11.4h, v5.4h[2]
+ smull v16.4s, v10.4h, v0.h[0]
+ smlsl v16.4s, v11.4h, v5.h[2]
- smull v18.4s, v10.4h, v0.4h[0]
- smlsl v18.4s, v11.4h, v4.4h[2]
+ smull v18.4s, v10.4h, v0.h[0]
+ smlsl v18.4s, v11.4h, v4.h[2]
cmp x11,x10
bhs shift3
@@ -952,26 +952,26 @@ shift2:
- smlsl v24.4s, v14.4h, v5.4h[1]
- smlsl v26.4s, v14.4h, v7.4h[3]
- smlal v28.4s, v14.4h, v5.4h[3]
- smlal v30.4s, v14.4h, v3.4h[1]
+ smlsl v24.4s, v14.4h, v5.h[1]
+ smlsl v26.4s, v14.4h, v7.h[3]
+ smlal v28.4s, v14.4h, v5.h[3]
+ smlal v30.4s, v14.4h, v3.h[1]
- smlal v24.4s, v15.4h, v2.4h[1]
- smlal v26.4s, v15.4h, v1.4h[1]
- smlal v28.4s, v15.4h, v4.4h[3]
- smlsl v30.4s, v15.4h, v7.4h[3]
+ smlal v24.4s, v15.4h, v2.h[1]
+ smlal v26.4s, v15.4h, v1.h[1]
+ smlal v28.4s, v15.4h, v4.h[3]
+ smlsl v30.4s, v15.4h, v7.h[3]
- smlsl v20.4s, v12.4h, v1.4h[0]
- smlal v20.4s, v13.4h, v6.4h[2]
- smlsl v22.4s, v12.4h, v3.4h[0]
- smlal v22.4s, v13.4h, v3.4h[2]
- smlsl v16.4s, v12.4h, v5.4h[0]
- smlal v16.4s, v13.4h, v0.4h[2]
- smlsl v18.4s, v12.4h, v7.4h[0]
- smlal v18.4s, v13.4h, v2.4h[2]
+ smlsl v20.4s, v12.4h, v1.h[0]
+ smlal v20.4s, v13.4h, v6.h[2]
+ smlsl v22.4s, v12.4h, v3.h[0]
+ smlal v22.4s, v13.4h, v3.h[2]
+ smlsl v16.4s, v12.4h, v5.h[0]
+ smlal v16.4s, v13.4h, v0.h[2]
+ smlsl v18.4s, v12.4h, v7.h[0]
+ smlal v18.4s, v13.4h, v2.h[2]
cmp x11,x9
bhs shift3
@@ -981,32 +981,32 @@ shift2:
ld1 {v11.4h},[x0],x6
ld1 {v9.4h},[x0],x6
- smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v5.4h[1] //// y1 * cos3(part of b1)
- smlsl v28.4s, v8.4h, v0.4h[3] //// y1 * sin3(part of b2)
- smlsl v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v5.h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v0.h[3] //// y1 * sin3(part of b2)
+ smlsl v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v9.4h, v0.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v2.4h[0]
- smlsl v20.4s, v11.4h, v5.4h[2]
+ smlal v20.4s, v10.4h, v2.h[0]
+ smlsl v20.4s, v11.4h, v5.h[2]
- smlal v22.4s, v10.4h, v6.4h[0]
- smlsl v22.4s, v11.4h, v0.4h[2]
+ smlal v22.4s, v10.4h, v6.h[0]
+ smlsl v22.4s, v11.4h, v0.h[2]
- smlsl v16.4s, v10.4h, v6.4h[0]
- smlsl v16.4s, v11.4h, v4.4h[2]
+ smlsl v16.4s, v10.4h, v6.h[0]
+ smlsl v16.4s, v11.4h, v4.h[2]
- smlsl v18.4s, v10.4h, v2.4h[0]
- smlal v18.4s, v11.4h, v6.4h[2]
+ smlsl v18.4s, v10.4h, v2.h[0]
+ smlal v18.4s, v11.4h, v6.h[2]
cmp x11,x5
bhs shift3
@@ -1022,26 +1022,26 @@ shift2:
- smlsl v24.4s, v14.4h, v7.4h[1]
- smlal v26.4s, v14.4h, v2.4h[1]
- smlal v28.4s, v14.4h, v4.4h[1]
- smlsl v30.4s, v14.4h, v5.4h[1]
+ smlsl v24.4s, v14.4h, v7.h[1]
+ smlal v26.4s, v14.4h, v2.h[1]
+ smlal v28.4s, v14.4h, v4.h[1]
+ smlsl v30.4s, v14.4h, v5.h[1]
- smlal v24.4s, v15.4h, v0.4h[3]
- smlal v26.4s, v15.4h, v7.4h[1]
- smlsl v28.4s, v15.4h, v1.4h[1]
- smlsl v30.4s, v15.4h, v6.4h[1]
+ smlal v24.4s, v15.4h, v0.h[3]
+ smlal v26.4s, v15.4h, v7.h[1]
+ smlsl v28.4s, v15.4h, v1.h[1]
+ smlsl v30.4s, v15.4h, v6.h[1]
- smlsl v20.4s, v12.4h, v3.4h[0]
- smlal v20.4s, v13.4h, v4.4h[2]
- smlal v22.4s, v12.4h, v7.4h[0]
- smlal v22.4s, v13.4h, v2.4h[2]
- smlal v16.4s, v12.4h, v1.4h[0]
- smlsl v16.4s, v13.4h, v6.4h[2]
- smlal v18.4s, v12.4h, v5.4h[0]
- smlsl v18.4s, v13.4h, v0.4h[2]
+ smlsl v20.4s, v12.4h, v3.h[0]
+ smlal v20.4s, v13.4h, v4.h[2]
+ smlal v22.4s, v12.4h, v7.h[0]
+ smlal v22.4s, v13.4h, v2.h[2]
+ smlal v16.4s, v12.4h, v1.h[0]
+ smlsl v16.4s, v13.4h, v6.h[2]
+ smlal v18.4s, v12.4h, v5.h[0]
+ smlsl v18.4s, v13.4h, v0.h[2]
cmp x11,x7
@@ -1054,32 +1054,32 @@ shift2:
ld1 {v9.4h},[x0],x6
- smlsl v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v0.4h[1] //// y1 * cos3(part of b1)
- smlal v28.4s, v8.4h, v6.4h[3] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smlsl v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v0.h[1] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v6.h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v0.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v0.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v3.4h[2]
+ smlal v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v3.h[2]
- smlsl v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v5.4h[2]
+ smlsl v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v5.h[2]
- smlsl v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v1.4h[2]
+ smlsl v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v1.h[2]
- smlal v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v7.4h[2]
+ smlal v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v7.h[2]
ld1 {v12.4h},[x0],x6
@@ -1089,26 +1089,26 @@ shift2:
- smlal v24.4s, v14.4h, v6.4h[3]
- smlal v26.4s, v14.4h, v3.4h[3]
- smlsl v28.4s, v14.4h, v1.4h[3]
- smlal v30.4s, v14.4h, v7.4h[1]
+ smlal v24.4s, v14.4h, v6.h[3]
+ smlal v26.4s, v14.4h, v3.h[3]
+ smlsl v28.4s, v14.4h, v1.h[3]
+ smlal v30.4s, v14.4h, v7.h[1]
- smlal v24.4s, v15.4h, v1.4h[3]
- smlsl v26.4s, v15.4h, v2.4h[3]
- smlal v28.4s, v15.4h, v7.4h[1]
- smlal v30.4s, v15.4h, v4.4h[1]
+ smlal v24.4s, v15.4h, v1.h[3]
+ smlsl v26.4s, v15.4h, v2.h[3]
+ smlal v28.4s, v15.4h, v7.h[1]
+ smlal v30.4s, v15.4h, v4.h[1]
- smlsl v20.4s, v12.4h, v5.4h[0]
- smlal v20.4s, v13.4h, v2.4h[2]
- smlal v22.4s, v12.4h, v1.4h[0]
- smlsl v22.4s, v13.4h, v7.4h[2]
- smlsl v16.4s, v12.4h, v7.4h[0]
- smlsl v16.4s, v13.4h, v3.4h[2]
- smlsl v18.4s, v12.4h, v3.4h[0]
- smlal v18.4s, v13.4h, v1.4h[2]
+ smlsl v20.4s, v12.4h, v5.h[0]
+ smlal v20.4s, v13.4h, v2.h[2]
+ smlal v22.4s, v12.4h, v1.h[0]
+ smlsl v22.4s, v13.4h, v7.h[2]
+ smlsl v16.4s, v12.4h, v7.h[0]
+ smlsl v16.4s, v13.4h, v3.h[2]
+ smlsl v18.4s, v12.4h, v3.h[0]
+ smlal v18.4s, v13.4h, v1.h[2]
@@ -1120,32 +1120,32 @@ shift2:
- smlsl v24.4s, v8.4h, v5.4h[3] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
- smlal v28.4s, v8.4h, v3.4h[1] //// y1 * sin3(part of b2)
- smlsl v30.4s, v8.4h, v0.4h[1] //// y1 * sin1(part of b3)
+ smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2)
+ smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v9.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v9.4h, v4.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v6.4h[0]
- smlsl v20.4s, v11.4h, v1.4h[2]
+ smlal v20.4s, v10.4h, v6.h[0]
+ smlsl v20.4s, v11.4h, v1.h[2]
- smlsl v22.4s, v10.4h, v2.4h[0]
- smlal v22.4s, v11.4h, v4.4h[2]
+ smlsl v22.4s, v10.4h, v2.h[0]
+ smlal v22.4s, v11.4h, v4.h[2]
- smlal v16.4s, v10.4h, v2.4h[0]
- smlsl v16.4s, v11.4h, v7.4h[2]
+ smlal v16.4s, v10.4h, v2.h[0]
+ smlsl v16.4s, v11.4h, v7.h[2]
- smlsl v18.4s, v10.4h, v6.4h[0]
- smlsl v18.4s, v11.4h, v5.4h[2]
+ smlsl v18.4s, v10.4h, v6.h[0]
+ smlsl v18.4s, v11.4h, v5.h[2]
ld1 {v12.4h},[x0],x6
@@ -1153,26 +1153,26 @@ shift2:
ld1 {v13.4h},[x0],x6
ld1 {v15.4h},[x0],x6
- smlal v24.4s, v14.4h, v4.4h[3]
- smlsl v26.4s, v14.4h, v6.4h[1]
- smlal v28.4s, v14.4h, v7.4h[3]
- smlal v30.4s, v14.4h, v6.4h[3]
+ smlal v24.4s, v14.4h, v4.h[3]
+ smlsl v26.4s, v14.4h, v6.h[1]
+ smlal v28.4s, v14.4h, v7.h[3]
+ smlal v30.4s, v14.4h, v6.h[3]
- smlal v24.4s, v15.4h, v3.4h[3]
- smlsl v26.4s, v15.4h, v3.4h[1]
- smlal v28.4s, v15.4h, v2.4h[3]
- smlsl v30.4s, v15.4h, v2.4h[1]
+ smlal v24.4s, v15.4h, v3.h[3]
+ smlsl v26.4s, v15.4h, v3.h[1]
+ smlal v28.4s, v15.4h, v2.h[3]
+ smlsl v30.4s, v15.4h, v2.h[1]
- smlsl v20.4s, v12.4h, v7.4h[0]
- smlal v20.4s, v13.4h, v0.4h[2]
- smlal v22.4s, v12.4h, v5.4h[0]
- smlsl v22.4s, v13.4h, v1.4h[2]
- smlsl v16.4s, v12.4h, v3.4h[0]
- smlal v16.4s, v13.4h, v2.4h[2]
- smlal v18.4s, v12.4h, v1.4h[0]
- smlsl v18.4s, v13.4h, v3.4h[2]
+ smlsl v20.4s, v12.4h, v7.h[0]
+ smlal v20.4s, v13.4h, v0.h[2]
+ smlal v22.4s, v12.4h, v5.h[0]
+ smlsl v22.4s, v13.4h, v1.h[2]
+ smlsl v16.4s, v12.4h, v3.h[0]
+ smlal v16.4s, v13.4h, v2.h[2]
+ smlal v18.4s, v12.4h, v1.h[0]
+ smlsl v18.4s, v13.4h, v3.h[2]
shift3:
add v8.4s, v20.4s , v24.4s
@@ -1244,32 +1244,32 @@ shift3:
ld1 {v9.4h},[x0],x6
- smull v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v8.4h, v7.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v8.4h, v7.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v7.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v5.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v7.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v5.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v3.4h[2]
+ smull v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v3.h[2]
- smull v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v2.4h[2]
+ smull v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v2.h[2]
- smull v16.4s, v10.4h, v0.4h[0]
- smlsl v16.4s, v11.4h, v1.4h[2]
+ smull v16.4s, v10.4h, v0.h[0]
+ smlsl v16.4s, v11.4h, v1.h[2]
- smull v18.4s, v10.4h, v0.4h[0]
- smlsl v18.4s, v11.4h, v0.4h[2]
+ smull v18.4s, v10.4h, v0.h[0]
+ smlsl v18.4s, v11.4h, v0.h[2]
cmp x11,x10
bhs shift4
@@ -1284,26 +1284,26 @@ shift3:
- smlal v24.4s, v14.4h, v0.4h[1]
- smlal v26.4s, v14.4h, v1.4h[3]
- smlal v28.4s, v14.4h, v4.4h[1]
- smlal v30.4s, v14.4h, v6.4h[3]
+ smlal v24.4s, v14.4h, v0.h[1]
+ smlal v26.4s, v14.4h, v1.h[3]
+ smlal v28.4s, v14.4h, v4.h[1]
+ smlal v30.4s, v14.4h, v6.h[3]
- smlsl v24.4s, v15.4h, v4.4h[1]
- smlsl v26.4s, v15.4h, v0.4h[3]
- smlsl v28.4s, v15.4h, v2.4h[3]
- smlsl v30.4s, v15.4h, v6.4h[1]
+ smlsl v24.4s, v15.4h, v4.h[1]
+ smlsl v26.4s, v15.4h, v0.h[3]
+ smlsl v28.4s, v15.4h, v2.h[3]
+ smlsl v30.4s, v15.4h, v6.h[1]
- smlal v20.4s, v12.4h, v7.4h[0]
- smlal v20.4s, v13.4h, v5.4h[2]
- smlal v22.4s, v12.4h, v5.4h[0]
- smlsl v22.4s, v13.4h, v7.4h[2]
- smlal v16.4s, v12.4h, v3.4h[0]
- smlsl v16.4s, v13.4h, v4.4h[2]
- smlal v18.4s, v12.4h, v1.4h[0]
- smlsl v18.4s, v13.4h, v1.4h[2]
+ smlal v20.4s, v12.4h, v7.h[0]
+ smlal v20.4s, v13.4h, v5.h[2]
+ smlal v22.4s, v12.4h, v5.h[0]
+ smlsl v22.4s, v13.4h, v7.h[2]
+ smlal v16.4s, v12.4h, v3.h[0]
+ smlsl v16.4s, v13.4h, v4.h[2]
+ smlal v18.4s, v12.4h, v1.h[0]
+ smlsl v18.4s, v13.4h, v1.h[2]
cmp x11,x9
bhs shift4
@@ -1315,32 +1315,32 @@ shift3:
- smlal v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
- smlal v26.4s, v8.4h, v3.4h[1] //// y1 * cos3(part of b1)
- smlal v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v5.4h[3] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
+ smlal v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v4.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlsl v20.4s, v10.4h, v2.4h[0]
- smlal v20.4s, v11.4h, v1.4h[2]
+ smlsl v20.4s, v10.4h, v2.h[0]
+ smlal v20.4s, v11.4h, v1.h[2]
- smlsl v22.4s, v10.4h, v6.4h[0]
- smlal v22.4s, v11.4h, v3.4h[2]
+ smlsl v22.4s, v10.4h, v6.h[0]
+ smlal v22.4s, v11.4h, v3.h[2]
- smlal v16.4s, v10.4h, v6.4h[0]
- smlsl v16.4s, v11.4h, v7.4h[2]
+ smlal v16.4s, v10.4h, v6.h[0]
+ smlsl v16.4s, v11.4h, v7.h[2]
- smlal v18.4s, v10.4h, v2.4h[0]
- smlsl v18.4s, v11.4h, v2.4h[2]
+ smlal v18.4s, v10.4h, v2.h[0]
+ smlsl v18.4s, v11.4h, v2.h[2]
cmp x11,x5
bhs shift4
@@ -1356,26 +1356,26 @@ shift3:
- smlsl v24.4s, v14.4h, v1.4h[1]
- smlsl v26.4s, v14.4h, v7.4h[3]
- smlal v28.4s, v14.4h, v1.4h[3]
- smlal v30.4s, v14.4h, v4.4h[3]
+ smlsl v24.4s, v14.4h, v1.h[1]
+ smlsl v26.4s, v14.4h, v7.h[3]
+ smlal v28.4s, v14.4h, v1.h[3]
+ smlal v30.4s, v14.4h, v4.h[3]
- smlal v24.4s, v15.4h, v2.4h[1]
- smlal v26.4s, v15.4h, v5.4h[1]
- smlsl v28.4s, v15.4h, v3.4h[1]
- smlsl v30.4s, v15.4h, v4.4h[1]
+ smlal v24.4s, v15.4h, v2.h[1]
+ smlal v26.4s, v15.4h, v5.h[1]
+ smlsl v28.4s, v15.4h, v3.h[1]
+ smlsl v30.4s, v15.4h, v4.h[1]
- smlsl v20.4s, v12.4h, v5.4h[0]
- smlsl v20.4s, v13.4h, v7.4h[2]
- smlsl v22.4s, v12.4h, v1.4h[0]
- smlal v22.4s, v13.4h, v1.4h[2]
- smlsl v16.4s, v12.4h, v7.4h[0]
- smlal v16.4s, v13.4h, v5.4h[2]
- smlal v18.4s, v12.4h, v3.4h[0]
- smlsl v18.4s, v13.4h, v3.4h[2]
+ smlsl v20.4s, v12.4h, v5.h[0]
+ smlsl v20.4s, v13.4h, v7.h[2]
+ smlsl v22.4s, v12.4h, v1.h[0]
+ smlal v22.4s, v13.4h, v1.h[2]
+ smlsl v16.4s, v12.4h, v7.h[0]
+ smlal v16.4s, v13.4h, v5.h[2]
+ smlal v18.4s, v12.4h, v3.h[0]
+ smlsl v18.4s, v13.4h, v3.h[2]
cmp x11,x7
bhs shift4
@@ -1387,32 +1387,32 @@ shift3:
ld1 {v9.4h},[x0],x6
- smlsl v24.4s, v8.4h, v5.4h[3] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1)
- smlal v28.4s, v8.4h, v4.4h[3] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3)
+ smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v9.4h, v0.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v0.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v0.4h[2]
+ smlal v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v0.h[2]
- smlsl v22.4s, v10.4h, v0.4h[0]
- smlal v22.4s, v11.4h, v6.4h[2]
+ smlsl v22.4s, v10.4h, v0.h[0]
+ smlal v22.4s, v11.4h, v6.h[2]
- smlsl v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v2.4h[2]
+ smlsl v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v2.h[2]
- smlal v18.4s, v10.4h, v0.4h[0]
- smlsl v18.4s, v11.4h, v4.4h[2]
+ smlal v18.4s, v10.4h, v0.h[0]
+ smlsl v18.4s, v11.4h, v4.h[2]
@@ -1427,26 +1427,26 @@ shift3:
- smlal v24.4s, v14.4h, v3.4h[1]
- smlsl v26.4s, v14.4h, v2.4h[1]
- smlal v28.4s, v14.4h, v7.4h[3]
- smlal v30.4s, v14.4h, v2.4h[3]
+ smlal v24.4s, v14.4h, v3.h[1]
+ smlsl v26.4s, v14.4h, v2.h[1]
+ smlal v28.4s, v14.4h, v7.h[3]
+ smlal v30.4s, v14.4h, v2.h[3]
- smlsl v24.4s, v15.4h, v0.4h[3]
- smlal v26.4s, v15.4h, v4.4h[3]
- smlal v28.4s, v15.4h, v6.4h[3]
- smlsl v30.4s, v15.4h, v2.4h[1]
+ smlsl v24.4s, v15.4h, v0.h[3]
+ smlal v26.4s, v15.4h, v4.h[3]
+ smlal v28.4s, v15.4h, v6.h[3]
+ smlsl v30.4s, v15.4h, v2.h[1]
- smlal v20.4s, v12.4h, v3.4h[0]
- smlsl v20.4s, v13.4h, v6.4h[2]
- smlal v22.4s, v12.4h, v7.4h[0]
- smlsl v22.4s, v13.4h, v4.4h[2]
- smlsl v16.4s, v12.4h, v1.4h[0]
- smlal v16.4s, v13.4h, v0.4h[2]
- smlal v18.4s, v12.4h, v5.4h[0]
- smlsl v18.4s, v13.4h, v5.4h[2]
+ smlal v20.4s, v12.4h, v3.h[0]
+ smlsl v20.4s, v13.4h, v6.h[2]
+ smlal v22.4s, v12.4h, v7.h[0]
+ smlsl v22.4s, v13.4h, v4.h[2]
+ smlsl v16.4s, v12.4h, v1.h[0]
+ smlal v16.4s, v13.4h, v0.h[2]
+ smlal v18.4s, v12.4h, v5.h[0]
+ smlsl v18.4s, v13.4h, v5.h[2]
ld1 {v10.4h},[x0],x6
@@ -1458,32 +1458,32 @@ shift3:
- smlal v24.4s, v8.4h, v3.4h[3] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v7.4h[1] //// y1 * cos3(part of b1)
- smlsl v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v3.h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v7.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v6.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v6.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlsl v20.4s, v10.4h, v6.4h[0]
- smlal v20.4s, v11.4h, v2.4h[2]
+ smlsl v20.4s, v10.4h, v6.h[0]
+ smlal v20.4s, v11.4h, v2.h[2]
- smlal v22.4s, v10.4h, v2.4h[0]
- smlsl v22.4s, v11.4h, v0.4h[2]
+ smlal v22.4s, v10.4h, v2.h[0]
+ smlsl v22.4s, v11.4h, v0.h[2]
- smlsl v16.4s, v10.4h, v2.4h[0]
- smlal v16.4s, v11.4h, v3.4h[2]
+ smlsl v16.4s, v10.4h, v2.h[0]
+ smlal v16.4s, v11.4h, v3.h[2]
- smlal v18.4s, v10.4h, v6.4h[0]
- smlsl v18.4s, v11.4h, v6.4h[2]
+ smlal v18.4s, v10.4h, v6.h[0]
+ smlsl v18.4s, v11.4h, v6.h[2]
ld1 {v12.4h},[x0],x6
@@ -1494,26 +1494,26 @@ shift3:
- smlsl v24.4s, v14.4h, v5.4h[1]
- smlal v26.4s, v14.4h, v3.4h[3]
- smlsl v28.4s, v14.4h, v2.4h[1]
- smlal v30.4s, v14.4h, v0.4h[3]
+ smlsl v24.4s, v14.4h, v5.h[1]
+ smlal v26.4s, v14.4h, v3.h[3]
+ smlsl v28.4s, v14.4h, v2.h[1]
+ smlal v30.4s, v14.4h, v0.h[3]
- smlal v24.4s, v15.4h, v1.4h[3]
- smlsl v26.4s, v15.4h, v1.4h[1]
- smlal v28.4s, v15.4h, v0.4h[3]
- smlsl v30.4s, v15.4h, v0.4h[1]
+ smlal v24.4s, v15.4h, v1.h[3]
+ smlsl v26.4s, v15.4h, v1.h[1]
+ smlal v28.4s, v15.4h, v0.h[3]
+ smlsl v30.4s, v15.4h, v0.h[1]
- smlsl v20.4s, v12.4h, v1.4h[0]
- smlal v20.4s, v13.4h, v4.4h[2]
- smlal v22.4s, v12.4h, v3.4h[0]
- smlsl v22.4s, v13.4h, v5.4h[2]
- smlsl v16.4s, v12.4h, v5.4h[0]
- smlal v16.4s, v13.4h, v6.4h[2]
- smlal v18.4s, v12.4h, v7.4h[0]
- smlsl v18.4s, v13.4h, v7.4h[2]
+ smlsl v20.4s, v12.4h, v1.h[0]
+ smlal v20.4s, v13.4h, v4.h[2]
+ smlal v22.4s, v12.4h, v3.h[0]
+ smlsl v22.4s, v13.4h, v5.h[2]
+ smlsl v16.4s, v12.4h, v5.h[0]
+ smlal v16.4s, v13.4h, v6.h[2]
+ smlal v18.4s, v12.4h, v7.h[0]
+ smlsl v18.4s, v13.4h, v7.h[2]
shift4:
add v8.4s, v20.4s , v24.4s
@@ -1618,30 +1618,30 @@ stage2:
ld1 {v10.4h, v11.4h},[x1],#16
ld1 {v8.4h, v9.4h},[x1],x10
- smull v24.4s, v8.4h, v0.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v8.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v8.4h, v0.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v9.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v10.4h, v0.4h[0]
- smlal v20.4s, v11.4h, v0.4h[2]
+ smull v20.4s, v10.4h, v0.h[0]
+ smlal v20.4s, v11.4h, v0.h[2]
- smull v22.4s, v10.4h, v0.4h[0]
- smlal v22.4s, v11.4h, v1.4h[2]
+ smull v22.4s, v10.4h, v0.h[0]
+ smlal v22.4s, v11.4h, v1.h[2]
- smull v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v2.4h[2]
+ smull v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v2.h[2]
- smull v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v3.4h[2]
+ smull v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v3.h[2]
cmp x12,x11
bhs stage2_shift1
@@ -1653,58 +1653,58 @@ stage2:
- smlal v24.4s, v14.4h, v1.4h[1]
- smlal v26.4s, v14.4h, v3.4h[3]
- smlal v28.4s, v14.4h, v6.4h[1]
- smlsl v30.4s, v14.4h, v7.4h[1]
+ smlal v24.4s, v14.4h, v1.h[1]
+ smlal v26.4s, v14.4h, v3.h[3]
+ smlal v28.4s, v14.4h, v6.h[1]
+ smlsl v30.4s, v14.4h, v7.h[1]
- smlal v24.4s, v15.4h, v1.4h[3]
- smlal v26.4s, v15.4h, v5.4h[1]
- smlsl v28.4s, v15.4h, v7.4h[1]
- smlsl v30.4s, v15.4h, v3.4h[3]
+ smlal v24.4s, v15.4h, v1.h[3]
+ smlal v26.4s, v15.4h, v5.h[1]
+ smlsl v28.4s, v15.4h, v7.h[1]
+ smlsl v30.4s, v15.4h, v3.h[3]
- smlal v20.4s, v12.4h, v1.4h[0]
- smlal v20.4s, v13.4h, v1.4h[2]
- smlal v22.4s, v12.4h, v3.4h[0]
- smlal v22.4s, v13.4h, v4.4h[2]
- smlal v16.4s, v12.4h, v5.4h[0]
- smlal v16.4s, v13.4h, v7.4h[2]
- smlal v18.4s, v12.4h, v7.4h[0]
- smlsl v18.4s, v13.4h, v5.4h[2]
+ smlal v20.4s, v12.4h, v1.h[0]
+ smlal v20.4s, v13.4h, v1.h[2]
+ smlal v22.4s, v12.4h, v3.h[0]
+ smlal v22.4s, v13.4h, v4.h[2]
+ smlal v16.4s, v12.4h, v5.h[0]
+ smlal v16.4s, v13.4h, v7.h[2]
+ smlal v18.4s, v12.4h, v7.h[0]
+ smlsl v18.4s, v13.4h, v5.h[2]
cmp x12,x5
bhs stage2_shift1
ld1 {v10.4h, v11.4h},[x1],#16
ld1 {v8.4h, v9.4h},[x1],x10
- smlal v24.4s, v8.4h, v2.4h[1] //// y1 * cos1(part of b0)
- smlal v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
- smlsl v28.4s, v8.4h, v4.4h[3] //// y1 * sin3(part of b2)
- smlsl v30.4s, v8.4h, v0.4h[1] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0)
+ smlal v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2)
+ smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v7.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v2.4h[0]
- smlal v20.4s, v11.4h, v2.4h[2]
+ smlal v20.4s, v10.4h, v2.h[0]
+ smlal v20.4s, v11.4h, v2.h[2]
- smlal v22.4s, v10.4h, v6.4h[0]
- smlal v22.4s, v11.4h, v7.4h[2]
+ smlal v22.4s, v10.4h, v6.h[0]
+ smlal v22.4s, v11.4h, v7.h[2]
- smlsl v16.4s, v10.4h, v6.4h[0]
- smlsl v16.4s, v11.4h, v3.4h[2]
+ smlsl v16.4s, v10.4h, v6.h[0]
+ smlsl v16.4s, v11.4h, v3.h[2]
- smlsl v18.4s, v10.4h, v2.4h[0]
- smlsl v18.4s, v11.4h, v1.4h[2]
+ smlsl v18.4s, v10.4h, v2.h[0]
+ smlsl v18.4s, v11.4h, v1.h[2]
cmp x12,x6
bhs stage2_shift1
@@ -1717,26 +1717,26 @@ stage2:
- smlal v24.4s, v14.4h, v3.4h[1]
- smlsl v26.4s, v14.4h, v6.4h[1]
- smlsl v28.4s, v14.4h, v0.4h[1]
- smlsl v30.4s, v14.4h, v6.4h[3]
+ smlal v24.4s, v14.4h, v3.h[1]
+ smlsl v26.4s, v14.4h, v6.h[1]
+ smlsl v28.4s, v14.4h, v0.h[1]
+ smlsl v30.4s, v14.4h, v6.h[3]
- smlal v24.4s, v15.4h, v3.4h[3]
- smlsl v26.4s, v15.4h, v4.4h[3]
- smlsl v28.4s, v15.4h, v2.4h[3]
- smlal v30.4s, v15.4h, v5.4h[3]
+ smlal v24.4s, v15.4h, v3.h[3]
+ smlsl v26.4s, v15.4h, v4.h[3]
+ smlsl v28.4s, v15.4h, v2.h[3]
+ smlal v30.4s, v15.4h, v5.h[3]
- smlal v20.4s, v12.4h, v3.4h[0]
- smlal v20.4s, v13.4h, v3.4h[2]
- smlsl v22.4s, v12.4h, v7.4h[0]
- smlsl v22.4s, v13.4h, v5.4h[2]
- smlsl v16.4s, v12.4h, v1.4h[0]
- smlsl v16.4s, v13.4h, v1.4h[2]
- smlsl v18.4s, v12.4h, v5.4h[0]
- smlal v18.4s, v13.4h, v7.4h[2]
+ smlal v20.4s, v12.4h, v3.h[0]
+ smlal v20.4s, v13.4h, v3.h[2]
+ smlsl v22.4s, v12.4h, v7.h[0]
+ smlsl v22.4s, v13.4h, v5.h[2]
+ smlsl v16.4s, v12.4h, v1.h[0]
+ smlsl v16.4s, v13.4h, v1.h[2]
+ smlsl v18.4s, v12.4h, v5.h[0]
+ smlal v18.4s, v13.4h, v7.h[2]
cmp x12,x9
bhs stage2_shift1
@@ -1746,32 +1746,32 @@ stage2:
ld1 {v8.4h, v9.4h},[x1],x10
- smlal v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v3.4h[1] //// y1 * cos3(part of b1)
- smlsl v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v2.4h[1] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v4.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v7.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v0.4h[0]
- smlal v20.4s, v11.4h, v4.4h[2]
+ smlal v20.4s, v10.4h, v0.h[0]
+ smlal v20.4s, v11.4h, v4.h[2]
- smlsl v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v2.4h[2]
+ smlsl v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v2.h[2]
- smlsl v16.4s, v10.4h, v0.4h[0]
- smlsl v16.4s, v11.4h, v6.4h[2]
+ smlsl v16.4s, v10.4h, v0.h[0]
+ smlsl v16.4s, v11.4h, v6.h[2]
- smlal v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v0.4h[2]
+ smlal v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v0.h[2]
ld1 {v12.4h, v13.4h},[x1],#16
ld1 {v14.4h, v15.4h},[x1],x10
@@ -1780,26 +1780,26 @@ stage2:
- smlal v24.4s, v14.4h, v5.4h[1]
- smlsl v26.4s, v14.4h, v0.4h[2]
- smlal v28.4s, v14.4h, v5.4h[3]
- smlal v30.4s, v14.4h, v4.4h[3]
+ smlal v24.4s, v14.4h, v5.h[1]
+ smlsl v26.4s, v14.4h, v0.h[2]
+ smlal v28.4s, v14.4h, v5.h[3]
+ smlal v30.4s, v14.4h, v4.h[3]
- smlal v24.4s, v15.4h, v5.4h[3]
- smlsl v26.4s, v15.4h, v1.4h[1]
- smlal v28.4s, v15.4h, v3.4h[1]
- smlsl v30.4s, v15.4h, v7.4h[3]
+ smlal v24.4s, v15.4h, v5.h[3]
+ smlsl v26.4s, v15.4h, v1.h[1]
+ smlal v28.4s, v15.4h, v3.h[1]
+ smlsl v30.4s, v15.4h, v7.h[3]
- smlal v20.4s, v12.4h, v5.4h[0]
- smlal v20.4s, v13.4h, v5.4h[2]
- smlsl v22.4s, v12.4h, v1.4h[0]
- smlsl v22.4s, v13.4h, v0.4h[2]
- smlal v16.4s, v12.4h, v7.4h[0]
- smlal v16.4s, v13.4h, v4.4h[2]
- smlal v18.4s, v12.4h, v3.4h[0]
- smlal v18.4s, v13.4h, v6.4h[2]
+ smlal v20.4s, v12.4h, v5.h[0]
+ smlal v20.4s, v13.4h, v5.h[2]
+ smlsl v22.4s, v12.4h, v1.h[0]
+ smlsl v22.4s, v13.4h, v0.h[2]
+ smlal v16.4s, v12.4h, v7.h[0]
+ smlal v16.4s, v13.4h, v4.h[2]
+ smlal v18.4s, v12.4h, v3.h[0]
+ smlal v18.4s, v13.4h, v6.h[2]
ld1 {v10.4h, v11.4h},[x1],#16
@@ -1808,56 +1808,56 @@ stage2:
- smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1)
- smlal v28.4s, v8.4h, v0.4h[1] //// y1 * sin3(part of b2)
- smlsl v30.4s, v8.4h, v4.4h[1] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v0.h[1] //// y1 * sin3(part of b2)
+ smlsl v30.4s, v8.4h, v4.h[1] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v9.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v0.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v6.4h[0]
- smlal v20.4s, v11.4h, v6.4h[2]
+ smlal v20.4s, v10.4h, v6.h[0]
+ smlal v20.4s, v11.4h, v6.h[2]
- smlsl v22.4s, v10.4h, v2.4h[0]
- smlsl v22.4s, v11.4h, v3.4h[2]
+ smlsl v22.4s, v10.4h, v2.h[0]
+ smlsl v22.4s, v11.4h, v3.h[2]
- smlal v16.4s, v10.4h, v2.4h[0]
- smlal v16.4s, v11.4h, v0.4h[2]
+ smlal v16.4s, v10.4h, v2.h[0]
+ smlal v16.4s, v11.4h, v0.h[2]
- smlsl v18.4s, v10.4h, v6.4h[0]
- smlsl v18.4s, v11.4h, v2.4h[2]
+ smlsl v18.4s, v10.4h, v6.h[0]
+ smlsl v18.4s, v11.4h, v2.h[2]
ld1 {v12.4h, v13.4h},[x1],#16
ld1 {v14.4h, v15.4h},[x1],x10
- smlal v24.4s, v14.4h, v7.4h[1]
- smlsl v26.4s, v14.4h, v5.4h[3]
- smlal v28.4s, v14.4h, v4.4h[1]
- smlsl v30.4s, v14.4h, v2.4h[3]
+ smlal v24.4s, v14.4h, v7.h[1]
+ smlsl v26.4s, v14.4h, v5.h[3]
+ smlal v28.4s, v14.4h, v4.h[1]
+ smlsl v30.4s, v14.4h, v2.h[3]
- smlal v24.4s, v15.4h, v7.4h[3]
- smlsl v26.4s, v15.4h, v7.4h[1]
- smlal v28.4s, v15.4h, v6.4h[3]
- smlsl v30.4s, v15.4h, v6.4h[1]
+ smlal v24.4s, v15.4h, v7.h[3]
+ smlsl v26.4s, v15.4h, v7.h[1]
+ smlal v28.4s, v15.4h, v6.h[3]
+ smlsl v30.4s, v15.4h, v6.h[1]
- smlal v20.4s, v12.4h, v7.4h[0]
- smlal v20.4s, v13.4h, v7.4h[2]
- smlsl v22.4s, v12.4h, v5.4h[0]
- smlsl v22.4s, v13.4h, v6.4h[2]
- smlal v16.4s, v12.4h, v3.4h[0]
- smlal v16.4s, v13.4h, v5.4h[2]
- smlsl v18.4s, v12.4h, v1.4h[0]
- smlsl v18.4s, v13.4h, v4.4h[2]
+ smlal v20.4s, v12.4h, v7.h[0]
+ smlal v20.4s, v13.4h, v7.h[2]
+ smlsl v22.4s, v12.4h, v5.h[0]
+ smlsl v22.4s, v13.4h, v6.h[2]
+ smlal v16.4s, v12.4h, v3.h[0]
+ smlal v16.4s, v13.4h, v5.h[2]
+ smlsl v18.4s, v12.4h, v1.h[0]
+ smlsl v18.4s, v13.4h, v4.h[2]
stage2_shift1:
add v8.4s, v20.4s , v24.4s
@@ -1930,32 +1930,32 @@ stage2_shift1:
ld1 {v8.4h, v9.4h},[x1],x10
- smull v24.4s, v8.4h, v2.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v8.4h, v3.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v7.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v4.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v10.4h, v0.4h[0]
- smlal v20.4s, v11.4h, v4.4h[2]
+ smull v20.4s, v10.4h, v0.h[0]
+ smlal v20.4s, v11.4h, v4.h[2]
- smull v22.4s, v10.4h, v0.4h[0]
- smlal v22.4s, v11.4h, v5.4h[2]
+ smull v22.4s, v10.4h, v0.h[0]
+ smlal v22.4s, v11.4h, v5.h[2]
- smull v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v6.4h[2]
+ smull v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v6.h[2]
- smull v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v7.4h[2]
+ smull v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v7.h[2]
cmp x12,x11
bhs stage2_shift2
@@ -1964,26 +1964,26 @@ stage2_shift1:
ld1 {v14.4h, v15.4h},[x1],x10
- smlsl v24.4s, v14.4h, v4.4h[3]
- smlsl v26.4s, v14.4h, v2.4h[1]
- smlsl v28.4s, v14.4h, v0.4h[1]
- smlsl v30.4s, v14.4h, v2.4h[3]
+ smlsl v24.4s, v14.4h, v4.h[3]
+ smlsl v26.4s, v14.4h, v2.h[1]
+ smlsl v28.4s, v14.4h, v0.h[1]
+ smlsl v30.4s, v14.4h, v2.h[3]
- smlsl v24.4s, v15.4h, v0.4h[3]
- smlsl v26.4s, v15.4h, v3.4h[1]
- smlsl v28.4s, v15.4h, v6.4h[3]
- smlal v30.4s, v15.4h, v5.4h[3]
+ smlsl v24.4s, v15.4h, v0.h[3]
+ smlsl v26.4s, v15.4h, v3.h[1]
+ smlsl v28.4s, v15.4h, v6.h[3]
+ smlal v30.4s, v15.4h, v5.h[3]
- smlsl v20.4s, v12.4h, v7.4h[0]
- smlsl v20.4s, v13.4h, v2.4h[2]
- smlsl v22.4s, v12.4h, v5.4h[0]
- smlsl v22.4s, v13.4h, v0.4h[2]
- smlsl v16.4s, v12.4h, v3.4h[0]
- smlsl v16.4s, v13.4h, v3.4h[2]
- smlsl v18.4s, v12.4h, v1.4h[0]
- smlsl v18.4s, v13.4h, v6.4h[2]
+ smlsl v20.4s, v12.4h, v7.h[0]
+ smlsl v20.4s, v13.4h, v2.h[2]
+ smlsl v22.4s, v12.4h, v5.h[0]
+ smlsl v22.4s, v13.4h, v0.h[2]
+ smlsl v16.4s, v12.4h, v3.h[0]
+ smlsl v16.4s, v13.4h, v3.h[2]
+ smlsl v18.4s, v12.4h, v1.h[0]
+ smlsl v18.4s, v13.4h, v6.h[2]
cmp x12,x5
bhs stage2_shift2
@@ -1995,32 +1995,32 @@ stage2_shift1:
- smlsl v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
- smlal v26.4s, v8.4h, v7.4h[1] //// y1 * cos3(part of b1)
- smlal v28.4s, v8.4h, v2.4h[3] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smlsl v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
+ smlal v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v2.h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v7.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v6.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v6.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
- smlsl v20.4s, v10.4h, v2.4h[0]
- smlsl v20.4s, v11.4h, v6.4h[2]
+ smlsl v20.4s, v10.4h, v2.h[0]
+ smlsl v20.4s, v11.4h, v6.h[2]
- smlsl v22.4s, v10.4h, v6.4h[0]
- smlal v22.4s, v11.4h, v4.4h[2]
+ smlsl v22.4s, v10.4h, v6.h[0]
+ smlal v22.4s, v11.4h, v4.h[2]
- smlal v16.4s, v10.4h, v6.4h[0]
- smlal v16.4s, v11.4h, v0.4h[2]
+ smlal v16.4s, v10.4h, v6.h[0]
+ smlal v16.4s, v11.4h, v0.h[2]
- smlal v18.4s, v10.4h, v2.4h[0]
- smlal v18.4s, v11.4h, v5.4h[2]
+ smlal v18.4s, v10.4h, v2.h[0]
+ smlal v18.4s, v11.4h, v5.h[2]
cmp x12,x6
bhs stage2_shift2
@@ -2034,26 +2034,26 @@ stage2_shift1:
- smlal v24.4s, v14.4h, v2.4h[3]
- smlal v26.4s, v14.4h, v3.4h[3]
- smlsl v28.4s, v14.4h, v5.4h[3]
- smlsl v30.4s, v14.4h, v0.4h[3]
+ smlal v24.4s, v14.4h, v2.h[3]
+ smlal v26.4s, v14.4h, v3.h[3]
+ smlsl v28.4s, v14.4h, v5.h[3]
+ smlsl v30.4s, v14.4h, v0.h[3]
- smlal v24.4s, v15.4h, v1.4h[3]
- smlsl v26.4s, v15.4h, v6.4h[3]
- smlsl v28.4s, v15.4h, v0.4h[3]
- smlal v30.4s, v15.4h, v7.4h[3]
+ smlal v24.4s, v15.4h, v1.h[3]
+ smlsl v26.4s, v15.4h, v6.h[3]
+ smlsl v28.4s, v15.4h, v0.h[3]
+ smlal v30.4s, v15.4h, v7.h[3]
- smlal v20.4s, v12.4h, v5.4h[0]
- smlal v20.4s, v13.4h, v0.4h[2]
- smlal v22.4s, v12.4h, v1.4h[0]
- smlal v22.4s, v13.4h, v6.4h[2]
- smlal v16.4s, v12.4h, v7.4h[0]
- smlsl v16.4s, v13.4h, v2.4h[2]
- smlsl v18.4s, v12.4h, v3.4h[0]
- smlsl v18.4s, v13.4h, v4.4h[2]
+ smlal v20.4s, v12.4h, v5.h[0]
+ smlal v20.4s, v13.4h, v0.h[2]
+ smlal v22.4s, v12.4h, v1.h[0]
+ smlal v22.4s, v13.4h, v6.h[2]
+ smlal v16.4s, v12.4h, v7.h[0]
+ smlsl v16.4s, v13.4h, v2.h[2]
+ smlsl v18.4s, v12.4h, v3.h[0]
+ smlsl v18.4s, v13.4h, v4.h[2]
cmp x12,x9
bhs stage2_shift2
@@ -2064,32 +2064,32 @@ stage2_shift1:
- smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v1.4h[1] //// y1 * cos3(part of b1)
- smlsl v28.4s, v8.4h, v7.4h[1] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v0.4h[3] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v1.h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v0.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v5.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v9.4h, v7.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v5.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v7.4h[2]
+ smlal v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v7.h[2]
- smlsl v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v1.4h[2]
+ smlsl v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v1.h[2]
- smlsl v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v5.4h[2]
+ smlsl v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v5.h[2]
- smlal v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v3.4h[2]
+ smlal v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v3.h[2]
ld1 {v12.4h, v13.4h},[x1],#16
ld1 {v14.4h, v15.4h},[x1],x10
@@ -2097,58 +2097,58 @@ stage2_shift1:
- smlsl v24.4s, v14.4h, v0.4h[1]
- smlal v26.4s, v14.4h, v6.4h[1]
- smlal v28.4s, v14.4h, v4.4h[1]
- smlsl v30.4s, v14.4h, v1.4h[1]
+ smlsl v24.4s, v14.4h, v0.h[1]
+ smlal v26.4s, v14.4h, v6.h[1]
+ smlal v28.4s, v14.4h, v4.h[1]
+ smlsl v30.4s, v14.4h, v1.h[1]
- smlsl v24.4s, v15.4h, v3.4h[3]
- smlal v26.4s, v15.4h, v0.4h[1]
- smlsl v28.4s, v15.4h, v5.4h[1]
- smlsl v30.4s, v15.4h, v6.4h[1]
+ smlsl v24.4s, v15.4h, v3.h[3]
+ smlal v26.4s, v15.4h, v0.h[1]
+ smlsl v28.4s, v15.4h, v5.h[1]
+ smlsl v30.4s, v15.4h, v6.h[1]
- smlsl v20.4s, v12.4h, v3.4h[0]
- smlsl v20.4s, v13.4h, v1.4h[2]
- smlsl v22.4s, v12.4h, v7.4h[0]
- smlal v22.4s, v13.4h, v3.4h[2]
- smlal v16.4s, v12.4h, v1.4h[0]
- smlal v16.4s, v13.4h, v7.4h[2]
- smlsl v18.4s, v12.4h, v5.4h[0]
- smlsl v18.4s, v13.4h, v2.4h[2]
+ smlsl v20.4s, v12.4h, v3.h[0]
+ smlsl v20.4s, v13.4h, v1.h[2]
+ smlsl v22.4s, v12.4h, v7.h[0]
+ smlal v22.4s, v13.4h, v3.h[2]
+ smlal v16.4s, v12.4h, v1.h[0]
+ smlal v16.4s, v13.4h, v7.h[2]
+ smlsl v18.4s, v12.4h, v5.h[0]
+ smlsl v18.4s, v13.4h, v2.h[2]
ld1 {v10.4h, v11.4h},[x1],#16
ld1 {v8.4h, v9.4h},[x1],x10
- smlal v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
- smlal v26.4s, v8.4h, v4.4h[3] //// y1 * cos3(part of b1)
- smlsl v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v2.4h[1] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
+ smlal v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v3.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v7.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlsl v20.4s, v10.4h, v6.4h[0]
- smlal v20.4s, v11.4h, v5.4h[2]
+ smlsl v20.4s, v10.4h, v6.h[0]
+ smlal v20.4s, v11.4h, v5.h[2]
- smlal v22.4s, v10.4h, v2.4h[0]
- smlal v22.4s, v11.4h, v7.4h[2]
+ smlal v22.4s, v10.4h, v2.h[0]
+ smlal v22.4s, v11.4h, v7.h[2]
- smlsl v16.4s, v10.4h, v2.4h[0]
- smlsl v16.4s, v11.4h, v4.4h[2]
+ smlsl v16.4s, v10.4h, v2.h[0]
+ smlsl v16.4s, v11.4h, v4.h[2]
- smlal v18.4s, v10.4h, v6.4h[0]
- smlal v18.4s, v11.4h, v1.4h[2]
+ smlal v18.4s, v10.4h, v6.h[0]
+ smlal v18.4s, v11.4h, v1.h[2]
ld1 {v12.4h, v13.4h},[x1],#16
@@ -2156,26 +2156,26 @@ stage2_shift1:
- smlal v24.4s, v14.4h, v1.4h[1]
- smlsl v26.4s, v14.4h, v0.4h[3]
- smlal v28.4s, v14.4h, v1.4h[3]
- smlsl v30.4s, v14.4h, v3.4h[1]
+ smlal v24.4s, v14.4h, v1.h[1]
+ smlsl v26.4s, v14.4h, v0.h[3]
+ smlal v28.4s, v14.4h, v1.h[3]
+ smlsl v30.4s, v14.4h, v3.h[1]
- smlal v24.4s, v15.4h, v5.4h[3]
- smlsl v26.4s, v15.4h, v5.4h[1]
- smlal v28.4s, v15.4h, v4.4h[3]
- smlsl v30.4s, v15.4h, v4.4h[1]
+ smlal v24.4s, v15.4h, v5.h[3]
+ smlsl v26.4s, v15.4h, v5.h[1]
+ smlal v28.4s, v15.4h, v4.h[3]
+ smlsl v30.4s, v15.4h, v4.h[1]
- smlal v20.4s, v12.4h, v1.4h[0]
- smlal v20.4s, v13.4h, v3.4h[2]
- smlsl v22.4s, v12.4h, v3.4h[0]
- smlsl v22.4s, v13.4h, v2.4h[2]
- smlal v16.4s, v12.4h, v5.4h[0]
- smlal v16.4s, v13.4h, v1.4h[2]
- smlsl v18.4s, v12.4h, v7.4h[0]
- smlsl v18.4s, v13.4h, v0.4h[2]
+ smlal v20.4s, v12.4h, v1.h[0]
+ smlal v20.4s, v13.4h, v3.h[2]
+ smlsl v22.4s, v12.4h, v3.h[0]
+ smlsl v22.4s, v13.4h, v2.h[2]
+ smlal v16.4s, v12.4h, v5.h[0]
+ smlal v16.4s, v13.4h, v1.h[2]
+ smlsl v18.4s, v12.4h, v7.h[0]
+ smlsl v18.4s, v13.4h, v0.h[2]
stage2_shift2:
add v8.4s, v20.4s , v24.4s
@@ -2245,32 +2245,32 @@ stage2_shift2:
ld1 {v10.4h, v11.4h},[x1],#16
ld1 {v8.4h, v9.4h},[x1],x10
- smull v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v8.4h, v4.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v8.4h, v5.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v3.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v0.4h[2] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v0.h[2] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v7.4h[2]
+ smull v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v7.h[2]
- smull v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v6.4h[2]
+ smull v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v6.h[2]
- smull v16.4s, v10.4h, v0.4h[0]
- smlsl v16.4s, v11.4h, v5.4h[2]
+ smull v16.4s, v10.4h, v0.h[0]
+ smlsl v16.4s, v11.4h, v5.h[2]
- smull v18.4s, v10.4h, v0.4h[0]
- smlsl v18.4s, v11.4h, v4.4h[2]
+ smull v18.4s, v10.4h, v0.h[0]
+ smlsl v18.4s, v11.4h, v4.h[2]
cmp x12,x11
bhs stage2_shift3
@@ -2278,26 +2278,26 @@ stage2_shift2:
ld1 {v12.4h, v13.4h},[x1],#16
ld1 {v14.4h, v15.4h},[x1],x10
- smlsl v24.4s, v14.4h, v5.4h[1]
- smlsl v26.4s, v14.4h, v7.4h[3]
- smlal v28.4s, v14.4h, v5.4h[3]
- smlal v30.4s, v14.4h, v3.4h[1]
+ smlsl v24.4s, v14.4h, v5.h[1]
+ smlsl v26.4s, v14.4h, v7.h[3]
+ smlal v28.4s, v14.4h, v5.h[3]
+ smlal v30.4s, v14.4h, v3.h[1]
- smlal v24.4s, v15.4h, v2.4h[1]
- smlal v26.4s, v15.4h, v1.4h[1]
- smlal v28.4s, v15.4h, v4.4h[3]
- smlsl v30.4s, v15.4h, v7.4h[3]
+ smlal v24.4s, v15.4h, v2.h[1]
+ smlal v26.4s, v15.4h, v1.h[1]
+ smlal v28.4s, v15.4h, v4.h[3]
+ smlsl v30.4s, v15.4h, v7.h[3]
- smlsl v20.4s, v12.4h, v1.4h[0]
- smlal v20.4s, v13.4h, v6.4h[2]
- smlsl v22.4s, v12.4h, v3.4h[0]
- smlal v22.4s, v13.4h, v3.4h[2]
- smlsl v16.4s, v12.4h, v5.4h[0]
- smlal v16.4s, v13.4h, v0.4h[2]
- smlsl v18.4s, v12.4h, v7.4h[0]
- smlal v18.4s, v13.4h, v2.4h[2]
+ smlsl v20.4s, v12.4h, v1.h[0]
+ smlal v20.4s, v13.4h, v6.h[2]
+ smlsl v22.4s, v12.4h, v3.h[0]
+ smlal v22.4s, v13.4h, v3.h[2]
+ smlsl v16.4s, v12.4h, v5.h[0]
+ smlal v16.4s, v13.4h, v0.h[2]
+ smlsl v18.4s, v12.4h, v7.h[0]
+ smlal v18.4s, v13.4h, v2.h[2]
cmp x12,x5
bhs stage2_shift3
@@ -2307,32 +2307,32 @@ stage2_shift2:
- smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v5.4h[1] //// y1 * cos3(part of b1)
- smlsl v28.4s, v8.4h, v0.4h[3] //// y1 * sin3(part of b2)
- smlsl v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v5.h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v0.h[3] //// y1 * sin3(part of b2)
+ smlsl v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v9.4h, v0.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v2.4h[0]
- smlsl v20.4s, v11.4h, v5.4h[2]
+ smlal v20.4s, v10.4h, v2.h[0]
+ smlsl v20.4s, v11.4h, v5.h[2]
- smlal v22.4s, v10.4h, v6.4h[0]
- smlsl v22.4s, v11.4h, v0.4h[2]
+ smlal v22.4s, v10.4h, v6.h[0]
+ smlsl v22.4s, v11.4h, v0.h[2]
- smlsl v16.4s, v10.4h, v6.4h[0]
- smlsl v16.4s, v11.4h, v4.4h[2]
+ smlsl v16.4s, v10.4h, v6.h[0]
+ smlsl v16.4s, v11.4h, v4.h[2]
- smlsl v18.4s, v10.4h, v2.4h[0]
- smlal v18.4s, v11.4h, v6.4h[2]
+ smlsl v18.4s, v10.4h, v2.h[0]
+ smlal v18.4s, v11.4h, v6.h[2]
cmp x12,x6
bhs stage2_shift3
@@ -2344,26 +2344,26 @@ stage2_shift2:
- smlsl v24.4s, v14.4h, v7.4h[1]
- smlal v26.4s, v14.4h, v2.4h[1]
- smlal v28.4s, v14.4h, v4.4h[1]
- smlsl v30.4s, v14.4h, v5.4h[1]
+ smlsl v24.4s, v14.4h, v7.h[1]
+ smlal v26.4s, v14.4h, v2.h[1]
+ smlal v28.4s, v14.4h, v4.h[1]
+ smlsl v30.4s, v14.4h, v5.h[1]
- smlal v24.4s, v15.4h, v0.4h[3]
- smlal v26.4s, v15.4h, v7.4h[1]
- smlsl v28.4s, v15.4h, v1.4h[1]
- smlsl v30.4s, v15.4h, v6.4h[1]
+ smlal v24.4s, v15.4h, v0.h[3]
+ smlal v26.4s, v15.4h, v7.h[1]
+ smlsl v28.4s, v15.4h, v1.h[1]
+ smlsl v30.4s, v15.4h, v6.h[1]
- smlsl v20.4s, v12.4h, v3.4h[0]
- smlal v20.4s, v13.4h, v4.4h[2]
- smlal v22.4s, v12.4h, v7.4h[0]
- smlal v22.4s, v13.4h, v2.4h[2]
- smlal v16.4s, v12.4h, v1.4h[0]
- smlsl v16.4s, v13.4h, v6.4h[2]
- smlal v18.4s, v12.4h, v5.4h[0]
- smlsl v18.4s, v13.4h, v0.4h[2]
+ smlsl v20.4s, v12.4h, v3.h[0]
+ smlal v20.4s, v13.4h, v4.h[2]
+ smlal v22.4s, v12.4h, v7.h[0]
+ smlal v22.4s, v13.4h, v2.h[2]
+ smlal v16.4s, v12.4h, v1.h[0]
+ smlsl v16.4s, v13.4h, v6.h[2]
+ smlal v18.4s, v12.4h, v5.h[0]
+ smlsl v18.4s, v13.4h, v0.h[2]
cmp x12,x9
bhs stage2_shift3
@@ -2373,32 +2373,32 @@ stage2_shift2:
ld1 {v8.4h, v9.4h},[x1],x10
- smlsl v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v0.4h[1] //// y1 * cos3(part of b1)
- smlal v28.4s, v8.4h, v6.4h[3] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smlsl v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v0.h[1] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v6.h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v0.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v0.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v3.4h[2]
+ smlal v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v3.h[2]
- smlsl v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v5.4h[2]
+ smlsl v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v5.h[2]
- smlsl v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v1.4h[2]
+ smlsl v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v1.h[2]
- smlal v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v7.4h[2]
+ smlal v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v7.h[2]
ld1 {v12.4h, v13.4h},[x1],#16
ld1 {v14.4h, v15.4h},[x1],x10
@@ -2406,84 +2406,84 @@ stage2_shift2:
- smlal v24.4s, v14.4h, v6.4h[3]
- smlal v26.4s, v14.4h, v3.4h[3]
- smlsl v28.4s, v14.4h, v1.4h[3]
- smlal v30.4s, v14.4h, v7.4h[1]
+ smlal v24.4s, v14.4h, v6.h[3]
+ smlal v26.4s, v14.4h, v3.h[3]
+ smlsl v28.4s, v14.4h, v1.h[3]
+ smlal v30.4s, v14.4h, v7.h[1]
- smlal v24.4s, v15.4h, v1.4h[3]
- smlsl v26.4s, v15.4h, v2.4h[3]
- smlal v28.4s, v15.4h, v7.4h[1]
- smlal v30.4s, v15.4h, v4.4h[1]
+ smlal v24.4s, v15.4h, v1.h[3]
+ smlsl v26.4s, v15.4h, v2.h[3]
+ smlal v28.4s, v15.4h, v7.h[1]
+ smlal v30.4s, v15.4h, v4.h[1]
- smlsl v20.4s, v12.4h, v5.4h[0]
- smlal v20.4s, v13.4h, v2.4h[2]
- smlal v22.4s, v12.4h, v1.4h[0]
- smlsl v22.4s, v13.4h, v7.4h[2]
- smlsl v16.4s, v12.4h, v7.4h[0]
- smlsl v16.4s, v13.4h, v3.4h[2]
- smlsl v18.4s, v12.4h, v3.4h[0]
- smlal v18.4s, v13.4h, v1.4h[2]
+ smlsl v20.4s, v12.4h, v5.h[0]
+ smlal v20.4s, v13.4h, v2.h[2]
+ smlal v22.4s, v12.4h, v1.h[0]
+ smlsl v22.4s, v13.4h, v7.h[2]
+ smlsl v16.4s, v12.4h, v7.h[0]
+ smlsl v16.4s, v13.4h, v3.h[2]
+ smlsl v18.4s, v12.4h, v3.h[0]
+ smlal v18.4s, v13.4h, v1.h[2]
ld1 {v10.4h, v11.4h},[x1],#16
ld1 {v8.4h, v9.4h},[x1],x10
- smlsl v24.4s, v8.4h, v5.4h[3] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
- smlal v28.4s, v8.4h, v3.4h[1] //// y1 * sin3(part of b2)
- smlsl v30.4s, v8.4h, v0.4h[1] //// y1 * sin1(part of b3)
+ smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2)
+ smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v9.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v9.4h, v4.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v6.4h[0]
- smlsl v20.4s, v11.4h, v1.4h[2]
+ smlal v20.4s, v10.4h, v6.h[0]
+ smlsl v20.4s, v11.4h, v1.h[2]
- smlsl v22.4s, v10.4h, v2.4h[0]
- smlal v22.4s, v11.4h, v4.4h[2]
+ smlsl v22.4s, v10.4h, v2.h[0]
+ smlal v22.4s, v11.4h, v4.h[2]
- smlal v16.4s, v10.4h, v2.4h[0]
- smlsl v16.4s, v11.4h, v7.4h[2]
+ smlal v16.4s, v10.4h, v2.h[0]
+ smlsl v16.4s, v11.4h, v7.h[2]
- smlsl v18.4s, v10.4h, v6.4h[0]
- smlsl v18.4s, v11.4h, v5.4h[2]
+ smlsl v18.4s, v10.4h, v6.h[0]
+ smlsl v18.4s, v11.4h, v5.h[2]
ld1 {v12.4h, v13.4h},[x1],#16
ld1 {v14.4h, v15.4h},[x1],x10
- smlal v24.4s, v14.4h, v4.4h[3]
- smlsl v26.4s, v14.4h, v6.4h[1]
- smlal v28.4s, v14.4h, v7.4h[3]
- smlal v30.4s, v14.4h, v6.4h[3]
+ smlal v24.4s, v14.4h, v4.h[3]
+ smlsl v26.4s, v14.4h, v6.h[1]
+ smlal v28.4s, v14.4h, v7.h[3]
+ smlal v30.4s, v14.4h, v6.h[3]
- smlal v24.4s, v15.4h, v3.4h[3]
- smlsl v26.4s, v15.4h, v3.4h[1]
- smlal v28.4s, v15.4h, v2.4h[3]
- smlsl v30.4s, v15.4h, v2.4h[1]
+ smlal v24.4s, v15.4h, v3.h[3]
+ smlsl v26.4s, v15.4h, v3.h[1]
+ smlal v28.4s, v15.4h, v2.h[3]
+ smlsl v30.4s, v15.4h, v2.h[1]
- smlsl v20.4s, v12.4h, v7.4h[0]
- smlal v20.4s, v13.4h, v0.4h[2]
- smlal v22.4s, v12.4h, v5.4h[0]
- smlsl v22.4s, v13.4h, v1.4h[2]
- smlsl v16.4s, v12.4h, v3.4h[0]
- smlal v16.4s, v13.4h, v2.4h[2]
- smlal v18.4s, v12.4h, v1.4h[0]
- smlsl v18.4s, v13.4h, v3.4h[2]
+ smlsl v20.4s, v12.4h, v7.h[0]
+ smlal v20.4s, v13.4h, v0.h[2]
+ smlal v22.4s, v12.4h, v5.h[0]
+ smlsl v22.4s, v13.4h, v1.h[2]
+ smlsl v16.4s, v12.4h, v3.h[0]
+ smlal v16.4s, v13.4h, v2.h[2]
+ smlal v18.4s, v12.4h, v1.h[0]
+ smlsl v18.4s, v13.4h, v3.h[2]
stage2_shift3:
add v8.4s, v20.4s , v24.4s
@@ -2555,32 +2555,32 @@ stage2_shift3:
ld1 {v8.4h, v9.4h},[x1],x10
- smull v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v8.4h, v7.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v8.4h, v7.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v7.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v5.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v7.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v5.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v3.4h[2]
+ smull v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v3.h[2]
- smull v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v2.4h[2]
+ smull v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v2.h[2]
- smull v16.4s, v10.4h, v0.4h[0]
- smlsl v16.4s, v11.4h, v1.4h[2]
+ smull v16.4s, v10.4h, v0.h[0]
+ smlsl v16.4s, v11.4h, v1.h[2]
- smull v18.4s, v10.4h, v0.4h[0]
- smlsl v18.4s, v11.4h, v0.4h[2]
+ smull v18.4s, v10.4h, v0.h[0]
+ smlsl v18.4s, v11.4h, v0.h[2]
cmp x12,x11
bhs stage2_shift4
@@ -2592,26 +2592,26 @@ stage2_shift3:
- smlal v24.4s, v14.4h, v0.4h[1]
- smlal v26.4s, v14.4h, v1.4h[3]
- smlal v28.4s, v14.4h, v4.4h[1]
- smlal v30.4s, v14.4h, v6.4h[3]
+ smlal v24.4s, v14.4h, v0.h[1]
+ smlal v26.4s, v14.4h, v1.h[3]
+ smlal v28.4s, v14.4h, v4.h[1]
+ smlal v30.4s, v14.4h, v6.h[3]
- smlsl v24.4s, v15.4h, v4.4h[1]
- smlsl v26.4s, v15.4h, v0.4h[3]
- smlsl v28.4s, v15.4h, v2.4h[3]
- smlsl v30.4s, v15.4h, v6.4h[1]
+ smlsl v24.4s, v15.4h, v4.h[1]
+ smlsl v26.4s, v15.4h, v0.h[3]
+ smlsl v28.4s, v15.4h, v2.h[3]
+ smlsl v30.4s, v15.4h, v6.h[1]
- smlal v20.4s, v12.4h, v7.4h[0]
- smlal v20.4s, v13.4h, v5.4h[2]
- smlal v22.4s, v12.4h, v5.4h[0]
- smlsl v22.4s, v13.4h, v7.4h[2]
- smlal v16.4s, v12.4h, v3.4h[0]
- smlsl v16.4s, v13.4h, v4.4h[2]
- smlal v18.4s, v12.4h, v1.4h[0]
- smlsl v18.4s, v13.4h, v1.4h[2]
+ smlal v20.4s, v12.4h, v7.h[0]
+ smlal v20.4s, v13.4h, v5.h[2]
+ smlal v22.4s, v12.4h, v5.h[0]
+ smlsl v22.4s, v13.4h, v7.h[2]
+ smlal v16.4s, v12.4h, v3.h[0]
+ smlsl v16.4s, v13.4h, v4.h[2]
+ smlal v18.4s, v12.4h, v1.h[0]
+ smlsl v18.4s, v13.4h, v1.h[2]
cmp x12,x5
bhs stage2_shift4
@@ -2621,32 +2621,32 @@ stage2_shift3:
- smlal v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
- smlal v26.4s, v8.4h, v3.4h[1] //// y1 * cos3(part of b1)
- smlal v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v5.4h[3] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
+ smlal v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v4.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlsl v20.4s, v10.4h, v2.4h[0]
- smlal v20.4s, v11.4h, v1.4h[2]
+ smlsl v20.4s, v10.4h, v2.h[0]
+ smlal v20.4s, v11.4h, v1.h[2]
- smlsl v22.4s, v10.4h, v6.4h[0]
- smlal v22.4s, v11.4h, v3.4h[2]
+ smlsl v22.4s, v10.4h, v6.h[0]
+ smlal v22.4s, v11.4h, v3.h[2]
- smlal v16.4s, v10.4h, v6.4h[0]
- smlsl v16.4s, v11.4h, v7.4h[2]
+ smlal v16.4s, v10.4h, v6.h[0]
+ smlsl v16.4s, v11.4h, v7.h[2]
- smlal v18.4s, v10.4h, v2.4h[0]
- smlsl v18.4s, v11.4h, v2.4h[2]
+ smlal v18.4s, v10.4h, v2.h[0]
+ smlsl v18.4s, v11.4h, v2.h[2]
cmp x12,x6
bhs stage2_shift4
@@ -2660,26 +2660,26 @@ stage2_shift3:
- smlsl v24.4s, v14.4h, v1.4h[1]
- smlsl v26.4s, v14.4h, v7.4h[3]
- smlal v28.4s, v14.4h, v1.4h[3]
- smlal v30.4s, v14.4h, v4.4h[3]
+ smlsl v24.4s, v14.4h, v1.h[1]
+ smlsl v26.4s, v14.4h, v7.h[3]
+ smlal v28.4s, v14.4h, v1.h[3]
+ smlal v30.4s, v14.4h, v4.h[3]
- smlal v24.4s, v15.4h, v2.4h[1]
- smlal v26.4s, v15.4h, v5.4h[1]
- smlsl v28.4s, v15.4h, v3.4h[1]
- smlsl v30.4s, v15.4h, v4.4h[1]
+ smlal v24.4s, v15.4h, v2.h[1]
+ smlal v26.4s, v15.4h, v5.h[1]
+ smlsl v28.4s, v15.4h, v3.h[1]
+ smlsl v30.4s, v15.4h, v4.h[1]
- smlsl v20.4s, v12.4h, v5.4h[0]
- smlsl v20.4s, v13.4h, v7.4h[2]
- smlsl v22.4s, v12.4h, v1.4h[0]
- smlal v22.4s, v13.4h, v1.4h[2]
- smlsl v16.4s, v12.4h, v7.4h[0]
- smlal v16.4s, v13.4h, v5.4h[2]
- smlal v18.4s, v12.4h, v3.4h[0]
- smlsl v18.4s, v13.4h, v3.4h[2]
+ smlsl v20.4s, v12.4h, v5.h[0]
+ smlsl v20.4s, v13.4h, v7.h[2]
+ smlsl v22.4s, v12.4h, v1.h[0]
+ smlal v22.4s, v13.4h, v1.h[2]
+ smlsl v16.4s, v12.4h, v7.h[0]
+ smlal v16.4s, v13.4h, v5.h[2]
+ smlal v18.4s, v12.4h, v3.h[0]
+ smlsl v18.4s, v13.4h, v3.h[2]
cmp x12,x9
bhs stage2_shift4
@@ -2689,32 +2689,32 @@ stage2_shift3:
ld1 {v8.4h, v9.4h},[x1],x10
- smlsl v24.4s, v8.4h, v5.4h[3] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1)
- smlal v28.4s, v8.4h, v4.4h[3] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3)
+ smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v9.4h, v0.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v0.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v0.4h[2]
+ smlal v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v0.h[2]
- smlsl v22.4s, v10.4h, v0.4h[0]
- smlal v22.4s, v11.4h, v6.4h[2]
+ smlsl v22.4s, v10.4h, v0.h[0]
+ smlal v22.4s, v11.4h, v6.h[2]
- smlsl v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v2.4h[2]
+ smlsl v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v2.h[2]
- smlal v18.4s, v10.4h, v0.4h[0]
- smlsl v18.4s, v11.4h, v4.4h[2]
+ smlal v18.4s, v10.4h, v0.h[0]
+ smlsl v18.4s, v11.4h, v4.h[2]
ld1 {v12.4h, v13.4h},[x1],#16
ld1 {v14.4h, v15.4h},[x1],x10
@@ -2722,26 +2722,26 @@ stage2_shift3:
- smlal v24.4s, v14.4h, v3.4h[1]
- smlsl v26.4s, v14.4h, v2.4h[1]
- smlal v28.4s, v14.4h, v7.4h[3]
- smlal v30.4s, v14.4h, v2.4h[3]
+ smlal v24.4s, v14.4h, v3.h[1]
+ smlsl v26.4s, v14.4h, v2.h[1]
+ smlal v28.4s, v14.4h, v7.h[3]
+ smlal v30.4s, v14.4h, v2.h[3]
- smlsl v24.4s, v15.4h, v0.4h[3]
- smlal v26.4s, v15.4h, v4.4h[3]
- smlal v28.4s, v15.4h, v6.4h[3]
- smlsl v30.4s, v15.4h, v2.4h[1]
+ smlsl v24.4s, v15.4h, v0.h[3]
+ smlal v26.4s, v15.4h, v4.h[3]
+ smlal v28.4s, v15.4h, v6.h[3]
+ smlsl v30.4s, v15.4h, v2.h[1]
- smlal v20.4s, v12.4h, v3.4h[0]
- smlsl v20.4s, v13.4h, v6.4h[2]
- smlal v22.4s, v12.4h, v7.4h[0]
- smlsl v22.4s, v13.4h, v4.4h[2]
- smlsl v16.4s, v12.4h, v1.4h[0]
- smlal v16.4s, v13.4h, v0.4h[2]
- smlal v18.4s, v12.4h, v5.4h[0]
- smlsl v18.4s, v13.4h, v5.4h[2]
+ smlal v20.4s, v12.4h, v3.h[0]
+ smlsl v20.4s, v13.4h, v6.h[2]
+ smlal v22.4s, v12.4h, v7.h[0]
+ smlsl v22.4s, v13.4h, v4.h[2]
+ smlsl v16.4s, v12.4h, v1.h[0]
+ smlal v16.4s, v13.4h, v0.h[2]
+ smlal v18.4s, v12.4h, v5.h[0]
+ smlsl v18.4s, v13.4h, v5.h[2]
ld1 {v10.4h, v11.4h},[x1],#16
@@ -2750,32 +2750,32 @@ stage2_shift3:
- smlal v24.4s, v8.4h, v3.4h[3] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v7.4h[1] //// y1 * cos3(part of b1)
- smlsl v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v3.h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v7.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v6.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v6.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlsl v20.4s, v10.4h, v6.4h[0]
- smlal v20.4s, v11.4h, v2.4h[2]
+ smlsl v20.4s, v10.4h, v6.h[0]
+ smlal v20.4s, v11.4h, v2.h[2]
- smlal v22.4s, v10.4h, v2.4h[0]
- smlsl v22.4s, v11.4h, v0.4h[2]
+ smlal v22.4s, v10.4h, v2.h[0]
+ smlsl v22.4s, v11.4h, v0.h[2]
- smlsl v16.4s, v10.4h, v2.4h[0]
- smlal v16.4s, v11.4h, v3.4h[2]
+ smlsl v16.4s, v10.4h, v2.h[0]
+ smlal v16.4s, v11.4h, v3.h[2]
- smlal v18.4s, v10.4h, v6.4h[0]
- smlsl v18.4s, v11.4h, v6.4h[2]
+ smlal v18.4s, v10.4h, v6.h[0]
+ smlsl v18.4s, v11.4h, v6.h[2]
ld1 {v12.4h, v13.4h},[x1],#16
@@ -2783,26 +2783,26 @@ stage2_shift3:
- smlsl v24.4s, v14.4h, v5.4h[1]
- smlal v26.4s, v14.4h, v3.4h[3]
- smlsl v28.4s, v14.4h, v2.4h[1]
- smlal v30.4s, v14.4h, v0.4h[3]
+ smlsl v24.4s, v14.4h, v5.h[1]
+ smlal v26.4s, v14.4h, v3.h[3]
+ smlsl v28.4s, v14.4h, v2.h[1]
+ smlal v30.4s, v14.4h, v0.h[3]
- smlal v24.4s, v15.4h, v1.4h[3]
- smlsl v26.4s, v15.4h, v1.4h[1]
- smlal v28.4s, v15.4h, v0.4h[3]
- smlsl v30.4s, v15.4h, v0.4h[1]
+ smlal v24.4s, v15.4h, v1.h[3]
+ smlsl v26.4s, v15.4h, v1.h[1]
+ smlal v28.4s, v15.4h, v0.h[3]
+ smlsl v30.4s, v15.4h, v0.h[1]
- smlsl v20.4s, v12.4h, v1.4h[0]
- smlal v20.4s, v13.4h, v4.4h[2]
- smlal v22.4s, v12.4h, v3.4h[0]
- smlsl v22.4s, v13.4h, v5.4h[2]
- smlsl v16.4s, v12.4h, v5.4h[0]
- smlal v16.4s, v13.4h, v6.4h[2]
- smlal v18.4s, v12.4h, v7.4h[0]
- smlsl v18.4s, v13.4h, v7.4h[2]
+ smlsl v20.4s, v12.4h, v1.h[0]
+ smlal v20.4s, v13.4h, v4.h[2]
+ smlal v22.4s, v12.4h, v3.h[0]
+ smlsl v22.4s, v13.4h, v5.h[2]
+ smlsl v16.4s, v12.4h, v5.h[0]
+ smlal v16.4s, v13.4h, v6.h[2]
+ smlal v18.4s, v12.4h, v7.h[0]
+ smlsl v18.4s, v13.4h, v7.h[2]
stage2_shift4:
add v8.4s, v20.4s , v24.4s
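Note: the long run of smull/smlal/smlsl edits above (and throughout the 16x16/32x32 kernels) only changes the spelling of the by-element operand from the old "v4.4h[1]" form to "v4.h[1]", which is the form newer assemblers such as clang's integrated A64 assembler accept; the instruction encoding and behaviour are unchanged. In C terms these are the lane-indexed widening multiply-accumulate intrinsics. A minimal sketch, with purely illustrative variable names (not code from this file):

    #include <arm_neon.h>

    /* Pattern behind "smull/smlal/smlsl vD.4s, vN.4h, vM.h[i]": widen four
     * int16 lanes to int32 and multiply-accumulate by one coefficient lane. */
    static int32x4_t lane_mac_sketch(int16x4_t y1, int16x4_t y3, int16x4_t coeffs)
    {
        int32x4_t b0 = vmull_lane_s16(y1, coeffs, 1);  /* b0  = y1 * coeffs[1]            */
        b0 = vmlal_lane_s16(b0, y3, coeffs, 3);        /* b0 += y3 * coeffs[3] (smlal)    */
        b0 = vmlsl_lane_s16(b0, y3, coeffs, 0);        /* b0 -= y3 * coeffs[0] (smlsl)    */
        return b0;
    }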
diff --git a/common/arm64/ihevc_itrans_recon_4x4.s b/common/arm64/ihevc_itrans_recon_4x4.s
index 1f2c904..61fa5d7 100644
--- a/common/arm64/ihevc_itrans_recon_4x4.s
+++ b/common/arm64/ihevc_itrans_recon_4x4.s
@@ -140,11 +140,11 @@ ihevc_itrans_recon_4x4_av8:
// first stage computation starts
- smull v6.4s, v1.4h, v4.4h[1] //83 * pi2_src[1]
- smlal v6.4s, v3.4h, v4.4h[3] //o[0] = 83 * pi2_src[1] + 36 * pi2_src[3]
- smull v5.4s, v1.4h, v4.4h[3] //36 * pi2_src[1]
+ smull v6.4s, v1.4h, v4.h[1] //83 * pi2_src[1]
+ smlal v6.4s, v3.4h, v4.h[3] //o[0] = 83 * pi2_src[1] + 36 * pi2_src[3]
+ smull v5.4s, v1.4h, v4.h[3] //36 * pi2_src[1]
ld1 {v22.s}[0],[x2],x5
- smlsl v5.4s, v3.4h, v4.4h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3]
+ smlsl v5.4s, v3.4h, v4.h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3]
saddl v7.4s, v0.4h, v2.4h //pi2_src[0] + pi2_src[2]
ssubl v17.4s, v0.4h, v2.4h //pi2_src[0] - pi2_src[2]
@@ -173,11 +173,11 @@ ihevc_itrans_recon_4x4_av8:
// first stage ends
// output in d0,d1,d2,d3
// second stage starts
- smull v6.4s, v1.4h, v4.4h[1] //83 * pi2_src[1]
+ smull v6.4s, v1.4h, v4.h[1] //83 * pi2_src[1]
ld1 {v22.s}[1],[x2],x5
- smlal v6.4s, v3.4h, v4.4h[3] //o[0] = 83 * pi2_src[1] + 36 * pi2_src[3]
- smull v5.4s, v1.4h, v4.4h[3] //36 * pi2_src[1]
- smlsl v5.4s, v3.4h, v4.4h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3]
+ smlal v6.4s, v3.4h, v4.h[3] //o[0] = 83 * pi2_src[1] + 36 * pi2_src[3]
+ smull v5.4s, v1.4h, v4.h[3] //36 * pi2_src[1]
+ smlsl v5.4s, v3.4h, v4.h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3]
ld1 {v23.s}[0],[x2],x5
saddl v7.4s, v0.4h, v2.4h //pi2_src[0] + pi2_src[2]
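For reference, the smull/smlal/smlsl and saddl/ssubl lines in ihevc_itrans_recon_4x4.s implement the standard HEVC 4-point inverse DCT partial butterfly with coefficients 64, 83 and 36. A scalar sketch of one column follows; the shift/rounding values and the placement of the factor of 64 are per the usual stage-1/stage-2 definitions and are not taken from this hunk:

    #include <stdint.h>

    static int16_t clip16(int v)
    {
        return (int16_t)(v < -32768 ? -32768 : v > 32767 ? 32767 : v);
    }

    /* One 4-point inverse DCT column (scalar sketch). */
    static void itrans4_col(const int16_t src[4], int16_t dst[4], int shift)
    {
        int add = 1 << (shift - 1);
        int o0 = 83 * src[1] + 36 * src[3];   /* smull/smlal with v4.h[1], v4.h[3] */
        int o1 = 36 * src[1] - 83 * src[3];   /* smull/smlsl                        */
        int e0 = 64 * (src[0] + src[2]);      /* even part, saddl term              */
        int e1 = 64 * (src[0] - src[2]);      /* even part, ssubl term              */
        dst[0] = clip16((e0 + o0 + add) >> shift);
        dst[1] = clip16((e1 + o1 + add) >> shift);
        dst[2] = clip16((e1 - o1 + add) >> shift);
        dst[3] = clip16((e0 - o0 + add) >> shift);
    }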
diff --git a/common/arm64/ihevc_itrans_recon_4x4_ttype1.s b/common/arm64/ihevc_itrans_recon_4x4_ttype1.s
index da04c5e..c30f358 100644
--- a/common/arm64/ihevc_itrans_recon_4x4_ttype1.s
+++ b/common/arm64/ihevc_itrans_recon_4x4_ttype1.s
@@ -127,34 +127,34 @@ ihevc_itrans_recon_4x4_ttype1_av8:
mov x9,#55
mov x10,#74
mov x11,#84
- mov v4.4h[0], w8
+ mov v4.h[0], w8
ld1 {v0.4h},[x0],x4 //loading pi2_src 1st row
- mov v4.4h[1], w9
+ mov v4.h[1], w9
ld1 {v1.4h},[x0],x4 //loading pi2_src 2nd row
- mov v4.4h[2], w10
+ mov v4.h[2], w10
ld1 {v2.4h},[x0],x4 //loading pi2_src 3rd row
- mov v4.4h[3], w11
+ mov v4.h[3], w11
ld1 {v3.4h},[x0],x4 //loading pi2_src 4th row
// first stage computation starts
- smull v6.4s, v1.4h, v4.4h[2] //74 * pi2_src[1]
- smlal v6.4s, v0.4h, v4.4h[0] //74 * pi2_src[1] + 29 * pi2_src[0]
- smlal v6.4s, v3.4h, v4.4h[1] //74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3]
- smlal v6.4s, v2.4h, v4.4h[3] //pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3]
+ smull v6.4s, v1.4h, v4.h[2] //74 * pi2_src[1]
+ smlal v6.4s, v0.4h, v4.h[0] //74 * pi2_src[1] + 29 * pi2_src[0]
+ smlal v6.4s, v3.4h, v4.h[1] //74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3]
+ smlal v6.4s, v2.4h, v4.h[3] //pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3]
- smull v5.4s, v1.4h, v4.4h[2] //74 * pi2_src[1]
- smlal v5.4s, v0.4h, v4.4h[1] //74 * pi2_src[1] + 55 * pi2_src[0]
- smlsl v5.4s, v2.4h, v4.4h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2]
- smlsl v5.4s, v3.4h, v4.4h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3])
+ smull v5.4s, v1.4h, v4.h[2] //74 * pi2_src[1]
+ smlal v5.4s, v0.4h, v4.h[1] //74 * pi2_src[1] + 55 * pi2_src[0]
+ smlsl v5.4s, v2.4h, v4.h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2]
+ smlsl v5.4s, v3.4h, v4.h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3])
- smull v7.4s, v0.4h, v4.4h[2] // 74 * pi2_src[0]
- smlsl v7.4s, v2.4h, v4.4h[2] // 74 * pi2_src[0] - 74 * pi2_src[2]
- smlal v7.4s, v3.4h, v4.4h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3]
+ smull v7.4s, v0.4h, v4.h[2] // 74 * pi2_src[0]
+ smlsl v7.4s, v2.4h, v4.h[2] // 74 * pi2_src[0] - 74 * pi2_src[2]
+ smlal v7.4s, v3.4h, v4.h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3]
- smull v20.4s, v2.4h, v4.4h[1] // 55 * pi2_src[2]
- smlsl v20.4s, v1.4h, v4.4h[2] // 55 * pi2_src[2] - 74 * pi2_src[1]
- smlsl v20.4s, v3.4h, v4.4h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
- smlal v20.4s, v0.4h, v4.4h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
+ smull v20.4s, v2.4h, v4.h[1] // 55 * pi2_src[2]
+ smlsl v20.4s, v1.4h, v4.h[2] // 55 * pi2_src[2] - 74 * pi2_src[1]
+ smlsl v20.4s, v3.4h, v4.h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
+ smlal v20.4s, v0.4h, v4.h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
sqrshrn v28.4h, v6.4s,#shift_stage1_idct // (pi2_out[0] + rounding ) >> shift_stage1_idct
sqrshrn v29.4h, v5.4s,#shift_stage1_idct // (pi2_out[1] + rounding ) >> shift_stage1_idct
@@ -180,25 +180,25 @@ ihevc_itrans_recon_4x4_ttype1_av8:
// d16 - d2
// d17 - d3
ld1 {v18.s}[1],[x2],x5
- smull v6.4s, v22.4h, v4.4h[2] //74 * pi2_src[1]
- smlal v6.4s, v21.4h, v4.4h[0] //74 * pi2_src[1] + 29 * pi2_src[0]
- smlal v6.4s, v17.4h, v4.4h[1] //74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3]
- smlal v6.4s, v16.4h, v4.4h[3] //pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3]
-
- smull v5.4s, v22.4h, v4.4h[2] //74 * pi2_src[1]
- smlal v5.4s, v21.4h, v4.4h[1] //74 * pi2_src[1] + 55 * pi2_src[0]
- smlsl v5.4s, v16.4h, v4.4h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2]
- smlsl v5.4s, v17.4h, v4.4h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3])
-
- smull v7.4s, v21.4h, v4.4h[2] // 74 * pi2_src[0]
- smlsl v7.4s, v16.4h, v4.4h[2] // 74 * pi2_src[0] - 74 * pi2_src[2]
- smlal v7.4s, v17.4h, v4.4h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3]
+ smull v6.4s, v22.4h, v4.h[2] //74 * pi2_src[1]
+ smlal v6.4s, v21.4h, v4.h[0] //74 * pi2_src[1] + 29 * pi2_src[0]
+ smlal v6.4s, v17.4h, v4.h[1] //74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3]
+ smlal v6.4s, v16.4h, v4.h[3] //pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3]
+
+ smull v5.4s, v22.4h, v4.h[2] //74 * pi2_src[1]
+ smlal v5.4s, v21.4h, v4.h[1] //74 * pi2_src[1] + 55 * pi2_src[0]
+ smlsl v5.4s, v16.4h, v4.h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2]
+ smlsl v5.4s, v17.4h, v4.h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3])
+
+ smull v7.4s, v21.4h, v4.h[2] // 74 * pi2_src[0]
+ smlsl v7.4s, v16.4h, v4.h[2] // 74 * pi2_src[0] - 74 * pi2_src[2]
+ smlal v7.4s, v17.4h, v4.h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3]
ld1 {v19.s}[0],[x2],x5
- smull v20.4s, v16.4h, v4.4h[1] // 55 * pi2_src[2]
- smlsl v20.4s, v22.4h, v4.4h[2] // - 74 * pi2_src[1] + 55 * pi2_src[2]
- smlsl v20.4s, v17.4h, v4.4h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
- smlal v20.4s, v21.4h, v4.4h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
+ smull v20.4s, v16.4h, v4.h[1] // 55 * pi2_src[2]
+ smlsl v20.4s, v22.4h, v4.h[2] // - 74 * pi2_src[1] + 55 * pi2_src[2]
+ smlsl v20.4s, v17.4h, v4.h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
+ smlal v20.4s, v21.4h, v4.h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
sqrshrn v28.4h, v6.4s,#shift_stage2_idct // (pi2_out[0] + rounding ) >> shift_stage1_idct
sqrshrn v29.4h, v5.4s,#shift_stage2_idct // (pi2_out[1] + rounding ) >> shift_stage1_idct
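The ttype1 kernel above is the 4-point inverse DST used for 4x4 intra luma blocks; the constants 29, 55, 74 and 84 moved into v4.h[0..3] are its transform matrix. A scalar sketch of one column, taken directly from the pi2_out[] formulas in the comments:

    #include <stdint.h>

    /* 4-point inverse DST (ttype1) column, per the pi2_out[] comments above.
     * Each output is then rounded and narrowed by sqrshrn with
     * shift_stage1_idct / shift_stage2_idct. */
    static void itrans4_dst_col(const int16_t src[4], int32_t out[4])
    {
        out[0] = 29 * src[0] + 74 * src[1] + 84 * src[2] + 55 * src[3];
        out[1] = 55 * src[0] + 74 * src[1] - 29 * src[2] - 84 * src[3];
        out[2] = 74 * src[0]               - 74 * src[2] + 74 * src[3];
        out[3] = 84 * src[0] - 74 * src[1] + 55 * src[2] - 29 * src[3];
    }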
diff --git a/common/arm64/ihevc_itrans_recon_8x8.s b/common/arm64/ihevc_itrans_recon_8x8.s
index 332677e..86ad136 100644
--- a/common/arm64/ihevc_itrans_recon_8x8.s
+++ b/common/arm64/ihevc_itrans_recon_8x8.s
@@ -184,30 +184,30 @@ ihevc_itrans_recon_8x8_av8:
ld1 {v2.4h},[x0],#8
ld1 {v3.4h},[x9],#8
ld1 {v4.4h},[x0],x5
- smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+ smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
ld1 {v5.4h},[x9],x5
- smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
+ smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
ld1 {v6.4h},[x0],#8
ld1 {v7.4h},[x9],#8
- smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
ld1 {v8.4h},[x0],x10
- smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
ld1 {v9.4h},[x9],x10
- smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
ld1 {v10.4h},[x0],#8
- smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
ld1 {v11.4h},[x9],#8
- smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
ld1 {v12.4h},[x0],x5
- smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
ld1 {v13.4h},[x9],x5
- smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
ld1 {v14.4h},[x0],#8
- smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
ld1 {v15.4h},[x9],#8
- smull v22.4s, v10.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
+ smull v22.4s, v10.4h, v0.h[0] //// y4 * cos4(part of c0 and c1)
ld1 {v16.4h},[x0],x10
- smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
+ smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0)
ld1 {v17.4h},[x9],x10
///* this following was activated when alignment is not there */
@@ -231,21 +231,21 @@ ihevc_itrans_recon_8x8_av8:
- smlal v24.4s, v14.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
- smlsl v26.4s, v14.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
- smlal v28.4s, v14.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
- smlal v30.4s, v14.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+ smlal v24.4s, v14.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ smlsl v26.4s, v14.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ smlal v28.4s, v14.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ smlal v30.4s, v14.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
- smlsl v18.4s, v11.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
- smlal v6.4s, v11.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+ smlsl v18.4s, v11.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ smlal v6.4s, v11.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
add v10.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
- smlal v24.4s, v15.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
- smlsl v26.4s, v15.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
- smlal v28.4s, v15.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
- smlsl v30.4s, v15.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
+ smlal v24.4s, v15.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
+ smlsl v26.4s, v15.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
+ smlal v28.4s, v15.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
+ smlsl v30.4s, v15.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
add v14.4s, v10.4s , v6.4s //// a0 = c0 + d0(part of x0,x7)
sub v10.4s, v10.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
@@ -301,20 +301,20 @@ skip_last4_rows:
- smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
- smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
+ smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
+ smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0)
- smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+ smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
add v14.4s, v20.4s , v6.4s //// a0 = c0 + d0(part of x0,x7)
@@ -350,37 +350,37 @@ last4_cols:
cmp x12,#0xf0
bge skip_last4cols
- smull v24.4s, v8.4h, v0.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v8.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v8.4h, v0.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v18.4s, v5.4h, v1.4h[2] //// y2 * sin2 (q4 is freed by this time)(part of d1)
- smull v8.4s, v5.4h, v0.4h[2] //// y2 * cos2(part of d0)
+ smull v18.4s, v5.4h, v1.h[2] //// y2 * sin2 (q4 is freed by this time)(part of d1)
+ smull v8.4s, v5.4h, v0.h[2] //// y2 * cos2(part of d0)
- smull v20.4s, v4.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
- smull v22.4s, v12.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
+ smull v20.4s, v4.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
+ smull v22.4s, v12.4h, v0.h[0] //// y4 * cos4(part of c0 and c1)
- smlal v24.4s, v16.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
- smlsl v26.4s, v16.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
- smlal v28.4s, v16.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
- smlal v30.4s, v16.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+ smlal v24.4s, v16.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ smlsl v26.4s, v16.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ smlal v28.4s, v16.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ smlal v30.4s, v16.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
- smlsl v18.4s, v13.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
- smlal v8.4s, v13.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+ smlsl v18.4s, v13.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ smlal v8.4s, v13.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
add v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
- smlal v24.4s, v17.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
- smlsl v26.4s, v17.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
- smlal v28.4s, v17.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
- smlsl v30.4s, v17.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)
+ smlal v24.4s, v17.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
+ smlsl v26.4s, v17.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
+ smlal v28.4s, v17.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
+ smlsl v30.4s, v17.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)
add v16.4s, v12.4s , v8.4s //// a0 = c0 + d0(part of e0,e7)
sub v12.4s, v12.4s , v8.4s //// a3 = c0 - d0(part of e3,e4)
@@ -440,21 +440,21 @@ skip_last4cols:
mov v25.d[0],x15
- smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+ smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
// vmull.s16 q11,d4,d0[0] @// y4 * cos4(part of c0 and c1)
- smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
- smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
+ smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
+ smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0)
@@ -520,19 +520,19 @@ skip_last4cols:
mov v25.d[0],x19
mov v25.d[1],x20
- smull v24.4s, v14.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v24.4s, v14.4h, v0.h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v14.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v14.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v14.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smull v26.4s, v14.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v14.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v14.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v15.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v15.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v15.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v15.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v10.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
- smull v18.4s, v11.4h, v1.4h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
- smull v14.4s, v11.4h, v0.4h[2] //// y2 * cos2(part of d0)
+ smlal v24.4s, v15.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v15.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v15.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v15.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smull v20.4s, v10.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
+ smull v18.4s, v11.4h, v1.h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
+ smull v14.4s, v11.4h, v0.h[2] //// y2 * cos2(part of d0)
add x4,x2,x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data
@@ -698,38 +698,38 @@ end_skip_last4cols:
//// q5 -> q2
//// q7 -> q4
- smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
- smull v22.4s, v4.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
+ smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
+ smull v22.4s, v4.4h, v0.h[0] //// y4 * cos4(part of c0 and c1)
- smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
- smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
+ smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
+ smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0)
- smlal v24.4s, v8.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
- smlsl v26.4s, v8.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
- smlal v28.4s, v8.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
- smlal v30.4s, v8.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+ smlal v24.4s, v8.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ smlsl v26.4s, v8.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ smlal v28.4s, v8.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ smlal v30.4s, v8.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
- smlsl v18.4s, v5.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
- smlal v6.4s, v5.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+ smlsl v18.4s, v5.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ smlal v6.4s, v5.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
add v2.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
- smlal v24.4s, v9.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
- smlsl v26.4s, v9.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
- smlal v28.4s, v9.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
- smlsl v30.4s, v9.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
+ smlal v24.4s, v9.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
+ smlsl v26.4s, v9.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
+ smlal v28.4s, v9.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
+ smlsl v30.4s, v9.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
sub v22.4s, v2.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
add v4.4s, v2.4s , v6.4s //// a0 = c0 + d0(part of x0,x7)
@@ -794,53 +794,53 @@ end_skip_last4cols:
- smull v24.4s, v14.4h, v0.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v14.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v14.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v14.4h, v1.4h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v15.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v15.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v15.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v15.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v10.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
- smull v22.4s, v12.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
- smull v18.4s, v11.4h, v1.4h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
- smull v14.4s, v11.4h, v0.4h[2] //// y2 * cos2(part of d0)
- smlal v24.4s, v16.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ smull v24.4s, v14.4h, v0.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v14.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v14.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v14.4h, v1.h[3] //// y1 * sin1(part of b3)
+ smlal v24.4s, v15.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v15.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v15.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v15.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smull v20.4s, v10.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
+ smull v22.4s, v12.4h, v0.h[0] //// y4 * cos4(part of c0 and c1)
+ smull v18.4s, v11.4h, v1.h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
+ smull v14.4s, v11.4h, v0.h[2] //// y2 * cos2(part of d0)
+ smlal v24.4s, v16.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
add x4,x2,x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data
- smlsl v26.4s, v16.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ smlsl v26.4s, v16.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
add x5,x8,x8, lsl #1 //
- smlal v28.4s, v16.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ smlal v28.4s, v16.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
add x0,x3,x7, lsl #1 // x0 points to 3rd row of dest data
- smlal v30.4s, v16.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+ smlal v30.4s, v16.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
add x10,x7,x7, lsl #1 //
- smlsl v18.4s, v13.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ smlsl v18.4s, v13.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
- smlal v14.4s, v13.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+ smlal v14.4s, v13.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
add v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
- smlal v24.4s, v17.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
+ smlal v24.4s, v17.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
// swapping v3 and v6
mov v31.d[0], v3.d[0]
mov v3.d[0], v6.d[0]
mov v6.d[0], v31.d[0]
- smlsl v26.4s, v17.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
+ smlsl v26.4s, v17.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
// swapping v5 and v8
mov v31.d[0], v5.d[0]
mov v5.d[0], v8.d[0]
mov v8.d[0], v31.d[0]
- smlal v28.4s, v17.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
- smlsl v30.4s, v17.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
+ smlal v28.4s, v17.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
+ smlsl v30.4s, v17.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
sub v22.4s, v12.4s , v14.4s //// a3 = c0 - d0(part of x3,x4)
add v12.4s, v12.4s , v14.4s //// a0 = c0 + d0(part of x0,x7)
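In the 8x8 kernel the same by-element rewrite applies to the odd-part accumulators b0..b3 described in the comments. The cos1/cos3/sin3/sin1 lanes come from a coefficient table loaded into v0/v1, which is not visible in these hunks; assuming the standard HEVC 8-point values 89, 75, 50 and 18, the odd part of one column is, as a scalar sketch:

    #include <stdint.h>

    /* Odd part of the 8-point inverse DCT (b0..b3 in the comments), assuming
     * cos1=89, cos3=75, sin3=50, sin1=18 as in the standard HEVC matrix.
     * The outputs x0..x7 are then a[k] +/- b[k] followed by round/shift. */
    static void itrans8_odd(const int16_t y[8], int32_t b[4])
    {
        b[0] = 89 * y[1] + 75 * y[3] + 50 * y[5] + 18 * y[7];
        b[1] = 75 * y[1] - 18 * y[3] - 89 * y[5] - 50 * y[7];
        b[2] = 50 * y[1] - 89 * y[3] + 18 * y[5] + 75 * y[7];
        b[3] = 18 * y[1] - 50 * y[3] + 75 * y[5] - 89 * y[7];
    }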
diff --git a/common/arm64/ihevc_mem_fns.s b/common/arm64/ihevc_mem_fns.s
index 6619c6c..5b1026b 100644
--- a/common/arm64/ihevc_mem_fns.s
+++ b/common/arm64/ihevc_mem_fns.s
@@ -114,7 +114,7 @@ LOOP_NEON_MEMCPY:
SUBS x2,x2,#8
BGE LOOP_NEON_MEMCPY
- CMP x2,#-8
+ CMN x2,#8
BEQ MEMCPY_RETURN
ARM_MEMCPY:
@@ -186,7 +186,7 @@ LOOP_NEON_MEMSET:
SUBS x2,x2,#8
BGE LOOP_NEON_MEMSET
- CMP x2,#-8
+ CMN x2,#8
BEQ MEMSET_RETURN
ARM_MEMSET:
@@ -259,7 +259,7 @@ LOOP_NEON_MEMSET_16BIT:
SUBS x2,x2,#8
BGE LOOP_NEON_MEMSET_16BIT
- CMP x2,#-8
+ CMN x2,#8
BEQ MEMSET_16BIT_RETURN
ARM_MEMSET_16BIT:
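The CMP-to-CMN change in ihevc_mem_fns.s is the one edit here that is not a lane-syntax cleanup. A64 CMP encodes only an unsigned 12-bit immediate, so "CMP x2,#-8" is rejected by stricter assemblers. CMN adds the immediate and sets the flags, so "CMN x2,#8" computes x2 + 8 exactly as "CMP x2,#-8" would compute x2 - (-8); the flags are identical, and the following BEQ still fires precisely when the remaining count equals -8.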
diff --git a/common/arm64/ihevc_sao_band_offset_chroma.s b/common/arm64/ihevc_sao_band_offset_chroma.s
index f67a3de..41042ae 100644
--- a/common/arm64/ihevc_sao_band_offset_chroma.s
+++ b/common/arm64/ihevc_sao_band_offset_chroma.s
@@ -140,17 +140,17 @@ SRC_TOP_LOOP: //wd is always multiple of 8
LD1 {v30.8b},[x7] //pi1_sao_offset_u load
ADD v5.8b, v1.8b , v31.8b //band_table_u.val[0] = vadd_u8(band_table_u.val[0], sao_band_pos_u)
- dup v29.8b, v30.8b[1] //vdup_n_u8(pi1_sao_offset_u[1])
+ dup v29.8b, v30.b[1] //vdup_n_u8(pi1_sao_offset_u[1])
ADD v6.8b, v2.8b , v31.8b //band_table_u.val[1] = vadd_u8(band_table_u.val[1], sao_band_pos_u)
- dup v28.8b, v30.8b[2] //vdup_n_u8(pi1_sao_offset_u[2])
+ dup v28.8b, v30.b[2] //vdup_n_u8(pi1_sao_offset_u[2])
ADD v7.8b, v3.8b , v31.8b //band_table_u.val[2] = vadd_u8(band_table_u.val[2], sao_band_pos_u)
- dup v27.8b, v30.8b[3] //vdup_n_u8(pi1_sao_offset_u[3])
+ dup v27.8b, v30.b[3] //vdup_n_u8(pi1_sao_offset_u[3])
ADD v8.8b, v4.8b , v31.8b //band_table_u.val[3] = vadd_u8(band_table_u.val[3], sao_band_pos_u)
CMP x5,#28
- dup v26.8b, v30.8b[4] //vdup_n_u8(pi1_sao_offset_u[4])
+ dup v26.8b, v30.b[4] //vdup_n_u8(pi1_sao_offset_u[4])
ADRP x14, :got:gu1_table_band_idx
LDR x14, [x14, #:got_lo12:gu1_table_band_idx]
@@ -225,16 +225,16 @@ SWITCH_BREAK_U:
LD1 {v25.8b},[x8] //pi1_sao_offset_v load
ADD v15.8b, v11.8b , v30.8b //band_table_v.val[2] = vadd_u8(band_table_v.val[2], band_pos_v)
- dup v29.8b, v25.8b[1] //vdup_n_u8(pi1_sao_offset_v[1])
+ dup v29.8b, v25.b[1] //vdup_n_u8(pi1_sao_offset_v[1])
ADD v16.8b, v12.8b , v30.8b //band_table_v.val[3] = vadd_u8(band_table_v.val[3], band_pos_v)
- dup v28.8b, v25.8b[2] //vdup_n_u8(pi1_sao_offset_v[2])
+ dup v28.8b, v25.b[2] //vdup_n_u8(pi1_sao_offset_v[2])
ADD v9.8b, v13.8b , v29.8b //band_table_v.val[0] = vadd_u8(band_table_v.val[0], vdup_n_u8(pi1_sao_offset_v[1]))
- dup v27.8b, v25.8b[3] //vdup_n_u8(pi1_sao_offset_v[3])
+ dup v27.8b, v25.b[3] //vdup_n_u8(pi1_sao_offset_v[3])
ADD v10.8b, v14.8b , v28.8b //band_table_v.val[1] = vadd_u8(band_table_v.val[1], vdup_n_u8(pi1_sao_offset_v[2]))
- dup v26.8b, v25.8b[4] //vdup_n_u8(pi1_sao_offset_v[4])
+ dup v26.8b, v25.b[4] //vdup_n_u8(pi1_sao_offset_v[4])
ADD v11.8b, v15.8b , v27.8b //band_table_v.val[2] = vadd_u8(band_table_v.val[2], vdup_n_u8(pi1_sao_offset_v[3]))
movi v29.8b, #16 //vdup_n_u8(16)
diff --git a/common/arm64/ihevc_sao_band_offset_luma.s b/common/arm64/ihevc_sao_band_offset_luma.s
index 779ee69..d283a90 100644
--- a/common/arm64/ihevc_sao_band_offset_luma.s
+++ b/common/arm64/ihevc_sao_band_offset_luma.s
@@ -123,16 +123,16 @@ SRC_TOP_LOOP: //wd is always multiple of 8
LD1 {v30.8b},[x6] //pi1_sao_offset load
ADD v5.8b, v1.8b , v31.8b //band_table.val[0] = vadd_u8(band_table.val[0], band_pos)
- dup v29.8b, v30.8b[1] //vdup_n_u8(pi1_sao_offset[1])
+ dup v29.8b, v30.b[1] //vdup_n_u8(pi1_sao_offset[1])
ADD v6.8b, v2.8b , v31.8b //band_table.val[1] = vadd_u8(band_table.val[1], band_pos)
- dup v28.8b, v30.8b[2] //vdup_n_u8(pi1_sao_offset[2])
+ dup v28.8b, v30.b[2] //vdup_n_u8(pi1_sao_offset[2])
ADD v7.8b, v3.8b , v31.8b //band_table.val[2] = vadd_u8(band_table.val[2], band_pos)
- dup v27.8b, v30.8b[3] //vdup_n_u8(pi1_sao_offset[3])
+ dup v27.8b, v30.b[3] //vdup_n_u8(pi1_sao_offset[3])
ADD v21.8b, v4.8b , v31.8b //band_table.val[3] = vadd_u8(band_table.val[3], band_pos)
- dup v26.8b, v30.8b[4] //vdup_n_u8(pi1_sao_offset[4])
+ dup v26.8b, v30.b[4] //vdup_n_u8(pi1_sao_offset[4])
ADD v1.8b, v5.8b , v29.8b //band_table.val[0] = vadd_u8(band_table.val[0], vdup_n_u8(pi1_sao_offset[1]))
movi v29.8b, #16 //vdup_n_u8(16)
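The SAO kernels that follow get the same treatment for lane moves: "dup v29.8b, v30.8b[1]" becomes "dup v29.8b, v30.b[1]" and "mov v3.8b[0], w12" becomes "mov v3.b[0], w12". These map to the vdup_lane / vsetq_lane intrinsics named in the comments; a minimal sketch with illustrative values only:

    #include <arm_neon.h>

    /* DUP Vd.8B, Vn.B[lane]  ->  vdup_lane_u8(vec, lane)
     * MOV Vd.B[lane], Wn     ->  vsetq_lane_u8(scalar, vec, lane) */
    static uint8x16_t sao_lane_sketch(uint8x8_t offsets, uint8x16_t mask, uint8_t avail0)
    {
        uint8x8_t off1 = vdup_lane_u8(offsets, 1);  /* broadcast pi1_sao_offset[1]        */
        (void)off1;                                 /* the broadcast feeds later adds     */
        return vsetq_lane_u8(avail0, mask, 0);      /* au1_mask[0] = pu1_avail[0]         */
    }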
diff --git a/common/arm64/ihevc_sao_edge_offset_class0.s b/common/arm64/ihevc_sao_edge_offset_class0.s
index 91146e8..7c61aa2 100644
--- a/common/arm64/ihevc_sao_edge_offset_class0.s
+++ b/common/arm64/ihevc_sao_edge_offset_class0.s
@@ -123,12 +123,12 @@ WIDTH_LOOP_16:
CMP x8,x9 //if(col == wd)
BNE AU1_MASK_FF //jump to else part
LDRB w12,[x7] //pu1_avail[0]
- mov v3.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+ mov v3.b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
B SKIP_AU1_MASK_FF //Skip the else part
AU1_MASK_FF:
MOV x12,#0xFF //move -1 to x12
- mov v3.8b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v3.b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
SKIP_AU1_MASK_FF:
CMP x8,#16 //If col == 16
@@ -146,7 +146,7 @@ PU1_SRC_LOOP:
SUB x5,x9,x8 //wd - col
SUB x14,x10,x4 //ht - row
- mov v21.8b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
+ mov v21.b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
mul x14, x14, x1 //(ht - row) * src_strd
LD1 {v26.16b},[x12] //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
@@ -158,7 +158,7 @@ PU1_SRC_LOOP:
LDRB w14,[x6,x5] //pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]
SUB x4,x4,#1
- mov v28.8b[15], w11 //II Iteration vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
+ mov v28.b[15], w11 //II Iteration vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
cmhi v18.16b, v21.16b , v17.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
SUB x12,x12,x1 //Decrement the pu1_src pointer by src_strd
@@ -170,7 +170,7 @@ PU1_SRC_LOOP:
SUB x5,x9,x8 //II wd - col
ADD x12,x12,x1 //Increment the pu1_src pointer by src_strd
- mov v21.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+ mov v21.b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
cmhi v30.16b, v26.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
LDRB w11,[x12,#16] //II pu1_src_cpy[16]
@@ -178,7 +178,7 @@ PU1_SRC_LOOP:
SUB x14,x10,x4 //II ht - row
cmhi v0.16b, v28.16b , v26.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
- mov v28.8b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+ mov v28.b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
SUB x12,x12,x1 //Decrement the pu1_src pointer by src_strd
mul x14, x14, x1 //II (ht - row) * src_strd
@@ -271,25 +271,25 @@ WIDTH_RESIDUE:
CMP x8,x9 //if(wd_rem == wd)
BNE AU1_MASK_FF_RESIDUE //jump to else part
LDRB w12,[x7] //pu1_avail[0]
- mov v3.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+ mov v3.b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
B SKIP_AU1_MASK_FF_RESIDUE //Skip the else part
AU1_MASK_FF_RESIDUE:
MOV x12,#0xFF //move -s to x12
- mov v3.8b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v3.b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
SKIP_AU1_MASK_FF_RESIDUE:
LDRB w11,[x7,#1] //pu1_avail[1]
SUB x5,x9,#1 //wd - 1
MOV x4,x10 //move ht to x4 for loop count
- mov v3.8b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v3.b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
MOV x12,x0 //pu1_src_cpy = pu1_src
PU1_SRC_LOOP_RESIDUE:
LD1 {v17.16b},[x12] //pu1_cur_row = vld1q_u8(pu1_src_cpy)
LDRB w11,[x2] //load pu1_src_left
- mov v21.8b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
+ mov v21.b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
EXT v21.16b, v21.16b , v17.16b,#15 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
cmhi v16.16b, v17.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
@@ -297,7 +297,7 @@ PU1_SRC_LOOP_RESIDUE:
SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
LDRB w11,[x12,#16] //pu1_src_cpy[16]
- mov v21.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+ mov v21.b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
EXT v21.16b, v17.16b , v21.16b,#1 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
cmhi v16.16b, v17.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
diff --git a/common/arm64/ihevc_sao_edge_offset_class0_chroma.s b/common/arm64/ihevc_sao_edge_offset_class0_chroma.s
index c6be41a..2a1eb7e 100644
--- a/common/arm64/ihevc_sao_edge_offset_class0_chroma.s
+++ b/common/arm64/ihevc_sao_edge_offset_class0_chroma.s
@@ -141,20 +141,20 @@ WIDTH_LOOP_16:
CMP x8,x9 //if(col == wd)
BNE AU1_MASK_FF //jump to else part
LDRB w12,[x7] //pu1_avail[0]
- mov v3.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
- mov v3.8b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 1)
+ mov v3.b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+ mov v3.b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 1)
B SKIP_AU1_MASK_FF //Skip the else part
AU1_MASK_FF:
MOV x12,#-1 //move -1 to x12
- mov v3.4h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v3.h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
SKIP_AU1_MASK_FF:
CMP x8,#16 //If col == 16
BNE SKIP_MASKING_IF_NOT16 //If not skip masking
LDRB w12,[x7,#1] //pu1_avail[1]
- mov v3.8b[14], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
- mov v3.8b[15], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v3.b[14], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+ mov v3.b[15], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
SKIP_MASKING_IF_NOT16:
MOV x12,x0 //pu1_src_cpy = pu1_src
@@ -168,7 +168,7 @@ PU1_SRC_LOOP:
SUB x5,x9,x8 //wd - col
SUB x14,x10,x4 //ht - row
- mov v21.4h[7], w11 //vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
+ mov v21.h[7], w11 //vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
mul x14, x14, x1 //(ht - row) * src_strd
LD1 {v30.16b},[x12] //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
@@ -181,7 +181,7 @@ PU1_SRC_LOOP:
cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
ADD x5,x14,x5 //(ht - row) * src_strd + (wd - col)
- mov v28.4h[7], w11 //II vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
+ mov v28.h[7], w11 //II vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
LDRH w14,[x6,x5] //pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)]
@@ -191,7 +191,7 @@ PU1_SRC_LOOP:
LDRB w11,[x12,#16] //pu1_src_cpy[16]
EXT v28.16b, v28.16b , v30.16b,#14 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
- mov v21.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+ mov v21.b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
LDRB w11,[x12,#17] //pu1_src_cpy[17]
@@ -199,18 +199,18 @@ PU1_SRC_LOOP:
STRH w14,[x2],#2 //pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
ADD x12,x12,x1
- mov v21.8b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+ mov v21.b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
LDRB w11,[x12,#16] //II pu1_src_cpy[16]
EXT v21.16b, v19.16b , v21.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
- mov v28.8b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+ mov v28.b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
LDRB w11,[x12,#17] //II pu1_src_cpy[17]
cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
SUB x12,x12,x1
cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
- mov v28.8b[1], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+ mov v28.b[1], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
EXT v28.16b, v30.16b , v28.16b,#2 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
@@ -328,18 +328,18 @@ WIDTH_RESIDUE:
CMP x8,x9 //if(wd_rem == wd)
BNE AU1_MASK_FF_RESIDUE //jump to else part
LDRB w12,[x7] //pu1_avail[0]
- mov v3.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
- mov v3.8b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+ mov v3.b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+ mov v3.b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
B SKIP_AU1_MASK_FF_RESIDUE //Skip the else part
AU1_MASK_FF_RESIDUE:
MOV x12,#-1 //move -1 to x12
- mov v3.4h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v3.h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
SKIP_AU1_MASK_FF_RESIDUE:
LDRB w12,[x7,#1] //pu1_avail[1]
- mov v3.8b[6], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
- mov v3.8b[7], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v3.b[6], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v3.b[7], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
MOV x12,x0 //pu1_src_cpy = pu1_src
MOV x4,x10 //move ht to x4 for loop count
@@ -352,7 +352,7 @@ PU1_SRC_LOOP_RESIDUE:
SUB x5,x9,#2 //wd - 2
SUB x14,x10,x4 //(ht - row)
- mov v21.4h[7], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
+ mov v21.h[7], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
LSL x14,x14,#1 //(ht - row) * 2
LD1 {v30.16b},[x12] //II pu1_cur_row = vld1q_u8(pu1_src_cpy)
@@ -366,20 +366,20 @@ PU1_SRC_LOOP_RESIDUE:
mul x14, x14, x1 //(ht - row) * 2 * src_strd
cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
- mov v28.4h[7], w11 //II vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
+ mov v28.h[7], w11 //II vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
LDRB w11,[x12,#16] //pu1_src_cpy[16]
SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
ADD x5,x14,x5 //(ht - row) * 2 * src_strd + (wd - 2)
- mov v21.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+ mov v21.b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
EXT v28.16b, v28.16b , v30.16b,#14 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
LDRB w11,[x12,#17] //pu1_src_cpy[17]
cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
LDRH w14,[x6, x5] //pu1_src_org[(ht - row) * 2* src_strd + (wd - 2)]
- mov v21.8b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+ mov v21.b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
cmhi v24.16b, v28.16b , v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
ADD x12,x12,x1
@@ -388,14 +388,14 @@ PU1_SRC_LOOP_RESIDUE:
LDRB w11,[x12,#16] //II pu1_src_cpy[16]
cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
- mov v28.8b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+ mov v28.b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
LDRB w11,[x12,#17] //II pu1_src_cpy[17]
cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
SUB x4,x4,#1 //II Decrement row by 1
SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- mov v28.8b[1], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+ mov v28.b[1], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
SUB x12,x12,x1
ADD v21.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left)
diff --git a/common/arm64/ihevc_sao_edge_offset_class2.s b/common/arm64/ihevc_sao_edge_offset_class2.s
index 31852f3..59eeadd 100644
--- a/common/arm64/ihevc_sao_edge_offset_class2.s
+++ b/common/arm64/ihevc_sao_edge_offset_class2.s
@@ -239,11 +239,11 @@ WIDTH_LOOP_16:
MOV x20,#-1
csel x8, x20, x8,NE //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
- mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
+ mov v1.b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
CMP x6,#16 //if(col == 16)
BNE SKIP_AU1_MASK_VAL
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
SKIP_AU1_MASK_VAL:
LDRB w11,[x5,#2] //pu1_avail[2]
@@ -289,7 +289,7 @@ AU1_SRC_LEFT_LOOP:
LDRB w4,[x4,#2] //I pu1_avail[2]
LDRB w5,[x8,#16] //I pu1_src_cpy[src_strd + 16]
- mov v18.8b[0], w5 //I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ mov v18.b[0], w5 //I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
EXT v18.16b, v16.16b , v18.16b,#1 //I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
CMP x4,#0 //I
@@ -307,7 +307,7 @@ SIGN_UP_CHANGE:
csel x4, x20, x4,LT //I
MOV x20,#1
csel x4, x20, x4,GT //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
- mov v17.8b[0], w4 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+ mov v17.b[0], w4 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
SIGN_UP_CHANGE_DONE:
cmhi v3.16b, v5.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
@@ -359,7 +359,7 @@ PU1_SRC_LOOP:
LDRB w4,[x0] //II pu1_src_cpy[0]
LDRB w8,[x11,#16] //III pu1_src_cpy[src_strd + 16]
- mov v28.8b[0], w5 //II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ mov v28.b[0], w5 //II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
SUB x5,x12,x7 //II ht_tmp - row
EXT v22.16b, v16.16b , v28.16b,#1 //II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
@@ -367,7 +367,7 @@ PU1_SRC_LOOP:
SUB x5,x5,#1
LDRB w5,[x5] //II load the value
- mov v18.8b[0], w8 //III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ mov v18.b[0], w8 //III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
SUB x7,x7,#1 //II Decrement the ht_tmp loop count by 1
SUBS x4,x4,x5 //II pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
@@ -389,7 +389,7 @@ PU1_SRC_LOOP:
LDRB w5,[x5] //III load the value
SUBS x2,x2,x5 //III pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
- mov v17.8b[0], w4 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+ mov v17.b[0], w4 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
movn x20,#0
csel x2, x20, x2,LT //III
@@ -409,7 +409,7 @@ PU1_SRC_LOOP:
EXT v17.16b, v17.16b , v17.16b,#15 //II sign_up = vextq_s8(sign_up, sign_up, 15)
AND v22.16b, v22.16b , v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
- mov v17.8b[0], w2 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+ mov v17.b[0], w2 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
ADD v18.16b, v0.16b , v17.16b //III edge_idx = vaddq_s8(const_2, sign_up)
TBL v24.16b, {v7.16b},v22.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
@@ -471,7 +471,7 @@ PU1_SRC_LOOP:
LDRB w5,[x8,#16] //pu1_src_cpy[src_strd + 16]
SUB x11,x12,x7 //ht_tmp - row
- mov v18.8b[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ mov v18.b[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
ADD x11,x14,x11 //pu1_src_left_cpy[ht_tmp - row]
SUB x11,x11,#1
@@ -488,7 +488,7 @@ PU1_SRC_LOOP:
csel x4, x20, x4,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
cmhi v18.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
- mov v17.8b[0], w4 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+ mov v17.b[0], w4 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
SUB v3.16b, v18.16b , v3.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
ADD v18.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
@@ -556,11 +556,11 @@ WD_16_HT_4_LOOP:
MOV x20,#-1
csel x8, x20, x8,NE //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
- mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
+ mov v1.b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
CMP x6,#16 //if(col == 16)
BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
SKIP_AU1_MASK_VAL_WD_16_HT_4:
LDRB w8,[x5,#2] //pu1_avail[2]
@@ -605,7 +605,7 @@ PU1_SRC_LOOP_WD_16_HT_4:
LD1 {v16.16b},[x8] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
LDRB w5,[x8,#16] //pu1_src_cpy[src_strd + 16]
- mov v18.8b[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ mov v18.b[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
EXT v18.16b, v16.16b , v18.16b,#1 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
CMP x7,x12
@@ -626,7 +626,7 @@ SIGN_UP_CHANGE_WD_16_HT_4:
csel x8, x20, x8,LT
MOV x20,#1
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
- mov v17.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+ mov v17.b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
SIGN_UP_CHANGE_DONE_WD_16_HT_4:
cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
@@ -689,10 +689,10 @@ WIDTH_RESIDUE:
MOV x20,#-1
csel x8, x20, x8,NE
- mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v1.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
PU1_AVAIL_2_RESIDUE:
LDRB w11,[x5,#2] //pu1_avail[2]
@@ -737,7 +737,7 @@ PU1_SRC_LOOP_RESIDUE:
LD1 {v16.16b},[x8] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
LDRB w8,[x8,#16] //pu1_src_cpy[src_strd + 16]
- mov v18.8b[0], w8 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ mov v18.b[0], w8 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
EXT v18.16b, v16.16b , v18.16b,#1 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
CMP x7,x12
@@ -759,7 +759,7 @@ SIGN_UP_CHANGE_RESIDUE:
csel x8, x20, x8,LT
MOV x20,#1
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
- mov v17.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+ mov v17.b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
SIGN_UP_CHANGE_DONE_RESIDUE:
cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
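The hunks above all make the same mechanical change: AArch64 lane inserts are written with only the element size on the destination (v1.b[0], v17.b[15]), not the older count-qualified spelling (v1.8b[0], v1.16b[15]) that GNU as tolerated but clang's integrated assembler rejects. A minimal sketch of the accepted form (register choices and values here are illustrative, not taken from the file):

        mov     v1.b[0], w8          // insert byte lane 0 (alias of INS Vd.B[0], Wn)
        mov     v1.b[15], w8         // byte lane 15 of the 128-bit register
    //  mov     v1.8b[0], w8         // old spelling: rejected by clang's integrated assembler

The instruction encoding is unchanged; only the mnemonic spelling differs, so the generated object code is the same.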
diff --git a/common/arm64/ihevc_sao_edge_offset_class2_chroma.s b/common/arm64/ihevc_sao_edge_offset_class2_chroma.s
index 8e286b4..b430709 100644
--- a/common/arm64/ihevc_sao_edge_offset_class2_chroma.s
+++ b/common/arm64/ihevc_sao_edge_offset_class2_chroma.s
@@ -322,7 +322,7 @@ PU1_AVAIL_3_LOOP:
LDR x2, [x2, #:got_lo12:gi1_table_edge_idx]
MOV x6,x7 //move wd to x6 loop_count
- movi v1.16b, #0XFF //au1_mask = vdupq_n_s8(-1)
+ movi v1.16b, #0xFF //au1_mask = vdupq_n_s8(-1)
CMP x7,#16 //Compare wd with 16
BLT WIDTH_RESIDUE //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
@@ -338,15 +338,15 @@ WIDTH_LOOP_16:
MOV x20,#-1
csel x8, x20, x8,NE
- mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
CMP x6,#16 //if(col == 16)
- mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
BNE SKIP_AU1_MASK_VAL
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v1.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
- mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
SKIP_AU1_MASK_VAL:
LDRB w9,[x5,#2] //pu1_avail[2]
@@ -400,7 +400,7 @@ AU1_SRC_LEFT_LOOP:
LDRH w5,[x8] //I pu1_src_cpy[src_strd + 16]
mov x10, x21 //I Loads pu1_avail
- mov v18.4h[0], w5 //I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ mov v18.h[0], w5 //I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
LDRB w10,[x10,#2] //I pu1_avail[2]
CMP x10,#0 //I
@@ -430,13 +430,13 @@ AU1_SRC_LEFT_LOOP:
csel x8, x20, x8,GT //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
CMP x4,#0 //I
- mov v17.8b[0], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ mov v17.b[0], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
movn x20,#0
csel x4, x20, x4,LT //I
MOV x20,#1
csel x4, x20, x4,GT //I SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
- mov v17.8b[1], w4 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ mov v17.b[1], w4 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
SIGN_UP_CHANGE_DONE:
LD1 {v30.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
@@ -499,7 +499,7 @@ PU1_SRC_LOOP:
LDRH w5,[x8] //II pu1_src_cpy[src_strd + 16]
ADD x11,x11,#16 //III
- mov v28.4h[0], w5 //II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ mov v28.h[0], w5 //II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
LDRH w4,[x11] //III pu1_src_cpy[src_strd + 16]
LDRB w8,[x0,x1] //II pu1_src_cpy[0]
@@ -507,7 +507,7 @@ PU1_SRC_LOOP:
SUB x5,x12,x7 //II ht_tmp - row
LSL x5,x5,#1 //II (ht_tmp - row) * 2
- mov v18.4h[0], w4 //III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ mov v18.h[0], w4 //III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
ADD x9,x14,x5 //II pu1_src_left_cpy[(ht_tmp - row) * 2]
sub x13,x9,#2
@@ -527,7 +527,7 @@ PU1_SRC_LOOP:
sub x13,x9,#1
LDRB w5,[x13] //II load the value
- mov v17.8b[0], w8 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ mov v17.b[0], w8 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
SUB x7,x7,#1 //II Decrement the ht_tmp loop count by 1
SUB x11,x11,x5 //II pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
@@ -545,7 +545,7 @@ PU1_SRC_LOOP:
SUB x5,x12,x7 //III ht_tmp - row
ADD x10,x0,x1
- mov v17.8b[1], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ mov v17.b[1], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
LSL x5,x5,#1 //III (ht_tmp - row) * 2
ADD x9,x14,x5 //III pu1_src_left_cpy[(ht_tmp - row) * 2]
@@ -579,7 +579,7 @@ PU1_SRC_LOOP:
UZP1 v31.8b, v26.8b, v27.8b
UZP2 v27.8b, v26.8b, v27.8b //II
mov v26.8b,v31.8b
- mov v17.8b[0], w4 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ mov v17.b[0], w4 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
movn x20,#0
csel x10, x20, x10,LT //III
@@ -592,7 +592,7 @@ PU1_SRC_LOOP:
TBL v25.8b, {v7.16b},v27.8b //II
SUB v22.16b, v22.16b , v20.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- mov v17.8b[1], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ mov v17.b[1], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
ZIP1 v31.8b, v24.8b, v25.8b
ZIP2 v25.8b, v24.8b, v25.8b //II
mov v24.8b,v31.8b
@@ -668,7 +668,7 @@ PU1_SRC_LOOP:
LDRH w5,[x8] //pu1_src_cpy[src_strd + 16]
LSL x4,x4,#1 //(ht_tmp - row) * 2
- mov v18.4h[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ mov v18.h[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
ADD x9,x14,x4 //pu1_src_left_cpy[(ht_tmp - row) * 2]
sub x13,x9,#2
@@ -686,7 +686,7 @@ PU1_SRC_LOOP:
LD1 {v30.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
LDRB w11,[x0,#1] //pu1_src_cpy[0]
- mov v17.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ mov v17.b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
sub x13,x9,#1
LDRB w5,[x13] //load the value
@@ -700,7 +700,7 @@ PU1_SRC_LOOP:
MOV x20,#1
csel x4, x20, x4,GT //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
- mov v17.8b[1], w4 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ mov v17.b[1], w4 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
@@ -771,14 +771,14 @@ WD_16_HT_4_LOOP:
MOV x20,#-1
csel x8, x20, x8,NE
- mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
- mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
CMP x6,#16 //if(col == 16)
BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v1.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
- mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
SKIP_AU1_MASK_VAL_WD_16_HT_4:
LDRB w8,[x5,#2] //pu1_avail[2]
@@ -828,7 +828,7 @@ PU1_SRC_LOOP_WD_16_HT_4:
ADD x8,x8,#16
LDRH w5,[x8] //pu1_src_cpy[src_strd + 16]
- mov v18.4h[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ mov v18.h[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
EXT v18.16b, v16.16b , v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
CMP x7,x12
@@ -851,7 +851,7 @@ SIGN_UP_CHANGE_WD_16_HT_4:
csel x8, x20, x8,LT
MOV x20,#1
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
- mov v17.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ mov v17.b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
LDRB w8,[x0,#1] //pu1_src_cpy[0]
sub x13,x9,#1
@@ -862,7 +862,7 @@ SIGN_UP_CHANGE_WD_16_HT_4:
csel x8, x20, x8,LT
MOV x20,#1
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
- mov v17.8b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ mov v17.b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
SIGN_UP_CHANGE_DONE_WD_16_HT_4:
cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
@@ -936,12 +936,12 @@ WIDTH_RESIDUE:
MOV x20,#-1
csel x8, x20, x8,NE
- mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
- mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v1.8b[6], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
- mov v1.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[6], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
LDRB w8,[x5,#2] //pu1_avail[2]
CMP x8,#0
@@ -986,7 +986,7 @@ PU1_SRC_LOOP_RESIDUE:
ADD x8,x8,#16
LDRH w5,[x8] //pu1_src_cpy[src_strd + 16]
- mov v18.4h[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ mov v18.h[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
EXT v18.16b, v16.16b , v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
CMP x7,x12
@@ -1009,7 +1009,7 @@ SIGN_UP_CHANGE_RESIDUE:
csel x8, x20, x8,LT
MOV x20,#1
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
- mov v17.8b[0], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ mov v17.b[0], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
LDRB w8,[x0,#1] //pu1_src_cpy[0]
sub x13,x9,#1
@@ -1020,7 +1020,7 @@ SIGN_UP_CHANGE_RESIDUE:
csel x8, x20, x8,LT
MOV x20,#1
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
- mov v17.8b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ mov v17.b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
SIGN_UP_CHANGE_DONE_RESIDUE:
cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
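The chroma variant above gets the same lane-syntax fix, with one wrinkle: the left-column samples are interleaved Cb/Cr, so the code loads a byte pair with LDRH and inserts it as a single halfword lane rather than two byte lanes. A minimal sketch of the accepted spelling (registers are illustrative, not taken from the file):

        ldrh    w5, [x8]             // load an interleaved Cb/Cr pair (two bytes)
        mov     v18.h[0], w5         // insert it as halfword lane 0
    //  mov     v18.4h[0], w5        // old count-qualified spelling, rejected by clang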
diff --git a/common/arm64/ihevc_sao_edge_offset_class3.s b/common/arm64/ihevc_sao_edge_offset_class3.s
index f393753..9d4f26a 100644
--- a/common/arm64/ihevc_sao_edge_offset_class3.s
+++ b/common/arm64/ihevc_sao_edge_offset_class3.s
@@ -247,12 +247,12 @@ WIDTH_LOOP_16:
csel w8,w20,w8,EQ
MOV x20,#-1
csel x8, x20, x8,NE
- mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
CMP x6,#16 //if(col == 16)
BNE SKIP_AU1_MASK_VAL
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
SKIP_AU1_MASK_VAL:
LDRB w8,[x5,#2] //pu1_avail[2]
@@ -302,7 +302,7 @@ AU1_SRC_LEFT_LOOP:
LDRB w8,[x8]
MOV x5,x23 //I Loads pu1_avail
- mov v18.16b[15], w8 //I vsetq_lane_u8
+ mov v18.b[15], w8 //I vsetq_lane_u8
LDRB w5,[x5,#2] //I pu1_avail[2]
EXT v18.16b, v18.16b , v16.16b,#15 //I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
@@ -320,7 +320,7 @@ SIGN_UP_CHANGE:
csel x8, x20, x8,LT //I
MOV x20,#1
csel x8, x20, x8,GT //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
- mov v17.16b[15], w8 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ mov v17.b[15], w8 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
SIGN_UP_CHANGE_DONE:
cmhi v3.16b, v5.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
@@ -373,7 +373,7 @@ PU1_SRC_LOOP:
LDRB w8,[x8,#1]
LDRB w4,[x0,#16] //II load the value
- mov v18.16b[15], w8 //II vsetq_lane_u8
+ mov v18.b[15], w8 //II vsetq_lane_u8
SUB x11,x11,x4 //II pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
CMP x11,#0 //II
@@ -387,7 +387,7 @@ PU1_SRC_LOOP:
csel x11, x20, x11,GT //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
ADD x8,x14,x5 //III pu1_src_left_cpy[ht_tmp - row]
- mov v17.8b[15], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ mov v17.b[15], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
CMP x7,#1 //III
BNE NEXT_ROW_ELSE_2 //III
@@ -412,7 +412,7 @@ NEXT_ROW_ELSE_2:
movn x20,#0
csel x2, x20, x2,LT //III
- mov v18.16b[15], w8 //III vsetq_lane_u8
+ mov v18.b[15], w8 //III vsetq_lane_u8
MOV x20,#1
csel x2, x20, x2,GT //III SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
@@ -428,7 +428,7 @@ NEXT_ROW_ELSE_2:
TBL v26.16b, {v6.16b},v26.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
cmhi v3.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
- mov v17.16b[15], w2 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ mov v17.b[15], w2 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
// TBL v27.8b, {v6.16b},v27.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
cmhi v18.16b, v18.16b , v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
@@ -506,7 +506,7 @@ NEXT_ROW_ELSE_3:
NEXT_ROW_POINTER_ASSIGNED_3:
LDRB w11,[x4,#15] //pu1_src_cpy[15]
- mov v18.16b[15], w8 //vsetq_lane_u8
+ mov v18.b[15], w8 //vsetq_lane_u8
SUB x8,x11,x5 //pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
CMP x8,#0
@@ -521,7 +521,7 @@ NEXT_ROW_POINTER_ASSIGNED_3:
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
cmhi v26.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
- mov v17.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ mov v17.b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
SUB v24.16b, v26.16b , v24.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
Uxtl v20.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
@@ -582,12 +582,12 @@ WD_16_HT_4_LOOP:
csel w8,w20,w8,EQ
MOV x20,#-1
csel x8, x20, x8,NE
- mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
CMP x6,#16 //if(col == 16)
BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
SKIP_AU1_MASK_VAL_WD_16_HT_4:
LDRB w8,[x5,#2] //pu1_avail[2]
@@ -643,7 +643,7 @@ NEXT_ROW_ELSE_WD_16_HT_4:
LDRB w8,[x8]
NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
- mov v18.16b[15], w8 //vsetq_lane_u8
+ mov v18.b[15], w8 //vsetq_lane_u8
EXT v18.16b, v18.16b , v16.16b,#15 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
CMP x7,x12
@@ -664,7 +664,7 @@ SIGN_UP_CHANGE_WD_16_HT_4:
csel x8, x20, x8,LT
MOV x20,#1
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
- mov v17.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ mov v17.b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
SIGN_UP_CHANGE_DONE_WD_16_HT_4:
cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
@@ -725,10 +725,10 @@ WIDTH_RESIDUE:
MOV x20,#-1
csel x8, x20, x8,NE
- mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v1.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
PU1_AVAIL_2_RESIDUE:
LDRB w8,[x5,#2] //pu1_avail[2]
@@ -783,7 +783,7 @@ NEXT_ROW_ELSE_RESIDUE:
LDRB w8,[x8]
NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
- mov v18.16b[15], w8 //vsetq_lane_u8
+ mov v18.b[15], w8 //vsetq_lane_u8
EXT v18.16b, v18.16b , v16.16b,#15 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
CMP x7,x12
@@ -804,7 +804,7 @@ SIGN_UP_CHANGE_RESIDUE:
csel x8, x20, x8,LT
MOV x20,#1
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
- mov v17.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ mov v17.b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
SIGN_UP_CHANGE_DONE_RESIDUE:
cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
diff --git a/common/arm64/ihevc_sao_edge_offset_class3_chroma.s b/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
index 5c444c0..8e93110 100644
--- a/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
+++ b/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
@@ -328,16 +328,16 @@ WIDTH_LOOP_16:
MOV x20,#-1
csel x8, x20, x8,NE
- mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
LDRB w11,[x5,#2] //pu1_avail[2]
CMP x6,#16 //if(col == 16)
- mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
BNE SKIP_AU1_MASK_VAL
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v1.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
- mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
SKIP_AU1_MASK_VAL:
CMP x11,#0
@@ -389,7 +389,7 @@ AU1_SRC_LEFT_LOOP:
ADD x8,x14,x5,LSL #1 //I pu1_src_left_cpy[(ht_tmp - row) * 2]
LDRH w5,[x8,#2] //I
- mov v18.4h[7], w5 //I vsetq_lane_u8
+ mov v18.h[7], w5 //I vsetq_lane_u8
mov x11, x21 //I Loads pu1_avail
LDRB w11,[x11,#2] //I pu1_avail[2]
@@ -418,11 +418,11 @@ AU1_SRC_LEFT_LOOP:
movn x20,#0
csel x9, x20, x9,LT //I
- mov v17.16b[14], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
+ mov v17.b[14], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
MOV x20,#1
csel x9, x20, x9,GT //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
- mov v17.16b[15], w9 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
+ mov v17.b[15], w9 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
SIGN_UP_CHANGE_DONE:
LD1 {v28.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
@@ -483,7 +483,7 @@ PU1_SRC_LOOP:
LDRB w10,[x4,#14] //II pu1_src_cpy[14]
LDRB w8,[x4,#15] //II pu1_src_cpy[15]
- mov v28.4h[7], w9 //II vsetq_lane_u8
+ mov v28.h[7], w9 //II vsetq_lane_u8
ADD x4,x11,x1 //III *pu1_src + src_strd
LDRB w5,[x0,#17] //II load the value pu1_src_cpy[17 - src_strd]
@@ -507,14 +507,14 @@ PU1_SRC_LOOP:
csel x10, x20, x10,GT //II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
CMP x8,#0 //II
- mov v17.8b[14], w10 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
+ mov v17.b[14], w10 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
movn x20,#0
csel x8, x20, x8,LT //II
MOV x20,#1
csel x8, x20, x8,GT //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
SUB x10,x12,x7 //III ht_tmp - row
- mov v17.8b[15], w8 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
+ mov v17.b[15], w8 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
ADD x11,x14,x10,LSL #1 //III pu1_src_left_cpy[(ht_tmp - row) * 2]
CMP x7,#1 //III
@@ -533,7 +533,7 @@ NEXT_ROW_POINTER_ASSIGNED_2:
ADD x11,x0,x1 //III
LDRB w9,[x11,#14] //III pu1_src_cpy[14]
- mov v18.4h[7], w5 //III vsetq_lane_u8
+ mov v18.h[7], w5 //III vsetq_lane_u8
LDRB w8,[x11,#15] //III pu1_src_cpy[15]
LDRB w11,[x0,#16] //III load the value pu1_src_cpy[16 - src_strd]
@@ -565,11 +565,11 @@ NEXT_ROW_POINTER_ASSIGNED_2:
//TBL v27.8b, {v21.16b},v27.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
cmhi v22.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
- mov v17.16b[14], w9 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
+ mov v17.b[14], w9 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
AND v26.16b, v26.16b , v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
mov v27.d[0],v26.d[1]
- mov v17.16b[15], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
+ mov v17.b[15], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
UZP1 v31.8b, v26.8b, v27.8b
UZP2 v27.8b, v26.8b, v27.8b //II
mov v26.8b,v31.8b
@@ -668,7 +668,7 @@ NEXT_ROW_POINTER_ASSIGNED_3:
LDRB w8,[x0,#14] //pu1_src_cpy[14]
SUB x8,x8,x4 //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
- mov v18.4h[7], w5 //vsetq_lane_u8
+ mov v18.h[7], w5 //vsetq_lane_u8
LDRB w10,[x0,#15] //pu1_src_cpy[15]
CMP x8,#0
@@ -682,13 +682,13 @@ NEXT_ROW_POINTER_ASSIGNED_3:
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
CMP x10,#0
- mov v17.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
+ mov v17.b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
movn x20,#0
csel x10, x20, x10,LT
MOV x20,#1
csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
- mov v17.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
+ mov v17.b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
cmhi v22.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
@@ -762,15 +762,15 @@ WD_16_HT_4_LOOP:
csel w8,w20,w8,EQ
MOV x20,#-1
csel x8, x20, x8,NE
- mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
CMP x6,#16 //if(col == 16)
- mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
LDRB w8,[x5,#1] //pu1_avail[1]
- mov v1.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
- mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
SKIP_AU1_MASK_VAL_WD_16_HT_4:
LDRB w11,[x5,#2] //pu1_avail[2]
@@ -834,7 +834,7 @@ PU1_SRC_LOOP_WD_16_HT_4:
NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
LDRH w5,[x8]
- mov v18.8h[7], w5 //vsetq_lane_u8
+ mov v18.h[7], w5 //vsetq_lane_u8
EXT v18.16b, v18.16b , v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
CMP x7,x12
@@ -864,13 +864,13 @@ SIGN_UP_CHANGE_WD_16_HT_4:
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
CMP x10,#0
- mov v17.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
+ mov v17.b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
movn x20,#0
csel x10, x20, x10,LT
MOV x20,#1
csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
- mov v17.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
+ mov v17.b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
SIGN_UP_CHANGE_DONE_WD_16_HT_4:
LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
@@ -949,20 +949,20 @@ WIDTH_RESIDUE:
LDRB w11,[x5,#1] //pu1_avail[1]
LDRB w9,[x5,#2] //pu1_avail[2]
- mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
CMP x9,#0
SUB x20,x0,x1 //pu1_src - src_strd
csel x10, x20, x10,EQ
- mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
csel x10, x3, x10,NE
ADD x10,x10,#2 //pu1_src - src_strd + 2
- mov v1.8b[6], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[6], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
ADD x5,sp,#0x4B //*au1_src_left_tmp
mov w4, w25 //Loads ht
- mov v1.8b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
mov w7, w24 //Loads wd
mov x8, x26 //Loads *pu1_src
@@ -1015,10 +1015,10 @@ NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
LDRB w5,[x8]
LDRB w8,[x8,#1]
- mov v18.16b[14], w5 //vsetq_lane_u8
+ mov v18.b[14], w5 //vsetq_lane_u8
CMP x7,x12
- mov v18.16b[15], w8 //vsetq_lane_u8
+ mov v18.b[15], w8 //vsetq_lane_u8
EXT v18.16b, v18.16b , v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
BLT SIGN_UP_CHANGE_RESIDUE
@@ -1047,13 +1047,13 @@ SIGN_UP_CHANGE_RESIDUE:
csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
CMP x10,#0
- mov v17.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
+ mov v17.b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
movn x20,#0
csel x10, x20, x10,LT
MOV x20,#1
csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
- mov v17.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
+ mov v17.b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
SIGN_UP_CHANGE_DONE_RESIDUE:
LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
diff --git a/common/arm64/ihevc_weighted_pred_bi.s b/common/arm64/ihevc_weighted_pred_bi.s
index c0508d8..299b042 100644
--- a/common/arm64/ihevc_weighted_pred_bi.s
+++ b/common/arm64/ihevc_weighted_pred_bi.s
@@ -219,28 +219,28 @@ core_loop:
ld1 {v0.4h},[x0],#8 //load and increment the pi2_src1
add x10,x2,x5 //pu1_dst_tmp = pu1_dst + dst_strd
ld1 {v1.4h},[x1],#8 //load and increment the pi2_src2
- smull v4.4s, v0.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0)
+ smull v4.4s, v0.4h, v7.h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0)
ld1 {v2.4h},[x6],x3 //load and increment the pi2_src_tmp1 ii iteration
- smull v5.4s, v1.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
+ smull v5.4s, v1.4h, v7.h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
ld1 {v3.4h},[x8],x4 //load and increment the pi2_src_tmp1 ii iteration
add v4.4s, v4.4s , v5.4s //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)
ld1 {v0.4h},[x6],x3 //load and increment the pi2_src1 iii iteration
- smull v6.4s, v2.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration
+ smull v6.4s, v2.4h, v7.h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration
ld1 {v1.4h},[x8],x4 //load and increment the pi2_src2 iii iteration
add v4.4s, v4.4s , v30.4s //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
- smull v19.4s, v0.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration
+ smull v19.4s, v0.4h, v7.h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration
ld1 {v2.4h},[x6],x3 //load and increment the pi2_src_tmp1 iv iteration
- smull v17.4s, v3.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
+ smull v17.4s, v3.4h, v7.h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
sshl v4.4s,v4.4s,v28.4s //vshlq_s32(i4_tmp1_t1, tmp_shift_t)
ld1 {v3.4h},[x8],x4 //load and increment the pi2_src_tmp1 iv iteration
add v6.4s, v6.4s , v17.4s //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration
sqxtun v4.4h, v4.4s //vqmovun_s32(sto_res_tmp1)
- smull v16.4s, v1.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration
+ smull v16.4s, v1.4h, v7.h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration
add v6.4s, v6.4s , v30.4s //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
//mov v5, v4 //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
@@ -248,13 +248,13 @@ core_loop:
sshl v6.4s,v6.4s,v28.4s
//vshl.s32 q5,q5,q14 //vshlq_s32(i4_tmp2_t1, tmp_shift_t) ii iteration
- smull v18.4s, v2.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration
+ smull v18.4s, v2.4h, v7.h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration
uqxtn v4.8b,v4.8h
//vqmovn.u16 d4,q2 //vqmovn_u16(sto_res_tmp3)
add v19.4s, v19.4s , v30.4s //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
sqxtun v6.4h, v6.4s //vqmovun_s32(sto_res_tmp1) ii iteration
- smull v20.4s, v3.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration
+ smull v20.4s, v3.4h, v7.h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration
sshl v19.4s,v19.4s,v28.4s
//vshl.s32 q7,q7,q14 //vshlq_s32(i4_tmp1_t1, tmp_shift_t) iii iteration
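The weighted-prediction hunks (and the format-conversion file further down) get the analogous fix for by-element multiplies: the indexed operand of SMULL/SMLAL carries only the element size, so v7.4h[0] becomes v7.h[0]. A minimal sketch of the accepted form (operands are illustrative):

        smull   v4.4s,  v0.4h, v7.h[0]   // widening multiply of v0.4h by lane 0 of v7
        smull2  v22.4s, v6.8h, v0.h[0]   // same for the upper half of v6
    //  smull   v4.4s,  v0.4h, v7.4h[0]  // old spelling, rejected by clang's assembler

As with the lane inserts, the encoding does not change. For 16-bit elements the indexed register must lie in v0-v15, which the registers used in these files already satisfy.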
diff --git a/common/arm64/ihevc_weighted_pred_uni.s b/common/arm64/ihevc_weighted_pred_uni.s
index 5586679..c6dee6f 100644
--- a/common/arm64/ihevc_weighted_pred_uni.s
+++ b/common/arm64/ihevc_weighted_pred_uni.s
@@ -151,7 +151,7 @@ ihevc_weighted_pred_uni_av8:
add x10,x10,x22 //lvl_shift * wgt0 + (off0 << shift)
mov x9,x21 //load wt
sub x12,x6,#1
- mov v0.4h[0], w4 //moved for scalar multiplication
+ mov v0.h[0], w4 //moved for scalar multiplication
lsl x2,x2,#1
dup v28.4s,w6 //vmovq_n_s32(tmp_shift)
lsl x22,x11,x12
@@ -172,19 +172,19 @@ core_loop:
add x6,x1,x3 //pu1_dst_tmp = pu1_dst + dst_strd
ld1 {v1.4h},[x0],#8 //load and increment the pi2_src
ld1 {v2.4h},[x5],x2 //load and increment the pi2_src_tmp ii iteration
- smull v4.4s, v1.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0)
+ smull v4.4s, v1.4h, v0.h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0)
add v4.4s, v4.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t)
ld1 {v3.4h},[x5],x2 //load and increment the pi2_src iii iteration
- smull v6.4s, v2.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) ii iteration
+ smull v6.4s, v2.4h, v0.h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) ii iteration
ld1 {v5.4h},[x5],x2 //load and increment the pi2_src_tmp iv iteration
sshl v4.4s,v4.4s,v28.4s
//vshl.s32 q2,q2,q14 //vshlq_s32(i4_tmp1_t, tmp_shift_t)
add v6.4s, v6.4s , v30.4s //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) ii iteration
- smull v7.4s, v3.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration
+ smull v7.4s, v3.4h, v0.h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration
sqxtun v4.4h, v4.4s //vqmovun_s32(sto_res_tmp1)
add v7.4s, v7.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration
@@ -193,7 +193,7 @@ core_loop:
sshl v6.4s,v6.4s,v28.4s
//vshl.s32 q3,q3,q14 //vshlq_s32(i4_tmp2_t, tmp_shift_t) ii iteration
- smull v16.4s, v5.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration
+ smull v16.4s, v5.4h, v0.h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration
uqxtn v4.8b, v4.8h //vqmovn_u16(sto_res_tmp3)
sshl v7.4s,v7.4s,v28.4s
diff --git a/decoder.arm64.mk b/decoder.arm64.mk
index 316cc26..2e6ec23 100644
--- a/decoder.arm64.mk
+++ b/decoder.arm64.mk
@@ -91,3 +91,10 @@ libhevcd_cflags_arm64 += -DDEFAULT_ARCH=D_ARCH_ARMV8_GENERIC
LOCAL_SRC_FILES_arm64 += $(libhevcd_srcs_c_arm64) $(libhevcd_srcs_asm_arm64)
LOCAL_C_INCLUDES_arm64 += $(libhevcd_inc_dir_arm64)
LOCAL_CFLAGS_arm64 += $(libhevcd_cflags_arm64)
+
+# Clang doesn't pass -I flags to the assembler when building a .s file.
+# We need to tell it to pass them to the assembler specifically (doesn't hurt
+# with gcc either, and may actually help future gcc versions if they decide
+# to start making a difference between assembly and C includes).
+comma := ,
+LOCAL_ASFLAGS_arm64 += $(addprefix -Wa$(comma)-I,$(libhevcd_inc_dir_arm64))
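The -Wa,<opt> form makes the compiler driver forward <opt> to the assembler, and $(addprefix ...) applies it to every include directory; the comma has to live in a variable because a literal comma would be read as an argument separator inside the make function call. Assuming, purely for illustration, that the include list holds common/arm64 and decoder/arm64 (its actual contents are set earlier in the makefiles), the expansion looks like:

    comma := ,
    LOCAL_ASFLAGS_arm64 += $(addprefix -Wa$(comma)-I,$(libhevcd_inc_dir_arm64))
    # becomes:  -Wa,-I$(LOCAL_PATH)/common/arm64 -Wa,-I$(LOCAL_PATH)/decoder/arm64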
diff --git a/decoder.mips64.mk b/decoder.mips64.mk
index 5ac515e..81b5852 100644
--- a/decoder.mips64.mk
+++ b/decoder.mips64.mk
@@ -1,8 +1,8 @@
-libhevcd_inc_dir_mips += $(LOCAL_PATH)/decoder/mips
-libhevcd_inc_dir_mips += $(LOCAL_PATH)/common/mips
+libhevcd_inc_dir_mips64 += $(LOCAL_PATH)/decoder/mips
+libhevcd_inc_dir_mips64 += $(LOCAL_PATH)/common/mips
-libhevcd_srcs_c_mips += decoder/mips/ihevcd_function_selector.c
-libhevcd_srcs_c_mips += decoder/mips/ihevcd_function_selector_mips_generic.c
+libhevcd_srcs_c_mips64 += decoder/mips/ihevcd_function_selector.c
+libhevcd_srcs_c_mips64 += decoder/mips/ihevcd_function_selector_mips_generic.c
LOCAL_SRC_FILES_mips64 += $(libhevcd_srcs_c_mips64) $(libhevcd_srcs_asm_mips64)
LOCAL_C_INCLUDES_mips64 += $(libhevcd_inc_dir_mips64)
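This is a plain suffix-mismatch fix: the source and include lists were defined under a _mips suffix but consumed under _mips64, so the LOCAL_* lines below them expanded to nothing and the MIPS64 build picked up neither the C sources nor the include path. A sketch of the effect (paths copied from the hunk above):

    # before: libhevcd_srcs_c_mips was never referenced, so this expanded empty
    LOCAL_SRC_FILES_mips64 += $(libhevcd_srcs_c_mips64) $(libhevcd_srcs_asm_mips64)
    # after the rename it picks up
    #   decoder/mips/ihevcd_function_selector.c
    #   decoder/mips/ihevcd_function_selector_mips_generic.c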
diff --git a/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
index a6041f5..026b65f 100644
--- a/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
+++ b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
@@ -121,16 +121,16 @@ ihevcd_fmt_conv_420sp_to_rgba8888_av8:
///* can be loaded from a defined const type */
mov x10,#0x3311
- mov v0.4h[0], w10 ////C1
+ mov v0.h[0], w10 ////C1
mov x10,#0xF379
- mov v0.4h[1], w10 ////C2
+ mov v0.h[1], w10 ////C2
mov x10,#0xE5F8
- mov v0.4h[2], w10 ////C3
+ mov v0.h[2], w10 ////C3
mov x10,#0x4092
- mov v0.4h[3], w10 ////C4
+ mov v0.h[3], w10 ////C4
////LOAD CONSTANT 128 INTO A CORTEX REGISTER
MOV x10,#128
@@ -197,16 +197,16 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
prfm PLDL1KEEP,[x1]
////NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS
- sMULL v5.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
- sMULL2 v7.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
+ sMULL v5.4s, v4.4h, v0.h[3] ////(U-128)*C4 FOR B
+ sMULL2 v7.4s, v4.8h, v0.h[3] ////(U-128)*C4 FOR B
- sMULL v20.4s, v6.4h, v0.4h[0] ////(V-128)*C1 FOR R
- sMULL2 v22.4s, v6.8h, v0.4h[0] ////(V-128)*C1 FOR R
+ sMULL v20.4s, v6.4h, v0.h[0] ////(V-128)*C1 FOR R
+ sMULL2 v22.4s, v6.8h, v0.h[0] ////(V-128)*C1 FOR R
- sMULL v12.4s, v4.4h, v0.4h[1] ////(U-128)*C2 FOR G
- sMLAL v12.4s, v6.4h, v0.4h[2] ////Q6 = (U-128)*C2 + (V-128)*C3
- sMULL2 v14.4s, v4.8h, v0.4h[1] ////(U-128)*C2 FOR G
- sMLAL2 v14.4s, v6.8h, v0.4h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
+ sMULL v12.4s, v4.4h, v0.h[1] ////(U-128)*C2 FOR G
+ sMLAL v12.4s, v6.4h, v0.h[2] ////Q6 = (U-128)*C2 + (V-128)*C3
+ sMULL2 v14.4s, v4.8h, v0.h[1] ////(U-128)*C2 FOR G
+ sMLAL2 v14.4s, v6.8h, v0.h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
////NARROW RIGHT SHIFT BY 13 FOR R&B
sqshrn v5.4h, v5.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
@@ -360,16 +360,16 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
////NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS
- sMULL v5.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
- sMULL2 v7.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
+ sMULL v5.4s, v4.4h, v0.h[3] ////(U-128)*C4 FOR B
+ sMULL2 v7.4s, v4.8h, v0.h[3] ////(U-128)*C4 FOR B
- sMULL v20.4s, v6.4h, v0.4h[0] ////(V-128)*C1 FOR R
- sMULL2 v22.4s, v6.8h, v0.4h[0] ////(V-128)*C1 FOR R
+ sMULL v20.4s, v6.4h, v0.h[0] ////(V-128)*C1 FOR R
+ sMULL2 v22.4s, v6.8h, v0.h[0] ////(V-128)*C1 FOR R
- sMULL v12.4s, v4.4h, v0.4h[1] ////(U-128)*C2 FOR G
- sMLAL v12.4s, v6.4h, v0.4h[2] ////Q6 = (U-128)*C2 + (V-128)*C3
- sMULL2 v14.4s, v4.8h, v0.4h[1] ////(U-128)*C2 FOR G
- sMLAL2 v14.4s, v6.8h, v0.4h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
+ sMULL v12.4s, v4.4h, v0.h[1] ////(U-128)*C2 FOR G
+ sMLAL v12.4s, v6.4h, v0.h[2] ////Q6 = (U-128)*C2 + (V-128)*C3
+ sMULL2 v14.4s, v4.8h, v0.h[1] ////(U-128)*C2 FOR G
+ sMLAL2 v14.4s, v6.8h, v0.h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
////NARROW RIGHT SHIFT BY 13 FOR R&B
sqshrn v5.4h, v5.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES