54 files changed, 1970 insertions, 1958 deletions
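Most of this patch is one mechanical cleanup repeated across the tree so the assembly also builds with Clang's integrated assembler, not only GNU as. On the 32-bit ARM side, each file gains a .syntax unified directive and condition codes move to the end of the mnemonic (ldmeqfd becomes ldmfdeq, LDREQB becomes LDRBEQ, subeqs becomes subseq). On the AArch64 side, vector lane references drop the arrangement qualifier (v2.8b[0] becomes v2.b[0], v0.4h[0] becomes v0.h[0], v5.2s[1] becomes v5.s[1]), and a few instructions are rewritten into their canonical encodable forms (add with a negative immediate becomes sub; cmp against -1 becomes cmn against 1). The two sketches below illustrate the before/after patterns; they are standalone fragments with invented labels, not lines taken from these files.

    @ ARMv7: in the old divided syntax the condition sat inside the mnemonic
    @ (LDREQB). With .syntax unified the suffix follows the full mnemonic.
    .syntax unified
    .text
    load_avail:                 @ hypothetical label
        cmp     r6, r7          @ col == wd
        ldrbeq  r8, [r5]        @ UAL spelling of the old LDREQB
        movne   r8, #-1
        bx      lr

    // AArch64: lane references name only the element size, and immediates
    // must be encodable as written.
    .text
    dup_coeff:                  // hypothetical label
        dup     v24.8b, v2.b[0] // was v2.8b[0]; Clang rejects the long form
        sub     x10, x0, #2     // was add x10, x0, #-2 (ADD imm is unsigned)
        cmn     x9, #1          // was cmp x9, #-1; sets the same flags
        ret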
diff --git a/common/arm/ihevc_inter_pred_filters_luma_vert.s b/common/arm/ihevc_inter_pred_filters_luma_vert.s index 04942ae..f51d68c 100644 --- a/common/arm/ihevc_inter_pred_filters_luma_vert.s +++ b/common/arm/ihevc_inter_pred_filters_luma_vert.s @@ -105,7 +105,7 @@ @ r3 => wd .text .align 4 - +.syntax unified @@ -407,7 +407,7 @@ end_loops: ldr r1, [sp], #4 ldr r0, [sp], #4 - ldmeqfd sp!,{r4-r12,r15} @reload the registers from sp + ldmfdeq sp!,{r4-r12,r15} @reload the registers from sp mov r5, #4 add r0, r0, #8 add r1, r1, #8 @@ -848,7 +848,7 @@ end_loops_16out: ldr r1, [sp], #4 ldr r0, [sp], #4 - ldmeqfd sp!,{r4-r12,r15} @reload the registers from sp + ldmfdeq sp!,{r4-r12,r15} @reload the registers from sp mov r5, #4 add r0, r0, #8 add r1, r1, #16 diff --git a/common/arm/ihevc_inter_pred_luma_horz_w16out.s b/common/arm/ihevc_inter_pred_luma_horz_w16out.s index b27b2e8..e8800e0 100644 --- a/common/arm/ihevc_inter_pred_luma_horz_w16out.s +++ b/common/arm/ihevc_inter_pred_luma_horz_w16out.s @@ -109,7 +109,7 @@ @r14 - loop_counter .text .align 4 - +.syntax unified @@ -277,8 +277,8 @@ height_residue_4: ldr r7,[sp,#44] @loads ht and r7,r7,#1 @calculating ht_residue ht_residue = (ht & 1) cmp r7,#0 - @beq end_loops - ldmeqfd sp!,{r4-r12,r15} @reload the registers from sp + @beq end_loops + ldmfdeq sp!,{r4-r12,r15} @reload the registers from sp outer_loop_height_residue_4: diff --git a/common/arm/ihevc_sao_edge_offset_class2.s b/common/arm/ihevc_sao_edge_offset_class2.s index 33b4961..536f941 100644 --- a/common/arm/ihevc_sao_edge_offset_class2.s +++ b/common/arm/ihevc_sao_edge_offset_class2.s @@ -59,6 +59,7 @@ @r8=> ht .text +.syntax unified .p2align 2 .extern gi1_table_edge_idx @@ -214,7 +215,7 @@ WIDTH_LOOP_16: LDR r5,[sp,#0xC8] @Loads pu1_avail CMP r6,r7 @col == wd - LDREQB r8,[r5] @pu1_avail[0] + LDRBEQ r8,[r5] @pu1_avail[0] MOVNE r8,#-1 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0) VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0) @@ -526,7 +527,7 @@ WD_16_HT_4_LOOP: LDR r7,[sp,#0xD0] @Loads wd LDR r5,[sp,#0xC8] @Loads pu1_avail CMP r6,r7 @col == wd - LDREQB r8,[r5] @pu1_avail[0] + LDRBEQ r8,[r5] @pu1_avail[0] MOVNE r8,#-1 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0) VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0) @@ -658,7 +659,7 @@ WIDTH_RESIDUE: LDR r7,[sp,#0xD0] @Loads wd LDR r5,[sp,#0xC8] @Loads pu1_avail CMP r6,r7 @wd_residue == wd - LDREQB r8,[r5] @pu1_avail[0] + LDRBEQ r8,[r5] @pu1_avail[0] MOVNE r8,#-1 VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0) diff --git a/common/arm/ihevc_sao_edge_offset_class2_chroma.s b/common/arm/ihevc_sao_edge_offset_class2_chroma.s index c6fb391..b74a8f6 100644 --- a/common/arm/ihevc_sao_edge_offset_class2_chroma.s +++ b/common/arm/ihevc_sao_edge_offset_class2_chroma.s @@ -61,6 +61,7 @@ @r8=> ht .text +.syntax unified .p2align 2 .extern gi1_table_edge_idx @@ -289,7 +290,7 @@ ulbl5: add r2,r2,pc MOV r6,r7 @move wd to r6 loop_count - VMOV.S8 Q4,#0XFF @au1_mask = vdupq_n_s8(-1) + VMOV.S8 Q4,#0xFF @au1_mask = vdupq_n_s8(-1) CMP r7,#16 @Compare wd with 16 BLT WIDTH_RESIDUE @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case @@ -300,7 +301,7 @@ WIDTH_LOOP_16: LDR r5,[sp,#0x108] @Loads pu1_avail LDR r7,[sp,#0x114] @Loads wd CMP r6,r7 @col == wd - LDREQB r8,[r5] @pu1_avail[0] + LDRBEQ r8,[r5] @pu1_avail[0] MOVNE r8,#-1 VMOV.8 D8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0) @@ -684,7 +685,7 @@ WD_16_HT_4_LOOP: LDR r5,[sp,#0x108] @Loads pu1_avail LDR r7,[sp,#0x114] @Loads wd CMP r6,r7 @col 
== wd - LDREQB r8,[r5] @pu1_avail[0] + LDRBEQ r8,[r5] @pu1_avail[0] MOVNE r8,#-1 VMOV.8 D8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0) @@ -835,7 +836,7 @@ WIDTH_RESIDUE: LDR r7,[sp,#0x114] @Loads wd LDR r5,[sp,#0x108] @Loads pu1_avail CMP r6,r7 @wd_residue == wd - LDREQB r8,[r5] @pu1_avail[0] + LDRBEQ r8,[r5] @pu1_avail[0] MOVNE r8,#-1 VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0) diff --git a/common/arm/ihevc_sao_edge_offset_class3.s b/common/arm/ihevc_sao_edge_offset_class3.s index 268d4d8..de09d6c 100644 --- a/common/arm/ihevc_sao_edge_offset_class3.s +++ b/common/arm/ihevc_sao_edge_offset_class3.s @@ -59,6 +59,7 @@ @r8=> ht .text +.syntax unified .p2align 2 .extern gi1_table_edge_idx @@ -224,7 +225,7 @@ WIDTH_LOOP_16: LDR r5,[sp,#0xC8] @Loads pu1_avail CMP r6,r7 @col == wd - LDREQB r8,[r5] @pu1_avail[0] + LDRBEQ r8,[r5] @pu1_avail[0] MOVNE r8,#-1 VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0) @@ -557,7 +558,7 @@ WD_16_HT_4_LOOP: LDR r5,[sp,#0xC8] @Loads pu1_avail LDR r7,[sp,#0xD0] @Loads wd CMP r6,r7 @col == wd - LDREQB r8,[r5] @pu1_avail[0] + LDRBEQ r8,[r5] @pu1_avail[0] MOVNE r8,#-1 VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0) @@ -613,7 +614,7 @@ PU1_SRC_LOOP_WD_16_HT_4: CMP r5,#0 BEQ NEXT_ROW_ELSE_WD_16_HT_4 CMP r7,#1 - LDREQB r8,[r8,#-1] @pu1_src_cpy[src_strd - 1] + LDRBEQ r8,[r8,#-1] @pu1_src_cpy[src_strd - 1] BEQ NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4 NEXT_ROW_ELSE_WD_16_HT_4: SUB r5,r12,r7 @ht_tmp - row @@ -697,7 +698,7 @@ WIDTH_RESIDUE: LDR r7,[sp,#0xD0] @Loads wd LDR r5,[sp,#0xC8] @Loads pu1_avail CMP r6,r7 @wd_residue == wd - LDREQB r8,[r5] @pu1_avail[0] + LDRBEQ r8,[r5] @pu1_avail[0] MOVNE r8,#-1 VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0) @@ -751,7 +752,7 @@ PU1_SRC_LOOP_RESIDUE: CMP r5,#0 BEQ NEXT_ROW_ELSE_RESIDUE CMP r7,#1 - LDREQB r8,[r8,#-1] @pu1_src_cpy[src_strd - 1] + LDRBEQ r8,[r8,#-1] @pu1_src_cpy[src_strd - 1] BEQ NEXT_ROW_POINTER_ASSIGNED_RESIDUE NEXT_ROW_ELSE_RESIDUE: SUB r5,r12,r7 @ht_tmp - row diff --git a/common/arm/ihevc_sao_edge_offset_class3_chroma.s b/common/arm/ihevc_sao_edge_offset_class3_chroma.s index 2ecabe9..6561a8a 100644 --- a/common/arm/ihevc_sao_edge_offset_class3_chroma.s +++ b/common/arm/ihevc_sao_edge_offset_class3_chroma.s @@ -61,6 +61,7 @@ @r8=> ht .text +.syntax unified .p2align 2 .extern gi1_table_edge_idx @@ -294,7 +295,7 @@ WIDTH_LOOP_16: CMP r6,r7 @col == wd LDR r5,[sp,#0x108] @Loads pu1_avail - LDREQB r8,[r5] @pu1_avail[0] + LDRBEQ r8,[r5] @pu1_avail[0] MOVNE r8,#-1 VMOV.8 D8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0) @@ -688,7 +689,7 @@ WD_16_HT_4_LOOP: LDR r5,[sp,#0x108] @Loads pu1_avail CMP r6,r7 @col == wd - LDREQB r8,[r5] @pu1_avail[0] + LDRBEQ r8,[r5] @pu1_avail[0] MOVNE r8,#-1 VMOV.8 D8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0) @@ -858,7 +859,7 @@ WIDTH_RESIDUE: LDR r5,[sp,#0x108] @Loads pu1_avail CMP r6,r7 @wd_residue == wd - LDREQB r8,[r5] @pu1_avail[0] + LDRBEQ r8,[r5] @pu1_avail[0] MOVNE r8,#-1 LDRB r11,[r5,#1] @pu1_avail[1] diff --git a/common/arm/ihevc_weighted_pred_bi_default.s b/common/arm/ihevc_weighted_pred_bi_default.s index b560c15..6bdb8cc 100644 --- a/common/arm/ihevc_weighted_pred_bi_default.s +++ b/common/arm/ihevc_weighted_pred_bi_default.s @@ -108,6 +108,7 @@ @ r8 => ht @ r9 => wd .text +.syntax unified .align 4 @@ -437,7 +438,7 @@ core_loop_16: vqadd.s16 q13,q7,q8 addeq r1,r1,r7 - subeqs r8,r8,#2 @decrement the ht by 2 + subseq r8,r8,#2 @decrement the ht by 2 beq epilog_16 diff --git a/common/arm64/ihevc_deblk_luma_horz.s 
b/common/arm64/ihevc_deblk_luma_horz.s index f6989e9..db9e347 100644 --- a/common/arm64/ihevc_deblk_luma_horz.s +++ b/common/arm64/ihevc_deblk_luma_horz.s @@ -217,7 +217,7 @@ l1.1564: ldrb w3,[x0,#0] // x4 has the 0 value uqadd v16.8b, v27.8b , v1.8b and x2,x2,#0xff - mul v12.8h, v7.8h, v0.4h[0] + mul v12.8h, v7.8h, v0.h[0] ldr w8, [x0,x10] // has the 3 value uaddl v10.8h, v24.8b , v28.8b subs x2,x2,x7 @@ -259,7 +259,7 @@ l1.1564: ble l1.1840 add x10,x1,x1,lsl #1 - mul v16.8h, v16.8h, v0.4h[0] + mul v16.8h, v16.8h, v0.h[0] add x4,x0,#3 @@ -292,7 +292,7 @@ l1.1564: cmp x8,x5,asr #3 uqsub v31.8b, v25.8b , v1.8b bge l1.1840 - mul v12.8h, v7.8h, v0.4h[0] + mul v12.8h, v7.8h, v0.h[0] subs x7,x3,x7 uqadd v16.8b, v24.8b , v1.8b csneg x7,x7,x7,pl @@ -413,7 +413,7 @@ strong_filtering_q: strong_filtering_p: umax v5.8b, v18.8b , v17.8b mov x12,x0 - mul v7.8h, v7.8h, v0.4h[0] + mul v7.8h, v7.8h, v0.h[0] sub x20,x1,#0 neg x11, x20 add v16.8h, v7.8h , v14.8h @@ -465,12 +465,12 @@ l1.2408: usubl v10.8h, v26.8b , v25.8b - mul v10.8h, v10.8h, v0.4h[0] + mul v10.8h, v10.8h, v0.h[0] movi v0.4h, #0x3 usubl v12.8h, v27.8b , v24.8b - mul v12.8h, v12.8h, v0.4h[0] + mul v12.8h, v12.8h, v0.h[0] dup v30.8b,w6 // duplicating the +tc value diff --git a/common/arm64/ihevc_deblk_luma_vert.s b/common/arm64/ihevc_deblk_luma_vert.s index bc3cc6c..4379a69 100644 --- a/common/arm64/ihevc_deblk_luma_vert.s +++ b/common/arm64/ihevc_deblk_luma_vert.s @@ -146,17 +146,17 @@ l1.88: add x14,x0,x14 sub x19,x14,#3 - dup v4.2s, v24.2s[1] + dup v4.2s, v24.s[1] ldrb w2,[x19] // -2 value - dup v7.2s, v2.2s[1] + dup v7.2s, v2.s[1] ldrb w10,[x19,#1] // -2 value - dup v3.2s, v2.2s[0] + dup v3.2s, v2.s[0] ldrb w11,[x19,#2] // -1 value - dup v5.2s, v1.2s[1] + dup v5.2s, v1.s[1] ldrb w12,[x14,#0] // 0 value - dup v6.2s, v1.2s[0] + dup v6.2s, v1.s[0] ldrb w3,[x14,#1] // 1 value - dup v2.2s, v0.2s[0] + dup v2.2s, v0.s[0] ldrb w4,[x14,#2] // 2 value @@ -191,7 +191,7 @@ l1.88: cmp x11,x5 - dup v22.2s, v0.2s[1] + dup v22.2s, v0.s[1] bge l1.964 // if(d < beta) @@ -415,7 +415,7 @@ l1.780: // x4 has the flag p - dup v7.2s, v24.2s[0] + dup v7.2s, v24.s[0] sub x3,x0,#1 uaddw v16.8h, v0.8h , v6.8b add x7,x3,x1 diff --git a/common/arm64/ihevc_inter_pred_chroma_horz.s b/common/arm64/ihevc_inter_pred_chroma_horz.s index 513a362..425ac41 100644 --- a/common/arm64/ihevc_inter_pred_chroma_horz.s +++ b/common/arm64/ihevc_inter_pred_chroma_horz.s @@ -128,16 +128,16 @@ ihevc_inter_pred_chroma_horz_av8: mov x11,#2 ble end_loops - dup v24.8b, v2.8b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0) + dup v24.8b, v2.b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0) sub x12,x0,#2 //pu1_src - 2 - dup v25.8b, v2.8b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1) + dup v25.8b, v2.b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1) add x4,x12,x2 //pu1_src_tmp2_8 = pu1_src + src_strd - dup v26.8b, v2.8b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2) + dup v26.8b, v2.b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2) tst x10,#3 //checks wd for multiples lsl x5, x10, #1 - dup v27.8b, v2.8b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3) + dup v27.8b, v2.b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3) bne outer_loop_4 cmp x10,#12 diff --git a/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s b/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s index efc09f9..0f53c08 100644 --- a/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s +++ b/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s @@ -128,16 +128,16 @@ ihevc_inter_pred_chroma_horz_w16out_av8: ble end_loops - dup v24.8b, v2.8b[0] //coeffabs_0 = 
vdup_lane_u8(coeffabs, 0) + dup v24.8b, v2.b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0) sub x12,x0,#2 //pu1_src - 2 - dup v25.8b, v2.8b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1) + dup v25.8b, v2.b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1) add x4,x12,x2 //pu1_src_tmp2_8 = pu1_src + src_strd - dup v26.8b, v2.8b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2) + dup v26.8b, v2.b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2) tst x10,#3 //checks wd for multiples of 4 lsl x5, x10, #1 //2wd - dup v27.8b, v2.8b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3) + dup v27.8b, v2.b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3) and x7,x14,#1 //added //calculating ht_residue ht_residue = (ht & 1) sub x14,x14,x7 //added //decrement height by ht_residue(residue value is calculated outside) diff --git a/common/arm64/ihevc_inter_pred_chroma_vert.s b/common/arm64/ihevc_inter_pred_chroma_vert.s index 3d61f6c..dd1fba4 100644 --- a/common/arm64/ihevc_inter_pred_chroma_vert.s +++ b/common/arm64/ihevc_inter_pred_chroma_vert.s @@ -123,10 +123,10 @@ ihevc_inter_pred_chroma_vert_av8: tst x6,#3 //checks (wd & 3) abs v3.8b, v0.8b //vabs_s8(coeff) lsl x10,x6,#1 //2*wd - dup v0.8b, v3.8b[0] //coeffabs_0 - dup v1.8b, v3.8b[1] //coeffabs_1 - dup v2.8b, v3.8b[2] //coeffabs_2 - dup v3.8b, v3.8b[3] //coeffabs_3 + dup v0.8b, v3.b[0] //coeffabs_0 + dup v1.8b, v3.b[1] //coeffabs_1 + dup v2.8b, v3.b[2] //coeffabs_2 + dup v3.8b, v3.b[3] //coeffabs_3 bgt outer_loop_wd_2 //jumps to loop handling wd ==2 @@ -188,14 +188,14 @@ inner_loop_wd_2: subs x12,x12,#4 //2wd - 4 add x0,x0,#4 //pu1_src + 4 ld1 {v6.s}[1],[x6],x2 //loads pu1_src_tmp - dup v7.2s, v6.2s[1] + dup v7.2s, v6.s[1] ld1 {v7.s}[1],[x6],x2 //loads pu1_src_tmp umull v4.8h, v7.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1) - dup v7.2s, v7.2s[1] + dup v7.2s, v7.s[1] ld1 {v7.s}[1],[x6],x2 umlsl v4.8h, v6.8b, v0.8b umlal v4.8h, v7.8b, v2.8b - dup v7.2s, v7.2s[1] + dup v7.2s, v7.s[1] ld1 {v7.s}[1],[x6] add x6,x1,x3 //pu1_dst + dst_strd umlsl v4.8h, v7.8b, v3.8b diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s index e8f17cc..e6cc617 100644 --- a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s +++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s @@ -120,10 +120,10 @@ ihevc_inter_pred_chroma_vert_w16inp_av8: sxtl v0.8h, v0.8b //long the value tst x6,#3 //checks wd == 2 - dup v16.4h, v0.4h[0] //coeff_0 - dup v17.4h, v0.4h[1] //coeff_1 - dup v18.4h, v0.4h[2] //coeff_2 - dup v19.4h, v0.4h[3] //coeff_3 + dup v16.4h, v0.h[0] //coeff_0 + dup v17.4h, v0.h[1] //coeff_1 + dup v18.4h, v0.h[2] //coeff_2 + dup v19.4h, v0.h[3] //coeff_3 bgt core_loop_ht_2 //jumps to loop handles wd 2 diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s index 5aaabe6..022f166 100644 --- a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s +++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s @@ -120,10 +120,10 @@ ihevc_inter_pred_chroma_vert_w16inp_w16out_av8: sxtl v0.8h, v0.8b //long the value tst x6,#3 //checks wd == 2 - dup v16.4h, v0.4h[0] //coeff_0 - dup v17.4h, v0.4h[1] //coeff_1 - dup v18.4h, v0.4h[2] //coeff_2 - dup v19.4h, v0.4h[3] //coeff_3 + dup v16.4h, v0.h[0] //coeff_0 + dup v17.4h, v0.h[1] //coeff_1 + dup v18.4h, v0.h[2] //coeff_2 + dup v19.4h, v0.h[3] //coeff_3 bgt core_loop_ht_2 //jumps to loop handles wd 2 diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s 
index ec946eb..352214b 100644 --- a/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s +++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s @@ -125,10 +125,10 @@ ihevc_inter_pred_chroma_vert_w16out_av8: tst x6,#3 //checks (wd & 3) abs v3.8b, v0.8b //vabs_s8(coeff) lsl x10,x6,#1 //2*wd - dup v0.8b, v3.8b[0] //coeffabs_0 - dup v1.8b, v3.8b[1] //coeffabs_1 - dup v2.8b, v3.8b[2] //coeffabs_2 - dup v3.8b, v3.8b[3] //coeffabs_3 + dup v0.8b, v3.b[0] //coeffabs_0 + dup v1.8b, v3.b[1] //coeffabs_1 + dup v2.8b, v3.b[2] //coeffabs_2 + dup v3.8b, v3.b[3] //coeffabs_3 bgt outer_loop_wd_2 //jumps to loop handling wd ==2 @@ -189,14 +189,14 @@ inner_loop_wd_2: subs x12,x12,#4 //2wd - 4 add x0,x0,#4 //pu1_src + 4 ld1 {v6.s}[1],[x6],x2 //loads pu1_src_tmp - dup v7.2s, v6.2s[1] + dup v7.2s, v6.s[1] ld1 {v7.s}[1],[x6],x2 //loads pu1_src_tmp umull v4.8h, v7.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1) - dup v7.2s, v7.2s[1] + dup v7.2s, v7.s[1] ld1 {v7.s}[1],[x6],x2 umlsl v4.8h, v6.8b, v0.8b umlal v4.8h, v7.8b, v2.8b - dup v7.2s, v7.2s[1] + dup v7.2s, v7.s[1] ld1 {v7.s}[1],[x6] add x6,x1,x3,lsl #1 //pu1_dst + dst_strd umlsl v4.8h, v7.8b, v3.8b diff --git a/common/arm64/ihevc_inter_pred_filters_luma_horz.s b/common/arm64/ihevc_inter_pred_filters_luma_horz.s index 1e246da..d4830d6 100644 --- a/common/arm64/ihevc_inter_pred_filters_luma_horz.s +++ b/common/arm64/ihevc_inter_pred_filters_luma_horz.s @@ -141,22 +141,22 @@ start_loop_count: //ble end_loops - dup v24.8b, v2.8b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0) + dup v24.8b, v2.b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0) sub x12,x0,#3 //pu1_src - 3 - dup v25.8b, v2.8b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1) + dup v25.8b, v2.b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1) add x4,x12,x2 //pu1_src_tmp2_8 = pu1_src + src_strd - dup v26.8b, v2.8b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2) + dup v26.8b, v2.b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2) sub x20,x10,x2,lsl #1 //2*src_strd - wd neg x9, x20 - dup v27.8b, v2.8b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3) + dup v27.8b, v2.b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3) sub x20,x10,x3,lsl #1 //2*dst_strd - wd neg x8, x20 - dup v28.8b, v2.8b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4) + dup v28.8b, v2.b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4) - dup v29.8b, v2.8b[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5) + dup v29.8b, v2.b[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5) // tst x10,#7 //checks wd for multiples - dup v30.8b, v2.8b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6) - dup v31.8b, v2.8b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7) + dup v30.8b, v2.b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6) + dup v31.8b, v2.b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7) mov x7,x1 diff --git a/common/arm64/ihevc_inter_pred_filters_luma_vert.s b/common/arm64/ihevc_inter_pred_filters_luma_vert.s index bd8b3c4..f8b8031 100644 --- a/common/arm64/ihevc_inter_pred_filters_luma_vert.s +++ b/common/arm64/ihevc_inter_pred_filters_luma_vert.s @@ -132,15 +132,15 @@ ihevc_inter_pred_luma_vert_av8: mov x3,x16 //load ht subs x7,x3,#0 //x3->ht //ble end_loops //end loop jump - dup v22.8b, v0.8b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)// + dup v22.8b, v0.b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)// cmp x5,#8 - dup v23.8b, v0.8b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)// - dup v24.8b, v0.8b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)// - dup v25.8b, v0.8b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)// - dup v26.8b, v0.8b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)// - dup v27.8b, v0.8b[5] //coeffabs_5 
= vdup_lane_u8(coeffabs, 5)// - dup v28.8b, v0.8b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)// - dup v29.8b, v0.8b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)// + dup v23.8b, v0.b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)// + dup v24.8b, v0.b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)// + dup v25.8b, v0.b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)// + dup v26.8b, v0.b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)// + dup v27.8b, v0.b[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)// + dup v28.8b, v0.b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)// + dup v29.8b, v0.b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)// blt core_loop_wd_4 //core loop wd 4 jump stp x0,x1, [sp, #-16]! @@ -451,49 +451,49 @@ inner_loop_wd_4: add x3,x0,x2 ld1 {v4.s}[1],[x3],x2 //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)// subs x12,x12,#4 - dup v5.2s, v4.2s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)// + dup v5.2s, v4.s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)// ld1 {v5.s}[1],[x3],x2 //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)// ld1 {v4.s}[0],[x0] //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)// umull v0.8h, v5.8b, v23.8b //mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)// - dup v6.2s, v5.2s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)// + dup v6.2s, v5.s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)// add x0,x0,#4 ld1 {v6.s}[1],[x3],x2 //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)// umlsl v0.8h, v4.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)// - dup v7.2s, v6.2s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)// + dup v7.2s, v6.s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)// ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)// umlsl v0.8h, v6.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)// umull v19.8h, v7.8b, v23.8b - dup v4.2s, v7.2s[1] //src_tmp1 = vdup_lane_u32(src_tmp4, 1)// + dup v4.2s, v7.s[1] //src_tmp1 = vdup_lane_u32(src_tmp4, 1)// umull v2.8h, v7.8b, v25.8b //mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)// ld1 {v4.s}[1],[x3],x2 //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)// umlsl v19.8h, v6.8b, v22.8b umlal v0.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)// - dup v5.2s, v4.2s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)// + dup v5.2s, v4.s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)// umlsl v19.8h, v4.8b, v24.8b ld1 {v5.s}[1],[x3],x2 //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)// umlsl v2.8h, v5.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)// - dup v6.2s, v5.2s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)// + dup v6.2s, v5.s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)// umlal v19.8h, v5.8b, v25.8b ld1 {v6.s}[1],[x3],x2 //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)// umlal v0.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)// - dup v7.2s, v6.2s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)// + dup v7.2s, v6.s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)// umlal v19.8h, v6.8b, v26.8b ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)// umlsl v2.8h, v7.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)// - dup v4.2s, v7.2s[1] + dup v4.2s, v7.s[1] add v0.8h, v0.8h , v2.8h //mul_res1 = vaddq_u16(mul_res1, mul_res2)// umlsl v19.8h, v7.8b, v27.8b ld1 
{v4.s}[1],[x3],x2 umlal v19.8h, v4.8b, v28.8b - dup v5.2s, v4.2s[1] + dup v5.2s, v4.s[1] sqrshrun v0.8b, v0.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)// ld1 {v5.s}[1],[x3] diff --git a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s index cd8addf..1c3807e 100644 --- a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s +++ b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s @@ -125,14 +125,14 @@ ihevc_inter_pred_luma_vert_w16inp_av8: subs x7,x3,#0 //x3->ht //ble end_loops //end loop jump sxtl v0.8h, v0.8b - dup v22.4h, v0.4h[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)// - dup v23.4h, v0.4h[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)// - dup v24.4h, v0.4h[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)// - dup v25.4h, v0.4h[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)// - dup v26.4h, v0.4h[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)// - dup v27.4h, v0.4h[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)// - dup v28.4h, v0.4h[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)// - dup v29.4h, v0.4h[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)// + dup v22.4h, v0.h[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)// + dup v23.4h, v0.h[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)// + dup v24.4h, v0.h[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)// + dup v25.4h, v0.h[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)// + dup v26.4h, v0.h[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)// + dup v27.4h, v0.h[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)// + dup v28.4h, v0.h[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)// + dup v29.4h, v0.h[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)// sub x20,x5,x6,lsl #2 //x6->dst_strd x5 ->wd neg x9, x20 diff --git a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s index ca48db5..79a1a9d 100644 --- a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s +++ b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s @@ -87,15 +87,15 @@ ihevc_inter_pred_luma_vert_w16out_av8: mov x3,x16 //load ht subs x7,x3,#0 //x3->ht //ble end_loops_16out //end loop jump - dup v22.8b, v0.8b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)// + dup v22.8b, v0.b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)// cmp x5,#8 - dup v23.8b, v0.8b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)// - dup v24.8b, v0.8b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)// - dup v25.8b, v0.8b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)// - dup v26.8b, v0.8b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)// - dup v27.8b, v0.8b[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)// - dup v28.8b, v0.8b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)// - dup v29.8b, v0.8b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)// + dup v23.8b, v0.b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)// + dup v24.8b, v0.b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)// + dup v25.8b, v0.b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)// + dup v26.8b, v0.b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)// + dup v27.8b, v0.b[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)// + dup v28.8b, v0.b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)// + dup v29.8b, v0.b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)// blt core_loop_wd_4_16out //core loop wd 4 jump stp x0,x1, [sp, #-16]! 
@@ -404,49 +404,49 @@ inner_loop_wd_4_16out: add x3,x0,x2 ld1 {v4.s}[1],[x3],x2 //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)// subs x12,x12,#4 - dup v5.2s, v4.2s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)// + dup v5.2s, v4.s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)// ld1 {v5.s}[1],[x3],x2 //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)// ld1 {v4.s}[0],[x0] //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)// umull v0.8h, v5.8b, v23.8b //mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)// - dup v6.2s, v5.2s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)// + dup v6.2s, v5.s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)// add x0,x0,#4 ld1 {v6.s}[1],[x3],x2 //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)// umlsl v0.8h, v4.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)// - dup v7.2s, v6.2s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)// + dup v7.2s, v6.s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)// ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)// umlsl v0.8h, v6.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)// umull v19.8h, v7.8b, v23.8b - dup v4.2s, v7.2s[1] //src_tmp1 = vdup_lane_u32(src_tmp4, 1)// + dup v4.2s, v7.s[1] //src_tmp1 = vdup_lane_u32(src_tmp4, 1)// umull v2.8h, v7.8b, v25.8b //mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)// ld1 {v4.s}[1],[x3],x2 //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)// umlsl v19.8h, v6.8b, v22.8b umlal v0.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)// - dup v5.2s, v4.2s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)// + dup v5.2s, v4.s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)// umlsl v19.8h, v4.8b, v24.8b ld1 {v5.s}[1],[x3],x2 //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)// umlsl v2.8h, v5.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)// - dup v6.2s, v5.2s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)// + dup v6.2s, v5.s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)// umlal v19.8h, v5.8b, v25.8b ld1 {v6.s}[1],[x3],x2 //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)// umlal v0.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)// - dup v7.2s, v6.2s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)// + dup v7.2s, v6.s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)// umlal v19.8h, v6.8b, v26.8b ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)// umlsl v2.8h, v7.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)// - dup v4.2s, v7.2s[1] + dup v4.2s, v7.s[1] add v0.8h, v0.8h , v2.8h //mul_res1 = vaddq_u16(mul_res1, mul_res2)// umlsl v19.8h, v7.8b, v27.8b ld1 {v4.s}[1],[x3],x2 umlal v19.8h, v4.8b, v28.8b - dup v5.2s, v4.2s[1] + dup v5.2s, v4.s[1] //vqrshrun.s16 d0,q0,#6 //sto_res = vqmovun_s16(sto_res_tmp)// ld1 {v5.s}[1],[x3] diff --git a/common/arm64/ihevc_inter_pred_luma_horz_w16out.s b/common/arm64/ihevc_inter_pred_luma_horz_w16out.s index f7b6644..b39059b 100644 --- a/common/arm64/ihevc_inter_pred_luma_horz_w16out.s +++ b/common/arm64/ihevc_inter_pred_luma_horz_w16out.s @@ -133,23 +133,23 @@ ihevc_inter_pred_luma_horz_w16out_av8: mov x15,#1 //ble end_loops mov x14,x6 //loads wd - dup v24.8b, v2.8b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0) + dup v24.8b, v2.b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0) sub x16,x0,#3 
//pu1_src - 3 - dup v25.8b, v2.8b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1) + dup v25.8b, v2.b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1) add x8,x16,x2 //pu1_src_tmp2_8 = pu1_src + src_strd - dup v26.8b, v2.8b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2) + dup v26.8b, v2.b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2) sub x20,x14,x2,lsl #1 //2*src_strd - wd neg x13, x20 - dup v27.8b, v2.8b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3) + dup v27.8b, v2.b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3) sub x20,x14,x3 //dst_strd - wd neg x12, x20 - dup v28.8b, v2.8b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4) + dup v28.8b, v2.b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4) - dup v29.8b, v2.8b[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5) + dup v29.8b, v2.b[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5) and x11,x19,#1 //calculating ht_residue ht_residue = (ht & 1) - dup v30.8b, v2.8b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6) + dup v30.8b, v2.b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6) sub x19,x19,x11 //decrement height by ht_residue(residue value is calculated outside) - dup v31.8b, v2.8b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7) + dup v31.8b, v2.b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7) cmp x11,#1 beq odd_height_decision diff --git a/common/arm64/ihevc_intra_pred_chroma_dc.s b/common/arm64/ihevc_intra_pred_chroma_dc.s index 2fdee98..a6969dd 100644 --- a/common/arm64/ihevc_intra_pred_chroma_dc.s +++ b/common/arm64/ihevc_intra_pred_chroma_dc.s @@ -180,8 +180,8 @@ core_loop_add: epil_add_loop: - smov x1, v18.2s[0] - smov x11, v17.2s[0] + smov x1, v18.s[0] + smov x11, v17.s[0] add x1,x1,x4 add x11,x11,x4 @@ -204,7 +204,7 @@ prologue_cpy_32: beq epilogue_copy st2 {v16.8b, v17.8b}, [x2],#16 - add x6, x6, #-16 + sub x6, x6, #16 st2 {v16.8b, v17.8b}, [x5],#16 st2 {v16.8b, v17.8b}, [x8],#16 @@ -274,8 +274,8 @@ dc_4: uadalp v17.1d, v3.2s uadalp v18.1d, v2.2s - smov x10, v17.2s[0] - smov x11, v18.2s[0] + smov x10, v17.s[0] + smov x11, v18.s[0] add x10,x10,x4 add x11,x11,x4 diff --git a/common/arm64/ihevc_intra_pred_chroma_horz.s b/common/arm64/ihevc_intra_pred_chroma_horz.s index 8de655c..d2f3102 100644 --- a/common/arm64/ihevc_intra_pred_chroma_horz.s +++ b/common/arm64/ihevc_intra_pred_chroma_horz.s @@ -119,63 +119,63 @@ core_loop_16: sub x12,x12,#16 ld1 { v18.8h},[x12] //load 16 values. d1[7] will have the 1st value. - dup v2.8h, v0.4h[7] //duplicate the i value. + dup v2.8h, v0.h[7] //duplicate the i value. - dup v4.8h, v0.4h[6] //duplicate the ii value. - dup v6.8h, v0.4h[5] //duplicate the iii value. + dup v4.8h, v0.h[6] //duplicate the ii value. + dup v6.8h, v0.h[5] //duplicate the iii value. 
st1 { v2.8h},[x2],x3 //store in 1st row 0-16 columns st1 { v2.8h},[x9],x3 //store in 1st row 16-32 columns - dup v1.8h, v0.4h[4] + dup v1.8h, v0.h[4] st1 { v4.8h},[x2],x3 st1 { v4.8h},[x9],x3 - dup v2.8h, v0.4h[3] + dup v2.8h, v0.h[3] st1 { v6.8h},[x2],x3 st1 { v6.8h},[x9],x3 - dup v4.8h, v0.4h[2] + dup v4.8h, v0.h[2] st1 { v1.8h},[x2],x3 st1 { v1.8h},[x9],x3 - dup v6.8h, v0.4h[1] + dup v6.8h, v0.h[1] st1 { v2.8h},[x2],x3 st1 { v2.8h},[x9],x3 - dup v1.8h, v0.4h[0] + dup v1.8h, v0.h[0] st1 { v4.8h},[x2],x3 st1 { v4.8h},[x9],x3 - dup v2.8h, v18.4h[7] + dup v2.8h, v18.h[7] st1 { v6.8h},[x2],x3 st1 { v6.8h},[x9],x3 - dup v4.8h, v18.4h[6] + dup v4.8h, v18.h[6] st1 { v1.8h},[x2],x3 st1 { v1.8h},[x9],x3 - dup v6.8h, v18.4h[5] + dup v6.8h, v18.h[5] st1 { v2.8h},[x2],x3 st1 { v2.8h},[x9],x3 - dup v1.8h, v18.4h[4] + dup v1.8h, v18.h[4] st1 { v4.8h},[x2],x3 st1 { v4.8h},[x9],x3 - dup v2.8h, v18.4h[3] + dup v2.8h, v18.h[3] st1 { v6.8h},[x2],x3 st1 { v6.8h},[x9],x3 - dup v4.8h, v18.4h[2] + dup v4.8h, v18.h[2] st1 { v1.8h},[x2],x3 st1 { v1.8h},[x9],x3 - dup v6.8h, v18.4h[1] + dup v6.8h, v18.h[1] st1 { v2.8h},[x2],x3 st1 { v2.8h},[x9],x3 sub x12,x12,#16 //move to 16th value pointer - dup v1.8h, v18.4h[0] + dup v1.8h, v18.h[0] st1 { v4.8h},[x2],x3 st1 { v4.8h},[x9],x3 @@ -203,33 +203,33 @@ core_loop_8: sub x12,x12,#16 // ld1 { v30.16b},[x12] - dup v18.8h, v0.4h[7] + dup v18.8h, v0.h[7] //vmovl.u8 q13,d26 - dup v2.8h, v0.4h[6] + dup v2.8h, v0.h[6] //vsubl.u8 q12,d30,d28 - dup v4.8h, v0.4h[5] + dup v4.8h, v0.h[5] //vshr.s16 q12,q12,#1 - dup v6.8h, v0.4h[4] + dup v6.8h, v0.h[4] //vqadd.s16 q11,q13,q12 - dup v1.8h, v0.4h[3] + dup v1.8h, v0.h[3] //vqmovun.s16 d22,q11 st1 { v18.8h},[x2],x3 - dup v18.8h, v0.4h[2] + dup v18.8h, v0.h[2] //vsubl.u8 q12,d31,d28 - dup v19.8h, v0.4h[1] + dup v19.8h, v0.h[1] //vshr.s16 q12,q12,#1 - dup v20.8h, v0.4h[0] + dup v20.8h, v0.h[0] //vqadd.s16 q11,q13,q12 - dup v16.8h, v0.4h[3] + dup v16.8h, v0.h[3] //vqmovun.s16 d22,q11 st1 { v2.8h},[x2],x3 @@ -284,32 +284,32 @@ core_loop_4: ld1 {v0.8b},[x12] sub x12,x12,#8 ld1 {v30.8b},[x12] - dup v26.4h, v0.4h[3] + dup v26.4h, v0.h[3] dup v28.8b,w14 - dup v3.4h, v0.4h[2] + dup v3.4h, v0.h[2] uxtl v26.8h, v26.8b - dup v4.4h, v0.4h[1] + dup v4.4h, v0.h[1] usubl v24.8h, v30.8b, v28.8b - dup v5.4h, v0.4h[0] + dup v5.4h, v0.h[0] sshr v24.8h, v24.8h,#1 - dup v6.4h, v0.4h[3] + dup v6.4h, v0.h[3] sqadd v22.8h, v26.8h , v24.8h - dup v7.4h, v0.4h[2] + dup v7.4h, v0.h[2] sqxtun v22.8b, v22.8h st1 {v6.8b},[x2],x3 st1 {v3.8b},[x2],x3 - dup v1.4h, v0.4h[1] + dup v1.4h, v0.h[1] st1 {v4.8b},[x2],x3 st1 {v5.8b},[x2],x3 - dup v17.4h, v0.4h[0] + dup v17.4h, v0.h[0] //vst1.8 {d6},[x2],x3 //vst1.8 {d7},[x2],x3 @@ -331,16 +331,16 @@ core_loop_4: sub x12,x12,#5 ld1 {v0.8b},[x12] dup v28.8b,w14 - dup v26.8b, v0.8b[3] + dup v26.8b, v0.b[3] uxtl v26.8h, v26.8b - dup v3.8b, v0.8b[2] + dup v3.8b, v0.b[2] usubl v24.8h, v30.8b, v28.8b - dup v4.8b, v0.8b[1] + dup v4.8b, v0.b[1] sshr v24.8h, v24.8h,#1 - dup v5.8b, v0.8b[0] + dup v5.8b, v0.b[0] sqadd v22.8h, v26.8h , v24.8h sqxtun v22.8b, v22.8h diff --git a/common/arm64/ihevc_intra_pred_chroma_mode2.s b/common/arm64/ihevc_intra_pred_chroma_mode2.s index d2c0730..aec3da4 100644 --- a/common/arm64/ihevc_intra_pred_chroma_mode2.s +++ b/common/arm64/ihevc_intra_pred_chroma_mode2.s @@ -116,7 +116,7 @@ ihevc_intra_pred_chroma_mode2_av8: add x0,x0,x4,lsl #2 sub x0,x0,#0x12 //src[1] - add x10,x0,#-2 + sub x10,x0,#2 prologue_cpy_32: @@ -223,7 +223,7 @@ kernel_mode2: rev64 v23.8b, v7.8b rev64 v24.8b, v8.8b - add x10,x0,#-2 + sub 
x10,x0,#2 rev64 v25.8b, v9.8b rev64 v26.8b, v10.8b diff --git a/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s b/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s index b22d182..3230136 100644 --- a/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s +++ b/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s @@ -142,13 +142,13 @@ prologue: xtn v4.8b, v4.8h shrn v5.8b, v2.8h,#5 //idx = pos >> 5 - dup v31.8b, v4.8b[0] + dup v31.8b, v4.b[0] add x0,x2,x3 - smov x14, v5.2s[0] //(i row)extract idx to the r register + smov x14, v5.s[0] //(i row)extract idx to the r register lsl x14,x14,#1 - dup v29.8b, v4.8b[1] //(ii) + dup v29.8b, v4.b[1] //(ii) and x9,x14,#0xff //(i row) get the last byte add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx] @@ -172,11 +172,11 @@ prologue: umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract) asr x14,x14,#8 //(iv) - dup v27.8b, v4.8b[2] //(iii) + dup v27.8b, v4.b[2] //(iii) sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract) and x9,x14,#0xff //(iv) - dup v25.8b, v4.8b[3] //(iv) + dup v25.8b, v4.b[3] //(iv) umull v14.8h, v12.8b, v28.8b //(ii)vmull_u8(ref_main_idx, dup_const_32_fract) add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx] @@ -191,10 +191,10 @@ prologue: ld1 {v21.8b},[x12] //(iv)ref_main_idx_1 - dup v31.8b, v4.8b[4] //(v) + dup v31.8b, v4.b[4] //(v) umull v18.8h, v16.8b, v26.8b //(iii)vmull_u8(ref_main_idx, dup_const_32_fract) - smov x14, v5.2s[1] //extract idx to the r register + smov x14, v5.s[1] //extract idx to the r register umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract) lsl x14,x14,#1 @@ -202,7 +202,7 @@ prologue: rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5) and x9,x14,#0xff //(v) - dup v29.8b, v4.8b[5] //(vi) + dup v29.8b, v4.b[5] //(vi) add x10,x8,x9 //(v)*pu1_ref[ref_main_idx] ld1 {v23.8b},[x10],x11 //(v)ref_main_idx @@ -219,7 +219,7 @@ prologue: rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5) add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx] - dup v27.8b, v4.8b[6] //(vii) + dup v27.8b, v4.b[6] //(vii) asr x14,x14,#8 //(vii) and x9,x14,#0xff //(vii) @@ -236,7 +236,7 @@ prologue: rshrn v22.8b, v22.8h,#5 //(iv)shift_res = vrshrn_n_u16(add_res, 5) asr x14,x14,#8 //(viii) - dup v25.8b, v4.8b[7] //(viii) + dup v25.8b, v4.b[7] //(viii) and x9,x14,#0xff //(viii) ld1 {v16.8b},[x10],x11 //(vii)ref_main_idx @@ -274,14 +274,14 @@ prologue: and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31)) xtn v4.8b, v4.8h shrn v3.8b, v2.8h,#5 //idx = pos >> 5 - smov x14, v3.2s[0] //(i)extract idx to the r register + smov x14, v3.s[0] //(i)extract idx to the r register lsl x14,x14,#1 and x9,x14,#0xff //(i) add x10,x8,x9 //(i)*pu1_ref[ref_main_idx] kernel_8_rows: asr x14,x14,#8 //(ii) - dup v31.8b, v4.8b[0] + dup v31.8b, v4.b[0] subs x4,x4,#8 ld1 {v23.8b},[x10],x11 //(i)ref_main_idx @@ -298,7 +298,7 @@ kernel_8_rows: umlal v22.8h, v21.8b, v25.8b //(viii)vmull_u8(ref_main_idx_1, dup_const_fract) asr x14,x14,#8 //(iii) - dup v29.8b, v4.8b[1] //(ii) + dup v29.8b, v4.b[1] //(ii) rshrn v18.8b, v18.8h,#5 //(vii)shift_res = vrshrn_n_u16(add_res, 5) and x9,x14,#0xff //(iii) @@ -314,10 +314,10 @@ kernel_8_rows: umlal v10.8h, v9.8b, v31.8b //(i)vmull_u8(ref_main_idx_1, dup_const_fract) and x9,x14,#0xff //(iv) - smov x14, v3.2s[1] //extract idx to the r register + smov x14, v3.s[1] //extract idx to the r register rshrn v22.8b, v22.8h,#5 //(viii)shift_res = vrshrn_n_u16(add_res, 5) - dup v27.8b, v4.8b[2] //(iii) + dup v27.8b, v4.b[2] //(iii) sub v28.8b, v1.8b , v29.8b 
//(ii)32-fract(dup_const_32_fract) csel x4, x5, x4,le //reload nt @@ -331,7 +331,7 @@ kernel_8_rows: ld1 {v17.8b},[x10] //(iii)ref_main_idx_1 rshrn v10.8b, v10.8h,#5 //(i)shift_res = vrshrn_n_u16(add_res, 5) - dup v25.8b, v4.8b[3] //(iv) + dup v25.8b, v4.b[3] //(iv) umull v2.8h, v5.8b, v0.8b //pos = ((row + 1) * intra_pred_ang) st1 {v22.8b},[x0] //(viii) @@ -345,7 +345,7 @@ kernel_8_rows: umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract) add x0,x2,x3 - dup v31.8b, v4.8b[4] //(v) + dup v31.8b, v4.b[4] //(v) rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5) and x9,x14,#0xff //(v) @@ -353,15 +353,15 @@ kernel_8_rows: sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract) add x10,x8,x9 //(v)*pu1_ref[ref_main_idx] - dup v29.8b, v4.8b[5] //(vi) + dup v29.8b, v4.b[5] //(vi) umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract) asr x14,x14,#8 //(vi) - dup v27.8b, v4.8b[6] //(vii) + dup v27.8b, v4.b[6] //(vii) umlal v22.8h, v21.8b, v25.8b //(iv)vmull_u8(ref_main_idx_1, dup_const_fract) and x9,x14,#0xff //(vi) - dup v25.8b, v4.8b[7] //(viii) + dup v25.8b, v4.b[7] //(viii) rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5) add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx] @@ -385,7 +385,7 @@ kernel_8_rows: umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) and x9,x14,#0xff //(viii) - smov x14, v3.2s[0] //(i)extract idx to the r register + smov x14, v3.s[0] //(i)extract idx to the r register umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract) add x12,x8,x9 //(viii)*pu1_ref[ref_main_idx] diff --git a/common/arm64/ihevc_intra_pred_chroma_ver.s b/common/arm64/ihevc_intra_pred_chroma_ver.s index 8d1daf7..451cae9 100644 --- a/common/arm64/ihevc_intra_pred_chroma_ver.s +++ b/common/arm64/ihevc_intra_pred_chroma_ver.s @@ -121,7 +121,7 @@ copy_16: ld2 {v22.8b, v23.8b}, [x6] //16 loads (col 16:31) lsl x11, x3, #2 - add x11, x11, #-16 + sub x11, x11, #16 st2 {v20.8b, v21.8b}, [x2],#16 diff --git a/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s b/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s index 5d65e63..bfb92bc 100644 --- a/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s +++ b/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s @@ -207,7 +207,7 @@ end_loop_copy: strh w11, [x6], #2 sxtw x11,w11 - cmp x9, #-1 + cmn x9, #1 bge prologue_8_16_32 add x6, sp, x4, lsl #1 //ref_temp + 2 * nt diff --git a/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s b/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s index 261c591..c7feebd 100644 --- a/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s +++ b/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s @@ -176,7 +176,7 @@ end_loop_copy: strh w11, [x6] sxtw x11,w11 - cmp x9, #-1 + cmn x9, #1 bge linear_filtering add x6, sp, x4 ,lsl #1 //ref_temp + 2 * nt @@ -256,13 +256,13 @@ prologue: shrn v5.8b, v2.8h,#5 //idx = pos >> 5 shl v5.8b, v5.8b,#1 - dup v31.8b, v4.8b[0] + dup v31.8b, v4.b[0] add x0,x2,x3 - smov x14, v5.2s[0] //(i row)extract idx to the r register + smov x14, v5.s[0] //(i row)extract idx to the r register // lsl x14,x14,#1 - dup v29.8b, v4.8b[1] //(ii) + dup v29.8b, v4.b[1] //(ii) sbfx x9,x14,#0,#8 add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx] @@ -283,11 +283,11 @@ prologue: ld1 {v13.8b},[x12] //(ii)ref_main_idx_1 umlal v23.8h, v19.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract) - dup v27.8b, v4.8b[2] //(iii) + dup v27.8b, 
v4.b[2] //(iii) sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract) sbfx x9,x14,#24,#8 - dup v25.8b, v4.8b[3] //(iv) + dup v25.8b, v4.b[3] //(iv) umull v14.8h, v12.8b, v28.8b //(ii)vmull_u8(ref_main_idx, dup_const_32_fract) add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx] @@ -302,10 +302,10 @@ prologue: ld1 {v21.8b},[x12] //(iv)ref_main_idx_1 - dup v31.8b, v4.8b[4] //(v) + dup v31.8b, v4.b[4] //(v) umull v18.8h, v16.8b, v26.8b //(iii)vmull_u8(ref_main_idx, dup_const_32_fract) - smov x14, v5.2s[1] //extract idx to the r register + smov x14, v5.s[1] //extract idx to the r register umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract) // lsl x14,x14,#1 @@ -313,7 +313,7 @@ prologue: rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5) sbfx x9,x14,#0,#8 - dup v29.8b, v4.8b[5] //(vi) + dup v29.8b, v4.b[5] //(vi) add x10,x8,x9 //(v)*pu1_ref[ref_main_idx] ld1 {v7.8b},[x10],x11 //(v)ref_main_idx @@ -329,7 +329,7 @@ prologue: rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5) add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx] - dup v27.8b, v4.8b[6] //(vii) + dup v27.8b, v4.b[6] //(vii) sbfx x9,x14,#16,#8 sub v30.8b, v1.8b , v31.8b //(v)32-fract(dup_const_32_fract) @@ -344,7 +344,7 @@ prologue: st1 {v18.8b},[x0],x3 //(iii) rshrn v22.8b, v22.8h,#5 //(iv)shift_res = vrshrn_n_u16(add_res, 5) - dup v25.8b, v4.8b[7] //(viii) + dup v25.8b, v4.b[7] //(viii) sbfx x9,x14,#24,#8 ld1 {v16.8b},[x10],x11 //(vii)ref_main_idx @@ -386,13 +386,13 @@ prologue: xtn v4.8b, v4.8h shrn v3.8b, v2.8h,#5 //idx = pos >> 5 shl v3.8b, v3.8b,#1 - smov x14, v3.2s[0] //(i)extract idx to the r register + smov x14, v3.s[0] //(i)extract idx to the r register // lsl x14,x14,#1 sbfx x9,x14,#0,#8 add x10,x8,x9 //(i)*pu1_ref[ref_main_idx] kernel_8_rows: - dup v31.8b, v4.8b[0] + dup v31.8b, v4.b[0] subs x4,x4,#8 sbfx x9,x14,#8,#8 @@ -409,7 +409,7 @@ kernel_8_rows: ld1 {v5.8b},[x6] //loads the row value umlal v22.8h, v21.8b, v25.8b //(viii)vmull_u8(ref_main_idx_1, dup_const_fract) - dup v29.8b, v4.8b[1] //(ii) + dup v29.8b, v4.b[1] //(ii) rshrn v18.8b, v18.8h,#5 //(vii)shift_res = vrshrn_n_u16(add_res, 5) sbfx x9,x14,#16,#8 @@ -428,10 +428,10 @@ kernel_8_rows: sbfx x9,x14,#24,#8 csel x4, x5, x4,le //reload nt - smov x14, v3.2s[1] //extract idx to the r register + smov x14, v3.s[1] //extract idx to the r register rshrn v22.8b, v22.8h,#5 //(viii)shift_res = vrshrn_n_u16(add_res, 5) - dup v27.8b, v4.8b[2] //(iii) + dup v27.8b, v4.b[2] //(iii) sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract) add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx] @@ -444,7 +444,7 @@ kernel_8_rows: ld1 {v17.8b},[x10] //(iii)ref_main_idx_1 rshrn v23.8b, v23.8h,#5 //(i)shift_res = vrshrn_n_u16(add_res, 5) - dup v25.8b, v4.8b[3] //(iv) + dup v25.8b, v4.b[3] //(iv) smull v2.8h, v5.8b, v0.8b //pos = ((row + 1) * intra_pred_ang) st1 {v22.8b},[x0] //(viii) @@ -460,7 +460,7 @@ kernel_8_rows: sbfx x9,x14,#0,#8 add x0,x2,x3 - dup v31.8b, v4.8b[4] //(v) + dup v31.8b, v4.b[4] //(v) rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5) add x10,x8,x9 //(v)*pu1_ref[ref_main_idx] @@ -469,16 +469,16 @@ kernel_8_rows: st1 {v23.8b},[x2],#8 //(i) sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract) - dup v29.8b, v4.8b[5] //(vi) + dup v29.8b, v4.b[5] //(vi) umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract) - dup v27.8b, v4.8b[6] //(vii) + dup v27.8b, v4.b[6] //(vii) umlal v22.8h, v21.8b, v25.8b //(iv)vmull_u8(ref_main_idx_1, dup_const_fract) add x12,x8,x9 
//(vi)*pu1_ref[ref_main_idx] sbfx x9,x14,#16,#8 - dup v25.8b, v4.8b[7] //(viii) + dup v25.8b, v4.b[7] //(viii) rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5) ld1 {v7.8b},[x10],x11 //(v)ref_main_idx @@ -501,7 +501,7 @@ kernel_8_rows: ld1 {v13.8b},[x12] //(vi)ref_main_idx_1 umull v23.8h, v7.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) - smov x14, v3.2s[0] //(i)extract idx to the r register + smov x14, v3.s[0] //(i)extract idx to the r register umlal v23.8h, v19.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract) add x12,x8,x9 //(viii)*pu1_ref[ref_main_idx] diff --git a/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s b/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s index 66f4699..dcc0fc7 100644 --- a/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s +++ b/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s @@ -214,7 +214,7 @@ end_loop_copy: strb w11, [x6], #1 sxtw x11,w11 - cmp x9, #-1 + cmn x9, #1 bge prologue_8_16_32 add x6, sp, x4 //ref_temp + nt diff --git a/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s b/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s index 9b59d58..322e4c7 100644 --- a/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s +++ b/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s @@ -183,7 +183,7 @@ end_loop_copy: strb w11, [x6] sxtw x11,w11 - cmp x9, #-1 + cmn x9, #1 bge linear_filtering add x6, sp, x4 //ref_temp + nt @@ -259,13 +259,13 @@ prologue: xtn v4.8b, v4.8h shrn v5.8b, v2.8h,#5 //idx = pos >> 5 - dup v31.8b, v4.8b[0] + dup v31.8b, v4.b[0] add x0,x2,x3 - umov w14, v5.2s[0] //(i row)extract idx to the r register + umov w14, v5.s[0] //(i row)extract idx to the r register sxtw x14,w14 - dup v29.8b, v4.8b[1] //(ii) + dup v29.8b, v4.b[1] //(ii) sbfx x9,x14,#0,#8 add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx] @@ -286,11 +286,11 @@ prologue: ld1 {v13.8b},[x12] //(ii)ref_main_idx_1 umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract) - dup v27.8b, v4.8b[2] //(iii) + dup v27.8b, v4.b[2] //(iii) sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract) sbfx x9,x14,#24,#8 - dup v25.8b, v4.8b[3] //(iv) + dup v25.8b, v4.b[3] //(iv) umull v14.8h, v12.8b, v28.8b //(ii)vmull_u8(ref_main_idx, dup_const_32_fract) add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx] @@ -305,10 +305,10 @@ prologue: ld1 {v21.8b},[x12] //(iv)ref_main_idx_1 - dup v31.8b, v4.8b[4] //(v) + dup v31.8b, v4.b[4] //(v) umull v18.8h, v16.8b, v26.8b //(iii)vmull_u8(ref_main_idx, dup_const_32_fract) - umov w14, v5.2s[1] //extract idx to the r register + umov w14, v5.s[1] //extract idx to the r register sxtw x14,w14 umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract) @@ -316,7 +316,7 @@ prologue: rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5) sbfx x9,x14,#0,#8 - dup v29.8b, v4.8b[5] //(vi) + dup v29.8b, v4.b[5] //(vi) add x10,x8,x9 //(v)*pu1_ref[ref_main_idx] ld1 {v23.8b},[x10],x11 //(v)ref_main_idx @@ -332,7 +332,7 @@ prologue: rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5) add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx] - dup v27.8b, v4.8b[6] //(vii) + dup v27.8b, v4.b[6] //(vii) sbfx x9,x14,#16,#8 sub v30.8b, v1.8b , v31.8b //(v)32-fract(dup_const_32_fract) @@ -347,7 +347,7 @@ prologue: st1 {v18.8b},[x0],x3 //(iii) rshrn v22.8b, v22.8h,#5 //(iv)shift_res = vrshrn_n_u16(add_res, 5) - dup v25.8b, v4.8b[7] //(viii) + dup v25.8b, v4.b[7] //(viii) sbfx x9,x14,#24,#8 ld1 {v16.8b},[x10],x11 //(vii)ref_main_idx @@ -385,13 
+385,13 @@ prologue: and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31)) xtn v4.8b, v4.8h shrn v3.8b, v2.8h,#5 //idx = pos >> 5 - umov w14, v3.2s[0] //(i)extract idx to the r register + umov w14, v3.s[0] //(i)extract idx to the r register sxtw x14,w14 sbfx x9,x14,#0,#8 add x10,x8,x9 //(i)*pu1_ref[ref_main_idx] kernel_8_rows: - dup v31.8b, v4.8b[0] + dup v31.8b, v4.b[0] subs x4,x4,#8 sbfx x9,x14,#8,#8 @@ -408,7 +408,7 @@ kernel_8_rows: ld1 {v5.8b},[x6] //loads the row value umlal v22.8h, v21.8b, v25.8b //(viii)vmull_u8(ref_main_idx_1, dup_const_fract) - dup v29.8b, v4.8b[1] //(ii) + dup v29.8b, v4.b[1] //(ii) rshrn v18.8b, v18.8h,#5 //(vii)shift_res = vrshrn_n_u16(add_res, 5) sbfx x9,x14,#16,#8 @@ -427,11 +427,11 @@ kernel_8_rows: sbfx x9,x14,#24,#8 csel x4, x5, x4,le //reload nt - umov w14, v3.2s[1] //extract idx to the r register + umov w14, v3.s[1] //extract idx to the r register sxtw x14,w14 rshrn v22.8b, v22.8h,#5 //(viii)shift_res = vrshrn_n_u16(add_res, 5) - dup v27.8b, v4.8b[2] //(iii) + dup v27.8b, v4.b[2] //(iii) sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract) add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx] @@ -444,7 +444,7 @@ kernel_8_rows: ld1 {v17.8b},[x10] //(iii)ref_main_idx_1 rshrn v10.8b, v10.8h,#5 //(i)shift_res = vrshrn_n_u16(add_res, 5) - dup v25.8b, v4.8b[3] //(iv) + dup v25.8b, v4.b[3] //(iv) smull v2.8h, v5.8b, v0.8b //pos = ((row + 1) * intra_pred_ang) st1 {v22.8b},[x0] //(viii) @@ -459,7 +459,7 @@ kernel_8_rows: sbfx x9,x14,#0,#8 add x0,x2,x3 - dup v31.8b, v4.8b[4] //(v) + dup v31.8b, v4.b[4] //(v) rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5) add x10,x8,x9 //(v)*pu1_ref[ref_main_idx] @@ -468,16 +468,16 @@ kernel_8_rows: st1 {v10.8b},[x2],#8 //(i) sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract) - dup v29.8b, v4.8b[5] //(vi) + dup v29.8b, v4.b[5] //(vi) umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract) - dup v27.8b, v4.8b[6] //(vii) + dup v27.8b, v4.b[6] //(vii) umlal v22.8h, v21.8b, v25.8b //(iv)vmull_u8(ref_main_idx_1, dup_const_fract) add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx] sbfx x9,x14,#16,#8 - dup v25.8b, v4.8b[7] //(viii) + dup v25.8b, v4.b[7] //(viii) rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5) ld1 {v23.8b},[x10],x11 //(v)ref_main_idx @@ -498,7 +498,7 @@ kernel_8_rows: ld1 {v13.8b},[x12] //(vi)ref_main_idx_1 umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract) - umov w14, v3.2s[0] //(i)extract idx to the r register + umov w14, v3.s[0] //(i)extract idx to the r register sxtw x14,w14 umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract) diff --git a/common/arm64/ihevc_intra_pred_luma_dc.s b/common/arm64/ihevc_intra_pred_luma_dc.s index e4fdb5d..fc86ffa 100644 --- a/common/arm64/ihevc_intra_pred_luma_dc.s +++ b/common/arm64/ihevc_intra_pred_luma_dc.s @@ -200,7 +200,7 @@ epil_add_loop: mov x20,#128 csel x6, x20, x6,eq - dup v16.8b, v18.8b[0] //dc_val + dup v16.8b, v18.b[0] //dc_val shl d25, d18,#1 //2*dc beq prologue_cpy_32 @@ -218,7 +218,7 @@ epil_add_loop: add d23, d23 , d17 //3*dc + 2 add x12, x12, #8 //offset after one 8x8 block (-7*strd + 8) - dup v24.8h, v23.4h[0] //3*dc + 2 (moved to all lanes) + dup v24.8h, v23.h[0] //3*dc + 2 (moved to all lanes) sub x0, x3, x4 //strd - nt prologue_col: @@ -368,9 +368,9 @@ prologue_cpy_32: add x5, x2, x3 add x8, x5, x3 add x10, x8, x3 - dup v20.16b, v16.8b[0] + dup v20.16b, v16.b[0] lsl x6, x3, #2 - add x6, x6, #-16 + sub x6, x6, #16 st1 {v20.16b}, [x2],#16 st1 
{v20.16b}, [x5],#16 @@ -451,7 +451,7 @@ dc_4: shl d25, d18,#1 //2*dc sub x9, x9, #3 //&src[2nt-1-row] - dup v16.8b, v18.8b[0] //dc_val + dup v16.8b, v18.b[0] //dc_val add d27, d25 , d28 //src[2nt+1]+2+src[2nt-1]+2dc_val ushr v29.4h, v27.4h,#2 //final dst[0]'s value in d15[0] @@ -461,7 +461,7 @@ dc_4: add d23, d23 , d17 //3*dc + 2 add x12, x12, #4 //offset after one 4x4 block (-3*strd + 4) - dup v24.8h, v23.4h[0] //3*dc + 2 (moved to all lanes) + dup v24.8h, v23.h[0] //3*dc + 2 (moved to all lanes) sub x0, x3, x4 //strd - nt diff --git a/common/arm64/ihevc_intra_pred_luma_horz.s b/common/arm64/ihevc_intra_pred_luma_horz.s index 95452e4..3d1f27f 100644 --- a/common/arm64/ihevc_intra_pred_luma_horz.s +++ b/common/arm64/ihevc_intra_pred_luma_horz.s @@ -119,63 +119,63 @@ ihevc_intra_pred_luma_horz_av8: core_loop_32: ld1 { v0.16b},[x12] //load 16 values. d1[7] will have the 1st value. - dup v2.16b, v0.16b[15] //duplicate the i value. + dup v2.16b, v0.b[15] //duplicate the i value. - dup v4.16b, v0.16b[14] //duplicate the ii value. - dup v6.16b, v0.16b[13] //duplicate the iii value. + dup v4.16b, v0.b[14] //duplicate the ii value. + dup v6.16b, v0.b[13] //duplicate the iii value. st1 { v2.16b},[x2],x3 //store in 1st row 0-16 columns st1 { v2.16b},[x9],x3 //store in 1st row 16-32 columns - dup v1.16b, v0.16b[12] + dup v1.16b, v0.b[12] st1 { v4.16b},[x2],x3 st1 { v4.16b},[x9],x3 - dup v2.16b, v0.16b[11] + dup v2.16b, v0.b[11] st1 { v6.16b},[x2],x3 st1 { v6.16b},[x9],x3 - dup v4.16b, v0.16b[10] + dup v4.16b, v0.b[10] st1 { v1.16b},[x2],x3 st1 { v1.16b},[x9],x3 - dup v6.16b, v0.16b[9] + dup v6.16b, v0.b[9] st1 { v2.16b},[x2],x3 st1 { v2.16b},[x9],x3 - dup v1.16b, v0.16b[8] + dup v1.16b, v0.b[8] st1 { v4.16b},[x2],x3 st1 { v4.16b},[x9],x3 - dup v2.16b, v0.8b[7] + dup v2.16b, v0.b[7] st1 { v6.16b},[x2],x3 st1 { v6.16b},[x9],x3 - dup v4.16b, v0.8b[6] + dup v4.16b, v0.b[6] st1 { v1.16b},[x2],x3 st1 { v1.16b},[x9],x3 - dup v6.16b, v0.8b[5] + dup v6.16b, v0.b[5] st1 { v2.16b},[x2],x3 st1 { v2.16b},[x9],x3 - dup v1.16b, v0.8b[4] + dup v1.16b, v0.b[4] st1 { v4.16b},[x2],x3 st1 { v4.16b},[x9],x3 - dup v2.16b, v0.8b[3] + dup v2.16b, v0.b[3] st1 { v6.16b},[x2],x3 st1 { v6.16b},[x9],x3 - dup v4.16b, v0.8b[2] + dup v4.16b, v0.b[2] st1 { v1.16b},[x2],x3 st1 { v1.16b},[x9],x3 - dup v6.16b, v0.8b[1] + dup v6.16b, v0.b[1] st1 { v2.16b},[x2],x3 st1 { v2.16b},[x9],x3 sub x12,x12,#16 //move to 16th value pointer - dup v1.16b, v0.8b[0] + dup v1.16b, v0.b[0] st1 { v4.16b},[x2],x3 st1 { v4.16b},[x9],x3 @@ -202,33 +202,33 @@ core_loop_16: dup v28.8b,w14 sub x12,x12,#17 ld1 { v0.16b},[x12] - dup v26.8b, v0.16b[15] + dup v26.8b, v0.b[15] uxtl v26.8h, v26.8b - dup v2.16b, v0.16b[14] + dup v2.16b, v0.b[14] usubl v24.8h, v30.8b, v28.8b - dup v4.16b, v0.16b[13] + dup v4.16b, v0.b[13] sshr v24.8h, v24.8h,#1 - dup v6.16b, v0.16b[12] + dup v6.16b, v0.b[12] sqadd v22.8h, v26.8h , v24.8h - dup v1.16b, v0.16b[11] + dup v1.16b, v0.b[11] sqxtun v22.8b, v22.8h st1 {v22.8b},[x2],#8 - dup v18.16b, v0.16b[10] + dup v18.16b, v0.b[10] usubl v24.8h, v31.8b, v28.8b - dup v19.16b, v0.16b[9] + dup v19.16b, v0.b[9] sshr v24.8h, v24.8h,#1 - dup v20.16b, v0.16b[8] + dup v20.16b, v0.b[8] sqadd v22.8h, v26.8h , v24.8h - dup v16.16b, v0.8b[7] + dup v16.16b, v0.b[7] sqxtun v22.8b, v22.8h st1 {v22.8b},[x2],x3 @@ -240,25 +240,25 @@ core_loop_16: st1 { v6.16b},[x2],x3 st1 { v1.16b},[x2],x3 - dup v2.16b, v0.8b[6] + dup v2.16b, v0.b[6] st1 { v18.16b},[x2],x3 - dup v4.16b, v0.8b[5] + dup v4.16b, v0.b[5] st1 { v19.16b},[x2],x3 - dup v6.16b, v0.8b[4] + dup 
v6.16b, v0.b[4] st1 { v20.16b},[x2],x3 - dup v1.16b, v0.8b[3] + dup v1.16b, v0.b[3] st1 { v16.16b},[x2],x3 - dup v18.16b, v0.8b[2] + dup v18.16b, v0.b[2] st1 { v2.16b},[x2],x3 - dup v19.16b, v0.8b[1] + dup v19.16b, v0.b[1] st1 { v4.16b},[x2],x3 - dup v20.16b, v0.8b[0] + dup v20.16b, v0.b[0] st1 { v6.16b},[x2],x3 st1 { v1.16b},[x2],x3 @@ -281,32 +281,32 @@ core_loop_8: sub x12,x12,#9 ld1 {v0.8b},[x12] - dup v26.8b, v0.8b[7] + dup v26.8b, v0.b[7] dup v28.8b,w14 - dup v3.8b, v0.8b[6] + dup v3.8b, v0.b[6] uxtl v26.8h, v26.8b - dup v4.8b, v0.8b[5] + dup v4.8b, v0.b[5] usubl v24.8h, v30.8b, v28.8b - dup v5.8b, v0.8b[4] + dup v5.8b, v0.b[4] sshr v24.8h, v24.8h,#1 - dup v6.8b, v0.8b[3] + dup v6.8b, v0.b[3] sqadd v22.8h, v26.8h , v24.8h - dup v7.8b, v0.8b[2] + dup v7.8b, v0.b[2] sqxtun v22.8b, v22.8h st1 {v22.8b},[x2],x3 st1 {v3.8b},[x2],x3 - dup v1.8b, v0.8b[1] + dup v1.8b, v0.b[1] st1 {v4.8b},[x2],x3 st1 {v5.8b},[x2],x3 - dup v17.8b, v0.8b[0] + dup v17.8b, v0.b[0] st1 {v6.8b},[x2],x3 st1 {v7.8b},[x2],x3 @@ -328,16 +328,16 @@ core_loop_4: sub x12,x12,#5 ld1 {v0.8b},[x12] dup v28.8b,w14 - dup v26.8b, v0.8b[3] + dup v26.8b, v0.b[3] uxtl v26.8h, v26.8b - dup v3.8b, v0.8b[2] + dup v3.8b, v0.b[2] usubl v24.8h, v30.8b, v28.8b - dup v4.8b, v0.8b[1] + dup v4.8b, v0.b[1] sshr v24.8h, v24.8h,#1 - dup v5.8b, v0.8b[0] + dup v5.8b, v0.b[0] sqadd v22.8h, v26.8h , v24.8h sqxtun v22.8b, v22.8h diff --git a/common/arm64/ihevc_intra_pred_luma_mode2.s b/common/arm64/ihevc_intra_pred_luma_mode2.s index 598ce5a..6eec479 100644 --- a/common/arm64/ihevc_intra_pred_luma_mode2.s +++ b/common/arm64/ihevc_intra_pred_luma_mode2.s @@ -116,7 +116,7 @@ ihevc_intra_pred_luma_mode2_av8: add x0,x0,x4,lsl #1 sub x0,x0,#9 //src[1] - add x10,x0,#-1 + sub x10,x0,#1 prologue_cpy_32: @@ -215,7 +215,7 @@ kernel_mode2: add x9, x7, x3 rev64 v20.8b, v4.8b - add x10,x0,#-1 + sub x10,x0,#1 rev64 v21.8b, v5.8b subs x1, x1, #8 @@ -244,7 +244,7 @@ mode2_4: mov x8,#-2 sub x0,x0,#1 - add x10,x0,#-1 + sub x10,x0,#1 ld1 {v0.8b},[x0],x8 add x5,x2,x3 diff --git a/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s b/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s index 58b2d37..dcc9e43 100644 --- a/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s +++ b/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s @@ -147,13 +147,13 @@ prologue: xtn v4.8b, v4.8h shrn v5.8b, v2.8h,#5 //idx = pos >> 5 - dup v31.8b, v4.8b[0] + dup v31.8b, v4.b[0] add x0,x2,x3 - umov w14, v5.2s[0] //(i row)extract idx to the r register + umov w14, v5.s[0] //(i row)extract idx to the r register sxtw x14,w14 - dup v29.8b, v4.8b[1] //(ii) + dup v29.8b, v4.b[1] //(ii) and x9,x14,#0xff //(i row) get the last byte add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx] @@ -177,11 +177,11 @@ prologue: umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract) asr x14,x14,#8 //(iv) - dup v27.8b, v4.8b[2] //(iii) + dup v27.8b, v4.b[2] //(iii) sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract) and x9,x14,#0xff //(iv) - dup v25.8b, v4.8b[3] //(iv) + dup v25.8b, v4.b[3] //(iv) umull v14.8h, v12.8b, v28.8b //(ii)vmull_u8(ref_main_idx, dup_const_32_fract) add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx] @@ -196,10 +196,10 @@ prologue: ld1 {v21.8b},[x12] //(iv)ref_main_idx_1 - dup v31.8b, v4.8b[4] //(v) + dup v31.8b, v4.b[4] //(v) umull v18.8h, v16.8b, v26.8b //(iii)vmull_u8(ref_main_idx, dup_const_32_fract) - umov w14, v5.2s[1] //extract idx to the r register + umov w14, v5.s[1] //extract idx to the r register sxtw x14,w14 umlal v18.8h, v17.8b, v27.8b 
diff --git a/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s b/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s
index 58b2d37..dcc9e43 100644
--- a/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s
+++ b/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s
@@ -147,13 +147,13 @@ prologue:
 xtn v4.8b, v4.8h
 shrn v5.8b, v2.8h,#5 //idx = pos >> 5
- dup v31.8b, v4.8b[0]
+ dup v31.8b, v4.b[0]
 add x0,x2,x3
- umov w14, v5.2s[0] //(i row)extract idx to the r register
+ umov w14, v5.s[0] //(i row)extract idx to the r register
 sxtw x14,w14
- dup v29.8b, v4.8b[1] //(ii)
+ dup v29.8b, v4.b[1] //(ii)
 and x9,x14,#0xff //(i row) get the last byte
 add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx]
@@ -177,11 +177,11 @@ prologue:
 umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
 asr x14,x14,#8 //(iv)
- dup v27.8b, v4.8b[2] //(iii)
+ dup v27.8b, v4.b[2] //(iii)
 sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract)
 and x9,x14,#0xff //(iv)
- dup v25.8b, v4.8b[3] //(iv)
+ dup v25.8b, v4.b[3] //(iv)
 umull v14.8h, v12.8b, v28.8b //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
 add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx]
@@ -196,10 +196,10 @@ prologue:
 ld1 {v21.8b},[x12] //(iv)ref_main_idx_1
- dup v31.8b, v4.8b[4] //(v)
+ dup v31.8b, v4.b[4] //(v)
 umull v18.8h, v16.8b, v26.8b //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
- umov w14, v5.2s[1] //extract idx to the r register
+ umov w14, v5.s[1] //extract idx to the r register
 sxtw x14,w14
 umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
@@ -207,7 +207,7 @@ prologue:
 rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5)
 and x9,x14,#0xff //(v)
- dup v29.8b, v4.8b[5] //(vi)
+ dup v29.8b, v4.b[5] //(vi)
 add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
 ld1 {v23.8b},[x10],x11 //(v)ref_main_idx
@@ -224,7 +224,7 @@ prologue:
 rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
 add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx]
- dup v27.8b, v4.8b[6] //(vii)
+ dup v27.8b, v4.b[6] //(vii)
 asr x14,x14,#8 //(vii)
 and x9,x14,#0xff //(vii)
@@ -241,7 +241,7 @@ prologue:
 rshrn v22.8b, v22.8h,#5 //(iv)shift_res = vrshrn_n_u16(add_res, 5)
 asr x14,x14,#8 //(viii)
- dup v25.8b, v4.8b[7] //(viii)
+ dup v25.8b, v4.b[7] //(viii)
 and x9,x14,#0xff //(viii)
 ld1 {v16.8b},[x10],x11 //(vii)ref_main_idx
@@ -279,14 +279,14 @@ prologue:
 and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
 xtn v4.8b, v4.8h
 shrn v3.8b, v2.8h,#5 //idx = pos >> 5
- umov w14, v3.2s[0] //(i)extract idx to the r register
+ umov w14, v3.s[0] //(i)extract idx to the r register
 sxtw x14,w14
 and x9,x14,#0xff //(i)
 add x10,x8,x9 //(i)*pu1_ref[ref_main_idx]
 kernel_8_rows:
 asr x14,x14,#8 //(ii)
- dup v31.8b, v4.8b[0]
+ dup v31.8b, v4.b[0]
 subs x4,x4,#8
 ld1 {v23.8b},[x10],x11 //(i)ref_main_idx
@@ -303,7 +303,7 @@ kernel_8_rows:
 umlal v22.8h, v21.8b, v25.8b //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
 asr x14,x14,#8 //(iii)
- dup v29.8b, v4.8b[1] //(ii)
+ dup v29.8b, v4.b[1] //(ii)
 rshrn v18.8b, v18.8h,#5 //(vii)shift_res = vrshrn_n_u16(add_res, 5)
 and x9,x14,#0xff //(iii)
@@ -319,11 +319,11 @@ kernel_8_rows:
 umlal v10.8h, v9.8b, v31.8b //(i)vmull_u8(ref_main_idx_1, dup_const_fract)
 and x9,x14,#0xff //(iv)
- umov w14, v3.2s[1] //extract idx to the r register
+ umov w14, v3.s[1] //extract idx to the r register
 sxtw x14,w14
 rshrn v22.8b, v22.8h,#5 //(viii)shift_res = vrshrn_n_u16(add_res, 5)
- dup v27.8b, v4.8b[2] //(iii)
+ dup v27.8b, v4.b[2] //(iii)
 sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract)
 csel x4, x5, x4,le //reload nt
@@ -337,7 +337,7 @@ kernel_8_rows:
 ld1 {v17.8b},[x10] //(iii)ref_main_idx_1
 rshrn v10.8b, v10.8h,#5 //(i)shift_res = vrshrn_n_u16(add_res, 5)
- dup v25.8b, v4.8b[3] //(iv)
+ dup v25.8b, v4.b[3] //(iv)
 umull v2.8h, v5.8b, v0.8b //pos = ((row + 1) * intra_pred_ang)
 st1 {v22.8b},[x0] //(viii)
@@ -351,7 +351,7 @@ kernel_8_rows:
 umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
 and x9,x14,#0xff //(v)
- dup v31.8b, v4.8b[4] //(v)
+ dup v31.8b, v4.b[4] //(v)
 rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5)
 add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
@@ -359,15 +359,15 @@ kernel_8_rows:
 sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract)
 asr x14,x14,#8 //(vi)
- dup v29.8b, v4.8b[5] //(vi)
+ dup v29.8b, v4.b[5] //(vi)
 umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
 and x9,x14,#0xff //(vi)
- dup v27.8b, v4.8b[6] //(vii)
+ dup v27.8b, v4.b[6] //(vii)
 umlal v22.8h, v21.8b, v25.8b //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
 add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx]
- dup v25.8b, v4.8b[7] //(viii)
+ dup v25.8b, v4.b[7] //(viii)
 rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
 asr x14,x14,#8 //(vii)
@@ -390,7 +390,7 @@ kernel_8_rows:
 ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
 umull v10.8h, v23.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
- umov w14, v3.2s[0] //(i)extract idx to the r register
+ umov w14, v3.s[0] //(i)extract idx to the r register
 sxtw x14,w14
 umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
 add x12,x8,x9 //(viii)*pu1_ref[ref_main_idx]

diff --git a/common/arm64/ihevc_intra_pred_luma_planar.s b/common/arm64/ihevc_intra_pred_luma_planar.s
index ba04f42..ec9d3ca 100644
--- a/common/arm64/ihevc_intra_pred_luma_planar.s
+++ b/common/arm64/ihevc_intra_pred_luma_planar.s
@@ -186,10 +186,10 @@ col_loop_8_16_32:
 ld1 {v3.8b},[x14] //(1-8)load 8 src[2nt+1+col]
 umlal v27.8h, v17.8b, v1.8b //(1)(col+1) * src[3nt+1]
- dup v20.8b, v4.8b[7] //(1)
+ dup v20.8b, v4.b[7] //(1)
 umlal v27.8h, v6.8b, v3.8b //(1)(nt-1-row) * src[2nt+1+col]
- dup v21.8b, v4.8b[6] //(2)
+ dup v21.8b, v4.b[6] //(2)
 umlal v27.8h, v19.8b, v20.8b //(1)(nt-1-col) * src[2nt-1-row]
 dup v30.8h,w4 //(2)
@@ -197,7 +197,7 @@ col_loop_8_16_32:
 sub v6.8b, v6.8b , v7.8b //(1)
- dup v22.8b, v4.8b[5] //(3)
+ dup v22.8b, v4.b[5] //(3)
 umlal v30.8h, v5.8b, v0.8b //(2)
 dup v28.8h,w4 //(3)
@@ -214,7 +214,7 @@ col_loop_8_16_32:
 xtn v27.8b, v27.8h //(1)
 umlal v28.8h, v5.8b, v0.8b //(3)
- dup v23.8b, v4.8b[4] //(4)
+ dup v23.8b, v4.b[4] //(4)
 umlal v28.8h, v17.8b, v1.8b //(3)
 dup v25.8h,w4 //(4)
@@ -231,7 +231,7 @@ col_loop_8_16_32:
 xtn v30.8b, v30.8h //(2)
 umlal v25.8h, v5.8b, v0.8b //(4)
- dup v20.8b, v4.8b[3] //(5)
+ dup v20.8b, v4.b[3] //(5)
 umlal v25.8h, v17.8b, v1.8b //(4)
 dup v16.8h,w4 //(5)
@@ -248,7 +248,7 @@ col_loop_8_16_32:
 xtn v28.8b, v28.8h //(3)
 umlal v16.8h, v5.8b, v0.8b //(5)
- dup v21.8b, v4.8b[2] //(6)
+ dup v21.8b, v4.b[2] //(6)
 umlal v16.8h, v17.8b, v1.8b //(5)
 dup v18.8h,w4 //(6)
@@ -264,7 +264,7 @@ col_loop_8_16_32:
 xtn v25.8b, v25.8h //(4)
 umlal v18.8h, v5.8b, v0.8b //(6)
- dup v22.8b, v4.8b[1] //(7)
+ dup v22.8b, v4.b[1] //(7)
 umlal v18.8h, v17.8b, v1.8b //(6)
 dup v26.8h,w4 //(7)
@@ -281,7 +281,7 @@ col_loop_8_16_32:
 xtn v16.8b, v16.8h //(5)
 umlal v26.8h, v5.8b, v0.8b //(7)
- dup v23.8b, v4.8b[0] //(8)
+ dup v23.8b, v4.b[0] //(8)
 umlal v26.8h, v17.8b, v1.8b //(7)
 dup v24.8h,w4 //(8)
@@ -337,7 +337,7 @@ col_loop_8_16_32:
 ld1 {v4.8b},[x6] //(1n)(1-8)src[2nt-1-row]
 sub v19.8b, v2.8b , v17.8b //(1n)(1-8)[nt-1-col]
- dup v20.8b, v4.8b[7] //(1n)(1)
+ dup v20.8b, v4.b[7] //(1n)(1)
 sub v6.8b, v2.8b , v5.8b
 beq epilog
@@ -353,7 +353,7 @@ kernel_plnr:
 xtn v24.8b, v24.8h //(8)
 umlal v27.8h, v17.8b, v1.8b //(1)(col+1) * src[3nt+1]
- dup v21.8b, v4.8b[6] //(2)
+ dup v21.8b, v4.b[6] //(2)
 umlal v27.8h, v6.8b, v3.8b //(1)(nt-1-row) * src[2nt+1+col]
 dup v30.8h,w4 //(2)
@@ -373,7 +373,7 @@ kernel_plnr:
 csel x2, x20, x2,le
 umlal v30.8h, v17.8b, v1.8b //(2)
- dup v22.8b, v4.8b[5] //(3)
+ dup v22.8b, v4.b[5] //(3)
 umlal v30.8h, v6.8b, v3.8b //(2)
 dup v28.8h,w4 //(3)
@@ -390,7 +390,7 @@ kernel_plnr:
 xtn v27.8b, v27.8h //(1)
 umlal v28.8h, v5.8b, v0.8b //(3)
- dup v23.8b, v4.8b[4] //(4)
+ dup v23.8b, v4.b[4] //(4)
 umlal v28.8h, v17.8b, v1.8b //(3)
 dup v25.8h,w4 //(4)
@@ -408,7 +408,7 @@ kernel_plnr:
 xtn v30.8b, v30.8h //(2)
 umlal v25.8h, v5.8b, v0.8b //(4)
- dup v20.8b, v4.8b[3] //(5)
+ dup v20.8b, v4.b[3] //(5)
 umlal v25.8h, v17.8b, v1.8b //(4)
 dup v16.8h,w4 //(5)
@@ -426,7 +426,7 @@ kernel_plnr:
 xtn v28.8b, v28.8h //(3)
 umlal v16.8h, v5.8b, v0.8b //(5)
- dup v21.8b, v4.8b[2] //(6)
+ dup v21.8b, v4.b[2] //(6)
 umlal v16.8h, v17.8b, v1.8b //(5)
 dup v18.8h,w4 //(6)
@@ -450,7 +450,7 @@ kernel_plnr:
 xtn v25.8b, v25.8h //(4)
 umlal v18.8h, v5.8b, v0.8b //(6)
- dup v22.8b, v4.8b[1] //(7)
+ dup v22.8b, v4.b[1] //(7)
 umlal v18.8h, v17.8b, v1.8b //(6)
 dup v26.8h,w4 //(7)
@@ -473,7 +473,7 @@ kernel_plnr:
 xtn v16.8b, v16.8h //(5)
 umlal v26.8h, v5.8b, v0.8b //(7)
- dup v23.8b, v4.8b[0] //(8)
+ dup v23.8b, v4.b[0] //(8)
 umlal v26.8h, v17.8b, v1.8b //(7)
 dup v24.8h,w4 //(8)
@@ -495,7 +495,7 @@ kernel_plnr:
 ld1 {v5.8b},[x5] //(row+1 value)
 umlal v24.8h, v17.8b, v1.8b //(8)
- dup v20.8b, v4.8b[7] //(1n)(1)
+ dup v20.8b, v4.b[7] //(1n)(1)
 umlal v24.8h, v6.8b, v3.8b //(8)
 st1 {v18.8b},[x2], x3 //(6)str 8 values

diff --git a/common/arm64/ihevc_intra_pred_luma_vert.s b/common/arm64/ihevc_intra_pred_luma_vert.s
index c67f721..a8b111e 100644
--- a/common/arm64/ihevc_intra_pred_luma_vert.s
+++ b/common/arm64/ihevc_intra_pred_luma_vert.s
@@ -122,7 +122,7 @@ copy_32:
 ld1 {v22.8b, v23.8b}, [x6] //16 loads (col 16:31)
 lsl x11, x3, #2
- add x11, x11, #-16
+ sub x11, x11, #16
 st1 {v20.8b, v21.8b}, [x2],#16
 st1 {v20.8b, v21.8b}, [x5],#16
 st1 {v20.8b, v21.8b}, [x8],#16
@@ -183,7 +183,7 @@ blk_16:
 sxtw x12,w12
 ld1 {v16.8b, v17.8b}, [x6] //ld for repl to cols src[2nt+1+col(0:15)] (0 ignored for stores)
- add x6, x6, #-17 //subtract -9 to take it to src[2nt-1-row(15)]
+ sub x6, x6, #17 //subtract 17 to take it to src[2nt-1-row(15)]
 dup v24.16b,w12 //src[2nt+1]
 dup v30.8h,w12
@@ -323,7 +323,7 @@ blk_4_8:
 sxtw x12,w12
 ld1 {v16.8b},[x6] //ld for repl to cols src[2nt+1+col(0:3 or 0:7)](0 ignored for st)
- add x6, x6, #-9 //subtract -9 to take it to src[2nt-1-row(15)]
+ sub x6, x6, #9 //subtract 9 to take it to src[2nt-1-row(15)]
 dup v24.8b,w12 //src[2nt+1]
 dup v30.8h,w12
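Note ahead of the two inverse-transform files: there the lane rewrite lands on the by-element widening multiply-accumulates that build the IDCT butterflies. Per the code's own comments, each instruction multiplies a row of coefficients by one cosine constant held in a lane of the table registers v0-v3 (v0-v7 in the 32x32 case) and accumulates a 32-bit partial sum; a two-line sketch using the registers of the first 16x16 hunk below:

 smull v24.4s, v6.4h, v0.h[1] // b0 = y1 * cos1: widening multiply by table lane v0.h[1]
 smlal v24.4s, v7.4h, v0.h[3] // b0 += y3 * cos3: accumulate the next odd-row term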
diff --git a/common/arm64/ihevc_itrans_recon_16x16.s b/common/arm64/ihevc_itrans_recon_16x16.s
index 90df840..fe76678 100644
--- a/common/arm64/ihevc_itrans_recon_16x16.s
+++ b/common/arm64/ihevc_itrans_recon_16x16.s
@@ -252,56 +252,56 @@ first_stage_top_four_bottom_four:
 //d7=x3
 skip_load4rows:
- smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v7.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v7.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v12.4s, v10.4h, v0.4h[0]
- smlal v12.4s, v11.4h, v0.4h[2]
- smull v14.4s, v10.4h, v0.4h[0]
- smlal v14.4s, v11.4h, v1.4h[2]
- smull v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v2.4h[2]
- smull v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v3.4h[2]
+ smull v12.4s, v10.4h, v0.h[0]
+ smlal v12.4s, v11.4h, v0.h[2]
+ smull v14.4s, v10.4h, v0.h[0]
+ smlal v14.4s, v11.4h, v1.h[2]
+ smull v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v2.h[2]
+ smull v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v3.h[2]
 bge skip_last12rows_kernel1
- smlal v24.4s, v8.4h, v1.4h[1]
- smlal v26.4s, v8.4h, v3.4h[3]
- smlsl v28.4s, v8.4h, v1.4h[3]
- smlsl v30.4s, v8.4h, v0.4h[3]
+ smlal v24.4s, v8.4h, v1.h[1]
+ smlal v26.4s, v8.4h, v3.h[3]
+ smlsl v28.4s, v8.4h, v1.h[3]
+ smlsl v30.4s, v8.4h, v0.h[3]
- smlal v24.4s, v9.4h, v1.4h[3]
- smlsl v26.4s, v9.4h, v2.4h[3]
- smlsl v28.4s, v9.4h, v0.4h[3]
- smlal v30.4s, v9.4h, v3.4h[3]
+ smlal v24.4s, v9.4h, v1.h[3]
+ smlsl v26.4s, v9.4h, v2.h[3]
+ smlsl v28.4s, v9.4h, v0.h[3]
+ smlal v30.4s, v9.4h, v3.h[3]
- smlal v12.4s, v4.4h, v1.4h[0]
- smlal v12.4s, v5.4h, v1.4h[2]
- smlal v14.4s, v4.4h, v3.4h[0]
- smlsl v14.4s, v5.4h, v3.4h[2]
- smlsl v16.4s, v4.4h, v3.4h[0]
- smlsl v16.4s, v5.4h, v0.4h[2]
- smlsl v18.4s, v4.4h, v1.4h[0]
- smlsl v18.4s, v5.4h, v2.4h[2]
+ smlal v12.4s, v4.4h, v1.h[0]
+ smlal v12.4s, v5.4h, v1.h[2]
+ smlal v14.4s, v4.4h, v3.h[0]
+ smlsl v14.4s, v5.4h, v3.h[2]
+ smlsl v16.4s, v4.4h, v3.h[0]
+ smlsl v16.4s, v5.4h, v0.h[2]
+ smlsl v18.4s, v4.4h, v1.h[0]
+ smlsl v18.4s, v5.4h, v2.h[2]
 //d0[0]= 64 d2[0]=64
 //d0[1]= 90 d2[1]=57
@@ -328,57 +328,57 @@ skip_load4rows:
- smlal v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
- smlsl v26.4s, v6.4h, v1.4h[1] //// y1 * cos3(part of b1)
- smlsl v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
- smlal v30.4s, v6.4h, v0.4h[1] //// y1 * sin1(part of b3)
+ smlal v24.4s, v6.4h, v2.h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v6.4h, v1.h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v6.4h, v3.h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v6.4h, v0.h[1] //// y1 * sin1(part of b3)
- smlal v24.4s, v7.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v7.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v7.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v7.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v24.4s, v8.4h, v3.4h[1]
- smlsl v26.4s, v8.4h, v1.4h[3]
- smlal v28.4s, v8.4h, v0.4h[1]
- smlsl v30.4s, v8.4h, v1.4h[1]
+ smlal v24.4s, v8.4h, v3.h[1]
+ smlsl v26.4s, v8.4h, v1.h[3]
+ smlal v28.4s, v8.4h, v0.h[1]
+ smlsl v30.4s, v8.4h, v1.h[1]
- smlal v24.4s, v9.4h, v3.4h[3]
- smlsl v26.4s, v9.4h, v3.4h[1]
- smlal v28.4s, v9.4h, v2.4h[3]
- smlsl v30.4s, v9.4h, v2.4h[1]
+ smlal v24.4s, v9.4h, v3.h[3]
+ smlsl v26.4s, v9.4h, v3.h[1]
+ smlal v28.4s, v9.4h, v2.h[3]
+ smlsl v30.4s, v9.4h, v2.h[1]
- smlal v12.4s, v10.4h, v0.4h[0]
- smlal v12.4s, v11.4h, v2.4h[2]
- smlal v12.4s, v4.4h, v3.4h[0]
- smlal v12.4s, v5.4h, v3.4h[2]
+ smlal v12.4s, v10.4h, v0.h[0]
+ smlal v12.4s, v11.4h, v2.h[2]
+ smlal v12.4s, v4.4h, v3.h[0]
+ smlal v12.4s, v5.4h, v3.h[2]
- smlsl v14.4s, v10.4h, v0.4h[0]
- smlsl v14.4s, v11.4h, v0.4h[2]
- smlsl v14.4s, v4.4h, v1.4h[0]
- smlsl v14.4s, v5.4h, v2.4h[2]
+ smlsl v14.4s, v10.4h, v0.h[0]
+ smlsl v14.4s, v11.4h, v0.h[2]
+ smlsl v14.4s, v4.4h, v1.h[0]
+ smlsl v14.4s, v5.4h, v2.h[2]
- smlsl v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v3.4h[2]
- smlal v16.4s, v4.4h, v1.4h[0]
- smlal v16.4s, v5.4h, v1.4h[2]
+ smlsl v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v3.h[2]
+ smlal v16.4s, v4.4h, v1.h[0]
+ smlal v16.4s, v5.4h, v1.h[2]
- smlal v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v1.4h[2]
- smlsl v18.4s, v4.4h, v3.4h[0]
- smlsl v18.4s, v5.4h, v0.4h[2]
+ smlal v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v1.h[2]
+ smlsl v18.4s, v4.4h, v3.h[0]
+ smlsl v18.4s, v5.4h, v0.h[2]
 skip_last12rows_kernel1:
 add v20.4s, v12.4s , v24.4s
@@ -430,55 +430,55 @@ first_stage_middle_eight:
 skip_stage1_kernel_load:
- smull v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v6.4h, v2.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v6.4h, v3.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v6.4h, v2.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v2.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v3.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v3.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v7.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v7.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v7.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v3.4h[2]
- smull v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v2.4h[2]
- smull v16.4s, v10.4h, v0.4h[0]
- smlsl v16.4s, v11.4h, v1.4h[2]
- smull v18.4s, v10.4h, v0.4h[0]
- smlsl v18.4s, v11.4h, v0.4h[2]
+ smull v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v3.h[2]
+ smull v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v2.h[2]
+ smull v16.4s, v10.4h, v0.h[0]
+ smlsl v16.4s, v11.4h, v1.h[2]
+ smull v18.4s, v10.4h, v0.h[0]
+ smlsl v18.4s, v11.4h, v0.h[2]
 cmp x11,x7
 bge skip_last12rows_kernel2
- smlsl v24.4s, v8.4h, v3.4h[1]
- smlal v26.4s, v8.4h, v2.4h[1]
- smlal v28.4s, v8.4h, v0.4h[1]
- smlal v30.4s, v8.4h, v2.4h[3]
+ smlsl v24.4s, v8.4h, v3.h[1]
+ smlal v26.4s, v8.4h, v2.h[1]
+ smlal v28.4s, v8.4h, v0.h[1]
+ smlal v30.4s, v8.4h, v2.h[3]
- smlal v24.4s, v9.4h, v0.4h[1]
- smlal v26.4s, v9.4h, v3.4h[1]
- smlsl v28.4s, v9.4h, v1.4h[1]
- smlsl v30.4s, v9.4h, v2.4h[1]
+ smlal v24.4s, v9.4h, v0.h[1]
+ smlal v26.4s, v9.4h, v3.h[1]
+ smlsl v28.4s, v9.4h, v1.h[1]
+ smlsl v30.4s, v9.4h, v2.h[1]
- smlsl v22.4s, v4.4h, v1.4h[0]
- smlal v22.4s, v5.4h, v2.4h[2]
- smlsl v20.4s, v4.4h, v3.4h[0]
- smlal v20.4s, v5.4h, v0.4h[2]
- smlal v16.4s, v4.4h, v3.4h[0]
- smlal v16.4s, v5.4h, v3.4h[2]
- smlal v18.4s, v4.4h, v1.4h[0]
- smlsl v18.4s, v5.4h, v1.4h[2]
+ smlsl v22.4s, v4.4h, v1.h[0]
+ smlal v22.4s, v5.4h, v2.h[2]
+ smlsl v20.4s, v4.4h, v3.h[0]
+ smlal v20.4s, v5.4h, v0.h[2]
+ smlal v16.4s, v4.4h, v3.h[0]
+ smlal v16.4s, v5.4h, v3.h[2]
+ smlal v18.4s, v4.4h, v1.h[0]
+ smlsl v18.4s, v5.4h, v1.h[2]
 //d0[0]= 64 d2[0]=64
 //d0[1]= 90 d2[1]=57
@@ -502,55 +502,55 @@ skip_stage1_kernel_load:
 ld1 {v9.4h},[x9],x5
- smlsl v24.4s, v6.4h, v3.4h[3] //// y1 * cos1(part of b0)
- smlsl v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smlal v28.4s, v6.4h, v2.4h[3] //// y1 * sin3(part of b2)
- smlal v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smlsl v24.4s, v6.4h, v3.h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v6.4h, v2.h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v24.4s, v8.4h, v2.4h[3]
- smlal v26.4s, v8.4h, v3.4h[3]
- smlsl v28.4s, v8.4h, v2.4h[1]
- smlal v30.4s, v8.4h, v0.4h[3]
+ smlal v24.4s, v8.4h, v2.h[3]
+ smlal v26.4s, v8.4h, v3.h[3]
+ smlsl v28.4s, v8.4h, v2.h[1]
+ smlal v30.4s, v8.4h, v0.h[3]
- smlal v24.4s, v9.4h, v1.4h[3]
- smlsl v26.4s, v9.4h, v1.4h[1]
- smlal v28.4s, v9.4h, v0.4h[3]
- smlsl v30.4s, v9.4h, v0.4h[1]
+ smlal v24.4s, v9.4h, v1.h[3]
+ smlsl v26.4s, v9.4h, v1.h[1]
+ smlal v28.4s, v9.4h, v0.h[3]
+ smlsl v30.4s, v9.4h, v0.h[1]
- smlal v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v1.4h[2]
- smlsl v22.4s, v4.4h, v3.4h[0]
- smlal v22.4s, v5.4h, v0.4h[2]
+ smlal v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v1.h[2]
+ smlsl v22.4s, v4.4h, v3.h[0]
+ smlal v22.4s, v5.4h, v0.h[2]
- smlsl v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v3.4h[2]
- smlal v20.4s, v4.4h, v1.4h[0]
- smlsl v20.4s, v5.4h, v1.4h[2]
+ smlsl v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v3.h[2]
+ smlal v20.4s, v4.4h, v1.h[0]
+ smlsl v20.4s, v5.4h, v1.h[2]
- smlsl v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v0.4h[2]
- smlsl v16.4s, v4.4h, v1.4h[0]
- smlal v16.4s, v5.4h, v2.4h[2]
+ smlsl v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v0.h[2]
+ smlsl v16.4s, v4.4h, v1.h[0]
+ smlal v16.4s, v5.4h, v2.h[2]
- smlal v18.4s, v10.4h, v0.4h[0]
- smlsl v18.4s, v11.4h, v2.4h[2]
- smlal v18.4s, v4.4h, v3.4h[0]
- smlsl v18.4s, v5.4h, v3.4h[2]
+ smlal v18.4s, v10.4h, v0.h[0]
+ smlsl v18.4s, v11.4h, v2.h[2]
+ smlal v18.4s, v4.4h, v3.h[0]
+ smlsl v18.4s, v5.4h, v3.h[2]
 skip_last12rows_kernel2:
@@ -755,48 +755,48 @@ second_stage:
 second_stage_process:
- smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v7.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v7.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v12.4s, v10.4h, v0.4h[0]
- smlal v12.4s, v11.4h, v0.4h[2]
- smull v14.4s, v10.4h, v0.4h[0]
- smlal v14.4s, v11.4h, v1.4h[2]
- smull v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v2.4h[2]
- smull v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v3.4h[2]
+ smull v12.4s, v10.4h, v0.h[0]
+ smlal v12.4s, v11.4h, v0.h[2]
+ smull v14.4s, v10.4h, v0.h[0]
+ smlal v14.4s, v11.4h, v1.h[2]
+ smull v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v2.h[2]
+ smull v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v3.h[2]
 bge skip_last8rows_stage2_kernel1
- smlal v24.4s, v8.4h, v1.4h[1]
- smlal v26.4s, v8.4h, v3.4h[3]
- smlsl v28.4s, v8.4h, v1.4h[3]
- smlsl v30.4s, v8.4h, v0.4h[3]
+ smlal v24.4s, v8.4h, v1.h[1]
+ smlal v26.4s, v8.4h, v3.h[3]
+ smlsl v28.4s, v8.4h, v1.h[3]
+ smlsl v30.4s, v8.4h, v0.h[3]
- smlal v24.4s, v9.4h, v1.4h[3]
- smlsl v26.4s, v9.4h, v2.4h[3]
- smlsl v28.4s, v9.4h, v0.4h[3]
- smlal v30.4s, v9.4h, v3.4h[3]
+ smlal v24.4s, v9.4h, v1.h[3]
+ smlsl v26.4s, v9.4h, v2.h[3]
+ smlsl v28.4s, v9.4h, v0.h[3]
+ smlal v30.4s, v9.4h, v3.h[3]
- smlal v12.4s, v4.4h, v1.4h[0]
- smlal v12.4s, v5.4h, v1.4h[2]
- smlal v14.4s, v4.4h, v3.4h[0]
- smlsl v14.4s, v5.4h, v3.4h[2]
- smlsl v16.4s, v4.4h, v3.4h[0]
- smlsl v16.4s, v5.4h, v0.4h[2]
- smlsl v18.4s, v4.4h, v1.4h[0]
- smlsl v18.4s, v5.4h, v2.4h[2]
+ smlal v12.4s, v4.4h, v1.h[0]
+ smlal v12.4s, v5.4h, v1.h[2]
+ smlal v14.4s, v4.4h, v3.h[0]
+ smlsl v14.4s, v5.4h, v3.h[2]
+ smlsl v16.4s, v4.4h, v3.h[0]
+ smlsl v16.4s, v5.4h, v0.h[2]
+ smlsl v18.4s, v4.4h, v1.h[0]
+ smlsl v18.4s, v5.4h, v2.h[2]
 mov x19,#0xff00
 cmp x12,x19
@@ -812,57 +812,57 @@ second_stage_process:
- smlal v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
- smlsl v26.4s, v6.4h, v1.4h[1] //// y1 * cos3(part of b1)
- smlsl v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
- smlal v30.4s, v6.4h, v0.4h[1] //// y1 * sin1(part of b3)
+ smlal v24.4s, v6.4h, v2.h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v6.4h, v1.h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v6.4h, v3.h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v6.4h, v0.h[1] //// y1 * sin1(part of b3)
- smlal v24.4s, v7.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v7.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v7.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v7.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v24.4s, v8.4h, v3.4h[1]
- smlsl v26.4s, v8.4h, v1.4h[3]
- smlal v28.4s, v8.4h, v0.4h[1]
- smlsl v30.4s, v8.4h, v1.4h[1]
+ smlal v24.4s, v8.4h, v3.h[1]
+ smlsl v26.4s, v8.4h, v1.h[3]
+ smlal v28.4s, v8.4h, v0.h[1]
+ smlsl v30.4s, v8.4h, v1.h[1]
- smlal v24.4s, v9.4h, v3.4h[3]
- smlsl v26.4s, v9.4h, v3.4h[1]
- smlal v28.4s, v9.4h, v2.4h[3]
- smlsl v30.4s, v9.4h, v2.4h[1]
+ smlal v24.4s, v9.4h, v3.h[3]
+ smlsl v26.4s, v9.4h, v3.h[1]
+ smlal v28.4s, v9.4h, v2.h[3]
+ smlsl v30.4s, v9.4h, v2.h[1]
- smlal v12.4s, v10.4h, v0.4h[0]
- smlal v12.4s, v11.4h, v2.4h[2]
- smlal v12.4s, v4.4h, v3.4h[0]
- smlal v12.4s, v5.4h, v3.4h[2]
+ smlal v12.4s, v10.4h, v0.h[0]
+ smlal v12.4s, v11.4h, v2.h[2]
+ smlal v12.4s, v4.4h, v3.h[0]
+ smlal v12.4s, v5.4h, v3.h[2]
- smlsl v14.4s, v10.4h, v0.4h[0]
- smlsl v14.4s, v11.4h, v0.4h[2]
- smlsl v14.4s, v4.4h, v1.4h[0]
- smlsl v14.4s, v5.4h, v2.4h[2]
+ smlsl v14.4s, v10.4h, v0.h[0]
+ smlsl v14.4s, v11.4h, v0.h[2]
+ smlsl v14.4s, v4.4h, v1.h[0]
+ smlsl v14.4s, v5.4h, v2.h[2]
- smlsl v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v3.4h[2]
- smlal v16.4s, v4.4h, v1.4h[0]
- smlal v16.4s, v5.4h, v1.4h[2]
+ smlsl v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v3.h[2]
+ smlal v16.4s, v4.4h, v1.h[0]
+ smlal v16.4s, v5.4h, v1.h[2]
- smlal v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v1.4h[2]
- smlsl v18.4s, v4.4h, v3.4h[0]
- smlsl v18.4s, v5.4h, v0.4h[2]
+ smlal v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v1.h[2]
+ smlsl v18.4s, v4.4h, v3.h[0]
+ smlsl v18.4s, v5.4h, v0.h[2]
@@ -914,25 +914,25 @@ skip_stage2_kernel_load:
 st1 {v18.4h, v19.4h},[x1],#16
 sub x1,x1,#32
- smull v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v6.4h, v2.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v6.4h, v3.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v6.4h, v2.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v2.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v3.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v3.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v7.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v7.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v7.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v3.4h[2]
- smull v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v2.4h[2]
- smull v16.4s, v10.4h, v0.4h[0]
- smlsl v16.4s, v11.4h, v1.4h[2]
- smull v18.4s, v10.4h, v0.4h[0]
- smlsl v18.4s, v11.4h, v0.4h[2]
+ smull v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v3.h[2]
+ smull v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v2.h[2]
+ smull v16.4s, v10.4h, v0.h[0]
+ smlsl v16.4s, v11.4h, v1.h[2]
+ smull v18.4s, v10.4h, v0.h[0]
+ smlsl v18.4s, v11.4h, v0.h[2]
@@ -940,27 +940,27 @@ skip_stage2_kernel_load:
 bge skip_last8rows_stage2_kernel2
- smlsl v24.4s, v8.4h, v3.4h[1]
- smlal v26.4s, v8.4h, v2.4h[1]
- smlal v28.4s, v8.4h, v0.4h[1]
- smlal v30.4s, v8.4h, v2.4h[3]
+ smlsl v24.4s, v8.4h, v3.h[1]
+ smlal v26.4s, v8.4h, v2.h[1]
+ smlal v28.4s, v8.4h, v0.h[1]
+ smlal v30.4s, v8.4h, v2.h[3]
- smlal v24.4s, v9.4h, v0.4h[1]
- smlal v26.4s, v9.4h, v3.4h[1]
- smlsl v28.4s, v9.4h, v1.4h[1]
- smlsl v30.4s, v9.4h, v2.4h[1]
+ smlal v24.4s, v9.4h, v0.h[1]
+ smlal v26.4s, v9.4h, v3.h[1]
+ smlsl v28.4s, v9.4h, v1.h[1]
+ smlsl v30.4s, v9.4h, v2.h[1]
- smlsl v22.4s, v4.4h, v1.4h[0]
- smlal v22.4s, v5.4h, v2.4h[2]
- smlsl v20.4s, v4.4h, v3.4h[0]
- smlal v20.4s, v5.4h, v0.4h[2]
- smlal v16.4s, v4.4h, v3.4h[0]
- smlal v16.4s, v5.4h, v3.4h[2]
- smlal v18.4s, v4.4h, v1.4h[0]
- smlsl v18.4s, v5.4h, v1.4h[2]
+ smlsl v22.4s, v4.4h, v1.h[0]
+ smlal v22.4s, v5.4h, v2.h[2]
+ smlsl v20.4s, v4.4h, v3.h[0]
+ smlal v20.4s, v5.4h, v0.h[2]
+ smlal v16.4s, v4.4h, v3.h[0]
+ smlal v16.4s, v5.4h, v3.h[2]
+ smlal v18.4s, v4.4h, v1.h[0]
+ smlsl v18.4s, v5.4h, v1.h[2]
 mov x19,#0xff00
 cmp x12,x19
 bge skip_last8rows_stage2_kernel2
@@ -970,55 +970,55 @@ skip_stage2_kernel_load:
 ld1 {v4.4h, v5.4h},[x0],#16
 ld1 {v8.4h, v9.4h},[x0],#16
- smlsl v24.4s, v6.4h, v3.4h[3] //// y1 * cos1(part of b0)
- smlsl v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smlal v28.4s, v6.4h, v2.4h[3] //// y1 * sin3(part of b2)
- smlal v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smlsl v24.4s, v6.4h, v3.h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v6.4h, v2.h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v24.4s, v8.4h, v2.4h[3]
- smlal v26.4s, v8.4h, v3.4h[3]
- smlsl v28.4s, v8.4h, v2.4h[1]
- smlal v30.4s, v8.4h, v0.4h[3]
+ smlal v24.4s, v8.4h, v2.h[3]
+ smlal v26.4s, v8.4h, v3.h[3]
+ smlsl v28.4s, v8.4h, v2.h[1]
+ smlal v30.4s, v8.4h, v0.h[3]
- smlal v24.4s, v9.4h, v1.4h[3]
- smlsl v26.4s, v9.4h, v1.4h[1]
- smlal v28.4s, v9.4h, v0.4h[3]
- smlsl v30.4s, v9.4h, v0.4h[1]
+ smlal v24.4s, v9.4h, v1.h[3]
+ smlsl v26.4s, v9.4h, v1.h[1]
+ smlal v28.4s, v9.4h, v0.h[3]
+ smlsl v30.4s, v9.4h, v0.h[1]
- smlal v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v1.4h[2]
- smlsl v22.4s, v4.4h, v3.4h[0]
- smlal v22.4s, v5.4h, v0.4h[2]
+ smlal v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v1.h[2]
+ smlsl v22.4s, v4.4h, v3.h[0]
+ smlal v22.4s, v5.4h, v0.h[2]
- smlsl v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v3.4h[2]
- smlal v20.4s, v4.4h, v1.4h[0]
- smlsl v20.4s, v5.4h, v1.4h[2]
+ smlsl v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v3.h[2]
+ smlal v20.4s, v4.4h, v1.h[0]
+ smlsl v20.4s, v5.4h, v1.h[2]
- smlsl v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v0.4h[2]
- smlsl v16.4s, v4.4h, v1.4h[0]
- smlal v16.4s, v5.4h, v2.4h[2]
+ smlsl v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v0.h[2]
+ smlsl v16.4s, v4.4h, v1.h[0]
+ smlal v16.4s, v5.4h, v2.h[2]
- smlal v18.4s, v10.4h, v0.4h[0]
- smlsl v18.4s, v11.4h, v2.4h[2]
- smlal v18.4s, v4.4h, v3.4h[0]
- smlsl v18.4s, v5.4h, v3.4h[2]
+ smlal v18.4s, v10.4h, v0.h[0]
+ smlsl v18.4s, v11.4h, v2.h[2]
+ smlal v18.4s, v4.4h, v3.h[0]
+ smlsl v18.4s, v5.4h, v3.h[2]
 skip_last8rows_stage2_kernel2:

diff --git a/common/arm64/ihevc_itrans_recon_32x32.s b/common/arm64/ihevc_itrans_recon_32x32.s
index 6f40747..51646ac 100644
--- a/common/arm64/ihevc_itrans_recon_32x32.s
+++ b/common/arm64/ihevc_itrans_recon_32x32.s
@@ -213,32 +213,32 @@ stage1:
 ld1 {v11.4h},[x0],x6
 ld1 {v9.4h},[x0],x6
- smull v24.4s, v8.4h, v0.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v8.4h, v0.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v8.4h, v0.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v0.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v9.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v10.4h, v0.4h[0]
- smlal v20.4s, v11.4h, v0.4h[2]
+ smull v20.4s, v10.4h, v0.h[0]
+ smlal v20.4s, v11.4h, v0.h[2]
- smull v22.4s, v10.4h, v0.4h[0]
- smlal v22.4s, v11.4h, v1.4h[2]
+ smull v22.4s, v10.4h, v0.h[0]
+ smlal v22.4s, v11.4h, v1.h[2]
- smull v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v2.4h[2]
+ smull v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v2.h[2]
- smull v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v3.4h[2]
+ smull v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v3.h[2]
 cmp x11,x10
 bhs shift1
@@ -253,26 +253,26 @@ stage1:
- smlal v24.4s, v14.4h, v1.4h[1]
- smlal v26.4s, v14.4h, v3.4h[3]
- smlal v28.4s, v14.4h, v6.4h[1]
- smlsl v30.4s, v14.4h, v7.4h[1]
+ smlal v24.4s, v14.4h, v1.h[1]
+ smlal v26.4s, v14.4h, v3.h[3]
+ smlal v28.4s, v14.4h, v6.h[1]
+ smlsl v30.4s, v14.4h, v7.h[1]
- smlal v24.4s, v15.4h, v1.4h[3]
- smlal v26.4s, v15.4h, v5.4h[1]
- smlsl v28.4s, v15.4h, v7.4h[1]
- smlsl v30.4s, v15.4h, v3.4h[3]
+ smlal v24.4s, v15.4h, v1.h[3]
+ smlal v26.4s, v15.4h, v5.h[1]
+ smlsl v28.4s, v15.4h, v7.h[1]
+ smlsl v30.4s, v15.4h, v3.h[3]
- smlal v20.4s, v12.4h, v1.4h[0]
- smlal v20.4s, v13.4h, v1.4h[2]
- smlal v22.4s, v12.4h, v3.4h[0]
- smlal v22.4s, v13.4h, v4.4h[2]
- smlal v16.4s, v12.4h, v5.4h[0]
- smlal v16.4s, v13.4h, v7.4h[2]
- smlal v18.4s, v12.4h, v7.4h[0]
- smlsl v18.4s, v13.4h, v5.4h[2]
+ smlal v20.4s, v12.4h, v1.h[0]
+ smlal v20.4s, v13.4h, v1.h[2]
+ smlal v22.4s, v12.4h, v3.h[0]
+ smlal v22.4s, v13.4h, v4.h[2]
+ smlal v16.4s, v12.4h, v5.h[0]
+ smlal v16.4s, v13.4h, v7.h[2]
+ smlal v18.4s, v12.4h, v7.h[0]
+ smlsl v18.4s, v13.4h, v5.h[2]
 cmp x11,x9
 bhs shift1
@@ -283,32 +283,32 @@ stage1:
 ld1 {v9.4h},[x0],x6
- smlal v24.4s, v8.4h, v2.4h[1] //// y1 * cos1(part of b0)
- smlal v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
- smlsl v28.4s, v8.4h, v4.4h[3] //// y1 * sin3(part of b2)
- smlsl v30.4s, v8.4h, v0.4h[1] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0)
+ smlal v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2)
+ smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v7.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v2.4h[0]
- smlal v20.4s, v11.4h, v2.4h[2]
+ smlal v20.4s, v10.4h, v2.h[0]
+ smlal v20.4s, v11.4h, v2.h[2]
- smlal v22.4s, v10.4h, v6.4h[0]
- smlal v22.4s, v11.4h, v7.4h[2]
+ smlal v22.4s, v10.4h, v6.h[0]
+ smlal v22.4s, v11.4h, v7.h[2]
- smlsl v16.4s, v10.4h, v6.4h[0]
- smlsl v16.4s, v11.4h, v3.4h[2]
+ smlsl v16.4s, v10.4h, v6.h[0]
+ smlsl v16.4s, v11.4h, v3.h[2]
- smlsl v18.4s, v10.4h, v2.4h[0]
- smlsl v18.4s, v11.4h, v1.4h[2]
+ smlsl v18.4s, v10.4h, v2.h[0]
+ smlsl v18.4s, v11.4h, v1.h[2]
 cmp x11,x5
 bhs shift1
@@ -327,26 +327,26 @@ stage1:
- smlal v24.4s, v14.4h, v3.4h[1]
- smlsl v26.4s, v14.4h, v6.4h[1]
- smlsl v28.4s, v14.4h, v0.4h[1]
- smlsl v30.4s, v14.4h, v6.4h[3]
+ smlal v24.4s, v14.4h, v3.h[1]
+ smlsl v26.4s, v14.4h, v6.h[1]
+ smlsl v28.4s, v14.4h, v0.h[1]
+ smlsl v30.4s, v14.4h, v6.h[3]
- smlal v24.4s, v15.4h, v3.4h[3]
- smlsl v26.4s, v15.4h, v4.4h[3]
- smlsl v28.4s, v15.4h, v2.4h[3]
- smlal v30.4s, v15.4h, v5.4h[3]
+ smlal v24.4s, v15.4h, v3.h[3]
+ smlsl v26.4s, v15.4h, v4.h[3]
+ smlsl v28.4s, v15.4h, v2.h[3]
+ smlal v30.4s, v15.4h, v5.h[3]
- smlal v20.4s, v12.4h, v3.4h[0]
- smlal v20.4s, v13.4h, v3.4h[2]
- smlsl v22.4s, v12.4h, v7.4h[0]
- smlsl v22.4s, v13.4h, v5.4h[2]
- smlsl v16.4s, v12.4h, v1.4h[0]
- smlsl v16.4s, v13.4h, v1.4h[2]
- smlsl v18.4s, v12.4h, v5.4h[0]
- smlal v18.4s, v13.4h, v7.4h[2]
+ smlal v20.4s, v12.4h, v3.h[0]
+ smlal v20.4s, v13.4h, v3.h[2]
+ smlsl v22.4s, v12.4h, v7.h[0]
+ smlsl v22.4s, v13.4h, v5.h[2]
+ smlsl v16.4s, v12.4h, v1.h[0]
+ smlsl v16.4s, v13.4h, v1.h[2]
+ smlsl v18.4s, v12.4h, v5.h[0]
+ smlal v18.4s, v13.4h, v7.h[2]
 cmp x11,x7
 bhs shift1
@@ -359,32 +359,32 @@ stage1:
- smlal v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v3.4h[1] //// y1 * cos3(part of b1)
- smlsl v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v2.4h[1] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v4.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v7.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v0.4h[0]
- smlal v20.4s, v11.4h, v4.4h[2]
+ smlal v20.4s, v10.4h, v0.h[0]
+ smlal v20.4s, v11.4h, v4.h[2]
- smlsl v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v2.4h[2]
+ smlsl v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v2.h[2]
- smlsl v16.4s, v10.4h, v0.4h[0]
- smlsl v16.4s, v11.4h, v6.4h[2]
+ smlsl v16.4s, v10.4h, v0.h[0]
+ smlsl v16.4s, v11.4h, v6.h[2]
- smlal v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v0.4h[2]
+ smlal v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v0.h[2]
@@ -396,26 +396,26 @@ stage1:
- smlal v24.4s, v14.4h, v5.4h[1]
- smlsl v26.4s, v14.4h, v0.4h[2]
- smlal v28.4s, v14.4h, v5.4h[3]
- smlal v30.4s, v14.4h, v4.4h[3]
+ smlal v24.4s, v14.4h, v5.h[1]
+ smlsl v26.4s, v14.4h, v0.h[2]
+ smlal v28.4s, v14.4h, v5.h[3]
+ smlal v30.4s, v14.4h, v4.h[3]
- smlal v24.4s, v15.4h, v5.4h[3]
- smlsl v26.4s, v15.4h, v1.4h[1]
- smlal v28.4s, v15.4h, v3.4h[1]
- smlsl v30.4s, v15.4h, v7.4h[3]
+ smlal v24.4s, v15.4h, v5.h[3]
+ smlsl v26.4s, v15.4h, v1.h[1]
+ smlal v28.4s, v15.4h, v3.h[1]
+ smlsl v30.4s, v15.4h, v7.h[3]
- smlal v20.4s, v12.4h, v5.4h[0]
- smlal v20.4s, v13.4h, v5.4h[2]
- smlsl v22.4s, v12.4h, v1.4h[0]
- smlsl v22.4s, v13.4h, v0.4h[2]
- smlal v16.4s, v12.4h, v7.4h[0]
- smlal v16.4s, v13.4h, v4.4h[2]
- smlal v18.4s, v12.4h, v3.4h[0]
- smlal v18.4s, v13.4h, v6.4h[2]
+ smlal v20.4s, v12.4h, v5.h[0]
+ smlal v20.4s, v13.4h, v5.h[2]
+ smlsl v22.4s, v12.4h, v1.h[0]
+ smlsl v22.4s, v13.4h, v0.h[2]
+ smlal v16.4s, v12.4h, v7.h[0]
+ smlal v16.4s, v13.4h, v4.h[2]
+ smlal v18.4s, v12.4h, v3.h[0]
+ smlal v18.4s, v13.4h, v6.h[2]
 ld1 {v10.4h},[x0],x6
@@ -429,32 +429,32 @@ stage1:
- smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1)
- smlal v28.4s, v8.4h, v0.4h[1] //// y1 * sin3(part of b2)
- smlsl v30.4s, v8.4h, v4.4h[1] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v0.h[1] //// y1 * sin3(part of b2)
+ smlsl v30.4s, v8.4h, v4.h[1] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v9.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v0.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v6.4h[0]
- smlal v20.4s, v11.4h, v6.4h[2]
+ smlal v20.4s, v10.4h, v6.h[0]
+ smlal v20.4s, v11.4h, v6.h[2]
- smlsl v22.4s, v10.4h, v2.4h[0]
- smlsl v22.4s, v11.4h, v3.4h[2]
+ smlsl v22.4s, v10.4h, v2.h[0]
+ smlsl v22.4s, v11.4h, v3.h[2]
- smlal v16.4s, v10.4h, v2.4h[0]
- smlal v16.4s, v11.4h, v0.4h[2]
+ smlal v16.4s, v10.4h, v2.h[0]
+ smlal v16.4s, v11.4h, v0.h[2]
- smlsl v18.4s, v10.4h, v6.4h[0]
- smlsl v18.4s, v11.4h, v2.4h[2]
+ smlsl v18.4s, v10.4h, v6.h[0]
+ smlsl v18.4s, v11.4h, v2.h[2]
 ld1 {v12.4h},[x0],x6
 ld1 {v14.4h},[x0],x6
@@ -462,26 +462,26 @@ stage1:
 ld1 {v15.4h},[x0],x6
- smlal v24.4s, v14.4h, v7.4h[1]
- smlsl v26.4s, v14.4h, v5.4h[3]
- smlal v28.4s, v14.4h, v4.4h[1]
- smlsl v30.4s, v14.4h, v2.4h[3]
+ smlal v24.4s, v14.4h, v7.h[1]
+ smlsl v26.4s, v14.4h, v5.h[3]
+ smlal v28.4s, v14.4h, v4.h[1]
+ smlsl v30.4s, v14.4h, v2.h[3]
- smlal v24.4s, v15.4h, v7.4h[3]
- smlsl v26.4s, v15.4h, v7.4h[1]
- smlal v28.4s, v15.4h, v6.4h[3]
- smlsl v30.4s, v15.4h, v6.4h[1]
+ smlal v24.4s, v15.4h, v7.h[3]
+ smlsl v26.4s, v15.4h, v7.h[1]
+ smlal v28.4s, v15.4h, v6.h[3]
+ smlsl v30.4s, v15.4h, v6.h[1]
- smlal v20.4s, v12.4h, v7.4h[0]
- smlal v20.4s, v13.4h, v7.4h[2]
- smlsl v22.4s, v12.4h, v5.4h[0]
- smlsl v22.4s, v13.4h, v6.4h[2]
- smlal v16.4s, v12.4h, v3.4h[0]
- smlal v16.4s, v13.4h, v5.4h[2]
- smlsl v18.4s, v12.4h, v1.4h[0]
- smlsl v18.4s, v13.4h, v4.4h[2]
+ smlal v20.4s, v12.4h, v7.h[0]
+ smlal v20.4s, v13.4h, v7.h[2]
+ smlsl v22.4s, v12.4h, v5.h[0]
+ smlsl v22.4s, v13.4h, v6.h[2]
+ smlal v16.4s, v12.4h, v3.h[0]
+ smlal v16.4s, v13.4h, v5.h[2]
+ smlsl v18.4s, v12.4h, v1.h[0]
+ smlsl v18.4s, v13.4h, v4.h[2]
@@ -574,32 +574,32 @@ shift1:
- smull v24.4s, v8.4h, v2.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v8.4h, v3.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v7.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v4.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v10.4h, v0.4h[0]
- smlal v20.4s, v11.4h, v4.4h[2]
+ smull v20.4s, v10.4h, v0.h[0]
+ smlal v20.4s, v11.4h, v4.h[2]
- smull v22.4s, v10.4h, v0.4h[0]
- smlal v22.4s, v11.4h, v5.4h[2]
+ smull v22.4s, v10.4h, v0.h[0]
+ smlal v22.4s, v11.4h, v5.h[2]
- smull v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v6.4h[2]
+ smull v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v6.h[2]
- smull v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v7.4h[2]
+ smull v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v7.h[2]
 cmp x11,x10
 bhs shift2
@@ -609,26 +609,26 @@ shift1:
 ld1 {v15.4h},[x0],x6
- smlsl v24.4s, v14.4h, v4.4h[3]
- smlsl v26.4s, v14.4h, v2.4h[1]
- smlsl v28.4s, v14.4h, v0.4h[1]
- smlsl v30.4s, v14.4h, v2.4h[3]
+ smlsl v24.4s, v14.4h, v4.h[3]
+ smlsl v26.4s, v14.4h, v2.h[1]
+ smlsl v28.4s, v14.4h, v0.h[1]
+ smlsl v30.4s, v14.4h, v2.h[3]
- smlsl v24.4s, v15.4h, v0.4h[3]
- smlsl v26.4s, v15.4h, v3.4h[1]
- smlsl v28.4s, v15.4h, v6.4h[3]
- smlal v30.4s, v15.4h, v5.4h[3]
+ smlsl v24.4s, v15.4h, v0.h[3]
+ smlsl v26.4s, v15.4h, v3.h[1]
+ smlsl v28.4s, v15.4h, v6.h[3]
+ smlal v30.4s, v15.4h, v5.h[3]
- smlsl v20.4s, v12.4h, v7.4h[0]
- smlsl v20.4s, v13.4h, v2.4h[2]
- smlsl v22.4s, v12.4h, v5.4h[0]
- smlsl v22.4s, v13.4h, v0.4h[2]
- smlsl v16.4s, v12.4h, v3.4h[0]
- smlsl v16.4s, v13.4h, v3.4h[2]
- smlsl v18.4s, v12.4h, v1.4h[0]
- smlsl v18.4s, v13.4h, v6.4h[2]
+ smlsl v20.4s, v12.4h, v7.h[0]
+ smlsl v20.4s, v13.4h, v2.h[2]
+ smlsl v22.4s, v12.4h, v5.h[0]
+ smlsl v22.4s, v13.4h, v0.h[2]
+ smlsl v16.4s, v12.4h, v3.h[0]
+ smlsl v16.4s, v13.4h, v3.h[2]
+ smlsl v18.4s, v12.4h, v1.h[0]
+ smlsl v18.4s, v13.4h, v6.h[2]
 cmp x11,x9
 bhs shift2
@@ -645,32 +645,32 @@ shift1:
- smlsl v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
- smlal v26.4s, v8.4h, v7.4h[1] //// y1 * cos3(part of b1)
- smlal v28.4s, v8.4h, v2.4h[3] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smlsl v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
+ smlal v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v2.h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v7.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v6.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v6.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
- smlsl v20.4s, v10.4h, v2.4h[0]
- smlsl v20.4s, v11.4h, v6.4h[2]
+ smlsl v20.4s, v10.4h, v2.h[0]
+ smlsl v20.4s, v11.4h, v6.h[2]
- smlsl v22.4s, v10.4h, v6.4h[0]
- smlal v22.4s, v11.4h, v4.4h[2]
+ smlsl v22.4s, v10.4h, v6.h[0]
+ smlal v22.4s, v11.4h, v4.h[2]
- smlal v16.4s, v10.4h, v6.4h[0]
- smlal v16.4s, v11.4h, v0.4h[2]
+ smlal v16.4s, v10.4h, v6.h[0]
+ smlal v16.4s, v11.4h, v0.h[2]
- smlal v18.4s, v10.4h, v2.4h[0]
- smlal v18.4s, v11.4h, v5.4h[2]
+ smlal v18.4s, v10.4h, v2.h[0]
+ smlal v18.4s, v11.4h, v5.h[2]
 cmp x11,x5
 bhs shift2
@@ -685,26 +685,26 @@ shift1:
- smlal v24.4s, v14.4h, v2.4h[3]
- smlal v26.4s, v14.4h, v3.4h[3]
- smlsl v28.4s, v14.4h, v5.4h[3]
- smlsl v30.4s, v14.4h, v0.4h[3]
+ smlal v24.4s, v14.4h, v2.h[3]
+ smlal v26.4s, v14.4h, v3.h[3]
+ smlsl v28.4s, v14.4h, v5.h[3]
+ smlsl v30.4s, v14.4h, v0.h[3]
- smlal v24.4s, v15.4h, v1.4h[3]
- smlsl v26.4s, v15.4h, v6.4h[3]
- smlsl v28.4s, v15.4h, v0.4h[3]
- smlal v30.4s, v15.4h, v7.4h[3]
+ smlal v24.4s, v15.4h, v1.h[3]
+ smlsl v26.4s, v15.4h, v6.h[3]
+ smlsl v28.4s, v15.4h, v0.h[3]
+ smlal v30.4s, v15.4h, v7.h[3]
- smlal v20.4s, v12.4h, v5.4h[0]
- smlal v20.4s, v13.4h, v0.4h[2]
- smlal v22.4s, v12.4h, v1.4h[0]
- smlal v22.4s, v13.4h, v6.4h[2]
- smlal v16.4s, v12.4h, v7.4h[0]
- smlsl v16.4s, v13.4h, v2.4h[2]
- smlsl v18.4s, v12.4h, v3.4h[0]
- smlsl v18.4s, v13.4h, v4.4h[2]
+ smlal v20.4s, v12.4h, v5.h[0]
+ smlal v20.4s, v13.4h, v0.h[2]
+ smlal v22.4s, v12.4h, v1.h[0]
+ smlal v22.4s, v13.4h, v6.h[2]
+ smlal v16.4s, v12.4h, v7.h[0]
+ smlsl v16.4s, v13.4h, v2.h[2]
+ smlsl v18.4s, v12.4h, v3.h[0]
+ smlsl v18.4s, v13.4h, v4.h[2]
 cmp x11,x7
@@ -722,32 +722,32 @@ shift1:
- smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v1.4h[1] //// y1 * cos3(part of b1)
- smlsl v28.4s, v8.4h, v7.4h[1] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v0.4h[3] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v1.h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v0.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v5.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v9.4h, v7.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v5.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v7.4h[2]
+ smlal v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v7.h[2]
- smlsl v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v1.4h[2]
+ smlsl v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v1.h[2]
- smlsl v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v5.4h[2]
+ smlsl v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v5.h[2]
- smlal v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v3.4h[2]
+ smlal v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v3.h[2]
@@ -757,26 +757,26 @@ shift1:
 ld1 {v15.4h},[x0],x6
- smlsl v24.4s, v14.4h, v0.4h[1]
- smlal v26.4s, v14.4h, v6.4h[1]
- smlal v28.4s, v14.4h, v4.4h[1]
- smlsl v30.4s, v14.4h, v1.4h[1]
+ smlsl v24.4s, v14.4h, v0.h[1]
+ smlal v26.4s, v14.4h, v6.h[1]
+ smlal v28.4s, v14.4h, v4.h[1]
+ smlsl v30.4s, v14.4h, v1.h[1]
- smlsl v24.4s, v15.4h, v3.4h[3]
- smlal v26.4s, v15.4h, v0.4h[1]
- smlsl v28.4s, v15.4h, v5.4h[1]
- smlsl v30.4s, v15.4h, v6.4h[1]
+ smlsl v24.4s, v15.4h, v3.h[3]
+ smlal v26.4s, v15.4h, v0.h[1]
+ smlsl v28.4s, v15.4h, v5.h[1]
+ smlsl v30.4s, v15.4h, v6.h[1]
- smlsl v20.4s, v12.4h, v3.4h[0]
- smlsl v20.4s, v13.4h, v1.4h[2]
- smlsl v22.4s, v12.4h, v7.4h[0]
- smlal v22.4s, v13.4h, v3.4h[2]
- smlal v16.4s, v12.4h, v1.4h[0]
- smlal v16.4s, v13.4h, v7.4h[2]
- smlsl v18.4s, v12.4h, v5.4h[0]
- smlsl v18.4s, v13.4h, v2.4h[2]
+ smlsl v20.4s, v12.4h, v3.h[0]
+ smlsl v20.4s, v13.4h, v1.h[2]
+ smlsl v22.4s, v12.4h, v7.h[0]
+ smlal v22.4s, v13.4h, v3.h[2]
+ smlal v16.4s, v12.4h, v1.h[0]
+ smlal v16.4s, v13.4h, v7.h[2]
+ smlsl v18.4s, v12.4h, v5.h[0]
+ smlsl v18.4s, v13.4h, v2.h[2]
 ld1 {v10.4h},[x0],x6
 ld1 {v8.4h},[x0],x6
@@ -786,32 +786,32 @@ shift1:
- smlal v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
- smlal v26.4s, v8.4h, v4.4h[3] //// y1 * cos3(part of b1)
- smlsl v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v2.4h[1] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
+ smlal v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v3.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v7.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlsl v20.4s, v10.4h, v6.4h[0]
- smlal v20.4s, v11.4h, v5.4h[2]
+ smlsl v20.4s, v10.4h, v6.h[0]
+ smlal v20.4s, v11.4h, v5.h[2]
- smlal v22.4s, v10.4h, v2.4h[0]
- smlal v22.4s, v11.4h, v7.4h[2]
+ smlal v22.4s, v10.4h, v2.h[0]
+ smlal v22.4s, v11.4h, v7.h[2]
- smlsl v16.4s, v10.4h, v2.4h[0]
- smlsl v16.4s, v11.4h, v4.4h[2]
+ smlsl v16.4s, v10.4h, v2.h[0]
+ smlsl v16.4s, v11.4h, v4.h[2]
- smlal v18.4s, v10.4h, v6.4h[0]
- smlal v18.4s, v11.4h, v1.4h[2]
+ smlal v18.4s, v10.4h, v6.h[0]
+ smlal v18.4s, v11.4h, v1.h[2]
 ld1 {v12.4h},[x0],x6
@@ -823,26 +823,26 @@ shift1:
- smlal v24.4s, v14.4h, v1.4h[1]
- smlsl v26.4s, v14.4h, v0.4h[3]
- smlal v28.4s, v14.4h, v1.4h[3]
- smlsl v30.4s, v14.4h, v3.4h[1]
+ smlal v24.4s, v14.4h, v1.h[1]
+ smlsl v26.4s, v14.4h, v0.h[3]
+ smlal v28.4s, v14.4h, v1.h[3]
+ smlsl v30.4s, v14.4h, v3.h[1]
- smlal v24.4s, v15.4h, v5.4h[3]
- smlsl v26.4s, v15.4h, v5.4h[1]
- smlal v28.4s, v15.4h, v4.4h[3]
- smlsl v30.4s, v15.4h, v4.4h[1]
+ smlal v24.4s, v15.4h, v5.h[3]
+ smlsl v26.4s, v15.4h, v5.h[1]
+ smlal v28.4s, v15.4h, v4.h[3]
+ smlsl v30.4s, v15.4h, v4.h[1]
- smlal v20.4s, v12.4h, v1.4h[0]
- smlal v20.4s, v13.4h, v3.4h[2]
- smlsl v22.4s, v12.4h, v3.4h[0]
- smlsl v22.4s, v13.4h, v2.4h[2]
- smlal v16.4s, v12.4h, v5.4h[0]
- smlal v16.4s, v13.4h, v1.4h[2]
- smlsl v18.4s, v12.4h, v7.4h[0]
- smlsl v18.4s, v13.4h, v0.4h[2]
+ smlal v20.4s, v12.4h, v1.h[0]
+ smlal v20.4s, v13.4h, v3.h[2]
+ smlsl v22.4s, v12.4h, v3.h[0]
+ smlsl v22.4s, v13.4h, v2.h[2]
+ smlal v16.4s, v12.4h, v5.h[0]
+ smlal v16.4s, v13.4h, v1.h[2]
+ smlsl v18.4s, v12.4h, v7.h[0]
+ smlsl v18.4s, v13.4h, v0.h[2]
 shift2:
 add v8.4s, v20.4s , v24.4s
@@ -914,32 +914,32 @@ shift2:
 ld1 {v9.4h},[x0],x6
- smull v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v8.4h, v4.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v8.4h, v5.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v3.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v0.4h[2] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v0.h[2] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v7.4h[2]
+ smull v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v7.h[2]
- smull v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v6.4h[2]
+ smull v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v6.h[2]
- smull v16.4s, v10.4h, v0.4h[0]
- smlsl v16.4s, v11.4h, v5.4h[2]
+ smull v16.4s, v10.4h, v0.h[0]
+ smlsl v16.4s, v11.4h, v5.h[2]
- smull v18.4s, v10.4h, v0.4h[0]
- smlsl v18.4s, v11.4h, v4.4h[2]
+ smull v18.4s, v10.4h, v0.h[0]
+ smlsl v18.4s, v11.4h, v4.h[2]
 cmp x11,x10
 bhs shift3
@@ -952,26 +952,26 @@ shift2:
- smlsl v24.4s, v14.4h, v5.4h[1]
- smlsl v26.4s, v14.4h, v7.4h[3]
- smlal v28.4s, v14.4h, v5.4h[3]
- smlal v30.4s, v14.4h, v3.4h[1]
+ smlsl v24.4s, v14.4h, v5.h[1]
+ smlsl v26.4s, v14.4h, v7.h[3]
+ smlal v28.4s, v14.4h, v5.h[3]
+ smlal v30.4s, v14.4h, v3.h[1]
- smlal v24.4s, v15.4h, v2.4h[1]
- smlal v26.4s, v15.4h, v1.4h[1]
- smlal v28.4s, v15.4h, v4.4h[3]
- smlsl v30.4s, v15.4h, v7.4h[3]
+ smlal v24.4s, v15.4h, v2.h[1]
+ smlal v26.4s, v15.4h, v1.h[1]
+ smlal v28.4s, v15.4h, v4.h[3]
+ smlsl v30.4s, v15.4h, v7.h[3]
- smlsl v20.4s, v12.4h, v1.4h[0]
- smlal v20.4s, v13.4h, v6.4h[2]
- smlsl v22.4s, v12.4h, v3.4h[0]
- smlal v22.4s, v13.4h, v3.4h[2]
- smlsl v16.4s, v12.4h, v5.4h[0]
- smlal v16.4s, v13.4h, v0.4h[2]
- smlsl v18.4s, v12.4h, v7.4h[0]
- smlal v18.4s, v13.4h, v2.4h[2]
+ smlsl v20.4s, v12.4h, v1.h[0]
+ smlal v20.4s, v13.4h, v6.h[2]
+ smlsl v22.4s, v12.4h, v3.h[0]
+ smlal v22.4s, v13.4h, v3.h[2]
+ smlsl v16.4s, v12.4h, v5.h[0]
+ smlal v16.4s, v13.4h, v0.h[2]
+ smlsl v18.4s, v12.4h, v7.h[0]
+ smlal v18.4s, v13.4h, v2.h[2]
 cmp x11,x9
 bhs shift3
@@ -981,32 +981,32 @@ shift2:
 ld1 {v11.4h},[x0],x6
 ld1 {v9.4h},[x0],x6
- smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v5.4h[1] //// y1 * cos3(part of b1)
- smlsl v28.4s, v8.4h, v0.4h[3] //// y1 * sin3(part of b2)
- smlsl v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v5.h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v0.h[3] //// y1 * sin3(part of b2)
+ smlsl v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v9.4h, v0.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v2.4h[0]
- smlsl v20.4s, v11.4h, v5.4h[2]
+ smlal v20.4s, v10.4h, v2.h[0]
+ smlsl v20.4s, v11.4h, v5.h[2]
- smlal v22.4s, v10.4h, v6.4h[0]
- smlsl v22.4s, v11.4h, v0.4h[2]
+ smlal v22.4s, v10.4h, v6.h[0]
+ smlsl v22.4s, v11.4h, v0.h[2]
- smlsl v16.4s, v10.4h, v6.4h[0]
- smlsl v16.4s, v11.4h, v4.4h[2]
+ smlsl v16.4s, v10.4h, v6.h[0]
+ smlsl v16.4s, v11.4h, v4.h[2]
- smlsl v18.4s, v10.4h, v2.4h[0]
- smlal v18.4s, v11.4h, v6.4h[2]
+ smlsl v18.4s, v10.4h, v2.h[0]
+ smlal v18.4s, v11.4h, v6.h[2]
 cmp x11,x5
 bhs shift3
@@ -1022,26 +1022,26 @@ shift2:
- smlsl v24.4s, v14.4h, v7.4h[1]
- smlal v26.4s, v14.4h, v2.4h[1]
- smlal v28.4s, v14.4h, v4.4h[1]
- smlsl v30.4s, v14.4h, v5.4h[1]
+ smlsl v24.4s, v14.4h, v7.h[1]
+ smlal v26.4s, v14.4h, v2.h[1]
+ smlal v28.4s, v14.4h, v4.h[1]
+ smlsl v30.4s, v14.4h, v5.h[1]
- smlal v24.4s, v15.4h, v0.4h[3]
- smlal v26.4s, v15.4h, v7.4h[1]
- smlsl v28.4s, v15.4h, v1.4h[1]
- smlsl v30.4s, v15.4h, v6.4h[1]
+ smlal v24.4s, v15.4h, v0.h[3]
+ smlal v26.4s, v15.4h, v7.h[1]
+ smlsl v28.4s, v15.4h, v1.h[1]
+ smlsl v30.4s, v15.4h, v6.h[1]
- smlsl v20.4s, v12.4h, v3.4h[0]
- smlal v20.4s, v13.4h, v4.4h[2]
- smlal v22.4s, v12.4h, v7.4h[0]
- smlal v22.4s, v13.4h, v2.4h[2]
- smlal v16.4s, v12.4h, v1.4h[0]
- smlsl v16.4s, v13.4h, v6.4h[2]
- smlal v18.4s, v12.4h, v5.4h[0]
- smlsl v18.4s, v13.4h, v0.4h[2]
+ smlsl v20.4s, v12.4h, v3.h[0]
+ smlal v20.4s, v13.4h, v4.h[2]
+ smlal v22.4s, v12.4h, v7.h[0]
+ smlal v22.4s, v13.4h, v2.h[2]
+ smlal v16.4s, v12.4h, v1.h[0]
+ smlsl v16.4s, v13.4h, v6.h[2]
+ smlal v18.4s, v12.4h, v5.h[0]
+ smlsl v18.4s, v13.4h, v0.h[2]
 cmp x11,x7
@@ -1054,32 +1054,32 @@ shift2:
 ld1 {v9.4h},[x0],x6
- smlsl v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v0.4h[1] //// y1 * cos3(part of b1)
- smlal v28.4s, v8.4h, v6.4h[3] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smlsl v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v0.h[1] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v6.h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v0.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v0.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v3.4h[2]
+ smlal v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v3.h[2]
- smlsl v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v5.4h[2]
+ smlsl v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v5.h[2]
- smlsl v16.4s, v10.4h, v0.4h[0]
- smlal v16.4s, v11.4h, v1.4h[2]
+ smlsl v16.4s, v10.4h, v0.h[0]
+ smlal v16.4s, v11.4h, v1.h[2]
- smlal v18.4s, v10.4h, v0.4h[0]
- smlal v18.4s, v11.4h, v7.4h[2]
+ smlal v18.4s, v10.4h, v0.h[0]
+ smlal v18.4s, v11.4h, v7.h[2]
 ld1 {v12.4h},[x0],x6
@@ -1089,26 +1089,26 @@ shift2:
- smlal v24.4s, v14.4h, v6.4h[3]
- smlal v26.4s, v14.4h, v3.4h[3]
- smlsl v28.4s, v14.4h, v1.4h[3]
- smlal v30.4s, v14.4h, v7.4h[1]
+ smlal v24.4s, v14.4h, v6.h[3]
+ smlal v26.4s, v14.4h, v3.h[3]
+ smlsl v28.4s, v14.4h, v1.h[3]
+ smlal v30.4s, v14.4h, v7.h[1]
- smlal v24.4s, v15.4h, v1.4h[3]
- smlsl v26.4s, v15.4h, v2.4h[3]
- smlal v28.4s, v15.4h, v7.4h[1]
- smlal v30.4s, v15.4h, v4.4h[1]
+ smlal v24.4s, v15.4h, v1.h[3]
+ smlsl v26.4s, v15.4h, v2.h[3]
+ smlal v28.4s, v15.4h, v7.h[1]
+ smlal v30.4s, v15.4h, v4.h[1]
- smlsl v20.4s, v12.4h, v5.4h[0]
- smlal v20.4s, v13.4h, v2.4h[2]
- smlal v22.4s, v12.4h, v1.4h[0]
- smlsl v22.4s, v13.4h, v7.4h[2]
- smlsl v16.4s, v12.4h, v7.4h[0]
- smlsl v16.4s, v13.4h, v3.4h[2]
- smlsl v18.4s, v12.4h, v3.4h[0]
- smlal v18.4s, v13.4h, v1.4h[2]
+ smlsl v20.4s, v12.4h, v5.h[0]
+ smlal v20.4s, v13.4h, v2.h[2]
+ smlal v22.4s, v12.4h, v1.h[0]
+ smlsl v22.4s, v13.4h, v7.h[2]
+ smlsl v16.4s, v12.4h, v7.h[0]
+ smlsl v16.4s, v13.4h, v3.h[2]
+ smlsl v18.4s, v12.4h, v3.h[0]
+ smlal v18.4s, v13.4h, v1.h[2]
@@ -1120,32 +1120,32 @@ shift2:
- smlsl v24.4s, v8.4h, v5.4h[3] //// y1 * cos1(part of b0)
- smlsl v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
- smlal v28.4s, v8.4h, v3.4h[1] //// y1 * sin3(part of b2)
- smlsl v30.4s, v8.4h, v0.4h[1] //// y1 * sin1(part of b3)
+ smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2)
+ smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlal v26.4s, v9.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlal v30.4s, v9.4h, v4.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
- smlal v20.4s, v10.4h, v6.4h[0]
- smlsl v20.4s, v11.4h, v1.4h[2]
+ smlal v20.4s, v10.4h, v6.h[0]
+ smlsl v20.4s, v11.4h, v1.h[2]
- smlsl v22.4s, v10.4h, v2.4h[0]
- smlal v22.4s, v11.4h, v4.4h[2]
+ smlsl v22.4s, v10.4h, v2.h[0]
+ smlal v22.4s, v11.4h, v4.h[2]
- smlal v16.4s, v10.4h, v2.4h[0]
- smlsl v16.4s, v11.4h, v7.4h[2]
+ smlal v16.4s, v10.4h, v2.h[0]
+ smlsl v16.4s, v11.4h, v7.h[2]
- smlsl v18.4s, v10.4h, v6.4h[0]
- smlsl v18.4s, v11.4h, v5.4h[2]
+ smlsl v18.4s, v10.4h, v6.h[0]
+ smlsl v18.4s, v11.4h, v5.h[2]
 ld1 {v12.4h},[x0],x6
@@ -1153,26 +1153,26 @@ shift2:
 ld1 {v13.4h},[x0],x6
 ld1 {v15.4h},[x0],x6
- smlal v24.4s, v14.4h, v4.4h[3]
- smlsl v26.4s, v14.4h, v6.4h[1]
- smlal v28.4s, v14.4h, v7.4h[3]
- smlal v30.4s, v14.4h, v6.4h[3]
+ smlal v24.4s, v14.4h, v4.h[3]
+ smlsl v26.4s, v14.4h, v6.h[1]
+ smlal v28.4s, v14.4h, v7.h[3]
+ smlal v30.4s, v14.4h, v6.h[3]
- smlal v24.4s, v15.4h, v3.4h[3]
- smlsl v26.4s, v15.4h, v3.4h[1]
- smlal v28.4s, v15.4h, v2.4h[3]
- smlsl v30.4s, v15.4h, v2.4h[1]
+ smlal v24.4s, v15.4h, v3.h[3]
+ smlsl v26.4s, v15.4h, v3.h[1]
+ smlal v28.4s, v15.4h, v2.h[3]
+ smlsl v30.4s, v15.4h, v2.h[1]
- smlsl v20.4s, v12.4h, v7.4h[0]
- smlal v20.4s, v13.4h, v0.4h[2]
- smlal v22.4s, v12.4h, v5.4h[0]
- smlsl v22.4s, v13.4h, v1.4h[2]
- smlsl v16.4s, v12.4h, v3.4h[0]
- smlal v16.4s, v13.4h, v2.4h[2]
- smlal v18.4s, v12.4h, v1.4h[0]
- smlsl v18.4s, v13.4h, v3.4h[2]
+ smlsl v20.4s, v12.4h, v7.h[0]
+ smlal v20.4s, v13.4h, v0.h[2]
+ smlal v22.4s, v12.4h, v5.h[0]
+ smlsl v22.4s, v13.4h, v1.h[2]
+ smlsl v16.4s, v12.4h, v3.h[0]
+ smlal v16.4s, v13.4h, v2.h[2]
+ smlal v18.4s, v12.4h, v1.h[0]
+ smlsl v18.4s, v13.4h, v3.h[2]
 shift3:
 add v8.4s, v20.4s , v24.4s
@@ -1244,32 +1244,32 @@ shift3:
 ld1 {v9.4h},[x0],x6
- smull v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
- smull v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
- smull v28.4s, v8.4h, v7.4h[1] //// y1 * sin3(part of b2)
- smull v30.4s, v8.4h, v7.4h[3] //// y1 * sin1(part of b3)
+ smull v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v7.h[3] //// y1 * sin1(part of b3)
- smlsl v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v5.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v7.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v5.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smull v20.4s, v10.4h, v0.4h[0]
- smlsl v20.4s, v11.4h, v3.4h[2]
+ smull v20.4s, v10.4h, v0.h[0]
+ smlsl v20.4s, v11.4h, v3.h[2]
- smull v22.4s, v10.4h, v0.4h[0]
- smlsl v22.4s, v11.4h, v2.4h[2]
+ smull v22.4s, v10.4h, v0.h[0]
+ smlsl v22.4s, v11.4h, v2.h[2]
- smull v16.4s, v10.4h, v0.4h[0]
- smlsl v16.4s, v11.4h, v1.4h[2]
+ smull v16.4s, v10.4h, v0.h[0]
+ smlsl v16.4s, v11.4h, v1.h[2]
- smull v18.4s, v10.4h, v0.4h[0]
- smlsl v18.4s, v11.4h, v0.4h[2]
+ smull v18.4s, v10.4h, v0.h[0]
+ smlsl v18.4s, v11.4h, v0.h[2]
 cmp x11,x10
 bhs shift4
@@ -1284,26 +1284,26 @@ shift3:
- smlal v24.4s, v14.4h, v0.4h[1]
- smlal v26.4s, v14.4h, v1.4h[3]
- smlal v28.4s, v14.4h, v4.4h[1]
- smlal v30.4s, v14.4h, v6.4h[3]
+ smlal v24.4s, v14.4h, v0.h[1]
+ smlal v26.4s, v14.4h, v1.h[3]
+ smlal v28.4s, v14.4h, v4.h[1]
+ smlal v30.4s, v14.4h, v6.h[3]
- smlsl v24.4s, v15.4h, v4.4h[1]
- smlsl v26.4s, v15.4h, v0.4h[3]
- smlsl v28.4s, v15.4h, v2.4h[3]
- smlsl v30.4s, v15.4h, v6.4h[1]
+ smlsl v24.4s, v15.4h, v4.h[1]
+ smlsl v26.4s, v15.4h, v0.h[3]
+ smlsl v28.4s, v15.4h, v2.h[3]
+ smlsl v30.4s, v15.4h, v6.h[1]
- smlal v20.4s, v12.4h, v7.4h[0]
- smlal v20.4s, v13.4h, v5.4h[2]
- smlal v22.4s, v12.4h, v5.4h[0]
- smlsl v22.4s, v13.4h, v7.4h[2]
- smlal v16.4s, v12.4h, v3.4h[0]
- smlsl v16.4s, v13.4h, v4.4h[2]
- smlal v18.4s, v12.4h, v1.4h[0]
- smlsl v18.4s, v13.4h, v1.4h[2]
+ smlal v20.4s, v12.4h, v7.h[0]
+ smlal v20.4s, v13.4h, v5.h[2]
+ smlal v22.4s, v12.4h, v5.h[0]
+ smlsl v22.4s, v13.4h, v7.h[2]
+ smlal v16.4s, v12.4h, v3.h[0]
+ smlsl v16.4s, v13.4h, v4.h[2]
+ smlal v18.4s, v12.4h, v1.h[0]
+ smlsl v18.4s, v13.4h, v1.h[2]
 cmp x11,x9
 bhs shift4
@@ -1315,32 +1315,32 @@ shift3:
- smlal v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
- smlal v26.4s, v8.4h, v3.4h[1] //// y1 * cos3(part of b1)
- smlal v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
- smlal v30.4s, v8.4h, v5.4h[3] //// y1 * sin1(part of b3)
+ smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
+ smlal v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3)
- smlal v24.4s, v9.4h, v4.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
- smlsl v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
- smlsl v28.4s, v9.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
- smlsl v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
- smlsl v20.4s, v10.4h, v2.4h[0]
- smlal v20.4s, v11.4h, v1.4h[2]
+ smlsl v20.4s, v10.4h, v2.h[0]
+ smlal v20.4s, v11.4h, v1.h[2]
- smlsl v22.4s, v10.4h, v6.4h[0]
- smlal v22.4s, v11.4h, v3.4h[2]
+ smlsl v22.4s, v10.4h, v6.h[0]
+ smlal v22.4s, v11.4h, v3.h[2]
- smlal v16.4s, v10.4h, v6.4h[0]
- smlsl v16.4s, v11.4h, v7.4h[2]
+ smlal v16.4s, v10.4h, v6.h[0]
+ smlsl v16.4s, v11.4h, v7.h[2]
- smlal v18.4s, v10.4h, v2.4h[0]
- smlsl v18.4s, v11.4h, v2.4h[2]
+ smlal v18.4s, v10.4h, v2.h[0]
+ smlsl v18.4s, v11.4h, v2.h[2]
 cmp x11,x5
 bhs shift4
@@ -1356,26 +1356,26 @@ shift3:
- smlsl v24.4s, v14.4h, v1.4h[1]
- smlsl v26.4s, v14.4h, v7.4h[3]
- smlal v28.4s, v14.4h, v1.4h[3]
- smlal v30.4s, v14.4h,
v4.4h[3] + smlsl v24.4s, v14.4h, v1.h[1] + smlsl v26.4s, v14.4h, v7.h[3] + smlal v28.4s, v14.4h, v1.h[3] + smlal v30.4s, v14.4h, v4.h[3] - smlal v24.4s, v15.4h, v2.4h[1] - smlal v26.4s, v15.4h, v5.4h[1] - smlsl v28.4s, v15.4h, v3.4h[1] - smlsl v30.4s, v15.4h, v4.4h[1] + smlal v24.4s, v15.4h, v2.h[1] + smlal v26.4s, v15.4h, v5.h[1] + smlsl v28.4s, v15.4h, v3.h[1] + smlsl v30.4s, v15.4h, v4.h[1] - smlsl v20.4s, v12.4h, v5.4h[0] - smlsl v20.4s, v13.4h, v7.4h[2] - smlsl v22.4s, v12.4h, v1.4h[0] - smlal v22.4s, v13.4h, v1.4h[2] - smlsl v16.4s, v12.4h, v7.4h[0] - smlal v16.4s, v13.4h, v5.4h[2] - smlal v18.4s, v12.4h, v3.4h[0] - smlsl v18.4s, v13.4h, v3.4h[2] + smlsl v20.4s, v12.4h, v5.h[0] + smlsl v20.4s, v13.4h, v7.h[2] + smlsl v22.4s, v12.4h, v1.h[0] + smlal v22.4s, v13.4h, v1.h[2] + smlsl v16.4s, v12.4h, v7.h[0] + smlal v16.4s, v13.4h, v5.h[2] + smlal v18.4s, v12.4h, v3.h[0] + smlsl v18.4s, v13.4h, v3.h[2] cmp x11,x7 bhs shift4 @@ -1387,32 +1387,32 @@ shift3: ld1 {v9.4h},[x0],x6 - smlsl v24.4s, v8.4h, v5.4h[3] //// y1 * cos1(part of b0) - smlsl v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1) - smlal v28.4s, v8.4h, v4.4h[3] //// y1 * sin3(part of b2) - smlal v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3) + smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0) + smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1) + smlal v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2) + smlal v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3) - smlsl v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) - smlal v26.4s, v9.4h, v0.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) - smlsl v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) - smlsl v30.4s, v9.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smlsl v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlal v26.4s, v9.4h, v0.h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3) - smlal v20.4s, v10.4h, v0.4h[0] - smlsl v20.4s, v11.4h, v0.4h[2] + smlal v20.4s, v10.4h, v0.h[0] + smlsl v20.4s, v11.4h, v0.h[2] - smlsl v22.4s, v10.4h, v0.4h[0] - smlal v22.4s, v11.4h, v6.4h[2] + smlsl v22.4s, v10.4h, v0.h[0] + smlal v22.4s, v11.4h, v6.h[2] - smlsl v16.4s, v10.4h, v0.4h[0] - smlal v16.4s, v11.4h, v2.4h[2] + smlsl v16.4s, v10.4h, v0.h[0] + smlal v16.4s, v11.4h, v2.h[2] - smlal v18.4s, v10.4h, v0.4h[0] - smlsl v18.4s, v11.4h, v4.4h[2] + smlal v18.4s, v10.4h, v0.h[0] + smlsl v18.4s, v11.4h, v4.h[2] @@ -1427,26 +1427,26 @@ shift3: - smlal v24.4s, v14.4h, v3.4h[1] - smlsl v26.4s, v14.4h, v2.4h[1] - smlal v28.4s, v14.4h, v7.4h[3] - smlal v30.4s, v14.4h, v2.4h[3] + smlal v24.4s, v14.4h, v3.h[1] + smlsl v26.4s, v14.4h, v2.h[1] + smlal v28.4s, v14.4h, v7.h[3] + smlal v30.4s, v14.4h, v2.h[3] - smlsl v24.4s, v15.4h, v0.4h[3] - smlal v26.4s, v15.4h, v4.4h[3] - smlal v28.4s, v15.4h, v6.4h[3] - smlsl v30.4s, v15.4h, v2.4h[1] + smlsl v24.4s, v15.4h, v0.h[3] + smlal v26.4s, v15.4h, v4.h[3] + smlal v28.4s, v15.4h, v6.h[3] + smlsl v30.4s, v15.4h, v2.h[1] - smlal v20.4s, v12.4h, v3.4h[0] - smlsl v20.4s, v13.4h, v6.4h[2] - smlal v22.4s, v12.4h, v7.4h[0] - smlsl v22.4s, v13.4h, v4.4h[2] - smlsl v16.4s, v12.4h, v1.4h[0] - smlal v16.4s, v13.4h, v0.4h[2] - smlal v18.4s, v12.4h, v5.4h[0] - smlsl v18.4s, v13.4h, v5.4h[2] + smlal v20.4s, v12.4h, v3.h[0] + smlsl v20.4s, v13.4h, v6.h[2] + smlal v22.4s, v12.4h, v7.h[0] + smlsl v22.4s, v13.4h, v4.h[2] + smlsl v16.4s, v12.4h, v1.h[0] + smlal v16.4s, v13.4h, v0.h[2] + 
smlal v18.4s, v12.4h, v5.h[0] + smlsl v18.4s, v13.4h, v5.h[2] ld1 {v10.4h},[x0],x6 @@ -1458,32 +1458,32 @@ shift3: - smlal v24.4s, v8.4h, v3.4h[3] //// y1 * cos1(part of b0) - smlsl v26.4s, v8.4h, v7.4h[1] //// y1 * cos3(part of b1) - smlsl v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2) - smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3) + smlal v24.4s, v8.4h, v3.h[3] //// y1 * cos1(part of b0) + smlsl v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1) + smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2) + smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) - smlsl v24.4s, v9.4h, v7.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) - smlsl v26.4s, v9.4h, v6.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) - smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) - smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smlsl v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v9.4h, v6.h[1] //// y1 * cos3 - y3 * sin1(part of b1) + smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) - smlsl v20.4s, v10.4h, v6.4h[0] - smlal v20.4s, v11.4h, v2.4h[2] + smlsl v20.4s, v10.4h, v6.h[0] + smlal v20.4s, v11.4h, v2.h[2] - smlal v22.4s, v10.4h, v2.4h[0] - smlsl v22.4s, v11.4h, v0.4h[2] + smlal v22.4s, v10.4h, v2.h[0] + smlsl v22.4s, v11.4h, v0.h[2] - smlsl v16.4s, v10.4h, v2.4h[0] - smlal v16.4s, v11.4h, v3.4h[2] + smlsl v16.4s, v10.4h, v2.h[0] + smlal v16.4s, v11.4h, v3.h[2] - smlal v18.4s, v10.4h, v6.4h[0] - smlsl v18.4s, v11.4h, v6.4h[2] + smlal v18.4s, v10.4h, v6.h[0] + smlsl v18.4s, v11.4h, v6.h[2] ld1 {v12.4h},[x0],x6 @@ -1494,26 +1494,26 @@ shift3: - smlsl v24.4s, v14.4h, v5.4h[1] - smlal v26.4s, v14.4h, v3.4h[3] - smlsl v28.4s, v14.4h, v2.4h[1] - smlal v30.4s, v14.4h, v0.4h[3] + smlsl v24.4s, v14.4h, v5.h[1] + smlal v26.4s, v14.4h, v3.h[3] + smlsl v28.4s, v14.4h, v2.h[1] + smlal v30.4s, v14.4h, v0.h[3] - smlal v24.4s, v15.4h, v1.4h[3] - smlsl v26.4s, v15.4h, v1.4h[1] - smlal v28.4s, v15.4h, v0.4h[3] - smlsl v30.4s, v15.4h, v0.4h[1] + smlal v24.4s, v15.4h, v1.h[3] + smlsl v26.4s, v15.4h, v1.h[1] + smlal v28.4s, v15.4h, v0.h[3] + smlsl v30.4s, v15.4h, v0.h[1] - smlsl v20.4s, v12.4h, v1.4h[0] - smlal v20.4s, v13.4h, v4.4h[2] - smlal v22.4s, v12.4h, v3.4h[0] - smlsl v22.4s, v13.4h, v5.4h[2] - smlsl v16.4s, v12.4h, v5.4h[0] - smlal v16.4s, v13.4h, v6.4h[2] - smlal v18.4s, v12.4h, v7.4h[0] - smlsl v18.4s, v13.4h, v7.4h[2] + smlsl v20.4s, v12.4h, v1.h[0] + smlal v20.4s, v13.4h, v4.h[2] + smlal v22.4s, v12.4h, v3.h[0] + smlsl v22.4s, v13.4h, v5.h[2] + smlsl v16.4s, v12.4h, v5.h[0] + smlal v16.4s, v13.4h, v6.h[2] + smlal v18.4s, v12.4h, v7.h[0] + smlsl v18.4s, v13.4h, v7.h[2] shift4: add v8.4s, v20.4s , v24.4s @@ -1618,30 +1618,30 @@ stage2: ld1 {v10.4h, v11.4h},[x1],#16 ld1 {v8.4h, v9.4h},[x1],x10 - smull v24.4s, v8.4h, v0.4h[1] //// y1 * cos1(part of b0) - smull v26.4s, v8.4h, v0.4h[3] //// y1 * cos3(part of b1) - smull v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2) - smull v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3) + smull v24.4s, v8.4h, v0.h[1] //// y1 * cos1(part of b0) + smull v26.4s, v8.4h, v0.h[3] //// y1 * cos3(part of b1) + smull v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2) + smull v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) - smlal v24.4s, v9.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) - smlal v26.4s, v9.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) - smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * 
cos1(part of b2) - smlal v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smlal v24.4s, v9.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlal v26.4s, v9.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1) + smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) + smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3) - smull v20.4s, v10.4h, v0.4h[0] - smlal v20.4s, v11.4h, v0.4h[2] + smull v20.4s, v10.4h, v0.h[0] + smlal v20.4s, v11.4h, v0.h[2] - smull v22.4s, v10.4h, v0.4h[0] - smlal v22.4s, v11.4h, v1.4h[2] + smull v22.4s, v10.4h, v0.h[0] + smlal v22.4s, v11.4h, v1.h[2] - smull v16.4s, v10.4h, v0.4h[0] - smlal v16.4s, v11.4h, v2.4h[2] + smull v16.4s, v10.4h, v0.h[0] + smlal v16.4s, v11.4h, v2.h[2] - smull v18.4s, v10.4h, v0.4h[0] - smlal v18.4s, v11.4h, v3.4h[2] + smull v18.4s, v10.4h, v0.h[0] + smlal v18.4s, v11.4h, v3.h[2] cmp x12,x11 bhs stage2_shift1 @@ -1653,58 +1653,58 @@ stage2: - smlal v24.4s, v14.4h, v1.4h[1] - smlal v26.4s, v14.4h, v3.4h[3] - smlal v28.4s, v14.4h, v6.4h[1] - smlsl v30.4s, v14.4h, v7.4h[1] + smlal v24.4s, v14.4h, v1.h[1] + smlal v26.4s, v14.4h, v3.h[3] + smlal v28.4s, v14.4h, v6.h[1] + smlsl v30.4s, v14.4h, v7.h[1] - smlal v24.4s, v15.4h, v1.4h[3] - smlal v26.4s, v15.4h, v5.4h[1] - smlsl v28.4s, v15.4h, v7.4h[1] - smlsl v30.4s, v15.4h, v3.4h[3] + smlal v24.4s, v15.4h, v1.h[3] + smlal v26.4s, v15.4h, v5.h[1] + smlsl v28.4s, v15.4h, v7.h[1] + smlsl v30.4s, v15.4h, v3.h[3] - smlal v20.4s, v12.4h, v1.4h[0] - smlal v20.4s, v13.4h, v1.4h[2] - smlal v22.4s, v12.4h, v3.4h[0] - smlal v22.4s, v13.4h, v4.4h[2] - smlal v16.4s, v12.4h, v5.4h[0] - smlal v16.4s, v13.4h, v7.4h[2] - smlal v18.4s, v12.4h, v7.4h[0] - smlsl v18.4s, v13.4h, v5.4h[2] + smlal v20.4s, v12.4h, v1.h[0] + smlal v20.4s, v13.4h, v1.h[2] + smlal v22.4s, v12.4h, v3.h[0] + smlal v22.4s, v13.4h, v4.h[2] + smlal v16.4s, v12.4h, v5.h[0] + smlal v16.4s, v13.4h, v7.h[2] + smlal v18.4s, v12.4h, v7.h[0] + smlsl v18.4s, v13.4h, v5.h[2] cmp x12,x5 bhs stage2_shift1 ld1 {v10.4h, v11.4h},[x1],#16 ld1 {v8.4h, v9.4h},[x1],x10 - smlal v24.4s, v8.4h, v2.4h[1] //// y1 * cos1(part of b0) - smlal v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1) - smlsl v28.4s, v8.4h, v4.4h[3] //// y1 * sin3(part of b2) - smlsl v30.4s, v8.4h, v0.4h[1] //// y1 * sin1(part of b3) + smlal v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0) + smlal v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1) + smlsl v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2) + smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3) - smlal v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) - smlsl v26.4s, v9.4h, v7.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) - smlsl v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) - smlsl v30.4s, v9.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smlal v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3) - smlal v20.4s, v10.4h, v2.4h[0] - smlal v20.4s, v11.4h, v2.4h[2] + smlal v20.4s, v10.4h, v2.h[0] + smlal v20.4s, v11.4h, v2.h[2] - smlal v22.4s, v10.4h, v6.4h[0] - smlal v22.4s, v11.4h, v7.4h[2] + smlal v22.4s, v10.4h, v6.h[0] + smlal v22.4s, v11.4h, v7.h[2] - smlsl v16.4s, v10.4h, v6.4h[0] - smlsl v16.4s, v11.4h, v3.4h[2] + smlsl v16.4s, v10.4h, v6.h[0] + smlsl v16.4s, v11.4h, v3.h[2] - smlsl v18.4s, v10.4h, v2.4h[0] - smlsl v18.4s, 
v11.4h, v1.4h[2] + smlsl v18.4s, v10.4h, v2.h[0] + smlsl v18.4s, v11.4h, v1.h[2] cmp x12,x6 bhs stage2_shift1 @@ -1717,26 +1717,26 @@ stage2: - smlal v24.4s, v14.4h, v3.4h[1] - smlsl v26.4s, v14.4h, v6.4h[1] - smlsl v28.4s, v14.4h, v0.4h[1] - smlsl v30.4s, v14.4h, v6.4h[3] + smlal v24.4s, v14.4h, v3.h[1] + smlsl v26.4s, v14.4h, v6.h[1] + smlsl v28.4s, v14.4h, v0.h[1] + smlsl v30.4s, v14.4h, v6.h[3] - smlal v24.4s, v15.4h, v3.4h[3] - smlsl v26.4s, v15.4h, v4.4h[3] - smlsl v28.4s, v15.4h, v2.4h[3] - smlal v30.4s, v15.4h, v5.4h[3] + smlal v24.4s, v15.4h, v3.h[3] + smlsl v26.4s, v15.4h, v4.h[3] + smlsl v28.4s, v15.4h, v2.h[3] + smlal v30.4s, v15.4h, v5.h[3] - smlal v20.4s, v12.4h, v3.4h[0] - smlal v20.4s, v13.4h, v3.4h[2] - smlsl v22.4s, v12.4h, v7.4h[0] - smlsl v22.4s, v13.4h, v5.4h[2] - smlsl v16.4s, v12.4h, v1.4h[0] - smlsl v16.4s, v13.4h, v1.4h[2] - smlsl v18.4s, v12.4h, v5.4h[0] - smlal v18.4s, v13.4h, v7.4h[2] + smlal v20.4s, v12.4h, v3.h[0] + smlal v20.4s, v13.4h, v3.h[2] + smlsl v22.4s, v12.4h, v7.h[0] + smlsl v22.4s, v13.4h, v5.h[2] + smlsl v16.4s, v12.4h, v1.h[0] + smlsl v16.4s, v13.4h, v1.h[2] + smlsl v18.4s, v12.4h, v5.h[0] + smlal v18.4s, v13.4h, v7.h[2] cmp x12,x9 bhs stage2_shift1 @@ -1746,32 +1746,32 @@ stage2: ld1 {v8.4h, v9.4h},[x1],x10 - smlal v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0) - smlsl v26.4s, v8.4h, v3.4h[1] //// y1 * cos3(part of b1) - smlsl v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2) - smlal v30.4s, v8.4h, v2.4h[1] //// y1 * sin1(part of b3) + smlal v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0) + smlsl v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1) + smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2) + smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3) - smlal v24.4s, v9.4h, v4.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) - smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) - smlsl v28.4s, v9.4h, v7.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) - smlal v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2) + smlal v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) - smlal v20.4s, v10.4h, v0.4h[0] - smlal v20.4s, v11.4h, v4.4h[2] + smlal v20.4s, v10.4h, v0.h[0] + smlal v20.4s, v11.4h, v4.h[2] - smlsl v22.4s, v10.4h, v0.4h[0] - smlsl v22.4s, v11.4h, v2.4h[2] + smlsl v22.4s, v10.4h, v0.h[0] + smlsl v22.4s, v11.4h, v2.h[2] - smlsl v16.4s, v10.4h, v0.4h[0] - smlsl v16.4s, v11.4h, v6.4h[2] + smlsl v16.4s, v10.4h, v0.h[0] + smlsl v16.4s, v11.4h, v6.h[2] - smlal v18.4s, v10.4h, v0.4h[0] - smlal v18.4s, v11.4h, v0.4h[2] + smlal v18.4s, v10.4h, v0.h[0] + smlal v18.4s, v11.4h, v0.h[2] ld1 {v12.4h, v13.4h},[x1],#16 ld1 {v14.4h, v15.4h},[x1],x10 @@ -1780,26 +1780,26 @@ stage2: - smlal v24.4s, v14.4h, v5.4h[1] - smlsl v26.4s, v14.4h, v0.4h[2] - smlal v28.4s, v14.4h, v5.4h[3] - smlal v30.4s, v14.4h, v4.4h[3] + smlal v24.4s, v14.4h, v5.h[1] + smlsl v26.4s, v14.4h, v0.h[2] + smlal v28.4s, v14.4h, v5.h[3] + smlal v30.4s, v14.4h, v4.h[3] - smlal v24.4s, v15.4h, v5.4h[3] - smlsl v26.4s, v15.4h, v1.4h[1] - smlal v28.4s, v15.4h, v3.4h[1] - smlsl v30.4s, v15.4h, v7.4h[3] + smlal v24.4s, v15.4h, v5.h[3] + smlsl v26.4s, v15.4h, v1.h[1] + smlal v28.4s, v15.4h, v3.h[1] + smlsl v30.4s, v15.4h, v7.h[3] - smlal v20.4s, v12.4h, v5.4h[0] - smlal v20.4s, v13.4h, v5.4h[2] - smlsl v22.4s, v12.4h, v1.4h[0] - smlsl v22.4s, 
v13.4h, v0.4h[2] - smlal v16.4s, v12.4h, v7.4h[0] - smlal v16.4s, v13.4h, v4.4h[2] - smlal v18.4s, v12.4h, v3.4h[0] - smlal v18.4s, v13.4h, v6.4h[2] + smlal v20.4s, v12.4h, v5.h[0] + smlal v20.4s, v13.4h, v5.h[2] + smlsl v22.4s, v12.4h, v1.h[0] + smlsl v22.4s, v13.4h, v0.h[2] + smlal v16.4s, v12.4h, v7.h[0] + smlal v16.4s, v13.4h, v4.h[2] + smlal v18.4s, v12.4h, v3.h[0] + smlal v18.4s, v13.4h, v6.h[2] ld1 {v10.4h, v11.4h},[x1],#16 @@ -1808,56 +1808,56 @@ stage2: - smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0) - smlsl v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1) - smlal v28.4s, v8.4h, v0.4h[1] //// y1 * sin3(part of b2) - smlsl v30.4s, v8.4h, v4.4h[1] //// y1 * sin1(part of b3) + smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0) + smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1) + smlal v28.4s, v8.4h, v0.h[1] //// y1 * sin3(part of b2) + smlsl v30.4s, v8.4h, v4.h[1] //// y1 * sin1(part of b3) - smlal v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) - smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) - smlal v28.4s, v9.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) - smlsl v30.4s, v9.4h, v0.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1) + smlal v28.4s, v9.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3) - smlal v20.4s, v10.4h, v6.4h[0] - smlal v20.4s, v11.4h, v6.4h[2] + smlal v20.4s, v10.4h, v6.h[0] + smlal v20.4s, v11.4h, v6.h[2] - smlsl v22.4s, v10.4h, v2.4h[0] - smlsl v22.4s, v11.4h, v3.4h[2] + smlsl v22.4s, v10.4h, v2.h[0] + smlsl v22.4s, v11.4h, v3.h[2] - smlal v16.4s, v10.4h, v2.4h[0] - smlal v16.4s, v11.4h, v0.4h[2] + smlal v16.4s, v10.4h, v2.h[0] + smlal v16.4s, v11.4h, v0.h[2] - smlsl v18.4s, v10.4h, v6.4h[0] - smlsl v18.4s, v11.4h, v2.4h[2] + smlsl v18.4s, v10.4h, v6.h[0] + smlsl v18.4s, v11.4h, v2.h[2] ld1 {v12.4h, v13.4h},[x1],#16 ld1 {v14.4h, v15.4h},[x1],x10 - smlal v24.4s, v14.4h, v7.4h[1] - smlsl v26.4s, v14.4h, v5.4h[3] - smlal v28.4s, v14.4h, v4.4h[1] - smlsl v30.4s, v14.4h, v2.4h[3] + smlal v24.4s, v14.4h, v7.h[1] + smlsl v26.4s, v14.4h, v5.h[3] + smlal v28.4s, v14.4h, v4.h[1] + smlsl v30.4s, v14.4h, v2.h[3] - smlal v24.4s, v15.4h, v7.4h[3] - smlsl v26.4s, v15.4h, v7.4h[1] - smlal v28.4s, v15.4h, v6.4h[3] - smlsl v30.4s, v15.4h, v6.4h[1] + smlal v24.4s, v15.4h, v7.h[3] + smlsl v26.4s, v15.4h, v7.h[1] + smlal v28.4s, v15.4h, v6.h[3] + smlsl v30.4s, v15.4h, v6.h[1] - smlal v20.4s, v12.4h, v7.4h[0] - smlal v20.4s, v13.4h, v7.4h[2] - smlsl v22.4s, v12.4h, v5.4h[0] - smlsl v22.4s, v13.4h, v6.4h[2] - smlal v16.4s, v12.4h, v3.4h[0] - smlal v16.4s, v13.4h, v5.4h[2] - smlsl v18.4s, v12.4h, v1.4h[0] - smlsl v18.4s, v13.4h, v4.4h[2] + smlal v20.4s, v12.4h, v7.h[0] + smlal v20.4s, v13.4h, v7.h[2] + smlsl v22.4s, v12.4h, v5.h[0] + smlsl v22.4s, v13.4h, v6.h[2] + smlal v16.4s, v12.4h, v3.h[0] + smlal v16.4s, v13.4h, v5.h[2] + smlsl v18.4s, v12.4h, v1.h[0] + smlsl v18.4s, v13.4h, v4.h[2] stage2_shift1: add v8.4s, v20.4s , v24.4s @@ -1930,32 +1930,32 @@ stage2_shift1: ld1 {v8.4h, v9.4h},[x1],x10 - smull v24.4s, v8.4h, v2.4h[1] //// y1 * cos1(part of b0) - smull v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1) - smull v28.4s, v8.4h, v3.4h[1] //// y1 * sin3(part of b2) - smull v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3) + smull v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0) + smull v26.4s, v8.4h, v2.h[3] 
//// y1 * cos3(part of b1) + smull v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2) + smull v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3) - smlal v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) - smlsl v26.4s, v9.4h, v7.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) - smlsl v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) - smlsl v30.4s, v9.4h, v4.4h[3] //// y1 * sin1 - y3 * sin3(part of b3) + smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3) - smull v20.4s, v10.4h, v0.4h[0] - smlal v20.4s, v11.4h, v4.4h[2] + smull v20.4s, v10.4h, v0.h[0] + smlal v20.4s, v11.4h, v4.h[2] - smull v22.4s, v10.4h, v0.4h[0] - smlal v22.4s, v11.4h, v5.4h[2] + smull v22.4s, v10.4h, v0.h[0] + smlal v22.4s, v11.4h, v5.h[2] - smull v16.4s, v10.4h, v0.4h[0] - smlal v16.4s, v11.4h, v6.4h[2] + smull v16.4s, v10.4h, v0.h[0] + smlal v16.4s, v11.4h, v6.h[2] - smull v18.4s, v10.4h, v0.4h[0] - smlal v18.4s, v11.4h, v7.4h[2] + smull v18.4s, v10.4h, v0.h[0] + smlal v18.4s, v11.4h, v7.h[2] cmp x12,x11 bhs stage2_shift2 @@ -1964,26 +1964,26 @@ stage2_shift1: ld1 {v14.4h, v15.4h},[x1],x10 - smlsl v24.4s, v14.4h, v4.4h[3] - smlsl v26.4s, v14.4h, v2.4h[1] - smlsl v28.4s, v14.4h, v0.4h[1] - smlsl v30.4s, v14.4h, v2.4h[3] + smlsl v24.4s, v14.4h, v4.h[3] + smlsl v26.4s, v14.4h, v2.h[1] + smlsl v28.4s, v14.4h, v0.h[1] + smlsl v30.4s, v14.4h, v2.h[3] - smlsl v24.4s, v15.4h, v0.4h[3] - smlsl v26.4s, v15.4h, v3.4h[1] - smlsl v28.4s, v15.4h, v6.4h[3] - smlal v30.4s, v15.4h, v5.4h[3] + smlsl v24.4s, v15.4h, v0.h[3] + smlsl v26.4s, v15.4h, v3.h[1] + smlsl v28.4s, v15.4h, v6.h[3] + smlal v30.4s, v15.4h, v5.h[3] - smlsl v20.4s, v12.4h, v7.4h[0] - smlsl v20.4s, v13.4h, v2.4h[2] - smlsl v22.4s, v12.4h, v5.4h[0] - smlsl v22.4s, v13.4h, v0.4h[2] - smlsl v16.4s, v12.4h, v3.4h[0] - smlsl v16.4s, v13.4h, v3.4h[2] - smlsl v18.4s, v12.4h, v1.4h[0] - smlsl v18.4s, v13.4h, v6.4h[2] + smlsl v20.4s, v12.4h, v7.h[0] + smlsl v20.4s, v13.4h, v2.h[2] + smlsl v22.4s, v12.4h, v5.h[0] + smlsl v22.4s, v13.4h, v0.h[2] + smlsl v16.4s, v12.4h, v3.h[0] + smlsl v16.4s, v13.4h, v3.h[2] + smlsl v18.4s, v12.4h, v1.h[0] + smlsl v18.4s, v13.4h, v6.h[2] cmp x12,x5 bhs stage2_shift2 @@ -1995,32 +1995,32 @@ stage2_shift1: - smlsl v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0) - smlal v26.4s, v8.4h, v7.4h[1] //// y1 * cos3(part of b1) - smlal v28.4s, v8.4h, v2.4h[3] //// y1 * sin3(part of b2) - smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3) + smlsl v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0) + smlal v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1) + smlal v28.4s, v8.4h, v2.h[3] //// y1 * sin3(part of b2) + smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) - smlal v24.4s, v9.4h, v7.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) - smlal v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) - smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) - smlsl v30.4s, v9.4h, v6.4h[3] //// y1 * sin1 - y3 * sin3(part of b3) + smlal v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0) + smlal v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v9.4h, v6.h[3] //// y1 * sin1 - y3 * sin3(part of b3) - smlsl v20.4s, v10.4h, v2.4h[0] - smlsl v20.4s, v11.4h, v6.4h[2] + smlsl v20.4s, 
v10.4h, v2.h[0] + smlsl v20.4s, v11.4h, v6.h[2] - smlsl v22.4s, v10.4h, v6.4h[0] - smlal v22.4s, v11.4h, v4.4h[2] + smlsl v22.4s, v10.4h, v6.h[0] + smlal v22.4s, v11.4h, v4.h[2] - smlal v16.4s, v10.4h, v6.4h[0] - smlal v16.4s, v11.4h, v0.4h[2] + smlal v16.4s, v10.4h, v6.h[0] + smlal v16.4s, v11.4h, v0.h[2] - smlal v18.4s, v10.4h, v2.4h[0] - smlal v18.4s, v11.4h, v5.4h[2] + smlal v18.4s, v10.4h, v2.h[0] + smlal v18.4s, v11.4h, v5.h[2] cmp x12,x6 bhs stage2_shift2 @@ -2034,26 +2034,26 @@ stage2_shift1: - smlal v24.4s, v14.4h, v2.4h[3] - smlal v26.4s, v14.4h, v3.4h[3] - smlsl v28.4s, v14.4h, v5.4h[3] - smlsl v30.4s, v14.4h, v0.4h[3] + smlal v24.4s, v14.4h, v2.h[3] + smlal v26.4s, v14.4h, v3.h[3] + smlsl v28.4s, v14.4h, v5.h[3] + smlsl v30.4s, v14.4h, v0.h[3] - smlal v24.4s, v15.4h, v1.4h[3] - smlsl v26.4s, v15.4h, v6.4h[3] - smlsl v28.4s, v15.4h, v0.4h[3] - smlal v30.4s, v15.4h, v7.4h[3] + smlal v24.4s, v15.4h, v1.h[3] + smlsl v26.4s, v15.4h, v6.h[3] + smlsl v28.4s, v15.4h, v0.h[3] + smlal v30.4s, v15.4h, v7.h[3] - smlal v20.4s, v12.4h, v5.4h[0] - smlal v20.4s, v13.4h, v0.4h[2] - smlal v22.4s, v12.4h, v1.4h[0] - smlal v22.4s, v13.4h, v6.4h[2] - smlal v16.4s, v12.4h, v7.4h[0] - smlsl v16.4s, v13.4h, v2.4h[2] - smlsl v18.4s, v12.4h, v3.4h[0] - smlsl v18.4s, v13.4h, v4.4h[2] + smlal v20.4s, v12.4h, v5.h[0] + smlal v20.4s, v13.4h, v0.h[2] + smlal v22.4s, v12.4h, v1.h[0] + smlal v22.4s, v13.4h, v6.h[2] + smlal v16.4s, v12.4h, v7.h[0] + smlsl v16.4s, v13.4h, v2.h[2] + smlsl v18.4s, v12.4h, v3.h[0] + smlsl v18.4s, v13.4h, v4.h[2] cmp x12,x9 bhs stage2_shift2 @@ -2064,32 +2064,32 @@ stage2_shift1: - smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0) - smlsl v26.4s, v8.4h, v1.4h[1] //// y1 * cos3(part of b1) - smlsl v28.4s, v8.4h, v7.4h[1] //// y1 * sin3(part of b2) - smlal v30.4s, v8.4h, v0.4h[3] //// y1 * sin1(part of b3) + smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0) + smlsl v26.4s, v8.4h, v1.h[1] //// y1 * cos3(part of b1) + smlsl v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2) + smlal v30.4s, v8.4h, v0.h[3] //// y1 * sin1(part of b3) - smlsl v24.4s, v9.4h, v5.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) - smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) - smlal v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) - smlal v30.4s, v9.4h, v7.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smlsl v24.4s, v9.4h, v5.h[1] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1) + smlal v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlal v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3) - smlal v20.4s, v10.4h, v0.4h[0] - smlsl v20.4s, v11.4h, v7.4h[2] + smlal v20.4s, v10.4h, v0.h[0] + smlsl v20.4s, v11.4h, v7.h[2] - smlsl v22.4s, v10.4h, v0.4h[0] - smlsl v22.4s, v11.4h, v1.4h[2] + smlsl v22.4s, v10.4h, v0.h[0] + smlsl v22.4s, v11.4h, v1.h[2] - smlsl v16.4s, v10.4h, v0.4h[0] - smlal v16.4s, v11.4h, v5.4h[2] + smlsl v16.4s, v10.4h, v0.h[0] + smlal v16.4s, v11.4h, v5.h[2] - smlal v18.4s, v10.4h, v0.4h[0] - smlal v18.4s, v11.4h, v3.4h[2] + smlal v18.4s, v10.4h, v0.h[0] + smlal v18.4s, v11.4h, v3.h[2] ld1 {v12.4h, v13.4h},[x1],#16 ld1 {v14.4h, v15.4h},[x1],x10 @@ -2097,58 +2097,58 @@ stage2_shift1: - smlsl v24.4s, v14.4h, v0.4h[1] - smlal v26.4s, v14.4h, v6.4h[1] - smlal v28.4s, v14.4h, v4.4h[1] - smlsl v30.4s, v14.4h, v1.4h[1] + smlsl v24.4s, v14.4h, v0.h[1] + smlal v26.4s, v14.4h, v6.h[1] + smlal v28.4s, v14.4h, v4.h[1] + smlsl v30.4s, v14.4h, v1.h[1] - smlsl v24.4s, v15.4h, 
v3.4h[3] - smlal v26.4s, v15.4h, v0.4h[1] - smlsl v28.4s, v15.4h, v5.4h[1] - smlsl v30.4s, v15.4h, v6.4h[1] + smlsl v24.4s, v15.4h, v3.h[3] + smlal v26.4s, v15.4h, v0.h[1] + smlsl v28.4s, v15.4h, v5.h[1] + smlsl v30.4s, v15.4h, v6.h[1] - smlsl v20.4s, v12.4h, v3.4h[0] - smlsl v20.4s, v13.4h, v1.4h[2] - smlsl v22.4s, v12.4h, v7.4h[0] - smlal v22.4s, v13.4h, v3.4h[2] - smlal v16.4s, v12.4h, v1.4h[0] - smlal v16.4s, v13.4h, v7.4h[2] - smlsl v18.4s, v12.4h, v5.4h[0] - smlsl v18.4s, v13.4h, v2.4h[2] + smlsl v20.4s, v12.4h, v3.h[0] + smlsl v20.4s, v13.4h, v1.h[2] + smlsl v22.4s, v12.4h, v7.h[0] + smlal v22.4s, v13.4h, v3.h[2] + smlal v16.4s, v12.4h, v1.h[0] + smlal v16.4s, v13.4h, v7.h[2] + smlsl v18.4s, v12.4h, v5.h[0] + smlsl v18.4s, v13.4h, v2.h[2] ld1 {v10.4h, v11.4h},[x1],#16 ld1 {v8.4h, v9.4h},[x1],x10 - smlal v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0) - smlal v26.4s, v8.4h, v4.4h[3] //// y1 * cos3(part of b1) - smlsl v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2) - smlal v30.4s, v8.4h, v2.4h[1] //// y1 * sin1(part of b3) + smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0) + smlal v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1) + smlsl v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2) + smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3) - smlal v24.4s, v9.4h, v3.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) - smlsl v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) - smlsl v28.4s, v9.4h, v7.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) - smlal v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smlal v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2) + smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3) - smlsl v20.4s, v10.4h, v6.4h[0] - smlal v20.4s, v11.4h, v5.4h[2] + smlsl v20.4s, v10.4h, v6.h[0] + smlal v20.4s, v11.4h, v5.h[2] - smlal v22.4s, v10.4h, v2.4h[0] - smlal v22.4s, v11.4h, v7.4h[2] + smlal v22.4s, v10.4h, v2.h[0] + smlal v22.4s, v11.4h, v7.h[2] - smlsl v16.4s, v10.4h, v2.4h[0] - smlsl v16.4s, v11.4h, v4.4h[2] + smlsl v16.4s, v10.4h, v2.h[0] + smlsl v16.4s, v11.4h, v4.h[2] - smlal v18.4s, v10.4h, v6.4h[0] - smlal v18.4s, v11.4h, v1.4h[2] + smlal v18.4s, v10.4h, v6.h[0] + smlal v18.4s, v11.4h, v1.h[2] ld1 {v12.4h, v13.4h},[x1],#16 @@ -2156,26 +2156,26 @@ stage2_shift1: - smlal v24.4s, v14.4h, v1.4h[1] - smlsl v26.4s, v14.4h, v0.4h[3] - smlal v28.4s, v14.4h, v1.4h[3] - smlsl v30.4s, v14.4h, v3.4h[1] + smlal v24.4s, v14.4h, v1.h[1] + smlsl v26.4s, v14.4h, v0.h[3] + smlal v28.4s, v14.4h, v1.h[3] + smlsl v30.4s, v14.4h, v3.h[1] - smlal v24.4s, v15.4h, v5.4h[3] - smlsl v26.4s, v15.4h, v5.4h[1] - smlal v28.4s, v15.4h, v4.4h[3] - smlsl v30.4s, v15.4h, v4.4h[1] + smlal v24.4s, v15.4h, v5.h[3] + smlsl v26.4s, v15.4h, v5.h[1] + smlal v28.4s, v15.4h, v4.h[3] + smlsl v30.4s, v15.4h, v4.h[1] - smlal v20.4s, v12.4h, v1.4h[0] - smlal v20.4s, v13.4h, v3.4h[2] - smlsl v22.4s, v12.4h, v3.4h[0] - smlsl v22.4s, v13.4h, v2.4h[2] - smlal v16.4s, v12.4h, v5.4h[0] - smlal v16.4s, v13.4h, v1.4h[2] - smlsl v18.4s, v12.4h, v7.4h[0] - smlsl v18.4s, v13.4h, v0.4h[2] + smlal v20.4s, v12.4h, v1.h[0] + smlal v20.4s, v13.4h, v3.h[2] + smlsl v22.4s, v12.4h, v3.h[0] + smlsl v22.4s, v13.4h, v2.h[2] + smlal v16.4s, v12.4h, v5.h[0] + smlal v16.4s, v13.4h, v1.h[2] + smlsl v18.4s, v12.4h, v7.h[0] + smlsl v18.4s, v13.4h, v0.h[2] stage2_shift2: add v8.4s, v20.4s , v24.4s @@ -2245,32 +2245,32 @@ 
stage2_shift2: ld1 {v10.4h, v11.4h},[x1],#16 ld1 {v8.4h, v9.4h},[x1],x10 - smull v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0) - smull v26.4s, v8.4h, v4.4h[3] //// y1 * cos3(part of b1) - smull v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2) - smull v30.4s, v8.4h, v5.4h[3] //// y1 * sin1(part of b3) + smull v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0) + smull v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1) + smull v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2) + smull v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3) - smlsl v24.4s, v9.4h, v3.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) - smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) - smlsl v28.4s, v9.4h, v0.4h[2] //// y1 * sin3 - y3 * cos1(part of b2) - smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smlsl v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v9.4h, v0.h[2] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) - smull v20.4s, v10.4h, v0.4h[0] - smlsl v20.4s, v11.4h, v7.4h[2] + smull v20.4s, v10.4h, v0.h[0] + smlsl v20.4s, v11.4h, v7.h[2] - smull v22.4s, v10.4h, v0.4h[0] - smlsl v22.4s, v11.4h, v6.4h[2] + smull v22.4s, v10.4h, v0.h[0] + smlsl v22.4s, v11.4h, v6.h[2] - smull v16.4s, v10.4h, v0.4h[0] - smlsl v16.4s, v11.4h, v5.4h[2] + smull v16.4s, v10.4h, v0.h[0] + smlsl v16.4s, v11.4h, v5.h[2] - smull v18.4s, v10.4h, v0.4h[0] - smlsl v18.4s, v11.4h, v4.4h[2] + smull v18.4s, v10.4h, v0.h[0] + smlsl v18.4s, v11.4h, v4.h[2] cmp x12,x11 bhs stage2_shift3 @@ -2278,26 +2278,26 @@ stage2_shift2: ld1 {v12.4h, v13.4h},[x1],#16 ld1 {v14.4h, v15.4h},[x1],x10 - smlsl v24.4s, v14.4h, v5.4h[1] - smlsl v26.4s, v14.4h, v7.4h[3] - smlal v28.4s, v14.4h, v5.4h[3] - smlal v30.4s, v14.4h, v3.4h[1] + smlsl v24.4s, v14.4h, v5.h[1] + smlsl v26.4s, v14.4h, v7.h[3] + smlal v28.4s, v14.4h, v5.h[3] + smlal v30.4s, v14.4h, v3.h[1] - smlal v24.4s, v15.4h, v2.4h[1] - smlal v26.4s, v15.4h, v1.4h[1] - smlal v28.4s, v15.4h, v4.4h[3] - smlsl v30.4s, v15.4h, v7.4h[3] + smlal v24.4s, v15.4h, v2.h[1] + smlal v26.4s, v15.4h, v1.h[1] + smlal v28.4s, v15.4h, v4.h[3] + smlsl v30.4s, v15.4h, v7.h[3] - smlsl v20.4s, v12.4h, v1.4h[0] - smlal v20.4s, v13.4h, v6.4h[2] - smlsl v22.4s, v12.4h, v3.4h[0] - smlal v22.4s, v13.4h, v3.4h[2] - smlsl v16.4s, v12.4h, v5.4h[0] - smlal v16.4s, v13.4h, v0.4h[2] - smlsl v18.4s, v12.4h, v7.4h[0] - smlal v18.4s, v13.4h, v2.4h[2] + smlsl v20.4s, v12.4h, v1.h[0] + smlal v20.4s, v13.4h, v6.h[2] + smlsl v22.4s, v12.4h, v3.h[0] + smlal v22.4s, v13.4h, v3.h[2] + smlsl v16.4s, v12.4h, v5.h[0] + smlal v16.4s, v13.4h, v0.h[2] + smlsl v18.4s, v12.4h, v7.h[0] + smlal v18.4s, v13.4h, v2.h[2] cmp x12,x5 bhs stage2_shift3 @@ -2307,32 +2307,32 @@ stage2_shift2: - smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0) - smlsl v26.4s, v8.4h, v5.4h[1] //// y1 * cos3(part of b1) - smlsl v28.4s, v8.4h, v0.4h[3] //// y1 * sin3(part of b2) - smlsl v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3) + smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0) + smlsl v26.4s, v8.4h, v5.h[1] //// y1 * cos3(part of b1) + smlsl v28.4s, v8.4h, v0.h[3] //// y1 * sin3(part of b2) + smlsl v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3) - smlsl v24.4s, v9.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) - smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) - smlal v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) - smlal 
v30.4s, v9.4h, v0.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smlsl v24.4s, v9.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1) + smlal v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlal v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3) - smlal v20.4s, v10.4h, v2.4h[0] - smlsl v20.4s, v11.4h, v5.4h[2] + smlal v20.4s, v10.4h, v2.h[0] + smlsl v20.4s, v11.4h, v5.h[2] - smlal v22.4s, v10.4h, v6.4h[0] - smlsl v22.4s, v11.4h, v0.4h[2] + smlal v22.4s, v10.4h, v6.h[0] + smlsl v22.4s, v11.4h, v0.h[2] - smlsl v16.4s, v10.4h, v6.4h[0] - smlsl v16.4s, v11.4h, v4.4h[2] + smlsl v16.4s, v10.4h, v6.h[0] + smlsl v16.4s, v11.4h, v4.h[2] - smlsl v18.4s, v10.4h, v2.4h[0] - smlal v18.4s, v11.4h, v6.4h[2] + smlsl v18.4s, v10.4h, v2.h[0] + smlal v18.4s, v11.4h, v6.h[2] cmp x12,x6 bhs stage2_shift3 @@ -2344,26 +2344,26 @@ stage2_shift2: - smlsl v24.4s, v14.4h, v7.4h[1] - smlal v26.4s, v14.4h, v2.4h[1] - smlal v28.4s, v14.4h, v4.4h[1] - smlsl v30.4s, v14.4h, v5.4h[1] + smlsl v24.4s, v14.4h, v7.h[1] + smlal v26.4s, v14.4h, v2.h[1] + smlal v28.4s, v14.4h, v4.h[1] + smlsl v30.4s, v14.4h, v5.h[1] - smlal v24.4s, v15.4h, v0.4h[3] - smlal v26.4s, v15.4h, v7.4h[1] - smlsl v28.4s, v15.4h, v1.4h[1] - smlsl v30.4s, v15.4h, v6.4h[1] + smlal v24.4s, v15.4h, v0.h[3] + smlal v26.4s, v15.4h, v7.h[1] + smlsl v28.4s, v15.4h, v1.h[1] + smlsl v30.4s, v15.4h, v6.h[1] - smlsl v20.4s, v12.4h, v3.4h[0] - smlal v20.4s, v13.4h, v4.4h[2] - smlal v22.4s, v12.4h, v7.4h[0] - smlal v22.4s, v13.4h, v2.4h[2] - smlal v16.4s, v12.4h, v1.4h[0] - smlsl v16.4s, v13.4h, v6.4h[2] - smlal v18.4s, v12.4h, v5.4h[0] - smlsl v18.4s, v13.4h, v0.4h[2] + smlsl v20.4s, v12.4h, v3.h[0] + smlal v20.4s, v13.4h, v4.h[2] + smlal v22.4s, v12.4h, v7.h[0] + smlal v22.4s, v13.4h, v2.h[2] + smlal v16.4s, v12.4h, v1.h[0] + smlsl v16.4s, v13.4h, v6.h[2] + smlal v18.4s, v12.4h, v5.h[0] + smlsl v18.4s, v13.4h, v0.h[2] cmp x12,x9 bhs stage2_shift3 @@ -2373,32 +2373,32 @@ stage2_shift2: ld1 {v8.4h, v9.4h},[x1],x10 - smlsl v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0) - smlsl v26.4s, v8.4h, v0.4h[1] //// y1 * cos3(part of b1) - smlal v28.4s, v8.4h, v6.4h[3] //// y1 * sin3(part of b2) - smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3) + smlsl v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0) + smlsl v26.4s, v8.4h, v0.h[1] //// y1 * cos3(part of b1) + smlal v28.4s, v8.4h, v6.h[3] //// y1 * sin3(part of b2) + smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) - smlsl v24.4s, v9.4h, v0.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) - smlal v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) - smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) - smlsl v30.4s, v9.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3) + smlsl v24.4s, v9.4h, v0.h[1] //// y1 * cos1 + y3 * cos3(part of b0) + smlal v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v9.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3) - smlal v20.4s, v10.4h, v0.4h[0] - smlsl v20.4s, v11.4h, v3.4h[2] + smlal v20.4s, v10.4h, v0.h[0] + smlsl v20.4s, v11.4h, v3.h[2] - smlsl v22.4s, v10.4h, v0.4h[0] - smlsl v22.4s, v11.4h, v5.4h[2] + smlsl v22.4s, v10.4h, v0.h[0] + smlsl v22.4s, v11.4h, v5.h[2] - smlsl v16.4s, v10.4h, v0.4h[0] - smlal v16.4s, v11.4h, v1.4h[2] + smlsl v16.4s, v10.4h, v0.h[0] + smlal v16.4s, v11.4h, v1.h[2] - smlal v18.4s, v10.4h, v0.4h[0] - smlal v18.4s, v11.4h, v7.4h[2] 
+ smlal v18.4s, v10.4h, v0.h[0] + smlal v18.4s, v11.4h, v7.h[2] ld1 {v12.4h, v13.4h},[x1],#16 ld1 {v14.4h, v15.4h},[x1],x10 @@ -2406,84 +2406,84 @@ stage2_shift2: - smlal v24.4s, v14.4h, v6.4h[3] - smlal v26.4s, v14.4h, v3.4h[3] - smlsl v28.4s, v14.4h, v1.4h[3] - smlal v30.4s, v14.4h, v7.4h[1] + smlal v24.4s, v14.4h, v6.h[3] + smlal v26.4s, v14.4h, v3.h[3] + smlsl v28.4s, v14.4h, v1.h[3] + smlal v30.4s, v14.4h, v7.h[1] - smlal v24.4s, v15.4h, v1.4h[3] - smlsl v26.4s, v15.4h, v2.4h[3] - smlal v28.4s, v15.4h, v7.4h[1] - smlal v30.4s, v15.4h, v4.4h[1] + smlal v24.4s, v15.4h, v1.h[3] + smlsl v26.4s, v15.4h, v2.h[3] + smlal v28.4s, v15.4h, v7.h[1] + smlal v30.4s, v15.4h, v4.h[1] - smlsl v20.4s, v12.4h, v5.4h[0] - smlal v20.4s, v13.4h, v2.4h[2] - smlal v22.4s, v12.4h, v1.4h[0] - smlsl v22.4s, v13.4h, v7.4h[2] - smlsl v16.4s, v12.4h, v7.4h[0] - smlsl v16.4s, v13.4h, v3.4h[2] - smlsl v18.4s, v12.4h, v3.4h[0] - smlal v18.4s, v13.4h, v1.4h[2] + smlsl v20.4s, v12.4h, v5.h[0] + smlal v20.4s, v13.4h, v2.h[2] + smlal v22.4s, v12.4h, v1.h[0] + smlsl v22.4s, v13.4h, v7.h[2] + smlsl v16.4s, v12.4h, v7.h[0] + smlsl v16.4s, v13.4h, v3.h[2] + smlsl v18.4s, v12.4h, v3.h[0] + smlal v18.4s, v13.4h, v1.h[2] ld1 {v10.4h, v11.4h},[x1],#16 ld1 {v8.4h, v9.4h},[x1],x10 - smlsl v24.4s, v8.4h, v5.4h[3] //// y1 * cos1(part of b0) - smlsl v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1) - smlal v28.4s, v8.4h, v3.4h[1] //// y1 * sin3(part of b2) - smlsl v30.4s, v8.4h, v0.4h[1] //// y1 * sin1(part of b3) + smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0) + smlsl v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1) + smlal v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2) + smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3) - smlsl v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) - smlal v26.4s, v9.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) - smlsl v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) - smlal v30.4s, v9.4h, v4.4h[3] //// y1 * sin1 - y3 * sin3(part of b3) + smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlal v26.4s, v9.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlal v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3) - smlal v20.4s, v10.4h, v6.4h[0] - smlsl v20.4s, v11.4h, v1.4h[2] + smlal v20.4s, v10.4h, v6.h[0] + smlsl v20.4s, v11.4h, v1.h[2] - smlsl v22.4s, v10.4h, v2.4h[0] - smlal v22.4s, v11.4h, v4.4h[2] + smlsl v22.4s, v10.4h, v2.h[0] + smlal v22.4s, v11.4h, v4.h[2] - smlal v16.4s, v10.4h, v2.4h[0] - smlsl v16.4s, v11.4h, v7.4h[2] + smlal v16.4s, v10.4h, v2.h[0] + smlsl v16.4s, v11.4h, v7.h[2] - smlsl v18.4s, v10.4h, v6.4h[0] - smlsl v18.4s, v11.4h, v5.4h[2] + smlsl v18.4s, v10.4h, v6.h[0] + smlsl v18.4s, v11.4h, v5.h[2] ld1 {v12.4h, v13.4h},[x1],#16 ld1 {v14.4h, v15.4h},[x1],x10 - smlal v24.4s, v14.4h, v4.4h[3] - smlsl v26.4s, v14.4h, v6.4h[1] - smlal v28.4s, v14.4h, v7.4h[3] - smlal v30.4s, v14.4h, v6.4h[3] + smlal v24.4s, v14.4h, v4.h[3] + smlsl v26.4s, v14.4h, v6.h[1] + smlal v28.4s, v14.4h, v7.h[3] + smlal v30.4s, v14.4h, v6.h[3] - smlal v24.4s, v15.4h, v3.4h[3] - smlsl v26.4s, v15.4h, v3.4h[1] - smlal v28.4s, v15.4h, v2.4h[3] - smlsl v30.4s, v15.4h, v2.4h[1] + smlal v24.4s, v15.4h, v3.h[3] + smlsl v26.4s, v15.4h, v3.h[1] + smlal v28.4s, v15.4h, v2.h[3] + smlsl v30.4s, v15.4h, v2.h[1] - smlsl v20.4s, v12.4h, v7.4h[0] - smlal v20.4s, v13.4h, v0.4h[2] - smlal v22.4s, v12.4h, v5.4h[0] - smlsl v22.4s, v13.4h, v1.4h[2] - smlsl v16.4s, v12.4h, 
v3.4h[0] - smlal v16.4s, v13.4h, v2.4h[2] - smlal v18.4s, v12.4h, v1.4h[0] - smlsl v18.4s, v13.4h, v3.4h[2] + smlsl v20.4s, v12.4h, v7.h[0] + smlal v20.4s, v13.4h, v0.h[2] + smlal v22.4s, v12.4h, v5.h[0] + smlsl v22.4s, v13.4h, v1.h[2] + smlsl v16.4s, v12.4h, v3.h[0] + smlal v16.4s, v13.4h, v2.h[2] + smlal v18.4s, v12.4h, v1.h[0] + smlsl v18.4s, v13.4h, v3.h[2] stage2_shift3: add v8.4s, v20.4s , v24.4s @@ -2555,32 +2555,32 @@ stage2_shift3: ld1 {v8.4h, v9.4h},[x1],x10 - smull v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0) - smull v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1) - smull v28.4s, v8.4h, v7.4h[1] //// y1 * sin3(part of b2) - smull v30.4s, v8.4h, v7.4h[3] //// y1 * sin1(part of b3) + smull v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0) + smull v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1) + smull v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2) + smull v30.4s, v8.4h, v7.h[3] //// y1 * sin1(part of b3) - smlsl v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) - smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) - smlsl v28.4s, v9.4h, v5.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) - smlsl v30.4s, v9.4h, v7.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v9.4h, v5.h[3] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3) - smull v20.4s, v10.4h, v0.4h[0] - smlsl v20.4s, v11.4h, v3.4h[2] + smull v20.4s, v10.4h, v0.h[0] + smlsl v20.4s, v11.4h, v3.h[2] - smull v22.4s, v10.4h, v0.4h[0] - smlsl v22.4s, v11.4h, v2.4h[2] + smull v22.4s, v10.4h, v0.h[0] + smlsl v22.4s, v11.4h, v2.h[2] - smull v16.4s, v10.4h, v0.4h[0] - smlsl v16.4s, v11.4h, v1.4h[2] + smull v16.4s, v10.4h, v0.h[0] + smlsl v16.4s, v11.4h, v1.h[2] - smull v18.4s, v10.4h, v0.4h[0] - smlsl v18.4s, v11.4h, v0.4h[2] + smull v18.4s, v10.4h, v0.h[0] + smlsl v18.4s, v11.4h, v0.h[2] cmp x12,x11 bhs stage2_shift4 @@ -2592,26 +2592,26 @@ stage2_shift3: - smlal v24.4s, v14.4h, v0.4h[1] - smlal v26.4s, v14.4h, v1.4h[3] - smlal v28.4s, v14.4h, v4.4h[1] - smlal v30.4s, v14.4h, v6.4h[3] + smlal v24.4s, v14.4h, v0.h[1] + smlal v26.4s, v14.4h, v1.h[3] + smlal v28.4s, v14.4h, v4.h[1] + smlal v30.4s, v14.4h, v6.h[3] - smlsl v24.4s, v15.4h, v4.4h[1] - smlsl v26.4s, v15.4h, v0.4h[3] - smlsl v28.4s, v15.4h, v2.4h[3] - smlsl v30.4s, v15.4h, v6.4h[1] + smlsl v24.4s, v15.4h, v4.h[1] + smlsl v26.4s, v15.4h, v0.h[3] + smlsl v28.4s, v15.4h, v2.h[3] + smlsl v30.4s, v15.4h, v6.h[1] - smlal v20.4s, v12.4h, v7.4h[0] - smlal v20.4s, v13.4h, v5.4h[2] - smlal v22.4s, v12.4h, v5.4h[0] - smlsl v22.4s, v13.4h, v7.4h[2] - smlal v16.4s, v12.4h, v3.4h[0] - smlsl v16.4s, v13.4h, v4.4h[2] - smlal v18.4s, v12.4h, v1.4h[0] - smlsl v18.4s, v13.4h, v1.4h[2] + smlal v20.4s, v12.4h, v7.h[0] + smlal v20.4s, v13.4h, v5.h[2] + smlal v22.4s, v12.4h, v5.h[0] + smlsl v22.4s, v13.4h, v7.h[2] + smlal v16.4s, v12.4h, v3.h[0] + smlsl v16.4s, v13.4h, v4.h[2] + smlal v18.4s, v12.4h, v1.h[0] + smlsl v18.4s, v13.4h, v1.h[2] cmp x12,x5 bhs stage2_shift4 @@ -2621,32 +2621,32 @@ stage2_shift3: - smlal v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0) - smlal v26.4s, v8.4h, v3.4h[1] //// y1 * cos3(part of b1) - smlal v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2) - smlal v30.4s, v8.4h, v5.4h[3] //// y1 * sin1(part of b3) + smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0) + smlal v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1) 
+ smlal v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2) + smlal v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3) - smlal v24.4s, v9.4h, v4.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) - smlsl v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) - smlsl v28.4s, v9.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) - smlsl v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v9.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3) - smlsl v20.4s, v10.4h, v2.4h[0] - smlal v20.4s, v11.4h, v1.4h[2] + smlsl v20.4s, v10.4h, v2.h[0] + smlal v20.4s, v11.4h, v1.h[2] - smlsl v22.4s, v10.4h, v6.4h[0] - smlal v22.4s, v11.4h, v3.4h[2] + smlsl v22.4s, v10.4h, v6.h[0] + smlal v22.4s, v11.4h, v3.h[2] - smlal v16.4s, v10.4h, v6.4h[0] - smlsl v16.4s, v11.4h, v7.4h[2] + smlal v16.4s, v10.4h, v6.h[0] + smlsl v16.4s, v11.4h, v7.h[2] - smlal v18.4s, v10.4h, v2.4h[0] - smlsl v18.4s, v11.4h, v2.4h[2] + smlal v18.4s, v10.4h, v2.h[0] + smlsl v18.4s, v11.4h, v2.h[2] cmp x12,x6 bhs stage2_shift4 @@ -2660,26 +2660,26 @@ stage2_shift3: - smlsl v24.4s, v14.4h, v1.4h[1] - smlsl v26.4s, v14.4h, v7.4h[3] - smlal v28.4s, v14.4h, v1.4h[3] - smlal v30.4s, v14.4h, v4.4h[3] + smlsl v24.4s, v14.4h, v1.h[1] + smlsl v26.4s, v14.4h, v7.h[3] + smlal v28.4s, v14.4h, v1.h[3] + smlal v30.4s, v14.4h, v4.h[3] - smlal v24.4s, v15.4h, v2.4h[1] - smlal v26.4s, v15.4h, v5.4h[1] - smlsl v28.4s, v15.4h, v3.4h[1] - smlsl v30.4s, v15.4h, v4.4h[1] + smlal v24.4s, v15.4h, v2.h[1] + smlal v26.4s, v15.4h, v5.h[1] + smlsl v28.4s, v15.4h, v3.h[1] + smlsl v30.4s, v15.4h, v4.h[1] - smlsl v20.4s, v12.4h, v5.4h[0] - smlsl v20.4s, v13.4h, v7.4h[2] - smlsl v22.4s, v12.4h, v1.4h[0] - smlal v22.4s, v13.4h, v1.4h[2] - smlsl v16.4s, v12.4h, v7.4h[0] - smlal v16.4s, v13.4h, v5.4h[2] - smlal v18.4s, v12.4h, v3.4h[0] - smlsl v18.4s, v13.4h, v3.4h[2] + smlsl v20.4s, v12.4h, v5.h[0] + smlsl v20.4s, v13.4h, v7.h[2] + smlsl v22.4s, v12.4h, v1.h[0] + smlal v22.4s, v13.4h, v1.h[2] + smlsl v16.4s, v12.4h, v7.h[0] + smlal v16.4s, v13.4h, v5.h[2] + smlal v18.4s, v12.4h, v3.h[0] + smlsl v18.4s, v13.4h, v3.h[2] cmp x12,x9 bhs stage2_shift4 @@ -2689,32 +2689,32 @@ stage2_shift3: ld1 {v8.4h, v9.4h},[x1],x10 - smlsl v24.4s, v8.4h, v5.4h[3] //// y1 * cos1(part of b0) - smlsl v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1) - smlal v28.4s, v8.4h, v4.4h[3] //// y1 * sin3(part of b2) - smlal v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3) + smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0) + smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1) + smlal v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2) + smlal v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3) - smlsl v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) - smlal v26.4s, v9.4h, v0.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) - smlsl v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) - smlsl v30.4s, v9.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smlsl v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlal v26.4s, v9.4h, v0.h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3) - smlal v20.4s, v10.4h, v0.4h[0] - smlsl v20.4s, v11.4h, v0.4h[2] + smlal v20.4s, v10.4h, v0.h[0] + smlsl v20.4s, 
v11.4h, v0.h[2] - smlsl v22.4s, v10.4h, v0.4h[0] - smlal v22.4s, v11.4h, v6.4h[2] + smlsl v22.4s, v10.4h, v0.h[0] + smlal v22.4s, v11.4h, v6.h[2] - smlsl v16.4s, v10.4h, v0.4h[0] - smlal v16.4s, v11.4h, v2.4h[2] + smlsl v16.4s, v10.4h, v0.h[0] + smlal v16.4s, v11.4h, v2.h[2] - smlal v18.4s, v10.4h, v0.4h[0] - smlsl v18.4s, v11.4h, v4.4h[2] + smlal v18.4s, v10.4h, v0.h[0] + smlsl v18.4s, v11.4h, v4.h[2] ld1 {v12.4h, v13.4h},[x1],#16 ld1 {v14.4h, v15.4h},[x1],x10 @@ -2722,26 +2722,26 @@ stage2_shift3: - smlal v24.4s, v14.4h, v3.4h[1] - smlsl v26.4s, v14.4h, v2.4h[1] - smlal v28.4s, v14.4h, v7.4h[3] - smlal v30.4s, v14.4h, v2.4h[3] + smlal v24.4s, v14.4h, v3.h[1] + smlsl v26.4s, v14.4h, v2.h[1] + smlal v28.4s, v14.4h, v7.h[3] + smlal v30.4s, v14.4h, v2.h[3] - smlsl v24.4s, v15.4h, v0.4h[3] - smlal v26.4s, v15.4h, v4.4h[3] - smlal v28.4s, v15.4h, v6.4h[3] - smlsl v30.4s, v15.4h, v2.4h[1] + smlsl v24.4s, v15.4h, v0.h[3] + smlal v26.4s, v15.4h, v4.h[3] + smlal v28.4s, v15.4h, v6.h[3] + smlsl v30.4s, v15.4h, v2.h[1] - smlal v20.4s, v12.4h, v3.4h[0] - smlsl v20.4s, v13.4h, v6.4h[2] - smlal v22.4s, v12.4h, v7.4h[0] - smlsl v22.4s, v13.4h, v4.4h[2] - smlsl v16.4s, v12.4h, v1.4h[0] - smlal v16.4s, v13.4h, v0.4h[2] - smlal v18.4s, v12.4h, v5.4h[0] - smlsl v18.4s, v13.4h, v5.4h[2] + smlal v20.4s, v12.4h, v3.h[0] + smlsl v20.4s, v13.4h, v6.h[2] + smlal v22.4s, v12.4h, v7.h[0] + smlsl v22.4s, v13.4h, v4.h[2] + smlsl v16.4s, v12.4h, v1.h[0] + smlal v16.4s, v13.4h, v0.h[2] + smlal v18.4s, v12.4h, v5.h[0] + smlsl v18.4s, v13.4h, v5.h[2] ld1 {v10.4h, v11.4h},[x1],#16 @@ -2750,32 +2750,32 @@ stage2_shift3: - smlal v24.4s, v8.4h, v3.4h[3] //// y1 * cos1(part of b0) - smlsl v26.4s, v8.4h, v7.4h[1] //// y1 * cos3(part of b1) - smlsl v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2) - smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3) + smlal v24.4s, v8.4h, v3.h[3] //// y1 * cos1(part of b0) + smlsl v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1) + smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2) + smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) - smlsl v24.4s, v9.4h, v7.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) - smlsl v26.4s, v9.4h, v6.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) - smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) - smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smlsl v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v9.4h, v6.h[1] //// y1 * cos3 - y3 * sin1(part of b1) + smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) - smlsl v20.4s, v10.4h, v6.4h[0] - smlal v20.4s, v11.4h, v2.4h[2] + smlsl v20.4s, v10.4h, v6.h[0] + smlal v20.4s, v11.4h, v2.h[2] - smlal v22.4s, v10.4h, v2.4h[0] - smlsl v22.4s, v11.4h, v0.4h[2] + smlal v22.4s, v10.4h, v2.h[0] + smlsl v22.4s, v11.4h, v0.h[2] - smlsl v16.4s, v10.4h, v2.4h[0] - smlal v16.4s, v11.4h, v3.4h[2] + smlsl v16.4s, v10.4h, v2.h[0] + smlal v16.4s, v11.4h, v3.h[2] - smlal v18.4s, v10.4h, v6.4h[0] - smlsl v18.4s, v11.4h, v6.4h[2] + smlal v18.4s, v10.4h, v6.h[0] + smlsl v18.4s, v11.4h, v6.h[2] ld1 {v12.4h, v13.4h},[x1],#16 @@ -2783,26 +2783,26 @@ stage2_shift3: - smlsl v24.4s, v14.4h, v5.4h[1] - smlal v26.4s, v14.4h, v3.4h[3] - smlsl v28.4s, v14.4h, v2.4h[1] - smlal v30.4s, v14.4h, v0.4h[3] + smlsl v24.4s, v14.4h, v5.h[1] + smlal v26.4s, v14.4h, v3.h[3] + smlsl v28.4s, v14.4h, v2.h[1] + smlal v30.4s, v14.4h, v0.h[3] - smlal v24.4s, v15.4h, v1.4h[3] - smlsl v26.4s, v15.4h, 
v1.4h[1] - smlal v28.4s, v15.4h, v0.4h[3] - smlsl v30.4s, v15.4h, v0.4h[1] + smlal v24.4s, v15.4h, v1.h[3] + smlsl v26.4s, v15.4h, v1.h[1] + smlal v28.4s, v15.4h, v0.h[3] + smlsl v30.4s, v15.4h, v0.h[1] - smlsl v20.4s, v12.4h, v1.4h[0] - smlal v20.4s, v13.4h, v4.4h[2] - smlal v22.4s, v12.4h, v3.4h[0] - smlsl v22.4s, v13.4h, v5.4h[2] - smlsl v16.4s, v12.4h, v5.4h[0] - smlal v16.4s, v13.4h, v6.4h[2] - smlal v18.4s, v12.4h, v7.4h[0] - smlsl v18.4s, v13.4h, v7.4h[2] + smlsl v20.4s, v12.4h, v1.h[0] + smlal v20.4s, v13.4h, v4.h[2] + smlal v22.4s, v12.4h, v3.h[0] + smlsl v22.4s, v13.4h, v5.h[2] + smlsl v16.4s, v12.4h, v5.h[0] + smlal v16.4s, v13.4h, v6.h[2] + smlal v18.4s, v12.4h, v7.h[0] + smlsl v18.4s, v13.4h, v7.h[2] stage2_shift4: add v8.4s, v20.4s , v24.4s diff --git a/common/arm64/ihevc_itrans_recon_4x4.s b/common/arm64/ihevc_itrans_recon_4x4.s index 1f2c904..61fa5d7 100644 --- a/common/arm64/ihevc_itrans_recon_4x4.s +++ b/common/arm64/ihevc_itrans_recon_4x4.s @@ -140,11 +140,11 @@ ihevc_itrans_recon_4x4_av8: // first stage computation starts - smull v6.4s, v1.4h, v4.4h[1] //83 * pi2_src[1] - smlal v6.4s, v3.4h, v4.4h[3] //o[0] = 83 * pi2_src[1] + 36 * pi2_src[3] - smull v5.4s, v1.4h, v4.4h[3] //36 * pi2_src[1] + smull v6.4s, v1.4h, v4.h[1] //83 * pi2_src[1] + smlal v6.4s, v3.4h, v4.h[3] //o[0] = 83 * pi2_src[1] + 36 * pi2_src[3] + smull v5.4s, v1.4h, v4.h[3] //36 * pi2_src[1] ld1 {v22.s}[0],[x2],x5 - smlsl v5.4s, v3.4h, v4.4h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3] + smlsl v5.4s, v3.4h, v4.h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3] saddl v7.4s, v0.4h, v2.4h //pi2_src[0] + pi2_src[2] ssubl v17.4s, v0.4h, v2.4h //pi2_src[0] - pi2_src[2] @@ -173,11 +173,11 @@ ihevc_itrans_recon_4x4_av8: // first stage ends // output in d0,d1,d2,d3 // second stage starts - smull v6.4s, v1.4h, v4.4h[1] //83 * pi2_src[1] + smull v6.4s, v1.4h, v4.h[1] //83 * pi2_src[1] ld1 {v22.s}[1],[x2],x5 - smlal v6.4s, v3.4h, v4.4h[3] //o[0] = 83 * pi2_src[1] + 36 * pi2_src[3] - smull v5.4s, v1.4h, v4.4h[3] //36 * pi2_src[1] - smlsl v5.4s, v3.4h, v4.4h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3] + smlal v6.4s, v3.4h, v4.h[3] //o[0] = 83 * pi2_src[1] + 36 * pi2_src[3] + smull v5.4s, v1.4h, v4.h[3] //36 * pi2_src[1] + smlsl v5.4s, v3.4h, v4.h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3] ld1 {v23.s}[0],[x2],x5 saddl v7.4s, v0.4h, v2.4h //pi2_src[0] + pi2_src[2] diff --git a/common/arm64/ihevc_itrans_recon_4x4_ttype1.s b/common/arm64/ihevc_itrans_recon_4x4_ttype1.s index da04c5e..c30f358 100644 --- a/common/arm64/ihevc_itrans_recon_4x4_ttype1.s +++ b/common/arm64/ihevc_itrans_recon_4x4_ttype1.s @@ -127,34 +127,34 @@ ihevc_itrans_recon_4x4_ttype1_av8: mov x9,#55 mov x10,#74 mov x11,#84 - mov v4.4h[0], w8 + mov v4.h[0], w8 ld1 {v0.4h},[x0],x4 //loading pi2_src 1st row - mov v4.4h[1], w9 + mov v4.h[1], w9 ld1 {v1.4h},[x0],x4 //loading pi2_src 2nd row - mov v4.4h[2], w10 + mov v4.h[2], w10 ld1 {v2.4h},[x0],x4 //loading pi2_src 3rd row - mov v4.4h[3], w11 + mov v4.h[3], w11 ld1 {v3.4h},[x0],x4 //loading pi2_src 4th row // first stage computation starts - smull v6.4s, v1.4h, v4.4h[2] //74 * pi2_src[1] - smlal v6.4s, v0.4h, v4.4h[0] //74 * pi2_src[1] + 29 * pi2_src[0] - smlal v6.4s, v3.4h, v4.4h[1] //74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3] - smlal v6.4s, v2.4h, v4.4h[3] //pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3] + smull v6.4s, v1.4h, v4.h[2] //74 * pi2_src[1] + smlal v6.4s, v0.4h, v4.h[0] //74 * pi2_src[1] + 29 * pi2_src[0] + smlal v6.4s, v3.4h, v4.h[1] //74 * pi2_src[1] 
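The change running through all of the hunks above (and the two 4x4 files below) is one mechanical rewrite: in the by-element (scalar) forms of smull/smlal/smlsl, and in lane moves such as mov v4.h[0], w8, the scalar operand is written with the element size and index only (v4.h[1]), not with a full vector arrangement (v4.4h[1]). The arrangement-qualified spelling is a GNU-as laxity; the element-only spelling is the architectural AArch64 syntax and, presumably the motivation for this patch, the only one stricter assemblers such as clang's integrated assembler accept. A minimal before/after sketch, reusing registers that appear in these hunks — an editor's illustration, not part of the patch:

    // old spelling, GNU as only:
    //     smull   v6.4s, v1.4h, v4.4h[1]
    // architectural by-element spelling, accepted by GNU as and clang:
    smull   v6.4s, v1.4h, v4.h[1]    // each 32-bit lane of v6 = corresponding 16-bit lane of v1 * v4.h[1]

The destination and the vector source keep their arrangements (v6.4s, v1.4h); only the scalar multiplier drops it, since a single lane has an element size but no arrangement.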
+ 29 * pi2_src[0] + 55 * pi2_src[3] + smlal v6.4s, v2.4h, v4.h[3] //pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3] - smull v5.4s, v1.4h, v4.4h[2] //74 * pi2_src[1] - smlal v5.4s, v0.4h, v4.4h[1] //74 * pi2_src[1] + 55 * pi2_src[0] - smlsl v5.4s, v2.4h, v4.4h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - smlsl v5.4s, v3.4h, v4.4h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3]) + smull v5.4s, v1.4h, v4.h[2] //74 * pi2_src[1] + smlal v5.4s, v0.4h, v4.h[1] //74 * pi2_src[1] + 55 * pi2_src[0] + smlsl v5.4s, v2.4h, v4.h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] + smlsl v5.4s, v3.4h, v4.h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3]) - smull v7.4s, v0.4h, v4.4h[2] // 74 * pi2_src[0] - smlsl v7.4s, v2.4h, v4.4h[2] // 74 * pi2_src[0] - 74 * pi2_src[2] - smlal v7.4s, v3.4h, v4.4h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3] + smull v7.4s, v0.4h, v4.h[2] // 74 * pi2_src[0] + smlsl v7.4s, v2.4h, v4.h[2] // 74 * pi2_src[0] - 74 * pi2_src[2] + smlal v7.4s, v3.4h, v4.h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3] - smull v20.4s, v2.4h, v4.4h[1] // 55 * pi2_src[2] - smlsl v20.4s, v1.4h, v4.4h[2] // 55 * pi2_src[2] - 74 * pi2_src[1] - smlsl v20.4s, v3.4h, v4.4h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] - smlal v20.4s, v0.4h, v4.4h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] + smull v20.4s, v2.4h, v4.h[1] // 55 * pi2_src[2] + smlsl v20.4s, v1.4h, v4.h[2] // 55 * pi2_src[2] - 74 * pi2_src[1] + smlsl v20.4s, v3.4h, v4.h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] + smlal v20.4s, v0.4h, v4.h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] sqrshrn v28.4h, v6.4s,#shift_stage1_idct // (pi2_out[0] + rounding ) >> shift_stage1_idct sqrshrn v29.4h, v5.4s,#shift_stage1_idct // (pi2_out[1] + rounding ) >> shift_stage1_idct @@ -180,25 +180,25 @@ ihevc_itrans_recon_4x4_ttype1_av8: // d16 - d2 // d17 - d3 ld1 {v18.s}[1],[x2],x5 - smull v6.4s, v22.4h, v4.4h[2] //74 * pi2_src[1] - smlal v6.4s, v21.4h, v4.4h[0] //74 * pi2_src[1] + 29 * pi2_src[0] - smlal v6.4s, v17.4h, v4.4h[1] //74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3] - smlal v6.4s, v16.4h, v4.4h[3] //pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3] - - smull v5.4s, v22.4h, v4.4h[2] //74 * pi2_src[1] - smlal v5.4s, v21.4h, v4.4h[1] //74 * pi2_src[1] + 55 * pi2_src[0] - smlsl v5.4s, v16.4h, v4.4h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - smlsl v5.4s, v17.4h, v4.4h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3]) - - smull v7.4s, v21.4h, v4.4h[2] // 74 * pi2_src[0] - smlsl v7.4s, v16.4h, v4.4h[2] // 74 * pi2_src[0] - 74 * pi2_src[2] - smlal v7.4s, v17.4h, v4.4h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3] + smull v6.4s, v22.4h, v4.h[2] //74 * pi2_src[1] + smlal v6.4s, v21.4h, v4.h[0] //74 * pi2_src[1] + 29 * pi2_src[0] + smlal v6.4s, v17.4h, v4.h[1] //74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3] + smlal v6.4s, v16.4h, v4.h[3] //pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3] + + smull v5.4s, v22.4h, v4.h[2] //74 * pi2_src[1] + smlal v5.4s, v21.4h, v4.h[1] //74 * pi2_src[1] + 55 * pi2_src[0] + smlsl v5.4s, v16.4h, v4.h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] + smlsl v5.4s, v17.4h, v4.h[3] 
//pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3]) + + smull v7.4s, v21.4h, v4.h[2] // 74 * pi2_src[0] + smlsl v7.4s, v16.4h, v4.h[2] // 74 * pi2_src[0] - 74 * pi2_src[2] + smlal v7.4s, v17.4h, v4.h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3] ld1 {v19.s}[0],[x2],x5 - smull v20.4s, v16.4h, v4.4h[1] // 55 * pi2_src[2] - smlsl v20.4s, v22.4h, v4.4h[2] // - 74 * pi2_src[1] + 55 * pi2_src[2] - smlsl v20.4s, v17.4h, v4.4h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] - smlal v20.4s, v21.4h, v4.4h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] + smull v20.4s, v16.4h, v4.h[1] // 55 * pi2_src[2] + smlsl v20.4s, v22.4h, v4.h[2] // - 74 * pi2_src[1] + 55 * pi2_src[2] + smlsl v20.4s, v17.4h, v4.h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] + smlal v20.4s, v21.4h, v4.h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3] sqrshrn v28.4h, v6.4s,#shift_stage2_idct // (pi2_out[0] + rounding ) >> shift_stage1_idct sqrshrn v29.4h, v5.4s,#shift_stage2_idct // (pi2_out[1] + rounding ) >> shift_stage1_idct diff --git a/common/arm64/ihevc_itrans_recon_8x8.s b/common/arm64/ihevc_itrans_recon_8x8.s index 332677e..86ad136 100644 --- a/common/arm64/ihevc_itrans_recon_8x8.s +++ b/common/arm64/ihevc_itrans_recon_8x8.s @@ -184,30 +184,30 @@ ihevc_itrans_recon_8x8_av8: ld1 {v2.4h},[x0],#8 ld1 {v3.4h},[x9],#8 ld1 {v4.4h},[x0],x5 - smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) + smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1) ld1 {v5.4h},[x9],x5 - smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1) + smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1) ld1 {v6.4h},[x0],#8 ld1 {v7.4h},[x9],#8 - smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0) + smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0) ld1 {v8.4h},[x0],x10 - smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1) + smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1) ld1 {v9.4h},[x9],x10 - smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2) + smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2) ld1 {v10.4h},[x0],#8 - smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3) + smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3) ld1 {v11.4h},[x9],#8 - smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) ld1 {v12.4h},[x0],x5 - smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) ld1 {v13.4h},[x9],x5 - smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) ld1 {v14.4h},[x0],#8 - smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) ld1 {v15.4h},[x9],#8 - smull v22.4s, v10.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1) + smull v22.4s, v10.4h, v0.h[0] //// y4 * cos4(part of c0 and c1) ld1 {v16.4h},[x0],x10 - smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0) + smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0) ld1 {v17.4h},[x9],x10 ///* this following was activated when alignment is not there */ @@ -231,21 +231,21 @@ ihevc_itrans_recon_8x8_av8: - smlal v24.4s, v14.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) - smlsl v26.4s, 
v14.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) - smlal v28.4s, v14.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) - smlal v30.4s, v14.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) + smlal v24.4s, v14.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) + smlsl v26.4s, v14.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) + smlal v28.4s, v14.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) + smlal v30.4s, v14.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) - smlsl v18.4s, v11.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) - smlal v6.4s, v11.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) + smlsl v18.4s, v11.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) + smlal v6.4s, v11.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) add v10.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) - smlal v24.4s, v15.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7) - smlsl v26.4s, v15.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6) - smlal v28.4s, v15.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5) - smlsl v30.4s, v15.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4) + smlal v24.4s, v15.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7) + smlsl v26.4s, v15.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6) + smlal v28.4s, v15.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5) + smlsl v30.4s, v15.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4) add v14.4s, v10.4s , v6.4s //// a0 = c0 + d0(part of x0,x7) sub v10.4s, v10.4s , v6.4s //// a3 = c0 - d0(part of x3,x4) @@ -301,20 +301,20 @@ skip_last4_rows: - smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0) - smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1) - smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2) - smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3) + smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0) + smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1) + smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2) + smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3) - smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) - smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) - smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) - smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) - smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1) - smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0) + smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1) + smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0) - smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) + smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1) add v14.4s, v20.4s , v6.4s //// a0 = c0 + d0(part of x0,x7) 
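Note on the recurring rewrite above: for the by-element forms of smull, smlal and smlsl, the ARMv8-A reference syntax names only the element size on the indexed operand, so v0.4h[1] becomes v0.h[1]. Older GNU as tolerated the arrangement-qualified spelling, while clang's integrated assembler rejects it, hence the mechanical change across these files. A minimal standalone sketch of the accepted spelling (illustrative, not taken from the patch):

    smull   v6.4s, v1.4h, v4.h[1]   // four 16-bit lanes of v1 times lane 1 of v4,
                                    // widened to four 32-bit products in v6
    smlal   v6.4s, v3.4h, v4.h[3]   // widening multiply-accumulate against lane 3 of v4

Only the element size and index are legal on the final operand; the vector arrangement is carried by the destination and the first source register.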
@@ -350,37 +350,37 @@ last4_cols: cmp x12,#0xf0 bge skip_last4cols - smull v24.4s, v8.4h, v0.4h[1] //// y1 * cos1(part of b0) - smull v26.4s, v8.4h, v0.4h[3] //// y1 * cos3(part of b1) - smull v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2) - smull v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3) + smull v24.4s, v8.4h, v0.h[1] //// y1 * cos1(part of b0) + smull v26.4s, v8.4h, v0.h[3] //// y1 * cos3(part of b1) + smull v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2) + smull v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) - smlal v24.4s, v9.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) - smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) - smlsl v28.4s, v9.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) - smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smlal v24.4s, v9.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v9.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) - smull v18.4s, v5.4h, v1.4h[2] //// y2 * sin2 (q4 is freed by this time)(part of d1) - smull v8.4s, v5.4h, v0.4h[2] //// y2 * cos2(part of d0) + smull v18.4s, v5.4h, v1.h[2] //// y2 * sin2 (q4 is freed by this time)(part of d1) + smull v8.4s, v5.4h, v0.h[2] //// y2 * cos2(part of d0) - smull v20.4s, v4.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) - smull v22.4s, v12.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1) + smull v20.4s, v4.4h, v0.h[0] //// y0 * cos4(part of c0 and c1) + smull v22.4s, v12.4h, v0.h[0] //// y4 * cos4(part of c0 and c1) - smlal v24.4s, v16.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) - smlsl v26.4s, v16.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) - smlal v28.4s, v16.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) - smlal v30.4s, v16.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) + smlal v24.4s, v16.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) + smlsl v26.4s, v16.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) + smlal v28.4s, v16.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) + smlal v30.4s, v16.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) - smlsl v18.4s, v13.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) - smlal v8.4s, v13.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) + smlsl v18.4s, v13.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) + smlal v8.4s, v13.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) add v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) - smlal v24.4s, v17.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7) - smlsl v26.4s, v17.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6) - smlal v28.4s, v17.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5) - smlsl v30.4s, v17.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4) + smlal v24.4s, v17.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7) + smlsl v26.4s, v17.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6) + smlal v28.4s, v17.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5) + smlsl v30.4s, 
v17.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4) add v16.4s, v12.4s , v8.4s //// a0 = c0 + d0(part of e0,e7) sub v12.4s, v12.4s , v8.4s //// a3 = c0 - d0(part of e3,e4) @@ -440,21 +440,21 @@ skip_last4cols: mov v25.d[0],x15 - smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0) - smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1) - smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2) - smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3) + smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0) + smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1) + smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2) + smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3) - smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) - smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) - smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) - smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) - smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) + smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1) // vmull.s16 q11,d4,d0[0] @// y4 * cos4(part of c0 and c1) - smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1) - smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0) + smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1) + smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0) @@ -520,19 +520,19 @@ skip_last4cols: mov v25.d[0],x19 mov v25.d[1],x20 - smull v24.4s, v14.4h, v0.4h[1] //// y1 * cos1(part of b0) + smull v24.4s, v14.4h, v0.h[1] //// y1 * cos1(part of b0) - smull v26.4s, v14.4h, v0.4h[3] //// y1 * cos3(part of b1) - smull v28.4s, v14.4h, v1.4h[1] //// y1 * sin3(part of b2) - smull v30.4s, v14.4h, v1.4h[3] //// y1 * sin1(part of b3) + smull v26.4s, v14.4h, v0.h[3] //// y1 * cos3(part of b1) + smull v28.4s, v14.4h, v1.h[1] //// y1 * sin3(part of b2) + smull v30.4s, v14.4h, v1.h[3] //// y1 * sin1(part of b3) - smlal v24.4s, v15.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) - smlsl v26.4s, v15.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) - smlsl v28.4s, v15.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) - smlsl v30.4s, v15.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) - smull v20.4s, v10.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) - smull v18.4s, v11.4h, v1.4h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1) - smull v14.4s, v11.4h, v0.4h[2] //// y2 * cos2(part of d0) + smlal v24.4s, v15.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v15.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v15.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v15.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smull v20.4s, v10.4h, v0.h[0] //// y0 * cos4(part of c0 and c1) + smull v18.4s, v11.4h, v1.h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1) + smull v14.4s, v11.4h, v0.h[2] //// y2 * cos2(part of d0) add x4,x2,x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data @@ -698,38 +698,38 @@ end_skip_last4cols: //// q5 -> q2 //// q7 -> q4 - smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0) - smull v26.4s, v6.4h, 
v0.4h[3] //// y1 * cos3(part of b1) - smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2) - smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3) + smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0) + smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1) + smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2) + smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3) - smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) - smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) - smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) - smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) - smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) - smull v22.4s, v4.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1) + smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1) + smull v22.4s, v4.4h, v0.h[0] //// y4 * cos4(part of c0 and c1) - smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1) - smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0) + smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1) + smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0) - smlal v24.4s, v8.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) - smlsl v26.4s, v8.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) - smlal v28.4s, v8.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) - smlal v30.4s, v8.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) + smlal v24.4s, v8.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) + smlsl v26.4s, v8.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) + smlal v28.4s, v8.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) + smlal v30.4s, v8.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) - smlsl v18.4s, v5.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) - smlal v6.4s, v5.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) + smlsl v18.4s, v5.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) + smlal v6.4s, v5.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) add v2.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) - smlal v24.4s, v9.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7) - smlsl v26.4s, v9.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6) - smlal v28.4s, v9.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5) - smlsl v30.4s, v9.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4) + smlal v24.4s, v9.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7) + smlsl v26.4s, v9.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6) + smlal v28.4s, v9.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5) + smlsl v30.4s, v9.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4) sub v22.4s, v2.4s , v6.4s //// a3 = c0 - d0(part of x3,x4) add v4.4s, v2.4s , v6.4s //// a0 
= c0 + d0(part of x0,x7) @@ -794,53 +794,53 @@ end_skip_last4cols: - smull v24.4s, v14.4h, v0.4h[1] //// y1 * cos1(part of b0) - smull v26.4s, v14.4h, v0.4h[3] //// y1 * cos3(part of b1) - smull v28.4s, v14.4h, v1.4h[1] //// y1 * sin3(part of b2) - smull v30.4s, v14.4h, v1.4h[3] //// y1 * sin1(part of b3) - smlal v24.4s, v15.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) - smlsl v26.4s, v15.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) - smlsl v28.4s, v15.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) - smlsl v30.4s, v15.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) - smull v20.4s, v10.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) - smull v22.4s, v12.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1) - smull v18.4s, v11.4h, v1.4h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1) - smull v14.4s, v11.4h, v0.4h[2] //// y2 * cos2(part of d0) - smlal v24.4s, v16.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) + smull v24.4s, v14.4h, v0.h[1] //// y1 * cos1(part of b0) + smull v26.4s, v14.4h, v0.h[3] //// y1 * cos3(part of b1) + smull v28.4s, v14.4h, v1.h[1] //// y1 * sin3(part of b2) + smull v30.4s, v14.4h, v1.h[3] //// y1 * sin1(part of b3) + smlal v24.4s, v15.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v15.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v15.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v15.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smull v20.4s, v10.4h, v0.h[0] //// y0 * cos4(part of c0 and c1) + smull v22.4s, v12.4h, v0.h[0] //// y4 * cos4(part of c0 and c1) + smull v18.4s, v11.4h, v1.h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1) + smull v14.4s, v11.4h, v0.h[2] //// y2 * cos2(part of d0) + smlal v24.4s, v16.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) add x4,x2,x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data - smlsl v26.4s, v16.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) + smlsl v26.4s, v16.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) add x5,x8,x8, lsl #1 // - smlal v28.4s, v16.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) + smlal v28.4s, v16.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) add x0,x3,x7, lsl #1 // x0 points to 3rd row of dest data - smlal v30.4s, v16.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) + smlal v30.4s, v16.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) add x10,x7,x7, lsl #1 // - smlsl v18.4s, v13.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) + smlsl v18.4s, v13.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) - smlal v14.4s, v13.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) + smlal v14.4s, v13.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) add v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) - smlal v24.4s, v17.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7) + smlal v24.4s, v17.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7) // swapping v3 and v6 mov v31.d[0], v3.d[0] mov v3.d[0], v6.d[0] mov v6.d[0], v31.d[0] - smlsl v26.4s, v17.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6) + smlsl v26.4s, v17.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6) // swapping v5 and v8 mov 
v31.d[0], v5.d[0]
mov v5.d[0], v8.d[0]
mov v8.d[0], v31.d[0]
- smlal v28.4s, v17.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
- smlsl v30.4s, v17.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
+ smlal v28.4s, v17.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
+ smlsl v30.4s, v17.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
sub v22.4s, v12.4s , v14.4s //// a3 = c0 - d0(part of x3,x4)
add v12.4s, v12.4s , v14.4s //// a0 = c0 + d0(part of x0,x7)
diff --git a/common/arm64/ihevc_mem_fns.s b/common/arm64/ihevc_mem_fns.s
index 6619c6c..5b1026b 100644
--- a/common/arm64/ihevc_mem_fns.s
+++ b/common/arm64/ihevc_mem_fns.s
@@ -114,7 +114,7 @@ LOOP_NEON_MEMCPY:
SUBS x2,x2,#8
BGE LOOP_NEON_MEMCPY
- CMP x2,#-8
+ CMN x2,#8
BEQ MEMCPY_RETURN
ARM_MEMCPY:
@@ -186,7 +186,7 @@ LOOP_NEON_MEMSET:
SUBS x2,x2,#8
BGE LOOP_NEON_MEMSET
- CMP x2,#-8
+ CMN x2,#8
BEQ MEMSET_RETURN
ARM_MEMSET:
@@ -259,7 +259,7 @@ LOOP_NEON_MEMSET_16BIT:
SUBS x2,x2,#8
BGE LOOP_NEON_MEMSET_16BIT
- CMP x2,#-8
+ CMN x2,#8
BEQ MEMSET_16BIT_RETURN
ARM_MEMSET_16BIT:
diff --git a/common/arm64/ihevc_sao_band_offset_chroma.s b/common/arm64/ihevc_sao_band_offset_chroma.s
index f67a3de..41042ae 100644
--- a/common/arm64/ihevc_sao_band_offset_chroma.s
+++ b/common/arm64/ihevc_sao_band_offset_chroma.s
@@ -140,17 +140,17 @@ SRC_TOP_LOOP: //wd is always multiple of 8
LD1 {v30.8b},[x7] //pi1_sao_offset_u load
ADD v5.8b, v1.8b , v31.8b //band_table_u.val[0] = vadd_u8(band_table_u.val[0], sao_band_pos_u)
- dup v29.8b, v30.8b[1] //vdup_n_u8(pi1_sao_offset_u[1])
+ dup v29.8b, v30.b[1] //vdup_n_u8(pi1_sao_offset_u[1])
ADD v6.8b, v2.8b , v31.8b //band_table_u.val[1] = vadd_u8(band_table_u.val[1], sao_band_pos_u)
- dup v28.8b, v30.8b[2] //vdup_n_u8(pi1_sao_offset_u[2])
+ dup v28.8b, v30.b[2] //vdup_n_u8(pi1_sao_offset_u[2])
ADD v7.8b, v3.8b , v31.8b //band_table_u.val[2] = vadd_u8(band_table_u.val[2], sao_band_pos_u)
- dup v27.8b, v30.8b[3] //vdup_n_u8(pi1_sao_offset_u[3])
+ dup v27.8b, v30.b[3] //vdup_n_u8(pi1_sao_offset_u[3])
ADD v8.8b, v4.8b , v31.8b //band_table_u.val[3] = vadd_u8(band_table_u.val[3], sao_band_pos_u)
CMP x5,#28
- dup v26.8b, v30.8b[4] //vdup_n_u8(pi1_sao_offset_u[4])
+ dup v26.8b, v30.b[4] //vdup_n_u8(pi1_sao_offset_u[4])
ADRP x14, :got:gu1_table_band_idx
LDR x14, [x14, #:got_lo12:gu1_table_band_idx]
@@ -225,16 +225,16 @@ SWITCH_BREAK_U:
LD1 {v25.8b},[x8] //pi1_sao_offset_v load
ADD v15.8b, v11.8b , v30.8b //band_table_v.val[2] = vadd_u8(band_table_v.val[2], band_pos_v)
- dup v29.8b, v25.8b[1] //vdup_n_u8(pi1_sao_offset_v[1])
+ dup v29.8b, v25.b[1] //vdup_n_u8(pi1_sao_offset_v[1])
ADD v16.8b, v12.8b , v30.8b //band_table_v.val[3] = vadd_u8(band_table_v.val[3], band_pos_v)
- dup v28.8b, v25.8b[2] //vdup_n_u8(pi1_sao_offset_v[2])
+ dup v28.8b, v25.b[2] //vdup_n_u8(pi1_sao_offset_v[2])
ADD v9.8b, v13.8b , v29.8b //band_table_v.val[0] = vadd_u8(band_table_v.val[0], vdup_n_u8(pi1_sao_offset_v[1]))
- dup v27.8b, v25.8b[3] //vdup_n_u8(pi1_sao_offset_v[3])
+ dup v27.8b, v25.b[3] //vdup_n_u8(pi1_sao_offset_v[3])
ADD v10.8b, v14.8b , v28.8b //band_table_v.val[1] = vadd_u8(band_table_v.val[1], vdup_n_u8(pi1_sao_offset_v[2]))
- dup v26.8b, v25.8b[4] //vdup_n_u8(pi1_sao_offset_v[4])
+ dup v26.8b, v25.b[4] //vdup_n_u8(pi1_sao_offset_v[4])
ADD v11.8b, v15.8b , v27.8b //band_table_v.val[2] = vadd_u8(band_table_v.val[2], vdup_n_u8(pi1_sao_offset_v[3]))
movi v29.8b, #16 //vdup_n_u8(16)
diff --git
a/common/arm64/ihevc_sao_band_offset_luma.s b/common/arm64/ihevc_sao_band_offset_luma.s index 779ee69..d283a90 100644 --- a/common/arm64/ihevc_sao_band_offset_luma.s +++ b/common/arm64/ihevc_sao_band_offset_luma.s @@ -123,16 +123,16 @@ SRC_TOP_LOOP: //wd is always multiple of 8 LD1 {v30.8b},[x6] //pi1_sao_offset load ADD v5.8b, v1.8b , v31.8b //band_table.val[0] = vadd_u8(band_table.val[0], band_pos) - dup v29.8b, v30.8b[1] //vdup_n_u8(pi1_sao_offset[1]) + dup v29.8b, v30.b[1] //vdup_n_u8(pi1_sao_offset[1]) ADD v6.8b, v2.8b , v31.8b //band_table.val[1] = vadd_u8(band_table.val[1], band_pos) - dup v28.8b, v30.8b[2] //vdup_n_u8(pi1_sao_offset[2]) + dup v28.8b, v30.b[2] //vdup_n_u8(pi1_sao_offset[2]) ADD v7.8b, v3.8b , v31.8b //band_table.val[2] = vadd_u8(band_table.val[2], band_pos) - dup v27.8b, v30.8b[3] //vdup_n_u8(pi1_sao_offset[3]) + dup v27.8b, v30.b[3] //vdup_n_u8(pi1_sao_offset[3]) ADD v21.8b, v4.8b , v31.8b //band_table.val[3] = vadd_u8(band_table.val[3], band_pos) - dup v26.8b, v30.8b[4] //vdup_n_u8(pi1_sao_offset[4]) + dup v26.8b, v30.b[4] //vdup_n_u8(pi1_sao_offset[4]) ADD v1.8b, v5.8b , v29.8b //band_table.val[0] = vadd_u8(band_table.val[0], vdup_n_u8(pi1_sao_offset[1])) movi v29.8b, #16 //vdup_n_u8(16) diff --git a/common/arm64/ihevc_sao_edge_offset_class0.s b/common/arm64/ihevc_sao_edge_offset_class0.s index 91146e8..7c61aa2 100644 --- a/common/arm64/ihevc_sao_edge_offset_class0.s +++ b/common/arm64/ihevc_sao_edge_offset_class0.s @@ -123,12 +123,12 @@ WIDTH_LOOP_16: CMP x8,x9 //if(col == wd) BNE AU1_MASK_FF //jump to else part LDRB w12,[x7] //pu1_avail[0] - mov v3.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) + mov v3.b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) B SKIP_AU1_MASK_FF //Skip the else part AU1_MASK_FF: MOV x12,#0xFF //move -1 to x12 - mov v3.8b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v3.b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) SKIP_AU1_MASK_FF: CMP x8,#16 //If col == 16 @@ -146,7 +146,7 @@ PU1_SRC_LOOP: SUB x5,x9,x8 //wd - col SUB x14,x10,x4 //ht - row - mov v21.8b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) + mov v21.b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) mul x14, x14, x1 //(ht - row) * src_strd LD1 {v26.16b},[x12] //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy) @@ -158,7 +158,7 @@ PU1_SRC_LOOP: LDRB w14,[x6,x5] //pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)] SUB x4,x4,#1 - mov v28.8b[15], w11 //II Iteration vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) + mov v28.b[15], w11 //II Iteration vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) cmhi v18.16b, v21.16b , v17.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) SUB x12,x12,x1 //Decrement the pu1_src pointer by src_strd @@ -170,7 +170,7 @@ PU1_SRC_LOOP: SUB x5,x9,x8 //II wd - col ADD x12,x12,x1 //Increment the pu1_src pointer by src_strd - mov v21.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) + mov v21.b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) cmhi v30.16b, v26.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) LDRB w11,[x12,#16] //II pu1_src_cpy[16] @@ -178,7 +178,7 @@ PU1_SRC_LOOP: SUB x14,x10,x4 //II ht - row cmhi v0.16b, v28.16b , v26.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) - mov v28.8b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) + mov v28.b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) SUB x12,x12,x1 //Decrement 
the pu1_src pointer by src_strd mul x14, x14, x1 //II (ht - row) * src_strd @@ -271,25 +271,25 @@ WIDTH_RESIDUE: CMP x8,x9 //if(wd_rem == wd) BNE AU1_MASK_FF_RESIDUE //jump to else part LDRB w12,[x7] //pu1_avail[0] - mov v3.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) + mov v3.b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) B SKIP_AU1_MASK_FF_RESIDUE //Skip the else part AU1_MASK_FF_RESIDUE: MOV x12,#0xFF //move -s to x12 - mov v3.8b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v3.b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) SKIP_AU1_MASK_FF_RESIDUE: LDRB w11,[x7,#1] //pu1_avail[1] SUB x5,x9,#1 //wd - 1 MOV x4,x10 //move ht to x4 for loop count - mov v3.8b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v3.b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) MOV x12,x0 //pu1_src_cpy = pu1_src PU1_SRC_LOOP_RESIDUE: LD1 {v17.16b},[x12] //pu1_cur_row = vld1q_u8(pu1_src_cpy) LDRB w11,[x2] //load pu1_src_left - mov v21.8b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) + mov v21.b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) EXT v21.16b, v21.16b , v17.16b,#15 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15) cmhi v16.16b, v17.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) @@ -297,7 +297,7 @@ PU1_SRC_LOOP_RESIDUE: SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) LDRB w11,[x12,#16] //pu1_src_cpy[16] - mov v21.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) + mov v21.b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) EXT v21.16b, v17.16b , v21.16b,#1 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1) cmhi v16.16b, v17.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) diff --git a/common/arm64/ihevc_sao_edge_offset_class0_chroma.s b/common/arm64/ihevc_sao_edge_offset_class0_chroma.s index c6be41a..2a1eb7e 100644 --- a/common/arm64/ihevc_sao_edge_offset_class0_chroma.s +++ b/common/arm64/ihevc_sao_edge_offset_class0_chroma.s @@ -141,20 +141,20 @@ WIDTH_LOOP_16: CMP x8,x9 //if(col == wd) BNE AU1_MASK_FF //jump to else part LDRB w12,[x7] //pu1_avail[0] - mov v3.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) - mov v3.8b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 1) + mov v3.b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) + mov v3.b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 1) B SKIP_AU1_MASK_FF //Skip the else part AU1_MASK_FF: MOV x12,#-1 //move -1 to x12 - mov v3.4h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v3.h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) SKIP_AU1_MASK_FF: CMP x8,#16 //If col == 16 BNE SKIP_MASKING_IF_NOT16 //If not skip masking LDRB w12,[x7,#1] //pu1_avail[1] - mov v3.8b[14], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14) - mov v3.8b[15], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v3.b[14], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14) + mov v3.b[15], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_MASKING_IF_NOT16: MOV x12,x0 //pu1_src_cpy = pu1_src @@ -168,7 +168,7 @@ PU1_SRC_LOOP: SUB x5,x9,x8 //wd - col SUB x14,x10,x4 //ht - row - mov v21.4h[7], w11 //vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15) + mov v21.h[7], w11 //vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15) mul x14, x14, x1 //(ht - row) * src_strd LD1 {v30.16b},[x12] //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy) @@ -181,7 
+181,7 @@ PU1_SRC_LOOP: cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) ADD x5,x14,x5 //(ht - row) * src_strd + (wd - col) - mov v28.4h[7], w11 //II vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15) + mov v28.h[7], w11 //II vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15) cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) LDRH w14,[x6,x5] //pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)] @@ -191,7 +191,7 @@ PU1_SRC_LOOP: LDRB w11,[x12,#16] //pu1_src_cpy[16] EXT v28.16b, v28.16b , v30.16b,#14 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14) - mov v21.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) + mov v21.b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) LDRB w11,[x12,#17] //pu1_src_cpy[17] @@ -199,18 +199,18 @@ PU1_SRC_LOOP: STRH w14,[x2],#2 //pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)] ADD x12,x12,x1 - mov v21.8b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) + mov v21.b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) LDRB w11,[x12,#16] //II pu1_src_cpy[16] EXT v21.16b, v19.16b , v21.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2) - mov v28.8b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) + mov v28.b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) LDRB w11,[x12,#17] //II pu1_src_cpy[17] cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) SUB x12,x12,x1 cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) - mov v28.8b[1], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) + mov v28.b[1], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) EXT v28.16b, v30.16b , v28.16b,#2 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2) @@ -328,18 +328,18 @@ WIDTH_RESIDUE: CMP x8,x9 //if(wd_rem == wd) BNE AU1_MASK_FF_RESIDUE //jump to else part LDRB w12,[x7] //pu1_avail[0] - mov v3.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) - mov v3.8b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) + mov v3.b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) + mov v3.b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) B SKIP_AU1_MASK_FF_RESIDUE //Skip the else part AU1_MASK_FF_RESIDUE: MOV x12,#-1 //move -1 to x12 - mov v3.4h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v3.h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) SKIP_AU1_MASK_FF_RESIDUE: LDRB w12,[x7,#1] //pu1_avail[1] - mov v3.8b[6], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) - mov v3.8b[7], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v3.b[6], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v3.b[7], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) MOV x12,x0 //pu1_src_cpy = pu1_src MOV x4,x10 //move ht to x4 for loop count @@ -352,7 +352,7 @@ PU1_SRC_LOOP_RESIDUE: SUB x5,x9,#2 //wd - 2 SUB x14,x10,x4 //(ht - row) - mov v21.4h[7], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) + mov v21.h[7], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) LSL x14,x14,#1 //(ht - row) * 2 LD1 {v30.16b},[x12] //II pu1_cur_row = vld1q_u8(pu1_src_cpy) @@ 
-366,20 +366,20 @@ PU1_SRC_LOOP_RESIDUE: mul x14, x14, x1 //(ht - row) * 2 * src_strd cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) - mov v28.4h[7], w11 //II vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) + mov v28.h[7], w11 //II vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) LDRB w11,[x12,#16] //pu1_src_cpy[16] SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) ADD x5,x14,x5 //(ht - row) * 2 * src_strd + (wd - 2) - mov v21.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) + mov v21.b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) EXT v28.16b, v28.16b , v30.16b,#14 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15) LDRB w11,[x12,#17] //pu1_src_cpy[17] cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) LDRH w14,[x6, x5] //pu1_src_org[(ht - row) * 2* src_strd + (wd - 2)] - mov v21.8b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) + mov v21.b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) cmhi v24.16b, v28.16b , v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) ADD x12,x12,x1 @@ -388,14 +388,14 @@ PU1_SRC_LOOP_RESIDUE: LDRB w11,[x12,#16] //II pu1_src_cpy[16] cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) - mov v28.8b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) + mov v28.b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) LDRB w11,[x12,#17] //II pu1_src_cpy[17] cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) SUB x4,x4,#1 //II Decrement row by 1 SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) - mov v28.8b[1], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) + mov v28.b[1], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) SUB x12,x12,x1 ADD v21.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left) diff --git a/common/arm64/ihevc_sao_edge_offset_class2.s b/common/arm64/ihevc_sao_edge_offset_class2.s index 31852f3..59eeadd 100644 --- a/common/arm64/ihevc_sao_edge_offset_class2.s +++ b/common/arm64/ihevc_sao_edge_offset_class2.s @@ -239,11 +239,11 @@ WIDTH_LOOP_16: MOV x20,#-1 csel x8, x20, x8,NE //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) - mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0) + mov v1.b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0) CMP x6,#16 //if(col == 16) BNE SKIP_AU1_MASK_VAL LDRB w8,[x5,#1] //pu1_avail[1] - mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_AU1_MASK_VAL: LDRB w11,[x5,#2] //pu1_avail[2] @@ -289,7 +289,7 @@ AU1_SRC_LEFT_LOOP: LDRB w4,[x4,#2] //I pu1_avail[2] LDRB w5,[x8,#16] //I pu1_src_cpy[src_strd + 16] - mov v18.8b[0], w5 //I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) + mov v18.b[0], w5 //I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) EXT v18.16b, v16.16b , v18.16b,#1 //I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1) CMP x4,#0 //I @@ -307,7 +307,7 @@ SIGN_UP_CHANGE: csel x4, x20, x4,LT //I MOV x20,#1 csel x4, x20, x4,GT //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]) - mov v17.8b[0], w4 //I sign_up = sign_up = 
vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) + mov v17.b[0], w4 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) SIGN_UP_CHANGE_DONE: cmhi v3.16b, v5.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) @@ -359,7 +359,7 @@ PU1_SRC_LOOP: LDRB w4,[x0] //II pu1_src_cpy[0] LDRB w8,[x11,#16] //III pu1_src_cpy[src_strd + 16] - mov v28.8b[0], w5 //II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) + mov v28.b[0], w5 //II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) SUB x5,x12,x7 //II ht_tmp - row EXT v22.16b, v16.16b , v28.16b,#1 //II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1) @@ -367,7 +367,7 @@ PU1_SRC_LOOP: SUB x5,x5,#1 LDRB w5,[x5] //II load the value - mov v18.8b[0], w8 //III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) + mov v18.b[0], w8 //III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) SUB x7,x7,#1 //II Decrement the ht_tmp loop count by 1 SUBS x4,x4,x5 //II pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row] @@ -389,7 +389,7 @@ PU1_SRC_LOOP: LDRB w5,[x5] //III load the value SUBS x2,x2,x5 //III pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row] - mov v17.8b[0], w4 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) + mov v17.b[0], w4 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) movn x20,#0 csel x2, x20, x2,LT //III @@ -409,7 +409,7 @@ PU1_SRC_LOOP: EXT v17.16b, v17.16b , v17.16b,#15 //II sign_up = vextq_s8(sign_up, sign_up, 15) AND v22.16b, v22.16b , v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) - mov v17.8b[0], w2 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) + mov v17.b[0], w2 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) ADD v18.16b, v0.16b , v17.16b //III edge_idx = vaddq_s8(const_2, sign_up) TBL v24.16b, {v7.16b},v22.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) @@ -471,7 +471,7 @@ PU1_SRC_LOOP: LDRB w5,[x8,#16] //pu1_src_cpy[src_strd + 16] SUB x11,x12,x7 //ht_tmp - row - mov v18.8b[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) + mov v18.b[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) ADD x11,x14,x11 //pu1_src_left_cpy[ht_tmp - row] SUB x11,x11,#1 @@ -488,7 +488,7 @@ PU1_SRC_LOOP: csel x4, x20, x4,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]) cmhi v18.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) - mov v17.8b[0], w4 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) + mov v17.b[0], w4 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) SUB v3.16b, v18.16b , v3.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) ADD v18.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) @@ -556,11 +556,11 @@ WD_16_HT_4_LOOP: MOV x20,#-1 csel x8, x20, x8,NE //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) - mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0) + mov v1.b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0) CMP x6,#16 //if(col == 16) BNE 
SKIP_AU1_MASK_VAL_WD_16_HT_4 LDRB w8,[x5,#1] //pu1_avail[1] - mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_AU1_MASK_VAL_WD_16_HT_4: LDRB w8,[x5,#2] //pu1_avail[2] @@ -605,7 +605,7 @@ PU1_SRC_LOOP_WD_16_HT_4: LD1 {v16.16b},[x8] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) LDRB w5,[x8,#16] //pu1_src_cpy[src_strd + 16] - mov v18.8b[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) + mov v18.b[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) EXT v18.16b, v16.16b , v18.16b,#1 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1) CMP x7,x12 @@ -626,7 +626,7 @@ SIGN_UP_CHANGE_WD_16_HT_4: csel x8, x20, x8,LT MOV x20,#1 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]) - mov v17.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) + mov v17.b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) SIGN_UP_CHANGE_DONE_WD_16_HT_4: cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) @@ -689,10 +689,10 @@ WIDTH_RESIDUE: MOV x20,#-1 csel x8, x20, x8,NE - mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) LDRB w8,[x5,#1] //pu1_avail[1] - mov v1.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) PU1_AVAIL_2_RESIDUE: LDRB w11,[x5,#2] //pu1_avail[2] @@ -737,7 +737,7 @@ PU1_SRC_LOOP_RESIDUE: LD1 {v16.16b},[x8] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) LDRB w8,[x8,#16] //pu1_src_cpy[src_strd + 16] - mov v18.8b[0], w8 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) + mov v18.b[0], w8 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) EXT v18.16b, v16.16b , v18.16b,#1 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1) CMP x7,x12 @@ -759,7 +759,7 @@ SIGN_UP_CHANGE_RESIDUE: csel x8, x20, x8,LT MOV x20,#1 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]) - mov v17.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) + mov v17.b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0) SIGN_UP_CHANGE_DONE_RESIDUE: cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) diff --git a/common/arm64/ihevc_sao_edge_offset_class2_chroma.s b/common/arm64/ihevc_sao_edge_offset_class2_chroma.s index 8e286b4..b430709 100644 --- a/common/arm64/ihevc_sao_edge_offset_class2_chroma.s +++ b/common/arm64/ihevc_sao_edge_offset_class2_chroma.s @@ -322,7 +322,7 @@ PU1_AVAIL_3_LOOP: LDR x2, [x2, #:got_lo12:gi1_table_edge_idx] MOV x6,x7 //move wd to x6 loop_count - movi v1.16b, #0XFF //au1_mask = vdupq_n_s8(-1) + movi v1.16b, #0xFF //au1_mask = vdupq_n_s8(-1) CMP x7,#16 //Compare wd with 16 BLT WIDTH_RESIDUE //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case @@ -338,15 +338,15 @@ WIDTH_LOOP_16: MOV x20,#-1 csel x8, x20, x8,NE - mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) CMP x6,#16 //if(col == 16) - mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) + 
mov v1.b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) BNE SKIP_AU1_MASK_VAL LDRB w8,[x5,#1] //pu1_avail[1] - mov v1.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) - mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) + mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) SKIP_AU1_MASK_VAL: LDRB w9,[x5,#2] //pu1_avail[2] @@ -400,7 +400,7 @@ AU1_SRC_LEFT_LOOP: LDRH w5,[x8] //I pu1_src_cpy[src_strd + 16] mov x10, x21 //I Loads pu1_avail - mov v18.4h[0], w5 //I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) + mov v18.h[0], w5 //I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) LDRB w10,[x10,#2] //I pu1_avail[2] CMP x10,#0 //I @@ -430,13 +430,13 @@ AU1_SRC_LEFT_LOOP: csel x8, x20, x8,GT //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) CMP x4,#0 //I - mov v17.8b[0], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) + mov v17.b[0], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) movn x20,#0 csel x4, x20, x4,LT //I MOV x20,#1 csel x4, x20, x4,GT //I SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) - mov v17.8b[1], w4 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) + mov v17.b[1], w4 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) SIGN_UP_CHANGE_DONE: LD1 {v30.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) @@ -499,7 +499,7 @@ PU1_SRC_LOOP: LDRH w5,[x8] //II pu1_src_cpy[src_strd + 16] ADD x11,x11,#16 //III - mov v28.4h[0], w5 //II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) + mov v28.h[0], w5 //II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) LDRH w4,[x11] //III pu1_src_cpy[src_strd + 16] LDRB w8,[x0,x1] //II pu1_src_cpy[0] @@ -507,7 +507,7 @@ PU1_SRC_LOOP: SUB x5,x12,x7 //II ht_tmp - row LSL x5,x5,#1 //II (ht_tmp - row) * 2 - mov v18.4h[0], w4 //III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) + mov v18.h[0], w4 //III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) ADD x9,x14,x5 //II pu1_src_left_cpy[(ht_tmp - row) * 2] sub x13,x9,#2 @@ -527,7 +527,7 @@ PU1_SRC_LOOP: sub x13,x9,#1 LDRB w5,[x13] //II load the value - mov v17.8b[0], w8 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) + mov v17.b[0], w8 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) SUB x7,x7,#1 //II Decrement the ht_tmp loop count by 1 SUB x11,x11,x5 //II pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1] @@ -545,7 +545,7 @@ PU1_SRC_LOOP: SUB x5,x12,x7 //III ht_tmp - row ADD x10,x0,x1 - mov v17.8b[1], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) + mov v17.b[1], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) LSL x5,x5,#1 //III (ht_tmp - row) * 2 ADD x9,x14,x5 //III pu1_src_left_cpy[(ht_tmp - row) * 2] @@ -579,7 +579,7 @@ PU1_SRC_LOOP: UZP1 v31.8b, v26.8b, v27.8b UZP2 v27.8b, v26.8b, v27.8b //II mov v26.8b,v31.8b - 
mov v17.8b[0], w4 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ mov v17.b[0], w4 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
 movn x20,#0
 csel x10, x20, x10,LT //III
@@ -592,7 +592,7 @@ PU1_SRC_LOOP:
 TBL v25.8b, {v7.16b},v27.8b //II
 SUB v22.16b, v22.16b , v20.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
- mov v17.8b[1], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ mov v17.b[1], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
 ZIP1 v31.8b, v24.8b, v25.8b
 ZIP2 v25.8b, v24.8b, v25.8b //II
 mov v24.8b,v31.8b
@@ -668,7 +668,7 @@ PU1_SRC_LOOP:
 LDRH w5,[x8] //pu1_src_cpy[src_strd + 16]
 LSL x4,x4,#1 //(ht_tmp - row) * 2
- mov v18.4h[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ mov v18.h[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
 ADD x9,x14,x4 //pu1_src_left_cpy[(ht_tmp - row) * 2]
 sub x13,x9,#2
@@ -686,7 +686,7 @@ PU1_SRC_LOOP:
 LD1 {v30.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
 LDRB w11,[x0,#1] //pu1_src_cpy[0]
- mov v17.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ mov v17.b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
 sub x13,x9,#1
 LDRB w5,[x13] //load the value
@@ -700,7 +700,7 @@ PU1_SRC_LOOP:
 MOV x20,#1
 csel x4, x20, x4,GT //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
- mov v17.8b[1], w4 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ mov v17.b[1], w4 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
 SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
 ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
@@ -771,14 +771,14 @@ WD_16_HT_4_LOOP:
 MOV x20,#-1
 csel x8, x20, x8,NE
- mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
- mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
 CMP x6,#16 //if(col == 16)
 BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
 LDRB w8,[x5,#1] //pu1_avail[1]
- mov v1.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
- mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
 SKIP_AU1_MASK_VAL_WD_16_HT_4:
 LDRB w8,[x5,#2] //pu1_avail[2]
@@ -828,7 +828,7 @@ PU1_SRC_LOOP_WD_16_HT_4:
 ADD x8,x8,#16
 LDRH w5,[x8] //pu1_src_cpy[src_strd + 16]
- mov v18.4h[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ mov v18.h[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
 EXT v18.16b, v16.16b , v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
 CMP x7,x12
@@ -851,7 +851,7 @@ SIGN_UP_CHANGE_WD_16_HT_4:
 csel x8, x20, x8,LT
 MOV x20,#1
 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
- mov v17.8b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ mov v17.b[0], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
 LDRB w8,[x0,#1] //pu1_src_cpy[0]
 sub x13,x9,#1
@@ -862,7 +862,7 @@ SIGN_UP_CHANGE_WD_16_HT_4:
 csel x8, x20, x8,LT
 MOV x20,#1
 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
- mov v17.8b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ mov v17.b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
 SIGN_UP_CHANGE_DONE_WD_16_HT_4:
 cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
@@ -936,12 +936,12 @@ WIDTH_RESIDUE:
 MOV x20,#-1
 csel x8, x20, x8,NE
- mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
- mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
 LDRB w8,[x5,#1] //pu1_avail[1]
- mov v1.8b[6], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
- mov v1.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[6], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
 LDRB w8,[x5,#2] //pu1_avail[2]
 CMP x8,#0
@@ -986,7 +986,7 @@ PU1_SRC_LOOP_RESIDUE:
 ADD x8,x8,#16
 LDRH w5,[x8] //pu1_src_cpy[src_strd + 16]
- mov v18.4h[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ mov v18.h[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
 EXT v18.16b, v16.16b , v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
 CMP x7,x12
@@ -1009,7 +1009,7 @@ SIGN_UP_CHANGE_RESIDUE:
 csel x8, x20, x8,LT
 MOV x20,#1
 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
- mov v17.8b[0], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ mov v17.b[0], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
 LDRB w8,[x0,#1] //pu1_src_cpy[0]
 sub x13,x9,#1
@@ -1020,7 +1020,7 @@ SIGN_UP_CHANGE_RESIDUE:
 csel x8, x20, x8,LT
 MOV x20,#1
 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
- mov v17.8b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ mov v17.b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
 SIGN_UP_CHANGE_DONE_RESIDUE:
 cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
diff --git a/common/arm64/ihevc_sao_edge_offset_class3.s b/common/arm64/ihevc_sao_edge_offset_class3.s
index f393753..9d4f26a 100644
--- a/common/arm64/ihevc_sao_edge_offset_class3.s
+++ b/common/arm64/ihevc_sao_edge_offset_class3.s
@@ -247,12 +247,12 @@ WIDTH_LOOP_16:
 csel w8,w20,w8,EQ
 MOV x20,#-1
 csel x8, x20, x8,NE
- mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
 CMP x6,#16 //if(col == 16)
 BNE SKIP_AU1_MASK_VAL
 LDRB w8,[x5,#1] //pu1_avail[1]
- mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
 SKIP_AU1_MASK_VAL:
 LDRB w8,[x5,#2] //pu1_avail[2]
@@ -302,7 +302,7 @@ AU1_SRC_LEFT_LOOP:
 LDRB w8,[x8]
 MOV x5,x23 //I Loads pu1_avail
- mov v18.16b[15], w8 //I vsetq_lane_u8
+ mov v18.b[15], w8 //I vsetq_lane_u8
 LDRB w5,[x5,#2] //I pu1_avail[2]
 EXT v18.16b, v18.16b , v16.16b,#15 //I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
@@ -320,7 +320,7 @@ SIGN_UP_CHANGE:
 csel x8, x20, x8,LT //I
 MOV x20,#1
 csel x8, x20, x8,GT //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
- mov v17.16b[15], w8 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ mov v17.b[15], w8 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
 SIGN_UP_CHANGE_DONE:
 cmhi v3.16b, v5.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
@@ -373,7 +373,7 @@ PU1_SRC_LOOP:
 LDRB w8,[x8,#1]
 LDRB w4,[x0,#16] //II load the value
- mov v18.16b[15], w8 //II vsetq_lane_u8
+ mov v18.b[15], w8 //II vsetq_lane_u8
 SUB x11,x11,x4 //II pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
 CMP x11,#0 //II
@@ -387,7 +387,7 @@ PU1_SRC_LOOP:
 csel x11, x20, x11,GT //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
 ADD x8,x14,x5 //III pu1_src_left_cpy[ht_tmp - row]
- mov v17.8b[15], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ mov v17.b[15], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
 CMP x7,#1 //III
 BNE NEXT_ROW_ELSE_2 //III
@@ -412,7 +412,7 @@ NEXT_ROW_ELSE_2:
 movn x20,#0
 csel x2, x20, x2,LT //III
- mov v18.16b[15], w8 //III vsetq_lane_u8
+ mov v18.b[15], w8 //III vsetq_lane_u8
 MOV x20,#1
 csel x2, x20, x2,GT //III SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
@@ -428,7 +428,7 @@ NEXT_ROW_ELSE_2:
 TBL v26.16b, {v6.16b},v26.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
 cmhi v3.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
- mov v17.16b[15], w2 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ mov v17.b[15], w2 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
 // TBL v27.8b, {v6.16b},v27.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
 cmhi v18.16b, v18.16b , v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
@@ -506,7 +506,7 @@ NEXT_ROW_ELSE_3:
 NEXT_ROW_POINTER_ASSIGNED_3:
 LDRB w11,[x4,#15] //pu1_src_cpy[15]
- mov v18.16b[15], w8 //vsetq_lane_u8
+ mov v18.b[15], w8 //vsetq_lane_u8
 SUB x8,x11,x5 //pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
 CMP x8,#0
@@ -521,7 +521,7 @@ NEXT_ROW_POINTER_ASSIGNED_3:
 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
 cmhi v26.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
- mov v17.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ mov v17.b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
 SUB v24.16b, v26.16b , v24.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
 Uxtl v20.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
@@ -582,12 +582,12 @@ WD_16_HT_4_LOOP:
 csel w8,w20,w8,EQ
 MOV x20,#-1
 csel x8, x20, x8,NE
- mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
 CMP x6,#16 //if(col == 16)
 BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
 LDRB w8,[x5,#1] //pu1_avail[1]
- mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
 SKIP_AU1_MASK_VAL_WD_16_HT_4:
 LDRB w8,[x5,#2] //pu1_avail[2]
@@ -643,7 +643,7 @@ NEXT_ROW_ELSE_WD_16_HT_4:
 LDRB w8,[x8]
 NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
- mov v18.16b[15], w8 //vsetq_lane_u8
+ mov v18.b[15], w8 //vsetq_lane_u8
 EXT v18.16b, v18.16b , v16.16b,#15 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
 CMP x7,x12
@@ -664,7 +664,7 @@ SIGN_UP_CHANGE_WD_16_HT_4:
 csel x8, x20, x8,LT
 MOV x20,#1
 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
- mov v17.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ mov v17.b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
 SIGN_UP_CHANGE_DONE_WD_16_HT_4:
 cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
@@ -725,10 +725,10 @@ WIDTH_RESIDUE:
 MOV x20,#-1
 csel x8, x20, x8,NE
- mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
 LDRB w8,[x5,#1] //pu1_avail[1]
- mov v1.8b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[7], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
 PU1_AVAIL_2_RESIDUE:
 LDRB w8,[x5,#2] //pu1_avail[2]
@@ -783,7 +783,7 @@ NEXT_ROW_ELSE_RESIDUE:
 LDRB w8,[x8]
 NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
- mov v18.16b[15], w8 //vsetq_lane_u8
+ mov v18.b[15], w8 //vsetq_lane_u8
 EXT v18.16b, v18.16b , v16.16b,#15 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
 CMP x7,x12
@@ -804,7 +804,7 @@ SIGN_UP_CHANGE_RESIDUE:
 csel x8, x20, x8,LT
 MOV x20,#1
 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
- mov v17.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ mov v17.b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
 SIGN_UP_CHANGE_DONE_RESIDUE:
 cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
diff --git a/common/arm64/ihevc_sao_edge_offset_class3_chroma.s b/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
index 5c444c0..8e93110 100644
--- a/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
+++ b/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
@@ -328,16 +328,16 @@ WIDTH_LOOP_16:
 MOV x20,#-1
 csel x8, x20, x8,NE
- mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
 LDRB w11,[x5,#2] //pu1_avail[2]
 CMP x6,#16 //if(col == 16)
- mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
 BNE SKIP_AU1_MASK_VAL
 LDRB w8,[x5,#1] //pu1_avail[1]
- mov v1.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
- mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
 SKIP_AU1_MASK_VAL:
 CMP x11,#0
@@ -389,7 +389,7 @@ AU1_SRC_LEFT_LOOP:
 ADD x8,x14,x5,LSL #1 //I pu1_src_left_cpy[(ht_tmp - row) * 2]
 LDRH w5,[x8,#2] //I
- mov v18.4h[7], w5 //I vsetq_lane_u8
+ mov v18.h[7], w5 //I vsetq_lane_u8
 mov x11, x21 //I Loads pu1_avail
 LDRB w11,[x11,#2] //I pu1_avail[2]
@@ -418,11 +418,11 @@ AU1_SRC_LEFT_LOOP:
 movn x20,#0
 csel x9, x20, x9,LT //I
- mov v17.16b[14], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
+ mov v17.b[14], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
 MOV x20,#1
 csel x9, x20, x9,GT //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
- mov v17.16b[15], w9 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
+ mov v17.b[15], w9 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
 SIGN_UP_CHANGE_DONE:
 LD1 {v28.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
@@ -483,7 +483,7 @@ PU1_SRC_LOOP:
 LDRB w10,[x4,#14] //II pu1_src_cpy[14]
 LDRB w8,[x4,#15] //II pu1_src_cpy[15]
- mov v28.4h[7], w9 //II vsetq_lane_u8
+ mov v28.h[7], w9 //II vsetq_lane_u8
 ADD x4,x11,x1 //III *pu1_src + src_strd
 LDRB w5,[x0,#17] //II load the value pu1_src_cpy[17 - src_strd]
@@ -507,14 +507,14 @@ PU1_SRC_LOOP:
 csel x10, x20, x10,GT //II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
 CMP x8,#0 //II
- mov v17.8b[14], w10 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
+ mov v17.b[14], w10 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
 movn x20,#0
 csel x8, x20, x8,LT //II
 MOV x20,#1
 csel x8, x20, x8,GT //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
 SUB x10,x12,x7 //III ht_tmp - row
- mov v17.8b[15], w8 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
+ mov v17.b[15], w8 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
 ADD x11,x14,x10,LSL #1 //III pu1_src_left_cpy[(ht_tmp - row) * 2]
 CMP x7,#1 //III
@@ -533,7 +533,7 @@ NEXT_ROW_POINTER_ASSIGNED_2:
 ADD x11,x0,x1 //III
 LDRB w9,[x11,#14] //III pu1_src_cpy[14]
- mov v18.4h[7], w5 //III vsetq_lane_u8
+ mov v18.h[7], w5 //III vsetq_lane_u8
 LDRB w8,[x11,#15] //III pu1_src_cpy[15]
 LDRB w11,[x0,#16] //III load the value pu1_src_cpy[16 - src_strd]
@@ -565,11 +565,11 @@ NEXT_ROW_POINTER_ASSIGNED_2:
 //TBL v27.8b, {v21.16b},v27.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
 cmhi v22.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
- mov v17.16b[14], w9 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
+ mov v17.b[14], w9 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
 AND v26.16b, v26.16b , v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
 mov v27.d[0],v26.d[1]
- mov v17.16b[15], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
+ mov v17.b[15], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
 UZP1 v31.8b, v26.8b, v27.8b
 UZP2 v27.8b, v26.8b, v27.8b //II
 mov v26.8b,v31.8b
@@ -668,7 +668,7 @@ NEXT_ROW_POINTER_ASSIGNED_3:
 LDRB w8,[x0,#14] //pu1_src_cpy[14]
 SUB x8,x8,x4 //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
- mov v18.4h[7], w5 //vsetq_lane_u8
+ mov v18.h[7], w5 //vsetq_lane_u8
 LDRB w10,[x0,#15] //pu1_src_cpy[15]
 CMP x8,#0
@@ -682,13 +682,13 @@ NEXT_ROW_POINTER_ASSIGNED_3:
 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
 CMP x10,#0
- mov v17.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
+ mov v17.b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
 movn x20,#0
 csel x10, x20, x10,LT
 MOV x20,#1
 csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
- mov v17.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
+ mov v17.b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
 cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
 cmhi v22.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
@@ -762,15 +762,15 @@ WD_16_HT_4_LOOP:
 csel w8,w20,w8,EQ
 MOV x20,#-1
 csel x8, x20, x8,NE
- mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
 CMP x6,#16 //if(col == 16)
- mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
 BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
 LDRB w8,[x5,#1] //pu1_avail[1]
- mov v1.16b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
- mov v1.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
 SKIP_AU1_MASK_VAL_WD_16_HT_4:
 LDRB w11,[x5,#2] //pu1_avail[2]
@@ -834,7 +834,7 @@ PU1_SRC_LOOP_WD_16_HT_4:
 NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
 LDRH w5,[x8]
- mov v18.8h[7], w5 //vsetq_lane_u8
+ mov v18.h[7], w5 //vsetq_lane_u8
 EXT v18.16b, v18.16b , v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
 CMP x7,x12
@@ -864,13 +864,13 @@ SIGN_UP_CHANGE_WD_16_HT_4:
 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
 CMP x10,#0
- mov v17.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
+ mov v17.b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
 movn x20,#0
 csel x10, x20, x10,LT
 MOV x20,#1
 csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
- mov v17.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
+ mov v17.b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
 SIGN_UP_CHANGE_DONE_WD_16_HT_4:
 LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
@@ -949,20 +949,20 @@ WIDTH_RESIDUE:
 LDRB w11,[x5,#1] //pu1_avail[1]
 LDRB w9,[x5,#2] //pu1_avail[2]
- mov v1.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
 CMP x9,#0
 SUB x20,x0,x1 //pu1_src - src_strd
 csel x10, x20, x10,EQ
- mov v1.8b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ mov v1.b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
 csel x10, x3, x10,NE
 ADD x10,x10,#2 //pu1_src - src_strd + 2
- mov v1.8b[6], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[6], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
 ADD x5,sp,#0x4B //*au1_src_left_tmp
 mov w4, w25 //Loads ht
- mov v1.8b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ mov v1.b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
 mov w7, w24 //Loads wd
 mov x8, x26 //Loads *pu1_src
@@ -1015,10 +1015,10 @@ NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
 LDRB w5,[x8]
 LDRB w8,[x8,#1]
- mov v18.16b[14], w5 //vsetq_lane_u8
+ mov v18.b[14], w5 //vsetq_lane_u8
 CMP x7,x12
- mov v18.16b[15], w8 //vsetq_lane_u8
+ mov v18.b[15], w8 //vsetq_lane_u8
 EXT v18.16b, v18.16b , v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
 BLT SIGN_UP_CHANGE_RESIDUE
@@ -1047,13 +1047,13 @@ SIGN_UP_CHANGE_RESIDUE:
 csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
 CMP x10,#0
- mov v17.16b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
+ mov v17.b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
 movn x20,#0
 csel x10, x20, x10,LT
 MOV x20,#1
 csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
- mov v17.16b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
+ mov v17.b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
 SIGN_UP_CHANGE_DONE_RESIDUE:
 LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
diff --git a/common/arm64/ihevc_weighted_pred_bi.s b/common/arm64/ihevc_weighted_pred_bi.s
index c0508d8..299b042 100644
--- a/common/arm64/ihevc_weighted_pred_bi.s
+++ b/common/arm64/ihevc_weighted_pred_bi.s
@@ -219,28 +219,28 @@ core_loop:
 ld1 {v0.4h},[x0],#8 //load and increment the pi2_src1
 add x10,x2,x5 //pu1_dst_tmp = pu1_dst + dst_strd
 ld1 {v1.4h},[x1],#8 //load and increment the pi2_src2
- smull v4.4s, v0.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0)
+ smull v4.4s, v0.4h, v7.h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0)
 ld1 {v2.4h},[x6],x3 //load and increment the pi2_src_tmp1 ii iteration
- smull v5.4s, v1.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
+ smull v5.4s, v1.4h, v7.h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
 ld1 {v3.4h},[x8],x4 //load and increment the pi2_src_tmp1 ii iteration
 add v4.4s, v4.4s , v5.4s //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)
 ld1 {v0.4h},[x6],x3 //load and increment the pi2_src1 iii iteration
- smull v6.4s, v2.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration
+ smull v6.4s, v2.4h, v7.h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration
 ld1 {v1.4h},[x8],x4 //load and increment the pi2_src2 iii iteration
 add v4.4s, v4.4s , v30.4s //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
- smull v19.4s, v0.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration
+ smull v19.4s, v0.4h, v7.h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration
 ld1 {v2.4h},[x6],x3 //load and increment the pi2_src_tmp1 iv iteration
- smull v17.4s, v3.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
+ smull v17.4s, v3.4h, v7.h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
 sshl v4.4s,v4.4s,v28.4s //vshlq_s32(i4_tmp1_t1, tmp_shift_t)
 ld1 {v3.4h},[x8],x4 //load and increment the pi2_src_tmp1 iv iteration
 add v6.4s, v6.4s , v17.4s //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration
 sqxtun v4.4h, v4.4s //vqmovun_s32(sto_res_tmp1)
- smull v16.4s, v1.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration
+ smull v16.4s, v1.4h, v7.h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration
 add v6.4s, v6.4s , v30.4s //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
 //mov v5, v4 //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
@@ -248,13 +248,13 @@ core_loop:
 sshl v6.4s,v6.4s,v28.4s //vshl.s32 q5,q5,q14 //vshlq_s32(i4_tmp2_t1, tmp_shift_t) ii iteration
- smull v18.4s, v2.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration
+ smull v18.4s, v2.4h, v7.h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration
 uqxtn v4.8b,v4.8h //vqmovn.u16 d4,q2 //vqmovn_u16(sto_res_tmp3)
 add v19.4s, v19.4s , v30.4s //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
 sqxtun v6.4h, v6.4s //vqmovun_s32(sto_res_tmp1) ii iteration
- smull v20.4s, v3.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration
+ smull v20.4s, v3.4h, v7.h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration
 sshl v19.4s,v19.4s,v28.4s //vshl.s32 q7,q7,q14 //vshlq_s32(i4_tmp1_t1, tmp_shift_t) iii iteration
diff --git a/common/arm64/ihevc_weighted_pred_uni.s b/common/arm64/ihevc_weighted_pred_uni.s
index 5586679..c6dee6f 100644
--- a/common/arm64/ihevc_weighted_pred_uni.s
+++ b/common/arm64/ihevc_weighted_pred_uni.s
@@ -151,7 +151,7 @@ ihevc_weighted_pred_uni_av8:
 add x10,x10,x22 //lvl_shift * wgt0 + (off0 << shift)
 mov x9,x21 //load wt
 sub x12,x6,#1
- mov v0.4h[0], w4 //moved for scalar multiplication
+ mov v0.h[0], w4 //moved for scalar multiplication
 lsl x2,x2,#1
 dup v28.4s,w6 //vmovq_n_s32(tmp_shift)
 lsl x22,x11,x12
@@ -172,19 +172,19 @@ core_loop:
 add x6,x1,x3 //pu1_dst_tmp = pu1_dst + dst_strd
 ld1 {v1.4h},[x0],#8 //load and increment the pi2_src
 ld1 {v2.4h},[x5],x2 //load and increment the pi2_src_tmp ii iteration
- smull v4.4s, v1.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0)
+ smull v4.4s, v1.4h, v0.h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0)
 add v4.4s, v4.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t)
 ld1 {v3.4h},[x5],x2 //load and increment the pi2_src iii iteration
- smull v6.4s, v2.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) ii iteration
+ smull v6.4s, v2.4h, v0.h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) ii iteration
 ld1 {v5.4h},[x5],x2 //load and increment the pi2_src_tmp iv iteration
 sshl v4.4s,v4.4s,v28.4s //vshl.s32 q2,q2,q14 //vshlq_s32(i4_tmp1_t, tmp_shift_t)
 add v6.4s, v6.4s , v30.4s //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) ii iteration
- smull v7.4s, v3.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration
+ smull v7.4s, v3.4h, v0.h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration
 sqxtun v4.4h, v4.4s //vqmovun_s32(sto_res_tmp1)
 add v7.4s, v7.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration
@@ -193,7 +193,7 @@ core_loop:
 sshl v6.4s,v6.4s,v28.4s //vshl.s32 q3,q3,q14 //vshlq_s32(i4_tmp2_t, tmp_shift_t) ii iteration
- smull v16.4s, v5.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration
+ smull v16.4s, v5.4h, v0.h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration
 uqxtn v4.8b, v4.8h //vqmovn_u16(sto_res_tmp3)
 sshl v7.4s,v7.4s,v28.4s
diff --git a/decoder.arm64.mk b/decoder.arm64.mk
index 316cc26..2e6ec23 100644
--- a/decoder.arm64.mk
+++ b/decoder.arm64.mk
@@ -91,3 +91,10 @@ libhevcd_cflags_arm64 += -DDEFAULT_ARCH=D_ARCH_ARMV8_GENERIC
 LOCAL_SRC_FILES_arm64 += $(libhevcd_srcs_c_arm64) $(libhevcd_srcs_asm_arm64)
 LOCAL_C_INCLUDES_arm64 += $(libhevcd_inc_dir_arm64)
 LOCAL_CFLAGS_arm64 += $(libhevcd_cflags_arm64)
+
+# Clang doesn't pass -I flags to the assembler when building a .s file.
+# We need to tell it to pass them to the assembler specifically (doesn't hurt
+# with gcc either, and may actually help future gcc versions if they decide
+# to start making a difference between assembly and C includes).
+comma := ,
+LOCAL_ASFLAGS_arm64 += $(addprefix -Wa$(comma)-I,$(libhevcd_inc_dir_arm64))
diff --git a/decoder.mips64.mk b/decoder.mips64.mk
index 5ac515e..81b5852 100644
--- a/decoder.mips64.mk
+++ b/decoder.mips64.mk
@@ -1,8 +1,8 @@
-libhevcd_inc_dir_mips += $(LOCAL_PATH)/decoder/mips
-libhevcd_inc_dir_mips += $(LOCAL_PATH)/common/mips
+libhevcd_inc_dir_mips64 += $(LOCAL_PATH)/decoder/mips
+libhevcd_inc_dir_mips64 += $(LOCAL_PATH)/common/mips

-libhevcd_srcs_c_mips += decoder/mips/ihevcd_function_selector.c
-libhevcd_srcs_c_mips += decoder/mips/ihevcd_function_selector_mips_generic.c
+libhevcd_srcs_c_mips64 += decoder/mips/ihevcd_function_selector.c
+libhevcd_srcs_c_mips64 += decoder/mips/ihevcd_function_selector_mips_generic.c

 LOCAL_SRC_FILES_mips64 += $(libhevcd_srcs_c_mips64) $(libhevcd_srcs_asm_mips64)
 LOCAL_C_INCLUDES_mips64 += $(libhevcd_inc_dir_mips64)
diff --git a/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
index a6041f5..026b65f 100644
--- a/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
+++ b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
@@ -121,16 +121,16 @@ ihevcd_fmt_conv_420sp_to_rgba8888_av8:
 ///* can be loaded from a defined const type */
 mov x10,#0x3311
- mov v0.4h[0], w10 ////C1
+ mov v0.h[0], w10 ////C1
 mov x10,#0xF379
- mov v0.4h[1], w10 ////C2
+ mov v0.h[1], w10 ////C2
 mov x10,#0xE5F8
- mov v0.4h[2], w10 ////C3
+ mov v0.h[2], w10 ////C3
 mov x10,#0x4092
- mov v0.4h[3], w10 ////C4
+ mov v0.h[3], w10 ////C4
 ////LOAD CONSTANT 128 INTO A CORTEX REGISTER
 MOV x10,#128
@@ -197,16 +197,16 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
 prfm PLDL1KEEP,[x1]
 ////NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS
- sMULL v5.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
- sMULL2 v7.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
+ sMULL v5.4s, v4.4h, v0.h[3] ////(U-128)*C4 FOR B
+ sMULL2 v7.4s, v4.8h, v0.h[3] ////(U-128)*C4 FOR B

- sMULL v20.4s, v6.4h, v0.4h[0] ////(V-128)*C1 FOR R
- sMULL2 v22.4s, v6.8h, v0.4h[0] ////(V-128)*C1 FOR R
+ sMULL v20.4s, v6.4h, v0.h[0] ////(V-128)*C1 FOR R
+ sMULL2 v22.4s, v6.8h, v0.h[0] ////(V-128)*C1 FOR R

- sMULL v12.4s, v4.4h, v0.4h[1] ////(U-128)*C2 FOR G
- sMLAL v12.4s, v6.4h, v0.4h[2] ////Q6 = (U-128)*C2 + (V-128)*C3
- sMULL2 v14.4s, v4.8h, v0.4h[1] ////(U-128)*C2 FOR G
- sMLAL2 v14.4s, v6.8h, v0.4h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
+ sMULL v12.4s, v4.4h, v0.h[1] ////(U-128)*C2 FOR G
+ sMLAL v12.4s, v6.4h, v0.h[2] ////Q6 = (U-128)*C2 + (V-128)*C3
+ sMULL2 v14.4s, v4.8h, v0.h[1] ////(U-128)*C2 FOR G
+ sMLAL2 v14.4s, v6.8h, v0.h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
 ////NARROW RIGHT SHIFT BY 13 FOR R&B
 sqshrn v5.4h, v5.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
@@ -360,16 +360,16 @@ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
 ////NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS
- sMULL v5.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
- sMULL2 v7.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
+ sMULL v5.4s, v4.4h, v0.h[3] ////(U-128)*C4 FOR B
+ sMULL2 v7.4s, v4.8h, v0.h[3] ////(U-128)*C4 FOR B
- sMULL v20.4s, v6.4h, v0.4h[0] ////(V-128)*C1 FOR R
- sMULL2 v22.4s, v6.8h, v0.4h[0] ////(V-128)*C1 FOR R
+ sMULL v20.4s, v6.4h, v0.h[0] ////(V-128)*C1 FOR R
+ sMULL2 v22.4s, v6.8h, v0.h[0] ////(V-128)*C1 FOR R
- sMULL v12.4s, v4.4h, v0.4h[1] ////(U-128)*C2 FOR G
- sMLAL v12.4s, v6.4h, v0.4h[2] ////Q6 = (U-128)*C2 + (V-128)*C3
- sMULL2 v14.4s, v4.8h, v0.4h[1] ////(U-128)*C2 FOR G
- sMLAL2 v14.4s, v6.8h, v0.4h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
+ sMULL v12.4s, v4.4h, v0.h[1] ////(U-128)*C2 FOR G
+ sMLAL v12.4s, v6.4h, v0.h[2] ////Q6 = (U-128)*C2 + (V-128)*C3
+ sMULL2 v14.4s, v4.8h, v0.h[1] ////(U-128)*C2 FOR G
+ sMLAL2 v14.4s, v6.8h, v0.h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
 ////NARROW RIGHT SHIFT BY 13 FOR R&B
 sqshrn v5.4h, v5.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES