Diffstat (limited to 'common/arm64/ihevc_itrans_recon_16x16.s')
-rw-r--r--  common/arm64/ihevc_itrans_recon_16x16.s  1240
 1 file changed, 1240 insertions, 0 deletions
diff --git a/common/arm64/ihevc_itrans_recon_16x16.s b/common/arm64/ihevc_itrans_recon_16x16.s
new file mode 100644
index 0000000..90df840
--- /dev/null
+++ b/common/arm64/ihevc_itrans_recon_16x16.s
@@ -0,0 +1,1240 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+// *******************************************************************************
+// * @file
+// * ihevc_itrans_recon_16x16.s
+// *
+// * @brief
+// * contains function definitions for single stage inverse transform
+// *
+// * @author
+// * anand s
+// *
+// * @par list of functions:
+// * - ihevc_itrans_recon_16x16()
+// *
+// * @remarks
+// * none
+// *
+// *******************************************************************************
+//*/
+
+///**
+// *******************************************************************************
+// *
+// * @brief
+// * this function performs inverse transform and reconstruction for a 16x16
+// * input block
+// *
+// * @par description:
+// * performs the inverse transform, adds the prediction data, and clips the
+// * output to 8 bits
+// *
+// * @param[in] pi2_src
+// * input 16x16 coefficients
+// *
+// * @param[in] pi2_tmp
+// * temporary 16x16 buffer for storing inverse transform 1st stage output
+// *
+// * @param[in] pu1_pred
+// * prediction 16x16 block
+// *
+// * @param[out] pu1_dst
+// * output 16x16 block
+// *
+// * @param[in] src_strd
+// * input stride
+// *
+// * @param[in] pred_strd
+// * prediction stride
+// *
+// * @param[in] dst_strd
+// * output stride
+// *
+// * @param[in] x12
+// * zero columns in pi2_src
+// *
+// * @param[in] x11
+// * zero rows in pi2_src
+// *
+// * @returns void
+// *
+// * @remarks
+// * none
+// *
+// *******************************************************************************
+// */
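+
+///**
+// * for reference, a rough c model of the two-stage flow implemented below
+// * (itrans16_1d and clip_u8 are hypothetical helper names; the neon code
+// * vectorizes and interleaves these steps rather than following this shape):
+// *
+// *     word16 out[16];
+// *     // stage 1: 1d inverse transform down each column, output to pi2_tmp
+// *     for(i = 0; i < 16; i++)
+// *         itrans16_1d(pi2_src + i, src_strd, pi2_tmp + 16 * i, shift_stage1_idct);
+// *     // stage 2: 1d inverse transform along each row, add pred, clip
+// *     for(i = 0; i < 16; i++)
+// *     {
+// *         itrans16_1d(pi2_tmp + i, 16, out, shift_stage2_idct);
+// *         for(j = 0; j < 16; j++)
+// *             pu1_dst[i * dst_strd + j] =
+// *                 clip_u8(out[j] + pu1_pred[i * pred_strd + j]);
+// *     }
+// */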
+
+//void ihevc_itrans_recon_16x16(word16 *pi2_src,
+// word16 *pi2_tmp,
+// uword8 *pu1_pred,
+// uword8 *pu1_dst,
+// word32 src_strd,
+// word32 pred_strd,
+// word32 dst_strd,
+// word32 x12,
+// word32 x11 )
+
+//**************variables vs registers*************************
+// x0 => *pi2_src
+// x1 => *pi2_tmp
+// x2 => *pu1_pred
+// x3 => *pu1_dst
+// x4 => src_strd
+// x5 => pred_strd
+// x6 => dst_strd
+// x7 => x12 (zero_cols)
+// [sp] => x11 (zero_rows)
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+
+
+
+.set shift_stage1_idct , 7
+.set shift_stage2_idct , 12
+//#define zero_cols x12
+//#define zero_rows x11
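+
+// note: the sqrshrn instructions below apply these shifts with rounding;
+// a scalar sketch of the same operation is:
+//     out = (word16)((sum + (1 << (shift - 1))) >> shift);
+// with shift = shift_stage1_idct (7) after stage 1 and
+// shift_stage2_idct (12) after stage 2.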
+.globl ihevc_itrans_recon_16x16_av8
+
+.extern g_ai2_ihevc_trans_16_transpose
+
+.type ihevc_itrans_recon_16x16_av8, %function
+
+ihevc_itrans_recon_16x16_av8:
+
+ ldr w11, [sp]
+ // stmfd sp!,{x4-x12,x14}
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+ stp x5, x6,[sp,#-16]!
+// add sp,sp,#40
+
+
+
+// ldr x8,[sp,#4] @ prediction stride
+// ldr x7,[sp,#8] @ destination stride
+ mov x6, x4 // src stride
+    mov x12, x7 // zero_cols
+
+
+
+ adrp x14, :got:g_ai2_ihevc_trans_16_transpose
+ ldr x14, [x14, #:got_lo12:g_ai2_ihevc_trans_16_transpose]
+    ld1 {v0.4h, v1.4h, v2.4h, v3.4h},[x14] //// d0-d3 hold the transform constant data
+ mov x7,#0xffff
+ and x12,x12,x7
+ and x11,x11,x7
+    lsl x6, x6, #1 // src_strd * sizeof(word16)
+ add x9,x0,x6, lsl #1 // 2 rows
+
+ add x10,x6,x6, lsl #1 // 3 rows
+ add x5,x6,x6,lsl #2
+ mov x7,#0xfff0
+
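+// dispatch sketch (illustrative c): a set bit in zero_cols marks a zero
+// column, so whole 4-column passes of stage 1 can be skipped:
+//     if((zero_cols & 0xfff0) == 0xfff0)      num_passes = 1; // cols 4-15 zero
+//     else if((zero_cols & 0xff00) == 0xff00) num_passes = 2; // cols 8-15 zero
+//     else                                    num_passes = 4;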
+ cmp x12,x7
+ bge zero_12cols_decision
+
+ mov x19,#0xff00
+ cmp x12,x19
+ bge zero_8cols_decision
+
+
+
+
+ mov x14,#4
+ cmp x11,x7
+ sub x20,x6,#0
+ neg x20, x20
+ csel x10,x20,x10,ge
+
+ mov x19,#0xff00
+ cmp x11,x19
+ csel x8, x5, x8,ge
+ sub x20,x8,#0
+ neg x20, x20
+ csel x8,x20,x8,ge
+ csel x8, x10, x8,lt
+ add x5,x5,x6,lsl #3
+ sub x20,x5,#0
+ neg x5, x20
+
+ b first_stage_top_four_bottom_four
+
+zero_12cols_decision:
+ mov x14,#1
+ mov x19,#0xff00
+ cmp x11,x19
+ csel x8, x5, x8,ge
+ csel x8, x10, x8,lt
+ add x5,x5,x6,lsl #3
+ sub x20,x5,#0
+ neg x5, x20
+
+ b first_stage_top_four_bottom_four
+
+zero_8cols_decision:
+ mov x14,#2
+ mov x8,x5
+ sub x20,x8,#0
+ neg x8, x20
+ mov x19,#0xff00
+ cmp x11,x19
+ csel x8, x10, x8,lt
+ add x5,x5,x6,lsl #3
+ sub x20,x5,#0
+ neg x5, x20
+ cmp x11,x7
+ sub x20,x6,#0
+ neg x20, x20
+ csel x10,x20,x10,ge
+
+
+ b first_stage_top_four_bottom_four
+
+
+//d0[0]= 64 d2[0]=64
+//d0[1]= 90 d2[1]=57
+//d0[2]= 89 d2[2]=50
+//d0[3]= 87 d2[3]=43
+//d1[0]= 83 d3[0]=36
+//d1[1]= 80 d3[1]=25
+//d1[2]= 75 d3[2]=18
+//d1[3]= 70 d3[3]=9
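+
+// these are the 16-point inverse transform constants held in d0-d3
+// (loaded from g_ai2_ihevc_trans_16_transpose); as a c table:
+//     const word16 coeffs[16] =
+//     {
+//         64, 90, 89, 87, 83, 80, 75, 70, /* d0, d1 */
+//         64, 57, 50, 43, 36, 25, 18,  9  /* d2, d3 */
+//     };
+// the even-indexed lanes feed the even half of the transform and the
+// odd-indexed lanes feed the odd butterfly (the b0-b3 terms below).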
+
+
+
+first_stage:
+ add x0,x0,#8
+ add x9,x9,#8
+
+first_stage_top_four_bottom_four:
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v11.4h},[x9],x6
+ ld1 {v6.4h},[x0],x10
+ ld1 {v7.4h},[x9],x10
+ cmp x11,x7
+ bge skip_load4rows
+
+ ld1 {v4.4h},[x0],x6
+ ld1 {v5.4h},[x9],x6
+ ld1 {v8.4h},[x0],x8
+ ld1 {v9.4h},[x9],x8
+
+// registers used: q0,q1,q3,q5,q2,q4
+
+// d10 =x0
+//d6= x1
+//d11=x2
+//d7=x3
+
+skip_load4rows:
+ smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v7.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+
+ smull v12.4s, v10.4h, v0.4h[0]
+ smlal v12.4s, v11.4h, v0.4h[2]
+ smull v14.4s, v10.4h, v0.4h[0]
+ smlal v14.4s, v11.4h, v1.4h[2]
+ smull v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v2.4h[2]
+ smull v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v3.4h[2]
+
+ bge skip_last12rows_kernel1
+
+
+ smlal v24.4s, v8.4h, v1.4h[1]
+ smlal v26.4s, v8.4h, v3.4h[3]
+ smlsl v28.4s, v8.4h, v1.4h[3]
+ smlsl v30.4s, v8.4h, v0.4h[3]
+
+
+ smlal v24.4s, v9.4h, v1.4h[3]
+ smlsl v26.4s, v9.4h, v2.4h[3]
+ smlsl v28.4s, v9.4h, v0.4h[3]
+ smlal v30.4s, v9.4h, v3.4h[3]
+
+
+
+
+
+ smlal v12.4s, v4.4h, v1.4h[0]
+ smlal v12.4s, v5.4h, v1.4h[2]
+ smlal v14.4s, v4.4h, v3.4h[0]
+ smlsl v14.4s, v5.4h, v3.4h[2]
+ smlsl v16.4s, v4.4h, v3.4h[0]
+ smlsl v16.4s, v5.4h, v0.4h[2]
+ smlsl v18.4s, v4.4h, v1.4h[0]
+ smlsl v18.4s, v5.4h, v2.4h[2]
+
+//d0[0]= 64 d2[0]=64
+//d0[1]= 90 d2[1]=57
+//d0[2]= 89 d2[2]=50
+//d0[3]= 87 d2[3]=43
+//d1[0]= 83 d3[0]=36
+//d1[1]= 80 d3[1]=25
+//d1[2]= 75 d3[2]=18
+//d1[3]= 70 d3[3]=9
+ mov x19,#0xff00
+ cmp x11,x19
+ bge skip_last12rows_kernel1
+
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v11.4h},[x9],x6
+ ld1 {v6.4h},[x0],x10
+ ld1 {v7.4h},[x9],x10
+ ld1 {v4.4h},[x0],x6
+ ld1 {v5.4h},[x9],x6
+ ld1 {v8.4h},[x0],x5
+ ld1 {v9.4h},[x9],x5
+
+
+
+
+ smlal v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v6.4h, v1.4h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v6.4h, v0.4h[1] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v7.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+ smlal v24.4s, v8.4h, v3.4h[1]
+ smlsl v26.4s, v8.4h, v1.4h[3]
+ smlal v28.4s, v8.4h, v0.4h[1]
+ smlsl v30.4s, v8.4h, v1.4h[1]
+
+
+ smlal v24.4s, v9.4h, v3.4h[3]
+ smlsl v26.4s, v9.4h, v3.4h[1]
+ smlal v28.4s, v9.4h, v2.4h[3]
+ smlsl v30.4s, v9.4h, v2.4h[1]
+
+
+
+
+
+ smlal v12.4s, v10.4h, v0.4h[0]
+ smlal v12.4s, v11.4h, v2.4h[2]
+ smlal v12.4s, v4.4h, v3.4h[0]
+ smlal v12.4s, v5.4h, v3.4h[2]
+
+
+
+
+ smlsl v14.4s, v10.4h, v0.4h[0]
+ smlsl v14.4s, v11.4h, v0.4h[2]
+ smlsl v14.4s, v4.4h, v1.4h[0]
+ smlsl v14.4s, v5.4h, v2.4h[2]
+
+
+ smlsl v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v3.4h[2]
+ smlal v16.4s, v4.4h, v1.4h[0]
+ smlal v16.4s, v5.4h, v1.4h[2]
+
+
+ smlal v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v1.4h[2]
+ smlsl v18.4s, v4.4h, v3.4h[0]
+ smlsl v18.4s, v5.4h, v0.4h[2]
+
+skip_last12rows_kernel1:
+ add v20.4s, v12.4s , v24.4s
+ sub v22.4s, v12.4s , v24.4s
+
+ add v12.4s, v14.4s , v26.4s
+ sub v24.4s, v14.4s , v26.4s
+
+ add v14.4s, v16.4s , v28.4s
+ sub v26.4s, v16.4s , v28.4s
+
+
+ add v16.4s, v18.4s , v30.4s
+ sub v28.4s, v18.4s , v30.4s
+
+
+
+
+
+
+
+ sqrshrn v30.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v19.4h, v22.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
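+
+    // the add/sub pairs above form the even/odd butterfly; each sqrshrn is,
+    // in scalar terms (sketch):
+    //     x0 = (word16)((a0 + b0 + (1 << 6)) >> shift_stage1_idct);
+    //     x7 = (word16)((a0 - b0 + (1 << 6)) >> shift_stage1_idct);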
+
+ st1 {v30.4h, v31.4h},[x1],#16
+ st1 {v18.4h, v19.4h},[x1],#16
+ sub x1,x1,#32
+
+ bge skip_stage1_kernel_load
+
+first_stage_middle_eight:
+
+
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v11.4h},[x9],x6
+ ld1 {v6.4h},[x0],x10
+ ld1 {v7.4h},[x9],x10
+ ld1 {v4.4h},[x0],x6
+ ld1 {v5.4h},[x9],x6
+ ld1 {v8.4h},[x0],x8
+ ld1 {v9.4h},[x9],x8
+
+
+skip_stage1_kernel_load:
+ smull v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v2.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v3.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v7.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+
+ smull v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v3.4h[2]
+ smull v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v2.4h[2]
+ smull v16.4s, v10.4h, v0.4h[0]
+ smlsl v16.4s, v11.4h, v1.4h[2]
+ smull v18.4s, v10.4h, v0.4h[0]
+ smlsl v18.4s, v11.4h, v0.4h[2]
+
+
+ cmp x11,x7
+ bge skip_last12rows_kernel2
+
+ smlsl v24.4s, v8.4h, v3.4h[1]
+ smlal v26.4s, v8.4h, v2.4h[1]
+ smlal v28.4s, v8.4h, v0.4h[1]
+ smlal v30.4s, v8.4h, v2.4h[3]
+
+
+ smlal v24.4s, v9.4h, v0.4h[1]
+ smlal v26.4s, v9.4h, v3.4h[1]
+ smlsl v28.4s, v9.4h, v1.4h[1]
+ smlsl v30.4s, v9.4h, v2.4h[1]
+
+
+
+ smlsl v22.4s, v4.4h, v1.4h[0]
+ smlal v22.4s, v5.4h, v2.4h[2]
+ smlsl v20.4s, v4.4h, v3.4h[0]
+ smlal v20.4s, v5.4h, v0.4h[2]
+ smlal v16.4s, v4.4h, v3.4h[0]
+ smlal v16.4s, v5.4h, v3.4h[2]
+ smlal v18.4s, v4.4h, v1.4h[0]
+ smlsl v18.4s, v5.4h, v1.4h[2]
+
+//d0[0]= 64 d2[0]=64
+//d0[1]= 90 d2[1]=57
+//d0[2]= 89 d2[2]=50
+//d0[3]= 87 d2[3]=43
+//d1[0]= 83 d3[0]=36
+//d1[1]= 80 d3[1]=25
+//d1[2]= 75 d3[2]=18
+//d1[3]= 70 d3[3]=9
+ mov x19,#0xff00
+ cmp x11,x19
+ bge skip_last12rows_kernel2
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v11.4h},[x9],x6
+ ld1 {v6.4h},[x0],x10
+ ld1 {v7.4h},[x9],x10
+ ld1 {v4.4h},[x0],x6
+ ld1 {v5.4h},[x9],x6
+ ld1 {v8.4h},[x0],x5
+ ld1 {v9.4h},[x9],x5
+
+
+ smlsl v24.4s, v6.4h, v3.4h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v6.4h, v2.4h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+ smlal v24.4s, v8.4h, v2.4h[3]
+ smlal v26.4s, v8.4h, v3.4h[3]
+ smlsl v28.4s, v8.4h, v2.4h[1]
+ smlal v30.4s, v8.4h, v0.4h[3]
+
+
+ smlal v24.4s, v9.4h, v1.4h[3]
+ smlsl v26.4s, v9.4h, v1.4h[1]
+ smlal v28.4s, v9.4h, v0.4h[3]
+ smlsl v30.4s, v9.4h, v0.4h[1]
+
+
+
+
+ smlal v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v1.4h[2]
+ smlsl v22.4s, v4.4h, v3.4h[0]
+ smlal v22.4s, v5.4h, v0.4h[2]
+
+
+
+ smlsl v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v3.4h[2]
+ smlal v20.4s, v4.4h, v1.4h[0]
+ smlsl v20.4s, v5.4h, v1.4h[2]
+
+
+ smlsl v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v0.4h[2]
+ smlsl v16.4s, v4.4h, v1.4h[0]
+ smlal v16.4s, v5.4h, v2.4h[2]
+
+
+
+ smlal v18.4s, v10.4h, v0.4h[0]
+ smlsl v18.4s, v11.4h, v2.4h[2]
+ smlal v18.4s, v4.4h, v3.4h[0]
+ smlsl v18.4s, v5.4h, v3.4h[2]
+
+skip_last12rows_kernel2:
+
+ add v4.4s, v22.4s , v24.4s
+ sub v22.4s, v22.4s , v24.4s
+
+ add v6.4s, v20.4s , v26.4s
+ sub v24.4s, v20.4s , v26.4s
+
+ add v10.4s, v16.4s , v28.4s
+ sub v26.4s, v16.4s , v28.4s
+
+
+ add v16.4s, v18.4s , v30.4s
+ sub v28.4s, v18.4s , v30.4s
+
+
+ sqrshrn v18.4h, v4.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v31.4h, v22.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v30.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v20.4h, v6.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v23.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v21.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v22.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+
+ // registers used: {q2,q4,q6,q7}, {q9,q15,q10,q11}
+
+
+
+
+
+
+ ld1 {v4.4h, v5.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],#16
+ sub x1,x1,#32
+
+//d4=x0
+//d12=x1
+//d5=x2
+//d13=x3
+
+//d18=x4
+//d20=x5
+//d19=x6
+//d21=x7
+
+//d22=x8
+//d30=x9
+//d23=x10
+//d31=x11
+
+//d14=x12
+//d8=x13
+//d15=x14
+//d9=x15
+
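+    // spill v26-v29 to gprs; the transpose below uses them as scratch
+    // registers and they are restored afterwards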
+ umov x15,v26.d[0]
+ umov x16,v27.d[0]
+ umov x19,v28.d[0]
+ umov x20,v29.d[0]
+
+ trn1 v26.4h, v4.4h, v12.4h
+ trn2 v27.4h, v4.4h, v12.4h
+ trn1 v28.4h, v5.4h, v13.4h
+ trn2 v29.4h, v5.4h, v13.4h
+
+ trn1 v4.2s, v26.2s, v28.2s
+ trn2 v5.2s, v26.2s, v28.2s
+ trn1 v12.2s, v27.2s, v29.2s
+ trn2 v13.2s, v27.2s, v29.2s
+
+ trn1 v26.4h, v18.4h, v20.4h
+ trn2 v27.4h, v18.4h, v20.4h
+ trn1 v28.4h, v19.4h, v21.4h
+ trn2 v29.4h, v19.4h, v21.4h
+
+ trn1 v18.2s, v26.2s, v28.2s
+ trn2 v19.2s, v26.2s, v28.2s
+ trn1 v20.2s, v27.2s, v29.2s
+ trn2 v21.2s, v27.2s, v29.2s
+
+ trn1 v26.4h, v22.4h, v30.4h
+ trn2 v27.4h, v22.4h, v30.4h
+ trn1 v28.4h, v23.4h, v31.4h
+ trn2 v29.4h, v23.4h, v31.4h
+
+ trn1 v22.2s, v26.2s, v28.2s
+ trn2 v23.2s, v26.2s, v28.2s
+ trn1 v30.2s, v27.2s, v29.2s
+ trn2 v31.2s, v27.2s, v29.2s
+
+ trn1 v26.4h, v14.4h, v8.4h
+ trn2 v27.4h, v14.4h, v8.4h
+ trn1 v28.4h, v15.4h, v9.4h
+ trn2 v29.4h, v15.4h, v9.4h
+
+ trn1 v14.2s, v26.2s, v28.2s
+ trn2 v15.2s, v26.2s, v28.2s
+ trn1 v8.2s, v27.2s, v29.2s
+ trn2 v9.2s, v27.2s, v29.2s
+
+ mov v26.d[0],x15
+ mov v27.d[0],x16
+ mov v28.d[0],x19
+ mov v29.d[0],x20
+
+// d4 =x0 1- 4 values
+// d5 =x2 1- 4 values
+// d12=x1 1- 4 values
+// d13=x3 1- 4 values
+
+// d18 =x0 5- 8 values
+// d19 =x2 5- 8 values
+// d20=x1 5- 8 values
+// d21=x3 5- 8 values
+
+// d22 =x0 9- 12 values
+// d23 =x2 9- 12 values
+// d30=x1 9- 12 values
+// d31=x3 9- 12 values
+
+// d14 =x0 13-16 values
+// d15 =x2 13- 16 values
+// d8=x1 13- 16 values
+// d9=x3 13- 16 values
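+
+// the trn1/trn2 pairs above perform a 4x4 transpose of 16-bit elements;
+// a scalar model (illustrative) is:
+//     for(i = 0; i < 4; i++)
+//         for(j = 0; j < 4; j++)
+//             out[j][i] = in[i][j];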
+
+
+ st1 { v4.4h, v5.4h},[x1],#16
+ st1 { v12.4h, v13.4h},[x1],#16
+
+ st1 { v18.4h, v19.4h},[x1],#16
+ st1 { v20.4h, v21.4h},[x1],#16
+ st1 { v22.4h, v23.4h},[x1],#16
+ st1 { v30.4h, v31.4h},[x1],#16
+ st1 { v14.4h, v15.4h},[x1],#16
+ st1 { v8.4h, v9.4h},[x1],#16
+
+
+ subs x14,x14,#1
+ bne first_stage
+
+
+
+
+
+
+
+
+
+
+ mov x6,x7
+
+ ldp x8, x7,[sp],#16
+
+ mov x10,#16
+
+ cmp x12,x6
+ sub x20,x1,#128
+ csel x1, x20, x1,ge
+ bge label1
+
+ mov x19,#0xff00
+ cmp x12,x19
+ sub x20,x1,#256
+ csel x1, x20, x1,ge
+ bge label_2
+
+ sub x1,x1,#512
+ sub x20,x10,#0
+ neg x10, x20
+
+label_2:
+ add x9,x1,#128
+ add x11,x9,#128
+ add x0,x11,#128
+
+
+
+label1:
+// mov x6,x1
+
+
+ mov x14,#4
+ add x4,x2,x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data
+    add x5,x8,x8, lsl #1 // x5 = pred_strd * 3
+// add x0,x3,x7, lsl #1 @ x0 points to 3rd row of dest data
+// add x10,x7,x7, lsl #1 @
+
+
+
+
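+// second stage: 1d inverse transform of the transposed stage-1 output in
+// pi2_tmp, followed by prediction add and clip to 8 bits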
+second_stage:
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v6.4h, v7.4h},[x1],x10
+ cmp x12,x6
+ bge second_stage_process
+ ld1 {v4.4h, v5.4h},[x9],#16
+ ld1 {v8.4h, v9.4h},[x9],x10
+
+second_stage_process:
+
+
+ smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v7.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+ smull v12.4s, v10.4h, v0.4h[0]
+ smlal v12.4s, v11.4h, v0.4h[2]
+ smull v14.4s, v10.4h, v0.4h[0]
+ smlal v14.4s, v11.4h, v1.4h[2]
+ smull v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v2.4h[2]
+ smull v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v3.4h[2]
+
+ bge skip_last8rows_stage2_kernel1
+
+ smlal v24.4s, v8.4h, v1.4h[1]
+ smlal v26.4s, v8.4h, v3.4h[3]
+ smlsl v28.4s, v8.4h, v1.4h[3]
+ smlsl v30.4s, v8.4h, v0.4h[3]
+
+
+ smlal v24.4s, v9.4h, v1.4h[3]
+ smlsl v26.4s, v9.4h, v2.4h[3]
+ smlsl v28.4s, v9.4h, v0.4h[3]
+ smlal v30.4s, v9.4h, v3.4h[3]
+
+
+ smlal v12.4s, v4.4h, v1.4h[0]
+ smlal v12.4s, v5.4h, v1.4h[2]
+ smlal v14.4s, v4.4h, v3.4h[0]
+ smlsl v14.4s, v5.4h, v3.4h[2]
+ smlsl v16.4s, v4.4h, v3.4h[0]
+ smlsl v16.4s, v5.4h, v0.4h[2]
+ smlsl v18.4s, v4.4h, v1.4h[0]
+ smlsl v18.4s, v5.4h, v2.4h[2]
+
+ mov x19,#0xff00
+ cmp x12,x19
+ bge skip_last8rows_stage2_kernel1
+
+
+ ld1 {v10.4h, v11.4h},[x11],#16
+ ld1 {v6.4h, v7.4h},[x11],x10
+ ld1 {v4.4h, v5.4h},[x0],#16
+ ld1 {v8.4h, v9.4h},[x0],x10
+
+
+
+
+
+ smlal v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v6.4h, v1.4h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v6.4h, v0.4h[1] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v7.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+ smlal v24.4s, v8.4h, v3.4h[1]
+ smlsl v26.4s, v8.4h, v1.4h[3]
+ smlal v28.4s, v8.4h, v0.4h[1]
+ smlsl v30.4s, v8.4h, v1.4h[1]
+
+
+ smlal v24.4s, v9.4h, v3.4h[3]
+ smlsl v26.4s, v9.4h, v3.4h[1]
+ smlal v28.4s, v9.4h, v2.4h[3]
+ smlsl v30.4s, v9.4h, v2.4h[1]
+
+
+
+
+
+ smlal v12.4s, v10.4h, v0.4h[0]
+ smlal v12.4s, v11.4h, v2.4h[2]
+ smlal v12.4s, v4.4h, v3.4h[0]
+ smlal v12.4s, v5.4h, v3.4h[2]
+
+
+
+
+ smlsl v14.4s, v10.4h, v0.4h[0]
+ smlsl v14.4s, v11.4h, v0.4h[2]
+ smlsl v14.4s, v4.4h, v1.4h[0]
+ smlsl v14.4s, v5.4h, v2.4h[2]
+
+
+ smlsl v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v3.4h[2]
+ smlal v16.4s, v4.4h, v1.4h[0]
+ smlal v16.4s, v5.4h, v1.4h[2]
+
+
+ smlal v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v1.4h[2]
+ smlsl v18.4s, v4.4h, v3.4h[0]
+ smlsl v18.4s, v5.4h, v0.4h[2]
+
+
+
+
+
+
+skip_last8rows_stage2_kernel1:
+
+
+
+ add v20.4s, v12.4s , v24.4s
+ sub v22.4s, v12.4s , v24.4s
+
+ add v12.4s, v14.4s , v26.4s
+ sub v24.4s, v14.4s , v26.4s
+
+ add v14.4s, v16.4s , v28.4s
+ sub v26.4s, v16.4s , v28.4s
+
+
+ add v16.4s, v18.4s , v30.4s
+ sub v28.4s, v18.4s , v30.4s
+
+
+
+
+
+
+
+    sqrshrn v30.4h, v20.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v19.4h, v22.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 12(shift_stage2_idct)
+
+ bge skip_stage2_kernel_load
+
+    //q2,q4,q6,q7 are used
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v6.4h, v7.4h},[x1],#16
+ ld1 {v4.4h, v5.4h},[x9],#16
+ ld1 {v8.4h, v9.4h},[x9],#16
+skip_stage2_kernel_load:
+ sub x1,x1,#32
+ st1 {v30.4h, v31.4h},[x1],#16
+ st1 {v18.4h, v19.4h},[x1],#16
+ sub x1,x1,#32
+
+ smull v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v2.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v3.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v7.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+ smull v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v3.4h[2]
+ smull v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v2.4h[2]
+ smull v16.4s, v10.4h, v0.4h[0]
+ smlsl v16.4s, v11.4h, v1.4h[2]
+ smull v18.4s, v10.4h, v0.4h[0]
+ smlsl v18.4s, v11.4h, v0.4h[2]
+
+
+
+ cmp x12,x6
+ bge skip_last8rows_stage2_kernel2
+
+
+ smlsl v24.4s, v8.4h, v3.4h[1]
+ smlal v26.4s, v8.4h, v2.4h[1]
+ smlal v28.4s, v8.4h, v0.4h[1]
+ smlal v30.4s, v8.4h, v2.4h[3]
+
+
+ smlal v24.4s, v9.4h, v0.4h[1]
+ smlal v26.4s, v9.4h, v3.4h[1]
+ smlsl v28.4s, v9.4h, v1.4h[1]
+ smlsl v30.4s, v9.4h, v2.4h[1]
+
+
+
+ smlsl v22.4s, v4.4h, v1.4h[0]
+ smlal v22.4s, v5.4h, v2.4h[2]
+ smlsl v20.4s, v4.4h, v3.4h[0]
+ smlal v20.4s, v5.4h, v0.4h[2]
+ smlal v16.4s, v4.4h, v3.4h[0]
+ smlal v16.4s, v5.4h, v3.4h[2]
+ smlal v18.4s, v4.4h, v1.4h[0]
+ smlsl v18.4s, v5.4h, v1.4h[2]
+ mov x19,#0xff00
+ cmp x12,x19
+ bge skip_last8rows_stage2_kernel2
+
+ ld1 {v10.4h, v11.4h},[x11],#16
+ ld1 {v6.4h, v7.4h},[x11],#16
+ ld1 {v4.4h, v5.4h},[x0],#16
+ ld1 {v8.4h, v9.4h},[x0],#16
+
+ smlsl v24.4s, v6.4h, v3.4h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v6.4h, v2.4h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+ smlal v24.4s, v8.4h, v2.4h[3]
+ smlal v26.4s, v8.4h, v3.4h[3]
+ smlsl v28.4s, v8.4h, v2.4h[1]
+ smlal v30.4s, v8.4h, v0.4h[3]
+
+
+ smlal v24.4s, v9.4h, v1.4h[3]
+ smlsl v26.4s, v9.4h, v1.4h[1]
+ smlal v28.4s, v9.4h, v0.4h[3]
+ smlsl v30.4s, v9.4h, v0.4h[1]
+
+
+
+
+ smlal v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v1.4h[2]
+ smlsl v22.4s, v4.4h, v3.4h[0]
+ smlal v22.4s, v5.4h, v0.4h[2]
+
+
+
+ smlsl v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v3.4h[2]
+ smlal v20.4s, v4.4h, v1.4h[0]
+ smlsl v20.4s, v5.4h, v1.4h[2]
+
+
+ smlsl v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v0.4h[2]
+ smlsl v16.4s, v4.4h, v1.4h[0]
+ smlal v16.4s, v5.4h, v2.4h[2]
+
+
+
+ smlal v18.4s, v10.4h, v0.4h[0]
+ smlsl v18.4s, v11.4h, v2.4h[2]
+ smlal v18.4s, v4.4h, v3.4h[0]
+ smlsl v18.4s, v5.4h, v3.4h[2]
+
+
+skip_last8rows_stage2_kernel2:
+
+
+
+ add v4.4s, v22.4s , v24.4s
+ sub v22.4s, v22.4s , v24.4s
+
+ add v6.4s, v20.4s , v26.4s
+ sub v24.4s, v20.4s , v26.4s
+
+ add v10.4s, v16.4s , v28.4s
+ sub v26.4s, v16.4s , v28.4s
+
+
+ add v16.4s, v18.4s , v30.4s
+ sub v28.4s, v18.4s , v30.4s
+
+
+    sqrshrn v18.4h, v4.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v31.4h, v22.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v30.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v20.4h, v6.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v23.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v21.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v22.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 12(shift_stage2_idct)
+
+ ld1 {v4.4h, v5.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],#16
+
+
+
+ // registers used: {q2,q4,q6,q7}, {q9,q15,q10,q11}
+
+//d4=x0
+//d12=x1
+//d5=x2
+//d13=x3
+
+//d18=x4
+//d20=x5
+//d19=x6
+//d21=x7
+
+//d22=x8
+//d30=x9
+//d23=x10
+//d31=x11
+
+//d14=x12
+//d8=x13
+//d15=x14
+//d9=x15
+
+ umov x15,v26.d[0]
+ umov x16,v27.d[0]
+ umov x19,v28.d[0]
+ umov x20,v29.d[0]
+
+ trn1 v26.4h, v4.4h, v12.4h
+ trn2 v27.4h, v4.4h, v12.4h
+ trn1 v28.4h, v5.4h, v13.4h
+ trn2 v29.4h, v5.4h, v13.4h
+
+ trn1 v4.2s, v26.2s, v28.2s
+ trn2 v5.2s, v26.2s, v28.2s
+ trn1 v12.2s, v27.2s, v29.2s
+ trn2 v13.2s, v27.2s, v29.2s
+
+ trn1 v26.4h, v18.4h, v20.4h
+ trn2 v27.4h, v18.4h, v20.4h
+ trn1 v28.4h, v19.4h, v21.4h
+ trn2 v29.4h, v19.4h, v21.4h
+
+ trn1 v18.2s, v26.2s, v28.2s
+ trn2 v19.2s, v26.2s, v28.2s
+ trn1 v20.2s, v27.2s, v29.2s
+ trn2 v21.2s, v27.2s, v29.2s
+
+ trn1 v26.4h, v22.4h, v30.4h
+ trn2 v27.4h, v22.4h, v30.4h
+ trn1 v28.4h, v23.4h, v31.4h
+ trn2 v29.4h, v23.4h, v31.4h
+
+ trn1 v22.2s, v26.2s, v28.2s
+ trn2 v23.2s, v26.2s, v28.2s
+ trn1 v30.2s, v27.2s, v29.2s
+ trn2 v31.2s, v27.2s, v29.2s
+
+ trn1 v26.4h, v14.4h, v8.4h
+ trn2 v27.4h, v14.4h, v8.4h
+ trn1 v28.4h, v15.4h, v9.4h
+ trn2 v29.4h, v15.4h, v9.4h
+
+ trn1 v14.2s, v26.2s, v28.2s
+ trn2 v15.2s, v26.2s, v28.2s
+ trn1 v8.2s, v27.2s, v29.2s
+ trn2 v9.2s, v27.2s, v29.2s
+
+ mov v26.d[0],x15
+ mov v27.d[0],x16
+ mov v28.d[0],x19
+ mov v29.d[0],x20
+
+// d4 =x0 1- 4 values
+// d5 =x2 1- 4 values
+// d12=x1 1- 4 values
+// d13=x3 1- 4 values
+
+// d18 =x0 5- 8 values
+// d19 =x2 5- 8 values
+// d20=x1 5- 8 values
+// d21=x3 5- 8 values
+
+// d22 =x0 9- 12 values
+// d23 =x2 9- 12 values
+// d30=x1 9- 12 values
+// d31=x3 9- 12 values
+
+// d14 =x0 13-16 values
+// d15 =x2 13- 16 values
+// d8=x1 13- 16 values
+// d9=x3 13- 16 values
+
+    // swapping v5 and v18
+ mov v5.d[1],v5.d[0]
+ mov v5.d[0],v18.d[0]
+ mov v18.d[0],v5.d[1]
+ // swapping v23 and v14
+ mov v23.d[1],v23.d[0]
+ mov v23.d[0],v14.d[0]
+ mov v14.d[0],v23.d[1]
+ // swapping v13 and v20
+ mov v13.d[1],v13.d[0]
+ mov v13.d[0],v20.d[0]
+ mov v20.d[0],v13.d[1]
+ // swapping v31 and v8
+ mov v31.d[1],v31.d[0]
+ mov v31.d[0],v8.d[0]
+ mov v8.d[0],v31.d[1]
+
+// q2: x0 1-8 values
+// q11: x0 9-16 values
+// q9 : x2 1-8 values
+// q7 : x2 9-16 values
+// q6 : x1 1- 8 values
+// q10: x3 1-8 values
+// q15: x1 9-16 values
+// q4: x3 9-16 values
+
+
+// registers free: q8,q14,q12,q13
+
+
+ ld1 {v16.8b, v17.8b},[x2],x8
+ ld1 {v28.8b, v29.8b},[x2],x5
+ ld1 {v24.8b, v25.8b},[x4],x8
+ ld1 {v26.8b, v27.8b},[x4],x5
+
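+    // pack the narrowed 4h halves into full 8h vectors before the
+    // prediction add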
+ mov v4.d[1] ,v5.d[0]
+ mov v22.d[1] ,v23.d[0]
+ mov v12.d[1] ,v13.d[0]
+ mov v30.d[1] ,v31.d[0]
+ mov v18.d[1] ,v19.d[0]
+ mov v14.d[1] ,v15.d[0]
+ mov v20.d[1] ,v21.d[0]
+ mov v8.d[1] ,v9.d[0]
+
+ uaddw v4.8h, v4.8h , v16.8b
+ uaddw v22.8h, v22.8h , v17.8b
+ uaddw v12.8h, v12.8h , v28.8b
+ uaddw v30.8h, v30.8h , v29.8b
+ uaddw v18.8h, v18.8h , v24.8b
+ uaddw v14.8h, v14.8h , v25.8b
+ uaddw v20.8h, v20.8h , v26.8b
+ uaddw v8.8h, v8.8h , v27.8b
+
+
+ sqxtun v16.8b, v4.8h
+ sqxtun v17.8b, v22.8h
+ sqxtun v28.8b, v12.8h
+ sqxtun v29.8b, v30.8h
+ sqxtun v24.8b, v18.8h
+ sqxtun v25.8b, v14.8h
+ sqxtun v26.8b, v20.8h
+ sqxtun v27.8b, v8.8h
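+
+    // uaddw widens the predictor bytes and adds them to the 16-bit residue;
+    // sqxtun narrows back with unsigned saturation, i.e. per pixel (sketch):
+    //     dst = (uword8)clip3(0, 255, resid + pred);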
+
+
+
+ st1 {v16.8b, v17.8b},[x3],x7
+ st1 {v28.8b, v29.8b},[x3],x7
+ st1 {v24.8b, v25.8b},[x3],x7
+ st1 {v26.8b, v27.8b},[x3],x7
+
+ subs x14,x14,#1
+
+
+
+ bne second_stage
+
+
+// sub sp,sp,#40
+ // ldmfd sp!,{x4-x12,pc}
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
+
+
+
+