Diffstat (limited to 'common/arm64/ihevc_itrans_recon_16x16.s')
-rw-r--r--  common/arm64/ihevc_itrans_recon_16x16.s  1240
 1 file changed, 1240 insertions, 0 deletions
diff --git a/common/arm64/ihevc_itrans_recon_16x16.s b/common/arm64/ihevc_itrans_recon_16x16.s
new file mode 100644
index 0000000..90df840
--- /dev/null
+++ b/common/arm64/ihevc_itrans_recon_16x16.s
@@ -0,0 +1,1240 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+// *******************************************************************************
+// * @file
+// * ihevc_itrans_recon_16x16.s
+// *
+// * @brief
+// * contains function definitions for single stage inverse transform
+// *
+// * @author
+// * anand s
+// *
+// * @par list of functions:
+// * - ihevc_itrans_recon_16x16()
+// *
+// * @remarks
+// * none
+// *
+// *******************************************************************************
+//*/
+
+///**
+// *******************************************************************************
+// *
+// * @brief
+// * this function performs inverse transform and reconstruction for a 16x16
+// * input block
+// *
+// * @par description:
+// * performs the inverse transform, adds the prediction data, and clips the
+// * output to 8 bits
+// *
+// * @param[in] pi2_src
+// * input 16x16 coefficients
+// *
+// * @param[in] pi2_tmp
+// * temporary 16x16 buffer for storing inverse transform 1st stage output
+// *
+// * @param[in] pu1_pred
+// * prediction 16x16 block
+// *
+// * @param[out] pu1_dst
+// * output 16x16 block
+// *
+// * @param[in] src_strd
+// * input stride
+// *
+// * @param[in] pred_strd
+// * prediction stride
+// *
+// * @param[in] dst_strd
+// * output stride
+// *
+// * @param[in] x12
+// * zero columns in pi2_src
+// *
+// * @param[in] x11
+// * zero rows in pi2_src
+// *
+// * @returns void
+// *
+// * @remarks
+// * none
+// *
+// *******************************************************************************
+// */
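+
+///**
+// * for reference, a rough c model of the two-stage flow implemented below
+// * (itrans16_1d and clip_u8 are hypothetical helper names; the neon code
+// * vectorizes and interleaves these steps rather than following this shape):
+// *
+// *     word16 out[16];
+// *     // stage 1: 1d inverse transform down each column, output to pi2_tmp
+// *     for(i = 0; i < 16; i++)
+// *         itrans16_1d(pi2_src + i, src_strd, pi2_tmp + 16 * i, shift_stage1_idct);
+// *     // stage 2: 1d inverse transform along each row, add pred, clip
+// *     for(i = 0; i < 16; i++)
+// *     {
+// *         itrans16_1d(pi2_tmp + i, 16, out, shift_stage2_idct);
+// *         for(j = 0; j < 16; j++)
+// *             pu1_dst[i * dst_strd + j] =
+// *                 clip_u8(out[j] + pu1_pred[i * pred_strd + j]);
+// *     }
+// */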
+
+//void ihevc_itrans_recon_16x16(word16 *pi2_src,
+// word16 *pi2_tmp,
+// uword8 *pu1_pred,
+// uword8 *pu1_dst,
+// word32 src_strd,
+// word32 pred_strd,
+// word32 dst_strd,
+// word32 x12,
+// word32 x11 )
+
+//**************variables vs registers*************************
+// x0 => *pi2_src
+// x1 => *pi2_tmp
+// x2 => *pu1_pred
+// x3 => *pu1_dst
+// x4 => src_strd
+// x5 => pred_strd
+// x6 => dst_strd
+// x7 => x12 (zero_cols)
+// [sp] => x11 (zero_rows)
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+
+
+
+.set shift_stage1_idct , 7
+.set shift_stage2_idct , 12
+//#define zero_cols x12
+//#define zero_rows x11
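+
+// note: the sqrshrn instructions below apply these shifts with rounding;
+// a scalar sketch of the same operation is:
+//     out = (word16)((sum + (1 << (shift - 1))) >> shift);
+// with shift = shift_stage1_idct (7) after stage 1 and
+// shift_stage2_idct (12) after stage 2.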
+.globl ihevc_itrans_recon_16x16_av8
+
+.extern g_ai2_ihevc_trans_16_transpose
+
+.type ihevc_itrans_recon_16x16_av8, %function
+
+ihevc_itrans_recon_16x16_av8:
+
+ ldr w11, [sp]
+ // stmfd sp!,{x4-x12,x14}
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+ stp x5, x6,[sp,#-16]!
+// add sp,sp,#40
+
+
+
+// ldr x8,[sp,#4] @ prediction stride
+// ldr x7,[sp,#8] @ destination stride
+ mov x6, x4 // src stride
+    mov x12, x7 // zero_cols
+
+
+
+ adrp x14, :got:g_ai2_ihevc_trans_16_transpose
+ ldr x14, [x14, #:got_lo12:g_ai2_ihevc_trans_16_transpose]
+    ld1 {v0.4h, v1.4h, v2.4h, v3.4h},[x14] //// d0-d3 hold the transform constant data
+ mov x7,#0xffff
+ and x12,x12,x7
+ and x11,x11,x7
+    lsl x6, x6, #1 // src_strd * sizeof(word16)
+ add x9,x0,x6, lsl #1 // 2 rows
+
+ add x10,x6,x6, lsl #1 // 3 rows
+ add x5,x6,x6,lsl #2
+ mov x7,#0xfff0
+
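+// dispatch sketch (illustrative c): a set bit in zero_cols marks a zero
+// column, so whole 4-column passes of stage 1 can be skipped:
+//     if((zero_cols & 0xfff0) == 0xfff0)      num_passes = 1; // cols 4-15 zero
+//     else if((zero_cols & 0xff00) == 0xff00) num_passes = 2; // cols 8-15 zero
+//     else                                    num_passes = 4;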
+ cmp x12,x7
+ bge zero_12cols_decision
+
+ mov x19,#0xff00
+ cmp x12,x19
+ bge zero_8cols_decision
+
+
+
+
+ mov x14,#4
+ cmp x11,x7
+ sub x20,x6,#0
+ neg x20, x20
+ csel x10,x20,x10,ge
+
+ mov x19,#0xff00
+ cmp x11,x19
+ csel x8, x5, x8,ge
+ sub x20,x8,#0
+ neg x20, x20
+ csel x8,x20,x8,ge
+ csel x8, x10, x8,lt
+ add x5,x5,x6,lsl #3
+ sub x20,x5,#0
+ neg x5, x20
+
+ b first_stage_top_four_bottom_four
+
+zero_12cols_decision:
+ mov x14,#1
+ mov x19,#0xff00
+ cmp x11,x19
+ csel x8, x5, x8,ge
+ csel x8, x10, x8,lt
+ add x5,x5,x6,lsl #3
+ sub x20,x5,#0
+ neg x5, x20
+
+ b first_stage_top_four_bottom_four
+
+zero_8cols_decision:
+ mov x14,#2
+ mov x8,x5
+ sub x20,x8,#0
+ neg x8, x20
+ mov x19,#0xff00
+ cmp x11,x19
+ csel x8, x10, x8,lt
+ add x5,x5,x6,lsl #3
+ sub x20,x5,#0
+ neg x5, x20
+ cmp x11,x7
+ sub x20,x6,#0
+ neg x20, x20
+ csel x10,x20,x10,ge
+
+
+ b first_stage_top_four_bottom_four
+
+
+//d0[0]= 64 d2[0]=64
+//d0[1]= 90 d2[1]=57
+//d0[2]= 89 d2[2]=50
+//d0[3]= 87 d2[3]=43
+//d1[0]= 83 d3[0]=36
+//d1[1]= 80 d3[1]=25
+//d1[2]= 75 d3[2]=18
+//d1[3]= 70 d3[3]=9
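+
+// these are the 16-point inverse transform constants held in d0-d3
+// (loaded from g_ai2_ihevc_trans_16_transpose); as a c table:
+//     const word16 coeffs[16] =
+//     {
+//         64, 90, 89, 87, 83, 80, 75, 70, /* d0, d1 */
+//         64, 57, 50, 43, 36, 25, 18,  9  /* d2, d3 */
+//     };
+// the even-indexed lanes feed the even half of the transform and the
+// odd-indexed lanes feed the odd butterfly (the b0-b3 terms below).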
+
+
+
+first_stage:
+ add x0,x0,#8
+ add x9,x9,#8
+
+first_stage_top_four_bottom_four:
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v11.4h},[x9],x6
+ ld1 {v6.4h},[x0],x10
+ ld1 {v7.4h},[x9],x10
+ cmp x11,x7
+ bge skip_load4rows
+
+ ld1 {v4.4h},[x0],x6
+ ld1 {v5.4h},[x9],x6
+ ld1 {v8.4h},[x0],x8
+ ld1 {v9.4h},[x9],x8
+
+// registers used: q0,q1,q3,q5,q2,q4
+
+// d10 =x0
+//d6= x1
+//d11=x2
+//d7=x3
+
+skip_load4rows:
+ smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v7.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+
+ smull v12.4s, v10.4h, v0.4h[0]
+ smlal v12.4s, v11.4h, v0.4h[2]
+ smull v14.4s, v10.4h, v0.4h[0]
+ smlal v14.4s, v11.4h, v1.4h[2]
+ smull v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v2.4h[2]
+ smull v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v3.4h[2]
+
+ bge skip_last12rows_kernel1
+
+
+ smlal v24.4s, v8.4h, v1.4h[1]
+ smlal v26.4s, v8.4h, v3.4h[3]
+ smlsl v28.4s, v8.4h, v1.4h[3]
+ smlsl v30.4s, v8.4h, v0.4h[3]
+
+
+ smlal v24.4s, v9.4h, v1.4h[3]
+ smlsl v26.4s, v9.4h, v2.4h[3]
+ smlsl v28.4s, v9.4h, v0.4h[3]
+ smlal v30.4s, v9.4h, v3.4h[3]
+
+
+
+
+
+ smlal v12.4s, v4.4h, v1.4h[0]
+ smlal v12.4s, v5.4h, v1.4h[2]
+ smlal v14.4s, v4.4h, v3.4h[0]
+ smlsl v14.4s, v5.4h, v3.4h[2]
+ smlsl v16.4s, v4.4h, v3.4h[0]
+ smlsl v16.4s, v5.4h, v0.4h[2]
+ smlsl v18.4s, v4.4h, v1.4h[0]
+ smlsl v18.4s, v5.4h, v2.4h[2]
+
+//d0[0]= 64 d2[0]=64
+//d0[1]= 90 d2[1]=57
+//d0[2]= 89 d2[2]=50
+//d0[3]= 87 d2[3]=43
+//d1[0]= 83 d3[0]=36
+//d1[1]= 80 d3[1]=25
+//d1[2]= 75 d3[2]=18
+//d1[3]= 70 d3[3]=9
+ mov x19,#0xff00
+ cmp x11,x19
+ bge skip_last12rows_kernel1
+
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v11.4h},[x9],x6
+ ld1 {v6.4h},[x0],x10
+ ld1 {v7.4h},[x9],x10
+ ld1 {v4.4h},[x0],x6
+ ld1 {v5.4h},[x9],x6
+ ld1 {v8.4h},[x0],x5
+ ld1 {v9.4h},[x9],x5
+
+
+
+
+ smlal v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v6.4h, v1.4h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v6.4h, v0.4h[1] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v7.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+ smlal v24.4s, v8.4h, v3.4h[1]
+ smlsl v26.4s, v8.4h, v1.4h[3]
+ smlal v28.4s, v8.4h, v0.4h[1]
+ smlsl v30.4s, v8.4h, v1.4h[1]
+
+
+ smlal v24.4s, v9.4h, v3.4h[3]
+ smlsl v26.4s, v9.4h, v3.4h[1]
+ smlal v28.4s, v9.4h, v2.4h[3]
+ smlsl v30.4s, v9.4h, v2.4h[1]
+
+
+
+
+
+ smlal v12.4s, v10.4h, v0.4h[0]
+ smlal v12.4s, v11.4h, v2.4h[2]
+ smlal v12.4s, v4.4h, v3.4h[0]
+ smlal v12.4s, v5.4h, v3.4h[2]
+
+
+
+
+ smlsl v14.4s, v10.4h, v0.4h[0]
+ smlsl v14.4s, v11.4h, v0.4h[2]
+ smlsl v14.4s, v4.4h, v1.4h[0]
+ smlsl v14.4s, v5.4h, v2.4h[2]
+
+
+ smlsl v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v3.4h[2]
+ smlal v16.4s, v4.4h, v1.4h[0]
+ smlal v16.4s, v5.4h, v1.4h[2]
+
+
+ smlal v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v1.4h[2]
+ smlsl v18.4s, v4.4h, v3.4h[0]
+ smlsl v18.4s, v5.4h, v0.4h[2]
+
+skip_last12rows_kernel1:
+ add v20.4s, v12.4s , v24.4s
+ sub v22.4s, v12.4s , v24.4s
+
+ add v12.4s, v14.4s , v26.4s
+ sub v24.4s, v14.4s , v26.4s
+
+ add v14.4s, v16.4s , v28.4s
+ sub v26.4s, v16.4s , v28.4s
+
+
+ add v16.4s, v18.4s , v30.4s
+ sub v28.4s, v18.4s , v30.4s
+
+
+
+
+
+
+
+ sqrshrn v30.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v19.4h, v22.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
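+
+    // the add/sub pairs above form the even/odd butterfly; each sqrshrn is,
+    // in scalar terms (sketch):
+    //     x0 = (word16)((a0 + b0 + (1 << 6)) >> shift_stage1_idct);
+    //     x7 = (word16)((a0 - b0 + (1 << 6)) >> shift_stage1_idct);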
+
+ st1 {v30.4h, v31.4h},[x1],#16
+ st1 {v18.4h, v19.4h},[x1],#16
+ sub x1,x1,#32
+
+ bge skip_stage1_kernel_load
+
+first_stage_middle_eight:
+
+
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v11.4h},[x9],x6
+ ld1 {v6.4h},[x0],x10
+ ld1 {v7.4h},[x9],x10
+ ld1 {v4.4h},[x0],x6
+ ld1 {v5.4h},[x9],x6
+ ld1 {v8.4h},[x0],x8
+ ld1 {v9.4h},[x9],x8
+
+
+skip_stage1_kernel_load:
+ smull v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v2.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v3.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v7.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+
+ smull v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v3.4h[2]
+ smull v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v2.4h[2]
+ smull v16.4s, v10.4h, v0.4h[0]
+ smlsl v16.4s, v11.4h, v1.4h[2]
+ smull v18.4s, v10.4h, v0.4h[0]
+ smlsl v18.4s, v11.4h, v0.4h[2]
+
+
+ cmp x11,x7
+ bge skip_last12rows_kernel2
+
+ smlsl v24.4s, v8.4h, v3.4h[1]
+ smlal v26.4s, v8.4h, v2.4h[1]
+ smlal v28.4s, v8.4h, v0.4h[1]
+ smlal v30.4s, v8.4h, v2.4h[3]
+
+
+ smlal v24.4s, v9.4h, v0.4h[1]
+ smlal v26.4s, v9.4h, v3.4h[1]
+ smlsl v28.4s, v9.4h, v1.4h[1]
+ smlsl v30.4s, v9.4h, v2.4h[1]
+
+
+
+ smlsl v22.4s, v4.4h, v1.4h[0]
+ smlal v22.4s, v5.4h, v2.4h[2]
+ smlsl v20.4s, v4.4h, v3.4h[0]
+ smlal v20.4s, v5.4h, v0.4h[2]
+ smlal v16.4s, v4.4h, v3.4h[0]
+ smlal v16.4s, v5.4h, v3.4h[2]
+ smlal v18.4s, v4.4h, v1.4h[0]
+ smlsl v18.4s, v5.4h, v1.4h[2]
+
+//d0[0]= 64 d2[0]=64
+//d0[1]= 90 d2[1]=57
+//d0[2]= 89 d2[2]=50
+//d0[3]= 87 d2[3]=43
+//d1[0]= 83 d3[0]=36
+//d1[1]= 80 d3[1]=25
+//d1[2]= 75 d3[2]=18
+//d1[3]= 70 d3[3]=9
+ mov x19,#0xff00
+ cmp x11,x19
+ bge skip_last12rows_kernel2
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v11.4h},[x9],x6
+ ld1 {v6.4h},[x0],x10
+ ld1 {v7.4h},[x9],x10
+ ld1 {v4.4h},[x0],x6
+ ld1 {v5.4h},[x9],x6
+ ld1 {v8.4h},[x0],x5
+ ld1 {v9.4h},[x9],x5
+
+
+ smlsl v24.4s, v6.4h, v3.4h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v6.4h, v2.4h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+ smlal v24.4s, v8.4h, v2.4h[3]
+ smlal v26.4s, v8.4h, v3.4h[3]
+ smlsl v28.4s, v8.4h, v2.4h[1]
+ smlal v30.4s, v8.4h, v0.4h[3]
+
+
+ smlal v24.4s, v9.4h, v1.4h[3]
+ smlsl v26.4s, v9.4h, v1.4h[1]
+ smlal v28.4s, v9.4h, v0.4h[3]
+ smlsl v30.4s, v9.4h, v0.4h[1]
+
+
+
+
+ smlal v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v1.4h[2]
+ smlsl v22.4s, v4.4h, v3.4h[0]
+ smlal v22.4s, v5.4h, v0.4h[2]
+
+
+
+ smlsl v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v3.4h[2]
+ smlal v20.4s, v4.4h, v1.4h[0]
+ smlsl v20.4s, v5.4h, v1.4h[2]
+
+
+ smlsl v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v0.4h[2]
+ smlsl v16.4s, v4.4h, v1.4h[0]
+ smlal v16.4s, v5.4h, v2.4h[2]
+
+
+
+ smlal v18.4s, v10.4h, v0.4h[0]
+ smlsl v18.4s, v11.4h, v2.4h[2]
+ smlal v18.4s, v4.4h, v3.4h[0]
+ smlsl v18.4s, v5.4h, v3.4h[2]
+
+skip_last12rows_kernel2:
+
+ add v4.4s, v22.4s , v24.4s
+ sub v22.4s, v22.4s , v24.4s
+
+ add v6.4s, v20.4s , v26.4s
+ sub v24.4s, v20.4s , v26.4s
+
+ add v10.4s, v16.4s , v28.4s
+ sub v26.4s, v16.4s , v28.4s
+
+
+ add v16.4s, v18.4s , v30.4s
+ sub v28.4s, v18.4s , v30.4s
+
+
+ sqrshrn v18.4h, v4.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v31.4h, v22.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v30.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v20.4h, v6.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v23.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v21.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v22.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+
+ // registers used: {q2,q4,q6,q7}, {q9,q15,q10,q11}
+
+
+
+
+
+
+ ld1 {v4.4h, v5.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],#16
+ sub x1,x1,#32
+
+//d4=x0
+//d12=x1
+//d5=x2
+//d13=x3
+
+//d18=x4
+//d20=x5
+//d19=x6
+//d21=x7
+
+//d22=x8
+//d30=x9
+//d23=x10
+//d31=x11
+
+//d14=x12
+//d8=x13
+//d15=x14
+//d9=x15
+
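+    // spill v26-v29 to gprs; the transpose below uses them as scratch
+    // registers and they are restored afterwards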
+ umov x15,v26.d[0]
+ umov x16,v27.d[0]
+ umov x19,v28.d[0]
+ umov x20,v29.d[0]
+
+ trn1 v26.4h, v4.4h, v12.4h
+ trn2 v27.4h, v4.4h, v12.4h
+ trn1 v28.4h, v5.4h, v13.4h
+ trn2 v29.4h, v5.4h, v13.4h
+
+ trn1 v4.2s, v26.2s, v28.2s
+ trn2 v5.2s, v26.2s, v28.2s
+ trn1 v12.2s, v27.2s, v29.2s
+ trn2 v13.2s, v27.2s, v29.2s
+
+ trn1 v26.4h, v18.4h, v20.4h
+ trn2 v27.4h, v18.4h, v20.4h
+ trn1 v28.4h, v19.4h, v21.4h
+ trn2 v29.4h, v19.4h, v21.4h
+
+ trn1 v18.2s, v26.2s, v28.2s
+ trn2 v19.2s, v26.2s, v28.2s
+ trn1 v20.2s, v27.2s, v29.2s
+ trn2 v21.2s, v27.2s, v29.2s
+
+ trn1 v26.4h, v22.4h, v30.4h
+ trn2 v27.4h, v22.4h, v30.4h
+ trn1 v28.4h, v23.4h, v31.4h
+ trn2 v29.4h, v23.4h, v31.4h
+
+ trn1 v22.2s, v26.2s, v28.2s
+ trn2 v23.2s, v26.2s, v28.2s
+ trn1 v30.2s, v27.2s, v29.2s
+ trn2 v31.2s, v27.2s, v29.2s
+
+ trn1 v26.4h, v14.4h, v8.4h
+ trn2 v27.4h, v14.4h, v8.4h
+ trn1 v28.4h, v15.4h, v9.4h
+ trn2 v29.4h, v15.4h, v9.4h
+
+ trn1 v14.2s, v26.2s, v28.2s
+ trn2 v15.2s, v26.2s, v28.2s
+ trn1 v8.2s, v27.2s, v29.2s
+ trn2 v9.2s, v27.2s, v29.2s
+
+ mov v26.d[0],x15
+ mov v27.d[0],x16
+ mov v28.d[0],x19
+ mov v29.d[0],x20
+
+// d4 =x0 1- 4 values
+// d5 =x2 1- 4 values
+// d12=x1 1- 4 values
+// d13=x3 1- 4 values
+
+// d18 =x0 5- 8 values
+// d19 =x2 5- 8 values
+// d20=x1 5- 8 values
+// d21=x3 5- 8 values
+
+// d22 =x0 9- 12 values
+// d23 =x2 9- 12 values
+// d30=x1 9- 12 values
+// d31=x3 9- 12 values
+
+// d14 =x0 13-16 values
+// d15 =x2 13- 16 values
+// d8=x1 13- 16 values
+// d9=x3 13- 16 values
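+
+// the trn1/trn2 pairs above perform a 4x4 transpose of 16-bit elements;
+// a scalar model (illustrative) is:
+//     for(i = 0; i < 4; i++)
+//         for(j = 0; j < 4; j++)
+//             out[j][i] = in[i][j];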
+
+
+ st1 { v4.4h, v5.4h},[x1],#16
+ st1 { v12.4h, v13.4h},[x1],#16
+
+ st1 { v18.4h, v19.4h},[x1],#16
+ st1 { v20.4h, v21.4h},[x1],#16
+ st1 { v22.4h, v23.4h},[x1],#16
+ st1 { v30.4h, v31.4h},[x1],#16
+ st1 { v14.4h, v15.4h},[x1],#16
+ st1 { v8.4h, v9.4h},[x1],#16
+
+
+ subs x14,x14,#1
+ bne first_stage
+
+
+
+
+
+
+
+
+
+
+ mov x6,x7
+
+ ldp x8, x7,[sp],#16
+
+ mov x10,#16
+
+ cmp x12,x6
+ sub x20,x1,#128
+ csel x1, x20, x1,ge
+ bge label1
+
+ mov x19,#0xff00
+ cmp x12,x19
+ sub x20,x1,#256
+ csel x1, x20, x1,ge
+ bge label_2
+
+ sub x1,x1,#512
+ sub x20,x10,#0
+ neg x10, x20
+
+label_2:
+ add x9,x1,#128
+ add x11,x9,#128
+ add x0,x11,#128
+
+
+
+label1:
+// mov x6,x1
+
+
+ mov x14,#4
+ add x4,x2,x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data
+    add x5,x8,x8, lsl #1 // x5 = pred_strd * 3
+// add x0,x3,x7, lsl #1 @ x0 points to 3rd row of dest data
+// add x10,x7,x7, lsl #1 @
+
+
+
+
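+// second stage: 1d inverse transform of the transposed stage-1 output in
+// pi2_tmp, followed by prediction add and clip to 8 bits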
+second_stage:
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v6.4h, v7.4h},[x1],x10
+ cmp x12,x6
+ bge second_stage_process
+ ld1 {v4.4h, v5.4h},[x9],#16
+ ld1 {v8.4h, v9.4h},[x9],x10
+
+second_stage_process:
+
+
+ smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v7.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+ smull v12.4s, v10.4h, v0.4h[0]
+ smlal v12.4s, v11.4h, v0.4h[2]
+ smull v14.4s, v10.4h, v0.4h[0]
+ smlal v14.4s, v11.4h, v1.4h[2]
+ smull v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v2.4h[2]
+ smull v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v3.4h[2]
+
+ bge skip_last8rows_stage2_kernel1
+
+ smlal v24.4s, v8.4h, v1.4h[1]
+ smlal v26.4s, v8.4h, v3.4h[3]
+ smlsl v28.4s, v8.4h, v1.4h[3]
+ smlsl v30.4s, v8.4h, v0.4h[3]
+
+
+ smlal v24.4s, v9.4h, v1.4h[3]
+ smlsl v26.4s, v9.4h, v2.4h[3]
+ smlsl v28.4s, v9.4h, v0.4h[3]
+ smlal v30.4s, v9.4h, v3.4h[3]
+
+
+ smlal v12.4s, v4.4h, v1.4h[0]
+ smlal v12.4s, v5.4h, v1.4h[2]
+ smlal v14.4s, v4.4h, v3.4h[0]
+ smlsl v14.4s, v5.4h, v3.4h[2]
+ smlsl v16.4s, v4.4h, v3.4h[0]
+ smlsl v16.4s, v5.4h, v0.4h[2]
+ smlsl v18.4s, v4.4h, v1.4h[0]
+ smlsl v18.4s, v5.4h, v2.4h[2]
+
+ mov x19,#0xff00
+ cmp x12,x19
+ bge skip_last8rows_stage2_kernel1
+
+
+ ld1 {v10.4h, v11.4h},[x11],#16
+ ld1 {v6.4h, v7.4h},[x11],x10
+ ld1 {v4.4h, v5.4h},[x0],#16
+ ld1 {v8.4h, v9.4h},[x0],x10
+
+
+
+
+
+ smlal v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v6.4h, v1.4h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v6.4h, v0.4h[1] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v7.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+ smlal v24.4s, v8.4h, v3.4h[1]
+ smlsl v26.4s, v8.4h, v1.4h[3]
+ smlal v28.4s, v8.4h, v0.4h[1]
+ smlsl v30.4s, v8.4h, v1.4h[1]
+
+
+ smlal v24.4s, v9.4h, v3.4h[3]
+ smlsl v26.4s, v9.4h, v3.4h[1]
+ smlal v28.4s, v9.4h, v2.4h[3]
+ smlsl v30.4s, v9.4h, v2.4h[1]
+
+
+
+
+
+ smlal v12.4s, v10.4h, v0.4h[0]
+ smlal v12.4s, v11.4h, v2.4h[2]
+ smlal v12.4s, v4.4h, v3.4h[0]
+ smlal v12.4s, v5.4h, v3.4h[2]
+
+
+
+
+ smlsl v14.4s, v10.4h, v0.4h[0]
+ smlsl v14.4s, v11.4h, v0.4h[2]
+ smlsl v14.4s, v4.4h, v1.4h[0]
+ smlsl v14.4s, v5.4h, v2.4h[2]
+
+
+ smlsl v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v3.4h[2]
+ smlal v16.4s, v4.4h, v1.4h[0]
+ smlal v16.4s, v5.4h, v1.4h[2]
+
+
+ smlal v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v1.4h[2]
+ smlsl v18.4s, v4.4h, v3.4h[0]
+ smlsl v18.4s, v5.4h, v0.4h[2]
+
+
+
+
+
+
+skip_last8rows_stage2_kernel1:
+
+
+
+ add v20.4s, v12.4s , v24.4s
+ sub v22.4s, v12.4s , v24.4s
+
+ add v12.4s, v14.4s , v26.4s
+ sub v24.4s, v14.4s , v26.4s
+
+ add v14.4s, v16.4s , v28.4s
+ sub v26.4s, v16.4s , v28.4s
+
+
+ add v16.4s, v18.4s , v30.4s
+ sub v28.4s, v18.4s , v30.4s
+
+
+
+
+
+
+
+    sqrshrn v30.4h, v20.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v19.4h, v22.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 12(shift_stage2_idct)
+
+ bge skip_stage2_kernel_load
+
+    //q2,q4,q6,q7 are used
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v6.4h, v7.4h},[x1],#16
+ ld1 {v4.4h, v5.4h},[x9],#16
+ ld1 {v8.4h, v9.4h},[x9],#16
+skip_stage2_kernel_load:
+ sub x1,x1,#32
+ st1 {v30.4h, v31.4h},[x1],#16
+ st1 {v18.4h, v19.4h},[x1],#16
+ sub x1,x1,#32
+
+ smull v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v2.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v3.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v7.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+ smull v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v3.4h[2]
+ smull v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v2.4h[2]
+ smull v16.4s, v10.4h, v0.4h[0]
+ smlsl v16.4s, v11.4h, v1.4h[2]
+ smull v18.4s, v10.4h, v0.4h[0]
+ smlsl v18.4s, v11.4h, v0.4h[2]
+
+
+
+ cmp x12,x6
+ bge skip_last8rows_stage2_kernel2
+
+
+ smlsl v24.4s, v8.4h, v3.4h[1]
+ smlal v26.4s, v8.4h, v2.4h[1]
+ smlal v28.4s, v8.4h, v0.4h[1]
+ smlal v30.4s, v8.4h, v2.4h[3]
+
+
+ smlal v24.4s, v9.4h, v0.4h[1]
+ smlal v26.4s, v9.4h, v3.4h[1]
+ smlsl v28.4s, v9.4h, v1.4h[1]
+ smlsl v30.4s, v9.4h, v2.4h[1]
+
+
+
+ smlsl v22.4s, v4.4h, v1.4h[0]
+ smlal v22.4s, v5.4h, v2.4h[2]
+ smlsl v20.4s, v4.4h, v3.4h[0]
+ smlal v20.4s, v5.4h, v0.4h[2]
+ smlal v16.4s, v4.4h, v3.4h[0]
+ smlal v16.4s, v5.4h, v3.4h[2]
+ smlal v18.4s, v4.4h, v1.4h[0]
+ smlsl v18.4s, v5.4h, v1.4h[2]
+ mov x19,#0xff00
+ cmp x12,x19
+ bge skip_last8rows_stage2_kernel2
+
+ ld1 {v10.4h, v11.4h},[x11],#16
+ ld1 {v6.4h, v7.4h},[x11],#16
+ ld1 {v4.4h, v5.4h},[x0],#16
+ ld1 {v8.4h, v9.4h},[x0],#16
+
+ smlsl v24.4s, v6.4h, v3.4h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v6.4h, v2.4h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+ smlal v24.4s, v8.4h, v2.4h[3]
+ smlal v26.4s, v8.4h, v3.4h[3]
+ smlsl v28.4s, v8.4h, v2.4h[1]
+ smlal v30.4s, v8.4h, v0.4h[3]
+
+
+ smlal v24.4s, v9.4h, v1.4h[3]
+ smlsl v26.4s, v9.4h, v1.4h[1]
+ smlal v28.4s, v9.4h, v0.4h[3]
+ smlsl v30.4s, v9.4h, v0.4h[1]
+
+
+
+
+ smlal v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v1.4h[2]
+ smlsl v22.4s, v4.4h, v3.4h[0]
+ smlal v22.4s, v5.4h, v0.4h[2]
+
+
+
+ smlsl v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v3.4h[2]
+ smlal v20.4s, v4.4h, v1.4h[0]
+ smlsl v20.4s, v5.4h, v1.4h[2]
+
+
+ smlsl v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v0.4h[2]
+ smlsl v16.4s, v4.4h, v1.4h[0]
+ smlal v16.4s, v5.4h, v2.4h[2]
+
+
+
+ smlal v18.4s, v10.4h, v0.4h[0]
+ smlsl v18.4s, v11.4h, v2.4h[2]
+ smlal v18.4s, v4.4h, v3.4h[0]
+ smlsl v18.4s, v5.4h, v3.4h[2]
+
+
+skip_last8rows_stage2_kernel2:
+
+
+
+ add v4.4s, v22.4s , v24.4s
+ sub v22.4s, v22.4s , v24.4s
+
+ add v6.4s, v20.4s , v26.4s
+ sub v24.4s, v20.4s , v26.4s
+
+ add v10.4s, v16.4s , v28.4s
+ sub v26.4s, v16.4s , v28.4s
+
+
+ add v16.4s, v18.4s , v30.4s
+ sub v28.4s, v18.4s , v30.4s
+
+
+    sqrshrn v18.4h, v4.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v31.4h, v22.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v30.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v20.4h, v6.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v23.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v21.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v22.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 12(shift_stage2_idct)
+
+ ld1 {v4.4h, v5.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],#16
+
+
+
+ // registers used: {q2,q4,q6,q7}, {q9,q15,q10,q11}
+
+//d4=x0
+//d12=x1
+//d5=x2
+//d13=x3
+
+//d18=x4
+//d20=x5
+//d19=x6
+//d21=x7
+
+//d22=x8
+//d30=x9
+//d23=x10
+//d31=x11
+
+//d14=x12
+//d8=x13
+//d15=x14
+//d9=x15
+
+ umov x15,v26.d[0]
+ umov x16,v27.d[0]
+ umov x19,v28.d[0]
+ umov x20,v29.d[0]
+
+ trn1 v26.4h, v4.4h, v12.4h
+ trn2 v27.4h, v4.4h, v12.4h
+ trn1 v28.4h, v5.4h, v13.4h
+ trn2 v29.4h, v5.4h, v13.4h
+
+ trn1 v4.2s, v26.2s, v28.2s
+ trn2 v5.2s, v26.2s, v28.2s
+ trn1 v12.2s, v27.2s, v29.2s
+ trn2 v13.2s, v27.2s, v29.2s
+
+ trn1 v26.4h, v18.4h, v20.4h
+ trn2 v27.4h, v18.4h, v20.4h
+ trn1 v28.4h, v19.4h, v21.4h
+ trn2 v29.4h, v19.4h, v21.4h
+
+ trn1 v18.2s, v26.2s, v28.2s
+ trn2 v19.2s, v26.2s, v28.2s
+ trn1 v20.2s, v27.2s, v29.2s
+ trn2 v21.2s, v27.2s, v29.2s
+
+ trn1 v26.4h, v22.4h, v30.4h
+ trn2 v27.4h, v22.4h, v30.4h
+ trn1 v28.4h, v23.4h, v31.4h
+ trn2 v29.4h, v23.4h, v31.4h
+
+ trn1 v22.2s, v26.2s, v28.2s
+ trn2 v23.2s, v26.2s, v28.2s
+ trn1 v30.2s, v27.2s, v29.2s
+ trn2 v31.2s, v27.2s, v29.2s
+
+ trn1 v26.4h, v14.4h, v8.4h
+ trn2 v27.4h, v14.4h, v8.4h
+ trn1 v28.4h, v15.4h, v9.4h
+ trn2 v29.4h, v15.4h, v9.4h
+
+ trn1 v14.2s, v26.2s, v28.2s
+ trn2 v15.2s, v26.2s, v28.2s
+ trn1 v8.2s, v27.2s, v29.2s
+ trn2 v9.2s, v27.2s, v29.2s
+
+ mov v26.d[0],x15
+ mov v27.d[0],x16
+ mov v28.d[0],x19
+ mov v29.d[0],x20
+
+// d4 =x0 1- 4 values
+// d5 =x2 1- 4 values
+// d12=x1 1- 4 values
+// d13=x3 1- 4 values
+
+// d18 =x0 5- 8 values
+// d19 =x2 5- 8 values
+// d20=x1 5- 8 values
+// d21=x3 5- 8 values
+
+// d22 =x0 9- 12 values
+// d23 =x2 9- 12 values
+// d30=x1 9- 12 values
+// d31=x3 9- 12 values
+
+// d14 =x0 13-16 values
+// d15 =x2 13- 16 values
+// d8=x1 13- 16 values
+// d9=x3 13- 16 values
+
+    // swapping v5 and v18
+ mov v5.d[1],v5.d[0]
+ mov v5.d[0],v18.d[0]
+ mov v18.d[0],v5.d[1]
+ // swapping v23 and v14
+ mov v23.d[1],v23.d[0]
+ mov v23.d[0],v14.d[0]
+ mov v14.d[0],v23.d[1]
+ // swapping v13 and v20
+ mov v13.d[1],v13.d[0]
+ mov v13.d[0],v20.d[0]
+ mov v20.d[0],v13.d[1]
+ // swapping v31 and v8
+ mov v31.d[1],v31.d[0]
+ mov v31.d[0],v8.d[0]
+ mov v8.d[0],v31.d[1]
+
+// q2: x0 1-8 values
+// q11: x0 9-16 values
+// q9 : x2 1-8 values
+// q7 : x2 9-16 values
+// q6 : x1 1- 8 values
+// q10: x3 1-8 values
+// q15: x1 9-16 values
+// q4: x3 9-16 values
+
+
+// registers free: q8,q14,q12,q13
+
+
+ ld1 {v16.8b, v17.8b},[x2],x8
+ ld1 {v28.8b, v29.8b},[x2],x5
+ ld1 {v24.8b, v25.8b},[x4],x8
+ ld1 {v26.8b, v27.8b},[x4],x5
+
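+    // pack the narrowed 4h halves into full 8h vectors before the
+    // prediction add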
+ mov v4.d[1] ,v5.d[0]
+ mov v22.d[1] ,v23.d[0]
+ mov v12.d[1] ,v13.d[0]
+ mov v30.d[1] ,v31.d[0]
+ mov v18.d[1] ,v19.d[0]
+ mov v14.d[1] ,v15.d[0]
+ mov v20.d[1] ,v21.d[0]
+ mov v8.d[1] ,v9.d[0]
+
+ uaddw v4.8h, v4.8h , v16.8b
+ uaddw v22.8h, v22.8h , v17.8b
+ uaddw v12.8h, v12.8h , v28.8b
+ uaddw v30.8h, v30.8h , v29.8b
+ uaddw v18.8h, v18.8h , v24.8b
+ uaddw v14.8h, v14.8h , v25.8b
+ uaddw v20.8h, v20.8h , v26.8b
+ uaddw v8.8h, v8.8h , v27.8b
+
+
+ sqxtun v16.8b, v4.8h
+ sqxtun v17.8b, v22.8h
+ sqxtun v28.8b, v12.8h
+ sqxtun v29.8b, v30.8h
+ sqxtun v24.8b, v18.8h
+ sqxtun v25.8b, v14.8h
+ sqxtun v26.8b, v20.8h
+ sqxtun v27.8b, v8.8h
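+
+    // uaddw widens the predictor bytes and adds them to the 16-bit residue;
+    // sqxtun narrows back with unsigned saturation, i.e. per pixel (sketch):
+    //     dst = (uword8)clip3(0, 255, resid + pred);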
+
+
+
+ st1 {v16.8b, v17.8b},[x3],x7
+ st1 {v28.8b, v29.8b},[x3],x7
+ st1 {v24.8b, v25.8b},[x3],x7
+ st1 {v26.8b, v27.8b},[x3],x7
+
+ subs x14,x14,#1
+
+
+
+ bne second_stage
+
+
+// sub sp,sp,#40
+ // ldmfd sp!,{x4-x12,pc}
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
+
+
+
+