Diffstat (limited to 'common/arm64/ihevc_itrans_recon_16x16.s')
-rw-r--r-- | common/arm64/ihevc_itrans_recon_16x16.s | 1240 |
1 files changed, 1240 insertions, 0 deletions
diff --git a/common/arm64/ihevc_itrans_recon_16x16.s b/common/arm64/ihevc_itrans_recon_16x16.s
new file mode 100644
index 0000000..90df840
--- /dev/null
+++ b/common/arm64/ihevc_itrans_recon_16x16.s
@@ -0,0 +1,1240 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+// *******************************************************************************
+// * @file
+// *  ihevc_itrans_recon_16x16.s
+// *
+// * @brief
+// *  contains function definitions for single stage inverse transform
+// *
+// * @author
+// *  anand s
+// *
+// * @par list of functions:
+// *  - ihevc_itrans_recon_16x16()
+// *
+// * @remarks
+// *  none
+// *
+// *******************************************************************************
+//*/
+
+///**
+// *******************************************************************************
+// *
+// * @brief
+// *  this function performs inverse transform and reconstruction for a 16x16
+// *  input block
+// *
+// * @par description:
+// *  performs inverse transform and adds the prediction data and clips output
+// *  to 8 bit
+// *
+// * @param[in] pi2_src
+// *  input 16x16 coefficients
+// *
+// * @param[in] pi2_tmp
+// *  temporary 16x16 buffer for storing inverse transform 1st stage output
+// *
+// * @param[in] pu1_pred
+// *  prediction 16x16 block
+// *
+// * @param[out] pu1_dst
+// *  output 16x16 block
+// *
+// * @param[in] src_strd
+// *  input stride
+// *
+// * @param[in] pred_strd
+// *  prediction stride
+// *
+// * @param[in] dst_strd
+// *  output stride
+// *
+// * @param[in] shift
+// *  output shift
+// *
+// * @param[in] x12
+// *  zero columns in pi2_src
+// *
+// * @returns void
+// *
+// * @remarks
+// *  none
+// *
+// *******************************************************************************
+// */
+
+//void ihevc_itrans_recon_16x16(word16 *pi2_src,
+//                              word16 *pi2_tmp,
+//                              uword8 *pu1_pred,
+//                              uword8 *pu1_dst,
+//                              word32 src_strd,
+//                              word32 pred_strd,
+//                              word32 dst_strd,
+//                              word32 x12,
+//                              word32 x11)
+
+//**************variables vs registers*************************
+//    x0 => *pi2_src
+//    x1 => *pi2_tmp
+//    x2 => *pu1_pred
+//    x3 => *pu1_dst
+//    src_strd
+//    pred_strd
+//    dst_strd
+//    x12
+//    x11
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.set shift_stage1_idct , 7
+.set shift_stage2_idct , 12
+//#define zero_cols  x12
+//#define zero_rows  x11
+.globl ihevc_itrans_recon_16x16_av8
+
+.extern g_ai2_ihevc_trans_16_transpose
+
+.type ihevc_itrans_recon_16x16_av8, %function
+
+ihevc_itrans_recon_16x16_av8:
+
+    ldr w11, [sp]
+    // stmfd sp!,{x4-x12,x14}
+    push_v_regs
+    stp x19, x20,[sp,#-16]!
+    stp x5, x6,[sp,#-16]!
+// add sp,sp,#40
+
+// ldr x8,[sp,#4]  @ prediction stride
+// ldr x7,[sp,#8]  @ destination stride
+    mov x6, x4 // src stride
+    mov x12, x7
+
+    adrp x14, :got:g_ai2_ihevc_trans_16_transpose
+    ldr x14, [x14, #:got_lo12:g_ai2_ihevc_trans_16_transpose]
+    ld1 {v0.4h, v1.4h, v2.4h, v3.4h},[x14] ////d0,d1 are used for storing the constant data
+    mov x7,#0xffff
+    and x12,x12,x7
+    and x11,x11,x7
+    lsl x6, x6, #1 // x sizeof(word16)
+    add x9,x0,x6, lsl #1 // 2 rows
+
+    add x10,x6,x6, lsl #1 // 3 rows
+    add x5,x6,x6,lsl #2
+    mov x7,#0xfff0
+
+    cmp x12,x7
+    bge zero_12cols_decision
+
+    mov x19,#0xff00
+    cmp x12,x19
+    bge zero_8cols_decision
+
+    mov x14,#4
+    cmp x11,x7
+    sub x20,x6,#0
+    neg x20, x20
+    csel x10,x20,x10,ge
+
+    mov x19,#0xff00
+    cmp x11,x19
+    csel x8, x5, x8,ge
+    sub x20,x8,#0
+    neg x20, x20
+    csel x8,x20,x8,ge
+    csel x8, x10, x8,lt
+    add x5,x5,x6,lsl #3
+    sub x20,x5,#0
+    neg x5, x20
+
+    b first_stage_top_four_bottom_four
+
+zero_12cols_decision:
+    mov x14,#1
+    mov x19,#0xff00
+    cmp x11,x19
+    csel x8, x5, x8,ge
+    csel x8, x10, x8,lt
+    add x5,x5,x6,lsl #3
+    sub x20,x5,#0
+    neg x5, x20
+
+    b first_stage_top_four_bottom_four
+
+zero_8cols_decision:
+    mov x14,#2
+    mov x8,x5
+    sub x20,x8,#0
+    neg x8, x20
+    mov x19,#0xff00
+    cmp x11,x19
+    csel x8, x10, x8,lt
+    add x5,x5,x6,lsl #3
+    sub x20,x5,#0
+    neg x5, x20
+    cmp x11,x7
+    sub x20,x6,#0
+    neg x20, x20
+    csel x10,x20,x10,ge
+
+    b first_stage_top_four_bottom_four
+
+
+//d0[0]= 64  d2[0]=64
+//d0[1]= 90  d2[1]=57
+//d0[2]= 89  d2[2]=50
+//d0[3]= 87  d2[3]=43
+//d1[0]= 83  d3[0]=36
+//d1[1]= 80  d3[1]=25
+//d1[2]= 75  d3[2]=18
+//d1[3]= 70  d3[3]=9
+
+first_stage:
+    add x0,x0,#8
+    add x9,x9,#8
+
+first_stage_top_four_bottom_four:
+
+    ld1 {v10.4h},[x0],x6
+    ld1 {v11.4h},[x9],x6
+    ld1 {v6.4h},[x0],x10
+    ld1 {v7.4h},[x9],x10
+    cmp x11,x7
+    bge skip_load4rows
+
+    ld1 {v4.4h},[x0],x6
+    ld1 {v5.4h},[x9],x6
+    ld1 {v8.4h},[x0],x8
+    ld1 {v9.4h},[x9],x8
+
+// registers used: q0,q1,q3,q5,q2,q4
+
+// d10 =x0
+//d6= x1
+//d11=x2
+//d7=x3
+
+skip_load4rows:
+    smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+    smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+    smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+    smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+    smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+    smlal v26.4s, v7.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl v30.4s, v7.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+
+    smull v12.4s, v10.4h, v0.4h[0]
+    smlal v12.4s, v11.4h, v0.4h[2]
+    smull v14.4s, v10.4h, v0.4h[0]
+    smlal v14.4s, v11.4h, v1.4h[2]
+    smull v16.4s, v10.4h, v0.4h[0]
+    smlal v16.4s, v11.4h, v2.4h[2]
+    smull v18.4s, v10.4h, v0.4h[0]
+    smlal v18.4s, v11.4h, v3.4h[2]
+
+    bge skip_last12rows_kernel1
+
+    smlal v24.4s, v8.4h, v1.4h[1]
+    smlal v26.4s, v8.4h, v3.4h[3]
+    smlsl v28.4s, v8.4h, v1.4h[3]
+    smlsl v30.4s, v8.4h, v0.4h[3]
+
+    smlal v24.4s, v9.4h, v1.4h[3]
+    smlsl v26.4s, v9.4h, v2.4h[3]
+    smlsl v28.4s, v9.4h, v0.4h[3]
+    smlal v30.4s, v9.4h, v3.4h[3]
+
+    smlal v12.4s, v4.4h, v1.4h[0]
+    smlal v12.4s, v5.4h, v1.4h[2]
+    smlal v14.4s, v4.4h, v3.4h[0]
+    smlsl v14.4s, v5.4h, v3.4h[2]
+    smlsl v16.4s, v4.4h, v3.4h[0]
+    smlsl v16.4s, v5.4h, v0.4h[2]
+    smlsl v18.4s, v4.4h, v1.4h[0]
+    smlsl v18.4s, v5.4h, v2.4h[2]
+
+//d0[0]= 64  d2[0]=64
+//d0[1]= 90  d2[1]=57
+//d0[2]= 89  d2[2]=50
+//d0[3]= 87  d2[3]=43
+//d1[0]= 83  d3[0]=36
+//d1[1]= 80  d3[1]=25
+//d1[2]= 75  d3[2]=18
+//d1[3]= 70  d3[3]=9
+    mov x19,#0xff00
+    cmp x11,x19
+    bge skip_last12rows_kernel1
+
+    ld1 {v10.4h},[x0],x6
+    ld1 {v11.4h},[x9],x6
+    ld1 {v6.4h},[x0],x10
+    ld1 {v7.4h},[x9],x10
+    ld1 {v4.4h},[x0],x6
+    ld1 {v5.4h},[x9],x6
+    ld1 {v8.4h},[x0],x5
+    ld1 {v9.4h},[x9],x5
+
+    smlal v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
+    smlsl v26.4s, v6.4h, v1.4h[1] //// y1 * cos3(part of b1)
+    smlsl v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
+    smlal v30.4s, v6.4h, v0.4h[1] //// y1 * sin1(part of b3)
+
+    smlal v24.4s, v7.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal v28.4s, v7.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+    smlal v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+    smlal v24.4s, v8.4h, v3.4h[1]
+    smlsl v26.4s, v8.4h, v1.4h[3]
+    smlal v28.4s, v8.4h, v0.4h[1]
+    smlsl v30.4s, v8.4h, v1.4h[1]
+
+    smlal v24.4s, v9.4h, v3.4h[3]
+    smlsl v26.4s, v9.4h, v3.4h[1]
+    smlal v28.4s, v9.4h, v2.4h[3]
+    smlsl v30.4s, v9.4h, v2.4h[1]
+
+    smlal v12.4s, v10.4h, v0.4h[0]
+    smlal v12.4s, v11.4h, v2.4h[2]
+    smlal v12.4s, v4.4h, v3.4h[0]
+    smlal v12.4s, v5.4h, v3.4h[2]
+
+    smlsl v14.4s, v10.4h, v0.4h[0]
+    smlsl v14.4s, v11.4h, v0.4h[2]
+    smlsl v14.4s, v4.4h, v1.4h[0]
+    smlsl v14.4s, v5.4h, v2.4h[2]
+
+    smlsl v16.4s, v10.4h, v0.4h[0]
+    smlal v16.4s, v11.4h, v3.4h[2]
+    smlal v16.4s, v4.4h, v1.4h[0]
+    smlal v16.4s, v5.4h, v1.4h[2]
+
+    smlal v18.4s, v10.4h, v0.4h[0]
+    smlal v18.4s, v11.4h, v1.4h[2]
+    smlsl v18.4s, v4.4h, v3.4h[0]
+    smlsl v18.4s, v5.4h, v0.4h[2]
+
+skip_last12rows_kernel1:
+    add v20.4s, v12.4s , v24.4s
+    sub v22.4s, v12.4s , v24.4s
+
+    add v12.4s, v14.4s , v26.4s
+    sub v24.4s, v14.4s , v26.4s
+
+    add v14.4s, v16.4s , v28.4s
+    sub v26.4s, v16.4s , v28.4s
+
+    add v16.4s, v18.4s , v30.4s
+    sub v28.4s, v18.4s , v30.4s
+
+    sqrshrn v30.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn v19.4h, v22.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+    st1 {v30.4h, v31.4h},[x1],#16
+    st1 {v18.4h, v19.4h},[x1],#16
+    sub x1,x1,#32
+
+    bge skip_stage1_kernel_load
+
+first_stage_middle_eight:
+
+    ld1 {v10.4h},[x0],x6
+    ld1 {v11.4h},[x9],x6
+    ld1 {v6.4h},[x0],x10
+    ld1 {v7.4h},[x9],x10
+    ld1 {v4.4h},[x0],x6
+    ld1 {v5.4h},[x9],x6
+    ld1 {v8.4h},[x0],x8
+    ld1 {v9.4h},[x9],x8
+
+skip_stage1_kernel_load:
+    smull v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
+    smull v26.4s, v6.4h, v2.4h[3] //// y1 * cos3(part of b1)
+    smull v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
+    smull v30.4s, v6.4h, v3.4h[3] //// y1 * sin1(part of b3)
+
+    smlsl v24.4s, v7.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl v28.4s, v7.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+    smull v22.4s, v10.4h, v0.4h[0]
+    smlsl v22.4s, v11.4h, v3.4h[2]
+    smull v20.4s, v10.4h, v0.4h[0]
+    smlsl v20.4s, v11.4h, v2.4h[2]
+    smull v16.4s, v10.4h, v0.4h[0]
+    smlsl v16.4s, v11.4h, v1.4h[2]
+    smull v18.4s, v10.4h, v0.4h[0]
+    smlsl v18.4s, v11.4h, v0.4h[2]
+
+    cmp x11,x7
+    bge skip_last12rows_kernel2
+
+    smlsl v24.4s, v8.4h, v3.4h[1]
+    smlal v26.4s, v8.4h, v2.4h[1]
+    smlal v28.4s, v8.4h, v0.4h[1]
+    smlal v30.4s, v8.4h, v2.4h[3]
+
+    smlal v24.4s, v9.4h, v0.4h[1]
+    smlal v26.4s, v9.4h, v3.4h[1]
+    smlsl v28.4s, v9.4h, v1.4h[1]
+    smlsl v30.4s, v9.4h, v2.4h[1]
+
+    smlsl v22.4s, v4.4h, v1.4h[0]
+    smlal v22.4s, v5.4h, v2.4h[2]
+    smlsl v20.4s, v4.4h, v3.4h[0]
+    smlal v20.4s, v5.4h, v0.4h[2]
+    smlal v16.4s, v4.4h, v3.4h[0]
+    smlal v16.4s, v5.4h, v3.4h[2]
+    smlal v18.4s, v4.4h, v1.4h[0]
+    smlsl v18.4s, v5.4h, v1.4h[2]
+
+//d0[0]= 64  d2[0]=64
+//d0[1]= 90  d2[1]=57
+//d0[2]= 89  d2[2]=50
+//d0[3]= 87  d2[3]=43
+//d1[0]= 83  d3[0]=36
+//d1[1]= 80  d3[1]=25
+//d1[2]= 75  d3[2]=18
+//d1[3]= 70  d3[3]=9
+    mov x19,#0xff00
+    cmp x11,x19
+    bge skip_last12rows_kernel2
+
+    ld1 {v10.4h},[x0],x6
+    ld1 {v11.4h},[x9],x6
+    ld1 {v6.4h},[x0],x10
+    ld1 {v7.4h},[x9],x10
+    ld1 {v4.4h},[x0],x6
+    ld1 {v5.4h},[x9],x6
+    ld1 {v8.4h},[x0],x5
+    ld1 {v9.4h},[x9],x5
+
+    smlsl v24.4s, v6.4h, v3.4h[3] //// y1 * cos1(part of b0)
+    smlsl v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+    smlal v28.4s, v6.4h, v2.4h[3] //// y1 * sin3(part of b2)
+    smlal v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+    smlsl v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+    smlal v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+    smlal v24.4s, v8.4h, v2.4h[3]
+    smlal v26.4s, v8.4h, v3.4h[3]
+    smlsl v28.4s, v8.4h, v2.4h[1]
+    smlal v30.4s, v8.4h, v0.4h[3]
+
+    smlal v24.4s, v9.4h, v1.4h[3]
+    smlsl v26.4s, v9.4h, v1.4h[1]
+    smlal v28.4s, v9.4h, v0.4h[3]
+    smlsl v30.4s, v9.4h, v0.4h[1]
+
+    smlal v22.4s, v10.4h, v0.4h[0]
+    smlsl v22.4s, v11.4h, v1.4h[2]
+    smlsl v22.4s, v4.4h, v3.4h[0]
+    smlal v22.4s, v5.4h, v0.4h[2]
+
+    smlsl v20.4s, v10.4h, v0.4h[0]
+    smlsl v20.4s, v11.4h, v3.4h[2]
+    smlal v20.4s, v4.4h, v1.4h[0]
+    smlsl v20.4s, v5.4h, v1.4h[2]
+
+    smlsl v16.4s, v10.4h, v0.4h[0]
+    smlal v16.4s, v11.4h, v0.4h[2]
+    smlsl v16.4s, v4.4h, v1.4h[0]
+    smlal v16.4s, v5.4h, v2.4h[2]
+
+    smlal v18.4s, v10.4h, v0.4h[0]
+    smlsl v18.4s, v11.4h, v2.4h[2]
+    smlal v18.4s, v4.4h, v3.4h[0]
+    smlsl v18.4s, v5.4h, v3.4h[2]
+
+skip_last12rows_kernel2:
+
+    add v4.4s, v22.4s , v24.4s
+    sub v22.4s, v22.4s , v24.4s
+
+    add v6.4s, v20.4s , v26.4s
+    sub v24.4s, v20.4s , v26.4s
+
+    add v10.4s, v16.4s , v28.4s
+    sub v26.4s, v16.4s , v28.4s
+
+    add v16.4s, v18.4s , v30.4s
+    sub v28.4s, v18.4s , v30.4s
+
+    sqrshrn v18.4h, v4.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn v31.4h, v22.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn v30.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn v20.4h, v6.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn v23.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn v21.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+    sqrshrn v22.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+    // registers used: {q2,q4,q6,q7}, {q9,q15,q10,q11}
+
+    ld1 {v4.4h, v5.4h},[x1],#16
+    ld1 {v8.4h, v9.4h},[x1],#16
+    sub x1,x1,#32
+
+//d4=x0
+//d12=x1
+//d5=x2
+//d13=x3
+
+//d18=x4
+//d20=x5
+//d19=x6
+//d21=x7
+
+//d22=x8
+//d30=x9
+//d23=x10
+//d31=x11
+
+//d14=x12
+//d8=x13
+//d15=x14
+//d9=x15
+
+    umov x15,v26.d[0]
+    umov x16,v27.d[0]
+    umov x19,v28.d[0]
+    umov x20,v29.d[0]
+
+    trn1 v26.4h, v4.4h, v12.4h
+    trn2 v27.4h, v4.4h, v12.4h
+    trn1 v28.4h, v5.4h, v13.4h
+    trn2 v29.4h, v5.4h, v13.4h
+
+    trn1 v4.2s, v26.2s, v28.2s
+    trn2 v5.2s, v26.2s, v28.2s
+    trn1 v12.2s, v27.2s, v29.2s
+    trn2 v13.2s, v27.2s, v29.2s
+
+    trn1 v26.4h, v18.4h, v20.4h
+    trn2 v27.4h, v18.4h, v20.4h
+    trn1 v28.4h, v19.4h, v21.4h
+    trn2 v29.4h, v19.4h, v21.4h
+
+    trn1 v18.2s, v26.2s, v28.2s
+    trn2 v19.2s, v26.2s, v28.2s
+    trn1 v20.2s, v27.2s, v29.2s
+    trn2 v21.2s, v27.2s, v29.2s
+
+    trn1 v26.4h, v22.4h, v30.4h
+    trn2 v27.4h, v22.4h, v30.4h
+    trn1 v28.4h, v23.4h, v31.4h
+    trn2 v29.4h, v23.4h, v31.4h
+
+    trn1 v22.2s, v26.2s, v28.2s
+    trn2 v23.2s, v26.2s, v28.2s
+    trn1 v30.2s, v27.2s, v29.2s
+    trn2 v31.2s, v27.2s, v29.2s
+
+    trn1 v26.4h, v14.4h, v8.4h
+    trn2 v27.4h, v14.4h, v8.4h
+    trn1 v28.4h, v15.4h, v9.4h
+    trn2 v29.4h, v15.4h, v9.4h
+
+    trn1 v14.2s, v26.2s, v28.2s
+    trn2 v15.2s, v26.2s, v28.2s
+    trn1 v8.2s, v27.2s, v29.2s
+    trn2 v9.2s, v27.2s, v29.2s
+
+    mov v26.d[0],x15
+    mov v27.d[0],x16
+    mov v28.d[0],x19
+    mov v29.d[0],x20
+
+// d4 =x0 1- 4 values
+// d5 =x2 1- 4 values
+// d12=x1 1- 4 values
+// d13=x3 1- 4 values
+
+// d18 =x0 5- 8 values
+// d19 =x2 5- 8 values
+// d20=x1 5- 8 values
+// d21=x3 5- 8 values
+
+// d22 =x0 9- 12 values
+// d23 =x2 9- 12 values
+// d30=x1 9- 12 values
+// d31=x3 9- 12 values
+
+// d14 =x0 13-16 values
+// d15 =x2 13- 16 values
+// d8=x1 13- 16 values
+// d9=x3 13- 16 values
+
+    st1 { v4.4h, v5.4h},[x1],#16
+    st1 { v12.4h, v13.4h},[x1],#16
+    st1 { v18.4h, v19.4h},[x1],#16
+    st1 { v20.4h, v21.4h},[x1],#16
+    st1 { v22.4h, v23.4h},[x1],#16
+    st1 { v30.4h, v31.4h},[x1],#16
+    st1 { v14.4h, v15.4h},[x1],#16
+    st1 { v8.4h, v9.4h},[x1],#16
+
+    subs x14,x14,#1
+    bne first_stage
+
+    mov x6,x7
+
+    ldp x8, x7,[sp],#16
+
+    mov x10,#16
+
+    cmp x12,x6
+    sub x20,x1,#128
+    csel x1, x20, x1,ge
+    bge label1
+
+    mov x19,#0xff00
+    cmp x12,x19
+    sub x20,x1,#256
+    csel x1, x20, x1,ge
+    bge label_2
+
+    sub x1,x1,#512
+    sub x20,x10,#0
+    neg x10, x20
+
+label_2:
+    add x9,x1,#128
+    add x11,x9,#128
+    add x0,x11,#128
+
+label1:
+// mov x6,x1
+
+    mov x14,#4
+    add x4,x2,x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data
+    add x5,x8,x8, lsl #1 //
+// add x0,x3,x7, lsl #1 @ x0 points to 3rd row of dest data
+// add x10,x7,x7, lsl #1 @
+
+second_stage:
+    ld1 {v10.4h, v11.4h},[x1],#16
+    ld1 {v6.4h, v7.4h},[x1],x10
+    cmp x12,x6
+    bge second_stage_process
+    ld1 {v4.4h, v5.4h},[x9],#16
+    ld1 {v8.4h, v9.4h},[x9],x10
+
+second_stage_process:
+
+    smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+    smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+    smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+    smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+    smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+    smlal v26.4s, v7.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl v30.4s, v7.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+
+    smull v12.4s, v10.4h, v0.4h[0]
+    smlal v12.4s, v11.4h, v0.4h[2]
+    smull v14.4s, v10.4h, v0.4h[0]
+    smlal v14.4s, v11.4h, v1.4h[2]
+    smull v16.4s, v10.4h, v0.4h[0]
+    smlal v16.4s, v11.4h, v2.4h[2]
+    smull v18.4s, v10.4h, v0.4h[0]
+    smlal v18.4s, v11.4h, v3.4h[2]
+
+    bge skip_last8rows_stage2_kernel1
+
+    smlal v24.4s, v8.4h, v1.4h[1]
+    smlal v26.4s, v8.4h, v3.4h[3]
+    smlsl v28.4s, v8.4h, v1.4h[3]
+    smlsl v30.4s, v8.4h, v0.4h[3]
+
+    smlal v24.4s, v9.4h, v1.4h[3]
+    smlsl v26.4s, v9.4h, v2.4h[3]
+    smlsl v28.4s, v9.4h, v0.4h[3]
+    smlal v30.4s, v9.4h, v3.4h[3]
+
+    smlal v12.4s, v4.4h, v1.4h[0]
+    smlal v12.4s, v5.4h, v1.4h[2]
+    smlal v14.4s, v4.4h, v3.4h[0]
+    smlsl v14.4s, v5.4h, v3.4h[2]
+    smlsl v16.4s, v4.4h, v3.4h[0]
+    smlsl v16.4s, v5.4h, v0.4h[2]
+    smlsl v18.4s, v4.4h, v1.4h[0]
+    smlsl v18.4s, v5.4h, v2.4h[2]
+
+    mov x19,#0xff00
+    cmp x12,x19
+    bge skip_last8rows_stage2_kernel1
+
+    ld1 {v10.4h, v11.4h},[x11],#16
+    ld1 {v6.4h, v7.4h},[x11],x10
+    ld1 {v4.4h, v5.4h},[x0],#16
+    ld1 {v8.4h, v9.4h},[x0],x10
+
+    smlal v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
+    smlsl v26.4s, v6.4h, v1.4h[1] //// y1 * cos3(part of b1)
+    smlsl v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
+    smlal v30.4s, v6.4h, v0.4h[1] //// y1 * sin1(part of b3)
+
+    smlal v24.4s, v7.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal v28.4s, v7.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+    smlal v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+    smlal v24.4s, v8.4h, v3.4h[1]
+    smlsl v26.4s, v8.4h, v1.4h[3]
+    smlal v28.4s, v8.4h, v0.4h[1]
+    smlsl v30.4s, v8.4h, v1.4h[1]
+
+    smlal v24.4s, v9.4h, v3.4h[3]
+    smlsl v26.4s, v9.4h, v3.4h[1]
+    smlal v28.4s, v9.4h, v2.4h[3]
+    smlsl v30.4s, v9.4h, v2.4h[1]
+
+    smlal v12.4s, v10.4h, v0.4h[0]
+    smlal v12.4s, v11.4h, v2.4h[2]
+    smlal v12.4s, v4.4h, v3.4h[0]
+    smlal v12.4s, v5.4h, v3.4h[2]
+
+    smlsl v14.4s, v10.4h, v0.4h[0]
+    smlsl v14.4s, v11.4h, v0.4h[2]
+    smlsl v14.4s, v4.4h, v1.4h[0]
+    smlsl v14.4s, v5.4h, v2.4h[2]
+
+    smlsl v16.4s, v10.4h, v0.4h[0]
+    smlal v16.4s, v11.4h, v3.4h[2]
+    smlal v16.4s, v4.4h, v1.4h[0]
+    smlal v16.4s, v5.4h, v1.4h[2]
+
+    smlal v18.4s, v10.4h, v0.4h[0]
+    smlal v18.4s, v11.4h, v1.4h[2]
+    smlsl v18.4s, v4.4h, v3.4h[0]
+    smlsl v18.4s, v5.4h, v0.4h[2]
+
+skip_last8rows_stage2_kernel1:
+
+    add v20.4s, v12.4s , v24.4s
+    sub v22.4s, v12.4s , v24.4s
+
+    add v12.4s, v14.4s , v26.4s
+    sub v24.4s, v14.4s , v26.4s
+
+    add v14.4s, v16.4s , v28.4s
+    sub v26.4s, v16.4s , v28.4s
+
+    add v16.4s, v18.4s , v30.4s
+    sub v28.4s, v18.4s , v30.4s
+
+    sqrshrn v30.4h, v20.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v19.4h, v22.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 12(shift_stage2_idct)
+
+    bge skip_stage2_kernel_load
+
+    //q2,q4,q6,q7 is used
+    ld1 {v10.4h, v11.4h},[x1],#16
+    ld1 {v6.4h, v7.4h},[x1],#16
+    ld1 {v4.4h, v5.4h},[x9],#16
+    ld1 {v8.4h, v9.4h},[x9],#16
+skip_stage2_kernel_load:
+    sub x1,x1,#32
+    st1 {v30.4h, v31.4h},[x1],#16
+    st1 {v18.4h, v19.4h},[x1],#16
+    sub x1,x1,#32
+
+    smull v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
+    smull v26.4s, v6.4h, v2.4h[3] //// y1 * cos3(part of b1)
+    smull v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
+    smull v30.4s, v6.4h, v3.4h[3] //// y1 * sin1(part of b3)
+
+    smlsl v24.4s, v7.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+    smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+    smlsl v28.4s, v7.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+    smull v22.4s, v10.4h, v0.4h[0]
+    smlsl v22.4s, v11.4h, v3.4h[2]
+    smull v20.4s, v10.4h, v0.4h[0]
+    smlsl v20.4s, v11.4h, v2.4h[2]
+    smull v16.4s, v10.4h, v0.4h[0]
+    smlsl v16.4s, v11.4h, v1.4h[2]
+    smull v18.4s, v10.4h, v0.4h[0]
+    smlsl v18.4s, v11.4h, v0.4h[2]
+
+    cmp x12,x6
+    bge skip_last8rows_stage2_kernel2
+
+    smlsl v24.4s, v8.4h, v3.4h[1]
+    smlal v26.4s, v8.4h, v2.4h[1]
+    smlal v28.4s, v8.4h, v0.4h[1]
+    smlal v30.4s, v8.4h, v2.4h[3]
+
+    smlal v24.4s, v9.4h, v0.4h[1]
+    smlal v26.4s, v9.4h, v3.4h[1]
+    smlsl v28.4s, v9.4h, v1.4h[1]
+    smlsl v30.4s, v9.4h, v2.4h[1]
+
+    smlsl v22.4s, v4.4h, v1.4h[0]
+    smlal v22.4s, v5.4h, v2.4h[2]
+    smlsl v20.4s, v4.4h, v3.4h[0]
+    smlal v20.4s, v5.4h, v0.4h[2]
+    smlal v16.4s, v4.4h, v3.4h[0]
+    smlal v16.4s, v5.4h, v3.4h[2]
+    smlal v18.4s, v4.4h, v1.4h[0]
+    smlsl v18.4s, v5.4h, v1.4h[2]
+
+    mov x19,#0xff00
+    cmp x12,x19
+    bge skip_last8rows_stage2_kernel2
+
+    ld1 {v10.4h, v11.4h},[x11],#16
+    ld1 {v6.4h, v7.4h},[x11],#16
+    ld1 {v4.4h, v5.4h},[x0],#16
+    ld1 {v8.4h, v9.4h},[x0],#16
+
+    smlsl v24.4s, v6.4h, v3.4h[3] //// y1 * cos1(part of b0)
+    smlsl v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+    smlal v28.4s, v6.4h, v2.4h[3] //// y1 * sin3(part of b2)
+    smlal v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+    smlsl v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+    smlal v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+    smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+    smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+    smlal v24.4s, v8.4h, v2.4h[3]
+    smlal v26.4s, v8.4h, v3.4h[3]
+    smlsl v28.4s, v8.4h, v2.4h[1]
+    smlal v30.4s, v8.4h, v0.4h[3]
+
+    smlal v24.4s, v9.4h, v1.4h[3]
+    smlsl v26.4s, v9.4h, v1.4h[1]
+    smlal v28.4s, v9.4h, v0.4h[3]
+    smlsl v30.4s, v9.4h, v0.4h[1]
+
+    smlal v22.4s, v10.4h, v0.4h[0]
+    smlsl v22.4s, v11.4h, v1.4h[2]
+    smlsl v22.4s, v4.4h, v3.4h[0]
+    smlal v22.4s, v5.4h, v0.4h[2]
+
+    smlsl v20.4s, v10.4h, v0.4h[0]
+    smlsl v20.4s, v11.4h, v3.4h[2]
+    smlal v20.4s, v4.4h, v1.4h[0]
+    smlsl v20.4s, v5.4h, v1.4h[2]
+
+    smlsl v16.4s, v10.4h, v0.4h[0]
+    smlal v16.4s, v11.4h, v0.4h[2]
+    smlsl v16.4s, v4.4h, v1.4h[0]
+    smlal v16.4s, v5.4h, v2.4h[2]
+
+    smlal v18.4s, v10.4h, v0.4h[0]
+    smlsl v18.4s, v11.4h, v2.4h[2]
+    smlal v18.4s, v4.4h, v3.4h[0]
+    smlsl v18.4s, v5.4h, v3.4h[2]
+
+skip_last8rows_stage2_kernel2:
+
+    add v4.4s, v22.4s , v24.4s
+    sub v22.4s, v22.4s , v24.4s
+
+    add v6.4s, v20.4s , v26.4s
+    sub v24.4s, v20.4s , v26.4s
+
+    add v10.4s, v16.4s , v28.4s
+    sub v26.4s, v16.4s , v28.4s
+
+    add v16.4s, v18.4s , v30.4s
+    sub v28.4s, v18.4s , v30.4s
+
+    sqrshrn v18.4h, v4.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v31.4h, v22.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v30.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v20.4h, v6.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v23.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v21.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 12(shift_stage2_idct)
+    sqrshrn v22.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 12(shift_stage2_idct)
+
+    ld1 {v4.4h, v5.4h},[x1],#16
+    ld1 {v8.4h, v9.4h},[x1],#16
+
+    // registers used: {q2,q4,q6,q7}, {q9,q15,q10,q11}
+
+//d4=x0
+//d12=x1
+//d5=x2
+//d13=x3
+
+//d18=x4
+//d20=x5
+//d19=x6
+//d21=x7
+
+//d22=x8
+//d30=x9
+//d23=x10
+//d31=x11
+
+//d14=x12
+//d8=x13
+//d15=x14
+//d9=x15
+
+    umov x15,v26.d[0]
+    umov x16,v27.d[0]
+    umov x19,v28.d[0]
+    umov x20,v29.d[0]
+
+    trn1 v26.4h, v4.4h, v12.4h
+    trn2 v27.4h, v4.4h, v12.4h
+    trn1 v28.4h, v5.4h, v13.4h
+    trn2 v29.4h, v5.4h, v13.4h
+
+    trn1 v4.2s, v26.2s, v28.2s
+    trn2 v5.2s, v26.2s, v28.2s
+    trn1 v12.2s, v27.2s, v29.2s
+    trn2 v13.2s, v27.2s, v29.2s
+
+    trn1 v26.4h, v18.4h, v20.4h
+    trn2 v27.4h, v18.4h, v20.4h
+    trn1 v28.4h, v19.4h, v21.4h
+    trn2 v29.4h, v19.4h, v21.4h
+
+    trn1 v18.2s, v26.2s, v28.2s
+    trn2 v19.2s, v26.2s, v28.2s
+    trn1 v20.2s, v27.2s, v29.2s
+    trn2 v21.2s, v27.2s, v29.2s
+
+    trn1 v26.4h, v22.4h, v30.4h
+    trn2 v27.4h, v22.4h, v30.4h
+    trn1 v28.4h, v23.4h, v31.4h
+    trn2 v29.4h, v23.4h, v31.4h
+
+    trn1 v22.2s, v26.2s, v28.2s
+    trn2 v23.2s, v26.2s, v28.2s
+    trn1 v30.2s, v27.2s, v29.2s
+    trn2 v31.2s, v27.2s, v29.2s
+
+    trn1 v26.4h, v14.4h, v8.4h
+    trn2 v27.4h, v14.4h, v8.4h
+    trn1 v28.4h, v15.4h, v9.4h
+    trn2 v29.4h, v15.4h, v9.4h
+
+    trn1 v14.2s, v26.2s, v28.2s
+    trn2 v15.2s, v26.2s, v28.2s
+    trn1 v8.2s, v27.2s, v29.2s
+    trn2 v9.2s, v27.2s, v29.2s
+
+    mov v26.d[0],x15
+    mov v27.d[0],x16
+    mov v28.d[0],x19
+    mov v29.d[0],x20
+
+// d4 =x0 1- 4 values
+// d5 =x2 1- 4 values
+// d12=x1 1- 4 values
+// d13=x3 1- 4 values
+
+// d18 =x0 5- 8 values
+// d19 =x2 5- 8 values
+// d20=x1 5- 8 values
+// d21=x3 5- 8 values
+
+// d22 =x0 9- 12 values
+// d23 =x2 9- 12 values
+// d30=x1 9- 12 values
+// d31=x3 9- 12 values
+
+// d14 =x0 13-16 values
+// d15 =x2 13- 16 values
+// d8=x1 13- 16 values
+// d9=x3 13- 16 values
+
+    // swapping v5 and v18
+    mov v5.d[1],v5.d[0]
+    mov v5.d[0],v18.d[0]
+    mov v18.d[0],v5.d[1]
+    // swapping v23 and v14
+    mov v23.d[1],v23.d[0]
+    mov v23.d[0],v14.d[0]
+    mov v14.d[0],v23.d[1]
+    // swapping v13 and v20
+    mov v13.d[1],v13.d[0]
+    mov v13.d[0],v20.d[0]
+    mov v20.d[0],v13.d[1]
+    // swapping v31 and v8
+    mov v31.d[1],v31.d[0]
+    mov v31.d[0],v8.d[0]
+    mov v8.d[0],v31.d[1]
+
+// q2: x0 1-8 values
+// q11: x0 9-16 values
+// q9 : x2 1-8 values
+// q7 : x2 9-16 values
+// q6 : x1 1- 8 values
+// q10: x3 1-8 values
+// q15: x1 9-16 values
+// q4: x3 9-16 values
+
+// registers free: q8,q14,q12,q13
+
+    ld1 {v16.8b, v17.8b},[x2],x8
+    ld1 {v28.8b, v29.8b},[x2],x5
+    ld1 {v24.8b, v25.8b},[x4],x8
+    ld1 {v26.8b, v27.8b},[x4],x5
+
+    mov v4.d[1] ,v5.d[0]
+    mov v22.d[1] ,v23.d[0]
+    mov v12.d[1] ,v13.d[0]
+    mov v30.d[1] ,v31.d[0]
+    mov v18.d[1] ,v19.d[0]
+    mov v14.d[1] ,v15.d[0]
+    mov v20.d[1] ,v21.d[0]
+    mov v8.d[1] ,v9.d[0]
+
+    uaddw v4.8h, v4.8h , v16.8b
+    uaddw v22.8h, v22.8h , v17.8b
+    uaddw v12.8h, v12.8h , v28.8b
+    uaddw v30.8h, v30.8h , v29.8b
+    uaddw v18.8h, v18.8h , v24.8b
+    uaddw v14.8h, v14.8h , v25.8b
+    uaddw v20.8h, v20.8h , v26.8b
+    uaddw v8.8h, v8.8h , v27.8b
+
+    sqxtun v16.8b, v4.8h
+    sqxtun v17.8b, v22.8h
+    sqxtun v28.8b, v12.8h
+    sqxtun v29.8b, v30.8h
+    sqxtun v24.8b, v18.8h
+    sqxtun v25.8b, v14.8h
+    sqxtun v26.8b, v20.8h
+    sqxtun v27.8b, v8.8h
+
+    st1 {v16.8b, v17.8b},[x3],x7
+    st1 {v28.8b, v29.8b},[x3],x7
+    st1 {v24.8b, v25.8b},[x3],x7
+    st1 {v26.8b, v27.8b},[x3],x7
+
+    subs x14,x14,#1
+
+    bne second_stage
+
+// sub sp,sp,#40
+    // ldmfd sp!,{x4-x12,pc}
+    ldp x19, x20,[sp],#16
+    pop_v_regs
+    ret
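
For reference, the commented prototype in the file header maps onto a C caller as in the minimal sketch below. It is illustrative only and not part of the commit: it assumes the libhevc typedefs (word16 = int16_t, uword8 = uint8_t, word32 = int32_t) and the zero_cols/zero_rows convention suggested by the 0xfff0/0xff00 comparisons in the code, i.e. bit n of the mask set when column/row n of pi2_src is entirely zero.

    /* Hypothetical caller sketch for the av8 kernel above; build for aarch64
     * and link against the assembled ihevc_itrans_recon_16x16.s. */
    #include <stdint.h>
    #include <string.h>

    typedef int16_t word16;
    typedef uint8_t uword8;
    typedef int32_t word32;

    /* Symbol defined in ihevc_itrans_recon_16x16.s; signature taken from the
     * commented-out C declaration in the file header (x12/x11 are the
     * zero_cols/zero_rows masks per the #define comments). */
    void ihevc_itrans_recon_16x16_av8(word16 *pi2_src, word16 *pi2_tmp,
                                      uword8 *pu1_pred, uword8 *pu1_dst,
                                      word32 src_strd, word32 pred_strd,
                                      word32 dst_strd, word32 zero_cols,
                                      word32 zero_rows);

    int main(void)
    {
        word16 src[16 * 16] = {0};  /* dequantized coefficients */
        word16 tmp[16 * 16];        /* stage-1 scratch buffer (pi2_tmp) */
        uword8 pred[16 * 16];       /* prediction block */
        uword8 dst[16 * 16];        /* reconstructed output */

        memset(pred, 128, sizeof(pred));
        src[0] = 64;                /* DC-only block */

        /* Only column/row 0 is non-zero, so bits 1..15 of each mask are set;
         * 0xfffe >= 0xfff0, which takes the zero_12cols fast path above. */
        ihevc_itrans_recon_16x16_av8(src, tmp, pred, dst,
                                     16 /* src_strd, in coefficients */,
                                     16 /* pred_strd */, 16 /* dst_strd */,
                                     0xfffe, 0xfffe);
        return 0;
    }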