1 files changed, 250 insertions, 0 deletions
diff --git a/common/armv8/ih264_ihadamard_scaling_av8.s b/common/armv8/ih264_ihadamard_scaling_av8.s
new file mode 100755
index 0000000..712c9ae
--- /dev/null
+++ b/common/armv8/ih264_ihadamard_scaling_av8.s
@@ -0,0 +1,250 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+// *******************************************************************************
+// * @file
+// *  ih264_ihadamard_scaling_av8.s
+// *
+// * @brief
+// *  Contains function definitions for inverse hadamard transform on 4x4 DC outputs
+// *  of 16x16 intra-prediction
+// *
+// * @author
+// *  Mohit
+// *
+// * @par List of Functions:
+// *  - ih264_ihadamard_scaling_4x4_av8()
+// *
+// * @remarks
+// *  None
+// *
+.include "ih264_neon_macros.s"
+
+// *******************************************************************************
+// */
+// * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
+// * of a 16x16 intra prediction macroblock, and then performs scaling.
+// * prediction buffer
+// *
+// * @par Description:
+// *  The DC coefficients pass through a 2-stage inverse hadamard transform.
+// *  This inverse transformed content is scaled to based on Qp value.
+// *
+// * @param[in] pi2_src
+// *  input 4x4 block of DC coefficients
+// *
+// * @param[out] pi2_out
+// *  output 4x4 block
+// *
+// * @param[in] pu2_iscal_mat
+// *  pointer to scaling list
+// *
+// * @param[in] pu2_weigh_mat
+// *  pointer to weight matrix
+// *
+// * @param[in] u4_qp_div_6
+// *  Floor (qp/6)
+// *
+// * @param[in] pi4_tmp
+// * temporary buffer of size 1*16
+// *
+// * @returns none
+// *
+// * @remarks none
+// *
+// *******************************************************************************
+// */
+// *
+// *******************************************************************************
+// */
+// void ih264_ihadamard_scaling_4x4(word16* pi2_src,
+//        word16* pi2_out,
+//        const uword16 *pu2_iscal_mat,
+//        const uword16 *pu2_weigh_mat,
+//        uword32 u4_qp_div_6,
+//        word32* pi4_tmp)
+//**************variables vs registers*****************************************
+//x0 => *pi2_src
+//x1 => *pi2_out
+//x2 => *pu2_iscal_mat
+//x3 => *pu2_weigh_mat
+//x4=>   u4_qp_div_6
+
+.text
+.p2align 2
+
+    .global ih264_ihadamard_scaling_4x4_av8
+ih264_ihadamard_scaling_4x4_av8:
+
+//only one shift is done in horizontal inverse because,
+//if u4_qp_div_6 is lesser than 4 then shift value will be neagative and do negative left shift, in this case rnd_factor has value
+//if u4_qp_div_6 is greater than 4 then shift value will be positive and do left shift, here rnd_factor is 0
+    push_v_regs
+
+//=======================inverse hadamard transform================================
+
+    ld4       {v0.4h-v3.4h}, [x0]       //load x4,x5,x6,x7
+
+    dup       v14.4s, w4                // populate the u4_qp_div_6
+    ld1       {v15.h}[0], [x3]          // pu2_weigh_mat
+    ld1       {v16.h}[0], [x2]          //pu2_iscal_mat
+
+    saddl     v4.4s, v0.4h, v3.4h       //x0 = x4 + x7
+    saddl     v5.4s, v1.4h, v2.4h       //x1 = x5 + x6
+    ssubl     v6.4s, v1.4h, v2.4h       //x2 = x5 - x6
+    ssubl     v7.4s, v0.4h, v3.4h       //x3 = x4 - x7
+
+    add       v0.4s, v4.4s, v5.4s       //pi4_tmp_ptr[0] = x0 + x1
+    add       v1.4s, v7.4s, v6.4s       //pi4_tmp_ptr[1] = x3 + x2
+    sub       v2.4s, v4.4s, v5.4s       //pi4_tmp_ptr[2] = x0 - x1
+    sub       v3.4s, v7.4s, v6.4s       //pi4_tmp_ptr[3] = x3 - x2
+
+    umull     v15.4s, v15.4h, v16.4h
+    dup       v15.4s, v15.s[0]          //pu2_weigh_mat[0]*pu2_iscal_mat[0]
+
+    //transpose
+    trn1      v4.4s, v0.4s, v1.4s
+    trn2      v5.4s, v0.4s, v1.4s
+    trn1      v6.4s, v2.4s, v3.4s
+    trn2      v7.4s, v2.4s, v3.4s
+
+    trn1      v0.2d, v4.2d, v6.2d
+    trn2      v2.2d, v4.2d, v6.2d
+    trn1      v1.2d, v5.2d, v7.2d
+    trn2      v3.2d, v5.2d, v7.2d
+    //end transpose
+
+    add       v4.4s, v0.4s, v3.4s       //x0 = x4+x7
+    add       v5.4s, v1.4s, v2.4s       //x1 = x5+x6
+    sub       v6.4s, v1.4s, v2.4s       //x2 = x5-x6
+    sub       v7.4s, v0.4s, v3.4s       //x3 = x4-x7
+
+    add       v0.4s, v4.4s, v5.4s       //pi4_tmp_ptr[0] = x0 + x1
+    add       v1.4s, v7.4s, v6.4s       //pi4_tmp_ptr[1] = x3 + x2
+    sub       v2.4s, v4.4s, v5.4s       //pi4_tmp_ptr[2] = x0 - x1
+    sub       v3.4s, v7.4s, v6.4s       //pi4_tmp_ptr[3] = x3 - x2
+
+    mul       v0.4s, v0.4s, v15.4s      // q0  = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
+    mul       v1.4s, v1.4s, v15.4s      // q1  = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
+    mul       v2.4s, v2.4s, v15.4s      // q2  = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
+    mul       v3.4s, v3.4s, v15.4s      // q3  = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
+
+    sshl      v0.4s, v0.4s, v14.4s      // q0  = q[i] = (p[i] << (qp/6)) where i = 0..3
+    sshl      v1.4s, v1.4s, v14.4s      // q1  = q[i] = (p[i] << (qp/6)) where i = 4..7
+    sshl      v2.4s, v2.4s, v14.4s      // q2  = q[i] = (p[i] << (qp/6)) where i = 8..11
+    sshl      v3.4s, v3.4s, v14.4s      // q3  = q[i] = (p[i] << (qp/6)) where i = 12..15
+
+    sqrshrn   v0.4h, v0.4s, #6          // d0  = c[i] = ((q[i] + 32) >> 4) where i = 0..3
+    sqrshrn   v1.4h, v1.4s, #6          // d1  = c[i] = ((q[i] + 32) >> 4) where i = 4..7
+    sqrshrn   v2.4h, v2.4s, #6          // d2  = c[i] = ((q[i] + 32) >> 4) where i = 8..11
+    sqrshrn   v3.4h, v3.4s, #6          // d3  = c[i] = ((q[i] + 32) >> 4) where i = 12..15
+
+    st1       {v0.4h-v3.4h}, [x1]       //store the result
+
+    pop_v_regs
+    ret
+
+
+// *******************************************************************************
+// */
+// * @brief This function performs a 2x2 inverse hadamard transform for chroma block
+// *
+// * @par Description:
+// *  The DC coefficients pass through a 2-stage inverse hadamard transform.
+// *  This inverse transformed content is scaled to based on Qp value.
+// *  Both DC blocks of U and v blocks are processesd
+// *
+// * @param[in] pi2_src
+// *  input 1x8 block of ceffs. First 4 are from U and next from V
+// *
+// * @param[out] pi2_out
+// *  output 1x8 block
+// *
+// * @param[in] pu2_iscal_mat
+// *  pointer to scaling list
+// *
+// * @param[in] pu2_weigh_mat
+// *  pointer to weight matrix
+// *
+// * @param[in] u4_qp_div_6
+// *  Floor (qp/6)
+// *
+// * @returns none
+// *
+// * @remarks none
+// *
+// *******************************************************************************
+// */
+// *
+// *******************************************************************************
+// */
+// void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
+//                                  WORD16* pi2_out,
+//                                  const UWORD16 *pu2_iscal_mat,
+//                                  const UWORD16 *pu2_weigh_mat,
+//                                  UWORD32 u4_qp_div_6,
+
+    .global ih264_ihadamard_scaling_2x2_uv_av8
+ih264_ihadamard_scaling_2x2_uv_av8:
+
+//Registers used
+//   x0 : *pi2_src
+//   x1 : *pi2_out
+//   x2 : *pu2_iscal_mat
+//   x3 : *pu2_weigh_mat
+//   x4 : u4_qp_div_6
+    push_v_regs
+    ld1       {v26.h}[0], [x2]
+    ld1       {v27.h}[0], [x3]
+
+    sub       w4, w4, #5                //qp/6 - 4
+    dup       v28.4s, w4                //load qp/6
+
+    ld2       {v0.4h, v1.4h}, [x0]      //load 8 dc coeffs
+                                        //i2_x4,i2_x6,i2_y4,i1_y6 -> d0
+                                        //i2_x5,i2_x7,i2_y5,i1_y6 -> d1
+
+    saddl     v2.4s, v0.4h, v1.4h       //i4_x0 = i4_x4 + i4_x5;...x2
+    ssubl     v4.4s, v0.4h, v1.4h       //i4_x1 = i4_x4 - i4_x5;...x3
+
+    umull     v30.4s, v26.4h, v27.4h    //pu2_iscal_mat[0]*pu2_weigh_mat[0]
+    dup       v30.4s, v30.s[0]
+
+    trn1      v0.4s, v2.4s, v4.4s
+    trn2      v1.4s, v2.4s, v4.4s       //i4_x0 i4_x1 -> q1
+
+    add       v2.4s, v0.4s, v1.4s       //i4_x4 = i4_x0+i4_x2;.. i4_x5
+    sub       v3.4s, v0.4s, v1.4s       //i4_x6 = i4_x0-i4_x2;.. i4_x7
+
+    mul       v2.4s, v2.4s, v30.4s
+    mul       v3.4s, v3.4s, v30.4s
+
+    sshl      v2.4s, v2.4s, v28.4s
+    sshl      v3.4s, v3.4s, v28.4s
+
+    xtn       v0.4h, v2.4s              //i4_x4 i4_x5 i4_y4 i4_y5
+    xtn       v1.4h, v3.4s              //i4_x6 i4_x7 i4_y6 i4_y7
+
+    st2       {v0.4s-v1.4s}, [x1]
+    pop_v_regs
+    ret
+
+
+