summaryrefslogtreecommitdiffstats
path: root/common/armv8/ih264_ihadamard_scaling_av8.s
diff options
context:
space:
mode:
Diffstat (limited to 'common/armv8/ih264_ihadamard_scaling_av8.s')
-rwxr-xr-xcommon/armv8/ih264_ihadamard_scaling_av8.s250
1 files changed, 250 insertions, 0 deletions
diff --git a/common/armv8/ih264_ihadamard_scaling_av8.s b/common/armv8/ih264_ihadamard_scaling_av8.s
new file mode 100755
index 0000000..712c9ae
--- /dev/null
+++ b/common/armv8/ih264_ihadamard_scaling_av8.s
@@ -0,0 +1,250 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+// *******************************************************************************
+// * @file
+// * ih264_ihadamard_scaling_av8.s
+// *
+// * @brief
+// * Contains function definitions for inverse hadamard transform on 4x4 DC outputs
+// * of 16x16 intra-prediction
+// *
+// * @author
+// * Mohit
+// *
+// * @par List of Functions:
+// * - ih264_ihadamard_scaling_4x4_av8()
+// *
+// * @remarks
+// * None
+// *
+.include "ih264_neon_macros.s"
+
+// *******************************************************************************
+// */
+// * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
+// * of a 16x16 intra prediction macroblock, and then performs scaling.
+// * prediction buffer
+// *
+// * @par Description:
+// * The DC coefficients pass through a 2-stage inverse hadamard transform.
+// * The inverse transformed content is then scaled based on the Qp value.
+// *
+// * @param[in] pi2_src
+// * input 4x4 block of DC coefficients
+// *
+// * @param[out] pi2_out
+// * output 4x4 block
+// *
+// * @param[in] pu2_iscal_mat
+// * pointer to scaling list
+// *
+// * @param[in] pu2_weigh_mat
+// * pointer to weight matrix
+// *
+// * @param[in] u4_qp_div_6
+// * Floor (qp/6)
+// *
+// * @param[in] pi4_tmp
+// * temporary buffer of size 1*16
+// *
+// * @returns none
+// *
+// * @remarks none
+// *
+// *******************************************************************************
+// */
+// *
+// *******************************************************************************
+// */
+// void ih264_ihadamard_scaling_4x4(word16* pi2_src,
+// word16* pi2_out,
+// const uword16 *pu2_iscal_mat,
+// const uword16 *pu2_weigh_mat,
+// uword32 u4_qp_div_6,
+// word32* pi4_tmp)
+//**************variables vs registers*****************************************
+//x0 => *pi2_src
+//x1 => *pi2_out
+//x2 => *pu2_iscal_mat
+//x3 => *pu2_weigh_mat
+//x4=> u4_qp_div_6
+
+.text
+.p2align 2
+
+ .global ih264_ihadamard_scaling_4x4_av8
+ih264_ihadamard_scaling_4x4_av8:
+
+//two 4-point butterfly stages (with a 4x4 transpose in between) are followed by scaling:
+//each result is multiplied by pu2_weigh_mat[0]*pu2_iscal_mat[0], shifted left by u4_qp_div_6 (sshl),
+//and then narrowed to 16 bit with one rounding, saturating right shift by 6 (sqrshrn #6); no separate rnd_factor is added
+ push_v_regs //macro not defined in this file (presumably from ih264_neon_macros.s); v14/v15 are written below - TODO confirm the macro saves them
+
+//=======================inverse hadamard transform================================
+
+ ld4 {v0.4h-v3.4h}, [x0] //de-interleaving load of 16 coeffs: v0..v3 = x4,x5,x6,x7, lane n taken from the n-th group of four
+
+ dup v14.4s, w4 // broadcast u4_qp_div_6, used as the sshl shift count
+ ld1 {v15.h}[0], [x3] // pu2_weigh_mat[0] into lane 0 only
+ ld1 {v16.h}[0], [x2] // pu2_iscal_mat[0] into lane 0 only
+
+ saddl v4.4s, v0.4h, v3.4h //x0 = x4 + x7
+ saddl v5.4s, v1.4h, v2.4h //x1 = x5 + x6
+ ssubl v6.4s, v1.4h, v2.4h //x2 = x5 - x6
+ ssubl v7.4s, v0.4h, v3.4h //x3 = x4 - x7
+
+ add v0.4s, v4.4s, v5.4s //pi4_tmp_ptr[0] = x0 + x1
+ add v1.4s, v7.4s, v6.4s //pi4_tmp_ptr[1] = x3 + x2
+ sub v2.4s, v4.4s, v5.4s //pi4_tmp_ptr[2] = x0 - x1
+ sub v3.4s, v7.4s, v6.4s //pi4_tmp_ptr[3] = x3 - x2
+
+ umull v15.4s, v15.4h, v16.4h //widening multiply; only lane 0 was loaded, so only lane 0 is meaningful
+ dup v15.4s, v15.s[0] //broadcast pu2_weigh_mat[0]*pu2_iscal_mat[0] to all lanes
+
+ //transpose the 4x4 block of 32-bit values held in v0-v3
+ trn1 v4.4s, v0.4s, v1.4s //v4 = v0[0] v1[0] v0[2] v1[2]
+ trn2 v5.4s, v0.4s, v1.4s //v5 = v0[1] v1[1] v0[3] v1[3]
+ trn1 v6.4s, v2.4s, v3.4s //v6 = v2[0] v3[0] v2[2] v3[2]
+ trn2 v7.4s, v2.4s, v3.4s //v7 = v2[1] v3[1] v2[3] v3[3]
+
+ trn1 v0.2d, v4.2d, v6.2d //v0 = lane 0 of the pre-transpose v0..v3
+ trn2 v2.2d, v4.2d, v6.2d //v2 = lane 2 of the pre-transpose v0..v3
+ trn1 v1.2d, v5.2d, v7.2d //v1 = lane 1 of the pre-transpose v0..v3
+ trn2 v3.2d, v5.2d, v7.2d //v3 = lane 3 of the pre-transpose v0..v3
+ //end transpose
+
+ add v4.4s, v0.4s, v3.4s //x0 = x4+x7
+ add v5.4s, v1.4s, v2.4s //x1 = x5+x6
+ sub v6.4s, v1.4s, v2.4s //x2 = x5-x6
+ sub v7.4s, v0.4s, v3.4s //x3 = x4-x7
+
+ add v0.4s, v4.4s, v5.4s //pi4_tmp_ptr[0] = x0 + x1
+ add v1.4s, v7.4s, v6.4s //pi4_tmp_ptr[1] = x3 + x2
+ sub v2.4s, v4.4s, v5.4s //pi4_tmp_ptr[2] = x0 - x1
+ sub v3.4s, v7.4s, v6.4s //pi4_tmp_ptr[3] = x3 - x2
+
+ mul v0.4s, v0.4s, v15.4s // v0 = p[i] = x[i] * (weigh[0]*iscal[0]) where i = 0..3
+ mul v1.4s, v1.4s, v15.4s // v1 = p[i] = x[i] * (weigh[0]*iscal[0]) where i = 4..7
+ mul v2.4s, v2.4s, v15.4s // v2 = p[i] = x[i] * (weigh[0]*iscal[0]) where i = 8..11
+ mul v3.4s, v3.4s, v15.4s // v3 = p[i] = x[i] * (weigh[0]*iscal[0]) where i = 12..15
+
+ sshl v0.4s, v0.4s, v14.4s // v0 = q[i] = (p[i] << (qp/6)) where i = 0..3
+ sshl v1.4s, v1.4s, v14.4s // v1 = q[i] = (p[i] << (qp/6)) where i = 4..7
+ sshl v2.4s, v2.4s, v14.4s // v2 = q[i] = (p[i] << (qp/6)) where i = 8..11
+ sshl v3.4s, v3.4s, v14.4s // v3 = q[i] = (p[i] << (qp/6)) where i = 12..15
+
+ sqrshrn v0.4h, v0.4s, #6 // v0 = c[i] = sat16((q[i] + 32) >> 6) where i = 0..3
+ sqrshrn v1.4h, v1.4s, #6 // v1 = c[i] = sat16((q[i] + 32) >> 6) where i = 4..7
+ sqrshrn v2.4h, v2.4s, #6 // v2 = c[i] = sat16((q[i] + 32) >> 6) where i = 8..11
+ sqrshrn v3.4h, v3.4s, #6 // v3 = c[i] = sat16((q[i] + 32) >> 6) where i = 12..15
+
+ st1 {v0.4h-v3.4h}, [x1] //store the 16 results contiguously at pi2_out
+
+ pop_v_regs //counterpart of push_v_regs above
+ ret
+
+
+// *******************************************************************************
+// */
+// * @brief This function performs a 2x2 inverse hadamard transform for chroma block
+// *
+// * @par Description:
+// * The DC coefficients pass through a 2-stage inverse hadamard transform.
+// * The inverse transformed content is then scaled based on the Qp value.
+// * The DC blocks of both U and V are processed.
+// *
+// * @param[in] pi2_src
+// * input 1x8 block of coeffs. First 4 are from U and next 4 from V
+// *
+// * @param[out] pi2_out
+// * output 1x8 block
+// *
+// * @param[in] pu2_iscal_mat
+// * pointer to scaling list
+// *
+// * @param[in] pu2_weigh_mat
+// * pointer to weight matrix
+// *
+// * @param[in] u4_qp_div_6
+// * Floor (qp/6)
+// *
+// * @returns none
+// *
+// * @remarks none
+// *
+// *******************************************************************************
+// */
+// *
+// *******************************************************************************
+// */
+// void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
+// WORD16* pi2_out,
+// const UWORD16 *pu2_iscal_mat,
+// const UWORD16 *pu2_weigh_mat,
+// UWORD32 u4_qp_div_6,
+
+ .global ih264_ihadamard_scaling_2x2_uv_av8
+ih264_ihadamard_scaling_2x2_uv_av8:
+
+//Registers used
+// x0 : *pi2_src
+// x1 : *pi2_out
+// x2 : *pu2_iscal_mat
+// x3 : *pu2_weigh_mat
+// x4 : u4_qp_div_6
+ push_v_regs //macro not defined in this file; NOTE(review): only v0-v4, v26-v28 and v30 are written here - confirm the save is needed
+ ld1 {v26.h}[0], [x2] //pu2_iscal_mat[0] into lane 0 only
+ ld1 {v27.h}[0], [x3] //pu2_weigh_mat[0] into lane 0 only
+
+ sub w4, w4, #5 //qp/6 - 5: folds the final >> 5 into the sshl count (a negative count shifts right)
+ dup v28.4s, w4 //broadcast (qp/6 - 5) as the per-lane shift count
+
+ ld2 {v0.4h, v1.4h}, [x0] //load 8 dc coeffs, de-interleaved
+ //i2_x4,i2_x6,i2_y4,i2_y6 -> v0
+ //i2_x5,i2_x7,i2_y5,i2_y7 -> v1
+
+ saddl v2.4s, v0.4h, v1.4h //i4_x0 = i4_x4 + i4_x5;...x2
+ ssubl v4.4s, v0.4h, v1.4h //i4_x1 = i4_x4 - i4_x5;...x3
+
+ umull v30.4s, v26.4h, v27.4h //pu2_iscal_mat[0]*pu2_weigh_mat[0] (only lane 0 is meaningful)
+ dup v30.4s, v30.s[0] //broadcast the product to all lanes
+
+ trn1 v0.4s, v2.4s, v4.4s //i4_x0 i4_x1 i4_y0 i4_y1 -> v0
+ trn2 v1.4s, v2.4s, v4.4s //i4_x2 i4_x3 i4_y2 i4_y3 -> v1
+
+ add v2.4s, v0.4s, v1.4s //i4_x4 = i4_x0+i4_x2;.. i4_x5
+ sub v3.4s, v0.4s, v1.4s //i4_x6 = i4_x0-i4_x2;.. i4_x7
+
+ mul v2.4s, v2.4s, v30.4s //scale by iscal[0]*weigh[0]
+ mul v3.4s, v3.4s, v30.4s //scale by iscal[0]*weigh[0]
+
+ sshl v2.4s, v2.4s, v28.4s //shift by (qp/6 - 5)
+ sshl v3.4s, v3.4s, v28.4s //shift by (qp/6 - 5)
+
+ xtn v0.4h, v2.4s //i4_x4 i4_x5 i4_y4 i4_y5
+ xtn v1.4h, v3.4s //i4_x6 i4_x7 i4_y6 i4_y7
+//only the low 64 bits of v0/v1 hold results after xtn, so store two 32-bit elements per register (16 bytes in total)
+ st2 {v0.2s, v1.2s}, [x1] //pi2_out = x4 x5 x6 x7 y4 y5 y6 y7; a .4s store would also write 16 bytes of zeros past the 1x8 output
+ pop_v_regs //counterpart of push_v_regs above
+ ret
+
+
+