common/arm/ih264_ihadamard_scaling_a9.s


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250

@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@ *******************************************************************************
@ * @file
@ *  ih264_ihadamard_scaling_a9.s
@ *
@ * @brief
@ *  Contains function definitions for inverse hadamard transform on 4x4 DC outputs
@ *  of 16x16 intra-prediction
@ *
@ * @author
@ *  Mohit
@ *
@ * @par List of Functions:
@ *  - ih264_ihadamard_scaling_4x4_a9()
@ *  - ih264_ihadamard_scaling_2x2_uv_a9()
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************
@ *
@ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
@ * of a 16x16 intra prediction macroblock, and then performs scaling.
@ * prediction buffer
@ *
@ * @par Description:
@ *  The DC coefficients pass through a 2-stage inverse hadamard transform.
@ *  This inverse transformed content is scaled to based on Qp value.
@ *
@ * @param[in] pi2_src
@ *  input 4x4 block of DC coefficients
@ *
@ * @param[out] pi2_out
@ *  output 4x4 block
@ *
@ * @param[in] pu2_iscal_mat
@ *  pointer to scaling list
@ *
@ * @param[in] pu2_weigh_mat
@ *  pointer to weight matrix
@ *
@ * @param[in] u4_qp_div_6
@ *  Floor (qp/6)
@ *
@ * @param[in] pi4_tmp
@ * temporary buffer of size 1*16
@ *
@ * @returns none
@ *
@ * @remarks none
@ *
@ *******************************************************************************
@ *
@ *
@ *******************************************************************************
@ *
@ void ih264_ihadamard_scaling_4x4(WORD16* pi2_src,
@       WORD16* pi2_out,
@       const UWORD16 *pu2_iscal_mat,
@       const UWORD16 *pu2_weigh_mat,
@       UWORD32 u4_qp_div_6,
@       WORD32* pi4_tmp)
@**************Variables Vs Registers*****************************************
@r0 => *pi2_src
@r1 => *pi2_out
@r2 =>  *pu2_iscal_mat
@r3 =>  *pu2_weigh_mat
@r4 =>  u4_qp_div_6

.text
.p2align 2

    .global ih264_ihadamard_scaling_4x4_a9

ih264_ihadamard_scaling_4x4_a9:

@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4
@If the macro value changes need to change the instruction according to it.
@Only one shift is done in horizontal inverse because,
@if u4_qp_div_6 is lesser than 4 then shift value will be neagative and do negative left shift, in this case rnd_factor has value
@if u4_qp_div_6 is greater than 4 then shift value will be positive and do left shift, here rnd_factor is 0

    stmfd         sp!, {r4-r12, r14}    @ stack stores the values of the arguments
    ldr           r4, [sp, #40]         @ Loads u4_qp_div_6
    vdup.s32      q10, r4               @ Populate the u4_qp_div_6 in Q10
    ldrh          r6, [r3]              @ load pu2_weight_mat[0] , H for unsigned halfword load
    ldrh          r7, [r2]              @ load pu2_iscal_mat[0] , H for unsigned halfword load
    mul           r6, r6, r7            @ pu2_iscal_mat[0]*pu2_weigh_mat[0]
    vdup.s32      q9, r6                @ Populate pu2_iscal_mat[0]*pu2_weigh_mat[0] 32-bit in Q9
    vpush         {d8-d15}
@=======================INVERSE HADAMARD TRANSFORM================================

    vld4.s16      {d0, d1, d2, d3}, [r0] @load x4,x5,x6,x7
    vaddl.s16     q12, d0, d3           @x0 = x4 + x7
    vaddl.s16     q13, d1, d2           @x1 = x5 + x6
    vsubl.s16     q14, d1, d2           @x2 = x5 - x6
    vsubl.s16     q15, d0, d3           @x3 = x4 - x7

    vadd.s32      q2, q12, q13          @pi4_tmp_ptr[0] = x0 + x1
    vadd.s32      q3, q15, q14          @pi4_tmp_ptr[1] = x3 + x2
    vsub.s32      q4, q12, q13          @pi4_tmp_ptr[2] = x0 - x1
    vsub.s32      q5, q15, q14          @pi4_tmp_ptr[3] = x3 - x2

    vtrn.32       q2, q3                @Transpose the register for vertical transform
    vtrn.32       q4, q5

    vswp          d5, d8                @Q2 = x4, Q4 = x6
    vswp          d7, d10               @Q3 = x5, Q5 = x7


    vadd.s32      q12, q2, q5           @x0 = x4+x7
    vadd.s32      q13, q3, q4           @x1 = x5+x6
    vsub.s32      q14, q3, q4           @x2 = x5-x6
    vsub.s32      q15, q2, q5           @x3 = x4-x7

    vadd.s32      q0, q12, q13          @pi4_tmp_ptr[0] = x0 + x1
    vadd.s32      q1, q15, q14          @pi4_tmp_ptr[1] = x3 + x2
    vsub.s32      q2, q12, q13          @pi4_tmp_ptr[2] = x0 - x1
    vsub.s32      q3, q15, q14          @pi4_tmp_ptr[3] = x3 - x2


    vmul.s32      q0, q0, q9            @ Q0  = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
    vmul.s32      q1, q1, q9            @ Q1  = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
    vmul.s32      q2, q2, q9            @ Q2  = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
    vmul.s32      q3, q3, q9            @ Q3  = p[i] = (x[i] * trns_coeff[i]) where i = 12..15

    vshl.s32      q0, q0, q10           @ Q0  = q[i] = (p[i] << (qP/6)) where i = 0..3
    vshl.s32      q1, q1, q10           @ Q1  = q[i] = (p[i] << (qP/6)) where i = 4..7
    vshl.s32      q2, q2, q10           @ Q2  = q[i] = (p[i] << (qP/6)) where i = 8..11
    vshl.s32      q3, q3, q10           @ Q3  = q[i] = (p[i] << (qP/6)) where i = 12..15

    vqrshrn.s32   d0, q0, #0x6          @ D0  = c[i] = ((q[i] + 32) >> 4) where i = 0..3
    vqrshrn.s32   d1, q1, #0x6          @ D1  = c[i] = ((q[i] + 32) >> 4) where i = 4..7
    vqrshrn.s32   d2, q2, #0x6          @ D2  = c[i] = ((q[i] + 32) >> 4) where i = 8..11
    vqrshrn.s32   d3, q3, #0x6          @ D3  = c[i] = ((q[i] + 32) >> 4) where i = 12..15

    vst1.s16      {d0, d1, d2, d3}, [r1] @IV row store the value

    vpop          {d8-d15}
    ldmfd         sp!, {r4-r12, r15}    @Reload the registers from SP


@ *******************************************************************************
@ *
@ * @brief This function performs a 2x2 inverse hadamard transform for chroma block
@ *
@ * @par Description:
@ *  The DC coefficients pass through a 2-stage inverse hadamard transform.
@ *  This inverse transformed content is scaled to based on Qp value.
@ *  Both DC blocks of U and v blocks are processesd
@ *
@ * @param[in] pi2_src
@ *  input 1x8 block of ceffs. First 4 are from U and next from V
@ *
@ * @param[out] pi2_out
@ *  output 1x8 block
@ *
@ * @param[in] pu2_iscal_mat
@ *  pointer to scaling list
@ *
@ * @param[in] pu2_weigh_mat
@ *  pointer to weight matrix
@ *
@ * @param[in] u4_qp_div_6
@ *  Floor (qp/6)
@ *
@ * @returns none
@ *
@ * @remarks none
@ *
@ *******************************************************************************
@ *
@ *
@ *******************************************************************************
@ *
@ void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
@                                  WORD16* pi2_out,
@                                  const UWORD16 *pu2_iscal_mat,
@                                  const UWORD16 *pu2_weigh_mat,
@                                  UWORD32 u4_qp_div_6,

    .global ih264_ihadamard_scaling_2x2_uv_a9
ih264_ihadamard_scaling_2x2_uv_a9:

@Registers used
@   r0 : *pi2_src
@   r1 : *pi2_out
@   r2 : *pu2_iscal_mat
@   r3 : *pu2_weigh_mat

    vld1.u16      d26[0], [r2]
    vld1.u16      d27[0], [r3]
    vmull.u16     q15, d26, d27         @pu2_iscal_mat[0] *  pu2_weigh_mat[0]
    vdup.u32      q15, d30[0]

    vld1.u16      d28[0], [sp]          @load qp/6

    vpush         {d8-d15}

    vmov.u16      d29, #5
    vsubl.u16     q14, d28, d29         @qp\6 - 5
    vdup.s32      q14, d28[0]

    vld2.s16      {d0, d1}, [r0]        @load 8 dc coeffs
                                        @i2_x4,i2_x6,i2_y4,i1_y6 -> d0
                                        @i2_x5,i2_x7,i2_y5,i1_y6 -> d1

    vaddl.s16     q1, d0, d1            @  i4_x0 = i4_x4 + i4_x5;...x2
    vsubl.s16     q2, d0, d1            @  i4_x1 = i4_x4 - i4_x5;...x3

    vtrn.s32      q1, q2                @i4_x0 i4_x1 -> q1

    vadd.s32      q3, q1, q2            @i4_x4 = i4_x0+i4_x2;.. i4_x5
    vsub.s32      q1, q1, q2            @i4_x6 = i4_x0-i4_x2;.. i4_x7

    vmul.s32      q5, q3, q15
    vmul.s32      q6, q1, q15

    vshl.s32      q7, q5, q14
    vshl.s32      q8, q6, q14

    vmovn.s32     d18, q7               @i4_x4 i4_x5 i4_y4 i4_y5
    vmovn.s32     d19, q8               @i4_x6 i4_x7 i4_y6 i4_y7

    vst2.s32      {d18-d19}, [r1]

    vpop          {d8-d15}
    bx            lr