@ Source path: encoder/arm/ih264e_fmt_conv.s
@ (blob 2c04141dbf4c2da7ebe8d61ec9e2b575184b2f84)
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@/**

.text
.p2align 2
@/**

@/*****************************************************************************
@*
@*  Function Name    : ih264e_fmt_conv_420p_to_420sp_a9q
@*
@*  Description      : Converts an image from YUV420 planar (separate Y, U and
@*                     V planes) to YUV420 semi-planar (Y plane followed by a
@*                     single plane of interleaved U,V bytes).
@*                     The Y plane is copied row by row; the chroma planes are
@*                     interleaved as U0 V0 U1 V1 ... with VST2.
@*
@*  Arguments        : R0           pu1_y
@*                     R1           pu1_u
@*                     R2           pu1_v
@*                     R3           pu1_dest_y
@*                     (offsets below are relative to SP after the 40-byte
@*                      register save done on entry)
@*                     [R13 #40]    pu1_dest_uv
@*                     [R13 #44]    u2_height
@*                     [R13 #48]    u2_width
@*                     [R13 #52]    u2_stridey
@*                     [R13 #56]    u2_strideu
@*                     [R13 #60]    u2_stridev
@*                     [R13 #64]    u2_dest_stride_y
@*                     [R13 #68]    u2_dest_stride_uv
@*                     [R13 #72]    convert_uv_only  (1 => skip the Y copy)
@*
@*  Values Returned  : None
@*
@*  Register Usage   : R0 - R9, D0 - D1
@*
@*  Stack Usage      : 40 Bytes (r4-r12 and lr)
@*
@*  Known Limitations
@*       Assumptions: Image Width : must be at least 16. A width that is not
@*                     a multiple of 16 is handled by re-copying an
@*                     overlapping block at the end of each row, which steps
@*                     the pointers backwards and therefore needs a full
@*                     block to exist before the row end.
@*                     The width is halved with a logical shift for chroma,
@*                     so it is assumed to be even.
@*                     Image Height: assumed to be even and non-zero. Both
@*                     row loops test their counter only after processing a
@*                     row, so at least one row is always written.
@*
@*  Revision History :
@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
@*         07 06 2010   Varshita        Draft
@*         07 06 2010   Naveen Kr T     Completed
@*
@*****************************************************************************/
    .global ih264e_fmt_conv_420p_to_420sp_a9q

ih264e_fmt_conv_420p_to_420sp_a9q:

    @// Save r4-r12 and lr (10 words = 40 bytes); the first stack argument
    @// is therefore found at [sp, #40] from here on
    stmfd         sp!, {r4-r12, lr}

    ldr           r4, [sp, #72]         @// Load convert_uv_only

    cmp           r4, #1
    beq           yuv420sp_uv_chroma    @// Only chroma requested: skip the Y copy
    @/* Do the preprocessing before the main loops start */
    @// Load the parameters from stack
    ldr           r4, [sp, #44]         @// Load u2_height from stack
    ldr           r5, [sp, #48]         @// Load u2_width from stack
    ldr           r7, [sp, #52]         @// Load u2_stridey from stack
    ldr           r8, [sp, #64]         @// Load u2_dest_stride_y from stack
    sub           r7, r7, r5            @// Source increment      = stride - width
    sub           r8, r8, r5            @// Destination increment = stride - width

yuv420sp_uv_row_loop_y:
    mov           r6, r5                @// r6 = bytes still to copy in this row

yuv420sp_uv_col_loop_y:
    pld           [r0, #128]
    vld1.8        {d0, d1}, [r0]!       @// Copy 16 luma bytes
    vst1.8        {d0, d1}, [r3]!
    sub           r6, r6, #16
    cmp           r6, #15
    bgt           yuv420sp_uv_col_loop_y @// Loop while a full 16-byte block remains

    cmp           r6, #0
    beq           yuv420sp_uv_row_loop_end_y
    @// Width is not a multiple of 16: 1..15 bytes remain. Step both pointers
    @// back by (16 - remaining) so that one more full 16-byte copy ends
    @// exactly at the end of the row (the overlapped bytes are rewritten
    @// with the same values).
    @// Ex: for width 162 the loop above handles 160 pixels; the pointers are
    @// moved back by 14 to pixel 146 and pixels 146..161 are copied.
    rsb           r6, r6, #16
    sub           r0, r0, r6
    sub           r3, r3, r6

    vld1.8        {d0, d1}, [r0]!
    vst1.8        {d0, d1}, [r3]!

yuv420sp_uv_row_loop_end_y:
    add           r0, r0, r7            @// Advance source to the next row
    add           r3, r3, r8            @// Advance destination to the next row
    subs          r4, r4, #1
    bgt           yuv420sp_uv_row_loop_y

yuv420sp_uv_chroma:

    ldr           r3, [sp, #40]         @// Load pu1_dest_uv from stack
    ldr           r4, [sp, #44]         @// Load u2_height from stack
    ldr           r5, [sp, #48]         @// Load u2_width from stack
    ldr           r7, [sp, #56]         @// Load u2_strideu from stack
    ldr           r9, [sp, #60]         @// Load u2_stridev from stack
    ldr           r8, [sp, #68]         @// Load u2_dest_stride_uv from stack

    @// Each chroma row holds width/2 samples per plane and produces
    @// width interleaved bytes in the destination.
    sub           r7, r7, r5, lsr #1    @// U source increment    = strideu - width/2
    sub           r9, r9, r5, lsr #1    @// V source increment    = stridev - width/2
    sub           r8, r8, r5            @// Destination increment = dest_stride_uv - width

    mov           r5, r5, lsr #1        @// r5 = chroma samples per row (width / 2)
    mov           r4, r4, lsr #1        @// r4 = chroma rows (height / 2)

yuv420sp_uv_row_loop_uv:
    mov           r6, r5                @// r6 = chroma samples still to convert in this row

yuv420sp_uv_col_loop_uv:
    pld           [r1, #128]
    pld           [r2, #128]
    vld1.8        d0, [r1]!             @// 8 U samples
    vld1.8        d1, [r2]!             @// 8 V samples
    vst2.8        {d0, d1}, [r3]!       @// Store 16 bytes: U0 V0 U1 V1 ...
    sub           r6, r6, #8
    cmp           r6, #7
    bgt           yuv420sp_uv_col_loop_uv @// Loop while a full 8-sample block remains

    cmp           r6, #0
    beq           yuv420sp_uv_row_loop_end_uv
    @// width/2 is not a multiple of 8: 1..7 samples remain. Step the U and V
    @// pointers back by (8 - remaining) samples and the interleaved
    @// destination back by twice that, then convert one more full block of
    @// 8 U + 8 V samples that ends exactly at the end of the row.
    @// Ex: for width 162 (81 chroma samples) the loop above handles 80; the
    @// sources are moved back by 7 to sample 73 and samples 73..80 are done.
    rsb           r6, r6, #8
    sub           r1, r1, r6
    sub           r2, r2, r6
    sub           r3, r3, r6, lsl #1

    vld1.8        d0, [r1]!
    vld1.8        d1, [r2]!
    vst2.8        {d0, d1}, [r3]!

yuv420sp_uv_row_loop_end_uv:
    add           r1, r1, r7            @// Advance U by its own stride
    add           r2, r2, r9            @// Advance V by its own stride (was using the U stride)
    add           r3, r3, r8            @// Advance interleaved destination
    subs          r4, r4, #1
    bgt           yuv420sp_uv_row_loop_uv
    @// Restore the saved registers and return (lr is popped into pc)
    ldmfd         sp!, {r4-r12, pc}





@ /**
@ *******************************************************************************
@ *
@ * @brief ih264e_fmt_conv_422i_to_420sp_a9q
@ *     Converts an interleaved 4:2:2 image into a Y plane plus one plane of
@ *     interleaved chroma bytes, processing two image rows per outer iteration.
@ *
@ * Inputs             : r0 - pu1_y            -   UWORD8 pointer to the output Y plane.
@ *                      r1 - pu1_u            -   UWORD8 pointer to the output chroma plane
@ *                                                (written as interleaved pairs).
@ *                      r2 - pu1_v            -   UWORD8 pointer to the V plane
@ *                                                (not read or written by this routine).
@ *                      r3 - pu2_yuv422i      -   UWORD16 pointer to the input yuv422i image.
@ *              stack + 40 - u4_width         -   Width of the Y plane.
@ *                      44 - u4_height        -   Height of the Y plane.
@ *                      48 - u4_stride_y      -   Stride of the Y plane (added as bytes).
@ *                      52 - u4_stride_u      -   Stride of the chroma output plane (added as bytes).
@ *                      56 - u4_stride_v      -   Stride of the V plane (never loaded).
@ *                      60 - u4_stride_yuv422i-   Stride in pixels of the yuv422i image
@ *                                                (scaled by 2 to get bytes).
@ *     Stack offsets are relative to sp after the 40-byte register save on entry.
@ *
@ * @par   Description
@ * Each inner iteration reads 32 bytes (16 pixels) from two consecutive input
@ * rows and splits every 4-byte group into its four byte lanes with VLD4.
@ * Lanes 1 and 3 are stored unchanged as 16 Y bytes for each of the two rows.
@ * Lanes 0 and 2 of the two rows are averaged with rounding (VRHADD) and stored
@ * interleaved as 16 chroma bytes, giving one chroma row per pair of image rows.
@ * NOTE(review): this lane usage implies chroma-first input ordering
@ * (presumably U Y V Y) - confirm against the caller.
@ *
@ * @returns Nothing is placed in r0 as a return value; on exit r0 holds the
@ * advanced Y pointer.
@ *
@ * @remarks
@ * Only the U stride is used; the V pointer and V stride are ignored because
@ * all chroma is written through r1.
@ * The column count is u4_width >> 4 and the row-pair count is u4_height >> 1,
@ * while the row-advance offsets are computed from the full u4_width, so the
@ * width should be a multiple of 16 and the height a multiple of 2.
@ * Both loops decrement and test their counter after the body, so at least one
@ * 16-pixel block of one row pair is always processed.
@ *
@ *
@ * Revision History :
@ *         DD MM YYYY   Author(s)              Changes (Describe the changes made)
@ *         07 06 2010   Harinarayanan K K       Adapted to 422p
@ *
@ *******************************************************************************
@ */

    .global ih264e_fmt_conv_422i_to_420sp_a9q
ih264e_fmt_conv_422i_to_420sp_a9q:
    stmfd         sp!, {r4-r12, lr}     @// Save r4-r12 and lr (40 bytes); lr (r14) is reused as scratch below



    @/* Do the preprocessing before the main loops start */
    @// Load the parameters from stack
    ldr           r4, [sp, #48]         @// Load u4_stride_y       from stack

    ldr           r5, [sp, #60]         @// Load u4_stride_yuv422i from stack
    add           r6, r0, r4            @// pu1_y_nxt_row       = pu1_y + u4_stride_y

    ldr           r7, [sp, #40]         @// Load u4_width          from stack
    add           r8, r3, r5, lsl #1    @// pu2_yuv422i_nxt_row = pu2_yuv422i + u4_stride_yuv422i * 2 (2 bytes per pixel)

    ldr           r9, [sp, #52]         @// Load u4_stride_u       from stack
    sub           r12, r4, r7           @// u2_offset1          = u4_stride_y - u4_width

@// u4_stride_v ([sp, #56]) is intentionally not loaded: r2/r10 are never used here
    sub           r14, r5, r7           @// u2_offset_yuv422i   = u4_stride_yuv422i - u4_width (in pixels)

    ldr           r11, [sp, #44]        @// Load u4_height         from stack
    sub           r9, r9, r7            @// u2_offset2          = u4_stride_u - u4_width (one chroma row is u4_width bytes)

@// No u2_offset3 is computed: there is no separate V output pointer to advance
    mov           r14, r14, lsl #1      @// u2_offset_yuv422i   = u2_offset_yuv422i * 2 (pixels -> bytes)

    mov           r7, r7, asr #4        @// u4_width  = u4_width / 16  (16 pixels per inner iteration)
    mov           r11, r11, asr #1      @// u4_height = u4_height / 2  (2 rows per outer iteration)

    add           r4, r12, r4           @// u2_offset1 = 2 * u4_stride_y - u4_width (skip the row already written via r6/r0)
    add           r5, r14, r5, lsl #1   @// u2_offset_yuv422i = u2_offset_yuv422i + 2 * u4_stride_yuv422i (bytes, skips one input row)

@// Register Assignment
@// pu1_y               - r0
@// pu1_y_nxt_row       - r6
@// pu1_u (UV output)   - r1
@// pu1_v               - r2  (unused)
@// pu2_yuv422i         - r3
@// pu2_yuv422i_nxt_row - r8
@// u2_offset1          - r4
@// u2_offset2          - r9
@// u2_offset_yuv422i   - r5
@// u4_width / 16       - r7
@// u4_height / 2       - r11
@// inner loop count    - r12
yuv420_to_yuv422i_hight_loop:

    mov           r12, r7               @// Inner loop count = u4_width / 16

yuv420_to_yuv422i_width_loop:
    vld4.8        {d0, d1, d2, d3}, [r3]! @// Row 1: 32 bytes, de-interleaved into lanes d0=byte0, d1=byte1, d2=byte2, d3=byte3
    vld4.8        {d4, d5, d6, d7}, [r8]! @// Row 2: same split into d4..d7
    subs          r12, r12, #1          @// Sets the flags tested by the bgt below (no instruction in between alters them)

    vrhadd.u8     d0, d0, d4            @// Rounded average of lane 0 (first chroma byte) across the two rows
    vrhadd.u8     d2, d2, d6            @// Rounded average of lane 2 (second chroma byte) across the two rows

    vst2.8        {d1, d3}, [r0]!       @// Store the 16 Y bytes of row 1 (lanes 1 and 3 re-interleaved)
    vst2.8        {d5, d7}, [r6]!       @// Store the 16 Y bytes of row 2

    vst2.8        {d0, d2}, [r1]!       @// Store 8 interleaved chroma pairs (16 bytes) shared by rows 1 and 2

    bgt           yuv420_to_yuv422i_width_loop

    @// Update the buffer pointer so that they will refer to next pair of rows
    add           r0, r0, r4            @// pu1_y               = pu1_y                 + u2_offset1
    add           r6, r6, r4            @// pu1_y_nxt_row       = pu1_y_nxt_row         + u2_offset1

    add           r1, r1, r9            @// pu1_u               = pu1_u                 + u2_offset2
    subs          r11, r11, #1          @// Sets the flags tested by the bgt below; the adds in between do not update flags

    add           r3, r3, r5            @// pu2_yuv422i         = pu2_yuv422i           + u2_offset_yuv422i

    add           r8, r8, r5            @// pu2_yuv422i_nxt_row = pu2_yuv422i_nxt_row   + u2_offset_yuv422i
    bgt           yuv420_to_yuv422i_hight_loop
    ldmfd         sp!, {r4-r12, pc}     @// Restore the saved registers and return (saved lr is popped into pc)