@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/

.text
.p2align 2

@/*****************************************************************************
@*
@*  Function Name    : ih264e_fmt_conv_420p_to_420sp_a9q()
@*
@*  Description      : Converts an image from YUV420 planar (separate Y, U
@*                     and V planes) to YUV420 semi-planar (Y plane plus one
@*                     plane of interleaved U,V byte pairs).
@*                     The Y plane is copied unchanged, 16 bytes at a time,
@*                     unless convert_uv_only equals 1, in which case the
@*                     luma copy is skipped and only chroma is interleaved.
@*
@*  Arguments        : R0           pu1_y
@*                     R1           pu1_u
@*                     R2           pu1_v
@*                     R3           pu1_dest_y
@*                     [R13 #40]    pu1_dest_uv
@*                     [R13 #44]    u2_height
@*                     [R13 #48]    u2_width
@*                     [R13 #52]    u2_stridey
@*                     [R13 #56]    u2_strideu
@*                     [R13 #60]    u2_stridev
@*                     [R13 #64]    u2_dest_stride_y
@*                     [R13 #68]    u2_dest_stride_uv
@*                     [R13 #72]    convert_uv_only
@*                     (stack offsets are relative to SP after the prologue
@*                      has pushed its 10 registers)
@*
@*  Values Returned  : None
@*
@*  Register Usage   : R0 - R9 as working registers, NEON D0 and D1
@*
@*  Stack Usage      : 40 Bytes
@*
@*  Interruptibility : Interruptible
@*
@*  Known Limitations
@*       Assumptions: Image Width:  Need not be a multiple of 16, but must be
@*                    at least 16. A row tail is handled by stepping the
@*                    pointers back and re-processing one full vector, so a
@*                    narrower row would access bytes before the row start.
@*                    Image Height: Assumed to be even and non-zero (the row
@*                    loops test their counters at the bottom, so each loop
@*                    body runs at least once).
@*                    u2_strideu and u2_stridev are applied independently to
@*                    the U and V source pointers.
@*
@*  Revision History :
@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
@*         07 06 2010   Varshita        Draft
@*         07 06 2010   Naveen Kr T     Completed
@*
@*****************************************************************************/
    .global ih264e_fmt_conv_420p_to_420sp_a9q

ih264e_fmt_conv_420p_to_420sp_a9q:

    @// Save the working registers (10 words = 40 bytes). The stack
    @// arguments are therefore found from [sp, #40] onwards.
    stmfd         sp!, {r4-r12, lr}

    ldr           r4, [sp, #72]         @// Load convert_uv_only

    cmp           r4, #1
    beq           yuv420sp_uv_chroma    @// Exactly 1 => skip the luma copy

    @/* Luma: straight copy of u2_height rows of u2_width bytes */
    ldr           r4, [sp, #44]         @// Load u2_height from stack
    ldr           r5, [sp, #48]         @// Load u2_width from stack
    ldr           r7, [sp, #52]         @// Load u2_stridey from stack
    ldr           r8, [sp, #64]         @// Load u2_dest_stride_y from stack
    sub           r7, r7, r5            @// Source increment      = stride - width
    sub           r8, r8, r5            @// Destination increment = stride - width

yuv420sp_uv_row_loop_y:
    mov           r6, r5                @// r6 = bytes left in this row

yuv420sp_uv_col_loop_y:
    pld           [r0, #128]
    vld1.8        {d0, d1}, [r0]!       @// Copy 16 luma bytes
    vst1.8        {d0, d1}, [r3]!
    sub           r6, r6, #16
    cmp           r6, #15
    bgt           yuv420sp_uv_col_loop_y @// Loop while at least 16 bytes remain

    cmp           r6, #0
    beq           yuv420sp_uv_row_loop_end_y
    @// Width is not a multiple of 16: step both pointers back by
    @// (16 - remaining) bytes so that one more 16-byte copy ends exactly at
    @// the end of the row. The overlapping bytes are simply copied again.
    @// Ex: for width 162 the loop above covers 160 pixels; source and
    @// destination are moved back to pixel 146 and 16 bytes are copied.
    rsb           r6, r6, #16
    sub           r0, r0, r6
    sub           r3, r3, r6

    vld1.8        {d0, d1}, [r0]!
    vst1.8        {d0, d1}, [r3]!

yuv420sp_uv_row_loop_end_y:
    add           r0, r0, r7            @// Advance source to the next row
    add           r3, r3, r8            @// Advance destination to the next row
    subs          r4, r4, #1
    bgt           yuv420sp_uv_row_loop_y

yuv420sp_uv_chroma:
    @/* Chroma: interleave u2_height/2 rows of u2_width/2 U and V bytes */
    ldr           r3, [sp, #40]         @// Load pu1_dest_uv from stack
    ldr           r4, [sp, #44]         @// Load u2_height from stack
    ldr           r5, [sp, #48]         @// Load u2_width from stack
    ldr           r7, [sp, #56]         @// Load u2_strideu from stack
    ldr           r9, [sp, #60]         @// Load u2_stridev from stack
    ldr           r8, [sp, #68]         @// Load u2_dest_stride_uv from stack

    sub           r7, r7, r5, lsr #1    @// U source increment = strideu - width/2
    sub           r9, r9, r5, lsr #1    @// V source increment = stridev - width/2
    sub           r8, r8, r5            @// Destination increment = stride - width
                                        @// (a UV row holds width/2 byte pairs)

    mov           r5, r5, lsr #1        @// r5 = chroma samples per row
    mov           r4, r4, lsr #1        @// r4 = chroma rows

yuv420sp_uv_row_loop_uv:
    mov           r6, r5                @// r6 = chroma samples left in this row

yuv420sp_uv_col_loop_uv:
    pld           [r1, #128]
    pld           [r2, #128]
    vld1.8        d0, [r1]!             @// 8 U samples
    vld1.8        d1, [r2]!             @// 8 V samples
    vst2.8        {d0, d1}, [r3]!       @// Store as U0 V0 U1 V1 ... (16 bytes)
    sub           r6, r6, #8
    cmp           r6, #7
    bgt           yuv420sp_uv_col_loop_uv @// Loop while at least 8 samples remain

    cmp           r6, #0
    beq           yuv420sp_uv_row_loop_end_uv
    @// Chroma width is not a multiple of 8: step the U and V pointers back
    @// by (8 - remaining) samples and the destination by twice that many
    @// bytes, then process one more group of 8 U + 8 V samples so that it
    @// ends exactly at the end of the row.
    rsb           r6, r6, #8
    sub           r1, r1, r6
    sub           r2, r2, r6
    sub           r3, r3, r6, lsl #1

    vld1.8        d0, [r1]!
    vld1.8        d1, [r2]!
    vst2.8        {d0, d1}, [r3]!

yuv420sp_uv_row_loop_end_uv:
    add           r1, r1, r7            @// Next U row
    add           r2, r2, r9            @// Next V row (uses its own stride)
    add           r3, r3, r8            @// Next interleaved UV row
    subs          r4, r4, #1
    bgt           yuv420sp_uv_row_loop_uv
    @// Restore the registers and return
    ldmfd         sp!, {r4-r12, pc}


@ /**
@ *******************************************************************************
@ *
@ * @brief ih264e_fmt_conv_422i_to_420sp_a9q
@ *     Converts an interleaved 4:2:2 image (4 bytes per pixel pair) into a
@ *     Y plane plus one interleaved chroma plane (4:2:0 semi-planar).
@ *
@ * Inputs             : r0 - pu1_y            -   UWORD8 pointer to the output Y plane.
@ *                      r1 - pu1_u            -   UWORD8 pointer to the output interleaved
@ *                                                chroma plane.
@ *                      r2 - pu1_v            -   Not referenced by this routine.
@ *                      r3 - pu2_yuv422i      -   UWORD16 pointer to the yuv422i input image.
@ *              stack + 40 - u4_width         -   Width of the Y plane.
@ *                      44 - u4_height        -   Height of the Y plane.
@ *                      48 - u4_stride_y      -   Stride of the output Y plane.
@ *                      52 - u4_stride_u      -   Stride of the output chroma plane.
@ *                      56 - u4_stride_v      -   Not read by this routine.
@ *                      60 - u4_stride_yuv422i-   Stride in pixels of the yuv422i image
@ *                                                (converted to bytes by doubling).
@ *     (stack offsets are relative to SP after the prologue has pushed its
@ *      10 registers)
@ *
@ * @par   Description
@ * Two input rows are processed per outer iteration. For every 4-byte input
@ * group, bytes 1 and 3 are written out as two consecutive luma samples of
@ * the corresponding output row. Bytes 0 and 2 of the two rows are averaged
@ * vertically with rounding (VRHADD) and written as one interleaved pair to
@ * the chroma plane. The original comments name byte 0 "U" and byte 2 "V",
@ * i.e. the input is presumably U Y V Y ordered - confirm against the caller.
@ *
@ * @returns Nothing is returned in a register by this routine; r0, r1 and r3
@ * are left holding advanced pointers.
@ *
@ * @remarks
@ * Each step handles 16 pixels. A row tail that is not a multiple of 16 is
@ * handled by stepping all pointers back and converting one more full group
@ * of 16 pixels that ends at the end of the row, so the width must be at
@ * least 16. The original note asks for the width to be aligned to 16 and
@ * the height to 2 in assembly. The loops test their counters at the bottom,
@ * so at least one pair of rows is always processed.
@ *
@ * Revision History :
@ *         DD MM YYYY   Author(s)              Changes (Describe the changes made)
@ *         07 06 2010   Harinarayanan K K       Adapted to 422p
@ *
@ *******************************************************************************
@ */

@// Converts one group of 16 pixels taken from two vertically adjacent input
@// rows. Expects r3/r8 = input row 1/2, r0/r6 = output Y row 1/2,
@// r1 = output chroma row. All five pointers are post-incremented.
.macro ih264e_422i_to_420sp_16pix
    vld4.8        {d0, d1, d2, d3}, [r3]! @// Row 1: d0/d2 = chroma bytes, d1/d3 = luma bytes
    vld4.8        {d4, d5, d6, d7}, [r8]! @// Row 2: d4/d6 = chroma bytes, d5/d7 = luma bytes

    vst2.8        {d1, d3}, [r0]!       @// 16 luma samples of row 1
    vst2.8        {d5, d7}, [r6]!       @// 16 luma samples of row 2

    vrhadd.u8     d0, d0, d4            @// Rounded vertical average of byte-0 lanes
    vrhadd.u8     d2, d2, d6            @// Rounded vertical average of byte-2 lanes
    vst2.8        {d0, d2}, [r1]!       @// 8 interleaved chroma pairs
.endm

    .global ih264e_fmt_conv_422i_to_420sp_a9q
ih264e_fmt_conv_422i_to_420sp_a9q:
    stmfd         sp!, {r4-r12, lr}     @// Save working registers (40 bytes)

    @// Fetch the stack arguments
    ldr           r7, [sp, #40]         @// u4_width
    ldr           r11, [sp, #44]        @// u4_height
    ldr           r4, [sp, #48]         @// u4_stride_y
    ldr           r9, [sp, #52]         @// u4_stride_u
    ldr           r5, [sp, #60]         @// u4_stride_yuv422i (pixels)

    @// Second-row pointers
    add           r6, r0, r4            @// pu1_y_nxt_row       = pu1_y + u4_stride_y
    add           r8, r3, r5, lsl #1    @// pu2_yuv422i_nxt_row = pu2_yuv422i + 2 * u4_stride_yuv422i bytes

    @// Loop count and end-of-row pointer increments
    mov           r11, r11, asr #1      @// Number of row pairs = u4_height / 2
    sub           r9, r9, r7            @// Chroma increment = u4_stride_u - u4_width
    rsb           r4, r7, r4, lsl #1    @// Luma increment   = 2 * u4_stride_y - u4_width
    rsb           r5, r7, r5, lsl #1    @// 2 * u4_stride_yuv422i - u4_width (pixels) ...
    mov           r5, r5, lsl #1        @// ... times 2 bytes per pixel

@// Register Assignment
@// pu1_y                        - r0
@// pu1_y_nxt_row                - r6
@// pu1_u (interleaved chroma)   - r1
@// pu2_yuv422i                  - r3
@// pu2_yuv422i_nxt_row          - r8
@// luma row-pair increment      - r4
@// chroma row increment         - r9
@// input row-pair increment     - r5
@// u4_width                     - r7
@// u4_height / 2                - r11
@// pixels left in the row       - r12
ih264e_422i_to_420sp_row_pair_loop:

    mov           r12, r7               @// Pixels left in this row pair = u4_width

ih264e_422i_to_420sp_col_loop:
    ih264e_422i_to_420sp_16pix
    sub           r12, r12, #16
    cmp           r12, #15
    bgt           ih264e_422i_to_420sp_col_loop @// Loop while at least 16 pixels remain

    cmp           r12, #0
    beq           ih264e_422i_to_420sp_row_pair_end

    @// Width is not a multiple of 16: move every pointer back by
    @// (16 - remaining) pixels so that one more group of 16 pixels ends
    @// exactly at the end of the row. Ex: for width 162 the loop above
    @// covers 160 pixels and the pointers are moved back to pixel 146.
    @// Output pointers move 1 byte per pixel, input pointers 2 bytes.
    rsb           r12, r12, #16
    sub           r0, r0, r12
    sub           r6, r6, r12
    sub           r1, r1, r12
    sub           r3, r3, r12, lsl #1
    sub           r8, r8, r12, lsl #1

    ih264e_422i_to_420sp_16pix

ih264e_422i_to_420sp_row_pair_end:
    @// Move every pointer on to the next pair of rows
    add           r0, r0, r4            @// pu1_y
    add           r6, r6, r4            @// pu1_y_nxt_row
    add           r1, r1, r9            @// pu1_u
    add           r3, r3, r5            @// pu2_yuv422i
    add           r8, r8, r5            @// pu2_yuv422i_nxt_row

    subs          r11, r11, #1
    bgt           ih264e_422i_to_420sp_row_pair_loop

    ldmfd         sp!, {r4-r12, pc}     @// Restore registers and return