From 8d3d303c7942ced6a987a52db8977d768dc3605f Mon Sep 17 00:00:00 2001
From: Hamsalekha S <hamsalekha.s@ittiam.com>
Date: Fri, 13 Mar 2015 21:24:58 +0530
Subject: Initial version

Change-Id: I7efe9a589cd24edf86e8d086b40c27cbbf8b4017
---
 encoder/arm/ih264e_fmt_conv.s | 329 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 329 insertions(+)
 create mode 100755 encoder/arm/ih264e_fmt_conv.s

(limited to 'encoder/arm/ih264e_fmt_conv.s')

diff --git a/encoder/arm/ih264e_fmt_conv.s b/encoder/arm/ih264e_fmt_conv.s
new file mode 100755
index 0000000..2bf1479
--- /dev/null
+++ b/encoder/arm/ih264e_fmt_conv.s
@@ -0,0 +1,329 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+
+.text
+.p2align 2
+@/**
+
+@/*****************************************************************************
+@*                                                                            *
+@*  Function Name    : IH264D_CXA8_YUV420toYUV420SP_UV()                      *
+@*                                                                            *
+@*  Description      : This function conversts the image from YUV420P color   *
+@*                     space to 420SP color space(UV interleaved).        *
+@*                                                                            *
+@*  Arguments        : R0           pu1_y                                     *
+@*                     R1           pu1_u                                     *
+@*                     R2           pu1_v                                     *
+@*                     R3           pu1_dest_y                                *
+@*                     [R13 #40]    pu1_dest_uv                               *
+@*                     [R13 #44]    u2_height                                 *
+@*                     [R13 #48]    u2_width                                  *
+@*                     [R13 #52]    u2_stridey                                *
+@*                     [R13 #56]    u2_strideu                                *
+@*                     [R13 #60]    u2_stridev                                *
+@*                     [R13 #64]    u2_dest_stride_y                          *
+@*                     [R13 #68]    u2_dest_stride_uv                         *
+@*                     [R13 #72]    convert_uv_only                           *
+@*                                                                            *
+@*  Values Returned  : None                                                   *
+@*                                                                            *
+@*  Register Usage   : R0 - R14                                               *
+@*                                                                            *
+@*  Stack Usage      : 40 Bytes                                               *
+@*                                                                            *
+@*  Interruptibility : Interruptible                                          *
+@*                                                                            *
+@*  Known Limitations                                                         *
+@*       Assumptions: Image Width:     Assumed to be multiple of 16 and       *
+@*                     greater than or equal to 16                *
+@*                     Image Height:    Assumed to be even.                   *
+@*                                                                            *
+@*  Revision History :                                                        *
+@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
+@*         07 06 2010   Varshita        Draft                                 *
+@*         07 06 2010   Naveen Kr T     Completed                             *
+@*                                                                            *
+@*****************************************************************************/
+    .global ih264e_fmt_conv_420p_to_420sp_a9q
+
+ih264e_fmt_conv_420p_to_420sp_a9q:
+
+    @// push the registers on the stack
+    stmfd         sp!, {r4-r12, lr}
+
+    ldr           r4, [sp, #72]         @// Load convert_uv_only
+
+    cmp           r4, #1
+    beq           yuv420sp_uv_chroma
+    @/* Do the preprocessing before the main loops start */
+    @// Load the parameters from stack
+    ldr           r4, [sp, #44]         @// Load u2_height from stack
+    ldr           r5, [sp, #48]         @// Load u2_width from stack
+    ldr           r7, [sp, #52]         @// Load u2_stridey from stack
+    ldr           r8, [sp, #64]         @// Load u2_dest_stride_y from stack
+    sub           r7, r7, r5            @// Source increment
+    sub           r8, r8, r5            @// Destination increment
+
+    vpush         {d8-d15}
+yuv420sp_uv_row_loop_y:
+    mov           r6, r5
+
+yuv420sp_uv_col_loop_y:
+    pld           [r0, #128]
+    vld1.8        {d0, d1}, [r0]!
+    vst1.8        {d0, d1}, [r3]!
+    sub           r6, r6, #16
+    cmp           r6, #15
+    bgt           yuv420sp_uv_col_loop_y
+
+    cmp           r6, #0
+    beq           yuv420sp_uv_row_loop_end_y
+    @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
+    @//Ex if width is 162, above loop will process 160 pixels. And
+    @//Both source and destination will point to 146th pixel and then 16 bytes will be read
+    @// and written using VLD1 and VST1
+    rsb           r6, r6, #16
+    sub           r0, r0, r6
+    sub           r3, r3, r6
+
+    vld1.8        {d0, d1}, [r0]!
+    vst1.8        {d0, d1}, [r3]!
+
+yuv420sp_uv_row_loop_end_y:
+    add           r0, r0, r7
+    add           r3, r3, r8
+    subs          r4, r4, #1
+    bgt           yuv420sp_uv_row_loop_y
+
+yuv420sp_uv_chroma:
+
+    ldr           r3, [sp, #40]         @// Load pu1_dest_uv from stack
+
+    ldr           r4, [sp, #44]         @// Load u2_height from stack
+
+    ldr           r5, [sp, #48]         @// Load u2_width from stack
+
+
+    ldr           r7, [sp, #56]         @// Load u2_strideu from stack
+
+    ldr           r8, [sp, #68]         @// Load u2_dest_stride_uv from stack
+
+    sub           r7, r7, r5, lsr #1    @// Source increment
+
+    sub           r8, r8, r5            @// Destination increment
+
+    mov           r5, r5, lsr #1
+    mov           r4, r4, lsr #1
+    ldr           r3, [sp, #40]         @// Load pu1_dest_uv from stack
+    vpush         {d8-d15}
+yuv420sp_uv_row_loop_uv:
+    mov           r6, r5
+
+
+yuv420sp_uv_col_loop_uv:
+    pld           [r1, #128]
+    pld           [r2, #128]
+    vld1.8        d0, [r1]!
+    vld1.8        d1, [r2]!
+    vst2.8        {d0, d1}, [r3]!
+    sub           r6, r6, #8
+    cmp           r6, #7
+    bgt           yuv420sp_uv_col_loop_uv
+
+    cmp           r6, #0
+    beq           yuv420sp_uv_row_loop_end_uv
+    @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
+    @//Ex if width is 162, above loop will process 160 pixels. And
+    @//Both source and destination will point to 146th pixel and then 16 bytes will be read
+    @// and written using VLD1 and VST1
+    rsb           r6, r6, #8
+    sub           r1, r1, r6
+    sub           r2, r2, r6
+    sub           r3, r3, r6, lsl #1
+
+    vld1.8        d0, [r1]!
+    vld1.8        d1, [r2]!
+    vst2.8        {d0, d1}, [r3]!
+
+yuv420sp_uv_row_loop_end_uv:
+    add           r1, r1, r7
+    add           r2, r2, r7
+    add           r3, r3, r8
+    subs          r4, r4, #1
+    bgt           yuv420sp_uv_row_loop_uv
+    @//POP THE REGISTERS
+    vpop          {d8-d15}
+    ldmfd         sp!, {r4-r12, pc}
+
+
+
+
+
+@ /**
+@ *******************************************************************************
+@ *
+@ * @brief ih264e_fmt_conv_422i_to_420sp_a9q
+@ *     Function used from format conversion or frame copy
+@ *
+@ *
+@ *
+@ *Inputs             : r0 - pu1_y            -   UWORD8 pointer to y plane.
+@ *                     r1 - pu1_u            -   UWORD8 pointer to u plane.
+@ *                     r2 - pu1_v            -   UWORD8 pointer to u plane.
+@ *                     r3 - pu2_yuv422i      -   UWORD16 pointer to yuv422iimage.
+@ *             stack + 40 - u4_width         -   Width of the Y plane.
+@ *                     44 - u4_height        -   Height of the Y plane.
+@ *                     48 - u4_stride_y      -   Stride in pixels of Y plane.
+@ *                     52 - u4_stride_u      -   Stride in pixels of U plane.
+@ *                     56 - u4_stride_v      -   Stride in pixels of V plane.
+@ *                     60 - u4_stride_yuv422i-   Stride in pixels of yuv422i image.
+@ *
+@ * @par   Description
+@ * Function used from copying or converting a reference frame to display buffer
+@ * in non shared mode
+@ *
+@ * @param[in] pu1_y_dst
+@ *   Output Y pointer
+@ *
+@ * @param[in] pu1_u_dst
+@ *   Output U/UV pointer ( UV is interleaved in the same format as that of input)
+@ *
+@ * @param[in] pu1_v_dst
+@ *   Output V pointer ( used in 420P output case)
+@ *
+@ * @param[in] u4_dst_y_strd
+@ *   Stride of destination Y buffer
+@ *
+@ * @param[in] u4_dst_u_strd
+@ *   Stride of destination  U/V buffer
+@ *
+@ *
+@ * @param[in] blocking
+@ *   To indicate whether format conversion should wait till frame is reconstructed
+@ *   and then return after complete copy is done. To be set to 1 when called at the
+@ *   end of frame processing and set to 0 when called between frame processing modules
+@ *   in order to utilize available MCPS
+@ *
+@ * @returns Error from IH264E_ERROR_T
+@ *
+@ * @remarks
+@ * Assumes that the stride of U and V buffers are same.
+@ * This is correct in most cases
+@ * If a case comes where this is not true we need to modify the fmt conversion funcnions called inside also
+@ * Since we read 4 pixels ata time the width should be aligned to 4
+@ * In assembly width should be aligned to 16 and height to 2.
+@ *
+@ *
+@ * Revision History :
+@ *         DD MM YYYY   Author(s)              Changes (Describe the changes made)
+@ *         07 06 2010   Harinarayanan K K       Adapeted to 422p
+@ *
+@ *******************************************************************************
+@ */
+
+@//`
+@*/
+    .global ih264e_fmt_conv_422i_to_420sp_a9q
+ih264e_fmt_conv_422i_to_420sp_a9q:
+    stmfd         sp!, {r4-r12, lr}     @// Back the register which are used
+
+
+
+    @/* Do the preprocessing before the main loops start */
+    @// Load the parameters from stack
+    ldr           r4, [sp, #48]         @// Load u4_stride_y       from stack
+
+    ldr           r5, [sp, #60]         @// Load u4_stride_yuv422i from stack
+    add           r6, r0, r4            @// pu1_y_nxt_row       = pu1_y + u4_stride_y
+
+    ldr           r7, [sp, #40]         @// Load u4_width          from stack
+    add           r8, r3, r5, lsl #1    @// pu2_yuv422i_nxt_row = pu2_yuv422i_y + u4_stride_yuv422i(2 Bytes for each pixel)
+
+    ldr           r9, [sp, #52]         @// Load u4_stride_u       from stack
+    sub           r12, r4, r7           @// u2_offset1          = u4_stride_y - u4_width
+
+@LDR            r10,[sp,#56]                ;// Load u4_stride_v       from stack
+    sub           r14, r5, r7           @// u2_offset_yuv422i   = u4_stride_yuv422i - u4_width
+
+    ldr           r11, [sp, #44]        @// Load u4_height         from stack
+    sub           r9, r9, r7            @// u2_offset2          = u4_stride_u - u4_width >> 1
+
+@   SUB         r10,r10,r7,ASR #1           ;// u2_offset3          = u4_stride_v - u4_width >> 1
+    mov           r14, r14, lsl #1      @// u2_offset_yuv422i   = u2_offset_yuv422i * 2
+
+    mov           r7, r7, asr #4        @// u4_width = u4_width / 16 (u4_width >> 4)
+    mov           r11, r11, asr #1      @// u4_width = u4_width / 2 (u4_width >> 1)
+
+    add           r4, r12, r4           @// u2_offset1 = u2_offset1 + u4_stride_y
+    add           r5, r14, r5, lsl #1   @// u2_offset_yuv422i = u2_offset_yuv422i + u4_stride_yuv422i
+
+    vpush         {d8-d15}
+
+@// Register Assignment
+@// pu1_y               - r0
+@// pu1_y_nxt_row       - r6
+@// pu1_u               - r1
+@// pu1_v               - r2
+@// pu2_yuv422i         - r3
+@// pu2_yuv422i_nxt_row - r8
+@// u2_offset1          - r4
+@// u2_offset2          - r9
+@// u2_offset3          - r10
+@// u2_offset_yuv422i   - r5
+@// u4_width / 16       - r7
+@// u4_height / 2       - r11
+@// inner loop count    - r12
+yuv420_to_yuv422i_hight_loop:
+
+    mov           r12, r7               @// Inner loop count = u4_width / 16
+
+yuv420_to_yuv422i_width_loop:
+    vld4.8        {d0, d1, d2, d3}, [r3]! @// Load the 16 elements of row 1
+    vld4.8        {d4, d5, d6, d7}, [r8]! @// Load the 16 elements of row 2
+    subs          r12, r12, #1
+
+    vrhadd.u8     d0, d0, d4
+    vrhadd.u8     d2, d2, d6
+
+    vst2.8        {d1, d3}, [r0]!       @// Store the 16 elements of row1 Y
+    vst2.8        {d5, d7}, [r6]!       @// Store the 16 elements of row2 Y
+
+    vst2.8        {d0, d2}, [r1]!       @// Store the 8 elements of row1/2 U
+
+    bgt           yuv420_to_yuv422i_width_loop
+
+    @// Update the buffer pointer so that they will refer to next pair of rows
+    add           r0, r0, r4            @// pu1_y               = pu1_y                 + u2_offset1
+    add           r6, r6, r4            @// pu1_y_nxt_row       = pu1_y_nxt_row         + u2_offset1
+
+    add           r1, r1, r9            @// pu1_u               = pu1_u                 + u2_offset2
+    subs          r11, r11, #1
+
+    add           r3, r3, r5            @// pu2_yuv422i         = pu2_yuv422i           + u2_offset_yuv422i
+
+    add           r8, r8, r5            @// pu2_yuv422i_nxt_row = pu2_yuv422i_nxt_row   + u2_offset_yuv422i
+    bgt           yuv420_to_yuv422i_hight_loop
+    vpop          {d8-d15}
+    ldmfd         sp!, {r4-r12, pc}     @// Restore the register which are used
+
+
+
-- 
cgit v1.2.3