@ Patch provenance (from the cgit patch wrapper this listing was taken from):
@   From 8d3d303c7942ced6a987a52db8977d768dc3605f Mon Sep 17 00:00:00 2001
@   From: Hamsalekha S
@   Date: Fri, 13 Mar 2015 21:24:58 +0530
@   Subject: Initial version
@   Change-Id: I7efe9a589cd24edf86e8d086b40c27cbbf8b4017
@   File: encoder/arm/ih264e_fmt_conv.s (new file, mode 100755, index 0000000..2bf1479)
@
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/

.text
.p2align 2

@/*****************************************************************************
@*                                                                            *
@*  Function Name    : ih264e_fmt_conv_420p_to_420sp_a9q                      *
@*                     (header inherited from                                 *
@*                      IH264D_CXA8_YUV420toYUV420SP_UV)                      *
@*                                                                            *
@*  Description      : Converts an image from YUV420P to YUV420SP             *
@*                     (UV interleaved, U byte first).                        *
@*                     Unless convert_uv_only == 1 the Y plane is copied      *
@*                     row by row to pu1_dest_y; the U and V planes are       *
@*                     always interleaved into pu1_dest_uv.                   *
@*                                                                            *
@*  Arguments        : R0          pu1_y                                      *
@*                     R1          pu1_u                                      *
@*                     R2          pu1_v                                      *
@*                     R3          pu1_dest_y                                 *
@*                     Stack offsets below are relative to SP *after* the     *
@*                     40-byte register push in the prologue:                 *
@*                     [R13 #40]   pu1_dest_uv                                *
@*                     [R13 #44]   u2_height                                  *
@*                     [R13 #48]   u2_width                                   *
@*                     [R13 #52]   u2_stridey                                 *
@*                     [R13 #56]   u2_strideu                                 *
@*                     [R13 #60]   u2_stridev        (never read, see below)  *
@*                     [R13 #64]   u2_dest_stride_y                           *
@*                     [R13 #68]   u2_dest_stride_uv                          *
@*                     [R13 #72]   convert_uv_only                            *
@*                                                                            *
@*  Values Returned  : None                                                   *
@*                                                                            *
@*  Register Usage   : R0 - R8 are modified (R4 - R12, LR are saved and       *
@*                     restored); NEON D0, D1 are used as scratch.            *
@*                                                                            *
@*  Stack Usage      : 40 Bytes                                               *
@*                                                                            *
@*  Known Limitations                                                         *
@*   Assumptions: Image Width : must be >= 16, because each column loop       *
@*                              moves one full vector before testing the      *
@*                              remaining count. A width that is not a        *
@*                              multiple of 16 is finished with one extra     *
@*                              vector that overlaps bytes already copied.    *
@*                Image Height: assumed to be even (chroma rows = height/2).  *
@*                Chroma      : u2_strideu is applied to both the U and the   *
@*                              V source pointer; u2_stridev is ignored.      *
@*                                                                            *
@*  Revision History :                                                        *
@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
@*         07 06 2010   Varshita        Draft                                 *
@*         07 06 2010   Naveen Kr T     Completed                             *
@*         (review)                     Removed unbalanced vpush {d8-d15}:    *
@*                                      the luma path pushed 64 bytes and     *
@*                                      fell through into the chroma path,    *
@*                                      which then read its arguments from    *
@*                                      wrong stack offsets, pushed again     *
@*                                      and popped only once before return.   *
@*                                                                            *
@*****************************************************************************/
    .global ih264e_fmt_conv_420p_to_420sp_a9q

ih264e_fmt_conv_420p_to_420sp_a9q:

    @// Save the core registers (10 words = 40 bytes). All [sp, #N] argument
    @// offsets used below rely on SP staying exactly here until the final
    @// ldmfd, so nothing else may be pushed. No NEON save is needed: only
    @// d0/d1 are touched, and d8-d15 (the callee-saved ones) are never used.
    stmfd         sp!, {r4-r12, lr}

    ldr           r4, [sp, #72]         @// Load convert_uv_only

    cmp           r4, #1
    beq           yuv420sp_uv_chroma    @// convert_uv_only == 1: skip the luma copy

    @/* Luma: plain row-by-row copy */
    ldr           r4, [sp, #44]         @// Load u2_height from stack
    ldr           r5, [sp, #48]         @// Load u2_width from stack
    ldr           r7, [sp, #52]         @// Load u2_stridey from stack
    ldr           r8, [sp, #64]         @// Load u2_dest_stride_y from stack
    sub           r7, r7, r5            @// Source increment      (stride - width)
    sub           r8, r8, r5            @// Destination increment (stride - width)

yuv420sp_uv_row_loop_y:
    mov           r6, r5                @// r6 = bytes still to copy in this row

yuv420sp_uv_col_loop_y:
    pld           [r0, #128]
    vld1.8        {d0, d1}, [r0]!       @// copy 16 luma bytes
    vst1.8        {d0, d1}, [r3]!
    sub           r6, r6, #16
    cmp           r6, #15
    bgt           yuv420sp_uv_col_loop_y    @// loop while at least 16 bytes remain

    cmp           r6, #0
    beq           yuv420sp_uv_row_loop_end_y
    @// Width is not a multiple of 16: step both pointers back by (16 - r6)
    @// so that one more full 16-byte vector ends exactly at the row end.
    @// Ex: if width is 162, the loop above has processed 160 pixels; source
    @// and destination are moved back to pixel 146 and 16 bytes are copied
    @// again (14 of them were already written with the same values).
    rsb           r6, r6, #16
    sub           r0, r0, r6
    sub           r3, r3, r6

    vld1.8        {d0, d1}, [r0]!
    vst1.8        {d0, d1}, [r3]!

yuv420sp_uv_row_loop_end_y:
    add           r0, r0, r7            @// advance source to the next row
    add           r3, r3, r8            @// advance destination to the next row
    subs          r4, r4, #1
    bgt           yuv420sp_uv_row_loop_y

yuv420sp_uv_chroma:
    @/* Chroma: interleave U and V, height/2 rows of width/2 samples each */
    ldr           r3, [sp, #40]         @// Load pu1_dest_uv from stack
    ldr           r4, [sp, #44]         @// Load u2_height from stack
    ldr           r5, [sp, #48]         @// Load u2_width from stack
    ldr           r7, [sp, #56]         @// Load u2_strideu from stack
    ldr           r8, [sp, #68]         @// Load u2_dest_stride_uv from stack

    sub           r7, r7, r5, lsr #1    @// Source increment      (strideu - width/2)
    sub           r8, r8, r5            @// Destination increment (dest_stride_uv - width)

    mov           r5, r5, lsr #1        @// r5 = chroma samples per row (width / 2)
    mov           r4, r4, lsr #1        @// r4 = chroma rows            (height / 2)

yuv420sp_uv_row_loop_uv:
    mov           r6, r5                @// r6 = U (and V) samples left in this row

yuv420sp_uv_col_loop_uv:
    pld           [r1, #128]
    pld           [r2, #128]
    vld1.8        d0, [r1]!             @// 8 U samples
    vld1.8        d1, [r2]!             @// 8 V samples
    vst2.8        {d0, d1}, [r3]!       @// 16 bytes: U0 V0 U1 V1 ...
    sub           r6, r6, #8
    cmp           r6, #7
    bgt           yuv420sp_uv_col_loop_uv   @// loop while at least 8 samples remain

    cmp           r6, #0
    beq           yuv420sp_uv_row_loop_end_uv
    @// width/2 is not a multiple of 8: step U and V back by (8 - r6) samples
    @// and the interleaved destination back by twice that many bytes, then
    @// redo one full 8-sample vector that ends exactly at the row end.
    rsb           r6, r6, #8
    sub           r1, r1, r6
    sub           r2, r2, r6
    sub           r3, r3, r6, lsl #1

    vld1.8        d0, [r1]!
    vld1.8        d1, [r2]!
    vst2.8        {d0, d1}, [r3]!

yuv420sp_uv_row_loop_end_uv:
    add           r1, r1, r7            @// next U row
    add           r2, r2, r7            @// next V row (same increment as U)
    add           r3, r3, r8            @// next interleaved UV row
    subs          r4, r4, #1
    bgt           yuv420sp_uv_row_loop_uv

    @// Restore the registers and return
    ldmfd         sp!, {r4-r12, pc}





@ /**
@ *******************************************************************************
@ *
@ * @brief ih264e_fmt_conv_422i_to_420sp_a9q
@ *     Function used from format conversion or frame copy
@ *
@ *
@ *
@ *Inputs                 : r0 - pu1_y              -   UWORD8 pointer to y plane.
@ *                         r1 - pu1_u              -   UWORD8 pointer to u plane.
@ *                         r2 - pu1_v              -   UWORD8 pointer to v plane.
@ *                         r3 - pu2_yuv422i        -   UWORD16 pointer to yuv422i image.
@ *                 stack + 40 - u4_width           -   Width of the Y plane.
@ *                         44 - u4_height          -   Height of the Y plane.
@ *                         48 - u4_stride_y        -   Stride in pixels of Y plane.
@ *                         52 - u4_stride_u        -   Stride in pixels of U plane.
@ *                         56 - u4_stride_v        -   Stride in pixels of V plane.
@ *                         60 - u4_stride_yuv422i  -   Stride in pixels of yuv422i image.
@ *
@ * @par   Description
@ * Function used from copying or converting a reference frame to display buffer
@ * in non shared mode
@ *
@ * @param[in] pu1_y_dst
@ *   Output Y pointer
@ *
@ * @param[in] pu1_u_dst
@ *   Output U/UV pointer ( UV is interleaved in the same format as that of input)
@ *
@ * @param[in] pu1_v_dst
@ *   Output V pointer ( used in 420P output case)
@ *
@ * @param[in] u4_dst_y_strd
@ *   Stride of destination Y buffer
@ *
@ * @param[in] u4_dst_u_strd
@ *   Stride of destination  U/V buffer
@ *
@ *
@ * @param[in] blocking
@ *   To indicate whether format conversion should wait till frame is reconstructed
@ *   and then return after complete copy is done. To be set to 1 when called at the
@ *   end of frame processing and set to 0 when called between frame processing modules
@ *   in order to utilize available MCPS
@ *
@ * @returns Error from IH264E_ERROR_T
@ *
@ * @remarks
@ * Assumes that the stride of U and V buffers are same.
@ *  This is correct in most cases
@ *  If a case comes where this is not true we need to modify the fmt conversion functions called inside also
@ *  Since we read 4 pixels at a time the width should be aligned to 4
@ *  In assembly width should be aligned to 16 and height to 2.
@ *
@ *
@ * Revision History :
@ *         DD MM YYYY   Author(s)       Changes (Describe the changes made)
@ *         07 06 2010   Harinarayanan K K       Adapted to 422p
@ *
@ *******************************************************************************
@ */

@// What the code below actually does, per pair of source rows:
@//   - every 4-byte group of the interleaved source is split into its 4 bytes;
@//   - bytes 1 and 3 of each group are written, in order, to the Y rows at
@//     r0 (upper row) and r6 (lower row);
@//   - bytes 0 and 2 of each group are averaged (rounding up) between the
@//     upper and lower row and written as interleaved pairs to r1.
@//     NOTE(review): presumably these are U and V of a UYVY-ordered source,
@//     giving a UV-interleaved 420SP chroma plane - confirm against caller.
@//   - r2 (pu1_v) and the stack word at #56 (u4_stride_v) are never read.
@//   - each inner iteration consumes 16 pixels (32 source bytes) per row; the
@//     iteration count is u4_width >> 4 and the row-pair count is
@//     u4_height >> 1, and both loops execute at least once.
@//   - stack offsets are relative to SP after the 40-byte push below and are
@//     all read before the NEON registers are pushed.
    .global ih264e_fmt_conv_422i_to_420sp_a9q
ih264e_fmt_conv_422i_to_420sp_a9q:
    stmfd         sp!, {r4-r12, lr}     @// save core registers (40 bytes)

    @// Fetch every stack argument up front
    ldr           r7, [sp, #40]         @// r7  = u4_width
    ldr           r11, [sp, #44]        @// r11 = u4_height
    ldr           r4, [sp, #48]         @// r4  = u4_stride_y
    ldr           r9, [sp, #52]         @// r9  = u4_stride_u
    ldr           r5, [sp, #60]         @// r5  = u4_stride_yuv422i (pixels, 2 bytes each)

    @// Second-row pointers
    add           r6, r0, r4            @// r6 = pu1_y + u4_stride_y
    add           r8, r3, r5, lsl #1    @// r8 = pu2_yuv422i + 2 * u4_stride_yuv422i (bytes)

    @// End-of-row-pair pointer adjustments
    sub           r9, r9, r7            @// chroma  : u4_stride_u - u4_width
    sub           r12, r4, r7           @// luma    : (u4_stride_y - u4_width)
    add           r4, r12, r4           @//           + u4_stride_y, skips the row already written
    sub           r14, r5, r7           @// source  : (u4_stride_yuv422i - u4_width)
    mov           r14, r14, lsl #1      @//           * 2 bytes per pixel
    add           r5, r14, r5, lsl #1   @//           + one full source row in bytes

    @// Loop trip counts
    mov           r7, r7, asr #4        @// r7  = u4_width / 16
    mov           r11, r11, asr #1      @// r11 = u4_height / 2

    @// NOTE(review): the loop below only uses d0-d7, so this save of d8-d15
    @// looks unnecessary; it is kept so the stack behaviour is unchanged.
    vpush         {d8-d15}

@// Register roles inside the loops
@//   r0  - Y destination, upper row          r6  - Y destination, lower row
@//   r1  - chroma destination                r2  - pu1_v (unused)
@//   r3  - source, upper row                 r8  - source, lower row
@//   r4  - Y pointer adjustment per row pair
@//   r9  - chroma pointer adjustment per row pair
@//   r5  - source pointer adjustment per row pair (bytes)
@//   r7  - u4_width / 16                     r11 - remaining row pairs
@//   r12 - inner loop counter                r10 - unused
fmt_conv_422i_row_pair_loop:

    mov           r12, r7               @// inner loop count = u4_width / 16

fmt_conv_422i_col_loop:
    vld4.8        {d0, d1, d2, d3}, [r3]!   @// upper row: 8 groups of 4 bytes, split by byte position
    vld4.8        {d4, d5, d6, d7}, [r8]!   @// lower row: same layout

    vst2.8        {d1, d3}, [r0]!       @// 16 Y bytes of the upper row (bytes 1 and 3 re-interleaved)
    vst2.8        {d5, d7}, [r6]!       @// 16 Y bytes of the lower row

    vrhadd.u8     d0, d0, d4            @// rounded average of byte 0 across the two rows
    vrhadd.u8     d2, d2, d6            @// rounded average of byte 2 across the two rows
    vst2.8        {d0, d2}, [r1]!       @// 8 interleaved chroma pairs (16 bytes)

    subs          r12, r12, #1
    bgt           fmt_conv_422i_col_loop

    @// Move every pointer to the start of the next pair of rows
    add           r0, r0, r4
    add           r6, r6, r4
    add           r1, r1, r9
    add           r3, r3, r5
    add           r8, r8, r5

    subs          r11, r11, #1
    bgt           fmt_conv_422i_row_pair_loop

    vpop          {d8-d15}
    ldmfd         sp!, {r4-r12, pc}     @// restore core registers and return



@ -- cgit v1.2.3