summary refs log tree commit diff stats
path: root/encoder/arm/ih264e_fmt_conv.s
diff options
context:
space:
mode:
author Hamsalekha S <hamsalekha.s@ittiam.com> 2015-03-13 21:24:58 +0530
committer Hamsalekha S <hamsalekha.s@ittiam.com> 2015-04-02 15:59:02 +0530
commit 8d3d303c7942ced6a987a52db8977d768dc3605f (patch)
tree cc806c96794356996b13ba9970941d0aed74a97e /encoder/arm/ih264e_fmt_conv.s
parent 3956d913d37327dcb340f836e604b04bd478b158 (diff)
download android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.tar.gz
android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.tar.bz2
android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.zip
Initial version
Change-Id: I7efe9a589cd24edf86e8d086b40c27cbbf8b4017
Diffstat (limited to 'encoder/arm/ih264e_fmt_conv.s')
-rwxr-xr-x encoder/arm/ih264e_fmt_conv.s 329
1 files changed, 329 insertions, 0 deletions
diff --git a/encoder/arm/ih264e_fmt_conv.s b/encoder/arm/ih264e_fmt_conv.s
new file mode 100755
index 0000000..2bf1479
--- /dev/null
+++ b/encoder/arm/ih264e_fmt_conv.s
@@ -0,0 +1,329 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@ GNU assembler source for ARM with NEON; '@' starts a comment.
+
+.text
+.p2align 2
+@ .p2align 2 aligns the code that follows to a 2^2 = 4 byte boundary.
+
+@/*****************************************************************************
+@*  Function Name : ih264e_fmt_conv_420p_to_420sp_a9q
+@*
+@*  Description   : Converts an image from planar YUV420 (separate Y, U and
+@*                  V planes) to semi-planar YUV420 (a Y plane plus one
+@*                  plane of interleaved U,V byte pairs). The Y plane is
+@*                  copied unchanged; that copy is skipped when
+@*                  convert_uv_only is 1.
+@*
+@*  Arguments     : r0          pu1_y               source Y plane
+@*                  r1          pu1_u               source U plane
+@*                  r2          pu1_v               source V plane
+@*                  r3          pu1_dest_y          destination Y plane
+@*                  [sp, #40]   pu1_dest_uv         destination UV plane
+@*                  [sp, #44]   u2_height           luma height in rows
+@*                  [sp, #48]   u2_width            luma width in bytes
+@*                  [sp, #52]   u2_stridey          source Y stride
+@*                  [sp, #56]   u2_strideu          source U stride
+@*                  [sp, #60]   u2_stridev          source V stride
+@*                  [sp, #64]   u2_dest_stride_y    destination Y stride
+@*                  [sp, #68]   u2_dest_stride_uv   destination UV stride
+@*                  [sp, #72]   convert_uv_only     1 = skip the Y copy
+@*                  Stack offsets are relative to sp after the prologue has
+@*                  pushed r4-r12 and lr (40 bytes). Strides are in bytes.
+@*
+@*  Values Returned  : None
+@*
+@*  Register Usage   : r0-r9, d0 and d1 are written. r4-r12 and lr are
+@*                     saved on entry and restored on exit.
+@*                     d8-d15 are not used and therefore not saved.
+@*
+@*  Stack Usage      : 40 bytes
+@*
+@*  Interruptibility : Interruptible
+@*
+@*  Assumptions      : Width is at least 16. A width that is not a multiple
+@*                     of 16 is handled by re-copying one overlapping
+@*                     vector at the end of each row. Height is even.
+@*
+@*  Revision History :
+@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
+@*         07 06 2010   Varshita        Draft
+@*         07 06 2010   Naveen Kr T     Completed
+@*
+@*****************************************************************************/
+    .global ih264e_fmt_conv_420p_to_420sp_a9q
+
+ih264e_fmt_conv_420p_to_420sp_a9q:
+
+    @// Save the callee-saved core registers. Nothing else is pushed, so sp
+    @// is fixed for the whole function and the first stack argument stays
+    @// at [sp, #40] on both the luma and the chroma path.
+    stmfd         sp!, {r4-r12, lr}
+
+    ldr           r4, [sp, #72]         @// Load convert_uv_only
+    cmp           r4, #1
+    beq           yuv420sp_uv_chroma    @// Chroma only: skip the Y copy
+
+    @// Luma: copy u2_height rows of u2_width bytes
+    ldr           r4, [sp, #44]         @// Load u2_height from stack
+    ldr           r5, [sp, #48]         @// Load u2_width from stack
+    ldr           r7, [sp, #52]         @// Load u2_stridey from stack
+    ldr           r8, [sp, #64]         @// Load u2_dest_stride_y from stack
+    sub           r7, r7, r5            @// Source increment at end of row
+    sub           r8, r8, r5            @// Destination increment at end of row
+
+yuv420sp_uv_row_loop_y:
+    mov           r6, r5                @// r6 = bytes left in this row
+
+yuv420sp_uv_col_loop_y:
+    pld           [r0, #128]
+    vld1.8        {d0, d1}, [r0]!       @// Copy 16 bytes of Y
+    vst1.8        {d0, d1}, [r3]!
+    sub           r6, r6, #16
+    cmp           r6, #15
+    bgt           yuv420sp_uv_col_loop_y
+
+    cmp           r6, #0
+    beq           yuv420sp_uv_row_loop_end_y
+    @// Width is not a multiple of 16: step both pointers back so that the
+    @// last 16 bytes of the row are copied with one more full-width
+    @// load/store. E.g. for width 162 the loop above copies 160 bytes,
+    @// then bytes 146..161 are copied (146..159 for the second time).
+    rsb           r6, r6, #16           @// r6 = 16 - bytes left
+    sub           r0, r0, r6
+    sub           r3, r3, r6
+
+    vld1.8        {d0, d1}, [r0]!
+    vst1.8        {d0, d1}, [r3]!
+
+yuv420sp_uv_row_loop_end_y:
+    add           r0, r0, r7            @// Next source row
+    add           r3, r3, r8            @// Next destination row
+    subs          r4, r4, #1
+    bgt           yuv420sp_uv_row_loop_y
+
+yuv420sp_uv_chroma:
+    @// Chroma: interleave u2_height / 2 rows of u2_width / 2 U,V pairs
+    ldr           r3, [sp, #40]         @// Load pu1_dest_uv from stack
+    ldr           r4, [sp, #44]         @// Load u2_height from stack
+    ldr           r5, [sp, #48]         @// Load u2_width from stack
+    ldr           r7, [sp, #56]         @// Load u2_strideu from stack
+    ldr           r9, [sp, #60]         @// Load u2_stridev from stack
+    ldr           r8, [sp, #68]         @// Load u2_dest_stride_uv from stack
+
+    @// U and V get separate increments so that their strides may differ
+    sub           r7, r7, r5, lsr #1    @// U source increment at end of row
+    sub           r9, r9, r5, lsr #1    @// V source increment at end of row
+    sub           r8, r8, r5            @// Destination increment at end of row
+
+    mov           r5, r5, lsr #1        @// Chroma width  = u2_width / 2
+    mov           r4, r4, lsr #1        @// Chroma height = u2_height / 2
+
+yuv420sp_uv_row_loop_uv:
+    mov           r6, r5                @// r6 = U (and V) bytes left in row
+
+yuv420sp_uv_col_loop_uv:
+    pld           [r1, #128]
+    pld           [r2, #128]
+    vld1.8        {d0}, [r1]!           @// 8 bytes of U
+    vld1.8        {d1}, [r2]!           @// 8 bytes of V
+    vst2.8        {d0, d1}, [r3]!       @// 16 bytes: U0 V0 U1 V1 ...
+    sub           r6, r6, #8
+    cmp           r6, #7
+    bgt           yuv420sp_uv_col_loop_uv
+
+    cmp           r6, #0
+    beq           yuv420sp_uv_row_loop_end_uv
+    @// Chroma width is not a multiple of 8: step the U and V pointers back
+    @// so the last 8 bytes of each row are read again, and step the
+    @// destination back by twice as much because every U,V pair occupies
+    @// two output bytes.
+    rsb           r6, r6, #8            @// r6 = 8 - bytes left
+    sub           r1, r1, r6
+    sub           r2, r2, r6
+    sub           r3, r3, r6, lsl #1
+
+    vld1.8        {d0}, [r1]!
+    vld1.8        {d1}, [r2]!
+    vst2.8        {d0, d1}, [r3]!
+
+yuv420sp_uv_row_loop_end_uv:
+    add           r1, r1, r7            @// Next U row
+    add           r2, r2, r9            @// Next V row
+    add           r3, r3, r8            @// Next destination row
+    subs          r4, r4, #1
+    bgt           yuv420sp_uv_row_loop_uv
+
+    @// Restore the registers and return
+    ldmfd         sp!, {r4-r12, pc}
+
+
+
+
+
+@ /**
+@ *******************************************************************************
+@ *
+@ * @brief ih264e_fmt_conv_422i_to_420sp_a9q
+@ *  Converts an interleaved 4:2:2 image into a Y plane plus one plane of
+@ *  interleaved chroma pairs at half the vertical chroma resolution.
+@ *
+@ * @par Description
+@ *  The outer loop handles two source rows per pass, the inner loop 16 pixels
+@ *  (32 source bytes of each row) per pass. For every group of four source
+@ *  bytes, with s0[] taken from the upper row and s1[] from the lower row:
+@ *
+@ *      y_upper[0] = s0[1];    y_upper[1] = s0[3];
+@ *      y_lower[0] = s1[1];    y_lower[1] = s1[3];
+@ *      uv[0]      = (s0[0] + s1[0] + 1) >> 1;
+@ *      uv[1]      = (s0[2] + s1[2] + 1) >> 1;
+@ *
+@ *  y_upper is written at pu1_y, y_lower at pu1_y + u4_stride_y and uv at
+@ *  pu1_u.
+@ *  NOTE(review): this byte order corresponds to a U,Y,V,Y source layout -
+@ *  confirm against the caller.
+@ *
+@ *  After each pair of rows the pointers are advanced by:
+@ *      pu1_y, pu1_y_nxt_row      : 2 * u4_stride_y - u4_width bytes
+@ *      pu1_u                     : u4_stride_u - u4_width bytes
+@ *      pu2_yuv422i (both rows)   : 4 * u4_stride_yuv422i - 2 * u4_width bytes
+@ *  on top of what the post-incrementing loads/stores already consumed.
+@ *
+@ * Inputs : r0        - pu1_y             - UWORD8 pointer to the Y plane (output).
+@ *          r1        - pu1_u             - UWORD8 pointer to the interleaved
+@ *                                          chroma plane (output).
+@ *          r2        - pu1_v             - UWORD8 pointer to the V plane; never
+@ *                                          read or written by this routine.
+@ *          r3        - pu2_yuv422i       - UWORD16 pointer to the 422i image (input).
+@ *          [sp, #40] - u4_width          - Width of the Y plane.
+@ *          [sp, #44] - u4_height         - Height of the Y plane.
+@ *          [sp, #48] - u4_stride_y       - Stride of the Y plane in bytes.
+@ *          [sp, #52] - u4_stride_u       - Stride of the chroma plane in bytes.
+@ *          [sp, #56] - u4_stride_v       - Not loaded by this routine.
+@ *          [sp, #60] - u4_stride_yuv422i - Stride of the 422i image in pixels
+@ *                                          (two bytes per pixel).
+@ *  Stack offsets are relative to sp after r4-r12 and lr have been pushed.
+@ *
+@ * Registers : r0, r1, r3-r9, r11, r12, r14 and d0-d7 are written; r4-r12 are
+@ *             restored on exit. r2 and r10 are not used.
+@ *
+@ * @returns None
+@ *
+@ * @remarks
+@ *  Width should be a multiple of 16 and height a multiple of 2.
+@ *  Both loops test their counter after the body, so each body runs at
+@ *  least once. d8-d15 are saved and restored around the loops although
+@ *  only d0-d7 are used inside them.
+@ *
+@ * Revision History :
+@ *         DD MM YYYY   Author(s)           Changes (Describe the changes made)
+@ *         07 06 2010   Harinarayanan K K   Adapted to 422p
+@ *
+@ *******************************************************************************
+@ */
+
+    .global ih264e_fmt_conv_422i_to_420sp_a9q
+
+ih264e_fmt_conv_422i_to_420sp_a9q:
+    @// Prologue: 10 words = 40 bytes, so stack arguments start at [sp, #40]
+    stmfd         sp!, {r4-r12, lr}
+
+    @// Fetch the stack arguments (u4_stride_v at [sp, #56] is not needed)
+    ldr           r7, [sp, #40]         @// r7  = u4_width
+    ldr           r11, [sp, #44]        @// r11 = u4_height
+    ldr           r4, [sp, #48]         @// r4  = u4_stride_y
+    ldr           r9, [sp, #52]         @// r9  = u4_stride_u
+    ldr           r5, [sp, #60]         @// r5  = u4_stride_yuv422i
+
+    @// Pointers to the second row of the first row pair
+    add           r6, r0, r4            @// pu1_y_nxt_row = pu1_y + u4_stride_y
+    add           r8, r3, r5, lsl #1    @// pu2_yuv422i_nxt_row = pu2_yuv422i + 2 * u4_stride_yuv422i bytes
+
+    @// Chroma output: one row of u4_width bytes is written per row pair
+    sub           r9, r9, r7            @// u2_offset2 = u4_stride_u - u4_width
+
+    @// Luma output: skip the row padding plus one complete row
+    sub           r12, r4, r7           @// r12 = u4_stride_y - u4_width
+    add           r4, r4, r12           @// u2_offset1 = 2 * u4_stride_y - u4_width
+
+    @// 422i input, in bytes: skip the row padding plus one complete row
+    sub           r14, r5, r7           @// r14 = u4_stride_yuv422i - u4_width
+    mov           r14, r14, lsl #1      @// r14 = 2 * r14 (two bytes per pixel)
+    add           r5, r14, r5, lsl #1   @// u2_offset_yuv422i = r14 + 2 * u4_stride_yuv422i
+
+    @// Loop trip counts
+    mov           r7, r7, asr #4        @// r7  = u4_width / 16
+    mov           r11, r11, asr #1      @// r11 = u4_height / 2
+
+    @// NOTE(review): only d0-d7 are used below; d8-d15 are saved regardless
+    vpush         {d8-d15}
+
+@// Register assignment inside the loops
+@//  r0  - pu1_y
+@//  r6  - pu1_y_nxt_row
+@//  r1  - pu1_u
+@//  r2  - pu1_v (unused)
+@//  r3  - pu2_yuv422i
+@//  r8  - pu2_yuv422i_nxt_row
+@//  r4  - u2_offset1
+@//  r9  - u2_offset2
+@//  r10 - unused
+@//  r5  - u2_offset_yuv422i
+@//  r7  - u4_width / 16
+@//  r11 - u4_height / 2 (rows pairs left)
+@//  r12 - inner loop count
+fmt_conv_422i_row_pair_loop:
+
+    mov           r12, r7               @// Inner loop count = u4_width / 16
+
+fmt_conv_422i_col_loop:
+    @// VLD4 splits each 32-byte run into four 8-byte registers:
+    @// d0/d4 = byte 0, d1/d5 = byte 1, d2/d6 = byte 2, d3/d7 = byte 3 of every group
+    vld4.8        {d0-d3}, [r3]!        @// 16 pixels of the upper row
+    vld4.8        {d4-d7}, [r8]!        @// 16 pixels of the lower row
+
+    @// VST2 re-interleaves bytes 1 and 3 into 16 consecutive luma bytes
+    vst2.8        {d1, d3}, [r0]!       @// Luma of the upper row
+    vst2.8        {d5, d7}, [r6]!       @// Luma of the lower row
+
+    @// Chroma: rounded average of the two rows, stored as interleaved pairs
+    vrhadd.u8     d0, d0, d4            @// (byte 0 upper + byte 0 lower + 1) >> 1
+    vrhadd.u8     d2, d2, d6            @// (byte 2 upper + byte 2 lower + 1) >> 1
+    vst2.8        {d0, d2}, [r1]!       @// 8 chroma pairs = 16 bytes
+
+    subs          r12, r12, #1
+    bgt           fmt_conv_422i_col_loop
+
+    @// Advance every pointer to the next pair of rows
+    add           r0, r0, r4            @// pu1_y               += u2_offset1
+    add           r6, r6, r4            @// pu1_y_nxt_row       += u2_offset1
+    add           r1, r1, r9            @// pu1_u               += u2_offset2
+    add           r3, r3, r5            @// pu2_yuv422i         += u2_offset_yuv422i
+    add           r8, r8, r5            @// pu2_yuv422i_nxt_row += u2_offset_yuv422i
+
+    subs          r11, r11, #1
+    bgt           fmt_conv_422i_row_pair_loop
+
+    @// Epilogue: undo the NEON save, then restore the core registers and return
+    vpop          {d8-d15}
+    ldmfd         sp!, {r4-r12, pc}
+
+
+