1 files changed, 203 insertions, 0 deletions
diff --git a/decoder/arm/ihevcd_fmt_conv_420sp_to_420p.s b/decoder/arm/ihevcd_fmt_conv_420sp_to_420p.s
new file mode 100644
index 0000000..c1d09ed
--- /dev/null
+++ b/decoder/arm/ihevcd_fmt_conv_420sp_to_420p.s
@@ -0,0 +1,203 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@/*******************************************************************************
+@* @file
+@*  ihevcd_fmt_conv_420sp_to_420p.s
+@*
+@* @brief
+@*  contains function definitions for format conversions
+@*
+@* @author
+@*  ittiam
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@*  none
+@*
+@*******************************************************************************/
+
+
+
+
+
+
+
+
+.text
+
+
+
+
+
+@/*****************************************************************************
+@*                                                                            *
+@*  Function Name    : ihevcd_fmt_conv_420sp_to_420p_a9q()                    *
+@*                                                                            *
+@*  Description      : This function converts the image from YUV420SP color   *
+@*                     space (UV interleaved) to YUV420P (separate U and V).  *
+@*                                                                            *
+@*  Arguments        : R0           pu1_src_y                                 *
+@*                     R1           pu1_src_uv                                *
+@*                     R2           pu1_dest_y                                *
+@*                     R3           pu1_dest_u                               *
+@*                     [R13 #40]    pu1_dest_v                               *
+@*                     [R13 #44]    u2_width                                 *
+@*                     [R13 #48]    u2_height                                   *
+@*                     [R13 #52]    u2_stridey                                *
+@*                     [R13 #56]    u2_strideuv                               *
+@*                     [R13 #60]    u2_dest_stridey                           *
+@*                     [R13 #64]    u2_dest_strideuv                          *
+@*                     [R13 #68]    is_u_first                                *
+@*                     [R13 #72]    disable_luma_copy                         *
+@*                                                                            *
+@*  Values Returned  : None                                                   *
+@*                                                                            *
+@*  Register Usage   : R0 - R14                                               *
+@*                                                                            *
+@*  Stack Usage      : 40 Bytes                                               *
+@*                                                                            *
+@*  Interruptibility : Interruptible                                          *
+@*                                                                            *
+@*  Known Limitations                                                         *
+@*       Assumptions: Image Width:     Assumed to be multiple of 2 and       *
+@*                     Image Height:    Assumed to be even.                   *
+@*                                                                            *
+@*  Revision History :                                                        *
+@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
+@*         16 05 2012   Naveen SR     draft                                   *
+@*                                                                            *
+@*****************************************************************************/
+
+.globl ihevcd_fmt_conv_420sp_to_420p_a9q
+.type ihevcd_fmt_conv_420sp_to_420p_a9q, %function
+
+@// Copies the Y plane (skipped when disable_luma_copy is non-zero), then splits
+@// the interleaved UV plane into the two planar chroma destinations. Rows move
+@// 16 bytes per NEON step; rows under 16 bytes use the scalar loops after exit.
+@// Nothing is copied when u2_width or u2_height is <= 0.
+ihevcd_fmt_conv_420sp_to_420p_a9q:
+    STMFD       sp!,{r4-r12, lr}            @// 40 bytes pushed, so stack args start at [sp,#40]
+    LDR         r5,[sp,#60]                 @//Load u2_dest_stridey
+    LDR         r7,[sp,#52]                 @//Load u2_stridey
+    LDR         r8,[sp,#44]                 @//Load u2_width
+    LDR         r9,[sp,#48]                 @//Load u2_height
+    CMP         r8,#0                       @// The loops below test their counters only after
+    BLE         exit                        @// a full pass, so an empty picture must leave here
+    CMP         r9,#0
+    BLE         exit
+    SUB         r10,r7,r8                   @// Src Y increment (stride - width)
+    SUB         r11,r5,r8                   @// Dst Y increment (stride - width)
+    LDR         r5,[sp,#72]                 @//Load disable_luma_copy flag
+    CMP         r5,#0                       @//skip luma if disable_luma_copy is non-zero
+    BNE         uv_copy_start
+
+    MOV         r4,r9                       @// Copy Y: r4 = rows left
+y_row_loop:
+    MOV         r6,r8                       @// r6 = bytes left in this row
+    CMP         r6,#16
+    BLT         y_narrow_row                @// a 16-byte access would leave the row
+y_col_loop:
+    SUB         r6,r6,#16
+    vld1.8      {d0,d1},[r0]!
+    vst1.8      {d0,d1},[r2]!
+    CMP         r6,#16
+    BGE         y_col_loop
+    CMP         r6,#0
+    BEQ         y_col_loop_end
+    @//Width not a multiple of 16: move both pointers back by (16 - remainder) so one
+    @//more 16-byte copy ends on the row's last pixel (width 162: redo from pixel 146)
+    RSB         r6,r6,#16
+    SUB         r0,r0,r6
+    SUB         r2,r2,r6
+    vld1.8      {d0,d1}, [r0]!
+    vst1.8      {d0,d1}, [r2]!
+y_col_loop_end:
+    ADD         r0, r0, r10                 @// step to the start of the next row
+    ADD         r2, r2, r11
+    SUBS        r4, r4, #1
+    BGT         y_row_loop
+
+uv_copy_start:                              @// Copy UV
+    LDR         r5,[sp,#64]                 @//Load u2_dest_strideuv
+    LDR         r7,[sp,#56]                 @//Load u2_strideuv
+    MOVS        r9,r9,LSR #1                @// height/2 chroma rows
+    BEQ         exit                        @// height 1 has no chroma row to copy
+    SUB         r10,r7,r8                   @// Src UV increment (a UV row is width bytes)
+    MOV         r11,r8,LSR #1
+    SUB         r11,r5,r11                  @// Dst U and V increment (stride - width/2)
+    LDR         r5,[sp,#40]                 @//Load pu1_dest_v
+    LDR         r4,[sp,#68]                 @//Load is_u_first_flag
+    CMP         r4,#0                       @//Swap U and V dest if is_u_first_flag is zero
+    MOVEQ       r4,r5
+    MOVEQ       r5,r3
+    MOVEQ       r3,r4
+    MOV         r4,r9                       @// r4 = UV rows left
+uv_row_loop:
+    MOV         r6,r8                       @// r6 = bytes left in this UV row
+    CMP         r6,#16
+    BLT         uv_narrow_row               @// a 16-byte access would leave the row
+uv_col_loop:
+    SUB         r6,r6,#16
+    PLD         [r1,#128]
+    vld2.8      {d0,d1},[r1]!               @// de-interleave: d0 = even bytes, d1 = odd bytes
+    vst1.8      {d0},[r3]!
+    vst1.8      {d1},[r5]!
+    CMP         r6,#16
+    BGE         uv_col_loop
+    CMP         r6,#0
+    BEQ         uv_col_loop_end
+    RSB         r6,r6,#16                   @// same step-back as for Y: source by
+    SUB         r1,r1,r6                    @// (16 - remainder) bytes, each destination
+    SUB         r3,r3,r6,LSR #1             @// by half of that, then one more
+    SUB         r5,r5,r6,LSR #1             @// VLD2/VST1 pass
+    vld2.8      {d0,d1}, [r1]!
+    vst1.8      {d0}, [r3]!
+    vst1.8      {d1}, [r5]!
+uv_col_loop_end:
+    ADD         r1, r1, r10
+    ADD         r3, r3, r11
+    ADD         r5, r5, r11
+    SUBS        r4, r4, #1
+    BGT         uv_row_loop
+
+exit:
+    LDMFD       sp!,{r4-r12, pc}
+
+y_narrow_row:                               @// width < 16: copy the Y row byte by byte
+    LDRB        r12,[r0],#1
+    STRB        r12,[r2],#1
+    SUBS        r6,r6,#1
+    BGT         y_narrow_row
+    B           y_col_loop_end
+uv_narrow_row:                              @// width < 16: split one byte pair per pass
+    LDRB        r7,[r1],#1                  @// even byte goes to the r3 plane
+    LDRB        r12,[r1],#1                 @// odd byte goes to the r5 plane
+    STRB        r7,[r3],#1
+    STRB        r12,[r5],#1
+    SUBS        r6,r6,#2
+    BGT         uv_narrow_row
+    B           uv_col_loop_end
+
+
+
+
+
+