///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//* //file
//*  ihevcd_fmt_conv_420sp_to_420sp.s
//*
//* //brief
//*  contains function definitions for format conversions
//*
//* //author
//*  ittiam
//*
//* //par list of functions:
//*
//*
//* //remarks
//*  none
//*
//*******************************************************************************/
    // Assemble-time constant. Nothing in the visible code of this file refers
    // to it.
    .equ DO1STROUNDING, 0

    // The two names below exist only as comments here; they are not active
    // directives in this file (apparently leftovers from an earlier port).
    // ARM
    //
    // PRESERVE8

// Emit the code into the .text section, aligned to 2^2 = 4 bytes.
.text
.p2align 2

// push_v_regs / pop_v_regs, used by the routine in this file, are not defined
// here, so they presumably come from this include -- confirm in that file.
.include "ihevc_neon_macros.s"


///*****************************************************************************
//*                                                                            *
//*  Function Name    : ihevcd_fmt_conv_420sp_to_420sp()                       *
//*                                                                            *
//*  Description      : This function converts the image from YUV420SP color   *
//*                     space to 420SP color space (UV interleaved).           *
//*                                                                            *
//*  Arguments        : x0           pu1_y                                     *
//*                     x1           pu1_uv                                    *
//*                     x2           pu1_dest_y                                *
//*                     x3           pu1_dest_uv                               *
//*                     x4           u2_width                                  *
//*                     x5           u2_height                                 *
//*                     x6           u2_stridey                                *
//*                     x7           u2_stridechroma                           *
//*                     [sp, #0]     u2_dest_stridey      (sp at entry)        *
//*                     [sp, #8]     u2_dest_stridechroma (sp at entry)        *
//*                                                                            *
//*  Values Returned  : None                                                   *
//*                                                                            *
//*  Register Usage   : x0 - x11, x20, v0 - v3                                 *
//*                                                                            *
//*  Stack Usage      : 80 Bytes (x19/x20 pair + push_v_regs save area)        *
//*                                                                            *
//*  Interruptibility : Interruptible                                          *
//*                                                                            *
//*  Known Limitations                                                         *
//*       Assumptions: Image Width:     Assumed to be a multiple of 2 and      *
//*                                     at least 32.                           *
//*                     Image Height:    Assumed to be even.                   *
//*                                                                            *
//*  Revision History :                                                        *
//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
//*         16 05 2012   Naveen SR     draft                                     *
//*                                                                            *
//*****************************************************************************/

    .global ihevcd_fmt_conv_420sp_to_420sp_av8
.type ihevcd_fmt_conv_420sp_to_420sp_av8, %function
ihevcd_fmt_conv_420sp_to_420sp_av8:

    //// Copies the Y plane (u2_height rows) and then the interleaved UV plane
    //// (u2_height / 2 rows), u2_width bytes per row, from the source buffers
    //// to the destination buffers, honouring separate source/destination
    //// strides for each plane.
    ////
    //// Register roles inside this routine:
    ////   x0 / x1    source Y / UV read pointers (post-incremented)
    ////   x2         destination write pointer (Y plane, then UV plane)
    ////   x3         pu1_dest_uv, copied into x2 before the UV pass
    ////   x7         u2_stridechroma, left untouched until the UV pass
    ////   x8 / x9    width in bytes / number of rows
    ////   x5         destination stride of the plane being copied
    ////   x10 / x11  bytes skipped after each row (stride - width), src / dst
    ////   x4 / x6    rows left / bytes left in the current row
    ////   x20        scratch for the tail rewind
    ////   v0 - v3    copy data
    ////
    //// Both column loops copy one full group of vectors before the remaining
    //// count is tested, so the width must be at least 32 bytes; a smaller
    //// width would copy past the row and rewind before its start.

    //// Save area: the vector registers handled by push_v_regs, then x19/x20
    //// (x20 is used as scratch below).
    push_v_regs
    stp         x19, x20,[sp,#-16]!

    //// The stack arguments below are read as 32-bit values and sign-extended,
    //// so the register arguments are widened the same way instead of relying
    //// on the upper halves of x4 - x7.
    sxtw        x8, w4                      ////u2_width
    sxtw        x9, w5                      ////u2_height (read before w5 is reused)

    //// Offset 80 = 16 bytes for x19/x20 plus the push_v_regs area; this
    //// assumes the macro moved sp by 64 bytes (macro body not visible here).
    LDR         w5, [sp,#80]                ////Load u2_dest_stridey
    sxtw        x5,w5

    //// u2_stridey is only needed for the row increment, so it is consumed
    //// straight into x10. x7 (u2_stridechroma) must stay intact for the UV pass.
    sxtw        x10, w6                     ////u2_stridey
    SUB         x10,x10,x8                  //// Src Y increment
    SUB         x11,x5,x8                   //// Dst Y increment

    ///* Copy Y */

    MOV         x4,x9                       //// Rows left to copy
y_row_loop:
    MOV         x6,x8                       //// Bytes left in this row

y_col_loop:
    prfm        PLDL1KEEP,[x0, #128]
    SUB         x6,x6,#32
    LD1         {v0.8b},[x0],#8
    LD1         {v1.8b},[x0],#8
    LD1         {v2.8b},[x0],#8
    LD1         {v3.8b},[x0],#8
    ST1         {v0.8b},[x2],#8
    ST1         {v1.8b},[x2],#8
    ST1         {v2.8b},[x2],#8
    ST1         {v3.8b},[x2],#8
    CMP         x6,#32
    BGE         y_col_loop                  //// Another full 32-byte group fits
    CMP         x6,#0
    BEQ         y_col_loop_end              //// Row length was a multiple of 32
    ////Width is not a multiple of 32: x6 (1..31) bytes of this row remain.
    ////Step both pointers back by 32 - x6 so that one more 32-byte copy ends
    ////exactly at the end of the row, re-copying bytes already written.
    ////Ex if width is 162, above loop will process 160 pixels. Both source and
    ////destination are moved back by 30 to the 130th pixel and 32 bytes are
    ////read and written.
    sub         x20,x6,#32
    neg         x6, x20                     //// x6 = 32 - remaining bytes
    SUB         x0,x0,x6
    SUB         x2,x2,x6
    LD1         {v0.8b},[x0],#8
    LD1         {v1.8b},[x0],#8
    LD1         {v2.8b},[x0],#8
    LD1         {v3.8b},[x0],#8
    ST1         {v0.8b},[x2],#8
    ST1         {v1.8b},[x2],#8
    ST1         {v2.8b},[x2],#8
    ST1         {v3.8b},[x2],#8

y_col_loop_end:
    //// Pointers sit at row start + width; skip the stride padding.
    ADD         x0, x0, x10
    ADD         x2, x2, x11
    SUBS        x4, x4, #1
    BGT         y_row_loop


    ///* Copy UV */

    LDR         w5, [sp,#88]                ////Load u2_dest_stridechroma
    sxtw        x5,w5
    sxtw        x7, w7                      ////u2_stridechroma

    LSR         x9, x9, #1                  //// height/2
    //// x8 is deliberately not halved: each UV row is copied with the same
    //// byte count as a Y row.

    MOV         x2,x3                       //pu1_dest_uv

    SUB         x10,x7,x8                   //// Src UV increment
    SUB         x11,x5,x8                   //// Dst UV increment

    MOV         x4,x9                       //// Rows left to copy
uv_row_loop:
    MOV         x6,x8                       //// Bytes left in this row

uv_col_loop:

    prfm        PLDL1KEEP,[x1, #128]
    SUB         x6,x6,#16
    LD1         {v0.8b},[x1],#8
    LD1         {v1.8b},[x1],#8
    ST1         {v0.8b},[x2],#8
    ST1         {v1.8b},[x2],#8
    CMP         x6,#16
    BGE         uv_col_loop                 //// Another full 16-byte group fits
    CMP         x6,#0
    BEQ         u_col_loop_end              //// Row length was a multiple of 16
    ////Width is not a multiple of 16: x6 (1..15) bytes of this row remain.
    ////Step both pointers back by 16 - x6 so that one more 16-byte copy ends
    ////exactly at the end of the row.
    ////Ex if width is 162, above loop will process 160 pixels. Both source and
    ////destination are moved back by 14 to the 146th pixel and 16 bytes are
    ////read and written.
    sub         x20,x6,#16
    neg         x6, x20                     //// x6 = 16 - remaining bytes
    SUB         x1,x1,x6
    SUB         x2,x2,x6
    LD1         {v0.8b},[x1],#8
    LD1         {v1.8b},[x1],#8
    ST1         {v0.8b},[x2],#8
    ST1         {v1.8b},[x2],#8

u_col_loop_end:
    //// Pointers sit at row start + width; skip the stride padding.
    ADD         x1, x1, x10
    ADD         x2, x2, x11
    SUBS        x4, x4, #1
    BGT         uv_row_loop

exit:
    //// Undo the prologue in reverse order and return.
    ldp         x19, x20,[sp],#16
    pop_v_regs
    ret

.size ihevcd_fmt_conv_420sp_to_420sp_av8, .-ihevcd_fmt_conv_420sp_to_420sp_av8


    // Empty .note.GNU-stack section: the GNU toolchain convention for declaring
    // that this object does not need an executable stack.
    .section .note.GNU-stack,"",%progbits