//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
// *******************************************************************************
// * @file
// *  ih264_padding_neon.s
// *
// * @brief
// *  Contains function definitions padding
// *
// * @author
// *     Ittiam
// *
// * @par List of Functions:
// *  - ih264_pad_top_av8()
// *  - ih264_pad_left_luma_av8()
// *  - ih264_pad_left_chroma_av8()
// *  - ih264_pad_right_luma_av8()
// *  - ih264_pad_right_chroma_av8()
// *
// * @remarks
// *  None
// *
// *******************************************************************************
//*/

.text
.p2align 2
.include "ih264_neon_macros.s"
///**
//*******************************************************************************
//*
//* @brief pad at the top of a 2d array
//*
//* @par Description:
//*  The top row of a 2d array is replicated for pad_size times at the top
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @param[in] pad_size
//*  integer padding size: number of rows to fill above the top row
//*
//* @returns none
//*
//* @remarks none
//*
//*******************************************************************************
//*/
//void ih264_pad_top(UWORD8 *pu1_src,
//                   WORD32 src_strd,
//                   WORD32 wd,
//                   WORD32 pad_size)
//**************Variables Vs Registers*************************
//    x0 => *pu1_src
//    x1 => src_strd
//    x2 => wd
//    x3 => pad_size

    .global ih264_pad_top_av8

//------------------------------------------------------------------------------
// void ih264_pad_top_av8(UWORD8 *pu1_src, WORD32 src_strd,
//                        WORD32 wd, WORD32 pad_size)
//
// Copies the row that starts at pu1_src into each of the pad_size rows
// directly above it.  The row is processed in 16-byte columns: wd is expected
// to be a positive multiple of 16 (a partial last column is rounded up to a
// full 16 bytes).  Returns without writing when wd <= 0 or pad_size <= 0.
//
// In:    x0 = pu1_src, w1 = src_strd, w2 = wd, w3 = pad_size
// Uses:  x4 = store cursor
//        x5 = first destination (row -1) of the current column
//        x6 = -src_strd (store post-increment, walks upwards)
//        w7 = rows still to write in the current column
//        v0 = 16 source bytes of the current column
//------------------------------------------------------------------------------
ih264_pad_top_av8:

    // push_v_regs / pop_v_regs come from ih264_neon_macros.s; presumably they
    // save/restore callee-saved SIMD registers -- confirm against that file.
    push_v_regs
    // x19/x20 are not modified below; the save/restore pair is retained so
    // the stack frame stays exactly as before.
    stp       x19, x20, [sp, #-16]!

    // The int arguments are 32-bit: the upper halves of x1..x3 are not
    // guaranteed by the procedure call standard.  Widen the stride before
    // using it in address arithmetic and keep the counters in w registers.
    sxtw      x1, w1
    cmp       w2, #0
    ble       end_func_pad_top          // nothing to copy
    cmp       w3, #0
    ble       end_func_pad_top          // no rows to fill

    sub       x5, x0, x1                // row -1, column 0
    neg       x6, x1                    // step one row up per store

loop_neon_memcpy_mul_16:
    ld1       {v0.16b}, [x0], #16       // next 16 source bytes
    mov       x4, x5                    // restart at row -1 of this column
    mov       w7, w3                    // pad_size rows to write
    add       x5, x5, #16               // row -1 of the next column

loop_neon_pad_top:
    st1       {v0.16b}, [x4], x6        // write, then move one row up
    subs      w7, w7, #1
    bgt       loop_neon_pad_top

    subs      w2, w2, #16
    bgt       loop_neon_memcpy_mul_16

end_func_pad_top:
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret


///**
//*******************************************************************************
//*
//* @brief
//*   Padding (luma block) at the left of a 2d array
//*
//* @par Description:
//*   The left column of a 2d array is replicated for pad_size times at the left
//*
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] pad_size
//*  integer padding size of the array, in bytes
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//#if PAD_LEFT_LUMA == C
//void ih264_pad_left_luma(UWORD8 *pu1_src,
//                        WORD32 src_strd,
//                        WORD32 ht,
//                        WORD32 pad_size)
//**************Variables Vs Registers*************************
//    x0 => *pu1_src
//    x1 => src_strd
//    x2 => ht
//    x3 => pad_size


    .global ih264_pad_left_luma_av8

//------------------------------------------------------------------------------
// void ih264_pad_left_luma_av8(UWORD8 *pu1_src, WORD32 src_strd,
//                              WORD32 ht, WORD32 pad_size)
//
// For every row, replicates the byte at pu1_src into the bytes that start at
// pu1_src - pad_size.  pad_size == 16 writes 16 bytes per row; any other
// value takes the 32-byte path.  Rows are handled in groups of 8, so ht is
// expected to be a positive multiple of 8 (a partial last group is rounded
// up).  Returns without writing when ht <= 0.
// Rows are assumed not to overlap (src_strd larger than the fill width):
// four source bytes are fetched before their four rows are written.
//
// In:    x0 = pu1_src, w1 = src_strd, w2 = ht, w3 = pad_size
// Uses:  x0 = source cursor (one byte per row)
//        x4 = destination cursor
//        x6 = src_strd - 16 (second-half step on the 32-byte path)
//        v0-v3 = replicated source byte of four consecutive rows
//------------------------------------------------------------------------------
ih264_pad_left_luma_av8:

    // push_v_regs / pop_v_regs come from ih264_neon_macros.s; presumably they
    // save/restore callee-saved SIMD registers -- confirm against that file.
    push_v_regs
    // x19/x20 are not modified below; the pair is kept so the frame released
    // at end_func is unchanged.
    stp       x19, x20, [sp, #-16]!

    // The int arguments are 32-bit: the upper halves of x1..x3 are not
    // guaranteed by the procedure call standard, so widen before 64-bit use.
    sxtw      x1, w1
    sxtw      x3, w3
    cmp       w2, #0
    ble       end_func                  // no rows

    sub       x4, x0, x3                // first pad byte of row 0
    sub       x6, x1, #16               // 16 + x6 == src_strd
    cmp       w3, #16
    bne       loop_32

loop_16:                                // 16-byte fill, 8 rows per pass
    .rept 2
    ld1r      {v0.16b}, [x0], x1        // load one byte, splat to 16 lanes
    ld1r      {v1.16b}, [x0], x1
    ld1r      {v2.16b}, [x0], x1
    ld1r      {v3.16b}, [x0], x1
    st1       {v0.16b}, [x4], x1
    st1       {v1.16b}, [x4], x1
    st1       {v2.16b}, [x4], x1
    st1       {v3.16b}, [x4], x1
    .endr
    subs      w2, w2, #8
    bgt       loop_16
    b         end_func

// Entry contract for loop_32 (relied on by every branch to this label):
//   x0 = source byte of the first row, x1 = row stride, w2 = rows left,
//   x4 = destination of the first row, x6 = x1 - 16.
// Falls through to end_func, which releases the x19/x20 + push_v_regs frame.
loop_32:                                // 32-byte fill, 8 rows per pass
    .rept 2
    ld1r      {v0.16b}, [x0], x1
    ld1r      {v1.16b}, [x0], x1
    ld1r      {v2.16b}, [x0], x1
    ld1r      {v3.16b}, [x0], x1
    st1       {v0.16b}, [x4], #16       // first 16 bytes of the row
    st1       {v0.16b}, [x4], x6        // last 16 bytes, then next row
    st1       {v1.16b}, [x4], #16
    st1       {v1.16b}, [x4], x6
    st1       {v2.16b}, [x4], #16
    st1       {v2.16b}, [x4], x6
    st1       {v3.16b}, [x4], #16
    st1       {v3.16b}, [x4], x6
    .endr
    subs      w2, w2, #8
    bgt       loop_32


end_func:
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret


///**
//*******************************************************************************
//*
//* @brief
//*   Padding (chroma block) at the left of a 2d array
//*
//* @par Description:
//*   The left column of a 2d array is replicated for pad_size times at the left
//*
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] pad_size
//*  integer padding size of the array, in bytes
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//#if PAD_LEFT_CHROMA == C
//void ih264_pad_left_chroma(UWORD8 *pu1_src,
//                            WORD32 src_strd,
//                            WORD32 ht,
//                            WORD32 pad_size)
//{
//    x0 => *pu1_src
//    x1 => src_strd
//    x2 => ht
//    x3 => pad_size


    .global ih264_pad_left_chroma_av8

//------------------------------------------------------------------------------
// void ih264_pad_left_chroma_av8(UWORD8 *pu1_src, WORD32 src_strd,
//                                WORD32 ht, WORD32 pad_size)
//
// For every row, replicates the 16-bit value at pu1_src (presumably one
// interleaved chroma pair -- confirm against the caller) into 32 bytes that
// start at pu1_src - pad_size.  The fill width is fixed at 32 bytes; pad_size
// only positions the start of the fill.  Rows are handled in groups of 4, so
// ht is expected to be a positive multiple of 4 (a partial last group is
// rounded up).  Returns without writing when ht <= 0.
// Rows are assumed not to overlap (src_strd larger than the fill width):
// four source values are fetched before their four rows are written.
//
// In:    x0 = pu1_src, w1 = src_strd, w2 = ht, w3 = pad_size
// Uses:  x0 = source cursor (one halfword per row)
//        x4 = destination cursor
//        x6 = src_strd - 16 (step after the second 16-byte store)
//        v0-v3 = replicated source halfword of four consecutive rows
//------------------------------------------------------------------------------
ih264_pad_left_chroma_av8:

    // push_v_regs / pop_v_regs come from ih264_neon_macros.s; presumably they
    // save/restore callee-saved SIMD registers -- confirm against that file.
    push_v_regs
    // x19/x20 are not modified below; the pair is kept so the frame layout
    // is unchanged.
    stp       x19, x20, [sp, #-16]!

    // The int arguments are 32-bit: the upper halves of x1..x3 are not
    // guaranteed by the procedure call standard, so widen before 64-bit use.
    sxtw      x1, w1
    sxtw      x3, w3
    cmp       w2, #0
    ble       end_func_l_c              // no rows

    sub       x4, x0, x3                // first pad byte of row 0
    sub       x6, x1, #16               // 16 + x6 == src_strd

loop_32_l_c:                            // 32-byte fill, 4 rows per pass
    ld1r      {v0.8h}, [x0], x1         // load one halfword, splat to 8 lanes
    ld1r      {v1.8h}, [x0], x1
    ld1r      {v2.8h}, [x0], x1
    ld1r      {v3.8h}, [x0], x1
    st1       {v0.16b}, [x4], #16       // first 16 bytes of the row
    st1       {v0.16b}, [x4], x6        // last 16 bytes, then next row
    st1       {v1.16b}, [x4], #16
    st1       {v1.16b}, [x4], x6
    st1       {v2.16b}, [x4], #16
    st1       {v2.16b}, [x4], x6
    st1       {v3.16b}, [x4], #16
    st1       {v3.16b}, [x4], x6
    subs      w2, w2, #4
    bgt       loop_32_l_c

end_func_l_c:
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret


///**
//*******************************************************************************
//*
//* @brief
//* Padding (luma block) at the right of a 2d array
//*
//* @par Description:
//* The right column of a 2d array is replicated for pad_size times at the right
//*
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] pad_size
//*  integer padding size of the array, in bytes
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//#if PAD_RIGHT_LUMA == C
//void ih264_pad_right_luma(UWORD8 *pu1_src,
//                        WORD32 src_strd,
//                        WORD32 ht,
//                        WORD32 pad_size)
//{
//    WORD32 row;
//
//    for(row = 0; row < ht; row++)
//    {
//        memset(pu1_src, *(pu1_src -1), pad_size);
//
//        pu1_src += src_strd;
//    }
//}
//
//    x0 => *pu1_src
//    x1 => src_strd
//    x2 => ht
//    x3 => pad_size


    .global ih264_pad_right_luma_av8

//------------------------------------------------------------------------------
// void ih264_pad_right_luma_av8(UWORD8 *pu1_src, WORD32 src_strd,
//                               WORD32 ht, WORD32 pad_size)
//
// For every row, replicates the byte at pu1_src - 1 into the bytes that start
// at pu1_src.  pad_size == 16 writes 16 bytes per row; any other value takes
// the 32-byte path.  Rows are handled in groups of 8, so ht is expected to be
// a positive multiple of 8 (a partial last group is rounded up).  Returns
// without writing when ht <= 0.
// Rows are assumed not to overlap (src_strd larger than the fill width):
// four source bytes are fetched before their four rows are written.
//
// In:    x0 = pu1_src, w1 = src_strd, w2 = ht, w3 = pad_size
// Uses:  x0 = source cursor (pu1_src - 1, one byte per row)
//        x4 = destination cursor (pu1_src)
//        x6 = src_strd - 16 (second-half step on the 32-byte path)
//        v0-v3 = replicated source byte of four consecutive rows
//------------------------------------------------------------------------------
ih264_pad_right_luma_av8:

    // push_v_regs / pop_v_regs come from ih264_neon_macros.s; presumably they
    // save/restore callee-saved SIMD registers -- confirm against that file.
    push_v_regs
    // x19/x20 are not modified below; the pair is kept so the frame layout
    // is unchanged.
    stp       x19, x20, [sp, #-16]!

    // The int arguments are 32-bit: the upper halves of x1..x3 are not
    // guaranteed by the procedure call standard, so widen the stride and
    // test the counters through their w views.
    sxtw      x1, w1
    cmp       w2, #0
    ble       end_func_r                // no rows

    mov       x4, x0                    // fill starts at pu1_src
    sub       x6, x1, #16               // 16 + x6 == src_strd
    sub       x0, x0, #1                // last valid byte of the row
    cmp       w3, #16
    bne       loop_32_r                 // stay inside this routine

loop_16_r:                              // 16-byte fill, 8 rows per pass
    .rept 2
    ld1r      {v0.16b}, [x0], x1        // load one byte, splat to 16 lanes
    ld1r      {v1.16b}, [x0], x1
    ld1r      {v2.16b}, [x0], x1
    ld1r      {v3.16b}, [x0], x1
    st1       {v0.16b}, [x4], x1
    st1       {v1.16b}, [x4], x1
    st1       {v2.16b}, [x4], x1
    st1       {v3.16b}, [x4], x1
    .endr
    subs      w2, w2, #8
    bgt       loop_16_r
    b         end_func_r

loop_32_r:                              // 32-byte fill, 8 rows per pass
    .rept 2
    ld1r      {v0.16b}, [x0], x1
    ld1r      {v1.16b}, [x0], x1
    ld1r      {v2.16b}, [x0], x1
    ld1r      {v3.16b}, [x0], x1
    st1       {v0.16b}, [x4], #16       // first 16 bytes of the row
    st1       {v0.16b}, [x4], x6        // last 16 bytes, then next row
    st1       {v1.16b}, [x4], #16
    st1       {v1.16b}, [x4], x6
    st1       {v2.16b}, [x4], #16
    st1       {v2.16b}, [x4], x6
    st1       {v3.16b}, [x4], #16
    st1       {v3.16b}, [x4], x6
    .endr
    subs      w2, w2, #8
    bgt       loop_32_r


end_func_r:
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret


///**
//*******************************************************************************
//*
//* @brief
//* Padding (chroma block) at the right of a 2d array
//*
//* @par Description:
//* The right column of a 2d array is replicated for pad_size times at the right
//*
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] pad_size
//*  integer padding size of the array
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//#if PAD_RIGHT_CHROMA == C
//void ih264_pad_right_chroma(UWORD8 *pu1_src,
//                        WORD32 src_strd,
//                        WORD32 ht,
//                        WORD32 pad_size)
//    x0 => *pu1_src
//    x1 => src_strd
//    x2 => ht
//    x3 => pad_size


    .global ih264_pad_right_chroma_av8

//------------------------------------------------------------------------------
// void ih264_pad_right_chroma_av8(UWORD8 *pu1_src, WORD32 src_strd,
//                                 WORD32 ht, WORD32 pad_size)
//
// For every row, replicates the 16-bit value at pu1_src - 2 (presumably one
// interleaved chroma pair -- confirm against the caller) into the 32 bytes
// that start at pu1_src.  The fill width is fixed at 32 bytes; pad_size is
// not read.  Rows are handled in groups of 4, so ht is expected to be a
// positive multiple of 4 (a partial last group is rounded up).  Returns
// without writing when ht <= 0.
// Rows are assumed not to overlap (src_strd larger than the fill width):
// four source values are fetched before their four rows are written.
//
// In:    x0 = pu1_src, w1 = src_strd, w2 = ht, w3 = pad_size (unused)
// Uses:  x0 = source cursor (pu1_src - 2, one halfword per row)
//        x4 = destination cursor (pu1_src)
//        x6 = src_strd - 16 (step after the second 16-byte store)
//        v0-v3 = replicated source halfword of four consecutive rows
//------------------------------------------------------------------------------
ih264_pad_right_chroma_av8:

    // push_v_regs / pop_v_regs come from ih264_neon_macros.s; presumably they
    // save/restore callee-saved SIMD registers -- confirm against that file.
    push_v_regs
    // x19/x20 are not modified below; the pair is kept so the frame layout
    // is unchanged.
    stp       x19, x20, [sp, #-16]!

    // The int arguments are 32-bit: the upper halves of x1/x2 are not
    // guaranteed by the procedure call standard, so widen the stride and
    // count rows through w2.
    sxtw      x1, w1
    cmp       w2, #0
    ble       end_func_r_c              // no rows

    mov       x4, x0                    // fill starts at pu1_src
    sub       x6, x1, #16               // 16 + x6 == src_strd
    sub       x0, x0, #2                // last valid halfword of the row

loop_32_r_c:                            // 32-byte fill, 4 rows per pass
    ld1r      {v0.8h}, [x0], x1         // load one halfword, splat to 8 lanes
    ld1r      {v1.8h}, [x0], x1
    ld1r      {v2.8h}, [x0], x1
    ld1r      {v3.8h}, [x0], x1
    st1       {v0.16b}, [x4], #16       // first 16 bytes of the row
    st1       {v0.16b}, [x4], x6        // last 16 bytes, then next row
    st1       {v1.16b}, [x4], #16
    st1       {v1.16b}, [x4], x6
    st1       {v2.16b}, [x4], #16
    st1       {v2.16b}, [x4], x6
    st1       {v3.16b}, [x4], #16
    st1       {v3.16b}, [x4], x6
    subs      w2, w2, #4
    bgt       loop_32_r_c

end_func_r_c:
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret