@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@**
@*******************************************************************************
@*
@* @brief
@*     Interprediction luma function for copy
@*
@* @par Description:
@*   Copies the array of width 'wd' and height 'ht' from the  location pointed
@*   by 'src' to the location pointed by 'dst'
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@void ih264_inter_pred_luma_copy (
@                            UWORD8 *pu1_src,
@                            UWORD8 *pu1_dst,
@                            WORD32 src_strd,
@                            WORD32 dst_strd,
@                            WORD32 ht,
@                            WORD32 wd   )

@**************Variables Vs Registers*****************************************
@   r0 => *pu1_src
@   r1 => *pu1_dst
@   r2 =>  src_strd
@   r3 =>  dst_strd
@   r7 =>  ht
@   r12 => wd

.text
.p2align 2

    .global ih264_inter_pred_luma_copy_a9q

@ ih264_inter_pred_luma_copy_a9q
@   Copies a wd x ht block of bytes from pu1_src to pu1_dst.
@
@   In:     r0 = pu1_src, r1 = pu1_dst, r2 = src_strd, r3 = dst_strd,
@           [sp, #0] = ht, [sp, #4] = wd (stack arguments at entry)
@   Out:    none
@   Work:   r4  = bytes still to copy in the current band of rows
@           r5  = source row cursor, r6 = destination row cursor
@           r7  = rows still to copy, r11 = wd - (column width), r12 = wd
@   NEON:   only d0-d7 (q0-q3) are written. They are caller-saved under AAPCS,
@           so d8-d15 are never touched and need no save/restore.
@   Notes:  Rows are copied four at a time and the row counter is stepped by 4,
@           so a height that is not a multiple of 4 is rounded up.
@           wd is dispatched to a 16-, 8- or 4-byte column loop; the row rewind
@           (r11) is exact only when wd is a multiple of the column width.
@           NOTE(review): wd is presumably always a multiple of 4 - confirm
@           against callers.
ih264_inter_pred_luma_copy_a9q:
    stmfd         sp!, {r4-r7, r11, r14} @ save only the callee-saved registers used below (24 bytes)
    ldr           r7, [sp, #24]         @ r7  = ht (first stack argument, just above the save area)
    ldr           r12, [sp, #28]        @ r12 = wd (second stack argument)
    cmp           r7, #0
    ble           end_loops             @ nothing to copy when ht <= 0
    cmp           r12, #0
    ble           end_loops             @ nothing to copy when wd <= 0
    tst           r12, #15
    beq           core_loop_wd_16       @ wd is a multiple of 16: copy 16-byte columns
    tst           r12, #7
    beq           core_loop_wd_8        @ wd is a multiple of 8: copy 8-byte columns
    sub           r11, r12, #4          @ r11 = wd - 4, used to rewind to the start of the row band

@ ---- 4-byte columns: each pass copies a 4 x 4 tile -------------------------
outer_loop_wd_4:
    mov           r4, r12               @ r4 = wd bytes left in this band (wd > 0 is guaranteed above)

inner_loop_wd_4:
    vld1.32       {d0[0]}, [r0]         @ row 0: load 4 bytes
    add           r5, r0, r2            @ r5 = source row 1 of this tile
    add           r6, r1, r3            @ r6 = destination row 1 of this tile
    vst1.32       {d0[0]}, [r1]         @ row 0: store 4 bytes
    vld1.32       {d0[0]}, [r5], r2     @ row 1: load, then step to the next source row
    add           r0, r0, #4            @ pu1_src += 4 (next tile in the band)
    vst1.32       {d0[0]}, [r6], r3     @ row 1: store, then step to the next destination row
    vld1.32       {d0[0]}, [r5], r2     @ row 2
    subs          r4, r4, #4            @ 4 fewer bytes left; flags are consumed by the bgt below
    vst1.32       {d0[0]}, [r6], r3
    vld1.32       {d0[0]}, [r5], r2     @ row 3
    add           r1, r1, #4            @ pu1_dst += 4 (next tile in the band)
    vst1.32       {d0[0]}, [r6], r3

    bgt           inner_loop_wd_4       @ more tiles in this band (NEON/add ops leave the flags intact)

end_inner_loop_wd_4:
    subs          r7, r7, #4            @ ht -= 4
    sub           r0, r5, r11           @ r5 is 4 rows below the last tile; back up wd - 4 to the band start
    sub           r1, r6, r11           @ same rewind for the destination
    bgt           outer_loop_wd_4

end_loops:
    ldmfd         sp!, {r4-r7, r11, r15} @ restore saved registers and return (pc <- saved lr)


@ ---- 8-byte columns: each pass copies an 8 x 4 tile ------------------------
core_loop_wd_8:
    sub           r11, r12, #8          @ r11 = wd - 8, used to rewind to the start of the row band

outer_loop_wd_8:
    mov           r4, r12               @ r4 = wd bytes left in this band

inner_loop_wd_8:
    add           r5, r0, r2            @ r5 = source row 1 of this tile
    vld1.8        {d0}, [r0]!           @ row 0: load 8 bytes, pu1_src += 8
    add           r6, r1, r3            @ r6 = destination row 1 of this tile
    vst1.8        {d0}, [r1]!           @ row 0: store 8 bytes, pu1_dst += 8
    vld1.8        {d1}, [r5], r2        @ row 1
    vst1.8        {d1}, [r6], r3
    subs          r4, r4, #8            @ 8 fewer bytes left; flags are consumed by the bgt below
    vld1.8        {d2}, [r5], r2        @ row 2
    vst1.8        {d2}, [r6], r3
    vld1.8        {d3}, [r5], r2        @ row 3
    vst1.8        {d3}, [r6], r3
    bgt           inner_loop_wd_8

end_inner_loop_wd_8:
    subs          r7, r7, #4            @ ht -= 4
    sub           r0, r5, r11           @ r5 is 4 rows below the last tile; back up wd - 8 to the band start
    sub           r1, r6, r11           @ same rewind for the destination
    bgt           outer_loop_wd_8

    ldmfd         sp!, {r4-r7, r11, r15} @ restore saved registers and return

@ ---- 16-byte columns: each pass copies a 16 x 4 tile -----------------------
core_loop_wd_16:
    sub           r11, r12, #16         @ r11 = wd - 16, used to rewind to the start of the row band

outer_loop_wd_16:
    mov           r4, r12               @ r4 = wd bytes left in this band

inner_loop_wd_16:
    add           r5, r0, r2            @ r5 = source row 1 of this tile
    vld1.8        {q0}, [r0]!           @ row 0: load 16 bytes, pu1_src += 16
    add           r6, r1, r3            @ r6 = destination row 1 of this tile
    vst1.8        {q0}, [r1]!           @ row 0: store 16 bytes, pu1_dst += 16
    vld1.8        {q1}, [r5], r2        @ row 1
    vst1.8        {q1}, [r6], r3
    subs          r4, r4, #16           @ 16 fewer bytes left; flags are consumed by the bgt below
    vld1.8        {q2}, [r5], r2        @ row 2
    vst1.8        {q2}, [r6], r3
    vld1.8        {q3}, [r5], r2        @ row 3
    vst1.8        {q3}, [r6], r3
    bgt           inner_loop_wd_16

end_inner_loop_wd_16:
    subs          r7, r7, #4            @ ht -= 4
    sub           r0, r5, r11           @ r5 is 4 rows below the last tile; back up wd - 16 to the band start
    sub           r1, r6, r11           @ same rewind for the destination
    bgt           outer_loop_wd_16

    ldmfd         sp!, {r4-r7, r11, r15} @ restore saved registers and return


@ *
@ ********************************************************************************
@ *
@ * @brief This function copies a 4x4 block to destination
@ *
@ * @par Description:
@ * Copies a 4x4 block to destination, where both src and dst are interleaved
@ *
@ * @param[in] pi2_src
@ *  Source
@ *
@ * @param[in] pu1_out
@ *  Output pointer
@ *
@ * @param[in] pred_strd,
@ *  Prediction buffer stride
@ *
@ * @param[in] out_strd
@ *  Output buffer stride
@ *
@ * @returns none
@ *
@ * @remarks none
@ * Currently wd and ht are not used, i.e. a 4x4 block is always copied
@ *
@ *******************************************************************************
@ *
@ void ih264_interleave_copy(WORD16 *pi2_src,
@                            UWORD8 *pu1_out,
@                            WORD32 pred_strd,
@                            WORD32 out_strd,
@                            WORD32 wd,
@                            WORD32 ht)
@ Register Usage
@ r0 : pi2_src
@ r1 : pu1_out
@ r2 : src_strd
@ r3 : out_strd
@ Neon registers d2-d5 (q1-q2), d18-d21 (q9-q10) and d30-d31 (q15) are used
@ No need for pushing  arm and neon registers

    .global ih264_interleave_copy_a9
ih264_interleave_copy_a9:
    @ Merges four 8-byte source rows into four 8-byte output rows: within every
    @ 16-bit lane the bits selected by the 0x00ff mask are taken from the source,
    @ the remaining bits keep their current output value.
    @ In:  r0 = source, r1 = output, r2 = source stride, r3 = output stride
    @ Uses q1, q2, q9, q10, q15 only; nothing is pushed.

    vmov.i16      q15, #0x00ff          @ q15 = select mask, 0x00ff in each 16-bit lane

    vld1.8        {d2}, [r0], r2        @ q1 = source rows 0 and 1
    vld1.8        {d3}, [r0], r2
    vld1.8        {d4}, [r0], r2        @ q2 = source rows 2 and 3
    vld1.8        {d5}, [r0], r2

    mov           r0, r1                @ source is fully read; r0 becomes the output read cursor

    vld1.8        {d18}, [r0], r3       @ q9 = current output rows 0 and 1
    vld1.8        {d19}, [r0], r3
    vld1.8        {d20}, [r0], r3       @ q10 = current output rows 2 and 3
    vld1.8        {d21}, [r0], r3

    vbit          q9, q1, q15           @ where the mask bit is 1, replace the output bit with the source bit
    vbit          q10, q2, q15

    vst1.8        {d18}, [r1], r3       @ write the merged rows back through the untouched output pointer
    vst1.8        {d19}, [r1], r3
    vst1.8        {d20}, [r1], r3
    vst1.8        {d21}, [r1], r3

    bx            lr