@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_weighted_pred_bi_default.s
@*
@* @brief
@*  contains function definitions for weighted prediction used in inter
@* prediction
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*  - ihevc_weighted_pred_bi_default()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  does default bi-weighted prediction on the arrays pointed by pi2_src1 and
@* pi2_src2 and stores it at location  pointed by pi2_dst assumptions : the
@* function is optimized considering the fact width and  height are multiple
@* of 2.
@*
@* @par description:
@*  dst = ( (src1 + lvl_shift1) +  (src2 + lvl_shift2) +  1 << (shift - 1) )
@* >> shift  where shift = 15 - bitdepth
@*
@* @param[in] pi2_src1
@*  pointer to source 1
@*
@* @param[in] pi2_src2
@*  pointer to source 2
@*
@* @param[out] pu1_dst
@*  pointer to destination
@*
@* @param[in] src_strd1
@*  source stride 1
@*
@* @param[in] src_strd2
@*  source stride 2
@*
@* @param[in] dst_strd
@*  destination stride
@*
@* @param[in] lvl_shift1
@*  added before shift and offset
@*
@* @param[in] lvl_shift2
@*  added before shift and offset
@*
@* @param[in] ht
@*  height of the source
@*
@* @param[in] wd
@*  width of the source
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@void ihevc_weighted_pred_bi_default(word16 *pi2_src1,
@                                    word16 *pi2_src2,
@                                    uword8 *pu1_dst,
@                                    word32 src_strd1,
@                                    word32 src_strd2,
@                                    word32 dst_strd,
@                                    word32 lvl_shift1,
@                                    word32 lvl_shift2,
@                                    word32 ht,
@                                    word32 wd)

@**************variables vs registers*****************************************
@   r0 => *pi2_src1
@   r1 => *pi2_src2
@   r2 => *pu1_dst
@   r3 =>  src_strd1
@   r4 =>  src_strd2
@   r5 =>  dst_strd
@   r6 =>  lvl_shift1
@   r7 =>  lvl_shift2
@   r8 =>  ht
@   r9 =>  wd
.text
.syntax unified
.align 4


.globl ihevc_weighted_pred_bi_default_a9q

.type ihevc_weighted_pred_bi_default_a9q, %function

ihevc_weighted_pred_bi_default_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    ldr         r4,[sp,#40]                 @load src_strd2
    lsl         r3,r3,#1
    ldr         r5,[sp,#44]                 @load dst_strd
    ldr         r6,[sp,#48]                 @load lvl_shift1
    lsl         r4,r4,#1
    ldr         r7,[sp,#52]                 @load lvl_shift2
    ldr         r8,[sp,#56]                 @load ht
    ldr         r9,[sp,#60]                 @load wd
    vdup.16     q2,r6                       @lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1)
    vdup.16     q3,r7                       @lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2)
    vmov.i16    q0,#0x40                    @tmp_lvl_shift = 1 << (shift - 1)
    vadd.i16    q2,q3
    vadd.s16    q0,q0,q2
@   vmvn.i32    q1,#0x6                         @vmovq_n_s32(tmp_shift)
    lsl         r6,r9,#1
    rsb         r7,r6,r3,lsl #2             @4*src_strd1 - wd
    rsb         r10,r6,r4,lsl #2            @4*src_strd2 - wd
    @asr            r6,#1
    @rsb            r6,r6,r5,lsl #2             @4*dst_strd - wd

    cmp         r8,#0                       @check ht == 0
    beq         end_loops                   @if equal, then end the function

chroma_decision:
    orr         r14,r8,r9
    cmp         r14,#10
    beq         outer_loop_chroma_8x2

    cmp         r14,#6
    beq         outer_loop_chroma_4x2


luma_decision:
    cmp         r9,#24
    beq         outer_loop_8

    cmp         r9,#16
    bge         outer_loop_16

    cmp         r9,#12
    beq         outer_loop_4

    cmp         r9,#8
    bge         outer_loop_8


outer_loop_4:
    cmp         r9,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function

core_loop_4:
    add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    vld1.s16    {d6},[r0]!                  @load and increment the pi2_src1
    add         r14,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
    vld1.s16    {d7},[r1]!                  @load and increment the pi2_src2
    vld1.s16    {d8},[r11],r3               @load and increment the pi2_src1 ii iteration
    vqadd.s16   d18,d6,d7
    vqadd.s16   d18,d18,d0                  @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
    vld1.s16    {d9},[r12],r4               @load and increment the pi2_src2 ii iteration
    vqadd.s16   d20,d8,d9                   @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
    vqadd.s16   d19,d20,d0                  @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
    vqshrun.s16 d20,q9,#7
    vld1.s16    {d22},[r11],r3              @load and increment the pi2_src1 iii iteration
    vld1.s16    {d23},[r12],r4              @load and increment the pi2_src2 iii iteration
    vqadd.s16   d30,d22,d23
    vqadd.s16   d30,d30,d0                  @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
    vld1.s16    {d24},[r11],r3              @load and increment the pi2_src1 iv iteration
    vld1.s16    {d25},[r12],r4              @load and increment the pi2_src2 iv iteration
    vqadd.s16   d18,d24,d25                 @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
    vqadd.s16   d31,d18,d0
    vst1.32     {d20[0]},[r2]!              @store pu1_dst i iteration
    vst1.32     {d20[1]},[r14],r5           @store pu1_dst ii iteration
    vqshrun.s16 d30,q15,#7
    vst1.32     {d30[0]},[r14],r5           @store pu1_dst iii iteration                                                @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteratio
    subs        r9,r9,#4                    @decrement wd by 4 and check for 0
    vst1.32     {d30[1]},[r14],r5           @store pu1_dst iv iteration
    bgt         core_loop_4                 @if greater than 0 repeat the core loop again

end_core_loop_4:

    subs        r8,r8,#4                    @decrement the ht by 4

    add         r0,r0,r7                    @pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
    asr         r9,r6,#1
    add         r1,r1,r10                   @pi2_src2 + 4*src_strd2 - 2*wd
    rsb         r14,r9,r5,lsl #2            @4*dst_strd - wd
    add         r2,r2,r14
                                            @pu1_dst + dst_std - wd
    bgt         core_loop_4                 @if ht is greater than 0 goto outer_loop

    b           end_loops


@ this is only for chroma module with input 2x2
outer_loop_chroma_4x2:
    cmp         r9,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function
    rsb         r7,r6,r3,lsl #1             @2*src_strd1 - wd
    rsb         r10,r6,r4,lsl #1            @2*src_strd2 - wd
core_loop_chroma_4x2:
    add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    vld1.s16    {d6},[r0]!                  @load and increment the pi2_src1
    add         r14,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
    vld1.s16    {d7},[r1]!                  @load and increment the pi2_src2
    vld1.s16    {d8},[r11],r3               @load and increment the pi2_src1 ii iteration
    vqadd.s16   d18,d6,d7
    vqadd.s16   d18,d18,d0                  @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
    vld1.s16    {d9},[r12],r4               @load and increment the pi2_src2 ii iteration
    vqadd.s16   d20,d8,d9                   @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
    vqadd.s16   d19,d20,d0                  @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
    vqshrun.s16 d20,q9,#7
    vst1.32     {d20[0]},[r2]!              @store pu1_dst i iteration
    vst1.32     {d20[1]},[r14],r5           @store pu1_dst ii iteration

    subs        r9,r9,#4                    @decrement wd by 4 and check for 0

    bgt         core_loop_chroma_4x2        @if greater than 0 repeat the core loop again

end_core_loop_chorma_4x2:

    subs        r8,r8,#2                    @decrement the ht by 4

    add         r0,r0,r7                    @pi2_src1 + 2*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
    asr         r9,r6,#1
    add         r1,r1,r10                   @pi2_src2 + 2*src_strd2 - 2*wd
    rsb         r14,r9,r5,lsl #1            @2*dst_strd - wd
    add         r2,r2,r14
                                            @pu1_dst + dst_std - wd
    bgt         core_loop_chroma_4x2        @if ht is greater than 0 goto outer_loop

    b           end_loops


outer_loop_8:
    cmp         r9,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function
    add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
core_loop_8:

    vld1.s16    {q12},[r0]!                 @load and increment the pi2_src1
    add         r14,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
    vld1.s16    {q13},[r1]!                 @load and increment the pi2_src2
    vqadd.s16   q12,q12,q13
    vld1.s16    {q14},[r11],r3              @load and increment the pi2_src1 ii iteration
    vqadd.s16   q12,q12,q0                  @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
    vld1.s16    {q15},[r12],r4              @load and increment the pi2_src2 ii iteration
    vld1.s16    {q8},[r11],r3               @load and increment the pi2_src1 iii iteration
    vqadd.s16   q11,q14,q15                 @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
    vld1.s16    {q9},[r12],r4               @load and increment the pi2_src2 iii iteration
    vqadd.s16   q11,q11,q0                  @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
    vqshrun.s16 d20,q12,#7
    vld1.s16    {q6},[r11],r3               @load and increment the pi2_src1 iv iteration
    vqadd.s16   q15,q8,q9
    vqshrun.s16 d21,q11,#7
    vld1.s16    {q7},[r12],r4               @load and increment the pi2_src2 iv iteration
    vqadd.s16   q15,q15,q0                  @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
    vst1.32     {d20},[r2]!                 @store pu1_dst i iteration
    vqadd.s16   q4,q6,q7                    @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
    vst1.32     {d21},[r14],r5              @store pu1_dst ii iteration
    vqadd.s16   q4,q4,q0
    vqshrun.s16 d30,q15,#7
    vqshrun.s16 d31,q4,#7
    add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    vst1.32     {d30},[r14],r5              @store pu1_dst iii iteration                                                @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteratio
    subs        r9,r9,#8                    @decrement wd by 4 and check for 0
    vst1.32     {d31},[r14],r5              @store pu1_dst iv iteration
    bgt         core_loop_8                 @if greater than 0 repeat the core loop again

end_core_loop_8:

    subs        r8,r8,#4                    @decrement the ht by 4

    add         r0,r0,r7                    @pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
    asr         r9,r6,#1
    add         r1,r1,r10                   @pi2_src2 + 4*src_strd2 - 2*wd
    rsb         r14,r9,r5,lsl #2            @4*dst_strd - wd
    add         r2,r2,r14
    add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)                                  @pu1_dst + dst_std - wd

    bgt         core_loop_8
    b           end_loops


@ this is only for chroma module with inpput 4x2
outer_loop_chroma_8x2:
    cmp         r9,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function
    add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    rsb         r7,r6,r3,lsl #1             @2*src_strd1 - wd
    rsb         r10,r6,r4,lsl #1            @2*src_strd2 - wd
core_loop_chroma_8x2:

    vld1.s16    {q12},[r0]!                 @load and increment the pi2_src1
    add         r14,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
    vld1.s16    {q13},[r1]!                 @load and increment the pi2_src2
    vqadd.s16   q12,q12,q13
    vld1.s16    {q14},[r11],r3              @load and increment the pi2_src1 ii iteration
    vqadd.s16   q12,q12,q0                  @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
    vld1.s16    {q15},[r12],r4              @load and increment the pi2_src2 ii iteration
    vld1.s16    {q8},[r11],r3               @load and increment the pi2_src1 iii iteration
    vqadd.s16   q11,q14,q15                 @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
    vqadd.s16   q11,q11,q0                  @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
    vqshrun.s16 d20,q12,#7
    vqshrun.s16 d21,q11,#7
    vst1.32     {d20},[r2]!                 @store pu1_dst i iteration
    vst1.32     {d21},[r14],r5              @store pu1_dst ii iteration

    add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
                                            @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteratio
    subs        r9,r9,#8                    @decrement wd by 4 and check for 0

    bgt         core_loop_chroma_8x2        @if greater than 0 repeat the core loop again

end_core_loop_chroma_8x2:

    subs        r8,r8,#2                    @decrement the ht by 4

    add         r0,r0,r7                    @pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
    asr         r9,r6,#1
    add         r1,r1,r10                   @pi2_src2 + 4*src_strd2 - 2*wd
    rsb         r14,r9,r5,lsl #1            @4*dst_strd - wd
    add         r2,r2,r14
    add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)                                  @pu1_dst + dst_std - wd

    bgt         core_loop_chroma_8x2

    b           end_loops


outer_loop_16:
    cmp         r9,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function
    add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    rsb         r7,r6,r3,lsl #1             @2*src_strd1 - wd
    mov         r14,#16
    sub         r10,r14,r5
    sub         r11,r3,r14
    sub         r12,r14,r3

    rsb         r14,r9,r5,lsl #1            @2*dst_strd - wd


prolog_16:


    vld1.s16    {q1},[r0]!                  @load and increment the pi2_src1
    vld1.s16    {q2},[r1]!                  @load and increment the pi2_src2
    vld1.s16    {q5},[r0],r11               @load and increment the pi2_src1
    vld1.s16    {q6},[r1],r11               @load and increment the pi2_src2
    vld1.s16    {q3},[r0]!                  @load and increment the pi2_src1 ii iteration
    subs        r9,r9,#16
    vld1.s16    {q4},[r1]!                  @load and increment the pi2_src2 ii iteration
    subeq       r8,r8,#2
    vqadd.s16   q11,q1,q2
    vld1.s16    {q7},[r0],r12               @load and increment the pi2_src1 ii iteration
    vqadd.s16   q14,q5,q6
    vld1.s16    {q8},[r1],r12               @load and increment the pi2_src2 ii iteration
    addeq       r0,r0,r7
    addeq       r1,r1,r7
    vqadd.s16   q12,q3,q4
    vld1.s16    {q1},[r0]!
    vqadd.s16   q13,q7,q8
@ if the input is chroma with 8x2 block size
    cmp         r8,#0
    beq         epilog_16

    vld1.s16    {q2},[r1]!                  @load and increment the pi2_src2
    vqadd.s16   q11,q11,q0
    vld1.s16    {q5},[r0],r11               @load and increment the pi2_src1
    vqadd.s16   q14,q14,q0
    vld1.s16    {q6},[r1],r11               @load and increment the pi2_src2
    vqadd.s16   q12,q12,q0
    vld1.s16    {q3},[r0]!                  @load and increment the pi2_src1 ii iteration
    vqadd.s16   q15,q13,q0
    vqshrun.s16 d20,q11,#7
    vld1.s16    {q4},[r1]!                  @load and increment the pi2_src2 ii iteration
    vqshrun.s16 d21,q14,#7
    vld1.s16    {q7},[r0],r12               @load and increment the pi2_src1 ii iteration
    vqshrun.s16 d26,q12,#7
    vld1.s16    {q8},[r1],r12               @load and increment the pi2_src2 ii iteration
    vqshrun.s16 d27,q15,#7


core_loop_16:

    cmp         r9,#0
    vqadd.s16   q11,q1,q2
    asreq       r9,r6,#1
    vst1.32     {q10},[r2],r5
    vqadd.s16   q14,q5,q6
    vst1.32     {q13},[r2],r10
    addeq       r2,r2,r14
    vqadd.s16   q12,q3,q4
    subs        r9,r9,#16
    addeq       r0,r0,r7
    vqadd.s16   q13,q7,q8

    addeq       r1,r1,r7
    subseq      r8,r8,#2                    @decrement the ht by 2
    beq         epilog_16


    vqadd.s16   q11,q11,q0
    vld1.s16    {q1},[r0]!                  @load and increment the pi2_src1
    vqadd.s16   q14,q14,q0
    vld1.s16    {q2},[r1]!                  @load and increment the pi2_src2
    vqadd.s16   q12,q12,q0
    vld1.s16    {q5},[r0],r11               @load and increment the pi2_src1
    vqadd.s16   q15,q13,q0
    vld1.s16    {q6},[r1],r11               @load and increment the pi2_src2
    vqshrun.s16 d20,q11,#7
    vld1.s16    {q3},[r0]!                  @load and increment the pi2_src1 ii iteration
    vqshrun.s16 d21,q14,#7
    vld1.s16    {q4},[r1]!                  @load and increment the pi2_src2 ii iteration
    vqshrun.s16 d26,q12,#7
    vld1.s16    {q7},[r0],r12               @load and increment the pi2_src1 ii iteration
    vqshrun.s16 d27,q15,#7
    vld1.s16    {q8},[r1],r12               @load and increment the pi2_src2 ii iteration


    b           core_loop_16


epilog_16:

    vqadd.s16   q11,q11,q0
    vqadd.s16   q14,q14,q0
    vqadd.s16   q12,q12,q0
    vqadd.s16   q15,q13,q0
    vqshrun.s16 d20,q11,#7
    vqshrun.s16 d21,q14,#7
    vqshrun.s16 d26,q12,#7
    vqshrun.s16 d27,q15,#7
    vst1.32     {q10},[r2],r5
    vst1.32     {q13},[r2]


end_core_loop_16:


end_loops:
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp