/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
        r0 = dst
        r1 = y0 base pointer
        r2 = y1 base pointer
        r3 = y2 base pointer
        sp = coeffs
        sp = length / 2
*/

#define ENTRY(f) .text; .align 0; .globl f; .type f,#function; f: .fnstart
#define END(f) .fnend; .size f, .-f;

ENTRY(rsdIntrinsicConvolve3x3_K)
        push            {r4-r8, r10, r11, lr}
        vpush           {q4-q7}

        /* Get the coeffs pointer from the stack and load the
           coefficients in the q0, q1 NEON registers */
        ldr r4, [sp, #32+64]
        vld1.16 {q0, q1}, [r4]

        /* Get count from the stack */
        ldr r4, [sp, #36+64]

        /* Load the frequently used immediate in a register */
        mov r5, #8

1:
        /* Load and post-increase the address by r5=#8 */
        vld1.8 {q13}, [r1], r5
        vld1.8 {q14}, [r2], r5
        vld1.8 {q15}, [r3], r5

        /* Signal memory for data that will be used in the loop after the next */
        pld         [r1, r5]
        pld         [r2, r5]
        pld         [r3, r5]

        vmovl.u8 q2, d26
        vmovl.u8 q3, d27
        vmovl.u8 q4, d28
        vmovl.u8 q5, d29
        vmovl.u8 q6, d30
        vmovl.u8 q7, d31

/*
        The two pixel source array is
        d4,  d5,  d6,  d7
        d8,  d9,  d10, d11
        d12, d13, d14, d15
*/

        vmull.s16 q8, d4, d0[0]
        vmlal.s16 q8, d5, d0[1]
        vmlal.s16 q8, d6, d0[2]
        vmlal.s16 q8, d8, d0[3]
        vmlal.s16 q8, d9, d1[0]
        vmlal.s16 q8, d10, d1[1]
        vmlal.s16 q8, d12, d1[2]
        vmlal.s16 q8, d13, d1[3]
        vmlal.s16 q8, d14, d2[0]

        vmull.s16 q9, d5, d0[0]
        vmlal.s16 q9, d6, d0[1]
        vmlal.s16 q9, d7, d0[2]
        vmlal.s16 q9, d9, d0[3]
        vmlal.s16 q9, d10, d1[0]
        vmlal.s16 q9, d11, d1[1]
        vmlal.s16 q9, d13, d1[2]
        vmlal.s16 q9, d14, d1[3]
        vmlal.s16 q9, d15, d2[0]

        vshrn.i32 d16, q8, #8
        vshrn.i32 d17, q9, #8

        vqmovun.s16 d16, q8
        vst1.8 d16, [r0]!

        /* Are we done yet? */
        subs r4, r4, #1
        bne 1b

        /* We're done, bye! */
        vpop            {q4-q7}
        pop             {r4-r8, r10, r11, lr}
        bx              lr
END(rsdIntrinsicConvolve3x3_K)


/* Convolve 5x5 */

/*
        r0 = dst
        r1 = y0 base pointer
        r2 = y1 base pointer
        r3 = y2 base pointer
        r4 = y3 base pointer
        r5 = y4 base pointer
        r6 = coeffs
        r7 = length
*/
ENTRY(rsdIntrinsicConvolve5x5_K)
        push        {r4-r7, lr}
        vpush       {q4-q7}

        /* load y3 in r4 */
        ldr     r4, [sp, #20 + 64]

        /* load y4 in r5 */
        ldr     r5, [sp, #24 + 64]

        /* Load the coefficients pointer */
        ldr     r6, [sp, #28 + 64]

        /* Create the coefficients vector */
        vld1.16     {d0, d1, d2, d3}, [r6]!
        vld1.16     {d4, d5, d6}, [r6]

        vmov.u32  q15, #0x7f

        /* load the count */
        ldr     r6, [sp, #32 + 64]

        /* Load the frequently used immediate in a register */
        mov     r7, #8

1:
        /* Load the y base pointers in Qregs and post-increase the address by r7=#8 */
        vld1.8  {d24, d25, d26}, [r1], r7      @  y0 ( y - 2 )
        vld1.8  {d27, d28, d29}, [r2], r7      @  y0 ( y - 1 )

        /* Signal memory for data that will be used in the loop after the next */
        pld         [r1, r7]
        pld         [r2, r7]

        /* Promoting the 8bit channels to 16bit */
        vmovl.u8 q9,  d24
        vmovl.u8 q10, d25
        vmovl.u8 q11, d26
        vmovl.u8 q12, d27
        vmovl.u8 q13, d28
        vmovl.u8 q14, d29

/*
        d18,  d19,  d20, d21, d22, d23,
        d24,  d25
*/
        vmull.s16 q4, d18, d0[0]
        vmlal.s16 q4, d19, d0[1]
        vmlal.s16 q4, d20, d0[2]
        vmlal.s16 q4, d21, d0[3]
        vmlal.s16 q4, d22, d1[0]

        vmlal.s16 q4, d24, d1[1]
        vmlal.s16 q4, d25, d1[2]
        vmlal.s16 q4, d26, d1[3]
        vmlal.s16 q4, d27, d2[0]
        vmlal.s16 q4, d28, d2[1]

        vmull.s16 q5, d19, d0[0]
        vmlal.s16 q5, d20, d0[1]
        vmlal.s16 q5, d21, d0[2]
        vmlal.s16 q5, d22, d0[3]
        vmlal.s16 q5, d23, d1[0]

        vmlal.s16 q5, d25, d1[1]
        vmlal.s16 q5, d26, d1[2]
        vmlal.s16 q5, d27, d1[3]
        vmlal.s16 q5, d28, d2[0]
        vmlal.s16 q5, d29, d2[1]


        /* Next 2 rows */
        /* Load the y base pointers in Qregs and post-increase the address by r7=#8 */
        vld1.8  {d24, d25, d26}, [r3], r7      @  y0 ( y )
        vld1.8  {d27, d28, d29}, [r4], r7      @  y0 ( y + 1 )

        /* Signal memory for data that will be used in the loop after the next */
        pld         [r3, r7]
        pld         [r4, r7]

        /* Promoting the 8bit channels to 16bit */
        vmovl.u8 q9,  d24
        vmovl.u8 q10, d25
        vmovl.u8 q11, d26
        vmovl.u8 q12, d27
        vmovl.u8 q13, d28
        vmovl.u8 q14, d29

/*
        d18,  d19,  d20, d21, d22, d23,
        d24,  d25
*/
        vmlal.s16 q4, d18, d2[2]
        vmlal.s16 q4, d19, d2[3]
        vmlal.s16 q4, d20, d3[0]
        vmlal.s16 q4, d21, d3[1]
        vmlal.s16 q4, d22, d3[2]

        vmlal.s16 q4, d24, d3[3]
        vmlal.s16 q4, d25, d4[0]
        vmlal.s16 q4, d26, d4[1]
        vmlal.s16 q4, d27, d4[2]
        vmlal.s16 q4, d28, d4[3]

        vmlal.s16 q5, d19, d2[2]
        vmlal.s16 q5, d20, d2[3]
        vmlal.s16 q5, d21, d3[0]
        vmlal.s16 q5, d22, d3[1]
        vmlal.s16 q5, d23, d3[2]

        vmlal.s16 q5, d25, d3[3]
        vmlal.s16 q5, d26, d4[0]
        vmlal.s16 q5, d27, d4[1]
        vmlal.s16 q5, d28, d4[2]
        vmlal.s16 q5, d29, d4[3]

        /* Last row */
        /* Load the y base pointers in Qregs and post-increase the address by r7=#8 */
        vld1.8  {d24, d25, d26}, [r5], r7      @  y0 ( y + 2 )

        /* Signal memory for data that will be used in the loop after the next */
        pld         [r5, r7]

        /* Promoting the 8bit channels to 16bit */
        vmovl.u8 q9,  d24
        vmovl.u8 q10, d25
        vmovl.u8 q11, d26

/*
        d18,  d19,  d20, d21, d22, d23,
        d24,  d25
*/

        vmlal.s16 q4, d18, d5[0]
        vmlal.s16 q4, d19, d5[1]
        vmlal.s16 q4, d20, d5[2]
        vmlal.s16 q4, d21, d5[3]
        vmlal.s16 q4, d22, d6[0]

        vmlal.s16 q5, d19, d5[0]
        vmlal.s16 q5, d20, d5[1]
        vmlal.s16 q5, d21, d5[2]
        vmlal.s16 q5, d22, d5[3]
        vmlal.s16 q5, d23, d6[0]


        vadd.i32 q4, q4, q15
        vadd.i32 q5, q5, q15

/*      Narrow it to a d-reg 32 -> 16 bit */
        vrshrn.i32 d8, q4, #8
        vrshrn.i32 d9, q5, #8


/*      Pack 16 -> 8 bit, saturate, put two pixels into D reg */
        vqmovun.s16 d8, q4

        vst1.8 d8, [r0]!           @ return the output and increase the address of r0

        /* Are we done? */
        subs r6, r6, #1
        bne 1b

        /* Yup, bye */
        vpop        {q4-q7}
        pop         {r4-r7, lr}
        bx          lr

END(rsdIntrinsicConvolve5x5_K)