/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Syntax : GNU as, passed through the C preprocessor (#define is used below).
 * Target : 32-bit ARM with NEON.
 * ABI    : the first four arguments arrive in r0-r3, the remaining ones are
 *          read from the caller's stack. q4-q7 are saved/restored because the
 *          routines use them as scratch.
 */

/* Open a global function symbol in .text (with unwind info). */
#define ENTRY(f) .text; .align 0; .globl f; .type f,#function; f: .fnstart
/* Close the function opened by ENTRY and record its size. */
#define END(f) .fnend; .size f, .-f;

/*
 * rsdIntrinsicConvolve3x3_K
 *
 * 3x3 convolution over three input rows. Every loop iteration consumes
 * 16 bytes from each row (4 pixels of 4 x 8-bit channels), produces 2 output
 * pixels (8 bytes) and advances all pointers by 8 bytes.
 *
 * Presumed C prototype (TODO confirm against the caller):
 *   void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0, const void *y1,
 *                                  const void *y2, const short *coeffs,
 *                                  uint32_t count);
 *
 * On entry:
 *   r0        = dst
 *   r1        = y0 base pointer (row above)
 *   r2        = y1 base pointer (centre row)
 *   r3        = y2 base pointer (row below)
 *   [sp, #0]  = coeffs, signed 16-bit taps in row-major order
 *   [sp, #4]  = count = length / 2 (number of 2-pixel iterations)
 *
 * Arithmetic: 32-bit accumulate, truncating shift right by 8, then unsigned
 * saturation to 8 bits (so the taps behave as fixed point with 8 fractional
 * bits).
 *
 * Memory notes:
 *   - 32 bytes are loaded from coeffs although only the first 9 halfwords
 *     are used; the buffer must be readable for 32 bytes.
 *   - Each row is read up to byte offset 8 * count + 8.
 *
 * A count of 0 returns without touching dst.
 *
 * Clobbers: r0-r3, q0-q3, q8, q9, q13-q15, flags (r4, r5, q4-q7 are
 * restored before returning).
 */
ENTRY(rsdIntrinsicConvolve3x3_K)
        push            {r4-r8, r10, r11, lr}   /* 8 core regs = 32 bytes */
        vpush           {q4-q7}                 /* 4 q regs    = 64 bytes */

        /* Stack arguments now sit 32 + 64 bytes above sp. */

        /* Fetch the coeffs pointer and load the taps into q0, q1:
           d0 = c0..c3, d1 = c4..c7, d2[0] = c8 (the rest is unused). */
        ldr             r4, [sp, #32+64]
        vld1.16         {q0, q1}, [r4]

        /* Fetch the iteration count. */
        ldr             r4, [sp, #36+64]

        /* The loop below tests the counter only after decrementing it, so a
           zero count would wrap around; leave early instead. */
        cmp             r4, #0
        beq             2f

        /* Post-increment stride: 8 bytes = 2 pixels per iteration. */
        mov             r5, #8

1:
        /* Load 4 pixels from each row, then step every row by r5 = 8. */
        vld1.8          {q13}, [r1], r5
        vld1.8          {q14}, [r2], r5
        vld1.8          {q15}, [r3], r5

        /* Prefetch the data needed by the iteration after the next one. */
        pld             [r1, r5]
        pld             [r2, r5]
        pld             [r3, r5]

        /* Widen the 8-bit channels to 16 bit, one pixel per d register. */
        vmovl.u8        q2, d26
        vmovl.u8        q3, d27
        vmovl.u8        q4, d28
        vmovl.u8        q5, d29
        vmovl.u8        q6, d30
        vmovl.u8        q7, d31

        /*
         * Widened source pixels, one d register each:
         *   y0: d4,  d5,  d6,  d7
         *   y1: d8,  d9,  d10, d11
         *   y2: d12, d13, d14, d15
         */

        /* First output pixel: 3x3 window whose left column is pixel 0. */
        vmull.s16       q8, d4, d0[0]
        vmlal.s16       q8, d5, d0[1]
        vmlal.s16       q8, d6, d0[2]
        vmlal.s16       q8, d8, d0[3]
        vmlal.s16       q8, d9, d1[0]
        vmlal.s16       q8, d10, d1[1]
        vmlal.s16       q8, d12, d1[2]
        vmlal.s16       q8, d13, d1[3]
        vmlal.s16       q8, d14, d2[0]

        /* Second output pixel: same window shifted right by one pixel. */
        vmull.s16       q9, d5, d0[0]
        vmlal.s16       q9, d6, d0[1]
        vmlal.s16       q9, d7, d0[2]
        vmlal.s16       q9, d9, d0[3]
        vmlal.s16       q9, d10, d1[0]
        vmlal.s16       q9, d11, d1[1]
        vmlal.s16       q9, d13, d1[2]
        vmlal.s16       q9, d14, d1[3]
        vmlal.s16       q9, d15, d2[0]

        /* Narrow 32 -> 16 bit with a truncating shift by 8; both results end
           up side by side in q8 (d16, d17). */
        vshrn.i32       d16, q8, #8
        vshrn.i32       d17, q9, #8

        /* Narrow 16 -> 8 bit with unsigned saturation and store 2 pixels. */
        vqmovun.s16     d16, q8
        vst1.8          d16, [r0]!

        /* Are we done yet? */
        subs            r4, r4, #1
        bne             1b

2:
        /* Restore the saved registers and return. */
        vpop            {q4-q7}
        pop             {r4-r8, r10, r11, lr}
        bx              lr
END(rsdIntrinsicConvolve3x3_K)

/*
 * rsdIntrinsicConvolve5x5_K
 *
 * 5x5 convolution over five input rows. Every loop iteration consumes
 * 24 bytes from each row (6 pixels of 4 x 8-bit channels), produces 2 output
 * pixels (8 bytes) and advances all pointers by 8 bytes.
 *
 * Presumed C prototype (TODO confirm against the caller):
 *   void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0, const void *y1,
 *                                  const void *y2, const void *y3,
 *                                  const void *y4, const short *coeffs,
 *                                  uint32_t count);
 *
 * On entry:
 *   r0        = dst
 *   r1        = y0 base pointer (row y - 2)
 *   r2        = y1 base pointer (row y - 1)
 *   r3        = y2 base pointer (row y)
 *   [sp, #0]  = y3 base pointer (row y + 1)
 *   [sp, #4]  = y4 base pointer (row y + 2)
 *   [sp, #8]  = coeffs, signed 16-bit taps in row-major order
 *   [sp, #12] = count (number of 2-pixel iterations)
 *
 * Register use inside the routine:
 *   r4 = y3 pointer, r5 = y4 pointer,
 *   r6 = coeffs pointer during setup, then the remaining iteration count,
 *   r7 = constant 8 (pointer stride).
 *
 * Arithmetic: 32-bit accumulate, add 0x7f, rounding shift right by 8, then
 * unsigned saturation to 8 bits.
 *
 * Memory notes:
 *   - 56 bytes are loaded from coeffs although only the first 25 halfwords
 *     are used; the buffer must be readable for 56 bytes.
 *   - Each row is read up to byte offset 8 * count + 16.
 *
 * A count of 0 returns without touching dst.
 *
 * Clobbers: r0-r3, q0-q3, q9-q15, flags (r4-r7, q4-q7 are restored before
 * returning).
 */
ENTRY(rsdIntrinsicConvolve5x5_K)
        push            {r4-r7, lr}             /* 5 core regs = 20 bytes */
        vpush           {q4-q7}                 /* 4 q regs    = 64 bytes */

        /* Stack arguments now sit 20 + 64 bytes above sp. */

        /* load y3 in r4 */
        ldr             r4, [sp, #20 + 64]

        /* load y4 in r5 */
        ldr             r5, [sp, #24 + 64]

        /* Load the coefficients pointer */
        ldr             r6, [sp, #28 + 64]

        /* Create the coefficients vector:
           d0..d3 = c0..c15, d4..d5 = c16..c23, d6[0] = c24. */
        vld1.16         {d0, d1, d2, d3}, [r6]!
        vld1.16         {d4, d5, d6}, [r6]

        /* Bias added to every accumulator before the final shift. */
        vmov.u32        q15, #0x7f

        /* load the count (r6 is reused; the coeffs pointer is dead now) */
        ldr             r6, [sp, #32 + 64]

        /* The loop below tests the counter only after decrementing it, so a
           zero count would wrap around; leave early instead. */
        cmp             r6, #0
        beq             2f

        /* Post-increment stride: 8 bytes = 2 pixels per iteration. */
        mov             r7, #8

1:
        /* Rows y - 2 and y - 1: load 6 pixels each, then step by r7 = 8. */
        vld1.8          {d24, d25, d26}, [r1], r7       @ y0 ( y - 2 )
        vld1.8          {d27, d28, d29}, [r2], r7       @ y1 ( y - 1 )

        /* Prefetch the data needed by the iteration after the next one. */
        pld             [r1, r7]
        pld             [r2, r7]

        /* Promoting the 8bit channels to 16bit. The order matters: q12-q14
           overlay d24-d29, and each raw d register is consumed before (or by
           the same instruction as) the one that overwrites it. */
        vmovl.u8        q9, d24
        vmovl.u8        q10, d25
        vmovl.u8        q11, d26
        vmovl.u8        q12, d27
        vmovl.u8        q13, d28
        vmovl.u8        q14, d29

        /*
         * Widened source pixels, one d register each:
         *   first row : d18, d19, d20, d21, d22, d23
         *   second row: d24, d25, d26, d27, d28, d29
         */

        /* First output pixel, taps c0..c9. */
        vmull.s16       q4, d18, d0[0]
        vmlal.s16       q4, d19, d0[1]
        vmlal.s16       q4, d20, d0[2]
        vmlal.s16       q4, d21, d0[3]
        vmlal.s16       q4, d22, d1[0]

        vmlal.s16       q4, d24, d1[1]
        vmlal.s16       q4, d25, d1[2]
        vmlal.s16       q4, d26, d1[3]
        vmlal.s16       q4, d27, d2[0]
        vmlal.s16       q4, d28, d2[1]

        /* Second output pixel, same taps, window shifted by one pixel. */
        vmull.s16       q5, d19, d0[0]
        vmlal.s16       q5, d20, d0[1]
        vmlal.s16       q5, d21, d0[2]
        vmlal.s16       q5, d22, d0[3]
        vmlal.s16       q5, d23, d1[0]

        vmlal.s16       q5, d25, d1[1]
        vmlal.s16       q5, d26, d1[2]
        vmlal.s16       q5, d27, d1[3]
        vmlal.s16       q5, d28, d2[0]
        vmlal.s16       q5, d29, d2[1]

        /* Next 2 rows */
        /* Rows y and y + 1: load 6 pixels each, then step by r7 = 8. */
        vld1.8          {d24, d25, d26}, [r3], r7       @ y2 ( y )
        vld1.8          {d27, d28, d29}, [r4], r7       @ y3 ( y + 1 )

        /* Prefetch the data needed by the iteration after the next one. */
        pld             [r3, r7]
        pld             [r4, r7]

        /* Promoting the 8bit channels to 16bit (same layout as above). */
        vmovl.u8        q9, d24
        vmovl.u8        q10, d25
        vmovl.u8        q11, d26
        vmovl.u8        q12, d27
        vmovl.u8        q13, d28
        vmovl.u8        q14, d29

        /* First output pixel, taps c10..c19. */
        vmlal.s16       q4, d18, d2[2]
        vmlal.s16       q4, d19, d2[3]
        vmlal.s16       q4, d20, d3[0]
        vmlal.s16       q4, d21, d3[1]
        vmlal.s16       q4, d22, d3[2]

        vmlal.s16       q4, d24, d3[3]
        vmlal.s16       q4, d25, d4[0]
        vmlal.s16       q4, d26, d4[1]
        vmlal.s16       q4, d27, d4[2]
        vmlal.s16       q4, d28, d4[3]

        /* Second output pixel, taps c10..c19. */
        vmlal.s16       q5, d19, d2[2]
        vmlal.s16       q5, d20, d2[3]
        vmlal.s16       q5, d21, d3[0]
        vmlal.s16       q5, d22, d3[1]
        vmlal.s16       q5, d23, d3[2]

        vmlal.s16       q5, d25, d3[3]
        vmlal.s16       q5, d26, d4[0]
        vmlal.s16       q5, d27, d4[1]
        vmlal.s16       q5, d28, d4[2]
        vmlal.s16       q5, d29, d4[3]

        /* Last row */
        /* Row y + 2: load 6 pixels, then step by r7 = 8. */
        vld1.8          {d24, d25, d26}, [r5], r7       @ y4 ( y + 2 )

        /* Prefetch the data needed by the iteration after the next one. */
        pld             [r5, r7]

        /* Promoting the 8bit channels to 16bit: d18..d23 hold the 6 pixels. */
        vmovl.u8        q9, d24
        vmovl.u8        q10, d25
        vmovl.u8        q11, d26

        /* First output pixel, taps c20..c24. */
        vmlal.s16       q4, d18, d5[0]
        vmlal.s16       q4, d19, d5[1]
        vmlal.s16       q4, d20, d5[2]
        vmlal.s16       q4, d21, d5[3]
        vmlal.s16       q4, d22, d6[0]

        /* Second output pixel, taps c20..c24. */
        vmlal.s16       q5, d19, d5[0]
        vmlal.s16       q5, d20, d5[1]
        vmlal.s16       q5, d21, d5[2]
        vmlal.s16       q5, d22, d5[3]
        vmlal.s16       q5, d23, d6[0]

        /* Add the 0x7f bias.
           NOTE(review): the vrshrn below already rounds (adds 1 << 7), so the
           total added before the shift is 0xff - confirm this is intended. */
        vadd.i32        q4, q4, q15
        vadd.i32        q5, q5, q15

        /* Narrow it to a d-reg 32 -> 16 bit; both results end up side by
           side in q4 (d8, d9). */
        vrshrn.i32      d8, q4, #8
        vrshrn.i32      d9, q5, #8

        /* Pack 16 -> 8 bit, saturate, put two pixels into D reg */
        vqmovun.s16     d8, q4

        /* Store the two output pixels and advance dst. */
        vst1.8          d8, [r0]!

        /* Are we done? */
        subs            r6, r6, #1
        bne             1b

2:
        /* Restore the saved registers and return. */
        vpop            {q4-q7}
        pop             {r4-r7, lr}
        bx              lr
END(rsdIntrinsicConvolve5x5_K)