commit 8d3d303c7942ced6a987a52db8977d768dc3605f
tree   cc806c96794356996b13ba9970941d0aed74a97e /encoder/arm
parent 3956d913d37327dcb340f836e604b04bd478b158
author    Hamsalekha S <hamsalekha.s@ittiam.com>  2015-03-13 21:24:58 +0530
committer Hamsalekha S <hamsalekha.s@ittiam.com>  2015-04-02 15:59:02 +0530
Initial version
Change-Id: I7efe9a589cd24edf86e8d086b40c27cbbf8b4017
Diffstat (limited to 'encoder/arm')
-rwxr-xr-x  encoder/arm/ih264e_evaluate_intra16x16_modes_a9q.s   |  313
-rwxr-xr-x  encoder/arm/ih264e_evaluate_intra4x4_modes_a9q.s     |  529
-rwxr-xr-x  encoder/arm/ih264e_evaluate_intra_chroma_modes_a9q.s |  346
-rwxr-xr-x  encoder/arm/ih264e_fmt_conv.s                        |  329
-rwxr-xr-x  encoder/arm/ih264e_function_selector.c               |  170
-rwxr-xr-x  encoder/arm/ih264e_function_selector_a9q.c           |  252
-rwxr-xr-x  encoder/arm/ih264e_function_selector_av8.c           |  259
-rwxr-xr-x  encoder/arm/ih264e_half_pel.s                        |  951
-rwxr-xr-x  encoder/arm/ih264e_platform_macros.h                 |  143
-rwxr-xr-x  encoder/arm/ime_distortion_metrics_a9q.s             | 1353
-rwxr-xr-x  encoder/arm/ime_platform_macros.h                    |   51
11 files changed, 4696 insertions(+), 0 deletions(-)
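The first file in the diff, ih264e_evaluate_intra16x16_modes_a9q.s, evaluates the three cheap 16x16 luma intra modes (vertical, horizontal, DC) in a single pass: one SAD per mode is accumulated per source row with vabdl/vabal, invalid modes are forced to a sentinel cost of 1 << 30 via u4_valid_intra_modes, and the winning mode's prediction is then written out. As a reading aid, a hedged C model of that decision logic is sketched below; all names in it are illustrative, and the DC value derivation from neighbour availability as well as the prediction-buffer writes are omitted.

```c
/*
 * Illustrative C model of the mode decision done by
 * ih264e_evaluate_intra16x16_modes_a9q. Mode numbering
 * (0 = VERT, 1 = HORZ, 2 = DC) and the valid-mode bits
 * (bit 0 VERT, bit 1 HORZ, bit 2 DC) follow the assembly;
 * function and parameter names here are illustrative only.
 */
#include <stdint.h>
#include <stdlib.h>

static void eval_intra16x16_sketch(const uint8_t *pu1_src, uint32_t src_strd,
                                   const uint8_t *pu1_top,  /* 16 top pels  */
                                   const uint8_t *pu1_left, /* 16 left pels */
                                   uint8_t u1_dc_val,       /* precomputed  */
                                   uint32_t u4_valid_modes,
                                   uint32_t *pu4_mode, int32_t *pi4_min_sad)
{
    uint32_t au4_sad[3] = { 0, 0, 0 }; /* VERT, HORZ, DC */
    int r, c, m;

    for (r = 0; r < 16; r++)
    {
        for (c = 0; c < 16; c++)
        {
            int s = pu1_src[r * src_strd + c];
            au4_sad[0] += abs(s - pu1_top[c]);   /* vertical   */
            au4_sad[1] += abs(s - pu1_left[r]);  /* horizontal */
            au4_sad[2] += abs(s - u1_dc_val);    /* dc         */
        }
    }

    /* The assembly forces invalid modes to 1 << 30 so they never win */
    for (m = 0; m < 3; m++)
        if (!(u4_valid_modes & (1u << m)))
            au4_sad[m] = 1u << 30;

    /* Strict '<' keeps the earlier mode on ties, as the assembly does */
    *pu4_mode = 0;
    for (m = 1; m < 3; m++)
        if (au4_sad[m] < au4_sad[*pu4_mode])
            *pu4_mode = m;
    *pi4_min_sad = (int32_t)au4_sad[*pu4_mode];
}
```

The 4x4 evaluator in the second file extends the same idea to nine modes and adds a lambda-weighted mode-bits cost on top of each SAD before comparing.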
diff --git a/encoder/arm/ih264e_evaluate_intra16x16_modes_a9q.s b/encoder/arm/ih264e_evaluate_intra16x16_modes_a9q.s new file mode 100755 index 0000000..fe0ce17 --- /dev/null +++ b/encoder/arm/ih264e_evaluate_intra16x16_modes_a9q.s @@ -0,0 +1,313 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** + +@/** +@****************************************************************************** +@* +@* @brief :Evaluate best intra 16x16 mode (among VERT, HORZ and DC ) +@* and do the prediction. +@* +@* @par Description +@* This function evaluates first three 16x16 modes and compute corresponding sad +@* and return the buffer predicted with best mode. +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@** @param[in] pu1_ngbr_pels_i16 +@* UWORD8 pointer to neighbouring pels +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] u4_n_avblty +@* availability of neighbouring pixels +@* +@* @param[in] u4_intra_mode +@* Pointer to the variable in which best mode is returned +@* +@* @param[in] pu4_sadmin +@* Pointer to the variable in which minimum sad is returned +@* +@* @param[in] u4_valid_intra_modes +@* Says what all modes are valid +@* +@* +@* @return none +@* +@****************************************************************************** +@*/ +@ +@void ih264e_evaluate_intra16x16_modes(UWORD8 *pu1_src, +@ UWORD8 *pu1_ngbr_pels_i16, +@ UWORD8 *pu1_dst, +@ UWORD32 src_strd, +@ UWORD32 dst_strd, +@ WORD32 u4_n_avblty, +@ UWORD32 *u4_intra_mode, +@ WORD32 *pu4_sadmin, +@ UWORD32 u4_valid_intra_modes) +@ +.text +.p2align 2 + + .global ih264e_evaluate_intra16x16_modes_a9q + +ih264e_evaluate_intra16x16_modes_a9q: + +@r0 = pu1_src, +@r1 = pu1_ngbr_pels_i16, +@r2 = pu1_dst, +@r3 = src_strd, +@r4 = dst_strd, +@r5 = u4_n_avblty, +@r6 = u4_intra_mode, +@r7 = pu4_sadmin + + + + stmfd sp!, {r4-r12, r14} @store register values to stack + ldr r5, [sp, #44] + + + vpush {d8-d15} + vld1.32 {q4}, [r1]! + sub r6, r1, #1 + add r1, r1, #1 + mov r10, #0 + vld1.32 {q5}, [r1]! + mov r11, #0 + mov r4, #0 + @/* Left available ???? + ands r7, r5, #01 + movne r10, #1 + + @/* Top available ???? 
+ ands r8, r5, #04 + lsl r9, r10, #3 + movne r11, #1 + lsl r12, r11, #3 + adds r8, r9, r12 + + + @/* None available :( + moveq r4, #128 + + + +@/fINDING dc val*/ + @---------------------- + vaddl.u8 q15, d8, d9 + + vaddl.u8 q14, d10, d11 + + vadd.u16 q15, q14, q15 + @ VLD1.32 {q2},[r0],r3;row 2 + vadd.u16 d30, d31, d30 + vpadd.u16 d30, d30 + @ VLD1.32 {q3},[r0],r3 ;row 3 + vpadd.u16 d30, d30 + @--------------------- + + + vmov.u16 r7, d30[0] + add r7, r7, r8 + add r11, r11, #3 + add r8, r10, r11 + + lsr r7, r8 + add r7, r4, r7 + vld1.32 {q0}, [r0], r3 @ source r0w 0 + vdup.8 q15, r7 @dc val + +@/* computing SADs for all three modes*/ + ldrb r7, [r6] + vdup.8 q10, r7 @/HORIZONTAL VALUE ROW=0; + @/vertical row 0; + vabdl.u8 q8, d0, d10 + vabdl.u8 q9, d1, d11 + sub r6, r6, #1 + @/HORZ row 0; + vabdl.u8 q13, d0, d20 + vabdl.u8 q14, d1, d21 + mov r1, #15 + @/dc row 0; + vabdl.u8 q11, d0, d30 + vabdl.u8 q12, d1, d31 + + +loop: + vld1.32 {q1}, [r0], r3 @row i + @/dc row i; + vabal.u8 q11, d2, d30 + ldrb r7, [r6] + vabal.u8 q12, d3, d31 + + @/vertical row i; + vabal.u8 q8, d2, d10 + vdup.8 q10, r7 @/HORIZONTAL VALUE ROW=i; + sub r6, r6, #1 + vabal.u8 q9, d3, d11 + + subs r1, r1, #1 + @/HORZ row i; + vabal.u8 q13, d2, d20 + vabal.u8 q14, d3, d21 + bne loop + + @------------------------------------------------------------------------------ + + vadd.i16 q9, q9, q8 @/VERT + vadd.i16 d18, d19, d18 @/VERT + vpaddl.u16 d18, d18 @/VERT + vadd.i16 q14, q13, q14 @/HORZ + vadd.i16 d28, d29, d28 @/HORZ + vpaddl.u32 d18, d18 @/VERT + vpaddl.u16 d28, d28 @/HORZ + + vpaddl.u32 d28, d28 @/HORZ + vmov.u32 r8, d18[0] @ vert + vadd.i16 q12, q11, q12 @/DC + vmov.u32 r9, d28[0] @horz + mov r11, #1 + vadd.i16 d24, d24, d25 @/DC + lsl r11 , #30 + + @----------------------- + ldr r0, [sp, #120] @ u4_valid_intra_modes + @-------------------------------------------- + ands r7, r0, #01 @ vert mode valid???????????? + moveq r8, r11 + vpaddl.u16 d24, d24 @/DC + + ands r6, r0, #02 @ horz mode valid???????????? + moveq r9, r11 + vpaddl.u32 d24, d24 @/DC + + vmov.u32 r10, d24[0] @dc +@-------------------------------- + ldr r4, [sp, #104] @r4 = dst_strd, + ldr r7, [sp, #116] @r7 = pu4_sadmin +@---------------------------------------------- + ands r6, r0, #04 @ dc mode valid???????????? 
+ moveq r10, r11 + + @--------------------------- + ldr r6, [sp, #112] @ R6 =MODE + @-------------------------- + + cmp r8, r9 + bgt not_vert + cmp r8, r10 + bgt do_dc + + @/---------------------- + @DO VERTICAL PREDICTION + str r8 , [r7] @MIN SAD + mov r8, #0 + str r8 , [r6] @ MODE + vmov q15, q5 + + b do_dc_vert + @----------------------------- +not_vert: + cmp r9, r10 + bgt do_dc + + @/---------------------- + @DO HORIZONTAL + vdup.8 q5, d9[7] @0 + str r9 , [r7] @MIN SAD + vdup.8 q6, d9[6] @1 + mov r9, #1 + vdup.8 q7, d9[5] @2 + vst1.32 {d10, d11} , [r2], r4 @0 + vdup.8 q8, d9[4] @3 + str r9 , [r6] @ MODE + vdup.8 q9, d9[3] @4 + vst1.32 {d12, d13} , [r2], r4 @1 + vdup.8 q10, d9[2] @5 + vst1.32 {d14, d15} , [r2], r4 @2 + vdup.8 q11, d9[1] @6 + vst1.32 {d16, d17} , [r2], r4 @3 + vdup.8 q12, d9[0] @7 + vst1.32 {d18, d19} , [r2], r4 @4 + vdup.8 q13, d8[7] @8 + vst1.32 {d20, d21} , [r2], r4 @5 + vdup.8 q14, d8[6] @9 + vst1.32 {d22, d23} , [r2], r4 @6 + vdup.8 q15, d8[5] @10 + vst1.32 {d24, d25} , [r2], r4 @7 + vdup.8 q1, d8[4] @11 + vst1.32 {d26, d27} , [r2], r4 @8 + vdup.8 q2, d8[3] @12 + vst1.32 {d28, d29} , [r2], r4 @9 + vdup.8 q3, d8[2] @13 + vst1.32 {d30, d31}, [r2], r4 @10 + vdup.8 q5, d8[1] @14 + vst1.32 {d2, d3} , [r2], r4 @11 + vdup.8 q6, d8[0] @15 + vst1.32 {d4, d5} , [r2], r4 @12 + + vst1.32 {d6, d7} , [r2], r4 @13 + + vst1.32 {d10, d11} , [r2], r4 @14 + + vst1.32 {d12, d13} , [r2], r4 @15 + b end_func + + + @/----------------------------- + +do_dc: @/--------------------------------- + @DO DC + str r10 , [r7] @MIN SAD + mov r10, #2 + str r10 , [r6] @ MODE +do_dc_vert: + vst1.32 {d30, d31}, [r2], r4 @0 + vst1.32 {d30, d31}, [r2], r4 @1 + vst1.32 {d30, d31}, [r2], r4 @2 + vst1.32 {d30, d31}, [r2], r4 @3 + vst1.32 {d30, d31}, [r2], r4 @4 + vst1.32 {d30, d31}, [r2], r4 @5 + vst1.32 {d30, d31}, [r2], r4 @6 + vst1.32 {d30, d31}, [r2], r4 @7 + vst1.32 {d30, d31}, [r2], r4 @8 + vst1.32 {d30, d31}, [r2], r4 @9 + vst1.32 {d30, d31}, [r2], r4 @10 + vst1.32 {d30, d31}, [r2], r4 @11 + vst1.32 {d30, d31}, [r2], r4 @12 + vst1.32 {d30, d31}, [r2], r4 @13 + vst1.32 {d30, d31}, [r2], r4 @14 + vst1.32 {d30, d31}, [r2], r4 @15 + @/------------------ +end_func: + vpop {d8-d15} + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + diff --git a/encoder/arm/ih264e_evaluate_intra4x4_modes_a9q.s b/encoder/arm/ih264e_evaluate_intra4x4_modes_a9q.s new file mode 100755 index 0000000..568e623 --- /dev/null +++ b/encoder/arm/ih264e_evaluate_intra4x4_modes_a9q.s @@ -0,0 +1,529 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +@*/ +@/** + +@/** + +.data +.p2align 2 + +scratch_intrapred_luma_4x4_prediction: + .long ver, hor, d_c, dia_dl + .long dia_dr, ver_r, hor_d, ver_l + .long hor_u + + +.text +.p2align 2 + +scratch_intrapred_luma_4x4_prediction_addr1: + .long scratch_intrapred_luma_4x4_prediction - scrintra_4x4 - 8 + + + +@/** +@/** +@****************************************************************************** +@* +@* @brief :Evaluate best intra 4x4 mode +@* and do the prediction. +@* +@* @par Description +@* This function evaluates 4x4 modes and compute corresponding sad +@* and return the buffer predicted with best mode. +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@** @param[in] pu1_ngbr_pels +@* UWORD8 pointer to neighbouring pels +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] u4_n_avblty +@* availability of neighbouring pixels +@* +@* @param[in] u4_intra_mode +@* Pointer to the variable in which best mode is returned +@* +@* @param[in] pu4_sadmin +@* Pointer to the variable in which minimum cost is returned +@* +@* @param[in] u4_valid_intra_modes +@* Says what all modes are valid +@* +@* * @param[in] u4_lambda +@* Lamda value for computing cost from SAD +@* +@* @param[in] u4_predictd_mode +@* Predicted mode for cost computation +@* +@* +@* +@* @return none +@* +@****************************************************************************** +@*/ +@void ih264e_evaluate_intra_4x4_modes(UWORD8 *pu1_src, +@ UWORD8 *pu1_ngbr_pels, +@ UWORD8 *pu1_dst, +@ UWORD32 src_strd, +@ UWORD32 dst_strd, +@ WORD32 u4_n_avblty, +@ UWORD32 *u4_intra_mode, +@ WORD32 *pu4_sadmin, +@ UWORD32 u4_valid_intra_modes, +@ UWORD32 u4_lambda, +@ UWORD32 u4_predictd_mode) + + + + .global ih264e_evaluate_intra_4x4_modes_a9q + +ih264e_evaluate_intra_4x4_modes_a9q: + +@r0 = pu1_src, +@r1 = pu1_ngbr_pels_i16, +@r2 = pu1_dst, +@r3 = src_strd, +@r4 = dst_strd, +@r5 = u4_n_avblty, +@r6 = u4_intra_mode, +@r7 = pu4_sadmin +@r8 = u4_valid_intra_modes +@r0 =u4_lambda +@r1 = u4_predictd_mode + + + stmfd sp!, {r4-r12, r14} @store register values to stack + +@-------------------- + ldr r5, [sp, #44] @r5 = u4_n_avblty, +@---------------------- + vpush {d8-d15} +@Loading neighbours + vld1.32 {q0}, [r1] + add r4, r1, #12 + vld1.8 d1[5], [r4] + vld1.8 d1[7], [r1] + @-------------------------------- + ldr r8, [sp, #120] @u4_valid_intra_modes +@---------------------------------------------- + + + +@ LOADING pu1_src + vld1.32 {d20[0]}, [r0], r3 + vext.8 q1, q0, q0, #1 + vld1.32 {d20[1]}, [r0], r3 + mov r11, #1 + vld1.32 {d21[0]}, [r0], r3 + lsl r11, r11, #30 + vld1.32 {d21[1]}, [r0], r3 + + + +@-------------------------------- + ldr r0, [sp, #124] @r0 =u4_lambda + ldr r1, [sp, #128] @r1 = u4_predictd_mode +@------ + + +vert: + ands r10, r8, #01 @VERT sad ?? + beq horz + vdup.32 q2, d2[1] + vabdl.u8 q14, d4, d20 + vabal.u8 q14, d4, d21 + vadd.i16 d28, d29, d28 + subs r6, r1, #0 + vpaddl.u16 d28, d28 @ + lslne r6, r0, #2 + vpaddl.u32 d28, d28 @/ + moveq r6, r0 @ + vmov.u32 r9, d28[0] @ vert + add r9, r6, r9 + + subs r6, r11, r9 + movgt r11, r9 + movgt r12, #0 + +horz: + ands r10, r8, #02 @HORZ sad ?? 
+ beq dc + vdup.32 q3, d0[0] + vmov.32 q4, q3 + vtrn.8 q3, q4 + vtrn.16 d7, d6 + vtrn.16 d9, d8 + vtrn.32 d9, d7 + vtrn.32 d8, d6 + vabdl.u8 q14, d6, d20 + subs r6, r1, #1 + vabal.u8 q14, d7, d21 + vadd.i16 d28, d29, d28 + lslne r6, r0, #2 + vpaddl.u16 d28, d28 @ + vpaddl.u32 d28, d28 @/ + vmov.u32 r9, d28[0] @ + moveq r6, r0 @ + add r9, r6, r9 + + subs r6, r11, r9 + movgt r11, r9 + movgt r12, #1 + +dc: + ands r10, r8, #04 @DC sad ?? + beq diags + vext.8 q4, q0, q0, #5 + vaddl.u8 q4, d0, d8 + vpaddl.u16 d8, d8 @ + vpaddl.u32 d8, d8 @/ + vmov.u32 r4, d8[0] @ + mov r14, #1 + ands r10, r5, #1 + addne r4, r4, #2 + addne r14, r14, #1 + ands r10, r5, #4 + addne r4, r4, #2 + addne r14, r14, #1 + ands r10, r5, #5 + moveq r4, #128 + moveq r14, #0 + subs r6, r1, #2 + lsr r4, r4, r14 + vdup.8 q4, r4 + lslne r6, r0, #2 + vabdl.u8 q14, d8, d20 + vabal.u8 q14, d9, d21 + vadd.i16 d28, d29, d28 + vpaddl.u16 d28, d28 @ + vpaddl.u32 d28, d28 @/ + vmov.u32 r9, d28[0] @ + + moveq r6, r0 @ + add r9, r6, r9 + + subs r6, r11, r9 + movgt r11, r9 + movgt r12, #2 + +diags: + ands r10, r8, #504 @/* if modes other than VERT, HORZ and DC are valid ????*/ + beq pred + @/* Performing FILT11 and FILT121 operation for all neighbour values*/ + vext.8 q5, q0, q0, #2 + vaddl.u8 q6, d0, d2 + vaddl.u8 q7, d1, d3 + vaddl.u8 q8, d10, d2 + vaddl.u8 q9, d11, d3 + vadd.u16 q12, q10, q11 + vqrshrun.s16 d10, q6, #1 + vqrshrun.s16 d11, q7, #1 + vadd.u16 q11, q6, q8 + vadd.u16 q12, q7, q9 + vqrshrun.s16 d12, q11, #2 + vqrshrun.s16 d13, q12, #2 + mov r14, #0 + vdup.32 q13 , r14 + mov r14, #-1 + vmov.i32 d26[0], r14 + +diag_dl: + ands r10, r8, #0x08 @DIAG_DL sad ?? + beq diag_dr + + vext.8 q15, q6, q6, #5 + vbit.32 d14, d30, d26 + vext.8 q15, q6, q6, #15 + vbit.32 d15, d31, d26 + vext.8 q15, q6, q6, #2 + vext.32 q14, q13, q13, #3 + vbit.32 d14, d30, d28 + vext.8 q15, q6, q6, #4 + vbit.32 d15, d30, d28 + vabdl.u8 q14, d14, d20 + subs r6, r1, #3 + vabal.u8 q14, d15, d21 + vadd.i16 d28, d29, d28 + vpaddl.u16 d28, d28 @ + lslne r6, r0, #2 + vpaddl.u32 d28, d28 @/ + vmov.u32 r9, d28[0] @ + + moveq r6, r0 @ + add r9, r6, r9 + + subs r6, r11, r9 + movgt r11, r9 + movgt r12, #3 + +diag_dr: + ands r10, r8, #16 @DIAG_DR sad ?? + beq vert_r + + vext.8 q15, q6, q6, #3 + vbit.32 d16, d30, d26 + vext.8 q15, q6, q6, #1 + vbit.32 d17, d30, d26 + vext.8 q15, q6, q6, #4 + vext.32 q14, q13, q13, #3 + vbit.32 d17, d31, d28 + vext.8 q15, q6, q6, #6 + vbit.32 d16, d31, d28 + vabdl.u8 q14, d16, d20 + subs r6, r1, #4 + vabal.u8 q14, d17, d21 + vadd.i16 d28, d29, d28 + vpaddl.u16 d28, d28 @ + lslne r6, r0, #2 + vpaddl.u32 d28, d28 @/ + vmov.u32 r9, d28[0] @ + + moveq r6, r0 @ + add r9, r6, r9 + + subs r6, r11, r9 + movgt r11, r9 + movgt r12, #4 + +vert_r: + ands r10, r8, #32 @VERT_R sad ?? 
+ beq horz_d + vext.8 q15, q5, q5, #4 + vbit.32 d18, d30, d26 + vext.8 q15, q5, q5, #3 + vbit.32 d19, d30, d26 + vext.32 q14, q13, q13, #3 + vext.8 q15, q6, q6, #15 + vbit.32 d18, d30, d28 + vext.8 q15, q6, q6, #14 + vbit.32 d19, d30, d28 + mov r14, #0 + vdup.32 q14 , r14 + mov r14, #0xff + vmov.i8 d28[0], r14 + vext.8 q15, q6, q6, #2 + vbit.32 d19, d30, d28 + vext.32 q14, q14, q14, #3 + subs r6, r1, #5 + vext.8 q15, q6, q6, #13 + vbit.32 d19, d30, d28 + lslne r6, r0, #2 + vabdl.u8 q14, d18, d20 + vabal.u8 q14, d19, d21 + vadd.i16 d28, d29, d28 + vpaddl.u16 d28, d28 @ + vpaddl.u32 d28, d28 @/ + vmov.u32 r9, d28[0] @ + + + moveq r6, r0 @ + add r9, r6, r9 + + subs r6, r11, r9 + movgt r11, r9 + movgt r12, #5 + +horz_d: + vmov.8 q1, q5 + vmov.8 q15, q6 + vzip.8 q1, q15 + + ands r10, r8, #64 @HORZ_D sad ?? + beq vert_l + vext.8 q15, q6, q6, #2 + vbit.32 d8, d30, d26 + mov r14, #0 + vdup.32 q14 , r14 + mov r14, #0xff + vmov.i8 d28[0], r14 + vext.8 q15, q5, q5, #3 + vbit.32 d8, d30, d28 + vext.8 q15, q1, q1, #2 + vbit.32 d9, d30, d26 + vext.32 q14, q13, q13, #3 + vbit.32 d8, d2, d28 + subs r6, r1, #6 + vext.8 q15, q1, q1, #12 + vbit.32 d9, d30, d28 + vabdl.u8 q14, d8, d20 + vabal.u8 q14, d9, d21 + vadd.i16 d28, d29, d28 + vpaddl.u16 d28, d28 @ + lslne r6, r0, #2 + vpaddl.u32 d28, d28 @/ + vmov.u32 r9, d28[0] @ + + + moveq r6, r0 @ + add r9, r6, r9 + + subs r6, r11, r9 + movgt r11, r9 + movgt r12, #6 +vert_l: + ands r10, r8, #128 @VERT_L sad ?? + beq horz_u + vext.8 q15, q5, q5, #5 + vbit.32 d24, d30, d26 + vext.8 q15, q15, q15, #1 + vbit.32 d25, d30, d26 + vext.8 q15, q6, q6, #1 + vext.32 q14, q13, q13, #3 + vbit.32 d24, d30, d28 + vext.8 q15, q15, q15, #1 + subs r6, r1, #7 + vbit.32 d25, d30, d28 + vabdl.u8 q14, d24, d20 + vabal.u8 q14, d25, d21 + vadd.i16 d28, d29, d28 + vpaddl.u16 d28, d28 @ + lslne r6, r0, #2 + vpaddl.u32 d28, d28 @/ + vmov.u32 r9, d28[0] @ + + moveq r6, r0 @ + add r9, r6, r9 + + subs r6, r11, r9 + movgt r11, r9 + movgt r12, #7 + +horz_u: + ands r10, r8, #256 @HORZ_U sad ?? 
+ beq pred + vrev64.8 q5, q1 + vdup.8 q1, d0[0] + vext.8 q6, q6, #7 + mov r14, #0 + vdup.32 q14 , r14 + mov r14, #0xff + vmov.i8 d28[0], r14 + vbit.32 d11, d13, d28 + movw r14, #0xffff + vmov.i16 d28[0], r14 + vext.8 q6, q5, q5, #7 + subs r6, r1, #8 + vbit.32 d3, d12, d28 + vext.8 q6, q5, q5, #3 + vbit.32 d2, d12, d26 + vext.32 q14, q13, q13, #3 + vext.8 q6, q5, q5, #1 + vbit.32 d2, d12, d28 + vabdl.u8 q14, d2, d20 + vabal.u8 q14, d3, d21 + vadd.i16 d28, d29, d28 + vpaddl.u16 d28, d28 @ + lslne r6, r0, #2 + vpaddl.u32 d28, d28 @/ + vmov.u32 r9, d28[0] @ + + + moveq r6, r0 @ + add r9, r6, r9 + + subs r6, r11, r9 + movgt r11, r9 + movgt r12, #8 + +pred: @/*dOING FINAL PREDICTION*/ +@--------------------------- + ldr r7, [sp, #116] @r7 = pu4_sadmin + ldr r6, [sp, #112] @ R6 =MODE +@-------------------------- + str r11, [r7] @/STORING MIN SAD*/ + str r12, [r6] @/FINAL MODE*/ + + + ldr r3, scratch_intrapred_luma_4x4_prediction_addr1 +scrintra_4x4: + add r3, r3, pc + lsl r12, r12, #2 + add r3, r3, r12 + + ldr r5, [r3] + and r5, r5, #0xfffffffe + + bx r5 + + +ver: + vext.8 q0, q0, q0, #1 + vdup.32 q15, d0[1] + b store + +hor: + vmov.32 q15, q3 + b store + +d_c: + vdup.8 q15, r4 + b store + +dia_dl: + vmov.32 q15, q7 + b store + +dia_dr: + vmov.32 q15, q8 + b store + +ver_r: + vmov.32 q15, q9 + b store + +hor_d: + vmov.32 q15, q4 + b store + +ver_l: + vmov.32 q15, q12 + b store + +hor_u: + vmov.32 q15, q1 + +store: @/* storing to pu1_dst*/ + + ldr r4, [sp, #104] @r4 = dst_strd, + + vst1.32 {d30[0]}, [r2], r4 + vst1.32 {d30[1]}, [r2], r4 + vst1.32 {d31[0]}, [r2], r4 + vst1.32 {d31[1]}, [r2], r4 + + +end_func: + vpop {d8-d15} + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + diff --git a/encoder/arm/ih264e_evaluate_intra_chroma_modes_a9q.s b/encoder/arm/ih264e_evaluate_intra_chroma_modes_a9q.s new file mode 100755 index 0000000..e4dfca8 --- /dev/null +++ b/encoder/arm/ih264e_evaluate_intra_chroma_modes_a9q.s @@ -0,0 +1,346 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** + +@/** +@****************************************************************************** +@* +@* @brief :Evaluate best intr chroma mode (among VERT, HORZ and DC ) +@* and do the prediction. +@* +@* @par Description +@* This function evaluates first three intra chroma modes and compute corresponding sad +@* and return the buffer predicted with best mode. 
+@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@** @param[in] pu1_ngbr_pels +@* UWORD8 pointer to neighbouring pels +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] u4_n_avblty +@* availability of neighbouring pixels +@* +@* @param[in] u4_intra_mode +@* Pointer to the variable in which best mode is returned +@* +@* @param[in] pu4_sadmin +@* Pointer to the variable in which minimum sad is returned +@* +@* @param[in] u4_valid_intra_modes +@* Says what all modes are valid +@* +@* +@* @return none +@* +@****************************************************************************** +@*/ +@ +@void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src, +@ UWORD8 *pu1_ngbr_pels_i16, +@ UWORD8 *pu1_dst, +@ UWORD32 src_strd, +@ UWORD32 dst_strd, +@ WORD32 u4_n_avblty, +@ UWORD32 *u4_intra_mode, +@ WORD32 *pu4_sadmin, +@ UWORD32 u4_valid_intra_modes) +@ +.text +.p2align 2 + + .global ih264e_evaluate_intra_chroma_modes_a9q + +ih264e_evaluate_intra_chroma_modes_a9q: + +@r0 = pu1_src, +@r1 = pu1_ngbr_pels_i16, +@r2 = pu1_dst, +@r3 = src_strd, +@r4 = dst_strd, +@r5 = u4_n_avblty, +@r6 = u4_intra_mode, +@r7 = pu4_sadmin + + + + stmfd sp!, {r4-r12, r14} @store register values to stack + @----------------------- + ldr r5, [sp, #44] @r5 = u4_n_avblty, + @------------------------- + mov r12, r1 @ + vpush {d8-d15} + vld1.32 {q4}, [r1]! + add r1, r1, #2 + vld1.32 {q5}, [r1]! + + vuzp.u8 q4, q5 @ + + vpaddl.u8 d8, d8 + vpadd.u16 d8, d8 + + vpaddl.u8 d9, d9 + vpadd.u16 d9, d9 + + vpaddl.u8 d10, d10 + vpadd.u16 d10, d10 + + vpaddl.u8 d11, d11 + + and r7, r5, #5 + vpadd.u16 d11, d11 + subs r8, r7, #5 + beq all_available + subs r8, r7, #4 + beq top_available + subs r8, r7, #1 + beq left_available + mov r10, #128 + vdup.8 q14, r10 + vdup.8 q15, r10 + b sad + +all_available: + vzip.u16 q4, q5 + vext.16 q6, q4, q4, #2 + vadd.u16 q7, q5, q6 + vqrshrn.u16 d14, q7, #3 + vqrshrn.u16 d15, q4, #2 + vqrshrn.u16 d16, q5, #2 + vdup.16 d28, d14[0] + vdup.16 d29, d16[1] + vdup.16 d30, d15[0] + vdup.16 d31, d14[1] + b sad +top_available: + vzip.u16 q4, q5 + vqrshrn.u16 d16, q5, #2 + vdup.16 d28, d16[0] + vdup.16 d29, d16[1] + vdup.16 d30, d16[0] + vdup.16 d31, d16[1] + b sad +left_available: + vzip.u16 q4, q5 + vqrshrn.u16 d16, q4, #2 + vdup.16 d28, d16[3] + vdup.16 d29, d16[3] + vdup.16 d30, d16[2] + vdup.16 d31, d16[2] + + +sad: + vld1.32 {q4}, [r12]! + sub r8, r12, #2 + add r12, r12, #2 + vld1.32 {q5}, [r12]! 
+ add r12, r0, r3, lsl #2 + sub r10, r8, #8 + vld1.32 {q0}, [r0], r3 + ldrh r9, [r8] + vdup.16 q10, r9 @ row 0 + + @/vertical row 0; + vabdl.u8 q8, d0, d10 + vabdl.u8 q9, d1, d11 + sub r8, r8, #2 + vld1.32 {q1}, [r12], r3 + + @/HORZ row 0; + vabdl.u8 q13, d0, d20 + vabdl.u8 q7, d1, d21 + ldrh r9, [r10] + @/dc row 0; + vabdl.u8 q11, d0, d28 + vabdl.u8 q12, d1, d29 + + + vdup.16 q10, r9 @ row 4 + @/vertical row 4; + vabal.u8 q8, d2, d10 + vabal.u8 q9, d3, d11 + sub r10, r10, #2 + + @/HORZ row 4; + vabal.u8 q13, d2, d20 + vabal.u8 q7, d3, d21 + @/dc row 4; + vabal.u8 q11, d2, d30 + vabal.u8 q12, d3, d31 + + mov r11, #3 + +loop: + vld1.32 {q0}, [r0], r3 + ldrh r9, [r8] + + + @/vertical row i; + vabal.u8 q8, d0, d10 + vabal.u8 q9, d1, d11 + + vdup.16 q10, r9 @ row i + vld1.32 {q1}, [r12], r3 + sub r8, r8, #2 + @/HORZ row i; + vabal.u8 q13, d0, d20 + vabal.u8 q7, d1, d21 + ldrh r9, [r10] + @/dc row i; + vabal.u8 q11, d0, d28 + vabal.u8 q12, d1, d29 + sub r10, r10, #2 + + vdup.16 q10, r9 @ row i+4 + @/vertical row 4; + vabal.u8 q8, d2, d10 + vabal.u8 q9, d3, d11 + subs r11, r11, #1 + + @/HORZ row i+4; + vabal.u8 q13, d2, d20 + vabal.u8 q7, d3, d21 + @/dc row i+4; + vabal.u8 q11, d2, d30 + vabal.u8 q12, d3, d31 + bne loop + + + +@------------------------------------------- + + vadd.i16 q9, q9, q8 @/VERT + vadd.i16 q7, q13, q7 @/HORZ + vadd.i16 q12, q11, q12 @/DC + vadd.i16 d18, d19, d18 @/VERT + vadd.i16 d14, d15, d14 @/HORZ + vadd.i16 d24, d24, d25 @/DC + vpaddl.u16 d18, d18 @/VERT + vpaddl.u16 d14, d14 @/HORZ + vpaddl.u16 d24, d24 @/DC + vpaddl.u32 d18, d18 @/VERT + vpaddl.u32 d14, d14 @/HORZ + vpaddl.u32 d24, d24 @/DC + + + + vmov.u32 r8, d18[0] @ vert + vmov.u32 r9, d14[0] @horz + vmov.u32 r10, d24[0] @dc + + mov r11, #1 +@----------------------- + ldr r0, [sp, #120] @ u4_valid_intra_modes +@-------------------------------------------- + + + lsl r11 , #30 + + ands r7, r0, #04 @ vert mode valid???????????? + moveq r8, r11 + + ands r6, r0, #02 @ horz mode valid???????????? + moveq r9, r11 + + ands r6, r0, #01 @ dc mode valid???????????? 
+ moveq r10, r11 + + + @--------------------------- + ldr r4, [sp, #104] @r4 = dst_strd, + ldr r6, [sp, #112] @ R6 =MODE + ldr r7, [sp, #116] @r7 = pu4_sadmin + + @-------------------------- + + cmp r10, r9 + bgt not_dc + cmp r10, r8 + bgt do_vert + + @/---------------------- + @DO DC PREDICTION + str r10 , [r7] @MIN SAD + mov r10, #0 + str r10 , [r6] @ MODE + b do_dc_vert + @----------------------------- + +not_dc: + cmp r9, r8 + bgt do_vert + @/---------------------- + @DO HORIZONTAL + + vdup.16 q10, d9[3] @/HORIZONTAL VALUE ROW=0; + str r9 , [r7] @MIN SAD + mov r9, #1 + vdup.16 q11, d9[2] @/HORIZONTAL VALUE ROW=1; + str r9 , [r6] @ MODE + vdup.16 q12, d9[1] @/HORIZONTAL VALUE ROW=2; + vst1.32 {d20, d21} , [r2], r4 @0 + vdup.16 q13, d9[0] @/HORIZONTAL VALUE ROW=3; + vst1.32 {d22, d23} , [r2], r4 @1 + vdup.16 q14, d8[3] @/HORIZONTAL VALUE ROW=4; + vst1.32 {d24, d25} , [r2], r4 @2 + vdup.16 q15, d8[2] @/HORIZONTAL VALUE ROW=5; + vst1.32 {d26, d27} , [r2], r4 @3 + vdup.16 q1, d8[1] @/HORIZONTAL VALUE ROW=6; + vst1.32 {d28, d29} , [r2], r4 @4 + vdup.16 q2, d8[0] @/HORIZONTAL VALUE ROW=7; + vst1.32 {d30, d31} , [r2], r4 @5 + vst1.32 {d2, d3} , [r2], r4 @6 + vst1.32 {d4, d5} , [r2], r4 @7 + b end_func + +do_vert: + @DO VERTICAL PREDICTION + str r8 , [r7] @MIN SAD + mov r8, #2 + str r8 , [r6] @ MODE + vmov q15, q5 + vmov q14, q5 + +do_dc_vert: + vst1.32 {d28, d29} , [r2], r4 @0 + vst1.32 {d28, d29} , [r2], r4 @1 + vst1.32 {d28, d29} , [r2], r4 @2 + vst1.32 {d28, d29} , [r2], r4 @3 + vst1.32 {d30, d31} , [r2], r4 @4 + vst1.32 {d30, d31} , [r2], r4 @5 + vst1.32 {d30, d31} , [r2], r4 @6 + vst1.32 {d30, d31} , [r2], r4 @7 + + +end_func: + vpop {d8-d15} + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + diff --git a/encoder/arm/ih264e_fmt_conv.s b/encoder/arm/ih264e_fmt_conv.s new file mode 100755 index 0000000..2bf1479 --- /dev/null +++ b/encoder/arm/ih264e_fmt_conv.s @@ -0,0 +1,329 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** + +.text +.p2align 2 +@/** + +@/***************************************************************************** +@* * +@* Function Name : IH264D_CXA8_YUV420toYUV420SP_UV() * +@* * +@* Description : This function conversts the image from YUV420P color * +@* space to 420SP color space(UV interleaved). 
* +@* * +@* Arguments : R0 pu1_y * +@* R1 pu1_u * +@* R2 pu1_v * +@* R3 pu1_dest_y * +@* [R13 #40] pu1_dest_uv * +@* [R13 #44] u2_height * +@* [R13 #48] u2_width * +@* [R13 #52] u2_stridey * +@* [R13 #56] u2_strideu * +@* [R13 #60] u2_stridev * +@* [R13 #64] u2_dest_stride_y * +@* [R13 #68] u2_dest_stride_uv * +@* [R13 #72] convert_uv_only * +@* * +@* Values Returned : None * +@* * +@* Register Usage : R0 - R14 * +@* * +@* Stack Usage : 40 Bytes * +@* * +@* Interruptibility : Interruptible * +@* * +@* Known Limitations * +@* Assumptions: Image Width: Assumed to be multiple of 16 and * +@* greater than or equal to 16 * +@* Image Height: Assumed to be even. * +@* * +@* Revision History : * +@* DD MM YYYY Author(s) Changes (Describe the changes made) * +@* 07 06 2010 Varshita Draft * +@* 07 06 2010 Naveen Kr T Completed * +@* * +@*****************************************************************************/ + .global ih264e_fmt_conv_420p_to_420sp_a9q + +ih264e_fmt_conv_420p_to_420sp_a9q: + + @// push the registers on the stack + stmfd sp!, {r4-r12, lr} + + ldr r4, [sp, #72] @// Load convert_uv_only + + cmp r4, #1 + beq yuv420sp_uv_chroma + @/* Do the preprocessing before the main loops start */ + @// Load the parameters from stack + ldr r4, [sp, #44] @// Load u2_height from stack + ldr r5, [sp, #48] @// Load u2_width from stack + ldr r7, [sp, #52] @// Load u2_stridey from stack + ldr r8, [sp, #64] @// Load u2_dest_stride_y from stack + sub r7, r7, r5 @// Source increment + sub r8, r8, r5 @// Destination increment + + vpush {d8-d15} +yuv420sp_uv_row_loop_y: + mov r6, r5 + +yuv420sp_uv_col_loop_y: + pld [r0, #128] + vld1.8 {d0, d1}, [r0]! + vst1.8 {d0, d1}, [r3]! + sub r6, r6, #16 + cmp r6, #15 + bgt yuv420sp_uv_col_loop_y + + cmp r6, #0 + beq yuv420sp_uv_row_loop_end_y + @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read + @//Ex if width is 162, above loop will process 160 pixels. And + @//Both source and destination will point to 146th pixel and then 16 bytes will be read + @// and written using VLD1 and VST1 + rsb r6, r6, #16 + sub r0, r0, r6 + sub r3, r3, r6 + + vld1.8 {d0, d1}, [r0]! + vst1.8 {d0, d1}, [r3]! + +yuv420sp_uv_row_loop_end_y: + add r0, r0, r7 + add r3, r3, r8 + subs r4, r4, #1 + bgt yuv420sp_uv_row_loop_y + +yuv420sp_uv_chroma: + + ldr r3, [sp, #40] @// Load pu1_dest_uv from stack + + ldr r4, [sp, #44] @// Load u2_height from stack + + ldr r5, [sp, #48] @// Load u2_width from stack + + + ldr r7, [sp, #56] @// Load u2_strideu from stack + + ldr r8, [sp, #68] @// Load u2_dest_stride_uv from stack + + sub r7, r7, r5, lsr #1 @// Source increment + + sub r8, r8, r5 @// Destination increment + + mov r5, r5, lsr #1 + mov r4, r4, lsr #1 + ldr r3, [sp, #40] @// Load pu1_dest_uv from stack + vpush {d8-d15} +yuv420sp_uv_row_loop_uv: + mov r6, r5 + + +yuv420sp_uv_col_loop_uv: + pld [r1, #128] + pld [r2, #128] + vld1.8 d0, [r1]! + vld1.8 d1, [r2]! + vst2.8 {d0, d1}, [r3]! + sub r6, r6, #8 + cmp r6, #7 + bgt yuv420sp_uv_col_loop_uv + + cmp r6, #0 + beq yuv420sp_uv_row_loop_end_uv + @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read + @//Ex if width is 162, above loop will process 160 pixels. And + @//Both source and destination will point to 146th pixel and then 16 bytes will be read + @// and written using VLD1 and VST1 + rsb r6, r6, #8 + sub r1, r1, r6 + sub r2, r2, r6 + sub r3, r3, r6, lsl #1 + + vld1.8 d0, [r1]! + vld1.8 d1, [r2]! + vst2.8 {d0, d1}, [r3]! 
+ +yuv420sp_uv_row_loop_end_uv: + add r1, r1, r7 + add r2, r2, r7 + add r3, r3, r8 + subs r4, r4, #1 + bgt yuv420sp_uv_row_loop_uv + @//POP THE REGISTERS + vpop {d8-d15} + ldmfd sp!, {r4-r12, pc} + + + + + +@ /** +@ ******************************************************************************* +@ * +@ * @brief ih264e_fmt_conv_422i_to_420sp_a9q +@ * Function used from format conversion or frame copy +@ * +@ * +@ * +@ *Inputs : r0 - pu1_y - UWORD8 pointer to y plane. +@ * r1 - pu1_u - UWORD8 pointer to u plane. +@ * r2 - pu1_v - UWORD8 pointer to u plane. +@ * r3 - pu2_yuv422i - UWORD16 pointer to yuv422iimage. +@ * stack + 40 - u4_width - Width of the Y plane. +@ * 44 - u4_height - Height of the Y plane. +@ * 48 - u4_stride_y - Stride in pixels of Y plane. +@ * 52 - u4_stride_u - Stride in pixels of U plane. +@ * 56 - u4_stride_v - Stride in pixels of V plane. +@ * 60 - u4_stride_yuv422i- Stride in pixels of yuv422i image. +@ * +@ * @par Description +@ * Function used from copying or converting a reference frame to display buffer +@ * in non shared mode +@ * +@ * @param[in] pu1_y_dst +@ * Output Y pointer +@ * +@ * @param[in] pu1_u_dst +@ * Output U/UV pointer ( UV is interleaved in the same format as that of input) +@ * +@ * @param[in] pu1_v_dst +@ * Output V pointer ( used in 420P output case) +@ * +@ * @param[in] u4_dst_y_strd +@ * Stride of destination Y buffer +@ * +@ * @param[in] u4_dst_u_strd +@ * Stride of destination U/V buffer +@ * +@ * +@ * @param[in] blocking +@ * To indicate whether format conversion should wait till frame is reconstructed +@ * and then return after complete copy is done. To be set to 1 when called at the +@ * end of frame processing and set to 0 when called between frame processing modules +@ * in order to utilize available MCPS +@ * +@ * @returns Error from IH264E_ERROR_T +@ * +@ * @remarks +@ * Assumes that the stride of U and V buffers are same. +@ * This is correct in most cases +@ * If a case comes where this is not true we need to modify the fmt conversion funcnions called inside also +@ * Since we read 4 pixels ata time the width should be aligned to 4 +@ * In assembly width should be aligned to 16 and height to 2. 
+@ * +@ * +@ * Revision History : +@ * DD MM YYYY Author(s) Changes (Describe the changes made) +@ * 07 06 2010 Harinarayanan K K Adapeted to 422p +@ * +@ ******************************************************************************* +@ */ + +@//` +@*/ + .global ih264e_fmt_conv_422i_to_420sp_a9q +ih264e_fmt_conv_422i_to_420sp_a9q: + stmfd sp!, {r4-r12, lr} @// Back the register which are used + + + + @/* Do the preprocessing before the main loops start */ + @// Load the parameters from stack + ldr r4, [sp, #48] @// Load u4_stride_y from stack + + ldr r5, [sp, #60] @// Load u4_stride_yuv422i from stack + add r6, r0, r4 @// pu1_y_nxt_row = pu1_y + u4_stride_y + + ldr r7, [sp, #40] @// Load u4_width from stack + add r8, r3, r5, lsl #1 @// pu2_yuv422i_nxt_row = pu2_yuv422i_y + u4_stride_yuv422i(2 Bytes for each pixel) + + ldr r9, [sp, #52] @// Load u4_stride_u from stack + sub r12, r4, r7 @// u2_offset1 = u4_stride_y - u4_width + +@LDR r10,[sp,#56] ;// Load u4_stride_v from stack + sub r14, r5, r7 @// u2_offset_yuv422i = u4_stride_yuv422i - u4_width + + ldr r11, [sp, #44] @// Load u4_height from stack + sub r9, r9, r7 @// u2_offset2 = u4_stride_u - u4_width >> 1 + +@ SUB r10,r10,r7,ASR #1 ;// u2_offset3 = u4_stride_v - u4_width >> 1 + mov r14, r14, lsl #1 @// u2_offset_yuv422i = u2_offset_yuv422i * 2 + + mov r7, r7, asr #4 @// u4_width = u4_width / 16 (u4_width >> 4) + mov r11, r11, asr #1 @// u4_width = u4_width / 2 (u4_width >> 1) + + add r4, r12, r4 @// u2_offset1 = u2_offset1 + u4_stride_y + add r5, r14, r5, lsl #1 @// u2_offset_yuv422i = u2_offset_yuv422i + u4_stride_yuv422i + + vpush {d8-d15} + +@// Register Assignment +@// pu1_y - r0 +@// pu1_y_nxt_row - r6 +@// pu1_u - r1 +@// pu1_v - r2 +@// pu2_yuv422i - r3 +@// pu2_yuv422i_nxt_row - r8 +@// u2_offset1 - r4 +@// u2_offset2 - r9 +@// u2_offset3 - r10 +@// u2_offset_yuv422i - r5 +@// u4_width / 16 - r7 +@// u4_height / 2 - r11 +@// inner loop count - r12 +yuv420_to_yuv422i_hight_loop: + + mov r12, r7 @// Inner loop count = u4_width / 16 + +yuv420_to_yuv422i_width_loop: + vld4.8 {d0, d1, d2, d3}, [r3]! @// Load the 16 elements of row 1 + vld4.8 {d4, d5, d6, d7}, [r8]! @// Load the 16 elements of row 2 + subs r12, r12, #1 + + vrhadd.u8 d0, d0, d4 + vrhadd.u8 d2, d2, d6 + + vst2.8 {d1, d3}, [r0]! @// Store the 16 elements of row1 Y + vst2.8 {d5, d7}, [r6]! @// Store the 16 elements of row2 Y + + vst2.8 {d0, d2}, [r1]! @// Store the 8 elements of row1/2 U + + bgt yuv420_to_yuv422i_width_loop + + @// Update the buffer pointer so that they will refer to next pair of rows + add r0, r0, r4 @// pu1_y = pu1_y + u2_offset1 + add r6, r6, r4 @// pu1_y_nxt_row = pu1_y_nxt_row + u2_offset1 + + add r1, r1, r9 @// pu1_u = pu1_u + u2_offset2 + subs r11, r11, #1 + + add r3, r3, r5 @// pu2_yuv422i = pu2_yuv422i + u2_offset_yuv422i + + add r8, r8, r5 @// pu2_yuv422i_nxt_row = pu2_yuv422i_nxt_row + u2_offset_yuv422i + bgt yuv420_to_yuv422i_hight_loop + vpop {d8-d15} + ldmfd sp!, {r4-r12, pc} @// Restore the register which are used + + + diff --git a/encoder/arm/ih264e_function_selector.c b/encoder/arm/ih264e_function_selector.c new file mode 100755 index 0000000..bb181c1 --- /dev/null +++ b/encoder/arm/ih264e_function_selector.c @@ -0,0 +1,170 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_function_selector.c +* +* @brief +* Contains functions to initialize function pointers used in h264 +* +* @author +* Ittiam +* +* @par List of Functions: +* +* @remarks +* None +* +******************************************************************************* +*/ + + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System Include Files */ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +/* User Include Files */ +#include "ih264_typedefs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264_defs.h" +#include "ih264_size_defs.h" +#include "ih264e_defs.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_defs.h" +#include "ih264_error.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" + +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include "ih264e_defs.h" +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_platform_macros.h" + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function pointers of +* codec context basing on the architecture in use +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +#ifdef ARMV8 +void ih264e_init_function_ptr(void *pv_codec) +{ + codec_t *ps_codec = (codec_t *)pv_codec; + ih264e_init_function_ptr_generic(ps_codec); + switch(ps_codec->s_cfg.e_arch) + { + case ARCH_ARM_NONEON: + break; + case ARCH_ARM_A53: + case ARCH_ARM_A57: + case ARCH_ARM_V8_NEON: + ih264e_init_function_ptr_neon_av8(ps_codec); + break; + default: + ih264e_init_function_ptr_neon_av8(ps_codec); + break; + } +} + +/** +******************************************************************************* +* +* @brief Determine the architecture of the encoder executing environment +* +* @par Description: This routine returns the architecture of the enviro- +* ment in which the current encoder is being tested +* +* @param[in] void +* +* @returns IV_ARCH_T +* architecture +* +* @remarks none +* +******************************************************************************* +*/ +IV_ARCH_T ih264e_default_arch(void) +{ + return 
ARCH_ARM_V8_NEON; +} + +#else + +void ih264e_init_function_ptr(void *pv_codec) +{ + codec_t *ps_codec = (codec_t *)pv_codec; + ih264e_init_function_ptr_generic(ps_codec); + switch(ps_codec->s_cfg.e_arch) + { + case ARCH_ARM_NONEON: + break; + case ARCH_ARM_A9Q: + case ARCH_ARM_A9A: + case ARCH_ARM_A9: + case ARCH_ARM_A7: + case ARCH_ARM_A5: + case ARCH_ARM_A15: + ih264e_init_function_ptr_neon_a9q(ps_codec); + break; + default: + ih264e_init_function_ptr_neon_a9q(ps_codec); + break; + } +} + +IV_ARCH_T ih264e_default_arch(void) +{ + return ARCH_ARM_A9Q; +} + +#endif diff --git a/encoder/arm/ih264e_function_selector_a9q.c b/encoder/arm/ih264e_function_selector_a9q.c new file mode 100755 index 0000000..8b2879b --- /dev/null +++ b/encoder/arm/ih264e_function_selector_a9q.c @@ -0,0 +1,252 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264e_function_selector_generic.c +* +* @brief +* Contains functions to initialize function pointers of codec context +* +* @author +* Ittiam +* +* @par List of Functions: +* - ih264e_init_function_ptr_generic +* +* @remarks +* None +* +******************************************************************************* +*/ + + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System Include files */ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +/* User Include files */ +#include "ih264_typedefs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264_defs.h" +#include "ih264_size_defs.h" +#include "ih264e_defs.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_defs.h" +#include "ih264_error.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" + +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_platform_macros.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264e_defs.h" +#include "ih264e_structs.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264e_core_coding.h" +#include "ih264_cavlc_tables.h" +#include "ih264e_cavlc.h" +#include "ih264_padding.h" +#include "ih264e_intra_modes_eval.h" +#include "ih264_mem_fns.h" +#include "ih264e_fmt_conv.h" 
+#include "ih264e_half_pel.h" + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function pointers of +* codec context basing on the architecture in use +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_init_function_ptr_neon_a9q(codec_t *ps_codec) +{ + WORD32 i= 0; + + /* curr proc ctxt */ + process_ctxt_t *ps_proc = NULL; + me_ctxt_t *ps_me_ctxt = NULL; + + /* Init function pointers for intra pred leaf level functions luma + * Intra 16x16 */ + ps_codec->apf_intra_pred_16_l[0] = ih264_intra_pred_luma_16x16_mode_vert_a9q; + ps_codec->apf_intra_pred_16_l[1] = ih264_intra_pred_luma_16x16_mode_horz_a9q; + ps_codec->apf_intra_pred_16_l[2] = ih264_intra_pred_luma_16x16_mode_dc_a9q; + ps_codec->apf_intra_pred_16_l[3] = ih264_intra_pred_luma_16x16_mode_plane_a9q; + + /* Init function pointers for intra pred leaf level functions luma + * Intra 4x4 */ + ps_codec->apf_intra_pred_4_l[0] = ih264_intra_pred_luma_4x4_mode_vert_a9q; + ps_codec->apf_intra_pred_4_l[1] = ih264_intra_pred_luma_4x4_mode_horz_a9q; + ps_codec->apf_intra_pred_4_l[2] = ih264_intra_pred_luma_4x4_mode_dc_a9q; + ps_codec->apf_intra_pred_4_l[3] = ih264_intra_pred_luma_4x4_mode_diag_dl_a9q; + ps_codec->apf_intra_pred_4_l[4] = ih264_intra_pred_luma_4x4_mode_diag_dr_a9q; + ps_codec->apf_intra_pred_4_l[5] = ih264_intra_pred_luma_4x4_mode_vert_r_a9q; + ps_codec->apf_intra_pred_4_l[6] = ih264_intra_pred_luma_4x4_mode_horz_d_a9q; + ps_codec->apf_intra_pred_4_l[7] = ih264_intra_pred_luma_4x4_mode_vert_l_a9q; + ps_codec->apf_intra_pred_4_l[8] = ih264_intra_pred_luma_4x4_mode_horz_u_a9q; + + /* Init function pointers for intra pred leaf level functions luma + * Intra 8x8 */ + ps_codec->apf_intra_pred_8_l[0] = ih264_intra_pred_luma_8x8_mode_vert_a9q; + ps_codec->apf_intra_pred_8_l[2] = ih264_intra_pred_luma_8x8_mode_dc_a9q; + ps_codec->apf_intra_pred_8_l[3] = ih264_intra_pred_luma_8x8_mode_diag_dl_a9q; + ps_codec->apf_intra_pred_8_l[4] = ih264_intra_pred_luma_8x8_mode_diag_dr_a9q; + ps_codec->apf_intra_pred_8_l[5] = ih264_intra_pred_luma_8x8_mode_vert_r_a9q; + ps_codec->apf_intra_pred_8_l[6] = ih264_intra_pred_luma_8x8_mode_horz_d_a9q; + ps_codec->apf_intra_pred_8_l[7] = ih264_intra_pred_luma_8x8_mode_vert_l_a9q; + ps_codec->apf_intra_pred_8_l[8] = ih264_intra_pred_luma_8x8_mode_horz_u_a9q; + + /* Init function pointers for intra pred leaf level functions chroma + * Intra 8x8 */ + ps_codec->apf_intra_pred_c[0] = ih264_intra_pred_chroma_8x8_mode_dc_a9q; + ps_codec->apf_intra_pred_c[1] = ih264_intra_pred_chroma_8x8_mode_horz_a9q; + ps_codec->apf_intra_pred_c[2] = ih264_intra_pred_chroma_8x8_mode_vert_a9q; + ps_codec->apf_intra_pred_c[3] = ih264_intra_pred_chroma_8x8_mode_plane_a9q; + + /* Init forward transform fn ptr */ + ps_codec->pf_resi_trans_quant_8x8 = ih264_resi_trans_quant_8x8; + ps_codec->pf_resi_trans_quant_4x4 = ih264_resi_trans_quant_4x4_a9; + ps_codec->pf_resi_trans_quant_chroma_4x4 = ih264_resi_trans_quant_chroma_4x4_a9; + ps_codec->pf_hadamard_quant_4x4 = ih264_hadamard_quant_4x4_a9; + ps_codec->pf_hadamard_quant_2x2_uv = ih264_hadamard_quant_2x2_uv_a9; + + /* Init inverse transform fn ptr */ + ps_codec->pf_iquant_itrans_recon_8x8 = ih264_iquant_itrans_recon_8x8; + ps_codec->pf_iquant_itrans_recon_4x4 = 
ih264_iquant_itrans_recon_4x4_a9; + ps_codec->pf_iquant_itrans_recon_4x4_dc = ih264_iquant_itrans_recon_4x4_dc_a9; + ps_codec->pf_iquant_itrans_recon_chroma_4x4 = ih264_iquant_itrans_recon_chroma_4x4_a9; + ps_codec->pf_iquant_itrans_recon_chroma_4x4_dc = ih264_iquant_itrans_recon_chroma_4x4_dc_a9; + ps_codec->pf_ihadamard_scaling_4x4 = ih264_ihadamard_scaling_4x4_a9; + ps_codec->pf_ihadamard_scaling_2x2_uv = ih264_ihadamard_scaling_2x2_uv_a9; + ps_codec->pf_interleave_copy = ih264_interleave_copy_a9; + + /* Init fn ptr luma core coding */ + ps_codec->luma_energy_compaction[0] = ih264e_code_luma_intra_macroblock_16x16; + ps_codec->luma_energy_compaction[1] = ih264e_code_luma_intra_macroblock_4x4; + ps_codec->luma_energy_compaction[3] = ih264e_code_luma_inter_macroblock_16x16; + + /* Init fn ptr chroma core coding */ + ps_codec->chroma_energy_compaction[0] = ih264e_code_chroma_intra_macroblock_8x8; + ps_codec->chroma_energy_compaction[1] = ih264e_code_chroma_inter_macroblock_8x8; + + /* Init fn ptr luma deblocking */ + ps_codec->pf_deblk_luma_vert_bs4 = ih264_deblk_luma_vert_bs4_a9; + ps_codec->pf_deblk_luma_vert_bslt4 = ih264_deblk_luma_vert_bslt4_a9; + ps_codec->pf_deblk_luma_horz_bs4 = ih264_deblk_luma_horz_bs4_a9; + ps_codec->pf_deblk_luma_horz_bslt4 = ih264_deblk_luma_horz_bslt4_a9; + + /* Init fn ptr chroma deblocking */ + ps_codec->pf_deblk_chroma_vert_bs4 = ih264_deblk_chroma_vert_bs4_a9; + ps_codec->pf_deblk_chroma_vert_bslt4 = ih264_deblk_chroma_vert_bslt4_a9; + ps_codec->pf_deblk_chroma_horz_bs4 = ih264_deblk_chroma_horz_bs4_a9; + ps_codec->pf_deblk_chroma_horz_bslt4 = ih264_deblk_chroma_horz_bslt4_a9; + + /* write mb syntax layer */ + ps_codec->pf_write_mb_syntax_layer[ISLICE] = ih264e_write_islice_mb; + ps_codec->pf_write_mb_syntax_layer[PSLICE] = ih264e_write_pslice_mb; + + /* Padding Functions */ + ps_codec->pf_pad_top = ih264_pad_top_a9q; + ps_codec->pf_pad_bottom = ih264_pad_bottom; + ps_codec->pf_pad_left_luma = ih264_pad_left_luma_a9q; + ps_codec->pf_pad_left_chroma = ih264_pad_left_chroma_a9q; + ps_codec->pf_pad_right_luma = ih264_pad_right_luma_a9q; + ps_codec->pf_pad_right_chroma = ih264_pad_right_chroma_a9q; + + /* Inter pred leaf level functions */ + ps_codec->pf_inter_pred_luma_copy = ih264_inter_pred_luma_copy_a9q; + ps_codec->pf_inter_pred_luma_horz = ih264_inter_pred_luma_horz_a9q; + ps_codec->pf_inter_pred_luma_vert = ih264_inter_pred_luma_vert_a9q; + ps_codec->pf_inter_pred_luma_bilinear = ih264_inter_pred_luma_bilinear_a9q; + ps_codec->pf_inter_pred_chroma = ih264_inter_pred_chroma_a9q; + + /* sad me level functions */ + ps_codec->apf_compute_sad_16x16[0] = ime_compute_sad_16x16_a9q; + ps_codec->apf_compute_sad_16x16[1] = ime_compute_sad_16x16_fast_a9q; + ps_codec->pf_compute_sad_16x8 = ime_compute_sad_16x8_a9q; + + /* memor handling operations */ + ps_codec->pf_mem_cpy = ih264_memcpy_a9q; + ps_codec->pf_mem_cpy_mul8 = ih264_memcpy_mul_8_a9q; + ps_codec->pf_mem_set = ih264_memset_a9q; + ps_codec->pf_mem_set_mul8 = ih264_memset_mul_8_a9q; + + /* sad me level functions */ + for(i = 0; i < (MAX_PROCESS_CTXT); i++) + { + ps_proc = &ps_codec->as_process[i]; + ps_me_ctxt = &ps_proc->s_me_ctxt; + ps_me_ctxt->pf_ime_compute_sad_16x16[0] = ime_compute_sad_16x16_a9q; + ps_me_ctxt->pf_ime_compute_sad_16x16[1] = ime_compute_sad_16x16_fast_a9q; + ps_me_ctxt->pf_ime_compute_sad_16x8 = ime_compute_sad_16x8_a9q; + ps_me_ctxt->pf_ime_compute_sad4_diamond = ime_calculate_sad4_prog_a9q; + ps_me_ctxt->pf_ime_compute_sad3_diamond = ime_calculate_sad3_prog_a9q; + 
ps_me_ctxt->pf_ime_compute_sad2_diamond = ime_calculate_sad2_prog_a9q; + ps_me_ctxt->pf_ime_sub_pel_compute_sad_16x16 = ime_sub_pel_compute_sad_16x16_a9q; + ps_me_ctxt->pf_ime_compute_sad_stat_luma_16x16 = ime_compute_satqd_16x16_lumainter_a9q; + } + + /* intra mode eval -encoder level function */ + ps_codec->pf_ih264e_evaluate_intra16x16_modes = ih264e_evaluate_intra16x16_modes_a9q; + ps_codec->pf_ih264e_evaluate_intra_chroma_modes = ih264e_evaluate_intra_chroma_modes_a9q; + ps_codec->pf_ih264e_evaluate_intra_4x4_modes = ih264e_evaluate_intra_4x4_modes_a9q; + + /* csc */ + ps_codec->pf_ih264e_conv_420p_to_420sp = ih264e_fmt_conv_420p_to_420sp_a9q; + ps_codec->pf_ih264e_fmt_conv_422i_to_420sp = ih264e_fmt_conv_422i_to_420sp_a9q; + + /* Halp pel generation function - encoder level*/ + ps_codec->pf_ih264e_sixtapfilter_horz = ih264e_sixtapfilter_horz_a9q; + ps_codec->pf_ih264e_sixtap_filter_2dvh_vert = ih264e_sixtap_filter_2dvh_vert_a9q; + + return ; + } + diff --git a/encoder/arm/ih264e_function_selector_av8.c b/encoder/arm/ih264e_function_selector_av8.c new file mode 100755 index 0000000..173c2d5 --- /dev/null +++ b/encoder/arm/ih264e_function_selector_av8.c @@ -0,0 +1,259 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264e_function_selector_generic.c +* +* @brief +* Contains functions to initialize function pointers of codec context +* +* @author +* Ittiam +* +* @par List of Functions: +* - ih264e_init_function_ptr_generic +* +* @remarks +* None +* +******************************************************************************* +*/ + + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System Include files */ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +/* User Include files */ +#include "ih264_typedefs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264_defs.h" +#include "ih264_size_defs.h" +#include "ih264e_defs.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_defs.h" +#include "ih264_error.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" + +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_platform_macros.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264e_defs.h" +#include "ih264e_structs.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264e_core_coding.h" +#include "ih264_cavlc_tables.h" +#include "ih264e_cavlc.h" +#include "ih264_padding.h" +#include "ih264e_intra_modes_eval.h" +#include "ih264_mem_fns.h" +#include "ih264e_fmt_conv.h" +#include "ih264e_half_pel.h" + + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function pointers of +* codec context basing on the architecture in use +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_init_function_ptr_neon_av8(codec_t *ps_codec) +{ + + WORD32 i= 0; + + /* curr proc ctxt */ + process_ctxt_t *ps_proc = NULL; + me_ctxt_t *ps_me_ctxt = NULL; + + /* Init function pointers for intra pred leaf level functions luma + * Intra 16x16 */ + ps_codec->apf_intra_pred_16_l[0] = ih264_intra_pred_luma_16x16_mode_vert_av8; + ps_codec->apf_intra_pred_16_l[1] = ih264_intra_pred_luma_16x16_mode_horz_av8; + ps_codec->apf_intra_pred_16_l[2] = ih264_intra_pred_luma_16x16_mode_dc_av8; + ps_codec->apf_intra_pred_16_l[3] = ih264_intra_pred_luma_16x16_mode_plane_av8; + + /* Init function pointers for intra pred leaf level functions luma + * Intra 4x4 */ + ps_codec->apf_intra_pred_4_l[0] = ih264_intra_pred_luma_4x4_mode_vert_av8; + ps_codec->apf_intra_pred_4_l[1] = ih264_intra_pred_luma_4x4_mode_horz_av8; + ps_codec->apf_intra_pred_4_l[2] = ih264_intra_pred_luma_4x4_mode_dc_av8; + ps_codec->apf_intra_pred_4_l[3] = 
+    ps_codec->apf_intra_pred_4_l[4] = ih264_intra_pred_luma_4x4_mode_diag_dr_av8;
+    ps_codec->apf_intra_pred_4_l[5] = ih264_intra_pred_luma_4x4_mode_vert_r_av8;
+    ps_codec->apf_intra_pred_4_l[6] = ih264_intra_pred_luma_4x4_mode_horz_d_av8;
+    ps_codec->apf_intra_pred_4_l[7] = ih264_intra_pred_luma_4x4_mode_vert_l_av8;
+    ps_codec->apf_intra_pred_4_l[8] = ih264_intra_pred_luma_4x4_mode_horz_u_av8;
+
+    /* Init function pointers for intra pred leaf level functions luma
+     * Intra 8x8 */
+    ps_codec->apf_intra_pred_8_l[0] = ih264_intra_pred_luma_8x8_mode_vert_av8;
+    ps_codec->apf_intra_pred_8_l[2] = ih264_intra_pred_luma_8x8_mode_dc_av8;
+    ps_codec->apf_intra_pred_8_l[3] = ih264_intra_pred_luma_8x8_mode_diag_dl_av8;
+    ps_codec->apf_intra_pred_8_l[4] = ih264_intra_pred_luma_8x8_mode_diag_dr_av8;
+    ps_codec->apf_intra_pred_8_l[5] = ih264_intra_pred_luma_8x8_mode_vert_r_av8;
+    ps_codec->apf_intra_pred_8_l[6] = ih264_intra_pred_luma_8x8_mode_horz_d_av8;
+    ps_codec->apf_intra_pred_8_l[7] = ih264_intra_pred_luma_8x8_mode_vert_l_av8;
+    ps_codec->apf_intra_pred_8_l[8] = ih264_intra_pred_luma_8x8_mode_horz_u_av8;
+
+    /* Init function pointers for intra pred leaf level functions chroma
+     * Intra 8x8 */
+    ps_codec->apf_intra_pred_c[0] = ih264_intra_pred_chroma_8x8_mode_dc_av8;
+    ps_codec->apf_intra_pred_c[1] = ih264_intra_pred_chroma_8x8_mode_horz_av8;
+    ps_codec->apf_intra_pred_c[2] = ih264_intra_pred_chroma_8x8_mode_vert_av8;
+    ps_codec->apf_intra_pred_c[3] = ih264_intra_pred_chroma_8x8_mode_plane_av8;
+
+
+    /* Init forward transform fn ptr */
+    ps_codec->pf_resi_trans_quant_8x8 = ih264_resi_trans_quant_8x8;
+    ps_codec->pf_resi_trans_quant_4x4 = ih264_resi_trans_quant_4x4_av8;
+    ps_codec->pf_resi_trans_quant_chroma_4x4 = ih264_resi_trans_quant_chroma_4x4_av8;
+    ps_codec->pf_hadamard_quant_4x4 = ih264_hadamard_quant_4x4_av8;
+    ps_codec->pf_hadamard_quant_2x2_uv = ih264_hadamard_quant_2x2_uv_av8;
+
+    /* Init inverse transform fn ptr */
+    ps_codec->pf_iquant_itrans_recon_8x8 = ih264_iquant_itrans_recon_8x8_av8;
+    ps_codec->pf_iquant_itrans_recon_4x4 = ih264_iquant_itrans_recon_4x4_av8;
+    ps_codec->pf_iquant_itrans_recon_4x4_dc = ih264_iquant_itrans_recon_4x4_dc_av8;
+    ps_codec->pf_iquant_itrans_recon_chroma_4x4 = ih264_iquant_itrans_recon_chroma_4x4_av8;
+    ps_codec->pf_iquant_itrans_recon_chroma_4x4_dc = ih264_iquant_itrans_recon_chroma_4x4_dc_av8;
+    ps_codec->pf_ihadamard_scaling_4x4 = ih264_ihadamard_scaling_4x4_av8;
+    ps_codec->pf_ihadamard_scaling_2x2_uv = ih264_ihadamard_scaling_2x2_uv_av8;
+    ps_codec->pf_interleave_copy = ih264_interleave_copy_av8;
+
+    /* Init fn ptr luma core coding */
+    ps_codec->luma_energy_compaction[0] = ih264e_code_luma_intra_macroblock_16x16;
+    ps_codec->luma_energy_compaction[1] = ih264e_code_luma_intra_macroblock_4x4;
+    ps_codec->luma_energy_compaction[3] = ih264e_code_luma_inter_macroblock_16x16;
+
+    /* Init fn ptr chroma core coding */
+    ps_codec->chroma_energy_compaction[0] = ih264e_code_chroma_intra_macroblock_8x8;
+    ps_codec->chroma_energy_compaction[1] = ih264e_code_chroma_inter_macroblock_8x8;
+
+    /* Init fn ptr luma deblocking */
+    ps_codec->pf_deblk_luma_vert_bs4 = ih264_deblk_luma_vert_bs4_av8;
+    ps_codec->pf_deblk_luma_vert_bslt4 = ih264_deblk_luma_vert_bslt4_av8;
+    ps_codec->pf_deblk_luma_horz_bs4 = ih264_deblk_luma_horz_bs4_av8;
+    ps_codec->pf_deblk_luma_horz_bslt4 = ih264_deblk_luma_horz_bslt4_av8;
+
+    /* Init fn ptr chroma deblocking */
+    ps_codec->pf_deblk_chroma_vert_bs4 = ih264_deblk_chroma_vert_bs4_av8;
+    ps_codec->pf_deblk_chroma_vert_bslt4 = ih264_deblk_chroma_vert_bslt4_av8;
+    ps_codec->pf_deblk_chroma_horz_bs4 = ih264_deblk_chroma_horz_bs4_av8;
+    ps_codec->pf_deblk_chroma_horz_bslt4 = ih264_deblk_chroma_horz_bslt4_av8;
+
+    /* write mb syntax layer */
+    ps_codec->pf_write_mb_syntax_layer[ISLICE] = ih264e_write_islice_mb;
+    ps_codec->pf_write_mb_syntax_layer[PSLICE] = ih264e_write_pslice_mb;
+
+    /* Padding Functions */
+    ps_codec->pf_pad_top = ih264_pad_top_av8;
+    ps_codec->pf_pad_bottom = ih264_pad_bottom;
+    ps_codec->pf_pad_left_luma = ih264_pad_left_luma_av8;
+    ps_codec->pf_pad_left_chroma = ih264_pad_left_chroma_av8;
+    ps_codec->pf_pad_right_luma = ih264_pad_right_luma_av8;
+    ps_codec->pf_pad_right_chroma = ih264_pad_right_chroma_av8;
+
+    /* Inter pred leaf level functions */
+    ps_codec->pf_inter_pred_luma_copy = ih264_inter_pred_luma_copy_av8;
+    ps_codec->pf_inter_pred_luma_horz = ih264_inter_pred_luma_horz_av8;
+    ps_codec->pf_inter_pred_luma_vert = ih264_inter_pred_luma_vert_av8;
+    ps_codec->pf_inter_pred_luma_bilinear = ih264_inter_pred_luma_bilinear;
+    ps_codec->pf_inter_pred_chroma = ih264_inter_pred_chroma_av8;
+
+    /* sad me level functions */
+    ps_codec->apf_compute_sad_16x16[0] = ime_compute_sad_16x16_av8;
+    ps_codec->apf_compute_sad_16x16[1] = ime_compute_sad_16x16_fast_av8;
+    ps_codec->pf_compute_sad_16x8 = ime_compute_sad_16x8_av8;
+
+    /* memory handling operations */
+    ps_codec->pf_mem_cpy = ih264_memcpy_av8;
+    ps_codec->pf_mem_cpy_mul8 = ih264_memcpy_mul_8_av8;
+    ps_codec->pf_mem_set = ih264_memset_av8;
+    ps_codec->pf_mem_set_mul8 = ih264_memset_mul_8_av8;
+
+    /* sad me level functions */
+    for(i = 0; i < (MAX_PROCESS_CTXT); i++)
+    {
+        ps_proc = &ps_codec->as_process[i];
+        ps_me_ctxt = &ps_proc->s_me_ctxt;
+        ps_me_ctxt->pf_ime_compute_sad_16x16[0] = ime_compute_sad_16x16_av8;
+        ps_me_ctxt->pf_ime_compute_sad_16x16[1] = ime_compute_sad_16x16_fast_av8;
+        ps_me_ctxt->pf_ime_compute_sad_16x8 = ime_compute_sad_16x8_av8;
+        ps_me_ctxt->pf_ime_compute_sad4_diamond = ime_calculate_sad4_prog_av8;
+        ps_me_ctxt->pf_ime_compute_sad3_diamond = ime_calculate_sad3_prog_av8;
+        ps_me_ctxt->pf_ime_compute_sad2_diamond = ime_calculate_sad2_prog_av8;
+        ps_me_ctxt->pf_ime_sub_pel_compute_sad_16x16 = ime_sub_pel_compute_sad_16x16_av8;
+        ps_me_ctxt->pf_ime_compute_sad_stat_luma_16x16 = ime_compute_satqd_16x16_lumainter_av8;
+    }
+
+    /* intra mode eval - encoder level function */
+    ps_codec->pf_ih264e_evaluate_intra16x16_modes = ih264e_evaluate_intra16x16_modes_av8;
+    ps_codec->pf_ih264e_evaluate_intra_chroma_modes = ih264e_evaluate_intra_chroma_modes_av8;
+    ps_codec->pf_ih264e_evaluate_intra_4x4_modes = ih264e_evaluate_intra_4x4_modes;
+
+    /* csc */
+    ps_codec->pf_ih264e_conv_420p_to_420sp = ih264e_fmt_conv_420p_to_420sp;
+    ps_codec->pf_ih264e_fmt_conv_422i_to_420sp = ih264e_fmt_conv_422i_to_420sp;
+
+    /* Half pel generation function - encoder level */
+    ps_codec->pf_ih264e_sixtapfilter_horz = ih264e_sixtapfilter_horz_av8;
+    ps_codec->pf_ih264e_sixtap_filter_2dvh_vert = ih264e_sixtap_filter_2dvh_vert_av8;
+
+    return;
+}
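Both selectors install encoder-level half-pel generators, which the next file implements in NEON assembly. As a reference for the arithmetic, here is a scalar C illustration of the six-tap luma half-pel filter (H.264 sec 8.4.2.2.1); it is written for this note, not taken from the patch, and the CLIP_U8 helper plus the explicit height parameter are assumptions (the assembly fixes its own row count and steps two rows per loop iteration).

#include "ih264_typedefs.h"

#define CLIP_U8(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))

/* Scalar sketch of the horizontal six-tap half-pel filter:
 * out = Clip((A - 5B + 20C + 20D - 5E + F + 16) >> 5), 17 columns wide. */
void ref_sixtapfilter_horz(UWORD8 *pu1_src, UWORD8 *pu1_dst,
                           WORD32 src_strd, WORD32 dst_strd, WORD32 ht)
{
    for (WORD32 y = 0; y < ht; y++)
    {
        for (WORD32 x = 0; x < 17; x++)
        {
            WORD32 acc = pu1_src[x - 2] + pu1_src[x + 3]
                       + 20 * (pu1_src[x] + pu1_src[x + 1])
                       - 5 * (pu1_src[x - 1] + pu1_src[x + 2]);
            pu1_dst[x] = (UWORD8)CLIP_U8((acc + 16) >> 5);
        }
        pu1_src += src_strd;
        pu1_dst += dst_strd;
    }
}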
diff --git a/encoder/arm/ih264e_half_pel.s b/encoder/arm/ih264e_half_pel.s
new file mode 100755
index 0000000..1b9a87a
--- /dev/null
+++ b/encoder/arm/ih264e_half_pel.s
@@ -0,0 +1,951 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ *  ih264e_half_pel.s
+@ *
+@ * @brief
+@ *  Half pel (six tap filter) interpolation routines used by the encoder
+@ *
+@ * @author
+@ *  Ittiam
+@ *
+@ * @par List of Functions:
+@ *  ih264e_sixtapfilter_horz
+@ *  ih264e_sixtap_filter_2dvh_vert
+@ *
+@ * @remarks
+@ *  None
+@ *
+@ *******************************************************************************
+@ */
+
+
+.text
+.p2align 2
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*  Inter prediction luma filter for horizontal input (filter runs for
+@*  width = 17 and height = 16)
+@*
+@* @par Description:
+@*  Applies a 6 tap horizontal filter. The output is clipped to 8 bits,
+@*  sec 8.4.2.2.1 titled "Luma sample interpolation process"
+@*
+@* @param[in] pu1_src
+@*  UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @returns none
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@*/
+@void ih264e_sixtapfilter_horz(UWORD8 *pu1_src,
+@                              UWORD8 *pu1_dst,
+@                              WORD32 src_strd,
+@                              WORD32 dst_strd);
+
+
+.equ HALFPEL_WIDTH, 17 + 1        @ (make it even, two rows are processed at a time)
+
+
+    .global ih264e_sixtapfilter_horz_a9q
+ih264e_sixtapfilter_horz_a9q:
+    stmfd sp!, {lr}
+
+    vmov.i8 d0, #5
+    sub r0, r0, #2
+
+    vmov.i8 d1, #20
+    mov r14, #HALFPEL_WIDTH
+    vpush {d8-d15}
+
+filter_horz_loop:
+
+
+    vld1.8 {d2, d3, d4}, [r0], r2     @// Load row0
+    vld1.8 {d5, d6, d7}, [r0], r2     @// Load row1
+
+    @// Processing row0 and row1
+
+    vext.8 d31, d2, d3, #5            @//extract a[5] (column1,row0)
+    vext.8 d30, d3, d4, #5            @//extract a[5] (column2,row0)
+
+    vaddl.u8 q4, d31, d2              @// a0 + a5 (column1,row0)
+    vext.8 d29, d4, d4, #5            @//extract a[5] (column3,row0)
+    vaddl.u8 q5, d30, d3              @// a0 + a5 (column2,row0)
+    vext.8 d28, d5, d6, #5            @//extract a[5] (column1,row1)
+    vaddl.u8 q6, d29, d4              @// a0 + a5 (column3,row0)
+    vext.8 d27, d6, d7, #5            @//extract a[5] (column2,row1)
+    vaddl.u8 q7, d28, d5              @// a0 + a5 (column1,row1)
+    vext.8 d26, d7, d7, #5            @//extract a[5] (column3,row1)
+
+    vaddl.u8 q8, d27, d6              @// a0 + a5 (column2,row1)
+    vext.8 d31, d2, d3, #2            @//extract a[2] (column1,row0)
+    vaddl.u8 q9, d26, d7              @// a0 + a5 (column3,row1)
+    vext.8 d30, d3, d4, #2            @//extract a[2] (column2,row0)
+    vmlal.u8 q4, d31, d1              @// a0 + a5 + 20a2 (column1,row0)
+    vext.8 d29, d4, d4, #2            @//extract a[2] (column3,row0)
+    vmlal.u8 q5, d30, d1              @// a0 + a5 + 20a2 (column2,row0)
+    vext.8 d28, d5, d6, #2            @//extract a[2] (column1,row1)
+    vmlal.u8 q6, d29, d1              @// a0 + a5 + 20a2 (column3,row0)
+    vext.8 d27, d6, d7, #2            @//extract a[2] (column2,row1)
+    vmlal.u8 q7, d28, d1              @// a0 + a5 + 20a2 (column1,row1)
+    vext.8 d26, d7, d7, #2            @//extract a[2] (column3,row1)
+
+    vmlal.u8 q8, d27, d1              @// a0 + a5 + 20a2 (column2,row1)
+    vext.8 d31, d2, d3, #3            @//extract a[3] (column1,row0)
+    vmlal.u8 q9, d26, d1              @// a0 + a5 + 20a2 (column3,row1)
+    vext.8 d30, d3, d4, #3            @//extract a[3] (column2,row0)
+    vmlal.u8 q4, d31, d1              @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+    vext.8 d29, d4, d4, #3            @//extract a[3] (column3,row0)
+    vmlal.u8 q5, d30, d1              @// a0 + a5 + 20a2 + 20a3 (column2,row0)
+    vext.8 d28, d5, d6, #3            @//extract a[3] (column1,row1)
+    vmlal.u8 q6, d29, d1              @// a0 + a5 + 20a2 + 20a3 (column3,row0)
+    vext.8 d27, d6, d7, #3            @//extract a[3] (column2,row1)
+    vmlal.u8 q7, d28, d1              @// a0 + a5 + 20a2 + 20a3 (column1,row1)
+    vext.8 d26, d7, d7, #3            @//extract a[3] (column3,row1)
+
+    vmlal.u8 q8, d27, d1              @// a0 + a5 + 20a2 + 20a3 (column2,row1)
+    vext.8 d31, d2, d3, #1            @//extract a[1] (column1,row0)
+    vmlal.u8 q9, d26, d1              @// a0 + a5 + 20a2 + 20a3 (column3,row1)
+    vext.8 d30, d3, d4, #1            @//extract a[1] (column2,row0)
+    vmlsl.u8 q4, d31, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+    vext.8 d29, d4, d4, #1            @//extract a[1] (column3,row0)
+    vmlsl.u8 q5, d30, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
+    vext.8 d28, d5, d6, #1            @//extract a[1] (column1,row1)
+    vmlsl.u8 q6, d29, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
+    vext.8 d27, d6, d7, #1            @//extract a[1] (column2,row1)
+    vmlsl.u8 q7, d28, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+    vext.8 d26, d7, d7, #1            @//extract a[1] (column3,row1)
+
+    vmlsl.u8 q8, d27, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1)
+    vext.8 d31, d2, d3, #4            @//extract a[4] (column1,row0)
+    vmlsl.u8 q9, d26, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row1)
+    vext.8 d30, d3, d4, #4            @//extract a[4] (column2,row0)
+    vmlsl.u8 q4, d31, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+    vext.8 d29, d4, d4, #4            @//extract a[4] (column3,row0)
+    vmlsl.u8 q5, d30, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
+    vext.8 d28, d5, d6, #4            @//extract a[4] (column1,row1)
+    vmlsl.u8 q6, d29, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
+    vext.8 d27, d6, d7, #4            @//extract a[4] (column2,row1)
+    vmlsl.u8 q7, d28, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+    vext.8 d26, d7, d7, #4            @//extract a[4] (column3,row1)
+
+    vmlsl.u8 q8, d27, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1)
+    vmlsl.u8 q9, d26, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row1)
+
+    vqrshrun.s16 d20, q4, #5          @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+    vqrshrun.s16 d21, q5, #5          @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
+    vqrshrun.s16 d22, q6, #5          @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
+    vqrshrun.s16 d23, q7, #5          @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+    vqrshrun.s16 d24, q8, #5          @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1)
+    vqrshrun.s16 d25, q9, #5          @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row1)
+
+    vst1.8 {d20, d21, d22}, [r1], r3  @//Store dest row0
+    vst1.8 {d23, d24, d25}, [r1], r3  @//Store dest row1
+
+    subs r14, r14, #2                 @ decrement counter
+
+    bne filter_horz_loop
+
+    vpop {d8-d15}
+    ldmfd sp!, {pc}
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*  This function implements a two stage cascaded six tap filter. It
+@*  applies the six tap filter in the vertical direction on the
+@*  predictor values, followed by applying the same filter in the
+@*  horizontal direction on the output of the first stage. The six tap
+@*  filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
+@*  interpolation process" (filter runs for width = 17 and height = 17)
+@*
+@* @par Description:
+@*  The function interpolates the predictors first in the vertical direction
+@*  and then in the horizontal direction to output the (1/2,1/2) sample. The
+@*  output of the first stage of the filter is stored in the buffer pointed
+@*  to by pi16_pred1 (only in C) in 16 bit precision.
+@*
+@* @param[in] pu1_src
+@*  UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst1
+@*  UWORD8 pointer to the destination (vertical filtered output)
+@*
+@* @param[out] pu1_dst2
+@*  UWORD8 pointer to the destination (output after applying the horizontal
+@*  filter to the intermediate vertical output)
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride of pu1_dst
+@*
+@* @param[in] pi16_pred1
+@*  Pointer to the 16 bit intermediate buffer (used only in C)
+@*
+@* @param[in] pi16_pred1_strd
+@*  integer destination stride of pi16_pred1
+@*
+@* @returns none
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@*/
+@void ih264e_sixtap_filter_2dvh_vert(UWORD8 *pu1_src,
+@                                    UWORD8 *pu1_dst1,
+@                                    UWORD8 *pu1_dst2,
+@                                    WORD32 src_strd,
+@                                    WORD32 dst_strd,
+@                                    WORD32 *pi16_pred1, /* Pointer to 16 bit intermediate buffer (used only in c) */
+@                                    WORD32 pi16_pred1_strd)
+
+
+
+
+    .global ih264e_sixtap_filter_2dvh_vert_a9q
+
+ih264e_sixtap_filter_2dvh_vert_a9q:
+    stmfd sp!, {r10, r11, r12, lr}
+
+@//r0 - pu1_ref
+@//r3 - u4_ref_width
+    vpush {d8-d15}
+    @// Load six rows for vertical interpolation
+    lsl r12, r3, #1
+    sub r0, r0, r12
+    sub r0, r0, #2
+    vld1.8 {d2, d3, d4}, [r0], r3
+    vld1.8 {d5, d6, d7}, [r0], r3
+    vld1.8 {d8, d9, d10}, [r0], r3
+    mov r12, #5
+    vld1.8 {d11, d12, d13}, [r0], r3
+    mov r14, #20
+    vld1.8 {d14, d15, d16}, [r0], r3
+    vmov.16 d0[0], r12
+    vmov.16 d0[1], r14
+    vld1.8 {d17, d18, d19}, [r0], r3
+    vmov.i8 d1, #20
+
+@// r12 - u2_buff1_width
+@// r14 - u2_buff2_width
+    ldr r12, [sp, #80]
+    add r11, r1, #6
+
+    mov r14, r12
+
+    mov r10, #3                       @loop counter
+
+
+filter_2dvh_loop:
+
+    @// ////////////// ROW 1 ///////////////////////
+
+@// Process first vertical interpolated row
+@// each column is
+    vaddl.u8 q10, d2, d17             @// a0 + a5 (column1,row0)
+    vmov.i8 d31, #5
+    vmlal.u8 q10, d8, d1              @// a0 + a5 + 20a2 (column1,row0)
+    vmlal.u8 q10, d11, d1             @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+    vmlsl.u8 q10, d5, d31             @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+    vmlsl.u8 q10, d14, d31            @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+
+
+    vaddl.u8 q11, d3, d18             @// a0 + a5 (column2,row0)
+    vmlal.u8 q11, d9, d1              @// a0 + a5 + 20a2 (column2,row0)
+    vmlal.u8 q11, d12, d1             @// a0 + a5 + 20a2 + 20a3 (column2,row0)
+    vmlsl.u8 q11, d6, d31             @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
+    vmlsl.u8 q11, d15, d31            @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
+    vext.16 d30, d20, d21, #2         @//extract a[2] (set1)
+
+    vaddl.u8 q12, d4, d19             @// a0 + a5 (column3,row0)
+    vext.16 d29, d20, d21, #3         @//extract a[3] (set1)
+    vmlal.u8 q12, d10, d1             @// a0 + a5 + 20a2 (column3,row0)
+    vmlal.u8 q12, d13, d1             @// a0 + a5 + 20a2 + 20a3 (column3,row0)
+    vmlsl.u8 q12, d7, d31             @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
+    vmlsl.u8 q12, d16, d31            @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
+
+    vqrshrun.s16 d2, q10, #5          @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+    vext.16 d31, d21, d22, #1         @//extract a[5] (set1)
+    vqrshrun.s16 d3,
q11, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + vext.16 d28, d20, d21, #1 @//extract a[1] (set1) + + vaddl.s16 q13, d31, d20 @// a0 + a5 (set1) + vext.16 d31, d22, d23, #1 @//extract a[5] (set2) + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set1) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set1) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) + vmlsl.s16 q13, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) + vext.16 d30, d21, d22, #2 @//extract a[2] (set2) + + vqrshrun.s16 d4, q12, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) + vext.16 d29, d21, d22, #3 @//extract a[3] (set2) + + vext.16 d28, d21, d22, #1 @//extract a[1] (set2) + vaddl.s16 q10, d31, d21 @// a0 + a5 (set2) + vmlal.s16 q10, d30, d0[1] @// a0 + a5 + 20a2 (set2) + vmlal.s16 q10, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set2) + vmlsl.s16 q10, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) + vmlsl.s16 q10, d22, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) + vext.16 d31, d23, d24, #1 @//extract a[5] (set3) + + vext.8 d2, d2, d3, #2 + vst1.8 {d3, d4}, [r11], r12 @// store row1 - 1,1/2 grid + vst1.8 {d2}, [r1], r12 @// store row1 - 1,1/2 grid + + vext.16 d30, d22, d23, #2 @//extract a[2] (set3) + vext.16 d29, d22, d23, #3 @//extract a[3] (set3) + + vaddl.s16 q1, d31, d22 @// a0 + a5 (set3) + vext.16 d28, d22, d23, #1 @//extract a[1] (set3) + vmlal.s16 q1, d30, d0[1] @// a0 + a5 + 20a2 (set3) + vmlal.s16 q1, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set3) + vmlsl.s16 q1, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) + vmlsl.s16 q1, d23, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) + vext.16 d31, d24, d25, #1 @//extract a[5] (set4) + + vshrn.s32 d21, q10, #8 @// shift by 8 and later we will shift by 2 more with rounding (set2) + vext.16 d30, d23, d24, #2 @//extract a[2] (set4) + vshrn.s32 d20, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set1) + vext.16 d29, d23, d24, #3 @//extract a[3] (set4) + + vaddl.s16 q13, d31, d23 @// a0 + a5 (set4) + vext.16 d28, d23, d24, #1 @//extract a[1] (set4) + vext.16 d31, d25, d25, #1 @//extract a[5] (set5) ;//here only first element in the row is valid + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set4) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set4) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) + vmlsl.s16 q13, d24, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) + vext.16 d30, d24, d25, #2 @//extract a[2] (set5) + + vaddl.s16 q11, d31, d24 @// a0 + a5 (set5) + vext.16 d29, d24, d25, #3 @//extract a[3] (set5) + + vext.16 d31, d24, d25, #1 @//extract a[1] (set5) + vshrn.s32 d28, q1, #8 @// shift by 8 and later we will shift by 2 more with rounding (set3) + + vld1.8 {d2, d3, d4}, [r0], r3 @// Load next Row data + vmlal.s16 q11, d30, d0[1] @// a0 + a5 + 20a2 (set5) + vmlal.s16 q11, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set5) + vmlsl.s16 q11, d31, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) + vmlsl.s16 q11, d25, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) + vshrn.s32 d29, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set4) + vqrshrun.s16 d26, q10, #2 @// half,half gird set1,2 + + + @//VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 + @//VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) + + @//VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 + + @//VST1.8 {D26,D27,D28},[r2],r14 ;// store 1/2,1,2 grif values + @// ////////////// ROW 2 /////////////////////// + +@// 
Process first vertical interpolated row +@// each column is + vaddl.u8 q10, d5, d2 @// a0 + a5 (column1,row0) + vmov.i8 d31, #5 + vmlal.u8 q10, d11, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlal.u8 q10, d14, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlsl.u8 q10, d8, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q10, d17, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + + vqrshrun.s16 d27, q14, #2 @// half,half gird set3,4 + vshrn.s32 d28, q11, #8 @// shift by 8 and later we will shift by 2 more with rounding (set5) + + vaddl.u8 q11, d6, d3 @// a0 + a5 (column2,row0) + vmlal.u8 q11, d12, d1 @// a0 + a5 + 20a2 (column2,row0) + vmlal.u8 q11, d15, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0) + vmlsl.u8 q11, d9, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + vmlsl.u8 q11, d18, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + + vqrshrun.s16 d28, q14, #2 @// half,half gird set5 + vext.16 d30, d20, d21, #2 @//extract a[2] (set1) + + vaddl.u8 q12, d7, d4 @// a0 + a5 (column3,row0) + vext.16 d29, d20, d21, #3 @//extract a[3] (set1) + vmlal.u8 q12, d13, d1 @// a0 + a5 + 20a2 (column3,row0) + vmlal.u8 q12, d16, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row0) + vmlsl.u8 q12, d10, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) + vmlsl.u8 q12, d19, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) + vst1.8 {d26, d27, d28}, [r2], r14 @// store 1/2,1,2 grif values + + vqrshrun.s16 d5, q10, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vext.16 d31, d21, d22, #1 @//extract a[5] (set1) + vqrshrun.s16 d6, q11, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + vext.16 d28, d20, d21, #1 @//extract a[1] (set1) + + vaddl.s16 q13, d31, d20 @// a0 + a5 (set1) + vext.16 d31, d22, d23, #1 @//extract a[5] (set2) + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set1) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set1) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) + vmlsl.s16 q13, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) + vext.16 d30, d21, d22, #2 @//extract a[2] (set2) + + vqrshrun.s16 d7, q12, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) + vext.16 d29, d21, d22, #3 @//extract a[3] (set2) + + vext.16 d28, d21, d22, #1 @//extract a[1] (set2) + vaddl.s16 q10, d31, d21 @// a0 + a5 (set2) + vmlal.s16 q10, d30, d0[1] @// a0 + a5 + 20a2 (set2) + vmlal.s16 q10, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set2) + vmlsl.s16 q10, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) + vmlsl.s16 q10, d22, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) + vext.16 d31, d23, d24, #1 @//extract a[5] (set3) + + vext.8 d5, d5, d6, #2 + vst1.8 {d6, d7}, [r11], r12 @// store row1 - 1,1/2 grid + vst1.8 {d5}, [r1], r12 @// store row1 - 1,1/2 grid + + vext.16 d30, d22, d23, #2 @//extract a[2] (set3) + vext.16 d29, d22, d23, #3 @//extract a[3] (set3) + + vaddl.s16 q3, d31, d22 @// a0 + a5 (set3) + vext.16 d28, d22, d23, #1 @//extract a[1] (set3) + vmlal.s16 q3, d30, d0[1] @// a0 + a5 + 20a2 (set3) + vmlal.s16 q3, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set3) + vmlsl.s16 q3, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) + vmlsl.s16 q3, d23, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) + vext.16 d31, d24, d25, #1 @//extract a[5] (set4) + + vshrn.s32 d21, q10, #8 @// shift by 8 and later we will shift by 2 more with rounding (set2) + vext.16 d30, d23, d24, #2 @//extract a[2] (set4) + vshrn.s32 d20, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set1) + 
vext.16 d29, d23, d24, #3 @//extract a[3] (set4) + + vaddl.s16 q13, d31, d23 @// a0 + a5 (set4) + vext.16 d28, d23, d24, #1 @//extract a[1] (set4) + vext.16 d31, d25, d25, #1 @//extract a[5] (set5) ;//here only first element in the row is valid + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set4) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set4) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) + vmlsl.s16 q13, d24, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) + vext.16 d30, d24, d25, #2 @//extract a[2] (set5) + + vaddl.s16 q11, d31, d24 @// a0 + a5 (set5) + vext.16 d29, d24, d25, #3 @//extract a[3] (set5) + + vext.16 d31, d24, d25, #1 @//extract a[1] (set5) + vshrn.s32 d28, q3, #8 @// shift by 8 and later we will shift by 2 more with rounding (set3) + + vld1.8 {d5, d6, d7}, [r0], r3 @// Load next Row data + vmlal.s16 q11, d30, d0[1] @// a0 + a5 + 20a2 (set5) + vmlal.s16 q11, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set5) + vmlsl.s16 q11, d31, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) + vmlsl.s16 q11, d25, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) + vshrn.s32 d29, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set4) + vqrshrun.s16 d26, q10, #2 @// half,half gird set1,2 + + + @//VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 + @//VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) + + @//VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 + + @//VST1.8 {D26,D27,D28},[r2],r14 ;// store 1/2,1,2 grif values + @// ////////////// ROW 3 /////////////////////// + +@// Process first vertical interpolated row +@// each column is + vaddl.u8 q10, d8, d5 @// a0 + a5 (column1,row0) + vmov.i8 d31, #5 + vmlal.u8 q10, d14, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlal.u8 q10, d17, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlsl.u8 q10, d11, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q10, d2, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + + vqrshrun.s16 d27, q14, #2 @// half,half gird set3,4 + vshrn.s32 d28, q11, #8 @// shift by 8 and later we will shift by 2 more with rounding (set5) + + vaddl.u8 q11, d9, d6 @// a0 + a5 (column2,row0) + vmlal.u8 q11, d15, d1 @// a0 + a5 + 20a2 (column2,row0) + vmlal.u8 q11, d18, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0) + vmlsl.u8 q11, d12, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + vmlsl.u8 q11, d3, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + + vqrshrun.s16 d28, q14, #2 @// half,half gird set5 + vext.16 d30, d20, d21, #2 @//extract a[2] (set1) + + vaddl.u8 q12, d10, d7 @// a0 + a5 (column3,row0) + vext.16 d29, d20, d21, #3 @//extract a[3] (set1) + vmlal.u8 q12, d16, d1 @// a0 + a5 + 20a2 (column3,row0) + vmlal.u8 q12, d19, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row0) + vmlsl.u8 q12, d13, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) + vmlsl.u8 q12, d4, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) + + vst1.8 {d26, d27, d28}, [r2], r14 @// store 1/2,1,2 grif values + + vqrshrun.s16 d8, q10, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vext.16 d31, d21, d22, #1 @//extract a[5] (set1) + vqrshrun.s16 d9, q11, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + vext.16 d28, d20, d21, #1 @//extract a[1] (set1) + + vaddl.s16 q13, d31, d20 @// a0 + a5 (set1) + vext.16 d31, d22, d23, #1 @//extract a[5] (set2) + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set1) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set1) + vmlsl.s16 q13, d28, d0[0] @// a0 + 
a5 + 20a2 + 20a3 - 5a1 (set1) + vmlsl.s16 q13, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) + vext.16 d30, d21, d22, #2 @//extract a[2] (set2) + + vqrshrun.s16 d10, q12, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) + vext.16 d29, d21, d22, #3 @//extract a[3] (set2) + + vext.16 d28, d21, d22, #1 @//extract a[1] (set2) + vaddl.s16 q10, d31, d21 @// a0 + a5 (set2) + vmlal.s16 q10, d30, d0[1] @// a0 + a5 + 20a2 (set2) + vmlal.s16 q10, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set2) + vmlsl.s16 q10, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) + vmlsl.s16 q10, d22, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) + vext.16 d31, d23, d24, #1 @//extract a[5] (set3) + + vext.8 d8, d8, d9, #2 + vst1.8 {d9, d10}, [r11], r12 @// store row1 - 1,1/2 grid + vst1.8 {d8}, [r1], r12 @// store row1 - 1,1/2 grid + + vext.16 d30, d22, d23, #2 @//extract a[2] (set3) + vext.16 d29, d22, d23, #3 @//extract a[3] (set3) + + vaddl.s16 q4, d31, d22 @// a0 + a5 (set3) + vext.16 d28, d22, d23, #1 @//extract a[1] (set3) + vmlal.s16 q4, d30, d0[1] @// a0 + a5 + 20a2 (set3) + vmlal.s16 q4, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set3) + vmlsl.s16 q4, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) + vmlsl.s16 q4, d23, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) + vext.16 d31, d24, d25, #1 @//extract a[5] (set4) + + vshrn.s32 d21, q10, #8 @// shift by 8 and later we will shift by 2 more with rounding (set2) + vext.16 d30, d23, d24, #2 @//extract a[2] (set4) + vshrn.s32 d20, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set1) + vext.16 d29, d23, d24, #3 @//extract a[3] (set4) + + vaddl.s16 q13, d31, d23 @// a0 + a5 (set4) + vext.16 d28, d23, d24, #1 @//extract a[1] (set4) + vext.16 d31, d25, d25, #1 @//extract a[5] (set5) ;//here only first element in the row is valid + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set4) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set4) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) + vmlsl.s16 q13, d24, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) + vext.16 d30, d24, d25, #2 @//extract a[2] (set5) + + vaddl.s16 q11, d31, d24 @// a0 + a5 (set5) + vext.16 d29, d24, d25, #3 @//extract a[3] (set5) + + vext.16 d31, d24, d25, #1 @//extract a[1] (set5) + vshrn.s32 d28, q4, #8 @// shift by 8 and later we will shift by 2 more with rounding (set3) + + vld1.8 {d8, d9, d10}, [r0], r3 @// Load next Row data + vmlal.s16 q11, d30, d0[1] @// a0 + a5 + 20a2 (set5) + vmlal.s16 q11, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set5) + vmlsl.s16 q11, d31, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) + vmlsl.s16 q11, d25, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) + vshrn.s32 d29, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set4) + vqrshrun.s16 d26, q10, #2 @// half,half gird set1,2 + + + @//VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 + @//VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) + + @//VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 + + @//VST1.8 {D26,D27,D28},[r2],r14 ;// store 1/2,1,2 grif values + @// ////////////// ROW 4 /////////////////////// + +@// Process first vertical interpolated row +@// each column is + vaddl.u8 q10, d11, d8 @// a0 + a5 (column1,row0) + vmov.i8 d31, #5 + vmlal.u8 q10, d17, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlal.u8 q10, d2, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlsl.u8 q10, d14, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q10, d5, d31 @// a0 + a5 + 20a2 + 
20a3 - 5a1 - 5a4 (column1,row0) + + vqrshrun.s16 d27, q14, #2 @// half,half gird set3,4 + vshrn.s32 d28, q11, #8 @// shift by 8 and later we will shift by 2 more with rounding (set5) + + vaddl.u8 q11, d12, d9 @// a0 + a5 (column2,row0) + vmlal.u8 q11, d18, d1 @// a0 + a5 + 20a2 (column2,row0) + vmlal.u8 q11, d3, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0) + vmlsl.u8 q11, d15, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + vmlsl.u8 q11, d6, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + + vqrshrun.s16 d28, q14, #2 @// half,half gird set5 + vext.16 d30, d20, d21, #2 @//extract a[2] (set1) + + vaddl.u8 q12, d13, d10 @// a0 + a5 (column3,row0) + vext.16 d29, d20, d21, #3 @//extract a[3] (set1) + vmlal.u8 q12, d19, d1 @// a0 + a5 + 20a2 (column3,row0) + vmlal.u8 q12, d4, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row0) + vmlsl.u8 q12, d16, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) + vmlsl.u8 q12, d7, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) + + vst1.8 {d26, d27, d28}, [r2], r14 @// store 1/2,1,2 grif values + + vqrshrun.s16 d11, q10, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vext.16 d31, d21, d22, #1 @//extract a[5] (set1) + vqrshrun.s16 d12, q11, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + vext.16 d28, d20, d21, #1 @//extract a[1] (set1) + + vaddl.s16 q13, d31, d20 @// a0 + a5 (set1) + vext.16 d31, d22, d23, #1 @//extract a[5] (set2) + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set1) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set1) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) + vmlsl.s16 q13, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) + vext.16 d30, d21, d22, #2 @//extract a[2] (set2) + + vqrshrun.s16 d13, q12, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) + vext.16 d29, d21, d22, #3 @//extract a[3] (set2) + + vext.16 d28, d21, d22, #1 @//extract a[1] (set2) + vaddl.s16 q10, d31, d21 @// a0 + a5 (set2) + vmlal.s16 q10, d30, d0[1] @// a0 + a5 + 20a2 (set2) + vmlal.s16 q10, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set2) + vmlsl.s16 q10, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) + vmlsl.s16 q10, d22, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) + vext.16 d31, d23, d24, #1 @//extract a[5] (set3) + + vext.8 d11, d11, d12, #2 + vst1.8 {d12, d13}, [r11], r12 @// store row1 - 1,1/2 grid + vst1.8 {d11}, [r1], r12 @// store row1 - 1,1/2 grid + + vext.16 d30, d22, d23, #2 @//extract a[2] (set3) + vext.16 d29, d22, d23, #3 @//extract a[3] (set3) + + vaddl.s16 q6, d31, d22 @// a0 + a5 (set3) + vext.16 d28, d22, d23, #1 @//extract a[1] (set3) + vmlal.s16 q6, d30, d0[1] @// a0 + a5 + 20a2 (set3) + vmlal.s16 q6, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set3) + vmlsl.s16 q6, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) + vmlsl.s16 q6, d23, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) + vext.16 d31, d24, d25, #1 @//extract a[5] (set4) + + vshrn.s32 d21, q10, #8 @// shift by 8 and later we will shift by 2 more with rounding (set2) + vext.16 d30, d23, d24, #2 @//extract a[2] (set4) + vshrn.s32 d20, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set1) + vext.16 d29, d23, d24, #3 @//extract a[3] (set4) + + vaddl.s16 q13, d31, d23 @// a0 + a5 (set4) + vext.16 d28, d23, d24, #1 @//extract a[1] (set4) + vext.16 d31, d25, d25, #1 @//extract a[5] (set5) ;//here only first element in the row is valid + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set4) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set4) + 
vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) + vmlsl.s16 q13, d24, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) + vext.16 d30, d24, d25, #2 @//extract a[2] (set5) + + vaddl.s16 q11, d31, d24 @// a0 + a5 (set5) + vext.16 d29, d24, d25, #3 @//extract a[3] (set5) + + vext.16 d31, d24, d25, #1 @//extract a[1] (set5) + vshrn.s32 d28, q6, #8 @// shift by 8 and later we will shift by 2 more with rounding (set3) + + vld1.8 {d11, d12, d13}, [r0], r3 @// Load next Row data + vmlal.s16 q11, d30, d0[1] @// a0 + a5 + 20a2 (set5) + vmlal.s16 q11, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set5) + vmlsl.s16 q11, d31, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) + vmlsl.s16 q11, d25, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) + vshrn.s32 d29, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set4) + vqrshrun.s16 d26, q10, #2 @// half,half gird set1,2 + + + @//VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 + @//VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) + + @//VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 + + @//VST1.8 {D26,D27,D28},[r2],r14 ;// store 1/2,1,2 grif values + @// ////////////// ROW 5 /////////////////////// + +@// Process first vertical interpolated row +@// each column is + vaddl.u8 q10, d14, d11 @// a0 + a5 (column1,row0) + vmov.i8 d31, #5 + vmlal.u8 q10, d2, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlal.u8 q10, d5, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlsl.u8 q10, d17, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q10, d8, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + + vqrshrun.s16 d27, q14, #2 @// half,half gird set3,4 + vshrn.s32 d28, q11, #8 @// shift by 8 and later we will shift by 2 more with rounding (set5) + + vaddl.u8 q11, d15, d12 @// a0 + a5 (column2,row0) + vmlal.u8 q11, d3, d1 @// a0 + a5 + 20a2 (column2,row0) + vmlal.u8 q11, d6, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0) + vmlsl.u8 q11, d18, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + vmlsl.u8 q11, d9, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + + vqrshrun.s16 d28, q14, #2 @// half,half gird set5 + vext.16 d30, d20, d21, #2 @//extract a[2] (set1) + + vaddl.u8 q12, d16, d13 @// a0 + a5 (column3,row0) + vext.16 d29, d20, d21, #3 @//extract a[3] (set1) + vmlal.u8 q12, d4, d1 @// a0 + a5 + 20a2 (column3,row0) + vmlal.u8 q12, d7, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row0) + vmlsl.u8 q12, d19, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) + vmlsl.u8 q12, d10, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) + + vst1.8 {d26, d27, d28}, [r2], r14 @// store 1/2,1,2 grif values + + vqrshrun.s16 d14, q10, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vext.16 d31, d21, d22, #1 @//extract a[5] (set1) + vqrshrun.s16 d15, q11, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + vext.16 d28, d20, d21, #1 @//extract a[1] (set1) + + vaddl.s16 q13, d31, d20 @// a0 + a5 (set1) + vext.16 d31, d22, d23, #1 @//extract a[5] (set2) + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set1) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set1) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) + vmlsl.s16 q13, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) + vext.16 d30, d21, d22, #2 @//extract a[2] (set2) + + vqrshrun.s16 d16, q12, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) + vext.16 d29, d21, d22, #3 @//extract a[3] (set2) + + vext.16 d28, d21, d22, #1 @//extract a[1] (set2) + 
vaddl.s16 q10, d31, d21 @// a0 + a5 (set2) + vmlal.s16 q10, d30, d0[1] @// a0 + a5 + 20a2 (set2) + vmlal.s16 q10, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set2) + vmlsl.s16 q10, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) + vmlsl.s16 q10, d22, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) + vext.16 d31, d23, d24, #1 @//extract a[5] (set3) + + vext.8 d14, d14, d15, #2 + vst1.8 {d15, d16}, [r11], r12 @// store row1 - 1,1/2 grid + vst1.8 {d14}, [r1], r12 @// store row1 - 1,1/2 grid + + vext.16 d30, d22, d23, #2 @//extract a[2] (set3) + vext.16 d29, d22, d23, #3 @//extract a[3] (set3) + + vaddl.s16 q7, d31, d22 @// a0 + a5 (set3) + vext.16 d28, d22, d23, #1 @//extract a[1] (set3) + vmlal.s16 q7, d30, d0[1] @// a0 + a5 + 20a2 (set3) + vmlal.s16 q7, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set3) + vmlsl.s16 q7, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) + vmlsl.s16 q7, d23, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) + vext.16 d31, d24, d25, #1 @//extract a[5] (set4) + + vshrn.s32 d21, q10, #8 @// shift by 8 and later we will shift by 2 more with rounding (set2) + vext.16 d30, d23, d24, #2 @//extract a[2] (set4) + vshrn.s32 d20, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set1) + vext.16 d29, d23, d24, #3 @//extract a[3] (set4) + + vaddl.s16 q13, d31, d23 @// a0 + a5 (set4) + vext.16 d28, d23, d24, #1 @//extract a[1] (set4) + vext.16 d31, d25, d25, #1 @//extract a[5] (set5) ;//here only first element in the row is valid + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set4) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set4) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) + vmlsl.s16 q13, d24, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) + vext.16 d30, d24, d25, #2 @//extract a[2] (set5) + + vaddl.s16 q11, d31, d24 @// a0 + a5 (set5) + vext.16 d29, d24, d25, #3 @//extract a[3] (set5) + + vext.16 d31, d24, d25, #1 @//extract a[1] (set5) + vshrn.s32 d28, q7, #8 @// shift by 8 and later we will shift by 2 more with rounding (set3) + + vld1.8 {d14, d15, d16}, [r0], r3 @// Load next Row data + vmlal.s16 q11, d30, d0[1] @// a0 + a5 + 20a2 (set5) + vmlal.s16 q11, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set5) + vmlsl.s16 q11, d31, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) + vmlsl.s16 q11, d25, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) + vshrn.s32 d29, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set4) + vqrshrun.s16 d26, q10, #2 @// half,half gird set1,2 + + + @//VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 + @//VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) + + @//VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 + + @//VST1.8 {D26,D27,D28},[r2],r14 ;// store 1/2,1,2 grif values + @// ////////////// ROW 6 /////////////////////// + +@// Process first vertical interpolated row +@// each column is + + cmp r10, #1 @// if it 17 rows are complete skip + beq filter_2dvh_skip_row + vaddl.u8 q10, d17, d14 @// a0 + a5 (column1,row0) + vmov.i8 d31, #5 + vmlal.u8 q10, d5, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlal.u8 q10, d8, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlsl.u8 q10, d2, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q10, d11, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + + vqrshrun.s16 d27, q14, #2 @// half,half gird set3,4 + vshrn.s32 d28, q11, #8 @// shift by 8 and later we will shift by 2 more with rounding (set5) + + vaddl.u8 q11, d18, d15 @// a0 + a5 (column2,row0) + vmlal.u8 q11, d6, d1 @// a0 + a5 
+ 20a2 (column2,row0) + vmlal.u8 q11, d9, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0) + vmlsl.u8 q11, d3, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + vmlsl.u8 q11, d12, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + + vqrshrun.s16 d28, q14, #2 @// half,half gird set5 + vext.16 d30, d20, d21, #2 @//extract a[2] (set1) + + vaddl.u8 q12, d19, d16 @// a0 + a5 (column3,row0) + vext.16 d29, d20, d21, #3 @//extract a[3] (set1) + vmlal.u8 q12, d7, d1 @// a0 + a5 + 20a2 (column3,row0) + vmlal.u8 q12, d10, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row0) + vmlsl.u8 q12, d4, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) + vmlsl.u8 q12, d13, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) + + vst1.8 {d26, d27, d28}, [r2], r14 @// store 1/2,1,2 grif values + + vqrshrun.s16 d17, q10, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vext.16 d31, d21, d22, #1 @//extract a[5] (set1) + vqrshrun.s16 d18, q11, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + vext.16 d28, d20, d21, #1 @//extract a[1] (set1) + + vaddl.s16 q13, d31, d20 @// a0 + a5 (set1) + vext.16 d31, d22, d23, #1 @//extract a[5] (set2) + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set1) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set1) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) + vmlsl.s16 q13, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) + vext.16 d30, d21, d22, #2 @//extract a[2] (set2) + + vqrshrun.s16 d19, q12, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) + vext.16 d29, d21, d22, #3 @//extract a[3] (set2) + + vext.16 d28, d21, d22, #1 @//extract a[1] (set2) + vaddl.s16 q10, d31, d21 @// a0 + a5 (set2) + vmlal.s16 q10, d30, d0[1] @// a0 + a5 + 20a2 (set2) + vmlal.s16 q10, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set2) + vmlsl.s16 q10, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) + vmlsl.s16 q10, d22, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) + vext.16 d31, d23, d24, #1 @//extract a[5] (set3) + + vext.8 d17, d17, d18, #2 + vst1.8 {d18, d19}, [r11], r12 @// store row1 - 1,1/2 grid + vst1.8 {d17}, [r1], r12 @// store row1 - 1,1/2 grid + + vext.16 d30, d22, d23, #2 @//extract a[2] (set3) + vext.16 d29, d22, d23, #3 @//extract a[3] (set3) + + vaddl.s16 q9, d31, d22 @// a0 + a5 (set3) + vext.16 d28, d22, d23, #1 @//extract a[1] (set3) + vmlal.s16 q9, d30, d0[1] @// a0 + a5 + 20a2 (set3) + vmlal.s16 q9, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set3) + vmlsl.s16 q9, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) + vmlsl.s16 q9, d23, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) + vext.16 d31, d24, d25, #1 @//extract a[5] (set4) + + vshrn.s32 d21, q10, #8 @// shift by 8 and later we will shift by 2 more with rounding (set2) + vext.16 d30, d23, d24, #2 @//extract a[2] (set4) + vshrn.s32 d20, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set1) + vext.16 d29, d23, d24, #3 @//extract a[3] (set4) + + vaddl.s16 q13, d31, d23 @// a0 + a5 (set4) + vext.16 d28, d23, d24, #1 @//extract a[1] (set4) + vext.16 d31, d25, d25, #1 @//extract a[5] (set5) ;//here only first element in the row is valid + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set4) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set4) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) + vmlsl.s16 q13, d24, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) + vext.16 d30, d24, d25, #2 @//extract a[2] (set5) + + vaddl.s16 q11, d31, d24 @// a0 + a5 (set5) + vext.16 d29, d24, d25, #3 
@//extract a[3] (set5)
+
+    vext.16 d31, d24, d25, #1         @//extract a[1] (set5)
+    vshrn.s32 d28, q9, #8             @// shift by 8 and later we will shift by 2 more with rounding (set3)
+
+    vld1.8 {d17, d18, d19}, [r0], r3  @// Load next Row data
+    vmlal.s16 q11, d30, d0[1]         @// a0 + a5 + 20a2 (set5)
+    vmlal.s16 q11, d29, d0[1]         @// a0 + a5 + 20a2 + 20a3 (set5)
+    vmlsl.s16 q11, d31, d0[0]         @// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
+    vmlsl.s16 q11, d25, d0[0]         @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
+    vshrn.s32 d29, q13, #8            @// shift by 8 and later we will shift by 2 more with rounding (set4)
+    vqrshrun.s16 d26, q10, #2         @// half,half grid set1,2
+
+
+    vqrshrun.s16 d27, q14, #2         @// half,half grid set3,4
+    vshrn.s32 d28, q11, #8            @// shift by 8 and later we will shift by 2 more with rounding (set5)
+
+    vqrshrun.s16 d28, q14, #2         @// half,half grid set5
+
+    vst1.8 {d26, d27, d28}, [r2], r14 @// store (1/2,1/2) grid values
+
+    subs r10, r10, #1                 @//decrement loop counter
+
+    bne filter_2dvh_loop
+
+
+    @// loop done; restore registers and return
+    vpop {d8-d15}
+    ldmfd sp!, {r10, r11, r12, pc}
+
+filter_2dvh_skip_row:
+
+    vqrshrun.s16 d27, q14, #2         @// half,half grid set3,4
+    vshrn.s32 d28, q11, #8            @// shift by 8 and later we will shift by 2 more with rounding (set5)
+
+    vqrshrun.s16 d28, q14, #2         @// half,half grid set5
+
+    vst1.8 {d26, d27, d28}, [r2], r14 @// store (1/2,1/2) grid values
+    vpop {d8-d15}
+    ldmfd sp!, {r10, r11, r12, pc}
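For readers cross-checking the arithmetic above, here is a scalar C illustration of the cascade; it is written for this note, not taken from the patch, and CLIP_U8 plus the exact loop bounds are assumptions based on the function's comments. Stage 1 filters vertically at full precision, stage 2 filters those intermediates horizontally and rounds by (sum + 512) >> 10, which the assembly splits into a plain >> 8 followed by a rounding >> 2.

#include "ih264_typedefs.h"

#define CLIP_U8(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))

/* Scalar sketch of the cascaded six-tap (1/2,1/2) filter. */
void ref_sixtap_filter_2dvh_vert(UWORD8 *pu1_src, UWORD8 *pu1_dst1,
                                 UWORD8 *pu1_dst2, WORD32 src_strd,
                                 WORD32 dst_strd, WORD32 *pi16_pred1,
                                 WORD32 pi16_pred1_strd)
{
    for (WORD32 y = 0; y < 17; y++)
    {
        WORD32 *pi4_row = pi16_pred1 + y * pi16_pred1_strd;
        UWORD8 *pu1_row = pu1_src + y * src_strd;

        /* Stage 1: vertical six-tap into unclipped intermediates
         * (22 values per row: 17 outputs plus 5 filter-support taps) */
        for (WORD32 x = -2; x < 17 + 3; x++)
        {
            UWORD8 *p = pu1_row + x;
            pi4_row[x + 2] = p[-2 * src_strd] + p[3 * src_strd]
                           + 20 * (p[0] + p[src_strd])
                           - 5 * (p[-src_strd] + p[2 * src_strd]);
        }

        for (WORD32 x = 0; x < 17; x++)
        {
            /* vertical half-pel output of stage 1 */
            pu1_dst1[y * dst_strd + x] =
                (UWORD8)CLIP_U8((pi4_row[x + 2] + 16) >> 5);

            /* Stage 2: horizontal six-tap on intermediates -> (1/2,1/2) */
            WORD32 acc = pi4_row[x] + pi4_row[x + 5]
                       + 20 * (pi4_row[x + 2] + pi4_row[x + 3])
                       - 5 * (pi4_row[x + 1] + pi4_row[x + 4]);
            pu1_dst2[y * dst_strd + x] = (UWORD8)CLIP_U8((acc + 512) >> 10);
        }
    }
}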
+
+
+
diff --git a/encoder/arm/ih264e_platform_macros.h b/encoder/arm/ih264e_platform_macros.h
new file mode 100755
index 0000000..39cac96
--- /dev/null
+++ b/encoder/arm/ih264e_platform_macros.h
@@ -0,0 +1,143 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+*  ih264e_platform_macros.h
+*
+* @brief
+*  Contains platform specific routines used for codec context initialization
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  none
+*
+*******************************************************************************
+*/
+
+#ifndef IH264E_PLATFORM_MACROS_H_
+#define IH264E_PLATFORM_MACROS_H_
+
+/*****************************************************************************/
+/* Extern Function Declarations                                              */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief Initialize the intra/inter/transform/deblk function pointers of
+* codec context
+*
+* @par Description: the current routine initializes the function pointers of
+* codec context based on the architecture in use
+*
+* @param[in] ps_codec
+*  Codec context pointer
+*
+* @returns  none
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+void ih264e_init_function_ptr_neon_a9q(codec_t *ps_codec);
+
+/**
+*******************************************************************************
+*
+* @brief Initialize the intra/inter/transform/deblk function pointers of
+* codec context
+*
+* @par Description: the current routine initializes the function pointers of
+* codec context based on the architecture in use
+*
+* @param[in] ps_codec
+*  Codec context pointer
+*
+* @returns  none
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+void ih264e_init_function_ptr_neon_av8(codec_t *ps_codec);
+
+/**
+*******************************************************************************
+*
+* @brief Initialize the intra/inter/transform/deblk function pointers of
+* codec context
+*
+* @par Description: the current routine initializes the function pointers of
+* codec context based on the architecture in use
+*
+* @param[in] ps_codec
+*  Codec context pointer
+*
+* @returns  none
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+void ih264e_init_function_ptr_generic(codec_t *ps_codec);
+
+/**
+*******************************************************************************
+*
+* @brief Initialize the intra/inter/transform/deblk function pointers of
+* codec context
+*
+* @par Description: the current routine initializes the function pointers of
+* codec context based on the architecture in use
+*
+* @param[in] pv_codec
+*  Codec context pointer
+*
+* @returns  none
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+void ih264e_init_function_ptr(void *pv_codec);
+
+/**
+*******************************************************************************
+*
+* @brief Determine the architecture of the encoder executing environment
+*
+* @par Description: This routine returns the architecture of the
+* environment in which the current encoder is being tested
+*
+* @param[in] void
+*
+* @returns  IV_ARCH_T
+*  architecture
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+IV_ARCH_T ih264e_default_arch(void);
+
+#endif /* IH264E_PLATFORM_MACROS_H_ */
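These declarations come together in a small dispatcher: detect the architecture once, install the portable pointers, then overwrite the hot ones with the NEON builds. The sketch below is an assumption-level illustration; ih264e_default_arch() and the init routines are declared above, but the dispatcher itself and the enumerator names in the switch are placeholders, not code from the patch.

#include "ih264_typedefs.h"
#include "iv2.h"
#include "ih264e_structs.h"
#include "ih264e_platform_macros.h"

/* Hypothetical dispatcher: select an init routine from the detected arch. */
void example_init_function_ptr(codec_t *ps_codec)
{
    IV_ARCH_T e_arch = ih264e_default_arch();

    ih264e_init_function_ptr_generic(ps_codec);  /* C fallbacks first */

    switch (e_arch)
    {
        case ARCH_ARM_A9Q:                 /* placeholder enumerator */
            ih264e_init_function_ptr_neon_a9q(ps_codec);
            break;
        case ARCH_ARMV8_GENERIC:           /* placeholder enumerator */
            ih264e_init_function_ptr_neon_av8(ps_codec);
            break;
        default:
            break;                         /* keep the generic pointers */
    }
}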
diff --git a/encoder/arm/ime_distortion_metrics_a9q.s b/encoder/arm/ime_distortion_metrics_a9q.s
new file mode 100755
index 0000000..b58911e
--- /dev/null
+++ b/encoder/arm/ime_distortion_metrics_a9q.s
@@ -0,0 +1,1353 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@*
+@* @brief
+@*  This file contains definitions of routines that compute distortion
+@*  between two macro/sub blocks of identical dimensions
+@*
+@* @author
+@*  Ittiam
+@*
+@* @par List of Functions:
+@*  - ime_compute_sad_16x16_a9q()
+@*  - ime_compute_sad_16x16_fast_a9q()
+@*  - ime_compute_sad_16x8_a9q()
+@*  - ime_compute_sad_16x16_ea8_a9q()
+@*  - ime_calculate_sad2_prog_a9q()
+@*  - ime_calculate_sad3_prog_a9q()
+@*  - ime_calculate_sad4_prog_a9q()
+@*  - ime_sub_pel_compute_sad_16x16_a9q()
+@*  - ime_compute_satqd_16x16_lumainter_a9q()
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@
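For reference, the quantity every routine in this file computes is the plain sum of absolute differences between a source block and a candidate block. A scalar C illustration of the 16x16 case follows; it is written for this note, not taken from the patch, and mirrors the parameter list documented below.

#include <stdlib.h>

#include "ih264_typedefs.h"

/* Scalar reference for the plain 16x16 SAD; the NEON routines accumulate
 * the same |src - est| sums with vabdl/vabal. */
void ref_compute_sad_16x16(UWORD8 *pu1_src, UWORD8 *pu1_est,
                           WORD32 src_strd, WORD32 est_strd,
                           WORD32 i4_max_sad, WORD32 *pi4_mb_distortion)
{
    WORD32 i4_sad = 0;

    for (WORD32 i = 0; i < 16; i++)
    {
        for (WORD32 j = 0; j < 16; j++)
            i4_sad += abs((WORD32)pu1_src[j] - (WORD32)pu1_est[j]);
        pu1_src += src_strd;
        pu1_est += est_strd;
    }

    (void)i4_max_sad;            /* the exact variant has no early exit */
    *pi4_mb_distortion = i4_sad;
}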
+
+
+@/**
+@******************************************************************************
+@*
+@* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode)
+@*
+@* @par Description
+@*  This function computes an approximate SAD between 2 16x16 blocks: only
+@*  alternate rows are compared and the partial SAD is doubled.
+@*
+@* @param[in] pu1_src
+@*  UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] i4_max_sad
+@*  integer maximum allowed distortion
+@*
+@* @param[in] pi4_mb_distortion
+@*  integer evaluated sad
+@*
+@* @remarks
+@*
+@******************************************************************************
+@*/
+.text
+.p2align 2
+    .global ime_compute_sad_16x16_fast_a9q
+ime_compute_sad_16x16_fast_a9q:
+
+    stmfd sp!, {r12, lr}
+    lsl r2, r2, #1
+    lsl r3, r3, #1
+
+    @for bringing buffer2 into cache..., dummy load instructions
+    @ LDR r12,[r1]
+
+    vld1.8 {d4, d5}, [r0], r2
+    vld1.8 {d6, d7}, [r1], r3
+    mov r12, #6
+    vld1.8 {d8, d9}, [r0], r2
+    vabdl.u8 q0, d6, d4
+    vabdl.u8 q1, d7, d5
+    vld1.8 {d10, d11}, [r1], r3
+
+loop_sad_16x16_fast:
+
+    vld1.8 {d4, d5}, [r0], r2
+    vabal.u8 q0, d10, d8
+    vabal.u8 q1, d11, d9
+    vld1.8 {d6, d7}, [r1], r3
+    subs r12, #2
+    vld1.8 {d8, d9}, [r0], r2
+    vabal.u8 q0, d6, d4
+    vabal.u8 q1, d7, d5
+    vld1.8 {d10, d11}, [r1], r3
+
+    bne loop_sad_16x16_fast
+
+    vabal.u8 q0, d10, d8
+    vabal.u8 q1, d11, d9
+
+    vadd.i16 q0, q0, q1
+    vadd.i16 d0, d1, d0
+
+    ldr r12, [sp, #12]
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0
+    vshl.u32 d0, d0, #1
+    vst1.32 {d0[0]}, [r12]
+
+    ldmfd sp!, {r12, pc}
+
+
+
+
+@/**
+@******************************************************************************
+@*
+@* @brief computes distortion (SAD) between 2 16x8 blocks
+@*
+@*
+@* @par Description
+@*  This function computes SAD between 2 16x8 blocks. There is a provision
+@*  for early exit if the SAD computed so far exceeds the maximum allowed SAD.
+@*  To compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
+@*
+@* @param[in] pu1_src
+@*  UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] u4_max_sad
+@*  integer maximum allowed distortion
+@*
+@* @param[in] pi4_mb_distortion
+@*  integer evaluated sad
+@*
+@* @remarks
+@*
+@******************************************************************************
+@*/
+@
+    .global ime_compute_sad_16x8_a9q
+ime_compute_sad_16x8_a9q:
+
+    stmfd sp!, {r12, lr}
+
+    @for bringing buffer2 into cache..., dummy load instructions
+    @LDR r12,[r1]
+
+    vld1.8 {d4, d5}, [r0], r2
+    vld1.8 {d6, d7}, [r1], r3
+    mov r12, #6
+    vld1.8 {d8, d9}, [r0], r2
+    vabdl.u8 q0, d6, d4
+    vabdl.u8 q1, d7, d5
+    vld1.8 {d10, d11}, [r1], r3
+
+loop_sad_16x8:
+
+    vld1.8 {d4, d5}, [r0], r2
+    vabal.u8 q0, d10, d8
+    vabal.u8 q1, d11, d9
+    vld1.8 {d6, d7}, [r1], r3
+    subs r12, #2
+    vld1.8 {d8, d9}, [r0], r2
+    vabal.u8 q0, d6, d4
+    vabal.u8 q1, d7, d5
+    vld1.8 {d10, d11}, [r1], r3
+
+    bne loop_sad_16x8
+
+    vabal.u8 q0, d10, d8
+    vabal.u8 q1, d11, d9
+
+    vadd.i16 q0, q0, q1
+    vadd.i16 d0, d1, d0
+
+    ldr r12, [sp, #12]
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0
+    vst1.32 {d0[0]}, [r12]
+
+    ldmfd sp!, {r12, pc}
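The fast variant above halves the work: both strides are doubled so only alternate rows are visited, and the final vshl doubles the partial SAD to approximate the full sum. A scalar illustration of that approximation (an assumption-level reference written for this note, not patch code):

#include <stdlib.h>

#include "ih264_typedefs.h"

/* Scalar sketch of the subsampled "fast" 16x16 SAD. */
void ref_compute_sad_16x16_fast(UWORD8 *pu1_src, UWORD8 *pu1_est,
                                WORD32 src_strd, WORD32 est_strd,
                                WORD32 i4_max_sad, WORD32 *pi4_mb_distortion)
{
    WORD32 i4_sad = 0;

    for (WORD32 i = 0; i < 16; i += 2)     /* every other row */
    {
        for (WORD32 j = 0; j < 16; j++)
            i4_sad += abs((WORD32)pu1_src[j] - (WORD32)pu1_est[j]);
        pu1_src += 2 * src_strd;
        pu1_est += 2 * est_strd;
    }

    (void)i4_max_sad;
    *pi4_mb_distortion = i4_sad * 2;       /* matches the final vshl by 1 */
}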
+@*
+@* @param[in] pu1_src
+@*  UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  UWORD8 pointer to the destination
+@*
+@* @param[in]  src_strd
+@*  integer source stride
+@*
+@* @param[in]  dst_strd
+@*  integer destination stride
+@*
+@* @param[in]  i4_max_sad
+@*  integer maximum allowed distortion
+@*
+@* @param[in] pi4_mb_distortion
+@*  integer evaluated sad
+@*
+@* @remarks
+@*
+@******************************************************************************
+@*/
+    .global ime_compute_sad_16x16_ea8_a9q
+
+ime_compute_sad_16x16_ea8_a9q:
+
+    stmfd sp!, {r5-r7, lr}
+    lsl r2, r2, #1
+    lsl r3, r3, #1
+
+    @for bringing buffer2 into cache..., dummy load instructions
+    @LDR r12,[r1]
+
+    vld1.8 {d4, d5}, [r0], r2
+    vld1.8 {d6, d7}, [r1], r3
+    mov r5, #6
+    vld1.8 {d8, d9}, [r0], r2
+    vabdl.u8 q0, d6, d4
+    vabdl.u8 q1, d7, d5
+    vld1.8 {d10, d11}, [r1], r3
+    ldrd r6, r7, [sp, #16]
+    @r6 = i4_max_sad, r7 = pi4_mb_distortion
+
+loop_sad_16x16_ea8_1:
+
+    vld1.8 {d4, d5}, [r0], r2
+    vabal.u8 q0, d10, d8
+    vabal.u8 q1, d11, d9
+    vld1.8 {d6, d7}, [r1], r3
+    subs r5, #2
+    vld1.8 {d8, d9}, [r0], r2
+    vabal.u8 q0, d6, d4
+    vabal.u8 q1, d7, d5
+    vld1.8 {d10, d11}, [r1], r3
+
+    bne loop_sad_16x16_ea8_1
+
+    vabal.u8 q0, d10, d8
+    sub r0, r0, r2, lsl #3
+    vabal.u8 q1, d11, d9
+    sub r1, r1, r3, lsl #3
+
+    vadd.i16 q6, q0, q1
+    add r0, r0, r2, asr #1
+    vadd.i16 d12, d12, d13
+    add r1, r1, r3, asr #1
+
+    vpaddl.u16 d12, d12
+    vld1.8 {d4, d5}, [r0], r2
+    vld1.8 {d6, d7}, [r1], r3
+    vpaddl.u32 d12, d12
+    vld1.8 {d8, d9}, [r0], r2
+    vabal.u8 q0, d6, d4
+    vabal.u8 q1, d7, d5
+
+    vst1.32 {d12[0]}, [r7]
+    ldr r5, [r7]
+    cmp r5, r6
+    bgt end_func_16x16_ea8
+
+    vld1.8 {d10, d11}, [r1], r3
+    mov r5, #6
+
+loop_sad_16x16_ea8_2:
+
+    vld1.8 {d4, d5}, [r0], r2
+    vabal.u8 q0, d10, d8
+    vabal.u8 q1, d11, d9
+    vld1.8 {d6, d7}, [r1], r3
+    subs r5, #2
+    vld1.8 {d8, d9}, [r0], r2
+    vabal.u8 q0, d6, d4
+    vabal.u8 q1, d7, d5
+    vld1.8 {d10, d11}, [r1], r3
+
+    bne loop_sad_16x16_ea8_2
+
+    vabal.u8 q0, d10, d8
+    vabal.u8 q1, d11, d9
+
+    vadd.i16 q0, q0, q1
+    vadd.i16 d0, d1, d0
+
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0
+
+    vst1.32 {d0[0]}, [r7]
+
+end_func_16x16_ea8:
+
+    ldmfd sp!, {r5-r7, pc}
+
+
+
+@/*
+@//---------------------------------------------------------------------------
+@// Function Name : ime_calculate_sad2_prog_a9q()
+@//
+@// Detail Description : This function finds the SADs of a progressive MB
+@//    against 2 reference candidates in one shot
+@//
+@// Platform : Cortex-A8/NEON
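+@//
+@// In C terms (illustrative sketch only), both reference candidates are
+@// compared against the same source rows in a single pass:
+@//
+@//     for(row = 0; row < 16; row++, src += CurBufferWidth,
+@//         ref1 += RefBufferWidth, ref2 += RefBufferWidth)
+@//         for(col = 0; col < 16; col++)
+@//         {
+@//             sad1 += ABS(src[col] - ref1[col]);
+@//             sad2 += ABS(src[col] - ref2[col]);
+@//         }
+@//     psad[0] = sad1; psad[1] = sad2;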
+@//
+@//-----------------------------------------------------------------------------
+@*/
+
+    .global ime_calculate_sad2_prog_a9q
+
+ime_calculate_sad2_prog_a9q:
+
+    @ r0 = ref1 <UWORD8 *>
+    @ r1 = ref2 <UWORD8 *>
+    @ r2 = src <UWORD8 *>
+    @ r3 = RefBufferWidth <UWORD32>
+    @ stack = CurBufferWidth <UWORD32>, psad <UWORD32 *>
+
+    stmfd sp!, {r4-r5, lr}
+
+    ldr r4, [sp, #8] @ load src stride to r4
+    mov r5, #14
+
+    @Row 1
+    vld1.8 {d0, d1}, [r2], r4 @ load src Row 1
+    vld1.8 {d2, d3}, [r0], r3 @ load ref1 Row 1
+    vld1.8 {d4, d5}, [r1], r3 @ load ref2 Row 1
+
+    @Row 2
+    vld1.8 {d6, d7}, [r2], r4 @ load src Row 2
+    vabdl.u8 q6, d2, d0
+    vabdl.u8 q7, d3, d1
+    vld1.8 {d8, d9}, [r0], r3 @ load ref1 Row 2
+    vabdl.u8 q8, d4, d0
+    vabdl.u8 q9, d5, d1
+    vld1.8 {d10, d11}, [r1], r3 @ load ref2 Row 2
+
+loop_sad2_prog:
+
+    subs r5, #2
+    @Row 1
+    vld1.8 {d0, d1}, [r2], r4 @ load src Row 1
+    vabal.u8 q6, d8, d6
+    vabal.u8 q7, d9, d7
+    vld1.8 {d2, d3}, [r0], r3 @ load ref1 Row 1
+    vabal.u8 q8, d10, d6
+    vabal.u8 q9, d11, d7
+    vld1.8 {d4, d5}, [r1], r3 @ load ref2 Row 1
+
+    @Row 2
+    vld1.8 {d6, d7}, [r2], r4 @ load src Row 2
+    vabal.u8 q6, d2, d0
+    vabal.u8 q7, d3, d1
+    vld1.8 {d8, d9}, [r0], r3 @ load ref1 Row 2
+    vabal.u8 q8, d4, d0
+    vabal.u8 q9, d5, d1
+    vld1.8 {d10, d11}, [r1], r3 @ load ref2 Row 2
+
+    bne loop_sad2_prog
+
+    vabal.u8 q6, d8, d6
+    vabal.u8 q7, d9, d7
+    vabal.u8 q8, d10, d6
+    vabal.u8 q9, d11, d7
+
+    @ Compute SAD
+
+    vadd.u16 q6, q6, q7 @ Q6 : sad_ref1
+    vadd.u16 q8, q8, q9 @ Q8 : sad_ref2
+
+    vadd.u16 d12, d12, d13
+    ldr r5, [sp, #16] @ loading psad to r5
+    vadd.u16 d16, d16, d17
+
+    vpadd.u16 d12, d12, d16
+    vpaddl.u16 d12, d12
+
+    vst1.64 {d12}, [r5]!
+
+    ldmfd sp!, {r4-r5, pc}
+
+
+
+@/*
+@//---------------------------------------------------------------------------
+@// Function Name : ime_calculate_sad3_prog_a9q()
+@//
+@// Detail Description : This function finds the SADs of a progressive MB
+@//    against 3 reference candidates in one shot
+@//
+@// Platform : Cortex-A8/NEON
+@//
+@//-----------------------------------------------------------------------------
+@*/
+
+    .global ime_calculate_sad3_prog_a9q
+
+ime_calculate_sad3_prog_a9q:
+
+    @ r0 = ref1 <UWORD8 *>
+    @ r1 = ref2 <UWORD8 *>
+    @ r2 = ref3 <UWORD8 *>
+    @ r3 = src <UWORD8 *>
+    @ stack = RefBufferWidth <UWORD32>, CurBufferWidth <UWORD32>, psad <UWORD32 *>
+
+
+    stmfd sp!, {r4-r6, lr}
+
+    ldrd r4, r5, [sp, #16] @ load ref stride to r4, src stride to r5
+    mov r6, #14
+
+    @ Row 1
+    vld1.8 {d0, d1}, [r3], r5 @ load src Row 1
+    vld1.8 {d2, d3}, [r0], r4 @ load ref1 Row 1
+    vld1.8 {d4, d5}, [r1], r4 @ load ref2 Row 1
+    vabdl.u8 q8, d2, d0
+    vabdl.u8 q9, d3, d1
+    vld1.8 {d6, d7}, [r2], r4 @ load ref3 Row 1
+    vabdl.u8 q10, d4, d0
+    vabdl.u8 q11, d5, d1
+
+    @ Row 2
+    vld1.8 {d8, d9}, [r3], r5 @ load src Row 2
+    vabdl.u8 q12, d6, d0
+    vabdl.u8 q13, d7, d1
+    vld1.8 {d10, d11}, [r0], r4 @ load ref1 Row 2
+    vld1.8 {d12, d13}, [r1], r4 @ load ref2 Row 2
+    vabal.u8 q8, d10, d8
+    vabal.u8 q9, d11, d9
+    vld1.8 {d14, d15}, [r2], r4 @ load ref3 Row 2
+    vabal.u8 q10, d12, d8
+    vabal.u8 q11, d13, d9
+
+loop_sad3_prog:
+
+    @Row 1
+    vld1.8 {d0, d1}, [r3], r5 @ load src Row 1
+    vabal.u8 q12, d14, d8
+    vabal.u8 q13, d15, d9
+    vld1.8 {d2, d3}, [r0], r4 @ load ref1 Row 1
+    vld1.8 {d4, d5}, [r1], r4 @ load ref2 Row 1
+    vabal.u8 q8, d2, d0
+    vabal.u8 q9, d3, d1
+    vld1.8 {d6, d7}, [r2], r4 @ load ref3 Row 1
+    vabal.u8 q10, d4, d0
+    vabal.u8 q11, d5, d1
+
+    @Row 2
+    vld1.8 {d8, d9}, [r3], r5 @ load src Row 2
+    vabal.u8 q12, d6, d0
+    vabal.u8 q13, d7, d1
+    vld1.8 {d10, d11}, [r0], r4 @ load ref1 Row 2
+    subs r6, #2
+    vld1.8 {d12, d13}, [r1], r4 @ load ref2 Row 2
+    vabal.u8 q8, d10, d8
+    vabal.u8 q9, d11, d9
+    vld1.8 {d14, d15}, [r2], r4 @ load ref3 Row 2
+    vabal.u8 q10, d12, d8
+    vabal.u8 q11, d13, d9
+
+    bne loop_sad3_prog
+
+    vabal.u8 q12, d14, d8
+    vabal.u8 q13, d15, d9
+
+    @ Compute SAD
+
+    vadd.u16 q8, q8, q9 @ Q8 : sad_ref1
+    vadd.u16 q10, q10, q11 @ Q10 : sad_ref2
+    vadd.u16 q12, q12, q13 @ Q12 : sad_ref3
+
+    vadd.u16 d16, d16, d17
+    vadd.u16 d20, d20, d21
+    vadd.u16 d24, d24, d25
+
+    vpadd.u16 d16, d16, d20
+    vpadd.u16 d24, d24, d24
+
+    ldr r6, [sp, #24] @ loading psad to r6
+    vpaddl.u16 d16, d16
+    vpaddl.u16 d24, d24
+
+    vst1.64 {d16}, [r6]!
+    vst1.32 {d24[0]}, [r6]
+
+    ldmfd sp!, {r4-r6, pc}
+
+
+
+@/**
+@******************************************************************************
+@*
+@* @brief computes distortion (SAD) for sub-pel motion estimation
+@*
+@* @par Description
+@*  This function computes the SAD at all 8 half-pel points around the
+@*  full-pel position
+@*
+@* @param[out] pi4_sad
+@*  integer evaluated sad
+@*  pi4_sad[0] - half x
+@*  pi4_sad[1] - half x - 1
+@*  pi4_sad[2] - half y
+@*  pi4_sad[3] - half y - 1
+@*  pi4_sad[4] - half xy
+@*  pi4_sad[5] - half xy - 1
+@*  pi4_sad[6] - half xy - strd
+@*  pi4_sad[7] - half xy - 1 - strd
+@*
+@* @remarks
+@*
+@******************************************************************************
+@*/
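+
+@ A hedged C sketch of the eight candidate pointers compared against the
+@ source (pu1_half_x, pu1_half_y, pu1_half_xy stand for the half-pel planes
+@ the caller passes in and strd for their stride; the names are assumptions):
+@
+@     UWORD8 *cand[8] =
+@     {
+@         pu1_half_x,                /* half x               */
+@         pu1_half_x - 1,            /* half x - 1           */
+@         pu1_half_y,                /* half y               */
+@         pu1_half_y - strd,         /* half y, one row up   */
+@         pu1_half_xy,               /* half xy              */
+@         pu1_half_xy - 1,           /* half xy - 1          */
+@         pu1_half_xy - strd,        /* half xy - strd       */
+@         pu1_half_xy - 1 - strd,    /* half xy - 1 - strd   */
+@     };
+@     /* pi4_sad[i] = 16x16 SAD of pu1_src against cand[i] */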
+
+.text
+.p2align 2
+
+    .global ime_sub_pel_compute_sad_16x16_a9q
+
+ime_sub_pel_compute_sad_16x16_a9q:
+
+    stmfd sp!, {r4-r11, lr} @store register values to stack
+
+    ldr r9, [sp, #36]
+    ldr r10, [sp, #40]
+
+    sub r4, r1, #1 @ x left
+    sub r5, r2, r10 @ y top
+
+    sub r6, r3, #1 @ xy left
+    sub r7, r3, r10 @ xy top
+
+    sub r8, r7, #1 @ xy top-left
+    mov r11, #15
+
+    @for bringing buffer2 into cache..., dummy load instructions
+    @ LDR r12,[r1]
+    @ LDR r12,[sp,#12]
+
+    vld1.8 {d0, d1}, [r0], r9 @ src
+    vld1.8 {d2, d3}, [r5], r10 @ y top LOAD
+    vld1.8 {d4, d5}, [r7], r10 @ xy top LOAD
+    vld1.8 {d6, d7}, [r8], r10 @ xy top-left LOAD
+
+    vabdl.u8 q6, d2, d0 @ y top ABS1
+    vabdl.u8 q7, d4, d0 @ xy top ABS1
+    vld1.8 {d8, d9}, [r1], r10 @ x LOAD
+    vabdl.u8 q8, d6, d0 @ xy top-left ABS1
+    vabdl.u8 q9, d8, d0 @ x ABS1
+    vld1.8 {d10, d11}, [r4], r10 @ x left LOAD
+
+    vabal.u8 q6, d3, d1 @ y top ABS2
+    vabal.u8 q7, d5, d1 @ xy top ABS2
+    vld1.8 {d2, d3}, [r2], r10 @ y LOAD
+    vabal.u8 q8, d7, d1 @ xy top-left ABS2
+    vabal.u8 q9, d9, d1 @ x ABS2
+    vld1.8 {d4, d5}, [r3], r10 @ xy LOAD
+
+    vabdl.u8 q10, d10, d0 @ x left ABS1
+    vabdl.u8 q11, d2, d0 @ y ABS1
+    vld1.8 {d6, d7}, [r6], r10 @ xy left LOAD
+    vabdl.u8 q12, d4, d0 @ xy ABS1
+    vabdl.u8 q13, d6, d0 @ xy left ABS1
+
+loop_sub_pel_16x16:
+
+    vabal.u8 q10, d11, d1 @ x left ABS2
+    vabal.u8 q11, d3, d1 @ y ABS2
+    subs r11, #1
+    vabal.u8 q12, d5, d1 @ xy ABS2
+    vabal.u8 q13, d7, d1 @ xy left ABS2
+
+    vld1.8 {d0, d1}, [r0], r9 @ src
+    vabal.u8 q6, d2, d0 @ y top ABS1
+    vabal.u8 q7, d4, d0 @ xy top ABS1
+    vld1.8 {d8, d9}, [r1], r10 @ x LOAD
+    vabal.u8 q8, d6, d0 @ xy top-left ABS1
+    vabal.u8 q9, d8, d0 @ x ABS1
+    vld1.8 {d10, d11}, [r4], r10 @ x left LOAD
+
+    vabal.u8 q6, d3, d1 @ y top ABS2
+    vabal.u8 q7, d5, d1 @ xy top ABS2
+    vld1.8 {d2, d3}, [r2], r10 @ y LOAD
+    vabal.u8 q8, d7, d1 @ xy top-left ABS2
+    vabal.u8 q9, d9, d1 @ x ABS2
+    vld1.8 {d4, d5}, [r3], r10 @ xy LOAD
+
+    vabal.u8 q10, d10, d0 @ x left ABS1
+    vabal.u8 q11, d2, d0 @ y ABS1
+    vld1.8 {d6, d7}, [r6], r10 @ xy left LOAD
+    vabal.u8 q12, d4, d0 @ xy ABS1
+    vabal.u8 q13, d6, d0 @ xy left ABS1
+
+    bne loop_sub_pel_16x16
+
+    vabal.u8 q10, d11, d1 @ x left ABS2
+    vabal.u8 q11, d3, d1 @ y ABS2
+    vabal.u8 q12, d5, d1 @ xy ABS2
+    vabal.u8 q13, d7, d1 @ xy left ABS2
+
+    vadd.i16 d0, d18, d19 @ x
+    vadd.i16 d3, d12, d13 @ y top
+    vadd.i16 d6, d14, d15 @ xy top
+    vadd.i16 d5, d26, d27 @ xy left
+    vadd.i16 d1, d20, d21 @ x left
+    vadd.i16 d2, d22, d23 @ y
+    vadd.i16 d4, d24, d25 @ xy
+    vadd.i16 d7, d16, d17 @ xy top left
+
+    vpadd.i16 d0, d0, d1
+    vpadd.i16 d2, d2, d3
+    vpadd.i16 d4, d4, d5
+    vpadd.i16 d6, d6, d7
+
+    vpaddl.u16 d0, d0
+    vpaddl.u16 d2, d2
+    ldr r11, [sp, #44]
+    vpaddl.u16 d4, d4
+    vpaddl.u16 d6, d6
+
+    vst1.32 {d0}, [r11]!
+    vst1.32 {d2}, [r11]!
+    vst1.32 {d4}, [r11]!
+    vst1.32 {d6}, [r11]!
+
+    ldmfd sp!, {r4-r11, pc} @Restoring registers from stack
+
+
+
+@/**
+@******************************************************************************
+@*
+@* @brief computes distortion (SAD) between 2 16x16 blocks
+@*
+@* @par Description
+@*  This function computes SAD between 2 16x16 blocks. There is a provision
+@*  for early exit if the partially computed SAD exceeds the maximum allowed
+@*  SAD. To compute the distortion of the entire block, set i4_max_sad to
+@*  USHRT_MAX.
+@*
+@* @param[in] pu1_src
+@*  UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  UWORD8 pointer to the destination
+@*
+@* @param[in]  src_strd
+@*  integer source stride
+@*
+@* @param[in]  dst_strd
+@*  integer destination stride
+@*
+@* @param[in]  i4_max_sad
+@*  integer maximum allowed distortion
+@*
+@* @param[in] pi4_mb_distortion
+@*  integer evaluated sad
+@*
+@* @remarks
+@*
+@******************************************************************************
+@*/
+
+.text
+.p2align 2
+
+    .global ime_compute_sad_16x16_a9q
+
+ime_compute_sad_16x16_a9q:
+
+
+    @STMFD sp!,{r12,lr}
+    stmfd sp!, {r12, r14} @store register values to stack
+
+    @for bringing buffer2 into cache..., dummy load instructions
+    @ LDR r12,[r1]
+    @ LDR r12,[sp,#12]
+
+    vld1.8 {d4, d5}, [r0], r2
+    vld1.8 {d6, d7}, [r1], r3
+
+    mov r12, #14
+    vld1.8 {d8, d9}, [r0], r2
+    vabdl.u8 q0, d4, d6
+    vld1.8 {d10, d11}, [r1], r3
+    vabdl.u8 q1, d5, d7
+
+loop_sad_16x16:
+
+    vld1.8 {d4, d5}, [r0], r2
+    vabal.u8 q0, d8, d10
+    vld1.8 {d6, d7}, [r1], r3
+    vabal.u8 q1, d9, d11
+
+    vld1.8 {d8, d9}, [r0], r2
+    vabal.u8 q0, d4, d6
+    subs r12, #2
+    vld1.8 {d10, d11}, [r1], r3
+    vabal.u8 q1, d5, d7
+
+    bne loop_sad_16x16
+
+    vabal.u8 q0, d8, d10
+    vabal.u8 q1, d9, d11
+
+    vadd.i16 q0, q0, q1
+    vadd.i16 d0, d1, d0
+    ldr r12, [sp, #12]
+
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0
+    vst1.32 {d0[0]}, [r12]
+
+    ldmfd sp!, {r12, pc} @Restoring registers from stack
+
+
+@/*
+@//---------------------------------------------------------------------------
+@// Function Name : ime_calculate_sad4_prog_a9q()
+@//
+@// Detail Description : This function finds the SADs of the four one-pel
+@//    neighbours (left, right, top, bottom) of a progressive MB in one shot
+@//
+@// Platform : Cortex-A8/NEON
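+@//
+@// In C terms (illustrative sketch only), the four one-pel neighbours of
+@// temp_frame are compared against the same source rows in one pass:
+@//
+@//     UWORD8 *top   = temp_frame - RefBufferWidth;
+@//     UWORD8 *bot   = temp_frame + RefBufferWidth;
+@//     UWORD8 *left  = temp_frame - 1;
+@//     UWORD8 *right = temp_frame + 1;
+@//     for each of the 16 rows and 16 cols:
+@//         sad_top += ABS(buffer_ptr[col] - top[col]);
+@//         /* likewise sad_left, sad_right, sad_bot */
+@//     psad[0..3] receive the four accumulated SADs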
+@// +@//----------------------------------------------------------------------------- +@*/ + + .global ime_calculate_sad4_prog_a9q + +ime_calculate_sad4_prog_a9q: + @ r0 = temp_frame <UWORD8 *> + @ r1 = buffer_ptr <UWORD8 *> + @ r2 = RefBufferWidth <UWORD32> + @ r3 = CurBufferWidth <UWORD32> + @ stack = psad <UWORD32 *> {at 0x34} + + stmfd sp!, {r4-r7, lr} + + @UWORD8 *left_ptr = temp_frame - 1; + @UWORD8 *right_ptr = temp_frame + 1; + @UWORD8 *top_ptr = temp_frame - RefBufferWidth; + @UWORD8 *bot_ptr = temp_frame + RefBufferWidth; + + mov r7, #14 + sub r4, r0, #0x01 @r4 = left_ptr + add r5, r0, #0x1 @r5 = right_ptr + sub r6, r0, r2 @r6 = top_ptr + add r0, r0, r2 @r0 = bot_ptr + @r1 = buffer_ptr + + @D0:D1 : buffer + @D2:D3 : top + @D4:D5 : left + @D6:D7 : right + @D8:D9 : bottom + + @Row 1 + vld1.8 {d0, d1}, [r1], r3 @ load src Row 1 + vld1.8 {d2, d3}, [r6], r2 @ load top Row 1 + vld1.8 {d4, d5}, [r4], r2 @ load left Row 1 + + vabdl.u8 q5, d2, d0 + vld1.8 {d6, d7}, [r5], r2 @ load right Row 1 + vabdl.u8 q6, d3, d1 + + vabdl.u8 q7, d0, d4 + vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 1 + vabdl.u8 q8, d1, d5 + + @Row 2 + vabdl.u8 q9, d0, d6 + vld1.8 {d26, d27}, [r1], r3 @ load src Row 2 + vabdl.u8 q10, d1, d7 + + vabdl.u8 q11, d0, d8 + vld1.8 {d2, d3}, [r6], r2 @ load top Row 2 + vabdl.u8 q12, d1, d9 + +loop_sad4_prog: + + vabal.u8 q5, d26, d2 + vld1.8 {d4, d5}, [r4], r2 @ load left Row 2 + vabal.u8 q6, d27, d3 + + vabal.u8 q7, d26, d4 + vld1.8 {d6, d7}, [r5], r2 @ load right Row 2 + vabal.u8 q8, d27, d5 + + vabal.u8 q9, d26, d6 + vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 2 + vabal.u8 q10, d27, d7 + + @Row 1 + vabal.u8 q11, d26, d8 + vld1.8 {d0, d1}, [r1], r3 @ load src Row 1 + vabal.u8 q12, d27, d9 + + vld1.8 {d2, d3}, [r6], r2 @ load top Row 1 + subs r7, #2 + vld1.8 {d4, d5}, [r4], r2 @ load left Row 1 + + vabal.u8 q5, d0, d2 + + vld1.8 {d6, d7}, [r5], r2 @ load right Row 1 + vabal.u8 q6, d1, d3 + + vabal.u8 q7, d0, d4 + vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 1 + vabal.u8 q8, d1, d5 + + @Row 2 + vabal.u8 q9, d0, d6 + vld1.8 {d26, d27}, [r1], r3 @ load src Row 2 + vabal.u8 q10, d1, d7 + + vabal.u8 q11, d0, d8 + vld1.8 {d2, d3}, [r6], r2 @ load top Row 2 + vabal.u8 q12, d1, d9 + + bne loop_sad4_prog + + vabal.u8 q5, d26, d2 + vld1.8 {d4, d5}, [r4], r2 @ load left Row 2 + vabal.u8 q6, d27, d3 + + vabal.u8 q7, d26, d4 + vld1.8 {d6, d7}, [r5], r2 @ load right Row 2 + vabal.u8 q8, d27, d5 + + vabal.u8 q9, d26, d6 + vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 2 + vabal.u8 q10, d27, d7 + + vabal.u8 q11, d26, d8 + vabal.u8 q12, d27, d9 + + @;Q5:Q6 : sad_top + @;Q7:Q8 : sad_left + @;Q9:Q10 : sad_right + @;Q11:Q12 : sad_bot + + vadd.u16 q5, q5, q6 + vadd.u16 q7, q7, q8 + vadd.u16 q9, q9, q10 + vadd.u16 q11, q11, q12 + + @; Free :- + @; Q6,Q8,Q10,Q12 + + @;Q5 -> D10:D11 + @;Q7 -> D14:D15 + @;Q9 -> D18:D19 + @;Q11 -> D22:D23 + + vadd.u16 d10, d10, d11 + vadd.u16 d14, d14, d15 + vadd.u16 d18, d18, d19 + vadd.u16 d22, d22, d23 + + @;D10 : sad_top + @;D14 : sad_left + @;D18 : sad_right + @;D22 : sad_bot + + + vpaddl.u16 d11, d10 + vpaddl.u16 d15, d14 + vpaddl.u16 d19, d18 + vpaddl.u16 d23, d22 + + @;D11 : sad_top + @;D15 : sad_left + @;D19 : sad_right + @;D23 : sad_bot + + vpaddl.u32 d10, d11 + vpaddl.u32 d22, d23 + vpaddl.u32 d14, d15 + vpaddl.u32 d18, d19 + + @;D10 : sad_top + @;D14 : sad_left + @;D18 : sad_right + @;D22 : sad_bot + + ldr r4, [sp, #20] @;Can be rearranged + + vsli.64 d10, d22, #32 + vsli.64 d14, d18, #32 + + vst1.64 {d14}, [r4]! + vst1.64 {d10}, [r4]! 
+
+    ldmfd sp!, {r4-r7, pc}
+
+
+
+
+@*****************************************************************************
+@*
+@* Function Name : ime_compute_satqd_16x16_lumainter_a9q
+@* Description : This function computes SAD for a 16x16 block.
+@ : It also computes if any 4x4 block will have a nonzero coefficient after transform and quant
+@
+@ Arguments : R0 :pointer to src buffer
+@ R1 :pointer to est buffer
+@ R2 :source stride
+@ R3 :est stride
+@ STACK :Threshold,distortion,is_nonzero
+@*
+@* Values Returned : NONE
+@*
+@* Register Usage : R0-R11
+@* Stack Usage :
+@* Cycles : Around
+@* Interruptibility : Interruptible
+@*
+@* Known Limitations
+@* \Assumptions :
+@*
+@* Revision History :
+@* DD MM YYYY Author(s) Changes
+@* 14 04 2014 Harinarayanan K K First version
+@*
+@*****************************************************************************
+    .global ime_compute_satqd_16x16_lumainter_a9q
+ime_compute_satqd_16x16_lumainter_a9q:
+    @R0 :pointer to src buffer
+    @R1 :pointer to est buffer
+    @R2 :Source stride
+    @R3 :Pred stride
+    @R4 :Threshold pointer
+    @R5 :Distortion, ie SAD
+    @R6 :is nonzero
+
+    push {r4-r12, lr} @push all the variables first
+    @ADD SP,SP,#40 ;decrement stack pointer,to accommodate two variables
+    ldr r4, [sp, #40] @load the threshold address
+
+    mov r8, #8 @Number of 4x8 blocks to be processed
+    mov r10, #0 @Sad
+    mov r7, #0 @Nonzero info
+    @----------------------------------------------------
+
+    vld1.u8 d30, [r0], r2 @I load 8 pix src row 1
+
+    vld1.u8 d31, [r1], r3 @I load 8 pix pred row 1
+
+    vld1.u8 d28, [r0], r2 @I load 8 pix src row 2
+
+    vld1.u8 d29, [r1], r3 @I load 8 pix pred row 2
+
+    vld1.u8 d26, [r0], r2 @I load 8 pix src row 3
+    vabdl.u8 q0, d30, d31 @I Abs diff r1 blk 12
+
+    vld1.u8 d27, [r1], r3 @I load 8 pix pred row 3
+
+    vld1.u8 d24, [r0], r2 @I load 8 pix src row 4
+
+    vld1.u8 d25, [r1], r3 @I load 8 pix pred row 4
+    vabdl.u8 q1, d28, d29 @I Abs diff r2 blk 12
+
+    vld1.u16 {q11}, [r4] @I load the threshold
+    vabdl.u8 q2, d26, d27 @I Abs diff r3 blk 12
+
+    vabdl.u8 q3, d24, d25 @I Abs diff r4 blk 12
+
+
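+@ Each core_loop iteration folds one pair of 4x4 blocks and decides whether
+@ all of their quantised coefficients would be zero. A hedged C sketch of
+@ the per-block predicate (s1..s4 are the folded sums computed below, ls[]
+@ the eight derived terms, and the 9-entry threshold table is in q11/r4):
+@
+@     sad = s1 + s2 + s3 + s4;
+@     nonzero  = (thresh[8] <= sad);                    /* DC term  */
+@     for(i = 0; i < 8; i++)                            /* AC terms */
+@         nonzero |= (thresh[i] <= (sad << 1) - ls[i]);
+@
+@ Once any block is flagged nonzero the transform cannot be skipped, so the
+@ code drops to compute_sad_only and accumulates plain SAD for the rest.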
+core_loop:
+    @S1 S2 S3 S4 A1 A2 A3 A4
+    @S5 S6 S7 S8 A5 A6 A7 A8
+    @S9 S10 S11 S12 A9 A10 A11 A12
+    @S13 S14 S15 S16 A13 A14 A15 A16
+    ands r11, r8, #1 @II See if we are at an even or odd block
+    vadd.u16 q4 , q0, q3 @I Add r1 r4
+    lsl r11, r2, #2 @II Move back src 4 rows
+
+    subeq r0, r0, r11 @II Move back src 4 rows if we are at an even block
+    vadd.u16 q5 , q1, q2 @I Add r2 r3
+    addeq r0, r0, #8 @II Move src 8 cols forward if we are at an even block
+
+    lsl r11, r3, #2 @II Move back pred 4 rows
+    vtrn.16 d8 , d10 @I transpose 1
+    subeq r1, r1, r11 @II Move back pred 4 rows if we are at an even block
+
+    addeq r1, r1, #8 @II Move pred 8 cols forward if we are at an even block
+    vtrn.16 d9 , d11 @I transpose 2
+    subne r0, r0, #8 @II Move src 8 cols back for odd blocks
+
+    subne r1, r1, #8 @II Move pred 8 cols back for odd blocks
+    vtrn.32 d10, d11 @I transpose 4
+
+
+    vtrn.32 d8 , d9 @I transpose 3
+    vswp d10, d11 @I rearrange so that q4 and q5 add properly
+    @D8 S1 S4 A1 A4
+    @D9 S2 S3 A2 A3
+    @D11 S1 S4 A1 A4
+    @D10 S2 S3 A2 A3
+
+    vadd.s16 q6, q4, q5 @I Get s1 s4
+    vld1.u8 d30, [r0], r2 @II load first 8 pix src row 1
+
+    vtrn.s16 d12, d13 @I Get s2 s3
+    @D12 S1 S4 A1 A4
+    @D13 S2 S3 A2 A3
+
+    vshl.s16 q7, q6 , #1 @I si = si<<1
+    vld1.u8 d31, [r1], r3 @II load first 8 pix pred row 1
+
+    vpadd.s16 d16, d12, d13 @I (s1 + s4) (s2 + s3)
+    vld1.u8 d28, [r0], r2 @II load first 8 pix src row 2
+    @ D16 S14 A14 S23 A23
+    vrev32.16 d0, d16 @I
+    vuzp.s16 d16, d0 @I
+    @D16 S14 S23 A14 A23
+    vadd.s16 d17, d12, d13 @I (s1 + s2) (s3 + s4)
+    vld1.u8 d29, [r1], r3 @II load first 8 pix pred row 2
+    @D17 S12 S34 A12 A34
+
+    vrev32.16 q9, q7 @I Rearrange si's
+    @Q9 Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2
+
+    @D12 S1 S4 A1 A4
+    @D19 Z3 Z2 Y3 Y2
+    vsub.s16 d8, d12, d19 @I (s1 - (s3<<1)) (s4 - (s2<<1))
+    vld1.u8 d26, [r0], r2 @II load first 8 pix src row 3
+    @D13 S2 S3 A2 A3
+    @D18 Z4 Z1 Y4 Y1
+    vsub.s16 d9, d13, d18 @I (s2 - (s4<<1)) (s3 - (s1<<1))
+    vld1.u8 d27, [r1], r3 @II load first 8 pix pred row 3
+    @Q10 S8 S5 A8 A5 S7 S4 A7 A4
+
+    @D16 S14 S23 A14 A23
+    vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4
+    vld1.u8 d24, [r0], r2 @II load first 8 pix src row 4
+    @D22 SAD1 SAD2 junk junk
+
+
+    @Q8 S2 S1 A2 A1 S6 S3 A6 A3
+    @Q10 S8 S5 A8 A5 S7 S4 A7 A4
+    vtrn.32 q8, q4 @I Rearrange to keep the ls of each block together
+    @Q8 S2 S1 S8 S5 S6 S3 S7 S4
+    @Q10 A2 A1 A8 A5 A6 A3 A7 A4
+
+
+    ldrh r11, [r4, #16] @I Load the threshold for the DC val blk 1
+    vdup.s16 q6, d10[0] @I Get the sad blk 1
+    vabdl.u8 q0, d30, d31 @II Abs diff r1 blk 12
+
+    vshl.s16 q7, q6, #1 @I sad_2 = sad_1<<1
+    vmov.s16 r9, d10[0] @I Get the sad for block 1
+
+    vsub.s16 q9, q7, q8 @I Subtract the lss
+    vmov.s16 r5, d10[1] @I Get the sad for block 2
+
+    vcle.s16 q7, q11, q9 @I Compare the thresholds with the lss
+    vld1.u8 d25, [r1], r3 @II load first 8 pix pred row 4
+
+    vdup.s16 q15, d10[1] @I Get the sad blk 2
+    vabdl.u8 q1, d28, d29 @II Abs diff r2 blk 12
+
+
+    vshl.s16 q14, q15, #1 @I sad_2 = sad_1<<1
+    vsub.s16 q3, q14, q4 @I Subtract the lss
+    vcle.s16 q15, q11, q3 @I Compare the thresholds with the lss
+
+    ADD R10, R10, R9 @I Add to the global sad blk 1
+    vtrn.u8 q15, q7 @I get all comparison bits to one reg
+    vabdl.u8 q2, d26, d27 @II Abs diff r3 blk 12
+
+    ADD R10, R10, R5 @I Add to the global sad blk 2
+    vshr.u8 q14, q15, #7 @I Shift the bits so that no overflow occurs
+    cmp r11, r9 @I Compare with the threshold blk 1
+
+    movle r7, #0xf @I If not met, mark it by moving a non-zero value to R7 blk 1
+    vadd.u8 d28, d28, d29 @I Add the bits
+    cmp r11, r5 @I Compare with the threshold blk 2
+
+    movle r7, #0xf @I If not met, mark it by moving a non-zero value to R7 blk 2
+    vpadd.u8 d28, d28, d29 @I Add the bits
+
+    vmov.u32 r11, d28[0] @I A set bit now represents an unsatisfied condition, so store it in r11
+    vabdl.u8 q3, d24, d25 @II Abs diff r4 blk 12
+
+    orr r7, r7, r11 @I Accumulate the nonzero flags in r7
+
+
+    sub r8, r8, #1 @I Decrement the block count
+
+    cmp r7, #0 @I If we have at least one non-zero block
+    bne compute_sad_only @I if a non-zero block is there, compute only SAD from now on
+
+    cmp r8, #1 @I See if we are at the last block
+    bne core_loop @I If the blocks are zero, continue the satqd
+
+
+    @EPILOGUE for the core loop
+    @S1 S2 S3 S4 A1 A2 A3 A4
+    @S5 S6 S7 S8 A5 A6 A7 A8
+    @S9 S10 S11 S12 A9 A10 A11 A12
+    @S13 S14 S15 S16 A13 A14 A15 A16
+    vadd.u16 q4 , q0, q3 @Add r1 r4
+    vadd.u16 q5 , q1, q2 @Add r2 r3
+    @D8 S1 S2 S2 S1
+    @D10 S4 S3 S3 S4
+    @D9 A1 A2 A2 A1
+    @D11 A4 A3 A3 A4
+    vtrn.16 d8 , d10 @I transpose 1
+    vtrn.16 d9 , d11 @I transpose 2
+    vtrn.32 d8 , d9 @I transpose 3
+    vtrn.32 d10, d11 @I transpose 4
+
+    vswp d10, d11 @I rearrange so that q4 and q5 add properly
+    @D8 S1 S4 A1 A4
+    @D9 S2 S3 A2 A3
+    @D11 S1 S4 A1 A4
+    @D10 S2 S3 A2 A3
+    vadd.s16 q6, q4, q5 @Get s1 s4
+    vtrn.s16 d12, d13 @Get s2 s3
+    @D12 S1 S4 A1 A4
+    @D13 S2 S3 A2 A3
+
+    vshl.s16 q7, q6 , #1 @si = si<<1
+    vmov.s16 r9, d10[0] @Get the sad for block 1
+
+    vpadd.s16 d16, d12, d13 @(s1 + s4) (s2 + s3)
+    vmov.s16 r5, d10[1] @Get the sad for block 2
+    @D16 S14 A14 S23 A23
+    vrev32.16 d30, d16 @
+    vuzp.s16 d16, d30 @
+
+    @D16 S14 S23 A14 A23
+    vadd.s16 d17, d12, d13 @(s1 + s2) (s3 + s4)
+    @D17 S12 S34 A12 A34
+
+    vrev32.16 q9, q7 @Rearrange si's
+    @Q9 Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2
+
+    @D12 S1 S4 A1 A4
+    @D19 Z3 Z2 Y3 Y2
+    vsub.s16 d8, d12, d19 @(s1 - (s3<<1)) (s4 - (s2<<1))
+    @D13 S2 S3 A2 A3
+    @D18 Z4 Z1 Y4 Y1
+    vsub.s16 d9, d13, d18 @(s2 - (s4<<1)) (s3 - (s1<<1))
+    @Q10 S8 S5 A8 A5 S7 S4 A7 A4
+
+    @D16 S14 S23 A14 A23
+    vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4
+    @D22 SAD1 SAD2 junk junk
+    vmov.u16 r9, d10[0] @Get the sad for block 1
+    vmov.u16 r5, d10[1] @Get the sad for block 2
+
+    @Q8 S2 S1 A2 A1 S6 S3 A6 A3
+    @Q10 S8 S5 A8 A5 S7 S4 A7 A4
+    ldrh r11, [r4, #16] @Load the threshold for the DC val blk 1
+    vtrn.32 q8, q4 @Rearrange to keep the ls of each block together
+    ADD R10, R10, R9 @Add to the global sad blk 1
+
+    @Q8 S2 S1 S8 S5 S6 S3 S7 S4
+    @Q10 A2 A1 A8 A5 A6 A3 A7 A4
+
+    vld1.u16 {q11}, [r4] @load the threshold
+    ADD R10, R10, R5 @Add to the global sad blk 2
+
+    vdup.u16 q6, d10[0] @Get the sad blk 1
+
+    cmp r11, r9 @Compare with the threshold blk 1
+    vshl.u16 q7, q6, #1 @sad_2 = sad_1<<1
+
+    vsub.s16 q9, q7, q8 @Subtract the lss
+
+    vcle.s16 q15, q11, q9 @Compare the thresholds with the lss
+    movle r7, #0xf @If not met, mark it by moving a non-zero value to R7 blk 1
+
+    cmp r11, r5 @Compare with the threshold blk 2
+    vdup.u16 q14, d10[1] @Get the sad blk 2
+
+    vshl.u16 q13, q14, #1 @sad_2 = sad_1<<1
+    vsub.s16 q12, q13, q4 @Subtract the lss
+    vcle.s16 q14, q11, q12 @Compare the thresholds with the lss
+    movle r7, #0xf @If not met, mark it by moving a non-zero value to R7 blk 2
+
+    vtrn.u8 q14, q15 @get all comparison bits to one reg
+    vshr.u8 q14, q14, #7 @Shift the bits so that no overflow occurs
+    vadd.u8 d28, d28, d29 @Add the bits
+    vpadd.u8 d28, d28, d29 @Add the bits
+    vmov.u32 r11, d28[0] @A set bit now represents an unsatisfied condition, so store it in r11
+    orr r7, r7, r11 @Accumulate the nonzero flags in r7
+
+    b funcend_sad_16x16 @Since all blocks are processed now, go to the end
+
+compute_sad_only: @This block computes SAD only, so it is lighter
+    @It will start processing at an odd block,
+    @compute the sad for the odd block,
+    @and then process two blocks at a time.
+    @The counter is r8, hence r8 blocks will be processed
+
+    and r11, r8, #1 @Get the last bit of the counter
+    cmp r11, #0 @See if we are at an even or odd block
+    @if the blk is even we just have to set the pointer to the
+    @start of the current row
+
+    lsleq r11, r2, #2 @I Move back src 4 rows
+    subeq r0, r0, r11 @I Move back src 4 rows if we are at an even block
+
+    lsleq r11, r3, #2 @I Move back pred 4 rows
+    subeq r1, r1, r11 @I Move back pred 4 rows if we are at an even block
+    @ADDEQ R8,R8,#2 ;Inc counter
+    beq skip_odd_blk @If the blk is even, skip the single odd block
+
+
+    vadd.u16 q4, q0, q1 @Add SAD of row1 and row2
+    vadd.u16 q5, q2, q3 @Add SAD of row3 and row4
+    vadd.u16 q6, q4, q5 @Add SAD of rows 1-4
+    vadd.u16 d14, d12, d13 @Add blk1 and blk2
+    vpadd.u16 d16, d14, d15 @Add col 1-2 and 3-4
+    vpadd.u16 d18, d16, d17 @Add col 12-34
+
+    vmov.u16 r9, d18[0] @Move sad to arm
+    ADD R10, R10, R9 @Add to the global sad
+
+    sub r8, r8, #1 @Dec the counter
+    cmp r8, #0 @See if we processed the last block
+    beq funcend_sad_16x16 @if we processed the last block goto the end of the function
+
+    sub r0, r0, #8 @Since we processed an odd block move back src by 8 cols
+    sub r1, r1, #8 @Since we processed an odd block move back pred by 8 cols
+
+skip_odd_blk:
+
+    vmov.s16 q0, #0 @Initialize the accumulator
+    vmov.s16 q1, #0 @Initialize the accumulator
+
+    vld1.u8 {q15}, [r0], r2 @load src r1
+    vld1.u8 {q14}, [r1], r3 @load pred r1
+
+    vld1.u8 {q13}, [r0], r2 @load src r2
+    vld1.u8 {q12}, [r1], r3 @load pred r2
+
+    vld1.u8 {q11}, [r0], r2 @load src r3
+    vld1.u8 {q10}, [r1], r3 @load pred r3
+
+    vld1.u8 {q9}, [r0], r2 @load src r4
+    vld1.u8 {q8}, [r1], r3 @load pred r4
+
+    cmp r8, #2
+    beq sad_epilouge
+
+sad_loop:
+
+    vabal.u8 q0, d30, d28 @I accumulate Abs diff R1
+    vabal.u8 q1, d31, d29 @I accumulate Abs diff R1
+
+    vld1.u8 {q15}, [r0], r2 @II load r1 src
+    vabal.u8 q0, d26, d24 @I accumulate Abs diff R2
+
+    vld1.u8 {q14}, [r1], r3 @II load r1 pred
+    vabal.u8 q1, d27, d25 @I accumulate Abs diff R2
+
+    vld1.u8 {q13}, [r0], r2 @II load r2 src
+    vabal.u8 q0, d22, d20 @I accumulate Abs diff R3
+
+    vld1.u8 {q12}, [r1], r3 @II load r2 pred
+    vabal.u8 q1, d23, d21 @I accumulate Abs diff R3
+
+    vld1.u8 {q11}, [r0], r2 @II load r3 src
+    vabal.u8 q0, d18, d16 @I accumulate Abs diff R4
+
+
+    sub r8, r8, #2 @Since we process 16 pix at a time, dec by 2
+    vld1.u8 {q10}, [r1], r3 @II load r3 pred
+    vabal.u8 q1, d19, d17 @I accumulate Abs diff R4
+
+    cmp r8, #2 @Check if this is the last loop
+    vld1.u8 {q9}, [r0], r2 @II load r4 src
+    vld1.u8 {q8}, [r1], r3 @II load r4 pred
+
+    bne sad_loop @Go back to SAD computation
+
+sad_epilouge:
+    vabal.u8 q0, d30, d28 @Accumulate Abs diff R1
+    vabal.u8 q1, d31, d29 @Accumulate Abs diff R1
+
+    vabal.u8 q0, d26, d24 @Accumulate Abs diff R2
+    vabal.u8 q1, d27, d25 @Accumulate Abs diff R2
+
+    vabal.u8 q0, d22, d20 @Accumulate Abs diff R3
+    vabal.u8 q1, d23, d21 @Accumulate Abs diff R3
+
+    vabal.u8 q0, d18, d16 @Accumulate Abs diff R4
+    vabal.u8 q1, d19, d17 @Accumulate Abs diff R4
+
+    vadd.u16 q2, q0, q1 @Add the two accumulators
+    vadd.u16 d6, d4, d5 @Add the two blk sads
+    vpadd.u16 d8, d6, d7 @Add col 1-2 and 3-4 sad
+    vpadd.u16 d10, d8, d9 @Add col 12-34 sad
+
+    vmov.u16 r9, d10[0] @move SAD to ARM
+    ADD R10, R10, R9 @Add to the global sad
+
+funcend_sad_16x16: @End of function processing
+    ldr r5, [sp, #44]
+    ldr r6, [sp, #48]
+
+    str r7, [r6] @Store the is_nonzero value
+    str r10, [r5] @Store the sad
+
+    @SUB SP,SP,#40
+    pop {r4-r12, pc}
+
+
diff --git a/encoder/arm/ime_platform_macros.h b/encoder/arm/ime_platform_macros.h
new file mode 100755
index 0000000..0f5b2f2
--- /dev/null
+++ b/encoder/arm/ime_platform_macros.h
@@ -0,0 +1,51 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+*  ime_platform_macros.h
+*
+* @brief
+*  Platform-specific macro definitions used in the codec
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IME_PLATFORM_MACROS_H_
+#define _IME_PLATFORM_MACROS_H_
+
+/*****************************************************************************/
+/* Function macro definitions                                                */
+/*****************************************************************************/
+
+#define USADA8(src,est,sad) \
+    sad += ABS(src[0]-est[0]) + \
+           ABS(src[1]-est[1]) + \
+           ABS(src[2]-est[2]) + \
+           ABS(src[3]-est[3])
+
+
+#endif /* _IME_PLATFORM_MACROS_H_ */
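A minimal usage sketch of the USADA8 fallback macro (illustrative; ABS is
defined locally here to stand in for the codec's own absolute-value macro,
and the buffers are hypothetical):

    #include <stdio.h>

    #define ABS(x) ((x) < 0 ? (-(x)) : (x))    /* stand-in for the codec's ABS */

    #define USADA8(src,est,sad) \
        sad += ABS(src[0]-est[0]) + \
               ABS(src[1]-est[1]) + \
               ABS(src[2]-est[2]) + \
               ABS(src[3]-est[3])

    int main(void)
    {
        unsigned char src[4] = {10, 20, 30, 40};
        unsigned char est[4] = {12, 18, 30, 45};
        int sad = 0;

        /* accumulates |10-12| + |20-18| + |30-30| + |40-45| = 9 */
        USADA8(src, est, sad);
        printf("sad = %d\n", sad);
        return 0;
    }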