author     Hamsalekha S <hamsalekha.s@ittiam.com>    2015-03-13 21:24:58 +0530
committer  Hamsalekha S <hamsalekha.s@ittiam.com>    2015-04-02 15:59:02 +0530
commit     8d3d303c7942ced6a987a52db8977d768dc3605f (patch)
tree       cc806c96794356996b13ba9970941d0aed74a97e /common/arm
parent     3956d913d37327dcb340f836e604b04bd478b158 (diff)
Initial version
Change-Id: I7efe9a589cd24edf86e8d086b40c27cbbf8b4017
Diffstat (limited to 'common/arm')
30 files changed, 15428 insertions, 0 deletions
diff --git a/common/arm/ih264_arm_memory_barrier.s b/common/arm/ih264_arm_memory_barrier.s new file mode 100755 index 0000000..523218f --- /dev/null +++ b/common/arm/ih264_arm_memory_barrier.s @@ -0,0 +1,77 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@******************************************************************************* +@* @file +@* ih264_arm_memory_barrier.s +@* +@* @brief +@* Contains function definitions for data synchronization. +@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* +@* +@* @remarks +@* None +@* +@******************************************************************************* + +.text +.p2align 2 + + +@***************************************************************************** +@* +@* Function Name : ih264_arm_dsb +@* Description : Adds DSB +@* Revision History : +@* DD MM YYYY Author(s) Changes +@* 03 07 2008 100355 First version +@* +@***************************************************************************** + + .global ih264_arm_dsb +ih264_arm_dsb: + dsb + bx lr + + + +@***************************************************************************** +@* +@* Function Name : ih264_arm_dmb +@* Description : Adds DMB +@* Revision History : +@* DD MM YYYY Author(s) Changes +@* 03 07 2008 100355 First version +@* +@***************************************************************************** + + .global ih264_arm_dmb + +ih264_arm_dmb: + dmb + bx lr + + + diff --git a/common/arm/ih264_deblk_chroma_a9.s b/common/arm/ih264_deblk_chroma_a9.s new file mode 100755 index 0000000..66102a7 --- /dev/null +++ b/common/arm/ih264_deblk_chroma_a9.s @@ -0,0 +1,1337 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +@*/ +@/*****************************************************************************/ +@/* */ +@/* File Name : ih264_deblk_chroma_a9.s */ +@/* */ +@/* Description : Contains function definitions for deblocking luma */ +@/* edge. Functions are coded in NEON assembly and can */ +@/* be compiled using ARM RVDS. */ +@/* */ +@/* List of Functions : ih264_deblk_chroma_vert_bs4_bp_a9() */ +@/* ih264_deblk_chroma_vert_bslt4_bp_a9() */ +@/* ih264_deblk_chroma_horz_bs4_bp_a9() */ +@/* ih264_deblk_chroma_horz_bslt4_bp_a9() */ +@/* ih264_deblk_chroma_vert_bs4_mbaff_bp_a9() */ +@/* ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9() */ +@/* ih264_deblk_chroma_vert_bs4_a9() */ +@/* ih264_deblk_chroma_vert_bslt4_a9() */ +@/* ih264_deblk_chroma_horz_bs4_a9() */ +@/* ih264_deblk_chroma_horz_bslt4_a9() */ +@/* ih264_deblk_chroma_vert_bs4_mbaff_a9() */ +@/* ih264_deblk_chroma_vert_bslt4_mbaff_a9() */ +@/* */ +@/* Issues / Problems : None */ +@/* */ +@/* Revision History : */ +@/* */ +@/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +@/* 28 11 2013 Ittiam Draft */ +@/* 05 01 2015 Kaushik Added double-call functions for */ +@/* Senthoor vertical deblocking, and high */ +@/* profile functions. */ +@/* */ +@/*****************************************************************************/ + + +.text +.p2align 2 + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block horizontal edge when the +@* boundary strength is set to 4 +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_horz_bs4_bp_a9 + +ih264_deblk_chroma_horz_bs4_bp_a9: + + stmfd sp!, {r4, lr} @ + vpush {d8 - d15} + sub r0, r0, r1, lsl #1 @R0 = uc_edgePixel pointing to p1 of chroma + vld2.8 {d6, d7}, [r0], r1 @D6 = p1u , D7 = p1v + mov r4, r0 @Keeping a backup of the pointer p0 of chroma + vld2.8 {d4, d5}, [r0], r1 @D4 = p0u , D5 = p0v + vdup.8 q10, r2 @Q10 contains alpha + vld2.8 {d0, d1}, [r0], r1 @D0 = q0u , D1 = q0v + vaddl.u8 q4, d6, d0 @ + vaddl.u8 q5, d7, d1 @Q4,Q5 = q0 + p1 + vmov.i8 d31, #2 @ + vld2.8 {d2, d3}, [r0] @D2 = q1u , D3 = q1v + vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0) + vmlal.u8 q4, d2, d31 @ + vmlal.u8 q5, d3, d31 @Q5,Q4 = (X2(q1U) + q0U + p1U) + vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0) + vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0) + vaddl.u8 q7, d4, d2 @ + vaddl.u8 q14, d5, d3 @Q14,Q7 = P0 + Q1 + vdup.8 q8, r3 @Q8 contains beta + vmlal.u8 q7, d6, d31 @ + vmlal.u8 q14, d7, d31 @Q14,Q7 = (X2(p1U) + p0U + q1U) + vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha ) + vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta ) + vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta ) + vrshrn.u16 d8, q4, #2 @ + vrshrn.u16 d9, q5, #2 @Q4 = (X2(q1U) + q0U + p1U + 2) >> 2 + vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) + vrshrn.u16 d10, q7, #2 @ + vrshrn.u16 d11, q14, #2 @Q5 = (X2(p1U) + p0U + q1U + 2) >> 2 + vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta 
) + vbit q5, q2, q9 @ + vbit q4, q0, q9 @ + vst2.8 {d10, d11}, [r4], r1 @ + vst2.8 {d8, d9}, [r4] @ + vpop {d8 - d15} + ldmfd sp!, {r4, pc} @ + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block vertical edge when the +@* boundary strength is set to 4 +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_vert_bs4_bp_a9 + +ih264_deblk_chroma_vert_bs4_bp_a9: + + stmfd sp!, {r12, r14} + vpush {d8 - d15} + sub r0, r0, #4 @point r0 to p1u of row0. + mov r12, r0 @keep a back up of r0 for buffer write + + vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 + vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1 + vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1 + vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1 + + vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 + vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1 + vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1 + vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1 + + vdup.8 q11, r2 @Q4 = alpha + vdup.8 q12, r3 @Q5 = beta + vmov.i8 d31, #2 + + vabd.u8 q4, q1, q2 @|p0-q0| + vabd.u8 q5, q3, q2 @|q1-q0| + vabd.u8 q6, q0, q1 @|p1-p0| + vaddl.u8 q7, d2, d6 + vaddl.u8 q8, d3, d7 @(p0 + q1) + vclt.u8 q4, q4, q11 @|p0-q0| < alpha ? + vclt.u8 q5, q5, q12 @|q1-q0| < beta ? + vclt.u8 q6, q6, q12 @|p1-p0| < beta ? + vmlal.u8 q7, d0, d31 + vmlal.u8 q8, d1, d31 @2*p1 + (p0 + q1) + vaddl.u8 q9, d0, d4 + vaddl.u8 q10, d1, d5 @(p1 + q0) + vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta + vmlal.u8 q9, d6, d31 + vmlal.u8 q10, d7, d31 @2*q1 + (p1 + q0) + + vrshrn.i16 d14, q7, #2 + vrshrn.i16 d15, q8, #2 @(2*p1 + (p0 + q1) + 2) >> 2 + vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta + vrshrn.i16 d18, q9, #2 + vrshrn.i16 d19, q10, #2 @(2*q1 + (p1 + q0) + 2) >> 2 + + vbit q1, q7, q4 + vbit q2, q9, q4 + + vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1 + vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1 + vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1 + vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1 + + vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1 + vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1 + vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1 + vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1 + vpop {d8 - d15} + ldmfd sp!, {r12, pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block horizontal edge for cases where the +@* boundary strength is less than 4 +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. 
+@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @param[in] sp(0) - u4_bs +@* Packed Boundary strength array +@* +@* @param[in] sp(4) - pu1_cliptab +@* tc0_table +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_horz_bslt4_bp_a9 + +ih264_deblk_chroma_horz_bslt4_bp_a9: + + stmfd sp!, {r4-r6, lr} @ + + ldrd r4, r5, [sp, #0x10] @r4 = u4_bs , r5 = pu1_cliptab + vpush {d8 - d15} + sub r0, r0, r1, lsl #1 @R0 = uc_edgePixelU pointing to p2 of chroma U + rev r4, r4 @ + vmov.32 d12[0], r4 @d12[0] = ui_Bs + vld1.32 d16[0], [r5] @D16[0] contains cliptab + vld2.8 {d6, d7}, [r0], r1 @Q3=p1 + vtbl.8 d14, {d16}, d12 @ + vmovl.u8 q6, d12 @q6 = uc_Bs in each 16 bit scalar + mov r6, r0 @Keeping a backup of the pointer to chroma U P0 + vld2.8 {d4, d5}, [r0], r1 @Q2=p0 + vmov.i8 d30, #1 @ + vdup.8 q10, r2 @Q10 contains alpha + vld2.8 {d0, d1}, [r0], r1 @Q0=q0 + vmovl.u8 q7, d14 @ + vld2.8 {d2, d3}, [r0] @Q1=q1 + vsubl.u8 q5, d1, d5 @ + vsubl.u8 q4, d0, d4 @Q5,Q4 = (q0 - p0) + vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0) + vshl.i16 q5, q5, #2 @Q5 = (q0 - p0)<<2 + vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0) + vshl.i16 q4, q4, #2 @Q4 = (q0 - p0)<<2 + vsli.16 q7, q7, #8 @ + vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0) + vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha ) + vsubl.u8 q10, d6, d2 @Q10 = (p1 - q1)L + vsubl.u8 q3, d7, d3 @Q3 = (p1 - q1)H + vdup.8 q8, r3 @Q8 contains beta + vadd.i16 q4, q4, q10 @ + vadd.i16 q5, q5, q3 @Q5,Q4 = [ (q0 - p0)<<2 ] + (p1 - q1) + vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta ) + vcgt.s16 d12, d12, #0 @Q6 = (us_Bs > 0) + vqrshrn.s16 d8, q4, #3 @ + vqrshrn.s16 d9, q5, #3 @Q4 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3 + vadd.i8 d14, d14, d30 @Q7 = C = C0+1 + vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta ) + vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) + vabs.s8 q3, q4 @Q4 = ABS (i_macro) + vmov.i8 d15, d14 @ + vmov.i8 d13, d12 @ + vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta ) + vmin.u8 q7, q3, q7 @Q7 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro) + vbic q6, q6, q9 @final condition + vcge.s8 q4, q4, #0 @Q4 = (i_macro >= 0) + vand q7, q7, q6 @Making delta zero in places where values shouldn be filterd + vqadd.u8 q8, q2, q7 @Q8 = p0 + delta + vqsub.u8 q2, q2, q7 @Q2 = p0 - delta + vqadd.u8 q9, q0, q7 @Q9 = q0 + delta + vqsub.u8 q0, q0, q7 @Q0 = q0 - delta + vbif q8, q2, q4 @Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta) + vbif q0, q9, q4 @Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta) + vst2.8 {d16, d17}, [r6], r1 @ + vst2.8 {d0, d1}, [r6] @ + vpop {d8 - d15} + ldmfd sp!, {r4-r6, pc} @ + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block vertical edge for cases where the +@* boundary strength is less than 4 +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. 
+@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @param[in] sp(0) - u4_bs +@* Packed Boundary strength array +@* +@* @param[in] sp(4) - pu1_cliptab +@* tc0_table +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_vert_bslt4_bp_a9 + +ih264_deblk_chroma_vert_bslt4_bp_a9: + + stmfd sp!, {r10-r12, r14} + + sub r0, r0, #4 @point r0 to p1u of row0. + ldr r11, [sp, #16] @r12 = ui_Bs + + ldr r10, [sp, #20] @r14 = puc_ClipTab + mov r12, r0 @keep a back up of r0 for buffer write + vpush {d8 - d15} + vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 + vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1 + vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1 + vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1 + + vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 + vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1 + vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1 + vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1 + + + vdup.8 q11, r2 @Q4 = alpha + vabd.u8 q4, q1, q2 @|p0-q0| + vdup.8 q12, r3 @Q5 = beta + vabd.u8 q5, q3, q2 @|q1-q0| + vabd.u8 q6, q0, q1 @|p1-p0| + vclt.u8 q4, q4, q11 @|p0-q0| < alpha ? + vsubl.u8 q7, d0, d6 + vclt.u8 q5, q5, q12 @|q1-q0| < beta ? + vsubl.u8 q8, d1, d7 @(p1 - q1) + vclt.u8 q6, q6, q12 @|p1-p0| < beta ? + vsubl.u8 q9, d4, d2 + vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta + vsubl.u8 q10, d5, d3 @(q0 - p0) + vmov.u16 q14, #4 + vld1.32 {d24[0]}, [r10] @Load ClipTable + rev r11, r11 @Blocking strengths + vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta + + vmov.32 d10[0], r11 + + vmla.s16 q7, q9, q14 + vmla.s16 q8, q10, q14 @4*(q0 - p0) + (p1 - q1) + + vmovl.u8 q5, d10 + + + vsli.u16 d10, d10, #8 + vmovl.u16 q5, d10 + vsli.u32 q5, q5, #16 + vtbl.8 d12, {d24}, d10 + vtbl.8 d13, {d24}, d11 @tC0 + vmov.u8 q12, #1 + vadd.u8 q6, q6, q12 @tC0 + 1 + vcge.u8 q5, q5, q12 @u4_bS > 0 ? 
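At this point the kernel has the per-lane thresholds and tC = tC0 + 1 in place. For reference, this is the scalar per-pixel logic the bS < 4 chroma path vectorizes, reconstructed from the instruction comments above (a sketch only; the function name and clip_u8() helper are illustrative, not part of the source):

    #include <stdlib.h>

    static unsigned char clip_u8(int x) { return x < 0 ? 0 : (x > 255 ? 255 : x); }

    /* One chroma pixel pair across the edge: p1 p0 | q0 q1 */
    static void chroma_bslt4_px(unsigned char *p0, unsigned char *q0,
                                int p1, int q1, int alpha, int beta,
                                int bs, int tc0)
    {
        if (bs == 0 || abs(*p0 - *q0) >= alpha ||
            abs(q1 - *q0) >= beta || abs(p1 - *p0) >= beta)
            return;                               /* leave the edge untouched */
        int tc    = tc0 + 1;                      /* chroma uses tC0 + 1      */
        int delta = (((*q0 - *p0) << 2) + (p1 - q1) + 4) >> 3;
        if (delta < -tc) delta = -tc;             /* clip delta to [-tC, tC]  */
        if (delta >  tc) delta =  tc;
        int new_p0 = clip_u8(*p0 + delta);
        int new_q0 = clip_u8(*q0 - delta);
        *p0 = (unsigned char)new_p0;
        *q0 = (unsigned char)new_q0;
    }
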
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0 + + @ Q0 - Q3(inputs), + @ Q4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0), + @ Q6 (tC) + + vrshr.s16 q7, q7, #3 + vrshr.s16 q8, q8, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) + + vcgt.s16 q9, q7, #0 + vcgt.s16 q10, q8, #0 + vmovn.i16 d18, q9 + vmovn.i16 d19, q10 @Q9 = sign(delta) + vabs.s16 q7, q7 + vabs.s16 q8, q8 + vmovn.u16 d14, q7 + vmovn.u16 d15, q8 + vmin.u8 q7, q7, q6 @Q7 = |delta| + + vqadd.u8 q10, q1, q7 @p0+|delta| + vqadd.u8 q11, q2, q7 @q0+|delta| + vqsub.u8 q12, q1, q7 @p0-|delta| + vqsub.u8 q13, q2, q7 @q0-|delta| + + vbit q12, q10, q9 @p0 + delta + vbit q11, q13, q9 @q0 - delta + + vbit q1, q12, q4 + vbit q2, q11, q4 + + vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1 + vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1 + vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1 + vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1 + + vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1 + vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1 + vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1 + vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1 + vpop {d8 - d15} + ldmfd sp!, {r10-r12, pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block vertical edge when the +@* boundary strength is set to 4 on calling twice +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_vert_bs4_mbaff_bp_a9 + +ih264_deblk_chroma_vert_bs4_mbaff_bp_a9: + + stmfd sp!, {r12, r14} + vpush {d8 - d15} + sub r0, r0, #4 @point r0 to p1u of row0. + mov r12, r0 @keep a back up of r0 for buffer write + + vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1 + vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1 + vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1 + vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1 + + vdup.8 d11, r2 @D11 = alpha + vdup.8 d12, r3 @D12 = beta + vmov.i8 d31, #2 + + vabd.u8 d4, d1, d2 @|p0-q0| + vabd.u8 d5, d3, d2 @|q1-q0| + vabd.u8 d6, d0, d1 @|p1-p0| + vaddl.u8 q14, d1, d3 @(p0 + q1) + vclt.u8 d4, d4, d11 @|p0-q0| < alpha ? + vclt.u8 d5, d5, d12 @|q1-q0| < beta ? + vclt.u8 d6, d6, d12 @|p1-p0| < beta ? 
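The vmlal/vrshrn sequence that follows implements the bS = 4 chroma smoothing. In scalar form (a sketch taken from the instruction comments; the helper name is illustrative):

    #include <stdlib.h>

    /* One chroma pixel pair across the edge: p1 p0 | q0 q1 */
    static void chroma_bs4_px(unsigned char *p0, unsigned char *q0,
                              int p1, int q1, int alpha, int beta)
    {
        if (abs(*p0 - *q0) < alpha && abs(q1 - *q0) < beta &&
            abs(p1 - *p0) < beta) {
            unsigned char P0 = (unsigned char)((2 * p1 + *p0 + q1 + 2) >> 2);
            unsigned char Q0 = (unsigned char)((2 * q1 + *q0 + p1 + 2) >> 2);
            *p0 = P0;   /* (2*p1 + (p0 + q1) + 2) >> 2, the vrshrn.i16 #2 above */
            *q0 = Q0;   /* (2*q1 + (p1 + q0) + 2) >> 2 */
        }
    }
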
+ vmlal.u8 q14, d0, d31 @2*p1 + (p0 + q1) + vaddl.u8 q13, d0, d2 @(p1 + q0) + vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta + vmlal.u8 q13, d3, d31 @2*q1 + (p1 + q0) + + vrshrn.i16 d7, q14, #2 @(2*p1 + (p0 + q1) + 2) >> 2 + vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta + vrshrn.i16 d9, q13, #2 @(2*q1 + (p1 + q0) + 2) >> 2 + + vbit d1, d7, d4 + vbit d2, d9, d4 + + vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1 + vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1 + vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1 + vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1 + vpop {d8 - d15} + ldmfd sp!, {r12, pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block vertical edge for cases where the +@* boundary strength is less than 4 on calling twice +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @param[in] sp(0) - u4_bs +@* Packed Boundary strength array +@* +@* @param[in] sp(4) - pu1_cliptab +@* tc0_table +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9 + +ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9: + + stmfd sp!, {r10-r12, r14} + + sub r0, r0, #4 @point r0 to p1u of row0. + ldr r11, [sp, #16] @r11 = ui_Bs + + ldr r10, [sp, #20] @r10 = puc_ClipTab + mov r12, r0 @keep a back up of r0 for buffer write + vpush {d8 - d15} + vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1 + vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1 + vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1 + vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1 + + vdup.8 d11, r2 @D11 = alpha + vabd.u8 d4, d1, d2 @|p0-q0| + vdup.8 d12, r3 @D12 = beta + vabd.u8 d5, d3, d2 @|q1-q0| + vabd.u8 d6, d0, d1 @|p1-p0| + vclt.u8 d4, d4, d11 @|p0-q0| < alpha ? + vclt.u8 d5, d5, d12 @|q1-q0| < beta ? + vsubl.u8 q14, d0, d3 @(p1 - q1) + vclt.u8 d6, d6, d12 @|p1-p0| < beta ? + vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta + vsubl.u8 q12, d2, d1 @(q0 - p0) + vmov.u16 q10, #4 + + vld1.32 {d31[0]}, [r10] @Load ClipTable + rev r11, r11 @Blocking strengths + vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta + vmov.32 d22[0], r11 + vmla.s16 q14, q12, q10 @4*(q0 - p0) + (p1 - q1) + vmovl.u8 q11, d22 + vsli.u16 d22, d22, #8 + vtbl.8 d6, {d31}, d22 @tC0 + vmov.u8 d12, #1 + vadd.u8 d6, d6, d12 @tC0 + 1 + vcge.u8 d5, d22, d12 @u4_bS > 0 ? 
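As in the other bslt4 kernels, u4_bs arrives as one packed word of four 8-bit boundary strengths and tC0 comes from a byte-table lookup. A scalar sketch of that unpacking (names are illustrative, and the exact lane ordering is an assumption based on the rev above):

    #include <stdint.h>

    static void unpack_bs_tc(uint32_t u4_bs, const uint8_t cliptab[4],
                             uint8_t tc[4], uint8_t active[4])
    {
        uint32_t w = __builtin_bswap32(u4_bs); /* the 'rev r11' above (GCC/Clang builtin) */
        for (int i = 0; i < 4; i++) {
            uint8_t bs = (uint8_t)(w >> (8 * i)); /* bS for one edge segment; 0..3 here */
            tc[i]      = (uint8_t)(cliptab[bs] + 1); /* vtbl lookup, then tC0 + 1       */
            active[i]  = (uint8_t)(bs > 0);          /* the 'u4_bS > 0 ?' test          */
        }
    }
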
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0 + + @ D0 - D3(inputs), + @ D4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0), + @ D6 (tC) + + vrshr.s16 q14, q14, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) + + vcgt.s16 q13, q14, #0 + vmovn.i16 d9, q13 @D9 = sign(delta) + vabs.s16 q14, q14 + vmovn.u16 d7, q14 + vmin.u8 d7, d7, d6 @D7 = |delta| + + vqadd.u8 d10, d1, d7 @p0+|delta| + vqadd.u8 d11, d2, d7 @q0+|delta| + vqsub.u8 d12, d1, d7 @p0-|delta| + vqsub.u8 d13, d2, d7 @q0-|delta| + + vbit d12, d10, d9 @p0 + delta + vbit d11, d13, d9 @q0 - delta + + vbit d1, d12, d4 + vbit d2, d11, d4 + + vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1 + vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1 + vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1 + vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1 + vpop {d8 - d15} + ldmfd sp!, {r10-r12, pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block horizontal edge when the +@* boundary strength is set to 4 in high profile +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha_cb +@* Alpha Value for the boundary in U +@* +@* @param[in] r3 - beta_cb +@* Beta Value for the boundary in U +@* +@* @param[in] sp(0) - alpha_cr +@* Alpha Value for the boundary in V +@* +@* @param[in] sp(4) - beta_cr +@* Beta Value for the boundary in V +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_horz_bs4_a9 + +ih264_deblk_chroma_horz_bs4_a9: + + stmfd sp!, {r4-r6, lr} @ + + ldr r5, [sp, #16] @R5 = alpha_cr + ldr r6, [sp, #20] @R6 = beta_cr + vpush {d8 - d15} + sub r0, r0, r1, lsl #1 @R0 = uc_edgePixel pointing to p1 of chroma + vld2.8 {d6, d7}, [r0], r1 @D6 = p1u , D7 = p1v + mov r4, r0 @Keeping a backup of the pointer p0 of chroma + vld2.8 {d4, d5}, [r0], r1 @D4 = p0u , D5 = p0v + vdup.8 d20, r2 @D20 contains alpha_cb + vdup.8 d21, r5 @D21 contains alpha_cr + vld2.8 {d0, d1}, [r0], r1 @D0 = q0u , D1 = q0v + vaddl.u8 q4, d6, d0 @ + vaddl.u8 q5, d7, d1 @Q4,Q5 = q0 + p1 + vmov.i8 d31, #2 @ + vld2.8 {d2, d3}, [r0] @D2 = q1u , D3 = q1v + vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0) + vmlal.u8 q4, d2, d31 @ + vmlal.u8 q5, d3, d31 @Q5,Q4 = (X2(q1U) + q0U + p1U) + vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0) + vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0) + vaddl.u8 q7, d4, d2 @ + vaddl.u8 q14, d5, d3 @Q14,Q7 = P0 + Q1 + vdup.8 d16, r3 @D16 contains beta_cb + vdup.8 d17, r6 @D17 contains beta_cr + vmlal.u8 q7, d6, d31 @ + vmlal.u8 q14, d7, d31 @Q14,Q7 = (X2(p1U) + p0U + q1U) + vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha ) + vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta ) + vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta ) + vrshrn.u16 d8, q4, #2 @ + vrshrn.u16 d9, q5, #2 @Q4 = (X2(q1U) + q0U + p1U + 2) >> 2 + vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) + vrshrn.u16 d10, q7, #2 @ + vrshrn.u16 d11, q14, #2 @Q5 = (X2(p1U) + p0U + q1U + 2) >> 2 + vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta ) + vbit q5, q2, q9 @ + vbit q4, q0, q9 @ + vst2.8 {d10, d11}, [r4], r1 @ + vst2.8 {d8, 
d9}, [r4] @ + vpop {d8 - d15} + ldmfd sp!, {r4-r6, pc} @ + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block vertical edge when the +@* boundary strength is set to 4 in high profile +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha_cb +@* Alpha Value for the boundary in U +@* +@* @param[in] r3 - beta_cb +@* Beta Value for the boundary in U +@* +@* @param[in] sp(0) - alpha_cr +@* Alpha Value for the boundary in V +@* +@* @param[in] sp(4) - beta_cr +@* Beta Value for the boundary in V +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_vert_bs4_a9 + +ih264_deblk_chroma_vert_bs4_a9: + + stmfd sp!, {r4, r5, r12, r14} + + sub r0, r0, #4 @point r0 to p1u of row0. + mov r12, r0 @keep a back up of r0 for buffer write + + ldr r4, [sp, #16] @r4 = alpha_cr + ldr r5, [sp, #20] @r5 = beta_cr + add r2, r2, r4, lsl #8 @r2 = (alpha_cr,alpha_cb) + add r3, r3, r5, lsl #8 @r3 = (beta_cr,beta_cb) + vpush {d8 - d15} + vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 + vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1 + vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1 + vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1 + + vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 + vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1 + vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1 + vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1 + + vdup.16 q11, r2 @Q11 = alpha + vdup.16 q12, r3 @Q12 = beta + vmov.i8 d31, #2 + + vabd.u8 q4, q1, q2 @|p0-q0| + vabd.u8 q5, q3, q2 @|q1-q0| + vabd.u8 q6, q0, q1 @|p1-p0| + vaddl.u8 q7, d2, d6 + vaddl.u8 q8, d3, d7 @(p0 + q1) + vclt.u8 q4, q4, q11 @|p0-q0| < alpha ? + vclt.u8 q5, q5, q12 @|q1-q0| < beta ? + vclt.u8 q6, q6, q12 @|p1-p0| < beta ? + vmlal.u8 q7, d0, d31 + vmlal.u8 q8, d1, d31 @2*p1 + (p0 + q1) + vaddl.u8 q9, d0, d4 + vaddl.u8 q10, d1, d5 @(p1 + q0) + vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta + vmlal.u8 q9, d6, d31 + vmlal.u8 q10, d7, d31 @2*q1 + (p1 + q0) + + vrshrn.i16 d14, q7, #2 + vrshrn.i16 d15, q8, #2 @(2*p1 + (p0 + q1) + 2) >> 2 + vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta + vrshrn.i16 d18, q9, #2 + vrshrn.i16 d19, q10, #2 @(2*q1 + (p1 + q0) + 2) >> 2 + + vbit q1, q7, q4 + vbit q2, q9, q4 + + vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1 + vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1 + vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1 + vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1 + + vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1 + vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1 + vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1 + vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1 + vpop {d8 - d15} + ldmfd sp!, {r4, r5, r12, pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block horizontal edge for cases where the +@* boundary strength is less than 4 in high profile +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. 
+@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha_cb +@* Alpha Value for the boundary in U +@* +@* @param[in] r3 - beta_cb +@* Beta Value for the boundary in U +@* +@* @param[in] sp(0) - alpha_cr +@* Alpha Value for the boundary in V +@* +@* @param[in] sp(4) - beta_cr +@* Beta Value for the boundary in V +@* +@* @param[in] sp(8) - u4_bs +@* Packed Boundary strength array +@* +@* @param[in] sp(12) - pu1_cliptab_cb +@* tc0_table for U +@* +@* @param[in] sp(16) - pu1_cliptab_cr +@* tc0_table for V +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_horz_bslt4_a9 + +ih264_deblk_chroma_horz_bslt4_a9: + + stmfd sp!, {r4-r9, lr} @ + + ldrd r4, r5, [sp, #28] @R4 = alpha_cr , R5 = beta_cr + ldr r7, [sp, #36] @R7 = u4_bs + ldrd r8, r9, [sp, #40] @R8 = pu1_cliptab_cb , R9 = pu1_cliptab_cr + sub r0, r0, r1, lsl #1 @R0 = uc_edgePixelU pointing to p1 of chroma U + vpush {d8 - d15} + rev r7, r7 @ + vmov.32 d12[0], r7 @D12[0] = ui_Bs + + vld1.32 d16[0], [r8] @D16[0] contains cliptab_cb + vld1.32 d17[0], [r9] @D17[0] contains cliptab_cr + vld2.8 {d6, d7}, [r0], r1 @Q3=p1 + vtbl.8 d14, {d16}, d12 @Retreiving cliptab values for U + vtbl.8 d28, {d17}, d12 @Retrieving cliptab values for V + vmovl.u8 q6, d12 @Q6 = uc_Bs in each 16 bit scalar + mov r6, r0 @Keeping a backup of the pointer to chroma U P0 + vld2.8 {d4, d5}, [r0], r1 @Q2=p0 + vmov.i8 d30, #1 @ + vdup.8 d20, r2 @D20 contains alpha_cb + vdup.8 d21, r4 @D21 contains alpha_cr + vld2.8 {d0, d1}, [r0], r1 @Q0=q0 + vmovl.u8 q7, d14 @ + vmovl.u8 q14, d28 @ + vmov.i16 d15, d28 @D14 has cliptab values for U, D15 for V + vld2.8 {d2, d3}, [r0] @Q1=q1 + vsubl.u8 q5, d1, d5 @ + vsubl.u8 q4, d0, d4 @Q5,Q4 = (q0 - p0) + vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0) + vshl.i16 q5, q5, #2 @Q5 = (q0 - p0)<<2 + vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0) + vshl.i16 q4, q4, #2 @Q4 = (q0 - p0)<<2 + vsli.16 q7, q7, #8 @ + vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0) + vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha ) + vsubl.u8 q10, d6, d2 @Q10 = (p1 - q1)L + vsubl.u8 q3, d7, d3 @Q3 = (p1 - q1)H + vdup.8 d16, r3 @Q8 contains beta_cb + vdup.8 d17, r5 @Q8 contains beta_cr + vadd.i16 q4, q4, q10 @ + vadd.i16 q5, q5, q3 @Q5,Q4 = [ (q0 - p0)<<2 ] + (p1 - q1) + vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta ) + vcgt.s16 d12, d12, #0 @Q6 = (us_Bs > 0) + vqrshrn.s16 d8, q4, #3 @ + vqrshrn.s16 d9, q5, #3 @Q4 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3 + vadd.i8 d14, d14, d30 @D14 = C = C0+1 for U + vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta ) + vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) + vabs.s8 q3, q4 @Q4 = ABS (i_macro) + vadd.i8 d15, d15, d30 @D15 = C = C0+1 for V + vmov.i8 d13, d12 @ + vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta ) + vmin.u8 q7, q3, q7 @Q7 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro) + vbic q6, q6, q9 @final condition + vcge.s8 q4, q4, #0 @Q4 = (i_macro >= 0) + vand q7, q7, q6 @Making delta zero in places where values shouldn be filterd + vqadd.u8 q8, q2, q7 @Q8 = p0 + delta + vqsub.u8 q2, q2, q7 @Q2 = p0 - delta + vqadd.u8 q9, q0, q7 @Q9 = q0 + delta + vqsub.u8 q0, q0, q7 @Q0 = q0 - delta + vbif q8, q2, q4 @Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta) + vbif q0, q9, q4 @Q0 = (i_macro >= 0 ) ? 
(q0-delta) : (q0+delta) + vst2.8 {d16, d17}, [r6], r1 @ + vst2.8 {d0, d1}, [r6] @ + vpop {d8 - d15} + ldmfd sp!, {r4-r9, pc} @ + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block vertical edge for cases where the +@* boundary strength is less than 4 in high profile +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha_cb +@* Alpha Value for the boundary in U +@* +@* @param[in] r3 - beta_cb +@* Beta Value for the boundary in U +@* +@* @param[in] sp(0) - alpha_cr +@* Alpha Value for the boundary in V +@* +@* @param[in] sp(4) - beta_cr +@* Beta Value for the boundary in V +@* +@* @param[in] sp(8) - u4_bs +@* Packed Boundary strength array +@* +@* @param[in] sp(12) - pu1_cliptab_cb +@* tc0_table for U +@* +@* @param[in] sp(16) - pu1_cliptab_cr +@* tc0_table for V +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_vert_bslt4_a9 + +ih264_deblk_chroma_vert_bslt4_a9: + + stmfd sp!, {r4-r7, r10-r12, r14} + + sub r0, r0, #4 @point r0 to p1u of row0. + ldrd r4, r5, [sp, #32] @R4 = alpha_cr , R5 = beta_cr + add r2, r2, r4, lsl #8 + add r3, r3, r5, lsl #8 + ldr r6, [sp, #40] @R6 = u4_bs + ldrd r10, r11, [sp, #44] @R10 = pu1_cliptab_cb , R11 = pu1_cliptab_cr + vpush {d8 - d15} + mov r12, r0 @keep a back up of R0 for buffer write + + vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 + vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1 + vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1 + vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1 + + vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 + vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1 + vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1 + vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1 + + + vdup.16 q11, r2 @Q11 = alpha + vabd.u8 q4, q1, q2 @|p0-q0| + vdup.16 q12, r3 @Q12 = beta + vabd.u8 q5, q3, q2 @|q1-q0| + vabd.u8 q6, q0, q1 @|p1-p0| + vclt.u8 q4, q4, q11 @|p0-q0| < alpha ? + vsubl.u8 q7, d0, d6 + vclt.u8 q5, q5, q12 @|q1-q0| < beta ? + vsubl.u8 q8, d1, d7 @(p1 - q1) + vclt.u8 q6, q6, q12 @|p1-p0| < beta ? + vsubl.u8 q9, d4, d2 + vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta + vsubl.u8 q10, d5, d3 @(q0 - p0) + vmov.u16 q14, #4 + vld1.32 {d24[0]}, [r10] @Load ClipTable for U + vld1.32 {d25[0]}, [r11] @Load ClipTable for V + rev r6, r6 @Blocking strengths + vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta + + vmov.32 d10[0], r6 + + vmla.s16 q7, q9, q14 + vmla.s16 q8, q10, q14 @4*(q0 - p0) + (p1 - q1) + + vmovl.u8 q5, d10 + vsli.u16 d10, d10, #8 + vtbl.8 d12, {d24}, d10 @tC0 for U + vtbl.8 d13, {d25}, d10 @tC0 for V + vzip.8 d12, d13 + vmovl.u16 q5, d10 + vsli.u32 q5, q5, #16 + vmov.u8 q12, #1 + vadd.u8 q6, q6, q12 @tC0 + 1 + vcge.u8 q5, q5, q12 @u4_bS > 0 ? 
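The high-profile kernels take separate alpha/beta for Cb and Cr; the add r2, r2, r4, lsl #8 at entry packs each pair into one 16-bit value so that, after vdup.16, the interleaved UVUV rows compare against their own component's thresholds. A sketch of that packing (helper name illustrative):

    #include <stdint.h>

    /* Cb in the low byte, Cr in the high byte of each 16-bit lane. */
    static uint16_t pack_uv_threshold(uint8_t val_cb, uint8_t val_cr)
    {
        return (uint16_t)((val_cr << 8) | val_cb);
    }

    /* e.g. the lane value broadcast by 'vdup.16 q11, r2':
     *   alpha_lane = pack_uv_threshold(alpha_cb, alpha_cr); */
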
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0 + + @ Q0 - Q3(inputs), + @ Q4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0), + @ Q6 (tC) + + vrshr.s16 q7, q7, #3 + vrshr.s16 q8, q8, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) + + vcgt.s16 q9, q7, #0 + vcgt.s16 q10, q8, #0 + vmovn.i16 d18, q9 + vmovn.i16 d19, q10 @Q9 = sign(delta) + vabs.s16 q7, q7 + vabs.s16 q8, q8 + vmovn.u16 d14, q7 + vmovn.u16 d15, q8 + vmin.u8 q7, q7, q6 @Q7 = |delta| + + vqadd.u8 q10, q1, q7 @p0+|delta| + vqadd.u8 q11, q2, q7 @q0+|delta| + vqsub.u8 q12, q1, q7 @p0-|delta| + vqsub.u8 q13, q2, q7 @q0-|delta| + + vbit q12, q10, q9 @p0 + delta + vbit q11, q13, q9 @q0 - delta + + vbit q1, q12, q4 + vbit q2, q11, q4 + + vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1 + vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1 + vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1 + vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1 + + vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1 + vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1 + vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1 + vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1 + vpop {d8 - d15} + ldmfd sp!, {r4-r7, r10-r12, pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block vertical edge when the +@* boundary strength is set to 4 on calling twice in high profile +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha_cb +@* Alpha Value for the boundary in U +@* +@* @param[in] r3 - beta_cb +@* Beta Value for the boundary in U +@* +@* @param[in] sp(0) - alpha_cr +@* Alpha Value for the boundary in V +@* +@* @param[in] sp(4) - beta_cr +@* Beta Value for the boundary in V +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_vert_bs4_mbaff_a9 + +ih264_deblk_chroma_vert_bs4_mbaff_a9: + + stmfd sp!, {r4, r5, r12, r14} + + sub r0, r0, #4 @point r0 to p1u of row0. + mov r12, r0 @keep a back up of r0 for buffer write + ldrd r4, r5, [sp, #16] @R4 = alpha_cr , R5 = beta_cr + add r2, r2, r4, lsl #8 + add r3, r3, r5, lsl #8 + vpush {d8 - d15} + vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1 + vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1 + vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1 + vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1 + + vdup.16 d11, r2 @D11 = alpha + vdup.16 d12, r3 @D12 = beta + vmov.i8 d31, #2 + + vabd.u8 d4, d1, d2 @|p0-q0| + vabd.u8 d5, d3, d2 @|q1-q0| + vabd.u8 d6, d0, d1 @|p1-p0| + vaddl.u8 q14, d1, d3 @(p0 + q1) + vclt.u8 d4, d4, d11 @|p0-q0| < alpha ? + vclt.u8 d5, d5, d12 @|q1-q0| < beta ? + vclt.u8 d6, d6, d12 @|p1-p0| < beta ? 
+ vmlal.u8 q14, d0, d31 @2*p1 + (p0 + q1) + vaddl.u8 q13, d0, d2 @(p1 + q0) + vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta + vmlal.u8 q13, d3, d31 @2*q1 + (p1 + q0) + + vrshrn.i16 d7, q14, #2 @(2*p1 + (p0 + q1) + 2) >> 2 + vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta + vrshrn.i16 d9, q13, #2 @(2*q1 + (p1 + q0) + 2) >> 2 + + vbit d1, d7, d4 + vbit d2, d9, d4 + + vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1 + vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1 + vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1 + vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1 + vpop {d8 - d15} + ldmfd sp!, {r4, r5, r12, pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block vertical edge for cases where the +@* boundary strength is less than 4 on calling twice in high profile +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha_cb +@* Alpha Value for the boundary in U +@* +@* @param[in] r3 - beta_cb +@* Beta Value for the boundary in U +@* +@* @param[in] sp(0) - alpha_cr +@* Alpha Value for the boundary in V +@* +@* @param[in] sp(4) - beta_cr +@* Beta Value for the boundary in V +@* +@* @param[in] sp(8) - u4_bs +@* Packed Boundary strength array +@* +@* @param[in] sp(12) - pu1_cliptab_cb +@* tc0_table for U +@* +@* @param[in] sp(16) - pu1_cliptab_cr +@* tc0_table for V +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_vert_bslt4_mbaff_a9 + +ih264_deblk_chroma_vert_bslt4_mbaff_a9: + + stmfd sp!, {r4-r6, r10-r12, r14} + + sub r0, r0, #4 @point r0 to p1u of row0. + mov r12, r0 @keep a back up of r0 for buffer write + + ldrd r4, r5, [sp, #28] @R4 = alpha_cr , R5 = beta_cr + add r2, r2, r4, lsl #8 + add r3, r3, r5, lsl #8 + ldr r6, [sp, #36] @R6 = u4_bs + ldrd r10, r11, [sp, #40] @R10 = pu1_cliptab_cb , R11 = pu1_cliptab_cr + vpush {d8 - d15} + vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1 + vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1 + vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1 + vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1 + + vdup.16 d11, r2 @D11 = alpha + vabd.u8 d4, d1, d2 @|p0-q0| + vdup.16 d12, r3 @D12 = beta + vabd.u8 d5, d3, d2 @|q1-q0| + vabd.u8 d6, d0, d1 @|p1-p0| + vclt.u8 d4, d4, d11 @|p0-q0| < alpha ? + vclt.u8 d5, d5, d12 @|q1-q0| < beta ? + vsubl.u8 q14, d0, d3 @(p1 - q1) + vclt.u8 d6, d6, d12 @|p1-p0| < beta ? + vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta + vsubl.u8 q12, d2, d1 @(q0 - p0) + vmov.u16 q10, #4 + + vld1.32 {d31[1]}, [r10] @Load ClipTable for U + vld1.32 {d31[0]}, [r11] @Load ClipTable for V + rev r6, r6 @Blocking strengths + vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta + vmov.32 d22[0], r6 + vmla.s16 q14, q12, q10 @4*(q0 - p0) + (p1 - q1) + vmovl.u8 q11, d22 + vsli.u16 d22, d22, #8 + vmov.u16 d13, #4 + vadd.u8 d22, d22, d13 + vtbl.8 d6, {d31}, d22 @tC0 + vmov.u8 d12, #1 + vsub.u8 d22, d22, d13 + vadd.u8 d6, d6, d12 @tC0 + 1 + vcge.u8 d5, d22, d12 @u4_bS > 0 ? 
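This mbaff variant folds both 4-entry tC0 tables into a single vtbl source register: Cr at bytes 0..3 (d31[0]) and Cb at bytes 4..7 (d31[1]). The vmov.u16 d13, #4 / vadd.u8 pair adds 4 only to the even (U) bytes of the index vector, steering each component to its own table in one lookup. A scalar sketch of the same addressing (assumes U occupies the even bytes of the interleaved rows):

    #include <stdint.h>
    #include <string.h>

    static void lookup_tc0_uv(const uint8_t cliptab_cb[4],
                              const uint8_t cliptab_cr[4],
                              uint8_t bs, uint8_t *tc0_u, uint8_t *tc0_v)
    {
        uint8_t tab[8];
        memcpy(tab,     cliptab_cr, 4);   /* d31[0]: V table at bytes 0..3 */
        memcpy(tab + 4, cliptab_cb, 4);   /* d31[1]: U table at bytes 4..7 */
        *tc0_u = tab[bs + 4];             /* U index biased by +4 ({4,0,4,0,...}) */
        *tc0_v = tab[bs];                 /* V index unchanged */
    }
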
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0 + + @ D0 - D3(inputs), + @ D4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0), + @ D6 (tC) + + vrshr.s16 q14, q14, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) + + vcgt.s16 q13, q14, #0 + vmovn.i16 d9, q13 @D9 = sign(delta) + vabs.s16 q14, q14 + vmovn.u16 d7, q14 + vmin.u8 d7, d7, d6 @D7 = |delta| + + vqadd.u8 d10, d1, d7 @p0+|delta| + vqadd.u8 d11, d2, d7 @q0+|delta| + vqsub.u8 d12, d1, d7 @p0-|delta| + vqsub.u8 d13, d2, d7 @q0-|delta| + + vbit d12, d10, d9 @p0 + delta + vbit d11, d13, d9 @q0 - delta + + vbit d1, d12, d4 + vbit d2, d11, d4 + + vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1 + vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1 + vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1 + vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1 + vpop {d8 - d15} + ldmfd sp!, {r4-r6, r10-r12, pc} + + + diff --git a/common/arm/ih264_deblk_luma_a9.s b/common/arm/ih264_deblk_luma_a9.s new file mode 100755 index 0000000..3e6a4d9 --- /dev/null +++ b/common/arm/ih264_deblk_luma_a9.s @@ -0,0 +1,1092 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/*****************************************************************************/ +@/* */ +@/* File Name : ih264_deblk_luma_a9.s */ +@/* */ +@/* Description : Contains function definitions for deblocking luma */ +@/* edge. Functions are coded in NEON assembly and can */ +@/* be compiled using ARM RVDS. */ +@/* */ +@/* List of Functions : ih264_deblk_luma_vert_bs4_a9() */ +@/* ih264_deblk_luma_vert_bslt4_a9() */ +@/* ih264_deblk_luma_horz_bs4_a9() */ +@/* ih264_deblk_luma_horz_bslt4_a9() */ +@/* ih264_deblk_luma_vert_bs4_mbaff_a9() */ +@/* ih264_deblk_luma_vert_bslt4_mbaff_a9() */ +@/* */ +@/* Issues / Problems : None */ +@/* */ +@/* Revision History : */ +@/* */ +@/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +@/* 28 11 2013 Ittiam Draft */ +@/* 05 01 2015 Kaushik Added double-call functions for */ +@/* Senthoor vertical deblocking. */ +@/* */ +@/*****************************************************************************/ + + +.text +.p2align 2 + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a luma block horizontal edge for cases where the +@* boundary strength is less than 4 +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. 
+@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @param[in] sp(0) - u4_bs +@* Packed Boundary strength array +@* +@* @param[in] sp(4) - pu1_cliptab +@* tc0_table +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_luma_horz_bslt4_a9 + +ih264_deblk_luma_horz_bslt4_a9: + + stmfd sp!, {r4-r7, lr} + + ldrd r4, r5, [sp, #0x14] @r4 = ui_Bs , r5 = *puc_ClpTab + vpush {d8 - d15} + sub r0, r0, r1, lsl #1 @R1 = uc_Horizonpad + sub r0, r0, r1 @r0 pointer to p2 + rev r4, r4 @ + vld1.8 {q5}, [r0], r1 @p2 values are loaded into q5 + vmov.32 d12[0], r4 @d12[0] = ui_Bs + mov r6, r0 @keeping backup of pointer to p1 + vld1.8 {q4}, [r0], r1 @p1 values are loaded into q4 + mov r7, r0 @keeping backup of pointer to p0 + vld1.8 {q3}, [r0], r1 @p0 values are loaded into q3 + vmovl.u8 q6, d12 @q6 = uc_Bs in each 16 bt scalar + vld1.8 {q0}, [r0], r1 @q0 values are loaded into q0 + vabd.u8 q13, q4, q3 @Q13 = ABS(p1 - p0) + vld1.8 {q1}, [r0], r1 @q1 values are loaded into q1 + vabd.u8 q11, q3, q0 @Q11 = ABS(p0 - q0) + vld1.32 d16[0], [r5] @D16[0] contains cliptab + vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0) + vld1.8 {q2}, [r0], r1 @q2 values are loaded into q2 + vtbl.8 d14, {d16}, d12 @ + vdup.8 q10, r2 @Q10 contains alpha + vdup.8 q8, r3 @Q8 contains beta + vmovl.u16 q6, d12 @ + vmovl.u16 q7, d14 @ + vabd.u8 q14, q5, q3 @Q14 = Ap = ABS(p2 - p0) + vabd.u8 q15, q2, q0 @Q15 = Aq = ABS(q2 - q0) + vcgt.s32 q6, q6, #0 @Q6 = (us_Bs > 0) + vsli.32 q7, q7, #8 @ + vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha ) + vcge.u8 q12, q12, q8 @Q12=( ABS(q1 - q0) >= Beta ) + vcge.u8 q13, q13, q8 @Q13=( ABS(p1 - p0) >= Beta ) + vcgt.u8 q10, q8, q14 @Q10=(Ap<Beta) + vcgt.u8 q11, q8, q15 @Q11=(Aq<Beta) + vsli.32 q7, q7, #16 @Q7 = C0 + vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) + vsubl.u8 q15, d1, d7 @ + vsubl.u8 q12, d0, d6 @Q15,Q12 = (q0 - p0) + vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta ) + vsubl.u8 q14, d8, d2 @Q14 = (p1 - q1)L + vshl.i16 q13, q15, #2 @Q13 = (q0 - p0)<<2 + vshl.i16 q12, q12, #2 @Q12 = (q0 - p0)<<2 + vsubl.u8 q15, d9, d3 @Q15 = (p1 - q1)H + vbic q6, q6, q9 @final condition + vadd.i16 q12, q12, q14 @ + vadd.i16 q13, q13, q15 @Q13,Q12 = [ (q0 - p0)<<2 ] + (p1 - q1) + vsub.i8 q9, q7, q10 @Q9 = C0 + (Ap < Beta) + vrhadd.u8 q8, q3, q0 @Q8 = ((p0+q0+1) >> 1) + vqrshrn.s16 d24, q12, #3 @ + vqrshrn.s16 d25, q13, #3 @Q12 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3 + vsub.i8 q9, q9, q11 @Q9 = C0 + (Ap < Beta) + (Aq < Beta) + vand.i8 q10, q10, q6 @ + vand.i8 q11, q11, q6 @ + vabs.s8 q13, q12 @Q13 = ABS (i_macro) + vaddl.u8 q14, d17, d11 @ + vaddl.u8 q5, d16, d10 @Q14,Q5 = p2 + (p0+q0+1)>>1 + vaddl.u8 q15, d17, d5 @ + vmin.u8 q9, q13, q9 @Q9 = delta = (ABS(i_macro) > C) ? 
C : ABS(i_macro) + vshll.u8 q13, d9, #1 @ + vaddl.u8 q2, d16, d4 @Q15,Q2 = q2 + (p0+q0+1)>>1 + vshll.u8 q8, d8, #1 @Q13,Q8 = (p1<<1) + vand q9, q9, q6 @Making delta zero in places where values shouldn be filterd + vsub.i16 q14, q14, q13 @Q14,Q5 = [p2 + (p0+q0+1)>>1] - (p1<<1) + vsub.i16 q5, q5, q8 @ + vshll.u8 q8, d2, #1 @ + vshll.u8 q13, d3, #1 @Q13,Q8 = (q1<<1) + vqshrn.s16 d29, q14, #1 @ + vqshrn.s16 d28, q5, #1 @Q14 = i_macro_p1 + vsub.i16 q2, q2, q8 @ + vsub.i16 q15, q15, q13 @Q15,Q2 = [q2 + (p0+q0+1)>>1] - (q1<<1) + vneg.s8 q13, q7 @Q13 = -C0 + vmin.s8 q14, q14, q7 @Q14 = min(C0,i_macro_p1) + vcge.s8 q12, q12, #0 @Q12 = (i_macro >= 0) + vqshrn.s16 d31, q15, #1 @ + vqshrn.s16 d30, q2, #1 @Q15 = i_macro_q1 + vmax.s8 q14, q14, q13 @Q14 = max( - C0 , min(C0, i_macro_p1) ) + vqadd.u8 q8, q3, q9 @Q8 = p0 + delta + vqsub.u8 q3, q3, q9 @Q3 = p0 - delta + vmin.s8 q15, q15, q7 @Q15 = min(C0,i_macro_q1) + vand.i8 q14, q10, q14 @condition check Ap<beta + vqadd.u8 q7, q0, q9 @Q7 = q0 + delta + vqsub.u8 q0, q0, q9 @Q0 = q0 - delta + vmax.s8 q15, q15, q13 @Q15 = max( - C0 , min(C0, i_macro_q1) ) + vbif q8, q3, q12 @Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta) + vbif q0, q7, q12 @Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta) + vadd.i8 q14, q14, q4 @ + vand.i8 q15, q11, q15 @condition check Aq<beta + vst1.8 {q8}, [r7], r1 @writting back filtered value of p0 + vadd.i8 q15, q15, q1 @ + vst1.8 {q0}, [r7], r1 @writting back filtered value of q0 + vst1.8 {q14}, [r6] @writting back filtered value of p1 + vst1.8 {q15}, [r7], r1 @writting back filtered value of q1 + vpop {d8 - d15} + ldmfd sp!, {r4-r7, pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a luma block horizontal edge when the +@* boundary strength is set to 4 +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. 
+@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_luma_horz_bs4_a9 + +ih264_deblk_luma_horz_bs4_a9: + + @ Back up necessary registers on stack + stmfd sp!, {r12, r14} + vpush {d8 - d15} + @ Init + vdup.8 q0, r2 @duplicate alpha + sub r12, r0, r1 @pointer to p0 = q0 - src_strd + vdup.8 q1, r3 @duplicate beta + sub r14, r0, r1, lsl#1 @pointer to p1 = q0 - src_strd*2 + sub r2, r0, r1, lsl#2 @pointer to p3 = q0 - src_strd*4 + sub r3, r14, r1 @pointer to p2 = p1 - src_strd + + @ Load Data + vld1.8 {d4, d5}, [r0], r1 @load q0 to Q2, q0 = q0 + src_strd + vld1.8 {d6, d7}, [r12] @load p0 to Q3 + vld1.8 {d8, d9}, [r0], r1 @load q1 to Q4, q0 = q0 + src_strd + vld1.8 {d10, d11}, [r14] @load p1 to Q5 + + @ Filter Decision + vabd.u8 q6, q2, q3 @ABS(p0 - q0) + vabd.u8 q7, q4, q2 @ABS(q1 - q0) + vabd.u8 q8, q5, q3 @ABS(p1 - p0) + vcge.u8 q9, q6, q0 @ABS(p0 - q0) >= Alpha + vcge.u8 q7, q7, q1 @ABS(q1 - q0) >= Beta + vcge.u8 q8, q8, q1 @ABS(p1 - p0) >= Beta + vmov.i8 q10, #2 + vorr q9, q9, q7 @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta + vld1.8 {d14, d15}, [r0], r1 @load q2 to Q7, q0 = q0 + src_strd + vorr q9, q9, q8 @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta + vsra.u8 q10, q0, #2 @((Alpha >> 2) + 2) + vabd.u8 q11, q7, q2 @Aq = ABS(q2 - q0) + vaddl.u8 q12, d4, d6 @p0+q0 L + vaddl.u8 q13, d5, d7 @p0+q0 H + vclt.u8 q11, q11, q1 @Aq < Beta + vclt.u8 q10, q6, q10 @(ABS(p0 - q0) <((Alpha >>2) + 2)) + + @ Deblock Filtering q0', q1', q2' + vaddw.u8 q14, q12, d8 @p0+q0+q1 L + vaddw.u8 q15, q13, d9 @p0+q0+q1 H + vand q11, q11, q10 @(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) + @ q0' if (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) TRUE + vadd.i16 q8, q14, q14 @2*(p0+q0+q1)L + vadd.i16 q0, q15, q15 @2*(p0+q0+q1)H + vaddw.u8 q8, q8, d14 @2*(p0+q0+q1)+q2 L + vaddw.u8 q0, q0, d15 @2*(p0+q0+q1)+q2 H + vaddw.u8 q8, q8, d10 @2*(p0+q0+q1)+q2 +p1 L + vaddw.u8 q0, q0, d11 @2*(p0+q0+q1)+q2 +p1 H + vrshrn.u16 d12, q8, #3 @(2*(p0+q0+q1)+q2 +p1 +4)>> 3 L [q0'] + vrshrn.u16 d13, q0, #3 @(2*(p0+q0+q1)+q2 +p1 +4)>> 3 H [q0'] + @ q0" if (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) FALSE + vaddl.u8 q8, d8, d8 @2*q1 L + vaddl.u8 q0, d9, d9 @2*q1 H + vaddw.u8 q8, q8, d4 @2*q1+q0 L + vaddw.u8 q0, q0, d5 @2*q1+q0 H + vaddw.u8 q8, q8, d10 @2*q1+q0+p1 L + vaddw.u8 q0, q0, d11 @2*q1+q0+p1 H + vrshrn.u16 d16, q8, #2 @(2*q1+q0+p1+2)>>2 L [q0"] + vrshrn.u16 d17, q0, #2 @(2*q1+q0+p1+2)>>2 H [q0"] + @ q1' + vaddw.u8 q14, q14, d14 @p0+q0+q1+q2 L + vaddw.u8 q15, q15, d15 @p0+q0+q1+q2 H + vld1.8 {q0}, [r0], r1 @load q3 to Q0, q0 = q0 + src_strd + vbit q8, q6, q11 @choosing between q0' and q0" depending on condn + sub r0, r0, r1, lsl #2 @pointer to q0 + vbic q11, q11, q9 @((ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)) + @ && (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) + vrshrn.u16 d12, q14, #2 @(p0+q0+q1+q2+2)>>2 L [q1'] + vrshrn.u16 d13, q15, #2 @(p0+q0+q1+q2+2)>>2 H [q1'] + vbif q2, q8, q9 @choose q0 or filtered q0 + @ q2' + vaddl.u8 q8, d14, d0 @q2+q3,L + vaddl.u8 q0, d15, d1 @q2+q3,H + vadd.i16 q14, q14, q8 @p0+q0+q1+2*q2+q3 L + vst1.8 {d4, d5}, [r0], r1 @store q0 + vadd.i16 q15, q15, q0 @p0+q0+q1+2*q2+q3 H + vadd.i16 q14, q14, q8 
@p0+q0+q1+3*q2+2*q3 L + vadd.i16 q15, q15, q0 @p0+q0+q1+3*q2+2*q3 H + vrshrn.u16 d0, q14, #3 @(p0+q0+q1+3*q2+2*q3+4)>>3 L [q2'] + vrshrn.u16 d1, q15, #3 @(p0+q0+q1+3*q2+2*q3+4)>>3 H [q2'] + vld1.8 {d30, d31}, [r3] @load p2 to Q15 + vbif q6, q4, q11 @choose q1 or filtered value of q1 + + vabd.u8 q8, q15, q3 @Ap,ABS(p2 - p0) + vaddw.u8 q12, q12, d10 @p0+q0+p1 L + vbif q0, q7, q11 @choose q2 or filtered q2 + vaddw.u8 q13, q13, d11 @p0+q0+p1 H + vst1.8 {d12, d13}, [r0], r1 @store q1 + vclt.u8 q8, q8, q1 @Ap < Beta + vadd.i16 q14, q12, q12 @2*(p0+q0+p1) L + vadd.i16 q2, q13, q13 @2*(p0+q0+p1) H + vst1.8 {d0, d1}, [r0], r1 @store q2 + vand q10, q10, q8 @((Ap < Beta) && (ABS(p0 - q0) <((Alpha >>2) + 2))) + vaddw.u8 q14, q14, d30 @2*(p0+q0+p1)+p2 l + vaddw.u8 q2, q2, d31 @2*(p0+q0+p1)+p2 H + vaddw.u8 q14, q14, d8 @2*(p0+q0+p1)+p2+q1 L + vaddw.u8 q2, q2, d9 @2*(p0+q0+p1)+p2+q1 H + vrshrn.u16 d28, q14, #3 @(2*(p0+q0+p1)+p2+q1+4)>>3 L,p0' + vrshrn.u16 d29, q2, #3 @(2*(p0+q0+p1)+p2+q1+4)>>3 H,p0' + vmov.i8 d0, #2 + vmov.i16 d1, #2 + vaddl.u8 q1, d6, d8 @p0+q1 L + vmlal.u8 q1, d10, d0 @2*p1+p0+q1 L + vaddl.u8 q8, d7, d9 @p0+q1 H + vmlal.u8 q8, d11, d0 @2*p1+p0+q1 H + vaddw.u8 q6, q12, d30 @(p0+q0+p1) +p2 L + vld1.8 {d24, d25}, [r2] @load p3,Q12 + vaddw.u8 q2, q13, d31 @(p0+q0+p1) +p2 H + vaddl.u8 q4, d30, d24 @p2+p3 L + vrshrn.u16 d26, q6, #2 @((p0+q0+p1)+p2 +2)>>2,p1' L + vrshrn.u16 d2, q1, #2 @(2*p1+p0+q1+2)>>2,p0"L + vrshrn.u16 d27, q2, #2 @((p0+q0+p1)+p2 +2)>>2,p1' H + vrshrn.u16 d3, q8, #2 @(2*p1+p0+q1+2)>>2,p0" H + vaddl.u8 q8, d31, d25 @p2+p3 H + vmla.u16 q6, q4, d1[0] @(p0+q0+p1)+3*p2+2*p3 L + vmla.u16 q2, q8, d1[0] @(p0+q0+p1)+3*p2+2*p3 H + vbic q8, q10, q9 @((ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)) + @&& (Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) + vbit q1, q14, q10 @choosing between po' and p0" + vrshrn.u16 d12, q6, #3 @((p0+q0+p1)+3*p2+2*p3+4)>>3 L p2' + vrshrn.u16 d13, q2, #3 @((p0+q0+p1)+3*p2+2*p3+4)>>3 H p2' + vbif q3, q1, q9 @choosing between p0 and filtered value of p0 + vbit q5, q13, q8 @choosing between p1 and p1' + vbit q15, q6, q8 @choosing between p2 and p2' + vst1.8 {d6, d7}, [r12] @store p0 + vst1.8 {d10, d11}, [r14] @store p1 + vst1.8 {d30, d31}, [r3] @store p2 + vpop {d8 - d15} + ldmfd sp!, {r12, pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a luma block vertical edge for cases where the +@* boundary strength is less than 4 +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. 
+@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @param[in] sp(0) - u4_bs +@* Packed Boundary strength array +@* +@* @param[in] sp(4) - pu1_cliptab +@* tc0_table +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_luma_vert_bslt4_a9 + +ih264_deblk_luma_vert_bslt4_a9: + + stmfd sp!, {r12, lr} + + sub r0, r0, #4 @pointer uc_edgePixel-4 + ldr r12, [sp, #8] @r12 = ui_Bs + ldr r14, [sp, #12] @r14 = *puc_ClpTab + vpush {d8 - d15} + @loading p3:p2:p1:p0:q0:q1:q2:q3 for every row + vld1.8 {d0}, [r0], r1 @row1 + vld1.8 d2, [r0], r1 @row2 + vld1.8 d4, [r0], r1 @row3 + rev r12, r12 @reversing ui_bs + vld1.8 d6, [r0], r1 @row4 + vmov.32 d18[0], r12 @d12[0] = ui_Bs + vld1.32 d16[0], [r14] @D16[0] contains cliptab + vld1.8 d8, [r0], r1 @row5 + vmovl.u8 q9, d18 @q6 = uc_Bs in each 16 bt scalar + vld1.8 d10, [r0], r1 @row6 + vld1.8 d12, [r0], r1 @row7 + vtbl.8 d16, {d16}, d18 @puc_ClipTab[uc_Bs] + vld1.8 d14, [r0], r1 @row8 + vld1.8 d1, [r0], r1 @row9 + vmovl.u16 q8, d16 @ + vld1.8 d3, [r0], r1 @row10 + vld1.8 d5, [r0], r1 @row11 + vld1.8 d7, [r0], r1 @row12 + vsli.32 q8, q8, #8 @ + vld1.8 d9, [r0], r1 @row13 + vld1.8 d11, [r0], r1 @row14 + vld1.8 d13, [r0], r1 @row15 + vsli.32 q8, q8, #16 @Q8 = C0 + vld1.8 d15, [r0], r1 @row16 + + @taking two 8x8 transposes + @2X2 transposes + vtrn.8 d0, d2 @row1 &2 + vtrn.8 d4, d6 @row3&row4 + vtrn.8 d8, d10 @row5&6 + vtrn.8 d12, d14 @row7 & 8 + vtrn.8 d1, d3 @row9 &10 + vtrn.8 d5, d7 @row11 & 12 + vtrn.8 d9, d11 @row13 &14 + vtrn.8 d13, d15 @row15 & 16 + @4x4 transposes + vtrn.16 d2, d6 @row2 & row4 + vtrn.16 d10, d14 @row6 & row8 + vtrn.16 d3, d7 @row10 & 12 + vtrn.16 d11, d15 @row14 & row16 + vtrn.32 d6, d14 @row4 & 8 + vtrn.32 d7, d15 @row 12 & 16 + + @now Q3 ->p0 and Q7->q3 + vtrn.16 d0, d4 @row1 & 3 + vtrn.16 d8, d12 @row 5 & 7 + vtrn.16 d1, d5 @row9 & row11 + vtrn.16 d9, d13 @row13 & row15 + vtrn.32 d0, d8 @row1 & row5 + vtrn.32 d1, d9 @row9 & 13 + + @now Q0->p3 & Q4->q0 + @starting processing as p0 and q0 are now ready + vtrn.32 d2, d10 @row2 &6 + vrhadd.u8 q10, q3, q4 @((p0 + q0 + 1) >> 1) + vtrn.32 d3, d11 @row10&row14 + vmov.i8 d19, #2 + @now Q1->p2 & Q5->q1 + vtrn.32 d4, d12 @row3 & 7 + vabd.u8 q11, q3, q4 @ABS(p0 - q0) + vtrn.32 d5, d13 @row11 & row15 + vaddl.u8 q12, d20, d2 @(p2 + ((p0 + q0 + 1) >> 1) L + @now Q2->p1,Q6->q2 + vaddl.u8 q13, d21, d3 @(p2 + ((p0 + q0 + 1) >> 1) H + vmlsl.u8 q12, d4, d19 @(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) L + vmlsl.u8 q13, d5, d19 @(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) H + vdup.8 q14, r2 @alpha + vcle.u8 q11, q14, q11 @ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0)) + vdup.i8 q14, r3 @beta + vabd.u8 q15, q5, q4 @ABS(q1 - q0) + vqshrn.s16 d24, q12, #1 @((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) L + vqshrn.s16 d25 , q13, #1 @((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) H + vcge.u8 q15, q15, q14 @ABS(q1 - q0) >= Beta + vabd.u8 q13, q2, q3 @ABS(p1 - p0) + vmin.s8 q12, q12, q8 @min(deltap1 ,C0) + vorr q11, q11, q15 @ABS(q1 - q0) >= Beta ||ABS(p0 - q0) >= Alpha + vneg.s8 q15, q8 @-C0 + vcge.u8 q13, q13, q14 @ABS(p1 - p0) >= Beta + vmax.s8 q12, q12, q15 @max(deltap1,-C0) + vorr q11, q11, q13 @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta) + vmovl.u16 q13, d18 @ui_bs + vaddl.u8 q9, d20, d12 
@q2 + ((p0 + q0 + 1) >> 1) L
+ vceq.u32 q13, q13, #0 @ui_bs == 0
+ vsubw.u8 q9, q9, d10 @(q2 + ((p0 + q0 + 1) >> 1) - q1) L
+ vaddl.u8 q10, d21, d13 @q2 + ((p0 + q0 + 1) >> 1) H
+ vsubw.u8 q9, q9, d10 @(q2 + ((p0 + q0 + 1) >> 1) - 2*q1) L
+ vsubw.u8 q10, q10, d11 @(q2 + ((p0 + q0 + 1) >> 1) - q1) H
+ vorr q13, q13, q11 @(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)) && (ui_bs)
+ vsubw.u8 q10, q10, d11 @(q2 + ((p0 + q0 + 1) >> 1) - 2*q1) H
+ vqshrn.s16 d18, q9, #1 @((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) L
+ vabd.u8 q11, q1, q3 @Ap = ABS(p2 - p0)
+ vqshrn.s16 d19, q10, #1 @((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) H
+ vabd.u8 q10, q6, q4 @Aq = ABS(q2 - q0)
+ vclt.u8 q11, q11, q14 @Ap < Beta
+ vmin.s8 q9, q9, q8 @min(deltaq1,C0)
+ vclt.u8 q10, q10, q14 @Aq < Beta
+ vsubl.u8 q14, d8, d6 @(q0 - p0) L
+ vmax.s8 q9, q9, q15 @max(deltaq1,-C0)
+ vsubl.u8 q15, d9, d7 @(q0 - p0) H
+ vshl.s16 q14, q14, #2 @(q0 - p0)<<2 L
+ vsub.u8 q8, q8, q11 @C0 + (Ap < Beta)
+ vshl.s16 q15, q15, #2 @((q0 - p0) << 2) H
+ vaddw.u8 q14, q14, d4 @((q0 - p0) << 2) + (p1 L
+ vaddw.u8 q15, q15, d5 @((q0 - p0) << 2) + (p1 H
+ vsubw.u8 q14, q14, d10 @((q0 - p0) << 2) + (p1 - q1) L
+ vsubw.u8 q15, q15, d11 @((q0 - p0) << 2) + (p1 - q1) H
+ vbic q11, q11, q13 @final condition for p1
+ vrshrn.s16 d28, q14, #3 @delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3); L
+ vrshrn.s16 d29, q15, #3 @delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) H
+ vsub.u8 q8, q8, q10 @C0 + (Ap < Beta) + (Aq < Beta)
+ vbic q10, q10, q13 @final condition for q1
+ vabs.s8 q15, q14 @abs(delta)
+ vand q12, q12, q11 @deltap1
+ vand q9, q9, q10 @deltaq1
+ vmin.u8 q15, q15, q8 @min(abs(delta), C0)
+ vadd.i8 q2, q2, q12 @p1+deltap1
+ vadd.i8 q5, q5, q9 @q1+deltaq1
+ vbic q15, q15, q13 @abs(delta) of pixels to be changed only
+ vcge.s8 q14, q14, #0 @sign(delta)
+ vqsub.u8 q11, q3, q15 @clip(p0-delta)
+ vtrn.8 d0, d2 @row1 &2
+ vqadd.u8 q3, q3, q15 @clip(p0+delta)
+ vtrn.8 d1, d3 @row9 &10
+ vqadd.u8 q12, q4, q15 @clip(q0+delta)
+ vtrn.8 d12, d14 @row7 & 8
+ vqsub.u8 q4, q4, q15 @clip(q0-delta)
+ vtrn.8 d13, d15 @row15 & 16
+ vbif q3, q11, q14 @p0
+ vbif q4, q12, q14 @q0
+ vtrn.8 d4, d6 @row3&row4
+ vtrn.8 d8, d10 @row5&6
+ vtrn.8 d5, d7 @row11 & 12
+ vtrn.8 d9, d11 @row13 &14
+ vtrn.16 d2, d6 @row2 & row4
+ vtrn.16 d10, d14 @row6 & row8
+ vtrn.16 d3, d7 @row10 & 12
+ vtrn.16 d11, d15 @row14 & row16
+ vtrn.32 d6, d14 @row4 & 8
+ vtrn.32 d7, d15 @row 12 & 16
+ @now Q3 ->p0 and Q7->q3
+ vtrn.16 d0, d4 @row1 & 3
+ vtrn.16 d8, d12 @row 5 & 7
+ vtrn.16 d1, d5 @row9 & row11
+ vtrn.16 d9, d13 @row13 & row15
+ sub r0, r0, r1, lsl#4 @restore pointer
+ vtrn.32 d0, d8 @row1 & row5
+ vtrn.32 d1, d9 @row9 & 13
+ vtrn.32 d2, d10 @row2 &6
+ vtrn.32 d3, d11 @row10&row14
+ vtrn.32 d4, d12 @row3 & 7
+ vtrn.32 d5, d13 @row11 & row15
+ vst1.8 {d0}, [r0], r1 @row1
+ vst1.8 d2, [r0], r1 @row2
+ vst1.8 d4, [r0], r1 @row3
+ vst1.8 d6, [r0], r1 @row4
+ vst1.8 d8, [r0], r1 @row5
+ vst1.8 d10, [r0], r1 @row6
+ vst1.8 d12, [r0], r1 @row7
+ vst1.8 d14, [r0], r1 @row8
+ vst1.8 d1, [r0], r1 @row9
+ vst1.8 d3, [r0], r1 @row10
+ vst1.8 d5, [r0], r1 @row11
+ vst1.8 d7, [r0], r1 @row12
+ vst1.8 d9, [r0], r1 @row13
+ vst1.8 d11, [r0], r1 @row14
+ vst1.8 d13, [r0], r1 @row15
+ vst1.8 d15, [r0], r1 @row16
+ vpop {d8 - d15}
+ ldmfd sp!, {r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a luma block vertical edge when the
+@* boundary strength is set to 4
+@*
+@* @par 
Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_luma_vert_bs4_a9 + +ih264_deblk_luma_vert_bs4_a9: + + stmfd sp!, {r12, lr} + vpush {d8 - d15} + sub r0, r0, #4 @pointer uc_edgePixel-4 + @loading p3:p2:p1:p0:q0:q1:q2:q3 for every row + vld1.8 d0, [r0], r1 @row1 + vld1.8 d2, [r0], r1 @row2 + vld1.8 d4, [r0], r1 @row3 + vld1.8 d6, [r0], r1 @row4 + vld1.8 d8, [r0], r1 @row5 + vld1.8 d10, [r0], r1 @row6 + vld1.8 d12, [r0], r1 @row7 + vld1.8 d14, [r0], r1 @row8 + vld1.8 d1, [r0], r1 @row9 + vld1.8 d3, [r0], r1 @row10 + vld1.8 d5, [r0], r1 @row11 + vld1.8 d7, [r0], r1 @row12 + vld1.8 d9, [r0], r1 @row13 + vld1.8 d11, [r0], r1 @row14 + vld1.8 d13, [r0], r1 @row15 + vld1.8 d15, [r0], r1 @row16 + @taking two 8x8 transposes + @2X2 transposes + vtrn.8 d0, d2 @row1 &2 + vtrn.8 d4, d6 @row3&row4 + vtrn.8 d8, d10 @row5&6 + vtrn.8 d12, d14 @row7 & 8 + vtrn.8 d1, d3 @row9 &10 + vtrn.8 d5, d7 @row11 & 12 + vtrn.8 d9, d11 @row13 &14 + vtrn.8 d13, d15 @row15 & 16 + @4x4 transposes + vtrn.16 d2, d6 @row2 & row4 + vtrn.16 d10, d14 @row6 & row8 + vtrn.16 d3, d7 @row10 & 12 + vtrn.16 d11, d15 @row14 & row16 + vtrn.32 d6, d14 @row4 & 8 + vtrn.32 d7, d15 @row 12 & 16 + @now Q3 ->p0 and Q7->q3 + vtrn.16 d0, d4 @row1 & 3 + vtrn.16 d8, d12 @row 5 & 7 + vtrn.16 d1, d5 @row9 & row11 + vtrn.16 d9, d13 @row13 & row15 + vtrn.32 d0, d8 @row1 & row5 + vtrn.32 d1, d9 @row9 & 13 + @now Q0->p3 & Q4->q0 + @starting processing as p0 and q0 are now ready + @now Q1->p2 & Q5->q1 + vpush {q7} @saving in stack + vtrn.32 d4, d12 @row3 & 7 + vmov.i16 q14, #2 + vtrn.32 d5, d13 @row11 & row15 + vaddl.u8 q8, d6, d8 @p0+q0 L + vtrn.32 d2, d10 @row2 &6 + vaddl.u8 q9, d7, d9 @p0+q0 H + vtrn.32 d3, d11 @row10&row14 + vaddw.u8 q10, q8, d4 @p0+q0+p1 L + vaddw.u8 q11, q9, d5 @p0+q0+p1 H + vaddl.u8 q12, d2, d10 @p2+q1 L + vaddl.u8 q13, d3, d11 @p2+q1 H + vmla.u16 q12, q10, q14 @p2 + X2(p1) + X2(p0) + X2(q0) + q1 L + vmla.u16 q13, q11, q14 @p2 + X2(p1) + X2(p0) + X2(q0) + q1 H + vmov.i8 q14, #2 + vaddw.u8 q8, q10, d2 @p0+q0+p1+p2 L + vaddw.u8 q9, q11, d3 @p0+q0+p1+p2 H + vdup.i8 q15, r2 @duplicate alpha + vrshrn.u16 d20, q8, #2 @(p2 + p1 + p0 + q0 + 2) >> 2)L p1' + vrshrn.u16 d21, q9, #2 @(p2 + p1 + p0 + q0 + 2) >> 2)H p1' + vabd.u8 q11, q3, q4 @ABD(p0-q0) + vsra.u8 q14, q15, #2 @alpha >>2 +2 + vabd.u8 q15, q1, q3 @Ap = ABD(p2-p0) + vrshrn.u16 d24, q12, #3 @((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) L p0' + vrshrn.u16 d25, q13, #3 @((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) H p0' + vdup.i8 q13, r3 @beta + vcgt.u8 q14, q14, q11 @ABS(p0 - q0) <((Alpha >>2) + 2) + vaddl.u8 q11, d6, d10 @p0+q1 L + vcgt.u8 q7, q13, q15 @beta>Ap + vaddl.u8 q15, d7, d11 @p0+q1 H + vaddw.u8 q11, q11, d4 @p0+q1+p1 L + vaddw.u8 q15, q15, d5 @p0+q1+p1 H + vaddw.u8 q11, q11, d4 @p0+q1+2*p1 L + vaddw.u8 q15, q15, d5 @p0+q1+2*p1 H + vand q7, q7, q14 @(Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2) + vrshrn.u16 d22, q11, #2 @((X2(p1) + p0 + q1 + 2) >> 2) L p0" + vrshrn.u16 d23, q15, #2 @((X2(p1) + p0 + q1 + 2) >> 2) H p0" + vaddl.u8 q15, d2, d0 @p2+p3 L + vbif q12, q11, q7 
@p0' or p0 " + vaddl.u8 q11, d3, d1 @p2+p3 H + vadd.u16 q15, q15, q15 @2*(p2+p3) L + vadd.u16 q11, q11, q11 @2*(p2+p3)H + vadd.u16 q8, q8, q15 @(X2(p3) + X3(p2) + p1 + p0 + q0) L + vadd.u16 q9, q9, q11 @(X2(p3) + X3(p2) + p1 + p0 + q0) H + vabd.u8 q15, q6, q4 @Aq = abs(q2-q0) + vabd.u8 q11, q5, q4 @ABS(Q1-Q0) + vrshrn.u16 d16, q8, #3 @((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); L p2' + vrshrn.u16 d17, q9, #3 @((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); H p2' + vabd.u8 q9, q2, q3 @ABS(p1-p0) + vcgt.u8 q15, q13, q15 @Aq < Beta + vcge.u8 q11, q11, q13 @ABS(q1 - q0) >= Beta + vcge.u8 q9, q9, q13 @ABS(p1 - p0) >= beta + vdup.i8 q13, r2 @duplicate alpha + vand q15, q15, q14 @(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) + vabd.u8 q14, q3, q4 @abs(p0-q0) + vorr q11, q11, q9 @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta + vaddl.u8 q9, d6, d8 @p0+q0 L + vcge.u8 q14, q14, q13 @ABS(p0 - q0) >= Alpha + vaddl.u8 q13, d7, d9 @p0+q0 H + vaddw.u8 q9, q9, d10 @p0+q0+q1 L + vorr q11, q11, q14 @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta||ABS(p0 - q0) >= Alpha + vaddw.u8 q13, q13, d11 @p0+q0+q1 H + vbic q7, q7, q11 @final condn for p's + vmov.i8 q14, #2 + vbif q3, q12, q11 @final p0 + vbit q1, q8, q7 @final p2 + vbif q10, q2, q7 @final p1 + vaddl.u8 q12, d8, d4 @q0+p1 L + vmlal.u8 q12, d10, d28 @X2(q1) + q0 + p1 L + vaddl.u8 q8, d9, d5 @q0+p1 H + vmlal.u8 q8, d11, d28 @X2(q1) + q0 + p1 H + vmov.i16 q14, #2 + vaddl.u8 q7, d4, d12 @p1+q2 L + vmla.u16 q7, q9, q14 @p1 + X2(p0) + X2(q0) + X2(q1) + q2L + vaddl.u8 q2, d5, d13 @p1+q2H + vmla.u16 q2, q13, q14 @p1 + X2(p0) + X2(q0) + X2(q1) + q2H + vrshrn.u16 d24, q12, #2 @(X2(q1) + q0 + p1 + 2) >> 2; L q0' + vrshrn.u16 d25, q8, #2 @(X2(q1) + q0 + p1 + 2) >> 2; H q0' + vaddw.u8 q9, q9, d12 @p0 + q0 + q1 + q2 L + vaddw.u8 q13, q13, d13 @p0 + q0 + q1 + q2 H + vrshrn.u16 d16, q7, #3 @(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 L qo" + vpop {q7} + vrshrn.u16 d17, q2, #3 @(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 H qo" + vrshrn.u16 d4, q9, #2 @p0 + q0 + q1 + q2 + 2)>>2 L q1' + vrshrn.u16 d5, q13, #2 @p0 + q0 + q1 + q2 + 2)>>2 H q1' + vbit q12, q8, q15 @q0' or q0" + vbic q15, q15, q11 @final condn for q's + vtrn.8 d0, d2 @row1 &2 + vbit q5, q2, q15 @final q1 + vtrn.8 d1, d3 @row9 &10 + vaddl.u8 q8, d12, d14 @q2+q3 L + vtrn.8 d20, d6 @row3&row4 + vaddl.u8 q2, d13, d15 @q2+q3 H + vtrn.8 d21, d7 @row11 & 12 + vmla.u16 q9, q8, q14 @X2(q3) + X3(q2) + q1 + q0 + p0 L + vtrn.16 d2, d6 @row2 & row4 + vmla.u16 q13, q2, q14 @X2(q3) + X3(q2) + q1 + q0 + p0 H + vtrn.16 d3, d7 @row10 & 12 + vbif q4, q12, q11 @final q0 + vtrn.16 d0, d20 @row1 & 3 + vrshrn.u16 d18, q9, #3 @(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; L + vtrn.16 d1, d21 @row9 & row11 + vrshrn.u16 d19, q13, #3 @(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; H + vtrn.8 d8, d10 @row5&6 + vbit q6, q9, q15 @final q2 + vtrn.8 d9, d11 @row13 &14 + vtrn.8 d12, d14 @row7 & 8 + vtrn.8 d13, d15 @row15 & 16 + vtrn.16 d10, d14 @row6 & row8 + vtrn.16 d11, d15 @row14 & row16 + @now Q3 ->p0 and Q7->q3 + vtrn.16 d8, d12 @row 5 & 7 + vtrn.16 d9, d13 @row13 & row15 + sub r0, r0, r1, lsl#4 @restore pointer + vtrn.32 d6, d14 @row4 & 8 + vtrn.32 d7, d15 @row 12 & 16 + vtrn.32 d0, d8 @row1 & row5 + vtrn.32 d1, d9 @row9 & 13 + vtrn.32 d2, d10 @row2 &6 + vtrn.32 d3, d11 @row10&row14 + vtrn.32 d20, d12 @row3 & 7 + vtrn.32 d21, d13 @row11 & row15 + vst1.8 d0, [r0], r1 @row1 + vst1.8 d2, [r0], r1 @row2 + vst1.8 d20, [r0], r1 @row3 + vst1.8 d6, [r0], r1 @row4 + vst1.8 d8, [r0], r1 @row5 + vst1.8 d10, [r0], r1 @row6 + vst1.8 d12, [r0], r1 @row7 + 
vst1.8 d14, [r0], r1 @row8 + vst1.8 d1, [r0], r1 @row9 + vst1.8 d3, [r0], r1 @row10 + vst1.8 d21, [r0], r1 @row11 + vst1.8 d7, [r0], r1 @row12 + vst1.8 d9, [r0], r1 @row13 + vst1.8 d11, [r0], r1 @row14 + vst1.8 d13, [r0], r1 @row15 + vst1.8 d15, [r0], r1 @row16 + vpop {d8 - d15} + ldmfd sp!, {r12, pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a luma block vertical edge when the +@* boundary strength is set to 4 on calling twice +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_luma_vert_bs4_mbaff_a9 + +ih264_deblk_luma_vert_bs4_mbaff_a9: + + stmfd sp!, {lr} + + sub r0, r0, #4 @pointer uc_edgePixel-4 + vpush {d8 - d15} + @loading [p3:p2],[p1:p0]:[q0:q1]:[q2:q3] for every row + vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 + vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1 + vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1 + vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1 + vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 + vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1 + vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1 + vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1 + + vuzp.8 d0, d1 @D0->p3, D1->p2 + vuzp.8 d2, d3 @D2->p1, D3->p0 + vuzp.8 d4, d5 @D4->q0, D5->q1 + vuzp.8 d6, d7 @D6->q2, D7->q3 + + vmov.i16 q14, #2 + vaddl.u8 q4, d3, d4 @p0+q0 + vaddw.u8 q5, q4, d2 @p0+q0+p1 + vaddl.u8 q6, d1, d5 @p2+q1 + vmla.u16 q6, q5, q14 @p2 + X2(p1) + X2(p0) + X2(q0) + q1 + + vmov.i8 d14, #2 + vaddw.u8 q4, q5, d1 @p0+q0+p1+p2 + vdup.i8 d15, r2 @duplicate alpha + vrshrn.u16 d10, q4, #2 @(p2 + p1 + p0 + q0 + 2) >> 2) p1' + vabd.u8 d11, d3, d4 @ABD(p0-q0) + vsra.u8 d14, d15, #2 @alpha >>2 +2 + vabd.u8 d15, d1, d3 @Ap = ABD(p2-p0) + vrshrn.u16 d12, q6, #3 @((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) p0' + vdup.i8 d13, r3 @beta + vcgt.u8 d14, d14, d11 @ABS(p0 - q0) <((Alpha >>2) + 2) + vaddl.u8 q8, d3, d5 @p0+q1 + vcgt.u8 d26, d13, d15 @beta>Ap + vaddw.u8 q8, q8, d2 @p0+q1+p1 + vaddw.u8 q8, q8, d2 @p0+q1+2*p1 + vand d26, d26, d14 @(Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2) + vrshrn.u16 d11, q8, #2 @((X2(p1) + p0 + q1 + 2) >> 2) p0" + vbif d12, d11, d26 @p0' or p0 " + vaddl.u8 q9, d1, d0 @p2+p3 + vadd.u16 q9, q9, q9 @2*(p2+p3) + vadd.u16 q4, q4, q9 @(X2(p3) + X3(p2) + p1 + p0 + q0) + vabd.u8 d15, d6, d4 @Aq = abs(q2-q0) + vabd.u8 d11, d5, d4 @ABS(q1-q0) + vrshrn.u16 d8, q4, #3 @((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); p2' + vabd.u8 d9, d2, d3 @ABS(p1-p0) + vcgt.u8 d15, d13, d15 @Aq < Beta + vcge.u8 d11, d11, d13 @ABS(q1 - q0) >= Beta + vcge.u8 d9, d9, d13 @ABS(p1 - p0) >= beta + vdup.i8 d13, r2 @duplicate alpha + vand d15, d15, d14 @(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) + vabd.u8 d14, d3, d4 @abs(p0-q0) + vorr d11, d11, d9 @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta + vcge.u8 d14, d14, d13 @ABS(p0 - q0) >= Alpha + vaddl.u8 q10, d3, d4 @p0+q0 + vorr d11, d11, d14 @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta||ABS(p0 - q0) >= Alpha + vaddw.u8 q10, q10, d5 @p0+q0+q1 + vbic d26, d26, d11 @final 
condn for p's
+ vmov.i8 d14, #2
+ vbif d3, d12, d11 @final p0
+ vbit d1, d8, d26 @final p2
+ vbif d10, d2, d26 @final p1
+ vaddl.u8 q6, d4, d2 @q0+p1
+ vmlal.u8 q6, d5, d14 @X2(q1) + q0 + p1
+
+ vaddl.u8 q11, d2, d6 @p1+q2
+ vmla.u16 q11, q10, q14 @p1 + X2(p0) + X2(q0) + X2(q1) + q2
+ vrshrn.u16 d12, q6, #2 @(X2(q1) + q0 + p1 + 2) >> 2; q0'
+ vaddw.u8 q10, q10, d6 @p0 + q0 + q1 + q2
+ vrshrn.u16 d8, q11, #3 @(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 q0"
+
+ vrshrn.u16 d2, q10, #2 @(p0 + q0 + q1 + q2 + 2)>>2 q1'
+ vbit d12, d8, d15 @q0' or q0"
+ vbic d15, d15, d11 @final condn for q's
+ vbit d5, d2, d15 @final q1
+ vaddl.u8 q12, d6, d7 @q2+q3
+ vmla.u16 q10, q12, q14 @X2(q3) + X3(q2) + q1 + q0 + p0
+ vbif d4, d12, d11 @final q0
+ vrshrn.u16 d9, q10, #3 @(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3;
+ vbit d6, d9, d15 @final q2
+ vand d2, d10, d10 @D0->p3, D1->p2, D2->p1, D3->p0, D4->q0, D5->q1, D6->q2, D7->q3
+
+ vzip.8 d0, d1 @D0,D1 -> [p3:p2]
+ vzip.8 d2, d3 @D2,D3 -> [p1:p0]
+ vzip.8 d4, d5 @D4,D5 -> [q0:q1]
+ vzip.8 d6, d7 @D6,D7 -> [q2:q3]
+
+ sub r0, r0, r1, lsl#3 @restore pointer
+
+ @storing [p3:p2],[p1:p0]:[q0:q1]:[q2:q3] in every row
+ vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+ vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+ vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+ vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a luma block vertical edge for cases where the
+@* boundary strength is less than 4, invoked twice (for MBAFF)
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges for bS less than 4" in ITU T Rec H.264.
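Both bS < 4 routines receive u4_bs as four packed boundary-strength bytes and fetch the clip threshold C0 from pu1_cliptab; the rev/vtbl.8 pair in the assembly is the vector form of that lookup. A scalar sketch follows; the MSB-first segment order is inferred from the rev in the assembly, and the helper name is invented.

#include <stdint.h>

typedef unsigned char UWORD8;
typedef uint32_t UWORD32;

/* Unpack four per-segment bS values (assumed MSB = first 4-row segment,
 * hence the rev before unpacking in the assembly) and fetch tc0 for each. */
static void unpack_bs_and_tc0(UWORD32 u4_bs, const UWORD8 *pu1_cliptab,
                              UWORD8 bs[4], UWORD8 tc0[4])
{
    for (int i = 0; i < 4; i++) {
        bs[i]  = (UWORD8)((u4_bs >> (8 * (3 - i))) & 0xFF);
        tc0[i] = pu1_cliptab[bs[i]];  /* clip threshold C0 for this segment */
    }
}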
+@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @param[in] sp(0) - u4_bs +@* Packed Boundary strength array +@* +@* @param[in] sp(4) - pu1_cliptab +@* tc0_table +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_luma_vert_bslt4_mbaff_a9 + +ih264_deblk_luma_vert_bslt4_mbaff_a9: + + stmfd sp!, {r12, lr} + + sub r0, r0, #4 @pointer uc_edgePixel-4 + ldr r12, [sp, #8] @r12 = ui_Bs + ldr r14, [sp, #12] @r14 = pu1_ClipTab + vpush {d8 - d15} + @loading [p3:p2],[p1:p0]:[q0:q1]:[q2:q3] for every row + vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 + vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1 + vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1 + vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1 + vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 + vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1 + vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1 + vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1 + + vuzp.8 d0, d1 @D0->p3, D1->p2 + vuzp.8 d2, d3 @D2->p1, D3->p0 + vuzp.8 d4, d5 @D4->q0, D5->q1 + vuzp.8 d6, d7 @D6->q2, D7->q3 + + rev r12, r12 @reversing ui_bs + vmov.32 d8[0], r12 @D8[0] = ui_Bs + vld1.32 d9[0], [r14] @D9[0] contains cliptab + vmovl.u8 q15, d8 @D30 = ui_Bs in each 16 bt scalar + vtbl.8 d8, {d9}, d30 @puc_ClipTab[ui_Bs] + vsli.16 d8, d8, #8 @D8 = C0 + + vrhadd.u8 d10, d3, d4 @((p0 + q0 + 1) >> 1) + vmov.i8 d31, #2 + vabd.u8 d11, d3, d4 @ABS(p0 - q0) + vaddl.u8 q6, d10, d1 @(p2 + ((p0 + q0 + 1) >> 1) + vmlsl.u8 q6, d2, d31 @(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) + vdup.8 d14, r2 @alpha + vcle.u8 d11, d14, d11 @ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0)) + vdup.i8 d14, r3 @beta + vabd.u8 d15, d5, d4 @ABS(q1 - q0) + vqshrn.s16 d12, q6, #1 @((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) + vcge.u8 d15, d15, d14 @ABS(q1 - q0) >= Beta + vabd.u8 d13, d2, d3 @ABS(p1 - p0) + vmin.s8 d12, d12, d8 @min(deltap1 ,C0) + vorr d11, d11, d15 @ABS(q1 - q0) >= Beta ||ABS(p0 - q0) >= Alpha + vneg.s8 d15, d8 @-C0 + vcge.u8 d13, d13, d14 @ABS(p1 - p0) >= Beta + vmax.s8 d12, d12, d15 @max(deltap1,-C0) + vorr d11, d11, d13 @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta) + vceq.u16 d13, d30, #0 @ui_bs == 0 + vaddl.u8 q14, d10, d6 @q2 + ((p0 + q0 + 1) >> 1) + vsubw.u8 q14, q14, d5 @q2 + ((p0 + q0 + 1) >> 1) - q1 + vsubw.u8 q14, q14, d5 @q2 + ((p0 + q0 + 1) >> 1) - 2*q1 + vorr d13, d13, d11 @(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)) + @|| (ui_bs == 0) + vqshrn.s16 d9, q14, #1 @(q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1 + vabd.u8 d11, d1, d3 @Ap = ABS(p2 - p0) + vabd.u8 d10, d6, d4 @Aq= ABS(q2 - q0) + vclt.u8 d11, d11, d14 @Ap < Beta + vmin.s8 d9, d9, d8 @min(deltaq1,C0) + vclt.u8 d10, d10, d14 @Aq < Beta + vmax.s8 d9, d9, d15 @max(deltaq1,-C0) + vsubl.u8 q7, d4, d3 @q0 - p0 + vshl.s16 q7, q7, #2 @(q0 - p0) << 2 + vsub.u8 d8, d8, d11 @C0 + (Ap < Beta) + vaddw.u8 q7, q7, d2 @((q0 - p0) << 2) + p1 + vsubw.u8 q7, q7, d5 @((q0 - p0) << 2) + (p1 - q1) + vbic d11, d11, d13 @final condition for p1 + vrshr.s16 q15, q7, #3 @delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 + vsub.u8 d8, d8, d10 @C0 + (Ap < Beta) + (Aq < Beta) + vbic d10, d10, d13 @final condition for q1 + vabs.s16 q14, q15 + vmovn.i16 d15, q14 @abs(delta) + vand d12, d12, d11 @delatp1 + vand d9, d9, d10 
@deltaq1 + vmin.u8 d15, d15, d8 @min((abs(delta),C) + vadd.i8 d2, d2, d12 @p1+deltap1 + vadd.i8 d5, d5, d9 @q1+deltaq1 + vbic d15, d15, d13 @abs(delta) of pixels to be changed only + vcge.s16 q14, q15, #0 + vmovn.i16 d14, q14 @sign(delta) + vqsub.u8 d11, d3, d15 @clip(p0-delta) + vqadd.u8 d3, d3, d15 @clip(p0+delta) + vqadd.u8 d12, d4, d15 @clip(q0+delta) + vqsub.u8 d4, d4, d15 @clip(q0-delta) + vbif d3, d11, d14 @p0 + vbif d4, d12, d14 @q0 + + sub r0, r0, r1, lsl#3 @restore pointer + @D0->p3, D1->p2, D2->p1, D3->p0, D4->q0, D5->q1, D6->q2, D7->q3 + vzip.8 d0, d1 @D0,D1 -> [p3:p2] + vzip.8 d2, d3 @D2,D3 -> [p1:p0] + vzip.8 d4, d5 @D4,D5 -> [q0:q1] + vzip.8 d6, d7 @D6,D7 -> [q2:q3] + + @storing [p3:p2],[p1:p0]:[q0:q1]:[q2:q3] in every row + vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 + vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1 + vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1 + vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1 + vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 + vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1 + vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1 + vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1 + vpop {d8 - d15} + ldmfd sp!, {r12, pc} + + + diff --git a/common/arm/ih264_default_weighted_pred_a9q.s b/common/arm/ih264_default_weighted_pred_a9q.s new file mode 100755 index 0000000..94cda46 --- /dev/null +++ b/common/arm/ih264_default_weighted_pred_a9q.s @@ -0,0 +1,359 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_default_weighted_pred_a9q.s +@* +@* @brief +@* Contains function definitions for default weighted prediction. +@* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT +@* +@* @author +@* Kaushik Senthoor R +@* +@* @par List of Functions: +@* +@* - ih264_default_weighted_pred_luma_a9q() +@* - ih264_default_weighted_pred_chroma_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@******************************************************************************* +@* @function +@* ih264_default_weighted_pred_luma_a9q() +@* +@* @brief +@* This routine performs the default weighted prediction as described in sec +@* 8.4.2.3.1 titled "Default weighted sample prediction process" for luma. +@* +@* @par Description: +@* This function gets two ht x wd blocks, calculates their rounded-average and +@* stores it in the destination block. +@* +@* @param[in] pu1_src1: +@* UWORD8 Pointer to the buffer containing the first input block. +@* +@* @param[in] pu1_src2: +@* UWORD8 Pointer to the buffer containing the second input block. 
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd1
+@* Stride of the first input buffer
+@*
+@* @param[in] src_strd2
+@* Stride of the second input buffer
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
+@*
+@*******************************************************************************
+@*/
+@void ih264_default_weighted_pred_luma_a9q(UWORD8 *pu1_src1,
+@ UWORD8 *pu1_src2,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd1,
+@ WORD32 src_strd2,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src1
+@ r1 => pu1_src2
+@ r2 => pu1_dst
+@ r3 => src_strd1
+@ [sp] => src_strd2 (r4)
+@ [sp+4] => dst_strd (r5)
+@ [sp+8] => ht (r6)
+@ [sp+12] => wd (r7)
+@
+.text
+.p2align 2
+
+ .global ih264_default_weighted_pred_luma_a9q
+
+ih264_default_weighted_pred_luma_a9q:
+
+ stmfd sp!, {r4-r7, r14} @stack stores the values of the arguments
+ ldr r7, [sp, #32] @Load wd
+ ldr r4, [sp, #20] @Load src_strd2
+ ldr r5, [sp, #24] @Load dst_strd
+ cmp r7, #16
+ ldr r6, [sp, #28] @Load ht
+ vpush {d8-d15}
+ beq loop_16 @branch if wd is 16
+ cmp r7, #8
+ beq loop_8 @branch if wd is 8
+
+loop_4: @each iteration processes four rows
+
+ vld1.32 d0[0], [r0], r3 @load row 1 in source 1
+ vld1.32 d0[1], [r0], r3 @load row 2 in source 1
+ vld1.32 d2[0], [r1], r4 @load row 1 in source 2
+ vld1.32 d2[1], [r1], r4 @load row 2 in source 2
+
+ vld1.32 d1[0], [r0], r3 @load row 3 in source 1
+ vld1.32 d1[1], [r0], r3 @load row 4 in source 1
+ vrhadd.u8 d0, d0, d2
+ vld1.32 d3[0], [r1], r4 @load row 3 in source 2
+ vld1.32 d3[1], [r1], r4 @load row 4 in source 2
+
+ subs r6, r6, #4 @decrement ht by 4
+ vst1.32 d0[0], [r2], r5 @store row 1 to destination
+ vst1.32 d0[1], [r2], r5 @store row 2 to destination
+ vrhadd.u8 d1, d1, d3
+ vst1.32 d1[0], [r2], r5 @store row 3 to destination
+ vst1.32 d1[1], [r2], r5 @store row 4 to destination
+
+ bgt loop_4 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_8: @each iteration processes four rows
+
+ vld1.8 d0, [r0], r3 @load row 1 in source 1
+ vld1.8 d4, [r1], r4 @load row 1 in source 2
+ vld1.8 d1, [r0], r3 @load row 2 in source 1
+ vld1.8 d5, [r1], r4 @load row 2 in source 2
+ vld1.8 d2, [r0], r3 @load row 3 in source 1
+ vrhadd.u8 q0, q0, q2
+ vld1.8 d6, [r1], r4 @load row 3 in source 2
+ vld1.8 d3, [r0], r3 @load row 4 in source 1
+ vrhadd.u8 d2, d2, d6
+ vld1.8 d7, [r1], r4 @load row 4 in source 2
+
+ subs r6, r6, #4 @decrement ht by 4
+ vst1.8 d0, [r2], r5 @store row 1 to destination
+ vrhadd.u8 d3, d3, d7
+ vst1.8 d1, [r2], r5 @store row 2 to destination
+ vst1.8 d2, [r2], r5 @store row 3 to destination
+ vst1.8 d3, [r2], r5 @store row 4 to destination
+
+ bgt loop_8 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_16: @each iteration processes eight rows
+
+ vld1.8 {q0}, [r0], r3 @load row 1 in source 1
+ vld1.8 {q8}, [r1], r4 @load row 1 in source 2
+ vld1.8 {q1}, [r0], r3 @load row 2 in source 1
+ vld1.8 {q9}, [r1], r4 @load row 2 in source 2
+ vrhadd.u8 q0, q0, q8
+ vld1.8 {q2}, [r0], r3 @load row 3 in source 1
+ vld1.8 {q10}, [r1], r4 @load row 3 in source 2
+ vrhadd.u8 q1, q1, q9
+ vld1.8 {q3}, [r0], r3 @load row 4 in source 1
+ vld1.8 {q11}, [r1], r4
@load row 4 in source 2
+ vrhadd.u8 q2, q2, q10
+ vld1.8 {q4}, [r0], r3 @load row 5 in source 1
+ vld1.8 {q12}, [r1], r4 @load row 5 in source 2
+ vrhadd.u8 q3, q3, q11
+ vld1.8 {q5}, [r0], r3 @load row 6 in source 1
+ vld1.8 {q13}, [r1], r4 @load row 6 in source 2
+ vrhadd.u8 q4, q4, q12
+ vld1.8 {q6}, [r0], r3 @load row 7 in source 1
+ vld1.8 {q14}, [r1], r4 @load row 7 in source 2
+ vrhadd.u8 q5, q5, q13
+ vld1.8 {q7}, [r0], r3 @load row 8 in source 1
+ vld1.8 {q15}, [r1], r4 @load row 8 in source 2
+
+ vrhadd.u8 q6, q6, q14
+ vst1.8 {q0}, [r2], r5 @store row 1 to destination
+ vst1.8 {q1}, [r2], r5 @store row 2 to destination
+ vrhadd.u8 q7, q7, q15
+ vst1.8 {q2}, [r2], r5 @store row 3 to destination
+ vst1.8 {q3}, [r2], r5 @store row 4 to destination
+ subs r6, r6, #8 @decrement ht by 8
+ vst1.8 {q4}, [r2], r5 @store row 5 to destination
+ vst1.8 {q5}, [r2], r5 @store row 6 to destination
+ vst1.8 {q6}, [r2], r5 @store row 7 to destination
+ vst1.8 {q7}, [r2], r5 @store row 8 to destination
+
+ bgt loop_16 @if greater than 0 repeat the loop again
+
+end_loops:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r7, r15} @Reload the registers from sp
+
+
+@*******************************************************************************
+@* @function
+@* ih264_default_weighted_pred_chroma_a9q()
+@*
+@* @brief
+@* This routine performs the default weighted prediction as described in sec
+@* 8.4.2.3.1 titled "Default weighted sample prediction process" for chroma.
+@*
+@* @par Description:
+@* This function gets two ht x wd blocks, calculates their rounded-average and
+@* stores it in the destination block for U and V.
+@*
+@* @param[in] pu1_src1:
+@* UWORD8 Pointer to the buffer containing the first input block.
+@*
+@* @param[in] pu1_src2:
+@* UWORD8 Pointer to the buffer containing the second input block.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd1
+@* Stride of the first input buffer
+@*
+@* @param[in] src_strd2
+@* Stride of the second input buffer
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
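The luma routine above and the chroma routine below compute the same thing; a scalar sketch with an invented function name is given here. wd is taken as the number of bytes actually written per row (for chroma that is twice the per-component width, since U and V are interleaved).

typedef unsigned char UWORD8;
typedef int WORD32;

/* Per-pixel rounded average of two prediction blocks; the vrhadd.u8
 * instructions in the assembly are the 8/16-lane form of this expression. */
static void default_weighted_pred(const UWORD8 *pu1_src1, const UWORD8 *pu1_src2,
                                  UWORD8 *pu1_dst, WORD32 src_strd1,
                                  WORD32 src_strd2, WORD32 dst_strd,
                                  WORD32 ht, WORD32 wd)
{
    for (WORD32 y = 0; y < ht; y++) {
        for (WORD32 x = 0; x < wd; x++)
            pu1_dst[x] = (UWORD8)((pu1_src1[x] + pu1_src2[x] + 1) >> 1);
        pu1_src1 += src_strd1;
        pu1_src2 += src_strd2;
        pu1_dst  += dst_strd;
    }
}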
+@*
+@*******************************************************************************
+@*/
+@void ih264_default_weighted_pred_chroma_a9q(UWORD8 *pu1_src1,
+@ UWORD8 *pu1_src2,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd1,
+@ WORD32 src_strd2,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src1
+@ r1 => pu1_src2
+@ r2 => pu1_dst
+@ r3 => src_strd1
+@ [sp] => src_strd2 (r4)
+@ [sp+4] => dst_strd (r5)
+@ [sp+8] => ht (r6)
+@ [sp+12] => wd (r7)
+@
+
+
+ .global ih264_default_weighted_pred_chroma_a9q
+
+ih264_default_weighted_pred_chroma_a9q:
+
+ stmfd sp!, {r4-r7, r14} @stack stores the values of the arguments
+ ldr r7, [sp, #32] @Load wd
+ ldr r4, [sp, #20] @Load src_strd2
+ ldr r5, [sp, #24] @Load dst_strd
+ cmp r7, #8
+ ldr r6, [sp, #28] @Load ht
+ vpush {d8-d15}
+ beq loop_8_uv @branch if wd is 8
+ cmp r7, #4
+ beq loop_4_uv @branch if wd is 4
+
+loop_2_uv: @each iteration processes two rows
+
+ vld1.32 d0[0], [r0], r3 @load row 1 in source 1
+ vld1.32 d0[1], [r0], r3 @load row 2 in source 1
+
+ vld1.32 d1[0], [r1], r4 @load row 1 in source 2
+ vld1.32 d1[1], [r1], r4 @load row 2 in source 2
+
+ vrhadd.u8 d0, d0, d1
+
+ subs r6, r6, #2 @decrement ht by 2
+ vst1.32 d0[0], [r2], r5 @store row 1 to destination
+ vst1.32 d0[1], [r2], r5 @store row 2 to destination
+
+ bgt loop_2_uv @if greater than 0 repeat the loop again
+
+ b end_loops_uv
+
+loop_4_uv: @each iteration processes two rows
+
+ vld1.8 d0, [r0], r3 @load row 1 in source 1
+ vld1.8 d2, [r1], r4 @load row 1 in source 2
+ vld1.8 d1, [r0], r3 @load row 2 in source 1
+ vrhadd.u8 d0, d0, d2
+ vld1.8 d3, [r1], r4 @load row 2 in source 2
+
+ vrhadd.u8 d1, d1, d3
+ vst1.8 d0, [r2], r5 @store row 1 to destination
+ subs r6, r6, #2 @decrement ht by 2
+ vst1.8 d1, [r2], r5 @store row 2 to destination
+
+ bgt loop_4_uv @if greater than 0 repeat the loop again
+
+ b end_loops_uv
+
+loop_8_uv: @each iteration processes four rows
+
+ vld1.8 {q0}, [r0], r3 @load row 1 in source 1
+ vld1.8 {q4}, [r1], r4 @load row 1 in source 2
+ vld1.8 {q1}, [r0], r3 @load row 2 in source 1
+ vrhadd.u8 q0, q0, q4
+ vld1.8 {q5}, [r1], r4 @load row 2 in source 2
+ vld1.8 {q2}, [r0], r3 @load row 3 in source 1
+ vrhadd.u8 q1, q1, q5
+ vld1.8 {q6}, [r1], r4 @load row 3 in source 2
+ vld1.8 {q3}, [r0], r3 @load row 4 in source 1
+ vrhadd.u8 q2, q2, q6
+ vld1.8 {q7}, [r1], r4 @load row 4 in source 2
+
+ vst1.8 {q0}, [r2], r5 @store row 1 to destination
+ vrhadd.u8 q3, q3, q7
+ vst1.8 {q1}, [r2], r5 @store row 2 to destination
+ subs r6, r6, #4 @decrement ht by 4
+ vst1.8 {q2}, [r2], r5 @store row 3 to destination
+ vst1.8 {q3}, [r2], r5 @store row 4 to destination
+
+ bgt loop_8_uv @if greater than 0 repeat the loop again
+
+end_loops_uv:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r7, r15} @Reload the registers from sp
+
+
diff --git a/common/arm/ih264_ihadamard_scaling_a9.s b/common/arm/ih264_ihadamard_scaling_a9.s
new file mode 100755
index 0000000..687099a
--- /dev/null
+++ b/common/arm/ih264_ihadamard_scaling_a9.s
@@ -0,0 +1,250 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ih264_ihadamard_scaling_a9.s
+@ *
+@ * @brief
+@ * Contains function definitions for inverse hadamard transform on 4x4 DC outputs
+@ * of 16x16 intra-prediction
+@ *
+@ * @author
+@ * Mohit
+@ *
+@ * @par List of Functions:
+@ * - ih264_ihadamard_scaling_4x4_a9()
+@ * - ih264_ihadamard_scaling_2x2_uv_a9()
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+@ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
+@ * of a 16x16 intra prediction macroblock, and then performs scaling.
+@ *
+@ * @par Description:
+@ * The DC coefficients pass through a 2-stage inverse hadamard transform.
+@ * This inverse transformed content is scaled based on the Qp value.
+@ *
+@ * @param[in] pi2_src
+@ * input 4x4 block of DC coefficients
+@ *
+@ * @param[out] pi2_out
+@ * output 4x4 block
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * pointer to scaling list
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * pointer to weight matrix
+@ *
+@ * @param[in] u4_qp_div_6
+@ * Floor (qp/6)
+@ *
+@ * @param[in] pi4_tmp
+@ * temporary buffer of size 1*16
+@ *
+@ * @returns none
+@ *
+@ * @remarks none
+@ *
+@ *******************************************************************************
+@ */
+@ void ih264_ihadamard_scaling_4x4(WORD16* pi2_src,
+@ WORD16* pi2_out,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD32* pi4_tmp)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pi2_out
+@r2 => *pu2_iscal_mat
+@r3 => *pu2_weigh_mat
+@r4 => u4_qp_div_6
+
+.text
+.p2align 2
+
+ .global ih264_ihadamard_scaling_4x4_a9
+
+ih264_ihadamard_scaling_4x4_a9:
+
+@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4
+@If the macro value changes, the instruction needs to change accordingly.
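The transform and scaling steps below can be sketched in scalar C. This is an illustrative sketch (invented function name; the saturation performed by the final narrowing is omitted), with the (+32) >> 6 rounding matching the vqrshrn.s32 #6 at the end of the assembly.

typedef short WORD16;
typedef unsigned short UWORD16;
typedef unsigned int UWORD32;
typedef int WORD32;

static void ihadamard_scaling_4x4_sketch(const WORD16 *pi2_src, WORD16 *pi2_out,
                                         const UWORD16 *pu2_iscal_mat,
                                         const UWORD16 *pu2_weigh_mat,
                                         UWORD32 u4_qp_div_6)
{
    WORD32 tmp[16];
    WORD32 scale = pu2_iscal_mat[0] * pu2_weigh_mat[0];

    for (int i = 0; i < 4; i++) {            /* horizontal butterflies */
        WORD32 x0 = pi2_src[4 * i + 0] + pi2_src[4 * i + 3];
        WORD32 x1 = pi2_src[4 * i + 1] + pi2_src[4 * i + 2];
        WORD32 x2 = pi2_src[4 * i + 1] - pi2_src[4 * i + 2];
        WORD32 x3 = pi2_src[4 * i + 0] - pi2_src[4 * i + 3];
        tmp[4 * i + 0] = x0 + x1;
        tmp[4 * i + 1] = x3 + x2;
        tmp[4 * i + 2] = x0 - x1;
        tmp[4 * i + 3] = x3 - x2;
    }
    for (int i = 0; i < 4; i++) {            /* vertical butterflies + scaling */
        WORD32 x0 = tmp[i + 0] + tmp[i + 12];
        WORD32 x1 = tmp[i + 4] + tmp[i + 8];
        WORD32 x2 = tmp[i + 4] - tmp[i + 8];
        WORD32 x3 = tmp[i + 0] - tmp[i + 12];
        WORD32 v[4] = { x0 + x1, x3 + x2, x0 - x1, x3 - x2 };
        for (int j = 0; j < 4; j++)
            pi2_out[4 * j + i] =
                (WORD16)((((v[j] * scale) << u4_qp_div_6) + 32) >> 6);
    }
}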
+@Only one shift is done in the horizontal inverse because,
+@if u4_qp_div_6 is less than 4, the shift value would be negative (a right shift), and a rounding factor is needed,
+@whereas if u4_qp_div_6 is greater than 4, the shift value is positive (a left shift) and the rounding factor is 0
+
+ stmfd sp!, {r4-r12, r14} @ stack stores the values of the arguments
+ ldr r4, [sp, #40] @ Loads u4_qp_div_6
+ vdup.s32 q10, r4 @ Populate the u4_qp_div_6 in Q10
+ ldrh r6, [r3] @ load pu2_weigh_mat[0], H for unsigned halfword load
+ ldrh r7, [r2] @ load pu2_iscal_mat[0], H for unsigned halfword load
+ mul r6, r6, r7 @ pu2_iscal_mat[0]*pu2_weigh_mat[0]
+ vdup.s32 q9, r6 @ Populate pu2_iscal_mat[0]*pu2_weigh_mat[0] 32-bit in Q9
+ vpush {d8-d15}
+@=======================INVERSE HADAMARD TRANSFORM================================
+
+ vld4.s16 {d0, d1, d2, d3}, [r0] @load x4,x5,x6,x7
+ vaddl.s16 q12, d0, d3 @x0 = x4 + x7
+ vaddl.s16 q13, d1, d2 @x1 = x5 + x6
+ vsubl.s16 q14, d1, d2 @x2 = x5 - x6
+ vsubl.s16 q15, d0, d3 @x3 = x4 - x7
+
+ vadd.s32 q2, q12, q13 @pi4_tmp_ptr[0] = x0 + x1
+ vadd.s32 q3, q15, q14 @pi4_tmp_ptr[1] = x3 + x2
+ vsub.s32 q4, q12, q13 @pi4_tmp_ptr[2] = x0 - x1
+ vsub.s32 q5, q15, q14 @pi4_tmp_ptr[3] = x3 - x2
+
+ vtrn.32 q2, q3 @Transpose the register for vertical transform
+ vtrn.32 q4, q5
+
+ vswp d5, d8 @Q2 = x4, Q4 = x6
+ vswp d7, d10 @Q3 = x5, Q5 = x7
+
+
+ vadd.s32 q12, q2, q5 @x0 = x4+x7
+ vadd.s32 q13, q3, q4 @x1 = x5+x6
+ vsub.s32 q14, q3, q4 @x2 = x5-x6
+ vsub.s32 q15, q2, q5 @x3 = x4-x7
+
+ vadd.s32 q0, q12, q13 @pi4_tmp_ptr[0] = x0 + x1
+ vadd.s32 q1, q15, q14 @pi4_tmp_ptr[1] = x3 + x2
+ vsub.s32 q2, q12, q13 @pi4_tmp_ptr[2] = x0 - x1
+ vsub.s32 q3, q15, q14 @pi4_tmp_ptr[3] = x3 - x2
+
+
+ vmul.s32 q0, q0, q9 @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
+ vmul.s32 q1, q1, q9 @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
+ vmul.s32 q2, q2, q9 @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
+ vmul.s32 q3, q3, q9 @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
+
+ vshl.s32 q0, q0, q10 @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3
+ vshl.s32 q1, q1, q10 @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7
+ vshl.s32 q2, q2, q10 @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11
+ vshl.s32 q3, q3, q10 @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15
+
+ vqrshrn.s32 d0, q0, #0x6 @ D0 = c[i] = ((q[i] + 32) >> 6) where i = 0..3
+ vqrshrn.s32 d1, q1, #0x6 @ D1 = c[i] = ((q[i] + 32) >> 6) where i = 4..7
+ vqrshrn.s32 d2, q2, #0x6 @ D2 = c[i] = ((q[i] + 32) >> 6) where i = 8..11
+ vqrshrn.s32 d3, q3, #0x6 @ D3 = c[i] = ((q[i] + 32) >> 6) where i = 12..15
+
+ vst1.s16 {d0, d1, d2, d3}, [r1] @store all four rows of the output
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from SP
+
+
+
+@ *******************************************************************************
+@ */
+@ * @brief This function performs a 2x2 inverse hadamard transform for the chroma block
+@ *
+@ * @par Description:
+@ * The DC coefficients pass through a 2-stage inverse hadamard transform.
+@ * This inverse transformed content is scaled based on the Qp value.
+@ * Both DC blocks of the U and V planes are processed
+@ *
+@ * @param[in] pi2_src
+@ * input 1x8 block of coeffs. 
First 4 are from U and next from V
+@ *
+@ * @param[out] pi2_out
+@ * output 1x8 block
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * pointer to scaling list
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * pointer to weight matrix
+@ *
+@ * @param[in] u4_qp_div_6
+@ * Floor (qp/6)
+@ *
+@ * @returns none
+@ *
+@ * @remarks none
+@ *
+@ *******************************************************************************
+@ */
+@ void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
+@ WORD16* pi2_out,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6)
+
+ .global ih264_ihadamard_scaling_2x2_uv_a9
+ih264_ihadamard_scaling_2x2_uv_a9:
+
+@Registers used
+@ r0 : *pi2_src
+@ r1 : *pi2_out
+@ r2 : *pu2_iscal_mat
+@ r3 : *pu2_weigh_mat
+
+ vld1.u16 d26[0], [r2]
+ vld1.u16 d27[0], [r3]
+ vmull.u16 q15, d26, d27 @pu2_iscal_mat[0] * pu2_weigh_mat[0]
+ vdup.u32 q15, d30[0]
+
+ vld1.u16 d28[0], [sp] @load qp/6
+
+ vpush {d8-d15}
+
+ vmov.u16 d29, #5
+ vsubl.u16 q14, d28, d29 @qp/6 - 5
+ vdup.s32 q14, d28[0]
+
+ vld2.s16 {d0, d1}, [r0] @load 8 dc coeffs
+ @i2_x4,i2_x6,i2_y4,i2_y6 -> d0
+ @i2_x5,i2_x7,i2_y5,i2_y7 -> d1
+
+ vaddl.s16 q1, d0, d1 @ i4_x0 = i4_x4 + i4_x5;...x2
+ vsubl.s16 q2, d0, d1 @ i4_x1 = i4_x4 - i4_x5;...x3
+
+ vtrn.s32 q1, q2 @i4_x0 i4_x1 -> q1
+
+ vadd.s32 q3, q1, q2 @i4_x4 = i4_x0+i4_x2;.. i4_x5
+ vsub.s32 q1, q1, q2 @i4_x6 = i4_x0-i4_x2;.. i4_x7
+
+ vmul.s32 q5, q3, q15
+ vmul.s32 q6, q1, q15
+
+ vshl.s32 q7, q5, q14
+ vshl.s32 q8, q6, q14
+
+ vmovn.s32 d18, q7 @i4_x4 i4_x5 i4_y4 i4_y5
+ vmovn.s32 d19, q8 @i4_x6 i4_x7 i4_y6 i4_y7
+
+ vst2.s32 {d18-d19}, [r1]
+
+ vpop {d8-d15}
+ bx lr
+
+
diff --git a/common/arm/ih264_inter_pred_chroma_a9q.s b/common/arm/ih264_inter_pred_chroma_a9q.s
new file mode 100755
index 0000000..afd2860
--- /dev/null
+++ b/common/arm/ih264_inter_pred_chroma_a9q.s
@@ -0,0 +1,254 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_chroma_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
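The chroma interpolation of sec 8.4.2.2.2 that this file implements is a bilinear blend of four neighbours weighted by dx/dy, rounded by (+32) >> 6 exactly as the vqrshrun.s16 #6 in the code. A scalar sketch follows (invented name; wd is assumed to count per-component samples, so each row spans 2*wd interleaved U/V bytes and the horizontal neighbour is 2 bytes away, matching the vext.8 ..., #2 used below).

typedef unsigned char UWORD8;
typedef int WORD32;

static void inter_pred_chroma_sketch(const UWORD8 *pu1_src, UWORD8 *pu1_dst,
                                     WORD32 src_strd, WORD32 dst_strd,
                                     WORD32 dx, WORD32 dy,
                                     WORD32 ht, WORD32 wd)
{
    WORD32 w00 = (8 - dx) * (8 - dy), w01 = dx * (8 - dy);
    WORD32 w10 = (8 - dx) * dy,       w11 = dx * dy;

    for (WORD32 y = 0; y < ht; y++) {
        for (WORD32 x = 0; x < 2 * wd; x++)      /* interleaved U/V bytes */
            pu1_dst[x] = (UWORD8)((w00 * pu1_src[x] +
                                   w01 * pu1_src[x + 2] +
                                   w10 * pu1_src[x + src_strd] +
                                   w11 * pu1_src[x + src_strd + 2] + 32) >> 6);
        pu1_src += src_strd;
        pu1_dst += dst_strd;
    }
}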
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_chroma_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Interprediction chroma filter
+@*
+@* @par Description:
+@* Applies filtering to chroma samples as mentioned in
+@* sec 8.4.2.2.2 titled "chroma sample interpolation process"
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source containing alternate U and V samples
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] uc_dx
+@* dx value where the sample is to be produced (refer sec 8.4.2.2.2)
+@*
+@* @param[in] uc_dy
+@* dy value where the sample is to be produced (refer sec 8.4.2.2.2)
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_chroma(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ UWORD8 u1_dx,
+@ UWORD8 u1_dy,
+@ WORD32 ht,
+@ WORD32 wd)
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => u1_dx
+@ r5 => u1_dy
+@ r6 => height
+@ r7 => width
+@
+.text
+.p2align 2
+
+ .global ih264_inter_pred_chroma_a9q
+
+ih264_inter_pred_chroma_a9q:
+
+
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r4, [sp, #104]
+ ldr r5, [sp, #108]
+ ldr r6, [sp, #112]
+ ldr r7, [sp, #116]
+
+ rsb r8, r4, #8 @8-u1_dx
+ rsb r9, r5, #8 @8-u1_dy
+ mul r10, r8, r9
+ mul r11, r4, r9
+
+ vdup.u8 d28, r10
+ vdup.u8 d29, r11
+
+ mul r10, r8, r5
+ mul r11, r4, r5
+
+ vdup.u8 d30, r10
+ vdup.u8 d31, r11
+
+ subs r12, r7, #2 @if wd=2 branch to loop_2
+ beq loop_2
+ subs r12, r7, #4 @if wd=4 branch to loop_4
+ beq loop_4
+
+loop_8:
+ sub r6, #1
+ vld1.8 {d0, d1, d2}, [r0], r2 @ Load row0
+ vld1.8 {d5, d6, d7}, [r0], r2 @ Load row1
+ vext.8 d3, d0, d1, #2
+ vext.8 d8, d5, d6, #2
+
+ vmull.u8 q5, d0, d28
+ vmlal.u8 q5, d5, d30
+ vmlal.u8 q5, d3, d29
+ vmlal.u8 q5, d8, d31
+ vext.8 d9, d6, d7, #2
+ vext.8 d4, d1, d2, #2
+
+inner_loop_8:
+ vmull.u8 q6, d6, d30
+ vmlal.u8 q6, d1, d28
+ vmlal.u8 q6, d9, d31
+ vmlal.u8 q6, d4, d29
+ vmov d0, d5
+ vmov d3, d8
+
+ vqrshrun.s16 d14, q5, #6
+ vmov d1, d6
+ vmov d4, d9
+
+ vld1.8 {d5, d6, d7}, [r0], r2 @ Load next row
+ vqrshrun.s16 d15, q6, #6
+
+ vext.8 d8, d5, d6, #2
+ subs r6, #1
+ vext.8 d9, d6, d7, #2
+ vst1.8 {q7}, [r1], r3 @ Store dest row
+
+ vmull.u8 q5, d0, d28
+ vmlal.u8 q5, d5, d30
+ vmlal.u8 q5, d3, d29
+ vmlal.u8 q5, d8, d31
+ bne inner_loop_8
+
+ vmull.u8 q6, d6, d30
+ vmlal.u8 q6, d1, d28
+ vmlal.u8 q6, d9, d31
+ vmlal.u8 q6, d4, d29
+
+ vqrshrun.s16 d14, q5, #6
+ vqrshrun.s16 d15, q6, #6
+
+ vst1.8 {q7}, [r1], r3 @ Store dest row
+
+ b end_func
+
+loop_4:
+ sub r6, #1
+ vld1.8 {d0, d1}, [r0], r2 @ Load row0
+ vld1.8 {d2, d3}, [r0], r2 @ Load row1
+ vext.8 d1, d0, d1, #2
+ vext.8 d3, d2, d3, #2
+
+ vmull.u8 q2, d2, d30
+ vmlal.u8 q2, d0, d28
+ vmlal.u8 q2, d3, d31
+ vmlal.u8 q2, d1, d29
+ +inner_loop_4: + subs r6, #1 + vmov d0, d2 + vmov d1, d3 + + vld1.8 {d2, d3}, [r0], r2 @ Load row1 + vqrshrun.s16 d6, q2, #6 + + vext.8 d3, d2, d3, #2 + vst1.8 {d6}, [r1], r3 @ Store dest row + + vmull.u8 q2, d0, d28 + vmlal.u8 q2, d2, d30 + vmlal.u8 q2, d1, d29 + vmlal.u8 q2, d3, d31 + bne inner_loop_4 + + vqrshrun.s16 d6, q2, #6 + vst1.8 {d6}, [r1], r3 @ Store dest row + + b end_func + +loop_2: + vld1.8 {d0}, [r0], r2 @ Load row0 + vext.8 d1, d0, d0, #2 + vld1.8 {d2}, [r0], r2 @ Load row1 + vext.8 d3, d2, d2, #2 + vmull.u8 q2, d0, d28 + vmlal.u8 q2, d1, d29 + vmlal.u8 q2, d2, d30 + vmlal.u8 q2, d3, d31 + vld1.8 {d6}, [r0] @ Load row2 + vqrshrun.s16 d4, q2, #6 + vext.8 d7, d6, d6, #2 + vst1.32 d4[0], [r1], r3 @ Store dest row0 + vmull.u8 q4, d2, d28 + vmlal.u8 q4, d3, d29 + vmlal.u8 q4, d6, d30 + vmlal.u8 q4, d7, d31 + subs r6, #2 + vqrshrun.s16 d8, q4, #6 + vst1.32 d8[0], [r1], r3 @ Store dest row1 + bne loop_2 @ repeat if ht=2 + +end_func: + vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, pc} @ Restoring registers from stack + diff --git a/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s b/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s new file mode 100755 index 0000000..ea6bba0 --- /dev/null +++ b/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s @@ -0,0 +1,245 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_inter_pred_luma_horz_a9q.s +@* +@* @brief +@* Contains function definitions for inter prediction interpolation. 
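The horizontal filter defined below is the standard H.264 half-pel 6-tap (1, -5, 20, 20, -5, 1) with (+16) >> 5 rounding, which is what the vqrshrun.s16 #5 instructions compute. A scalar sketch under the same conventions follows (invented names; the taps read src[x-2..x+3], which is why the assembly starts from pu1_src - 2).

typedef unsigned char UWORD8;
typedef int WORD32;

static UWORD8 clip_u8(WORD32 v) { return (UWORD8)(v < 0 ? 0 : (v > 255 ? 255 : v)); }

static void inter_pred_luma_horz_sketch(const UWORD8 *pu1_src, UWORD8 *pu1_dst,
                                        WORD32 src_strd, WORD32 dst_strd,
                                        WORD32 ht, WORD32 wd)
{
    for (WORD32 y = 0; y < ht; y++) {
        for (WORD32 x = 0; x < wd; x++) {
            WORD32 sum = (pu1_src[x - 2] + pu1_src[x + 3])          /* a0 + a5 */
                       - 5 * (pu1_src[x - 1] + pu1_src[x + 2])      /* - 5a1 - 5a4 */
                       + 20 * (pu1_src[x] + pu1_src[x + 1]);        /* + 20a2 + 20a3 */
            pu1_dst[x] = clip_u8((sum + 16) >> 5);
        }
        pu1_src += src_strd;
        pu1_dst += dst_strd;
    }
}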
+@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* +@* - ih264_inter_pred_luma_horz_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_inter_pred_filters.c +@ + +@/** +@/** +@******************************************************************************* +@* +@* @brief +@* Interprediction luma filter for horizontal input +@* +@* @par Description: +@* Applies a 6 tap horizontal filter .The output is clipped to 8 bits +@* sec 8.4.2.2.1 titled "Luma sample interpolation process" +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* +@ @remarks +@* None +@* +@******************************************************************************* +@*/ + +@void ih264_inter_pred_luma_horz ( +@ UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ht, +@ WORD32 wd ) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r5 => ht +@ r6 => wd + +.text +.p2align 2 + + + .global ih264_inter_pred_luma_horz_a9q + +ih264_inter_pred_luma_horz_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + vstmdb sp!, {d8-d15} @push neon registers to stack + ldr r5, [sp, #104] @Loads ht + sub r0, r0, #2 @pu1_src-2 + ldr r6, [sp, #108] @Loads wd + vmov.i8 d0, #5 @filter coeff + subs r12, r6, #8 @if wd=8 branch to loop_8 + vmov.i8 d1, #20 @filter coeff + beq loop_8 + + subs r12, r6, #4 @if wd=4 branch to loop_4 + beq loop_4 + +loop_16: @when wd=16 + @// Processing row0 and row1 + vld1.8 {d2, d3, d4}, [r0], r2 @// Load row0 ;for checking loop + vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0) + vld1.8 {d5, d6, d7}, [r0], r2 @// Load row1 + vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row0) + vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0) + vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1) + vaddl.u8 q5, d30, d3 @// a0 + a5 (column2,row0) + vext.8 d27, d6, d7, #5 @//extract a[5] (column2,row1) + vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1) + vext.8 d31, d2, d3, #2 @//extract a[2] (column1,row0) + vaddl.u8 q8, d27, d6 @// a0 + a5 (column2,row1) + vext.8 d30, d3, d4, #2 @//extract a[2] (column2,row0) + vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 (column1,row0) + vext.8 d28, d5, d6, #2 @//extract a[2] (column1,row1) + vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 (column2,row0) + vext.8 d27, d6, d7, #2 @//extract a[2] (column2,row1) + vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 (column1,row1) + vext.8 d31, d2, d3, #3 @//extract a[3] (column1,row0) + vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 (column2,row1) + vext.8 d30, d3, d4, #3 @//extract a[3] (column2,row0) + vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vext.8 d28, d5, d6, #3 @//extract a[3] (column1,row1) + vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0) + vext.8 d27, d6, d7, #3 @//extract a[3] (column2,row1) + vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1) + vext.8 d31, d2, d3, #1 @//extract a[1] (column1,row0) + vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row1) + vext.8 d30, d3, d4, #1 @//extract a[1] (column2,row0) + 
vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vext.8 d28, d5, d6, #1 @//extract a[1] (column1,row1) + vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + vext.8 d27, d6, d7, #1 @//extract a[1] (column2,row1) + vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) + vext.8 d31, d2, d3, #4 @//extract a[4] (column1,row0) + vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1) + vext.8 d30, d3, d4, #4 @//extract a[4] (column2,row0) + vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + vext.8 d28, d5, d6, #4 @//extract a[4] (column1,row1) + vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + vext.8 d27, d6, d7, #4 @//extract a[4] (column2,row1) + vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) + vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1) + vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vqrshrun.s16 d21, q5, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row2) + vst1.8 {d20, d21}, [r1], r3 @//Store dest row0 + vqrshrun.s16 d23, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) + vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row2) + vqrshrun.s16 d24, q8, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1) + vst1.8 {d23, d24}, [r1], r3 @//Store dest row1 + subs r5, r5, #2 @ 2 rows done, decrement by 2 + + beq end_func + b loop_16 @ loop if height == 8 or 16 + +loop_8: +@// Processing row0 and row1 + vld1.8 {d5, d6}, [r0], r2 @// Load row1 + vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1) + vld1.8 {d2, d3}, [r0], r2 @// Load row0 + vext.8 d25, d5, d6, #2 @//extract a[2] (column1,row1) + vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0) + vext.8 d24, d5, d6, #3 @//extract a[3] (column1,row1) + vext.8 d23, d5, d6, #1 @//extract a[1] (column1,row1) + vext.8 d22, d5, d6, #4 @//extract a[4] (column1,row1) + vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1) + vext.8 d29, d2, d3, #3 @//extract a[3] (column1,row0) + vmlal.u8 q7, d25, d1 @// a0 + a5 + 20a2 (column1,row1) + vmlal.u8 q7, d24, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1) + vmlsl.u8 q7, d23, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) + vmlsl.u8 q7, d22, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) + vext.8 d30, d2, d3, #2 @//extract a[2] (column1,row0) + vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0) + vext.8 d27, d2, d3, #1 @//extract a[1] (column1,row0) + vext.8 d26, d2, d3, #4 @//extract a[4] (column1,row0) + vmlal.u8 q4, d29, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlal.u8 q4, d30, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlsl.u8 q4, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q4, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + vqrshrun.s16 d23, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vst1.8 {d23}, [r1], r3 @//Store dest row0 + vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) + vst1.8 {d20}, [r1], r3 @//Store dest row1 + subs r5, r5, #2 @ 2 rows done, decrement by 2 + + beq end_func @ Branch if height==4 + + b loop_8 @looping if height =8 or 16 + +loop_4: + vld1.8 {d5, d6}, [r0], r2 @// Load row1 + vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1) + vld1.8 {d2, d3}, [r0], r2 @// Load row0 + vext.8 d25, d5, d6, #2 @//extract a[2] (column1,row1) + vext.8 d31, d2, d3, #5 
@//extract a[5] (column1,row0) + vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1) + vext.8 d24, d5, d6, #3 @//extract a[3] (column1,row1) + vext.8 d23, d5, d6, #1 @//extract a[1] (column1,row1) + vext.8 d22, d5, d6, #4 @//extract a[4] (column1,row1) + vext.8 d29, d2, d3, #3 @//extract a[3] (column1,row0) + vmlal.u8 q7, d25, d1 @// a0 + a5 + 20a2 (column1,row1) + vmlal.u8 q7, d24, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1) + vmlsl.u8 q7, d23, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) + vmlsl.u8 q7, d22, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) + vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0) + vext.8 d30, d2, d3, #2 @//extract a[2] (column1,row0) + vext.8 d27, d2, d3, #1 @//extract a[1] (column1,row0) + vext.8 d26, d2, d3, #4 @//extract a[4] (column1,row0) + vmlal.u8 q4, d29, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlal.u8 q4, d30, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlsl.u8 q4, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q4, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + vqrshrun.s16 d23, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vst1.32 d23[0], [r1], r3 @//Store dest row0 + vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) + vst1.32 d20[0], [r1], r3 @//Store dest row1 + subs r5, r5, #2 @ 2 rows done, decrement by 2 + beq end_func + + b loop_4 + +end_func: + vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + diff --git a/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s b/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s new file mode 100755 index 0000000..5b29e02 --- /dev/null +++ b/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s @@ -0,0 +1,301 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_inter_pred_luma_vert_a9q.s +@* +@* @brief +@* Contains function definitions for inter prediction interpolation. 
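The vertical filter below uses the same (1, -5, 20, 20, -5, 1) taps as the horizontal case, stepped by src_strd instead of by 1; the NEON loops keep six source rows live in registers and reload only new rows as the window slides down. A scalar sketch (invented names, same rounding) follows.

typedef unsigned char UWORD8;
typedef int WORD32;

static UWORD8 clip255(WORD32 v) { return (UWORD8)(v < 0 ? 0 : (v > 255 ? 255 : v)); }

static void inter_pred_luma_vert_sketch(const UWORD8 *pu1_src, UWORD8 *pu1_dst,
                                        WORD32 src_strd, WORD32 dst_strd,
                                        WORD32 ht, WORD32 wd)
{
    for (WORD32 y = 0; y < ht; y++) {
        const UWORD8 *s = pu1_src + y * src_strd;   /* output row y */
        for (WORD32 x = 0; x < wd; x++) {
            WORD32 sum = (s[x - 2 * src_strd] + s[x + 3 * src_strd])
                       - 5 * (s[x - src_strd] + s[x + 2 * src_strd])
                       + 20 * (s[x] + s[x + src_strd]);
            pu1_dst[y * dst_strd + x] = clip255((sum + 16) >> 5);
        }
    }
}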
+@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* +@* - ih264_inter_pred_luma_vert_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_inter_pred_filters.c +@ + +@/** +@/** +@/** +@ ******************************************************************************* +@ * +@ * @brief +@ * Interprediction luma filter for vertical input +@ * +@ * @par Description: +@ * Applies a 6 tap vertcal filter.The output is clipped to 8 bits +@ * sec 8.4.2.2.1 titled "Luma sample interpolation process" +@ * +@ * @param[in] pu1_src +@ * UWORD8 pointer to the source +@ * +@ * @param[out] pu1_dst +@ * UWORD8 pointer to the destination +@ * +@ * @param[in] src_strd +@ * integer source stride +@ * +@ * @param[in] dst_strd +@ * integer destination stride +@ * +@ * @param[in] ht +@ * integer height of the array +@ * +@ * @param[in] wd +@ * integer width of the array +@ * +@ * @returns +@ * +@ * @remarks +@ * None +@ * +@ ******************************************************************************* + +@void ih264_inter_pred_luma_vert ( +@ UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ht, +@ WORD32 wd ) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r5 => ht +@ r6 => wd + +.text +.p2align 2 + + + .global ih264_inter_pred_luma_vert_a9q + +ih264_inter_pred_luma_vert_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + vstmdb sp!, {d8-d15} @push neon registers to stack + ldr r5, [sp, #104] @Loads ht + sub r0, r0, r2, lsl #1 @pu1_src-2*src_strd + ldr r6, [sp, #108] @Loads wd + vmov.u16 q11, #20 @ Filter coeff 0x14 into Q11 + + subs r12, r6, #8 @if wd=8 branch to loop_8 + vmov.u16 q12, #5 @ Filter coeff 0x5 into Q12 + beq loop_8 + + subs r12, r6, #4 @if wd=4 branch to loop_4 + beq loop_4 + +loop_16: @when wd=16 + + vld1.u32 {q0}, [r0], r2 @ Vector load from src[0_0] + vld1.u32 {q1}, [r0], r2 @ Vector load from src[1_0] + vld1.u32 {q2}, [r0], r2 @ Vector load from src[2_0] + vld1.u32 {q3}, [r0], r2 @ Vector load from src[3_0] + vld1.u32 {q4}, [r0], r2 @ Vector load from src[4_0] + vaddl.u8 q6, d4, d6 @ temp1 = src[2_0] + src[3_0] + vld1.u32 {q5}, [r0], r2 @ Vector load from src[5_0] + + vaddl.u8 q7, d0, d10 @ temp = src[0_0] + src[5_0] + vaddl.u8 q8, d2, d8 @ temp2 = src[1_0] + src[4_0] + vmla.u16 q7, q6, q11 @ temp += temp1 * 20 + vaddl.u8 q10, d1, d11 @ temp4 = src[0_8] + src[5_8] + vaddl.u8 q9, d5, d7 @ temp3 = src[2_8] + src[3_8] + vmla.u16 q10, q9, q11 @ temp4 += temp3 * 20 + vld1.u32 {q0}, [r0], r2 + vaddl.u8 q13, d3, d9 @ temp5 = src[1_8] + src[4_8] + vaddl.u8 q6, d6, d8 + vmls.u16 q7, q8, q12 @ temp -= temp2 * 5 + vaddl.u8 q8, d2, d0 + vaddl.u8 q9, d4, d10 + vmla.u16 q8, q6, q11 + vmls.u16 q10, q13, q12 @ temp4 -= temp5 * 5 + vaddl.u8 q13, d5, d11 + vaddl.u8 q6, d7, d9 + vqrshrun.s16 d30, q7, #5 @ dst[0_0] = CLIP_U8((temp +16) >> 5) + vaddl.u8 q7, d3, d1 + vld1.u32 {q1}, [r0], r2 + vmla.u16 q7, q6, q11 + vmls.u16 q8, q9, q12 + vqrshrun.s16 d31, q10, #5 @ dst[0_8] = CLIP_U8((temp4 +16) >> 5) + vaddl.u8 q9, d4, d2 + vaddl.u8 q6, d8, d10 + + vst1.u32 {q15}, [r1], r3 @ Vector store to dst[0_0] + vmla.u16 q9, q6, q11 + vaddl.u8 q10, d6, d0 + vmls.u16 q7, q13, q12 + vqrshrun.s16 d30, q8, #5 + vaddl.u8 q6, d9, d11 + vaddl.u8 q8, d5, d3 + vaddl.u8 q13, d7, d1 + vmla.u16 q8, q6, q11 + vmls.u16 q9, q10, q12 + vld1.u32 
{q2}, [r0], r2 + + vqrshrun.s16 d31, q7, #5 + vaddl.u8 q6, d10, d0 + vaddl.u8 q7, d6, d4 + vaddl.u8 q10, d8, d2 + vmla.u16 q7, q6, q11 + vmls.u16 q8, q13, q12 + vst1.u32 {q15}, [r1], r3 @store row 1 + vqrshrun.s16 d30, q9, #5 + vaddl.u8 q9, d7, d5 + vaddl.u8 q6, d11, d1 + vmla.u16 q9, q6, q11 + vaddl.u8 q13, d9, d3 + vmls.u16 q7, q10, q12 + + vqrshrun.s16 d31, q8, #5 + vmls.u16 q9, q13, q12 + vaddl.u8 q6, d0, d2 @ temp1 = src[2_0] + src[3_0] + vst1.u32 {q15}, [r1], r3 @store row 2 + vaddl.u8 q8, d10, d4 @ temp2 = src[1_0] + src[4_0] + vaddl.u8 q10, d9, d7 @ temp4 = src[0_8] + src[5_8] + vqrshrun.s16 d30, q7, #5 + vaddl.u8 q13, d5, d11 @ temp5 = src[1_8] + src[4_8] + vaddl.u8 q7, d8, d6 @ temp = src[0_0] + src[5_0] + vqrshrun.s16 d31, q9, #5 + vmla.u16 q7, q6, q11 @ temp += temp1 * 20 + vaddl.u8 q9, d1, d3 @ temp3 = src[2_8] + src[3_8] + vst1.u32 {q15}, [r1], r3 @store row 3 + subs r5, r5, #4 @ 4 rows processed, decrement by 4 + subne r0, r0 , r2, lsl #2 + subne r0, r0, r2 + beq end_func @ Branch if height==4 + + b loop_16 @ looping if height = 8 or 16 + +loop_8: +@// Processing row0 and row1 + + vld1.u32 d0, [r0], r2 @ Vector load from src[0_0] + vld1.u32 d1, [r0], r2 @ Vector load from src[1_0] + vld1.u32 d2, [r0], r2 @ Vector load from src[2_0] + vld1.u32 d3, [r0], r2 @ Vector load from src[3_0] + vld1.u32 d4, [r0], r2 @ Vector load from src[4_0] + vld1.u32 d5, [r0], r2 @ Vector load from src[5_0] + + vaddl.u8 q3, d2, d3 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q4, d0, d5 @ temp = src[0_0] + src[5_0] + vaddl.u8 q5, d1, d4 @ temp2 = src[1_0] + src[4_0] + vmla.u16 q4, q3, q11 @ temp += temp1 * 20 + vld1.u32 d6, [r0], r2 + vaddl.u8 q7, d3, d4 + vaddl.u8 q8, d1, d6 + vaddl.u8 q9, d2, d5 + vmls.u16 q4, q5, q12 @ temp -= temp2 * 5 + vmla.u16 q8, q7, q11 + vld1.u32 d7, [r0], r2 + vaddl.u8 q10, d4, d5 + vaddl.u8 q6, d2, d7 + vaddl.u8 q5, d3, d6 + vmls.u16 q8, q9, q12 + vqrshrun.s16 d26, q4, #5 @ dst[0_0] = CLIP_U8( (temp + 16) >> 5) + vmla.u16 q6, q10, q11 + vld1.u32 d0, [r0], r2 + vaddl.u8 q7, d5, d6 + vqrshrun.s16 d27, q8, #5 + vaddl.u8 q10, d3, d0 + vmls.u16 q6, q5, q12 + vst1.u32 d26, [r1], r3 @ Vector store to dst[0_0] + vaddl.u8 q9, d4, d7 + vmla.u16 q10, q7, q11 + vst1.u32 d27, [r1], r3 + vqrshrun.s16 d28, q6, #5 + vst1.u32 d28, [r1], r3 + vmls.u16 q10, q9, q12 + vqrshrun.s16 d29, q10, #5 + vst1.u32 d29, [r1], r3 @store row 3 + + subs r5, r5, #4 @ 4 rows processed, decrement by 4 + subne r0, r0 , r2, lsl #2 + subne r0, r0, r2 + beq end_func @ Branch if height==4 + + b loop_8 @looping if height == 8 or 16 + + +loop_4: +@// Processing row0 and row1 + + vld1.u32 d0[0], [r0], r2 @ Vector load from src[0_0] + vld1.u32 d1[0], [r0], r2 @ Vector load from src[1_0] + vld1.u32 d2[0], [r0], r2 @ Vector load from src[2_0] + vld1.u32 d3[0], [r0], r2 @ Vector load from src[3_0] + vld1.u32 d4[0], [r0], r2 @ Vector load from src[4_0] + vld1.u32 d5[0], [r0], r2 @ Vector load from src[5_0] + + vaddl.u8 q3, d2, d3 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q4, d0, d5 @ temp = src[0_0] + src[5_0] + vaddl.u8 q5, d1, d4 @ temp2 = src[1_0] + src[4_0] + vmla.u16 q4, q3, q11 @ temp += temp1 * 20 + vld1.u32 d6[0], [r0], r2 + vaddl.u8 q7, d3, d4 + vaddl.u8 q8, d1, d6 + vaddl.u8 q9, d2, d5 + vmls.u16 q4, q5, q12 @ temp -= temp2 * 5 + vld1.u32 d7[0], [r0], r2 + vmla.u16 q8, q7, q11 + vaddl.u8 q10, d4, d5 + vaddl.u8 q6, d2, d7 + vaddl.u8 q5, d3, d6 + vmls.u16 q8, q9, q12 + vqrshrun.s16 d26, q4, #5 @ dst[0_0] = CLIP_U8( (temp + 16) >> 5) + vmla.u16 q6, q10, q11 + vld1.u32 d0[0], [r0], r2 + vaddl.u8 q7, d5, d6 + 
vqrshrun.s16  d27, q8, #5
+    vaddl.u8      q10, d3, d0
+    vmls.u16      q6, q5, q12
+    vst1.u32      d26[0], [r1], r3      @ Vector store to dst[0_0]
+    vaddl.u8      q9, d4, d7
+    vmla.u16      q10, q7, q11
+    vst1.u32      d27[0], [r1], r3
+    vqrshrun.s16  d28, q6, #5
+    vst1.u32      d28[0], [r1], r3
+    vmls.u16      q10, q9, q12
+    vqrshrun.s16  d29, q10, #5
+    vst1.u32      d29[0], [r1], r3      @store row 3
+
+    subs          r5, r5, #8
+    subeq         r0, r0, r2, lsl #2
+    subeq         r0, r0, r2
+    beq           loop_4                @ Loop if height==8
+
+end_func:
+    vldmia        sp!, {d8-d15}         @ Restore neon registers that were saved
+    ldmfd         sp!, {r4-r12, pc}     @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_bilinear_a9q.s b/common/arm/ih264_inter_pred_luma_bilinear_a9q.s
new file mode 100755
index 0000000..6a3c83d
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_bilinear_a9q.s
@@ -0,0 +1,398 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@*  ih264_inter_pred_luma_bilinear_a9q.s
+@*
+@* @brief
+@*  Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@*  Ittiam
+@*
+@* @par List of Functions:
+@*
+@*  - ih264_inter_pred_luma_bilinear_a9q()
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@
+
+@/**
+@ *******************************************************************************
+@ * function:ih264_inter_pred_luma_bilinear
+@ *
+@* @brief
+@*    This routine applies the bilinear filter to the predictors.
+@*    The filtering operation is described in
+@*    sec 8.4.2.2.1 titled "Luma sample interpolation process"
+@*
+@* @par Description:
+@\note
+@*     This function is called to obtain pixels lying at the following
+@*    locations (1/4,1), (3/4,1), (1,1/4), (1,3/4), (1/4,1/2), (3/4,1/2),
+@*    (1/2,1/4), (1/2,3/4), (3/4,1/4), (1/4,3/4), (3/4,3/4) and (1/4,1/4).
+@*    The function averages the two adjacent values from the two input arrays
+@*    in the horizontal direction.
+@*
+@*
+@* @param[in] pu1_src1:
+@*  UWORD8 Pointer to the buffer containing the first input array.
+@*
+@* @param[in] pu1_src2:
+@*  UWORD8 Pointer to the buffer containing the second input array.
+@*
+@* @param[out] pu1_dst
+@*  UWORD8 pointer to the destination where the output of bilinear filter is stored.
+@* +@* @param[in] src_strd1 +@* Stride of the first input buffer +@* +@* @param[in] src_strd2 +@* Stride of the second input buffer +@* +@* @param[in] dst_strd +@* integer destination stride of pu1_dst +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@void ih264_inter_pred_luma_bilinear(UWORD8 *pu1_src1, +@ UWORD8 *pu1_src2, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd1, +@ WORD32 src_strd2, +@ WORD32 dst_strd, +@ WORD32 height, +@ WORD32 width) +@ +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src1 +@ r1 => *pu1_src2 +@ r2 => *pu1_dst +@ r3 => src_strd1 +@ r4 => src_strd2 +@ r5 => dst_strd +@ r6 => height +@ r7 => width +@ +.text +.p2align 2 + + .global ih264_inter_pred_luma_bilinear_a9q + +ih264_inter_pred_luma_bilinear_a9q: + + + + stmfd sp!, {r4-r12, r14} @store register values to stack + vstmdb sp!, {d8-d15} @push neon registers to stack + ldr r4, [sp, #104] + ldr r5, [sp, #108] @ + ldr r6, [sp, #112] + ldr r7, [sp, #116] + + subs r12, r7, #4 @if wd=4 branch to loop_4 + beq loop_4 + subs r12, r7, #8 @if wd=8 branch to loop_8 + beq loop_8 + +loop_16: @when wd=16 + + vld1.8 {q0}, [r0], r3 @// Load row0 ;src1 + vld1.8 {q2}, [r1], r4 @// Load row0 ;src2 + vld1.8 {q1}, [r0], r3 @// Load row1 ;src1 + vaddl.u8 q10, d0, d4 + vld1.8 {q3}, [r1], r4 @// Load row1 ;src2 + vaddl.u8 q11, d1, d5 + vld1.8 {q4}, [r0], r3 @// Load row2 ;src1 + vaddl.u8 q12, d2, d6 + vld1.8 {q5}, [r0], r3 @// Load row3 ;src1 + vaddl.u8 q13, d3, d7 + vld1.8 {q6}, [r1], r4 @// Load row2 ;src2 + vaddl.u8 q8, d8, d12 + vld1.8 {q7}, [r1], r4 @// Load row3 ;src2 + vaddl.u8 q9, d9, d13 + vqrshrun.s16 d28, q10, #1 + vqrshrun.s16 d29, q11, #1 + vaddl.u8 q10, d10, d14 + vqrshrun.s16 d30, q12, #1 + vqrshrun.s16 d31, q13, #1 + vst1.8 {q14}, [r2], r5 @//Store dest row0 + vaddl.u8 q11, d11, d15 + vst1.8 {q15}, [r2], r5 @//Store dest row1 + vqrshrun.s16 d28, q8, #1 + vld1.8 {q0}, [r0], r3 @// Load row4 ;src1 + vqrshrun.s16 d29, q9, #1 + vld1.8 {q1}, [r0], r3 @// Load row5 ;src1 + vqrshrun.s16 d30, q10, #1 + vld1.8 {q2}, [r1], r4 @// Load row4 ;src2 + vqrshrun.s16 d31, q11, #1 + vld1.8 {q3}, [r1], r4 @// Load row5 ;src2 + vaddl.u8 q10, d0, d4 + vst1.8 {q14}, [r2], r5 @//Store dest row2 + vaddl.u8 q13, d3, d7 + vst1.8 {q15}, [r2], r5 @//Store dest row3 + vaddl.u8 q11, d1, d5 + vld1.8 {q4}, [r0], r3 @// Load row6 ;src1 + vaddl.u8 q12, d2, d6 + vld1.8 {q5}, [r0], r3 @// Load row7 ;src1 + vqrshrun.s16 d28, q10, #1 + vld1.8 {q6}, [r1], r4 @// Load row6 ;src2 + vqrshrun.s16 d29, q11, #1 + vld1.8 {q7}, [r1], r4 @// Load row7 ;src2 + vaddl.u8 q8, d8, d12 + vaddl.u8 q9, d9, d13 + vaddl.u8 q10, d10, d14 + vqrshrun.s16 d30, q12, #1 + vqrshrun.s16 d31, q13, #1 + vst1.8 {q14}, [r2], r5 @//Store dest row4 + vaddl.u8 q11, d11, d15 + vst1.8 {q15}, [r2], r5 @//Store dest row5 + vqrshrun.s16 d28, q8, #1 + vqrshrun.s16 d30, q10, #1 + vqrshrun.s16 d29, q9, #1 + vld1.8 {q2}, [r1], r4 @// Load row8 ;src2 + vqrshrun.s16 d31, q11, #1 + vst1.8 {q14}, [r2], r5 @//Store dest row6 + subs r12, r6, #8 + vst1.8 {q15}, [r2], r5 @//Store dest row7 + + beq end_func @ end function if ht=8 + + vld1.8 {q0}, [r0], r3 @// Load row8 ;src1 + vaddl.u8 q10, d0, d4 + vld1.8 {q1}, [r0], r3 @// Load row9 ;src1 + vaddl.u8 q11, d1, d5 + vld1.8 {q3}, [r1], r4 @// Load row9 ;src2 + vqrshrun.s16 d28, q10, #1 + vld1.8 {q4}, [r0], r3 @// Load row10 ;src1 
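+    @// Note (illustrative only, not part of the original source): every
+    @// vaddl.u8 / vqrshrun.s16 #1 pair in this loop computes the rounded
+    @// per-pixel average of the two predictor arrays, i.e. in C terms:
+    @//     pu1_dst[x] = (UWORD8)((pu1_src1[x] + pu1_src2[x] + 1) >> 1);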
+ vqrshrun.s16 d29, q11, #1 + vld1.8 {q5}, [r0], r3 @// Load row11 ;src1 + vaddl.u8 q12, d2, d6 + vld1.8 {q6}, [r1], r4 @// Load row10 ;src2 + vaddl.u8 q13, d3, d7 + vld1.8 {q7}, [r1], r4 @// Load row11 ;src2 + vaddl.u8 q8, d8, d12 + vaddl.u8 q9, d9, d13 + vaddl.u8 q10, d10, d14 + vqrshrun.s16 d30, q12, #1 + vst1.8 {q14}, [r2], r5 @//Store dest row8 + vqrshrun.s16 d31, q13, #1 + vst1.8 {q15}, [r2], r5 @//Store dest row9 + vqrshrun.s16 d28, q8, #1 + vld1.8 {q0}, [r0], r3 @// Load row12 ;src1 + vaddl.u8 q11, d11, d15 + vld1.8 {q1}, [r0], r3 @// Load row13 ;src1 + vqrshrun.s16 d29, q9, #1 + vld1.8 {q2}, [r1], r4 @// Load row12 ;src2 + vqrshrun.s16 d30, q10, #1 + vld1.8 {q3}, [r1], r4 @// Load row13 ;src2 + vqrshrun.s16 d31, q11, #1 + vst1.8 {q14}, [r2], r5 @//Store dest row10 + vaddl.u8 q10, d0, d4 + vst1.8 {q15}, [r2], r5 @//Store dest row11 + vaddl.u8 q11, d1, d5 + vld1.8 {q4}, [r0], r3 @// Load row14 ;src1 + vaddl.u8 q13, d3, d7 + vld1.8 {q5}, [r0], r3 @// Load row15 ;src1 + vaddl.u8 q12, d2, d6 + vld1.8 {q6}, [r1], r4 @// Load row14 ;src2 + vaddl.u8 q8, d8, d12 + vld1.8 {q7}, [r1], r4 @// Load row15 ;src2 + vaddl.u8 q9, d9, d13 + vqrshrun.s16 d28, q10, #1 + vqrshrun.s16 d29, q11, #1 + vaddl.u8 q10, d10, d14 + vst1.8 {q14}, [r2], r5 @//Store dest row12 + vqrshrun.s16 d30, q12, #1 + vqrshrun.s16 d31, q13, #1 + vaddl.u8 q11, d11, d15 + vst1.8 {q15}, [r2], r5 @//Store dest row13 + vqrshrun.s16 d28, q8, #1 + vqrshrun.s16 d29, q9, #1 + vqrshrun.s16 d30, q10, #1 + vst1.8 {q14}, [r2], r5 @//Store dest row14 + vqrshrun.s16 d31, q11, #1 + vst1.8 {q15}, [r2], r5 @//Store dest row15 + b end_func + + + +loop_8: @wd=8; + vld1.8 {d0}, [r0], r3 @// Load row0 ;src1 + vld1.8 {d4}, [r1], r4 @// Load row0 ;src2 + vld1.8 {d1}, [r0], r3 @// Load row1 ;src1 + vaddl.u8 q10, d0, d4 + vld1.8 {d5}, [r1], r4 @// Load row1 ;src2 + vld1.8 {d2}, [r0], r3 @// Load row2 ;src1 + vqrshrun.s16 d28, q10, #1 + vld1.8 {d6}, [r1], r4 @// Load row2 ;src2 + vaddl.u8 q11, d1, d5 + vld1.8 {d3}, [r0], r3 @// Load row3 ;src1 + vaddl.u8 q12, d2, d6 + vst1.8 {d28}, [r2], r5 @//Store dest row0 + vqrshrun.s16 d29, q11, #1 + vld1.8 {d7}, [r1], r4 @// Load row3 ;src2 + vqrshrun.s16 d30, q12, #1 + vst1.8 {d29}, [r2], r5 @//Store dest row1 + vaddl.u8 q13, d3, d7 + vst1.8 {d30}, [r2], r5 @//Store dest row2 + vqrshrun.s16 d31, q13, #1 + subs r12, r6, #4 + vst1.8 {d31}, [r2], r5 @//Store dest row3 + beq end_func @ end function if ht=4 + + vld1.8 {d12}, [r1], r4 @// Load row4 ;src2 + vld1.8 {d8}, [r0], r3 @// Load row4 ;src1 + vld1.8 {d9}, [r0], r3 @// Load row5 ;src1 + vaddl.u8 q8, d8, d12 + vld1.8 {d13}, [r1], r4 @// Load row5 ;src2 + vld1.8 {d10}, [r0], r3 @// Load row6;src1 + vaddl.u8 q9, d9, d13 + vld1.8 {d14}, [r1], r4 @// Load row6 ;src2 + vqrshrun.s16 d28, q8, #1 + vld1.8 {d11}, [r0], r3 @// Load row7 ;src1 + vqrshrun.s16 d29, q9, #1 + vst1.8 {d28}, [r2], r5 @//Store dest row4 + vaddl.u8 q10, d10, d14 + vst1.8 {d29}, [r2], r5 @//Store dest row5 + vqrshrun.s16 d30, q10, #1 + vld1.8 {d15}, [r1], r4 @// Load row7 ;src2 + vaddl.u8 q11, d11, d15 + vst1.8 {d30}, [r2], r5 @//Store dest row6 + vqrshrun.s16 d31, q11, #1 + subs r12, r6, #8 + vst1.8 {d31}, [r2], r5 @//Store dest row7 + beq end_func @ end function if ht=8 + + vld1.8 {d0}, [r0], r3 @// Load row8 ;src1 + vld1.8 {d4}, [r1], r4 @// Load row8 ;src2 + vld1.8 {d1}, [r0], r3 @// Load row9 ;src1 + vaddl.u8 q10, d0, d4 + vld1.8 {d5}, [r1], r4 @// Load row9 ;src2 + vld1.8 {d2}, [r0], r3 @// Load row10 ;src1 + vaddl.u8 q11, d1, d5 + vld1.8 {d6}, [r1], r4 @// Load row10 ;src2 + vqrshrun.s16 d28, 
q10, #1 + vld1.8 {d3}, [r0], r3 @// Load row11 ;src1 + vaddl.u8 q12, d2, d6 + vld1.8 {d7}, [r1], r4 @// Load row11 ;src2 + vqrshrun.s16 d29, q11, #1 + vld1.8 {d8}, [r0], r3 @// Load row12 ;src1 + vaddl.u8 q13, d3, d7 + vst1.8 {d28}, [r2], r5 @//Store dest row8 + vqrshrun.s16 d30, q12, #1 + vld1.8 {d12}, [r1], r4 @// Load row12 ;src2 + vqrshrun.s16 d31, q13, #1 + vst1.8 {d29}, [r2], r5 @//Store dest row9 + vaddl.u8 q8, d8, d12 + vld1.8 {d9}, [r0], r3 @// Load row13 ;src1 + vqrshrun.s16 d28, q8, #1 + vld1.8 {d13}, [r1], r4 @// Load row13 ;src2 + vld1.8 {d10}, [r0], r3 @// Load row14;src1 + vaddl.u8 q9, d9, d13 + vld1.8 {d11}, [r0], r3 @// Load row15 ;src1 + vld1.8 {d14}, [r1], r4 @// Load row14 ;src2 + vqrshrun.s16 d29, q9, #1 + vld1.8 {d15}, [r1], r4 @// Load roW15 ;src2 + vaddl.u8 q10, d10, d14 + vst1.8 {d30}, [r2], r5 @//Store dest row10 + vaddl.u8 q11, d11, d15 + vst1.8 {d31}, [r2], r5 @//Store dest row11 + vqrshrun.s16 d30, q10, #1 + vst1.8 {d28}, [r2], r5 @//Store dest row12 + vqrshrun.s16 d31, q11, #1 + vst1.8 {d29}, [r2], r5 @//Store dest row13 + vst1.8 {d30}, [r2], r5 @//Store dest row14 + vst1.8 {d31}, [r2], r5 @//Store dest row15 + + b end_func + + + +loop_4: + vld1.32 d0[0], [r0], r3 @// Load row0 ;src1 + vld1.32 d4[0], [r1], r4 @// Load row0 ;src2 + vld1.32 d1[0], [r0], r3 @// Load row1 ;src1 + vaddl.u8 q10, d0, d4 + vld1.32 d5[0], [r1], r4 @// Load row1 ;src2 + vld1.32 d2[0], [r0], r3 @// Load row2 ;src1 + vqrshrun.s16 d28, q10, #1 + vld1.32 d6[0], [r1], r4 @// Load row2 ;src2 + vaddl.u8 q11, d1, d5 + vld1.32 d3[0], [r0], r3 @// Load row3 ;src1 + vaddl.u8 q12, d2, d6 + vst1.32 d28[0], [r2], r5 @//Store dest row0 + vqrshrun.s16 d29, q11, #1 + vld1.32 d7[0], [r1], r4 @// Load row3 ;src2 + vqrshrun.s16 d30, q12, #1 + vst1.32 d29[0], [r2], r5 @//Store dest row1 + vaddl.u8 q13, d3, d7 + vst1.32 d30[0], [r2], r5 @//Store dest row2 + vqrshrun.s16 d31, q13, #1 + subs r12, r6, #4 + vst1.32 d31[0], [r2], r5 @//Store dest row3 + beq end_func @ end function if ht=4 + + vld1.32 d12[0], [r1], r4 @// Load row4 ;src2 + vld1.32 d8[0], [r0], r3 @// Load row4 ;src1 + vld1.32 d9[0], [r0], r3 @// Load row5 ;src1 + vaddl.u8 q8, d8, d12 + vld1.32 d13[0], [r1], r4 @// Load row5 ;src2 + vld1.32 d10[0], [r0], r3 @// Load row6;src1 + vaddl.u8 q9, d9, d13 + vld1.32 d14[0], [r1], r4 @// Load row6 ;src2 + vqrshrun.s16 d28, q8, #1 + vld1.32 d11[0], [r0], r3 @// Load row7 ;src1 + vqrshrun.s16 d29, q9, #1 + vst1.32 d28[0], [r2], r5 @//Store dest row4 + vaddl.u8 q10, d10, d14 + vst1.32 d29[0], [r2], r5 @//Store dest row5 + vqrshrun.s16 d30, q10, #1 + vld1.32 d15[0], [r1], r4 @// Load row7 ;src2 + vaddl.u8 q11, d11, d15 + vst1.32 d30[0], [r2], r5 @//Store dest row6 + vqrshrun.s16 d31, q11, #1 + vst1.32 d31[0], [r2], r5 @//Store dest row7 + +end_func: + + vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + diff --git a/common/arm/ih264_inter_pred_luma_copy_a9q.s b/common/arm/ih264_inter_pred_luma_copy_a9q.s new file mode 100755 index 0000000..8ba2fbf --- /dev/null +++ b/common/arm/ih264_inter_pred_luma_copy_a9q.s @@ -0,0 +1,253 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. 
+@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@/** +@******************************************************************************* +@* +@* @brief +@* Interprediction luma function for copy +@* +@* @par Description: +@* Copies the array of width 'wd' and height 'ht' from the location pointed +@* by 'src' to the location pointed by 'dst' +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@void ih264_inter_pred_luma_copy ( +@ UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ht, +@ WORD32 wd ) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r7 => ht +@ r12 => wd + +.text +.p2align 2 + + .global ih264_inter_pred_luma_copy_a9q + +ih264_inter_pred_luma_copy_a9q: + stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vstmdb sp!, {d8-d15} @push neon registers to stack + ldr r12, [sp, #108] @Loads wd + ldr r7, [sp, #104] @Loads ht + cmp r7, #0 @checks ht == 0 + ble end_loops + tst r12, #15 @checks wd for multiples for 4 & 8 + beq core_loop_wd_16 + tst r12, #7 @checks wd for multiples for 4 & 8 + beq core_loop_wd_8 + sub r11, r12, #4 + +outer_loop_wd_4: + subs r4, r12, #0 @checks wd == 0 + ble end_inner_loop_wd_4 + +inner_loop_wd_4: + vld1.32 {d0[0]}, [r0] @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0) + add r5, r0, r2 @pu1_src_tmp += src_strd + add r6, r1, r3 @pu1_dst_tmp += dst_strd + vst1.32 {d0[0]}, [r1] @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0) + vld1.32 {d0[0]}, [r5], r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0) + add r0, r0, #4 @pu1_src += 4 + vst1.32 {d0[0]}, [r6], r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0) + vld1.32 {d0[0]}, [r5], r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0) + subs r4, r4, #4 @(wd -4) + vst1.32 {d0[0]}, [r6], r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0) + vld1.32 {d0[0]}, [r5], r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0) + add r1, r1, #4 @pu1_dst += 4 + vst1.32 {d0[0]}, [r6], r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0) + + bgt inner_loop_wd_4 + +end_inner_loop_wd_4: + subs r7, r7, #4 @ht - 4 + sub r0, r5, r11 @pu1_src = pu1_src_tmp + sub r1, r6, r11 @pu1_dst = pu1_dst_tmp + bgt outer_loop_wd_4 + +end_loops: + vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, r15} @Reload the registers from SP + + + +core_loop_wd_8: + sub r11, r12, #8 + +outer_loop_wd_8: + subs r4, r12, #0 @checks wd + ble end_inner_loop_wd_8 + 
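+@ A minimal C sketch of the wd=8 copy loop below (illustrative only, not
+@ part of the original source): each inner iteration copies an 8-byte wide
+@ strip of 4 rows, and the outer loop then advances both pointers by 4 rows.
+@
+@     for(col = 0; col < wd; col += 8)
+@         for(row = 0; row < 4; row++)
+@             memcpy(pu1_dst + row * dst_strd + col,
+@                    pu1_src + row * src_strd + col, 8);
+@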
+inner_loop_wd_8:
+    add           r5, r0, r2            @pu1_src_tmp += src_strd
+    vld1.8        {d0}, [r0]!           @vld1_u8(pu1_src_tmp)
+    add           r6, r1, r3            @pu1_dst_tmp += dst_strd
+    vst1.8        {d0}, [r1]!           @vst1_u8(pu1_dst_tmp, tmp_src)
+    vld1.8        {d1}, [r5], r2        @vld1_u8(pu1_src_tmp)
+    vst1.8        {d1}, [r6], r3        @vst1_u8(pu1_dst_tmp, tmp_src)
+    subs          r4, r4, #8            @wd - 8 (loop condition)
+    vld1.8        {d2}, [r5], r2        @vld1_u8(pu1_src_tmp)
+    vst1.8        {d2}, [r6], r3        @vst1_u8(pu1_dst_tmp, tmp_src)
+    vld1.8        {d3}, [r5], r2        @vld1_u8(pu1_src_tmp)
+    vst1.8        {d3}, [r6], r3        @vst1_u8(pu1_dst_tmp, tmp_src)
+    bgt           inner_loop_wd_8
+
+end_inner_loop_wd_8:
+    subs          r7, r7, #4            @ht -= 4
+    sub           r0, r5, r11           @pu1_src = pu1_src_tmp
+    sub           r1, r6, r11           @pu1_dst = pu1_dst_tmp
+    bgt           outer_loop_wd_8
+
+    vldmia        sp!, {d8-d15}         @ Restore neon registers that were saved
+    ldmfd         sp!, {r4-r12, r15}    @Reload the registers from SP
+
+core_loop_wd_16:
+    sub           r11, r12, #16
+
+outer_loop_wd_16:
+    subs          r4, r12, #0           @checks wd
+    ble           end_inner_loop_wd_16
+
+inner_loop_wd_16:
+    add           r5, r0, r2            @pu1_src_tmp += src_strd
+    vld1.8        {q0}, [r0]!           @vld1_u8(pu1_src_tmp)
+    add           r6, r1, r3            @pu1_dst_tmp += dst_strd
+    vst1.8        {q0}, [r1]!           @vst1_u8(pu1_dst_tmp, tmp_src)
+    vld1.8        {q1}, [r5], r2        @vld1_u8(pu1_src_tmp)
+    vst1.8        {q1}, [r6], r3        @vst1_u8(pu1_dst_tmp, tmp_src)
+    subs          r4, r4, #16           @wd - 16 (loop condition)
+    vld1.8        {q2}, [r5], r2        @vld1_u8(pu1_src_tmp)
+    vst1.8        {q2}, [r6], r3        @vst1_u8(pu1_dst_tmp, tmp_src)
+    vld1.8        {q3}, [r5], r2        @vld1_u8(pu1_src_tmp)
+    vst1.8        {q3}, [r6], r3        @vst1_u8(pu1_dst_tmp, tmp_src)
+    bgt           inner_loop_wd_16
+
+end_inner_loop_wd_16:
+    subs          r7, r7, #4            @ht -= 4
+    sub           r0, r5, r11           @pu1_src = pu1_src_tmp
+    sub           r1, r6, r11           @pu1_dst = pu1_dst_tmp
+    bgt           outer_loop_wd_16
+
+    vldmia        sp!, {d8-d15}         @ Restore neon registers that were saved
+    ldmfd         sp!, {r4-r12, r15}    @Reload the registers from SP
+
+
+@ /*
+@ ********************************************************************************
+@ *
+@ * @brief This function copies a 4x4 block to destination
+@ *
+@ * @par Description:
+@ *    Copies a 4x4 block to destination, where both src and dst are interleaved
+@ *
+@ * @param[in] pi2_src
+@ *  Source
+@ *
+@ * @param[in] pu1_out
+@ *  Output pointer
+@ *
+@ * @param[in] pred_strd
+@ *  Prediction buffer stride
+@ *
+@ * @param[in] out_strd
+@ *  Output buffer stride
+@ *
+@ * @returns none
+@ *
+@ * @remarks none
+@ *  Currently wd and ht are not used, i.e. a 4x4 block is always copied
+@ *
+@ *******************************************************************************
+@ */
+@ void ih264_interleave_copy(WORD16 *pi2_src,
+@                            UWORD8 *pu1_out,
+@                            WORD32 pred_strd,
+@                            WORD32 out_strd,
+@                            WORD32 wd,
+@                            WORD32 ht)
+@ Register Usage
+@ r0 : pi2_src
+@ r1 : pu1_out
+@ r2 : pred_strd
+@ r3 : out_strd
+@ Neon registers d0-d7, d16-d30 are used
+@ No need for pushing arm and neon registers
+
+    .global ih264_interleave_copy_a9
+ih264_interleave_copy_a9:
+
+    vld1.u8       d2, [r0], r2          @load src plane 1 => d2 & pred plane 2 => d3
+    vld1.u8       d3, [r0], r2
+    vld1.u8       d4, [r0], r2
+    vld1.u8       d5, [r0], r2
+
+    mov           r0, r1
+
+    vld1.u8       d18, [r1], r3         @load out (8 bit) - 8 coeffs
+    vld1.u8       d19, [r1], r3
+    vmov.u16      q15, #0x00ff
+    vld1.u8       d20, [r1], r3
+    vld1.u8       d21, [r1], r3
+
+    vbit.u8       q9, q1, q15
+    vbit.u8       q10, q2, q15
+
+    vst1.u8       d18, [r0], r3         @store out
+    vst1.u8       d19, [r0], r3
+    vst1.u8       d20, [r0], r3
+    vst1.u8       d21, [r0], r3
+
+    bx            lr
+
+
+
diff --git a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
new file mode 100755
index 0000000..43321a8
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
@@ -0,0 +1,441 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@*  ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
+@*
+@* @brief
+@*  Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@*  Mohit
+@*
+@* @par List of Functions:
+@*
+@*  - ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q()
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*   This function implements a two stage cascaded six tap filter. It
+@*   applies the six tap filter in the vertical direction on the
+@*   predictor values, followed by applying the same filter in the
+@*   horizontal direction on the output of the first stage. The six tap
+@*   filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
+@*   interpolation process"
+@*
+@* @par Description:
+@*    This function is called to obtain pixels lying at the following
+@*    location (1/2,1/2). The function interpolates
+@*    the predictors first in the vertical direction and then in the
+@*    horizontal direction to output the (1/2,1/2) sample.
+@*
+@* @param[in] pu1_src
+@*  UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @param[in] pu1_tmp: temporary buffer
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations: UNUSED in this function.
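+@*
+@* A rough C sketch of the two-stage filtering described above (illustrative
+@* only, not part of the original source; CLIP_U8 clips to [0, 255] as used
+@* elsewhere in these comments, and the 16-bit tmp row holds the vertical
+@* half-pel result before the horizontal pass):
+@*
+@*     tmp[x] = src[x - 2 * strd] + src[x + 3 * strd]
+@*            + 20 * (src[x] + src[x + strd])
+@*            -  5 * (src[x - strd] + src[x + 2 * strd]);
+@*     dst[x] = CLIP_U8((tmp[x - 2] + tmp[x + 3]
+@*            + 20 * (tmp[x] + tmp[x + 1])
+@*            -  5 * (tmp[x - 1] + tmp[x + 2]) + 512) >> 10);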
+@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/; + +@void ih264_inter_pred_luma_horz_hpel_vert_hpel(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd,, +@ WORD32 dst_strd, +@ WORD32 ht, +@ WORD32 wd, +@ UWORD8* pu1_tmp, +@ UWORD32 dydx) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r8 => ht +@ r9 => wd + +.text +.p2align 2 + + .global ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q + +ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + vstmdb sp!, {d8-d15} @push neon registers to stack + ldr r8, [sp, #104] @ loads ht + sub r0, r0, r2, lsl #1 @pu1_src-2*src_strd + sub r0, r0, #2 @pu1_src-2 + ldr r9, [sp, #108] @ loads wd + + vmov.s16 d0, #20 @ Filter coeff 20 + vmov.s16 d1, #5 @ Filter coeff 5 + subs r12, r9, #4 @if wd=4 branch to loop_4 + beq loop_4 + subs r12, r9, #8 @if wd=8 branch to loop_8 + beq loop_8 + + mov r10, #8 + sub r7, r3, r10 + @when wd=16 + +loop_16: + vld1.u32 {d2, d3, d4}, [r0], r2 @ Vector load from src[0_0] + vld1.u32 {d5, d6, d7}, [r0], r2 @ Vector load from src[1_0] + vld1.u32 {d8, d9, d10}, [r0], r2 @ Vector load from src[2_0] + vld1.u32 {d11, d12, d13}, [r0], r2 @ Vector load from src[3_0] + vld1.u32 {d14, d15, d16}, [r0], r2 @ Vector load from src[4_0] + vld1.u32 {d17, d18, d19}, [r0], r2 @ Vector load from src[5_0] + + @ vERTICAL FILTERING FOR ROW 0 + vaddl.u8 q10, d8, d11 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q12, d2, d17 @ temp2 = src[0_0] + src[5_0] + vaddl.u8 q11, d5, d14 @ temp = src[1_0] + src[4_0] + vaddl.u8 q13, d3, d18 @ temp2 = src[0_0] + src[5_0] + vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20 + vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5 + vaddl.u8 q10, d6, d15 @ temp = src[1_0] + src[4_0] + vaddl.u8 q11, d9, d12 @ temp3 = src[2_0] + src[3_0] + vaddl.u8 q14, d4, d19 @ temp2 = src[0_0] + src[5_0] + vmla.u16 q13, q11, d0[0] @ temp4 += temp3 * 20 + vmls.s16 q13, q10, d1[0] @ temp -= temp2 * 5 + vaddl.u8 q11, d10, d13 @ temp3 = src[2_0] + src[3_0] + vaddl.u8 q10, d7, d16 @ temp = src[1_0] + src[4_0] + vmla.u16 q14, q11, d0[0] @ temp4 += temp3 * 20 + vmls.s16 q14, q10, d1[0] @ temp -= temp2 * 5 + vext.16 q10, q12, q13, #5 @//extract a[5] (column1) + + @Q12,Q13,Q14 HAVE VERTICAL FILTERED VALUES + @CASCADED FILTERING FOR ROW 0 + vext.16 q11, q12, q13, #2 @//extract a[2] (column1) + vaddl.s16 q1, d20, d24 @// a0 + a5 (column1) + vaddl.s16 q15, d21, d25 @// a0 + a5 (column1) + vmlal.s16 q1, d22, d0[0] @// a0 + a5 + 20a2 (column1) + vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1) + vext.16 q11, q12, q13, #1 @//extract a[1] (column1) + vext.16 q10, q12, q13, #3 @//extract a[3] (column1) + vmlsl.s16 q1, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlal.s16 q1, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vext.16 q11, q12, q13, #4 @//extract a[4] (column1) + vext.16 q10, q13, q14, #5 @//extract a[5] (column2) + vmlsl.s16 q1, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vqrshrun.s32 d22, q1, #10 + vqrshrun.s32 d23, q15, #10 + vqshrun.s16 d22, q11, #0 + vst1.u8 {d22}, [r1], r10 @//Store dest row0, column 1; (1/2,1/2) + vext.16 q11, q13, q14, #2 @//extract a[2] 
(column2) + vaddl.s16 q1, d20, d26 @// a0 + a5 (column2) + vaddl.s16 q15, d21, d27 @// a0 + a5 (column2) + vmlal.s16 q1, d22, d0[0] @// a0 + a5 + 20a2 (column2) + vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column2) + vext.16 q10, q13, q14, #3 @//extract a[3] (column2) + vext.16 q11, q13, q14, #1 @//extract a[1] (column2) + vmlal.s16 q1, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column2) + vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column2) + vext.16 q10, q13, q14, #4 @//extract a[4] (column2) + vmlsl.s16 q1, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2) + vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2) + vmlsl.s16 q1, d20, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2) + vmlsl.s16 q15, d21, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2) + vqrshrun.s32 d20, q1, #10 + vqrshrun.s32 d21, q15, #10 + vld1.u32 {d2, d3, d4}, [r0], r2 @ Vector load from src[6_0] + vqshrun.s16 d22, q10, #0 + vst1.u8 {d22}, [r1], r7 @//Store dest row0 ,column 2; (1/2,1/2) + + @ vERTICAL FILTERING FOR ROW 1 + vaddl.u8 q10, d11, d14 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q12, d5, d2 @ temp2 = src[0_0] + src[5_0] + vaddl.u8 q11, d8, d17 @ temp = src[1_0] + src[4_0] + vaddl.u8 q13, d6, d3 @ temp2 = src[0_0] + src[5_0] + vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20 + vaddl.u8 q10, d9, d18 @ temp = src[1_0] + src[4_0] + vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5 + vaddl.u8 q11, d12, d15 @ temp3 = src[2_0] + src[3_0] + vaddl.u8 q14, d7, d4 @ temp2 = src[0_0] + src[5_0] + vmla.u16 q13, q11, d0[0] @ temp4 += temp3 * 20 + vaddl.u8 q11, d13, d16 @ temp3 = src[2_0] + src[3_0] + vmls.s16 q13, q10, d1[0] @ temp -= temp2 * 5 + vmla.u16 q14, q11, d0[0] @ temp4 += temp3 * 20 + vaddl.u8 q10, d10, d19 @ temp = src[1_0] + src[4_0] + vmls.s16 q14, q10, d1[0] @ temp -= temp2 * 5 + vext.16 q10, q12, q13, #5 @//extract a[5] (column1) + + @Q12,Q13,Q14 HAVE VERTICAL FILTERED VALUES + @CASCADED FILTERING FOR ROW 1 + vext.16 q11, q12, q13, #2 @//extract a[2] (column1) + vaddl.s16 q3, d20, d24 @// a0 + a5 (column1) + vaddl.s16 q15, d21, d25 @// a0 + a5 (column1) + vmlal.s16 q3, d22, d0[0] @// a0 + a5 + 20a2 (column1) + vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1) + vext.16 q11, q12, q13, #1 @//extract a[1] (column1) + vext.16 q10, q12, q13, #3 @//extract a[3] (column1) + vmlsl.s16 q3, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlal.s16 q3, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vext.16 q11, q12, q13, #4 @//extract a[4] (column1) + vext.16 q10, q13, q14, #5 @//extract a[5] (column2) + vmlsl.s16 q3, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vqrshrun.s32 d22, q3, #10 + vqrshrun.s32 d23, q15, #10 + vqshrun.s16 d22, q11, #0 + vst1.u8 {d22}, [r1], r10 @//Store dest row1, column 1; (1/2,1/2) + vext.16 q11, q13, q14, #2 @//extract a[2] (column2) + vaddl.s16 q3, d20, d26 @// a0 + a5 (column2) + vaddl.s16 q15, d21, d27 @// a0 + a5 (column2) + vmlal.s16 q3, d22, d0[0] @// a0 + a5 + 20a2 (column2) + vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column2) + vext.16 q10, q13, q14, #3 @//extract a[3] (column2) + vext.16 q11, q13, q14, #1 @//extract a[1] (column2) + vmlal.s16 q3, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column2) + vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column2) + vext.16 q10, q13, q14, #4 @//extract a[4] (column2) + 
vmlsl.s16 q3, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2) + vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2) + vmlsl.s16 q3, d20, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2) + vmlsl.s16 q15, d21, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2) + vqrshrun.s32 d20, q3, #10 + vqrshrun.s32 d21, q15, #10 + vqshrun.s16 d22, q10, #0 + vst1.u8 {d22}, [r1], r7 @//Store dest row1 ,column 2; (1/2,1/2) + + subs r8, r8, #2 @ 2 rows processed, decrement by 2 + subne r0, r0 , r2, lsl #2 + subne r0, r0, r2 + beq end_func @ Branch if height==4 + + b loop_16 @ looping if height = 8 or 16 + +loop_8: + vld1.u32 {d2, d3}, [r0], r2 @ Vector load from src[0_0] + vld1.u32 {d4, d5}, [r0], r2 @ Vector load from src[1_0] + vld1.u32 {d6, d7}, [r0], r2 @ Vector load from src[2_0] + vld1.u32 {d8, d9}, [r0], r2 @ Vector load from src[3_0] + vld1.u32 {d10, d11}, [r0], r2 @ Vector load from src[4_0] + vld1.u32 {d12, d13}, [r0], r2 @ Vector load from src[5_0] + + @ vERTICAL FILTERING FOR ROW 0 + vaddl.u8 q10, d6, d8 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q11, d4, d10 @ temp2 = src[1_0] + src4_0] + vaddl.u8 q12, d2, d12 @ temp = src[0_0] + src[5_0] + vaddl.u8 q13, d3, d13 @ temp = src[0_0] + src[5_0] + vaddl.u8 q14, d7, d9 @ temp1 = src[2_0] + src[3_0] + vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20 + vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5 + vaddl.u8 q15, d5, d11 @ temp2 = src[1_0] + src4_0] + vmla.u16 q13, q14, d0[0] @ temp += temp1 * 20 + vmls.s16 q13, q15, d1[0] @ temp -= temp2 * 5 + @Q12,Q13 HAVE VERTICAL FILTERED VALUES + @CASCADED FILTERING FOR ROW 0 + + vext.16 q10, q12, q13, #5 @//extract a[5] (column1) + vext.16 q11, q12, q13, #2 @//extract a[2] (column1) + vaddl.s16 q14, d20, d24 @// a0 + a5 (column1) + vaddl.s16 q15, d21, d25 @// a0 + a5 (column1) + vext.16 q9, q12, q13, #1 @//extract a[1] (column1) + vext.16 q10, q12, q13, #3 @//extract a[3] (column1) + vext.16 q1, q12, q13, #4 @//extract a[4] (column1) + vmlal.s16 q14, d22, d0[0] @// a0 + a5 + 20a2 (column1) + vmlsl.s16 q14, d18, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlal.s16 q14, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vmlsl.s16 q14, d2, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vld1.u32 {d14, d15}, [r0], r2 @ Vector load from src[6_0] + vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1) + vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vmlsl.s16 q15, d19, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlsl.s16 q15, d3, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + + vaddl.u8 q12, d4, d14 @ temp = src[0_0] + src[5_0] + vaddl.u8 q13, d5, d15 @ temp = src[0_0] + src[5_0] + vqrshrun.s32 d18, q14, #10 + vaddl.u8 q14, d9, d11 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q10, d8, d10 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q11, d6, d12 @ temp2 = src[1_0] + src4_0] + vqrshrun.s32 d19, q15, #10 + vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20 + vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5 + vaddl.u8 q15, d7, d13 @ temp2 = src[1_0] + src4_0] + vmla.u16 q13, q14, d0[0] @ temp += temp1 * 20 + vmls.s16 q13, q15, d1[0] @ temp -= temp2 * 5 + vqshrun.s16 d2, q9, #0 + @ vERTICAL FILTERING FOR ROW 1 + + @Q12,Q13 HAVE VERTICAL FILTERED VALUES + @CASCADED FILTERING FOR ROW 1 + vext.16 q10, q12, q13, #5 @//extract a[5] (column1) + vext.16 q11, q12, q13, #2 @//extract a[2] (column1) + vaddl.s16 q14, d20, d24 @// a0 + a5 (column1) + vaddl.s16 q15, d21, d25 @// a0 + a5 (column1) + vst1.u8 {d2}, [r1], r3 @//Store dest row0, column 1; (1/2,1/2) + vext.16 q9, q12, q13, 
#1 @//extract a[1] (column1) + vext.16 q10, q12, q13, #3 @//extract a[3] (column1) + vext.16 q2, q12, q13, #4 @//extract a[4] (column1) + vmlal.s16 q14, d22, d0[0] @// a0 + a5 + 20a2 (column1) + vmlsl.s16 q14, d18, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlal.s16 q14, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vmlsl.s16 q14, d4, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1) + vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vmlsl.s16 q15, d19, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlsl.s16 q15, d5, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vqrshrun.s32 d18, q14, #10 + vqrshrun.s32 d19, q15, #10 + vqshrun.s16 d3, q9, #0 + vst1.u8 {d3}, [r1], r3 @//Store dest row1, column 1; (1/2,1/2) + + subs r8, r8, #2 @ 2 rows processed, decrement by 2 + subne r0, r0 , r2, lsl #2 + subne r0, r0, r2 + beq end_func @ Branch if height==4 + + b loop_8 @looping if height == 8 or 16 + +loop_4: + vld1.u32 {d2, d3}, [r0], r2 @ Vector load from src[0_0] + vld1.u32 {d4, d5}, [r0], r2 @ Vector load from src[1_0] + vld1.u32 {d6, d7}, [r0], r2 @ Vector load from src[2_0] + vld1.u32 {d8, d9}, [r0], r2 @ Vector load from src[3_0] + vld1.u32 {d10, d11}, [r0], r2 @ Vector load from src[4_0] + vld1.u32 {d12, d13}, [r0], r2 @ Vector load from src[5_0] + + @ vERTICAL FILTERING FOR ROW 0 + vaddl.u8 q10, d6, d8 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q11, d4, d10 @ temp2 = src[1_0] + src4_0] + vaddl.u8 q12, d2, d12 @ temp = src[0_0] + src[5_0] + vaddl.u8 q13, d3, d13 @ temp = src[0_0] + src[5_0] + vaddl.u8 q14, d7, d9 @ temp1 = src[2_0] + src[3_0] + vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20 + vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5 + vaddl.u8 q15, d5, d11 @ temp2 = src[1_0] + src4_0] + vmla.u16 q13, q14, d0[0] @ temp += temp1 * 20 + vmls.s16 q13, q15, d1[0] @ temp -= temp2 * 5 + @Q12,Q13 HAVE VERTICAL FILTERED VALUES + @CASCADED FILTERING FOR ROW 0 + + vext.16 q10, q12, q13, #5 @//extract a[5] (column1) + vext.16 q11, q12, q13, #2 @//extract a[2] (column1) + vaddl.s16 q14, d20, d24 @// a0 + a5 (column1) + vaddl.s16 q15, d21, d25 @// a0 + a5 (column1) + + vext.16 q1, q12, q13, #4 @//extract a[4] (column1) + vext.16 q9, q12, q13, #1 @//extract a[1] (column1) + vext.16 q10, q12, q13, #3 @//extract a[3] (column1) + + vmlal.s16 q14, d22, d0[0] @// a0 + a5 + 20a2 (column1) + vmlsl.s16 q14, d18, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlal.s16 q14, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vmlsl.s16 q14, d2, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vld1.u32 {d14, d15}, [r0], r2 @ Vector load from src[6_0] + vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1) + vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vmlsl.s16 q15, d19, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlsl.s16 q15, d3, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vaddl.u8 q12, d4, d14 @ temp = src[0_0] + src[5_0] + vaddl.u8 q13, d5, d15 @ temp = src[0_0] + src[5_0] + vqrshrun.s32 d18, q14, #10 + vaddl.u8 q14, d9, d11 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q11, d6, d12 @ temp2 = src[1_0] + src4_0] + vaddl.u8 q10, d8, d10 @ temp1 = src[2_0] + src[3_0] + vqrshrun.s32 d19, q15, #10 + vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20 + vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5 + vaddl.u8 q15, d7, d13 @ temp2 = src[1_0] + src4_0] + vqshrun.s16 d2, q9, #0 + vmla.u16 q13, q14, d0[0] @ temp += temp1 * 20 + vmls.s16 q13, q15, d1[0] @ temp -= temp2 * 5 + + @ vERTICAL 
FILTERING FOR ROW 1 + + @Q12,Q13 HAVE VERTICAL FILTERED VALUES + @CASCADED FILTERING FOR ROW 1 + vext.16 q10, q12, q13, #5 @//extract a[5] (column1) + vext.16 q11, q12, q13, #2 @//extract a[2] (column1) + vst1.u32 {d2[0]}, [r1], r3 @//Store dest row0, column 1; (1/2,1/2) + vaddl.s16 q14, d20, d24 @// a0 + a5 (column1) + vaddl.s16 q15, d21, d25 @// a0 + a5 (column1) + vext.16 q9, q12, q13, #1 @//extract a[1] (column1) + vext.16 q10, q12, q13, #3 @//extract a[3] (column1) + vext.16 q2, q12, q13, #4 @//extract a[4] (column1) + vmlal.s16 q14, d22, d0[0] @// a0 + a5 + 20a2 (column1) + vmlsl.s16 q14, d18, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlal.s16 q14, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vmlsl.s16 q14, d4, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1) + vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vmlsl.s16 q15, d19, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlsl.s16 q15, d5, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vqrshrun.s32 d18, q14, #10 + vqrshrun.s32 d19, q15, #10 + vqshrun.s16 d4, q9, #0 + vst1.u32 {d4[0]}, [r1], r3 @//Store dest row1, column 1; (1/2,1/2) + + subs r8, r8, #2 @ 2 rows processed, decrement by 2 + subne r0, r0 , r2, lsl #2 + subne r0, r0, r2 + beq end_func @ Branch if height==4 + + b loop_4 @looping if height == 8 or 16 + +end_func: + vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + diff --git a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s new file mode 100755 index 0000000..65a6de7 --- /dev/null +++ b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s @@ -0,0 +1,1044 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s +@* +@* @brief +@* Contains function definitions for inter prediction interpolation. +@* +@* @author +@* Mohit +@* +@* @par List of Functions: +@* +@* - ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_inter_pred_filters.c +@ + +@/** +@/** +@/** +@******************************************************************************* +@* +@* @brief +@* This function implements a two stage cascaded six tap filter. 
It +@* applies the six tap filter in the horizontal direction on the +@* predictor values, followed by applying the same filter in the +@* vertical direction on the output of the first stage. It then averages +@* the output of the 1st stage and the output of the 2nd stage to obtain +@* the quarter pel values. The six tap filtering operation is described +@* in sec 8.4.2.2.1 titled "Luma sample interpolation process". +@* +@* @par Description: +@* This function is called to obtain pixels lying at the following +@* location (1/2,1/4) or (1/2,3/4). The function interpolates +@* the predictors first in the horizontal direction and then in the +@* vertical direction to output the (1/2,1/2). It then averages +@* the output of the 2nd stage and (1/2,1/2) value to obtain (1/2,1/4) +@* or (1/2,3/4) depending on the offset. +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @param[in] pu1_tmp: temporary buffer +@* +@* @param[in] dydx: x and y reference offset for qpel calculations +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/; + +@void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd,, +@ WORD32 dst_strd, +@ WORD32 ht, +@ WORD32 wd, +@ UWORD8* pu1_tmp, +@ UWORD32 dydx) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ht +@ r5 => wd +@ r7 => dydx +@ r9 => *pu1_tmp + +.text +.p2align 2 + + .global ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q + +ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q: + + stmfd sp!, {r4-r12, r14} @ store register values to stack + vstmdb sp!, {d8-d15} @push neon registers to stack + ldr r4, [sp, #104] @ loads ht + sub r0, r0, r2, lsl #1 @ pu1_src-2*src_strd + sub r0, r0, #2 @ pu1_src-2 + ldr r5, [sp, #108] @ loads wd + ldr r7, [sp, #116] @ loads dydx + lsr r7, r7, #3 @ dydx >> 2 followed by dydx & 0x3 and dydx>>1 to obtain the deciding bit + ldr r9, [sp, #112] @ pu1_tmp + add r7, r7, #2 + mov r6, #48 + mla r7, r7, r6, r9 + + subs r12, r5, #4 @if wd=4 branch to loop_4 + beq loop_4_start + + subs r12, r5, #8 @if wd=8 branch to loop_8 + beq loop_8_start + + @when wd=16 + vmov.u16 q11, #20 @ Filter coeff 0x14 into Q11 + vmov.u16 q12, #5 @ Filter coeff 0x5 into Q12 + add r8, r0, #8 + add r14, r1, #8 + add r10, r9, #8 + mov r12, r4 + add r11, r7, #8 + +loop_16_lowhalf_start: + vld1.32 {q0}, [r0], r2 @ row -2 load for horizontal filter + vext.8 d5, d0, d1, #5 + vaddl.u8 q3, d0, d5 + + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q4, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q3, q4, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q4, d1, d4 + vld1.32 {q0}, [r0], r2 @ row -1 load for horizontal filter + vmls.u16 q3, q4, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q4, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q5, d2, d3 + + vst1.32 {q3}, [r9], r6 @ store temp buffer 0 + + vext.8 d4, d0, d1, #4 + vmla.u16 q4, q5, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q5, d1, d4 + vld1.32 {q0}, [r0], r2 @ row 0 load for horizontal filter + vmls.u16 q4, q5, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q5, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 
d3, d0, d1, #3 + vaddl.u8 q6, d2, d3 + + vst1.32 {q4}, [r9], r6 @ store temp buffer 1 + + vext.8 d4, d0, d1, #4 + vmla.u16 q5, q6, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q6, d1, d4 + vld1.32 {q0}, [r0], r2 @ row 1 load for horizontal filter + vmls.u16 q5, q6, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q6, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q7, d2, d3 + + vst1.32 {q5}, [r9], r6 @ store temp buffer 2 + + vext.8 d4, d0, d1, #4 + vmla.u16 q6, q7, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q7, d1, d4 + vld1.32 {q0}, [r0], r2 @ row 2 load for horizontal filter + vmls.u16 q6, q7, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q7, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q8, d2, d3 + + vst1.32 {q6}, [r9], r6 @ store temp buffer 3 + + vext.8 d4, d0, d1, #4 + vmla.u16 q7, q8, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q8, d1, d4 + + vmls.u16 q7, q8, q12 +loop_16_lowhalf: + + vld1.32 {q0}, [r0], r2 @ row 3 load for horizontal filter + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q8, d0, d5 + + vst1.32 {q7}, [r9], r6 @ store temp buffer 4 + vaddl.u8 q9, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q8, q9, q11 + vext.8 d1, d0, d1, #1 + vadd.s16 q14, q4, q7 + vaddl.u8 q9, d1, d4 + vadd.s16 q15, q5, q6 + vmls.u16 q8, q9, q12 + vld1.32 {q0}, [r0], r2 @ row 4 load for hoorizontal filter + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q10, d0, d5 + + vst1.32 {q8}, [r9], r6 @ store temp buffer r5 + + vaddl.s16 q9, d6, d16 + + vld1.32 {q13}, [r7], r6 @ load from temp buffer 0 + + vaddl.s16 q3, d7, d17 + + vqrshrun.s16 d26, q13, #5 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d28, d24 + vmlal.s16 q3, d31, d22 + vmlsl.s16 q3, d29, d24 + vaddl.u8 q1, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q10, q1, q11 + vqrshrun.s32 d18, q9, #10 + vext.8 d1, d0, d1, #1 + vqrshrun.s32 d19, q3, #10 + vadd.s16 q14, q5, q8 + vaddl.u8 q1, d1, d4 + vadd.s16 q15, q6, q7 + vmls.u16 q10, q1, q12 + vqmovn.u16 d18, q9 + vld1.32 {q0}, [r0], r2 @ row 5 load for horizontal filter + + vrhadd.u8 d26, d18, d26 + + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + + vst1.32 {q10}, [r9], r6 @ store temp buffer r6 + + vaddl.s16 q9, d8, d20 + + vaddl.s16 q3, d9, d21 + + vld1.32 {q4}, [r7], r6 @load from temp buffer 1 + + + vst1.32 d26, [r1], r3 @ store row 0 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d28, d24 + + vqrshrun.s16 d28, q4, #5 + + vmlal.s16 q3, d31, d22 + vmlsl.s16 q3, d29, d24 + vext.8 d3, d0, d1, #3 + vaddl.u8 q4, d0, d5 + vaddl.u8 q1, d2, d3 + vqrshrun.s32 d18, q9, #10 + vext.8 d4, d0, d1, #4 + vqrshrun.s32 d19, q3, #10 + vmla.u16 q4, q1, q11 + vext.8 d1, d0, d1, #1 + vadd.s16 q13, q6, q10 + vaddl.u8 q1, d1, d4 + vqmovn.u16 d18, q9 + vadd.s16 q15, q7, q8 + vmls.u16 q4, q1, q12 + vld1.32 {q0}, [r0], r2 @ row 6 load for horizontal filter + + vrhadd.u8 d28, d28, d18 + + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + + vst1.32 d28, [r1], r3 @ store row 1 + + vaddl.u8 q14, d0, d5 + + vst1.32 {q4}, [r9], r6 @ store temp buffer r7 + + vaddl.s16 q9, d10, d8 + vaddl.s16 q3, d11, d9 + + vld1.32 {q5}, [r7], r6 @ load from temp buffer 2 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d26, d24 + vmlal.s16 q3, d31, d22 + + vqrshrun.s16 d26, q5, #5 + + vmlsl.s16 q3, d27, d24 + vaddl.u8 q1, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q14, q1, q11 + vqrshrun.s32 d18, q9, #10 + vext.8 d1, d0, d1, #1 + vqrshrun.s32 d19, q3, #10 + vadd.s16 q5, q7, q4 + vaddl.u8 q1, d1, d4 + vadd.s16 q15, q8, q10 + vmls.u16 q14, q1, q12 + vqmovn.u16 d27, 
q9 + + vaddl.s16 q9, d12, d28 + vaddl.s16 q3, d13, d29 + + vrhadd.u8 d26, d26, d27 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d10, d24 + vmlal.s16 q3, d31, d22 + vmlsl.s16 q3, d11, d24 + + vst1.32 d26, [r1], r3 @ store row 2 + + vst1.32 {q14}, [r9] + + + vqrshrun.s32 d18, q9, #10 + vmov q5, q10 + vld1.32 {q15}, [r7], r6 @ load from temp buffer 3 + + vqrshrun.s32 d19, q3, #10 + subs r4, r4, #4 + + vqrshrun.s16 d30, q15, #5 + + vqmovn.u16 d18, q9 + vmov q6, q4 + vmov q3, q7 + vrhadd.u8 d30, d18, d30 + vmov q4, q8 + vmov q7, q14 + vst1.32 d30, [r1], r3 @ store row 3 + + bgt loop_16_lowhalf @ looping if height =16 + + +loop_16_highhalf_start: + vld1.32 {q0}, [r8], r2 + vext.8 d5, d0, d1, #5 + vaddl.u8 q3, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q4, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q3, q4, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q4, d1, d4 + vld1.32 {q0}, [r8], r2 + vmls.u16 q3, q4, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q4, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q5, d2, d3 + + vst1.32 {q3}, [r10], r6 + + vext.8 d4, d0, d1, #4 + vmla.u16 q4, q5, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q5, d1, d4 + vld1.32 {q0}, [r8], r2 + vmls.u16 q4, q5, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q5, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q6, d2, d3 + + vst1.32 {q4}, [r10], r6 + + vext.8 d4, d0, d1, #4 + vmla.u16 q5, q6, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q6, d1, d4 + vld1.32 {q0}, [r8], r2 + vmls.u16 q5, q6, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q6, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q7, d2, d3 + + vst1.32 {q5}, [r10], r6 + + vext.8 d4, d0, d1, #4 + vmla.u16 q6, q7, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q7, d1, d4 + vld1.32 {q0}, [r8], r2 + vmls.u16 q6, q7, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q7, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q8, d2, d3 + + vst1.32 {q6}, [r10], r6 + + vext.8 d4, d0, d1, #4 + vmla.u16 q7, q8, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q8, d1, d4 + + vmls.u16 q7, q8, q12 + +loop_16_highhalf: + + vld1.32 {q0}, [r8], r2 + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q8, d0, d5 + + vst1.32 {q7}, [r10], r6 + + vaddl.u8 q9, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q8, q9, q11 + vext.8 d1, d0, d1, #1 + vadd.s16 q14, q4, q7 + vaddl.u8 q9, d1, d4 + vadd.s16 q15, q5, q6 + vmls.u16 q8, q9, q12 + vld1.32 {q0}, [r8], r2 + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q10, d0, d5 + + vst1.32 {q8}, [r10], r6 + + vaddl.s16 q9, d6, d16 + + vld1.32 {q13}, [r11], r6 + + vaddl.s16 q3, d7, d17 + + vqrshrun.s16 d26, q13, #5 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d28, d24 + vmlal.s16 q3, d31, d22 + vmlsl.s16 q3, d29, d24 + vaddl.u8 q1, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q10, q1, q11 + vqrshrun.s32 d18, q9, #10 + vext.8 d1, d0, d1, #1 + vqrshrun.s32 d19, q3, #10 + vadd.s16 q14, q5, q8 + vaddl.u8 q1, d1, d4 + vadd.s16 q15, q6, q7 + vmls.u16 q10, q1, q12 + vqmovn.u16 d18, q9 + vld1.32 {q0}, [r8], r2 + + vrhadd.u8 d26, d18, d26 + + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + + vst1.32 {q10}, [r10], r6 + + vaddl.s16 q9, d8, d20 + vaddl.s16 q3, d9, d21 + + vld1.32 {q4}, [r11], r6 + + + vst1.32 d26, [r14], r3 @store row 0 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d28, d24 + + vqrshrun.s16 d28, q4, #5 + + vmlal.s16 q3, d31, d22 + vmlsl.s16 q3, d29, d24 + vext.8 d3, d0, d1, #3 + vaddl.u8 q4, d0, d5 + vaddl.u8 q1, d2, d3 + vqrshrun.s32 d18, q9, #10 + vext.8 d4, d0, d1, #4 + vqrshrun.s32 
d19, q3, #10 + vmla.u16 q4, q1, q11 + vext.8 d1, d0, d1, #1 + vadd.s16 q13, q6, q10 + vaddl.u8 q1, d1, d4 + vqmovn.u16 d18, q9 + vadd.s16 q15, q7, q8 + vmls.u16 q4, q1, q12 + vld1.32 {q0}, [r8], r2 + + vrhadd.u8 d28, d28, d18 + + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + + vst1.32 d28, [r14], r3 @store row 1 + + vaddl.u8 q14, d0, d5 + + vst1.32 {q4}, [r10], r6 + + vaddl.s16 q9, d10, d8 + vaddl.s16 q3, d11, d9 + + vld1.32 {q5}, [r11], r6 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d26, d24 + vmlal.s16 q3, d31, d22 + + vqrshrun.s16 d26, q5, #5 + + vmlsl.s16 q3, d27, d24 + vaddl.u8 q1, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q14, q1, q11 + vqrshrun.s32 d18, q9, #10 + vext.8 d1, d0, d1, #1 + vqrshrun.s32 d19, q3, #10 + vadd.s16 q5, q7, q4 + vaddl.u8 q1, d1, d4 + vadd.s16 q15, q8, q10 + vmls.u16 q14, q1, q12 + vqmovn.u16 d27, q9 + + + vaddl.s16 q9, d12, d28 + vaddl.s16 q3, d13, d29 + + vrhadd.u8 d26, d26, d27 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d10, d24 + vmlal.s16 q3, d31, d22 + vmlsl.s16 q3, d11, d24 + + vst1.32 d26, [r14], r3 @ store row 2 + + vst1.32 {q14}, [r10] + + vqrshrun.s32 d18, q9, #10 + vmov q5, q10 + vld1.32 {q15}, [r11], r6 + + vqrshrun.s32 d19, q3, #10 + subs r12, r12, #4 + + vqrshrun.s16 d30, q15, #5 + + vqmovn.u16 d18, q9 + vmov q6, q4 + vmov q3, q7 + vrhadd.u8 d30, d18, d30 + vmov q4, q8 + vmov q7, q14 + vst1.32 d30, [r14], r3 @ store row 3 + + bgt loop_16_highhalf @ looping if height = 8 or 16 + b end_func + +loop_8_start: + + vmov.u16 q11, #20 @ Filter coeff 20 into Q11 + vmov.u16 q12, #5 @ Filter coeff 5 into Q12 + vld1.32 {q0}, [r0], r2 @ row -2 load for horizontal filter + vext.8 d5, d0, d1, #5 + vaddl.u8 q3, d0, d5 + + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q4, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q3, q4, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q4, d1, d4 + vld1.32 {q0}, [r0], r2 @ row -1 load for horizontal filter + vmls.u16 q3, q4, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q4, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q5, d2, d3 + + vst1.32 {q3}, [r9], r6 @ store temp buffer 0 + + vext.8 d4, d0, d1, #4 + vmla.u16 q4, q5, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q5, d1, d4 + vld1.32 {q0}, [r0], r2 @ row 0 load for horizontal filter + vmls.u16 q4, q5, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q5, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q6, d2, d3 + + vst1.32 {q4}, [r9], r6 @ store temp buffer 1 + + vext.8 d4, d0, d1, #4 + vmla.u16 q5, q6, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q6, d1, d4 + vld1.32 {q0}, [r0], r2 @ row 1 load for horizontal filter + vmls.u16 q5, q6, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q6, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q7, d2, d3 + + vst1.32 {q5}, [r9], r6 @ store temp buffer 2 + + vext.8 d4, d0, d1, #4 + vmla.u16 q6, q7, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q7, d1, d4 + vld1.32 {q0}, [r0], r2 @ row 2 load for horizontal filter + vmls.u16 q6, q7, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q7, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q8, d2, d3 + + vst1.32 {q6}, [r9], r6 @ store temp buffer 3 + + vext.8 d4, d0, d1, #4 + vmla.u16 q7, q8, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q8, d1, d4 + + vmls.u16 q7, q8, q12 +loop_8: + + vld1.32 {q0}, [r0], r2 @ row 3 load for horizontal filter + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q8, d0, d5 + + vst1.32 {q7}, [r9], r6 @ store temp buffer 4 + + vaddl.u8 q9, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q8, q9, q11 + 
vext.8 d1, d0, d1, #1 + vadd.s16 q14, q4, q7 + vaddl.u8 q9, d1, d4 + vadd.s16 q15, q5, q6 + vmls.u16 q8, q9, q12 + vld1.32 {q0}, [r0], r2 @ row 4 load for hoorizontal filter + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q10, d0, d5 + + vst1.32 {q8}, [r9], r6 @ store temp buffer r5 + + vaddl.s16 q9, d6, d16 + + vld1.32 {q13}, [r7], r6 @ load from temp buffer 0 + + vaddl.s16 q3, d7, d17 + + vqrshrun.s16 d26, q13, #5 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d28, d24 + vmlal.s16 q3, d31, d22 + vmlsl.s16 q3, d29, d24 + vaddl.u8 q1, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q10, q1, q11 + vqrshrun.s32 d18, q9, #10 + vext.8 d1, d0, d1, #1 + vqrshrun.s32 d19, q3, #10 + vadd.s16 q14, q5, q8 + vaddl.u8 q1, d1, d4 + vadd.s16 q15, q6, q7 + vmls.u16 q10, q1, q12 + vqmovn.u16 d18, q9 + vld1.32 {q0}, [r0], r2 @ row 5 load for horizontal filter + + vrhadd.u8 d26, d18, d26 + + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + + vst1.32 {q10}, [r9], r6 @ store temp buffer r6 + + vaddl.s16 q9, d8, d20 + + vaddl.s16 q3, d9, d21 + + vld1.32 {q4}, [r7], r6 @load from temp buffer 1 + + + vst1.32 d26, [r1], r3 @ store row 0 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d28, d24 + + vqrshrun.s16 d28, q4, #5 + + vmlal.s16 q3, d31, d22 + vmlsl.s16 q3, d29, d24 + vext.8 d3, d0, d1, #3 + vaddl.u8 q4, d0, d5 + vaddl.u8 q1, d2, d3 + vqrshrun.s32 d18, q9, #10 + vext.8 d4, d0, d1, #4 + vqrshrun.s32 d19, q3, #10 + vmla.u16 q4, q1, q11 + vext.8 d1, d0, d1, #1 + vadd.s16 q13, q6, q10 + vaddl.u8 q1, d1, d4 + vqmovn.u16 d18, q9 + vadd.s16 q15, q7, q8 + vmls.u16 q4, q1, q12 + vld1.32 {q0}, [r0], r2 @ row 6 load for horizontal filter + + vrhadd.u8 d28, d28, d18 + + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + + vst1.32 d28, [r1], r3 @ store row 1 + + vaddl.u8 q14, d0, d5 + + vst1.32 {q4}, [r9], r6 @ store temp buffer r7 + + vaddl.s16 q9, d10, d8 + vaddl.s16 q3, d11, d9 + + vld1.32 {q5}, [r7], r6 @ load from temp buffer 2 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d26, d24 + vmlal.s16 q3, d31, d22 + + vqrshrun.s16 d26, q5, #5 + + vmlsl.s16 q3, d27, d24 + vaddl.u8 q1, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q14, q1, q11 + vqrshrun.s32 d18, q9, #10 + vext.8 d1, d0, d1, #1 + vqrshrun.s32 d19, q3, #10 + vadd.s16 q5, q7, q4 + vaddl.u8 q1, d1, d4 + vadd.s16 q15, q8, q10 + vmls.u16 q14, q1, q12 + vqmovn.u16 d27, q9 + + vaddl.s16 q9, d12, d28 + vaddl.s16 q3, d13, d29 + + vrhadd.u8 d26, d26, d27 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d10, d24 + vmlal.s16 q3, d31, d22 + vmlsl.s16 q3, d11, d24 + + vst1.32 d26, [r1], r3 @ store row 2 + + vst1.32 {q14}, [r9] + + + vqrshrun.s32 d18, q9, #10 + vmov q5, q10 + vld1.32 {q15}, [r7], r6 @ load from temp buffer 3 + + vqrshrun.s32 d19, q3, #10 + subs r4, r4, #4 + + vqrshrun.s16 d30, q15, #5 + + vqmovn.u16 d18, q9 + vmov q6, q4 + vmov q3, q7 + vrhadd.u8 d30, d18, d30 + vmov q4, q8 + vmov q7, q14 + vst1.32 d30, [r1], r3 @ store row 3 + + bgt loop_8 @if height =8 or 16 loop + b end_func + +loop_4_start: + vmov.u16 d22, #20 @ Filter coeff 20 into D22 + vmov.u16 d23, #5 @ Filter coeff 5 into D23 + + vld1.32 {q0}, [r0], r2 @row -2 load + vext.8 d5, d0, d1, #5 + vaddl.u8 q3, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q4, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 d6, d8, d22 + vext.8 d1, d0, d1, #1 + vaddl.u8 q4, d1, d4 + vld1.32 {q0}, [r0], r2 @ row -1 load + vmls.u16 d6, d8, d23 + vext.8 d5, d0, d1, #5 + vaddl.u8 q4, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q5, d2, d3 + + vst1.32 d6, [r9], r6 @ 
store temp buffer 0 + + vext.8 d4, d0, d1, #4 + vmla.u16 d8, d10, d22 + vext.8 d1, d0, d1, #1 + vaddl.u8 q5, d1, d4 + vld1.32 {q0}, [r0], r2 @ row 0 load + vmls.u16 d8, d10, d23 + vext.8 d5, d0, d1, #5 + vaddl.u8 q5, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q6, d2, d3 + + vst1.32 d8, [r9], r6 @ store temp buffer 1 + + vext.8 d4, d0, d1, #4 + vmla.u16 d10, d12, d22 + vext.8 d1, d0, d1, #1 + vaddl.u8 q6, d1, d4 + vld1.32 {q0}, [r0], r2 @ row 1 load + vmls.u16 d10, d12, d23 + vext.8 d5, d0, d1, #5 + vaddl.u8 q6, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q7, d2, d3 + + vst1.32 d10, [r9], r6 @ store temp buffer 2 + + vext.8 d4, d0, d1, #4 + vmla.u16 d12, d14, d22 + vext.8 d1, d0, d1, #1 + vaddl.u8 q7, d1, d4 + vld1.32 {q0}, [r0], r2 @ row 2 load + vmls.u16 d12, d14, d23 + vext.8 d5, d0, d1, #5 + vaddl.u8 q7, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q8, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 d14, d16, d22 + vext.8 d1, d0, d1, #1 + vaddl.u8 q8, d1, d4 + + vst1.32 d12, [r9], r6 @ store temp buffer 3 + + vmls.u16 d14, d16, d23 + +loop_4: + + vld1.32 {q0}, [r0], r2 @ row 3 load + vext.8 d5, d0, d1, #5 + vaddl.u8 q8, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q9, d2, d3 + vst1.32 d14, [r9], r6 @ store temp buffer 4 + vext.8 d4, d0, d1, #4 + vmla.u16 d16, d18, d22 + vext.8 d1, d0, d1, #1 + vaddl.u8 q9, d1, d4 + vadd.s16 d2, d10, d12 + vmls.u16 d16, d18, d23 + vadd.s16 d3, d8, d14 + vld1.32 {q9}, [r0], r2 @ row 4 load + vext.8 d25, d18, d19, #5 + vaddl.u8 q13, d18, d25 + vext.8 d20, d18, d19, #2 + + vst1.32 d16, [r9], r6 @ store temp buffer 5 + + vaddl.s16 q0, d6, d16 + vmlal.s16 q0, d2, d22 + vext.8 d21, d18, d19, #3 + vaddl.u8 q14, d20, d21 + vext.8 d24, d18, d19, #4 + vmlsl.s16 q0, d3, d23 + vmla.u16 d26, d28, d22 + vext.8 d19, d18, d19, #1 + vaddl.u8 q14, d19, d24 + vadd.s16 d2, d12, d14 + vmls.u16 d26, d28, d23 + vqrshrun.s32 d0, q0, #0xa + vadd.s16 d3, d10, d16 + vld1.32 {q9}, [r0], r2 @ row 5 load + vext.8 d25, d18, d19, #5 + vqmovn.u16 d11, q0 + vaddl.u8 q14, d18, d25 + + vst1.32 d26, [r9], r6 @ store temp buffer 6 + + @Q3 available here + vld1.32 d6, [r7], r6 @ load from temp buffer 0 + vld1.32 d7, [r7], r6 @ load from temp buffer 1 + vqrshrun.s16 d9, q3, #5 + + vext.8 d20, d18, d19, #2 + + vaddl.s16 q0, d8, d26 + vmlal.s16 q0, d2, d22 + vext.8 d21, d18, d19, #3 + vaddl.u8 q3, d20, d21 + vext.8 d24, d18, d19, #4 + vmlsl.s16 q0, d3, d23 + vmla.u16 d28, d6, d22 + vext.8 d19, d18, d19, #1 + vaddl.u8 q3, d19, d24 + vadd.s16 d2, d14, d16 + vmls.u16 d28, d6, d23 + vqrshrun.s32 d0, q0, #0xa + vadd.s16 d3, d12, d26 + vld1.32 {q9}, [r0], r2 @ row 6 load + vext.8 d25, d18, d19, #5 + vqmovn.u16 d13, q0 + + vtrn.32 d11, d13 + vaddl.s16 q0, d10, d28 + vrhadd.u8 d9, d9, d11 + + vst1.32 d28, [r9], r6 @ store temp buffer 7 + + vmlal.s16 q0, d2, d22 + vaddl.u8 q15, d18, d25 + + vst1.32 d9[0], [r1], r3 @ store row 0 + + vext.8 d20, d18, d19, #2 + + vst1.32 d9[1], [r1], r3 @ store row 1 + + vext.8 d21, d18, d19, #3 + vmlsl.s16 q0, d3, d23 + vaddl.u8 q4, d20, d21 + vext.8 d24, d18, d19, #4 + vmla.u16 d30, d8, d22 + vext.8 d19, d18, d19, #1 + vaddl.u8 q4, d19, d24 + vqrshrun.s32 d0, q0, #0xa + vadd.s16 d2, d16, d26 + vmls.u16 d30, d8, d23 + vqmovn.u16 d4, q0 + + vadd.s16 d3, d14, d28 + + + vaddl.s16 q0, d12, d30 + + vst1.32 d30, [r9] + + vmlal.s16 q0, d2, d22 + + vld1.32 d8, [r7], r6 @ load from temp buffer 2 + vld1.32 d9, [r7], r6 @ load from temp buffer 3 + vmlsl.s16 q0, d3, d23 + subs r4, r4, #4 + vqrshrun.s16 
d10, q4, #5
+
+    vmov          d12, d28
+
+    vqrshrun.s32  d0, q0, #0xa
+    vmov          d6, d14
+    vmov          d8, d16
+
+    vqmovn.u16    d5, q0
+
+    vtrn.32       d4, d5
+    vrhadd.u8     d4, d4, d10
+    vmov          d10, d26
+    vmov          d14, d30
+
+    vst1.32       d4[0], [r1], r3      @ store row 2
+    vst1.32       d4[1], [r1], r3      @ store row 3
+
+    bgt           loop_4
+
+end_func:
+    vldmia        sp!, {d8-d15}        @ Restore neon registers that were saved
+    ldmfd         sp!, {r4-r12, pc}    @ Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s
new file mode 100755
index 0000000..c39ae01
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s
@@ -0,0 +1,266 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@*  ih264_inter_pred_luma_horz_qpel_a9q.s
+@*
+@* @brief
+@*  Contains function definitions for inter prediction horizontal quarter pel interpolation.
+@*
+@* @author
+@*  Mohit
+@*
+@* @par List of Functions:
+@*
+@*  - ih264_inter_pred_luma_horz_qpel_a9q()
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*  Quarter pel inter prediction luma filter for horizontal input
+@*
+@* @par Description:
+@*  Applies a 6 tap horizontal filter. The output is clipped to 8 bits as per
+@*  sec 8.4.2.2.1 titled "Luma sample interpolation process"
+@*
+@* @param[in] pu1_src
+@*  UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @param[in] pu1_tmp: temporary buffer: UNUSED in this function
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations.
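+@*
+@* A minimal C sketch of the per-pixel math (an illustration modelled on the
+@* reference ih264_inter_pred_filters.c, not the literal code; i2_tmp and x
+@* are illustrative names, CLIP_U8 clips to [0,255]):
+@*
+@*   i2_tmp = pu1_src[x - 2] + pu1_src[x + 3]
+@*            + 20 * (pu1_src[x] + pu1_src[x + 1])
+@*            - 5  * (pu1_src[x - 1] + pu1_src[x + 2]);
+@*   pu1_dst[x] = (CLIP_U8((i2_tmp + 16) >> 5)
+@*                 + pu1_src[x + ((dydx & 3) >> 1)] + 1) >> 1;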
+@* @returns +@* +@ @remarks +@* None +@* +@******************************************************************************* +@*/ + +@void ih264_inter_pred_luma_horz ( +@ UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ht, +@ WORD32 wd, +@ UWORD8* pu1_tmp, +@ UWORD32 dydx) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r5 => ht +@ r6 => wd +@ r7 => dydx + +.text +.p2align 2 + + + .global ih264_inter_pred_luma_horz_qpel_a9q + +ih264_inter_pred_luma_horz_qpel_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + vstmdb sp!, {d8-d15} @push neon registers to stack + ldr r5, [sp, #104] @Loads ht + ldr r6, [sp, #108] @Loads wd + ldr r7, [sp, #116] @Loads dydx + and r7, r7, #3 @Finds x-offset + add r7, r0, r7, lsr #1 @pu1_src + (x_offset>>1) + sub r0, r0, #2 @pu1_src-2 + vmov.i8 d0, #5 @filter coeff + subs r12, r6, #8 @if wd=8 branch to loop_8 + vmov.i8 d1, #20 @filter coeff + + beq loop_8 + + subs r12, r6, #4 @if wd=4 branch to loop_4 + beq loop_4 + +loop_16: @when wd=16 + @// Processing row0 and row1 + vld1.8 {d2, d3, d4}, [r0], r2 @// Load row0 + vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0) + vld1.8 {d5, d6, d7}, [r0], r2 @// Load row1 + vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row0) + vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0) + vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1) + vaddl.u8 q5, d30, d3 @// a0 + a5 (column2,row0) + vext.8 d27, d6, d7, #5 @//extract a[5] (column2,row1) + vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1) + vext.8 d31, d2, d3, #2 @//extract a[2] (column1,row0) + vaddl.u8 q8, d27, d6 @// a0 + a5 (column2,row1) + vext.8 d30, d3, d4, #2 @//extract a[2] (column2,row0) + vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 (column1,row0) + vext.8 d28, d5, d6, #2 @//extract a[2] (column1,row1) + vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 (column2,row0) + vext.8 d27, d6, d7, #2 @//extract a[2] (column2,row1) + vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 (column1,row1) + vext.8 d31, d2, d3, #3 @//extract a[3] (column1,row0) + vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 (column2,row1) + vext.8 d30, d3, d4, #3 @//extract a[3] (column2,row0) + vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vext.8 d28, d5, d6, #3 @//extract a[3] (column1,row1) + vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0) + vext.8 d27, d6, d7, #3 @//extract a[3] (column2,row1) + vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1) + vext.8 d31, d2, d3, #1 @//extract a[1] (column1,row0) + vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row1) + vext.8 d30, d3, d4, #1 @//extract a[1] (column2,row0) + vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vext.8 d28, d5, d6, #1 @//extract a[1] (column1,row1) + vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + vext.8 d27, d6, d7, #1 @//extract a[1] (column2,row1) + vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) + vext.8 d31, d2, d3, #4 @//extract a[4] (column1,row0) + vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1) + vext.8 d30, d3, d4, #4 @//extract a[4] (column2,row0) + vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + vext.8 d28, d5, d6, #4 @//extract a[4] (column1,row1) + vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + vext.8 d27, d6, d7, #4 @//extract a[4] (column2,row1) + vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 
5a1 - 5a4 (column1,row1) + vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1) + vld1.32 {d12, d13}, [r7], r2 @Load value for interpolation (column1,row0) + vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vqrshrun.s16 d21, q5, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row2) + vrhadd.u8 q10, q6, q10 @Interpolation step for qpel calculation + vqrshrun.s16 d18, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) + vst1.8 {d20, d21}, [r1], r3 @//Store dest row0 + vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row2) + vqrshrun.s16 d19, q8, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1) + vld1.32 {d12, d13}, [r7], r2 @Load value for interpolation (column1,row1) + vrhadd.u8 q9, q6, q9 @Interpolation step for qpel calculation + vst1.8 {d18, d19}, [r1], r3 @//Store dest row1 + subs r5, r5, #2 @ 2 rows done, decrement by 2 + + beq end_func + b loop_16 + +loop_8: +@// Processing row0 and row1 + + vld1.8 {d5, d6}, [r0], r2 @// Load row1 + vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1) + vld1.8 {d2, d3}, [r0], r2 @// Load row0 + vext.8 d25, d5, d6, #2 @//extract a[2] (column1,row1) + vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0) + vext.8 d24, d5, d6, #3 @//extract a[3] (column1,row1) + vext.8 d23, d5, d6, #1 @//extract a[1] (column1,row1) + vext.8 d22, d5, d6, #4 @//extract a[4] (column1,row1) + vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1) + vext.8 d29, d2, d3, #3 @//extract a[3] (column1,row0) + vmlal.u8 q7, d25, d1 @// a0 + a5 + 20a2 (column1,row1) + vmlal.u8 q7, d24, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1) + vmlsl.u8 q7, d23, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) + vmlsl.u8 q7, d22, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) + vext.8 d30, d2, d3, #2 @//extract a[2] (column1,row0) + vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0) + vext.8 d27, d2, d3, #1 @//extract a[1] (column1,row0) + vext.8 d26, d2, d3, #4 @//extract a[4] (column1,row0) + vmlal.u8 q4, d29, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlal.u8 q4, d30, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlsl.u8 q4, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q4, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + vqrshrun.s16 d18, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vld1.32 d12, [r7], r2 @Load value for interpolation (column1,row0) + vld1.32 d13, [r7], r2 @Load value for interpolation (column1,row1) + vqrshrun.s16 d19, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) + vrhadd.u8 q9, q6, q9 @Interpolation step for qpel calculation + vst1.8 {d18}, [r1], r3 @//Store dest row0 + vst1.8 {d19}, [r1], r3 @//Store dest row1 + subs r5, r5, #2 @ 2 rows done, decrement by 2 + + beq end_func @ Branch if height==4 + b loop_8 @looping if height == 8 or 16 + +loop_4: + vld1.8 {d5, d6}, [r0], r2 @// Load row1 + vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1) + vld1.8 {d2, d3}, [r0], r2 @// Load row0 + vext.8 d25, d5, d6, #2 @//extract a[2] (column1,row1) + vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0) + vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1) + vext.8 d24, d5, d6, #3 @//extract a[3] (column1,row1) + vext.8 d23, d5, d6, #1 @//extract a[1] (column1,row1) + vext.8 d22, d5, d6, #4 @//extract a[4] (column1,row1) + vext.8 d29, d2, d3, #3 @//extract a[3] (column1,row0) + vmlal.u8 q7, d25, d1 @// a0 + a5 + 20a2 (column1,row1) 
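+@// q7 now carries the partial 6-tap sum a0 + a5 + 20*a2 for row1; the
+@// vmlal/vmlsl that follow fold in +20*a3, -5*a1 and -5*a4 to complete
+@// it before the rounding shift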
+ vmlal.u8 q7, d24, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1) + vmlsl.u8 q7, d23, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) + vmlsl.u8 q7, d22, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) + vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0) + vext.8 d30, d2, d3, #2 @//extract a[2] (column1,row0) + vld1.32 d12, [r7], r2 @Load value for interpolation (column1,row0) + vld1.32 d13, [r7], r2 @Load value for interpolation (column1,row1) + vext.8 d27, d2, d3, #1 @//extract a[1] (column1,row0) + vext.8 d26, d2, d3, #4 @//extract a[4] (column1,row0) + vmlal.u8 q4, d29, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlal.u8 q4, d30, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlsl.u8 q4, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q4, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + vqrshrun.s16 d18, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vqrshrun.s16 d19, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) + vrhadd.u8 q9, q6, q9 @Interpolation step for qpel calculation + vst1.32 d18[0], [r1], r3 @//Store dest row0 + vst1.32 d19[0], [r1], r3 @//Store dest row1 + + subs r5, r5, #2 @ 2 rows done, decrement by 2 + beq end_func + + b loop_4 + +end_func: + vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s new file mode 100755 index 0000000..565cc80 --- /dev/null +++ b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s @@ -0,0 +1,505 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s +@* +@* @brief +@* Contains function definitions for inter prediction interpolation. +@* +@* @author +@* Mohit +@* +@* @par List of Functions: +@* +@* - ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_inter_pred_filters.c +@ + +@/** +@/** +@/** +@******************************************************************************* +@* +@* @brief +@* This function implements a two stage cascaded six tap filter. It +@* applies the six tap filter in the vertical direction on the +@* predictor values, followed by applying the same filter in the +@* horizontal direction on the output of the first stage. 
It then averages
+@* the output of the 1st stage and the final stage to obtain the quarter
+@* pel values. The six tap filtering operation is described in sec 8.4.2.2.1
+@* titled "Luma sample interpolation process".
+@*
+@* @par Description:
+@*  This function is called to obtain pixels lying at the following
+@*  location (1/4,1/2) or (3/4,1/2). The function interpolates
+@*  the predictors first in the vertical direction and then in the
+@*  horizontal direction to output the (1/2,1/2). It then averages
+@*  the output of the 1st stage and the (1/2,1/2) value to obtain
+@*  (1/4,1/2) or (3/4,1/2) depending on the offset.
+@*
+@* @param[in] pu1_src
+@*  UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @param[in] pu1_tmp: temporary buffer
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations
+@*
+@* @returns
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_luma_horz_qpel_vert_hpel(UWORD8 *pu1_src,
+@                                               UWORD8 *pu1_dst,
+@                                               WORD32 src_strd,
+@                                               WORD32 dst_strd,
+@                                               WORD32 ht,
+@                                               WORD32 wd,
+@                                               UWORD8* pu1_tmp,
+@                                               UWORD32 dydx)
+
+@**************Variables Vs Registers*****************************************
+@   r0 => *pu1_src
+@   r1 => *pu1_dst
+@   r2 => src_strd
+@   r3 => dst_strd
+@   r4 => ht
+@   r5 => wd
+@   r6 => dydx
+@   r9 => *pu1_tmp
+
+.text
+.p2align 2
+
+    .global ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q
+
+ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q:
+
+    stmfd         sp!, {r4-r12, r14}   @ store register values to stack
+    vstmdb        sp!, {d8-d15}        @ push neon registers to stack
+    ldr           r4, [sp, #104]       @ loads ht
+    sub           r0, r0, r2, lsl #1   @ pu1_src-2*src_strd
+    sub           r0, r0, #2           @ pu1_src-2
+    ldr           r5, [sp, #108]       @ loads wd
+    ldr           r6, [sp, #116]       @ loads dydx
+    and           r6, r6, #2           @ x_offset>>1, pre-scaled by 2 for 16-bit elements (dydx & 2)
+    ldr           r9, [sp, #112]       @ pu1_tmp
+    add           r7, r9, #4
+    add           r6, r7, r6           @ pi16_pred1_temp += (x_offset>>1)
+
+    vmov.u16      q13, #0x14           @ Filter coeff 20 into Q13
+    vmov.u16      q12, #0x5            @ Filter coeff 5 into Q12
+    mov           r7, #0x20
+    mov           r8, #0x30
+    subs          r12, r5, #4          @ if wd=4 branch to loop_4
+    beq           loop_4
+
+    subs          r12, r5, #8          @ if wd=8 branch to loop_8
+    beq           loop_8
+
+    @when wd=16
+    vmov.u16      q14, #0x14           @ Filter coeff 20 into Q14
+    vmov.u16      q15, #0x5            @ Filter coeff 5 into Q15
+    add           r14, r2, #0
+    sub           r2, r2, #16
+
+
+loop_16:
+
+    vld1.u32      {q0}, [r0]!          @ Vector load from src[0_0]
+    vld1.u32      d12, [r0], r2        @ Vector load from src[0_0]
+    vld1.u32      {q1}, [r0]!          @ Vector load from src[1_0]
+    vld1.u32      d13, [r0], r2        @ Vector load from src[1_0]
+    vld1.u32      {q2}, [r0]!          @ Vector load from src[2_0]
+    vld1.u32      d14, [r0], r2        @ Vector load from src[2_0]
+    vld1.u32      {q3}, [r0]!          @ Vector load from src[3_0]
+    vld1.u32      d15, [r0], r2        @ Vector load from src[3_0]
+    vld1.u32      {q4}, [r0]!          @ Vector load from src[4_0]
+    vld1.u32      d16, [r0], r2        @ Vector load from src[4_0]
+
+    vld1.u32      {q5}, [r0]!
@ Vector load from src[5_0] + vld1.u32 d17, [r0], r2 @ Vector load from src[5_0] + + vaddl.u8 q10, d4, d6 + vaddl.u8 q9, d0, d10 + vaddl.u8 q11, d2, d8 + vmla.u16 q9, q10, q14 + vaddl.u8 q12, d5, d7 + vaddl.u8 q10, d1, d11 + vaddl.u8 q13, d3, d9 + vmla.u16 q10, q12, q14 + vaddl.u8 q12, d14, d15 + vmls.u16 q9, q11, q15 + vaddl.u8 q11, d12, d17 + vmls.u16 q10, q13, q15 + vaddl.u8 q13, d13, d16 + vmla.u16 q11, q12, q14 + vmls.u16 q11, q13, q15 + vst1.32 {q9}, [r9]! + vst1.32 {q10}, [r9]! + vext.16 q12, q9, q10, #2 + vext.16 q13, q9, q10, #3 + vst1.32 {q11}, [r9] + vext.16 q11, q9, q10, #5 + vadd.s16 q0, q12, q13 + vext.16 q12, q9, q10, #1 + vext.16 q13, q9, q10, #4 + vadd.s16 q12, q12, q13 + + vaddl.s16 q13, d18, d22 + vmlal.s16 q13, d0, d28 + vmlsl.s16 q13, d24, d30 + + vaddl.s16 q11, d19, d23 + vmlal.s16 q11, d1, d28 + vmlsl.s16 q11, d25, d30 + + vqrshrun.s32 d18, q13, #10 + vqrshrun.s32 d19, q11, #10 + vld1.32 {q11}, [r9]! + vqmovn.u16 d18, q9 + + vext.16 q12, q10, q11, #2 + vext.16 q13, q10, q11, #3 + vext.16 q0, q10, q11, #5 + vst1.32 d18, [r1] + vadd.s16 q9, q12, q13 + vext.16 q12, q10, q11, #1 + vext.16 q13, q10, q11, #4 + vadd.s16 q12, q12, q13 + + vaddl.s16 q13, d0, d20 + vmlal.s16 q13, d18, d28 + vmlsl.s16 q13, d24, d30 + + vaddl.s16 q11, d1, d21 + vmlal.s16 q11, d19, d28 + vmlsl.s16 q11, d25, d30 + + vqrshrun.s32 d18, q13, #10 + vqrshrun.s32 d19, q11, #10 + + vaddl.u8 q12, d7, d9 + vld1.32 {q10}, [r6]! + vld1.32 {q11}, [r6], r7 + + vqmovn.u16 d19, q9 + + vld1.32 d18, [r1] + vqrshrun.s16 d20, q10, #5 + vqrshrun.s16 d21, q11, #5 + vaddl.u8 q11, d4, d10 + vld1.u32 {q0}, [r0]! @ Vector load from src[6_0] + vrhadd.u8 q9, q9, q10 + vld1.u32 d12, [r0], r2 @ Vector load from src[6_0] + vaddl.u8 q10, d6, d8 + vaddl.u8 q13, d5, d11 + vst1.32 {q9}, [r1], r3 @ store row 0 + +@ROW_2 + + vaddl.u8 q9, d2, d0 + + vmla.u16 q9, q10, q14 + + vaddl.u8 q10, d3, d1 + + vmla.u16 q10, q12, q14 + vaddl.u8 q12, d15, d16 + vmls.u16 q9, q11, q15 + vaddl.u8 q11, d13, d12 + vmls.u16 q10, q13, q15 + vaddl.u8 q13, d14, d17 + vmla.u16 q11, q12, q14 + vmls.u16 q11, q13, q15 + vst1.32 {q9}, [r9]! + vst1.32 {q10}, [r9]! + vext.16 q12, q9, q10, #2 + vext.16 q13, q9, q10, #3 + vst1.32 {q11}, [r9] + vext.16 q11, q9, q10, #5 + vadd.s16 q1, q12, q13 + vext.16 q12, q9, q10, #1 + vext.16 q13, q9, q10, #4 + vadd.s16 q12, q12, q13 + + vaddl.s16 q13, d18, d22 + vmlal.s16 q13, d2, d28 + vmlsl.s16 q13, d24, d30 + + vaddl.s16 q11, d19, d23 + vmlal.s16 q11, d3, d28 + vmlsl.s16 q11, d25, d30 + + vqrshrun.s32 d18, q13, #10 + vqrshrun.s32 d19, q11, #10 + vld1.32 {q11}, [r9]! + vqmovn.u16 d18, q9 + + vext.16 q12, q10, q11, #2 + vext.16 q13, q10, q11, #3 + vext.16 q1, q10, q11, #5 + vst1.32 d18, [r1] + vadd.s16 q9, q12, q13 + vext.16 q12, q10, q11, #1 + vext.16 q13, q10, q11, #4 + vadd.s16 q12, q12, q13 + + vaddl.s16 q13, d2, d20 + vmlal.s16 q13, d18, d28 + vmlsl.s16 q13, d24, d30 + + vaddl.s16 q11, d3, d21 + vmlal.s16 q11, d19, d28 + vmlsl.s16 q11, d25, d30 + + vqrshrun.s32 d18, q13, #10 + vqrshrun.s32 d19, q11, #10 + vaddl.u8 q12, d9, d11 + vld1.32 {q10}, [r6]! 
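+@ q10/q11 (loaded from r6) hold the stage-1 vertical 6-tap sums at the
+@ qpel column, i.e. pi16_pred1_temp + (x_offset>>1); the vqrshrun #5 below
+@ reduces them to half-pel pixels, and vrhadd then averages those with the
+@ two-stage (1/2,1/2) output to form the quarter-pel row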
+ vld1.32 {q11}, [r6], r7 + vqmovn.u16 d19, q9 + vld1.32 d18, [r1] + vqrshrun.s16 d20, q10, #5 + vqrshrun.s16 d21, q11, #5 + + vrhadd.u8 q9, q9, q10 + + vst1.32 {q9}, [r1], r3 @ store row 1 + + subs r4, r4, #2 + subne r0, r0 , r14, lsl #2 + subne r0, r0, r14 + + beq end_func @ Branch if height==4 + b loop_16 @ Loop if height==8 + +loop_8: + vld1.u32 {q0}, [r0], r2 @ Vector load from src[0_0] + vld1.u32 {q1}, [r0], r2 @ Vector load from src[1_0] + vld1.u32 {q2}, [r0], r2 @ Vector load from src[2_0] + vld1.u32 {q3}, [r0], r2 @ Vector load from src[3_0] + vld1.u32 {q4}, [r0], r2 @ Vector load from src[4_0] + + vld1.u32 {q5}, [r0], r2 @ Vector load from src[5_0] + vaddl.u8 q7, d4, d6 + vaddl.u8 q6, d0, d10 + vaddl.u8 q8, d2, d8 + vmla.u16 q6, q7, q13 + vaddl.u8 q9, d5, d7 + vaddl.u8 q7, d1, d11 + vaddl.u8 q11, d3, d9 + vmla.u16 q7, q9, q13 + vmls.u16 q6, q8, q12 + vld1.32 {q0}, [r0], r2 @ Vector load from src[6_0] + vaddl.u8 q8, d6, d8 + vmls.u16 q7, q11, q12 + vaddl.u8 q14, d2, d0 + vst1.32 {q6}, [r9]! @ store row 0 to temp buffer: col 0 + vext.16 q11, q6, q7, #5 + vaddl.u8 q9, d4, d10 + vmla.u16 q14, q8, q13 + vaddl.s16 q15, d12, d22 + vst1.32 {q7}, [r9], r7 @ store row 0 to temp buffer: col 1 + vaddl.s16 q11, d13, d23 + vext.16 q8, q6, q7, #2 + vmls.u16 q14, q9, q12 + vext.16 q9, q6, q7, #3 + vext.16 q10, q6, q7, #4 + vext.16 q7, q6, q7, #1 + vadd.s16 q8, q8, q9 + vadd.s16 q9, q7, q10 + vaddl.u8 q10, d7, d9 + vmlal.s16 q15, d16, d26 + vmlsl.s16 q15, d18, d24 + vmlal.s16 q11, d17, d26 + vmlsl.s16 q11, d19, d24 + vaddl.u8 q7, d3, d1 + vst1.32 {q14}, [r9]! @ store row 1 to temp buffer: col 0 + vmla.u16 q7, q10, q13 + vqrshrun.s32 d12, q15, #10 + vaddl.u8 q8, d5, d11 + vqrshrun.s32 d13, q11, #10 + vmls.u16 q7, q8, q12 +@ vld1.32 {q1},[r0],r2 ; Vector load from src[7_0] + vqmovn.u16 d25, q6 + vaddl.u8 q8, d8, d10 + + + vext.16 q11, q14, q7, #5 + vaddl.u8 q10, d4, d2 + vaddl.s16 q15, d28, d22 + vmla.u16 q10, q8, q13 + vst1.32 {q7}, [r9], r7 @ store row 1 to temp buffer: col 1 + vaddl.s16 q11, d29, d23 + vext.16 q8, q14, q7, #2 + vext.16 q9, q14, q7, #3 + vext.16 q6, q14, q7, #4 + vext.16 q7, q14, q7, #1 + vadd.s16 q8, q8, q9 + vadd.s16 q9, q6, q7 + vld1.32 {q7}, [r6], r8 @ load row 0 from temp buffer + vmlal.s16 q15, d16, d26 + vmlsl.s16 q15, d18, d24 + vmlal.s16 q11, d17, d26 + vmlsl.s16 q11, d19, d24 + vqrshrun.s16 d14, q7, #0x5 + vld1.32 {q14}, [r6], r8 @ load row 1 from temp buffer + vaddl.u8 q9, d6, d0 + vqrshrun.s32 d16, q15, #10 + vqrshrun.s16 d15, q14, #0x5 + vqrshrun.s32 d17, q11, #10 + vmov d12, d25 + vmov d25, d24 + + vqmovn.u16 d13, q8 + vrhadd.u8 q6, q6, q7 + + vst1.32 d12, [r1], r3 @ store row 0 + vst1.32 d13, [r1], r3 @ store row 1 + + subs r4, r4, #2 + subne r0, r0 , r2, lsl #2 + subne r0, r0, r2 + + beq end_func @ Branch if height==4 + b loop_8 @ Loop if height==8 + +loop_4: + vld1.u32 {q0}, [r0], r2 @ Vector load from src[0_0] + vld1.u32 {q1}, [r0], r2 @ Vector load from src[1_0] + vld1.u32 {q2}, [r0], r2 @ Vector load from src[2_0] + vld1.u32 {q3}, [r0], r2 @ Vector load from src[3_0] + vld1.u32 {q4}, [r0], r2 @ Vector load from src[4_0] + vld1.u32 {q5}, [r0], r2 @ Vector load from src[5_0] + + vaddl.u8 q7, d4, d6 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q6, d0, d10 @ temp = src[0_0] + src[5_0] + vaddl.u8 q8, d2, d8 @ temp2 = src[1_0] + src[4_0] + vmla.u16 q6, q7, q13 @ temp += temp1 * 20 + vaddl.u8 q9, d5, d7 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q7, d1, d11 @ temp = src[0_0] + src[5_0] + vaddl.u8 q11, d3, d9 @ temp2 = src[1_0] + src[4_0] + vmla.u16 q7, q9, q13 @ temp += 
temp1 * 20 + vmls.u16 q6, q8, q12 @ temp -= temp2 * 5 + vld1.32 {q0}, [r0], r2 @ Vector load from src[6_0] + vaddl.u8 q8, d6, d8 + vmls.u16 q7, q11, q12 @ temp -= temp2 * 5 + @Q6 and Q7 have filtered values + vaddl.u8 q14, d2, d0 + vst1.32 {q6}, [r9]! @ store row 0 to temp buffer: col 0 + vext.16 q11, q6, q7, #5 + vaddl.u8 q9, d4, d10 + vmla.u16 q14, q8, q13 + vaddl.s16 q15, d12, d22 + vst1.32 {q7}, [r9], r7 @ store row 0 to temp buffer: col 1 + vaddl.s16 q11, d13, d23 + vext.16 q8, q6, q7, #2 + vmls.u16 q14, q9, q12 + vext.16 q9, q6, q7, #3 + vext.16 q10, q6, q7, #4 + vext.16 q7, q6, q7, #1 + vadd.s16 q8, q8, q9 + vadd.s16 q9, q7, q10 + vaddl.u8 q10, d7, d9 + vmlal.s16 q15, d16, d26 + vmlsl.s16 q15, d18, d24 + vmlal.s16 q11, d17, d26 + vmlsl.s16 q11, d19, d24 + vaddl.u8 q7, d3, d1 + vst1.32 {q14}, [r9]! @ store row 1 to temp buffer: col 0 + vmla.u16 q7, q10, q13 + vqrshrun.s32 d12, q15, #10 + vaddl.u8 q8, d5, d11 + vqrshrun.s32 d13, q11, #10 + vmls.u16 q7, q8, q12 + vqmovn.u16 d25, q6 + vaddl.u8 q8, d8, d10 + + vext.16 q11, q14, q7, #5 + vaddl.u8 q10, d4, d2 + vaddl.s16 q15, d28, d22 + vmla.u16 q10, q8, q13 + vst1.32 {q7}, [r9], r7 @ store row 1 to temp buffer: col 1 + vaddl.s16 q11, d29, d23 + vext.16 q8, q14, q7, #2 + vext.16 q9, q14, q7, #3 + vext.16 q6, q14, q7, #4 + vext.16 q7, q14, q7, #1 + vadd.s16 q8, q8, q9 + vadd.s16 q9, q6, q7 + vld1.32 d14, [r6], r8 @load row 0 from temp buffer + vmlal.s16 q15, d16, d26 + vmlsl.s16 q15, d18, d24 + vmlal.s16 q11, d17, d26 + vmlsl.s16 q11, d19, d24 + vqrshrun.s16 d14, q7, #0x5 + vld1.32 d28, [r6], r8 @load row 1 from temp buffer + vaddl.u8 q9, d6, d0 + vqrshrun.s32 d16, q15, #10 + vqrshrun.s16 d15, q14, #0x5 + vqrshrun.s32 d17, q11, #10 + vmov d12, d25 + vmov d25, d24 + + vqmovn.u16 d13, q8 + vrhadd.u8 q6, q6, q7 + vst1.32 d12[0], [r1], r3 @ store row 0 + vst1.32 d13[0], [r1], r3 @store row 1 + + subs r4, r4, #2 + subne r0, r0 , r2, lsl #2 + subne r0, r0, r2 + + beq end_func @ Branch if height==4 + b loop_4 @ Loop if height==8 + +end_func: + vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s new file mode 100755 index 0000000..3c8b60a --- /dev/null +++ b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s @@ -0,0 +1,355 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s +@* +@* @brief +@* Contains function definitions for inter prediction interpolation. 
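+@* In essence each output pixel is the rounded average of a horizontal and
+@* a vertical six-tap result (a sketch, not the literal reference code):
+@*   dst = (CLIP_U8((h_6tap + 16) >> 5) + CLIP_U8((v_6tap + 16) >> 5) + 1) >> 1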
+@* +@* @author +@* Mohit +@* +@* @par List of Functions: +@* +@* - ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_inter_pred_filters.c +@ + +@/** +@/** +@/** +@******************************************************************************* +@* +@* @brief +@* This function implements two six tap filters. It +@* applies the six tap filter in the horizontal direction on the +@* predictor values, then applies the same filter in the +@* vertical direction on the predictor values. It then averages these +@* two outputs to obtain quarter pel values in horizontal and vertical direction. +@* The six tap filtering operation is described in sec 8.4.2.2.1 titled +@* "Luma sample interpolation process" +@* +@* @par Description: +@* This function is called to obtain pixels lying at the following +@* location (1/4,1/4) or (3/4,1/4) or (1/4,3/4) or (3/4,3/4). +@* The function interpolates the predictors first in the horizontal direction +@* and then in the vertical direction, and then averages these two +@* values. +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @param[in] pu1_tmp: temporary buffer +@* +@* @param[in] dydx: x and y reference offset for qpel calculations +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/; + +@void ih264_inter_pred_luma_horz_qpel_vert_qpel(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd,, +@ WORD32 dst_strd, +@ WORD32 ht, +@ WORD32 wd, +@ UWORD8* pu1_tmp, +@ UWORD32 dydx) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ht +@ r5 => wd +@ r6 => dydx + +.text +.p2align 2 + + .global ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q + +ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + vstmdb sp!, {d8-d15} @push neon registers to stack + ldr r4, [sp, #104] @ loads ht + ldr r5, [sp, #108] @ loads wd + ldr r6, [sp, #116] @dydx + and r7, r6, #3 + add r7, r0, r7, lsr #1 @pu1_pred_vert = pu1_src + (x_offset>>1) + + and r6, r6, #12 @Finds y-offset + lsr r6, r6, #3 @dydx>>3 + mul r6, r2, r6 + add r6, r0, r6 @pu1_pred_horz = pu1_src + (y_offset>>1)*src_strd + sub r7, r7, r2, lsl #1 @pu1_pred_vert-2*src_strd + sub r6, r6, #2 @pu1_pred_horz-2 + vmov.u8 d30, #20 @ Filter coeff 20 + vmov.u8 d31, #5 @ Filter coeff 5 + + subs r12, r5, #4 @if wd=4 branch to loop_4 + beq loop_4 + subs r12, r5, #8 @if wd=8 branch to loop_8 + beq loop_8 + +loop_16: + vld1.32 {q0}, [r7], r2 @ Vector load from src[0_0] + vld1.32 {q1}, [r7], r2 @ Vector load from src[1_0] + vld1.32 {q2}, [r7], r2 @ Vector load from src[2_0] + vld1.32 {q3}, [r7], r2 @ Vector load from src[3_0] + vld1.32 {q4}, [r7], r2 @ Vector load from src[4_0] + add r11, r6, #8 + vld1.32 {q5}, [r7], r2 @ Vector load from src[5_0] + vld1.32 {q9}, [r6], r2 @ horz row0, col 0 + vaddl.u8 q12, d0, d10 + vmlal.u8 q12, d4, d30 + vmlal.u8 q12, d6, d30 + vmlsl.u8 q12, d2, d31 + vmlsl.u8 q12, d8, d31 + vext.8 d23, d18, d19, #5 + 
vext.8 d20, d18, d19, #2 + vext.8 d21, d18, d19, #3 + vext.8 d22, d18, d19, #4 + vext.8 d19, d18, d19, #1 + vqrshrun.s16 d26, q12, #5 + vaddl.u8 q14, d18, d23 + vmlal.u8 q14, d20, d30 + vmlal.u8 q14, d21, d30 + vmlsl.u8 q14, d19, d31 + vmlsl.u8 q14, d22, d31 + vld1.32 {q9}, [r11], r2 @ horz row 0, col 1 + vaddl.u8 q12, d1, d11 + vmlal.u8 q12, d5, d30 + vmlal.u8 q12, d7, d30 + vmlsl.u8 q12, d3, d31 + vmlsl.u8 q12, d9, d31 + vqrshrun.s16 d28, q14, #5 + vext.8 d23, d18, d19, #5 + vext.8 d20, d18, d19, #2 + vext.8 d21, d18, d19, #3 + vext.8 d22, d18, d19, #4 + vext.8 d19, d18, d19, #1 + vqrshrun.s16 d27, q12, #5 + vld1.32 {q6}, [r7], r2 @ src[6_0] + + vaddl.u8 q12, d18, d23 + vmlal.u8 q12, d20, d30 + vmlal.u8 q12, d21, d30 + vmlsl.u8 q12, d19, d31 + vmlsl.u8 q12, d22, d31 + + vaddl.u8 q8, d2, d12 + vmlal.u8 q8, d6, d30 + vmlal.u8 q8, d8, d30 + vmlsl.u8 q8, d4, d31 + vmlsl.u8 q8, d10, d31 + vqrshrun.s16 d29, q12, #5 + vld1.32 {q9}, [r6], r2 @ horz row 1, col 0 + + vaddl.u8 q12, d3, d13 + vmlal.u8 q12, d7, d30 + vmlal.u8 q12, d9, d30 + vmlsl.u8 q12, d5, d31 + vmlsl.u8 q12, d11, d31 + vrhadd.u8 q14, q14, q13 + vqrshrun.s16 d26, q8, #5 + vext.8 d23, d18, d19, #5 + vext.8 d20, d18, d19, #2 + vext.8 d21, d18, d19, #3 + vext.8 d22, d18, d19, #4 + vst1.32 {q14}, [r1], r3 @ store row 0 + vext.8 d19, d18, d19, #1 + vqrshrun.s16 d27, q12, #5 + + vaddl.u8 q14, d18, d23 + vmlal.u8 q14, d20, d30 + vmlal.u8 q14, d21, d30 + vmlsl.u8 q14, d19, d31 + vmlsl.u8 q14, d22, d31 + + vld1.32 {q9}, [r11], r2 @ horz row 1, col 1 + + vext.8 d23, d18, d19, #5 + vext.8 d20, d18, d19, #2 + vext.8 d21, d18, d19, #3 + vext.8 d22, d18, d19, #4 + vext.8 d19, d18, d19, #1 + + vqrshrun.s16 d28, q14, #5 + vaddl.u8 q12, d18, d23 + vmlal.u8 q12, d20, d30 + vmlal.u8 q12, d21, d30 + vmlsl.u8 q12, d19, d31 + vmlsl.u8 q12, d22, d31 + + vqrshrun.s16 d29, q12, #5 + vrhadd.u8 q14, q14, q13 + vst1.32 {q14}, [r1], r3 @ store row 1 + + subs r4, r4, #2 @ 2 rows processed, decrement by 2 + subne r7, r7 , r2, lsl #2 + subne r7, r7, r2 + beq end_func @ Branch if height==4 + + b loop_16 @ looping if height = 8 or 16 + + +loop_8: + vld1.32 d0, [r7], r2 @ Vector load from src[0_0] + vld1.32 d1, [r7], r2 @ Vector load from src[1_0] + vld1.32 d2, [r7], r2 @ Vector load from src[2_0] + vld1.32 d3, [r7], r2 @ Vector load from src[3_0] + vld1.32 d4, [r7], r2 @ Vector load from src[4_0] + vld1.32 d5, [r7], r2 @ Vector load from src[5_0] + vaddl.u8 q5, d0, d5 + vmlal.u8 q5, d2, d30 + vmlal.u8 q5, d3, d30 + vmlsl.u8 q5, d1, d31 + vmlsl.u8 q5, d4, d31 + vld1.32 {q6}, [r6], r2 @horz row 0 + vext.8 d17, d12, d13, #5 + vext.8 d14, d12, d13, #2 + vext.8 d15, d12, d13, #3 + vext.8 d16, d12, d13, #4 + vext.8 d13, d12, d13, #1 + vqrshrun.s16 d26, q5, #5 + vld1.32 d6, [r7], r2 @ src[6_0] + vaddl.u8 q5, d12, d17 + vmlal.u8 q5, d14, d30 + vmlal.u8 q5, d15, d30 + vmlsl.u8 q5, d13, d31 + vmlsl.u8 q5, d16, d31 + vld1.32 {q6}, [r6], r2 @ horz row 1 + vaddl.u8 q9, d1, d6 + vmlal.u8 q9, d3, d30 + vmlal.u8 q9, d4, d30 + vmlsl.u8 q9, d2, d31 + vmlsl.u8 q9, d5, d31 + vqrshrun.s16 d28, q5, #5 + vext.8 d17, d12, d13, #5 + vext.8 d14, d12, d13, #2 + vext.8 d15, d12, d13, #3 + vext.8 d16, d12, d13, #4 + vext.8 d13, d12, d13, #1 + vqrshrun.s16 d27, q9, #5 + vaddl.u8 q5, d12, d17 + vmlal.u8 q5, d14, d30 + vmlal.u8 q5, d15, d30 + vmlsl.u8 q5, d13, d31 + vmlsl.u8 q5, d16, d31 + vqrshrun.s16 d29, q5, #5 + vrhadd.u8 q13, q13, q14 + vst1.32 d26, [r1], r3 + vst1.32 d27, [r1], r3 + + subs r4, r4, #2 @ 2 rows processed, decrement by 2 + subne r7, r7 , r2, lsl #2 + subne r7, r7, r2 + beq 
end_func @ Branch if height==4 + b loop_8 @looping if height == 8 or 16 + +loop_4: + vld1.32 d0[0], [r7], r2 @ Vector load from src[0_0] + vld1.32 d1[0], [r7], r2 @ Vector load from src[1_0] + vld1.32 d2[0], [r7], r2 @ Vector load from src[2_0] + vld1.32 d3[0], [r7], r2 @ Vector load from src[3_0] + vld1.32 d4[0], [r7], r2 @ Vector load from src[4_0] + vld1.32 d5[0], [r7], r2 @ Vector load from src[5_0] + vaddl.u8 q5, d0, d5 + vmlal.u8 q5, d2, d30 + vmlal.u8 q5, d3, d30 + vmlsl.u8 q5, d1, d31 + vmlsl.u8 q5, d4, d31 + vld1.32 {q6}, [r6], r2 @load for horz filter row 0 + vext.8 d17, d12, d13, #5 + vext.8 d14, d12, d13, #2 + vext.8 d15, d12, d13, #3 + vext.8 d16, d12, d13, #4 + vext.8 d13, d12, d13, #1 + vqrshrun.s16 d26, q5, #5 + vld1.32 d6[0], [r7], r2 @ Vector load from src[6_0] + vaddl.u8 q5, d12, d17 + vmlal.u8 q5, d14, d30 + vmlal.u8 q5, d15, d30 + vmlsl.u8 q5, d13, d31 + vmlsl.u8 q5, d16, d31 + vld1.32 {q6}, [r6], r2 @horz row 1 + vaddl.u8 q9, d1, d6 + vmlal.u8 q9, d3, d30 + vmlal.u8 q9, d4, d30 + vmlsl.u8 q9, d2, d31 + vmlsl.u8 q9, d5, d31 + vqrshrun.s16 d28, q5, #5 + vext.8 d17, d12, d13, #5 + vext.8 d14, d12, d13, #2 + vext.8 d15, d12, d13, #3 + vext.8 d16, d12, d13, #4 + vext.8 d13, d12, d13, #1 + vqrshrun.s16 d27, q9, #5 + vaddl.u8 q5, d12, d17 + vmlal.u8 q5, d14, d30 + vmlal.u8 q5, d15, d30 + vmlsl.u8 q5, d13, d31 + vmlsl.u8 q5, d16, d31 + vqrshrun.s16 d29, q5, #5 + vrhadd.u8 q13, q13, q14 + vst1.32 d26[0], [r1], r3 + vst1.32 d27[0], [r1], r3 + + subs r4, r4, #2 @ 2 rows processed, decrement by 2 + subne r7, r7 , r2, lsl #2 + subne r7, r7, r2 + beq end_func @ Branch if height==4 + b loop_4 @ Loop if height==8 +end_func: + vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + diff --git a/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s new file mode 100755 index 0000000..d45055e --- /dev/null +++ b/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s @@ -0,0 +1,330 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_inter_pred_luma_vert_qpel_a9q.s +@* +@* @brief +@* Contains function definitions for inter prediction vertical quarter pel interpolation. 
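+@* Per-pixel sketch (hedged; s = src_strd, taps run down a column,
+@* illustrative names only):
+@*   i2_tmp = src[-2*s] + src[3*s] + 20 * (src[0] + src[s])
+@*            - 5 * (src[-s] + src[2*s]);
+@*   dst    = (CLIP_U8((i2_tmp + 16) >> 5)
+@*             + src[((dydx & 12) >> 3) * s] + 1) >> 1;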
+@* +@* @author +@* Mohit +@* +@* @par List of Functions: +@* +@* - ih264_inter_pred_luma_vert_qpel_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_inter_pred_filters.c +@ + +@/** +@/** +@******************************************************************************* +@* +@* @brief +@* Quarter pel interprediction luma filter for vertical input +@* +@* @par Description: +@* Applies a 6 tap horizontal filter .The output is clipped to 8 bits +@* sec 8.4.2.2.1 titled "Luma sample interpolation process" +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @param[in] pu1_tmp: temporary buffer: UNUSED in this function +@* +@* @param[in] dydx: x and y reference offset for qpel calculations. +@* @returns +@* +@ @remarks +@* None +@* +@******************************************************************************* +@*/ + +@void ih264_inter_pred_luma_vert ( +@ UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ht, +@ WORD32 wd, +@ UWORD8* pu1_tmp, +@ UWORD32 dydx) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r5 => ht +@ r6 => wd +@ r7 => dydx + +.text +.p2align 2 + + .global ih264_inter_pred_luma_vert_qpel_a9q + +ih264_inter_pred_luma_vert_qpel_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + vstmdb sp!, {d8-d15} @push neon registers to stack + ldr r5, [sp, #104] @Loads ht + + ldr r6, [sp, #108] @Loads wd + ldr r7, [sp, #116] @Loads dydx + and r7, r7, #12 @Finds y-offset + lsr r7, r7, #3 @dydx>>3 + mul r7, r2, r7 + add r7, r0, r7 @pu1_src + (y_offset>>1)*src_strd + vmov.u16 q11, #20 @ Filter coeff 0x14 into Q11 + sub r0, r0, r2, lsl #1 @pu1_src-2*src_strd + subs r12, r6, #8 @if wd=8 branch to loop_8 + vmov.u16 q12, #5 @ Filter coeff 0x5 into Q12 + beq loop_8 + + subs r12, r6, #4 @if wd=4 branch to loop_4 + beq loop_4 + +loop_16: @when wd=16 + + vld1.u32 {q0}, [r0], r2 @ Vector load from src[0_0] + vld1.u32 {q1}, [r0], r2 @ Vector load from src[1_0] + vld1.u32 {q2}, [r0], r2 @ Vector load from src[2_0] + vld1.u32 {q3}, [r0], r2 @ Vector load from src[3_0] + vld1.u32 {q4}, [r0], r2 @ Vector load from src[4_0] + vaddl.u8 q6, d4, d6 @ temp1 = src[2_0] + src[3_0] + vld1.u32 {q5}, [r0], r2 @ Vector load from src[5_0] + vaddl.u8 q7, d0, d10 @ temp = src[0_0] + src[5_0] + vaddl.u8 q8, d2, d8 @ temp2 = src[1_0] + src[4_0] + vmla.u16 q7, q6, q11 @ temp += temp1 * 20 + vaddl.u8 q10, d1, d11 @ temp4 = src[0_8] + src[5_8] + vaddl.u8 q9, d5, d7 @ temp3 = src[2_8] + src[3_8] + vmla.u16 q10, q9, q11 @ temp4 += temp3 * 20 + vld1.u32 {q0}, [r0], r2 + vaddl.u8 q13, d3, d9 @ temp5 = src[1_8] + src[4_8] + vaddl.u8 q6, d6, d8 + vmls.u16 q7, q8, q12 @ temp -= temp2 * 5 + vaddl.u8 q8, d2, d0 + vaddl.u8 q9, d4, d10 + vmla.u16 q8, q6, q11 + vmls.u16 q10, q13, q12 @ temp4 -= temp5 * 5 + vaddl.u8 q13, d5, d11 + vaddl.u8 q6, d7, d9 + vqrshrun.s16 d30, q7, #5 @ dst[0_0] = CLIP_U8((temp +16) >> 5) + vaddl.u8 q7, d3, d1 + vld1.u32 {q1}, [r0], r2 + vmla.u16 q7, q6, q11 + vmls.u16 q8, q9, q12 + vqrshrun.s16 d31, q10, #5 @ dst[0_8] = 
CLIP_U8((temp4 +16) >> 5) + vld1.u32 {q10}, [r7], r2 @ Load for interpolation row 0 + vrhadd.u8 q15, q10, q15 @ Interpolation to obtain qpel value + vaddl.u8 q9, d4, d2 + vaddl.u8 q6, d8, d10 + + vst1.u32 {q15}, [r1], r3 @ Vector store to dst[0_0] + vmla.u16 q9, q6, q11 + vaddl.u8 q10, d6, d0 + vmls.u16 q7, q13, q12 + vqrshrun.s16 d30, q8, #5 + vaddl.u8 q6, d9, d11 + vaddl.u8 q8, d5, d3 + vaddl.u8 q13, d7, d1 + vmla.u16 q8, q6, q11 + vmls.u16 q9, q10, q12 + vld1.u32 {q2}, [r0], r2 + + vqrshrun.s16 d31, q7, #5 + vld1.u32 {q7}, [r7], r2 @ Load for interpolation row 1 + vaddl.u8 q6, d10, d0 + vrhadd.u8 q15, q7, q15 @ Interpolation to obtain qpel value + vaddl.u8 q7, d6, d4 + vaddl.u8 q10, d8, d2 + vmla.u16 q7, q6, q11 + vmls.u16 q8, q13, q12 + vst1.u32 {q15}, [r1], r3 @store row 1 + vqrshrun.s16 d30, q9, #5 + vaddl.u8 q9, d7, d5 + vaddl.u8 q6, d11, d1 + vmla.u16 q9, q6, q11 + vaddl.u8 q13, d9, d3 + vmls.u16 q7, q10, q12 + vqrshrun.s16 d31, q8, #5 + vld1.u32 {q8}, [r7], r2 @ Load for interpolation row 2 + vmls.u16 q9, q13, q12 + vrhadd.u8 q15, q8, q15 @ Interpolation to obtain qpel value + vaddl.u8 q6, d0, d2 @ temp1 = src[2_0] + src[3_0] + vst1.u32 {q15}, [r1], r3 @store row 2 + vaddl.u8 q8, d10, d4 @ temp2 = src[1_0] + src[4_0] + vaddl.u8 q10, d9, d7 @ temp4 = src[0_8] + src[5_8] + vqrshrun.s16 d30, q7, #5 + vaddl.u8 q13, d5, d11 @ temp5 = src[1_8] + src[4_8] + vaddl.u8 q7, d8, d6 @ temp = src[0_0] + src[5_0] + vqrshrun.s16 d31, q9, #5 + vld1.u32 {q9}, [r7], r2 @ Load for interpolation row 3 + vmla.u16 q7, q6, q11 @ temp += temp1 * 20 + vrhadd.u8 q15, q9, q15 @ Interpolation to obtain qpel value + vaddl.u8 q9, d1, d3 @ temp3 = src[2_8] + src[3_8] + vst1.u32 {q15}, [r1], r3 @store row 3 + subs r5, r5, #4 @ 4 rows processed, decrement by 4 + subne r0, r0 , r2, lsl #2 + subne r0, r0, r2 + beq end_func @ Branch if height==4 + + b loop_16 @ looping if height = 8 or 16 + + +loop_8: + + @// Processing row0 and row1 + vld1.u32 d0, [r0], r2 @ Vector load from src[0_0] + vld1.u32 d1, [r0], r2 @ Vector load from src[1_0] + vld1.u32 d2, [r0], r2 @ Vector load from src[2_0] + vld1.u32 d3, [r0], r2 @ Vector load from src[3_0] + vld1.u32 d4, [r0], r2 @ Vector load from src[4_0] + vld1.u32 d5, [r0], r2 @ Vector load from src[5_0] + + vaddl.u8 q3, d2, d3 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q4, d0, d5 @ temp = src[0_0] + src[5_0] + vaddl.u8 q5, d1, d4 @ temp2 = src[1_0] + src[4_0] + vmla.u16 q4, q3, q11 @ temp += temp1 * 20 + vld1.u32 d6, [r0], r2 + vaddl.u8 q7, d3, d4 + vaddl.u8 q8, d1, d6 + vaddl.u8 q9, d2, d5 + vmls.u16 q4, q5, q12 @ temp -= temp2 * 5 + vmla.u16 q8, q7, q11 + vld1.u32 d7, [r0], r2 + vaddl.u8 q10, d4, d5 + vaddl.u8 q6, d2, d7 + vaddl.u8 q5, d3, d6 + vmls.u16 q8, q9, q12 + vqrshrun.s16 d26, q4, #5 @ dst[0_0] = CLIP_U8( (temp + 16) >> 5) + vmla.u16 q6, q10, q11 + vld1.32 d8, [r7], r2 @Load value for interpolation (row0) + vld1.32 d9, [r7], r2 @Load value for interpolation (row1) + vld1.u32 d0, [r0], r2 + vaddl.u8 q7, d5, d6 + vqrshrun.s16 d27, q8, #5 + vrhadd.u8 q13, q4, q13 @ Interpolation step for qpel calculation + vaddl.u8 q10, d3, d0 + vmls.u16 q6, q5, q12 + vst1.u32 d26, [r1], r3 @ Vector store to dst[0_0] + vaddl.u8 q9, d4, d7 + vmla.u16 q10, q7, q11 + vst1.u32 d27, [r1], r3 @ Vector store to dst[1_0] + vqrshrun.s16 d28, q6, #5 + vmls.u16 q10, q9, q12 + vld1.32 d12, [r7], r2 @Load value for interpolation (row2) + vld1.32 d13, [r7], r2 @Load value for interpolation (row3) + vqrshrun.s16 d29, q10, #5 + subs r9, r5, #4 + vrhadd.u8 q14, q6, q14 + vst1.u32 d28, [r1], r3 @store row 2 + 
vst1.u32 d29, [r1], r3 @store row 3 + + subs r5, r5, #4 @ 4 rows processed, decrement by 4 + subne r0, r0 , r2, lsl #2 + subne r0, r0, r2 + beq end_func @ Branch if height==4 + b loop_8 @looping if height == 8 or 16 + +loop_4: +@// Processing row0 and row1 + + vld1.u32 d0[0], [r0], r2 @ Vector load from src[0_0] + vld1.u32 d1[0], [r0], r2 @ Vector load from src[1_0] + vld1.u32 d2[0], [r0], r2 @ Vector load from src[2_0] + vld1.u32 d3[0], [r0], r2 @ Vector load from src[3_0] + vld1.u32 d4[0], [r0], r2 @ Vector load from src[4_0] + vld1.u32 d5[0], [r0], r2 @ Vector load from src[5_0] + + vaddl.u8 q3, d2, d3 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q4, d0, d5 @ temp = src[0_0] + src[5_0] + vaddl.u8 q5, d1, d4 @ temp2 = src[1_0] + src[4_0] + vmla.u16 q4, q3, q11 @ temp += temp1 * 20 + vld1.u32 d6, [r0], r2 + vaddl.u8 q7, d3, d4 + vaddl.u8 q8, d1, d6 + vaddl.u8 q9, d2, d5 + vmls.u16 q4, q5, q12 @ temp -= temp2 * 5 + vld1.u32 d7[0], [r0], r2 + vmla.u16 q8, q7, q11 + vaddl.u8 q10, d4, d5 + vaddl.u8 q6, d2, d7 + vaddl.u8 q5, d3, d6 + vmls.u16 q8, q9, q12 + vqrshrun.s16 d26, q4, #5 @ dst[0_0] = CLIP_U8( (temp + 16) >> 5) + vld1.u32 d8[0], [r7], r2 @Load value for interpolation - row 0 + vld1.u32 d9[0], [r7], r2 @Load value for interpolation - row 1 + vmla.u16 q6, q10, q11 + vld1.u32 d0[0], [r0], r2 + vaddl.u8 q7, d5, d6 + vqrshrun.s16 d27, q8, #5 + vaddl.u8 q10, d3, d0 + vrhadd.u8 q13, q13, q4 @Interpolation step for qpel calculation + vmls.u16 q6, q5, q12 + vst1.u32 d26[0], [r1], r3 @ Vector store to dst[0_0] + vaddl.u8 q9, d4, d7 + vmla.u16 q10, q7, q11 + vst1.u32 d27[0], [r1], r3 @ store row 1 + vqrshrun.s16 d28, q6, #5 + vld1.u32 d12[0], [r7], r2 @Load value for interpolation - row 2 + vld1.u32 d13[0], [r7], r2 @Load value for interpolation - row 3 + + vmls.u16 q10, q9, q12 + vqrshrun.s16 d29, q10, #5 + vrhadd.u8 q14, q6, q14 @Interpolation step for qpel calculation + vst1.u32 d28[0], [r1], r3 @store row 2 + vst1.u32 d29[0], [r1], r3 @store row 3 + + subs r5, r5, #8 + subeq r0, r0, r2, lsl #2 + subeq r0, r0, r2 + beq loop_4 @ Loop if height==8 + +end_func: + vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + diff --git a/common/arm/ih264_intra_pred_chroma_a9q.s b/common/arm/ih264_intra_pred_chroma_a9q.s new file mode 100755 index 0000000..d03fc55 --- /dev/null +++ b/common/arm/ih264_intra_pred_chroma_a9q.s @@ -0,0 +1,551 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_intra_pred_chroma_a9q.s +@* +@* @brief +@* Contains function definitions for intra chroma prediction . 
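+@* For example, 8x8 DC mode with both neighbours available reduces to
+@* (a sketch per sec 8.3.4.1; U and V ride on alternate bytes throughout):
+@*   diagonal 4x4 sub-blocks:     dc = (sum(4 left) + sum(4 top) + 4) >> 3
+@*   off-diagonal 4x4 sub-blocks: dc = (sum(4 from one neighbour) + 2) >> 2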
+@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* +@* - ih264_intra_pred_chroma_mode_horz_a9q() +@* - ih264_intra_pred_chroma_8x8_mode_vert_a9q() +@* - ih264_intra_pred_chroma_mode_dc_a9q() +@* - ih264_intra_pred_chroma_mode_plane_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_chroma_intra_pred_filters.c +@ + +@/** +@/** +@/** +@ +.text +.p2align 2 + + .extern ih264_gai1_intrapred_chroma_plane_coeffs1 +.hidden ih264_gai1_intrapred_chroma_plane_coeffs1 + .extern ih264_gai1_intrapred_chroma_plane_coeffs2 +.hidden ih264_gai1_intrapred_chroma_plane_coeffs2 +scratch_chroma_intrapred_addr1: + .long ih264_gai1_intrapred_chroma_plane_coeffs1 - scrlblc1 - 8 + +scratch_intrapred_chroma_plane_addr1: + .long ih264_gai1_intrapred_chroma_plane_coeffs2 - scrlblc2 - 8 +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_chroma_8x8_mode_dc +@* +@* @brief +@* Perform Intra prediction for chroma_8x8 mode:DC +@* +@* @par Description: +@* Perform Intra prediction for chroma_8x8 mode:DC ,described in sec 8.3.4.1 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source containing alternate U and V samples +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination with alternate U and V samples +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@** @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + + .global ih264_intra_pred_chroma_8x8_mode_dc_a9q + +ih264_intra_pred_chroma_8x8_mode_dc_a9q: + + stmfd sp!, {r4, r14} @store register values to stack + ldr r4, [sp, #8] @r4 => ui_neighboravailability + vpush {d8-d15} + + ands r2, r4, #0x01 @CHECKING IF LEFT_AVAILABLE ELSE BRANCHING TO ONLY TOP AVAILABLE + beq top_available + ands r2, r4, #0x04 @CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE + beq left_available + + vld1.u8 {q0}, [r0] @BOTH LEFT AND TOP AVAILABLE + add r0, r0, #18 + vld1.u8 {q1}, [r0] + vaddl.u8 q2, d1, d2 + vaddl.u8 q3, d0, d3 + vmovl.u8 q1, d3 + vmovl.u8 q0, d0 + + vadd.u16 d12, d4, d5 + vadd.u16 d13, d2, d3 + vadd.u16 d15, d6, d7 + vadd.u16 d14, d0, d1 + + vpadd.u32 d12, d12, d15 + vpadd.u32 d14, d13, d14 + vqrshrun.s16 d12, q6, #3 + vqrshrun.s16 d14, q7, #2 + vdup.u16 d8, d12[0] + vdup.u16 d9, d14[0] + vdup.u16 d10, d14[1] + vdup.u16 d11, d12[1] + b str_pred + +top_available: @ONLY TOP AVAILABLE + ands r2, r4, #0x04 @CHECKING TOP AVAILABILTY OR ELSE BRANCH TO NONE AVAILABLE + beq none_available + + add r0, r0, #18 + vld1.u8 {q0}, [r0] + vmovl.u8 q1, d0 + vmovl.u8 q2, d1 + vadd.u16 d0, d2, d3 + vadd.u16 d1, d4, d5 + vpaddl.u32 q0, q0 + vqrshrun.s16 d0, q0, #2 + vdup.u16 d8, d0[0] + vdup.u16 d9, d0[2] + vmov q5, q4 + b str_pred + +left_available: @ONLY LEFT AVAILABLE + vld1.u8 {q0}, [r0] + vmovl.u8 q1, d0 + vmovl.u8 q2, d1 + vadd.u16 d0, d2, d3 + vadd.u16 d1, d4, d5 + vpaddl.u32 q0, q0 + vqrshrun.s16 d0, q0, #2 + 
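+    @ Left-only DC: each 4x4 sub-block takes (sum of its four left
+    @ neighbours + 2) >> 2, with U and V kept in separate lanes.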
vdup.u16 q5, d0[0] + vdup.u16 q4, d0[2] + b str_pred + +none_available: @NONE AVAILABLE + vmov.u8 q4, #128 + vmov.u8 q5, #128 + +str_pred: + vst1.8 {q4}, [r1], r3 + vst1.8 {q4}, [r1], r3 + vst1.8 {q4}, [r1], r3 + vst1.8 {q4}, [r1], r3 + vst1.8 {q5}, [r1], r3 + vst1.8 {q5}, [r1], r3 + vst1.8 {q5}, [r1], r3 + vst1.8 {q5}, [r1], r3 + + vpop {d8-d15} + ldmfd sp!, {r4, pc} @Restoring registers from stack + + + +@/****************************************************************************** + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_chroma_8x8_mode_horz +@* +@* @brief +@* Perform Intra prediction for chroma_8x8 mode:Horizontal +@* +@* @par Description: +@* Perform Intra prediction for chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source containing alternate U and V samples +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination with alternate U and V samples +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels(Not used in this function) +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@void ih264_intra_pred_chroma_8x8_mode_horz(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_chroma_8x8_mode_horz_a9q + +ih264_intra_pred_chroma_8x8_mode_horz_a9q: + + stmfd sp!, {r14} @store register values to stack + + vld1.u8 {q0}, [r0] + mov r2, #6 + + vdup.u16 q1, d1[3] + vdup.u16 q2, d1[2] + vst1.8 {q1}, [r1], r3 + +loop_8x8_horz: + vext.8 q0, q0, q0, #12 + vst1.8 {q2}, [r1], r3 + vdup.u16 q1, d1[3] + subs r2, #2 + vdup.u16 q2, d1[2] + vst1.8 {q1}, [r1], r3 + bne loop_8x8_horz + + vext.8 q0, q0, q0, #12 + vst1.8 {q2}, [r1], r3 + + ldmfd sp!, {pc} @restoring registers from stack + + + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_chroma_8x8_mode_vert +@* +@* @brief +@* Perform Intra prediction for chroma_8x8 mode:vertical +@* +@* @par Description: +@*Perform Intra prediction for chroma_8x8 mode:vertical ,described in sec 8.3.4.3 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source containing alternate U and V samples +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination with alternate U and V samples +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels(Not used in this function) +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@void ih264_intra_pred_chroma_8x8_mode_vert(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_chroma_8x8_mode_vert_a9q + +ih264_intra_pred_chroma_8x8_mode_vert_a9q: + 
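+    @ Vertical mode is a plain copy: the 16 top-neighbour bytes
+    @ (8 interleaved U/V pairs at pu1_src + 18) are replicated into
+    @ all eight rows of the destination.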
+ stmfd sp!, {r4-r12, r14} @store register values to stack + + add r0, r0, #18 + vld1.8 {q0}, [r0] + + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + +@/****************************************************************************** + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_chroma_8x8_mode_plane +@* +@* @brief +@* Perform Intra prediction for chroma_8x8 mode:PLANE +@* +@* @par Description: +@* Perform Intra prediction for chroma_8x8 mode:PLANE ,described in sec 8.3.4.4 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source containing alternate U and V samples +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination with alternate U and V samples +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_chroma_8x8_mode_plane(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + .global ih264_intra_pred_chroma_8x8_mode_plane_a9q +ih264_intra_pred_chroma_8x8_mode_plane_a9q: + + stmfd sp!, {r4-r10, r12, lr} + vpush {d8-d15} + + + vld1.32 d0, [r0] + add r10, r0, #10 + vld1.32 d1, [r10] + add r10, r10, #6 + vrev64.16 d5, d0 + vld1.32 d2, [r10]! 
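+    @ The d0-d3 loads gather the neighbour samples on either side of the
+    @ centre; the vrev64.16 reversals line the two sides up so the
+    @ plane-mode gradient differences can be taken lane by lane below.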
+ add r10, r10, #2 + vrev64.16 d7, d2 + vld1.32 d3, [r10] + sub r5, r3, #8 + ldr r12, scratch_chroma_intrapred_addr1 +scrlblc1: + add r12, r12, pc + vsubl.u8 q5, d5, d1 + vld1.64 {q4}, [r12] @ Load multiplication factors 1 to 8 into D3 + vsubl.u8 q6, d3, d7 + vmul.s16 q7, q5, q4 + vmul.s16 q8, q6, q4 + vuzp.16 q7, q8 + + vpadd.s16 d14, d14 + vpadd.s16 d15, d15 + vpadd.s16 d16, d16 + vpadd.s16 d17, d17 + vpadd.s16 d14, d14 + vpadd.s16 d15, d15 + vpadd.s16 d16, d16 + vpadd.s16 d17, d17 + + mov r6, #34 + vdup.16 q9, r6 + + vmull.s16 q11, d14, d18 + vmull.s16 q12, d15, d18 + vmull.s16 q13, d16, d18 + vmull.s16 q14, d17, d18 + + vrshrn.s32 d10, q11, #6 + vrshrn.s32 d12, q12, #6 + vrshrn.s32 d13, q13, #6 + vrshrn.s32 d14, q14, #6 + + + ldrb r6, [r0], #1 + add r10, r0, #31 + ldrb r8, [r0], #1 + ldrb r7, [r10], #1 + ldrb r9, [r10], #1 + + add r6, r6, r7 + add r8, r8, r9 + lsl r6, r6, #4 + lsl r8, r8, #4 + + vdup.16 q0, r6 + vdup.16 q1, r8 + vdup.16 q2, d12[0] + vdup.16 q3, d10[0] + + vdup.16 q12, d14[0] + vdup.16 q13, d13[0] + vzip.16 q2, q12 + vzip.16 q3, q13 + vzip.16 q0, q1 + + ldr r12, scratch_intrapred_chroma_plane_addr1 +scrlblc2: + add r12, r12, pc + vld1.64 {q4}, [r12] + vmov.16 q5, q4 + vmov q11, q4 + vzip.16 q4, q5 + + vmul.s16 q6, q2, q4 + vmul.s16 q8, q2, q5 + vadd.s16 q6, q0, q6 + vadd.s16 q8, q0, q8 + + + vdup.16 q10, d22[0] + vmul.s16 q2, q3, q10 + vdup.16 q15, d22[1] + vmul.s16 q9, q3, q10 + vmul.s16 q7, q3, q15 + vmul.s16 q4, q3, q15 + vadd.s16 q12, q6, q2 + vadd.s16 q0, q8, q9 + vadd.s16 q1, q6, q7 + vqrshrun.s16 d28, q12, #5 + vadd.s16 q13, q8, q4 + vqrshrun.s16 d29, q0, #5 + vdup.16 q10, d22[2] + vst1.8 {q14}, [r1], r3 + vqrshrun.s16 d28, q1, #5 + vqrshrun.s16 d29, q13, #5 + vmul.s16 q2, q3, q10 + vmul.s16 q9, q3, q10 + vst1.8 {q14}, [r1], r3 + vadd.s16 q12, q6, q2 + vadd.s16 q0, q8, q9 + vdup.16 q15, d22[3] + vqrshrun.s16 d28, q12, #5 + vqrshrun.s16 d29, q0, #5 + vmul.s16 q7, q3, q15 + vmul.s16 q4, q3, q15 + vst1.8 {q14}, [r1], r3 + vadd.s16 q1, q6, q7 + vadd.s16 q13, q8, q4 + vdup.16 q10, d23[0] + vqrshrun.s16 d28, q1, #5 + vqrshrun.s16 d29, q13, #5 + vmul.s16 q2, q3, q10 + vmul.s16 q9, q3, q10 + vst1.8 {q14}, [r1], r3 + vadd.s16 q12, q6, q2 + vadd.s16 q0, q8, q9 + vdup.16 q15, d23[1] + vqrshrun.s16 d28, q12, #5 + vqrshrun.s16 d29, q0, #5 + vmul.s16 q7, q3, q15 + vmul.s16 q4, q3, q15 + vst1.8 {q14}, [r1], r3 + vadd.s16 q1, q6, q7 + vadd.s16 q13, q8, q4 + vdup.16 q10, d23[2] + vqrshrun.s16 d28, q1, #5 + vqrshrun.s16 d29, q13, #5 + vmul.s16 q2, q3, q10 + vmul.s16 q9, q3, q10 + vst1.8 {q14}, [r1], r3 + vadd.s16 q12, q6, q2 + vadd.s16 q0, q8, q9 + vdup.16 q15, d23[3] + vqrshrun.s16 d28, q12, #5 + vqrshrun.s16 d29, q0, #5 + vmul.s16 q7, q3, q15 + vmul.s16 q4, q3, q15 + vst1.8 {q14}, [r1], r3 + vadd.s16 q1, q6, q7 + vadd.s16 q13, q8, q4 + vqrshrun.s16 d28, q1, #5 + vqrshrun.s16 d29, q13, #5 + vst1.8 {q14}, [r1], r3 + + + +end_func_plane: + + + vpop {d8-d15} + ldmfd sp!, {r4-r10, r12, pc} + + + + diff --git a/common/arm/ih264_intra_pred_luma_16x16_a9q.s b/common/arm/ih264_intra_pred_luma_16x16_a9q.s new file mode 100755 index 0000000..e38e203 --- /dev/null +++ b/common/arm/ih264_intra_pred_luma_16x16_a9q.s @@ -0,0 +1,520 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. 
+@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_intra_pred_luma_16x16_a9q.s +@* +@* @brief +@* Contains function definitions for intra 16x16 Luma prediction . +@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* +@* - ih264_intra_pred_luma_16x16_mode_vert_a9q() +@* - ih264_intra_pred_luma_16x16_mode_horz_a9q() +@* - ih264_intra_pred_luma_16x16_mode_dc_a9q() +@* - ih264_intra_pred_luma_16x16_mode_plane_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_intra_pred_filters.c +@ + +@/** +@/** +@/** +@ + +.text +.p2align 2 + + + .extern ih264_gai1_intrapred_luma_plane_coeffs +.hidden ih264_gai1_intrapred_luma_plane_coeffs +scratch_intrapred_addr1: + .long ih264_gai1_intrapred_luma_plane_coeffs - scrlbl1 - 8 +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_16x16_mode_vert_a9q +@* +@* @brief +@* Perform Intra prediction for luma_16x16 mode:vertical +@* +@* @par Description: +@* Perform Intra prediction for luma_16x16 mode:Vertical ,described in sec 8.3.3.1 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels(Not used in this function) +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@void ih264_intra_pred_luma_16x16_mode_vert(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_16x16_mode_vert_a9q + +ih264_intra_pred_luma_16x16_mode_vert_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + + add r0, r0, #17 + vld1.8 {q0}, [r0] + + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + +@/****************************************************************************** + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_16x16_mode_horz_a9q +@* +@* @brief 
+@* Perform Intra prediction for luma_16x16 mode:horizontal +@* +@* @par Description: +@* Perform Intra prediction for luma_16x16 mode:horizontal ,described in sec 8.3.3.2 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels(Not used in this function) +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@void ih264_intra_pred_luma_16x16_mode_horz(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + .global ih264_intra_pred_luma_16x16_mode_horz_a9q + +ih264_intra_pred_luma_16x16_mode_horz_a9q: + + stmfd sp!, {r14} @store register values to stack + + vld1.u8 {q0}, [r0] + mov r2, #14 + + vdup.u8 q1, d1[7] + vdup.u8 q2, d1[6] + vst1.8 {q1}, [r1], r3 + +loop_16x16_horz: + vext.8 q0, q0, q0, #14 + vst1.8 {q2}, [r1], r3 + vdup.u8 q1, d1[7] + subs r2, #2 + vdup.u8 q2, d1[6] + vst1.8 {q1}, [r1], r3 + bne loop_16x16_horz + + vext.8 q0, q0, q0, #14 + vst1.8 {q2}, [r1], r3 + + ldmfd sp!, {pc} @Restoring registers from stack + + + + +@/****************************************************************************** + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_16x16_mode_dc_a9q +@* +@* @brief +@* Perform Intra prediction for luma_16x16 mode:DC +@* +@* @par Description: +@* Perform Intra prediction for luma_16x16 mode:DC ,described in sec 8.3.3.3 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_16x16_mode_dc(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + .global ih264_intra_pred_luma_16x16_mode_dc_a9q + +ih264_intra_pred_luma_16x16_mode_dc_a9q: + + stmfd sp!, {r4, r14} @store register values to stack + ldr r4, [sp, #8] @r4 => ui_neighboravailability + + ands r2, r4, #0x01 @CHECKING IF LEFT_AVAILABLE ELSE BRANCHING TO ONLY TOP AVAILABLE + beq top_available + ands r2, r4, #0x04 @CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE + beq left_available + + vld1.u8 {q0}, [r0] @BOTH LEFT AND TOP AVAILABLE + add r0, r0, #17 + vpaddl.u8 q0, q0 + vld1.u8 {q1}, [r0] + vpaddl.u8 q1, q1 + vadd.u16 q0, q0, q1 + vadd.u16 d0, d0, d1 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + vqrshrun.s16 d0, q0, #5 + vdup.u8 q0, d0[0] + b str_pred + +top_available: @ONLY TOP AVAILABLE + ands r2, r4, #0x04 @CHECKING TOP AVAILABILTY OR ELSE BRANCH TO NONE AVAILABLE + beq none_available + + 
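+    @ Top-only DC: (sum of the 16 top neighbours + 8) >> 4, computed with
+    @ pairwise widening adds and a rounding narrow by 4.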
add r0, r0, #17 + vld1.u8 {q0}, [r0] + vpaddl.u8 q0, q0 + vadd.u16 d0, d0, d1 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + vqrshrun.s16 d0, q0, #4 + vdup.u8 q0, d0[0] + b str_pred + +left_available: @ONLY LEFT AVAILABLE + vld1.u8 {q0}, [r0] + vpaddl.u8 q0, q0 + vadd.u16 d0, d0, d1 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + vqrshrun.s16 d0, q0, #4 + vdup.u8 q0, d0[0] + b str_pred + +none_available: @NONE AVAILABLE + vmov.u8 q0, #128 + +str_pred: + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + + ldmfd sp!, {r4, pc} @Restoring registers from stack + + + + + +@/****************************************************************************** + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_16x16_mode_plane_a9q +@* +@* @brief +@* Perform Intra prediction for luma_16x16 mode:PLANE +@* +@* @par Description: +@* Perform Intra prediction for luma_16x16 mode:PLANE ,described in sec 8.3.3.4 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_16x16_mode_plane(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + .global ih264_intra_pred_luma_16x16_mode_plane_a9q +ih264_intra_pred_luma_16x16_mode_plane_a9q: + + stmfd sp!, {r4-r10, r12, lr} + + mov r2, r1 + add r1, r0, #17 + add r0, r0, #15 + + mov r8, #9 + sub r1, r1, #1 + mov r10, r1 @top_left + mov r4, #-1 + vld1.32 d2, [r1], r8 + ldr r7, scratch_intrapred_addr1 +scrlbl1: + add r7, r7, pc + + vld1.32 d0, [r1] + vrev64.8 d2, d2 + vld1.32 {q3}, [r7] + vsubl.u8 q0, d0, d2 + vmovl.u8 q8, d6 + vmul.s16 q0, q0, q8 + vmovl.u8 q9, d7 + + add r7, r0, r4, lsl #3 + sub r0, r7, r4, lsl #1 + rsb lr, r4, #0x0 + + vpadd.s16 d0, d0, d1 + + ldrb r8, [r7], r4 + ldrb r9, [r0], lr + + vpaddl.s16 d0, d0 + sub r12, r8, r9 + + ldrb r8, [r7], r4 + + vpaddl.s32 d0, d0 + ldrb r9, [r0], lr + sub r8, r8, r9 + vshl.s32 d2, d0, #2 + add r12, r12, r8, lsl #1 + + vadd.s32 d0, d0, d2 + ldrb r8, [r7], r4 + ldrb r9, [r0], lr + vrshr.s32 d0, d0, #6 @ i_b = D0[0] + sub r8, r8, r9 + ldrb r5, [r7], r4 + add r8, r8, r8, lsl #1 + + vdup.16 q2, d0[0] + add r12, r12, r8 + ldrb r9, [r0], lr + vmul.s16 q0, q2, q8 + sub r5, r5, r9 + vmul.s16 q1, q2, q9 + add r12, r12, r5, lsl #2 + + ldrb r8, [r7], r4 + ldrb r9, [r0], lr + sub r8, r8, r9 + ldrb r5, [r7], r4 + add r8, r8, r8, lsl #2 + ldrb r6, [r0], lr + add r12, r12, r8 + ldrb r8, [r7], r4 + ldrb r9, [r0], lr + + sub r5, r5, r6 + sub r8, r8, r9 + add r5, r5, r5, lsl #1 + rsb r8, r8, r8, lsl #3 + add r12, r12, r5, lsl #1 + ldrb r5, [r7], r4 + ldrb r6, [r10] @top_left + add r12, r12, r8 + 
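+    @ r12 accumulates the weighted left-neighbour gradient; the
+    @ (5 * sum + 32) >> 6 step below yields i_c, the vertical counterpart
+    @ of the i_b term the NEON code extracted from the top row.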
sub r9, r5, r6 + ldrb r6, [r1, #7] + add r12, r12, r9, lsl #3 @ i_c = r12 + add r8, r5, r6 + + add r12, r12, r12, lsl #2 + lsl r8, r8, #4 @ i_a = r8 + + add r12, r12, #0x20 + lsr r12, r12, #6 + + vshl.s16 q14, q2, #3 + vdup.16 q3, r12 + + vdup.16 q15, r8 + vshl.s16 q13, q3, #3 + vsub.s16 q15, q15, q14 + vsub.s16 q15, q15, q13 + vadd.s16 q14, q15, q3 + + mov r0, #14 + vadd.s16 q13, q14, q0 + vadd.s16 q14, q14, q1 + vqrshrun.s16 d20, q13, #5 + vqrshrun.s16 d21, q14, #5 + +loop_16x16_plane: + + vadd.s16 q13, q13, q3 + vadd.s16 q14, q14, q3 + vqrshrun.s16 d22, q13, #5 + vst1.32 {q10}, [r2], r3 + vqrshrun.s16 d23, q14, #5 + + vadd.s16 q13, q13, q3 + subs r0, #2 + vadd.s16 q14, q14, q3 + vqrshrun.s16 d20, q13, #5 + vst1.32 {q11}, [r2], r3 + vqrshrun.s16 d21, q14, #5 + bne loop_16x16_plane + + vadd.s16 q13, q13, q3 + vadd.s16 q14, q14, q3 + vqrshrun.s16 d22, q13, #5 + vst1.32 {q10}, [r2], r3 + vqrshrun.s16 d23, q14, #5 + vst1.32 {q11}, [r2], r3 + + ldmfd sp!, {r4-r10, r12, pc} + + + diff --git a/common/arm/ih264_intra_pred_luma_4x4_a9q.s b/common/arm/ih264_intra_pred_luma_4x4_a9q.s new file mode 100755 index 0000000..cb386ea --- /dev/null +++ b/common/arm/ih264_intra_pred_luma_4x4_a9q.s @@ -0,0 +1,842 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_intra_pred_luma_4x4_a9q.s +@* +@* @brief +@* Contains function definitions for intra 4x4 Luma prediction . 
+@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* +@* -ih264_intra_pred_luma_4x4_mode_vert_a9q +@* -ih264_intra_pred_luma_4x4_mode_horz_a9q +@* -ih264_intra_pred_luma_4x4_mode_dc_a9q +@* -ih264_intra_pred_luma_4x4_mode_diag_dl_a9q +@* -ih264_intra_pred_luma_4x4_mode_diag_dr_a9q +@* -ih264_intra_pred_luma_4x4_mode_vert_r_a9q +@* -ih264_intra_pred_luma_4x4_mode_horz_d_a9q +@* -ih264_intra_pred_luma_4x4_mode_vert_l_a9q +@* -ih264_intra_pred_luma_4x4_mode_horz_u_a9q +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_intra_pred_filters.c +@ + +@/** +@/** +@/** +@ + +.text +.p2align 2 + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_4x4_mode_vert +@* +@* @brief +@* Perform Intra prediction for luma_4x4 mode:vertical +@* +@* @par Description: +@* Perform Intra prediction for luma_4x4 mode:vertical ,described in sec 8.3.1.2.1 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels(Not used in this function) +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@void ih264_intra_pred_luma_4x4_mode_vert(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + .global ih264_intra_pred_luma_4x4_mode_vert_a9q + +ih264_intra_pred_luma_4x4_mode_vert_a9q: + + + + stmfd sp!, {r4-r12, r14} @store register values to stack + + add r0, r0, #5 + + vld1.32 d0[0], [r0] + + vst1.32 d0[0], [r1], r3 + vst1.32 d0[0], [r1], r3 + vst1.32 d0[0], [r1], r3 + vst1.32 d0[0], [r1], r3 + + + + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + +@/****************************************************************************** + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_4x4_mode_horz +@* +@* @brief +@* Perform Intra prediction for luma_4x4 mode:horizontal +@* +@* @par Description: +@* Perform Intra prediction for luma_4x4 mode:horizontal ,described in sec 8.3.1.2.2 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels(Not used in this function) +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@void ih264_intra_pred_luma_4x4_mode_horz(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + + .global ih264_intra_pred_luma_4x4_mode_horz_a9q + 
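+@/* A minimal C sketch of the equivalent logic (illustrative only, not part
+@   of this patch): the left neighbours sit at pu1_src[0..3], with the row-0
+@   neighbour at index 3, and each output row repeats its left sample.
+@
+@   WORD32 row;
+@   for(row = 0; row < 4; row++)
+@       memset(pu1_dst + row * dst_strd, pu1_src[3 - row], 4);
+@*/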
+ih264_intra_pred_luma_4x4_mode_horz_a9q:
+
+
+
+    stmfd sp!, {r4-r12, r14}          @store register values to stack
+    add r0, r0, #3
+    mov r2, #-1
+
+    ldrb r5, [r0], r2
+    vdup.u8 d0, r5
+    ldrb r6, [r0], r2
+    vst1.32 d0[0], [r1], r3
+    vdup.u8 d1, r6
+    ldrb r7, [r0], r2
+    vst1.32 d1[0], [r1], r3
+    vdup.u8 d2, r7
+    ldrb r8, [r0], r2
+    vst1.32 d2[0], [r1], r3
+    vdup.u8 d3, r8
+    vst1.32 d3[0], [r1], r3
+
+
+    ldmfd sp!, {r4-r12, pc}           @Restoring registers from stack
+
+
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_dc
+@*
+@* @brief
+@*  Perform Intra prediction for luma_4x4 mode:DC
+@*
+@* @par Description:
+@*  Perform Intra prediction for luma_4x4 mode:DC, described in sec 8.3.1.2.3
+@*
+@* @param[in] pu1_src
+@*  UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@*  availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_dc(UWORD8 *pu1_src,
+@                                       UWORD8 *pu1_dst,
+@                                       WORD32 src_strd,
+@                                       WORD32 dst_strd,
+@                                       WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@   r0 => *pu1_src
+@   r1 => *pu1_dst
+@   r2 => src_strd
+@   r3 => dst_strd
+@   r4 => ui_neighboravailability
+
+
+
+    .global ih264_intra_pred_luma_4x4_mode_dc_a9q
+
+ih264_intra_pred_luma_4x4_mode_dc_a9q:
+
+
+
+    stmfd sp!, {r4-r12, r14}          @store register values to stack
+    ldr r4, [sp, #40]                 @ r4 => ui_neighboravailability
+
+    ands r5, r4, #0x01
+    beq top_available                 @LEFT NOT AVAILABLE
+
+    add r10, r0, #3
+    mov r2, #-1
+    ldrb r5, [r10], r2
+    ldrb r6, [r10], r2
+    ldrb r7, [r10], r2
+    add r5, r5, r6
+    ldrb r8, [r10], r2
+    add r5, r5, r7
+    ands r11, r4, #0x04               @ CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
+    add r5, r5, r8
+    beq left_available
+    add r10, r0, #5
+    @ BOTH LEFT AND TOP AVAILABLE
+    ldrb r6, [r10], #1
+    ldrb r7, [r10], #1
+    add r5, r5, r6
+    ldrb r8, [r10], #1
+    add r5, r5, r7
+    ldrb r9, [r10], #1
+    add r5, r5, r8
+    add r5, r5, r9
+    add r5, r5, #4
+    lsr r5, r5, #3
+    vdup.u8 d0, r5
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    b end_func
+
+top_available: @ ONLY TOP AVAILABLE
+    ands r11, r4, #0x04               @ CHECKING TOP AVAILABILITY OR ELSE BRANCH TO NONE AVAILABLE
+    beq none_available
+
+    add r10, r0, #5
+    ldrb r6, [r10], #1
+    ldrb r7, [r10], #1
+    ldrb r8, [r10], #1
+    add r5, r6, r7
+    ldrb r9, [r10], #1
+    add r5, r5, r8
+    add r5, r5, r9
+    add r5, r5, #2
+    lsr r5, r5, #2
+    vdup.u8 d0, r5
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    b end_func
+
+left_available: @ONLY LEFT AVAILABLE
+    add r5, r5, #2
+    lsr r5, r5, #2
+    vdup.u8 d0, r5
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    b end_func
+
+none_available: @NONE AVAILABLE
+    mov r5, #128
+    vdup.u8 d0, r5
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    b end_func
+
+
+end_func:
+    ldmfd sp!, {r4-r12, pc}           @Restoring registers from stack
+
+
+
+
+
+
+
+@/**
+@******************************************************************************* +@* +@*ih264_intra_pred_luma_4x4_mode_diag_dl +@* +@* @brief +@* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left +@* +@* @par Description: +@* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left ,described in sec 8.3.1.2.4 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_4x4_mode_diag_dl(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_4x4_mode_diag_dl_a9q + +ih264_intra_pred_luma_4x4_mode_diag_dl_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + + add r0, r0, #5 + sub r5, r3, #2 + add r6, r0, #7 + vld1.8 {d0}, [r0] + vext.8 d1, d0, d0, #1 + vext.8 d2, d0, d0, #2 + vld1.8 {d2[6]}, [r6] + vaddl.u8 q10, d0, d1 + vaddl.u8 q11, d1, d2 + vadd.u16 q12, q10, q11 + vqrshrun.s16 d3, q12, #2 + vst1.32 {d3[0]}, [r1], r3 + vext.8 d4, d3, d3, #1 + vst1.32 {d4[0]}, [r1], r3 + vst1.16 {d3[1]}, [r1]! + vst1.16 {d3[2]}, [r1], r5 + vst1.16 {d4[1]}, [r1]! + vst1.16 {d4[2]}, [r1] + +end_func_diag_dl: + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + + + + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_4x4_mode_diag_dr +@* +@* @brief +@* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right +@* +@* @par Description: +@* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right ,described in sec 8.3.1.2.5 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_4x4_mode_diag_dr(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_4x4_mode_diag_dr_a9q + +ih264_intra_pred_luma_4x4_mode_diag_dr_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + + + vld1.u8 {d0}, [r0] + add r0, r0, #1 + vld1.u8 {d1}, [r0] + vext.8 d2, d1, d1, #1 + vaddl.u8 q10, d0, d1 + vaddl.u8 q11, d1, d2 + vadd.u16 q12, q10, q11 + vqrshrun.s16 d3, q12, #2 + + vext.8 d4, d3, d3, #1 + sub r5, r3, #2 + vst1.16 {d4[1]}, [r1]! + vst1.16 {d4[2]}, [r1], r5 + vst1.16 {d3[1]}, [r1]! 
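+    @ d3 holds the three-tap smoothed diagonal, (a + 2*b + c + 2) >> 2, and
+    @ d4 its one-byte shift; each row stores a four-byte window one step
+    @ further along that diagonal.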
+ vst1.16 {d3[2]}, [r1], r5 + vst1.32 {d4[0]}, [r1], r3 + vst1.32 {d3[0]}, [r1], r3 + +end_func_diag_dr: + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_4x4_mode_vert_r +@* +@* @brief +@* Perform Intra prediction for luma_4x4 mode:Vertical_Right +@* +@* @par Description: +@* Perform Intra prediction for luma_4x4 mode:Vertical_Right ,described in sec 8.3.1.2.6 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_4x4_mode_vert_r(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_4x4_mode_vert_r_a9q + +ih264_intra_pred_luma_4x4_mode_vert_r_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + + + vld1.u8 {d0}, [r0] + add r0, r0, #1 + vld1.u8 {d1}, [r0] + vext.8 d2, d1, d1, #1 + vaddl.u8 q10, d0, d1 + vaddl.u8 q11, d1, d2 + vadd.u16 q12, q10, q11 + vqrshrun.s16 d4, q10, #1 + vqrshrun.s16 d3, q12, #2 + sub r5, r3, #2 + vext.8 d5, d3, d3, #3 + vst1.32 {d4[1]}, [r1], r3 + vst1.32 {d5[0]}, [r1], r3 + sub r8, r3, #3 + vst1.u8 {d3[2]}, [r1]! + vst1.16 {d4[2]}, [r1]! + vst1.u8 {d4[6]}, [r1], r8 + vst1.u8 {d3[1]}, [r1]! + vst1.16 {d5[0]}, [r1]! 
+ vst1.u8 {d5[2]}, [r1] + + +end_func_vert_r: + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_4x4_mode_horz_d +@* +@* @brief +@* Perform Intra prediction for luma_4x4 mode:Horizontal_Down +@* +@* @par Description: +@* Perform Intra prediction for luma_4x4 mode:Horizontal_Down ,described in sec 8.3.1.2.7 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_4x4_mode_horz_d(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_4x4_mode_horz_d_a9q + +ih264_intra_pred_luma_4x4_mode_horz_d_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + + vld1.u8 {d0}, [r0] + add r0, r0, #1 + vld1.u8 {d1}, [r0] + vext.8 d2, d1, d0, #1 + vaddl.u8 q10, d0, d1 + vaddl.u8 q11, d1, d2 + vadd.u16 q12, q10, q11 + vqrshrun.s16 d4, q10, #1 + vqrshrun.s16 d5, q12, #2 + sub r5, r3, #2 + vmov.8 d6, d5 + vtrn.8 d4, d5 @ + vst1.u16 {d5[1]}, [r1]! + vst1.16 {d6[2]}, [r1], r5 + vst1.u16 {d4[1]}, [r1]! + vst1.16 {d5[1]}, [r1], r5 + vst1.u16 {d5[0]}, [r1]! + vst1.16 {d4[1]}, [r1], r5 + vst1.u16 {d4[0]}, [r1]! 
+ vst1.16 {d5[0]}, [r1], r5 + +end_func_horz_d: + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_4x4_mode_vert_l +@* +@* @brief +@* Perform Intra prediction for luma_4x4 mode:Vertical_Left +@* +@* @par Description: +@* Perform Intra prediction for luma_4x4 mode:Vertical_Left ,described in sec 8.3.1.2.8 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_4x4_mode_vert_l(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_4x4_mode_vert_l_a9q + +ih264_intra_pred_luma_4x4_mode_vert_l_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + add r0, r0, #4 + vld1.u8 {d0}, [r0] + add r0, r0, #1 + vld1.u8 {d1}, [r0] + vext.8 d2, d1, d0, #1 + vaddl.u8 q10, d0, d1 + vaddl.u8 q11, d1, d2 + vadd.u16 q12, q10, q11 + vqrshrun.s16 d4, q10, #1 + vqrshrun.s16 d5, q12, #2 + vext.8 d6, d4, d4, #1 + vext.8 d7, d5, d5, #1 + vst1.32 {d6[0]}, [r1], r3 + vext.8 d16, d4, d4, #2 + vext.8 d17, d5, d5, #2 + vst1.32 {d7[0]}, [r1], r3 + vst1.32 {d16[0]}, [r1], r3 + vst1.32 {d17[0]}, [r1], r3 + + + +end_func_vert_l: + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_4x4_mode_horz_u +@* +@* @brief +@* Perform Intra prediction for luma_4x4 mode:Horizontal_Up +@* +@* @par Description: +@* Perform Intra prediction for luma_4x4 mode:Horizontal_Up ,described in sec 8.3.1.2.9 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_4x4_mode_horz_u(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_4x4_mode_horz_u_a9q + +ih264_intra_pred_luma_4x4_mode_horz_u_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + mov r10, r0 + vld1.u8 {d0}, [r0] + ldrb r9, [r0], #1 + vext.8 d1, d0, d0, #1 + vld1.u8 {d0[7]}, [r10] + vext.8 d2, d1, d1, #1 + vaddl.u8 q10, d0, d1 + vaddl.u8 q11, d1, d2 + vadd.u16 q12, q10, q11 + vqrshrun.s16 d4, q10, #1 + vqrshrun.s16 d5, q12, #2 + vmov d6, d4 + vext.8 d6, d5, d4, #1 + vst1.8 {d4[2]}, [r1]! 
+ vst1.8 {d6[0]}, [r1]! + vtrn.8 d6, d5 @ + sub r5, r3, #2 + vtrn.8 d4, d6 @ + vdup.8 d7, r9 + vst1.16 {d6[0]}, [r1], r5 + vst1.16 {d6[0]}, [r1]! + vst1.16 {d5[3]}, [r1], r5 + vst1.16 {d5[3]}, [r1]! + vst1.16 {d7[3]}, [r1], r5 + vst1.32 {d7[0]}, [r1], r3 + +end_func_horz_u: + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + diff --git a/common/arm/ih264_intra_pred_luma_8x8_a9q.s b/common/arm/ih264_intra_pred_luma_8x8_a9q.s new file mode 100755 index 0000000..6da1c95 --- /dev/null +++ b/common/arm/ih264_intra_pred_luma_8x8_a9q.s @@ -0,0 +1,1037 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_intra_pred_luma_8x8_a9q.s +@* +@* @brief +@* Contains function definitions for intra 8x8 Luma prediction . +@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* +@* -ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q +@* -ih264_intra_pred_luma_8x8_mode_vert_a9q +@* -ih264_intra_pred_luma_8x8_mode_horz_a9q +@* -ih264_intra_pred_luma_8x8_mode_dc_a9q +@* -ih264_intra_pred_luma_8x8_mode_diag_dl_a9q +@* -ih264_intra_pred_luma_8x8_mode_diag_dr_a9q +@* -ih264_intra_pred_luma_8x8_mode_vert_r_a9q +@* -ih264_intra_pred_luma_8x8_mode_horz_d_a9q +@* -ih264_intra_pred_luma_8x8_mode_vert_l_a9q +@* -ih264_intra_pred_luma_8x8_mode_horz_u_a9q +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_intra_pred_filters.c +@ + +@/** +@/** +@/** +@ + + +.text +.p2align 2 + + .extern ih264_gai1_intrapred_luma_8x8_horz_u +.hidden ih264_gai1_intrapred_luma_8x8_horz_u +scratch_intrapred_addr_8x8: + .long ih264_gai1_intrapred_luma_8x8_horz_u - scrlb8x8l2 - 8 + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_8x8_mode_ref_filtering +@* +@* @brief +@* Reference sample filtering process for Intra_8x8 sample prediction +@* +@* @par Description: +@* Perform Reference sample filtering process for Intra_8x8 sample prediction ,described in sec 8.3.2.2.1 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride [Not used] +@* +@* @param[in] dst_strd +@* integer destination stride[Not used] +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels[Not used] +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_8x8_mode_ref_filtering(UWORD8 *pu1_src, +@ 
UWORD8 *pu1_dst)
+
+@**************Variables Vs Registers*****************************************
+@   r0 => *pu1_src
+@   r1 => *pu1_dst
+
+
+    .global ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q
+
+ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q:
+
+    stmfd sp!, {r4-r12, r14}          @store register values to stack
+    vpush {d8-d15}
+
+
+    vld1.u8 {q0}, [r0]!               @
+    vld1.u8 {q1}, [r0]
+    add r0, r0, #8                    @
+    vext.8 q2, q0, q1, #1
+    vext.8 q3, q1, q1, #1
+    vext.8 q4, q2, q3, #1
+    vext.8 q5, q3, q3, #1
+    vld1.8 {d10[7]}, [r0]             @ LOADING SRC[24] AGAIN TO THE END FOR p'[ 15, -1 ] = ( p[ 14, -1 ] + 3 * p[ 15, -1 ] + 2 ) >> 2
+    vaddl.u8 q10, d0, d4
+    vaddl.u8 q7, d0, d0               @ SPECIAL CASE FOR p'[ -1, 7 ] = ( p[ -1, 6 ] + 3 * p[ -1, 7 ] + 2 ) >> 2
+    vadd.u16 q7, q10, q7
+    vaddl.u8 q11, d1, d5
+    vqrshrun.s16 d14, q7, #2
+    vaddl.u8 q12, d4, d8
+    vaddl.u8 q13, d5, d9
+    vst1.8 {d14[0]}, [r1]!
+    vadd.u16 q12, q10, q12
+    vadd.u16 q13, q11, q13
+    vaddl.u8 q9, d2, d6
+    vaddl.u8 q8, d6, d10
+    vqrshrun.s16 d4, q12, #2
+    vqrshrun.s16 d5, q13, #2
+    vadd.u16 q6, q8, q9
+    vst1.8 {q2}, [r1]!
+    vqrshrun.s16 d6, q6, #2
+    vst1.8 {d6}, [r1]
+
+
+end_func_ref_filt:
+    vpop {d8-d15}
+    ldmfd sp!, {r4-r12, pc}           @Restoring registers from stack
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_vert
+@*
+@* @brief
+@*  Perform Intra prediction for luma_8x8 mode:vertical
+@*
+@* @par Description:
+@*  Perform Intra prediction for luma_8x8 mode:vertical, described in sec 8.3.2.2.2
+@*
+@* @param[in] pu1_src
+@*  UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@*  availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@void ih264_intra_pred_luma_8x8_mode_vert(UWORD8 *pu1_src,
+@                                         UWORD8 *pu1_dst,
+@                                         WORD32 src_strd,
+@                                         WORD32 dst_strd,
+@                                         WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@   r0 => *pu1_src
+@   r1 => *pu1_dst
+@   r2 => src_strd
+@   r3 => dst_strd
+@   r4 => ui_neighboravailability
+
+
+    .global ih264_intra_pred_luma_8x8_mode_vert_a9q
+
+ih264_intra_pred_luma_8x8_mode_vert_a9q:
+
+    stmfd sp!, {r4-r12, r14}          @store register values to stack
+
+    add r0, r0, #9
+    vld1.8 d0, [r0]
+
+    vst1.8 d0, [r1], r3
+    vst1.8 d0, [r1], r3
+    vst1.8 d0, [r1], r3
+    vst1.8 d0, [r1], r3
+    vst1.8 d0, [r1], r3
+    vst1.8 d0, [r1], r3
+    vst1.8 d0, [r1], r3
+    vst1.8 d0, [r1], r3
+
+    ldmfd sp!, {r4-r12, pc}           @Restoring registers from stack
+
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_horz
+@*
+@* @brief
+@*  Perform Intra prediction for luma_8x8 mode:horizontal
+@*
+@* @par Description:
+@*  Perform Intra prediction for luma_8x8 mode:horizontal, described in sec 8.3.2.2.2
+@*
+@* @param[in] pu1_src
+@*  UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@*  UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@*  integer source stride
+@*
+@* @param[in] dst_strd
+@*  integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@*  availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@void ih264_intra_pred_luma_8x8_mode_horz(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_8x8_mode_horz_a9q + +ih264_intra_pred_luma_8x8_mode_horz_a9q: + + stmfd sp!, {r14} @store register values to stack + + vld1.u8 {d0}, [r0] + mov r2, #6 + + vdup.u8 d1, d0[7] + vdup.u8 d2, d0[6] + vst1.8 {d1}, [r1], r3 + +loop_8x8_horz: + vext.8 d0, d0, d0, #6 + vst1.8 {d2}, [r1], r3 + vdup.u8 d1, d0[7] + subs r2, #2 + vdup.u8 d2, d0[6] + vst1.8 {d1}, [r1], r3 + bne loop_8x8_horz + + vext.8 d0, d0, d0, #6 + vst1.8 {d2}, [r1], r3 + + ldmfd sp!, {pc} @restoring registers from stack + + + + + +@/****************************************************************************** + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_8x8_mode_dc +@* +@* @brief +@* Perform Intra prediction for luma_8x8 mode:DC +@* +@* @par Description: +@* Perform Intra prediction for luma_8x8 mode:DC ,described in sec 8.3.2.2.3 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_8x8_mode_dc(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_8x8_mode_dc_a9q + +ih264_intra_pred_luma_8x8_mode_dc_a9q: + + stmfd sp!, {r4, r14} @store register values to stack + ldr r4, [sp, #8] @r4 => ui_neighboravailability + + ands r2, r4, #0x01 @CHECKING IF LEFT_AVAILABLE ELSE BRANCHING TO ONLY TOP AVAILABLE + beq top_available + ands r2, r4, #0x04 @CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE + beq left_available + + vld1.u8 {d0}, [r0] @BOTH LEFT AND TOP AVAILABLE + add r0, r0, #9 + vld1.u8 {d1}, [r0] + vpaddl.u8 q0, q0 + vadd.u16 d0, d0, d1 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + vqrshrun.s16 d0, q0, #4 + vdup.u8 d0, d0[0] + b str_pred + +top_available: @ONLY TOP AVAILABLE + ands r2, r4, #0x04 @CHECKING TOP AVAILABILTY OR ELSE BRANCH TO NONE AVAILABLE + beq none_available + + add r0, r0, #9 + vld1.u8 {d0}, [r0] + vpaddl.u8 d0, d0 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + vqrshrun.s16 d0, q0, #3 + vdup.u8 d0, d0[0] + b str_pred + +left_available: @ONLY LEFT AVAILABLE + vld1.u8 {d0}, [r0] + vpaddl.u8 d0, d0 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + vqrshrun.s16 d0, q0, #3 + vdup.u8 d0, d0[0] + b str_pred + +none_available: @NONE AVAILABLE + vmov.u8 q0, #128 + +str_pred: + vst1.8 {d0}, [r1], r3 + vst1.8 {d0}, [r1], r3 + vst1.8 {d0}, [r1], r3 + vst1.8 {d0}, [r1], r3 + vst1.8 {d0}, [r1], r3 + vst1.8 {d0}, [r1], r3 + vst1.8 {d0}, [r1], r3 + vst1.8 {d0}, [r1], 
r3 + + ldmfd sp!, {r4, pc} @Restoring registers from stack + + + + + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_8x8_mode_diag_dl +@* +@* @brief +@* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left +@* +@* @par Description: +@* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left ,described in sec 8.3.2.2.4 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_8x8_mode_diag_dl(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + .global ih264_intra_pred_luma_8x8_mode_diag_dl_a9q + +ih264_intra_pred_luma_8x8_mode_diag_dl_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + + add r0, r0, #9 + sub r5, r3, #4 + add r6, r0, #15 + vld1.8 {q0}, [r0] + vext.8 q2, q0, q0, #2 + vext.8 q1, q0, q0, #1 + vld1.8 {d5[6]}, [r6] + @ q1 = q0 shifted to left once + @ q2 = q1 shifted to left once + vaddl.u8 q10, d0, d2 @Adding for FILT121 + vaddl.u8 q11, d1, d3 + vaddl.u8 q12, d2, d4 + vaddl.u8 q13, d3, d5 + vadd.u16 q12, q10, q12 + vadd.u16 q13, q11, q13 + + vqrshrun.s16 d4, q12, #2 + vqrshrun.s16 d5, q13, #2 + @Q2 has all FILT121 values + vst1.8 {d4}, [r1], r3 + vext.8 q9, q2, q2, #1 + vext.8 q8, q9, q9, #1 + vst1.8 {d18}, [r1], r3 + vext.8 q15, q8, q8, #1 + vst1.8 {d16}, [r1], r3 + vst1.8 {d30}, [r1], r3 + vst1.32 {d4[1]}, [r1]! + vst1.32 {d5[0]}, [r1], r5 + vst1.32 {d18[1]}, [r1]! + vst1.32 {d19[0]}, [r1], r5 + vst1.32 {d16[1]}, [r1]! + vst1.32 {d17[0]}, [r1], r5 + vst1.32 {d30[1]}, [r1]! 
+ vst1.32 {d31[0]}, [r1], r5 + + +end_func_diag_dl: + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_8x8_mode_diag_dr +@* +@* @brief +@* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right +@* +@* @par Description: +@* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right ,described in sec 8.3.2.2.5 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_8x8_mode_diag_dr(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_8x8_mode_diag_dr_a9q + +ih264_intra_pred_luma_8x8_mode_diag_dr_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + + + vld1.u8 {q0}, [r0] + add r0, r0, #1 + vld1.u8 {q1}, [r0] + vext.8 q2, q1, q1, #1 + @ q1 = q0 shifted to left once + @ q2 = q1 shifted to left once + vaddl.u8 q10, d0, d2 @Adding for FILT121 + vaddl.u8 q11, d1, d3 + vaddl.u8 q12, d2, d4 + vaddl.u8 q13, d3, d5 + vadd.u16 q12, q10, q12 + vadd.u16 q13, q11, q13 + vqrshrun.s16 d4, q12, #2 + vqrshrun.s16 d5, q13, #2 + @Q2 has all FILT121 values + sub r5, r3, #4 + vext.8 q9, q2, q2, #15 + vst1.8 {d19}, [r1], r3 + vext.8 q8, q9, q9, #15 + vst1.8 {d17}, [r1], r3 + vext.8 q15, q8, q8, #15 + vst1.8 {d31}, [r1], r3 + vst1.32 {d4[1]}, [r1]! + vst1.32 {d5[0]}, [r1], r5 + vst1.32 {d18[1]}, [r1]! + vst1.32 {d19[0]}, [r1], r5 + vst1.32 {d16[1]}, [r1]! + vst1.32 {d17[0]}, [r1], r5 + vst1.32 {d30[1]}, [r1]! 
+ vst1.32 {d31[0]}, [r1], r5 + vst1.8 {d4}, [r1], r3 + +end_func_diag_dr: + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_8x8_mode_vert_r +@* +@* @brief +@* Perform Intra prediction for luma_8x8 mode:Vertical_Right +@* +@* @par Description: +@* Perform Intra prediction for luma_8x8 mode:Vertical_Right ,described in sec 8.3.2.2.6 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_8x8_mode_vert_r(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_8x8_mode_vert_r_a9q + +ih264_intra_pred_luma_8x8_mode_vert_r_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + + vld1.u8 {q0}, [r0] + add r0, r0, #1 + vld1.u8 {q1}, [r0] + vext.8 q2, q1, q1, #1 + @ q1 = q0 shifted to left once + @ q2 = q1 shifted to left once + vaddl.u8 q10, d0, d2 + vaddl.u8 q11, d1, d3 + vaddl.u8 q12, d2, d4 + vaddl.u8 q13, d3, d5 + vadd.u16 q12, q10, q12 + vadd.u16 q13, q11, q13 + + vqrshrun.s16 d4, q10, #1 + vqrshrun.s16 d5, q11, #1 + vqrshrun.s16 d6, q12, #2 + vqrshrun.s16 d7, q13, #2 + @Q2 has all FILT11 values + @Q3 has all FILT121 values + sub r5, r3, #6 + sub r6, r3, #4 + vst1.8 {d5}, [r1], r3 @ row 0 + vext.8 q9, q3, q3, #15 + vmov.8 q11, q9 + vext.8 q8, q2, q2, #1 + vst1.8 {d19}, [r1], r3 @row 1 + + vmov.8 q15, q8 + vext.8 q10, q2, q2, #15 + vuzp.8 q8, q9 + @row 2 + vext.8 q14, q8, q8, #1 + vst1.8 {d21}, [r1] + vst1.8 {d6[6]}, [r1], r3 + @row 3 + + vst1.16 {d29[1]}, [r1]! + vst1.32 {d7[0]}, [r1]! + vst1.16 {d7[2]}, [r1], r5 +@row 4 + vst1.16 {d19[1]}, [r1]! + vst1.32 {d5[0]}, [r1]! + vst1.16 {d5[2]}, [r1], r5 + +@row 5 + vext.8 q13, q9, q9, #1 + vst1.16 {d17[1]}, [r1]! + vst1.32 {d23[0]}, [r1]! + vst1.16 {d23[2]}, [r1], r5 + + +@row 6 + vst1.16 {d27[0]}, [r1]! + vst1.8 {d27[2]}, [r1]! + vst1.8 {d5[0]}, [r1]! + vst1.32 {d31[0]}, [r1], r6 +@row 7 + vst1.32 {d29[0]}, [r1]! + vst1.32 {d7[0]}, [r1]! 
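+
+ @ The FILT11 / FILT121 values used above (and in the other 8x8 modes in
+ @ this file) are the two H.264 neighbour-smoothing filters. As a C
+ @ sketch (illustrative names only, not the library's reference code):
+ @     FILT11(a, b)     = (a + b + 1) >> 1;        /* vqrshrun #1 */
+ @     FILT121(a, b, c) = (a + 2*b + c + 2) >> 2;  /* vqrshrun #2 */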
+ + + +end_func_vert_r: + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_8x8_mode_horz_d +@* +@* @brief +@* Perform Intra prediction for luma_8x8 mode:Horizontal_Down +@* +@* @par Description: +@* Perform Intra prediction for luma_8x8 mode:Horizontal_Down ,described in sec 8.3.2.2.7 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_8x8_mode_horz_d(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + .global ih264_intra_pred_luma_8x8_mode_horz_d_a9q + +ih264_intra_pred_luma_8x8_mode_horz_d_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + vpush {d8-d15} + + vld1.u8 {q0}, [r0] + add r0, r0, #1 + vld1.u8 {q1}, [r0] + vext.8 q2, q1, q1, #1 + @ q1 = q0 shifted to left once + @ q2 = q1 shifted to left once + vaddl.u8 q10, d0, d2 + vaddl.u8 q11, d1, d3 + vaddl.u8 q12, d2, d4 + vaddl.u8 q13, d3, d5 + vadd.u16 q12, q10, q12 + vadd.u16 q13, q11, q13 + + vqrshrun.s16 d4, q10, #1 + vqrshrun.s16 d5, q11, #1 + vqrshrun.s16 d6, q12, #2 + vqrshrun.s16 d7, q13, #2 + @Q2 has all FILT11 values + @Q3 has all FILT121 values + vmov.8 q4, q2 + vmov.8 q5, q3 + sub r6, r3, #6 + vtrn.8 q4, q5 @ + vmov.8 q6, q4 + vmov.8 q7, q5 + sub r5, r3, #4 + vtrn.16 q6, q7 + vext.8 q8, q3, q3, #14 + @ROW 0 + vst1.8 {d17}, [r1] + vst1.16 {d10[3]}, [r1], r3 + + @ROW 1 + vst1.32 {d14[1]}, [r1]! + vst1.32 {d7[0]}, [r1], r5 + @ROW 2 + vst1.16 {d10[2]}, [r1]! + vst1.32 {d14[1]}, [r1]! + vst1.16 {d7[0]}, [r1], r6 + @ROW 3 + vst1.32 {d12[1]}, [r1]! + vst1.32 {d14[1]}, [r1], r5 + @ROW 4 + vst1.16 {d14[1]}, [r1]! + vst1.32 {d12[1]}, [r1]! + vst1.16 {d14[2]}, [r1], r6 + @ROW 5 + vst1.32 {d14[0]}, [r1]! + vst1.32 {d12[1]}, [r1], r5 + @ROW 6 + vst1.16 {d10[0]}, [r1]! + vst1.16 {d8[1]}, [r1]! + vst1.16 {d14[1]}, [r1]! + vst1.16 {d12[2]}, [r1], r6 + @ROW 7 + vst1.32 {d12[0]}, [r1]! 
+ vst1.32 {d14[0]}, [r1], r5 + +end_func_horz_d: + vpop {d8-d15} + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_8x8_mode_vert_l +@* +@* @brief +@* Perform Intra prediction for luma_8x8 mode:Vertical_Left +@* +@* @par Description: +@* Perform Intra prediction for luma_8x8 mode:Vertical_Left ,described in sec 8.3.2.2.8 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_8x8_mode_vert_l(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_8x8_mode_vert_l_a9q + +ih264_intra_pred_luma_8x8_mode_vert_l_a9q: + + stmfd sp!, {r4-r12, r14} @Restoring registers from stack + vpush {d8-d15} + add r0, r0, #9 + vld1.u8 {q0}, [r0] + add r0, r0, #1 + vld1.u8 {q1}, [r0] + vext.8 q2, q1, q1, #1 + vaddl.u8 q10, d0, d2 + vaddl.u8 q11, d1, d3 + vaddl.u8 q12, d2, d4 + vaddl.u8 q13, d3, d5 + vadd.u16 q12, q10, q12 + vadd.u16 q13, q11, q13 + + vqrshrun.s16 d4, q10, #1 + vqrshrun.s16 d5, q11, #1 + vqrshrun.s16 d6, q12, #2 + vext.8 q4, q2, q2, #1 + vqrshrun.s16 d7, q13, #2 + @Q2 has all FILT11 values + @Q3 has all FILT121 values + + vext.8 q5, q3, q3, #1 + @ROW 0,1 + vst1.8 {d4}, [r1], r3 + vst1.8 {d6}, [r1], r3 + + vext.8 q6, q4, q4, #1 + vext.8 q7, q5, q5, #1 + @ROW 2,3 + vst1.8 {d8}, [r1], r3 + vst1.8 {d10}, [r1], r3 + + vext.8 q8, q6, q6, #1 + vext.8 q9, q7, q7, #1 + @ROW 4,5 + vst1.8 {d12}, [r1], r3 + vst1.8 {d14}, [r1], r3 + @ROW 6,7 + vst1.8 {d16}, [r1], r3 + vst1.8 {d18}, [r1], r3 + +end_func_vert_l: + vpop {d8-d15} + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_8x8_mode_horz_u +@* +@* @brief +@* Perform Intra prediction for luma_8x8 mode:Horizontal_Up +@* +@* @par Description: +@* Perform Intra prediction for luma_8x8 mode:Horizontal_Up ,described in sec 8.3.2.2.9 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_8x8_mode_horz_u(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + .global ih264_intra_pred_luma_8x8_mode_horz_u_a9q + 
+ih264_intra_pred_luma_8x8_mode_horz_u_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + vpush {d8-d15} + + vld1.u8 {q0}, [r0] + vld1.u8 {d1[7]}, [r0] + vext.8 q1, q0, q0, #1 + vext.8 q2, q1, q1, #1 + @ LOADING V TABLE + ldr r12, scratch_intrapred_addr_8x8 +scrlb8x8l2: + add r12, r12, pc + vaddl.u8 q10, d0, d2 + vaddl.u8 q11, d1, d3 + vaddl.u8 q12, d2, d4 + vaddl.u8 q13, d3, d5 + vadd.u16 q12, q10, q12 + vadd.u16 q13, q11, q13 + vld1.u8 {q5}, [r12] + vqrshrun.s16 d4, q10, #1 + vqrshrun.s16 d5, q11, #1 + vqrshrun.s16 d6, q12, #2 + vqrshrun.s16 d7, q13, #2 + @Q2 has all FILT11 values + @Q3 has all FILT121 values + vtbl.u8 d12, {q2, q3}, d10 + vdup.u8 q7, d5[7] @ + vtbl.u8 d13, {q2, q3}, d11 + vext.8 q8, q6, q7, #2 + vext.8 q9, q8, q7, #2 + vst1.8 {d12}, [r1], r3 + vext.8 q10, q9, q7, #2 + vst1.8 {d16}, [r1], r3 + vst1.8 {d18}, [r1], r3 + vst1.8 {d20}, [r1], r3 + vst1.8 {d13}, [r1], r3 + vst1.8 {d17}, [r1], r3 + vst1.8 {d19}, [r1], r3 + vst1.8 {d21}, [r1], r3 + + +end_func_horz_u: + vpop {d8-d15} + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + + + + diff --git a/common/arm/ih264_iquant_itrans_recon_a9.s b/common/arm/ih264_iquant_itrans_recon_a9.s new file mode 100755 index 0000000..f71ca69 --- /dev/null +++ b/common/arm/ih264_iquant_itrans_recon_a9.s @@ -0,0 +1,871 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +@*/ +@/** +@ ******************************************************************************* +@ * @file +@ * ih264_iquant_itrans_recon_a9.s +@ * +@ * @brief +@ * Contains function definitions for single stage inverse transform +@ * +@ * @author +@ * Mohit +@ * Harinarayanaan +@ * +@ * @par List of Functions: +@ * - ih264_iquant_itrans_recon_4x4_a9() +@ * - ih264_iquant_itrans_recon_8x8_a9() +@ * - ih264_iquant_itrans_recon_chroma_4x4_a9() +@ * +@ * @remarks +@ * None +@ * +@ ******************************************************************************* +@*/ +@/** +@ ******************************************************************************* +@ * +@ * @brief +@ * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block +@ * +@ * @par Description: +@ * Performs inverse transform Ci4 and adds the residue to get the +@ * reconstructed block +@ * +@ * @param[in] pi2_src +@ * Input 4x4 coefficients +@ * +@ * @param[in] pu1_pred +@ * Prediction 4x4 block +@ * +@ * @param[out] pu1_out +@ * Output 4x4 block +@ * +@ * @param[in] u4_qp_div_6 +@ * QP +@ * +@ * @param[in] pu2_weigh_mat +@ * Pointer to weight matrix +@ * +@ * @param[in] pred_strd, +@ * Prediction stride +@ * +@ * @param[in] out_strd +@ * Output Stride +@ * +@ *@param[in] pi2_tmp +@ * temporary buffer of size 1*16 +@ * +@ * @param[in] pu2_iscal_mat +@ * Pointer to the inverse quantization matrix +@ * +@ * @returns Void +@ * +@ * @remarks +@ * None +@ * +@ ******************************************************************************* +@ */ +@void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src, +@ UWORD8 *pu1_pred, +@ UWORD8 *pu1_out, +@ WORD32 pred_strd, +@ WORD32 out_strd, +@ const UWORD16 *pu2_iscal_mat, +@ const UWORD16 *pu2_weigh_mat, +@ UWORD32 u4_qp_div_6, +@ WORD32 *pi4_tmp, +@ WORD32 iq_start_idx +@ WORD16 *pi2_dc_ld_addr) +@**************Variables Vs Registers***************************************** +@r0 => *pi2_src +@r1 => *pu1_pred +@r2 => *pu1_out +@r3 => pred_strd +@r4 => out_strd +@r5 => *pu2_iscal_mat +@r6 => *pu2_weigh_mat +@r7 => u4_qp_div_6 +@r8 => iq_start_idx +@r10=> pi2_dc_ld_addr +.text +.p2align 2 + + .global ih264_iquant_itrans_recon_4x4_a9 + +ih264_iquant_itrans_recon_4x4_a9: + +@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4 +@If the macro value changes need to change the instruction according to it. 
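+
+@The dequant stage implemented below, as a rough C sketch (main path only;
+@the intra DC replacement via pi2_dc_ld_addr is omitted, and the function
+@name is illustrative; WORD16/UWORD16/UWORD32 are the codec's typedefs):
+@
+@void iquant_4x4_sketch(WORD16 *pi2_src, WORD16 *pi2_out,
+@                       const UWORD16 *pu2_iscal_mat,
+@                       const UWORD16 *pu2_weigh_mat,
+@                       UWORD32 u4_qp_div_6)
+@{
+@    WORD32 i;
+@    for(i = 0; i < 16; i++)
+@    {
+@        WORD32 q = pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i];
+@        q <<= u4_qp_div_6;          /* scale by 2^(qP/6) : vshl       */
+@        pi2_out[i] = (q + 8) >> 4;  /* rounding shift    : vqrshrn #4 */
+@    }
+@}
+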
+@Only one shift is done in the horizontal inverse because,
+@if u4_qp_div_6 is less than 4, the net shift (u4_qp_div_6 - 4) is negative, i.e. a rounded right shift, so rnd_factor is non-zero
+@if u4_qp_div_6 is greater than 4, the net shift is positive, a plain left shift, so rnd_factor is 0
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ ldr r7, [sp, #52] @Loads u4_qp_div_6
+ ldr r4, [sp, #40] @Loads out_strd
+ vdup.s32 q15, r7 @Populate the u4_qp_div_6 in Q15
+ ldr r5, [sp, #44] @Loads *pu2_iscal_mat
+
+ ldr r6, [sp, #48] @Loads *pu2_weigh_mat
+
+ ldr r8, [sp, #60] @Loads iq_start_idx
+
+ ldr r10, [sp, #64] @Load alternate dc address
+
+ vpush {d8-d15}
+@=======================DEQUANT FROM HERE===================================
+
+ vld4.s16 {d20, d21, d22, d23}, [r5] @Load pu2_iscal_mat[i], i =0..15
+ vld4.s16 {d26, d27, d28, d29}, [r6] @pu2_weigh_mat[i], i =0..15
+ vmul.s16 q10, q10, q13 @x[i]=(scale[i] * dequant[i]) where i = 0..7
+ vld4.s16 {d16, d17, d18, d19}, [r0] @pi2_src_tmp[i], i =0..15
+
+ vmul.s16 q11, q11, q14 @x[i]=(scale[i] * dequant[i]) where i = 8..15
+
+ subs r8, r8, #1 @ if r8 == 1 => intra case, so result of subtraction is zero and Z flag is set
+ ldreqsh r9, [r10] @ Loads signed halfword pi2_dc_ld_addr[0], if r8==1
+
+ vmull.s16 q0, d16, d20 @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
+ vmull.s16 q1, d17, d21 @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
+ vmull.s16 q2, d18, d22 @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
+ vmull.s16 q3, d19, d23 @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
+
+ vshl.s32 q0, q0, q15 @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3
+ vshl.s32 q1, q1, q15 @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7
+ vshl.s32 q2, q2, q15 @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11
+ vshl.s32 q3, q3, q15 @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15
+
+ vqrshrn.s32 d0, q0, #0x4 @ D0 = c[i] = ((q[i] + 8) >> 4) where i = 0..3
+ vqrshrn.s32 d1, q1, #0x4 @ D1 = c[i] = ((q[i] + 8) >> 4) where i = 4..7
+ vqrshrn.s32 d2, q2, #0x4 @ D2 = c[i] = ((q[i] + 8) >> 4) where i = 8..11
+ vqrshrn.s32 d3, q3, #0x4 @ D3 = c[i] = ((q[i] + 8) >> 4) where i = 12..15
+
+ vmoveq.16 d0[0], r9 @ Restore dc value in case of intra, i.e.
r8 == 1 + +@========= PROCESS IDCT FROM HERE ======= +@Steps for Stage 1: +@------------------ + vld1.32 d30[0], [r1], r3 @I row Load pu1_pred buffer + vadd.s16 d4, d0, d2 @x0 = q0 + q1; + + vsub.s16 d5, d0, d2 @x1 = q0 - q1; + + vshr.s16 d8, d1, #1 @q0>>1 + vshr.s16 d9, d3, #1 @q1>>1 + + vsub.s16 d6, d8, d3 @x2 = (q0 >> 1) - q1; + vadd.s16 d7, d1, d9 @x3 = q0+ (q1 >> 1); + vld1.32 d30[1], [r1], r3 @II row Load pu1_pred buffer + + vswp d6, d7 @Reverse positions of x2 and x3 + + vsub.s16 q6, q2, q3 @x0-x3 and x1-x2 combined + vadd.s16 q5, q2, q3 @x0 + x3 and x1+x2 combined + + vld1.32 d31[0], [r1], r3 @III row Load pu1_pred buf + + vswp d12, d13 +@Steps for Stage 2: +@------------------ + vtrn.16 d10, d11 + vtrn.16 d12, d13 + vtrn.32 d10, d12 + vtrn.32 d11, d13 + vadd.s16 d14, d10, d12 @x0 = q0 + q1; + + vsub.s16 d15, d10, d12 @x1 = q0 - q1; + + vshr.s16 d18, d11, #1 @q0>>1 + vshr.s16 d19, d13, #1 @q1>>1 + + vsub.s16 d16, d18, d13 @x2 = (q0 >> 1) - q1; + vadd.s16 d17, d11, d19 @x3 = q0+ (q1 >> 1); + + vld1.32 d31[1], [r1], r3 @IV row Load pu1_pred buffer + vswp d16, d17 @Reverse positions of x2 and x3 + + vsub.s16 q11, q7, q8 @x0-x3 and x1-x2 combined + vadd.s16 q10, q7, q8 @x0 + x3 and x1+x2 combined + + vswp d22, d23 + + vrshr.s16 q10, q10, #6 @ + vrshr.s16 q11, q11, #6 + + vaddw.u8 q10, q10, d30 + vaddw.u8 q11, q11, d31 + + vqmovun.s16 d0, q10 + vqmovun.s16 d1, q11 + + vst1.32 d0[0], [r2], r4 @I row store the value + vst1.32 d0[1], [r2], r4 @II row store the value + vst1.32 d1[0], [r2], r4 @III row store the value + vst1.32 d1[1], [r2] @IV row store the value + + vpop {d8-d15} + ldmfd sp!, {r4-r12, r15} @Reload the registers from SP + + + @/** +@ ******************************************************************************* +@ * +@ * @brief +@ * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block +@ * +@ * @par Description: +@ * Performs inverse transform Ci4 and adds the residue to get the +@ * reconstructed block +@ * +@ * @param[in] pi2_src +@ * Input 4x4 coefficients +@ * +@ * @param[in] pu1_pred +@ * Prediction 4x4 block +@ * +@ * @param[out] pu1_out +@ * Output 4x4 block +@ * +@ * @param[in] u4_qp_div_6 +@ * QP +@ * +@ * @param[in] pu2_weigh_mat +@ * Pointer to weight matrix +@ * +@ * @param[in] pred_strd, +@ * Prediction stride +@ * +@ * @param[in] out_strd +@ * Output Stride +@ * +@ *@param[in] pi2_tmp +@ * temporary buffer of size 1*16 +@ * +@ * @param[in] pu2_iscal_mat +@ * Pointer to the inverse quantization matrix +@ * +@ * @returns Void +@ * +@ * @remarks +@ * None +@ * +@ ******************************************************************************* +@ */ +@void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src, +@ UWORD8 *pu1_pred, +@ UWORD8 *pu1_out, +@ WORD32 pred_strd, +@ WORD32 out_strd, +@ const UWORD16 *pu2_iscal_mat, +@ const UWORD16 *pu2_weigh_mat, +@ UWORD32 u4_qp_div_6, +@ WORD32 *pi4_tmp +@ WORD16 *pi2_dc_src) +@**************Variables Vs Registers***************************************** +@r0 => *pi2_src +@r1 => *pu1_pred +@r2 => *pu1_out +@r3 => pred_strd +@r4 => out_strd +@r5 => *pu2_iscal_mat +@r6 => *pu2_weigh_mat +@r7 => u4_qp_div_6 + + .global ih264_iquant_itrans_recon_chroma_4x4_a9 +ih264_iquant_itrans_recon_chroma_4x4_a9: + +@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4 +@If the macro value changes need to change the instruction according to it. 
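+
+@Both this chroma kernel and the luma kernel above then run the same
+@4-point inverse transform twice, first on rows, then on columns. One
+@pass over dequantized values q0..q3, as a C sketch:
+@    x0 = q0 + q2;
+@    x1 = q0 - q2;
+@    x2 = (q1 >> 1) - q3;
+@    x3 = q1 + (q3 >> 1);
+@    out0 = x0 + x3; out1 = x1 + x2;
+@    out2 = x1 - x2; out3 = x0 - x3;
+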
+@Only one shift is done in horizontal inverse because, +@if u4_qp_div_6 is lesser than 4 then shift value will be neagative and do negative left shift, in this case rnd_factor has value +@if u4_qp_div_6 is greater than 4 then shift value will be positive and do left shift, here rnd_factor is 0 + + stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + ldr r7, [sp, #52] @Loads u4_qp_div_6 + ldr r4, [sp, #40] @Loads out_strd + vdup.s32 q15, r7 @Populate the u4_qp_div_6 in Q15 + ldr r5, [sp, #44] @Loads *pu2_iscal_mat + ldr r6, [sp, #48] @Loads *pu2_weigh_mat + ldr r8, [sp, #60] @loads *pi2_dc_src + + vpush {d8-d15} +@=======================DEQUANT FROM HERE=================================== + + vld4.s16 {d20, d21, d22, d23}, [r5] @Load pu2_iscal_mat[i], i =0..15 + vld4.s16 {d26, d27, d28, d29}, [r6] @pu2_weigh_mat[i], i =0..15 + vmul.s16 q10, q10, q13 @x[i]=(scale[i] * dequant[i]) where i = 0..7 + vld4.s16 {d16, d17, d18, d19}, [r0] @pi2_src_tmp[i], i =0..15 + + vmul.s16 q11, q11, q14 @x[i]=(scale[i] * dequant[i]) where i = 8..15 + + vmull.s16 q0, d16, d20 @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3 + vmull.s16 q1, d17, d21 @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7 + vmull.s16 q2, d18, d22 @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11 + vmull.s16 q3, d19, d23 @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15 + + vshl.s32 q0, q0, q15 @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3 + vshl.s32 q1, q1, q15 @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7 + vshl.s32 q2, q2, q15 @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11 + vshl.s32 q3, q3, q15 @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15 + + vqrshrn.s32 d0, q0, #0x4 @ D0 = c[i] = ((q[i] + 32) >> 4) where i = 0..3 + vqrshrn.s32 d1, q1, #0x4 @ D1 = c[i] = ((q[i] + 32) >> 4) where i = 4..7 + vqrshrn.s32 d2, q2, #0x4 @ D2 = c[i] = ((q[i] + 32) >> 4) where i = 8..11 + vqrshrn.s32 d3, q3, #0x4 @ D3 = c[i] = ((q[i] + 32) >> 4) where i = 12..15 + + ldrsh r9, [r8] @ Loads signed halfword pi2_dc_src[0] + vmov.16 d0[0], r9 @ Restore dc value since its chroma iq-it + +@========= PROCESS IDCT FROM HERE ======= +@Steps for Stage 1: +@------------------ + vld2.8 {d28, d29}, [r1], r3 @I row Load pu1_pred buffer + vadd.s16 d4, d0, d2 @x0 = q0 + q1; + + vsub.s16 d5, d0, d2 @x1 = q0 - q1; + + vshr.s16 d8, d1, #1 @q0>>1 + vshr.s16 d9, d3, #1 @q1>>1 + + vsub.s16 d6, d8, d3 @x2 = (q0 >> 1) - q1; + vadd.s16 d7, d1, d9 @x3 = q0+ (q1 >> 1); + vld2.8 {d29, d30}, [r1], r3 @II row Load pu1_pred buffer + + vswp d6, d7 @Reverse positions of x2 and x3 + + vsub.s16 q6, q2, q3 @x0-x3 and x1-x2 combined + vtrn.32 d28, d29 @ D28 -- row I and II of pu1_pred_buffer + vadd.s16 q5, q2, q3 @x0 + x3 and x1+x2 combined + + vld2.8 {d29, d30}, [r1], r3 @III row Load pu1_pred buf + + vswp d12, d13 +@Steps for Stage 2: +@------------------ + vtrn.16 d10, d11 + vtrn.16 d12, d13 + vtrn.32 d10, d12 + vtrn.32 d11, d13 + vadd.s16 d14, d10, d12 @x0 = q0 + q1; + + vsub.s16 d15, d10, d12 @x1 = q0 - q1; + + vshr.s16 d18, d11, #1 @q0>>1 + vshr.s16 d19, d13, #1 @q1>>1 + + vsub.s16 d16, d18, d13 @x2 = (q0 >> 1) - q1; + vadd.s16 d17, d11, d19 @x3 = q0+ (q1 >> 1); + + vld2.8 {d30, d31}, [r1], r3 @IV row Load pu1_pred buffer + vswp d16, d17 @Reverse positions of x2 and x3 + + vsub.s16 q11, q7, q8 @x0-x3 and x1-x2 combined + vtrn.32 d29, d30 @ D29 -- row III and IV of pu1_pred_buf + vadd.s16 q10, q7, q8 @x0 + x3 and x1+x2 combined + + vswp d22, d23 + + vrshr.s16 q10, q10, #6 @ + vrshr.s16 q11, q11, #6 + + vaddw.u8 q10, q10, d28 + vaddw.u8 q11, q11, d29 + + 
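+ @ The store path below differs from luma: pu1_out is an interleaved
+ @ Cb/Cr buffer, so only every other byte may be written. Roughly, in C
+ @ (a sketch, with CLIP_U8 clipping to 0..255):
+ @     pu1_out[i * out_strd + 2 * j] = CLIP_U8(pred + res);  /* j = 0..3 */
+ @ The vbit with the 0x00ff mask merges the new bytes into the loaded
+ @ rows while leaving the other plane's bytes untouched.
+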
vld1.u8 d0, [r2], r4 @Loading out buffer 16 coeffs + vld1.u8 d1, [r2], r4 + vld1.u8 d2, [r2], r4 + vld1.u8 d3, [r2], r4 + + sub r2, r2, r4, lsl #2 + + vqmovun.s16 d20, q10 @Getting quantized coeffs + vqmovun.s16 d22, q11 + + vmovl.u8 q10, d20 @Move the coffs into 16 bit + vmovl.u8 q11, d22 @so that we can use vbit to copy + + vmov.u16 q14, #0x00ff @Copy lsb from qantized(long)coeffs + + vbit.u8 q0, q10, q14 + vbit.u8 q1, q11, q14 + + vst1.u8 d0, [r2], r4 + vst1.u8 d1, [r2], r4 + vst1.u8 d2, [r2], r4 + vst1.u8 d3, [r2] + + vpop {d8-d15} + ldmfd sp!, {r4-r12, r15} @Reload the registers from SP + + +@/* +@ ******************************************************************************* +@ * +@ * @brief +@ * This function performs inverse quant and Inverse transform type Ci4 for 8*8 block +@ * +@ * @par Description: +@ * Performs inverse transform Ci8 and adds the residue to get the +@ * reconstructed block +@ * +@ * @param[in] pi2_src +@ * Input 4x4 coefficients +@ * +@ * @param[in] pu1_pred +@ * Prediction 4x4 block +@ * +@ * @param[out] pu1_out +@ * Output 4x4 block +@ * +@ * @param[in] u4_qp_div_6 +@ * QP +@ * +@ * @param[in] pu2_weigh_mat +@ * Pointer to weight matrix +@ * +@ * @param[in] pred_strd, +@ * Prediction stride +@ * +@ * @param[in] out_strd +@ * Output Stride +@ * +@ *@param[in] pi2_tmp +@ * temporary buffer of size 1*64 +@ * +@ * @param[in] pu2_iscal_mat +@ * Pointer to the inverse quantization matrix +@ * +@ * @returns Void +@ * +@ * @remarks +@ * None +@ * +@ ******************************************************************************* +@ */ +@void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src, +@ UWORD8 *pu1_pred, +@ UWORD8 *pu1_out, +@ WORD32 pred_strd, +@ WORD32 out_strd, +@ const UWORD16 *pu2_iscal_mat, +@ const UWORD16 *pu2_weigh_mat, +@ UWORD32 u4_qp_div_6, +@ WORD32 *pi4_tmp, +@ WORD32 iq_start_idx) +@**************Variables Vs Registers***************************************** +@r0 => *pi2_src +@r1 => *pu1_pred +@r2 => *pu1_out +@r3 => pred_strd +@r4 => out_strd +@r5 => *pu2_iscal_mat +@r6 => *pu2_weigh_mat +@r7 => u4_qp_div_6 + + + .global ih264_iquant_itrans_recon_8x8_a9 +ih264_iquant_itrans_recon_8x8_a9: + + stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + ldr r7, [sp, #52] @Loads u4_qp_div_6 + ldr r4, [sp, #40] @Loads out_strd + + ldr r5, [sp, #44] @Loads *pu2_iscal_mat + ldr r6, [sp, #48] @Loads *pu2_weigh_mat + vdup.s32 q15, r7 @Populate the u4_qp_div_6 in Q15 + vpush {d8-d15} + +idct_8x8_begin: + +@========= DEQUANT FROM HERE =========== + + vld1.32 {q13}, [r5]! @ Q13 = dequant values row 0 + vld1.32 {q10}, [r6]! @ Q10 = scaling factors row 0 + vld1.32 {q14}, [r5]! @ Q14 = dequant values row 1 + vmul.s16 q10, q10, q13 @ Q10 = x[i] = (scale[i] * dequant[i]) where i = 0..7 + vld1.32 {q11}, [r6]! @ Q11 = scaling factors row 1 + vld1.32 {q8}, [r0]! @ Q8 = Source row 0 + vmul.s16 q11, q11, q14 @ Q11 = x[i] = (scale[i] * dequant[i]) where i = 8..15 + vmull.s16 q0, d16, d20 @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3 + vld1.32 {q9}, [r0]! @ Q8 = Source row 1 + vmull.s16 q1, d17, d21 @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7 + vmull.s16 q2, d18, d22 @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11 + vld1.32 {q13}, [r6]! @ Scaling factors row 2 + vmull.s16 q3, d19, d23 @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15 + vld1.32 {q14}, [r6]! @ Scaling factors row 3 + vshl.s32 q0, q0, q15 @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3 + vld1.32 {q10}, [r5]! 
@ Q10 = Dequant values row 2 + vshl.s32 q1, q1, q15 @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7 + vld1.32 {q8}, [r0]! @ Source Row 2 + vshl.s32 q2, q2, q15 @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11 + vld1.32 {q11}, [r5]! @ Q11 = Dequant values row 3 + vshl.s32 q3, q3, q15 @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15 + vld1.32 {q9}, [r0]! @ Source Row 3 + vmul.s16 q10, q10, q13 @ Dequant row2*scale matrix row 2 + vmul.s16 q11, q11, q14 @ Dequant row 3*scale matrix row 3 + vld1.32 {q4}, [r6]! @ Scaling factors row 4 + vqrshrn.s32 d0, q0, #0x6 @ D0 = c[i] = ((q[i] + 32) >> 6) where i = 0..3 + vqrshrn.s32 d1, q1, #0x6 @ D1 = c[i] = ((q[i] + 32) >> 6) where i = 4..7 + vld1.32 {q5}, [r6]! @ Scaling factors row 5 + vqrshrn.s32 d2, q2, #0x6 @ D2 = c[i] = ((q[i] + 32) >> 6) where i = 8..11 + vqrshrn.s32 d3, q3, #0x6 @ D3 = c[i] = ((q[i] + 32) >> 6) where i = 12..15 + vld1.32 {q13}, [r5]! @ Q13 = Dequant values row 4 + vmull.s16 q2, d16, d20 @ p[i] = (x[i] * trns_coeff[i]) where i=16..19 + vmull.s16 q3, d17, d21 @ p[i] = (x[i] * trns_coeff[i]) where i=20..23 + vld1.32 {q12}, [r5]! @ Q12 = Dequant values row 5 + vmull.s16 q6, d18, d22 @ p[i] = (x[i] * trns_coeff[i]) where i=24..27 + vmull.s16 q7, d19, d23 @ p[i] = (x[i] * trns_coeff[i]) where i=28..31 + + vld1.32 {q14}, [r0]! @ Source row 4 + vmul.s16 q10, q4, q13 @ Dequant row4*scale matrix row 4 + vmul.s16 q11, q5, q12 @ Dequant row5*scale matrix row 5 + vld1.32 {q9}, [r0]! @ Source row 5 + vshl.s32 q2, q2, q15 @ + vshl.s32 q3, q3, q15 @ + vld1.32 {q13}, [r6]! @ Scaling factors row 6 + vshl.s32 q6, q6, q15 @ + vshl.s32 q7, q7, q15 @ + vmull.s16 q4, d28, d20 @ i = 32..35 + vqrshrn.s32 d4, q2, #0x6 @ D4 = c[i] = ((q[i] + 32) >> 6) where i = 16..19 + vqrshrn.s32 d5, q3, #0x6 @ D5 = c[i] = ((q[i] + 32) >> 6) where i = 20..23 + vmull.s16 q5, d29, d21 @ i =36..39 + vld1.32 {q10}, [r5]! @ Dequant values row 6 + vqrshrn.s32 d6, q6, #0x6 @ D6 = c[i] = ((q[i] + 32) >> 6) where i = 24..27 + vqrshrn.s32 d7, q7, #0x6 @ D7 = c[i] = ((q[i] + 32) >> 6) where i = 28..31 + vld1.32 {q14}, [r6]! @ Scaling factors row 7 + vmull.s16 q6, d18, d22 @ + vld1.32 {q8}, [r0]! @ Source row 6 + vmull.s16 q7, d19, d23 @ + vld1.32 {q11}, [r5]! @ Dequant values row 7 + vshl.s32 q4, q4, q15 @ + vld1.32 {q9}, [r0]! 
@ Source row 7 + vshl.s32 q5, q5, q15 @ + + vshl.s32 q6, q6, q15 @ + vshl.s32 q7, q7, q15 @ + vmul.s16 q10, q10, q13 @ Dequant*scaling row 6 + vmul.s16 q11, q11, q14 @ Dequant*scaling row 7 + vqrshrn.s32 d8, q4, #0x6 @ D8 = c[i] = ((q[i] + 32) >> 6) where i = 32..35 + vqrshrn.s32 d9, q5, #0x6 @ D9 = c[i] = ((q[i] + 32) >> 6) where i = 36..39 + vqrshrn.s32 d10, q6, #0x6 @ D10 = c[i] = ((q[i] + 32) >> 6) where i = 40..43 + vqrshrn.s32 d11, q7, #0x6 @ D11 = c[i] = ((q[i] + 32) >> 6) where i = 44..47 + vmull.s16 q6, d16, d20 @ i= 48..51 + vmull.s16 q7, d17, d21 @ i= 52..55 + vmull.s16 q8, d18, d22 @ i=56..59 + vmull.s16 q9, d19, d23 @ i=60..63 + vshl.s32 q6, q6, q15 @ + vzip.s16 q0, q1 @Transpose + vshl.s32 q7, q7, q15 @ + vshl.s32 q8, q8, q15 @ + vzip.s16 q2, q3 @ + vshl.s32 q9, q9, q15 @ + vqrshrn.s32 d12, q6, #0x6 @ D12 = c[i] = ((q[i] + 32) >> 6) where i = 48..51 + vzip.s16 q4, q5 @Transpose + vqrshrn.s32 d13, q7, #0x6 @ D13 = c[i] = ((q[i] + 32) >> 6) where i = 52..55 + vqrshrn.s32 d14, q8, #0x6 @ D14 = c[i] = ((q[i] + 32) >> 6) where i = 56..59 + vzip.s32 q0, q2 @Transpose + vqrshrn.s32 d15, q9, #0x6 @ D15 = c[i] = ((q[i] + 32) >> 6) where i = 60..63 + +@========= PROCESS IDCT FROM HERE ======= + +@Steps for Stage 2: +@------------------ + +@ TRANSPOSE 8x8 coeffs to actual order + + vzip.s16 q6, q7 @ + + vzip.s32 q1, q3 @ + vzip.s32 q4, q6 @ + vzip.s32 q5, q7 @ + + vswp d1, d8 @ Q0/Q1 = Row order x0/x1 + vswp d3, d10 @ Q2/Q3 = Row order x2/x3 + vswp d5, d12 @ Q4/Q5 = Row order x4/x5 + vswp d7, d14 @ Q6/Q7 = Row order x6/x7 + + vswp q1, q4 @ + vshr.s16 q10, q2, #0x1 @ + vswp q3, q6 @ + +@Steps for Stage 1: +@------------------ + + vadd.s16 q8, q0, q4 @ Q8 = y0 + vsub.s16 q9, q0, q4 @ Q9 = y2 + + vsra.s16 q2, q6, #0x1 @ Q2 = y6 + vsub.s16 q6, q10, q6 @ Q6 = y4 + + vaddl.s16 q12, d14, d2 @ y3 (0-3) 1+7 + vaddl.s16 q13, d15, d3 @ y3 (4-7) 1+7 + + vsubl.s16 q10, d14, d2 @ y5 (0-3) 7-1 + vsubl.s16 q11, d15, d3 @ y5 (4-7) 7-1 + + vadd.s16 q0, q8, q2 @ Q0 = z0 + vsub.s16 q4, q8, q2 @ Q4 = z6 + + vadd.s16 q8, q9, q6 @ Q8 = z2 + vsub.s16 q2, q9, q6 @ Q2 = z4 + + vsubw.s16 q12, q12, d6 @ y3 (0-3) 1+7-3 + vsubw.s16 q13, q13, d7 @ y3 (0-7) 1+7-3 + + vshr.s16 q6, q3, #0x1 @ + + vaddw.s16 q10, q10, d10 @ + vaddw.s16 q11, q11, d11 @ + + vshr.s16 q9, q5, #0x1 @ + + vsubw.s16 q12, q12, d12 @ + vsubw.s16 q13, q13, d13 @ + + vaddw.s16 q10, q10, d18 @ + vaddw.s16 q11, q11, d19 @ + + vqmovn.s32 d12, q12 @ + vaddl.s16 q12, d10, d6 @ + vqmovn.s32 d13, q13 @ Q6 = y3 + vaddl.s16 q13, d11, d7 @ + vqmovn.s32 d18, q10 @ + vsubl.s16 q10, d10, d6 @ + vqmovn.s32 d19, q11 @ Q9 = y5 + vsubl.s16 q11, d11, d7 @ + + vshr.s16 q3, q6, #0x2 @ + + vsra.s16 q6, q9, #0x2 @ Q6 = z3 + + vaddw.s16 q12, q12, d2 @ + vaddw.s16 q13, q13, d3 @ + + vshr.s16 q1, #0x1 @ + + vsub.s16 q5, q3, q9 @ Q5 = z5 + + vsubw.s16 q10, q10, d14 @ + vsubw.s16 q11, q11, d15 @ + + vshr.s16 q7, #0x1 @ + + vaddw.s16 q12, q12, d2 @ + vaddw.s16 q13, q13, d3 @ + + vsubw.s16 q10, q10, d14 @ + vsubw.s16 q11, q11, d15 @ + + + vqmovn.s32 d14, q12 @ + vadd.s16 q1, q8, q5 @ Q1 = x1 + vqmovn.s32 d15, q13 @ Q7 = y7 + vsub.s16 q3, q8, q5 @ Q3 = x6 + vqmovn.s32 d18, q10 @ + vsub.s16 q5, q2, q6 @ Q5 = x5 + vqmovn.s32 d19, q11 @ Q9 = y1 + vadd.s16 q2, q2, q6 @ Q2 = x2 + + vshr.s16 q12, q9, #0x2 @ + vsra.s16 q9, q7, #0x2 @ Q9 = z1 + + vsub.s16 q11, q7, q12 @ Q11 = z7 + + vadd.s16 q6, q4, q9 @ Q6 = x3 + vsub.s16 q4, q4, q9 @ Q4 = x4 + + vsub.s16 q7, q0, q11 @ Q7 = x7 + vadd.s16 q0, q0, q11 @ Q0 = x0 + + vswp.s16 q3, q6 @ Q3 = x3, Q6 = x6 + + +@Steps for Stage 2: 
+@------------------ + +@ TRANSPOSE 8x8 coeffs to actual order + + vzip.s16 q0, q1 @ + vzip.s16 q2, q3 @ + vzip.s16 q4, q5 @ + vzip.s16 q6, q7 @ + + vzip.s32 q0, q2 @ + vzip.s32 q1, q3 @ + vzip.s32 q4, q6 @ + vzip.s32 q5, q7 @ + + vswp d1, d8 @ Q0/Q1 = Row order x0/x1 + vswp d3, d10 @ Q2/Q3 = Row order x2/x3 + vswp d5, d12 @ Q4/Q5 = Row order x4/x5 + vswp d7, d14 @ Q6/Q7 = Row order x6/x7 + + vswp q1, q4 @ + vshr.s16 q10, q2, #0x1 @ + vswp q3, q6 @ + +@Steps for Stage 3: +@------------------ + +@Repeat stage 1 again for vertical transform + + vadd.s16 q8, q0, q4 @ Q8 = y0 + vld1.32 d28, [r1], r3 @ Q12 = 0x070605....0x070605.... + vsub.s16 q9, q0, q4 @ Q9 = y2 + + vsra.s16 q2, q6, #0x1 @ Q2 = y6 + vsub.s16 q6, q10, q6 @ Q6 = y4 + + vaddl.s16 q12, d14, d2 @ + vld1.32 d29, [r1], r3 @ Q12 = 0x070605....0x070605.... + vaddl.s16 q13, d15, d3 @ + + vsubl.s16 q10, d14, d2 @ + vld1.32 d30, [r1], r3 @ Q12 = 0x070605....0x070605.... + vsubl.s16 q11, d15, d3 @ + + vadd.s16 q0, q8, q2 @ Q0 = z0 + vld1.32 d31, [r1], r3 @ Q12 = 0x070605....0x070605.... + vsub.s16 q4, q8, q2 @ Q4 = z6 + + vadd.s16 q8, q9, q6 @ Q8 = z2 + vsub.s16 q2, q9, q6 @ Q2 = z4 + + vsubw.s16 q12, q12, d6 @ + vsubw.s16 q13, q13, d7 @ + + vshr.s16 q6, q3, #0x1 @ + + vaddw.s16 q10, q10, d10 @ + vaddw.s16 q11, q11, d11 @ + + vshr.s16 q9, q5, #0x1 @ + + vsubw.s16 q12, q12, d12 @ + vsubw.s16 q13, q13, d13 @ + + vaddw.s16 q10, q10, d18 @ + vaddw.s16 q11, q11, d19 @ + + vqmovn.s32 d12, q12 @ + vaddl.s16 q12, d10, d6 @ + vqmovn.s32 d13, q13 @ Q6 = y3 + vaddl.s16 q13, d11, d7 @ + vqmovn.s32 d18, q10 @ + vsubl.s16 q10, d10, d6 @ + vqmovn.s32 d19, q11 @ Q9 = y5 + vsubl.s16 q11, d11, d7 @ + + vshr.s16 q3, q6, #0x2 @ + + vsra.s16 q6, q9, #0x2 @ Q6 = z3 + + vaddw.s16 q12, q12, d2 @ + vaddw.s16 q13, q13, d3 @ + + vshr.s16 q1, #0x1 @ + + vsub.s16 q5, q3, q9 @ Q5 = z5 + + vsubw.s16 q10, q10, d14 @ + vsubw.s16 q11, q11, d15 @ + + vshr.s16 q7, #0x1 @ + + vaddw.s16 q12, q12, d2 @ + vaddw.s16 q13, q13, d3 @ + + vsubw.s16 q10, q10, d14 @ + vsubw.s16 q11, q11, d15 @ + + vqmovn.s32 d14, q12 @ + vadd.s16 q1, q8, q5 @ Q1 = x1 + vqmovn.s32 d15, q13 @ Q7 = y7 + vsub.s16 q3, q8, q5 @ Q3 = x6 + vqmovn.s32 d18, q10 @ + vsub.s16 q5, q2, q6 @ Q5 = x5 + vqmovn.s32 d19, q11 @ Q9 = y1 + vadd.s16 q2, q2, q6 @ Q2 = x2 + + vshr.s16 q12, q9, #0x2 @ + vsra.s16 q9, q7, #0x2 @ Q9 = z1 + + vsub.s16 q11, q7, q12 @ Q11 = z7 + + vadd.s16 q6, q4, q9 @ Q6 = x3 + vsub.s16 q4, q4, q9 @ Q4 = x4 + + vsub.s16 q7, q0, q11 @ Q7 = x7 + vadd.s16 q0, q0, q11 @ Q0 = x0 + + vswp.s16 q3, q6 @ Q3 <-> Q6 + + vrshr.s16 q1, q1, #6 @ + vld1.32 d16, [r1], r3 @ Q12 = 0x070605....0x070605.... + vrshr.s16 q2, q2, #6 @ + vrshr.s16 q4, q4, #6 @ + vld1.32 d17, [r1], r3 @ Q12 = 0x070605....0x070605.... + vrshr.s16 q5, q5, #6 @ + vrshr.s16 q7, q7, #6 @ + vld1.32 d18, [r1], r3 @ Q12 = 0x070605....0x070605.... + vrshr.s16 q0, q0, #6 @ + vrshr.s16 q3, q3, #6 @ + vld1.32 d19, [r1], r3 @ Q12 = 0x070605....0x070605.... 
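+ @ Final reconstruction, per output pixel, as a C sketch:
+ @     res = (x + 32) >> 6;       /* vrshr #6 rounds the 2-stage output */
+ @     out = CLIP_U8(pred + res); /* vaddw.u8 widen-add, vqmovun clip   */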
+ vrshr.s16 q6, q6, #6 @ + +@ Code Added to pack sign and magnitudes + + vaddw.u8 q0, q0, d28 + vaddw.u8 q1, q1, d29 + vaddw.u8 q2, q2, d30 + vaddw.u8 q3, q3, d31 + vqmovun.s16 d0, q0 + vaddw.u8 q4, q4, d16 + vqmovun.s16 d1, q1 + vaddw.u8 q5, q5, d17 + vqmovun.s16 d2, q2 + vaddw.u8 q6, q6, d18 + vqmovun.s16 d3, q3 + vaddw.u8 q7, q7, d19 + + vqmovun.s16 d4, q4 + vst1.32 d0, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vqmovun.s16 d5, q5 + vst1.32 d1, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vqmovun.s16 d6, q6 + vst1.32 d2, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vqmovun.s16 d7, q7 + vst1.32 d3, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vst1.32 d4, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + + vst1.32 d5, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + + + vst1.32 d6, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + + + vst1.32 d7, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + +idct_8x8_end: + + vpop {d8-d15} + ldmfd sp!, {r4-r12, r15} + diff --git a/common/arm/ih264_iquant_itrans_recon_dc_a9.s b/common/arm/ih264_iquant_itrans_recon_dc_a9.s new file mode 100755 index 0000000..8d71bdb --- /dev/null +++ b/common/arm/ih264_iquant_itrans_recon_dc_a9.s @@ -0,0 +1,399 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@ ******************************************************************************* +@ * @file +@ * ih264_iquant_itrans_recon_dc_a9.s +@ * +@ * @brief +@ * Contains function definitions for single stage inverse transform +@ * +@ * @author +@ * Mohit +@ * +@ * @par List of Functions: +@ * - ih264_iquant_itrans_recon_4x4_dc_a9() +@ * - ih264_iquant_itrans_recon_8x8_dc_a9() +@ * - ih264_iquant_itrans_recon_chroma_4x4_dc_a9() +@ * +@ * @remarks +@ * None +@ * +@ ******************************************************************************* +@*/ +@/** +@ ******************************************************************************* +@ * +@ * @brief +@ * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block +@ * for dc input pattern only, i.e. only the (0,0) element of the input 4x4 block is +@ * non-zero. 
For complete function, refer ih264_iquant_itrans_recon_a9.s +@ * +@ * @par Description: +@ * Performs inverse transform Ci4 and adds the residue to get the +@ * reconstructed block +@ * +@ * @param[in] pi2_src +@ * Input 4x4 coefficients +@ * +@ * @param[in] pu1_pred +@ * Prediction 4x4 block +@ * +@ * @param[out] pu1_out +@ * Output 4x4 block +@ * +@ * @param[in] u4_qp_div_6 +@ * QP +@ * +@ * @param[in] pu2_weigh_mat +@ * Pointer to weight matrix +@ * +@ * @param[in] pred_strd, +@ * Prediction stride +@ * +@ * @param[in] out_strd +@ * Output Stride +@ * +@ *@param[in] pi2_tmp +@ * temporary buffer of size 1*16 +@ * +@ * @param[in] pu2_iscal_mat +@ * Pointer to the inverse quantization matrix +@ * +@ * @returns Void +@ * +@ * @remarks +@ * None +@ * +@ ******************************************************************************* +@ */ +@void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src, +@ UWORD8 *pu1_pred, +@ UWORD8 *pu1_out, +@ WORD32 pred_strd, +@ WORD32 out_strd, +@ const UWORD16 *pu2_iscal_mat, +@ const UWORD16 *pu2_weigh_mat, +@ UWORD32 u4_qp_div_6, +@ WORD32 *pi4_tmp, +@ WORD32 iq_start_idx +@ WORD16 *pi2_dc_ld_addr) +@**************Variables Vs Registers***************************************** +@r0 => *pi2_src +@r1 => *pu1_pred +@r2 => *pu1_out +@r3 => pred_strd +@r4 => out_strd +@r5 => *pu2_iscal_mat +@r6 => *pu2_weigh_mat +@r7 => u4_qp_div_6 +@r9 => iq_start_idx +@unused => pi2_dc_ld_addr + +.text +.p2align 2 + + .global ih264_iquant_itrans_recon_4x4_dc_a9 + +ih264_iquant_itrans_recon_4x4_dc_a9: + +@Only one shift is done in horizontal inverse because, +@if u4_qp_div_6 is lesser than 4 then shift value will be neagative and do negative left shift, in this case rnd_factor has value +@if u4_qp_div_6 is greater than 4 then shift value will be positive and do left shift, here rnd_factor is 0 + + stmfd sp!, {r4-r10, r14} @stack stores the values of the arguments + ldr r5, [sp, #36] @Loads *pu2_iscal_mat + ldr r6, [sp, #40] @Loads *pu2_weigh_mat + ldrsh r8, [r0] @load pi2_src[0], SH for signed halfword load + ldrh r6, [r6] @load pu2_weight_mat[0] , H for unsigned halfword load + ldrh r5, [r5] @load pu2_iscal_mat[0] , H for unsigned halfword load +@=======================DEQUANT FROM HERE=================================== + mul r6, r6, r5 @pu2_iscal_mat[0]*pu2_weigh_mat[0] + ldr r7, [sp, #44] @Loads u4_qp_div_6 + mul r6, r6, r8 @pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0] + ldr r4, [sp, #32] @Loads out_strd + ldr r9, [sp, #52] @Loads iq_start_idx + + lsl r6, r6, r7 @(pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0])<<u4_qp_div_6 + add r6, r6, #8 @(pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0])<<u4_qp_div_6 + rnd_fact + asr r6, r6, #4 @q0 = (pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0] + rnd_fact)<<(u4_qp_div_6-4) + + subs r9, r9, #1 @ if r8 == 1 => intra case , so result of subtraction is zero and Z flag is set + ldreqsh r10, [r0] @ Loads signed halfword pi2_src[0], if r9==1 + moveq r6, r10 @ Restore dc value in case of intra, i.e. 
r9 == 1 + + add r6, r6, #32 @i_macro = q0 + 32 + asr r6, r6, #6 @i_macro >>6 = DC output of 2-stage transform + vdup.s16 q0, r6 @copy transform output to Q0 + + vld1.32 d30[0], [r1], r3 @I row Load pu1_pred buffer + + vld1.32 d30[1], [r1], r3 @II row Load pu1_pred buffer + + vld1.32 d31[0], [r1], r3 @III row Load pu1_pred buf + + vld1.32 d31[1], [r1], r3 @IV row Load pu1_pred buffer + vaddw.u8 q10, q0, d30 + + vaddw.u8 q11, q0, d31 + + vqmovun.s16 d0, q10 + + vst1.32 d0[0], [r2], r4 @I row store the value + vqmovun.s16 d1, q11 + vst1.32 d0[1], [r2], r4 @II row store the value + vst1.32 d1[0], [r2], r4 @III row store the value + vst1.32 d1[1], [r2] @IV row store the value + + ldmfd sp!, {r4-r10, r15} @Reload the registers from SP + + + + +@/* +@ ******************************************************************************* +@ * +@ * @brief +@ * This function performs inverse quant and Inverse transform type Ci4 for 8*8 block +@ * for dc input pattern only, i.e. only the (0,0) element of the input 8x8 block is +@ * non-zero. For complete function, refer ih264_iquant_itrans_recon_a9.s +@ * +@ * @par Description: +@ * Performs inverse transform Ci8 and adds the residue to get the +@ * reconstructed block +@ * +@ * @param[in] pi2_src +@ * Input 4x4 coefficients +@ * +@ * @param[in] pu1_pred +@ * Prediction 4x4 block +@ * +@ * @param[out] pu1_out +@ * Output 4x4 block +@ * +@ * @param[in] u4_qp_div_6 +@ * QP +@ * +@ * @param[in] pu2_weigh_mat +@ * Pointer to weight matrix +@ * +@ * @param[in] pred_strd, +@ * Prediction stride +@ * +@ * @param[in] out_strd +@ * Output Stride +@ * +@ *@param[in] pi2_tmp +@ * temporary buffer of size 1*64 +@ * +@ * @param[in] pu2_iscal_mat +@ * Pointer to the inverse quantization matrix +@ * +@ * @returns Void +@ * +@ * @remarks +@ * None +@ * +@ ******************************************************************************* +@ */ +@void ih264_iquant_itrans_recon_8x8_dc(WORD16 *pi2_src, +@ UWORD8 *pu1_pred, +@ UWORD8 *pu1_out, +@ WORD32 pred_strd, +@ WORD32 out_strd, +@ const UWORD16 *pu2_iscal_mat, +@ const UWORD16 *pu2_weigh_mat, +@ UWORD32 u4_qp_div_6, +@ WORD32 *pi4_tmp, +@ WORD32 iq_start_idx) +@**************Variables Vs Registers***************************************** +@r0 => *pi2_src +@r1 => *pu1_pred +@r2 => *pu1_out +@r3 => pred_strd +@r4 => out_strd +@r5 => *pu2_iscal_mat +@r6 => *pu2_weigh_mat +@r7 => u4_qp_div_6 + + + .global ih264_iquant_itrans_recon_8x8_dc_a9 +ih264_iquant_itrans_recon_8x8_dc_a9: + + stmfd sp!, {r4-r8, r14} @stack stores the values of the arguments + ldr r5, [sp, #28] @Loads *pu2_iscal_mat + ldr r6, [sp, #32] @Loads *pu2_weigh_mat + ldrsh r8, [r0] @load pi2_src[0], SH for signed halfword load + ldrh r6, [r6] @load pu2_weight_mat[0] , H for unsigned halfword load + ldrh r5, [r5] @load pu2_iscal_mat[0] , H for unsigned halfword load +@=======================DEQUANT FROM HERE=================================== + mul r6, r6, r5 @pu2_iscal_mat[0]*pu2_weigh_mat[0] + ldr r7, [sp, #36] @Loads u4_qp_div_6 + mul r6, r6, r8 @pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0] + ldr r4, [sp, #24] @Loads out_strd + + vpush {d8-d15} + lsl r6, r6, r7 @(pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0])<<u4_qp_div_6 + add r6, r6, #32 @(pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0])<<u4_qp_div_6 + rnd_fact + asr r6, r6, #6 @q0 = (pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0] + rnd_fact)<<(u4_qp_div_6-4) + add r6, r6, #32 @i_macro = q0 + 32 + asr r6, r6, #6 @i_macro >>6 = DC output of 2-stage transform + vdup.s16 q8, r6 @copy transform output to Q0 + + vld1.32 
d24, [r1], r3 @ Q12 = 0x070605....0x070605.... + + vld1.32 d25, [r1], r3 @ Q12 = 0x070605....0x070605.... + + vld1.32 d26, [r1], r3 @ Q12 = 0x070605....0x070605.... + vaddw.u8 q0, q8, d24 + vld1.32 d27, [r1], r3 @ Q12 = 0x070605....0x070605.... + vaddw.u8 q1, q8, d25 + vld1.32 d28, [r1], r3 @ Q12 = 0x070605....0x070605.... + vaddw.u8 q2, q8, d26 + vld1.32 d29, [r1], r3 @ Q12 = 0x070605....0x070605.... + vaddw.u8 q3, q8, d27 + vld1.32 d30, [r1], r3 @ Q12 = 0x070605....0x070605.... + vaddw.u8 q4, q8, d28 + vld1.32 d31, [r1], r3 @ Q12 = 0x070605....0x070605.... + +@ Code Added to pack sign and magnitudes + + + vqmovun.s16 d0, q0 + vaddw.u8 q5, q8, d29 + vqmovun.s16 d1, q1 + vaddw.u8 q6, q8, d30 + vqmovun.s16 d2, q2 + vqmovun.s16 d3, q3 + vaddw.u8 q7, q8, d31 + vqmovun.s16 d4, q4 + vqmovun.s16 d5, q5 + vst1.32 d0, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vqmovun.s16 d6, q6 + vst1.32 d1, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vqmovun.s16 d7, q7 + vst1.32 d2, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vst1.32 d3, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vst1.32 d4, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vst1.32 d5, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vst1.32 d6, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vst1.32 d7, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + + vpop {d8-d15} + ldmfd sp!, {r4-r8, r15} + + +@ /* +@ ******************************************************************************** +@ * +@ * @brief This function reconstructs a 4x4 sub block from quantized resiude and +@ * prediction buffer if only dc value is present for residue +@ * +@ * @par Description: +@ * The quantized residue is first inverse quantized, +@ * This inverse quantized content is added to the prediction buffer to recon- +@ * struct the end output +@ * +@ * @param[in] pi2_src +@ * quantized dc coeffiient +@ * +@ * @param[in] pu1_pred +@ * prediction 4x4 block in interleaved format +@ * +@ * @param[in] pred_strd, +@ * Prediction buffer stride in interleaved format +@ * +@ * @param[in] out_strd +@ * recon buffer Stride +@ * +@ * @returns none +@ * +@ * @remarks none +@ * +@ ******************************************************************************* +@ */ +@ void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src, +@ UWORD8 *pu1_pred, +@ UWORD8 *pu1_out, +@ WORD32 pred_strd, +@ WORD32 out_strd, +@ const UWORD16 *pu2_iscal_mat, +@ const UWORD16 *pu2_weigh_mat, +@ UWORD32 u4_qp_div_6, +@ WORD16 *pi2_tmp, +@ WORD16 *pi2_dc_src) +@ Register Usage +@ r0 : pi2_src +@ r1 : pu1_pred +@ r2 : pu1_out +@ r3 : pred_strd +@ Neon registers d0-d7, d16-d30 are used +@ No need for pushing arm and neon registers + .global ih264_iquant_itrans_recon_chroma_4x4_dc_a9 +ih264_iquant_itrans_recon_chroma_4x4_dc_a9: + + ldr r0, [sp, #20] + vld1.s16 d0, [r0] @load pi2_dc_src + + ldr r0, [sp] @load out_strd + + vld2.s8 {d2, d3}, [r1], r3 @load pred plane 1 => d2 &pred palne 2 => d3 + vld2.s8 {d3, d4}, [r1], r3 + vrshr.s16 d0, d0, #6 @i_macro = ((q0 + 32) >> 6); + vld2.s8 {d4, d5}, [r1], r3 + vld2.s8 {d5, d6}, [r1], r3 + + vdup.s16 q0, d0[0] @duplicate pi2_sr[0] + mov r1, r2 @backup pu1_out + + vtrn.32 d2, d3 @mov the 4 coeffs of current block to d2 + vtrn.32 d4, d5 + + vmov.u16 q15, #0x00ff + + vld1.u8 d18, [r2], r0 @load out [8 bit size) -8 coeffs + vaddw.u8 q1, q0, d2 @Add pred + vld1.u8 d19, [r2], r0 + vaddw.u8 q2, q0, d4 + vld1.u8 d20, [r2], r0 + vld1.u8 d21, [r2], r0 + + vqmovun.s16 d2, q1 + vqmovun.s16 d4, q2 + + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + + vbit.u8 q9, q1, q15 + 
vbit.u8 q10, q2, q15 + + vst1.u8 d18, [r1], r0 @store out + vst1.u8 d19, [r1], r0 + vst1.u8 d20, [r1], r0 + vst1.u8 d21, [r1], r0 + + bx lr + + + + + + + diff --git a/common/arm/ih264_itrans_recon_a9.s b/common/arm/ih264_itrans_recon_a9.s new file mode 100755 index 0000000..1d74da5 --- /dev/null +++ b/common/arm/ih264_itrans_recon_a9.s @@ -0,0 +1,216 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@ ******************************************************************************* +@ * @file +@ * ih264_itrans_recon_neon_a9.s +@ * +@ * @brief +@ * Contains function definitions for single stage inverse transform +@ * +@ * +@ * @par List of Functions: +@ * - ih264_itrans_recon_4x4_a9() +@ * +@ * @remarks +@ * None +@ * +@ ******************************************************************************* +@*/ +@/** +@ ******************************************************************************* +@ * +@ * @brief +@ * This function performs Inverse transform type Ci4 for 4*4 block +@ * +@ * @par Description: +@ * Performs inverse transform Ci4 and adds the residue to get the +@ * reconstructed block +@ * +@ * @param[in] pi16_levelBlock +@ * Input 4x4 coefficients +@ * +@ * @param[in] puc_predBuffer +@ * Prediction 4x4 block +@ * +@ * @param[out] puc_reconPic +@ * Output 4x4 block +@ * +@ * @param[in] ui16_picWidth +@ * Input stride +@ * +@ * @param[in] pred_strd +@ * Prediction stride +@ * +@ * @param[in] dst_strd +@ * Output Stride +@ * +@ * @param[in] zero_cols +@ * Zero columns in pi2_src +@ * +@ * @returns Void +@ * +@ * @remarks +@ * None +@ * +@ * +@ ******************************************************************************* +@ */ +@void ih264_itrans_recon_4x4( +@ WORD16 *pi2_src, +@ UWORD8 *pu1_pred, +@ UWORD8 *pu1_recon, +@ WORD32 src_strd, +@ WORD32 pred_strd, +@ WORD32 dst_strd, +@ UWORD32 q_lev, //quantizer level +@ WORD32 *pi4_tmp) +@**************Variables Vs Registers***************************************** +@r0 => *pi2_src +@r1 => *pu1_pred +@r2 => *pu1_recon +@r3 => src_strd +@r4 => pred_strd +@r5 => dst_strd +@r6 => q_lev +@r7 => *pi4_tmp + +.text +.p2align 2 + + + .global ih264_itrans_recon_4x4_a9 + +ih264_itrans_recon_4x4_a9: + stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + lsl r3, r3, #1 + + vld1.16 d0, [r0], r3 @0th row pi2_src_tmp[0] + ldr r4, [sp, #40] @Loads pred_strd + + vld1.16 d1, [r0], r3 @I row pi2_src_tmp[0] + ldr r5, [sp, #44] @Loads *dst_strd + + vld1.16 d2, [r0], r3 @II row pi2_src_tmp[0] + + vld1.16 d3, [r0] @III row pi2_src_tmp[0] + ldr r7, [sp, #52] @Loads *pi4_tmp + + vpush {d8-d15} + + vtrn.16 d0, d1 @Transpose to get all the 0th element in the single D register + vtrn.16 d2, d3 + vtrn.32 d0, d2 + 
vtrn.32 d1, d3 @D0 --> pi2_src_tmp[0], D1 --> pi2_src_tmp[1] + @D2 --> pi2_src_tmp[2], D3 --> pi2_src_tmp[3] + + vaddl.s16 q3, d0, d2 @x0 = (pi2_src_tmp[0] + pi2_src_tmp[2]) + vsubl.s16 q4, d0, d2 @x1 = (pi2_src_tmp[0] - pi2_src_tmp[2]) + vshr.s16 d4, d1, #1 @pi2_src_tmp[1] >> 1 + vshr.s16 d5, d3, #1 @pi2_src_tmp[3] >> 1 + + vsubl.s16 q5, d4, d3 @x2 = D_SHIFT(pi2_src_tmp[1],1,shft) - pi2_src_tmp[3] + + vaddl.s16 q6, d1, d5 @x3 = pi2_src_tmp[1] + D_SHIFT(pi2_src_tmp[3],1,shft) + + vadd.s32 q8, q4, q5 @x1 + x2 + vsub.s32 q9, q4, q5 @x1 - x2 + + vadd.s32 q7, q3, q6 @x0 + x3 + vsub.s32 q10, q3, q6 @x0 - x3 + + vtrn.32 q7, q8 @Transpose the register to have the adjacent values + + vtrn.32 q9, q10 + vadd.s32 d6, d14, d15 @x0(0,1) = (pi4_tblk[0,1] + pi4_tblk[8,9]) + + vsub.s32 d7, d14, d15 @x1(0,1) = (pi4_tblk[0,1] - pi4_tblk[8,9]) + + vshr.s32 d4, d16, #1 @pi4_tblk[4,5] >> 1 + vshr.s32 d5, d17, #1 @pi4_tblk[12,13] >> 1 + + vsub.s32 d8, d4, d17 @x2(0,1) = D_SHIFT(pi4_tblk[4,5],1,shft) - pi4_tblk[12,13] + vadd.s32 d9, d16, d5 @x3(0,1) = pi4_tblk[4,5] + D_SHIFT(pi4_tblk[12,13],1,shft) + + vadd.s32 d10, d18, d19 @x0(2,3) = (pi4_tblk[2,3] + pi4_tblk[10,11]) + vsub.s32 d11, d18, d19 @x1(2,3) = (pi4_tblk[2,3] - pi4_tblk[10,11]) + vshr.s32 d4, d20, #1 @pi4_tblk[6,7] >> 1 + vshr.s32 d5, d21, #1 @pi4_tblk[14,15] >> 1 + + vld1.32 d30[0], [r1], r4 @I row Load pu1_pred buffer + vsub.s32 d12, d4, d21 @x2(2,3) = D_SHIFT(pi4_tblk[6,7],1,shft) - pi4_tblk[14,15] + + vmovl.u8 q15, d30 @I row Convert 8 bit pred buffer to 16 bit + vadd.s32 d13, d20, d5 @x3(2,3) = pi4_tblk[6,7] + D_SHIFT(pi4_tblk[14,15],1,shft) + + vadd.s32 d16, d6, d9 @I row i_macro(0,1) = x0(0,1) + x3(0,1) + + vld1.32 d28[0], [r1], r4 @II row Load pu1_pred buffer + vadd.s32 d17, d10, d13 @I row i_macro(2,3) = x0(2,3) + x3(2,3) + + vqrshrn.s32 d16, q8, #6 @I row i_macro = D_SHIFT(i_macro,6,shft) + + vmovl.u8 q14, d28 @II row Convert 8 bit pred buffer to 16 bit + vadd.u16 d16, d16, d30 @I row i_macro += *pu1_pred_tmp + + vqmovun.s16 d16, q8 @I row CLIP_U8(i_macro) + vadd.s32 d18, d7, d8 @II row i_macro(0,1) = x1(0,1) + x2(0,1) + + vld1.32 d26[0], [r1], r4 @III row Load pu1_pred buffer + vadd.s32 d19, d11, d12 @II row i_macro(2,3) = x1(2,3) + x2(2,3) + + vqrshrn.s32 d18, q9, #6 @II row i_macro = D_SHIFT(i_macro,6,shft) + + vmovl.u8 q13, d26 @III row Convert 8 bit pred buffer to 16 bit + vadd.u16 d18, d18, d28 @II row i_macro += *pu1_pred_tmp + + vst1.32 d16[0], [r2], r5 @I row store the value + vsub.s32 d20, d7, d8 @III row i_macro(0,1) = x1(0,1) - x2(0,1) + + vqmovun.s16 d18, q9 @II row CLIP_U8(i_macro) + vsub.s32 d21, d11, d12 @III row i_macro(2,3) = x1(2,3) - x2(2,3) + + vld1.32 d24[0], [r1], r4 @IV row Load pu1_pred buffer + vqrshrn.s32 d20, q10, #6 @III row i_macro = D_SHIFT(i_macro,6,shft) + + vmovl.u8 q12, d24 @IV row Convert 8 bit pred buffer to 16 bit + vadd.u16 d20, d20, d26 @III row i_macro += *pu1_pred_tmp + + vqmovun.s16 d20, q10 @III row CLIP_U8(i_macro) + vsub.s32 d22, d6, d9 @IV row i_macro(0,1) = x0(0,1) - x3(0,1) + + vst1.32 d18[0], [r2], r5 @II row store the value + vsub.s32 d23, d10, d13 @IV row i_macro(2,3) = x0(2,3) - x3(2,3) + + vqrshrn.s32 d22, q11, #6 @IV row i_macro = D_SHIFT(i_macro,6,shft) + + vst1.32 d20[0], [r2], r5 @III row store the value + vadd.u16 d22, d22, d24 @IV row i_macro += *pu1_pred_tmp + + vqmovun.s16 d22, q11 @IV row CLIP_U8(i_macro) + vst1.32 d22[0], [r2], r5 @IV row store the value + + + vpop {d8-d15} + ldmfd sp!, {r4-r12, r15} @Reload the registers from SP + + + + diff --git 
a/common/arm/ih264_mem_fns_neon.s b/common/arm/ih264_mem_fns_neon.s new file mode 100755 index 0000000..2808897 --- /dev/null +++ b/common/arm/ih264_mem_fns_neon.s @@ -0,0 +1,268 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@ ******************************************************************************* +@ * @file +@ * ih264_mem_fns_neon.s +@ * +@ * @brief +@ * Contains function definitions for memory manipulation +@ * +@ * @author +@ * Naveen SR +@ * +@ * @par List of Functions: +@ * - ih264_memcpy_mul_8_a9q() +@ * - ih264_memcpy_a9q() +@ * - ih264_memset_mul_8_a9q() +@ * - ih264_memset_a9q() +@ * - ih264_memset_16bit_mul_8_a9q() +@ * - ih264_memset_a9q() +@ * +@ * @remarks +@ * None +@ * +@ ******************************************************************************* +@*/ + +@/** +@******************************************************************************* +@* +@* @brief +@* memcpy of a 1d array +@* +@* @par Description: +@* Does memcpy of 8bit data from source to destination for 8,16 or 32 number of bytes +@* +@* @param[in] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[in] num_bytes +@* number of bytes to copy +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@void ih264_memcpy_mul_8(UWORD8 *pu1_dst, +@ UWORD8 *pu1_src, +@ UWORD8 num_bytes) +@**************Variables Vs Registers************************* +@ r0 => *pu1_dst +@ r1 => *pu1_src +@ r2 => num_bytes + +.text +.p2align 2 + + + .global ih264_memcpy_mul_8_a9q + +ih264_memcpy_mul_8_a9q: + +loop_neon_memcpy_mul_8: + @ Memcpy 8 bytes + vld1.8 d0, [r1]! + vst1.8 d0, [r0]! + + subs r2, r2, #8 + bne loop_neon_memcpy_mul_8 + bx lr + + + +@******************************************************************************* +@*/ +@void ih264_memcpy(UWORD8 *pu1_dst, +@ UWORD8 *pu1_src, +@ UWORD8 num_bytes) +@**************Variables Vs Registers************************* +@ r0 => *pu1_dst +@ r1 => *pu1_src +@ r2 => num_bytes + + + + .global ih264_memcpy_a9q + +ih264_memcpy_a9q: + subs r2, #8 + blt memcpy +loop_neon_memcpy: + @ Memcpy 8 bytes + vld1.8 d0, [r1]! + vst1.8 d0, [r0]! 
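+ @ On exit from this loop r2 holds (num_bytes % 8) - 8, a value in
+ @ [-8, -1]: the cmp/bxeq below returns early when nothing is left,
+ @ otherwise 8 is added back and the scalar tail runs. In C, the tail
+ @ is simply: while(num_bytes--) *pu1_dst++ = *pu1_src++;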
+ + subs r2, #8 + bge loop_neon_memcpy + cmp r2, #-8 + bxeq lr + +memcpy: + add r2, #8 + +loop_memcpy: + ldrb r3, [r1], #1 + strb r3, [r0], #1 + subs r2, #1 + bne loop_memcpy + bx lr + + + + +@void ih264_memset_mul_8(UWORD8 *pu1_dst, +@ UWORD8 value, +@ UWORD8 num_bytes) +@**************Variables Vs Registers************************* +@ r0 => *pu1_dst +@ r1 => value +@ r2 => num_bytes + + + + .global ih264_memset_mul_8_a9q + +ih264_memset_mul_8_a9q: + +@ Assumptions: numbytes is either 8, 16 or 32 + vdup.8 d0, r1 +loop_memset_mul_8: + @ Memset 8 bytes + vst1.8 d0, [r0]! + + subs r2, r2, #8 + bne loop_memset_mul_8 + + bx lr + + + + +@void ih264_memset(UWORD8 *pu1_dst, +@ UWORD8 value, +@ UWORD8 num_bytes) +@**************Variables Vs Registers************************* +@ r0 => *pu1_dst +@ r1 => value +@ r2 => num_bytes + + + + .global ih264_memset_a9q + +ih264_memset_a9q: + subs r2, #8 + blt memset + vdup.8 d0, r1 +loop_neon_memset: + @ Memcpy 8 bytes + vst1.8 d0, [r0]! + + subs r2, #8 + bge loop_neon_memset + cmp r2, #-8 + bxeq lr + +memset: + add r2, #8 + +loop_memset: + strb r1, [r0], #1 + subs r2, #1 + bne loop_memset + bx lr + + + + +@void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst, +@ UWORD16 value, +@ UWORD8 num_words) +@**************Variables Vs Registers************************* +@ r0 => *pu2_dst +@ r1 => value +@ r2 => num_words + + + + .global ih264_memset_16bit_mul_8_a9q + +ih264_memset_16bit_mul_8_a9q: + +@ Assumptions: num_words is either 8, 16 or 32 + + @ Memset 8 words + vdup.16 d0, r1 +loop_memset_16bit_mul_8: + vst1.16 d0, [r0]! + vst1.16 d0, [r0]! + + subs r2, r2, #8 + bne loop_memset_16bit_mul_8 + + bx lr + + + + +@void ih264_memset_16bit(UWORD16 *pu2_dst, +@ UWORD16 value, +@ UWORD8 num_words) +@**************Variables Vs Registers************************* +@ r0 => *pu2_dst +@ r1 => value +@ r2 => num_words + + + + .global ih264_memset_16bit_a9q + +ih264_memset_16bit_a9q: + subs r2, #8 + blt memset_16bit + vdup.16 d0, r1 +loop_neon_memset_16bit: + @ Memset 8 words + vst1.16 d0, [r0]! + vst1.16 d0, [r0]! + + subs r2, #8 + bge loop_neon_memset_16bit + cmp r2, #-8 + bxeq lr + +memset_16bit: + add r2, #8 + +loop_memset_16bit: + strh r1, [r0], #2 + subs r2, #1 + bne loop_memset_16bit + bx lr + + + + diff --git a/common/arm/ih264_padding_neon.s b/common/arm/ih264_padding_neon.s new file mode 100755 index 0000000..9bab268 --- /dev/null +++ b/common/arm/ih264_padding_neon.s @@ -0,0 +1,646 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +@*/ +@/** +@ ******************************************************************************* +@ * @file +@ * ih264_padding_neon.s +@ * +@ * @brief +@ * Contains function definitions padding +@ * +@ * @author +@ * Ittiam +@ * +@ * @par List of Functions: +@ * - ih264_pad_top_a9q() +@ * - ih264_pad_left_luma_a9q() +@ * - ih264_pad_left_chroma_a9q() +@ * - ih264_pad_right_luma_a9q() +@ * - ih264_pad_right_chroma_a9q() +@ * +@ * @remarks +@ * None +@ * +@ ******************************************************************************* +@*/ + + +@/** +@******************************************************************************* +@* +@* @brief pad at the top of a 2d array +@* +@* @par Description: +@* The top row of a 2d array is replicated for pad_size times at the top +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] wd +@* integer width of the array +@* +@* @param[in] pad_size +@* integer -padding size of the array +@* +@* @returns none +@* +@* @remarks none +@* +@******************************************************************************* +@*/ +@void ih264_pad_top(UWORD8 *pu1_src, +@ WORD32 src_strd, +@ WORD32 wd, +@ WORD32 pad_size) +@**************Variables Vs Registers************************* +@ r0 => *pu1_src +@ r1 => src_strd +@ r2 => wd +@ r3 => pad_size + +.text +.p2align 2 + + .global ih264_pad_top_a9q + +ih264_pad_top_a9q: + + stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments + + sub r5, r0, r1 + rsb r6, r1, #0 + +loop_neon_memcpy_mul_16: + @ Load 16 bytes + vld1.8 {d0, d1}, [r0]! + mov r4, r5 + mov r7, r3 + add r5, r5, #16 + +loop_neon_pad_top: + vst1.8 {d0, d1}, [r4], r6 + subs r7, r7, #1 + bne loop_neon_pad_top + + subs r2, r2, #16 + bne loop_neon_memcpy_mul_16 + + ldmfd sp!, {r4-r11, pc} @Reload the registers from SP + + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Padding (luma block) at the left of a 2d array +@* +@* @par Description: +@* The left column of a 2d array is replicated for pad_size times at the left +@* +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @param[in] pad_size +@* integer -padding size of the array +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@#if PAD_LEFT_LUMA == C +@void ih264_pad_left_luma(UWORD8 *pu1_src, +@ WORD32 src_strd, +@ WORD32 ht, +@ WORD32 pad_size) +@**************Variables Vs Registers************************* +@ r0 => *pu1_src +@ r1 => src_strd +@ r2 => ht +@ r3 => pad_size + + + .global ih264_pad_left_luma_a9q + +ih264_pad_left_luma_a9q: + + stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments + + + sub r4, r0, r3 + sub r6, r1, #16 + subs r5, r3, #16 + bne loop_32 +loop_16: @ /*hard coded for width=16 ,height =8,16*/ + ldrb r8, [r0], r1 + ldrb r9, [r0], r1 + vdup.u8 q0, r8 + ldrb r10, [r0], r1 + vst1.8 {q0}, [r4], r1 @ 16 bytes store + vdup.u8 q1, r9 + vst1.8 {q1}, [r4], r1 @ 16 bytes store + ldrb r11, [r0], r1 + vdup.u8 q2, r10 + vdup.u8 q3, r11 + vst1.8 {q2}, [r4], r1 @ 16 bytes store + ldrb r8, [r0], r1 + vst1.8 {q3}, [r4], r1 @ 16 bytes store + ldrb r9, [r0], r1 + 
vdup.u8 q0, r8 + ldrb r10, [r0], r1 + vst1.8 {q0}, [r4], r1 @ 16 bytes store + vdup.u8 q1, r9 + ldrb r11, [r0], r1 + vst1.8 {q1}, [r4], r1 @ 16 bytes store + vdup.u8 q2, r10 + vdup.u8 q3, r11 + subs r2, r2, #8 + vst1.8 {q2}, [r4], r1 @ 16 bytes store + vst1.8 {q3}, [r4], r1 @ 16 bytes store + bne loop_16 + b end_func + +loop_32: @ /*hard coded for width=32 ,height =8,16*/ + ldrb r8, [r0], r1 + ldrb r9, [r0], r1 + vdup.u8 q0, r8 + ldrb r10, [r0], r1 + vst1.8 {q0}, [r4]! @ 16 bytes store + vdup.u8 q1, r9 + vst1.8 {q0}, [r4], r6 + vst1.8 {q1}, [r4]! @ 16 bytes store + vdup.u8 q2, r10 + vst1.8 {q1}, [r4], r6 @ 16 bytes store + ldrb r11, [r0], r1 + vst1.8 {q2}, [r4]! @ 16 bytes store + vdup.u8 q3, r11 + vst1.8 {q2}, [r4], r6 @ 16 bytes store + ldrb r8, [r0], r1 + vst1.8 {q3}, [r4]! @ 16 bytes store + vdup.u8 q0, r8 + ldrb r9, [r0], r1 + vst1.8 {q3}, [r4], r6 @ 16 bytes store + ldrb r10, [r0], r1 + vst1.8 {q0}, [r4]! @ 16 bytes store + vdup.u8 q1, r9 + vst1.8 {q0}, [r4], r6 @ 16 bytes store + ldrb r11, [r0], r1 + vst1.8 {q1}, [r4]! @ 16 bytes store + vdup.u8 q2, r10 + vst1.8 {q1}, [r4], r6 @ 16 bytes store + vst1.8 {q2}, [r4]! @ 16 bytes store + vdup.u8 q3, r11 + vst1.8 {q2}, [r4], r6 @ 16 bytes store + subs r2, r2, #8 + vst1.8 {q3}, [r4]! @ 16 bytes store + vst1.8 {q3}, [r4], r6 @ 16 bytes store + bne loop_32 + + + +end_func: + ldmfd sp!, {r4-r11, pc} @Reload the registers from SP + + + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Padding (chroma block) at the left of a 2d array +@* +@* @par Description: +@* The left column of a 2d array is replicated for pad_size times at the left +@* +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array (each colour component) +@* +@* @param[in] pad_size +@* integer -padding size of the array +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@#if PAD_LEFT_CHROMA == C +@void ih264_pad_left_chroma(UWORD8 *pu1_src, +@ WORD32 src_strd, +@ WORD32 ht, +@ WORD32 pad_size) +@{ +@ r0 => *pu1_src +@ r1 => src_strd +@ r2 => ht +@ r3 => pad_size + + + + .global ih264_pad_left_chroma_a9q + +ih264_pad_left_chroma_a9q: + + stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments + + sub r4, r0, r3 + sub r6, r1, #16 + + +loop_32_l_c: @ /*hard coded for width=32 ,height =4,8,12*/ + ldrh r8, [r0], r1 + ldrh r9, [r0], r1 + vdup.u16 q0, r8 + ldrh r10, [r0], r1 + vst1.8 {q0}, [r4]! @ 16 bytes store + vdup.u16 q1, r9 + vst1.8 {q0}, [r4], r6 @ 16 bytes store + ldrh r11, [r0], r1 + vst1.8 {q1}, [r4]! @ 16 bytes store + vdup.u16 q2, r10 + vst1.8 {q1}, [r4], r6 @ 16 bytes store + vdup.u16 q3, r11 + vst1.8 {q2}, [r4]! @ 16 bytes store + vst1.8 {q2}, [r4], r6 @ 16 bytes store + subs r2, r2, #4 + vst1.8 {q3}, [r4]! @ 16 bytes store + vst1.8 {q3}, [r4], r6 @ 16 bytes store + + + beq end_func_l_c @/* Branching when ht=4*/ + + ldrh r8, [r0], r1 + ldrh r9, [r0], r1 + vdup.u16 q0, r8 + ldrh r10, [r0], r1 + vst1.8 {q0}, [r4]! @ 16 bytes store + vdup.u16 q1, r9 + vst1.8 {q0}, [r4], r6 + ldrh r11, [r0], r1 + vst1.8 {q1}, [r4]! @ 16 bytes store + vdup.u16 q2, r10 + vst1.8 {q1}, [r4], r6 @ 16 bytes store + vdup.u16 q3, r11 + vst1.8 {q2}, [r4]! 
@ 16 bytes store + vst1.8 {q2}, [r4], r6 @ 16 bytes store + subs r2, r2, #4 + vst1.8 {q3}, [r4]! @ 16 bytes store + vst1.8 {q3}, [r4], r6 @ 16 bytes store + + beq end_func_l_c @/* Branching when ht=8*/ + bne loop_32_l_c + + ldrh r8, [r0], r1 + ldrh r9, [r0], r1 + vdup.u16 q0, r8 + ldrh r10, [r0], r1 + vst1.8 {q0}, [r4]! @ 16 bytes store + vdup.u16 q1, r9 + vst1.8 {q0}, [r4], r6 + ldrh r11, [r0], r1 + vst1.8 {q1}, [r4]! @ 16 bytes store + vdup.u16 q2, r10 + vst1.8 {q1}, [r4], r6 @ 16 bytes store + vdup.u16 q3, r11 + vst1.8 {q2}, [r4]! @ 16 bytes store + vst1.8 {q2}, [r4], r6 @ 16 bytes store + vst1.8 {q3}, [r4]! @ 16 bytes store + vst1.8 {q3}, [r4], r6 @ 16 bytes store + +end_func_l_c: + ldmfd sp!, {r4-r11, pc} @Reload the registers from SP + + + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Padding (luma block) at the right of a 2d array +@* +@* @par Description: +@* The right column of a 2d array is replicated for pad_size times at the right +@* +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @param[in] pad_size +@* integer -padding size of the array +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@#if PAD_RIGHT_LUMA == C +@void ih264_pad_right_luma(UWORD8 *pu1_src, +@ WORD32 src_strd, +@ WORD32 ht, +@ WORD32 pad_size) +@{ +@ WORD32 row; +@ +@ for(row = 0; row < ht; row++) +@ { +@ memset(pu1_src, *(pu1_src -1), pad_size); +@ +@ pu1_src += src_strd; +@ } +@} +@ +@ r0 => *pu1_src +@ r1 => src_strd +@ r2 => ht +@ r3 => pad_size + + + + .global ih264_pad_right_luma_a9q + +ih264_pad_right_luma_a9q: + + stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments + + mov r4, r0 + sub r6, r1, #16 + sub r0, r0, #1 + subs r5, r3, #16 + bne loop_32 +loop_16_r: @ /*hard coded for width=16 ,height =8,16*/ + ldrb r8, [r0], r1 + ldrb r9, [r0], r1 + vdup.u8 q0, r8 + ldrb r10, [r0], r1 + vst1.8 {q0}, [r4], r1 @ 16 bytes store + vdup.u8 q1, r9 + vst1.8 {q1}, [r4], r1 @ 16 bytes store + ldrb r11, [r0], r1 + vdup.u8 q2, r10 + vdup.u8 q3, r11 + vst1.8 {q2}, [r4], r1 @ 16 bytes store + ldrb r8, [r0], r1 + vst1.8 {q3}, [r4], r1 @ 16 bytes store + ldrb r9, [r0], r1 + vdup.u8 q0, r8 + ldrb r10, [r0], r1 + vst1.8 {q0}, [r4], r1 @ 16 bytes store + vdup.u8 q1, r9 + ldrb r11, [r0], r1 + vst1.8 {q1}, [r4], r1 @ 16 bytes store + vdup.u8 q2, r10 + vdup.u8 q3, r11 + subs r2, r2, #8 + vst1.8 {q2}, [r4], r1 @ 16 bytes store + vst1.8 {q3}, [r4], r1 @ 16 bytes store + bne loop_16_r + b end_func_r + +loop_32_r: @ /*hard coded for width=32 ,height =8,16*/ + ldrb r8, [r0], r1 + ldrb r9, [r0], r1 + vdup.u8 q0, r8 + ldrb r10, [r0], r1 + vst1.8 {q0}, [r4]! @ 16 bytes store + vdup.u8 q1, r9 + vst1.8 {q0}, [r4], r6 + vst1.8 {q1}, [r4]! @ 16 bytes store + vdup.u8 q2, r10 + vst1.8 {q1}, [r4], r6 @ 16 bytes store + ldrb r11, [r0], r1 + vst1.8 {q2}, [r4]! @ 16 bytes store + vdup.u8 q3, r11 + vst1.8 {q2}, [r4], r6 @ 16 bytes store + ldrb r8, [r0], r1 + vst1.8 {q3}, [r4]! @ 16 bytes store + ldrb r9, [r0], r1 + vdup.u8 q0, r8 + vst1.8 {q3}, [r4], r6 @ 16 bytes store + ldrb r10, [r0], r1 + vst1.8 {q0}, [r4]! 
@ 16 bytes store + vdup.u8 q1, r9 + vst1.8 {q0}, [r4], r6 @ 16 bytes store + ldrb r11, [r0], r1 + vst1.8 {q1}, [r4]! @ 16 bytes store + vdup.u8 q2, r10 + vst1.8 {q1}, [r4], r6 @ 16 bytes store + vst1.8 {q2}, [r4]! @ 16 bytes store + vdup.u8 q3, r11 + vst1.8 {q2}, [r4], r6 @ 16 bytes store + subs r2, r2, #8 + vst1.8 {q3}, [r4]! @ 16 bytes store + vst1.8 {q3}, [r4], r6 @ 16 bytes store + bne loop_32_r + + + +end_func_r: + ldmfd sp!, {r4-r11, pc} @Reload the registers from SP + + + + + +@/** +@******************************************************************************* +@* +@* @brief +@;* Padding (chroma block) at the right of a 2d array +@* +@* @par Description: +@* The right column of a 2d array is replicated for pad_size times at the right +@* +@* +@* @param[in] pu1_src +@;* UWORD8 pointer to the source +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] ht +@;* integer height of the array +@* +@* @param[in] wd +@* integer width of the array (each colour component) +@* +@* @param[in] pad_size +@* integer -padding size of the array +@* +@* @param[in] ht +@;* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@#if PAD_RIGHT_CHROMA == C +@void ih264_pad_right_chroma(UWORD8 *pu1_src, +@ WORD32 src_strd, +@ WORD32 ht, +@ WORD32 pad_size) +@ r0 => *pu1_src +@ r1 => src_strd +@ r2 => ht +@ r3 => pad_size + + + + .global ih264_pad_right_chroma_a9q + +ih264_pad_right_chroma_a9q: + + stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments + + mov r4, r0 + sub r6, r1, #16 + sub r0, r0, #2 +loop_32_r_c: @ /*hard coded for width=32 ,height =8,4*/ + ldrh r8, [r0], r1 + ldrh r9, [r0], r1 + vdup.u16 q0, r8 + ldrh r10, [r0], r1 + vst1.8 {q0}, [r4]! @ 16 bytes store + vdup.u16 q1, r9 + vst1.8 {q0}, [r4], r6 + vst1.8 {q1}, [r4]! @ 16 bytes store + vdup.u16 q2, r10 + vst1.8 {q1}, [r4], r6 @ 16 bytes store + subs r2, r2, #4 + ldrh r11, [r0], r1 + vst1.8 {q2}, [r4]! @ 16 bytes store + vdup.u16 q3, r11 + vst1.8 {q2}, [r4], r6 @ 16 bytes store + vst1.8 {q3}, [r4]! @ 16 bytes store + vst1.8 {q3}, [r4], r6 @ 16 bytes store + + beq end_func_r_c @/* Branching when ht=4*/ + + ldrh r8, [r0], r1 + vdup.u16 q0, r8 + ldrh r9, [r0], r1 + ldrh r10, [r0], r1 + vst1.8 {q0}, [r4]! @ 16 bytes store + vdup.u16 q1, r9 + vst1.8 {q0}, [r4], r6 @ 16 bytes store + ldrh r11, [r0], r1 + vst1.8 {q1}, [r4]! @ 16 bytes store + vdup.u16 q2, r10 + vst1.8 {q1}, [r4], r6 @ 16 bytes store + vst1.8 {q2}, [r4]! @ 16 bytes store + vdup.u16 q3, r11 + vst1.8 {q2}, [r4], r6 @ 16 bytes store + subs r2, r2, #4 + vst1.8 {q3}, [r4]! @ 16 bytes store + vst1.8 {q3}, [r4], r6 @ 16 bytes store + + beq end_func_r_c @/* Branching when ht=8*/ + bne loop_32_r_c + + ldrh r8, [r0], r1 + vdup.u16 q0, r8 + ldrh r9, [r0], r1 + ldrh r10, [r0], r1 + vst1.8 {q0}, [r4]! @ 16 bytes store + vdup.u16 q1, r9 + vst1.8 {q0}, [r4], r6 @ 16 bytes store + ldrh r11, [r0], r1 + vst1.8 {q1}, [r4]! @ 16 bytes store + vdup.u16 q2, r10 + vst1.8 {q1}, [r4], r6 @ 16 bytes store + vst1.8 {q2}, [r4]! @ 16 bytes store + vdup.u16 q3, r11 + vst1.8 {q2}, [r4], r6 @ 16 bytes store + vst1.8 {q3}, [r4]! 
@ 16 bytes store + vst1.8 {q3}, [r4], r6 @ 16 bytes store + +end_func_r_c: + ldmfd sp!, {r4-r11, pc} @Reload the registers from SP + + + + + diff --git a/common/arm/ih264_platform_macros.h b/common/arm/ih264_platform_macros.h new file mode 100755 index 0000000..1f67403 --- /dev/null +++ b/common/arm/ih264_platform_macros.h @@ -0,0 +1,152 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_platform_macros.h +* +* @brief +* Platform specific Macro definitions used in the codec +* +* @author +* Ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ +#ifndef _IHEVC_PLATFORM_MACROS_H_ +#define _IHEVC_PLATFORM_MACROS_H_ + +#ifndef ARMV8 +void ih264_arm_dsb(void); + +#define DATA_SYNC() ih264_arm_dsb() +static __inline WORD32 CLIP_U8(WORD32 x) +{ + asm("usat %0, #8, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_S8(WORD32 x) +{ + asm("ssat %0, #8, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_U10(WORD32 x) +{ + asm("usat %0, #10, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_S10(WORD32 x) +{ + asm("ssat %0, #10, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_U12(WORD32 x) +{ + asm("usat %0, #12, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_S12(WORD32 x) +{ + asm("ssat %0, #12, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_U16(WORD32 x) +{ + asm("usat %0, #16, %1" : "=r"(x) : "r"(x)); + return x; +} +static __inline WORD32 CLIP_S16(WORD32 x) +{ + asm("ssat %0, #16, %1" : "=r"(x) : "r"(x)); + return x; +} + + +static __inline UWORD32 ITT_BIG_ENDIAN(UWORD32 x) +{ + asm("rev %0, %1" : "=r"(x) : "r"(x)); + return x; +} +#else +#define DATA_SYNC() ; + +#define CLIP_U8(x) CLIP3(0, 255, (x)) +#define CLIP_S8(x) CLIP3(-128, 127, (x)) + +#define CLIP_U10(x) CLIP3(0, 1023, (x)) +#define CLIP_S10(x) CLIP3(-512, 511, (x)) + +#define CLIP_U12(x) CLIP3(0, 4095, (x)) +#define CLIP_S12(x) CLIP3(-2048, 2047, (x)) + +#define CLIP_U16(x) CLIP3(0, 65535, (x)) +#define CLIP_S16(x) CLIP3(-32768, 32767, (x)) + +#define ITT_BIG_ENDIAN(x) ((x & 0x000000ff) << 24) | \ + ((x & 0x0000ff00) << 8) | \ + ((x & 0x00ff0000) >> 8) | \ + ((UWORD32)x >> 24); +#endif + +#define SHL(x,y) (((y) < 32) ? ((x) << (y)) : 0) +#define SHR(x,y) (((y) < 32) ? 
((x) >> (y)) : 0) + +#define SHR_NEG(val,shift) ((shift>0)?(val>>shift):(val<<(-shift))) +#define SHL_NEG(val,shift) ((shift<0)?(val>>(-shift)):(val<<shift)) + +#define INLINE inline + +static INLINE UWORD32 CLZ(UWORD32 u4_word) +{ + if(u4_word) + return (__builtin_clz(u4_word)); + else + return 32; +} +static INLINE UWORD32 CTZ(UWORD32 u4_word) +{ + if(0 == u4_word) + return 31; + else + { + unsigned int index; + index = __builtin_ctz(u4_word); + return (UWORD32)index; + } +} + + +#define NOP(nop_cnt) {UWORD32 nop_i; for (nop_i = 0; nop_i < nop_cnt; nop_i++);} + + +#define MEM_ALIGN8 __attribute__ ((aligned (8))) +#define MEM_ALIGN16 __attribute__ ((aligned (16))) +#define MEM_ALIGN32 __attribute__ ((aligned (32))) + +#endif /* _IHEVC_PLATFORM_MACROS_H_ */ diff --git a/common/arm/ih264_resi_trans_a9.s b/common/arm/ih264_resi_trans_a9.s new file mode 100755 index 0000000..08821f5 --- /dev/null +++ b/common/arm/ih264_resi_trans_a9.s @@ -0,0 +1,604 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +@*/ +@/** +@******************************************************************************* +@* @file +@* ih264_resi_trans_a9.s +@* +@* @brief +@* Contains function definitions for residual and forward trans +@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* ih264_resi_trans_4x4_a9 +@* ih264_resi_trans_8x8_a9 +@* @remarks +@* None +@* +@******************************************************************************* + + +.text +.p2align 2 +@***************************************************************************** +@* +@* Function Name : ih264_resi_trans_4x4_a9 +@* Description : This function does cf4 of H264 followed by and approximate scaling +@* +@* Arguments : +@ R0 :pointer to src buffer +@ R1 :pointer to pred buffer +@ R2 :pointer to dst buffer +@ R3 :src_stride +@ STACk :pred_stride,dst_stride + +@* Values Returned : NONE +@* +@* Register Usage : +@* Stack Usage : +@* Cycles : Around +@* Interruptiaility : Interruptable +@* +@* Known Limitations +@* \Assumptions : +@* +@* Revision History : +@* DD MM YYYY Author(s) Changes +@* 30 12 2009 100633 First version +@* +@***************************************************************************** + + + .global ih264_resi_trans_4x4_a9 + .extern g_scal_coff_h264_4x4 +g_scal_coff_h264_4x4_addr: + .long g_scal_coff_h264_4x4 - 4x4lbl - 8 + +ih264_resi_trans_4x4_a9: + + @R0 :pointer to src buffer + @R1 :pointer to pred buffer + @R2 :pointer to dst buffer + @R3 :src_stride + @STACk :pred_stride,dst_stride + + push {r4-r12, lr} @push all the variables first + + mov r6, sp + add r6, r6, #40 @decrement stack pointer,to accomodate two variables + ldmfd r6, {r4-r5} @load the strides into registers + @R4 pred_stride + @R5 dst_stride + + + @we have to give the stride as post inrement in VLDR1 + @but since thr stride is from end of row 1 to start of row 2, + @we need to add the size of the curent row to strides ie we need to add 4 to it (4 bytes) + @ADD R3,#4 + @ADD R4,#4 + @ADD R5,#4 + @in case of dst the stride represnts 16 bit ie 2*8bits + @hence we need to add #4 to it and thenm multiply by 2 + @--------------------function loading done------------------------ + + @lets find residual + @data is like 1a -> d0[1:31] d0[32:64] + @ a b c d # # # # + vld1.u8 d30, [r0], r3 @load 4 pixels of row1 current buffer + vld1.u8 d31, [r1], r4 @load 4 pixels of row1 pred buffer + @ data is like 1a -> q4[1:63] q4[64:148] + @ d8[1:63] d9[1:63] + @ a b c d # # # # + + vld1.u8 d28, [r0], r3 @load row 2 of src to d28[0] + vld1.u8 d29, [r1], r4 @load row2 of pred to d29[0] + + vld1.u8 d26, [r0], r3 @load row 3 of src to d26[0] + vsubl.u8 q0, d30, d31 @curr - pred for row one + + vld1.u8 d27, [r1], r4 @load row 3of pred t0 d27[0] + vsubl.u8 q1, d28, d29 @find row 2 of src -pred to d0 + + vld1.u8 d24, [r0], r3 @load row 4 of src to d24[0] + + vld1.u8 d25, [r1], r4 @load row 4 of src tp d25[0] + vsubl.u8 q2, d26, d27 @load src-pred row 3 to d[2] + + lsl r5, r5, #2 @ multiply dst stride by since we are storing 32 bit values + ldr r6, g_scal_coff_h264_4x4_addr +4x4lbl: + add r6, r6, pc @ load the address of global array + + vsubl.u8 q3, d24, d25 @load row 4 of src - pred to q6 + + @after this + @D0 -> 1a + @D2 -> 2a + @D4 -> 3a + @D6 -> 4a + + @transpose the matrix so that we can do the horizontal transform first + @#1 #2 #3 #4 + @a b c d ---- D0 + @e f g h -----D2 + @i j k l -----D4 + @m n o p -----D6 + @transpose the inner 2x2 blocks + vtrn.16 d0, d2 + vld1.s16 {q10}, [r6]! 
@ load the scaling values 0-7; + vtrn.16 d4, d6 + @a e c g + @b f d h + @i m k o + @j n l p + vtrn.32 d0, d4 + vtrn.32 d2, d6 + @a e i m #1 -- D0 --- x4 + @b f j n #2 -- D2 --- x5 + @c g k o #3 -- D4 ----x6 + @d h l p #4 -- D6 ----x7 + + @we have loaded the residuals into the registers , now we need to add and subtract them + @let us do the horiz transform first + + vsub.s16 d5, d2, d4 @x2 = x5-x6 + vsub.s16 d7, d0, d6 @x3 = x4-x7; + + vadd.s16 d3, d2, d4 @x1 = x5+x6 + vadd.s16 d1, d0, d6 @x0 = x4+x7 + + + vshl.s16 d31, d7, #1 @ + vshl.s16 d30, d5, #1 @ + + vadd.s16 d0, d1, d3 @x0 + x1; + vsub.s16 d4, d1, d3 @x0 - x1; + + vadd.s16 d2, d31, d5 @U_SHIFT(x3,1,shft) + x2; + vsub.s16 d6, d7, d30 @x3 - U_SHIFT(x2,1,shft); + + @taking transform again so as to make do vert transform + vtrn.16 d0, d2 + vtrn.16 d4, d6 + + vtrn.32 d0, d4 + vtrn.32 d2, d6 + + @let us do vertical transform + @same code as horiz + + vadd.s16 d1, d0, d6 @x0 = x4+x7 + vadd.s16 d3, d2, d4 @x1 = x5+x6 + vsub.s16 d7, d0, d6 @x3 = x4-x7; + vsub.s16 d5, d2, d4 @x2 = x5-x6 + + +@Since we are going to do scal / quant or whatever, we are going to divide by +@a 32 bit number. So we have to expand the values + + @VADDL.S16 Q12,D1,D3;x0 + x1 + @VSUBL.S16 Q14,D1,D3;x0 - x1 + + @VSHL.S16 D8,D5,#1; + @VSHL.S16 D9,D7,#1; + + @VADDL.S16 Q13,D9,D5 ; + x2 + @VSUBL.S16 Q15,D7,D8 ;x3 - U_SHIFT(x2,1,shft) + +@scaling follows + +@now we need to do the scaling,so load the scaling matrix +@mutliplying by the scaling coeffient; store the results from q5-q8 ; + + vadd.s16 d24, d3, d1 @x4 = x0 + x1 + vsub.s16 d28, d1, d3 @x6 = x0 - x1 + + vshl.s16 d0, d7, #1 @ U_SHIFT(x3,1,shft) + vmull.s16 q4, d24, d20 @x4*s0 + + vshl.s16 d2, d5, #1 @ U_SHIFT(x2,1,shft) + + vadd.s16 d26, d0, d5 @x5 = U_SHIFT(x3,1,shft) + x2 + vmull.s16 q5, d26, d21 @x5*s1 + + vst1.s32 {q4}, [r2], r5 @save 4 pixels of row1 current buffer and increment pointer by stride + + vld1.s16 {q10}, [r6] @load 8-16 scaling coeffcients + + vsub.s16 d30, d7, d2 @x7 = x3 - U_SHIFT(x2,1,shft) + + vmull.s16 q6, d28, d20 @x6*s2 + vst1.s32 {q5}, [r2], r5 + + vmull.s16 q7, d30, d21 @x7*s3 + + + vst1.s32 {q6}, [r2], r5 + vst1.s32 {q7}, [r2] + + pop {r4-r12, pc} @pop back all variables + + + + +@***************************************************************************** +@* Function Name : ih264_resi_trans_8x8_a9 +@* Description : This function does cf8 followd by an approximate normalization of H264 +@* +@* Arguments : +@* R0 :pointer to src buffer +@ R1 :pointer to pred buffer +@ R2 :pointer to dst buffer +@ R3 :src_stride +@ STACk :pred_stride,dst_st +@* +@* +@* Values Returned : NONE +@* +@* Register Usage : +@* Stack Usage : +@* Cycles : Around +@* Interruptiaility : Interruptable +@* +@* Known Limitations +@* \Assumptions : +@* +@* Revision History : +@* DD MM YYYY Author(s) Changes +@* 30 12 2009 100633 First version +@* +@***************************************************************************** + + + .global ih264_resi_trans_8x8_a9 + .extern g_scal_coff_h264_8x8 +g_scal_coff_h264_8x8_addr: + .long g_scal_coff_h264_8x8 - 8x8lbl - 8 + + +ih264_resi_trans_8x8_a9: + + @R0 :pointer to src buffer + @R1 :pointer to pred buffer + @R2 :pointer to dst buffer + @R3 :src_stride + @STACk :pred_stride,dst_stride + + push {r4-r12, lr} @push all the variables first + + mov r6, sp + add r6, r6, #40 @decrement stack pointer,to accomodate two variables + ldmfd r6, {r4-r5} @load the strides into registers + @R4 pred_stride + @R5 dst_stride + + @we have to give the stride as post inrement in vst1 + @in case 
of dst the stride represnts 16 bit ie 2*8bits + @hence we need to add #4 to it and thenm multiply by 2 + @--------------------function loading done------------------------ + + @lets find residual + @data is like 1a -> d0[1:31] d0[32:64] + @ a b c d # # # # + vld1.u8 d30, [r0], r3 @load 4 pixels of row1 current buffer + vld1.u8 d31, [r1], r4 @load 4 pixels of row1 pred buffer + + vld1.u8 d28, [r0], r3 @src rw2 + vld1.u8 d29, [r1], r4 @pred rw2 + vsubl.u8 q0, d30, d31 @src-pred rw1 + + vld1.u8 d26, [r0], r3 + vld1.u8 d27, [r1], r4 + vsubl.u8 q1, d28, d29 + + vld1.u8 d24, [r0], r3 + vld1.u8 d25, [r1], r4 + vsubl.u8 q2, d26, d27 + + vld1.u8 d22, [r0], r3 + vld1.u8 d23, [r1], r4 + vsubl.u8 q3, d24, d25 + + vld1.u8 d20, [r0], r3 + vld1.u8 d21, [r1], r4 + vsubl.u8 q4, d22, d23 + + vld1.u8 d18, [r0], r3 + vld1.u8 d19, [r1], r4 + vsubl.u8 q5, d20, d21 + + vld1.u8 d16, [r0], r3 + vld1.u8 d17, [r1], r4 + vsubl.u8 q6, d18, d19 + + lsl r5, r5, #2 + + + vsubl.u8 q7, d16, d17 + + @after this + @Q0 -> 1a + @Q1 -> 2a + @Q2 -> 3a + @Q3 -> 4a + @Q4 -> 5a + @Q5 -> 6a + @Q6 -> 7a + @Q7 -> 8a + + @transpose the matrix so that we can do the horizontal transform first + + @transpose the inner 2x2 blocks + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + @transpose the inner 4x4 blocks + vtrn.32 q0, q2 + vtrn.32 q1, q3 + + vtrn.32 q4, q6 + vtrn.32 q5, q7 + + @transpose the outer 8x8 blocks + vswp d1, d8 + vswp d7, d14 + vswp d3, d10 + vswp d5, d12 + @transpose done + +@@this point we will have data in Q0-Q7 +@Q7 will be populated within 2 clock cycle +@all others are availabe @ this clock cycle + + @we have loaded the residuals into the registers , now we need to add and subtract them + @let us do the horiz transform first + + vadd.s16 q8, q0, q7 @ a0 = r0 + r7; + vadd.s16 q9, q1, q6 @ a1 = r1 + r6; + vadd.s16 q10, q2, q5 @ a2 = r2 + r5; + vadd.s16 q11, q3, q4 @ a3 = r3 + r4; + + vsub.s16 q12, q0, q7 @ b0 = r0 - r7; + vsub.s16 q13, q1, q6 @ b1 = r1 - r6; + vsub.s16 q15, q3, q4 @ b3 = r3 - r4; + vsub.s16 q14, q2, q5 @ b2 = r2 - r5; + + vadd.s16 q1, q8, q11 @ a4 = a0 + a3; + vadd.s16 q3, q9, q10 @ a5 = a1 + a2; + vsub.s16 q7, q9, q10 @ a7 = a1 - a2; + vsub.s16 q5, q8, q11 @ a6 = a0 - a3; + + ldr r6, g_scal_coff_h264_8x8_addr +8x8lbl: + add r6, r6, pc @ load the address of global array + + vadd.s16 q0, q1, q3 @ pi2_res[0] = a4 + a5; + vshr.s16 q8, q7, #1 @ pi2_res[2] = a6 + D_SHIFT(a7,1,shft); + + vsub.s16 q4, q1, q3 @ pi2_res[4] = a4 - a5; + + vadd.s16 q2, q5, q8 @ + + + vshr.s16 q9, q5, #1 @ pi2_res[6] = D_SHIFT(a6,1,shft) - a7; + vsub.s16 q6, q9, q7 @ + +@do not change Q0,Q2.Q4,Q6 they contain results +@Q1,Q3,Q5,Q7 TO STORE RESULTS +@Q8 Q9 Q10 Q11 USE @WILL + + vshr.s16 q1, q12, #1 @ D_SHIFT(b0,1,shft) + vshr.s16 q3, q13, #1 @ D_SHIFT(b1,1,shft) + vshr.s16 q5, q14, #1 @ D_SHIFT(b2,1,shft) + vshr.s16 q7, q15, #1 @ D_SHIFT(b3,1,shft) + + vadd.s16 q8, q1, q12 @ (D_SHIFT(b0,1,shft) + b0); + vadd.s16 q9, q3, q13 @ (D_SHIFT(b1,1,shft) + b1); + vadd.s16 q10, q5, q14 @ (D_SHIFT(b2,1,shft) + b2); + vadd.s16 q11, q7, q15 @ (D_SHIFT(b3,1,shft) + b3); + + vadd.s16 q1, q14, q8 @ b2 + (D_SHIFT(b0,1,shft) + b0); + vsub.s16 q5, q15, q9 @ b3 - (D_SHIFT(b1,1,shft) + b1); + vadd.s16 q3, q15, q10 @ b3 + (D_SHIFT(b2,1,shft) + b2); + vsub.s16 q7, q11, q14 @ -b2 + (D_SHIFT(b3,1,shft) + b3); + + vadd.s16 q8, q13, q1 @ b4 = b1 + b2 + (D_SHIFT(b0,1,shft) + b0); + vsub.s16 q9, q12, q3 @ b5 = b0 - b3 - (D_SHIFT(b2,1,shft) + b2); + vadd.s16 q10, q12, q5 @ b6 = b0 + b3 - (D_SHIFT(b1,1,shft) + b1); + vadd.s16 q11, q13, q7 @ 
b7 = b1 - b2 + (D_SHIFT(b3,1,shft) + b3); + + vshr.s16 q15, q8, #2 @ D_SHIFT(b4,2,shft) + vshr.s16 q14, q9, #2 @ D_SHIFT(b5,2,shft); + vshr.s16 q13, q10, #2 @ D_SHIFT(b6,2,shft); + vshr.s16 q12, q11, #2 @ D_SHIFT(b7,2,shft); + + + vadd.s16 q3, q9, q13 @ pi2_res[3] = b5 + D_SHIFT(b6,2,shft); + vsub.s16 q5, q10, q14 @ pi2_res[5] = b6 - D_SHIFT(b5,2,shft); + vadd.s16 q1, q8, q12 @ pi2_res[1] = b4 + D_SHIFT(b7,2,shft); + vsub.s16 q7, q15, q11 @ pi2_res[7] = D_SHIFT(b4,2,shft) - b7; + + @------------horiz transform done------------------------- + @results are in Q0-Q7 + @all other neon registes can be used at will + +@doing vertical transform +@code exact copy of horiz transform above + + @transpose the inner 2x2 blocks + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + @transpose the inner 4x4 blocks + vtrn.32 q0, q2 + vtrn.32 q1, q3 + + vtrn.32 q4, q6 + vtrn.32 q5, q7 + + @transpose the outer 8x8 blocks + vswp d1, d8 + vswp d3, d10 + vswp d5, d12 + vswp d7, d14 + + @transpose done + + vadd.s16 q8, q0, q7 @ a0 = r0 + r7; + vadd.s16 q9, q1, q6 @ a1 = r1 + r6; + vadd.s16 q10, q2, q5 @ a2 = r2 + r5; + vadd.s16 q11, q3, q4 @ a3 = r3 + r4; + + vsub.s16 q12, q0, q7 @ b0 = r0 - r7; + vsub.s16 q13, q1, q6 @ b1 = r1 - r6; + vsub.s16 q14, q2, q5 @ b2 = r2 - r5; + vsub.s16 q15, q3, q4 @ b3 = r3 - r4; + + vadd.s16 q1, q8, q11 @ a4 = a0 + a3; + vadd.s16 q3, q9, q10 @ a5 = a1 + a2; + vsub.s16 q5, q8, q11 @ a6 = a0 - a3; + vsub.s16 q7, q9, q10 @ a7 = a1 - a2; + + + vadd.s16 q0, q1, q3 @ pi2_res[0] = a4 + a5; + + vshr.s16 q8, q7, #1 @ pi2_res[2] = a6 + D_SHIFT(a7,1,shft); + @DSHIFT_TO_0 Q8,Q7,#1,#0 + vadd.s16 q2, q5, q8 @ + + vsub.s16 q4, q1, q3 @ pi2_res[4] = a4 - a5; + + vshr.s16 q9, q5, #1 @ pi2_res[6] = D_SHIFT(a6,1,shft) - a7; + vsub.s16 q6, q9, q7 @ + +@do not change Q0,Q2.Q4,Q6 they contain results +@Q1,Q3,Q5,Q7 TO STORE RESULTS +@Q8 Q9 Q10 Q11 USE @WILL + + vshr.s16 q1, q12, #1 @ D_SHIFT(b0,1,shft) + vshr.s16 q3, q13, #1 @ D_SHIFT(b1,1,shft) + vshr.s16 q5, q14, #1 @ D_SHIFT(b2,1,shft) + vshr.s16 q7, q15, #1 @ D_SHIFT(b3,1,shft) + + + vadd.s16 q8, q1, q12 @ (D_SHIFT(b0,1,shft) + b0); + vadd.s16 q9, q3, q13 @ (D_SHIFT(b1,1,shft) + b1); + vadd.s16 q10, q5, q14 @ (D_SHIFT(b2,1,shft) + b2); + vadd.s16 q11, q7, q15 @ (D_SHIFT(b3,1,shft) + b3); + + vadd.s16 q1, q14, q8 @ b2 + (D_SHIFT(b0,1,shft) + b0); + vadd.s16 q3, q15, q10 @ b3 + (D_SHIFT(b2,1,shft) + b2); + vsub.s16 q5, q15, q9 @ b3 - (D_SHIFT(b1,1,shft) + b1); + vsub.s16 q7, q11, q14 @ -b2 + (D_SHIFT(b3,1,shft) + b3); + + vadd.s16 q8, q13, q1 @ b4 = b1 + b2 + (D_SHIFT(b0,1,shft) + b0); + vsub.s16 q9, q12, q3 @ b5 = b0 - b3 - (D_SHIFT(b2,1,shft) + b2); + vadd.s16 q10, q12, q5 @ b6 = b0 + b3 - (D_SHIFT(b1,1,shft) + b1); + vadd.s16 q11, q13, q7 @ b7 = b1 - b2 + (D_SHIFT(b3,1,shft) + b3); + + vshr.s16 q15, q8, #2 @ D_SHIFT(b4,2,shft) + vshr.s16 q14, q9, #2 @ D_SHIFT(b5,2,shft); + vshr.s16 q13, q10, #2 @ D_SHIFT(b6,2,shft); + vshr.s16 q12, q11, #2 @ D_SHIFT(b7,2,shft); + + +@since we are going to scal by small values, we need not expand the guys to 32 bit bit values + vsub.s16 q5, q10, q14 @ pi2_res[5] = b6 - D_SHIFT(b5,2,shft); + vsub.s16 q7, q15, q11 @ pi2_res[7] = D_SHIFT(b4,2,shft) - b7; + vadd.s16 q3, q9, q13 @ pi2_res[3] = b5 + D_SHIFT(b6,2,shft); + vadd.s16 q1, q8, q12 @ pi2_res[1] = b4 + D_SHIFT(b7,2,shft); + + @------------vert transform done------------------------- + @results are in Q0-Q7 + @all other neon registes can be used at will + + @scaling + @since the 8x8 scaling matrix repeats in 1x4,1x4 block , + @we need only load 
4 values for each row and in total 4 rows + vld1.s16 {q14-q15}, [r6] @ + + @since we need to get a 32 bit o/p for two 16 bit multiplications + @we need a VMULL instruction +@-----------------------------first and second row + + vmull.s16 q8, d0, d28 @scale the first row first 4 elem + vmull.s16 q9, d28, d1 @scale the second row last 4 elemts + + vmull.s16 q10, d2, d29 @ scale second row first 4 elem + vmull.s16 q11, d29, d3 @scale the second row last 4 elem + vmull.s16 q12, d4, d30 @scale third row first 4 elem + + vst1.s32 {q8, q9}, [r2], r5 @ write the first row complete + + vmull.s16 q13, d30, d5 @scale the third row last 4 elem + vmull.s16 q8, d6, d31 @scale the fourth row first 4 elem + + + vst1.s32 {q10, q11}, [r2], r5 @store the second row complete + +@------------------------------- 3rd and 4th row + + vmull.s16 q9, d31, d7 @scale the fourth row second column + + vst1.s32 {q12, q13}, [r2], r5 @store the third row complete + + vmull.s16 q10, d8, d28 @scale the 5th row fisrst 4 elms + vmull.s16 q11, d28, d9 @scale the 5th row second 4 elems + + vmull.s16 q12, d10, d29 @scale the 6th row first4 elements + + + vst1.s32 {q8, q9}, [r2], r5 @store fifth row + +@--------------------------------5th and 6th row + + vmull.s16 q13, d29, d11 @scale 6th row sendond 4 elems + + vmull.s16 q8, d12, d30 @scale 7th rw first 4 elms + + vst1.s32 {q10, q11}, [r2], r5 @store 6th row second 4 elements + + vmull.s16 q9, d30, d13 @scale 7th rw second 4 elms + vmull.s16 q10, d14, d31 @scale 8th rw forst 4 elms + + + vst1.s32 {q12, q13}, [r2], r5 @store 6th row + +@----------------------------------7th and 8th row + vmull.s16 q11, d31, d15 @scale 8th row second 4 elms + + vst1.s32 {q8, q9}, [r2], r5 @store 7th row + vst1.s32 {q10, q11}, [r2], r5 @store 8th row + +@----------------------------------done writing + + pop {r4-r12, pc} @pop back all variables + + + + + + diff --git a/common/arm/ih264_resi_trans_quant_a9.s b/common/arm/ih264_resi_trans_quant_a9.s new file mode 100755 index 0000000..caf362e --- /dev/null +++ b/common/arm/ih264_resi_trans_quant_a9.s @@ -0,0 +1,694 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +@*/ +@/** +@******************************************************************************* +@* @file +@* ih264_resi_trans_quant_a9.s +@* +@* @brief +@* Contains function definitions for residual and forward trans +@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* ih264_resi_trans_quant_4x4_a9 +@* ih264_resi_trans_quant_8x8_a9 +@* ih264_resi_trans_quant_chroma_4x4_a9 +@* ih264_hadamard_quant_4x4_a9 +@* ih264_hadamard_quant_2x2_uv_a9 +@* +@* @remarks +@* None +@* +@******************************************************************************* + + +.text +.p2align 2 +@***************************************************************************** +@* +@* Function Name : ih264_resi_trans_quant_4x4_a9 +@* Description : This function does cf4 of H264 +@* +@* Arguments : R0 :pointer to src buffer +@ R1 :pointer to pred buffer +@ R2 :pointer to dst buffer +@ R3 :source stride +@ STACK : pred stride, +@ dst stride, +@ pointer to scaling matrix, +@ pointer to threshold matrix, +@ qbits, +@ rounding factor, +@ pointer to store nnz +@ pointer to store non quantized dc value +@ Values Returned : NONE +@ +@ Register Usage : +@ Stack Usage : 40 bytes +@ Cycles : Around +@ Interruptiaility : Interruptable +@ +@ Known Limitations +@ \Assumptions : +@ +@ Revision History : +@ DD MM YYYY Author(s) Changes +@ 1 12 2013 100633 First version +@ 20 1 2014 100633 Changes the API, Optimization +@ +@***************************************************************************** + + .global ih264_resi_trans_quant_4x4_a9 +ih264_resi_trans_quant_4x4_a9: + + @R0 :pointer to src buffer + @R1 :pointer to pred buffer + @R2 :pointer to dst buffer + @R3 :Source stride + @STACk :pred stride + @ :scale matirx, + @ :threshold matrix + @ :qbits + @ :round factor + @ :nnz + + push {r4-r12, lr} @push all the variables first + + add r11, sp, #40 @decrement stack pointer,to accomodate two variables + ldmfd r11, {r4-r10} @load the strides into registers + + @R0 :pointer to src buffer + @R1 :pointer to pred buffer + @R2 :pointer to dst buffer + @R3 :Source stride + @R4 :Pred stride + @R5 :scale matirx, + @R6 :threshold matrix + @R7 :qbits + @R8 :round factor + @R9 :nnz + + vpush {d8-d15} + + mov r11, #0 + sub r7, r11, r7 @Negate the qbit value for usiing LSL + + @------------Fucntion Loading done----------------; + + vld1.u8 d30, [r0], r3 @load first 8 pix src row 1 + + vld1.u8 d31, [r1], r4 @load first 8 pix pred row 1 + + vld1.u8 d28, [r0], r3 @load first 8 pix src row 2 + + vld1.u8 d29, [r1], r4 @load first 8 pix pred row 2 + + vld1.u8 d26, [r0], r3 @load first 8 pix src row 3 + + vld1.u8 d27, [r1], r4 @load first 8 pix pred row 3 + vsubl.u8 q0, d30, d31 @find residue row 1 + + vld1.u8 d24, [r0], r3 @load first 8 pix src row 4 + + vld1.u8 d25, [r1], r4 @load first 8 pix pred row 4 + vsubl.u8 q1, d28, d29 @find residue row 2 + + vsubl.u8 q2, d26, d27 @find residue row 3 + vsubl.u8 q3, d24, d25 @find residue row 4 + + vtrn.16 d0, d2 @T12 + vtrn.16 d4, d6 @T23 + vtrn.32 d0, d4 @T13 + vtrn.32 d2, d6 @T14 + + vadd.s16 d8 , d0, d6 @x0 = x4+x7 + vadd.s16 d9 , d2, d4 @x1 = x5+x6 + vsub.s16 d10, d2, d4 @x2 = x5-x6 + vsub.s16 d11, d0, d6 @x3 = x4-x7 + + vshl.s16 d12, d10, #1 @U_SHIFT(x2,1,shft) + vshl.s16 d13, d11, #1 @U_SHIFT(x3,1,shft) + + vadd.s16 d14, d8, d9 @x4 = x0 + x1; + vsub.s16 d16, d8, d9 @x6 = x0 - x1; + vadd.s16 d15, d13, d10 @x5 = U_SHIFT(x3,1,shft) + x2; + vsub.s16 d17, d11, d12 @x7 = x3 - U_SHIFT(x2,1,shft); + + @taking transpose again so as to make do vert transform + vtrn.16 d14, d15 @T12 + 
vtrn.16 d16, d17 @T23 + vtrn.32 d14, d16 @T13 + vtrn.32 d15, d17 @T24 + + @let us do vertical transform + @same code as horiz + vadd.s16 d18, d14, d17 @x0 = x4+x7 + vadd.s16 d19, d15, d16 @x1 = x5+x6 + vsub.s16 d20, d15, d16 @x2 = x5-x6 + vsub.s16 d21, d14, d17 @x3 = x4-x7 + + vshl.s16 d22, d20, #1 @U_SHIFT(x2,1,shft) + vshl.s16 d23, d21, #1 @U_SHIFT(x3,1,shft) + + vdup.s32 q4, r8 @Load rounding value row 1 + + vadd.s16 d24, d18, d19 @x5 = x0 + x1; + vsub.s16 d26, d18, d19 @x7 = x0 - x1; + vadd.s16 d25, d23, d20 @x6 = U_SHIFT(x3,1,shft) + x2; + vsub.s16 d27, d21, d22 @x8 = x3 - U_SHIFT(x2,1,shft); + vdup.s32 q10, r7 @Load qbit values + + vst1.s16 d24[0], [r10] @Store the dc value to alternate dc sddress + +@core tranform is done for 4x8 block 1 + vld1.s16 {q14-q15}, [r5] @load the scaling values + + vabs.s16 q0, q12 @Abs val of row 1 blk 1 + + vabs.s16 q1, q13 @Abs val of row 2 blk 1 + + vmov.s32 q5, q4 @copy round fact for row 2 + + vmov.s32 q6, q4 @copy round fact for row 2 + vclt.s16 q2, q12, #0 @Get the sign of row 1 blk 1 + + vmov.s32 q7, q4 @copy round fact for row 2 + vclt.s16 q3, q13, #0 @Get the sign of row 2 blk 1 + + vmlal.s16 q4, d0, d28 @Multiply and add row 1 + vmlal.s16 q5, d1, d29 @Multiply and add row 2 + vmlal.s16 q6, d2, d30 @Multiply and add row 3 + vmlal.s16 q7, d3, d31 @Multiply and add row 4 + + vshl.s32 q11, q4, q10 @Shift row 1 + vshl.s32 q12, q5, q10 @Shift row 2 + vshl.s32 q13, q6, q10 @Shift row 3 + vshl.s32 q14, q7, q10 @Shift row 4 + + vmovn.s32 d30, q11 @Narrow row 1 + vmovn.s32 d31, q12 @Narrow row 2 + vmovn.s32 d0 , q13 @Narrow row 3 + vmovn.s32 d1 , q14 @Narrow row 4 + + vneg.s16 q1, q15 @Get negative + vneg.s16 q4, q0 @Get negative + + vceq.s16 q5, q15, #0 @I compare with zero row 1 and 2 blk 1 + vceq.s16 q6, q0 , #0 @I compare with zero row 1 and 2 blk 1 + + vbsl.s16 q2, q1, q15 @Restore sign of row 1 and 2 + vbsl.s16 q3, q4, q0 @Restore sign of row 3 and 4 + + + vmovn.u16 d14, q5 @I Narrow the comparison for row 1 and 2 blk 1 + vmovn.u16 d15, q6 @I Narrow the comparison for row 1 and 2 blk 2 + + vshr.u8 q8, q7, #7 @I Reduce comaparison bit to a signle bit row 1 and 2 blk 1 and 2 [ keep the value for later use ] + + vpadd.u8 d18, d16, d17 @I pair add nnz 1 + vpadd.u8 d20, d18, d19 @I Pair add nnz 2 + vpadd.u8 d22, d20, d21 @I Pair add nnz 3 + vpadd.u8 d24, d22, d23 @I Pair add nnz4 + vst1.s16 {q2-q3}, [r2] @Store blk + + vmov.u8 d25, #16 @I Get max nnz + vsub.u8 d26, d25, d24 @I invert current nnz + + vst1.u8 d26[0], [r9] @I Write nnz + + vpop {d8-d15} + pop {r4-r12, pc} + + + +@***************************************************************************** +@* +@* Function Name : ih264_resi_trans_quant_chroma_4x4_a9 +@* Description : This function does residue calculation, forward transform +@* and quantization for 4x4 chroma block. 
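+@*
+@* The source and prediction buffers hold interleaved UV samples, so the
+@* vld2 loads below de-interleave one plane before the residue is taken.
+@* As a C reference sketch of the residue step alone (illustrative only;
+@* strides in bytes, one interleaved plane per call):
+@*
+@* for(i = 0; i < 4; i++)
+@*     for(j = 0; j < 4; j++)
+@*         pi2_res[4 * i + j] = (WORD16)pu1_src [i * src_strd  + 2 * j]
+@*                            - (WORD16)pu1_pred[i * pred_strd + 2 * j];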
+@*
+@* Arguments    :   R0 :pointer to src buffer
+@                   R1 :pointer to pred buffer
+@                   R2 :pointer to dst buffer
+@                   R3 :source stride
+@                   STACK : pred stride,
+@                           dst stride,
+@                           pointer to scaling matrix,
+@                           pointer to threshold matrix,
+@                           qbits,
+@                           rounding factor,
+@                           pointer to store nnz
+@                           pointer to store unquantized dc values
+@ Values Returned : NONE
+@
+@ Register Usage :
+@ Stack Usage : 40 bytes
+@ Cycles : Around
+@ Interruptibility : Interruptible
+@
+@ Known Limitations
+@ \Assumptions :
+@
+@ Revision History :
+@ DD MM YYYY Author(s) Changes
+@ 11 2 2015 100664 First version
+@
+@*****************************************************************************
+
+    .global ih264_resi_trans_quant_chroma_4x4_a9
+ih264_resi_trans_quant_chroma_4x4_a9:
+
+    @R0 :pointer to src buffer
+    @R1 :pointer to pred buffer
+    @R2 :pointer to dst buffer
+    @R3 :Source stride
+    @STACK :pred stride
+    @      :scale matrix,
+    @      :threshold matrix
+    @      :qbits
+    @      :round factor
+    @      :nnz
+    @      :pu1_dc_alt_addr
+    push {r4-r12, lr}            @push all the variables first
+
+    add r11, sp, #40             @r11 points to the stack arguments
+    ldmfd r11, {r4-r10}          @load the stack arguments into registers
+
+    @R0 :pointer to src buffer
+    @R1 :pointer to pred buffer
+    @R2 :pointer to dst buffer
+    @R3 :Source stride
+    @R4 :Pred stride
+    @R5 :scale matrix,
+    @R6 :threshold matrix
+    @R7 :qbits
+    @R8 :round factor
+    @R9 :nnz
+    vpush {d8-d15}
+    mov r11, #0
+    sub r7, r11, r7              @Negate the qbit value for using LSL
+
+    @------------Function loading done----------------;
+
+    vld2.u8 {d10, d11}, [r0], r3 @load first 8 pix src row 1
+
+    vld2.u8 {d11, d12}, [r1], r4 @load first 8 pix pred row 1
+
+    vld2.u8 {d28, d29}, [r0], r3 @load first 8 pix src row 2
+
+    vld2.u8 {d29, d30}, [r1], r4 @load first 8 pix pred row 2
+
+    vld2.u8 {d25, d26}, [r0], r3 @load first 8 pix src row 3
+
+    vld2.u8 {d26, d27}, [r1], r4 @load first 8 pix pred row 3
+    vsubl.u8 q0, d10, d11        @find residue row 1
+
+    vld2.u8 {d22, d23}, [r0], r3 @load first 8 pix src row 4
+
+    vld2.u8 {d23, d24}, [r1], r4 @load first 8 pix pred row 4
+    vsubl.u8 q1, d28, d29        @find residue row 2
+
+    vsubl.u8 q2, d25, d26        @find residue row 3
+    vsubl.u8 q3, d22, d23        @find residue row 4
+
+    vtrn.16 d0, d2               @T12
+    vtrn.16 d4, d6               @T23
+    vtrn.32 d0, d4               @T13
+    vtrn.32 d2, d6               @T14
+
+    vadd.s16 d8 , d0, d6         @x0 = x4+x7
+    vadd.s16 d9 , d2, d4         @x1 = x5+x6
+    vsub.s16 d10, d2, d4         @x2 = x5-x6
+    vsub.s16 d11, d0, d6         @x3 = x4-x7
+
+    vshl.s16 d12, d10, #1        @U_SHIFT(x2,1,shft)
+    vshl.s16 d13, d11, #1        @U_SHIFT(x3,1,shft)
+
+    vadd.s16 d14, d8, d9         @x4 = x0 + x1;
+    vsub.s16 d16, d8, d9         @x6 = x0 - x1;
+    vadd.s16 d15, d13, d10       @x5 = U_SHIFT(x3,1,shft) + x2;
+    vsub.s16 d17, d11, d12       @x7 = x3 - U_SHIFT(x2,1,shft);
+
+    @taking transpose again so as to do the vertical transform
+    vtrn.16 d14, d15             @T12
+    vtrn.16 d16, d17             @T23
+    vtrn.32 d14, d16             @T13
+    vtrn.32 d15, d17             @T24
+
+    @let us do the vertical transform
+    @same code as horiz
+    vadd.s16 d18, d14, d17       @x0 = x4+x7
+    vadd.s16 d19, d15, d16       @x1 = x5+x6
+    vsub.s16 d20, d15, d16       @x2 = x5-x6
+    vsub.s16 d21, d14, d17       @x3 = x4-x7
+
+    vshl.s16 d22, d20, #1        @U_SHIFT(x2,1,shft)
+    vshl.s16 d23, d21, #1        @U_SHIFT(x3,1,shft)
+
+    vdup.s32 q4, r8              @Load rounding value row 1
+
+    vadd.s16 d24, d18, d19       @x5 = x0 + x1;
+    vsub.s16 d26, d18, d19       @x7 = x0 - x1;
+    vadd.s16 d25, d23, d20       @x6 = U_SHIFT(x3,1,shft) + x2;
+    vsub.s16 d27, d21, d22       @x8 = x3 - U_SHIFT(x2,1,shft);
+    vdup.s32 q10, r7             @Load qbit values
+
+    vst1.s16 d24[0], [r10]       @Store unquantized dc value to dc alternate address
+
+@core transform is done for 4x8 block 
1 + vld1.s16 {q14-q15}, [r5] @load the scaling values + + vabs.s16 q0, q12 @Abs val of row 1 blk 1 + + vabs.s16 q1, q13 @Abs val of row 2 blk 1 + + vmov.s32 q5, q4 @copy round fact for row 2 + + vmov.s32 q6, q4 @copy round fact for row 2 + vclt.s16 q2, q12, #0 @Get the sign of row 1 blk 1 + + vmov.s32 q7, q4 @copy round fact for row 2 + vclt.s16 q3, q13, #0 @Get the sign of row 2 blk 1 + + vmlal.s16 q4, d0, d28 @Multiply and add row 1 + vmlal.s16 q5, d1, d29 @Multiply and add row 2 + vmlal.s16 q6, d2, d30 @Multiply and add row 3 + vmlal.s16 q7, d3, d31 @Multiply and add row 4 + + vshl.s32 q11, q4, q10 @Shift row 1 + vshl.s32 q12, q5, q10 @Shift row 2 + vshl.s32 q13, q6, q10 @Shift row 3 + vshl.s32 q14, q7, q10 @Shift row 4 + + vmovn.s32 d30, q11 @Narrow row 1 + vmovn.s32 d31, q12 @Narrow row 2 + vmovn.s32 d0 , q13 @Narrow row 3 + vmovn.s32 d1 , q14 @Narrow row 4 + + vneg.s16 q1, q15 @Get negative + vneg.s16 q4, q0 @Get negative + + vceq.s16 q5, q15, #0 @I compare with zero row 1 and 2 blk 1 + vceq.s16 q6, q0 , #0 @I compare with zero row 1 and 2 blk 1 + + vbsl.s16 q2, q1, q15 @Restore sign of row 1 and 2 + vbsl.s16 q3, q4, q0 @Restore sign of row 3 and 4 + + vmovn.u16 d14, q5 @I Narrow the comparison for row 1 and 2 blk 1 + vmovn.u16 d15, q6 @I Narrow the comparison for row 1 and 2 blk 2 + + vshr.u8 q8, q7, #7 @I Reduce comaparison bit to a signle bit row 1 and 2 blk 1 and 2 [ keep the value for later use ] + + vpadd.u8 d18, d16, d17 @I pair add nnz 1 + vpadd.u8 d20, d18, d19 @I Pair add nnz 2 + vpadd.u8 d22, d20, d21 @I Pair add nnz 3 + vpadd.u8 d24, d22, d23 @I Pair add nnz4 + vst1.s16 {q2-q3}, [r2] @Store blk + + vmov.u8 d25, #16 @I Get max nnz + vsub.u8 d26, d25, d24 @I invert current nnz + + vst1.u8 d26[0], [r9] @I Write nnz + + vpop {d8-d15} + pop {r4-r12, pc} + + + +@***************************************************************************** +@* +@* Function Name : ih264_hadamard_quant_4x4_a9 +@* Description : This function does forward hadamard transform and +@* quantization for luma dc block +@* +@* Arguments : R0 :pointer to src buffer +@ R1 :pointer to dst buffer +@ R2 :pu2_scale_matrix +@ R2 :pu2_threshold_matrix +@ STACk : u4_qbits +@ u4_round_factor +@ pu1_nnz +@ Values Returned : NONE +@ +@ Register Usage : +@ Stack Usage : 0 bytes +@ Cycles : Around +@ Interruptiaility : Interruptable +@ +@ Known Limitations +@ \Assumptions : +@ +@ Revision History : +@ DD MM YYYY Author(s) Changes +@ 20 2 2015 100633 First version +@ +@***************************************************************************** +@ih264_hadamard_quant_4x4_a9(WORD16 *pi2_src, WORD16 *pi2_dst, +@ const UWORD16 *pu2_scale_matrix, +@ const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits, +@ UWORD32 u4_round_factor,UWORD8 *pu1_nnz +@ ) + .global ih264_hadamard_quant_4x4_a9 +ih264_hadamard_quant_4x4_a9: + +@Registert usage +@ r0 : src +@ r1 : dst +@ r2 : *pu2_scale_matrix +@ r3 : *pu2_threshold_matrix + + vld4.s16 {d0, d1, d2, d3}, [r0]! 
@Load 4x4 block + vpush {d8-d15} + + vld1.u16 d30[0], [r2] @load pu2_scale_matrix[0] + + vaddl.s16 q3, d0, d3 @x0 = x4 + x7; + vaddl.s16 q4, d1, d2 @x1 = x5 + x6; + vsubl.s16 q5, d1, d2 @x2 = x5 - x6; + vsubl.s16 q6, d0, d3 @x3 = x4 - x7; + + vdup.u16 d30, d30[0] @pu2_scale_matrix[0] + + vadd.s32 q7, q3, q4 @pi2_dst[0] = x0 + x1; + vadd.s32 q8, q6, q5 @pi2_dst[1] = x3 + x2; + add r3, sp, #68 @Get address of u4_round_factor + vsub.s32 q9, q3, q4 @pi2_dst[2] = x0 - x1; + vsub.s32 q10, q6, q5 @pi2_dst[3] = x3 - x2; + + vtrn.s32 q7, q8 @transpose 4x4 block + vtrn.s32 q9, q10 + vld1.s32 d0[0], [r3] @load u4_round_factor + vswp d15, d18 + vswp d17, d20 + + add r3, sp, #64 @Get address of u4_qbits + vadd.s32 q11, q7, q10 @x0 = x4 + x7; + vadd.s32 q12, q8, q9 @x1 = x5 + x6; + vld1.s32 d31[0], [r3] @load u4_qbits + vsub.s32 q13, q8, q9 @x2 = x5 - x6; + vsub.s32 q14, q7, q10 @x3 = x4 - x7; + + vdup.s32 q7, d0[0] @u4_round_factor + + vadd.s32 q0, q11, q12 @(x0 + x1) + vadd.s32 q1, q14, q13 @(x3 + x2) + vsub.s32 q2, q11, q12 @(x0 - x1) + vsub.s32 q3, q14, q13 @(x3 - x2) + + vdup.s32 q11, d31[0] @u4_round_factor + + vshrn.s32 d0, q0, #1 @i4_value = (x0 + x1) >> 1; + vshrn.s32 d1, q1, #1 @i4_value = (x3 + x2) >> 1; + vshrn.s32 d2, q2, #1 @i4_value = (x0 - x1) >> 1; + vshrn.s32 d3, q3, #1 @i4_value = (x3 - x2) >> 1; + + vabs.s16 q5, q0 + vabs.s16 q6, q1 + + vmov.s32 q8, q7 @Get the round fact + vmov.s32 q9, q7 + vmov.s32 q10, q7 + + vclt.s16 q3, q0, #0 @get the sign row 1,2 + vclt.s16 q4, q1, #0 + + vneg.s32 q11, q11 @-u4_round_factor + + vmlal.u16 q7, d10, d30 + vmlal.u16 q8, d11, d30 + vmlal.u16 q9, d12, d30 + vmlal.u16 q10, d13, d30 + + vshl.u32 q7, q7, q11 + vshl.u32 q8, q8, q11 + vshl.u32 q9, q9, q11 + vshl.u32 q10, q10, q11 + + vqmovn.u32 d22, q7 + vqmovn.u32 d23, q8 + vqmovn.u32 d24, q9 + vqmovn.u32 d25, q10 + + vneg.s16 q13, q11 + vneg.s16 q14, q12 + + vbsl.s16 q3, q13, q11 + vbsl.s16 q4, q14, q12 + + vceq.s16 q5, q11, #0 + vceq.s16 q6, q12, #0 + + vst1.s16 {q3}, [r1]! + + vshrn.u16 d14, q5, #8 + vshrn.u16 d15, q6, #8 + + ldr r3, [sp, #72] @Load *pu1_nnz + + vshr.u8 q7, q7, #7 + + vst1.s16 {q4}, [r1]! 
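+    @ The remaining instructions count the non-zero coefficients: each vceq
+    @ lane above is all ones for a zero coefficient, vshrn/vshr.u8 reduce
+    @ that to one bit per coefficient, and the vpadd tree below sums those
+    @ bits. C reference sketch (illustrative only):
+    @
+    @ num_zeros = 0;
+    @ for(i = 0; i < 16; i++)
+    @     num_zeros += (pi2_dst[i] == 0);    /* vceq + shifts + vpadd tree */
+    @ *pu1_nnz = (UWORD8)(16 - num_zeros);   /* vsub from 16, then store   */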
+ + vadd.u8 d16, d14, d15 + vmov.u8 d20, #16 + vpadd.u8 d17, d16, d16 + vpadd.u8 d18, d17, d17 + vpadd.u8 d19, d18, d18 + vsub.u8 d20, d20, d19 + vst1.u8 d20[0], [r3] + + vpop {d8-d15} + bx lr + + + + +@***************************************************************************** +@* +@* Function Name : ih264_hadamard_quant_2x2_uv_a9 +@* Description : This function does forward hadamard transform and +@* quantization for dc block of chroma for both planes +@* +@* Arguments : R0 :pointer to src buffer +@ R1 :pointer to dst buffer +@ R2 :pu2_scale_matrix +@ R2 :pu2_threshold_matrix +@ STACk : u4_qbits +@ u4_round_factor +@ pu1_nnz +@ Values Returned : NONE +@ +@ Register Usage : +@ Stack Usage : 0 bytes +@ Cycles : Around +@ Interruptiaility : Interruptable +@ +@ Known Limitations +@ \Assumptions : +@ +@ Revision History : +@ DD MM YYYY Author(s) Changes +@ 20 2 2015 100633 First version +@ +@***************************************************************************** +@ ih264_hadamard_quant_2x2_uv_a9(WORD16 *pi2_src, WORD16 *pi2_dst, +@ const UWORD16 *pu2_scale_matrix, +@ const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits, +@ UWORD32 u4_round_factor,UWORD8 *pu1_nnz +@ ) + + .global ih264_hadamard_quant_2x2_uv_a9 +ih264_hadamard_quant_2x2_uv_a9: + + vpush {d8-d15} + vld2.s16 {d0-d1}, [r0] @load src + + add r3, sp, #68 @Get address of u4_round_factor + + vaddl.s16 q3, d0, d1 @x0 = x4 + x5;, x2 = x6 + x7; + vld1.u16 d30[0], [r2] @load pu2_scale_matrix[0] + vsubl.s16 q4, d0, d1 @x1 = x4 - x5; x3 = x6 - x7; + + add r0, sp, #64 @Get affress of u4_qbits + vld1.s32 d28[0], [r3] @load u4_round_factor + vtrn.s32 q3, q4 @q1 -> x0 x1, q2 -> x2 x3 + + vadd.s32 q0, q3, q4 @ (x0 + x2) (x1 + x3) (y0 + y2); (y1 + y3); + vld1.s32 d24[0], [r0] @load u4_qbits + vsub.s32 q1, q3, q4 @ (x0 - x2) (x1 - x3) (y0 - y2); (y1 - y3); + + vdup.u16 d30, d30[0] @pu2_scale_matrix + + vabs.s32 q2, q0 + vabs.s32 q3, q1 + + vdup.s32 q14, d28[0] @u4_round_factor + + vmovl.u16 q15, d30 @pu2_scale_matrix + + vclt.s32 q4, q0, #0 @get the sign row 1,2 + vdup.s32 q12, d24[0] @u4_round_factor + vclt.s32 q5, q1, #0 + + vqmovn.u32 d8, q4 + vqmovn.s32 d9, q5 + + vmov.s32 q13, q14 @Get the round fact + vneg.s32 q12, q12 @-u4_round_factor + + vmla.u32 q13, q2, q15 + vmla.u32 q14, q3, q15 + + vshl.u32 q13, q13, q12 @>>qbit + vshl.u32 q14, q14, q12 @>>qbit + + vqmovn.u32 d10, q13 + vqmovn.u32 d11, q14 + + vneg.s16 q6, q5 + + vbsl.s16 q4, q6, q5 @*sign + + vtrn.s32 d8, d9 + + vceq.s16 q7, q4, #0 @Compute nnz + + vshrn.u16 d14, q7, #8 @reduce nnz comparison to 1 bit + + ldr r3, [sp, #72] @Load *pu1_nnz + vshr.u8 d14, d14, #7 @reduce nnz comparison to 1 bit + vmov.u8 d20, #4 @Since we add zeros, we need to subtract from 4 to get nnz + vpadd.u8 d17, d14, d14 @Sum up nnz + + vst1.s16 {q4}, [r1]! @Store the block + + vpadd.u8 d17, d17, d17 @Sum up nnz + vsub.u8 d20, d20, d17 @4- numzeros + vst1.u16 d20[0], [r3] @store nnz + + vpop {d8-d15} + bx lr + + + + + diff --git a/common/arm/ih264_weighted_bi_pred_a9q.s b/common/arm/ih264_weighted_bi_pred_a9q.s new file mode 100755 index 0000000..ccae779 --- /dev/null +++ b/common/arm/ih264_weighted_bi_pred_a9q.s @@ -0,0 +1,642 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. 
+@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_weighted_bi_pred_a9q.s +@* +@* @brief +@* Contains function definitions for weighted biprediction. +@* +@* @author +@* Kaushik Senthoor R +@* +@* @par List of Functions: +@* +@* - ih264_weighted_bi_pred_luma_a9q() +@* - ih264_weighted_bi_pred_chroma_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@******************************************************************************* +@* @function +@* ih264_weighted_bi_pred_luma_a9q() +@* +@* @brief +@* This routine performs the weighted biprediction as described in sec +@* 8.4.2.3.2 titled "Weighted sample prediction process" for luma. +@* +@* @par Description: +@* This function gets two ht x wd blocks, calculates the weighted samples, +@* rounds off, adds offset and stores it in the destination block. +@* +@* @param[in] pu1_src1 +@* UWORD8 Pointer to the buffer containing the input block 1. +@* +@* @param[in] pu1_src2 +@* UWORD8 Pointer to the buffer containing the input block 2. +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination where the output block is stored. +@* +@* @param[in] src_strd1 +@* Stride of the input buffer 1 +@* +@* @param[in] src_strd2 +@* Stride of the input buffer 2 +@* +@* @param[in] dst_strd +@* Stride of the destination buffer +@* +@* @param[in] log_wd +@* number of bits to be rounded off +@* +@* @param[in] wt1 +@* weight for the weighted prediction +@* +@* @param[in] wt2 +@* weight for the weighted prediction +@* +@* @param[in] ofst1 +@* offset 1 used after rounding off +@* +@* @param[in] ofst2 +@* offset 2 used after rounding off +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* None +@* +@* @remarks +@* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16). 
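+@*
+@*  As a scalar sketch (illustrative C, not this library's code), each
+@*  output sample is computed as
+@*
+@*    pred = ((src1 * wt1 + src2 * wt2 + (1 << log_wd)) >> (log_wd + 1))
+@*           + ((ofst1 + ofst2 + 1) >> 1);
+@*    dst  = CLIP_U8(pred);  /* CLIP_U8: hypothetical clip to [0, 255] */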
+@*
+@*******************************************************************************
+@*/
+@void ih264_weighted_bi_pred_luma_a9q(UWORD8 *pu1_src1,
+@                                     UWORD8 *pu1_src2,
+@                                     UWORD8 *pu1_dst,
+@                                     WORD32 src_strd1,
+@                                     WORD32 src_strd2,
+@                                     WORD32 dst_strd,
+@                                     WORD32 log_wd,
+@                                     WORD32 wt1,
+@                                     WORD32 wt2,
+@                                     WORD32 ofst1,
+@                                     WORD32 ofst2,
+@                                     WORD32 ht,
+@                                     WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@    r0      => pu1_src1
+@    r1      => pu1_src2
+@    r2      => pu1_dst
+@    r3      => src_strd1
+@    [sp]    => src_strd2 (r4)
+@    [sp+4]  => dst_strd  (r5)
+@    [sp+8]  => log_wd    (r6)
+@    [sp+12] => wt1       (r7)
+@    [sp+16] => wt2       (r8)
+@    [sp+20] => ofst1     (r9)
+@    [sp+24] => ofst2     (r10)
+@    [sp+28] => ht        (r11)
+@    [sp+32] => wd        (r12)
+@
+.text
+.p2align 2
+
+    .global ih264_weighted_bi_pred_luma_a9q
+
+ih264_weighted_bi_pred_luma_a9q:
+
+    stmfd         sp!, {r4-r12, r14}    @stack stores the values of the arguments
+    ldr           r6, [sp, #48]         @Load log_wd in r6
+    ldr           r7, [sp, #52]         @Load wt1 in r7
+    ldr           r8, [sp, #56]         @Load wt2 in r8
+    ldr           r9, [sp, #60]         @Load ofst1 in r9
+
+    add           r6, r6, #1            @r6  = log_wd + 1
+    sxtb          r7, r7                @sign-extend 8-bit wt1 to 32-bit
+    ldr           r4, [sp, #40]         @Load src_strd2 in r4
+    ldr           r5, [sp, #44]         @Load dst_strd in r5
+    sxtb          r9, r9                @sign-extend 8-bit ofst1 to 32-bit
+    rsb           r10, r6, #0           @r10 = -(log_wd + 1)
+    ldr           r11, [sp, #68]        @Load ht in r11
+    ldr           r12, [sp, #72]        @Load wd in r12
+    vdup.16       q0, r10               @Q0  = -(log_wd + 1) (16-bit)
+    add           r9, r9, #1            @r9  = ofst1 + 1
+
+    ldr           r10, [sp, #64]        @Load ofst2 in r10
+    sxtb          r8, r8                @sign-extend 8-bit wt2 to 32-bit
+    cmp           r12, #16              @check if wd is 16
+    vpush         {d8-d15}
+    sxtb          r10, r10              @sign-extend 8-bit ofst2 to 32-bit
+    add           r9, r9, r10           @r9  = ofst1 + ofst2 + 1
+    vmov          d2, r7, r8            @D2  = {wt1(32-bit), wt2(32-bit)}
+    asr           r9, r9, #1            @r9  = ofst = (ofst1 + ofst2 + 1) >> 1
+    vdup.8        d3, r9                @D3  = ofst (8-bit)
+    beq           loop_16               @branch if wd is 16
+
+    cmp           r12, #8               @check if wd is 8
+    beq           loop_8                @branch if wd is 8
+
+loop_4:                                 @each iteration processes four rows
+
+    vld1.32       d4[0], [r0], r3       @load row 1 in source 1
+    vld1.32       d4[1], [r0], r3       @load row 2 in source 1
+    vld1.32       d6[0], [r1], r4       @load row 1 in source 2
+    vld1.32       d6[1], [r1], r4       @load row 2 in source 2
+
+    vmovl.u8      q2, d4                @converting rows 1,2 in source 1 to 16-bit
+    vld1.32       d8[0], [r0], r3       @load row 3 in source 1
+    vld1.32       d8[1], [r0], r3       @load row 4 in source 1
+    vmovl.u8      q3, d6                @converting rows 1,2 in source 2 to 16-bit
+    vld1.32       d10[0], [r1], r4      @load row 3 in source 2
+    vld1.32       d10[1], [r1], r4      @load row 4 in source 2
+
+    vmovl.u8      q4, d8                @converting rows 3,4 in source 1 to 16-bit
+    vmovl.u8      q5, d10               @converting rows 3,4 in source 2 to 16-bit
+
+    vmul.s16      q2, q2, d2[0]         @weight 1 mult. for rows 1,2
+    vmla.s16      q2, q3, d2[2]         @weight 2 mult. for rows 1,2
+    vmul.s16      q4, q4, d2[0]         @weight 1 mult. for rows 3,4
+    vmla.s16      q4, q5, d2[2]         @weight 2 mult.
for rows 3,4 + + subs r11, r11, #4 @decrement ht by 4 + vrshl.s16 q2, q2, q0 @rounds off the weighted samples from rows 1,2 + vrshl.s16 q4, q4, q0 @rounds off the weighted samples from rows 3,4 + + vaddw.s8 q2, q2, d3 @adding offset for rows 1,2 + vaddw.s8 q4, q4, d3 @adding offset for rows 3,4 + + vqmovun.s16 d4, q2 @saturating rows 1,2 to unsigned 8-bit + vqmovun.s16 d8, q4 @saturating rows 3,4 to unsigned 8-bit + + vst1.32 d4[0], [r2], r5 @store row 1 in destination + vst1.32 d4[1], [r2], r5 @store row 2 in destination + vst1.32 d8[0], [r2], r5 @store row 3 in destination + vst1.32 d8[1], [r2], r5 @store row 4 in destination + + bgt loop_4 @if greater than 0 repeat the loop again + + b end_loops + +loop_8: @each iteration processes four rows + + vld1.8 d4, [r0], r3 @load row 1 in source 1 + vld1.8 d6, [r1], r4 @load row 1 in source 2 + vld1.8 d8, [r0], r3 @load row 2 in source 1 + vld1.8 d10, [r1], r4 @load row 2 in source 2 + vmovl.u8 q2, d4 @converting row 1 in source 1 to 16-bit + vld1.8 d12, [r0], r3 @load row 3 in source 1 + vld1.8 d14, [r1], r4 @load row 3 in source 2 + vmovl.u8 q3, d6 @converting row 1 in source 2 to 16-bit + vld1.8 d16, [r0], r3 @load row 4 in source 1 + vld1.8 d18, [r1], r4 @load row 4 in source 2 + + vmovl.u8 q4, d8 @converting row 2 in source 1 to 16-bit + vmovl.u8 q5, d10 @converting row 2 in source 2 to 16-bit + + vmul.s16 q2, q2, d2[0] @weight 1 mult. for row 1 + vmla.s16 q2, q3, d2[2] @weight 2 mult. for row 1 + vmovl.u8 q6, d12 @converting row 3 in source 1 to 16-bit + vmovl.u8 q7, d14 @converting row 3 in source 2 to 16-bit + vmul.s16 q4, q4, d2[0] @weight 1 mult. for row 2 + vmla.s16 q4, q5, d2[2] @weight 2 mult. for row 2 + vmovl.u8 q8, d16 @converting row 4 in source 1 to 16-bit + vmovl.u8 q9, d18 @converting row 4 in source 2 to 16-bit + + vmul.s16 q6, q6, d2[0] @weight 1 mult. for row 3 + vmla.s16 q6, q7, d2[2] @weight 2 mult. for row 3 + vmul.s16 q8, q8, d2[0] @weight 1 mult. for row 4 + vmla.s16 q8, q9, d2[2] @weight 2 mult. 
for row 4
+
+    vrshl.s16     q2, q2, q0            @rounds off the weighted samples from row 1
+    vrshl.s16     q4, q4, q0            @rounds off the weighted samples from row 2
+    vrshl.s16     q6, q6, q0            @rounds off the weighted samples from row 3
+    vaddw.s8      q2, q2, d3            @adding offset for row 1
+    vrshl.s16     q8, q8, q0            @rounds off the weighted samples from row 4
+    vaddw.s8      q4, q4, d3            @adding offset for row 2
+
+    vaddw.s8      q6, q6, d3            @adding offset for row 3
+    vqmovun.s16   d4, q2                @saturating row 1 to unsigned 8-bit
+    vaddw.s8      q8, q8, d3            @adding offset for row 4
+    vqmovun.s16   d8, q4                @saturating row 2 to unsigned 8-bit
+
+    vqmovun.s16   d12, q6               @saturating row 3 to unsigned 8-bit
+    vqmovun.s16   d16, q8               @saturating row 4 to unsigned 8-bit
+
+    vst1.8        d4, [r2], r5          @store row 1 in destination
+    vst1.8        d8, [r2], r5          @store row 2 in destination
+    subs          r11, r11, #4          @decrement ht by 4
+    vst1.8        d12, [r2], r5         @store row 3 in destination
+    vst1.8        d16, [r2], r5         @store row 4 in destination
+
+    bgt           loop_8                @if greater than 0 repeat the loop again
+
+    b             end_loops
+
+loop_16:                                @each iteration processes four rows
+
+    vld1.8        {q2}, [r0], r3        @load row 1 in source 1
+    vld1.8        {q3}, [r1], r4        @load row 1 in source 2
+    vld1.8        {q4}, [r0], r3        @load row 2 in source 1
+    vld1.8        {q5}, [r1], r4        @load row 2 in source 2
+    vmovl.u8      q10, d4               @converting row 1L in source 1 to 16-bit
+    vld1.8        {q6}, [r0], r3        @load row 3 in source 1
+    vld1.8        {q7}, [r1], r4        @load row 3 in source 2
+    vmovl.u8      q11, d6               @converting row 1L in source 2 to 16-bit
+    vld1.8        {q8}, [r0], r3        @load row 4 in source 1
+    vld1.8        {q9}, [r1], r4        @load row 4 in source 2
+
+    vmovl.u8      q2, d5                @converting row 1H in source 1 to 16-bit
+    vmovl.u8      q3, d7                @converting row 1H in source 2 to 16-bit
+
+    vmul.s16      q10, q10, d2[0]       @weight 1 mult. for row 1L
+    vmla.s16      q10, q11, d2[2]       @weight 2 mult. for row 1L
+    vmovl.u8      q12, d8               @converting row 2L in source 1 to 16-bit
+    vmovl.u8      q13, d10              @converting row 2L in source 2 to 16-bit
+
+    vmul.s16      q2, q2, d2[0]         @weight 1 mult. for row 1H
+    vmla.s16      q2, q3, d2[2]         @weight 2 mult. for row 1H
+    vmovl.u8      q4, d9                @converting row 2H in source 1 to 16-bit
+    vmovl.u8      q5, d11               @converting row 2H in source 2 to 16-bit
+
+    vmul.s16      q12, q12, d2[0]       @weight 1 mult. for row 2L
+    vmla.s16      q12, q13, d2[2]       @weight 2 mult. for row 2L
+    vmovl.u8      q14, d12              @converting row 3L in source 1 to 16-bit
+    vmovl.u8      q15, d14              @converting row 3L in source 2 to 16-bit
+
+    vmul.s16      q4, q4, d2[0]         @weight 1 mult. for row 2H
+    vmla.s16      q4, q5, d2[2]         @weight 2 mult. for row 2H
+    vmovl.u8      q6, d13               @converting row 3H in source 1 to 16-bit
+    vmovl.u8      q7, d15               @converting row 3H in source 2 to 16-bit
+
+    vmul.s16      q14, q14, d2[0]       @weight 1 mult. for row 3L
+    vmla.s16      q14, q15, d2[2]       @weight 2 mult. for row 3L
+    vmovl.u8      q11, d16              @converting row 4L in source 1 to 16-bit
+    vmovl.u8      q3, d18               @converting row 4L in source 2 to 16-bit
+
+    vmul.s16      q6, q6, d2[0]         @weight 1 mult. for row 3H
+    vmla.s16      q6, q7, d2[2]         @weight 2 mult. for row 3H
+    vmovl.u8      q8, d17               @converting row 4H in source 1 to 16-bit
+    vmovl.u8      q9, d19               @converting row 4H in source 2 to 16-bit
+
+    vmul.s16      q11, q11, d2[0]       @weight 1 mult. for row 4L
+    vmla.s16      q11, q3, d2[2]        @weight 2 mult. for row 4L
+    vrshl.s16     q10, q10, q0          @rounds off the weighted samples from row 1L
+
+    vmul.s16      q8, q8, d2[0]         @weight 1 mult. for row 4H
+    vmla.s16      q8, q9, d2[2]         @weight 2 mult.
for row 4H
+    vrshl.s16     q2, q2, q0            @rounds off the weighted samples from row 1H
+
+    vrshl.s16     q12, q12, q0          @rounds off the weighted samples from row 2L
+    vaddw.s8      q10, q10, d3          @adding offset for row 1L
+    vrshl.s16     q4, q4, q0            @rounds off the weighted samples from row 2H
+    vaddw.s8      q2, q2, d3            @adding offset for row 1H
+    vrshl.s16     q14, q14, q0          @rounds off the weighted samples from row 3L
+    vaddw.s8      q12, q12, d3          @adding offset for row 2L
+    vrshl.s16     q6, q6, q0            @rounds off the weighted samples from row 3H
+    vaddw.s8      q4, q4, d3            @adding offset for row 2H
+    vrshl.s16     q11, q11, q0          @rounds off the weighted samples from row 4L
+    vaddw.s8      q14, q14, d3          @adding offset for row 3L
+    vrshl.s16     q8, q8, q0            @rounds off the weighted samples from row 4H
+    vaddw.s8      q6, q6, d3            @adding offset for row 3H
+
+    vqmovun.s16   d26, q10              @saturating row 1L to unsigned 8-bit
+    vaddw.s8      q11, q11, d3          @adding offset for row 4L
+    vqmovun.s16   d27, q2               @saturating row 1H to unsigned 8-bit
+    vaddw.s8      q8, q8, d3            @adding offset for row 4H
+
+    vqmovun.s16   d10, q12              @saturating row 2L to unsigned 8-bit
+    vqmovun.s16   d11, q4               @saturating row 2H to unsigned 8-bit
+    vqmovun.s16   d30, q14              @saturating row 3L to unsigned 8-bit
+    vqmovun.s16   d31, q6               @saturating row 3H to unsigned 8-bit
+    vst1.8        {q13}, [r2], r5       @store row 1 in destination
+    vqmovun.s16   d14, q11              @saturating row 4L to unsigned 8-bit
+    vqmovun.s16   d15, q8               @saturating row 4H to unsigned 8-bit
+
+    vst1.8        {q5}, [r2], r5        @store row 2 in destination
+    subs          r11, r11, #4          @decrement ht by 4
+    vst1.8        {q15}, [r2], r5       @store row 3 in destination
+    vst1.8        {q7}, [r2], r5        @store row 4 in destination
+
+    bgt           loop_16               @if greater than 0 repeat the loop again
+
+end_loops:
+
+    vpop          {d8-d15}
+    ldmfd         sp!, {r4-r12, r15}    @Reload the registers from sp
+
+
+@*******************************************************************************
+@* @function
+@*  ih264_weighted_bi_pred_chroma_a9q()
+@*
+@* @brief
+@*  This routine performs the weighted biprediction as described in sec
+@* 8.4.2.3.2 titled "Weighted sample prediction process" for chroma.
+@*
+@* @par Description:
+@*  This function gets two ht x wd blocks, calculates the weighted samples,
+@*  rounds off, adds offset and stores it in the destination block for U and V.
+@*
+@* @param[in] pu1_src1
+@*  UWORD8 Pointer to the buffer containing the input block 1.
+@*
+@* @param[in] pu1_src2
+@*  UWORD8 Pointer to the buffer containing the input block 2.
+@*
+@* @param[out] pu1_dst
+@*  UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd1
+@*  Stride of the input buffer 1
+@*
+@* @param[in] src_strd2
+@*  Stride of the input buffer 2
+@*
+@* @param[in] dst_strd
+@*  Stride of the destination buffer
+@*
+@* @param[in] log_wd
+@*  number of bits to be rounded off
+@*
+@* @param[in] wt1
+@*  weights for the weighted prediction in U and V
+@*
+@* @param[in] wt2
+@*  weights for the weighted prediction in U and V
+@*
+@* @param[in] ofst1
+@*  offset 1 used after rounding off for U and V
+@*
+@* @param[in] ofst2
+@*  offset 2 used after rounding off for U and V
+@*
+@* @param[in] ht
+@*  integer height of the array
+@*
+@* @param[in] wd
+@*  integer width of the array
+@*
+@* @returns
+@*  None
+@*
+@* @remarks
+@*  (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
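+@*
+@*  wt1 and wt2 pack the weights as {wt_u (16-bit), wt_v (16-bit)}, and
+@*  ofst1 and ofst2 pack the offsets as {ofst_u (8-bit), ofst_v (8-bit)}.
+@*  A scalar sketch for one interleaved UV pair (illustrative C, not this
+@*  library's code):
+@*
+@*    dst_u = CLIP_U8(((u1 * wt1_u + u2 * wt2_u + (1 << log_wd))
+@*                     >> (log_wd + 1)) + ((ofst1_u + ofst2_u + 1) >> 1));
+@*    dst_v = CLIP_U8(((v1 * wt1_v + v2 * wt2_v + (1 << log_wd))
+@*                     >> (log_wd + 1)) + ((ofst1_v + ofst2_v + 1) >> 1));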
+@* +@******************************************************************************* +@*/ +@void ih264_weighted_bi_pred_chroma_a9q(UWORD8 *pu1_src1, +@ UWORD8 *pu1_src2, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd1, +@ WORD32 src_strd2, +@ WORD32 dst_strd, +@ WORD32 log_wd, +@ WORD32 wt1, +@ WORD32 wt2, +@ WORD32 ofst1, +@ WORD32 ofst2, +@ WORD32 ht, +@ WORD32 wd) +@ +@**************Variables Vs Registers***************************************** +@ r0 => pu1_src1 +@ r1 => pu1_src2 +@ r2 => pu1_dst +@ r3 => src_strd1 +@ [sp] => src_strd2 (r4) +@ [sp+4] => dst_strd (r5) +@ [sp+8] => log_wd (r6) +@ [sp+12] => wt1 (r7) +@ [sp+16] => wt2 (r8) +@ [sp+20] => ofst1 (r9) +@ [sp+24] => ofst2 (r10) +@ [sp+28] => ht (r11) +@ [sp+32] => wd (r12) +@ + + + .global ih264_weighted_bi_pred_chroma_a9q + +ih264_weighted_bi_pred_chroma_a9q: + + stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + + ldr r6, [sp, #48] @Load log_wd in r6 + ldr r7, [sp, #52] @Load wt1 in r7 + ldr r8, [sp, #56] @Load wt2 in r8 + add r6, r6, #1 @r6 = log_wd + 1 + ldr r9, [sp, #60] @Load ofst1 in r9 + ldr r10, [sp, #64] @Load ofst2 in r10 + + rsb r12, r6, #0 @r12 = -(log_wd + 1) + ldr r4, [sp, #40] @Load src_strd2 in r4 + ldr r5, [sp, #44] @Load dst_strd in r5 + vdup.16 q0, r12 @Q0 = -(log_wd + 1) (16-bit) + + ldr r11, [sp, #68] @Load ht in r11 + vdup.32 q1, r7 @Q1 = (wt1_u, wt1_v) (32-bit) + ldr r12, [sp, #72] @Load wd in r12 + vdup.32 q2, r8 @Q2 = (wt2_u, wt2_v) (32-bit) + asr r7, r9, #8 @r7 = ofst1_v + asr r8, r10, #8 @r8 = ofst2_v + vpush {d8-d15} + sxtb r9, r9 @sign-extend 8-bit ofst1_u to 32-bit + sxtb r10, r10 @sign-extend 8-bit ofst2_u to 32-bit + sxtb r7, r7 @sign-extend 8-bit ofst1_v to 32-bit + sxtb r8, r8 @sign-extend 8-bit ofst2_v to 32-bit + + add r9, r9, #1 @r9 = ofst1_u + 1 + add r7, r7, #1 @r7 = ofst1_v + 1 + add r9, r9, r10 @r9 = ofst1_u + ofst2_u + 1 + add r7, r7, r8 @r7 = ofst1_v + ofst2_v + 1 + asr r9, r9, #1 @r9 = ofst_u = (ofst1_u + ofst2_u + 1) >> 1 + asr r7, r7, #1 @r7 = ofst_v = (ofst1_v + ofst2_v + 1) >> 1 + cmp r12, #8 @check if wd is 8 + pkhbt r9, r9, r7, lsl #16 @r9 = {ofst_u(16-bit), ofst_v(16-bit)} + vdup.32 q3, r9 @Q3 = {ofst_u(16-bit), ofst_v(16-bit)} + beq loop_8_uv @branch if wd is 8 + + cmp r12, #4 @check if wd is 4 + beq loop_4_uv @branch if wd is 4 + +loop_2_uv: @each iteration processes two rows + + vld1.32 d8[0], [r0], r3 @load row 1 in source 1 + vld1.32 d8[1], [r0], r3 @load row 2 in source 1 + vld1.32 d10[0], [r1], r4 @load row 1 in source 2 + vld1.32 d10[1], [r1], r4 @load row 2 in source 2 + + vmovl.u8 q4, d8 @converting rows 1,2 in source 1 to 16-bit + vmovl.u8 q5, d10 @converting rows 1,2 in source 2 to 16-bit + + vmul.s16 q4, q4, q1 @weight 1 mult. for rows 1,2 + vmla.s16 q4, q5, q2 @weight 2 mult. 
for rows 1,2
+
+    vrshl.s16     q4, q4, q0            @rounds off the weighted samples from rows 1,2
+
+    vadd.s16      q4, q4, q3            @adding offset for rows 1,2
+
+    vqmovun.s16   d8, q4                @saturating rows 1,2 to unsigned 8-bit
+
+    vst1.32       d8[0], [r2], r5       @store row 1 in destination
+    vst1.32       d8[1], [r2], r5       @store row 2 in destination
+
+    subs          r11, r11, #2          @decrement ht by 2
+    bgt           loop_2_uv             @if greater than 0 repeat the loop again
+
+    b             end_loops_uv
+
+loop_4_uv:                              @each iteration processes two rows
+
+    vld1.8        d8, [r0], r3          @load row 1 in source 1
+    vld1.8        d10, [r1], r4         @load row 1 in source 2
+    vmovl.u8      q4, d8                @converting row 1 in source 1 to 16-bit
+    vld1.8        d12, [r0], r3         @load row 2 in source 1
+    vmovl.u8      q5, d10               @converting row 1 in source 2 to 16-bit
+    vld1.8        d14, [r1], r4         @load row 2 in source 2
+
+    vmovl.u8      q6, d12               @converting row 2 in source 1 to 16-bit
+    vmul.s16      q4, q4, q1            @weight 1 mult. for row 1
+    vmla.s16      q4, q5, q2            @weight 2 mult. for row 1
+    vmovl.u8      q7, d14               @converting row 2 in source 2 to 16-bit
+
+    vmul.s16      q6, q6, q1            @weight 1 mult. for row 2
+    vmla.s16      q6, q7, q2            @weight 2 mult. for row 2
+
+    subs          r11, r11, #2          @decrement ht by 2
+    vrshl.s16     q4, q4, q0            @rounds off the weighted samples from row 1
+    vrshl.s16     q6, q6, q0            @rounds off the weighted samples from row 2
+    vadd.s16      q4, q4, q3            @adding offset for row 1
+    vadd.s16      q6, q6, q3            @adding offset for row 2
+
+    vqmovun.s16   d8, q4                @saturating row 1 to unsigned 8-bit
+    vqmovun.s16   d12, q6               @saturating row 2 to unsigned 8-bit
+
+    vst1.8        d8, [r2], r5          @store row 1 in destination
+    vst1.8        d12, [r2], r5         @store row 2 in destination
+
+    bgt           loop_4_uv             @if greater than 0 repeat the loop again
+
+    b             end_loops_uv
+
+loop_8_uv:                              @each iteration processes four rows
+
+    vld1.8        {q4}, [r0], r3        @load row 1 in source 1
+    vld1.8        {q5}, [r1], r4        @load row 1 in source 2
+    vld1.8        {q6}, [r0], r3        @load row 2 in source 1
+    vld1.8        {q7}, [r1], r4        @load row 2 in source 2
+    vmovl.u8      q12, d8               @converting row 1L in source 1 to 16-bit
+    vld1.8        {q8}, [r0], r3        @load row 3 in source 1
+    vld1.8        {q9}, [r1], r4        @load row 3 in source 2
+    vmovl.u8      q13, d10              @converting row 1L in source 2 to 16-bit
+    vld1.8        {q10}, [r0], r3       @load row 4 in source 1
+    vld1.8        {q11}, [r1], r4       @load row 4 in source 2
+
+    vmovl.u8      q4, d9                @converting row 1H in source 1 to 16-bit
+    vmovl.u8      q5, d11               @converting row 1H in source 2 to 16-bit
+
+    vmul.s16      q12, q12, q1          @weight 1 mult. for row 1L
+    vmla.s16      q12, q13, q2          @weight 2 mult. for row 1L
+    vmovl.u8      q14, d12              @converting row 2L in source 1 to 16-bit
+    vmovl.u8      q15, d14              @converting row 2L in source 2 to 16-bit
+
+    vmul.s16      q4, q4, q1            @weight 1 mult. for row 1H
+    vmla.s16      q4, q5, q2            @weight 2 mult. for row 1H
+    vmovl.u8      q6, d13               @converting row 2H in source 1 to 16-bit
+    vmovl.u8      q7, d15               @converting row 2H in source 2 to 16-bit
+
+    vmul.s16      q14, q14, q1          @weight 1 mult. for row 2L
+    vmla.s16      q14, q15, q2          @weight 2 mult. for row 2L
+    vmovl.u8      q13, d16              @converting row 3L in source 1 to 16-bit
+    vmovl.u8      q5, d18               @converting row 3L in source 2 to 16-bit
+
+    vmul.s16      q6, q6, q1            @weight 1 mult. for row 2H
+    vmla.s16      q6, q7, q2            @weight 2 mult. for row 2H
+    vmovl.u8      q8, d17               @converting row 3H in source 1 to 16-bit
+    vmovl.u8      q9, d19               @converting row 3H in source 2 to 16-bit
+
+    vmul.s16      q13, q13, q1          @weight 1 mult. for row 3L
+    vmla.s16      q13, q5, q2           @weight 2 mult. for row 3L
+    vmovl.u8      q15, d20              @converting row 4L in source 1 to 16-bit
+    vmovl.u8      q7, d22               @converting row 4L in source 2 to 16-bit
+
+    vmul.s16      q8, q8, q1            @weight 1 mult. for row 3H
+    vmla.s16      q8, q9, q2            @weight 2 mult.
for row 3H + vmovl.u8 q10, d21 @converting row 4H in source 1 to 16-bit + vmovl.u8 q11, d23 @converting row 4H in source 2 to 16-bit + + vmul.s16 q15, q15, q1 @weight 1 mult. for row 4L + vmla.s16 q15, q7, q2 @weight 2 mult. for row 4L + vrshl.s16 q12, q12, q0 @rounds off the weighted samples from row 1L + + vmul.s16 q10, q10, q1 @weight 1 mult. for row 4H + vmla.s16 q10, q11, q2 @weight 2 mult. for row 4H + vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 1H + + vrshl.s16 q14, q14, q0 @rounds off the weighted samples from row 2L + vadd.s16 q12, q12, q3 @adding offset for row 1L + vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 2H + vadd.s16 q4, q4, q3 @adding offset for row 1H + vrshl.s16 q13, q13, q0 @rounds off the weighted samples from row 3L + vadd.s16 q14, q14, q3 @adding offset for row 2L + vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 3H + vadd.s16 q6, q6, q3 @adding offset for row 2H + vrshl.s16 q15, q15, q0 @rounds off the weighted samples from row 4L + vadd.s16 q13, q13, q3 @adding offset for row 3L + vrshl.s16 q10, q10, q0 @rounds off the weighted samples from row 4H + vadd.s16 q8, q8, q3 @adding offset for row 3H + + vqmovun.s16 d10, q12 @saturating row 1L to unsigned 8-bit + vadd.s16 q15, q15, q3 @adding offset for row 4L + vqmovun.s16 d11, q4 @saturating row 1H to unsigned 8-bit + vadd.s16 q10, q10, q3 @adding offset for row 4H + + vqmovun.s16 d18, q14 @saturating row 2L to unsigned 8-bit + vqmovun.s16 d19, q6 @saturating row 2H to unsigned 8-bit + vqmovun.s16 d14, q13 @saturating row 3L to unsigned 8-bit + vqmovun.s16 d15, q8 @saturating row 3H to unsigned 8-bit + vst1.8 {q5}, [r2], r5 @store row 1 in destination + vqmovun.s16 d22, q15 @saturating row 4L to unsigned 8-bit + vqmovun.s16 d23, q10 @saturating row 4H to unsigned 8-bit + + vst1.8 {q9}, [r2], r5 @store row 2 in destination + subs r11, r11, #4 @decrement ht by 4 + vst1.8 {q7}, [r2], r5 @store row 3 in destination + vst1.8 {q11}, [r2], r5 @store row 4 in destination + + bgt loop_8_uv @if greater than 0 repeat the loop again + +end_loops_uv: + + vpop {d8-d15} + ldmfd sp!, {r4-r12, r15} @Reload the registers from sp + + diff --git a/common/arm/ih264_weighted_pred_a9q.s b/common/arm/ih264_weighted_pred_a9q.s new file mode 100755 index 0000000..1ce94d0 --- /dev/null +++ b/common/arm/ih264_weighted_pred_a9q.s @@ -0,0 +1,479 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_weighted_pred_a9q.s +@* +@* @brief +@* Contains function definitions for weighted prediction. 
+@* +@* @author +@* Kaushik Senthoor R +@* +@* @par List of Functions: +@* +@* - ih264_weighted_pred_luma_a9q() +@* - ih264_weighted_pred_chroma_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@******************************************************************************* +@* @function +@* ih264_weighted_pred_luma_a9q() +@* +@* @brief +@* This routine performs the default weighted prediction as described in sec +@* 8.4.2.3.2 titled "Weighted sample prediction process" for luma. +@* +@* @par Description: +@* This function gets a ht x wd block, calculates the weighted sample, rounds +@* off, adds offset and stores it in the destination block. +@* +@* @param[in] pu1_src: +@* UWORD8 Pointer to the buffer containing the input block. +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination where the output block is stored. +@* +@* @param[in] src_strd +@* Stride of the input buffer +@* +@* @param[in] dst_strd +@* Stride of the destination buffer +@* +@* @param[in] log_wd +@* number of bits to be rounded off +@* +@* @param[in] wt +@* weight for the weighted prediction +@* +@* @param[in] ofst +@* offset used after rounding off +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* None +@* +@* @remarks +@* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16). +@* +@******************************************************************************* +@*/ +@void ih264_weighted_pred_luma_a9q(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 log_wd, +@ WORD32 wt, +@ WORD32 ofst, +@ WORD32 ht, +@ WORD32 wd) +@ +@**************Variables Vs Registers***************************************** +@ r0 => pu1_src +@ r1 => pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ [sp] => log_wd (r4) +@ [sp+4] => wt (r5) +@ [sp+8] => ofst (r6) +@ [sp+12] => ht (r7) +@ [sp+16] => wd (r8) +@ +.text +.p2align 2 + + .global ih264_weighted_pred_luma_a9q + +ih264_weighted_pred_luma_a9q: + + stmfd sp!, {r4-r9, r14} @stack stores the values of the arguments + ldr r5, [sp, #32] @Load wt + ldr r4, [sp, #28] @Load log_wd in r4 + ldr r6, [sp, #36] @Load ofst + ldr r7, [sp, #40] @Load ht + ldr r8, [sp, #44] @Load wd + vpush {d8-d15} + + vdup.16 d2, r5 @D2 = wt (16-bit) + rsb r9, r4, #0 @r9 = -log_wd + vdup.8 d3, r6 @D3 = ofst (8-bit) + cmp r8, #16 @check if wd is 16 + vdup.16 q0, r9 @Q0 = -log_wd (16-bit) + beq loop_16 @branch if wd is 16 + + cmp r8, #8 @check if wd is 8 + beq loop_8 @branch if wd is 8 + +loop_4: @each iteration processes four rows + + vld1.32 d4[0], [r0], r2 @load row 1 in source + vld1.32 d4[1], [r0], r2 @load row 2 in source + vld1.32 d6[0], [r0], r2 @load row 3 in source + vld1.32 d6[1], [r0], r2 @load row 4 in source + + vmovl.u8 q2, d4 @converting rows 1,2 to 16-bit + vmovl.u8 q3, d6 @converting rows 3,4 to 16-bit + + vmul.s16 q2, q2, d2[0] @weight mult. for rows 1,2 + vmul.s16 q3, q3, d2[0] @weight mult. 
for rows 3,4
+
+    subs          r7, r7, #4            @decrement ht by 4
+    vrshl.s16     q2, q2, q0            @rounds off the weighted samples from rows 1,2
+    vrshl.s16     q3, q3, q0            @rounds off the weighted samples from rows 3,4
+
+    vaddw.s8      q2, q2, d3            @adding offset for rows 1,2
+    vaddw.s8      q3, q3, d3            @adding offset for rows 3,4
+
+    vqmovun.s16   d4, q2                @saturating rows 1,2 to unsigned 8-bit
+    vqmovun.s16   d6, q3                @saturating rows 3,4 to unsigned 8-bit
+
+    vst1.32       d4[0], [r1], r3       @store row 1 in destination
+    vst1.32       d4[1], [r1], r3       @store row 2 in destination
+    vst1.32       d6[0], [r1], r3       @store row 3 in destination
+    vst1.32       d6[1], [r1], r3       @store row 4 in destination
+
+    bgt           loop_4                @if greater than 0 repeat the loop again
+
+    b             end_loops
+
+loop_8:                                 @each iteration processes four rows
+
+    vld1.8        d4, [r0], r2          @load row 1 in source
+    vld1.8        d6, [r0], r2          @load row 2 in source
+    vld1.8        d8, [r0], r2          @load row 3 in source
+    vmovl.u8      q2, d4                @converting row 1 to 16-bit
+    vld1.8        d10, [r0], r2         @load row 4 in source
+    vmovl.u8      q3, d6                @converting row 2 to 16-bit
+
+    vmovl.u8      q4, d8                @converting row 3 to 16-bit
+    vmul.s16      q2, q2, d2[0]         @weight mult. for row 1
+    vmovl.u8      q5, d10               @converting row 4 to 16-bit
+    vmul.s16      q3, q3, d2[0]         @weight mult. for row 2
+    vmul.s16      q4, q4, d2[0]         @weight mult. for row 3
+    vmul.s16      q5, q5, d2[0]         @weight mult. for row 4
+
+    vrshl.s16     q2, q2, q0            @rounds off the weighted samples from row 1
+    vrshl.s16     q3, q3, q0            @rounds off the weighted samples from row 2
+    vrshl.s16     q4, q4, q0            @rounds off the weighted samples from row 3
+    vaddw.s8      q2, q2, d3            @adding offset for row 1
+    vrshl.s16     q5, q5, q0            @rounds off the weighted samples from row 4
+    vaddw.s8      q3, q3, d3            @adding offset for row 2
+
+    vaddw.s8      q4, q4, d3            @adding offset for row 3
+    vqmovun.s16   d4, q2                @saturating row 1 to unsigned 8-bit
+    vaddw.s8      q5, q5, d3            @adding offset for row 4
+    vqmovun.s16   d6, q3                @saturating row 2 to unsigned 8-bit
+    vqmovun.s16   d8, q4                @saturating row 3 to unsigned 8-bit
+    vqmovun.s16   d10, q5               @saturating row 4 to unsigned 8-bit
+
+    vst1.8        d4, [r1], r3          @store row 1 in destination
+    vst1.8        d6, [r1], r3          @store row 2 in destination
+    subs          r7, r7, #4            @decrement ht by 4
+    vst1.8        d8, [r1], r3          @store row 3 in destination
+    vst1.8        d10, [r1], r3         @store row 4 in destination
+
+    bgt           loop_8                @if greater than 0 repeat the loop again
+
+    b             end_loops
+
+loop_16:                                @each iteration processes four rows
+
+    vld1.8        {q2}, [r0], r2        @load row 1 in source
+    vld1.8        {q3}, [r0], r2        @load row 2 in source
+    vmovl.u8      q6, d4                @converting row 1L to 16-bit
+    vld1.8        {q4}, [r0], r2        @load row 3 in source
+    vmovl.u8      q7, d5                @converting row 1H to 16-bit
+    vld1.8        {q5}, [r0], r2        @load row 4 in source
+
+    vmovl.u8      q8, d6                @converting row 2L to 16-bit
+    vmul.s16      q6, q6, d2[0]         @weight mult. for row 1L
+    vmovl.u8      q9, d7                @converting row 2H to 16-bit
+    vmul.s16      q7, q7, d2[0]         @weight mult. for row 1H
+    vmovl.u8      q10, d8               @converting row 3L to 16-bit
+    vmul.s16      q8, q8, d2[0]         @weight mult. for row 2L
+    vmovl.u8      q11, d9               @converting row 3H to 16-bit
+    vmul.s16      q9, q9, d2[0]         @weight mult. for row 2H
+    vmovl.u8      q12, d10              @converting row 4L to 16-bit
+    vmul.s16      q10, q10, d2[0]       @weight mult. for row 3L
+    vmovl.u8      q13, d11              @converting row 4H to 16-bit
+    vmul.s16      q11, q11, d2[0]       @weight mult. for row 3H
+
+    vmul.s16      q12, q12, d2[0]       @weight mult. for row 4L
+    vrshl.s16     q6, q6, q0            @rounds off the weighted samples from row 1L
+    vmul.s16      q13, q13, d2[0]       @weight mult.
for row 4H + + vrshl.s16 q7, q7, q0 @rounds off the weighted samples from row 1H + vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 2L + vaddw.s8 q6, q6, d3 @adding offset for row 1L + vrshl.s16 q9, q9, q0 @rounds off the weighted samples from row 2H + vaddw.s8 q7, q7, d3 @adding offset for row 1H + vqmovun.s16 d4, q6 @saturating row 1L to unsigned 8-bit + vrshl.s16 q10, q10, q0 @rounds off the weighted samples from row 3L + vaddw.s8 q8, q8, d3 @adding offset for row 2L + vqmovun.s16 d5, q7 @saturating row 1H to unsigned 8-bit + vrshl.s16 q11, q11, q0 @rounds off the weighted samples from row 3H + vaddw.s8 q9, q9, d3 @adding offset for row 2H + vqmovun.s16 d6, q8 @saturating row 2L to unsigned 8-bit + vrshl.s16 q12, q12, q0 @rounds off the weighted samples from row 4L + vaddw.s8 q10, q10, d3 @adding offset for row 3L + vqmovun.s16 d7, q9 @saturating row 2H to unsigned 8-bit + vrshl.s16 q13, q13, q0 @rounds off the weighted samples from row 4H + vaddw.s8 q11, q11, d3 @adding offset for row 3H + + vqmovun.s16 d8, q10 @saturating row 3L to unsigned 8-bit + vaddw.s8 q12, q12, d3 @adding offset for row 4L + vqmovun.s16 d9, q11 @saturating row 3H to unsigned 8-bit + vaddw.s8 q13, q13, d3 @adding offset for row 4H + + vqmovun.s16 d10, q12 @saturating row 4L to unsigned 8-bit + vst1.8 {q2}, [r1], r3 @store row 1 in destination + vqmovun.s16 d11, q13 @saturating row 4H to unsigned 8-bit + vst1.8 {q3}, [r1], r3 @store row 2 in destination + subs r7, r7, #4 @decrement ht by 4 + vst1.8 {q4}, [r1], r3 @store row 3 in destination + vst1.8 {q5}, [r1], r3 @store row 4 in destination + + bgt loop_16 @if greater than 0 repeat the loop again + +end_loops: + + vpop {d8-d15} + ldmfd sp!, {r4-r9, r15} @Reload the registers from sp + + +@******************************************************************************* +@* @function +@* ih264_weighted_pred_chroma_a9q() +@* +@* @brief +@* This routine performs the default weighted prediction as described in sec +@* 8.4.2.3.2 titled "Weighted sample prediction process" for chroma. +@* +@* @par Description: +@* This function gets a ht x wd block, calculates the weighted sample, rounds +@* off, adds offset and stores it in the destination block for U and V. +@* +@* @param[in] pu1_src: +@* UWORD8 Pointer to the buffer containing the input block. +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination where the output block is stored. +@* +@* @param[in] src_strd +@* Stride of the input buffer +@* +@* @param[in] dst_strd +@* Stride of the destination buffer +@* +@* @param[in] log_wd +@* number of bits to be rounded off +@* +@* @param[in] wt +@* weights for the weighted prediction for U and V +@* +@* @param[in] ofst +@* offsets used after rounding off for U and V +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* None +@* +@* @remarks +@* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8). 
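+@*
+@*  A scalar sketch per plane (illustrative C, not this library's code);
+@*  for log_wd == 0 the rounding shift adds no bias, so this reduces to
+@*  CLIP_U8(src * wt + ofst):
+@*
+@*    dst = CLIP_U8(((src * wt + (1 << (log_wd - 1))) >> log_wd) + ofst);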
+@*
+@*******************************************************************************
+@*/
+@void ih264_weighted_pred_chroma_a9q(UWORD8 *pu1_src,
+@                                    UWORD8 *pu1_dst,
+@                                    WORD32 src_strd,
+@                                    WORD32 dst_strd,
+@                                    WORD32 log_wd,
+@                                    WORD32 wt,
+@                                    WORD32 ofst,
+@                                    WORD32 ht,
+@                                    WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@    r0      => pu1_src
+@    r1      => pu1_dst
+@    r2      => src_strd
+@    r3      => dst_strd
+@    [sp]    => log_wd (r4)
+@    [sp+4]  => wt     (r5)
+@    [sp+8]  => ofst   (r6)
+@    [sp+12] => ht     (r7)
+@    [sp+16] => wd     (r8)
+@
+
+
+    .global ih264_weighted_pred_chroma_a9q
+
+ih264_weighted_pred_chroma_a9q:
+
+    stmfd         sp!, {r4-r9, r14}     @stack stores the values of the arguments
+
+    ldr           r4, [sp, #28]         @Load log_wd in r4
+    ldr           r5, [sp, #32]         @Load wt = {wt_u (16-bit), wt_v (16-bit)}
+    ldr           r6, [sp, #36]         @Load ofst = {ofst_u (8-bit), ofst_v (8-bit)}
+    ldr           r8, [sp, #44]         @Load wd
+
+    rsb           r9, r4, #0            @r9 = -log_wd
+    vdup.32       q1, r5                @Q1 = {wt_u (16-bit), wt_v (16-bit)}
+    ldr           r7, [sp, #40]         @Load ht
+    vpush         {d8-d15}
+    vdup.16       d4, r6                @D4 = {ofst_u (8-bit), ofst_v (8-bit)}
+    cmp           r8, #8                @check if wd is 8
+    vdup.16       q0, r9                @Q0 = -log_wd (16-bit)
+    beq           loop_8_uv             @branch if wd is 8
+
+    cmp           r8, #4                @check if wd is 4
+    beq           loop_4_uv             @branch if wd is 4
+
+loop_2_uv:                              @each iteration processes two rows
+
+    vld1.32       d6[0], [r0], r2       @load row 1 in source
+    vld1.32       d6[1], [r0], r2       @load row 2 in source
+
+    vmovl.u8      q3, d6                @converting rows 1,2 to 16-bit
+
+    vmul.s16      q3, q3, q1            @weight mult. for rows 1,2
+
+    vrshl.s16     q3, q3, q0            @rounds off the weighted samples from rows 1,2
+
+    vaddw.s8      q3, q3, d4            @adding offset for rows 1,2
+
+    vqmovun.s16   d6, q3                @saturating rows 1,2 to unsigned 8-bit
+
+    subs          r7, r7, #2            @decrement ht by 2
+    vst1.32       d6[0], [r1], r3       @store row 1 in destination
+    vst1.32       d6[1], [r1], r3       @store row 2 in destination
+
+    bgt           loop_2_uv             @if greater than 0 repeat the loop again
+
+    b             end_loops_uv
+
+loop_4_uv:                              @each iteration processes two rows
+
+    vld1.8        d6, [r0], r2          @load row 1 in source
+    vld1.8        d8, [r0], r2          @load row 2 in source
+
+    vmovl.u8      q3, d6                @converting row 1 to 16-bit
+    vmovl.u8      q4, d8                @converting row 2 to 16-bit
+
+    vmul.s16      q3, q3, q1            @weight mult. for row 1
+    vmul.s16      q4, q4, q1            @weight mult. for row 2
+
+    subs          r7, r7, #2            @decrement ht by 2
+    vrshl.s16     q3, q3, q0            @rounds off the weighted samples from row 1
+    vrshl.s16     q4, q4, q0            @rounds off the weighted samples from row 2
+
+    vaddw.s8      q3, q3, d4            @adding offset for row 1
+    vaddw.s8      q4, q4, d4            @adding offset for row 2
+
+    vqmovun.s16   d6, q3                @saturating row 1 to unsigned 8-bit
+    vqmovun.s16   d8, q4                @saturating row 2 to unsigned 8-bit
+
+    vst1.8        d6, [r1], r3          @store row 1 in destination
+    vst1.8        d8, [r1], r3          @store row 2 in destination
+
+    bgt           loop_4_uv             @if greater than 0 repeat the loop again
+
+    b             end_loops_uv
+
+loop_8_uv:                              @each iteration processes four rows
+
+    vld1.8        {q3}, [r0], r2        @load row 1 in source
+    vld1.8        {q4}, [r0], r2        @load row 2 in source
+    vmovl.u8      q7, d6                @converting row 1L to 16-bit
+    vld1.8        {q5}, [r0], r2        @load row 3 in source
+    vmovl.u8      q8, d7                @converting row 1H to 16-bit
+    vld1.8        {q6}, [r0], r2        @load row 4 in source
+
+    vmul.s16      q7, q7, q1            @weight mult. for row 1L
+    vmovl.u8      q9, d8                @converting row 2L to 16-bit
+    vmul.s16      q8, q8, q1            @weight mult. for row 1H
+    vmovl.u8      q10, d9               @converting row 2H to 16-bit
+    vmul.s16      q9, q9, q1            @weight mult. for row 2L
+    vmovl.u8      q11, d10              @converting row 3L to 16-bit
+    vmul.s16      q10, q10, q1          @weight mult.
for row 2H
+    vmovl.u8      q12, d11              @converting row 3H to 16-bit
+    vmul.s16      q11, q11, q1          @weight mult. for row 3L
+    vmovl.u8      q13, d12              @converting row 4L to 16-bit
+    vmul.s16      q12, q12, q1          @weight mult. for row 3H
+    vmovl.u8      q14, d13              @converting row 4H to 16-bit
+
+    vmul.s16      q13, q13, q1          @weight mult. for row 4L
+    vrshl.s16     q7, q7, q0            @rounds off the weighted samples from row 1L
+    vmul.s16      q14, q14, q1          @weight mult. for row 4H
+
+    vrshl.s16     q8, q8, q0            @rounds off the weighted samples from row 1H
+    vrshl.s16     q9, q9, q0            @rounds off the weighted samples from row 2L
+    vaddw.s8      q7, q7, d4            @adding offset for row 1L
+    vrshl.s16     q10, q10, q0          @rounds off the weighted samples from row 2H
+    vaddw.s8      q8, q8, d4            @adding offset for row 1H
+    vqmovun.s16   d6, q7                @saturating row 1L to unsigned 8-bit
+    vrshl.s16     q11, q11, q0          @rounds off the weighted samples from row 3L
+    vaddw.s8      q9, q9, d4            @adding offset for row 2L
+    vqmovun.s16   d7, q8                @saturating row 1H to unsigned 8-bit
+    vrshl.s16     q12, q12, q0          @rounds off the weighted samples from row 3H
+    vaddw.s8      q10, q10, d4          @adding offset for row 2H
+    vqmovun.s16   d8, q9                @saturating row 2L to unsigned 8-bit
+    vrshl.s16     q13, q13, q0          @rounds off the weighted samples from row 4L
+    vaddw.s8      q11, q11, d4          @adding offset for row 3L
+    vqmovun.s16   d9, q10               @saturating row 2H to unsigned 8-bit
+    vrshl.s16     q14, q14, q0          @rounds off the weighted samples from row 4H
+    vaddw.s8      q12, q12, d4          @adding offset for row 3H
+
+    vqmovun.s16   d10, q11              @saturating row 3L to unsigned 8-bit
+    vaddw.s8      q13, q13, d4          @adding offset for row 4L
+    vqmovun.s16   d11, q12              @saturating row 3H to unsigned 8-bit
+    vaddw.s8      q14, q14, d4          @adding offset for row 4H
+
+    vqmovun.s16   d12, q13              @saturating row 4L to unsigned 8-bit
+    vst1.8        {q3}, [r1], r3        @store row 1 in destination
+    vqmovun.s16   d13, q14              @saturating row 4H to unsigned 8-bit
+    vst1.8        {q4}, [r1], r3        @store row 2 in destination
+    subs          r7, r7, #4            @decrement ht by 4
+    vst1.8        {q5}, [r1], r3        @store row 3 in destination
+    vst1.8        {q6}, [r1], r3        @store row 4 in destination
+
+    bgt           loop_8_uv             @if greater than 0 repeat the loop again
+
+end_loops_uv:
+
+    vpop          {d8-d15}
+    ldmfd         sp!, {r4-r9, r15}     @Reload the registers from sp
+
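+
+@/**
+@* Usage sketch (hypothetical caller, not part of this file), following the
+@* prototype documented above for an 8x8 chroma block:
+@*
+@*   ih264_weighted_pred_chroma_a9q(pu1_src, pu1_dst, src_strd, dst_strd,
+@*                                  log_wd, wt, ofst, 8, 8);
+@*/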