@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@/*****************************************************************************/
@/*                                                                           */
@/*  File Name         : ih264_deblk_chroma_a9.s                              */
@/*                                                                           */
@/*  Description       : Contains function definitions for deblocking chroma  */
@/*                      edge. Functions are coded in NEON assembly and can   */
@/*                      be compiled using ARM RVDS.                          */
@/*                                                                           */
@/*  List of Functions : ih264_deblk_chroma_vert_bs4_bp_a9()                  */
@/*                      ih264_deblk_chroma_vert_bslt4_bp_a9()                */
@/*                      ih264_deblk_chroma_horz_bs4_bp_a9()                  */
@/*                      ih264_deblk_chroma_horz_bslt4_bp_a9()                */
@/*                      ih264_deblk_chroma_vert_bs4_mbaff_bp_a9()            */
@/*                      ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9()          */
@/*                      ih264_deblk_chroma_vert_bs4_a9()                     */
@/*                      ih264_deblk_chroma_vert_bslt4_a9()                   */
@/*                      ih264_deblk_chroma_horz_bs4_a9()                     */
@/*                      ih264_deblk_chroma_horz_bslt4_a9()                   */
@/*                      ih264_deblk_chroma_vert_bs4_mbaff_a9()               */
@/*                      ih264_deblk_chroma_vert_bslt4_mbaff_a9()             */
@/*                                                                           */
@/*  Issues / Problems : None                                                 */
@/*                                                                           */
@/*  Revision History  :                                                      */
@/*                                                                           */
@/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
@/*         28 11 2013   Ittiam          Draft                                */
@/*         05 01 2015   Kaushik         Added double-call functions for      */
@/*                      Senthoor        vertical deblocking, and high        */
@/*                                      profile functions.                   */
@/*                                                                           */
@/*****************************************************************************/

@ Syntax: GNU as, ARM (AArch32) with NEON; '@' starts a comment that runs to
@ the end of the line, so every directive/instruction must be on its own line.

.text
.p2align 2

@**
@*******************************************************************************
@*
@* @brief
@*     Performs filtering of a chroma block horizontal edge when the
@*     boundary strength is set to 4
@*
@* @par Description:
@*    This operation is described in  Sec. 8.7.2.4 under the title
@*    "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@*  Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@*  Source stride
@*
@* @param[in] r2 - alpha
@*  Alpha Value for the boundary
@*
@* @param[in] r3 - beta
@*  Beta Value for the boundary
@*
@* @returns
@*  None
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@ Register roles (as used below):
@   r0 = running row pointer (p1 -> p0 -> q0 -> q1), r4 = saved pointer to p0 row
@   d6/d7 = p1 U/V, d4/d5 = p0 U/V, d0/d1 = q0 U/V, d2/d3 = q1 U/V
@     (vld2.8 de-interleaves the U/V byte pairs of each row)
@   q10 = alpha, q8 = beta, q9 = "do not filter" mask
@   d8-d15 are overwritten (q4, q5, q7), hence the vpush/vpop pair.

    .global ih264_deblk_chroma_horz_bs4_bp_a9

ih264_deblk_chroma_horz_bs4_bp_a9:

    stmfd         sp!, {r4, lr}
    vpush         {d8 - d15}
    sub           r0, r0, r1, lsl #1    @R0 = uc_edgePixel pointing to p1 of chroma
    vld2.8        {d6, d7}, [r0], r1    @D6 = p1u , D7 = p1v
    mov           r4, r0                @Keeping a backup of the pointer p0 of chroma
    vld2.8        {d4, d5}, [r0], r1    @D4 = p0u , D5 = p0v
    vdup.8        q10, r2               @Q10 contains alpha
    vld2.8        {d0, d1}, [r0], r1    @D0 = q0u , D1 = q0v
    vaddl.u8      q4, d6, d0
    vaddl.u8      q5, d7, d1            @Q4,Q5 = q0 + p1
    vmov.i8       d31, #2
    vld2.8        {d2, d3}, [r0]        @D2 = q1u , D3 = q1v
    vabd.u8       q13, q3, q2           @Q13 = ABS(p1 - p0)
    vmlal.u8      q4, d2, d31
    vmlal.u8      q5, d3, d31           @Q5,Q4 = (X2(q1) + q0 + p1)
    vabd.u8       q11, q2, q0           @Q11 = ABS(p0 - q0)
    vabd.u8       q12, q1, q0           @Q12 = ABS(q1 - q0)
    vaddl.u8      q7, d4, d2
    vaddl.u8      q14, d5, d3           @Q14,Q7 = p0 + q1
    vdup.8        q8, r3                @Q8 contains beta
    vmlal.u8      q7, d6, d31
    vmlal.u8      q14, d7, d31          @Q14,Q7 = (X2(p1) + p0 + q1)
    vcge.u8       q9, q11, q10          @Q9 = ( ABS(p0 - q0) >= Alpha )
    vcge.u8       q12, q12, q8          @Q12= ( ABS(q1 - q0) >= Beta )
    vcge.u8       q13, q13, q8          @Q13= ( ABS(p1 - p0) >= Beta )
    vrshrn.u16    d8, q4, #2
    vrshrn.u16    d9, q5, #2            @Q4 = (X2(q1) + q0 + p1 + 2) >> 2  (new q0)
    vorr          q9, q9, q12           @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
    vrshrn.u16    d10, q7, #2
    vrshrn.u16    d11, q14, #2          @Q5 = (X2(p1) + p0 + q1 + 2) >> 2  (new p0)
    vorr          q9, q9, q13           @Q9 |= ( ABS(p1 - p0) >= Beta )
    vbit          q5, q2, q9            @keep original p0 where the mask is set
    vbit          q4, q0, q9            @keep original q0 where the mask is set
    vst2.8        {d10, d11}, [r4], r1  @store p0 row (U/V re-interleaved)
    vst2.8        {d8, d9}, [r4]        @store q0 row
    vpop          {d8 - d15}
    ldmfd         sp!, {r4, pc}



@**
@*******************************************************************************
@*
@* @brief
@*     Performs filtering of a chroma block vertical edge when the
@*     boundary strength is set to 4
@*
@* @par Description:
@*    This operation is described in  Sec. 8.7.2.4 under the title
@*    "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@*  Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@*  Source stride
@*
@* @param[in] r2 - alpha
@*  Alpha Value for the boundary
@*
@* @param[in] r3 - beta
@*  Beta Value for the boundary
@*
@* @returns
@*  None
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@ Register roles (as used below):
@   r0 = running row pointer, r12 = saved start pointer used for the stores
@   q0 = p1, q1 = p0, q2 = q0, q3 = q1 for 8 rows; each 16-bit lane is one
@   two-byte sample pair loaded by the vld4.16 lane loads
@   q11 = alpha, q12 = beta, q4 = filter-enable mask
@   d8-d15 are overwritten (q4-q7), hence the vpush/vpop pair.

    .global ih264_deblk_chroma_vert_bs4_bp_a9

ih264_deblk_chroma_vert_bs4_bp_a9:

    stmfd         sp!, {r12, r14}
    vpush         {d8 - d15}
    sub           r0, r0, #4            @point r0 to p1u of row0.
    mov           r12, r0               @keep a back up of r0 for buffer write

    @ One row per load: four 16-bit elements go to lane n of the four registers.
    vld4.16       {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
    vld4.16       {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
    vld4.16       {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
    vld4.16       {d0[3], d2[3], d4[3], d6[3]}, [r0], r1

    vld4.16       {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
    vld4.16       {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
    vld4.16       {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
    vld4.16       {d1[3], d3[3], d5[3], d7[3]}, [r0], r1

    vdup.8        q11, r2               @Q11 = alpha
    vdup.8        q12, r3               @Q12 = beta
    vmov.i8       d31, #2

    vabd.u8       q4, q1, q2            @|p0-q0|
    vabd.u8       q5, q3, q2            @|q1-q0|
    vabd.u8       q6, q0, q1            @|p1-p0|
    vaddl.u8      q7, d2, d6
    vaddl.u8      q8, d3, d7            @(p0 + q1)
    vclt.u8       q4, q4, q11           @|p0-q0| < alpha ?
    vclt.u8       q5, q5, q12           @|q1-q0| < beta ?
    vclt.u8       q6, q6, q12           @|p1-p0| < beta ?
    vmlal.u8      q7, d0, d31
    vmlal.u8      q8, d1, d31           @2*p1 + (p0 + q1)
    vaddl.u8      q9, d0, d4
    vaddl.u8      q10, d1, d5           @(p1 + q0)
    vand.u8       q4, q4, q5            @|p0-q0| < alpha && |q1-q0| < beta
    vmlal.u8      q9, d6, d31
    vmlal.u8      q10, d7, d31          @2*q1 + (p1 + q0)

    vrshrn.i16    d14, q7, #2
    vrshrn.i16    d15, q8, #2           @(2*p1 + (p0 + q1) + 2) >> 2
    vand.u8       q4, q4, q6            @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
    vrshrn.i16    d18, q9, #2
    vrshrn.i16    d19, q10, #2          @(2*q1 + (p1 + q0) + 2) >> 2

    vbit          q1, q7, q4            @replace p0 where the mask is set
    vbit          q2, q9, q4            @replace q0 where the mask is set

    vst4.16       {d0[0], d2[0], d4[0], d6[0]}, [r12], r1
    vst4.16       {d0[1], d2[1], d4[1], d6[1]}, [r12], r1
    vst4.16       {d0[2], d2[2], d4[2], d6[2]}, [r12], r1
    vst4.16       {d0[3], d2[3], d4[3], d6[3]}, [r12], r1

    vst4.16       {d1[0], d3[0], d5[0], d7[0]}, [r12], r1
    vst4.16       {d1[1], d3[1], d5[1], d7[1]}, [r12], r1
    vst4.16       {d1[2], d3[2], d5[2], d7[2]}, [r12], r1
    vst4.16       {d1[3], d3[3], d5[3], d7[3]}, [r12], r1
    vpop          {d8 - d15}
    ldmfd         sp!, {r12, pc}



@**
@*******************************************************************************
@*
@* @brief
@*     Performs filtering of a chroma block horizontal edge for cases where the
@*     boundary strength is less than 4
@*
@* @par Description:
@*    This operation is described in  Sec. 8.7.2.3 under the title
@*    "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@*  Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@*  Source stride
@*
@* @param[in] r2 - alpha
@*  Alpha Value for the boundary
@*
@* @param[in] r3 - beta
@*  Beta Value for the boundary
@*
@* @param[in] sp(0) - u4_bs
@*  Packed Boundary strength array
@*
@* @param[in] sp(4) - pu1_cliptab
@*  tc0_table
@*
@* @returns
@*  None
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@ Register roles (as used below):
@   r4 = u4_bs (byte-reversed before use), r5 = pu1_cliptab, r6 = saved p0 row pointer
@   q3 = p1, q2 = p0, q0 = q0, q1 = q1 (U in the even D register, V in the odd one)
@   q10 = alpha (later reused for p1 - q1), q8 = beta
@   q6 = filter-enable mask, q7 = clip value C then delta, q4 = i_macro / sign mask
@   d8-d15 are overwritten (q4-q7), hence the vpush/vpop pair.

    .global ih264_deblk_chroma_horz_bslt4_bp_a9

ih264_deblk_chroma_horz_bslt4_bp_a9:

    stmfd         sp!, {r4-r6, lr}
    ldrd          r4, r5, [sp, #0x10]   @r4 = u4_bs , r5 = pu1_cliptab (16 bytes were just pushed)
    vpush         {d8 - d15}
    sub           r0, r0, r1, lsl #1    @R0 = uc_edgePixelU pointing to p1 of chroma U
    rev           r4, r4                @byte-reverse the packed strengths
    vmov.32       d12[0], r4            @d12[0] = ui_Bs
    vld1.32       d16[0], [r5]          @D16[0] contains cliptab
    vld2.8        {d6, d7}, [r0], r1    @Q3=p1
    vtbl.8        d14, {d16}, d12       @cliptab lookup indexed by the Bs bytes
    vmovl.u8      q6, d12               @q6 = uc_Bs in each 16 bit scalar
    mov           r6, r0                @Keeping a backup of the pointer to chroma U P0
    vld2.8        {d4, d5}, [r0], r1    @Q2=p0
    vmov.i8       d30, #1
    vdup.8        q10, r2               @Q10 contains alpha
    vld2.8        {d0, d1}, [r0], r1    @Q0=q0
    vmovl.u8      q7, d14               @widen the looked-up clip values to 16 bit
    vld2.8        {d2, d3}, [r0]        @Q1=q1
    vsubl.u8      q5, d1, d5
    vsubl.u8      q4, d0, d4            @Q5,Q4 = (q0 - p0)
    vabd.u8       q13, q3, q2           @Q13 = ABS(p1 - p0)
    vshl.i16      q5, q5, #2            @Q5 = (q0 - p0)<<2
    vabd.u8       q11, q2, q0           @Q11 = ABS(p0 - q0)
    vshl.i16      q4, q4, #2            @Q4 = (q0 - p0)<<2
    vsli.16       q7, q7, #8            @copy each clip byte into both bytes of its 16-bit lane
    vabd.u8       q12, q1, q0           @Q12 = ABS(q1 - q0)
    vcge.u8       q9, q11, q10          @Q9 = ( ABS(p0 - q0) >= Alpha )
    vsubl.u8      q10, d6, d2           @Q10 = (p1 - q1)L
    vsubl.u8      q3, d7, d3            @Q3 = (p1 - q1)H
    vdup.8        q8, r3                @Q8 contains beta
    vadd.i16      q4, q4, q10
    vadd.i16      q5, q5, q3            @Q5,Q4 = [ (q0 - p0)<<2 ] + (p1 - q1)
    vcge.u8       q12, q12, q8          @Q12= ( ABS(q1 - q0) >= Beta )
    vcgt.s16      d12, d12, #0          @D12 = (us_Bs > 0)
    vqrshrn.s16   d8, q4, #3
    vqrshrn.s16   d9, q5, #3            @Q4 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
    vadd.i8       d14, d14, d30         @D14 = C = C0+1
    vcge.u8       q13, q13, q8          @Q13= ( ABS(p1 - p0) >= Beta )
    vorr          q9, q9, q12           @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
    vabs.s8       q3, q4                @Q3 = ABS (i_macro)
    vmov.i8       d15, d14              @same C for both halves of Q7
    vmov.i8       d13, d12              @same Bs mask for both halves of Q6
    vorr          q9, q9, q13           @Q9 |= ( ABS(p1 - p0) >= Beta )
    vmin.u8       q7, q3, q7            @Q7 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
    vbic          q6, q6, q9            @final condition
    vcge.s8       q4, q4, #0            @Q4 = (i_macro >= 0)
    vand          q7, q7, q6            @Making delta zero in places where values should not be filtered
    vqadd.u8      q8, q2, q7            @Q8 = p0 + delta
    vqsub.u8      q2, q2, q7            @Q2 = p0 - delta
    vqadd.u8      q9, q0, q7            @Q9 = q0 + delta
    vqsub.u8      q0, q0, q7            @Q0 = q0 - delta
    vbif          q8, q2, q4            @Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
    vbif          q0, q9, q4            @Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
    vst2.8        {d16, d17}, [r6], r1  @store p0 row
    vst2.8        {d0, d1}, [r6]        @store q0 row
    vpop          {d8 - d15}
    ldmfd         sp!, {r4-r6, pc}



@**
@*******************************************************************************
@*
@* @brief
@*     Performs filtering of a chroma block vertical edge for cases where the
@*     boundary strength is less than 4
@*
@* @par Description:
@*    This operation is described in  Sec. 8.7.2.3 under the title
@*    "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@*  Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@*  Source stride
@*
@* @param[in] r2 - alpha
@*  Alpha Value for the boundary
@*
@* @param[in] r3 - beta
@*  Beta Value for the boundary
@*
@* @param[in] sp(0) - u4_bs
@*  Packed Boundary strength array
@*
@* @param[in] sp(4) - pu1_cliptab
@*  tc0_table
@*
@* @returns
@*  None
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@ Register roles (as used below):
@   r11 = u4_bs, r10 = pu1_cliptab, r12 = saved start pointer used for the stores
@   q0 = p1, q1 = p0, q2 = q0, q3 = q1 for 8 rows (one 16-bit lane per row)
@   q4 = filter-enable mask, q6 = tC, q7 = |delta|, q9 = sign mask
@   d8-d15 are overwritten (q4-q7), hence the vpush/vpop pair.

    .global ih264_deblk_chroma_vert_bslt4_bp_a9

ih264_deblk_chroma_vert_bslt4_bp_a9:

    stmfd         sp!, {r10-r12, r14}

    sub           r0, r0, #4            @point r0 to p1u of row0.
    ldr           r11, [sp, #16]        @r11 = ui_Bs

    ldr           r10, [sp, #20]        @r10 = puc_ClipTab
    mov           r12, r0               @keep a back up of r0 for buffer write
    vpush         {d8 - d15}

    @ One row per load: four 16-bit elements go to lane n of the four registers.
    vld4.16       {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
    vld4.16       {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
    vld4.16       {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
    vld4.16       {d0[3], d2[3], d4[3], d6[3]}, [r0], r1

    vld4.16       {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
    vld4.16       {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
    vld4.16       {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
    vld4.16       {d1[3], d3[3], d5[3], d7[3]}, [r0], r1

    vdup.8        q11, r2               @Q11 = alpha
    vabd.u8       q4, q1, q2            @|p0-q0|
    vdup.8        q12, r3               @Q12 = beta
    vabd.u8       q5, q3, q2            @|q1-q0|
    vabd.u8       q6, q0, q1            @|p1-p0|
    vclt.u8       q4, q4, q11           @|p0-q0| < alpha ?
    vsubl.u8      q7, d0, d6
    vclt.u8       q5, q5, q12           @|q1-q0| < beta ?
    vsubl.u8      q8, d1, d7            @(p1 - q1)
    vclt.u8       q6, q6, q12           @|p1-p0| < beta ?
    vsubl.u8      q9, d4, d2
    vand.u8       q4, q4, q5            @|p0-q0| < alpha && |q1-q0| < beta
    vsubl.u8      q10, d5, d3           @(q0 - p0)
    vmov.u16      q14, #4
    vld1.32       {d24[0]}, [r10]       @Load ClipTable
    rev           r11, r11              @Blocking strengths
    vand.u8       q4, q4, q6            @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta

    vmov.32       d10[0], r11

    vmla.s16      q7, q9, q14
    vmla.s16      q8, q10, q14          @4*(q0 - p0) + (p1 - q1)

    @ Spread each of the four Bs bytes over four consecutive bytes of Q5.
    vmovl.u8      q5, d10
    vsli.u16      d10, d10, #8
    vmovl.u16     q5, d10
    vsli.u32      q5, q5, #16
    vtbl.8        d12, {d24}, d10
    vtbl.8        d13, {d24}, d11       @tC0
    vmov.u8       q12, #1
    vadd.u8       q6, q6, q12           @tC0 + 1
    vcge.u8       q5, q5, q12           @u4_bS > 0 ?
    vand.u8       q4, q4, q5            @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0

    @ Q0 - Q3(inputs),
    @ Q4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
    @ Q6 (tC)

    vrshr.s16     q7, q7, #3
    vrshr.s16     q8, q8, #3            @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)

    vcgt.s16      q9, q7, #0
    vcgt.s16      q10, q8, #0
    vmovn.i16     d18, q9
    vmovn.i16     d19, q10              @Q9 = sign(delta)
    vabs.s16      q7, q7
    vabs.s16      q8, q8
    vmovn.u16     d14, q7
    vmovn.u16     d15, q8
    vmin.u8       q7, q7, q6            @Q7 = |delta|

    vqadd.u8      q10, q1, q7           @p0+|delta|
    vqadd.u8      q11, q2, q7           @q0+|delta|
    vqsub.u8      q12, q1, q7           @p0-|delta|
    vqsub.u8      q13, q2, q7           @q0-|delta|

    vbit          q12, q10, q9          @p0 + delta
    vbit          q11, q13, q9          @q0 - delta

    vbit          q1, q12, q4           @replace p0 where the mask is set
    vbit          q2, q11, q4           @replace q0 where the mask is set

    vst4.16       {d0[0], d2[0], d4[0], d6[0]}, [r12], r1
    vst4.16       {d0[1], d2[1], d4[1], d6[1]}, [r12], r1
    vst4.16       {d0[2], d2[2], d4[2], d6[2]}, [r12], r1
    vst4.16       {d0[3], d2[3], d4[3], d6[3]}, [r12], r1

    vst4.16       {d1[0], d3[0], d5[0], d7[0]}, [r12], r1
    vst4.16       {d1[1], d3[1], d5[1], d7[1]}, [r12], r1
    vst4.16       {d1[2], d3[2], d5[2], d7[2]}, [r12], r1
    vst4.16       {d1[3], d3[3], d5[3], d7[3]}, [r12], r1
    vpop          {d8 - d15}
    ldmfd         sp!, {r10-r12, pc}



@**
@*******************************************************************************
@*
@* @brief
@*     Performs filtering of a chroma block vertical edge when the
@*     boundary strength is set to 4 on calling twice
@*
@* @par Description:
@*    This operation is described in  Sec. 8.7.2.4 under the title
@*    "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@*  Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@*  Source stride
@*
@* @param[in] r2 - alpha
@*  Alpha Value for the boundary
@*
@* @param[in] r3 - beta
@*  Beta Value for the boundary
@*
@* @returns
@*  None
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@ Register roles (as used below):
@   r0 = running row pointer, r12 = saved start pointer used for the stores
@   d0 = p1, d1 = p0, d2 = q0, d3 = q1 for 4 rows (one 16-bit lane per row)
@   d11 = alpha, d12 = beta, d4 = filter-enable mask
@   d9, d11 and d12 lie in the d8-d15 range saved by vpush/vpop.

    .global ih264_deblk_chroma_vert_bs4_mbaff_bp_a9

ih264_deblk_chroma_vert_bs4_mbaff_bp_a9:

    stmfd         sp!, {r12, r14}
    vpush         {d8 - d15}
    sub           r0, r0, #4            @point r0 to p1u of row0.
    mov           r12, r0               @keep a back up of r0 for buffer write

    @ One row per load: four 16-bit elements go to lane n of d0..d3.
    vld4.16       {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
    vld4.16       {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
    vld4.16       {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
    vld4.16       {d0[3], d1[3], d2[3], d3[3]}, [r0], r1

    vdup.8        d11, r2               @D11 = alpha
    vdup.8        d12, r3               @D12 = beta
    vmov.i8       d31, #2

    vabd.u8       d4, d1, d2            @|p0-q0|
    vabd.u8       d5, d3, d2            @|q1-q0|
    vabd.u8       d6, d0, d1            @|p1-p0|
    vaddl.u8      q14, d1, d3           @(p0 + q1)
    vclt.u8       d4, d4, d11           @|p0-q0| < alpha ?
    vclt.u8       d5, d5, d12           @|q1-q0| < beta ?
    vclt.u8       d6, d6, d12           @|p1-p0| < beta ?
    vmlal.u8      q14, d0, d31          @2*p1 + (p0 + q1)
    vaddl.u8      q13, d0, d2           @(p1 + q0)
    vand.u8       d4, d4, d5            @|p0-q0| < alpha && |q1-q0| < beta
    vmlal.u8      q13, d3, d31          @2*q1 + (p1 + q0)

    vrshrn.i16    d7, q14, #2           @(2*p1 + (p0 + q1) + 2) >> 2
    vand.u8       d4, d4, d6            @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
    vrshrn.i16    d9, q13, #2           @(2*q1 + (p1 + q0) + 2) >> 2

    vbit          d1, d7, d4            @replace p0 where the mask is set
    vbit          d2, d9, d4            @replace q0 where the mask is set

    vst4.16       {d0[0], d1[0], d2[0], d3[0]}, [r12], r1
    vst4.16       {d0[1], d1[1], d2[1], d3[1]}, [r12], r1
    vst4.16       {d0[2], d1[2], d2[2], d3[2]}, [r12], r1
    vst4.16       {d0[3], d1[3], d2[3], d3[3]}, [r12], r1
    vpop          {d8 - d15}
    ldmfd         sp!, {r12, pc}



@**
@*******************************************************************************
@*
@* @brief
@*     Performs filtering of a chroma block vertical edge for cases where the
@*     boundary strength is less than 4 on calling twice
@*
@* @par Description:
@*    This operation is described in  Sec.
@*    8.7.2.3 under the title
@*    "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@*  Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@*  Source stride
@*
@* @param[in] r2 - alpha
@*  Alpha Value for the boundary
@*
@* @param[in] r3 - beta
@*  Beta Value for the boundary
@*
@* @param[in] sp(0) - u4_bs
@*  Packed Boundary strength array
@*
@* @param[in] sp(4) - pu1_cliptab
@*  tc0_table
@*
@* @returns
@*  None
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@ Register roles (as used below):
@   r11 = u4_bs, r10 = pu1_cliptab, r12 = saved start pointer used for the stores
@   d0 = p1, d1 = p0, d2 = q0, d3 = q1 for 4 rows (one 16-bit lane per row)
@   d4 = filter-enable mask, d6 = tC, d7 = |delta|, d9 = sign mask
@   d9-d13 lie in the d8-d15 range saved by vpush/vpop.

    .global ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9

ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9:

    stmfd         sp!, {r10-r12, r14}

    sub           r0, r0, #4            @point r0 to p1u of row0.
    ldr           r11, [sp, #16]        @r11 = ui_Bs

    ldr           r10, [sp, #20]        @r10 = puc_ClipTab
    mov           r12, r0               @keep a back up of r0 for buffer write
    vpush         {d8 - d15}

    @ One row per load: four 16-bit elements go to lane n of d0..d3.
    vld4.16       {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
    vld4.16       {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
    vld4.16       {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
    vld4.16       {d0[3], d1[3], d2[3], d3[3]}, [r0], r1

    vdup.8        d11, r2               @D11 = alpha
    vabd.u8       d4, d1, d2            @|p0-q0|
    vdup.8        d12, r3               @D12 = beta
    vabd.u8       d5, d3, d2            @|q1-q0|
    vabd.u8       d6, d0, d1            @|p1-p0|
    vclt.u8       d4, d4, d11           @|p0-q0| < alpha ?
    vclt.u8       d5, d5, d12           @|q1-q0| < beta ?
    vsubl.u8      q14, d0, d3           @(p1 - q1)
    vclt.u8       d6, d6, d12           @|p1-p0| < beta ?
    vand.u8       d4, d4, d5            @|p0-q0| < alpha && |q1-q0| < beta
    vsubl.u8      q12, d2, d1           @(q0 - p0)
    vmov.u16      q10, #4

    vld1.32       {d31[0]}, [r10]       @Load ClipTable
    rev           r11, r11              @Blocking strengths
    vand.u8       d4, d4, d6            @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
    vmov.32       d22[0], r11
    vmla.s16      q14, q12, q10         @4*(q0 - p0) + (p1 - q1)

    @ Spread each of the four Bs bytes over two consecutive bytes of D22.
    vmovl.u8      q11, d22
    vsli.u16      d22, d22, #8
    vtbl.8        d6, {d31}, d22        @tC0
    vmov.u8       d12, #1
    vadd.u8       d6, d6, d12           @tC0 + 1
    vcge.u8       d5, d22, d12          @u4_bS > 0 ?
    vand.u8       d4, d4, d5            @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0

    @ D0 - D3(inputs),
    @ D4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
    @ D6 (tC)

    vrshr.s16     q14, q14, #3          @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)

    vcgt.s16      q13, q14, #0
    vmovn.i16     d9, q13               @D9 = sign(delta)
    vabs.s16      q14, q14
    vmovn.u16     d7, q14
    vmin.u8       d7, d7, d6            @D7 = |delta|

    vqadd.u8      d10, d1, d7           @p0+|delta|
    vqadd.u8      d11, d2, d7           @q0+|delta|
    vqsub.u8      d12, d1, d7           @p0-|delta|
    vqsub.u8      d13, d2, d7           @q0-|delta|

    vbit          d12, d10, d9          @p0 + delta
    vbit          d11, d13, d9          @q0 - delta

    vbit          d1, d12, d4           @replace p0 where the mask is set
    vbit          d2, d11, d4           @replace q0 where the mask is set

    vst4.16       {d0[0], d1[0], d2[0], d3[0]}, [r12], r1
    vst4.16       {d0[1], d1[1], d2[1], d3[1]}, [r12], r1
    vst4.16       {d0[2], d1[2], d2[2], d3[2]}, [r12], r1
    vst4.16       {d0[3], d1[3], d2[3], d3[3]}, [r12], r1
    vpop          {d8 - d15}
    ldmfd         sp!, {r10-r12, pc}



@**
@*******************************************************************************
@*
@* @brief
@*     Performs filtering of a chroma block horizontal edge when the
@*     boundary strength is set to 4 in high profile
@*
@* @par Description:
@*    This operation is described in  Sec. 8.7.2.4 under the title
@*    "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@*  Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@*  Source stride
@*
@* @param[in] r2 - alpha_cb
@*  Alpha Value for the boundary in U
@*
@* @param[in] r3 - beta_cb
@*  Beta Value for the boundary in U
@*
@* @param[in] sp(0) - alpha_cr
@*  Alpha Value for the boundary in V
@*
@* @param[in] sp(4) - beta_cr
@*  Beta Value for the boundary in V
@*
@* @returns
@*  None
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@ Register roles (as used below):
@   r5 = alpha_cr, r6 = beta_cr, r4 = saved pointer to p0 row
@   d6/d7 = p1 U/V, d4/d5 = p0 U/V, d0/d1 = q0 U/V, d2/d3 = q1 U/V
@   q10 = {alpha_cb, alpha_cr}, q8 = {beta_cb, beta_cr}, q9 = "do not filter" mask
@   d8-d15 are overwritten (q4, q5, q7), hence the vpush/vpop pair.

    .global ih264_deblk_chroma_horz_bs4_a9

ih264_deblk_chroma_horz_bs4_a9:

    stmfd         sp!, {r4-r6, lr}
    ldr           r5, [sp, #16]         @R5 = alpha_cr
    ldr           r6, [sp, #20]         @R6 = beta_cr
    vpush         {d8 - d15}
    sub           r0, r0, r1, lsl #1    @R0 = uc_edgePixel pointing to p1 of chroma
    vld2.8        {d6, d7}, [r0], r1    @D6 = p1u , D7 = p1v
    mov           r4, r0                @Keeping a backup of the pointer p0 of chroma
    vld2.8        {d4, d5}, [r0], r1    @D4 = p0u , D5 = p0v
    vdup.8        d20, r2               @D20 contains alpha_cb
    vdup.8        d21, r5               @D21 contains alpha_cr
    vld2.8        {d0, d1}, [r0], r1    @D0 = q0u , D1 = q0v
    vaddl.u8      q4, d6, d0
    vaddl.u8      q5, d7, d1            @Q4,Q5 = q0 + p1
    vmov.i8       d31, #2
    vld2.8        {d2, d3}, [r0]        @D2 = q1u , D3 = q1v
    vabd.u8       q13, q3, q2           @Q13 = ABS(p1 - p0)
    vmlal.u8      q4, d2, d31
    vmlal.u8      q5, d3, d31           @Q5,Q4 = (X2(q1) + q0 + p1)
    vabd.u8       q11, q2, q0           @Q11 = ABS(p0 - q0)
    vabd.u8       q12, q1, q0           @Q12 = ABS(q1 - q0)
    vaddl.u8      q7, d4, d2
    vaddl.u8      q14, d5, d3           @Q14,Q7 = p0 + q1
    vdup.8        d16, r3               @D16 contains beta_cb
    vdup.8        d17, r6               @D17 contains beta_cr
    vmlal.u8      q7, d6, d31
    vmlal.u8      q14, d7, d31          @Q14,Q7 = (X2(p1) + p0 + q1)
    vcge.u8       q9, q11, q10          @Q9 = ( ABS(p0 - q0) >= Alpha )
    vcge.u8       q12, q12, q8          @Q12= ( ABS(q1 - q0) >= Beta )
    vcge.u8       q13, q13, q8          @Q13= ( ABS(p1 - p0) >= Beta )
    vrshrn.u16    d8, q4, #2
    vrshrn.u16    d9, q5, #2            @Q4 = (X2(q1) + q0 + p1 + 2) >> 2  (new q0)
    vorr          q9, q9, q12           @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
    vrshrn.u16    d10, q7, #2
    vrshrn.u16    d11, q14, #2          @Q5 = (X2(p1) + p0 + q1 + 2) >> 2  (new p0)
    vorr          q9, q9, q13           @Q9 |= ( ABS(p1 - p0) >= Beta )
    vbit          q5, q2, q9            @keep original p0 where the mask is set
    vbit          q4, q0, q9            @keep original q0 where the mask is set
    vst2.8        {d10, d11}, [r4], r1  @store p0 row (U/V re-interleaved)
    vst2.8        {d8, d9}, [r4]        @store q0 row
    vpop          {d8 - d15}
    ldmfd         sp!, {r4-r6, pc}



@**
@*******************************************************************************
@*
@* @brief
@*     Performs filtering of a chroma block vertical edge when the
@*     boundary strength is set to 4 in high profile
@*
@* @par Description:
@*    This operation is described in  Sec. 8.7.2.4 under the title
@*    "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@*  Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@*  Source stride
@*
@* @param[in] r2 - alpha_cb
@*  Alpha Value for the boundary in U
@*
@* @param[in] r3 - beta_cb
@*  Beta Value for the boundary in U
@*
@* @param[in] sp(0) - alpha_cr
@*  Alpha Value for the boundary in V
@*
@* @param[in] sp(4) - beta_cr
@*  Beta Value for the boundary in V
@*
@* @returns
@*  None
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@ Register roles (as used below):
@   r4 = alpha_cr, r5 = beta_cr; r2/r3 are repacked as 16-bit (cr << 8) + cb
@   so that vdup.16 places the cb value in even bytes and the cr value in odd bytes
@   q0 = p1, q1 = p0, q2 = q0, q3 = q1 for 8 rows (one 16-bit lane per row)
@   q11 = alpha, q12 = beta, q4 = filter-enable mask
@   NOTE(review): the repacking assumes alpha_cb/beta_cb fit in 8 bits - confirm
@   against the caller.

    .global ih264_deblk_chroma_vert_bs4_a9

ih264_deblk_chroma_vert_bs4_a9:

    stmfd         sp!, {r4, r5, r12, r14}

    sub           r0, r0, #4            @point r0 to p1u of row0.
    mov           r12, r0               @keep a back up of r0 for buffer write

    ldr           r4, [sp, #16]         @r4 = alpha_cr
    ldr           r5, [sp, #20]         @r5 = beta_cr
    add           r2, r2, r4, lsl #8    @r2 = (alpha_cr,alpha_cb)
    add           r3, r3, r5, lsl #8    @r3 = (beta_cr,beta_cb)
    vpush         {d8 - d15}

    @ One row per load: four 16-bit elements go to lane n of the four registers.
    vld4.16       {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
    vld4.16       {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
    vld4.16       {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
    vld4.16       {d0[3], d2[3], d4[3], d6[3]}, [r0], r1

    vld4.16       {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
    vld4.16       {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
    vld4.16       {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
    vld4.16       {d1[3], d3[3], d5[3], d7[3]}, [r0], r1

    vdup.16       q11, r2               @Q11 = alpha
    vdup.16       q12, r3               @Q12 = beta
    vmov.i8       d31, #2

    vabd.u8       q4, q1, q2            @|p0-q0|
    vabd.u8       q5, q3, q2            @|q1-q0|
    vabd.u8       q6, q0, q1            @|p1-p0|
    vaddl.u8      q7, d2, d6
    vaddl.u8      q8, d3, d7            @(p0 + q1)
    vclt.u8       q4, q4, q11           @|p0-q0| < alpha ?
    vclt.u8       q5, q5, q12           @|q1-q0| < beta ?
    vclt.u8       q6, q6, q12           @|p1-p0| < beta ?
    vmlal.u8      q7, d0, d31
    vmlal.u8      q8, d1, d31           @2*p1 + (p0 + q1)
    vaddl.u8      q9, d0, d4
    vaddl.u8      q10, d1, d5           @(p1 + q0)
    vand.u8       q4, q4, q5            @|p0-q0| < alpha && |q1-q0| < beta
    vmlal.u8      q9, d6, d31
    vmlal.u8      q10, d7, d31          @2*q1 + (p1 + q0)

    vrshrn.i16    d14, q7, #2
    vrshrn.i16    d15, q8, #2           @(2*p1 + (p0 + q1) + 2) >> 2
    vand.u8       q4, q4, q6            @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
    vrshrn.i16    d18, q9, #2
    vrshrn.i16    d19, q10, #2          @(2*q1 + (p1 + q0) + 2) >> 2

    vbit          q1, q7, q4            @replace p0 where the mask is set
    vbit          q2, q9, q4            @replace q0 where the mask is set

    vst4.16       {d0[0], d2[0], d4[0], d6[0]}, [r12], r1
    vst4.16       {d0[1], d2[1], d4[1], d6[1]}, [r12], r1
    vst4.16       {d0[2], d2[2], d4[2], d6[2]}, [r12], r1
    vst4.16       {d0[3], d2[3], d4[3], d6[3]}, [r12], r1

    vst4.16       {d1[0], d3[0], d5[0], d7[0]}, [r12], r1
    vst4.16       {d1[1], d3[1], d5[1], d7[1]}, [r12], r1
    vst4.16       {d1[2], d3[2], d5[2], d7[2]}, [r12], r1
    vst4.16       {d1[3], d3[3], d5[3], d7[3]}, [r12], r1
    vpop          {d8 - d15}
    ldmfd         sp!, {r4, r5, r12, pc}



@**
@*******************************************************************************
@*
@* @brief
@*     Performs filtering of a chroma block horizontal edge for
@*     cases where the
@*     boundary strength is less than 4 in high profile
@*
@* @par Description:
@*    This operation is described in  Sec. 8.7.2.3 under the title
@*    "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@*  Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@*  Source stride
@*
@* @param[in] r2 - alpha_cb
@*  Alpha Value for the boundary in U
@*
@* @param[in] r3 - beta_cb
@*  Beta Value for the boundary in U
@*
@* @param[in] sp(0) - alpha_cr
@*  Alpha Value for the boundary in V
@*
@* @param[in] sp(4) - beta_cr
@*  Beta Value for the boundary in V
@*
@* @param[in] sp(8) - u4_bs
@*  Packed Boundary strength array
@*
@* @param[in] sp(12) - pu1_cliptab_cb
@*  tc0_table for U
@*
@* @param[in] sp(16) - pu1_cliptab_cr
@*  tc0_table for V
@*
@* @returns
@*  None
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@ Register roles (as used below):
@   r4 = alpha_cr, r5 = beta_cr, r7 = u4_bs (byte-reversed before use),
@   r8 = pu1_cliptab_cb, r9 = pu1_cliptab_cr, r6 = saved p0 row pointer
@   q3 = p1, q2 = p0, q0 = q0, q1 = q1 (U in the even D register, V in the odd one)
@   q6 = filter-enable mask, d14/d15 = clip value C for U/V, q4 = i_macro / sign mask
@   Seven core registers (28 bytes) are pushed, so the stack arguments start at
@   [sp, #28] until vpush moves sp again.
@   NOTE(review): with an odd register count the second ldrd address is only
@   word aligned if sp was 8-byte aligned on entry - confirm this is acceptable
@   for every targeted core.

    .global ih264_deblk_chroma_horz_bslt4_a9

ih264_deblk_chroma_horz_bslt4_a9:

    stmfd         sp!, {r4-r9, lr}
    ldrd          r4, r5, [sp, #28]     @R4 = alpha_cr , R5 = beta_cr
    ldr           r7, [sp, #36]         @R7 = u4_bs
    ldrd          r8, r9, [sp, #40]     @R8 = pu1_cliptab_cb , R9 = pu1_cliptab_cr
    sub           r0, r0, r1, lsl #1    @R0 = uc_edgePixelU pointing to p1 of chroma U
    vpush         {d8 - d15}
    rev           r7, r7                @byte-reverse the packed strengths
    vmov.32       d12[0], r7            @D12[0] = ui_Bs

    vld1.32       d16[0], [r8]          @D16[0] contains cliptab_cb
    vld1.32       d17[0], [r9]          @D17[0] contains cliptab_cr
    vld2.8        {d6, d7}, [r0], r1    @Q3=p1
    vtbl.8        d14, {d16}, d12       @Retrieving cliptab values for U
    vtbl.8        d28, {d17}, d12       @Retrieving cliptab values for V
    vmovl.u8      q6, d12               @Q6 = uc_Bs in each 16 bit scalar
    mov           r6, r0                @Keeping a backup of the pointer to chroma U P0
    vld2.8        {d4, d5}, [r0], r1    @Q2=p0
    vmov.i8       d30, #1
    vdup.8        d20, r2               @D20 contains alpha_cb
    vdup.8        d21, r4               @D21 contains alpha_cr
    vld2.8        {d0, d1}, [r0], r1    @Q0=q0
    vmovl.u8      q7, d14
    vmovl.u8      q14, d28
    vmov.i16      d15, d28              @D14 has cliptab values for U, D15 for V
    vld2.8        {d2, d3}, [r0]        @Q1=q1
    vsubl.u8      q5, d1, d5
    vsubl.u8      q4, d0, d4            @Q5,Q4 = (q0 - p0)
    vabd.u8       q13, q3, q2           @Q13 = ABS(p1 - p0)
    vshl.i16      q5, q5, #2            @Q5 = (q0 - p0)<<2
    vabd.u8       q11, q2, q0           @Q11 = ABS(p0 - q0)
    vshl.i16      q4, q4, #2            @Q4 = (q0 - p0)<<2
    vsli.16       q7, q7, #8            @copy each clip byte into both bytes of its 16-bit lane
    vabd.u8       q12, q1, q0           @Q12 = ABS(q1 - q0)
    vcge.u8       q9, q11, q10          @Q9 = ( ABS(p0 - q0) >= Alpha )
    vsubl.u8      q10, d6, d2           @Q10 = (p1 - q1)L
    vsubl.u8      q3, d7, d3            @Q3 = (p1 - q1)H
    vdup.8        d16, r3               @D16 contains beta_cb
    vdup.8        d17, r5               @D17 contains beta_cr
    vadd.i16      q4, q4, q10
    vadd.i16      q5, q5, q3            @Q5,Q4 = [ (q0 - p0)<<2 ] + (p1 - q1)
    vcge.u8       q12, q12, q8          @Q12= ( ABS(q1 - q0) >= Beta )
    vcgt.s16      d12, d12, #0          @D12 = (us_Bs > 0)
    vqrshrn.s16   d8, q4, #3
    vqrshrn.s16   d9, q5, #3            @Q4 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
    vadd.i8       d14, d14, d30         @D14 = C = C0+1 for U
    vcge.u8       q13, q13, q8          @Q13= ( ABS(p1 - p0) >= Beta )
    vorr          q9, q9, q12           @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
    vabs.s8       q3, q4                @Q3 = ABS (i_macro)
    vadd.i8       d15, d15, d30         @D15 = C = C0+1 for V
    vmov.i8       d13, d12              @same Bs mask for both halves of Q6
    vorr          q9, q9, q13           @Q9 |= ( ABS(p1 - p0) >= Beta )
    vmin.u8       q7, q3, q7            @Q7 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
    vbic          q6, q6, q9            @final condition
    vcge.s8       q4, q4, #0            @Q4 = (i_macro >= 0)
    vand          q7, q7, q6            @Making delta zero in places where values should not be filtered
    vqadd.u8      q8, q2, q7            @Q8 = p0 + delta
    vqsub.u8      q2, q2, q7            @Q2 = p0 - delta
    vqadd.u8      q9, q0, q7            @Q9 = q0 + delta
    vqsub.u8      q0, q0, q7            @Q0 = q0 - delta
    vbif          q8, q2, q4            @Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
    vbif          q0, q9, q4            @Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
    vst2.8        {d16, d17}, [r6], r1  @store p0 row
    vst2.8        {d0, d1}, [r6]        @store q0 row
    vpop          {d8 - d15}
    ldmfd         sp!, {r4-r9, pc}



@**
@*******************************************************************************
@*
@* @brief
@*     Performs filtering of a chroma block vertical edge for cases where the
@*     boundary strength is less than 4 in high profile
@*
@* @par Description:
@*    This operation is described in  Sec.
@*    8.7.2.3 under the title
@*    "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@*  Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@*  Source stride
@*
@* @param[in] r2 - alpha_cb
@*  Alpha Value for the boundary in U
@*
@* @param[in] r3 - beta_cb
@*  Beta Value for the boundary in U
@*
@* @param[in] sp(0) - alpha_cr
@*  Alpha Value for the boundary in V
@*
@* @param[in] sp(4) - beta_cr
@*  Beta Value for the boundary in V
@*
@* @param[in] sp(8) - u4_bs
@*  Packed Boundary strength array
@*
@* @param[in] sp(12) - pu1_cliptab_cb
@*  tc0_table for U
@*
@* @param[in] sp(16) - pu1_cliptab_cr
@*  tc0_table for V
@*
@* @returns
@*  None
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@ Register roles (as used below):
@   r4 = alpha_cr, r5 = beta_cr; r2/r3 are repacked as 16-bit (cr << 8) + cb
@   r6 = u4_bs, r10 = pu1_cliptab_cb, r11 = pu1_cliptab_cr,
@   r12 = saved start pointer used for the stores
@   q0 = p1, q1 = p0, q2 = q0, q3 = q1 for 8 rows (one 16-bit lane per row)
@   q4 = filter-enable mask, q6 = tC, q7 = |delta|, q9 = sign mask
@   Eight core registers (32 bytes) are pushed, so the stack arguments start at
@   [sp, #32] until vpush moves sp again.
@   NOTE(review): [sp, #44] is only word aligned if sp was 8-byte aligned on
@   entry - confirm ldrd at that address is acceptable for every targeted core.

    .global ih264_deblk_chroma_vert_bslt4_a9

ih264_deblk_chroma_vert_bslt4_a9:

    stmfd         sp!, {r4-r7, r10-r12, r14}

    sub           r0, r0, #4            @point r0 to p1u of row0.
    ldrd          r4, r5, [sp, #32]     @R4 = alpha_cr , R5 = beta_cr
    add           r2, r2, r4, lsl #8    @r2 = (alpha_cr,alpha_cb)
    add           r3, r3, r5, lsl #8    @r3 = (beta_cr,beta_cb)
    ldr           r6, [sp, #40]         @R6 = u4_bs
    ldrd          r10, r11, [sp, #44]   @R10 = pu1_cliptab_cb , R11 = pu1_cliptab_cr
    vpush         {d8 - d15}
    mov           r12, r0               @keep a back up of R0 for buffer write

    @ One row per load: four 16-bit elements go to lane n of the four registers.
    vld4.16       {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
    vld4.16       {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
    vld4.16       {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
    vld4.16       {d0[3], d2[3], d4[3], d6[3]}, [r0], r1

    vld4.16       {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
    vld4.16       {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
    vld4.16       {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
    vld4.16       {d1[3], d3[3], d5[3], d7[3]}, [r0], r1

    vdup.16       q11, r2               @Q11 = alpha
    vabd.u8       q4, q1, q2            @|p0-q0|
    vdup.16       q12, r3               @Q12 = beta
    vabd.u8       q5, q3, q2            @|q1-q0|
    vabd.u8       q6, q0, q1            @|p1-p0|
    vclt.u8       q4, q4, q11           @|p0-q0| < alpha ?
    vsubl.u8      q7, d0, d6
    vclt.u8       q5, q5, q12           @|q1-q0| < beta ?
    vsubl.u8      q8, d1, d7            @(p1 - q1)
    vclt.u8       q6, q6, q12           @|p1-p0| < beta ?
    vsubl.u8      q9, d4, d2
    vand.u8       q4, q4, q5            @|p0-q0| < alpha && |q1-q0| < beta
    vsubl.u8      q10, d5, d3           @(q0 - p0)
    vmov.u16      q14, #4
    vld1.32       {d24[0]}, [r10]       @Load ClipTable for U
    vld1.32       {d25[0]}, [r11]       @Load ClipTable for V
    rev           r6, r6                @Blocking strengths
    vand.u8       q4, q4, q6            @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta

    vmov.32       d10[0], r6

    vmla.s16      q7, q9, q14
    vmla.s16      q8, q10, q14          @4*(q0 - p0) + (p1 - q1)

    vmovl.u8      q5, d10
    vsli.u16      d10, d10, #8          @each Bs byte now fills both bytes of its 16-bit lane
    vtbl.8        d12, {d24}, d10       @tC0 for U
    vtbl.8        d13, {d25}, d10       @tC0 for V
    vzip.8        d12, d13              @interleave U and V tC0 bytes across Q6
    vmovl.u16     q5, d10
    vsli.u32      q5, q5, #16           @each Bs byte now fills four consecutive bytes of Q5
    vmov.u8       q12, #1
    vadd.u8       q6, q6, q12           @tC0 + 1
    vcge.u8       q5, q5, q12           @u4_bS > 0 ?
    vand.u8       q4, q4, q5            @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0

    @ Q0 - Q3(inputs),
    @ Q4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
    @ Q6 (tC)

    vrshr.s16     q7, q7, #3
    vrshr.s16     q8, q8, #3            @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)

    vcgt.s16      q9, q7, #0
    vcgt.s16      q10, q8, #0
    vmovn.i16     d18, q9
    vmovn.i16     d19, q10              @Q9 = sign(delta)
    vabs.s16      q7, q7
    vabs.s16      q8, q8
    vmovn.u16     d14, q7
    vmovn.u16     d15, q8
    vmin.u8       q7, q7, q6            @Q7 = |delta|

    vqadd.u8      q10, q1, q7           @p0+|delta|
    vqadd.u8      q11, q2, q7           @q0+|delta|
    vqsub.u8      q12, q1, q7           @p0-|delta|
    vqsub.u8      q13, q2, q7           @q0-|delta|

    vbit          q12, q10, q9          @p0 + delta
    vbit          q11, q13, q9          @q0 - delta

    vbit          q1, q12, q4           @replace p0 where the mask is set
    vbit          q2, q11, q4           @replace q0 where the mask is set

    vst4.16       {d0[0], d2[0], d4[0], d6[0]}, [r12], r1
    vst4.16       {d0[1], d2[1], d4[1], d6[1]}, [r12], r1
    vst4.16       {d0[2], d2[2], d4[2], d6[2]}, [r12], r1
    vst4.16       {d0[3], d2[3], d4[3], d6[3]}, [r12], r1

    vst4.16       {d1[0], d3[0], d5[0], d7[0]}, [r12], r1
    vst4.16       {d1[1], d3[1], d5[1], d7[1]}, [r12], r1
    vst4.16       {d1[2], d3[2], d5[2], d7[2]}, [r12], r1
    vst4.16       {d1[3], d3[3], d5[3], d7[3]}, [r12], r1
    vpop          {d8 - d15}
    ldmfd         sp!, {r4-r7, r10-r12, pc}



@**
@*******************************************************************************
@*
@* @brief
@*     Performs filtering of a chroma block vertical edge when the
@*     boundary strength is set to 4 on calling twice in high profile
@*
@* @par Description:
@*    This operation is described in  Sec. 8.7.2.4 under the title
@*    "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@*  Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@*  Source stride
@*
@* @param[in] r2 - alpha_cb
@*  Alpha Value for the boundary in U
@*
@* @param[in] r3 - beta_cb
@*  Beta Value for the boundary in U
@*
@* @param[in] sp(0) - alpha_cr
@*  Alpha Value for the boundary in V
@*
@* @param[in] sp(4) - beta_cr
@*  Beta Value for the boundary in V
@*
@* @returns
@*  None
@*
@* @remarks
@*  Four rows are processed per call. For every row, four 16-bit elements
@*  (p1, p0, q0, q1) are read starting 4 bytes before pu1_src and written
@*  back to the same place; only p0 and q0 are ever modified.
@*  Within each 16-bit element the low byte is tested against the cb
@*  thresholds and the high byte against the cr thresholds.
@*
@*******************************************************************************
@*

    .global ih264_deblk_chroma_vert_bs4_mbaff_a9

ih264_deblk_chroma_vert_bs4_mbaff_a9:

    @ 4 core registers (16 bytes) are pushed, so the first stack argument
    @ is found at [sp, #16] below.
    stmfd         sp!, {r4, r5, r12, r14}

    sub           r0, r0, #4            @point r0 to p1u of row0.
    mov           r12, r0               @keep a back up of r0 for buffer write

    ldrd          r4, r5, [sp, #16]     @R4 = alpha_cr , R5 = beta_cr
    @ Pack the cb threshold into the low byte and the cr threshold into the
    @ next byte of one halfword (assumes each value fits in 8 bits - TODO
    @ confirm against caller); vdup.16 below replicates that pair.
    add           r2, r2, r4, lsl #8    @R2 = alpha_cb + (alpha_cr << 8)
    add           r3, r3, r5, lsl #8    @R3 = beta_cb + (beta_cr << 8)
    vpush         {d8 - d15}            @d9, d11 and d12 are overwritten below

    @ Each load fills lane n of d0..d3 with the four 16-bit elements of row n
    @ and advances r0 by the stride: D0 = p1, D1 = p0, D2 = q0, D3 = q1.
    vld4.16       {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
    vld4.16       {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
    vld4.16       {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
    vld4.16       {d0[3], d1[3], d2[3], d3[3]}, [r0], r1

    vdup.16       d11, r2               @D11 = alpha
    vdup.16       d12, r3               @D12 = beta
    vmov.i8       d31, #2               @D31 = 2, multiplier for the 2*p1 / 2*q1 terms

    vabd.u8       d4, d1, d2            @|p0-q0|
    vabd.u8       d5, d3, d2            @|q1-q0|
    vabd.u8       d6, d0, d1            @|p1-p0|
    vaddl.u8      q14, d1, d3           @(p0 + q1)
    vclt.u8       d4, d4, d11           @|p0-q0| < alpha ?
    vclt.u8       d5, d5, d12           @|q1-q0| < beta ?
    vclt.u8       d6, d6, d12           @|p1-p0| < beta ?
    vmlal.u8      q14, d0, d31          @2*p1 + (p0 + q1)
    vaddl.u8      q13, d0, d2           @(p1 + q0)
    vand.u8       d4, d4, d5            @|p0-q0| < alpha && |q1-q0| < beta
    vmlal.u8      q13, d3, d31          @2*q1 + (p1 + q0)

    vrshrn.i16    d7, q14, #2           @(2*p1 + (p0 + q1) + 2) >> 2
    vand.u8       d4, d4, d6            @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
    vrshrn.i16    d9, q13, #2           @(2*q1 + (p1 + q0) + 2) >> 2

    vbit          d1, d7, d4            @replace p0 by the filtered value where D4 is set
    vbit          d2, d9, d4            @replace q0 by the filtered value where D4 is set

    @ Write the four rows back through the saved row-0 pointer; D0 (p1) and
    @ D3 (q1) are stored exactly as they were loaded.
    vst4.16       {d0[0], d1[0], d2[0], d3[0]}, [r12], r1
    vst4.16       {d0[1], d1[1], d2[1], d3[1]}, [r12], r1
    vst4.16       {d0[2], d1[2], d2[2], d3[2]}, [r12], r1
    vst4.16       {d0[3], d1[3], d2[3], d3[3]}, [r12], r1
    vpop          {d8 - d15}
    ldmfd         sp!, {r4, r5, r12, pc} @restore and return (saved r14 is popped into pc)



@**
@*******************************************************************************
@*
@* @brief
@*     Performs filtering of a chroma block vertical edge for cases where the
@*     boundary strength is less than 4 on calling twice in high profile
@*
@* @par Description:
@*    This operation is described in  Sec. 8.7.2.3 under the title
@*    "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@*  Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@*  Source stride
@*
@* @param[in] r2 - alpha_cb
@*  Alpha Value for the boundary in U
@*
@* @param[in] r3 - beta_cb
@*  Beta Value for the boundary in U
@*
@* @param[in] sp(0) - alpha_cr
@*  Alpha Value for the boundary in V
@*
@* @param[in] sp(4) - beta_cr
@*  Beta Value for the boundary in V
@*
@* @param[in] sp(8) - u4_bs
@*  Packed Boundary strength array
@*  (one byte per row; the most significant byte is applied to the first row)
@*
@* @param[in] sp(12) - pu1_cliptab_cb
@*  tc0_table for U (only the first 4 bytes are read)
@*
@* @param[in] sp(16) - pu1_cliptab_cr
@*  tc0_table for V (only the first 4 bytes are read)
@*
@* @returns
@*  None
@*
@* @remarks
@*  Four rows are processed per call. For every row, four 16-bit elements
@*  (p1, p0, q0, q1) are read starting 4 bytes before pu1_src and written
@*  back to the same place; only p0 and q0 are ever modified.
@*
@*******************************************************************************
@*

    .global ih264_deblk_chroma_vert_bslt4_mbaff_a9

ih264_deblk_chroma_vert_bslt4_mbaff_a9:

    @ 7 core registers (28 bytes) are pushed, so the stack arguments are
    @ found at [sp, #28] onwards below.
    stmfd         sp!, {r4-r6, r10-r12, r14}

    sub           r0, r0, #4            @point r0 to p1u of row0.
    mov           r12, r0               @keep a back up of r0 for buffer write

    ldrd          r4, r5, [sp, #28]     @R4 = alpha_cr , R5 = beta_cr
    @ Pack the cb threshold into the low byte and the cr threshold into the
    @ next byte of one halfword (assumes each value fits in 8 bits - TODO
    @ confirm against caller); vdup.16 below replicates that pair.
    add           r2, r2, r4, lsl #8    @R2 = alpha_cb + (alpha_cr << 8)
    add           r3, r3, r5, lsl #8    @R3 = beta_cb + (beta_cr << 8)
    ldr           r6, [sp, #36]         @R6 = u4_bs
    ldrd          r10, r11, [sp, #40]   @R10 = pu1_cliptab_cb , R11 = pu1_cliptab_cr
    vpush         {d8 - d15}            @d9 - d13 are overwritten below

    @ Each load fills lane n of d0..d3 with the four 16-bit elements of row n
    @ and advances r0 by the stride: D0 = p1, D1 = p0, D2 = q0, D3 = q1.
    vld4.16       {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
    vld4.16       {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
    vld4.16       {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
    vld4.16       {d0[3], d1[3], d2[3], d3[3]}, [r0], r1

    vdup.16       d11, r2               @D11 = alpha
    vabd.u8       d4, d1, d2            @|p0-q0|
    vdup.16       d12, r3               @D12 = beta
    vabd.u8       d5, d3, d2            @|q1-q0|
    vabd.u8       d6, d0, d1            @|p1-p0|
    vclt.u8       d4, d4, d11           @|p0-q0| < alpha ?
    vclt.u8       d5, d5, d12           @|q1-q0| < beta ?
    vsubl.u8      q14, d0, d3           @(p1 - q1)
    vclt.u8       d6, d6, d12           @|p1-p0| < beta ?
    vand.u8       d4, d4, d5            @|p0-q0| < alpha && |q1-q0| < beta
    vsubl.u8      q12, d2, d1           @(q0 - p0)
    vmov.u16      q10, #4               @Q10 = 4, multiplier for (q0 - p0)

    @ Build one 8-byte lookup table in D31:
    @ bytes 0..3 = pu1_cliptab_cr[0..3], bytes 4..7 = pu1_cliptab_cb[0..3].
    vld1.32       {d31[1]}, [r10]       @Load ClipTable for U
    vld1.32       {d31[0]}, [r11]       @Load ClipTable for V
    rev           r6, r6                @Blocking strengths (byte-reversed: MSB of u4_bs lands in lane 0)
    vand.u8       d4, d4, d6            @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
    vmov.32       d22[0], r6
    vmla.s16      q14, q12, q10         @4*(q0 - p0) + (p1 - q1)
    vmovl.u8      q11, d22              @one bS per 16-bit lane (lane n belongs to row n)
    vsli.u16      d22, d22, #8          @copy bS into both bytes of each lane
    vmov.u16      d13, #4               @per lane: low byte = 4, high byte = 0
    vadd.u8       d22, d22, d13         @low byte index = bS + 4 (cb half of D31), high byte index = bS (cr half)
    vtbl.8        d6, {d31}, d22        @tC0
    vmov.u8       d12, #1               @D12 is reused here; beta is no longer needed
    vsub.u8       d22, d22, d13         @undo the index bias: D22 = bS in every byte again
    vadd.u8       d6, d6, d12           @tC0 + 1
    vcge.u8       d5, d22, d12          @u4_bS > 0 ?
    vand.u8       d4, d4, d5            @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0

    @ D0 - D3(inputs),
    @ D4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
    @ D6 (tC)

    vrshr.s16     q14, q14, #3          @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)

    vcgt.s16      q13, q14, #0          @all ones in lanes where delta > 0
    vmovn.i16     d9, q13               @D9 = sign(delta)
    vabs.s16      q14, q14
    vmovn.u16     d7, q14               @|delta| narrowed to 8 bits (at most (4*255 + 255 + 4) >> 3 = 159)
    vmin.u8       d7, d7, d6            @D7 = min(|delta|, tC)

    @ D10 - D13 are plain temporaries from here on (alpha and the constants
    @ held in D11 - D13 have been consumed above).
    vqadd.u8      d10, d1, d7           @p0+|delta|
    vqadd.u8      d11, d2, d7           @q0+|delta|
    vqsub.u8      d12, d1, d7           @p0-|delta|
    vqsub.u8      d13, d2, d7           @q0-|delta|

    vbit          d12, d10, d9          @p0 + delta
    vbit          d11, d13, d9          @q0 - delta

    vbit          d1, d12, d4           @replace p0 by the filtered value where D4 is set
    vbit          d2, d11, d4           @replace q0 by the filtered value where D4 is set

    @ Write the four rows back through the saved row-0 pointer; D0 (p1) and
    @ D3 (q1) are stored exactly as they were loaded.
    vst4.16       {d0[0], d1[0], d2[0], d3[0]}, [r12], r1
    vst4.16       {d0[1], d1[1], d2[1], d3[1]}, [r12], r1
    vst4.16       {d0[2], d1[2], d2[2], d3[2]}, [r12], r1
    vst4.16       {d0[3], d1[3], d2[3], d3[3]}, [r12], r1
    vpop          {d8 - d15}
    ldmfd         sp!, {r4-r6, r10-r12, pc} @restore and return (saved r14 is popped into pc)