diff options
author | Hamsalekha S <hamsalekha.s@ittiam.com> | 2015-03-13 21:24:58 +0530 |
---|---|---|
committer | Hamsalekha S <hamsalekha.s@ittiam.com> | 2015-04-02 15:59:02 +0530 |
commit | 8d3d303c7942ced6a987a52db8977d768dc3605f (patch) | |
tree | cc806c96794356996b13ba9970941d0aed74a97e /common | |
parent | 3956d913d37327dcb340f836e604b04bd478b158 (diff) | |
download | android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.tar.gz android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.tar.bz2 android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.zip |
Initial version
Change-Id: I7efe9a589cd24edf86e8d086b40c27cbbf8b4017
Diffstat (limited to 'common')
120 files changed, 76864 insertions, 0 deletions
diff --git a/common/arm/ih264_arm_memory_barrier.s b/common/arm/ih264_arm_memory_barrier.s new file mode 100755 index 0000000..523218f --- /dev/null +++ b/common/arm/ih264_arm_memory_barrier.s @@ -0,0 +1,77 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@******************************************************************************* +@* @file +@* ih264_arm_memory_barrier.s +@* +@* @brief +@* Contains function definitions for data synchronization. 
+@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* +@* +@* @remarks +@* None +@* +@******************************************************************************* + +.text +.p2align 2 + + +@***************************************************************************** +@* +@* Function Name : ih264_arm_dsb +@* Description : Adds DSB +@* Revision History : +@* DD MM YYYY Author(s) Changes +@* 03 07 2008 100355 First version +@* +@***************************************************************************** + + .global ih264_arm_dsb +ih264_arm_dsb: + dsb + bx lr + + + +@***************************************************************************** +@* +@* Function Name : ih264_arm_dmb +@* Description : Adds DMB +@* Revision History : +@* DD MM YYYY Author(s) Changes +@* 03 07 2008 100355 First version +@* +@***************************************************************************** + + .global ih264_arm_dmb + +ih264_arm_dmb: + dmb + bx lr + + + diff --git a/common/arm/ih264_deblk_chroma_a9.s b/common/arm/ih264_deblk_chroma_a9.s new file mode 100755 index 0000000..66102a7 --- /dev/null +++ b/common/arm/ih264_deblk_chroma_a9.s @@ -0,0 +1,1337 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. 
+@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/*****************************************************************************/ +@/* */ +@/* File Name : ih264_deblk_chroma_a9.s */ +@/* */ +@/* Description : Contains function definitions for deblocking luma */ +@/* edge. Functions are coded in NEON assembly and can */ +@/* be compiled using ARM RVDS. */ +@/* */ +@/* List of Functions : ih264_deblk_chroma_vert_bs4_bp_a9() */ +@/* ih264_deblk_chroma_vert_bslt4_bp_a9() */ +@/* ih264_deblk_chroma_horz_bs4_bp_a9() */ +@/* ih264_deblk_chroma_horz_bslt4_bp_a9() */ +@/* ih264_deblk_chroma_vert_bs4_mbaff_bp_a9() */ +@/* ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9() */ +@/* ih264_deblk_chroma_vert_bs4_a9() */ +@/* ih264_deblk_chroma_vert_bslt4_a9() */ +@/* ih264_deblk_chroma_horz_bs4_a9() */ +@/* ih264_deblk_chroma_horz_bslt4_a9() */ +@/* ih264_deblk_chroma_vert_bs4_mbaff_a9() */ +@/* ih264_deblk_chroma_vert_bslt4_mbaff_a9() */ +@/* */ +@/* Issues / Problems : None */ +@/* */ +@/* Revision History : */ +@/* */ +@/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +@/* 28 11 2013 Ittiam Draft */ +@/* 05 01 2015 Kaushik Added double-call functions for */ +@/* Senthoor vertical deblocking, and high */ +@/* profile functions. */ +@/* */ +@/*****************************************************************************/ + + +.text +.p2align 2 + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block horizontal edge when the +@* boundary strength is set to 4 +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. 
+@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_horz_bs4_bp_a9 + +ih264_deblk_chroma_horz_bs4_bp_a9: + + stmfd sp!, {r4, lr} @ + vpush {d8 - d15} + sub r0, r0, r1, lsl #1 @R0 = uc_edgePixel pointing to p1 of chroma + vld2.8 {d6, d7}, [r0], r1 @D6 = p1u , D7 = p1v + mov r4, r0 @Keeping a backup of the pointer p0 of chroma + vld2.8 {d4, d5}, [r0], r1 @D4 = p0u , D5 = p0v + vdup.8 q10, r2 @Q10 contains alpha + vld2.8 {d0, d1}, [r0], r1 @D0 = q0u , D1 = q0v + vaddl.u8 q4, d6, d0 @ + vaddl.u8 q5, d7, d1 @Q4,Q5 = q0 + p1 + vmov.i8 d31, #2 @ + vld2.8 {d2, d3}, [r0] @D2 = q1u , D3 = q1v + vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0) + vmlal.u8 q4, d2, d31 @ + vmlal.u8 q5, d3, d31 @Q5,Q4 = (X2(q1U) + q0U + p1U) + vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0) + vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0) + vaddl.u8 q7, d4, d2 @ + vaddl.u8 q14, d5, d3 @Q14,Q7 = P0 + Q1 + vdup.8 q8, r3 @Q8 contains beta + vmlal.u8 q7, d6, d31 @ + vmlal.u8 q14, d7, d31 @Q14,Q7 = (X2(p1U) + p0U + q1U) + vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha ) + vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta ) + vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta ) + vrshrn.u16 d8, q4, #2 @ + vrshrn.u16 d9, q5, #2 @Q4 = (X2(q1U) + q0U + p1U + 2) >> 2 + vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) + vrshrn.u16 d10, q7, #2 @ + vrshrn.u16 d11, q14, #2 @Q5 = (X2(p1U) + p0U + q1U + 2) >> 2 + vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta ) + vbit q5, q2, q9 @ + vbit q4, q0, q9 @ + vst2.8 {d10, d11}, [r4], r1 @ + vst2.8 {d8, d9}, [r4] @ + vpop {d8 - d15} + ldmfd sp!, {r4, pc} @ 
+ + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block vertical edge when the +@* boundary strength is set to 4 +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_vert_bs4_bp_a9 + +ih264_deblk_chroma_vert_bs4_bp_a9: + + stmfd sp!, {r12, r14} + vpush {d8 - d15} + sub r0, r0, #4 @point r0 to p1u of row0. + mov r12, r0 @keep a back up of r0 for buffer write + + vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 + vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1 + vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1 + vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1 + + vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 + vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1 + vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1 + vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1 + + vdup.8 q11, r2 @Q4 = alpha + vdup.8 q12, r3 @Q5 = beta + vmov.i8 d31, #2 + + vabd.u8 q4, q1, q2 @|p0-q0| + vabd.u8 q5, q3, q2 @|q1-q0| + vabd.u8 q6, q0, q1 @|p1-p0| + vaddl.u8 q7, d2, d6 + vaddl.u8 q8, d3, d7 @(p0 + q1) + vclt.u8 q4, q4, q11 @|p0-q0| < alpha ? + vclt.u8 q5, q5, q12 @|q1-q0| < beta ? + vclt.u8 q6, q6, q12 @|p1-p0| < beta ? 
+ vmlal.u8 q7, d0, d31 + vmlal.u8 q8, d1, d31 @2*p1 + (p0 + q1) + vaddl.u8 q9, d0, d4 + vaddl.u8 q10, d1, d5 @(p1 + q0) + vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta + vmlal.u8 q9, d6, d31 + vmlal.u8 q10, d7, d31 @2*q1 + (p1 + q0) + + vrshrn.i16 d14, q7, #2 + vrshrn.i16 d15, q8, #2 @(2*p1 + (p0 + q1) + 2) >> 2 + vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta + vrshrn.i16 d18, q9, #2 + vrshrn.i16 d19, q10, #2 @(2*q1 + (p1 + q0) + 2) >> 2 + + vbit q1, q7, q4 + vbit q2, q9, q4 + + vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1 + vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1 + vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1 + vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1 + + vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1 + vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1 + vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1 + vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1 + vpop {d8 - d15} + ldmfd sp!, {r12, pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block horizontal edge for cases where the +@* boundary strength is less than 4 +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.3 under the title +@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264. 
+@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @param[in] sp(0) - u4_bs +@* Packed Boundary strength array +@* +@* @param[in] sp(4) - pu1_cliptab +@* tc0_table +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_horz_bslt4_bp_a9 + +ih264_deblk_chroma_horz_bslt4_bp_a9: + + stmfd sp!, {r4-r6, lr} @ + + ldrd r4, r5, [sp, #0x10] @r4 = u4_bs , r5 = pu1_cliptab + vpush {d8 - d15} + sub r0, r0, r1, lsl #1 @R0 = uc_edgePixelU pointing to p2 of chroma U + rev r4, r4 @ + vmov.32 d12[0], r4 @d12[0] = ui_Bs + vld1.32 d16[0], [r5] @D16[0] contains cliptab + vld2.8 {d6, d7}, [r0], r1 @Q3=p1 + vtbl.8 d14, {d16}, d12 @ + vmovl.u8 q6, d12 @q6 = uc_Bs in each 16 bit scalar + mov r6, r0 @Keeping a backup of the pointer to chroma U P0 + vld2.8 {d4, d5}, [r0], r1 @Q2=p0 + vmov.i8 d30, #1 @ + vdup.8 q10, r2 @Q10 contains alpha + vld2.8 {d0, d1}, [r0], r1 @Q0=q0 + vmovl.u8 q7, d14 @ + vld2.8 {d2, d3}, [r0] @Q1=q1 + vsubl.u8 q5, d1, d5 @ + vsubl.u8 q4, d0, d4 @Q5,Q4 = (q0 - p0) + vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0) + vshl.i16 q5, q5, #2 @Q5 = (q0 - p0)<<2 + vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0) + vshl.i16 q4, q4, #2 @Q4 = (q0 - p0)<<2 + vsli.16 q7, q7, #8 @ + vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0) + vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha ) + vsubl.u8 q10, d6, d2 @Q10 = (p1 - q1)L + vsubl.u8 q3, d7, d3 @Q3 = (p1 - q1)H + vdup.8 q8, r3 @Q8 contains beta + vadd.i16 q4, q4, q10 @ + vadd.i16 q5, q5, q3 @Q5,Q4 = [ (q0 - p0)<<2 ] + (p1 - q1) + vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta ) + vcgt.s16 d12, d12, #0 @Q6 = (us_Bs > 0) + vqrshrn.s16 d8, q4, #3 @ + vqrshrn.s16 d9, q5, #3 @Q4 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3 + vadd.i8 d14, d14, 
d30 @Q7 = C = C0+1 + vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta ) + vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) + vabs.s8 q3, q4 @Q4 = ABS (i_macro) + vmov.i8 d15, d14 @ + vmov.i8 d13, d12 @ + vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta ) + vmin.u8 q7, q3, q7 @Q7 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro) + vbic q6, q6, q9 @final condition + vcge.s8 q4, q4, #0 @Q4 = (i_macro >= 0) + vand q7, q7, q6 @Making delta zero in places where values shouldn be filterd + vqadd.u8 q8, q2, q7 @Q8 = p0 + delta + vqsub.u8 q2, q2, q7 @Q2 = p0 - delta + vqadd.u8 q9, q0, q7 @Q9 = q0 + delta + vqsub.u8 q0, q0, q7 @Q0 = q0 - delta + vbif q8, q2, q4 @Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta) + vbif q0, q9, q4 @Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta) + vst2.8 {d16, d17}, [r6], r1 @ + vst2.8 {d0, d1}, [r6] @ + vpop {d8 - d15} + ldmfd sp!, {r4-r6, pc} @ + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block vertical edge for cases where the +@* boundary strength is less than 4 +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. 
+@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @param[in] sp(0) - u4_bs +@* Packed Boundary strength array +@* +@* @param[in] sp(4) - pu1_cliptab +@* tc0_table +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_vert_bslt4_bp_a9 + +ih264_deblk_chroma_vert_bslt4_bp_a9: + + stmfd sp!, {r10-r12, r14} + + sub r0, r0, #4 @point r0 to p1u of row0. + ldr r11, [sp, #16] @r12 = ui_Bs + + ldr r10, [sp, #20] @r14 = puc_ClipTab + mov r12, r0 @keep a back up of r0 for buffer write + vpush {d8 - d15} + vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 + vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1 + vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1 + vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1 + + vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 + vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1 + vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1 + vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1 + + + vdup.8 q11, r2 @Q4 = alpha + vabd.u8 q4, q1, q2 @|p0-q0| + vdup.8 q12, r3 @Q5 = beta + vabd.u8 q5, q3, q2 @|q1-q0| + vabd.u8 q6, q0, q1 @|p1-p0| + vclt.u8 q4, q4, q11 @|p0-q0| < alpha ? + vsubl.u8 q7, d0, d6 + vclt.u8 q5, q5, q12 @|q1-q0| < beta ? + vsubl.u8 q8, d1, d7 @(p1 - q1) + vclt.u8 q6, q6, q12 @|p1-p0| < beta ? 
+ vsubl.u8 q9, d4, d2 + vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta + vsubl.u8 q10, d5, d3 @(q0 - p0) + vmov.u16 q14, #4 + vld1.32 {d24[0]}, [r10] @Load ClipTable + rev r11, r11 @Blocking strengths + vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta + + vmov.32 d10[0], r11 + + vmla.s16 q7, q9, q14 + vmla.s16 q8, q10, q14 @4*(q0 - p0) + (p1 - q1) + + vmovl.u8 q5, d10 + + + vsli.u16 d10, d10, #8 + vmovl.u16 q5, d10 + vsli.u32 q5, q5, #16 + vtbl.8 d12, {d24}, d10 + vtbl.8 d13, {d24}, d11 @tC0 + vmov.u8 q12, #1 + vadd.u8 q6, q6, q12 @tC0 + 1 + vcge.u8 q5, q5, q12 @u4_bS > 0 ? + vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0 + + @ Q0 - Q3(inputs), + @ Q4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0), + @ Q6 (tC) + + vrshr.s16 q7, q7, #3 + vrshr.s16 q8, q8, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) + + vcgt.s16 q9, q7, #0 + vcgt.s16 q10, q8, #0 + vmovn.i16 d18, q9 + vmovn.i16 d19, q10 @Q9 = sign(delta) + vabs.s16 q7, q7 + vabs.s16 q8, q8 + vmovn.u16 d14, q7 + vmovn.u16 d15, q8 + vmin.u8 q7, q7, q6 @Q7 = |delta| + + vqadd.u8 q10, q1, q7 @p0+|delta| + vqadd.u8 q11, q2, q7 @q0+|delta| + vqsub.u8 q12, q1, q7 @p0-|delta| + vqsub.u8 q13, q2, q7 @q0-|delta| + + vbit q12, q10, q9 @p0 + delta + vbit q11, q13, q9 @q0 - delta + + vbit q1, q12, q4 + vbit q2, q11, q4 + + vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1 + vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1 + vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1 + vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1 + + vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1 + vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1 + vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1 + vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1 + vpop {d8 - d15} + ldmfd sp!, {r10-r12, pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block vertical edge when the +@* boundary 
strength is set to 4 on calling twice +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_vert_bs4_mbaff_bp_a9 + +ih264_deblk_chroma_vert_bs4_mbaff_bp_a9: + + stmfd sp!, {r12, r14} + vpush {d8 - d15} + sub r0, r0, #4 @point r0 to p1u of row0. + mov r12, r0 @keep a back up of r0 for buffer write + + vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1 + vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1 + vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1 + vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1 + + vdup.8 d11, r2 @D11 = alpha + vdup.8 d12, r3 @D12 = beta + vmov.i8 d31, #2 + + vabd.u8 d4, d1, d2 @|p0-q0| + vabd.u8 d5, d3, d2 @|q1-q0| + vabd.u8 d6, d0, d1 @|p1-p0| + vaddl.u8 q14, d1, d3 @(p0 + q1) + vclt.u8 d4, d4, d11 @|p0-q0| < alpha ? + vclt.u8 d5, d5, d12 @|q1-q0| < beta ? + vclt.u8 d6, d6, d12 @|p1-p0| < beta ? 
+ vmlal.u8 q14, d0, d31 @2*p1 + (p0 + q1) + vaddl.u8 q13, d0, d2 @(p1 + q0) + vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta + vmlal.u8 q13, d3, d31 @2*q1 + (p1 + q0) + + vrshrn.i16 d7, q14, #2 @(2*p1 + (p0 + q1) + 2) >> 2 + vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta + vrshrn.i16 d9, q13, #2 @(2*q1 + (p1 + q0) + 2) >> 2 + + vbit d1, d7, d4 + vbit d2, d9, d4 + + vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1 + vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1 + vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1 + vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1 + vpop {d8 - d15} + ldmfd sp!, {r12, pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block vertical edge for cases where the +@* boundary strength is less than 4 on calling twice +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @param[in] sp(0) - u4_bs +@* Packed Boundary strength array +@* +@* @param[in] sp(4) - pu1_cliptab +@* tc0_table +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9 + +ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9: + + stmfd sp!, {r10-r12, r14} + + sub r0, r0, #4 @point r0 to p1u of row0. 
+ ldr r11, [sp, #16] @r11 = ui_Bs + + ldr r10, [sp, #20] @r10 = puc_ClipTab + mov r12, r0 @keep a back up of r0 for buffer write + vpush {d8 - d15} + vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1 + vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1 + vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1 + vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1 + + vdup.8 d11, r2 @D11 = alpha + vabd.u8 d4, d1, d2 @|p0-q0| + vdup.8 d12, r3 @D12 = beta + vabd.u8 d5, d3, d2 @|q1-q0| + vabd.u8 d6, d0, d1 @|p1-p0| + vclt.u8 d4, d4, d11 @|p0-q0| < alpha ? + vclt.u8 d5, d5, d12 @|q1-q0| < beta ? + vsubl.u8 q14, d0, d3 @(p1 - q1) + vclt.u8 d6, d6, d12 @|p1-p0| < beta ? + vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta + vsubl.u8 q12, d2, d1 @(q0 - p0) + vmov.u16 q10, #4 + + vld1.32 {d31[0]}, [r10] @Load ClipTable + rev r11, r11 @Blocking strengths + vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta + vmov.32 d22[0], r11 + vmla.s16 q14, q12, q10 @4*(q0 - p0) + (p1 - q1) + vmovl.u8 q11, d22 + vsli.u16 d22, d22, #8 + vtbl.8 d6, {d31}, d22 @tC0 + vmov.u8 d12, #1 + vadd.u8 d6, d6, d12 @tC0 + 1 + vcge.u8 d5, d22, d12 @u4_bS > 0 ? 
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0 + + @ D0 - D3(inputs), + @ D4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0), + @ D6 (tC) + + vrshr.s16 q14, q14, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) + + vcgt.s16 q13, q14, #0 + vmovn.i16 d9, q13 @D9 = sign(delta) + vabs.s16 q14, q14 + vmovn.u16 d7, q14 + vmin.u8 d7, d7, d6 @D7 = |delta| + + vqadd.u8 d10, d1, d7 @p0+|delta| + vqadd.u8 d11, d2, d7 @q0+|delta| + vqsub.u8 d12, d1, d7 @p0-|delta| + vqsub.u8 d13, d2, d7 @q0-|delta| + + vbit d12, d10, d9 @p0 + delta + vbit d11, d13, d9 @q0 - delta + + vbit d1, d12, d4 + vbit d2, d11, d4 + + vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1 + vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1 + vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1 + vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1 + vpop {d8 - d15} + ldmfd sp!, {r10-r12, pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block horizontal edge when the +@* boundary strength is set to 4 in high profile +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. 
+@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha_cb +@* Alpha Value for the boundary in U +@* +@* @param[in] r3 - beta_cb +@* Beta Value for the boundary in U +@* +@* @param[in] sp(0) - alpha_cr +@* Alpha Value for the boundary in V +@* +@* @param[in] sp(4) - beta_cr +@* Beta Value for the boundary in V +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_horz_bs4_a9 + +ih264_deblk_chroma_horz_bs4_a9: + + stmfd sp!, {r4-r6, lr} @ + + ldr r5, [sp, #16] @R5 = alpha_cr + ldr r6, [sp, #20] @R6 = beta_cr + vpush {d8 - d15} + sub r0, r0, r1, lsl #1 @R0 = uc_edgePixel pointing to p1 of chroma + vld2.8 {d6, d7}, [r0], r1 @D6 = p1u , D7 = p1v + mov r4, r0 @Keeping a backup of the pointer p0 of chroma + vld2.8 {d4, d5}, [r0], r1 @D4 = p0u , D5 = p0v + vdup.8 d20, r2 @D20 contains alpha_cb + vdup.8 d21, r5 @D21 contains alpha_cr + vld2.8 {d0, d1}, [r0], r1 @D0 = q0u , D1 = q0v + vaddl.u8 q4, d6, d0 @ + vaddl.u8 q5, d7, d1 @Q4,Q5 = q0 + p1 + vmov.i8 d31, #2 @ + vld2.8 {d2, d3}, [r0] @D2 = q1u , D3 = q1v + vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0) + vmlal.u8 q4, d2, d31 @ + vmlal.u8 q5, d3, d31 @Q5,Q4 = (X2(q1U) + q0U + p1U) + vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0) + vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0) + vaddl.u8 q7, d4, d2 @ + vaddl.u8 q14, d5, d3 @Q14,Q7 = P0 + Q1 + vdup.8 d16, r3 @D16 contains beta_cb + vdup.8 d17, r6 @D17 contains beta_cr + vmlal.u8 q7, d6, d31 @ + vmlal.u8 q14, d7, d31 @Q14,Q7 = (X2(p1U) + p0U + q1U) + vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha ) + vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta ) + vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta ) + vrshrn.u16 d8, q4, #2 @ + vrshrn.u16 d9, q5, #2 @Q4 = (X2(q1U) + q0U + p1U + 2) >> 2 + vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) + vrshrn.u16 d10, 
q7, #2 @ + vrshrn.u16 d11, q14, #2 @Q5 = (X2(p1U) + p0U + q1U + 2) >> 2 + vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta ) + vbit q5, q2, q9 @ + vbit q4, q0, q9 @ + vst2.8 {d10, d11}, [r4], r1 @ + vst2.8 {d8, d9}, [r4] @ + vpop {d8 - d15} + ldmfd sp!, {r4-r6, pc} @ + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block vertical edge when the +@* boundary strength is set to 4 in high profile +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha_cb +@* Alpha Value for the boundary in U +@* +@* @param[in] r3 - beta_cb +@* Beta Value for the boundary in U +@* +@* @param[in] sp(0) - alpha_cr +@* Alpha Value for the boundary in V +@* +@* @param[in] sp(4) - beta_cr +@* Beta Value for the boundary in V +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_vert_bs4_a9 + +ih264_deblk_chroma_vert_bs4_a9: + + stmfd sp!, {r4, r5, r12, r14} + + sub r0, r0, #4 @point r0 to p1u of row0. 
+ mov r12, r0 @keep a back up of r0 for buffer write + + ldr r4, [sp, #16] @r4 = alpha_cr + ldr r5, [sp, #20] @r5 = beta_cr + add r2, r2, r4, lsl #8 @r2 = (alpha_cr,alpha_cb) + add r3, r3, r5, lsl #8 @r3 = (beta_cr,beta_cb) + vpush {d8 - d15} + vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 + vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1 + vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1 + vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1 + + vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 + vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1 + vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1 + vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1 + + vdup.16 q11, r2 @Q11 = alpha + vdup.16 q12, r3 @Q12 = beta + vmov.i8 d31, #2 + + vabd.u8 q4, q1, q2 @|p0-q0| + vabd.u8 q5, q3, q2 @|q1-q0| + vabd.u8 q6, q0, q1 @|p1-p0| + vaddl.u8 q7, d2, d6 + vaddl.u8 q8, d3, d7 @(p0 + q1) + vclt.u8 q4, q4, q11 @|p0-q0| < alpha ? + vclt.u8 q5, q5, q12 @|q1-q0| < beta ? + vclt.u8 q6, q6, q12 @|p1-p0| < beta ? + vmlal.u8 q7, d0, d31 + vmlal.u8 q8, d1, d31 @2*p1 + (p0 + q1) + vaddl.u8 q9, d0, d4 + vaddl.u8 q10, d1, d5 @(p1 + q0) + vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta + vmlal.u8 q9, d6, d31 + vmlal.u8 q10, d7, d31 @2*q1 + (p1 + q0) + + vrshrn.i16 d14, q7, #2 + vrshrn.i16 d15, q8, #2 @(2*p1 + (p0 + q1) + 2) >> 2 + vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta + vrshrn.i16 d18, q9, #2 + vrshrn.i16 d19, q10, #2 @(2*q1 + (p1 + q0) + 2) >> 2 + + vbit q1, q7, q4 + vbit q2, q9, q4 + + vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1 + vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1 + vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1 + vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1 + + vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1 + vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1 + vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1 + vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1 + vpop {d8 - d15} + ldmfd sp!, {r4, r5, r12, pc} + + + +@/** 
+@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block horizontal edge for cases where the +@* boundary strength is less than 4 in high profile +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha_cb +@* Alpha Value for the boundary in U +@* +@* @param[in] r3 - beta_cb +@* Beta Value for the boundary in U +@* +@* @param[in] sp(0) - alpha_cr +@* Alpha Value for the boundary in V +@* +@* @param[in] sp(4) - beta_cr +@* Beta Value for the boundary in V +@* +@* @param[in] sp(8) - u4_bs +@* Packed Boundary strength array +@* +@* @param[in] sp(12) - pu1_cliptab_cb +@* tc0_table for U +@* +@* @param[in] sp(16) - pu1_cliptab_cr +@* tc0_table for V +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_horz_bslt4_a9 + +ih264_deblk_chroma_horz_bslt4_a9: + + stmfd sp!, {r4-r9, lr} @ + + ldrd r4, r5, [sp, #28] @R4 = alpha_cr , R5 = beta_cr + ldr r7, [sp, #36] @R7 = u4_bs + ldrd r8, r9, [sp, #40] @R8 = pu1_cliptab_cb , R9 = pu1_cliptab_cr + sub r0, r0, r1, lsl #1 @R0 = uc_edgePixelU pointing to p1 of chroma U + vpush {d8 - d15} + rev r7, r7 @ + vmov.32 d12[0], r7 @D12[0] = ui_Bs + + vld1.32 d16[0], [r8] @D16[0] contains cliptab_cb + vld1.32 d17[0], [r9] @D17[0] contains cliptab_cr + vld2.8 {d6, d7}, [r0], r1 @Q3=p1 + vtbl.8 d14, {d16}, d12 @Retreiving cliptab values for U + vtbl.8 d28, {d17}, d12 @Retrieving cliptab values for V + vmovl.u8 q6, d12 @Q6 = uc_Bs in each 16 bit scalar + mov r6, r0 @Keeping a backup of the pointer to chroma U P0 + vld2.8 {d4, d5}, [r0], r1 @Q2=p0 + vmov.i8 d30, #1 @ + vdup.8 d20, r2 @D20 contains 
alpha_cb + vdup.8 d21, r4 @D21 contains alpha_cr + vld2.8 {d0, d1}, [r0], r1 @Q0=q0 + vmovl.u8 q7, d14 @ + vmovl.u8 q14, d28 @ + vmov.i16 d15, d28 @D14 has cliptab values for U, D15 for V + vld2.8 {d2, d3}, [r0] @Q1=q1 + vsubl.u8 q5, d1, d5 @ + vsubl.u8 q4, d0, d4 @Q5,Q4 = (q0 - p0) + vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0) + vshl.i16 q5, q5, #2 @Q5 = (q0 - p0)<<2 + vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0) + vshl.i16 q4, q4, #2 @Q4 = (q0 - p0)<<2 + vsli.16 q7, q7, #8 @ + vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0) + vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha ) + vsubl.u8 q10, d6, d2 @Q10 = (p1 - q1)L + vsubl.u8 q3, d7, d3 @Q3 = (p1 - q1)H + vdup.8 d16, r3 @Q8 contains beta_cb + vdup.8 d17, r5 @Q8 contains beta_cr + vadd.i16 q4, q4, q10 @ + vadd.i16 q5, q5, q3 @Q5,Q4 = [ (q0 - p0)<<2 ] + (p1 - q1) + vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta ) + vcgt.s16 d12, d12, #0 @Q6 = (us_Bs > 0) + vqrshrn.s16 d8, q4, #3 @ + vqrshrn.s16 d9, q5, #3 @Q4 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3 + vadd.i8 d14, d14, d30 @D14 = C = C0+1 for U + vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta ) + vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) + vabs.s8 q3, q4 @Q4 = ABS (i_macro) + vadd.i8 d15, d15, d30 @D15 = C = C0+1 for V + vmov.i8 d13, d12 @ + vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta ) + vmin.u8 q7, q3, q7 @Q7 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro) + vbic q6, q6, q9 @final condition + vcge.s8 q4, q4, #0 @Q4 = (i_macro >= 0) + vand q7, q7, q6 @Making delta zero in places where values shouldn be filterd + vqadd.u8 q8, q2, q7 @Q8 = p0 + delta + vqsub.u8 q2, q2, q7 @Q2 = p0 - delta + vqadd.u8 q9, q0, q7 @Q9 = q0 + delta + vqsub.u8 q0, q0, q7 @Q0 = q0 - delta + vbif q8, q2, q4 @Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta) + vbif q0, q9, q4 @Q0 = (i_macro >= 0 ) ? 
(q0-delta) : (q0+delta) + vst2.8 {d16, d17}, [r6], r1 @ + vst2.8 {d0, d1}, [r6] @ + vpop {d8 - d15} + ldmfd sp!, {r4-r9, pc} @ + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block vertical edge for cases where the +@* boundary strength is less than 4 in high profile +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha_cb +@* Alpha Value for the boundary in U +@* +@* @param[in] r3 - beta_cb +@* Beta Value for the boundary in U +@* +@* @param[in] sp(0) - alpha_cr +@* Alpha Value for the boundary in V +@* +@* @param[in] sp(4) - beta_cr +@* Beta Value for the boundary in V +@* +@* @param[in] sp(8) - u4_bs +@* Packed Boundary strength array +@* +@* @param[in] sp(12) - pu1_cliptab_cb +@* tc0_table for U +@* +@* @param[in] sp(16) - pu1_cliptab_cr +@* tc0_table for V +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_vert_bslt4_a9 + +ih264_deblk_chroma_vert_bslt4_a9: + + stmfd sp!, {r4-r7, r10-r12, r14} + + sub r0, r0, #4 @point r0 to p1u of row0. 
+ ldrd r4, r5, [sp, #32] @R4 = alpha_cr , R5 = beta_cr + add r2, r2, r4, lsl #8 + add r3, r3, r5, lsl #8 + ldr r6, [sp, #40] @R6 = u4_bs + ldrd r10, r11, [sp, #44] @R10 = pu1_cliptab_cb , R11 = pu1_cliptab_cr + vpush {d8 - d15} + mov r12, r0 @keep a back up of R0 for buffer write + + vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 + vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1 + vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1 + vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1 + + vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 + vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1 + vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1 + vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1 + + + vdup.16 q11, r2 @Q11 = alpha + vabd.u8 q4, q1, q2 @|p0-q0| + vdup.16 q12, r3 @Q12 = beta + vabd.u8 q5, q3, q2 @|q1-q0| + vabd.u8 q6, q0, q1 @|p1-p0| + vclt.u8 q4, q4, q11 @|p0-q0| < alpha ? + vsubl.u8 q7, d0, d6 + vclt.u8 q5, q5, q12 @|q1-q0| < beta ? + vsubl.u8 q8, d1, d7 @(p1 - q1) + vclt.u8 q6, q6, q12 @|p1-p0| < beta ? + vsubl.u8 q9, d4, d2 + vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta + vsubl.u8 q10, d5, d3 @(q0 - p0) + vmov.u16 q14, #4 + vld1.32 {d24[0]}, [r10] @Load ClipTable for U + vld1.32 {d25[0]}, [r11] @Load ClipTable for V + rev r6, r6 @Blocking strengths + vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta + + vmov.32 d10[0], r6 + + vmla.s16 q7, q9, q14 + vmla.s16 q8, q10, q14 @4*(q0 - p0) + (p1 - q1) + + vmovl.u8 q5, d10 + vsli.u16 d10, d10, #8 + vtbl.8 d12, {d24}, d10 @tC0 for U + vtbl.8 d13, {d25}, d10 @tC0 for V + vzip.8 d12, d13 + vmovl.u16 q5, d10 + vsli.u32 q5, q5, #16 + vmov.u8 q12, #1 + vadd.u8 q6, q6, q12 @tC0 + 1 + vcge.u8 q5, q5, q12 @u4_bS > 0 ? 
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0 + + @ Q0 - Q3(inputs), + @ Q4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0), + @ Q6 (tC) + + vrshr.s16 q7, q7, #3 + vrshr.s16 q8, q8, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) + + vcgt.s16 q9, q7, #0 + vcgt.s16 q10, q8, #0 + vmovn.i16 d18, q9 + vmovn.i16 d19, q10 @Q9 = sign(delta) + vabs.s16 q7, q7 + vabs.s16 q8, q8 + vmovn.u16 d14, q7 + vmovn.u16 d15, q8 + vmin.u8 q7, q7, q6 @Q7 = |delta| + + vqadd.u8 q10, q1, q7 @p0+|delta| + vqadd.u8 q11, q2, q7 @q0+|delta| + vqsub.u8 q12, q1, q7 @p0-|delta| + vqsub.u8 q13, q2, q7 @q0-|delta| + + vbit q12, q10, q9 @p0 + delta + vbit q11, q13, q9 @q0 - delta + + vbit q1, q12, q4 + vbit q2, q11, q4 + + vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1 + vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1 + vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1 + vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1 + + vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1 + vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1 + vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1 + vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1 + vpop {d8 - d15} + ldmfd sp!, {r4-r7, r10-r12, pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block vertical edge when the +@* boundary strength is set to 4 on calling twice in high profile +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. 
+@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha_cb +@* Alpha Value for the boundary in U +@* +@* @param[in] r3 - beta_cb +@* Beta Value for the boundary in U +@* +@* @param[in] sp(0) - alpha_cr +@* Alpha Value for the boundary in V +@* +@* @param[in] sp(4) - beta_cr +@* Beta Value for the boundary in V +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_vert_bs4_mbaff_a9 + +ih264_deblk_chroma_vert_bs4_mbaff_a9: + + stmfd sp!, {r4, r5, r12, r14} + + sub r0, r0, #4 @point r0 to p1u of row0. + mov r12, r0 @keep a back up of r0 for buffer write + ldrd r4, r5, [sp, #16] @R4 = alpha_cr , R5 = beta_cr + add r2, r2, r4, lsl #8 + add r3, r3, r5, lsl #8 + vpush {d8 - d15} + vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1 + vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1 + vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1 + vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1 + + vdup.16 d11, r2 @D11 = alpha + vdup.16 d12, r3 @D12 = beta + vmov.i8 d31, #2 + + vabd.u8 d4, d1, d2 @|p0-q0| + vabd.u8 d5, d3, d2 @|q1-q0| + vabd.u8 d6, d0, d1 @|p1-p0| + vaddl.u8 q14, d1, d3 @(p0 + q1) + vclt.u8 d4, d4, d11 @|p0-q0| < alpha ? + vclt.u8 d5, d5, d12 @|q1-q0| < beta ? + vclt.u8 d6, d6, d12 @|p1-p0| < beta ? 
+ vmlal.u8 q14, d0, d31 @2*p1 + (p0 + q1) + vaddl.u8 q13, d0, d2 @(p1 + q0) + vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta + vmlal.u8 q13, d3, d31 @2*q1 + (p1 + q0) + + vrshrn.i16 d7, q14, #2 @(2*p1 + (p0 + q1) + 2) >> 2 + vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta + vrshrn.i16 d9, q13, #2 @(2*q1 + (p1 + q0) + 2) >> 2 + + vbit d1, d7, d4 + vbit d2, d9, d4 + + vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1 + vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1 + vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1 + vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1 + vpop {d8 - d15} + ldmfd sp!, {r4, r5, r12, pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a chroma block vertical edge for cases where the +@* boundary strength is less than 4 on calling twice in high profile +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha_cb +@* Alpha Value for the boundary in U +@* +@* @param[in] r3 - beta_cb +@* Beta Value for the boundary in U +@* +@* @param[in] sp(0) - alpha_cr +@* Alpha Value for the boundary in V +@* +@* @param[in] sp(4) - beta_cr +@* Beta Value for the boundary in V +@* +@* @param[in] sp(8) - u4_bs +@* Packed Boundary strength array +@* +@* @param[in] sp(12) - pu1_cliptab_cb +@* tc0_table for U +@* +@* @param[in] sp(16) - pu1_cliptab_cr +@* tc0_table for V +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_chroma_vert_bslt4_mbaff_a9 + +ih264_deblk_chroma_vert_bslt4_mbaff_a9: + + stmfd sp!, {r4-r6, r10-r12, r14} + + sub r0, r0, #4 @point r0 to p1u of row0. 
+ mov r12, r0 @keep a back up of r0 for buffer write + + ldrd r4, r5, [sp, #28] @R4 = alpha_cr , R5 = beta_cr + add r2, r2, r4, lsl #8 + add r3, r3, r5, lsl #8 + ldr r6, [sp, #36] @R6 = u4_bs + ldrd r10, r11, [sp, #40] @R10 = pu1_cliptab_cb , R11 = pu1_cliptab_cr + vpush {d8 - d15} + vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1 + vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1 + vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1 + vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1 + + vdup.16 d11, r2 @D11 = alpha + vabd.u8 d4, d1, d2 @|p0-q0| + vdup.16 d12, r3 @D12 = beta + vabd.u8 d5, d3, d2 @|q1-q0| + vabd.u8 d6, d0, d1 @|p1-p0| + vclt.u8 d4, d4, d11 @|p0-q0| < alpha ? + vclt.u8 d5, d5, d12 @|q1-q0| < beta ? + vsubl.u8 q14, d0, d3 @(p1 - q1) + vclt.u8 d6, d6, d12 @|p1-p0| < beta ? + vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta + vsubl.u8 q12, d2, d1 @(q0 - p0) + vmov.u16 q10, #4 + + vld1.32 {d31[1]}, [r10] @Load ClipTable for U + vld1.32 {d31[0]}, [r11] @Load ClipTable for V + rev r6, r6 @Blocking strengths + vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta + vmov.32 d22[0], r6 + vmla.s16 q14, q12, q10 @4*(q0 - p0) + (p1 - q1) + vmovl.u8 q11, d22 + vsli.u16 d22, d22, #8 + vmov.u16 d13, #4 + vadd.u8 d22, d22, d13 + vtbl.8 d6, {d31}, d22 @tC0 + vmov.u8 d12, #1 + vsub.u8 d22, d22, d13 + vadd.u8 d6, d6, d12 @tC0 + 1 + vcge.u8 d5, d22, d12 @u4_bS > 0 ? 
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0 + + @ D0 - D3(inputs), + @ D4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0), + @ D6 (tC) + + vrshr.s16 q14, q14, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) + + vcgt.s16 q13, q14, #0 + vmovn.i16 d9, q13 @D9 = sign(delta) + vabs.s16 q14, q14 + vmovn.u16 d7, q14 + vmin.u8 d7, d7, d6 @D7 = |delta| + + vqadd.u8 d10, d1, d7 @p0+|delta| + vqadd.u8 d11, d2, d7 @q0+|delta| + vqsub.u8 d12, d1, d7 @p0-|delta| + vqsub.u8 d13, d2, d7 @q0-|delta| + + vbit d12, d10, d9 @p0 + delta + vbit d11, d13, d9 @q0 - delta + + vbit d1, d12, d4 + vbit d2, d11, d4 + + vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1 + vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1 + vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1 + vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1 + vpop {d8 - d15} + ldmfd sp!, {r4-r6, r10-r12, pc} + + + diff --git a/common/arm/ih264_deblk_luma_a9.s b/common/arm/ih264_deblk_luma_a9.s new file mode 100755 index 0000000..3e6a4d9 --- /dev/null +++ b/common/arm/ih264_deblk_luma_a9.s @@ -0,0 +1,1092 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +@*/ +@/*****************************************************************************/ +@/* */ +@/* File Name : ih264_deblk_luma_a9.s */ +@/* */ +@/* Description : Contains function definitions for deblocking luma */ +@/* edge. Functions are coded in NEON assembly and can */ +@/* be compiled using ARM RVDS. */ +@/* */ +@/* List of Functions : ih264_deblk_luma_vert_bs4_a9() */ +@/* ih264_deblk_luma_vert_bslt4_a9() */ +@/* ih264_deblk_luma_horz_bs4_a9() */ +@/* ih264_deblk_luma_horz_bslt4_a9() */ +@/* ih264_deblk_luma_vert_bs4_mbaff_a9() */ +@/* ih264_deblk_luma_vert_bslt4_mbaff_a9() */ +@/* */ +@/* Issues / Problems : None */ +@/* */ +@/* Revision History : */ +@/* */ +@/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +@/* 28 11 2013 Ittiam Draft */ +@/* 05 01 2015 Kaushik Added double-call functions for */ +@/* Senthoor vertical deblocking. */ +@/* */ +@/*****************************************************************************/ + + +.text +.p2align 2 + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a luma block horizontal edge for cases where the +@* boundary strength is less than 4 +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. 
+@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @param[in] sp(0) - u4_bs +@* Packed Boundary strength array +@* +@* @param[in] sp(4) - pu1_cliptab +@* tc0_table +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_luma_horz_bslt4_a9 + +ih264_deblk_luma_horz_bslt4_a9: + + stmfd sp!, {r4-r7, lr} + + ldrd r4, r5, [sp, #0x14] @r4 = ui_Bs , r5 = *puc_ClpTab + vpush {d8 - d15} + sub r0, r0, r1, lsl #1 @R1 = uc_Horizonpad + sub r0, r0, r1 @r0 pointer to p2 + rev r4, r4 @ + vld1.8 {q5}, [r0], r1 @p2 values are loaded into q5 + vmov.32 d12[0], r4 @d12[0] = ui_Bs + mov r6, r0 @keeping backup of pointer to p1 + vld1.8 {q4}, [r0], r1 @p1 values are loaded into q4 + mov r7, r0 @keeping backup of pointer to p0 + vld1.8 {q3}, [r0], r1 @p0 values are loaded into q3 + vmovl.u8 q6, d12 @q6 = uc_Bs in each 16 bt scalar + vld1.8 {q0}, [r0], r1 @q0 values are loaded into q0 + vabd.u8 q13, q4, q3 @Q13 = ABS(p1 - p0) + vld1.8 {q1}, [r0], r1 @q1 values are loaded into q1 + vabd.u8 q11, q3, q0 @Q11 = ABS(p0 - q0) + vld1.32 d16[0], [r5] @D16[0] contains cliptab + vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0) + vld1.8 {q2}, [r0], r1 @q2 values are loaded into q2 + vtbl.8 d14, {d16}, d12 @ + vdup.8 q10, r2 @Q10 contains alpha + vdup.8 q8, r3 @Q8 contains beta + vmovl.u16 q6, d12 @ + vmovl.u16 q7, d14 @ + vabd.u8 q14, q5, q3 @Q14 = Ap = ABS(p2 - p0) + vabd.u8 q15, q2, q0 @Q15 = Aq = ABS(q2 - q0) + vcgt.s32 q6, q6, #0 @Q6 = (us_Bs > 0) + vsli.32 q7, q7, #8 @ + vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha ) + vcge.u8 q12, q12, q8 @Q12=( ABS(q1 - q0) >= Beta ) + vcge.u8 q13, q13, q8 @Q13=( ABS(p1 - p0) >= Beta ) + vcgt.u8 q10, q8, q14 @Q10=(Ap<Beta) + vcgt.u8 q11, q8, q15 @Q11=(Aq<Beta) + 
vsli.32 q7, q7, #16 @Q7 = C0 + vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) + vsubl.u8 q15, d1, d7 @ + vsubl.u8 q12, d0, d6 @Q15,Q12 = (q0 - p0) + vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta ) + vsubl.u8 q14, d8, d2 @Q14 = (p1 - q1)L + vshl.i16 q13, q15, #2 @Q13 = (q0 - p0)<<2 + vshl.i16 q12, q12, #2 @Q12 = (q0 - p0)<<2 + vsubl.u8 q15, d9, d3 @Q15 = (p1 - q1)H + vbic q6, q6, q9 @final condition + vadd.i16 q12, q12, q14 @ + vadd.i16 q13, q13, q15 @Q13,Q12 = [ (q0 - p0)<<2 ] + (p1 - q1) + vsub.i8 q9, q7, q10 @Q9 = C0 + (Ap < Beta) + vrhadd.u8 q8, q3, q0 @Q8 = ((p0+q0+1) >> 1) + vqrshrn.s16 d24, q12, #3 @ + vqrshrn.s16 d25, q13, #3 @Q12 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3 + vsub.i8 q9, q9, q11 @Q9 = C0 + (Ap < Beta) + (Aq < Beta) + vand.i8 q10, q10, q6 @ + vand.i8 q11, q11, q6 @ + vabs.s8 q13, q12 @Q13 = ABS (i_macro) + vaddl.u8 q14, d17, d11 @ + vaddl.u8 q5, d16, d10 @Q14,Q5 = p2 + (p0+q0+1)>>1 + vaddl.u8 q15, d17, d5 @ + vmin.u8 q9, q13, q9 @Q9 = delta = (ABS(i_macro) > C) ? 
C : ABS(i_macro) + vshll.u8 q13, d9, #1 @ + vaddl.u8 q2, d16, d4 @Q15,Q2 = q2 + (p0+q0+1)>>1 + vshll.u8 q8, d8, #1 @Q13,Q8 = (p1<<1) + vand q9, q9, q6 @Making delta zero in places where values shouldn be filterd + vsub.i16 q14, q14, q13 @Q14,Q5 = [p2 + (p0+q0+1)>>1] - (p1<<1) + vsub.i16 q5, q5, q8 @ + vshll.u8 q8, d2, #1 @ + vshll.u8 q13, d3, #1 @Q13,Q8 = (q1<<1) + vqshrn.s16 d29, q14, #1 @ + vqshrn.s16 d28, q5, #1 @Q14 = i_macro_p1 + vsub.i16 q2, q2, q8 @ + vsub.i16 q15, q15, q13 @Q15,Q2 = [q2 + (p0+q0+1)>>1] - (q1<<1) + vneg.s8 q13, q7 @Q13 = -C0 + vmin.s8 q14, q14, q7 @Q14 = min(C0,i_macro_p1) + vcge.s8 q12, q12, #0 @Q12 = (i_macro >= 0) + vqshrn.s16 d31, q15, #1 @ + vqshrn.s16 d30, q2, #1 @Q15 = i_macro_q1 + vmax.s8 q14, q14, q13 @Q14 = max( - C0 , min(C0, i_macro_p1) ) + vqadd.u8 q8, q3, q9 @Q8 = p0 + delta + vqsub.u8 q3, q3, q9 @Q3 = p0 - delta + vmin.s8 q15, q15, q7 @Q15 = min(C0,i_macro_q1) + vand.i8 q14, q10, q14 @condition check Ap<beta + vqadd.u8 q7, q0, q9 @Q7 = q0 + delta + vqsub.u8 q0, q0, q9 @Q0 = q0 - delta + vmax.s8 q15, q15, q13 @Q15 = max( - C0 , min(C0, i_macro_q1) ) + vbif q8, q3, q12 @Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta) + vbif q0, q7, q12 @Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta) + vadd.i8 q14, q14, q4 @ + vand.i8 q15, q11, q15 @condition check Aq<beta + vst1.8 {q8}, [r7], r1 @writting back filtered value of p0 + vadd.i8 q15, q15, q1 @ + vst1.8 {q0}, [r7], r1 @writting back filtered value of q0 + vst1.8 {q14}, [r6] @writting back filtered value of p1 + vst1.8 {q15}, [r7], r1 @writting back filtered value of q1 + vpop {d8 - d15} + ldmfd sp!, {r4-r7, pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a luma block horizontal edge when the +@* boundary strength is set to 4 +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. 
+@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_luma_horz_bs4_a9 + +ih264_deblk_luma_horz_bs4_a9: + + @ Back up necessary registers on stack + stmfd sp!, {r12, r14} + vpush {d8 - d15} + @ Init + vdup.8 q0, r2 @duplicate alpha + sub r12, r0, r1 @pointer to p0 = q0 - src_strd + vdup.8 q1, r3 @duplicate beta + sub r14, r0, r1, lsl#1 @pointer to p1 = q0 - src_strd*2 + sub r2, r0, r1, lsl#2 @pointer to p3 = q0 - src_strd*4 + sub r3, r14, r1 @pointer to p2 = p1 - src_strd + + @ Load Data + vld1.8 {d4, d5}, [r0], r1 @load q0 to Q2, q0 = q0 + src_strd + vld1.8 {d6, d7}, [r12] @load p0 to Q3 + vld1.8 {d8, d9}, [r0], r1 @load q1 to Q4, q0 = q0 + src_strd + vld1.8 {d10, d11}, [r14] @load p1 to Q5 + + @ Filter Decision + vabd.u8 q6, q2, q3 @ABS(p0 - q0) + vabd.u8 q7, q4, q2 @ABS(q1 - q0) + vabd.u8 q8, q5, q3 @ABS(p1 - p0) + vcge.u8 q9, q6, q0 @ABS(p0 - q0) >= Alpha + vcge.u8 q7, q7, q1 @ABS(q1 - q0) >= Beta + vcge.u8 q8, q8, q1 @ABS(p1 - p0) >= Beta + vmov.i8 q10, #2 + vorr q9, q9, q7 @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta + vld1.8 {d14, d15}, [r0], r1 @load q2 to Q7, q0 = q0 + src_strd + vorr q9, q9, q8 @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta + vsra.u8 q10, q0, #2 @((Alpha >> 2) + 2) + vabd.u8 q11, q7, q2 @Aq = ABS(q2 - q0) + vaddl.u8 q12, d4, d6 @p0+q0 L + vaddl.u8 q13, d5, d7 @p0+q0 H + vclt.u8 q11, q11, q1 @Aq < Beta + vclt.u8 q10, q6, q10 @(ABS(p0 - q0) <((Alpha >>2) + 2)) + + @ Deblock Filtering q0', q1', q2' + vaddw.u8 q14, q12, d8 @p0+q0+q1 L + vaddw.u8 q15, q13, d9 @p0+q0+q1 H + vand q11, q11, q10 @(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) + @ q0' if (Aq < Beta && ABS(p0 - 
q0) <((Alpha >>2) + 2)) TRUE + vadd.i16 q8, q14, q14 @2*(p0+q0+q1)L + vadd.i16 q0, q15, q15 @2*(p0+q0+q1)H + vaddw.u8 q8, q8, d14 @2*(p0+q0+q1)+q2 L + vaddw.u8 q0, q0, d15 @2*(p0+q0+q1)+q2 H + vaddw.u8 q8, q8, d10 @2*(p0+q0+q1)+q2 +p1 L + vaddw.u8 q0, q0, d11 @2*(p0+q0+q1)+q2 +p1 H + vrshrn.u16 d12, q8, #3 @(2*(p0+q0+q1)+q2 +p1 +4)>> 3 L [q0'] + vrshrn.u16 d13, q0, #3 @(2*(p0+q0+q1)+q2 +p1 +4)>> 3 H [q0'] + @ q0" if (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) FALSE + vaddl.u8 q8, d8, d8 @2*q1 L + vaddl.u8 q0, d9, d9 @2*q1 H + vaddw.u8 q8, q8, d4 @2*q1+q0 L + vaddw.u8 q0, q0, d5 @2*q1+q0 H + vaddw.u8 q8, q8, d10 @2*q1+q0+p1 L + vaddw.u8 q0, q0, d11 @2*q1+q0+p1 H + vrshrn.u16 d16, q8, #2 @(2*q1+q0+p1+2)>>2 L [q0"] + vrshrn.u16 d17, q0, #2 @(2*q1+q0+p1+2)>>2 H [q0"] + @ q1' + vaddw.u8 q14, q14, d14 @p0+q0+q1+q2 L + vaddw.u8 q15, q15, d15 @p0+q0+q1+q2 H + vld1.8 {q0}, [r0], r1 @load q3 to Q0, q0 = q0 + src_strd + vbit q8, q6, q11 @choosing between q0' and q0" depending on condn + sub r0, r0, r1, lsl #2 @pointer to q0 + vbic q11, q11, q9 @((ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)) + @ && (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) + vrshrn.u16 d12, q14, #2 @(p0+q0+q1+q2+2)>>2 L [q1'] + vrshrn.u16 d13, q15, #2 @(p0+q0+q1+q2+2)>>2 H [q1'] + vbif q2, q8, q9 @choose q0 or filtered q0 + @ q2' + vaddl.u8 q8, d14, d0 @q2+q3,L + vaddl.u8 q0, d15, d1 @q2+q3,H + vadd.i16 q14, q14, q8 @p0+q0+q1+2*q2+q3 L + vst1.8 {d4, d5}, [r0], r1 @store q0 + vadd.i16 q15, q15, q0 @p0+q0+q1+2*q2+q3 H + vadd.i16 q14, q14, q8 @p0+q0+q1+3*q2+2*q3 L + vadd.i16 q15, q15, q0 @p0+q0+q1+3*q2+2*q3 H + vrshrn.u16 d0, q14, #3 @(p0+q0+q1+3*q2+2*q3+4)>>3 L [q2'] + vrshrn.u16 d1, q15, #3 @(p0+q0+q1+3*q2+2*q3+4)>>3 H [q2'] + vld1.8 {d30, d31}, [r3] @load p2 to Q15 + vbif q6, q4, q11 @choose q1 or filtered value of q1 + + vabd.u8 q8, q15, q3 @Ap,ABS(p2 - p0) + vaddw.u8 q12, q12, d10 @p0+q0+p1 L + vbif q0, q7, q11 @choose q2 or filtered q2 + vaddw.u8 q13, q13, d11 @p0+q0+p1 H 
+ vst1.8 {d12, d13}, [r0], r1 @store q1 + vclt.u8 q8, q8, q1 @Ap < Beta + vadd.i16 q14, q12, q12 @2*(p0+q0+p1) L + vadd.i16 q2, q13, q13 @2*(p0+q0+p1) H + vst1.8 {d0, d1}, [r0], r1 @store q2 + vand q10, q10, q8 @((Ap < Beta) && (ABS(p0 - q0) <((Alpha >>2) + 2))) + vaddw.u8 q14, q14, d30 @2*(p0+q0+p1)+p2 l + vaddw.u8 q2, q2, d31 @2*(p0+q0+p1)+p2 H + vaddw.u8 q14, q14, d8 @2*(p0+q0+p1)+p2+q1 L + vaddw.u8 q2, q2, d9 @2*(p0+q0+p1)+p2+q1 H + vrshrn.u16 d28, q14, #3 @(2*(p0+q0+p1)+p2+q1+4)>>3 L,p0' + vrshrn.u16 d29, q2, #3 @(2*(p0+q0+p1)+p2+q1+4)>>3 H,p0' + vmov.i8 d0, #2 + vmov.i16 d1, #2 + vaddl.u8 q1, d6, d8 @p0+q1 L + vmlal.u8 q1, d10, d0 @2*p1+p0+q1 L + vaddl.u8 q8, d7, d9 @p0+q1 H + vmlal.u8 q8, d11, d0 @2*p1+p0+q1 H + vaddw.u8 q6, q12, d30 @(p0+q0+p1) +p2 L + vld1.8 {d24, d25}, [r2] @load p3,Q12 + vaddw.u8 q2, q13, d31 @(p0+q0+p1) +p2 H + vaddl.u8 q4, d30, d24 @p2+p3 L + vrshrn.u16 d26, q6, #2 @((p0+q0+p1)+p2 +2)>>2,p1' L + vrshrn.u16 d2, q1, #2 @(2*p1+p0+q1+2)>>2,p0"L + vrshrn.u16 d27, q2, #2 @((p0+q0+p1)+p2 +2)>>2,p1' H + vrshrn.u16 d3, q8, #2 @(2*p1+p0+q1+2)>>2,p0" H + vaddl.u8 q8, d31, d25 @p2+p3 H + vmla.u16 q6, q4, d1[0] @(p0+q0+p1)+3*p2+2*p3 L + vmla.u16 q2, q8, d1[0] @(p0+q0+p1)+3*p2+2*p3 H + vbic q8, q10, q9 @((ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)) + @&& (Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) + vbit q1, q14, q10 @choosing between po' and p0" + vrshrn.u16 d12, q6, #3 @((p0+q0+p1)+3*p2+2*p3+4)>>3 L p2' + vrshrn.u16 d13, q2, #3 @((p0+q0+p1)+3*p2+2*p3+4)>>3 H p2' + vbif q3, q1, q9 @choosing between p0 and filtered value of p0 + vbit q5, q13, q8 @choosing between p1 and p1' + vbit q15, q6, q8 @choosing between p2 and p2' + vst1.8 {d6, d7}, [r12] @store p0 + vst1.8 {d10, d11}, [r14] @store p1 + vst1.8 {d30, d31}, [r3] @store p2 + vpop {d8 - d15} + ldmfd sp!, {r12, pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a luma block 
vertical edge for cases where the +@* boundary strength is less than 4 +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @param[in] sp(0) - u4_bs +@* Packed Boundary strength array +@* +@* @param[in] sp(4) - pu1_cliptab +@* tc0_table +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_luma_vert_bslt4_a9 + +ih264_deblk_luma_vert_bslt4_a9: + + stmfd sp!, {r12, lr} + + sub r0, r0, #4 @pointer uc_edgePixel-4 + ldr r12, [sp, #8] @r12 = ui_Bs + ldr r14, [sp, #12] @r14 = *puc_ClpTab + vpush {d8 - d15} + @loading p3:p2:p1:p0:q0:q1:q2:q3 for every row + vld1.8 {d0}, [r0], r1 @row1 + vld1.8 d2, [r0], r1 @row2 + vld1.8 d4, [r0], r1 @row3 + rev r12, r12 @reversing ui_bs + vld1.8 d6, [r0], r1 @row4 + vmov.32 d18[0], r12 @d12[0] = ui_Bs + vld1.32 d16[0], [r14] @D16[0] contains cliptab + vld1.8 d8, [r0], r1 @row5 + vmovl.u8 q9, d18 @q6 = uc_Bs in each 16 bt scalar + vld1.8 d10, [r0], r1 @row6 + vld1.8 d12, [r0], r1 @row7 + vtbl.8 d16, {d16}, d18 @puc_ClipTab[uc_Bs] + vld1.8 d14, [r0], r1 @row8 + vld1.8 d1, [r0], r1 @row9 + vmovl.u16 q8, d16 @ + vld1.8 d3, [r0], r1 @row10 + vld1.8 d5, [r0], r1 @row11 + vld1.8 d7, [r0], r1 @row12 + vsli.32 q8, q8, #8 @ + vld1.8 d9, [r0], r1 @row13 + vld1.8 d11, [r0], r1 @row14 + vld1.8 d13, [r0], r1 @row15 + vsli.32 q8, q8, #16 @Q8 = C0 + vld1.8 d15, [r0], r1 @row16 + + @taking two 8x8 transposes + @2X2 transposes + vtrn.8 d0, d2 @row1 &2 + vtrn.8 d4, d6 @row3&row4 + vtrn.8 d8, d10 @row5&6 + vtrn.8 d12, d14 @row7 & 8 + vtrn.8 d1, d3 @row9 &10 + vtrn.8 d5, d7 @row11 & 12 + vtrn.8 d9, 
d11 @row13 &14 + vtrn.8 d13, d15 @row15 & 16 + @4x4 transposes + vtrn.16 d2, d6 @row2 & row4 + vtrn.16 d10, d14 @row6 & row8 + vtrn.16 d3, d7 @row10 & 12 + vtrn.16 d11, d15 @row14 & row16 + vtrn.32 d6, d14 @row4 & 8 + vtrn.32 d7, d15 @row 12 & 16 + + @now Q3 ->p0 and Q7->q3 + vtrn.16 d0, d4 @row1 & 3 + vtrn.16 d8, d12 @row 5 & 7 + vtrn.16 d1, d5 @row9 & row11 + vtrn.16 d9, d13 @row13 & row15 + vtrn.32 d0, d8 @row1 & row5 + vtrn.32 d1, d9 @row9 & 13 + + @now Q0->p3 & Q4->q0 + @starting processing as p0 and q0 are now ready + vtrn.32 d2, d10 @row2 &6 + vrhadd.u8 q10, q3, q4 @((p0 + q0 + 1) >> 1) + vtrn.32 d3, d11 @row10&row14 + vmov.i8 d19, #2 + @now Q1->p2 & Q5->q1 + vtrn.32 d4, d12 @row3 & 7 + vabd.u8 q11, q3, q4 @ABS(p0 - q0) + vtrn.32 d5, d13 @row11 & row15 + vaddl.u8 q12, d20, d2 @(p2 + ((p0 + q0 + 1) >> 1) L + @now Q2->p1,Q6->q2 + vaddl.u8 q13, d21, d3 @(p2 + ((p0 + q0 + 1) >> 1) H + vmlsl.u8 q12, d4, d19 @(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) L + vmlsl.u8 q13, d5, d19 @(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) H + vdup.8 q14, r2 @alpha + vcle.u8 q11, q14, q11 @ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0)) + vdup.i8 q14, r3 @beta + vabd.u8 q15, q5, q4 @ABS(q1 - q0) + vqshrn.s16 d24, q12, #1 @((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) L + vqshrn.s16 d25 , q13, #1 @((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) H + vcge.u8 q15, q15, q14 @ABS(q1 - q0) >= Beta + vabd.u8 q13, q2, q3 @ABS(p1 - p0) + vmin.s8 q12, q12, q8 @min(deltap1 ,C0) + vorr q11, q11, q15 @ABS(q1 - q0) >= Beta ||ABS(p0 - q0) >= Alpha + vneg.s8 q15, q8 @-C0 + vcge.u8 q13, q13, q14 @ABS(p1 - p0) >= Beta + vmax.s8 q12, q12, q15 @max(deltap1,-C0) + vorr q11, q11, q13 @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta) + vmovl.u16 q13, d18 @ui_bs + vaddl.u8 q9, d20, d12 @q2 + ((p0 + q0 + 1) >> 1) L + vceq.u32 q13, q13, #0 @ui_bs == 0 + vsubw.u8 q9, q9, d10 @(q2 + ((p0 + q0 + 1) >> 1) - q1) L + vaddl.u8 q10, d21, d13 @q2 + ((p0 + q0 + 1) >> 1) H + vsubw.u8 q9, q9, d10 @(q2 + ((p0 + 
q0 + 1) >> 1) - 2*q1)L + vsubw.u8 q10, q10, d11 @(q2 + ((p0 + q0 + 1) >> 1) - q1) H + vorr q13, q13, q11 @(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)) &&(ui_bs) + vsubw.u8 q10, q10, d11 @(q2 + ((p0 + q0 + 1) >> 1) - 2*q1) H + vqshrn.s16 d18, q9, #1 @((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) L + vabd.u8 q11, q1, q3 @Ap = ABS(p2 - p0) + vqshrn.s16 d19, q10, #1 @((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) H + vabd.u8 q10, q6, q4 @Aq= ABS(q2 - q0) + vclt.u8 q11, q11, q14 @Ap < Beta + vmin.s8 q9, q9, q8 @min(delatq1,C0) + vclt.u8 q10, q10, q14 @Aq <Beta + vsubl.u8 q14, d8, d6 @(q0 - p0) L + vmax.s8 q9, q9, q15 @max(deltaq1,-C0) + vsubl.u8 q15, d9, d7 @(q0 - p0) H + vshl.s16 q14, q14, #2 @(q0 - p0)<<2 L + vsub.u8 q8, q8, q11 @C0 + (Ap < Beta) + vshl.s16 q15, q15, #2 @(q0 - p0) << 2) H + vaddw.u8 q14, q14, d4 @((q0 - p0) << 2) + (p1 L + vaddw.u8 q15, q15, d5 @((q0 - p0) << 2) + (p1 H + vsubw.u8 q14, q14, d10 @((q0 - p0) << 2) + (p1 - q1) L + vsubw.u8 q15, q15, d11 @((q0 - p0) << 2) + (p1 - q1) H + vbic q11, q11, q13 @final condition for p1 + vrshrn.s16 d28, q14, #3 @delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3); L + vrshrn.s16 d29, q15, #3 @delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) H + vsub.u8 q8, q8, q10 @C0 + (Ap < Beta) + (Aq < Beta) + vbic q10, q10, q13 @final condition for q1 + vabs.s8 q15, q14 @abs(delta) + vand q12, q12, q11 @delatp1 + vand q9, q9, q10 @delta q1 + vmin.u8 q15, q15, q8 @min((abs(delta),C) + vadd.i8 q2, q2, q12 @p1+deltap1 + vadd.i8 q5, q5, q9 @q1+deltaq1 + vbic q15, q15, q13 @abs(delta) of pixels to be changed only + vcge.s8 q14, q14, #0 @sign(delta) + vqsub.u8 q11, q3, q15 @clip(p0-delta) + vtrn.8 d0, d2 @row1 &2 + vqadd.u8 q3, q3, q15 @clip(p0+delta) + vtrn.8 d1, d3 @row9 &10 + vqadd.u8 q12, q4, q15 @clip(q0+delta) + vtrn.8 d12, d14 @row7 & 8 + vqsub.u8 q4, q4, q15 @clip(q0-delta) + vtrn.8 d13, d15 @row15 & 16 + vbif q3, q11, q14 @p0 + vbif q4, q12, q14 @q0 + vtrn.8 d4, d6 @row3&row4 + vtrn.8 d8, d10 
@row5&6 + vtrn.8 d5, d7 @row11 & 12 + vtrn.8 d9, d11 @row13 &14 + vtrn.16 d2, d6 @row2 & row4 + vtrn.16 d10, d14 @row6 & row8 + vtrn.16 d3, d7 @row10 & 12 + vtrn.16 d11, d15 @row14 & row16 + vtrn.32 d6, d14 @row4 & 8 + vtrn.32 d7, d15 @row 12 & 16 + @now Q3 ->p0 and Q7->q3 + vtrn.16 d0, d4 @row1 & 3 + vtrn.16 d8, d12 @row 5 & 7 + vtrn.16 d1, d5 @row9 & row11 + vtrn.16 d9, d13 @row13 & row15 + sub r0, r0, r1, lsl#4 @restore pointer + vtrn.32 d0, d8 @row1 & row5 + vtrn.32 d1, d9 @row9 & 13 + vtrn.32 d2, d10 @row2 &6 + vtrn.32 d3, d11 @row10&row14 + vtrn.32 d4, d12 @row3 & 7 + vtrn.32 d5, d13 @row11 & row15 + vst1.8 {d0}, [r0], r1 @row1 + vst1.8 d2, [r0], r1 @row2 + vst1.8 d4, [r0], r1 @row3 + vst1.8 d6, [r0], r1 @row4 + vst1.8 d8, [r0], r1 @row5 + vst1.8 d10, [r0], r1 @row6 + vst1.8 d12, [r0], r1 @row7 + vst1.8 d14, [r0], r1 @row8 + vst1.8 d1, [r0], r1 @row9 + vst1.8 d3, [r0], r1 @row10 + vst1.8 d5, [r0], r1 @row11 + vst1.8 d7, [r0], r1 @row12 + vst1.8 d9, [r0], r1 @row13 + vst1.8 d11, [r0], r1 @row14 + vst1.8 d13, [r0], r1 @row15 + vst1.8 d15, [r0], r1 @row16 + vpop {d8 - d15} + ldmfd sp!, {r12, pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a luma block vertical edge when the +@* boundary strength is set to 4 +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. 
+@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_luma_vert_bs4_a9 + +ih264_deblk_luma_vert_bs4_a9: + + stmfd sp!, {r12, lr} + vpush {d8 - d15} + sub r0, r0, #4 @pointer uc_edgePixel-4 + @loading p3:p2:p1:p0:q0:q1:q2:q3 for every row + vld1.8 d0, [r0], r1 @row1 + vld1.8 d2, [r0], r1 @row2 + vld1.8 d4, [r0], r1 @row3 + vld1.8 d6, [r0], r1 @row4 + vld1.8 d8, [r0], r1 @row5 + vld1.8 d10, [r0], r1 @row6 + vld1.8 d12, [r0], r1 @row7 + vld1.8 d14, [r0], r1 @row8 + vld1.8 d1, [r0], r1 @row9 + vld1.8 d3, [r0], r1 @row10 + vld1.8 d5, [r0], r1 @row11 + vld1.8 d7, [r0], r1 @row12 + vld1.8 d9, [r0], r1 @row13 + vld1.8 d11, [r0], r1 @row14 + vld1.8 d13, [r0], r1 @row15 + vld1.8 d15, [r0], r1 @row16 + @taking two 8x8 transposes + @2X2 transposes + vtrn.8 d0, d2 @row1 &2 + vtrn.8 d4, d6 @row3&row4 + vtrn.8 d8, d10 @row5&6 + vtrn.8 d12, d14 @row7 & 8 + vtrn.8 d1, d3 @row9 &10 + vtrn.8 d5, d7 @row11 & 12 + vtrn.8 d9, d11 @row13 &14 + vtrn.8 d13, d15 @row15 & 16 + @4x4 transposes + vtrn.16 d2, d6 @row2 & row4 + vtrn.16 d10, d14 @row6 & row8 + vtrn.16 d3, d7 @row10 & 12 + vtrn.16 d11, d15 @row14 & row16 + vtrn.32 d6, d14 @row4 & 8 + vtrn.32 d7, d15 @row 12 & 16 + @now Q3 ->p0 and Q7->q3 + vtrn.16 d0, d4 @row1 & 3 + vtrn.16 d8, d12 @row 5 & 7 + vtrn.16 d1, d5 @row9 & row11 + vtrn.16 d9, d13 @row13 & row15 + vtrn.32 d0, d8 @row1 & row5 + vtrn.32 d1, d9 @row9 & 13 + @now Q0->p3 & Q4->q0 + @starting processing as p0 and q0 are now ready + @now Q1->p2 & Q5->q1 + vpush {q7} @saving in stack + vtrn.32 d4, d12 @row3 & 7 + vmov.i16 q14, #2 + vtrn.32 d5, d13 @row11 & row15 + vaddl.u8 q8, d6, d8 @p0+q0 L + vtrn.32 d2, d10 @row2 &6 + vaddl.u8 
q9, d7, d9 @p0+q0 H + vtrn.32 d3, d11 @row10&row14 + vaddw.u8 q10, q8, d4 @p0+q0+p1 L + vaddw.u8 q11, q9, d5 @p0+q0+p1 H + vaddl.u8 q12, d2, d10 @p2+q1 L + vaddl.u8 q13, d3, d11 @p2+q1 H + vmla.u16 q12, q10, q14 @p2 + X2(p1) + X2(p0) + X2(q0) + q1 L + vmla.u16 q13, q11, q14 @p2 + X2(p1) + X2(p0) + X2(q0) + q1 H + vmov.i8 q14, #2 + vaddw.u8 q8, q10, d2 @p0+q0+p1+p2 L + vaddw.u8 q9, q11, d3 @p0+q0+p1+p2 H + vdup.i8 q15, r2 @duplicate alpha + vrshrn.u16 d20, q8, #2 @(p2 + p1 + p0 + q0 + 2) >> 2)L p1' + vrshrn.u16 d21, q9, #2 @(p2 + p1 + p0 + q0 + 2) >> 2)H p1' + vabd.u8 q11, q3, q4 @ABD(p0-q0) + vsra.u8 q14, q15, #2 @alpha >>2 +2 + vabd.u8 q15, q1, q3 @Ap = ABD(p2-p0) + vrshrn.u16 d24, q12, #3 @((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) L p0' + vrshrn.u16 d25, q13, #3 @((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) H p0' + vdup.i8 q13, r3 @beta + vcgt.u8 q14, q14, q11 @ABS(p0 - q0) <((Alpha >>2) + 2) + vaddl.u8 q11, d6, d10 @p0+q1 L + vcgt.u8 q7, q13, q15 @beta>Ap + vaddl.u8 q15, d7, d11 @p0+q1 H + vaddw.u8 q11, q11, d4 @p0+q1+p1 L + vaddw.u8 q15, q15, d5 @p0+q1+p1 H + vaddw.u8 q11, q11, d4 @p0+q1+2*p1 L + vaddw.u8 q15, q15, d5 @p0+q1+2*p1 H + vand q7, q7, q14 @(Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2) + vrshrn.u16 d22, q11, #2 @((X2(p1) + p0 + q1 + 2) >> 2) L p0" + vrshrn.u16 d23, q15, #2 @((X2(p1) + p0 + q1 + 2) >> 2) H p0" + vaddl.u8 q15, d2, d0 @p2+p3 L + vbif q12, q11, q7 @p0' or p0 " + vaddl.u8 q11, d3, d1 @p2+p3 H + vadd.u16 q15, q15, q15 @2*(p2+p3) L + vadd.u16 q11, q11, q11 @2*(p2+p3)H + vadd.u16 q8, q8, q15 @(X2(p3) + X3(p2) + p1 + p0 + q0) L + vadd.u16 q9, q9, q11 @(X2(p3) + X3(p2) + p1 + p0 + q0) H + vabd.u8 q15, q6, q4 @Aq = abs(q2-q0) + vabd.u8 q11, q5, q4 @ABS(Q1-Q0) + vrshrn.u16 d16, q8, #3 @((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); L p2' + vrshrn.u16 d17, q9, #3 @((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); H p2' + vabd.u8 q9, q2, q3 @ABS(p1-p0) + vcgt.u8 q15, q13, q15 @Aq < Beta + vcge.u8 q11, q11, q13 @ABS(q1 - q0) >= Beta + vcge.u8 q9, 
q9, q13 @ABS(p1 - p0) >= beta + vdup.i8 q13, r2 @duplicate alpha + vand q15, q15, q14 @(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) + vabd.u8 q14, q3, q4 @abs(p0-q0) + vorr q11, q11, q9 @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta + vaddl.u8 q9, d6, d8 @p0+q0 L + vcge.u8 q14, q14, q13 @ABS(p0 - q0) >= Alpha + vaddl.u8 q13, d7, d9 @p0+q0 H + vaddw.u8 q9, q9, d10 @p0+q0+q1 L + vorr q11, q11, q14 @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta||ABS(p0 - q0) >= Alpha + vaddw.u8 q13, q13, d11 @p0+q0+q1 H + vbic q7, q7, q11 @final condn for p's + vmov.i8 q14, #2 + vbif q3, q12, q11 @final p0 + vbit q1, q8, q7 @final p2 + vbif q10, q2, q7 @final p1 + vaddl.u8 q12, d8, d4 @q0+p1 L + vmlal.u8 q12, d10, d28 @X2(q1) + q0 + p1 L + vaddl.u8 q8, d9, d5 @q0+p1 H + vmlal.u8 q8, d11, d28 @X2(q1) + q0 + p1 H + vmov.i16 q14, #2 + vaddl.u8 q7, d4, d12 @p1+q2 L + vmla.u16 q7, q9, q14 @p1 + X2(p0) + X2(q0) + X2(q1) + q2L + vaddl.u8 q2, d5, d13 @p1+q2H + vmla.u16 q2, q13, q14 @p1 + X2(p0) + X2(q0) + X2(q1) + q2H + vrshrn.u16 d24, q12, #2 @(X2(q1) + q0 + p1 + 2) >> 2; L q0' + vrshrn.u16 d25, q8, #2 @(X2(q1) + q0 + p1 + 2) >> 2; H q0' + vaddw.u8 q9, q9, d12 @p0 + q0 + q1 + q2 L + vaddw.u8 q13, q13, d13 @p0 + q0 + q1 + q2 H + vrshrn.u16 d16, q7, #3 @(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 L qo" + vpop {q7} + vrshrn.u16 d17, q2, #3 @(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 H qo" + vrshrn.u16 d4, q9, #2 @p0 + q0 + q1 + q2 + 2)>>2 L q1' + vrshrn.u16 d5, q13, #2 @p0 + q0 + q1 + q2 + 2)>>2 H q1' + vbit q12, q8, q15 @q0' or q0" + vbic q15, q15, q11 @final condn for q's + vtrn.8 d0, d2 @row1 &2 + vbit q5, q2, q15 @final q1 + vtrn.8 d1, d3 @row9 &10 + vaddl.u8 q8, d12, d14 @q2+q3 L + vtrn.8 d20, d6 @row3&row4 + vaddl.u8 q2, d13, d15 @q2+q3 H + vtrn.8 d21, d7 @row11 & 12 + vmla.u16 q9, q8, q14 @X2(q3) + X3(q2) + q1 + q0 + p0 L + vtrn.16 d2, d6 @row2 & row4 + vmla.u16 q13, q2, q14 @X2(q3) + X3(q2) + q1 + q0 + p0 H + vtrn.16 d3, d7 @row10 & 12 + vbif q4, q12, q11 @final q0 + vtrn.16 d0, 
d20 @row1 & 3 + vrshrn.u16 d18, q9, #3 @(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; L + vtrn.16 d1, d21 @row9 & row11 + vrshrn.u16 d19, q13, #3 @(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; H + vtrn.8 d8, d10 @row5&6 + vbit q6, q9, q15 @final q2 + vtrn.8 d9, d11 @row13 &14 + vtrn.8 d12, d14 @row7 & 8 + vtrn.8 d13, d15 @row15 & 16 + vtrn.16 d10, d14 @row6 & row8 + vtrn.16 d11, d15 @row14 & row16 + @now Q3 ->p0 and Q7->q3 + vtrn.16 d8, d12 @row 5 & 7 + vtrn.16 d9, d13 @row13 & row15 + sub r0, r0, r1, lsl#4 @restore pointer + vtrn.32 d6, d14 @row4 & 8 + vtrn.32 d7, d15 @row 12 & 16 + vtrn.32 d0, d8 @row1 & row5 + vtrn.32 d1, d9 @row9 & 13 + vtrn.32 d2, d10 @row2 &6 + vtrn.32 d3, d11 @row10&row14 + vtrn.32 d20, d12 @row3 & 7 + vtrn.32 d21, d13 @row11 & row15 + vst1.8 d0, [r0], r1 @row1 + vst1.8 d2, [r0], r1 @row2 + vst1.8 d20, [r0], r1 @row3 + vst1.8 d6, [r0], r1 @row4 + vst1.8 d8, [r0], r1 @row5 + vst1.8 d10, [r0], r1 @row6 + vst1.8 d12, [r0], r1 @row7 + vst1.8 d14, [r0], r1 @row8 + vst1.8 d1, [r0], r1 @row9 + vst1.8 d3, [r0], r1 @row10 + vst1.8 d21, [r0], r1 @row11 + vst1.8 d7, [r0], r1 @row12 + vst1.8 d9, [r0], r1 @row13 + vst1.8 d11, [r0], r1 @row14 + vst1.8 d13, [r0], r1 @row15 + vst1.8 d15, [r0], r1 @row16 + vpop {d8 - d15} + ldmfd sp!, {r12, pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a luma block vertical edge when the +@* boundary strength is set to 4 on calling twice +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. 
+@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_luma_vert_bs4_mbaff_a9 + +ih264_deblk_luma_vert_bs4_mbaff_a9: + + stmfd sp!, {lr} + + sub r0, r0, #4 @pointer uc_edgePixel-4 + vpush {d8 - d15} + @loading [p3:p2],[p1:p0]:[q0:q1]:[q2:q3] for every row + vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 + vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1 + vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1 + vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1 + vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 + vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1 + vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1 + vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1 + + vuzp.8 d0, d1 @D0->p3, D1->p2 + vuzp.8 d2, d3 @D2->p1, D3->p0 + vuzp.8 d4, d5 @D4->q0, D5->q1 + vuzp.8 d6, d7 @D6->q2, D7->q3 + + vmov.i16 q14, #2 + vaddl.u8 q4, d3, d4 @p0+q0 + vaddw.u8 q5, q4, d2 @p0+q0+p1 + vaddl.u8 q6, d1, d5 @p2+q1 + vmla.u16 q6, q5, q14 @p2 + X2(p1) + X2(p0) + X2(q0) + q1 + + vmov.i8 d14, #2 + vaddw.u8 q4, q5, d1 @p0+q0+p1+p2 + vdup.i8 d15, r2 @duplicate alpha + vrshrn.u16 d10, q4, #2 @(p2 + p1 + p0 + q0 + 2) >> 2) p1' + vabd.u8 d11, d3, d4 @ABD(p0-q0) + vsra.u8 d14, d15, #2 @alpha >>2 +2 + vabd.u8 d15, d1, d3 @Ap = ABD(p2-p0) + vrshrn.u16 d12, q6, #3 @((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) p0' + vdup.i8 d13, r3 @beta + vcgt.u8 d14, d14, d11 @ABS(p0 - q0) <((Alpha >>2) + 2) + vaddl.u8 q8, d3, d5 @p0+q1 + vcgt.u8 d26, d13, d15 @beta>Ap + vaddw.u8 q8, q8, d2 @p0+q1+p1 + vaddw.u8 q8, q8, d2 @p0+q1+2*p1 + vand d26, d26, d14 @(Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2) + vrshrn.u16 d11, q8, #2 @((X2(p1) + p0 + q1 + 2) >> 2) p0" + vbif d12, d11, d26 
@p0' or p0 " + vaddl.u8 q9, d1, d0 @p2+p3 + vadd.u16 q9, q9, q9 @2*(p2+p3) + vadd.u16 q4, q4, q9 @(X2(p3) + X3(p2) + p1 + p0 + q0) + vabd.u8 d15, d6, d4 @Aq = abs(q2-q0) + vabd.u8 d11, d5, d4 @ABS(q1-q0) + vrshrn.u16 d8, q4, #3 @((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); p2' + vabd.u8 d9, d2, d3 @ABS(p1-p0) + vcgt.u8 d15, d13, d15 @Aq < Beta + vcge.u8 d11, d11, d13 @ABS(q1 - q0) >= Beta + vcge.u8 d9, d9, d13 @ABS(p1 - p0) >= beta + vdup.i8 d13, r2 @duplicate alpha + vand d15, d15, d14 @(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) + vabd.u8 d14, d3, d4 @abs(p0-q0) + vorr d11, d11, d9 @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta + vcge.u8 d14, d14, d13 @ABS(p0 - q0) >= Alpha + vaddl.u8 q10, d3, d4 @p0+q0 + vorr d11, d11, d14 @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta||ABS(p0 - q0) >= Alpha + vaddw.u8 q10, q10, d5 @p0+q0+q1 + vbic d26, d26, d11 @final condn for p's + vmov.i8 d14, #2 + vbif d3, d12, d11 @final p0 + vbit d1, d8, d26 @final p2 + vbif d10, d2, d26 @final p1 + vaddl.u8 q6, d4, d2 @q0+p1 + vmlal.u8 q6, d5, d14 @X2(q1) + q0 + p1 + + vaddl.u8 q11, d2, d6 @p1+q2 + vmla.u16 q11, q10, q14 @p1 + X2(p0) + X2(q0) + X2(q1) + q2 + vrshrn.u16 d12, q6, #2 @(X2(q1) + q0 + p1 + 2) >> 2; q0' + vaddw.u8 q10, q10, d6 @p0 + q0 + q1 + q2 + vrshrn.u16 d8, q11, #3 @(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 qo" + + vrshrn.u16 d2, q10, #2 @p0 + q0 + q1 + q2 + 2)>>2 q1' + vbit d12, d8, d15 @q0' or q0" + vbic d15, d15, d11 @final condn for q's + vbit d5, d2, d15 @final q1 + vaddl.u8 q12, d6, d7 @q2+q3 + vmla.u16 q10, q12, q14 @X2(q3) + X3(q2) + q1 + q0 + p0 + vbif d4, d12, d11 @final q0 + vrshrn.u16 d9, q10, #3 @(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; + vbit d6, d9, d15 @final q2 + vand d2, d10, d10 @D0->p3, D1->p2, D2->p1, D3->p0, D4->q0, D5->q1, D6->q2, D7->q3 + + vzip.8 d0, d1 @D0,D1 -> [p3:p2] + vzip.8 d2, d3 @D2,D3 -> [p1:p0] + vzip.8 d4, d5 @D4,D5 -> [q0:q1] + vzip.8 d6, d7 @D6,D7 -> [q2:q3] + + sub r0, r0, r1, lsl#3 @restore pointer + + @storing 
[p3:p2],[p1:p0]:[q0:q1]:[q2:q3] in every row + vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 + vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1 + vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1 + vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1 + vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 + vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1 + vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1 + vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1 + vpop {d8 - d15} + ldmfd sp!, {pc} + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Performs filtering of a luma block vertical edge for cases where the +@* boundary strength is less than 4 on calling twice +@* +@* @par Description: +@* This operation is described in Sec. 8.7.2.4 under the title +@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +@* +@* @param[in] r0 - pu1_src +@* Pointer to the src sample q0 +@* +@* @param[in] r1 - src_strd +@* Source stride +@* +@* @param[in] r2 - alpha +@* Alpha Value for the boundary +@* +@* @param[in] r3 - beta +@* Beta Value for the boundary +@* +@* @param[in] sp(0) - u4_bs +@* Packed Boundary strength array +@* +@* @param[in] sp(4) - pu1_cliptab +@* tc0_table +@* +@* @returns +@* None +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + + .global ih264_deblk_luma_vert_bslt4_mbaff_a9 + +ih264_deblk_luma_vert_bslt4_mbaff_a9: + + stmfd sp!, {r12, lr} + + sub r0, r0, #4 @pointer uc_edgePixel-4 + ldr r12, [sp, #8] @r12 = ui_Bs + ldr r14, [sp, #12] @r14 = pu1_ClipTab + vpush {d8 - d15} + @loading [p3:p2],[p1:p0]:[q0:q1]:[q2:q3] for every row + vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 + vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1 + vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1 + vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1 + vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 + vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1 + vld4.16 {d1[2], 
d3[2], d5[2], d7[2]}, [r0], r1 + vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1 + + vuzp.8 d0, d1 @D0->p3, D1->p2 + vuzp.8 d2, d3 @D2->p1, D3->p0 + vuzp.8 d4, d5 @D4->q0, D5->q1 + vuzp.8 d6, d7 @D6->q2, D7->q3 + + rev r12, r12 @reversing ui_bs + vmov.32 d8[0], r12 @D8[0] = ui_Bs + vld1.32 d9[0], [r14] @D9[0] contains cliptab + vmovl.u8 q15, d8 @D30 = ui_Bs in each 16 bt scalar + vtbl.8 d8, {d9}, d30 @puc_ClipTab[ui_Bs] + vsli.16 d8, d8, #8 @D8 = C0 + + vrhadd.u8 d10, d3, d4 @((p0 + q0 + 1) >> 1) + vmov.i8 d31, #2 + vabd.u8 d11, d3, d4 @ABS(p0 - q0) + vaddl.u8 q6, d10, d1 @(p2 + ((p0 + q0 + 1) >> 1) + vmlsl.u8 q6, d2, d31 @(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) + vdup.8 d14, r2 @alpha + vcle.u8 d11, d14, d11 @ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0)) + vdup.i8 d14, r3 @beta + vabd.u8 d15, d5, d4 @ABS(q1 - q0) + vqshrn.s16 d12, q6, #1 @((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) + vcge.u8 d15, d15, d14 @ABS(q1 - q0) >= Beta + vabd.u8 d13, d2, d3 @ABS(p1 - p0) + vmin.s8 d12, d12, d8 @min(deltap1 ,C0) + vorr d11, d11, d15 @ABS(q1 - q0) >= Beta ||ABS(p0 - q0) >= Alpha + vneg.s8 d15, d8 @-C0 + vcge.u8 d13, d13, d14 @ABS(p1 - p0) >= Beta + vmax.s8 d12, d12, d15 @max(deltap1,-C0) + vorr d11, d11, d13 @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta) + vceq.u16 d13, d30, #0 @ui_bs == 0 + vaddl.u8 q14, d10, d6 @q2 + ((p0 + q0 + 1) >> 1) + vsubw.u8 q14, q14, d5 @q2 + ((p0 + q0 + 1) >> 1) - q1 + vsubw.u8 q14, q14, d5 @q2 + ((p0 + q0 + 1) >> 1) - 2*q1 + vorr d13, d13, d11 @(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)) + @|| (ui_bs == 0) + vqshrn.s16 d9, q14, #1 @(q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1 + vabd.u8 d11, d1, d3 @Ap = ABS(p2 - p0) + vabd.u8 d10, d6, d4 @Aq= ABS(q2 - q0) + vclt.u8 d11, d11, d14 @Ap < Beta + vmin.s8 d9, d9, d8 @min(deltaq1,C0) + vclt.u8 d10, d10, d14 @Aq < Beta + vmax.s8 d9, d9, d15 @max(deltaq1,-C0) + vsubl.u8 q7, d4, d3 @q0 - p0 + vshl.s16 q7, q7, #2 @(q0 - p0) << 2 + vsub.u8 d8, d8, d11 @C0 
+ (Ap < Beta) + vaddw.u8 q7, q7, d2 @((q0 - p0) << 2) + p1 + vsubw.u8 q7, q7, d5 @((q0 - p0) << 2) + (p1 - q1) + vbic d11, d11, d13 @final condition for p1 + vrshr.s16 q15, q7, #3 @delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 + vsub.u8 d8, d8, d10 @C0 + (Ap < Beta) + (Aq < Beta) + vbic d10, d10, d13 @final condition for q1 + vabs.s16 q14, q15 + vmovn.i16 d15, q14 @abs(delta) + vand d12, d12, d11 @delatp1 + vand d9, d9, d10 @deltaq1 + vmin.u8 d15, d15, d8 @min((abs(delta),C) + vadd.i8 d2, d2, d12 @p1+deltap1 + vadd.i8 d5, d5, d9 @q1+deltaq1 + vbic d15, d15, d13 @abs(delta) of pixels to be changed only + vcge.s16 q14, q15, #0 + vmovn.i16 d14, q14 @sign(delta) + vqsub.u8 d11, d3, d15 @clip(p0-delta) + vqadd.u8 d3, d3, d15 @clip(p0+delta) + vqadd.u8 d12, d4, d15 @clip(q0+delta) + vqsub.u8 d4, d4, d15 @clip(q0-delta) + vbif d3, d11, d14 @p0 + vbif d4, d12, d14 @q0 + + sub r0, r0, r1, lsl#3 @restore pointer + @D0->p3, D1->p2, D2->p1, D3->p0, D4->q0, D5->q1, D6->q2, D7->q3 + vzip.8 d0, d1 @D0,D1 -> [p3:p2] + vzip.8 d2, d3 @D2,D3 -> [p1:p0] + vzip.8 d4, d5 @D4,D5 -> [q0:q1] + vzip.8 d6, d7 @D6,D7 -> [q2:q3] + + @storing [p3:p2],[p1:p0]:[q0:q1]:[q2:q3] in every row + vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 + vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1 + vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1 + vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1 + vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 + vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1 + vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1 + vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1 + vpop {d8 - d15} + ldmfd sp!, {r12, pc} + + + diff --git a/common/arm/ih264_default_weighted_pred_a9q.s b/common/arm/ih264_default_weighted_pred_a9q.s new file mode 100755 index 0000000..94cda46 --- /dev/null +++ b/common/arm/ih264_default_weighted_pred_a9q.s @@ -0,0 +1,359 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ 
* Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_default_weighted_pred_a9q.s +@* +@* @brief +@* Contains function definitions for default weighted prediction. +@* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT +@* +@* @author +@* Kaushik Senthoor R +@* +@* @par List of Functions: +@* +@* - ih264_default_weighted_pred_luma_a9q() +@* - ih264_default_weighted_pred_chroma_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@******************************************************************************* +@* @function +@* ih264_default_weighted_pred_luma_a9q() +@* +@* @brief +@* This routine performs the default weighted prediction as described in sec +@* 8.4.2.3.1 titled "Default weighted sample prediction process" for luma. +@* +@* @par Description: +@* This function gets two ht x wd blocks, calculates their rounded-average and +@* stores it in the destination block. +@* +@* @param[in] pu1_src1: +@* UWORD8 Pointer to the buffer containing the first input block. +@* +@* @param[in] pu1_src2: +@* UWORD8 Pointer to the buffer containing the second input block. 
+@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination where the output block is stored. +@* +@* @param[in] src_strd1 +@* Stride of the first input buffer +@* +@* @param[in] src_strd2 +@* Stride of the second input buffer +@* +@* @param[in] dst_strd +@* Stride of the destination buffer +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* None +@* +@* @remarks +@* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16). +@* +@******************************************************************************* +@*/ +@void ih264_default_weighted_pred_luma_a9q(UWORD8 *pu1_src1, +@ UWORD8 *pu1_src2, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd1, +@ WORD32 src_strd2, +@ WORD32 dst_strd, +@ WORD32 ht, +@ WORD32 wd) +@ +@**************Variables Vs Registers***************************************** +@ r0 => pu1_src1 +@ r1 => pu1_src2 +@ r2 => pu1_dst +@ r3 => src_strd1 +@ [sp] => src_strd2 (r4) +@ [sp+4] => dst_strd (r5) +@ [sp+8] => ht (r6) +@ [sp+12] => wd (r7) +@ +.text +.p2align 2 + + .global ih264_default_weighted_pred_luma_a9q + +ih264_default_weighted_pred_luma_a9q: + + stmfd sp!, {r4-r7, r14} @stack stores the values of the arguments + ldr r7, [sp, #32] @Load wd + ldr r4, [sp, #20] @Load src_strd2 + ldr r5, [sp, #24] @Load dst_strd + cmp r7, #16 + ldr r6, [sp, #28] @Load ht + vpush {d8-d15} + beq loop_16 @branch if wd is 16 + cmp r7, #8 + beq loop_8 @branch if wd is 8 + +loop_4: @each iteration processes four rows + + vld1.32 d0[0], [r0], r3 @load row 1 in source 1 + vld1.32 d0[1], [r0], r3 @load row 2 in source 1 + vld1.32 d2[0], [r1], r4 @load row 1 in source 2 + vld1.32 d2[1], [r1], r4 @load row 2 in source 2 + + vld1.32 d1[0], [r0], r3 @load row 3 in source 1 + vld1.32 d1[1], [r0], r3 @load row 4 in source 1 + vrhadd.u8 d0, d0, d2 + vld1.32 d3[0], [r1], r4 @load row 3 in source 2 + vld1.32 d3[1], [r1], r4 @load row 4 in source 2 + + subs r6, r6, #4 @decrement ht by 4 + 
vst1.32 d0[0], [r2], r5 @load row 1 in destination + vst1.32 d0[1], [r2], r5 @load row 2 in destination + vrhadd.u8 d1, d1, d3 + vst1.32 d1[0], [r2], r5 @load row 3 in destination + vst1.32 d1[1], [r2], r5 @load row 4 in destination + + bgt loop_4 @if greater than 0 repeat the loop again + + b end_loops + +loop_8: @each iteration processes four rows + + vld1.8 d0, [r0], r3 @load row 1 in source 1 + vld1.8 d4, [r1], r4 @load row 1 in source 2 + vld1.8 d1, [r0], r3 @load row 2 in source 1 + vld1.8 d5, [r1], r4 @load row 2 in source 2 + vld1.8 d2, [r0], r3 @load row 3 in source 1 + vrhadd.u8 q0, q0, q2 + vld1.8 d6, [r1], r4 @load row 3 in source 2 + vld1.8 d3, [r0], r3 @load row 4 in source 1 + vrhadd.u8 d2, d2, d6 + vld1.8 d7, [r1], r4 @load row 4 in source 2 + + subs r6, r6, #4 @decrement ht by 4 + vst1.8 d0, [r2], r5 @load row 1 in destination + vrhadd.u8 d3, d3, d7 + vst1.8 d1, [r2], r5 @load row 2 in destination + vst1.8 d2, [r2], r5 @load row 3 in destination + vst1.8 d3, [r2], r5 @load row 4 in destination + + bgt loop_8 @if greater than 0 repeat the loop again + + b end_loops + +loop_16: @each iteration processes eight rows + + vld1.8 {q0}, [r0], r3 @load row 1 in source 1 + vld1.8 {q8}, [r1], r4 @load row 1 in source 2 + vld1.8 {q1}, [r0], r3 @load row 2 in source 1 + vld1.8 {q9}, [r1], r4 @load row 2 in source 2 + vrhadd.u8 q0, q0, q8 + vld1.8 {q2}, [r0], r3 @load row 3 in source 1 + vld1.8 {q10}, [r1], r4 @load row 3 in source 2 + vrhadd.u8 q1, q1, q9 + vld1.8 {q3}, [r0], r3 @load row 4 in source 1 + vld1.8 {q11}, [r1], r4 @load row 4 in source 2 + vrhadd.u8 q2, q2, q10 + vld1.8 {q4}, [r0], r3 @load row 5 in source 1 + vld1.8 {q12}, [r1], r4 @load row 5 in source 2 + vrhadd.u8 q3, q3, q11 + vld1.8 {q5}, [r0], r3 @load row 6 in source 1 + vld1.8 {q13}, [r1], r4 @load row 6 in source 2 + vrhadd.u8 q4, q4, q12 + vld1.8 {q6}, [r0], r3 @load row 7 in source 1 + vld1.8 {q14}, [r1], r4 @load row 7 in source 2 + vrhadd.u8 q5, q5, q13 + vld1.8 {q7}, [r0], r3 @load 
row 8 in source 1 + vld1.8 {q15}, [r1], r4 @load row 8 in source 2 + + vrhadd.u8 q6, q6, q14 + vst1.8 {q0}, [r2], r5 @load row 1 in destination + vst1.8 {q1}, [r2], r5 @load row 2 in destination + vrhadd.u8 q7, q7, q15 + vst1.8 {q2}, [r2], r5 @load row 3 in destination + vst1.8 {q3}, [r2], r5 @load row 4 in destination + subs r6, r6, #8 @decrement ht by 8 + vst1.8 {q4}, [r2], r5 @load row 5 in destination + vst1.8 {q5}, [r2], r5 @load row 6 in destination + vst1.8 {q6}, [r2], r5 @load row 7 in destination + vst1.8 {q7}, [r2], r5 @load row 8 in destination + + bgt loop_16 @if greater than 0 repeat the loop again + +end_loops: + + vpop {d8-d15} + ldmfd sp!, {r4-r7, r15} @Reload the registers from sp + + +@******************************************************************************* +@* @function +@* ih264_default_weighted_pred_chroma_a9q() +@* +@* @brief +@* This routine performs the default weighted prediction as described in sec +@* 8.4.2.3.1 titled "Default weighted sample prediction process" for chroma. +@* +@* @par Description: +@* This function gets two ht x wd blocks, calculates their rounded-average and +@* stores it in the destination block for U and V. +@* +@* @param[in] pu1_src1: +@* UWORD8 Pointer to the buffer containing the first input block. +@* +@* @param[in] pu1_src2: +@* UWORD8 Pointer to the buffer containing the second input block. +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination where the output block is stored. +@* +@* @param[in] src_strd1 +@* Stride of the first input buffer +@* +@* @param[in] src_strd2 +@* Stride of the second input buffer +@* +@* @param[in] dst_strd +@* Stride of the destination buffer +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* None +@* +@* @remarks +@* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8). 
+@* +@******************************************************************************* +@*/ +@void ih264_default_weighted_pred_chroma_a9q(UWORD8 *pu1_src1, +@ UWORD8 *pu1_src2, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd1, +@ WORD32 src_strd2, +@ WORD32 dst_strd, +@ WORD32 ht, +@ WORD32 wd) +@ +@**************Variables Vs Registers***************************************** +@ r0 => pu1_src1 +@ r1 => pu1_src2 +@ r2 => pu1_dst +@ r3 => src_strd1 +@ [sp] => src_strd2 (r4) +@ [sp+4] => dst_strd (r5) +@ [sp+8] => ht (r6) +@ [sp+12] => wd (r7) +@ + + + .global ih264_default_weighted_pred_chroma_a9q + +ih264_default_weighted_pred_chroma_a9q: + + stmfd sp!, {r4-r7, r14} @stack stores the values of the arguments + ldr r7, [sp, #32] @Load wd + ldr r4, [sp, #20] @Load src_strd2 + ldr r5, [sp, #24] @Load dst_strd + cmp r7, #8 + ldr r6, [sp, #28] @Load ht + vpush {d8-d15} + beq loop_8_uv @branch if wd is 8 + cmp r7, #4 + beq loop_4_uv @branch if wd is 4 + +loop_2_uv: @each iteration processes two rows + + vld1.32 d0[0], [r0], r3 @load row 1 in source 1 + vld1.32 d0[1], [r0], r3 @load row 2 in source 1 + + vld1.32 d1[0], [r1], r4 @load row 1 in source 2 + vld1.32 d1[1], [r1], r4 @load row 2 in source 2 + + vrhadd.u8 d0, d0, d1 + + subs r6, r6, #2 @decrement ht by 2 + vst1.32 d0[0], [r2], r5 @load row 1 in destination + vst1.32 d0[1], [r2], r5 @load row 2 in destination + + bgt loop_2_uv @if greater than 0 repeat the loop again + + b end_loops_uv + +loop_4_uv: @each iteration processes two rows + + vld1.8 d0, [r0], r3 @load row 1 in source 1 + vld1.8 d2, [r1], r4 @load row 1 in source 2 + vld1.8 d1, [r0], r3 @load row 2 in source 1 + vrhadd.u8 d0, d0, d2 + vld1.8 d3, [r1], r4 @load row 2 in source 2 + + vrhadd.u8 d1, d1, d3 + vst1.8 d0, [r2], r5 @load row 1 in destination + subs r6, r6, #2 @decrement ht by 2 + vst1.8 d1, [r2], r5 @load row 2 in destination + + bgt loop_4_uv @if greater than 0 repeat the loop again + + b end_loops_uv + +loop_8_uv: @each iteration processes four rows + + 
vld1.8 {q0}, [r0], r3 @load row 1 in source 1 + vld1.8 {q4}, [r1], r4 @load row 1 in source 2 + vld1.8 {q1}, [r0], r3 @load row 2 in source 1 + vrhadd.u8 q0, q0, q4 + vld1.8 {q5}, [r1], r4 @load row 2 in source 2 + vld1.8 {q2}, [r0], r3 @load row 3 in source 1 + vrhadd.u8 q1, q1, q5 + vld1.8 {q6}, [r1], r4 @load row 3 in source 2 + vld1.8 {q3}, [r0], r3 @load row 4 in source 1 + vrhadd.u8 q2, q2, q6 + vld1.8 {q7}, [r1], r4 @load row 4 in source 2 + + vst1.8 {q0}, [r2], r5 @load row 1 in destination + vrhadd.u8 q3, q3, q7 + vst1.8 {q1}, [r2], r5 @load row 2 in destination + subs r6, r6, #4 @decrement ht by 4 + vst1.8 {q2}, [r2], r5 @load row 3 in destination + vst1.8 {q3}, [r2], r5 @load row 4 in destination + + bgt loop_8_uv @if greater than 0 repeat the loop again + +end_loops_uv: + + vpop {d8-d15} + ldmfd sp!, {r4-r7, r15} @Reload the registers from sp + + diff --git a/common/arm/ih264_ihadamard_scaling_a9.s b/common/arm/ih264_ihadamard_scaling_a9.s new file mode 100755 index 0000000..687099a --- /dev/null +++ b/common/arm/ih264_ihadamard_scaling_a9.s @@ -0,0 +1,250 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +@*/ +@/** +@ ******************************************************************************* +@ * @file +@ * ih264_ihadamard_scaling_a9.s +@ * +@ * @brief +@ * Contains function definitions for inverse hadamard transform on 4x4 DC outputs +@ * of 16x16 intra-prediction +@ * +@ * @author +@ * Mohit +@ * +@ * @par List of Functions: +@ * - ih264_ihadamard_scaling_4x4_a9() +@ * - ih264_ihadamard_scaling_2x2_uv_a9() +@ * +@ * @remarks +@ * None +@ * +@ ******************************************************************************* +@ */ +@ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients +@ * of a 16x16 intra prediction macroblock, and then performs scaling. +@ * prediction buffer +@ * +@ * @par Description: +@ * The DC coefficients pass through a 2-stage inverse hadamard transform. +@ * This inverse transformed content is scaled to based on Qp value. +@ * +@ * @param[in] pi2_src +@ * input 4x4 block of DC coefficients +@ * +@ * @param[out] pi2_out +@ * output 4x4 block +@ * +@ * @param[in] pu2_iscal_mat +@ * pointer to scaling list +@ * +@ * @param[in] pu2_weigh_mat +@ * pointer to weight matrix +@ * +@ * @param[in] u4_qp_div_6 +@ * Floor (qp/6) +@ * +@ * @param[in] pi4_tmp +@ * temporary buffer of size 1*16 +@ * +@ * @returns none +@ * +@ * @remarks none +@ * +@ ******************************************************************************* +@ */ +@ * +@ ******************************************************************************* +@ */ +@ void ih264_ihadamard_scaling_4x4(WORD16* pi2_src, +@ WORD16* pi2_out, +@ const UWORD16 *pu2_iscal_mat, +@ const UWORD16 *pu2_weigh_mat, +@ UWORD32 u4_qp_div_6, +@ WORD32* pi4_tmp) +@**************Variables Vs Registers***************************************** +@r0 => *pi2_src +@r1 => *pi2_out +@r2 => *pu2_iscal_mat +@r3 => *pu2_weigh_mat +@r4 => u4_qp_div_6 + +.text +.p2align 2 + + .global ih264_ihadamard_scaling_4x4_a9 + +ih264_ihadamard_scaling_4x4_a9: + +@VLD4.S16 is 
used because the pointer is incremented by SUB_BLK_WIDTH_4x4 +@If the macro value changes need to change the instruction according to it. +@Only one shift is done in horizontal inverse because, +@if u4_qp_div_6 is lesser than 4 then shift value will be neagative and do negative left shift, in this case rnd_factor has value +@if u4_qp_div_6 is greater than 4 then shift value will be positive and do left shift, here rnd_factor is 0 + + stmfd sp!, {r4-r12, r14} @ stack stores the values of the arguments + ldr r4, [sp, #40] @ Loads u4_qp_div_6 + vdup.s32 q10, r4 @ Populate the u4_qp_div_6 in Q10 + ldrh r6, [r3] @ load pu2_weight_mat[0] , H for unsigned halfword load + ldrh r7, [r2] @ load pu2_iscal_mat[0] , H for unsigned halfword load + mul r6, r6, r7 @ pu2_iscal_mat[0]*pu2_weigh_mat[0] + vdup.s32 q9, r6 @ Populate pu2_iscal_mat[0]*pu2_weigh_mat[0] 32-bit in Q9 + vpush {d8-d15} +@=======================INVERSE HADAMARD TRANSFORM================================ + + vld4.s16 {d0, d1, d2, d3}, [r0] @load x4,x5,x6,x7 + vaddl.s16 q12, d0, d3 @x0 = x4 + x7 + vaddl.s16 q13, d1, d2 @x1 = x5 + x6 + vsubl.s16 q14, d1, d2 @x2 = x5 - x6 + vsubl.s16 q15, d0, d3 @x3 = x4 - x7 + + vadd.s32 q2, q12, q13 @pi4_tmp_ptr[0] = x0 + x1 + vadd.s32 q3, q15, q14 @pi4_tmp_ptr[1] = x3 + x2 + vsub.s32 q4, q12, q13 @pi4_tmp_ptr[2] = x0 - x1 + vsub.s32 q5, q15, q14 @pi4_tmp_ptr[3] = x3 - x2 + + vtrn.32 q2, q3 @Transpose the register for vertical transform + vtrn.32 q4, q5 + + vswp d5, d8 @Q2 = x4, Q4 = x6 + vswp d7, d10 @Q3 = x5, Q5 = x7 + + + vadd.s32 q12, q2, q5 @x0 = x4+x7 + vadd.s32 q13, q3, q4 @x1 = x5+x6 + vsub.s32 q14, q3, q4 @x2 = x5-x6 + vsub.s32 q15, q2, q5 @x3 = x4-x7 + + vadd.s32 q0, q12, q13 @pi4_tmp_ptr[0] = x0 + x1 + vadd.s32 q1, q15, q14 @pi4_tmp_ptr[1] = x3 + x2 + vsub.s32 q2, q12, q13 @pi4_tmp_ptr[2] = x0 - x1 + vsub.s32 q3, q15, q14 @pi4_tmp_ptr[3] = x3 - x2 + + + vmul.s32 q0, q0, q9 @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3 + vmul.s32 q1, q1, q9 @ Q1 = p[i] = (x[i] * 
trns_coeff[i]) where i = 4..7 + vmul.s32 q2, q2, q9 @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11 + vmul.s32 q3, q3, q9 @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15 + + vshl.s32 q0, q0, q10 @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3 + vshl.s32 q1, q1, q10 @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7 + vshl.s32 q2, q2, q10 @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11 + vshl.s32 q3, q3, q10 @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15 + + vqrshrn.s32 d0, q0, #0x6 @ D0 = c[i] = ((q[i] + 32) >> 4) where i = 0..3 + vqrshrn.s32 d1, q1, #0x6 @ D1 = c[i] = ((q[i] + 32) >> 4) where i = 4..7 + vqrshrn.s32 d2, q2, #0x6 @ D2 = c[i] = ((q[i] + 32) >> 4) where i = 8..11 + vqrshrn.s32 d3, q3, #0x6 @ D3 = c[i] = ((q[i] + 32) >> 4) where i = 12..15 + + vst1.s16 {d0, d1, d2, d3}, [r1] @IV row store the value + + vpop {d8-d15} + ldmfd sp!, {r4-r12, r15} @Reload the registers from SP + + + +@ ******************************************************************************* +@ */ +@ * @brief This function performs a 2x2 inverse hadamard transform for chroma block +@ * +@ * @par Description: +@ * The DC coefficients pass through a 2-stage inverse hadamard transform. +@ * This inverse transformed content is scaled to based on Qp value. +@ * Both DC blocks of U and v blocks are processesd +@ * +@ * @param[in] pi2_src +@ * input 1x8 block of ceffs. 
First 4 are from U and next from V +@ * +@ * @param[out] pi2_out +@ * output 1x8 block +@ * +@ * @param[in] pu2_iscal_mat +@ * pointer to scaling list +@ * +@ * @param[in] pu2_weigh_mat +@ * pointer to weight matrix +@ * +@ * @param[in] u4_qp_div_6 +@ * Floor (qp/6) +@ * +@ * @returns none +@ * +@ * @remarks none +@ * +@ ******************************************************************************* +@ */ +@ * +@ ******************************************************************************* +@ */ +@ void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src, +@ WORD16* pi2_out, +@ const UWORD16 *pu2_iscal_mat, +@ const UWORD16 *pu2_weigh_mat, +@ UWORD32 u4_qp_div_6, + + .global ih264_ihadamard_scaling_2x2_uv_a9 +ih264_ihadamard_scaling_2x2_uv_a9: + +@Registers used +@ r0 : *pi2_src +@ r1 : *pi2_out +@ r2 : *pu2_iscal_mat +@ r3 : *pu2_weigh_mat + + vld1.u16 d26[0], [r2] + vld1.u16 d27[0], [r3] + vmull.u16 q15, d26, d27 @pu2_iscal_mat[0] * pu2_weigh_mat[0] + vdup.u32 q15, d30[0] + + vld1.u16 d28[0], [sp] @load qp/6 + + vpush {d8-d15} + + vmov.u16 d29, #5 + vsubl.u16 q14, d28, d29 @qp\6 - 5 + vdup.s32 q14, d28[0] + + vld2.s16 {d0, d1}, [r0] @load 8 dc coeffs + @i2_x4,i2_x6,i2_y4,i1_y6 -> d0 + @i2_x5,i2_x7,i2_y5,i1_y6 -> d1 + + vaddl.s16 q1, d0, d1 @ i4_x0 = i4_x4 + i4_x5;...x2 + vsubl.s16 q2, d0, d1 @ i4_x1 = i4_x4 - i4_x5;...x3 + + vtrn.s32 q1, q2 @i4_x0 i4_x1 -> q1 + + vadd.s32 q3, q1, q2 @i4_x4 = i4_x0+i4_x2;.. i4_x5 + vsub.s32 q1, q1, q2 @i4_x6 = i4_x0-i4_x2;.. 
i4_x7 + + vmul.s32 q5, q3, q15 + vmul.s32 q6, q1, q15 + + vshl.s32 q7, q5, q14 + vshl.s32 q8, q6, q14 + + vmovn.s32 d18, q7 @i4_x4 i4_x5 i4_y4 i4_y5 + vmovn.s32 d19, q8 @i4_x6 i4_x7 i4_y6 i4_y7 + + vst2.s32 {d18-d19}, [r1] + + vpop {d8-d15} + bx lr + + diff --git a/common/arm/ih264_inter_pred_chroma_a9q.s b/common/arm/ih264_inter_pred_chroma_a9q.s new file mode 100755 index 0000000..afd2860 --- /dev/null +++ b/common/arm/ih264_inter_pred_chroma_a9q.s @@ -0,0 +1,254 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_inter_pred_chroma_a9q.s +@* +@* @brief +@* Contains function definitions for inter prediction interpolation. 
+@* +@* @author +@* Ittaim +@* +@* @par List of Functions: +@* +@* - ih264_inter_pred_chroma_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_inter_pred_filters.c +@ + +@/** +@/** +@/** +@ +@/** +@******************************************************************************* +@* +@* @brief +@* Interprediction chroma filter +@* +@* @par Description: +@* Applies filtering to chroma samples as mentioned in +@* sec 8.4.2.2.2 titled "chroma sample interpolation process" +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source containing alternate U and V samples +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in]uc_dx +@* dx value where the sample is to be produced(refer sec 8.4.2.2.2 ) +@* +@* @param[in] uc_dy +@* dy value where the sample is to be produced(refer sec 8.4.2.2.2 ) +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@void ih264_inter_pred_chroma(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ UWORD8 u1_dx, +@ UWORD8 u1_dy, +@ WORD32 ht, +@ WORD32 wd) +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => u1_dx +@ r5 => u1_dy +@ r6 => height +@ r7 => width +@ +.text +.p2align 2 + + .global ih264_inter_pred_chroma_a9q + +ih264_inter_pred_chroma_a9q: + + + + stmfd sp!, {r4-r12, r14} @store register values to stack + vstmdb sp!, {d8-d15} @push neon registers to stack + ldr r4, [sp, #104] + ldr r5, [sp, #108] + ldr r6, [sp, #112] + ldr r7, [sp, #116] + + rsb r8, 
r4, #8 @8-u1_dx + rsb r9, r5, #8 @8-u1_dy + mul r10, r8, r9 + mul r11, r4, r9 + + vdup.u8 d28, r10 + vdup.u8 d29, r11 + + mul r10, r8, r5 + mul r11, r4, r5 + + vdup.u8 d30, r10 + vdup.u8 d31, r11 + + subs r12, r7, #2 @if wd=4 branch to loop_4 + beq loop_2 + subs r12, r7, #4 @if wd=8 branch to loop_8 + beq loop_4 + +loop_8: + sub r6, #1 + vld1.8 {d0, d1, d2}, [r0], r2 @ Load row0 + vld1.8 {d5, d6, d7}, [r0], r2 @ Load row1 + vext.8 d3, d0, d1, #2 + vext.8 d8, d5, d6, #2 + + vmull.u8 q5, d0, d28 + vmlal.u8 q5, d5, d30 + vmlal.u8 q5, d3, d29 + vmlal.u8 q5, d8, d31 + vext.8 d9, d6, d7, #2 + vext.8 d4, d1, d2, #2 + +inner_loop_8: + vmull.u8 q6, d6, d30 + vmlal.u8 q6, d1, d28 + vmlal.u8 q6, d9, d31 + vmlal.u8 q6, d4, d29 + vmov d0, d5 + vmov d3, d8 + + vqrshrun.s16 d14, q5, #6 + vmov d1, d6 + vmov d4, d9 + + vld1.8 {d5, d6, d7}, [r0], r2 @ Load row1 + vqrshrun.s16 d15, q6, #6 + + vext.8 d8, d5, d6, #2 + subs r6, #1 + vext.8 d9, d6, d7, #2 + vst1.8 {q7}, [r1], r3 @ Store dest row + + vmull.u8 q5, d0, d28 + vmlal.u8 q5, d5, d30 + vmlal.u8 q5, d3, d29 + vmlal.u8 q5, d8, d31 + bne inner_loop_8 + + vmull.u8 q6, d6, d30 + vmlal.u8 q6, d1, d28 + vmlal.u8 q6, d9, d31 + vmlal.u8 q6, d4, d29 + + vqrshrun.s16 d14, q5, #6 + vqrshrun.s16 d15, q6, #6 + + vst1.8 {q7}, [r1], r3 @ Store dest row + + b end_func + +loop_4: + sub r6, #1 + vld1.8 {d0, d1}, [r0], r2 @ Load row0 + vld1.8 {d2, d3}, [r0], r2 @ Load row1 + vext.8 d1, d0, d1, #2 + vext.8 d3, d2, d3, #2 + + vmull.u8 q2, d2, d30 + vmlal.u8 q2, d0, d28 + vmlal.u8 q2, d3, d31 + vmlal.u8 q2, d1, d29 + +inner_loop_4: + subs r6, #1 + vmov d0, d2 + vmov d1, d3 + + vld1.8 {d2, d3}, [r0], r2 @ Load row1 + vqrshrun.s16 d6, q2, #6 + + vext.8 d3, d2, d3, #2 + vst1.8 {d6}, [r1], r3 @ Store dest row + + vmull.u8 q2, d0, d28 + vmlal.u8 q2, d2, d30 + vmlal.u8 q2, d1, d29 + vmlal.u8 q2, d3, d31 + bne inner_loop_4 + + vqrshrun.s16 d6, q2, #6 + vst1.8 {d6}, [r1], r3 @ Store dest row + + b end_func + +loop_2: + vld1.8 {d0}, [r0], r2 @ Load row0 + 
vext.8 d1, d0, d0, #2 + vld1.8 {d2}, [r0], r2 @ Load row1 + vext.8 d3, d2, d2, #2 + vmull.u8 q2, d0, d28 + vmlal.u8 q2, d1, d29 + vmlal.u8 q2, d2, d30 + vmlal.u8 q2, d3, d31 + vld1.8 {d6}, [r0] @ Load row2 + vqrshrun.s16 d4, q2, #6 + vext.8 d7, d6, d6, #2 + vst1.32 d4[0], [r1], r3 @ Store dest row0 + vmull.u8 q4, d2, d28 + vmlal.u8 q4, d3, d29 + vmlal.u8 q4, d6, d30 + vmlal.u8 q4, d7, d31 + subs r6, #2 + vqrshrun.s16 d8, q4, #6 + vst1.32 d8[0], [r1], r3 @ Store dest row1 + bne loop_2 @ repeat if ht=2 + +end_func: + vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, pc} @ Restoring registers from stack + diff --git a/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s b/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s new file mode 100755 index 0000000..ea6bba0 --- /dev/null +++ b/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s @@ -0,0 +1,245 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_inter_pred_luma_horz_a9q.s +@* +@* @brief +@* Contains function definitions for inter prediction interpolation. +@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* +@* - ih264_inter_pred_luma_horz_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_inter_pred_filters.c +@ + +@/** +@/** +@******************************************************************************* +@* +@* @brief +@* Interprediction luma filter for horizontal input +@* +@* @par Description: +@* Applies a 6 tap horizontal filter .The output is clipped to 8 bits +@* sec 8.4.2.2.1 titled "Luma sample interpolation process" +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* +@ @remarks +@* None +@* +@******************************************************************************* +@*/ + +@void ih264_inter_pred_luma_horz ( +@ UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ht, +@ WORD32 wd ) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r5 => ht +@ r6 => wd + +.text +.p2align 2 + + + .global ih264_inter_pred_luma_horz_a9q + +ih264_inter_pred_luma_horz_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + vstmdb sp!, {d8-d15} @push neon registers to stack + ldr r5, [sp, #104] @Loads ht + sub r0, r0, #2 @pu1_src-2 + ldr r6, [sp, #108] @Loads wd + vmov.i8 d0, #5 @filter coeff 
+ subs r12, r6, #8 @if wd=8 branch to loop_8 + vmov.i8 d1, #20 @filter coeff + beq loop_8 + + subs r12, r6, #4 @if wd=4 branch to loop_4 + beq loop_4 + +loop_16: @when wd=16 + @// Processing row0 and row1 + vld1.8 {d2, d3, d4}, [r0], r2 @// Load row0 ;for checking loop + vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0) + vld1.8 {d5, d6, d7}, [r0], r2 @// Load row1 + vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row0) + vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0) + vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1) + vaddl.u8 q5, d30, d3 @// a0 + a5 (column2,row0) + vext.8 d27, d6, d7, #5 @//extract a[5] (column2,row1) + vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1) + vext.8 d31, d2, d3, #2 @//extract a[2] (column1,row0) + vaddl.u8 q8, d27, d6 @// a0 + a5 (column2,row1) + vext.8 d30, d3, d4, #2 @//extract a[2] (column2,row0) + vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 (column1,row0) + vext.8 d28, d5, d6, #2 @//extract a[2] (column1,row1) + vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 (column2,row0) + vext.8 d27, d6, d7, #2 @//extract a[2] (column2,row1) + vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 (column1,row1) + vext.8 d31, d2, d3, #3 @//extract a[3] (column1,row0) + vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 (column2,row1) + vext.8 d30, d3, d4, #3 @//extract a[3] (column2,row0) + vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vext.8 d28, d5, d6, #3 @//extract a[3] (column1,row1) + vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0) + vext.8 d27, d6, d7, #3 @//extract a[3] (column2,row1) + vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1) + vext.8 d31, d2, d3, #1 @//extract a[1] (column1,row0) + vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row1) + vext.8 d30, d3, d4, #1 @//extract a[1] (column2,row0) + vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vext.8 d28, d5, d6, #1 @//extract a[1] (column1,row1) + vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + vext.8 d27, d6, 
d7, #1 @//extract a[1] (column2,row1) + vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) + vext.8 d31, d2, d3, #4 @//extract a[4] (column1,row0) + vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1) + vext.8 d30, d3, d4, #4 @//extract a[4] (column2,row0) + vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + vext.8 d28, d5, d6, #4 @//extract a[4] (column1,row1) + vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + vext.8 d27, d6, d7, #4 @//extract a[4] (column2,row1) + vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) + vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1) + vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vqrshrun.s16 d21, q5, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row2) + vst1.8 {d20, d21}, [r1], r3 @//Store dest row0 + vqrshrun.s16 d23, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) + vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row2) + vqrshrun.s16 d24, q8, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1) + vst1.8 {d23, d24}, [r1], r3 @//Store dest row1 + subs r5, r5, #2 @ 2 rows done, decrement by 2 + + beq end_func + b loop_16 @ loop if height == 8 or 16 + +loop_8: +@// Processing row0 and row1 + vld1.8 {d5, d6}, [r0], r2 @// Load row1 + vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1) + vld1.8 {d2, d3}, [r0], r2 @// Load row0 + vext.8 d25, d5, d6, #2 @//extract a[2] (column1,row1) + vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0) + vext.8 d24, d5, d6, #3 @//extract a[3] (column1,row1) + vext.8 d23, d5, d6, #1 @//extract a[1] (column1,row1) + vext.8 d22, d5, d6, #4 @//extract a[4] (column1,row1) + vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1) + vext.8 d29, d2, d3, #3 @//extract a[3] (column1,row0) + vmlal.u8 q7, d25, d1 @// a0 + a5 + 20a2 
(column1,row1) + vmlal.u8 q7, d24, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1) + vmlsl.u8 q7, d23, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) + vmlsl.u8 q7, d22, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) + vext.8 d30, d2, d3, #2 @//extract a[2] (column1,row0) + vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0) + vext.8 d27, d2, d3, #1 @//extract a[1] (column1,row0) + vext.8 d26, d2, d3, #4 @//extract a[4] (column1,row0) + vmlal.u8 q4, d29, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlal.u8 q4, d30, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlsl.u8 q4, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q4, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + vqrshrun.s16 d23, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vst1.8 {d23}, [r1], r3 @//Store dest row0 + vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) + vst1.8 {d20}, [r1], r3 @//Store dest row1 + subs r5, r5, #2 @ 2 rows done, decrement by 2 + + beq end_func @ Branch if height==4 + + b loop_8 @looping if height =8 or 16 + +loop_4: + vld1.8 {d5, d6}, [r0], r2 @// Load row1 + vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1) + vld1.8 {d2, d3}, [r0], r2 @// Load row0 + vext.8 d25, d5, d6, #2 @//extract a[2] (column1,row1) + vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0) + vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1) + vext.8 d24, d5, d6, #3 @//extract a[3] (column1,row1) + vext.8 d23, d5, d6, #1 @//extract a[1] (column1,row1) + vext.8 d22, d5, d6, #4 @//extract a[4] (column1,row1) + vext.8 d29, d2, d3, #3 @//extract a[3] (column1,row0) + vmlal.u8 q7, d25, d1 @// a0 + a5 + 20a2 (column1,row1) + vmlal.u8 q7, d24, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1) + vmlsl.u8 q7, d23, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) + vmlsl.u8 q7, d22, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) + vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0) + vext.8 d30, d2, d3, #2 @//extract 
a[2] (column1,row0) + vext.8 d27, d2, d3, #1 @//extract a[1] (column1,row0) + vext.8 d26, d2, d3, #4 @//extract a[4] (column1,row0) + vmlal.u8 q4, d29, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlal.u8 q4, d30, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlsl.u8 q4, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q4, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + vqrshrun.s16 d23, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vst1.32 d23[0], [r1], r3 @//Store dest row0 + vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) + vst1.32 d20[0], [r1], r3 @//Store dest row1 + subs r5, r5, #2 @ 2 rows done, decrement by 2 + beq end_func + + b loop_4 + +end_func: + vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + diff --git a/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s b/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s new file mode 100755 index 0000000..5b29e02 --- /dev/null +++ b/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s @@ -0,0 +1,301 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. 
+@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_inter_pred_luma_vert_a9q.s +@* +@* @brief +@* Contains function definitions for inter prediction interpolation. +@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* +@* - ih264_inter_pred_luma_vert_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_inter_pred_filters.c +@ + +@/** +@/** +@/** +@ ******************************************************************************* +@ * +@ * @brief +@ * Interprediction luma filter for vertical input +@ * +@ * @par Description: +@ * Applies a 6 tap vertcal filter.The output is clipped to 8 bits +@ * sec 8.4.2.2.1 titled "Luma sample interpolation process" +@ * +@ * @param[in] pu1_src +@ * UWORD8 pointer to the source +@ * +@ * @param[out] pu1_dst +@ * UWORD8 pointer to the destination +@ * +@ * @param[in] src_strd +@ * integer source stride +@ * +@ * @param[in] dst_strd +@ * integer destination stride +@ * +@ * @param[in] ht +@ * integer height of the array +@ * +@ * @param[in] wd +@ * integer width of the array +@ * +@ * @returns +@ * +@ * @remarks +@ * None +@ * +@ ******************************************************************************* + +@void ih264_inter_pred_luma_vert ( +@ UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ht, +@ WORD32 wd ) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r5 => ht +@ r6 => wd + +.text +.p2align 2 + + + .global ih264_inter_pred_luma_vert_a9q + +ih264_inter_pred_luma_vert_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to 
stack + vstmdb sp!, {d8-d15} @push neon registers to stack + ldr r5, [sp, #104] @Loads ht + sub r0, r0, r2, lsl #1 @pu1_src-2*src_strd + ldr r6, [sp, #108] @Loads wd + vmov.u16 q11, #20 @ Filter coeff 0x14 into Q11 + + subs r12, r6, #8 @if wd=8 branch to loop_8 + vmov.u16 q12, #5 @ Filter coeff 0x5 into Q12 + beq loop_8 + + subs r12, r6, #4 @if wd=4 branch to loop_4 + beq loop_4 + +loop_16: @when wd=16 + + vld1.u32 {q0}, [r0], r2 @ Vector load from src[0_0] + vld1.u32 {q1}, [r0], r2 @ Vector load from src[1_0] + vld1.u32 {q2}, [r0], r2 @ Vector load from src[2_0] + vld1.u32 {q3}, [r0], r2 @ Vector load from src[3_0] + vld1.u32 {q4}, [r0], r2 @ Vector load from src[4_0] + vaddl.u8 q6, d4, d6 @ temp1 = src[2_0] + src[3_0] + vld1.u32 {q5}, [r0], r2 @ Vector load from src[5_0] + + vaddl.u8 q7, d0, d10 @ temp = src[0_0] + src[5_0] + vaddl.u8 q8, d2, d8 @ temp2 = src[1_0] + src[4_0] + vmla.u16 q7, q6, q11 @ temp += temp1 * 20 + vaddl.u8 q10, d1, d11 @ temp4 = src[0_8] + src[5_8] + vaddl.u8 q9, d5, d7 @ temp3 = src[2_8] + src[3_8] + vmla.u16 q10, q9, q11 @ temp4 += temp3 * 20 + vld1.u32 {q0}, [r0], r2 + vaddl.u8 q13, d3, d9 @ temp5 = src[1_8] + src[4_8] + vaddl.u8 q6, d6, d8 + vmls.u16 q7, q8, q12 @ temp -= temp2 * 5 + vaddl.u8 q8, d2, d0 + vaddl.u8 q9, d4, d10 + vmla.u16 q8, q6, q11 + vmls.u16 q10, q13, q12 @ temp4 -= temp5 * 5 + vaddl.u8 q13, d5, d11 + vaddl.u8 q6, d7, d9 + vqrshrun.s16 d30, q7, #5 @ dst[0_0] = CLIP_U8((temp +16) >> 5) + vaddl.u8 q7, d3, d1 + vld1.u32 {q1}, [r0], r2 + vmla.u16 q7, q6, q11 + vmls.u16 q8, q9, q12 + vqrshrun.s16 d31, q10, #5 @ dst[0_8] = CLIP_U8((temp4 +16) >> 5) + vaddl.u8 q9, d4, d2 + vaddl.u8 q6, d8, d10 + + vst1.u32 {q15}, [r1], r3 @ Vector store to dst[0_0] + vmla.u16 q9, q6, q11 + vaddl.u8 q10, d6, d0 + vmls.u16 q7, q13, q12 + vqrshrun.s16 d30, q8, #5 + vaddl.u8 q6, d9, d11 + vaddl.u8 q8, d5, d3 + vaddl.u8 q13, d7, d1 + vmla.u16 q8, q6, q11 + vmls.u16 q9, q10, q12 + vld1.u32 {q2}, [r0], r2 + + vqrshrun.s16 d31, q7, #5 + vaddl.u8 q6, 
d10, d0 + vaddl.u8 q7, d6, d4 + vaddl.u8 q10, d8, d2 + vmla.u16 q7, q6, q11 + vmls.u16 q8, q13, q12 + vst1.u32 {q15}, [r1], r3 @store row 1 + vqrshrun.s16 d30, q9, #5 + vaddl.u8 q9, d7, d5 + vaddl.u8 q6, d11, d1 + vmla.u16 q9, q6, q11 + vaddl.u8 q13, d9, d3 + vmls.u16 q7, q10, q12 + + vqrshrun.s16 d31, q8, #5 + vmls.u16 q9, q13, q12 + vaddl.u8 q6, d0, d2 @ temp1 = src[2_0] + src[3_0] + vst1.u32 {q15}, [r1], r3 @store row 2 + vaddl.u8 q8, d10, d4 @ temp2 = src[1_0] + src[4_0] + vaddl.u8 q10, d9, d7 @ temp4 = src[0_8] + src[5_8] + vqrshrun.s16 d30, q7, #5 + vaddl.u8 q13, d5, d11 @ temp5 = src[1_8] + src[4_8] + vaddl.u8 q7, d8, d6 @ temp = src[0_0] + src[5_0] + vqrshrun.s16 d31, q9, #5 + vmla.u16 q7, q6, q11 @ temp += temp1 * 20 + vaddl.u8 q9, d1, d3 @ temp3 = src[2_8] + src[3_8] + vst1.u32 {q15}, [r1], r3 @store row 3 + subs r5, r5, #4 @ 4 rows processed, decrement by 4 + subne r0, r0 , r2, lsl #2 + subne r0, r0, r2 + beq end_func @ Branch if height==4 + + b loop_16 @ looping if height = 8 or 16 + +loop_8: +@// Processing row0 and row1 + + vld1.u32 d0, [r0], r2 @ Vector load from src[0_0] + vld1.u32 d1, [r0], r2 @ Vector load from src[1_0] + vld1.u32 d2, [r0], r2 @ Vector load from src[2_0] + vld1.u32 d3, [r0], r2 @ Vector load from src[3_0] + vld1.u32 d4, [r0], r2 @ Vector load from src[4_0] + vld1.u32 d5, [r0], r2 @ Vector load from src[5_0] + + vaddl.u8 q3, d2, d3 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q4, d0, d5 @ temp = src[0_0] + src[5_0] + vaddl.u8 q5, d1, d4 @ temp2 = src[1_0] + src[4_0] + vmla.u16 q4, q3, q11 @ temp += temp1 * 20 + vld1.u32 d6, [r0], r2 + vaddl.u8 q7, d3, d4 + vaddl.u8 q8, d1, d6 + vaddl.u8 q9, d2, d5 + vmls.u16 q4, q5, q12 @ temp -= temp2 * 5 + vmla.u16 q8, q7, q11 + vld1.u32 d7, [r0], r2 + vaddl.u8 q10, d4, d5 + vaddl.u8 q6, d2, d7 + vaddl.u8 q5, d3, d6 + vmls.u16 q8, q9, q12 + vqrshrun.s16 d26, q4, #5 @ dst[0_0] = CLIP_U8( (temp + 16) >> 5) + vmla.u16 q6, q10, q11 + vld1.u32 d0, [r0], r2 + vaddl.u8 q7, d5, d6 + vqrshrun.s16 d27, q8, #5 
+ vaddl.u8 q10, d3, d0 + vmls.u16 q6, q5, q12 + vst1.u32 d26, [r1], r3 @ Vector store to dst[0_0] + vaddl.u8 q9, d4, d7 + vmla.u16 q10, q7, q11 + vst1.u32 d27, [r1], r3 + vqrshrun.s16 d28, q6, #5 + vst1.u32 d28, [r1], r3 + vmls.u16 q10, q9, q12 + vqrshrun.s16 d29, q10, #5 + vst1.u32 d29, [r1], r3 @store row 3 + + subs r5, r5, #4 @ 4 rows processed, decrement by 4 + subne r0, r0 , r2, lsl #2 + subne r0, r0, r2 + beq end_func @ Branch if height==4 + + b loop_8 @looping if height == 8 or 16 + + +loop_4: +@// Processing row0 and row1 + + vld1.u32 d0[0], [r0], r2 @ Vector load from src[0_0] + vld1.u32 d1[0], [r0], r2 @ Vector load from src[1_0] + vld1.u32 d2[0], [r0], r2 @ Vector load from src[2_0] + vld1.u32 d3[0], [r0], r2 @ Vector load from src[3_0] + vld1.u32 d4[0], [r0], r2 @ Vector load from src[4_0] + vld1.u32 d5[0], [r0], r2 @ Vector load from src[5_0] + + vaddl.u8 q3, d2, d3 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q4, d0, d5 @ temp = src[0_0] + src[5_0] + vaddl.u8 q5, d1, d4 @ temp2 = src[1_0] + src[4_0] + vmla.u16 q4, q3, q11 @ temp += temp1 * 20 + vld1.u32 d6[0], [r0], r2 + vaddl.u8 q7, d3, d4 + vaddl.u8 q8, d1, d6 + vaddl.u8 q9, d2, d5 + vmls.u16 q4, q5, q12 @ temp -= temp2 * 5 + vld1.u32 d7[0], [r0], r2 + vmla.u16 q8, q7, q11 + vaddl.u8 q10, d4, d5 + vaddl.u8 q6, d2, d7 + vaddl.u8 q5, d3, d6 + vmls.u16 q8, q9, q12 + vqrshrun.s16 d26, q4, #5 @ dst[0_0] = CLIP_U8( (temp + 16) >> 5) + vmla.u16 q6, q10, q11 + vld1.u32 d0[0], [r0], r2 + vaddl.u8 q7, d5, d6 + vqrshrun.s16 d27, q8, #5 + vaddl.u8 q10, d3, d0 + vmls.u16 q6, q5, q12 + vst1.u32 d26[0], [r1], r3 @ Vector store to dst[0_0] + vaddl.u8 q9, d4, d7 + vmla.u16 q10, q7, q11 + vst1.u32 d27[0], [r1], r3 + vqrshrun.s16 d28, q6, #5 + vst1.u32 d28[0], [r1], r3 + vmls.u16 q10, q9, q12 + vqrshrun.s16 d29, q10, #5 + vst1.u32 d29[0], [r1], r3 @store row 3 + + subs r5, r5, #8 + subeq r0, r0, r2, lsl #2 + subeq r0, r0, r2 + beq loop_4 @ Loop if height==8 + +end_func: + vldmia sp!, {d8-d15} @ Restore neon registers that 
were saved + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + diff --git a/common/arm/ih264_inter_pred_luma_bilinear_a9q.s b/common/arm/ih264_inter_pred_luma_bilinear_a9q.s new file mode 100755 index 0000000..6a3c83d --- /dev/null +++ b/common/arm/ih264_inter_pred_luma_bilinear_a9q.s @@ -0,0 +1,398 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_inter_pred_luma_bilinear_a9q.s +@* +@* @brief +@* Contains function definitions for inter prediction interpolation. +@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* +@* - ih264_inter_pred_luma_bilinear_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_inter_pred_filters.c +@ + +@/** +@/** +@/** +@ ******************************************************************************* +@ * function:ih264_inter_pred_luma_bilinear +@ * +@* @brief +@* This routine applies the bilinear filter to the predictors . 
+@* The filtering operation is described in +@* sec 8.4.2.2.1 titled "Luma sample interpolation process" +@* +@* @par Description: +@\note +@* This function is called to obtain pixels lying at the following +@* locations (1/4,1), (3/4,1),(1,1/4), (1,3/4) ,(1/4,1/2), (3/4,1/2),(1/2,1/4), (1/2,3/4),(3/4,1/4),(1/4,3/4),(3/4,3/4)&& (1/4,1/4) . +@* The function averages the two adjacent values from the two input arrays in horizontal direction. +@* +@* +@* @param[in] pu1_src1: +@* UWORD8 Pointer to the buffer containing the first input array. +@* +@* @param[in] pu1_src2: +@* UWORD8 Pointer to the buffer containing the second input array. +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination where the output of bilinear filter is stored. +@* +@* @param[in] src_strd1 +@* Stride of the first input buffer +@* +@* @param[in] src_strd2 +@* Stride of the second input buffer +@* +@* @param[in] dst_strd +@* integer destination stride of pu1_dst +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@void ih264_inter_pred_luma_bilinear(UWORD8 *pu1_src1, +@ UWORD8 *pu1_src2, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd1, +@ WORD32 src_strd2, +@ WORD32 dst_strd, +@ WORD32 height, +@ WORD32 width) +@ +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src1 +@ r1 => *pu1_src2 +@ r2 => *pu1_dst +@ r3 => src_strd1 +@ r4 => src_strd2 +@ r5 => dst_strd +@ r6 => height +@ r7 => width +@ +.text +.p2align 2 + + .global ih264_inter_pred_luma_bilinear_a9q + +ih264_inter_pred_luma_bilinear_a9q: + + + + stmfd sp!, {r4-r12, r14} @store register values to stack + vstmdb sp!, {d8-d15} @push neon registers to stack + ldr r4, [sp, #104] + ldr r5, [sp, #108] @ + ldr r6, [sp, #112] + ldr r7, [sp, #116] + + subs r12, r7, #4 @if wd=4 branch to loop_4 + beq loop_4 + subs 
r12, r7, #8 @if wd=8 branch to loop_8 + beq loop_8 + +loop_16: @when wd=16 + + vld1.8 {q0}, [r0], r3 @// Load row0 ;src1 + vld1.8 {q2}, [r1], r4 @// Load row0 ;src2 + vld1.8 {q1}, [r0], r3 @// Load row1 ;src1 + vaddl.u8 q10, d0, d4 + vld1.8 {q3}, [r1], r4 @// Load row1 ;src2 + vaddl.u8 q11, d1, d5 + vld1.8 {q4}, [r0], r3 @// Load row2 ;src1 + vaddl.u8 q12, d2, d6 + vld1.8 {q5}, [r0], r3 @// Load row3 ;src1 + vaddl.u8 q13, d3, d7 + vld1.8 {q6}, [r1], r4 @// Load row2 ;src2 + vaddl.u8 q8, d8, d12 + vld1.8 {q7}, [r1], r4 @// Load row3 ;src2 + vaddl.u8 q9, d9, d13 + vqrshrun.s16 d28, q10, #1 + vqrshrun.s16 d29, q11, #1 + vaddl.u8 q10, d10, d14 + vqrshrun.s16 d30, q12, #1 + vqrshrun.s16 d31, q13, #1 + vst1.8 {q14}, [r2], r5 @//Store dest row0 + vaddl.u8 q11, d11, d15 + vst1.8 {q15}, [r2], r5 @//Store dest row1 + vqrshrun.s16 d28, q8, #1 + vld1.8 {q0}, [r0], r3 @// Load row4 ;src1 + vqrshrun.s16 d29, q9, #1 + vld1.8 {q1}, [r0], r3 @// Load row5 ;src1 + vqrshrun.s16 d30, q10, #1 + vld1.8 {q2}, [r1], r4 @// Load row4 ;src2 + vqrshrun.s16 d31, q11, #1 + vld1.8 {q3}, [r1], r4 @// Load row5 ;src2 + vaddl.u8 q10, d0, d4 + vst1.8 {q14}, [r2], r5 @//Store dest row2 + vaddl.u8 q13, d3, d7 + vst1.8 {q15}, [r2], r5 @//Store dest row3 + vaddl.u8 q11, d1, d5 + vld1.8 {q4}, [r0], r3 @// Load row6 ;src1 + vaddl.u8 q12, d2, d6 + vld1.8 {q5}, [r0], r3 @// Load row7 ;src1 + vqrshrun.s16 d28, q10, #1 + vld1.8 {q6}, [r1], r4 @// Load row6 ;src2 + vqrshrun.s16 d29, q11, #1 + vld1.8 {q7}, [r1], r4 @// Load row7 ;src2 + vaddl.u8 q8, d8, d12 + vaddl.u8 q9, d9, d13 + vaddl.u8 q10, d10, d14 + vqrshrun.s16 d30, q12, #1 + vqrshrun.s16 d31, q13, #1 + vst1.8 {q14}, [r2], r5 @//Store dest row4 + vaddl.u8 q11, d11, d15 + vst1.8 {q15}, [r2], r5 @//Store dest row5 + vqrshrun.s16 d28, q8, #1 + vqrshrun.s16 d30, q10, #1 + vqrshrun.s16 d29, q9, #1 + vld1.8 {q2}, [r1], r4 @// Load row8 ;src2 + vqrshrun.s16 d31, q11, #1 + vst1.8 {q14}, [r2], r5 @//Store dest row6 + subs r12, r6, #8 + vst1.8 {q15}, [r2], r5 
@//Store dest row7 + + beq end_func @ end function if ht=8 + + vld1.8 {q0}, [r0], r3 @// Load row8 ;src1 + vaddl.u8 q10, d0, d4 + vld1.8 {q1}, [r0], r3 @// Load row9 ;src1 + vaddl.u8 q11, d1, d5 + vld1.8 {q3}, [r1], r4 @// Load row9 ;src2 + vqrshrun.s16 d28, q10, #1 + vld1.8 {q4}, [r0], r3 @// Load row10 ;src1 + vqrshrun.s16 d29, q11, #1 + vld1.8 {q5}, [r0], r3 @// Load row11 ;src1 + vaddl.u8 q12, d2, d6 + vld1.8 {q6}, [r1], r4 @// Load row10 ;src2 + vaddl.u8 q13, d3, d7 + vld1.8 {q7}, [r1], r4 @// Load row11 ;src2 + vaddl.u8 q8, d8, d12 + vaddl.u8 q9, d9, d13 + vaddl.u8 q10, d10, d14 + vqrshrun.s16 d30, q12, #1 + vst1.8 {q14}, [r2], r5 @//Store dest row8 + vqrshrun.s16 d31, q13, #1 + vst1.8 {q15}, [r2], r5 @//Store dest row9 + vqrshrun.s16 d28, q8, #1 + vld1.8 {q0}, [r0], r3 @// Load row12 ;src1 + vaddl.u8 q11, d11, d15 + vld1.8 {q1}, [r0], r3 @// Load row13 ;src1 + vqrshrun.s16 d29, q9, #1 + vld1.8 {q2}, [r1], r4 @// Load row12 ;src2 + vqrshrun.s16 d30, q10, #1 + vld1.8 {q3}, [r1], r4 @// Load row13 ;src2 + vqrshrun.s16 d31, q11, #1 + vst1.8 {q14}, [r2], r5 @//Store dest row10 + vaddl.u8 q10, d0, d4 + vst1.8 {q15}, [r2], r5 @//Store dest row11 + vaddl.u8 q11, d1, d5 + vld1.8 {q4}, [r0], r3 @// Load row14 ;src1 + vaddl.u8 q13, d3, d7 + vld1.8 {q5}, [r0], r3 @// Load row15 ;src1 + vaddl.u8 q12, d2, d6 + vld1.8 {q6}, [r1], r4 @// Load row14 ;src2 + vaddl.u8 q8, d8, d12 + vld1.8 {q7}, [r1], r4 @// Load row15 ;src2 + vaddl.u8 q9, d9, d13 + vqrshrun.s16 d28, q10, #1 + vqrshrun.s16 d29, q11, #1 + vaddl.u8 q10, d10, d14 + vst1.8 {q14}, [r2], r5 @//Store dest row12 + vqrshrun.s16 d30, q12, #1 + vqrshrun.s16 d31, q13, #1 + vaddl.u8 q11, d11, d15 + vst1.8 {q15}, [r2], r5 @//Store dest row13 + vqrshrun.s16 d28, q8, #1 + vqrshrun.s16 d29, q9, #1 + vqrshrun.s16 d30, q10, #1 + vst1.8 {q14}, [r2], r5 @//Store dest row14 + vqrshrun.s16 d31, q11, #1 + vst1.8 {q15}, [r2], r5 @//Store dest row15 + b end_func + + + +loop_8: @wd=8; + vld1.8 {d0}, [r0], r3 @// Load row0 ;src1 + vld1.8 
{d4}, [r1], r4 @// Load row0 ;src2 + vld1.8 {d1}, [r0], r3 @// Load row1 ;src1 + vaddl.u8 q10, d0, d4 + vld1.8 {d5}, [r1], r4 @// Load row1 ;src2 + vld1.8 {d2}, [r0], r3 @// Load row2 ;src1 + vqrshrun.s16 d28, q10, #1 + vld1.8 {d6}, [r1], r4 @// Load row2 ;src2 + vaddl.u8 q11, d1, d5 + vld1.8 {d3}, [r0], r3 @// Load row3 ;src1 + vaddl.u8 q12, d2, d6 + vst1.8 {d28}, [r2], r5 @//Store dest row0 + vqrshrun.s16 d29, q11, #1 + vld1.8 {d7}, [r1], r4 @// Load row3 ;src2 + vqrshrun.s16 d30, q12, #1 + vst1.8 {d29}, [r2], r5 @//Store dest row1 + vaddl.u8 q13, d3, d7 + vst1.8 {d30}, [r2], r5 @//Store dest row2 + vqrshrun.s16 d31, q13, #1 + subs r12, r6, #4 + vst1.8 {d31}, [r2], r5 @//Store dest row3 + beq end_func @ end function if ht=4 + + vld1.8 {d12}, [r1], r4 @// Load row4 ;src2 + vld1.8 {d8}, [r0], r3 @// Load row4 ;src1 + vld1.8 {d9}, [r0], r3 @// Load row5 ;src1 + vaddl.u8 q8, d8, d12 + vld1.8 {d13}, [r1], r4 @// Load row5 ;src2 + vld1.8 {d10}, [r0], r3 @// Load row6;src1 + vaddl.u8 q9, d9, d13 + vld1.8 {d14}, [r1], r4 @// Load row6 ;src2 + vqrshrun.s16 d28, q8, #1 + vld1.8 {d11}, [r0], r3 @// Load row7 ;src1 + vqrshrun.s16 d29, q9, #1 + vst1.8 {d28}, [r2], r5 @//Store dest row4 + vaddl.u8 q10, d10, d14 + vst1.8 {d29}, [r2], r5 @//Store dest row5 + vqrshrun.s16 d30, q10, #1 + vld1.8 {d15}, [r1], r4 @// Load row7 ;src2 + vaddl.u8 q11, d11, d15 + vst1.8 {d30}, [r2], r5 @//Store dest row6 + vqrshrun.s16 d31, q11, #1 + subs r12, r6, #8 + vst1.8 {d31}, [r2], r5 @//Store dest row7 + beq end_func @ end function if ht=8 + + vld1.8 {d0}, [r0], r3 @// Load row8 ;src1 + vld1.8 {d4}, [r1], r4 @// Load row8 ;src2 + vld1.8 {d1}, [r0], r3 @// Load row9 ;src1 + vaddl.u8 q10, d0, d4 + vld1.8 {d5}, [r1], r4 @// Load row9 ;src2 + vld1.8 {d2}, [r0], r3 @// Load row10 ;src1 + vaddl.u8 q11, d1, d5 + vld1.8 {d6}, [r1], r4 @// Load row10 ;src2 + vqrshrun.s16 d28, q10, #1 + vld1.8 {d3}, [r0], r3 @// Load row11 ;src1 + vaddl.u8 q12, d2, d6 + vld1.8 {d7}, [r1], r4 @// Load row11 ;src2 + 
vqrshrun.s16 d29, q11, #1 + vld1.8 {d8}, [r0], r3 @// Load row12 ;src1 + vaddl.u8 q13, d3, d7 + vst1.8 {d28}, [r2], r5 @//Store dest row8 + vqrshrun.s16 d30, q12, #1 + vld1.8 {d12}, [r1], r4 @// Load row12 ;src2 + vqrshrun.s16 d31, q13, #1 + vst1.8 {d29}, [r2], r5 @//Store dest row9 + vaddl.u8 q8, d8, d12 + vld1.8 {d9}, [r0], r3 @// Load row13 ;src1 + vqrshrun.s16 d28, q8, #1 + vld1.8 {d13}, [r1], r4 @// Load row13 ;src2 + vld1.8 {d10}, [r0], r3 @// Load row14;src1 + vaddl.u8 q9, d9, d13 + vld1.8 {d11}, [r0], r3 @// Load row15 ;src1 + vld1.8 {d14}, [r1], r4 @// Load row14 ;src2 + vqrshrun.s16 d29, q9, #1 + vld1.8 {d15}, [r1], r4 @// Load roW15 ;src2 + vaddl.u8 q10, d10, d14 + vst1.8 {d30}, [r2], r5 @//Store dest row10 + vaddl.u8 q11, d11, d15 + vst1.8 {d31}, [r2], r5 @//Store dest row11 + vqrshrun.s16 d30, q10, #1 + vst1.8 {d28}, [r2], r5 @//Store dest row12 + vqrshrun.s16 d31, q11, #1 + vst1.8 {d29}, [r2], r5 @//Store dest row13 + vst1.8 {d30}, [r2], r5 @//Store dest row14 + vst1.8 {d31}, [r2], r5 @//Store dest row15 + + b end_func + + + +loop_4: + vld1.32 d0[0], [r0], r3 @// Load row0 ;src1 + vld1.32 d4[0], [r1], r4 @// Load row0 ;src2 + vld1.32 d1[0], [r0], r3 @// Load row1 ;src1 + vaddl.u8 q10, d0, d4 + vld1.32 d5[0], [r1], r4 @// Load row1 ;src2 + vld1.32 d2[0], [r0], r3 @// Load row2 ;src1 + vqrshrun.s16 d28, q10, #1 + vld1.32 d6[0], [r1], r4 @// Load row2 ;src2 + vaddl.u8 q11, d1, d5 + vld1.32 d3[0], [r0], r3 @// Load row3 ;src1 + vaddl.u8 q12, d2, d6 + vst1.32 d28[0], [r2], r5 @//Store dest row0 + vqrshrun.s16 d29, q11, #1 + vld1.32 d7[0], [r1], r4 @// Load row3 ;src2 + vqrshrun.s16 d30, q12, #1 + vst1.32 d29[0], [r2], r5 @//Store dest row1 + vaddl.u8 q13, d3, d7 + vst1.32 d30[0], [r2], r5 @//Store dest row2 + vqrshrun.s16 d31, q13, #1 + subs r12, r6, #4 + vst1.32 d31[0], [r2], r5 @//Store dest row3 + beq end_func @ end function if ht=4 + + vld1.32 d12[0], [r1], r4 @// Load row4 ;src2 + vld1.32 d8[0], [r0], r3 @// Load row4 ;src1 + vld1.32 d9[0], [r0], r3 
@// Load row5 ;src1 + vaddl.u8 q8, d8, d12 + vld1.32 d13[0], [r1], r4 @// Load row5 ;src2 + vld1.32 d10[0], [r0], r3 @// Load row6;src1 + vaddl.u8 q9, d9, d13 + vld1.32 d14[0], [r1], r4 @// Load row6 ;src2 + vqrshrun.s16 d28, q8, #1 + vld1.32 d11[0], [r0], r3 @// Load row7 ;src1 + vqrshrun.s16 d29, q9, #1 + vst1.32 d28[0], [r2], r5 @//Store dest row4 + vaddl.u8 q10, d10, d14 + vst1.32 d29[0], [r2], r5 @//Store dest row5 + vqrshrun.s16 d30, q10, #1 + vld1.32 d15[0], [r1], r4 @// Load row7 ;src2 + vaddl.u8 q11, d11, d15 + vst1.32 d30[0], [r2], r5 @//Store dest row6 + vqrshrun.s16 d31, q11, #1 + vst1.32 d31[0], [r2], r5 @//Store dest row7 + +end_func: + + vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + diff --git a/common/arm/ih264_inter_pred_luma_copy_a9q.s b/common/arm/ih264_inter_pred_luma_copy_a9q.s new file mode 100755 index 0000000..8ba2fbf --- /dev/null +++ b/common/arm/ih264_inter_pred_luma_copy_a9q.s @@ -0,0 +1,253 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +@*/ +@/** +@/** +@******************************************************************************* +@* +@* @brief +@* Interprediction luma function for copy +@* +@* @par Description: +@* Copies the array of width 'wd' and height 'ht' from the location pointed +@* by 'src' to the location pointed by 'dst' +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@void ih264_inter_pred_luma_copy ( +@ UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ht, +@ WORD32 wd ) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r7 => ht +@ r12 => wd + +.text +.p2align 2 + + .global ih264_inter_pred_luma_copy_a9q + +ih264_inter_pred_luma_copy_a9q: + stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vstmdb sp!, {d8-d15} @push neon registers to stack + ldr r12, [sp, #108] @Loads wd + ldr r7, [sp, #104] @Loads ht + cmp r7, #0 @checks ht == 0 + ble end_loops + tst r12, #15 @checks wd for multiples for 4 & 8 + beq core_loop_wd_16 + tst r12, #7 @checks wd for multiples for 4 & 8 + beq core_loop_wd_8 + sub r11, r12, #4 + +outer_loop_wd_4: + subs r4, r12, #0 @checks wd == 0 + ble end_inner_loop_wd_4 + +inner_loop_wd_4: + vld1.32 {d0[0]}, [r0] @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0) + add r5, r0, r2 @pu1_src_tmp += src_strd + add r6, r1, r3 @pu1_dst_tmp += dst_strd + vst1.32 {d0[0]}, [r1] @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0) + vld1.32 {d0[0]}, [r5], r2 
@vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0) + add r0, r0, #4 @pu1_src += 4 + vst1.32 {d0[0]}, [r6], r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0) + vld1.32 {d0[0]}, [r5], r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0) + subs r4, r4, #4 @(wd -4) + vst1.32 {d0[0]}, [r6], r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0) + vld1.32 {d0[0]}, [r5], r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0) + add r1, r1, #4 @pu1_dst += 4 + vst1.32 {d0[0]}, [r6], r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0) + + bgt inner_loop_wd_4 + +end_inner_loop_wd_4: + subs r7, r7, #4 @ht - 4 + sub r0, r5, r11 @pu1_src = pu1_src_tmp + sub r1, r6, r11 @pu1_dst = pu1_dst_tmp + bgt outer_loop_wd_4 + +end_loops: + vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, r15} @Reload the registers from SP + + + +core_loop_wd_8: + sub r11, r12, #8 + +outer_loop_wd_8: + subs r4, r12, #0 @checks wd + ble end_inner_loop_wd_8 + +inner_loop_wd_8: + add r5, r0, r2 @pu1_src_tmp += src_strd + vld1.8 {d0}, [r0]! @vld1_u8(pu1_src_tmp) + add r6, r1, r3 @pu1_dst_tmp += dst_strd + vst1.8 {d0}, [r1]! 
@vst1_u8(pu1_dst_tmp, tmp_src) + vld1.8 {d1}, [r5], r2 @vld1_u8(pu1_src_tmp) + vst1.8 {d1}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src) + subs r4, r4, #8 @wd - 8(Loop condition) + vld1.8 {d2}, [r5], r2 @vld1_u8(pu1_src_tmp) + vst1.8 {d2}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src) + vld1.8 {d3}, [r5], r2 @vld1_u8(pu1_src_tmp) + vst1.8 {d3}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src) + bgt inner_loop_wd_8 + +end_inner_loop_wd_8: + subs r7, r7, #4 @ht -= 4 + sub r0, r5, r11 @pu1_src = pu1_src_tmp + sub r1, r6, r11 @pu1_dst = pu1_dst_tmp + bgt outer_loop_wd_8 + + vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, r15} @Reload the registers from SP + +core_loop_wd_16: + sub r11, r12, #16 + +outer_loop_wd_16: + subs r4, r12, #0 @checks wd + ble end_inner_loop_wd_16 + +inner_loop_wd_16: + add r5, r0, r2 @pu1_src_tmp += src_strd + vld1.8 {q0}, [r0]! @vld1_u8(pu1_src_tmp) + add r6, r1, r3 @pu1_dst_tmp += dst_strd + vst1.8 {q0}, [r1]! @vst1_u8(pu1_dst_tmp, tmp_src) + vld1.8 {q1}, [r5], r2 @vld1_u8(pu1_src_tmp) + vst1.8 {q1}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src) + subs r4, r4, #16 @wd - 8(Loop condition) + vld1.8 {q2}, [r5], r2 @vld1_u8(pu1_src_tmp) + vst1.8 {q2}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src) + vld1.8 {q3}, [r5], r2 @vld1_u8(pu1_src_tmp) + vst1.8 {q3}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src) + bgt inner_loop_wd_16 + +end_inner_loop_wd_16: + subs r7, r7, #4 @ht -= 4 + sub r0, r5, r11 @pu1_src = pu1_src_tmp + sub r1, r6, r11 @pu1_dst = pu1_dst_tmp + bgt outer_loop_wd_16 + + vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, r15} @Reload the registers from SP + + +@ /* +@ ******************************************************************************** +@ * +@ * @brief This function copies a 4x4 block to destination +@ * +@ * @par Description: +@ * Copies a 4x4 block to destination, where both src and dst are interleaved +@ * +@ * @param[in] pi2_src +@ * Source +@ * +@ * @param[in] pu1_out +@ * Output pointer 
+@ * +@ * @param[in] pred_strd, +@ * Prediction buffer stride +@ * +@ * @param[in] out_strd +@ * output buffer buffer Stride +@ * +@ * @returns none +@ * +@ * @remarks none +@ * Currently wd and height is not used, ie a 4x4 block is always copied +@ * +@ ******************************************************************************* +@ */ +@ void ih264_interleave_copy(WORD16 *pi2_src, +@ UWORD8 *pu1_out, +@ WORD32 pred_strd, +@ WORD32 out_strd +@ WORD32 wd +@ WORD32 ht) +@ Register Usage +@ r0 : pi2_src +@ r1 : pu1_out +@ r2 : src_strd +@ r3 : out_strd +@ Neon registers d0-d7, d16-d30 are used +@ No need for pushing arm and neon registers + + .global ih264_interleave_copy_a9 +ih264_interleave_copy_a9: + + vld1.u8 d2, [r0], r2 @load src plane 1 => d2 &pred palne 2 => d3 + vld1.u8 d3, [r0], r2 + vld1.u8 d4, [r0], r2 + vld1.u8 d5, [r0], r2 + + mov r0, r1 + + vld1.u8 d18, [r1], r3 @load out [8 bit size) -8 coeffs + vld1.u8 d19, [r1], r3 + vmov.u16 q15, #0x00ff + vld1.u8 d20, [r1], r3 + vld1.u8 d21, [r1], r3 + + vbit.u8 q9, q1, q15 + vbit.u8 q10, q2, q15 + + vst1.u8 d18, [r0], r3 @store out + vst1.u8 d19, [r0], r3 + vst1.u8 d20, [r0], r3 + vst1.u8 d21, [r0], r3 + + bx lr + + + diff --git a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s new file mode 100755 index 0000000..43321a8 --- /dev/null +++ b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s @@ -0,0 +1,441 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. 
+@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s +@* +@* @brief +@* Contains function definitions for inter prediction interpolation. +@* +@* @author +@* Mohit +@* +@* @par List of Functions: +@* +@* - ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_inter_pred_filters.c +@ + +@/** +@/** +@/** +@******************************************************************************* +@* +@* @brief +@* This function implements a two stage cascaded six tap filter. It +@* applies the six tap filter in the vertical direction on the +@* predictor values, followed by applying the same filter in the +@* horizontal direction on the output of the first stage. The six tap +@* filtering operation is described in sec 8.4.2.2.1 titled "Luma sample +@* interpolation process" +@* +@* @par Description: +@* This function is called to obtain pixels lying at the following +@* location (1/2,1/2). The function interpolates +@* the predictors first in the horizontal direction and then in the +@* vertical direction to output the (1/2,1/2). 
+@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @param[in] pu1_tmp: temporary buffer +@* +@* @param[in] dydx: x and y reference offset for qpel calculations: UNUSED in this function. +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/; + +@void ih264_inter_pred_luma_horz_hpel_vert_hpel(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd,, +@ WORD32 dst_strd, +@ WORD32 ht, +@ WORD32 wd, +@ UWORD8* pu1_tmp, +@ UWORD32 dydx) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r8 => ht +@ r9 => wd + +.text +.p2align 2 + + .global ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q + +ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + vstmdb sp!, {d8-d15} @push neon registers to stack + ldr r8, [sp, #104] @ loads ht + sub r0, r0, r2, lsl #1 @pu1_src-2*src_strd + sub r0, r0, #2 @pu1_src-2 + ldr r9, [sp, #108] @ loads wd + + vmov.s16 d0, #20 @ Filter coeff 20 + vmov.s16 d1, #5 @ Filter coeff 5 + subs r12, r9, #4 @if wd=4 branch to loop_4 + beq loop_4 + subs r12, r9, #8 @if wd=8 branch to loop_8 + beq loop_8 + + mov r10, #8 + sub r7, r3, r10 + @when wd=16 + +loop_16: + vld1.u32 {d2, d3, d4}, [r0], r2 @ Vector load from src[0_0] + vld1.u32 {d5, d6, d7}, [r0], r2 @ Vector load from src[1_0] + vld1.u32 {d8, d9, d10}, [r0], r2 @ Vector load from src[2_0] + vld1.u32 {d11, d12, d13}, [r0], r2 @ Vector load from src[3_0] + vld1.u32 {d14, d15, d16}, [r0], r2 @ Vector load from src[4_0] + vld1.u32 {d17, d18, d19}, [r0], r2 @ Vector load from 
src[5_0] + + @ vERTICAL FILTERING FOR ROW 0 + vaddl.u8 q10, d8, d11 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q12, d2, d17 @ temp2 = src[0_0] + src[5_0] + vaddl.u8 q11, d5, d14 @ temp = src[1_0] + src[4_0] + vaddl.u8 q13, d3, d18 @ temp2 = src[0_0] + src[5_0] + vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20 + vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5 + vaddl.u8 q10, d6, d15 @ temp = src[1_0] + src[4_0] + vaddl.u8 q11, d9, d12 @ temp3 = src[2_0] + src[3_0] + vaddl.u8 q14, d4, d19 @ temp2 = src[0_0] + src[5_0] + vmla.u16 q13, q11, d0[0] @ temp4 += temp3 * 20 + vmls.s16 q13, q10, d1[0] @ temp -= temp2 * 5 + vaddl.u8 q11, d10, d13 @ temp3 = src[2_0] + src[3_0] + vaddl.u8 q10, d7, d16 @ temp = src[1_0] + src[4_0] + vmla.u16 q14, q11, d0[0] @ temp4 += temp3 * 20 + vmls.s16 q14, q10, d1[0] @ temp -= temp2 * 5 + vext.16 q10, q12, q13, #5 @//extract a[5] (column1) + + @Q12,Q13,Q14 HAVE VERTICAL FILTERED VALUES + @CASCADED FILTERING FOR ROW 0 + vext.16 q11, q12, q13, #2 @//extract a[2] (column1) + vaddl.s16 q1, d20, d24 @// a0 + a5 (column1) + vaddl.s16 q15, d21, d25 @// a0 + a5 (column1) + vmlal.s16 q1, d22, d0[0] @// a0 + a5 + 20a2 (column1) + vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1) + vext.16 q11, q12, q13, #1 @//extract a[1] (column1) + vext.16 q10, q12, q13, #3 @//extract a[3] (column1) + vmlsl.s16 q1, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlal.s16 q1, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vext.16 q11, q12, q13, #4 @//extract a[4] (column1) + vext.16 q10, q13, q14, #5 @//extract a[5] (column2) + vmlsl.s16 q1, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vqrshrun.s32 d22, q1, #10 + vqrshrun.s32 d23, q15, #10 + vqshrun.s16 d22, q11, #0 + vst1.u8 {d22}, [r1], r10 @//Store dest row0, column 1; (1/2,1/2) + 
vext.16 q11, q13, q14, #2 @//extract a[2] (column2) + vaddl.s16 q1, d20, d26 @// a0 + a5 (column2) + vaddl.s16 q15, d21, d27 @// a0 + a5 (column2) + vmlal.s16 q1, d22, d0[0] @// a0 + a5 + 20a2 (column2) + vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column2) + vext.16 q10, q13, q14, #3 @//extract a[3] (column2) + vext.16 q11, q13, q14, #1 @//extract a[1] (column2) + vmlal.s16 q1, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column2) + vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column2) + vext.16 q10, q13, q14, #4 @//extract a[4] (column2) + vmlsl.s16 q1, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2) + vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2) + vmlsl.s16 q1, d20, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2) + vmlsl.s16 q15, d21, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2) + vqrshrun.s32 d20, q1, #10 + vqrshrun.s32 d21, q15, #10 + vld1.u32 {d2, d3, d4}, [r0], r2 @ Vector load from src[6_0] + vqshrun.s16 d22, q10, #0 + vst1.u8 {d22}, [r1], r7 @//Store dest row0 ,column 2; (1/2,1/2) + + @ vERTICAL FILTERING FOR ROW 1 + vaddl.u8 q10, d11, d14 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q12, d5, d2 @ temp2 = src[0_0] + src[5_0] + vaddl.u8 q11, d8, d17 @ temp = src[1_0] + src[4_0] + vaddl.u8 q13, d6, d3 @ temp2 = src[0_0] + src[5_0] + vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20 + vaddl.u8 q10, d9, d18 @ temp = src[1_0] + src[4_0] + vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5 + vaddl.u8 q11, d12, d15 @ temp3 = src[2_0] + src[3_0] + vaddl.u8 q14, d7, d4 @ temp2 = src[0_0] + src[5_0] + vmla.u16 q13, q11, d0[0] @ temp4 += temp3 * 20 + vaddl.u8 q11, d13, d16 @ temp3 = src[2_0] + src[3_0] + vmls.s16 q13, q10, d1[0] @ temp -= temp2 * 5 + vmla.u16 q14, q11, d0[0] @ temp4 += temp3 * 20 + vaddl.u8 q10, d10, d19 @ temp = src[1_0] + src[4_0] + vmls.s16 q14, q10, d1[0] @ temp -= temp2 * 5 + vext.16 q10, q12, q13, #5 @//extract a[5] (column1) + + @Q12,Q13,Q14 HAVE VERTICAL FILTERED VALUES + @CASCADED FILTERING FOR ROW 1 + vext.16 
q11, q12, q13, #2 @//extract a[2] (column1) + vaddl.s16 q3, d20, d24 @// a0 + a5 (column1) + vaddl.s16 q15, d21, d25 @// a0 + a5 (column1) + vmlal.s16 q3, d22, d0[0] @// a0 + a5 + 20a2 (column1) + vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1) + vext.16 q11, q12, q13, #1 @//extract a[1] (column1) + vext.16 q10, q12, q13, #3 @//extract a[3] (column1) + vmlsl.s16 q3, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlal.s16 q3, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vext.16 q11, q12, q13, #4 @//extract a[4] (column1) + vext.16 q10, q13, q14, #5 @//extract a[5] (column2) + vmlsl.s16 q3, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vqrshrun.s32 d22, q3, #10 + vqrshrun.s32 d23, q15, #10 + vqshrun.s16 d22, q11, #0 + vst1.u8 {d22}, [r1], r10 @//Store dest row1, column 1; (1/2,1/2) + vext.16 q11, q13, q14, #2 @//extract a[2] (column2) + vaddl.s16 q3, d20, d26 @// a0 + a5 (column2) + vaddl.s16 q15, d21, d27 @// a0 + a5 (column2) + vmlal.s16 q3, d22, d0[0] @// a0 + a5 + 20a2 (column2) + vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column2) + vext.16 q10, q13, q14, #3 @//extract a[3] (column2) + vext.16 q11, q13, q14, #1 @//extract a[1] (column2) + vmlal.s16 q3, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column2) + vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column2) + vext.16 q10, q13, q14, #4 @//extract a[4] (column2) + vmlsl.s16 q3, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2) + vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2) + vmlsl.s16 q3, d20, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2) + vmlsl.s16 q15, d21, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2) + vqrshrun.s32 d20, q3, #10 + vqrshrun.s32 d21, q15, #10 + vqshrun.s16 d22, q10, #0 + vst1.u8 {d22}, [r1], r7 @//Store 
dest row1 ,column 2; (1/2,1/2) + + subs r8, r8, #2 @ 2 rows processed, decrement by 2 + subne r0, r0 , r2, lsl #2 + subne r0, r0, r2 + beq end_func @ Branch if height==4 + + b loop_16 @ looping if height = 8 or 16 + +loop_8: + vld1.u32 {d2, d3}, [r0], r2 @ Vector load from src[0_0] + vld1.u32 {d4, d5}, [r0], r2 @ Vector load from src[1_0] + vld1.u32 {d6, d7}, [r0], r2 @ Vector load from src[2_0] + vld1.u32 {d8, d9}, [r0], r2 @ Vector load from src[3_0] + vld1.u32 {d10, d11}, [r0], r2 @ Vector load from src[4_0] + vld1.u32 {d12, d13}, [r0], r2 @ Vector load from src[5_0] + + @ vERTICAL FILTERING FOR ROW 0 + vaddl.u8 q10, d6, d8 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q11, d4, d10 @ temp2 = src[1_0] + src4_0] + vaddl.u8 q12, d2, d12 @ temp = src[0_0] + src[5_0] + vaddl.u8 q13, d3, d13 @ temp = src[0_0] + src[5_0] + vaddl.u8 q14, d7, d9 @ temp1 = src[2_0] + src[3_0] + vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20 + vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5 + vaddl.u8 q15, d5, d11 @ temp2 = src[1_0] + src4_0] + vmla.u16 q13, q14, d0[0] @ temp += temp1 * 20 + vmls.s16 q13, q15, d1[0] @ temp -= temp2 * 5 + @Q12,Q13 HAVE VERTICAL FILTERED VALUES + @CASCADED FILTERING FOR ROW 0 + + vext.16 q10, q12, q13, #5 @//extract a[5] (column1) + vext.16 q11, q12, q13, #2 @//extract a[2] (column1) + vaddl.s16 q14, d20, d24 @// a0 + a5 (column1) + vaddl.s16 q15, d21, d25 @// a0 + a5 (column1) + vext.16 q9, q12, q13, #1 @//extract a[1] (column1) + vext.16 q10, q12, q13, #3 @//extract a[3] (column1) + vext.16 q1, q12, q13, #4 @//extract a[4] (column1) + vmlal.s16 q14, d22, d0[0] @// a0 + a5 + 20a2 (column1) + vmlsl.s16 q14, d18, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlal.s16 q14, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vmlsl.s16 q14, d2, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vld1.u32 {d14, d15}, [r0], r2 @ Vector load from src[6_0] + vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1) + vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 
(column1) + vmlsl.s16 q15, d19, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlsl.s16 q15, d3, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + + vaddl.u8 q12, d4, d14 @ temp = src[0_0] + src[5_0] + vaddl.u8 q13, d5, d15 @ temp = src[0_0] + src[5_0] + vqrshrun.s32 d18, q14, #10 + vaddl.u8 q14, d9, d11 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q10, d8, d10 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q11, d6, d12 @ temp2 = src[1_0] + src4_0] + vqrshrun.s32 d19, q15, #10 + vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20 + vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5 + vaddl.u8 q15, d7, d13 @ temp2 = src[1_0] + src4_0] + vmla.u16 q13, q14, d0[0] @ temp += temp1 * 20 + vmls.s16 q13, q15, d1[0] @ temp -= temp2 * 5 + vqshrun.s16 d2, q9, #0 + @ vERTICAL FILTERING FOR ROW 1 + + @Q12,Q13 HAVE VERTICAL FILTERED VALUES + @CASCADED FILTERING FOR ROW 1 + vext.16 q10, q12, q13, #5 @//extract a[5] (column1) + vext.16 q11, q12, q13, #2 @//extract a[2] (column1) + vaddl.s16 q14, d20, d24 @// a0 + a5 (column1) + vaddl.s16 q15, d21, d25 @// a0 + a5 (column1) + vst1.u8 {d2}, [r1], r3 @//Store dest row0, column 1; (1/2,1/2) + vext.16 q9, q12, q13, #1 @//extract a[1] (column1) + vext.16 q10, q12, q13, #3 @//extract a[3] (column1) + vext.16 q2, q12, q13, #4 @//extract a[4] (column1) + vmlal.s16 q14, d22, d0[0] @// a0 + a5 + 20a2 (column1) + vmlsl.s16 q14, d18, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlal.s16 q14, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vmlsl.s16 q14, d4, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1) + vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vmlsl.s16 q15, d19, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlsl.s16 q15, d5, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vqrshrun.s32 d18, q14, #10 + vqrshrun.s32 d19, q15, #10 + vqshrun.s16 d3, q9, #0 + vst1.u8 {d3}, [r1], r3 @//Store dest row1, column 1; (1/2,1/2) + + subs r8, r8, #2 @ 2 rows 
processed, decrement by 2 + subne r0, r0 , r2, lsl #2 + subne r0, r0, r2 + beq end_func @ Branch if height==4 + + b loop_8 @looping if height == 8 or 16 + +loop_4: + vld1.u32 {d2, d3}, [r0], r2 @ Vector load from src[0_0] + vld1.u32 {d4, d5}, [r0], r2 @ Vector load from src[1_0] + vld1.u32 {d6, d7}, [r0], r2 @ Vector load from src[2_0] + vld1.u32 {d8, d9}, [r0], r2 @ Vector load from src[3_0] + vld1.u32 {d10, d11}, [r0], r2 @ Vector load from src[4_0] + vld1.u32 {d12, d13}, [r0], r2 @ Vector load from src[5_0] + + @ vERTICAL FILTERING FOR ROW 0 + vaddl.u8 q10, d6, d8 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q11, d4, d10 @ temp2 = src[1_0] + src4_0] + vaddl.u8 q12, d2, d12 @ temp = src[0_0] + src[5_0] + vaddl.u8 q13, d3, d13 @ temp = src[0_0] + src[5_0] + vaddl.u8 q14, d7, d9 @ temp1 = src[2_0] + src[3_0] + vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20 + vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5 + vaddl.u8 q15, d5, d11 @ temp2 = src[1_0] + src4_0] + vmla.u16 q13, q14, d0[0] @ temp += temp1 * 20 + vmls.s16 q13, q15, d1[0] @ temp -= temp2 * 5 + @Q12,Q13 HAVE VERTICAL FILTERED VALUES + @CASCADED FILTERING FOR ROW 0 + + vext.16 q10, q12, q13, #5 @//extract a[5] (column1) + vext.16 q11, q12, q13, #2 @//extract a[2] (column1) + vaddl.s16 q14, d20, d24 @// a0 + a5 (column1) + vaddl.s16 q15, d21, d25 @// a0 + a5 (column1) + + vext.16 q1, q12, q13, #4 @//extract a[4] (column1) + vext.16 q9, q12, q13, #1 @//extract a[1] (column1) + vext.16 q10, q12, q13, #3 @//extract a[3] (column1) + + vmlal.s16 q14, d22, d0[0] @// a0 + a5 + 20a2 (column1) + vmlsl.s16 q14, d18, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlal.s16 q14, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vmlsl.s16 q14, d2, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vld1.u32 {d14, d15}, [r0], r2 @ Vector load from src[6_0] + vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1) + vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vmlsl.s16 q15, d19, d1[0] @// a0 + a5 + 20a2 + 
20a3 - 5a1 (column1) + vmlsl.s16 q15, d3, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vaddl.u8 q12, d4, d14 @ temp = src[0_0] + src[5_0] + vaddl.u8 q13, d5, d15 @ temp = src[0_0] + src[5_0] + vqrshrun.s32 d18, q14, #10 + vaddl.u8 q14, d9, d11 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q11, d6, d12 @ temp2 = src[1_0] + src4_0] + vaddl.u8 q10, d8, d10 @ temp1 = src[2_0] + src[3_0] + vqrshrun.s32 d19, q15, #10 + vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20 + vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5 + vaddl.u8 q15, d7, d13 @ temp2 = src[1_0] + src4_0] + vqshrun.s16 d2, q9, #0 + vmla.u16 q13, q14, d0[0] @ temp += temp1 * 20 + vmls.s16 q13, q15, d1[0] @ temp -= temp2 * 5 + + @ vERTICAL FILTERING FOR ROW 1 + + @Q12,Q13 HAVE VERTICAL FILTERED VALUES + @CASCADED FILTERING FOR ROW 1 + vext.16 q10, q12, q13, #5 @//extract a[5] (column1) + vext.16 q11, q12, q13, #2 @//extract a[2] (column1) + vst1.u32 {d2[0]}, [r1], r3 @//Store dest row0, column 1; (1/2,1/2) + vaddl.s16 q14, d20, d24 @// a0 + a5 (column1) + vaddl.s16 q15, d21, d25 @// a0 + a5 (column1) + vext.16 q9, q12, q13, #1 @//extract a[1] (column1) + vext.16 q10, q12, q13, #3 @//extract a[3] (column1) + vext.16 q2, q12, q13, #4 @//extract a[4] (column1) + vmlal.s16 q14, d22, d0[0] @// a0 + a5 + 20a2 (column1) + vmlsl.s16 q14, d18, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlal.s16 q14, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vmlsl.s16 q14, d4, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1) + vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1) + vmlsl.s16 q15, d19, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1) + vmlsl.s16 q15, d5, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1) + vqrshrun.s32 d18, q14, #10 + vqrshrun.s32 d19, q15, #10 + vqshrun.s16 d4, q9, #0 + vst1.u32 {d4[0]}, [r1], r3 @//Store dest row1, column 1; (1/2,1/2) + + subs r8, r8, #2 @ 2 rows processed, decrement by 2 + subne r0, r0 , r2, lsl #2 
+ subne r0, r0, r2 + beq end_func @ Branch if height==4 + + b loop_4 @looping if height == 8 or 16 + +end_func: + vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + diff --git a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s new file mode 100755 index 0000000..65a6de7 --- /dev/null +++ b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s @@ -0,0 +1,1044 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s +@* +@* @brief +@* Contains function definitions for inter prediction interpolation. 
+@* +@* @author +@* Mohit +@* +@* @par List of Functions: +@* +@* - ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_inter_pred_filters.c +@ + +@/** +@/** +@/** +@******************************************************************************* +@* +@* @brief +@* This function implements a two stage cascaded six tap filter. It +@* applies the six tap filter in the horizontal direction on the +@* predictor values, followed by applying the same filter in the +@* vertical direction on the output of the first stage. It then averages +@* the output of the 1st stage and the output of the 2nd stage to obtain +@* the quarter pel values. The six tap filtering operation is described +@* in sec 8.4.2.2.1 titled "Luma sample interpolation process". +@* +@* @par Description: +@* This function is called to obtain pixels lying at the following +@* location (1/2,1/4) or (1/2,3/4). The function interpolates +@* the predictors first in the horizontal direction and then in the +@* vertical direction to output the (1/2,1/2). It then averages +@* the output of the 2nd stage and (1/2,1/2) value to obtain (1/2,1/4) +@* or (1/2,3/4) depending on the offset. 
+@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @param[in] pu1_tmp: temporary buffer +@* +@* @param[in] dydx: x and y reference offset for qpel calculations +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/; + +@void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd,, +@ WORD32 dst_strd, +@ WORD32 ht, +@ WORD32 wd, +@ UWORD8* pu1_tmp, +@ UWORD32 dydx) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ht +@ r5 => wd +@ r7 => dydx +@ r9 => *pu1_tmp + +.text +.p2align 2 + + .global ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q + +ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q: + + stmfd sp!, {r4-r12, r14} @ store register values to stack + vstmdb sp!, {d8-d15} @push neon registers to stack + ldr r4, [sp, #104] @ loads ht + sub r0, r0, r2, lsl #1 @ pu1_src-2*src_strd + sub r0, r0, #2 @ pu1_src-2 + ldr r5, [sp, #108] @ loads wd + ldr r7, [sp, #116] @ loads dydx + lsr r7, r7, #3 @ dydx >> 2 followed by dydx & 0x3 and dydx>>1 to obtain the deciding bit + ldr r9, [sp, #112] @ pu1_tmp + add r7, r7, #2 + mov r6, #48 + mla r7, r7, r6, r9 + + subs r12, r5, #4 @if wd=4 branch to loop_4 + beq loop_4_start + + subs r12, r5, #8 @if wd=8 branch to loop_8 + beq loop_8_start + + @when wd=16 + vmov.u16 q11, #20 @ Filter coeff 0x14 into Q11 + vmov.u16 q12, #5 @ Filter coeff 0x5 into Q12 + add r8, r0, #8 + add r14, r1, #8 + add r10, r9, #8 + mov r12, r4 + add r11, r7, #8 + +loop_16_lowhalf_start: + vld1.32 {q0}, [r0], r2 @ row -2 load for 
horizontal filter + vext.8 d5, d0, d1, #5 + vaddl.u8 q3, d0, d5 + + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q4, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q3, q4, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q4, d1, d4 + vld1.32 {q0}, [r0], r2 @ row -1 load for horizontal filter + vmls.u16 q3, q4, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q4, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q5, d2, d3 + + vst1.32 {q3}, [r9], r6 @ store temp buffer 0 + + vext.8 d4, d0, d1, #4 + vmla.u16 q4, q5, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q5, d1, d4 + vld1.32 {q0}, [r0], r2 @ row 0 load for horizontal filter + vmls.u16 q4, q5, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q5, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q6, d2, d3 + + vst1.32 {q4}, [r9], r6 @ store temp buffer 1 + + vext.8 d4, d0, d1, #4 + vmla.u16 q5, q6, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q6, d1, d4 + vld1.32 {q0}, [r0], r2 @ row 1 load for horizontal filter + vmls.u16 q5, q6, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q6, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q7, d2, d3 + + vst1.32 {q5}, [r9], r6 @ store temp buffer 2 + + vext.8 d4, d0, d1, #4 + vmla.u16 q6, q7, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q7, d1, d4 + vld1.32 {q0}, [r0], r2 @ row 2 load for horizontal filter + vmls.u16 q6, q7, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q7, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q8, d2, d3 + + vst1.32 {q6}, [r9], r6 @ store temp buffer 3 + + vext.8 d4, d0, d1, #4 + vmla.u16 q7, q8, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q8, d1, d4 + + vmls.u16 q7, q8, q12 +loop_16_lowhalf: + + vld1.32 {q0}, [r0], r2 @ row 3 load for horizontal filter + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q8, d0, d5 + + vst1.32 {q7}, [r9], r6 @ store temp buffer 4 + vaddl.u8 q9, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q8, q9, q11 + vext.8 d1, d0, d1, #1 + vadd.s16 q14, q4, q7 + vaddl.u8 q9, d1, d4 + vadd.s16 
q15, q5, q6 + vmls.u16 q8, q9, q12 + vld1.32 {q0}, [r0], r2 @ row 4 load for hoorizontal filter + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q10, d0, d5 + + vst1.32 {q8}, [r9], r6 @ store temp buffer r5 + + vaddl.s16 q9, d6, d16 + + vld1.32 {q13}, [r7], r6 @ load from temp buffer 0 + + vaddl.s16 q3, d7, d17 + + vqrshrun.s16 d26, q13, #5 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d28, d24 + vmlal.s16 q3, d31, d22 + vmlsl.s16 q3, d29, d24 + vaddl.u8 q1, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q10, q1, q11 + vqrshrun.s32 d18, q9, #10 + vext.8 d1, d0, d1, #1 + vqrshrun.s32 d19, q3, #10 + vadd.s16 q14, q5, q8 + vaddl.u8 q1, d1, d4 + vadd.s16 q15, q6, q7 + vmls.u16 q10, q1, q12 + vqmovn.u16 d18, q9 + vld1.32 {q0}, [r0], r2 @ row 5 load for horizontal filter + + vrhadd.u8 d26, d18, d26 + + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + + vst1.32 {q10}, [r9], r6 @ store temp buffer r6 + + vaddl.s16 q9, d8, d20 + + vaddl.s16 q3, d9, d21 + + vld1.32 {q4}, [r7], r6 @load from temp buffer 1 + + + vst1.32 d26, [r1], r3 @ store row 0 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d28, d24 + + vqrshrun.s16 d28, q4, #5 + + vmlal.s16 q3, d31, d22 + vmlsl.s16 q3, d29, d24 + vext.8 d3, d0, d1, #3 + vaddl.u8 q4, d0, d5 + vaddl.u8 q1, d2, d3 + vqrshrun.s32 d18, q9, #10 + vext.8 d4, d0, d1, #4 + vqrshrun.s32 d19, q3, #10 + vmla.u16 q4, q1, q11 + vext.8 d1, d0, d1, #1 + vadd.s16 q13, q6, q10 + vaddl.u8 q1, d1, d4 + vqmovn.u16 d18, q9 + vadd.s16 q15, q7, q8 + vmls.u16 q4, q1, q12 + vld1.32 {q0}, [r0], r2 @ row 6 load for horizontal filter + + vrhadd.u8 d28, d28, d18 + + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + + vst1.32 d28, [r1], r3 @ store row 1 + + vaddl.u8 q14, d0, d5 + + vst1.32 {q4}, [r9], r6 @ store temp buffer r7 + + vaddl.s16 q9, d10, d8 + vaddl.s16 q3, d11, d9 + + vld1.32 {q5}, [r7], r6 @ load from temp buffer 2 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d26, d24 + vmlal.s16 q3, d31, d22 + + vqrshrun.s16 d26, 
q5, #5 + + vmlsl.s16 q3, d27, d24 + vaddl.u8 q1, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q14, q1, q11 + vqrshrun.s32 d18, q9, #10 + vext.8 d1, d0, d1, #1 + vqrshrun.s32 d19, q3, #10 + vadd.s16 q5, q7, q4 + vaddl.u8 q1, d1, d4 + vadd.s16 q15, q8, q10 + vmls.u16 q14, q1, q12 + vqmovn.u16 d27, q9 + + vaddl.s16 q9, d12, d28 + vaddl.s16 q3, d13, d29 + + vrhadd.u8 d26, d26, d27 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d10, d24 + vmlal.s16 q3, d31, d22 + vmlsl.s16 q3, d11, d24 + + vst1.32 d26, [r1], r3 @ store row 2 + + vst1.32 {q14}, [r9] + + + vqrshrun.s32 d18, q9, #10 + vmov q5, q10 + vld1.32 {q15}, [r7], r6 @ load from temp buffer 3 + + vqrshrun.s32 d19, q3, #10 + subs r4, r4, #4 + + vqrshrun.s16 d30, q15, #5 + + vqmovn.u16 d18, q9 + vmov q6, q4 + vmov q3, q7 + vrhadd.u8 d30, d18, d30 + vmov q4, q8 + vmov q7, q14 + vst1.32 d30, [r1], r3 @ store row 3 + + bgt loop_16_lowhalf @ looping if height =16 + + +loop_16_highhalf_start: + vld1.32 {q0}, [r8], r2 + vext.8 d5, d0, d1, #5 + vaddl.u8 q3, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q4, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q3, q4, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q4, d1, d4 + vld1.32 {q0}, [r8], r2 + vmls.u16 q3, q4, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q4, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q5, d2, d3 + + vst1.32 {q3}, [r10], r6 + + vext.8 d4, d0, d1, #4 + vmla.u16 q4, q5, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q5, d1, d4 + vld1.32 {q0}, [r8], r2 + vmls.u16 q4, q5, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q5, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q6, d2, d3 + + vst1.32 {q4}, [r10], r6 + + vext.8 d4, d0, d1, #4 + vmla.u16 q5, q6, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q6, d1, d4 + vld1.32 {q0}, [r8], r2 + vmls.u16 q5, q6, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q6, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q7, d2, d3 + + vst1.32 {q5}, [r10], r6 + + vext.8 d4, d0, d1, #4 + vmla.u16 q6, q7, q11 + vext.8 
d1, d0, d1, #1 + vaddl.u8 q7, d1, d4 + vld1.32 {q0}, [r8], r2 + vmls.u16 q6, q7, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q7, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q8, d2, d3 + + vst1.32 {q6}, [r10], r6 + + vext.8 d4, d0, d1, #4 + vmla.u16 q7, q8, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q8, d1, d4 + + vmls.u16 q7, q8, q12 + +loop_16_highhalf: + + vld1.32 {q0}, [r8], r2 + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q8, d0, d5 + + vst1.32 {q7}, [r10], r6 + + vaddl.u8 q9, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q8, q9, q11 + vext.8 d1, d0, d1, #1 + vadd.s16 q14, q4, q7 + vaddl.u8 q9, d1, d4 + vadd.s16 q15, q5, q6 + vmls.u16 q8, q9, q12 + vld1.32 {q0}, [r8], r2 + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q10, d0, d5 + + vst1.32 {q8}, [r10], r6 + + vaddl.s16 q9, d6, d16 + + vld1.32 {q13}, [r11], r6 + + vaddl.s16 q3, d7, d17 + + vqrshrun.s16 d26, q13, #5 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d28, d24 + vmlal.s16 q3, d31, d22 + vmlsl.s16 q3, d29, d24 + vaddl.u8 q1, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q10, q1, q11 + vqrshrun.s32 d18, q9, #10 + vext.8 d1, d0, d1, #1 + vqrshrun.s32 d19, q3, #10 + vadd.s16 q14, q5, q8 + vaddl.u8 q1, d1, d4 + vadd.s16 q15, q6, q7 + vmls.u16 q10, q1, q12 + vqmovn.u16 d18, q9 + vld1.32 {q0}, [r8], r2 + + vrhadd.u8 d26, d18, d26 + + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + + vst1.32 {q10}, [r10], r6 + + vaddl.s16 q9, d8, d20 + vaddl.s16 q3, d9, d21 + + vld1.32 {q4}, [r11], r6 + + + vst1.32 d26, [r14], r3 @store row 0 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d28, d24 + + vqrshrun.s16 d28, q4, #5 + + vmlal.s16 q3, d31, d22 + vmlsl.s16 q3, d29, d24 + vext.8 d3, d0, d1, #3 + vaddl.u8 q4, d0, d5 + vaddl.u8 q1, d2, d3 + vqrshrun.s32 d18, q9, #10 + vext.8 d4, d0, d1, #4 + vqrshrun.s32 d19, q3, #10 + vmla.u16 q4, q1, q11 + vext.8 d1, d0, d1, #1 + vadd.s16 q13, q6, q10 + vaddl.u8 q1, d1, d4 + vqmovn.u16 d18, q9 + vadd.s16 q15, q7, q8 
+ vmls.u16 q4, q1, q12 + vld1.32 {q0}, [r8], r2 + + vrhadd.u8 d28, d28, d18 + + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + + vst1.32 d28, [r14], r3 @store row 1 + + vaddl.u8 q14, d0, d5 + + vst1.32 {q4}, [r10], r6 + + vaddl.s16 q9, d10, d8 + vaddl.s16 q3, d11, d9 + + vld1.32 {q5}, [r11], r6 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d26, d24 + vmlal.s16 q3, d31, d22 + + vqrshrun.s16 d26, q5, #5 + + vmlsl.s16 q3, d27, d24 + vaddl.u8 q1, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q14, q1, q11 + vqrshrun.s32 d18, q9, #10 + vext.8 d1, d0, d1, #1 + vqrshrun.s32 d19, q3, #10 + vadd.s16 q5, q7, q4 + vaddl.u8 q1, d1, d4 + vadd.s16 q15, q8, q10 + vmls.u16 q14, q1, q12 + vqmovn.u16 d27, q9 + + + vaddl.s16 q9, d12, d28 + vaddl.s16 q3, d13, d29 + + vrhadd.u8 d26, d26, d27 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d10, d24 + vmlal.s16 q3, d31, d22 + vmlsl.s16 q3, d11, d24 + + vst1.32 d26, [r14], r3 @ store row 2 + + vst1.32 {q14}, [r10] + + vqrshrun.s32 d18, q9, #10 + vmov q5, q10 + vld1.32 {q15}, [r11], r6 + + vqrshrun.s32 d19, q3, #10 + subs r12, r12, #4 + + vqrshrun.s16 d30, q15, #5 + + vqmovn.u16 d18, q9 + vmov q6, q4 + vmov q3, q7 + vrhadd.u8 d30, d18, d30 + vmov q4, q8 + vmov q7, q14 + vst1.32 d30, [r14], r3 @ store row 3 + + bgt loop_16_highhalf @ looping if height = 8 or 16 + b end_func + +loop_8_start: + + vmov.u16 q11, #20 @ Filter coeff 20 into Q11 + vmov.u16 q12, #5 @ Filter coeff 5 into Q12 + vld1.32 {q0}, [r0], r2 @ row -2 load for horizontal filter + vext.8 d5, d0, d1, #5 + vaddl.u8 q3, d0, d5 + + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q4, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q3, q4, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q4, d1, d4 + vld1.32 {q0}, [r0], r2 @ row -1 load for horizontal filter + vmls.u16 q3, q4, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q4, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q5, d2, d3 + + vst1.32 {q3}, [r9], r6 @ store temp buffer 0 + + vext.8 d4, d0, d1, #4 + 
vmla.u16 q4, q5, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q5, d1, d4 + vld1.32 {q0}, [r0], r2 @ row 0 load for horizontal filter + vmls.u16 q4, q5, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q5, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q6, d2, d3 + + vst1.32 {q4}, [r9], r6 @ store temp buffer 1 + + vext.8 d4, d0, d1, #4 + vmla.u16 q5, q6, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q6, d1, d4 + vld1.32 {q0}, [r0], r2 @ row 1 load for horizontal filter + vmls.u16 q5, q6, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q6, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q7, d2, d3 + + vst1.32 {q5}, [r9], r6 @ store temp buffer 2 + + vext.8 d4, d0, d1, #4 + vmla.u16 q6, q7, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q7, d1, d4 + vld1.32 {q0}, [r0], r2 @ row 2 load for horizontal filter + vmls.u16 q6, q7, q12 + vext.8 d5, d0, d1, #5 + vaddl.u8 q7, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q8, d2, d3 + + vst1.32 {q6}, [r9], r6 @ store temp buffer 3 + + vext.8 d4, d0, d1, #4 + vmla.u16 q7, q8, q11 + vext.8 d1, d0, d1, #1 + vaddl.u8 q8, d1, d4 + + vmls.u16 q7, q8, q12 +loop_8: + + vld1.32 {q0}, [r0], r2 @ row 3 load for horizontal filter + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q8, d0, d5 + + vst1.32 {q7}, [r9], r6 @ store temp buffer 4 + + vaddl.u8 q9, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q8, q9, q11 + vext.8 d1, d0, d1, #1 + vadd.s16 q14, q4, q7 + vaddl.u8 q9, d1, d4 + vadd.s16 q15, q5, q6 + vmls.u16 q8, q9, q12 + vld1.32 {q0}, [r0], r2 @ row 4 load for hoorizontal filter + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q10, d0, d5 + + vst1.32 {q8}, [r9], r6 @ store temp buffer r5 + + vaddl.s16 q9, d6, d16 + + vld1.32 {q13}, [r7], r6 @ load from temp buffer 0 + + vaddl.s16 q3, d7, d17 + + vqrshrun.s16 d26, q13, #5 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d28, d24 + vmlal.s16 q3, d31, d22 + vmlsl.s16 q3, d29, d24 + vaddl.u8 q1, d2, d3 + vext.8 d4, 
d0, d1, #4 + vmla.u16 q10, q1, q11 + vqrshrun.s32 d18, q9, #10 + vext.8 d1, d0, d1, #1 + vqrshrun.s32 d19, q3, #10 + vadd.s16 q14, q5, q8 + vaddl.u8 q1, d1, d4 + vadd.s16 q15, q6, q7 + vmls.u16 q10, q1, q12 + vqmovn.u16 d18, q9 + vld1.32 {q0}, [r0], r2 @ row 5 load for horizontal filter + + vrhadd.u8 d26, d18, d26 + + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + + vst1.32 {q10}, [r9], r6 @ store temp buffer r6 + + vaddl.s16 q9, d8, d20 + + vaddl.s16 q3, d9, d21 + + vld1.32 {q4}, [r7], r6 @load from temp buffer 1 + + + vst1.32 d26, [r1], r3 @ store row 0 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d28, d24 + + vqrshrun.s16 d28, q4, #5 + + vmlal.s16 q3, d31, d22 + vmlsl.s16 q3, d29, d24 + vext.8 d3, d0, d1, #3 + vaddl.u8 q4, d0, d5 + vaddl.u8 q1, d2, d3 + vqrshrun.s32 d18, q9, #10 + vext.8 d4, d0, d1, #4 + vqrshrun.s32 d19, q3, #10 + vmla.u16 q4, q1, q11 + vext.8 d1, d0, d1, #1 + vadd.s16 q13, q6, q10 + vaddl.u8 q1, d1, d4 + vqmovn.u16 d18, q9 + vadd.s16 q15, q7, q8 + vmls.u16 q4, q1, q12 + vld1.32 {q0}, [r0], r2 @ row 6 load for horizontal filter + + vrhadd.u8 d28, d28, d18 + + vext.8 d5, d0, d1, #5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + + vst1.32 d28, [r1], r3 @ store row 1 + + vaddl.u8 q14, d0, d5 + + vst1.32 {q4}, [r9], r6 @ store temp buffer r7 + + vaddl.s16 q9, d10, d8 + vaddl.s16 q3, d11, d9 + + vld1.32 {q5}, [r7], r6 @ load from temp buffer 2 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d26, d24 + vmlal.s16 q3, d31, d22 + + vqrshrun.s16 d26, q5, #5 + + vmlsl.s16 q3, d27, d24 + vaddl.u8 q1, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 q14, q1, q11 + vqrshrun.s32 d18, q9, #10 + vext.8 d1, d0, d1, #1 + vqrshrun.s32 d19, q3, #10 + vadd.s16 q5, q7, q4 + vaddl.u8 q1, d1, d4 + vadd.s16 q15, q8, q10 + vmls.u16 q14, q1, q12 + vqmovn.u16 d27, q9 + + vaddl.s16 q9, d12, d28 + vaddl.s16 q3, d13, d29 + + vrhadd.u8 d26, d26, d27 + + vmlal.s16 q9, d30, d22 + vmlsl.s16 q9, d10, d24 + vmlal.s16 q3, d31, d22 + vmlsl.s16 q3, d11, d24 + + vst1.32 d26, [r1], r3 @ store row 
2 + + vst1.32 {q14}, [r9] + + + vqrshrun.s32 d18, q9, #10 + vmov q5, q10 + vld1.32 {q15}, [r7], r6 @ load from temp buffer 3 + + vqrshrun.s32 d19, q3, #10 + subs r4, r4, #4 + + vqrshrun.s16 d30, q15, #5 + + vqmovn.u16 d18, q9 + vmov q6, q4 + vmov q3, q7 + vrhadd.u8 d30, d18, d30 + vmov q4, q8 + vmov q7, q14 + vst1.32 d30, [r1], r3 @ store row 3 + + bgt loop_8 @if height =8 or 16 loop + b end_func + +loop_4_start: + vmov.u16 d22, #20 @ Filter coeff 20 into D22 + vmov.u16 d23, #5 @ Filter coeff 5 into D23 + + vld1.32 {q0}, [r0], r2 @row -2 load + vext.8 d5, d0, d1, #5 + vaddl.u8 q3, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q4, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 d6, d8, d22 + vext.8 d1, d0, d1, #1 + vaddl.u8 q4, d1, d4 + vld1.32 {q0}, [r0], r2 @ row -1 load + vmls.u16 d6, d8, d23 + vext.8 d5, d0, d1, #5 + vaddl.u8 q4, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q5, d2, d3 + + vst1.32 d6, [r9], r6 @ store temp buffer 0 + + vext.8 d4, d0, d1, #4 + vmla.u16 d8, d10, d22 + vext.8 d1, d0, d1, #1 + vaddl.u8 q5, d1, d4 + vld1.32 {q0}, [r0], r2 @ row 0 load + vmls.u16 d8, d10, d23 + vext.8 d5, d0, d1, #5 + vaddl.u8 q5, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q6, d2, d3 + + vst1.32 d8, [r9], r6 @ store temp buffer 1 + + vext.8 d4, d0, d1, #4 + vmla.u16 d10, d12, d22 + vext.8 d1, d0, d1, #1 + vaddl.u8 q6, d1, d4 + vld1.32 {q0}, [r0], r2 @ row 1 load + vmls.u16 d10, d12, d23 + vext.8 d5, d0, d1, #5 + vaddl.u8 q6, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q7, d2, d3 + + vst1.32 d10, [r9], r6 @ store temp buffer 2 + + vext.8 d4, d0, d1, #4 + vmla.u16 d12, d14, d22 + vext.8 d1, d0, d1, #1 + vaddl.u8 q7, d1, d4 + vld1.32 {q0}, [r0], r2 @ row 2 load + vmls.u16 d12, d14, d23 + vext.8 d5, d0, d1, #5 + vaddl.u8 q7, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q8, d2, d3 + vext.8 d4, d0, d1, #4 + vmla.u16 d14, d16, d22 + vext.8 d1, d0, d1, #1 + vaddl.u8 q8, d1, d4 
+ + vst1.32 d12, [r9], r6 @ store temp buffer 3 + + vmls.u16 d14, d16, d23 + +loop_4: + + vld1.32 {q0}, [r0], r2 @ row 3 load + vext.8 d5, d0, d1, #5 + vaddl.u8 q8, d0, d5 + vext.8 d2, d0, d1, #2 + vext.8 d3, d0, d1, #3 + vaddl.u8 q9, d2, d3 + vst1.32 d14, [r9], r6 @ store temp buffer 4 + vext.8 d4, d0, d1, #4 + vmla.u16 d16, d18, d22 + vext.8 d1, d0, d1, #1 + vaddl.u8 q9, d1, d4 + vadd.s16 d2, d10, d12 + vmls.u16 d16, d18, d23 + vadd.s16 d3, d8, d14 + vld1.32 {q9}, [r0], r2 @ row 4 load + vext.8 d25, d18, d19, #5 + vaddl.u8 q13, d18, d25 + vext.8 d20, d18, d19, #2 + + vst1.32 d16, [r9], r6 @ store temp buffer 5 + + vaddl.s16 q0, d6, d16 + vmlal.s16 q0, d2, d22 + vext.8 d21, d18, d19, #3 + vaddl.u8 q14, d20, d21 + vext.8 d24, d18, d19, #4 + vmlsl.s16 q0, d3, d23 + vmla.u16 d26, d28, d22 + vext.8 d19, d18, d19, #1 + vaddl.u8 q14, d19, d24 + vadd.s16 d2, d12, d14 + vmls.u16 d26, d28, d23 + vqrshrun.s32 d0, q0, #0xa + vadd.s16 d3, d10, d16 + vld1.32 {q9}, [r0], r2 @ row 5 load + vext.8 d25, d18, d19, #5 + vqmovn.u16 d11, q0 + vaddl.u8 q14, d18, d25 + + vst1.32 d26, [r9], r6 @ store temp buffer 6 + + @Q3 available here + vld1.32 d6, [r7], r6 @ load from temp buffer 0 + vld1.32 d7, [r7], r6 @ load from temp buffer 1 + vqrshrun.s16 d9, q3, #5 + + vext.8 d20, d18, d19, #2 + + vaddl.s16 q0, d8, d26 + vmlal.s16 q0, d2, d22 + vext.8 d21, d18, d19, #3 + vaddl.u8 q3, d20, d21 + vext.8 d24, d18, d19, #4 + vmlsl.s16 q0, d3, d23 + vmla.u16 d28, d6, d22 + vext.8 d19, d18, d19, #1 + vaddl.u8 q3, d19, d24 + vadd.s16 d2, d14, d16 + vmls.u16 d28, d6, d23 + vqrshrun.s32 d0, q0, #0xa + vadd.s16 d3, d12, d26 + vld1.32 {q9}, [r0], r2 @ row 6 load + vext.8 d25, d18, d19, #5 + vqmovn.u16 d13, q0 + + vtrn.32 d11, d13 + vaddl.s16 q0, d10, d28 + vrhadd.u8 d9, d9, d11 + + vst1.32 d28, [r9], r6 @ store temp buffer 7 + + vmlal.s16 q0, d2, d22 + vaddl.u8 q15, d18, d25 + + vst1.32 d9[0], [r1], r3 @ store row 0 + + vext.8 d20, d18, d19, #2 + + vst1.32 d9[1], [r1], r3 @ store row 1 + + vext.8 d21, 
d18, d19, #3 + vmlsl.s16 q0, d3, d23 + vaddl.u8 q4, d20, d21 + vext.8 d24, d18, d19, #4 + vmla.u16 d30, d8, d22 + vext.8 d19, d18, d19, #1 + vaddl.u8 q4, d19, d24 + vqrshrun.s32 d0, q0, #0xa + vadd.s16 d2, d16, d26 + vmls.u16 d30, d8, d23 + vqmovn.u16 d4, q0 + + vadd.s16 d3, d14, d28 + + + vaddl.s16 q0, d12, d30 + + vst1.32 d30, [r9] + + vmlal.s16 q0, d2, d22 + + vld1.32 d8, [r7], r6 @ load from temp buffer 2 + vld1.32 d9, [r7], r6 @ load from temp buffer 3 + vmlsl.s16 q0, d3, d23 + subs r4, r4, #4 + vqrshrun.s16 d10, q4, #5 + + vmov d12, d28 + + vqrshrun.s32 d0, q0, #0xa + vmov d6, d14 + vmov d8, d16 + + vqmovn.u16 d5, q0 + + vtrn.32 d4, d5 + vrhadd.u8 d4, d4, d10 + vmov d10, d26 + vmov d14, d30 + + vst1.32 d4[0], [r1], r3 @ store row 2 + vst1.32 d4[1], [r1], r3 @ store row 3 + + bgt loop_4 + +end_func: + vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s new file mode 100755 index 0000000..c39ae01 --- /dev/null +++ b/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s @@ -0,0 +1,266 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. 
+@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_inter_pred_luma_horz_qpel_a9q.s +@* +@* @brief +@* Contains function definitions for inter prediction horizontal quarter pel interpolation. +@* +@* @author +@* Mohit +@* +@* @par List of Functions: +@* +@* - ih264_inter_pred_luma_horz_qpe_a9ql() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_inter_pred_filters.c +@ + +@/** +@/** +@******************************************************************************* +@* +@* @brief +@* Quarter pel interprediction luma filter for horizontal input +@* +@* @par Description: +@* Applies a 6 tap horizontal filter .The output is clipped to 8 bits +@* sec 8.4.2.2.1 titled "Luma sample interpolation process" +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@ @param[in] pu1_tmp: temporary buffer: UNUSED in this function +@* +@* @param[in] dydx: x and y reference offset for qpel calculations. 
+@* @returns +@* +@ @remarks +@* None +@* +@******************************************************************************* +@*/ + +@void ih264_inter_pred_luma_horz ( +@ UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ht, +@ WORD32 wd, +@ UWORD8* pu1_tmp, +@ UWORD32 dydx) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r5 => ht +@ r6 => wd +@ r7 => dydx + +.text +.p2align 2 + + + .global ih264_inter_pred_luma_horz_qpel_a9q + +ih264_inter_pred_luma_horz_qpel_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + vstmdb sp!, {d8-d15} @push neon registers to stack + ldr r5, [sp, #104] @Loads ht + ldr r6, [sp, #108] @Loads wd + ldr r7, [sp, #116] @Loads dydx + and r7, r7, #3 @Finds x-offset + add r7, r0, r7, lsr #1 @pu1_src + (x_offset>>1) + sub r0, r0, #2 @pu1_src-2 + vmov.i8 d0, #5 @filter coeff + subs r12, r6, #8 @if wd=8 branch to loop_8 + vmov.i8 d1, #20 @filter coeff + + beq loop_8 + + subs r12, r6, #4 @if wd=4 branch to loop_4 + beq loop_4 + +loop_16: @when wd=16 + @// Processing row0 and row1 + vld1.8 {d2, d3, d4}, [r0], r2 @// Load row0 + vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0) + vld1.8 {d5, d6, d7}, [r0], r2 @// Load row1 + vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row0) + vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0) + vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1) + vaddl.u8 q5, d30, d3 @// a0 + a5 (column2,row0) + vext.8 d27, d6, d7, #5 @//extract a[5] (column2,row1) + vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1) + vext.8 d31, d2, d3, #2 @//extract a[2] (column1,row0) + vaddl.u8 q8, d27, d6 @// a0 + a5 (column2,row1) + vext.8 d30, d3, d4, #2 @//extract a[2] (column2,row0) + vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 (column1,row0) + vext.8 d28, d5, d6, #2 @//extract a[2] (column1,row1) + vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 (column2,row0) + vext.8 d27, d6, d7, #2 
@//extract a[2] (column2,row1) + vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 (column1,row1) + vext.8 d31, d2, d3, #3 @//extract a[3] (column1,row0) + vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 (column2,row1) + vext.8 d30, d3, d4, #3 @//extract a[3] (column2,row0) + vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vext.8 d28, d5, d6, #3 @//extract a[3] (column1,row1) + vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0) + vext.8 d27, d6, d7, #3 @//extract a[3] (column2,row1) + vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1) + vext.8 d31, d2, d3, #1 @//extract a[1] (column1,row0) + vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row1) + vext.8 d30, d3, d4, #1 @//extract a[1] (column2,row0) + vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vext.8 d28, d5, d6, #1 @//extract a[1] (column1,row1) + vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + vext.8 d27, d6, d7, #1 @//extract a[1] (column2,row1) + vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) + vext.8 d31, d2, d3, #4 @//extract a[4] (column1,row0) + vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1) + vext.8 d30, d3, d4, #4 @//extract a[4] (column2,row0) + vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + vext.8 d28, d5, d6, #4 @//extract a[4] (column1,row1) + vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + vext.8 d27, d6, d7, #4 @//extract a[4] (column2,row1) + vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) + vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1) + vld1.32 {d12, d13}, [r7], r2 @Load value for interpolation (column1,row0) + vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vqrshrun.s16 d21, q5, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row2) + vrhadd.u8 q10, q6, q10 
@Interpolation step for qpel calculation + vqrshrun.s16 d18, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) + vst1.8 {d20, d21}, [r1], r3 @//Store dest row0 + vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row2) + vqrshrun.s16 d19, q8, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1) + vld1.32 {d12, d13}, [r7], r2 @Load value for interpolation (column1,row1) + vrhadd.u8 q9, q6, q9 @Interpolation step for qpel calculation + vst1.8 {d18, d19}, [r1], r3 @//Store dest row1 + subs r5, r5, #2 @ 2 rows done, decrement by 2 + + beq end_func + b loop_16 + +loop_8: +@// Processing row0 and row1 + + vld1.8 {d5, d6}, [r0], r2 @// Load row1 + vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1) + vld1.8 {d2, d3}, [r0], r2 @// Load row0 + vext.8 d25, d5, d6, #2 @//extract a[2] (column1,row1) + vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0) + vext.8 d24, d5, d6, #3 @//extract a[3] (column1,row1) + vext.8 d23, d5, d6, #1 @//extract a[1] (column1,row1) + vext.8 d22, d5, d6, #4 @//extract a[4] (column1,row1) + vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1) + vext.8 d29, d2, d3, #3 @//extract a[3] (column1,row0) + vmlal.u8 q7, d25, d1 @// a0 + a5 + 20a2 (column1,row1) + vmlal.u8 q7, d24, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1) + vmlsl.u8 q7, d23, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) + vmlsl.u8 q7, d22, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) + vext.8 d30, d2, d3, #2 @//extract a[2] (column1,row0) + vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0) + vext.8 d27, d2, d3, #1 @//extract a[1] (column1,row0) + vext.8 d26, d2, d3, #4 @//extract a[4] (column1,row0) + vmlal.u8 q4, d29, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlal.u8 q4, d30, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlsl.u8 q4, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q4, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + vqrshrun.s16 d18, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 
(column1,row0) + vld1.32 d12, [r7], r2 @Load value for interpolation (column1,row0) + vld1.32 d13, [r7], r2 @Load value for interpolation (column1,row1) + vqrshrun.s16 d19, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) + vrhadd.u8 q9, q6, q9 @Interpolation step for qpel calculation + vst1.8 {d18}, [r1], r3 @//Store dest row0 + vst1.8 {d19}, [r1], r3 @//Store dest row1 + subs r5, r5, #2 @ 2 rows done, decrement by 2 + + beq end_func @ Branch if height==4 + b loop_8 @looping if height == 8 or 16 + +loop_4: + vld1.8 {d5, d6}, [r0], r2 @// Load row1 + vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1) + vld1.8 {d2, d3}, [r0], r2 @// Load row0 + vext.8 d25, d5, d6, #2 @//extract a[2] (column1,row1) + vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0) + vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1) + vext.8 d24, d5, d6, #3 @//extract a[3] (column1,row1) + vext.8 d23, d5, d6, #1 @//extract a[1] (column1,row1) + vext.8 d22, d5, d6, #4 @//extract a[4] (column1,row1) + vext.8 d29, d2, d3, #3 @//extract a[3] (column1,row0) + vmlal.u8 q7, d25, d1 @// a0 + a5 + 20a2 (column1,row1) + vmlal.u8 q7, d24, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1) + vmlsl.u8 q7, d23, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) + vmlsl.u8 q7, d22, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) + vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0) + vext.8 d30, d2, d3, #2 @//extract a[2] (column1,row0) + vld1.32 d12, [r7], r2 @Load value for interpolation (column1,row0) + vld1.32 d13, [r7], r2 @Load value for interpolation (column1,row1) + vext.8 d27, d2, d3, #1 @//extract a[1] (column1,row0) + vext.8 d26, d2, d3, #4 @//extract a[4] (column1,row0) + vmlal.u8 q4, d29, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlal.u8 q4, d30, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlsl.u8 q4, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q4, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + vqrshrun.s16 d18, q7, #5 @// (a0 + a5 + 20a2 
+ 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vqrshrun.s16 d19, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) + vrhadd.u8 q9, q6, q9 @Interpolation step for qpel calculation + vst1.32 d18[0], [r1], r3 @//Store dest row0 + vst1.32 d19[0], [r1], r3 @//Store dest row1 + + subs r5, r5, #2 @ 2 rows done, decrement by 2 + beq end_func + + b loop_4 + +end_func: + vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s new file mode 100755 index 0000000..565cc80 --- /dev/null +++ b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s @@ -0,0 +1,505 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s +@* +@* @brief +@* Contains function definitions for inter prediction interpolation. 
+@* +@* @author +@* Mohit +@* +@* @par List of Functions: +@* +@* - ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_inter_pred_filters.c +@ + +@/** +@/** +@/** +@******************************************************************************* +@* +@* @brief +@* This function implements a two stage cascaded six tap filter. It +@* applies the six tap filter in the vertical direction on the +@* predictor values, followed by applying the same filter in the +@* horizontal direction on the output of the first stage. It then averages +@* the output of the 1st stage and the final stage to obtain the quarter +@* pel values.The six tap filtering operation is described in sec 8.4.2.2.1 +@* titled "Luma sample interpolation process". +@* +@* @par Description: +@* This function is called to obtain pixels lying at the following +@* location (1/4,1/2) or (3/4,1/2). The function interpolates +@* the predictors first in the verical direction and then in the +@* horizontal direction to output the (1/2,1/2). It then averages +@* the output of the 2nd stage and (1/2,1/2) value to obtain (1/4,1/2) +@* or (3/4,1/2) depending on the offset. 
+@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @param[in] pu1_tmp: temporary buffer +@* +@* @param[in] dydx: x and y reference offset for qpel calculations +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/; + +@void ih264_inter_pred_luma_horz_qpel_vert_hpel(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd,, +@ WORD32 dst_strd, +@ WORD32 ht, +@ WORD32 wd, +@ UWORD8* pu1_tmp, +@ UWORD32 dydx) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ht +@ r5 => wd +@ r6 => dydx +@ r9 => *pu1_tmp + +.text +.p2align 2 + + .global ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q + +ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + vstmdb sp!, {d8-d15} @push neon registers to stack + ldr r4, [sp, #104] @ loads ht + sub r0, r0, r2, lsl #1 @pu1_src-2*src_strd + sub r0, r0, #2 @pu1_src-2 + ldr r5, [sp, #108] @ loads wd + ldr r6, [sp, #116] @ loads dydx + and r6, r6, #2 @ dydx & 0x3 followed by dydx>>1 and dydx<<1 + ldr r9, [sp, #112] @pu1_tmp + add r7, r9, #4 + add r6, r7, r6 @ pi16_pred1_temp += (x_offset>>1) + + vmov.u16 q13, #0x14 @ Filter coeff 20 into Q13 + vmov.u16 q12, #0x5 @ Filter coeff 5 into Q12 + mov r7, #0x20 + mov r8, #0x30 + subs r12, r5, #4 @if wd=4 branch to loop_4 + beq loop_4 + + subs r12, r5, #8 @if wd=8 branch to loop_8 + beq loop_8 + + @when wd=16 + vmov.u16 q14, #0x14 @ Filter coeff 20 into Q13 + vmov.u16 q15, #0x5 @ Filter coeff 5 into Q12 + add r14, r2, #0 + sub r2, r2, #16 + + +loop_16: + + vld1.u32 
{q0}, [r0]! @ Vector load from src[0_0] + vld1.u32 d12, [r0], r2 @ Vector load from src[0_0] + vld1.u32 {q1}, [r0]! @ Vector load from src[1_0] + vld1.u32 d13, [r0], r2 @ Vector load from src[1_0] + vld1.u32 {q2}, [r0]! @ Vector load from src[2_0] + vld1.u32 d14, [r0], r2 @ Vector load from src[2_0] + vld1.u32 {q3}, [r0]! @ Vector load from src[3_0] + vld1.u32 d15, [r0], r2 @ Vector load from src[3_0] + vld1.u32 {q4}, [r0]! @ Vector load from src[4_0] + vld1.u32 d16, [r0], r2 @ Vector load from src[4_0] + + vld1.u32 {q5}, [r0]! @ Vector load from src[5_0] + vld1.u32 d17, [r0], r2 @ Vector load from src[5_0] + + vaddl.u8 q10, d4, d6 + vaddl.u8 q9, d0, d10 + vaddl.u8 q11, d2, d8 + vmla.u16 q9, q10, q14 + vaddl.u8 q12, d5, d7 + vaddl.u8 q10, d1, d11 + vaddl.u8 q13, d3, d9 + vmla.u16 q10, q12, q14 + vaddl.u8 q12, d14, d15 + vmls.u16 q9, q11, q15 + vaddl.u8 q11, d12, d17 + vmls.u16 q10, q13, q15 + vaddl.u8 q13, d13, d16 + vmla.u16 q11, q12, q14 + vmls.u16 q11, q13, q15 + vst1.32 {q9}, [r9]! + vst1.32 {q10}, [r9]! + vext.16 q12, q9, q10, #2 + vext.16 q13, q9, q10, #3 + vst1.32 {q11}, [r9] + vext.16 q11, q9, q10, #5 + vadd.s16 q0, q12, q13 + vext.16 q12, q9, q10, #1 + vext.16 q13, q9, q10, #4 + vadd.s16 q12, q12, q13 + + vaddl.s16 q13, d18, d22 + vmlal.s16 q13, d0, d28 + vmlsl.s16 q13, d24, d30 + + vaddl.s16 q11, d19, d23 + vmlal.s16 q11, d1, d28 + vmlsl.s16 q11, d25, d30 + + vqrshrun.s32 d18, q13, #10 + vqrshrun.s32 d19, q11, #10 + vld1.32 {q11}, [r9]! + vqmovn.u16 d18, q9 + + vext.16 q12, q10, q11, #2 + vext.16 q13, q10, q11, #3 + vext.16 q0, q10, q11, #5 + vst1.32 d18, [r1] + vadd.s16 q9, q12, q13 + vext.16 q12, q10, q11, #1 + vext.16 q13, q10, q11, #4 + vadd.s16 q12, q12, q13 + + vaddl.s16 q13, d0, d20 + vmlal.s16 q13, d18, d28 + vmlsl.s16 q13, d24, d30 + + vaddl.s16 q11, d1, d21 + vmlal.s16 q11, d19, d28 + vmlsl.s16 q11, d25, d30 + + vqrshrun.s32 d18, q13, #10 + vqrshrun.s32 d19, q11, #10 + + vaddl.u8 q12, d7, d9 + vld1.32 {q10}, [r6]! 
+ vld1.32 {q11}, [r6], r7 + + vqmovn.u16 d19, q9 + + vld1.32 d18, [r1] + vqrshrun.s16 d20, q10, #5 + vqrshrun.s16 d21, q11, #5 + vaddl.u8 q11, d4, d10 + vld1.u32 {q0}, [r0]! @ Vector load from src[6_0] + vrhadd.u8 q9, q9, q10 + vld1.u32 d12, [r0], r2 @ Vector load from src[6_0] + vaddl.u8 q10, d6, d8 + vaddl.u8 q13, d5, d11 + vst1.32 {q9}, [r1], r3 @ store row 0 + +@ROW_2 + + vaddl.u8 q9, d2, d0 + + vmla.u16 q9, q10, q14 + + vaddl.u8 q10, d3, d1 + + vmla.u16 q10, q12, q14 + vaddl.u8 q12, d15, d16 + vmls.u16 q9, q11, q15 + vaddl.u8 q11, d13, d12 + vmls.u16 q10, q13, q15 + vaddl.u8 q13, d14, d17 + vmla.u16 q11, q12, q14 + vmls.u16 q11, q13, q15 + vst1.32 {q9}, [r9]! + vst1.32 {q10}, [r9]! + vext.16 q12, q9, q10, #2 + vext.16 q13, q9, q10, #3 + vst1.32 {q11}, [r9] + vext.16 q11, q9, q10, #5 + vadd.s16 q1, q12, q13 + vext.16 q12, q9, q10, #1 + vext.16 q13, q9, q10, #4 + vadd.s16 q12, q12, q13 + + vaddl.s16 q13, d18, d22 + vmlal.s16 q13, d2, d28 + vmlsl.s16 q13, d24, d30 + + vaddl.s16 q11, d19, d23 + vmlal.s16 q11, d3, d28 + vmlsl.s16 q11, d25, d30 + + vqrshrun.s32 d18, q13, #10 + vqrshrun.s32 d19, q11, #10 + vld1.32 {q11}, [r9]! + vqmovn.u16 d18, q9 + + vext.16 q12, q10, q11, #2 + vext.16 q13, q10, q11, #3 + vext.16 q1, q10, q11, #5 + vst1.32 d18, [r1] + vadd.s16 q9, q12, q13 + vext.16 q12, q10, q11, #1 + vext.16 q13, q10, q11, #4 + vadd.s16 q12, q12, q13 + + vaddl.s16 q13, d2, d20 + vmlal.s16 q13, d18, d28 + vmlsl.s16 q13, d24, d30 + + vaddl.s16 q11, d3, d21 + vmlal.s16 q11, d19, d28 + vmlsl.s16 q11, d25, d30 + + vqrshrun.s32 d18, q13, #10 + vqrshrun.s32 d19, q11, #10 + vaddl.u8 q12, d9, d11 + vld1.32 {q10}, [r6]! 
+ vld1.32 {q11}, [r6], r7 + vqmovn.u16 d19, q9 + vld1.32 d18, [r1] + vqrshrun.s16 d20, q10, #5 + vqrshrun.s16 d21, q11, #5 + + vrhadd.u8 q9, q9, q10 + + vst1.32 {q9}, [r1], r3 @ store row 1 + + subs r4, r4, #2 + subne r0, r0 , r14, lsl #2 + subne r0, r0, r14 + + beq end_func @ Branch if height==4 + b loop_16 @ Loop if height==8 + +loop_8: + vld1.u32 {q0}, [r0], r2 @ Vector load from src[0_0] + vld1.u32 {q1}, [r0], r2 @ Vector load from src[1_0] + vld1.u32 {q2}, [r0], r2 @ Vector load from src[2_0] + vld1.u32 {q3}, [r0], r2 @ Vector load from src[3_0] + vld1.u32 {q4}, [r0], r2 @ Vector load from src[4_0] + + vld1.u32 {q5}, [r0], r2 @ Vector load from src[5_0] + vaddl.u8 q7, d4, d6 + vaddl.u8 q6, d0, d10 + vaddl.u8 q8, d2, d8 + vmla.u16 q6, q7, q13 + vaddl.u8 q9, d5, d7 + vaddl.u8 q7, d1, d11 + vaddl.u8 q11, d3, d9 + vmla.u16 q7, q9, q13 + vmls.u16 q6, q8, q12 + vld1.32 {q0}, [r0], r2 @ Vector load from src[6_0] + vaddl.u8 q8, d6, d8 + vmls.u16 q7, q11, q12 + vaddl.u8 q14, d2, d0 + vst1.32 {q6}, [r9]! @ store row 0 to temp buffer: col 0 + vext.16 q11, q6, q7, #5 + vaddl.u8 q9, d4, d10 + vmla.u16 q14, q8, q13 + vaddl.s16 q15, d12, d22 + vst1.32 {q7}, [r9], r7 @ store row 0 to temp buffer: col 1 + vaddl.s16 q11, d13, d23 + vext.16 q8, q6, q7, #2 + vmls.u16 q14, q9, q12 + vext.16 q9, q6, q7, #3 + vext.16 q10, q6, q7, #4 + vext.16 q7, q6, q7, #1 + vadd.s16 q8, q8, q9 + vadd.s16 q9, q7, q10 + vaddl.u8 q10, d7, d9 + vmlal.s16 q15, d16, d26 + vmlsl.s16 q15, d18, d24 + vmlal.s16 q11, d17, d26 + vmlsl.s16 q11, d19, d24 + vaddl.u8 q7, d3, d1 + vst1.32 {q14}, [r9]! 
@ store row 1 to temp buffer: col 0 + vmla.u16 q7, q10, q13 + vqrshrun.s32 d12, q15, #10 + vaddl.u8 q8, d5, d11 + vqrshrun.s32 d13, q11, #10 + vmls.u16 q7, q8, q12 +@ vld1.32 {q1},[r0],r2 ; Vector load from src[7_0] + vqmovn.u16 d25, q6 + vaddl.u8 q8, d8, d10 + + + vext.16 q11, q14, q7, #5 + vaddl.u8 q10, d4, d2 + vaddl.s16 q15, d28, d22 + vmla.u16 q10, q8, q13 + vst1.32 {q7}, [r9], r7 @ store row 1 to temp buffer: col 1 + vaddl.s16 q11, d29, d23 + vext.16 q8, q14, q7, #2 + vext.16 q9, q14, q7, #3 + vext.16 q6, q14, q7, #4 + vext.16 q7, q14, q7, #1 + vadd.s16 q8, q8, q9 + vadd.s16 q9, q6, q7 + vld1.32 {q7}, [r6], r8 @ load row 0 from temp buffer + vmlal.s16 q15, d16, d26 + vmlsl.s16 q15, d18, d24 + vmlal.s16 q11, d17, d26 + vmlsl.s16 q11, d19, d24 + vqrshrun.s16 d14, q7, #0x5 + vld1.32 {q14}, [r6], r8 @ load row 1 from temp buffer + vaddl.u8 q9, d6, d0 + vqrshrun.s32 d16, q15, #10 + vqrshrun.s16 d15, q14, #0x5 + vqrshrun.s32 d17, q11, #10 + vmov d12, d25 + vmov d25, d24 + + vqmovn.u16 d13, q8 + vrhadd.u8 q6, q6, q7 + + vst1.32 d12, [r1], r3 @ store row 0 + vst1.32 d13, [r1], r3 @ store row 1 + + subs r4, r4, #2 + subne r0, r0 , r2, lsl #2 + subne r0, r0, r2 + + beq end_func @ Branch if height==4 + b loop_8 @ Loop if height==8 + +loop_4: + vld1.u32 {q0}, [r0], r2 @ Vector load from src[0_0] + vld1.u32 {q1}, [r0], r2 @ Vector load from src[1_0] + vld1.u32 {q2}, [r0], r2 @ Vector load from src[2_0] + vld1.u32 {q3}, [r0], r2 @ Vector load from src[3_0] + vld1.u32 {q4}, [r0], r2 @ Vector load from src[4_0] + vld1.u32 {q5}, [r0], r2 @ Vector load from src[5_0] + + vaddl.u8 q7, d4, d6 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q6, d0, d10 @ temp = src[0_0] + src[5_0] + vaddl.u8 q8, d2, d8 @ temp2 = src[1_0] + src[4_0] + vmla.u16 q6, q7, q13 @ temp += temp1 * 20 + vaddl.u8 q9, d5, d7 @ temp1 = src[2_0] + src[3_0] + vaddl.u8 q7, d1, d11 @ temp = src[0_0] + src[5_0] + vaddl.u8 q11, d3, d9 @ temp2 = src[1_0] + src[4_0] + vmla.u16 q7, q9, q13 @ temp += temp1 * 20 + vmls.u16 q6, 
q8, q12 @ temp -= temp2 * 5 + vld1.32 {q0}, [r0], r2 @ Vector load from src[6_0] + vaddl.u8 q8, d6, d8 + vmls.u16 q7, q11, q12 @ temp -= temp2 * 5 + @Q6 and Q7 have filtered values + vaddl.u8 q14, d2, d0 + vst1.32 {q6}, [r9]! @ store row 0 to temp buffer: col 0 + vext.16 q11, q6, q7, #5 + vaddl.u8 q9, d4, d10 + vmla.u16 q14, q8, q13 + vaddl.s16 q15, d12, d22 + vst1.32 {q7}, [r9], r7 @ store row 0 to temp buffer: col 1 + vaddl.s16 q11, d13, d23 + vext.16 q8, q6, q7, #2 + vmls.u16 q14, q9, q12 + vext.16 q9, q6, q7, #3 + vext.16 q10, q6, q7, #4 + vext.16 q7, q6, q7, #1 + vadd.s16 q8, q8, q9 + vadd.s16 q9, q7, q10 + vaddl.u8 q10, d7, d9 + vmlal.s16 q15, d16, d26 + vmlsl.s16 q15, d18, d24 + vmlal.s16 q11, d17, d26 + vmlsl.s16 q11, d19, d24 + vaddl.u8 q7, d3, d1 + vst1.32 {q14}, [r9]! @ store row 1 to temp buffer: col 0 + vmla.u16 q7, q10, q13 + vqrshrun.s32 d12, q15, #10 + vaddl.u8 q8, d5, d11 + vqrshrun.s32 d13, q11, #10 + vmls.u16 q7, q8, q12 + vqmovn.u16 d25, q6 + vaddl.u8 q8, d8, d10 + + vext.16 q11, q14, q7, #5 + vaddl.u8 q10, d4, d2 + vaddl.s16 q15, d28, d22 + vmla.u16 q10, q8, q13 + vst1.32 {q7}, [r9], r7 @ store row 1 to temp buffer: col 1 + vaddl.s16 q11, d29, d23 + vext.16 q8, q14, q7, #2 + vext.16 q9, q14, q7, #3 + vext.16 q6, q14, q7, #4 + vext.16 q7, q14, q7, #1 + vadd.s16 q8, q8, q9 + vadd.s16 q9, q6, q7 + vld1.32 d14, [r6], r8 @load row 0 from temp buffer + vmlal.s16 q15, d16, d26 + vmlsl.s16 q15, d18, d24 + vmlal.s16 q11, d17, d26 + vmlsl.s16 q11, d19, d24 + vqrshrun.s16 d14, q7, #0x5 + vld1.32 d28, [r6], r8 @load row 1 from temp buffer + vaddl.u8 q9, d6, d0 + vqrshrun.s32 d16, q15, #10 + vqrshrun.s16 d15, q14, #0x5 + vqrshrun.s32 d17, q11, #10 + vmov d12, d25 + vmov d25, d24 + + vqmovn.u16 d13, q8 + vrhadd.u8 q6, q6, q7 + vst1.32 d12[0], [r1], r3 @ store row 0 + vst1.32 d13[0], [r1], r3 @store row 1 + + subs r4, r4, #2 + subne r0, r0 , r2, lsl #2 + subne r0, r0, r2 + + beq end_func @ Branch if height==4 + b loop_4 @ Loop if height==8 + +end_func: + 
vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s new file mode 100755 index 0000000..3c8b60a --- /dev/null +++ b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s @@ -0,0 +1,355 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s +@* +@* @brief +@* Contains function definitions for inter prediction interpolation. 
+@* +@* @author +@* Mohit +@* +@* @par List of Functions: +@* +@* - ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_inter_pred_filters.c +@ + +@/** +@/** +@/** +@******************************************************************************* +@* +@* @brief +@* This function implements two six tap filters. It +@* applies the six tap filter in the horizontal direction on the +@* predictor values, then applies the same filter in the +@* vertical direction on the predictor values. It then averages these +@* two outputs to obtain quarter pel values in horizontal and vertical direction. +@* The six tap filtering operation is described in sec 8.4.2.2.1 titled +@* "Luma sample interpolation process" +@* +@* @par Description: +@* This function is called to obtain pixels lying at the following +@* location (1/4,1/4) or (3/4,1/4) or (1/4,3/4) or (3/4,3/4). +@* The function interpolates the predictors first in the horizontal direction +@* and then in the vertical direction, and then averages these two +@* values. 
+@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @param[in] pu1_tmp: temporary buffer +@* +@* @param[in] dydx: x and y reference offset for qpel calculations +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/; + +@void ih264_inter_pred_luma_horz_qpel_vert_qpel(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd,, +@ WORD32 dst_strd, +@ WORD32 ht, +@ WORD32 wd, +@ UWORD8* pu1_tmp, +@ UWORD32 dydx) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ht +@ r5 => wd +@ r6 => dydx + +.text +.p2align 2 + + .global ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q + +ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + vstmdb sp!, {d8-d15} @push neon registers to stack + ldr r4, [sp, #104] @ loads ht + ldr r5, [sp, #108] @ loads wd + ldr r6, [sp, #116] @dydx + and r7, r6, #3 + add r7, r0, r7, lsr #1 @pu1_pred_vert = pu1_src + (x_offset>>1) + + and r6, r6, #12 @Finds y-offset + lsr r6, r6, #3 @dydx>>3 + mul r6, r2, r6 + add r6, r0, r6 @pu1_pred_horz = pu1_src + (y_offset>>1)*src_strd + sub r7, r7, r2, lsl #1 @pu1_pred_vert-2*src_strd + sub r6, r6, #2 @pu1_pred_horz-2 + vmov.u8 d30, #20 @ Filter coeff 20 + vmov.u8 d31, #5 @ Filter coeff 5 + + subs r12, r5, #4 @if wd=4 branch to loop_4 + beq loop_4 + subs r12, r5, #8 @if wd=8 branch to loop_8 + beq loop_8 + +loop_16: + vld1.32 {q0}, [r7], r2 @ Vector load from src[0_0] + vld1.32 {q1}, [r7], r2 @ Vector load from src[1_0] + vld1.32 {q2}, [r7], r2 @ Vector load from src[2_0] + 
vld1.32 {q3}, [r7], r2 @ Vector load from src[3_0] + vld1.32 {q4}, [r7], r2 @ Vector load from src[4_0] + add r11, r6, #8 + vld1.32 {q5}, [r7], r2 @ Vector load from src[5_0] + vld1.32 {q9}, [r6], r2 @ horz row0, col 0 + vaddl.u8 q12, d0, d10 + vmlal.u8 q12, d4, d30 + vmlal.u8 q12, d6, d30 + vmlsl.u8 q12, d2, d31 + vmlsl.u8 q12, d8, d31 + vext.8 d23, d18, d19, #5 + vext.8 d20, d18, d19, #2 + vext.8 d21, d18, d19, #3 + vext.8 d22, d18, d19, #4 + vext.8 d19, d18, d19, #1 + vqrshrun.s16 d26, q12, #5 + vaddl.u8 q14, d18, d23 + vmlal.u8 q14, d20, d30 + vmlal.u8 q14, d21, d30 + vmlsl.u8 q14, d19, d31 + vmlsl.u8 q14, d22, d31 + vld1.32 {q9}, [r11], r2 @ horz row 0, col 1 + vaddl.u8 q12, d1, d11 + vmlal.u8 q12, d5, d30 + vmlal.u8 q12, d7, d30 + vmlsl.u8 q12, d3, d31 + vmlsl.u8 q12, d9, d31 + vqrshrun.s16 d28, q14, #5 + vext.8 d23, d18, d19, #5 + vext.8 d20, d18, d19, #2 + vext.8 d21, d18, d19, #3 + vext.8 d22, d18, d19, #4 + vext.8 d19, d18, d19, #1 + vqrshrun.s16 d27, q12, #5 + vld1.32 {q6}, [r7], r2 @ src[6_0] + + vaddl.u8 q12, d18, d23 + vmlal.u8 q12, d20, d30 + vmlal.u8 q12, d21, d30 + vmlsl.u8 q12, d19, d31 + vmlsl.u8 q12, d22, d31 + + vaddl.u8 q8, d2, d12 + vmlal.u8 q8, d6, d30 + vmlal.u8 q8, d8, d30 + vmlsl.u8 q8, d4, d31 + vmlsl.u8 q8, d10, d31 + vqrshrun.s16 d29, q12, #5 + vld1.32 {q9}, [r6], r2 @ horz row 1, col 0 + + vaddl.u8 q12, d3, d13 + vmlal.u8 q12, d7, d30 + vmlal.u8 q12, d9, d30 + vmlsl.u8 q12, d5, d31 + vmlsl.u8 q12, d11, d31 + vrhadd.u8 q14, q14, q13 + vqrshrun.s16 d26, q8, #5 + vext.8 d23, d18, d19, #5 + vext.8 d20, d18, d19, #2 + vext.8 d21, d18, d19, #3 + vext.8 d22, d18, d19, #4 + vst1.32 {q14}, [r1], r3 @ store row 0 + vext.8 d19, d18, d19, #1 + vqrshrun.s16 d27, q12, #5 + + vaddl.u8 q14, d18, d23 + vmlal.u8 q14, d20, d30 + vmlal.u8 q14, d21, d30 + vmlsl.u8 q14, d19, d31 + vmlsl.u8 q14, d22, d31 + + vld1.32 {q9}, [r11], r2 @ horz row 1, col 1 + + vext.8 d23, d18, d19, #5 + vext.8 d20, d18, d19, #2 + vext.8 d21, d18, d19, #3 + vext.8 d22, d18, d19, 
#4 + vext.8 d19, d18, d19, #1 + + vqrshrun.s16 d28, q14, #5 + vaddl.u8 q12, d18, d23 + vmlal.u8 q12, d20, d30 + vmlal.u8 q12, d21, d30 + vmlsl.u8 q12, d19, d31 + vmlsl.u8 q12, d22, d31 + + vqrshrun.s16 d29, q12, #5 + vrhadd.u8 q14, q14, q13 + vst1.32 {q14}, [r1], r3 @ store row 1 + + subs r4, r4, #2 @ 2 rows processed, decrement by 2 + subne r7, r7 , r2, lsl #2 + subne r7, r7, r2 + beq end_func @ Branch if height==4 + + b loop_16 @ looping if height = 8 or 16 + + +loop_8: + vld1.32 d0, [r7], r2 @ Vector load from src[0_0] + vld1.32 d1, [r7], r2 @ Vector load from src[1_0] + vld1.32 d2, [r7], r2 @ Vector load from src[2_0] + vld1.32 d3, [r7], r2 @ Vector load from src[3_0] + vld1.32 d4, [r7], r2 @ Vector load from src[4_0] + vld1.32 d5, [r7], r2 @ Vector load from src[5_0] + vaddl.u8 q5, d0, d5 + vmlal.u8 q5, d2, d30 + vmlal.u8 q5, d3, d30 + vmlsl.u8 q5, d1, d31 + vmlsl.u8 q5, d4, d31 + vld1.32 {q6}, [r6], r2 @horz row 0 + vext.8 d17, d12, d13, #5 + vext.8 d14, d12, d13, #2 + vext.8 d15, d12, d13, #3 + vext.8 d16, d12, d13, #4 + vext.8 d13, d12, d13, #1 + vqrshrun.s16 d26, q5, #5 + vld1.32 d6, [r7], r2 @ src[6_0] + vaddl.u8 q5, d12, d17 + vmlal.u8 q5, d14, d30 + vmlal.u8 q5, d15, d30 + vmlsl.u8 q5, d13, d31 + vmlsl.u8 q5, d16, d31 + vld1.32 {q6}, [r6], r2 @ horz row 1 + vaddl.u8 q9, d1, d6 + vmlal.u8 q9, d3, d30 + vmlal.u8 q9, d4, d30 + vmlsl.u8 q9, d2, d31 + vmlsl.u8 q9, d5, d31 + vqrshrun.s16 d28, q5, #5 + vext.8 d17, d12, d13, #5 + vext.8 d14, d12, d13, #2 + vext.8 d15, d12, d13, #3 + vext.8 d16, d12, d13, #4 + vext.8 d13, d12, d13, #1 + vqrshrun.s16 d27, q9, #5 + vaddl.u8 q5, d12, d17 + vmlal.u8 q5, d14, d30 + vmlal.u8 q5, d15, d30 + vmlsl.u8 q5, d13, d31 + vmlsl.u8 q5, d16, d31 + vqrshrun.s16 d29, q5, #5 + vrhadd.u8 q13, q13, q14 + vst1.32 d26, [r1], r3 + vst1.32 d27, [r1], r3 + + subs r4, r4, #2 @ 2 rows processed, decrement by 2 + subne r7, r7 , r2, lsl #2 + subne r7, r7, r2 + beq end_func @ Branch if height==4 + b loop_8 @looping if height == 8 or 16 + 
+loop_4: + vld1.32 d0[0], [r7], r2 @ Vector load from src[0_0] + vld1.32 d1[0], [r7], r2 @ Vector load from src[1_0] + vld1.32 d2[0], [r7], r2 @ Vector load from src[2_0] + vld1.32 d3[0], [r7], r2 @ Vector load from src[3_0] + vld1.32 d4[0], [r7], r2 @ Vector load from src[4_0] + vld1.32 d5[0], [r7], r2 @ Vector load from src[5_0] + vaddl.u8 q5, d0, d5 + vmlal.u8 q5, d2, d30 + vmlal.u8 q5, d3, d30 + vmlsl.u8 q5, d1, d31 + vmlsl.u8 q5, d4, d31 + vld1.32 {q6}, [r6], r2 @load for horz filter row 0 + vext.8 d17, d12, d13, #5 + vext.8 d14, d12, d13, #2 + vext.8 d15, d12, d13, #3 + vext.8 d16, d12, d13, #4 + vext.8 d13, d12, d13, #1 + vqrshrun.s16 d26, q5, #5 + vld1.32 d6[0], [r7], r2 @ Vector load from src[6_0] + vaddl.u8 q5, d12, d17 + vmlal.u8 q5, d14, d30 + vmlal.u8 q5, d15, d30 + vmlsl.u8 q5, d13, d31 + vmlsl.u8 q5, d16, d31 + vld1.32 {q6}, [r6], r2 @horz row 1 + vaddl.u8 q9, d1, d6 + vmlal.u8 q9, d3, d30 + vmlal.u8 q9, d4, d30 + vmlsl.u8 q9, d2, d31 + vmlsl.u8 q9, d5, d31 + vqrshrun.s16 d28, q5, #5 + vext.8 d17, d12, d13, #5 + vext.8 d14, d12, d13, #2 + vext.8 d15, d12, d13, #3 + vext.8 d16, d12, d13, #4 + vext.8 d13, d12, d13, #1 + vqrshrun.s16 d27, q9, #5 + vaddl.u8 q5, d12, d17 + vmlal.u8 q5, d14, d30 + vmlal.u8 q5, d15, d30 + vmlsl.u8 q5, d13, d31 + vmlsl.u8 q5, d16, d31 + vqrshrun.s16 d29, q5, #5 + vrhadd.u8 q13, q13, q14 + vst1.32 d26[0], [r1], r3 + vst1.32 d27[0], [r1], r3 + + subs r4, r4, #2 @ 2 rows processed, decrement by 2 + subne r7, r7 , r2, lsl #2 + subne r7, r7, r2 + beq end_func @ Branch if height==4 + b loop_4 @ Loop if height==8 +end_func: + vldmia sp!, {d8-d15} @ Restore neon registers that were saved + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + diff --git a/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s new file mode 100755 index 0000000..d45055e --- /dev/null +++ b/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s @@ -0,0 +1,330 @@ 
+@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_inter_pred_luma_vert_qpel_a9q.s +@* +@* @brief +@* Contains function definitions for inter prediction vertical quarter pel interpolation. 
+@* +@* @author +@* Mohit +@* +@* @par List of Functions: +@* +@* - ih264_inter_pred_luma_vert_qpel_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_inter_pred_filters.c +@ + +@/** +@/** +@******************************************************************************* +@* +@* @brief +@* Quarter pel interprediction luma filter for vertical input +@* +@* @par Description: +@* Applies a 6 tap horizontal filter .The output is clipped to 8 bits +@* sec 8.4.2.2.1 titled "Luma sample interpolation process" +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @param[in] pu1_tmp: temporary buffer: UNUSED in this function +@* +@* @param[in] dydx: x and y reference offset for qpel calculations. 
@* @returns
@*
@ @remarks
@* None
@*
@*******************************************************************************
@*/

@void ih264_inter_pred_luma_vert (
@            UWORD8 *pu1_src,
@            UWORD8 *pu1_dst,
@            WORD32 src_strd,
@            WORD32 dst_strd,
@            WORD32 ht,
@            WORD32 wd,
@            UWORD8* pu1_tmp,
@            UWORD32 dydx)

@**************Variables Vs Registers*****************************************
@   r0 => *pu1_src
@   r1 => *pu1_dst
@   r2 => src_strd
@   r3 => dst_strd
@   r5 => ht
@   r6 => wd
@   r7 => dydx

.text
.p2align 2

    .global ih264_inter_pred_luma_vert_qpel_a9q

@ Vertical quarter-pel luma interpolation: a 6-tap vertical half-pel filter
@ (coefficients 1,-5,20,20,-5,1 — see q11 = 20 and q12 = 5 below) followed by
@ a rounding average (vrhadd) with the nearest full-pel row selected from dydx.
@ Three width-specialised paths: loop_16 (wd=16), loop_8 (wd=8), loop_4 (wd=4).
ih264_inter_pred_luma_vert_qpel_a9q:

    stmfd sp!, {r4-r12, r14}        @store register values to stack
    vstmdb sp!, {d8-d15}            @push neon registers to stack
    ldr r5, [sp, #104]              @Loads ht (offsets account for the 10 GPRs + 8 D-regs pushed above)

    ldr r6, [sp, #108]              @Loads wd
    ldr r7, [sp, #116]              @Loads dydx
    and r7, r7, #12                 @Finds y-offset
    lsr r7, r7, #3                  @dydx>>3
    mul r7, r2, r7
    add r7, r0, r7                  @pu1_src + (y_offset>>1)*src_strd — full-pel row used for qpel averaging
    vmov.u16 q11, #20               @ Filter coeff 0x14 into Q11
    sub r0, r0, r2, lsl #1          @pu1_src-2*src_strd — back up 2 rows for the 6-tap filter support
    subs r12, r6, #8                @if wd=8 branch to loop_8
    vmov.u16 q12, #5                @ Filter coeff 0x5 into Q12
    beq loop_8

    subs r12, r6, #4                @if wd=4 branch to loop_4
    beq loop_4

loop_16:                            @when wd=16
    @ Each iteration filters and stores 4 output rows; loads and arithmetic
    @ are interleaved (software pipelined) to hide latencies.
    vld1.u32 {q0}, [r0], r2         @ Vector load from src[0_0]
    vld1.u32 {q1}, [r0], r2         @ Vector load from src[1_0]
    vld1.u32 {q2}, [r0], r2         @ Vector load from src[2_0]
    vld1.u32 {q3}, [r0], r2         @ Vector load from src[3_0]
    vld1.u32 {q4}, [r0], r2         @ Vector load from src[4_0]
    vaddl.u8 q6, d4, d6             @ temp1 = src[2_0] + src[3_0]
    vld1.u32 {q5}, [r0], r2         @ Vector load from src[5_0]
    vaddl.u8 q7, d0, d10            @ temp = src[0_0] + src[5_0]
    vaddl.u8 q8, d2, d8             @ temp2 = src[1_0] + src[4_0]
    vmla.u16 q7, q6, q11            @ temp += temp1 * 20
    vaddl.u8 q10, d1, d11           @ temp4 = src[0_8] + src[5_8]
    vaddl.u8 q9, d5, d7             @ temp3 = src[2_8] + src[3_8]
    vmla.u16 q10, q9, q11           @ temp4 += temp3 * 20
    vld1.u32 {q0}, [r0], r2         @ src[6_0] reuses q0 (row 0 taps no longer needed)
    vaddl.u8 q13, d3, d9            @ temp5 = src[1_8] + src[4_8]
    vaddl.u8 q6, d6, d8             @ row-1 output: src[3]+src[4] (centre taps)
    vmls.u16 q7, q8, q12            @ temp -= temp2 * 5
    vaddl.u8 q8, d2, d0             @ row-1 output: src[1]+src[6] (outer taps)
    vaddl.u8 q9, d4, d10            @ row-1 output: src[2]+src[5] (-5 taps)
    vmla.u16 q8, q6, q11
    vmls.u16 q10, q13, q12          @ temp4 -= temp5 * 5
    vaddl.u8 q13, d5, d11
    vaddl.u8 q6, d7, d9
    vqrshrun.s16 d30, q7, #5        @ dst[0_0] = CLIP_U8((temp +16) >> 5)
    vaddl.u8 q7, d3, d1
    vld1.u32 {q1}, [r0], r2         @ src[7_0]
    vmla.u16 q7, q6, q11
    vmls.u16 q8, q9, q12
    vqrshrun.s16 d31, q10, #5       @ dst[0_8] = CLIP_U8((temp4 +16) >> 5)
    vld1.u32 {q10}, [r7], r2        @ Load for interpolation row 0
    vrhadd.u8 q15, q10, q15         @ Interpolation to obtain qpel value
    vaddl.u8 q9, d4, d2
    vaddl.u8 q6, d8, d10

    vst1.u32 {q15}, [r1], r3        @ Vector store to dst[0_0]
    vmla.u16 q9, q6, q11
    vaddl.u8 q10, d6, d0
    vmls.u16 q7, q13, q12
    vqrshrun.s16 d30, q8, #5
    vaddl.u8 q6, d9, d11
    vaddl.u8 q8, d5, d3
    vaddl.u8 q13, d7, d1
    vmla.u16 q8, q6, q11
    vmls.u16 q9, q10, q12
    vld1.u32 {q2}, [r0], r2         @ src[8_0]

    vqrshrun.s16 d31, q7, #5
    vld1.u32 {q7}, [r7], r2         @ Load for interpolation row 1
    vaddl.u8 q6, d10, d0
    vrhadd.u8 q15, q7, q15          @ Interpolation to obtain qpel value
    vaddl.u8 q7, d6, d4
    vaddl.u8 q10, d8, d2
    vmla.u16 q7, q6, q11
    vmls.u16 q8, q13, q12
    vst1.u32 {q15}, [r1], r3        @store row 1
    vqrshrun.s16 d30, q9, #5
    vaddl.u8 q9, d7, d5
    vaddl.u8 q6, d11, d1
    vmla.u16 q9, q6, q11
    vaddl.u8 q13, d9, d3
    vmls.u16 q7, q10, q12
    vqrshrun.s16 d31, q8, #5
    vld1.u32 {q8}, [r7], r2         @ Load for interpolation row 2
    vmls.u16 q9, q13, q12
    vrhadd.u8 q15, q8, q15          @ Interpolation to obtain qpel value
    vaddl.u8 q6, d0, d2             @ temp1 = src[2_0] + src[3_0]
    vst1.u32 {q15}, [r1], r3        @store row 2
    vaddl.u8 q8, d10, d4            @ temp2 = src[1_0] + src[4_0]
    vaddl.u8 q10, d9, d7            @ temp4 = src[0_8] + src[5_8]
    vqrshrun.s16 d30, q7, #5
    vaddl.u8 q13, d5, d11           @ temp5 = src[1_8] + src[4_8]
    vaddl.u8 q7, d8, d6             @ temp = src[0_0] + src[5_0]
    vqrshrun.s16 d31, q9, #5
    vld1.u32 {q9}, [r7], r2         @ Load for interpolation row 3
    vmla.u16 q7, q6, q11            @ temp += temp1 * 20
    vrhadd.u8 q15, q9, q15          @ Interpolation to obtain qpel value
    vaddl.u8 q9, d1, d3             @ temp3 = src[2_8] + src[3_8]
    vst1.u32 {q15}, [r1], r3        @store row 3
    subs r5, r5, #4                 @ 4 rows processed, decrement by 4
    subne r0, r0 , r2, lsl #2       @ rewind src by 5 rows so the 6-tap window overlaps
    subne r0, r0, r2
    beq end_func                    @ Branch if height==4

    b loop_16                       @ looping if height = 8 or 16


loop_8:

    @// Processing row0 and row1
    vld1.u32 d0, [r0], r2           @ Vector load from src[0_0]
    vld1.u32 d1, [r0], r2           @ Vector load from src[1_0]
    vld1.u32 d2, [r0], r2           @ Vector load from src[2_0]
    vld1.u32 d3, [r0], r2           @ Vector load from src[3_0]
    vld1.u32 d4, [r0], r2           @ Vector load from src[4_0]
    vld1.u32 d5, [r0], r2           @ Vector load from src[5_0]

    vaddl.u8 q3, d2, d3             @ temp1 = src[2_0] + src[3_0]
    vaddl.u8 q4, d0, d5             @ temp = src[0_0] + src[5_0]
    vaddl.u8 q5, d1, d4             @ temp2 = src[1_0] + src[4_0]
    vmla.u16 q4, q3, q11            @ temp += temp1 * 20
    vld1.u32 d6, [r0], r2           @ src[6_0]
    vaddl.u8 q7, d3, d4
    vaddl.u8 q8, d1, d6
    vaddl.u8 q9, d2, d5
    vmls.u16 q4, q5, q12            @ temp -= temp2 * 5
    vmla.u16 q8, q7, q11
    vld1.u32 d7, [r0], r2           @ src[7_0]
    vaddl.u8 q10, d4, d5
    vaddl.u8 q6, d2, d7
    vaddl.u8 q5, d3, d6
    vmls.u16 q8, q9, q12
    vqrshrun.s16 d26, q4, #5        @ dst[0_0] = CLIP_U8( (temp + 16) >> 5)
    vmla.u16 q6, q10, q11
    vld1.32 d8, [r7], r2            @Load value for interpolation (row0)
    vld1.32 d9, [r7], r2            @Load value for interpolation (row1)
    vld1.u32 d0, [r0], r2           @ src[8_0]
    vaddl.u8 q7, d5, d6
    vqrshrun.s16 d27, q8, #5
    vrhadd.u8 q13, q4, q13          @ Interpolation step for qpel calculation (q4 = d8:d9 just loaded)
    vaddl.u8 q10, d3, d0
    vmls.u16 q6, q5, q12
    vst1.u32 d26, [r1], r3          @ Vector store to dst[0_0]
    vaddl.u8 q9, d4, d7
    vmla.u16 q10, q7, q11
    vst1.u32 d27, [r1], r3          @ Vector store to dst[1_0]
    vqrshrun.s16 d28, q6, #5
    vmls.u16 q10, q9, q12
    vld1.32 d12, [r7], r2           @Load value for interpolation (row2)
    vld1.32 d13, [r7], r2           @Load value for interpolation (row3)
    vqrshrun.s16 d29, q10, #5
    subs r9, r5, #4                 @ NOTE(review): r9 appears unused and flags are overwritten by the subs below — TODO confirm before removing
    vrhadd.u8 q14, q6, q14
    vst1.u32 d28, [r1], r3          @store row 2
    vst1.u32 d29, [r1], r3          @store row 3

    subs r5, r5, #4                 @ 4 rows processed, decrement by 4
    subne r0, r0 , r2, lsl #2       @ rewind src by 5 rows for overlap of the 6-tap window
    subne r0, r0, r2
    beq end_func                    @ Branch if height==4
    b loop_8                        @looping if height == 8 or 16

loop_4:
@// Processing row0 and row1

    vld1.u32 d0[0], [r0], r2        @ Vector load from src[0_0]
    vld1.u32 d1[0], [r0], r2        @ Vector load from src[1_0]
    vld1.u32 d2[0], [r0], r2        @ Vector load from src[2_0]
    vld1.u32 d3[0], [r0], r2        @ Vector load from src[3_0]
    vld1.u32 d4[0], [r0], r2        @ Vector load from src[4_0]
    vld1.u32 d5[0], [r0], r2        @ Vector load from src[5_0]

    vaddl.u8 q3, d2, d3             @ temp1 = src[2_0] + src[3_0]
    vaddl.u8 q4, d0, d5             @ temp = src[0_0] + src[5_0]
    vaddl.u8 q5, d1, d4             @ temp2 = src[1_0] + src[4_0]
    vmla.u16 q4, q3, q11            @ temp += temp1 * 20
    vld1.u32 d6, [r0], r2           @ src[6_0]
    vaddl.u8 q7, d3, d4
    vaddl.u8 q8, d1, d6
    vaddl.u8 q9, d2, d5
    vmls.u16 q4, q5, q12            @ temp -= temp2 * 5
    vld1.u32 d7[0], [r0], r2        @ src[7_0]
    vmla.u16 q8, q7, q11
    vaddl.u8 q10, d4, d5
    vaddl.u8 q6, d2, d7
    vaddl.u8 q5, d3, d6
    vmls.u16 q8, q9, q12
    vqrshrun.s16 d26, q4, #5        @ dst[0_0] = CLIP_U8( (temp + 16) >> 5)
    vld1.u32 d8[0], [r7], r2        @Load value for interpolation - row 0
    vld1.u32 d9[0], [r7], r2        @Load value for interpolation - row 1
    vmla.u16 q6, q10, q11
    vld1.u32 d0[0], [r0], r2        @ src[8_0]
    vaddl.u8 q7, d5, d6
    vqrshrun.s16 d27, q8, #5
    vaddl.u8 q10, d3, d0
    vrhadd.u8 q13, q13, q4          @Interpolation step for qpel calculation
    vmls.u16 q6, q5, q12
    vst1.u32 d26[0], [r1], r3       @ Vector store to dst[0_0]
    vaddl.u8 q9, d4, d7
    vmla.u16 q10, q7, q11
    vst1.u32 d27[0], [r1], r3       @ store row 1
    vqrshrun.s16 d28, q6, #5
    vld1.u32 d12[0], [r7], r2       @Load value for interpolation - row 2
    vld1.u32 d13[0], [r7], r2       @Load value for interpolation - row 3

    vmls.u16 q10, q9, q12
    vqrshrun.s16 d29, q10, #5
    vrhadd.u8 q14, q6, q14          @Interpolation step for qpel calculation
    vst1.u32 d28[0], [r1], r3       @store row 2
    vst1.u32 d29[0], [r1], r3       @store row 3

    subs r5, r5, #8                 @ loops exactly once more when ht==8; falls through when ht==4
    subeq r0, r0, r2, lsl #2
    subeq r0, r0, r2
    beq loop_4                      @ Loop if height==8

end_func:
    vldmia sp!, {d8-d15}            @ Restore neon registers that were saved
    ldmfd sp!, {r4-r12, pc}         @Restoring registers from stack


diff --git a/common/arm/ih264_intra_pred_chroma_a9q.s b/common/arm/ih264_intra_pred_chroma_a9q.s
new file mode 100755
index 0000000..d03fc55
--- /dev/null
+++ b/common/arm/ih264_intra_pred_chroma_a9q.s
@@ -0,0 +1,551 @@
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@/**
@******************************************************************************
@* @file
@* ih264_intra_pred_chroma_a9q.s
@*
@* @brief
@* Contains function definitions for intra chroma prediction .
+@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* +@* - ih264_intra_pred_chroma_mode_horz_a9q() +@* - ih264_intra_pred_chroma_8x8_mode_vert_a9q() +@* - ih264_intra_pred_chroma_mode_dc_a9q() +@* - ih264_intra_pred_chroma_mode_plane_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_chroma_intra_pred_filters.c +@ + +@/** +@/** +@/** +@ +.text +.p2align 2 + + .extern ih264_gai1_intrapred_chroma_plane_coeffs1 +.hidden ih264_gai1_intrapred_chroma_plane_coeffs1 + .extern ih264_gai1_intrapred_chroma_plane_coeffs2 +.hidden ih264_gai1_intrapred_chroma_plane_coeffs2 +scratch_chroma_intrapred_addr1: + .long ih264_gai1_intrapred_chroma_plane_coeffs1 - scrlblc1 - 8 + +scratch_intrapred_chroma_plane_addr1: + .long ih264_gai1_intrapred_chroma_plane_coeffs2 - scrlblc2 - 8 +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_chroma_8x8_mode_dc +@* +@* @brief +@* Perform Intra prediction for chroma_8x8 mode:DC +@* +@* @par Description: +@* Perform Intra prediction for chroma_8x8 mode:DC ,described in sec 8.3.4.1 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source containing alternate U and V samples +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination with alternate U and V samples +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@** @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => 
*pu1_dst
@ r2 => src_strd
@ r3 => dst_strd
@ r4 => ui_neighboravailability



    .global ih264_intra_pred_chroma_8x8_mode_dc_a9q

@ Chroma 8x8 DC prediction (interleaved UV). Depending on which neighbours are
@ available (bit 0 = left, bit 2 = top of ui_neighboravailability), the output
@ is built from averages of the left and/or top UV neighbours, or 128 when
@ neither is available. q4 holds rows 0-3, q5 rows 4-7 of the prediction.
@ NOTE(review): r0 appears to point at a neighbour buffer where the top row
@ starts at offset 18 (16 left samples + 2) — TODO confirm against the caller.
ih264_intra_pred_chroma_8x8_mode_dc_a9q:

    stmfd sp!, {r4, r14}            @store register values to stack
    ldr r4, [sp, #8]                @r4 => ui_neighboravailability
    vpush {d8-d15}

    ands r2, r4, #0x01              @CHECKING IF LEFT_AVAILABLE ELSE BRANCHING TO ONLY TOP AVAILABLE
    beq top_available
    ands r2, r4, #0x04              @CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
    beq left_available

    vld1.u8 {q0}, [r0]              @BOTH LEFT AND TOP AVAILABLE
    add r0, r0, #18                 @ advance to the top-neighbour row
    vld1.u8 {q1}, [r0]
    @ Build partial sums of the four 4-sample UV groups (left halves / top halves)
    vaddl.u8 q2, d1, d2
    vaddl.u8 q3, d0, d3
    vmovl.u8 q1, d3
    vmovl.u8 q0, d0

    vadd.u16 d12, d4, d5
    vadd.u16 d13, d2, d3
    vadd.u16 d15, d6, d7
    vadd.u16 d14, d0, d1

    vpadd.u32 d12, d12, d15
    vpadd.u32 d14, d13, d14
    vqrshrun.s16 d12, q6, #3        @ rounded >>3: average of 8 samples (left+top quadrants)
    vqrshrun.s16 d14, q7, #2        @ rounded >>2: average of 4 samples (single-neighbour quadrants)
    @ Replicate the per-quadrant U/V DC pairs across each 8x4 quadrant
    vdup.u16 d8, d12[0]
    vdup.u16 d9, d14[0]
    vdup.u16 d10, d14[1]
    vdup.u16 d11, d12[1]
    b str_pred

top_available:                      @ONLY TOP AVAILABLE
    ands r2, r4, #0x04              @CHECKING TOP AVAILABILTY OR ELSE BRANCH TO NONE AVAILABLE
    beq none_available

    add r0, r0, #18
    vld1.u8 {q0}, [r0]
    vmovl.u8 q1, d0
    vmovl.u8 q2, d1
    vadd.u16 d0, d2, d3
    vadd.u16 d1, d4, d5
    vpaddl.u32 q0, q0
    vqrshrun.s16 d0, q0, #2         @ per-half averages of the top row
    vdup.u16 d8, d0[0]
    vdup.u16 d9, d0[2]
    vmov q5, q4                     @ all 8 rows identical when only top is used
    b str_pred

left_available:                     @ONLY LEFT AVAILABLE
    vld1.u8 {q0}, [r0]
    vmovl.u8 q1, d0
    vmovl.u8 q2, d1
    vadd.u16 d0, d2, d3
    vadd.u16 d1, d4, d5
    vpaddl.u32 q0, q0
    vqrshrun.s16 d0, q0, #2         @ per-half averages of the left column
    vdup.u16 q5, d0[0]              @ bottom 4 rows from lower-left average
    vdup.u16 q4, d0[2]              @ top 4 rows from upper-left average
    b str_pred

none_available:                     @NONE AVAILABLE
    vmov.u8 q4, #128                @ spec default when no neighbours exist
    vmov.u8 q5, #128

str_pred:
    @ Store 8 rows: q4 for rows 0-3, q5 for rows 4-7
    vst1.8 {q4}, [r1], r3
    vst1.8 {q4}, [r1], r3
    vst1.8 {q4}, [r1], r3
    vst1.8 {q4}, [r1], r3
    vst1.8 {q5}, [r1], r3
    vst1.8 {q5}, [r1], r3
    vst1.8 {q5}, [r1], r3
    vst1.8 {q5}, [r1], r3

    vpop {d8-d15}
    ldmfd sp!, {r4, pc}             @Restoring registers from stack


@/******************************************************************************


@/**
@*******************************************************************************
@*
@*ih264_intra_pred_chroma_8x8_mode_horz
@*
@* @brief
@* Perform Intra prediction for chroma_8x8 mode:Horizontal
@*
@* @par Description:
@* Perform Intra prediction for chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2
@*
@* @param[in] pu1_src
@* UWORD8 pointer to the source containing alternate U and V samples
@*
@* @param[out] pu1_dst
@* UWORD8 pointer to the destination with alternate U and V samples
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] ui_neighboravailability
@* availability of neighbouring pixels(Not used in this function)
@*
@* @returns
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*/
@void ih264_intra_pred_chroma_8x8_mode_horz(UWORD8 *pu1_src,
@                                           UWORD8 *pu1_dst,
@                                           WORD32 src_strd,
@                                           WORD32 dst_strd,
@                                           WORD32 ui_neighboravailability)
@**************Variables Vs Registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 => src_strd
@ r3 => dst_strd
@ r4 => ui_neighboravailability


    .global ih264_intra_pred_chroma_8x8_mode_horz_a9q

@ Each output row replicates one left-neighbour UV pair across the row.
@ The left neighbours are loaded once into q0 and consumed from the top
@ (d1[3], d1[2], ...) by rotating q0 left 4 bytes (2 UV pairs) per 2 rows.
ih264_intra_pred_chroma_8x8_mode_horz_a9q:

    stmfd sp!, {r14}                @store register values to stack

    vld1.u8 {q0}, [r0]              @ 8 left-neighbour UV pairs
    mov r2, #6                      @ loop counter: 6 more rows handled in pairs

    vdup.u16 q1, d1[3]              @ row 0: replicate topmost UV pair
    vdup.u16 q2, d1[2]              @ row 1
    vst1.8 {q1}, [r1], r3

loop_8x8_horz:
    vext.8 q0, q0, q0, #12          @ rotate 4 bytes so the next 2 UV pairs reach d1[3]/d1[2]
    vst1.8 {q2}, [r1], r3
    vdup.u16 q1, d1[3]
    subs r2, #2                     @ two rows per iteration
    vdup.u16 q2, d1[2]
    vst1.8 {q1}, [r1], r3
    bne loop_8x8_horz

    vext.8 q0, q0, q0, #12
    vst1.8 {q2}, [r1], r3           @ final row

    ldmfd sp!, {pc}                 @restoring registers from stack




@/**
@*******************************************************************************
@*
@*ih264_intra_pred_chroma_8x8_mode_vert
@*
@* @brief
@* Perform
Intra prediction for chroma_8x8 mode:vertical
@*
@* @par Description:
@*Perform Intra prediction for chroma_8x8 mode:vertical ,described in sec 8.3.4.3
@*
@* @param[in] pu1_src
@* UWORD8 pointer to the source containing alternate U and V samples
@*
@* @param[out] pu1_dst
@* UWORD8 pointer to the destination with alternate U and V samples
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] ui_neighboravailability
@* availability of neighbouring pixels(Not used in this function)
@*
@* @returns
@*
@* @remarks
@* None
@*
@*******************************************************************************
@void ih264_intra_pred_chroma_8x8_mode_vert(UWORD8 *pu1_src,
@                                           UWORD8 *pu1_dst,
@                                           WORD32 src_strd,
@                                           WORD32 dst_strd,
@                                           WORD32 ui_neighboravailability)

@**************Variables Vs Registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 => src_strd
@ r3 => dst_strd
@ r4 => ui_neighboravailability


    .global ih264_intra_pred_chroma_8x8_mode_vert_a9q

@ Vertical prediction: copy the 16-byte top-neighbour UV row (at offset 18
@ in the neighbour buffer — TODO confirm layout with caller) into all 8 rows.
ih264_intra_pred_chroma_8x8_mode_vert_a9q:

    stmfd sp!, {r4-r12, r14}        @store register values to stack

    add r0, r0, #18                 @ skip past the left-neighbour samples to the top row
    vld1.8 {q0}, [r0]

    @ Replicate the top row into all 8 destination rows
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3

    ldmfd sp!, {r4-r12, pc}         @Restoring registers from stack




@/******************************************************************************


@/**
@*******************************************************************************
@*
@*ih264_intra_pred_chroma_8x8_mode_plane
@*
@* @brief
@* Perform Intra prediction for chroma_8x8 mode:PLANE
@*
@* @par Description:
@* Perform Intra prediction for chroma_8x8 mode:PLANE ,described in sec 8.3.4.4
@*
@* @param[in] pu1_src
@* UWORD8 pointer to the source containing alternate U and V
samples
@*
@* @param[out] pu1_dst
@* UWORD8 pointer to the destination with alternate U and V samples
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] ui_neighboravailability
@* availability of neighbouring pixels
@*
@* @returns
@*
@* @remarks
@* None
@*
@*******************************************************************************/
@void ih264_intra_pred_chroma_8x8_mode_plane(UWORD8 *pu1_src,
@                                            UWORD8 *pu1_dst,
@                                            WORD32 src_strd,
@                                            WORD32 dst_strd,
@                                            WORD32 ui_neighboravailability)

@**************Variables Vs Registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 => src_strd
@ r3 => dst_strd
@ r4 => ui_neighboravailability

    .global ih264_intra_pred_chroma_8x8_mode_plane_a9q
@ Plane prediction: fits a plane a + b*(x-3) + c*(y-3) through the U and V
@ neighbours (sec 8.3.4.4). The H/V gradients are computed with the weighted
@ sums of neighbour differences (weights from the PC-relative literal tables),
@ then each row is evaluated and clipped via vqrshrun #5.
ih264_intra_pred_chroma_8x8_mode_plane_a9q:

    stmfd sp!, {r4-r10, r12, lr}
    vpush {d8-d15}

    @ Gather the left/top neighbour UV pairs needed for the gradient sums
    vld1.32 d0, [r0]
    add r10, r0, #10
    vld1.32 d1, [r10]
    add r10, r10, #6
    vrev64.16 d5, d0                @ reverse UV pairs so differences line up positionally
    vld1.32 d2, [r10]!
    add r10, r10, #2
    vrev64.16 d7, d2
    vld1.32 d3, [r10]
    sub r5, r3, #8                  @ NOTE(review): r5 appears unused below — TODO confirm
    ldr r12, scratch_chroma_intrapred_addr1
scrlblc1:
    add r12, r12, pc                @ PC-relative address of the plane weight table
    vsubl.u8 q5, d5, d1
    vld1.64 {q4}, [r12]             @ Load multiplication factors 1 to 8 into D3
    vsubl.u8 q6, d3, d7
    vmul.s16 q7, q5, q4             @ weighted horizontal differences
    vmul.s16 q8, q6, q4             @ weighted vertical differences
    vuzp.16 q7, q8                  @ de-interleave U and V terms

    @ Horizontal reductions of the weighted sums (twice to collapse fully)
    vpadd.s16 d14, d14
    vpadd.s16 d15, d15
    vpadd.s16 d16, d16
    vpadd.s16 d17, d17
    vpadd.s16 d14, d14
    vpadd.s16 d15, d15
    vpadd.s16 d16, d16
    vpadd.s16 d17, d17

    mov r6, #34                     @ 34 = (17*2): scaling used to derive b and c
    vdup.16 q9, r6

    vmull.s16 q11, d14, d18
    vmull.s16 q12, d15, d18
    vmull.s16 q13, d16, d18
    vmull.s16 q14, d17, d18

    vrshrn.s32 d10, q11, #6         @ b (U)
    vrshrn.s32 d12, q12, #6         @ b (V)
    vrshrn.s32 d13, q13, #6         @ c (U)
    vrshrn.s32 d14, q14, #6         @ c (V)


    @ a = 16 * (left-bottom + top-right neighbour), per component
    ldrb r6, [r0], #1
    add r10, r0, #31
    ldrb r8, [r0], #1
    ldrb r7, [r10], #1
    ldrb r9, [r10], #1

    add r6, r6, r7
    add r8, r8, r9
    lsl r6, r6, #4
    lsl r8, r8, #4

    vdup.16 q0, r6
    vdup.16 q1, r8
    vdup.16 q2, d12[0]
    vdup.16 q3, d10[0]

    vdup.16 q12, d14[0]
    vdup.16 q13, d13[0]
    vzip.16 q2, q12                 @ interleave U/V gradient terms
    vzip.16 q3, q13
    vzip.16 q0, q1

    ldr r12, scratch_intrapred_chroma_plane_addr1
scrlblc2:
    add r12, r12, pc                @ PC-relative address of the (x-3) coefficient table
    vld1.64 {q4}, [r12]
    vmov.16 q5, q4
    vmov q11, q4
    vzip.16 q4, q5

    @ Per-column term: a + b*(x-3), split across q6 (left 4 cols) / q8 (right 4 cols)
    vmul.s16 q6, q2, q4
    vmul.s16 q8, q2, q5
    vadd.s16 q6, q0, q6
    vadd.s16 q8, q0, q8


    @ Rows 0..7: add c*(y-3) (factors d22[0..3], d23[0..3]) then clip and store
    vdup.16 q10, d22[0]
    vmul.s16 q2, q3, q10
    vdup.16 q15, d22[1]
    vmul.s16 q9, q3, q10
    vmul.s16 q7, q3, q15
    vmul.s16 q4, q3, q15
    vadd.s16 q12, q6, q2
    vadd.s16 q0, q8, q9
    vadd.s16 q1, q6, q7
    vqrshrun.s16 d28, q12, #5
    vadd.s16 q13, q8, q4
    vqrshrun.s16 d29, q0, #5
    vdup.16 q10, d22[2]
    vst1.8 {q14}, [r1], r3          @ row 0
    vqrshrun.s16 d28, q1, #5
    vqrshrun.s16 d29, q13, #5
    vmul.s16 q2, q3, q10
    vmul.s16 q9, q3, q10
    vst1.8 {q14}, [r1], r3          @ row 1
    vadd.s16 q12, q6, q2
    vadd.s16 q0, q8, q9
    vdup.16 q15, d22[3]
    vqrshrun.s16 d28, q12, #5
    vqrshrun.s16 d29, q0, #5
    vmul.s16 q7, q3, q15
    vmul.s16 q4, q3, q15
    vst1.8 {q14}, [r1], r3          @ row 2
    vadd.s16 q1, q6, q7
    vadd.s16 q13, q8, q4
    vdup.16 q10, d23[0]
    vqrshrun.s16 d28, q1, #5
    vqrshrun.s16 d29, q13, #5
    vmul.s16 q2, q3, q10
    vmul.s16 q9, q3, q10
    vst1.8 {q14}, [r1], r3          @ row 3
    vadd.s16 q12, q6, q2
    vadd.s16 q0, q8, q9
    vdup.16 q15, d23[1]
    vqrshrun.s16 d28, q12, #5
    vqrshrun.s16 d29, q0, #5
    vmul.s16 q7, q3, q15
    vmul.s16 q4, q3, q15
    vst1.8 {q14}, [r1], r3          @ row 4
    vadd.s16 q1, q6, q7
    vadd.s16 q13, q8, q4
    vdup.16 q10, d23[2]
    vqrshrun.s16 d28, q1, #5
    vqrshrun.s16 d29, q13, #5
    vmul.s16 q2, q3, q10
    vmul.s16 q9, q3, q10
    vst1.8 {q14}, [r1], r3          @ row 5
    vadd.s16 q12, q6, q2
    vadd.s16 q0, q8, q9
    vdup.16 q15, d23[3]
    vqrshrun.s16 d28, q12, #5
    vqrshrun.s16 d29, q0, #5
    vmul.s16 q7, q3, q15
    vmul.s16 q4, q3, q15
    vst1.8 {q14}, [r1], r3          @ row 6
    vadd.s16 q1, q6, q7
    vadd.s16 q13, q8, q4
    vqrshrun.s16 d28, q1, #5
    vqrshrun.s16 d29, q13, #5
    vst1.8 {q14}, [r1], r3          @ row 7



end_func_plane:


    vpop {d8-d15}
    ldmfd sp!, {r4-r10, r12, pc}




diff --git a/common/arm/ih264_intra_pred_luma_16x16_a9q.s b/common/arm/ih264_intra_pred_luma_16x16_a9q.s
new file mode 100755
index 0000000..e38e203
--- /dev/null
+++ b/common/arm/ih264_intra_pred_luma_16x16_a9q.s
@@ -0,0 +1,520 @@
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
+@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_intra_pred_luma_16x16_a9q.s +@* +@* @brief +@* Contains function definitions for intra 16x16 Luma prediction . +@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* +@* - ih264_intra_pred_luma_16x16_mode_vert_a9q() +@* - ih264_intra_pred_luma_16x16_mode_horz_a9q() +@* - ih264_intra_pred_luma_16x16_mode_dc_a9q() +@* - ih264_intra_pred_luma_16x16_mode_plane_a9q() +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_intra_pred_filters.c +@ + +@/** +@/** +@/** +@ + +.text +.p2align 2 + + + .extern ih264_gai1_intrapred_luma_plane_coeffs +.hidden ih264_gai1_intrapred_luma_plane_coeffs +scratch_intrapred_addr1: + .long ih264_gai1_intrapred_luma_plane_coeffs - scrlbl1 - 8 +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_16x16_mode_vert_a9q +@* +@* @brief +@* Perform Intra prediction for luma_16x16 mode:vertical +@* +@* @par Description: +@* Perform Intra prediction for luma_16x16 mode:Vertical ,described in sec 8.3.3.1 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels(Not used in this function) +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@void ih264_intra_pred_luma_16x16_mode_vert(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, 
@                                            WORD32 ui_neighboravailability)

@**************Variables Vs Registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 => src_strd
@ r3 => dst_strd
@ r4 => ui_neighboravailability


    .global ih264_intra_pred_luma_16x16_mode_vert_a9q

@ Vertical prediction: copy the 16 top-neighbour samples (at offset 17 in the
@ neighbour buffer — 16 left samples + 1 corner, TODO confirm layout with
@ caller) into all 16 destination rows.
ih264_intra_pred_luma_16x16_mode_vert_a9q:

    stmfd sp!, {r4-r12, r14}        @store register values to stack

    add r0, r0, #17                 @ skip past the left neighbours to the top row
    vld1.8 {q0}, [r0]

    @ Replicate the top row into all 16 destination rows
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3
    vst1.8 {q0}, [r1], r3

    ldmfd sp!, {r4-r12, pc}         @Restoring registers from stack





@/******************************************************************************


@/**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_16x16_mode_horz_a9q
@*
@* @brief
@* Perform Intra prediction for luma_16x16 mode:horizontal
@*
@* @par Description:
@* Perform Intra prediction for luma_16x16 mode:horizontal ,described in sec 8.3.3.2
@*
@* @param[in] pu1_src
@* UWORD8 pointer to the source
@*
@* @param[out] pu1_dst
@* UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] ui_neighboravailability
@* availability of neighbouring pixels(Not used in this function)
@*
@* @returns
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*/
@void ih264_intra_pred_luma_16x16_mode_horz(UWORD8 *pu1_src,
@                                           UWORD8 *pu1_dst,
@                                           WORD32 src_strd,
@                                           WORD32 dst_strd,
@                                           WORD32 ui_neighboravailability)
@**************Variables Vs
Registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 => src_strd
@ r3 => dst_strd
@ r4 => ui_neighboravailability

    .global ih264_intra_pred_luma_16x16_mode_horz_a9q

@ Each output row replicates one left-neighbour sample across the row.
@ The 16 left neighbours are loaded once into q0 and consumed from the top
@ (d1[7], d1[6], ...) by rotating q0 left 2 bytes per 2 rows.
ih264_intra_pred_luma_16x16_mode_horz_a9q:

    stmfd sp!, {r14}                @store register values to stack

    vld1.u8 {q0}, [r0]              @ 16 left-neighbour samples
    mov r2, #14                     @ loop counter: 14 more rows handled in pairs

    vdup.u8 q1, d1[7]               @ row 0
    vdup.u8 q2, d1[6]               @ row 1
    vst1.8 {q1}, [r1], r3

loop_16x16_horz:
    vext.8 q0, q0, q0, #14          @ rotate 2 bytes so the next 2 samples reach d1[7]/d1[6]
    vst1.8 {q2}, [r1], r3
    vdup.u8 q1, d1[7]
    subs r2, #2                     @ two rows per iteration
    vdup.u8 q2, d1[6]
    vst1.8 {q1}, [r1], r3
    bne loop_16x16_horz

    vext.8 q0, q0, q0, #14
    vst1.8 {q2}, [r1], r3           @ final row

    ldmfd sp!, {pc}                 @Restoring registers from stack




@/******************************************************************************


@/**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_16x16_mode_dc_a9q
@*
@* @brief
@* Perform Intra prediction for luma_16x16 mode:DC
@*
@* @par Description:
@* Perform Intra prediction for luma_16x16 mode:DC ,described in sec 8.3.3.3
@*
@* @param[in] pu1_src
@* UWORD8 pointer to the source
@*
@* @param[out] pu1_dst
@* UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] ui_neighboravailability
@* availability of neighbouring pixels
@*
@* @returns
@*
@* @remarks
@* None
@*
@*******************************************************************************/
@void ih264_intra_pred_luma_16x16_mode_dc(UWORD8 *pu1_src,
@                                         UWORD8 *pu1_dst,
@                                         WORD32 src_strd,
@                                         WORD32 dst_strd,
@                                         WORD32 ui_neighboravailability)

@**************Variables Vs Registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 => src_strd
@ r3 => dst_strd
@ r4 => ui_neighboravailability

    .global ih264_intra_pred_luma_16x16_mode_dc_a9q

ih264_intra_pred_luma_16x16_mode_dc_a9q:

    stmfd sp!, {r4, r14}
@store register values to stack + ldr r4, [sp, #8] @r4 => ui_neighboravailability + + ands r2, r4, #0x01 @CHECKING IF LEFT_AVAILABLE ELSE BRANCHING TO ONLY TOP AVAILABLE + beq top_available + ands r2, r4, #0x04 @CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE + beq left_available + + vld1.u8 {q0}, [r0] @BOTH LEFT AND TOP AVAILABLE + add r0, r0, #17 + vpaddl.u8 q0, q0 + vld1.u8 {q1}, [r0] + vpaddl.u8 q1, q1 + vadd.u16 q0, q0, q1 + vadd.u16 d0, d0, d1 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + vqrshrun.s16 d0, q0, #5 + vdup.u8 q0, d0[0] + b str_pred + +top_available: @ONLY TOP AVAILABLE + ands r2, r4, #0x04 @CHECKING TOP AVAILABILTY OR ELSE BRANCH TO NONE AVAILABLE + beq none_available + + add r0, r0, #17 + vld1.u8 {q0}, [r0] + vpaddl.u8 q0, q0 + vadd.u16 d0, d0, d1 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + vqrshrun.s16 d0, q0, #4 + vdup.u8 q0, d0[0] + b str_pred + +left_available: @ONLY LEFT AVAILABLE + vld1.u8 {q0}, [r0] + vpaddl.u8 q0, q0 + vadd.u16 d0, d0, d1 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + vqrshrun.s16 d0, q0, #4 + vdup.u8 q0, d0[0] + b str_pred + +none_available: @NONE AVAILABLE + vmov.u8 q0, #128 + +str_pred: + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + vst1.8 {q0}, [r1], r3 + + ldmfd sp!, {r4, pc} @Restoring registers from stack + + + + + +@/****************************************************************************** + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_16x16_mode_plane_a9q +@* +@* @brief +@* Perform Intra prediction for luma_16x16 mode:PLANE +@* +@* @par Description: +@* Perform Intra prediction for luma_16x16 mode:PLANE ,described in sec 
8.3.3.4 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_16x16_mode_plane(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + .global ih264_intra_pred_luma_16x16_mode_plane_a9q +ih264_intra_pred_luma_16x16_mode_plane_a9q: + + stmfd sp!, {r4-r10, r12, lr} + + mov r2, r1 + add r1, r0, #17 + add r0, r0, #15 + + mov r8, #9 + sub r1, r1, #1 + mov r10, r1 @top_left + mov r4, #-1 + vld1.32 d2, [r1], r8 + ldr r7, scratch_intrapred_addr1 +scrlbl1: + add r7, r7, pc + + vld1.32 d0, [r1] + vrev64.8 d2, d2 + vld1.32 {q3}, [r7] + vsubl.u8 q0, d0, d2 + vmovl.u8 q8, d6 + vmul.s16 q0, q0, q8 + vmovl.u8 q9, d7 + + add r7, r0, r4, lsl #3 + sub r0, r7, r4, lsl #1 + rsb lr, r4, #0x0 + + vpadd.s16 d0, d0, d1 + + ldrb r8, [r7], r4 + ldrb r9, [r0], lr + + vpaddl.s16 d0, d0 + sub r12, r8, r9 + + ldrb r8, [r7], r4 + + vpaddl.s32 d0, d0 + ldrb r9, [r0], lr + sub r8, r8, r9 + vshl.s32 d2, d0, #2 + add r12, r12, r8, lsl #1 + + vadd.s32 d0, d0, d2 + ldrb r8, [r7], r4 + ldrb r9, [r0], lr + vrshr.s32 d0, d0, #6 @ i_b = D0[0] + sub r8, r8, r9 + ldrb r5, [r7], r4 + add r8, r8, r8, lsl #1 + + vdup.16 q2, d0[0] + add r12, r12, r8 + ldrb r9, [r0], lr + vmul.s16 q0, q2, q8 + sub r5, r5, r9 + vmul.s16 q1, q2, q9 + add r12, r12, r5, lsl #2 + + ldrb r8, [r7], r4 + ldrb r9, [r0], lr + sub r8, r8, r9 + ldrb r5, [r7], r4 + add r8, r8, r8, 
lsl #2 + ldrb r6, [r0], lr + add r12, r12, r8 + ldrb r8, [r7], r4 + ldrb r9, [r0], lr + + sub r5, r5, r6 + sub r8, r8, r9 + add r5, r5, r5, lsl #1 + rsb r8, r8, r8, lsl #3 + add r12, r12, r5, lsl #1 + ldrb r5, [r7], r4 + ldrb r6, [r10] @top_left + add r12, r12, r8 + sub r9, r5, r6 + ldrb r6, [r1, #7] + add r12, r12, r9, lsl #3 @ i_c = r12 + add r8, r5, r6 + + add r12, r12, r12, lsl #2 + lsl r8, r8, #4 @ i_a = r8 + + add r12, r12, #0x20 + lsr r12, r12, #6 + + vshl.s16 q14, q2, #3 + vdup.16 q3, r12 + + vdup.16 q15, r8 + vshl.s16 q13, q3, #3 + vsub.s16 q15, q15, q14 + vsub.s16 q15, q15, q13 + vadd.s16 q14, q15, q3 + + mov r0, #14 + vadd.s16 q13, q14, q0 + vadd.s16 q14, q14, q1 + vqrshrun.s16 d20, q13, #5 + vqrshrun.s16 d21, q14, #5 + +loop_16x16_plane: + + vadd.s16 q13, q13, q3 + vadd.s16 q14, q14, q3 + vqrshrun.s16 d22, q13, #5 + vst1.32 {q10}, [r2], r3 + vqrshrun.s16 d23, q14, #5 + + vadd.s16 q13, q13, q3 + subs r0, #2 + vadd.s16 q14, q14, q3 + vqrshrun.s16 d20, q13, #5 + vst1.32 {q11}, [r2], r3 + vqrshrun.s16 d21, q14, #5 + bne loop_16x16_plane + + vadd.s16 q13, q13, q3 + vadd.s16 q14, q14, q3 + vqrshrun.s16 d22, q13, #5 + vst1.32 {q10}, [r2], r3 + vqrshrun.s16 d23, q14, #5 + vst1.32 {q11}, [r2], r3 + + ldmfd sp!, {r4-r10, r12, pc} + + + diff --git a/common/arm/ih264_intra_pred_luma_4x4_a9q.s b/common/arm/ih264_intra_pred_luma_4x4_a9q.s new file mode 100755 index 0000000..cb386ea --- /dev/null +++ b/common/arm/ih264_intra_pred_luma_4x4_a9q.s @@ -0,0 +1,842 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. 
+@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_intra_pred_luma_4x4_a9q.s +@* +@* @brief +@* Contains function definitions for intra 4x4 Luma prediction . +@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* +@* -ih264_intra_pred_luma_4x4_mode_vert_a9q +@* -ih264_intra_pred_luma_4x4_mode_horz_a9q +@* -ih264_intra_pred_luma_4x4_mode_dc_a9q +@* -ih264_intra_pred_luma_4x4_mode_diag_dl_a9q +@* -ih264_intra_pred_luma_4x4_mode_diag_dr_a9q +@* -ih264_intra_pred_luma_4x4_mode_vert_r_a9q +@* -ih264_intra_pred_luma_4x4_mode_horz_d_a9q +@* -ih264_intra_pred_luma_4x4_mode_vert_l_a9q +@* -ih264_intra_pred_luma_4x4_mode_horz_u_a9q +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_intra_pred_filters.c +@ + +@/** +@/** +@/** +@ + +.text +.p2align 2 + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_4x4_mode_vert +@* +@* @brief +@* Perform Intra prediction for luma_4x4 mode:vertical +@* +@* @par Description: +@* Perform Intra prediction for luma_4x4 mode:vertical ,described in sec 8.3.1.2.1 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* 
@param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels(Not used in this function) +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@void ih264_intra_pred_luma_4x4_mode_vert(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + .global ih264_intra_pred_luma_4x4_mode_vert_a9q + +ih264_intra_pred_luma_4x4_mode_vert_a9q: + + + + stmfd sp!, {r4-r12, r14} @store register values to stack + + add r0, r0, #5 + + vld1.32 d0[0], [r0] + + vst1.32 d0[0], [r1], r3 + vst1.32 d0[0], [r1], r3 + vst1.32 d0[0], [r1], r3 + vst1.32 d0[0], [r1], r3 + + + + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + +@/****************************************************************************** + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_4x4_mode_horz +@* +@* @brief +@* Perform Intra prediction for luma_4x4 mode:horizontal +@* +@* @par Description: +@* Perform Intra prediction for luma_4x4 mode:horizontal ,described in sec 8.3.1.2.2 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels(Not used in this function) +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@void 
ih264_intra_pred_luma_4x4_mode_horz(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + + .global ih264_intra_pred_luma_4x4_mode_horz_a9q + +ih264_intra_pred_luma_4x4_mode_horz_a9q: + + + + stmfd sp!, {r4-r12, r14} @store register values to stack + add r0, r0, #3 + mov r2 , #-1 + + ldrb r5, [r0], r2 + vdup.u8 d0, r5 + ldrb r6, [r0], r2 + vst1.32 d0[0], [r1], r3 + vdup.u8 d1, r6 + ldrb r7, [r0], r2 + vst1.32 d1[0], [r1], r3 + vdup.u8 d2, r7 + ldrb r8, [r0], r2 + vst1.32 d2[0], [r1], r3 + vdup.u8 d3, r8 + vst1.32 d3[0], [r1], r3 + + + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + + + +@/****************************************************************************** + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_4x4_mode_dc +@* +@* @brief +@* Perform Intra prediction for luma_4x4 mode:DC +@* +@* @par Description: +@* Perform Intra prediction for luma_4x4 mode:DC ,described in sec 8.3.1.2.3 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_4x4_mode_dc(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => 
dst_strd +@ r4 => ui_neighboravailability + + + + .global ih264_intra_pred_luma_4x4_mode_dc_a9q + +ih264_intra_pred_luma_4x4_mode_dc_a9q: + + + + stmfd sp!, {r4-r12, r14} @store register values to stack + ldr r4, [sp, #40] @ r4 => ui_neighboravailability + + ands r5, r4, #0x01 + beq top_available @LEFT NOT AVAILABLE + + add r10, r0, #3 + mov r2, #-1 + ldrb r5, [r10], r2 + ldrb r6, [r10], r2 + ldrb r7, [r10], r2 + add r5, r5, r6 + ldrb r8, [r10], r2 + add r5, r5, r7 + ands r11, r4, #0x04 @ CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE + add r5, r5, r8 + beq left_available + add r10, r0, #5 + @ BOTH LEFT AND TOP AVAILABLE + ldrb r6, [r10], #1 + ldrb r7, [r10], #1 + add r5, r5, r6 + ldrb r8, [r10], #1 + add r5, r5, r7 + ldrb r9, [r10], #1 + add r5, r5, r8 + add r5, r5, r9 + add r5, r5, #4 + lsr r5, r5, #3 + vdup.u8 d0, r5 + vst1.32 d0[0], [r1], r3 + vst1.32 d0[0], [r1], r3 + vst1.32 d0[0], [r1], r3 + vst1.32 d0[0], [r1], r3 + b end_func + +top_available: @ ONLT TOP AVAILABLE + ands r11, r4, #0x04 @ CHECKING TOP AVAILABILTY OR ELSE BRANCH TO NONE AVAILABLE + beq none_available + + add r10, r0, #5 + ldrb r6, [r10], #1 + ldrb r7, [r10], #1 + ldrb r8, [r10], #1 + add r5, r6, r7 + ldrb r9, [r10], #1 + add r5, r5, r8 + add r5, r5, r9 + add r5, r5, #2 + lsr r5, r5, #2 + vdup.u8 d0, r5 + vst1.32 d0[0], [r1], r3 + vst1.32 d0[0], [r1], r3 + vst1.32 d0[0], [r1], r3 + vst1.32 d0[0], [r1], r3 + b end_func + +left_available: @ONLY LEFT AVAILABLE + add r5, r5, #2 + lsr r5, r5, #2 + vdup.u8 d0, r5 + vst1.32 d0[0], [r1], r3 + vst1.32 d0[0], [r1], r3 + vst1.32 d0[0], [r1], r3 + vst1.32 d0[0], [r1], r3 + b end_func + +none_available: @NONE AVAILABLE + mov r5, #128 + vdup.u8 d0, r5 + vst1.32 d0[0], [r1], r3 + vst1.32 d0[0], [r1], r3 + vst1.32 d0[0], [r1], r3 + vst1.32 d0[0], [r1], r3 + b end_func + + +end_func: + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + + + +@/** +@******************************************************************************* 
+@* +@*ih264_intra_pred_luma_4x4_mode_diag_dl +@* +@* @brief +@* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left +@* +@* @par Description: +@* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left ,described in sec 8.3.1.2.4 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_4x4_mode_diag_dl(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_4x4_mode_diag_dl_a9q + +ih264_intra_pred_luma_4x4_mode_diag_dl_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + + add r0, r0, #5 + sub r5, r3, #2 + add r6, r0, #7 + vld1.8 {d0}, [r0] + vext.8 d1, d0, d0, #1 + vext.8 d2, d0, d0, #2 + vld1.8 {d2[6]}, [r6] + vaddl.u8 q10, d0, d1 + vaddl.u8 q11, d1, d2 + vadd.u16 q12, q10, q11 + vqrshrun.s16 d3, q12, #2 + vst1.32 {d3[0]}, [r1], r3 + vext.8 d4, d3, d3, #1 + vst1.32 {d4[0]}, [r1], r3 + vst1.16 {d3[1]}, [r1]! + vst1.16 {d3[2]}, [r1], r5 + vst1.16 {d4[1]}, [r1]! 
+ vst1.16 {d4[2]}, [r1] + +end_func_diag_dl: + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + + + + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_4x4_mode_diag_dr +@* +@* @brief +@* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right +@* +@* @par Description: +@* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right ,described in sec 8.3.1.2.5 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_4x4_mode_diag_dr(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_4x4_mode_diag_dr_a9q + +ih264_intra_pred_luma_4x4_mode_diag_dr_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + + + vld1.u8 {d0}, [r0] + add r0, r0, #1 + vld1.u8 {d1}, [r0] + vext.8 d2, d1, d1, #1 + vaddl.u8 q10, d0, d1 + vaddl.u8 q11, d1, d2 + vadd.u16 q12, q10, q11 + vqrshrun.s16 d3, q12, #2 + + vext.8 d4, d3, d3, #1 + sub r5, r3, #2 + vst1.16 {d4[1]}, [r1]! + vst1.16 {d4[2]}, [r1], r5 + vst1.16 {d3[1]}, [r1]! 
+ vst1.16 {d3[2]}, [r1], r5 + vst1.32 {d4[0]}, [r1], r3 + vst1.32 {d3[0]}, [r1], r3 + +end_func_diag_dr: + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_4x4_mode_vert_r +@* +@* @brief +@* Perform Intra prediction for luma_4x4 mode:Vertical_Right +@* +@* @par Description: +@* Perform Intra prediction for luma_4x4 mode:Vertical_Right ,described in sec 8.3.1.2.6 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_4x4_mode_vert_r(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_4x4_mode_vert_r_a9q + +ih264_intra_pred_luma_4x4_mode_vert_r_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + + + vld1.u8 {d0}, [r0] + add r0, r0, #1 + vld1.u8 {d1}, [r0] + vext.8 d2, d1, d1, #1 + vaddl.u8 q10, d0, d1 + vaddl.u8 q11, d1, d2 + vadd.u16 q12, q10, q11 + vqrshrun.s16 d4, q10, #1 + vqrshrun.s16 d3, q12, #2 + sub r5, r3, #2 + vext.8 d5, d3, d3, #3 + vst1.32 {d4[1]}, [r1], r3 + vst1.32 {d5[0]}, [r1], r3 + sub r8, r3, #3 + vst1.u8 {d3[2]}, [r1]! + vst1.16 {d4[2]}, [r1]! + vst1.u8 {d4[6]}, [r1], r8 + vst1.u8 {d3[1]}, [r1]! + vst1.16 {d5[0]}, [r1]! 
+ vst1.u8 {d5[2]}, [r1] + + +end_func_vert_r: + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_4x4_mode_horz_d +@* +@* @brief +@* Perform Intra prediction for luma_4x4 mode:Horizontal_Down +@* +@* @par Description: +@* Perform Intra prediction for luma_4x4 mode:Horizontal_Down ,described in sec 8.3.1.2.7 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_4x4_mode_horz_d(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_4x4_mode_horz_d_a9q + +ih264_intra_pred_luma_4x4_mode_horz_d_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + + vld1.u8 {d0}, [r0] + add r0, r0, #1 + vld1.u8 {d1}, [r0] + vext.8 d2, d1, d0, #1 + vaddl.u8 q10, d0, d1 + vaddl.u8 q11, d1, d2 + vadd.u16 q12, q10, q11 + vqrshrun.s16 d4, q10, #1 + vqrshrun.s16 d5, q12, #2 + sub r5, r3, #2 + vmov.8 d6, d5 + vtrn.8 d4, d5 @ + vst1.u16 {d5[1]}, [r1]! + vst1.16 {d6[2]}, [r1], r5 + vst1.u16 {d4[1]}, [r1]! + vst1.16 {d5[1]}, [r1], r5 + vst1.u16 {d5[0]}, [r1]! + vst1.16 {d4[1]}, [r1], r5 + vst1.u16 {d4[0]}, [r1]! 
+ vst1.16 {d5[0]}, [r1], r5 + +end_func_horz_d: + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_4x4_mode_vert_l +@* +@* @brief +@* Perform Intra prediction for luma_4x4 mode:Vertical_Left +@* +@* @par Description: +@* Perform Intra prediction for luma_4x4 mode:Vertical_Left ,described in sec 8.3.1.2.8 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_4x4_mode_vert_l(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_4x4_mode_vert_l_a9q + +ih264_intra_pred_luma_4x4_mode_vert_l_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + add r0, r0, #4 + vld1.u8 {d0}, [r0] + add r0, r0, #1 + vld1.u8 {d1}, [r0] + vext.8 d2, d1, d0, #1 + vaddl.u8 q10, d0, d1 + vaddl.u8 q11, d1, d2 + vadd.u16 q12, q10, q11 + vqrshrun.s16 d4, q10, #1 + vqrshrun.s16 d5, q12, #2 + vext.8 d6, d4, d4, #1 + vext.8 d7, d5, d5, #1 + vst1.32 {d6[0]}, [r1], r3 + vext.8 d16, d4, d4, #2 + vext.8 d17, d5, d5, #2 + vst1.32 {d7[0]}, [r1], r3 + vst1.32 {d16[0]}, [r1], r3 + vst1.32 {d17[0]}, [r1], r3 + + + +end_func_vert_l: + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + + + +@/** 
+@******************************************************************************* +@* +@*ih264_intra_pred_luma_4x4_mode_horz_u +@* +@* @brief +@* Perform Intra prediction for luma_4x4 mode:Horizontal_Up +@* +@* @par Description: +@* Perform Intra prediction for luma_4x4 mode:Horizontal_Up ,described in sec 8.3.1.2.9 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_4x4_mode_horz_u(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_4x4_mode_horz_u_a9q + +ih264_intra_pred_luma_4x4_mode_horz_u_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + mov r10, r0 + vld1.u8 {d0}, [r0] + ldrb r9, [r0], #1 + vext.8 d1, d0, d0, #1 + vld1.u8 {d0[7]}, [r10] + vext.8 d2, d1, d1, #1 + vaddl.u8 q10, d0, d1 + vaddl.u8 q11, d1, d2 + vadd.u16 q12, q10, q11 + vqrshrun.s16 d4, q10, #1 + vqrshrun.s16 d5, q12, #2 + vmov d6, d4 + vext.8 d6, d5, d4, #1 + vst1.8 {d4[2]}, [r1]! + vst1.8 {d6[0]}, [r1]! + vtrn.8 d6, d5 @ + sub r5, r3, #2 + vtrn.8 d4, d6 @ + vdup.8 d7, r9 + vst1.16 {d6[0]}, [r1], r5 + vst1.16 {d6[0]}, [r1]! + vst1.16 {d5[3]}, [r1], r5 + vst1.16 {d5[3]}, [r1]! 
+ vst1.16 {d7[3]}, [r1], r5 + vst1.32 {d7[0]}, [r1], r3 + +end_func_horz_u: + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + diff --git a/common/arm/ih264_intra_pred_luma_8x8_a9q.s b/common/arm/ih264_intra_pred_luma_8x8_a9q.s new file mode 100755 index 0000000..6da1c95 --- /dev/null +++ b/common/arm/ih264_intra_pred_luma_8x8_a9q.s @@ -0,0 +1,1037 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@****************************************************************************** +@* @file +@* ih264_intra_pred_luma_8x8_a9q.s +@* +@* @brief +@* Contains function definitions for intra 8x8 Luma prediction . 
+@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* +@* -ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q +@* -ih264_intra_pred_luma_8x8_mode_vert_a9q +@* -ih264_intra_pred_luma_8x8_mode_horz_a9q +@* -ih264_intra_pred_luma_8x8_mode_dc_a9q +@* -ih264_intra_pred_luma_8x8_mode_diag_dl_a9q +@* -ih264_intra_pred_luma_8x8_mode_diag_dr_a9q +@* -ih264_intra_pred_luma_8x8_mode_vert_r_a9q +@* -ih264_intra_pred_luma_8x8_mode_horz_d_a9q +@* -ih264_intra_pred_luma_8x8_mode_vert_l_a9q +@* -ih264_intra_pred_luma_8x8_mode_horz_u_a9q +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ + +@/* All the functions here are replicated from ih264_intra_pred_filters.c +@ + +@/** +@/** +@/** +@ + + +.text +.p2align 2 + + .extern ih264_gai1_intrapred_luma_8x8_horz_u +.hidden ih264_gai1_intrapred_luma_8x8_horz_u +scratch_intrapred_addr_8x8: + .long ih264_gai1_intrapred_luma_8x8_horz_u - scrlb8x8l2 - 8 + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_8x8_mode_ref_filtering +@* +@* @brief +@* Reference sample filtering process for Intra_8x8 sample prediction +@* +@* @par Description: +@* Perform Reference sample filtering process for Intra_8x8 sample prediction ,described in sec 8.3.2.2.1 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride [Not used] +@* +@* @param[in] dst_strd +@* integer destination stride[Not used] +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels[Not used] +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_8x8_mode_ref_filtering(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ 
r1 => *pu1_dst + + + .global ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q + +ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + vpush {d8-d15} + + + vld1.u8 {q0}, [r0]! @ + vld1.u8 {q1}, [r0] + add r0, r0, #8 @ + vext.8 q2, q0, q1, #1 + vext.8 q3, q1, q1, #1 + vext.8 q4, q2, q3, #1 + vext.8 q5, q3, q3, #1 + vld1.8 {d10[7]}, [r0] @ LOADING SRC[24] AGIN TO THE END FOR p'[ 15, -1 ] = ( p[ 14, -1 ] + 3 * p[ 15, -1 ] + 2 ) >> 2 + vaddl.u8 q10, d0, d4 + vaddl.u8 q7, d0, d0 @ SPECIAL CASE FOR p'[ -1 ,7 ] = ( p[ -1, 6 ] + 3 * p[ -1, 7 ] + 2 ) >> 2 + vadd.u16 q7, q10, q7 + vaddl.u8 q11, d1, d5 + vqrshrun.s16 d14, q7, #2 + vaddl.u8 q12, d4, d8 + vaddl.u8 q13, d5, d9 + vst1.8 {d14[0]}, [r1]! + vadd.u16 q12, q10, q12 + vadd.u16 q13, q11, q13 + vaddl.u8 q9, d2, d6 + vaddl.u8 q8, d6, d10 + vqrshrun.s16 d4, q12, #2 + vqrshrun.s16 d5, q13, #2 + vadd.u16 q6, q8, q9 + vst1.8 {q2}, [r1]! + vqrshrun.s16 d6, q6, #2 + vst1.8 {d6}, [r1] + + +end_func_ref_filt: + vpop {d8-d15} + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_8x8_mode_vert +@* +@* @brief +@* Perform Intra prediction for luma_8x8 mode:vertical +@* +@* @par Description: +@* Perform Intra prediction for luma_8x8 mode:vertical ,described in sec 8.3.2.2.2 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels(Not used in this function) +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@void ih264_intra_pred_luma_8x8_mode_vert(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ 
WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_8x8_mode_vert_a9q + +ih264_intra_pred_luma_8x8_mode_vert_a9q: + + stmfd sp!, {r4-r12, r14} @store register values to stack + + add r0, r0, #9 + vld1.8 d0, [r0] + + vst1.8 d0, [r1], r3 + vst1.8 d0, [r1], r3 + vst1.8 d0, [r1], r3 + vst1.8 d0, [r1], r3 + vst1.8 d0, [r1], r3 + vst1.8 d0, [r1], r3 + vst1.8 d0, [r1], r3 + vst1.8 d0, [r1], r3 + + ldmfd sp!, {r4-r12, pc} @Restoring registers from stack + + + + + +@/****************************************************************************** + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_8x8_mode_horz +@* +@* @brief +@* Perform Intra prediction for luma_8x8 mode:horizontal +@* +@* @par Description: +@* Perform Intra prediction for luma_8x8 mode:horizontal ,described in sec 8.3.2.2.2 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels(Not used in this function) +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@void ih264_intra_pred_luma_8x8_mode_horz(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_8x8_mode_horz_a9q + 
+ih264_intra_pred_luma_8x8_mode_horz_a9q: + + stmfd sp!, {r14} @store register values to stack + + vld1.u8 {d0}, [r0] + mov r2, #6 + + vdup.u8 d1, d0[7] + vdup.u8 d2, d0[6] + vst1.8 {d1}, [r1], r3 + +loop_8x8_horz: + vext.8 d0, d0, d0, #6 + vst1.8 {d2}, [r1], r3 + vdup.u8 d1, d0[7] + subs r2, #2 + vdup.u8 d2, d0[6] + vst1.8 {d1}, [r1], r3 + bne loop_8x8_horz + + vext.8 d0, d0, d0, #6 + vst1.8 {d2}, [r1], r3 + + ldmfd sp!, {pc} @restoring registers from stack + + + + + +@/****************************************************************************** + + +@/** +@******************************************************************************* +@* +@*ih264_intra_pred_luma_8x8_mode_dc +@* +@* @brief +@* Perform Intra prediction for luma_8x8 mode:DC +@* +@* @par Description: +@* Perform Intra prediction for luma_8x8 mode:DC ,described in sec 8.3.2.2.3 +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride +@* +@* @param[in] ui_neighboravailability +@* availability of neighbouring pixels +@* +@* @returns +@* +@* @remarks +@* None +@* +@*******************************************************************************/ +@void ih264_intra_pred_luma_8x8_mode_dc(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 ui_neighboravailability) + +@**************Variables Vs Registers***************************************** +@ r0 => *pu1_src +@ r1 => *pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ r4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_8x8_mode_dc_a9q + +ih264_intra_pred_luma_8x8_mode_dc_a9q: + + stmfd sp!, {r4, r14} @store register values to stack + ldr r4, [sp, #8] @r4 => ui_neighboravailability + + ands r2, r4, #0x01 @CHECKING IF LEFT_AVAILABLE ELSE BRANCHING TO ONLY TOP AVAILABLE + beq top_available + ands r2, r4, #0x04 @CHECKING IF 
TOP_AVAILABLE, else branch to the left-only path
    beq left_available

    vld1.u8 {d0}, [r0]          @ both left and top available: d0 = 8 left pixels
    add r0, r0, #9
    vld1.u8 {d1}, [r0]          @ d1 = 8 top pixels
    vpaddl.u8 q0, q0            @ pairwise-add bytes -> halfword partial sums
    vadd.u16 d0, d0, d1         @ combine left and top partial sums
    vpaddl.u16 d0, d0           @ continue the horizontal reduction
    vpaddl.u32 d0, d0           @ total sum of the 16 neighbours in d0
    vqrshrun.s16 d0, q0, #4     @ (sum + 8) >> 4: rounded average of 16 pixels
    vdup.u8 d0, d0[0]           @ broadcast DC value to all 8 lanes
    b str_pred

top_available:                  @ only top may be available
    ands r2, r4, #0x04          @ bit2: top available? else nothing available
    beq none_available

    add r0, r0, #9
    vld1.u8 {d0}, [r0]          @ d0 = 8 top pixels
    vpaddl.u8 d0, d0            @ horizontal reduction of the 8 pixels
    vpaddl.u16 d0, d0
    vpaddl.u32 d0, d0
    vqrshrun.s16 d0, q0, #3     @ (sum + 4) >> 3: rounded average of 8 pixels
    vdup.u8 d0, d0[0]
    b str_pred

left_available:                 @ only left available
    vld1.u8 {d0}, [r0]          @ d0 = 8 left pixels
    vpaddl.u8 d0, d0
    vpaddl.u16 d0, d0
    vpaddl.u32 d0, d0
    vqrshrun.s16 d0, q0, #3     @ (sum + 4) >> 3: rounded average of 8 pixels
    vdup.u8 d0, d0[0]
    b str_pred

none_available:                 @ no neighbours: spec-defined default of 128
    vmov.u8 q0, #128

str_pred:
    vst1.8 {d0}, [r1], r3       @ write the DC value to all 8 rows
    vst1.8 {d0}, [r1], r3
    vst1.8 {d0}, [r1], r3
    vst1.8 {d0}, [r1], r3
    vst1.8 {d0}, [r1], r3
    vst1.8 {d0}, [r1], r3
    vst1.8 {d0}, [r1], r3
    vst1.8 {d0}, [r1], r3

    ldmfd sp!, {r4, pc}         @ restore registers and return






@/**
@*******************************************************************************
@*
@* ih264_intra_pred_luma_8x8_mode_diag_dl
@*
@* @brief
@*  Perform Intra prediction for luma_8x8 mode: Diagonal_Down_Left
@*
@* @par Description:
@*  Perform Intra prediction for luma_8x8 mode: Diagonal_Down_Left,
@*  described in sec 8.3.2.2.4 of the H.264 spec.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source (neighbour samples)
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] ui_neighboravailability
@*  availability of neighbouring pixels
@*
@* @returns
@*
@* @remarks
@*  None
@*
@*******************************************************************************/
@void ih264_intra_pred_luma_8x8_mode_diag_dl(UWORD8 *pu1_src,
@                                            UWORD8 *pu1_dst,
@                                            WORD32 src_strd,
@
WORD32 dst_strd,
@                                            WORD32 ui_neighboravailability)

@**************Variables Vs Registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 => src_strd
@ r3 => dst_strd
@ r4 => ui_neighboravailability

    .global ih264_intra_pred_luma_8x8_mode_diag_dl_a9q

@ Diagonal-down-left 8x8 prediction: build the [1 2 1]/4 filtered top row
@ (FILT121) once in q2, then emit each output row as a successive 1-byte
@ shift of that filtered sequence. Rows 4..7 are stored as two 32-bit
@ halves; r5 = dst_strd - 4 compensates for the post-increment of the
@ first 4-byte store.
ih264_intra_pred_luma_8x8_mode_diag_dl_a9q:

    stmfd sp!, {r4-r12, r14}    @ save callee registers to stack

    add r0, r0, #9              @ r0 -> top neighbours
    sub r5, r3, #4              @ r5 = dst_strd - 4 (second-half store step)
    add r6, r0, #15             @ r6 -> last top-right neighbour
    vld1.8 {q0}, [r0]           @ q0 = 16 top/top-right pixels
    vext.8 q2, q0, q0, #2
    vext.8 q1, q0, q0, #1
    vld1.8 {d5[6]}, [r6]        @ duplicate the last sample for the filter tail
    @ q1 = q0 shifted left by one byte
    @ q2 = q1 shifted left by one byte
    vaddl.u8 q10, d0, d2        @ accumulate terms for the FILT121 filter
    vaddl.u8 q11, d1, d3
    vaddl.u8 q12, d2, d4
    vaddl.u8 q13, d3, d5
    vadd.u16 q12, q10, q12      @ q12/q13 = p[i] + 2*p[i+1] + p[i+2]
    vadd.u16 q13, q11, q13

    vqrshrun.s16 d4, q12, #2    @ round and narrow: (sum + 2) >> 2
    vqrshrun.s16 d5, q13, #2
    @ q2 now holds all FILT121 values
    vst1.8 {d4}, [r1], r3       @ row 0
    vext.8 q9, q2, q2, #1       @ each row is the previous shifted by one
    vext.8 q8, q9, q9, #1
    vst1.8 {d18}, [r1], r3      @ row 1
    vext.8 q15, q8, q8, #1
    vst1.8 {d16}, [r1], r3      @ row 2
    vst1.8 {d30}, [r1], r3      @ row 3
    vst1.32 {d4[1]}, [r1]!      @ rows 4..7: low/high 4-byte halves straddle
    vst1.32 {d5[0]}, [r1], r5   @ the d-register boundary of the shifted data
    vst1.32 {d18[1]}, [r1]!
    vst1.32 {d19[0]}, [r1], r5
    vst1.32 {d16[1]}, [r1]!
    vst1.32 {d17[0]}, [r1], r5
    vst1.32 {d30[1]}, [r1]!
    vst1.32 {d31[0]}, [r1], r5


end_func_diag_dl:
    ldmfd sp!, {r4-r12, pc}     @ restore registers and return




@/**
@*******************************************************************************
@*
@* ih264_intra_pred_luma_8x8_mode_diag_dr
@*
@* @brief
@*  Perform Intra prediction for luma_8x8 mode: Diagonal_Down_Right
@*
@* @par Description:
@*  Perform Intra prediction for luma_8x8 mode: Diagonal_Down_Right,
@*  described in sec 8.3.2.2.5 of the H.264 spec.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source (neighbour samples)
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] ui_neighboravailability
@*  availability of neighbouring pixels
@*
@* @returns
@*
@* @remarks
@*  None
@*
@*******************************************************************************/
@void ih264_intra_pred_luma_8x8_mode_diag_dr(UWORD8 *pu1_src,
@                                            UWORD8 *pu1_dst,
@                                            WORD32 src_strd,
@                                            WORD32 dst_strd,
@                                            WORD32 ui_neighboravailability)

@**************Variables Vs Registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 => src_strd
@ r3 => dst_strd
@ r4 => ui_neighboravailability


    .global ih264_intra_pred_luma_8x8_mode_diag_dr_a9q

@ Diagonal-down-right 8x8 prediction: FILT121-filter the full neighbour run
@ (left + corner + top) into q2, then emit rows as successive RIGHT shifts
@ (vext #15) of the filtered sequence. Mixed 8-byte and split 4-byte stores
@ reproduce the diagonal layout; r5 = dst_strd - 4.
ih264_intra_pred_luma_8x8_mode_diag_dr_a9q:

    stmfd sp!, {r4-r12, r14}    @ save callee registers to stack


    vld1.u8 {q0}, [r0]          @ q0 = 16 neighbour pixels starting at pu1_src
    add r0, r0, #1
    vld1.u8 {q1}, [r0]          @ q1 = same run shifted by one
    vext.8 q2, q1, q1, #1
    @ q1 = q0 shifted left by one byte
    @ q2 = q1 shifted left by one byte
    vaddl.u8 q10, d0, d2        @ accumulate terms for the FILT121 filter
    vaddl.u8 q11, d1, d3
    vaddl.u8 q12, d2, d4
    vaddl.u8 q13, d3, d5
    vadd.u16 q12, q10, q12
    vadd.u16 q13, q11, q13
    vqrshrun.s16 d4, q12, #2    @ round and narrow: (sum + 2) >> 2
    vqrshrun.s16 d5, q13, #2
    @ q2 now holds all FILT121 values
    sub r5, r3, #4              @ r5 = dst_strd - 4 (second-half store step)
    vext.8 q9, q2, q2, #15      @ rotate right by one byte per row
    vst1.8 {d19}, [r1], r3      @ row 0
    vext.8 q8, q9, q9, #15
    vst1.8 {d17}, [r1], r3      @ row 1
    vext.8 q15, q8, q8, #15
    vst1.8 {d31}, [r1], r3      @ row 2
    vst1.32 {d4[1]}, [r1]!      @ remaining rows via split 4-byte stores
    vst1.32 {d5[0]}, [r1], r5
    vst1.32 {d18[1]}, [r1]!
    vst1.32 {d19[0]}, [r1], r5
    vst1.32 {d16[1]}, [r1]!
    vst1.32 {d17[0]}, [r1], r5
    vst1.32 {d30[1]}, [r1]!
    vst1.32 {d31[0]}, [r1], r5
    vst1.8 {d4}, [r1], r3       @ final row

end_func_diag_dr:
    ldmfd sp!, {r4-r12, pc}     @ restore registers and return




@/**
@*******************************************************************************
@*
@* ih264_intra_pred_luma_8x8_mode_vert_r
@*
@* @brief
@*  Perform Intra prediction for luma_8x8 mode: Vertical_Right
@*
@* @par Description:
@*  Perform Intra prediction for luma_8x8 mode: Vertical_Right,
@*  described in sec 8.3.2.2.6 of the H.264 spec.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source (neighbour samples)
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] ui_neighboravailability
@*  availability of neighbouring pixels
@*
@* @returns
@*
@* @remarks
@*  None
@*
@*******************************************************************************/
@void ih264_intra_pred_luma_8x8_mode_vert_r(UWORD8 *pu1_src,
@                                           UWORD8 *pu1_dst,
@                                           WORD32 src_strd,
@                                           WORD32 dst_strd,
@                                           WORD32 ui_neighboravailability)

@**************Variables Vs Registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 => src_strd
@ r3 => dst_strd
@ r4 => ui_neighboravailability


    .global ih264_intra_pred_luma_8x8_mode_vert_r_a9q

@ Vertical-right 8x8 prediction: builds both the 2-tap rounded average
@ (FILT11, q2) and the 3-tap FILT121 (q3) of the neighbour run, then
@ assembles each output row from lane-addressed pieces of shifted /
@ de-interleaved copies. The store pattern follows sec 8.3.2.2.6;
@ r5 = dst_strd - 6 and r6 = dst_strd - 4 undo the partial-row
@ post-increments.
ih264_intra_pred_luma_8x8_mode_vert_r_a9q:

    stmfd sp!, {r4-r12, r14}    @ save callee registers to stack

    vld1.u8 {q0}, [r0]
    add r0, r0, #1
    vld1.u8 {q1}, [r0]
    vext.8 q2, q1, q1, #1
    @ q1 = q0 shifted left by one byte
    @ q2 = q1 shifted left by one byte
    vaddl.u8 q10, d0, d2
    vaddl.u8 q11, d1, d3
    vaddl.u8 q12, d2, d4
    vaddl.u8 q13, d3, d5
    vadd.u16 q12, q10, q12
    vadd.u16 q13, q11, q13

    vqrshrun.s16 d4, q10, #1    @ FILT11: (a + b + 1) >> 1
    vqrshrun.s16 d5, q11, #1
    vqrshrun.s16 d6, q12, #2    @ FILT121: (a + 2b + c + 2) >> 2
    vqrshrun.s16 d7, q13, #2
    @ q2 has all FILT11 values
    @ q3 has all FILT121 values
    sub r5, r3, #6              @ row-step after 6 bytes already stored
    sub r6, r3, #4              @ row-step after 4 bytes already stored
    vst1.8 {d5}, [r1], r3       @ row 0
    vext.8 q9, q3, q3, #15
    vmov.8 q11, q9
    vext.8 q8, q2, q2, #1
    vst1.8 {d19}, [r1], r3      @ row 1

    vmov.8 q15, q8
    vext.8 q10, q2, q2, #15
    vuzp.8 q8, q9               @ de-interleave to split even/odd filter taps
    @ row 2
    vext.8 q14, q8, q8, #1
    vst1.8 {d21}, [r1]
    vst1.8 {d6[6]}, [r1], r3
    @ row 3
    vst1.16 {d29[1]}, [r1]!
    vst1.32 {d7[0]}, [r1]!
    vst1.16 {d7[2]}, [r1], r5
    @ row 4
    vst1.16 {d19[1]}, [r1]!
    vst1.32 {d5[0]}, [r1]!
    vst1.16 {d5[2]}, [r1], r5

    @ row 5
    vext.8 q13, q9, q9, #1
    vst1.16 {d17[1]}, [r1]!
    vst1.32 {d23[0]}, [r1]!
    vst1.16 {d23[2]}, [r1], r5


    @ row 6
    vst1.16 {d27[0]}, [r1]!
    vst1.8 {d27[2]}, [r1]!
    vst1.8 {d5[0]}, [r1]!
    vst1.32 {d31[0]}, [r1], r6
    @ row 7
    vst1.32 {d29[0]}, [r1]!
    vst1.32 {d7[0]}, [r1]!



end_func_vert_r:
    ldmfd sp!, {r4-r12, pc}     @ restore registers and return




@/**
@*******************************************************************************
@*
@* ih264_intra_pred_luma_8x8_mode_horz_d
@*
@* @brief
@*  Perform Intra prediction for luma_8x8 mode: Horizontal_Down
@*
@* @par Description:
@*  Perform Intra prediction for luma_8x8 mode: Horizontal_Down,
@*  described in sec 8.3.2.2.7 of the H.264 spec.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source (neighbour samples)
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] ui_neighboravailability
@*  availability of neighbouring pixels
@*
@* @returns
@*
@* @remarks
@*  None
@*
@*******************************************************************************/
@void ih264_intra_pred_luma_8x8_mode_horz_d(UWORD8 *pu1_src,
@                                           UWORD8 *pu1_dst,
@                                           WORD32 src_strd,
@                                           WORD32 dst_strd,
@                                           WORD32 ui_neighboravailability)

@**************Variables Vs Registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 =>
src_strd
@ r3 => dst_strd
@ r4 => ui_neighboravailability

    .global ih264_intra_pred_luma_8x8_mode_horz_d_a9q

@ Horizontal-down 8x8 prediction: computes FILT11 (q2) and FILT121 (q3)
@ over the neighbour run, interleaves them with vtrn so each output row can
@ be composed from lane-addressed pieces. Uses q4-q7, hence the vpush/vpop
@ of d8-d15 (callee-saved VFP/NEON registers per the AAPCS).
@ r5 = dst_strd - 4 and r6 = dst_strd - 6 undo partial-row post-increments.
ih264_intra_pred_luma_8x8_mode_horz_d_a9q:

    stmfd sp!, {r4-r12, r14}    @ save callee registers to stack
    vpush {d8-d15}              @ d8-d15 are callee-saved

    vld1.u8 {q0}, [r0]
    add r0, r0, #1
    vld1.u8 {q1}, [r0]
    vext.8 q2, q1, q1, #1
    @ q1 = q0 shifted left by one byte
    @ q2 = q1 shifted left by one byte
    vaddl.u8 q10, d0, d2
    vaddl.u8 q11, d1, d3
    vaddl.u8 q12, d2, d4
    vaddl.u8 q13, d3, d5
    vadd.u16 q12, q10, q12
    vadd.u16 q13, q11, q13

    vqrshrun.s16 d4, q10, #1    @ FILT11: (a + b + 1) >> 1
    vqrshrun.s16 d5, q11, #1
    vqrshrun.s16 d6, q12, #2    @ FILT121: (a + 2b + c + 2) >> 2
    vqrshrun.s16 d7, q13, #2
    @ q2 has all FILT11 values
    @ q3 has all FILT121 values
    vmov.8 q4, q2
    vmov.8 q5, q3
    sub r6, r3, #6              @ row-step after 6 bytes already stored
    vtrn.8 q4, q5               @ interleave FILT11/FILT121 bytes
    vmov.8 q6, q4
    vmov.8 q7, q5
    sub r5, r3, #4              @ row-step after 4 bytes already stored
    vtrn.16 q6, q7              @ interleave again at halfword granularity
    vext.8 q8, q3, q3, #14
    @ ROW 0
    vst1.8 {d17}, [r1]
    vst1.16 {d10[3]}, [r1], r3

    @ ROW 1
    vst1.32 {d14[1]}, [r1]!
    vst1.32 {d7[0]}, [r1], r5
    @ ROW 2
    vst1.16 {d10[2]}, [r1]!
    vst1.32 {d14[1]}, [r1]!
    vst1.16 {d7[0]}, [r1], r6
    @ ROW 3
    vst1.32 {d12[1]}, [r1]!
    vst1.32 {d14[1]}, [r1], r5
    @ ROW 4
    vst1.16 {d14[1]}, [r1]!
    vst1.32 {d12[1]}, [r1]!
    vst1.16 {d14[2]}, [r1], r6
    @ ROW 5
    vst1.32 {d14[0]}, [r1]!
    vst1.32 {d12[1]}, [r1], r5
    @ ROW 6
    vst1.16 {d10[0]}, [r1]!
    vst1.16 {d8[1]}, [r1]!
    vst1.16 {d14[1]}, [r1]!
    vst1.16 {d12[2]}, [r1], r6
    @ ROW 7
    vst1.32 {d12[0]}, [r1]!
    vst1.32 {d14[0]}, [r1], r5  @ second half of ROW 7

end_func_horz_d:
    vpop {d8-d15}               @ restore callee-saved NEON registers
    ldmfd sp!, {r4-r12, pc}     @ restore registers and return





@/**
@*******************************************************************************
@*
@* ih264_intra_pred_luma_8x8_mode_vert_l
@*
@* @brief
@*  Perform Intra prediction for luma_8x8 mode: Vertical_Left
@*
@* @par Description:
@*  Perform Intra prediction for luma_8x8 mode: Vertical_Left,
@*  described in sec 8.3.2.2.8 of the H.264 spec.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source (neighbour samples)
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] ui_neighboravailability
@*  availability of neighbouring pixels
@*
@* @returns
@*
@* @remarks
@*  None
@*
@*******************************************************************************/
@void ih264_intra_pred_luma_8x8_mode_vert_l(UWORD8 *pu1_src,
@                                           UWORD8 *pu1_dst,
@                                           WORD32 src_strd,
@                                           WORD32 dst_strd,
@                                           WORD32 ui_neighboravailability)

@**************Variables Vs Registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 => src_strd
@ r3 => dst_strd
@ r4 => ui_neighboravailability


    .global ih264_intra_pred_luma_8x8_mode_vert_l_a9q

@ Vertical-left 8x8 prediction: even rows come from successive 1-byte shifts
@ of the FILT11 sequence (q2), odd rows from shifts of FILT121 (q3).
@ Uses q4-q9 temporaries, hence vpush/vpop of d8-d15.
ih264_intra_pred_luma_8x8_mode_vert_l_a9q:

    stmfd sp!, {r4-r12, r14}    @ save callee registers to stack
                                @ (original comment said "Restoring" - it stores)
    vpush {d8-d15}              @ d8-d15 are callee-saved
    add r0, r0, #9              @ r0 -> top neighbours
    vld1.u8 {q0}, [r0]
    add r0, r0, #1
    vld1.u8 {q1}, [r0]
    vext.8 q2, q1, q1, #1
    vaddl.u8 q10, d0, d2
    vaddl.u8 q11, d1, d3
    vaddl.u8 q12, d2, d4
    vaddl.u8 q13, d3, d5
    vadd.u16 q12, q10, q12
    vadd.u16 q13, q11, q13

    vqrshrun.s16 d4, q10, #1    @ FILT11: (a + b + 1) >> 1
    vqrshrun.s16 d5, q11, #1
    vqrshrun.s16 d6, q12, #2    @ FILT121: (a + 2b + c + 2) >> 2
    vext.8 q4, q2, q2, #1
    vqrshrun.s16 d7, q13, #2
    @ q2 has all FILT11 values
    @ q3 has all FILT121 values

    vext.8 q5, q3, q3, #1
    @ ROW 0,1
    vst1.8 {d4}, [r1], r3
    vst1.8 {d6}, [r1], r3

    vext.8 q6, q4, q4, #1
    vext.8 q7, q5, q5, #1
    @ ROW 2,3
    vst1.8 {d8}, [r1], r3
    vst1.8 {d10}, [r1], r3

    vext.8 q8, q6, q6, #1
    vext.8 q9, q7, q7, #1
    @ ROW 4,5
    vst1.8 {d12}, [r1], r3
    vst1.8 {d14}, [r1], r3
    @ ROW 6,7
    vst1.8 {d16}, [r1], r3
    vst1.8 {d18}, [r1], r3

end_func_vert_l:
    vpop {d8-d15}               @ restore callee-saved NEON registers
    ldmfd sp!, {r4-r12, pc}     @ restore registers and return





@/**
@*******************************************************************************
@*
@* ih264_intra_pred_luma_8x8_mode_horz_u
@*
@* @brief
@*  Perform Intra prediction for luma_8x8 mode: Horizontal_Up
@*
@* @par Description:
@*  Perform Intra prediction for luma_8x8 mode: Horizontal_Up,
@*  described in sec 8.3.2.2.9 of the H.264 spec.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source (neighbour samples)
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] ui_neighboravailability
@*  availability of neighbouring pixels
@*
@* @returns
@*
@* @remarks
@*  None
@*
@*******************************************************************************/
@void ih264_intra_pred_luma_8x8_mode_horz_u(UWORD8 *pu1_src,
@                                           UWORD8 *pu1_dst,
@                                           WORD32 src_strd,
@                                           WORD32 dst_strd,
@                                           WORD32 ui_neighboravailability)

@**************Variables Vs Registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 => src_strd
@ r3 => dst_strd
@ r4 => ui_neighboravailability

    .global ih264_intra_pred_luma_8x8_mode_horz_u_a9q

@ Horizontal-up 8x8 prediction: computes FILT11/FILT121 over the left
@ neighbours, then uses a PC-relative shuffle table (vtbl) to reorder them
@ into the zig-zag pattern of sec 8.3.2.2.9, padding the tail with the
@ last pixel (vdup of d5[7]). Uses q5-q10, hence vpush/vpop of d8-d15.
ih264_intra_pred_luma_8x8_mode_horz_u_a9q:

    stmfd sp!, {r4-r12, r14}    @ save callee registers to stack
    vpush {d8-d15}              @ d8-d15 are callee-saved

    vld1.u8 {q0}, [r0]          @ q0 = 16 neighbour pixels
    vld1.u8 {d1[7]}, [r0]       @ replicate pu1_src[0] into the last lane
    vext.8 q1, q0, q0, #1
    vext.8 q2, q1, q1, #1
    @ load the vtbl shuffle-index table (PC-relative literal)
    ldr r12, scratch_intrapred_addr_8x8
scrlb8x8l2:
    add r12, r12, pc
    vaddl.u8 q10, d0, d2
    vaddl.u8 q11, d1, d3
    vaddl.u8 q12, d2, d4
    vaddl.u8 q13, d3, d5
    vadd.u16 q12, q10, q12
    vadd.u16 q13, q11, q13
    vld1.u8 {q5}, [r12]         @ q5 = shuffle indices for vtbl
    vqrshrun.s16 d4, q10, #1    @ FILT11: (a + b + 1) >> 1
    vqrshrun.s16 d5, q11, #1
    vqrshrun.s16 d6, q12, #2    @ FILT121: (a + 2b + c + 2) >> 2
    vqrshrun.s16 d7, q13, #2
    @ q2 has all FILT11 values
    @ q3 has all FILT121 values
    vtbl.u8 d12, {q2, q3}, d10  @ shuffle filtered pixels into horz-up order
    vdup.u8 q7, d5[7]           @ pad value: last filtered pixel
    vtbl.u8 d13, {q2, q3}, d11
    vext.8 q8, q6, q7, #2       @ successive rows: shift in two pad bytes
    vext.8 q9, q8, q7, #2
    vst1.8 {d12}, [r1], r3      @ row 0
    vext.8 q10, q9, q7, #2
    vst1.8 {d16}, [r1], r3      @ row 1
    vst1.8 {d18}, [r1], r3      @ row 2
    vst1.8 {d20}, [r1], r3      @ row 3
    vst1.8 {d13}, [r1], r3      @ row 4
    vst1.8 {d17}, [r1], r3      @ row 5
    vst1.8 {d19}, [r1], r3      @ row 6
    vst1.8 {d21}, [r1], r3      @ row 7


end_func_horz_u:
    vpop {d8-d15}               @ restore callee-saved NEON registers
    ldmfd sp!, {r4-r12, pc}     @ restore registers and return








diff --git a/common/arm/ih264_iquant_itrans_recon_a9.s b/common/arm/ih264_iquant_itrans_recon_a9.s
new file mode 100755
index 0000000..f71ca69
--- /dev/null
+++ b/common/arm/ih264_iquant_itrans_recon_a9.s
@@ -0,0 +1,871 @@
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt.
Ltd, Bangalore
@*/
@/**
@ *******************************************************************************
@ * @file
@ *  ih264_iquant_itrans_recon_a9.s
@ *
@ * @brief
@ *  Contains function definitions for single stage inverse transform
@ *
@ * @author
@ *  Mohit
@ *  Harinarayanaan
@ *
@ * @par List of Functions:
@ *  - ih264_iquant_itrans_recon_4x4_a9()
@ *  - ih264_iquant_itrans_recon_8x8_a9()
@ *  - ih264_iquant_itrans_recon_chroma_4x4_a9()
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************
@*/
@/**
@ *******************************************************************************
@ *
@ * @brief
@ *  This function performs inverse quant and inverse transform type Ci4 for a
@ *  4x4 block
@ *
@ * @par Description:
@ *  Performs inverse transform Ci4 and adds the residue to get the
@ *  reconstructed block
@ *
@ * @param[in] pi2_src
@ *  Input 4x4 coefficients
@ *
@ * @param[in] pu1_pred
@ *  Prediction 4x4 block
@ *
@ * @param[out] pu1_out
@ *  Output 4x4 block
@ *
@ * @param[in] u4_qp_div_6
@ *  QP
@ *
@ * @param[in] pu2_weigh_mat
@ *  Pointer to weight matrix
@ *
@ * @param[in] pred_strd,
@ *  Prediction stride
@ *
@ * @param[in] out_strd
@ *  Output Stride
@ *
@ * @param[in] pi2_tmp
@ *  temporary buffer of size 1*16
@ *
@ * @param[in] pu2_iscal_mat
@ *  Pointer to the inverse quantization matrix
@ *
@ * @returns  Void
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************
@ */
@void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src,
@                                   UWORD8 *pu1_pred,
@                                   UWORD8 *pu1_out,
@                                   WORD32 pred_strd,
@                                   WORD32 out_strd,
@                                   const UWORD16 *pu2_iscal_mat,
@                                   const UWORD16 *pu2_weigh_mat,
@                                   UWORD32 u4_qp_div_6,
@                                   WORD32 *pi4_tmp,
@                                   WORD32 iq_start_idx
@                                   WORD16 *pi2_dc_ld_addr)
@**************Variables Vs Registers*****************************************
@r0  => *pi2_src
@r1  => *pu1_pred
@r2  => *pu1_out
@r3  => pred_strd
@r4  => out_strd
@r5  => *pu2_iscal_mat
@r6  => *pu2_weigh_mat
@r7  => u4_qp_div_6
@r8  => iq_start_idx
@r10 => pi2_dc_ld_addr
.text
.p2align 2

    .global ih264_iquant_itrans_recon_4x4_a9

@ 4x4 luma inverse quant + inverse transform + reconstruction.
@ Dequant: c[i] = ((src[i] * iscal[i] * weigh[i]) << (qP/6) + 32) >> 4,
@ then the two-stage 4x4 integer IDCT (row pass, transpose, column pass),
@ >> 6 with rounding, add to prediction and saturate to 8 bit.
ih264_iquant_itrans_recon_4x4_a9:

@ VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4.
@ If the macro value changes, the load instruction must change accordingly.
@ Only one shift is done in the horizontal inverse because:
@ if u4_qp_div_6 is less than 4 the shift value would be negative (a right
@ shift, where rnd_factor applies); if greater than 4 it is a plain left
@ shift with rnd_factor 0.

    stmfd sp!, {r4-r12, r14}        @ stack stores the argument registers
    ldr r7, [sp, #52]               @ load u4_qp_div_6 (stack args after stmfd)
    ldr r4, [sp, #40]               @ load out_strd
    vdup.s32 q15, r7                @ broadcast u4_qp_div_6 into q15
    ldr r5, [sp, #44]               @ load *pu2_iscal_mat

    ldr r6, [sp, #48]               @ load *pu2_weigh_mat

    ldr r8, [sp, #60]               @ load iq_start_idx

    ldr r10, [sp, #64]              @ load alternate dc address (pi2_dc_ld_addr)

    vpush {d8-d15}                  @ d8-d15 are callee-saved
@======================= DEQUANT FROM HERE ===================================

    vld4.s16 {d20, d21, d22, d23}, [r5]     @ pu2_iscal_mat[i], i = 0..15
    vld4.s16 {d26, d27, d28, d29}, [r6]     @ pu2_weigh_mat[i], i = 0..15
    vmul.s16 q10, q10, q13          @ x[i] = scale[i] * dequant[i], i = 0..7
    vld4.s16 {d16, d17, d18, d19}, [r0]     @ pi2_src[i], i = 0..15

    vmul.s16 q11, q11, q14          @ x[i] = scale[i] * dequant[i], i = 8..15

    subs r8, r8, #1                 @ iq_start_idx == 1 (intra) sets Z flag
    ldreqsh r9, [r10]               @ intra only: load pi2_dc_ld_addr[0]

    vmull.s16 q0, d16, d20          @ p[i] = x[i] * coeff[i], i = 0..3
    vmull.s16 q1, d17, d21          @ p[i], i = 4..7
    vmull.s16 q2, d18, d22          @ p[i], i = 8..11
    vmull.s16 q3, d19, d23          @ p[i], i = 12..15

    vshl.s32 q0, q0, q15            @ q[i] = p[i] << (qP/6), i = 0..3
    vshl.s32 q1, q1, q15            @ i = 4..7
    vshl.s32 q2, q2, q15            @ i = 8..11
    vshl.s32 q3, q3, q15            @ i = 12..15

    vqrshrn.s32 d0, q0, #0x4        @ c[i] = (q[i] + 8) >> 4 rounded, i = 0..3
    vqrshrn.s32 d1, q1, #0x4        @ i = 4..7
    vqrshrn.s32 d2, q2, #0x4        @ i = 8..11
    vqrshrn.s32 d3, q3, #0x4        @ i = 12..15

    vmoveq.16 d0[0], r9             @ intra case: overwrite dequantized DC

@========= PROCESS IDCT FROM HERE =======
@ Stage 1 (rows):
    vld1.32 d30[0], [r1], r3        @ row I of pu1_pred
    vadd.s16 d4, d0, d2             @ x0 = q0 + q1

    vsub.s16 d5, d0, d2             @ x1 = q0 - q1

    vshr.s16 d8, d1, #1             @ q0 >> 1
    vshr.s16 d9, d3, #1             @ q1 >> 1

    vsub.s16 d6, d8, d3             @ x2 = (q0 >> 1) - q1
    vadd.s16 d7, d1, d9             @ x3 = q0 + (q1 >> 1)
    vld1.32 d30[1], [r1], r3        @ row II of pu1_pred

    vswp d6, d7                     @ swap x2 and x3

    vsub.s16 q6, q2, q3             @ x0-x3 and x1-x2 combined
    vadd.s16 q5, q2, q3             @ x0+x3 and x1+x2 combined

    vld1.32 d31[0], [r1], r3        @ row III of pu1_pred

    vswp d12, d13
@ Stage 2 (columns, after in-register transpose):
    vtrn.16 d10, d11
    vtrn.16 d12, d13
    vtrn.32 d10, d12
    vtrn.32 d11, d13
    vadd.s16 d14, d10, d12          @ x0 = q0 + q1

    vsub.s16 d15, d10, d12          @ x1 = q0 - q1

    vshr.s16 d18, d11, #1           @ q0 >> 1
    vshr.s16 d19, d13, #1           @ q1 >> 1

    vsub.s16 d16, d18, d13          @ x2 = (q0 >> 1) - q1
    vadd.s16 d17, d11, d19          @ x3 = q0 + (q1 >> 1)

    vld1.32 d31[1], [r1], r3        @ row IV of pu1_pred
    vswp d16, d17                   @ swap x2 and x3

    vsub.s16 q11, q7, q8            @ x0-x3 and x1-x2 combined
    vadd.s16 q10, q7, q8            @ x0+x3 and x1+x2 combined

    vswp d22, d23

    vrshr.s16 q10, q10, #6          @ final rounded >> 6 of the IDCT
    vrshr.s16 q11, q11, #6

    vaddw.u8 q10, q10, d30          @ add prediction rows
    vaddw.u8 q11, q11, d31

    vqmovun.s16 d0, q10             @ saturate to unsigned 8 bit
    vqmovun.s16 d1, q11

    vst1.32 d0[0], [r2], r4         @ store row I
    vst1.32 d0[1], [r2], r4         @ store row II
    vst1.32 d1[0], [r2], r4         @ store row III
    vst1.32 d1[1], [r2]             @ store row IV

    vpop {d8-d15}                   @ restore callee-saved NEON registers
    ldmfd sp!, {r4-r12, r15}        @ reload registers and return


@/**
@ *******************************************************************************
@ *
@ * @brief
@ *  This function performs inverse quant and inverse transform type Ci4 for a
@ *  4x4 chroma block
@ *
@ * @par Description:
@ *  Performs inverse transform Ci4 and adds the residue to get the
@ *  reconstructed block
@ *
@ * @param[in] pi2_src
@ *  Input 4x4 coefficients
@ *
@ * @param[in] pu1_pred
@ *  Prediction 4x4 block
@ *
@ * @param[out] pu1_out
@ *  Output 4x4 block
@ *
@ * @param[in] u4_qp_div_6
@ *  QP
@ *
@ * @param[in] pu2_weigh_mat
@ *  Pointer to weight matrix
@ *
@ * @param[in] pred_strd,
@ *  Prediction stride
@ *
@ * @param[in] out_strd
@ *  Output Stride
@ *
@ * @param[in] pi2_tmp
@ *  temporary buffer of size 1*16
@ *
@ * @param[in] pu2_iscal_mat
@ *  Pointer to the inverse quantization matrix
@ *
@ * @returns  Void
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************
@ */
@void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src,
@                                          UWORD8 *pu1_pred,
@                                          UWORD8 *pu1_out,
@                                          WORD32 pred_strd,
@                                          WORD32 out_strd,
@                                          const UWORD16 *pu2_iscal_mat,
@                                          const UWORD16 *pu2_weigh_mat,
@                                          UWORD32 u4_qp_div_6,
@                                          WORD32 *pi4_tmp
@                                          WORD16 *pi2_dc_src)
@**************Variables Vs Registers*****************************************
@r0 => *pi2_src
@r1 => *pu1_pred
@r2 => *pu1_out
@r3 => pred_strd
@r4 => out_strd
@r5 => *pu2_iscal_mat
@r6 => *pu2_weigh_mat
@r7 => u4_qp_div_6

    .global ih264_iquant_itrans_recon_chroma_4x4_a9
ih264_iquant_itrans_recon_chroma_4x4_a9:

@ VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4.
@ If the macro
value changes, the load instruction must change accordingly.
@ Only one shift is done in the horizontal inverse because:
@ if u4_qp_div_6 is less than 4 the shift value would be negative (a right
@ shift, where rnd_factor applies); if greater than 4 it is a plain left
@ shift with rnd_factor 0.
@
@ Chroma variant: the DC coefficient always comes from pi2_dc_src, and the
@ interleaved (Cb/Cr) output plane is updated through vld2/vbit so only
@ every other byte of pu1_out is modified.

    stmfd sp!, {r4-r12, r14}        @ stack stores the argument registers
    ldr r7, [sp, #52]               @ load u4_qp_div_6
    ldr r4, [sp, #40]               @ load out_strd
    vdup.s32 q15, r7                @ broadcast u4_qp_div_6 into q15
    ldr r5, [sp, #44]               @ load *pu2_iscal_mat
    ldr r6, [sp, #48]               @ load *pu2_weigh_mat
    ldr r8, [sp, #60]               @ load *pi2_dc_src

    vpush {d8-d15}                  @ d8-d15 are callee-saved
@======================= DEQUANT FROM HERE ===================================

    vld4.s16 {d20, d21, d22, d23}, [r5]     @ pu2_iscal_mat[i], i = 0..15
    vld4.s16 {d26, d27, d28, d29}, [r6]     @ pu2_weigh_mat[i], i = 0..15
    vmul.s16 q10, q10, q13          @ x[i] = scale[i] * dequant[i], i = 0..7
    vld4.s16 {d16, d17, d18, d19}, [r0]     @ pi2_src[i], i = 0..15

    vmul.s16 q11, q11, q14          @ x[i] = scale[i] * dequant[i], i = 8..15

    vmull.s16 q0, d16, d20          @ p[i] = x[i] * coeff[i], i = 0..3
    vmull.s16 q1, d17, d21          @ p[i], i = 4..7
    vmull.s16 q2, d18, d22          @ p[i], i = 8..11
    vmull.s16 q3, d19, d23          @ p[i], i = 12..15

    vshl.s32 q0, q0, q15            @ q[i] = p[i] << (qP/6), i = 0..3
    vshl.s32 q1, q1, q15            @ i = 4..7
    vshl.s32 q2, q2, q15            @ i = 8..11
    vshl.s32 q3, q3, q15            @ i = 12..15

    vqrshrn.s32 d0, q0, #0x4        @ c[i] = rounded (q[i] >> 4), i = 0..3
    vqrshrn.s32 d1, q1, #0x4        @ i = 4..7
    vqrshrn.s32 d2, q2, #0x4        @ i = 8..11
    vqrshrn.s32 d3, q3, #0x4        @ i = 12..15

    ldrsh r9, [r8]                  @ load pi2_dc_src[0] (signed halfword)
    vmov.16 d0[0], r9               @ restore DC value (always, for chroma)

@========= PROCESS IDCT FROM HERE =======
@ Stage 1 (rows):
    vld2.8 {d28, d29}, [r1], r3     @ row I of pu1_pred (de-interleave Cb/Cr)
    vadd.s16 d4, d0, d2             @ x0 = q0 + q1

    vsub.s16 d5, d0, d2             @ x1 = q0 - q1

    vshr.s16 d8, d1, #1             @ q0 >> 1
    vshr.s16 d9, d3, #1             @ q1 >> 1

    vsub.s16 d6, d8, d3             @ x2 = (q0 >> 1) - q1
    vadd.s16 d7, d1, d9             @ x3 = q0 + (q1 >> 1)
    vld2.8 {d29, d30}, [r1], r3     @ row II of pu1_pred

    vswp d6, d7                     @ swap x2 and x3

    vsub.s16 q6, q2, q3             @ x0-x3 and x1-x2 combined
    vtrn.32 d28, d29                @ d28 = rows I and II of pu1_pred
    vadd.s16 q5, q2, q3             @ x0+x3 and x1+x2 combined

    vld2.8 {d29, d30}, [r1], r3     @ row III of pu1_pred

    vswp d12, d13
@ Stage 2 (columns, after in-register transpose):
    vtrn.16 d10, d11
    vtrn.16 d12, d13
    vtrn.32 d10, d12
    vtrn.32 d11, d13
    vadd.s16 d14, d10, d12          @ x0 = q0 + q1

    vsub.s16 d15, d10, d12          @ x1 = q0 - q1

    vshr.s16 d18, d11, #1           @ q0 >> 1
    vshr.s16 d19, d13, #1           @ q1 >> 1

    vsub.s16 d16, d18, d13          @ x2 = (q0 >> 1) - q1
    vadd.s16 d17, d11, d19          @ x3 = q0 + (q1 >> 1)

    vld2.8 {d30, d31}, [r1], r3     @ row IV of pu1_pred
    vswp d16, d17                   @ swap x2 and x3

    vsub.s16 q11, q7, q8            @ x0-x3 and x1-x2 combined
    vtrn.32 d29, d30                @ d29 = rows III and IV of pu1_pred
    vadd.s16 q10, q7, q8            @ x0+x3 and x1+x2 combined

    vswp d22, d23

    vrshr.s16 q10, q10, #6          @ final rounded >> 6 of the IDCT
    vrshr.s16 q11, q11, #6

    vaddw.u8 q10, q10, d28          @ add prediction rows
    vaddw.u8 q11, q11, d29

    vld1.u8 d0, [r2], r4            @ load 4 interleaved output rows
    vld1.u8 d1, [r2], r4
    vld1.u8 d2, [r2], r4
    vld1.u8 d3, [r2], r4

    sub r2, r2, r4, lsl #2          @ rewind r2 to the first output row

    vqmovun.s16 d20, q10            @ saturate reconstructed pixels
    vqmovun.s16 d22, q11

    vmovl.u8 q10, d20               @ widen to 16 bit so vbit can merge
    vmovl.u8 q11, d22               @ only the target bytes

    vmov.u16 q14, #0x00ff           @ mask: copy LSB from quantized coeffs

    vbit.u8 q0, q10, q14            @ merge new bytes into interleaved rows
    vbit.u8 q1, q11, q14

    vst1.u8 d0, [r2], r4            @ store the 4 merged output rows
    vst1.u8 d1, [r2], r4
    vst1.u8 d2, [r2], r4
    vst1.u8 d3, [r2]

    vpop {d8-d15}                   @ restore callee-saved NEON registers
    ldmfd sp!, {r4-r12, r15}        @ reload registers and return


@/*
@ *******************************************************************************
@ *
@ * @brief
@ *  This function performs inverse quant and inverse transform type Ci4 for an
@ *  8x8 block
@ *
@ * @par Description:
@ *  Performs inverse transform Ci8 and adds the residue to get the
@ *  reconstructed block
@ *
@ * @param[in] pi2_src
@ *  Input 4x4 coefficients
@ *
@ * @param[in] pu1_pred
@ *  Prediction 4x4 block
@ *
@ * @param[out] pu1_out
@ *  Output 4x4 block
@ *
@ * @param[in] u4_qp_div_6
@ *  QP
@ *
@ * @param[in] pu2_weigh_mat
@ *  Pointer to weight matrix
@ *
@ * @param[in] pred_strd,
@ *  Prediction stride
@ *
@ * @param[in] out_strd
@ *  Output Stride
@ *
@ * @param[in] pi2_tmp
@ *  temporary buffer of size 1*64
@ *
@ * @param[in] pu2_iscal_mat
@ *  Pointer to the inverse quantization matrix
@ *
@ * @returns  Void
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************
@ */
@void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src,
@                                   UWORD8 *pu1_pred,
@                                   UWORD8 *pu1_out,
@                                   WORD32 pred_strd,
@                                   WORD32 out_strd,
@                                   const UWORD16 *pu2_iscal_mat,
@                                   const UWORD16 *pu2_weigh_mat,
@                                   UWORD32 u4_qp_div_6,
@                                   WORD32 *pi4_tmp,
@                                   WORD32 iq_start_idx)
@**************Variables Vs Registers*****************************************
@r0 => *pi2_src
@r1 => *pu1_pred
@r2 => *pu1_out
@r3 => pred_strd
@r4 => out_strd
@r5 => *pu2_iscal_mat
@r6 => *pu2_weigh_mat
@r7 => u4_qp_div_6


    .global ih264_iquant_itrans_recon_8x8_a9
ih264_iquant_itrans_recon_8x8_a9:

    stmfd sp!, {r4-r12, r14}        @ stack stores the argument registers
    ldr r7, [sp, #52]               @ load u4_qp_div_6
    ldr r4, [sp, #40]               @ load out_strd

    ldr r5, [sp, #44]               @ load *pu2_iscal_mat
    ldr r6, [sp, #48]               @ loads
*pu2_weigh_mat + vdup.s32 q15, r7 @Populate the u4_qp_div_6 in Q15 + vpush {d8-d15} + +idct_8x8_begin: + +@========= DEQUANT FROM HERE =========== + + vld1.32 {q13}, [r5]! @ Q13 = dequant values row 0 + vld1.32 {q10}, [r6]! @ Q10 = scaling factors row 0 + vld1.32 {q14}, [r5]! @ Q14 = dequant values row 1 + vmul.s16 q10, q10, q13 @ Q10 = x[i] = (scale[i] * dequant[i]) where i = 0..7 + vld1.32 {q11}, [r6]! @ Q11 = scaling factors row 1 + vld1.32 {q8}, [r0]! @ Q8 = Source row 0 + vmul.s16 q11, q11, q14 @ Q11 = x[i] = (scale[i] * dequant[i]) where i = 8..15 + vmull.s16 q0, d16, d20 @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3 + vld1.32 {q9}, [r0]! @ Q8 = Source row 1 + vmull.s16 q1, d17, d21 @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7 + vmull.s16 q2, d18, d22 @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11 + vld1.32 {q13}, [r6]! @ Scaling factors row 2 + vmull.s16 q3, d19, d23 @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15 + vld1.32 {q14}, [r6]! @ Scaling factors row 3 + vshl.s32 q0, q0, q15 @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3 + vld1.32 {q10}, [r5]! @ Q10 = Dequant values row 2 + vshl.s32 q1, q1, q15 @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7 + vld1.32 {q8}, [r0]! @ Source Row 2 + vshl.s32 q2, q2, q15 @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11 + vld1.32 {q11}, [r5]! @ Q11 = Dequant values row 3 + vshl.s32 q3, q3, q15 @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15 + vld1.32 {q9}, [r0]! @ Source Row 3 + vmul.s16 q10, q10, q13 @ Dequant row2*scale matrix row 2 + vmul.s16 q11, q11, q14 @ Dequant row 3*scale matrix row 3 + vld1.32 {q4}, [r6]! @ Scaling factors row 4 + vqrshrn.s32 d0, q0, #0x6 @ D0 = c[i] = ((q[i] + 32) >> 6) where i = 0..3 + vqrshrn.s32 d1, q1, #0x6 @ D1 = c[i] = ((q[i] + 32) >> 6) where i = 4..7 + vld1.32 {q5}, [r6]! 
@ Scaling factors row 5 + vqrshrn.s32 d2, q2, #0x6 @ D2 = c[i] = ((q[i] + 32) >> 6) where i = 8..11 + vqrshrn.s32 d3, q3, #0x6 @ D3 = c[i] = ((q[i] + 32) >> 6) where i = 12..15 + vld1.32 {q13}, [r5]! @ Q13 = Dequant values row 4 + vmull.s16 q2, d16, d20 @ p[i] = (x[i] * trns_coeff[i]) where i=16..19 + vmull.s16 q3, d17, d21 @ p[i] = (x[i] * trns_coeff[i]) where i=20..23 + vld1.32 {q12}, [r5]! @ Q12 = Dequant values row 5 + vmull.s16 q6, d18, d22 @ p[i] = (x[i] * trns_coeff[i]) where i=24..27 + vmull.s16 q7, d19, d23 @ p[i] = (x[i] * trns_coeff[i]) where i=28..31 + + vld1.32 {q14}, [r0]! @ Source row 4 + vmul.s16 q10, q4, q13 @ Dequant row4*scale matrix row 4 + vmul.s16 q11, q5, q12 @ Dequant row5*scale matrix row 5 + vld1.32 {q9}, [r0]! @ Source row 5 + vshl.s32 q2, q2, q15 @ + vshl.s32 q3, q3, q15 @ + vld1.32 {q13}, [r6]! @ Scaling factors row 6 + vshl.s32 q6, q6, q15 @ + vshl.s32 q7, q7, q15 @ + vmull.s16 q4, d28, d20 @ i = 32..35 + vqrshrn.s32 d4, q2, #0x6 @ D4 = c[i] = ((q[i] + 32) >> 6) where i = 16..19 + vqrshrn.s32 d5, q3, #0x6 @ D5 = c[i] = ((q[i] + 32) >> 6) where i = 20..23 + vmull.s16 q5, d29, d21 @ i =36..39 + vld1.32 {q10}, [r5]! @ Dequant values row 6 + vqrshrn.s32 d6, q6, #0x6 @ D6 = c[i] = ((q[i] + 32) >> 6) where i = 24..27 + vqrshrn.s32 d7, q7, #0x6 @ D7 = c[i] = ((q[i] + 32) >> 6) where i = 28..31 + vld1.32 {q14}, [r6]! @ Scaling factors row 7 + vmull.s16 q6, d18, d22 @ + vld1.32 {q8}, [r0]! @ Source row 6 + vmull.s16 q7, d19, d23 @ + vld1.32 {q11}, [r5]! @ Dequant values row 7 + vshl.s32 q4, q4, q15 @ + vld1.32 {q9}, [r0]! 
@ Source row 7 + vshl.s32 q5, q5, q15 @ + + vshl.s32 q6, q6, q15 @ + vshl.s32 q7, q7, q15 @ + vmul.s16 q10, q10, q13 @ Dequant*scaling row 6 + vmul.s16 q11, q11, q14 @ Dequant*scaling row 7 + vqrshrn.s32 d8, q4, #0x6 @ D8 = c[i] = ((q[i] + 32) >> 6) where i = 32..35 + vqrshrn.s32 d9, q5, #0x6 @ D9 = c[i] = ((q[i] + 32) >> 6) where i = 36..39 + vqrshrn.s32 d10, q6, #0x6 @ D10 = c[i] = ((q[i] + 32) >> 6) where i = 40..43 + vqrshrn.s32 d11, q7, #0x6 @ D11 = c[i] = ((q[i] + 32) >> 6) where i = 44..47 + vmull.s16 q6, d16, d20 @ i= 48..51 + vmull.s16 q7, d17, d21 @ i= 52..55 + vmull.s16 q8, d18, d22 @ i=56..59 + vmull.s16 q9, d19, d23 @ i=60..63 + vshl.s32 q6, q6, q15 @ + vzip.s16 q0, q1 @Transpose + vshl.s32 q7, q7, q15 @ + vshl.s32 q8, q8, q15 @ + vzip.s16 q2, q3 @ + vshl.s32 q9, q9, q15 @ + vqrshrn.s32 d12, q6, #0x6 @ D12 = c[i] = ((q[i] + 32) >> 6) where i = 48..51 + vzip.s16 q4, q5 @Transpose + vqrshrn.s32 d13, q7, #0x6 @ D13 = c[i] = ((q[i] + 32) >> 6) where i = 52..55 + vqrshrn.s32 d14, q8, #0x6 @ D14 = c[i] = ((q[i] + 32) >> 6) where i = 56..59 + vzip.s32 q0, q2 @Transpose + vqrshrn.s32 d15, q9, #0x6 @ D15 = c[i] = ((q[i] + 32) >> 6) where i = 60..63 + +@========= PROCESS IDCT FROM HERE ======= + +@Steps for Stage 2: +@------------------ + +@ TRANSPOSE 8x8 coeffs to actual order + + vzip.s16 q6, q7 @ + + vzip.s32 q1, q3 @ + vzip.s32 q4, q6 @ + vzip.s32 q5, q7 @ + + vswp d1, d8 @ Q0/Q1 = Row order x0/x1 + vswp d3, d10 @ Q2/Q3 = Row order x2/x3 + vswp d5, d12 @ Q4/Q5 = Row order x4/x5 + vswp d7, d14 @ Q6/Q7 = Row order x6/x7 + + vswp q1, q4 @ + vshr.s16 q10, q2, #0x1 @ + vswp q3, q6 @ + +@Steps for Stage 1: +@------------------ + + vadd.s16 q8, q0, q4 @ Q8 = y0 + vsub.s16 q9, q0, q4 @ Q9 = y2 + + vsra.s16 q2, q6, #0x1 @ Q2 = y6 + vsub.s16 q6, q10, q6 @ Q6 = y4 + + vaddl.s16 q12, d14, d2 @ y3 (0-3) 1+7 + vaddl.s16 q13, d15, d3 @ y3 (4-7) 1+7 + + vsubl.s16 q10, d14, d2 @ y5 (0-3) 7-1 + vsubl.s16 q11, d15, d3 @ y5 (4-7) 7-1 + + vadd.s16 q0, q8, q2 @ Q0 = z0 + vsub.s16 
q4, q8, q2 @ Q4 = z6 + + vadd.s16 q8, q9, q6 @ Q8 = z2 + vsub.s16 q2, q9, q6 @ Q2 = z4 + + vsubw.s16 q12, q12, d6 @ y3 (0-3) 1+7-3 + vsubw.s16 q13, q13, d7 @ y3 (0-7) 1+7-3 + + vshr.s16 q6, q3, #0x1 @ + + vaddw.s16 q10, q10, d10 @ + vaddw.s16 q11, q11, d11 @ + + vshr.s16 q9, q5, #0x1 @ + + vsubw.s16 q12, q12, d12 @ + vsubw.s16 q13, q13, d13 @ + + vaddw.s16 q10, q10, d18 @ + vaddw.s16 q11, q11, d19 @ + + vqmovn.s32 d12, q12 @ + vaddl.s16 q12, d10, d6 @ + vqmovn.s32 d13, q13 @ Q6 = y3 + vaddl.s16 q13, d11, d7 @ + vqmovn.s32 d18, q10 @ + vsubl.s16 q10, d10, d6 @ + vqmovn.s32 d19, q11 @ Q9 = y5 + vsubl.s16 q11, d11, d7 @ + + vshr.s16 q3, q6, #0x2 @ + + vsra.s16 q6, q9, #0x2 @ Q6 = z3 + + vaddw.s16 q12, q12, d2 @ + vaddw.s16 q13, q13, d3 @ + + vshr.s16 q1, #0x1 @ + + vsub.s16 q5, q3, q9 @ Q5 = z5 + + vsubw.s16 q10, q10, d14 @ + vsubw.s16 q11, q11, d15 @ + + vshr.s16 q7, #0x1 @ + + vaddw.s16 q12, q12, d2 @ + vaddw.s16 q13, q13, d3 @ + + vsubw.s16 q10, q10, d14 @ + vsubw.s16 q11, q11, d15 @ + + + vqmovn.s32 d14, q12 @ + vadd.s16 q1, q8, q5 @ Q1 = x1 + vqmovn.s32 d15, q13 @ Q7 = y7 + vsub.s16 q3, q8, q5 @ Q3 = x6 + vqmovn.s32 d18, q10 @ + vsub.s16 q5, q2, q6 @ Q5 = x5 + vqmovn.s32 d19, q11 @ Q9 = y1 + vadd.s16 q2, q2, q6 @ Q2 = x2 + + vshr.s16 q12, q9, #0x2 @ + vsra.s16 q9, q7, #0x2 @ Q9 = z1 + + vsub.s16 q11, q7, q12 @ Q11 = z7 + + vadd.s16 q6, q4, q9 @ Q6 = x3 + vsub.s16 q4, q4, q9 @ Q4 = x4 + + vsub.s16 q7, q0, q11 @ Q7 = x7 + vadd.s16 q0, q0, q11 @ Q0 = x0 + + vswp.s16 q3, q6 @ Q3 = x3, Q6 = x6 + + +@Steps for Stage 2: +@------------------ + +@ TRANSPOSE 8x8 coeffs to actual order + + vzip.s16 q0, q1 @ + vzip.s16 q2, q3 @ + vzip.s16 q4, q5 @ + vzip.s16 q6, q7 @ + + vzip.s32 q0, q2 @ + vzip.s32 q1, q3 @ + vzip.s32 q4, q6 @ + vzip.s32 q5, q7 @ + + vswp d1, d8 @ Q0/Q1 = Row order x0/x1 + vswp d3, d10 @ Q2/Q3 = Row order x2/x3 + vswp d5, d12 @ Q4/Q5 = Row order x4/x5 + vswp d7, d14 @ Q6/Q7 = Row order x6/x7 + + vswp q1, q4 @ + vshr.s16 q10, q2, #0x1 @ + vswp q3, q6 @ + 
+@Steps for Stage 3: +@------------------ + +@Repeat stage 1 again for vertical transform + + vadd.s16 q8, q0, q4 @ Q8 = y0 + vld1.32 d28, [r1], r3 @ Q12 = 0x070605....0x070605.... + vsub.s16 q9, q0, q4 @ Q9 = y2 + + vsra.s16 q2, q6, #0x1 @ Q2 = y6 + vsub.s16 q6, q10, q6 @ Q6 = y4 + + vaddl.s16 q12, d14, d2 @ + vld1.32 d29, [r1], r3 @ Q12 = 0x070605....0x070605.... + vaddl.s16 q13, d15, d3 @ + + vsubl.s16 q10, d14, d2 @ + vld1.32 d30, [r1], r3 @ Q12 = 0x070605....0x070605.... + vsubl.s16 q11, d15, d3 @ + + vadd.s16 q0, q8, q2 @ Q0 = z0 + vld1.32 d31, [r1], r3 @ Q12 = 0x070605....0x070605.... + vsub.s16 q4, q8, q2 @ Q4 = z6 + + vadd.s16 q8, q9, q6 @ Q8 = z2 + vsub.s16 q2, q9, q6 @ Q2 = z4 + + vsubw.s16 q12, q12, d6 @ + vsubw.s16 q13, q13, d7 @ + + vshr.s16 q6, q3, #0x1 @ + + vaddw.s16 q10, q10, d10 @ + vaddw.s16 q11, q11, d11 @ + + vshr.s16 q9, q5, #0x1 @ + + vsubw.s16 q12, q12, d12 @ + vsubw.s16 q13, q13, d13 @ + + vaddw.s16 q10, q10, d18 @ + vaddw.s16 q11, q11, d19 @ + + vqmovn.s32 d12, q12 @ + vaddl.s16 q12, d10, d6 @ + vqmovn.s32 d13, q13 @ Q6 = y3 + vaddl.s16 q13, d11, d7 @ + vqmovn.s32 d18, q10 @ + vsubl.s16 q10, d10, d6 @ + vqmovn.s32 d19, q11 @ Q9 = y5 + vsubl.s16 q11, d11, d7 @ + + vshr.s16 q3, q6, #0x2 @ + + vsra.s16 q6, q9, #0x2 @ Q6 = z3 + + vaddw.s16 q12, q12, d2 @ + vaddw.s16 q13, q13, d3 @ + + vshr.s16 q1, #0x1 @ + + vsub.s16 q5, q3, q9 @ Q5 = z5 + + vsubw.s16 q10, q10, d14 @ + vsubw.s16 q11, q11, d15 @ + + vshr.s16 q7, #0x1 @ + + vaddw.s16 q12, q12, d2 @ + vaddw.s16 q13, q13, d3 @ + + vsubw.s16 q10, q10, d14 @ + vsubw.s16 q11, q11, d15 @ + + vqmovn.s32 d14, q12 @ + vadd.s16 q1, q8, q5 @ Q1 = x1 + vqmovn.s32 d15, q13 @ Q7 = y7 + vsub.s16 q3, q8, q5 @ Q3 = x6 + vqmovn.s32 d18, q10 @ + vsub.s16 q5, q2, q6 @ Q5 = x5 + vqmovn.s32 d19, q11 @ Q9 = y1 + vadd.s16 q2, q2, q6 @ Q2 = x2 + + vshr.s16 q12, q9, #0x2 @ + vsra.s16 q9, q7, #0x2 @ Q9 = z1 + + vsub.s16 q11, q7, q12 @ Q11 = z7 + + vadd.s16 q6, q4, q9 @ Q6 = x3 + vsub.s16 q4, q4, q9 @ Q4 = x4 + + vsub.s16 
q7, q0, q11 @ Q7 = x7 + vadd.s16 q0, q0, q11 @ Q0 = x0 + + vswp.s16 q3, q6 @ Q3 <-> Q6 + + vrshr.s16 q1, q1, #6 @ + vld1.32 d16, [r1], r3 @ Q12 = 0x070605....0x070605.... + vrshr.s16 q2, q2, #6 @ + vrshr.s16 q4, q4, #6 @ + vld1.32 d17, [r1], r3 @ Q12 = 0x070605....0x070605.... + vrshr.s16 q5, q5, #6 @ + vrshr.s16 q7, q7, #6 @ + vld1.32 d18, [r1], r3 @ Q12 = 0x070605....0x070605.... + vrshr.s16 q0, q0, #6 @ + vrshr.s16 q3, q3, #6 @ + vld1.32 d19, [r1], r3 @ Q12 = 0x070605....0x070605.... + vrshr.s16 q6, q6, #6 @ + +@ Code Added to pack sign and magnitudes + + vaddw.u8 q0, q0, d28 + vaddw.u8 q1, q1, d29 + vaddw.u8 q2, q2, d30 + vaddw.u8 q3, q3, d31 + vqmovun.s16 d0, q0 + vaddw.u8 q4, q4, d16 + vqmovun.s16 d1, q1 + vaddw.u8 q5, q5, d17 + vqmovun.s16 d2, q2 + vaddw.u8 q6, q6, d18 + vqmovun.s16 d3, q3 + vaddw.u8 q7, q7, d19 + + vqmovun.s16 d4, q4 + vst1.32 d0, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vqmovun.s16 d5, q5 + vst1.32 d1, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vqmovun.s16 d6, q6 + vst1.32 d2, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vqmovun.s16 d7, q7 + vst1.32 d3, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vst1.32 d4, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + + vst1.32 d5, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + + + vst1.32 d6, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + + + vst1.32 d7, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + +idct_8x8_end: + + vpop {d8-d15} + ldmfd sp!, {r4-r12, r15} + diff --git a/common/arm/ih264_iquant_itrans_recon_dc_a9.s b/common/arm/ih264_iquant_itrans_recon_dc_a9.s new file mode 100755 index 0000000..8d71bdb --- /dev/null +++ b/common/arm/ih264_iquant_itrans_recon_dc_a9.s @@ -0,0 +1,399 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. 
+@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@ ******************************************************************************* +@ * @file +@ * ih264_iquant_itrans_recon_dc_a9.s +@ * +@ * @brief +@ * Contains function definitions for single stage inverse transform +@ * +@ * @author +@ * Mohit +@ * +@ * @par List of Functions: +@ * - ih264_iquant_itrans_recon_4x4_dc_a9() +@ * - ih264_iquant_itrans_recon_8x8_dc_a9() +@ * - ih264_iquant_itrans_recon_chroma_4x4_dc_a9() +@ * +@ * @remarks +@ * None +@ * +@ ******************************************************************************* +@*/ +@/** +@ ******************************************************************************* +@ * +@ * @brief +@ * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block +@ * for dc input pattern only, i.e. only the (0,0) element of the input 4x4 block is +@ * non-zero. 
For complete function, refer ih264_iquant_itrans_recon_a9.s +@ * +@ * @par Description: +@ * Performs inverse transform Ci4 and adds the residue to get the +@ * reconstructed block +@ * +@ * @param[in] pi2_src +@ * Input 4x4 coefficients +@ * +@ * @param[in] pu1_pred +@ * Prediction 4x4 block +@ * +@ * @param[out] pu1_out +@ * Output 4x4 block +@ * +@ * @param[in] u4_qp_div_6 +@ * QP +@ * +@ * @param[in] pu2_weigh_mat +@ * Pointer to weight matrix +@ * +@ * @param[in] pred_strd, +@ * Prediction stride +@ * +@ * @param[in] out_strd +@ * Output Stride +@ * +@ *@param[in] pi2_tmp +@ * temporary buffer of size 1*16 +@ * +@ * @param[in] pu2_iscal_mat +@ * Pointer to the inverse quantization matrix +@ * +@ * @returns Void +@ * +@ * @remarks +@ * None +@ * +@ ******************************************************************************* +@ */ +@void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src, +@ UWORD8 *pu1_pred, +@ UWORD8 *pu1_out, +@ WORD32 pred_strd, +@ WORD32 out_strd, +@ const UWORD16 *pu2_iscal_mat, +@ const UWORD16 *pu2_weigh_mat, +@ UWORD32 u4_qp_div_6, +@ WORD32 *pi4_tmp, +@ WORD32 iq_start_idx +@ WORD16 *pi2_dc_ld_addr) +@**************Variables Vs Registers***************************************** +@r0 => *pi2_src +@r1 => *pu1_pred +@r2 => *pu1_out +@r3 => pred_strd +@r4 => out_strd +@r5 => *pu2_iscal_mat +@r6 => *pu2_weigh_mat +@r7 => u4_qp_div_6 +@r9 => iq_start_idx +@unused => pi2_dc_ld_addr + +.text +.p2align 2 + + .global ih264_iquant_itrans_recon_4x4_dc_a9 + +ih264_iquant_itrans_recon_4x4_dc_a9: + +@Only one shift is done in horizontal inverse because, +@if u4_qp_div_6 is lesser than 4 then shift value will be neagative and do negative left shift, in this case rnd_factor has value +@if u4_qp_div_6 is greater than 4 then shift value will be positive and do left shift, here rnd_factor is 0 + + stmfd sp!, {r4-r10, r14} @stack stores the values of the arguments + ldr r5, [sp, #36] @Loads *pu2_iscal_mat + ldr r6, [sp, #40] @Loads *pu2_weigh_mat + ldrsh 
r8, [r0] @load pi2_src[0], SH for signed halfword load + ldrh r6, [r6] @load pu2_weight_mat[0] , H for unsigned halfword load + ldrh r5, [r5] @load pu2_iscal_mat[0] , H for unsigned halfword load +@=======================DEQUANT FROM HERE=================================== + mul r6, r6, r5 @pu2_iscal_mat[0]*pu2_weigh_mat[0] + ldr r7, [sp, #44] @Loads u4_qp_div_6 + mul r6, r6, r8 @pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0] + ldr r4, [sp, #32] @Loads out_strd + ldr r9, [sp, #52] @Loads iq_start_idx + + lsl r6, r6, r7 @(pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0])<<u4_qp_div_6 + add r6, r6, #8 @(pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0])<<u4_qp_div_6 + rnd_fact + asr r6, r6, #4 @q0 = (pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0] + rnd_fact)<<(u4_qp_div_6-4) + + subs r9, r9, #1 @ if r8 == 1 => intra case , so result of subtraction is zero and Z flag is set + ldreqsh r10, [r0] @ Loads signed halfword pi2_src[0], if r9==1 + moveq r6, r10 @ Restore dc value in case of intra, i.e. r9 == 1 + + add r6, r6, #32 @i_macro = q0 + 32 + asr r6, r6, #6 @i_macro >>6 = DC output of 2-stage transform + vdup.s16 q0, r6 @copy transform output to Q0 + + vld1.32 d30[0], [r1], r3 @I row Load pu1_pred buffer + + vld1.32 d30[1], [r1], r3 @II row Load pu1_pred buffer + + vld1.32 d31[0], [r1], r3 @III row Load pu1_pred buf + + vld1.32 d31[1], [r1], r3 @IV row Load pu1_pred buffer + vaddw.u8 q10, q0, d30 + + vaddw.u8 q11, q0, d31 + + vqmovun.s16 d0, q10 + + vst1.32 d0[0], [r2], r4 @I row store the value + vqmovun.s16 d1, q11 + vst1.32 d0[1], [r2], r4 @II row store the value + vst1.32 d1[0], [r2], r4 @III row store the value + vst1.32 d1[1], [r2] @IV row store the value + + ldmfd sp!, {r4-r10, r15} @Reload the registers from SP + + + + +@/* +@ ******************************************************************************* +@ * +@ * @brief +@ * This function performs inverse quant and Inverse transform type Ci4 for 8*8 block +@ * for dc input pattern only, i.e. 
only the (0,0) element of the input 8x8 block is +@ * non-zero. For complete function, refer ih264_iquant_itrans_recon_a9.s +@ * +@ * @par Description: +@ * Performs inverse transform Ci8 and adds the residue to get the +@ * reconstructed block +@ * +@ * @param[in] pi2_src +@ * Input 4x4 coefficients +@ * +@ * @param[in] pu1_pred +@ * Prediction 4x4 block +@ * +@ * @param[out] pu1_out +@ * Output 4x4 block +@ * +@ * @param[in] u4_qp_div_6 +@ * QP +@ * +@ * @param[in] pu2_weigh_mat +@ * Pointer to weight matrix +@ * +@ * @param[in] pred_strd, +@ * Prediction stride +@ * +@ * @param[in] out_strd +@ * Output Stride +@ * +@ *@param[in] pi2_tmp +@ * temporary buffer of size 1*64 +@ * +@ * @param[in] pu2_iscal_mat +@ * Pointer to the inverse quantization matrix +@ * +@ * @returns Void +@ * +@ * @remarks +@ * None +@ * +@ ******************************************************************************* +@ */ +@void ih264_iquant_itrans_recon_8x8_dc(WORD16 *pi2_src, +@ UWORD8 *pu1_pred, +@ UWORD8 *pu1_out, +@ WORD32 pred_strd, +@ WORD32 out_strd, +@ const UWORD16 *pu2_iscal_mat, +@ const UWORD16 *pu2_weigh_mat, +@ UWORD32 u4_qp_div_6, +@ WORD32 *pi4_tmp, +@ WORD32 iq_start_idx) +@**************Variables Vs Registers***************************************** +@r0 => *pi2_src +@r1 => *pu1_pred +@r2 => *pu1_out +@r3 => pred_strd +@r4 => out_strd +@r5 => *pu2_iscal_mat +@r6 => *pu2_weigh_mat +@r7 => u4_qp_div_6 + + + .global ih264_iquant_itrans_recon_8x8_dc_a9 +ih264_iquant_itrans_recon_8x8_dc_a9: + + stmfd sp!, {r4-r8, r14} @stack stores the values of the arguments + ldr r5, [sp, #28] @Loads *pu2_iscal_mat + ldr r6, [sp, #32] @Loads *pu2_weigh_mat + ldrsh r8, [r0] @load pi2_src[0], SH for signed halfword load + ldrh r6, [r6] @load pu2_weight_mat[0] , H for unsigned halfword load + ldrh r5, [r5] @load pu2_iscal_mat[0] , H for unsigned halfword load +@=======================DEQUANT FROM HERE=================================== + mul r6, r6, r5 @pu2_iscal_mat[0]*pu2_weigh_mat[0] + 
ldr r7, [sp, #36] @Loads u4_qp_div_6 + mul r6, r6, r8 @pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0] + ldr r4, [sp, #24] @Loads out_strd + + vpush {d8-d15} + lsl r6, r6, r7 @(pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0])<<u4_qp_div_6 + add r6, r6, #32 @(pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0])<<u4_qp_div_6 + rnd_fact + asr r6, r6, #6 @q0 = (pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0] + rnd_fact)<<(u4_qp_div_6-4) + add r6, r6, #32 @i_macro = q0 + 32 + asr r6, r6, #6 @i_macro >>6 = DC output of 2-stage transform + vdup.s16 q8, r6 @copy transform output to Q0 + + vld1.32 d24, [r1], r3 @ Q12 = 0x070605....0x070605.... + + vld1.32 d25, [r1], r3 @ Q12 = 0x070605....0x070605.... + + vld1.32 d26, [r1], r3 @ Q12 = 0x070605....0x070605.... + vaddw.u8 q0, q8, d24 + vld1.32 d27, [r1], r3 @ Q12 = 0x070605....0x070605.... + vaddw.u8 q1, q8, d25 + vld1.32 d28, [r1], r3 @ Q12 = 0x070605....0x070605.... + vaddw.u8 q2, q8, d26 + vld1.32 d29, [r1], r3 @ Q12 = 0x070605....0x070605.... + vaddw.u8 q3, q8, d27 + vld1.32 d30, [r1], r3 @ Q12 = 0x070605....0x070605.... + vaddw.u8 q4, q8, d28 + vld1.32 d31, [r1], r3 @ Q12 = 0x070605....0x070605.... 
+ +@ Code Added to pack sign and magnitudes + + + vqmovun.s16 d0, q0 + vaddw.u8 q5, q8, d29 + vqmovun.s16 d1, q1 + vaddw.u8 q6, q8, d30 + vqmovun.s16 d2, q2 + vqmovun.s16 d3, q3 + vaddw.u8 q7, q8, d31 + vqmovun.s16 d4, q4 + vqmovun.s16 d5, q5 + vst1.32 d0, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vqmovun.s16 d6, q6 + vst1.32 d1, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vqmovun.s16 d7, q7 + vst1.32 d2, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vst1.32 d3, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vst1.32 d4, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vst1.32 d5, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vst1.32 d6, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + vst1.32 d7, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs + + vpop {d8-d15} + ldmfd sp!, {r4-r8, r15} + + +@ /* +@ ******************************************************************************** +@ * +@ * @brief This function reconstructs a 4x4 sub block from quantized resiude and +@ * prediction buffer if only dc value is present for residue +@ * +@ * @par Description: +@ * The quantized residue is first inverse quantized, +@ * This inverse quantized content is added to the prediction buffer to recon- +@ * struct the end output +@ * +@ * @param[in] pi2_src +@ * quantized dc coeffiient +@ * +@ * @param[in] pu1_pred +@ * prediction 4x4 block in interleaved format +@ * +@ * @param[in] pred_strd, +@ * Prediction buffer stride in interleaved format +@ * +@ * @param[in] out_strd +@ * recon buffer Stride +@ * +@ * @returns none +@ * +@ * @remarks none +@ * +@ ******************************************************************************* +@ */ +@ void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src, +@ UWORD8 *pu1_pred, +@ UWORD8 *pu1_out, +@ WORD32 pred_strd, +@ WORD32 out_strd, +@ const UWORD16 *pu2_iscal_mat, +@ const UWORD16 *pu2_weigh_mat, +@ UWORD32 u4_qp_div_6, +@ WORD16 *pi2_tmp, +@ WORD16 *pi2_dc_src) +@ Register Usage +@ r0 : pi2_src +@ r1 : pu1_pred +@ r2 : 
pu1_out +@ r3 : pred_strd +@ Neon registers d0-d7, d16-d30 are used +@ No need for pushing arm and neon registers + .global ih264_iquant_itrans_recon_chroma_4x4_dc_a9 +ih264_iquant_itrans_recon_chroma_4x4_dc_a9: + + ldr r0, [sp, #20] + vld1.s16 d0, [r0] @load pi2_dc_src + + ldr r0, [sp] @load out_strd + + vld2.s8 {d2, d3}, [r1], r3 @load pred plane 1 => d2 &pred palne 2 => d3 + vld2.s8 {d3, d4}, [r1], r3 + vrshr.s16 d0, d0, #6 @i_macro = ((q0 + 32) >> 6); + vld2.s8 {d4, d5}, [r1], r3 + vld2.s8 {d5, d6}, [r1], r3 + + vdup.s16 q0, d0[0] @duplicate pi2_sr[0] + mov r1, r2 @backup pu1_out + + vtrn.32 d2, d3 @mov the 4 coeffs of current block to d2 + vtrn.32 d4, d5 + + vmov.u16 q15, #0x00ff + + vld1.u8 d18, [r2], r0 @load out [8 bit size) -8 coeffs + vaddw.u8 q1, q0, d2 @Add pred + vld1.u8 d19, [r2], r0 + vaddw.u8 q2, q0, d4 + vld1.u8 d20, [r2], r0 + vld1.u8 d21, [r2], r0 + + vqmovun.s16 d2, q1 + vqmovun.s16 d4, q2 + + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + + vbit.u8 q9, q1, q15 + vbit.u8 q10, q2, q15 + + vst1.u8 d18, [r1], r0 @store out + vst1.u8 d19, [r1], r0 + vst1.u8 d20, [r1], r0 + vst1.u8 d21, [r1], r0 + + bx lr + + + + + + + diff --git a/common/arm/ih264_itrans_recon_a9.s b/common/arm/ih264_itrans_recon_a9.s new file mode 100755 index 0000000..1d74da5 --- /dev/null +++ b/common/arm/ih264_itrans_recon_a9.s @@ -0,0 +1,216 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@ ******************************************************************************* +@ * @file +@ * ih264_itrans_recon_neon_a9.s +@ * +@ * @brief +@ * Contains function definitions for single stage inverse transform +@ * +@ * +@ * @par List of Functions: +@ * - ih264_itrans_recon_4x4_a9() +@ * +@ * @remarks +@ * None +@ * +@ ******************************************************************************* +@*/ +@/** +@ ******************************************************************************* +@ * +@ * @brief +@ * This function performs Inverse transform type Ci4 for 4*4 block +@ * +@ * @par Description: +@ * Performs inverse transform Ci4 and adds the residue to get the +@ * reconstructed block +@ * +@ * @param[in] pi16_levelBlock +@ * Input 4x4 coefficients +@ * +@ * @param[in] puc_predBuffer +@ * Prediction 4x4 block +@ * +@ * @param[out] puc_reconPic +@ * Output 4x4 block +@ * +@ * @param[in] ui16_picWidth +@ * Input stride +@ * +@ * @param[in] pred_strd +@ * Prediction stride +@ * +@ * @param[in] dst_strd +@ * Output Stride +@ * +@ * @param[in] zero_cols +@ * Zero columns in pi2_src +@ * +@ * @returns Void +@ * +@ * @remarks +@ * None +@ * +@ * +@ ******************************************************************************* +@ */ +@void ih264_itrans_recon_4x4( +@ WORD16 *pi2_src, +@ UWORD8 *pu1_pred, +@ UWORD8 *pu1_recon, +@ WORD32 src_strd, +@ WORD32 pred_strd, +@ WORD32 dst_strd, +@ UWORD32 q_lev, //quantizer level +@ WORD32 *pi4_tmp) +@**************Variables Vs Registers***************************************** +@r0 => *pi2_src +@r1 => *pu1_pred +@r2 => *pu1_recon +@r3 => src_strd +@r4 => pred_strd +@r5 => dst_strd +@r6 => q_lev +@r7 => *pi4_tmp + +.text +.p2align 2 + + + 
.global ih264_itrans_recon_4x4_a9 + +ih264_itrans_recon_4x4_a9: + stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + lsl r3, r3, #1 + + vld1.16 d0, [r0], r3 @0th row pi2_src_tmp[0] + ldr r4, [sp, #40] @Loads pred_strd + + vld1.16 d1, [r0], r3 @I row pi2_src_tmp[0] + ldr r5, [sp, #44] @Loads *dst_strd + + vld1.16 d2, [r0], r3 @II row pi2_src_tmp[0] + + vld1.16 d3, [r0] @III row pi2_src_tmp[0] + ldr r7, [sp, #52] @Loads *pi4_tmp + + vpush {d8-d15} + + vtrn.16 d0, d1 @Transpose to get all the 0th element in the single D register + vtrn.16 d2, d3 + vtrn.32 d0, d2 + vtrn.32 d1, d3 @D0 --> pi2_src_tmp[0], D1 --> pi2_src_tmp[1] + @D2 --> pi2_src_tmp[2], D3 --> pi2_src_tmp[3] + + vaddl.s16 q3, d0, d2 @x0 = (pi2_src_tmp[0] + pi2_src_tmp[2]) + vsubl.s16 q4, d0, d2 @x1 = (pi2_src_tmp[0] - pi2_src_tmp[2]) + vshr.s16 d4, d1, #1 @pi2_src_tmp[1] >> 1 + vshr.s16 d5, d3, #1 @pi2_src_tmp[3] >> 1 + + vsubl.s16 q5, d4, d3 @x2 = D_SHIFT(pi2_src_tmp[1],1,shft) - pi2_src_tmp[3] + + vaddl.s16 q6, d1, d5 @x3 = pi2_src_tmp[1] + D_SHIFT(pi2_src_tmp[3],1,shft) + + vadd.s32 q8, q4, q5 @x1 + x2 + vsub.s32 q9, q4, q5 @x1 - x2 + + vadd.s32 q7, q3, q6 @x0 + x3 + vsub.s32 q10, q3, q6 @x0 - x3 + + vtrn.32 q7, q8 @Transpose the register to have the adjacent values + + vtrn.32 q9, q10 + vadd.s32 d6, d14, d15 @x0(0,1) = (pi4_tblk[0,1] + pi4_tblk[8,9]) + + vsub.s32 d7, d14, d15 @x1(0,1) = (pi4_tblk[0,1] - pi4_tblk[8,9]) + + vshr.s32 d4, d16, #1 @pi4_tblk[4,5] >> 1 + vshr.s32 d5, d17, #1 @pi4_tblk[12,13] >> 1 + + vsub.s32 d8, d4, d17 @x2(0,1) = D_SHIFT(pi4_tblk[4,5],1,shft) - pi4_tblk[12,13] + vadd.s32 d9, d16, d5 @x3(0,1) = pi4_tblk[4,5] + D_SHIFT(pi4_tblk[12,13],1,shft) + + vadd.s32 d10, d18, d19 @x0(2,3) = (pi4_tblk[2,3] + pi4_tblk[10,11]) + vsub.s32 d11, d18, d19 @x1(2,3) = (pi4_tblk[2,3] - pi4_tblk[10,11]) + vshr.s32 d4, d20, #1 @pi4_tblk[6,7] >> 1 + vshr.s32 d5, d21, #1 @pi4_tblk[14,15] >> 1 + + vld1.32 d30[0], [r1], r4 @I row Load pu1_pred buffer + vsub.s32 d12, d4, d21 @x2(2,3) 
= D_SHIFT(pi4_tblk[6,7],1,shft) - pi4_tblk[14,15] + + vmovl.u8 q15, d30 @I row Convert 8 bit pred buffer to 16 bit + vadd.s32 d13, d20, d5 @x3(2,3) = pi4_tblk[6,7] + D_SHIFT(pi4_tblk[14,15],1,shft) + + vadd.s32 d16, d6, d9 @I row i_macro(0,1) = x0(0,1) + x3(0,1) + + vld1.32 d28[0], [r1], r4 @II row Load pu1_pred buffer + vadd.s32 d17, d10, d13 @I row i_macro(2,3) = x0(2,3) + x3(2,3) + + vqrshrn.s32 d16, q8, #6 @I row i_macro = D_SHIFT(i_macro,6,shft) + + vmovl.u8 q14, d28 @II row Convert 8 bit pred buffer to 16 bit + vadd.u16 d16, d16, d30 @I row i_macro += *pu1_pred_tmp + + vqmovun.s16 d16, q8 @I row CLIP_U8(i_macro) + vadd.s32 d18, d7, d8 @II row i_macro(0,1) = x1(0,1) + x2(0,1) + + vld1.32 d26[0], [r1], r4 @III row Load pu1_pred buffer + vadd.s32 d19, d11, d12 @II row i_macro(2,3) = x1(2,3) + x2(2,3) + + vqrshrn.s32 d18, q9, #6 @II row i_macro = D_SHIFT(i_macro,6,shft) + + vmovl.u8 q13, d26 @III row Convert 8 bit pred buffer to 16 bit + vadd.u16 d18, d18, d28 @II row i_macro += *pu1_pred_tmp + + vst1.32 d16[0], [r2], r5 @I row store the value + vsub.s32 d20, d7, d8 @III row i_macro(0,1) = x1(0,1) - x2(0,1) + + vqmovun.s16 d18, q9 @II row CLIP_U8(i_macro) + vsub.s32 d21, d11, d12 @III row i_macro(2,3) = x1(2,3) - x2(2,3) + + vld1.32 d24[0], [r1], r4 @IV row Load pu1_pred buffer + vqrshrn.s32 d20, q10, #6 @III row i_macro = D_SHIFT(i_macro,6,shft) + + vmovl.u8 q12, d24 @IV row Convert 8 bit pred buffer to 16 bit + vadd.u16 d20, d20, d26 @III row i_macro += *pu1_pred_tmp + + vqmovun.s16 d20, q10 @III row CLIP_U8(i_macro) + vsub.s32 d22, d6, d9 @IV row i_macro(0,1) = x0(0,1) - x3(0,1) + + vst1.32 d18[0], [r2], r5 @II row store the value + vsub.s32 d23, d10, d13 @IV row i_macro(2,3) = x0(2,3) - x3(2,3) + + vqrshrn.s32 d22, q11, #6 @IV row i_macro = D_SHIFT(i_macro,6,shft) + + vst1.32 d20[0], [r2], r5 @III row store the value + vadd.u16 d22, d22, d24 @IV row i_macro += *pu1_pred_tmp + + vqmovun.s16 d22, q11 @IV row CLIP_U8(i_macro) + vst1.32 d22[0], [r2], r5 @IV row 
store the value + + + vpop {d8-d15} + ldmfd sp!, {r4-r12, r15} @Reload the registers from SP + + + + diff --git a/common/arm/ih264_mem_fns_neon.s b/common/arm/ih264_mem_fns_neon.s new file mode 100755 index 0000000..2808897 --- /dev/null +++ b/common/arm/ih264_mem_fns_neon.s @@ -0,0 +1,268 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +@*/ +@/** +@ ******************************************************************************* +@ * @file +@ * ih264_mem_fns_neon.s +@ * +@ * @brief +@ * Contains function definitions for memory manipulation +@ * +@ * @author +@ * Naveen SR +@ * +@ * @par List of Functions: +@ * - ih264_memcpy_mul_8_a9q() +@ * - ih264_memcpy_a9q() +@ * - ih264_memset_mul_8_a9q() +@ * - ih264_memset_a9q() +@ * - ih264_memset_16bit_mul_8_a9q() +@ * - ih264_memset_a9q() +@ * +@ * @remarks +@ * None +@ * +@ ******************************************************************************* +@*/ + +@/** +@******************************************************************************* +@* +@* @brief +@* memcpy of a 1d array +@* +@* @par Description: +@* Does memcpy of 8bit data from source to destination for 8,16 or 32 number of bytes +@* +@* @param[in] pu1_dst +@* UWORD8 pointer to the destination +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[in] num_bytes +@* number of bytes to copy +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@void ih264_memcpy_mul_8(UWORD8 *pu1_dst, +@ UWORD8 *pu1_src, +@ UWORD8 num_bytes) +@**************Variables Vs Registers************************* +@ r0 => *pu1_dst +@ r1 => *pu1_src +@ r2 => num_bytes + +.text +.p2align 2 + + + .global ih264_memcpy_mul_8_a9q + +ih264_memcpy_mul_8_a9q: + +loop_neon_memcpy_mul_8: + @ Memcpy 8 bytes + vld1.8 d0, [r1]! + vst1.8 d0, [r0]! 
+ + subs r2, r2, #8 + bne loop_neon_memcpy_mul_8 + bx lr + + + +@******************************************************************************* +@*/ +@void ih264_memcpy(UWORD8 *pu1_dst, +@ UWORD8 *pu1_src, +@ UWORD8 num_bytes) +@**************Variables Vs Registers************************* +@ r0 => *pu1_dst +@ r1 => *pu1_src +@ r2 => num_bytes + + + + .global ih264_memcpy_a9q + +ih264_memcpy_a9q: + subs r2, #8 + blt memcpy +loop_neon_memcpy: + @ Memcpy 8 bytes + vld1.8 d0, [r1]! + vst1.8 d0, [r0]! + + subs r2, #8 + bge loop_neon_memcpy + cmp r2, #-8 + bxeq lr + +memcpy: + add r2, #8 + +loop_memcpy: + ldrb r3, [r1], #1 + strb r3, [r0], #1 + subs r2, #1 + bne loop_memcpy + bx lr + + + + +@void ih264_memset_mul_8(UWORD8 *pu1_dst, +@ UWORD8 value, +@ UWORD8 num_bytes) +@**************Variables Vs Registers************************* +@ r0 => *pu1_dst +@ r1 => value +@ r2 => num_bytes + + + + .global ih264_memset_mul_8_a9q + +ih264_memset_mul_8_a9q: + +@ Assumptions: numbytes is either 8, 16 or 32 + vdup.8 d0, r1 +loop_memset_mul_8: + @ Memset 8 bytes + vst1.8 d0, [r0]! + + subs r2, r2, #8 + bne loop_memset_mul_8 + + bx lr + + + + +@void ih264_memset(UWORD8 *pu1_dst, +@ UWORD8 value, +@ UWORD8 num_bytes) +@**************Variables Vs Registers************************* +@ r0 => *pu1_dst +@ r1 => value +@ r2 => num_bytes + + + + .global ih264_memset_a9q + +ih264_memset_a9q: + subs r2, #8 + blt memset + vdup.8 d0, r1 +loop_neon_memset: + @ Memcpy 8 bytes + vst1.8 d0, [r0]! 
+ + subs r2, #8 + bge loop_neon_memset + cmp r2, #-8 + bxeq lr + +memset: + add r2, #8 + +loop_memset: + strb r1, [r0], #1 + subs r2, #1 + bne loop_memset + bx lr + + + + +@void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst, +@ UWORD16 value, +@ UWORD8 num_words) +@**************Variables Vs Registers************************* +@ r0 => *pu2_dst +@ r1 => value +@ r2 => num_words + + + + .global ih264_memset_16bit_mul_8_a9q + +ih264_memset_16bit_mul_8_a9q: + +@ Assumptions: num_words is either 8, 16 or 32 + + @ Memset 8 words + vdup.16 d0, r1 +loop_memset_16bit_mul_8: + vst1.16 d0, [r0]! + vst1.16 d0, [r0]! + + subs r2, r2, #8 + bne loop_memset_16bit_mul_8 + + bx lr + + + + +@void ih264_memset_16bit(UWORD16 *pu2_dst, +@ UWORD16 value, +@ UWORD8 num_words) +@**************Variables Vs Registers************************* +@ r0 => *pu2_dst +@ r1 => value +@ r2 => num_words + + + + .global ih264_memset_16bit_a9q + +ih264_memset_16bit_a9q: + subs r2, #8 + blt memset_16bit + vdup.16 d0, r1 +loop_neon_memset_16bit: + @ Memset 8 words + vst1.16 d0, [r0]! + vst1.16 d0, [r0]! + + subs r2, #8 + bge loop_neon_memset_16bit + cmp r2, #-8 + bxeq lr + +memset_16bit: + add r2, #8 + +loop_memset_16bit: + strh r1, [r0], #2 + subs r2, #1 + bne loop_memset_16bit + bx lr + + + + diff --git a/common/arm/ih264_padding_neon.s b/common/arm/ih264_padding_neon.s new file mode 100755 index 0000000..9bab268 --- /dev/null +++ b/common/arm/ih264_padding_neon.s @@ -0,0 +1,646 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. 
+@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@ ******************************************************************************* +@ * @file +@ * ih264_padding_neon.s +@ * +@ * @brief +@ * Contains function definitions padding +@ * +@ * @author +@ * Ittiam +@ * +@ * @par List of Functions: +@ * - ih264_pad_top_a9q() +@ * - ih264_pad_left_luma_a9q() +@ * - ih264_pad_left_chroma_a9q() +@ * - ih264_pad_right_luma_a9q() +@ * - ih264_pad_right_chroma_a9q() +@ * +@ * @remarks +@ * None +@ * +@ ******************************************************************************* +@*/ + + +@/** +@******************************************************************************* +@* +@* @brief pad at the top of a 2d array +@* +@* @par Description: +@* The top row of a 2d array is replicated for pad_size times at the top +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] wd +@* integer width of the array +@* +@* @param[in] pad_size +@* integer -padding size of the array +@* +@* @returns none +@* +@* @remarks none +@* +@******************************************************************************* +@*/ +@void ih264_pad_top(UWORD8 *pu1_src, +@ WORD32 src_strd, +@ WORD32 wd, +@ WORD32 pad_size) +@**************Variables Vs Registers************************* +@ r0 => *pu1_src +@ r1 => src_strd +@ r2 => wd +@ r3 => pad_size + +.text +.p2align 2 + + 
.global ih264_pad_top_a9q + +ih264_pad_top_a9q: + + stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments + + sub r5, r0, r1 + rsb r6, r1, #0 + +loop_neon_memcpy_mul_16: + @ Load 16 bytes + vld1.8 {d0, d1}, [r0]! + mov r4, r5 + mov r7, r3 + add r5, r5, #16 + +loop_neon_pad_top: + vst1.8 {d0, d1}, [r4], r6 + subs r7, r7, #1 + bne loop_neon_pad_top + + subs r2, r2, #16 + bne loop_neon_memcpy_mul_16 + + ldmfd sp!, {r4-r11, pc} @Reload the registers from SP + + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Padding (luma block) at the left of a 2d array +@* +@* @par Description: +@* The left column of a 2d array is replicated for pad_size times at the left +@* +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @param[in] pad_size +@* integer -padding size of the array +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@#if PAD_LEFT_LUMA == C +@void ih264_pad_left_luma(UWORD8 *pu1_src, +@ WORD32 src_strd, +@ WORD32 ht, +@ WORD32 pad_size) +@**************Variables Vs Registers************************* +@ r0 => *pu1_src +@ r1 => src_strd +@ r2 => ht +@ r3 => pad_size + + + .global ih264_pad_left_luma_a9q + +ih264_pad_left_luma_a9q: + + stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments + + + sub r4, r0, r3 + sub r6, r1, #16 + subs r5, r3, #16 + bne loop_32 +loop_16: @ /*hard coded for width=16 ,height =8,16*/ + ldrb r8, [r0], r1 + ldrb r9, [r0], r1 + vdup.u8 q0, r8 + ldrb r10, [r0], r1 + vst1.8 {q0}, [r4], r1 @ 16 bytes store + vdup.u8 q1, r9 + vst1.8 {q1}, [r4], r1 @ 16 bytes store + ldrb r11, [r0], r1 + vdup.u8 q2, 
r10 + vdup.u8 q3, r11 + vst1.8 {q2}, [r4], r1 @ 16 bytes store + ldrb r8, [r0], r1 + vst1.8 {q3}, [r4], r1 @ 16 bytes store + ldrb r9, [r0], r1 + vdup.u8 q0, r8 + ldrb r10, [r0], r1 + vst1.8 {q0}, [r4], r1 @ 16 bytes store + vdup.u8 q1, r9 + ldrb r11, [r0], r1 + vst1.8 {q1}, [r4], r1 @ 16 bytes store + vdup.u8 q2, r10 + vdup.u8 q3, r11 + subs r2, r2, #8 + vst1.8 {q2}, [r4], r1 @ 16 bytes store + vst1.8 {q3}, [r4], r1 @ 16 bytes store + bne loop_16 + b end_func + +loop_32: @ /*hard coded for width=32 ,height =8,16*/ + ldrb r8, [r0], r1 + ldrb r9, [r0], r1 + vdup.u8 q0, r8 + ldrb r10, [r0], r1 + vst1.8 {q0}, [r4]! @ 16 bytes store + vdup.u8 q1, r9 + vst1.8 {q0}, [r4], r6 + vst1.8 {q1}, [r4]! @ 16 bytes store + vdup.u8 q2, r10 + vst1.8 {q1}, [r4], r6 @ 16 bytes store + ldrb r11, [r0], r1 + vst1.8 {q2}, [r4]! @ 16 bytes store + vdup.u8 q3, r11 + vst1.8 {q2}, [r4], r6 @ 16 bytes store + ldrb r8, [r0], r1 + vst1.8 {q3}, [r4]! @ 16 bytes store + vdup.u8 q0, r8 + ldrb r9, [r0], r1 + vst1.8 {q3}, [r4], r6 @ 16 bytes store + ldrb r10, [r0], r1 + vst1.8 {q0}, [r4]! @ 16 bytes store + vdup.u8 q1, r9 + vst1.8 {q0}, [r4], r6 @ 16 bytes store + ldrb r11, [r0], r1 + vst1.8 {q1}, [r4]! @ 16 bytes store + vdup.u8 q2, r10 + vst1.8 {q1}, [r4], r6 @ 16 bytes store + vst1.8 {q2}, [r4]! @ 16 bytes store + vdup.u8 q3, r11 + vst1.8 {q2}, [r4], r6 @ 16 bytes store + subs r2, r2, #8 + vst1.8 {q3}, [r4]! 
@ 16 bytes store + vst1.8 {q3}, [r4], r6 @ 16 bytes store + bne loop_32 + + + +end_func: + ldmfd sp!, {r4-r11, pc} @Reload the registers from SP + + + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Padding (chroma block) at the left of a 2d array +@* +@* @par Description: +@* The left column of a 2d array is replicated for pad_size times at the left +@* +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array (each colour component) +@* +@* @param[in] pad_size +@* integer -padding size of the array +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@#if PAD_LEFT_CHROMA == C +@void ih264_pad_left_chroma(UWORD8 *pu1_src, +@ WORD32 src_strd, +@ WORD32 ht, +@ WORD32 pad_size) +@{ +@ r0 => *pu1_src +@ r1 => src_strd +@ r2 => ht +@ r3 => pad_size + + + + .global ih264_pad_left_chroma_a9q + +ih264_pad_left_chroma_a9q: + + stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments + + sub r4, r0, r3 + sub r6, r1, #16 + + +loop_32_l_c: @ /*hard coded for width=32 ,height =4,8,12*/ + ldrh r8, [r0], r1 + ldrh r9, [r0], r1 + vdup.u16 q0, r8 + ldrh r10, [r0], r1 + vst1.8 {q0}, [r4]! @ 16 bytes store + vdup.u16 q1, r9 + vst1.8 {q0}, [r4], r6 @ 16 bytes store + ldrh r11, [r0], r1 + vst1.8 {q1}, [r4]! @ 16 bytes store + vdup.u16 q2, r10 + vst1.8 {q1}, [r4], r6 @ 16 bytes store + vdup.u16 q3, r11 + vst1.8 {q2}, [r4]! @ 16 bytes store + vst1.8 {q2}, [r4], r6 @ 16 bytes store + subs r2, r2, #4 + vst1.8 {q3}, [r4]! 
@ 16 bytes store + vst1.8 {q3}, [r4], r6 @ 16 bytes store + + + beq end_func_l_c @/* Branching when ht=4*/ + + ldrh r8, [r0], r1 + ldrh r9, [r0], r1 + vdup.u16 q0, r8 + ldrh r10, [r0], r1 + vst1.8 {q0}, [r4]! @ 16 bytes store + vdup.u16 q1, r9 + vst1.8 {q0}, [r4], r6 + ldrh r11, [r0], r1 + vst1.8 {q1}, [r4]! @ 16 bytes store + vdup.u16 q2, r10 + vst1.8 {q1}, [r4], r6 @ 16 bytes store + vdup.u16 q3, r11 + vst1.8 {q2}, [r4]! @ 16 bytes store + vst1.8 {q2}, [r4], r6 @ 16 bytes store + subs r2, r2, #4 + vst1.8 {q3}, [r4]! @ 16 bytes store + vst1.8 {q3}, [r4], r6 @ 16 bytes store + + beq end_func_l_c @/* Branching when ht=8*/ + bne loop_32_l_c + + ldrh r8, [r0], r1 + ldrh r9, [r0], r1 + vdup.u16 q0, r8 + ldrh r10, [r0], r1 + vst1.8 {q0}, [r4]! @ 16 bytes store + vdup.u16 q1, r9 + vst1.8 {q0}, [r4], r6 + ldrh r11, [r0], r1 + vst1.8 {q1}, [r4]! @ 16 bytes store + vdup.u16 q2, r10 + vst1.8 {q1}, [r4], r6 @ 16 bytes store + vdup.u16 q3, r11 + vst1.8 {q2}, [r4]! @ 16 bytes store + vst1.8 {q2}, [r4], r6 @ 16 bytes store + vst1.8 {q3}, [r4]! 
@ 16 bytes store + vst1.8 {q3}, [r4], r6 @ 16 bytes store + +end_func_l_c: + ldmfd sp!, {r4-r11, pc} @Reload the registers from SP + + + + + +@/** +@******************************************************************************* +@* +@* @brief +@* Padding (luma block) at the right of a 2d array +@* +@* @par Description: +@* The right column of a 2d array is replicated for pad_size times at the right +@* +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @param[in] pad_size +@* integer -padding size of the array +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@#if PAD_RIGHT_LUMA == C +@void ih264_pad_right_luma(UWORD8 *pu1_src, +@ WORD32 src_strd, +@ WORD32 ht, +@ WORD32 pad_size) +@{ +@ WORD32 row; +@ +@ for(row = 0; row < ht; row++) +@ { +@ memset(pu1_src, *(pu1_src -1), pad_size); +@ +@ pu1_src += src_strd; +@ } +@} +@ +@ r0 => *pu1_src +@ r1 => src_strd +@ r2 => ht +@ r3 => pad_size + + + + .global ih264_pad_right_luma_a9q + +ih264_pad_right_luma_a9q: + + stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments + + mov r4, r0 + sub r6, r1, #16 + sub r0, r0, #1 + subs r5, r3, #16 + bne loop_32 +loop_16_r: @ /*hard coded for width=16 ,height =8,16*/ + ldrb r8, [r0], r1 + ldrb r9, [r0], r1 + vdup.u8 q0, r8 + ldrb r10, [r0], r1 + vst1.8 {q0}, [r4], r1 @ 16 bytes store + vdup.u8 q1, r9 + vst1.8 {q1}, [r4], r1 @ 16 bytes store + ldrb r11, [r0], r1 + vdup.u8 q2, r10 + vdup.u8 q3, r11 + vst1.8 {q2}, [r4], r1 @ 16 bytes store + ldrb r8, [r0], r1 + vst1.8 {q3}, [r4], r1 @ 16 bytes store + ldrb r9, [r0], r1 + vdup.u8 q0, r8 + ldrb r10, [r0], r1 + vst1.8 {q0}, [r4], r1 @ 16 bytes store + vdup.u8 q1, 
r9 + ldrb r11, [r0], r1 + vst1.8 {q1}, [r4], r1 @ 16 bytes store + vdup.u8 q2, r10 + vdup.u8 q3, r11 + subs r2, r2, #8 + vst1.8 {q2}, [r4], r1 @ 16 bytes store + vst1.8 {q3}, [r4], r1 @ 16 bytes store + bne loop_16_r + b end_func_r + +loop_32_r: @ /*hard coded for width=32 ,height =8,16*/ + ldrb r8, [r0], r1 + ldrb r9, [r0], r1 + vdup.u8 q0, r8 + ldrb r10, [r0], r1 + vst1.8 {q0}, [r4]! @ 16 bytes store + vdup.u8 q1, r9 + vst1.8 {q0}, [r4], r6 + vst1.8 {q1}, [r4]! @ 16 bytes store + vdup.u8 q2, r10 + vst1.8 {q1}, [r4], r6 @ 16 bytes store + ldrb r11, [r0], r1 + vst1.8 {q2}, [r4]! @ 16 bytes store + vdup.u8 q3, r11 + vst1.8 {q2}, [r4], r6 @ 16 bytes store + ldrb r8, [r0], r1 + vst1.8 {q3}, [r4]! @ 16 bytes store + ldrb r9, [r0], r1 + vdup.u8 q0, r8 + vst1.8 {q3}, [r4], r6 @ 16 bytes store + ldrb r10, [r0], r1 + vst1.8 {q0}, [r4]! @ 16 bytes store + vdup.u8 q1, r9 + vst1.8 {q0}, [r4], r6 @ 16 bytes store + ldrb r11, [r0], r1 + vst1.8 {q1}, [r4]! @ 16 bytes store + vdup.u8 q2, r10 + vst1.8 {q1}, [r4], r6 @ 16 bytes store + vst1.8 {q2}, [r4]! @ 16 bytes store + vdup.u8 q3, r11 + vst1.8 {q2}, [r4], r6 @ 16 bytes store + subs r2, r2, #8 + vst1.8 {q3}, [r4]! 
@ 16 bytes store + vst1.8 {q3}, [r4], r6 @ 16 bytes store + bne loop_32_r + + + +end_func_r: + ldmfd sp!, {r4-r11, pc} @Reload the registers from SP + + + + + +@/** +@******************************************************************************* +@* +@* @brief +@;* Padding (chroma block) at the right of a 2d array +@* +@* @par Description: +@* The right column of a 2d array is replicated for pad_size times at the right +@* +@* +@* @param[in] pu1_src +@;* UWORD8 pointer to the source +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] ht +@;* integer height of the array +@* +@* @param[in] wd +@* integer width of the array (each colour component) +@* +@* @param[in] pad_size +@* integer -padding size of the array +@* +@* @param[in] ht +@;* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@#if PAD_RIGHT_CHROMA == C +@void ih264_pad_right_chroma(UWORD8 *pu1_src, +@ WORD32 src_strd, +@ WORD32 ht, +@ WORD32 pad_size) +@ r0 => *pu1_src +@ r1 => src_strd +@ r2 => ht +@ r3 => pad_size + + + + .global ih264_pad_right_chroma_a9q + +ih264_pad_right_chroma_a9q: + + stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments + + mov r4, r0 + sub r6, r1, #16 + sub r0, r0, #2 +loop_32_r_c: @ /*hard coded for width=32 ,height =8,4*/ + ldrh r8, [r0], r1 + ldrh r9, [r0], r1 + vdup.u16 q0, r8 + ldrh r10, [r0], r1 + vst1.8 {q0}, [r4]! @ 16 bytes store + vdup.u16 q1, r9 + vst1.8 {q0}, [r4], r6 + vst1.8 {q1}, [r4]! @ 16 bytes store + vdup.u16 q2, r10 + vst1.8 {q1}, [r4], r6 @ 16 bytes store + subs r2, r2, #4 + ldrh r11, [r0], r1 + vst1.8 {q2}, [r4]! @ 16 bytes store + vdup.u16 q3, r11 + vst1.8 {q2}, [r4], r6 @ 16 bytes store + vst1.8 {q3}, [r4]! 
@ 16 bytes store + vst1.8 {q3}, [r4], r6 @ 16 bytes store + + beq end_func_r_c @/* Branching when ht=4*/ + + ldrh r8, [r0], r1 + vdup.u16 q0, r8 + ldrh r9, [r0], r1 + ldrh r10, [r0], r1 + vst1.8 {q0}, [r4]! @ 16 bytes store + vdup.u16 q1, r9 + vst1.8 {q0}, [r4], r6 @ 16 bytes store + ldrh r11, [r0], r1 + vst1.8 {q1}, [r4]! @ 16 bytes store + vdup.u16 q2, r10 + vst1.8 {q1}, [r4], r6 @ 16 bytes store + vst1.8 {q2}, [r4]! @ 16 bytes store + vdup.u16 q3, r11 + vst1.8 {q2}, [r4], r6 @ 16 bytes store + subs r2, r2, #4 + vst1.8 {q3}, [r4]! @ 16 bytes store + vst1.8 {q3}, [r4], r6 @ 16 bytes store + + beq end_func_r_c @/* Branching when ht=8*/ + bne loop_32_r_c + + ldrh r8, [r0], r1 + vdup.u16 q0, r8 + ldrh r9, [r0], r1 + ldrh r10, [r0], r1 + vst1.8 {q0}, [r4]! @ 16 bytes store + vdup.u16 q1, r9 + vst1.8 {q0}, [r4], r6 @ 16 bytes store + ldrh r11, [r0], r1 + vst1.8 {q1}, [r4]! @ 16 bytes store + vdup.u16 q2, r10 + vst1.8 {q1}, [r4], r6 @ 16 bytes store + vst1.8 {q2}, [r4]! @ 16 bytes store + vdup.u16 q3, r11 + vst1.8 {q2}, [r4], r6 @ 16 bytes store + vst1.8 {q3}, [r4]! @ 16 bytes store + vst1.8 {q3}, [r4], r6 @ 16 bytes store + +end_func_r_c: + ldmfd sp!, {r4-r11, pc} @Reload the registers from SP + + + + + diff --git a/common/arm/ih264_platform_macros.h b/common/arm/ih264_platform_macros.h new file mode 100755 index 0000000..1f67403 --- /dev/null +++ b/common/arm/ih264_platform_macros.h @@ -0,0 +1,152 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_platform_macros.h +* +* @brief +* Platform specific Macro definitions used in the codec +* +* @author +* Ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ +#ifndef _IHEVC_PLATFORM_MACROS_H_ +#define _IHEVC_PLATFORM_MACROS_H_ + +#ifndef ARMV8 +void ih264_arm_dsb(void); + +#define DATA_SYNC() ih264_arm_dsb() +static __inline WORD32 CLIP_U8(WORD32 x) +{ + asm("usat %0, #8, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_S8(WORD32 x) +{ + asm("ssat %0, #8, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_U10(WORD32 x) +{ + asm("usat %0, #10, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_S10(WORD32 x) +{ + asm("ssat %0, #10, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_U12(WORD32 x) +{ + asm("usat %0, #12, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_S12(WORD32 x) +{ + asm("ssat %0, #12, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_U16(WORD32 x) +{ + asm("usat %0, #16, %1" : "=r"(x) : "r"(x)); + return x; +} +static __inline WORD32 CLIP_S16(WORD32 x) +{ + asm("ssat %0, #16, %1" : "=r"(x) : "r"(x)); + return x; +} + + +static __inline UWORD32 ITT_BIG_ENDIAN(UWORD32 x) +{ + 
asm("rev %0, %1" : "=r"(x) : "r"(x)); + return x; +} +#else +#define DATA_SYNC() ; + +#define CLIP_U8(x) CLIP3(0, 255, (x)) +#define CLIP_S8(x) CLIP3(-128, 127, (x)) + +#define CLIP_U10(x) CLIP3(0, 1023, (x)) +#define CLIP_S10(x) CLIP3(-512, 511, (x)) + +#define CLIP_U12(x) CLIP3(0, 4095, (x)) +#define CLIP_S12(x) CLIP3(-2048, 2047, (x)) + +#define CLIP_U16(x) CLIP3(0, 65535, (x)) +#define CLIP_S16(x) CLIP3(-32768, 32767, (x)) + +#define ITT_BIG_ENDIAN(x) ((x & 0x000000ff) << 24) | \ + ((x & 0x0000ff00) << 8) | \ + ((x & 0x00ff0000) >> 8) | \ + ((UWORD32)x >> 24); +#endif + +#define SHL(x,y) (((y) < 32) ? ((x) << (y)) : 0) +#define SHR(x,y) (((y) < 32) ? ((x) >> (y)) : 0) + +#define SHR_NEG(val,shift) ((shift>0)?(val>>shift):(val<<(-shift))) +#define SHL_NEG(val,shift) ((shift<0)?(val>>(-shift)):(val<<shift)) + +#define INLINE inline + +static INLINE UWORD32 CLZ(UWORD32 u4_word) +{ + if(u4_word) + return (__builtin_clz(u4_word)); + else + return 32; +} +static INLINE UWORD32 CTZ(UWORD32 u4_word) +{ + if(0 == u4_word) + return 31; + else + { + unsigned int index; + index = __builtin_ctz(u4_word); + return (UWORD32)index; + } +} + + +#define NOP(nop_cnt) {UWORD32 nop_i; for (nop_i = 0; nop_i < nop_cnt; nop_i++);} + + +#define MEM_ALIGN8 __attribute__ ((aligned (8))) +#define MEM_ALIGN16 __attribute__ ((aligned (16))) +#define MEM_ALIGN32 __attribute__ ((aligned (32))) + +#endif /* _IHEVC_PLATFORM_MACROS_H_ */ diff --git a/common/arm/ih264_resi_trans_a9.s b/common/arm/ih264_resi_trans_a9.s new file mode 100755 index 0000000..08821f5 --- /dev/null +++ b/common/arm/ih264_resi_trans_a9.s @@ -0,0 +1,604 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. 
+@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@******************************************************************************* +@* @file +@* ih264_resi_trans_a9.s +@* +@* @brief +@* Contains function definitions for residual and forward trans +@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* ih264_resi_trans_4x4_a9 +@* ih264_resi_trans_8x8_a9 +@* @remarks +@* None +@* +@******************************************************************************* + + +.text +.p2align 2 +@***************************************************************************** +@* +@* Function Name : ih264_resi_trans_4x4_a9 +@* Description : This function does cf4 of H264 followed by and approximate scaling +@* +@* Arguments : +@ R0 :pointer to src buffer +@ R1 :pointer to pred buffer +@ R2 :pointer to dst buffer +@ R3 :src_stride +@ STACk :pred_stride,dst_stride + +@* Values Returned : NONE +@* +@* Register Usage : +@* Stack Usage : +@* Cycles : Around +@* Interruptiaility : Interruptable +@* +@* Known Limitations +@* \Assumptions : +@* +@* Revision History : +@* DD MM YYYY Author(s) Changes +@* 30 12 2009 100633 First version +@* +@***************************************************************************** + + + .global ih264_resi_trans_4x4_a9 + .extern g_scal_coff_h264_4x4 +g_scal_coff_h264_4x4_addr: + .long g_scal_coff_h264_4x4 - 4x4lbl - 8 + +ih264_resi_trans_4x4_a9: + + @R0 :pointer to src buffer + @R1 :pointer to 
pred buffer + @R2 :pointer to dst buffer + @R3 :src_stride + @STACk :pred_stride,dst_stride + + push {r4-r12, lr} @push all the variables first + + mov r6, sp + add r6, r6, #40 @decrement stack pointer,to accomodate two variables + ldmfd r6, {r4-r5} @load the strides into registers + @R4 pred_stride + @R5 dst_stride + + + @we have to give the stride as post inrement in VLDR1 + @but since thr stride is from end of row 1 to start of row 2, + @we need to add the size of the curent row to strides ie we need to add 4 to it (4 bytes) + @ADD R3,#4 + @ADD R4,#4 + @ADD R5,#4 + @in case of dst the stride represnts 16 bit ie 2*8bits + @hence we need to add #4 to it and thenm multiply by 2 + @--------------------function loading done------------------------ + + @lets find residual + @data is like 1a -> d0[1:31] d0[32:64] + @ a b c d # # # # + vld1.u8 d30, [r0], r3 @load 4 pixels of row1 current buffer + vld1.u8 d31, [r1], r4 @load 4 pixels of row1 pred buffer + @ data is like 1a -> q4[1:63] q4[64:148] + @ d8[1:63] d9[1:63] + @ a b c d # # # # + + vld1.u8 d28, [r0], r3 @load row 2 of src to d28[0] + vld1.u8 d29, [r1], r4 @load row2 of pred to d29[0] + + vld1.u8 d26, [r0], r3 @load row 3 of src to d26[0] + vsubl.u8 q0, d30, d31 @curr - pred for row one + + vld1.u8 d27, [r1], r4 @load row 3of pred t0 d27[0] + vsubl.u8 q1, d28, d29 @find row 2 of src -pred to d0 + + vld1.u8 d24, [r0], r3 @load row 4 of src to d24[0] + + vld1.u8 d25, [r1], r4 @load row 4 of src tp d25[0] + vsubl.u8 q2, d26, d27 @load src-pred row 3 to d[2] + + lsl r5, r5, #2 @ multiply dst stride by since we are storing 32 bit values + ldr r6, g_scal_coff_h264_4x4_addr +4x4lbl: + add r6, r6, pc @ load the address of global array + + vsubl.u8 q3, d24, d25 @load row 4 of src - pred to q6 + + @after this + @D0 -> 1a + @D2 -> 2a + @D4 -> 3a + @D6 -> 4a + + @transpose the matrix so that we can do the horizontal transform first + @#1 #2 #3 #4 + @a b c d ---- D0 + @e f g h -----D2 + @i j k l -----D4 + @m n o p -----D6 + 
@transpose the inner 2x2 blocks + vtrn.16 d0, d2 + vld1.s16 {q10}, [r6]! @ load the scaling values 0-7; + vtrn.16 d4, d6 + @a e c g + @b f d h + @i m k o + @j n l p + vtrn.32 d0, d4 + vtrn.32 d2, d6 + @a e i m #1 -- D0 --- x4 + @b f j n #2 -- D2 --- x5 + @c g k o #3 -- D4 ----x6 + @d h l p #4 -- D6 ----x7 + + @we have loaded the residuals into the registers , now we need to add and subtract them + @let us do the horiz transform first + + vsub.s16 d5, d2, d4 @x2 = x5-x6 + vsub.s16 d7, d0, d6 @x3 = x4-x7; + + vadd.s16 d3, d2, d4 @x1 = x5+x6 + vadd.s16 d1, d0, d6 @x0 = x4+x7 + + + vshl.s16 d31, d7, #1 @ + vshl.s16 d30, d5, #1 @ + + vadd.s16 d0, d1, d3 @x0 + x1; + vsub.s16 d4, d1, d3 @x0 - x1; + + vadd.s16 d2, d31, d5 @U_SHIFT(x3,1,shft) + x2; + vsub.s16 d6, d7, d30 @x3 - U_SHIFT(x2,1,shft); + + @taking transform again so as to make do vert transform + vtrn.16 d0, d2 + vtrn.16 d4, d6 + + vtrn.32 d0, d4 + vtrn.32 d2, d6 + + @let us do vertical transform + @same code as horiz + + vadd.s16 d1, d0, d6 @x0 = x4+x7 + vadd.s16 d3, d2, d4 @x1 = x5+x6 + vsub.s16 d7, d0, d6 @x3 = x4-x7; + vsub.s16 d5, d2, d4 @x2 = x5-x6 + + +@Since we are going to do scal / quant or whatever, we are going to divide by +@a 32 bit number. 
So we have to expand the values + + @VADDL.S16 Q12,D1,D3;x0 + x1 + @VSUBL.S16 Q14,D1,D3;x0 - x1 + + @VSHL.S16 D8,D5,#1; + @VSHL.S16 D9,D7,#1; + + @VADDL.S16 Q13,D9,D5 ; + x2 + @VSUBL.S16 Q15,D7,D8 ;x3 - U_SHIFT(x2,1,shft) + +@scaling follows + +@now we need to do the scaling,so load the scaling matrix +@mutliplying by the scaling coeffient; store the results from q5-q8 ; + + vadd.s16 d24, d3, d1 @x4 = x0 + x1 + vsub.s16 d28, d1, d3 @x6 = x0 - x1 + + vshl.s16 d0, d7, #1 @ U_SHIFT(x3,1,shft) + vmull.s16 q4, d24, d20 @x4*s0 + + vshl.s16 d2, d5, #1 @ U_SHIFT(x2,1,shft) + + vadd.s16 d26, d0, d5 @x5 = U_SHIFT(x3,1,shft) + x2 + vmull.s16 q5, d26, d21 @x5*s1 + + vst1.s32 {q4}, [r2], r5 @save 4 pixels of row1 current buffer and increment pointer by stride + + vld1.s16 {q10}, [r6] @load 8-16 scaling coeffcients + + vsub.s16 d30, d7, d2 @x7 = x3 - U_SHIFT(x2,1,shft) + + vmull.s16 q6, d28, d20 @x6*s2 + vst1.s32 {q5}, [r2], r5 + + vmull.s16 q7, d30, d21 @x7*s3 + + + vst1.s32 {q6}, [r2], r5 + vst1.s32 {q7}, [r2] + + pop {r4-r12, pc} @pop back all variables + + + + +@***************************************************************************** +@* Function Name : ih264_resi_trans_8x8_a9 +@* Description : This function does cf8 followd by an approximate normalization of H264 +@* +@* Arguments : +@* R0 :pointer to src buffer +@ R1 :pointer to pred buffer +@ R2 :pointer to dst buffer +@ R3 :src_stride +@ STACk :pred_stride,dst_st +@* +@* +@* Values Returned : NONE +@* +@* Register Usage : +@* Stack Usage : +@* Cycles : Around +@* Interruptiaility : Interruptable +@* +@* Known Limitations +@* \Assumptions : +@* +@* Revision History : +@* DD MM YYYY Author(s) Changes +@* 30 12 2009 100633 First version +@* +@***************************************************************************** + + + .global ih264_resi_trans_8x8_a9 + .extern g_scal_coff_h264_8x8 +g_scal_coff_h264_8x8_addr: + .long g_scal_coff_h264_8x8 - 8x8lbl - 8 + + +ih264_resi_trans_8x8_a9: + + @R0 :pointer to src buffer + 
@R1 :pointer to pred buffer + @R2 :pointer to dst buffer + @R3 :src_stride + @STACk :pred_stride,dst_stride + + push {r4-r12, lr} @push all the variables first + + mov r6, sp + add r6, r6, #40 @decrement stack pointer,to accomodate two variables + ldmfd r6, {r4-r5} @load the strides into registers + @R4 pred_stride + @R5 dst_stride + + @we have to give the stride as post inrement in vst1 + @in case of dst the stride represnts 16 bit ie 2*8bits + @hence we need to add #4 to it and thenm multiply by 2 + @--------------------function loading done------------------------ + + @lets find residual + @data is like 1a -> d0[1:31] d0[32:64] + @ a b c d # # # # + vld1.u8 d30, [r0], r3 @load 4 pixels of row1 current buffer + vld1.u8 d31, [r1], r4 @load 4 pixels of row1 pred buffer + + vld1.u8 d28, [r0], r3 @src rw2 + vld1.u8 d29, [r1], r4 @pred rw2 + vsubl.u8 q0, d30, d31 @src-pred rw1 + + vld1.u8 d26, [r0], r3 + vld1.u8 d27, [r1], r4 + vsubl.u8 q1, d28, d29 + + vld1.u8 d24, [r0], r3 + vld1.u8 d25, [r1], r4 + vsubl.u8 q2, d26, d27 + + vld1.u8 d22, [r0], r3 + vld1.u8 d23, [r1], r4 + vsubl.u8 q3, d24, d25 + + vld1.u8 d20, [r0], r3 + vld1.u8 d21, [r1], r4 + vsubl.u8 q4, d22, d23 + + vld1.u8 d18, [r0], r3 + vld1.u8 d19, [r1], r4 + vsubl.u8 q5, d20, d21 + + vld1.u8 d16, [r0], r3 + vld1.u8 d17, [r1], r4 + vsubl.u8 q6, d18, d19 + + lsl r5, r5, #2 + + + vsubl.u8 q7, d16, d17 + + @after this + @Q0 -> 1a + @Q1 -> 2a + @Q2 -> 3a + @Q3 -> 4a + @Q4 -> 5a + @Q5 -> 6a + @Q6 -> 7a + @Q7 -> 8a + + @transpose the matrix so that we can do the horizontal transform first + + @transpose the inner 2x2 blocks + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + @transpose the inner 4x4 blocks + vtrn.32 q0, q2 + vtrn.32 q1, q3 + + vtrn.32 q4, q6 + vtrn.32 q5, q7 + + @transpose the outer 8x8 blocks + vswp d1, d8 + vswp d7, d14 + vswp d3, d10 + vswp d5, d12 + @transpose done + +@@this point we will have data in Q0-Q7 +@Q7 will be populated within 2 clock cycle +@all others are 
availabe @ this clock cycle + + @we have loaded the residuals into the registers , now we need to add and subtract them + @let us do the horiz transform first + + vadd.s16 q8, q0, q7 @ a0 = r0 + r7; + vadd.s16 q9, q1, q6 @ a1 = r1 + r6; + vadd.s16 q10, q2, q5 @ a2 = r2 + r5; + vadd.s16 q11, q3, q4 @ a3 = r3 + r4; + + vsub.s16 q12, q0, q7 @ b0 = r0 - r7; + vsub.s16 q13, q1, q6 @ b1 = r1 - r6; + vsub.s16 q15, q3, q4 @ b3 = r3 - r4; + vsub.s16 q14, q2, q5 @ b2 = r2 - r5; + + vadd.s16 q1, q8, q11 @ a4 = a0 + a3; + vadd.s16 q3, q9, q10 @ a5 = a1 + a2; + vsub.s16 q7, q9, q10 @ a7 = a1 - a2; + vsub.s16 q5, q8, q11 @ a6 = a0 - a3; + + ldr r6, g_scal_coff_h264_8x8_addr +8x8lbl: + add r6, r6, pc @ load the address of global array + + vadd.s16 q0, q1, q3 @ pi2_res[0] = a4 + a5; + vshr.s16 q8, q7, #1 @ pi2_res[2] = a6 + D_SHIFT(a7,1,shft); + + vsub.s16 q4, q1, q3 @ pi2_res[4] = a4 - a5; + + vadd.s16 q2, q5, q8 @ + + + vshr.s16 q9, q5, #1 @ pi2_res[6] = D_SHIFT(a6,1,shft) - a7; + vsub.s16 q6, q9, q7 @ + +@do not change Q0,Q2.Q4,Q6 they contain results +@Q1,Q3,Q5,Q7 TO STORE RESULTS +@Q8 Q9 Q10 Q11 USE @WILL + + vshr.s16 q1, q12, #1 @ D_SHIFT(b0,1,shft) + vshr.s16 q3, q13, #1 @ D_SHIFT(b1,1,shft) + vshr.s16 q5, q14, #1 @ D_SHIFT(b2,1,shft) + vshr.s16 q7, q15, #1 @ D_SHIFT(b3,1,shft) + + vadd.s16 q8, q1, q12 @ (D_SHIFT(b0,1,shft) + b0); + vadd.s16 q9, q3, q13 @ (D_SHIFT(b1,1,shft) + b1); + vadd.s16 q10, q5, q14 @ (D_SHIFT(b2,1,shft) + b2); + vadd.s16 q11, q7, q15 @ (D_SHIFT(b3,1,shft) + b3); + + vadd.s16 q1, q14, q8 @ b2 + (D_SHIFT(b0,1,shft) + b0); + vsub.s16 q5, q15, q9 @ b3 - (D_SHIFT(b1,1,shft) + b1); + vadd.s16 q3, q15, q10 @ b3 + (D_SHIFT(b2,1,shft) + b2); + vsub.s16 q7, q11, q14 @ -b2 + (D_SHIFT(b3,1,shft) + b3); + + vadd.s16 q8, q13, q1 @ b4 = b1 + b2 + (D_SHIFT(b0,1,shft) + b0); + vsub.s16 q9, q12, q3 @ b5 = b0 - b3 - (D_SHIFT(b2,1,shft) + b2); + vadd.s16 q10, q12, q5 @ b6 = b0 + b3 - (D_SHIFT(b1,1,shft) + b1); + vadd.s16 q11, q13, q7 @ b7 = b1 - b2 + (D_SHIFT(b3,1,shft) 
+ b3); + + vshr.s16 q15, q8, #2 @ D_SHIFT(b4,2,shft) + vshr.s16 q14, q9, #2 @ D_SHIFT(b5,2,shft); + vshr.s16 q13, q10, #2 @ D_SHIFT(b6,2,shft); + vshr.s16 q12, q11, #2 @ D_SHIFT(b7,2,shft); + + + vadd.s16 q3, q9, q13 @ pi2_res[3] = b5 + D_SHIFT(b6,2,shft); + vsub.s16 q5, q10, q14 @ pi2_res[5] = b6 - D_SHIFT(b5,2,shft); + vadd.s16 q1, q8, q12 @ pi2_res[1] = b4 + D_SHIFT(b7,2,shft); + vsub.s16 q7, q15, q11 @ pi2_res[7] = D_SHIFT(b4,2,shft) - b7; + + @------------horiz transform done------------------------- + @results are in Q0-Q7 + @all other neon registes can be used at will + +@doing vertical transform +@code exact copy of horiz transform above + + @transpose the inner 2x2 blocks + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + @transpose the inner 4x4 blocks + vtrn.32 q0, q2 + vtrn.32 q1, q3 + + vtrn.32 q4, q6 + vtrn.32 q5, q7 + + @transpose the outer 8x8 blocks + vswp d1, d8 + vswp d3, d10 + vswp d5, d12 + vswp d7, d14 + + @transpose done + + vadd.s16 q8, q0, q7 @ a0 = r0 + r7; + vadd.s16 q9, q1, q6 @ a1 = r1 + r6; + vadd.s16 q10, q2, q5 @ a2 = r2 + r5; + vadd.s16 q11, q3, q4 @ a3 = r3 + r4; + + vsub.s16 q12, q0, q7 @ b0 = r0 - r7; + vsub.s16 q13, q1, q6 @ b1 = r1 - r6; + vsub.s16 q14, q2, q5 @ b2 = r2 - r5; + vsub.s16 q15, q3, q4 @ b3 = r3 - r4; + + vadd.s16 q1, q8, q11 @ a4 = a0 + a3; + vadd.s16 q3, q9, q10 @ a5 = a1 + a2; + vsub.s16 q5, q8, q11 @ a6 = a0 - a3; + vsub.s16 q7, q9, q10 @ a7 = a1 - a2; + + + vadd.s16 q0, q1, q3 @ pi2_res[0] = a4 + a5; + + vshr.s16 q8, q7, #1 @ pi2_res[2] = a6 + D_SHIFT(a7,1,shft); + @DSHIFT_TO_0 Q8,Q7,#1,#0 + vadd.s16 q2, q5, q8 @ + + vsub.s16 q4, q1, q3 @ pi2_res[4] = a4 - a5; + + vshr.s16 q9, q5, #1 @ pi2_res[6] = D_SHIFT(a6,1,shft) - a7; + vsub.s16 q6, q9, q7 @ + +@do not change Q0,Q2.Q4,Q6 they contain results +@Q1,Q3,Q5,Q7 TO STORE RESULTS +@Q8 Q9 Q10 Q11 USE @WILL + + vshr.s16 q1, q12, #1 @ D_SHIFT(b0,1,shft) + vshr.s16 q3, q13, #1 @ D_SHIFT(b1,1,shft) + vshr.s16 q5, q14, #1 @ D_SHIFT(b2,1,shft) + 
vshr.s16 q7, q15, #1 @ D_SHIFT(b3,1,shft) + + + vadd.s16 q8, q1, q12 @ (D_SHIFT(b0,1,shft) + b0); + vadd.s16 q9, q3, q13 @ (D_SHIFT(b1,1,shft) + b1); + vadd.s16 q10, q5, q14 @ (D_SHIFT(b2,1,shft) + b2); + vadd.s16 q11, q7, q15 @ (D_SHIFT(b3,1,shft) + b3); + + vadd.s16 q1, q14, q8 @ b2 + (D_SHIFT(b0,1,shft) + b0); + vadd.s16 q3, q15, q10 @ b3 + (D_SHIFT(b2,1,shft) + b2); + vsub.s16 q5, q15, q9 @ b3 - (D_SHIFT(b1,1,shft) + b1); + vsub.s16 q7, q11, q14 @ -b2 + (D_SHIFT(b3,1,shft) + b3); + + vadd.s16 q8, q13, q1 @ b4 = b1 + b2 + (D_SHIFT(b0,1,shft) + b0); + vsub.s16 q9, q12, q3 @ b5 = b0 - b3 - (D_SHIFT(b2,1,shft) + b2); + vadd.s16 q10, q12, q5 @ b6 = b0 + b3 - (D_SHIFT(b1,1,shft) + b1); + vadd.s16 q11, q13, q7 @ b7 = b1 - b2 + (D_SHIFT(b3,1,shft) + b3); + + vshr.s16 q15, q8, #2 @ D_SHIFT(b4,2,shft) + vshr.s16 q14, q9, #2 @ D_SHIFT(b5,2,shft); + vshr.s16 q13, q10, #2 @ D_SHIFT(b6,2,shft); + vshr.s16 q12, q11, #2 @ D_SHIFT(b7,2,shft); + + +@since we are going to scal by small values, we need not expand the guys to 32 bit bit values + vsub.s16 q5, q10, q14 @ pi2_res[5] = b6 - D_SHIFT(b5,2,shft); + vsub.s16 q7, q15, q11 @ pi2_res[7] = D_SHIFT(b4,2,shft) - b7; + vadd.s16 q3, q9, q13 @ pi2_res[3] = b5 + D_SHIFT(b6,2,shft); + vadd.s16 q1, q8, q12 @ pi2_res[1] = b4 + D_SHIFT(b7,2,shft); + + @------------vert transform done------------------------- + @results are in Q0-Q7 + @all other neon registes can be used at will + + @scaling + @since the 8x8 scaling matrix repeats in 1x4,1x4 block , + @we need only load 4 values for each row and in total 4 rows + vld1.s16 {q14-q15}, [r6] @ + + @since we need to get a 32 bit o/p for two 16 bit multiplications + @we need a VMULL instruction +@-----------------------------first and second row + + vmull.s16 q8, d0, d28 @scale the first row first 4 elem + vmull.s16 q9, d28, d1 @scale the second row last 4 elemts + + vmull.s16 q10, d2, d29 @ scale second row first 4 elem + vmull.s16 q11, d29, d3 @scale the second row last 4 elem + vmull.s16 
q12, d4, d30 @scale third row first 4 elem + + vst1.s32 {q8, q9}, [r2], r5 @ write the first row complete + + vmull.s16 q13, d30, d5 @scale the third row last 4 elem + vmull.s16 q8, d6, d31 @scale the fourth row first 4 elem + + + vst1.s32 {q10, q11}, [r2], r5 @store the second row complete + +@------------------------------- 3rd and 4th row + + vmull.s16 q9, d31, d7 @scale the fourth row second column + + vst1.s32 {q12, q13}, [r2], r5 @store the third row complete + + vmull.s16 q10, d8, d28 @scale the 5th row fisrst 4 elms + vmull.s16 q11, d28, d9 @scale the 5th row second 4 elems + + vmull.s16 q12, d10, d29 @scale the 6th row first4 elements + + + vst1.s32 {q8, q9}, [r2], r5 @store fifth row + +@--------------------------------5th and 6th row + + vmull.s16 q13, d29, d11 @scale 6th row sendond 4 elems + + vmull.s16 q8, d12, d30 @scale 7th rw first 4 elms + + vst1.s32 {q10, q11}, [r2], r5 @store 6th row second 4 elements + + vmull.s16 q9, d30, d13 @scale 7th rw second 4 elms + vmull.s16 q10, d14, d31 @scale 8th rw forst 4 elms + + + vst1.s32 {q12, q13}, [r2], r5 @store 6th row + +@----------------------------------7th and 8th row + vmull.s16 q11, d31, d15 @scale 8th row second 4 elms + + vst1.s32 {q8, q9}, [r2], r5 @store 7th row + vst1.s32 {q10, q11}, [r2], r5 @store 8th row + +@----------------------------------done writing + + pop {r4-r12, pc} @pop back all variables + + + + + + diff --git a/common/arm/ih264_resi_trans_quant_a9.s b/common/arm/ih264_resi_trans_quant_a9.s new file mode 100755 index 0000000..caf362e --- /dev/null +++ b/common/arm/ih264_resi_trans_quant_a9.s @@ -0,0 +1,694 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. 
+@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** +@******************************************************************************* +@* @file +@* ih264_resi_trans_quant_a9.s +@* +@* @brief +@* Contains function definitions for residual and forward trans +@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* ih264_resi_trans_quant_4x4_a9 +@* ih264_resi_trans_quant_8x8_a9 +@* ih264_resi_trans_quant_chroma_4x4_a9 +@* ih264_hadamard_quant_4x4_a9 +@* ih264_hadamard_quant_2x2_uv_a9 +@* +@* @remarks +@* None +@* +@******************************************************************************* + + +.text +.p2align 2 +@***************************************************************************** +@* +@* Function Name : ih264_resi_trans_quant_4x4_a9 +@* Description : This function does cf4 of H264 +@* +@* Arguments : R0 :pointer to src buffer +@ R1 :pointer to pred buffer +@ R2 :pointer to dst buffer +@ R3 :source stride +@ STACK : pred stride, +@ dst stride, +@ pointer to scaling matrix, +@ pointer to threshold matrix, +@ qbits, +@ rounding factor, +@ pointer to store nnz +@ pointer to store non quantized dc value +@ Values Returned : NONE +@ +@ Register Usage : +@ Stack Usage : 40 bytes +@ Cycles : Around +@ Interruptiaility : Interruptable +@ +@ Known Limitations +@ \Assumptions : +@ +@ Revision History : +@ DD MM YYYY Author(s) Changes +@ 1 12 2013 100633 First version +@ 20 1 2014 100633 Changes the API, 
Optimization +@ +@***************************************************************************** + + .global ih264_resi_trans_quant_4x4_a9 +ih264_resi_trans_quant_4x4_a9: + + @R0 :pointer to src buffer + @R1 :pointer to pred buffer + @R2 :pointer to dst buffer + @R3 :Source stride + @STACk :pred stride + @ :scale matirx, + @ :threshold matrix + @ :qbits + @ :round factor + @ :nnz + + push {r4-r12, lr} @push all the variables first + + add r11, sp, #40 @decrement stack pointer,to accomodate two variables + ldmfd r11, {r4-r10} @load the strides into registers + + @R0 :pointer to src buffer + @R1 :pointer to pred buffer + @R2 :pointer to dst buffer + @R3 :Source stride + @R4 :Pred stride + @R5 :scale matirx, + @R6 :threshold matrix + @R7 :qbits + @R8 :round factor + @R9 :nnz + + vpush {d8-d15} + + mov r11, #0 + sub r7, r11, r7 @Negate the qbit value for usiing LSL + + @------------Fucntion Loading done----------------; + + vld1.u8 d30, [r0], r3 @load first 8 pix src row 1 + + vld1.u8 d31, [r1], r4 @load first 8 pix pred row 1 + + vld1.u8 d28, [r0], r3 @load first 8 pix src row 2 + + vld1.u8 d29, [r1], r4 @load first 8 pix pred row 2 + + vld1.u8 d26, [r0], r3 @load first 8 pix src row 3 + + vld1.u8 d27, [r1], r4 @load first 8 pix pred row 3 + vsubl.u8 q0, d30, d31 @find residue row 1 + + vld1.u8 d24, [r0], r3 @load first 8 pix src row 4 + + vld1.u8 d25, [r1], r4 @load first 8 pix pred row 4 + vsubl.u8 q1, d28, d29 @find residue row 2 + + vsubl.u8 q2, d26, d27 @find residue row 3 + vsubl.u8 q3, d24, d25 @find residue row 4 + + vtrn.16 d0, d2 @T12 + vtrn.16 d4, d6 @T23 + vtrn.32 d0, d4 @T13 + vtrn.32 d2, d6 @T14 + + vadd.s16 d8 , d0, d6 @x0 = x4+x7 + vadd.s16 d9 , d2, d4 @x1 = x5+x6 + vsub.s16 d10, d2, d4 @x2 = x5-x6 + vsub.s16 d11, d0, d6 @x3 = x4-x7 + + vshl.s16 d12, d10, #1 @U_SHIFT(x2,1,shft) + vshl.s16 d13, d11, #1 @U_SHIFT(x3,1,shft) + + vadd.s16 d14, d8, d9 @x4 = x0 + x1; + vsub.s16 d16, d8, d9 @x6 = x0 - x1; + vadd.s16 d15, d13, d10 @x5 = U_SHIFT(x3,1,shft) + x2; 
+ vsub.s16 d17, d11, d12 @x7 = x3 - U_SHIFT(x2,1,shft); + + @taking transpose again so as to make do vert transform + vtrn.16 d14, d15 @T12 + vtrn.16 d16, d17 @T23 + vtrn.32 d14, d16 @T13 + vtrn.32 d15, d17 @T24 + + @let us do vertical transform + @same code as horiz + vadd.s16 d18, d14, d17 @x0 = x4+x7 + vadd.s16 d19, d15, d16 @x1 = x5+x6 + vsub.s16 d20, d15, d16 @x2 = x5-x6 + vsub.s16 d21, d14, d17 @x3 = x4-x7 + + vshl.s16 d22, d20, #1 @U_SHIFT(x2,1,shft) + vshl.s16 d23, d21, #1 @U_SHIFT(x3,1,shft) + + vdup.s32 q4, r8 @Load rounding value row 1 + + vadd.s16 d24, d18, d19 @x5 = x0 + x1; + vsub.s16 d26, d18, d19 @x7 = x0 - x1; + vadd.s16 d25, d23, d20 @x6 = U_SHIFT(x3,1,shft) + x2; + vsub.s16 d27, d21, d22 @x8 = x3 - U_SHIFT(x2,1,shft); + vdup.s32 q10, r7 @Load qbit values + + vst1.s16 d24[0], [r10] @Store the dc value to alternate dc sddress + +@core tranform is done for 4x8 block 1 + vld1.s16 {q14-q15}, [r5] @load the scaling values + + vabs.s16 q0, q12 @Abs val of row 1 blk 1 + + vabs.s16 q1, q13 @Abs val of row 2 blk 1 + + vmov.s32 q5, q4 @copy round fact for row 2 + + vmov.s32 q6, q4 @copy round fact for row 2 + vclt.s16 q2, q12, #0 @Get the sign of row 1 blk 1 + + vmov.s32 q7, q4 @copy round fact for row 2 + vclt.s16 q3, q13, #0 @Get the sign of row 2 blk 1 + + vmlal.s16 q4, d0, d28 @Multiply and add row 1 + vmlal.s16 q5, d1, d29 @Multiply and add row 2 + vmlal.s16 q6, d2, d30 @Multiply and add row 3 + vmlal.s16 q7, d3, d31 @Multiply and add row 4 + + vshl.s32 q11, q4, q10 @Shift row 1 + vshl.s32 q12, q5, q10 @Shift row 2 + vshl.s32 q13, q6, q10 @Shift row 3 + vshl.s32 q14, q7, q10 @Shift row 4 + + vmovn.s32 d30, q11 @Narrow row 1 + vmovn.s32 d31, q12 @Narrow row 2 + vmovn.s32 d0 , q13 @Narrow row 3 + vmovn.s32 d1 , q14 @Narrow row 4 + + vneg.s16 q1, q15 @Get negative + vneg.s16 q4, q0 @Get negative + + vceq.s16 q5, q15, #0 @I compare with zero row 1 and 2 blk 1 + vceq.s16 q6, q0 , #0 @I compare with zero row 1 and 2 blk 1 + + vbsl.s16 q2, q1, q15 @Restore 
sign of row 1 and 2 + vbsl.s16 q3, q4, q0 @Restore sign of row 3 and 4 + + + vmovn.u16 d14, q5 @I Narrow the comparison for row 1 and 2 blk 1 + vmovn.u16 d15, q6 @I Narrow the comparison for row 1 and 2 blk 2 + + vshr.u8 q8, q7, #7 @I Reduce comaparison bit to a signle bit row 1 and 2 blk 1 and 2 [ keep the value for later use ] + + vpadd.u8 d18, d16, d17 @I pair add nnz 1 + vpadd.u8 d20, d18, d19 @I Pair add nnz 2 + vpadd.u8 d22, d20, d21 @I Pair add nnz 3 + vpadd.u8 d24, d22, d23 @I Pair add nnz4 + vst1.s16 {q2-q3}, [r2] @Store blk + + vmov.u8 d25, #16 @I Get max nnz + vsub.u8 d26, d25, d24 @I invert current nnz + + vst1.u8 d26[0], [r9] @I Write nnz + + vpop {d8-d15} + pop {r4-r12, pc} + + + +@***************************************************************************** +@* +@* Function Name : ih264_resi_trans_quant_chroma_4x4_a9 +@* Description : This function does residue calculation, forward transform +@* and quantization for 4x4 chroma block. +@* +@* Arguments : R0 :pointer to src buffer +@ R1 :pointer to pred buffer +@ R2 :pointer to dst buffer +@ R3 :source stride +@ STACK : pred stride, +@ dst stride, +@ pointer to scaling matrix, +@ pointer to threshold matrix, +@ qbits, +@ rounding factor, +@ pointer to store nnz +@ pointer to store unquantized dc values +@ Values Returned : NONE +@ +@ Register Usage : +@ Stack Usage : 40 bytes +@ Cycles : Around +@ Interruptiaility : Interruptable +@ +@ Known Limitations +@ \Assumptions : +@ +@ Revision History : +@ DD MM YYYY Author(s) Changes +@ 11 2 2015 100664 First version +@ +@***************************************************************************** + + .global ih264_resi_trans_quant_chroma_4x4_a9 +ih264_resi_trans_quant_chroma_4x4_a9: + + @R0 :pointer to src buffer + @R1 :pointer to pred buffer + @R2 :pointer to dst buffer + @R3 :Source stride + @STACk :pred stride + @ :scale matirx, + @ :threshold matrix + @ :qbits + @ :round factor + @ :nnz + @ :pu1_dc_alt_addr + push {r4-r12, lr} @push all the variables 
first + + add r11, sp, #40 @decrement stack pointer,to accomodate two variables + ldmfd r11, {r4-r10} @load the strides into registers + + @R0 :pointer to src buffer + @R1 :pointer to pred buffer + @R2 :pointer to dst buffer + @R3 :Source stride + @R4 :Pred stride + @R5 :scale matirx, + @R6 :threshold matrix + @R7 :qbits + @R8 :round factor + @R9 :nnz + vpush {d8-d15} + mov r11, #0 + sub r7, r11, r7 @Negate the qbit value for usiing LSL + + @------------Fucntion Loading done----------------; + + vld2.u8 {d10, d11}, [r0], r3 @load first 8 pix src row 1 + + vld2.u8 {d11, d12}, [r1], r4 @load first 8 pix pred row 1 + + vld2.u8 {d28, d29}, [r0], r3 @load first 8 pix src row 2 + + vld2.u8 {d29, d30}, [r1], r4 @load first 8 pix pred row 2 + + vld2.u8 {d25, d26}, [r0], r3 @load first 8 pix src row 3 + + vld2.u8 {d26, d27}, [r1], r4 @load first 8 pix pred row 3 + vsubl.u8 q0, d10, d11 @find residue row 1 + + vld2.u8 {d22, d23}, [r0], r3 @load first 8 pix src row 4 + + vld2.u8 {d23, d24}, [r1], r4 @load first 8 pix pred row 4 + vsubl.u8 q1, d28, d29 @find residue row 2 + + vsubl.u8 q2, d25, d26 @find residue row 3 + vsubl.u8 q3, d22, d23 @find residue row 4 + + vtrn.16 d0, d2 @T12 + vtrn.16 d4, d6 @T23 + vtrn.32 d0, d4 @T13 + vtrn.32 d2, d6 @T14 + + vadd.s16 d8 , d0, d6 @x0 = x4+x7 + vadd.s16 d9 , d2, d4 @x1 = x5+x6 + vsub.s16 d10, d2, d4 @x2 = x5-x6 + vsub.s16 d11, d0, d6 @x3 = x4-x7 + + vshl.s16 d12, d10, #1 @U_SHIFT(x2,1,shft) + vshl.s16 d13, d11, #1 @U_SHIFT(x3,1,shft) + + vadd.s16 d14, d8, d9 @x4 = x0 + x1; + vsub.s16 d16, d8, d9 @x6 = x0 - x1; + vadd.s16 d15, d13, d10 @x5 = U_SHIFT(x3,1,shft) + x2; + vsub.s16 d17, d11, d12 @x7 = x3 - U_SHIFT(x2,1,shft); + + @taking transpose again so as to make do vert transform + vtrn.16 d14, d15 @T12 + vtrn.16 d16, d17 @T23 + vtrn.32 d14, d16 @T13 + vtrn.32 d15, d17 @T24 + + @let us do vertical transform + @same code as horiz + vadd.s16 d18, d14, d17 @x0 = x4+x7 + vadd.s16 d19, d15, d16 @x1 = x5+x6 + vsub.s16 d20, d15, d16 @x2 = 
x5-x6 + vsub.s16 d21, d14, d17 @x3 = x4-x7 + + vshl.s16 d22, d20, #1 @U_SHIFT(x2,1,shft) + vshl.s16 d23, d21, #1 @U_SHIFT(x3,1,shft) + + vdup.s32 q4, r8 @Load rounding value row 1 + + vadd.s16 d24, d18, d19 @x5 = x0 + x1; + vsub.s16 d26, d18, d19 @x7 = x0 - x1; + vadd.s16 d25, d23, d20 @x6 = U_SHIFT(x3,1,shft) + x2; + vsub.s16 d27, d21, d22 @x8 = x3 - U_SHIFT(x2,1,shft); + vdup.s32 q10, r7 @Load qbit values + + vst1.s16 d24[0], [r10] @Store Unquantized dc value to dc alte address + +@core tranform is done for 4x8 block 1 + vld1.s16 {q14-q15}, [r5] @load the scaling values + + vabs.s16 q0, q12 @Abs val of row 1 blk 1 + + vabs.s16 q1, q13 @Abs val of row 2 blk 1 + + vmov.s32 q5, q4 @copy round fact for row 2 + + vmov.s32 q6, q4 @copy round fact for row 2 + vclt.s16 q2, q12, #0 @Get the sign of row 1 blk 1 + + vmov.s32 q7, q4 @copy round fact for row 2 + vclt.s16 q3, q13, #0 @Get the sign of row 2 blk 1 + + vmlal.s16 q4, d0, d28 @Multiply and add row 1 + vmlal.s16 q5, d1, d29 @Multiply and add row 2 + vmlal.s16 q6, d2, d30 @Multiply and add row 3 + vmlal.s16 q7, d3, d31 @Multiply and add row 4 + + vshl.s32 q11, q4, q10 @Shift row 1 + vshl.s32 q12, q5, q10 @Shift row 2 + vshl.s32 q13, q6, q10 @Shift row 3 + vshl.s32 q14, q7, q10 @Shift row 4 + + vmovn.s32 d30, q11 @Narrow row 1 + vmovn.s32 d31, q12 @Narrow row 2 + vmovn.s32 d0 , q13 @Narrow row 3 + vmovn.s32 d1 , q14 @Narrow row 4 + + vneg.s16 q1, q15 @Get negative + vneg.s16 q4, q0 @Get negative + + vceq.s16 q5, q15, #0 @I compare with zero row 1 and 2 blk 1 + vceq.s16 q6, q0 , #0 @I compare with zero row 1 and 2 blk 1 + + vbsl.s16 q2, q1, q15 @Restore sign of row 1 and 2 + vbsl.s16 q3, q4, q0 @Restore sign of row 3 and 4 + + vmovn.u16 d14, q5 @I Narrow the comparison for row 1 and 2 blk 1 + vmovn.u16 d15, q6 @I Narrow the comparison for row 1 and 2 blk 2 + + vshr.u8 q8, q7, #7 @I Reduce comaparison bit to a signle bit row 1 and 2 blk 1 and 2 [ keep the value for later use ] + + vpadd.u8 d18, d16, d17 @I pair add nnz 
1 + vpadd.u8 d20, d18, d19 @I Pair add nnz 2 + vpadd.u8 d22, d20, d21 @I Pair add nnz 3 + vpadd.u8 d24, d22, d23 @I Pair add nnz4 + vst1.s16 {q2-q3}, [r2] @Store blk + + vmov.u8 d25, #16 @I Get max nnz + vsub.u8 d26, d25, d24 @I invert current nnz + + vst1.u8 d26[0], [r9] @I Write nnz + + vpop {d8-d15} + pop {r4-r12, pc} + + + +@***************************************************************************** +@* +@* Function Name : ih264_hadamard_quant_4x4_a9 +@* Description : This function does forward hadamard transform and +@* quantization for luma dc block +@* +@* Arguments : R0 :pointer to src buffer +@ R1 :pointer to dst buffer +@ R2 :pu2_scale_matrix +@ R2 :pu2_threshold_matrix +@ STACk : u4_qbits +@ u4_round_factor +@ pu1_nnz +@ Values Returned : NONE +@ +@ Register Usage : +@ Stack Usage : 0 bytes +@ Cycles : Around +@ Interruptiaility : Interruptable +@ +@ Known Limitations +@ \Assumptions : +@ +@ Revision History : +@ DD MM YYYY Author(s) Changes +@ 20 2 2015 100633 First version +@ +@***************************************************************************** +@ih264_hadamard_quant_4x4_a9(WORD16 *pi2_src, WORD16 *pi2_dst, +@ const UWORD16 *pu2_scale_matrix, +@ const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits, +@ UWORD32 u4_round_factor,UWORD8 *pu1_nnz +@ ) + .global ih264_hadamard_quant_4x4_a9 +ih264_hadamard_quant_4x4_a9: + +@Registert usage +@ r0 : src +@ r1 : dst +@ r2 : *pu2_scale_matrix +@ r3 : *pu2_threshold_matrix + + vld4.s16 {d0, d1, d2, d3}, [r0]! 
@Load 4x4 block + vpush {d8-d15} + + vld1.u16 d30[0], [r2] @load pu2_scale_matrix[0] + + vaddl.s16 q3, d0, d3 @x0 = x4 + x7; + vaddl.s16 q4, d1, d2 @x1 = x5 + x6; + vsubl.s16 q5, d1, d2 @x2 = x5 - x6; + vsubl.s16 q6, d0, d3 @x3 = x4 - x7; + + vdup.u16 d30, d30[0] @pu2_scale_matrix[0] + + vadd.s32 q7, q3, q4 @pi2_dst[0] = x0 + x1; + vadd.s32 q8, q6, q5 @pi2_dst[1] = x3 + x2; + add r3, sp, #68 @Get address of u4_round_factor + vsub.s32 q9, q3, q4 @pi2_dst[2] = x0 - x1; + vsub.s32 q10, q6, q5 @pi2_dst[3] = x3 - x2; + + vtrn.s32 q7, q8 @transpose 4x4 block + vtrn.s32 q9, q10 + vld1.s32 d0[0], [r3] @load u4_round_factor + vswp d15, d18 + vswp d17, d20 + + add r3, sp, #64 @Get address of u4_qbits + vadd.s32 q11, q7, q10 @x0 = x4 + x7; + vadd.s32 q12, q8, q9 @x1 = x5 + x6; + vld1.s32 d31[0], [r3] @load u4_qbits + vsub.s32 q13, q8, q9 @x2 = x5 - x6; + vsub.s32 q14, q7, q10 @x3 = x4 - x7; + + vdup.s32 q7, d0[0] @u4_round_factor + + vadd.s32 q0, q11, q12 @(x0 + x1) + vadd.s32 q1, q14, q13 @(x3 + x2) + vsub.s32 q2, q11, q12 @(x0 - x1) + vsub.s32 q3, q14, q13 @(x3 - x2) + + vdup.s32 q11, d31[0] @u4_round_factor + + vshrn.s32 d0, q0, #1 @i4_value = (x0 + x1) >> 1; + vshrn.s32 d1, q1, #1 @i4_value = (x3 + x2) >> 1; + vshrn.s32 d2, q2, #1 @i4_value = (x0 - x1) >> 1; + vshrn.s32 d3, q3, #1 @i4_value = (x3 - x2) >> 1; + + vabs.s16 q5, q0 + vabs.s16 q6, q1 + + vmov.s32 q8, q7 @Get the round fact + vmov.s32 q9, q7 + vmov.s32 q10, q7 + + vclt.s16 q3, q0, #0 @get the sign row 1,2 + vclt.s16 q4, q1, #0 + + vneg.s32 q11, q11 @-u4_round_factor + + vmlal.u16 q7, d10, d30 + vmlal.u16 q8, d11, d30 + vmlal.u16 q9, d12, d30 + vmlal.u16 q10, d13, d30 + + vshl.u32 q7, q7, q11 + vshl.u32 q8, q8, q11 + vshl.u32 q9, q9, q11 + vshl.u32 q10, q10, q11 + + vqmovn.u32 d22, q7 + vqmovn.u32 d23, q8 + vqmovn.u32 d24, q9 + vqmovn.u32 d25, q10 + + vneg.s16 q13, q11 + vneg.s16 q14, q12 + + vbsl.s16 q3, q13, q11 + vbsl.s16 q4, q14, q12 + + vceq.s16 q5, q11, #0 + vceq.s16 q6, q12, #0 + + vst1.s16 {q3}, [r1]! 
+
+ @NOTE(review): lines up to the first "bx lr" are the tail of the preceding
+ @quant routine (its start is outside this view): nnz is reduced and stored.
+ vshrn.u16 d14, q5, #8
+ vshrn.u16 d15, q6, #8
+
+ ldr r3, [sp, #72] @Load *pu1_nnz
+
+ vshr.u8 q7, q7, #7
+
+ vst1.s16 {q4}, [r1]!
+
+ vadd.u8 d16, d14, d15
+ vmov.u8 d20, #16
+ vpadd.u8 d17, d16, d16
+ vpadd.u8 d18, d17, d17
+ vpadd.u8 d19, d18, d18
+ vsub.u8 d20, d20, d19 @16 - number of zero coeffs = nnz
+ vst1.u8 d20[0], [r3]
+
+ vpop {d8-d15}
+ bx lr
+
+
+
+
+@*****************************************************************************
+@*
+@* Function Name : ih264_hadamard_quant_2x2_uv_a9
+@* Description : This function does forward hadamard transform and
+@* quantization for dc block of chroma for both planes
+@*
+@* Arguments : R0 :pointer to src buffer
+@ R1 :pointer to dst buffer
+@ R2 :pu2_scale_matrix
+@ R3 :pu2_threshold_matrix
+@ STACK : u4_qbits
+@ u4_round_factor
+@ pu1_nnz
+@ Values Returned : NONE
+@
+@ Register Usage :
+@ Stack Usage : 0 bytes
+@ Cycles : Around
+@ Interruptibility : Interruptible
+@
+@ Known Limitations
+@ \Assumptions :
+@
+@ Revision History :
+@ DD MM YYYY Author(s) Changes
+@ 20 2 2015 100633 First version
+@
+@*****************************************************************************
+@ ih264_hadamard_quant_2x2_uv_a9(WORD16 *pi2_src, WORD16 *pi2_dst,
+@ const UWORD16 *pu2_scale_matrix,
+@ const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
+@ UWORD32 u4_round_factor,UWORD8 *pu1_nnz
+@ )
+
+ .global ih264_hadamard_quant_2x2_uv_a9
+ih264_hadamard_quant_2x2_uv_a9:
+
+ vpush {d8-d15}
+ vld2.s16 {d0-d1}, [r0] @load src (deinterleaved: d0 = U dc, d1 = V dc)
+
+ add r3, sp, #68 @Get address of u4_round_factor
+
+ vaddl.s16 q3, d0, d1 @x0 = x4 + x5;, x2 = x6 + x7;
+ vld1.u16 d30[0], [r2] @load pu2_scale_matrix[0]
+ vsubl.s16 q4, d0, d1 @x1 = x4 - x5; x3 = x6 - x7;
+
+ add r0, sp, #64 @Get address of u4_qbits
+ vld1.s32 d28[0], [r3] @load u4_round_factor
+ vtrn.s32 q3, q4 @q1 -> x0 x1, q2 -> x2 x3
+
+ vadd.s32 q0, q3, q4 @ (x0 + x2) (x1 + x3) (y0 + y2); (y1 + y3);
+ vld1.s32 d24[0], [r0] @load u4_qbits
+ vsub.s32 q1, q3, q4 @ (x0 - x2) (x1 - x3) (y0 - y2); (y1 - y3);
+
+ vdup.u16 d30, d30[0] @pu2_scale_matrix
+
+ vabs.s32 q2, q0
+ vabs.s32 q3, q1
+
+ vdup.s32 q14, d28[0] @u4_round_factor
+
+ vmovl.u16 q15, d30 @pu2_scale_matrix
+
+ vclt.s32 q4, q0, #0 @get the sign row 1,2
+ vdup.s32 q12, d24[0] @u4_qbits (duplicated from the value loaded above)
+ vclt.s32 q5, q1, #0
+
+ vqmovn.u32 d8, q4
+ vqmovn.s32 d9, q5
+
+ vmov.s32 q13, q14 @Get the round factor
+ vneg.s32 q12, q12 @-u4_qbits, so vshl below performs a right shift
+
+ vmla.u32 q13, q2, q15
+ vmla.u32 q14, q3, q15
+
+ vshl.u32 q13, q13, q12 @>>qbit
+ vshl.u32 q14, q14, q12 @>>qbit
+
+ vqmovn.u32 d10, q13
+ vqmovn.u32 d11, q14
+
+ vneg.s16 q6, q5
+
+ vbsl.s16 q4, q6, q5 @*sign
+
+ vtrn.s32 d8, d9
+
+ vceq.s16 q7, q4, #0 @Compute nnz
+
+ vshrn.u16 d14, q7, #8 @reduce nnz comparison to 1 bit
+
+ ldr r3, [sp, #72] @Load *pu1_nnz
+ vshr.u8 d14, d14, #7 @reduce nnz comparison to 1 bit
+ vmov.u8 d20, #4 @Since we add zeros, we need to subtract from 4 to get nnz
+ vpadd.u8 d17, d14, d14 @Sum up nnz
+
+ vst1.s16 {q4}, [r1]! @Store the block
+
+ vpadd.u8 d17, d17, d17 @Sum up nnz
+ vsub.u8 d20, d20, d17 @4- numzeros
+ vst1.u16 d20[0], [r3] @store nnz (2 bytes: one count per chroma plane)
+
+ vpop {d8-d15}
+ bx lr
+
+
+
+
+
diff --git a/common/arm/ih264_weighted_bi_pred_a9q.s b/common/arm/ih264_weighted_bi_pred_a9q.s
new file mode 100755
index 0000000..ccae779
--- /dev/null
+++ b/common/arm/ih264_weighted_bi_pred_a9q.s
@@ -0,0 +1,642 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_weighted_bi_pred_a9q.s
+@*
+@* @brief
+@* Contains function definitions for weighted biprediction.
+@*
+@* @author
+@* Kaushik Senthoor R
+@*
+@* @par List of Functions:
+@*
+@* - ih264_weighted_bi_pred_luma_a9q()
+@* - ih264_weighted_bi_pred_chroma_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@*******************************************************************************
+@* @function
+@* ih264_weighted_bi_pred_luma_a9q()
+@*
+@* @brief
+@* This routine performs the weighted biprediction as described in sec
+@* 8.4.2.3.2 titled "Weighted sample prediction process" for luma.
+@*
+@* @par Description:
+@* This function gets two ht x wd blocks, calculates the weighted samples,
+@* rounds off, adds offset and stores it in the destination block.
+@*
+@* @param[in] pu1_src1
+@* UWORD8 Pointer to the buffer containing the input block 1.
+@*
+@* @param[in] pu1_src2
+@* UWORD8 Pointer to the buffer containing the input block 2.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd1
+@* Stride of the input buffer 1
+@*
+@* @param[in] src_strd2
+@* Stride of the input buffer 2
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] log_wd
+@* number of bits to be rounded off
+@*
+@* @param[in] wt1
+@* weight for the weighted prediction
+@*
+@* @param[in] wt2
+@* weight for the weighted prediction
+@*
+@* @param[in] ofst1
+@* offset 1 used after rounding off
+@*
+@* @param[in] ofst2
+@* offset 2 used after rounding off
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
+@*
+@*******************************************************************************
+@*/
+@void ih264_weighted_bi_pred_luma_a9q(UWORD8 *pu1_src1,
+@ UWORD8 *pu1_src2,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd1,
+@ WORD32 src_strd2,
+@ WORD32 dst_strd,
+@ WORD32 log_wd,
+@ WORD32 wt1,
+@ WORD32 wt2,
+@ WORD32 ofst1,
+@ WORD32 ofst2,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src1
+@ r1 => pu1_src2
+@ r2 => pu1_dst
+@ r3 => src_strd1
+@ [sp] => src_strd2 (r4)
+@ [sp+4] => dst_strd (r5)
+@ [sp+8] => log_wd (r6)
+@ [sp+12] => wt1 (r7)
+@ [sp+16] => wt2 (r8)
+@ [sp+20] => ofst1 (r9)
+@ [sp+24] => ofst2 (r10)
+@ [sp+28] => ht (r11)
+@ [sp+32] => wd (r12)
+@ (offsets are relative to sp at entry; after the stmfd of 10 registers
+@ below, the loads use sp+40 .. sp+72)
+@
+.text
+.p2align 2
+
+ .global ih264_weighted_bi_pred_luma_a9q
+
+ih264_weighted_bi_pred_luma_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ ldr r6, [sp, #48] @Load log_wd in r6
+ ldr r7, [sp, #52] @Load wt1 in r7
+ ldr r8, [sp, #56] @Load wt2 in r8
+ ldr r9, [sp, #60] @Load ofst1 in r9
+
+ add r6, r6, #1 @r6 = log_wd + 1
+ sxtb r7, r7 @sign-extend 8-bit wt1 to 32-bit
+ ldr r4, [sp, #40] @Load src_strd2 in r4
+ ldr r5, [sp, #44] @Load dst_strd in r5
+ sxtb r9, r9 @sign-extend 8-bit ofst1 to 32-bit
+ rsb r10, r6, #0 @r10 = -(log_wd + 1)
+ ldr r11, [sp, #68] @Load ht in r11
+ ldr r12, [sp, #72] @Load wd in r12
+ vdup.16 q0, r10 @Q0 = -(log_wd + 1) (16-bit lanes)
+ add r9, r9, #1 @r9 = ofst1 + 1
+
+ ldr r10, [sp, #64] @Load ofst2 in r10
+ sxtb r8, r8 @sign-extend 8-bit wt2 to 32-bit
+ cmp r12, #16 @check if wd is 16
+ vpush {d8-d15}
+ sxtb r10, r10 @sign-extend 8-bit ofst2 to 32-bit
+ add r9, r9, r10 @r9 = ofst1 + ofst2 + 1
+ vmov d2, r7, r8 @D2 = {wt1(32-bit), wt2(32-bit)}
+ asr r9, r9, #1 @r9 = ofst = (ofst1 + ofst2 + 1) >> 1
+ vdup.8 d3, r9 @D3 = ofst (8-bit)
+ beq loop_16 @branch if wd is 16
+
+ cmp r12, #8 @check if wd is 8
+ beq loop_8 @branch if wd is 8
+
+loop_4: @each iteration processes four rows
+
+ vld1.32 d4[0], [r0], r3 @load row 1 in source 1
+ vld1.32 d4[1], [r0], r3 @load row 2 in source 1
+ vld1.32 d6[0], [r1], r4 @load row 1 in source 2
+ vld1.32 d6[1], [r1], r4 @load row 2 in source 2
+
+ vmovl.u8 q2, d4 @converting rows 1,2 in source 1 to 16-bit
+ vld1.32 d8[0], [r0], r3 @load row 3 in source 1
+ vld1.32 d8[1], [r0], r3 @load row 4 in source 1
+ vmovl.u8 q3, d6 @converting rows 1,2 in source 2 to 16-bit
+ vld1.32 d10[0], [r1], r4 @load row 3 in source 2
+ vld1.32 d10[1], [r1], r4 @load row 4 in source 2
+
+ vmovl.u8 q4, d8 @converting rows 3,4 in source 1 to 16-bit
+ vmovl.u8 q5, d10 @converting rows 3,4 in source 2 to 16-bit
+
+ vmul.s16 q2, q2, d2[0] @weight 1 mult. for rows 1,2
+ vmla.s16 q2, q3, d2[2] @weight 2 mult. for rows 1,2
+ vmul.s16 q4, q4, d2[0] @weight 1 mult. for rows 3,4
+ vmla.s16 q4, q5, d2[2] @weight 2 mult. for rows 3,4
+
+ subs r11, r11, #4 @decrement ht by 4
+ vrshl.s16 q2, q2, q0 @rounds off the weighted samples from rows 1,2
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from rows 3,4
+
+ vaddw.s8 q2, q2, d3 @adding offset for rows 1,2
+ vaddw.s8 q4, q4, d3 @adding offset for rows 3,4
+
+ vqmovun.s16 d4, q2 @saturating rows 1,2 to unsigned 8-bit
+ vqmovun.s16 d8, q4 @saturating rows 3,4 to unsigned 8-bit
+
+ vst1.32 d4[0], [r2], r5 @store row 1 in destination
+ vst1.32 d4[1], [r2], r5 @store row 2 in destination
+ vst1.32 d8[0], [r2], r5 @store row 3 in destination
+ vst1.32 d8[1], [r2], r5 @store row 4 in destination
+
+ bgt loop_4 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_8: @each iteration processes four rows
+
+ vld1.8 d4, [r0], r3 @load row 1 in source 1
+ vld1.8 d6, [r1], r4 @load row 1 in source 2
+ vld1.8 d8, [r0], r3 @load row 2 in source 1
+ vld1.8 d10, [r1], r4 @load row 2 in source 2
+ vmovl.u8 q2, d4 @converting row 1 in source 1 to 16-bit
+ vld1.8 d12, [r0], r3 @load row 3 in source 1
+ vld1.8 d14, [r1], r4 @load row 3 in source 2
+ vmovl.u8 q3, d6 @converting row 1 in source 2 to 16-bit
+ vld1.8 d16, [r0], r3 @load row 4 in source 1
+ vld1.8 d18, [r1], r4 @load row 4 in source 2
+
+ vmovl.u8 q4, d8 @converting row 2 in source 1 to 16-bit
+ vmovl.u8 q5, d10 @converting row 2 in source 2 to 16-bit
+
+ vmul.s16 q2, q2, d2[0] @weight 1 mult. for row 1
+ vmla.s16 q2, q3, d2[2] @weight 2 mult. for row 1
+ vmovl.u8 q6, d12 @converting row 3 in source 1 to 16-bit
+ vmovl.u8 q7, d14 @converting row 3 in source 2 to 16-bit
+ vmul.s16 q4, q4, d2[0] @weight 1 mult. for row 2
+ vmla.s16 q4, q5, d2[2] @weight 2 mult. for row 2
+ vmovl.u8 q8, d16 @converting row 4 in source 1 to 16-bit
+ vmovl.u8 q9, d18 @converting row 4 in source 2 to 16-bit
+
+ vmul.s16 q6, q6, d2[0] @weight 1 mult. for row 3
+ vmla.s16 q6, q7, d2[2] @weight 2 mult. for row 3
+ vmul.s16 q8, q8, d2[0] @weight 1 mult. for row 4
+ vmla.s16 q8, q9, d2[2] @weight 2 mult. for row 4
+
+ vrshl.s16 q2, q2, q0 @rounds off the weighted samples from row 1
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 2
+ vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 3
+ vaddw.s8 q2, q2, d3 @adding offset for row 1
+ vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 4
+ vaddw.s8 q4, q4, d3 @adding offset for row 2
+
+ vaddw.s8 q6, q6, d3 @adding offset for row 3
+ vqmovun.s16 d4, q2 @saturating row 1 to unsigned 8-bit
+ vaddw.s8 q8, q8, d3 @adding offset for row 4
+ vqmovun.s16 d8, q4 @saturating row 2 to unsigned 8-bit
+
+ vqmovun.s16 d12, q6 @saturating row 3 to unsigned 8-bit
+ vqmovun.s16 d16, q8 @saturating row 4 to unsigned 8-bit
+
+ vst1.8 d4, [r2], r5 @store row 1 in destination
+ vst1.8 d8, [r2], r5 @store row 2 in destination
+ subs r11, r11, #4 @decrement ht by 4
+ vst1.8 d12, [r2], r5 @store row 3 in destination
+ vst1.8 d16, [r2], r5 @store row 4 in destination
+
+ bgt loop_8 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_16: @each iteration processes four rows
+
+ vld1.8 {q2}, [r0], r3 @load row 1 in source 1
+ vld1.8 {q3}, [r1], r4 @load row 1 in source 2
+ vld1.8 {q4}, [r0], r3 @load row 2 in source 1
+ vld1.8 {q5}, [r1], r4 @load row 2 in source 2
+ vmovl.u8 q10, d4 @converting row 1L in source 1 to 16-bit
+ vld1.8 {q6}, [r0], r3 @load row 3 in source 1
+ vld1.8 {q7}, [r1], r4 @load row 3 in source 2
+ vmovl.u8 q11, d6 @converting row 1L in source 2 to 16-bit
+ vld1.8 {q8}, [r0], r3 @load row 4 in source 1
+ vld1.8 {q9}, [r1], r4 @load row 4 in source 2
+
+ vmovl.u8 q2, d5 @converting row 1H in source 1 to 16-bit
+ vmovl.u8 q3, d7 @converting row 1H in source 2 to 16-bit
+
+ vmul.s16 q10, q10, d2[0] @weight 1 mult. for row 1L
+ vmla.s16 q10, q11, d2[2] @weight 2 mult. for row 1L
+ vmovl.u8 q12, d8 @converting row 2L in source 1 to 16-bit
+ vmovl.u8 q13, d10 @converting row 2L in source 2 to 16-bit
+
+ vmul.s16 q2, q2, d2[0] @weight 1 mult. for row 1H
+ vmla.s16 q2, q3, d2[2] @weight 2 mult. for row 1H
+ vmovl.u8 q4, d9 @converting row 2H in source 1 to 16-bit
+ vmovl.u8 q5, d11 @converting row 2H in source 2 to 16-bit
+
+ vmul.s16 q12, q12, d2[0] @weight 1 mult. for row 2L
+ vmla.s16 q12, q13, d2[2] @weight 2 mult. for row 2L
+ vmovl.u8 q14, d12 @converting row 3L in source 1 to 16-bit
+ vmovl.u8 q15, d14 @converting row 3L in source 2 to 16-bit
+
+ vmul.s16 q4, q4, d2[0] @weight 1 mult. for row 2H
+ vmla.s16 q4, q5, d2[2] @weight 2 mult. for row 2H
+ vmovl.u8 q6, d13 @converting row 3H in source 1 to 16-bit
+ vmovl.u8 q7, d15 @converting row 3H in source 2 to 16-bit
+
+ vmul.s16 q14, q14, d2[0] @weight 1 mult. for row 3L
+ vmla.s16 q14, q15, d2[2] @weight 2 mult. for row 3L
+ vmovl.u8 q11, d16 @converting row 4L in source 1 to 16-bit
+ vmovl.u8 q3, d18 @converting row 4L in source 2 to 16-bit
+
+ vmul.s16 q6, q6, d2[0] @weight 1 mult. for row 3H
+ vmla.s16 q6, q7, d2[2] @weight 2 mult. for row 3H
+ vmovl.u8 q8, d17 @converting row 4H in source 1 to 16-bit
+ vmovl.u8 q9, d19 @converting row 4H in source 2 to 16-bit
+
+ vmul.s16 q11, q11, d2[0] @weight 1 mult. for row 4L
+ vmla.s16 q11, q3, d2[2] @weight 2 mult. for row 4L
+ vrshl.s16 q10, q10, q0 @rounds off the weighted samples from row 1L
+
+ vmul.s16 q8, q8, d2[0] @weight 1 mult. for row 4H
+ vmla.s16 q8, q9, d2[2] @weight 2 mult. for row 4H
+ vrshl.s16 q2, q2, q0 @rounds off the weighted samples from row 1H
+
+ vrshl.s16 q12, q12, q0 @rounds off the weighted samples from row 2L
+ vaddw.s8 q10, q10, d3 @adding offset for row 1L
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 2H
+ vaddw.s8 q2, q2, d3 @adding offset for row 1H
+ vrshl.s16 q14, q14, q0 @rounds off the weighted samples from row 3L
+ vaddw.s8 q12, q12, d3 @adding offset for row 2L
+ vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 3H
+ vaddw.s8 q4, q4, d3 @adding offset for row 2H
+ vrshl.s16 q11, q11, q0 @rounds off the weighted samples from row 4L
+ vaddw.s8 q14, q14, d3 @adding offset for row 3L
+ vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 4H
+ vaddw.s8 q6, q6, d3 @adding offset for row 3H
+
+ vqmovun.s16 d26, q10 @saturating row 1L to unsigned 8-bit
+ vaddw.s8 q11, q11, d3 @adding offset for row 4L
+ vqmovun.s16 d27, q2 @saturating row 1H to unsigned 8-bit
+ vaddw.s8 q8, q8, d3 @adding offset for row 4H
+
+ vqmovun.s16 d10, q12 @saturating row 2L to unsigned 8-bit
+ vqmovun.s16 d11, q4 @saturating row 2H to unsigned 8-bit
+ vqmovun.s16 d30, q14 @saturating row 3L to unsigned 8-bit
+ vqmovun.s16 d31, q6 @saturating row 3H to unsigned 8-bit
+ vst1.8 {q13}, [r2], r5 @store row 1 in destination
+ vqmovun.s16 d14, q11 @saturating row 4L to unsigned 8-bit
+ vqmovun.s16 d15, q8 @saturating row 4H to unsigned 8-bit
+
+ vst1.8 {q5}, [r2], r5 @store row 2 in destination
+ subs r11, r11, #4 @decrement ht by 4
+ vst1.8 {q15}, [r2], r5 @store row 3 in destination
+ vst1.8 {q7}, [r2], r5 @store row 4 in destination
+
+ bgt loop_16 @if greater than 0 repeat the loop again
+
+end_loops:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from sp (pop into pc returns)
+
+
+@*******************************************************************************
+@* @function
+@* ih264_weighted_bi_pred_chroma_a9q()
+@*
+@* @brief
+@* This routine performs the default weighted prediction as described in sec
+@* 8.4.2.3.2 titled "Weighted sample prediction process" for chroma.
+@*
+@* @par Description:
+@* This function gets two ht x wd blocks, calculates the weighted samples,
+@* rounds off, adds offset and stores it in the destination block for U and V.
+@*
+@* @param[in] pu1_src1
+@* UWORD8 Pointer to the buffer containing the input block 1.
+@*
+@* @param[in] pu1_src2
+@* UWORD8 Pointer to the buffer containing the input block 2.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd1
+@* Stride of the input buffer 1
+@*
+@* @param[in] src_strd2
+@* Stride of the input buffer 2
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] log_wd
+@* number of bits to be rounded off
+@*
+@* @param[in] wt1
+@* weights for the weighted prediction in U and V
+@*
+@* @param[in] wt2
+@* weights for the weighted prediction in U and V
+@*
+@* @param[in] ofst1
+@* offset 1 used after rounding off for U and V
+@*
+@* @param[in] ofst2
+@* offset 2 used after rounding off for U and V
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
+@*
+@*******************************************************************************
+@*/
+@void ih264_weighted_bi_pred_chroma_a9q(UWORD8 *pu1_src1,
+@ UWORD8 *pu1_src2,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd1,
+@ WORD32 src_strd2,
+@ WORD32 dst_strd,
+@ WORD32 log_wd,
+@ WORD32 wt1,
+@ WORD32 wt2,
+@ WORD32 ofst1,
+@ WORD32 ofst2,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src1
+@ r1 => pu1_src2
+@ r2 => pu1_dst
+@ r3 => src_strd1
+@ [sp] => src_strd2 (r4)
+@ [sp+4] => dst_strd (r5)
+@ [sp+8] => log_wd (r6)
+@ [sp+12] => wt1 (r7)
+@ [sp+16] => wt2 (r8)
+@ [sp+20] => ofst1 (r9)
+@ [sp+24] => ofst2 (r10)
+@ [sp+28] => ht (r11)
+@ [sp+32] => wd (r12)
+@ (offsets are relative to sp at entry; after the stmfd of 10 registers
+@ below, the loads use sp+40 .. sp+72)
+@
+
+
+ .global ih264_weighted_bi_pred_chroma_a9q
+
+ih264_weighted_bi_pred_chroma_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r6, [sp, #48] @Load log_wd in r6
+ ldr r7, [sp, #52] @Load wt1 in r7
+ ldr r8, [sp, #56] @Load wt2 in r8
+ add r6, r6, #1 @r6 = log_wd + 1
+ ldr r9, [sp, #60] @Load ofst1 in r9
+ ldr r10, [sp, #64] @Load ofst2 in r10
+
+ rsb r12, r6, #0 @r12 = -(log_wd + 1)
+ ldr r4, [sp, #40] @Load src_strd2 in r4
+ ldr r5, [sp, #44] @Load dst_strd in r5
+ vdup.16 q0, r12 @Q0 = -(log_wd + 1) (16-bit)
+
+ ldr r11, [sp, #68] @Load ht in r11
+ vdup.32 q1, r7 @Q1 = (wt1_u, wt1_v) (32-bit)
+ ldr r12, [sp, #72] @Load wd in r12
+ vdup.32 q2, r8 @Q2 = (wt2_u, wt2_v) (32-bit)
+ asr r7, r9, #8 @r7 = ofst1_v
+ asr r8, r10, #8 @r8 = ofst2_v
+ vpush {d8-d15}
+ sxtb r9, r9 @sign-extend 8-bit ofst1_u to 32-bit
+ sxtb r10, r10 @sign-extend 8-bit ofst2_u to 32-bit
+ sxtb r7, r7 @sign-extend 8-bit ofst1_v to 32-bit
+ sxtb r8, r8 @sign-extend 8-bit ofst2_v to 32-bit
+
+ add r9, r9, #1 @r9 = ofst1_u + 1
+ add r7, r7, #1 @r7 = ofst1_v + 1
+ add r9, r9, r10 @r9 = ofst1_u + ofst2_u + 1
+ add r7, r7, r8 @r7 = ofst1_v + ofst2_v + 1
+ asr r9, r9, #1 @r9 = ofst_u = (ofst1_u + ofst2_u + 1) >> 1
+ asr r7, r7, #1 @r7 = ofst_v = (ofst1_v + ofst2_v + 1) >> 1
+ cmp r12, #8 @check if wd is 8
+ pkhbt r9, r9, r7, lsl #16 @r9 = {ofst_u(16-bit), ofst_v(16-bit)}
+ vdup.32 q3, r9 @Q3 = {ofst_u(16-bit), ofst_v(16-bit)}
+ beq loop_8_uv @branch if wd is 8
+
+ cmp r12, #4 @check if wd is 4
+ beq loop_4_uv @branch if wd is 4
+
+loop_2_uv: @each iteration processes two rows
+
+ vld1.32 d8[0], [r0], r3 @load row 1 in source 1
+ vld1.32 d8[1], [r0], r3 @load row 2 in source 1
+ vld1.32 d10[0], [r1], r4 @load row 1 in source 2
+ vld1.32 d10[1], [r1], r4 @load row 2 in source 2
+
+ vmovl.u8 q4, d8 @converting rows 1,2 in source 1 to 16-bit
+ vmovl.u8 q5, d10 @converting rows 1,2 in source 2 to 16-bit
+
+ vmul.s16 q4, q4, q1 @weight 1 mult. for rows 1,2
+ vmla.s16 q4, q5, q2 @weight 2 mult. for rows 1,2
+
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from rows 1,2
+
+ vadd.s16 q4, q4, q3 @adding offset for rows 1,2
+
+ vqmovun.s16 d8, q4 @saturating rows 1,2 to unsigned 8-bit
+
+ vst1.32 d8[0], [r2], r5 @store row 1 in destination
+ vst1.32 d8[1], [r2], r5 @store row 2 in destination
+
+ subs r11, r11, #2 @decrement ht by 2
+ bgt loop_2_uv @if greater than 0 repeat the loop again
+
+ b end_loops_uv
+
+loop_4_uv: @each iteration processes two rows
+
+ vld1.8 d8, [r0], r3 @load row 1 in source 1
+ vld1.8 d10, [r1], r4 @load row 1 in source 2
+ vmovl.u8 q4, d8 @converting row 1 in source 1 to 16-bit
+ vld1.8 d12, [r0], r3 @load row 2 in source 1
+ vmovl.u8 q5, d10 @converting row 1 in source 2 to 16-bit
+ vld1.8 d14, [r1], r4 @load row 2 in source 2
+
+ vmovl.u8 q6, d12 @converting row 2 in source 1 to 16-bit
+ vmul.s16 q4, q4, q1 @weight 1 mult. for row 1
+ vmla.s16 q4, q5, q2 @weight 2 mult. for row 1
+ vmovl.u8 q7, d14 @converting row 2 in source 2 to 16-bit
+
+ vmul.s16 q6, q6, q1 @weight 1 mult. for row 2
+ vmla.s16 q6, q7, q2 @weight 2 mult. for row 2
+
+ subs r11, r11, #2 @decrement ht by 2
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 1
+ vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 2
+ vadd.s16 q4, q4, q3 @adding offset for row 1
+ vadd.s16 q6, q6, q3 @adding offset for row 2
+
+ vqmovun.s16 d8, q4 @saturating row 1 to unsigned 8-bit
+ vqmovun.s16 d12, q6 @saturating row 2 to unsigned 8-bit
+
+ vst1.8 d8, [r2], r5 @store row 1 in destination
+ vst1.8 d12, [r2], r5 @store row 2 in destination
+
+ bgt loop_4_uv @if greater than 0 repeat the loop again
+
+ b end_loops_uv
+
+loop_8_uv: @each iteration processes four rows
+
+ vld1.8 {q4}, [r0], r3 @load row 1 in source 1
+ vld1.8 {q5}, [r1], r4 @load row 1 in source 2
+ vld1.8 {q6}, [r0], r3 @load row 2 in source 1
+ vld1.8 {q7}, [r1], r4 @load row 2 in source 2
+ vmovl.u8 q12, d8 @converting row 1L in source 1 to 16-bit
+ vld1.8 {q8}, [r0], r3 @load row 3 in source 1
+ vld1.8 {q9}, [r1], r4 @load row 3 in source 2
+ vmovl.u8 q13, d10 @converting row 1L in source 2 to 16-bit
+ vld1.8 {q10}, [r0], r3 @load row 4 in source 1
+ vld1.8 {q11}, [r1], r4 @load row 4 in source 2
+
+ vmovl.u8 q4, d9 @converting row 1H in source 1 to 16-bit
+ vmovl.u8 q5, d11 @converting row 1H in source 2 to 16-bit
+
+ vmul.s16 q12, q12, q1 @weight 1 mult. for row 1L
+ vmla.s16 q12, q13, q2 @weight 2 mult. for row 1L
+ vmovl.u8 q14, d12 @converting row 2L in source 1 to 16-bit
+ vmovl.u8 q15, d14 @converting row 2L in source 2 to 16-bit
+
+ vmul.s16 q4, q4, q1 @weight 1 mult. for row 1H
+ vmla.s16 q4, q5, q2 @weight 2 mult. for row 1H
+ vmovl.u8 q6, d13 @converting row 2H in source 1 to 16-bit
+ vmovl.u8 q7, d15 @converting row 2H in source 2 to 16-bit
+
+ vmul.s16 q14, q14, q1 @weight 1 mult. for row 2L
+ vmla.s16 q14, q15, q2 @weight 2 mult. for row 2L
+ vmovl.u8 q13, d16 @converting row 3L in source 1 to 16-bit
+ vmovl.u8 q5, d18 @converting row 3L in source 2 to 16-bit
+
+ vmul.s16 q6, q6, q1 @weight 1 mult. for row 2H
+ vmla.s16 q6, q7, q2 @weight 2 mult. for row 2H
+ vmovl.u8 q8, d17 @converting row 3H in source 1 to 16-bit
+ vmovl.u8 q9, d19 @converting row 3H in source 2 to 16-bit
+
+ vmul.s16 q13, q13, q1 @weight 1 mult. for row 3L
+ vmla.s16 q13, q5, q2 @weight 2 mult. for row 3L
+ vmovl.u8 q15, d20 @converting row 4L in source 1 to 16-bit
+ vmovl.u8 q7, d22 @converting row 4L in source 2 to 16-bit
+
+ vmul.s16 q8, q8, q1 @weight 1 mult. for row 3H
+ vmla.s16 q8, q9, q2 @weight 2 mult. for row 3H
+ vmovl.u8 q10, d21 @converting row 4H in source 1 to 16-bit
+ vmovl.u8 q11, d23 @converting row 4H in source 2 to 16-bit
+
+ vmul.s16 q15, q15, q1 @weight 1 mult. for row 4L
+ vmla.s16 q15, q7, q2 @weight 2 mult. for row 4L
+ vrshl.s16 q12, q12, q0 @rounds off the weighted samples from row 1L
+
+ vmul.s16 q10, q10, q1 @weight 1 mult. for row 4H
+ vmla.s16 q10, q11, q2 @weight 2 mult. for row 4H
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 1H
+
+ vrshl.s16 q14, q14, q0 @rounds off the weighted samples from row 2L
+ vadd.s16 q12, q12, q3 @adding offset for row 1L
+ vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 2H
+ vadd.s16 q4, q4, q3 @adding offset for row 1H
+ vrshl.s16 q13, q13, q0 @rounds off the weighted samples from row 3L
+ vadd.s16 q14, q14, q3 @adding offset for row 2L
+ vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 3H
+ vadd.s16 q6, q6, q3 @adding offset for row 2H
+ vrshl.s16 q15, q15, q0 @rounds off the weighted samples from row 4L
+ vadd.s16 q13, q13, q3 @adding offset for row 3L
+ vrshl.s16 q10, q10, q0 @rounds off the weighted samples from row 4H
+ vadd.s16 q8, q8, q3 @adding offset for row 3H
+
+ vqmovun.s16 d10, q12 @saturating row 1L to unsigned 8-bit
+ vadd.s16 q15, q15, q3 @adding offset for row 4L
+ vqmovun.s16 d11, q4 @saturating row 1H to unsigned 8-bit
+ vadd.s16 q10, q10, q3 @adding offset for row 4H
+
+ vqmovun.s16 d18, q14 @saturating row 2L to unsigned 8-bit
+ vqmovun.s16 d19, q6 @saturating row 2H to unsigned 8-bit
+ vqmovun.s16 d14, q13 @saturating row 3L to unsigned 8-bit
+ vqmovun.s16 d15, q8 @saturating row 3H to unsigned 8-bit
+ vst1.8 {q5}, [r2], r5 @store row 1 in destination
+ vqmovun.s16 d22, q15 @saturating row 4L to unsigned 8-bit
+ vqmovun.s16 d23, q10 @saturating row 4H to unsigned 8-bit
+
+ vst1.8 {q9}, [r2], r5 @store row 2 in destination
+ subs r11, r11, #4 @decrement ht by 4
+ vst1.8 {q7}, [r2], r5 @store row 3 in destination
+ vst1.8 {q11}, [r2], r5 @store row 4 in destination
+
+ bgt loop_8_uv @if greater than 0 repeat the loop again
+
+end_loops_uv:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from sp (pop into pc returns)
+
+
diff --git a/common/arm/ih264_weighted_pred_a9q.s b/common/arm/ih264_weighted_pred_a9q.s
new file mode 100755
index 0000000..1ce94d0
--- /dev/null
+++ b/common/arm/ih264_weighted_pred_a9q.s
@@ -0,0 +1,479 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_weighted_pred_a9q.s
+@*
+@* @brief
+@* Contains function definitions for weighted prediction.
+@*
+@* @author
+@* Kaushik Senthoor R
+@*
+@* @par List of Functions:
+@*
+@* - ih264_weighted_pred_luma_a9q()
+@* - ih264_weighted_pred_chroma_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@*******************************************************************************
+@* @function
+@* ih264_weighted_pred_luma_a9q()
+@*
+@* @brief
+@* This routine performs the default weighted prediction as described in sec
+@* 8.4.2.3.2 titled "Weighted sample prediction process" for luma.
+@*
+@* @par Description:
+@* This function gets a ht x wd block, calculates the weighted sample, rounds
+@* off, adds offset and stores it in the destination block.
+@*
+@* @param[in] pu1_src:
+@* UWORD8 Pointer to the buffer containing the input block.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd
+@* Stride of the input buffer
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] log_wd
+@* number of bits to be rounded off
+@*
+@* @param[in] wt
+@* weight for the weighted prediction
+@*
+@* @param[in] ofst
+@* offset used after rounding off
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
+@*
+@*******************************************************************************
+@*/
+@void ih264_weighted_pred_luma_a9q(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 log_wd,
+@ WORD32 wt,
+@ WORD32 ofst,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src
+@ r1 => pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ [sp] => log_wd (r4)
+@ [sp+4] => wt (r5)
+@ [sp+8] => ofst (r6)
+@ [sp+12] => ht (r7)
+@ [sp+16] => wd (r8)
+@ (offsets are relative to sp at entry; after the stmfd of 7 registers
+@ below, the loads use sp+28 .. sp+44)
+@
+.text
+.p2align 2
+
+ .global ih264_weighted_pred_luma_a9q
+
+ih264_weighted_pred_luma_a9q:
+
+ stmfd sp!, {r4-r9, r14} @stack stores the values of the arguments
+ ldr r5, [sp, #32] @Load wt
+ ldr r4, [sp, #28] @Load log_wd in r4
+ ldr r6, [sp, #36] @Load ofst
+ ldr r7, [sp, #40] @Load ht
+ ldr r8, [sp, #44] @Load wd
+ vpush {d8-d15}
+
+ vdup.16 d2, r5 @D2 = wt (16-bit)
+ rsb r9, r4, #0 @r9 = -log_wd
+ vdup.8 d3, r6 @D3 = ofst (8-bit)
+ cmp r8, #16 @check if wd is 16
+ vdup.16 q0, r9 @Q0 = -log_wd (16-bit)
+ beq loop_16 @branch if wd is 16
+
+ cmp r8, #8 @check if wd is 8
+ beq loop_8 @branch if wd is 8
+
+loop_4: @each iteration processes four rows
+
+ vld1.32 d4[0], [r0], r2 @load row 1 in source
+ vld1.32 d4[1], [r0], r2 @load row 2 in source
+ vld1.32 d6[0], [r0], r2 @load row 3 in source
+ vld1.32 d6[1], [r0], r2 @load row 4 in source
+
+ vmovl.u8 q2, d4 @converting rows 1,2 to 16-bit
+ vmovl.u8 q3, d6 @converting rows 3,4 to 16-bit
+
+ vmul.s16 q2, q2, d2[0] @weight mult. for rows 1,2
+ vmul.s16 q3, q3, d2[0] @weight mult. for rows 3,4
+
+ subs r7, r7, #4 @decrement ht by 4
+ vrshl.s16 q2, q2, q0 @rounds off the weighted samples from rows 1,2
+ vrshl.s16 q3, q3, q0 @rounds off the weighted samples from rows 3,4
+
+ vaddw.s8 q2, q2, d3 @adding offset for rows 1,2
+ vaddw.s8 q3, q3, d3 @adding offset for rows 3,4
+
+ vqmovun.s16 d4, q2 @saturating rows 1,2 to unsigned 8-bit
+ vqmovun.s16 d6, q3 @saturating rows 3,4 to unsigned 8-bit
+
+ vst1.32 d4[0], [r1], r3 @store row 1 in destination
+ vst1.32 d4[1], [r1], r3 @store row 2 in destination
+ vst1.32 d6[0], [r1], r3 @store row 3 in destination
+ vst1.32 d6[1], [r1], r3 @store row 4 in destination
+
+ bgt loop_4 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_8: @each iteration processes four rows
+
+ vld1.8 d4, [r0], r2 @load row 1 in source
+ vld1.8 d6, [r0], r2 @load row 2 in source
+ vld1.8 d8, [r0], r2 @load row 3 in source
+ vmovl.u8 q2, d4 @converting row 1 to 16-bit
+ vld1.8 d10, [r0], r2 @load row 4 in source
+ vmovl.u8 q3, d6 @converting row 2 to 16-bit
+
+ vmovl.u8 q4, d8 @converting row 3 to 16-bit
+ vmul.s16 q2, q2, d2[0] @weight mult. for row 1
+ vmovl.u8 q5, d10 @converting row 4 to 16-bit
+ vmul.s16 q3, q3, d2[0] @weight mult. for row 2
+ vmul.s16 q4, q4, d2[0] @weight mult. for row 3
+ vmul.s16 q5, q5, d2[0] @weight mult. for row 4
+
+ vrshl.s16 q2, q2, q0 @rounds off the weighted samples from row 1
+ vrshl.s16 q3, q3, q0 @rounds off the weighted samples from row 2
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 3
+ vaddw.s8 q2, q2, d3 @adding offset for row 1
+ vrshl.s16 q5, q5, q0 @rounds off the weighted samples from row 4
+ vaddw.s8 q3, q3, d3 @adding offset for row 2
+
+ vaddw.s8 q4, q4, d3 @adding offset for row 3
+ vqmovun.s16 d4, q2 @saturating row 1 to unsigned 8-bit
+ vaddw.s8 q5, q5, d3 @adding offset for row 4
+ vqmovun.s16 d6, q3 @saturating row 2 to unsigned 8-bit
+ vqmovun.s16 d8, q4 @saturating row 3 to unsigned 8-bit
+ vqmovun.s16 d10, q5 @saturating row 4 to unsigned 8-bit
+
+ vst1.8 d4, [r1], r3 @store row 1 in destination
+ vst1.8 d6, [r1], r3 @store row 2 in destination
+ subs r7, r7, #4 @decrement ht by 4
+ vst1.8 d8, [r1], r3 @store row 3 in destination
+ vst1.8 d10, [r1], r3 @store row 4 in destination
+
+ bgt loop_8 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_16: @each iteration processes four rows
+
+ vld1.8 {q2}, [r0], r2 @load row 1 in source
+ vld1.8 {q3}, [r0], r2 @load row 2 in source
+ vmovl.u8 q6, d4 @converting row 1L to 16-bit
+ vld1.8 {q4}, [r0], r2 @load row 3 in source
+ vmovl.u8 q7, d5 @converting row 1H to 16-bit
+ vld1.8 {q5}, [r0], r2 @load row 4 in source
+
+ vmovl.u8 q8, d6 @converting row 2L to 16-bit
+ vmul.s16 q6, q6, d2[0] @weight mult. for row 1L
+ vmovl.u8 q9, d7 @converting row 2H to 16-bit
+ vmul.s16 q7, q7, d2[0] @weight mult. for row 1H
+ vmovl.u8 q10, d8 @converting row 3L to 16-bit
+ vmul.s16 q8, q8, d2[0] @weight mult. for row 2L
+ vmovl.u8 q11, d9 @converting row 3H to 16-bit
+ vmul.s16 q9, q9, d2[0] @weight mult. for row 2H
+ vmovl.u8 q12, d10 @converting row 4L to 16-bit
+ vmul.s16 q10, q10, d2[0] @weight mult. for row 3L
+ vmovl.u8 q13, d11 @converting row 4H to 16-bit
+ vmul.s16 q11, q11, d2[0] @weight mult. for row 3H
+
+ vmul.s16 q12, q12, d2[0] @weight mult. for row 4L
+ vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 1L
+ vmul.s16 q13, q13, d2[0] @weight mult. for row 4H
+
+ vrshl.s16 q7, q7, q0 @rounds off the weighted samples from row 1H
+ vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 2L
+ vaddw.s8 q6, q6, d3 @adding offset for row 1L
+ vrshl.s16 q9, q9, q0 @rounds off the weighted samples from row 2H
+ vaddw.s8 q7, q7, d3 @adding offset for row 1H
+ vqmovun.s16 d4, q6 @saturating row 1L to unsigned 8-bit
+ vrshl.s16 q10, q10, q0 @rounds off the weighted samples from row 3L
+ vaddw.s8 q8, q8, d3 @adding offset for row 2L
+ vqmovun.s16 d5, q7 @saturating row 1H to unsigned 8-bit
+ vrshl.s16 q11, q11, q0 @rounds off the weighted samples from row 3H
+ vaddw.s8 q9, q9, d3 @adding offset for row 2H
+ vqmovun.s16 d6, q8 @saturating row 2L to unsigned 8-bit
+ vrshl.s16 q12, q12, q0 @rounds off the weighted samples from row 4L
+ vaddw.s8 q10, q10, d3 @adding offset for row 3L
+ vqmovun.s16 d7, q9 @saturating row 2H to unsigned 8-bit
+ vrshl.s16 q13, q13, q0 @rounds off the weighted samples from row 4H
+ vaddw.s8 q11, q11, d3 @adding offset for row 3H
+
+ vqmovun.s16 d8, q10 @saturating row 3L to unsigned 8-bit
+ vaddw.s8 q12, q12, d3 @adding offset for row 4L
+ vqmovun.s16 d9, q11 @saturating row 3H to unsigned 8-bit
+ vaddw.s8 q13, q13, d3 @adding offset for row 4H
+
+ vqmovun.s16 d10, q12 @saturating row 4L to unsigned 8-bit
+ vst1.8 {q2}, [r1], r3 @store row 1 in destination
+ vqmovun.s16 d11, q13 @saturating row 4H to unsigned 8-bit
+ vst1.8 {q3}, [r1], r3 @store row 2 in destination
+ subs r7, r7, #4 @decrement ht by 4
+ vst1.8 {q4}, [r1], r3 @store row 3 in destination
+ vst1.8 {q5}, [r1], r3 @store row 4 in destination
+
+ bgt loop_16 @if greater than 0 repeat the loop again
+
+end_loops:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r9, r15} @Reload the registers from sp (pop into pc returns)
+
+
+@*******************************************************************************
+@* @function
+@*
ih264_weighted_pred_chroma_a9q() +@* +@* @brief +@* This routine performs the default weighted prediction as described in sec +@* 8.4.2.3.2 titled "Weighted sample prediction process" for chroma. +@* +@* @par Description: +@* This function gets a ht x wd block, calculates the weighted sample, rounds +@* off, adds offset and stores it in the destination block for U and V. +@* +@* @param[in] pu1_src: +@* UWORD8 Pointer to the buffer containing the input block. +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination where the output block is stored. +@* +@* @param[in] src_strd +@* Stride of the input buffer +@* +@* @param[in] dst_strd +@* Stride of the destination buffer +@* +@* @param[in] log_wd +@* number of bits to be rounded off +@* +@* @param[in] wt +@* weights for the weighted prediction for U and V +@* +@* @param[in] ofst +@* offsets used after rounding off for U and V +@* +@* @param[in] ht +@* integer height of the array +@* +@* @param[in] wd +@* integer width of the array +@* +@* @returns +@* None +@* +@* @remarks +@* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8). 
+@* +@******************************************************************************* +@*/ +@void ih264_weighted_pred_chroma_a9q(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 log_wd, +@ WORD32 wt, +@ WORD32 ofst, +@ WORD32 ht, +@ WORD32 wd) +@ +@**************Variables Vs Registers***************************************** +@ r0 => pu1_src +@ r1 => pu1_dst +@ r2 => src_strd +@ r3 => dst_strd +@ [sp] => log_wd (r4) +@ [sp+4] => wt (r5) +@ [sp+8] => ofst (r6) +@ [sp+12] => ht (r7) +@ [sp+16] => wd (r8) +@ + + + .global ih264_weighted_pred_chroma_a9q + +ih264_weighted_pred_chroma_a9q: + + stmfd sp!, {r4-r9, r14} @stack stores the values of the arguments + + ldr r4, [sp, #28] @Load log_wd in r4 + ldr r5, [sp, #32] @Load wt = {wt_u (16-bit), wt_v (16-bit)} + ldr r6, [sp, #36] @Load ofst = {ofst_u (8-bit), ofst_v (8-bit)} + ldr r8, [sp, #44] @Load wd + + rsb r9, r4, #0 @r9 = -log_wd + vdup.32 q1, r5 @Q1 = {wt_u (16-bit), wt_v (16-bit)} + ldr r7, [sp, #40] @Load ht + vpush {d8-d15} + vdup.16 d4, r6 @D4 = {ofst_u (8-bit), ofst_v (8-bit)} + cmp r8, #8 @check if wd is 8 + vdup.16 q0, r9 @Q0 = -log_wd (16-bit) + beq loop_8_uv @branch if wd is 8 + + cmp r8, #4 @check if wd is 4 + beq loop_4_uv @branch if wd is 4 + +loop_2_uv: @each iteration processes two rows + + vld1.32 d6[0], [r0], r2 @load row 1 in source + vld1.32 d6[1], [r0], r2 @load row 2 in source + + vmovl.u8 q3, d6 @converting rows 1,2 to 16-bit + + vmul.s16 q3, q3, q1 @weight mult. 
for rows 1,2 + + vrshl.s16 q3, q3, q0 @rounds off the weighted samples from rows 1,2 + + vaddw.s8 q3, q3, d4 @adding offset for rows 1,2 + + vqmovun.s16 d6, q3 @saturating rows 1,2 to unsigned 8-bit + + subs r7, r7, #2 @decrement ht by 2 + vst1.32 d6[0], [r1], r3 @store row 1 in destination + vst1.32 d6[1], [r1], r3 @store row 2 in destination + + bgt loop_2_uv @if greater than 0 repeat the loop again + + b end_loops_uv + +loop_4_uv: @each iteration processes two rows + + vld1.8 d6, [r0], r2 @load row 1 in source + vld1.8 d8, [r0], r2 @load row 2 in source + + vmovl.u8 q3, d6 @converting row 1 to 16-bit + vmovl.u8 q4, d8 @converting row 2 to 16-bit + + vmul.s16 q3, q3, q1 @weight mult. for row 1 + vmul.s16 q4, q4, q1 @weight mult. for row 2 + + subs r7, r7, #2 @decrement ht by 2 + vrshl.s16 q3, q3, q0 @rounds off the weighted samples from row 1 + vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 2 + + vaddw.s8 q3, q3, d4 @adding offset for row 1 + vaddw.s8 q4, q4, d4 @adding offset for row 2 + + vqmovun.s16 d6, q3 @saturating row 1 to unsigned 8-bit + vqmovun.s16 d8, q4 @saturating row 2 to unsigned 8-bit + + vst1.8 d6, [r1], r3 @store row 1 in destination + vst1.8 d8, [r1], r3 @store row 2 in destination + + bgt loop_4_uv @if greater than 0 repeat the loop again + + b end_loops_uv + +loop_8_uv: @each iteration processes four rows + + vld1.8 {q3}, [r0], r2 @load row 1 in source + vld1.8 {q4}, [r0], r2 @load row 2 in source + vmovl.u8 q7, d6 @converting row 1L to 16-bit + vld1.8 {q5}, [r0], r2 @load row 3 in source + vmovl.u8 q8, d7 @converting row 1H to 16-bit + vld1.8 {q6}, [r0], r2 @load row 4 in source + + vmul.s16 q7, q7, q1 @weight mult. for row 1L + vmovl.u8 q9, d8 @converting row 2L to 16-bit + vmul.s16 q8, q8, q1 @weight mult. for row 1H + vmovl.u8 q10, d9 @converting row 2H to 16-bit + vmul.s16 q9, q9, q1 @weight mult. for row 2L + vmovl.u8 q11, d10 @converting row 3L to 16-bit + vmul.s16 q10, q10, q1 @weight mult. 
for row 2H + vmovl.u8 q12, d11 @converting row 3H to 16-bit + vmul.s16 q11, q11, q1 @weight mult. for row 3L + vmovl.u8 q13, d12 @converting row 4L to 16-bit + vmul.s16 q12, q12, q1 @weight mult. for row 3H + vmovl.u8 q14, d13 @converting row 4H to 16-bit + + vmul.s16 q13, q13, q1 @weight mult. for row 4L + vrshl.s16 q7, q7, q0 @rounds off the weighted samples from row 1L + vmul.s16 q14, q14, q1 @weight mult. for row 4H + + vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 1H + vrshl.s16 q9, q9, q0 @rounds off the weighted samples from row 2L + vaddw.s8 q7, q7, d4 @adding offset for row 1L + vrshl.s16 q10, q10, q0 @rounds off the weighted samples from row 2H + vaddw.s8 q8, q8, d4 @adding offset for row 1H + vqmovun.s16 d6, q7 @saturating row 1L to unsigned 8-bit + vrshl.s16 q11, q11, q0 @rounds off the weighted samples from row 3L + vaddw.s8 q9, q9, d4 @adding offset for row 2L + vqmovun.s16 d7, q8 @saturating row 1H to unsigned 8-bit + vrshl.s16 q12, q12, q0 @rounds off the weighted samples from row 3H + vaddw.s8 q10, q10, d4 @adding offset for row 2H + vqmovun.s16 d8, q9 @saturating row 2L to unsigned 8-bit + vrshl.s16 q13, q13, q0 @rounds off the weighted samples from row 4L + vaddw.s8 q11, q11, d4 @adding offset for row 3L + vqmovun.s16 d9, q10 @saturating row 2H to unsigned 8-bit + vrshl.s16 q14, q14, q0 @rounds off the weighted samples from row 4H + vaddw.s8 q12, q12, d4 @adding offset for row 3H + + vqmovun.s16 d10, q11 @saturating row 3L to unsigned 8-bit + vaddw.s8 q13, q13, d4 @adding offset for row 4L + vqmovun.s16 d11, q12 @saturating row 3H to unsigned 8-bit + vaddw.s8 q14, q14, d4 @adding offset for row 4H + + vqmovun.s16 d12, q13 @saturating row 4L to unsigned 8-bit + vst1.8 {q3}, [r1], r3 @store row 1 in destination + vqmovun.s16 d13, q14 @saturating row 4H to unsigned 8-bit + vst1.8 {q4}, [r1], r3 @store row 2 in destination + subs r7, r7, #4 @decrement ht by 4 + vst1.8 {q5}, [r1], r3 @store row 3 in destination + vst1.8 {q6}, [r1], r3 
@store row 4 in destination + + bgt loop_8_uv @if greater than 0 repeat the loop again + +end_loops_uv: + + vpop {d8-d15} + ldmfd sp!, {r4-r9, r15} @Reload the registers from sp + + + diff --git a/common/armv8/ih264_deblk_chroma_av8.s b/common/armv8/ih264_deblk_chroma_av8.s new file mode 100755 index 0000000..3021556 --- /dev/null +++ b/common/armv8/ih264_deblk_chroma_av8.s @@ -0,0 +1,585 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +//*/ +///*****************************************************************************/ +///* */ +///* File Name : ih264_deblk_chroma_av8.s */ +///* */ +///* Description : Contains function definitions for deblocking chroma */ +///* edge. Functions are coded in NEON assembly and can */ +///* be compiled using ARM RVDS. 
*/ +///* */ +///* List of Functions : ih264_deblk_chroma_vert_bs4_av8() */ +///* ih264_deblk_chroma_vert_bslt4_av8() */ +///* ih264_deblk_chroma_horz_bs4_av8() */ +///* ih264_deblk_chroma_horz_bslt4_av8() */ +///* Issues / Problems : None */ +///* */ +///* Revision History : */ +///* */ +///* DD MM YYYY Author(s) Changes (Describe the changes made) */ +///* 28 11 2013 Ittiam Draft */ +///*****************************************************************************/ + + +.text +.p2align 2 +.include "ih264_neon_macros.s" + +///** +//******************************************************************************* +//* +//* @brief +//* Performs filtering of a chroma block horizontal edge when the +//* boundary strength is set to 4 in high profile +//* +//* @par Description: +//* This operation is described in Sec. 8.7.2.4 under the title +//* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +//* +//* @param[in] x0 - pu1_src +//* Pointer to the src sample q0 +//* +//* @param[in] x1 - src_strd +//* Source stride +//* +//* @param[in] x2 - alpha_cb +//* Alpha Value for the boundary in U +//* +//* @param[in] x3 - beta_cb +//* Beta Value for the boundary in U +//* +//* @param[in] sp(0) - alpha_cr +//* Alpha Value for the boundary in V +//* +//* @param[in] sp(4) - beta_cr +//* Beta Value for the boundary in V +//* +//* @returns +//* None +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ + + .global ih264_deblk_chroma_horz_bs4_av8 + +ih264_deblk_chroma_horz_bs4_av8: + + // STMFD sp!,{x4-x6,x14} // + push_v_regs + stp x19, x20, [sp, #-16]! 
+ mov x6, x5 + mov x5, x4 + sub x0, x0, x1, lsl #1 //x0 = uc_edgePixel pointing to p1 of chroma + ld2 {v6.8b, v7.8b}, [x0], x1 //D6 = p1u , D7 = p1v + mov x4, x0 //Keeping a backup of the pointer p0 of chroma + ld2 {v4.8b, v5.8b}, [x0], x1 //D4 = p0u , D5 = p0v + dup v20.8b, w2 //D20 contains alpha_cb + dup v21.8b, w5 //D21 contains alpha_cr + mov v20.d[1], v21.d[0] + ld2 {v0.8b, v1.8b}, [x0], x1 //D0 = q0u , D1 = q0v + uaddl v8.8h, v6.8b, v0.8b // + uaddl v10.8h, v7.8b, v1.8b //Q4,Q5 = q0 + p1 + movi v31.8b, #2 // + ld2 {v2.8b, v3.8b}, [x0] //D2 = q1u , D3 = q1v + mov v0.d[1], v1.d[0] + mov v2.d[1], v3.d[0] + mov v4.d[1], v5.d[0] + mov v6.d[1], v7.d[0] + uabd v26.16b, v6.16b , v4.16b //Q13 = ABS(p1 - p0) + umlal v8.8h, v2.8b, v31.8b // + umlal v10.8h, v3.8b, v31.8b //Q5,Q4 = (X2(q1U) + q0U + p1U) + uabd v22.16b, v4.16b , v0.16b //Q11 = ABS(p0 - q0) + uabd v24.16b, v2.16b , v0.16b //Q12 = ABS(q1 - q0) + uaddl v14.8h, v4.8b, v2.8b // + uaddl v28.8h, v5.8b, v3.8b //Q14,Q7 = P0 + Q1 + dup v16.8b, w3 //D16 contains beta_cb + dup v17.8b, w6 //D17 contains beta_cr + mov v16.d[1], v17.d[0] + umlal v14.8h, v6.8b, v31.8b // + umlal v28.8h, v7.8b, v31.8b //Q14,Q7 = (X2(p1U) + p0U + q1U) + cmhs v18.16b, v22.16b, v20.16b + cmhs v24.16b, v24.16b, v16.16b + cmhs v26.16b, v26.16b, v16.16b + rshrn v8.8b, v8.8h, #2 // + rshrn v9.8b, v10.8h, #2 //Q4 = (X2(q1U) + q0U + p1U + 2) >> 2 + mov v8.d[1], v9.d[0] + orr v18.16b, v18.16b , v24.16b //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) + rshrn v10.8b, v14.8h, #2 // + rshrn v11.8b, v28.8h, #2 //Q5 = (X2(p1U) + p0U + q1U + 2) >> 2 + mov v10.d[1], v11.d[0] + orr v18.16b, v18.16b , v26.16b //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta ) + bit v10.16b, v4.16b , v18.16b // + bit v8.16b, v0.16b , v18.16b // + mov v11.d[0], v10.d[1] + mov v9.d[0], v8.d[1] + st2 {v10.8b, v11.8b}, [x4], x1 // + st2 {v8.8b, v9.8b}, [x4] // + // LDMFD sp!,{x4-x6,pc} // + ldp x19, x20, [sp], #16 + pop_v_regs + ret 
+ + + +///** +//******************************************************************************* +//* +//* @brief +//* Performs filtering of a chroma block vertical edge when the +//* boundary strength is set to 4 in high profile +//* +//* @par Description: +//* This operation is described in Sec. 8.7.2.4 under the title +//* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +//* +//* @param[in] x0 - pu1_src +//* Pointer to the src sample q0 +//* +//* @param[in] x1 - src_strd +//* Source stride +//* +//* @param[in] x2 - alpha_cb +//* Alpha Value for the boundary in U +//* +//* @param[in] x3 - beta_cb +//* Beta Value for the boundary in U +//* +//* @param[in] sp(0) - alpha_cr +//* Alpha Value for the boundary in V +//* +//* @param[in] sp(4) - beta_cr +//* Beta Value for the boundary in V +//* +//* @returns +//* None +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ + + .global ih264_deblk_chroma_vert_bs4_av8 + +ih264_deblk_chroma_vert_bs4_av8: + + // STMFD sp!,{x4,x5,x12,x14} + push_v_regs + stp x19, x20, [sp, #-16]! + + sub x0, x0, #4 //point x0 to p1u of row0. 
+ mov x12, x0 //keep a back up of x0 for buffer write + + add x2, x2, x4, lsl #8 //x2 = (alpha_cr,alpha_cb) + add x3, x3, x5, lsl #8 //x3 = (beta_cr,beta_cb) + + ld4 {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1 + ld4 {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1 + ld4 {v0.h, v1.h, v2.h, v3.h}[2], [x0], x1 + ld4 {v0.h, v1.h, v2.h, v3.h}[3], [x0], x1 + + ld4 {v4.h, v5.h, v6.h, v7.h}[0], [x0], x1 + ld4 {v4.h, v5.h, v6.h, v7.h}[1], [x0], x1 + ld4 {v4.h, v5.h, v6.h, v7.h}[2], [x0], x1 + ld4 {v4.h, v5.h, v6.h, v7.h}[3], [x0], x1 + + mov v10.16b, v2.16b + mov v2.16b, v1.16b + mov v1.16b, v4.16b + mov v4.16b, v10.16b + mov v10.16b, v6.16b + mov v6.16b, v3.16b + mov v3.16b, v5.16b + mov v5.16b, v10.16b + + dup v22.8h, w2 //Q11 = alpha + dup v24.8h, w3 //Q12 = beta + movi v31.8b, #2 + + mov v0.d[1], v1.d[0] + mov v2.d[1], v3.d[0] + mov v4.d[1], v5.d[0] + mov v6.d[1], v7.d[0] + + uabd v8.16b, v2.16b , v4.16b //|p0-q0| + uabd v10.16b, v6.16b , v4.16b //|q1-q0| + uabd v12.16b, v0.16b , v2.16b //|p1-p0| + uaddl v14.8h, v2.8b, v6.8b + uaddl v16.8h, v3.8b, v7.8b //(p0 + q1) + cmhi v8.16b, v22.16b , v8.16b //|p0-q0| < alpha ? + cmhi v10.16b, v24.16b , v10.16b //|q1-q0| < beta ? + cmhi v12.16b, v24.16b , v12.16b //|p1-p0| < beta ? 
+ umlal v14.8h, v0.8b, v31.8b + umlal v16.8h, v1.8b, v31.8b //2*p1 + (p0 + q1) + uaddl v18.8h, v0.8b, v4.8b + uaddl v20.8h, v1.8b, v5.8b //(p1 + q0) + and v8.16b, v8.16b , v10.16b //|p0-q0| < alpha && |q1-q0| < beta + umlal v18.8h, v6.8b, v31.8b + umlal v20.8h, v7.8b, v31.8b //2*q1 + (p1 + q0) + + rshrn v14.8b, v14.8h, #2 + rshrn v15.8b, v16.8h, #2 //(2*p1 + (p0 + q1) + 2) >> 2 + mov v14.d[1], v15.d[0] + and v8.16b, v8.16b , v12.16b //|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta + rshrn v18.8b, v18.8h, #2 + rshrn v19.8b, v20.8h, #2 //(2*q1 + (p1 + q0) + 2) >> 2 + mov v18.d[1], v19.d[0] + bit v2.16b, v14.16b , v8.16b + bit v4.16b, v18.16b , v8.16b + + mov v1.d[0], v0.d[1] + mov v3.d[0], v2.d[1] + mov v5.d[0], v4.d[1] + mov v7.d[0], v6.d[1] + + mov v10.16b, v1.16b + mov v1.16b, v2.16b + mov v2.16b, v4.16b + mov v4.16b, v10.16b + mov v10.16b, v3.16b + mov v3.16b, v6.16b + mov v6.16b, v5.16b + mov v5.16b, v10.16b + + st4 {v0.h, v1.h, v2.h, v3.h}[0], [x12], x1 + st4 {v0.h, v1.h, v2.h, v3.h}[1], [x12], x1 + st4 {v0.h, v1.h, v2.h, v3.h}[2], [x12], x1 + st4 {v0.h, v1.h, v2.h, v3.h}[3], [x12], x1 + + st4 {v4.h, v5.h, v6.h, v7.h}[0], [x12], x1 + st4 {v4.h, v5.h, v6.h, v7.h}[1], [x12], x1 + st4 {v4.h, v5.h, v6.h, v7.h}[2], [x12], x1 + st4 {v4.h, v5.h, v6.h, v7.h}[3], [x12], x1 + + // LDMFD sp!,{x4,x5,x12,pc} + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + +///** +//******************************************************************************* +//* +//* @brief +//* Performs filtering of a chroma block horizontal edge for cases where the +//* boundary strength is less than 4 in high profile +//* +//* @par Description: +//* This operation is described in Sec. 8.7.2.4 under the title +//* "Filtering process for edges with bS less than 4" in ITU T Rec H.264. 
+//* +//* @param[in] x0 - pu1_src +//* Pointer to the src sample q0 +//* +//* @param[in] x1 - src_strd +//* Source stride +//* +//* @param[in] x2 - alpha_cb +//* Alpha Value for the boundary in U +//* +//* @param[in] x3 - beta_cb +//* Beta Value for the boundary in U +//* +//* @param[in] sp(0) - alpha_cr +//* Alpha Value for the boundary in V +//* +//* @param[in] sp(4) - beta_cr +//* Beta Value for the boundary in V +//* +//* @param[in] sp(8) - u4_bs +//* Packed Boundary strength array +//* +//* @param[in] sp(12) - pu1_cliptab_cb +//* tc0_table for U +//* +//* @param[in] sp(16) - pu1_cliptab_cr +//* tc0_table for V +//* +//* @returns +//* None +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ + + .global ih264_deblk_chroma_horz_bslt4_av8 + +ih264_deblk_chroma_horz_bslt4_av8: + + // STMFD sp!,{x4-x9,x14} // + push_v_regs + stp x19, x20, [sp, #-16]! + mov x8, x7 + mov x7, x6 + ldr x9, [sp, #80] + sub x0, x0, x1, lsl #1 //x0 = uc_edgePixelU pointing to p1 of chroma U + rev w7, w7 // + mov v12.2s[0], w7 //D12[0] = ui_Bs + ld1 {v16.s}[0], [x8] //D16[0] contains cliptab_cb + ld1 {v17.s}[0], [x9] //D17[0] contains cliptab_cr + ld2 {v6.8b, v7.8b}, [x0], x1 //Q3=p1 + tbl v14.8b, {v16.16b}, v12.8b //Retrieving cliptab values for U + tbl v28.8b, {v17.16b}, v12.8b //Retrieving cliptab values for V + uxtl v12.8h, v12.8b //Q6 = uc_Bs in each 16 bit scalar + mov x6, x0 //Keeping a backup of the pointer to chroma U P0 + ld2 {v4.8b, v5.8b}, [x0], x1 //Q2=p0 + movi v30.8b, #1 // + dup v20.8b, w2 //D20 contains alpha_cb + dup v21.8b, w4 //D21 contains alpha_cr + mov v20.d[1], v21.d[0] + ld2 {v0.8b, v1.8b}, [x0], x1 //Q0=q0 + uxtl v14.8h, v14.8b // + uxtl v28.8h, v28.8b // + mov v15.d[0], v28.d[0] //D14 has cliptab values for U, D15 for V + mov v14.d[1], v28.d[0] + ld2 {v2.8b, v3.8b}, [x0] //Q1=q1 + usubl v10.8h, v1.8b, v5.8b // + usubl v8.8h, v0.8b, v4.8b //Q5,Q4 = (q0 - p0) + mov v6.d[1], v7.d[0] + mov v4.d[1], 
v5.d[0] + uabd v26.16b, v6.16b , v4.16b //Q13 = ABS(p1 - p0) + shl v10.8h, v10.8h, #2 //Q5 = (q0 - p0)<<2 + mov v0.d[1], v1.d[0] + uabd v22.16b, v4.16b , v0.16b //Q11 = ABS(p0 - q0) + shl v8.8h, v8.8h, #2 //Q4 = (q0 - p0)<<2 + mov v14.d[1], v15.d[0] + sli v14.8h, v14.8h, #8 + mov v15.d[0], v14.d[1] + mov v2.d[1], v3.d[0] + uabd v24.16b, v2.16b , v0.16b //Q12 = ABS(q1 - q0) + cmhs v18.16b, v22.16b, v20.16b + usubl v20.8h, v6.8b, v2.8b //Q10 = (p1 - q1)L + usubl v6.8h, v7.8b, v3.8b //Q3 = (p1 - q1)H + dup v16.8b, w3 //D16 contains beta_cb + dup v17.8b, w5 //D17 contains beta_cr + mov v16.d[1], v17.d[0] + add v8.8h, v8.8h , v20.8h // + add v10.8h, v10.8h , v6.8h //Q5,Q4 = [ (q0 - p0)<<2 ] + (p1 - q1) + cmhs v24.16b, v24.16b, v16.16b + cmgt v12.4h, v12.4h, #0 + sqrshrn v8.8b, v8.8h, #3 // + sqrshrn v9.8b, v10.8h, #3 //Q4 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3 + mov v8.d[1], v9.d[0] + add v14.8b, v14.8b , v30.8b //D14 = C = C0+1 for U + cmhs v26.16b, v26.16b, v16.16b + orr v18.16b, v18.16b , v24.16b //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) + abs v6.16b, v8.16b //Q4 = ABS (i_macro) + add v15.8b, v15.8b , v30.8b //D15 = C = C0+1 for V + mov v14.d[1], v15.d[0] + mov v13.8b, v12.8b + mov v12.d[1], v13.d[0] // + orr v18.16b, v18.16b , v26.16b //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta ) + umin v14.16b, v6.16b , v14.16b //Q7 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro) + bic v12.16b, v12.16b , v18.16b //final condition + cmge v8.16b, v8.16b, #0 + and v14.16b, v14.16b , v12.16b //Making delta zero in places where values should not be filtered + uqadd v16.16b, v4.16b , v14.16b //Q8 = p0 + delta + uqsub v4.16b, v4.16b , v14.16b //Q2 = p0 - delta + uqadd v18.16b, v0.16b , v14.16b //Q9 = q0 + delta + uqsub v0.16b, v0.16b , v14.16b //Q0 = q0 - delta + bif v16.16b, v4.16b , v8.16b //Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta) + bif v0.16b, v18.16b , v8.16b //Q0 = (i_macro >= 0 ) ? 
(q0-delta) : (q0+delta) + mov v17.d[0], v16.d[1] + mov v1.d[0], v0.d[1] + st2 {v16.8b, v17.8b}, [x6], x1 // + st2 {v0.8b, v1.8b}, [x6] // + + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + + +///** +//******************************************************************************* +//* +//* @brief +//* Performs filtering of a chroma block vertical edge for cases where the +//* boundary strength is less than 4 in high profile +//* +//* @par Description: +//* This operation is described in Sec. 8.7.2.4 under the title +//* "Filtering process for edges with bS less than 4" in ITU T Rec H.264. +//* +//* @param[in] x0 - pu1_src +//* Pointer to the src sample q0 +//* +//* @param[in] x1 - src_strd +//* Source stride +//* +//* @param[in] x2 - alpha_cb +//* Alpha Value for the boundary in U +//* +//* @param[in] x3 - beta_cb +//* Beta Value for the boundary in U +//* +//* @param[in] sp(0) - alpha_cr +//* Alpha Value for the boundary in V +//* +//* @param[in] sp(4) - beta_cr +//* Beta Value for the boundary in V +//* +//* @param[in] sp(8) - u4_bs +//* Packed Boundary strength array +//* +//* @param[in] sp(12) - pu1_cliptab_cb +//* tc0_table for U +//* +//* @param[in] sp(16) - pu1_cliptab_cr +//* tc0_table for V +//* +//* @returns +//* None +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ + + .global ih264_deblk_chroma_vert_bslt4_av8 + +ih264_deblk_chroma_vert_bslt4_av8: + + // STMFD sp!,{x4-x7,x10-x12,x14} + push_v_regs + stp x19, x20, [sp, #-16]! + mov x10, x7 + ldr x11, [sp, #80] //x11 = pu1_cliptab_cr (u4_bs is already in w6) + sub x0, x0, #4 //point x0 to p1u of row0. 
+ add x2, x2, x4, lsl #8 + add x3, x3, x5, lsl #8 + mov x12, x0 //keep a back up of x0 for buffer write + ld4 {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1 + ld4 {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1 + ld4 {v0.h, v1.h, v2.h, v3.h}[2], [x0], x1 + ld4 {v0.h, v1.h, v2.h, v3.h}[3], [x0], x1 + + ld4 {v4.h, v5.h, v6.h, v7.h}[0], [x0], x1 + ld4 {v4.h, v5.h, v6.h, v7.h}[1], [x0], x1 + ld4 {v4.h, v5.h, v6.h, v7.h}[2], [x0], x1 + ld4 {v4.h, v5.h, v6.h, v7.h}[3], [x0], x1 + + mov v10.16b, v2.16b + mov v2.16b, v1.16b + mov v1.16b, v4.16b + mov v4.16b, v10.16b + mov v10.16b, v6.16b + mov v6.16b, v3.16b + mov v3.16b, v5.16b + mov v5.16b, v10.16b + dup v22.8h, w2 //Q11 = alpha + mov v2.d[1], v3.d[0] + mov v4.d[1], v5.d[0] + uabd v8.16b, v2.16b , v4.16b //|p0-q0| + dup v24.8h, w3 //Q12 = beta + mov v25.d[0], v24.d[1] + mov v6.d[1], v7.d[0] + mov v0.d[1], v1.d[0] + uabd v10.16b, v6.16b , v4.16b //|q1-q0| + uabd v12.16b, v0.16b , v2.16b //|p1-p0| + cmhi v8.16b, v22.16b , v8.16b //|p0-q0| < alpha ? + usubl v14.8h, v0.8b, v6.8b + cmhi v10.16b, v24.16b , v10.16b //|q1-q0| < beta ? + usubl v16.8h, v1.8b, v7.8b //(p1 - q1) + cmhi v12.16b, v24.16b , v12.16b //|p1-p0| < beta ? 
+ usubl v18.8h, v4.8b, v2.8b + and v8.16b, v8.16b , v10.16b //|p0-q0| < alpha && |q1-q0| < beta + usubl v20.8h, v5.8b, v3.8b //(q0 - p0) + movi v28.8h, #4 + ld1 {v24.s}[0], [x10] //Load ClipTable for U + ld1 {v25.s}[0], [x11] //Load ClipTable for V + rev w6, w6 //Blocking strengths + and v8.16b, v8.16b , v12.16b //|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta + mov v10.s[0], w6 + mla v14.8h, v18.8h , v28.8h + mla v16.8h, v20.8h , v28.8h //4*(q0 - p0) + (p1 - q1) + uxtl v10.8h, v10.8b + sli v10.4h, v10.4h, #8 + tbl v12.8b, {v24.16b}, v10.8b //tC0 for U + tbl v13.8b, {v25.16b}, v10.8b //tC0 for V + zip1 v31.8b, v12.8b, v13.8b + zip2 v13.8b, v12.8b, v13.8b + mov v12.8b, v31.8b + mov v12.d[1], v13.d[0] + uxtl v10.4s, v10.4h + sli v10.4s, v10.4s, #16 + movi v24.16b, #1 + add v12.16b, v12.16b , v24.16b //tC0 + 1 + cmhs v10.16b, v10.16b , v24.16b //per-pixel bS >= 1, i.e. bS != 0 mask + and v8.16b, v8.16b , v10.16b //|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0 + // Q0 - Q3(inputs), + // Q4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0), + // Q6 (tC) + srshr v14.8h, v14.8h, #3 + srshr v16.8h, v16.8h, #3 //(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) + cmgt v18.8h, v14.8h , #0 + cmgt v20.8h, v16.8h , #0 + xtn v18.8b, v18.8h + xtn v19.8b, v20.8h //Q9 = sign(delta) + mov v18.d[1], v19.d[0] + abs v14.8h, v14.8h + abs v16.8h, v16.8h + xtn v14.8b, v14.8h + xtn v15.8b, v16.8h + mov v14.d[1], v15.d[0] + umin v14.16b, v14.16b , v12.16b //Q7 = |delta| + uqadd v20.16b, v2.16b , v14.16b //p0+|delta| + uqadd v22.16b, v4.16b , v14.16b //q0+|delta| + uqsub v24.16b, v2.16b , v14.16b //p0-|delta| + uqsub v26.16b, v4.16b , v14.16b //q0-|delta| + bit v24.16b, v20.16b , v18.16b //p0 + delta + bit v22.16b, v26.16b , v18.16b //q0 - delta + bit v2.16b, v24.16b , v8.16b + bit v4.16b, v22.16b , v8.16b + mov v1.d[0], v0.d[1] + mov v3.d[0], v2.d[1] + mov v5.d[0], v4.d[1] + mov v7.d[0], v6.d[1] + mov v10.16b, v1.16b + mov v1.16b, v2.16b + mov v2.16b, v4.16b + mov v4.16b, v10.16b + mov v10.16b, 
v3.16b + mov v3.16b, v6.16b + mov v6.16b, v5.16b + mov v5.16b, v10.16b + st4 {v0.h, v1.h, v2.h, v3.h}[0], [x12], x1 + st4 {v0.h, v1.h, v2.h, v3.h}[1], [x12], x1 + st4 {v0.h, v1.h, v2.h, v3.h}[2], [x12], x1 + st4 {v0.h, v1.h, v2.h, v3.h}[3], [x12], x1 + + st4 {v4.h, v5.h, v6.h, v7.h}[0], [x12], x1 + st4 {v4.h, v5.h, v6.h, v7.h}[1], [x12], x1 + st4 {v4.h, v5.h, v6.h, v7.h}[2], [x12], x1 + st4 {v4.h, v5.h, v6.h, v7.h}[3], [x12], x1 + + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + diff --git a/common/armv8/ih264_deblk_luma_av8.s b/common/armv8/ih264_deblk_luma_av8.s new file mode 100755 index 0000000..bcdb03f --- /dev/null +++ b/common/armv8/ih264_deblk_luma_av8.s @@ -0,0 +1,1084 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +//*/ +///*****************************************************************************/ +///* */ +///* File Name : ih264_deblk_luma_av8.s */ +///* */ +///* Description : Contains function definitions for deblocking luma */ +///* edge. Functions are coded in NEON assembly and can */ +///* be compiled using ARM RVDS. 
*/ +///* */ +///* List of Functions : ih264_deblk_luma_vert_bs4_av8() */ +///* ih264_deblk_luma_vert_bslt4_av8() */ +///* ih264_deblk_luma_horz_bs4_av8() */ +///* ih264_deblk_luma_horz_bslt4_av8() */ +///* */ +///* Issues / Problems : None */ +///* */ +///* Revision History : */ +///* */ +///* DD MM YYYY Author(s) Changes (Describe the changes made) */ +///* 28 11 2013 Ittiam Draft */ +///* */ +///*****************************************************************************/ + + +.text +.p2align 2 +.include "ih264_neon_macros.s" + + + +///** +//******************************************************************************* +//* +//* @brief +//* Performs filtering of a luma block horizontal edge for cases where the +//* boundary strength is less than 4 +//* +//* @par Description: +//* This operation is described in Sec. 8.7.2.4 under the title +//* "Filtering process for edges with bS less than 4" in ITU T Rec H.264. +//* +//* @param[in] x0 - pu1_src +//* Pointer to the src sample q0 +//* +//* @param[in] x1 - src_strd +//* Source stride +//* +//* @param[in] x2 - alpha +//* Alpha Value for the boundary +//* +//* @param[in] x3 - beta +//* Beta Value for the boundary +//* +//* @param[in] sp(0) - u4_bs +//* Packed Boundary strength array +//* +//* @param[in] sp(4) - pu1_cliptab +//* tc0_table +//* +//* @returns +//* None +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ + + .global ih264_deblk_luma_horz_bslt4_av8 + +ih264_deblk_luma_horz_bslt4_av8: + + // STMFD sp!,{x4-x7,x14} + push_v_regs + stp x19, x20, [sp, #-16]! 
+ + //LDRD x4,x5,[SP,#0x14] //x4 = ui_Bs , x5 = *puc_ClpTab + sub x0, x0, x1, lsl #1 //x0 = q0 - 2*src_strd (points to p1 row) + sub x0, x0, x1 //x0 pointer to p2 + rev w4, w4 // + ld1 {v10.8b, v11.8b}, [x0], x1 //p2 values are loaded into q5 + mov v12.2s[0], w4 //d12[0] = ui_Bs + mov x6, x0 //keeping backup of pointer to p1 + ld1 {v8.8b, v9.8b}, [x0], x1 //p1 values are loaded into q4 + mov x7, x0 //keeping backup of pointer to p0 + ld1 {v6.8b, v7.8b}, [x0], x1 //p0 values are loaded into q3 + uxtl v12.8h, v12.8b //q6 = uc_Bs in each 16 bit scalar + ld1 {v0.8b, v1.8b}, [x0], x1 //q0 values are loaded into q0 + mov v10.d[1], v11.d[0] + mov v8.d[1], v9.d[0] + mov v6.d[1], v7.d[0] + uabd v26.16b, v8.16b, v6.16b + ld1 {v2.8b, v3.8b}, [x0], x1 //q1 values are loaded into q1 + mov v0.d[1], v1.d[0] + mov v2.d[1], v3.d[0] + uabd v22.16b, v6.16b, v0.16b + ld1 {v16.s}[0], [x5] //D16[0] contains cliptab + uabd v24.16b, v2.16b, v0.16b + ld1 {v4.8b, v5.8b}, [x0], x1 //q2 values are loaded into q2 + tbl v14.8b, {v16.16b}, v12.8b // + mov v4.d[1], v5.d[0] + dup v20.16b, w2 //Q10 contains alpha + dup v16.16b, w3 //Q8 contains beta + uxtl v12.4s, v12.4h // + uxtl v14.4s, v14.4h // + uabd v28.16b, v10.16b, v6.16b + uabd v30.16b, v4.16b, v0.16b + cmgt v12.4s, v12.4s, #0 + sli v14.4s, v14.4s, #8 + cmhs v18.16b, v22.16b, v20.16b + cmhs v24.16b, v24.16b, v16.16b + cmhs v26.16b, v26.16b, v16.16b + cmhi v20.16b, v16.16b , v28.16b //Q10=(Ap<Beta) + cmhi v22.16b, v16.16b , v30.16b //Q11=(Aq<Beta) + sli v14.4s, v14.4s, #16 + orr v18.16b, v18.16b , v24.16b //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) + usubl v30.8h, v1.8b, v7.8b // + usubl v24.8h, v0.8b, v6.8b //Q15,Q12 = (q0 - p0) + orr v18.16b, v18.16b , v26.16b //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta ) + usubl v28.8h, v8.8b, v2.8b //Q14 = (p1 - q1)L + shl v26.8h, v30.8h, #2 //Q13 = (q0 - p0)<<2 + shl v24.8h, v24.8h, #2 //Q12 = (q0 - p0)<<2 + usubl v30.8h, v9.8b, v3.8b //Q15 = (p1 - q1)H + bic v12.16b, 
v12.16b , v18.16b //final condition + add v24.8h, v24.8h , v28.8h // + add v26.8h, v26.8h , v30.8h //Q13,Q12 = [ (q0 - p0)<<2 ] + (p1 - q1) + sub v18.16b, v14.16b , v20.16b //Q9 = C0 + (Ap < Beta) + urhadd v16.16b, v6.16b , v0.16b //Q8 = ((p0+q0+1) >> 1) + mov v17.d[0], v16.d[1] + sqrshrn v24.8b, v24.8h, #3 // + sqrshrn v25.8b, v26.8h, #3 //Q12 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3 + mov v24.d[1], v25.d[0] + sub v18.16b, v18.16b , v22.16b //Q9 = C0 + (Ap < Beta) + (Aq < Beta) + and v20.16b, v20.16b , v12.16b // + and v22.16b, v22.16b , v12.16b // + abs v26.16b, v24.16b //Q13 = ABS (i_macro) + uaddl v28.8h, v17.8b, v11.8b // + uaddl v10.8h, v16.8b, v10.8b //Q14,Q5 = p2 + (p0+q0+1)>>1 + uaddl v30.8h, v17.8b, v5.8b // + umin v18.16b, v26.16b , v18.16b //Q9 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro) + ushll v26.8h, v9.8b, #1 // + uaddl v4.8h, v16.8b, v4.8b //Q15,Q2 = q2 + (p0+q0+1)>>1 + ushll v16.8h, v8.8b, #1 //Q13,Q8 = (p1<<1) + and v18.16b, v18.16b , v12.16b //Making delta zero in places where values should not be filtered + sub v28.8h, v28.8h , v26.8h //Q14,Q5 = [p2 + (p0+q0+1)>>1] - (p1<<1) + sub v10.8h, v10.8h , v16.8h // + ushll v16.8h, v2.8b, #1 // + ushll v26.8h, v3.8b, #1 //Q13,Q8 = (q1<<1) + sqshrn v29.8b, v28.8h, #1 // + sqshrn v28.8b, v10.8h, #1 //Q14 = i_macro_p1 + mov v28.d[1], v29.d[0] + sub v4.8h, v4.8h , v16.8h // + sub v30.8h, v30.8h , v26.8h //Q15,Q2 = [q2 + (p0+q0+1)>>1] - (q1<<1) + neg v26.16b, v14.16b //Q13 = -C0 + smin v28.16b, v28.16b , v14.16b //Q14 = min(C0,i_macro_p1) + cmge v24.16b, v24.16b, #0 + sqshrn v31.8b, v30.8h, #1 // + sqshrn v30.8b, v4.8h, #1 //Q15 = i_macro_q1 + mov v30.d[1], v31.d[0] + smax v28.16b, v28.16b , v26.16b //Q14 = max( - C0 , min(C0, i_macro_p1) ) + uqadd v16.16b, v6.16b , v18.16b //Q8 = p0 + delta + uqsub v6.16b, v6.16b , v18.16b //Q3 = p0 - delta + smin v30.16b, v30.16b , v14.16b //Q15 = min(C0,i_macro_q1) + and v28.16b, v20.16b , v28.16b //condition check Ap<beta + uqadd v14.16b, v0.16b , v18.16b //Q7 = 
q0 + delta + uqsub v0.16b, v0.16b , v18.16b //Q0 = q0 - delta + smax v30.16b, v30.16b , v26.16b //Q15 = max( - C0 , min(C0, i_macro_q1) ) + bif v16.16b, v6.16b , v24.16b //Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta) + bif v0.16b, v14.16b , v24.16b //Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta) + add v28.16b, v28.16b , v8.16b // + and v30.16b, v22.16b , v30.16b //condition check Aq<beta + st1 {v16.16b}, [x7], x1 //writing back filtered value of p0 + add v30.16b, v30.16b , v2.16b // + st1 {v0.16b}, [x7], x1 //writing back filtered value of q0 + st1 {v28.16b}, [x6] //writing back filtered value of p1 + st1 {v30.16b}, [x7], x1 //writing back filtered value of q1 + + // LDMFD sp!,{x4-x7,pc} + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + +///** +//******************************************************************************* +//* +//* @brief +//* Performs filtering of a luma block horizontal edge when the +//* boundary strength is set to 4 +//* +//* @par Description: +//* This operation is described in Sec. 8.7.2.4 under the title +//* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +//* +//* @param[in] x0 - pu1_src +//* Pointer to the src sample q0 +//* +//* @param[in] x1 - src_strd +//* Source stride +//* +//* @param[in] x2 - alpha +//* Alpha Value for the boundary +//* +//* @param[in] x3 - beta +//* Beta Value for the boundary +//* +//* @returns +//* None +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ + + .global ih264_deblk_luma_horz_bs4_av8 + +ih264_deblk_luma_horz_bs4_av8: + + // Back up necessary registers on stack + // STMFD sp!,{x12,x14} + push_v_regs + stp x19, x20, [sp, #-16]! 
+ + // Init + dup v0.16b, w2 //duplicate alpha + sub x12, x0, x1 //pointer to p0 = q0 - src_strd + dup v2.16b, w3 //duplicate beta + sub x14, x0, x1, lsl#1 //pointer to p1 = q0 - src_strd*2 + sub x2, x0, x1, lsl#2 //pointer to p3 = q0 - src_strd*4 + sub x3, x14, x1 //pointer to p2 = p1 - src_strd + + // Load Data + ld1 {v4.8b, v5.8b}, [x0], x1 //load q0 to Q2, q0 = q0 + src_strd + ld1 {v6.8b, v7.8b}, [x12] //load p0 to Q3 + ld1 {v8.8b, v9.8b}, [x0], x1 //load q1 to Q4, q0 = q0 + src_strd + ld1 {v10.8b, v11.8b}, [x14] //load p1 to Q5 + mov v4.d[1] , v5.d[0] + mov v6.d[1] , v7.d[0] + mov v8.d[1] , v9.d[0] + mov v10.d[1] , v11.d[0] + + // Filter Decision + uabd v12.16b , v4.16b, v6.16b + uabd v14.16b , v8.16b, v4.16b + uabd v16.16b , v10.16b, v6.16b + cmhs v18.16b, v12.16b , v0.16b //ABS(p0 - q0) >= Alpha + cmhs v14.16b, v14.16b , v2.16b //ABS(q1 - q0) >= Beta + cmhs v16.16b, v16.16b , v2.16b //ABS(q1 - q0) >= Beta + movi v20.16b, #2 + orr v18.16b, v18.16b , v14.16b //ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta + ld1 {v14.8b, v15.8b}, [x0], x1 //load q2 to Q7, q0 = q0 + src_strd + mov v14.d[1] , v15.d[0] + orr v18.16b, v18.16b , v16.16b //ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta + usra v20.16b, v0.16b, #2 //alpha >>2 +2 + uabd v22.16b , v14.16b, v4.16b + uaddl v24.8h, v4.8b, v6.8b //p0+q0 L + uaddl v26.8h, v5.8b, v7.8b //p0+q0 H + cmhi v22.16b, v2.16b , v22.16b //Aq < Beta + cmhi v20.16b, v20.16b , v12.16b //(ABS(p0 - q0) <((Alpha >>2) + 2)) + // Deblock Filtering q0', q1', q2' + uaddw v28.8h, v24.8h , v8.8b //p0+q0+q1 L + uaddw v30.8h, v26.8h , v9.8b //p0+q0+q1 H + and v22.16b, v22.16b , v20.16b //(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) + // q0' if (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) TRUE + add v16.8h, v28.8h , v28.8h //2*(p0+q0+q1)L + add v0.8h, v30.8h , v30.8h //2*(p0+q0+q1)H + uaddw v16.8h, v16.8h , v14.8b //2*(p0+q0+q1)+q2 L + uaddw v0.8h, v0.8h , v15.8b //2*(p0+q0+q1)+q2 H + uaddw v16.8h, v16.8h , v10.8b 
//2*(p0+q0+q1)+q2 +p1 L + uaddw v0.8h, v0.8h , v11.8b //2*(p0+q0+q1)+q2 +p1 H + rshrn v12.8b, v16.8h, #3 //(2*(p0+q0+q1)+q2 +p1 +4)>> 3 L [q0'] + rshrn v13.8b, v0.8h, #3 //(2*(p0+q0+q1)+q2 +p1 +4)>> 3 H [q0'] + mov v12.d[1] , v13.d[0] + // q0" if (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) FALSE + uaddl v16.8h, v8.8b, v8.8b //2*q1 L + uaddl v0.8h, v9.8b, v9.8b //2*q1 H + uaddw v16.8h, v16.8h , v4.8b //2*q1+q0 L + uaddw v0.8h, v0.8h , v5.8b //2*q1+q0 H + uaddw v16.8h, v16.8h , v10.8b //2*q1+q0+p1 L + uaddw v0.8h, v0.8h , v11.8b //2*q1+q0+p1 H + rshrn v16.8b, v16.8h, #2 //(2*q1+q0+p1+2)>>2 L [q0"] + rshrn v17.8b, v0.8h, #2 //(2*q1+q0+p1+2)>>2 H [q0"] + mov v16.d[1] , v17.d[0] + uaddw v28.8h, v28.8h , v14.8b //p0+q0+q1+q2 L + uaddw v30.8h, v30.8h , v15.8b //p0+q0+q1+q2 H + ld1 {v0.8b, v1.8b}, [x0], x1 //load q3 to Q0, q0 = q0 + src_strd + mov v0.d[1] , v1.d[0] + bit v16.16b, v12.16b , v22.16b //choosing between q0' and q0" depending on condn + sub x0, x0, x1, lsl #2 //pointer to q0 + bic v22.16b, v22.16b , v18.16b //((ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)) + // && (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) + rshrn v12.8b, v28.8h, #2 //(p0+q0+q1+q2+2)>>2 L [q1'] + rshrn v13.8b, v30.8h, #2 //(p0+q0+q1+q2+2)>>2 H [q1'] + mov v12.d[1] , v13.d[0] + bif v4.16b, v16.16b , v18.16b //choose q0 or filtered q0 + mov v5.d[0] , v4.d[1] + uaddl v16.8h, v14.8b, v0.8b //q2+q3,L + uaddl v0.8h, v15.8b, v1.8b //q2+q3,H + add v28.8h, v28.8h , v16.8h //p0+q0+q1+2*q2+q3 L + st1 {v4.8b, v5.8b}, [x0], x1 //store q0 + add v30.8h, v30.8h , v0.8h //p0+q0+q1+2*q2+q3 H + add v28.8h, v28.8h , v16.8h //p0+q0+q1+3*q2+2*q3 L + add v30.8h, v30.8h , v0.8h //p0+q0+q1+3*q2+2*q3 H + rshrn v0.8b, v28.8h, #3 //(p0+q0+q1+3*q2+2*q3+4)>>3 L [q2'] + rshrn v1.8b, v30.8h, #3 //(p0+q0+q1+3*q2+2*q3+4)>>3 H [q2'] + mov v0.d[1] , v1.d[0] + ld1 {v30.8b, v31.8b}, [x3] //load p2 to Q15 + mov v30.d[1] , v31.d[0] + bif v12.16b, v8.16b , v22.16b //choose q1 or filtered value of q1 + 
mov v13.d[0] , v12.d[1] + uabd v16.16b , v30.16b, v6.16b + uaddw v24.8h, v24.8h , v10.8b //p0+q0+p1 L + bif v0.16b, v14.16b , v22.16b //choose q2 or filtered q2 + mov v1.d[0] , v0.d[1] + uaddw v26.8h, v26.8h , v11.8b //p0+q0+p1 H + st1 {v12.8b, v13.8b}, [x0], x1 //store q1 + cmhi v16.16b, v2.16b , v16.16b //Ap < Beta + add v28.8h, v24.8h , v24.8h //2*(p0+q0+p1) L + add v4.8h, v26.8h , v26.8h //2*(p0+q0+p1) H + st1 {v0.8b, v1.8b}, [x0], x1 //store q2 + and v20.16b, v20.16b , v16.16b //((Ap < Beta) && (ABS(p0 - q0) <((Alpha >>2) + 2))) + uaddw v28.8h, v28.8h , v30.8b //2*(p0+q0+p1)+p2 l + uaddw v4.8h, v4.8h , v31.8b //2*(p0+q0+p1)+p2 H + uaddw v28.8h, v28.8h , v8.8b //2*(p0+q0+p1)+p2+q1 L + uaddw v4.8h, v4.8h , v9.8b //2*(p0+q0+p1)+p2+q1 H + rshrn v28.8b, v28.8h, #3 //(2*(p0+q0+p1)+p2+q1+4)>>3 L,p0' + rshrn v29.8b, v4.8h, #3 //(2*(p0+q0+p1)+p2+q1+4)>>3 H,p0' + mov v28.d[1] , v29.d[0] + movi v0.8b, #2 + movi v1.4h, #2 + uaddl v2.8h, v6.8b, v8.8b //p0+q1 L + umlal v2.8h, v10.8b, v0.8b //2*p1+p0+q1 L + uaddl v16.8h, v7.8b, v9.8b //p0+q1 H + umlal v16.8h, v11.8b, v0.8b //2*p1+p0+q1 H + uaddw v12.8h, v24.8h , v30.8b //(p0+q0+p1) +p2 L + ld1 {v24.8b, v25.8b}, [x2] //load p3,Q12 + mov v24.d[1] , v25.d[0] + uaddw v4.8h, v26.8h , v31.8b //(p0+q0+p1) +p2 H + uaddl v8.8h, v30.8b, v24.8b //p2+p3 L + rshrn v26.8b, v12.8h, #2 //((p0+q0+p1)+p2 +2)>>2,p1' L + rshrn v2.8b, v2.8h, #2 //(2*p1+p0+q1+2)>>2,p0"L + rshrn v27.8b, v4.8h, #2 //((p0+q0+p1)+p2 +2)>>2,p1' H + rshrn v3.8b, v16.8h, #2 //(2*p1+p0+q1+2)>>2,p0" H + mov v26.d[1] , v27.d[0] + mov v2.d[1] , v3.d[0] + uaddl v16.8h, v31.8b, v25.8b //p2+p3 H + mla v12.8h, v8.8h , v1.4h[0] //(p0+q0+p1)+3*p2+2*p3 L + mla v4.8h, v16.8h , v1.4h[0] //(p0+q0+p1)+3*p2+2*p3 H + bic v16.16b, v20.16b , v18.16b //((ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)) + mov v17.d[0] , v16.d[1] //&& (Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) + bit v2.16b, v28.16b , v20.16b //choosing between po' and p0" + mov v3.d[0] , v2.d[1] + 
rshrn v12.8b, v12.8h, #3 //((p0+q0+p1)+3*p2+2*p3+4)>>3 L p2' + rshrn v13.8b, v4.8h, #3 //((p0+q0+p1)+3*p2+2*p3+4)>>3 H p2' + mov v12.d[1] , v13.d[0] + bif v6.16b, v2.16b , v18.16b //choosing between p0 and filtered value of p0 + bit v10.16b, v26.16b , v16.16b //choosing between p1 and p1' + bit v30.16b, v12.16b , v16.16b //choosing between p2 and p2' + st1 {v6.16b}, [x12] //store p0 + st1 {v10.16b}, [x14] //store p1 + st1 {v30.16b}, [x3] //store p2 + + // LDMFD sp!,{x12,pc} + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + +///** +//******************************************************************************* +//* +//* @brief +//* Performs filtering of a luma block vertical edge for cases where the +//* boundary strength is less than 4 +//* +//* @par Description: +//* This operation is described in Sec. 8.7.2.4 under the title +//* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +//* +//* @param[in] x0 - pu1_src +//* Pointer to the src sample q0 +//* +//* @param[in] x1 - src_strd +//* Source stride +//* +//* @param[in] x2 - alpha +//* Alpha Value for the boundary +//* +//* @param[in] x3 - beta +//* Beta Value for the boundary +//* +//* @param[in] sp(0) - u4_bs +//* Packed Boundary strength array +//* +//* @param[in] sp(4) - pu1_cliptab +//* tc0_table +//* +//* @returns +//* None +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ + + .global ih264_deblk_luma_vert_bslt4_av8 + +ih264_deblk_luma_vert_bslt4_av8: + + // STMFD sp!,{x12,x14} + push_v_regs + stp x19, x20, [sp, #-16]! 
+ + sub x0, x0, #4 //pointer uc_edgePixel-4 + mov x12, x4 + mov x14, x5 + mov x17, x0 + //loading p3:p2:p1:p0:q0:q1:q2:q3 for every row + ld1 {v0.8b}, [x0], x1 //row1 + ld1 {v2.8b}, [x0], x1 //row2 + ld1 {v4.8b}, [x0], x1 //row3 + rev w12, w12 //reversing ui_bs + ld1 {v6.8b}, [x0], x1 //row4 + mov v18.2s[0], w12 //d12[0] = ui_Bs + ld1 {v16.s}[0], [x14] //D16[0] contains cliptab + ld1 {v8.8b}, [x0], x1 //row5 + uxtl v18.8h, v18.8b //q6 = uc_Bs in each 16 bt scalar + ld1 {v10.8b}, [x0], x1 //row6 + ld1 {v12.8b}, [x0], x1 //row7 + tbl v16.8b, {v16.16b}, v18.8b //puc_ClipTab[uc_Bs] + ld1 {v14.8b}, [x0], x1 //row8 + ld1 {v1.8b}, [x0], x1 //row9 + uxtl v16.4s, v16.4h // + ld1 {v3.8b}, [x0], x1 //row10 + ld1 {v5.8b}, [x0], x1 //row11 + ld1 {v7.8b}, [x0], x1 //row12 + sli v16.4s, v16.4s, #8 // + ld1 {v9.8b}, [x0], x1 //row13 + ld1 {v11.8b}, [x0], x1 //row14 + ld1 {v13.8b}, [x0], x1 //row15 + sli v16.4s, v16.4s, #16 + ld1 {v15.8b}, [x0], x1 //row16 + + + //taking two 8x8 transposes + //2X2 transposes + trn1 v21.8b, v0.8b, v2.8b + trn2 v2.8b, v0.8b, v2.8b //row1 &2 + mov v0.8b, v21.8b + trn1 v21.8b, v4.8b, v6.8b + trn2 v6.8b, v4.8b, v6.8b //row3&row4 + mov v4.8b, v21.8b + trn1 v21.8b, v8.8b, v10.8b + trn2 v10.8b, v8.8b, v10.8b //row5&6 + mov v8.8b, v21.8b + trn1 v21.8b, v12.8b, v14.8b + trn2 v14.8b, v12.8b, v14.8b //row7 & 8 + mov v12.8b, v21.8b + trn1 v21.8b, v1.8b, v3.8b + trn2 v3.8b, v1.8b, v3.8b //row9 &10 + mov v1.8b, v21.8b + trn1 v21.8b, v5.8b, v7.8b + trn2 v7.8b, v5.8b, v7.8b //row11 & 12 + mov v5.8b, v21.8b + trn1 v21.8b, v9.8b, v11.8b + trn2 v11.8b, v9.8b, v11.8b //row13 &14 + mov v9.8b, v21.8b + trn1 v21.8b, v13.8b, v15.8b + trn2 v15.8b, v13.8b, v15.8b //row15 & 16 + mov v13.8b, v21.8b + //4x4 transposes + trn1 v21.4h, v2.4h, v6.4h + trn2 v6.4h, v2.4h, v6.4h //row2 & row4 + mov v2.8b, v21.8b + trn1 v21.4h, v10.4h, v14.4h + trn2 v14.4h, v10.4h, v14.4h //row6 & row8 + mov v10.8b, v21.8b + trn1 v21.4h, v3.4h, v7.4h + trn2 v7.4h, v3.4h, v7.4h //row10 & 12 + mov v3.8b, 
v21.8b + trn1 v21.4h, v11.4h, v15.4h + trn2 v15.4h, v11.4h, v15.4h //row14 & row16 + mov v11.8b, v21.8b + trn1 v21.2s, v6.2s, v14.2s + trn2 v14.2s, v6.2s, v14.2s //row4 & 8 + mov v6.8b, v21.8b + trn1 v21.2s, v7.2s, v15.2s + trn2 v15.2s, v7.2s, v15.2s //row 12 & 16 + mov v7.8b, v21.8b + //now Q3 ->p0 and Q7->q3 + trn1 v21.4h, v0.4h, v4.4h + trn2 v4.4h, v0.4h, v4.4h //row1 & 3 + mov v0.8b, v21.8b + trn1 v21.4h, v8.4h, v12.4h + trn2 v12.4h, v8.4h, v12.4h //row 5 & 7 + mov v8.8b, v21.8b + trn1 v21.4h, v1.4h, v5.4h + trn2 v5.4h, v1.4h, v5.4h //row9 & row11 + mov v1.8b, v21.8b + trn1 v21.4h, v9.4h, v13.4h + trn2 v13.4h, v9.4h, v13.4h //row13 & row15 + mov v9.8b, v21.8b + trn1 v21.2s, v0.2s, v8.2s + trn2 v8.2s, v0.2s, v8.2s //row1 & row5 + mov v0.8b, v21.8b + trn1 v21.2s, v1.2s, v9.2s + trn2 v9.2s, v1.2s, v9.2s //row9 & 13 + mov v1.8b, v21.8b + //now Q0->p3 & Q4->q0 + //starting processing as p0 and q0 are now ready + trn1 v21.2s, v2.2s, v10.2s + trn2 v10.2s, v2.2s, v10.2s //row2 &6 + mov v2.8b, v21.8b + mov v6.d[1] , v7.d[0] + mov v8.d[1] , v9.d[0] + urhadd v20.16b, v6.16b , v8.16b //((p0 + q0 + 1) >> 1) + mov v21.d[0], v20.d[1] + trn1 v31.2s, v3.2s, v11.2s + trn2 v11.2s, v3.2s, v11.2s //row10&row14 + mov v3.8b, v31.8b + movi v19.8b, #2 + mov v18.d[1], v19.d[0] + //now Q1->p2 & Q5->q1 + trn1 v31.2s, v4.2s, v12.2s + trn2 v12.2s, v4.2s, v12.2s //row3 & 7 + mov v4.8b, v31.8b + uabd v22.16b , v6.16b, v8.16b //ABS(q1 - q0) + trn1 v31.2s, v5.2s, v13.2s + trn2 v13.2s, v5.2s, v13.2s //row11 & row15 + mov v5.8b, v31.8b + mov v0.d[1] , v1.d[0] + mov v2.d[1] , v3.d[0] + mov v4.d[1] , v5.d[0] + mov v10.d[1] , v11.d[0] + mov v12.d[1] , v13.d[0] + mov v14.d[1] , v15.d[0] + uaddl v24.8h, v20.8b, v2.8b //(p2 + ((p0 + q0 + 1) >> 1) L + //now Q2->p1,Q6->q2 + uaddl v26.8h, v21.8b, v3.8b //(p2 + ((p0 + q0 + 1) >> 1) H + umlsl v24.8h, v4.8b, v19.8b //(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) L + umlsl v26.8h, v5.8b, v19.8b //(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) H + dup v28.16b, w2 //alpha + 
cmhs v22.16b, v22.16b , v28.16b //ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0)) + dup v28.16b, w3 //beta + uabd v30.16b , v10.16b, v8.16b //ABS(q1 - q0) + sqshrn v24.8b, v24.8h, #1 //((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) L + sqshrn v25.8b, v26.8h, #1 //((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) H + mov v24.d[1], v25.d[0] + cmhs v30.16b, v30.16b , v28.16b //ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0)) + uabd v26.16b , v4.16b, v6.16b //ABS(q1 - q0) + + smin v24.16b, v24.16b , v16.16b //min(deltap1 ,C0) + orr v22.16b, v22.16b , v30.16b //ABS(q1 - q0) >= Beta ||ABS(p0 - q0) >= Alpha + neg v30.16b, v16.16b //-C0 + cmhs v26.16b, v26.16b , v28.16b //ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0)) + smax v24.16b, v24.16b , v30.16b //max(deltap1,-C0) + orr v22.16b, v22.16b , v26.16b //ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta) + uxtl v26.4s, v18.4h //ui_bs + uaddl v18.8h, v20.8b, v12.8b //q2 + ((p0 + q0 + 1) >> 1) L + cmeq v26.4s, v26.4s , #0 //ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0)) + usubw v18.8h, v18.8h , v10.8b //(q2 + ((p0 + q0 + 1) >> 1) - q1) L + uaddl v20.8h, v21.8b, v13.8b //q2 + ((p0 + q0 + 1) >> 1) H + usubw v18.8h, v18.8h , v10.8b //(q2 + ((p0 + q0 + 1) >> 1) - 2*q1)L + usubw v20.8h, v20.8h , v11.8b //(q2 + ((p0 + q0 + 1) >> 1) - q1) H + orr v26.16b, v26.16b , v22.16b //(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)) &&(ui_bs) + usubw v20.8h, v20.8h , v11.8b //(q2 + ((p0 + q0 + 1) >> 1) - 2*q1) H + sqshrn v18.8b, v18.8h, #1 //((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) L + uabd v22.16b , v2.16b, v6.16b //ABS(q1 - q0) + sqshrn v19.8b, v20.8h, #1 //((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) H + mov v18.d[1], v19.d[0] + uabd v20.16b , v12.16b, v8.16b //ABS(q1 - q0) + cmhi v22.16b, v28.16b , v22.16b //Ap < Beta + smin v18.16b, v18.16b , v16.16b //min(delatq1,C0) + cmhi v20.16b, v28.16b , v20.16b //Aq <Beta + usubl v28.8h, v8.8b, v6.8b //(q0 - p0) L + smax v18.16b, v18.16b , v30.16b 
//max(deltaq1,-C0) + usubl v30.8h, v9.8b, v7.8b //(q0 - p0) H + shl v28.8h, v28.8h, #2 //(q0 - p0)<<2 L + sub v16.16b, v16.16b , v22.16b //C0 + (Ap < Beta) + shl v30.8h, v30.8h, #2 //(q0 - p0) << 2) H + uaddw v28.8h, v28.8h , v4.8b //((q0 - p0) << 2) + (p1 L + uaddw v30.8h, v30.8h , v5.8b //((q0 - p0) << 2) + (p1 H + usubw v28.8h, v28.8h , v10.8b //((q0 - p0) << 2) + (p1 - q1) L + usubw v30.8h, v30.8h , v11.8b //((q0 - p0) << 2) + (p1 - q1) H + bic v22.16b, v22.16b , v26.16b //final condition for p1 + rshrn v28.8b, v28.8h, #3 //delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3); L + rshrn v29.8b, v30.8h, #3 //delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) H + mov v28.d[1], v29.d[0] + sub v16.16b, v16.16b , v20.16b //C0 + (Ap < Beta) + (Aq < Beta) + bic v20.16b, v20.16b , v26.16b //final condition for q1 + abs v30.16b, v28.16b //abs(delta) + and v24.16b, v24.16b , v22.16b //delatp1 + and v18.16b, v18.16b , v20.16b //delta q1 + umin v30.16b, v30.16b , v16.16b //min((abs(delta),C) + add v4.16b, v4.16b , v24.16b //p1+deltap1 + add v10.16b, v10.16b , v18.16b //q1+deltaq1 + mov v5.d[0], v4.d[1] + mov v11.d[0], v10.d[1] + bic v30.16b, v30.16b , v26.16b //abs(delta) of pixels to be changed only + // VCGE.S8 Q14, Q14,#0 //sign(delta) + cmge v28.16b, v28.16b , #0 + uqsub v22.16b, v6.16b , v30.16b //clip(p0-delta) + + trn1 v21.8b, v0.8b, v2.8b + trn2 v2.8b, v0.8b, v2.8b //row1 &2 + mov v0.8b, v21.8b + uqadd v6.16b, v6.16b , v30.16b //clip(p0+delta) + + trn1 v21.8b, v1.8b, v3.8b + trn2 v3.8b, v1.8b, v3.8b //row9 &10 + mov v1.8b, v21.8b + uqadd v24.16b, v8.16b , v30.16b //clip(q0+delta) + trn1 v21.8b, v12.8b, v14.8b + trn2 v14.8b, v12.8b, v14.8b //row7 & 8 + mov v12.8b, v21.8b + uqsub v8.16b, v8.16b , v30.16b //clip(q0-delta) + trn1 v21.8b, v13.8b, v15.8b + trn2 v15.8b, v13.8b, v15.8b //row15 & 16 + mov v13.8b, v21.8b + bif v6.16b, v22.16b , v28.16b //p0 + bif v8.16b, v24.16b , v28.16b //q0 + mov v7.d[0], v6.d[1] + mov v9.d[0], v8.d[1] + trn1 v21.8b, v4.8b, v6.8b + trn2 
v6.8b, v4.8b, v6.8b //row3&row4 + mov v4.8b, v21.8b + trn1 v21.8b, v8.8b, v10.8b + trn2 v10.8b, v8.8b, v10.8b //row5&6 + mov v8.8b, v21.8b + trn1 v21.8b, v5.8b, v7.8b + trn2 v7.8b, v5.8b, v7.8b //row11 & 12 + mov v5.8b, v21.8b + trn1 v21.8b, v9.8b, v11.8b + trn2 v11.8b, v9.8b, v11.8b //row13 &14 + mov v9.8b, v21.8b + trn1 v21.4h, v2.4h, v6.4h + trn2 v6.4h, v2.4h, v6.4h //row2 & row4 + mov v2.8b, v21.8b + trn1 v21.4h, v10.4h, v14.4h + trn2 v14.4h, v10.4h, v14.4h //row6 & row8 + mov v10.8b, v21.8b + trn1 v21.4h, v3.4h, v7.4h + trn2 v7.4h, v3.4h, v7.4h //row10 & 12 + mov v3.8b, v21.8b + trn1 v21.4h, v11.4h, v15.4h + trn2 v15.4h, v11.4h, v15.4h //row14 & row16 + mov v11.8b, v21.8b + trn1 v21.2s, v6.2s, v14.2s + trn2 v14.2s, v6.2s, v14.2s //row4 & 8 + mov v6.8b, v21.8b + trn1 v21.2s, v7.2s, v15.2s + trn2 v15.2s, v7.2s, v15.2s //row 12 & 16 + mov v7.8b, v21.8b + //now Q3 ->p0 and Q7->q3 + trn1 v21.4h, v0.4h, v4.4h + trn2 v4.4h, v0.4h, v4.4h //row1 & 3 + mov v0.8b, v21.8b + trn1 v21.4h, v8.4h, v12.4h + trn2 v12.4h, v8.4h, v12.4h //row 5 & 7 + mov v8.8b, v21.8b + trn1 v21.4h, v1.4h, v5.4h + trn2 v5.4h, v1.4h, v5.4h //row9 & row11 + mov v1.8b, v21.8b + trn1 v21.4h, v9.4h, v13.4h + trn2 v13.4h, v9.4h, v13.4h //row13 & row15 + mov v9.8b, v21.8b + sub x0, x0, x1, lsl#4 //restore pointer + trn1 v21.2s, v0.2s, v8.2s + trn2 v8.2s, v0.2s, v8.2s //row1 & row5 + mov v0.8b, v21.8b + trn1 v21.2s, v1.2s, v9.2s + trn2 v9.2s, v1.2s, v9.2s //row9 & 13 + mov v1.8b, v21.8b + trn1 v21.2s, v2.2s, v10.2s + trn2 v10.2s, v2.2s, v10.2s //row2 &6 + mov v2.8b, v21.8b + trn1 v21.2s, v3.2s, v11.2s + trn2 v11.2s, v3.2s, v11.2s //row10&row14 + mov v3.8b, v21.8b + trn1 v21.2s, v4.2s, v12.2s + trn2 v12.2s, v4.2s, v12.2s //row3 & 7 + mov v4.8b, v21.8b + trn1 v21.2s, v5.2s, v13.2s + trn2 v13.2s, v5.2s, v13.2s //row11 & row15 + mov v5.8b, v21.8b + st1 {v0.8b}, [x0], x1 //row1 + st1 {v2.8b}, [x0], x1 //row2 + st1 {v4.8b}, [x0], x1 //row3 + st1 {v6.8b}, [x0], x1 //row4 + st1 {v8.8b}, [x0], x1 //row5 + st1 
{v10.8b}, [x0], x1 //row6 + st1 {v12.8b}, [x0], x1 //row7 + st1 {v14.8b}, [x0], x1 //row8 + st1 {v1.8b}, [x0], x1 //row9 + st1 {v3.8b}, [x0], x1 //row10 + st1 {v5.8b}, [x0], x1 //row11 + st1 {v7.8b}, [x0], x1 //row12 + st1 {v9.8b}, [x0], x1 //row13 + st1 {v11.8b}, [x0], x1 //row14 + st1 {v13.8b}, [x0], x1 //row15 + st1 {v15.8b}, [x0], x1 //row16 + + // LDMFD sp!,{x12,pc} + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + +///** +//******************************************************************************* +//* +//* @brief +//* Performs filtering of a luma block vertical edge when the +//* boundary strength is set to 4 +//* +//* @par Description: +//* This operation is described in Sec. 8.7.2.4 under the title +//* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. +//* +//* @param[in] x0 - pu1_src +//* Pointer to the src sample q0 +//* +//* @param[in] x1 - src_strd +//* Source stride +//* +//* @param[in] x2 - alpha +//* Alpha Value for the boundary +//* +//* @param[in] x3 - beta +//* Beta Value for the boundary +//* +//* @returns +//* None +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ + + .global ih264_deblk_luma_vert_bs4_av8 + +ih264_deblk_luma_vert_bs4_av8: + + // STMFD sp!,{x12,x14} + push_v_regs + stp x19, x20, [sp, #-16]! 
+ + sub x0, x0, #4 //pointer uc_edgePixel-4 + mov x17, x0 + //loading p3:p2:p1:p0:q0:q1:q2:q3 for every row + ld1 {v0.8b}, [x0], x1 //row1 + ld1 {v2.8b}, [x0], x1 //row2 + ld1 {v4.8b}, [x0], x1 //row3 + ld1 {v6.8b}, [x0], x1 //row4 + ld1 {v8.8b}, [x0], x1 //row5 + ld1 {v10.8b}, [x0], x1 //row6 + ld1 {v12.8b}, [x0], x1 //row7 + ld1 {v14.8b}, [x0], x1 //row8 + ld1 {v1.8b}, [x0], x1 //row9 + ld1 {v3.8b}, [x0], x1 //row10 + ld1 {v5.8b}, [x0], x1 //row11 + ld1 {v7.8b}, [x0], x1 //row12 + ld1 {v9.8b}, [x0], x1 //row13 + ld1 {v11.8b}, [x0], x1 //row14 + ld1 {v13.8b}, [x0], x1 //row15 + ld1 {v15.8b}, [x0], x1 //row16 + + //taking two 8x8 transposes + //2X2 transposes + trn1 v21.8b, v0.8b, v2.8b + trn2 v2.8b, v0.8b, v2.8b //row1 &2 + mov v0.8b, v21.8b + trn1 v21.8b, v4.8b, v6.8b + trn2 v6.8b, v4.8b, v6.8b //row3&row4 + mov v4.8b, v21.8b + trn1 v21.8b, v8.8b, v10.8b + trn2 v10.8b, v8.8b, v10.8b //row5&6 + mov v8.8b, v21.8b + trn1 v21.8b, v12.8b, v14.8b + trn2 v14.8b, v12.8b, v14.8b //row7 & 8 + mov v12.8b, v21.8b + trn1 v21.8b, v1.8b, v3.8b + trn2 v3.8b, v1.8b, v3.8b //row9 &10 + mov v1.8b , v21.8b + trn1 v21.8b, v5.8b, v7.8b + trn2 v7.8b, v5.8b, v7.8b //row11 & 12 + mov v5.8b , v21.8b + trn1 v21.8b, v9.8b, v11.8b + trn2 v11.8b, v9.8b, v11.8b //row13 &14 + mov v9.8b , v21.8b + trn1 v21.8b, v13.8b, v15.8b + trn2 v15.8b, v13.8b, v15.8b //row15 & 16 + mov v13.8b , v21.8b + //4x4 transposes + trn1 v21.4h, v2.4h, v6.4h + trn2 v6.4h, v2.4h, v6.4h //row2 & row4 + mov v2.8b, v21.8b + trn1 v21.4h, v10.4h, v14.4h + trn2 v14.4h, v10.4h, v14.4h //row6 & row8 + mov v10.8b , v21.8b + trn1 v21.4h, v3.4h, v7.4h + trn2 v7.4h, v3.4h, v7.4h //row10 & 12 + mov v3.8b, v21.8b + trn1 v21.4h, v11.4h, v15.4h + trn2 v15.4h, v11.4h, v15.4h //row14 & row16 + mov v11.8b, v21.8b + trn1 v21.2s, v6.2s, v14.2s + trn2 v14.2s, v6.2s, v14.2s //row4 & 8 + mov v6.8b, v21.8b + trn1 v21.2s, v7.2s, v15.2s + trn2 v15.2s, v7.2s, v15.2s //row 12 & 16 + mov v7.8b, v21.8b + //now Q3 ->p0 and Q7->q3 + trn1 v21.4h, v0.4h, 
v4.4h + trn2 v4.4h, v0.4h, v4.4h //row1 & 3 + mov v0.8b , v21.8b + trn1 v21.4h, v8.4h, v12.4h + trn2 v12.4h, v8.4h, v12.4h //row 5 & 7 + mov v8.8b, v21.8b + trn1 v21.4h, v1.4h, v5.4h + trn2 v5.4h, v1.4h, v5.4h //row9 & row11 + mov v1.8b, v21.8b + trn1 v21.4h, v9.4h, v13.4h + trn2 v13.4h, v9.4h, v13.4h //row13 & row15 + mov v9.8b , v21.8b + trn1 v21.2s, v0.2s, v8.2s + trn2 v8.2s, v0.2s, v8.2s //row1 & row5 + mov v0.8b, v21.8b + trn1 v21.2s, v1.2s, v9.2s + trn2 v9.2s, v1.2s, v9.2s //row9 & 13 + mov v1.8b, v21.8b + //now Q0->p3 & Q4->q0 + //starting processing as p0 and q0 are now ready + //now Q1->p2 & Q5->q1 + mov v31.d[0], v14.d[0] + mov v31.d[1], v15.d[0] + trn1 v21.2s, v4.2s, v12.2s + trn2 v12.2s, v4.2s, v12.2s //row3 & 7 + mov v4.8b, v21.8b + movi v28.8h, #2 + trn1 v21.2s, v5.2s, v13.2s + trn2 v13.2s, v5.2s, v13.2s //row11 & row15 + mov v5.8b, v21.8b + uaddl v16.8h, v6.8b, v8.8b //p0+q0 L + trn1 v21.2s, v2.2s, v10.2s + trn2 v10.2s, v2.2s, v10.2s //row2 &6 + mov v2.8b, v21.8b + uaddl v18.8h, v7.8b, v9.8b //p0+q0 H + trn1 v21.2s, v3.2s, v11.2s + trn2 v11.2s, v3.2s, v11.2s //row10&row14 + mov v3.8b, v21.8b + uaddw v20.8h, v16.8h , v4.8b //p0+q0+p1 L + uaddw v22.8h, v18.8h , v5.8b //p0+q0+p1 H + uaddl v24.8h, v2.8b, v10.8b //p2+q1 L + uaddl v26.8h, v3.8b, v11.8b //p2+q1 H + mla v24.8h, v20.8h , v28.8h //p2 + X2(p1) + X2(p0) + X2(q0) + q1 L + mla v26.8h, v22.8h , v28.8h //p2 + X2(p1) + X2(p0) + X2(q0) + q1 H + movi v28.16b, #2 + uaddw v16.8h, v20.8h , v2.8b //p0+q0+p1+p2 L + uaddw v18.8h, v22.8h , v3.8b //p0+q0+p1+p2 H + dup v30.16b, w2 //duplicate alpha + rshrn v20.8b, v16.8h, #2 //(p2 + p1 + p0 + q0 + 2) >> 2)L p1' + rshrn v21.8b, v18.8h, #2 //(p2 + p1 + p0 + q0 + 2) >> 2)H p1' + mov v20.d[1] , v21.d[0] + mov v0.d[1] , v1.d[0] + mov v2.d[1] , v3.d[0] + mov v4.d[1] , v5.d[0] + mov v6.d[1] , v7.d[0] + mov v8.d[1] , v9.d[0] + mov v10.d[1] , v11.d[0] + mov v12.d[1] , v13.d[0] + mov v14.d[1] , v15.d[0] + uabd v22.16b , v6.16b, v8.16b + usra v28.16b, v30.16b, #2 //alpha 
>>2 +2 + uabd v30.16b , v2.16b, v6.16b + rshrn v24.8b, v24.8h, #3 //((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) L p0' + rshrn v25.8b, v26.8h, #3 //((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) H p0' + mov v24.d[1] , v25.d[0] + dup v26.16b, w3 //beta + cmhi v28.16b, v28.16b , v22.16b //ABS(p0 - q0) <((Alpha >>2) + 2) + uaddl v22.8h, v6.8b, v10.8b //p0+q1 L + cmhi v14.16b, v26.16b , v30.16b //beta>Ap + uaddl v30.8h, v7.8b, v11.8b //p0+q1 H + uaddw v22.8h, v22.8h , v4.8b //p0+q1+p1 L + uaddw v30.8h, v30.8h , v5.8b //p0+q1+p1 H + uaddw v22.8h, v22.8h , v4.8b //p0+q1+2*p1 L + uaddw v30.8h, v30.8h , v5.8b //p0+q1+2*p1 H + and v14.16b, v14.16b , v28.16b //(Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2) + rshrn v22.8b, v22.8h, #2 //((X2(p1) + p0 + q1 + 2) >> 2) L p0" + rshrn v23.8b, v30.8h, #2 //((X2(p1) + p0 + q1 + 2) >> 2) H p0" + mov v22.d[1] , v23.d[0] + uaddl v30.8h, v2.8b, v0.8b //p2+p3 L + bif v24.16b, v22.16b , v14.16b //p0' or p0 " + uaddl v22.8h, v3.8b, v1.8b //p2+p3 H + add v30.8h, v30.8h , v30.8h //2*(p2+p3) L + add v22.8h, v22.8h , v22.8h //2*(p2+p3)H + add v16.8h, v16.8h , v30.8h //(X2(p3) + X3(p2) + p1 + p0 + q0) L + add v18.8h, v18.8h , v22.8h //(X2(p3) + X3(p2) + p1 + p0 + q0) H + uabd v30.16b , v12.16b, v8.16b + uabd v22.16b , v10.16b, v8.16b + rshrn v16.8b, v16.8h, #3 //((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); L p2' + rshrn v17.8b, v18.8h, #3 //((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); H p2' + mov v16.d[1] , v17.d[0] + uabd v18.16b , v4.16b, v6.16b + cmhi v30.16b, v26.16b , v30.16b //Aq < Beta + cmhs v22.16b, v22.16b, v26.16b + cmhs v18.16b, v18.16b, v26.16b + dup v26.16b, w2 //duplicate alpha + and v30.16b, v30.16b , v28.16b //(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) + uabd v28.16b , v6.16b, v8.16b + orr v22.16b, v22.16b , v18.16b //ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta + uaddl v18.8h, v6.8b, v8.8b //p0+q0 L + cmhs v28.16b, v28.16b, v26.16b + uaddl v26.8h, v7.8b, v9.8b //p0+q0 H + uaddw v18.8h, v18.8h , v10.8b //p0+q0+q1 L + orr 
v22.16b, v22.16b , v28.16b //ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta||ABS(p0 - q0) >= Alpha + uaddw v26.8h, v26.8h , v11.8b //p0+q0+q1 H + bic v14.16b, v14.16b , v22.16b //final condn for p's + movi v28.16b, #2 + bif v6.16b, v24.16b , v22.16b //final p0 + bit v2.16b, v16.16b , v14.16b //final p2 + bif v20.16b, v4.16b , v14.16b //final p1 + mov v7.d[0] , v6.d[1] + mov v3.d[0] , v2.d[1] + mov v21.d[0] , v20.d[1] + uaddl v24.8h, v8.8b, v4.8b //q0+p1 L + umlal v24.8h, v10.8b, v28.8b //X2(q1) + q0 + p1 L + uaddl v16.8h, v9.8b, v5.8b //q0+p1 H + umlal v16.8h, v11.8b, v28.8b //X2(q1) + q0 + p1 H + movi v28.8h, #2 + uaddl v14.8h, v4.8b, v12.8b //p1+q2 L + mla v14.8h, v18.8h , v28.8h //p1 + X2(p0) + X2(q0) + X2(q1) + q2L + uaddl v4.8h, v5.8b, v13.8b //p1+q2H + mla v4.8h, v26.8h , v28.8h //p1 + X2(p0) + X2(q0) + X2(q1) + q2H + rshrn v24.8b, v24.8h, #2 //(X2(q1) + q0 + p1 + 2) >> 2; L q0' + rshrn v25.8b, v16.8h, #2 //(X2(q1) + q0 + p1 + 2) >> 2; H q0' + mov v24.d[1] , v25.d[0] + uaddw v18.8h, v18.8h , v12.8b //p0 + q0 + q1 + q2 L + uaddw v26.8h, v26.8h , v13.8b //p0 + q0 + q1 + q2 H + rshrn v16.8b, v14.8h, #3 //(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 L qo" + mov v14.16b, v31.16b + rshrn v17.8b, v4.8h, #3 //(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 H qo" + mov v16.d[1] , v17.d[0] + rshrn v4.8b, v18.8h, #2 //p0 + q0 + q1 + q2 + 2)>>2 L q1' + rshrn v5.8b, v26.8h, #2 //p0 + q0 + q1 + q2 + 2)>>2 H q1' + mov v4.d[1] , v5.d[0] + bit v24.16b, v16.16b , v30.16b //q0' or q0" + bic v30.16b, v30.16b , v22.16b //final condn for q's + trn1 v31.8b, v0.8b, v2.8b + trn2 v2.8b, v0.8b, v2.8b //row1 &2 + mov v0.8b, v31.8b + bit v10.16b, v4.16b , v30.16b + mov v11.d[0] , v10.d[1] + mov v25.d[0] , v24.d[1] + mov v31.d[0] , v30.d[1] + trn1 v31.8b, v1.8b, v3.8b + trn2 v3.8b, v1.8b, v3.8b //row9 &10 + mov v1.8b, v31.8b + uaddl v16.8h, v12.8b, v14.8b //q2+q3 L + trn1 v31.8b, v20.8b, v6.8b + trn2 v6.8b, v20.8b, v6.8b //row3&row4 + mov v20.8b , v31.8b + uaddl v4.8h, v13.8b, v15.8b //q2+q3 
H + trn1 v31.8b, v21.8b, v7.8b + trn2 v7.8b, v21.8b, v7.8b //row11 & 12 + mov v21.8b , v31.8b + mla v18.8h, v16.8h , v28.8h //X2(q3) + X3(q2) + q1 + q0 + p0 L + trn1 v31.4h, v2.4h, v6.4h + trn2 v6.4h, v2.4h, v6.4h //row2 & row4 + mov v2.8b, v31.8b + mla v26.8h, v4.8h , v28.8h //X2(q3) + X3(q2) + q1 + q0 + p0 H + trn1 v31.4h, v3.4h, v7.4h + trn2 v7.4h, v3.4h, v7.4h //row10 & 12 + mov v3.8b , v31.8b + bif v8.16b, v24.16b , v22.16b //final q0 + mov v9.d[0] , v8.d[1] + trn1 v31.4h, v0.4h, v20.4h + trn2 v20.4h, v0.4h, v20.4h //row1 & 3 + mov v0.8b , v31.8b + rshrn v18.8b, v18.8h, #3 //(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; L + trn1 v31.4h, v1.4h, v21.4h + trn2 v21.4h, v1.4h, v21.4h //row9 & row11 + mov v1.8b, v31.8b + rshrn v19.8b, v26.8h, #3 //(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; H + mov v18.d[1] , v19.d[0] + trn1 v31.8b, v8.8b, v10.8b + trn2 v10.8b, v8.8b, v10.8b //row5&6 + mov v8.8b, v31.8b + bit v12.16b, v18.16b , v30.16b //final q2 + mov v13.d[0] , v12.d[1] + trn1 v31.8b, v9.8b, v11.8b + trn2 v11.8b, v9.8b, v11.8b //row13 &14 + mov v9.8b, v31.8b + trn1 v31.8b, v12.8b, v14.8b + trn2 v14.8b, v12.8b, v14.8b //row7 & 8 + mov v12.8b, v31.8b + trn1 v31.8b, v13.8b, v15.8b + trn2 v15.8b, v13.8b, v15.8b //row15 & 16 + mov v13.8b , v31.8b + trn1 v31.4h, v10.4h, v14.4h + trn2 v14.4h, v10.4h, v14.4h //row6 & row8 + mov v10.8b, v31.8b + trn1 v31.4h, v11.4h, v15.4h + trn2 v15.4h, v11.4h, v15.4h //row14 & row16 + mov v11.8b, v31.8b + //now Q3 ->p0 and Q7->q3 + trn1 v31.4h, v8.4h, v12.4h + trn2 v12.4h, v8.4h, v12.4h //row 5 & 7 + mov v8.8b, v31.8b + trn1 v31.4h, v9.4h, v13.4h + trn2 v13.4h, v9.4h, v13.4h //row13 & row15 + mov v9.8b, v31.8b + sub x0, x0, x1, lsl#4 //restore pointer + trn1 v31.2s, v6.2s, v14.2s + trn2 v14.2s, v6.2s, v14.2s //row4 & 8 + mov v6.8b , v31.8b + trn1 v31.2s, v7.2s, v15.2s + trn2 v15.2s, v7.2s, v15.2s //row 12 & 16 + mov v7.8b, v31.8b + trn1 v31.2s, v0.2s, v8.2s + trn2 v8.2s, v0.2s, v8.2s //row1 & row5 + mov v0.8b , v31.8b + trn1 v31.2s, v1.2s, 
v9.2s + trn2 v9.2s, v1.2s, v9.2s //row9 & 13 + mov v1.8b , v31.8b + trn1 v31.2s, v2.2s, v10.2s + trn2 v10.2s, v2.2s, v10.2s //row2 &6 + mov v2.8b , v31.8b + trn1 v31.2s, v3.2s, v11.2s + trn2 v11.2s, v3.2s, v11.2s //row10&row14 + mov v3.8b , v31.8b + trn1 v31.2s, v20.2s, v12.2s + trn2 v12.2s, v20.2s, v12.2s //row3 & 7 + mov v20.8b , v31.8b + trn1 v31.2s, v21.2s, v13.2s + trn2 v13.2s, v21.2s, v13.2s //row11 & row15 + mov v21.8b, v31.8b + st1 {v0.8b}, [x0], x1 //row1 + st1 {v2.8b}, [x0], x1 //row2 + st1 {v20.8b}, [x0], x1 //row3 + st1 {v6.8b}, [x0], x1 //row4 + st1 {v8.8b}, [x0], x1 //row5 + st1 {v10.8b}, [x0], x1 //row6 + st1 {v12.8b}, [x0], x1 //row7 + st1 {v14.8b}, [x0], x1 //row8 + st1 {v1.8b}, [x0], x1 //row9 + st1 {v3.8b}, [x0], x1 //row10 + st1 {v21.8b}, [x0], x1 //row11 + st1 {v7.8b}, [x0], x1 //row12 + st1 {v9.8b}, [x0], x1 //row13 + st1 {v11.8b}, [x0], x1 //row14 + st1 {v13.8b}, [x0], x1 //row15 + st1 {v15.8b}, [x0], x1 //row16 + + // LDMFD sp!,{x12,pc} + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + diff --git a/common/armv8/ih264_default_weighted_pred_av8.s b/common/armv8/ih264_default_weighted_pred_av8.s new file mode 100755 index 0000000..aefb902 --- /dev/null +++ b/common/armv8/ih264_default_weighted_pred_av8.s @@ -0,0 +1,353 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. 
+//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +//*/ +///** +//****************************************************************************** +//* @file +//* ih264_default_weighted_pred_av8.s +//* +//* @brief +//* Contains function definitions for default weighted prediction. +//* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT +//* +//* @author +//* Kaushik Senthoor R +//* +//* @par List of Functions: +//* +//* - ih264_default_weighted_pred_luma_av8() +//* - ih264_default_weighted_pred_chroma_av8() +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ +//******************************************************************************* +//* @function +//* ih264_default_weighted_pred_luma_av8() +//* +//* @brief +//* This routine performs the default weighted prediction as described in sec +//* 8.4.2.3.1 titled "Default weighted sample prediction process" for luma. +//* +//* @par Description: +//* This function gets two ht x wd blocks, calculates their rounded-average and +//* stores it in the destination block. +//* +//* @param[in] puc_src1: +//* UWORD8 Pointer to the buffer containing the first input block. +//* +//* @param[in] puc_src2: +//* UWORD8 Pointer to the buffer containing the second input block. +//* +//* @param[out] puc_dst +//* UWORD8 pointer to the destination where the output block is stored. +//* +//* @param[in] src_strd1 +//* Stride of the first input buffer +//* +//* @param[in] src_strd2 +//* Stride of the second input buffer +//* +//* @param[in] dst_strd +//* Stride of the destination buffer +//* +//* @param[in] ht +//* integer height of the array +//* +//* @param[in] wd +//* integer width of the array +//* +//* @returns +//* None +//* +//* @remarks +//* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16). 
+//* +//******************************************************************************* +//*/ +//void ih264_default_weighted_pred_luma_av8(UWORD8 *puc_src1, +// UWORD8 *puc_src2, +// UWORD8 *puc_dst, +// WORD32 src_strd1, +// WORD32 src_strd2, +// WORD32 dst_strd, +// UWORD8 ht, +// UWORD8 wd) +// +//**************Variables Vs Registers***************************************** +// x0 => puc_src1 +// x1 => puc_src2 +// x2 => puc_dst +// x3 => src_strd1 +// [sp] => src_strd2 (x4) +// [sp+4] => dst_strd (x5) +// [sp+8] => ht (x6) +// [sp+12] => wd (x7) +// +.text +.p2align 2 +.include "ih264_neon_macros.s" + + + + .global ih264_default_weighted_pred_luma_av8 + +ih264_default_weighted_pred_luma_av8: + + push_v_regs + stp x19, x20, [sp, #-16]! + cmp w7, #16 + beq loop_16 //branch if wd is 16 + cmp w7, #8 + beq loop_8 //branch if wd is 8 + +loop_4: //each iteration processes four rows + + ld1 {v0.s}[0], [x0], x3 //load row 1 in source 1 + ld1 {v0.s}[1], [x0], x3 //load row 2 in source 1 + ld1 {v2.s}[0], [x1], x4 //load row 1 in source 2 + ld1 {v2.s}[1], [x1], x4 //load row 2 in source 2 + ld1 {v1.s}[0], [x0], x3 //load row 3 in source 1 + ld1 {v1.s}[1], [x0], x3 //load row 4 in source 1 + urhadd v0.8b, v0.8b , v2.8b + ld1 {v3.s}[0], [x1], x4 //load row 3 in source 2 + ld1 {v3.s}[1], [x1], x4 //load row 4 in source 2 + subs w6, w6, #4 //decrement ht by 4 + st1 {v0.s}[0], [x2], x5 //load row 1 in destination + st1 {v0.s}[1], [x2], x5 //load row 2 in destination + urhadd v1.8b, v1.8b , v3.8b + st1 {v1.s}[0], [x2], x5 //load row 3 in destination + st1 {v1.s}[1], [x2], x5 //load row 4 in destination + bgt loop_4 //if greater than 0 repeat the loop again + b end_loops + +loop_8: //each iteration processes four rows + + ld1 {v0.8b}, [x0], x3 //load row 1 in source 1 + ld1 {v4.8b}, [x1], x4 //load row 1 in source 2 + ld1 {v1.8b}, [x0], x3 //load row 2 in source 1 + ld1 {v5.8b}, [x1], x4 //load row 2 in source 2 + ld1 {v2.8b}, [x0], x3 //load row 3 in source 1 + urhadd v0.16b, 
v0.16b , v4.16b + urhadd v1.16b, v1.16b , v5.16b + ld1 {v6.8b}, [x1], x4 //load row 3 in source 2 + ld1 {v3.8b}, [x0], x3 //load row 4 in source 1 + urhadd v2.8b, v2.8b , v6.8b + ld1 {v7.8b}, [x1], x4 //load row 4 in source 2 + subs w6, w6, #4 //decrement ht by 4 + st1 {v0.8b}, [x2], x5 //load row 1 in destination + urhadd v3.8b, v3.8b , v7.8b + st1 {v1.8b}, [x2], x5 //load row 2 in destination + st1 {v2.8b}, [x2], x5 //load row 3 in destination + st1 {v3.8b}, [x2], x5 //load row 4 in destination + bgt loop_8 //if greater than 0 repeat the loop again + b end_loops + +loop_16: //each iteration processes eight rows + + ld1 {v0.8b, v1.8b}, [x0], x3 //load row 1 in source 1 + ld1 {v16.8b, v17.8b}, [x1], x4 //load row 1 in source 2 + ld1 {v2.8b, v3.8b}, [x0], x3 //load row 2 in source 1 + ld1 {v18.8b, v19.8b}, [x1], x4 //load row 2 in source 2 + urhadd v0.16b, v0.16b , v16.16b + urhadd v1.16b, v1.16b , v17.16b + ld1 {v4.8b, v5.8b}, [x0], x3 //load row 3 in source 1 + ld1 {v20.8b, v21.8b}, [x1], x4 //load row 3 in source 2 + urhadd v2.16b, v2.16b , v18.16b + urhadd v3.16b, v3.16b , v19.16b + ld1 {v6.8b, v7.8b}, [x0], x3 //load row 4 in source 1 + ld1 {v22.8b, v23.8b}, [x1], x4 //load row 4 in source 2 + urhadd v4.16b, v4.16b , v20.16b + urhadd v5.16b, v5.16b , v21.16b + ld1 {v8.8b, v9.8b}, [x0], x3 //load row 5 in source 1 + ld1 {v24.8b, v25.8b}, [x1], x4 //load row 5 in source 2 + urhadd v6.16b, v6.16b , v22.16b + urhadd v7.16b, v7.16b , v23.16b + ld1 {v10.8b, v11.8b}, [x0], x3 //load row 6 in source 1 + ld1 {v26.8b, v27.8b}, [x1], x4 //load row 6 in source 2 + urhadd v8.16b, v8.16b , v24.16b + urhadd v9.16b, v9.16b , v25.16b + ld1 {v12.8b, v13.8b}, [x0], x3 //load row 7 in source 1 + ld1 {v28.8b, v29.8b}, [x1], x4 //load row 7 in source 2 + urhadd v10.16b, v10.16b , v26.16b + urhadd v11.16b, v11.16b , v27.16b + ld1 {v14.8b, v15.8b}, [x0], x3 //load row 8 in source 1 + ld1 {v30.8b, v31.8b}, [x1], x4 //load row 8 in source 2 + urhadd v12.16b, v12.16b , v28.16b + urhadd 
v13.16b, v13.16b , v29.16b + st1 {v0.8b, v1.8b}, [x2], x5 //load row 1 in destination + st1 {v2.8b, v3.8b}, [x2], x5 //load row 2 in destination + urhadd v14.16b, v14.16b , v30.16b + urhadd v15.16b, v15.16b , v31.16b + st1 {v4.8b, v5.8b}, [x2], x5 //load row 3 in destination + st1 {v6.8b, v7.8b}, [x2], x5 //load row 4 in destination + subs w6, w6, #8 //decrement ht by 8 + st1 {v8.8b, v9.8b}, [x2], x5 //load row 5 in destination + st1 {v10.8b, v11.8b}, [x2], x5 //load row 6 in destination + st1 {v12.8b, v13.8b}, [x2], x5 //load row 7 in destination + st1 {v14.8b, v15.8b}, [x2], x5 //load row 8 in destination + bgt loop_16 //if greater than 0 repeat the loop again + +end_loops: + + // LDMFD sp!,{x4-x7,x15} //Reload the registers from sp + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + +//******************************************************************************* +//* @function +//* ih264_default_weighted_pred_chroma_av8() +//* +//* @brief +//* This routine performs the default weighted prediction as described in sec +//* 8.4.2.3.1 titled "Default weighted sample prediction process" for chroma. +//* +//* @par Description: +//* This function gets two ht x wd blocks, calculates their rounded-average and +//* stores it in the destination block for U and V. +//* +//* @param[in] puc_src1: +//* UWORD8 Pointer to the buffer containing the first input block. +//* +//* @param[in] puc_src2: +//* UWORD8 Pointer to the buffer containing the second input block. +//* +//* @param[out] puc_dst +//* UWORD8 pointer to the destination where the output block is stored. 
+//* +//* @param[in] src_strd1 +//* Stride of the first input buffer +//* +//* @param[in] src_strd2 +//* Stride of the second input buffer +//* +//* @param[in] dst_strd +//* Stride of the destination buffer +//* +//* @param[in] ht +//* integer height of the array +//* +//* @param[in] wd +//* integer width of the array +//* +//* @returns +//* None +//* +//* @remarks +//* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8). +//* +//******************************************************************************* +//*/ +//void ih264_default_weighted_pred_chroma_av8(UWORD8 *puc_src1, +// UWORD8 *puc_src2, +// UWORD8 *puc_dst, +// WORD32 src_strd1, +// WORD32 src_strd2, +// WORD32 dst_strd, +// UWORD8 ht, +// UWORD8 wd) +// +//**************Variables Vs Registers***************************************** +// x0 => puc_src1 +// x1 => puc_src2 +// x2 => puc_dst +// x3 => src_strd1 +// [sp] => src_strd2 (x4) +// [sp+4] => dst_strd (x5) +// [sp+8] => ht (x6) +// [sp+12] => wd (x7) +// + + + + + .global ih264_default_weighted_pred_chroma_av8 + +ih264_default_weighted_pred_chroma_av8: + + push_v_regs + stp x19, x20, [sp, #-16]! 
+ cmp w7, #8 + beq loop_8_uv //branch if wd is 8 + cmp w7, #4 + beq loop_4_uv //branch if wd is 4 + +loop_2_uv: //each iteration processes two rows + + ld1 {v0.s}[0], [x0], x3 //load row 1 in source 1 + ld1 {v0.s}[1], [x0], x3 //load row 2 in source 1 + ld1 {v1.s}[0], [x1], x4 //load row 1 in source 2 + ld1 {v1.s}[1], [x1], x4 //load row 2 in source 2 + urhadd v0.8b, v0.8b , v1.8b + subs w6, w6, #2 //decrement ht by 2 + st1 {v0.s}[0], [x2], x5 //load row 1 in destination + st1 {v0.s}[1], [x2], x5 //load row 2 in destination + bgt loop_2_uv //if greater than 0 repeat the loop again + b end_loops_uv + +loop_4_uv: //each iteration processes two rows + + ld1 {v0.8b}, [x0], x3 //load row 1 in source 1 + ld1 {v2.8b}, [x1], x4 //load row 1 in source 2 + ld1 {v1.8b}, [x0], x3 //load row 2 in source 1 + urhadd v0.8b, v0.8b , v2.8b + ld1 {v3.8b}, [x1], x4 //load row 2 in source 2 + urhadd v1.8b, v1.8b , v3.8b + st1 {v0.8b}, [x2], x5 //load row 1 in destination + subs w6, w6, #2 //decrement ht by 2 + st1 {v1.8b}, [x2], x5 //load row 2 in destination + bgt loop_4_uv //if greater than 0 repeat the loop again + b end_loops_uv + +loop_8_uv: //each iteration processes four rows + + ld1 {v0.8b, v1.8b}, [x0], x3 //load row 1 in source 1 + ld1 {v8.8b, v9.8b}, [x1], x4 //load row 1 in source 2 + ld1 {v2.8b, v3.8b}, [x0], x3 //load row 2 in source 1 + urhadd v0.16b, v0.16b , v8.16b + urhadd v1.16b, v1.16b , v9.16b + ld1 {v10.8b, v11.8b}, [x1], x4 //load row 2 in source 2 + ld1 {v4.8b, v5.8b}, [x0], x3 //load row 3 in source 1 + urhadd v2.16b, v2.16b , v10.16b + urhadd v3.16b, v3.16b , v11.16b + ld1 {v12.8b, v13.8b}, [x1], x4 //load row 3 in source 2 + ld1 {v6.8b, v7.8b}, [x0], x3 //load row 4 in source 1 + urhadd v4.16b, v4.16b , v12.16b + urhadd v5.16b, v5.16b , v13.16b + ld1 {v14.8b, v15.8b}, [x1], x4 //load row 4 in source 2 + st1 {v0.8b, v1.8b}, [x2], x5 //load row 1 in destination + urhadd v6.16b, v6.16b , v14.16b + urhadd v7.16b, v7.16b , v15.16b + st1 {v2.8b, v3.8b}, [x2], x5 
//load row 2 in destination + subs w6, w6, #4 //decrement ht by 4 + st1 {v4.8b, v5.8b}, [x2], x5 //load row 3 in destination + st1 {v6.8b, v7.8b}, [x2], x5 //load row 4 in destination + bgt loop_8_uv //if greater than 0 repeat the loop again + +end_loops_uv: + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + diff --git a/common/armv8/ih264_ihadamard_scaling_av8.s b/common/armv8/ih264_ihadamard_scaling_av8.s new file mode 100755 index 0000000..712c9ae --- /dev/null +++ b/common/armv8/ih264_ihadamard_scaling_av8.s @@ -0,0 +1,250 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. 
//******************************************************************************
//* @file
//*  ih264_ihadamard_scaling_av8.s
//*
//* @brief
//*  Inverse Hadamard transform and scaling of the DC coefficients of
//*  16x16 intra (4x4 luma DC) and chroma (2x2 per plane) blocks.
//*  AArch64 NEON.
//*
//* @author
//*  Mohit
//*
//* @par List of Functions:
//*  - ih264_ihadamard_scaling_4x4_av8()
//*  - ih264_ihadamard_scaling_2x2_uv_av8()
//******************************************************************************
.include "ih264_neon_macros.s"

//******************************************************************************
//* @brief
//*  4x4 inverse Hadamard transform of the 16 luma DC coefficients of a 16x16
//*  intra macroblock, followed by scaling:
//*  out = ((coeff * pu2_iscal_mat[0] * pu2_weigh_mat[0]) << (qp/6) + 32) >> 6
//*
//* @param[in]  x0  pi2_src        input 4x4 block of DC coefficients (WORD16)
//* @param[out] x1  pi2_out        output 4x4 block (WORD16)
//* @param[in]  x2  pu2_iscal_mat  pointer to scaling list (only [0] is used)
//* @param[in]  x3  pu2_weigh_mat  pointer to weight matrix (only [0] is used)
//* @param[in]  w4  u4_qp_div_6    floor(qp / 6)
//* @param[in]  x5  pi4_tmp        temporary buffer (unused in this NEON path)
//*
//* @returns  none
//******************************************************************************
//void ih264_ihadamard_scaling_4x4(WORD16 *pi2_src,
//                                 WORD16 *pi2_out,
//                                 const UWORD16 *pu2_iscal_mat,
//                                 const UWORD16 *pu2_weigh_mat,
//                                 UWORD32 u4_qp_div_6,
//                                 WORD32 *pi4_tmp)

.text
.p2align 2

    .global ih264_ihadamard_scaling_4x4_av8
ih264_ihadamard_scaling_4x4_av8:

//The rounding is folded into the final sqrshrn #6; the left shift by qp/6 is
//applied with sshl before narrowing.
    push_v_regs

//=======================inverse hadamard transform=============================

    ld4 {v0.4h-v3.4h}, [x0]         //load the 4x4 block, de-interleaved by column: x4,x5,x6,x7

    dup v14.4s, w4                  //broadcast u4_qp_div_6 (shift amount for sshl)
    ld1 {v15.h}[0], [x3]            //pu2_weigh_mat[0]
    ld1 {v16.h}[0], [x2]            //pu2_iscal_mat[0]

    //vertical 1-D inverse Hadamard (widened to 32 bit)
    saddl v4.4s, v0.4h, v3.4h       //x0 = x4 + x7
    saddl v5.4s, v1.4h, v2.4h       //x1 = x5 + x6
    ssubl v6.4s, v1.4h, v2.4h       //x2 = x5 - x6
    ssubl v7.4s, v0.4h, v3.4h       //x3 = x4 - x7

    add v0.4s, v4.4s, v5.4s         //pi4_tmp_ptr[0] = x0 + x1
    add v1.4s, v7.4s, v6.4s         //pi4_tmp_ptr[1] = x3 + x2
    sub v2.4s, v4.4s, v5.4s         //pi4_tmp_ptr[2] = x0 - x1
    sub v3.4s, v7.4s, v6.4s         //pi4_tmp_ptr[3] = x3 - x2

    umull v15.4s, v15.4h, v16.4h
    dup v15.4s, v15.s[0]            //broadcast pu2_weigh_mat[0]*pu2_iscal_mat[0]

    //transpose the 4x4 of 32-bit lanes (two trn stages)
    trn1 v4.4s, v0.4s, v1.4s
    trn2 v5.4s, v0.4s, v1.4s
    trn1 v6.4s, v2.4s, v3.4s
    trn2 v7.4s, v2.4s, v3.4s

    trn1 v0.2d, v4.2d, v6.2d
    trn2 v2.2d, v4.2d, v6.2d
    trn1 v1.2d, v5.2d, v7.2d
    trn2 v3.2d, v5.2d, v7.2d
    //end transpose

    //horizontal 1-D inverse Hadamard (same butterfly on the transposed data)
    add v4.4s, v0.4s, v3.4s         //x0 = x4+x7
    add v5.4s, v1.4s, v2.4s         //x1 = x5+x6
    sub v6.4s, v1.4s, v2.4s         //x2 = x5-x6
    sub v7.4s, v0.4s, v3.4s         //x3 = x4-x7

    add v0.4s, v4.4s, v5.4s         //pi4_tmp_ptr[0] = x0 + x1
    add v1.4s, v7.4s, v6.4s         //pi4_tmp_ptr[1] = x3 + x2
    sub v2.4s, v4.4s, v5.4s         //pi4_tmp_ptr[2] = x0 - x1
    sub v3.4s, v7.4s, v6.4s         //pi4_tmp_ptr[3] = x3 - x2

    mul v0.4s, v0.4s, v15.4s        //p[i] = x[i] * trns_coeff, i = 0..3
    mul v1.4s, v1.4s, v15.4s        //p[i], i = 4..7
    mul v2.4s, v2.4s, v15.4s        //p[i], i = 8..11
    mul v3.4s, v3.4s, v15.4s        //p[i], i = 12..15

    sshl v0.4s, v0.4s, v14.4s       //q[i] = p[i] << (qp/6), i = 0..3
    sshl v1.4s, v1.4s, v14.4s       //q[i], i = 4..7
    sshl v2.4s, v2.4s, v14.4s       //q[i], i = 8..11
    sshl v3.4s, v3.4s, v14.4s       //q[i], i = 12..15

    sqrshrn v0.4h, v0.4s, #6        //c[i] = sat16((q[i] + 32) >> 6), i = 0..3
    sqrshrn v1.4h, v1.4s, #6        //c[i], i = 4..7
    sqrshrn v2.4h, v2.4s, #6        //c[i], i = 8..11
    sqrshrn v3.4h, v3.4s, #6        //c[i], i = 12..15

    st1 {v0.4h-v3.4h}, [x1]         //store the 4x4 result

    pop_v_regs
    ret


//******************************************************************************
//* @brief
//*  2x2 inverse Hadamard transform and scaling of the chroma DC coefficients.
//*  Both the U and V 2x2 DC blocks are processed in one call.
//*
//* @param[in]  x0  pi2_src        input 1x8 block of coeffs: first 4 are U,
//*                                next 4 are V
//* @param[out] x1  pi2_out        output 1x8 block
//* @param[in]  x2  pu2_iscal_mat  pointer to scaling list (only [0] is used)
//* @param[in]  x3  pu2_weigh_mat  pointer to weight matrix (only [0] is used)
//* @param[in]  w4  u4_qp_div_6    floor(qp / 6)
//*
//* @returns  none
//******************************************************************************
//void ih264_ihadamard_scaling_2x2_uv(WORD16 *pi2_src,
//                                    WORD16 *pi2_out,
//                                    const UWORD16 *pu2_iscal_mat,
//                                    const UWORD16 *pu2_weigh_mat,
//                                    UWORD32 u4_qp_div_6)

    .global ih264_ihadamard_scaling_2x2_uv_av8
ih264_ihadamard_scaling_2x2_uv_av8:

//Registers used
//  x0 : pi2_src
//  x1 : pi2_out
//  x2 : pu2_iscal_mat
//  x3 : pu2_weigh_mat
//  w4 : u4_qp_div_6
    push_v_regs
    ld1 {v26.h}[0], [x2]            //pu2_iscal_mat[0]
    ld1 {v27.h}[0], [x3]            //pu2_weigh_mat[0]

    sub w4, w4, #5                  //net shift = qp/6 - 5 (<< qp/6 combined with >> 5)
    dup v28.4s, w4                  //broadcast shift amount (negative => right shift in sshl)

    ld2 {v0.4h, v1.4h}, [x0]        //load 8 dc coeffs, de-interleaved:
                                    //i2_x4,i2_x6,i2_y4,i2_y6 -> v0
                                    //i2_x5,i2_x7,i2_y5,i2_y7 -> v1

    saddl v2.4s, v0.4h, v1.4h       //i4_x0 = i4_x4 + i4_x5; ... x2
    ssubl v4.4s, v0.4h, v1.4h       //i4_x1 = i4_x4 - i4_x5; ... x3

    umull v30.4s, v26.4h, v27.4h    //pu2_iscal_mat[0]*pu2_weigh_mat[0]
    dup v30.4s, v30.s[0]

    trn1 v0.4s, v2.4s, v4.4s
    trn2 v1.4s, v2.4s, v4.4s        //i4_x0 i4_x1 arranged for the second butterfly

    add v2.4s, v0.4s, v1.4s         //i4_x4 = i4_x0+i4_x2; .. i4_x5
    sub v3.4s, v0.4s, v1.4s         //i4_x6 = i4_x0-i4_x2; .. i4_x7

    mul v2.4s, v2.4s, v30.4s        //scale by iscal*weight
    mul v3.4s, v3.4s, v30.4s

    sshl v2.4s, v2.4s, v28.4s       //apply net shift (qp/6 - 5), sign decides direction
    sshl v3.4s, v3.4s, v28.4s

    xtn v0.4h, v2.4s                //narrow: i4_x4 i4_x5 i4_y4 i4_y5
    xtn v1.4h, v3.4s                //narrow: i4_x6 i4_x7 i4_y6 i4_y7

    //NOTE(review): st2 with .4s lanes interleaves 32-bit pairs, producing
    //x4 x5 x6 x7 y4 y5 y6 y7 in the first 16 bytes, but it writes 32 bytes in
    //total -- the second half comes from the undefined upper register halves.
    //Confirm pi2_out has room for 32 bytes or that callers ignore the excess.
    st2 {v0.4s-v1.4s}, [x1]
    pop_v_regs
    ret
//******************************************************************************
//* @file
//*  ih264_inter_pred_chroma_av8.s
//*
//* @brief
//*  Chroma inter-prediction interpolation (bilinear), AArch64 NEON.
//*
//* @author
//*  Ittiam
//*
//* @par List of Functions:
//*  - ih264_inter_pred_chroma_av8()
//******************************************************************************
//Replicated from ih264_inter_pred_filters.c

//******************************************************************************
//* @brief
//*  Chroma sample interpolation, sec 8.4.2.2.2 of the H.264 spec.
//*  Each output sample is the bilinear blend of a 2x2 neighbourhood:
//*    ((8-dx)(8-dy)*A + dx(8-dy)*B + (8-dx)dy*C + dx*dy*D + 32) >> 6
//*  The source holds interleaved U and V samples, so the horizontal
//*  neighbour is 2 bytes away (hence the ext ..., #2 shuffles).
//*
//* @param[in]  x0  pu1_src   source pointer (interleaved U/V)
//* @param[out] x1  pu1_dst   destination pointer
//* @param[in]  x2  src_strd  source stride
//* @param[in]  x3  dst_strd  destination stride
//* @param[in]  x4  u1_dx     dx of the sample position (0..7)
//* @param[in]  x5  u1_dy     dy of the sample position (0..7)
//* @param[in]  x6  ht        block height
//* @param[in]  x7  wd        block width (in chroma sample pairs)
//*
//* @returns  none
//******************************************************************************
//void ih264_inter_pred_chroma(UWORD8 *pu1_src,
//                             UWORD8 *pu1_dst,
//                             WORD32 src_strd,
//                             WORD32 dst_strd,
//                             UWORD8 u1_dx,
//                             UWORD8 u1_dy,
//                             WORD32 ht,
//                             WORD32 wd)
.text
.p2align 2
.include "ih264_neon_macros.s"


    .global ih264_inter_pred_chroma_av8

ih264_inter_pred_chroma_av8:

    push_v_regs
    stp x19, x20, [sp, #-16]!

    //compute the four bilinear weights and broadcast them:
    //v28 = (8-dx)(8-dy), v29 = dx(8-dy), v30 = (8-dx)dy, v31 = dx*dy
    sub x20, x4, #8                 //dx - 8
    neg x8, x20                     //x8 = 8 - u1_dx
    sub x20, x5, #8                 //dy - 8
    neg x9, x20                     //x9 = 8 - u1_dy
    mul x10, x8, x9                 //(8-dx)*(8-dy)
    mul x11, x4, x9                 //dx*(8-dy)

    dup v28.8b, w10
    dup v29.8b, w11

    mul x10, x8, x5                 //(8-dx)*dy
    mul x11, x4, x5                 //dx*dy

    dup v30.8b, w10
    dup v31.8b, w11

    subs x12, x7, #2                //if wd == 2 branch to loop_2
    beq loop_2
    subs x12, x7, #4                //if wd == 4 branch to loop_4
    beq loop_4

loop_8:                             //wd == 8: four rows per pass, two passes max
    ld1 {v0.8b, v1.8b, v2.8b}, [x0], x2 //load row0
    ext v3.8b, v0.8b , v1.8b , #2   //row0 shifted by one chroma pair
    ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //load row1
    umull v20.8h, v0.8b, v28.8b     //accumulate the four weighted taps for row0 (left half)
    ext v8.8b, v5.8b , v6.8b , #2
    umlal v20.8h, v3.8b, v29.8b
    ext v9.8b, v6.8b , v7.8b , #2
    umlal v20.8h, v5.8b, v30.8b
    ext v4.8b, v1.8b , v2.8b , #2
    umlal v20.8h, v8.8b, v31.8b
    sqrshrun v26.8b, v20.8h, #6     //(acc + 32) >> 6, saturating narrow
    umull v22.8h, v1.8b, v28.8b     //row0, right half
    ld1 {v10.8b, v11.8b, v12.8b}, [x0], x2 //load row2
    umlal v22.8h, v4.8b, v29.8b
    ext v13.8b, v10.8b , v11.8b , #2
    umlal v22.8h, v6.8b, v30.8b
    ext v14.8b, v11.8b , v12.8b , #2
    umlal v22.8h, v9.8b, v31.8b
    sqrshrun v27.8b, v22.8h, #6
    umull v24.8h, v5.8b, v28.8b     //row1, left half
    st1 { v26.8b, v27.8b}, [x1], x3 //store dest row0
    umlal v24.8h, v8.8b, v29.8b
    ld1 {v0.8b, v1.8b, v2.8b}, [x0], x2 //load row3
    umlal v24.8h, v10.8b, v30.8b
    ext v3.8b, v0.8b , v1.8b , #2
    umlal v24.8h, v13.8b, v31.8b
    ext v4.8b, v1.8b , v2.8b , #2
    umull v16.8h, v6.8b, v28.8b     //row1, right half
    sqrshrun v18.8b, v24.8h, #6
    umlal v16.8h, v9.8b, v29.8b
    umlal v16.8h, v11.8b, v30.8b
    umlal v16.8h, v14.8b, v31.8b
    sqrshrun v19.8b, v16.8h, #6
    st1 {v18.8b, v19.8b}, [x1], x3  //store dest row1
    umull v20.8h, v10.8b, v28.8b    //row2, left half
    umlal v20.8h, v13.8b, v29.8b
    umlal v20.8h, v0.8b, v30.8b
    umlal v20.8h, v3.8b, v31.8b
    sqrshrun v26.8b, v20.8h, #6
    umull v24.8h, v11.8b, v28.8b    //row2, right half
    ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //load row4
    umlal v24.8h, v14.8b, v29.8b
    ext v8.8b, v5.8b , v6.8b , #2
    umlal v24.8h, v1.8b, v30.8b
    ext v9.8b, v6.8b , v7.8b , #2
    umlal v24.8h, v4.8b, v31.8b
    umull v20.8h, v0.8b, v28.8b     //row3, left half
    sqrshrun v27.8b, v24.8h, #6
    umlal v20.8h, v3.8b, v29.8b
    st1 { v26.8b, v27.8b}, [x1], x3 //store dest row2
    umlal v20.8h, v5.8b, v30.8b
    umlal v20.8h, v8.8b, v31.8b
    umull v22.8h, v1.8b, v28.8b     //row3, right half
    umlal v22.8h, v4.8b, v29.8b
    umlal v22.8h, v6.8b, v30.8b
    sqrshrun v26.8b, v20.8h, #6
    umlal v22.8h, v9.8b, v31.8b
    subs x12, x6, #4
    sqrshrun v27.8b, v22.8h, #6
    st1 { v26.8b, v27.8b}, [x1], x3 //store dest row3

    beq end_func                    //done if ht == 4

    //second pass: rows 4..7 (ht == 8); rows 4 of src were already loaded above
    ld1 {v10.8b, v11.8b, v12.8b}, [x0], x2 //load row5
    ext v13.8b, v10.8b , v11.8b , #2
    umull v24.8h, v5.8b, v28.8b     //row4, left half
    ext v14.8b, v11.8b , v12.8b , #2
    ld1 {v0.8b, v1.8b, v2.8b}, [x0], x2 //load row6
    umlal v24.8h, v8.8b, v29.8b
    umlal v24.8h, v10.8b, v30.8b
    umlal v24.8h, v13.8b, v31.8b
    ext v3.8b, v0.8b , v1.8b , #2
    umull v16.8h, v6.8b, v28.8b     //row4, right half
    sqrshrun v18.8b, v24.8h, #6
    umlal v16.8h, v9.8b, v29.8b
    umlal v16.8h, v11.8b, v30.8b
    umlal v16.8h, v14.8b, v31.8b
    ext v4.8b, v1.8b , v2.8b , #2
    sqrshrun v19.8b, v16.8h, #6
    st1 { v18.8b, v19.8b}, [x1], x3 //store dest row4
    umull v20.8h, v10.8b, v28.8b    //row5, left half
    umlal v20.8h, v13.8b, v29.8b
    umlal v20.8h, v0.8b, v30.8b
    umlal v20.8h, v3.8b, v31.8b
    ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //load row7
    sqrshrun v26.8b, v20.8h, #6
    umull v24.8h, v11.8b, v28.8b    //row5, right half
    umlal v24.8h, v14.8b, v29.8b
    ext v8.8b, v5.8b , v6.8b , #2
    umlal v24.8h, v1.8b, v30.8b
    umlal v24.8h, v4.8b, v31.8b
    ext v9.8b, v6.8b , v7.8b , #2
    sqrshrun v27.8b, v24.8h, #6
    st1 {v26.8b, v27.8b}, [x1], x3  //store dest row5
    umull v20.8h, v0.8b, v28.8b     //row6, left half
    umlal v20.8h, v3.8b, v29.8b
    umlal v20.8h, v5.8b, v30.8b
    umlal v20.8h, v8.8b, v31.8b
    ld1 {v10.8b, v11.8b, v12.8b}, [x0], x2 //load row8
    sqrshrun v26.8b, v20.8h, #6
    umull v22.8h, v1.8b, v28.8b     //row6, right half
    umlal v22.8h, v4.8b, v29.8b
    umlal v22.8h, v6.8b, v30.8b
    ext v13.8b, v10.8b , v11.8b , #2
    umlal v22.8h, v9.8b, v31.8b
    ext v14.8b, v11.8b , v12.8b , #2
    sqrshrun v27.8b, v22.8h, #6
    st1 { v26.8b, v27.8b}, [x1], x3 //store dest row6
    umull v24.8h, v5.8b, v28.8b     //row7, left half
    umlal v24.8h, v8.8b, v29.8b
    umlal v24.8h, v10.8b, v30.8b
    umlal v24.8h, v13.8b, v31.8b
    umull v16.8h, v6.8b, v28.8b     //row7, right half
    sqrshrun v18.8b, v24.8h, #6
    umlal v16.8h, v9.8b, v29.8b
    umlal v16.8h, v11.8b, v30.8b
    umlal v16.8h, v14.8b, v31.8b
    sqrshrun v19.8b, v16.8h, #6
    st1 { v18.8b, v19.8b}, [x1], x3 //store dest row7
    b end_func

loop_4:                             //wd == 4: two rows per group, up to ht == 8
    ld1 {v0.8b, v1.8b}, [x0], x2    //load row0
    ext v2.8b, v0.8b , v1.8b , #2   //row0 shifted by one chroma pair
    ld1 {v3.8b, v4.8b}, [x0], x2    //load row1
    ext v5.8b, v3.8b , v4.8b , #2
    umull v20.8h, v0.8b, v28.8b     //weighted taps for row0
    umlal v20.8h, v2.8b, v29.8b
    umlal v20.8h, v3.8b, v30.8b
    umlal v20.8h, v5.8b, v31.8b
    ld1 {v6.8b, v7.8b}, [x0], x2    //load row2
    sqrshrun v26.8b, v20.8h, #6     //(acc + 32) >> 6
    ext v8.8b, v6.8b , v7.8b , #2
    st1 {v26.8b}, [x1], x3          //store dest row0
    umull v22.8h, v3.8b, v28.8b     //weighted taps for row1
    umlal v22.8h, v5.8b, v29.8b
    umlal v22.8h, v6.8b, v30.8b
    umlal v22.8h, v8.8b, v31.8b
    subs x12, x6, #2
    sqrshrun v27.8b, v22.8h, #6
    st1 {v27.8b}, [x1], x3          //store dest row1
    beq end_func                    //done if ht == 2

    ld1 {v9.8b, v10.8b}, [x0], x2   //load row3
    ext v11.8b, v9.8b , v10.8b , #2
    umull v24.8h, v6.8b, v28.8b     //weighted taps for row2
    umlal v24.8h, v8.8b, v29.8b
    umlal v24.8h, v9.8b, v30.8b
    umlal v24.8h, v11.8b, v31.8b
    ld1 {v0.8b, v1.8b}, [x0], x2    //load row4
    sqrshrun v16.8b, v24.8h, #6
    ext v2.8b, v0.8b , v1.8b , #2
    st1 {v16.8b}, [x1], x3          //store dest row2
    umull v18.8h, v9.8b, v28.8b     //weighted taps for row3
    umlal v18.8h, v11.8b, v29.8b
    umlal v18.8h, v0.8b, v30.8b
    umlal v18.8h, v2.8b, v31.8b
    subs x12, x6, #4
    sqrshrun v17.8b, v18.8h, #6
    st1 {v17.8b}, [x1], x3          //store dest row3
    beq end_func                    //done if ht == 4

    ld1 {v3.8b, v4.8b}, [x0], x2    //load row5
    ext v5.8b, v3.8b , v4.8b , #2
    umull v20.8h, v0.8b, v28.8b     //weighted taps for row4
    umlal v20.8h, v2.8b, v29.8b
    umlal v20.8h, v3.8b, v30.8b
    umlal v20.8h, v5.8b, v31.8b
    ld1 {v6.8b, v7.8b}, [x0], x2    //load row6
    sqrshrun v26.8b, v20.8h, #6
    ext v8.8b, v6.8b , v7.8b , #2
    st1 {v26.8b}, [x1], x3          //store dest row4
    umull v22.8h, v3.8b, v28.8b     //weighted taps for row5
    umlal v22.8h, v5.8b, v29.8b
    umlal v22.8h, v6.8b, v30.8b
    umlal v22.8h, v8.8b, v31.8b
    ld1 {v9.8b, v10.8b}, [x0], x2   //load row7
    sqrshrun v27.8b, v22.8h, #6
    ext v11.8b, v9.8b , v10.8b , #2
    st1 {v27.8b}, [x1], x3          //store dest row5
    umull v24.8h, v6.8b, v28.8b     //weighted taps for row6
    umlal v24.8h, v8.8b, v29.8b
    umlal v24.8h, v9.8b, v30.8b
    umlal v24.8h, v11.8b, v31.8b
    ld1 {v0.8b, v1.8b}, [x0], x2    //load row8
    sqrshrun v16.8b, v24.8h, #6
    ext v2.8b, v0.8b , v1.8b , #2
    st1 {v16.8b}, [x1], x3          //store dest row6
    umull v18.8h, v9.8b, v28.8b     //weighted taps for row7
    umlal v18.8h, v11.8b, v29.8b
    umlal v18.8h, v0.8b, v30.8b
    umlal v18.8h, v2.8b, v31.8b
    sqrshrun v17.8b, v18.8h, #6
    st1 {v17.8b}, [x1], x3          //store dest row7
    b end_func

loop_2:                             //wd == 2: two rows per group, up to ht == 4
    ld1 {v0.8b}, [x0], x2           //load row0
    ext v2.8b, v0.8b , v0.8b , #2   //row0 shifted by one chroma pair (rotate, low lanes valid)
    ld1 {v3.8b}, [x0], x2           //load row1
    ext v5.8b, v3.8b , v3.8b , #2
    umull v20.8h, v0.8b, v28.8b     //weighted taps for row0
    umlal v20.8h, v2.8b, v29.8b
    umlal v20.8h, v3.8b, v30.8b
    umlal v20.8h, v5.8b, v31.8b
    ld1 {v6.8b}, [x0], x2           //load row2
    sqrshrun v26.8b, v20.8h, #6     //(acc + 32) >> 6
    ext v8.8b, v6.8b , v6.8b , #2
    st1 {v26.s}[0], [x1], x3        //store dest row0 (4 bytes)
    umull v22.8h, v3.8b, v28.8b     //weighted taps for row1
    umlal v22.8h, v5.8b, v29.8b
    umlal v22.8h, v6.8b, v30.8b
    umlal v22.8h, v8.8b, v31.8b
    subs x12, x6, #2
    sqrshrun v27.8b, v22.8h, #6
    st1 {v27.s}[0], [x1], x3        //store dest row1 (4 bytes)
    beq end_func                    //done if ht == 2

    ld1 {v9.8b}, [x0], x2           //load row3
    ext v11.8b, v9.8b , v9.8b , #2
    umull v24.8h, v6.8b, v28.8b     //weighted taps for row2
    umlal v24.8h, v8.8b, v29.8b
    umlal v24.8h, v9.8b, v30.8b
    umlal v24.8h, v11.8b, v31.8b
    ld1 {v0.8b}, [x0], x2           //load row4
    sqrshrun v16.8b, v24.8h, #6
    ext v2.8b, v0.8b , v0.8b , #2
    st1 {v16.s}[0], [x1], x3        //store dest row2 (4 bytes)
    umull v18.8h, v9.8b, v28.8b     //weighted taps for row3
    umlal v18.8h, v11.8b, v29.8b
    umlal v18.8h, v0.8b, v30.8b
    umlal v18.8h, v2.8b, v31.8b
    sqrshrun v17.8b, v18.8h, #6
    st1 {v17.s}[0], [x1], x3        //store dest row3 (4 bytes)


end_func:
    ldp x19, x20, [sp], #16         //restore callee-saved registers
    pop_v_regs
    ret
+//* +//* @author +//* Ittiam +//* +//* @par List of Functions: +//* +//* - ih264_inter_pred_luma_horz_av8() +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ + +///* All the functions here are replicated from ih264_inter_pred_filters.c +// + +///** +///** +//******************************************************************************* +//* +//* @brief +//* Interprediction luma filter for horizontal input +//* +//* @par Description: +//* Applies a 6 tap horizontal filter .The output is clipped to 8 bits +//* sec 8.4.2.2.1 titled "Luma sample interpolation process" +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ht +//* integer height of the array +//* +//* @param[in] wd +//* integer width of the array +//* +//* @returns +//* +// @remarks +//* None +//* +//******************************************************************************* +//*/ + +//void ih264_inter_pred_luma_horz ( +// UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ht, +// WORD32 wd ) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ht +// x5 => wd + +.text +.p2align 2 + +.include "ih264_neon_macros.s" + + + + .global ih264_inter_pred_luma_horz_av8 + +ih264_inter_pred_luma_horz_av8: + + + + + // STMFD sp!, {x4-x12, x14} //store register values to stack + push_v_regs + stp x19, x20, [sp, #-16]! 
+ sub x0, x0, #2 //pu1_src-2 + sub x14, x4, #16 + movi v0.8b, #5 //filter coeff + subs x12, x5, #8 //if wd=8 branch to loop_8 + movi v1.8b, #20 //filter coeff + beq loop_8 + + subs x12, x5, #4 //if wd=4 branch to loop_4 + beq loop_4 + +loop_16: //when wd=16 + //// Processing row0 and row1 + ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row0 + add x14, x14, #1 //for checking loop + ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row0) + ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row1 + ext v30.8b, v3.8b , v4.8b, #5 ////extract a[5] (column2,row0) + uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row0) + ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row1) + uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row0) + ext v27.8b, v6.8b , v7.8b, #5 ////extract a[5] (column2,row1) + uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row1) + ext v31.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row0) + uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row1) + ext v30.8b, v3.8b , v4.8b, #2 ////extract a[2] (column2,row0) + umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) + ext v28.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row1) + umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0) + ext v27.8b, v6.8b , v7.8b, #2 ////extract a[2] (column2,row1) + umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row1) + ext v31.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row0) + umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row1) + ext v30.8b, v3.8b , v4.8b, #3 ////extract a[3] (column2,row0) + umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) + ext v28.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row1) + umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0) + ext v27.8b, v6.8b , v7.8b, #3 ////extract a[3] (column2,row1) + umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row1) + ext v31.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row0) + umlal v16.8h, v27.8b, 
v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row1) + ext v30.8b, v3.8b , v4.8b, #1 ////extract a[1] (column2,row0) + umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + ext v28.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row1) + umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + ext v27.8b, v6.8b , v7.8b, #1 ////extract a[1] (column2,row1) + umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) + ext v31.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row0) + umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1) + ext v30.8b, v3.8b , v4.8b, #4 ////extract a[4] (column2,row0) + umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + ext v28.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row1) + umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + ext v27.8b, v6.8b , v7.8b, #4 ////extract a[4] (column2,row1) + umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) + ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row2 + umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1) + + sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row3 + sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row2) + st1 {v20.8b, v21.8b}, [x1], x3 ////Store dest row0 + sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) + ext v30.8b, v3.8b , v4.8b, #5 ////extract a[5] (column2,row2) + sqrshrun v24.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1) + + + +//// Processing row2 and row3 + ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row3) + uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row2) + st1 {v23.8b, v24.8b}, 
[x1], x3 ////Store dest row1 + uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row2) + ext v27.8b, v6.8b , v7.8b, #5 ////extract a[5] (column2,row3) + uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row3) + ext v31.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row2) + uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row3) + ext v30.8b, v3.8b , v4.8b, #2 ////extract a[2] (column2,row2) + umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row2) + ext v27.8b, v6.8b , v7.8b, #2 ////extract a[2] (column2,row3) + umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row2) + ext v28.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row3) + umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row3) + ext v31.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row2) + umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row3) + ext v30.8b, v3.8b , v4.8b, #3 ////extract a[3] (column2,row2) + umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row2) + ext v28.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row3) + umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row2) + ext v27.8b, v6.8b , v7.8b, #3 ////extract a[3] (column2,row3) + umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row3) + ext v31.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row2) + umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row3) + ext v30.8b, v3.8b , v4.8b, #1 ////extract a[1] (column2,row2) + umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row2) + ext v28.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row3) + umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row2) + ext v27.8b, v6.8b , v7.8b, #1 ////extract a[1] (column2,row3) + umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row3) + ext v31.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row2) + umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row3) + ext v30.8b, v3.8b , v4.8b, #4 ////extract 
a[4] (column2,row2) + umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row2) + ext v28.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row3) + umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row2) + ext v27.8b, v6.8b , v7.8b, #4 ////extract a[4] (column2,row3) + umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row3) + ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row4 + umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row3) + + sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row2) + ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row5 + sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row2) + ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row4) + st1 {v20.8b, v21.8b}, [x1], x3 ////Store dest row2 + sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row3) + ext v30.8b, v3.8b , v4.8b, #5 ////extract a[5] (column2,row4) + sqrshrun v24.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row3) + + +//// Processing row4 and row5 + ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row5) + uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row4) + st1 {v23.8b, v24.8b}, [x1], x3 ////Store dest row3 + uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row4) + ext v27.8b, v6.8b , v7.8b, #5 ////extract a[5] (column2,row5) + uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row5) + ext v31.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row4) + uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row5) + ext v30.8b, v3.8b , v4.8b, #2 ////extract a[2] (column2,row4) + umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row4) + ext v27.8b, v6.8b , v7.8b, #2 ////extract a[2] (column2,row5) + umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row4) + ext v28.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row5) + umlal 
v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row5) + ext v31.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row4) + umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row5) + ext v30.8b, v3.8b , v4.8b, #3 ////extract a[3] (column2,row4) + umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row4) + ext v28.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row5) + umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row4) + ext v27.8b, v6.8b , v7.8b, #3 ////extract a[3] (column2,row5) + umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row5) + ext v31.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row4) + umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row5) + ext v30.8b, v3.8b , v4.8b, #1 ////extract a[1] (column2,row4) + umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row4) + ext v28.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row5) + umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row4) + ext v27.8b, v6.8b , v7.8b, #1 ////extract a[1] (column2,row5) + umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row4) + ext v31.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row4) + umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row5) + ext v30.8b, v3.8b , v4.8b, #4 ////extract a[4] (column2,row4) + umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row4) + ext v28.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row5) + umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row4) + ext v27.8b, v6.8b , v7.8b, #4 ////extract a[4] (column2,row5) + umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row5) + ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row6 + umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row5) + + sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row4) + ld1 {v5.8b, v6.8b, 
v7.8b}, [x0], x2 //// Load row7 + sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row4) + ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row6) + st1 {v20.8b, v21.8b}, [x1], x3 ////Store dest row2 + sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row5) + ext v30.8b, v3.8b , v4.8b, #5 ////extract a[5] (column2,row6) + sqrshrun v24.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row5) + + + + //// Processing row6 and row7 + + ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row7) + uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row6) + st1 {v23.8b, v24.8b}, [x1], x3 ////Store dest row5 + uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row6) + ext v27.8b, v6.8b , v7.8b, #5 ////extract a[5] (column2,row7) + uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row7) + ext v31.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row6) + uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row7) + ext v30.8b, v3.8b , v4.8b, #2 ////extract a[2] (column2,row6) + umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row6) + ext v27.8b, v6.8b , v7.8b, #2 ////extract a[2] (column2,row7) + umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row6) + ext v28.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row7) + umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row7) + ext v31.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row6) + umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row7) + ext v30.8b, v3.8b , v4.8b, #3 ////extract a[3] (column2,row6) + umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row6) + ext v28.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row7) + umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row6) + ext v27.8b, v6.8b , v7.8b, #3 ////extract a[3] (column2,row7) + umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row7) + ext v31.8b, v2.8b , v3.8b, #1 ////extract a[1] 
(column1,row6) + umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row7) + ext v30.8b, v3.8b , v4.8b, #1 ////extract a[1] (column2,row6) + umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row6) + ext v28.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row7) + umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row6) + ext v27.8b, v6.8b , v7.8b, #1 ////extract a[1] (column2,row7) + umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row6) + ext v31.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row6) + umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row7) + ext v30.8b, v3.8b , v4.8b, #4 ////extract a[4] (column2,row6) + umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row6) + ext v28.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row7) + umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row6) + ext v27.8b, v6.8b , v7.8b, #4 ////extract a[4] (column2,row6) + + sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row6) + umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row7) + sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row6) + umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row7) + sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row7) + st1 {v20.8b, v21.8b}, [x1], x3 ////Store dest row6 + sqrshrun v24.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row7) + subs x12, x14, #1 // if height==16 - looping + st1 {v23.8b, v24.8b}, [x1], x3 ////Store dest row7 + + + + beq loop_16 + b end_func + + + +loop_8: +//// Processing row0 and row1 + + + ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row1 + add x14, x14, #1 //for checking loop + ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row1) + ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row0 + 
ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row1) + ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row0) + ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row1) + ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row1) + ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row1) + uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row1) + ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row0) + umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row1) + umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row1) + umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) + umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) + ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row0) + uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row0) + ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row0) + ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row0) + ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row2 + umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) + umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) + umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row3 + sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + + //// Processing row2 and row3 + ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row3) + ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row3) + ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row2) + uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row3) + st1 {v23.8b}, [x1], x3 ////Store dest row0 + ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row2) + ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row3) + sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 
16) >> 5 (column1,row1) + ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row3) + ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row2) + umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row3) + umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row3) + umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row3) + umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row3) + st1 {v20.8b}, [x1], x3 ////Store dest row1 + uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row2) + ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row2) + ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row2) + ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row2) + ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row4 + umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row2) + umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row2) + umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row2) + umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row2) + ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row3 + subs x9, x4, #4 + sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row3) + ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row5) + ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row5) + ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row4) + uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row5) + ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row5) + sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row2) + ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row5) + ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row4) + st1 {v20.8b}, [x1], x3 ////Store dest row2 + ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row4) + uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row4) + st1 {v23.8b}, [x1], x3 ////Store dest row3 + beq 
end_func // Branch if height==4 + +//// Processing row4 and row5 + ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row5) + umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row5) + umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row5) + umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row5) + umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row5) + ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row4) + ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row4) + ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row6 + umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row4) + umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row4) + umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row4) + umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row4) + sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row5) + ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row7 + ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row6) + ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row7) + ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row7) + uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row7) + ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row7) + ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row7) + sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row4) + ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row6) + ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row6) + st1 {v20.8b}, [x1], x3 ////Store dest row4 + ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row6) + uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row6) + ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row6) + umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row6) + umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 
(column1,row6) + umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row6) + umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row6) + //// Processing row6 and row7 + st1 {v23.8b}, [x1], x3 ////Store dest row5 + ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row7) + umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row7) + umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row7) + umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row7) + umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row7) + sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row6) + subs x12, x14, #1 + st1 {v20.8b}, [x1], x3 ////Store dest row6 + sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row7) + st1 {v23.8b}, [x1], x3 ////Store dest row7 + + beq loop_8 //looping if height ==16 + + b end_func +loop_4: + ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row1 + ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row1) + ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row0 + ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row1) + ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row0) + uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row1) + ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row1) + ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row1) + ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row1) + ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row0) + umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row1) + umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row1) + umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) + umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) + uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row0) + ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row0) + 
ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row0) + ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row0) + ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row2 + umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) + umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) + umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row3 + ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row3) + ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row3) + sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row2) + ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row2) + st1 {v23.s}[0], [x1], x3 ////Store dest row0 + ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row3) + ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row3) + ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row2) + sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) + ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row2) + ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row2) + + //// Processing row2 and row3 + st1 {v20.s}[0], [x1], x3 ////Store dest row1 + uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row3) + ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row2) + umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row3) + umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row3) + umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row3) + umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row3) + uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row2) + umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row2) + umlal v8.8h, v30.8b, v1.8b //// a0 + a5 
+ 20a2 (column1,row2) + umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row2) + umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row2) + sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row3) + sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row2) + st1 {v20.s}[0], [x1], x3 ////Store dest row2 + subs x4, x4, #8 // Loop if height =8 + st1 {v23.s}[0], [x1], x3 ////Store dest row3 + beq loop_4 + +end_func: + // LDMFD sp!,{x4-x12,PC} //Restoring registers from stack + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + diff --git a/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s b/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s new file mode 100755 index 0000000..38934c9 --- /dev/null +++ b/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s @@ -0,0 +1,452 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. 
//* Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_inter_pred_luma_vert_av8.s
//*
//* @brief
//*  Contains function definitions for inter prediction interpolation.
//*
//* @author
//*  Ittiam
//*
//* @par List of Functions:
//*
//*  - ih264_inter_pred_luma_vert_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

///* All the functions here are replicated from ih264_inter_pred_filters.c
//

///**
///**
///**
// *******************************************************************************
// *
// * @brief
// *     Inter prediction luma filter for vertical input
// *
// * @par Description:
// *    Applies a 6 tap vertical filter. The output is clipped to 8 bits
// *    sec 8.4.2.2.1 titled "Luma sample interpolation process"
// *
// * @param[in] pu1_src
// *  UWORD8 pointer to the source
// *
// * @param[out] pu1_dst
// *  UWORD8 pointer to the destination
// *
// * @param[in] src_strd
// *  integer source stride
// *
// * @param[in] dst_strd
// *  integer destination stride
// *
// * @param[in] ht
// *  integer height of the array
// *
// * @param[in] wd
// *  integer width of the array
// *
// * @returns
// *
// * @remarks
// *  None
// *
// *******************************************************************************

//void ih264_inter_pred_luma_vert (
//            UWORD8 *pu1_src,
//            UWORD8 *pu1_dst,
//            WORD32 src_strd,
//            WORD32 dst_strd,
//            WORD32 ht,
//            WORD32 wd )

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//    x4 =>  ht
//    x5 =>  wd

.text
.p2align 2
// push_v_regs / pop_v_regs / swp below are macros defined in this include
.include "ih264_neon_macros.s"

    .global ih264_inter_pred_luma_vert_av8

ih264_inter_pred_luma_vert_av8:

    // STMFD sp!, {x4-x12, x14}          //store register values to stack
    push_v_regs
    stp x19, x20, [sp, #-16]!

    sub x0, x0, x2, lsl #1               //pu1_src-2*src_strd: back up two rows so the 6-tap window is centered

    sub x14, x4, #16                     //x14 = ht - 16 (loop bookkeeping: 0 => a second 8-row pass is needed)
    movi v22.8h, #20                     // Filter coeff 20 (0x14) into Q11

    subs x12, x5, #8                     //if wd=8 branch to loop_8
    movi v24.8h, #5                      // Filter coeff 5 (0x5) into Q12
    beq loop_8_start

    subs x12, x5, #4                     //if wd=4 branch to loop_4
    beq loop_4_start


    //// wd == 16: prime the 6-row sliding window (rows 0..5), 16 pixels per row
    ld1 {v0.2s, v1.2s}, [x0], x2         // Vector load from src[0_0]
    ld1 {v2.2s, v3.2s}, [x0], x2         // Vector load from src[1_0]
    ld1 {v4.2s, v5.2s}, [x0], x2         // Vector load from src[2_0]
    ld1 {v6.2s, v7.2s}, [x0], x2         // Vector load from src[3_0]
    add x14, x14, #1                     //for checking loop
    ld1 {v8.2s, v9.2s}, [x0], x2         // Vector load from src[4_0]
    uaddl v12.8h, v4.8b, v6.8b           // temp1 = src[2_0] + src[3_0]
    ld1 {v10.2s, v11.2s}, [x0], x2       // Vector load from src[5_0]

loop_16: //when wd=16; produces 8 output rows per iteration, software-pipelined

    uaddl v14.8h, v0.8b, v10.8b          // temp = src[0_0] + src[5_0]
    uaddl v16.8h, v2.8b, v8.8b           // temp2 = src[1_0] + src[4_0]
    mla v14.8h, v12.8h, v22.8h           // temp += temp1 * 20
    uaddl v20.8h, v1.8b, v11.8b          // temp4 = src[0_8] + src[5_8]
    uaddl v18.8h, v5.8b, v7.8b           // temp3 = src[2_8] + src[3_8]
    mla v20.8h, v18.8h, v22.8h           // temp4 += temp3 * 20
    ld1 {v0.2s, v1.2s}, [x0], x2
    uaddl v26.8h, v3.8b, v9.8b           // temp5 = src[1_8] + src[4_8]
    uaddl v12.8h, v6.8b, v8.8b
    mls v14.8h, v16.8h, v24.8h           // temp -= temp2 * 5
    uaddl v16.8h, v2.8b, v0.8b
    uaddl v18.8h, v4.8b, v10.8b
    mla v16.8h, v12.8h, v22.8h
    mls v20.8h, v26.8h, v24.8h           // temp4 -= temp5 * 5
    uaddl v26.8h, v5.8b, v11.8b
    uaddl v12.8h, v7.8b, v9.8b
    sqrshrun v30.8b, v14.8h, #5          // dst[0_0] = CLIP_U8((temp +16) >> 5)
    uaddl v14.8h, v3.8b, v1.8b
    ld1 {v2.2s, v3.2s}, [x0], x2
    mla v14.8h, v12.8h, v22.8h
    mls v16.8h, v18.8h, v24.8h
    sqrshrun v31.8b, v20.8h, #5          // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
    uaddl v18.8h, v4.8b, v2.8b
    uaddl v12.8h, v8.8b, v10.8b

    st1 {v30.2s, v31.2s}, [x1], x3       // Vector store to dst[0_0]
    mla v18.8h, v12.8h, v22.8h
    uaddl v20.8h, v6.8b, v0.8b
    mls v14.8h, v26.8h, v24.8h
    sqrshrun v30.8b, v16.8h, #5
    uaddl v12.8h, v9.8b, v11.8b
    uaddl v16.8h, v5.8b, v3.8b
    uaddl v26.8h, v7.8b, v1.8b
    mla v16.8h, v12.8h, v22.8h
    mls v18.8h, v20.8h, v24.8h
    ld1 {v4.2s, v5.2s}, [x0], x2

    sqrshrun v31.8b, v14.8h, #5
    uaddl v12.8h, v10.8b, v0.8b
    uaddl v14.8h, v6.8b, v4.8b
    uaddl v20.8h, v8.8b, v2.8b
    mla v14.8h, v12.8h, v22.8h
    mls v16.8h, v26.8h, v24.8h
    st1 {v30.2s, v31.2s}, [x1], x3       //store row 1
    sqrshrun v30.8b, v18.8h, #5
    uaddl v18.8h, v7.8b, v5.8b
    uaddl v12.8h, v11.8b, v1.8b
    mla v18.8h, v12.8h, v22.8h
    uaddl v26.8h, v9.8b, v3.8b
    mls v14.8h, v20.8h, v24.8h
    ld1 {v6.2s, v7.2s}, [x0], x2
    sqrshrun v31.8b, v16.8h, #5
    mls v18.8h, v26.8h, v24.8h
    uaddl v12.8h, v0.8b, v2.8b           // temp1 = src[2_0] + src[3_0]
    st1 {v30.2s, v31.2s}, [x1], x3       //store row 2
    uaddl v16.8h, v10.8b, v4.8b          // temp2 = src[1_0] + src[4_0]
    uaddl v20.8h, v9.8b, v7.8b           // temp4 = src[0_8] + src[5_8]
    sqrshrun v30.8b, v14.8h, #5
    uaddl v26.8h, v5.8b, v11.8b          // temp5 = src[1_8] + src[4_8]
    uaddl v14.8h, v8.8b, v6.8b           // temp = src[0_0] + src[5_0]
    sqrshrun v31.8b, v18.8h, #5
    mla v14.8h, v12.8h, v22.8h           // temp += temp1 * 20
    uaddl v18.8h, v1.8b, v3.8b           // temp3 = src[2_8] + src[3_8]
    st1 {v30.2s, v31.2s}, [x1], x3       //store row 3
    // 4 rows processed
    mla v20.8h, v18.8h, v22.8h           // temp4 += temp3 * 20
    ld1 {v8.2s, v9.2s}, [x0], x2
    uaddl v12.8h, v2.8b, v4.8b
    uaddl v18.8h, v3.8b, v5.8b
    mls v14.8h, v16.8h, v24.8h           // temp -= temp2 * 5
    uaddl v28.8h, v9.8b, v11.8b
    uaddl v16.8h, v6.8b, v0.8b
    mla v28.8h, v18.8h, v22.8h           // temp4 += temp3 * 20
    mls v20.8h, v26.8h, v24.8h           // temp4 -= temp5 * 5
    uaddl v26.8h, v1.8b, v7.8b
    uaddl v18.8h, v5.8b, v7.8b
    sqrshrun v30.8b, v14.8h, #5          // dst[0_0] = CLIP_U8((temp +16) >> 5)
    uaddl v14.8h, v8.8b, v10.8b

    sqrshrun v31.8b, v20.8h, #5          // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
    ld1 {v10.2s, v11.2s}, [x0], x2
    mls v28.8h, v26.8h, v24.8h           // temp4 -= temp5 * 5
    st1 {v30.2s, v31.2s}, [x1], x3       // store row 4
    mla v14.8h, v12.8h, v22.8h           // temp += temp1 * 20
    uaddl v20.8h, v11.8b, v1.8b
    uaddl v26.8h, v3.8b, v9.8b
    mla v20.8h, v18.8h, v22.8h           // temp4 += temp3 * 20
    uaddl v12.8h, v6.8b, v4.8b
    uaddl v18.8h, v7.8b, v9.8b
    sqrshrun v31.8b, v28.8h, #5          // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
    mls v14.8h, v16.8h, v24.8h           // temp -= temp2 * 5
    uaddl v16.8h, v8.8b, v2.8b
    sqrshrun v30.8b, v14.8h, #5          // dst[0_0] = CLIP_U8((temp +16) >> 5)
    mls v20.8h, v26.8h, v24.8h           // temp4 -= temp5 * 5
    uaddl v14.8h, v10.8b, v0.8b
    st1 {v30.2s, v31.2s}, [x1], x3       // store row 5
    mla v14.8h, v12.8h, v22.8h           // temp += temp1 * 20
    ld1 {v0.2s, v1.2s}, [x0], x2
    uaddl v26.8h, v5.8b, v11.8b
    uaddl v12.8h, v8.8b, v6.8b
    uaddl v28.8h, v0.8b, v2.8b
    sqrshrun v31.8b, v20.8h, #5          // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
    mla v28.8h, v12.8h, v22.8h           // temp += temp1 * 20
    uaddl v20.8h, v1.8b, v3.8b
    mls v14.8h, v16.8h, v24.8h           // temp -= temp2 * 5
    mla v20.8h, v18.8h, v22.8h           // temp4 += temp3 * 20
    uaddl v16.8h, v10.8b, v4.8b
    sqrshrun v30.8b, v14.8h, #5          // dst[0_0] = CLIP_U8((temp +16) >> 5)
    mov v2.8b, v6.8b
    mov v3.8b, v7.8b
    mls v28.8h, v16.8h, v24.8h           // temp -= temp2 * 5
    st1 {v30.2s, v31.2s}, [x1], x3       // store row 6
    sqrshrun v30.8b, v28.8h, #5          // dst[0_0] = CLIP_U8((temp +16) >> 5)

    // rotate the sliding window registers for the next 8-row pass
    // (swp is a register-swap macro from ih264_neon_macros.s)
    swp v0.8b v4.8b
    swp v1.8b v5.8b



    mls v20.8h, v26.8h, v24.8h           // temp4 -= temp5 * 5
    mov v6.8b, v10.8b
    mov v7.8b, v11.8b
    subs x12, x14, #1                    // x12 = ht - 16: zero iff another 8-row pass remains

    swp v4.8b v8.8b
    swp v5.8b v9.8b


    sqrshrun v31.8b, v20.8h, #5          // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
    st1 {v30.2s, v31.2s}, [x1], x3       // store row 7
    bne end_func                         //if height =8 end function
    add x14, x14, #1                     //for checking loop (ensures the next subs exits after the second pass)
    ld1 {v10.2s, v11.2s}, [x0], x2
    uaddl v12.8h, v4.8b, v6.8b           // temp1 = src[2_0] + src[3_0]

    b loop_16                            // looping if height =16

loop_8_start:
//// Processing row0 and row1 (wd == 8, 8 pixels per row)

    ld1 {v0.2s}, [x0], x2                // Vector load from src[0_0]
    ld1 {v1.2s}, [x0], x2                // Vector load from src[1_0]
    ld1 {v2.2s}, [x0], x2                // Vector load from src[2_0]
    ld1 {v3.2s}, [x0], x2                // Vector load from src[3_0]
    add x14, x14, #1                     //for checking loop
    ld1 {v4.2s}, [x0], x2                // Vector load from src[4_0]
    ld1 {v5.2s}, [x0], x2                // Vector load from src[5_0]

loop_8:
    //for checking loop
    uaddl v6.8h, v2.8b, v3.8b            // temp1 = src[2_0] + src[3_0]
    uaddl v8.8h, v0.8b, v5.8b            // temp = src[0_0] + src[5_0]
    uaddl v10.8h, v1.8b, v4.8b           // temp2 = src[1_0] + src[4_0]
    mla v8.8h, v6.8h, v22.8h             // temp += temp1 * 20
    ld1 {v6.2s}, [x0], x2
    uaddl v14.8h, v3.8b, v4.8b
    uaddl v16.8h, v1.8b, v6.8b
    uaddl v18.8h, v2.8b, v5.8b
    mls v8.8h, v10.8h, v24.8h            // temp -= temp2 * 5
    mla v16.8h, v14.8h, v22.8h
    ld1 {v7.2s}, [x0], x2
    uaddl v20.8h, v4.8b, v5.8b
    uaddl v12.8h, v2.8b, v7.8b
    uaddl v10.8h, v3.8b, v6.8b
    mls v16.8h, v18.8h, v24.8h
    sqrshrun v26.8b, v8.8h, #5           // dst[0_0] = CLIP_U8( (temp + 16) >> 5)
    mla v12.8h, v20.8h, v22.8h
    ld1 {v0.2s}, [x0], x2
    uaddl v14.8h, v5.8b, v6.8b
    sqrshrun v27.8b, v16.8h, #5
    uaddl v20.8h, v3.8b, v0.8b
    mls v12.8h, v10.8h, v24.8h
    st1 {v26.2s}, [x1], x3               // Vector store to dst[0_0]
    uaddl v18.8h, v4.8b, v7.8b
    mla v20.8h, v14.8h, v22.8h
    st1 {v27.2s}, [x1], x3
    sqrshrun v28.8b, v12.8h, #5
    st1 {v28.2s}, [x1], x3
    mls v20.8h, v18.8h, v24.8h
    ld1 {v1.2s}, [x0], x2
    sqrshrun v29.8b, v20.8h, #5
    subs x9, x4, #4
    st1 {v29.2s}, [x1], x3               //store row 3


    beq end_func                         // Branch if height==4


    uaddl v14.8h, v6.8b, v7.8b           // temp1 = src[2_0] + src[3_0]
    uaddl v16.8h, v0.8b, v5.8b           // temp = src[0_0] + src[5_0]
    uaddl v18.8h, v1.8b, v4.8b           // temp2 = src[1_0] + src[4_0]
    mla v18.8h, v14.8h, v22.8h           // temp += temp1 * 20
    ld1 {v2.2s}, [x0], x2
    mls v18.8h, v16.8h, v24.8h           // temp -= temp2 * 5
    uaddl v8.8h, v0.8b, v7.8b
    uaddl v10.8h, v1.8b, v6.8b
    uaddl v12.8h, v2.8b, v5.8b
    sqrshrun v26.8b, v18.8h, #5
    mla v12.8h, v8.8h, v22.8h
    ld1 {v3.2s}, [x0], x2
    mls v12.8h, v10.8h, v24.8h
    st1 {v26.2s}, [x1], x3
    sqrshrun v27.8b, v12.8h, #5
    st1 {v27.2s}, [x1], x3
    uaddl v14.8h, v0.8b, v1.8b           // temp1 = src[2_0] + src[3_0]
    uaddl v16.8h, v2.8b, v7.8b           // temp = src[0_0] + src[5_0]
    uaddl v18.8h, v3.8b, v6.8b           // temp2 = src[1_0] + src[4_0]
    mla v18.8h, v14.8h, v22.8h           // temp += temp1 * 20
    ld1 {v4.2s}, [x0], x2
    mls v18.8h, v16.8h, v24.8h           // temp -= temp2 * 5
    uaddl v8.8h, v2.8b, v1.8b
    uaddl v10.8h, v3.8b, v0.8b
    uaddl v12.8h, v4.8b, v7.8b
    sqrshrun v26.8b, v18.8h, #5
    mla v12.8h, v8.8h, v22.8h
    ld1 {v5.2s}, [x0], x2
    mls v12.8h, v10.8h, v24.8h
    st1 {v26.2s}, [x1], x3
    sqrshrun v27.8b, v12.8h, #5
    subs x12, x14, #1
    st1 {v27.2s}, [x1], x3
    add x14, x14, #1
    beq loop_8                           //looping if height ==16

    b end_func


loop_4_start:
//// Processing row0 and row1 (wd == 4, 4 pixels per row, loaded into lane 0)


    ld1 {v0.s}[0], [x0], x2              // Vector load from src[0_0]
    ld1 {v1.s}[0], [x0], x2              // Vector load from src[1_0]
    ld1 {v2.s}[0], [x0], x2              // Vector load from src[2_0]
    ld1 {v3.s}[0], [x0], x2              // Vector load from src[3_0]
    ld1 {v4.s}[0], [x0], x2              // Vector load from src[4_0]
    ld1 {v5.s}[0], [x0], x2              // Vector load from src[5_0]

    uaddl v6.8h, v2.8b, v3.8b            // temp1 = src[2_0] + src[3_0]
    uaddl v8.8h, v0.8b, v5.8b            // temp = src[0_0] + src[5_0]
    uaddl v10.8h, v1.8b, v4.8b           // temp2 = src[1_0] + src[4_0]
    mla v8.8h, v6.8h, v22.8h             // temp += temp1 * 20
    ld1 {v6.2s}, [x0], x2
    uaddl v14.8h, v3.8b, v4.8b
    uaddl v16.8h, v1.8b, v6.8b
    uaddl v18.8h, v2.8b, v5.8b
    mls v8.8h, v10.8h, v24.8h            // temp -= temp2 * 5
    ld1 {v7.s}[0], [x0], x2
    mla v16.8h, v14.8h, v22.8h
    uaddl v20.8h, v4.8b, v5.8b
    uaddl v12.8h, v2.8b, v7.8b
    uaddl v10.8h, v3.8b, v6.8b
    mls v16.8h, v18.8h, v24.8h
    sqrshrun v26.8b, v8.8h, #5           // dst[0_0] = CLIP_U8( (temp + 16) >> 5)
    mla v12.8h, v20.8h, v22.8h
    ld1 {v0.s}[0], [x0], x2
    uaddl v14.8h, v5.8b, v6.8b
    sqrshrun v27.8b, v16.8h, #5
    uaddl v20.8h, v3.8b, v0.8b
    mls v12.8h, v10.8h, v24.8h
    st1 {v26.s}[0], [x1], x3             // Vector store to dst[0_0]
    uaddl v18.8h, v4.8b, v7.8b
    mla v20.8h, v14.8h, v22.8h
    st1 {v27.s}[0], [x1], x3
    sqrshrun v28.8b, v12.8h, #5
    st1 {v28.s}[0], [x1], x3
    mls v20.8h, v18.8h, v24.8h
    ld1 {v1.s}[0], [x0], x2
    sqrshrun v29.8b, v20.8h, #5
    st1 {v29.s}[0], [x1], x3             //store row 3

    subs x9, x4, #4
    beq end_func                         // Branch if height==4


    uaddl v14.8h, v6.8b, v7.8b           // temp1 = src[2_0] + src[3_0]
    uaddl v16.8h, v0.8b, v5.8b           // temp = src[0_0] + src[5_0]
    uaddl v18.8h, v1.8b, v4.8b           // temp2 = src[1_0] + src[4_0]
    mla v18.8h, v14.8h, v22.8h           // temp += temp1 * 20
    ld1 {v2.s}[0], [x0], x2
    mls v18.8h, v16.8h, v24.8h           // temp -= temp2 * 5
    uaddl v8.8h, v0.8b, v7.8b
    uaddl v10.8h, v1.8b, v6.8b
    uaddl v12.8h, v2.8b, v5.8b
    sqrshrun v26.8b, v18.8h, #5
    mla v12.8h, v8.8h, v22.8h
    ld1 {v3.s}[0], [x0], x2
    mls v12.8h, v10.8h, v24.8h
    st1 {v26.s}[0], [x1], x3
    sqrshrun v27.8b, v12.8h, #5
    st1 {v27.s}[0], [x1], x3
    uaddl v14.8h, v0.8b, v1.8b           // temp1 = src[2_0] + src[3_0]
    uaddl v16.8h, v2.8b, v7.8b           // temp = src[0_0] + src[5_0]
    uaddl v18.8h, v3.8b, v6.8b           // temp2 = src[1_0] + src[4_0]
    mla v18.8h, v14.8h, v22.8h           // temp += temp1 * 20
    ld1 {v4.s}[0], [x0], x2
    mls v18.8h, v16.8h, v24.8h           // temp -= temp2 * 5
    uaddl v8.8h, v2.8b, v1.8b
    uaddl v10.8h, v3.8b, v0.8b
    uaddl v12.8h, v4.8b, v7.8b
    sqrshrun v26.8b, v18.8h, #5
    mla v12.8h, v8.8h, v22.8h
    ld1 {v5.s}[0], [x0], x2
    mls v12.8h, v10.8h, v24.8h
    st1 {v26.s}[0], [x1], x3
    sqrshrun v27.8b, v12.8h, #5
    st1 {v27.s}[0], [x1], x3


end_func:
    // LDMFD sp!,{x4-x12,PC}             //Restoring registers from stack
    ldp x19, x20, [sp], #16
    pop_v_regs
    ret



diff --git a/common/armv8/ih264_inter_pred_luma_copy_av8.s b/common/armv8/ih264_inter_pred_luma_copy_av8.s new file mode 100755 index 0000000..1a76c1c --- /dev/null +++ b/common/armv8/ih264_inter_pred_luma_copy_av8.s @@ -0,0 +1,267 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android
Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +//*/ +///** +///** +//******************************************************************************* +//* +//* @brief +//* Interprediction luma function for copy +//* +//* @par Description: +//* Copies the array of width 'wd' and height 'ht' from the location pointed +//* by 'src' to the location pointed by 'dst' +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* +//* @param[in] ht +//* integer height of the array +//* +//* @param[in] wd +//* integer width of the array +//* +//* @returns +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ +//void ih264_inter_pred_luma_copy ( +// UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ht, +// WORD32 wd ) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x7 => ht +// x12 => wd + +.text +.p2align 2 +.include "ih264_neon_macros.s" + + + + .global 
ih264_inter_pred_luma_copy_av8

//==============================================================================
// ih264_inter_pred_luma_copy_av8
//
// Plain copy of a (wd x ht) byte block from pu1_src to pu1_dst.
// Dispatches on width: multiple of 16, else multiple of 8, else 4-column
// strips.  Every path processes rows four at a time, so ht is assumed to be
// a multiple of 4 (NOTE(review): not checked here - confirm with callers).
//
// Register usage (per the argument list documented above):
//   x0 = pu1_src, x1 = pu1_dst, x2 = src_strd, x3 = dst_strd,
//   x4 = ht (copied to x7), x5 = wd (copied to x12)
//==============================================================================
ih264_inter_pred_luma_copy_av8:

    push_v_regs                         // save callee-saved NEON regs (macro)
    stp x19, x20, [sp, #-16]!           // save callee-saved GPRs

    mov x12, x5                         // x12 = wd
    mov x7, x4                          // x7  = ht
    cmp x7, #0                          // checks ht == 0
    ble end_loops                       // nothing to copy for ht <= 0
    tst x12, #15                        // checks wd for multiple of 16
    beq core_loop_wd_16
    tst x12, #7                         // checks wd for multiple of 8
    beq core_loop_wd_8
    sub x11, x12, #4                    // x11 = wd - 4, used to rewind to next 4-row strip

//------------------------------------------------------------------------------
// Fallback path (wd not a multiple of 8): copy 4 columns x 4 rows at a time
//------------------------------------------------------------------------------
outer_loop_wd_4:
    subs x4, x12, #0                    // x4 = remaining width; checks wd == 0
    ble end_inner_loop_wd_4

inner_loop_wd_4:
    ld1 {v0.s}[0], [x0]                 // row 0: vld1_lane_u32 from pu1_src
    add x5, x0, x2                      // pu1_src_tmp = pu1_src + src_strd (row 1)
    add x6, x1, x3                      // pu1_dst_tmp = pu1_dst + dst_strd (row 1)
    st1 {v0.s}[0], [x1]                 // row 0: vst1_lane_u32 to pu1_dst
    ld1 {v0.s}[0], [x5], x2             // row 1 load
    add x0, x0, #4                      // pu1_src += 4 (advance to next 4 columns)
    st1 {v0.s}[0], [x6], x3             // row 1 store
    ld1 {v0.s}[0], [x5], x2             // row 2 load
    subs x4, x4, #4                     // remaining width -= 4
    st1 {v0.s}[0], [x6], x3             // row 2 store
    ld1 {v0.s}[0], [x5], x2             // row 3 load
    add x1, x1, #4                      // pu1_dst += 4
    st1 {v0.s}[0], [x6], x3             // row 3 store

    bgt inner_loop_wd_4

end_inner_loop_wd_4:
    subs x7, x7, #4                     // ht -= 4 (one strip of 4 rows done)
    sub x0, x5, x11                     // pu1_src = pu1_src_tmp - (wd - 4): start of next strip
    sub x1, x6, x11                     // pu1_dst = pu1_dst_tmp - (wd - 4)
    bgt outer_loop_wd_4

end_loops:
    ldp x19, x20, [sp], #16             // restore saved registers and return
    pop_v_regs
    ret

//------------------------------------------------------------------------------
// wd multiple of 8: copy 8 columns x 4 rows at a time
//------------------------------------------------------------------------------
core_loop_wd_8:
    sub x11, x12, #8                    // x11 = wd - 8, strip-rewind amount

outer_loop_wd_8:
    subs x4, x12, #0                    // checks wd
    ble end_inner_loop_wd_8

inner_loop_wd_8:
    add x5, x0, x2                      // pu1_src_tmp += src_strd
    ld1 {v0.8b}, [x0], #8               // row 0 load, pu1_src += 8
    add x6, x1, x3                      // pu1_dst_tmp += dst_strd
    st1 {v0.8b}, [x1], #8               // row 0 store, pu1_dst += 8
    ld1 {v1.8b}, [x5], x2               // row 1 load
    st1 {v1.8b}, [x6], x3               // row 1 store
    subs x4, x4, #8                     // wd - 8 (loop condition)
    ld1 {v2.8b}, [x5], x2               // row 2 load
    st1 {v2.8b}, [x6], x3               // row 2 store
    ld1 {v3.8b}, [x5], x2               // row 3 load
    st1 {v3.8b}, [x6], x3               // row 3 store
    bgt inner_loop_wd_8

end_inner_loop_wd_8:
    subs x7, x7, #4                     // ht -= 4
    sub x0, x5, x11                     // pu1_src = pu1_src_tmp - (wd - 8)
    sub x1, x6, x11                     // pu1_dst = pu1_dst_tmp - (wd - 8)
    bgt outer_loop_wd_8

    ldp x19, x20, [sp], #16             // restore saved registers and return
    pop_v_regs
    ret

//------------------------------------------------------------------------------
// wd multiple of 16: copy 16 columns x 4 rows at a time
//------------------------------------------------------------------------------
core_loop_wd_16:
    sub x11, x12, #16                   // x11 = wd - 16, strip-rewind amount

outer_loop_wd_16:
    subs x4, x12, #0                    // checks wd
    ble end_inner_loop_wd_16

inner_loop_wd_16:
    add x5, x0, x2                      // pu1_src_tmp += src_strd
    ld1 { v0.16b}, [x0], #16            // row 0 load, pu1_src += 16
    add x6, x1, x3                      // pu1_dst_tmp += dst_strd
    st1 { v0.16b}, [x1], #16            // row 0 store, pu1_dst += 16
    ld1 { v2.16b}, [x5], x2             // row 1 load
    st1 { v2.16b}, [x6], x3             // row 1 store
    subs x4, x4, #16                    // wd - 16 (loop condition)
    ld1 { v4.16b}, [x5], x2             // row 2 load
    st1 { v4.16b}, [x6], x3             // row 2 store
    ld1 { v6.16b}, [x5], x2             // row 3 load
    st1 { v6.16b}, [x6], x3             // row 3 store
    bgt inner_loop_wd_16

end_inner_loop_wd_16:
    subs x7, x7, #4                     // ht -= 4
    sub x0, x5, x11                     // pu1_src = pu1_src_tmp - (wd - 16)
    sub x1, x6, x11                     // pu1_dst = pu1_dst_tmp - (wd - 16)
    bgt outer_loop_wd_16


    ldp x19, x20, [sp], #16             // restore saved registers and return
    pop_v_regs
    ret


// /*
// ********************************************************************************
// *
// * @brief This function copies a 4x4 block to destination
// *
// * @par Description:
// * Copies a 4x4 block to destination, where both src and dst are interleaved
// *
// * @param[in] pi2_src
// * Source
// *
// * @param[in] pu1_out
// * Output pointer
// *
// * @param[in] pred_strd,
// * Prediction buffer stride
// *
// * @param[in] out_strd
// * output buffer stride
// *
// * @returns none
// *
// * @remarks
// * none
// * Currently wd and ht are not used, i.e. a 4x4 block is always copied
// *
// *******************************************************************************
// */
// void ih264_interleave_copy(WORD16 *pi2_src,
//                            UWORD8 *pu1_out,
//                            WORD32 pred_strd,
//                            WORD32 out_strd,
//                            WORD32 wd,
//                            WORD32 ht)
// Register Usage
// x0 : pi2_src
// x1 : pu1_out
// x2 : src_strd
// x3 : out_strd
// Neon registers d0-d7, d16-d30 are used
// No need for pushing arm and neon registers

    .global ih264_interleave_copy_av8

// Merges four stride-separated 8-byte source rows into the interleaved
// output: v30 = 0x00ff per 16-bit lane, so BIT below overwrites only the
// low byte of each output halfword with the source byte and leaves the
// high byte untouched.
ih264_interleave_copy_av8:
    push_v_regs
    ld1 {v2.8b}, [x0], x2               // load src plane 1 => d2 & pred plane 2 => d3
    ld1 {v3.8b}, [x0], x2
    mov v2.d[1], v3.d[0]                // pack first two source rows into v2.16b
    ld1 {v4.8b}, [x0], x2
    ld1 {v5.8b}, [x0], x2
    mov v4.d[1], v5.d[0]                // pack last two source rows into v4.16b

    mov x0, x1                          // x0 re-used as store pointer (start of pu1_out)

    ld1 {v18.8b}, [x1], x3              // load out (8 bit size) - 8 coeffs
    ld1 {v19.8b}, [x1], x3
    mov v18.d[1], v19.d[0]              // pack first two output rows into v18.16b
    movi v30.8h, #0x00ff                // per-halfword mask: select low byte of each lane
    ld1 {v20.8b}, [x1], x3
    ld1 {v21.8b}, [x1], x3
    mov v20.d[1], v21.d[0]              // pack last two output rows into v20.16b

    bit v18.16b, v2.16b , v30.16b       // insert src bytes where mask bits are 1
    bit v20.16b, v4.16b , v30.16b

    st1 {v18.8b}, [x0], x3              // store out: merged rows written back row by row
    st1 {v18.d}[1], [x0], x3
    st1 {v20.8b}, [x0], x3
    st1 {v20.d}[1], [x0], x3

    pop_v_regs
    ret


diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s new file mode 100755 index 0000000..ea7645e --- /dev/null +++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s @@ -0,0 +1,820 @@
//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +//*/ +///** +//****************************************************************************** +//* @file +//* ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s +//* +//* @brief +//* Contains function definitions for inter prediction interpolation. +//* +//* @author +//* Mohit +//* +//* @par List of Functions: +//* +//* - ih264_inter_pred_luma_horz_hpel_vert_hpel_av8() +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ + + + +//void ih264_inter_pred_luma_horz_hpel_vert_hpel(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd,, +// WORD32 dst_strd, +// WORD32 ht, +// WORD32 wd, +// UWORD8* pu1_tmp, +// UWORD32 dydx) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ht +// x5 => wd + + +.text +.p2align 2 +.include "ih264_neon_macros.s" + + + + .global ih264_inter_pred_luma_horz_hpel_vert_hpel_av8 + +ih264_inter_pred_luma_horz_hpel_vert_hpel_av8: + + //store register values to stack + push_v_regs + stp x19, x20, [sp, #-16]! 
+ + sub x0, x0, x2, lsl #1 //pu1_src-2*src_strd + sub x0, x0, #2 //pu1_src-2 + + movi v26.8h, #0x14 // Filter coeff 20 into Q13 + movi v24.8h, #0x5 // Filter coeff 5 into Q12 + movi v27.8h, #0x14 // Filter coeff 20 into Q13 + movi v25.8h, #0x5 // Filter coeff 5 into Q12 + mov x7, #0x20 + mov x8, #0x30 + subs x12, x5, #4 //if wd=4 branch to loop_4 + beq loop_4_start + + subs x12, x5, #8 //if wd=8 branch to loop_8 + beq loop_8_start + + //when wd=16 + movi v28.8h, #0x14 // Filter coeff 20 into Q13 + movi v30.8h, #0x5 // Filter coeff 5 into Q12 + sub x2, x2, #16 + ld1 {v0.2s, v1.2s}, [x0], #16 // Vector load from src[0_0] + ld1 {v12.2s}, [x0], x2 // Vector load from src[0_0] + ld1 {v2.2s, v3.2s}, [x0], #16 // Vector load from src[1_0] + ld1 {v13.2s}, [x0], x2 // Vector load from src[1_0] + ld1 {v4.2s, v5.2s}, [x0], #16 // Vector load from src[2_0] + ld1 {v14.2s}, [x0], x2 // Vector load from src[2_0] + ld1 {v6.2s, v7.2s}, [x0], #16 // Vector load from src[3_0] + ld1 {v15.2s}, [x0], x2 // Vector load from src[3_0] + ld1 {v8.2s, v9.2s}, [x0], #16 // Vector load from src[4_0] + ld1 {v16.2s}, [x0], x2 // Vector load from src[4_0] +loop_16: + + ld1 {v10.2s, v11.2s}, [x0], #16 // Vector load from src[5_0] + ld1 {v17.2s}, [x0], x2 // Vector load from src[5_0] + + + uaddl v20.8h, v4.8b, v6.8b + uaddl v18.8h, v0.8b, v10.8b + uaddl v22.8h, v2.8b, v8.8b + mla v18.8h, v20.8h , v28.8h + uaddl v24.8h, v5.8b, v7.8b + uaddl v20.8h, v1.8b, v11.8b + uaddl v26.8h, v3.8b, v9.8b + mla v20.8h, v24.8h , v28.8h + uaddl v24.8h, v14.8b, v15.8b + mls v18.8h, v22.8h , v30.8h + uaddl v22.8h, v12.8b, v17.8b + mls v20.8h, v26.8h , v30.8h + uaddl v26.8h, v13.8b, v16.8b + mla v22.8h, v24.8h , v28.8h + mls v22.8h, v26.8h , v30.8h + + ext v24.16b, v18.16b , v20.16b , #4 + ext v26.16b, v18.16b , v20.16b , #6 + + ext v23.16b, v18.16b , v20.16b , #10 + add v0.8h, v24.8h , v26.8h + ext v24.16b, v18.16b , v20.16b , #2 + ext v26.16b, v18.16b , v20.16b , #8 + add v24.8h, v24.8h , v26.8h + + saddl v26.4s, 
v18.4h, v23.4h + smlal v26.4s, v0.4h, v28.4h + smlsl v26.4s, v24.4h, v30.4h + + saddl2 v23.4s, v18.8h, v23.8h + smlal2 v23.4s, v0.8h, v28.8h + smlsl2 v23.4s, v24.8h, v30.8h + + sqrshrun v18.4h, v26.4s, #10 + sqrshrun v19.4h, v23.4s, #10 + + + uqxtn v18.8b, v18.8h + uqxtn v19.8b, v19.8h + mov v18.2s[1], v19.2s[0] + + ext v24.16b, v20.16b , v22.16b , #4 + ext v26.16b, v20.16b , v22.16b , #6 + ext v0.16b, v20.16b , v22.16b , #10 + + add v25.8h, v24.8h , v26.8h + ext v24.16b, v20.16b , v22.16b , #2 + ext v26.16b, v20.16b , v22.16b , #8 + add v24.8h, v24.8h , v26.8h + + saddl v26.4s, v0.4h, v20.4h + smlal v26.4s, v25.4h, v28.4h + smlsl v26.4s, v24.4h, v30.4h + + saddl2 v22.4s, v0.8h, v20.8h + smlal2 v22.4s, v25.8h, v28.8h + smlsl2 v22.4s, v24.8h, v30.8h + + sqrshrun v19.4h, v26.4s, #10 + sqrshrun v25.4h, v22.4s, #10 + + uaddl v24.8h, v7.8b, v9.8b + + + + uqxtn v19.8b, v19.8h + uqxtn v25.8b, v25.8h + mov v19.2s[1], v25.2s[0] + + uaddl v22.8h, v4.8b, v10.8b + ld1 {v0.2s, v1.2s}, [x0], #16 // Vector load from src[6_0] + + + ld1 {v12.2s}, [x0], x2 // Vector load from src[6_0] + uaddl v20.8h, v6.8b, v8.8b + uaddl v26.8h, v5.8b, v11.8b + st1 {v18.2s, v19.2s}, [x1], x3 // store row 0 + + +//ROW_2 + + + uaddl v18.8h, v2.8b, v0.8b + + mla v18.8h, v20.8h , v28.8h + + uaddl v20.8h, v3.8b, v1.8b + + mla v20.8h, v24.8h , v28.8h + uaddl v24.8h, v15.8b, v16.8b + mls v18.8h, v22.8h , v30.8h + uaddl v22.8h, v13.8b, v12.8b + mls v20.8h, v26.8h , v30.8h + uaddl v26.8h, v14.8b, v17.8b + mla v22.8h, v24.8h , v28.8h + mls v22.8h, v26.8h , v30.8h + + ext v24.16b, v18.16b , v20.16b , #4 + ext v26.16b, v18.16b , v20.16b , #6 + + ext v23.16b, v18.16b , v20.16b , #10 + add v2.8h, v24.8h , v26.8h + ext v24.16b, v18.16b , v20.16b , #2 + ext v26.16b, v18.16b , v20.16b , #8 + add v24.8h, v24.8h , v26.8h + + saddl v26.4s, v18.4h, v23.4h + smlal v26.4s, v2.4h, v28.4h + smlsl v26.4s, v24.4h, v30.4h + + saddl2 v23.4s, v18.8h, v23.8h + smlal2 v23.4s, v2.8h, v28.8h + smlsl2 v23.4s, v24.8h, v30.8h + + 
sqrshrun v18.4h, v26.4s, #10 + sqrshrun v19.4h, v23.4s, #10 + + + + uqxtn v18.8b, v18.8h + uqxtn v19.8b, v19.8h + mov v18.2s[1], v19.2s[0] + + ext v24.16b, v20.16b , v22.16b , #4 + ext v26.16b, v20.16b , v22.16b , #6 + ext v2.16b, v20.16b , v22.16b , #10 + + add v25.8h, v24.8h , v26.8h + ext v24.16b, v20.16b , v22.16b , #2 + ext v26.16b, v20.16b , v22.16b , #8 + add v24.8h, v24.8h , v26.8h + + saddl v26.4s, v2.4h, v20.4h + smlal v26.4s, v25.4h, v28.4h + smlsl v26.4s, v24.4h, v30.4h + + saddl2 v22.4s, v2.8h, v20.8h + smlal2 v22.4s, v25.8h, v28.8h + smlsl2 v22.4s, v24.8h, v30.8h + + sqrshrun v19.4h, v26.4s, #10 + sqrshrun v25.4h, v22.4s, #10 + uaddl v24.8h, v9.8b, v11.8b + + uqxtn v19.8b, v19.8h + uqxtn v25.8b, v25.8h + mov v19.2s[1], v25.2s[0] + + + uaddl v22.8h, v6.8b, v0.8b + ld1 {v2.2s, v3.2s}, [x0], #16 // Vector load from src[7_0] + + + ld1 {v13.2s}, [x0], x2 // Vector load from src[7_0] + uaddl v20.8h, v8.8b, v10.8b + uaddl v26.8h, v7.8b, v1.8b + st1 {v18.2s, v19.2s}, [x1], x3 // store row 1 + +//ROW_3 + + + uaddl v18.8h, v4.8b, v2.8b + + mla v18.8h, v20.8h , v28.8h + + uaddl v20.8h, v5.8b, v3.8b + + mla v20.8h, v24.8h , v28.8h + uaddl v24.8h, v16.8b, v17.8b + mls v18.8h, v22.8h , v30.8h + uaddl v22.8h, v14.8b, v13.8b + mls v20.8h, v26.8h , v30.8h + uaddl v26.8h, v15.8b, v12.8b + mla v22.8h, v24.8h , v28.8h + mls v22.8h, v26.8h , v30.8h + + ext v24.16b, v18.16b , v20.16b , #4 + ext v26.16b, v18.16b , v20.16b , #6 + + ext v23.16b, v18.16b , v20.16b , #10 + add v4.8h, v24.8h , v26.8h + ext v24.16b, v18.16b , v20.16b , #2 + ext v26.16b, v18.16b , v20.16b , #8 + add v24.8h, v24.8h , v26.8h + + saddl v26.4s, v18.4h, v23.4h + smlal v26.4s, v4.4h, v28.4h + smlsl v26.4s, v24.4h, v30.4h + + saddl2 v23.4s, v18.8h, v23.8h + smlal2 v23.4s, v4.8h, v28.8h + smlsl2 v23.4s, v24.8h, v30.8h + + sqrshrun v18.4h, v26.4s, #10 + sqrshrun v19.4h, v23.4s, #10 + + + uqxtn v18.8b, v18.8h + uqxtn v19.8b, v19.8h + mov v18.2s[1], v19.2s[0] + + + ext v24.16b, v20.16b , v22.16b , #4 + ext 
v26.16b, v20.16b , v22.16b , #6 + ext v4.16b, v20.16b , v22.16b , #10 + + add v25.8h, v24.8h , v26.8h + ext v24.16b, v20.16b , v22.16b , #2 + ext v26.16b, v20.16b , v22.16b , #8 + add v24.8h, v24.8h , v26.8h + + saddl v26.4s, v4.4h, v20.4h + smlal v26.4s, v25.4h, v28.4h + smlsl v26.4s, v24.4h, v30.4h + + saddl2 v22.4s, v4.8h, v20.8h + smlal2 v22.4s, v25.8h, v28.8h + smlsl2 v22.4s, v24.8h, v30.8h + + sqrshrun v19.4h, v26.4s, #10 + sqrshrun v25.4h, v22.4s, #10 + + uaddl v24.8h, v11.8b, v1.8b + + + uqxtn v19.8b, v19.8h + uqxtn v25.8b, v25.8h + mov v19.2s[1], v25.2s[0] + + + + uaddl v22.8h, v8.8b, v2.8b + ld1 {v4.2s, v5.2s}, [x0], #16 // Vector load from src[8_0] + + + ld1 {v14.2s}, [x0], x2 // Vector load from src[8_0] + uaddl v20.8h, v10.8b, v0.8b + uaddl v26.8h, v9.8b, v3.8b + st1 {v18.2s, v19.2s}, [x1], x3 // store row 2 + + +//ROW_4 + + uaddl v18.8h, v6.8b, v4.8b + + mla v18.8h, v20.8h , v28.8h + + uaddl v20.8h, v7.8b, v5.8b + + mla v20.8h, v24.8h , v28.8h + uaddl v24.8h, v17.8b, v12.8b + mls v18.8h, v22.8h , v30.8h + uaddl v22.8h, v15.8b, v14.8b + mls v20.8h, v26.8h , v30.8h + uaddl v26.8h, v16.8b, v13.8b + mla v22.8h, v24.8h , v28.8h + mls v22.8h, v26.8h , v30.8h + + ext v24.16b, v18.16b , v20.16b , #4 + ext v26.16b, v18.16b , v20.16b , #6 + + ext v23.16b, v18.16b , v20.16b , #10 + add v6.8h, v24.8h , v26.8h + ext v24.16b, v18.16b , v20.16b , #2 + ext v26.16b, v18.16b , v20.16b , #8 + add v24.8h, v24.8h , v26.8h + + saddl v26.4s, v18.4h, v23.4h + smlal v26.4s, v6.4h, v28.4h + smlsl v26.4s, v24.4h, v30.4h + + saddl2 v23.4s, v18.8h, v23.8h + smlal2 v23.4s, v6.8h, v28.8h + smlsl2 v23.4s, v24.8h, v30.8h + + sqrshrun v18.4h, v26.4s, #10 + sqrshrun v19.4h, v23.4s, #10 + + uqxtn v18.8b, v18.8h + uqxtn v19.8b, v19.8h + mov v18.2s[1], v19.2s[0] + + + ext v24.16b, v20.16b , v22.16b , #4 + ext v26.16b, v20.16b , v22.16b , #6 + ext v6.16b, v20.16b , v22.16b , #10 + + add v25.8h, v24.8h , v26.8h + ext v24.16b, v20.16b , v22.16b , #2 + ext v26.16b, v20.16b , v22.16b , #8 + 
add v24.8h, v24.8h , v26.8h + + saddl v26.4s, v6.4h, v20.4h + smlal v26.4s, v25.4h, v28.4h + smlsl v26.4s, v24.4h, v30.4h + + saddl2 v22.4s, v6.8h, v20.8h + smlal2 v22.4s, v25.8h, v28.8h + smlsl2 v22.4s, v24.8h, v30.8h + + mov v6.16b, v2.16b + mov v7.16b, v3.16b + + mov v2.16b, v10.16b + mov v3.16b, v11.16b + + subs x4, x4, #4 + sqrshrun v19.4h, v26.4s, #10 + sqrshrun v25.4h, v22.4s, #10 + mov v10.16b, v0.16b + mov v11.16b, v1.16b + + mov v24.8b, v14.8b + + mov v14.16b, v12.16b + mov v15.16b, v13.16b + + + uqxtn v19.8b, v19.8h + uqxtn v25.8b, v25.8h + mov v19.2s[1], v25.2s[0] + + + + mov v0.16b, v8.16b + mov v1.16b, v9.16b + + mov v8.16b, v4.16b + mov v9.16b, v5.16b + + mov v12.16b, v16.16b + mov v13.16b, v17.16b + + mov v4.16b, v10.16b + mov v5.16b, v11.16b + + mov v16.8b, v24.8b + st1 {v18.2s, v19.2s}, [x1], x3 // store row 3 + + bgt loop_16 // looping if height =16 + b end_func + +loop_8_start: + ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[0_0] + ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[1_0] + ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[2_0] + ld1 {v6.2s, v7.2s}, [x0], x2 // Vector load from src[3_0] + ld1 {v8.2s, v9.2s}, [x0], x2 // Vector load from src[4_0] + +loop_8: + + ld1 {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0] + uaddl v14.8h, v4.8b, v6.8b + uaddl v12.8h, v0.8b, v10.8b + uaddl v16.8h, v2.8b, v8.8b + mla v12.8h, v14.8h , v26.8h + uaddl v18.8h, v5.8b, v7.8b + uaddl v14.8h, v1.8b, v11.8b + uaddl v22.8h, v3.8b, v9.8b + mla v14.8h, v18.8h , v26.8h + mls v12.8h, v16.8h , v24.8h + ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[6_0] + uaddl v16.8h, v6.8b, v8.8b + mls v14.8h, v22.8h , v24.8h + uaddl v28.8h, v2.8b, v0.8b + + ext v22.16b, v12.16b , v14.16b , #10 + uaddl v18.8h, v4.8b, v10.8b + mla v28.8h, v16.8h , v26.8h + saddl v30.4s, v12.4h, v22.4h + + saddl2 v22.4s, v12.8h, v22.8h + ext v16.16b, v12.16b , v14.16b , #4 + mls v28.8h, v18.8h , v24.8h + ext v18.16b, v12.16b , v14.16b , #6 + ext v20.16b, v12.16b , 
v14.16b , #8 + ext v14.16b, v12.16b , v14.16b , #2 + add v16.8h, v16.8h , v18.8h + add v18.8h, v14.8h , v20.8h + uaddl v20.8h, v7.8b, v9.8b + smlal v30.4s, v16.4h, v26.4h + smlsl v30.4s, v18.4h, v24.4h + smlal2 v22.4s, v16.8h, v26.8h + smlsl2 v22.4s, v18.8h, v24.8h + uaddl v14.8h, v3.8b, v1.8b + + mla v14.8h, v20.8h , v26.8h + sqrshrun v12.4h, v30.4s, #10 + uaddl v16.8h, v5.8b, v11.8b + sqrshrun v13.4h, v22.4s, #10 + mls v14.8h, v16.8h , v24.8h + ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[7_0] + uqxtn v25.8b, v12.8h + uqxtn v13.8b, v13.8h + mov v25.2s[1], v13.2s[0] + uaddl v16.8h, v8.8b, v10.8b + + + ext v22.16b, v28.16b , v14.16b , #10 + uaddl v20.8h, v4.8b, v2.8b + saddl v30.4s, v28.4h, v22.4h + mla v20.8h, v16.8h , v26.8h + + saddl2 v22.4s, v28.8h, v22.8h + ext v16.16b, v28.16b , v14.16b , #4 + ext v18.16b, v28.16b , v14.16b , #6 + ext v12.16b, v28.16b , v14.16b , #8 + ext v14.16b, v28.16b , v14.16b , #2 + add v16.8h, v16.8h , v18.8h + add v18.8h, v12.8h , v14.8h + + smlal v30.4s, v16.4h, v26.4h + smlsl v30.4s, v18.4h, v24.4h + smlal2 v22.4s, v16.8h, v26.8h + smlsl2 v22.4s, v18.8h, v24.8h + + + uaddl v18.8h, v6.8b, v0.8b + sqrshrun v16.4h, v30.4s, #10 + + sqrshrun v17.4h, v22.4s, #10 + + mov v12.8b, v25.8b + mov v25.8b, v24.8b + + uaddl v28.8h, v9.8b, v11.8b + uqxtn v13.8b, v16.8h + uqxtn v17.8b, v17.8h + mov v13.2s[1], v17.2s[0] + + + uaddl v14.8h, v5.8b, v3.8b + uaddl v22.8h, v7.8b, v1.8b + mls v20.8h, v18.8h , v24.8h + st1 {v12.2s}, [x1], x3 // store row 0 + mla v14.8h, v28.8h , v26.8h + ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[8_0] + uaddl v30.8h, v10.8b, v0.8b + uaddl v28.8h, v6.8b, v4.8b + mls v14.8h, v22.8h , v24.8h + st1 {v13.2s}, [x1], x3 // store row 1 + mla v28.8h, v30.8h , v26.8h + + ext v22.16b, v20.16b , v14.16b , #10 + saddl v30.4s, v20.4h, v22.4h + + saddl2 v22.4s, v20.8h, v22.8h + ext v16.16b, v20.16b , v14.16b , #4 + ext v18.16b, v20.16b , v14.16b , #6 + ext v12.16b, v20.16b , v14.16b , #8 + ext v14.16b, v20.16b , v14.16b 
, #2 + add v16.8h, v16.8h , v18.8h + add v18.8h, v14.8h , v12.8h + uaddl v20.8h, v8.8b, v2.8b + smlal v30.4s, v16.4h, v26.4h + smlsl v30.4s, v18.4h, v24.4h + smlal2 v22.4s, v16.8h, v26.8h + smlsl2 v22.4s, v18.8h, v24.8h + uaddl v18.8h, v11.8b, v1.8b + uaddl v16.8h, v7.8b, v5.8b + sqrshrun v12.4h, v30.4s, #10 + uaddl v30.8h, v9.8b, v3.8b + mla v16.8h, v18.8h , v26.8h + sqrshrun v13.4h, v22.4s, #10 + mls v28.8h, v20.8h , v24.8h + + mls v16.8h, v30.8h , v24.8h + uqxtn v27.8b, v12.8h + uqxtn v13.8b, v13.8h + mov v27.2s[1], v13.2s[0] + + + ext v22.16b, v28.16b , v16.16b , #10 + + saddl v30.4s, v28.4h, v22.4h + + saddl2 v22.4s, v28.8h, v22.8h + ext v12.16b, v28.16b , v16.16b , #4 + ext v18.16b, v28.16b , v16.16b , #6 + ext v20.16b, v28.16b , v16.16b , #8 + ext v28.16b, v28.16b , v16.16b , #2 + add v12.8h, v12.8h , v18.8h + add v18.8h, v28.8h , v20.8h + + smlal v30.4s, v12.4h, v26.4h + smlsl v30.4s, v18.4h, v24.4h + smlal2 v22.4s, v12.8h, v26.8h + smlsl2 v22.4s, v18.8h, v24.8h + + + mov v12.8b, v27.8b + mov v27.8b, v26.8b + + sqrshrun v16.4h, v30.4s, #10 + + mov v6.16b, v2.16b + mov v7.16b, v3.16b + + sqrshrun v17.4h, v22.4s, #10 + + mov v2.16b, v10.16b + mov v3.16b, v11.16b + + mov v10.16b, v0.16b + mov v11.16b, v1.16b + + subs x4, x4, #4 + uqxtn v13.8b, v16.8h + uqxtn v17.8b, v17.8h + mov v13.2s[1], v17.2s[0] + + + mov v0.16b, v8.16b + mov v1.16b, v9.16b + + mov v8.16b, v4.16b + mov v9.16b, v5.16b + + mov v4.16b, v10.16b + mov v5.16b, v11.16b + + st1 {v12.2s}, [x1], x3 // store row 2 + st1 {v13.2s}, [x1], x3 // store row 3 + + bgt loop_8 //if height =8 loop + b end_func + +loop_4_start: + ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[0_0] + ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[1_0] + ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[2_0] + ld1 {v6.2s, v7.2s}, [x0], x2 // Vector load from src[3_0] + ld1 {v8.2s, v9.2s}, [x0], x2 // Vector load from src[4_0] + +loop_4: + ld1 {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0] + uaddl v14.8h, 
v4.8b, v6.8b // temp1 = src[2_0] + src[3_0] + uaddl v12.8h, v0.8b, v10.8b // temp = src[0_0] + src[5_0] + uaddl v16.8h, v2.8b, v8.8b // temp2 = src[1_0] + src[4_0] + mla v12.8h, v14.8h , v26.8h // temp += temp1 * 20 + uaddl v18.8h, v5.8b, v7.8b // temp1 = src[2_0] + src[3_0] + uaddl v14.8h, v1.8b, v11.8b // temp = src[0_0] + src[5_0] + uaddl v22.8h, v3.8b, v9.8b // temp2 = src[1_0] + src[4_0] + mla v14.8h, v18.8h , v26.8h // temp += temp1 * 20 + mls v12.8h, v16.8h , v24.8h // temp -= temp2 * 5 + ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[6_0] + uaddl v16.8h, v6.8b, v8.8b + mls v14.8h, v22.8h , v24.8h // temp -= temp2 * 5 + //Q6 and Q7 have filtered values + uaddl v28.8h, v2.8b, v0.8b + + ext v22.16b, v12.16b , v14.16b , #10 + uaddl v18.8h, v4.8b, v10.8b + mla v28.8h, v16.8h , v26.8h + saddl v30.4s, v12.4h, v22.4h + + saddl v22.4s, v13.4h, v23.4h + ext v16.16b, v12.16b , v14.16b , #4 + mls v28.8h, v18.8h , v24.8h + ext v18.16b, v12.16b , v14.16b , #6 + ext v20.16b, v12.16b , v14.16b , #8 + ext v14.16b, v12.16b , v14.16b , #2 + add v16.8h, v16.8h , v18.8h + add v18.8h, v14.8h , v20.8h + uaddl v20.8h, v7.8b, v9.8b + smlal v30.4s, v16.4h, v26.4h + smlsl v30.4s, v18.4h, v24.4h + smlal v22.4s, v17.4h, v26.4h + smlsl v22.4s, v19.4h, v24.4h + uaddl v14.8h, v3.8b, v1.8b + + mla v14.8h, v20.8h , v26.8h + sqrshrun v12.4h, v30.4s, #10 + uaddl v16.8h, v5.8b, v11.8b + sqrshrun v13.4h, v22.4s, #10 + mls v14.8h, v16.8h , v24.8h + ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[7_0] + uqxtn v25.8b, v12.8h + uaddl v16.8h, v8.8b, v10.8b + + ext v22.16b, v28.16b , v14.16b , #10 + uaddl v20.8h, v4.8b, v2.8b + saddl v30.4s, v28.4h, v22.4h + mla v20.8h, v16.8h , v26.8h + + saddl v22.4s, v29.4h, v23.4h + ext v16.16b, v28.16b , v14.16b , #4 + ext v18.16b, v28.16b , v14.16b , #6 + ext v12.16b, v28.16b , v14.16b , #8 + ext v14.16b, v28.16b , v14.16b , #2 + add v16.8h, v16.8h , v18.8h + add v18.8h, v12.8h , v14.8h + + smlal v30.4s, v16.4h, v26.4h + smlsl v30.4s, v18.4h, v24.4h 
+ smlal v22.4s, v17.4h, v26.4h + smlsl v22.4s, v19.4h, v24.4h + + + uaddl v18.8h, v6.8b, v0.8b + sqrshrun v16.4h, v30.4s, #10 + + sqrshrun v17.4h, v22.4s, #10 + + mov v12.8b, v25.8b + mov v25.8b, v24.8b + + uaddl v28.8h, v9.8b, v11.8b + uqxtn v13.8b, v16.8h + + + + uaddl v14.8h, v5.8b, v3.8b + uaddl v22.8h, v7.8b, v1.8b + mls v20.8h, v18.8h , v24.8h + st1 {v12.s}[0], [x1], x3 // store row 0 + mla v14.8h, v28.8h , v26.8h + ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[8_0] + uaddl v30.8h, v10.8b, v0.8b + uaddl v28.8h, v6.8b, v4.8b + mls v14.8h, v22.8h , v24.8h + st1 {v13.s}[0], [x1], x3 //store row 1 + mla v28.8h, v30.8h , v26.8h + + ext v22.16b, v20.16b , v14.16b , #10 + saddl v30.4s, v20.4h, v22.4h + + saddl v22.4s, v21.4h, v23.4h + ext v16.16b, v20.16b , v14.16b , #4 + ext v18.16b, v20.16b , v14.16b , #6 + ext v12.16b, v20.16b , v14.16b , #8 + ext v14.16b, v20.16b , v14.16b , #2 + add v16.8h, v16.8h , v18.8h + add v18.8h, v14.8h , v12.8h + uaddl v20.8h, v8.8b, v2.8b + smlal v30.4s, v16.4h, v26.4h + smlsl v30.4s, v18.4h, v24.4h + smlal v22.4s, v17.4h, v26.4h + smlsl v22.4s, v19.4h, v24.4h + uaddl v18.8h, v11.8b, v1.8b + uaddl v16.8h, v7.8b, v5.8b + sqrshrun v12.4h, v30.4s, #10 + uaddl v30.8h, v9.8b, v3.8b + mla v16.8h, v18.8h , v26.8h + sqrshrun v13.4h, v22.4s, #10 + mls v28.8h, v20.8h , v24.8h + + mls v16.8h, v30.8h , v24.8h + uqxtn v27.8b, v12.8h + + ext v22.16b, v28.16b , v16.16b , #10 + + saddl v30.4s, v28.4h, v22.4h + + saddl v22.4s, v29.4h, v23.4h + ext v12.16b, v28.16b , v16.16b , #4 + ext v18.16b, v28.16b , v16.16b , #6 + ext v20.16b, v28.16b , v16.16b , #8 + ext v28.16b, v28.16b , v16.16b , #2 + add v12.8h, v12.8h , v18.8h + add v18.8h, v28.8h , v20.8h + + smlal v30.4s, v12.4h, v26.4h + smlsl v30.4s, v18.4h, v24.4h + smlal v22.4s, v13.4h, v26.4h + smlsl v22.4s, v19.4h, v24.4h + + + mov v12.8b, v27.8b + mov v27.8b, v26.8b + + sqrshrun v16.4h, v30.4s, #10 + + mov v6.16b, v2.16b + mov v7.16b, v3.16b + + sqrshrun v17.4h, v22.4s, #10 + + mov v2.16b, 
v10.16b + mov v3.16b, v11.16b + + mov v10.16b, v0.16b + mov v11.16b, v1.16b + + subs x4, x4, #4 + uqxtn v13.8b, v16.8h + + mov v0.16b, v8.16b + mov v1.16b, v9.16b + + mov v8.16b, v4.16b + mov v9.16b, v5.16b + + + mov v4.16b, v10.16b + mov v5.16b, v11.16b + + + st1 {v12.s}[0], [x1], x3 // store row 2 + st1 {v13.s}[0], [x1], x3 // store row 3 + + bgt loop_4 + +end_func: + //Restoring registers from stack + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s new file mode 100755 index 0000000..3737e3f --- /dev/null +++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s @@ -0,0 +1,1120 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +//*/ +///** +//****************************************************************************** +//* @file +//* ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s +//* +//* @brief +//* Contains function definitions for inter prediction interpolation. 
+//* +//* @author +//* Mohit +//* +//* @par List of Functions: +//* +//* - ih264_inter_pred_luma_horz_hpel_vert_qpel_av8() +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ + +///* All the functions here are replicated from ih264_inter_pred_filters.c +// + +///** +///** +///** +//******************************************************************************* +//* +//* @brief +//* This function implements a two stage cascaded six tap filter. It +//* applies the six tap filter in the horizontal direction on the +//* predictor values, followed by applying the same filter in the +//* vertical direction on the output of the first stage. It then averages +//* the output of the 1st stage and the output of the 2nd stage to obtain +//* the quarter pel values. The six tap filtering operation is described +//* in sec 8.4.2.2.1 titled "Luma sample interpolation process". +//* +//* @par Description: +//* This function is called to obtain pixels lying at the following +//* location (1/2,1/4) or (1/2,3/4). The function interpolates +//* the predictors first in the horizontal direction and then in the +//* vertical direction to output the (1/2,1/2). It then averages +//* the output of the 2nd stage and (1/2,1/2) value to obtain (1/2,1/4) +//* or (1/2,3/4) depending on the offset. 
+//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ht +//* integer height of the array +//* +//* @param[in] wd +//* integer width of the array +//* +//* @param[in] pu1_tmp: temporary buffer +//* +//* @param[in] dydx: x and y reference offset for qpel calculations +//* +//* @returns +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/; + +//void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd,, +// WORD32 dst_strd, +// WORD32 ht, +// WORD32 wd, +// UWORD8* pu1_tmp, +// UWORD32 dydx) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ht +// x5 => wd +// x7 => dydx +// x9 => *pu1_tmp + +.text +.p2align 2 +.include "ih264_neon_macros.s" + + + + .global ih264_inter_pred_luma_horz_hpel_vert_qpel_av8 + +ih264_inter_pred_luma_horz_hpel_vert_qpel_av8: + + + // store register values to stack + push_v_regs + stp x19, x20, [sp, #-16]! 
+ + + + sub x0, x0, x2, lsl #1 // pu1_src-2*src_strd + sub x0, x0, #2 // pu1_src-2 + + mov x9, x6 + + lsr x7, x7, #3 // dydx >> 2 followed by dydx & 0x3 and dydx>>1 to obtain the deciding bit + + add x7, x7, #2 + mov x6, #48 + madd x7, x7, x6, x9 + + subs x12, x5, #4 //if wd=4 branch to loop_4 + beq loop_4_start + + subs x12, x5, #8 //if wd=8 branch to loop_8 + beq loop_8_start + + //when wd=16 + movi v22.8h, #20 // Filter coeff 0x14 into Q11 + movi v24.8h, #5 // Filter coeff 0x5 into Q12 + add x8, x0, #8 + add x14, x1, #8 + add x10, x9, #8 + mov x12, x4 + add x11, x7, #8 +loop_16_lowhalf_start: + ld1 {v0.2s, v1.2s}, [x0], x2 // row -2 load for horizontal filter + ext v5.8b, v0.8b , v1.8b , #5 + uaddl v6.8h, v0.8b, v5.8b + + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v8.8h, v2.8b, v3.8b + ext v4.8b, v0.8b , v1.8b , #4 + mla v6.8h, v8.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v8.8h, v1.8b, v4.8b + ld1 {v0.2s, v1.2s}, [x0], x2 // row -1 load for horizontal filter + mls v6.8h, v8.8h , v24.8h + ext v5.8b, v0.8b , v1.8b , #5 + uaddl v8.8h, v0.8b, v5.8b + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v10.8h, v2.8b, v3.8b + + st1 {v6.4s}, [x9], x6 // store temp buffer 0 + + ext v4.8b, v0.8b , v1.8b , #4 + mla v8.8h, v10.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v10.8h, v1.8b, v4.8b + ld1 {v0.2s, v1.2s}, [x0], x2 // row 0 load for horizontal filter + mls v8.8h, v10.8h , v24.8h + ext v5.8b, v0.8b , v1.8b , #5 + uaddl v10.8h, v0.8b, v5.8b + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v12.8h, v2.8b, v3.8b + + st1 {v8.4s}, [x9], x6 // store temp buffer 1 + + ext v4.8b, v0.8b , v1.8b , #4 + mla v10.8h, v12.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v12.8h, v1.8b, v4.8b + ld1 {v0.2s, v1.2s}, [x0], x2 // row 1 load for horizontal filter + mls v10.8h, v12.8h , v24.8h + ext v5.8b, v0.8b , v1.8b , #5 + uaddl v12.8h, v0.8b, v5.8b + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b 
, v1.8b , #3 + uaddl v14.8h, v2.8b, v3.8b + + st1 {v10.4s}, [x9], x6 // store temp buffer 2 + + ext v4.8b, v0.8b , v1.8b , #4 + mla v12.8h, v14.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v14.8h, v1.8b, v4.8b + ld1 {v0.2s, v1.2s}, [x0], x2 // row 2 load for horizontal filter + mls v12.8h, v14.8h , v24.8h + ext v5.8b, v0.8b , v1.8b , #5 + uaddl v14.8h, v0.8b, v5.8b + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v16.8h, v2.8b, v3.8b + + st1 {v12.4s}, [x9], x6 // store temp buffer 3 + + ext v4.8b, v0.8b , v1.8b , #4 + mla v14.8h, v16.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v16.8h, v1.8b, v4.8b + + mls v14.8h, v16.8h , v24.8h +loop_16_lowhalf: + + ld1 {v0.2s, v1.2s}, [x0], x2 // row 3 load for horizontal filter + ext v5.8b, v0.8b , v1.8b , #5 + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v16.8h, v0.8b, v5.8b + + st1 {v14.4s}, [x9], x6 // store temp buffer 4 + + uaddl v18.8h, v2.8b, v3.8b + ext v4.8b, v0.8b , v1.8b , #4 + mla v16.8h, v18.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + add v28.8h, v8.8h , v14.8h + uaddl v18.8h, v1.8b, v4.8b + add v30.8h, v10.8h , v12.8h + mls v16.8h, v18.8h , v24.8h + ld1 {v0.2s, v1.2s}, [x0], x2 // row 4 load for hoorizontal filter + ext v5.8b, v0.8b , v1.8b , #5 + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v20.8h, v0.8b, v5.8b + + st1 {v16.4s}, [x9], x6 // store temp buffer x5 + + saddl v18.4s, v6.4h, v16.4h + + ld1 {v26.4s}, [x7], x6 // load from temp buffer 0 + + saddl2 v6.4s, v6.8h, v16.8h + + sqrshrun v26.8b, v26.8h, #5 + + smlal v18.4s, v30.4h, v22.4h + smlsl v18.4s, v28.4h, v24.4h + smlal2 v6.4s, v30.8h, v22.8h + smlsl2 v6.4s, v28.8h, v24.8h + uaddl v2.8h, v2.8b, v3.8b + ext v4.8b, v0.8b , v1.8b , #4 + mla v20.8h, v2.8h , v22.8h + sqrshrun v18.4h, v18.4s, #10 + ext v1.8b, v0.8b , v1.8b , #1 + sqrshrun v19.4h, v6.4s, #10 + add v28.8h, v10.8h , v16.8h + uaddl v2.8h, v1.8b, v4.8b + add v30.8h, v12.8h , v14.8h + mls v20.8h, v2.8h , v24.8h 
+ + uqxtn v18.8b, v18.8h + uqxtn v19.8b, v19.8h + mov v18.2s[1], v19.2s[0] + + ld1 {v0.2s, v1.2s}, [x0], x2 // row 5 load for horizontal filter + + urhadd v26.8b, v18.8b , v26.8b + + ext v5.8b, v0.8b , v1.8b , #5 + ext v2.8b, v0.8b , v1.8b , #2 + + st1 {v20.4s}, [x9], x6 // store temp buffer x6 + + saddl v18.4s, v8.4h, v20.4h + + saddl2 v6.4s, v8.8h, v20.8h + + ld1 {v8.4s}, [x7], x6 //load from temp buffer 1 + + + st1 {v26.2s}, [x1], x3 // store row 0 + + smlal v18.4s, v30.4h, v22.4h + smlsl v18.4s, v28.4h, v24.4h + smlal2 v6.4s, v30.8h, v22.8h + smlsl2 v6.4s, v28.8h, v24.8h + + sqrshrun v28.8b, v8.8h, #5 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v8.8h, v0.8b, v5.8b + uaddl v2.8h, v2.8b, v3.8b + sqrshrun v18.4h, v18.4s, #10 + ext v4.8b, v0.8b , v1.8b , #4 + sqrshrun v19.4h, v6.4s, #10 + mla v8.8h, v2.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + add v26.8h, v12.8h , v20.8h + uaddl v2.8h, v1.8b, v4.8b + uqxtn v18.8b, v18.8h + uqxtn v19.8b, v19.8h + mov v18.2s[1], v19.2s[0] + add v30.8h, v14.8h , v16.8h + mls v8.8h, v2.8h , v24.8h + ld1 {v0.2s, v1.2s}, [x0], x2 // row 6 load for horizontal filter + + urhadd v28.8b, v28.8b , v18.8b + + ext v5.8b, v0.8b , v1.8b , #5 + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + + st1 {v28.2s}, [x1], x3 // store row 1 + + uaddl v28.8h, v0.8b, v5.8b + + st1 {v8.4s}, [x9], x6 // store temp buffer x7 + + saddl v18.4s, v10.4h, v8.4h + saddl2 v6.4s, v10.8h, v8.8h + + ld1 {v10.4s}, [x7], x6 // load from temp buffer 2 + + smlal v18.4s, v30.4h, v22.4h + smlsl v18.4s, v26.4h, v24.4h + + smlal2 v6.4s, v30.8h, v22.8h + smlsl2 v6.4s, v26.8h, v24.8h + + sqrshrun v26.8b, v10.8h, #5 + + uaddl v2.8h, v2.8b, v3.8b + ext v4.8b, v0.8b , v1.8b , #4 + mla v28.8h, v2.8h , v22.8h + sqrshrun v18.4h, v18.4s, #10 + ext v1.8b, v0.8b , v1.8b , #1 + sqrshrun v19.4h, v6.4s, #10 + add v10.8h, v14.8h , v8.8h + uaddl v2.8h, v1.8b, v4.8b + add v30.8h, v16.8h , v20.8h + mls v28.8h, v2.8h , v24.8h + uqxtn v27.8b, v18.8h + uqxtn v19.8b, v19.8h + mov 
v27.2s[1], v19.2s[0] + saddl v18.4s, v12.4h, v28.4h + saddl2 v6.4s, v12.8h, v28.8h + + urhadd v26.8b, v26.8b , v27.8b + + smlal v18.4s, v30.4h, v22.4h + smlsl v18.4s, v10.4h, v24.4h + smlal2 v6.4s, v30.8h, v22.8h + smlsl2 v6.4s, v10.8h, v24.8h + + st1 {v26.2s}, [x1], x3 // store row 2 + + st1 {v28.2s, v29.2s}, [x9] + + + sqrshrun v18.4h, v18.4s, #10 + + mov v10.16b, v20.16b + mov v11.16b, v21.16b + ld1 {v30.4s}, [x7], x6 // load from temp buffer 3 + + sqrshrun v19.4h, v6.4s, #10 + subs x4, x4, #4 + + sqrshrun v30.8b, v30.8h, #5 + + uqxtn v18.8b, v18.8h + uqxtn v19.8b, v19.8h + mov v18.2s[1], v19.2s[0] + + mov v12.16b, v8.16b + mov v13.16b, v9.16b + mov v6.16b, v14.16b + mov v7.16b, v15.16b + + urhadd v30.8b, v18.8b , v30.8b + + mov v8.16b, v16.16b + mov v9.16b, v17.16b + mov v14.16b, v28.16b + mov v15.16b, v29.16b + + st1 {v30.2s}, [x1], x3 // store row 3 + + bgt loop_16_lowhalf // looping if height =16 + + +loop_16_highhalf_start: + ld1 {v0.2s, v1.2s}, [x8], x2 + ext v5.8b, v0.8b , v1.8b , #5 + uaddl v6.8h, v0.8b, v5.8b + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v8.8h, v2.8b, v3.8b + ext v4.8b, v0.8b , v1.8b , #4 + mla v6.8h, v8.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v8.8h, v1.8b, v4.8b + ld1 {v0.2s, v1.2s}, [x8], x2 + mls v6.8h, v8.8h , v24.8h + ext v5.8b, v0.8b , v1.8b , #5 + uaddl v8.8h, v0.8b, v5.8b + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v10.8h, v2.8b, v3.8b + + st1 {v6.4s}, [x10], x6 + + ext v4.8b, v0.8b , v1.8b , #4 + mla v8.8h, v10.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v10.8h, v1.8b, v4.8b + ld1 {v0.2s, v1.2s}, [x8], x2 + mls v8.8h, v10.8h , v24.8h + ext v5.8b, v0.8b , v1.8b , #5 + uaddl v10.8h, v0.8b, v5.8b + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v12.8h, v2.8b, v3.8b + + st1 {v8.4s}, [x10], x6 + + ext v4.8b, v0.8b , v1.8b , #4 + mla v10.8h, v12.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v12.8h, v1.8b, v4.8b + ld1 {v0.2s, v1.2s}, 
[x8], x2 + mls v10.8h, v12.8h , v24.8h + ext v5.8b, v0.8b , v1.8b , #5 + uaddl v12.8h, v0.8b, v5.8b + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v14.8h, v2.8b, v3.8b + + st1 {v10.4s}, [x10], x6 + + ext v4.8b, v0.8b , v1.8b , #4 + mla v12.8h, v14.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v14.8h, v1.8b, v4.8b + ld1 {v0.2s, v1.2s}, [x8], x2 + mls v12.8h, v14.8h , v24.8h + ext v5.8b, v0.8b , v1.8b , #5 + uaddl v14.8h, v0.8b, v5.8b + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v16.8h, v2.8b, v3.8b + + st1 {v12.4s}, [x10], x6 + + ext v4.8b, v0.8b , v1.8b , #4 + mla v14.8h, v16.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v16.8h, v1.8b, v4.8b + + mls v14.8h, v16.8h , v24.8h + +loop_16_highhalf: + + ld1 {v0.2s, v1.2s}, [x8], x2 + ext v5.8b, v0.8b , v1.8b , #5 + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v16.8h, v0.8b, v5.8b + + st1 {v14.4s}, [x10], x6 + + uaddl v18.8h, v2.8b, v3.8b + ext v4.8b, v0.8b , v1.8b , #4 + mla v16.8h, v18.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + add v28.8h, v8.8h , v14.8h + uaddl v18.8h, v1.8b, v4.8b + add v30.8h, v10.8h , v12.8h + mls v16.8h, v18.8h , v24.8h + ld1 {v0.2s, v1.2s}, [x8], x2 + ext v5.8b, v0.8b , v1.8b , #5 + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v20.8h, v0.8b, v5.8b + + st1 {v16.4s}, [x10], x6 + + saddl v18.4s, v6.4h, v16.4h + + ld1 {v26.4s}, [x11], x6 + + saddl2 v6.4s, v6.8h, v16.8h + + sqrshrun v26.8b, v26.8h, #5 + + smlal v18.4s, v30.4h, v22.4h + smlsl v18.4s, v28.4h, v24.4h + smlal2 v6.4s, v30.8h, v22.8h + smlsl2 v6.4s, v28.8h, v24.8h + uaddl v2.8h, v2.8b, v3.8b + ext v4.8b, v0.8b , v1.8b , #4 + mla v20.8h, v2.8h , v22.8h + sqrshrun v18.4h, v18.4s, #10 + ext v1.8b, v0.8b , v1.8b , #1 + sqrshrun v19.4h, v6.4s, #10 + add v28.8h, v10.8h , v16.8h + uaddl v2.8h, v1.8b, v4.8b + add v30.8h, v12.8h , v14.8h + mls v20.8h, v2.8h , v24.8h + uqxtn v18.8b, v18.8h + uqxtn v19.8b, v19.8h + mov v18.2s[1], 
v19.2s[0] + ld1 {v0.2s, v1.2s}, [x8], x2 + + urhadd v26.8b, v18.8b , v26.8b + + ext v5.8b, v0.8b , v1.8b , #5 + ext v2.8b, v0.8b , v1.8b , #2 + + st1 {v20.4s}, [x10], x6 + + saddl v18.4s, v8.4h, v20.4h + saddl2 v6.4s, v8.8h, v20.8h + + ld1 {v8.4s}, [x11], x6 + + + st1 {v26.2s}, [x14], x3 //store row 0 + + smlal v18.4s, v30.4h, v22.4h + smlsl v18.4s, v28.4h, v24.4h + smlal2 v6.4s, v30.8h, v22.8h + smlsl2 v6.4s, v28.8h, v24.8h + sqrshrun v28.8b, v8.8h, #5 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v8.8h, v0.8b, v5.8b + uaddl v2.8h, v2.8b, v3.8b + sqrshrun v18.4h, v18.4s, #10 + ext v4.8b, v0.8b , v1.8b , #4 + sqrshrun v19.4h, v6.4s, #10 + mla v8.8h, v2.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + add v26.8h, v12.8h , v20.8h + uaddl v2.8h, v1.8b, v4.8b + uqxtn v18.8b, v18.8h + uqxtn v19.8b, v19.8h + mov v18.2s[1], v19.2s[0] + add v30.8h, v14.8h , v16.8h + mls v8.8h, v2.8h , v24.8h + ld1 {v0.2s, v1.2s}, [x8], x2 + + urhadd v28.8b, v28.8b , v18.8b + + ext v5.8b, v0.8b , v1.8b , #5 + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + + st1 {v28.2s}, [x14], x3 //store row 1 + + uaddl v28.8h, v0.8b, v5.8b + + st1 {v8.4s}, [x10], x6 + + saddl v18.4s, v10.4h, v8.4h + saddl2 v6.4s, v10.8h, v8.8h + + ld1 {v10.4s}, [x11], x6 + + smlal v18.4s, v30.4h, v22.4h + smlsl v18.4s, v26.4h, v24.4h + smlal2 v6.4s, v30.8h, v22.8h + smlsl2 v6.4s, v26.8h, v24.8h + + sqrshrun v26.8b, v10.8h, #5 + uaddl v2.8h, v2.8b, v3.8b + ext v4.8b, v0.8b , v1.8b , #4 + mla v28.8h, v2.8h , v22.8h + sqrshrun v18.4h, v18.4s, #10 + ext v1.8b, v0.8b , v1.8b , #1 + sqrshrun v19.4h, v6.4s, #10 + add v10.8h, v14.8h , v8.8h + uaddl v2.8h, v1.8b, v4.8b + add v30.8h, v16.8h , v20.8h + mls v28.8h, v2.8h , v24.8h + uqxtn v27.8b, v18.8h + uqxtn v19.8b, v19.8h + mov v27.2s[1], v19.2s[0] + + + saddl v18.4s, v12.4h, v28.4h + saddl2 v6.4s, v12.8h, v28.8h + + urhadd v26.8b, v26.8b , v27.8b + + smlal v18.4s, v30.4h, v22.4h + smlsl v18.4s, v10.4h, v24.4h + smlal2 v6.4s, v30.8h, v22.8h + smlsl2 v6.4s, v10.8h, v24.8h + 
+ st1 {v26.2s}, [x14], x3 // store row 2 + + st1 {v28.4s}, [x10] + + sqrshrun v18.4h, v18.4s, #10 + mov v10.16b, v20.16b + mov v11.16b, v21.16b + ld1 {v30.4s}, [x11], x6 + + sqrshrun v19.4h, v6.4s, #10 + subs x12, x12, #4 + + sqrshrun v30.8b, v30.8h, #5 + + uqxtn v18.8b, v18.8h + uqxtn v19.8b, v19.8h + mov v18.2s[1], v19.2s[0] + + mov v12.16b, v8.16b + mov v13.16b, v9.16b + mov v6.16b, v14.16b + mov v7.16b, v15.16b + urhadd v30.8b, v18.8b , v30.8b + + mov v8.16b, v16.16b + mov v9.16b, v17.16b + mov v14.16b, v28.16b + mov v15.16b, v29.16b + st1 {v30.2s}, [x14], x3 // store row 3 + + bgt loop_16_highhalf // looping if height = 8 or 16 + b end_func + +loop_8_start: + + movi v22.8h, #0x14 // Filter coeff 20 into Q11 + movi v24.8h, #5 // Filter coeff 5 into Q12 + ld1 {v0.2s, v1.2s}, [x0], x2 // row -2 load for horizontal filter + ext v5.8b, v0.8b , v1.8b , #5 + uaddl v6.8h, v0.8b, v5.8b + + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v8.8h, v2.8b, v3.8b + ext v4.8b, v0.8b , v1.8b , #4 + mla v6.8h, v8.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v8.8h, v1.8b, v4.8b + ld1 {v0.2s, v1.2s}, [x0], x2 // row -1 load for horizontal filter + mls v6.8h, v8.8h , v24.8h + ext v5.8b, v0.8b , v1.8b , #5 + uaddl v8.8h, v0.8b, v5.8b + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v10.8h, v2.8b, v3.8b + + st1 {v6.4s}, [x9], x6 // store temp buffer 0 + + ext v4.8b, v0.8b , v1.8b , #4 + mla v8.8h, v10.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v10.8h, v1.8b, v4.8b + ld1 {v0.2s, v1.2s}, [x0], x2 // row 0 load for horizontal filter + mls v8.8h, v10.8h , v24.8h + ext v5.8b, v0.8b , v1.8b , #5 + uaddl v10.8h, v0.8b, v5.8b + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v12.8h, v2.8b, v3.8b + + st1 {v8.4s}, [x9], x6 // store temp buffer 1 + + ext v4.8b, v0.8b , v1.8b , #4 + mla v10.8h, v12.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v12.8h, v1.8b, v4.8b + ld1 {v0.2s, v1.2s}, [x0], x2 // row 1 load 
for horizontal filter + mls v10.8h, v12.8h , v24.8h + ext v5.8b, v0.8b , v1.8b , #5 + uaddl v12.8h, v0.8b, v5.8b + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v14.8h, v2.8b, v3.8b + + st1 {v10.4s}, [x9], x6 // store temp buffer 2 + + ext v4.8b, v0.8b , v1.8b , #4 + mla v12.8h, v14.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v14.8h, v1.8b, v4.8b + ld1 {v0.2s, v1.2s}, [x0], x2 // row 2 load for horizontal filter + mls v12.8h, v14.8h , v24.8h + ext v5.8b, v0.8b , v1.8b , #5 + uaddl v14.8h, v0.8b, v5.8b + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v16.8h, v2.8b, v3.8b + + st1 {v12.4s}, [x9], x6 // store temp buffer 3 + + ext v4.8b, v0.8b , v1.8b , #4 + mla v14.8h, v16.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v16.8h, v1.8b, v4.8b + + mls v14.8h, v16.8h , v24.8h +loop_8: + + ld1 {v0.2s, v1.2s}, [x0], x2 // row 3 load for horizontal filter + ext v5.8b, v0.8b , v1.8b , #5 + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v16.8h, v0.8b, v5.8b + + st1 {v14.4s}, [x9], x6 // store temp buffer 4 + + uaddl v18.8h, v2.8b, v3.8b + ext v4.8b, v0.8b , v1.8b , #4 + mla v16.8h, v18.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + add v28.8h, v8.8h , v14.8h + uaddl v18.8h, v1.8b, v4.8b + add v30.8h, v10.8h , v12.8h + mls v16.8h, v18.8h , v24.8h + ld1 {v0.2s, v1.2s} , [x0], x2 // row 4 load for hoorizontal filter + ext v5.8b, v0.8b , v1.8b , #5 + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v20.8h, v0.8b, v5.8b + + st1 {v16.4s}, [x9], x6 // store temp buffer x5 + + saddl v18.4s, v6.4h, v16.4h + + ld1 {v26.4s}, [x7], x6 // load from temp buffer 0 + + saddl2 v6.4s, v6.8h, v16.8h + + sqrshrun v26.8b, v26.8h, #5 + + smlal v18.4s, v30.4h, v22.4h + smlsl v18.4s, v28.4h, v24.4h + smlal2 v6.4s, v30.8h, v22.8h + smlsl2 v6.4s, v28.8h, v24.8h + uaddl v2.8h, v2.8b, v3.8b + ext v4.8b, v0.8b , v1.8b , #4 + mla v20.8h, v2.8h , v22.8h + sqrshrun v18.4h, v18.4s, #10 + ext v1.8b, v0.8b , 
v1.8b , #1 + sqrshrun v19.4h, v6.4s, #10 + add v28.8h, v10.8h , v16.8h + uaddl v2.8h, v1.8b, v4.8b + add v30.8h, v12.8h , v14.8h + mls v20.8h, v2.8h , v24.8h + + uqxtn v18.8b, v18.8h + uqxtn v19.8b, v19.8h + mov v18.2s[1], v19.2s[0] + + ld1 {v0.2s, v1.2s}, [x0], x2 // row 5 load for horizontal filter + + urhadd v26.8b, v18.8b , v26.8b + + ext v5.8b, v0.8b , v1.8b , #5 + ext v2.8b, v0.8b , v1.8b , #2 + + st1 {v20.4s}, [x9], x6 // store temp buffer x6 + + saddl v18.4s, v8.4h, v20.4h + + saddl2 v6.4s, v8.8h, v20.8h + + ld1 {v8.4s}, [x7], x6 //load from temp buffer 1 + + + st1 {v26.2s}, [x1], x3 // store row 0 + + smlal v18.4s, v30.4h, v22.4h + smlsl v18.4s, v28.4h, v24.4h + + + + smlal2 v6.4s, v30.8h, v22.8h + smlsl2 v6.4s, v28.8h, v24.8h + + sqrshrun v28.8b, v8.8h, #5 + + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v8.8h, v0.8b, v5.8b + uaddl v2.8h, v2.8b, v3.8b + sqrshrun v18.4h, v18.4s, #10 + ext v4.8b, v0.8b , v1.8b , #4 + sqrshrun v19.4h, v6.4s, #10 + mla v8.8h, v2.8h , v22.8h + ext v1.8b, v0.8b , v1.8b , #1 + add v26.8h, v12.8h , v20.8h + uaddl v2.8h, v1.8b, v4.8b + + + uqxtn v18.8b, v18.8h + uqxtn v19.8b, v19.8h + mov v18.2s[1], v19.2s[0] + + add v30.8h, v14.8h , v16.8h + mls v8.8h, v2.8h , v24.8h + ld1 {v0.2s, v1.2s}, [x0], x2 // row 6 load for horizontal filter + + urhadd v28.8b, v28.8b , v18.8b + + ext v5.8b, v0.8b , v1.8b , #5 + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + + st1 {v28.2s}, [x1], x3 // store row 1 + + uaddl v28.8h, v0.8b, v5.8b + + st1 {v8.4s}, [x9], x6 // store temp buffer x7 + + saddl v18.4s, v10.4h, v8.4h + saddl2 v6.4s, v10.8h, v8.8h + + ld1 {v10.4s}, [x7], x6 // load from temp buffer 2 + + smlal v18.4s, v30.4h, v22.4h + smlsl v18.4s, v26.4h, v24.4h + smlal2 v6.4s, v30.8h, v22.8h + smlsl2 v6.4s, v26.8h, v24.8h + + sqrshrun v26.8b, v10.8h, #5 + uaddl v2.8h, v2.8b, v3.8b + ext v4.8b, v0.8b , v1.8b , #4 + mla v28.8h, v2.8h , v22.8h + sqrshrun v18.4h, v18.4s, #10 + ext v1.8b, v0.8b , v1.8b , #1 + sqrshrun v19.4h, v6.4s, #10 + add 
v10.8h, v14.8h , v8.8h + uaddl v2.8h, v1.8b, v4.8b + add v30.8h, v16.8h , v20.8h + mls v28.8h, v2.8h , v24.8h + + uqxtn v27.8b, v18.8h + uqxtn v19.8b, v19.8h + + mov v27.2s[1], v19.2s[0] + + saddl v18.4s, v12.4h, v28.4h + saddl2 v6.4s, v12.8h, v28.8h + + urhadd v26.8b, v26.8b , v27.8b + + smlal v18.4s, v30.4h, v22.4h + smlsl v18.4s, v10.4h, v24.4h + smlal2 v6.4s, v30.8h, v22.8h + smlsl2 v6.4s, v10.8h, v24.8h + + st1 {v26.2s}, [x1], x3 // store row 2 + + st1 {v28.2s, v29.2s}, [x9] + + + sqrshrun v18.4h, v18.4s, #10 + mov v10.16b, v20.16b + mov v11.16b, v21.16b + ld1 {v30.4s}, [x7], x6 // load from temp buffer 3 + + sqrshrun v19.4h, v6.4s, #10 + subs x4, x4, #4 + + sqrshrun v30.8b, v30.8h, #5 + + + uqxtn v18.8b, v18.8h + uqxtn v19.8b, v19.8h + mov v18.2s[1], v19.2s[0] + + + mov v12.16b, v8.16b + mov v13.16b, v9.16b + mov v6.16b, v14.16b + mov v7.16b, v15.16b + + urhadd v30.8b, v18.8b , v30.8b + mov v8.16b, v16.16b + mov v9.16b, v17.16b + mov v14.16b, v28.16b + mov v15.16b, v29.16b + st1 {v30.2s}, [x1], x3 // store row 3 + + bgt loop_8 //if height =8 or 16 loop + b end_func + +loop_4_start: + movi v22.8h, #20 // Filter coeff 20 into D22 + movi v23.8h, #5 // Filter coeff 5 into D23 + + ld1 {v0.2s, v1.2s}, [x0], x2 //row -2 load + ext v5.8b, v0.8b , v1.8b , #5 + uaddl v6.8h, v0.8b, v5.8b + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v8.8h, v2.8b, v3.8b + ext v4.8b, v0.8b , v1.8b , #4 + mla v6.4h, v8.4h , v22.4h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v8.8h, v1.8b, v4.8b + ld1 {v0.2s, v1.2s}, [x0], x2 // row -1 load + mls v6.4h, v8.4h , v23.4h + ext v5.8b, v0.8b , v1.8b , #5 + uaddl v8.8h, v0.8b, v5.8b + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v10.8h, v2.8b, v3.8b + + st1 {v6.2s}, [x9], x6 // store temp buffer 0 + + ext v4.8b, v0.8b , v1.8b , #4 + mla v8.4h, v10.4h , v22.4h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v10.8h, v1.8b, v4.8b + ld1 {v0.2s, v1.2s}, [x0], x2 // row 0 load + mls v8.4h, v10.4h , v23.4h + ext 
v5.8b, v0.8b , v1.8b , #5 + uaddl v10.8h, v0.8b, v5.8b + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v12.8h, v2.8b, v3.8b + + st1 {v8.2s}, [x9], x6 // store temp buffer 1 + + ext v4.8b, v0.8b , v1.8b , #4 + mla v10.4h, v12.4h , v22.4h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v12.8h, v1.8b, v4.8b + ld1 {v0.2s, v1.2s}, [x0], x2 // row 1 load + mls v10.4h, v12.4h , v23.4h + ext v5.8b, v0.8b , v1.8b , #5 + uaddl v12.8h, v0.8b, v5.8b + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v14.8h, v2.8b, v3.8b + + st1 {v10.2s}, [x9], x6 // store temp buffer 2 + + ext v4.8b, v0.8b , v1.8b , #4 + mla v12.4h, v14.4h , v22.4h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v14.8h, v1.8b, v4.8b + ld1 {v0.2s, v1.2s}, [x0], x2 // row 2 load + mls v12.4h, v14.4h , v23.4h + ext v5.8b, v0.8b , v1.8b , #5 + uaddl v14.8h, v0.8b, v5.8b + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v16.8h, v2.8b, v3.8b + ext v4.8b, v0.8b , v1.8b , #4 + mla v14.4h, v16.4h , v22.4h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v16.8h, v1.8b, v4.8b + + st1 {v12.2s}, [x9], x6 // store temp buffer 3 + + mls v14.4h, v16.4h , v23.4h + +loop_4: + + ld1 {v0.2s, v1.2s}, [x0], x2 // row 3 load + ext v5.8b, v0.8b , v1.8b , #5 + uaddl v16.8h, v0.8b, v5.8b + ext v2.8b, v0.8b , v1.8b , #2 + ext v3.8b, v0.8b , v1.8b , #3 + uaddl v18.8h, v2.8b, v3.8b + st1 {v14.2s}, [x9], x6 // store temp buffer 4 + ext v4.8b, v0.8b , v1.8b , #4 + mla v16.4h, v18.4h , v22.4h + ext v1.8b, v0.8b , v1.8b , #1 + uaddl v18.8h, v1.8b, v4.8b + add v2.4h, v10.4h , v12.4h + mls v16.4h, v18.4h , v23.4h + add v3.4h, v8.4h , v14.4h + ld1 {v18.2s, v19.2s}, [x0], x2 // row 4 load + ext v25.8b, v18.8b , v19.8b , #5 + uaddl v26.8h, v18.8b, v25.8b + ext v20.8b, v18.8b , v19.8b , #2 + + st1 {v16.2s}, [x9], x6 // store temp buffer 5 + + saddl v0.4s, v6.4h, v16.4h + smlal v0.4s, v2.4h, v22.4h + ext v21.8b, v18.8b , v19.8b , #3 + uaddl v28.8h, v20.8b, v21.8b + ext v24.8b, v18.8b , v19.8b , #4 + smlsl 
v0.4s, v3.4h, v23.4h + mla v26.4h, v28.4h , v22.4h + ext v19.8b, v18.8b , v19.8b , #1 + uaddl v28.8h, v19.8b, v24.8b + add v2.4h, v12.4h , v14.4h + mls v26.4h, v28.4h , v23.4h + sqrshrun v0.4h, v0.4s, #0xa + add v3.4h, v10.4h , v16.4h + ld1 {v18.2s, v19.2s}, [x0], x2 // row 5 load + ext v25.8b, v18.8b , v19.8b , #5 + uqxtn v11.8b, v0.8h + uaddl v28.8h, v18.8b, v25.8b + + st1 {v26.2s}, [x9], x6 // store temp buffer 6 + + //Q3 available here + ld1 {v6.2s}, [x7], x6 // load from temp buffer 0 + ld1 {v7.2s}, [x7], x6 // load from temp buffer 1 + + sqrshrun v9.8b, v6.8h, #5 + sqrshrun v7.8b, v7.8h, #5 + mov v9.2s[1], v7.2s[0] + + ext v20.8b, v18.8b , v19.8b , #2 + + saddl v0.4s, v8.4h, v26.4h + smlal v0.4s, v2.4h, v22.4h + ext v21.8b, v18.8b , v19.8b , #3 + uaddl v6.8h, v20.8b, v21.8b + ext v24.8b, v18.8b , v19.8b , #4 + smlsl v0.4s, v3.4h, v23.4h + mla v28.4h, v6.4h , v22.4h + ext v19.8b, v18.8b , v19.8b , #1 + uaddl v6.8h, v19.8b, v24.8b + add v2.4h, v14.4h , v16.4h + mls v28.4h, v6.4h , v23.4h + sqrshrun v0.4h, v0.4s, #0xa + add v3.4h, v12.4h , v26.4h + ld1 {v18.2s, v19.2s}, [x0], x2 // row 6 load + ext v25.8b, v18.8b , v19.8b , #5 + uqxtn v13.8b, v0.8h + + trn1 v11.2s, v11.2s, v13.2s + trn2 v13.2s, v11.2s, v13.2s + saddl v0.4s, v10.4h, v28.4h + urhadd v9.8b, v9.8b , v11.8b + + st1 {v28.2s}, [x9], x6 // store temp buffer 7 + + smlal v0.4s, v2.4h, v22.4h + uaddl v30.8h, v18.8b, v25.8b + + st1 {v9.s}[0], [x1], x3 // store row 0 + + ext v20.8b, v18.8b , v19.8b , #2 + + st1 {v9.s}[1], [x1], x3 // store row 1 + + ext v21.8b, v18.8b , v19.8b , #3 + smlsl v0.4s, v3.4h, v23.4h + uaddl v8.8h, v20.8b, v21.8b + ext v24.8b, v18.8b , v19.8b , #4 + mla v30.4h, v8.4h , v22.4h + ext v19.8b, v18.8b , v19.8b , #1 + uaddl v8.8h, v19.8b, v24.8b + sqrshrun v0.4h, v0.4s, #0xa + add v2.4h, v16.4h , v26.4h + mls v30.4h, v8.4h , v23.4h + uqxtn v4.8b, v0.8h + + add v3.4h, v14.4h , v28.4h + + + saddl v0.4s, v12.4h, v30.4h + + st1 {v30.2s}, [x9] + + smlal v0.4s, v2.4h, v22.4h + + ld1 {v8.2s}, 
[x7], x6 // load from temp buffer 2 + ld1 {v9.2s}, [x7], x6 // load from temp buffer 3 + smlsl v0.4s, v3.4h, v23.4h + subs x4, x4, #4 + + sqrshrun v10.8b, v8.8h, #5 + sqrshrun v9.8b, v9.8h, #5 + mov v10.2s[1], v9.2s[0] + + mov v12.8b, v28.8b + + sqrshrun v0.4h, v0.4s, #0xa + mov v6.8b, v14.8b + mov v8.8b, v16.8b + + uqxtn v5.8b, v0.8h + + trn1 v4.2s, v4.2s, v5.2s + trn2 v5.2s, v4.2s, v5.2s + urhadd v4.8b, v4.8b , v10.8b + mov v10.8b, v26.8b + mov v14.8b, v30.8b + + st1 {v4.s}[0], [x1], x3 // store row 2 + st1 {v4.s}[1], [x1], x3 // store row 3 + + bgt loop_4 + +end_func: + //Restoring registers from stack + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s new file mode 100755 index 0000000..39e3253 --- /dev/null +++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s @@ -0,0 +1,597 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +//*/ +///** +//****************************************************************************** +//* @file +//* ih264_inter_pred_luma_horz_qpel_av8.s +//* +//* @brief +//* Contains function definitions for inter prediction horizontal quarter pel interpolation. +//* +//* @author +//* Mohit +//* +//* @par List of Functions: +//* +//* - ih264_inter_pred_luma_horz_qpel_av8() +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ + +///* All the functions here are replicated from ih264_inter_pred_filters.c +// + +///** +///** +//******************************************************************************* +//* +//* @brief +//* Quarter pel interprediction luma filter for horizontal input +//* +//* @par Description: +//* Applies a 6 tap horizontal filter .The output is clipped to 8 bits +//* sec 8.4.2.2.1 titled "Luma sample interpolation process" +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ht +//* integer height of the array +//* +//* @param[in] wd +//* integer width of the array +//* +// @param[in] pu1_tmp: temporary buffer: UNUSED in this function +//* +//* @param[in] dydx: x and y reference offset for qpel calculations. 
//* @returns
//*
// @remarks
//*  None
//*
//*******************************************************************************
//*/

//void ih264_inter_pred_luma_horz (
//                            UWORD8 *pu1_src,
//                            UWORD8 *pu1_dst,
//                            WORD32 src_strd,
//                            WORD32 dst_strd,
//                            WORD32 ht,
//                            WORD32 wd,
//                            UWORD8* pu1_tmp,
//                            UWORD32 dydx)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    x2 => src_strd
//    x3 => dst_strd
//    x4 => ht
//    x5 => wd
//    x7 => dydx

.text
.p2align 2
.include "ih264_neon_macros.s"




    .global ih264_inter_pred_luma_horz_qpel_av8

// Horizontal quarter-pel luma interpolation.
// Each output pixel is the rounding average (urhadd) of
//   (a) the 6-tap horizontal half-pel value
//       (a0 - 5*a1 + 20*a2 + 20*a3 - 5*a4 + a5 + 16) >> 5, and
//   (b) the nearest full-pel pixel, read via x7 = pu1_src + ((dydx & 3) >> 1).
// v0 holds filter coefficient 5, v1 holds coefficient 20.
// Three code paths are selected on wd: loop_16 (wd=16), loop_8 (wd=8),
// loop_4 (wd=4); each iterates over the rows until ht is consumed.
ih264_inter_pred_luma_horz_qpel_av8:


    push_v_regs
    stp x19, x20, [sp, #-16]!


    and x7, x7, #3                      //Finds x-offset
    add x7, x0, x7, lsr #1              //pu1_src + (x_offset>>1)
    sub x0, x0, #2                      //pu1_src-2
    sub x14, x4, #16                    // x14 = ht - 16, used as the outer loop counter below
    movi v0.16b, #5                     //filter coeff
    subs x12, x5, #8                    //if wd=8 branch to loop_8
    movi v1.16b, #20                    //filter coeff

    beq loop_8

    subs x12, x5, #4                    //if wd=4 branch to loop_4
    beq loop_4

loop_16:                                //when wd=16
    //// Processing row0 and row1
    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row0
    add x14, x14, #1                    //for checking loop
    ext v31.8b, v2.8b , v3.8b , #5
    ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row1
    ext v30.8b, v3.8b , v4.8b , #5
    uaddl v8.8h, v31.8b, v2.8b          //// a0 + a5 (column1,row0)
    ext v28.8b, v5.8b , v6.8b , #5
    uaddl v10.8h, v30.8b, v3.8b         //// a0 + a5 (column2,row0)
    ext v27.8b, v6.8b , v7.8b , #5
    uaddl v14.8h, v28.8b, v5.8b         //// a0 + a5 (column1,row1)
    ext v31.8b, v2.8b , v3.8b , #2
    uaddl v16.8h, v27.8b, v6.8b         //// a0 + a5 (column2,row1)
    ext v30.8b, v3.8b , v4.8b , #2
    umlal v8.8h, v31.8b, v1.8b          //// a0 + a5 + 20a2 (column1,row0)
    ext v28.8b, v5.8b , v6.8b , #2
    umlal v10.8h, v30.8b, v1.8b         //// a0 + a5 + 20a2 (column2,row0)
    ext v27.8b, v6.8b , v7.8b , #2
    umlal v14.8h, v28.8b, v1.8b         //// a0 + a5 + 20a2 (column1,row1)
    ext v31.8b, v2.8b , v3.8b , #3
    umlal v16.8h, v27.8b, v1.8b         //// a0 + a5 + 20a2 (column2,row1)
    ext v30.8b, v3.8b , v4.8b , #3
    umlal v8.8h, v31.8b, v1.8b          //// a0 + a5 + 20a2 + 20a3 (column1,row0)
    ext v28.8b, v5.8b , v6.8b , #3
    umlal v10.8h, v30.8b, v1.8b         //// a0 + a5 + 20a2 + 20a3 (column2,row0)
    ext v27.8b, v6.8b , v7.8b , #3
    umlal v14.8h, v28.8b, v1.8b         //// a0 + a5 + 20a2 + 20a3 (column1,row1)
    ext v31.8b, v2.8b , v3.8b , #1
    umlal v16.8h, v27.8b, v1.8b         //// a0 + a5 + 20a2 + 20a3 (column2,row1)
    ext v30.8b, v3.8b , v4.8b , #1
    umlsl v8.8h, v31.8b, v0.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
    ext v28.8b, v5.8b , v6.8b , #1
    umlsl v10.8h, v30.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
    ext v27.8b, v6.8b , v7.8b , #1
    umlsl v14.8h, v28.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
    ext v31.8b, v2.8b , v3.8b , #4
    umlsl v16.8h, v27.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1)
    ext v30.8b, v3.8b , v4.8b , #4
    umlsl v8.8h, v31.8b, v0.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
    ext v28.8b, v5.8b , v6.8b , #4
    umlsl v10.8h, v30.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
    ext v27.8b, v6.8b , v7.8b , #4
    umlsl v14.8h, v28.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row2
    umlsl v16.8h, v27.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1)

    ld1 {v12.2s, v13.2s}, [x7], x2      //Load value for interpolation (column1,row0)
    sqrshrun v20.8b, v8.8h, #5          //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
    ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row3
    sqrshrun v21.8b, v10.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
    ext v31.8b, v2.8b , v3.8b , #5
    urhadd v20.16b, v12.16b , v20.16b   //Interpolation step for qpel calculation
    urhadd v21.16b, v13.16b , v21.16b   //Interpolation step for qpel calculation

    sqrshrun v18.8b, v14.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
    st1 {v20.8b, v21.8b}, [x1], x3      ////Store dest row0
    ext v30.8b, v3.8b , v4.8b , #5
    sqrshrun v19.8b, v16.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1)



    //// Processing row2 and row3
    ld1 {v12.2s, v13.2s}, [x7], x2      //Load value for interpolation (column1,row1)
    ext v28.8b, v5.8b , v6.8b , #5
    urhadd v18.16b, v12.16b , v18.16b   //Interpolation step for qpel calculation
    urhadd v19.16b, v13.16b , v19.16b   //Interpolation step for qpel calculation

    uaddl v8.8h, v31.8b, v2.8b          //// a0 + a5 (column1,row2)
    st1 {v18.8b, v19.8b}, [x1], x3      ////Store dest row1
    uaddl v10.8h, v30.8b, v3.8b         //// a0 + a5 (column2,row2)
    ext v27.8b, v6.8b , v7.8b , #5
    uaddl v14.8h, v28.8b, v5.8b         //// a0 + a5 (column1,row3)
    ext v31.8b, v2.8b , v3.8b , #2
    uaddl v16.8h, v27.8b, v6.8b         //// a0 + a5 (column2,row3)
    ext v30.8b, v3.8b , v4.8b , #2
    umlal v8.8h, v31.8b, v1.8b          //// a0 + a5 + 20a2 (column1,row2)
    ext v27.8b, v6.8b , v7.8b , #2
    umlal v10.8h, v30.8b, v1.8b         //// a0 + a5 + 20a2 (column2,row2)
    ext v28.8b, v5.8b , v6.8b , #2
    umlal v14.8h, v28.8b, v1.8b         //// a0 + a5 + 20a2 (column1,row3)
    ext v31.8b, v2.8b , v3.8b , #3
    umlal v16.8h, v27.8b, v1.8b         //// a0 + a5 + 20a2 (column2,row3)
    ext v30.8b, v3.8b , v4.8b , #3
    umlal v8.8h, v31.8b, v1.8b          //// a0 + a5 + 20a2 + 20a3 (column1,row2)
    ext v28.8b, v5.8b , v6.8b , #3
    umlal v10.8h, v30.8b, v1.8b         //// a0 + a5 + 20a2 + 20a3 (column2,row2)
    ext v27.8b, v6.8b , v7.8b , #3
    umlal v14.8h, v28.8b, v1.8b         //// a0 + a5 + 20a2 + 20a3 (column1,row3)
    ext v31.8b, v2.8b , v3.8b , #1
    umlal v16.8h, v27.8b, v1.8b         //// a0 + a5 + 20a2 + 20a3 (column2,row3)
    ext v30.8b, v3.8b , v4.8b , #1
    umlsl v8.8h, v31.8b, v0.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row2)
    ext v28.8b, v5.8b , v6.8b , #1
    umlsl v10.8h, v30.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row2)
    ext v27.8b, v6.8b , v7.8b , #1
    umlsl v14.8h, v28.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row3)

    ext v31.8b, v2.8b , v3.8b , #4
    umlsl v16.8h, v27.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row3)
    ext v30.8b, v3.8b , v4.8b , #4
    umlsl v8.8h, v31.8b, v0.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row2)
    ext v28.8b, v5.8b , v6.8b , #4
    umlsl v10.8h, v30.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row2)
    ext v27.8b, v6.8b , v7.8b , #4
    umlsl v14.8h, v28.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row3)
    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row4
    umlsl v16.8h, v27.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row3)

    ld1 {v12.2s, v13.2s}, [x7], x2      //Load value for interpolation (column1,row2)
    sqrshrun v20.8b, v8.8h, #5          //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row2)
    ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row5
    sqrshrun v21.8b, v10.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row2)
    ext v31.8b, v2.8b , v3.8b , #5
    urhadd v20.16b, v12.16b , v20.16b   //Interpolation step for qpel calculation
    urhadd v21.16b, v13.16b , v21.16b   //Interpolation step for qpel calculation

    sqrshrun v18.8b, v14.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row3)
    ext v30.8b, v3.8b , v4.8b , #5
    st1 {v20.8b, v21.8b}, [x1], x3      ////Store dest row2
    sqrshrun v19.8b, v16.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row3)
    ld1 {v12.2s, v13.2s}, [x7], x2      //Load value for interpolation (column1,row3)

    //// Processing row4 and row5
    ext v28.8b, v5.8b , v6.8b , #5
    urhadd v18.16b, v12.16b , v18.16b   //Interpolation step for qpel calculation
    urhadd v19.16b, v13.16b , v19.16b   //Interpolation step for qpel calculation

    uaddl v8.8h, v31.8b, v2.8b          //// a0 + a5 (column1,row4)
    st1 {v18.8b, v19.8b}, [x1], x3      ////Store dest row3
    uaddl v10.8h, v30.8b, v3.8b         //// a0 + a5 (column2,row4)
    ext v27.8b, v6.8b , v7.8b , #5
    uaddl v14.8h, v28.8b, v5.8b         //// a0 + a5 (column1,row5)
    ext v31.8b, v2.8b , v3.8b , #2
    uaddl v16.8h, v27.8b, v6.8b         //// a0 + a5 (column2,row5)
    ext v30.8b, v3.8b , v4.8b , #2
    umlal v8.8h, v31.8b, v1.8b          //// a0 + a5 + 20a2 (column1,row4)
    ext v27.8b, v6.8b , v7.8b , #2
    umlal v10.8h, v30.8b, v1.8b         //// a0 + a5 + 20a2 (column2,row4)
    ext v28.8b, v5.8b , v6.8b , #2
    umlal v14.8h, v28.8b, v1.8b         //// a0 + a5 + 20a2 (column1,row5)
    ext v31.8b, v2.8b , v3.8b , #3
    umlal v16.8h, v27.8b, v1.8b         //// a0 + a5 + 20a2 (column2,row5)
    ext v30.8b, v3.8b , v4.8b , #3
    umlal v8.8h, v31.8b, v1.8b          //// a0 + a5 + 20a2 + 20a3 (column1,row4)
    ext v28.8b, v5.8b , v6.8b , #3
    umlal v10.8h, v30.8b, v1.8b         //// a0 + a5 + 20a2 + 20a3 (column2,row4)
    ext v27.8b, v6.8b , v7.8b , #3
    umlal v14.8h, v28.8b, v1.8b         //// a0 + a5 + 20a2 + 20a3 (column1,row5)
    ext v31.8b, v2.8b , v3.8b , #1
    umlal v16.8h, v27.8b, v1.8b         //// a0 + a5 + 20a2 + 20a3 (column2,row5)
    ext v30.8b, v3.8b , v4.8b , #1
    umlsl v8.8h, v31.8b, v0.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row4)
    ext v28.8b, v5.8b , v6.8b , #1
    umlsl v10.8h, v30.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row4)
    ext v27.8b, v6.8b , v7.8b , #1
    umlsl v14.8h, v28.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row5)
    ext v31.8b, v2.8b , v3.8b , #4
    umlsl v16.8h, v27.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row5)
    ext v30.8b, v3.8b , v4.8b , #4
    umlsl v8.8h, v31.8b, v0.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row4)
    ext v28.8b, v5.8b , v6.8b , #4
    umlsl v10.8h, v30.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row4)
    ext v27.8b, v6.8b , v7.8b , #4
    umlsl v14.8h, v28.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row5)
    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row6
    umlsl v16.8h, v27.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row5)
    ld1 {v12.2s, v13.2s}, [x7], x2      //Load value for interpolation (column1,row4)
    sqrshrun v20.8b, v8.8h, #5          //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row4)
    ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row7
    sqrshrun v21.8b, v10.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row4)
    ext v31.8b, v2.8b , v3.8b , #5
    urhadd v20.16b, v12.16b , v20.16b   //Interpolation step for qpel calculation
    urhadd v21.16b, v13.16b , v21.16b   //Interpolation step for qpel calculation

    sqrshrun v18.8b, v14.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row5)
    st1 {v20.8b, v21.8b}, [x1], x3      ////Store dest row4
    ext v30.8b, v3.8b , v4.8b , #5
    sqrshrun v19.8b, v16.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row5)
    ld1 {v12.2s, v13.2s}, [x7], x2      //Load value for interpolation (column1,row5)


    //// Processing row6 and row7

    ext v28.8b, v5.8b , v6.8b , #5
    urhadd v18.16b, v12.16b , v18.16b   //Interpolation step for qpel calculation
    urhadd v19.16b, v13.16b , v19.16b   //Interpolation step for qpel calculation

    uaddl v8.8h, v31.8b, v2.8b          //// a0 + a5 (column1,row6)
    st1 {v18.8b, v19.8b}, [x1], x3      ////Store dest row5
    uaddl v10.8h, v30.8b, v3.8b         //// a0 + a5 (column2,row6)
    ext v27.8b, v6.8b , v7.8b , #5
    uaddl v14.8h, v28.8b, v5.8b         //// a0 + a5 (column1,row7)
    ext v31.8b, v2.8b , v3.8b , #2
    uaddl v16.8h, v27.8b, v6.8b         //// a0 + a5 (column2,row7)
    ext v30.8b, v3.8b , v4.8b , #2
    umlal v8.8h, v31.8b, v1.8b          //// a0 + a5 + 20a2 (column1,row6)
    ext v27.8b, v6.8b , v7.8b , #2
    umlal v10.8h, v30.8b, v1.8b         //// a0 + a5 + 20a2 (column2,row6)
    ext v28.8b, v5.8b , v6.8b , #2
    umlal v14.8h, v28.8b, v1.8b         //// a0 + a5 + 20a2 (column1,row7)
    ext v31.8b, v2.8b , v3.8b , #3
    umlal v16.8h, v27.8b, v1.8b         //// a0 + a5 + 20a2 (column2,row7)
    ext v30.8b, v3.8b , v4.8b , #3
    umlal v8.8h, v31.8b, v1.8b          //// a0 + a5 + 20a2 + 20a3 (column1,row6)
    ext v28.8b, v5.8b , v6.8b , #3
    umlal v10.8h, v30.8b, v1.8b         //// a0 + a5 + 20a2 + 20a3 (column2,row6)
    ext v27.8b, v6.8b , v7.8b , #3
    umlal v14.8h, v28.8b, v1.8b         //// a0 + a5 + 20a2 + 20a3 (column1,row7)
    ext v31.8b, v2.8b , v3.8b , #1
    umlal v16.8h, v27.8b, v1.8b         //// a0 + a5 + 20a2 + 20a3 (column2,row7)
    ext v30.8b, v3.8b , v4.8b , #1
    umlsl v8.8h, v31.8b, v0.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row6)
    ext v28.8b, v5.8b , v6.8b , #1
    umlsl v10.8h, v30.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row6)
    ext v27.8b, v6.8b , v7.8b , #1
    umlsl v14.8h, v28.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row7)
    ext v31.8b, v2.8b , v3.8b , #4
    umlsl v16.8h, v27.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row7)
    ext v30.8b, v3.8b , v4.8b , #4
    umlsl v8.8h, v31.8b, v0.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row6)
    ext v28.8b, v5.8b , v6.8b , #4
    umlsl v10.8h, v30.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row6)
    ext v27.8b, v6.8b , v7.8b , #4
    ld1 {v12.2s, v13.2s}, [x7], x2      //Load value for interpolation (column1,row6)
    sqrshrun v20.8b, v8.8h, #5          //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row6)
    umlsl v14.8h, v28.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row7)
    sqrshrun v21.8b, v10.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row6)
    umlsl v16.8h, v27.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row7)
    urhadd v20.16b, v12.16b , v20.16b   //Interpolation step for qpel calculation
    urhadd v21.16b, v13.16b , v21.16b   //Interpolation step for qpel calculation

    ld1 {v12.2s, v13.2s}, [x7], x2      //Load value for interpolation (column1,row7)
    sqrshrun v18.8b, v14.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row7)
    st1 {v20.8b, v21.8b}, [x1], x3      ////Store dest row6
    sqrshrun v19.8b, v16.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row7)
    urhadd v18.16b, v12.16b , v18.16b   //Interpolation step for qpel calculation
    urhadd v19.16b, v13.16b , v19.16b   //Interpolation step for qpel calculation

    subs x12, x14, #1                   // if height==16 - looping
    st1 {v18.8b, v19.8b}, [x1], x3      ////Store dest row7



    beq loop_16
    b end_func

loop_8:
    //// Processing row0 and row1

    ld1 {v5.8b, v6.8b}, [x0], x2        //// Load row1
    add x14, x14, #1                    //for checking loop
    ext v28.8b, v5.8b , v6.8b , #5
    ld1 {v2.8b, v3.8b}, [x0], x2        //// Load row0
    ext v25.8b, v5.8b , v6.8b , #2
    ext v31.8b, v2.8b , v3.8b , #5
    ext v24.8b, v5.8b , v6.8b , #3
    ext v23.8b, v5.8b , v6.8b , #1
    ext v22.8b, v5.8b , v6.8b , #4
    uaddl v14.8h, v28.8b, v5.8b         //// a0 + a5 (column1,row1)
    ext v29.8b, v2.8b , v3.8b , #3
    umlal v14.8h, v25.8b, v1.8b         //// a0 + a5 + 20a2 (column1,row1)
    umlal v14.8h, v24.8b, v1.8b         //// a0 + a5 + 20a2 + 20a3 (column1,row1)
    umlsl v14.8h, v23.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
    umlsl v14.8h, v22.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
    ext v30.8b, v2.8b , v3.8b , #2
    uaddl v8.8h, v31.8b, v2.8b          //// a0 + a5 (column1,row0)
    ext v27.8b, v2.8b , v3.8b , #1
    ext v26.8b, v2.8b , v3.8b , #4
    ld1 {v2.8b, v3.8b}, [x0], x2        //// Load row2
    umlal v8.8h, v29.8b, v1.8b          //// a0 + a5 + 20a2 + 20a3 (column1,row0)
    umlal v8.8h, v30.8b, v1.8b          //// a0 + a5 + 20a2 (column1,row0)
    umlsl v8.8h, v27.8b, v0.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
    umlsl v8.8h, v26.8b, v0.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
    ld1 {v5.8b, v6.8b}, [x0], x2        //// Load row3
    sqrshrun v18.8b, v14.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)

    //// Processing row2 and row3
    ext v28.8b, v5.8b , v6.8b , #5
    ext v25.8b, v5.8b , v6.8b , #2
    ext v31.8b, v2.8b , v3.8b , #5
    uaddl v14.8h, v28.8b, v5.8b         //// a0 + a5 (column1,row3)
    ld1 {v12.2s}, [x7], x2              //Load value for interpolation (column1,row0)
    ld1 {v13.2s}, [x7], x2              //Load value for interpolation (column1,row1)
    ext v24.8b, v5.8b , v6.8b , #3
    ext v23.8b, v5.8b , v6.8b , #1
    sqrshrun v19.8b, v8.8h, #5          //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
    ext v22.8b, v5.8b , v6.8b , #4
    ext v29.8b, v2.8b , v3.8b , #3
    umlal v14.8h, v25.8b, v1.8b         //// a0 + a5 + 20a2 (column1,row3)
    umlal v14.8h, v24.8b, v1.8b         //// a0 + a5 + 20a2 + 20a3 (column1,row3)
    umlsl v14.8h, v23.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row3)
    umlsl v14.8h, v22.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row3)
    urhadd v18.16b, v12.16b , v18.16b   //Interpolation step for qpel calculation
    urhadd v19.16b, v13.16b , v19.16b   //Interpolation step for qpel calculation

    st1 {v18.8b}, [x1], x3              ////Store dest row0
    st1 {v19.8b}, [x1], x3              ////Store dest row1
    uaddl v8.8h, v31.8b, v2.8b          //// a0 + a5 (column1,row2)
    ext v30.8b, v2.8b , v3.8b , #2
    ext v27.8b, v2.8b , v3.8b , #1
    ext v26.8b, v2.8b , v3.8b , #4
    ld1 {v2.8b, v3.8b}, [x0], x2        //// Load row4
    umlal v8.8h, v29.8b, v1.8b          //// a0 + a5 + 20a2 + 20a3 (column1,row2)
    umlal v8.8h, v30.8b, v1.8b          //// a0 + a5 + 20a2 (column1,row2)
    umlsl v8.8h, v27.8b, v0.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row2)
    umlsl v8.8h, v26.8b, v0.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row2)
    ld1 {v5.8b, v6.8b}, [x0], x2        //// Load row5
    subs x9, x4, #4
    sqrshrun v19.8b, v14.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row3)
    ld1 {v12.2s}, [x7], x2              //Load value for interpolation (column1,row2)
    ld1 {v13.2s}, [x7], x2              //Load value for interpolation (column1,row3)
    ext v28.8b, v5.8b , v6.8b , #5
    ext v25.8b, v5.8b , v6.8b , #2
    ext v31.8b, v2.8b , v3.8b , #5
    uaddl v14.8h, v28.8b, v5.8b         //// a0 + a5 (column1,row5)
    ext v24.8b, v5.8b , v6.8b , #3
    sqrshrun v18.8b, v8.8h, #5          //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row2)
    ext v22.8b, v5.8b , v6.8b , #4
    ext v29.8b, v2.8b , v3.8b , #3
    urhadd v18.16b, v12.16b , v18.16b   //Interpolation step for qpel calculation
    urhadd v19.16b, v13.16b , v19.16b   //Interpolation step for qpel calculation

    st1 {v18.8b}, [x1], x3              ////Store dest row2
    ext v30.8b, v2.8b , v3.8b , #2
    uaddl v8.8h, v31.8b, v2.8b          //// a0 + a5 (column1,row4)
    st1 {v19.8b}, [x1], x3              ////Store dest row3
    beq end_func                        // Branch if height==4

    //// Processing row4 and row5
    ext v23.8b, v5.8b , v6.8b , #1
    umlal v14.8h, v25.8b, v1.8b         //// a0 + a5 + 20a2 (column1,row5)
    umlal v14.8h, v24.8b, v1.8b         //// a0 + a5 + 20a2 + 20a3 (column1,row5)
    umlsl v14.8h, v23.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row5)
    umlsl v14.8h, v22.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row5)
    ext v27.8b, v2.8b , v3.8b , #1
    ext v26.8b, v2.8b , v3.8b , #4
    ld1 {v2.8b, v3.8b}, [x0], x2        //// Load row6
    umlal v8.8h, v29.8b, v1.8b          //// a0 + a5 + 20a2 + 20a3 (column1,row4)
    umlal v8.8h, v30.8b, v1.8b          //// a0 + a5 + 20a2 (column1,row4)
    umlsl v8.8h, v27.8b, v0.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row4)
    umlsl v8.8h, v26.8b, v0.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row4)
    sqrshrun v19.8b, v14.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row5)
    ld1 {v5.8b, v6.8b}, [x0], x2        //// Load row7
    ext v31.8b, v2.8b , v3.8b , #5
    ext v28.8b, v5.8b , v6.8b , #5
    ld1 {v12.2s}, [x7], x2              //Load value for interpolation (column1,row4)
    ld1 {v13.2s}, [x7], x2              //Load value for interpolation (column1,row5)
    ext v25.8b, v5.8b , v6.8b , #2
    uaddl v14.8h, v28.8b, v5.8b         //// a0 + a5 (column1,row7)
    ext v24.8b, v5.8b , v6.8b , #3
    ext v22.8b, v5.8b , v6.8b , #4
    sqrshrun v18.8b, v8.8h, #5          //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row4)
    ext v29.8b, v2.8b , v3.8b , #3
    ext v30.8b, v2.8b , v3.8b , #2
    urhadd v18.16b, v12.16b , v18.16b   //Interpolation step for qpel calculation
    urhadd v19.16b, v13.16b , v19.16b   //Interpolation step for qpel calculation

    st1 {v18.8b}, [x1], x3              ////Store dest row4
    ext v27.8b, v2.8b , v3.8b , #1
    uaddl v8.8h, v31.8b, v2.8b          //// a0 + a5 (column1,row6)
    ext v26.8b, v2.8b , v3.8b , #4
    umlal v8.8h, v29.8b, v1.8b          //// a0 + a5 + 20a2 + 20a3 (column1,row6)
    umlal v8.8h, v30.8b, v1.8b          //// a0 + a5 + 20a2 (column1,row6)
    umlsl v8.8h, v27.8b, v0.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row6)
    umlsl v8.8h, v26.8b, v0.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row6)
    //// Processing row6 and row7
    st1 {v19.8b}, [x1], x3              ////Store dest row5
    ext v23.8b, v5.8b , v6.8b , #1
    umlal v14.8h, v25.8b, v1.8b         //// a0 + a5 + 20a2 (column1,row7)
    umlal v14.8h, v24.8b, v1.8b         //// a0 + a5 + 20a2 + 20a3 (column1,row7)
    umlsl v14.8h, v23.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row7)
    umlsl v14.8h, v22.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row7)
    ld1 {v12.2s}, [x7], x2              //Load value for interpolation (column1,row6)
    ld1 {v13.2s}, [x7], x2              //Load value for interpolation (column1,row7)
    sqrshrun v18.8b, v8.8h, #5          //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row6)
    subs x12, x14, #1
    sqrshrun v19.8b, v14.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row7)
    urhadd v18.16b, v12.16b , v18.16b   //Interpolation step for qpel calculation
    urhadd v19.16b, v13.16b , v19.16b   //Interpolation step for qpel calculation

    st1 {v18.8b}, [x1], x3              ////Store dest row6
    st1 {v19.8b}, [x1], x3              ////Store dest row7

    beq loop_8                          //looping if height ==16

    b end_func

loop_4:
    ld1 {v5.8b, v6.8b}, [x0], x2        //// Load row1
    ext v28.8b, v5.8b , v6.8b , #5
    ld1 {v2.8b, v3.8b}, [x0], x2        //// Load row0
    ext v25.8b, v5.8b , v6.8b , #2
    ext v31.8b, v2.8b , v3.8b , #5
    uaddl v14.8h, v28.8b, v5.8b         //// a0 + a5 (column1,row1)
    ext v24.8b, v5.8b , v6.8b , #3
    ext v23.8b, v5.8b , v6.8b , #1
    ext v22.8b, v5.8b , v6.8b , #4
    ext v29.8b, v2.8b , v3.8b , #3
    umlal v14.8h, v25.8b, v1.8b         //// a0 + a5 + 20a2 (column1,row1)
    umlal v14.8h, v24.8b, v1.8b         //// a0 + a5 + 20a2 + 20a3 (column1,row1)
    umlsl v14.8h, v23.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
    umlsl v14.8h, v22.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
    uaddl v8.8h, v31.8b, v2.8b          //// a0 + a5 (column1,row0)
    ext v30.8b, v2.8b , v3.8b , #2
    ld1 {v12.2s}, [x7], x2              //Load value for interpolation (column1,row0)
    ld1 {v13.2s}, [x7], x2              //Load value for interpolation (column1,row1)
    ext v27.8b, v2.8b , v3.8b , #1
    ext v26.8b, v2.8b , v3.8b , #4
    ld1 {v2.8b, v3.8b}, [x0], x2        //// Load row2
    umlal v8.8h, v29.8b, v1.8b          //// a0 + a5 + 20a2 + 20a3 (column1,row0)
    umlal v8.8h, v30.8b, v1.8b          //// a0 + a5 + 20a2 (column1,row0)
    umlsl v8.8h, v27.8b, v0.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
    umlsl v8.8h, v26.8b, v0.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
    ld1 {v5.8b, v6.8b}, [x0], x2        //// Load row3
    ext v28.8b, v5.8b , v6.8b , #5
    ext v25.8b, v5.8b , v6.8b , #2
    sqrshrun v18.8b, v14.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
    ext v31.8b, v2.8b , v3.8b , #5
    ext v24.8b, v5.8b , v6.8b , #3

    ext v23.8b, v5.8b , v6.8b , #1
    ext v22.8b, v5.8b , v6.8b , #4
    ext v29.8b, v2.8b , v3.8b , #3
    sqrshrun v19.8b, v8.8h, #5          //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
    ext v30.8b, v2.8b , v3.8b , #2
    ext v27.8b, v2.8b , v3.8b , #1

    //// Processing row2 and row3
    urhadd v18.16b, v12.16b , v18.16b   //Interpolation step for qpel calculation
    urhadd v19.16b, v13.16b , v19.16b   //Interpolation step for qpel calculation

    st1 {v18.s}[0], [x1], x3            ////Store dest row0
    st1 {v19.s}[0], [x1], x3            ////Store dest row1
    uaddl v14.8h, v28.8b, v5.8b         //// a0 + a5 (column1,row3)
    ext v26.8b, v2.8b , v3.8b , #4
    ld1 {v12.2s}, [x7], x2              //Load value for interpolation (column1,row2)
    ld1 {v13.2s}, [x7], x2              //Load value for interpolation (column1,row3)

    umlal v14.8h, v25.8b, v1.8b         //// a0 + a5 + 20a2 (column1,row3)
    umlal v14.8h, v24.8b, v1.8b         //// a0 + a5 + 20a2 + 20a3 (column1,row3)
    umlsl v14.8h, v23.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row3)
    umlsl v14.8h, v22.8b, v0.8b         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row3)
    uaddl v8.8h, v31.8b, v2.8b          //// a0 + a5 (column1,row2)
    umlal v8.8h, v29.8b, v1.8b          //// a0 + a5 + 20a2 + 20a3 (column1,row2)
    umlal v8.8h, v30.8b, v1.8b          //// a0 + a5 + 20a2 (column1,row2)
    umlsl v8.8h, v27.8b, v0.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row2)
    umlsl v8.8h, v26.8b, v0.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row2)
    sqrshrun v19.8b, v14.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row3)
    sqrshrun v18.8b, v8.8h, #5          //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row2)
    urhadd v18.16b, v12.16b , v18.16b   //Interpolation step for qpel calculation
    urhadd v19.16b, v13.16b , v19.16b   //Interpolation step for qpel calculation

    st1 {v18.s}[0], [x1], x3            ////Store dest row2
    subs x4, x4, #8                     // Loop if height =8
    st1 {v19.s}[0], [x1], x3            ////Store dest row3

    beq loop_4

end_func:

    ldp x19, x20, [sp], #16
    pop_v_regs
    ret



diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s
new file mode 100755
index 0000000..b1e4866
--- /dev/null
+++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s
@@ -0,0 +1,910 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt.
Ltd, Bangalore +//*/ +///** +//****************************************************************************** +//* @file +//* ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s +//* +//* @brief +//* Contains function definitions for inter prediction interpolation. +//* +//* @author +//* Mohit +//* +//* @par List of Functions: +//* +//* - ih264_inter_pred_luma_horz_qpel_vert_hpel_av8() +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ + +///* All the functions here are replicated from ih264_inter_pred_filters.c +// + +///** +///** +///** +//******************************************************************************* +//* +//* @brief +//* This function implements a two stage cascaded six tap filter. It +//* applies the six tap filter in the vertical direction on the +//* predictor values, followed by applying the same filter in the +//* horizontal direction on the output of the first stage. It then averages +//* the output of the 1st stage and the final stage to obtain the quarter +//* pel values.The six tap filtering operation is described in sec 8.4.2.2.1 +//* titled "Luma sample interpolation process". +//* +//* @par Description: +//* This function is called to obtain pixels lying at the following +//* location (1/4,1/2) or (3/4,1/2). The function interpolates +//* the predictors first in the verical direction and then in the +//* horizontal direction to output the (1/2,1/2). It then averages +//* the output of the 2nd stage and (1/2,1/2) value to obtain (1/4,1/2) +//* or (3/4,1/2) depending on the offset. 
+//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ht +//* integer height of the array +//* +//* @param[in] wd +//* integer width of the array +//* +//* @param[in] pu1_tmp: temporary buffer +//* +//* @param[in] dydx: x and y reference offset for qpel calculations +//* +//* @returns +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/; + +//void ih264_inter_pred_luma_horz_qpel_vert_hpel(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd,, +// WORD32 dst_strd, +// WORD32 ht, +// WORD32 wd, +// UWORD8* pu1_tmp, +// UWORD32 dydx) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ht +// x5 => wd +// x6 => dydx +// x9 => *pu1_tmp + +.text +.p2align 2 +.include "ih264_neon_macros.s" + + + + .global ih264_inter_pred_luma_horz_qpel_vert_hpel_av8 + +ih264_inter_pred_luma_horz_qpel_vert_hpel_av8: + + // STMFD sp!, {x4-x12, x14} //store register values to stack + push_v_regs + stp x19, x20, [sp, #-16]! 
+ + sub x0, x0, x2, lsl #1 //pu1_src-2*src_strd + sub x0, x0, #2 //pu1_src-2 + mov x9, x6 + mov x6, x7 + + and x6, x6, #2 // dydx & 0x3 followed by dydx>>1 and dydx<<1 + + add x7, x9, #4 + add x6, x7, x6 // pi16_pred1_temp += (x_offset>>1) + + movi v26.8h, #0x14 // Filter coeff 20 into Q13 + movi v24.8h, #0x5 // Filter coeff 5 into Q12 + movi v27.8h, #0x14 // Filter coeff 20 into Q13 + movi v25.8h, #0x5 // Filter coeff 5 into Q12 + mov x7, #0x20 + mov x8, #0x30 + subs x12, x5, #4 //if wd=4 branch to loop_4 + beq loop_4_start + + subs x12, x5, #8 //if wd=8 branch to loop_8 + beq loop_8_start + + //when wd=16 + movi v28.8h, #0x14 // Filter coeff 20 into Q13 + movi v30.8h, #0x5 // Filter coeff 5 into Q12 + sub x2, x2, #16 + ld1 {v0.2s, v1.2s}, [x0], #16 // Vector load from src[0_0] + ld1 {v12.2s}, [x0], x2 // Vector load from src[0_0] + ld1 {v2.2s, v3.2s}, [x0], #16 // Vector load from src[1_0] + ld1 {v13.2s}, [x0], x2 // Vector load from src[1_0] + ld1 {v4.2s, v5.2s}, [x0], #16 // Vector load from src[2_0] + ld1 {v14.2s}, [x0], x2 // Vector load from src[2_0] + ld1 {v6.2s, v7.2s}, [x0], #16 // Vector load from src[3_0] + ld1 {v15.2s}, [x0], x2 // Vector load from src[3_0] + ld1 {v8.2s, v9.2s}, [x0], #16 // Vector load from src[4_0] + ld1 {v16.2s}, [x0], x2 // Vector load from src[4_0] + +loop_16: + + ld1 {v10.2s, v11.2s}, [x0], #16 // Vector load from src[5_0] + ld1 {v17.2s}, [x0], x2 // Vector load from src[5_0] + + + uaddl v20.8h, v4.8b, v6.8b + uaddl v18.8h, v0.8b, v10.8b + uaddl v22.8h, v2.8b, v8.8b + mla v18.8h, v20.8h , v28.8h + uaddl v24.8h, v5.8b, v7.8b + uaddl v20.8h, v1.8b, v11.8b + uaddl v26.8h, v3.8b, v9.8b + mla v20.8h, v24.8h , v28.8h + uaddl v24.8h, v14.8b, v15.8b + mls v18.8h, v22.8h , v30.8h + uaddl v22.8h, v12.8b, v17.8b + mls v20.8h, v26.8h , v30.8h + uaddl v26.8h, v13.8b, v16.8b + mla v22.8h, v24.8h , v28.8h + mls v22.8h, v26.8h , v30.8h + st1 {v18.4s }, [x9], #16 + st1 {v20.4s}, [x9], #16 + ext v24.16b, v18.16b , v20.16b , #4 + ext v26.16b, 
v18.16b , v20.16b , #6 + st1 {v22.4s}, [x9] + ext v22.16b, v18.16b , v20.16b , #10 + add v0.8h, v24.8h , v26.8h + ext v24.16b, v18.16b , v20.16b , #2 + ext v26.16b, v18.16b , v20.16b , #8 + add v24.8h, v24.8h , v26.8h + + saddl v26.4s, v18.4h, v22.4h + smlal v26.4s, v0.4h, v28.4h + smlsl v26.4s, v24.4h, v30.4h + + saddl2 v22.4s, v18.8h, v22.8h + smlal2 v22.4s, v0.8h, v28.8h + smlsl2 v22.4s, v24.8h, v30.8h + + sqrshrun v18.4h, v26.4s, #10 + sqrshrun v19.4h, v22.4s, #10 + ld1 {v22.4s}, [x9], #16 + + uqxtn v18.8b, v18.8h + uqxtn v19.8b, v19.8h + mov v18.2s[1], v19.2s[0] + + ext v24.16b, v20.16b , v22.16b , #4 + ext v26.16b, v20.16b , v22.16b , #6 + ext v0.16b, v20.16b , v22.16b , #10 + st1 {v18.2s}, [x1] + add v18.8h, v24.8h , v26.8h + ext v24.16b, v20.16b , v22.16b , #2 + ext v26.16b, v20.16b , v22.16b , #8 + add v24.8h, v24.8h , v26.8h + + saddl v26.4s, v0.4h, v20.4h + smlal v26.4s, v18.4h, v28.4h + smlsl v26.4s, v24.4h, v30.4h + + saddl2 v22.4s, v0.8h, v20.8h + smlal2 v22.4s, v18.8h, v28.8h + smlsl2 v22.4s, v24.8h, v30.8h + + sqrshrun v19.4h, v26.4s, #10 + sqrshrun v18.4h, v22.4s, #10 + + uaddl v24.8h, v7.8b, v9.8b + ld1 {v20.4s}, [x6], #16 + ld1 {v22.4s}, [x6], x7 + + + uqxtn v19.8b, v19.8h + uqxtn v18.8b, v18.8h + mov v19.2s[1], v18.2s[0] + + ld1 {v18.2s}, [x1] + sqrshrun v20.8b, v20.8h, #5 + sqrshrun v21.8b, v22.8h, #5 + uaddl v22.8h, v4.8b, v10.8b + ld1 {v0.2s, v1.2s}, [x0], #16 // Vector load from src[6_0] + urhadd v18.16b, v18.16b , v20.16b + urhadd v19.16b, v19.16b , v21.16b + + ld1 {v12.2s}, [x0], x2 // Vector load from src[6_0] + uaddl v20.8h, v6.8b, v8.8b + uaddl v26.8h, v5.8b, v11.8b + st1 {v18.2s, v19.2s}, [x1], x3 // store row 0 + + +//ROW_2 + + + uaddl v18.8h, v2.8b, v0.8b + + mla v18.8h, v20.8h , v28.8h + + uaddl v20.8h, v3.8b, v1.8b + + mla v20.8h, v24.8h , v28.8h + uaddl v24.8h, v15.8b, v16.8b + mls v18.8h, v22.8h , v30.8h + uaddl v22.8h, v13.8b, v12.8b + mls v20.8h, v26.8h , v30.8h + uaddl v26.8h, v14.8b, v17.8b + mla v22.8h, v24.8h , v28.8h + mls 
v22.8h, v26.8h , v30.8h + st1 {v18.4s}, [x9], #16 + st1 {v20.4s}, [x9], #16 + ext v24.16b, v18.16b , v20.16b , #4 + ext v26.16b, v18.16b , v20.16b , #6 + st1 {v22.4s}, [x9] + ext v22.16b, v18.16b , v20.16b , #10 + add v2.8h, v24.8h , v26.8h + ext v24.16b, v18.16b , v20.16b , #2 + ext v26.16b, v18.16b , v20.16b , #8 + add v24.8h, v24.8h , v26.8h + + saddl v26.4s, v18.4h, v22.4h + smlal v26.4s, v2.4h, v28.4h + smlsl v26.4s, v24.4h, v30.4h + + saddl2 v22.4s, v18.8h, v22.8h + smlal2 v22.4s, v2.8h, v28.8h + smlsl2 v22.4s, v24.8h, v30.8h + + sqrshrun v18.4h, v26.4s, #10 + sqrshrun v19.4h, v22.4s, #10 + + ld1 {v22.4s}, [x9], #16 + + uqxtn v18.8b, v18.8h + uqxtn v19.8b, v19.8h + mov v18.2s[1], v19.2s[0] + + ext v24.16b, v20.16b , v22.16b , #4 + ext v26.16b, v20.16b , v22.16b , #6 + ext v2.16b, v20.16b , v22.16b , #10 + st1 {v18.2s}, [x1] + add v18.8h, v24.8h , v26.8h + ext v24.16b, v20.16b , v22.16b , #2 + ext v26.16b, v20.16b , v22.16b , #8 + add v24.8h, v24.8h , v26.8h + + saddl v26.4s, v2.4h, v20.4h + smlal v26.4s, v18.4h, v28.4h + smlsl v26.4s, v24.4h, v30.4h + + saddl2 v22.4s, v2.8h, v20.8h + smlal2 v22.4s, v18.8h, v28.8h + smlsl2 v22.4s, v24.8h, v30.8h + + sqrshrun v19.4h, v26.4s, #10 + sqrshrun v18.4h, v22.4s, #10 + uaddl v24.8h, v9.8b, v11.8b + ld1 {v20.4s}, [x6], #16 + ld1 {v22.4s}, [x6], x7 + uqxtn v19.8b, v19.8h + uqxtn v18.8b, v18.8h + mov v19.2s[1], v18.2s[0] + ld1 {v18.4s}, [x1] + sqrshrun v20.8b, v20.8h, #5 + sqrshrun v21.8b, v22.8h, #5 + + uaddl v22.8h, v6.8b, v0.8b + ld1 {v2.2s, v3.2s}, [x0], #16 // Vector load from src[7_0] + + urhadd v18.16b, v18.16b , v20.16b + urhadd v19.16b, v19.16b , v21.16b + ld1 {v13.2s}, [x0], x2 // Vector load from src[7_0] + uaddl v20.8h, v8.8b, v10.8b + uaddl v26.8h, v7.8b, v1.8b + st1 {v18.2s, v19.2s}, [x1], x3 // store row 1 + +//ROW_3 + + + uaddl v18.8h, v4.8b, v2.8b + + mla v18.8h, v20.8h , v28.8h + + uaddl v20.8h, v5.8b, v3.8b + + mla v20.8h, v24.8h , v28.8h + uaddl v24.8h, v16.8b, v17.8b + mls v18.8h, v22.8h , v30.8h + 
uaddl v22.8h, v14.8b, v13.8b + mls v20.8h, v26.8h , v30.8h + uaddl v26.8h, v15.8b, v12.8b + mla v22.8h, v24.8h , v28.8h + mls v22.8h, v26.8h , v30.8h + st1 {v18.4s}, [x9], #16 + st1 {v20.4s}, [x9], #16 + ext v24.16b, v18.16b , v20.16b , #4 + ext v26.16b, v18.16b , v20.16b , #6 + st1 {v22.4s}, [x9] + ext v22.16b, v18.16b , v20.16b , #10 + add v4.8h, v24.8h , v26.8h + ext v24.16b, v18.16b , v20.16b , #2 + ext v26.16b, v18.16b , v20.16b , #8 + add v24.8h, v24.8h , v26.8h + + saddl v26.4s, v18.4h, v22.4h + smlal v26.4s, v4.4h, v28.4h + smlsl v26.4s, v24.4h, v30.4h + + saddl2 v22.4s, v18.8h, v22.8h + smlal2 v22.4s, v4.8h, v28.8h + smlsl2 v22.4s, v24.8h, v30.8h + + sqrshrun v18.4h, v26.4s, #10 + sqrshrun v19.4h, v22.4s, #10 + ld1 {v22.4s}, [x9], #16 + + uqxtn v18.8b, v18.8h + uqxtn v19.8b, v19.8h + mov v18.2s[1], v19.2s[0] + + + ext v24.16b, v20.16b , v22.16b , #4 + ext v26.16b, v20.16b , v22.16b , #6 + ext v4.16b, v20.16b , v22.16b , #10 + st1 {v18.2s}, [x1] + add v18.8h, v24.8h , v26.8h + ext v24.16b, v20.16b , v22.16b , #2 + ext v26.16b, v20.16b , v22.16b , #8 + add v24.8h, v24.8h , v26.8h + + saddl v26.4s, v4.4h, v20.4h + smlal v26.4s, v18.4h, v28.4h + smlsl v26.4s, v24.4h, v30.4h + + saddl2 v22.4s, v4.8h, v20.8h + smlal2 v22.4s, v18.8h, v28.8h + smlsl2 v22.4s, v24.8h, v30.8h + + sqrshrun v19.4h, v26.4s, #10 + sqrshrun v18.4h, v22.4s, #10 + + uaddl v24.8h, v11.8b, v1.8b + ld1 {v20.4s}, [x6], #16 + ld1 {v22.4s}, [x6], x7 + + uqxtn v19.8b, v19.8h + uqxtn v18.8b, v18.8h + mov v19.2s[1], v18.2s[0] + + ld1 {v18.2s}, [x1] + sqrshrun v20.8b, v20.8h, #5 + sqrshrun v21.8b, v22.8h, #5 + + uaddl v22.8h, v8.8b, v2.8b + ld1 {v4.2s, v5.2s}, [x0], #16 // Vector load from src[8_0] + + urhadd v18.16b, v18.16b , v20.16b + urhadd v19.16b, v19.16b , v21.16b + ld1 {v14.2s}, [x0], x2 // Vector load from src[8_0] + uaddl v20.8h, v10.8b, v0.8b + uaddl v26.8h, v9.8b, v3.8b + st1 {v18.2s, v19.2s}, [x1], x3 // store row 2 + + +//ROW_4 + + uaddl v18.8h, v6.8b, v4.8b + + mla v18.8h, v20.8h , 
v28.8h + + uaddl v20.8h, v7.8b, v5.8b + + mla v20.8h, v24.8h , v28.8h + uaddl v24.8h, v17.8b, v12.8b + mls v18.8h, v22.8h , v30.8h + uaddl v22.8h, v15.8b, v14.8b + mls v20.8h, v26.8h , v30.8h + uaddl v26.8h, v16.8b, v13.8b + mla v22.8h, v24.8h , v28.8h + mls v22.8h, v26.8h , v30.8h + st1 {v18.4s}, [x9], #16 + st1 {v20.4s}, [x9], #16 + ext v24.16b, v18.16b , v20.16b , #4 + ext v26.16b, v18.16b , v20.16b , #6 + st1 {v22.4s}, [x9] + ext v22.16b, v18.16b , v20.16b , #10 + add v6.8h, v24.8h , v26.8h + ext v24.16b, v18.16b , v20.16b , #2 + ext v26.16b, v18.16b , v20.16b , #8 + add v24.8h, v24.8h , v26.8h + + saddl v26.4s, v18.4h, v22.4h + smlal v26.4s, v6.4h, v28.4h + smlsl v26.4s, v24.4h, v30.4h + + saddl2 v22.4s, v18.8h, v22.8h + smlal2 v22.4s, v6.8h, v28.8h + smlsl2 v22.4s, v24.8h, v30.8h + + sqrshrun v18.4h, v26.4s, #10 + sqrshrun v19.4h, v22.4s, #10 + ld1 {v22.4s}, [x9], #16 + uqxtn v18.8b, v18.8h + uqxtn v19.8b, v19.8h + mov v18.2s[1], v19.2s[0] + + + ext v24.16b, v20.16b , v22.16b , #4 + ext v26.16b, v20.16b , v22.16b , #6 + ext v6.16b, v20.16b , v22.16b , #10 + st1 {v18.2s}, [x1] + add v18.8h, v24.8h , v26.8h + ext v24.16b, v20.16b , v22.16b , #2 + ext v26.16b, v20.16b , v22.16b , #8 + add v24.8h, v24.8h , v26.8h + + saddl v26.4s, v6.4h, v20.4h + smlal v26.4s, v18.4h, v28.4h + smlsl v26.4s, v24.4h, v30.4h + + saddl2 v22.4s, v6.8h, v20.8h + smlal2 v22.4s, v18.8h, v28.8h + smlsl2 v22.4s, v24.8h, v30.8h + + mov v6.16b, v2.16b + mov v7.16b, v3.16b + + mov v2.16b, v10.16b + mov v3.16b, v11.16b + + subs x4, x4, #4 + sqrshrun v19.4h, v26.4s, #10 + sqrshrun v18.4h, v22.4s, #10 + mov v10.16b, v0.16b + mov v11.16b, v1.16b + + mov v24.8b, v14.8b + + mov v14.16b, v12.16b + mov v15.16b, v13.16b + + + uqxtn v19.8b, v19.8h + uqxtn v18.8b, v18.8h + mov v19.2s[1], v18.2s[0] + + ld1 {v20.4s}, [x6], #16 + ld1 {v22.4s}, [x6], x7 + ld1 {v18.2s}, [x1] + sqrshrun v20.8b, v20.8h, #5 + sqrshrun v21.8b, v22.8h, #5 + + mov v0.16b, v8.16b + mov v1.16b, v9.16b + + mov v8.16b, v4.16b + mov 
v9.16b, v5.16b + + mov v12.16b, v16.16b + mov v13.16b, v17.16b + urhadd v18.16b, v18.16b , v20.16b + urhadd v19.16b, v19.16b , v21.16b + + mov v4.16b, v10.16b + mov v5.16b, v11.16b + + mov v16.8b, v24.8b + st1 {v18.2s, v19.2s}, [x1], x3 // store row 3 + + bgt loop_16 // looping if height =16 + b end_func + +loop_8_start: + ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[0_0] + ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[1_0] + ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[2_0] + ld1 {v6.2s, v7.2s}, [x0], x2 // Vector load from src[3_0] + ld1 {v8.2s, v9.2s}, [x0], x2 // Vector load from src[4_0] + +loop_8: + + ld1 {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0] + uaddl v14.8h, v4.8b, v6.8b + uaddl v12.8h, v0.8b, v10.8b + uaddl v16.8h, v2.8b, v8.8b + mla v12.8h, v14.8h , v26.8h + uaddl v18.8h, v5.8b, v7.8b + uaddl v14.8h, v1.8b, v11.8b + uaddl v22.8h, v3.8b, v9.8b + mla v14.8h, v18.8h , v26.8h + mls v12.8h, v16.8h , v24.8h + ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[6_0] + uaddl v16.8h, v6.8b, v8.8b + mls v14.8h, v22.8h , v24.8h + uaddl v28.8h, v2.8b, v0.8b + st1 {v12.4s}, [x9], #16 // store row 0 to temp buffer: col 0 + ext v22.16b, v12.16b , v14.16b , #10 + uaddl v18.8h, v4.8b, v10.8b + mla v28.8h, v16.8h , v26.8h + saddl v30.4s, v12.4h, v22.4h + st1 {v14.4s}, [x9], x7 // store row 0 to temp buffer: col 1 + saddl2 v22.4s, v12.8h, v22.8h + ext v16.16b, v12.16b , v14.16b , #4 + mls v28.8h, v18.8h , v24.8h + ext v18.16b, v12.16b , v14.16b , #6 + ext v20.16b, v12.16b , v14.16b , #8 + ext v14.16b, v12.16b , v14.16b , #2 + add v16.8h, v16.8h , v18.8h + add v18.8h, v14.8h , v20.8h + uaddl v20.8h, v7.8b, v9.8b + smlal v30.4s, v16.4h, v26.4h + smlsl v30.4s, v18.4h, v24.4h + smlal2 v22.4s, v16.8h, v26.8h + smlsl2 v22.4s, v18.8h, v24.8h + uaddl v14.8h, v3.8b, v1.8b + st1 {v28.4s}, [x9], #16 // store row 1 to temp buffer: col 0 + mla v14.8h, v20.8h , v26.8h + sqrshrun v12.4h, v30.4s, #10 + uaddl v16.8h, v5.8b, v11.8b + sqrshrun v13.4h, 
v22.4s, #10 + mls v14.8h, v16.8h , v24.8h + ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[7_0] + uqxtn v25.8b, v12.8h + uqxtn v13.8b, v13.8h + mov v25.2s[1], v13.2s[0] + uaddl v16.8h, v8.8b, v10.8b + + + ext v22.16b, v28.16b , v14.16b , #10 + uaddl v20.8h, v4.8b, v2.8b + saddl v30.4s, v28.4h, v22.4h + mla v20.8h, v16.8h , v26.8h + st1 {v14.4s}, [x9], x7 // store row 1 to temp buffer: col 1 + saddl2 v22.4s, v28.8h, v22.8h + ext v16.16b, v28.16b , v14.16b , #4 + ext v18.16b, v28.16b , v14.16b , #6 + ext v12.16b, v28.16b , v14.16b , #8 + ext v14.16b, v28.16b , v14.16b , #2 + add v16.8h, v16.8h , v18.8h + add v18.8h, v12.8h , v14.8h + ld1 {v14.4s, v15.4s}, [x6], x8 // load row 0 from temp buffer + smlal v30.4s, v16.4h, v26.4h + smlsl v30.4s, v18.4h, v24.4h + smlal2 v22.4s, v16.8h, v26.8h + smlsl2 v22.4s, v18.8h, v24.8h + sqrshrun v14.8b, v14.8h, #0x5 + ld1 {v28.4s, v29.4s}, [x6], x8 // load row 1 from temp buffer + uaddl v18.8h, v6.8b, v0.8b + sqrshrun v16.4h, v30.4s, #10 + sqrshrun v15.8b, v28.8h, #0x5 + sqrshrun v17.4h, v22.4s, #10 + + mov v12.8b, v25.8b + mov v25.8b, v24.8b + + uaddl v28.8h, v9.8b, v11.8b + uqxtn v13.8b, v16.8h + uqxtn v17.8b, v17.8h + mov v13.2s[1], v17.2s[0] + + urhadd v12.16b, v12.16b , v14.16b + urhadd v13.16b, v13.16b , v15.16b + uaddl v14.8h, v5.8b, v3.8b + uaddl v22.8h, v7.8b, v1.8b + mls v20.8h, v18.8h , v24.8h + st1 {v12.2s}, [x1], x3 // store row 0 + mla v14.8h, v28.8h , v26.8h + ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[8_0] + uaddl v30.8h, v10.8b, v0.8b + uaddl v28.8h, v6.8b, v4.8b + mls v14.8h, v22.8h , v24.8h + st1 {v13.2s}, [x1], x3 // store row 1 + mla v28.8h, v30.8h , v26.8h + st1 {v20.4s}, [x9], #16 // store row 2 to temp buffer: col 0 + ext v22.16b, v20.16b , v14.16b , #10 + saddl v30.4s, v20.4h, v22.4h + st1 {v14.2s, v15.2s}, [x9], x7 // store row 2 to temp buffer: col 0 + saddl2 v22.4s, v20.8h, v22.8h + ext v16.16b, v20.16b , v14.16b , #4 + ext v18.16b, v20.16b , v14.16b , #6 + ext v12.16b, v20.16b , v14.16b , 
#8 + ext v14.16b, v20.16b , v14.16b , #2 + add v16.8h, v16.8h , v18.8h + add v18.8h, v14.8h , v12.8h + uaddl v20.8h, v8.8b, v2.8b + smlal v30.4s, v16.4h, v26.4h + smlsl v30.4s, v18.4h, v24.4h + smlal2 v22.4s, v16.8h, v26.8h + smlsl2 v22.4s, v18.8h, v24.8h + uaddl v18.8h, v11.8b, v1.8b + uaddl v16.8h, v7.8b, v5.8b + sqrshrun v12.4h, v30.4s, #10 + uaddl v30.8h, v9.8b, v3.8b + mla v16.8h, v18.8h , v26.8h + sqrshrun v13.4h, v22.4s, #10 + mls v28.8h, v20.8h , v24.8h + ld1 {v14.4s, v15.4s}, [x6], x8 // load row 2 from temp buffer + mls v16.8h, v30.8h , v24.8h + uqxtn v27.8b, v12.8h + uqxtn v13.8b, v13.8h + mov v27.2s[1], v13.2s[0] + + sqrshrun v14.8b, v14.8h, #5 + ext v22.16b, v28.16b , v16.16b , #10 + st1 {v28.4s}, [x9], #16 // store row 3 to temp buffer: col 0 + saddl v30.4s, v28.4h, v22.4h + st1 {v16.2s, v17.2s}, [x9], x7 // store row 3 to temp buffer: col 1 + saddl2 v22.4s, v28.8h, v22.8h + ext v12.16b, v28.16b , v16.16b , #4 + ext v18.16b, v28.16b , v16.16b , #6 + ext v20.16b, v28.16b , v16.16b , #8 + ext v28.16b, v28.16b , v16.16b , #2 + add v12.8h, v12.8h , v18.8h + add v18.8h, v28.8h , v20.8h + ld1 {v16.4s, v17.4s}, [x6], x8 // load row 3 from temp buffer + smlal v30.4s, v12.4h, v26.4h + smlsl v30.4s, v18.4h, v24.4h + smlal2 v22.4s, v12.8h, v26.8h + smlsl2 v22.4s, v18.8h, v24.8h + sqrshrun v15.8b, v16.8h, #0x5 + + mov v12.8b, v27.8b + mov v27.8b, v26.8b + + sqrshrun v16.4h, v30.4s, #10 + + mov v6.16b, v2.16b + mov v7.16b, v3.16b + + sqrshrun v17.4h, v22.4s, #10 + + mov v2.16b, v10.16b + mov v3.16b, v11.16b + + mov v10.16b, v0.16b + mov v11.16b, v1.16b + + subs x4, x4, #4 + uqxtn v13.8b, v16.8h + uqxtn v17.8b, v17.8h + mov v13.2s[1], v17.2s[0] + urhadd v12.16b, v12.16b , v14.16b + urhadd v13.16b, v13.16b , v15.16b + + mov v0.16b, v8.16b + mov v1.16b, v9.16b + + mov v8.16b, v4.16b + mov v9.16b, v5.16b + + mov v4.16b, v10.16b + mov v5.16b, v11.16b + + st1 {v12.2s}, [x1], x3 // store row 2 + st1 {v13.2s}, [x1], x3 // store row 3 + + bgt loop_8 //if height =8 loop + b 
end_func + +loop_4_start: + ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[0_0] + ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[1_0] + ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[2_0] + ld1 {v6.2s, v7.2s}, [x0], x2 // Vector load from src[3_0] + ld1 {v8.2s, v9.2s}, [x0], x2 // Vector load from src[4_0] + +loop_4: + ld1 {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0] + uaddl v14.8h, v4.8b, v6.8b // temp1 = src[2_0] + src[3_0] + uaddl v12.8h, v0.8b, v10.8b // temp = src[0_0] + src[5_0] + uaddl v16.8h, v2.8b, v8.8b // temp2 = src[1_0] + src[4_0] + mla v12.8h, v14.8h , v26.8h // temp += temp1 * 20 + uaddl v18.8h, v5.8b, v7.8b // temp1 = src[2_0] + src[3_0] + uaddl v14.8h, v1.8b, v11.8b // temp = src[0_0] + src[5_0] + uaddl v22.8h, v3.8b, v9.8b // temp2 = src[1_0] + src[4_0] + mla v14.8h, v18.8h , v26.8h // temp += temp1 * 20 + mls v12.8h, v16.8h , v24.8h // temp -= temp2 * 5 + ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[6_0] + uaddl v16.8h, v6.8b, v8.8b + mls v14.8h, v22.8h , v24.8h // temp -= temp2 * 5 + //Q6 and Q7 have filtered values + uaddl v28.8h, v2.8b, v0.8b + st1 {v12.4s}, [x9], #16 // store row 0 to temp buffer: col 0 + ext v22.16b, v12.16b , v14.16b , #10 + uaddl v18.8h, v4.8b, v10.8b + mla v28.8h, v16.8h , v26.8h + saddl v30.4s, v12.4h, v22.4h + st1 {v14.4s}, [x9], x7 // store row 0 to temp buffer: col 1 + saddl v22.4s, v13.4h, v23.4h + ext v16.16b, v12.16b , v14.16b , #4 + mls v28.8h, v18.8h , v24.8h + ext v18.16b, v12.16b , v14.16b , #6 + ext v20.16b, v12.16b , v14.16b , #8 + ext v14.16b, v12.16b , v14.16b , #2 + add v16.8h, v16.8h , v18.8h + add v18.8h, v14.8h , v20.8h + uaddl v20.8h, v7.8b, v9.8b + smlal v30.4s, v16.4h, v26.4h + smlsl v30.4s, v18.4h, v24.4h + smlal v22.4s, v17.4h, v26.4h + smlsl v22.4s, v19.4h, v24.4h + uaddl v14.8h, v3.8b, v1.8b + st1 {v28.4s}, [x9], #16 // store row 1 to temp buffer: col 0 + mla v14.8h, v20.8h , v26.8h + sqrshrun v12.4h, v30.4s, #10 + uaddl v16.8h, v5.8b, v11.8b + sqrshrun 
v13.4h, v22.4s, #10 + mls v14.8h, v16.8h , v24.8h + ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[7_0] + uqxtn v25.8b, v12.8h + uaddl v16.8h, v8.8b, v10.8b + + ext v22.16b, v28.16b , v14.16b , #10 + uaddl v20.8h, v4.8b, v2.8b + saddl v30.4s, v28.4h, v22.4h + mla v20.8h, v16.8h , v26.8h + st1 {v14.4s}, [x9], x7 // store row 1 to temp buffer: col 1 + saddl v22.4s, v29.4h, v23.4h + ext v16.16b, v28.16b , v14.16b , #4 + ext v18.16b, v28.16b , v14.16b , #6 + ext v12.16b, v28.16b , v14.16b , #8 + ext v14.16b, v28.16b , v14.16b , #2 + add v16.8h, v16.8h , v18.8h + add v18.8h, v12.8h , v14.8h + ld1 {v14.2s}, [x6], x8 //load row 0 from temp buffer + smlal v30.4s, v16.4h, v26.4h + smlsl v30.4s, v18.4h, v24.4h + smlal v22.4s, v17.4h, v26.4h + smlsl v22.4s, v19.4h, v24.4h + sqrshrun v14.8b, v14.8h, #0x5 + ld1 {v28.2s}, [x6], x8 //load row 1 from temp buffer + uaddl v18.8h, v6.8b, v0.8b + sqrshrun v16.4h, v30.4s, #10 + sqrshrun v15.8b, v28.8h, #0x5 + sqrshrun v17.4h, v22.4s, #10 + + mov v12.8b, v25.8b + mov v25.8b, v24.8b + + uaddl v28.8h, v9.8b, v11.8b + uqxtn v13.8b, v16.8h + + urhadd v12.16b, v12.16b , v14.16b + urhadd v13.16b, v13.16b , v15.16b + + uaddl v14.8h, v5.8b, v3.8b + uaddl v22.8h, v7.8b, v1.8b + mls v20.8h, v18.8h , v24.8h + st1 {v12.s}[0], [x1], x3 // store row 0 + mla v14.8h, v28.8h , v26.8h + ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[8_0] + uaddl v30.8h, v10.8b, v0.8b + uaddl v28.8h, v6.8b, v4.8b + mls v14.8h, v22.8h , v24.8h + st1 {v13.s}[0], [x1], x3 //store row 1 + mla v28.8h, v30.8h , v26.8h + st1 {v20.4s}, [x9], #16 // store row 2 to temp buffer: col 0 + ext v22.16b, v20.16b , v14.16b , #10 + saddl v30.4s, v20.4h, v22.4h + st1 {v14.4s}, [x9], x7 // store row 2 to temp buffer: col 1 + saddl v22.4s, v21.4h, v23.4h + ext v16.16b, v20.16b , v14.16b , #4 + ext v18.16b, v20.16b , v14.16b , #6 + ext v12.16b, v20.16b , v14.16b , #8 + ext v14.16b, v20.16b , v14.16b , #2 + add v16.8h, v16.8h , v18.8h + add v18.8h, v14.8h , v12.8h + uaddl v20.8h, 
v8.8b, v2.8b + smlal v30.4s, v16.4h, v26.4h + smlsl v30.4s, v18.4h, v24.4h + smlal v22.4s, v17.4h, v26.4h + smlsl v22.4s, v19.4h, v24.4h + uaddl v18.8h, v11.8b, v1.8b + uaddl v16.8h, v7.8b, v5.8b + sqrshrun v12.4h, v30.4s, #10 + uaddl v30.8h, v9.8b, v3.8b + mla v16.8h, v18.8h , v26.8h + sqrshrun v13.4h, v22.4s, #10 + mls v28.8h, v20.8h , v24.8h + ld1 {v14.2s}, [x6], x8 //load row 3 from temp buffer + mls v16.8h, v30.8h , v24.8h + uqxtn v27.8b, v12.8h + sqrshrun v14.8b, v14.8h, #5 + ext v22.16b, v28.16b , v16.16b , #10 + st1 {v28.4s}, [x9], #16 // store row 3 to temp buffer: col 0 + saddl v30.4s, v28.4h, v22.4h + st1 {v16.4s}, [x9], x7 // store row 3 to temp buffer: col 1 + saddl v22.4s, v29.4h, v23.4h + ext v12.16b, v28.16b , v16.16b , #4 + ext v18.16b, v28.16b , v16.16b , #6 + ext v20.16b, v28.16b , v16.16b , #8 + ext v28.16b, v28.16b , v16.16b , #2 + add v12.8h, v12.8h , v18.8h + add v18.8h, v28.8h , v20.8h + ld1 {v16.2s}, [x6], x8 //load row 4 from temp buffer + smlal v30.4s, v12.4h, v26.4h + smlsl v30.4s, v18.4h, v24.4h + smlal v22.4s, v13.4h, v26.4h + smlsl v22.4s, v19.4h, v24.4h + sqrshrun v15.8b, v16.8h, #0x5 + + mov v12.8b, v27.8b + mov v27.8b, v26.8b + + sqrshrun v16.4h, v30.4s, #10 + + mov v6.16b, v2.16b + mov v7.16b, v3.16b + + sqrshrun v17.4h, v22.4s, #10 + + mov v2.16b, v10.16b + mov v3.16b, v11.16b + + mov v10.16b, v0.16b + mov v11.16b, v1.16b + + subs x4, x4, #4 + uqxtn v13.8b, v16.8h + urhadd v12.16b, v12.16b , v14.16b + urhadd v13.16b, v13.16b , v15.16b + + mov v0.16b, v8.16b + mov v1.16b, v9.16b + + mov v8.16b, v4.16b + mov v9.16b, v5.16b + + + mov v4.16b, v10.16b + mov v5.16b, v11.16b + + + st1 {v12.s}[0], [x1], x3 // store row 2 + st1 {v13.s}[0], [x1], x3 // store row 3 + + bgt loop_4 + +end_func: + // LDMFD sp!,{x4-x12,PC} //Restoring registers from stack + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s new 
file mode 100755 index 0000000..ab663d0 --- /dev/null +++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s @@ -0,0 +1,958 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +//*/ +///** +//****************************************************************************** +//* @file +//* ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s +//* +//* @brief +//* Contains function definitions for inter prediction interpolation. +//* +//* @author +//* Mohit +//* +//* @par List of Functions: +//* +//* - ih264_inter_pred_luma_horz_qpel_vert_qpel_av8() +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ + +///* All the functions here are replicated from ih264_inter_pred_filters.c +// + +///** +///** +///** +//******************************************************************************* +//* +//* @brief +//* This function implements two six tap filters. It +//* applies the six tap filter in the horizontal direction on the +//* predictor values, then applies the same filter in the +//* vertical direction on the predictor values. 
It then averages these +//* two outputs to obtain quarter pel values in horizontal and vertical direction. +//* The six tap filtering operation is described in sec 8.4.2.2.1 titled +//* "Luma sample interpolation process" +//* +//* @par Description: +//* This function is called to obtain pixels lying at the following +//* location (1/4,1/4) or (3/4,1/4) or (1/4,3/4) or (3/4,3/4). +//* The function interpolates the predictors first in the horizontal direction +//* and then in the vertical direction, and then averages these two +//* values. +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ht +//* integer height of the array +//* +//* @param[in] wd +//* integer width of the array +//* +//* @param[in] pu1_tmp: temporary buffer +//* +//* @param[in] dydx: x and y reference offset for qpel calculations +//* +//* @returns +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/; + +//void ih264_inter_pred_luma_horz_qpel_vert_qpel(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd,, +// WORD32 dst_strd, +// WORD32 ht, +// WORD32 wd, +// UWORD8* pu1_tmp, +// UWORD32 dydx) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ht +// x5 => wd +// x6 => dydx + +.text +.p2align 2 +.include "ih264_neon_macros.s" + + + + .global ih264_inter_pred_luma_horz_qpel_vert_qpel_av8 + +ih264_inter_pred_luma_horz_qpel_vert_qpel_av8: + + push_v_regs + stp x19, x20, [sp, #-16]! 
+ mov x6, x7 + and x7, x6, #3 + add x7, x0, x7, lsr #1 //pu1_pred_vert = pu1_src + (x_offset>>1) + + and x6, x6, #12 //Finds y-offset + lsr x6, x6, #3 //dydx>>3 + mul x6, x2, x6 + add x6, x0, x6 //pu1_pred_horz = pu1_src + (y_offset>>1)*src_strd + sub x7, x7, x2, lsl #1 //pu1_pred_vert-2*src_strd + sub x6, x6, #2 //pu1_pred_horz-2 + movi v30.8b, #20 // Filter coeff 20 + movi v31.8b, #5 // Filter coeff 5 + + subs x12, x5, #4 //if wd=4 branch to loop_4 + beq loop_4_start + subs x12, x5, #8 //if wd=8 branch to loop_8 + beq loop_8_start + + ld1 {v0.2s, v1.2s}, [x7], x2 // Vector load from src[0_0] + ld1 {v2.2s, v3.2s}, [x7], x2 // Vector load from src[1_0] + + ld1 {v4.2s, v5.2s}, [x7], x2 // Vector load from src[2_0] + ld1 {v6.2s, v7.2s}, [x7], x2 // Vector load from src[3_0] + ld1 {v8.2s, v9.2s}, [x7], x2 // Vector load from src[4_0] + add x11, x6, #8 +loop_16: + ld1 {v10.2s, v11.2s}, [x7], x2 // Vector load from src[5_0] + ld1 {v18.2s, v19.2s}, [x6], x2 // horz row0, col 0 + uaddl v24.8h, v0.8b, v10.8b + umlal v24.8h, v4.8b, v30.8b + umlal v24.8h, v6.8b, v30.8b + umlsl v24.8h, v2.8b, v31.8b + umlsl v24.8h, v8.8b, v31.8b + ext v23.8b, v18.8b , v19.8b , #5 + ext v20.8b, v18.8b , v19.8b , #2 + ext v21.8b, v18.8b , v19.8b , #3 + ext v22.8b, v18.8b , v19.8b , #4 + ext v19.8b, v18.8b , v19.8b , #1 + sqrshrun v26.8b, v24.8h, #5 + uaddl v28.8h, v18.8b, v23.8b + umlal v28.8h, v20.8b, v30.8b + umlal v28.8h, v21.8b, v30.8b + umlsl v28.8h, v19.8b, v31.8b + umlsl v28.8h, v22.8b, v31.8b + ld1 {v18.2s, v19.2s}, [x11], x2 // horz row 0, col 1 + uaddl v24.8h, v1.8b, v11.8b + umlal v24.8h, v5.8b, v30.8b + umlal v24.8h, v7.8b, v30.8b + umlsl v24.8h, v3.8b, v31.8b + umlsl v24.8h, v9.8b, v31.8b + sqrshrun v28.8b, v28.8h, #5 + ext v23.8b, v18.8b , v19.8b , #5 + ext v20.8b, v18.8b , v19.8b , #2 + ext v21.8b, v18.8b , v19.8b , #3 + ext v22.8b, v18.8b , v19.8b , #4 + ext v19.8b, v18.8b , v19.8b , #1 + + sqrshrun v27.8b, v24.8h, #5 + ld1 {v12.2s, v13.2s}, [x7], x2 // src[6_0] + + uaddl 
v24.8h, v18.8b, v23.8b + umlal v24.8h, v20.8b, v30.8b + umlal v24.8h, v21.8b, v30.8b + umlsl v24.8h, v19.8b, v31.8b + umlsl v24.8h, v22.8b, v31.8b + + uaddl v16.8h, v2.8b, v12.8b + umlal v16.8h, v6.8b, v30.8b + umlal v16.8h, v8.8b, v30.8b + umlsl v16.8h, v4.8b, v31.8b + umlsl v16.8h, v10.8b, v31.8b + + sqrshrun v29.8b, v24.8h, #5 + ld1 {v18.2s, v19.2s}, [x6], x2 // horz row 1, col 0 + + uaddl v24.8h, v3.8b, v13.8b + umlal v24.8h, v7.8b, v30.8b + umlal v24.8h, v9.8b, v30.8b + umlsl v24.8h, v5.8b, v31.8b + umlsl v24.8h, v11.8b, v31.8b + urhadd v28.16b, v28.16b , v26.16b + urhadd v29.16b, v29.16b , v27.16b + sqrshrun v26.8b, v16.8h, #5 + ext v23.8b, v18.8b , v19.8b , #5 + ext v20.8b, v18.8b , v19.8b , #2 + ext v21.8b, v18.8b , v19.8b , #3 + ext v22.8b, v18.8b , v19.8b , #4 + st1 {v28.2s, v29.2s}, [x1], x3 // store row 0 + ext v19.8b, v18.8b , v19.8b , #1 + + sqrshrun v27.8b, v24.8h, #5 + + uaddl v28.8h, v18.8b, v23.8b + umlal v28.8h, v20.8b, v30.8b + umlal v28.8h, v21.8b, v30.8b + umlsl v28.8h, v19.8b, v31.8b + umlsl v28.8h, v22.8b, v31.8b + + ld1 {v18.2s, v19.2s}, [x11], x2 // horz row 1, col 1 + ld1 {v14.2s, v15.2s}, [x7], x2 // src[7_0] + + ext v23.8b, v18.8b , v19.8b , #5 + ext v20.8b, v18.8b , v19.8b , #2 + ext v21.8b, v18.8b , v19.8b , #3 + ext v22.8b, v18.8b , v19.8b , #4 + ext v19.8b, v18.8b , v19.8b , #1 + + sqrshrun v28.8b, v28.8h, #5 + uaddl v24.8h, v18.8b, v23.8b + umlal v24.8h, v20.8b, v30.8b + umlal v24.8h, v21.8b, v30.8b + umlsl v24.8h, v19.8b, v31.8b + umlsl v24.8h, v22.8b, v31.8b + + ld1 {v18.2s, v19.2s}, [x6], x2 // horz row 2, col 0 + uaddl v16.8h, v4.8b, v14.8b + umlal v16.8h, v8.8b, v30.8b + umlal v16.8h, v10.8b, v30.8b + umlsl v16.8h, v6.8b, v31.8b + umlsl v16.8h, v12.8b, v31.8b + + sqrshrun v29.8b, v24.8h, #5 + ext v23.8b, v18.8b , v19.8b , #5 + ext v20.8b, v18.8b , v19.8b , #2 + ext v21.8b, v18.8b , v19.8b , #3 + ext v22.8b, v18.8b , v19.8b , #4 + ext v19.8b, v18.8b , v19.8b , #1 + urhadd v28.16b, v28.16b , v26.16b + urhadd v29.16b, v29.16b , 
v27.16b + sqrshrun v26.8b, v16.8h, #5 + + uaddl v24.8h, v5.8b, v15.8b + umlal v24.8h, v9.8b, v30.8b + umlal v24.8h, v11.8b, v30.8b + umlsl v24.8h, v7.8b, v31.8b + umlsl v24.8h, v13.8b, v31.8b + + st1 {v28.2s, v29.2s}, [x1], x3 // store row 1 + + uaddl v28.8h, v18.8b, v23.8b + umlal v28.8h, v20.8b, v30.8b + umlal v28.8h, v21.8b, v30.8b + umlsl v28.8h, v19.8b, v31.8b + umlsl v28.8h, v22.8b, v31.8b + + ld1 {v18.2s, v19.2s}, [x11], x2 // horz row 2, col 1 + sqrshrun v27.8b, v24.8h, #5 + ext v23.8b, v18.8b , v19.8b , #5 + ext v20.8b, v18.8b , v19.8b , #2 + ext v21.8b, v18.8b , v19.8b , #3 + ext v22.8b, v18.8b , v19.8b , #4 + ext v19.8b, v18.8b , v19.8b , #1 + + sqrshrun v28.8b, v28.8h, #5 + ld1 {v16.2s, v17.2s}, [x7], x2 // src[8_0] + uaddl v24.8h, v18.8b, v23.8b + umlal v24.8h, v20.8b, v30.8b + umlal v24.8h, v21.8b, v30.8b + umlsl v24.8h, v19.8b, v31.8b + umlsl v24.8h, v22.8b, v31.8b + + ld1 {v18.2s, v19.2s}, [x6], x2 // horz row 3, col 0 + uaddl v0.8h, v6.8b, v16.8b + umlal v0.8h, v10.8b, v30.8b + umlal v0.8h, v12.8b, v30.8b + umlsl v0.8h, v8.8b, v31.8b + umlsl v0.8h, v14.8b, v31.8b + + sqrshrun v29.8b, v24.8h, #5 + + ext v23.8b, v18.8b , v19.8b , #5 + ext v20.8b, v18.8b , v19.8b , #2 + ext v21.8b, v18.8b , v19.8b , #3 + urhadd v28.16b, v28.16b , v26.16b + urhadd v29.16b, v29.16b , v27.16b + ext v22.8b, v18.8b , v19.8b , #4 + ext v19.8b, v18.8b , v19.8b , #1 + sqrshrun v26.8b, v0.8h, #5 + st1 {v28.2s, v29.2s}, [x1], x3 // store row 2 + + uaddl v24.8h, v18.8b, v23.8b + umlal v24.8h, v20.8b, v30.8b + umlal v24.8h, v21.8b, v30.8b + umlsl v24.8h, v19.8b, v31.8b + umlsl v24.8h, v22.8b, v31.8b + + ld1 {v18.2s, v19.2s}, [x11], x2 // horz row 3, col 1 + + uaddl v0.8h, v7.8b, v17.8b + umlal v0.8h, v11.8b, v30.8b + umlal v0.8h, v13.8b, v30.8b + umlsl v0.8h, v9.8b, v31.8b + umlsl v0.8h, v15.8b, v31.8b + + sqrshrun v28.8b, v24.8h, #5 + + ext v23.8b, v18.8b , v19.8b , #5 + ext v20.8b, v18.8b , v19.8b , #2 + ext v21.8b, v18.8b , v19.8b , #3 + ext v22.8b, v18.8b , v19.8b , #4 + ext 
v19.8b, v18.8b , v19.8b , #1 + + sqrshrun v27.8b, v0.8h, #5 + + uaddl v24.8h, v18.8b, v23.8b + umlal v24.8h, v20.8b, v30.8b + umlal v24.8h, v21.8b, v30.8b + umlsl v24.8h, v19.8b, v31.8b + umlsl v24.8h, v22.8b, v31.8b + + mov v0.16b, v8.16b + mov v1.16b, v9.16b + + mov v2.16b, v10.16b + mov v3.16b, v11.16b + + mov v4.16b, v12.16b + mov v5.16b, v13.16b + + mov v6.16b, v14.16b + mov v7.16b, v15.16b + + mov v8.16b, v16.16b + mov v9.16b, v17.16b + + sqrshrun v29.8b, v24.8h, #5 + urhadd v28.16b, v28.16b , v26.16b + urhadd v29.16b, v29.16b , v27.16b + st1 {v28.2s, v29.2s}, [x1], x3 // store row 3 + + ld1 {v10.2s, v11.2s}, [x7], x2 // Vector load from src[9_0] + ld1 {v18.2s, v19.2s}, [x6], x2 // horz row4, col 0 + uaddl v24.8h, v0.8b, v10.8b + umlal v24.8h, v4.8b, v30.8b + umlal v24.8h, v6.8b, v30.8b + umlsl v24.8h, v2.8b, v31.8b + umlsl v24.8h, v8.8b, v31.8b + ext v23.8b, v18.8b , v19.8b , #5 + ext v20.8b, v18.8b , v19.8b , #2 + ext v21.8b, v18.8b , v19.8b , #3 + ext v22.8b, v18.8b , v19.8b , #4 + ext v19.8b, v18.8b , v19.8b , #1 + sqrshrun v26.8b, v24.8h, #5 + uaddl v28.8h, v18.8b, v23.8b + umlal v28.8h, v20.8b, v30.8b + umlal v28.8h, v21.8b, v30.8b + umlsl v28.8h, v19.8b, v31.8b + umlsl v28.8h, v22.8b, v31.8b + ld1 {v18.2s, v19.2s}, [x11], x2 // horz row 4, col 1 + uaddl v24.8h, v1.8b, v11.8b + umlal v24.8h, v5.8b, v30.8b + umlal v24.8h, v7.8b, v30.8b + umlsl v24.8h, v3.8b, v31.8b + umlsl v24.8h, v9.8b, v31.8b + sqrshrun v28.8b, v28.8h, #5 + ext v23.8b, v18.8b , v19.8b , #5 + ext v20.8b, v18.8b , v19.8b , #2 + ext v21.8b, v18.8b , v19.8b , #3 + ext v22.8b, v18.8b , v19.8b , #4 + ext v19.8b, v18.8b , v19.8b , #1 + + sqrshrun v27.8b, v24.8h, #5 + ld1 {v12.2s, v13.2s}, [x7], x2 // src[10_0] + uaddl v24.8h, v18.8b, v23.8b + umlal v24.8h, v20.8b, v30.8b + umlal v24.8h, v21.8b, v30.8b + umlsl v24.8h, v19.8b, v31.8b + umlsl v24.8h, v22.8b, v31.8b + uaddl v16.8h, v2.8b, v12.8b + umlal v16.8h, v6.8b, v30.8b + umlal v16.8h, v8.8b, v30.8b + umlsl v16.8h, v4.8b, v31.8b + umlsl 
v16.8h, v10.8b, v31.8b + sqrshrun v29.8b, v24.8h, #5 + ld1 {v18.2s, v19.2s}, [x6], x2 // horz row 5, col 0 + uaddl v24.8h, v3.8b, v13.8b + umlal v24.8h, v7.8b, v30.8b + umlal v24.8h, v9.8b, v30.8b + umlsl v24.8h, v5.8b, v31.8b + umlsl v24.8h, v11.8b, v31.8b + urhadd v28.16b, v28.16b , v26.16b + urhadd v29.16b, v29.16b , v27.16b + sqrshrun v26.8b, v16.8h, #5 + ext v23.8b, v18.8b , v19.8b , #5 + ext v20.8b, v18.8b , v19.8b , #2 + ext v21.8b, v18.8b , v19.8b , #3 + ext v22.8b, v18.8b , v19.8b , #4 + st1 {v28.2s, v29.2s}, [x1], x3 // store row 4 + ext v19.8b, v18.8b , v19.8b , #1 + + sqrshrun v27.8b, v24.8h, #5 + + uaddl v28.8h, v18.8b, v23.8b + umlal v28.8h, v20.8b, v30.8b + umlal v28.8h, v21.8b, v30.8b + umlsl v28.8h, v19.8b, v31.8b + umlsl v28.8h, v22.8b, v31.8b + + ld1 {v18.2s, v19.2s}, [x11], x2 // horz row 5, col 1 + ld1 {v14.2s, v15.2s}, [x7], x2 // src[11_0] + + ext v23.8b, v18.8b , v19.8b , #5 + ext v20.8b, v18.8b , v19.8b , #2 + ext v21.8b, v18.8b , v19.8b , #3 + ext v22.8b, v18.8b , v19.8b , #4 + ext v19.8b, v18.8b , v19.8b , #1 + + sqrshrun v28.8b, v28.8h, #5 + uaddl v24.8h, v18.8b, v23.8b + umlal v24.8h, v20.8b, v30.8b + umlal v24.8h, v21.8b, v30.8b + umlsl v24.8h, v19.8b, v31.8b + umlsl v24.8h, v22.8b, v31.8b + + ld1 {v18.2s, v19.2s}, [x6], x2 // horz row 6, col 0 + uaddl v16.8h, v4.8b, v14.8b + umlal v16.8h, v8.8b, v30.8b + umlal v16.8h, v10.8b, v30.8b + umlsl v16.8h, v6.8b, v31.8b + umlsl v16.8h, v12.8b, v31.8b + + sqrshrun v29.8b, v24.8h, #5 + ext v23.8b, v18.8b , v19.8b , #5 + ext v20.8b, v18.8b , v19.8b , #2 + ext v21.8b, v18.8b , v19.8b , #3 + ext v22.8b, v18.8b , v19.8b , #4 + ext v19.8b, v18.8b , v19.8b , #1 + urhadd v28.16b, v28.16b , v26.16b + urhadd v29.16b, v29.16b , v27.16b + sqrshrun v26.8b, v16.8h, #5 + + uaddl v24.8h, v5.8b, v15.8b + umlal v24.8h, v9.8b, v30.8b + umlal v24.8h, v11.8b, v30.8b + umlsl v24.8h, v7.8b, v31.8b + umlsl v24.8h, v13.8b, v31.8b + + st1 {v28.2s, v29.2s}, [x1], x3 // store row 5 + + uaddl v28.8h, v18.8b, v23.8b + 
umlal v28.8h, v20.8b, v30.8b + umlal v28.8h, v21.8b, v30.8b + umlsl v28.8h, v19.8b, v31.8b + umlsl v28.8h, v22.8b, v31.8b + + ld1 {v18.2s, v19.2s}, [x11], x2 // horz row 6, col 1 + sqrshrun v27.8b, v24.8h, #5 + ext v23.8b, v18.8b , v19.8b , #5 + ext v20.8b, v18.8b , v19.8b , #2 + ext v21.8b, v18.8b , v19.8b , #3 + ext v22.8b, v18.8b , v19.8b , #4 + ext v19.8b, v18.8b , v19.8b , #1 + + sqrshrun v28.8b, v28.8h, #5 + ld1 {v16.2s, v17.2s}, [x7], x2 // src[12_0] + uaddl v24.8h, v18.8b, v23.8b + umlal v24.8h, v20.8b, v30.8b + umlal v24.8h, v21.8b, v30.8b + umlsl v24.8h, v19.8b, v31.8b + umlsl v24.8h, v22.8b, v31.8b + + ld1 {v18.2s, v19.2s}, [x6], x2 // horz row 7, col 0 + uaddl v0.8h, v6.8b, v16.8b + umlal v0.8h, v10.8b, v30.8b + umlal v0.8h, v12.8b, v30.8b + umlsl v0.8h, v8.8b, v31.8b + umlsl v0.8h, v14.8b, v31.8b + + sqrshrun v29.8b, v24.8h, #5 + + ext v23.8b, v18.8b , v19.8b , #5 + ext v20.8b, v18.8b , v19.8b , #2 + ext v21.8b, v18.8b , v19.8b , #3 + urhadd v28.16b, v28.16b , v26.16b + urhadd v29.16b, v29.16b , v27.16b + ext v22.8b, v18.8b , v19.8b , #4 + ext v19.8b, v18.8b , v19.8b , #1 + sqrshrun v26.8b, v0.8h, #5 + st1 {v28.2s, v29.2s}, [x1], x3 // store row 6 + + uaddl v24.8h, v18.8b, v23.8b + umlal v24.8h, v20.8b, v30.8b + umlal v24.8h, v21.8b, v30.8b + umlsl v24.8h, v19.8b, v31.8b + umlsl v24.8h, v22.8b, v31.8b + + ld1 {v18.2s, v19.2s}, [x11], x2 // horz row 7, col 1 + + uaddl v0.8h, v7.8b, v17.8b + umlal v0.8h, v11.8b, v30.8b + umlal v0.8h, v13.8b, v30.8b + umlsl v0.8h, v9.8b, v31.8b + umlsl v0.8h, v15.8b, v31.8b + + sqrshrun v28.8b, v24.8h, #5 + + ext v23.8b, v18.8b , v19.8b , #5 + ext v20.8b, v18.8b , v19.8b , #2 + ext v21.8b, v18.8b , v19.8b , #3 + ext v22.8b, v18.8b , v19.8b , #4 + ext v19.8b, v18.8b , v19.8b , #1 + + sqrshrun v27.8b, v0.8h, #5 + + uaddl v24.8h, v18.8b, v23.8b + umlal v24.8h, v20.8b, v30.8b + umlal v24.8h, v21.8b, v30.8b + umlsl v24.8h, v19.8b, v31.8b + umlsl v24.8h, v22.8b, v31.8b + + mov v0.16b, v8.16b + mov v1.16b, v9.16b + + mov v2.16b, 
v10.16b + mov v3.16b, v11.16b + + mov v4.16b, v12.16b + mov v5.16b, v13.16b + + mov v6.16b, v14.16b + mov v7.16b, v15.16b + + mov v8.16b, v16.16b + mov v9.16b, v17.16b + + sqrshrun v29.8b, v24.8h, #5 + subs x4, x4, #8 + urhadd v28.16b, v28.16b , v26.16b + urhadd v29.16b, v29.16b , v27.16b + st1 {v28.2s, v29.2s}, [x1], x3 // store row 7 + + beq end_func // stop looping if ht == 8 + b loop_16 + + +loop_8_start: + ld1 {v0.2s}, [x7], x2 // Vector load from src[0_0] + ld1 {v1.2s}, [x7], x2 // Vector load from src[1_0] + ld1 {v2.2s}, [x7], x2 // Vector load from src[2_0] + ld1 {v3.2s}, [x7], x2 // Vector load from src[3_0] + ld1 {v4.2s}, [x7], x2 // Vector load from src[4_0] + +loop_8: + ld1 {v5.2s}, [x7], x2 // Vector load from src[5_0] + uaddl v10.8h, v0.8b, v5.8b + umlal v10.8h, v2.8b, v30.8b + umlal v10.8h, v3.8b, v30.8b + umlsl v10.8h, v1.8b, v31.8b + umlsl v10.8h, v4.8b, v31.8b + ld1 {v12.2s, v13.2s}, [x6], x2 //horz row 0 + ext v17.8b, v12.8b , v13.8b , #5 + ext v14.8b, v12.8b , v13.8b , #2 + ext v15.8b, v12.8b , v13.8b , #3 + ext v16.8b, v12.8b , v13.8b , #4 + ext v13.8b, v12.8b , v13.8b , #1 + sqrshrun v26.8b, v10.8h, #5 + ld1 {v6.2s}, [x7], x2 // src[6_0] + uaddl v10.8h, v12.8b, v17.8b + umlal v10.8h, v14.8b, v30.8b + umlal v10.8h, v15.8b, v30.8b + umlsl v10.8h, v13.8b, v31.8b + umlsl v10.8h, v16.8b, v31.8b + ld1 {v12.2s, v13.2s}, [x6], x2 // horz row 1 + uaddl v18.8h, v1.8b, v6.8b + umlal v18.8h, v3.8b, v30.8b + umlal v18.8h, v4.8b, v30.8b + umlsl v18.8h, v2.8b, v31.8b + umlsl v18.8h, v5.8b, v31.8b + sqrshrun v28.8b, v10.8h, #5 + ext v17.8b, v12.8b , v13.8b , #5 + ext v14.8b, v12.8b , v13.8b , #2 + ext v15.8b, v12.8b , v13.8b , #3 + ext v16.8b, v12.8b , v13.8b , #4 + ext v13.8b, v12.8b , v13.8b , #1 + sqrshrun v27.8b, v18.8h, #5 + ld1 {v7.2s}, [x7], x2 // src[7_0] + uaddl v10.8h, v12.8b, v17.8b + umlal v10.8h, v14.8b, v30.8b + umlal v10.8h, v15.8b, v30.8b + umlsl v10.8h, v13.8b, v31.8b + umlsl v10.8h, v16.8b, v31.8b + ld1 {v12.2s, v13.2s}, [x6], x2 // horz row 
2 + uaddl v18.8h, v2.8b, v7.8b + umlal v18.8h, v4.8b, v30.8b + umlal v18.8h, v5.8b, v30.8b + umlsl v18.8h, v3.8b, v31.8b + umlsl v18.8h, v6.8b, v31.8b + sqrshrun v29.8b, v10.8h, #5 + ext v17.8b, v12.8b , v13.8b , #5 + ext v14.8b, v12.8b , v13.8b , #2 + ext v15.8b, v12.8b , v13.8b , #3 + ext v16.8b, v12.8b , v13.8b , #4 + ext v13.8b, v12.8b , v13.8b , #1 + urhadd v26.16b, v26.16b , v28.16b + urhadd v27.16b, v27.16b , v29.16b + sqrshrun v28.8b, v18.8h, #5 + ld1 {v8.2s}, [x7], x2 // src[8_0] + uaddl v10.8h, v12.8b, v17.8b + umlal v10.8h, v14.8b, v30.8b + umlal v10.8h, v15.8b, v30.8b + umlsl v10.8h, v13.8b, v31.8b + umlsl v10.8h, v16.8b, v31.8b + ld1 {v12.2s, v13.2s}, [x6], x2 // horz row 3 + uaddl v18.8h, v3.8b, v8.8b + umlal v18.8h, v5.8b, v30.8b + umlal v18.8h, v6.8b, v30.8b + umlsl v18.8h, v4.8b, v31.8b + umlsl v18.8h, v7.8b, v31.8b + sqrshrun v24.8b, v10.8h, #5 + ext v17.8b, v12.8b , v13.8b , #5 + ext v14.8b, v12.8b , v13.8b , #2 + ext v15.8b, v12.8b , v13.8b , #3 + ext v16.8b, v12.8b , v13.8b , #4 + ext v13.8b, v12.8b , v13.8b , #1 + sqrshrun v29.8b, v18.8h, #5 + uaddl v10.8h, v12.8b, v17.8b + umlal v10.8h, v14.8b, v30.8b + umlal v10.8h, v15.8b, v30.8b + umlsl v10.8h, v13.8b, v31.8b + umlsl v10.8h, v16.8b, v31.8b + st1 {v26.2s}, [x1], x3 + + mov v0.16b, v4.16b + mov v1.16b, v5.16b + + st1 {v27.2s}, [x1], x3 + + mov v2.16b, v6.16b + mov v3.16b, v7.16b + + mov v4.8b, v8.8b + + sqrshrun v25.8b, v10.8h, #5 + subs x9, x4, #4 + urhadd v24.16b, v24.16b , v28.16b + urhadd v25.16b, v25.16b , v29.16b + st1 {v24.2s}, [x1], x3 + st1 {v25.2s}, [x1], x3 + beq end_func // Branch if height==4 + + ld1 {v5.2s}, [x7], x2 // Vector load from src[9_0] + uaddl v10.8h, v0.8b, v5.8b + umlal v10.8h, v2.8b, v30.8b + umlal v10.8h, v3.8b, v30.8b + umlsl v10.8h, v1.8b, v31.8b + umlsl v10.8h, v4.8b, v31.8b + ld1 {v12.2s, v13.2s}, [x6], x2 //horz row 4 + ext v17.8b, v12.8b , v13.8b , #5 + ext v14.8b, v12.8b , v13.8b , #2 + ext v15.8b, v12.8b , v13.8b , #3 + ext v16.8b, v12.8b , v13.8b , #4 + 
ext v13.8b, v12.8b , v13.8b , #1 + sqrshrun v26.8b, v10.8h, #5 + ld1 {v6.2s}, [x7], x2 // src[10_0] + uaddl v10.8h, v12.8b, v17.8b + umlal v10.8h, v14.8b, v30.8b + umlal v10.8h, v15.8b, v30.8b + umlsl v10.8h, v13.8b, v31.8b + umlsl v10.8h, v16.8b, v31.8b + ld1 {v12.2s, v13.2s}, [x6], x2 // horz row 5 + uaddl v18.8h, v1.8b, v6.8b + umlal v18.8h, v3.8b, v30.8b + umlal v18.8h, v4.8b, v30.8b + umlsl v18.8h, v2.8b, v31.8b + umlsl v18.8h, v5.8b, v31.8b + sqrshrun v28.8b, v10.8h, #5 + ext v17.8b, v12.8b , v13.8b , #5 + ext v14.8b, v12.8b , v13.8b , #2 + ext v15.8b, v12.8b , v13.8b , #3 + ext v16.8b, v12.8b , v13.8b , #4 + ext v13.8b, v12.8b , v13.8b , #1 + sqrshrun v27.8b, v18.8h, #5 + ld1 {v7.2s}, [x7], x2 // src[11_0] + uaddl v10.8h, v12.8b, v17.8b + umlal v10.8h, v14.8b, v30.8b + umlal v10.8h, v15.8b, v30.8b + umlsl v10.8h, v13.8b, v31.8b + umlsl v10.8h, v16.8b, v31.8b + ld1 {v12.2s, v13.2s}, [x6], x2 // horz row 6 + uaddl v18.8h, v2.8b, v7.8b + umlal v18.8h, v4.8b, v30.8b + umlal v18.8h, v5.8b, v30.8b + umlsl v18.8h, v3.8b, v31.8b + umlsl v18.8h, v6.8b, v31.8b + sqrshrun v29.8b, v10.8h, #5 + ext v17.8b, v12.8b , v13.8b , #5 + ext v14.8b, v12.8b , v13.8b , #2 + ext v15.8b, v12.8b , v13.8b , #3 + ext v16.8b, v12.8b , v13.8b , #4 + ext v13.8b, v12.8b , v13.8b , #1 + urhadd v26.16b, v26.16b , v28.16b + urhadd v27.16b, v27.16b , v29.16b + sqrshrun v28.8b, v18.8h, #5 + ld1 {v8.2s}, [x7], x2 // src[12_0] + uaddl v10.8h, v12.8b, v17.8b + umlal v10.8h, v14.8b, v30.8b + umlal v10.8h, v15.8b, v30.8b + umlsl v10.8h, v13.8b, v31.8b + umlsl v10.8h, v16.8b, v31.8b + ld1 {v12.2s, v13.2s}, [x6], x2 // horz row 7 + uaddl v18.8h, v3.8b, v8.8b + umlal v18.8h, v5.8b, v30.8b + umlal v18.8h, v6.8b, v30.8b + umlsl v18.8h, v4.8b, v31.8b + umlsl v18.8h, v7.8b, v31.8b + sqrshrun v24.8b, v10.8h, #5 + ext v17.8b, v12.8b , v13.8b , #5 + ext v14.8b, v12.8b , v13.8b , #2 + ext v15.8b, v12.8b , v13.8b , #3 + ext v16.8b, v12.8b , v13.8b , #4 + ext v13.8b, v12.8b , v13.8b , #1 + sqrshrun v29.8b, 
v18.8h, #5 + uaddl v10.8h, v12.8b, v17.8b + umlal v10.8h, v14.8b, v30.8b + umlal v10.8h, v15.8b, v30.8b + umlsl v10.8h, v13.8b, v31.8b + umlsl v10.8h, v16.8b, v31.8b + st1 {v26.2s}, [x1], x3 + + mov v0.16b, v4.16b + mov v1.16b, v5.16b + st1 {v27.2s}, [x1], x3 + + mov v2.16b, v6.16b + mov v3.16b, v7.16b + + mov v4.8b, v8.8b + mov v5.8b, v9.8b + + sqrshrun v25.8b, v10.8h, #5 + subs x4, x4, #8 + urhadd v24.16b, v24.16b , v28.16b + urhadd v25.16b, v25.16b , v29.16b + st1 {v24.2s}, [x1], x3 + st1 {v25.2s}, [x1], x3 + bgt loop_8 //if height =8 loop + b end_func + +loop_4_start: + ld1 {v0.s}[0], [x7], x2 // Vector load from src[0_0] + ld1 {v1.s}[0], [x7], x2 // Vector load from src[1_0] + + ld1 {v2.s}[0], [x7], x2 // Vector load from src[2_0] + ld1 {v3.s}[0], [x7], x2 // Vector load from src[3_0] + ld1 {v4.s}[0], [x7], x2 // Vector load from src[4_0] + + ld1 {v5.s}[0], [x7], x2 // Vector load from src[5_0] + uaddl v10.8h, v0.8b, v5.8b + umlal v10.8h, v2.8b, v30.8b + umlal v10.8h, v3.8b, v30.8b + umlsl v10.8h, v1.8b, v31.8b + umlsl v10.8h, v4.8b, v31.8b + ld1 {v12.2s, v13.2s}, [x6], x2 //load for horz filter row 0 + ext v17.8b, v12.8b , v13.8b , #5 + ext v14.8b, v12.8b , v13.8b , #2 + ext v15.8b, v12.8b , v13.8b , #3 + ext v16.8b, v12.8b , v13.8b , #4 + ext v13.8b, v12.8b , v13.8b , #1 + sqrshrun v26.8b, v10.8h, #5 + ld1 {v6.s}[0], [x7], x2 // Vector load from src[6_0] + uaddl v10.8h, v12.8b, v17.8b + umlal v10.8h, v14.8b, v30.8b + umlal v10.8h, v15.8b, v30.8b + umlsl v10.8h, v13.8b, v31.8b + umlsl v10.8h, v16.8b, v31.8b + ld1 {v12.2s, v13.2s}, [x6], x2 //horz row 1 + uaddl v18.8h, v1.8b, v6.8b + umlal v18.8h, v3.8b, v30.8b + umlal v18.8h, v4.8b, v30.8b + umlsl v18.8h, v2.8b, v31.8b + umlsl v18.8h, v5.8b, v31.8b + sqrshrun v28.8b, v10.8h, #5 + ext v17.8b, v12.8b , v13.8b , #5 + ext v14.8b, v12.8b , v13.8b , #2 + ext v15.8b, v12.8b , v13.8b , #3 + ext v16.8b, v12.8b , v13.8b , #4 + ext v13.8b, v12.8b , v13.8b , #1 + sqrshrun v27.8b, v18.8h, #5 + ld1 {v7.s}[0], [x7], x2 // 
Vector load from src[7_0] + uaddl v10.8h, v12.8b, v17.8b + umlal v10.8h, v14.8b, v30.8b + umlal v10.8h, v15.8b, v30.8b + umlsl v10.8h, v13.8b, v31.8b + umlsl v10.8h, v16.8b, v31.8b + ld1 {v12.2s, v13.2s}, [x6], x2 //horz row 2 + uaddl v18.8h, v2.8b, v7.8b + umlal v18.8h, v4.8b, v30.8b + umlal v18.8h, v5.8b, v30.8b + umlsl v18.8h, v3.8b, v31.8b + umlsl v18.8h, v6.8b, v31.8b + sqrshrun v29.8b, v10.8h, #5 + ext v17.8b, v12.8b , v13.8b , #5 + ext v14.8b, v12.8b , v13.8b , #2 + ext v15.8b, v12.8b , v13.8b , #3 + ext v16.8b, v12.8b , v13.8b , #4 + ext v13.8b, v12.8b , v13.8b , #1 + urhadd v26.16b, v26.16b , v28.16b + urhadd v27.16b, v27.16b , v29.16b + sqrshrun v28.8b, v18.8h, #5 + ld1 {v8.s}[0], [x7], x2 // Vector load from src[8_0] + uaddl v10.8h, v12.8b, v17.8b + umlal v10.8h, v14.8b, v30.8b + umlal v10.8h, v15.8b, v30.8b + umlsl v10.8h, v13.8b, v31.8b + umlsl v10.8h, v16.8b, v31.8b + ld1 {v12.2s, v13.2s}, [x6], x2 //horz row 3 + uaddl v18.8h, v3.8b, v8.8b + umlal v18.8h, v5.8b, v30.8b + umlal v18.8h, v6.8b, v30.8b + umlsl v18.8h, v4.8b, v31.8b + umlsl v18.8h, v7.8b, v31.8b + sqrshrun v24.8b, v10.8h, #5 + ext v17.8b, v12.8b , v13.8b , #5 + ext v14.8b, v12.8b , v13.8b , #2 + ext v15.8b, v12.8b , v13.8b , #3 + ext v16.8b, v12.8b , v13.8b , #4 + ext v13.8b, v12.8b , v13.8b , #1 + sqrshrun v29.8b, v18.8h, #5 + uaddl v10.8h, v12.8b, v17.8b + umlal v10.8h, v14.8b, v30.8b + umlal v10.8h, v15.8b, v30.8b + umlsl v10.8h, v13.8b, v31.8b + umlsl v10.8h, v16.8b, v31.8b + st1 {v26.s}[0], [x1], x3 + + mov v0.16b, v4.16b + mov v1.16b, v5.16b + + st1 {v27.s}[0], [x1], x3 + + mov v2.16b, v6.16b + mov v3.16b, v7.16b + mov v4.8b, v8.8b + + sqrshrun v25.8b, v10.8h, #5 + subs x4, x4, #4 + urhadd v24.16b, v24.16b , v28.16b + urhadd v25.16b, v25.16b , v29.16b + st1 {v24.s}[0], [x1], x3 + st1 {v25.s}[0], [x1], x3 + beq end_func // Branch if height==4 + + ld1 {v5.s}[0], [x7], x2 // Vector load from src[5_0] + uaddl v10.8h, v0.8b, v5.8b + umlal v10.8h, v2.8b, v30.8b + umlal v10.8h, v3.8b, 
v30.8b + umlsl v10.8h, v1.8b, v31.8b + umlsl v10.8h, v4.8b, v31.8b + ld1 {v12.2s, v13.2s}, [x6], x2 //load for horz filter row 4 + ext v17.8b, v12.8b , v13.8b , #5 + ext v14.8b, v12.8b , v13.8b , #2 + ext v15.8b, v12.8b , v13.8b , #3 + ext v16.8b, v12.8b , v13.8b , #4 + ext v13.8b, v12.8b , v13.8b , #1 + sqrshrun v26.8b, v10.8h, #5 + ld1 {v6.s}[0], [x7], x2 + uaddl v10.8h, v12.8b, v17.8b + umlal v10.8h, v14.8b, v30.8b + umlal v10.8h, v15.8b, v30.8b + umlsl v10.8h, v13.8b, v31.8b + umlsl v10.8h, v16.8b, v31.8b + ld1 {v12.2s, v13.2s}, [x6], x2 //horz row 5 + uaddl v18.8h, v1.8b, v6.8b + umlal v18.8h, v3.8b, v30.8b + umlal v18.8h, v4.8b, v30.8b + umlsl v18.8h, v2.8b, v31.8b + umlsl v18.8h, v5.8b, v31.8b + sqrshrun v28.8b, v10.8h, #5 + ext v17.8b, v12.8b , v13.8b , #5 + ext v14.8b, v12.8b , v13.8b , #2 + ext v15.8b, v12.8b , v13.8b , #3 + ext v16.8b, v12.8b , v13.8b , #4 + ext v13.8b, v12.8b , v13.8b , #1 + sqrshrun v27.8b, v18.8h, #5 + ld1 {v7.s}[0], [x7], x2 + uaddl v10.8h, v12.8b, v17.8b + umlal v10.8h, v14.8b, v30.8b + umlal v10.8h, v15.8b, v30.8b + umlsl v10.8h, v13.8b, v31.8b + umlsl v10.8h, v16.8b, v31.8b + ld1 {v12.2s, v13.2s}, [x6], x2 //horz row 6 + uaddl v18.8h, v2.8b, v7.8b + umlal v18.8h, v4.8b, v30.8b + umlal v18.8h, v5.8b, v30.8b + umlsl v18.8h, v3.8b, v31.8b + umlsl v18.8h, v6.8b, v31.8b + sqrshrun v29.8b, v10.8h, #5 + ext v17.8b, v12.8b , v13.8b , #5 + ext v14.8b, v12.8b , v13.8b , #2 + ext v15.8b, v12.8b , v13.8b , #3 + ext v16.8b, v12.8b , v13.8b , #4 + ext v13.8b, v12.8b , v13.8b , #1 + urhadd v26.16b, v26.16b , v28.16b + urhadd v27.16b, v27.16b , v29.16b + sqrshrun v28.8b, v18.8h, #5 + ld1 {v8.s}[0], [x7], x2 + uaddl v10.8h, v12.8b, v17.8b + umlal v10.8h, v14.8b, v30.8b + umlal v10.8h, v15.8b, v30.8b + umlsl v10.8h, v13.8b, v31.8b + umlsl v10.8h, v16.8b, v31.8b + ld1 {v12.2s, v13.2s}, [x6], x2 //horz row 7 + uaddl v18.8h, v3.8b, v8.8b + umlal v18.8h, v5.8b, v30.8b + umlal v18.8h, v6.8b, v30.8b + umlsl v18.8h, v4.8b, v31.8b + umlsl v18.8h, v7.8b, 
v31.8b + sqrshrun v24.8b, v10.8h, #5 + ext v17.8b, v12.8b , v13.8b , #5 + ext v14.8b, v12.8b , v13.8b , #2 + ext v15.8b, v12.8b , v13.8b , #3 + ext v16.8b, v12.8b , v13.8b , #4 + ext v13.8b, v12.8b , v13.8b , #1 + sqrshrun v29.8b, v18.8h, #5 + uaddl v10.8h, v12.8b, v17.8b + umlal v10.8h, v14.8b, v30.8b + umlal v10.8h, v15.8b, v30.8b + umlsl v10.8h, v13.8b, v31.8b + umlsl v10.8h, v16.8b, v31.8b + st1 {v26.s}[0], [x1], x3 + st1 {v27.s}[0], [x1], x3 + sqrshrun v25.8b, v10.8h, #5 + urhadd v24.16b, v24.16b , v28.16b + urhadd v25.16b, v25.16b , v29.16b + st1 {v24.s}[0], [x1], x3 + st1 {v25.s}[0], [x1], x3 + +end_func: + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + diff --git a/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s new file mode 100755 index 0000000..9d19a2d --- /dev/null +++ b/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s @@ -0,0 +1,511 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +//*/ +///** +//****************************************************************************** +//* @file +//* ih264_inter_pred_luma_vert_qpel_av8.s +//* +//* @brief +//* Contains function definitions for inter prediction vertical quarter pel interpolation. +//* +//* @author +//* Mohit +//* +//* @par List of Functions: +//* +//* - ih264_inter_pred_luma_vert_qpel_av8() +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ + +///* All the functions here are replicated from ih264_inter_pred_filters.c +// + +///** +///** +//******************************************************************************* +//* +//* @brief +//* Quarter pel interprediction luma filter for vertical input +//* +//* @par Description: +//* Applies a 6 tap horizontal filter .The output is clipped to 8 bits +//* sec 8.4.2.2.1 titled "Luma sample interpolation process" +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ht +//* integer height of the array +//* +//* @param[in] wd +//* integer width of the array +//* +//* @param[in] pu1_tmp: temporary buffer: UNUSED in this function +//* +//* @param[in] dydx: x and y reference offset for qpel calculations. 
+//* @returns +//* +// @remarks +//* None +//* +//******************************************************************************* +//*/ + +//void ih264_inter_pred_luma_vert ( +// UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ht, +// WORD32 wd, +// UWORD8* pu1_tmp, +// UWORD32 dydx) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ht +// x5 => wd +// x7 => dydx + +.text +.p2align 2 +.include "ih264_neon_macros.s" + + + + .global ih264_inter_pred_luma_vert_qpel_av8 + +ih264_inter_pred_luma_vert_qpel_av8: + + push_v_regs + stp x19, x20, [sp, #-16]! + + + and x7, x7, #12 //Finds y-offset + lsr x7, x7, #3 //dydx>>3 + mul x7, x2, x7 + add x7, x0, x7 //pu1_src + (y_offset>>1)*src_strd + sub x14, x4, #16 + movi v22.8h, #20 // Filter coeff 0x14 into Q11 + sub x0, x0, x2, lsl #1 //pu1_src-2*src_strd + subs x12, x5, #8 //if wd=8 branch to loop_8 + movi v24.8h, #5 // Filter coeff 0x4 into Q12 + beq loop_8_start + + subs x12, x5, #4 //if wd=4 branch to loop_4 + beq loop_4_start + + + ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[0_0] + ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[1_0] + ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[2_0] + ld1 {v6.2s, v7.2s}, [x0], x2 // Vector load from src[3_0] + add x14, x14, #1 //for checking loop + ld1 {v8.2s, v9.2s}, [x0], x2 // Vector load from src[4_0] + uaddl v12.8h, v4.8b, v6.8b // temp1 = src[2_0] + src[3_0] + ld1 {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0] + +loop_16: //when wd=16 + + uaddl v14.8h, v0.8b, v10.8b // temp = src[0_0] + src[5_0] + uaddl v16.8h, v2.8b, v8.8b // temp2 = src[1_0] + src[4_0] + mla v14.8h, v12.8h , v22.8h // temp += temp1 * 20 + uaddl v20.8h, v1.8b, v11.8b // temp4 = src[0_8] + src[5_8] + uaddl v18.8h, v5.8b, v7.8b // temp3 = src[2_8] + src[3_8] + mla v20.8h, v18.8h , v22.8h // temp4 += temp3 * 20 + ld1 {v0.2s, 
v1.2s}, [x0], x2 + uaddl v26.8h, v3.8b, v9.8b // temp5 = src[1_8] + src[4_8] + uaddl v12.8h, v6.8b, v8.8b + mls v14.8h, v16.8h , v24.8h // temp -= temp2 * 5 + uaddl v16.8h, v2.8b, v0.8b + uaddl v18.8h, v4.8b, v10.8b + mla v16.8h, v12.8h , v22.8h + mls v20.8h, v26.8h , v24.8h // temp4 -= temp5 * 5 + uaddl v26.8h, v5.8b, v11.8b + uaddl v12.8h, v7.8b, v9.8b + sqrshrun v30.8b, v14.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5) + uaddl v14.8h, v3.8b, v1.8b + ld1 {v2.2s, v3.2s}, [x0], x2 + mla v14.8h, v12.8h , v22.8h + mls v16.8h, v18.8h , v24.8h + sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5) + ld1 {v20.2s, v21.2s}, [x7], x2 // Load for interpolation row 0 + urhadd v30.16b, v20.16b , v30.16b // Interpolation to obtain qpel value + urhadd v31.16b, v21.16b , v31.16b // Interpolation to obtain qpel value + uaddl v18.8h, v4.8b, v2.8b + uaddl v12.8h, v8.8b, v10.8b + st1 {v30.2s, v31.2s}, [x1], x3 // Vector store to dst[0_0] + mla v18.8h, v12.8h , v22.8h + uaddl v20.8h, v6.8b, v0.8b + mls v14.8h, v26.8h , v24.8h + sqrshrun v30.8b, v16.8h, #5 + uaddl v12.8h, v9.8b, v11.8b + uaddl v16.8h, v5.8b, v3.8b + uaddl v26.8h, v7.8b, v1.8b + mla v16.8h, v12.8h , v22.8h + mls v18.8h, v20.8h , v24.8h + ld1 {v4.2s, v5.2s}, [x0], x2 + sqrshrun v31.8b, v14.8h, #5 + ld1 {v14.2s, v15.2s}, [x7], x2 // Load for interpolation row 1 + uaddl v12.8h, v10.8b, v0.8b + urhadd v30.16b, v14.16b , v30.16b // Interpolation to obtain qpel value + urhadd v31.16b, v15.16b , v31.16b // Interpolation to obtain qpel value + uaddl v14.8h, v6.8b, v4.8b + uaddl v20.8h, v8.8b, v2.8b + mla v14.8h, v12.8h , v22.8h + mls v16.8h, v26.8h , v24.8h + st1 {v30.2s, v31.2s}, [x1], x3 //store row 1 + sqrshrun v30.8b, v18.8h, #5 + uaddl v18.8h, v7.8b, v5.8b + uaddl v12.8h, v11.8b, v1.8b + mla v18.8h, v12.8h , v22.8h + uaddl v26.8h, v9.8b, v3.8b + mls v14.8h, v20.8h , v24.8h + ld1 {v6.2s, v7.2s}, [x0], x2 + sqrshrun v31.8b, v16.8h, #5 + ld1 {v16.2s, v17.2s}, [x7], x2 // Load for interpolation row 2 + mls 
v18.8h, v26.8h , v24.8h + urhadd v30.16b, v16.16b , v30.16b // Interpolation to obtain qpel value + urhadd v31.16b, v17.16b , v31.16b // Interpolation to obtain qpel value + uaddl v12.8h, v0.8b, v2.8b // temp1 = src[2_0] + src[3_0] + st1 {v30.2s, v31.2s}, [x1], x3 //store row 2 + uaddl v16.8h, v10.8b, v4.8b // temp2 = src[1_0] + src[4_0] + uaddl v20.8h, v9.8b, v7.8b // temp4 = src[0_8] + src[5_8] + sqrshrun v30.8b, v14.8h, #5 + uaddl v26.8h, v5.8b, v11.8b // temp5 = src[1_8] + src[4_8] + uaddl v14.8h, v8.8b, v6.8b // temp = src[0_0] + src[5_0] + sqrshrun v31.8b, v18.8h, #5 + ld1 {v18.2s, v19.2s}, [x7], x2 // Load for interpolation row 3 + mla v14.8h, v12.8h , v22.8h // temp += temp1 * 20 + urhadd v30.16b, v18.16b , v30.16b // Interpolation to obtain qpel value + urhadd v31.16b, v19.16b , v31.16b // Interpolation to obtain qpel value + uaddl v18.8h, v1.8b, v3.8b // temp3 = src[2_8] + src[3_8] + st1 {v30.2s, v31.2s}, [x1], x3 //store row 3 + // 4 rows processed + mla v20.8h, v18.8h , v22.8h // temp4 += temp3 * 20 + ld1 {v8.2s, v9.2s}, [x0], x2 + uaddl v12.8h, v2.8b, v4.8b + uaddl v18.8h, v3.8b, v5.8b + mls v14.8h, v16.8h , v24.8h // temp -= temp2 * 5 + uaddl v28.8h, v9.8b, v11.8b + uaddl v16.8h, v6.8b, v0.8b + mla v28.8h, v18.8h , v22.8h // temp4 += temp3 * 20 + mls v20.8h, v26.8h , v24.8h // temp4 -= temp5 * 5 + uaddl v26.8h, v1.8b, v7.8b + uaddl v18.8h, v5.8b, v7.8b + sqrshrun v30.8b, v14.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5) + uaddl v14.8h, v8.8b, v10.8b + sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5) + ld1 {v20.2s, v21.2s}, [x7], x2 // Load for interpolation row 4 + ld1 {v10.2s, v11.2s}, [x0], x2 + urhadd v30.16b, v20.16b , v30.16b // Interpolation to obtain qpel value + urhadd v31.16b, v21.16b , v31.16b // Interpolation to obtain qpel value + mls v28.8h, v26.8h , v24.8h // temp4 -= temp5 * 5 + st1 {v30.2s, v31.2s}, [x1], x3 // store row 4 + mla v14.8h, v12.8h , v22.8h // temp += temp1 * 20 + uaddl v20.8h, v11.8b, v1.8b + uaddl 
v26.8h, v3.8b, v9.8b + mla v20.8h, v18.8h , v22.8h // temp4 += temp3 * 20 + uaddl v12.8h, v6.8b, v4.8b + uaddl v18.8h, v7.8b, v9.8b + sqrshrun v31.8b, v28.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5) + mls v14.8h, v16.8h , v24.8h // temp -= temp2 * 5 + uaddl v16.8h, v8.8b, v2.8b + sqrshrun v30.8b, v14.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5) + ld1 {v14.2s, v15.2s}, [x7], x2 // Load for interpolation row 5 + mls v20.8h, v26.8h , v24.8h // temp4 -= temp5 * 5 + urhadd v30.16b, v14.16b , v30.16b // Interpolation to obtain qpel value + urhadd v31.16b, v15.16b , v31.16b // Interpolation to obtain qpel value + uaddl v14.8h, v10.8b, v0.8b + st1 {v30.2s, v31.2s}, [x1], x3 // store row 5 + mla v14.8h, v12.8h , v22.8h // temp += temp1 * 20 + ld1 {v0.2s, v1.2s}, [x0], x2 + uaddl v26.8h, v5.8b, v11.8b + uaddl v12.8h, v8.8b, v6.8b + uaddl v28.8h, v0.8b, v2.8b + sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5) + mla v28.8h, v12.8h , v22.8h // temp += temp1 * 20 + uaddl v20.8h, v1.8b, v3.8b + mls v14.8h, v16.8h , v24.8h // temp -= temp2 * 5 + mla v20.8h, v18.8h , v22.8h // temp4 += temp3 * 20 + uaddl v16.8h, v10.8b, v4.8b + sqrshrun v30.8b, v14.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5) + ld1 {v14.2s, v15.2s}, [x7], x2 // Load for interpolation row 6 + mov v2.8b, v6.8b + mov v3.8b, v7.8b + urhadd v30.16b, v14.16b , v30.16b // Interpolation to obtain qpel value + urhadd v31.16b, v15.16b , v31.16b // Interpolation to obtain qpel value + + mls v28.8h, v16.8h , v24.8h // temp -= temp2 * 5 + st1 {v30.2s, v31.2s}, [x1], x3 // store row 6 + sqrshrun v30.8b, v28.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5) + swp v0.8b, v4.8b // swapping registers to put it in order + swp v1.8b, v5.8b // swapping registers to put it in order + + mls v20.8h, v26.8h , v24.8h // temp4 -= temp5 * 5 + mov v6.8b, v10.8b + mov v7.8b, v11.8b + subs x12, x14, #1 // if height==16 - looping + swp v4.8b, v8.8b + swp v5.8b, v9.8b + sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 
+16) >> 5) + ld1 {v20.2s, v21.2s}, [x7], x2 // Load for interpolation row 7 + urhadd v30.16b, v20.16b , v30.16b // Interpolation to obtain qpel value + urhadd v31.16b, v21.16b , v31.16b // Interpolation to obtain qpel value + st1 {v30.2s, v31.2s}, [x1], x3 // store row 7 + bne end_func //if height =8 end function + add x14, x14, #1 //for checking loop + ld1 {v10.2s, v11.2s}, [x0], x2 + uaddl v12.8h, v4.8b, v6.8b // temp1 = src[2_0] + src[3_0] + + b loop_16 // looping if height =16 + +loop_8_start: +//// Processing row0 and row1 + + ld1 {v0.2s}, [x0], x2 // Vector load from src[0_0] + ld1 {v1.2s}, [x0], x2 // Vector load from src[1_0] + ld1 {v2.2s}, [x0], x2 // Vector load from src[2_0] + ld1 {v3.2s}, [x0], x2 // Vector load from src[3_0] + add x14, x14, #1 //for checking loop + ld1 {v4.2s}, [x0], x2 // Vector load from src[4_0] + ld1 {v5.2s}, [x0], x2 // Vector load from src[5_0] + +loop_8: + //for checking loop + uaddl v6.8h, v2.8b, v3.8b // temp1 = src[2_0] + src[3_0] + uaddl v8.8h, v0.8b, v5.8b // temp = src[0_0] + src[5_0] + uaddl v10.8h, v1.8b, v4.8b // temp2 = src[1_0] + src[4_0] + mla v8.8h, v6.8h , v22.8h // temp += temp1 * 20 + ld1 {v6.2s}, [x0], x2 + uaddl v14.8h, v3.8b, v4.8b + uaddl v16.8h, v1.8b, v6.8b + uaddl v18.8h, v2.8b, v5.8b + mls v8.8h, v10.8h , v24.8h // temp -= temp2 * 5 + mla v16.8h, v14.8h , v22.8h + ld1 {v7.2s}, [x0], x2 + uaddl v20.8h, v4.8b, v5.8b + uaddl v12.8h, v2.8b, v7.8b + uaddl v10.8h, v3.8b, v6.8b + mls v16.8h, v18.8h , v24.8h + sqrshrun v26.8b, v8.8h, #5 // dst[0_0] = CLIP_U8( (temp + 16) >> 5) + mla v12.8h, v20.8h , v22.8h + ld1 {v8.2s}, [x7], x2 //Load value for interpolation (row0) + ld1 {v9.2s}, [x7], x2 //Load value for interpolation (row1) + ld1 {v0.2s}, [x0], x2 + uaddl v14.8h, v5.8b, v6.8b + sqrshrun v27.8b, v16.8h, #5 + urhadd v26.16b, v8.16b , v26.16b // Interpolation step for qpel calculation + urhadd v27.16b, v9.16b , v27.16b // Interpolation step for qpel calculation + + uaddl v20.8h, v3.8b, v0.8b + mls v12.8h, v10.8h 
, v24.8h + st1 {v26.2s}, [x1], x3 // Vector store to dst[0_0] + uaddl v18.8h, v4.8b, v7.8b + mla v20.8h, v14.8h , v22.8h + st1 {v27.2s}, [x1], x3 // Vector store to dst[1_0] + sqrshrun v28.8b, v12.8h, #5 + mls v20.8h, v18.8h , v24.8h + ld1 {v12.2s}, [x7], x2 //Load value for interpolation (row2) + ld1 {v13.2s}, [x7], x2 //Load value for interpolation (row3) + ld1 {v1.2s}, [x0], x2 + sqrshrun v29.8b, v20.8h, #5 + subs x9, x4, #4 + urhadd v28.16b, v12.16b , v28.16b + urhadd v29.16b, v13.16b , v29.16b + st1 {v28.2s}, [x1], x3 //store row 2 + st1 {v29.2s}, [x1], x3 //store row 3 + beq end_func // Branch if height==4 + uaddl v14.8h, v6.8b, v7.8b // temp1 = src[2_0] + src[3_0] + uaddl v16.8h, v0.8b, v5.8b // temp = src[0_0] + src[5_0] + uaddl v18.8h, v1.8b, v4.8b // temp2 = src[1_0] + src[4_0] + mla v18.8h, v14.8h , v22.8h // temp += temp1 * 20 + ld1 {v2.2s}, [x0], x2 + mls v18.8h, v16.8h , v24.8h // temp -= temp2 * 5 + uaddl v8.8h, v0.8b, v7.8b + uaddl v10.8h, v1.8b, v6.8b + uaddl v12.8h, v2.8b, v5.8b + sqrshrun v26.8b, v18.8h, #5 + mla v12.8h, v8.8h , v22.8h + ld1 {v18.2s}, [x7], x2 //Load value for interpolation (row4) + ld1 {v19.2s}, [x7], x2 //Load value for interpolation (row5) + ld1 {v3.2s}, [x0], x2 + mls v12.8h, v10.8h , v24.8h + sqrshrun v27.8b, v12.8h, #5 + urhadd v26.16b, v18.16b , v26.16b // Interpolation step for qpel calculation + urhadd v27.16b, v19.16b , v27.16b // Interpolation step for qpel calculation + + st1 {v26.2s}, [x1], x3 // store row 4 + st1 {v27.2s}, [x1], x3 // store row 5 + uaddl v14.8h, v0.8b, v1.8b // temp1 = src[2_0] + src[3_0] + uaddl v16.8h, v2.8b, v7.8b // temp = src[0_0] + src[5_0] + uaddl v18.8h, v3.8b, v6.8b // temp2 = src[1_0] + src[4_0] + mla v18.8h, v14.8h , v22.8h // temp += temp1 * 20 + ld1 {v4.2s}, [x0], x2 + mls v18.8h, v16.8h , v24.8h // temp -= temp2 * 5 + uaddl v8.8h, v2.8b, v1.8b + uaddl v10.8h, v3.8b, v0.8b + uaddl v12.8h, v4.8b, v7.8b + sqrshrun v26.8b, v18.8h, #5 + mla v12.8h, v8.8h , v22.8h + ld1 {v18.2s}, [x7], x2 
//Load value for interpolation (row6) + ld1 {v19.2s}, [x7], x2 //Load value for interpolation (row7) + ld1 {v5.2s}, [x0], x2 + mls v12.8h, v10.8h , v24.8h + sqrshrun v27.8b, v12.8h, #5 + urhadd v26.16b, v18.16b , v26.16b // Interpolation step for qpel calculation + urhadd v27.16b, v19.16b , v27.16b // Interpolation step for qpel calculation + + subs x12, x14, #1 + st1 {v26.2s}, [x1], x3 // store row 6 + st1 {v27.2s}, [x1], x3 // store row 7 + add x14, x14, #1 + beq loop_8 //looping if height ==16 + + b end_func + + +loop_4_start: +//// Processing row0 and row1 + + + ld1 {v0.s}[0], [x0], x2 // Vector load from src[0_0] + ld1 {v1.s}[0], [x0], x2 // Vector load from src[1_0] + ld1 {v2.s}[0], [x0], x2 // Vector load from src[2_0] + ld1 {v3.s}[0], [x0], x2 // Vector load from src[3_0] + ld1 {v4.s}[0], [x0], x2 // Vector load from src[4_0] + ld1 {v5.s}[0], [x0], x2 // Vector load from src[5_0] + + uaddl v6.8h, v2.8b, v3.8b // temp1 = src[2_0] + src[3_0] + uaddl v8.8h, v0.8b, v5.8b // temp = src[0_0] + src[5_0] + uaddl v10.8h, v1.8b, v4.8b // temp2 = src[1_0] + src[4_0] + mla v8.8h, v6.8h , v22.8h // temp += temp1 * 20 + ld1 {v6.2s}, [x0], x2 + uaddl v14.8h, v3.8b, v4.8b + uaddl v16.8h, v1.8b, v6.8b + uaddl v18.8h, v2.8b, v5.8b + mls v8.8h, v10.8h , v24.8h // temp -= temp2 * 5 + ld1 {v7.s}[0], [x0], x2 + mla v16.8h, v14.8h , v22.8h + uaddl v20.8h, v4.8b, v5.8b + uaddl v12.8h, v2.8b, v7.8b + uaddl v10.8h, v3.8b, v6.8b + mls v16.8h, v18.8h , v24.8h + sqrshrun v26.8b, v8.8h, #5 // dst[0_0] = CLIP_U8( (temp + 16) >> 5) + ld1 {v8.s}[0], [x7], x2 //Load value for interpolation - row 0 + ld1 {v9.s}[0], [x7], x2 //Load value for interpolation - row 1 + mla v12.8h, v20.8h , v22.8h + ld1 {v0.s}[0], [x0], x2 + uaddl v14.8h, v5.8b, v6.8b + sqrshrun v27.8b, v16.8h, #5 + uaddl v20.8h, v3.8b, v0.8b + urhadd v26.16b, v26.16b , v8.16b //Interpolation step for qpel calculation + urhadd v27.16b, v27.16b , v9.16b //Interpolation step for qpel calculation + + mls v12.8h, v10.8h , v24.8h + st1 
{v26.s}[0], [x1], x3 // Vector store to dst[0_0] + uaddl v18.8h, v4.8b, v7.8b + mla v20.8h, v14.8h , v22.8h + st1 {v27.s}[0], [x1], x3 // store row 1 + sqrshrun v28.8b, v12.8h, #5 + ld1 {v12.s}[0], [x7], x2 //Load value for interpolation - row 2 + ld1 {v13.s}[0], [x7], x2 //Load value for interpolation - row 3 + + mls v20.8h, v18.8h , v24.8h + ld1 {v1.s}[0], [x0], x2 + sqrshrun v29.8b, v20.8h, #5 + urhadd v28.16b, v12.16b , v28.16b //Interpolation step for qpel calculation + urhadd v29.16b, v13.16b , v29.16b //Interpolation step for qpel calculation + + st1 {v28.s}[0], [x1], x3 //store row 2 + st1 {v29.s}[0], [x1], x3 //store row 3 + + subs x9, x4, #4 + beq end_func // Branch if height==4 + + + uaddl v14.8h, v6.8b, v7.8b // temp1 = src[2_0] + src[3_0] + uaddl v16.8h, v0.8b, v5.8b // temp = src[0_0] + src[5_0] + uaddl v18.8h, v1.8b, v4.8b // temp2 = src[1_0] + src[4_0] + mla v18.8h, v14.8h , v22.8h // temp += temp1 * 20 + ld1 {v2.s}[0], [x0], x2 + mls v18.8h, v16.8h , v24.8h // temp -= temp2 * 5 + uaddl v8.8h, v0.8b, v7.8b + uaddl v10.8h, v1.8b, v6.8b + uaddl v12.8h, v2.8b, v5.8b + sqrshrun v26.8b, v18.8h, #5 + ld1 {v18.s}[0], [x7], x2 //Load value for interpolation - row 4 + ld1 {v19.s}[0], [x7], x2 //Load value for interpolation - row 5 + mla v12.8h, v8.8h , v22.8h + ld1 {v3.s}[0], [x0], x2 + mls v12.8h, v10.8h , v24.8h + sqrshrun v27.8b, v12.8h, #5 + urhadd v26.16b, v18.16b , v26.16b //Interpolation step for qpel calculation + urhadd v27.16b, v27.16b , v19.16b //Interpolation step for qpel calculation + + st1 {v26.s}[0], [x1], x3 //store row 4 + st1 {v27.s}[0], [x1], x3 // store row 5 + uaddl v14.8h, v0.8b, v1.8b // temp1 = src[2_0] + src[3_0] + uaddl v16.8h, v2.8b, v7.8b // temp = src[0_0] + src[5_0] + uaddl v18.8h, v3.8b, v6.8b // temp2 = src[1_0] + src[4_0] + mla v18.8h, v14.8h , v22.8h // temp += temp1 * 20 + ld1 {v4.s}[0], [x0], x2 + mls v18.8h, v16.8h , v24.8h // temp -= temp2 * 5 + uaddl v8.8h, v2.8b, v1.8b + uaddl v10.8h, v3.8b, v0.8b + uaddl v12.8h, 
v4.8b, v7.8b + sqrshrun v26.8b, v18.8h, #5 + ld1 {v18.s}[0], [x7], x2 //Load value for interpolation - row 6 + ld1 {v19.s}[0], [x7], x2 //Load value for interpolation - row 7 + mla v12.8h, v8.8h , v22.8h + ld1 {v5.s}[0], [x0], x2 + mls v12.8h, v10.8h , v24.8h + sqrshrun v27.8b, v12.8h, #5 + urhadd v26.16b, v18.16b , v26.16b //Interpolation step for qpel calculation + urhadd v27.16b, v19.16b , v27.16b //Interpolation step for qpel calculation + + st1 {v26.s}[0], [x1], x3 // store row 6 + st1 {v27.s}[0], [x1], x3 // store row 7 + + +end_func: + // LDMFD sp!,{x4-x12,PC} //Restoring registers from stack + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + diff --git a/common/armv8/ih264_intra_pred_chroma_av8.s b/common/armv8/ih264_intra_pred_chroma_av8.s new file mode 100755 index 0000000..62edfdc --- /dev/null +++ b/common/armv8/ih264_intra_pred_chroma_av8.s @@ -0,0 +1,574 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +//*/ +///** +//****************************************************************************** +//* @file +//* ih264_intra_pred_chroma.s +//* +//* @brief +//* Contains function definitions for intra chroma prediction . 
+//* +//* @author +//* Ittiam +//* +//* @par List of Functions: +//* +//* - ih264_intra_pred_luma_chroma_mode_vert_av8() +//* - ih264_intra_pred_luma_chroma_mode_horz_av8() +//* - ih264_intra_pred_luma_chroma_mode_dc_av8() +//* - ih264_intra_pred_luma_chroma_mode_plane_av8() +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ + +///* All the functions here are replicated from ih264_chroma_intra_pred_filters.c +// + +///** +///** +///** +// + + +.text +.p2align 2 +.include "ih264_neon_macros.s" + +.extern ih264_gai1_intrapred_chroma_plane_coeffs1 +.extern ih264_gai1_intrapred_chroma_plane_coeffs2 + + + +///** +//******************************************************************************* +//* +//*ih264_intra_pred_chroma_8x8_mode_dc +//* +//* @brief +//* Perform Intra prediction for chroma_8x8 mode:DC +//* +//* @par Description: +//* Perform Intra prediction for chroma_8x8 mode:DC ,described in sec 8.3.4.1 +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source containing alternate U and V samples +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination with alternate U and V samples +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//** @param[in] ui_neighboravailability +//* availability of neighbouring pixels +//* +//* @returns +//* +//* @remarks +//* None +//* +//*******************************************************************************/ +//void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ui_neighboravailability) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ui_neighboravailability + + + + .global ih264_intra_pred_chroma_8x8_mode_dc_av8 + +ih264_intra_pred_chroma_8x8_mode_dc_av8: + + + 
push_v_regs + stp x19, x20, [sp, #-16]! + + mov x19, #5 + ands x6, x4, x19 + beq none_available + cmp x6, #1 + beq left_only_available + cmp x6, #4 + beq top_only_available + +all_available: + ld1 {v0.8b, v1.8b}, [x0] + add x6, x0, #18 + ld1 {v2.8b, v3.8b}, [x6] + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + addp v0.4s, v0.4s , v0.4s + addp v1.4s, v1.4s , v1.4s + addp v0.4s, v0.4s , v0.4s + addp v1.4s, v1.4s , v1.4s + uxtl v2.8h, v2.8b + uxtl v3.8h, v3.8b + addp v2.4s, v2.4s , v2.4s + addp v3.4s, v3.4s , v3.4s + addp v2.4s, v2.4s , v2.4s + addp v3.4s, v3.4s , v3.4s + rshrn v5.8b, v0.8h, #2 + dup v21.8h, v5.h[0] + rshrn v6.8b, v3.8h, #2 + dup v20.8h, v6.h[0] + add v1.8h, v1.8h, v2.8h + rshrn v1.8b, v1.8h, #3 + dup v23.8h, v1.h[0] + mov v20.d[0], v23.d[0] + add v0.8h, v0.8h, v3.8h + rshrn v0.8b, v0.8h, #3 + dup v23.8h, v0.h[0] + mov v21.d[1], v23.d[0] + b store +left_only_available: + ld1 {v0.8b, v1.8b}, [x0] + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + addp v0.4s, v0.4s , v0.4s + addp v1.4s, v1.4s , v1.4s + addp v0.4s, v0.4s , v0.4s + addp v1.4s, v1.4s , v1.4s + rshrn v0.8b, v0.8h, #2 + rshrn v1.8b, v1.8h, #2 + dup v20.8h , v1.h[0] + dup v21.8h, v0.h[0] + b store + +top_only_available: + add x6, x0, #18 + ld1 {v0.8b, v1.8b}, [x6] + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + addp v0.4s, v0.4s , v0.4s + addp v1.4s, v1.4s , v1.4s + addp v0.4s, v0.4s , v0.4s + addp v1.4s, v1.4s , v1.4s + rshrn v0.8b, v0.8h, #2 + rshrn v1.8b, v1.8h, #2 + dup v20.8h , v0.h[0] + dup v21.8h, v1.h[0] + mov v20.d[1], v21.d[1] + mov v21.d[0], v20.d[0] + b store +none_available: + mov w15, #128 + dup v20.16b, w15 + dup v21.16b, w15 + + +store: + + st1 { v20.16b}, [x1], x3 + st1 { v20.16b}, [x1], x3 + st1 { v20.16b}, [x1], x3 + st1 { v20.16b}, [x1], x3 + st1 { v21.16b}, [x1], x3 + st1 { v21.16b}, [x1], x3 + st1 { v21.16b}, [x1], x3 + st1 { v21.16b}, [x1], x3 +end_func: + + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + + + +///****************************************************************************** 
+ + +///** +//******************************************************************************* +//* +//*ih264_intra_pred_chroma_8x8_mode_horz +//* +//* @brief +//* Perform Intra prediction for chroma_8x8 mode:Horizontal +//* +//* @par Description: +//* Perform Intra prediction for chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2 +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source containing alternate U and V samples +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination with alternate U and V samples +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ui_neighboravailability +//* availability of neighbouring pixels(Not used in this function) +//* +//* @returns +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ +//void ih264_intra_pred_chroma_8x8_mode_horz(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ui_neighboravailability) +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ui_neighboravailability + + + .global ih264_intra_pred_chroma_8x8_mode_horz_av8 + +ih264_intra_pred_chroma_8x8_mode_horz_av8: + + + + push_v_regs + ld1 {v0.8h}, [x0] + + dup v10.8h, v0.h[7] + dup v11.8h, v0.h[6] + dup v12.8h, v0.h[5] + dup v13.8h, v0.h[4] + st1 {v10.8h}, [x1], x3 + dup v14.8h, v0.h[3] + st1 {v11.8h}, [x1], x3 + dup v15.8h, v0.h[2] + st1 {v12.8h}, [x1], x3 + dup v16.8h, v0.h[1] + st1 {v13.8h}, [x1], x3 + dup v17.8h, v0.h[0] + st1 {v14.8h}, [x1], x3 + st1 {v15.8h}, [x1], x3 + st1 {v16.8h}, [x1], x3 + st1 {v17.8h}, [x1], x3 + + + pop_v_regs + ret + + + + + + +///** +//******************************************************************************* +//* +//*ih264_intra_pred_chroma_8x8_mode_vert +//* +//* @brief +//* Perform Intra prediction 
for chroma_8x8 mode:vertical +//* +//* @par Description: +//*Perform Intra prediction for chroma_8x8 mode:vertical ,described in sec 8.3.4.3 +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source containing alternate U and V samples +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination with alternate U and V samples +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ui_neighboravailability +//* availability of neighbouring pixels(Not used in this function) +//* +//* @returns +//* +//* @remarks +//* None +//* +//******************************************************************************* +//void ih264_intra_pred_chroma_8x8_mode_vert(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ui_neighboravailability) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ui_neighboravailability + + + .global ih264_intra_pred_chroma_8x8_mode_vert_av8 + +ih264_intra_pred_chroma_8x8_mode_vert_av8: + + push_v_regs + + add x0, x0, #18 + ld1 {v0.8b, v1.8b}, [x0] + + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + + pop_v_regs + ret + + + + +///****************************************************************************** + + +///** +//******************************************************************************* +//* +//*ih264_intra_pred_chroma_8x8_mode_plane +//* +//* @brief +//* Perform Intra prediction for chroma_8x8 mode:PLANE +//* +//* @par Description: +//* Perform Intra prediction for chroma_8x8 mode:PLANE ,described in sec 8.3.4.4 +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source containing alternate 
U and V samples +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination with alternate U and V samples +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ui_neighboravailability +//* availability of neighbouring pixels +//* +//* @returns +//* +//* @remarks +//* None +//* +//*******************************************************************************/ +//void ih264_intra_pred_chroma_8x8_mode_plane(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ui_neighboravailability) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ui_neighboravailability + + .global ih264_intra_pred_chroma_8x8_mode_plane_av8 +ih264_intra_pred_chroma_8x8_mode_plane_av8: + + push_v_regs + stp x19, x20, [sp, #-16]! + + ld1 {v0.2s}, [x0] + add x10, x0, #10 + ld1 {v1.2s}, [x10] + add x10, x10, #6 + rev64 v5.4h, v0.4h + ld1 {v2.2s}, [x10], #8 + add x10, x10, #2 + rev64 v7.4h, v2.4h + ld1 {v3.2s}, [x10] + sub x5, x3, #8 + adrp x12, :got:ih264_gai1_intrapred_chroma_plane_coeffs1 + ldr x12, [x12, #:got_lo12:ih264_gai1_intrapred_chroma_plane_coeffs1] + usubl v10.8h, v5.8b, v1.8b + ld1 {v8.8b, v9.8b}, [x12] // Load multiplication factors 1 to 8 into D3 + mov v8.d[1], v9.d[0] + usubl v12.8h, v3.8b, v7.8b + mul v14.8h, v10.8h , v8.8h + mul v16.8h, v12.8h , v8.8h + uzp1 v15.8h, v14.8h, v16.8h + uzp2 v16.8h, v14.8h, v16.8h + mov v14.16b, v15.16b + mov v15.d[0], v14.d[1] + mov v17.d[0], v16.d[1] + addp v14.4h, v14.4h, v14.4h + addp v15.4h, v15.4h, v15.4h + addp v16.4h, v16.4h, v16.4h + addp v17.4h, v17.4h, v17.4h + addp v14.4h, v14.4h, v14.4h + addp v15.4h, v15.4h, v15.4h + addp v16.4h, v16.4h, v16.4h + addp v17.4h, v17.4h, v17.4h + mov x6, #34 + dup v18.8h, w6 + smull v22.4s, v14.4h, v18.4h + smull v24.4s, v15.4h, v18.4h + smull v26.4s, v16.4h, 
v18.4h + smull v28.4s, v17.4h, v18.4h + rshrn v10.4h, v22.4s, #6 + rshrn v12.4h, v24.4s, #6 + rshrn v13.4h, v26.4s, #6 + rshrn v14.4h, v28.4s, #6 + ldrb w6, [x0], #1 + sxtw x6, w6 + add x10, x0, #31 + ldrb w8, [x0], #1 + sxtw x8, w8 + ldrb w7, [x10], #1 + sxtw x7, w7 + ldrb w9, [x10], #1 + sxtw x9, w9 + add x6, x6, x7 + add x8, x8, x9 + lsl x6, x6, #4 + lsl x8, x8, #4 + dup v0.8h, w6 + dup v2.8h, w8 + dup v4.8h, v12.h[0] + dup v6.8h, v10.h[0] + dup v24.8h, v14.h[0] + dup v26.8h, v13.h[0] + zip1 v5.8h, v4.8h, v24.8h + zip2 v24.8h, v4.8h, v24.8h + mov v4.16b, v5.16b + zip1 v7.8h, v6.8h, v26.8h + zip2 v26.8h, v6.8h, v26.8h + mov v6.16b, v7.16b + zip1 v1.8h, v0.8h, v2.8h + zip2 v2.8h, v0.8h, v2.8h + mov v0.16b, v1.16b + + adrp x12, :got:ih264_gai1_intrapred_chroma_plane_coeffs2 + ldr x12, [x12, #:got_lo12:ih264_gai1_intrapred_chroma_plane_coeffs2] + + ld1 {v8.2s, v9.2s}, [x12] + mov v8.d[1], v9.d[0] + mov v10.16b, v8.16b + mov v22.16b, v8.16b + zip1 v9.8h, v8.8h, v10.8h + zip2 v10.8h, v8.8h, v10.8h + mov v8.16b, v9.16b + mul v12.8h, v4.8h , v8.8h + mul v16.8h, v4.8h , v10.8h + add v12.8h, v0.8h , v12.8h + add v16.8h, v0.8h , v16.8h + dup v20.8h, v22.h[0] + mul v4.8h, v6.8h , v20.8h + dup v30.8h, v22.4h[1] + mul v18.8h, v6.8h , v20.8h + mul v14.8h, v6.8h , v30.8h + mul v8.8h, v6.8h , v30.8h + add v24.8h, v12.8h , v4.8h + add v0.8h, v16.8h , v18.8h + add v2.8h, v12.8h , v14.8h + sqrshrun v28.8b, v24.8h, #5 + add v26.8h, v16.8h , v8.8h + sqrshrun v29.8b, v0.8h, #5 + dup v20.8h, v22.4h[2] + st1 {v28.8b, v29.8b}, [x1], x3 + sqrshrun v28.8b, v2.8h, #5 + sqrshrun v29.8b, v26.8h, #5 + mul v4.8h, v6.8h , v20.8h + mul v18.8h, v6.8h , v20.8h + st1 {v28.8b, v29.8b}, [x1], x3 + add v24.8h, v12.8h , v4.8h + add v0.8h, v16.8h , v18.8h + dup v30.8h, v22.4h[3] + sqrshrun v28.8b, v24.8h, #5 + sqrshrun v29.8b, v0.8h, #5 + mul v14.8h, v6.8h , v30.8h + mul v8.8h, v6.8h , v30.8h + st1 {v28.8b, v29.8b}, [x1], x3 + add v2.8h, v12.8h , v14.8h + add v26.8h, v16.8h , v8.8h + dup v20.8h, v22.h[4] 
+ sqrshrun v28.8b, v2.8h, #5 + sqrshrun v29.8b, v26.8h, #5 + mul v4.8h, v6.8h , v20.8h + mul v18.8h, v6.8h , v20.8h + st1 {v28.8b, v29.8b}, [x1], x3 + add v24.8h, v12.8h , v4.8h + add v0.8h, v16.8h , v18.8h + dup v30.8h, v22.h[5] + sqrshrun v28.8b, v24.8h, #5 + sqrshrun v29.8b, v0.8h, #5 + mul v14.8h, v6.8h , v30.8h + mul v8.8h, v6.8h , v30.8h + st1 {v28.8b, v29.8b}, [x1], x3 + add v2.8h, v12.8h , v14.8h + add v26.8h, v16.8h , v8.8h + dup v20.8h, v22.h[6] + sqrshrun v28.8b, v2.8h, #5 + sqrshrun v29.8b, v26.8h, #5 + mul v4.8h, v6.8h , v20.8h + mul v18.8h, v6.8h , v20.8h + st1 {v28.8b, v29.8b}, [x1], x3 + add v24.8h, v12.8h , v4.8h + add v0.8h, v16.8h , v18.8h + dup v30.8h, v22.h[7] + sqrshrun v28.8b, v24.8h, #5 + sqrshrun v29.8b, v0.8h, #5 + mul v14.8h, v6.8h , v30.8h + mul v8.8h, v6.8h , v30.8h + st1 {v28.8b, v29.8b}, [x1], x3 + add v2.8h, v12.8h , v14.8h + add v26.8h, v16.8h , v8.8h + sqrshrun v28.8b, v2.8h, #5 + sqrshrun v29.8b, v26.8h, #5 + st1 {v28.8b, v29.8b}, [x1], x3 + +end_func_plane: + + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + diff --git a/common/armv8/ih264_intra_pred_luma_16x16_av8.s b/common/armv8/ih264_intra_pred_luma_16x16_av8.s new file mode 100755 index 0000000..a9eb165 --- /dev/null +++ b/common/armv8/ih264_intra_pred_luma_16x16_av8.s @@ -0,0 +1,606 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +//*/ +///** +//****************************************************************************** +//* @file +//* ih264_intra_pred_luma_16x16_av8.s +//* +//* @brief +//* Contains function definitions for intra 16x16 Luma prediction . +//* +//* @author +//* Ittiam +//* +//* @par List of Functions: +//* +//* - ih264_intra_pred_luma_16x16_mode_vert_av8() +//* - ih264_intra_pred_luma_16x16_mode_horz_av8() +//* - ih264_intra_pred_luma_16x16_mode_dc_av8() +//* - ih264_intra_pred_luma_16x16_mode_plane_av8() +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ + +///* All the functions here are replicated from ih264_intra_pred_filters.c +// + +///** +///** +///** +// + + +.text +.p2align 2 +.include "ih264_neon_macros.s" +.extern ih264_gai1_intrapred_luma_plane_coeffs + + + +///** +//******************************************************************************* +//* +//*ih264_intra_pred_luma_16x16_mode_vert +//* +//* @brief +//* Perform Intra prediction for luma_16x16 mode:vertical +//* +//* @par Description: +//* Perform Intra prediction for luma_16x16 mode:Vertical ,described in sec 8.3.3.1 +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ui_neighboravailability +//* availability of neighbouring pixels(Not used in this function) +//* +//* @returns +//* +//* @remarks +//* None +//* +//******************************************************************************* +//void ih264_intra_pred_luma_16x16_mode_vert(UWORD8 
*pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ui_neighboravailability) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_16x16_mode_vert_av8 + +ih264_intra_pred_luma_16x16_mode_vert_av8: + + push_v_regs + + + add x0, x0, #17 + ld1 {v0.8b, v1.8b}, [x0] + + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + st1 {v0.8b, v1.8b}, [x1], x3 + + pop_v_regs + ret + + + + + +///****************************************************************************** + + +///** +//******************************************************************************* +//* +//*ih264_intra_pred_luma_16x16_mode_horz +//* +//* @brief +//* Perform Intra prediction for luma_16x16 mode:horizontal +//* +//* @par Description: +//* Perform Intra prediction for luma_16x16 mode:horizontal ,described in sec 8.3.3.2 +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ui_neighboravailability +//* availability of neighbouring pixels(Not used in this function) +//* +//* @returns +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ +//void ih264_intra_pred_luma_16x16_mode_horz(UWORD8 *pu1_src, +// UWORD8 
*pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ui_neighboravailability) +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ui_neighboravailability + + .global ih264_intra_pred_luma_16x16_mode_horz_av8 + +ih264_intra_pred_luma_16x16_mode_horz_av8: + + + + push_v_regs + + ld1 {v0.16b}, [x0] + + + + dup v10.16b, v0.b[15] + dup v11.16b, v0.b[14] + dup v12.16b, v0.b[13] + dup v13.16b, v0.b[12] + st1 {v10.16b}, [x1], x3 + dup v14.16b, v0.b[11] + st1 {v11.16b}, [x1], x3 + dup v15.16b, v0.b[10] + st1 {v12.16b}, [x1], x3 + dup v16.16b, v0.b[9] + st1 {v13.16b}, [x1], x3 + dup v17.16b, v0.b[8] + st1 {v14.16b}, [x1], x3 + dup v18.16b, v0.b[7] + st1 {v15.16b}, [x1], x3 + dup v19.16b, v0.b[6] + st1 {v16.16b}, [x1], x3 + dup v20.16b, v0.b[5] + st1 {v17.16b}, [x1], x3 + dup v21.16b, v0.b[4] + st1 {v18.16b}, [x1], x3 + dup v22.16b, v0.b[3] + st1 {v19.16b}, [x1], x3 + dup v23.16b, v0.b[2] + st1 {v20.16b}, [x1], x3 + dup v24.16b, v0.b[1] + st1 {v21.16b}, [x1], x3 + dup v25.16b, v0.b[0] + st1 {v22.16b}, [x1], x3 + st1 {v23.16b}, [x1], x3 + st1 {v24.16b}, [x1], x3 + st1 {v25.16b}, [x1], x3 + + pop_v_regs + ret + + + + + + + +///****************************************************************************** + + +///** +//******************************************************************************* +//* +//*ih264_intra_pred_luma_16x16_mode_dc +//* +//* @brief +//* Perform Intra prediction for luma_16x16 mode:DC +//* +//* @par Description: +//* Perform Intra prediction for luma_16x16 mode:DC ,described in sec 8.3.3.3 +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ui_neighboravailability +//* availability of neighbouring pixels +//* +//* 
@returns +//* +//* @remarks +//* None +//* +//*******************************************************************************/ +//void ih264_intra_pred_luma_16x16_mode_dc(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ui_neighboravailability) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ui_neighboravailability + + .global ih264_intra_pred_luma_16x16_mode_dc_av8 + +ih264_intra_pred_luma_16x16_mode_dc_av8: + + + + push_v_regs + stp x19, x20, [sp, #-16]! + + sub v0.16b, v0.16b, v0.16b + sub v1.16b, v1.16b, v1.16b + mov w10, #0 + mov w11 , #3 + ands x6, x4, #0x01 + beq top_available //LEFT NOT AVAILABLE + ld1 {v0.16b}, [x0] + add w10, w10, #8 + add w11, w11, #1 +top_available: + ands x6, x4, #0x04 + beq none_available + add x6, x0, #17 + ld1 {v1.16b}, [x6] + add w10, w10, #8 + add w11, w11, #1 + b summation +none_available: + cmp x4, #0 + bne summation + mov w15, #128 + dup v20.16b, w15 + b store +summation: + uaddl v2.8h, v0.8b, v1.8b + uaddl2 v3.8h, v0.16b, v1.16b + dup v10.8h, w10 + neg w11, w11 + dup v20.8h, w11 + add v0.8h, v2.8h, v3.8h + mov v1.d[0], v0.d[1] + add v0.4h, v0.4h, v1.4h + addp v0.4h, v0.4h , v0.4h + addp v0.4h, v0.4h , v0.4h + add v0.4h, v0.4h, v10.4h + uqshl v0.8h, v0.8h, v20.8h + sqxtun v0.8b, v0.8h + dup v20.16b, v0.b[0] + +store: + + st1 { v20.16b}, [x1], x3 + st1 { v20.16b}, [x1], x3 + st1 { v20.16b}, [x1], x3 + st1 { v20.16b}, [x1], x3 + st1 { v20.16b}, [x1], x3 + st1 { v20.16b}, [x1], x3 + st1 { v20.16b}, [x1], x3 + st1 { v20.16b}, [x1], x3 + st1 { v20.16b}, [x1], x3 + st1 { v20.16b}, [x1], x3 + st1 { v20.16b}, [x1], x3 + st1 { v20.16b}, [x1], x3 + st1 { v20.16b}, [x1], x3 + st1 { v20.16b}, [x1], x3 + st1 { v20.16b}, [x1], x3 + st1 { v20.16b}, [x1], x3 + + + +end_func: + + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + + + 
+///****************************************************************************** + + +///** +//******************************************************************************* +//* +//*ih264_intra_pred_luma_16x16_mode_plane +//* +//* @brief +//* Perform Intra prediction for luma_16x16 mode:PLANE +//* +//* @par Description: +//* Perform Intra prediction for luma_16x16 mode:PLANE ,described in sec 8.3.3.4 +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ui_neighboravailability +//* availability of neighbouring pixels +//* +//* @returns +//* +//* @remarks +//* None +//* +//*******************************************************************************/ +//void ih264_intra_pred_luma_16x16_mode_plane(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ui_neighboravailability) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ui_neighboravailability + + .global ih264_intra_pred_luma_16x16_mode_plane_av8 +ih264_intra_pred_luma_16x16_mode_plane_av8: + + push_v_regs + stp x19, x20, [sp, #-16]! 
+ mov x2, x1 + add x1, x0, #17 + add x0, x0, #15 + mov x8, #9 + sub x1, x1, #1 + mov x10, x1 //top_left + mov x4, #-1 + ld1 {v2.2s}, [x1], x8 + + adrp x7, :got:ih264_gai1_intrapred_luma_plane_coeffs + ldr x7, [x7, #:got_lo12:ih264_gai1_intrapred_luma_plane_coeffs] + + ld1 {v0.2s}, [x1] + rev64 v2.8b, v2.8b + ld1 {v6.2s, v7.2s}, [x7] + usubl v0.8h, v0.8b, v2.8b + uxtl v16.8h, v6.8b + mul v0.8h, v0.8h , v16.8h + uxtl v18.8h, v7.8b + add x7, x0, x4, lsl #3 + sub x0, x7, x4, lsl #1 + sub x20, x4, #0x0 + neg x14, x20 + addp v0.8h, v0.8h, v1.8h + ldrb w8, [x7], #-1 + sxtw x8, w8 + ldrb w9, [x0], #1 + sxtw x9, w9 + saddlp v0.2s, v0.4h + sub x12, x8, x9 + ldrb w8, [x7], #-1 + sxtw x8, w8 + saddlp v0.1d, v0.2s + ldrb w9, [x0], #1 + sxtw x9, w9 + sub x8, x8, x9 + shl v2.2s, v0.2s, #2 + add x12, x12, x8, lsl #1 + add v0.2s, v0.2s , v2.2s + ldrb w8, [x7], #-1 + sxtw x8, w8 + ldrb w9, [x0], #1 + sxtw x9, w9 + srshr v0.2s, v0.2s, #6 // i_b = D0[0] + sub x8, x8, x9 + ldrb w5, [x7], #-1 + sxtw x5, w5 + add x8, x8, x8, lsl #1 + dup v4.8h, v0.4h[0] + add x12, x12, x8 + ldrb w9, [x0], #1 + sxtw x9, w9 + mul v0.8h, v4.8h , v16.8h + sub x5, x5, x9 + mul v2.8h, v4.8h , v18.8h + add x12, x12, x5, lsl #2 + ldrb w8, [x7], #-1 + sxtw x8, w8 + ldrb w9, [x0], #1 + sxtw x9, w9 + sub x8, x8, x9 + ldrb w5, [x7], #-1 + sxtw x5, w5 + add x8, x8, x8, lsl #2 + ldrb w6, [x0], #1 + sxtw x6, w6 + add x12, x12, x8 + ldrb w8, [x7], #-1 + sxtw x8, w8 + ldrb w9, [x0], #1 + sxtw x9, w9 + sub x5, x5, x6 + sub x8, x8, x9 + add x5, x5, x5, lsl #1 + sub x20, x8, x8, lsl #3 + neg x8, x20 + add x12, x12, x5, lsl #1 + ldrb w5, [x7], #-1 + sxtw x5, w5 + ldrb w6, [x10] //top_left + sxtw x6, w6 + add x12, x12, x8 + sub x9, x5, x6 + ldrb w6, [x1, #7] + sxtw x6, w6 + add x12, x12, x9, lsl #3 // i_c = x12 + add x8, x5, x6 + add x12, x12, x12, lsl #2 + lsl x8, x8, #4 // i_a = x8 + add x12, x12, #0x20 + lsr x12, x12, #6 + shl v28.8h, v4.8h, #3 + dup v6.8h, w12 + dup v30.8h, w8 + shl v26.8h, v6.8h, #3 + sub v30.8h, v30.8h 
, v28.8h + sub v30.8h, v30.8h , v26.8h + add v28.8h, v30.8h , v6.8h + add v26.8h, v28.8h , v0.8h + add v28.8h, v28.8h , v2.8h + sqrshrun v20.8b, v26.8h, #5 + sqrshrun v21.8b, v28.8h, #5 + add v26.8h, v26.8h , v6.8h + add v28.8h, v28.8h , v6.8h + sqrshrun v22.8b, v26.8h, #5 + st1 {v20.2s, v21.2s}, [x2], x3 + sqrshrun v23.8b, v28.8h, #5 + add v26.8h, v26.8h , v6.8h + add v28.8h, v28.8h , v6.8h + sqrshrun v20.8b, v26.8h, #5 + st1 {v22.2s, v23.2s}, [x2], x3 + sqrshrun v21.8b, v28.8h, #5 + add v26.8h, v26.8h , v6.8h + add v28.8h, v28.8h , v6.8h + sqrshrun v22.8b, v26.8h, #5 + st1 {v20.2s, v21.2s}, [x2], x3 + sqrshrun v23.8b, v28.8h, #5 + add v26.8h, v26.8h , v6.8h + add v28.8h, v28.8h , v6.8h + sqrshrun v20.8b, v26.8h, #5 + st1 {v22.2s, v23.2s}, [x2], x3 + sqrshrun v21.8b, v28.8h, #5 + add v26.8h, v26.8h , v6.8h + add v28.8h, v28.8h , v6.8h + sqrshrun v22.8b, v26.8h, #5 + st1 {v20.2s, v21.2s}, [x2], x3 + sqrshrun v23.8b, v28.8h, #5 + add v26.8h, v26.8h , v6.8h + add v28.8h, v28.8h , v6.8h + sqrshrun v20.8b, v26.8h, #5 + st1 {v22.2s, v23.2s}, [x2], x3 + sqrshrun v21.8b, v28.8h, #5 + add v26.8h, v26.8h , v6.8h + add v28.8h, v28.8h , v6.8h + sqrshrun v22.8b, v26.8h, #5 + st1 {v20.2s, v21.2s}, [x2], x3 + sqrshrun v23.8b, v28.8h, #5 + add v26.8h, v26.8h , v6.8h + add v28.8h, v28.8h , v6.8h + sqrshrun v20.8b, v26.8h, #5 + st1 {v22.2s, v23.2s}, [x2], x3 + sqrshrun v21.8b, v28.8h, #5 + add v26.8h, v26.8h , v6.8h + add v28.8h, v28.8h , v6.8h + sqrshrun v22.8b, v26.8h, #5 + st1 {v20.2s, v21.2s}, [x2], x3 + sqrshrun v23.8b, v28.8h, #5 + add v26.8h, v26.8h , v6.8h + add v28.8h, v28.8h , v6.8h + sqrshrun v20.8b, v26.8h, #5 + st1 {v22.2s, v23.2s}, [x2], x3 + sqrshrun v21.8b, v28.8h, #5 + add v26.8h, v26.8h , v6.8h + add v28.8h, v28.8h , v6.8h + sqrshrun v22.8b, v26.8h, #5 + st1 {v20.2s, v21.2s}, [x2], x3 + sqrshrun v23.8b, v28.8h, #5 + add v26.8h, v26.8h , v6.8h + add v28.8h, v28.8h , v6.8h + sqrshrun v20.8b, v26.8h, #5 + st1 {v22.2s, v23.2s}, [x2], x3 + sqrshrun v21.8b, v28.8h, #5 + 
add v26.8h, v26.8h , v6.8h + add v28.8h, v28.8h , v6.8h + sqrshrun v22.8b, v26.8h, #5 + st1 {v20.2s, v21.2s}, [x2], x3 + sqrshrun v23.8b, v28.8h, #5 + add v26.8h, v26.8h , v6.8h + add v28.8h, v28.8h , v6.8h + sqrshrun v20.8b, v26.8h, #5 + st1 {v22.2s, v23.2s}, [x2], x3 + sqrshrun v21.8b, v28.8h, #5 + add v26.8h, v26.8h , v6.8h + add v28.8h, v28.8h , v6.8h + sqrshrun v22.8b, v26.8h, #5 + st1 {v20.2s, v21.2s}, [x2], x3 + sqrshrun v23.8b, v28.8h, #5 + st1 {v22.2s, v23.2s}, [x2], x3 + +end_func_plane: + + ldp x19, x20, [sp], #16 + pop_v_regs + ret + diff --git a/common/armv8/ih264_intra_pred_luma_4x4_av8.s b/common/armv8/ih264_intra_pred_luma_4x4_av8.s new file mode 100755 index 0000000..62e8cee --- /dev/null +++ b/common/armv8/ih264_intra_pred_luma_4x4_av8.s @@ -0,0 +1,876 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +//*/ +///** +//****************************************************************************** +//* @file +//* ih264_intra_pred_luma_4x4_av8.s +//* +//* @brief +//* Contains function definitions for intra 4x4 Luma prediction . 
+//* +//* @author +//* Ittiam +//* +//* @par List of Functions: +//* +//* -ih264_intra_pred_luma_4x4_mode_vert_av8 +//* -ih264_intra_pred_luma_4x4_mode_horz_av8 +//* -ih264_intra_pred_luma_4x4_mode_dc_av8 +//* -ih264_intra_pred_luma_4x4_mode_diag_dl_av8 +//* -ih264_intra_pred_luma_4x4_mode_diag_dr_av8 +//* -ih264_intra_pred_luma_4x4_mode_vert_r_av8 +//* -ih264_intra_pred_luma_4x4_mode_horz_d_av8 +//* -ih264_intra_pred_luma_4x4_mode_vert_l_av8 +//* -ih264_intra_pred_luma_4x4_mode_horz_u_av8 +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ + +///* All the functions here are replicated from ih264_intra_pred_filters.c +// + +///** +///** +///** +// + +.text +.p2align 2 +.include "ih264_neon_macros.s" + + + + +///** +//******************************************************************************* +//* +//*ih264_intra_pred_luma_4x4_mode_vert +//* +//* @brief +//* Perform Intra prediction for luma_4x4 mode:vertical +//* +//* @par Description: +//* Perform Intra prediction for luma_4x4 mode:vertical ,described in sec 8.3.1.2.1 +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ui_neighboravailability +//* availability of neighbouring pixels(Not used in this function) +//* +//* @returns +//* +//* @remarks +//* None +//* +//******************************************************************************* +//void ih264_intra_pred_luma_4x4_mode_vert(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ui_neighboravailability) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ui_neighboravailability + + .global 
ih264_intra_pred_luma_4x4_mode_vert_av8 + +ih264_intra_pred_luma_4x4_mode_vert_av8: + + push_v_regs + + add x0, x0, #5 + + ld1 {v0.s}[0], [x0] + st1 {v0.s}[0], [x1], x3 + st1 {v0.s}[0], [x1], x3 + st1 {v0.s}[0], [x1], x3 + st1 {v0.s}[0], [x1], x3 + + pop_v_regs + ret + + + + + +///****************************************************************************** + + +///** +//******************************************************************************* +//* +//*ih264_intra_pred_luma_4x4_mode_horz +//* +//* @brief +//* Perform Intra prediction for luma_4x4 mode:horizontal +//* +//* @par Description: +//* Perform Intra prediction for luma_4x4 mode:horizontal ,described in sec 8.3.1.2.2 +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ui_neighboravailability +//* availability of neighbouring pixels(Not used in this function) +//* +//* @returns +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ +//void ih264_intra_pred_luma_4x4_mode_horz(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ui_neighboravailability) +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ui_neighboravailability + + + + .global ih264_intra_pred_luma_4x4_mode_horz_av8 + +ih264_intra_pred_luma_4x4_mode_horz_av8: + + push_v_regs + + ld1 {v1.s}[0], [x0] + dup v0.8b, v1.b[3] + dup v2.8b, v1.b[2] + st1 {v0.s}[0], [x1], x3 + dup v3.8b, v1.b[1] + st1 {v2.s}[0], [x1], x3 + dup v4.8b, v1.b[0] + st1 {v3.s}[0], [x1], x3 + st1 {v4.s}[0], [x1], x3 + + pop_v_regs + ret + + + + + + + +///****************************************************************************** + + 
+///** +//******************************************************************************* +//* +//*ih264_intra_pred_luma_4x4_mode_dc +//* +//* @brief +//* Perform Intra prediction for luma_4x4 mode:DC +//* +//* @par Description: +//* Perform Intra prediction for luma_4x4 mode:DC ,described in sec 8.3.1.2.3 +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ui_neighboravailability +//* availability of neighbouring pixels +//* +//* @returns +//* +//* @remarks +//* None +//* +//*******************************************************************************/ +//void ih264_intra_pred_luma_4x4_mode_dc(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ui_neighboravailability) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ui_neighboravailability + + + + .global ih264_intra_pred_luma_4x4_mode_dc_av8 + +ih264_intra_pred_luma_4x4_mode_dc_av8: + + + + + push_v_regs + stp x19, x20, [sp, #-16]! 
+ + ands x5, x4, #0x01 + beq top_available //LEFT NOT AVAILABLE + + add x10, x0, #3 + mov x2, #-1 + ldrb w5, [x10], #-1 + sxtw x5, w5 + ldrb w6, [x10], #-1 + sxtw x6, w6 + ldrb w7, [x10], #-1 + sxtw x7, w7 + add x5, x5, x6 + ldrb w8, [x10], #-1 + sxtw x8, w8 + add x5, x5, x7 + ands x11, x4, #0x04 // CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE + add x5, x5, x8 + beq left_available + add x10, x0, #5 + // BOTH LEFT AND TOP AVAILABLE + ldrb w6, [x10], #1 + sxtw x6, w6 + ldrb w7, [x10], #1 + sxtw x7, w7 + add x5, x5, x6 + ldrb w8, [x10], #1 + sxtw x8, w8 + add x5, x5, x7 + ldrb w9, [x10], #1 + sxtw x9, w9 + add x5, x5, x8 + add x5, x5, x9 + add x5, x5, #4 + lsr x5, x5, #3 + dup v0.8b, w5 + st1 {v0.s}[0], [x1], x3 + st1 {v0.s}[0], [x1], x3 + st1 {v0.s}[0], [x1], x3 + st1 {v0.s}[0], [x1], x3 + b end_func + +top_available: // ONLT TOP AVAILABLE + ands x11, x4, #0x04 // CHECKING TOP AVAILABILTY OR ELSE BRANCH TO NONE AVAILABLE + beq none_available + + add x10, x0, #5 + ldrb w6, [x10], #1 + sxtw x6, w6 + ldrb w7, [x10], #1 + sxtw x7, w7 + ldrb w8, [x10], #1 + sxtw x8, w8 + add x5, x6, x7 + ldrb w9, [x10], #1 + sxtw x9, w9 + add x5, x5, x8 + add x5, x5, x9 + add x5, x5, #2 + lsr x5, x5, #2 + dup v0.8b, w5 + st1 {v0.s}[0], [x1], x3 + st1 {v0.s}[0], [x1], x3 + st1 {v0.s}[0], [x1], x3 + st1 {v0.s}[0], [x1], x3 + b end_func + +left_available: //ONLY LEFT AVAILABLE + add x5, x5, #2 + lsr x5, x5, #2 + dup v0.8b, w5 + st1 {v0.s}[0], [x1], x3 + st1 {v0.s}[0], [x1], x3 + st1 {v0.s}[0], [x1], x3 + st1 {v0.s}[0], [x1], x3 + b end_func + +none_available: //NONE AVAILABLE + mov x5, #128 + dup v0.8b, w5 + st1 {v0.s}[0], [x1], x3 + st1 {v0.s}[0], [x1], x3 + st1 {v0.s}[0], [x1], x3 + st1 {v0.s}[0], [x1], x3 + b end_func + + +end_func: + + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + + + + + +///** +//******************************************************************************* +//* +//*ih264_intra_pred_luma_4x4_mode_diag_dl +//* +//* @brief +//* Perform Intra 
prediction for luma_4x4 mode:Diagonal_Down_Left
+//*
+//* @par Description:
+//*  Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left ,described in sec 8.3.1.2.4
+//*
+//* @param[in] pu1_src
+//*  UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//*  availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_4x4_mode_diag_dl(UWORD8 *pu1_src,
+//                                            UWORD8 *pu1_dst,
+//                                            WORD32 src_strd,
+//                                            WORD32 dst_strd,
+//                                            WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+//    x0 => *pu1_src
+//    x1 => *pu1_dst
+//    x2 => src_strd
+//    x3 => dst_strd
+//    x4 => ui_neighboravailability
+
+
+    .global ih264_intra_pred_luma_4x4_mode_diag_dl_av8
+
+ih264_intra_pred_luma_4x4_mode_diag_dl_av8:
+
+
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    // x0+5: first top neighbour; build the [1 2 1]/4 filtered diagonal in v3,
+    // then each output row is v3 shifted one sample further along the diagonal.
+    add x0, x0, #5
+    sub x5, x3, #2             // stride remainder after a 2-byte partial store
+    add x6, x0, #7
+    ld1 {v0.8b}, [x0]
+    ext v1.8b, v0.8b , v0.8b , #1
+    ext v2.8b, v0.8b , v0.8b , #2
+    ld1 {v2.b}[6], [x6]        // reload last sample (rotated copy must repeat the edge)
+    uaddl v20.8h, v0.8b, v1.8b
+    uaddl v22.8h, v1.8b, v2.8b
+    add v24.8h, v20.8h , v22.8h
+    sqrshrun v3.8b, v24.8h, #2 // (a + 2b + c + 2) >> 2
+    st1 {v3.s}[0], [x1], x3
+    ext v4.8b, v3.8b , v3.8b , #1
+    st1 {v4.s}[0], [x1], x3
+    st1 {v3.h}[1], [x1], #2
+    st1 {v3.h}[2], [x1], x5
+    st1 {v4.h}[1], [x1], #2
+    st1 {v4.h}[2], [x1]
+
+end_func_diag_dl:
+
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_4x4_mode_diag_dr
+//*
+//* @brief
+//*  Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right
+//*
+//* @par Description:
+//*  Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right ,described in sec 8.3.1.2.5
+//*
+//* @param[in] pu1_src
+//*  UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//*  availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_4x4_mode_diag_dr(UWORD8 *pu1_src,
+//                                            UWORD8 *pu1_dst,
+//                                            WORD32 src_strd,
+//                                            WORD32 dst_strd,
+//                                            WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+//    x0 => *pu1_src
+//    x1 => *pu1_dst
+//    x2 => src_strd
+//    x3 => dst_strd
+//    x4 => ui_neighboravailability
+
+
+    .global ih264_intra_pred_luma_4x4_mode_diag_dr_av8
+
+ih264_intra_pred_luma_4x4_mode_diag_dr_av8:
+
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+ + + ld1 {v0.8b}, [x0] + add x0, x0, #1 + ld1 {v1.8b}, [x0] + ext v2.8b, v1.8b , v1.8b , #1 + uaddl v20.8h, v0.8b, v1.8b + uaddl v22.8h, v1.8b, v2.8b + add v24.8h, v20.8h , v22.8h + sqrshrun v3.8b, v24.8h, #2 + + ext v4.8b, v3.8b , v3.8b , #1 + sub x5, x3, #2 + st1 {v4.h}[1], [x1], #2 + st1 {v4.h}[2], [x1], x5 + st1 {v3.h}[1], [x1], #2 + st1 {v3.h}[2], [x1], x5 + st1 {v4.s}[0], [x1], x3 + st1 {v3.s}[0], [x1], x3 + +end_func_diag_dr: + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + + + + + +///** +//******************************************************************************* +//* +//*ih264_intra_pred_luma_4x4_mode_vert_r +//* +//* @brief +//* Perform Intra prediction for luma_4x4 mode:Vertical_Right +//* +//* @par Description: +//* Perform Intra prediction for luma_4x4 mode:Vertical_Right ,described in sec 8.3.1.2.6 +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ui_neighboravailability +//* availability of neighbouring pixels +//* +//* @returns +//* +//* @remarks +//* None +//* +//*******************************************************************************/ +//void ih264_intra_pred_luma_4x4_mode_vert_r(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ui_neighboravailability) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_4x4_mode_vert_r_av8 + +ih264_intra_pred_luma_4x4_mode_vert_r_av8: + + push_v_regs + stp x19, x20, [sp, #-16]! 
+ + + ld1 {v0.8b}, [x0] + add x0, x0, #1 + ld1 {v1.8b}, [x0] + ext v2.8b, v1.8b , v1.8b , #1 + uaddl v20.8h, v0.8b, v1.8b + uaddl v22.8h, v1.8b, v2.8b + add v24.8h, v20.8h , v22.8h + sqrshrun v4.8b, v20.8h, #1 + sqrshrun v3.8b, v24.8h, #2 + sub x5, x3, #2 + ext v5.8b, v3.8b , v3.8b , #3 + st1 {v4.s}[1], [x1], x3 + st1 {v5.s}[0], [x1], x3 + sub x8, x3, #3 + st1 {v3.b}[2], [x1], #1 + st1 {v4.h}[2], [x1], #2 + st1 {v4.b}[6], [x1], x8 + st1 {v3.b}[1], [x1], #1 + st1 {v5.h}[0], [x1], #2 + st1 {v5.b}[2], [x1] + + +end_func_vert_r: + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + + + +///** +//******************************************************************************* +//* +//*ih264_intra_pred_luma_4x4_mode_horz_d +//* +//* @brief +//* Perform Intra prediction for luma_4x4 mode:Horizontal_Down +//* +//* @par Description: +//* Perform Intra prediction for luma_4x4 mode:Horizontal_Down ,described in sec 8.3.1.2.7 +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ui_neighboravailability +//* availability of neighbouring pixels +//* +//* @returns +//* +//* @remarks +//* None +//* +//*******************************************************************************/ +//void ih264_intra_pred_luma_4x4_mode_horz_d(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ui_neighboravailability) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_4x4_mode_horz_d_av8 + +ih264_intra_pred_luma_4x4_mode_horz_d_av8: + + push_v_regs + stp x19, x20, [sp, #-16]! 
+ + ld1 {v0.8b}, [x0] + add x0, x0, #1 + ld1 {v1.8b}, [x0] + ext v2.8b, v1.8b , v0.8b , #1 + uaddl v20.8h, v0.8b, v1.8b + uaddl v22.8h, v1.8b, v2.8b + add v24.8h, v20.8h , v22.8h + sqrshrun v4.8b, v20.8h, #1 + sqrshrun v5.8b, v24.8h, #2 + sub x5, x3, #2 + mov v6.8b, v5.8b + trn1 v10.8b, v4.8b, v5.8b + trn2 v5.8b, v4.8b, v5.8b // + mov v4.8b, v10.8b + st1 {v5.h}[1], [x1], #2 + st1 {v6.h}[2], [x1], x5 + st1 {v4.h}[1], [x1], #2 + st1 {v5.h}[1], [x1], x5 + st1 {v5.h}[0], [x1], #2 + st1 {v4.h}[1], [x1], x5 + st1 {v4.h}[0], [x1], #2 + st1 {v5.h}[0], [x1], x5 + +end_func_horz_d: + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + + + + + +///** +//******************************************************************************* +//* +//*ih264_intra_pred_luma_4x4_mode_vert_l +//* +//* @brief +//* Perform Intra prediction for luma_4x4 mode:Vertical_Left +//* +//* @par Description: +//* Perform Intra prediction for luma_4x4 mode:Vertical_Left ,described in sec 8.3.1.2.8 +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ui_neighboravailability +//* availability of neighbouring pixels +//* +//* @returns +//* +//* @remarks +//* None +//* +//*******************************************************************************/ +//void ih264_intra_pred_luma_4x4_mode_vert_l(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ui_neighboravailability) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_4x4_mode_vert_l_av8 + +ih264_intra_pred_luma_4x4_mode_vert_l_av8: + + push_v_regs + stp x19, x20, [sp, #-16]! 
+ add x0, x0, #4 + ld1 {v0.8b}, [x0] + add x0, x0, #1 + ld1 {v1.8b}, [x0] + ext v2.8b, v1.8b , v0.8b , #1 + uaddl v20.8h, v0.8b, v1.8b + uaddl v22.8h, v1.8b, v2.8b + add v24.8h, v20.8h , v22.8h + sqrshrun v4.8b, v20.8h, #1 + sqrshrun v5.8b, v24.8h, #2 + ext v6.8b, v4.8b , v4.8b , #1 + ext v7.8b, v5.8b , v5.8b , #1 + st1 {v6.s}[0], [x1], x3 + ext v8.8b, v4.8b , v4.8b , #2 + ext v9.8b, v5.8b , v5.8b , #2 + st1 {v7.s}[0], [x1], x3 + st1 {v8.s}[0], [x1], x3 + st1 {v9.s}[0], [x1], x3 + +end_func_vert_l: + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + + + + + +///** +//******************************************************************************* +//* +//*ih264_intra_pred_luma_4x4_mode_horz_u +//* +//* @brief +//* Perform Intra prediction for luma_4x4 mode:Horizontal_Up +//* +//* @par Description: +//* Perform Intra prediction for luma_4x4 mode:Horizontal_Up ,described in sec 8.3.1.2.9 +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ui_neighboravailability +//* availability of neighbouring pixels +//* +//* @returns +//* +//* @remarks +//* None +//* +//*******************************************************************************/ +//void ih264_intra_pred_luma_4x4_mode_horz_u(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ui_neighboravailability) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_4x4_mode_horz_u_av8 + +ih264_intra_pred_luma_4x4_mode_horz_u_av8: + + push_v_regs + stp x19, x20, [sp, #-16]! 
+ mov x10, x0 + ld1 {v0.8b}, [x0] + ldrb w9, [x0], #1 + sxtw x9, w9 + ext v1.8b, v0.8b , v0.8b , #1 + ld1 {v0.b}[7], [x10] + ext v2.8b, v1.8b , v1.8b , #1 + uaddl v20.8h, v0.8b, v1.8b + uaddl v22.8h, v1.8b, v2.8b + add v24.8h, v20.8h , v22.8h + sqrshrun v4.8b, v20.8h, #1 + sqrshrun v5.8b, v24.8h, #2 + mov v6.8b, v4.8b + ext v6.8b, v5.8b , v4.8b , #1 + st1 {v4.b}[2], [x1], #1 + st1 {v6.b}[0], [x1], #1 + trn1 v10.8b, v6.8b, v5.8b + trn2 v5.8b, v6.8b, v5.8b // + mov v6.8b , v10.8b + sub x5, x3, #2 + trn1 v10.8b, v4.8b, v6.8b + trn2 v6.8b, v4.8b, v6.8b // + mov v4.8b , v10.8b + dup v7.8b, w9 + st1 {v6.h}[0], [x1], x5 + st1 {v6.h}[0], [x1], #2 + st1 {v5.h}[3], [x1], x5 + st1 {v5.h}[3], [x1], #2 + st1 {v7.h}[3], [x1], x5 + st1 {v7.s}[0], [x1], x3 + +end_func_horz_u: + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + diff --git a/common/armv8/ih264_intra_pred_luma_8x8_av8.s b/common/armv8/ih264_intra_pred_luma_8x8_av8.s new file mode 100755 index 0000000..2b972ca --- /dev/null +++ b/common/armv8/ih264_intra_pred_luma_8x8_av8.s @@ -0,0 +1,1084 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore
+//*/
+///**
+//******************************************************************************
+//* @file
+//*  ih264_intra_pred_luma_8x8_av8.s
+//*
+//* @brief
+//*  Contains function definitions for intra 8x8 Luma prediction .
+//*
+//* @author
+//*  Ittiam
+//*
+//* @par List of Functions:
+//*
+//*  -ih264_intra_pred_luma_8x8_mode_vert_av8
+//*  -ih264_intra_pred_luma_8x8_mode_horz_av8
+//*  -ih264_intra_pred_luma_8x8_mode_dc_av8
+//*  -ih264_intra_pred_luma_8x8_mode_diag_dl_av8
+//*  -ih264_intra_pred_luma_8x8_mode_diag_dr_av8
+//*  -ih264_intra_pred_luma_8x8_mode_vert_r_av8
+//*  -ih264_intra_pred_luma_8x8_mode_horz_d_av8
+//*  -ih264_intra_pred_luma_8x8_mode_vert_l_av8
+//*  -ih264_intra_pred_luma_8x8_mode_horz_u_av8
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+
+///* All the functions here are replicated from ih264_intra_pred_filters.c
+//
+
+///**
+///**
+///**
+
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+.extern ih264_gai1_intrapred_luma_8x8_horz_u
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_8x8_mode_vert
+//*
+//* @brief
+//*  Perform Intra prediction for luma_8x8 mode:vertical
+//*
+//* @par Description:
+//*  Perform Intra prediction for luma_8x8 mode:vertical ,described in sec 8.3.2.2.2
+//*
+//* @param[in] pu1_src
+//*  UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//*  availability of neighbouring pixels(Not used in this function)
+//*
+//* @returns
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//void ih264_intra_pred_luma_8x8_mode_vert(UWORD8 *pu1_src,
+//                                         UWORD8 *pu1_dst,
+//                                         WORD32 src_strd,
+//                                         WORD32 dst_strd,
+//                                         WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+//    x0 => *pu1_src
+//    x1 => *pu1_dst
+//    x2 => src_strd
+//    x3 => dst_strd
+//    x4 => ui_neighboravailability
+
+
+    .global ih264_intra_pred_luma_8x8_mode_vert_av8
+
+ih264_intra_pred_luma_8x8_mode_vert_av8:
+
+    // STMFD sp!, {x4-x12, x14}    //store register values to stack
+    push_v_regs
+    //stp x19, x20,[sp,#-16]!
+
+    // Copy the 8 top neighbours (pu1_src+9) into every destination row
+    add x0, x0, #9
+    ld1 {v0.8b}, [x0]
+
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+
+    // LDMFD sp!,{x4-x12,PC}    //Restoring registers from stack
+    //ldp x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
+///******************************************************************************
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_8x8_mode_horz
+//*
+//* @brief
+//*  Perform Intra prediction for luma_8x8 mode:horizontal
+//*
+//* @par Description:
+//*  Perform Intra prediction for luma_8x8 mode:horizontal ,described in sec 8.3.2.2.2
+//*
+//* @param[in] pu1_src
+//*  UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//*  availability of neighbouring pixels(Not used in this function)
+//*
+//* @returns
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+//void ih264_intra_pred_luma_8x8_mode_horz(UWORD8 *pu1_src,
+//                                         UWORD8 *pu1_dst,
+//                                         WORD32 src_strd,
+//                                         WORD32 dst_strd,
+//                                         WORD32 ui_neighboravailability)
+//**************Variables Vs Registers*****************************************
+//    x0 => *pu1_src
+//    x1 =>
*pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_8x8_mode_horz_av8 + +ih264_intra_pred_luma_8x8_mode_horz_av8: + + + + // STMFD sp!, {x4-x12, x14} //store register values to stack + push_v_regs + stp x19, x20, [sp, #-16]! + add x0, x0, #7 + mov x2 , #-1 + + ldrb w5, [x0], #-1 + sxtw x5, w5 + ldrb w6, [x0], #-1 + sxtw x6, w6 + dup v0.8b, w5 + st1 {v0.8b}, [x1], x3 + ldrb w7, [x0], #-1 + sxtw x7, w7 + dup v1.8b, w6 + st1 {v1.8b}, [x1], x3 + dup v2.8b, w7 + ldrb w8, [x0], #-1 + sxtw x8, w8 + dup v3.8b, w8 + st1 {v2.8b}, [x1], x3 + ldrb w5, [x0], #-1 + sxtw x5, w5 + st1 {v3.8b}, [x1], x3 + dup v0.8b, w5 + ldrb w6, [x0], #-1 + sxtw x6, w6 + st1 {v0.8b}, [x1], x3 + ldrb w7, [x0], #-1 + sxtw x7, w7 + dup v1.8b, w6 + dup v2.8b, w7 + st1 {v1.8b}, [x1], x3 + ldrb w8, [x0], #-1 + sxtw x8, w8 + dup v3.8b, w8 + st1 {v2.8b}, [x1], x3 + st1 {v3.8b}, [x1], x3 + + // LDMFD sp!,{x4-x12,PC} //Restoring registers from stack + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + + + + + +///****************************************************************************** + + +///** +//******************************************************************************* +//* +//*ih264_intra_pred_luma_8x8_mode_dc +//* +//* @brief +//* Perform Intra prediction for luma_8x8 mode:DC +//* +//* @par Description: +//* Perform Intra prediction for luma_8x8 mode:DC ,described in sec 8.3.2.2.3 +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ui_neighboravailability +//* availability of neighbouring pixels +//* +//* @returns +//* +//* @remarks +//* None +//* +//*******************************************************************************/ +//void ih264_intra_pred_luma_8x8_mode_dc(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// 
WORD32 src_strd,
+//                                       WORD32 dst_strd,
+//                                       WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+//    x0 => *pu1_src
+//    x1 => *pu1_dst
+//    x2 => src_strd
+//    x3 => dst_strd
+//    x4 => ui_neighboravailability
+
+
+    .global ih264_intra_pred_luma_8x8_mode_dc_av8
+
+ih264_intra_pred_luma_8x8_mode_dc_av8:
+
+
+
+    // STMFD sp!, {x4-x12, x14}    //store register values to stack
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    // bit0 of ui_neighboravailability => left column available
+    ands x6, x4, #0x01
+    beq top_available          //LEFT NOT AVAILABLE
+
+    // Sum the 8 left neighbours, reading backwards from pu1_src+7
+    add x10, x0, #7
+    // NOTE(review): x2 (src_strd) clobbered, never read again — confirm dead.
+    mov x2, #-1
+    ldrb w5, [x10], -1
+    sxtw x5, w5
+    ldrb w6, [x10], -1
+    sxtw x6, w6
+    ldrb w7, [x10], -1
+    sxtw x7, w7
+    add x5, x5, x6
+    ldrb w8, [x10], -1
+    sxtw x8, w8
+    add x5, x5, x7
+    ldrb w6, [x10], -1
+    sxtw x6, w6
+    add x5, x5, x8
+    ldrb w7, [x10], -1
+    sxtw x7, w7
+    add x5, x5, x6
+    ldrb w8, [x10], -1
+    sxtw x8, w8
+    add x5, x5, x7
+    ands x11, x4, #0x04        // CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
+    add x5, x5, x8
+    ldrb w6, [x10], -1
+    sxtw x6, w6
+    add x5, x5, x6
+    beq left_available
+    add x10, x0, #9
+    // BOTH LEFT AND TOP AVAILABLE: pairwise-add the 8 top samples in NEON,
+    // combine with the left sum, dc = (sum + 8) >> 4 via sqrshrun #4
+    ld1 {v0.8b}, [x10]
+    uaddlp v1.4h, v0.8b
+    uaddlp v3.2s, v1.4h
+    uaddlp v2.1d, v3.2s
+    dup v10.8h, w5
+    dup v8.8h, v2.4h[0]
+    add v12.8h, v8.8h , v10.8h
+    sqrshrun v31.8b, v12.8h, #4
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    b end_func
+
+top_available:                 // ONLY TOP AVAILABLE
+    ands x11, x4, #0x04        // CHECKING TOP AVAILABILITY OR ELSE BRANCH TO NONE AVAILABLE
+    beq none_available
+
+    // dc = (sum of 8 top samples + 4) >> 3 (rshrn rounds)
+    add x10, x0, #9
+    ld1 {v10.8b}, [x10]
+    uaddlp v14.4h, v10.8b
+    uaddlp v13.2s, v14.4h
+    uaddlp v12.1d, v13.2s
+    rshrn v4.8b, v12.8h, #3
+    dup v31.8b, v4.8b[0]
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    b end_func
+
+
+left_available:                //ONLY LEFT AVAILABLE: dc = (left sum + 4) >> 3
+    add x5, x5, #4
+    lsr x5, x5, #3
+    dup v0.8b, w5
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    b end_func
+
+none_available:                //NONE AVAILABLE: spec default dc = 128
+    mov x9, #128
+    dup v0.8b, w9
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+
+
+end_func:
+
+    // LDMFD sp!,{x4-x12,PC}    //Restoring registers from stack
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_8x8_mode_diag_dl
+//*
+//* @brief
+//*  Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left
+//*
+//* @par Description:
+//*  Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left ,described in sec 8.3.2.2.4
+//*
+//* @param[in] pu1_src
+//*  UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//*  availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_8x8_mode_diag_dl(UWORD8 *pu1_src,
+//                                            UWORD8 *pu1_dst,
+//                                            WORD32 src_strd,
+//                                            WORD32 dst_strd,
+//                                            WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+//    x0 => *pu1_src
+//    x1 => *pu1_dst
+//    x2 => src_strd
+//    x3 => dst_strd
+//    x4 => ui_neighboravailability
+
+    .global ih264_intra_pred_luma_8x8_mode_diag_dl_av8
+
+ih264_intra_pred_luma_8x8_mode_diag_dl_av8: + + // STMFD sp!, {x4-x12, x14} //store register values to stack + push_v_regs + stp x19, x20, [sp, #-16]! + + add x0, x0, #9 + sub x5, x3, #4 + add x6, x0, #15 + ld1 { v0.16b}, [x0] + mov v1.d[0], v0.d[1] + ext v4.16b, v0.16b , v0.16b , #2 + mov v5.d[0], v4.d[1] + ext v2.16b, v0.16b , v0.16b , #1 + mov v3.d[0], v2.d[1] + ld1 {v5.b}[6], [x6] + // q1 = q0 shifted to left once + // q2 = q1 shifted to left once + uaddl v20.8h, v0.8b, v2.8b //Adding for FILT121 + uaddl v22.8h, v1.8b, v3.8b + uaddl v24.8h, v2.8b, v4.8b + uaddl v26.8h, v3.8b, v5.8b + add v24.8h, v20.8h , v24.8h + add v26.8h, v22.8h , v26.8h + + sqrshrun v4.8b, v24.8h, #2 + sqrshrun v5.8b, v26.8h, #2 + mov v4.d[1], v5.d[0] + //Q2 has all FILT121 values + st1 {v4.8b}, [x1], x3 + ext v18.16b, v4.16b , v4.16b , #1 + ext v16.16b, v18.16b , v18.16b , #1 + st1 {v18.8b}, [x1], x3 + ext v14.16b, v16.16b , v16.16b , #1 + st1 {v16.8b}, [x1], x3 + st1 {v14.8b}, [x1], x3 + st1 {v4.s}[1], [x1], #4 + st1 {v5.s}[0], [x1], x5 + st1 {v18.s}[1], [x1], #4 + st1 {v18.s}[2], [x1], x5 + st1 {v16.s}[1], [x1], #4 + st1 {v16.s}[2], [x1], x5 + st1 {v14.s}[1], [x1], #4 + st1 {v14.s}[2], [x1], x5 + + +end_func_diag_dl: + // LDMFD sp!,{x4-x12,PC} //Restoring registers from stack + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + + +///** +//******************************************************************************* +//* +//*ih264_intra_pred_luma_8x8_mode_diag_dr +//* +//* @brief +//* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right +//* +//* @par Description: +//* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right ,described in sec 8.3.2.2.5 +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ui_neighboravailability +//* availability of 
neighbouring pixels +//* +//* @returns +//* +//* @remarks +//* None +//* +//*******************************************************************************/ +//void ih264_intra_pred_luma_8x8_mode_diag_dr(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ui_neighboravailability) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_8x8_mode_diag_dr_av8 + +ih264_intra_pred_luma_8x8_mode_diag_dr_av8: + + // STMFD sp!, {x4-x12, x14} //store register values to stack + push_v_regs + stp x19, x20, [sp, #-16]! + + + ld1 { v0.16b}, [x0] + mov v1.d[0], v0.d[1] + add x0, x0, #1 + ld1 { v2.16b}, [x0] + mov v3.d[0], v2.d[1] + ext v4.16b, v2.16b , v2.16b , #1 + mov v5.d[0], v4.d[1] + // q1 = q0 shifted to left once + // q2 = q1 shifted to left once + uaddl v20.8h, v0.8b, v2.8b //Adding for FILT121 + uaddl v22.8h, v1.8b, v3.8b + uaddl v24.8h, v2.8b, v4.8b + uaddl v26.8h, v3.8b, v5.8b + add v24.8h, v20.8h , v24.8h + add v26.8h, v22.8h , v26.8h + sqrshrun v4.8b, v24.8h, #2 + sqrshrun v5.8b, v26.8h, #2 + mov v4.d[1], v5.d[0] + //Q2 has all FILT121 values + sub x5, x3, #4 + ext v18.16b, v4.16b , v4.16b , #15 + st1 {v18.d}[1], [x1], x3 + ext v16.16b, v18.16b , v18.16b , #15 + st1 {v16.d}[1], [x1], x3 + ext v14.16b, v16.16b , v16.16b , #15 + st1 {v14.d}[1], [x1], x3 + st1 {v4.s}[1], [x1], #4 + st1 {v5.s}[0], [x1], x5 + st1 {v18.s}[1], [x1], #4 + st1 {v18.s}[2], [x1], x5 + st1 {v16.s}[1], [x1], #4 + st1 {v16.s}[2], [x1], x5 + st1 {v14.s}[1], [x1], #4 + st1 {v14.s}[2], [x1], x5 + st1 {v4.8b}, [x1], x3 + +end_func_diag_dr: + // LDMFD sp!,{x4-x12,PC} //Restoring registers from stack + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + + +///** +//******************************************************************************* +//* +//*ih264_intra_pred_luma_8x8_mode_vert_r +//* 
+//* @brief
+//*  Perform Intra prediction for luma_8x8 mode:Vertical_Right
+//*
+//* @par Description:
+//*  Perform Intra prediction for luma_8x8 mode:Vertical_Right ,described in sec 8.3.2.2.6
+//*
+//* @param[in] pu1_src
+//*  UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//*  availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_8x8_mode_vert_r(UWORD8 *pu1_src,
+//                                           UWORD8 *pu1_dst,
+//                                           WORD32 src_strd,
+//                                           WORD32 dst_strd,
+//                                           WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+//    x0 => *pu1_src
+//    x1 => *pu1_dst
+//    x2 => src_strd
+//    x3 => dst_strd
+//    x4 => ui_neighboravailability
+
+
+    .global ih264_intra_pred_luma_8x8_mode_vert_r_av8
+
+ih264_intra_pred_luma_8x8_mode_vert_r_av8:
+
+    // STMFD sp!, {x4-x12, x14}    //store register values to stack
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    // q2 (v4:v5) = 2-tap averages (FILT11), q3 (v6:v7) = 3-tap filtered
+    // values (FILT121) of the neighbour array; uzp/ext shuffles assemble the
+    // zig-zag lane order Vertical_Right rows require.
+    ld1 { v0.16b}, [x0]
+    mov v1.d[0], v0.d[1]
+    add x0, x0, #1
+    ld1 { v2.16b}, [x0]
+    mov v3.d[0], v2.d[1]
+    ext v4.16b, v2.16b , v2.16b , #1
+    mov v5.d[0], v4.d[1]
+    // q1 = q0 shifted to left once
+    // q2 = q1 shifted to left once
+    uaddl v20.8h, v0.8b, v2.8b
+    uaddl v22.8h, v1.8b, v3.8b
+    uaddl v24.8h, v2.8b, v4.8b
+    uaddl v26.8h, v3.8b, v5.8b
+    add v24.8h, v20.8h , v24.8h
+    add v26.8h, v22.8h , v26.8h
+
+    sqrshrun v4.8b, v20.8h, #1
+    sqrshrun v5.8b, v22.8h, #1
+    mov v4.d[1], v5.d[0]
+    sqrshrun v6.8b, v24.8h, #2
+    sqrshrun v7.8b, v26.8h, #2
+    mov v6.d[1], v7.d[0]
+    //Q2 has all FILT11 values
+    //Q3 has all FILT121 values
+    sub x5, x3, #6             // stride remainder after 2+4 byte partial stores
+    sub x6, x3, #4             // stride remainder after a 4-byte partial store
+    st1 {v5.8b}, [x1], x3      // row 0
+    ext v18.16b, v6.16b , v6.16b , #15
+    mov v22.16b , v18.16b
+    ext v16.16b, v4.16b , v4.16b , #1
+    st1 {v18.d}[1], [x1], x3   //row 1
+    mov v14.16b , v16.16b
+    ext v20.16b, v4.16b , v4.16b , #15
+    uzp1 v17.16b, v16.16b, v18.16b
+    uzp2 v18.16b, v16.16b, v18.16b
+    mov v16.16b , v17.16b
+    //row 2
+    ext v12.16b, v16.16b , v16.16b , #1
+    st1 {v20.d}[1], [x1]
+    st1 {v6.b}[6], [x1], x3
+    //row 3
+
+    st1 {v12.h}[5], [x1], #2
+    st1 {v6.s}[2], [x1], #4
+    st1 {v6.h}[6], [x1], x5
+    //row 4
+    st1 {v18.h}[5], [x1], #2
+    st1 {v4.s}[2], [x1], #4
+    st1 {v4.h}[6], [x1], x5
+    //row 5
+    ext v26.16b, v18.16b , v18.16b , #1
+    st1 {v16.h}[5], [x1], #2
+    st1 {v22.s}[2], [x1], #4
+    st1 {v22.h}[6], [x1], x5
+    //row 6
+    st1 {v26.h}[4], [x1], #2
+    st1 {v26.b}[10], [x1], #1
+    st1 {v4.b}[8], [x1], #1
+    st1 {v14.s}[2], [x1], x6
+    //row 7
+    st1 {v12.s}[2], [x1], #4
+    st1 {v6.s}[2], [x1], #4
+
+end_func_vert_r:
+    // LDMFD sp!,{x4-x12,PC}    //Restoring registers from stack
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_8x8_mode_horz_d
+//*
+//* @brief
+//*  Perform Intra prediction for luma_8x8 mode:Horizontal_Down
+//*
+//* @par Description:
+//*  Perform Intra prediction for luma_8x8
mode:Horizontal_Down ,described in sec 8.3.2.2.7
+//*
+//* @param[in] pu1_src
+//*  UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//*  UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] dst_strd
+//*  integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//*  availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_8x8_mode_horz_d(UWORD8 *pu1_src,
+//                                           UWORD8 *pu1_dst,
+//                                           WORD32 src_strd,
+//                                           WORD32 dst_strd,
+//                                           WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+//    x0 => *pu1_src
+//    x1 => *pu1_dst
+//    x2 => src_strd
+//    x3 => dst_strd
+//    x4 => ui_neighboravailability
+
+    .global ih264_intra_pred_luma_8x8_mode_horz_d_av8
+
+ih264_intra_pred_luma_8x8_mode_horz_d_av8:
+
+    // STMFD sp!, {x4-x12, x14}    //store register values to stack
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    // q2 = FILT11 (2-tap), q3 = FILT121 (3-tap) of the neighbour array;
+    // byte- and halfword-level trn shuffles interleave them into the
+    // (avg, filt) pair stream Horizontal_Down rows are built from.
+    ld1 { v0.16b}, [x0]
+    mov v1.d[0], v0.d[1]
+    add x0, x0, #1
+    ld1 { v2.16b}, [x0]
+    mov v3.d[0], v2.d[1]
+    ext v4.16b, v2.16b , v2.16b , #1
+    mov v5.d[0], v4.d[1]
+    // q1 = q0 shifted to left once
+    // q2 = q1 shifted to left once
+    uaddl v20.8h, v0.8b, v2.8b
+    uaddl v22.8h, v1.8b, v3.8b
+    uaddl v24.8h, v2.8b, v4.8b
+    uaddl v26.8h, v3.8b, v5.8b
+    add v24.8h, v20.8h , v24.8h
+    add v26.8h, v22.8h , v26.8h
+
+    sqrshrun v4.8b, v20.8h, #1
+    sqrshrun v5.8b, v22.8h, #1
+    mov v4.d[1], v5.d[0]
+    sqrshrun v6.8b, v24.8h, #2
+    sqrshrun v7.8b, v26.8h, #2
+    mov v6.d[1], v7.d[0]
+    //Q2 has all FILT11 values
+    //Q3 has all FILT121 values
+    mov v8.16b, v4.16b
+    mov v10.16b, v6.16b
+    sub x6, x3, #6             // stride remainder after 2+4 byte partial stores
+    trn1 v9.16b, v8.16b, v10.16b
+    trn2 v10.16b, v8.16b, v10.16b //
+    mov v8.16b, v9.16b
+    mov v12.16b, v8.16b
+    mov v14.16b, v10.16b
+    sub x5, x3, #4             // stride remainder after a 4-byte partial store
+    trn1 v13.8h, v12.8h, v14.8h
+    trn2 v14.8h, v12.8h, v14.8h
+    mov v12.16b, v13.16b
+    ext v16.16b, v6.16b , v6.16b , #14
+    //ROW 0
+    st1 {v16.d}[1], [x1]
+    st1 {v10.h}[3], [x1], x3
+
+    //ROW 1
+    st1 {v14.s}[1], [x1], #4
+    st1 {v6.s}[2], [x1], x5
+    //ROW 2
+    st1 {v10.h}[2], [x1], #2
+    st1 {v14.s}[1], [x1], #4
+    st1 {v7.h}[0], [x1], x6
+    //ROW 3
+    st1 {v12.s}[1], [x1], #4
+    st1 {v14.s}[1], [x1], x5
+    //ROW 4
+    st1 {v14.h}[1], [x1], #2
+    st1 {v12.s}[1], [x1], #4
+    st1 {v14.h}[2], [x1], x6
+    //ROW 5
+    st1 {v14.s}[0], [x1], #4
+    st1 {v12.s}[1], [x1], x5
+    //ROW 6
+    st1 {v10.h}[0], [x1], #2
+    st1 {v8.h}[1], [x1], #2
+    st1 {v14.h}[1], [x1], #2
+    st1 {v12.h}[2], [x1], x6
+    //ROW 7
+    st1 {v12.s}[0], [x1], #4
+    st1 {v14.s}[0], [x1], x5
+
+end_func_horz_d:
+    // LDMFD sp!,{x4-x12,PC}    //Restoring registers from stack
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_8x8_mode_vert_l
+//*
+//* @brief
+//*  Perform Intra prediction for luma_8x8 mode:Vertical_Left
+//*
+//* @par Description:
+//*  Perform Intra prediction for luma_8x8
mode:Vertical_Left ,described in sec 8.3.2.2.8 +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ui_neighboravailability +//* availability of neighbouring pixels +//* +//* @returns +//* +//* @remarks +//* None +//* +//*******************************************************************************/ +//void ih264_intra_pred_luma_8x8_mode_vert_l(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ui_neighboravailability) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ui_neighboravailability + + + .global ih264_intra_pred_luma_8x8_mode_vert_l_av8 + +ih264_intra_pred_luma_8x8_mode_vert_l_av8: + + // STMFD sp!, {x4-x12, x14} //Restoring registers from stack + push_v_regs + stp x19, x20, [sp, #-16]! 
+ add x0, x0, #9 + ld1 { v0.16b}, [x0] + mov v1.d[0], v0.d[1] + add x0, x0, #1 + ld1 { v2.16b}, [x0] + mov v3.d[0], v2.d[1] + ext v4.16b, v2.16b , v2.16b , #1 + mov v5.d[0], v4.d[1] + uaddl v20.8h, v0.8b, v2.8b + uaddl v22.8h, v1.8b, v3.8b + uaddl v24.8h, v2.8b, v4.8b + uaddl v26.8h, v3.8b, v5.8b + add v24.8h, v20.8h , v24.8h + add v26.8h, v22.8h , v26.8h + + sqrshrun v4.8b, v20.8h, #1 + sqrshrun v5.8b, v22.8h, #1 + mov v4.d[1], v5.d[0] + sqrshrun v6.8b, v24.8h, #2 + ext v8.16b, v4.16b , v4.16b , #1 + sqrshrun v7.8b, v26.8h, #2 + mov v6.d[1], v7.d[0] + //Q2 has all FILT11 values + //Q3 has all FILT121 values + + ext v10.16b, v6.16b , v6.16b , #1 + //ROW 0,1 + st1 {v4.8b}, [x1], x3 + st1 {v6.8b}, [x1], x3 + + ext v12.16b, v8.16b , v8.16b , #1 + ext v14.16b, v10.16b , v10.16b , #1 + //ROW 2,3 + st1 {v8.8b}, [x1], x3 + st1 {v10.8b}, [x1], x3 + + ext v16.16b, v12.16b , v12.16b , #1 + ext v18.16b, v14.16b , v14.16b , #1 + //ROW 4,5 + st1 {v12.8b}, [x1], x3 + st1 {v14.8b}, [x1], x3 + //ROW 6,7 + st1 {v16.8b}, [x1], x3 + st1 {v18.8b}, [x1], x3 + +end_func_vert_l: + // LDMFD sp!,{x4-x12,PC} //Restoring registers from stack + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + + + +///** +//******************************************************************************* +//* +//*ih264_intra_pred_luma_8x8_mode_horz_u +//* +//* @brief +//* Perform Intra prediction for luma_8x8 mode:Horizontal_Up +//* +//* @par Description: +//* Perform Intra prediction for luma_8x8 mode:Horizontal_Up ,described in sec 8.3.2.2.9 +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* @param[in] ui_neighboravailability +//* availability of neighbouring pixels +//* +//* @returns +//* +//* @remarks +//* None +//* +//*******************************************************************************/ 
+//void ih264_intra_pred_luma_8x8_mode_horz_u(UWORD8 *pu1_src, +// UWORD8 *pu1_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 ui_neighboravailability) + +//**************Variables Vs Registers***************************************** +// x0 => *pu1_src +// x1 => *pu1_dst +// x2 => src_strd +// x3 => dst_strd +// x4 => ui_neighboravailability + + .global ih264_intra_pred_luma_8x8_mode_horz_u_av8 + +ih264_intra_pred_luma_8x8_mode_horz_u_av8: + + // STMFD sp!, {x4-x12, x14} //store register values to stack + push_v_regs + stp x19, x20, [sp, #-16]! + + ld1 {v0.8b}, [x0] + ld1 {v1.b}[7], [x0] + mov v0.d[1], v1.d[0] + ext v2.16b, v0.16b , v0.16b , #1 + mov v3.d[0], v2.d[1] + ext v4.16b, v2.16b , v2.16b , #1 + mov v5.d[0], v4.d[1] + + adrp x12, :got:ih264_gai1_intrapred_luma_8x8_horz_u + ldr x12, [x12, #:got_lo12:ih264_gai1_intrapred_luma_8x8_horz_u] + uaddl v20.8h, v0.8b, v2.8b + uaddl v22.8h, v1.8b, v3.8b + uaddl v24.8h, v2.8b, v4.8b + uaddl v26.8h, v3.8b, v5.8b + add v24.8h, v20.8h , v24.8h + add v26.8h, v22.8h , v26.8h + ld1 { v10.16b}, [x12] + mov v11.d[0], v10.d[1] + sqrshrun v4.8b, v20.8h, #1 + sqrshrun v5.8b, v22.8h, #1 + mov v4.d[1], v5.d[0] + sqrshrun v6.8b, v24.8h, #2 + sqrshrun v7.8b, v26.8h, #2 + mov v6.d[1], v7.d[0] + //Q2 has all FILT11 values + //Q3 has all FILT121 values + mov v30.16b, v4.16b + mov v31.16b, v6.16b + tbl v12.8b, {v30.16b, v31.16b}, v10.8b + dup v14.16b, v5.8b[7] // + tbl v13.8b, {v30.16b, v31.16b}, v11.8b + mov v12.d[1], v13.d[0] + ext v16.16b, v12.16b , v14.16b , #2 + ext v18.16b, v16.16b , v14.16b , #2 + st1 {v12.8b}, [x1], x3 //0 + ext v20.16b, v18.16b , v14.16b , #2 + st1 {v16.8b}, [x1], x3 //1 + st1 {v18.8b}, [x1], x3 //2 + st1 {v20.8b}, [x1], x3 //3 + st1 {v13.8b}, [x1], x3 //4 + st1 {v16.d}[1], [x1], x3 //5 + st1 {v18.d}[1], [x1], x3 //6 + st1 {v20.d}[1], [x1], x3 //7 + + +end_func_horz_u: + // LDMFD sp!,{x4-x12,PC} //Restoring registers from stack + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + diff --git 
a/common/armv8/ih264_iquant_itrans_recon_av8.s b/common/armv8/ih264_iquant_itrans_recon_av8.s new file mode 100755 index 0000000..4c83036 --- /dev/null +++ b/common/armv8/ih264_iquant_itrans_recon_av8.s @@ -0,0 +1,778 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +//*/ +///** +///******************************************************************************* +// * //file +// * ih264_iquant_itrans_recon_a9.s +// * +// * //brief +// * Contains function definitions for single stage inverse transform +// * +// * //author +// * Parthiban V +// * Mohit +// * Harinarayanaan +// * +// * //par List of Functions: +// * - ih264_iquant_itrans_recon_4x4_av8() +// * - ih264_iquant_itrans_recon_8x8_av8() +// * - ih264_iquant_itrans_recon_chroma_4x4_av8() +// * +// * //remarks +// * None +// * +// ******************************************************************************* + +.text +.p2align 2 +.include "ih264_neon_macros.s" + +///* +// ******************************************************************************* +// * +// * //brief +// * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block +// * +// * //par Description: +// * Performs inverse transform Ci4 and adds the residue to get the +// * reconstructed block +// * +// * //param[in] pi2_src +// * Input 4x4 coefficients +// * +// * //param[in] pu1_pred +// * Prediction 4x4 block +// * +// * //param[out] pu1_out +// * Output 4x4 block +// * +// * //param[in] u4_qp_div_6 +// * QP +// * +// * //param[in] pu2_weigh_mat +// * Pointer to weight matrix +// * +// * //param[in] pred_strd, +// * Prediction stride +// * +// * //param[in] out_strd +// * Output Stride +// * +// *//param[in] pi2_tmp +// * temporary buffer of size 1*16 +// * +// * //param[in] pu2_iscal_mat +// * Pointer to the inverse quantization matrix +// * +// * //returns Void +// * +// * //remarks +// * None +// * +// ******************************************************************************* +// */ +//void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src, +// UWORD8 *pu1_pred, +// UWORD8 *pu1_out, +// WORD32 pred_strd, +// WORD32 out_strd, +// const UWORD16 *pu2_iscal_mat, +// const UWORD16 *pu2_weigh_mat, +// UWORD32 u4_qp_div_6, +// WORD32 *pi4_tmp, +// WORD32 iq_start_idx +// 
WORD16 *pi2_dc_ld_addr) +//**************Variables Vs Registers***************************************** +//x0 => *pi2_src +//x1 => *pu1_pred +//x2 => *pu1_out +//x3 => pred_strd +//x4 => out_strd +//x5 => *pu2_iscal_mat +//x6 => *pu2_weigh_mat +//x7 => u4_qp_div_6 +// => pi4_tmp +// => iq_start_idx +// => pi2_dc_ld_addr +//Only one shift is done in horizontal inverse because, +//if u4_qp_div_6 is lesser than 4 then shift value will be neagative and do negative left shift, in this case rnd_factor has value +//if u4_qp_div_6 is greater than 4 then shift value will be positive and do left shift, here rnd_factor is 0 + + .global ih264_iquant_itrans_recon_4x4_av8 +ih264_iquant_itrans_recon_4x4_av8: + + push_v_regs + + dup v30.4s, w7 //Populate the u4_qp_div_6 in Q15 + + ldr w8, [sp, #72] //Loads iq_start_idx + sxtw x8, w8 + + ldr x10, [sp, #80] //Load alternate dc address + + subs x8, x8, #1 // if x8 == 1 => intra case , so result of subtraction is zero and z flag is set + + +//=======================DEQUANT FROM HERE=================================== + + ld4 {v20.4h - v23.4h}, [x5] // load pu2_iscal_mat[i], i =0..15 + ld4 {v26.4h - v29.4h}, [x6] // pu2_weigh_mat[i], i =0..15 + ld4 {v16.4h - v19.4h}, [x0] // pi2_src_tmp[i], i =0..15 + + + mul v20.4h, v20.4h, v26.4h // x[i]=(scale[i] * dequant[i]) where i = 0..3 + mul v21.4h, v21.4h, v27.4h // x[i]=(scale[i] * dequant[i]) where i = 4..7 + mul v22.4h, v22.4h, v28.4h // x[i]=(scale[i] * dequant[i]) where i = 8..11 + mul v23.4h, v23.4h, v29.4h // x[i]=(scale[i] * dequant[i]) where i = 12..14 + + smull v0.4s, v16.4h, v20.4h // q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3 + smull v2.4s, v17.4h, v21.4h // q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7 + smull v4.4s, v18.4h, v22.4h // q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11 + smull v6.4s, v19.4h, v23.4h // q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15 + + sshl v0.4s, v0.4s, v30.4s // q0 = q[i] = (p[i] << (qp/6)) where i = 0..3 + sshl v2.4s, v2.4s, 
v30.4s // q1 = q[i] = (p[i] << (qp/6)) where i = 4..7 + sshl v4.4s, v4.4s, v30.4s // q2 = q[i] = (p[i] << (qp/6)) where i = 8..11 + sshl v6.4s, v6.4s, v30.4s // q3 = q[i] = (p[i] << (qp/6)) where i = 12..15 + + sqrshrn v0.4h, v0.4s, #0x4 // d0 = c[i] = ((q[i] + 32) >> 4) where i = 0..3 + sqrshrn v1.4h, v2.4s, #0x4 // d1 = c[i] = ((q[i] + 32) >> 4) where i = 4..7 + sqrshrn v2.4h, v4.4s, #0x4 // d2 = c[i] = ((q[i] + 32) >> 4) where i = 8..11 + sqrshrn v3.4h, v6.4s, #0x4 // d3 = c[i] = ((q[i] + 32) >> 4) where i = 12..15 + + bne skip_loading_luma_dc_src + ld1 {v0.h}[0], [x10] // loads signed halfword pi2_dc_ld_addr[0], if x8==1 +skip_loading_luma_dc_src: + + //========= PROCESS IDCT FROM HERE ======= + //Steps for Stage 1: + //------------------ + ld1 {v30.s}[0], [x1], x3 // i row load pu1_pred buffer + + sshr v8.4h, v1.4h, #1 // d1>>1 + sshr v9.4h, v3.4h, #1 // d3>>1 + + add v4.4h, v0.4h, v2.4h // x0 = d0 + d2// + sub v5.4h, v0.4h, v2.4h // x1 = d0 - d2// + sub v6.4h, v8.4h, v3.4h // x2 = (d1 >> 1) - d3// + add v7.4h, v1.4h, v9.4h // x3 = d1 + (d3 >> 1)// + + ld1 {v30.s}[1], [x1], x3 // ii row load pu1_pred buffer + + add v10.4h, v4.4h , v7.4h // x0+x3 + add v11.4h, v5.4h , v6.4h // x1+x2 + sub v12.4h, v5.4h , v6.4h // x1-x2 + sub v13.4h, v4.4h , v7.4h + + ld1 {v31.s}[0], [x1], x3 // iii row load pu1_pred buf + + + //Steps for Stage 2: + //transopose + trn1 v4.4h, v10.4h, v11.4h + trn2 v5.4h, v10.4h, v11.4h + trn1 v6.4h, v12.4h, v13.4h + trn2 v7.4h, v12.4h, v13.4h + + trn1 v10.2s, v4.2s, v6.2s // 0 + trn1 v11.2s, v5.2s, v7.2s // 8 + trn2 v12.2s, v4.2s, v6.2s // 4 + trn2 v13.2s, v5.2s, v7.2s + //end transpose + + sshr v18.4h, v11.4h, #1 // q0>>1 + sshr v19.4h, v13.4h, #1 // q1>>1 + + add v14.4h, v10.4h, v12.4h // x0 = q0 + q2// + sub v15.4h, v10.4h, v12.4h // x1 = q0 - q2// + sub v16.4h, v18.4h, v13.4h // x2 = (q1 >> 1) - q3// + add v17.4h, v11.4h, v19.4h // x3 = q1+ (q3 >> 3)// + + + ld1 {v31.s}[1], [x1], x3 // iv row load pu1_pred buffer + + add v20.4h, v14.4h, 
v17.4h // x0 + x3 + add v21.4h, v15.4h, v16.4h // x1 + x2 + sub v22.4h, v15.4h, v16.4h // x1 - x2 + sub v23.4h, v14.4h, v17.4h // x0 - x3 + + mov v20.d[1], v21.d[0] + mov v22.d[1], v23.d[0] + + srshr v20.8h, v20.8h, #6 + srshr v22.8h, v22.8h, #6 + + uaddw v20.8h, v20.8h , v30.8b + uaddw v22.8h, v22.8h , v31.8b + + sqxtun v0.8b, v20.8h + sqxtun v1.8b, v22.8h + + st1 {v0.s}[0], [x2], x4 //i row store the value + st1 {v0.s}[1], [x2], x4 //ii row store the value + st1 {v1.s}[0], [x2], x4 //iii row store the value + st1 {v1.s}[1], [x2] //iv row store the value + + pop_v_regs + ret + + +///** +// ******************************************************************************* +// * +// * @brief +// * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block +// * +// * @par Description: +// * Performs inverse transform Ci4 and adds the residue to get the +// * reconstructed block +// * +// * @param[in] pi2_src +// * Input 4x4 coefficients +// * +// * @param[in] pu1_pred +// * Prediction 4x4 block +// * +// * @param[out] pu1_out +// * Output 4x4 block +// * +// * @param[in] u4_qp_div_6 +// * QP +// * +// * @param[in] pu2_weigh_mat +// * Pointer to weight matrix +// * +// * @param[in] pred_strd, +// * Prediction stride +// * +// * @param[in] out_strd +// * Output Stride +// * +// *@param[in] pi2_tmp +// * temporary buffer of size 1*16 +// * +// * @param[in] pu2_iscal_mat +// * Pointer to the inverse quantization matrix +// * +// * @returns Void +// * +// * @remarks +// * None +// * +// ******************************************************************************* +// */ +//void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src, +// UWORD8 *pu1_pred, +// UWORD8 *pu1_out, +// WORD32 pred_strd, +// WORD32 out_strd, +// const UWORD16 *pu2_iscal_mat, +// const UWORD16 *pu2_weigh_mat, +// UWORD32 u4_qp_div_6, +// WORD32 *pi4_tmp +// WORD16 *pi2_dc_src) +//**************Variables Vs Registers***************************************** +//x0 => *pi2_src 
+//x1 => *pu1_pred +//x2 => *pu1_out +//x3 => pred_strd +//x4 => out_strd +//x5 => *pu2_iscal_mat +//x6 => *pu2_weigh_mat +//x7 => u4_qp_div_6 +//sp => pi4_tmp +//sp#8 => *pi2_dc_src + + .global ih264_iquant_itrans_recon_chroma_4x4_av8 +ih264_iquant_itrans_recon_chroma_4x4_av8: + +//VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4 +//If the macro value changes need to change the instruction according to it. +//Only one shift is done in horizontal inverse because, +//if u4_qp_div_6 is lesser than 4 then shift value will be neagative and do negative left shift, in this case rnd_factor has value +//if u4_qp_div_6 is greater than 4 then shift value will be positive and do left shift, here rnd_factor is 0 + +//at the end of the fucntion, we could have moved 64 bits into heigher 64 bits of register and done further processing +//but it seem to give only reduce the number of instruction by 1. [Since a15 we saw add and sub to be very high throughput +//all instructions were taken as equal + + //reduce sp by 64 + push_v_regs + + dup v30.4s, w7 //Populate the u4_qp_div_6 in Q15 + + //was at sp + 8, hence now at sp+64+8 = sp+72 + ldr x10, [sp, #72] //Load alternate dc address + +//=======================DEQUANT FROM HERE=================================== + + ld4 {v20.4h - v23.4h}, [x5] // load pu2_iscal_mat[i], i =0..15 + ld4 {v26.4h - v29.4h}, [x6] // pu2_weigh_mat[i], i =0..15 + ld4 {v16.4h - v19.4h}, [x0] // pi2_src_tmp[i], i =0..15 + + + mul v20.4h, v20.4h, v26.4h // x[i]=(scale[i] * dequant[i]) where i = 0..3 + mul v21.4h, v21.4h, v27.4h // x[i]=(scale[i] * dequant[i]) where i = 4..7 + mul v22.4h, v22.4h, v28.4h // x[i]=(scale[i] * dequant[i]) where i = 8..11 + mul v23.4h, v23.4h, v29.4h // x[i]=(scale[i] * dequant[i]) where i = 12..14 + + smull v0.4s, v16.4h, v20.4h // q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3 + smull v2.4s, v17.4h, v21.4h // q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7 + smull v4.4s, v18.4h, v22.4h // q2 = p[i] = 
(x[i] * trns_coeff[i]) where i = 8..11 + smull v6.4s, v19.4h, v23.4h // q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15 + + sshl v0.4s, v0.4s, v30.4s // q0 = q[i] = (p[i] << (qp/6)) where i = 0..3 + sshl v2.4s, v2.4s, v30.4s // q1 = q[i] = (p[i] << (qp/6)) where i = 4..7 + sshl v4.4s, v4.4s, v30.4s // q2 = q[i] = (p[i] << (qp/6)) where i = 8..11 + sshl v6.4s, v6.4s, v30.4s // q3 = q[i] = (p[i] << (qp/6)) where i = 12..15 + + sqrshrn v0.4h, v0.4s, #0x4 // d0 = c[i] = ((q[i] + 32) >> 4) where i = 0..3 + sqrshrn v1.4h, v2.4s, #0x4 // d1 = c[i] = ((q[i] + 32) >> 4) where i = 4..7 + sqrshrn v2.4h, v4.4s, #0x4 // d2 = c[i] = ((q[i] + 32) >> 4) where i = 8..11 + sqrshrn v3.4h, v6.4s, #0x4 // d3 = c[i] = ((q[i] + 32) >> 4) where i = 12..15 + + ld1 {v0.h}[0], [x10] // loads signed halfword pi2_dc_src[0] + + //========= PROCESS IDCT FROM HERE ======= + //Steps for Stage 1: + //------------------ + + sshr v8.4h, v1.4h, #1 // d1>>1 + sshr v9.4h, v3.4h, #1 // d3>>1 + + add v4.4h, v0.4h, v2.4h // x0 = d0 + d2// + sub v5.4h, v0.4h, v2.4h // x1 = d0 - d2// + sub v6.4h, v8.4h, v3.4h // x2 = (d1 >> 1) - d3// + add v7.4h, v1.4h, v9.4h // x3 = d1 + (d3 >> 1)// + + + add v10.4h, v4.4h , v7.4h // x0+x3 + add v11.4h, v5.4h , v6.4h // x1+x2 + sub v12.4h, v5.4h , v6.4h // x1-x2 + sub v13.4h, v4.4h , v7.4h + + ld1 {v26.8b}, [x1], x3 // i row load pu1_pred buffer + ld1 {v27.8b}, [x1], x3 // ii row load pu1_pred buffer + ld1 {v28.8b}, [x1], x3 // iii row load pu1_pred buf + ld1 {v29.8b}, [x1], x3 // iv row load pu1_pred buffer + + //Steps for Stage 2: + //transopose + trn1 v4.4h, v10.4h, v11.4h + trn2 v5.4h, v10.4h, v11.4h + trn1 v6.4h, v12.4h, v13.4h + trn2 v7.4h, v12.4h, v13.4h + + trn1 v10.2s, v4.2s, v6.2s // 0 + trn1 v11.2s, v5.2s, v7.2s // 8 + trn2 v12.2s, v4.2s, v6.2s // 4 + trn2 v13.2s, v5.2s, v7.2s + //end transpose + + sshr v18.4h, v11.4h, #1 // q0>>1 + sshr v19.4h, v13.4h, #1 // q1>>1 + + add v14.4h, v10.4h, v12.4h // x0 = q0 + q2// + sub v15.4h, v10.4h, v12.4h // x1 = q0 - q2// 
+ sub v16.4h, v18.4h, v13.4h // x2 = (q1 >> 1) - q3// + add v17.4h, v11.4h, v19.4h // x3 = q1+ (q3 >> 3)// + + //Backup the output addr + mov x0, x2 + + //load outpt buufer for interleaving + ld1 {v10.8b}, [x2], x4 + ld1 {v11.8b}, [x2], x4 + ld1 {v12.8b}, [x2], x4 + ld1 {v13.8b}, [x2] + + add v20.4h, v14.4h, v17.4h // x0 + x3 + add v21.4h, v15.4h, v16.4h // x1 + x2 + sub v22.4h, v15.4h, v16.4h // x1 - x2 + sub v23.4h, v14.4h, v17.4h // x0 - x3 + + srshr v20.4h, v20.4h, #6 + srshr v21.4h, v21.4h, #6 + srshr v22.4h, v22.4h, #6 + srshr v23.4h, v23.4h, #6 + + //nop v30.8b //dummy for deinterleaving + movi v31.4h, #0x00ff //mask for interleaving [copy lower 8 bits] + + //Extract u/v plane from interleaved data + uzp1 v26.8b, v26.8b, v30.8b + uzp1 v27.8b, v27.8b, v30.8b + uzp1 v28.8b, v28.8b, v30.8b + uzp1 v29.8b, v29.8b, v30.8b + + uaddw v20.8h, v20.8h, v26.8b + uaddw v21.8h, v21.8h, v27.8b + uaddw v22.8h, v22.8h, v28.8b + uaddw v23.8h, v23.8h, v29.8b + + sqxtun v0.8b, v20.8h + sqxtun v1.8b, v21.8h + sqxtun v2.8b, v22.8h + sqxtun v3.8b, v23.8h + + //long the output so that we have 0 at msb and value at lsb + uxtl v6.8h, v0.8b + uxtl v7.8h, v1.8b + uxtl v8.8h, v2.8b + uxtl v9.8h, v3.8b + + //select lsbs from proceesd data and msbs from pu1_out loaded data + bit v10.8b, v6.8b, v31.8b + bit v11.8b, v7.8b, v31.8b + bit v12.8b, v8.8b, v31.8b + bit v13.8b, v9.8b, v31.8b + + //store the interleaved result + st1 {v10.8b}, [x0], x4 + st1 {v11.8b}, [x0], x4 + st1 {v12.8b}, [x0], x4 + st1 {v13.8b}, [x0] + + pop_v_regs + ret + +///* +// ******************************************************************************* +// * +// * //brief +// * This function performs inverse quant and Inverse transform type Ci4 for 8*8 block +// * +// * //par Description: +// * Performs inverse transform Ci8 and adds the residue to get the +// * reconstructed block +// * +// * //param[in] pi2_src +// * Input 4x4 coefficients +// * +// * //param[in] pu1_pred +// * Prediction 4x4 block +// * +// * 
//param[out] pu1_out +// * Output 4x4 block +// * +// * //param[in] u4_qp_div_6 +// * QP +// * +// * //param[in] pu2_weigh_mat +// * Pointer to weight matrix +// * +// * //param[in] pred_strd, +// * Prediction stride +// * +// * //param[in] out_strd +// * Output Stride +// * +// *//param[in] pi2_tmp +// * temporary buffer of size 1*64 +// * +// * //param[in] pu2_iscal_mat +// * Pointer to the inverse quantization matrix +// * +// * //returns Void +// * +// * //remarks +// * None +// * +// ******************************************************************************* +// */ +//void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src, +// UWORD8 *pu1_pred, +// UWORD8 *pu1_out, +// WORD32 pred_strd, +// WORD32 out_strd, +// const UWORD16 *pu2_iscal_mat, +// const UWORD16 *pu2_weigh_mat, +// UWORD32 u4_qp_div_6, +// WORD32 *pi4_tmp, +// WORD32 iq_start_idx +// WORD16 *pi2_dc_ld_addr) +//**************Variables Vs Registers***************************************** +//x0 => *pi2_src +//x1 => *pu1_pred +//x2 => *pu1_out +//x3 => pred_strd +//x4 => out_strd +//x5 => *pu2_iscal_mat +//x6 => *pu2_weigh_mat +//x7 => u4_qp_div_6 +//NOT USED => pi4_tmp +//NOT USED => iq_start_idx +//NOT USED => pi2_dc_ld_addr + + .global ih264_iquant_itrans_recon_8x8_av8 +ih264_iquant_itrans_recon_8x8_av8: + + push_v_regs + + ld1 {v8.8h -v11.8h}, [x5], #64 + ld1 {v12.8h-v15.8h}, [x5] + + ld1 {v16.8h -v19.8h}, [x6], #64 + ld1 {v20.8h -v23.8h}, [x6] + + mov x8, #16 + ld1 {v0.8h}, [x0], x8 + ld1 {v1.8h}, [x0], x8 + ld1 {v2.8h}, [x0], x8 + ld1 {v3.8h}, [x0], x8 + ld1 {v4.8h}, [x0], x8 + ld1 {v5.8h}, [x0], x8 + ld1 {v6.8h}, [x0], x8 + ld1 {v7.8h}, [x0] + + mul v8.8h, v8.8h, v16.8h + mul v9.8h, v9.8h, v17.8h + mul v10.8h, v10.8h, v18.8h + mul v11.8h, v11.8h, v19.8h + mul v12.8h, v12.8h, v20.8h + mul v13.8h, v13.8h, v21.8h + mul v14.8h, v14.8h, v22.8h + mul v15.8h, v15.8h, v23.8h + + smull v16.4s, v0.4h, v8.4h + smull2 v17.4s, v0.8h, v8.8h + smull v18.4s, v1.4h, v9.4h + smull2 v19.4s, v1.8h, v9.8h + smull 
v20.4s, v2.4h, v10.4h + smull2 v21.4s, v2.8h, v10.8h + smull v22.4s, v3.4h, v11.4h + smull2 v23.4s, v3.8h, v11.8h + smull v24.4s, v4.4h, v12.4h + smull2 v25.4s, v4.8h, v12.8h + smull v26.4s, v5.4h, v13.4h + smull2 v27.4s, v5.8h, v13.8h + smull v28.4s, v6.4h, v14.4h + smull2 v29.4s, v6.8h, v14.8h + smull v30.4s, v7.4h, v15.4h + smull2 v31.4s, v7.8h, v15.8h + + dup v0.4s, w7 + + sshl v16.4s, v16.4s, v0.4s + sshl v17.4s, v17.4s, v0.4s + sshl v18.4s, v18.4s, v0.4s + sshl v19.4s, v19.4s, v0.4s + sshl v20.4s, v20.4s, v0.4s + sshl v21.4s, v21.4s, v0.4s + sshl v22.4s, v22.4s, v0.4s + sshl v23.4s, v23.4s, v0.4s + sshl v24.4s, v24.4s, v0.4s + sshl v25.4s, v25.4s, v0.4s + sshl v26.4s, v26.4s, v0.4s + sshl v27.4s, v27.4s, v0.4s + sshl v28.4s, v28.4s, v0.4s + sshl v29.4s, v29.4s, v0.4s + sshl v30.4s, v30.4s, v0.4s + sshl v31.4s, v31.4s, v0.4s + + sqrshrn v0.4h, v16.4s, #6 + sqrshrn2 v0.8h, v17.4s, #6 + sqrshrn v1.4h, v18.4s, #6 + sqrshrn2 v1.8h, v19.4s, #6 + sqrshrn v2.4h, v20.4s, #6 + sqrshrn2 v2.8h, v21.4s, #6 + sqrshrn v3.4h, v22.4s, #6 + sqrshrn2 v3.8h, v23.4s, #6 + sqrshrn v4.4h, v24.4s, #6 + sqrshrn2 v4.8h, v25.4s, #6 + sqrshrn v5.4h, v26.4s, #6 + sqrshrn2 v5.8h, v27.4s, #6 + sqrshrn v6.4h, v28.4s, #6 + sqrshrn2 v6.8h, v29.4s, #6 + sqrshrn v7.4h, v30.4s, #6 + sqrshrn2 v7.8h, v31.4s, #6 + + //loop counter + mov x8, #2 +//1x8 transofORM +trans_1x8_1d: + + //transpose 8x8 + trn1 v8.8h, v0.8h, v1.8h + trn2 v9.8h, v0.8h, v1.8h + trn1 v10.8h, v2.8h, v3.8h + trn2 v11.8h, v2.8h, v3.8h + trn1 v12.8h, v4.8h, v5.8h + trn2 v13.8h, v4.8h, v5.8h + trn1 v14.8h, v6.8h, v7.8h + trn2 v15.8h, v6.8h, v7.8h + + trn1 v0.4s, v8.4s, v10.4s + trn2 v2.4s, v8.4s, v10.4s + trn1 v1.4s, v9.4s, v11.4s + trn2 v3.4s, v9.4s, v11.4s + trn1 v4.4s, v12.4s, v14.4s + trn2 v6.4s, v12.4s, v14.4s + trn1 v5.4s, v13.4s, v15.4s + trn2 v7.4s, v13.4s, v15.4s + + trn1 v8.2d, v0.2d, v4.2d //0 + trn2 v12.2d, v0.2d, v4.2d //1 + trn1 v9.2d, v1.2d, v5.2d //2 + trn2 v13.2d, v1.2d, v5.2d //3 + trn1 v10.2d, v2.2d, v6.2d //4 + 
trn2 v14.2d, v2.2d, v6.2d //5 + trn1 v11.2d, v3.2d, v7.2d //6 + trn2 v15.2d, v3.2d, v7.2d //7 + + // 1 3 5 6 7 + sshr v16.8h, v9.8h, #1 //(pi2_tmp_ptr[1] >> 1) + sshr v17.8h, v10.8h, #1 //(pi2_tmp_ptr[2] >> 1) + sshr v18.8h, v11.8h, #1 //(pi2_tmp_ptr[3] >> 1) + sshr v19.8h, v13.8h, #1 //(pi2_tmp_ptr[5] >> 1) + sshr v20.8h, v14.8h, #1 //(pi2_tmp_ptr[6] >> 1) + sshr v21.8h, v15.8h, #1 //(pi2_tmp_ptr[7] >> 1) + + add v0.8h, v8.8h, v12.8h // i_y0 = (pi2_tmp_ptr[0] + pi2_tmp_ptr[4] ); + sub v2.8h, v8.8h, v12.8h // i_y2 = (pi2_tmp_ptr[0] - pi2_tmp_ptr[4] ); + + sub v4.8h, v17.8h, v14.8h //i_y4 = ((pi2_tmp_ptr[2] >> 1) - pi2_tmp_ptr[6] ); + add v6.8h, v10.8h, v20.8h //i_y6 = (pi2_tmp_ptr[2] + (pi2_tmp_ptr[6] >> 1)); + + //-w3 + w5 + ssubl v22.4s, v13.4h, v11.4h + ssubl2 v23.4s, v13.8h, v11.8h + //w3 + w5 + saddl v24.4s, v13.4h, v11.4h + saddl2 v25.4s, v13.8h, v11.8h + //-w1 + w7 + ssubl v26.4s, v15.4h, v9.4h + ssubl2 v27.4s, v15.8h, v9.8h + //w1 + w7 + saddl v28.4s, v15.4h, v9.4h + saddl2 v29.4s, v15.8h, v9.8h + + //-w3 + w5 - w7 + ssubw v22.4s, v22.4s, v15.4h + ssubw2 v23.4s, v23.4s, v15.8h + //w3 + w5 + w1 + saddw v24.4s, v24.4s, v9.4h + saddw2 v25.4s, v25.4s, v9.8h + //-w1 + w7 + w5 + saddw v26.4s, v26.4s, v13.4h + saddw2 v27.4s, v27.4s, v13.8h + //w1 + w7 - w3 + ssubw v28.4s, v28.4s, v11.4h + ssubw2 v29.4s, v29.4s, v11.8h + + //-w3 + w5 - w7 - (w7 >> 1) + ssubw v22.4s, v22.4s, v21.4h + ssubw2 v23.4s, v23.4s, v21.8h + //w3 + w5 + w1 + (w1 >> 1) + saddw v24.4s, v24.4s, v16.4h + saddw2 v25.4s, v25.4s, v16.8h + //-w1 + w7 + w5 + (w5 >> 1) + saddw v26.4s, v26.4s, v19.4h + saddw2 v27.4s, v27.4s, v19.8h + //w1 + w7 - w3 - (w3 >> 1) + ssubw v28.4s, v28.4s, v18.4h + ssubw2 v29.4s, v29.4s, v18.8h + + xtn v1.4h, v22.4s + xtn2 v1.8h, v23.4s + xtn v3.4h, v28.4s + xtn2 v3.8h, v29.4s + xtn v5.4h, v26.4s + xtn2 v5.8h, v27.4s + xtn v7.4h, v24.4s + xtn2 v7.8h, v25.4s + + sshr v16.8h, v1.8h, #2 //(y1 >> 2) + sshr v17.8h, v3.8h, #2 //(y3 >> 2) + sshr v18.8h, v5.8h, #2 //(y5 >> 2) + sshr 
v19.8h, v7.8h, #2 //(y7 >> 2) + + add v8.8h, v0.8h, v6.8h + add v9.8h, v1.8h, v19.8h + add v10.8h, v2.8h, v4.8h + add v11.8h, v3.8h, v18.8h + sub v12.8h, v2.8h, v4.8h + sub v13.8h, v17.8h, v5.8h + sub v14.8h, v0.8h, v6.8h + sub v15.8h, v7.8h, v16.8h + + add v0.8h, v8.8h, v15.8h + add v1.8h, v10.8h, v13.8h + add v2.8h, v12.8h, v11.8h + add v3.8h, v14.8h, v9.8h + sub v4.8h, v14.8h, v9.8h + sub v5.8h, v12.8h, v11.8h + sub v6.8h, v10.8h, v13.8h + sub v7.8h, v8.8h, v15.8h + + subs x8, x8, #1 + bne trans_1x8_1d + + ld1 {v22.8b}, [x1], x3 + ld1 {v23.8b}, [x1], x3 + ld1 {v24.8b}, [x1], x3 + ld1 {v25.8b}, [x1], x3 + ld1 {v26.8b}, [x1], x3 + ld1 {v27.8b}, [x1], x3 + ld1 {v28.8b}, [x1], x3 + ld1 {v29.8b}, [x1] + + srshr v0.8h, v0.8h, #6 + srshr v1.8h, v1.8h, #6 + srshr v2.8h, v2.8h, #6 + srshr v3.8h, v3.8h, #6 + srshr v4.8h, v4.8h, #6 + srshr v5.8h, v5.8h, #6 + srshr v6.8h, v6.8h, #6 + srshr v7.8h, v7.8h, #6 + + uaddw v0.8h, v0.8h, v22.8b + uaddw v1.8h, v1.8h, v23.8b + uaddw v2.8h, v2.8h, v24.8b + uaddw v3.8h, v3.8h, v25.8b + uaddw v4.8h, v4.8h, v26.8b + uaddw v5.8h, v5.8h, v27.8b + uaddw v6.8h, v6.8h, v28.8b + uaddw v7.8h, v7.8h, v29.8b + + sqxtun v0.8b, v0.8h + sqxtun v1.8b, v1.8h + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + sqxtun v4.8b, v4.8h + sqxtun v5.8b, v5.8h + sqxtun v6.8b, v6.8h + sqxtun v7.8b, v7.8h + + st1 {v0.8b}, [x2], x4 + st1 {v1.8b}, [x2], x4 + st1 {v2.8b}, [x2], x4 + st1 {v3.8b}, [x2], x4 + st1 {v4.8b}, [x2], x4 + st1 {v5.8b}, [x2], x4 + st1 {v6.8b}, [x2], x4 + st1 {v7.8b}, [x2] + + pop_v_regs + ret + + + + diff --git a/common/armv8/ih264_iquant_itrans_recon_dc_av8.s b/common/armv8/ih264_iquant_itrans_recon_dc_av8.s new file mode 100755 index 0000000..8bb9c32 --- /dev/null +++ b/common/armv8/ih264_iquant_itrans_recon_dc_av8.s @@ -0,0 +1,397 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); 
+//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +//*/ +///** +// ******************************************************************************* +// * @file +// * ih264_iquant_itrans_recon_dc_av8.s +// * +// * @brief +// * Contains function definitions for single stage inverse transform +// * +// * @author +// * Mohit +// * +// * @par List of Functions: +// * - ih264_iquant_itrans_recon_4x4_dc_av8() +// * - ih264_iquant_itrans_recon_8x8_dc_av8() +// * - ih264_iquant_itrans_recon_chroma_4x4_dc_av8() +// * +// * @remarks +// * None +// * +// ******************************************************************************* +//*/ + + +.include "ih264_neon_macros.s" + + +///** +// ******************************************************************************* +// * +// * @brief +// * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block +// * for dc input pattern only, i.e. only the (0,0) element of the input 4x4 block is +// * non-zero. 
For complete function, refer ih264_iquant_itrans_recon_a9.s +// * +// * @par Description: +// * Performs inverse transform Ci4 and adds the residue to get the +// * reconstructed block +// * +// * @param[in] pi2_src +// * Input 4x4 coefficients +// * +// * @param[in] pu1_pred +// * Prediction 4x4 block +// * +// * @param[out] pu1_out +// * Output 4x4 block +// * +// * @param[in] u4_qp_div_6 +// * QP +// * +// * @param[in] pu2_weigh_mat +// * Pointer to weight matrix +// * +// * @param[in] pred_strd, +// * Prediction stride +// * +// * @param[in] out_strd +// * Output Stride +// * +// *@param[in] pi2_tmp +// * temporary buffer of size 1*16 +// * +// * @param[in] pu2_iscal_mat +// * Pointer to the inverse quantization matrix +// * +// * @returns Void +// * +// * @remarks +// * None +// * +// ******************************************************************************* +// */ +//void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src, +// UWORD8 *pu1_pred, +// UWORD8 *pu1_out, +// WORD32 pred_strd, +// WORD32 out_strd, +// const UWORD16 *pu2_iscal_mat, +// const UWORD16 *pu2_weigh_mat, +// UWORD32 u4_qp_div_6, +// WORD32 *pi4_tmp, +// WORD32 iq_start_idx +// WORD16 *pi2_dc_ld_addr) +//**************Variables Vs Registers***************************************** +//x0 => *pi2_src +//x1 => *pu1_pred +//x2 => *pu1_out +//x3 => pred_strd +//x4 => out_strd +//x5 => *pu2_iscal_mat +//x6 => *pu2_weigh_mat +//x7 => u4_qp_div_6 +// => pi4_tmp +// => iq_start_idx +// => pi2_dc_ld_addr + +.text +.p2align 2 + + .global ih264_iquant_itrans_recon_4x4_dc_av8 +ih264_iquant_itrans_recon_4x4_dc_av8: + + ldr w8, [sp, #8] //Loads iq_start_idx + subs w8, w8, #1 // if x8 == 1 => intra case , so result of subtraction is zero and z flag is set + + ldr x10, [sp, #16] //Load alternate dc address + push_v_regs + dup v30.4s, w7 //Populate the u4_qp_div_6 in Q15 + + + bne donot_use_pi2_dc_ld_addr_luma_dc + ld1 {v0.h}[0], [x10] +donot_use_pi2_dc_ld_addr_luma_dc: + + beq donot_use_pi2_src_luma_dc + 
ld1 {v0.h}[0], [x5] + ld1 {v1.h}[0], [x6] + ld1 {v2.h}[0], [x0] + mul v0.4h, v1.4h, v0.4h + smull v0.4s, v0.4h, v2.4h + sshl v0.4s, v0.4s, v30.4s + sqrshrn v0.4h, v0.4s, #4 +donot_use_pi2_src_luma_dc: + + + dup v0.8h, v0.h[0] + srshr v0.8h, v0.8h, #6 + + ld1 {v1.s}[0], [x1], x3 + ld1 {v1.s}[1], [x1], x3 + ld1 {v2.s}[0], [x1], x3 + ld1 {v2.s}[1], [x1] + + uxtl v1.8h, v1.8b + uxtl v2.8h, v2.8b + + add v1.8h, v0.8h, v1.8h + add v2.8h, v0.8h, v2.8h + + sqxtun v1.8b, v1.8h + sqxtun v2.8b, v2.8h + + st1 {v1.s}[0], [x2], x4 + st1 {v1.s}[1], [x2], x4 + st1 {v2.s}[0], [x2], x4 + st1 {v2.s}[1], [x2] + pop_v_regs + ret + +// /* +// ******************************************************************************** +// * +// * @brief This function reconstructs a 4x4 sub block from quantized resiude and +// * prediction buffer if only dc value is present for residue +// * +// * @par Description: +// * The quantized residue is first inverse quantized, +// * This inverse quantized content is added to the prediction buffer to recon- +// * struct the end output +// * +// * @param[in] pi2_src +// * quantized dc coeffiient +// * +// * @param[in] pu1_pred +// * prediction 4x4 block in interleaved format +// * +// * @param[in] pred_strd, +// * Prediction buffer stride in interleaved format +// * +// * @param[in] out_strd +// * recon buffer Stride +// * +// * @returns none +// * +// * @remarks none +// * +// ******************************************************************************* +// */ +// void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src, +// UWORD8 *pu1_pred, +// UWORD8 *pu1_out, +// WORD32 pred_strd, +// WORD32 out_strd, +// const UWORD16 *pu2_iscal_mat, +// const UWORD16 *pu2_weigh_mat, +// UWORD32 u4_qp_div_6, +// WORD16 *pi2_tmp, +// WORD16 *pi2_dc_src) +// Register Usage +// x0 : pi2_src +// x1 : pu1_pred +// x2 : pu1_out +// x3 : pred_strd +// x4 : out_strd +// x5 : pu2_iscal_mat +// x6 : pu2_weigh_mat +// x7 : u4_qp_div_6 +// : pi2_tmp +// : pi2_dc_src +// Neon 
registers d0-d7, d16-d30 are used +// No need for pushing arm and neon registers + + + .global ih264_iquant_itrans_recon_chroma_4x4_dc_av8 +ih264_iquant_itrans_recon_chroma_4x4_dc_av8: + + ldr x0, [sp, #8] + push_v_regs + ld1 {v0.h}[0], [x0] + dup v0.8h, v0.h[0] + srshr v0.8h, v0.8h, #6 + + + //backup pu1_out + mov x0, x2 + + //nop v3.16b //dummy for deinterleaving + movi v31.8h, #0x00ff //mask for interleaving [copy lower 8 bits] + + ld1 {v1.d}[0], [x1], x3 + ld1 {v1.d}[1], [x1], x3 + ld1 {v2.d}[0], [x1], x3 + ld1 {v2.d}[1], [x1], x3 + + ld1 {v11.d}[0], [x2], x4 //load pu1_out for interleaving + ld1 {v11.d}[1], [x2], x4 + ld1 {v12.d}[0], [x2], x4 + ld1 {v12.d}[1], [x2] + + uzp1 v1.16b, v1.16b, v3.16b + uzp1 v2.16b, v2.16b, v3.16b + + uaddw v1.8h, v0.8h, v1.8b + uaddw v2.8h, v0.8h, v2.8b + + sqxtun v1.8b, v1.8h + sqxtun v2.8b, v2.8h + + uxtl v1.8h, v1.8b + uxtl v2.8h, v2.8b + + bit v11.16b, v1.16b, v31.16b + bit v12.16b, v2.16b, v31.16b + + st1 {v11.d}[0], [x0], x4 + st1 {v11.d}[1], [x0], x4 + st1 {v12.d}[0], [x0], x4 + st1 {v12.d}[1], [x0] + pop_v_regs + ret + +///* +// ******************************************************************************* +// * +// * //brief +// * This function performs inverse quant and Inverse transform type Ci4 for 8*8 block +// * [Only for Dc coeff] +// * //par Description: +// * Performs inverse transform Ci8 and adds the residue to get the +// * reconstructed block +// * +// * //param[in] pi2_src +// * Input 4x4 coefficients +// * +// * //param[in] pu1_pred +// * Prediction 4x4 block +// * +// * //param[out] pu1_out +// * Output 4x4 block +// * +// * //param[in] u4_qp_div_6 +// * QP +// * +// * //param[in] pu2_weigh_mat +// * Pointer to weight matrix +// * +// * //param[in] pred_strd, +// * Prediction stride +// * +// * //param[in] out_strd +// * Output Stride +// * +// *//param[in] pi2_tmp +// * temporary buffer of size 1*64 +// * +// * //param[in] pu2_iscal_mat +// * Pointer to the inverse quantization matrix +// * +// * 
//******************************************************************************
//* ih264_iquant_itrans_recon_8x8_dc_av8
//*
//* Inverse quant + inverse transform + reconstruction of an 8x8 block in
//* which only the DC coefficient is non-zero: dequantise the DC, derive the
//* constant residual, add it to the 8x8 prediction and store the result.
//*
//* x0: pi2_src          x1: pu1_pred        x2: pu1_out
//* x3: pred_strd        x4: out_strd
//* x5: pu2_iscal_mat    x6: pu2_weigh_mat   x7: u4_qp_div_6
//* (pi4_tmp, iq_start_idx and pi2_dc_ld_addr are not used)
//******************************************************************************

    .global ih264_iquant_itrans_recon_8x8_dc_av8
ih264_iquant_itrans_recon_8x8_dc_av8:

    push_v_regs

    ld1     {v1.h}[0], [x5]             // scale-matrix DC entry
    ld1     {v2.h}[0], [x6]             // weight-matrix DC entry
    ld1     {v0.h}[0], [x0]             // quantised DC coefficient
    dup     v3.4s, w7                   // qp/6 as a variable left-shift amount

    mul     v1.8h, v1.8h, v2.8h         // combined dequant scale
    smull   v0.4s, v0.4h, v1.4h         // dc * scale
    sshl    v0.4s, v0.4s, v3.4s         // << (qp/6)

    sqrshrn v0.4h, v0.4s, #6            // narrow with rounding (dequant shift)
    srshr   v0.8h, v0.8h, #6            // transform rounding: (dc + 32) >> 6
    dup     v0.8h, v0.h[0]              // broadcast the constant residual

    // Load the 8x8 prediction, one 8-byte row at a time.
    ld1     {v22.8b}, [x1], x3
    ld1     {v23.8b}, [x1], x3
    ld1     {v24.8b}, [x1], x3
    ld1     {v25.8b}, [x1], x3
    ld1     {v26.8b}, [x1], x3
    ld1     {v27.8b}, [x1], x3
    ld1     {v28.8b}, [x1], x3
    ld1     {v29.8b}, [x1]

    // pred + residual, widened to 16 bit per row.
    uaddw   v1.8h, v0.8h, v22.8b
    uaddw   v2.8h, v0.8h, v23.8b
    uaddw   v3.8h, v0.8h, v24.8b
    uaddw   v8.8h, v0.8h, v25.8b
    uaddw   v9.8h, v0.8h, v26.8b
    uaddw   v10.8h, v0.8h, v27.8b
    uaddw   v11.8h, v0.8h, v28.8b
    uaddw   v12.8h, v0.8h, v29.8b

    // Saturate each row back down to unsigned 8 bit.
    sqxtun  v1.8b, v1.8h
    sqxtun  v2.8b, v2.8h
    sqxtun  v3.8b, v3.8h
    sqxtun  v8.8b, v8.8h
    sqxtun  v9.8b, v9.8h
    sqxtun  v10.8b, v10.8h
    sqxtun  v11.8b, v11.8h
    sqxtun  v12.8b, v12.8h

    // Store the reconstructed 8x8 block.
    st1     {v1.8b}, [x2], x4
    st1     {v2.8b}, [x2], x4
    st1     {v3.8b}, [x2], x4
    st1     {v8.8b}, [x2], x4
    st1     {v9.8b}, [x2], x4
    st1     {v10.8b}, [x2], x4
    st1     {v11.8b}, [x2], x4
    st1     {v12.8b}, [x2]

    pop_v_regs
    ret
//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//*******************************************************************************
//* @file
//*  ih264_mem_fns_neon_av8.s
//*
//* @brief
//*  Memory manipulation helpers: 8-bit memcpy/memset and 16-bit memset,
//*  with NEON fast paths and scalar tails.
//*
//* @par List of Functions:
//*  - ih264_memcpy_av8()
//*  - ih264_memcpy_mul_8_av8()
//*  - ih264_memset_av8()
//*  - ih264_memset_mul_8_av8()
//*  - ih264_memset_16bit_av8()
//*  - ih264_memset_16bit_mul_8_av8()
//*
//* NOTE(review): the original scalar tail loops re-extended the byte/halfword
//* value with "sxtw" after every load/store; those results were never read
//* (the stores use only the w register), so the dead instructions are removed.
//*
//*******************************************************************************
//*/

.text
.p2align 2
.include "ih264_neon_macros.s"

//------------------------------------------------------------------------------
// void ih264_memcpy_mul_8(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD8 num_bytes)
// x0: pu1_dst   x1: pu1_src   x2: num_bytes (non-zero multiple of 8)
//------------------------------------------------------------------------------
    .global ih264_memcpy_mul_8_av8
ih264_memcpy_mul_8_av8:
loop_neon_memcpy_mul_8:
    ld1     {v0.8b}, [x1], #8           // copy 8 bytes per iteration
    st1     {v0.8b}, [x0], #8
    subs    x2, x2, #8
    bne     loop_neon_memcpy_mul_8
    ret

//------------------------------------------------------------------------------
// void ih264_memcpy(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD8 num_bytes)
// x0: pu1_dst   x1: pu1_src   x2: num_bytes (any count)
// NEON 8-byte chunks, then a byte-at-a-time scalar tail.
//------------------------------------------------------------------------------
    .global ih264_memcpy_av8
ih264_memcpy_av8:
    subs    x2, x2, #8
    blt     arm_memcpy                  // fewer than 8 bytes: scalar only
loop_neon_memcpy:
    ld1     {v0.8b}, [x1], #8
    st1     {v0.8b}, [x0], #8
    subs    x2, x2, #8
    bge     loop_neon_memcpy
    cmp     x2, #-8
    beq     end_func1                   // length was an exact multiple of 8
arm_memcpy:
    add     x2, x2, #8                  // x2 = remaining byte count (1..7)
loop_arm_memcpy:
    ldrb    w3, [x1], #1                // ldrb zero-extends into w3 already
    strb    w3, [x0], #1
    subs    x2, x2, #1
    bne     loop_arm_memcpy
    ret
end_func1:
    ret

//------------------------------------------------------------------------------
// void ih264_memset_mul_8(UWORD8 *pu1_dst, UWORD8 value, UWORD8 num_bytes)
// x0: pu1_dst   x1: value   x2: num_bytes (non-zero multiple of 8)
//------------------------------------------------------------------------------
    .global ih264_memset_mul_8_av8
ih264_memset_mul_8_av8:
    dup     v0.8b, w1                   // splat the fill byte
loop_memset_mul_8:
    st1     {v0.8b}, [x0], #8
    subs    x2, x2, #8
    bne     loop_memset_mul_8
    ret

//------------------------------------------------------------------------------
// void ih264_memset(UWORD8 *pu1_dst, UWORD8 value, UWORD8 num_bytes)
// x0: pu1_dst   x1: value   x2: num_bytes (any count)
//------------------------------------------------------------------------------
    .global ih264_memset_av8
ih264_memset_av8:
    subs    x2, x2, #8
    blt     arm_memset
    dup     v0.8b, w1
loop_neon_memset:
    st1     {v0.8b}, [x0], #8
    subs    x2, x2, #8
    bge     loop_neon_memset
    cmp     x2, #-8
    beq     end_func2
arm_memset:
    add     x2, x2, #8                  // remaining byte count (1..7)
loop_arm_memset:
    strb    w1, [x0], #1
    subs    x2, x2, #1
    bne     loop_arm_memset
    ret
end_func2:
    ret

//------------------------------------------------------------------------------
// void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst, UWORD16 value,
//                               UWORD8 num_words)
// x0: pu2_dst   x1: value   x2: num_words (non-zero multiple of 8)
//------------------------------------------------------------------------------
    .global ih264_memset_16bit_mul_8_av8
ih264_memset_16bit_mul_8_av8:
    dup     v0.4h, w1                   // splat the 16-bit fill value
loop_memset_16bit_mul_8:
    st1     {v0.4h}, [x0], #8           // 8 halfwords (16 bytes) per iteration
    st1     {v0.4h}, [x0], #8
    subs    x2, x2, #8
    bne     loop_memset_16bit_mul_8
    ret

//------------------------------------------------------------------------------
// void ih264_memset_16bit(UWORD16 *pu2_dst, UWORD16 value, UWORD8 num_words)
// x0: pu2_dst   x1: value   x2: num_words (any count)
//------------------------------------------------------------------------------
    .global ih264_memset_16bit_av8
ih264_memset_16bit_av8:
    subs    x2, x2, #8
    blt     arm_memset_16bit
    dup     v0.4h, w1
loop_neon_memset_16bit:
    st1     {v0.4h}, [x0], #8
    st1     {v0.4h}, [x0], #8
    subs    x2, x2, #8
    bge     loop_neon_memset_16bit
    cmp     x2, #-8
    beq     end_func3
arm_memset_16bit:
    add     x2, x2, #8                  // remaining halfword count (1..7)
loop_arm_memset_16bit:
    strh    w1, [x0], #2
    subs    x2, x2, #1
    bne     loop_arm_memset_16bit
    ret
end_func3:
    ret

//******************************************************************************
//* ih264_neon_macros.s
//* Shared push/pop macros for the callee-saved SIMD registers.
//*****************************************************************************

// Save the callee-saved low 64-bit halves d8-d15 (AAPCS64) before any
// routine touches v8-v15.  The matching .endm is on the next file line.
.macro push_v_regs
    stp     d8, d9, [sp, #-16]!
    stp     d10, d11, [sp, #-16]!
    stp     d12, d13, [sp, #-16]!
    stp     d14, d15, [sp, #-16]!
.endm                                   // closes push_v_regs (opened above)

// Restore d8-d15 in the reverse order of push_v_regs.
.macro pop_v_regs
    ldp     d14, d15, [sp], #16
    ldp     d12, d13, [sp], #16
    ldp     d10, d11, [sp], #16
    ldp     d8, d9, [sp], #16
.endm

// Swap two general-purpose registers in place (three-XOR trick).
.macro swp reg1, reg2
    eor     \reg1, \reg1, \reg2
    eor     \reg2, \reg1, \reg2
    eor     \reg1, \reg1, \reg2
.endm
//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//*******************************************************************************
//* @file
//*  ih264_padding_neon_av8.s
//*
//* @brief
//*  Edge padding of 2-D pixel arrays (top row, left/right columns) for luma
//*  and interleaved chroma.
//*
//* @par List of Functions:
//*  - ih264_pad_top_av8()
//*  - ih264_pad_left_luma_av8()
//*  - ih264_pad_left_chroma_av8()
//*  - ih264_pad_right_luma_av8()
//*  - ih264_pad_right_chroma_av8()
//*
//* NOTE(review): relative to the original:
//*  - ih264_pad_right_luma_av8 branched "bne loop_32" into the middle of
//*    ih264_pad_left_luma_av8; it only worked because both functions happen
//*    to set up x0/x4/x6 identically.  It now branches to its own loop_32_r.
//*  - dead "sxtw xN, wN" re-extensions after every ldrb/ldrh were removed
//*    (only the w registers are consumed, by dup).
//*  - unreachable code after complementary beq/bne pairs in the two chroma
//*    pads was removed.
//*
//*******************************************************************************
//*/

.text
.p2align 2
.include "ih264_neon_macros.s"

//------------------------------------------------------------------------------
// void ih264_pad_top(UWORD8 *pu1_src, WORD32 src_strd, WORD32 wd,
//                    WORD32 pad_size)
// Replicates the top row pad_size times above the picture.
// x0: pu1_src   x1: src_strd   x2: wd (multiple of 16)   x3: pad_size
//------------------------------------------------------------------------------
    .global ih264_pad_top_av8
ih264_pad_top_av8:
    push_v_regs
    stp     x19, x20, [sp, #-16]!

    sub     x5, x0, x1                  // first padded row above the picture
    neg     x6, x1                      // -src_strd: walk upwards row by row
                                        // (was "sub x20, x1, #0; neg x6, x20")

loop_neon_memcpy_mul_16:
    ld1     {v0.8b, v1.8b}, [x0], #16   // 16 source bytes of the top row
    mov     x4, x5
    mov     x7, x3
    add     x5, x5, #16

loop_neon_pad_top:
    st1     {v0.8b, v1.8b}, [x4], x6    // replicate into each pad row
    subs    x7, x7, #1
    bne     loop_neon_pad_top

    subs    x2, x2, #16
    bne     loop_neon_memcpy_mul_16

    ldp     x19, x20, [sp], #16
    pop_v_regs
    ret

//------------------------------------------------------------------------------
// void ih264_pad_left_luma(UWORD8 *pu1_src, WORD32 src_strd, WORD32 ht,
//                          WORD32 pad_size)
// Replicates the left column pad_size (16 or 32) times to the left.
// x0: pu1_src   x1: src_strd   x2: ht (multiple of 8)   x3: pad_size
//------------------------------------------------------------------------------
    .global ih264_pad_left_luma_av8
ih264_pad_left_luma_av8:
    push_v_regs
    stp     x19, x20, [sp, #-16]!

    sub     x4, x0, x3                  // start of the pad area
    sub     x6, x1, #16                 // row step after a 16-byte post-inc
    subs    x5, x3, #16
    bne     loop_32                     // pad_size != 16 -> assume 32

loop_16:                                // pad_size == 16; 8 rows per iteration
    ldrb    w8, [x0]                    // leftmost pixel of the row
    add     x0, x0, x1
    ldrb    w9, [x0]
    add     x0, x0, x1
    dup     v0.16b, w8
    ldrb    w10, [x0]
    add     x0, x0, x1
    st1     {v0.16b}, [x4], x1          // 16-byte pad store
    dup     v2.16b, w9
    st1     {v2.16b}, [x4], x1
    ldrb    w11, [x0]
    add     x0, x0, x1
    dup     v4.16b, w10
    dup     v6.16b, w11
    st1     {v4.16b}, [x4], x1
    ldrb    w8, [x0]
    add     x0, x0, x1
    st1     {v6.16b}, [x4], x1
    ldrb    w9, [x0]
    add     x0, x0, x1
    dup     v0.16b, w8
    ldrb    w10, [x0]
    add     x0, x0, x1
    st1     {v0.16b}, [x4], x1
    dup     v2.16b, w9
    ldrb    w11, [x0]
    add     x0, x0, x1
    st1     {v2.16b}, [x4], x1
    dup     v4.16b, w10
    dup     v6.16b, w11
    subs    x2, x2, #8
    st1     {v4.16b}, [x4], x1
    st1     {v6.16b}, [x4], x1
    bne     loop_16
    b       end_func

loop_32:                                // pad_size == 32; two stores per row
    ldrb    w8, [x0]
    add     x0, x0, x1
    ldrb    w9, [x0]
    add     x0, x0, x1
    dup     v0.16b, w8
    ldrb    w10, [x0]
    add     x0, x0, x1
    st1     {v0.16b}, [x4], #16
    dup     v2.16b, w9
    st1     {v0.16b}, [x4], x6
    st1     {v2.16b}, [x4], #16
    dup     v4.16b, w10
    st1     {v2.16b}, [x4], x6
    ldrb    w11, [x0]
    add     x0, x0, x1
    st1     {v4.16b}, [x4], #16
    dup     v6.16b, w11
    st1     {v4.16b}, [x4], x6
    ldrb    w8, [x0]
    add     x0, x0, x1
    st1     {v6.16b}, [x4], #16
    dup     v0.16b, w8
    ldrb    w9, [x0]
    add     x0, x0, x1
    st1     {v6.16b}, [x4], x6
    ldrb    w10, [x0]
    add     x0, x0, x1
    st1     {v0.16b}, [x4], #16
    dup     v2.16b, w9
    st1     {v0.16b}, [x4], x6
    ldrb    w11, [x0]
    add     x0, x0, x1
    st1     {v2.16b}, [x4], #16
    dup     v4.16b, w10
    st1     {v2.16b}, [x4], x6
    st1     {v4.16b}, [x4], #16
    dup     v6.16b, w11
    st1     {v4.16b}, [x4], x6
    subs    x2, x2, #8
    st1     {v6.16b}, [x4], #16
    st1     {v6.16b}, [x4], x6
    bne     loop_32

end_func:
    ldp     x19, x20, [sp], #16
    pop_v_regs
    ret

//------------------------------------------------------------------------------
// void ih264_pad_left_chroma(UWORD8 *pu1_src, WORD32 src_strd, WORD32 ht,
//                            WORD32 pad_size)
// Replicates the leftmost UV pair across a 32-byte pad; ht is 4, 8 or >= 12.
// x0: pu1_src   x1: src_strd   x2: ht   x3: pad_size
//------------------------------------------------------------------------------
    .global ih264_pad_left_chroma_av8
ih264_pad_left_chroma_av8:
    push_v_regs
    stp     x19, x20, [sp, #-16]!

    sub     x4, x0, x3                  // start of the pad area
    sub     x6, x1, #16

loop_32_l_c:                            // 4 rows per block
    ldrh    w8, [x0]                    // leftmost UV pair of the row
    add     x0, x0, x1
    ldrh    w9, [x0]
    add     x0, x0, x1
    dup     v0.8h, w8
    ldrh    w10, [x0]
    add     x0, x0, x1
    st1     {v0.16b}, [x4], #16
    dup     v2.8h, w9
    st1     {v0.16b}, [x4], x6
    ldrh    w11, [x0]
    add     x0, x0, x1
    st1     {v2.16b}, [x4], #16
    dup     v4.8h, w10
    st1     {v2.16b}, [x4], x6
    dup     v6.8h, w11
    st1     {v4.16b}, [x4], #16
    st1     {v4.16b}, [x4], x6
    subs    x2, x2, #4
    st1     {v6.16b}, [x4], #16
    st1     {v6.16b}, [x4], x6

    beq     end_func_l_c                // ht == 4 done

    ldrh    w8, [x0]
    add     x0, x0, x1
    ldrh    w9, [x0]
    add     x0, x0, x1
    dup     v0.8h, w8
    ldrh    w10, [x0]
    add     x0, x0, x1
    st1     {v0.16b}, [x4], #16
    dup     v2.8h, w9
    st1     {v0.16b}, [x4], x6
    ldrh    w11, [x0]
    add     x0, x0, x1
    st1     {v2.16b}, [x4], #16
    dup     v4.8h, w10
    st1     {v2.16b}, [x4], x6
    dup     v6.8h, w11
    st1     {v4.16b}, [x4], #16
    st1     {v4.16b}, [x4], x6
    subs    x2, x2, #4
    st1     {v6.16b}, [x4], #16
    st1     {v6.16b}, [x4], x6

    beq     end_func_l_c                // ht == 8 done
    bne     loop_32_l_c                 // ht >= 12: keep going
    // (an unreachable duplicate of the block above followed here; removed)

end_func_l_c:
    ldp     x19, x20, [sp], #16
    pop_v_regs
    ret

//------------------------------------------------------------------------------
// void ih264_pad_right_luma(UWORD8 *pu1_src, WORD32 src_strd, WORD32 ht,
//                           WORD32 pad_size)
// Replicates the last pixel of each row pad_size (16 or 32) times to the
// right.  pu1_src points one past the last valid pixel.
// x0: pu1_src   x1: src_strd   x2: ht (multiple of 8)   x3: pad_size
//------------------------------------------------------------------------------
    .global ih264_pad_right_luma_av8
ih264_pad_right_luma_av8:
    push_v_regs
    stp     x19, x20, [sp, #-16]!

    mov     x4, x0                      // pad area starts at pu1_src
    sub     x6, x1, #16
    sub     x0, x0, #1                  // last valid pixel of each row
    subs    x5, x3, #16
    bne     loop_32_r                   // BUGFIX: was "bne loop_32", which
                                        // jumped into ih264_pad_left_luma_av8

loop_16_r:                              // pad_size == 16; 8 rows per iteration
    ldrb    w8, [x0]
    add     x0, x0, x1
    ldrb    w9, [x0]
    add     x0, x0, x1
    dup     v0.16b, w8
    ldrb    w10, [x0]
    add     x0, x0, x1
    st1     {v0.16b}, [x4], x1
    dup     v2.16b, w9
    st1     {v2.16b}, [x4], x1
    ldrb    w11, [x0]
    add     x0, x0, x1
    dup     v4.16b, w10
    dup     v6.16b, w11
    st1     {v4.16b}, [x4], x1
    ldrb    w8, [x0]
    add     x0, x0, x1
    st1     {v6.16b}, [x4], x1
    ldrb    w9, [x0]
    add     x0, x0, x1
    dup     v0.16b, w8
    ldrb    w10, [x0]
    add     x0, x0, x1
    st1     {v0.16b}, [x4], x1
    dup     v2.16b, w9
    ldrb    w11, [x0]
    add     x0, x0, x1
    st1     {v2.16b}, [x4], x1
    dup     v4.16b, w10
    dup     v6.16b, w11
    subs    x2, x2, #8
    st1     {v4.16b}, [x4], x1
    st1     {v6.16b}, [x4], x1
    bne     loop_16_r
    b       end_func_r

loop_32_r:                              // pad_size == 32; two stores per row
    ldrb    w8, [x0]
    add     x0, x0, x1
    ldrb    w9, [x0]
    add     x0, x0, x1
    dup     v0.16b, w8
    ldrb    w10, [x0]
    add     x0, x0, x1
    st1     {v0.16b}, [x4], #16
    dup     v2.16b, w9
    st1     {v0.16b}, [x4], x6
    st1     {v2.16b}, [x4], #16
    dup     v4.16b, w10
    st1     {v2.16b}, [x4], x6
    ldrb    w11, [x0]
    add     x0, x0, x1
    st1     {v4.16b}, [x4], #16
    dup     v6.16b, w11
    st1     {v4.16b}, [x4], x6
    ldrb    w8, [x0]
    add     x0, x0, x1
    st1     {v6.16b}, [x4], #16
    ldrb    w9, [x0]
    add     x0, x0, x1
    dup     v0.16b, w8
    st1     {v6.16b}, [x4], x6
    ldrb    w10, [x0]
    add     x0, x0, x1
    st1     {v0.16b}, [x4], #16
    dup     v2.16b, w9
    st1     {v0.16b}, [x4], x6
    ldrb    w11, [x0]
    add     x0, x0, x1
    st1     {v2.16b}, [x4], #16
    dup     v4.16b, w10
    st1     {v2.16b}, [x4], x6
    st1     {v4.16b}, [x4], #16
    dup     v6.16b, w11
    st1     {v4.16b}, [x4], x6
    subs    x2, x2, #8
    st1     {v6.16b}, [x4], #16
    st1     {v6.16b}, [x4], x6
    bne     loop_32_r

end_func_r:
    ldp     x19, x20, [sp], #16
    pop_v_regs
    ret

//------------------------------------------------------------------------------
// void ih264_pad_right_chroma(UWORD8 *pu1_src, WORD32 src_strd, WORD32 ht,
//                             WORD32 pad_size)
// Replicates the last UV pair of each row across a 32-byte pad; ht is 4, 8
// or >= 12.  pu1_src points one past the last valid pair.
// x0: pu1_src   x1: src_strd   x2: ht   x3: pad_size
//------------------------------------------------------------------------------
    .global ih264_pad_right_chroma_av8
ih264_pad_right_chroma_av8:
    push_v_regs
    stp     x19, x20, [sp, #-16]!

    mov     x4, x0                      // pad area starts at pu1_src
    sub     x6, x1, #16
    sub     x0, x0, #2                  // last valid UV pair of each row

loop_32_r_c:                            // 4 rows per block
    ldrh    w8, [x0]
    add     x0, x0, x1
    ldrh    w9, [x0]
    add     x0, x0, x1
    dup     v0.8h, w8
    ldrh    w10, [x0]
    add     x0, x0, x1
    st1     {v0.16b}, [x4], #16
    dup     v2.8h, w9
    st1     {v0.16b}, [x4], x6
    st1     {v2.16b}, [x4], #16
    dup     v4.8h, w10
    st1     {v2.16b}, [x4], x6
    subs    x2, x2, #4
    ldrh    w11, [x0]
    add     x0, x0, x1
    st1     {v4.16b}, [x4], #16
    dup     v6.8h, w11
    st1     {v4.16b}, [x4], x6
    st1     {v6.16b}, [x4], #16
    st1     {v6.16b}, [x4], x6

    beq     end_func_r_c                // ht == 4 done

    ldrh    w8, [x0]
    add     x0, x0, x1
    dup     v0.8h, w8
    ldrh    w9, [x0]
    add     x0, x0, x1
    ldrh    w10, [x0]
    add     x0, x0, x1
    st1     {v0.16b}, [x4], #16
    dup     v2.8h, w9
    st1     {v0.16b}, [x4], x6
    ldrh    w11, [x0]
    add     x0, x0, x1
    st1     {v2.16b}, [x4], #16
    dup     v4.8h, w10
    st1     {v2.16b}, [x4], x6
    st1     {v4.16b}, [x4], #16
    dup     v6.8h, w11
    st1     {v4.16b}, [x4], x6
    subs    x2, x2, #4
    st1     {v6.16b}, [x4], #16
    st1     {v6.16b}, [x4], x6

    beq     end_func_r_c                // ht == 8 done
    bne     loop_32_r_c                 // ht >= 12: keep going
    // (an unreachable duplicate of the block above followed here; removed)

end_func_r_c:
    ldp     x19, x20, [sp], #16
    pop_v_regs
    ret
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_platform_macros.h +* +* @brief +* Platform specific Macro definitions used in the codec +* +* @author +* Ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ +#ifndef _IHEVC_PLATFORM_MACROS_H_ +#define _IHEVC_PLATFORM_MACROS_H_ + +#ifndef ARMV8 +void ih264_arm_dsb(void); + +#define DATA_SYNC() ih264_arm_dsb() +static __inline WORD32 CLIP_U8(WORD32 x) +{ + asm("usat %0, #8, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_S8(WORD32 x) +{ + asm("ssat %0, #8, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_U10(WORD32 x) +{ + asm("usat %0, #10, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_S10(WORD32 x) +{ + asm("ssat %0, #10, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_U12(WORD32 x) +{ + asm("usat %0, #12, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_S12(WORD32 x) +{ + asm("ssat %0, #12, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_U16(WORD32 x) +{ + asm("usat %0, #16, %1" : "=r"(x) : "r"(x)); + return x; +} +static __inline WORD32 CLIP_S16(WORD32 x) +{ + asm("ssat %0, #16, %1" : "=r"(x) : "r"(x)); + return x; +} + + +static __inline UWORD32 ITT_BIG_ENDIAN(UWORD32 x) +{ + asm("rev %0, %1" : "=r"(x) : "r"(x)); + return x; +} +#else +#define DATA_SYNC() ; + +#define CLIP_U8(x) CLIP3(0, 255, (x)) +#define CLIP_S8(x) CLIP3(-128, 127, (x)) + +#define CLIP_U10(x) CLIP3(0, 1023, (x)) +#define CLIP_S10(x) CLIP3(-512, 511, (x)) + +#define CLIP_U12(x) CLIP3(0, 4095, (x)) +#define CLIP_S12(x) CLIP3(-2048, 2047, (x)) + +#define CLIP_U16(x) CLIP3(0, 65535, (x)) +#define CLIP_S16(x) CLIP3(-32768, 32767, (x)) + +#define ITT_BIG_ENDIAN(x) ((x & 0x000000ff) << 24) | \ + ((x & 0x0000ff00) << 8) | \ + ((x & 0x00ff0000) >> 8) | \ + ((UWORD32)x >> 24); 
+#endif + +#define SHL(x,y) (((y) < 32) ? ((x) << (y)) : 0) +#define SHR(x,y) (((y) < 32) ? ((x) >> (y)) : 0) + +#define SHR_NEG(val,shift) ((shift>0)?(val>>shift):(val<<(-shift))) +#define SHL_NEG(val,shift) ((shift<0)?(val>>(-shift)):(val<<shift)) + +#define INLINE inline + +static INLINE UWORD32 CLZ(UWORD32 u4_word) +{ + if(u4_word) + return (__builtin_clz(u4_word)); + else + return 32; +} +static INLINE UWORD32 CTZ(UWORD32 u4_word) +{ + if(0 == u4_word) + return 31; + else + { + unsigned int index; + index = __builtin_ctz(u4_word); + return (UWORD32)index; + } +} + + +#define NOP(nop_cnt) {UWORD32 nop_i; for (nop_i = 0; nop_i < nop_cnt; nop_i++);} + + +#define MEM_ALIGN8 __attribute__ ((aligned (8))) +#define MEM_ALIGN16 __attribute__ ((aligned (16))) +#define MEM_ALIGN32 __attribute__ ((aligned (32))) + +#endif /* _IHEVC_PLATFORM_MACROS_H_ */ diff --git a/common/armv8/ih264_resi_trans_quant_av8.s b/common/armv8/ih264_resi_trans_quant_av8.s new file mode 100755 index 0000000..dc1c680 --- /dev/null +++ b/common/armv8/ih264_resi_trans_quant_av8.s @@ -0,0 +1,731 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore
//*/
///*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ih264_resi_trans_quant_av8.c
//*
//* @brief
//*  contains function definitions for residual and forward trans
//*
//* @author
//*  ittiam
//*
//* @par list of functions:
//*  ih264_resi_trans_quant_4x4_av8
//*  ih264_resi_trans_quant_8x8_av8
//*  ih264_resi_trans_quant_chroma_4x4_av8
//* @remarks
//*  none
//*
//*******************************************************************************
.include "ih264_neon_macros.s"
.text
.p2align 2
//*****************************************************************************
//*
//* function name     : ih264_resi_trans_quant_4x4
//* description       : this function does cf4 of h264 (residue computation,
//*                     4x4 forward core transform and quantization)
//*
//* arguments         : x0  :pointer to src buffer
//                      x1  :pointer to pred buffer
//                      x2  :pointer to dst buffer
//                      x3  :source stride
//                      x4  :pred stride,
//                      x5  :dst stride,
//                      x6  :pointer to scaling matrix,
//                      x7  :pointer to threshold matrix,
//                      stack : qbits,
//                              rounding factor,
//                              pointer to store nnz
//                              pointer to store non quantized dc value
// values returned    : none
//
// register usage     :
// stack usage        : 64 bytes
// cycles             :
// interruptibility   : interruptible
//
// known limitations
//   \assumptions     :
//
// revision history   :
//       dd mm yyyy   author(s)  changes
//       1  12 2013   100633     first version
//       20 1  2014   100633     changed the api, optimization
//
//*****************************************************************************

    .global ih264_resi_trans_quant_4x4_av8
ih264_resi_trans_quant_4x4_av8:

    // Arguments at entry (AAPCS64):
    //x0 :pointer to src buffer
    //x1 :pointer to pred buffer
    //x2 :pointer to dst buffer
    //x3 :source stride
    //x4 :pred stride
    //x5 :dst stride,
    //x6 :scale matrix,
    //x7 :threshold matrix
    //   :qbits                                    (stack)
    //   :round factor                             (stack)
    //   :nnz                                      (stack)
    //   :pointer to store non quantized dc value  (stack)
    push_v_regs
    // NOTE(review): the comments below reflect how the registers are used
    // after the stack loads; x5 holds the scale-matrix pointer from here on
    // (the dst-stride argument is not used by this routine).
    //x0 :pointer to src buffer
    //x1 :pointer to pred buffer
    //x2 :pointer to dst buffer
    //x3 :source stride
    //x4 :pred stride
    //x5 :scale matrix,
    //x6 :threshold matrix
    //x7 :qbits
    //x8 :round factor
    //x9 :nnz
    //x10:pointer to store non quantized dc value

    ldr   w8, [sp, #64]              //load round factor
    ldr   x10, [sp, #80]             //load address for non quant val
    neg   x7, x7                     //negate the qbit value for using lsl (sshl with negative = right shift)
    ldr   x9, [sp, #72]              //load nnz pointer

    //------------function loading done----------------;

    ld1   {v30.8b}, [x0], x3         //load first 8 pix src row 1
    ld1   {v31.8b}, [x1], x4         //load first 8 pix pred row 1
    ld1   {v28.8b}, [x0], x3         //load first 8 pix src row 2
    ld1   {v29.8b}, [x1], x4         //load first 8 pix pred row 2
    ld1   {v26.8b}, [x0], x3         //load first 8 pix src row 3
    ld1   {v27.8b}, [x1], x4         //load first 8 pix pred row 3
    ld1   {v24.8b}, [x0]             //load first 8 pix src row 4
    ld1   {v25.8b}, [x1]             //load first 8 pix pred row 4

    usubl v0.8h, v30.8b, v31.8b      //find residue row 1 (widen to 16 bit)
    usubl v2.8h, v28.8b, v29.8b      //find residue row 2
    usubl v4.8h, v26.8b, v27.8b      //find residue row 3
    usubl v6.8h, v24.8b, v25.8b      //find residue row 4

    //transpose the 4x4 residue block so rows become columns
    trn1  v1.4h, v0.4h, v2.4h
    trn2  v3.4h, v0.4h, v2.4h        //t12
    trn1  v5.4h, v4.4h, v6.4h
    trn2  v7.4h, v4.4h, v6.4h        //t23

    trn1  v0.2s, v1.2s, v5.2s
    trn2  v4.2s, v1.2s, v5.2s        //t13
    trn1  v2.2s, v3.2s, v7.2s
    trn2  v6.2s, v3.2s, v7.2s        //t14

    //horizontal butterfly of the H.264 forward core transform
    add   v8.4h, v0.4h, v6.4h        //x0 = x4+x7
    add   v9.4h, v2.4h, v4.4h        //x1 = x5+x6
    sub   v10.4h, v2.4h, v4.4h       //x2 = x5-x6
    sub   v11.4h, v0.4h, v6.4h       //x3 = x4-x7

    shl   v12.4h, v10.4h, #1         //u_shift(x2,1,shft)
    shl   v13.4h, v11.4h, #1         //u_shift(x3,1,shft)

    add   v14.4h, v8.4h, v9.4h       //x4 = x0 + x1;
    sub   v16.4h, v8.4h, v9.4h       //x6 = x0 - x1;
    add   v15.4h, v13.4h, v10.4h     //x5 = u_shift(x3,1,shft) + x2;
    sub   v17.4h, v11.4h, v12.4h     //x7 = x3 - u_shift(x2,1,shft);

    //taking transpose again so as to do the vertical transform
    trn1  v0.4h, v14.4h, v15.4h
    trn2  v1.4h, v14.4h, v15.4h      //t12
    trn1  v2.4h, v16.4h, v17.4h
    trn2  v3.4h, v16.4h, v17.4h      //t23

    trn1  v14.2s, v0.2s, v2.2s
    trn2  v16.2s, v0.2s, v2.2s       //t13
    trn1  v15.2s, v1.2s, v3.2s
    trn2  v17.2s, v1.2s, v3.2s       //t24

    //vertical transform: same butterfly as the horizontal pass
    add   v18.4h, v14.4h , v17.4h    //x0 = x4+x7
    add   v19.4h, v15.4h , v16.4h    //x1 = x5+x6
    sub   v20.4h, v15.4h , v16.4h    //x2 = x5-x6
    sub   v21.4h, v14.4h , v17.4h    //x3 = x4-x7

    shl   v22.4h, v20.4h, #1         //u_shift(x2,1,shft)
    shl   v23.4h, v21.4h, #1         //u_shift(x3,1,shft)

    dup   v8.4s, w8                  //load rounding value row 1

    add   v24.4h, v18.4h , v19.4h    //x5 = x0 + x1;
    sub   v26.4h, v18.4h , v19.4h    //x7 = x0 - x1;
    add   v25.4h, v23.4h , v20.4h    //x6 = u_shift(x3,1,shft) + x2;
    sub   v27.4h, v21.4h , v22.4h    //x8 = x3 - u_shift(x2,1,shft);

    dup   v23.4s, w8                 //load round factor values

    st1   {v24.h}[0], [x10]          //store the dc value to alternate dc address
//core transform is done for 4x8 block 1
    ld1   {v28.4h-v31.4h}, [x5]      //load the scaling values

    //quantization: |coef| * scale + round, then arithmetic shift by -qbits
    abs   v0.4h, v24.4h              //abs val of row 1
    abs   v1.4h, v25.4h              //abs val of row 2
    abs   v2.4h, v26.4h              //abs val of row 3
    abs   v3.4h, v27.4h              //abs val of row 4

    cmgt  v4.4h, v24.4h, #0          //sign masks (all-ones where coef > 0)
    cmgt  v5.4h, v25.4h, #0
    cmgt  v6.4h, v26.4h, #0
    cmgt  v7.4h, v27.4h, #0

    smull v0.4s, v0.4h, v28.4h       //multiply and add row 1
    smull v1.4s, v1.4h, v29.4h       //multiply and add row 2
    smull v2.4s, v2.4h, v30.4h       //multiply and add row 3
    smull v3.4s, v3.4h, v31.4h       //multiply and add row 4

    add   v20.4s, v0.4s, v23.4s      //add rounding factor
    add   v21.4s, v1.4s, v23.4s
    add   v22.4s, v2.4s, v23.4s
    add   v23.4s, v3.4s, v23.4s

    dup   v24.4s, w7                 //-qbits (negative shift = right shift)

    sshl  v20.4s, v20.4s, v24.4s     //shift row 1
    sshl  v21.4s, v21.4s, v24.4s     //shift row 2
    sshl  v22.4s, v22.4s, v24.4s     //shift row 3
    sshl  v23.4s, v23.4s, v24.4s     //shift row 4

    xtn   v20.4h, v20.4s             //narrow row 1
    xtn   v21.4h, v21.4s             //narrow row 2
    xtn   v22.4h, v22.4s             //narrow row 3
    xtn   v23.4h, v23.4s             //narrow row 4

    neg   v24.8h, v20.8h             //get negative
    neg   v25.8h, v21.8h             //get negative
    neg   v26.8h, v22.8h             //get negative
    neg   v27.8h, v23.8h             //get negative

    //compare with zero for computing nnz
    cmeq  v0.4h, v20.4h, #0
    cmeq  v1.4h, v21.4h, #0
    cmeq  v2.4h, v22.4h, #0
    cmeq  v3.4h, v23.4h, #0

    //bsl with the cmgt mask: positive coefs keep |q|, others take -|q|
    bsl   v4.8b, v20.8b, v24.8b      //restore sign of row 1
    bsl   v5.8b, v21.8b, v25.8b      //restore sign of row 2
    bsl   v6.8b, v22.8b, v26.8b      //restore sign of row 3
    bsl   v7.8b, v23.8b, v27.8b      //restore sign of row 4

    //narrow the comparison result
    mov   v0.d[1], v2.d[0]
    mov   v1.d[1], v3.d[0]

    xtn   v0.8b, v0.8h
    xtn   v1.8b, v1.8h

    ushr  v0.8b, v0.8b, #7           //reduce comparison mask to a single bit per coef
    ushr  v1.8b, v1.8b, #7           //reduce comparison mask to a single bit per coef

    //sum the zero-count bits down to one byte
    add   v0.8b, v0.8b, v1.8b        //pair add nnz 1
    addp  v0.8b, v0.8b, v0.8b        //pair add nnz 1
    addp  v0.8b, v0.8b, v0.8b        //pair add nnz 1
    addp  v0.8b, v0.8b, v0.8b        //pair add nnz 1

    st1   {v4.4h-v7.4h}, [x2]        //store blk

    movi  v25.8b, #16                //get max nnz
    sub   v26.8b, v25.8b , v0.8b     //invert current zero count to get nnz
    st1   {v26.b}[0], [x9]           //write nnz

    pop_v_regs
    ret


//*****************************************************************************
//*
//* function name     : ih264_resi_trans_quant_chroma_4x4
//* description       : this function does residue calculation, forward transform
//*                     and quantization for 4x4 chroma block.
//*
//* arguments         : x0  :pointer to src buffer
//                      x1  :pointer to pred buffer
//                      x2  :pointer to dst buffer
//                      x3  :source stride
//                      x4  :pred stride,
//                      x5  :dst stride,
//                      x6  :pointer to scaling matrix,
//                      x7  :pointer to threshold matrix,
//                      stack : qbits,
//                              rounding factor,
//                              pointer to store nnz
//                              pointer to store unquantized dc values
// values returned    : none
//
// register usage     :
// stack usage        : 64 bytes
// cycles             :
// interruptibility   : interruptible
//
// known limitations
//   \assumptions     :
//
// revision history   :
//       dd mm yyyy   author(s)  changes
//       11 2  2015   100664     first version
//       25 2  2015   100633     first av8 version
//*****************************************************************************

    .global ih264_resi_trans_quant_chroma_4x4_av8
ih264_resi_trans_quant_chroma_4x4_av8:

    // Arguments at entry (AAPCS64):
    //x0 :pointer to src buffer
    //x1 :pointer to pred buffer
    //x2 :pointer to dst buffer
    //x3 :source stride
    //stack :pred stride
    //      :scale matrix,
    //      :threshold matrix
    //      :qbits
    //      :round factor
    //      :nnz
    //      :pu1_dc_alt_addr
    push_v_regs
    // Register usage after the stack loads below:
    //x0 :pointer to src buffer
    //x1 :pointer to pred buffer
    //x2 :pointer to dst buffer
    //x3 :source stride
    //x4 :pred stride
    //x5 :scale matrix,
    //x6 :threshold matrix
    //x7 :qbits
    //x8 :round factor
    //x9 :nnz
    //x10:pointer to store non quantized dc value

    ldr   w8, [sp, #64]              //load round factor
    ldr   x10, [sp, #80]             //load address for non quant val
    neg   x7, x7                     //negate the qbit value for using lsl (sshl with negative = right shift)
    ldr   x9, [sp, #72]              //load nnz pointer
    //------------function loading done----------------;

    //chroma is interleaved (cb/cr); load 8 bytes = 4 pixels of one plane after deinterleave
    ld1   {v30.8b}, [x0], x3         //load first 8 pix src row 1
    ld1   {v31.8b}, [x1], x4         //load first 8 pix pred row 1
    ld1   {v28.8b}, [x0], x3         //load first 8 pix src row 2
    ld1   {v29.8b}, [x1], x4         //load first 8 pix pred row 2
    ld1   {v26.8b}, [x0], x3         //load first 8 pix src row 3
    ld1   {v27.8b}, [x1], x4         //load first 8 pix pred row 3
    ld1   {v24.8b}, [x0]             //load first 8 pix src row 4
    ld1   {v25.8b}, [x1]             //load first 8 pix pred row 4


    //deinterleave the loaded values (keep even lanes = one chroma plane)
    uzp1  v30.8b, v30.8b, v30.8b
    uzp1  v31.8b, v31.8b, v31.8b
    uzp1  v28.8b, v28.8b, v28.8b
    uzp1  v29.8b, v29.8b, v29.8b
    uzp1  v26.8b, v26.8b, v26.8b
    uzp1  v27.8b, v27.8b, v27.8b
    uzp1  v24.8b, v24.8b, v24.8b
    uzp1  v25.8b, v25.8b, v25.8b
    //this deinterleaving is the only difference between chroma and luma functions

    usubl v0.8h, v30.8b, v31.8b      //find residue row 1
    usubl v2.8h, v28.8b, v29.8b      //find residue row 2
    usubl v4.8h, v26.8b, v27.8b      //find residue row 3
    usubl v6.8h, v24.8b, v25.8b      //find residue row 4

    //transpose the 4x4 residue block
    trn1  v1.4h, v0.4h, v2.4h
    trn2  v3.4h, v0.4h, v2.4h        //t12
    trn1  v5.4h, v4.4h, v6.4h
    trn2  v7.4h, v4.4h, v6.4h        //t23

    trn1  v0.2s, v1.2s, v5.2s
    trn2  v4.2s, v1.2s, v5.2s        //t13
    trn1  v2.2s, v3.2s, v7.2s
    trn2  v6.2s, v3.2s, v7.2s        //t14

    //horizontal butterfly of the forward core transform
    add   v8.4h, v0.4h, v6.4h        //x0 = x4+x7
    add   v9.4h, v2.4h, v4.4h        //x1 = x5+x6
    sub   v10.4h, v2.4h, v4.4h       //x2 = x5-x6
    sub   v11.4h, v0.4h, v6.4h       //x3 = x4-x7

    shl   v12.4h, v10.4h, #1         //u_shift(x2,1,shft)
    shl   v13.4h, v11.4h, #1         //u_shift(x3,1,shft)

    add   v14.4h, v8.4h, v9.4h       //x4 = x0 + x1;
    sub   v16.4h, v8.4h, v9.4h       //x6 = x0 - x1;
    add   v15.4h, v13.4h, v10.4h     //x5 = u_shift(x3,1,shft) + x2;
    sub   v17.4h, v11.4h, v12.4h     //x7 = x3 - u_shift(x2,1,shft);

    //taking transpose again so as to do the vertical transform
    trn1  v0.4h, v14.4h, v15.4h
    trn2  v1.4h, v14.4h, v15.4h      //t12
    trn1  v2.4h, v16.4h, v17.4h
    trn2  v3.4h, v16.4h, v17.4h      //t23

    trn1  v14.2s, v0.2s, v2.2s
    trn2  v16.2s, v0.2s, v2.2s       //t13
    trn1  v15.2s, v1.2s, v3.2s
    trn2  v17.2s, v1.2s, v3.2s       //t24

    //vertical transform: same butterfly as the horizontal pass
    add   v18.4h, v14.4h , v17.4h    //x0 = x4+x7
    add   v19.4h, v15.4h , v16.4h    //x1 = x5+x6
    sub   v20.4h, v15.4h , v16.4h    //x2 = x5-x6
    sub   v21.4h, v14.4h , v17.4h    //x3 = x4-x7

    shl   v22.4h, v20.4h, #1         //u_shift(x2,1,shft)
    shl   v23.4h, v21.4h, #1         //u_shift(x3,1,shft)

    dup   v8.4s, w8                  //load rounding value row 1

    add   v24.4h, v18.4h , v19.4h    //x5 = x0 + x1;
    sub   v26.4h, v18.4h , v19.4h    //x7 = x0 - x1;
    add   v25.4h, v23.4h , v20.4h    //x6 = u_shift(x3,1,shft) + x2;
    sub   v27.4h, v21.4h , v22.4h    //x8 = x3 - u_shift(x2,1,shft);

    dup   v23.4s, w8                 //load round factor values

    st1   {v24.h}[0], [x10]          //store the dc value to alternate dc address
//core transform is done for 4x8 block 1
    ld1   {v28.4h-v31.4h}, [x5]      //load the scaling values

    //quantization: |coef| * scale + round, then arithmetic shift by -qbits
    abs   v0.4h, v24.4h              //abs val of row 1
    abs   v1.4h, v25.4h              //abs val of row 2
    abs   v2.4h, v26.4h              //abs val of row 3
    abs   v3.4h, v27.4h              //abs val of row 4

    cmgt  v4.4h, v24.4h, #0          //sign masks (all-ones where coef > 0)
    cmgt  v5.4h, v25.4h, #0
    cmgt  v6.4h, v26.4h, #0
    cmgt  v7.4h, v27.4h, #0

    smull v0.4s, v0.4h, v28.4h       //multiply and add row 1
    smull v1.4s, v1.4h, v29.4h       //multiply and add row 2
    smull v2.4s, v2.4h, v30.4h       //multiply and add row 3
    smull v3.4s, v3.4h, v31.4h       //multiply and add row 4

    add   v20.4s, v0.4s, v23.4s      //add rounding factor
    add   v21.4s, v1.4s, v23.4s
    add   v22.4s, v2.4s, v23.4s
    add   v23.4s, v3.4s, v23.4s

    dup   v24.4s, w7                 //-qbits (negative shift = right shift)

    sshl  v20.4s, v20.4s, v24.4s     //shift row 1
    sshl  v21.4s, v21.4s, v24.4s     //shift row 2
    sshl  v22.4s, v22.4s, v24.4s     //shift row 3
    sshl  v23.4s, v23.4s, v24.4s     //shift row 4

    xtn   v20.4h, v20.4s             //narrow row 1
    xtn   v21.4h, v21.4s             //narrow row 2
    xtn   v22.4h, v22.4s             //narrow row 3
    xtn   v23.4h, v23.4s             //narrow row 4

    neg   v24.8h, v20.8h             //get negative
    neg   v25.8h, v21.8h             //get negative
    neg   v26.8h, v22.8h             //get negative
    neg   v27.8h, v23.8h             //get negative

    //compare with zero for computing nnz
    cmeq  v0.4h, v20.4h, #0
    cmeq  v1.4h, v21.4h, #0
    cmeq  v2.4h, v22.4h, #0
    cmeq  v3.4h, v23.4h, #0

    //bsl with the cmgt mask: positive coefs keep |q|, others take -|q|
    bsl   v4.8b, v20.8b, v24.8b      //restore sign of row 1
    bsl   v5.8b, v21.8b, v25.8b      //restore sign of row 2
    bsl   v6.8b, v22.8b, v26.8b      //restore sign of row 3
    bsl   v7.8b, v23.8b, v27.8b      //restore sign of row 4

    //narrow the comparison result
    mov   v0.d[1], v2.d[0]
    mov   v1.d[1], v3.d[0]

    xtn   v0.8b, v0.8h
    xtn   v1.8b, v1.8h

    ushr  v0.8b, v0.8b, #7           //reduce comparison mask to a single bit per coef
    ushr  v1.8b, v1.8b, #7           //reduce comparison mask to a single bit per coef

    //sum the zero-count bits down to one byte
    add   v0.8b, v0.8b, v1.8b        //pair add nnz 1
    addp  v0.8b, v0.8b, v0.8b        //pair add nnz 1
    addp  v0.8b, v0.8b, v0.8b        //pair add nnz 1
    addp  v0.8b, v0.8b, v0.8b        //pair add nnz 1

    st1   {v4.4h-v7.4h}, [x2]        //store blk

    movi  v25.8b, #16                //get max nnz
    sub   v26.8b, v25.8b , v0.8b     //invert current zero count to get nnz
    st1   {v26.b}[0], [x9]           //write nnz

    pop_v_regs
    ret


//*****************************************************************************
//*
//* function name     : ih264_hadamard_quant_4x4_av8
//* description       : this function does forward hadamard transform and
//*                     quantization for luma dc block
//*
//* arguments         : x0  :pointer to src buffer
//                      x1  :pointer to dst buffer
//                      x2  :pu2_scale_matrix
//                      x2  :pu2_threshold_matrix
//                      x3  :u4_qbits
//                      x4  :u4_round_factor
//                      x5  :pu1_nnz
// values returned    : none
//
// register usage     :
// stack usage        : 0 bytes
// cycles             : around
// interruptibility   : interruptible
//
// known limitations
//   \assumptions     :
//
// revision history   :
//       dd mm yyyy   author(s)  changes
//       20 2  2015   100633     first version
//
//*****************************************************************************
//ih264_hadamard_quant_4x4_av8(word16 *pi2_src, word16 *pi2_dst,
//                             const uword16 *pu2_scale_matrix,
//                             const uword16 *pu2_threshold_matrix, uword32 u4_qbits,
//                             uword32 u4_round_factor,uword8  *pu1_nnz
//                             )
    .global ih264_hadamard_quant_4x4_av8
ih264_hadamard_quant_4x4_av8:

//Actual register assignment (AAPCS64), correcting the prototype comment above:
//x0 :pointer to src buffer
//x1 :pointer to dst buffer
//x2 :pu2_scale_matrix
//x3 :pu2_threshold_matrix
//x4 :u4_qbits
//x5 :u4_round_factor
//x6 :pu1_nnz

    push_v_regs

    ld4   {v0.4h-v3.4h}, [x0]        //load 4x4 block (de-interleaved by column)
    ld1   {v30.h}[0], [x2]           //load pu2_scale_matrix[0]

    saddl v4.4s, v0.4h, v3.4h        //x0 = x4 + x7;
    //(continuation of ih264_hadamard_quant_4x4_av8 — entry, ld4 load and the
    // first saddl of the horizontal butterfly are above this chunk)
    saddl v5.4s, v1.4h, v2.4h        //x1 = x5 + x6;
    ssubl v6.4s, v1.4h, v2.4h        //x2 = x5 - x6;
    ssubl v7.4s, v0.4h, v3.4h        //x3 = x4 - x7;

    dup   v30.8h, v30.h[0]           //pu2_scale_matrix[0] replicated to all lanes

    //horizontal hadamard butterfly (32-bit intermediates)
    add   v14.4s, v4.4s, v5.4s       //pi2_dst[0] = x0 + x1;
    add   v15.4s, v7.4s, v6.4s       //pi2_dst[1] = x3 + x2;
    sub   v16.4s, v4.4s, v5.4s       //pi2_dst[2] = x0 - x1;
    sub   v17.4s, v7.4s, v6.4s       //pi2_dst[3] = x3 - x2;

    //transpose 4x4 block
    trn1  v18.4s, v14.4s, v15.4s
    trn2  v19.4s, v14.4s, v15.4s
    trn1  v20.4s, v16.4s, v17.4s
    trn2  v21.4s, v16.4s, v17.4s

    trn1  v14.2d, v18.2d, v20.2d
    trn2  v16.2d, v18.2d, v20.2d
    trn1  v15.2d, v19.2d, v21.2d
    trn2  v17.2d, v19.2d, v21.2d
    //end transpose

    //vertical hadamard butterfly
    add   v18.4s, v14.4s, v17.4s     //x0 = x4 + x7;
    add   v19.4s, v15.4s, v16.4s     //x1 = x5 + x6;
    sub   v20.4s, v15.4s, v16.4s     //x2 = x5 - x6;
    sub   v21.4s, v14.4s, v17.4s     //x3 = x4 - x7;

    dup   v14.4s, w5                 //round factor
    dup   v15.4s, v14.s[0]
    dup   v16.4s, v14.s[0]
    dup   v17.4s, v14.s[0]

    add   v22.4s, v18.4s, v19.4s     //(x0 + x1)
    add   v23.4s, v21.4s, v20.4s     //(x3 + x2)
    sub   v24.4s, v18.4s, v19.4s     //(x0 - x1)
    sub   v25.4s, v21.4s, v20.4s     //(x3 - x2)

    shrn  v0.4h, v22.4s, #1          //i4_value = (x0 + x1) >> 1;
    shrn2 v0.8h, v23.4s, #1          //i4_value = (x3 + x2) >> 1;
    shrn  v1.4h, v24.4s, #1          //i4_value = (x0 - x1) >> 1;
    shrn2 v1.8h, v25.4s, #1          //i4_value = (x3 - x2) >> 1;

    //quantization: |coef| * scale + round, then shift by -u4_qbits
    abs   v2.8h, v0.8h
    abs   v3.8h, v1.8h

    cmgt  v4.8h, v0.8h, #0           //get the sign row 1,2 (all-ones where coef > 0)
    cmgt  v5.8h, v1.8h, #0

    neg   w4, w4                     //-u4_qbits
    dup   v22.4s, w4                 //load -u4_qbits

    umlal v14.4s, v2.4h, v30.4h      //round + |coef| * scale (low half row 1,2)
    umlal2 v15.4s, v2.8h, v30.8h
    umlal v16.4s, v3.4h, v30.4h
    umlal2 v17.4s, v3.8h, v30.8h

    ushl  v14.4s, v14.4s, v22.4s     //negative shift = right shift by u4_qbits
    ushl  v15.4s, v15.4s, v22.4s
    ushl  v16.4s, v16.4s, v22.4s
    ushl  v17.4s, v17.4s, v22.4s

    uqxtn v14.4h, v14.4s             //saturating narrow back to 16 bit
    uqxtn2 v14.8h, v15.4s
    uqxtn v16.4h, v16.4s
    uqxtn2 v16.8h, v17.4s

    neg   v15.8h, v14.8h
    neg   v17.8h, v16.8h

    bsl   v4.16b, v14.16b, v15.16b   //restore sign: positive coefs keep |q|, else -|q|
    bsl   v5.16b, v16.16b, v17.16b

    cmeq  v0.8h, v14.8h, #0          //zero masks for nnz computation
    cmeq  v1.8h, v16.8h, #0

    st1   {v4.8h-v5.8h}, [x1]        //store the quantized dc block

    movi  v20.8b, #16                //max nnz

    xtn   v2.8b, v0.8h
    xtn   v3.8b, v1.8h

    ushr  v2.8b, v2.8b, #7           //one bit per zero coefficient
    ushr  v3.8b, v3.8b, #7

    //sum the zero-count bits down to one byte, then nnz = 16 - zeros
    add   v2.8b, v2.8b, v3.8b
    addp  v2.8b, v2.8b, v2.8b
    addp  v2.8b, v2.8b, v2.8b
    addp  v2.8b, v2.8b, v2.8b
    sub   v20.8b, v20.8b, v2.8b
    st1   {v20.b}[0], [x6]           //write nnz

    pop_v_regs
    ret


//*****************************************************************************
//*
//* function name     : ih264_hadamard_quant_2x2_uv
//* description       : this function does forward hadamard transform and
//*                     quantization for dc block of chroma for both planes
//*
//* arguments         : x0  :pointer to src buffer
//                      x1  :pointer to dst buffer
//                      x2  :pu2_scale_matrix
//                      x2  :pu2_threshold_matrix
//                      x3  :u4_qbits
//                      x4  :u4_round_factor
//                      x5  :pu1_nnz
// values returned    : none
//
// register usage     :
// stack usage        : 0 bytes
// cycles             : around
// interruptibility   : interruptible
//
// known limitations
//   \assumptions     :
//
// revision history   :
//       dd mm yyyy   author(s)  changes
//       20 2  2015   100633     first version
//
//*****************************************************************************
// ih264_hadamard_quant_2x2_uv_av8(word16 *pi2_src, word16 *pi2_dst,
//                                 const uword16 *pu2_scale_matrix,
//                                 const uword16 *pu2_threshold_matrix, uword32 u4_qbits,
//                                 uword32 u4_round_factor,uword8  *pu1_nnz
//                                 )

    .global ih264_hadamard_quant_2x2_uv_av8
ih264_hadamard_quant_2x2_uv_av8:

    push_v_regs

    ld2   {v0.4h-v1.4h}, [x0]        //load src (both chroma planes, de-interleaved)

    ld1   {v30.h}[0], [x2]           //load pu2_scale_matrix[0]
    dup   v30.4h, v30.4h[0]          //pu2_scale_matrix
    uxtl  v30.4s, v30.4h             //pu2_scale_matrix widened to 32 bit

    neg   w4, w4                     //-u4_qbits
    dup   v24.4s, w4                 //u4_qbits (negated, for right shift via ushl)

    dup   v25.4s, w5                 //round fact
    dup   v26.4s, v25.s[0]

    //2x2 hadamard butterfly for both planes at once
    saddl v2.4s, v0.4h, v1.4h        //x0 = x4 + x5;, x2 = x6 + x7;
    ssubl v3.4s, v0.4h, v1.4h        //x1 = x4 - x5;  x3 = x6 - x7;

    trn1  v4.4s, v2.4s, v3.4s
    trn2  v5.4s, v2.4s, v3.4s        //q1 -> x0 x1, q2 -> x2 x3

    add   v0.4s, v4.4s , v5.4s       //(x0 + x2) (x1 + x3) (y0 + y2) (y1 + y3)
(y1 + y3); + sub v1.4s, v4.4s , v5.4s // (x0 - x2) (x1 - x3) (y0 - y2); (y1 - y3); + + abs v2.4s, v0.4s + abs v3.4s, v1.4s + + cmgt v4.4s, v0.4s, #0 //get the sign row 1,2 + cmgt v5.4s, v1.4s, #0 + + uqxtn v4.4h, v4.4s + sqxtn2 v4.8h, v5.4s + + mla v25.4s, v2.4s, v30.4s + mla v26.4s, v3.4s, v30.4s + + ushl v2.4s, v25.4s, v24.4s //>>qbit + ushl v3.4s, v26.4s, v24.4s //>>qbit + + uqxtn v2.4h, v2.4s + uqxtn2 v2.8h, v3.4s + + neg v5.8h, v2.8h + + bsl v4.16b, v2.16b, v5.16b //*sign + + //rearrange such that we get each plane coeffs as continous + mov v5.s[0], v4.s[1] + mov v4.s[1], v4.s[2] + mov v4.s[2], v5.s[0] + + cmeq v5.8h, v4.8h, #0 //compute nnz + xtn v5.8b, v5.8h //reduce nnz comparison to 1 bit + ushr v5.8b, v5.8b, #7 //reduce nnz comparison to 1 bit + movi v20.8b, #4 //since we add zeros, we need to subtract from 4 to get nnz + addp v5.8b, v5.8b, v5.8b //sum up nnz + addp v5.8b, v5.8b, v5.8b //sum up nnz + + st1 {v4.8h}, [x1] //store the block + + st1 {v4.8h}, [x1] //store the block + sub v20.8b, v20.8b, v5.8b //4- numzeros + + st1 {v20.h}[0], [x6] //store nnz + + pop_v_regs + ret + + + diff --git a/common/armv8/ih264_weighted_bi_pred_av8.s b/common/armv8/ih264_weighted_bi_pred_av8.s new file mode 100755 index 0000000..f7d0846 --- /dev/null +++ b/common/armv8/ih264_weighted_bi_pred_av8.s @@ -0,0 +1,574 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +//*/ +///** +//****************************************************************************** +//* @file +//* ih264_weighted_bi_pred_av8.s +//* +//* @brief +//* Contains function definitions for weighted biprediction. +//* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT +//* +//* @author +//* Kaushik Senthoor R +//* +//* @par List of Functions: +//* +//* - ih264_weighted_bi_pred_luma_av8() +//* - ih264_weighted_bi_pred_chroma_av8() +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ +//******************************************************************************* +//* @function +//* ih264_weighted_bi_pred_luma_av8() +//* +//* @brief +//* This routine performs the default weighted prediction as described in sec +//* 8.4.2.3.2 titled "Weighted sample prediction process" for luma. +//* +//* @par Description: +//* This function gets two ht x wd blocks, calculates the weighted samples, +//* rounds off, adds offset and stores it in the destination block. +//* +//* @param[in] puc_src1 +//* UWORD8 Pointer to the buffer containing the input block 1. +//* +//* @param[in] puc_src2 +//* UWORD8 Pointer to the buffer containing the input block 2. +//* +//* @param[out] puc_dst +//* UWORD8 pointer to the destination where the output block is stored. 
+//* +//* @param[in] src_strd1 +//* Stride of the input buffer 1 +//* +//* @param[in] src_strd2 +//* Stride of the input buffer 2 +//* +//* @param[in] dst_strd +//* Stride of the destination buffer +//* +//* @param[in] log_WD +//* number of bits to be rounded off +//* +//* @param[in] wt1 +//* weight for the weighted prediction +//* +//* @param[in] wt2 +//* weight for the weighted prediction +//* +//* @param[in] ofst1 +//* offset 1 used after rounding off +//* +//* @param[in] ofst2 +//* offset 2 used after rounding off +//* +//* @param[in] ht +//* integer height of the array +//* +//* @param[in] wd +//* integer width of the array +//* +//* @returns +//* None +//* +//* @remarks +//* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16). +//* +//******************************************************************************* +//*/ +//void ih264_weighted_bi_pred_luma_av8(UWORD8 *puc_src1, +// UWORD8 *puc_src2, +// UWORD8 *puc_dst, +// WORD32 src_strd1, +// WORD32 src_strd2, +// WORD32 dst_strd, +// UWORD16 log_WD, +// UWORD32 wt1, +// UWORD32 wt2, +// UWORD16 ofst1, +// UWORD16 ofst2, +// UWORD8 ht, +// UWORD8 wd) +// +//**************Variables Vs Registers***************************************** +// x0 => puc_src1 +// x1 => puc_src2 +// x2 => puc_dst +// x3 => src_strd1 +// [sp] => src_strd2 (x4) +// [sp+4] => dst_strd (x5) +// [sp+8] => log_WD (x6) +// [sp+12] => wt1 (x7) +// [sp+16] => wt2 (x8) +// [sp+20] => ofst1 (x9) +// [sp+24] => ofst2 (x10) +// [sp+28] => ht (x11) +// [sp+32] => wd (x12) +// +.text +.p2align 2 +.include "ih264_neon_macros.s" + + + + .global ih264_weighted_bi_pred_luma_av8 + +ih264_weighted_bi_pred_luma_av8: + + // STMFD sp!, {x4-x12,x14} //stack stores the values of the arguments + push_v_regs + stp x19, x20, [sp, #-16]! 
    //(continuation of ih264_weighted_bi_pred_luma_av8 — the label, push_v_regs
    // and the stp of x19/x20 are above this chunk)
    ldr   x8, [sp, #80]              //Load wt2 in x8
    ldr   x9, [sp, #88]              //Load ofst1 in x9
    add   x6, x6, #1                 //x6  = log_WD + 1
    sub   x20, x6, #0                //x20 = log_WD + 1 (copy; negated next)
    neg   x10, x20                   //x10 = -(log_WD + 1)
    dup   v0.8h, w10                 //Q0  = -(log_WD + 1) (per-lane shift count for srshl)
    ldr   x10, [sp, #96]             //Load ofst2 in x10
    ldr   x11, [sp, #104]            //Load ht in x11
    ldr   x12, [sp, #112]            //Load wd in x12
    add   x9, x9, #1                 //x9 = ofst1 + 1
    add   x9, x9, x10                //x9 = ofst1 + ofst2 + 1
    mov   v2.s[0], w7
    mov   v2.s[1], w8                //D2 = {wt1(32-bit), wt2(32-bit)}
    asr   x9, x9, #1                 //x9 = ofst = (ofst1 + ofst2 + 1) >> 1
    dup   v3.8b, w9                  //D3 = ofst (8-bit)
    cmp   w12, #16
    beq   loop_16                    //branch if wd is 16
    cmp   w12, #8                    //check if wd is 8
    beq   loop_8                     //branch if wd is 8

loop_4: //each iteration processes four rows of a 4-wide block

    ld1   {v4.s}[0], [x0], x3        //load row 1 in source 1
    ld1   {v4.s}[1], [x0], x3        //load row 2 in source 1
    ld1   {v6.s}[0], [x1], x4        //load row 1 in source 2
    ld1   {v6.s}[1], [x1], x4        //load row 2 in source 2
    uxtl  v4.8h, v4.8b               //converting rows 1,2 in source 1 to 16-bit
    ld1   {v8.s}[0], [x0], x3        //load row 3 in source 1
    ld1   {v8.s}[1], [x0], x3        //load row 4 in source 1
    uxtl  v6.8h, v6.8b               //converting rows 1,2 in source 2 to 16-bit
    ld1   {v10.s}[0], [x1], x4       //load row 3 in source 2
    ld1   {v10.s}[1], [x1], x4       //load row 4 in source 2
    uxtl  v8.8h, v8.8b               //converting rows 3,4 in source 1 to 16-bit
    uxtl  v10.8h, v10.8b             //converting rows 3,4 in source 2 to 16-bit
    mul   v4.8h, v4.8h , v2.4h[0]    //weight 1 mult. for rows 1,2
    mla   v4.8h, v6.8h , v2.4h[2]    //weight 2 mult. for rows 1,2
    mul   v8.8h, v8.8h , v2.4h[0]    //weight 1 mult. for rows 3,4
    mla   v8.8h, v10.8h , v2.4h[2]   //weight 2 mult. for rows 3,4
    subs  w11, w11, #4               //decrement ht by 4
    srshl v4.8h, v4.8h , v0.8h       //rounds off the weighted samples from rows 1,2
    srshl v8.8h, v8.8h , v0.8h       //rounds off the weighted samples from rows 3,4
    saddw v4.8h, v4.8h , v3.8b       //adding offset for rows 1,2
    saddw v8.8h, v8.8h , v3.8b       //adding offset for rows 3,4
    sqxtun v4.8b, v4.8h              //saturating rows 1,2 to unsigned 8-bit
    sqxtun v8.8b, v8.8h              //saturating rows 3,4 to unsigned 8-bit
    st1   {v4.s}[0], [x2], x5        //store row 1 in destination
    st1   {v4.s}[1], [x2], x5        //store row 2 in destination
    st1   {v8.s}[0], [x2], x5        //store row 3 in destination
    st1   {v8.s}[1], [x2], x5        //store row 4 in destination
    bgt   loop_4                     //if greater than 0 repeat the loop again
    b     end_loops

loop_8: //each iteration processes four rows of an 8-wide block

    ld1   {v4.8b}, [x0], x3          //load row 1 in source 1
    ld1   {v6.8b}, [x1], x4          //load row 1 in source 2
    ld1   {v8.8b}, [x0], x3          //load row 2 in source 1
    ld1   {v10.8b}, [x1], x4         //load row 2 in source 2
    uxtl  v4.8h, v4.8b               //converting row 1 in source 1 to 16-bit
    ld1   {v12.8b}, [x0], x3         //load row 3 in source 1
    ld1   {v14.8b}, [x1], x4         //load row 3 in source 2
    uxtl  v6.8h, v6.8b               //converting row 1 in source 2 to 16-bit
    ld1   {v16.8b}, [x0], x3         //load row 4 in source 1
    ld1   {v18.8b}, [x1], x4         //load row 4 in source 2
    uxtl  v8.8h, v8.8b               //converting row 2 in source 1 to 16-bit
    uxtl  v10.8h, v10.8b             //converting row 2 in source 2 to 16-bit
    mul   v4.8h, v4.8h , v2.4h[0]    //weight 1 mult. for row 1
    mla   v4.8h, v6.8h , v2.4h[2]    //weight 2 mult. for row 1
    uxtl  v12.8h, v12.8b             //converting row 3 in source 1 to 16-bit
    uxtl  v14.8h, v14.8b             //converting row 3 in source 2 to 16-bit
    mul   v8.8h, v8.8h , v2.4h[0]    //weight 1 mult. for row 2
    mla   v8.8h, v10.8h , v2.4h[2]   //weight 2 mult. for row 2
    uxtl  v16.8h, v16.8b             //converting row 4 in source 1 to 16-bit
    uxtl  v18.8h, v18.8b             //converting row 4 in source 2 to 16-bit
    mul   v12.8h, v12.8h , v2.4h[0]  //weight 1 mult. for row 3
    mla   v12.8h, v14.8h , v2.4h[2]  //weight 2 mult. for row 3
    mul   v16.8h, v16.8h , v2.4h[0]  //weight 1 mult. for row 4
    mla   v16.8h, v18.8h , v2.4h[2]  //weight 2 mult. for row 4
    srshl v4.8h, v4.8h , v0.8h       //rounds off the weighted samples from row 1
    srshl v8.8h, v8.8h , v0.8h       //rounds off the weighted samples from row 2
    srshl v12.8h, v12.8h , v0.8h     //rounds off the weighted samples from row 3
    saddw v4.8h, v4.8h , v3.8b       //adding offset for row 1
    srshl v16.8h, v16.8h , v0.8h     //rounds off the weighted samples from row 4
    saddw v8.8h, v8.8h , v3.8b       //adding offset for row 2
    saddw v12.8h, v12.8h , v3.8b     //adding offset for row 3
    sqxtun v4.8b, v4.8h              //saturating row 1 to unsigned 8-bit
    saddw v16.8h, v16.8h , v3.8b     //adding offset for row 4
    sqxtun v8.8b, v8.8h              //saturating row 2 to unsigned 8-bit
    sqxtun v12.8b, v12.8h            //saturating row 3 to unsigned 8-bit
    sqxtun v16.8b, v16.8h            //saturating row 4 to unsigned 8-bit
    st1   {v4.8b}, [x2], x5          //store row 1 in destination
    st1   {v8.8b}, [x2], x5          //store row 2 in destination
    subs  w11, w11, #4               //decrement ht by 4
    st1   {v12.8b}, [x2], x5         //store row 3 in destination
    st1   {v16.8b}, [x2], x5         //store row 4 in destination
    bgt   loop_8                     //if greater than 0 repeat the loop again
    b     end_loops

loop_16: //each iteration processes four rows of a 16-wide block (L/H = low/high 8 pixels)

    ld1   {v4.8b, v5.8b}, [x0], x3   //load row 1 in source 1
    ld1   {v6.8b, v7.8b}, [x1], x4   //load row 1 in source 2
    ld1   {v8.8b, v9.8b}, [x0], x3   //load row 2 in source 1
    ld1   {v10.8b, v11.8b}, [x1], x4 //load row 2 in source 2
    uxtl  v20.8h, v4.8b              //converting row 1L in source 1 to 16-bit
    ld1   {v12.8b, v13.8b}, [x0], x3 //load row 3 in source 1
    ld1   {v14.8b, v15.8b}, [x1], x4 //load row 3 in source 2
    uxtl  v22.8h, v6.8b              //converting row 1L in source 2 to 16-bit
    ld1   {v16.8b, v17.8b}, [x0], x3 //load row 4 in source 1
    ld1   {v18.8b, v19.8b}, [x1], x4 //load row 4 in source 2
    uxtl  v4.8h, v5.8b               //converting row 1H in source 1 to 16-bit
    uxtl  v6.8h, v7.8b               //converting row 1H in source 2 to 16-bit
    mul   v20.8h, v20.8h , v2.4h[0]  //weight 1 mult. for row 1L
    mla   v20.8h, v22.8h , v2.4h[2]  //weight 2 mult. for row 1L
    uxtl  v24.8h, v8.8b              //converting row 2L in source 1 to 16-bit
    uxtl  v26.8h, v10.8b             //converting row 2L in source 2 to 16-bit
    mul   v4.8h, v4.8h , v2.4h[0]    //weight 1 mult. for row 1H
    mla   v4.8h, v6.8h , v2.4h[2]    //weight 2 mult. for row 1H
    uxtl  v8.8h, v9.8b               //converting row 2H in source 1 to 16-bit
    uxtl  v10.8h, v11.8b             //converting row 2H in source 2 to 16-bit
    mul   v24.8h, v24.8h , v2.4h[0]  //weight 1 mult. for row 2L
    mla   v24.8h, v26.8h , v2.4h[2]  //weight 2 mult. for row 2L
    uxtl  v28.8h, v12.8b             //converting row 3L in source 1 to 16-bit
    uxtl  v30.8h, v14.8b             //converting row 3L in source 2 to 16-bit
    mul   v8.8h, v8.8h , v2.4h[0]    //weight 1 mult. for row 2H
    mla   v8.8h, v10.8h , v2.4h[2]   //weight 2 mult. for row 2H
    uxtl  v12.8h, v13.8b             //converting row 3H in source 1 to 16-bit
    uxtl  v14.8h, v15.8b             //converting row 3H in source 2 to 16-bit
    mul   v28.8h, v28.8h , v2.4h[0]  //weight 1 mult. for row 3L
    mla   v28.8h, v30.8h , v2.4h[2]  //weight 2 mult. for row 3L
    uxtl  v22.8h, v16.8b             //converting row 4L in source 1 to 16-bit
    uxtl  v6.8h, v18.8b              //converting row 4L in source 2 to 16-bit
    mul   v12.8h, v12.8h , v2.4h[0]  //weight 1 mult. for row 3H
    mla   v12.8h, v14.8h , v2.4h[2]  //weight 2 mult. for row 3H
    uxtl  v16.8h, v17.8b             //converting row 4H in source 1 to 16-bit
    uxtl  v18.8h, v19.8b             //converting row 4H in source 2 to 16-bit
    mul   v22.8h, v22.8h , v2.4h[0]  //weight 1 mult. for row 4L
    mla   v22.8h, v6.8h , v2.4h[2]   //weight 2 mult. for row 4L
    srshl v20.8h, v20.8h , v0.8h     //rounds off the weighted samples from row 1L
    mul   v16.8h, v16.8h , v2.4h[0]  //weight 1 mult. for row 4H
    mla   v16.8h, v18.8h , v2.4h[2]  //weight 2 mult. for row 4H
    srshl v4.8h, v4.8h , v0.8h       //rounds off the weighted samples from row 1H
    srshl v24.8h, v24.8h , v0.8h     //rounds off the weighted samples from row 2L
    saddw v20.8h, v20.8h , v3.8b     //adding offset for row 1L
    srshl v8.8h, v8.8h , v0.8h       //rounds off the weighted samples from row 2H
    saddw v4.8h, v4.8h , v3.8b       //adding offset for row 1H
    srshl v28.8h, v28.8h , v0.8h     //rounds off the weighted samples from row 3L
    saddw v24.8h, v24.8h , v3.8b     //adding offset for row 2L
    srshl v12.8h, v12.8h , v0.8h     //rounds off the weighted samples from row 3H
    saddw v8.8h, v8.8h , v3.8b       //adding offset for row 2H
    srshl v22.8h, v22.8h , v0.8h     //rounds off the weighted samples from row 4L
    saddw v28.8h, v28.8h , v3.8b     //adding offset for row 3L
    srshl v16.8h, v16.8h , v0.8h     //rounds off the weighted samples from row 4H
    saddw v12.8h, v12.8h , v3.8b     //adding offset for row 3H
    sqxtun v26.8b, v20.8h            //saturating row 1L to unsigned 8-bit
    saddw v22.8h, v22.8h , v3.8b     //adding offset for row 4L
    sqxtun v27.8b, v4.8h             //saturating row 1H to unsigned 8-bit
    saddw v16.8h, v16.8h , v3.8b     //adding offset for row 4H
    sqxtun v10.8b, v24.8h            //saturating row 2L to unsigned 8-bit
    sqxtun v11.8b, v8.8h             //saturating row 2H to unsigned 8-bit
    sqxtun v30.8b, v28.8h            //saturating row 3L to unsigned 8-bit
    sqxtun v31.8b, v12.8h            //saturating row 3H to unsigned 8-bit
    st1   {v26.8b, v27.8b}, [x2], x5 //store row 1 in destination
    sqxtun v14.8b, v22.8h            //saturating row 4L to unsigned 8-bit
    sqxtun v15.8b, v16.8h            //saturating row 4H to unsigned 8-bit
    st1   {v10.8b, v11.8b}, [x2], x5 //store row 2 in destination
    subs  w11, w11, #4               //decrement ht by 4
    st1   {v30.8b, v31.8b}, [x2], x5 //store row 3 in destination
    st1   {v14.8b, v15.8b}, [x2], x5 //store row 4 in destination
    bgt   loop_16                    //if greater than 0 repeat the loop again

end_loops:

    // LDMFD sp!,{x4-x12,x15}        //Reload the registers from sp
    ldp   x19, x20, [sp], #16
    pop_v_regs
    ret

+//*******************************************************************************
+//* @function
+//* ih264_weighted_bi_pred_chroma_av8()
+//*
+//* @brief
+//* This routine performs the default weighted prediction as described in sec
+//* 8.4.2.3.2 titled "Weighted sample prediction process" for chroma.
+//*
+//* @par Description:
+//* This function gets two ht x wd blocks, calculates the weighted samples,
+//* rounds off, adds offset and stores it in the destination block for U and V.
+//*
+//* @param[in] puc_src1
+//* UWORD8 Pointer to the buffer containing the input block 1.
+//*
+//* @param[in] puc_src2
+//* UWORD8 Pointer to the buffer containing the input block 2.
+//*
+//* @param[out] puc_dst
+//* UWORD8 pointer to the destination where the output block is stored.
+//*
+//* @param[in] src_strd1
+//* Stride of the input buffer 1
+//*
+//* @param[in] src_strd2
+//* Stride of the input buffer 2
+//*
+//* @param[in] dst_strd
+//* Stride of the destination buffer
+//*
+//* @param[in] log_WD
+//* number of bits to be rounded off
+//*
+//* @param[in] wt1
+//* weights for the weighted prediction in U and V
+//*
+//* @param[in] wt2
+//* weights for the weighted prediction in U and V
+//*
+//* @param[in] ofst1
+//* offset 1 used after rounding off for U and V
+//*
+//* @param[in] ofst2
+//* offset 2 used after rounding off for U and V
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @returns
+//* None
+//*
+//* @remarks
+//* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8). 
+//* +//******************************************************************************* +//*/ +//void ih264_weighted_bi_pred_chroma_av8(UWORD8 *puc_src1, +// UWORD8 *puc_src2, +// UWORD8 *puc_dst, +// WORD32 src_strd1, +// WORD32 src_strd2, +// WORD32 dst_strd, +// UWORD16 log_WD, +// UWORD32 wt1, +// UWORD32 wt2, +// UWORD16 ofst1, +// UWORD16 ofst2, +// UWORD8 ht, +// UWORD8 wd) +// +//**************Variables Vs Registers***************************************** +// x0 => puc_src1 +// x1 => puc_src2 +// x2 => puc_dst +// x3 => src_strd1 +// [sp] => src_strd2 (x4) +// [sp+4] => dst_strd (x5) +// [sp+8] => log_WD (x6) +// [sp+12] => wt1 (x7) +// [sp+16] => wt2 (x8) +// [sp+20] => ofst1 (x9) +// [sp+24] => ofst2 (x10) +// [sp+28] => ht (x11) +// [sp+32] => wd (x12) +// + + + + + + .global ih264_weighted_bi_pred_chroma_av8 + +ih264_weighted_bi_pred_chroma_av8: + + // STMFD sp!, {x4-x12,x14} //stack stores the values of the arguments + push_v_regs + stp x19, x20, [sp, #-16]! + + + ldr x8, [sp, #80] //Load wt2 in x8 + dup v4.4s, w8 //Q2 = (wt2_u, wt2_v) (32-bit) + dup v2.4s, w7 //Q1 = (wt1_u, wt1_v) (32-bit) + add x6, x6, #1 //x6 = log_WD + 1 + ldr w9, [sp, #88] //Load ofst1 in x9 + sxtw x9, w9 + ldr w10, [sp, #96] //Load ofst2 in x10 + sxtw x10, w10 + sub x20, x6, #0 //x12 = -(log_WD + 1) + neg x20, x20 + dup v0.8h, w20 //Q0 = -(log_WD + 1) (16-bit) + ldr w11, [sp, #104] //Load ht in x11 + ldr w12, [sp, #112] //Load wd in x12 + sxtw x11, w11 + sxtw x12, w12 + dup v20.8h, w9 //0ffset1 + dup v21.8h, w10 //0ffset2 + srhadd v6.8b, v20.8b, v21.8b + sxtl v6.8h, v6.8b + cmp w12, #8 //check if wd is 8 + beq loop_8_uv //branch if wd is 8 + cmp w12, #4 //check if wd is 4 + beq loop_4_uv //branch if wd is 4 + +loop_2_uv: //each iteration processes two rows + + ld1 {v8.s}[0], [x0], x3 //load row 1 in source 1 + ld1 {v8.s}[1], [x0], x3 //load row 2 in source 1 + ld1 {v10.s}[0], [x1], x4 //load row 1 in source 2 + ld1 {v10.s}[1], [x1], x4 //load row 2 in source 2 + uxtl v8.8h, 
v8.8b //converting rows 1,2 in source 1 to 16-bit + uxtl v10.8h, v10.8b //converting rows 1,2 in source 2 to 16-bit + mul v8.8h, v8.8h , v2.8h //weight 1 mult. for rows 1,2 + mla v8.8h, v10.8h , v4.8h //weight 2 mult. for rows 1,2 + srshl v8.8h, v8.8h , v0.8h //rounds off the weighted samples from rows 1,2 + add v8.8h, v8.8h , v6.8h //adding offset for rows 1,2 + sqxtun v8.8b, v8.8h //saturating rows 1,2 to unsigned 8-bit/ + st1 {v8.s}[0], [x2], x5 //store row 1 in destination + st1 {v8.s}[1], [x2], x5 //store row 2 in destination + subs w11, w11, #2 //decrement ht by 2 + bgt loop_2_uv //if greater than 0 repeat the loop again + b end_loops_uv + +loop_4_uv: //each iteration processes two rows + + ld1 {v8.8b}, [x0], x3 //load row 1 in source 1 + ld1 {v10.8b}, [x1], x4 //load row 1 in source 2 + uxtl v8.8h, v8.8b //converting row 1 in source 1 to 16-bit + ld1 {v12.8b}, [x0], x3 //load row 2 in source 1 + uxtl v10.8h, v10.8b //converting row 1 in source 2 to 16-bit + ld1 {v14.8b}, [x1], x4 //load row 2 in source 2 + uxtl v12.8h, v12.8b //converting row 2 in source 1 to 16-bit + mul v8.8h, v8.8h , v2.8h //weight 1 mult. for row 1 + mla v8.8h, v10.8h , v4.8h //weight 2 mult. for row 1 + uxtl v14.8h, v14.8b //converting row 2 in source 2 to 16-bit + mul v12.8h, v12.8h , v2.8h //weight 1 mult. for row 2 + mla v12.8h, v14.8h , v4.8h //weight 2 mult. 
for row 2 + subs w11, w11, #2 //decrement ht by 2 + srshl v8.8h, v8.8h , v0.8h //rounds off the weighted samples from row 1 + srshl v12.8h, v12.8h , v0.8h //rounds off the weighted samples from row 2 + add v8.8h, v8.8h , v6.8h //adding offset for row 1 + add v12.8h, v12.8h , v6.8h //adding offset for row 2 + sqxtun v8.8b, v8.8h //saturating row 1 to unsigned 8-bit + sqxtun v12.8b, v12.8h //saturating row 2 to unsigned 8-bit + st1 {v8.8b}, [x2], x5 //store row 1 in destination + st1 {v12.8b}, [x2], x5 //store row 2 in destination + bgt loop_4_uv //if greater than 0 repeat the loop again + b end_loops_uv + +loop_8_uv: //each iteration processes two rows + + ld1 {v8.8b, v9.8b}, [x0], x3 //load row 1 in source 1 + ld1 {v10.8b, v11.8b}, [x1], x4 //load row 1 in source 2 + ld1 {v12.8b, v13.8b}, [x0], x3 //load row 2 in source 1 + ld1 {v14.8b, v15.8b}, [x1], x4 //load row 2 in source 2 + uxtl v24.8h, v8.8b //converting row 1L in source 1 to 16-bit + ld1 {v16.8b, v17.8b}, [x0], x3 //load row 3 in source 1 + ld1 {v18.8b, v19.8b}, [x1], x4 //load row 3 in source 2 + uxtl v26.8h, v10.8b //converting row 1L in source 2 to 16-bit + ld1 {v20.8b, v21.8b}, [x0], x3 //load row 4 in source 1 + ld1 {v22.8b, v23.8b}, [x1], x4 //load row 4 in source 2 + uxtl v8.8h, v9.8b //converting row 1H in source 1 to 16-bit + uxtl v10.8h, v11.8b //converting row 1H in source 2 to 16-bit + mul v24.8h, v24.8h , v2.8h //weight 1 mult. for row 1L + mla v24.8h, v26.8h , v4.8h //weight 2 mult. for row 1L + uxtl v28.8h, v12.8b //converting row 2L in source 1 to 16-bit + uxtl v30.8h, v14.8b //converting row 2L in source 2 to 16-bit + mul v8.8h, v8.8h , v2.8h //weight 1 mult. for row 1H + mla v8.8h, v10.8h , v4.8h //weight 2 mult. for row 1H + uxtl v12.8h, v13.8b //converting row 2H in source 1 to 16-bit + uxtl v14.8h, v15.8b //converting row 2H in source 2 to 16-bit + mul v28.8h, v28.8h , v2.8h //weight 1 mult. for row 2L + mla v28.8h, v30.8h , v4.8h //weight 2 mult. 
for row 2L + uxtl v26.8h, v16.8b //converting row 3L in source 1 to 16-bit + uxtl v10.8h, v18.8b //converting row 3L in source 2 to 16-bit + mul v12.8h, v12.8h , v2.8h //weight 1 mult. for row 2H + mla v12.8h, v14.8h , v4.8h //weight 2 mult. for row 2H + uxtl v16.8h, v17.8b //converting row 3H in source 1 to 16-bit + uxtl v18.8h, v19.8b //converting row 3H in source 2 to 16-bit + mul v26.8h, v26.8h , v2.8h //weight 1 mult. for row 3L + mla v26.8h, v10.8h , v4.8h //weight 2 mult. for row 3L + uxtl v30.8h, v20.8b //converting row 4L in source 1 to 16-bit + uxtl v14.8h, v22.8b //converting row 4L in source 2 to 16-bit + mul v16.8h, v16.8h , v2.8h //weight 1 mult. for row 3H + mla v16.8h, v18.8h , v4.8h //weight 2 mult. for row 3H + uxtl v20.8h, v21.8b //converting row 4H in source 1 to 16-bit + uxtl v22.8h, v23.8b //converting row 4H in source 2 to 16-bit + mul v30.8h, v30.8h , v2.8h //weight 1 mult. for row 4L + mla v30.8h, v14.8h , v4.8h //weight 2 mult. for row 4L + srshl v24.8h, v24.8h , v0.8h //rounds off the weighted samples from row 1L + mul v20.8h, v20.8h , v2.8h //weight 1 mult. for row 4H + mla v20.8h, v22.8h , v4.8h //weight 2 mult. 
for row 4H + srshl v8.8h, v8.8h , v0.8h //rounds off the weighted samples from row 1H + srshl v28.8h, v28.8h , v0.8h //rounds off the weighted samples from row 2L + add v24.8h, v24.8h , v6.8h //adding offset for row 1L + srshl v12.8h, v12.8h , v0.8h //rounds off the weighted samples from row 2H + add v8.8h, v8.8h , v6.8h //adding offset for row 1H + srshl v26.8h, v26.8h , v0.8h //rounds off the weighted samples from row 3L + add v28.8h, v28.8h , v6.8h //adding offset for row 2L + srshl v16.8h, v16.8h , v0.8h //rounds off the weighted samples from row 3H + add v12.8h, v12.8h , v6.8h //adding offset for row 2H + srshl v30.8h, v30.8h , v0.8h //rounds off the weighted samples from row 4L + add v26.8h, v26.8h , v6.8h //adding offset for row 3L + srshl v20.8h, v20.8h , v0.8h //rounds off the weighted samples from row 4H + add v16.8h, v16.8h , v6.8h //adding offset for row 3H + sqxtun v10.8b, v24.8h //saturating row 1L to unsigned 8-bit + add v30.8h, v30.8h , v6.8h //adding offset for row 4L + sqxtun v11.8b, v8.8h //saturating row 1H to unsigned 8-bit + add v20.8h, v20.8h , v6.8h //adding offset for row 4H + sqxtun v18.8b, v28.8h //saturating row 2L to unsigned 8-bit + sqxtun v19.8b, v12.8h //saturating row 2H to unsigned 8-bit + sqxtun v14.8b, v26.8h //saturating row 3L to unsigned 8-bit + sqxtun v15.8b, v16.8h //saturating row 3H to unsigned 8-bit + st1 {v10.8b, v11.8b}, [x2], x5 //store row 1 in destination + sqxtun v22.8b, v30.8h //saturating row 4L to unsigned 8-bit + sqxtun v23.8b, v20.8h //saturating row 4H to unsigned 8-bit + st1 {v18.8b, v19.8b}, [x2], x5 //store row 2 in destination + subs w11, w11, #4 //decrement ht by 4 + st1 {v14.8b, v15.8b}, [x2], x5 //store row 3 in destination + st1 {v22.8b, v23.8b}, [x2], x5 //store row 4 in destination + bgt loop_8_uv //if greater than 0 repeat the loop again + +end_loops_uv: + + // LDMFD sp!,{x4-x12,x15} //Reload the registers from sp + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + diff --git 
a/common/armv8/ih264_weighted_pred_av8.s b/common/armv8/ih264_weighted_pred_av8.s new file mode 100755 index 0000000..6a03875 --- /dev/null +++ b/common/armv8/ih264_weighted_pred_av8.s @@ -0,0 +1,471 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +//*/ +///** +//****************************************************************************** +//* @file +//* ih264_weighted_pred_av8.s +//* +//* @brief +//* Contains function definitions for weighted prediction. +//* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT +//* +//* @author +//* Kaushik Senthoor R +//* +//* @par List of Functions: +//* +//* - ih264_weighted_pred_luma_av8() +//* - ih264_weighted_pred_chroma_av8() +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ +//******************************************************************************* +//* @function +//* ih264_weighted_pred_luma_av8() +//* +//* @brief +//* This routine performs the default weighted prediction as described in sec +//* 8.4.2.3.2 titled "Weighted sample prediction process" for luma. 
+//* +//* @par Description: +//* This function gets a ht x wd block, calculates the weighted sample, rounds +//* off, adds offset and stores it in the destination block. +//* +//* @param[in] puc_src: +//* UWORD8 Pointer to the buffer containing the input block. +//* +//* @param[out] puc_dst +//* UWORD8 pointer to the destination where the output block is stored. +//* +//* @param[in] src_strd +//* Stride of the input buffer +//* +//* @param[in] dst_strd +//* Stride of the destination buffer +//* +//* @param[in] log_WD +//* number of bits to be rounded off +//* +//* @param[in] wt +//* weight for the weighted prediction +//* +//* @param[in] ofst +//* offset used after rounding off +//* +//* @param[in] ht +//* integer height of the array +//* +//* @param[in] wd +//* integer width of the array +//* +//* @returns +//* None +//* +//* @remarks +//* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16). +//* +//******************************************************************************* +//*/ +//void ih264_weighted_pred_luma_av8(UWORD8 *puc_src, +// UWORD8 *puc_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// UWORD8 log_WD, +// UWORD32 wt, +// UWORD16 ofst, +// UWORD8 ht, +// UWORD8 wd) +// +//**************Variables Vs Registers***************************************** +// x0 => puc_src +// x1 => puc_dst +// x2 => src_strd +// x3 => dst_strd +// [sp] => log_WD (x4) +// [sp+4] => wt (x5) +// [sp+8] => ofst (x6) +// [sp+12] => ht (x7) +// [sp+16] => wd (x8) +// +.text +.p2align 2 +.include "ih264_neon_macros.s" + + + + .global ih264_weighted_pred_luma_av8 + +ih264_weighted_pred_luma_av8: + + // STMFD sp!, {x4-x9,x14} //stack stores the values of the arguments + push_v_regs + stp x19, x20, [sp, #-16]! 
+ ldr w8, [sp, #80] //Load wd + sxtw x8, w8 + + dup v2.4h, w5 //D2 = wt (16-bit) + sub x20, x4, #0 //x9 = -log_WD + neg x9, x20 + dup v3.8b, w6 //D3 = ofst (8-bit) + cmp w8, #16 //check if wd is 16 + dup v0.8h, w9 //Q0 = -log_WD (16-bit) + beq loop_16 //branch if wd is 16 + + cmp w8, #8 //check if wd is 8 + beq loop_8 //branch if wd is 8 + +loop_4: //each iteration processes four rows + + ld1 {v4.s}[0], [x0], x2 //load row 1 in source + ld1 {v4.s}[1], [x0], x2 //load row 2 in source + ld1 {v6.s}[0], [x0], x2 //load row 3 in source + ld1 {v6.s}[1], [x0], x2 //load row 4 in source + + uxtl v4.8h, v4.8b //converting rows 1,2 to 16-bit + uxtl v6.8h, v6.8b //converting rows 3,4 to 16-bit + + mul v4.8h, v4.8h , v2.4h[0] //weight mult. for rows 1,2 + mul v6.8h, v6.8h , v2.4h[0] //weight mult. for rows 3,4 + + subs w7, w7, #4 //decrement ht by 4 + srshl v4.8h, v4.8h , v0.8h //rounds off the weighted samples from rows 1,2 + srshl v6.8h, v6.8h , v0.8h //rounds off the weighted samples from rows 3,4 + + saddw v4.8h, v4.8h , v3.8b //adding offset for rows 1,2 + saddw v6.8h, v6.8h , v3.8b //adding offset for rows 3,4 + + sqxtun v4.8b, v4.8h //saturating rows 1,2 to unsigned 8-bit + sqxtun v6.8b, v6.8h //saturating rows 3,4 to unsigned 8-bit + + st1 {v4.s}[0], [x1], x3 //store row 1 in destination + st1 {v4.s}[1], [x1], x3 //store row 2 in destination + st1 {v6.s}[0], [x1], x3 //store row 3 in destination + st1 {v6.s}[1], [x1], x3 //store row 4 in destination + + bgt loop_4 //if greater than 0 repeat the loop again + + b end_loops + +loop_8: //each iteration processes four rows + + ld1 {v4.8b}, [x0], x2 //load row 1 in source + ld1 {v6.8b}, [x0], x2 //load row 2 in source + ld1 {v8.8b}, [x0], x2 //load row 3 in source + uxtl v4.8h, v4.8b //converting row 1 to 16-bit + ld1 {v10.8b}, [x0], x2 //load row 4 in source + uxtl v6.8h, v6.8b //converting row 2 to 16-bit + + uxtl v8.8h, v8.8b //converting row 3 to 16-bit + mul v4.8h, v4.8h , v2.4h[0] //weight mult. 
for row 1 + uxtl v10.8h, v10.8b //converting row 4 to 16-bit + mul v6.8h, v6.8h , v2.4h[0] //weight mult. for row 2 + mul v8.8h, v8.8h , v2.4h[0] //weight mult. for row 3 + mul v10.8h, v10.8h , v2.4h[0] //weight mult. for row 4 + + srshl v4.8h, v4.8h , v0.8h //rounds off the weighted samples from row 1 + srshl v6.8h, v6.8h , v0.8h //rounds off the weighted samples from row 2 + srshl v8.8h, v8.8h , v0.8h //rounds off the weighted samples from row 3 + saddw v4.8h, v4.8h , v3.8b //adding offset for row 1 + srshl v10.8h, v10.8h , v0.8h //rounds off the weighted samples from row 4 + saddw v6.8h, v6.8h , v3.8b //adding offset for row 2 + + saddw v8.8h, v8.8h , v3.8b //adding offset for row 3 + sqxtun v4.8b, v4.8h //saturating row 1 to unsigned 8-bit + saddw v10.8h, v10.8h , v3.8b //adding offset for row 4 + sqxtun v6.8b, v6.8h //saturating row 2 to unsigned 8-bit + sqxtun v8.8b, v8.8h //saturating row 3 to unsigned 8-bit + sqxtun v10.8b, v10.8h //saturating row 4 to unsigned 8-bit + + st1 {v4.8b}, [x1], x3 //store row 1 in destination + st1 {v6.8b}, [x1], x3 //store row 2 in destination + subs w7, w7, #4 //decrement ht by 4 + st1 {v8.8b}, [x1], x3 //store row 3 in destination + st1 {v10.8b}, [x1], x3 //store row 4 in destination + + bgt loop_8 //if greater than 0 repeat the loop again + + b end_loops + +loop_16: //each iteration processes two rows + + ld1 {v4.8b, v5.8b}, [x0], x2 //load row 1 in source + ld1 {v6.8b, v7.8b}, [x0], x2 //load row 2 in source + uxtl v12.8h, v4.8b //converting row 1L to 16-bit + ld1 {v8.8b, v9.8b}, [x0], x2 //load row 3 in source + uxtl v14.8h, v5.8b //converting row 1H to 16-bit + ld1 {v10.8b, v11.8b}, [x0], x2 //load row 4 in source + uxtl v16.8h, v6.8b //converting row 2L to 16-bit + mul v12.8h, v12.8h , v2.4h[0] //weight mult. for row 1L + uxtl v18.8h, v7.8b //converting row 2H to 16-bit + mul v14.8h, v14.8h , v2.4h[0] //weight mult. 
for row 1H + uxtl v20.8h, v8.8b //converting row 3L to 16-bit + mul v16.8h, v16.8h , v2.4h[0] //weight mult. for row 2L + uxtl v22.8h, v9.8b //converting row 3H to 16-bit + mul v18.8h, v18.8h , v2.4h[0] //weight mult. for row 2H + uxtl v24.8h, v10.8b //converting row 4L to 16-bit + mul v20.8h, v20.8h , v2.4h[0] //weight mult. for row 3L + uxtl v26.8h, v11.8b //converting row 4H to 16-bit + mul v22.8h, v22.8h , v2.4h[0] //weight mult. for row 3H + mul v24.8h, v24.8h , v2.4h[0] //weight mult. for row 4L + srshl v12.8h, v12.8h , v0.8h //rounds off the weighted samples from row 1L + mul v26.8h, v26.8h , v2.4h[0] //weight mult. for row 4H + srshl v14.8h, v14.8h , v0.8h //rounds off the weighted samples from row 1H + srshl v16.8h, v16.8h , v0.8h //rounds off the weighted samples from row 2L + saddw v12.8h, v12.8h , v3.8b //adding offset for row 1L + srshl v18.8h, v18.8h , v0.8h //rounds off the weighted samples from row 2H + saddw v14.8h, v14.8h , v3.8b //adding offset for row 1H + sqxtun v4.8b, v12.8h //saturating row 1L to unsigned 8-bit + srshl v20.8h, v20.8h , v0.8h //rounds off the weighted samples from row 3L + saddw v16.8h, v16.8h , v3.8b //adding offset for row 2L + sqxtun v5.8b, v14.8h //saturating row 1H to unsigned 8-bit + srshl v22.8h, v22.8h , v0.8h //rounds off the weighted samples from row 3H + saddw v18.8h, v18.8h , v3.8b //adding offset for row 2H + sqxtun v6.8b, v16.8h //saturating row 2L to unsigned 8-bit + srshl v24.8h, v24.8h , v0.8h //rounds off the weighted samples from row 4L + saddw v20.8h, v20.8h , v3.8b //adding offset for row 3L + sqxtun v7.8b, v18.8h //saturating row 2H to unsigned 8-bit + srshl v26.8h, v26.8h , v0.8h //rounds off the weighted samples from row 4H + saddw v22.8h, v22.8h , v3.8b //adding offset for row 3H + sqxtun v8.8b, v20.8h //saturating row 3L to unsigned 8-bit + saddw v24.8h, v24.8h , v3.8b //adding offset for row 4L + sqxtun v9.8b, v22.8h //saturating row 3H to unsigned 8-bit + saddw v26.8h, v26.8h , v3.8b //adding offset 
for row 4H + sqxtun v10.8b, v24.8h //saturating row 4L to unsigned 8-bit + st1 {v4.8b, v5.8b}, [x1], x3 //store row 1 in destination + sqxtun v11.8b, v26.8h //saturating row 4H to unsigned 8-bit + st1 {v6.8b, v7.8b}, [x1], x3 //store row 2 in destination + subs w7, w7, #4 //decrement ht by 4 + st1 {v8.8b, v9.8b}, [x1], x3 //store row 3 in destination + st1 {v10.8b, v11.8b}, [x1], x3 //store row 4 in destination + + bgt loop_16 //if greater than 0 repeat the loop again + +end_loops: + + // LDMFD sp!,{x4-x9,x15} //Reload the registers from sp + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + +//******************************************************************************* +//* @function +//* ih264_weighted_pred_chroma_av8() +//* +//* @brief +//* This routine performs the default weighted prediction as described in sec +//* 8.4.2.3.2 titled "Weighted sample prediction process" for chroma. +//* +//* @par Description: +//* This function gets a ht x wd block, calculates the weighted sample, rounds +//* off, adds offset and stores it in the destination block for U and V. +//* +//* @param[in] puc_src: +//* UWORD8 Pointer to the buffer containing the input block. +//* +//* @param[out] puc_dst +//* UWORD8 pointer to the destination where the output block is stored. +//* +//* @param[in] src_strd +//* Stride of the input buffer +//* +//* @param[in] dst_strd +//* Stride of the destination buffer +//* +//* @param[in] log_WD +//* number of bits to be rounded off +//* +//* @param[in] wt +//* weights for the weighted prediction for U and V +//* +//* @param[in] ofst +//* offsets used after rounding off for U and V +//* +//* @param[in] ht +//* integer height of the array +//* +//* @param[in] wd +//* integer width of the array +//* +//* @returns +//* None +//* +//* @remarks +//* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8). 
+//* +//******************************************************************************* +//*/ +//void ih264_weighted_pred_chroma_av8(UWORD8 *puc_src, +// UWORD8 *puc_dst, +// WORD32 src_strd, +// WORD32 dst_strd, +// UWORD8 log_WD, +// UWORD32 wt, +// UWORD16 ofst, +// UWORD8 ht, +// UWORD8 wd) +// +//**************Variables Vs Registers***************************************** +// x0 => puc_src +// x1 => puc_dst +// x2 => src_strd +// x3 => dst_strd +// [sp] => log_WD (x4) +// [sp+4] => wt (x5) +// [sp+8] => ofst (x6) +// [sp+12] => ht (x7) +// [sp+16] => wd (x8) +// + + + + + .global ih264_weighted_pred_chroma_av8 + +ih264_weighted_pred_chroma_av8: + + // STMFD sp!, {x4-x9,x14} //stack stores the values of the arguments + push_v_regs + stp x19, x20, [sp, #-16]! + + ldr w8, [sp, #80] //Load wd + sxtw x8, w8 + + sub x20, x4, #0 //x9 = -log_WD + neg x9, x20 + dup v2.4s, w5 //Q1 = {wt_u (16-bit), wt_v (16-bit)} + + + dup v4.4h, w6 //D4 = {ofst_u (8-bit), ofst_v (8-bit)} + cmp w8, #8 //check if wd is 8 + dup v0.8h, w9 //Q0 = -log_WD (16-bit) + beq loop_8_uv //branch if wd is 8 + + cmp w8, #4 //check if ws is 4 + beq loop_4_uv //branch if wd is 4 + +loop_2_uv: //each iteration processes two rows + + ld1 {v6.s}[0], [x0], x2 //load row 1 in source + ld1 {v6.s}[1], [x0], x2 //load row 2 in source + uxtl v6.8h, v6.8b //converting rows 1,2 to 16-bit + mul v6.8h, v6.8h , v2.8h //weight mult. 
for rows 1,2 + srshl v6.8h, v6.8h , v0.8h //rounds off the weighted samples from rows 1,2 + saddw v6.8h, v6.8h , v4.8b //adding offset for rows 1,2 + sqxtun v6.8b, v6.8h //saturating rows 1,2 to unsigned 8-bit + subs w7, w7, #2 //decrement ht by 2 + st1 {v6.s}[0], [x1], x3 //store row 1 in destination + st1 {v6.s}[1], [x1], x3 //store row 2 in destination + bgt loop_2_uv //if greater than 0 repeat the loop again + b end_loops_uv + +loop_4_uv: //each iteration processes two rows + + ld1 {v6.8b}, [x0], x2 //load row 1 in source + ld1 {v8.8b}, [x0], x2 //load row 2 in source + uxtl v6.8h, v6.8b //converting row 1 to 16-bit + uxtl v8.8h, v8.8b //converting row 2 to 16-bit + mul v6.8h, v6.8h , v2.8h //weight mult. for row 1 + mul v8.8h, v8.8h , v2.8h //weight mult. for row 2 + subs w7, w7, #2 //decrement ht by 2 + srshl v6.8h, v6.8h , v0.8h //rounds off the weighted samples from row 1 + srshl v8.8h, v8.8h , v0.8h //rounds off the weighted samples from row 2 + saddw v6.8h, v6.8h , v4.8b //adding offset for row 1 + saddw v8.8h, v8.8h , v4.8b //adding offset for row 2 + sqxtun v6.8b, v6.8h //saturating row 1 to unsigned 8-bit + sqxtun v8.8b, v8.8h //saturating row 2 to unsigned 8-bit + st1 {v6.8b}, [x1], x3 //store row 1 in destination + st1 {v8.8b}, [x1], x3 //store row 2 in destination + + bgt loop_4_uv //if greater than 0 repeat the loop again + + b end_loops_uv + +loop_8_uv: //each iteration processes two rows + + ld1 {v6.8b, v7.8b}, [x0], x2 //load row 1 in source + ld1 {v8.8b, v9.8b}, [x0], x2 //load row 2 in source + uxtl v14.8h, v6.8b //converting row 1L to 16-bit + ld1 {v10.8b, v11.8b}, [x0], x2 //load row 3 in source + uxtl v16.8h, v7.8b //converting row 1H to 16-bit + ld1 {v12.8b, v13.8b}, [x0], x2 //load row 4 in source + + mul v14.8h, v14.8h , v2.8h //weight mult. for row 1L + uxtl v18.8h, v8.8b //converting row 2L to 16-bit + mul v16.8h, v16.8h , v2.8h //weight mult. 
for row 1H + uxtl v20.8h, v9.8b //converting row 2H to 16-bit + mul v18.8h, v18.8h , v2.8h //weight mult. for row 2L + uxtl v22.8h, v10.8b //converting row 3L to 16-bit + mul v20.8h, v20.8h , v2.8h //weight mult. for row 2H + uxtl v24.8h, v11.8b //converting row 3H to 16-bit + mul v22.8h, v22.8h , v2.8h //weight mult. for row 3L + uxtl v26.8h, v12.8b //converting row 4L to 16-bit + mul v24.8h, v24.8h , v2.8h //weight mult. for row 3H + uxtl v28.8h, v13.8b //converting row 4H to 16-bit + + mul v26.8h, v26.8h , v2.8h //weight mult. for row 4L + srshl v14.8h, v14.8h , v0.8h //rounds off the weighted samples from row 1L + mul v28.8h, v28.8h , v2.8h //weight mult. for row 4H + + srshl v16.8h, v16.8h , v0.8h //rounds off the weighted samples from row 1H + srshl v18.8h, v18.8h , v0.8h //rounds off the weighted samples from row 2L + saddw v14.8h, v14.8h , v4.8b //adding offset for row 1L + srshl v20.8h, v20.8h , v0.8h //rounds off the weighted samples from row 2H + saddw v16.8h, v16.8h , v4.8b //adding offset for row 1H + sqxtun v6.8b, v14.8h //saturating row 1L to unsigned 8-bit + srshl v22.8h, v22.8h , v0.8h //rounds off the weighted samples from row 3L + saddw v18.8h, v18.8h , v4.8b //adding offset for row 2L + sqxtun v7.8b, v16.8h //saturating row 1H to unsigned 8-bit + srshl v24.8h, v24.8h , v0.8h //rounds off the weighted samples from row 3H + saddw v20.8h, v20.8h , v4.8b //adding offset for row 2H + sqxtun v8.8b, v18.8h //saturating row 2L to unsigned 8-bit + srshl v26.8h, v26.8h , v0.8h //rounds off the weighted samples from row 4L + saddw v22.8h, v22.8h , v4.8b //adding offset for row 3L + sqxtun v9.8b, v20.8h //saturating row 2H to unsigned 8-bit + srshl v28.8h, v28.8h , v0.8h //rounds off the weighted samples from row 4H + saddw v24.8h, v24.8h , v4.8b //adding offset for row 3H + + sqxtun v10.8b, v22.8h //saturating row 3L to unsigned 8-bit + saddw v26.8h, v26.8h , v4.8b //adding offset for row 4L + sqxtun v11.8b, v24.8h //saturating row 3H to unsigned 8-bit + 
saddw v28.8h, v28.8h , v4.8b //adding offset for row 4H + + sqxtun v12.8b, v26.8h //saturating row 4L to unsigned 8-bit + st1 {v6.8b, v7.8b}, [x1], x3 //store row 1 in destination + sqxtun v13.8b, v28.8h //saturating row 4H to unsigned 8-bit + st1 {v8.8b, v9.8b}, [x1], x3 //store row 2 in destination + subs w7, w7, #4 //decrement ht by 4 + st1 {v10.8b, v11.8b}, [x1], x3 //store row 3 in destination + st1 {v12.8b, v13.8b}, [x1], x3 //store row 4 in destination + + bgt loop_8_uv //if greater than 0 repeat the loop again + +end_loops_uv: + + // LDMFD sp!,{x4-x9,x15} //Reload the registers from sp + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + diff --git a/common/ih264_buf_mgr.c b/common/ih264_buf_mgr.c new file mode 100755 index 0000000..ea4333e --- /dev/null +++ b/common/ih264_buf_mgr.c @@ -0,0 +1,696 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_buf_mgr.c +* +* @brief +* Contains function definitions for buffer management +* +* @author +* Srinivas T +* +* @par List of Functions: +* - ih264_buf_mgr_size() +* - ih264_buf_mgr_lock() +* - ih264_buf_mgr_unlock() +* - ih264_buf_mgr_yield() +* - ih264_buf_mgr_free() +* - ih264_buf_mgr_init() +* - ih264_buf_mgr_add() +* - ih264_buf_mgr_get_next_free() +* - ih264_buf_mgr_check_free() +* - ih264_buf_mgr_set_status() +* - ih264_buf_mgr_get_status() +* - ih264_buf_mgr_get_buf() +* - ih264_buf_mgr_get_bufid() +* - ih264_buf_mgr_get_num_active_buf() +* +* @remarks +* None +* +******************************************************************************* +*/ +#include <stdio.h> +#include <stdlib.h> +#include "ih264_typedefs.h" +#include "ih264_macros.h" +#include "ih264_defs.h" +#include "ih264_error.h" +#include "ih264_buf_mgr.h" + +#include "ithread.h" + +/** +******************************************************************************* +* +* @brief Returns size for buf queue context. Does not include buf queue buffer +* requirements +* +* @par Description +* Returns size for buf queue context. Does not include buf queue buffer +* requirements. Buffer size required to store the bufs should be allocated in +* addition to the value returned here. 
+* +* @returns Size of the buf queue context +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264_buf_mgr_size(void) +{ + WORD32 size; + + size = sizeof(buf_mgr_t); + size += ithread_get_mutex_lock_size(); + + return size; +} + +/** +******************************************************************************* +* +* @brief +* Locks the buf_mgr context +* +* @par Description +* Locks the buf_mgr context by calling ithread_mutex_lock() +* +* @param[in] ps_buf_mgr +* Job Queue context +* +* @returns IH264_FAIL if mutex lock fails else IH264_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ +IH264_ERROR_T ih264_buf_mgr_lock(buf_mgr_t *ps_buf_mgr) +{ + WORD32 retval; + retval = ithread_mutex_lock(ps_buf_mgr->pv_mutex); + if(retval) + { + return IH264_FAIL; + } + return IH264_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Unlocks the buf_mgr context +* +* @par Description +* Unlocks the buf_mgr context by calling ithread_mutex_unlock() +* +* @param[in] ps_buf_mgr +* Job Queue context +* +* @returns IH264_FAIL if mutex unlock fails else IH264_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ + +IH264_ERROR_T ih264_buf_mgr_unlock(buf_mgr_t *ps_buf_mgr) +{ + WORD32 retval; + retval = ithread_mutex_unlock(ps_buf_mgr->pv_mutex); + if(retval) + { + return IH264_FAIL; + } + return IH264_SUCCESS; + +} +/** +******************************************************************************* +* +* @brief +* Yeilds the thread +* +* @par Description +* Unlocks the buf_mgr context by calling +* ih264_buf_mgr_unlock(), ithread_yield() and then ih264_buf_mgr_lock() +* buf_mgr is unlocked before to ensure the buf_mgr can be accessed by other threads +* If unlock is not done before calling yield then no other thread can access +* the 
buf_mgr functions and update buf_mgr. +* +* @param[in] ps_buf_mgr +* Job Queue context +* +* @returns IH264_FAIL if mutex lock unlock or yield fails else IH264_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ +IH264_ERROR_T ih264_buf_mgr_yield(buf_mgr_t *ps_buf_mgr) +{ + + IH264_ERROR_T ret = IH264_SUCCESS; + + IH264_ERROR_T rettmp; + rettmp = ih264_buf_mgr_unlock(ps_buf_mgr); + RETURN_IF((rettmp != IH264_SUCCESS), rettmp); + + //ithread_usleep(10); + ithread_yield(); + + rettmp = ih264_buf_mgr_lock(ps_buf_mgr); + RETURN_IF((rettmp != IH264_SUCCESS), rettmp); + return ret; +} + + +/** +******************************************************************************* +* +* @brief free the buf queue pointers +* +* @par Description +* Frees the buf_mgr context +* +* @param[in] pv_buf +* Memoy for buf queue buffer and buf queue context +* +* @returns Pointer to buf queue context +* +* @remarks +* Since it will be called only once by master thread this is not thread safe. +* +******************************************************************************* +*/ +IH264_ERROR_T ih264_buf_mgr_free(buf_mgr_t *ps_buf_mgr) +{ + WORD32 ret; + ret = ithread_mutex_destroy(ps_buf_mgr->pv_mutex); + + if(0 == ret) + return IH264_SUCCESS; + else + return IH264_FAIL; +} +/** +******************************************************************************* +* +* @brief +* Buffer manager initialization function. 
+* +* @par Description: +* Initializes the buffer manager structure +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @returns +* +* @remarks +* None +* +******************************************************************************* +*/ + + +void *ih264_buf_mgr_init(void *pv_buf) +{ + WORD32 id; + UWORD8 *pu1_buf; + buf_mgr_t *ps_buf_mgr; + pu1_buf = (UWORD8 *)pv_buf; + + ps_buf_mgr = (buf_mgr_t *)pu1_buf; + pu1_buf += sizeof(buf_mgr_t); + + ps_buf_mgr->pv_mutex = pu1_buf; + pu1_buf += ithread_get_mutex_lock_size(); + + ithread_mutex_init(ps_buf_mgr->pv_mutex); + + ps_buf_mgr->i4_max_buf_cnt = BUF_MGR_MAX_CNT; + ps_buf_mgr->i4_active_buf_cnt = 0; + + for(id = 0; id < BUF_MGR_MAX_CNT; id++) + { + ps_buf_mgr->au4_status[id] = 0; + ps_buf_mgr->apv_ptr[id] = NULL; + } + + return ps_buf_mgr; +} + + +/** +******************************************************************************* +* +* @brief +* Adds and increments the buffer and buffer count. +* +* @par Description: +* Adds a buffer to the buffer manager if it is not already present and +* increments the active buffer count +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @param[in] pv_ptr +* Pointer to the buffer to be added +* +* @returns Returns 0 on success, -1 otherwise +* +* @remarks +* None +* +******************************************************************************* +*/ +IH264_ERROR_T ih264_buf_mgr_add(buf_mgr_t *ps_buf_mgr, + void *pv_ptr, + WORD32 buf_id) +{ + + IH264_ERROR_T ret = IH264_SUCCESS; + ret = ih264_buf_mgr_lock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), ret); + + /* Check if buffer ID is within allowed range */ + if(buf_id >= ps_buf_mgr->i4_max_buf_cnt) + { + ret = ih264_buf_mgr_unlock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), ret); + + return IH264_FAIL; + } + + /* Check if the current ID is being used to hold some other buffer */ + if((ps_buf_mgr->apv_ptr[buf_id] != NULL) && + (ps_buf_mgr->apv_ptr[buf_id] !=pv_ptr)) + { + ret = 
ih264_buf_mgr_unlock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), ret); + + return IH264_FAIL; + } + ps_buf_mgr->apv_ptr[buf_id] = pv_ptr; + ps_buf_mgr->i4_active_buf_cnt++; + + ret = ih264_buf_mgr_unlock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), ret); + + return ret; +} + +/** +******************************************************************************* +* +* @brief +* Gets the next free buffer. +* +* @par Description: +* Returns the next free buffer available and sets the corresponding status +* to DEC +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @param[in] pi4_buf_id +* Pointer to the id of the free buffer +* +* @returns Pointer to the free buffer +* +* @remarks +* None +* +******************************************************************************* +*/ +void* ih264_buf_mgr_get_next_free(buf_mgr_t *ps_buf_mgr, WORD32 *pi4_buf_id) +{ + WORD32 id; + void *pv_ret_ptr; + IH264_ERROR_T ret = IH264_SUCCESS; + ret = ih264_buf_mgr_lock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), NULL); + + pv_ret_ptr = NULL; + for(id = 0; id < ps_buf_mgr->i4_active_buf_cnt; id++) + { + /* Check if the buffer is non-null and status is zero */ + if((ps_buf_mgr->au4_status[id] == 0) && (ps_buf_mgr->apv_ptr[id])) + { + *pi4_buf_id = id; + /* DEC is set to 1 */ + ps_buf_mgr->au4_status[id] = 1; + pv_ret_ptr = ps_buf_mgr->apv_ptr[id]; + break; + } + } + ret = ih264_buf_mgr_unlock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), NULL); + + return pv_ret_ptr; +} + + +/** +******************************************************************************* +* +* @brief +* Checks the buffer manager for free buffers available. 
+* +* @par Description: +* Checks if there are any free buffers available +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @returns Returns 0 if available, -1 otherwise +* +* @remarks +* None +* +******************************************************************************* +*/ +IH264_ERROR_T ih264_buf_mgr_check_free(buf_mgr_t *ps_buf_mgr) +{ + WORD32 id; + IH264_ERROR_T ret = IH264_SUCCESS; + IH264_ERROR_T rettmp = IH264_SUCCESS; + rettmp = ih264_buf_mgr_lock(ps_buf_mgr); + RETURN_IF((rettmp != IH264_SUCCESS), ret); + + ret = IH264_FAIL; + for(id = 0; id < ps_buf_mgr->i4_active_buf_cnt; id++) + { + if((ps_buf_mgr->au4_status[id] == 0) && + (ps_buf_mgr->apv_ptr[id])) + { + ret = IH264_SUCCESS; + break; + } + } + rettmp = ih264_buf_mgr_unlock(ps_buf_mgr); + RETURN_IF((rettmp != IH264_SUCCESS), ret); + + return ret; + +} + + +/** +******************************************************************************* +* +* @brief +* Resets the status bits. +* +* @par Description: +* resets the status bits that the mask contains (status corresponding to +* the id) +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @param[in] buf_id +* ID of the buffer status to be released +* +* @param[in] mask +* Contains the bits that are to be reset +* +* @returns 0 if success, -1 otherwise +* +* @remarks +* None +* +******************************************************************************* +*/ +IH264_ERROR_T ih264_buf_mgr_release(buf_mgr_t *ps_buf_mgr, + WORD32 buf_id, + UWORD32 mask) +{ + IH264_ERROR_T ret = IH264_SUCCESS; + ret = ih264_buf_mgr_lock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), ret); + + + /* If the given id is pointing to an id which is not yet added */ + if(buf_id >= ps_buf_mgr->i4_active_buf_cnt) + { + ret = ih264_buf_mgr_unlock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), ret); + return IH264_FAIL; + } + + ps_buf_mgr->au4_status[buf_id] &= ~mask; + + +/* If both the REF and DISP are zero, DEC is set to zero */ + 
if(ps_buf_mgr->au4_status[buf_id] == 1) + { + ps_buf_mgr->au4_status[buf_id] = 0; + } + + + ret = ih264_buf_mgr_unlock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), ret); + + return ret; +} + + +/** +******************************************************************************* +* +* @brief +* Sets the status bit. +* +* @par Description: +* sets the status bits that the mask contains (status corresponding to the +* id) +* +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @param[in] buf_id +* ID of the buffer whose status needs to be modified +* +* +* @param[in] mask +* Contains the bits that are to be set +* +* @returns 0 if success, -1 otherwise +* +* @remarks +* None +* +******************************************************************************* +*/ +IH264_ERROR_T ih264_buf_mgr_set_status(buf_mgr_t *ps_buf_mgr, + WORD32 buf_id, + UWORD32 mask) +{ + IH264_ERROR_T ret = IH264_SUCCESS; + ret = ih264_buf_mgr_lock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), ret); + + if(buf_id >= ps_buf_mgr->i4_active_buf_cnt) + { + ret = ih264_buf_mgr_unlock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), ret); + return IH264_FAIL; + } + + + if((ps_buf_mgr->au4_status[buf_id] & mask) != 0) + { + ret = ih264_buf_mgr_unlock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), ret); + return IH264_FAIL; + } + + ps_buf_mgr->au4_status[buf_id] |= mask; + ret = ih264_buf_mgr_unlock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), ret); + + return ret; +} + + +/** +******************************************************************************* +* +* @brief +* Returns the status of the buffer. 
+* +* @par Description: +* Returns the status of the buffer corresponding to the id +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @param[in] buf_id +* ID of the buffer status required +* +* @returns Status of the buffer corresponding to the id +* +* @remarks +* None +* +******************************************************************************* +*/ +WORD32 ih264_buf_mgr_get_status( buf_mgr_t *ps_buf_mgr, WORD32 buf_id ) +{ + IH264_ERROR_T ret = IH264_SUCCESS; + UWORD32 status; + + ret = ih264_buf_mgr_lock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), ret); + + status = ps_buf_mgr->au4_status[buf_id]; + + ret = ih264_buf_mgr_unlock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), ret); + + return status; +} + + +/** +******************************************************************************* +* +* @brief +* Gets the buffer from the buffer manager +* +* @par Description: +* Returns the pointer to the buffer corresponding to the id +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @param[in] buf_id +* ID of the buffer required +* +* @returns Pointer to the buffer required +* +* @remarks +* None +* +******************************************************************************* +*/ +void* ih264_buf_mgr_get_buf(buf_mgr_t *ps_buf_mgr, WORD32 buf_id) +{ + IH264_ERROR_T ret = IH264_SUCCESS; + void *pv_buf; + ret = ih264_buf_mgr_lock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), NULL); + + pv_buf = ps_buf_mgr->apv_ptr[buf_id]; + + ret = ih264_buf_mgr_unlock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), NULL); + + return pv_buf; +} + + +/** +******************************************************************************* +* +* @brief +* Gets the buffer id from the buffer manager if the buffer is added to the +* buffer manager +* +* @par Description: +* Returns the buffer id corresponding to the given buffer if it exists +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @param[in] pv_buf +* Pointer to the 
buffer +* +* @returns Buffer id if exists, else -1 +* +* @remarks +* None +* +******************************************************************************* +*/ +WORD32 ih264_buf_mgr_get_bufid(buf_mgr_t *ps_buf_mgr, void *pv_buf) +{ + WORD32 id; + WORD32 buf_id = -1; + IH264_ERROR_T ret = IH264_SUCCESS; + ret = ih264_buf_mgr_lock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), ret); + + for(id = 0; id < ps_buf_mgr->i4_active_buf_cnt; id++) + { + if(ps_buf_mgr->apv_ptr[id] == pv_buf) + { + buf_id = id; + break; + } + } + ret = ih264_buf_mgr_unlock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), ret); + + return buf_id; +} + + +/** +******************************************************************************* +* +* @brief +* Gets the no.of active buffer +* +* @par Description: +* Return the number of active buffers in the buffer manager +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @returns number of active buffers +* +* @remarks +* None +* +******************************************************************************* +*/ +UWORD32 ih264_buf_mgr_get_num_active_buf(buf_mgr_t *ps_buf_mgr) +{ + UWORD32 u4_buf_cnt; + IH264_ERROR_T ret = IH264_SUCCESS; + + u4_buf_cnt = 0; + + ret = ih264_buf_mgr_lock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), ret); + u4_buf_cnt = ps_buf_mgr->i4_active_buf_cnt; + + ret = ih264_buf_mgr_unlock(ps_buf_mgr); + RETURN_IF((ret != IH264_SUCCESS), ret); + + return u4_buf_cnt; +} diff --git a/common/ih264_buf_mgr.h b/common/ih264_buf_mgr.h new file mode 100755 index 0000000..52efa70 --- /dev/null +++ b/common/ih264_buf_mgr.h @@ -0,0 +1,122 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_buf_mgr.h +* +* @brief +* Function declarations used for buffer management +* +* @remarks +* None +* +******************************************************************************* +*/ +#ifndef _IH264_BUF_MGR_H_ +#define _IH264_BUF_MGR_H_ + +#define BUF_MGR_MAX_CNT 64 + +/** Flag for current encoding decoder */ +#define BUF_MGR_CODEC (1 << 1) + +/** Flag for reference status */ +#define BUF_MGR_REF (1 << 2) + +/** Flag for I/O - Display/output in case of decoder, capture/input in case of encoder */ +#define BUF_MGR_IO (1 << 3) + +typedef struct +{ + /** + * Mutex used to keep the functions thread-safe + */ + void *pv_mutex; + + /** + * max_buf_cnt + */ + WORD32 i4_max_buf_cnt; + + /** + * active_buf_cnt + */ + WORD32 i4_active_buf_cnt; + + /** + * au4_status[BUF_MGR_MAX_CNT] + */ + UWORD32 au4_status[BUF_MGR_MAX_CNT]; + + /* The last three bit of status are: */ + + /* Bit 0 - IN USE */ + /* Bit 1 - CODEC */ + /* Bit 2 - REF */ + /* Bit 3 - DISP/IO/RECON */ + void *apv_ptr[BUF_MGR_MAX_CNT]; + +}buf_mgr_t; + +// Returns size of the buffer manager context +WORD32 ih264_buf_mgr_size(void); + +//Free buffer manager +IH264_ERROR_T ih264_buf_mgr_free(buf_mgr_t *ps_buf_mgr); + +// Initializes the buffer API structure +void *ih264_buf_mgr_init(void *pv_buf); + +// Add buffer to buffer 
manager. 0: success, -1: fail (u4_active_buf_cnt has reached u4_max_buf_cnt) +IH264_ERROR_T ih264_buf_mgr_add(buf_mgr_t *ps_buf_mgr, + void *pv_ptr, + WORD32 buf_id); + +// this function will set the buffer status to DEC +void* ih264_buf_mgr_get_next_free(buf_mgr_t *ps_buf_mgr, WORD32 *pi4_id); + +// this function will check if there are any free buffers +IH264_ERROR_T ih264_buf_mgr_check_free(buf_mgr_t *ps_buf_mgr); + +// mask will have who released it: DISP:REF:DEC +IH264_ERROR_T ih264_buf_mgr_release(buf_mgr_t *ps_buf_mgr, + WORD32 id, + UWORD32 mask); + +// sets the status to one or all of DISP:REF:DEC +IH264_ERROR_T ih264_buf_mgr_set_status(buf_mgr_t *ps_buf_mgr, + WORD32 id, + UWORD32 mask); + +// Gets status of the buffer +WORD32 ih264_buf_mgr_get_status(buf_mgr_t *ps_buf_mgr, WORD32 id); + +// pass the ID - buffer will be returned +void* ih264_buf_mgr_get_buf(buf_mgr_t *ps_buf_mgr, WORD32 id); +//Pass buffer to get ID +WORD32 ih264_buf_mgr_get_bufid(buf_mgr_t *ps_buf_mgr, void *pv_buf); + +// will return number of active buffers +UWORD32 ih264_buf_mgr_get_num_active_buf(buf_mgr_t *ps_buf_mgr); + + + +#endif /* _IH264_BUF_MGR_H_ */ diff --git a/common/ih264_cabac_tables.c b/common/ih264_cabac_tables.c new file mode 100755 index 0000000..118ca12 --- /dev/null +++ b/common/ih264_cabac_tables.c @@ -0,0 +1,10869 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + + +/** +****************************************************************************** +* @file +* ih264_cabac_tables.c +* +* @brief +* This file contains H264 cabac tables for init contexts, rlps and +* cabac state trasnitions +* +* @author +* Ittiam +* +* @par List of Tables +* - gau1_ih264_cab_ctxts[] +* - gau1_ih264_next_state[] +* - gau1_ih264_cab_ctxts[][][] +* +****************************************************************************** +*/ + + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_cabac_tables.h" + + +/*****************************************************************************/ +/* Extern global definitions */ +/*****************************************************************************/ + +/** + ****************************************************************************** + * @brief Table for rangeTabLPS depending on pStateIdx and qCodIRangeIdx + * input : pStateIdx(0-63) and qCodIRangeIdx(0-3) [(Range >> 6) & 0x3] + * output : RLPS + * + * @remarks See Table 9-35 of H264 spec for rangeTabLPS + ****************************************************************************** + */ +const UWORD8 gau1_ih264_cabac_rlps[64][4] = +{ + { 128, 176, 208, 240}, + { 128, 167, 197, 227}, + { 128, 158, 187, 216}, + { 123, 150, 178, 205}, + { 116, 142, 169, 195}, + { 111, 135, 160, 185}, + { 105, 128, 152, 175}, + { 100, 122, 144, 166}, + { 95, 116, 137, 158}, + { 90, 110, 130, 150}, + { 85, 104, 123, 142}, + { 81, 99, 117, 135}, + { 77, 94, 111, 128}, + { 73, 89, 105, 122}, + { 
69, 85, 100, 116}, + { 66, 80, 95, 110}, + { 62, 76, 90, 104}, + { 59, 72, 86, 99}, + { 56, 69, 81, 94}, + { 53, 65, 77, 89}, + { 51, 62, 73, 85}, + { 48, 59, 69, 80}, + { 46, 56, 66, 76}, + { 43, 53, 63, 72}, + { 41, 50, 59, 69}, + { 39, 48, 56, 65}, + { 37, 45, 54, 62}, + { 35, 43, 51, 59}, + { 33, 41, 48, 56}, + { 32, 39, 46, 53}, + { 30, 37, 43, 50}, + { 29, 35, 41, 48}, + { 27, 33, 39, 45}, + { 26, 31, 37, 43}, + { 24, 30, 35, 41}, + { 23, 28, 33, 39}, + { 22, 27, 32, 37}, + { 21, 26, 30, 35}, + { 20, 24, 29, 33}, + { 19, 23, 27, 31}, + { 18, 22, 26, 30}, + { 17, 21, 25, 28}, + { 16, 20, 23, 27}, + { 15, 19, 22, 25}, + { 14, 18, 21, 24}, + { 14, 17, 20, 23}, + { 13, 16, 19, 22}, + { 12, 15, 18, 21}, + { 12, 14, 17, 20}, + { 11, 14, 16, 19}, + { 11, 13, 15, 18}, + { 10, 12, 15, 17}, + { 10, 12, 14, 16}, + { 9, 11, 13, 15}, + { 9, 11, 12, 14}, + { 8, 10, 12, 14}, + { 8, 9, 11, 13}, + { 7, 9, 11, 12}, + { 7, 9, 10, 12}, + { 7, 8, 10, 11}, + { 6, 8, 9, 11}, + { 6, 7, 9, 10}, + { 6, 7, 8, 9}, + { 2, 2, 2, 2} +}; + +/** + ****************************************************************************** + * @brief probaility+MPS state transition tables based on cur State and bin + * input : curpState[bits7-2] | curMPS[bit1] | decodedBin[bit0] + * output : nextpState[bits6-1] | nextMPS[bit0] + * @remarks Modified form of Table-9-36 State Transition table in H264 spec + ****************************************************************************** + */ +const UWORD8 gau1_ih264_next_state[64 * 2 * 2] = +{ +/*****************************************************************************/ +/* m=0,b=0 | m=0,b=1 | m=1,b=0 | m=1,b=1 */ +/*****************************************************************************/ + 2, 1, 0, 3,/* mps reversal for m=0,b=1 / m=1,b=0 */ + 4, 0, 1, 5, + 6, 2, 3, 7, + 8, 4, 5, 9, + 10, 4, 5, 11, + 12, 8, 9, 13, + 14, 8, 9, 15, + 16, 10, 11, 17, + 18, 12, 13, 19, + 20, 14, 15, 21, + 22, 16, 17, 23, + 24, 18, 19, 25, + 26, 18, 19, 27, + 28, 22, 23, 29, 
+ 30, 22, 23, 31, + 32, 24, 25, 33, + 34, 26, 27, 35, + 36, 26, 27, 37, + 38, 30, 31, 39, + 40, 30, 31, 41, + 42, 32, 33, 43, + 44, 32, 33, 45, + 46, 36, 37, 47, + 48, 36, 37, 49, + 50, 38, 39, 51, + 52, 38, 39, 53, + 54, 42, 43, 55, + 56, 42, 43, 57, + 58, 44, 45, 59, + 60, 44, 45, 61, + 62, 46, 47, 63, + 64, 48, 49, 65, + 66, 48, 49, 67, + 68, 50, 51, 69, + 70, 52, 53, 71, + 72, 52, 53, 73, + 74, 54, 55, 75, + 76, 54, 55, 77, + 78, 56, 57, 79, + 80, 58, 59, 81, + 82, 58, 59, 83, + 84, 60, 61, 85, + 86, 60, 61, 87, + 88, 60, 61, 89, + 90, 62, 63, 91, + 92, 64, 65, 93, + 94, 64, 65, 95, + 96, 66, 67, 97, + 98, 66, 67, 99, + 100, 66, 67, 101, + 102, 68, 69, 103, + 104, 68, 69, 105, + 106, 70, 71, 107, + 108, 70, 71, 109, + 110, 70, 71, 111, + 112, 72, 73, 113, + 114, 72, 73, 115, + 116, 72, 73, 117, + 118, 74, 75, 119, + 120, 74, 75, 121, + 122, 74, 75, 123, + 124, 76, 77, 125, + 124, 76, 77, 125, + 126, 126, 127, 127 +}; + + +/* +****************************************************************************** +* As per H264 standard the cabac initialization of context variables +* are generated using following logic +* (ref: section 9.3.1.1 of ITU-T Rec. 
H.264 (03/2005)) +* +* The two values assigned to pStateIdx and valMPS during this initialization +* are derived from SliceQPY +* +* Given the two table entries [m, n] (for a given slice type, context index and +* cabac_init_idc), the initialization is specified by the following pseudo-code process +* +* preCtxState = Clip3( 1, 126, ( ( m * Clip3( 0, 51, SliceQPY ) ) >> 4 ) + n ) +* if( preCtxState <= 63 ) { +* pStateIdx = 63 - preCtxState +* valMPS = 0 +* } else { +* pStateIdx = preCtxState - 64 +* valMPS = 1 +* } +****************************************************************************** +*/ + +/** + ****************************************************************************** + * @brief Init context tables for all combinations of qp and cabac_init_idc + * @remarks Packing format MPS in lsb and pState in bits[1-6] + ****************************************************************************** + */ +const UWORD8 gau1_ih264_cab_ctxts[IH264_NUM_CABAC_INIT_IDC_PLUS_ONE][IH264_MAX_QP][IH264_NUM_CABAC_CTXTS] = +{ + + { + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 0 */ + + 124, 18, 21, 124, 18, 21, 125, 81, 20, 18, + 24, 60, 122, 124, 108, 28, 109, 12, 29, 3, + 2, 28, 19, 26, 1, 40, 124, 7, 53, 81, + 125, 81, 7, 29, 3, 2, 45, 63, 4, 36, + 11, 35, 65, 16, 7, 45, 49, 10, 25, 61, + 18, 11, 35, 49, 7, 21, 21, 33, 17, 10, + 44, 0, 0, 0, 39, 45, 67, 17, 44, 2, + 36, 29, 65, 125, 69, 75, 7, 37, 61, 39, + 93, 55, 77, 59, 125, 57, 51, 65, 89, 34, + 3, 12, 59, 21, 57, 47, 125, 18, 6, 8, + 11, 30, 9, 11, 49, 43, 29, 23, 27, 18, + 26, 9, 26, 42, 35, 0, 13, 7, 12, 25, + 56, 1, 4, 56, 76, 78, 68, 54, 59, 19, + 19, 34, 28, 73, 20, 20, 20, 4, 14, 14, + 0, 6, 2, 12, 11, 12, 48, 24, 9, 1, + 4, 0, 26, 48, 38, 22, 30, 6, 8, 8, + 60, 38, 40, 29, 6, 11, 70, 46, 38, 28, + 34, 38, 24, 32, 48, 2, 34, 18, 18, 10, + 0, 24, 12, 20, 22, 16, 36, 54, 20, 37, + 16, 29, 34, 64, 41, 112, 124, 120, 118, 124, + 124, 114, 114, 108, 88, 72, 66, 86, 58, 13, + 7, 8, 7, 66, 
62, 56, 68, 64, 50, 40, + 44, 0, 8, 1, 61, 51, 89, 25, 38, 36, + 22, 1, 8, 13, 23, 37, 77, 27, 78, 42, + 30, 16, 8, 15, 39, 47, 111, 10, 68, 54, + 50, 40, 16, 10, 1, 21, 53, 13, 68, 64, + 42, 8, 10, 17, 35, 67, 10, 116, 98, 90, + 72, 46, 10, 13, 31, 43, 124, 85, 85, 47, + 101, 93, 69, 93, 85, 79, 87, 89, 97, 65, + 63, 55, 59, 61, 45, 7, 33, 43, 13, 6, + 10, 4, 26, 26, 28, 18, 44, 34, 24, 28, + 22, 44, 32, 16, 44, 38, 26, 20, 28, 0, + 1, 11, 8, 13, 38, 64, 40, 20, 58, 50, + 22, 46, 62, 38, 50, 26, 12, 40, 104, 98, + 104, 104, 108, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 68, 124, 124, 124, 124, 124, 124, 108, + 74, 72, 12, 37, 23, 67, 123, 124, 124, 124, + 114, 110, 106, 82, 88, 62, 64, 44, 38, 32, + 3, 15, 6, 0, 3, 78, 86, 80, 62, 80, + 78, 46, 62, 68, 42, 12, 20, 4, 45, 46, + 24, 8, 31, 15, 11, 13, 5, 9, 19, 11, + 13, 7, 2, 13, 5, 3, 0, 124, 124, 124, + 124, 124, 120, 108, 72, 8, 5, 56, 42, 36, + 30, 14, 6, 2, 5, 25, 43, 35, 27, 35, + 33, 19, 21, 39, 15, 7, 4, 5, 5, 8, + 8, 124, 124, 124, 124, 122, 114, 92, 58, 2, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 1 */ + + 124, 18, 21, 124, 18, 21, 123, 77, 22, 20, + 24, 58, 120, 124, 108, 28, 103, 12, 27, 1, + 2, 28, 17, 24, 3, 40, 124, 9, 55, 81, + 121, 77, 7, 27, 1, 2, 43, 59, 6, 36, + 9, 33, 63, 16, 7, 43, 49, 10, 23, 59, + 18, 11, 33, 49, 5, 19, 19, 31, 15, 10, + 44, 0, 0, 0, 37, 45, 67, 15, 44, 2, + 36, 27, 63, 121, 65, 71, 3, 33, 57, 37, + 89, 51, 73, 57, 123, 55, 49, 63, 87, 36, + 1, 14, 57, 19, 55, 45, 121, 18, 6, 8, + 11, 32, 9, 9, 47, 41, 27, 21, 25, 18, + 26, 7, 26, 42, 33, 0, 11, 7, 12, 23, + 56, 1, 4, 56, 74, 78, 68, 54, 57, 17, + 17, 34, 28, 71, 20, 20, 20, 6, 14, 14, + 2, 8, 4, 12, 9, 12, 48, 24, 9, 1, + 4, 0, 26, 46, 38, 22, 30, 8, 10, 8, + 58, 38, 40, 27, 6, 11, 70, 46, 38, 28, + 34, 38, 24, 32, 48, 2, 34, 18, 18, 10, + 0, 24, 12, 20, 22, 16, 36, 54, 20, 35, + 16, 27, 34, 62, 39, 110, 124, 118, 116, 122, + 124, 112, 112, 104, 86, 70, 64, 82, 56, 15, + 7, 8, 7, 
64, 60, 54, 66, 62, 48, 38, + 42, 0, 8, 1, 59, 49, 87, 23, 40, 36, + 22, 0, 10, 11, 21, 35, 73, 25, 78, 42, + 30, 16, 10, 13, 37, 45, 107, 10, 70, 56, + 50, 40, 18, 10, 1, 19, 51, 13, 70, 64, + 42, 8, 12, 15, 33, 65, 10, 116, 98, 90, + 72, 46, 10, 11, 29, 41, 124, 83, 83, 45, + 97, 89, 67, 89, 81, 75, 83, 85, 93, 63, + 61, 53, 57, 57, 43, 7, 31, 41, 11, 6, + 10, 4, 26, 26, 26, 16, 44, 34, 26, 28, + 22, 44, 32, 16, 44, 38, 26, 20, 28, 0, + 1, 9, 10, 13, 38, 64, 40, 20, 58, 50, + 24, 46, 60, 38, 50, 26, 12, 38, 104, 98, + 104, 102, 106, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 66, 124, 124, 124, 124, 124, 124, 106, + 72, 70, 12, 35, 21, 63, 117, 124, 124, 124, + 112, 106, 104, 80, 84, 60, 62, 42, 36, 30, + 5, 15, 6, 0, 5, 76, 84, 78, 60, 78, + 76, 44, 60, 66, 40, 10, 18, 2, 45, 46, + 24, 8, 29, 13, 9, 11, 3, 7, 15, 9, + 11, 5, 6, 9, 3, 0, 4, 124, 124, 124, + 124, 124, 116, 102, 68, 4, 3, 58, 44, 38, + 32, 16, 8, 4, 3, 23, 41, 33, 25, 33, + 29, 15, 19, 37, 13, 5, 6, 3, 3, 8, + 8, 124, 124, 124, 124, 116, 108, 86, 52, 1, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 2 */ + + 124, 18, 21, 124, 18, 21, 119, 75, 22, 20, + 24, 56, 118, 122, 108, 28, 99, 12, 25, 0, + 2, 26, 17, 22, 5, 38, 120, 13, 57, 83, + 115, 75, 7, 25, 0, 2, 43, 57, 6, 34, + 9, 33, 61, 16, 7, 43, 49, 10, 23, 57, + 18, 11, 33, 49, 5, 19, 19, 31, 15, 10, + 44, 0, 0, 0, 35, 45, 67, 15, 42, 2, + 36, 27, 63, 117, 61, 67, 1, 29, 55, 35, + 87, 49, 71, 55, 119, 55, 49, 63, 85, 36, + 1, 14, 55, 19, 53, 45, 119, 18, 6, 8, + 11, 32, 9, 9, 47, 41, 27, 21, 25, 18, + 26, 7, 26, 42, 33, 0, 11, 7, 12, 23, + 54, 1, 4, 54, 72, 76, 66, 52, 55, 17, + 17, 32, 26, 71, 18, 20, 20, 6, 14, 14, + 4, 8, 4, 12, 9, 12, 46, 24, 11, 1, + 4, 1, 26, 44, 38, 22, 28, 8, 10, 8, + 56, 38, 38, 27, 6, 13, 68, 46, 38, 28, + 34, 38, 24, 32, 48, 2, 34, 18, 18, 10, + 0, 24, 12, 20, 22, 16, 34, 52, 18, 35, + 16, 27, 32, 60, 39, 106, 124, 114, 112, 118, + 120, 108, 108, 100, 82, 66, 60, 78, 52, 17, + 7, 8, 9, 
62, 58, 52, 64, 58, 46, 36, + 40, 1, 6, 3, 59, 49, 85, 23, 40, 36, + 22, 0, 10, 11, 21, 35, 71, 23, 78, 42, + 30, 16, 10, 13, 35, 43, 103, 10, 70, 56, + 50, 40, 18, 10, 1, 19, 49, 13, 70, 64, + 42, 8, 12, 15, 33, 63, 10, 114, 96, 88, + 70, 46, 10, 11, 29, 41, 124, 81, 81, 43, + 95, 87, 65, 87, 79, 73, 81, 83, 89, 61, + 59, 53, 55, 55, 43, 9, 31, 39, 11, 6, + 8, 4, 24, 24, 24, 14, 42, 34, 26, 28, + 20, 42, 32, 16, 42, 36, 26, 20, 26, 0, + 1, 9, 10, 13, 36, 62, 38, 20, 56, 48, + 24, 44, 58, 38, 50, 24, 10, 34, 102, 96, + 102, 100, 104, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 64, 124, 124, 124, 124, 124, 124, 102, + 70, 68, 12, 33, 21, 61, 113, 120, 120, 124, + 108, 102, 100, 76, 80, 58, 58, 40, 32, 28, + 7, 17, 4, 0, 7, 74, 82, 74, 56, 74, + 72, 42, 56, 62, 38, 8, 16, 0, 47, 44, + 22, 6, 29, 13, 9, 9, 3, 5, 13, 7, + 9, 3, 8, 7, 1, 2, 6, 124, 124, 124, + 124, 120, 110, 96, 62, 0, 3, 58, 44, 38, + 32, 18, 8, 4, 3, 23, 41, 33, 23, 33, + 27, 13, 19, 35, 11, 3, 6, 3, 1, 8, + 8, 124, 124, 124, 120, 110, 100, 78, 46, 7, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 3 */ + + 124, 18, 21, 124, 18, 21, 115, 71, 24, 20, + 22, 52, 114, 120, 108, 28, 95, 12, 23, 2, + 2, 24, 17, 20, 7, 38, 116, 15, 59, 83, + 109, 73, 7, 23, 2, 2, 41, 55, 8, 34, + 9, 31, 59, 14, 9, 43, 49, 10, 23, 57, + 18, 11, 33, 49, 3, 19, 19, 31, 13, 10, + 44, 0, 0, 0, 35, 45, 67, 13, 40, 2, + 36, 27, 63, 113, 57, 65, 2, 25, 53, 33, + 83, 47, 69, 53, 115, 53, 49, 61, 83, 36, + 1, 14, 55, 19, 53, 43, 115, 18, 4, 6, + 13, 32, 9, 9, 45, 41, 25, 21, 23, 18, + 26, 7, 26, 40, 33, 0, 11, 7, 12, 23, + 52, 1, 4, 52, 70, 74, 64, 50, 55, 15, + 17, 30, 26, 69, 18, 20, 20, 6, 14, 14, + 6, 8, 4, 12, 7, 12, 44, 24, 13, 1, + 4, 1, 24, 42, 38, 22, 26, 8, 10, 8, + 52, 38, 36, 27, 6, 13, 66, 46, 38, 28, + 34, 38, 24, 32, 48, 2, 32, 18, 18, 10, + 0, 22, 10, 18, 20, 14, 32, 50, 18, 35, + 14, 27, 30, 56, 39, 104, 124, 110, 108, 114, + 116, 104, 104, 96, 78, 64, 58, 74, 48, 19, + 7, 8, 9, 60, 
56, 50, 60, 56, 42, 34, + 38, 3, 6, 3, 59, 49, 85, 21, 40, 36, + 22, 0, 10, 11, 21, 33, 69, 23, 78, 42, + 30, 16, 12, 11, 33, 41, 99, 10, 70, 56, + 50, 40, 20, 10, 1, 19, 49, 13, 70, 64, + 40, 8, 12, 15, 33, 61, 10, 114, 96, 86, + 68, 46, 10, 11, 27, 39, 124, 79, 79, 43, + 93, 85, 63, 83, 77, 71, 79, 79, 87, 61, + 57, 53, 55, 51, 43, 9, 31, 39, 11, 4, + 8, 4, 22, 22, 22, 12, 42, 32, 26, 26, + 20, 42, 30, 16, 40, 36, 24, 20, 24, 0, + 3, 9, 10, 15, 36, 62, 36, 20, 54, 48, + 24, 42, 56, 36, 48, 22, 10, 32, 100, 94, + 102, 98, 102, 122, 124, 124, 124, 124, 124, 124, + 124, 124, 62, 124, 124, 124, 124, 124, 124, 98, + 68, 66, 12, 31, 21, 59, 109, 116, 116, 124, + 104, 98, 96, 74, 76, 54, 56, 38, 30, 24, + 9, 19, 4, 1, 9, 72, 78, 72, 52, 70, + 68, 38, 54, 58, 34, 6, 12, 3, 49, 42, + 20, 4, 29, 11, 9, 9, 1, 5, 11, 5, + 7, 1, 10, 5, 0, 6, 8, 124, 124, 124, + 124, 116, 104, 90, 56, 3, 1, 60, 46, 40, + 32, 20, 10, 4, 1, 21, 41, 31, 23, 31, + 25, 11, 19, 35, 11, 3, 6, 1, 0, 8, + 8, 124, 124, 124, 114, 104, 92, 70, 38, 11, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 4 */ + + 124, 18, 21, 124, 18, 21, 113, 69, 24, 20, + 22, 50, 112, 116, 108, 28, 89, 10, 21, 2, + 2, 22, 17, 18, 9, 36, 112, 19, 61, 85, + 103, 71, 7, 21, 2, 2, 41, 53, 8, 32, + 9, 31, 59, 14, 9, 41, 49, 10, 23, 55, + 16, 13, 33, 49, 3, 17, 19, 29, 13, 10, + 44, 0, 0, 0, 33, 47, 67, 13, 38, 2, + 36, 27, 63, 111, 55, 61, 4, 23, 51, 31, + 81, 43, 67, 51, 111, 53, 47, 61, 81, 36, + 1, 14, 53, 19, 51, 43, 113, 16, 4, 6, + 13, 32, 9, 9, 45, 41, 25, 21, 23, 18, + 24, 7, 26, 40, 33, 0, 11, 7, 12, 23, + 52, 3, 4, 52, 68, 72, 62, 48, 53, 15, + 17, 28, 24, 69, 16, 20, 18, 6, 14, 14, + 8, 10, 4, 10, 7, 10, 42, 22, 15, 1, + 4, 3, 24, 40, 36, 20, 26, 10, 10, 8, + 50, 36, 34, 27, 6, 15, 66, 46, 38, 28, + 34, 38, 24, 32, 46, 2, 32, 18, 18, 10, + 1, 22, 10, 18, 20, 14, 32, 48, 16, 35, + 14, 27, 28, 54, 39, 100, 124, 106, 104, 110, + 112, 100, 100, 92, 74, 60, 54, 68, 44, 21, + 7, 6, 11, 58, 54, 
48, 58, 52, 40, 32, + 34, 3, 4, 5, 59, 49, 83, 21, 40, 36, + 22, 0, 10, 11, 21, 33, 67, 21, 78, 42, + 30, 16, 12, 11, 33, 41, 95, 10, 70, 56, + 50, 40, 20, 10, 1, 19, 47, 13, 70, 62, + 40, 8, 12, 15, 33, 61, 10, 112, 94, 84, + 66, 46, 10, 11, 27, 39, 124, 77, 77, 41, + 89, 83, 61, 81, 73, 69, 75, 77, 83, 59, + 57, 51, 53, 49, 41, 11, 31, 37, 11, 4, + 6, 2, 20, 20, 20, 10, 40, 32, 26, 26, + 18, 40, 30, 16, 38, 34, 24, 18, 22, 1, + 3, 9, 10, 15, 34, 60, 34, 20, 52, 46, + 24, 40, 54, 36, 48, 20, 8, 28, 98, 94, + 100, 96, 98, 120, 124, 124, 124, 124, 124, 124, + 124, 124, 58, 124, 124, 124, 124, 124, 124, 94, + 66, 62, 12, 29, 19, 57, 105, 114, 112, 120, + 102, 94, 92, 70, 72, 52, 52, 34, 26, 22, + 11, 21, 2, 1, 11, 68, 76, 68, 50, 66, + 64, 36, 50, 54, 32, 4, 10, 5, 49, 40, + 20, 2, 29, 11, 7, 7, 1, 3, 9, 5, + 5, 0, 12, 3, 2, 8, 10, 124, 124, 124, + 122, 110, 98, 84, 50, 9, 1, 60, 46, 40, + 34, 20, 10, 6, 1, 21, 39, 31, 21, 31, + 23, 9, 19, 33, 9, 1, 6, 1, 2, 8, + 8, 124, 124, 122, 108, 98, 84, 62, 32, 17, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 5 */ + + 124, 18, 21, 124, 18, 21, 109, 65, 24, 20, + 20, 46, 108, 114, 108, 28, 85, 10, 19, 4, + 2, 22, 15, 16, 11, 36, 108, 23, 63, 85, + 97, 67, 7, 19, 4, 2, 41, 51, 8, 32, + 9, 31, 57, 14, 11, 41, 49, 10, 23, 53, + 16, 13, 33, 49, 1, 17, 17, 29, 11, 10, + 44, 0, 0, 0, 33, 47, 67, 11, 36, 2, + 36, 25, 63, 107, 51, 59, 8, 19, 47, 29, + 79, 41, 65, 49, 107, 51, 47, 59, 79, 36, + 1, 14, 53, 19, 51, 41, 109, 16, 4, 6, + 13, 32, 9, 7, 43, 41, 25, 21, 21, 18, + 24, 7, 26, 40, 31, 0, 9, 7, 12, 23, + 50, 3, 4, 50, 66, 72, 60, 46, 51, 13, + 17, 26, 24, 67, 16, 20, 18, 6, 14, 14, + 10, 10, 4, 10, 7, 10, 40, 22, 17, 1, + 4, 3, 22, 38, 36, 20, 24, 10, 10, 8, + 48, 36, 32, 27, 6, 15, 64, 46, 38, 28, + 34, 38, 24, 32, 46, 2, 32, 18, 18, 10, + 1, 22, 10, 16, 20, 14, 30, 46, 16, 35, + 12, 27, 26, 52, 39, 98, 122, 104, 102, 106, + 108, 96, 96, 88, 70, 56, 50, 64, 42, 23, + 7, 6, 11, 56, 52, 46, 56, 50, 
36, 30, + 32, 5, 4, 5, 59, 49, 83, 21, 40, 36, + 22, 0, 10, 9, 19, 31, 65, 21, 78, 42, + 30, 16, 12, 9, 31, 39, 91, 10, 70, 56, + 50, 40, 20, 10, 1, 19, 45, 13, 72, 62, + 38, 8, 12, 15, 33, 59, 10, 112, 92, 82, + 64, 46, 10, 11, 27, 37, 124, 75, 75, 39, + 87, 81, 59, 79, 71, 67, 73, 73, 79, 57, + 55, 51, 53, 47, 41, 11, 29, 35, 11, 2, + 6, 2, 20, 18, 18, 8, 38, 30, 26, 24, + 18, 40, 30, 16, 36, 32, 24, 18, 20, 1, + 3, 9, 10, 15, 32, 60, 34, 20, 50, 44, + 24, 38, 52, 34, 46, 18, 6, 24, 96, 92, + 100, 94, 96, 116, 124, 124, 124, 124, 124, 124, + 124, 124, 56, 124, 124, 124, 124, 124, 122, 90, + 64, 60, 12, 27, 19, 55, 101, 110, 110, 116, + 98, 90, 88, 68, 68, 50, 48, 32, 22, 18, + 13, 23, 2, 1, 13, 66, 72, 64, 46, 64, + 62, 32, 48, 52, 28, 2, 8, 7, 51, 40, + 18, 0, 27, 9, 7, 7, 0, 1, 7, 3, + 3, 2, 16, 1, 4, 10, 14, 124, 124, 124, + 116, 106, 92, 78, 44, 13, 1, 62, 48, 42, + 34, 22, 10, 6, 0, 19, 39, 31, 19, 29, + 21, 7, 17, 31, 9, 1, 6, 0, 4, 8, + 8, 124, 124, 116, 102, 92, 78, 54, 24, 23, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 6 */ + + 124, 18, 23, 124, 18, 23, 105, 63, 26, 20, + 20, 44, 106, 112, 108, 28, 81, 10, 19, 6, + 2, 20, 15, 14, 13, 34, 106, 25, 65, 87, + 91, 65, 7, 19, 6, 2, 39, 49, 10, 30, + 7, 29, 55, 12, 11, 41, 49, 10, 21, 53, + 16, 13, 31, 49, 1, 17, 17, 29, 11, 10, + 44, 0, 0, 0, 31, 47, 67, 11, 36, 0, + 36, 25, 61, 103, 47, 55, 10, 15, 45, 27, + 75, 39, 63, 49, 105, 51, 47, 59, 79, 38, + 1, 14, 51, 17, 49, 41, 107, 16, 2, 4, + 15, 32, 9, 7, 43, 41, 23, 21, 21, 18, + 24, 5, 26, 38, 31, 0, 9, 7, 12, 23, + 48, 3, 4, 48, 64, 70, 60, 46, 51, 13, + 17, 26, 22, 67, 14, 20, 18, 6, 14, 14, + 10, 10, 4, 10, 5, 10, 38, 22, 17, 3, + 4, 5, 22, 36, 36, 20, 22, 10, 10, 8, + 44, 36, 30, 27, 6, 17, 62, 46, 36, 28, + 34, 38, 24, 32, 46, 2, 30, 18, 16, 10, + 1, 20, 8, 16, 18, 12, 28, 44, 14, 35, + 12, 25, 24, 48, 39, 94, 118, 100, 98, 102, + 104, 92, 92, 84, 66, 54, 48, 60, 38, 25, + 7, 6, 13, 54, 50, 44, 52, 46, 34, 28, + 30, 
7, 2, 7, 59, 49, 81, 19, 40, 36, + 22, 2, 10, 9, 19, 31, 63, 19, 76, 42, + 30, 16, 14, 9, 29, 37, 87, 10, 72, 56, + 50, 40, 22, 10, 1, 17, 45, 13, 72, 62, + 38, 8, 12, 13, 31, 57, 10, 110, 92, 80, + 64, 46, 10, 9, 25, 37, 124, 75, 73, 39, + 85, 79, 57, 75, 69, 65, 71, 71, 77, 57, + 53, 51, 51, 43, 41, 13, 29, 35, 11, 2, + 4, 2, 18, 16, 16, 6, 38, 30, 26, 24, + 16, 38, 28, 16, 36, 32, 22, 18, 20, 1, + 5, 9, 10, 17, 32, 58, 32, 18, 48, 44, + 26, 38, 50, 34, 46, 18, 6, 22, 94, 90, + 98, 92, 94, 114, 124, 124, 124, 124, 124, 124, + 124, 122, 54, 124, 124, 124, 124, 124, 118, 86, + 62, 58, 12, 25, 19, 51, 95, 106, 106, 112, + 94, 86, 84, 64, 64, 46, 46, 30, 20, 16, + 15, 25, 0, 3, 15, 64, 70, 62, 42, 60, + 58, 30, 44, 48, 26, 1, 4, 11, 53, 38, + 16, 1, 27, 9, 7, 5, 0, 1, 3, 1, + 1, 4, 18, 2, 6, 14, 16, 124, 124, 120, + 112, 100, 88, 72, 40, 17, 0, 62, 48, 42, + 34, 24, 12, 6, 0, 19, 39, 29, 19, 29, + 19, 5, 17, 31, 7, 0, 6, 0, 6, 8, + 8, 124, 124, 112, 96, 84, 70, 48, 18, 27, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 7 */ + + 124, 18, 23, 124, 18, 23, 101, 59, 26, 20, + 18, 40, 102, 108, 108, 28, 75, 8, 17, 6, + 2, 18, 15, 12, 15, 34, 102, 29, 67, 87, + 85, 63, 7, 17, 6, 2, 39, 47, 10, 30, + 7, 29, 55, 12, 13, 39, 49, 10, 21, 51, + 14, 13, 31, 49, 0, 15, 17, 27, 9, 10, + 44, 0, 0, 0, 31, 47, 67, 9, 34, 0, + 36, 25, 61, 101, 43, 53, 14, 11, 43, 25, + 73, 35, 61, 47, 101, 49, 45, 57, 77, 38, + 1, 14, 51, 17, 49, 39, 103, 14, 2, 4, + 15, 32, 9, 7, 41, 41, 23, 21, 19, 18, + 22, 5, 26, 38, 31, 0, 9, 7, 12, 23, + 48, 3, 4, 48, 62, 68, 58, 44, 49, 11, + 17, 24, 22, 65, 14, 20, 16, 6, 14, 14, + 12, 12, 4, 10, 5, 10, 36, 22, 19, 3, + 4, 5, 20, 34, 34, 20, 22, 12, 10, 8, + 42, 34, 28, 27, 6, 17, 62, 46, 36, 28, + 34, 38, 24, 32, 46, 2, 30, 18, 16, 10, + 1, 20, 8, 14, 18, 12, 28, 42, 14, 35, + 10, 25, 22, 46, 39, 92, 114, 96, 94, 98, + 100, 88, 88, 80, 62, 50, 44, 54, 34, 27, + 7, 4, 13, 52, 48, 42, 50, 44, 30, 26, + 28, 7, 2, 7, 59, 49, 81, 
19, 40, 36, + 22, 2, 10, 9, 19, 29, 61, 19, 76, 42, + 30, 16, 14, 7, 27, 37, 83, 10, 72, 56, + 50, 40, 22, 10, 1, 17, 43, 13, 72, 60, + 36, 8, 12, 13, 31, 57, 10, 110, 90, 78, + 62, 46, 10, 9, 25, 35, 124, 73, 71, 37, + 81, 77, 55, 73, 65, 63, 67, 67, 73, 55, + 51, 49, 51, 41, 39, 13, 29, 33, 11, 0, + 4, 0, 16, 14, 14, 4, 36, 28, 26, 22, + 16, 38, 28, 16, 34, 30, 22, 16, 18, 1, + 5, 9, 10, 17, 30, 58, 30, 18, 46, 42, + 26, 36, 48, 32, 44, 16, 4, 18, 92, 90, + 98, 90, 90, 110, 124, 124, 124, 124, 124, 124, + 124, 118, 50, 124, 124, 124, 124, 124, 112, 82, + 60, 56, 12, 23, 17, 49, 91, 104, 102, 108, + 92, 82, 80, 62, 60, 44, 42, 26, 16, 12, + 17, 27, 0, 3, 17, 60, 66, 58, 40, 56, + 54, 26, 42, 44, 22, 3, 2, 13, 53, 36, + 16, 3, 27, 7, 5, 5, 2, 0, 1, 0, + 0, 6, 20, 4, 8, 16, 18, 124, 122, 116, + 106, 96, 82, 66, 34, 21, 0, 64, 50, 44, + 36, 26, 12, 8, 2, 17, 37, 29, 17, 27, + 17, 3, 17, 29, 7, 0, 6, 2, 8, 8, + 8, 124, 124, 106, 90, 78, 62, 40, 10, 33, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 8 */ + + 124, 16, 23, 124, 16, 23, 99, 57, 26, 20, + 18, 38, 100, 106, 108, 28, 71, 8, 15, 8, + 2, 16, 15, 10, 19, 32, 98, 33, 69, 89, + 81, 61, 7, 15, 8, 2, 39, 45, 10, 28, + 7, 29, 53, 10, 13, 39, 51, 10, 21, 51, + 14, 15, 31, 49, 0, 15, 17, 27, 9, 10, + 44, 0, 0, 0, 29, 49, 67, 9, 32, 0, + 36, 25, 61, 97, 41, 49, 16, 9, 41, 23, + 71, 33, 59, 45, 97, 49, 45, 57, 75, 38, + 1, 14, 49, 17, 47, 39, 101, 14, 0, 2, + 17, 32, 9, 7, 41, 41, 23, 21, 19, 16, + 22, 5, 26, 36, 31, 0, 9, 7, 10, 23, + 46, 5, 4, 46, 58, 66, 56, 42, 49, 11, + 17, 22, 20, 65, 12, 18, 16, 6, 14, 14, + 14, 12, 4, 8, 5, 8, 34, 20, 21, 3, + 4, 7, 20, 32, 34, 18, 20, 12, 10, 8, + 38, 34, 26, 27, 6, 19, 60, 44, 36, 28, + 34, 36, 22, 32, 44, 0, 28, 18, 16, 8, + 3, 18, 6, 14, 16, 10, 26, 40, 12, 35, + 10, 25, 20, 42, 39, 88, 110, 92, 90, 94, + 94, 84, 84, 76, 58, 46, 40, 50, 30, 29, + 7, 4, 15, 50, 44, 38, 46, 40, 28, 22, + 24, 9, 0, 9, 59, 49, 79, 19, 40, 36, + 22, 2, 10, 9, 19, 
29, 59, 17, 76, 42, + 30, 16, 14, 7, 27, 35, 81, 10, 72, 56, + 50, 38, 22, 10, 1, 17, 43, 13, 72, 60, + 36, 8, 12, 13, 31, 55, 10, 108, 88, 76, + 60, 44, 10, 9, 25, 35, 124, 71, 69, 37, + 79, 75, 55, 71, 63, 61, 65, 65, 71, 55, + 51, 49, 49, 39, 39, 15, 29, 33, 11, 0, + 2, 0, 14, 12, 10, 2, 34, 28, 26, 22, + 14, 36, 26, 14, 32, 28, 20, 16, 16, 3, + 7, 9, 10, 19, 28, 56, 28, 18, 44, 40, + 26, 34, 46, 32, 44, 14, 2, 14, 90, 88, + 96, 86, 88, 108, 124, 124, 124, 124, 124, 124, + 124, 112, 48, 124, 124, 124, 124, 122, 108, 78, + 56, 52, 12, 23, 17, 47, 87, 100, 98, 104, + 88, 76, 76, 58, 56, 40, 38, 24, 12, 10, + 19, 29, 1, 5, 19, 58, 64, 54, 36, 52, + 50, 24, 38, 40, 20, 5, 1, 17, 55, 34, + 14, 5, 27, 7, 5, 3, 2, 0, 0, 0, + 2, 8, 22, 6, 10, 18, 20, 122, 118, 112, + 102, 90, 76, 60, 28, 27, 0, 64, 50, 44, + 36, 26, 12, 8, 2, 17, 37, 29, 17, 27, + 15, 1, 17, 29, 5, 2, 6, 2, 8, 8, + 6, 124, 122, 102, 84, 72, 54, 32, 4, 39, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 9 */ + + 124, 16, 23, 124, 16, 23, 95, 55, 28, 20, + 18, 36, 98, 104, 108, 28, 67, 8, 13, 10, + 2, 16, 13, 8, 21, 30, 94, 35, 71, 91, + 75, 57, 7, 13, 10, 2, 37, 43, 12, 26, + 7, 27, 51, 10, 13, 39, 51, 10, 21, 49, + 14, 15, 31, 49, 0, 15, 15, 27, 9, 10, + 44, 0, 0, 0, 27, 49, 67, 9, 30, 0, + 36, 23, 61, 93, 37, 45, 18, 5, 37, 21, + 67, 31, 55, 43, 93, 49, 45, 57, 73, 38, + 1, 14, 47, 17, 45, 37, 99, 14, 0, 2, + 17, 32, 9, 5, 39, 39, 21, 21, 19, 16, + 22, 5, 26, 36, 29, 0, 7, 7, 10, 21, + 44, 5, 4, 44, 56, 66, 54, 40, 47, 11, + 15, 20, 18, 65, 10, 18, 16, 8, 14, 14, + 16, 12, 4, 8, 3, 8, 34, 20, 23, 3, + 4, 9, 20, 30, 34, 18, 18, 12, 10, 8, + 36, 34, 26, 27, 6, 21, 58, 44, 36, 28, + 34, 36, 22, 32, 44, 0, 28, 18, 16, 8, + 3, 18, 6, 14, 16, 10, 24, 40, 12, 35, + 10, 25, 18, 40, 39, 84, 108, 90, 88, 90, + 90, 82, 82, 72, 54, 44, 38, 46, 28, 31, + 7, 4, 17, 48, 42, 36, 44, 38, 26, 20, + 22, 11, 1, 11, 59, 47, 77, 17, 42, 36, + 22, 2, 12, 7, 17, 27, 57, 15, 76, 42, + 30, 16, 16, 
7, 25, 33, 77, 10, 72, 56, + 50, 38, 24, 10, 1, 17, 41, 13, 74, 60, + 36, 8, 14, 13, 31, 53, 10, 108, 88, 76, + 58, 44, 10, 9, 23, 33, 124, 69, 67, 35, + 77, 71, 53, 67, 61, 57, 63, 63, 67, 53, + 49, 49, 47, 35, 39, 17, 27, 31, 11, 0, + 0, 0, 14, 10, 8, 0, 34, 28, 26, 22, + 14, 34, 26, 14, 30, 28, 20, 16, 14, 3, + 7, 7, 12, 19, 28, 54, 28, 18, 44, 40, + 26, 32, 44, 32, 44, 12, 2, 12, 90, 86, + 94, 84, 86, 106, 120, 120, 124, 124, 124, 124, + 124, 108, 46, 124, 124, 124, 124, 116, 104, 76, + 54, 50, 12, 21, 17, 45, 83, 96, 96, 100, + 84, 72, 74, 56, 52, 38, 36, 22, 10, 8, + 21, 29, 1, 5, 21, 56, 62, 52, 32, 50, + 48, 22, 36, 38, 18, 7, 3, 19, 57, 34, + 12, 5, 25, 7, 5, 1, 4, 2, 2, 2, + 4, 10, 26, 8, 12, 22, 24, 120, 116, 108, + 98, 84, 70, 54, 22, 31, 2, 64, 50, 46, + 36, 28, 14, 8, 4, 15, 37, 27, 15, 27, + 13, 2, 15, 27, 3, 4, 6, 4, 10, 8, + 6, 124, 118, 98, 80, 66, 48, 24, 1, 43, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 10 */ + + 124, 16, 23, 124, 16, 23, 91, 51, 28, 20, + 16, 32, 94, 100, 108, 28, 61, 6, 11, 10, + 2, 14, 13, 6, 23, 30, 90, 39, 73, 91, + 69, 55, 7, 11, 10, 2, 37, 41, 12, 26, + 7, 27, 51, 10, 15, 37, 51, 10, 21, 47, + 12, 15, 31, 49, 2, 13, 15, 25, 7, 10, + 44, 0, 0, 0, 27, 49, 67, 7, 28, 0, + 36, 23, 61, 91, 33, 43, 22, 1, 35, 19, + 65, 27, 53, 41, 89, 47, 43, 55, 71, 38, + 1, 14, 47, 17, 45, 37, 95, 12, 0, 2, + 17, 32, 9, 5, 39, 39, 21, 21, 17, 16, + 20, 5, 26, 36, 29, 0, 7, 7, 10, 21, + 44, 5, 4, 44, 54, 64, 52, 38, 45, 9, + 15, 18, 18, 63, 10, 18, 14, 8, 14, 14, + 18, 14, 4, 8, 3, 8, 32, 20, 25, 3, + 4, 9, 18, 28, 32, 18, 18, 14, 10, 8, + 34, 32, 24, 27, 6, 21, 58, 44, 36, 28, + 34, 36, 22, 32, 44, 0, 28, 18, 16, 8, + 3, 18, 6, 12, 16, 10, 24, 38, 10, 35, + 8, 25, 16, 38, 39, 82, 104, 86, 84, 86, + 86, 78, 78, 68, 50, 40, 34, 40, 24, 33, + 7, 2, 17, 46, 40, 34, 42, 34, 22, 18, + 20, 11, 1, 11, 59, 47, 77, 17, 42, 36, + 22, 2, 12, 7, 17, 27, 55, 15, 76, 42, + 30, 16, 16, 5, 23, 33, 73, 10, 72, 56, + 50, 38, 
24, 10, 1, 17, 39, 13, 74, 58, + 34, 8, 14, 13, 31, 53, 10, 106, 86, 74, + 56, 44, 10, 9, 23, 33, 124, 67, 65, 33, + 73, 69, 51, 65, 57, 55, 59, 59, 63, 51, + 47, 47, 47, 33, 37, 17, 27, 29, 11, 1, + 0, 1, 12, 8, 6, 1, 32, 26, 26, 20, + 12, 34, 26, 14, 28, 26, 20, 14, 12, 3, + 7, 7, 12, 19, 26, 54, 26, 18, 42, 38, + 26, 30, 42, 30, 42, 10, 0, 8, 88, 86, + 94, 82, 82, 102, 116, 116, 124, 124, 124, 124, + 124, 104, 42, 118, 124, 118, 124, 112, 98, 72, + 52, 48, 12, 19, 15, 43, 79, 94, 92, 96, + 82, 68, 70, 52, 48, 36, 32, 18, 6, 4, + 23, 31, 3, 5, 23, 52, 58, 48, 30, 46, + 44, 18, 32, 34, 14, 9, 5, 21, 57, 32, + 12, 7, 25, 5, 3, 1, 4, 4, 4, 4, + 6, 12, 28, 10, 14, 24, 26, 120, 112, 104, + 92, 80, 64, 48, 16, 35, 2, 66, 52, 46, + 38, 30, 14, 10, 4, 15, 35, 27, 13, 25, + 11, 4, 15, 25, 3, 4, 6, 4, 12, 8, + 6, 124, 114, 92, 74, 60, 40, 16, 9, 49, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 11 */ + + 124, 16, 25, 124, 16, 25, 87, 49, 30, 20, + 16, 30, 92, 98, 108, 28, 57, 6, 11, 12, + 2, 12, 13, 4, 25, 28, 88, 41, 75, 93, + 63, 53, 7, 11, 12, 2, 35, 39, 14, 24, + 5, 25, 49, 8, 15, 37, 51, 10, 19, 47, + 12, 15, 29, 49, 2, 13, 15, 25, 7, 10, + 44, 0, 0, 0, 25, 49, 67, 7, 28, 1, + 36, 23, 59, 87, 29, 39, 24, 2, 33, 17, + 61, 25, 51, 41, 87, 47, 43, 55, 71, 40, + 1, 14, 45, 15, 43, 35, 93, 12, 1, 0, + 19, 32, 9, 5, 37, 39, 19, 21, 17, 16, + 20, 3, 26, 34, 29, 0, 7, 7, 10, 21, + 42, 5, 4, 42, 52, 62, 52, 38, 45, 9, + 15, 18, 16, 63, 8, 18, 14, 8, 14, 14, + 18, 14, 4, 8, 1, 8, 30, 20, 25, 5, + 4, 11, 18, 26, 32, 18, 16, 14, 10, 8, + 30, 32, 22, 27, 6, 23, 56, 44, 34, 28, + 34, 36, 22, 32, 44, 0, 26, 18, 14, 8, + 3, 16, 4, 12, 14, 8, 22, 36, 10, 35, + 8, 23, 14, 34, 39, 78, 100, 82, 80, 82, + 82, 74, 74, 64, 46, 38, 32, 36, 20, 35, + 7, 2, 19, 44, 38, 32, 38, 32, 20, 16, + 18, 13, 3, 13, 59, 47, 75, 15, 42, 36, + 22, 4, 12, 7, 17, 25, 53, 13, 74, 42, + 30, 16, 18, 5, 21, 31, 69, 10, 74, 56, + 50, 38, 26, 10, 1, 15, 39, 13, 74, 58, + 34, 8, 14, 
11, 29, 51, 10, 106, 86, 72, + 56, 44, 10, 7, 21, 31, 124, 67, 63, 33, + 71, 67, 49, 61, 55, 53, 57, 57, 61, 51, + 45, 47, 45, 29, 37, 19, 27, 29, 11, 1, + 1, 1, 10, 6, 4, 3, 32, 26, 26, 20, + 12, 32, 24, 14, 28, 26, 18, 14, 12, 3, + 9, 7, 12, 21, 26, 52, 24, 16, 40, 38, + 28, 30, 40, 30, 42, 10, 0, 6, 86, 84, + 92, 80, 80, 100, 112, 112, 122, 120, 124, 124, + 120, 98, 40, 114, 124, 112, 124, 106, 94, 68, + 50, 46, 12, 17, 15, 39, 73, 90, 88, 92, + 78, 64, 66, 50, 44, 32, 30, 16, 4, 2, + 25, 33, 3, 7, 25, 50, 56, 46, 26, 42, + 40, 16, 30, 30, 12, 13, 9, 25, 59, 30, + 10, 9, 25, 5, 3, 0, 6, 4, 8, 6, + 8, 14, 30, 14, 16, 28, 28, 118, 110, 100, + 88, 74, 60, 42, 12, 39, 4, 66, 52, 48, + 38, 32, 16, 10, 6, 13, 35, 25, 13, 25, + 9, 6, 15, 25, 1, 6, 6, 6, 14, 8, + 6, 124, 110, 88, 68, 52, 32, 10, 15, 53, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 12 */ + + 124, 16, 25, 124, 16, 25, 85, 45, 30, 20, + 14, 26, 88, 96, 108, 28, 53, 6, 9, 14, + 2, 10, 13, 2, 27, 28, 84, 45, 77, 93, + 57, 51, 7, 9, 14, 2, 35, 37, 14, 24, + 5, 25, 47, 8, 17, 37, 51, 10, 19, 45, + 12, 17, 29, 49, 4, 13, 15, 25, 5, 10, + 44, 0, 0, 0, 25, 51, 67, 5, 26, 1, + 36, 23, 59, 83, 27, 37, 28, 4, 31, 15, + 59, 23, 49, 39, 83, 45, 43, 53, 69, 40, + 1, 14, 45, 15, 43, 35, 89, 12, 1, 0, + 19, 32, 9, 5, 37, 39, 19, 21, 15, 16, + 20, 3, 26, 34, 29, 0, 7, 7, 10, 21, + 40, 7, 4, 40, 50, 60, 50, 36, 43, 7, + 15, 16, 16, 61, 8, 18, 14, 8, 14, 14, + 20, 14, 4, 6, 1, 6, 28, 18, 27, 5, + 4, 11, 16, 24, 32, 16, 14, 14, 10, 8, + 28, 32, 20, 27, 6, 23, 54, 44, 34, 28, + 34, 36, 22, 32, 42, 0, 26, 18, 14, 8, + 5, 16, 4, 10, 14, 8, 20, 34, 8, 35, + 6, 23, 12, 32, 39, 76, 96, 78, 76, 78, + 78, 70, 70, 60, 42, 34, 28, 32, 16, 37, + 7, 2, 19, 42, 36, 30, 36, 28, 16, 14, + 14, 15, 3, 13, 59, 47, 75, 15, 42, 36, + 22, 4, 12, 7, 17, 25, 51, 13, 74, 42, + 30, 16, 18, 3, 21, 29, 65, 10, 74, 56, + 50, 38, 26, 10, 1, 15, 37, 13, 74, 58, + 32, 8, 14, 11, 29, 49, 10, 104, 84, 70, + 54, 44, 10, 7, 
21, 31, 124, 65, 61, 31, + 69, 65, 47, 59, 53, 51, 55, 53, 57, 49, + 45, 47, 45, 27, 37, 19, 27, 27, 11, 3, + 1, 1, 8, 4, 2, 5, 30, 24, 26, 18, + 10, 32, 24, 14, 26, 24, 18, 14, 10, 5, + 9, 7, 12, 21, 24, 52, 22, 16, 38, 36, + 28, 28, 38, 28, 40, 8, 1, 2, 84, 82, + 92, 78, 78, 96, 108, 108, 118, 114, 124, 124, + 114, 94, 38, 108, 124, 106, 116, 100, 88, 64, + 48, 42, 12, 15, 15, 37, 69, 86, 84, 88, + 74, 60, 62, 46, 40, 30, 26, 14, 0, 1, + 27, 35, 5, 7, 27, 48, 52, 42, 22, 38, + 36, 12, 26, 26, 8, 15, 11, 27, 61, 28, + 8, 11, 25, 3, 3, 0, 6, 6, 10, 6, + 10, 16, 32, 16, 18, 30, 30, 118, 106, 96, + 82, 70, 54, 36, 6, 45, 4, 68, 54, 48, + 38, 32, 16, 10, 6, 13, 35, 25, 11, 23, + 7, 8, 15, 23, 1, 6, 6, 6, 16, 8, + 6, 122, 106, 82, 62, 46, 24, 2, 23, 59, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 13 */ + + 124, 16, 25, 124, 16, 25, 81, 43, 30, 20, + 14, 24, 86, 92, 108, 28, 47, 4, 7, 14, + 2, 10, 11, 0, 29, 26, 80, 49, 79, 95, + 51, 47, 7, 7, 14, 2, 35, 35, 14, 22, + 5, 25, 47, 8, 17, 35, 51, 10, 19, 43, + 10, 17, 29, 49, 4, 11, 13, 23, 5, 10, + 44, 0, 0, 0, 23, 51, 67, 5, 24, 1, + 36, 21, 59, 81, 23, 33, 30, 8, 27, 13, + 57, 19, 47, 37, 79, 45, 41, 53, 67, 40, + 1, 14, 43, 15, 41, 33, 87, 10, 1, 0, + 19, 32, 9, 3, 35, 39, 19, 21, 15, 16, + 18, 3, 26, 34, 27, 0, 5, 7, 10, 21, + 40, 7, 4, 40, 48, 60, 48, 34, 41, 7, + 15, 14, 14, 61, 6, 18, 12, 8, 14, 14, + 22, 16, 4, 6, 1, 6, 26, 18, 29, 5, + 4, 13, 16, 22, 30, 16, 14, 16, 10, 8, + 26, 30, 18, 27, 6, 25, 54, 44, 34, 28, + 34, 36, 22, 32, 42, 0, 26, 18, 14, 8, + 5, 16, 4, 10, 14, 8, 20, 32, 8, 35, + 6, 23, 10, 30, 39, 72, 92, 76, 74, 74, + 74, 66, 66, 56, 38, 30, 24, 26, 14, 39, + 7, 0, 21, 40, 34, 28, 34, 26, 14, 12, + 12, 15, 5, 15, 59, 47, 73, 15, 42, 36, + 22, 4, 12, 5, 15, 23, 49, 11, 74, 42, + 30, 16, 18, 3, 19, 29, 61, 10, 74, 56, + 50, 38, 26, 10, 1, 15, 35, 13, 76, 56, + 32, 8, 14, 11, 29, 49, 10, 104, 82, 68, + 52, 44, 10, 7, 21, 29, 124, 63, 59, 29, + 65, 63, 45, 57, 49, 49, 
51, 51, 53, 47, + 43, 45, 43, 25, 35, 21, 25, 25, 11, 3, + 3, 3, 8, 2, 0, 7, 28, 24, 26, 18, + 10, 30, 24, 14, 24, 22, 18, 12, 8, 5, + 9, 7, 12, 21, 22, 50, 22, 16, 36, 34, + 28, 26, 36, 28, 40, 6, 3, 1, 82, 82, + 90, 76, 74, 94, 104, 104, 114, 110, 124, 122, + 108, 90, 34, 102, 124, 100, 108, 96, 84, 60, + 46, 40, 12, 13, 13, 35, 65, 84, 82, 84, + 72, 56, 58, 44, 36, 28, 22, 10, 3, 3, + 29, 37, 5, 7, 29, 44, 50, 38, 20, 36, + 34, 10, 24, 24, 6, 17, 13, 29, 61, 28, + 8, 13, 23, 3, 1, 2, 8, 8, 12, 8, + 12, 18, 36, 18, 20, 32, 34, 116, 102, 92, + 78, 64, 48, 30, 0, 49, 4, 68, 54, 50, + 40, 34, 16, 12, 8, 11, 33, 25, 9, 23, + 5, 10, 13, 21, 0, 8, 6, 8, 18, 8, + 6, 118, 102, 78, 56, 40, 18, 5, 29, 65, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 14 */ + + 122, 16, 25, 122, 16, 25, 77, 39, 32, 20, + 12, 20, 82, 90, 108, 28, 43, 4, 5, 16, + 2, 8, 11, 1, 31, 26, 76, 51, 81, 95, + 45, 45, 7, 5, 16, 2, 33, 33, 16, 22, + 5, 23, 45, 6, 19, 35, 51, 10, 19, 43, + 10, 17, 29, 49, 6, 11, 13, 23, 3, 10, + 44, 0, 0, 0, 23, 51, 67, 3, 22, 1, + 36, 21, 59, 77, 19, 31, 34, 12, 25, 11, + 53, 17, 45, 35, 75, 43, 41, 51, 65, 40, + 1, 14, 43, 15, 41, 33, 83, 10, 3, 1, + 21, 32, 9, 3, 35, 39, 17, 21, 13, 16, + 18, 3, 26, 32, 27, 0, 5, 7, 10, 21, + 38, 7, 4, 38, 46, 58, 46, 32, 41, 5, + 15, 12, 14, 59, 6, 18, 12, 8, 14, 14, + 24, 16, 4, 6, 0, 6, 24, 18, 31, 5, + 4, 13, 14, 20, 30, 16, 12, 16, 10, 8, + 22, 30, 16, 27, 6, 25, 52, 44, 34, 28, + 34, 36, 22, 32, 42, 0, 24, 18, 14, 8, + 5, 14, 2, 8, 12, 6, 18, 30, 6, 35, + 4, 23, 8, 26, 39, 70, 88, 72, 70, 70, + 70, 62, 62, 52, 34, 28, 22, 22, 10, 41, + 7, 0, 21, 38, 32, 26, 30, 22, 10, 10, + 10, 17, 5, 15, 59, 47, 73, 13, 42, 36, + 22, 4, 12, 5, 15, 23, 47, 11, 74, 42, + 30, 16, 20, 1, 17, 27, 57, 10, 74, 56, + 50, 38, 28, 10, 1, 15, 35, 13, 76, 56, + 30, 8, 14, 11, 29, 47, 10, 102, 82, 66, + 50, 44, 10, 7, 19, 29, 124, 61, 57, 29, + 63, 61, 43, 53, 47, 47, 49, 47, 51, 47, + 41, 45, 43, 21, 35, 21, 25, 25, 11, 
5, + 3, 3, 6, 0, 1, 9, 28, 22, 26, 16, + 8, 30, 22, 14, 22, 22, 16, 12, 6, 5, + 11, 7, 12, 23, 22, 50, 20, 16, 34, 34, + 28, 24, 34, 26, 38, 4, 3, 3, 80, 80, + 90, 74, 72, 90, 100, 100, 110, 104, 120, 118, + 102, 84, 32, 96, 124, 94, 100, 90, 78, 56, + 44, 38, 12, 11, 13, 33, 61, 80, 78, 80, + 68, 52, 54, 40, 32, 24, 20, 8, 5, 7, + 31, 39, 7, 9, 31, 42, 46, 36, 16, 32, + 30, 6, 20, 20, 2, 19, 17, 33, 63, 26, + 6, 15, 23, 1, 1, 2, 8, 8, 14, 10, + 14, 20, 38, 20, 22, 36, 36, 116, 100, 88, + 72, 60, 42, 24, 5, 53, 6, 70, 56, 50, + 40, 36, 18, 12, 8, 11, 33, 23, 9, 21, + 3, 12, 13, 21, 0, 8, 6, 8, 20, 8, + 6, 116, 98, 72, 50, 34, 10, 13, 37, 69, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 15 */ + + 120, 16, 25, 120, 16, 25, 73, 37, 32, 20, + 12, 18, 80, 88, 108, 28, 39, 4, 3, 18, + 2, 6, 11, 3, 33, 24, 72, 55, 83, 97, + 39, 43, 7, 3, 18, 2, 33, 31, 16, 20, + 5, 23, 43, 6, 19, 35, 51, 10, 19, 41, + 10, 17, 29, 49, 6, 11, 13, 23, 3, 10, + 44, 0, 0, 0, 21, 51, 67, 3, 20, 1, + 36, 21, 59, 73, 15, 27, 36, 16, 23, 9, + 51, 15, 43, 33, 71, 43, 41, 51, 63, 40, + 1, 14, 41, 15, 39, 31, 81, 10, 3, 1, + 21, 32, 9, 3, 33, 39, 17, 21, 13, 16, + 18, 3, 26, 32, 27, 0, 5, 7, 10, 21, + 36, 7, 4, 36, 44, 56, 44, 30, 39, 5, + 15, 10, 12, 59, 4, 18, 12, 8, 14, 14, + 26, 16, 4, 6, 0, 6, 22, 18, 33, 5, + 4, 15, 14, 18, 30, 16, 10, 16, 10, 8, + 20, 30, 14, 27, 6, 27, 50, 44, 34, 28, + 34, 36, 22, 32, 42, 0, 24, 18, 14, 8, + 5, 14, 2, 8, 12, 6, 16, 28, 6, 35, + 4, 23, 6, 24, 39, 66, 84, 68, 66, 66, + 66, 58, 58, 48, 30, 24, 18, 18, 6, 43, + 7, 0, 23, 36, 30, 24, 28, 20, 8, 8, + 8, 19, 7, 17, 59, 47, 71, 13, 42, 36, + 22, 4, 12, 5, 15, 21, 45, 9, 74, 42, + 30, 16, 20, 1, 15, 25, 53, 10, 74, 56, + 50, 38, 28, 10, 1, 15, 33, 13, 76, 56, + 30, 8, 14, 11, 29, 45, 10, 102, 80, 64, + 48, 44, 10, 7, 19, 27, 124, 59, 55, 27, + 61, 59, 41, 51, 45, 45, 47, 45, 47, 45, + 39, 45, 41, 19, 35, 23, 25, 23, 11, 5, + 5, 3, 4, 1, 3, 11, 26, 22, 26, 16, + 8, 28, 22, 14, 20, 20, 
16, 12, 4, 5, + 11, 7, 12, 23, 20, 48, 18, 16, 32, 32, + 28, 22, 32, 26, 38, 2, 5, 7, 78, 78, + 88, 72, 70, 88, 96, 96, 106, 100, 114, 112, + 96, 80, 30, 90, 118, 88, 92, 84, 74, 52, + 42, 36, 12, 9, 13, 31, 57, 76, 74, 76, + 64, 48, 50, 38, 28, 22, 16, 6, 9, 9, + 33, 41, 7, 9, 33, 40, 44, 32, 12, 28, + 26, 4, 18, 16, 0, 21, 19, 35, 65, 24, + 4, 17, 23, 1, 1, 4, 10, 10, 16, 12, + 16, 22, 40, 22, 24, 38, 38, 114, 96, 84, + 68, 54, 36, 18, 11, 57, 6, 70, 56, 52, + 40, 38, 18, 12, 10, 9, 33, 23, 7, 21, + 1, 14, 13, 19, 2, 10, 6, 10, 22, 8, + 6, 114, 94, 68, 44, 28, 2, 21, 43, 75, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 16 */ + + 116, 14, 27, 116, 14, 27, 71, 35, 32, 20, + 10, 14, 76, 84, 106, 28, 35, 2, 3, 18, + 0, 4, 11, 7, 37, 22, 68, 59, 85, 99, + 35, 41, 9, 3, 18, 0, 33, 29, 16, 18, + 5, 23, 43, 4, 21, 35, 53, 10, 19, 41, + 8, 19, 29, 49, 6, 11, 13, 23, 3, 8, + 44, 0, 0, 0, 21, 53, 67, 3, 18, 3, + 36, 21, 59, 71, 13, 25, 38, 18, 21, 7, + 49, 13, 41, 33, 69, 43, 41, 51, 63, 40, + 1, 14, 41, 15, 39, 31, 79, 8, 5, 3, + 23, 32, 9, 3, 33, 39, 17, 21, 13, 14, + 16, 3, 24, 30, 27, 1, 5, 7, 8, 21, + 34, 9, 2, 34, 40, 54, 42, 28, 39, 5, + 15, 8, 10, 59, 2, 16, 10, 8, 14, 14, + 26, 16, 4, 4, 0, 4, 20, 16, 35, 7, + 2, 17, 12, 16, 28, 14, 8, 16, 10, 8, + 16, 28, 12, 27, 6, 29, 48, 42, 32, 28, + 34, 34, 20, 32, 40, 1, 22, 18, 12, 6, + 7, 12, 0, 6, 10, 4, 14, 26, 4, 35, + 2, 23, 4, 20, 39, 62, 80, 64, 62, 62, + 60, 54, 54, 44, 26, 20, 14, 12, 2, 47, + 9, 1, 25, 34, 26, 20, 24, 16, 4, 4, + 4, 21, 9, 19, 59, 47, 71, 13, 42, 36, + 22, 4, 12, 5, 15, 21, 43, 9, 72, 42, + 30, 16, 20, 1, 15, 25, 51, 8, 74, 56, + 48, 36, 28, 10, 1, 15, 33, 13, 76, 54, + 28, 6, 14, 11, 29, 45, 10, 100, 78, 62, + 46, 42, 10, 7, 19, 27, 124, 59, 53, 27, + 59, 57, 41, 49, 43, 43, 45, 43, 45, 45, + 39, 45, 41, 17, 35, 25, 25, 23, 11, 7, + 7, 5, 2, 3, 7, 15, 24, 20, 26, 14, + 6, 26, 20, 12, 18, 18, 14, 10, 2, 7, + 13, 7, 12, 25, 18, 46, 16, 14, 30, 30, + 28, 20, 28, 24, 
36, 0, 7, 11, 76, 76, + 86, 68, 66, 84, 92, 92, 100, 94, 108, 106, + 90, 74, 26, 84, 110, 82, 82, 78, 68, 48, + 38, 32, 12, 9, 13, 29, 53, 72, 70, 72, + 60, 42, 46, 34, 22, 18, 12, 2, 13, 13, + 35, 43, 9, 11, 37, 36, 40, 28, 8, 24, + 22, 0, 14, 12, 3, 25, 23, 39, 67, 22, + 2, 19, 23, 1, 1, 4, 10, 10, 18, 12, + 18, 22, 42, 24, 26, 40, 40, 112, 92, 78, + 62, 48, 30, 10, 17, 63, 6, 70, 56, 52, + 40, 38, 18, 12, 10, 9, 33, 23, 7, 21, + 0, 16, 13, 19, 2, 10, 6, 10, 22, 8, + 4, 110, 88, 62, 38, 20, 5, 29, 51, 81, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 17 */ + + 114, 14, 27, 114, 14, 27, 67, 31, 34, 22, + 10, 12, 74, 82, 106, 28, 29, 2, 1, 20, + 0, 4, 9, 9, 39, 22, 66, 61, 87, 99, + 29, 37, 9, 1, 20, 0, 31, 25, 18, 18, + 3, 21, 41, 4, 21, 33, 53, 10, 17, 39, + 8, 19, 27, 49, 8, 9, 11, 21, 1, 8, + 44, 0, 0, 0, 19, 53, 67, 1, 18, 3, + 36, 19, 57, 67, 9, 21, 42, 22, 17, 5, + 45, 9, 37, 31, 65, 41, 39, 49, 61, 42, + 0, 16, 39, 13, 37, 29, 75, 8, 5, 3, + 23, 34, 9, 1, 31, 37, 15, 19, 11, 14, + 16, 1, 24, 30, 25, 1, 3, 7, 8, 19, + 34, 9, 2, 34, 38, 54, 42, 28, 37, 3, + 13, 8, 10, 57, 2, 16, 10, 10, 14, 14, + 28, 18, 6, 4, 2, 4, 20, 16, 35, 7, + 2, 17, 12, 14, 28, 14, 8, 18, 12, 8, + 14, 28, 12, 25, 6, 29, 48, 42, 32, 28, + 34, 34, 20, 32, 40, 1, 22, 18, 12, 6, + 7, 12, 0, 6, 10, 4, 14, 26, 4, 33, + 2, 21, 4, 18, 37, 60, 78, 62, 60, 58, + 56, 52, 52, 40, 24, 18, 12, 8, 0, 49, + 9, 1, 25, 32, 24, 18, 22, 14, 2, 2, + 2, 21, 9, 19, 57, 45, 69, 11, 44, 36, + 22, 6, 14, 3, 13, 19, 39, 7, 72, 42, + 30, 16, 22, 0, 13, 23, 47, 8, 76, 58, + 48, 36, 30, 10, 1, 13, 31, 13, 78, 54, + 28, 6, 16, 9, 27, 43, 10, 100, 78, 62, + 46, 42, 10, 5, 17, 25, 124, 57, 51, 25, + 55, 53, 39, 45, 39, 39, 41, 39, 41, 43, + 37, 43, 39, 13, 33, 25, 23, 21, 9, 7, + 7, 5, 2, 3, 9, 17, 24, 20, 28, 14, + 6, 26, 20, 12, 18, 18, 14, 10, 2, 7, + 13, 5, 14, 25, 18, 46, 16, 14, 30, 30, + 30, 20, 26, 24, 36, 0, 7, 13, 76, 76, + 86, 66, 64, 82, 88, 88, 96, 90, 104, 102, + 86, 70, 
24, 80, 104, 76, 74, 74, 64, 46, + 36, 30, 12, 7, 11, 25, 47, 70, 68, 70, + 58, 38, 44, 32, 18, 16, 10, 0, 15, 15, + 37, 43, 9, 11, 39, 34, 38, 26, 6, 22, + 20, 1, 12, 10, 5, 27, 25, 41, 67, 22, + 2, 19, 21, 0, 0, 6, 12, 12, 22, 14, + 20, 24, 46, 28, 28, 44, 44, 112, 90, 74, + 58, 44, 26, 4, 21, 67, 8, 72, 58, 54, + 42, 40, 20, 14, 12, 7, 31, 21, 5, 19, + 4, 20, 11, 17, 4, 12, 8, 12, 24, 8, + 4, 108, 84, 58, 34, 14, 11, 35, 57, 85, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 18 */ + + 112, 14, 27, 112, 14, 27, 63, 29, 34, 22, + 10, 10, 72, 80, 106, 28, 25, 2, 0, 22, + 0, 2, 9, 11, 41, 20, 62, 65, 89, 101, + 23, 35, 9, 0, 22, 0, 31, 23, 18, 16, + 3, 21, 39, 4, 21, 33, 53, 10, 17, 37, + 8, 19, 27, 49, 8, 9, 11, 21, 1, 8, + 44, 0, 0, 0, 17, 53, 67, 1, 16, 3, + 36, 19, 57, 63, 5, 17, 44, 26, 15, 3, + 43, 7, 35, 29, 61, 41, 39, 49, 59, 42, + 0, 16, 37, 13, 35, 29, 73, 8, 5, 3, + 23, 34, 9, 1, 31, 37, 15, 19, 11, 14, + 16, 1, 24, 30, 25, 1, 3, 7, 8, 19, + 32, 9, 2, 32, 36, 52, 40, 26, 35, 3, + 13, 6, 8, 57, 0, 16, 10, 10, 14, 14, + 30, 18, 6, 4, 2, 4, 18, 16, 37, 7, + 2, 19, 12, 12, 28, 14, 6, 18, 12, 8, + 12, 28, 10, 25, 6, 31, 46, 42, 32, 28, + 34, 34, 20, 32, 40, 1, 22, 18, 12, 6, + 7, 12, 0, 6, 10, 4, 12, 24, 2, 33, + 2, 21, 2, 16, 37, 56, 74, 58, 56, 54, + 52, 48, 48, 36, 20, 14, 8, 4, 3, 51, + 9, 1, 27, 30, 22, 16, 20, 10, 0, 0, + 0, 23, 11, 21, 57, 45, 67, 11, 44, 36, + 22, 6, 14, 3, 13, 19, 37, 5, 72, 42, + 30, 16, 22, 0, 11, 21, 43, 8, 76, 58, + 48, 36, 30, 10, 1, 13, 29, 13, 78, 54, + 28, 6, 16, 9, 27, 41, 10, 98, 76, 60, + 44, 42, 10, 5, 17, 25, 124, 55, 49, 23, + 53, 51, 37, 43, 37, 37, 39, 37, 37, 41, + 35, 43, 37, 11, 33, 27, 23, 19, 9, 7, + 9, 5, 0, 5, 11, 19, 22, 20, 28, 14, + 4, 24, 20, 12, 16, 16, 14, 10, 0, 7, + 13, 5, 14, 25, 16, 44, 14, 14, 28, 28, + 30, 18, 24, 24, 36, 1, 9, 17, 74, 74, + 84, 64, 62, 80, 84, 84, 92, 86, 98, 96, + 80, 66, 22, 74, 98, 70, 66, 68, 60, 42, + 34, 28, 12, 5, 11, 23, 43, 66, 64, 66, + 54, 
34, 40, 28, 14, 14, 6, 1, 19, 17, + 39, 45, 11, 11, 41, 32, 36, 22, 2, 18, + 16, 3, 8, 6, 7, 29, 27, 43, 69, 20, + 0, 21, 21, 0, 0, 8, 12, 14, 24, 16, + 22, 26, 48, 30, 30, 46, 46, 110, 86, 70, + 54, 38, 20, 1, 27, 71, 8, 72, 58, 54, + 42, 42, 20, 14, 12, 7, 31, 21, 3, 19, + 6, 22, 11, 15, 6, 14, 8, 12, 26, 8, + 4, 106, 80, 54, 28, 8, 19, 43, 63, 91, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 19 */ + + 110, 14, 27, 110, 14, 27, 59, 25, 36, 22, + 8, 6, 68, 78, 106, 28, 21, 2, 2, 24, + 0, 0, 9, 13, 43, 20, 58, 67, 91, 101, + 17, 33, 9, 2, 24, 0, 29, 21, 20, 16, + 3, 19, 37, 2, 23, 33, 53, 10, 17, 37, + 8, 19, 27, 49, 10, 9, 11, 21, 0, 8, + 44, 0, 0, 0, 17, 53, 67, 0, 14, 3, + 36, 19, 57, 59, 1, 15, 48, 30, 13, 1, + 39, 5, 33, 27, 57, 39, 39, 47, 57, 42, + 0, 16, 37, 13, 35, 27, 69, 8, 7, 5, + 25, 34, 9, 1, 29, 37, 13, 19, 9, 14, + 16, 1, 24, 28, 25, 1, 3, 7, 8, 19, + 30, 9, 2, 30, 34, 50, 38, 24, 35, 1, + 13, 4, 8, 55, 0, 16, 10, 10, 14, 14, + 32, 18, 6, 4, 4, 4, 16, 16, 39, 7, + 2, 19, 10, 10, 28, 14, 4, 18, 12, 8, + 8, 28, 8, 25, 6, 31, 44, 42, 32, 28, + 34, 34, 20, 32, 40, 1, 20, 18, 12, 6, + 7, 10, 1, 4, 8, 2, 10, 22, 2, 33, + 0, 21, 0, 12, 37, 54, 70, 54, 52, 50, + 48, 44, 44, 32, 16, 12, 6, 0, 7, 53, + 9, 1, 27, 28, 20, 14, 16, 8, 3, 1, + 1, 25, 11, 21, 57, 45, 67, 9, 44, 36, + 22, 6, 14, 3, 13, 17, 35, 5, 72, 42, + 30, 16, 24, 2, 9, 19, 39, 8, 76, 58, + 48, 36, 32, 10, 1, 13, 29, 13, 78, 54, + 26, 6, 16, 9, 27, 39, 10, 98, 76, 58, + 42, 42, 10, 5, 15, 23, 124, 53, 47, 23, + 51, 49, 35, 39, 35, 35, 37, 33, 35, 41, + 33, 43, 37, 7, 33, 27, 23, 19, 9, 9, + 9, 5, 1, 7, 13, 21, 22, 18, 28, 12, + 4, 24, 18, 12, 14, 16, 12, 10, 1, 7, + 15, 5, 14, 27, 16, 44, 12, 14, 26, 28, + 30, 16, 22, 22, 34, 3, 9, 19, 72, 72, + 84, 62, 60, 76, 80, 80, 88, 80, 94, 92, + 74, 60, 20, 68, 92, 64, 58, 62, 54, 38, + 32, 26, 12, 3, 11, 21, 39, 62, 60, 62, + 50, 30, 36, 26, 10, 10, 4, 3, 21, 21, + 41, 47, 11, 13, 43, 30, 32, 20, 1, 14, + 12, 7, 6, 2, 
11, 31, 31, 47, 71, 18, + 1, 23, 21, 2, 0, 8, 14, 14, 26, 18, + 24, 28, 50, 32, 32, 50, 48, 110, 84, 66, + 48, 34, 14, 7, 33, 75, 10, 74, 60, 56, + 42, 44, 22, 14, 14, 5, 31, 19, 3, 17, + 8, 24, 11, 15, 6, 14, 8, 14, 28, 8, + 4, 104, 76, 48, 22, 2, 27, 51, 71, 95, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 20 */ + + 106, 14, 27, 106, 14, 27, 57, 23, 36, 22, + 8, 4, 66, 74, 106, 28, 15, 0, 4, 24, + 0, 1, 9, 15, 45, 18, 54, 71, 93, 103, + 11, 31, 9, 4, 24, 0, 29, 19, 20, 14, + 3, 19, 37, 2, 23, 31, 53, 10, 17, 35, + 6, 21, 27, 49, 10, 7, 11, 19, 0, 8, + 44, 0, 0, 0, 15, 55, 67, 0, 12, 3, + 36, 19, 57, 57, 0, 11, 50, 32, 11, 0, + 37, 1, 31, 25, 53, 39, 37, 47, 55, 42, + 0, 16, 35, 13, 33, 27, 67, 6, 7, 5, + 25, 34, 9, 1, 29, 37, 13, 19, 9, 14, + 14, 1, 24, 28, 25, 1, 3, 7, 8, 19, + 30, 11, 2, 30, 32, 48, 36, 22, 33, 1, + 13, 2, 6, 55, 1, 16, 8, 10, 14, 14, + 34, 20, 6, 2, 4, 2, 14, 14, 41, 7, + 2, 21, 10, 8, 26, 12, 4, 20, 12, 8, + 6, 26, 6, 25, 6, 33, 44, 42, 32, 28, + 34, 34, 20, 32, 38, 1, 20, 18, 12, 6, + 9, 10, 1, 4, 8, 2, 10, 20, 0, 33, + 0, 21, 1, 10, 37, 50, 66, 50, 48, 46, + 44, 40, 40, 28, 12, 8, 2, 5, 11, 55, + 9, 3, 29, 26, 18, 12, 14, 4, 5, 3, + 5, 25, 13, 23, 57, 45, 65, 9, 44, 36, + 22, 6, 14, 3, 13, 17, 33, 3, 72, 42, + 30, 16, 24, 2, 9, 19, 35, 8, 76, 58, + 48, 36, 32, 10, 1, 13, 27, 13, 78, 52, + 26, 6, 16, 9, 27, 39, 10, 96, 74, 56, + 40, 42, 10, 5, 15, 23, 124, 51, 45, 21, + 47, 47, 33, 37, 31, 33, 33, 31, 31, 39, + 33, 41, 35, 5, 31, 29, 23, 17, 9, 9, + 11, 7, 3, 9, 15, 23, 20, 18, 28, 12, + 2, 22, 18, 12, 12, 14, 12, 8, 3, 9, + 15, 5, 14, 27, 14, 42, 10, 14, 24, 26, + 30, 14, 20, 22, 34, 5, 11, 23, 70, 72, + 82, 60, 56, 74, 76, 76, 84, 76, 88, 86, + 68, 56, 16, 62, 84, 58, 50, 58, 50, 34, + 30, 22, 12, 1, 9, 19, 35, 60, 56, 58, + 48, 26, 32, 22, 6, 8, 0, 7, 25, 23, + 43, 49, 13, 13, 45, 26, 30, 16, 3, 10, + 8, 9, 2, 1, 13, 33, 33, 49, 71, 16, + 1, 25, 21, 2, 2, 10, 14, 16, 28, 18, + 26, 30, 52, 34, 34, 52, 50, 
108, 80, 62, + 44, 28, 8, 13, 39, 81, 10, 74, 60, 56, + 44, 44, 22, 16, 14, 5, 29, 19, 1, 17, + 10, 26, 11, 13, 8, 16, 8, 14, 30, 8, + 4, 100, 72, 44, 16, 3, 35, 59, 77, 101, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 21 */ + + 104, 14, 27, 104, 14, 27, 53, 19, 36, 22, + 6, 0, 62, 72, 106, 28, 11, 0, 6, 26, + 0, 1, 7, 17, 47, 18, 50, 75, 95, 103, + 5, 27, 9, 6, 26, 0, 29, 17, 20, 14, + 3, 19, 35, 2, 25, 31, 53, 10, 17, 33, + 6, 21, 27, 49, 12, 7, 9, 19, 2, 8, + 44, 0, 0, 0, 15, 55, 67, 2, 10, 3, + 36, 17, 57, 53, 4, 9, 54, 36, 7, 2, + 35, 0, 29, 23, 49, 37, 37, 45, 53, 42, + 0, 16, 35, 13, 33, 25, 63, 6, 7, 5, + 25, 34, 9, 0, 27, 37, 13, 19, 7, 14, + 14, 1, 24, 28, 23, 1, 1, 7, 8, 19, + 28, 11, 2, 28, 30, 48, 34, 20, 31, 0, + 13, 0, 6, 53, 1, 16, 8, 10, 14, 14, + 36, 20, 6, 2, 4, 2, 12, 14, 43, 7, + 2, 21, 8, 6, 26, 12, 2, 20, 12, 8, + 4, 26, 4, 25, 6, 33, 42, 42, 32, 28, + 34, 34, 20, 32, 38, 1, 20, 18, 12, 6, + 9, 10, 1, 2, 8, 2, 8, 18, 0, 33, + 1, 21, 3, 8, 37, 48, 62, 48, 46, 42, + 40, 36, 36, 24, 8, 4, 1, 9, 13, 57, + 9, 3, 29, 24, 16, 10, 12, 2, 9, 5, + 7, 27, 13, 23, 57, 45, 65, 9, 44, 36, + 22, 6, 14, 1, 11, 15, 31, 3, 72, 42, + 30, 16, 24, 4, 7, 17, 31, 8, 76, 58, + 48, 36, 32, 10, 1, 13, 25, 13, 80, 52, + 24, 6, 16, 9, 27, 37, 10, 96, 72, 54, + 38, 42, 10, 5, 15, 21, 124, 49, 43, 19, + 45, 45, 31, 35, 29, 31, 31, 27, 27, 37, + 31, 41, 35, 3, 31, 29, 21, 15, 9, 11, + 11, 7, 3, 11, 17, 25, 18, 16, 28, 10, + 2, 22, 18, 12, 10, 12, 12, 8, 5, 9, + 15, 5, 14, 27, 12, 42, 10, 14, 22, 24, + 30, 12, 18, 20, 32, 7, 13, 27, 68, 70, + 82, 58, 54, 70, 72, 72, 80, 70, 82, 82, + 62, 52, 14, 56, 78, 52, 42, 52, 44, 30, + 28, 20, 12, 0, 9, 17, 31, 56, 54, 54, + 44, 22, 28, 20, 2, 6, 3, 9, 29, 27, + 45, 51, 13, 13, 47, 24, 26, 12, 7, 8, + 6, 13, 0, 3, 17, 35, 35, 51, 73, 16, + 3, 27, 19, 4, 2, 10, 16, 18, 30, 20, + 28, 32, 56, 36, 36, 54, 54, 108, 76, 58, + 38, 24, 2, 19, 45, 85, 10, 76, 62, 58, + 44, 46, 22, 16, 16, 3, 29, 19, 0, 15, + 
12, 28, 9, 11, 8, 16, 8, 16, 32, 8, + 4, 98, 68, 38, 10, 9, 41, 67, 85, 107, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 22 */ + + 102, 14, 29, 102, 14, 29, 49, 17, 38, 22, + 6, 1, 60, 70, 106, 28, 7, 0, 6, 28, + 0, 3, 7, 19, 49, 16, 48, 77, 97, 105, + 0, 25, 9, 6, 28, 0, 27, 15, 22, 12, + 1, 17, 33, 0, 25, 31, 53, 10, 15, 33, + 6, 21, 25, 49, 12, 7, 9, 19, 2, 8, + 44, 0, 0, 0, 13, 55, 67, 2, 10, 5, + 36, 17, 55, 49, 8, 5, 56, 40, 5, 4, + 31, 2, 27, 23, 47, 37, 37, 45, 53, 44, + 0, 16, 33, 11, 31, 25, 61, 6, 9, 7, + 27, 34, 9, 0, 27, 37, 11, 19, 7, 14, + 14, 0, 24, 26, 23, 1, 1, 7, 8, 19, + 26, 11, 2, 26, 28, 46, 34, 20, 31, 0, + 13, 0, 4, 53, 3, 16, 8, 10, 14, 14, + 36, 20, 6, 2, 6, 2, 10, 14, 43, 9, + 2, 23, 8, 4, 26, 12, 0, 20, 12, 8, + 0, 26, 2, 25, 6, 35, 40, 42, 30, 28, + 34, 34, 20, 32, 38, 1, 18, 18, 10, 6, + 9, 8, 3, 2, 6, 0, 6, 16, 1, 33, + 1, 19, 5, 4, 37, 44, 58, 44, 42, 38, + 36, 32, 32, 20, 4, 2, 3, 13, 17, 59, + 9, 3, 31, 22, 14, 8, 8, 1, 11, 7, + 9, 29, 15, 25, 57, 45, 63, 7, 44, 36, + 22, 8, 14, 1, 11, 15, 29, 1, 70, 42, + 30, 16, 26, 4, 5, 15, 27, 8, 78, 58, + 48, 36, 34, 10, 1, 11, 25, 13, 80, 52, + 24, 6, 16, 7, 25, 35, 10, 94, 72, 52, + 38, 42, 10, 3, 13, 21, 124, 49, 41, 19, + 43, 43, 29, 31, 27, 29, 29, 25, 25, 37, + 29, 41, 33, 0, 31, 31, 21, 15, 9, 11, + 13, 7, 5, 13, 19, 27, 18, 16, 28, 10, + 0, 20, 16, 12, 10, 12, 10, 8, 5, 9, + 17, 5, 14, 29, 12, 40, 8, 12, 20, 24, + 32, 12, 16, 20, 32, 7, 13, 29, 66, 68, + 80, 56, 52, 68, 68, 68, 76, 66, 78, 76, + 56, 46, 12, 52, 72, 46, 34, 46, 40, 26, + 26, 18, 12, 2, 9, 13, 25, 52, 50, 50, + 40, 18, 24, 16, 1, 2, 5, 11, 31, 29, + 47, 53, 15, 15, 49, 22, 24, 10, 11, 4, + 2, 15, 3, 7, 19, 39, 39, 55, 75, 14, + 5, 29, 19, 4, 2, 12, 16, 18, 34, 22, + 30, 34, 58, 40, 38, 58, 56, 106, 74, 54, + 34, 18, 1, 25, 49, 89, 12, 76, 62, 58, + 44, 48, 24, 16, 16, 3, 29, 17, 0, 15, + 14, 30, 9, 11, 10, 18, 8, 16, 34, 8, + 4, 96, 64, 34, 4, 17, 49, 73, 91, 111, + }, + + { + /* 
Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 23 */ + + 100, 14, 29, 100, 14, 29, 45, 13, 38, 22, + 4, 5, 56, 66, 106, 28, 1, 1, 8, 28, + 0, 5, 7, 21, 51, 16, 44, 81, 99, 105, + 6, 23, 9, 8, 28, 0, 27, 13, 22, 12, + 1, 17, 33, 0, 27, 29, 53, 10, 15, 31, + 4, 21, 25, 49, 14, 5, 9, 17, 4, 8, + 44, 0, 0, 0, 13, 55, 67, 4, 8, 5, + 36, 17, 55, 47, 12, 3, 60, 44, 3, 6, + 29, 6, 25, 21, 43, 35, 35, 43, 51, 44, + 0, 16, 33, 11, 31, 23, 57, 4, 9, 7, + 27, 34, 9, 0, 25, 37, 11, 19, 5, 14, + 12, 0, 24, 26, 23, 1, 1, 7, 8, 19, + 26, 11, 2, 26, 26, 44, 32, 18, 29, 2, + 13, 1, 4, 51, 3, 16, 6, 10, 14, 14, + 38, 22, 6, 2, 6, 2, 8, 14, 45, 9, + 2, 23, 6, 2, 24, 12, 0, 22, 12, 8, + 1, 24, 0, 25, 6, 35, 40, 42, 30, 28, + 34, 34, 20, 32, 38, 1, 18, 18, 10, 6, + 9, 8, 3, 0, 6, 0, 6, 14, 1, 33, + 3, 19, 7, 2, 37, 42, 54, 40, 38, 34, + 32, 28, 28, 16, 0, 1, 7, 19, 21, 61, + 9, 5, 31, 20, 12, 6, 6, 3, 15, 9, + 11, 29, 15, 25, 57, 45, 63, 7, 44, 36, + 22, 8, 14, 1, 11, 13, 27, 1, 70, 42, + 30, 16, 26, 6, 3, 15, 23, 8, 78, 58, + 48, 36, 34, 10, 1, 11, 23, 13, 80, 50, + 22, 6, 16, 7, 25, 35, 10, 94, 70, 50, + 36, 42, 10, 3, 13, 19, 124, 47, 39, 17, + 39, 41, 27, 29, 23, 27, 25, 21, 21, 35, + 27, 39, 33, 2, 29, 31, 21, 13, 9, 13, + 13, 9, 7, 15, 21, 29, 16, 14, 28, 8, + 0, 20, 16, 12, 8, 10, 10, 6, 7, 9, + 17, 5, 14, 29, 10, 40, 6, 12, 18, 22, + 32, 10, 14, 18, 30, 9, 15, 33, 64, 68, + 80, 54, 48, 64, 64, 64, 72, 60, 72, 72, + 50, 42, 8, 46, 64, 40, 26, 42, 34, 22, + 24, 16, 12, 4, 7, 11, 21, 50, 46, 46, + 38, 14, 20, 14, 5, 0, 9, 15, 35, 33, + 49, 55, 15, 15, 51, 18, 20, 6, 13, 0, + 1, 19, 5, 11, 23, 41, 41, 57, 75, 12, + 5, 31, 19, 6, 4, 12, 18, 20, 36, 24, + 32, 36, 60, 42, 40, 60, 58, 106, 70, 50, + 28, 14, 7, 31, 55, 93, 12, 78, 64, 60, + 46, 50, 24, 18, 18, 1, 27, 17, 2, 13, + 16, 32, 9, 9, 10, 18, 8, 18, 36, 8, + 4, 92, 60, 28, 1, 23, 57, 81, 99, 117, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 24 */ + + 96, 12, 29, 96, 12, 29, 43, 
11, 38, 22, + 4, 7, 54, 64, 106, 28, 2, 1, 10, 30, + 0, 7, 7, 23, 55, 14, 40, 85, 101, 107, + 10, 21, 9, 10, 30, 0, 27, 11, 22, 10, + 1, 17, 31, 1, 27, 29, 55, 10, 15, 31, + 4, 23, 25, 49, 14, 5, 9, 17, 4, 8, + 44, 0, 0, 0, 11, 57, 67, 4, 6, 5, + 36, 17, 55, 43, 14, 0, 62, 46, 1, 8, + 27, 8, 23, 19, 39, 35, 35, 43, 49, 44, + 0, 16, 31, 11, 29, 23, 55, 4, 11, 9, + 29, 34, 9, 0, 25, 37, 11, 19, 5, 12, + 12, 0, 24, 24, 23, 1, 1, 7, 6, 19, + 24, 13, 2, 24, 22, 42, 30, 16, 29, 2, + 13, 3, 2, 51, 5, 14, 6, 10, 14, 14, + 40, 22, 6, 0, 6, 0, 6, 12, 47, 9, + 2, 25, 6, 0, 24, 10, 1, 22, 12, 8, + 5, 24, 1, 25, 6, 37, 38, 40, 30, 28, + 34, 32, 18, 32, 36, 3, 16, 18, 10, 4, + 11, 6, 5, 0, 4, 1, 4, 12, 3, 33, + 3, 19, 9, 1, 37, 38, 50, 36, 34, 30, + 26, 24, 24, 12, 3, 5, 11, 23, 25, 63, + 9, 5, 33, 18, 8, 2, 2, 7, 17, 13, + 15, 31, 17, 27, 57, 45, 61, 7, 44, 36, + 22, 8, 14, 1, 11, 13, 25, 0, 70, 42, + 30, 16, 26, 6, 3, 13, 21, 8, 78, 58, + 48, 34, 34, 10, 1, 11, 23, 13, 80, 50, + 22, 6, 16, 7, 25, 33, 10, 92, 68, 48, + 34, 40, 10, 3, 13, 19, 124, 45, 37, 17, + 37, 39, 27, 27, 21, 25, 23, 19, 19, 35, + 27, 39, 31, 4, 29, 33, 21, 13, 9, 13, + 15, 9, 9, 17, 25, 31, 14, 14, 28, 8, + 1, 18, 14, 10, 6, 8, 8, 6, 9, 11, + 19, 5, 14, 31, 8, 38, 4, 12, 16, 20, + 32, 8, 12, 18, 30, 11, 17, 37, 62, 66, + 78, 50, 46, 62, 60, 60, 66, 56, 66, 66, + 44, 36, 6, 40, 58, 34, 18, 36, 30, 18, + 20, 12, 12, 4, 7, 9, 17, 46, 42, 42, + 34, 8, 16, 10, 9, 3, 13, 17, 39, 35, + 51, 57, 17, 17, 53, 16, 18, 2, 17, 3, + 5, 21, 9, 15, 25, 43, 45, 61, 77, 10, + 7, 33, 19, 6, 4, 14, 18, 20, 38, 24, + 34, 38, 62, 44, 42, 62, 60, 104, 66, 46, + 24, 8, 13, 37, 61, 99, 12, 78, 64, 60, + 46, 50, 24, 18, 18, 1, 27, 17, 2, 13, + 18, 34, 9, 9, 12, 20, 8, 18, 36, 8, + 2, 90, 56, 24, 7, 29, 65, 89, 105, 123, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 25 */ + + 94, 12, 29, 94, 12, 29, 39, 9, 40, 22, + 4, 9, 52, 62, 106, 28, 6, 1, 12, 32, + 0, 7, 5, 25, 57, 12, 36, 87, 103, 109, + 16, 
17, 9, 12, 32, 0, 25, 9, 24, 8, + 1, 15, 29, 1, 27, 29, 55, 10, 15, 29, + 4, 23, 25, 49, 14, 5, 7, 17, 4, 8, + 44, 0, 0, 0, 9, 57, 67, 4, 4, 5, + 36, 15, 55, 39, 18, 4, 64, 50, 2, 10, + 23, 10, 19, 17, 35, 35, 35, 43, 47, 44, + 0, 16, 29, 11, 27, 21, 53, 4, 11, 9, + 29, 34, 9, 2, 23, 35, 9, 19, 5, 12, + 12, 0, 24, 24, 21, 1, 0, 7, 6, 17, + 22, 13, 2, 22, 20, 42, 28, 14, 27, 2, + 11, 5, 0, 51, 7, 14, 6, 12, 14, 14, + 42, 22, 6, 0, 8, 0, 6, 12, 49, 9, + 2, 27, 6, 1, 24, 10, 3, 22, 12, 8, + 7, 24, 1, 25, 6, 39, 36, 40, 30, 28, + 34, 32, 18, 32, 36, 3, 16, 18, 10, 4, + 11, 6, 5, 0, 4, 1, 2, 12, 3, 33, + 3, 19, 11, 3, 37, 34, 48, 34, 32, 26, + 22, 22, 22, 8, 7, 7, 13, 27, 27, 65, + 9, 5, 35, 16, 6, 0, 0, 9, 19, 15, + 17, 33, 19, 29, 57, 43, 59, 5, 46, 36, + 22, 8, 16, 0, 9, 11, 23, 2, 70, 42, + 30, 16, 28, 6, 1, 11, 17, 8, 78, 58, + 48, 34, 36, 10, 1, 11, 21, 13, 82, 50, + 22, 6, 18, 7, 25, 31, 10, 92, 68, 48, + 32, 40, 10, 3, 11, 17, 124, 43, 35, 15, + 35, 35, 25, 23, 19, 21, 21, 17, 15, 33, + 25, 39, 29, 8, 29, 35, 19, 11, 9, 13, + 17, 9, 9, 19, 27, 33, 14, 14, 28, 8, + 1, 16, 14, 10, 4, 8, 8, 6, 11, 11, + 19, 3, 16, 31, 8, 36, 4, 12, 16, 20, + 32, 6, 10, 18, 30, 13, 17, 39, 62, 64, + 76, 48, 44, 60, 56, 56, 62, 52, 62, 60, + 40, 32, 4, 34, 52, 28, 10, 30, 26, 16, + 18, 10, 12, 6, 7, 7, 13, 42, 40, 38, + 30, 4, 14, 8, 13, 5, 15, 19, 41, 37, + 53, 57, 17, 17, 55, 14, 16, 0, 21, 5, + 7, 23, 11, 17, 27, 45, 47, 63, 79, 10, + 9, 33, 17, 6, 4, 16, 20, 22, 40, 26, + 36, 40, 66, 46, 44, 66, 64, 102, 64, 42, + 20, 2, 19, 43, 67, 103, 14, 78, 64, 62, + 46, 52, 26, 18, 20, 0, 27, 15, 4, 13, + 20, 38, 7, 7, 14, 22, 8, 20, 38, 8, + 2, 88, 52, 20, 11, 35, 71, 97, 111, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 26 */ + + 92, 12, 29, 92, 12, 29, 35, 5, 40, 22, + 2, 13, 48, 58, 106, 28, 12, 3, 14, 32, + 0, 9, 5, 27, 59, 12, 32, 91, 105, 109, + 22, 15, 9, 14, 32, 0, 25, 7, 24, 8, + 1, 15, 29, 1, 29, 27, 55, 10, 15, 27, + 2, 23, 25, 49, 16, 3, 
7, 15, 6, 8, + 44, 0, 0, 0, 9, 57, 67, 6, 2, 5, + 36, 15, 55, 37, 22, 6, 68, 54, 4, 12, + 21, 14, 17, 15, 31, 33, 33, 41, 45, 44, + 0, 16, 29, 11, 27, 21, 49, 2, 11, 9, + 29, 34, 9, 2, 23, 35, 9, 19, 3, 12, + 10, 0, 24, 24, 21, 1, 0, 7, 6, 17, + 22, 13, 2, 22, 18, 40, 26, 12, 25, 4, + 11, 7, 0, 49, 7, 14, 4, 12, 14, 14, + 44, 24, 6, 0, 8, 0, 4, 12, 51, 9, + 2, 27, 4, 3, 22, 10, 3, 24, 12, 8, + 9, 22, 3, 25, 6, 39, 36, 40, 30, 28, + 34, 32, 18, 32, 36, 3, 16, 18, 10, 4, + 11, 6, 5, 1, 4, 1, 2, 10, 5, 33, + 5, 19, 13, 5, 37, 32, 44, 30, 28, 22, + 18, 18, 18, 4, 11, 11, 17, 33, 31, 67, + 9, 7, 35, 14, 4, 1, 1, 13, 23, 17, + 19, 33, 19, 29, 57, 43, 59, 5, 46, 36, + 22, 8, 16, 0, 9, 11, 21, 2, 70, 42, + 30, 16, 28, 8, 0, 11, 13, 8, 78, 58, + 48, 34, 36, 10, 1, 11, 19, 13, 82, 48, + 20, 6, 18, 7, 25, 31, 10, 90, 66, 46, + 30, 40, 10, 3, 11, 17, 124, 41, 33, 13, + 31, 33, 23, 21, 15, 19, 17, 13, 11, 31, + 23, 37, 29, 10, 27, 35, 19, 9, 9, 15, + 17, 11, 11, 21, 29, 35, 12, 12, 28, 6, + 3, 16, 14, 10, 2, 6, 8, 4, 13, 11, + 19, 3, 16, 31, 6, 36, 2, 12, 14, 18, + 32, 4, 8, 16, 28, 15, 19, 43, 60, 64, + 76, 46, 40, 56, 52, 52, 58, 46, 56, 56, + 34, 28, 0, 28, 44, 22, 2, 26, 20, 12, + 16, 8, 12, 8, 5, 5, 9, 40, 36, 34, + 28, 0, 10, 4, 17, 7, 19, 23, 45, 41, + 55, 59, 19, 17, 57, 10, 12, 3, 23, 9, + 11, 27, 15, 21, 31, 47, 49, 65, 79, 8, + 9, 35, 17, 8, 6, 16, 20, 24, 42, 28, + 38, 42, 68, 48, 46, 68, 66, 102, 60, 38, + 14, 1, 25, 49, 73, 107, 14, 80, 66, 62, + 48, 54, 26, 20, 20, 0, 25, 15, 6, 11, + 22, 40, 7, 5, 14, 22, 8, 20, 40, 8, + 2, 84, 48, 14, 17, 41, 79, 105, 119, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 27 */ + + 90, 12, 31, 90, 12, 31, 31, 3, 42, 22, + 2, 15, 46, 56, 106, 28, 16, 3, 14, 34, + 0, 11, 5, 29, 61, 10, 30, 93, 107, 111, + 28, 13, 9, 14, 34, 0, 23, 5, 26, 6, + 0, 13, 27, 3, 29, 27, 55, 10, 13, 27, + 2, 23, 23, 49, 16, 3, 7, 15, 6, 8, + 44, 0, 0, 0, 7, 57, 67, 6, 2, 7, + 36, 15, 53, 33, 26, 10, 70, 58, 6, 14, + 17, 
16, 15, 15, 29, 33, 33, 41, 45, 46, + 0, 16, 27, 9, 25, 19, 47, 2, 13, 11, + 31, 34, 9, 2, 21, 35, 7, 19, 3, 12, + 10, 2, 24, 22, 21, 1, 0, 7, 6, 17, + 20, 13, 2, 20, 16, 38, 26, 12, 25, 4, + 11, 7, 1, 49, 9, 14, 4, 12, 14, 14, + 44, 24, 6, 0, 10, 0, 2, 12, 51, 11, + 2, 29, 4, 5, 22, 10, 5, 24, 12, 8, + 13, 22, 5, 25, 6, 41, 34, 40, 28, 28, + 34, 32, 18, 32, 36, 3, 14, 18, 8, 4, + 11, 4, 7, 1, 2, 3, 0, 8, 5, 33, + 5, 17, 15, 9, 37, 28, 40, 26, 24, 18, + 14, 14, 14, 0, 15, 13, 19, 37, 35, 69, + 9, 7, 37, 12, 2, 3, 5, 15, 25, 19, + 21, 35, 21, 31, 57, 43, 57, 3, 46, 36, + 22, 10, 16, 0, 9, 9, 19, 4, 68, 42, + 30, 16, 30, 8, 2, 9, 9, 8, 80, 58, + 48, 34, 38, 10, 1, 9, 19, 13, 82, 48, + 20, 6, 18, 5, 23, 29, 10, 90, 66, 44, + 30, 40, 10, 1, 9, 15, 124, 41, 31, 13, + 29, 31, 21, 17, 13, 17, 15, 11, 9, 31, + 21, 37, 27, 14, 27, 37, 19, 9, 9, 15, + 19, 11, 13, 23, 31, 37, 12, 12, 28, 6, + 3, 14, 12, 10, 2, 6, 6, 4, 13, 11, + 21, 3, 16, 33, 6, 34, 0, 10, 12, 18, + 34, 4, 6, 16, 28, 15, 19, 45, 58, 62, + 74, 44, 38, 54, 48, 48, 54, 42, 52, 50, + 28, 22, 1, 24, 38, 16, 5, 20, 16, 8, + 14, 6, 12, 10, 5, 1, 3, 36, 32, 30, + 24, 3, 6, 2, 21, 11, 21, 25, 47, 43, + 57, 61, 19, 19, 59, 8, 10, 5, 27, 13, + 15, 29, 17, 25, 33, 51, 53, 69, 81, 6, + 11, 37, 17, 8, 6, 18, 22, 24, 46, 30, + 40, 44, 70, 52, 48, 72, 68, 100, 58, 34, + 10, 7, 29, 55, 77, 111, 16, 80, 66, 64, + 48, 56, 28, 20, 22, 2, 25, 13, 6, 11, + 24, 42, 7, 5, 16, 24, 8, 22, 42, 8, + 2, 82, 44, 10, 23, 49, 87, 111, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 28 */ + + 86, 12, 31, 86, 12, 31, 29, 0, 42, 22, + 0, 19, 42, 54, 106, 28, 20, 3, 16, 36, + 0, 13, 5, 31, 63, 10, 26, 97, 109, 111, + 34, 11, 9, 16, 36, 0, 23, 3, 26, 6, + 0, 13, 25, 3, 31, 27, 55, 10, 13, 25, + 2, 25, 23, 49, 18, 3, 7, 15, 8, 8, + 44, 0, 0, 0, 7, 59, 67, 8, 0, 7, + 36, 15, 53, 29, 28, 12, 74, 60, 8, 16, + 15, 18, 13, 13, 25, 31, 33, 39, 43, 46, + 0, 16, 27, 9, 25, 19, 43, 2, 13, 11, + 31, 34, 9, 2, 21, 
35, 7, 19, 1, 12, + 10, 2, 24, 22, 21, 1, 0, 7, 6, 17, + 18, 15, 2, 18, 14, 36, 24, 10, 23, 6, + 11, 9, 1, 47, 9, 14, 4, 12, 14, 14, + 46, 24, 6, 1, 10, 1, 0, 10, 53, 11, + 2, 29, 2, 7, 22, 8, 7, 24, 12, 8, + 15, 22, 7, 25, 6, 41, 32, 40, 28, 28, + 34, 32, 18, 32, 34, 3, 14, 18, 8, 4, + 13, 4, 7, 3, 2, 3, 1, 6, 7, 33, + 7, 17, 17, 11, 37, 26, 36, 22, 20, 14, + 10, 10, 10, 3, 19, 17, 23, 41, 39, 71, + 9, 7, 37, 10, 0, 5, 7, 19, 29, 21, + 25, 37, 21, 31, 57, 43, 57, 3, 46, 36, + 22, 10, 16, 0, 9, 9, 17, 4, 68, 42, + 30, 16, 30, 10, 2, 7, 5, 8, 80, 58, + 48, 34, 38, 10, 1, 9, 17, 13, 82, 48, + 18, 6, 18, 5, 23, 27, 10, 88, 64, 42, + 28, 40, 10, 1, 9, 15, 124, 39, 29, 11, + 27, 29, 19, 15, 11, 15, 13, 7, 5, 29, + 21, 37, 27, 16, 27, 37, 19, 7, 9, 17, + 19, 11, 15, 25, 33, 39, 10, 10, 28, 4, + 5, 14, 12, 10, 0, 4, 6, 4, 15, 13, + 21, 3, 16, 33, 4, 34, 1, 10, 10, 16, + 34, 2, 4, 14, 26, 17, 21, 49, 56, 60, + 74, 42, 36, 50, 44, 44, 50, 36, 46, 46, + 22, 18, 3, 18, 32, 10, 13, 14, 10, 4, + 12, 2, 12, 12, 5, 0, 0, 32, 28, 26, + 20, 7, 2, 1, 25, 13, 25, 27, 51, 47, + 59, 63, 21, 19, 61, 6, 6, 9, 31, 17, + 19, 33, 21, 29, 37, 53, 55, 71, 83, 4, + 13, 39, 17, 10, 6, 18, 22, 26, 48, 30, + 42, 46, 72, 54, 50, 74, 70, 100, 54, 30, + 4, 11, 35, 61, 83, 117, 16, 82, 68, 64, + 48, 56, 28, 20, 22, 2, 25, 13, 8, 9, + 26, 44, 7, 3, 16, 24, 8, 22, 44, 8, + 2, 80, 40, 4, 29, 55, 95, 119, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 29 */ + + 84, 12, 31, 84, 12, 31, 25, 2, 42, 22, + 0, 21, 40, 50, 106, 28, 26, 5, 18, 36, + 0, 13, 3, 33, 65, 8, 22, 101, 111, 113, + 40, 7, 9, 18, 36, 0, 23, 1, 26, 4, + 0, 13, 25, 3, 31, 25, 55, 10, 13, 23, + 0, 25, 23, 49, 18, 1, 5, 13, 8, 8, + 44, 0, 0, 0, 5, 59, 67, 8, 1, 7, + 36, 13, 53, 27, 32, 16, 76, 64, 12, 18, + 13, 22, 11, 11, 21, 31, 31, 39, 41, 46, + 0, 16, 25, 9, 23, 17, 41, 0, 13, 11, + 31, 34, 9, 4, 19, 35, 7, 19, 1, 12, + 8, 2, 24, 22, 19, 1, 2, 7, 6, 17, + 18, 15, 2, 18, 12, 36, 22, 8, 21, 6, + 
11, 11, 3, 47, 11, 14, 2, 12, 14, 14, + 48, 26, 6, 1, 10, 1, 1, 10, 55, 11, + 2, 31, 2, 9, 20, 8, 7, 26, 12, 8, + 17, 20, 9, 25, 6, 43, 32, 40, 28, 28, + 34, 32, 18, 32, 34, 3, 14, 18, 8, 4, + 13, 4, 7, 3, 2, 3, 1, 4, 7, 33, + 7, 17, 19, 13, 37, 22, 32, 20, 18, 10, + 6, 6, 6, 7, 23, 21, 27, 47, 41, 73, + 9, 9, 39, 8, 1, 7, 9, 21, 31, 23, + 27, 37, 23, 33, 57, 43, 55, 3, 46, 36, + 22, 10, 16, 2, 7, 7, 15, 6, 68, 42, + 30, 16, 30, 10, 4, 7, 1, 8, 80, 58, + 48, 34, 38, 10, 1, 9, 15, 13, 84, 46, + 18, 6, 18, 5, 23, 27, 10, 88, 62, 40, + 26, 40, 10, 1, 9, 13, 124, 37, 27, 9, + 23, 27, 17, 13, 7, 13, 9, 5, 1, 27, + 19, 35, 25, 18, 25, 39, 17, 5, 9, 17, + 21, 13, 15, 27, 35, 41, 8, 10, 28, 4, + 5, 12, 12, 10, 1, 2, 6, 2, 17, 13, + 21, 3, 16, 33, 2, 32, 1, 10, 8, 14, + 34, 0, 2, 14, 26, 19, 23, 53, 54, 60, + 72, 40, 32, 48, 40, 40, 46, 32, 40, 40, + 16, 14, 7, 12, 24, 4, 21, 10, 6, 0, + 10, 0, 12, 14, 3, 2, 4, 30, 26, 22, + 18, 11, 1, 3, 29, 15, 29, 31, 55, 49, + 61, 65, 21, 19, 63, 2, 4, 13, 33, 19, + 21, 35, 23, 31, 39, 55, 57, 73, 83, 4, + 13, 41, 15, 10, 8, 20, 24, 28, 50, 32, + 44, 48, 76, 56, 52, 76, 74, 98, 50, 26, + 0, 17, 41, 67, 89, 121, 16, 82, 68, 66, + 50, 58, 28, 22, 24, 4, 23, 13, 10, 9, + 28, 46, 5, 1, 18, 26, 8, 24, 46, 8, + 2, 76, 36, 0, 35, 61, 101, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 30 */ + + 82, 12, 31, 82, 12, 31, 21, 6, 44, 22, + 1, 25, 36, 48, 106, 28, 30, 5, 20, 38, + 0, 15, 3, 35, 67, 8, 18, 103, 113, 113, + 46, 5, 9, 20, 38, 0, 21, 0, 28, 4, + 0, 11, 23, 5, 33, 25, 55, 10, 13, 23, + 0, 25, 23, 49, 20, 1, 5, 13, 10, 8, + 44, 0, 0, 0, 5, 59, 67, 10, 3, 7, + 36, 13, 53, 23, 36, 18, 80, 68, 14, 20, + 9, 24, 9, 9, 17, 29, 31, 37, 39, 46, + 0, 16, 25, 9, 23, 17, 37, 0, 15, 13, + 33, 34, 9, 4, 19, 35, 5, 19, 0, 12, + 8, 2, 24, 20, 19, 1, 2, 7, 6, 17, + 16, 15, 2, 16, 10, 34, 20, 6, 21, 8, + 11, 13, 3, 45, 11, 14, 2, 12, 14, 14, + 50, 26, 6, 1, 12, 1, 3, 10, 57, 11, + 2, 31, 0, 11, 20, 8, 9, 26, 
12, 8, + 21, 20, 11, 25, 6, 43, 30, 40, 28, 28, + 34, 32, 18, 32, 34, 3, 12, 18, 8, 4, + 13, 2, 9, 5, 0, 5, 3, 2, 9, 33, + 9, 17, 21, 17, 37, 20, 28, 16, 14, 6, + 2, 2, 2, 11, 27, 23, 29, 51, 45, 75, + 9, 9, 39, 6, 3, 9, 13, 25, 35, 25, + 29, 39, 23, 33, 57, 43, 55, 1, 46, 36, + 22, 10, 16, 2, 7, 7, 13, 6, 68, 42, + 30, 16, 32, 12, 6, 5, 2, 8, 80, 58, + 48, 34, 40, 10, 1, 9, 15, 13, 84, 46, + 16, 6, 18, 5, 23, 25, 10, 86, 62, 38, + 24, 40, 10, 1, 7, 13, 124, 35, 25, 9, + 21, 25, 15, 9, 5, 11, 7, 1, 0, 27, + 17, 35, 25, 22, 25, 39, 17, 5, 9, 19, + 21, 13, 17, 29, 37, 43, 8, 8, 28, 2, + 7, 12, 10, 10, 3, 2, 4, 2, 19, 13, + 23, 3, 16, 35, 2, 32, 3, 10, 6, 14, + 34, 1, 0, 12, 24, 21, 23, 55, 52, 58, + 72, 38, 30, 44, 36, 36, 42, 26, 36, 36, + 10, 8, 9, 6, 18, 1, 29, 4, 0, 3, + 8, 1, 12, 16, 3, 4, 8, 26, 22, 18, + 14, 15, 5, 7, 33, 19, 31, 33, 57, 53, + 63, 67, 23, 21, 65, 0, 0, 15, 37, 23, + 25, 39, 27, 35, 43, 57, 61, 77, 85, 2, + 15, 43, 15, 12, 8, 20, 24, 28, 52, 34, + 46, 50, 78, 58, 54, 80, 76, 98, 48, 22, + 5, 21, 47, 73, 95, 125, 18, 84, 70, 66, + 50, 60, 30, 22, 24, 4, 23, 11, 10, 7, + 30, 48, 5, 1, 18, 26, 8, 24, 48, 8, + 2, 74, 32, 5, 41, 67, 109, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 31 */ + + 80, 12, 31, 80, 12, 31, 17, 8, 44, 22, + 1, 27, 34, 46, 106, 28, 34, 5, 22, 40, + 0, 17, 3, 37, 69, 6, 14, 107, 115, 115, + 52, 3, 9, 22, 40, 0, 21, 2, 28, 2, + 0, 11, 21, 5, 33, 25, 55, 10, 13, 21, + 0, 25, 23, 49, 20, 1, 5, 13, 10, 8, + 44, 0, 0, 0, 3, 59, 67, 10, 5, 7, + 36, 13, 53, 19, 40, 22, 82, 72, 16, 22, + 7, 26, 7, 7, 13, 29, 31, 37, 37, 46, + 0, 16, 23, 9, 21, 15, 35, 0, 15, 13, + 33, 34, 9, 4, 17, 35, 5, 19, 0, 12, + 8, 2, 24, 20, 19, 1, 2, 7, 6, 17, + 14, 15, 2, 14, 8, 32, 18, 4, 19, 8, + 11, 15, 5, 45, 13, 14, 2, 12, 14, 14, + 52, 26, 6, 1, 12, 1, 5, 10, 59, 11, + 2, 33, 0, 13, 20, 8, 11, 26, 12, 8, + 23, 20, 13, 25, 6, 45, 28, 40, 28, 28, + 34, 32, 18, 32, 34, 3, 12, 18, 8, 4, + 13, 2, 9, 5, 0, 5, 
5, 0, 9, 33, + 9, 17, 23, 19, 37, 16, 24, 12, 10, 2, + 1, 1, 1, 15, 31, 27, 33, 55, 49, 77, + 9, 9, 41, 4, 5, 11, 15, 27, 37, 27, + 31, 41, 25, 35, 57, 43, 53, 1, 46, 36, + 22, 10, 16, 2, 7, 5, 11, 8, 68, 42, + 30, 16, 32, 12, 8, 3, 6, 8, 80, 58, + 48, 34, 40, 10, 1, 9, 13, 13, 84, 46, + 16, 6, 18, 5, 23, 23, 10, 86, 60, 36, + 22, 40, 10, 1, 7, 11, 124, 33, 23, 7, + 19, 23, 13, 7, 3, 9, 5, 0, 4, 25, + 15, 35, 23, 24, 25, 41, 17, 3, 9, 19, + 23, 13, 19, 31, 39, 45, 6, 8, 28, 2, + 7, 10, 10, 10, 5, 0, 4, 2, 21, 13, + 23, 3, 16, 35, 0, 30, 5, 10, 4, 12, + 34, 3, 1, 12, 24, 23, 25, 59, 50, 56, + 70, 36, 28, 42, 32, 32, 38, 22, 30, 30, + 4, 4, 11, 0, 12, 7, 37, 1, 3, 7, + 6, 3, 12, 18, 3, 6, 12, 22, 18, 14, + 10, 19, 9, 9, 37, 21, 35, 35, 61, 55, + 65, 69, 23, 21, 67, 1, 1, 19, 41, 27, + 29, 41, 29, 39, 45, 59, 63, 79, 87, 0, + 17, 45, 15, 12, 8, 22, 26, 30, 54, 36, + 48, 52, 80, 60, 56, 82, 78, 96, 44, 18, + 9, 27, 53, 79, 101, 125, 18, 84, 70, 68, + 50, 62, 30, 22, 26, 6, 23, 11, 12, 7, + 32, 50, 5, 0, 20, 28, 8, 26, 50, 8, + 2, 72, 28, 9, 47, 73, 117, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 32 */ + + 76, 10, 33, 76, 10, 33, 15, 10, 44, 22, + 3, 31, 30, 42, 104, 28, 38, 7, 22, 40, + 1, 19, 3, 41, 73, 4, 10, 111, 117, 117, + 56, 1, 11, 22, 40, 1, 21, 4, 28, 0, + 0, 11, 21, 7, 35, 25, 57, 10, 13, 21, + 1, 27, 23, 49, 20, 1, 5, 13, 10, 6, + 44, 0, 0, 0, 3, 61, 67, 10, 7, 9, + 36, 13, 53, 17, 42, 24, 84, 74, 18, 24, + 5, 28, 5, 7, 11, 29, 31, 37, 37, 46, + 0, 16, 23, 9, 21, 15, 33, 1, 17, 15, + 35, 34, 9, 4, 17, 35, 5, 19, 0, 10, + 6, 2, 22, 18, 19, 3, 2, 7, 4, 17, + 12, 17, 0, 12, 4, 30, 16, 2, 19, 8, + 11, 17, 7, 45, 15, 12, 0, 12, 14, 14, + 52, 26, 6, 3, 12, 3, 7, 8, 61, 13, + 0, 35, 1, 15, 18, 6, 13, 26, 12, 8, + 27, 18, 15, 25, 6, 47, 26, 38, 26, 28, + 34, 30, 16, 32, 32, 5, 10, 18, 6, 2, + 15, 0, 11, 7, 1, 7, 7, 1, 11, 33, + 11, 17, 25, 23, 37, 12, 20, 8, 6, 1, + 7, 5, 5, 19, 35, 31, 37, 61, 53, 81, + 11, 11, 43, 
2, 9, 15, 19, 31, 41, 31, + 35, 43, 27, 37, 57, 43, 53, 1, 46, 36, + 22, 10, 16, 2, 7, 5, 9, 8, 66, 42, + 30, 16, 32, 12, 8, 3, 8, 6, 80, 58, + 46, 32, 40, 10, 1, 9, 13, 13, 84, 44, + 14, 4, 18, 5, 23, 23, 10, 84, 58, 34, + 20, 38, 10, 1, 7, 11, 124, 33, 21, 7, + 17, 21, 13, 5, 1, 7, 3, 2, 6, 25, + 15, 35, 23, 26, 25, 43, 17, 3, 9, 21, + 25, 15, 21, 33, 43, 49, 4, 6, 28, 0, + 9, 8, 8, 8, 7, 1, 2, 0, 23, 15, + 25, 3, 16, 37, 1, 28, 7, 8, 2, 10, + 34, 5, 5, 10, 22, 25, 27, 63, 48, 54, + 68, 32, 24, 38, 28, 28, 32, 16, 24, 24, + 1, 1, 15, 5, 4, 13, 47, 7, 9, 11, + 2, 7, 12, 18, 3, 8, 16, 18, 14, 10, + 6, 25, 13, 13, 43, 25, 39, 39, 65, 59, + 67, 71, 25, 23, 71, 5, 5, 23, 45, 31, + 33, 45, 33, 43, 49, 63, 67, 83, 89, 1, + 19, 47, 15, 12, 8, 22, 26, 30, 56, 36, + 50, 52, 82, 62, 58, 84, 80, 94, 40, 12, + 15, 33, 59, 87, 107, 125, 18, 84, 70, 68, + 50, 62, 30, 22, 26, 6, 23, 11, 12, 7, + 34, 52, 5, 0, 20, 28, 8, 26, 50, 8, + 0, 68, 22, 15, 53, 81, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 33 */ + + 74, 10, 33, 74, 10, 33, 11, 14, 46, 24, + 3, 33, 28, 40, 104, 28, 44, 7, 24, 42, + 1, 19, 1, 43, 75, 4, 8, 113, 119, 117, + 62, 2, 11, 24, 42, 1, 19, 8, 30, 0, + 2, 9, 19, 7, 35, 23, 57, 10, 11, 19, + 1, 27, 21, 49, 22, 0, 3, 11, 12, 6, + 44, 0, 0, 0, 1, 61, 67, 12, 7, 9, + 36, 11, 51, 13, 46, 28, 88, 78, 22, 26, + 1, 32, 1, 5, 7, 27, 29, 35, 35, 48, + 2, 18, 21, 7, 19, 13, 29, 1, 17, 15, + 35, 36, 9, 6, 15, 33, 3, 17, 2, 10, + 6, 4, 22, 18, 17, 3, 4, 7, 4, 15, + 12, 17, 0, 12, 2, 30, 16, 2, 17, 10, + 9, 17, 7, 43, 15, 12, 0, 14, 14, 14, + 54, 28, 8, 3, 14, 3, 7, 8, 61, 13, + 0, 35, 1, 17, 18, 6, 13, 28, 14, 8, + 29, 18, 15, 23, 6, 47, 26, 38, 26, 28, + 34, 30, 16, 32, 32, 5, 10, 18, 6, 2, + 15, 0, 11, 7, 1, 7, 7, 1, 11, 31, + 11, 15, 25, 25, 35, 10, 18, 6, 4, 5, + 11, 7, 7, 23, 37, 33, 39, 65, 55, 83, + 11, 11, 43, 0, 11, 17, 21, 33, 43, 33, + 37, 43, 27, 37, 55, 41, 51, 0, 48, 36, + 22, 12, 18, 4, 5, 3, 5, 10, 66, 42, 
+ 30, 16, 34, 14, 10, 1, 12, 6, 82, 60, + 46, 32, 42, 10, 1, 7, 11, 13, 86, 44, + 14, 4, 20, 3, 21, 21, 10, 84, 58, 34, + 20, 38, 10, 0, 5, 9, 124, 31, 19, 5, + 13, 17, 11, 1, 2, 3, 0, 6, 10, 23, + 13, 33, 21, 30, 23, 43, 15, 1, 7, 21, + 25, 15, 21, 33, 45, 51, 4, 6, 30, 0, + 9, 8, 8, 8, 7, 1, 2, 0, 23, 15, + 25, 1, 18, 37, 1, 28, 7, 8, 2, 10, + 36, 5, 7, 10, 22, 25, 27, 65, 48, 54, + 68, 30, 22, 36, 24, 24, 28, 12, 20, 20, + 5, 5, 17, 9, 1, 19, 55, 11, 13, 13, + 0, 9, 12, 20, 1, 12, 22, 16, 12, 8, + 4, 29, 15, 15, 47, 27, 41, 41, 67, 61, + 69, 71, 25, 23, 73, 7, 7, 25, 47, 33, + 35, 47, 35, 45, 51, 65, 69, 85, 89, 1, + 19, 47, 13, 14, 10, 24, 28, 32, 60, 38, + 52, 54, 86, 66, 60, 88, 84, 94, 38, 8, + 19, 37, 63, 93, 111, 125, 20, 86, 72, 70, + 52, 64, 32, 24, 28, 8, 21, 9, 14, 5, + 38, 56, 3, 2, 22, 30, 10, 28, 52, 8, + 0, 66, 18, 19, 57, 87, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 34 */ + + 72, 10, 33, 72, 10, 33, 7, 16, 46, 24, + 3, 35, 26, 38, 104, 28, 48, 7, 26, 44, + 1, 21, 1, 45, 77, 2, 4, 117, 121, 119, + 68, 4, 11, 26, 44, 1, 19, 10, 30, 1, + 2, 9, 17, 7, 35, 23, 57, 10, 11, 17, + 1, 27, 21, 49, 22, 0, 3, 11, 12, 6, + 44, 0, 0, 0, 0, 61, 67, 12, 9, 9, + 36, 11, 51, 9, 50, 32, 90, 82, 24, 28, + 0, 34, 0, 3, 3, 27, 29, 35, 33, 48, + 2, 18, 19, 7, 17, 13, 27, 1, 17, 15, + 35, 36, 9, 6, 15, 33, 3, 17, 2, 10, + 6, 4, 22, 18, 17, 3, 4, 7, 4, 15, + 10, 17, 0, 10, 0, 28, 14, 0, 15, 10, + 9, 19, 9, 43, 17, 12, 0, 14, 14, 14, + 56, 28, 8, 3, 14, 3, 9, 8, 63, 13, + 0, 37, 1, 19, 18, 6, 15, 28, 14, 8, + 31, 18, 17, 23, 6, 49, 24, 38, 26, 28, + 34, 30, 16, 32, 32, 5, 10, 18, 6, 2, + 15, 0, 11, 7, 1, 7, 9, 3, 13, 31, + 11, 15, 27, 27, 35, 6, 14, 2, 0, 9, + 15, 11, 11, 27, 41, 37, 43, 69, 59, 85, + 11, 11, 45, 1, 13, 19, 23, 37, 45, 35, + 39, 45, 29, 39, 55, 41, 49, 0, 48, 36, + 22, 12, 18, 4, 5, 3, 3, 12, 66, 42, + 30, 16, 34, 14, 12, 0, 16, 6, 82, 60, + 46, 32, 42, 10, 1, 7, 9, 13, 86, 44, + 14, 4, 20, 3, 21, 
19, 10, 82, 56, 32, + 18, 38, 10, 0, 5, 9, 124, 29, 17, 3, + 11, 15, 9, 0, 4, 1, 2, 8, 14, 21, + 11, 33, 19, 32, 23, 45, 15, 0, 7, 21, + 27, 15, 23, 35, 47, 53, 2, 6, 30, 0, + 11, 6, 8, 8, 9, 3, 2, 0, 25, 15, + 25, 1, 18, 37, 3, 26, 9, 8, 0, 8, + 36, 7, 9, 10, 22, 27, 29, 69, 46, 52, + 66, 28, 20, 34, 20, 20, 24, 8, 14, 14, + 11, 9, 19, 15, 7, 25, 63, 17, 17, 17, + 1, 11, 12, 22, 1, 14, 26, 12, 8, 4, + 0, 33, 19, 19, 51, 29, 45, 43, 71, 63, + 71, 73, 27, 23, 75, 9, 9, 29, 51, 37, + 39, 49, 39, 49, 53, 67, 71, 87, 91, 3, + 21, 49, 13, 14, 10, 26, 28, 34, 62, 40, + 54, 56, 88, 68, 62, 90, 86, 92, 34, 4, + 23, 43, 69, 99, 117, 125, 20, 86, 72, 70, + 52, 66, 32, 24, 28, 8, 21, 9, 16, 5, + 40, 58, 3, 4, 24, 32, 10, 28, 54, 8, + 0, 64, 14, 23, 63, 93, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 35 */ + + 70, 10, 33, 70, 10, 33, 3, 20, 48, 24, + 5, 39, 22, 36, 104, 28, 52, 7, 28, 46, + 1, 23, 1, 47, 79, 2, 0, 119, 123, 119, + 74, 6, 11, 28, 46, 1, 17, 12, 32, 1, + 2, 7, 15, 9, 37, 23, 57, 10, 11, 17, + 1, 27, 21, 49, 24, 0, 3, 11, 14, 6, + 44, 0, 0, 0, 0, 61, 67, 14, 11, 9, + 36, 11, 51, 5, 54, 34, 94, 86, 26, 30, + 4, 36, 2, 1, 0, 25, 29, 33, 31, 48, + 2, 18, 19, 7, 17, 11, 23, 1, 19, 17, + 37, 36, 9, 6, 13, 33, 1, 17, 4, 10, + 6, 4, 22, 16, 17, 3, 4, 7, 4, 15, + 8, 17, 0, 8, 1, 26, 12, 1, 15, 12, + 9, 21, 9, 41, 17, 12, 0, 14, 14, 14, + 58, 28, 8, 3, 16, 3, 11, 8, 65, 13, + 0, 37, 3, 21, 18, 6, 17, 28, 14, 8, + 35, 18, 19, 23, 6, 49, 22, 38, 26, 28, + 34, 30, 16, 32, 32, 5, 8, 18, 6, 2, + 15, 1, 13, 9, 3, 9, 11, 5, 13, 31, + 13, 15, 29, 31, 35, 4, 10, 1, 3, 13, + 19, 15, 15, 31, 45, 39, 45, 73, 63, 87, + 11, 11, 45, 3, 15, 21, 27, 39, 49, 37, + 41, 47, 29, 39, 55, 41, 49, 2, 48, 36, + 22, 12, 18, 4, 5, 1, 1, 12, 66, 42, + 30, 16, 36, 16, 14, 2, 20, 6, 82, 60, + 46, 32, 44, 10, 1, 7, 9, 13, 86, 44, + 12, 4, 20, 3, 21, 17, 10, 82, 56, 30, + 16, 38, 10, 0, 3, 7, 124, 27, 15, 3, + 9, 13, 7, 4, 6, 0, 4, 12, 16, 21, + 9, 
33, 19, 36, 23, 45, 15, 0, 7, 23, + 27, 15, 25, 37, 49, 55, 2, 4, 30, 1, + 11, 6, 6, 8, 11, 3, 0, 0, 27, 15, + 27, 1, 18, 39, 3, 26, 11, 8, 1, 8, + 36, 9, 11, 8, 20, 29, 29, 71, 44, 50, + 66, 26, 18, 30, 16, 16, 20, 2, 10, 10, + 17, 15, 21, 21, 13, 31, 71, 23, 23, 21, + 3, 13, 12, 24, 1, 16, 30, 8, 4, 0, + 3, 37, 23, 21, 55, 33, 47, 45, 73, 67, + 73, 75, 27, 25, 77, 11, 13, 31, 55, 41, + 43, 53, 41, 53, 57, 69, 75, 91, 93, 5, + 23, 51, 13, 16, 10, 26, 30, 34, 64, 42, + 56, 58, 90, 70, 64, 94, 88, 92, 32, 0, + 29, 47, 75, 105, 123, 125, 22, 88, 74, 72, + 52, 68, 34, 24, 30, 10, 21, 7, 16, 3, + 42, 60, 3, 4, 24, 32, 10, 30, 56, 8, + 0, 62, 10, 29, 69, 99, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 36 */ + + 66, 10, 33, 66, 10, 33, 1, 22, 48, 24, + 5, 41, 20, 32, 104, 28, 58, 9, 30, 46, + 1, 25, 1, 49, 81, 0, 3, 123, 125, 121, + 80, 8, 11, 30, 46, 1, 17, 14, 32, 3, + 2, 7, 15, 9, 37, 21, 57, 10, 11, 15, + 3, 29, 21, 49, 24, 2, 3, 9, 14, 6, + 44, 0, 0, 0, 2, 63, 67, 14, 13, 9, + 36, 11, 51, 3, 56, 38, 96, 88, 28, 32, + 6, 40, 4, 0, 4, 25, 27, 33, 29, 48, + 2, 18, 17, 7, 15, 11, 21, 3, 19, 17, + 37, 36, 9, 6, 13, 33, 1, 17, 4, 10, + 4, 4, 22, 16, 17, 3, 4, 7, 4, 15, + 8, 19, 0, 8, 3, 24, 10, 3, 13, 12, + 9, 23, 11, 41, 19, 12, 1, 14, 14, 14, + 60, 30, 8, 5, 16, 5, 13, 6, 67, 13, + 0, 39, 3, 23, 16, 4, 17, 30, 14, 8, + 37, 16, 21, 23, 6, 51, 22, 38, 26, 28, + 34, 30, 16, 32, 30, 5, 8, 18, 6, 2, + 17, 1, 13, 9, 3, 9, 11, 7, 15, 31, + 13, 15, 31, 33, 35, 0, 6, 5, 7, 17, + 23, 19, 19, 35, 49, 43, 49, 79, 67, 89, + 11, 13, 47, 5, 17, 23, 29, 43, 51, 39, + 45, 47, 31, 41, 55, 41, 47, 2, 48, 36, + 22, 12, 18, 4, 5, 1, 0, 14, 66, 42, + 30, 16, 36, 16, 14, 2, 24, 6, 82, 60, + 46, 32, 44, 10, 1, 7, 7, 13, 86, 42, + 12, 4, 20, 3, 21, 17, 10, 80, 54, 28, + 14, 38, 10, 0, 3, 7, 124, 25, 13, 1, + 5, 11, 5, 6, 10, 2, 8, 14, 20, 19, + 9, 31, 17, 38, 21, 47, 15, 2, 7, 23, + 29, 17, 27, 39, 51, 57, 0, 4, 30, 1, + 13, 4, 6, 8, 13, 5, 
0, 1, 29, 17, + 27, 1, 18, 39, 5, 24, 13, 8, 3, 6, + 36, 11, 13, 8, 20, 31, 31, 75, 42, 50, + 64, 24, 14, 28, 12, 12, 16, 1, 4, 4, + 23, 19, 25, 27, 21, 37, 79, 27, 27, 25, + 5, 17, 12, 26, 0, 18, 34, 6, 0, 3, + 5, 41, 27, 25, 59, 35, 51, 49, 77, 69, + 75, 77, 29, 25, 79, 15, 15, 35, 57, 45, + 47, 55, 45, 57, 59, 71, 77, 93, 93, 7, + 23, 53, 13, 16, 12, 28, 30, 36, 66, 42, + 58, 60, 92, 72, 66, 96, 90, 90, 28, 3, + 33, 53, 81, 111, 125, 125, 22, 88, 74, 72, + 54, 68, 34, 26, 30, 10, 19, 7, 18, 3, + 44, 62, 3, 6, 26, 34, 10, 30, 58, 8, + 0, 58, 6, 33, 75, 105, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 37 */ + + 64, 10, 33, 64, 10, 33, 2, 26, 48, 24, + 7, 45, 16, 30, 104, 28, 62, 9, 32, 48, + 1, 25, 0, 51, 83, 0, 7, 125, 125, 121, + 86, 12, 11, 32, 48, 1, 17, 16, 32, 3, + 2, 7, 13, 9, 39, 21, 57, 10, 11, 13, + 3, 29, 21, 49, 26, 2, 1, 9, 16, 6, + 44, 0, 0, 0, 2, 63, 67, 16, 15, 9, + 36, 9, 51, 0, 60, 40, 100, 92, 32, 34, + 8, 42, 6, 2, 8, 23, 27, 31, 27, 48, + 2, 18, 17, 7, 15, 9, 17, 3, 19, 17, + 37, 36, 9, 8, 11, 33, 1, 17, 6, 10, + 4, 4, 22, 16, 15, 3, 6, 7, 4, 15, + 6, 19, 0, 6, 5, 24, 8, 5, 11, 14, + 9, 25, 11, 39, 19, 12, 1, 14, 14, 14, + 62, 30, 8, 5, 16, 5, 15, 6, 69, 13, + 0, 39, 5, 25, 16, 4, 19, 30, 14, 8, + 39, 16, 23, 23, 6, 51, 20, 38, 26, 28, + 34, 30, 16, 32, 30, 5, 8, 18, 6, 2, + 17, 1, 13, 11, 3, 9, 13, 9, 15, 31, + 15, 15, 33, 35, 35, 1, 2, 7, 9, 21, + 27, 23, 23, 39, 53, 47, 53, 83, 69, 91, + 11, 13, 47, 7, 19, 25, 31, 45, 55, 41, + 47, 49, 31, 41, 55, 41, 47, 2, 48, 36, + 22, 12, 18, 6, 3, 0, 2, 14, 66, 42, + 30, 16, 36, 18, 16, 4, 28, 6, 82, 60, + 46, 32, 44, 10, 1, 7, 5, 13, 88, 42, + 10, 4, 20, 3, 21, 15, 10, 80, 52, 26, + 12, 38, 10, 0, 3, 5, 124, 23, 11, 0, + 3, 9, 3, 8, 12, 4, 10, 18, 24, 17, + 7, 31, 17, 40, 21, 47, 13, 4, 7, 25, + 29, 17, 27, 41, 53, 59, 1, 2, 30, 3, + 13, 4, 6, 8, 15, 7, 0, 1, 31, 17, + 27, 1, 18, 39, 7, 24, 13, 8, 5, 4, + 36, 13, 15, 6, 18, 33, 33, 79, 40, 48, + 
64, 22, 12, 24, 8, 8, 12, 7, 1, 0, + 29, 23, 27, 33, 27, 43, 87, 33, 33, 29, + 7, 19, 12, 28, 0, 20, 38, 2, 1, 7, + 9, 45, 31, 27, 63, 37, 55, 51, 81, 73, + 77, 79, 29, 25, 81, 17, 19, 39, 61, 47, + 49, 59, 47, 59, 63, 73, 79, 95, 95, 7, + 25, 55, 11, 18, 12, 28, 32, 38, 68, 44, + 60, 62, 96, 74, 68, 98, 94, 90, 24, 7, + 39, 57, 87, 117, 125, 125, 22, 90, 76, 74, + 54, 70, 34, 26, 32, 12, 19, 7, 20, 1, + 46, 64, 1, 8, 26, 34, 10, 32, 60, 8, + 0, 56, 2, 39, 81, 111, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 38 */ + + 62, 10, 35, 62, 10, 35, 6, 28, 50, 24, + 7, 47, 14, 28, 104, 28, 66, 9, 32, 50, + 1, 27, 0, 53, 85, 1, 9, 125, 125, 123, + 92, 14, 11, 32, 50, 1, 15, 18, 34, 5, + 4, 5, 11, 11, 39, 21, 57, 10, 9, 13, + 3, 29, 19, 49, 26, 2, 1, 9, 16, 6, + 44, 0, 0, 0, 4, 63, 67, 16, 15, 11, + 36, 9, 49, 4, 64, 44, 102, 96, 34, 36, + 12, 44, 8, 2, 10, 23, 27, 31, 27, 50, + 2, 18, 15, 5, 13, 9, 15, 3, 21, 19, + 39, 36, 9, 8, 11, 33, 0, 17, 6, 10, + 4, 6, 22, 14, 15, 3, 6, 7, 4, 15, + 4, 19, 0, 4, 7, 22, 8, 5, 11, 14, + 9, 25, 13, 39, 21, 12, 1, 14, 14, 14, + 62, 30, 8, 5, 18, 5, 17, 6, 69, 15, + 0, 41, 5, 27, 16, 4, 21, 30, 14, 8, + 43, 16, 25, 23, 6, 53, 18, 38, 24, 28, + 34, 30, 16, 32, 30, 5, 6, 18, 4, 2, + 17, 3, 15, 11, 5, 11, 15, 11, 17, 31, + 15, 13, 35, 39, 35, 5, 1, 11, 13, 25, + 31, 27, 27, 43, 57, 49, 55, 87, 73, 93, + 11, 13, 49, 9, 21, 27, 35, 49, 57, 43, + 49, 51, 33, 43, 55, 41, 45, 4, 48, 36, + 22, 14, 18, 6, 3, 0, 4, 16, 64, 42, + 30, 16, 38, 18, 18, 6, 32, 6, 84, 60, + 46, 32, 46, 10, 1, 5, 5, 13, 88, 42, + 10, 4, 20, 1, 19, 13, 10, 78, 52, 24, + 12, 38, 10, 2, 1, 5, 124, 23, 9, 0, + 1, 7, 1, 12, 14, 6, 12, 20, 26, 17, + 5, 31, 15, 44, 21, 49, 13, 4, 7, 25, + 31, 17, 29, 43, 55, 61, 1, 2, 30, 3, + 15, 2, 4, 8, 15, 7, 1, 1, 31, 17, + 29, 1, 18, 41, 7, 22, 15, 6, 7, 4, + 38, 13, 17, 6, 18, 33, 33, 81, 38, 46, + 62, 20, 10, 22, 4, 4, 8, 11, 5, 5, + 35, 29, 29, 37, 33, 49, 95, 39, 37, 33, + 9, 21, 12, 
30, 0, 24, 44, 1, 5, 11, + 13, 49, 35, 31, 67, 41, 57, 53, 83, 75, + 79, 81, 31, 27, 83, 19, 21, 41, 65, 51, + 53, 61, 51, 63, 65, 77, 83, 99, 97, 9, + 27, 57, 11, 18, 12, 30, 32, 38, 72, 46, + 62, 64, 98, 78, 70, 102, 96, 88, 22, 11, + 43, 63, 91, 123, 125, 125, 24, 90, 76, 74, + 54, 72, 36, 26, 32, 12, 19, 5, 20, 1, + 48, 66, 1, 8, 28, 36, 10, 32, 62, 8, + 0, 54, 1, 43, 87, 119, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 39 */ + + 60, 10, 35, 60, 10, 35, 10, 32, 50, 24, + 9, 51, 10, 24, 104, 28, 72, 11, 34, 50, + 1, 29, 0, 55, 87, 1, 13, 125, 125, 123, + 98, 16, 11, 34, 50, 1, 15, 20, 34, 5, + 4, 5, 11, 11, 41, 19, 57, 10, 9, 11, + 5, 29, 19, 49, 28, 4, 1, 7, 18, 6, + 44, 0, 0, 0, 4, 63, 67, 18, 17, 11, + 36, 9, 49, 6, 68, 46, 106, 100, 36, 38, + 14, 48, 10, 4, 14, 21, 25, 29, 25, 50, + 2, 18, 15, 5, 13, 7, 11, 5, 21, 19, + 39, 36, 9, 8, 9, 33, 0, 17, 8, 10, + 2, 6, 22, 14, 15, 3, 6, 7, 4, 15, + 4, 19, 0, 4, 9, 20, 6, 7, 9, 16, + 9, 27, 13, 37, 21, 12, 3, 14, 14, 14, + 64, 32, 8, 5, 18, 5, 19, 6, 71, 15, + 0, 41, 7, 29, 14, 4, 21, 32, 14, 8, + 45, 14, 27, 23, 6, 53, 18, 38, 24, 28, + 34, 30, 16, 32, 30, 5, 6, 18, 4, 2, + 17, 3, 15, 13, 5, 11, 15, 13, 17, 31, + 17, 13, 37, 41, 35, 7, 5, 15, 17, 29, + 35, 31, 31, 47, 61, 53, 59, 93, 77, 95, + 11, 15, 49, 11, 23, 29, 37, 51, 61, 45, + 51, 51, 33, 43, 55, 41, 45, 4, 48, 36, + 22, 14, 18, 6, 3, 2, 6, 16, 64, 42, + 30, 16, 38, 20, 20, 6, 36, 6, 84, 60, + 46, 32, 46, 10, 1, 5, 3, 13, 88, 40, + 8, 4, 20, 1, 19, 13, 10, 78, 50, 22, + 10, 38, 10, 2, 1, 3, 124, 21, 7, 2, + 2, 5, 0, 14, 18, 8, 16, 24, 30, 15, + 3, 29, 15, 46, 19, 49, 13, 6, 7, 27, + 31, 19, 31, 45, 57, 63, 3, 0, 30, 5, + 15, 2, 4, 8, 17, 9, 1, 3, 33, 17, + 29, 1, 18, 41, 9, 22, 17, 6, 9, 2, + 38, 15, 19, 4, 16, 35, 35, 85, 36, 46, + 62, 18, 6, 18, 0, 0, 4, 17, 11, 9, + 41, 33, 33, 43, 41, 55, 103, 43, 43, 37, + 11, 23, 12, 32, 2, 26, 48, 3, 9, 15, + 15, 53, 39, 33, 71, 43, 61, 57, 87, 79, + 81, 83, 31, 
27, 85, 23, 25, 45, 67, 55, + 57, 65, 53, 67, 69, 79, 85, 101, 97, 11, + 27, 59, 11, 20, 14, 30, 34, 40, 74, 48, + 64, 66, 100, 80, 72, 104, 98, 88, 18, 15, + 49, 67, 97, 125, 125, 125, 24, 92, 78, 76, + 56, 74, 36, 28, 34, 14, 17, 5, 22, 0, + 50, 68, 1, 10, 28, 36, 10, 34, 64, 8, + 0, 50, 5, 49, 93, 125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 40 */ + + 56, 8, 35, 56, 8, 35, 12, 34, 50, 24, + 9, 53, 8, 22, 104, 28, 76, 11, 36, 52, + 1, 31, 0, 57, 91, 3, 17, 125, 125, 125, + 102, 18, 11, 36, 52, 1, 15, 22, 34, 7, + 4, 5, 9, 13, 41, 19, 59, 10, 9, 11, + 5, 31, 19, 49, 28, 4, 1, 7, 18, 6, + 44, 0, 0, 0, 6, 65, 67, 18, 19, 11, + 36, 9, 49, 10, 70, 50, 108, 102, 38, 40, + 16, 50, 12, 6, 18, 21, 25, 29, 23, 50, + 2, 18, 13, 5, 11, 7, 9, 5, 23, 21, + 41, 36, 9, 8, 9, 33, 0, 17, 8, 8, + 2, 6, 22, 12, 15, 3, 6, 7, 2, 15, + 2, 21, 0, 2, 13, 18, 4, 9, 9, 16, + 9, 29, 15, 37, 23, 10, 3, 14, 14, 14, + 66, 32, 8, 7, 18, 7, 21, 4, 73, 15, + 0, 43, 7, 31, 14, 2, 23, 32, 14, 8, + 49, 14, 29, 23, 6, 55, 16, 36, 24, 28, + 34, 28, 14, 32, 28, 7, 4, 18, 4, 0, + 19, 5, 17, 13, 7, 13, 17, 15, 19, 31, + 17, 13, 39, 45, 35, 11, 9, 19, 21, 33, + 41, 35, 35, 51, 65, 57, 63, 97, 81, 97, + 11, 15, 51, 13, 27, 33, 41, 55, 63, 49, + 55, 53, 35, 45, 55, 41, 43, 4, 48, 36, + 22, 14, 18, 6, 3, 2, 8, 18, 64, 42, + 30, 16, 38, 20, 20, 8, 38, 6, 84, 60, + 46, 30, 46, 10, 1, 5, 3, 13, 88, 40, + 8, 4, 20, 1, 19, 11, 10, 76, 48, 20, + 8, 36, 10, 2, 1, 3, 124, 19, 5, 2, + 4, 3, 0, 16, 20, 10, 18, 26, 32, 15, + 3, 29, 13, 48, 19, 51, 13, 6, 7, 27, + 33, 19, 33, 47, 61, 65, 5, 0, 30, 5, + 17, 0, 2, 6, 19, 11, 3, 3, 35, 19, + 31, 1, 18, 43, 11, 20, 19, 6, 11, 0, + 38, 17, 21, 4, 16, 37, 37, 89, 34, 44, + 60, 14, 4, 16, 3, 3, 1, 21, 17, 15, + 47, 39, 35, 49, 47, 61, 111, 49, 47, 41, + 15, 27, 12, 32, 2, 28, 52, 7, 13, 19, + 19, 59, 43, 37, 75, 47, 65, 59, 91, 81, + 83, 85, 33, 29, 87, 25, 27, 49, 71, 59, + 61, 67, 57, 71, 71, 81, 89, 105, 99, 13, + 
29, 61, 11, 20, 14, 32, 34, 40, 76, 48, + 66, 68, 102, 82, 74, 106, 100, 86, 14, 19, + 53, 73, 103, 125, 125, 125, 24, 92, 78, 76, + 56, 74, 36, 28, 34, 14, 17, 5, 22, 0, + 52, 70, 1, 10, 30, 38, 10, 34, 64, 8, + 1, 48, 9, 53, 99, 125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 41 */ + + 54, 8, 35, 54, 8, 35, 16, 36, 52, 24, + 9, 55, 6, 20, 104, 28, 80, 11, 38, 54, + 1, 31, 2, 59, 93, 5, 21, 125, 125, 125, + 108, 22, 11, 38, 54, 1, 13, 24, 36, 9, + 4, 3, 7, 13, 41, 19, 59, 10, 9, 9, + 5, 31, 19, 49, 28, 4, 0, 7, 18, 6, + 44, 0, 0, 0, 8, 65, 67, 18, 21, 11, + 36, 7, 49, 14, 74, 54, 110, 106, 42, 42, + 20, 52, 16, 8, 22, 21, 25, 29, 21, 50, + 2, 18, 11, 5, 9, 5, 7, 5, 23, 21, + 41, 36, 9, 10, 7, 31, 2, 17, 8, 8, + 2, 6, 22, 12, 13, 3, 8, 7, 2, 13, + 0, 21, 0, 0, 15, 18, 2, 11, 7, 16, + 7, 31, 17, 37, 25, 10, 3, 16, 14, 14, + 68, 32, 8, 7, 20, 7, 21, 4, 75, 15, + 0, 45, 7, 33, 14, 2, 25, 32, 14, 8, + 51, 14, 29, 23, 6, 57, 14, 36, 24, 28, + 34, 28, 14, 32, 28, 7, 4, 18, 4, 0, + 19, 5, 17, 13, 7, 13, 19, 15, 19, 31, + 17, 13, 41, 47, 35, 15, 11, 21, 23, 37, + 45, 37, 37, 55, 69, 59, 65, 101, 83, 99, + 11, 15, 53, 15, 29, 35, 43, 57, 65, 51, + 57, 55, 37, 47, 55, 39, 41, 6, 50, 36, + 22, 14, 20, 8, 1, 4, 10, 20, 64, 42, + 30, 16, 40, 20, 22, 10, 42, 6, 84, 60, + 46, 30, 48, 10, 1, 5, 1, 13, 90, 40, + 8, 4, 22, 1, 19, 9, 10, 76, 48, 20, + 6, 36, 10, 2, 0, 1, 124, 17, 3, 4, + 6, 0, 2, 20, 22, 14, 20, 28, 36, 13, + 1, 29, 11, 52, 19, 53, 11, 8, 7, 27, + 35, 19, 33, 49, 63, 67, 5, 0, 30, 5, + 17, 1, 2, 6, 21, 11, 3, 3, 37, 19, + 31, 0, 20, 43, 11, 18, 19, 6, 11, 0, + 38, 19, 23, 4, 16, 39, 37, 91, 34, 42, + 58, 12, 2, 14, 7, 7, 5, 25, 21, 21, + 51, 43, 37, 55, 53, 67, 119, 55, 51, 43, + 17, 29, 12, 34, 2, 30, 56, 11, 15, 23, + 23, 63, 45, 39, 79, 49, 67, 61, 93, 83, + 85, 85, 33, 29, 89, 27, 29, 51, 75, 61, + 63, 69, 59, 73, 73, 83, 91, 107, 101, 13, + 31, 61, 9, 20, 14, 34, 36, 42, 78, 50, + 68, 70, 106, 84, 76, 110, 
104, 84, 12, 23, + 57, 79, 109, 125, 125, 125, 26, 92, 78, 78, + 56, 76, 38, 28, 36, 16, 17, 3, 24, 0, + 54, 74, 0, 12, 32, 40, 10, 36, 66, 8, + 1, 46, 13, 57, 103, 125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 42 */ + + 52, 8, 35, 52, 8, 35, 20, 40, 52, 24, + 11, 59, 2, 16, 104, 28, 86, 13, 40, 54, + 1, 33, 2, 61, 95, 5, 25, 125, 125, 125, + 114, 24, 11, 40, 54, 1, 13, 26, 36, 9, + 4, 3, 7, 13, 43, 17, 59, 10, 9, 7, + 7, 31, 19, 49, 30, 6, 0, 5, 20, 6, + 44, 0, 0, 0, 8, 65, 67, 20, 23, 11, + 36, 7, 49, 16, 78, 56, 114, 110, 44, 44, + 22, 56, 18, 10, 26, 19, 23, 27, 19, 50, + 2, 18, 11, 5, 9, 5, 3, 7, 23, 21, + 41, 36, 9, 10, 7, 31, 2, 17, 10, 8, + 0, 6, 22, 12, 13, 3, 8, 7, 2, 13, + 0, 21, 0, 0, 17, 16, 0, 13, 5, 18, + 7, 33, 17, 35, 25, 10, 5, 16, 14, 14, + 70, 34, 8, 7, 20, 7, 23, 4, 77, 15, + 0, 45, 9, 35, 12, 2, 25, 34, 14, 8, + 53, 12, 31, 23, 6, 57, 14, 36, 24, 28, + 34, 28, 14, 32, 28, 7, 4, 18, 4, 0, + 19, 5, 17, 15, 7, 13, 19, 17, 21, 31, + 19, 13, 43, 49, 35, 17, 15, 25, 27, 41, + 49, 41, 41, 59, 73, 63, 69, 107, 87, 101, + 11, 17, 53, 17, 31, 37, 45, 61, 69, 53, + 59, 55, 37, 47, 55, 39, 41, 6, 50, 36, + 22, 14, 20, 8, 1, 4, 12, 20, 64, 42, + 30, 16, 40, 22, 24, 10, 46, 6, 84, 60, + 46, 30, 48, 10, 1, 5, 0, 13, 90, 38, + 6, 4, 22, 1, 19, 9, 10, 74, 46, 18, + 4, 36, 10, 2, 0, 1, 124, 15, 1, 6, + 10, 2, 4, 22, 26, 16, 24, 32, 40, 11, + 0, 27, 11, 54, 17, 53, 11, 10, 7, 29, + 35, 21, 35, 51, 65, 69, 7, 1, 30, 7, + 19, 1, 2, 6, 23, 13, 3, 5, 39, 19, + 31, 0, 20, 43, 13, 18, 21, 6, 13, 1, + 38, 21, 25, 2, 14, 41, 39, 95, 32, 42, + 58, 10, 1, 10, 11, 11, 9, 31, 27, 25, + 57, 47, 41, 61, 61, 73, 125, 59, 57, 47, + 19, 31, 12, 36, 4, 32, 60, 13, 19, 27, + 25, 67, 49, 43, 83, 51, 71, 65, 97, 87, + 87, 87, 35, 29, 91, 31, 33, 55, 77, 65, + 67, 73, 63, 77, 77, 85, 93, 109, 101, 15, + 31, 63, 9, 22, 16, 34, 36, 44, 80, 52, + 70, 72, 108, 86, 78, 112, 106, 84, 8, 27, + 63, 83, 115, 125, 125, 125, 26, 94, 80, 
78, + 58, 78, 38, 30, 36, 16, 15, 3, 26, 2, + 56, 76, 0, 14, 32, 40, 10, 36, 68, 8, + 1, 42, 17, 63, 109, 125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 43 */ + + 50, 8, 37, 50, 8, 37, 24, 42, 54, 24, + 11, 61, 0, 14, 104, 28, 90, 13, 40, 56, + 1, 35, 2, 63, 97, 7, 27, 125, 125, 125, + 120, 26, 11, 40, 56, 1, 11, 28, 38, 11, + 6, 1, 5, 15, 43, 17, 59, 10, 7, 7, + 7, 31, 17, 49, 30, 6, 0, 5, 20, 6, + 44, 0, 0, 0, 10, 65, 67, 20, 23, 13, + 36, 7, 47, 20, 82, 60, 116, 114, 46, 46, + 26, 58, 20, 10, 28, 19, 23, 27, 19, 52, + 2, 18, 9, 3, 7, 3, 1, 7, 25, 23, + 43, 36, 9, 10, 5, 31, 4, 17, 10, 8, + 0, 8, 22, 10, 13, 3, 8, 7, 2, 13, + 1, 21, 0, 1, 19, 14, 0, 13, 5, 18, + 7, 33, 19, 35, 27, 10, 5, 16, 14, 14, + 70, 34, 8, 7, 22, 7, 25, 4, 77, 17, + 0, 47, 9, 37, 12, 2, 27, 34, 14, 8, + 57, 12, 33, 23, 6, 59, 12, 36, 22, 28, + 34, 28, 14, 32, 28, 7, 2, 18, 2, 0, + 19, 7, 19, 15, 9, 15, 21, 19, 21, 31, + 19, 11, 45, 53, 35, 21, 19, 29, 31, 45, + 53, 45, 45, 63, 77, 65, 71, 111, 91, 103, + 11, 17, 55, 19, 33, 39, 49, 63, 71, 55, + 61, 57, 39, 49, 55, 39, 39, 8, 50, 36, + 22, 16, 20, 8, 1, 6, 14, 22, 62, 42, + 30, 16, 42, 22, 26, 12, 50, 6, 86, 60, + 46, 30, 50, 10, 1, 3, 0, 13, 90, 38, + 6, 4, 22, 0, 17, 7, 10, 74, 46, 16, + 4, 36, 10, 4, 2, 0, 124, 15, 0, 6, + 12, 4, 6, 26, 28, 18, 26, 34, 42, 11, + 2, 27, 9, 58, 17, 55, 11, 10, 7, 29, + 37, 21, 37, 53, 67, 71, 7, 1, 30, 7, + 19, 3, 0, 6, 23, 13, 5, 5, 39, 19, + 33, 0, 20, 45, 13, 16, 23, 4, 15, 1, + 40, 21, 27, 2, 14, 41, 39, 97, 30, 40, + 56, 8, 3, 8, 15, 15, 13, 35, 31, 31, + 63, 53, 43, 65, 67, 79, 125, 65, 61, 51, + 21, 33, 12, 38, 4, 36, 66, 17, 23, 31, + 29, 71, 53, 45, 87, 55, 73, 67, 99, 89, + 89, 89, 35, 31, 93, 33, 35, 57, 81, 69, + 71, 75, 65, 81, 79, 89, 97, 113, 103, 17, + 33, 65, 9, 22, 16, 36, 38, 44, 84, 54, + 72, 74, 110, 90, 80, 116, 108, 82, 6, 31, + 67, 89, 119, 125, 125, 125, 28, 94, 80, 80, + 58, 80, 40, 30, 38, 18, 15, 1, 26, 2, + 58, 78, 0, 14, 
34, 42, 10, 38, 70, 8, + 1, 40, 21, 67, 115, 125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 44 */ + + 46, 8, 37, 46, 8, 37, 26, 46, 54, 24, + 13, 65, 3, 12, 104, 28, 94, 13, 42, 58, + 1, 37, 2, 65, 99, 7, 31, 125, 125, 125, + 124, 28, 11, 42, 58, 1, 11, 30, 38, 11, + 6, 1, 3, 15, 45, 17, 59, 10, 7, 5, + 7, 33, 17, 49, 32, 6, 0, 5, 22, 6, + 44, 0, 0, 0, 10, 67, 67, 22, 25, 13, + 36, 7, 47, 24, 84, 62, 120, 116, 48, 48, + 28, 60, 22, 12, 32, 17, 23, 25, 17, 52, + 2, 18, 9, 3, 7, 3, 2, 7, 25, 23, + 43, 36, 9, 10, 5, 31, 4, 17, 12, 8, + 0, 8, 22, 10, 13, 3, 8, 7, 2, 13, + 3, 23, 0, 3, 21, 12, 1, 15, 3, 20, + 7, 35, 19, 33, 27, 10, 5, 16, 14, 14, + 72, 34, 8, 9, 22, 9, 27, 2, 79, 17, + 0, 47, 11, 39, 12, 0, 29, 34, 14, 8, + 59, 12, 35, 23, 6, 59, 10, 36, 22, 28, + 34, 28, 14, 32, 26, 7, 2, 18, 2, 0, + 21, 7, 19, 17, 9, 15, 23, 21, 23, 31, + 21, 11, 47, 55, 35, 23, 23, 33, 35, 49, + 57, 49, 49, 67, 81, 69, 75, 115, 95, 105, + 11, 17, 55, 21, 35, 41, 51, 67, 75, 57, + 65, 59, 39, 49, 55, 39, 39, 8, 50, 36, + 22, 16, 20, 8, 1, 6, 16, 22, 62, 42, + 30, 16, 42, 24, 26, 14, 54, 6, 86, 60, + 46, 30, 50, 10, 1, 3, 2, 13, 90, 38, + 4, 4, 22, 0, 17, 5, 10, 72, 44, 14, + 2, 36, 10, 4, 2, 0, 124, 13, 2, 8, + 14, 6, 8, 28, 30, 20, 28, 38, 46, 9, + 2, 27, 9, 60, 17, 55, 11, 12, 7, 31, + 37, 21, 39, 55, 69, 73, 9, 3, 30, 9, + 21, 3, 0, 6, 25, 15, 5, 5, 41, 21, + 33, 0, 20, 45, 15, 16, 25, 4, 17, 3, + 40, 23, 29, 0, 12, 43, 41, 101, 28, 38, + 56, 6, 5, 4, 19, 19, 17, 41, 37, 35, + 69, 57, 45, 71, 73, 85, 125, 71, 67, 55, + 23, 37, 12, 40, 4, 38, 70, 21, 27, 35, + 33, 75, 57, 49, 91, 57, 77, 69, 103, 93, + 91, 91, 37, 31, 95, 35, 39, 61, 85, 73, + 75, 79, 69, 85, 83, 91, 99, 115, 105, 19, + 35, 67, 9, 24, 16, 36, 38, 46, 86, 54, + 74, 76, 112, 92, 82, 118, 110, 82, 2, 35, + 73, 93, 125, 125, 125, 125, 28, 96, 82, 80, + 58, 80, 40, 30, 38, 18, 15, 1, 28, 4, + 60, 80, 0, 16, 34, 42, 10, 38, 72, 8, + 1, 38, 25, 73, 121, 125, 125, 
125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 45 */ + + 44, 8, 37, 44, 8, 37, 30, 48, 54, 24, + 13, 67, 5, 8, 104, 28, 100, 15, 44, 58, + 1, 37, 4, 67, 101, 9, 35, 125, 125, 125, + 124, 32, 11, 44, 58, 1, 11, 32, 38, 13, + 6, 1, 3, 15, 45, 15, 59, 10, 7, 3, + 9, 33, 17, 49, 32, 8, 2, 3, 22, 6, + 44, 0, 0, 0, 12, 67, 67, 22, 27, 13, + 36, 5, 47, 26, 88, 66, 122, 120, 52, 50, + 30, 64, 24, 14, 36, 17, 21, 25, 15, 52, + 2, 18, 7, 3, 5, 1, 4, 9, 25, 23, + 43, 36, 9, 12, 3, 31, 4, 17, 12, 8, + 1, 8, 22, 10, 11, 3, 10, 7, 2, 13, + 3, 23, 0, 3, 23, 12, 3, 17, 1, 20, + 7, 37, 21, 33, 29, 10, 7, 16, 14, 14, + 74, 36, 8, 9, 22, 9, 29, 2, 81, 17, + 0, 49, 11, 41, 10, 0, 29, 36, 14, 8, + 61, 10, 37, 23, 6, 61, 10, 36, 22, 28, + 34, 28, 14, 32, 26, 7, 2, 18, 2, 0, + 21, 7, 19, 17, 9, 15, 23, 23, 23, 31, + 21, 11, 49, 57, 35, 27, 27, 35, 37, 53, + 61, 53, 53, 71, 85, 73, 79, 121, 97, 107, + 11, 19, 57, 23, 37, 43, 53, 69, 77, 59, + 67, 59, 41, 51, 55, 39, 37, 8, 50, 36, + 22, 16, 20, 10, 0, 8, 18, 24, 62, 42, + 30, 16, 42, 24, 28, 14, 58, 6, 86, 60, + 46, 30, 50, 10, 1, 3, 4, 13, 92, 36, + 4, 4, 22, 0, 17, 5, 10, 72, 42, 12, + 0, 36, 10, 4, 2, 2, 124, 11, 4, 10, + 18, 8, 10, 30, 34, 22, 32, 40, 50, 7, + 4, 25, 7, 62, 15, 57, 9, 14, 7, 31, + 39, 23, 39, 57, 71, 75, 11, 3, 30, 9, + 21, 5, 0, 6, 27, 17, 5, 7, 43, 21, + 33, 0, 20, 45, 17, 14, 25, 4, 19, 5, + 40, 25, 31, 0, 12, 45, 43, 105, 26, 38, + 54, 4, 9, 2, 23, 23, 21, 45, 43, 41, + 75, 61, 49, 77, 81, 91, 125, 75, 71, 59, + 25, 39, 12, 42, 6, 40, 74, 23, 29, 39, + 35, 79, 61, 51, 95, 59, 81, 73, 107, 95, + 93, 93, 37, 31, 97, 39, 41, 65, 87, 75, + 77, 81, 71, 87, 85, 93, 101, 117, 105, 19, + 35, 69, 7, 24, 18, 38, 40, 48, 88, 56, + 76, 78, 116, 94, 84, 120, 114, 80, 1, 39, + 77, 99, 125, 125, 125, 125, 28, 96, 82, 82, + 60, 82, 40, 32, 40, 20, 13, 1, 30, 4, + 62, 82, 2, 18, 36, 44, 10, 40, 74, 8, + 1, 34, 29, 77, 125, 125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, 
SP, B Slices :: cabac_init_idc = 0, qp = 46 */ + + 42, 8, 37, 42, 8, 37, 34, 52, 56, 24, + 15, 71, 9, 6, 104, 28, 104, 15, 46, 60, + 1, 39, 4, 69, 103, 9, 39, 125, 125, 125, + 124, 34, 11, 46, 60, 1, 9, 34, 40, 13, + 6, 0, 1, 17, 47, 15, 59, 10, 7, 3, + 9, 33, 17, 49, 34, 8, 2, 3, 24, 6, + 44, 0, 0, 0, 12, 67, 67, 24, 29, 13, + 36, 5, 47, 30, 92, 68, 124, 124, 54, 52, + 34, 66, 26, 16, 40, 15, 21, 23, 13, 52, + 2, 18, 7, 3, 5, 1, 8, 9, 27, 25, + 45, 36, 9, 12, 3, 31, 6, 17, 14, 8, + 1, 8, 22, 8, 11, 3, 10, 7, 2, 13, + 5, 23, 0, 5, 25, 10, 5, 19, 1, 22, + 7, 39, 21, 31, 29, 10, 7, 16, 14, 14, + 76, 36, 8, 9, 24, 9, 31, 2, 83, 17, + 0, 49, 13, 43, 10, 0, 31, 36, 14, 8, + 65, 10, 39, 23, 6, 61, 8, 36, 22, 28, + 34, 28, 14, 32, 26, 7, 0, 18, 2, 0, + 21, 9, 21, 19, 11, 17, 25, 25, 25, 31, + 23, 11, 51, 61, 35, 29, 31, 39, 41, 57, + 65, 57, 57, 75, 89, 75, 81, 125, 101, 109, + 11, 19, 57, 25, 39, 45, 57, 73, 81, 61, + 69, 61, 41, 51, 55, 39, 37, 10, 50, 36, + 22, 16, 20, 10, 0, 8, 20, 24, 62, 42, + 30, 16, 44, 26, 30, 16, 62, 6, 86, 60, + 46, 30, 52, 10, 1, 3, 4, 13, 92, 36, + 2, 4, 22, 0, 17, 3, 10, 70, 42, 10, + 1, 36, 10, 4, 4, 2, 124, 9, 6, 10, + 20, 10, 12, 34, 36, 24, 34, 44, 52, 7, + 6, 25, 7, 66, 15, 57, 9, 14, 7, 33, + 39, 23, 41, 59, 73, 77, 11, 5, 30, 11, + 23, 5, 1, 6, 29, 17, 7, 7, 45, 21, + 35, 0, 20, 47, 17, 14, 27, 4, 21, 5, + 40, 27, 33, 1, 10, 47, 43, 107, 24, 36, + 54, 2, 11, 1, 27, 27, 25, 51, 47, 45, + 81, 67, 51, 83, 87, 97, 125, 81, 77, 63, + 27, 41, 12, 44, 6, 42, 78, 27, 33, 43, + 39, 83, 65, 55, 99, 63, 83, 75, 109, 99, + 95, 95, 39, 33, 99, 41, 45, 67, 91, 79, + 81, 85, 75, 91, 89, 95, 105, 121, 107, 21, + 37, 71, 7, 26, 18, 38, 40, 48, 90, 58, + 78, 80, 118, 96, 86, 124, 116, 80, 3, 43, + 83, 103, 125, 125, 125, 125, 30, 98, 84, 82, + 60, 84, 42, 32, 40, 20, 13, 0, 30, 6, + 64, 84, 2, 18, 36, 44, 10, 40, 76, 8, + 1, 32, 33, 83, 125, 125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 47 */ + 
+ 40, 8, 37, 40, 8, 37, 38, 54, 56, 24, + 15, 73, 11, 4, 104, 28, 108, 15, 48, 62, + 1, 41, 4, 71, 105, 11, 43, 125, 125, 125, + 124, 36, 11, 48, 62, 1, 9, 36, 40, 15, + 6, 0, 0, 17, 47, 15, 59, 10, 7, 1, + 9, 33, 17, 49, 34, 8, 2, 3, 24, 6, + 44, 0, 0, 0, 14, 67, 67, 24, 31, 13, + 36, 5, 47, 34, 96, 72, 124, 124, 56, 54, + 36, 68, 28, 18, 44, 15, 21, 23, 11, 52, + 2, 18, 5, 3, 3, 0, 10, 9, 27, 25, + 45, 36, 9, 12, 1, 31, 6, 17, 14, 8, + 1, 8, 22, 8, 11, 3, 10, 7, 2, 13, + 7, 23, 0, 7, 27, 8, 7, 21, 0, 22, + 7, 41, 23, 31, 31, 10, 7, 16, 14, 14, + 78, 36, 8, 9, 24, 9, 33, 2, 85, 17, + 0, 51, 13, 45, 10, 0, 33, 36, 14, 8, + 67, 10, 41, 23, 6, 63, 6, 36, 22, 28, + 34, 28, 14, 32, 26, 7, 0, 18, 2, 0, + 21, 9, 21, 19, 11, 17, 27, 27, 25, 31, + 23, 11, 53, 63, 35, 33, 35, 43, 45, 61, + 69, 61, 61, 79, 93, 79, 85, 125, 105, 111, + 11, 19, 59, 27, 41, 47, 59, 75, 83, 63, + 71, 63, 43, 53, 55, 39, 35, 10, 50, 36, + 22, 16, 20, 10, 0, 10, 22, 26, 62, 42, + 30, 16, 44, 26, 32, 18, 66, 6, 86, 60, + 46, 30, 52, 10, 1, 3, 6, 13, 92, 36, + 2, 4, 22, 0, 17, 1, 10, 70, 40, 8, + 3, 36, 10, 4, 4, 4, 124, 7, 8, 12, + 22, 12, 14, 36, 38, 26, 36, 46, 56, 5, + 8, 25, 5, 68, 15, 59, 9, 16, 7, 33, + 41, 23, 43, 61, 75, 79, 13, 5, 30, 11, + 23, 7, 1, 6, 31, 19, 7, 7, 47, 21, + 35, 0, 20, 47, 19, 12, 29, 4, 23, 7, + 40, 29, 35, 1, 10, 49, 45, 111, 22, 34, + 52, 0, 13, 3, 31, 31, 29, 55, 53, 51, + 87, 71, 53, 89, 93, 103, 125, 87, 81, 67, + 29, 43, 12, 46, 6, 44, 82, 31, 37, 47, + 43, 87, 69, 57, 103, 65, 87, 77, 113, 101, + 97, 97, 39, 33, 101, 43, 47, 71, 95, 83, + 85, 87, 77, 95, 91, 97, 107, 123, 109, 23, + 39, 73, 7, 26, 18, 40, 42, 50, 92, 60, + 80, 82, 120, 98, 88, 124, 118, 78, 7, 47, + 87, 109, 125, 125, 125, 125, 30, 98, 84, 84, + 60, 86, 42, 32, 42, 22, 13, 0, 32, 6, + 66, 86, 2, 20, 38, 46, 10, 42, 78, 8, + 1, 30, 37, 87, 125, 125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 48 */ + + 36, 6, 39, 36, 6, 39, 40, 56, 56, 24, + 
17, 77, 15, 0, 102, 28, 112, 17, 48, 62, + 3, 43, 4, 75, 109, 13, 47, 125, 125, 125, + 124, 38, 13, 48, 62, 3, 9, 38, 40, 17, + 6, 0, 0, 19, 49, 15, 61, 10, 7, 1, + 11, 35, 17, 49, 34, 8, 2, 3, 24, 4, + 44, 0, 0, 0, 14, 69, 67, 24, 33, 15, + 36, 5, 47, 36, 98, 74, 124, 124, 58, 56, + 38, 70, 30, 18, 46, 15, 21, 23, 11, 52, + 2, 18, 5, 3, 3, 0, 12, 11, 29, 27, + 47, 36, 9, 12, 1, 31, 6, 17, 14, 6, + 3, 8, 20, 6, 11, 5, 10, 7, 0, 13, + 9, 25, 1, 9, 31, 6, 9, 23, 0, 22, + 7, 43, 25, 31, 33, 8, 9, 16, 14, 14, + 78, 36, 8, 11, 24, 11, 35, 0, 87, 19, + 1, 53, 15, 47, 8, 1, 35, 36, 14, 8, + 71, 8, 43, 23, 6, 65, 4, 34, 20, 28, + 34, 26, 12, 32, 24, 9, 1, 18, 0, 1, + 23, 11, 23, 21, 13, 19, 29, 29, 27, 31, + 25, 11, 55, 67, 35, 37, 39, 47, 49, 65, + 75, 65, 65, 83, 97, 83, 89, 125, 109, 115, + 13, 21, 61, 29, 45, 51, 63, 79, 87, 67, + 75, 65, 45, 55, 55, 39, 35, 10, 50, 36, + 22, 16, 20, 10, 0, 10, 24, 26, 60, 42, + 30, 16, 44, 26, 32, 18, 68, 4, 86, 60, + 44, 28, 52, 10, 1, 3, 6, 13, 92, 34, + 0, 2, 22, 0, 17, 1, 10, 68, 38, 6, + 5, 34, 10, 4, 4, 4, 124, 7, 10, 12, + 24, 14, 14, 38, 40, 28, 38, 48, 58, 5, + 8, 25, 5, 70, 15, 61, 9, 16, 7, 35, + 43, 25, 45, 63, 79, 83, 15, 7, 30, 13, + 25, 9, 3, 4, 33, 21, 9, 9, 49, 23, + 37, 0, 20, 49, 21, 10, 31, 2, 25, 9, + 40, 31, 39, 3, 8, 51, 47, 115, 20, 32, + 50, 3, 17, 7, 35, 35, 35, 61, 59, 57, + 93, 77, 57, 95, 101, 109, 125, 93, 87, 71, + 33, 47, 12, 46, 6, 46, 86, 35, 41, 51, + 47, 93, 73, 61, 109, 69, 91, 81, 117, 105, + 99, 99, 41, 35, 105, 47, 51, 75, 99, 87, + 89, 91, 81, 99, 95, 101, 111, 125, 111, 25, + 41, 75, 7, 26, 18, 40, 42, 50, 94, 60, + 82, 82, 122, 100, 90, 124, 120, 76, 11, 53, + 93, 115, 125, 125, 125, 125, 30, 98, 84, 84, + 60, 86, 42, 32, 42, 22, 13, 0, 32, 6, + 68, 88, 2, 20, 38, 46, 10, 42, 78, 8, + 3, 26, 43, 93, 125, 125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 49 */ + + 34, 6, 39, 34, 6, 39, 44, 60, 58, 26, + 17, 79, 17, 1, 102, 28, 118, 17, 50, 
64, + 3, 43, 6, 77, 111, 13, 49, 125, 125, 125, + 124, 42, 13, 50, 64, 3, 7, 42, 42, 17, + 8, 2, 2, 19, 49, 13, 61, 10, 5, 0, + 11, 35, 15, 49, 36, 10, 4, 1, 26, 4, + 44, 0, 0, 0, 16, 69, 67, 26, 33, 15, + 36, 3, 45, 40, 102, 78, 124, 124, 62, 58, + 42, 74, 34, 20, 50, 13, 19, 21, 9, 54, + 4, 20, 3, 1, 1, 2, 16, 11, 29, 27, + 47, 38, 9, 14, 0, 29, 8, 15, 16, 6, + 3, 10, 20, 6, 9, 5, 12, 7, 0, 11, + 9, 25, 1, 9, 33, 6, 9, 23, 2, 24, + 5, 43, 25, 29, 33, 8, 9, 18, 14, 14, + 80, 38, 10, 11, 26, 11, 35, 0, 87, 19, + 1, 53, 15, 49, 8, 1, 35, 38, 16, 8, + 73, 8, 43, 21, 6, 65, 4, 34, 20, 28, + 34, 26, 12, 32, 24, 9, 1, 18, 0, 1, + 23, 11, 23, 21, 13, 19, 29, 29, 27, 29, + 25, 9, 55, 69, 33, 39, 41, 49, 51, 69, + 79, 67, 67, 87, 99, 85, 91, 125, 111, 117, + 13, 21, 61, 31, 47, 53, 65, 81, 89, 69, + 77, 65, 45, 55, 53, 37, 33, 12, 52, 36, + 22, 18, 22, 12, 2, 12, 28, 28, 60, 42, + 30, 16, 46, 28, 34, 20, 72, 4, 88, 62, + 44, 28, 54, 10, 1, 1, 8, 13, 94, 34, + 0, 2, 24, 2, 15, 0, 10, 68, 38, 6, + 5, 34, 10, 6, 6, 6, 124, 5, 12, 14, + 28, 18, 16, 42, 44, 32, 42, 52, 62, 3, + 10, 23, 3, 74, 13, 61, 7, 18, 5, 35, + 43, 25, 45, 63, 81, 85, 15, 7, 32, 13, + 25, 9, 3, 4, 33, 21, 9, 9, 49, 23, + 37, 2, 22, 49, 21, 10, 31, 2, 25, 9, + 42, 31, 41, 3, 8, 51, 47, 117, 20, 32, + 50, 5, 19, 9, 39, 39, 39, 65, 63, 61, + 97, 81, 59, 99, 107, 115, 125, 97, 91, 73, + 35, 49, 12, 48, 8, 50, 92, 37, 43, 53, + 49, 97, 75, 63, 113, 71, 93, 83, 119, 107, + 101, 99, 41, 35, 107, 49, 53, 77, 101, 89, + 91, 93, 83, 101, 97, 103, 113, 125, 111, 25, + 41, 75, 5, 28, 20, 42, 44, 52, 98, 62, + 84, 84, 124, 104, 92, 124, 124, 76, 13, 57, + 97, 119, 125, 125, 125, 125, 32, 100, 86, 86, + 62, 88, 44, 34, 44, 24, 11, 2, 34, 8, + 72, 92, 4, 22, 40, 48, 12, 44, 80, 8, + 3, 24, 47, 97, 125, 125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 50 */ + + 32, 6, 39, 32, 6, 39, 48, 62, 58, 26, + 17, 81, 19, 3, 102, 28, 122, 17, 52, 66, + 3, 45, 6, 79, 113, 15, 
53, 125, 125, 125, + 124, 44, 13, 52, 66, 3, 7, 44, 42, 19, + 8, 2, 4, 19, 49, 13, 61, 10, 5, 2, + 11, 35, 15, 49, 36, 10, 4, 1, 26, 4, + 44, 0, 0, 0, 18, 69, 67, 26, 35, 15, + 36, 3, 45, 44, 106, 82, 124, 124, 64, 60, + 44, 76, 36, 22, 54, 13, 19, 21, 7, 54, + 4, 20, 1, 1, 0, 2, 18, 11, 29, 27, + 47, 38, 9, 14, 0, 29, 8, 15, 16, 6, + 3, 10, 20, 6, 9, 5, 12, 7, 0, 11, + 11, 25, 1, 11, 35, 4, 11, 25, 4, 24, + 5, 45, 27, 29, 35, 8, 9, 18, 14, 14, + 82, 38, 10, 11, 26, 11, 37, 0, 89, 19, + 1, 55, 15, 51, 8, 1, 37, 38, 16, 8, + 75, 8, 45, 21, 6, 67, 2, 34, 20, 28, + 34, 26, 12, 32, 24, 9, 1, 18, 0, 1, + 23, 11, 23, 21, 13, 19, 31, 31, 29, 29, + 25, 9, 57, 71, 33, 43, 45, 53, 55, 73, + 83, 71, 71, 91, 103, 89, 95, 125, 115, 119, + 13, 21, 63, 33, 49, 55, 67, 85, 91, 71, + 79, 67, 47, 57, 53, 37, 31, 12, 52, 36, + 22, 18, 22, 12, 2, 12, 30, 30, 60, 42, + 30, 16, 46, 28, 36, 22, 76, 4, 88, 62, + 44, 28, 54, 10, 1, 1, 10, 13, 94, 34, + 0, 2, 24, 2, 15, 2, 10, 66, 36, 4, + 7, 34, 10, 6, 6, 6, 124, 3, 14, 16, + 30, 20, 18, 44, 46, 34, 44, 54, 66, 1, + 12, 23, 1, 76, 13, 63, 7, 20, 5, 35, + 45, 25, 47, 65, 83, 87, 17, 7, 32, 13, + 27, 11, 3, 4, 35, 23, 9, 9, 51, 23, + 37, 2, 22, 49, 23, 8, 33, 2, 27, 11, + 42, 33, 43, 3, 8, 53, 49, 121, 18, 30, + 48, 7, 21, 11, 43, 43, 43, 69, 69, 67, + 103, 85, 61, 105, 113, 121, 125, 103, 95, 77, + 37, 51, 12, 50, 8, 52, 96, 41, 47, 57, + 53, 101, 79, 67, 117, 73, 97, 85, 123, 109, + 103, 101, 43, 35, 109, 51, 55, 81, 105, 93, + 95, 95, 87, 105, 99, 105, 115, 125, 113, 27, + 43, 77, 5, 28, 20, 44, 44, 54, 100, 64, + 86, 86, 124, 106, 94, 124, 124, 74, 17, 61, + 101, 125, 125, 125, 125, 125, 32, 100, 86, 86, + 62, 90, 44, 34, 44, 24, 11, 2, 36, 8, + 74, 94, 4, 24, 42, 50, 12, 44, 82, 8, + 3, 22, 51, 101, 125, 125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 51 */ + + 30, 6, 39, 30, 6, 39, 52, 66, 60, 26, + 19, 85, 23, 5, 102, 28, 124, 17, 54, 68, + 3, 47, 6, 81, 115, 15, 57, 125, 125, 
125, + 124, 46, 13, 54, 68, 3, 5, 46, 44, 19, + 8, 4, 6, 21, 51, 13, 61, 10, 5, 2, + 11, 35, 15, 49, 38, 10, 4, 1, 28, 4, + 44, 0, 0, 0, 18, 69, 67, 28, 37, 15, + 36, 3, 45, 48, 110, 84, 124, 124, 66, 62, + 48, 78, 38, 24, 58, 11, 19, 19, 5, 54, + 4, 20, 1, 1, 0, 4, 22, 11, 31, 29, + 49, 38, 9, 14, 2, 29, 10, 15, 18, 6, + 3, 10, 20, 4, 9, 5, 12, 7, 0, 11, + 13, 25, 1, 13, 37, 2, 13, 27, 4, 26, + 5, 47, 27, 27, 35, 8, 9, 18, 14, 14, + 84, 38, 10, 11, 28, 11, 39, 0, 91, 19, + 1, 55, 17, 53, 8, 1, 39, 38, 16, 8, + 79, 8, 47, 21, 6, 67, 0, 34, 20, 28, + 34, 26, 12, 32, 24, 9, 3, 18, 0, 1, + 23, 13, 25, 23, 15, 21, 33, 33, 29, 29, + 27, 9, 59, 75, 33, 45, 49, 57, 59, 77, + 87, 75, 75, 95, 107, 91, 97, 125, 119, 121, + 13, 21, 63, 35, 51, 57, 71, 87, 95, 73, + 81, 69, 47, 57, 53, 37, 31, 14, 52, 36, + 22, 18, 22, 12, 2, 14, 32, 30, 60, 42, + 30, 16, 48, 30, 38, 24, 80, 4, 88, 62, + 44, 28, 56, 10, 1, 1, 10, 13, 94, 34, + 1, 2, 24, 2, 15, 4, 10, 66, 36, 2, + 9, 34, 10, 6, 8, 8, 124, 1, 16, 16, + 32, 22, 20, 48, 48, 36, 46, 58, 68, 1, + 14, 23, 1, 80, 13, 63, 7, 20, 5, 37, + 45, 25, 49, 67, 85, 89, 17, 9, 32, 15, + 27, 11, 5, 4, 37, 23, 11, 9, 53, 23, + 39, 2, 22, 51, 23, 8, 35, 2, 29, 11, + 42, 35, 45, 5, 6, 55, 49, 123, 16, 28, + 48, 9, 23, 15, 47, 47, 47, 75, 73, 71, + 109, 91, 63, 111, 119, 125, 125, 109, 101, 81, + 39, 53, 12, 52, 8, 54, 100, 45, 51, 61, + 57, 105, 83, 69, 121, 77, 99, 87, 125, 113, + 105, 103, 43, 37, 111, 53, 59, 83, 109, 97, + 99, 99, 89, 109, 103, 107, 119, 125, 115, 29, + 45, 79, 5, 30, 20, 44, 46, 54, 102, 66, + 88, 88, 124, 108, 96, 124, 124, 74, 19, 65, + 107, 125, 125, 125, 125, 125, 34, 102, 88, 88, + 62, 92, 46, 34, 46, 26, 11, 4, 36, 10, + 76, 96, 4, 24, 42, 50, 12, 46, 84, 8, + 3, 20, 55, 107, 125, 125, 125, 125, 125, 125, + }, + + }, + + { + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 0 */ + + 124, 18, 21, 124, 18, 21, 125, 81, 20, 18, + 24, 76, 124, 124, 108, 44, 109, 3, 15, 31, + 22, 26, 13, 18, 58, 82, 124, 
122, 54, 11, + 125, 75, 25, 15, 31, 22, 11, 53, 22, 40, + 11, 37, 65, 8, 23, 47, 73, 14, 21, 43, + 8, 35, 45, 63, 5, 27, 13, 45, 17, 4, + 44, 0, 0, 0, 39, 45, 67, 17, 44, 2, + 96, 24, 33, 125, 55, 65, 35, 69, 77, 67, + 111, 71, 93, 77, 125, 33, 51, 61, 57, 48, + 3, 41, 125, 19, 81, 55, 125, 16, 14, 16, + 4, 20, 9, 21, 49, 79, 55, 51, 57, 25, + 47, 93, 83, 29, 97, 71, 125, 125, 125, 125, + 5, 29, 15, 17, 8, 16, 13, 23, 51, 111, + 23, 86, 82, 125, 18, 4, 10, 6, 4, 7, + 41, 21, 3, 22, 12, 4, 11, 13, 16, 15, + 10, 4, 44, 76, 62, 40, 32, 38, 24, 34, + 50, 5, 50, 42, 58, 51, 36, 70, 64, 124, + 124, 96, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 106, 124, 124, 124, 124, 124, 124, 124, + 112, 124, 124, 124, 54, 124, 124, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 106, 90, 76, 44, + 23, 17, 27, 56, 64, 56, 66, 36, 42, 36, + 74, 18, 5, 14, 19, 7, 105, 97, 15, 4, + 20, 5, 27, 33, 41, 47, 125, 75, 48, 20, + 4, 23, 27, 55, 87, 95, 117, 25, 38, 22, + 12, 10, 17, 11, 11, 21, 45, 5, 58, 62, + 64, 22, 16, 7, 19, 51, 22, 118, 110, 110, + 88, 52, 4, 19, 13, 29, 124, 125, 121, 93, + 125, 121, 83, 115, 107, 77, 107, 105, 117, 63, + 73, 63, 95, 101, 51, 33, 37, 43, 35, 17, + 1, 7, 14, 11, 11, 11, 11, 7, 27, 1, + 4, 7, 1, 12, 3, 5, 2, 24, 5, 15, + 23, 13, 17, 6, 52, 32, 56, 52, 44, 44, + 30, 44, 44, 8, 26, 46, 5, 26, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 108, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 98, 74, 52, 16, 3, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 86, + 66, 38, 30, 28, 36, 82, 82, 84, 86, 70, + 78, 58, 42, 48, 26, 13, 18, 15, 39, 62, + 28, 18, 43, 35, 27, 35, 33, 19, 21, 39, + 15, 7, 4, 5, 5, 8, 8, 124, 124, 124, + 124, 124, 120, 106, 72, 12, 15, 78, 54, 42, + 22, 12, 0, 3, 7, 37, 35, 25, 17, 29, + 17, 9, 13, 25, 5, 2, 12, 4, 6, 18, + 10, 124, 124, 124, 124, 124, 120, 106, 72, 12, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 1 */ + + 124, 18, 21, 124, 18, 21, 123, 77, 
22, 20, + 24, 74, 122, 124, 110, 44, 105, 3, 13, 29, + 22, 26, 11, 18, 56, 80, 122, 116, 50, 13, + 121, 73, 23, 13, 29, 22, 11, 51, 22, 40, + 9, 35, 63, 8, 23, 45, 71, 14, 19, 41, + 8, 33, 43, 61, 3, 25, 13, 43, 15, 4, + 44, 0, 0, 0, 37, 45, 67, 15, 44, 2, + 96, 24, 33, 121, 51, 61, 31, 63, 73, 63, + 107, 67, 89, 73, 121, 33, 49, 59, 55, 48, + 3, 39, 121, 17, 79, 53, 123, 16, 14, 16, + 4, 22, 9, 19, 47, 77, 53, 49, 55, 23, + 45, 89, 79, 27, 93, 67, 117, 117, 119, 121, + 3, 27, 13, 15, 8, 18, 11, 21, 49, 105, + 21, 82, 80, 121, 18, 6, 10, 8, 6, 5, + 37, 19, 1, 22, 12, 4, 9, 11, 14, 13, + 10, 4, 44, 74, 62, 40, 32, 38, 24, 34, + 48, 3, 50, 42, 58, 51, 36, 70, 64, 124, + 124, 94, 124, 124, 124, 122, 124, 124, 124, 124, + 124, 124, 104, 124, 124, 124, 124, 124, 124, 124, + 108, 124, 120, 124, 52, 124, 124, 124, 124, 124, + 124, 124, 124, 124, 124, 122, 104, 88, 74, 42, + 23, 17, 27, 56, 62, 54, 64, 34, 40, 34, + 72, 16, 5, 12, 19, 7, 103, 93, 13, 6, + 20, 3, 25, 31, 39, 45, 121, 71, 50, 22, + 6, 21, 25, 51, 83, 91, 113, 23, 40, 24, + 14, 12, 15, 9, 9, 19, 43, 5, 60, 62, + 64, 22, 18, 5, 19, 49, 22, 118, 110, 108, + 86, 52, 6, 17, 11, 27, 124, 121, 117, 89, + 121, 117, 79, 111, 103, 73, 103, 101, 111, 61, + 71, 61, 91, 97, 49, 31, 35, 41, 33, 15, + 1, 7, 14, 11, 11, 11, 9, 5, 25, 0, + 4, 5, 0, 12, 1, 3, 2, 24, 3, 13, + 21, 11, 15, 6, 50, 32, 54, 52, 44, 44, + 30, 44, 44, 8, 26, 44, 5, 24, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 104, 124, 124, 124, 124, 124, 124, 124, + 122, 124, 96, 72, 50, 16, 3, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 84, + 64, 36, 30, 28, 34, 80, 80, 82, 82, 68, + 76, 56, 40, 46, 24, 13, 16, 15, 39, 60, + 26, 16, 41, 33, 25, 33, 29, 15, 19, 37, + 13, 5, 6, 3, 3, 8, 8, 124, 124, 124, + 124, 120, 112, 98, 64, 8, 13, 78, 56, 44, + 24, 14, 2, 1, 5, 35, 33, 23, 15, 27, + 15, 7, 11, 23, 3, 4, 12, 6, 8, 18, + 10, 124, 124, 124, 124, 120, 112, 98, 64, 8, + }, + + { + /* Context Tables for P, SP, B 
Slices :: cabac_init_idc = 1, qp = 2 */ + + 124, 18, 21, 124, 18, 21, 119, 75, 22, 20, + 24, 72, 118, 122, 110, 44, 101, 3, 13, 27, + 22, 24, 11, 16, 52, 78, 116, 108, 44, 17, + 115, 71, 23, 13, 27, 22, 11, 49, 22, 38, + 9, 35, 61, 8, 23, 45, 71, 14, 19, 41, + 8, 33, 43, 61, 3, 25, 13, 43, 15, 4, + 44, 0, 0, 0, 35, 45, 67, 15, 42, 2, + 94, 24, 33, 117, 49, 59, 27, 59, 71, 61, + 103, 65, 87, 71, 117, 33, 49, 59, 55, 48, + 3, 37, 117, 17, 77, 51, 119, 16, 14, 16, + 2, 22, 9, 19, 45, 75, 51, 47, 53, 23, + 43, 87, 77, 25, 91, 65, 107, 109, 113, 115, + 3, 27, 13, 15, 8, 18, 11, 21, 49, 101, + 21, 78, 76, 115, 18, 6, 10, 8, 6, 5, + 33, 17, 1, 22, 12, 4, 7, 9, 12, 13, + 10, 4, 42, 72, 60, 40, 30, 38, 24, 34, + 46, 3, 48, 40, 56, 51, 36, 68, 62, 124, + 124, 92, 120, 124, 124, 118, 124, 124, 124, 124, + 124, 124, 100, 124, 124, 124, 124, 124, 124, 124, + 104, 124, 116, 124, 48, 124, 124, 124, 124, 124, + 124, 124, 124, 124, 124, 118, 100, 84, 70, 38, + 23, 17, 29, 54, 60, 52, 62, 32, 38, 32, + 68, 14, 5, 10, 21, 9, 101, 91, 11, 6, + 20, 3, 23, 29, 37, 43, 117, 69, 50, 22, + 6, 19, 23, 49, 79, 87, 109, 21, 42, 26, + 16, 14, 13, 9, 9, 19, 41, 5, 62, 62, + 62, 22, 18, 5, 19, 49, 22, 118, 108, 106, + 84, 52, 6, 17, 11, 27, 124, 119, 115, 87, + 117, 113, 77, 107, 99, 71, 99, 97, 107, 59, + 69, 61, 89, 93, 49, 31, 35, 39, 33, 15, + 1, 7, 12, 11, 11, 11, 9, 5, 23, 0, + 4, 5, 0, 12, 1, 3, 2, 22, 3, 13, + 21, 11, 13, 4, 48, 32, 52, 50, 42, 42, + 30, 42, 42, 8, 26, 42, 5, 22, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 100, 124, 124, 124, 124, 124, 124, 124, + 118, 118, 92, 68, 48, 14, 5, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 80, + 60, 32, 28, 26, 30, 78, 78, 78, 78, 64, + 72, 52, 38, 42, 22, 15, 14, 17, 41, 56, + 24, 14, 41, 33, 23, 33, 27, 13, 19, 35, + 11, 3, 6, 3, 1, 8, 8, 124, 124, 124, + 124, 114, 104, 90, 56, 2, 13, 78, 56, 44, + 24, 16, 2, 1, 5, 35, 33, 23, 15, 27, + 13, 5, 11, 23, 3, 4, 12, 6, 10, 18, + 10, 124, 124, 124, 
124, 114, 104, 90, 56, 2, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 3 */ + + 124, 18, 21, 124, 18, 21, 115, 71, 24, 20, + 22, 68, 114, 120, 110, 44, 97, 3, 11, 25, + 22, 24, 11, 16, 50, 76, 112, 102, 40, 19, + 109, 69, 23, 11, 25, 22, 13, 47, 22, 38, + 9, 35, 61, 8, 23, 45, 71, 14, 19, 39, + 8, 33, 41, 61, 3, 25, 13, 43, 15, 4, + 44, 0, 0, 0, 35, 45, 67, 13, 40, 2, + 92, 22, 33, 111, 47, 57, 25, 55, 67, 57, + 99, 61, 85, 69, 113, 33, 49, 57, 55, 48, + 3, 35, 113, 17, 75, 51, 115, 16, 12, 14, + 2, 22, 9, 17, 45, 73, 49, 47, 51, 21, + 41, 83, 73, 25, 89, 63, 97, 99, 107, 109, + 3, 27, 13, 13, 8, 18, 9, 19, 47, 97, + 21, 74, 72, 109, 18, 6, 10, 8, 6, 3, + 31, 15, 1, 22, 12, 4, 7, 7, 10, 13, + 10, 2, 42, 70, 60, 40, 30, 38, 24, 34, + 44, 3, 46, 38, 56, 51, 36, 68, 62, 124, + 124, 90, 116, 124, 124, 114, 124, 124, 124, 124, + 124, 122, 96, 124, 124, 124, 124, 124, 124, 120, + 100, 124, 112, 124, 44, 124, 124, 124, 124, 124, + 124, 124, 124, 124, 124, 114, 96, 80, 68, 34, + 23, 17, 29, 52, 58, 50, 60, 30, 36, 30, + 64, 12, 7, 8, 23, 9, 101, 87, 9, 8, + 20, 3, 21, 29, 37, 43, 113, 67, 50, 22, + 8, 17, 21, 47, 77, 85, 105, 19, 42, 26, + 16, 14, 11, 7, 9, 19, 41, 5, 62, 62, + 60, 22, 18, 5, 19, 47, 22, 116, 108, 104, + 82, 52, 6, 17, 11, 27, 124, 117, 111, 85, + 115, 111, 75, 103, 95, 69, 97, 93, 103, 59, + 67, 59, 87, 89, 47, 31, 35, 39, 31, 15, + 1, 7, 12, 11, 11, 13, 7, 3, 21, 0, + 4, 3, 0, 12, 1, 3, 2, 22, 3, 13, + 21, 11, 13, 2, 46, 32, 50, 48, 40, 42, + 30, 40, 40, 8, 26, 40, 5, 20, 124, 124, + 122, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 96, 124, 124, 124, 124, 124, 124, 124, + 114, 114, 88, 64, 44, 12, 7, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 118, 120, 76, + 56, 30, 26, 24, 28, 74, 74, 74, 74, 62, + 68, 48, 36, 40, 20, 17, 12, 19, 43, 54, + 22, 12, 41, 31, 23, 31, 25, 11, 19, 35, + 11, 3, 6, 1, 0, 8, 8, 124, 124, 124, + 118, 108, 96, 82, 48, 3, 13, 78, 56, 44, + 24, 16, 4, 1, 5, 33, 33, 23, 13, 25, 
+ 11, 3, 11, 21, 3, 4, 12, 6, 10, 18, + 10, 124, 124, 124, 118, 108, 96, 82, 48, 3, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 4 */ + + 124, 18, 21, 124, 18, 21, 113, 69, 24, 20, + 22, 66, 110, 118, 110, 42, 93, 3, 11, 23, + 20, 22, 11, 14, 46, 74, 106, 94, 34, 23, + 103, 67, 23, 11, 23, 20, 13, 45, 22, 36, + 9, 33, 59, 8, 23, 45, 71, 14, 19, 39, + 8, 33, 41, 59, 3, 25, 13, 43, 13, 4, + 44, 0, 0, 0, 33, 47, 67, 13, 38, 2, + 90, 22, 33, 107, 45, 55, 21, 51, 65, 55, + 97, 59, 81, 67, 109, 33, 47, 57, 55, 48, + 3, 33, 109, 17, 75, 49, 111, 16, 12, 14, + 0, 22, 9, 17, 43, 71, 47, 45, 49, 21, + 41, 81, 71, 23, 87, 61, 87, 91, 101, 103, + 3, 25, 13, 13, 8, 18, 9, 19, 47, 93, + 21, 70, 68, 105, 18, 8, 10, 8, 6, 3, + 27, 13, 0, 20, 12, 4, 5, 7, 8, 13, + 10, 2, 40, 68, 58, 38, 28, 38, 24, 34, + 42, 3, 44, 36, 54, 51, 34, 66, 60, 124, + 124, 88, 112, 124, 124, 110, 124, 124, 124, 124, + 124, 118, 92, 118, 124, 124, 124, 124, 124, 114, + 96, 124, 108, 124, 42, 124, 124, 124, 124, 124, + 124, 124, 124, 124, 120, 110, 92, 76, 64, 30, + 23, 17, 31, 50, 56, 48, 56, 28, 32, 28, + 62, 10, 7, 6, 23, 11, 99, 85, 7, 8, + 20, 1, 21, 27, 35, 41, 109, 63, 50, 24, + 8, 17, 19, 45, 73, 81, 103, 19, 44, 28, + 18, 16, 9, 7, 9, 17, 39, 5, 64, 62, + 60, 20, 18, 5, 19, 47, 22, 116, 106, 102, + 80, 52, 6, 15, 11, 27, 124, 113, 109, 83, + 111, 107, 73, 101, 93, 67, 93, 91, 99, 57, + 65, 59, 85, 87, 47, 31, 35, 37, 31, 15, + 3, 7, 10, 11, 11, 13, 7, 3, 19, 0, + 4, 3, 0, 12, 1, 3, 2, 20, 3, 13, + 21, 11, 11, 0, 44, 32, 48, 48, 38, 40, + 30, 38, 38, 8, 26, 38, 5, 18, 124, 124, + 120, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 92, 124, 124, 124, 124, 124, 124, 124, + 108, 108, 84, 60, 42, 10, 7, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 114, 114, 72, + 52, 26, 24, 24, 24, 72, 72, 72, 70, 58, + 64, 46, 34, 36, 18, 19, 8, 21, 43, 50, + 18, 8, 39, 31, 21, 31, 23, 9, 19, 33, + 9, 1, 6, 1, 2, 8, 8, 124, 124, 124, + 112, 100, 88, 72, 40, 9, 11, 
78, 56, 44, + 24, 18, 4, 1, 5, 33, 33, 23, 13, 25, + 11, 1, 11, 21, 1, 6, 12, 6, 12, 18, + 10, 124, 124, 124, 112, 100, 88, 72, 40, 9, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 5 */ + + 124, 18, 21, 124, 18, 21, 109, 65, 24, 20, + 20, 64, 106, 116, 110, 42, 89, 3, 11, 21, + 20, 22, 11, 12, 42, 72, 102, 88, 30, 27, + 97, 65, 21, 11, 21, 20, 13, 43, 22, 36, + 9, 33, 57, 8, 23, 45, 71, 14, 19, 39, + 8, 33, 39, 59, 3, 25, 13, 43, 13, 4, + 44, 0, 0, 0, 33, 47, 67, 11, 36, 2, + 88, 20, 33, 101, 43, 53, 17, 47, 61, 51, + 93, 55, 79, 65, 103, 33, 47, 55, 53, 48, + 3, 31, 105, 17, 73, 49, 107, 16, 10, 12, + 0, 22, 9, 15, 43, 69, 45, 45, 47, 19, + 39, 77, 67, 21, 83, 59, 77, 83, 95, 97, + 1, 25, 11, 11, 8, 18, 7, 19, 45, 89, + 21, 66, 64, 99, 18, 8, 10, 8, 6, 1, + 25, 11, 0, 20, 12, 4, 5, 5, 6, 11, + 10, 0, 40, 66, 58, 38, 28, 38, 24, 34, + 40, 1, 42, 36, 54, 51, 34, 64, 58, 124, + 124, 86, 110, 124, 124, 106, 124, 124, 124, 124, + 122, 114, 88, 114, 124, 120, 124, 124, 124, 110, + 92, 124, 104, 124, 38, 124, 124, 124, 124, 124, + 124, 124, 124, 122, 116, 106, 88, 74, 60, 26, + 23, 17, 31, 48, 54, 46, 54, 26, 30, 26, + 58, 8, 9, 4, 25, 13, 97, 81, 5, 10, + 20, 1, 19, 27, 35, 39, 105, 61, 50, 24, + 10, 15, 17, 43, 71, 79, 99, 17, 46, 30, + 20, 16, 7, 5, 7, 17, 39, 5, 64, 62, + 58, 20, 18, 5, 19, 45, 22, 114, 104, 100, + 78, 52, 6, 15, 11, 25, 124, 111, 105, 79, + 107, 105, 71, 97, 89, 65, 89, 87, 95, 55, + 63, 57, 83, 83, 47, 31, 33, 37, 29, 15, + 3, 7, 10, 11, 11, 15, 5, 3, 17, 0, + 4, 3, 0, 12, 1, 3, 2, 20, 3, 13, + 21, 11, 11, 1, 42, 32, 46, 46, 38, 38, + 30, 38, 36, 8, 26, 36, 5, 16, 124, 124, + 118, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 88, 124, 124, 124, 124, 124, 124, 122, + 104, 104, 80, 58, 38, 10, 9, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 122, 110, 108, 68, + 48, 24, 24, 22, 20, 70, 68, 68, 66, 54, + 60, 42, 32, 34, 16, 19, 6, 23, 45, 48, + 16, 6, 39, 31, 19, 29, 21, 7, 17, 31, + 9, 1, 6, 0, 4, 8, 8, 
124, 124, 118, + 106, 94, 80, 64, 32, 15, 11, 78, 56, 44, + 24, 18, 4, 0, 3, 31, 33, 23, 11, 25, + 9, 0, 11, 21, 1, 6, 12, 8, 12, 18, + 10, 124, 124, 118, 106, 94, 80, 64, 32, 15, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 6 */ + + 124, 18, 23, 124, 18, 23, 105, 63, 26, 20, + 20, 60, 102, 114, 110, 42, 87, 3, 9, 21, + 20, 20, 9, 12, 40, 68, 96, 80, 24, 29, + 93, 63, 21, 9, 21, 20, 15, 43, 22, 34, + 9, 33, 57, 8, 23, 43, 69, 14, 17, 37, + 8, 31, 39, 59, 3, 25, 13, 43, 13, 4, + 44, 0, 0, 0, 31, 47, 67, 11, 36, 0, + 88, 20, 33, 97, 41, 51, 15, 41, 59, 49, + 89, 53, 77, 63, 99, 33, 47, 55, 53, 48, + 3, 29, 99, 17, 71, 47, 103, 14, 10, 12, + 1, 24, 9, 15, 41, 69, 45, 43, 45, 19, + 37, 75, 65, 21, 81, 57, 67, 73, 89, 91, + 1, 25, 11, 11, 8, 18, 7, 17, 45, 85, + 19, 62, 60, 93, 18, 8, 10, 8, 8, 1, + 21, 9, 0, 20, 12, 4, 3, 3, 4, 11, + 10, 0, 38, 64, 56, 38, 26, 38, 24, 34, + 36, 1, 40, 34, 52, 51, 34, 64, 58, 124, + 124, 84, 106, 124, 124, 102, 124, 124, 124, 124, + 114, 110, 86, 110, 124, 116, 124, 124, 124, 104, + 88, 124, 100, 124, 34, 124, 124, 124, 124, 124, + 124, 124, 124, 118, 112, 100, 84, 70, 58, 24, + 23, 17, 33, 46, 52, 44, 52, 24, 28, 24, + 54, 6, 9, 2, 27, 13, 97, 79, 3, 10, + 20, 1, 17, 25, 33, 39, 101, 59, 52, 24, + 10, 13, 15, 41, 67, 75, 95, 15, 46, 30, + 20, 18, 5, 5, 7, 17, 37, 5, 66, 62, + 56, 20, 18, 5, 19, 45, 20, 114, 104, 98, + 76, 50, 6, 15, 11, 25, 124, 109, 103, 77, + 105, 101, 69, 93, 85, 63, 87, 83, 91, 55, + 61, 57, 81, 79, 45, 31, 33, 35, 29, 15, + 3, 7, 8, 11, 11, 15, 5, 1, 15, 0, + 4, 1, 2, 12, 0, 1, 2, 18, 3, 13, + 21, 11, 9, 3, 40, 32, 44, 44, 36, 38, + 30, 36, 36, 8, 24, 32, 7, 14, 124, 124, + 116, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 84, 124, 124, 124, 124, 124, 124, 116, + 100, 98, 76, 54, 36, 8, 11, 124, 124, 124, + 124, 124, 124, 124, 124, 122, 116, 104, 102, 64, + 46, 20, 22, 20, 18, 66, 66, 64, 62, 52, + 56, 38, 30, 30, 14, 21, 4, 25, 47, 44, + 14, 4, 39, 29, 19, 29, 19, 5, 
17, 31, + 7, 0, 6, 0, 6, 8, 8, 124, 124, 114, + 100, 88, 72, 56, 24, 21, 11, 78, 56, 44, + 24, 20, 6, 0, 3, 31, 31, 21, 11, 23, + 7, 2, 9, 19, 1, 6, 12, 8, 14, 18, + 10, 124, 124, 114, 100, 88, 72, 56, 24, 21, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 7 */ + + 124, 18, 23, 124, 18, 23, 101, 59, 26, 20, + 18, 58, 98, 112, 110, 42, 83, 3, 9, 19, + 18, 20, 9, 10, 36, 66, 92, 74, 20, 33, + 87, 61, 21, 9, 19, 18, 15, 41, 22, 34, + 9, 31, 55, 8, 23, 43, 69, 14, 17, 37, + 8, 31, 37, 57, 3, 25, 13, 43, 11, 4, + 44, 0, 0, 0, 31, 47, 67, 9, 34, 0, + 86, 18, 33, 91, 39, 49, 11, 37, 55, 45, + 87, 49, 73, 61, 95, 33, 45, 53, 53, 48, + 3, 27, 95, 17, 69, 47, 99, 14, 8, 10, + 1, 24, 9, 13, 41, 67, 43, 43, 43, 17, + 35, 71, 61, 19, 79, 55, 57, 65, 83, 85, + 1, 23, 11, 9, 8, 18, 5, 17, 43, 81, + 19, 58, 56, 87, 18, 10, 10, 8, 8, 0, + 19, 7, 2, 18, 12, 4, 3, 3, 2, 11, + 10, 1, 38, 62, 56, 36, 26, 38, 24, 34, + 34, 1, 38, 32, 52, 51, 34, 62, 56, 120, + 124, 82, 102, 124, 124, 98, 124, 122, 124, 124, + 108, 106, 82, 104, 124, 110, 124, 124, 124, 98, + 84, 124, 96, 124, 32, 124, 124, 124, 124, 124, + 124, 124, 124, 114, 106, 96, 80, 66, 54, 20, + 23, 17, 33, 44, 50, 42, 48, 22, 26, 22, + 52, 4, 11, 0, 27, 15, 95, 75, 1, 12, + 20, 0, 17, 25, 33, 37, 97, 55, 52, 26, + 12, 13, 13, 39, 65, 73, 91, 15, 48, 32, + 22, 18, 3, 3, 7, 15, 37, 5, 66, 62, + 56, 18, 18, 5, 19, 43, 20, 112, 102, 96, + 74, 50, 6, 13, 11, 25, 124, 105, 99, 75, + 101, 99, 67, 91, 83, 61, 83, 81, 87, 53, + 59, 55, 79, 75, 45, 31, 33, 35, 27, 15, + 5, 7, 8, 11, 11, 17, 3, 1, 13, 0, + 4, 1, 2, 12, 0, 1, 2, 18, 3, 13, + 21, 11, 9, 5, 38, 32, 42, 44, 34, 36, + 30, 34, 34, 8, 24, 30, 7, 12, 122, 124, + 114, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 80, 124, 124, 124, 124, 124, 124, 112, + 96, 94, 72, 50, 32, 6, 11, 124, 124, 124, + 124, 124, 124, 124, 124, 118, 112, 100, 96, 60, + 42, 18, 20, 20, 14, 64, 62, 62, 58, 48, + 52, 36, 28, 28, 12, 23, 0, 27, 47, 42, + 10, 0, 37, 
29, 17, 27, 17, 3, 17, 29, + 7, 0, 6, 2, 8, 8, 8, 124, 124, 108, + 94, 80, 64, 48, 16, 27, 9, 78, 56, 44, + 24, 20, 6, 0, 3, 29, 31, 21, 9, 23, + 5, 4, 9, 19, 0, 8, 12, 8, 14, 18, + 10, 124, 124, 108, 94, 80, 64, 48, 16, 27, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 8 */ + + 124, 16, 23, 124, 16, 23, 99, 57, 26, 20, + 18, 54, 92, 110, 110, 40, 79, 5, 9, 17, + 18, 18, 9, 8, 32, 64, 86, 66, 14, 37, + 81, 59, 21, 9, 17, 18, 17, 39, 22, 32, + 9, 31, 55, 6, 25, 43, 69, 14, 17, 37, + 8, 31, 37, 57, 3, 25, 13, 43, 11, 4, + 44, 0, 0, 0, 29, 49, 67, 9, 32, 0, + 84, 18, 35, 87, 37, 47, 9, 33, 53, 43, + 83, 47, 71, 59, 91, 33, 45, 53, 53, 48, + 3, 25, 91, 17, 69, 45, 95, 14, 8, 10, + 3, 24, 9, 13, 39, 65, 41, 41, 43, 17, + 35, 69, 59, 19, 77, 53, 49, 57, 77, 81, + 1, 23, 11, 9, 6, 18, 5, 17, 43, 77, + 19, 54, 52, 83, 18, 10, 10, 8, 8, 0, + 15, 7, 2, 18, 10, 4, 1, 1, 1, 11, + 10, 1, 36, 58, 54, 36, 24, 38, 24, 32, + 32, 1, 36, 30, 50, 51, 32, 60, 54, 116, + 124, 78, 98, 124, 124, 92, 124, 118, 124, 124, + 100, 102, 78, 100, 124, 106, 124, 124, 124, 92, + 80, 124, 92, 124, 28, 124, 124, 124, 124, 124, + 124, 124, 120, 110, 102, 92, 76, 62, 50, 16, + 23, 19, 35, 42, 46, 40, 46, 20, 22, 18, + 48, 2, 11, 1, 29, 17, 95, 73, 0, 12, + 20, 0, 15, 23, 31, 37, 93, 53, 52, 26, + 12, 11, 11, 37, 61, 69, 89, 13, 48, 32, + 22, 20, 1, 3, 7, 15, 35, 7, 68, 62, + 54, 18, 18, 5, 19, 43, 20, 112, 100, 94, + 72, 50, 6, 13, 11, 25, 124, 103, 97, 73, + 99, 95, 65, 87, 79, 59, 81, 77, 83, 53, + 59, 55, 77, 73, 45, 31, 33, 33, 27, 15, + 5, 7, 6, 11, 11, 17, 3, 1, 11, 0, + 2, 1, 2, 10, 0, 1, 2, 16, 3, 13, + 21, 11, 7, 7, 36, 32, 38, 42, 32, 34, + 28, 32, 32, 8, 24, 28, 7, 8, 120, 120, + 112, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 120, 76, 124, 124, 124, 124, 124, 124, 106, + 90, 88, 68, 46, 30, 4, 13, 124, 124, 124, + 124, 124, 124, 124, 124, 112, 106, 94, 90, 56, + 38, 14, 18, 18, 10, 60, 60, 58, 54, 44, + 48, 32, 24, 24, 8, 25, 1, 29, 49, 38, + 8, 1, 
37, 29, 17, 27, 15, 1, 17, 29, + 5, 2, 6, 2, 8, 8, 6, 124, 120, 102, + 88, 74, 56, 38, 6, 33, 9, 78, 56, 44, + 24, 22, 6, 0, 3, 29, 31, 21, 9, 23, + 5, 4, 9, 19, 0, 8, 12, 8, 16, 18, + 8, 124, 120, 102, 88, 74, 56, 38, 6, 33, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 9 */ + + 124, 16, 23, 124, 16, 23, 95, 55, 28, 20, + 18, 52, 88, 108, 112, 40, 75, 5, 7, 15, + 18, 16, 9, 8, 30, 62, 82, 58, 8, 39, + 75, 57, 19, 7, 15, 18, 17, 37, 22, 32, + 7, 31, 53, 6, 25, 43, 69, 14, 17, 35, + 8, 31, 37, 57, 3, 25, 13, 41, 11, 4, + 44, 0, 0, 0, 27, 49, 67, 9, 30, 0, + 82, 18, 35, 83, 33, 45, 5, 29, 49, 41, + 79, 43, 69, 55, 85, 33, 45, 53, 51, 48, + 3, 23, 87, 15, 67, 43, 91, 14, 8, 10, + 3, 24, 9, 13, 37, 63, 39, 39, 41, 15, + 33, 67, 55, 17, 73, 51, 39, 47, 69, 75, + 0, 23, 9, 7, 6, 18, 5, 15, 41, 71, + 19, 50, 50, 77, 18, 10, 10, 8, 8, 2, + 11, 5, 2, 18, 10, 4, 0, 0, 3, 9, + 10, 1, 34, 56, 52, 36, 22, 38, 24, 32, + 30, 0, 34, 30, 48, 51, 32, 60, 54, 112, + 124, 76, 96, 124, 124, 88, 120, 114, 124, 124, + 94, 98, 74, 96, 124, 102, 124, 124, 124, 88, + 76, 124, 88, 124, 24, 124, 124, 124, 124, 124, + 124, 120, 116, 106, 98, 88, 74, 60, 48, 12, + 23, 19, 35, 42, 44, 38, 44, 18, 20, 16, + 44, 0, 11, 3, 31, 17, 93, 71, 2, 12, + 20, 0, 13, 21, 29, 35, 87, 51, 52, 26, + 12, 9, 9, 35, 57, 65, 85, 11, 50, 34, + 24, 22, 0, 3, 5, 15, 33, 7, 70, 62, + 52, 18, 20, 3, 19, 41, 20, 112, 100, 92, + 70, 50, 6, 13, 11, 23, 124, 101, 95, 69, + 95, 91, 63, 83, 75, 57, 77, 73, 79, 51, + 57, 53, 75, 69, 43, 29, 31, 31, 25, 15, + 5, 7, 4, 11, 11, 17, 3, 0, 9, 2, + 2, 0, 2, 10, 0, 1, 2, 14, 3, 11, + 19, 11, 5, 7, 34, 32, 36, 40, 32, 34, + 28, 32, 30, 8, 24, 26, 7, 6, 118, 118, + 112, 122, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 114, 72, 124, 124, 124, 124, 124, 124, 100, + 86, 84, 66, 44, 28, 4, 15, 124, 124, 124, + 124, 124, 124, 124, 124, 108, 102, 90, 86, 52, + 34, 10, 18, 16, 8, 58, 58, 54, 50, 42, + 46, 28, 22, 20, 6, 25, 3, 29, 51, 34, + 6, 3, 37, 27, 
15, 27, 13, 2, 15, 27, + 3, 4, 6, 4, 10, 8, 6, 124, 116, 98, + 82, 68, 48, 30, 1, 39, 9, 78, 56, 46, + 26, 24, 8, 2, 1, 29, 31, 21, 9, 21, + 3, 6, 9, 17, 0, 8, 12, 10, 18, 18, + 8, 124, 116, 98, 82, 68, 48, 30, 1, 39, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 10 */ + + 124, 16, 23, 124, 16, 23, 91, 51, 28, 20, + 16, 50, 84, 106, 112, 40, 71, 5, 7, 13, + 16, 16, 9, 6, 26, 60, 76, 52, 4, 43, + 69, 55, 19, 7, 13, 16, 17, 35, 22, 30, + 7, 29, 51, 6, 25, 43, 69, 14, 17, 35, + 8, 31, 35, 55, 3, 25, 13, 41, 9, 4, + 44, 0, 0, 0, 27, 49, 67, 7, 28, 0, + 80, 16, 35, 77, 31, 43, 1, 25, 47, 37, + 77, 41, 65, 53, 81, 33, 43, 51, 51, 48, + 3, 21, 83, 15, 65, 43, 87, 14, 6, 8, + 5, 24, 9, 11, 37, 61, 37, 39, 39, 15, + 31, 63, 53, 15, 71, 49, 29, 39, 63, 69, + 0, 21, 9, 7, 6, 18, 3, 15, 41, 67, + 19, 46, 46, 71, 18, 12, 10, 8, 8, 2, + 9, 3, 4, 16, 10, 4, 0, 0, 5, 9, + 10, 3, 34, 54, 52, 34, 22, 38, 24, 32, + 28, 0, 32, 28, 48, 51, 32, 58, 52, 108, + 124, 74, 92, 124, 124, 84, 114, 110, 124, 124, + 86, 94, 70, 90, 122, 96, 124, 124, 124, 82, + 72, 116, 84, 124, 22, 124, 124, 124, 124, 124, + 120, 116, 112, 102, 92, 84, 70, 56, 44, 8, + 23, 19, 37, 40, 42, 36, 40, 16, 18, 14, + 42, 1, 13, 5, 31, 19, 91, 67, 4, 14, + 20, 2, 13, 21, 29, 33, 83, 47, 52, 28, + 14, 9, 7, 33, 55, 63, 81, 11, 52, 36, + 26, 22, 2, 1, 5, 13, 33, 7, 70, 62, + 52, 16, 20, 3, 19, 41, 20, 110, 98, 90, + 68, 50, 6, 11, 11, 23, 124, 97, 91, 67, + 91, 89, 61, 81, 73, 55, 73, 71, 75, 49, + 55, 53, 73, 65, 43, 29, 31, 31, 25, 15, + 7, 7, 4, 11, 11, 19, 1, 0, 7, 2, + 2, 0, 2, 10, 0, 1, 2, 14, 3, 11, + 19, 11, 5, 9, 32, 32, 34, 40, 30, 32, + 28, 30, 28, 8, 24, 24, 7, 4, 116, 116, + 110, 118, 120, 124, 124, 124, 124, 124, 124, 124, + 124, 110, 68, 124, 124, 124, 124, 124, 124, 96, + 82, 78, 62, 40, 24, 2, 15, 124, 124, 124, + 124, 124, 124, 124, 124, 104, 96, 86, 80, 48, + 30, 8, 16, 16, 4, 56, 54, 52, 46, 38, + 42, 26, 20, 18, 4, 27, 7, 31, 51, 32, + 2, 7, 35, 27, 13, 25, 11, 4, 15, 
25, + 3, 4, 6, 4, 12, 8, 6, 124, 112, 92, + 76, 60, 40, 22, 9, 45, 7, 78, 56, 46, + 26, 24, 8, 2, 1, 27, 31, 21, 7, 21, + 1, 8, 9, 17, 2, 10, 12, 10, 18, 18, + 8, 124, 112, 92, 76, 60, 40, 22, 9, 45, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 11 */ + + 124, 16, 25, 124, 16, 25, 87, 49, 30, 20, + 16, 46, 80, 104, 112, 40, 69, 5, 5, 13, + 16, 14, 7, 6, 24, 56, 72, 44, 1, 45, + 65, 53, 19, 5, 13, 16, 19, 35, 22, 30, + 7, 29, 51, 6, 25, 41, 67, 14, 15, 33, + 8, 29, 35, 55, 3, 25, 13, 41, 9, 4, + 44, 0, 0, 0, 25, 49, 67, 7, 28, 1, + 80, 16, 35, 73, 29, 41, 0, 19, 43, 35, + 73, 37, 63, 51, 77, 33, 43, 51, 51, 48, + 3, 19, 77, 15, 63, 41, 83, 12, 6, 8, + 5, 26, 9, 11, 35, 61, 37, 37, 37, 13, + 29, 61, 49, 15, 69, 47, 19, 29, 57, 63, + 0, 21, 9, 5, 6, 18, 3, 13, 39, 63, + 17, 42, 42, 65, 18, 12, 10, 8, 10, 4, + 5, 1, 4, 16, 10, 4, 2, 2, 7, 9, + 10, 3, 32, 52, 50, 34, 20, 38, 24, 32, + 24, 0, 30, 26, 46, 51, 32, 58, 52, 104, + 124, 72, 88, 122, 124, 80, 110, 106, 124, 124, + 80, 90, 68, 86, 114, 92, 124, 124, 124, 76, + 68, 110, 80, 124, 18, 124, 124, 124, 124, 124, + 116, 110, 108, 98, 88, 78, 66, 52, 42, 6, + 23, 19, 37, 38, 40, 34, 38, 14, 16, 12, + 38, 3, 13, 7, 33, 19, 91, 65, 6, 14, + 20, 2, 11, 19, 27, 33, 79, 45, 54, 28, + 14, 7, 5, 31, 51, 59, 77, 9, 52, 36, + 26, 24, 4, 1, 5, 13, 31, 7, 72, 62, + 50, 16, 20, 3, 19, 39, 18, 110, 98, 88, + 66, 48, 6, 11, 11, 23, 124, 95, 89, 65, + 89, 85, 59, 77, 69, 53, 71, 67, 71, 49, + 53, 51, 71, 61, 41, 29, 31, 29, 23, 15, + 7, 7, 2, 11, 11, 19, 1, 2, 5, 2, + 2, 2, 4, 10, 2, 0, 2, 12, 3, 11, + 19, 11, 3, 11, 30, 32, 32, 38, 28, 32, + 28, 28, 28, 8, 22, 20, 9, 2, 112, 114, + 108, 116, 116, 124, 124, 124, 124, 124, 124, 124, + 124, 104, 64, 124, 124, 124, 124, 124, 124, 90, + 78, 74, 58, 36, 22, 0, 17, 124, 124, 124, + 124, 124, 124, 120, 118, 98, 92, 80, 74, 44, + 28, 4, 14, 14, 2, 52, 52, 48, 42, 36, + 38, 22, 18, 14, 2, 29, 9, 33, 53, 28, + 0, 9, 35, 25, 13, 25, 9, 6, 15, 25, + 1, 6, 6, 6, 
14, 8, 6, 124, 108, 88, + 70, 54, 32, 14, 17, 51, 7, 78, 56, 46, + 26, 26, 10, 2, 1, 27, 29, 19, 7, 19, + 0, 10, 7, 15, 2, 10, 12, 10, 20, 18, + 8, 124, 108, 88, 70, 54, 32, 14, 17, 51, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 12 */ + + 124, 16, 25, 124, 16, 25, 85, 45, 30, 20, + 14, 44, 76, 102, 112, 38, 65, 5, 5, 11, + 16, 14, 7, 4, 20, 54, 66, 38, 5, 49, + 59, 51, 19, 5, 11, 16, 19, 33, 22, 28, + 7, 29, 49, 6, 25, 41, 67, 14, 15, 33, + 8, 29, 33, 55, 3, 25, 13, 41, 9, 4, + 44, 0, 0, 0, 25, 51, 67, 5, 26, 1, + 78, 14, 35, 67, 27, 39, 4, 15, 41, 31, + 69, 35, 61, 49, 73, 33, 43, 49, 51, 48, + 3, 17, 73, 15, 63, 41, 79, 12, 4, 6, + 7, 26, 9, 9, 35, 59, 35, 37, 35, 13, + 29, 57, 47, 13, 67, 45, 9, 21, 51, 57, + 0, 21, 9, 5, 6, 18, 1, 13, 39, 59, + 17, 38, 38, 61, 18, 12, 10, 8, 10, 4, + 3, 0, 4, 16, 10, 4, 2, 4, 9, 9, + 10, 5, 32, 50, 50, 34, 20, 38, 24, 32, + 22, 0, 28, 24, 46, 51, 30, 56, 50, 100, + 124, 70, 84, 118, 120, 76, 104, 102, 124, 124, + 72, 86, 64, 82, 108, 86, 116, 124, 124, 70, + 64, 102, 76, 124, 14, 124, 124, 124, 124, 124, + 112, 106, 104, 94, 84, 74, 62, 48, 38, 2, + 23, 19, 39, 36, 38, 32, 36, 12, 12, 10, + 34, 5, 15, 9, 35, 21, 89, 61, 8, 16, + 20, 2, 9, 19, 27, 31, 75, 43, 54, 28, + 16, 5, 3, 29, 49, 57, 75, 7, 54, 38, + 28, 24, 6, 0, 5, 13, 31, 7, 72, 62, + 48, 16, 20, 3, 19, 39, 18, 108, 96, 86, + 64, 48, 6, 11, 11, 23, 124, 93, 85, 63, + 85, 83, 57, 73, 65, 51, 67, 63, 67, 47, + 51, 51, 69, 59, 41, 29, 31, 29, 23, 15, + 7, 7, 2, 11, 11, 21, 0, 2, 3, 2, + 2, 2, 4, 10, 2, 0, 2, 12, 3, 11, + 19, 11, 3, 13, 28, 32, 30, 36, 26, 30, + 28, 26, 26, 8, 22, 18, 9, 0, 110, 112, + 106, 112, 112, 124, 122, 124, 124, 124, 124, 124, + 122, 100, 60, 124, 124, 124, 124, 124, 118, 86, + 72, 68, 54, 32, 18, 1, 19, 124, 124, 124, + 124, 124, 124, 114, 112, 94, 86, 76, 68, 40, + 24, 2, 12, 12, 1, 50, 48, 44, 38, 32, + 34, 18, 16, 12, 0, 31, 11, 35, 55, 26, + 1, 11, 35, 25, 11, 23, 7, 8, 15, 23, + 1, 6, 6, 6, 16, 8, 6, 122, 
104, 82, + 64, 48, 24, 4, 25, 57, 7, 78, 56, 46, + 26, 26, 10, 2, 1, 25, 29, 19, 5, 19, + 0, 12, 7, 15, 2, 10, 12, 10, 20, 18, + 8, 122, 104, 82, 64, 48, 24, 4, 25, 57, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 13 */ + + 124, 16, 25, 124, 16, 25, 81, 43, 30, 20, + 14, 42, 72, 100, 112, 38, 61, 5, 5, 9, + 14, 12, 7, 2, 16, 52, 62, 30, 11, 53, + 53, 49, 17, 5, 9, 14, 19, 31, 22, 28, + 7, 27, 47, 6, 25, 41, 67, 14, 15, 33, + 8, 29, 33, 53, 3, 25, 13, 41, 7, 4, + 44, 0, 0, 0, 23, 51, 67, 5, 24, 1, + 76, 14, 35, 63, 25, 37, 8, 11, 37, 29, + 67, 31, 57, 47, 67, 33, 41, 49, 49, 48, + 3, 15, 69, 15, 61, 39, 75, 12, 4, 6, + 7, 26, 9, 9, 33, 57, 33, 35, 33, 11, + 27, 55, 43, 11, 63, 43, 0, 13, 45, 51, + 2, 19, 7, 3, 6, 18, 1, 13, 37, 55, + 17, 34, 34, 55, 18, 14, 10, 8, 10, 6, + 0, 2, 6, 14, 10, 4, 4, 4, 11, 7, + 10, 5, 30, 48, 48, 32, 18, 38, 24, 32, + 20, 2, 26, 24, 44, 51, 30, 54, 48, 96, + 124, 68, 82, 114, 116, 72, 100, 98, 124, 124, + 66, 82, 60, 76, 102, 82, 110, 124, 124, 66, + 60, 96, 72, 124, 12, 124, 124, 124, 122, 120, + 108, 102, 100, 90, 78, 70, 58, 46, 34, 1, + 23, 19, 39, 34, 36, 30, 32, 10, 10, 8, + 32, 7, 15, 11, 35, 23, 87, 59, 10, 16, + 20, 4, 9, 17, 25, 29, 71, 39, 54, 30, + 16, 5, 1, 27, 45, 53, 71, 7, 56, 40, + 30, 26, 8, 0, 3, 11, 29, 7, 74, 62, + 48, 14, 20, 3, 19, 37, 18, 108, 94, 84, + 62, 48, 6, 9, 11, 21, 124, 89, 83, 59, + 81, 79, 55, 71, 63, 49, 63, 61, 63, 45, + 49, 49, 67, 55, 41, 29, 29, 27, 21, 15, + 9, 7, 0, 11, 11, 21, 0, 2, 1, 2, + 2, 2, 4, 10, 2, 0, 2, 10, 3, 11, + 19, 11, 1, 15, 26, 32, 28, 36, 26, 28, + 28, 26, 24, 8, 22, 16, 9, 1, 108, 110, + 104, 108, 108, 124, 118, 122, 124, 118, 124, 124, + 116, 94, 56, 124, 124, 124, 124, 118, 112, 80, + 68, 64, 50, 30, 16, 1, 19, 124, 124, 124, + 124, 118, 118, 110, 106, 90, 82, 72, 62, 36, + 20, 1, 12, 12, 5, 48, 46, 42, 34, 28, + 30, 16, 14, 8, 1, 31, 15, 37, 55, 22, + 5, 15, 33, 25, 9, 23, 5, 10, 13, 21, + 0, 8, 6, 8, 18, 8, 6, 120, 100, 76, + 58, 40, 16, 
3, 33, 63, 5, 78, 56, 46, + 26, 28, 10, 4, 0, 25, 29, 19, 5, 19, + 2, 14, 7, 15, 4, 12, 12, 12, 22, 18, + 8, 120, 100, 76, 58, 40, 16, 3, 33, 63, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 14 */ + + 122, 16, 25, 122, 16, 25, 77, 39, 32, 20, + 12, 38, 68, 98, 112, 38, 57, 5, 3, 7, + 14, 12, 7, 2, 14, 50, 56, 24, 15, 55, + 47, 47, 17, 3, 7, 14, 21, 29, 22, 26, + 7, 27, 47, 6, 25, 41, 67, 14, 15, 31, + 8, 29, 31, 53, 3, 25, 13, 41, 7, 4, + 44, 0, 0, 0, 23, 51, 67, 3, 22, 1, + 74, 12, 35, 57, 23, 35, 10, 7, 35, 25, + 63, 29, 55, 45, 63, 33, 41, 47, 49, 48, + 3, 13, 65, 15, 59, 39, 71, 12, 2, 4, + 9, 26, 9, 7, 33, 55, 31, 35, 31, 11, + 25, 51, 41, 11, 61, 41, 10, 3, 39, 45, + 2, 19, 7, 3, 6, 18, 0, 11, 37, 51, + 17, 30, 30, 49, 18, 14, 10, 8, 10, 6, + 2, 4, 6, 14, 10, 4, 4, 6, 13, 7, + 10, 7, 30, 46, 48, 32, 18, 38, 24, 32, + 18, 2, 24, 22, 44, 51, 30, 54, 48, 92, + 122, 66, 78, 110, 110, 68, 94, 94, 124, 124, + 58, 78, 56, 72, 96, 76, 104, 122, 124, 60, + 56, 88, 68, 124, 8, 120, 124, 120, 116, 114, + 104, 98, 96, 86, 74, 66, 54, 42, 32, 5, + 23, 19, 41, 32, 34, 28, 30, 8, 8, 6, + 28, 9, 17, 13, 37, 23, 87, 55, 12, 18, + 20, 4, 7, 17, 25, 29, 67, 37, 54, 30, + 18, 3, 0, 25, 43, 51, 67, 5, 56, 40, + 30, 26, 10, 2, 3, 11, 29, 7, 74, 62, + 46, 14, 20, 3, 19, 37, 18, 106, 94, 82, + 60, 48, 6, 9, 11, 21, 124, 87, 79, 57, + 79, 77, 53, 67, 59, 47, 61, 57, 59, 45, + 47, 49, 65, 51, 39, 29, 29, 27, 21, 15, + 9, 7, 0, 11, 11, 23, 2, 4, 0, 2, + 2, 4, 4, 10, 2, 0, 2, 10, 3, 11, + 19, 11, 1, 17, 24, 32, 26, 34, 24, 28, + 28, 24, 22, 8, 22, 14, 9, 3, 106, 108, + 102, 106, 104, 120, 114, 118, 118, 114, 124, 120, + 110, 90, 52, 124, 124, 124, 124, 110, 106, 76, + 64, 58, 46, 26, 12, 3, 21, 124, 124, 124, + 120, 112, 114, 104, 100, 84, 76, 66, 56, 32, + 16, 3, 10, 10, 7, 44, 42, 38, 30, 26, + 26, 12, 12, 6, 3, 33, 17, 39, 57, 20, + 7, 17, 33, 23, 9, 21, 3, 12, 13, 21, + 0, 8, 6, 8, 20, 8, 6, 118, 96, 72, + 52, 34, 8, 11, 41, 69, 5, 78, 56, 46, + 26, 
28, 12, 4, 0, 23, 29, 19, 3, 17, + 4, 16, 7, 13, 4, 12, 12, 12, 22, 18, + 8, 118, 96, 72, 52, 34, 8, 11, 41, 69, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 15 */ + + 120, 16, 25, 120, 16, 25, 73, 37, 32, 20, + 12, 36, 64, 96, 112, 38, 53, 5, 3, 5, + 14, 10, 7, 0, 10, 48, 52, 16, 21, 59, + 41, 45, 17, 3, 5, 14, 21, 27, 22, 26, + 7, 27, 45, 6, 25, 41, 67, 14, 15, 31, + 8, 29, 31, 53, 3, 25, 13, 41, 7, 4, + 44, 0, 0, 0, 21, 51, 67, 3, 20, 1, + 72, 12, 35, 53, 21, 33, 14, 3, 31, 23, + 59, 25, 53, 43, 59, 33, 41, 47, 49, 48, + 3, 11, 61, 15, 57, 37, 67, 12, 2, 4, + 9, 26, 9, 7, 31, 53, 29, 33, 29, 9, + 23, 49, 37, 9, 59, 39, 20, 4, 33, 39, + 2, 19, 7, 1, 6, 18, 0, 11, 35, 47, + 17, 26, 26, 43, 18, 14, 10, 8, 10, 8, + 6, 6, 6, 14, 10, 4, 6, 8, 15, 7, + 10, 7, 28, 44, 46, 32, 16, 38, 24, 32, + 16, 2, 22, 20, 42, 51, 30, 52, 46, 88, + 116, 64, 74, 106, 106, 64, 90, 90, 124, 124, + 52, 74, 52, 68, 90, 72, 98, 114, 124, 54, + 52, 82, 64, 124, 4, 116, 124, 116, 112, 110, + 100, 94, 92, 82, 70, 62, 50, 38, 28, 9, + 23, 19, 41, 30, 32, 26, 28, 6, 6, 4, + 24, 11, 17, 15, 39, 25, 85, 53, 14, 18, + 20, 4, 5, 15, 23, 27, 63, 35, 54, 30, + 18, 1, 2, 23, 39, 47, 63, 3, 58, 42, + 32, 28, 12, 2, 3, 11, 27, 7, 76, 62, + 44, 14, 20, 3, 19, 35, 18, 106, 92, 80, + 58, 48, 6, 9, 11, 21, 124, 85, 77, 55, + 75, 73, 51, 63, 55, 45, 57, 53, 55, 43, + 45, 47, 63, 47, 39, 29, 29, 25, 19, 15, + 9, 7, 1, 11, 11, 23, 2, 4, 2, 2, + 2, 4, 4, 10, 2, 0, 2, 8, 3, 11, + 19, 11, 0, 19, 22, 32, 24, 32, 22, 26, + 28, 22, 20, 8, 22, 12, 9, 5, 104, 106, + 100, 102, 100, 116, 110, 114, 114, 108, 122, 114, + 104, 84, 48, 124, 124, 124, 124, 104, 100, 70, + 60, 54, 42, 22, 10, 5, 23, 124, 124, 124, + 116, 106, 108, 100, 94, 80, 72, 62, 50, 28, + 12, 7, 8, 8, 11, 42, 40, 34, 26, 22, + 22, 8, 10, 2, 5, 35, 19, 41, 59, 16, + 9, 19, 33, 23, 7, 21, 1, 14, 13, 19, + 2, 10, 6, 10, 22, 8, 6, 116, 92, 66, + 46, 28, 0, 19, 49, 75, 5, 78, 56, 46, + 26, 30, 12, 4, 0, 23, 29, 19, 3, 17, + 
6, 18, 7, 13, 4, 12, 12, 12, 24, 18, + 8, 116, 92, 66, 46, 28, 0, 19, 49, 75, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 16 */ + + 116, 14, 27, 116, 14, 27, 71, 35, 32, 20, + 10, 32, 58, 94, 112, 36, 51, 7, 3, 5, + 12, 8, 7, 1, 6, 44, 46, 8, 27, 63, + 37, 45, 17, 3, 5, 12, 23, 27, 22, 24, + 7, 27, 45, 4, 27, 41, 67, 12, 15, 31, + 8, 29, 31, 53, 3, 25, 15, 41, 7, 4, + 44, 0, 0, 0, 21, 53, 67, 3, 18, 3, + 70, 10, 37, 49, 19, 31, 16, 0, 29, 21, + 57, 23, 51, 41, 55, 33, 41, 47, 49, 48, + 3, 11, 57, 15, 57, 37, 65, 10, 0, 2, + 11, 26, 9, 7, 31, 53, 29, 33, 29, 9, + 23, 47, 35, 9, 57, 37, 28, 12, 27, 35, + 2, 19, 7, 1, 4, 18, 0, 11, 35, 43, + 17, 22, 22, 39, 18, 14, 10, 8, 10, 8, + 8, 6, 6, 12, 8, 4, 6, 8, 19, 7, + 10, 9, 26, 40, 44, 30, 14, 38, 24, 30, + 12, 2, 20, 18, 40, 51, 28, 50, 44, 82, + 108, 60, 70, 100, 100, 58, 84, 86, 110, 124, + 44, 68, 48, 62, 82, 66, 90, 104, 118, 48, + 48, 74, 60, 124, 0, 110, 118, 110, 106, 104, + 94, 88, 86, 78, 64, 56, 46, 34, 24, 13, + 23, 21, 43, 28, 28, 22, 24, 2, 2, 0, + 20, 13, 19, 17, 41, 27, 85, 51, 14, 18, + 20, 4, 5, 15, 23, 27, 59, 33, 54, 30, + 18, 1, 2, 21, 37, 45, 61, 3, 58, 42, + 32, 28, 14, 2, 3, 11, 27, 9, 76, 60, + 42, 12, 20, 3, 19, 35, 16, 104, 90, 76, + 56, 46, 6, 9, 11, 21, 124, 83, 75, 53, + 73, 71, 49, 61, 53, 43, 55, 51, 51, 43, + 45, 47, 61, 45, 39, 29, 29, 25, 19, 15, + 11, 9, 3, 11, 13, 25, 2, 4, 4, 2, + 0, 4, 4, 8, 2, 0, 2, 6, 3, 11, + 19, 11, 0, 21, 20, 32, 20, 30, 20, 24, + 26, 20, 18, 8, 20, 8, 11, 9, 100, 102, + 98, 98, 96, 110, 104, 108, 108, 102, 116, 108, + 96, 78, 44, 124, 124, 122, 120, 96, 92, 64, + 54, 48, 38, 18, 6, 7, 25, 118, 120, 120, + 110, 100, 102, 94, 86, 74, 66, 56, 44, 24, + 8, 11, 6, 6, 15, 38, 36, 30, 20, 18, + 18, 4, 6, 1, 9, 37, 23, 43, 61, 12, + 13, 23, 33, 23, 7, 21, 0, 16, 13, 19, + 2, 10, 6, 10, 22, 8, 4, 112, 88, 60, + 38, 20, 7, 29, 59, 81, 5, 78, 56, 46, + 26, 30, 12, 4, 0, 23, 29, 19, 3, 17, + 6, 18, 7, 13, 4, 12, 12, 12, 24, 16, + 6, 112, 
88, 60, 38, 20, 7, 29, 59, 81, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 17 */ + + 114, 14, 27, 114, 14, 27, 67, 31, 34, 22, + 10, 30, 54, 92, 114, 36, 47, 7, 1, 3, + 12, 8, 5, 1, 4, 42, 42, 2, 31, 65, + 31, 43, 15, 1, 3, 12, 23, 25, 22, 24, + 5, 25, 43, 4, 27, 39, 65, 12, 13, 29, + 8, 27, 29, 51, 1, 23, 15, 39, 5, 4, + 44, 0, 0, 0, 19, 53, 67, 1, 18, 3, + 70, 10, 37, 43, 15, 27, 20, 6, 25, 17, + 53, 19, 47, 37, 49, 33, 39, 45, 47, 48, + 3, 9, 51, 13, 55, 35, 61, 10, 0, 2, + 11, 28, 9, 5, 29, 51, 27, 31, 27, 7, + 21, 43, 31, 7, 53, 33, 38, 22, 19, 29, + 4, 17, 5, 0, 4, 20, 2, 9, 33, 37, + 15, 18, 20, 33, 18, 16, 10, 10, 12, 10, + 12, 8, 8, 12, 8, 4, 8, 10, 21, 5, + 10, 9, 26, 38, 44, 30, 14, 38, 24, 30, + 10, 4, 20, 18, 40, 51, 28, 50, 44, 78, + 102, 58, 68, 96, 96, 54, 80, 82, 98, 124, + 38, 64, 46, 58, 76, 62, 84, 96, 110, 44, + 44, 68, 56, 124, 1, 106, 114, 106, 102, 100, + 90, 84, 82, 74, 60, 52, 44, 32, 22, 15, + 23, 21, 43, 28, 26, 20, 22, 0, 0, 1, + 18, 15, 19, 19, 41, 27, 83, 47, 16, 20, + 20, 6, 3, 13, 21, 25, 53, 29, 56, 32, + 20, 0, 4, 17, 33, 41, 57, 1, 60, 44, + 34, 30, 16, 4, 1, 9, 25, 9, 78, 60, + 42, 12, 22, 1, 19, 33, 16, 104, 90, 74, + 54, 46, 8, 7, 9, 19, 124, 79, 71, 49, + 69, 67, 45, 57, 49, 39, 51, 47, 45, 41, + 43, 45, 57, 41, 37, 27, 27, 23, 17, 13, + 11, 9, 3, 11, 13, 25, 4, 6, 6, 4, + 0, 6, 6, 8, 4, 2, 2, 6, 1, 9, + 17, 9, 2, 21, 18, 32, 18, 30, 20, 24, + 26, 20, 18, 8, 20, 6, 11, 11, 98, 100, + 98, 96, 94, 106, 100, 104, 104, 98, 112, 104, + 90, 74, 40, 122, 120, 114, 112, 90, 86, 60, + 50, 44, 36, 16, 4, 7, 25, 114, 116, 116, + 106, 96, 98, 90, 80, 70, 62, 52, 40, 22, + 6, 13, 6, 6, 17, 36, 34, 28, 16, 16, + 16, 2, 4, 3, 11, 37, 25, 43, 61, 10, + 15, 25, 31, 21, 5, 19, 4, 20, 11, 17, + 4, 12, 8, 12, 24, 8, 4, 110, 84, 56, + 32, 14, 15, 37, 67, 85, 3, 78, 58, 48, + 28, 32, 14, 6, 2, 21, 27, 17, 1, 15, + 8, 20, 5, 11, 6, 14, 12, 14, 26, 16, + 6, 110, 84, 56, 32, 14, 15, 37, 67, 85, + }, + + { + /* 
Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 18 */ + + 112, 14, 27, 112, 14, 27, 63, 29, 34, 22, + 10, 28, 50, 90, 114, 36, 43, 7, 1, 1, + 12, 6, 5, 3, 0, 40, 36, 5, 37, 69, + 25, 41, 15, 1, 1, 12, 23, 23, 22, 22, + 5, 25, 41, 4, 27, 39, 65, 12, 13, 29, + 8, 27, 29, 51, 1, 23, 15, 39, 5, 4, + 44, 0, 0, 0, 17, 53, 67, 1, 16, 3, + 68, 10, 37, 39, 13, 25, 24, 10, 23, 15, + 49, 17, 45, 35, 45, 33, 39, 45, 47, 48, + 3, 7, 47, 13, 53, 33, 57, 10, 0, 2, + 13, 28, 9, 5, 27, 49, 25, 29, 25, 7, + 19, 41, 29, 5, 51, 31, 48, 30, 13, 23, + 4, 17, 5, 0, 4, 20, 2, 9, 33, 33, + 15, 14, 16, 27, 18, 16, 10, 10, 12, 10, + 16, 10, 8, 12, 8, 4, 10, 12, 23, 5, + 10, 9, 24, 36, 42, 30, 12, 38, 24, 30, + 8, 4, 18, 16, 38, 51, 28, 48, 42, 74, + 96, 56, 64, 92, 92, 50, 76, 78, 86, 124, + 30, 60, 42, 54, 70, 58, 78, 88, 102, 38, + 40, 62, 52, 124, 5, 102, 110, 102, 98, 96, + 86, 80, 78, 70, 56, 48, 40, 28, 18, 19, + 23, 21, 45, 26, 24, 18, 20, 1, 1, 3, + 14, 17, 19, 21, 43, 29, 81, 45, 18, 20, + 20, 6, 1, 11, 19, 23, 49, 27, 56, 32, + 20, 2, 6, 15, 29, 37, 53, 0, 62, 46, + 36, 32, 18, 4, 1, 9, 23, 9, 80, 60, + 40, 12, 22, 1, 19, 33, 16, 104, 88, 72, + 52, 46, 8, 7, 9, 19, 124, 77, 69, 47, + 65, 63, 43, 53, 45, 37, 47, 43, 41, 39, + 41, 45, 55, 37, 37, 27, 27, 21, 17, 13, + 11, 9, 5, 11, 13, 25, 4, 6, 8, 4, + 0, 6, 6, 8, 4, 2, 2, 4, 1, 9, + 17, 9, 4, 23, 16, 32, 16, 28, 18, 22, + 26, 18, 16, 8, 20, 4, 11, 13, 96, 98, + 96, 92, 90, 102, 96, 100, 100, 92, 106, 98, + 84, 68, 36, 114, 112, 106, 102, 84, 80, 54, + 46, 38, 32, 12, 2, 9, 27, 110, 112, 110, + 102, 90, 92, 84, 74, 66, 56, 48, 34, 18, + 2, 17, 4, 4, 21, 34, 32, 24, 12, 12, + 12, 1, 2, 7, 13, 39, 27, 45, 63, 6, + 17, 27, 31, 21, 3, 19, 6, 22, 11, 15, + 6, 14, 8, 12, 26, 8, 4, 108, 80, 50, + 26, 8, 23, 45, 75, 91, 3, 78, 58, 48, + 28, 34, 14, 6, 2, 21, 27, 17, 1, 15, + 10, 22, 5, 11, 6, 14, 12, 14, 28, 16, + 6, 108, 80, 50, 26, 8, 23, 45, 75, 91, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, 
qp = 19 */ + + 110, 14, 27, 110, 14, 27, 59, 25, 36, 22, + 8, 24, 46, 88, 114, 36, 39, 7, 0, 0, + 12, 6, 5, 3, 1, 38, 32, 11, 41, 71, + 19, 39, 15, 0, 0, 12, 25, 21, 22, 22, + 5, 25, 41, 4, 27, 39, 65, 12, 13, 27, + 8, 27, 27, 51, 1, 23, 15, 39, 5, 4, + 44, 0, 0, 0, 17, 53, 67, 0, 14, 3, + 66, 8, 37, 33, 11, 23, 26, 14, 19, 11, + 45, 13, 43, 33, 41, 33, 39, 43, 47, 48, + 3, 5, 43, 13, 51, 33, 53, 10, 1, 0, + 13, 28, 9, 3, 27, 47, 23, 29, 23, 5, + 17, 37, 25, 5, 49, 29, 58, 40, 7, 17, + 4, 17, 5, 2, 4, 20, 4, 7, 31, 29, + 15, 10, 12, 21, 18, 16, 10, 10, 12, 12, + 18, 12, 8, 12, 8, 4, 10, 14, 25, 5, + 10, 11, 24, 34, 42, 30, 12, 38, 24, 30, + 6, 4, 16, 14, 38, 51, 28, 48, 42, 70, + 90, 54, 60, 88, 86, 46, 70, 74, 72, 124, + 24, 56, 38, 50, 64, 52, 72, 80, 94, 32, + 36, 54, 48, 124, 9, 98, 106, 98, 92, 90, + 82, 76, 74, 66, 52, 44, 36, 24, 16, 23, + 23, 21, 45, 24, 22, 16, 18, 3, 3, 5, + 10, 19, 21, 23, 45, 29, 81, 41, 20, 22, + 20, 6, 0, 11, 19, 23, 45, 25, 56, 32, + 22, 4, 8, 13, 27, 35, 49, 2, 62, 46, + 36, 32, 20, 6, 1, 9, 23, 9, 80, 60, + 38, 12, 22, 1, 19, 31, 16, 102, 88, 70, + 50, 46, 8, 7, 9, 19, 124, 75, 65, 45, + 63, 61, 41, 49, 41, 35, 45, 39, 37, 39, + 39, 43, 53, 33, 35, 27, 27, 21, 15, 13, + 11, 9, 5, 11, 13, 27, 6, 8, 10, 4, + 0, 8, 6, 8, 4, 2, 2, 4, 1, 9, + 17, 9, 4, 25, 14, 32, 14, 26, 16, 22, + 26, 16, 14, 8, 20, 2, 11, 15, 94, 96, + 94, 90, 86, 98, 92, 96, 94, 88, 100, 92, + 78, 64, 32, 106, 104, 98, 92, 76, 74, 50, + 42, 34, 28, 8, 1, 11, 29, 106, 106, 106, + 96, 84, 88, 80, 68, 60, 52, 42, 28, 14, + 1, 19, 2, 2, 23, 30, 28, 20, 8, 10, + 8, 5, 0, 9, 15, 41, 29, 47, 65, 4, + 19, 29, 31, 19, 3, 17, 8, 24, 11, 15, + 6, 14, 8, 14, 28, 8, 4, 106, 76, 46, + 20, 2, 31, 53, 83, 97, 3, 78, 58, 48, + 28, 34, 16, 6, 2, 19, 27, 17, 0, 13, + 12, 24, 5, 9, 6, 14, 12, 14, 28, 16, + 6, 106, 76, 46, 20, 2, 31, 53, 83, 97, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 20 */ + + 106, 14, 27, 106, 14, 27, 57, 23, 36, 22, + 8, 22, 42, 
86, 114, 34, 35, 7, 0, 2, + 10, 4, 5, 5, 5, 36, 26, 19, 47, 75, + 13, 37, 15, 0, 2, 10, 25, 19, 22, 20, + 5, 23, 39, 4, 27, 39, 65, 12, 13, 27, + 8, 27, 27, 49, 1, 23, 15, 39, 3, 4, + 44, 0, 0, 0, 15, 55, 67, 0, 12, 3, + 64, 8, 37, 29, 9, 21, 30, 18, 17, 9, + 43, 11, 39, 31, 37, 33, 37, 43, 47, 48, + 3, 3, 39, 13, 51, 31, 49, 10, 1, 0, + 15, 28, 9, 3, 25, 45, 21, 27, 21, 5, + 17, 35, 23, 3, 47, 27, 68, 48, 1, 11, + 4, 15, 5, 2, 4, 20, 4, 7, 31, 25, + 15, 6, 8, 17, 18, 18, 10, 10, 12, 12, + 22, 14, 10, 10, 8, 4, 12, 14, 27, 5, + 10, 11, 22, 32, 40, 28, 10, 38, 24, 30, + 4, 4, 14, 12, 36, 51, 26, 46, 40, 66, + 82, 52, 56, 84, 82, 42, 66, 70, 60, 124, + 16, 52, 34, 44, 58, 48, 64, 70, 86, 26, + 32, 48, 44, 124, 11, 94, 102, 92, 88, 86, + 78, 72, 70, 62, 46, 40, 32, 20, 12, 27, + 23, 21, 47, 22, 20, 14, 14, 5, 7, 7, + 8, 21, 21, 25, 45, 31, 79, 39, 22, 22, + 20, 8, 0, 9, 17, 21, 41, 21, 56, 34, + 22, 4, 10, 11, 23, 31, 47, 2, 64, 48, + 38, 34, 22, 6, 1, 7, 21, 9, 82, 60, + 38, 10, 22, 1, 19, 31, 16, 102, 86, 68, + 48, 46, 8, 5, 9, 19, 124, 71, 63, 43, + 59, 57, 39, 47, 39, 33, 41, 37, 33, 37, + 37, 43, 51, 31, 35, 27, 27, 19, 15, 13, + 13, 9, 7, 11, 13, 27, 6, 8, 12, 4, + 0, 8, 6, 8, 4, 2, 2, 2, 1, 9, + 17, 9, 6, 27, 12, 32, 12, 26, 14, 20, + 26, 14, 12, 8, 20, 0, 11, 17, 92, 94, + 92, 86, 82, 94, 88, 90, 90, 82, 94, 86, + 72, 58, 28, 96, 96, 90, 82, 70, 66, 44, + 36, 28, 24, 4, 3, 13, 29, 100, 102, 100, + 92, 78, 82, 74, 62, 56, 46, 38, 22, 10, + 5, 23, 0, 2, 27, 28, 26, 18, 4, 6, + 4, 7, 1, 13, 17, 43, 33, 49, 65, 0, + 23, 33, 29, 19, 1, 17, 10, 26, 11, 13, + 8, 16, 8, 14, 30, 8, 4, 104, 72, 40, + 14, 5, 39, 63, 91, 103, 1, 78, 58, 48, + 28, 36, 16, 6, 2, 19, 27, 17, 0, 13, + 12, 26, 5, 9, 8, 16, 12, 14, 30, 16, + 6, 104, 72, 40, 14, 5, 39, 63, 91, 103, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 21 */ + + 104, 14, 27, 104, 14, 27, 53, 19, 36, 22, + 6, 20, 38, 84, 114, 34, 31, 7, 0, 4, + 10, 4, 5, 7, 9, 34, 22, 25, 51, 79, + 7, 35, 
13, 0, 4, 10, 25, 17, 22, 20, + 5, 23, 37, 4, 27, 39, 65, 12, 13, 27, + 8, 27, 25, 49, 1, 23, 15, 39, 3, 4, + 44, 0, 0, 0, 15, 55, 67, 2, 10, 3, + 62, 6, 37, 23, 7, 19, 34, 22, 13, 5, + 39, 7, 37, 29, 31, 33, 37, 41, 45, 48, + 3, 1, 35, 13, 49, 31, 45, 10, 3, 1, + 15, 28, 9, 1, 25, 43, 19, 27, 19, 3, + 15, 31, 19, 1, 43, 25, 78, 56, 4, 5, + 6, 15, 3, 4, 4, 20, 6, 7, 29, 21, + 15, 2, 4, 11, 18, 18, 10, 10, 12, 14, + 24, 16, 10, 10, 8, 4, 12, 16, 29, 3, + 10, 13, 22, 30, 40, 28, 10, 38, 24, 30, + 2, 6, 12, 12, 36, 51, 26, 44, 38, 62, + 76, 50, 54, 80, 78, 38, 60, 66, 48, 124, + 10, 48, 30, 40, 52, 42, 58, 62, 78, 22, + 28, 40, 40, 124, 15, 90, 98, 88, 84, 82, + 74, 68, 66, 58, 42, 36, 28, 18, 8, 31, + 23, 21, 47, 20, 18, 12, 12, 7, 9, 9, + 4, 23, 23, 27, 47, 33, 77, 35, 24, 24, + 20, 8, 2, 9, 17, 19, 37, 19, 56, 34, + 24, 6, 12, 9, 21, 29, 43, 4, 66, 50, + 40, 34, 24, 8, 0, 7, 21, 9, 82, 60, + 36, 10, 22, 1, 19, 29, 16, 100, 84, 66, + 46, 46, 8, 5, 9, 17, 124, 69, 59, 39, + 55, 55, 37, 43, 35, 31, 37, 33, 29, 35, + 35, 41, 49, 27, 35, 27, 25, 19, 13, 13, + 13, 9, 7, 11, 13, 29, 8, 8, 14, 4, + 0, 8, 6, 8, 4, 2, 2, 2, 1, 9, + 17, 9, 6, 29, 10, 32, 10, 24, 14, 18, + 26, 14, 10, 8, 20, 1, 11, 19, 90, 92, + 90, 82, 78, 90, 84, 86, 84, 76, 88, 80, + 66, 54, 24, 88, 88, 82, 72, 64, 60, 40, + 32, 24, 20, 2, 7, 13, 31, 96, 96, 96, + 88, 72, 76, 70, 56, 52, 42, 34, 16, 6, + 9, 25, 0, 0, 31, 26, 22, 14, 0, 2, + 0, 11, 3, 15, 19, 43, 35, 51, 67, 1, + 25, 35, 29, 19, 0, 15, 12, 28, 9, 11, + 8, 16, 8, 16, 32, 8, 4, 102, 68, 34, + 8, 11, 47, 71, 99, 109, 1, 78, 58, 48, + 28, 36, 16, 8, 4, 17, 27, 17, 2, 13, + 14, 28, 5, 9, 8, 16, 12, 16, 30, 16, + 6, 102, 68, 34, 8, 11, 47, 71, 99, 109, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 22 */ + + 102, 14, 29, 102, 14, 29, 49, 17, 38, 22, + 6, 16, 34, 82, 114, 34, 29, 7, 2, 4, + 10, 2, 3, 7, 11, 30, 16, 33, 57, 81, + 3, 33, 13, 2, 4, 10, 27, 17, 22, 18, + 5, 23, 37, 4, 27, 37, 63, 12, 11, 25, + 8, 25, 25, 
49, 1, 23, 15, 39, 3, 4, + 44, 0, 0, 0, 13, 55, 67, 2, 10, 5, + 62, 6, 37, 19, 5, 17, 36, 28, 11, 3, + 35, 5, 35, 27, 27, 33, 37, 41, 45, 48, + 3, 0, 29, 13, 47, 29, 41, 8, 3, 1, + 17, 30, 9, 1, 23, 43, 19, 25, 17, 3, + 13, 29, 17, 1, 41, 23, 88, 66, 10, 0, + 6, 15, 3, 4, 4, 20, 6, 5, 29, 17, + 13, 1, 0, 5, 18, 18, 10, 10, 14, 14, + 28, 18, 10, 10, 8, 4, 14, 18, 31, 3, + 10, 13, 20, 28, 38, 28, 8, 38, 24, 30, + 1, 6, 10, 10, 34, 51, 26, 44, 38, 58, + 70, 48, 50, 74, 72, 34, 56, 62, 34, 124, + 2, 44, 28, 36, 44, 38, 52, 54, 68, 16, + 24, 34, 36, 124, 19, 86, 94, 84, 78, 76, + 70, 62, 62, 54, 38, 30, 24, 14, 6, 33, + 23, 21, 49, 18, 16, 10, 10, 9, 11, 11, + 0, 25, 23, 29, 49, 33, 77, 33, 26, 24, + 20, 8, 4, 7, 15, 19, 33, 17, 58, 34, + 24, 8, 14, 7, 17, 25, 39, 6, 66, 50, + 40, 36, 26, 8, 0, 7, 19, 9, 84, 60, + 34, 10, 22, 1, 19, 29, 14, 100, 84, 64, + 44, 44, 8, 5, 9, 17, 124, 67, 57, 37, + 53, 51, 35, 39, 31, 29, 35, 29, 25, 35, + 33, 41, 47, 23, 33, 27, 25, 17, 13, 13, + 13, 9, 9, 11, 13, 29, 8, 10, 16, 4, + 0, 10, 8, 8, 6, 4, 2, 0, 1, 9, + 17, 9, 8, 31, 8, 32, 8, 22, 12, 18, + 26, 12, 10, 8, 18, 5, 13, 21, 86, 90, + 88, 80, 74, 86, 80, 82, 80, 72, 82, 76, + 60, 48, 20, 80, 80, 74, 64, 56, 54, 34, + 28, 18, 16, 1, 9, 15, 33, 92, 92, 90, + 82, 66, 72, 64, 50, 46, 36, 28, 10, 2, + 11, 29, 1, 1, 33, 22, 20, 10, 3, 0, + 3, 15, 5, 19, 21, 45, 37, 53, 69, 5, + 27, 37, 29, 17, 0, 15, 14, 30, 9, 11, + 10, 18, 8, 16, 34, 8, 4, 100, 64, 30, + 2, 17, 55, 79, 107, 115, 1, 78, 58, 48, + 28, 38, 18, 8, 4, 17, 25, 15, 2, 11, + 16, 30, 3, 7, 8, 16, 12, 16, 32, 16, + 6, 100, 64, 30, 2, 17, 55, 79, 107, 115, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 23 */ + + 100, 14, 29, 100, 14, 29, 45, 13, 38, 22, + 4, 14, 30, 80, 114, 34, 25, 7, 2, 6, + 8, 2, 3, 9, 15, 28, 12, 39, 61, 85, + 2, 31, 13, 2, 6, 8, 27, 15, 22, 18, + 5, 21, 35, 4, 27, 37, 63, 12, 11, 25, + 8, 25, 23, 47, 1, 23, 15, 39, 1, 4, + 44, 0, 0, 0, 13, 55, 67, 4, 8, 5, + 60, 4, 37, 13, 3, 
15, 40, 32, 7, 0, + 33, 1, 31, 25, 23, 33, 35, 39, 45, 48, + 3, 2, 25, 13, 45, 29, 37, 8, 5, 3, + 17, 30, 9, 0, 23, 41, 17, 25, 15, 1, + 11, 25, 13, 0, 39, 21, 98, 74, 16, 6, + 6, 13, 3, 6, 4, 20, 8, 5, 27, 13, + 13, 5, 3, 0, 18, 20, 10, 10, 14, 16, + 30, 20, 12, 8, 8, 4, 14, 18, 33, 3, + 10, 15, 20, 26, 38, 26, 8, 38, 24, 30, + 3, 6, 8, 8, 34, 51, 26, 42, 36, 54, + 64, 46, 46, 70, 68, 30, 50, 58, 22, 124, + 3, 40, 24, 30, 38, 32, 46, 44, 60, 10, + 20, 26, 32, 124, 21, 82, 90, 80, 74, 72, + 66, 58, 58, 50, 32, 26, 20, 10, 2, 37, + 23, 21, 49, 16, 14, 8, 6, 11, 13, 13, + 1, 27, 25, 31, 49, 35, 75, 29, 28, 26, + 20, 10, 4, 7, 15, 17, 29, 13, 58, 36, + 26, 8, 16, 5, 15, 23, 35, 6, 68, 52, + 42, 36, 28, 10, 0, 5, 19, 9, 84, 60, + 34, 8, 22, 1, 19, 27, 14, 98, 82, 62, + 42, 44, 8, 3, 9, 17, 124, 63, 53, 35, + 49, 49, 33, 37, 29, 27, 31, 27, 21, 33, + 31, 39, 45, 19, 33, 27, 25, 17, 11, 13, + 15, 9, 9, 11, 13, 31, 10, 10, 18, 4, + 0, 10, 8, 8, 6, 4, 2, 0, 1, 9, + 17, 9, 8, 33, 6, 32, 6, 22, 10, 16, + 26, 10, 8, 8, 18, 7, 13, 23, 84, 88, + 86, 76, 70, 82, 76, 76, 74, 66, 76, 70, + 54, 44, 16, 70, 72, 66, 54, 50, 48, 30, + 24, 14, 12, 5, 13, 17, 33, 86, 86, 86, + 78, 60, 66, 60, 44, 42, 32, 24, 4, 1, + 15, 31, 3, 1, 37, 20, 16, 8, 7, 3, + 7, 17, 7, 21, 23, 47, 41, 55, 69, 7, + 31, 41, 27, 17, 2, 13, 16, 32, 9, 9, + 10, 18, 8, 18, 36, 8, 4, 98, 60, 24, + 3, 25, 63, 87, 115, 121, 0, 78, 58, 48, + 28, 38, 18, 8, 4, 15, 25, 15, 4, 11, + 18, 32, 3, 7, 10, 18, 12, 16, 32, 16, + 6, 98, 60, 24, 3, 25, 63, 87, 115, 121, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 24 */ + + 96, 12, 29, 96, 12, 29, 43, 11, 38, 22, + 4, 10, 24, 78, 114, 32, 21, 9, 2, 8, + 8, 0, 3, 11, 19, 26, 6, 47, 67, 89, + 8, 29, 13, 2, 8, 8, 29, 13, 22, 16, + 5, 21, 35, 2, 29, 37, 63, 12, 11, 25, + 8, 25, 23, 47, 1, 23, 15, 39, 1, 4, + 44, 0, 0, 0, 11, 57, 67, 4, 6, 5, + 58, 4, 39, 9, 1, 13, 42, 36, 5, 2, + 29, 0, 29, 23, 19, 33, 35, 39, 45, 48, + 3, 4, 21, 13, 45, 27, 33, 8, 5, 
3, + 19, 30, 9, 0, 21, 39, 15, 23, 15, 1, + 11, 23, 11, 0, 37, 19, 106, 82, 22, 10, + 6, 13, 3, 6, 2, 20, 8, 5, 27, 9, + 13, 9, 7, 4, 18, 20, 10, 10, 14, 16, + 34, 20, 12, 8, 6, 4, 16, 20, 37, 3, + 10, 15, 18, 22, 36, 26, 6, 38, 24, 28, + 5, 6, 6, 6, 32, 51, 24, 40, 34, 50, + 56, 42, 42, 66, 62, 24, 46, 54, 8, 124, + 11, 36, 20, 26, 32, 28, 38, 36, 52, 4, + 16, 20, 28, 124, 25, 78, 84, 74, 68, 66, + 60, 54, 52, 46, 28, 22, 16, 6, 1, 41, + 23, 23, 51, 14, 10, 6, 4, 13, 17, 17, + 5, 29, 25, 33, 51, 37, 75, 27, 30, 26, + 20, 10, 6, 5, 13, 17, 25, 11, 58, 36, + 26, 10, 18, 3, 11, 19, 33, 8, 68, 52, + 42, 38, 30, 10, 0, 5, 17, 11, 86, 60, + 32, 8, 22, 1, 19, 27, 14, 98, 80, 60, + 40, 44, 8, 3, 9, 17, 124, 61, 51, 33, + 47, 45, 31, 33, 25, 25, 29, 23, 17, 33, + 31, 39, 43, 17, 33, 27, 25, 15, 11, 13, + 15, 9, 11, 11, 13, 31, 10, 10, 20, 4, + 1, 10, 8, 6, 6, 4, 2, 1, 1, 9, + 17, 9, 10, 35, 4, 32, 2, 20, 8, 14, + 24, 8, 6, 8, 18, 9, 13, 27, 82, 84, + 84, 72, 66, 78, 72, 72, 70, 60, 70, 64, + 48, 38, 12, 62, 64, 56, 44, 42, 40, 24, + 18, 8, 8, 9, 15, 19, 35, 82, 82, 80, + 72, 54, 60, 54, 38, 36, 26, 18, 1, 5, + 19, 35, 5, 3, 41, 16, 14, 4, 11, 7, + 11, 21, 11, 25, 27, 49, 43, 57, 71, 11, + 33, 43, 27, 17, 2, 13, 18, 34, 9, 9, + 12, 20, 8, 18, 36, 8, 2, 96, 56, 18, + 9, 31, 71, 97, 125, 125, 0, 78, 58, 48, + 28, 40, 18, 8, 4, 15, 25, 15, 4, 11, + 18, 32, 3, 7, 10, 18, 12, 16, 34, 16, + 4, 96, 56, 18, 9, 31, 71, 97, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 25 */ + + 94, 12, 29, 94, 12, 29, 39, 9, 40, 22, + 4, 8, 20, 76, 116, 32, 17, 9, 4, 10, + 8, 1, 3, 11, 21, 24, 2, 55, 73, 91, + 14, 27, 11, 4, 10, 8, 29, 11, 22, 16, + 3, 21, 33, 2, 29, 37, 63, 12, 11, 23, + 8, 25, 23, 47, 1, 23, 15, 37, 1, 4, + 44, 0, 0, 0, 9, 57, 67, 4, 4, 5, + 56, 4, 39, 5, 2, 11, 46, 40, 1, 4, + 25, 4, 27, 19, 13, 33, 35, 39, 43, 48, + 3, 6, 17, 11, 43, 25, 29, 8, 5, 3, + 19, 30, 9, 0, 19, 37, 13, 21, 13, 0, + 9, 21, 7, 2, 33, 17, 116, 92, 30, 16, + 8, 13, 
1, 8, 2, 20, 8, 3, 25, 3, + 13, 13, 9, 10, 18, 20, 10, 10, 14, 18, + 38, 22, 12, 8, 6, 4, 18, 22, 39, 1, + 10, 15, 16, 20, 34, 26, 4, 38, 24, 28, + 7, 8, 4, 6, 30, 51, 24, 40, 34, 46, + 50, 40, 40, 62, 58, 20, 42, 50, 3, 124, + 17, 32, 16, 22, 26, 24, 32, 28, 44, 0, + 12, 14, 24, 124, 29, 74, 80, 70, 64, 62, + 56, 50, 48, 42, 24, 18, 14, 4, 3, 45, + 23, 23, 51, 14, 8, 4, 2, 15, 19, 19, + 9, 31, 25, 35, 53, 37, 73, 25, 32, 26, + 20, 10, 8, 3, 11, 15, 19, 9, 58, 36, + 26, 12, 20, 1, 7, 15, 29, 10, 70, 54, + 44, 40, 32, 10, 2, 5, 15, 11, 88, 60, + 30, 8, 24, 0, 19, 25, 14, 98, 80, 58, + 38, 44, 8, 3, 9, 15, 124, 59, 49, 29, + 43, 41, 29, 29, 21, 23, 25, 19, 13, 31, + 29, 37, 41, 13, 31, 25, 23, 13, 9, 13, + 15, 9, 13, 11, 13, 31, 10, 12, 22, 6, + 1, 12, 8, 6, 6, 4, 2, 3, 1, 7, + 15, 9, 12, 35, 2, 32, 0, 18, 8, 14, + 24, 8, 4, 8, 18, 11, 13, 29, 80, 82, + 84, 70, 62, 74, 68, 68, 66, 56, 64, 58, + 42, 32, 8, 54, 56, 48, 34, 36, 34, 18, + 14, 4, 6, 11, 17, 19, 37, 78, 78, 76, + 68, 50, 56, 50, 32, 32, 22, 14, 5, 9, + 23, 39, 5, 5, 43, 14, 12, 0, 15, 9, + 13, 25, 13, 29, 29, 49, 45, 57, 73, 15, + 35, 45, 27, 15, 4, 13, 20, 38, 7, 7, + 14, 22, 8, 20, 38, 8, 2, 94, 52, 14, + 15, 37, 79, 105, 125, 125, 0, 78, 58, 50, + 30, 42, 20, 10, 6, 15, 25, 15, 4, 9, + 20, 34, 3, 5, 10, 18, 12, 18, 36, 16, + 4, 94, 52, 14, 15, 37, 79, 105, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 26 */ + + 92, 12, 29, 92, 12, 29, 35, 5, 40, 22, + 2, 6, 16, 74, 116, 32, 13, 9, 4, 12, + 6, 1, 3, 13, 25, 22, 3, 61, 77, 95, + 20, 25, 11, 4, 12, 6, 29, 9, 22, 14, + 3, 19, 31, 2, 29, 37, 63, 12, 11, 23, + 8, 25, 21, 45, 1, 23, 15, 37, 0, 4, + 44, 0, 0, 0, 9, 57, 67, 6, 2, 5, + 54, 2, 39, 0, 4, 9, 50, 44, 0, 8, + 23, 6, 23, 17, 9, 33, 33, 37, 43, 48, + 3, 8, 13, 11, 41, 25, 25, 8, 7, 5, + 21, 30, 9, 2, 19, 35, 11, 21, 11, 0, + 7, 17, 5, 4, 31, 15, 124, 100, 36, 22, + 8, 11, 1, 8, 2, 20, 10, 3, 25, 0, + 13, 17, 13, 16, 18, 22, 10, 10, 14, 18, + 40, 24, 14, 6, 6, 
4, 18, 22, 41, 1, + 10, 17, 16, 18, 34, 24, 4, 38, 24, 28, + 9, 8, 2, 4, 30, 51, 24, 38, 32, 42, + 44, 38, 36, 58, 54, 16, 36, 46, 15, 124, + 25, 28, 12, 16, 20, 18, 26, 18, 36, 5, + 8, 6, 20, 124, 31, 70, 76, 66, 60, 58, + 52, 46, 44, 38, 18, 14, 10, 0, 7, 49, + 23, 23, 53, 12, 6, 2, 1, 17, 21, 21, + 11, 33, 27, 37, 53, 39, 71, 21, 34, 28, + 20, 12, 8, 3, 11, 13, 15, 5, 58, 38, + 28, 12, 22, 0, 5, 13, 25, 10, 72, 56, + 46, 40, 34, 12, 2, 3, 15, 11, 88, 60, + 30, 6, 24, 0, 19, 25, 14, 96, 78, 56, + 36, 44, 8, 1, 9, 15, 124, 55, 45, 27, + 39, 39, 27, 27, 19, 21, 21, 17, 9, 29, + 27, 37, 39, 9, 31, 25, 23, 13, 9, 13, + 17, 9, 13, 11, 13, 33, 12, 12, 24, 6, + 1, 12, 8, 6, 6, 4, 2, 3, 1, 7, + 15, 9, 12, 37, 0, 32, 1, 18, 6, 12, + 24, 6, 2, 8, 18, 13, 13, 31, 78, 80, + 82, 66, 58, 70, 64, 62, 60, 50, 58, 52, + 36, 28, 4, 44, 48, 40, 24, 30, 28, 14, + 10, 1, 2, 15, 21, 21, 37, 72, 72, 70, + 64, 44, 50, 44, 26, 28, 16, 10, 11, 13, + 27, 41, 7, 5, 47, 12, 8, 1, 19, 13, + 17, 27, 15, 31, 31, 51, 49, 59, 73, 17, + 39, 49, 25, 15, 6, 11, 22, 40, 7, 5, + 14, 22, 8, 20, 40, 8, 2, 92, 48, 8, + 21, 45, 87, 113, 125, 125, 2, 78, 58, 50, + 30, 42, 20, 10, 6, 13, 25, 15, 6, 9, + 22, 36, 3, 5, 12, 20, 12, 18, 36, 16, + 4, 92, 48, 8, 21, 45, 87, 113, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 27 */ + + 90, 12, 31, 90, 12, 31, 31, 3, 42, 22, + 2, 2, 12, 72, 116, 32, 11, 9, 6, 12, + 6, 3, 1, 13, 27, 18, 7, 69, 83, 97, + 24, 23, 11, 6, 12, 6, 31, 9, 22, 14, + 3, 19, 31, 2, 29, 35, 61, 12, 9, 21, + 8, 23, 21, 45, 1, 23, 15, 37, 0, 4, + 44, 0, 0, 0, 7, 57, 67, 6, 2, 7, + 54, 2, 39, 4, 6, 7, 52, 50, 4, 10, + 19, 10, 21, 15, 5, 33, 33, 37, 43, 48, + 3, 10, 7, 11, 39, 23, 21, 6, 7, 5, + 21, 32, 9, 2, 17, 35, 11, 19, 9, 2, + 5, 15, 1, 4, 29, 13, 124, 110, 42, 28, + 8, 11, 1, 10, 2, 20, 10, 1, 23, 4, + 11, 21, 17, 22, 18, 22, 10, 10, 16, 20, + 44, 26, 14, 6, 6, 4, 20, 24, 43, 1, + 10, 17, 14, 16, 32, 24, 2, 38, 24, 28, + 13, 8, 0, 2, 28, 51, 24, 38, 
32, 38, + 38, 36, 32, 52, 48, 12, 32, 42, 29, 124, + 31, 24, 10, 12, 12, 14, 20, 10, 26, 11, + 4, 0, 16, 124, 35, 66, 72, 62, 54, 52, + 48, 40, 40, 34, 14, 8, 6, 3, 9, 51, + 23, 23, 53, 10, 4, 0, 3, 19, 23, 23, + 15, 35, 27, 39, 55, 39, 71, 19, 36, 28, + 20, 12, 10, 1, 9, 13, 11, 3, 60, 38, + 28, 14, 24, 2, 1, 9, 21, 12, 72, 56, + 46, 42, 36, 12, 2, 3, 13, 11, 90, 60, + 28, 6, 24, 0, 19, 23, 12, 96, 78, 54, + 34, 42, 8, 1, 9, 15, 124, 53, 43, 25, + 37, 35, 25, 23, 15, 19, 19, 13, 5, 29, + 25, 35, 37, 5, 29, 25, 23, 11, 7, 13, + 17, 9, 15, 11, 13, 33, 12, 14, 26, 6, + 1, 14, 10, 6, 8, 6, 2, 5, 1, 7, + 15, 9, 14, 39, 1, 32, 3, 16, 4, 12, + 24, 4, 2, 8, 16, 17, 15, 33, 74, 78, + 80, 64, 54, 66, 60, 58, 56, 46, 52, 48, + 30, 22, 0, 36, 40, 32, 16, 22, 22, 8, + 6, 5, 1, 19, 23, 23, 39, 68, 68, 66, + 58, 38, 46, 40, 20, 22, 12, 4, 17, 17, + 29, 45, 9, 7, 49, 8, 6, 5, 23, 15, + 21, 31, 17, 35, 33, 53, 51, 61, 75, 21, + 41, 51, 25, 13, 6, 11, 24, 42, 7, 5, + 16, 24, 8, 22, 42, 8, 2, 90, 44, 4, + 27, 51, 95, 121, 125, 125, 2, 78, 58, 50, + 30, 44, 22, 10, 6, 13, 23, 13, 6, 7, + 24, 38, 1, 3, 12, 20, 12, 18, 38, 16, + 4, 90, 44, 4, 27, 51, 95, 121, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 28 */ + + 86, 12, 31, 86, 12, 31, 29, 0, 42, 22, + 0, 0, 8, 70, 116, 30, 7, 9, 6, 14, + 6, 3, 1, 15, 31, 16, 13, 75, 87, 101, + 30, 21, 11, 6, 14, 6, 31, 7, 22, 12, + 3, 19, 29, 2, 29, 35, 61, 12, 9, 21, + 8, 23, 19, 45, 1, 23, 15, 37, 0, 4, + 44, 0, 0, 0, 7, 59, 67, 8, 0, 7, + 52, 0, 39, 10, 8, 5, 56, 54, 6, 14, + 15, 12, 19, 13, 1, 33, 33, 35, 43, 48, + 3, 12, 3, 11, 39, 23, 17, 6, 9, 7, + 23, 32, 9, 4, 17, 33, 9, 19, 7, 2, + 5, 11, 0, 6, 27, 11, 124, 118, 48, 34, + 8, 11, 1, 10, 2, 20, 12, 1, 23, 8, + 11, 25, 21, 26, 18, 22, 10, 10, 16, 20, + 46, 28, 14, 6, 6, 4, 20, 26, 45, 1, + 10, 19, 14, 14, 32, 24, 2, 38, 24, 28, + 15, 8, 1, 0, 28, 51, 22, 36, 30, 34, + 30, 34, 28, 48, 44, 8, 26, 38, 41, 124, + 39, 20, 6, 8, 6, 8, 12, 2, 18, 17, + 0, 
7, 12, 124, 39, 62, 68, 56, 50, 48, + 44, 36, 36, 30, 10, 4, 2, 7, 13, 55, + 23, 23, 55, 8, 2, 1, 5, 21, 27, 25, + 19, 37, 29, 41, 57, 41, 69, 15, 38, 30, + 20, 12, 12, 1, 9, 11, 7, 1, 60, 38, + 30, 16, 26, 4, 0, 7, 19, 14, 74, 58, + 48, 42, 38, 14, 2, 3, 13, 11, 90, 60, + 26, 6, 24, 0, 19, 23, 12, 94, 76, 52, + 32, 42, 8, 1, 9, 15, 124, 51, 39, 23, + 33, 33, 23, 19, 11, 17, 15, 9, 1, 27, + 23, 35, 35, 3, 29, 25, 23, 11, 7, 13, + 17, 9, 15, 11, 13, 35, 14, 14, 28, 6, + 1, 14, 10, 6, 8, 6, 2, 5, 1, 7, + 15, 9, 14, 41, 3, 32, 5, 14, 2, 10, + 24, 2, 0, 8, 16, 19, 15, 35, 72, 76, + 78, 60, 50, 62, 56, 54, 50, 40, 46, 42, + 24, 18, 3, 28, 32, 24, 6, 16, 14, 4, + 0, 11, 5, 23, 27, 25, 41, 64, 62, 60, + 54, 32, 40, 34, 14, 18, 6, 0, 23, 21, + 33, 47, 11, 9, 53, 6, 2, 9, 27, 19, + 25, 35, 19, 37, 35, 55, 53, 63, 77, 23, + 43, 53, 25, 13, 8, 9, 26, 44, 7, 3, + 16, 24, 8, 22, 44, 8, 2, 88, 40, 1, + 33, 57, 103, 125, 125, 125, 2, 78, 58, 50, + 30, 44, 22, 10, 6, 11, 23, 13, 8, 7, + 24, 40, 1, 3, 12, 20, 12, 18, 38, 16, + 4, 88, 40, 1, 33, 57, 103, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 29 */ + + 84, 12, 31, 84, 12, 31, 25, 2, 42, 22, + 0, 1, 4, 68, 116, 30, 3, 9, 6, 16, + 4, 5, 1, 17, 35, 14, 17, 83, 93, 105, + 36, 19, 9, 6, 16, 4, 31, 5, 22, 12, + 3, 17, 27, 2, 29, 35, 61, 12, 9, 21, + 8, 23, 19, 43, 1, 23, 15, 37, 2, 4, + 44, 0, 0, 0, 5, 59, 67, 8, 1, 7, + 50, 0, 39, 14, 10, 3, 60, 58, 10, 16, + 13, 16, 15, 11, 4, 33, 31, 35, 41, 48, + 3, 14, 0, 11, 37, 21, 13, 6, 9, 7, + 23, 32, 9, 4, 15, 31, 7, 17, 5, 4, + 3, 9, 4, 8, 23, 9, 124, 124, 54, 40, + 10, 9, 0, 12, 2, 20, 12, 1, 21, 12, + 11, 29, 25, 32, 18, 24, 10, 10, 16, 22, + 50, 30, 16, 4, 6, 4, 22, 26, 47, 0, + 10, 19, 12, 12, 30, 22, 0, 38, 24, 28, + 17, 10, 3, 0, 26, 51, 22, 34, 28, 30, + 24, 32, 26, 44, 40, 4, 22, 34, 53, 124, + 45, 16, 2, 2, 0, 4, 6, 7, 10, 21, + 3, 13, 8, 124, 41, 58, 64, 52, 46, 44, + 40, 32, 32, 26, 4, 0, 1, 9, 17, 59, + 23, 23, 55, 6, 0, 3, 9, 
23, 29, 27, + 21, 39, 29, 43, 57, 43, 67, 13, 40, 30, + 20, 14, 12, 0, 7, 9, 3, 2, 60, 40, + 30, 16, 28, 6, 4, 3, 15, 14, 76, 60, + 50, 44, 40, 14, 4, 1, 11, 11, 92, 60, + 26, 4, 24, 0, 19, 21, 12, 94, 74, 50, + 30, 42, 8, 0, 9, 13, 124, 47, 37, 19, + 29, 29, 21, 17, 9, 15, 11, 7, 2, 25, + 21, 33, 33, 0, 29, 25, 21, 9, 5, 13, + 19, 9, 17, 11, 13, 35, 14, 14, 30, 6, + 1, 14, 10, 6, 8, 6, 2, 7, 1, 7, + 15, 9, 16, 43, 5, 32, 7, 14, 2, 8, + 24, 2, 1, 8, 16, 21, 15, 37, 70, 74, + 76, 56, 46, 58, 52, 48, 46, 34, 40, 36, + 18, 12, 7, 18, 24, 16, 3, 10, 8, 1, + 3, 15, 9, 25, 29, 25, 41, 58, 58, 56, + 50, 26, 34, 30, 8, 14, 2, 3, 29, 25, + 37, 51, 11, 9, 57, 4, 0, 11, 31, 23, + 29, 37, 21, 41, 37, 55, 57, 65, 77, 27, + 47, 57, 23, 13, 10, 9, 28, 46, 5, 1, + 18, 26, 8, 24, 46, 8, 2, 86, 36, 7, + 39, 65, 111, 125, 125, 125, 4, 78, 58, 50, + 30, 46, 22, 12, 8, 11, 23, 13, 8, 7, + 26, 42, 1, 3, 14, 22, 12, 20, 40, 16, + 4, 86, 36, 7, 39, 65, 111, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 30 */ + + 82, 12, 31, 82, 12, 31, 21, 6, 44, 22, + 1, 5, 0, 66, 116, 30, 0, 9, 8, 18, + 4, 5, 1, 17, 37, 12, 23, 89, 97, 107, + 42, 17, 9, 8, 18, 4, 33, 3, 22, 10, + 3, 17, 27, 2, 29, 35, 61, 12, 9, 19, + 8, 23, 17, 43, 1, 23, 15, 37, 2, 4, + 44, 0, 0, 0, 5, 59, 67, 10, 3, 7, + 48, 1, 39, 20, 12, 1, 62, 62, 12, 20, + 9, 18, 13, 9, 8, 33, 31, 33, 41, 48, + 3, 16, 4, 11, 35, 21, 9, 6, 11, 9, + 25, 32, 9, 6, 15, 29, 5, 17, 3, 4, + 1, 5, 6, 8, 21, 7, 124, 124, 60, 46, + 10, 9, 0, 12, 2, 20, 14, 0, 21, 16, + 11, 33, 29, 38, 18, 24, 10, 10, 16, 22, + 52, 32, 16, 4, 6, 4, 22, 28, 49, 0, + 10, 21, 12, 10, 30, 22, 0, 38, 24, 28, + 19, 10, 5, 1, 26, 51, 22, 34, 28, 26, + 18, 30, 22, 40, 34, 0, 16, 30, 67, 124, + 53, 12, 1, 1, 5, 1, 0, 15, 2, 27, + 7, 21, 4, 124, 45, 54, 60, 48, 40, 38, + 36, 28, 28, 22, 0, 3, 5, 13, 19, 63, + 23, 23, 57, 4, 1, 5, 11, 25, 31, 29, + 25, 41, 31, 45, 59, 43, 67, 9, 42, 32, + 20, 14, 14, 0, 7, 9, 0, 4, 60, 40, + 32, 18, 30, 
8, 6, 1, 11, 16, 76, 60, + 50, 44, 42, 16, 4, 1, 11, 11, 92, 60, + 24, 4, 24, 0, 19, 21, 12, 92, 74, 48, + 28, 42, 8, 0, 9, 13, 124, 45, 33, 17, + 27, 27, 19, 13, 5, 13, 9, 3, 6, 25, + 19, 33, 31, 4, 27, 25, 21, 9, 5, 13, + 19, 9, 17, 11, 13, 37, 16, 16, 32, 6, + 1, 16, 10, 6, 8, 6, 2, 7, 1, 7, + 15, 9, 16, 45, 7, 32, 9, 12, 0, 8, + 24, 0, 3, 8, 16, 23, 15, 39, 68, 72, + 74, 54, 42, 54, 48, 44, 40, 30, 34, 30, + 12, 8, 11, 10, 16, 8, 13, 2, 2, 5, + 7, 21, 13, 29, 33, 27, 43, 54, 52, 50, + 44, 20, 30, 24, 2, 8, 3, 9, 35, 29, + 41, 53, 13, 11, 59, 0, 3, 15, 35, 25, + 33, 41, 23, 43, 39, 57, 59, 67, 79, 29, + 49, 59, 23, 11, 10, 7, 30, 48, 5, 1, + 18, 26, 8, 24, 48, 8, 2, 84, 32, 11, + 45, 71, 119, 125, 125, 125, 4, 78, 58, 50, + 30, 46, 24, 12, 8, 9, 23, 13, 10, 5, + 28, 44, 1, 1, 14, 22, 12, 20, 40, 16, + 4, 84, 32, 11, 45, 71, 119, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 31 */ + + 80, 12, 31, 80, 12, 31, 17, 8, 44, 22, + 1, 7, 3, 64, 116, 30, 4, 9, 8, 20, + 4, 7, 1, 19, 41, 10, 27, 97, 103, 111, + 48, 15, 9, 8, 20, 4, 33, 1, 22, 10, + 3, 17, 25, 2, 29, 35, 61, 12, 9, 19, + 8, 23, 17, 43, 1, 23, 15, 37, 2, 4, + 44, 0, 0, 0, 3, 59, 67, 10, 5, 7, + 46, 1, 39, 24, 14, 0, 66, 66, 16, 22, + 5, 22, 11, 7, 12, 33, 31, 33, 41, 48, + 3, 18, 8, 11, 33, 19, 5, 6, 11, 9, + 25, 32, 9, 6, 13, 27, 3, 15, 1, 6, + 0, 3, 10, 10, 19, 5, 124, 124, 66, 52, + 10, 9, 0, 14, 2, 20, 14, 0, 19, 20, + 11, 37, 33, 44, 18, 24, 10, 10, 16, 24, + 56, 34, 16, 4, 6, 4, 24, 30, 51, 0, + 10, 21, 10, 8, 28, 22, 1, 38, 24, 28, + 21, 10, 7, 3, 24, 51, 22, 32, 26, 22, + 12, 28, 18, 36, 30, 3, 12, 26, 79, 124, + 59, 8, 5, 5, 11, 5, 5, 23, 5, 33, + 11, 27, 0, 124, 49, 50, 56, 44, 36, 34, + 32, 24, 24, 18, 3, 7, 9, 17, 23, 67, + 23, 23, 57, 2, 3, 7, 13, 27, 33, 31, + 29, 43, 31, 47, 61, 45, 65, 7, 44, 32, + 20, 14, 16, 2, 5, 7, 4, 6, 60, 40, + 32, 20, 32, 10, 10, 2, 7, 18, 78, 62, + 52, 46, 44, 16, 4, 1, 9, 11, 94, 60, + 22, 4, 24, 0, 19, 19, 12, 92, 72, 
46, + 26, 42, 8, 0, 9, 13, 124, 43, 31, 15, + 23, 23, 17, 9, 1, 11, 5, 0, 10, 23, + 17, 31, 29, 8, 27, 25, 21, 7, 3, 13, + 19, 9, 19, 11, 13, 37, 16, 16, 34, 6, + 1, 16, 10, 6, 8, 6, 2, 9, 1, 7, + 15, 9, 18, 47, 9, 32, 11, 10, 1, 6, + 24, 1, 5, 8, 16, 25, 15, 41, 66, 70, + 72, 50, 38, 50, 44, 40, 36, 24, 28, 24, + 6, 2, 15, 2, 8, 0, 23, 3, 3, 11, + 11, 25, 17, 33, 35, 29, 45, 50, 48, 46, + 40, 14, 24, 20, 3, 4, 7, 13, 41, 33, + 45, 57, 15, 13, 63, 1, 5, 19, 39, 29, + 37, 45, 25, 47, 41, 59, 61, 69, 81, 33, + 51, 61, 23, 11, 12, 7, 32, 50, 5, 0, + 20, 28, 8, 26, 50, 8, 2, 82, 28, 17, + 51, 77, 125, 125, 125, 125, 4, 78, 58, 50, + 30, 48, 24, 12, 8, 9, 23, 13, 10, 5, + 30, 46, 1, 1, 14, 22, 12, 20, 42, 16, + 4, 82, 28, 17, 51, 77, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 32 */ + + 76, 10, 33, 76, 10, 33, 15, 10, 44, 22, + 3, 11, 9, 62, 116, 28, 6, 11, 8, 20, + 2, 9, 1, 21, 45, 6, 33, 105, 109, 115, + 52, 15, 9, 8, 20, 2, 35, 1, 22, 8, + 3, 17, 25, 0, 31, 35, 61, 10, 9, 19, + 8, 23, 17, 43, 1, 23, 17, 37, 2, 4, + 44, 0, 0, 0, 3, 61, 67, 10, 7, 9, + 44, 3, 41, 28, 16, 2, 68, 70, 18, 24, + 3, 24, 9, 5, 16, 33, 31, 33, 41, 48, + 3, 18, 12, 11, 33, 19, 3, 4, 13, 11, + 27, 32, 9, 6, 13, 27, 3, 15, 1, 6, + 0, 1, 12, 10, 17, 3, 124, 124, 72, 56, + 10, 9, 0, 14, 0, 20, 14, 0, 19, 24, + 11, 41, 37, 48, 18, 24, 10, 10, 16, 24, + 58, 34, 16, 2, 4, 4, 24, 30, 55, 0, + 10, 23, 8, 4, 26, 20, 3, 38, 24, 26, + 25, 10, 9, 5, 22, 51, 20, 30, 24, 16, + 4, 24, 14, 30, 24, 9, 6, 22, 93, 124, + 67, 2, 9, 11, 19, 11, 13, 33, 15, 39, + 15, 35, 3, 124, 53, 44, 50, 38, 30, 28, + 26, 18, 18, 14, 9, 13, 13, 21, 27, 71, + 23, 25, 59, 0, 7, 11, 17, 31, 37, 35, + 33, 45, 33, 49, 63, 47, 65, 5, 44, 32, + 20, 14, 16, 2, 5, 7, 8, 8, 60, 40, + 32, 20, 32, 12, 12, 4, 5, 18, 78, 62, + 52, 46, 46, 16, 4, 1, 9, 13, 94, 58, + 20, 2, 24, 0, 19, 19, 10, 90, 70, 42, + 24, 40, 8, 0, 9, 13, 124, 41, 29, 13, + 21, 21, 15, 7, 0, 9, 3, 2, 14, 23, + 17, 31, 
27, 10, 27, 25, 21, 7, 3, 13, + 21, 11, 21, 11, 15, 39, 16, 16, 36, 6, + 3, 16, 10, 4, 8, 6, 2, 11, 1, 7, + 15, 9, 18, 49, 11, 32, 15, 8, 3, 4, + 22, 3, 7, 8, 14, 29, 17, 45, 62, 66, + 70, 46, 34, 44, 38, 34, 30, 18, 22, 18, + 1, 3, 19, 7, 0, 9, 33, 11, 11, 17, + 17, 31, 21, 37, 39, 31, 47, 44, 42, 40, + 34, 8, 18, 14, 11, 1, 13, 19, 47, 37, + 49, 61, 17, 15, 67, 5, 9, 23, 45, 33, + 41, 49, 29, 51, 45, 61, 65, 71, 83, 37, + 55, 65, 23, 11, 12, 7, 34, 52, 5, 0, + 20, 28, 8, 26, 50, 8, 0, 78, 24, 23, + 59, 85, 125, 125, 125, 125, 4, 78, 58, 50, + 30, 48, 24, 12, 8, 9, 23, 13, 10, 5, + 30, 46, 1, 1, 14, 22, 12, 20, 42, 14, + 2, 78, 24, 23, 59, 85, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 33 */ + + 74, 10, 33, 74, 10, 33, 11, 14, 46, 24, + 3, 13, 13, 60, 118, 28, 10, 11, 10, 22, + 2, 9, 0, 21, 47, 4, 37, 111, 113, 117, + 58, 13, 7, 10, 22, 2, 35, 0, 22, 8, + 1, 15, 23, 0, 31, 33, 59, 10, 7, 17, + 8, 21, 15, 41, 0, 21, 17, 35, 4, 4, + 44, 0, 0, 0, 1, 61, 67, 12, 7, 9, + 44, 3, 41, 34, 20, 6, 72, 76, 22, 28, + 0, 28, 5, 1, 22, 33, 29, 31, 39, 48, + 3, 20, 18, 9, 31, 17, 0, 4, 13, 11, + 27, 34, 9, 8, 11, 25, 1, 13, 0, 8, + 2, 2, 16, 12, 13, 0, 124, 124, 80, 62, + 12, 7, 2, 16, 0, 22, 16, 2, 17, 30, + 9, 45, 39, 54, 18, 26, 10, 12, 18, 26, + 62, 36, 18, 2, 4, 4, 26, 32, 57, 2, + 10, 23, 8, 2, 26, 20, 3, 38, 24, 26, + 27, 12, 9, 5, 22, 51, 20, 30, 24, 12, + 1, 22, 12, 26, 20, 13, 2, 18, 105, 124, + 73, 1, 11, 15, 25, 15, 19, 41, 23, 43, + 19, 41, 7, 124, 55, 40, 46, 34, 26, 24, + 22, 14, 14, 10, 13, 17, 15, 23, 29, 73, + 23, 25, 59, 0, 9, 13, 19, 33, 39, 37, + 35, 47, 33, 51, 63, 47, 63, 1, 46, 34, + 20, 16, 18, 4, 3, 5, 14, 12, 62, 42, + 34, 22, 34, 16, 16, 8, 1, 20, 80, 64, + 54, 48, 48, 18, 6, 0, 7, 13, 96, 58, + 20, 2, 26, 2, 19, 17, 10, 90, 70, 40, + 22, 40, 10, 2, 7, 11, 124, 37, 25, 9, + 17, 17, 11, 3, 4, 5, 0, 6, 20, 21, + 15, 29, 23, 14, 25, 23, 19, 5, 1, 11, + 21, 11, 21, 11, 15, 39, 18, 18, 38, 8, + 3, 18, 
12, 4, 10, 8, 2, 11, 0, 5, + 13, 7, 20, 49, 13, 32, 17, 8, 3, 4, + 22, 3, 7, 8, 14, 31, 17, 47, 60, 64, + 70, 44, 32, 40, 34, 30, 26, 14, 18, 14, + 7, 7, 23, 15, 5, 17, 41, 17, 17, 21, + 21, 35, 23, 39, 41, 31, 47, 40, 38, 36, + 30, 4, 14, 10, 17, 5, 17, 23, 51, 39, + 51, 63, 17, 15, 69, 7, 11, 25, 49, 35, + 43, 51, 31, 53, 47, 61, 67, 71, 83, 39, + 57, 67, 21, 9, 14, 5, 38, 56, 3, 2, + 22, 30, 10, 28, 52, 8, 0, 76, 20, 27, + 65, 91, 125, 125, 125, 125, 6, 78, 60, 52, + 32, 50, 26, 14, 10, 7, 21, 11, 12, 3, + 32, 48, 0, 0, 16, 24, 12, 22, 44, 14, + 2, 76, 20, 27, 65, 91, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 34 */ + + 72, 10, 33, 72, 10, 33, 7, 16, 46, 24, + 3, 15, 17, 58, 118, 28, 14, 11, 10, 24, + 2, 11, 0, 23, 51, 2, 43, 119, 119, 121, + 64, 11, 7, 10, 24, 2, 35, 2, 22, 6, + 1, 15, 21, 0, 31, 33, 59, 10, 7, 17, + 8, 21, 15, 41, 0, 21, 17, 35, 4, 4, + 44, 0, 0, 0, 0, 61, 67, 12, 9, 9, + 42, 3, 41, 38, 22, 8, 76, 80, 24, 30, + 4, 30, 3, 0, 26, 33, 29, 31, 39, 48, + 3, 22, 22, 9, 29, 15, 4, 4, 13, 11, + 29, 34, 9, 8, 9, 23, 0, 11, 2, 8, + 4, 4, 18, 14, 11, 2, 124, 124, 86, 68, + 12, 7, 2, 16, 0, 22, 16, 2, 17, 34, + 9, 49, 43, 60, 18, 26, 10, 12, 18, 26, + 66, 38, 18, 2, 4, 4, 28, 34, 59, 2, + 10, 23, 6, 0, 24, 20, 5, 38, 24, 26, + 29, 12, 11, 7, 20, 51, 20, 28, 22, 8, + 7, 20, 8, 22, 16, 17, 1, 14, 117, 124, + 81, 5, 15, 19, 31, 19, 25, 49, 31, 49, + 23, 47, 11, 124, 59, 36, 42, 30, 22, 20, + 18, 10, 10, 6, 17, 21, 19, 27, 33, 77, + 23, 25, 61, 1, 11, 15, 21, 35, 41, 39, + 39, 49, 33, 53, 65, 49, 61, 0, 48, 34, + 20, 16, 20, 6, 1, 3, 18, 14, 62, 42, + 34, 24, 36, 18, 20, 12, 2, 22, 82, 66, + 56, 50, 50, 18, 6, 0, 5, 13, 98, 58, + 18, 2, 26, 2, 19, 17, 10, 90, 68, 38, + 20, 40, 10, 2, 7, 11, 124, 35, 23, 7, + 13, 13, 9, 0, 8, 3, 4, 10, 24, 19, + 13, 29, 21, 18, 25, 23, 19, 3, 1, 11, + 21, 11, 23, 11, 15, 39, 18, 18, 40, 8, + 3, 18, 12, 4, 10, 8, 2, 13, 0, 5, + 13, 7, 22, 51, 15, 32, 19, 6, 5, 2, + 22, 5, 
9, 8, 14, 33, 17, 49, 58, 62, + 68, 40, 28, 36, 30, 26, 22, 8, 12, 8, + 13, 13, 27, 23, 13, 25, 51, 23, 23, 27, + 25, 41, 27, 43, 43, 33, 49, 36, 34, 30, + 26, 1, 8, 4, 23, 9, 23, 27, 57, 43, + 55, 67, 19, 17, 73, 9, 13, 29, 53, 39, + 47, 55, 33, 57, 49, 63, 69, 73, 85, 43, + 59, 69, 21, 9, 16, 5, 40, 58, 3, 4, + 24, 32, 10, 28, 54, 8, 0, 74, 16, 33, + 71, 97, 125, 125, 125, 125, 6, 78, 60, 52, + 32, 52, 26, 14, 10, 7, 21, 11, 12, 3, + 34, 50, 0, 0, 16, 24, 12, 22, 46, 14, + 2, 74, 16, 33, 71, 97, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 35 */ + + 70, 10, 33, 70, 10, 33, 3, 20, 48, 24, + 5, 19, 21, 56, 118, 28, 18, 11, 12, 26, + 2, 11, 0, 23, 53, 0, 47, 125, 123, 123, + 70, 9, 7, 12, 26, 2, 37, 4, 22, 6, + 1, 15, 21, 0, 31, 33, 59, 10, 7, 15, + 8, 21, 13, 41, 0, 21, 17, 35, 4, 4, + 44, 0, 0, 0, 0, 61, 67, 14, 11, 9, + 40, 5, 41, 44, 24, 10, 78, 84, 28, 34, + 8, 34, 1, 2, 30, 33, 29, 29, 39, 48, + 3, 24, 26, 9, 27, 15, 8, 4, 15, 13, + 29, 34, 9, 10, 9, 21, 2, 11, 4, 10, + 6, 8, 22, 14, 9, 4, 124, 124, 92, 74, + 12, 7, 2, 18, 0, 22, 18, 4, 15, 38, + 9, 53, 47, 66, 18, 26, 10, 12, 18, 28, + 68, 40, 18, 2, 4, 4, 28, 36, 61, 2, + 10, 25, 6, 1, 24, 20, 5, 38, 24, 26, + 31, 12, 13, 9, 20, 51, 20, 28, 22, 4, + 13, 18, 4, 18, 10, 21, 7, 10, 125, 124, + 87, 9, 19, 23, 37, 25, 31, 57, 39, 55, + 27, 55, 15, 124, 63, 32, 38, 26, 16, 14, + 14, 6, 6, 2, 21, 25, 23, 31, 35, 81, + 23, 25, 61, 3, 13, 17, 23, 37, 43, 41, + 43, 51, 35, 55, 67, 49, 61, 4, 50, 36, + 20, 16, 22, 6, 1, 3, 22, 16, 62, 42, + 36, 26, 38, 20, 22, 14, 6, 24, 82, 66, + 56, 50, 52, 20, 6, 0, 5, 13, 98, 58, + 16, 2, 26, 2, 19, 15, 10, 88, 68, 36, + 18, 40, 10, 2, 7, 11, 124, 33, 19, 5, + 11, 11, 7, 4, 12, 1, 6, 14, 28, 19, + 11, 27, 19, 22, 23, 23, 19, 3, 0, 11, + 21, 11, 23, 11, 15, 41, 20, 20, 42, 8, + 3, 20, 12, 4, 10, 8, 2, 13, 0, 5, + 13, 7, 22, 53, 17, 32, 21, 4, 7, 2, + 22, 7, 11, 8, 14, 35, 17, 51, 56, 60, + 66, 38, 24, 32, 26, 22, 16, 4, 6, 2, + 
19, 17, 31, 31, 21, 33, 61, 31, 29, 31, + 29, 45, 31, 47, 47, 35, 51, 32, 28, 26, + 20, 7, 4, 0, 29, 15, 27, 33, 63, 47, + 59, 69, 21, 19, 75, 13, 17, 33, 57, 41, + 51, 59, 35, 59, 51, 65, 71, 75, 87, 45, + 61, 71, 21, 7, 16, 3, 42, 60, 3, 4, + 24, 32, 10, 30, 56, 8, 0, 72, 12, 37, + 77, 103, 125, 125, 125, 125, 6, 78, 60, 52, + 32, 52, 28, 14, 10, 5, 21, 11, 14, 1, + 36, 52, 0, 2, 16, 24, 12, 22, 46, 14, + 2, 72, 12, 37, 77, 103, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 36 */ + + 66, 10, 33, 66, 10, 33, 1, 22, 48, 24, + 5, 21, 25, 54, 118, 26, 22, 11, 12, 28, + 0, 13, 0, 25, 57, 1, 53, 125, 125, 125, + 76, 7, 7, 12, 28, 0, 37, 6, 22, 4, + 1, 13, 19, 0, 31, 33, 59, 10, 7, 15, + 8, 21, 13, 39, 0, 21, 17, 35, 6, 4, + 44, 0, 0, 0, 2, 63, 67, 14, 13, 9, + 38, 5, 41, 48, 26, 12, 82, 88, 30, 36, + 10, 36, 2, 4, 34, 33, 27, 29, 39, 48, + 3, 26, 30, 9, 27, 13, 12, 4, 15, 13, + 31, 34, 9, 10, 7, 19, 4, 9, 6, 10, + 6, 10, 24, 16, 7, 6, 124, 124, 98, 80, + 12, 5, 2, 18, 0, 22, 18, 4, 15, 42, + 9, 57, 51, 70, 18, 28, 10, 12, 18, 28, + 72, 42, 20, 0, 4, 4, 30, 36, 63, 2, + 10, 25, 4, 3, 22, 18, 7, 38, 24, 26, + 33, 12, 15, 11, 18, 51, 18, 26, 20, 0, + 21, 16, 0, 14, 6, 25, 11, 6, 125, 124, + 95, 13, 23, 29, 43, 29, 39, 67, 47, 61, + 31, 61, 19, 124, 65, 28, 34, 20, 12, 10, + 10, 2, 2, 1, 27, 29, 27, 35, 39, 85, + 23, 25, 63, 5, 15, 19, 27, 39, 47, 43, + 45, 53, 35, 57, 67, 51, 59, 6, 52, 36, + 20, 18, 22, 8, 0, 1, 26, 20, 62, 44, + 36, 26, 40, 22, 26, 18, 8, 24, 84, 68, + 58, 52, 54, 20, 6, 2, 3, 13, 100, 58, + 16, 0, 26, 2, 19, 15, 10, 88, 66, 34, + 16, 40, 10, 4, 7, 11, 124, 29, 17, 3, + 7, 7, 5, 6, 14, 0, 10, 16, 32, 17, + 9, 27, 17, 24, 23, 23, 19, 1, 0, 11, + 23, 11, 25, 11, 15, 41, 20, 20, 44, 8, + 3, 20, 12, 4, 10, 8, 2, 15, 0, 5, + 13, 7, 24, 55, 19, 32, 23, 4, 9, 0, + 22, 9, 13, 8, 14, 37, 17, 53, 54, 58, + 64, 34, 20, 28, 22, 16, 12, 1, 0, 3, + 25, 23, 35, 41, 29, 41, 71, 37, 37, 37, + 35, 51, 35, 51, 49, 37, 
51, 26, 24, 20, + 16, 13, 1, 5, 35, 19, 33, 37, 69, 51, + 63, 73, 23, 19, 79, 15, 19, 35, 61, 45, + 55, 61, 37, 63, 53, 67, 75, 77, 87, 49, + 65, 75, 19, 7, 18, 3, 44, 62, 3, 6, + 26, 34, 10, 30, 58, 8, 0, 70, 8, 43, + 83, 111, 125, 125, 125, 125, 8, 78, 60, 52, + 32, 54, 28, 14, 10, 5, 21, 11, 14, 1, + 36, 54, 0, 2, 18, 26, 12, 22, 48, 14, + 2, 70, 8, 43, 83, 111, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 37 */ + + 64, 10, 33, 64, 10, 33, 2, 26, 48, 24, + 7, 23, 29, 52, 118, 26, 26, 11, 12, 30, + 0, 13, 0, 27, 61, 3, 57, 125, 125, 125, + 82, 5, 5, 12, 30, 0, 37, 8, 22, 4, + 1, 13, 17, 0, 31, 33, 59, 10, 7, 15, + 8, 21, 11, 39, 0, 21, 17, 35, 6, 4, + 44, 0, 0, 0, 2, 63, 67, 16, 15, 9, + 36, 7, 41, 54, 28, 14, 86, 92, 34, 40, + 14, 40, 4, 6, 40, 33, 27, 27, 37, 48, + 3, 28, 34, 9, 25, 13, 16, 4, 17, 15, + 31, 34, 9, 12, 7, 17, 6, 9, 8, 12, + 8, 14, 28, 18, 3, 8, 124, 124, 104, 86, + 14, 5, 4, 20, 0, 22, 20, 4, 13, 46, + 9, 61, 55, 76, 18, 28, 10, 12, 18, 30, + 74, 44, 20, 0, 4, 4, 30, 38, 65, 4, + 10, 27, 4, 5, 22, 18, 7, 38, 24, 26, + 35, 14, 17, 11, 18, 51, 18, 24, 18, 3, + 27, 14, 1, 10, 2, 29, 17, 2, 125, 124, + 101, 17, 27, 33, 49, 35, 45, 75, 55, 65, + 35, 69, 23, 124, 69, 24, 30, 16, 8, 6, + 6, 1, 1, 5, 31, 33, 31, 37, 43, 89, + 23, 25, 63, 7, 17, 21, 29, 41, 49, 45, + 49, 55, 37, 59, 69, 53, 57, 10, 54, 38, + 20, 18, 24, 8, 0, 0, 30, 22, 62, 44, + 38, 28, 42, 24, 28, 20, 12, 26, 86, 70, + 60, 52, 56, 22, 8, 2, 3, 13, 100, 58, + 14, 0, 26, 2, 19, 13, 10, 86, 64, 32, + 14, 40, 10, 4, 7, 9, 124, 27, 13, 0, + 3, 5, 3, 10, 18, 2, 14, 20, 36, 15, + 7, 25, 15, 28, 23, 23, 17, 1, 2, 11, + 23, 11, 25, 11, 15, 43, 22, 20, 46, 8, + 3, 20, 12, 4, 10, 8, 2, 15, 0, 5, + 13, 7, 24, 57, 21, 32, 25, 2, 9, 1, + 22, 9, 15, 8, 14, 39, 17, 55, 52, 56, + 62, 30, 16, 24, 18, 12, 6, 7, 5, 9, + 31, 27, 39, 49, 37, 49, 81, 43, 43, 41, + 39, 55, 39, 53, 53, 37, 53, 22, 18, 16, + 12, 19, 7, 9, 41, 23, 37, 41, 75, 55, + 67, 75, 
23, 21, 83, 17, 23, 39, 65, 49, + 59, 65, 39, 65, 55, 67, 77, 79, 89, 51, + 67, 77, 19, 7, 20, 1, 46, 64, 1, 8, + 26, 34, 10, 32, 60, 8, 0, 68, 4, 49, + 89, 117, 125, 125, 125, 125, 8, 78, 60, 52, + 32, 54, 28, 16, 12, 3, 21, 11, 16, 1, + 38, 56, 0, 2, 18, 26, 12, 24, 48, 14, + 2, 68, 4, 49, 89, 117, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 38 */ + + 62, 10, 35, 62, 10, 35, 6, 28, 50, 24, + 7, 27, 33, 50, 118, 26, 28, 11, 14, 30, + 0, 15, 2, 27, 63, 7, 63, 125, 125, 125, + 86, 3, 5, 14, 30, 0, 39, 8, 22, 2, + 1, 13, 17, 0, 31, 31, 57, 10, 5, 13, + 8, 19, 11, 39, 0, 21, 17, 35, 6, 4, + 44, 0, 0, 0, 4, 63, 67, 16, 15, 11, + 36, 7, 41, 58, 30, 16, 88, 98, 36, 42, + 18, 42, 6, 8, 44, 33, 27, 27, 37, 48, + 3, 30, 40, 9, 23, 11, 20, 2, 17, 15, + 33, 36, 9, 12, 5, 17, 6, 7, 10, 12, + 10, 16, 30, 18, 1, 10, 124, 124, 110, 92, + 14, 5, 4, 20, 0, 22, 20, 6, 13, 50, + 7, 65, 59, 82, 18, 28, 10, 12, 20, 30, + 78, 46, 20, 0, 4, 4, 32, 40, 67, 4, + 10, 27, 2, 7, 20, 18, 9, 38, 24, 26, + 39, 14, 19, 13, 16, 51, 18, 24, 18, 7, + 33, 12, 5, 4, 3, 33, 21, 1, 125, 124, + 109, 21, 29, 37, 57, 39, 51, 83, 65, 71, + 39, 75, 27, 124, 73, 20, 26, 12, 2, 0, + 2, 7, 5, 9, 35, 39, 35, 41, 45, 91, + 23, 25, 65, 9, 19, 23, 31, 43, 51, 47, + 53, 57, 37, 61, 71, 53, 57, 12, 56, 38, + 20, 18, 26, 10, 2, 0, 34, 24, 64, 44, + 38, 30, 44, 26, 32, 24, 16, 28, 86, 70, + 60, 54, 58, 22, 8, 2, 1, 13, 102, 58, + 12, 0, 26, 2, 19, 13, 8, 86, 64, 30, + 12, 38, 10, 4, 7, 9, 124, 25, 11, 2, + 1, 1, 1, 14, 22, 4, 16, 24, 40, 15, + 5, 25, 13, 32, 21, 23, 17, 0, 2, 11, + 23, 11, 27, 11, 15, 43, 22, 22, 48, 8, + 3, 22, 14, 4, 12, 10, 2, 17, 0, 5, + 13, 7, 26, 59, 23, 32, 27, 0, 11, 1, + 22, 11, 15, 8, 12, 43, 19, 57, 48, 54, + 60, 28, 12, 20, 14, 8, 2, 11, 11, 13, + 37, 33, 43, 57, 45, 57, 89, 51, 49, 47, + 43, 61, 43, 57, 55, 39, 55, 18, 14, 10, + 6, 25, 11, 15, 47, 29, 43, 47, 81, 59, + 69, 79, 25, 23, 85, 21, 25, 43, 69, 51, + 63, 69, 41, 69, 57, 69, 
79, 81, 91, 55, + 69, 79, 19, 5, 20, 1, 48, 66, 1, 8, + 28, 36, 10, 32, 62, 8, 0, 66, 0, 53, + 95, 123, 125, 125, 125, 125, 8, 78, 60, 52, + 32, 56, 30, 16, 12, 3, 19, 9, 16, 0, + 40, 58, 2, 4, 18, 26, 12, 24, 50, 14, + 2, 66, 0, 53, 95, 123, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 39 */ + + 60, 10, 35, 60, 10, 35, 10, 32, 50, 24, + 9, 29, 37, 48, 118, 26, 32, 11, 14, 32, + 1, 15, 2, 29, 67, 9, 67, 125, 125, 125, + 92, 1, 5, 14, 32, 1, 39, 10, 22, 2, + 1, 11, 15, 0, 31, 31, 57, 10, 5, 13, + 8, 19, 9, 37, 0, 21, 17, 35, 8, 4, + 44, 0, 0, 0, 4, 63, 67, 18, 17, 11, + 34, 9, 41, 64, 32, 18, 92, 102, 40, 46, + 20, 46, 10, 10, 48, 33, 25, 25, 37, 48, + 3, 32, 44, 9, 21, 11, 24, 2, 19, 17, + 33, 36, 9, 14, 5, 15, 8, 7, 12, 14, + 12, 20, 34, 20, 0, 12, 124, 124, 116, 98, + 14, 3, 4, 22, 0, 22, 22, 6, 11, 54, + 7, 69, 63, 88, 18, 30, 10, 12, 20, 32, + 80, 48, 22, 1, 4, 4, 32, 40, 69, 4, + 10, 29, 2, 9, 20, 16, 9, 38, 24, 26, + 41, 14, 21, 15, 16, 51, 18, 22, 16, 11, + 39, 10, 9, 0, 7, 37, 27, 5, 125, 124, + 115, 25, 33, 43, 63, 45, 57, 93, 73, 77, + 43, 83, 31, 124, 75, 16, 22, 8, 1, 3, + 1, 11, 9, 13, 41, 43, 39, 45, 49, 95, + 23, 25, 65, 11, 21, 25, 35, 45, 53, 49, + 55, 59, 39, 63, 71, 55, 55, 16, 58, 40, + 20, 20, 26, 10, 2, 2, 38, 28, 64, 46, + 40, 30, 46, 28, 34, 26, 20, 28, 88, 72, + 62, 54, 60, 24, 8, 4, 1, 13, 102, 58, + 12, 1, 26, 2, 19, 11, 8, 84, 62, 28, + 10, 38, 10, 6, 7, 9, 124, 21, 7, 4, + 2, 0, 0, 16, 24, 6, 20, 26, 44, 13, + 3, 23, 11, 36, 21, 23, 17, 0, 4, 11, + 25, 11, 27, 11, 15, 45, 24, 22, 50, 8, + 3, 22, 14, 4, 12, 10, 2, 17, 0, 5, + 13, 7, 26, 61, 25, 32, 29, 0, 13, 3, + 22, 13, 17, 8, 12, 45, 19, 59, 46, 52, + 58, 24, 8, 16, 10, 2, 3, 17, 17, 19, + 43, 37, 47, 67, 53, 65, 99, 57, 55, 51, + 47, 65, 47, 61, 59, 41, 55, 12, 8, 6, + 2, 31, 17, 19, 53, 33, 47, 51, 87, 63, + 73, 81, 27, 23, 89, 23, 29, 45, 73, 55, + 67, 71, 43, 71, 59, 71, 83, 83, 91, 57, + 73, 83, 17, 5, 22, 0, 50, 68, 1, 10, + 
28, 36, 10, 34, 64, 8, 0, 64, 3, 59, + 101, 125, 125, 125, 125, 125, 10, 78, 60, 52, + 32, 56, 30, 16, 12, 1, 19, 9, 18, 0, + 42, 60, 2, 4, 20, 28, 12, 24, 50, 14, + 2, 64, 3, 59, 101, 125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 40 */ + + 56, 8, 35, 56, 8, 35, 12, 34, 50, 24, + 9, 33, 43, 46, 118, 24, 36, 13, 14, 34, + 1, 17, 2, 31, 71, 11, 73, 125, 125, 125, + 98, 0, 5, 14, 34, 1, 41, 12, 22, 0, + 1, 11, 15, 1, 33, 31, 57, 10, 5, 13, + 8, 19, 9, 37, 0, 21, 17, 35, 8, 4, + 44, 0, 0, 0, 6, 65, 67, 18, 19, 11, + 32, 9, 43, 68, 34, 20, 94, 106, 42, 48, + 24, 48, 12, 12, 52, 33, 25, 25, 37, 48, + 3, 34, 48, 9, 21, 9, 28, 2, 19, 17, + 35, 36, 9, 14, 3, 13, 10, 5, 12, 14, + 12, 22, 36, 20, 2, 14, 124, 124, 122, 102, + 14, 3, 4, 22, 1, 22, 22, 6, 11, 58, + 7, 73, 67, 92, 18, 30, 10, 12, 20, 32, + 84, 48, 22, 1, 2, 4, 34, 42, 73, 4, + 10, 29, 0, 13, 18, 16, 11, 38, 24, 24, + 43, 14, 23, 17, 14, 51, 16, 20, 14, 15, + 47, 6, 13, 3, 13, 43, 31, 9, 125, 124, + 123, 29, 37, 47, 69, 49, 65, 101, 81, 83, + 47, 89, 35, 124, 79, 12, 16, 2, 7, 9, + 7, 15, 15, 17, 45, 47, 43, 49, 53, 99, + 23, 27, 67, 13, 25, 27, 37, 47, 57, 53, + 59, 61, 39, 65, 73, 57, 55, 18, 60, 40, + 20, 20, 28, 12, 4, 2, 42, 30, 64, 46, + 40, 32, 48, 30, 38, 30, 22, 30, 88, 72, + 62, 56, 62, 24, 8, 4, 0, 15, 104, 58, + 10, 1, 26, 2, 19, 11, 8, 84, 60, 26, + 8, 38, 10, 6, 7, 9, 124, 19, 5, 6, + 4, 4, 2, 20, 28, 8, 22, 30, 48, 13, + 3, 23, 9, 38, 21, 23, 17, 2, 4, 11, + 25, 11, 29, 11, 15, 45, 24, 22, 52, 8, + 5, 22, 14, 2, 12, 10, 2, 19, 0, 5, + 13, 7, 28, 63, 27, 32, 33, 1, 15, 5, + 20, 15, 19, 8, 12, 47, 19, 63, 44, 48, + 56, 20, 4, 12, 6, 1, 7, 23, 23, 25, + 49, 43, 51, 75, 61, 75, 109, 65, 63, 57, + 53, 71, 51, 65, 61, 43, 57, 8, 4, 0, + 3, 37, 23, 25, 59, 39, 53, 57, 93, 67, + 77, 85, 29, 25, 93, 27, 31, 49, 77, 59, + 71, 75, 47, 75, 63, 73, 85, 85, 93, 61, + 75, 85, 17, 5, 22, 0, 52, 70, 1, 10, + 30, 38, 10, 34, 64, 8, 1, 62, 7, 65, + 107, 125, 
125, 125, 125, 125, 10, 78, 60, 52, + 32, 58, 30, 16, 12, 1, 19, 9, 18, 0, + 42, 60, 2, 4, 20, 28, 12, 24, 52, 14, + 0, 62, 7, 65, 107, 125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 41 */ + + 54, 8, 35, 54, 8, 35, 16, 36, 52, 24, + 9, 35, 47, 44, 120, 24, 40, 13, 16, 36, + 1, 19, 2, 31, 73, 13, 77, 125, 125, 125, + 104, 2, 3, 16, 36, 1, 41, 14, 22, 0, + 0, 11, 13, 1, 33, 31, 57, 10, 5, 11, + 8, 19, 9, 37, 0, 21, 17, 33, 8, 4, + 44, 0, 0, 0, 8, 65, 67, 18, 21, 11, + 30, 9, 43, 72, 38, 22, 98, 110, 46, 50, + 28, 52, 14, 16, 58, 33, 25, 25, 35, 48, + 3, 36, 52, 7, 19, 7, 32, 2, 19, 17, + 35, 36, 9, 14, 1, 11, 12, 3, 14, 16, + 14, 24, 40, 22, 6, 16, 124, 124, 124, 108, + 16, 3, 6, 24, 1, 22, 22, 8, 9, 64, + 7, 77, 69, 98, 18, 30, 10, 12, 20, 34, + 88, 50, 22, 1, 2, 4, 36, 44, 75, 6, + 10, 29, 1, 15, 16, 16, 13, 38, 24, 24, + 45, 16, 25, 17, 12, 51, 16, 20, 14, 19, + 53, 4, 15, 7, 17, 47, 35, 13, 125, 124, + 125, 33, 41, 51, 75, 53, 71, 109, 89, 87, + 51, 95, 39, 124, 83, 8, 12, 1, 11, 13, + 11, 19, 19, 21, 49, 51, 45, 51, 55, 103, + 23, 27, 67, 13, 27, 29, 39, 49, 59, 55, + 63, 63, 39, 67, 75, 57, 53, 20, 62, 40, + 20, 20, 30, 14, 6, 4, 48, 32, 64, 46, + 40, 34, 50, 32, 42, 34, 26, 32, 90, 74, + 64, 58, 64, 24, 10, 4, 2, 15, 106, 58, + 8, 1, 28, 4, 19, 9, 8, 84, 60, 24, + 6, 38, 10, 6, 7, 7, 124, 17, 3, 10, + 8, 8, 4, 24, 32, 10, 26, 34, 52, 11, + 1, 21, 7, 42, 19, 21, 15, 4, 6, 11, + 25, 11, 31, 11, 15, 45, 24, 24, 54, 10, + 5, 24, 14, 2, 12, 10, 2, 21, 0, 3, + 11, 7, 30, 63, 29, 32, 35, 3, 15, 5, + 20, 15, 21, 8, 12, 49, 19, 65, 42, 46, + 56, 18, 0, 8, 2, 5, 11, 27, 29, 31, + 55, 49, 55, 83, 69, 83, 119, 71, 69, 63, + 57, 75, 53, 67, 63, 43, 59, 4, 0, 3, + 7, 41, 27, 29, 65, 43, 57, 61, 97, 71, + 81, 89, 29, 27, 95, 29, 33, 53, 81, 61, + 73, 79, 49, 79, 65, 73, 87, 85, 95, 65, + 77, 87, 17, 3, 24, 0, 54, 74, 0, 12, + 32, 40, 10, 36, 66, 8, 1, 60, 11, 69, + 113, 125, 125, 125, 125, 125, 10, 78, 60, 54, + 34, 
60, 32, 18, 14, 1, 19, 9, 18, 2, + 44, 62, 2, 6, 20, 28, 12, 26, 54, 14, + 0, 60, 11, 69, 113, 125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 42 */ + + 52, 8, 35, 52, 8, 35, 20, 40, 52, 24, + 11, 37, 51, 42, 120, 24, 44, 13, 16, 38, + 3, 19, 2, 33, 77, 15, 83, 125, 125, 125, + 110, 4, 3, 16, 38, 3, 41, 16, 22, 1, + 0, 9, 11, 1, 33, 31, 57, 10, 5, 11, + 8, 19, 7, 35, 0, 21, 17, 33, 10, 4, + 44, 0, 0, 0, 8, 65, 67, 20, 23, 11, + 28, 11, 43, 78, 40, 24, 102, 114, 48, 54, + 30, 54, 18, 18, 62, 33, 23, 23, 35, 48, + 3, 38, 56, 7, 17, 7, 36, 2, 21, 19, + 37, 36, 9, 16, 1, 9, 14, 3, 16, 16, + 16, 28, 42, 24, 8, 18, 124, 124, 124, 114, + 16, 1, 6, 24, 1, 22, 24, 8, 9, 68, + 7, 81, 73, 104, 18, 32, 10, 12, 20, 34, + 90, 52, 24, 3, 2, 4, 36, 44, 77, 6, + 10, 31, 1, 17, 16, 14, 13, 38, 24, 24, + 47, 16, 27, 19, 12, 51, 16, 18, 12, 23, + 59, 2, 19, 11, 21, 51, 41, 17, 125, 124, + 125, 37, 45, 57, 81, 59, 77, 119, 97, 93, + 55, 103, 43, 124, 85, 4, 8, 5, 15, 17, + 15, 23, 23, 25, 55, 55, 49, 55, 59, 107, + 23, 27, 69, 15, 29, 31, 43, 51, 61, 57, + 65, 65, 41, 69, 75, 59, 51, 24, 64, 42, + 20, 22, 30, 14, 6, 6, 52, 36, 64, 48, + 42, 34, 52, 34, 44, 36, 30, 32, 92, 76, + 66, 58, 66, 26, 10, 6, 2, 15, 106, 58, + 8, 3, 28, 4, 19, 9, 8, 82, 58, 22, + 4, 38, 10, 8, 7, 7, 124, 13, 0, 12, + 12, 10, 6, 26, 34, 12, 30, 36, 56, 9, + 0, 21, 5, 46, 19, 21, 15, 4, 6, 11, + 27, 11, 31, 11, 15, 47, 26, 24, 56, 10, + 5, 24, 14, 2, 12, 10, 2, 21, 0, 3, + 11, 7, 30, 65, 31, 32, 37, 3, 17, 7, + 20, 17, 23, 8, 12, 51, 19, 67, 40, 44, + 54, 14, 3, 4, 1, 11, 17, 33, 35, 37, + 61, 53, 59, 93, 77, 91, 125, 77, 75, 67, + 61, 81, 57, 71, 67, 45, 59, 1, 5, 9, + 11, 47, 33, 35, 71, 47, 63, 65, 103, 75, + 85, 91, 31, 27, 99, 31, 37, 55, 85, 65, + 77, 81, 51, 81, 67, 75, 91, 87, 95, 67, + 81, 91, 15, 3, 26, 2, 56, 76, 0, 14, + 32, 40, 10, 36, 68, 8, 1, 58, 15, 75, + 119, 125, 125, 125, 125, 125, 12, 78, 60, 54, + 34, 60, 32, 18, 14, 0, 19, 9, 20, 2, + 
46, 64, 2, 6, 22, 30, 12, 26, 54, 14, + 0, 58, 15, 75, 119, 125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 43 */ + + 50, 8, 37, 50, 8, 37, 24, 42, 54, 24, + 11, 41, 55, 40, 120, 24, 46, 13, 18, 38, + 3, 21, 4, 33, 79, 19, 87, 125, 125, 125, + 114, 6, 3, 18, 38, 3, 43, 16, 22, 1, + 0, 9, 11, 1, 33, 29, 55, 10, 3, 9, + 8, 17, 7, 35, 0, 21, 17, 33, 10, 4, + 44, 0, 0, 0, 10, 65, 67, 20, 23, 13, + 28, 11, 43, 82, 42, 26, 104, 120, 52, 56, + 34, 58, 20, 20, 66, 33, 23, 23, 35, 48, + 3, 40, 62, 7, 15, 5, 40, 0, 21, 19, + 37, 38, 9, 16, 0, 9, 14, 1, 18, 18, + 18, 30, 46, 24, 10, 20, 124, 124, 124, 120, + 16, 1, 6, 26, 1, 22, 24, 10, 7, 72, + 5, 85, 77, 110, 18, 32, 10, 12, 22, 36, + 94, 54, 24, 3, 2, 4, 38, 46, 79, 6, + 10, 31, 3, 19, 14, 14, 15, 38, 24, 24, + 51, 16, 29, 21, 10, 51, 16, 18, 12, 27, + 65, 0, 23, 17, 27, 55, 45, 21, 125, 124, + 125, 41, 47, 61, 89, 63, 83, 125, 107, 99, + 59, 109, 47, 124, 89, 0, 4, 9, 21, 23, + 19, 29, 27, 29, 59, 61, 53, 59, 61, 109, + 23, 27, 69, 17, 31, 33, 45, 53, 63, 59, + 69, 67, 41, 71, 77, 59, 51, 26, 66, 42, + 20, 22, 32, 16, 8, 6, 56, 38, 66, 48, + 42, 36, 54, 36, 48, 40, 34, 34, 92, 76, + 66, 60, 68, 26, 10, 6, 4, 15, 108, 58, + 6, 3, 28, 4, 19, 7, 6, 82, 58, 20, + 2, 36, 10, 8, 7, 7, 124, 11, 2, 14, + 14, 14, 8, 30, 38, 14, 32, 40, 60, 9, + 2, 19, 3, 50, 17, 21, 15, 6, 8, 11, + 27, 11, 33, 11, 15, 47, 26, 26, 58, 10, + 5, 26, 16, 2, 14, 12, 2, 23, 0, 3, + 11, 7, 32, 67, 33, 32, 39, 5, 19, 7, + 20, 19, 23, 8, 10, 55, 21, 69, 36, 42, + 52, 12, 7, 0, 5, 15, 21, 37, 41, 41, + 67, 59, 63, 101, 85, 99, 125, 85, 81, 73, + 65, 85, 61, 75, 69, 47, 61, 5, 9, 13, + 17, 53, 37, 39, 77, 53, 67, 71, 109, 79, + 87, 95, 33, 29, 101, 35, 39, 59, 89, 67, + 81, 85, 53, 85, 69, 77, 93, 89, 97, 71, + 83, 93, 15, 1, 26, 2, 58, 78, 0, 14, + 34, 42, 10, 38, 70, 8, 1, 56, 19, 79, + 125, 125, 125, 125, 125, 125, 12, 78, 60, 54, + 34, 62, 34, 18, 14, 0, 17, 7, 20, 4, + 48, 66, 4, 8, 22, 30, 12, 26, 
56, 14, + 0, 56, 19, 79, 125, 125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 44 */ + + 46, 8, 37, 46, 8, 37, 26, 46, 54, 24, + 13, 43, 59, 38, 120, 22, 50, 13, 18, 40, + 3, 21, 4, 35, 83, 21, 93, 125, 125, 125, + 120, 8, 3, 18, 40, 3, 43, 18, 22, 3, + 0, 9, 9, 1, 33, 29, 55, 10, 3, 9, + 8, 17, 5, 35, 0, 21, 17, 33, 10, 4, + 44, 0, 0, 0, 10, 67, 67, 22, 25, 13, + 26, 13, 43, 88, 44, 28, 108, 124, 54, 60, + 38, 60, 22, 22, 70, 33, 23, 21, 35, 48, + 3, 42, 66, 7, 15, 5, 44, 0, 23, 21, + 39, 38, 9, 18, 0, 7, 16, 1, 20, 18, + 18, 34, 48, 26, 12, 22, 124, 124, 124, 124, + 16, 1, 6, 26, 1, 22, 26, 10, 7, 76, + 5, 89, 81, 114, 18, 32, 10, 12, 22, 36, + 96, 56, 24, 3, 2, 4, 38, 48, 81, 6, + 10, 33, 3, 21, 14, 14, 15, 38, 24, 24, + 53, 16, 31, 23, 10, 51, 14, 16, 10, 31, + 73, 1, 27, 21, 31, 59, 51, 25, 125, 124, + 125, 45, 51, 65, 95, 69, 91, 125, 115, 105, + 63, 117, 51, 124, 93, 3, 0, 15, 25, 27, + 23, 33, 31, 33, 63, 65, 57, 63, 65, 113, + 23, 27, 71, 19, 33, 35, 47, 55, 67, 61, + 73, 69, 43, 73, 79, 61, 49, 30, 68, 44, + 20, 22, 34, 16, 8, 8, 60, 40, 66, 48, + 44, 38, 56, 38, 50, 42, 36, 36, 94, 78, + 68, 60, 70, 28, 10, 6, 4, 15, 108, 58, + 4, 3, 28, 4, 19, 7, 6, 80, 56, 18, + 0, 36, 10, 8, 7, 7, 124, 9, 6, 16, + 18, 16, 10, 34, 42, 16, 36, 44, 64, 7, + 4, 19, 1, 52, 17, 21, 15, 6, 8, 11, + 27, 11, 33, 11, 15, 49, 28, 26, 60, 10, + 5, 26, 16, 2, 14, 12, 2, 23, 0, 3, + 11, 7, 32, 69, 35, 32, 41, 7, 21, 9, + 20, 21, 25, 8, 10, 57, 21, 71, 34, 40, + 50, 8, 11, 3, 9, 19, 27, 43, 47, 47, + 73, 63, 67, 109, 93, 107, 125, 91, 89, 77, + 71, 91, 65, 79, 73, 49, 63, 9, 15, 19, + 21, 59, 43, 45, 83, 57, 73, 75, 115, 83, + 91, 97, 35, 31, 105, 37, 43, 63, 93, 71, + 85, 89, 55, 87, 71, 79, 95, 91, 99, 73, + 85, 95, 15, 1, 28, 4, 60, 80, 0, 16, + 34, 42, 10, 38, 72, 8, 1, 54, 23, 85, + 125, 125, 125, 125, 125, 125, 12, 78, 60, 54, + 34, 62, 34, 18, 14, 2, 17, 7, 22, 4, + 48, 68, 4, 8, 22, 30, 12, 26, 56, 14, + 0, 54, 23, 85, 
125, 125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 45 */ + + 44, 8, 37, 44, 8, 37, 30, 48, 54, 24, + 13, 45, 63, 36, 120, 22, 54, 13, 18, 42, + 5, 23, 4, 37, 87, 23, 97, 125, 125, 125, + 124, 10, 1, 18, 42, 5, 43, 20, 22, 3, + 0, 7, 7, 1, 33, 29, 55, 10, 3, 9, + 8, 17, 5, 33, 0, 21, 17, 33, 12, 4, + 44, 0, 0, 0, 12, 67, 67, 22, 27, 13, + 24, 13, 43, 92, 46, 30, 112, 124, 58, 62, + 40, 64, 26, 24, 76, 33, 21, 21, 33, 48, + 3, 44, 70, 7, 13, 3, 48, 0, 23, 21, + 39, 38, 9, 18, 2, 5, 18, 0, 22, 20, + 20, 36, 52, 28, 16, 24, 124, 124, 124, 124, + 18, 0, 8, 28, 1, 22, 26, 10, 5, 80, + 5, 93, 85, 120, 18, 34, 10, 12, 22, 38, + 100, 58, 26, 5, 2, 4, 40, 48, 83, 8, + 10, 33, 5, 23, 12, 12, 17, 38, 24, 24, + 55, 18, 33, 23, 8, 51, 14, 14, 8, 35, + 79, 3, 29, 25, 35, 63, 55, 29, 125, 124, + 125, 49, 55, 71, 101, 73, 97, 125, 123, 109, + 67, 123, 55, 124, 95, 7, 3, 19, 29, 31, + 27, 37, 35, 37, 69, 69, 61, 65, 69, 117, + 23, 27, 71, 21, 35, 37, 51, 57, 69, 63, + 75, 71, 43, 75, 79, 63, 47, 32, 70, 44, + 20, 24, 34, 18, 10, 10, 64, 44, 66, 50, + 44, 38, 58, 40, 54, 46, 40, 36, 96, 80, + 70, 62, 72, 28, 12, 8, 6, 15, 110, 58, + 4, 5, 28, 4, 19, 5, 6, 80, 54, 16, + 1, 36, 10, 10, 7, 5, 124, 5, 8, 20, + 22, 20, 12, 36, 44, 18, 40, 46, 68, 5, + 6, 17, 0, 56, 17, 21, 13, 8, 10, 11, + 29, 11, 35, 11, 15, 49, 28, 26, 62, 10, + 5, 26, 16, 2, 14, 12, 2, 25, 0, 3, + 11, 7, 34, 71, 37, 32, 43, 7, 21, 11, + 20, 21, 27, 8, 10, 59, 21, 73, 32, 38, + 48, 4, 15, 7, 13, 25, 31, 49, 53, 53, + 79, 69, 71, 119, 101, 115, 125, 97, 95, 83, + 75, 95, 69, 81, 75, 49, 63, 15, 19, 23, + 25, 65, 49, 49, 89, 61, 77, 79, 121, 87, + 95, 101, 35, 31, 109, 39, 45, 65, 97, 75, + 89, 91, 57, 91, 73, 79, 99, 93, 99, 77, + 89, 99, 13, 1, 30, 4, 62, 82, 2, 18, + 36, 44, 10, 40, 74, 8, 1, 52, 27, 91, + 125, 125, 125, 125, 125, 125, 14, 78, 60, 54, + 34, 64, 34, 20, 16, 2, 17, 7, 22, 4, + 50, 70, 4, 8, 24, 32, 12, 28, 58, 14, + 0, 52, 27, 91, 125, 125, 125, 
125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 46 */ + + 42, 8, 37, 42, 8, 37, 34, 52, 56, 24, + 15, 49, 67, 34, 120, 22, 58, 13, 20, 44, + 5, 23, 4, 37, 89, 25, 103, 125, 125, 125, + 124, 12, 1, 20, 44, 5, 45, 22, 22, 5, + 0, 7, 7, 1, 33, 29, 55, 10, 3, 7, + 8, 17, 3, 33, 0, 21, 17, 33, 12, 4, + 44, 0, 0, 0, 12, 67, 67, 24, 29, 13, + 22, 15, 43, 98, 48, 32, 114, 124, 60, 66, + 44, 66, 28, 26, 80, 33, 21, 19, 33, 48, + 3, 46, 74, 7, 11, 3, 52, 0, 25, 23, + 41, 38, 9, 20, 2, 3, 20, 0, 24, 20, + 22, 40, 54, 28, 18, 26, 124, 124, 124, 124, + 18, 0, 8, 28, 1, 22, 28, 12, 5, 84, + 5, 97, 89, 124, 18, 34, 10, 12, 22, 38, + 102, 60, 26, 5, 2, 4, 40, 50, 85, 8, + 10, 35, 5, 25, 12, 12, 17, 38, 24, 24, + 57, 18, 35, 25, 8, 51, 14, 14, 8, 39, + 85, 5, 33, 29, 41, 67, 61, 33, 125, 124, + 125, 53, 59, 75, 107, 79, 103, 125, 125, 115, + 71, 125, 59, 124, 99, 11, 7, 23, 35, 37, + 31, 41, 39, 41, 73, 73, 65, 69, 71, 121, + 23, 27, 73, 23, 37, 39, 53, 59, 71, 65, + 79, 73, 45, 77, 81, 63, 47, 36, 72, 46, + 20, 24, 36, 18, 10, 10, 68, 46, 66, 50, + 46, 40, 60, 42, 56, 48, 44, 38, 96, 80, + 70, 62, 74, 30, 12, 8, 6, 15, 110, 58, + 2, 5, 28, 4, 19, 5, 6, 78, 54, 14, + 3, 36, 10, 10, 7, 5, 124, 3, 12, 22, + 24, 22, 14, 40, 48, 20, 42, 50, 72, 5, + 8, 17, 2, 60, 15, 21, 13, 8, 10, 11, + 29, 11, 35, 11, 15, 51, 30, 28, 64, 10, + 5, 28, 16, 2, 14, 12, 2, 25, 0, 3, + 11, 7, 34, 73, 39, 32, 45, 9, 23, 11, + 20, 23, 29, 8, 10, 61, 21, 75, 30, 36, + 46, 2, 19, 11, 17, 29, 37, 53, 59, 59, + 85, 73, 75, 125, 109, 123, 125, 105, 101, 87, + 79, 101, 73, 85, 79, 51, 65, 19, 25, 29, + 31, 71, 53, 55, 95, 67, 83, 85, 125, 91, + 99, 103, 37, 33, 111, 43, 49, 69, 101, 77, + 93, 95, 59, 93, 75, 81, 101, 95, 101, 79, + 91, 101, 13, 0, 30, 6, 64, 84, 2, 18, + 36, 44, 10, 40, 76, 8, 1, 50, 31, 95, + 125, 125, 125, 125, 125, 125, 14, 78, 60, 54, + 34, 64, 36, 20, 16, 4, 17, 7, 24, 6, + 52, 72, 4, 10, 24, 32, 12, 28, 58, 14, + 0, 50, 31, 95, 125, 125, 125, 
125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 47 */ + + 40, 8, 37, 40, 8, 37, 38, 54, 56, 24, + 15, 51, 71, 32, 120, 22, 62, 13, 20, 46, + 5, 25, 4, 39, 93, 27, 107, 125, 125, 125, + 124, 14, 1, 20, 46, 5, 45, 24, 22, 5, + 0, 7, 5, 1, 33, 29, 55, 10, 3, 7, + 8, 17, 3, 33, 0, 21, 17, 33, 12, 4, + 44, 0, 0, 0, 14, 67, 67, 24, 31, 13, + 20, 15, 43, 102, 50, 34, 118, 124, 64, 68, + 48, 70, 30, 28, 84, 33, 21, 19, 33, 48, + 3, 48, 78, 7, 9, 1, 56, 0, 25, 23, + 41, 38, 9, 20, 4, 1, 22, 2, 26, 22, + 24, 42, 58, 30, 20, 28, 124, 124, 124, 124, + 18, 0, 8, 30, 1, 22, 28, 12, 3, 88, + 5, 101, 93, 124, 18, 34, 10, 12, 22, 40, + 106, 62, 26, 5, 2, 4, 42, 52, 87, 8, + 10, 35, 7, 27, 10, 12, 19, 38, 24, 24, + 59, 18, 37, 27, 6, 51, 14, 12, 6, 43, + 91, 7, 37, 33, 45, 71, 65, 37, 125, 124, + 125, 57, 63, 79, 113, 83, 109, 125, 125, 121, + 75, 125, 63, 124, 103, 15, 11, 27, 39, 41, + 35, 45, 43, 45, 77, 77, 69, 73, 75, 125, + 23, 27, 73, 25, 39, 41, 55, 61, 73, 67, + 83, 75, 45, 79, 83, 65, 45, 38, 74, 46, + 20, 24, 38, 20, 12, 12, 72, 48, 66, 50, + 46, 42, 62, 44, 60, 52, 48, 40, 98, 82, + 72, 64, 76, 30, 12, 8, 8, 15, 112, 58, + 0, 5, 28, 4, 19, 3, 6, 78, 52, 12, + 5, 36, 10, 10, 7, 5, 124, 1, 14, 24, + 28, 26, 16, 44, 52, 22, 46, 54, 76, 3, + 10, 15, 4, 64, 15, 21, 13, 10, 12, 11, + 29, 11, 37, 11, 15, 51, 30, 28, 66, 10, + 5, 28, 16, 2, 14, 12, 2, 27, 0, 3, + 11, 7, 36, 75, 41, 32, 47, 11, 25, 13, + 20, 25, 31, 8, 10, 63, 21, 77, 28, 34, + 44, 1, 23, 15, 21, 33, 41, 59, 65, 65, + 91, 79, 79, 125, 117, 125, 125, 111, 107, 93, + 83, 105, 77, 89, 81, 53, 67, 23, 29, 33, + 35, 77, 59, 59, 101, 71, 87, 89, 125, 95, + 103, 107, 39, 35, 115, 45, 51, 73, 105, 81, + 97, 99, 61, 97, 77, 83, 103, 97, 103, 83, + 93, 103, 13, 0, 32, 6, 66, 86, 2, 20, + 38, 46, 10, 42, 78, 8, 1, 48, 35, 101, + 125, 125, 125, 125, 125, 125, 14, 78, 60, 54, + 34, 66, 36, 20, 16, 4, 17, 7, 24, 6, + 54, 74, 4, 10, 24, 32, 12, 28, 60, 14, + 0, 48, 35, 101, 125, 
125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 48 */ + + 36, 6, 39, 36, 6, 39, 40, 56, 56, 24, + 17, 55, 77, 30, 120, 20, 64, 15, 20, 46, + 7, 27, 4, 41, 97, 31, 113, 125, 125, 125, + 124, 14, 1, 20, 46, 7, 47, 24, 22, 7, + 0, 7, 5, 3, 35, 29, 55, 8, 3, 7, + 8, 17, 3, 33, 0, 21, 19, 33, 12, 4, + 44, 0, 0, 0, 14, 69, 67, 24, 33, 15, + 18, 17, 45, 106, 52, 36, 120, 124, 66, 70, + 50, 72, 32, 30, 88, 33, 21, 19, 33, 48, + 3, 48, 82, 7, 9, 1, 58, 1, 27, 25, + 43, 38, 9, 20, 4, 1, 22, 2, 26, 22, + 24, 44, 60, 30, 22, 30, 124, 124, 124, 124, + 18, 0, 8, 30, 3, 22, 28, 12, 3, 92, + 5, 105, 97, 124, 18, 34, 10, 12, 22, 40, + 108, 62, 26, 7, 0, 4, 42, 52, 91, 8, + 10, 37, 9, 31, 8, 10, 21, 38, 24, 22, + 63, 18, 39, 29, 4, 51, 12, 10, 4, 49, + 99, 11, 41, 39, 51, 77, 71, 41, 125, 124, + 125, 63, 67, 85, 121, 89, 117, 125, 125, 125, + 79, 125, 67, 124, 107, 21, 17, 33, 45, 47, + 41, 51, 49, 49, 83, 83, 73, 77, 79, 125, + 23, 29, 75, 27, 43, 45, 59, 65, 77, 71, + 87, 77, 47, 81, 85, 67, 45, 40, 74, 46, + 20, 24, 38, 20, 12, 12, 76, 50, 66, 50, + 46, 42, 62, 46, 62, 54, 50, 40, 98, 82, + 72, 64, 78, 30, 12, 8, 8, 17, 112, 56, + 1, 7, 28, 4, 19, 3, 4, 76, 50, 8, + 7, 34, 10, 10, 7, 5, 124, 0, 16, 26, + 30, 28, 18, 46, 54, 24, 48, 56, 80, 3, + 10, 15, 6, 66, 15, 21, 13, 10, 12, 11, + 31, 13, 39, 11, 17, 53, 30, 28, 68, 10, + 7, 28, 16, 0, 14, 12, 2, 29, 0, 3, + 11, 7, 36, 77, 43, 32, 51, 13, 27, 15, + 18, 27, 33, 8, 8, 67, 23, 81, 24, 30, + 42, 5, 27, 21, 27, 39, 47, 65, 71, 71, + 99, 85, 83, 125, 125, 125, 125, 119, 115, 99, + 89, 111, 81, 93, 85, 55, 69, 29, 35, 39, + 41, 83, 65, 65, 109, 77, 93, 95, 125, 99, + 107, 111, 41, 37, 119, 49, 55, 77, 111, 85, + 101, 103, 65, 101, 81, 85, 107, 99, 105, 87, + 97, 107, 13, 0, 32, 6, 68, 88, 2, 20, + 38, 46, 10, 42, 78, 8, 3, 44, 39, 107, + 125, 125, 125, 125, 125, 125, 14, 78, 60, 54, + 34, 66, 36, 20, 16, 4, 17, 7, 24, 6, + 54, 74, 4, 10, 24, 32, 12, 28, 60, 12, + 1, 44, 39, 
107, 125, 125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 49 */ + + 34, 6, 39, 34, 6, 39, 44, 60, 58, 26, + 17, 57, 81, 28, 122, 20, 68, 15, 22, 48, + 7, 27, 6, 41, 99, 33, 117, 125, 125, 125, + 124, 16, 0, 22, 48, 7, 47, 26, 22, 7, + 2, 5, 3, 3, 35, 27, 53, 8, 1, 5, + 8, 15, 1, 31, 2, 19, 19, 31, 14, 4, + 44, 0, 0, 0, 16, 69, 67, 26, 33, 15, + 18, 17, 45, 112, 56, 40, 124, 124, 70, 74, + 54, 76, 36, 34, 94, 33, 19, 17, 31, 48, + 3, 50, 88, 5, 7, 0, 62, 1, 27, 25, + 43, 40, 9, 22, 6, 0, 24, 4, 28, 24, + 26, 48, 64, 32, 26, 34, 124, 124, 124, 124, + 20, 2, 10, 32, 3, 24, 30, 14, 1, 98, + 3, 109, 99, 124, 18, 36, 10, 14, 24, 42, + 112, 64, 28, 7, 0, 4, 44, 54, 93, 10, + 10, 37, 9, 33, 8, 10, 21, 38, 24, 22, + 65, 20, 39, 29, 4, 51, 12, 10, 4, 53, + 105, 13, 43, 43, 55, 81, 75, 45, 125, 124, + 125, 67, 69, 89, 125, 93, 123, 125, 125, 125, + 83, 125, 71, 124, 109, 25, 21, 37, 49, 51, + 45, 55, 53, 53, 87, 87, 75, 79, 81, 125, + 23, 29, 75, 27, 45, 47, 61, 67, 79, 73, + 89, 79, 47, 83, 85, 67, 43, 44, 76, 48, + 20, 26, 40, 22, 14, 14, 82, 54, 68, 52, + 48, 44, 64, 50, 66, 58, 54, 42, 100, 84, + 74, 66, 80, 32, 14, 10, 10, 17, 114, 56, + 1, 7, 30, 6, 19, 1, 4, 76, 50, 6, + 9, 34, 12, 12, 5, 3, 124, 4, 20, 30, + 34, 32, 22, 50, 58, 28, 52, 60, 86, 1, + 12, 13, 10, 70, 13, 19, 11, 12, 14, 9, + 31, 13, 39, 11, 17, 53, 32, 30, 70, 12, + 7, 30, 18, 0, 16, 14, 2, 29, 2, 1, + 9, 5, 38, 77, 45, 32, 53, 13, 27, 15, + 18, 27, 33, 8, 8, 69, 23, 83, 22, 28, + 42, 7, 29, 25, 31, 43, 51, 69, 75, 75, + 105, 89, 87, 125, 125, 125, 125, 125, 121, 103, + 93, 115, 83, 95, 87, 55, 69, 33, 39, 43, + 45, 87, 69, 69, 115, 81, 97, 99, 125, 101, + 109, 113, 41, 37, 121, 51, 57, 79, 115, 87, + 103, 105, 67, 103, 83, 85, 109, 99, 105, 89, + 99, 109, 11, 2, 34, 8, 72, 92, 4, 22, + 40, 48, 12, 44, 80, 8, 3, 42, 43, 111, + 125, 125, 125, 125, 125, 125, 16, 78, 62, 56, + 36, 68, 38, 22, 18, 6, 15, 5, 26, 8, + 56, 76, 6, 12, 26, 34, 12, 30, 62, 
12, + 1, 42, 43, 111, 125, 125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 50 */ + + 32, 6, 39, 32, 6, 39, 48, 62, 58, 26, + 17, 59, 85, 26, 122, 20, 72, 15, 22, 50, + 7, 29, 6, 43, 103, 35, 123, 125, 125, 125, + 124, 18, 0, 22, 50, 7, 47, 28, 22, 9, + 2, 5, 1, 3, 35, 27, 53, 8, 1, 5, + 8, 15, 1, 31, 2, 19, 19, 31, 14, 4, + 44, 0, 0, 0, 18, 69, 67, 26, 35, 15, + 16, 17, 45, 116, 58, 42, 124, 124, 72, 76, + 58, 78, 38, 36, 98, 33, 19, 17, 31, 48, + 3, 52, 92, 5, 5, 2, 66, 1, 27, 25, + 45, 40, 9, 22, 8, 2, 26, 6, 30, 24, + 28, 50, 66, 34, 28, 36, 124, 124, 124, 124, + 20, 2, 10, 32, 3, 24, 30, 14, 1, 102, + 3, 113, 103, 124, 18, 36, 10, 14, 24, 42, + 116, 66, 28, 7, 0, 4, 46, 56, 95, 10, + 10, 37, 11, 35, 6, 10, 23, 38, 24, 22, + 67, 20, 41, 31, 2, 51, 12, 8, 2, 57, + 111, 15, 47, 47, 59, 85, 79, 49, 125, 124, + 125, 71, 73, 93, 125, 97, 125, 125, 125, 125, + 87, 125, 75, 124, 113, 29, 25, 41, 53, 55, + 49, 59, 57, 57, 91, 91, 79, 83, 85, 125, + 23, 29, 77, 29, 47, 49, 63, 69, 81, 75, + 93, 81, 47, 85, 87, 69, 41, 46, 78, 48, + 20, 26, 42, 24, 16, 16, 86, 56, 68, 52, + 48, 46, 66, 52, 70, 62, 58, 44, 102, 86, + 76, 68, 82, 32, 14, 10, 12, 17, 116, 56, + 3, 7, 30, 6, 19, 1, 4, 76, 48, 4, + 11, 34, 12, 12, 5, 3, 124, 6, 22, 32, + 38, 36, 24, 54, 62, 30, 56, 64, 90, 0, + 14, 13, 12, 74, 13, 19, 11, 14, 14, 9, + 31, 13, 41, 11, 17, 53, 32, 30, 72, 12, + 7, 30, 18, 0, 16, 14, 2, 31, 2, 1, + 9, 5, 40, 79, 47, 32, 55, 15, 29, 17, + 18, 29, 35, 8, 8, 71, 23, 85, 20, 26, + 40, 11, 33, 29, 35, 47, 55, 75, 81, 81, + 111, 95, 91, 125, 125, 125, 125, 125, 125, 109, + 97, 121, 87, 99, 89, 57, 71, 37, 43, 49, + 49, 93, 75, 75, 121, 85, 103, 103, 125, 105, + 113, 117, 43, 39, 125, 53, 59, 83, 119, 91, + 107, 109, 69, 107, 85, 87, 111, 101, 107, 93, + 101, 111, 11, 2, 36, 8, 74, 94, 4, 24, + 42, 50, 12, 44, 82, 8, 3, 40, 47, 117, + 125, 125, 125, 125, 125, 125, 16, 78, 62, 56, + 36, 70, 38, 22, 18, 6, 15, 5, 26, 8, + 58, 78, 6, 
12, 26, 34, 12, 30, 64, 12, + 1, 40, 47, 117, 125, 125, 125, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 51 */ + + 30, 6, 39, 30, 6, 39, 52, 66, 60, 26, + 19, 63, 89, 24, 122, 20, 76, 15, 24, 52, + 7, 29, 6, 43, 105, 37, 125, 125, 125, 125, + 124, 20, 0, 24, 52, 7, 49, 30, 22, 9, + 2, 5, 1, 3, 35, 27, 53, 8, 1, 3, + 8, 15, 0, 31, 2, 19, 19, 31, 14, 4, + 44, 0, 0, 0, 18, 69, 67, 28, 37, 15, + 14, 19, 45, 122, 60, 44, 124, 124, 76, 80, + 62, 82, 40, 38, 102, 33, 19, 15, 31, 48, + 3, 54, 96, 5, 3, 2, 70, 1, 29, 27, + 45, 40, 9, 24, 8, 4, 28, 6, 32, 26, + 30, 54, 70, 34, 30, 38, 124, 124, 124, 124, + 20, 2, 10, 34, 3, 24, 32, 16, 0, 106, + 3, 117, 107, 124, 18, 36, 10, 14, 24, 44, + 118, 68, 28, 7, 0, 4, 46, 58, 97, 10, + 10, 39, 11, 37, 6, 10, 23, 38, 24, 22, + 69, 20, 43, 33, 2, 51, 12, 8, 2, 61, + 117, 17, 51, 51, 65, 89, 85, 53, 125, 124, + 125, 75, 77, 97, 125, 103, 125, 125, 125, 125, + 91, 125, 79, 124, 117, 33, 29, 45, 59, 61, + 53, 63, 61, 61, 95, 95, 83, 87, 87, 125, + 23, 29, 77, 31, 49, 51, 65, 71, 83, 77, + 97, 83, 49, 87, 89, 69, 41, 50, 80, 50, + 20, 26, 44, 24, 16, 16, 90, 58, 68, 52, + 50, 48, 68, 54, 72, 64, 62, 46, 102, 86, + 76, 68, 84, 34, 14, 10, 12, 17, 116, 56, + 5, 7, 30, 6, 19, 0, 4, 74, 48, 2, + 13, 34, 12, 12, 5, 3, 124, 8, 26, 34, + 40, 38, 26, 58, 66, 32, 58, 68, 94, 0, + 16, 11, 14, 78, 11, 19, 11, 14, 16, 9, + 31, 13, 41, 11, 17, 55, 34, 32, 74, 12, + 7, 32, 18, 0, 16, 14, 2, 31, 2, 1, + 9, 5, 40, 81, 49, 32, 57, 17, 31, 17, + 18, 31, 37, 8, 8, 73, 23, 87, 18, 24, + 38, 13, 37, 33, 39, 51, 61, 79, 87, 87, + 117, 99, 95, 125, 125, 125, 125, 125, 125, 113, + 101, 125, 91, 103, 93, 59, 73, 41, 49, 53, + 55, 99, 79, 79, 125, 91, 107, 109, 125, 109, + 117, 119, 45, 41, 125, 57, 63, 87, 123, 93, + 111, 113, 71, 109, 87, 89, 113, 103, 109, 95, + 103, 113, 11, 4, 36, 10, 76, 96, 4, 24, + 42, 50, 12, 46, 84, 8, 3, 38, 51, 121, + 125, 125, 125, 125, 125, 125, 16, 78, 62, 56, + 36, 70, 40, 22, 18, 
8, 15, 5, 28, 10, + 60, 80, 6, 14, 26, 34, 12, 30, 64, 12, + 1, 38, 51, 121, 125, 125, 125, 125, 125, 125, + }, + + }, + + { + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 0 */ + + 124, 18, 21, 124, 18, 21, 125, 81, 20, 18, + 24, 94, 124, 124, 24, 2, 71, 94, 43, 77, + 12, 12, 19, 12, 46, 106, 124, 124, 42, 67, + 125, 107, 21, 43, 77, 12, 59, 49, 38, 16, + 51, 79, 105, 12, 10, 41, 65, 0, 43, 85, + 0, 23, 53, 75, 16, 31, 23, 67, 26, 6, + 44, 0, 0, 0, 39, 45, 67, 17, 44, 2, + 58, 49, 125, 125, 55, 63, 41, 45, 51, 55, + 125, 25, 79, 53, 125, 33, 25, 41, 29, 16, + 4, 39, 125, 31, 81, 55, 125, 3, 31, 17, + 57, 14, 9, 15, 69, 45, 49, 37, 17, 7, + 17, 51, 11, 8, 5, 12, 15, 15, 10, 21, + 38, 11, 2, 24, 32, 42, 44, 20, 25, 29, + 39, 22, 7, 53, 7, 17, 23, 33, 39, 1, + 64, 1, 61, 23, 0, 21, 56, 72, 55, 3, + 11, 27, 5, 2, 9, 35, 66, 112, 80, 21, + 5, 121, 52, 124, 124, 125, 48, 42, 58, 68, + 64, 52, 42, 46, 60, 40, 54, 32, 16, 10, + 6, 38, 38, 42, 30, 14, 22, 52, 28, 10, + 30, 36, 11, 60, 0, 124, 124, 124, 106, 124, + 124, 124, 124, 92, 76, 68, 60, 96, 86, 19, + 58, 64, 38, 94, 54, 54, 70, 84, 86, 102, + 94, 42, 59, 14, 12, 50, 125, 103, 37, 2, + 20, 8, 43, 51, 61, 57, 125, 73, 12, 7, + 15, 27, 43, 49, 81, 69, 125, 37, 30, 4, + 5, 13, 23, 31, 39, 57, 89, 31, 11, 23, + 10, 10, 29, 39, 35, 71, 35, 50, 2, 10, + 8, 19, 25, 45, 39, 47, 124, 125, 125, 113, + 125, 101, 107, 109, 107, 99, 109, 113, 121, 61, + 77, 71, 85, 125, 57, 12, 45, 61, 55, 27, + 15, 19, 1, 35, 1, 12, 7, 9, 7, 9, + 27, 1, 9, 29, 16, 8, 3, 18, 38, 6, + 13, 25, 45, 13, 1, 13, 16, 14, 11, 3, + 21, 18, 18, 25, 37, 27, 27, 42, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 104, 124, 124, 124, 124, 124, 124, 96, + 124, 124, 92, 50, 36, 18, 31, 124, 124, 124, + 124, 96, 96, 76, 82, 94, 90, 70, 44, 70, + 32, 2, 64, 74, 78, 80, 94, 66, 68, 44, + 42, 6, 22, 6, 29, 119, 20, 14, 4, 60, + 26, 4, 29, 21, 17, 17, 23, 15, 0, 13, + 23, 17, 7, 20, 8, 22, 9, 124, 124, 124, + 124, 
112, 102, 80, 50, 1, 15, 52, 38, 28, + 14, 8, 0, 7, 9, 31, 29, 21, 17, 17, + 23, 15, 0, 13, 23, 17, 7, 20, 8, 22, + 9, 124, 124, 124, 124, 112, 102, 80, 50, 1, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 1 */ + + 124, 18, 21, 124, 18, 21, 123, 77, 22, 20, + 24, 92, 124, 124, 26, 4, 67, 92, 41, 73, + 12, 12, 15, 12, 44, 104, 124, 120, 38, 67, + 123, 103, 19, 41, 73, 12, 57, 47, 40, 16, + 49, 77, 101, 10, 8, 41, 65, 0, 41, 83, + 0, 23, 51, 73, 16, 29, 21, 65, 28, 6, + 44, 0, 0, 0, 37, 45, 67, 15, 44, 2, + 58, 47, 123, 121, 51, 61, 37, 41, 49, 51, + 123, 23, 75, 51, 121, 33, 25, 41, 29, 18, + 4, 37, 121, 29, 79, 53, 123, 3, 29, 17, + 55, 16, 9, 13, 67, 43, 47, 35, 15, 5, + 15, 49, 9, 10, 5, 12, 13, 13, 10, 19, + 40, 9, 2, 26, 34, 44, 46, 22, 25, 27, + 37, 22, 7, 51, 7, 15, 21, 31, 35, 2, + 66, 2, 57, 23, 1, 19, 58, 74, 55, 3, + 9, 27, 3, 2, 7, 31, 66, 112, 82, 17, + 7, 117, 50, 124, 124, 123, 48, 42, 58, 68, + 64, 52, 42, 46, 60, 40, 54, 32, 16, 10, + 6, 38, 38, 42, 30, 14, 22, 52, 28, 8, + 30, 36, 11, 58, 0, 124, 124, 124, 104, 124, + 124, 124, 124, 90, 74, 64, 58, 92, 84, 21, + 56, 62, 36, 92, 54, 54, 68, 82, 84, 100, + 92, 40, 59, 14, 12, 48, 123, 99, 33, 4, + 20, 8, 41, 49, 59, 55, 123, 69, 14, 5, + 13, 25, 39, 47, 77, 67, 121, 35, 32, 6, + 3, 11, 21, 29, 37, 55, 85, 29, 7, 21, + 12, 10, 27, 37, 33, 69, 33, 52, 4, 12, + 10, 17, 23, 43, 37, 45, 124, 123, 123, 109, + 123, 97, 103, 105, 103, 95, 105, 109, 115, 59, + 75, 69, 83, 119, 55, 10, 43, 59, 53, 25, + 15, 17, 1, 33, 1, 12, 7, 9, 5, 9, + 27, 1, 9, 27, 16, 8, 3, 18, 38, 6, + 13, 23, 41, 13, 1, 11, 16, 14, 11, 3, + 19, 18, 18, 23, 35, 25, 25, 40, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 100, 124, 124, 124, 124, 124, 124, 94, + 120, 120, 90, 48, 34, 18, 31, 124, 124, 124, + 120, 92, 94, 74, 78, 92, 86, 68, 40, 66, + 30, 0, 62, 72, 74, 78, 92, 64, 66, 42, + 40, 4, 22, 6, 29, 117, 18, 12, 2, 58, + 24, 2, 27, 19, 15, 15, 19, 13, 2, 11, + 19, 15, 5, 
22, 10, 24, 7, 124, 124, 124, + 124, 108, 100, 76, 48, 3, 13, 54, 40, 30, + 16, 10, 2, 5, 7, 29, 27, 19, 15, 15, + 19, 13, 2, 11, 19, 15, 5, 22, 10, 24, + 7, 124, 124, 124, 124, 108, 100, 76, 48, 3, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 2 */ + + 124, 18, 21, 124, 18, 21, 119, 75, 22, 20, + 24, 88, 120, 124, 28, 4, 63, 88, 41, 71, + 12, 12, 13, 10, 42, 102, 120, 114, 34, 69, + 119, 101, 19, 41, 71, 12, 57, 45, 40, 16, + 47, 75, 99, 8, 6, 41, 65, 0, 41, 81, + 0, 23, 51, 73, 16, 29, 21, 63, 28, 6, + 44, 0, 0, 0, 35, 45, 67, 15, 42, 2, + 58, 45, 121, 117, 49, 59, 33, 37, 47, 49, + 119, 21, 73, 49, 117, 35, 25, 41, 29, 18, + 4, 35, 117, 29, 77, 51, 119, 3, 29, 17, + 55, 16, 9, 13, 65, 43, 45, 35, 15, 5, + 15, 47, 7, 10, 5, 12, 13, 13, 10, 19, + 40, 9, 2, 26, 34, 44, 46, 22, 27, 25, + 35, 20, 7, 51, 7, 13, 21, 31, 33, 4, + 68, 6, 53, 25, 3, 19, 58, 74, 57, 3, + 9, 29, 1, 2, 7, 29, 66, 112, 82, 15, + 9, 115, 48, 124, 124, 121, 48, 42, 58, 66, + 62, 52, 42, 46, 58, 38, 52, 32, 16, 10, + 6, 36, 36, 40, 30, 14, 22, 50, 26, 6, + 28, 34, 11, 56, 1, 124, 124, 124, 100, 120, + 124, 124, 124, 88, 70, 60, 54, 88, 80, 23, + 54, 60, 32, 90, 52, 52, 66, 78, 80, 96, + 88, 36, 59, 12, 10, 44, 121, 97, 31, 6, + 20, 8, 39, 47, 57, 53, 119, 67, 16, 3, + 11, 23, 37, 45, 75, 65, 117, 33, 32, 6, + 3, 11, 19, 27, 35, 53, 83, 29, 5, 19, + 12, 10, 25, 35, 33, 67, 31, 52, 6, 12, + 10, 15, 21, 41, 35, 43, 124, 121, 119, 105, + 119, 95, 101, 101, 99, 93, 101, 105, 111, 57, + 73, 67, 81, 113, 55, 8, 43, 57, 51, 25, + 15, 17, 1, 33, 1, 10, 7, 9, 3, 9, + 27, 1, 9, 27, 16, 8, 3, 16, 36, 6, + 13, 23, 39, 15, 1, 9, 14, 14, 11, 3, + 19, 18, 18, 23, 33, 25, 25, 36, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 96, 124, 124, 124, 124, 124, 122, 90, + 116, 116, 86, 46, 32, 16, 31, 124, 124, 124, + 116, 88, 90, 70, 74, 88, 82, 64, 36, 62, + 26, 1, 60, 70, 70, 74, 88, 60, 62, 40, + 38, 2, 20, 4, 29, 115, 16, 10, 1, 56, + 22, 0, 27, 19, 13, 
13, 17, 11, 4, 11, + 17, 13, 3, 22, 12, 26, 5, 124, 124, 124, + 120, 104, 96, 72, 44, 5, 11, 54, 40, 32, + 18, 12, 2, 3, 7, 27, 27, 19, 13, 13, + 17, 11, 4, 11, 17, 13, 3, 22, 12, 26, + 5, 124, 124, 124, 120, 104, 96, 72, 44, 5, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 3 */ + + 124, 18, 21, 124, 18, 21, 115, 71, 24, 20, + 22, 84, 118, 122, 28, 4, 59, 86, 41, 67, + 12, 10, 11, 8, 40, 100, 116, 106, 30, 71, + 115, 97, 19, 41, 67, 12, 55, 43, 42, 16, + 45, 73, 97, 6, 4, 41, 67, 0, 41, 79, + 0, 25, 51, 73, 16, 29, 21, 61, 30, 6, + 44, 0, 0, 0, 35, 45, 67, 13, 40, 2, + 56, 45, 119, 113, 47, 57, 31, 35, 45, 47, + 115, 19, 71, 47, 113, 37, 25, 41, 29, 20, + 4, 33, 113, 29, 75, 49, 115, 3, 29, 17, + 55, 18, 9, 11, 63, 43, 43, 35, 15, 5, + 13, 45, 7, 10, 5, 12, 13, 13, 10, 19, + 40, 9, 2, 28, 34, 46, 46, 24, 27, 25, + 33, 20, 7, 51, 7, 11, 21, 29, 31, 6, + 70, 8, 49, 25, 5, 17, 58, 74, 59, 3, + 7, 29, 1, 2, 7, 27, 66, 112, 82, 13, + 11, 111, 46, 124, 124, 117, 48, 42, 56, 64, + 62, 50, 40, 46, 58, 36, 50, 32, 16, 10, + 4, 36, 34, 38, 28, 14, 22, 48, 26, 4, + 28, 32, 11, 54, 1, 124, 124, 122, 98, 116, + 124, 124, 124, 86, 66, 56, 52, 84, 76, 27, + 52, 58, 28, 88, 50, 50, 64, 76, 76, 92, + 84, 34, 59, 10, 8, 42, 117, 93, 27, 6, + 20, 8, 37, 45, 55, 51, 115, 65, 18, 1, + 9, 23, 35, 43, 71, 63, 113, 33, 34, 8, + 1, 9, 17, 27, 35, 51, 81, 29, 1, 17, + 12, 10, 23, 35, 33, 65, 29, 54, 8, 14, + 10, 13, 21, 39, 35, 43, 124, 117, 117, 103, + 115, 93, 97, 99, 97, 89, 97, 101, 107, 57, + 71, 67, 79, 107, 55, 6, 43, 55, 49, 25, + 15, 17, 1, 31, 1, 8, 7, 9, 3, 9, + 27, 1, 9, 27, 14, 8, 3, 14, 34, 6, + 13, 23, 37, 17, 1, 7, 12, 14, 11, 3, + 17, 18, 16, 21, 31, 25, 25, 34, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 92, 124, 124, 124, 124, 124, 118, 86, + 112, 110, 82, 44, 30, 14, 31, 124, 124, 124, + 112, 84, 86, 68, 70, 84, 78, 60, 32, 58, + 22, 3, 58, 68, 66, 72, 84, 58, 58, 36, + 34, 0, 18, 2, 29, 113, 14, 6, 3, 54, + 
20, 1, 27, 17, 13, 13, 15, 9, 6, 11, + 15, 11, 1, 24, 14, 26, 3, 124, 124, 124, + 116, 100, 92, 68, 40, 7, 11, 56, 42, 34, + 18, 14, 4, 3, 5, 27, 27, 17, 13, 13, + 15, 9, 6, 11, 15, 11, 1, 24, 14, 26, + 3, 124, 124, 124, 116, 100, 92, 68, 40, 7, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 4 */ + + 124, 18, 21, 124, 18, 21, 113, 69, 24, 20, + 22, 80, 114, 120, 30, 4, 57, 82, 41, 65, + 10, 10, 9, 6, 36, 96, 112, 100, 24, 73, + 111, 95, 19, 41, 65, 10, 55, 41, 42, 14, + 45, 71, 93, 4, 0, 43, 67, 0, 39, 77, + 1, 25, 51, 73, 16, 29, 21, 61, 30, 6, + 44, 0, 0, 0, 33, 47, 67, 13, 38, 2, + 56, 43, 117, 109, 45, 55, 27, 31, 45, 45, + 111, 17, 69, 45, 107, 37, 27, 41, 31, 20, + 2, 31, 107, 27, 75, 49, 111, 3, 29, 17, + 55, 18, 9, 11, 61, 43, 43, 33, 15, 5, + 13, 43, 5, 10, 7, 10, 13, 13, 10, 19, + 40, 9, 2, 28, 34, 46, 46, 24, 29, 23, + 33, 18, 7, 49, 7, 9, 19, 29, 27, 10, + 72, 12, 45, 27, 7, 17, 60, 74, 61, 3, + 7, 31, 0, 2, 7, 25, 66, 112, 82, 9, + 13, 109, 44, 124, 124, 115, 46, 42, 56, 64, + 60, 50, 40, 46, 56, 34, 48, 30, 16, 10, + 4, 34, 34, 36, 28, 12, 20, 46, 24, 2, + 26, 30, 11, 50, 3, 124, 124, 118, 94, 114, + 124, 124, 124, 84, 62, 50, 48, 80, 72, 29, + 48, 56, 26, 86, 48, 48, 60, 72, 72, 88, + 82, 30, 59, 8, 6, 38, 115, 91, 25, 8, + 20, 8, 35, 43, 53, 51, 111, 61, 20, 1, + 9, 21, 31, 41, 69, 61, 107, 31, 34, 8, + 1, 9, 15, 25, 33, 51, 79, 29, 0, 15, + 12, 10, 21, 33, 33, 63, 27, 54, 10, 14, + 10, 11, 19, 37, 33, 41, 124, 115, 113, 99, + 113, 91, 95, 95, 93, 87, 95, 97, 101, 55, + 69, 65, 77, 101, 53, 4, 41, 53, 49, 25, + 15, 17, 3, 31, 3, 6, 7, 9, 1, 9, + 27, 1, 9, 25, 14, 6, 3, 12, 32, 4, + 13, 23, 35, 19, 3, 7, 12, 12, 11, 3, + 17, 16, 16, 21, 31, 25, 25, 30, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 88, 124, 124, 124, 124, 124, 114, 82, + 108, 106, 78, 40, 28, 12, 31, 124, 124, 124, + 108, 80, 82, 64, 66, 80, 74, 56, 28, 52, + 20, 7, 56, 66, 60, 68, 82, 54, 54, 34, + 32, 1, 16, 0, 29, 111, 10, 
4, 7, 50, + 18, 3, 27, 17, 11, 11, 13, 9, 6, 9, + 13, 9, 0, 24, 16, 28, 3, 124, 124, 120, + 112, 96, 88, 62, 36, 11, 9, 56, 42, 34, + 20, 14, 4, 1, 5, 25, 27, 17, 11, 11, + 13, 9, 6, 9, 13, 9, 0, 24, 16, 28, + 3, 124, 124, 120, 112, 96, 88, 62, 36, 11, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 5 */ + + 124, 18, 21, 124, 18, 21, 109, 65, 24, 20, + 20, 76, 112, 118, 32, 4, 53, 78, 39, 61, + 10, 10, 7, 4, 34, 94, 108, 94, 20, 73, + 107, 93, 19, 39, 61, 10, 55, 39, 42, 14, + 43, 69, 91, 2, 1, 43, 67, 0, 39, 75, + 1, 25, 51, 73, 16, 27, 21, 59, 32, 6, + 44, 0, 0, 0, 33, 47, 67, 11, 36, 2, + 54, 43, 113, 103, 43, 53, 25, 29, 43, 43, + 107, 15, 67, 43, 103, 39, 27, 41, 31, 20, + 2, 29, 103, 27, 73, 47, 107, 3, 29, 17, + 53, 18, 9, 9, 59, 41, 41, 33, 15, 3, + 11, 41, 5, 10, 7, 10, 11, 13, 10, 19, + 42, 9, 2, 30, 36, 46, 46, 24, 29, 23, + 31, 18, 7, 49, 7, 7, 19, 27, 25, 12, + 74, 14, 41, 27, 9, 15, 60, 74, 63, 3, + 5, 31, 2, 2, 7, 21, 66, 112, 82, 7, + 15, 105, 42, 124, 124, 113, 46, 42, 54, 62, + 60, 50, 38, 46, 56, 32, 46, 30, 16, 10, + 4, 34, 32, 34, 26, 12, 20, 44, 24, 0, + 24, 30, 11, 48, 3, 124, 124, 116, 92, 110, + 124, 124, 124, 82, 58, 46, 46, 76, 68, 31, + 46, 54, 22, 84, 46, 46, 58, 70, 68, 84, + 78, 28, 59, 6, 4, 34, 111, 87, 23, 8, + 20, 8, 33, 41, 51, 49, 107, 59, 22, 0, + 7, 19, 29, 39, 65, 59, 103, 29, 36, 10, + 0, 7, 13, 23, 33, 49, 77, 27, 2, 13, + 12, 10, 19, 33, 31, 61, 25, 54, 12, 14, + 10, 9, 17, 35, 33, 39, 124, 113, 111, 97, + 109, 89, 91, 93, 89, 83, 91, 93, 97, 53, + 67, 63, 75, 95, 53, 2, 41, 51, 47, 25, + 15, 17, 3, 29, 3, 4, 7, 9, 0, 9, + 27, 1, 9, 25, 12, 6, 3, 10, 30, 4, + 13, 23, 33, 19, 3, 5, 10, 12, 11, 3, + 17, 16, 14, 21, 29, 25, 25, 28, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 84, 124, 124, 124, 124, 124, 110, 80, + 104, 100, 74, 38, 26, 10, 31, 124, 124, 124, + 104, 76, 78, 62, 62, 76, 70, 52, 24, 48, + 16, 9, 54, 64, 56, 66, 78, 52, 50, 32, + 30, 3, 14, 1, 29, 109, 
8, 2, 9, 48, + 16, 5, 27, 15, 11, 9, 11, 7, 8, 9, + 11, 7, 2, 26, 18, 28, 1, 124, 124, 116, + 108, 92, 84, 58, 32, 13, 9, 58, 44, 36, + 22, 16, 6, 1, 5, 23, 27, 15, 11, 9, + 11, 7, 8, 9, 11, 7, 2, 26, 18, 28, + 1, 124, 124, 116, 108, 92, 84, 58, 32, 13, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 6 */ + + 124, 18, 23, 124, 18, 23, 105, 63, 26, 20, + 20, 74, 108, 116, 32, 6, 49, 76, 39, 59, + 10, 8, 5, 2, 32, 92, 106, 86, 16, 75, + 103, 89, 19, 39, 59, 10, 53, 37, 44, 14, + 41, 67, 89, 1, 3, 43, 69, 0, 39, 75, + 1, 27, 49, 73, 14, 27, 21, 57, 32, 6, + 44, 0, 0, 0, 31, 47, 67, 11, 36, 0, + 54, 41, 111, 99, 41, 51, 21, 25, 41, 41, + 103, 13, 65, 43, 99, 41, 27, 41, 31, 22, + 2, 27, 99, 27, 71, 45, 103, 3, 29, 17, + 53, 20, 11, 9, 59, 41, 39, 33, 13, 3, + 11, 39, 3, 10, 7, 10, 11, 13, 10, 19, + 42, 9, 2, 30, 36, 48, 48, 26, 31, 21, + 29, 16, 7, 49, 7, 5, 19, 27, 23, 14, + 74, 18, 39, 29, 11, 15, 60, 74, 63, 5, + 5, 33, 2, 0, 5, 19, 66, 112, 84, 5, + 17, 103, 40, 124, 124, 109, 46, 42, 54, 60, + 58, 48, 38, 44, 54, 32, 46, 30, 14, 10, + 2, 32, 30, 32, 26, 12, 20, 44, 22, 3, + 24, 28, 11, 46, 5, 124, 124, 112, 88, 106, + 124, 124, 124, 78, 54, 42, 42, 72, 64, 35, + 44, 50, 18, 80, 44, 44, 56, 66, 64, 80, + 74, 24, 59, 4, 2, 32, 109, 85, 19, 10, + 20, 8, 31, 41, 51, 47, 105, 57, 24, 2, + 5, 19, 27, 37, 63, 57, 99, 29, 36, 10, + 0, 7, 11, 23, 31, 47, 75, 27, 6, 11, + 12, 10, 19, 31, 31, 61, 25, 56, 12, 16, + 10, 7, 17, 35, 31, 39, 124, 109, 107, 93, + 105, 85, 89, 89, 87, 81, 87, 89, 93, 53, + 65, 63, 75, 89, 53, 0, 41, 51, 45, 25, + 15, 17, 3, 29, 3, 2, 7, 9, 0, 9, + 27, 1, 9, 25, 12, 6, 3, 8, 28, 4, + 13, 23, 31, 21, 3, 3, 8, 12, 11, 3, + 15, 16, 14, 19, 27, 25, 25, 24, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 80, 124, 124, 124, 124, 124, 104, 76, + 100, 96, 70, 36, 24, 8, 31, 124, 124, 124, + 100, 72, 76, 58, 58, 72, 64, 48, 20, 44, + 12, 11, 52, 60, 52, 62, 74, 48, 46, 28, + 26, 5, 12, 3, 31, 107, 6, 
1, 13, 46, + 12, 7, 25, 15, 9, 9, 9, 5, 10, 9, + 9, 5, 4, 26, 20, 30, 0, 124, 124, 112, + 104, 88, 80, 54, 28, 15, 7, 58, 44, 38, + 22, 18, 6, 0, 3, 23, 25, 15, 9, 9, + 9, 5, 10, 9, 9, 5, 4, 26, 20, 30, + 0, 124, 124, 112, 104, 88, 80, 54, 28, 15, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 7 */ + + 124, 18, 23, 124, 18, 23, 101, 59, 26, 20, + 18, 70, 106, 114, 34, 6, 47, 72, 39, 55, + 8, 8, 3, 0, 30, 90, 102, 80, 10, 77, + 99, 87, 19, 39, 55, 8, 53, 35, 44, 14, + 41, 65, 85, 3, 5, 43, 69, 0, 37, 73, + 3, 27, 49, 73, 14, 27, 21, 55, 34, 6, + 44, 0, 0, 0, 31, 47, 67, 9, 34, 0, + 52, 41, 109, 95, 39, 49, 19, 23, 39, 39, + 99, 11, 63, 41, 93, 41, 29, 41, 33, 22, + 2, 25, 93, 25, 71, 45, 99, 3, 29, 17, + 53, 20, 11, 7, 57, 41, 37, 31, 13, 3, + 9, 37, 3, 10, 9, 10, 11, 13, 10, 19, + 42, 9, 2, 32, 36, 48, 48, 26, 31, 21, + 29, 16, 7, 47, 7, 3, 17, 25, 19, 18, + 76, 20, 35, 29, 13, 13, 62, 74, 65, 5, + 3, 33, 4, 0, 5, 17, 66, 112, 84, 1, + 19, 99, 38, 124, 124, 107, 46, 42, 52, 60, + 58, 48, 36, 44, 54, 30, 44, 30, 14, 10, + 2, 32, 30, 30, 24, 12, 20, 42, 22, 5, + 22, 26, 11, 44, 5, 124, 124, 108, 86, 104, + 124, 124, 124, 76, 50, 38, 40, 68, 60, 37, + 42, 48, 16, 78, 42, 42, 52, 64, 60, 76, + 72, 22, 59, 2, 0, 28, 105, 81, 17, 10, + 20, 8, 29, 39, 49, 47, 101, 53, 26, 4, + 5, 17, 23, 35, 59, 55, 93, 27, 38, 12, + 2, 5, 9, 21, 31, 45, 73, 27, 8, 9, + 12, 10, 17, 31, 31, 59, 23, 56, 14, 16, + 10, 5, 15, 33, 31, 37, 124, 107, 105, 91, + 103, 83, 85, 87, 83, 77, 83, 85, 87, 51, + 63, 61, 73, 83, 51, 1, 39, 49, 43, 25, + 15, 17, 3, 27, 5, 0, 7, 9, 2, 9, + 27, 1, 9, 23, 10, 4, 3, 6, 26, 2, + 13, 23, 29, 23, 5, 1, 8, 10, 11, 3, + 15, 14, 12, 19, 27, 25, 25, 22, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 124, 76, 124, 124, 124, 124, 124, 100, 72, + 96, 90, 66, 34, 22, 6, 31, 124, 122, 124, + 96, 68, 72, 56, 54, 68, 60, 44, 16, 40, + 10, 15, 50, 58, 48, 60, 72, 46, 42, 26, + 24, 7, 10, 5, 31, 105, 2, 3, 15, 42, + 10, 9, 
25, 13, 9, 7, 7, 3, 10, 7, + 7, 3, 6, 28, 22, 30, 0, 124, 120, 108, + 100, 84, 76, 48, 24, 17, 7, 60, 46, 38, + 24, 20, 8, 0, 3, 21, 25, 13, 9, 7, + 7, 3, 10, 7, 7, 3, 6, 28, 22, 30, + 0, 124, 120, 108, 100, 84, 76, 48, 24, 17, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 8 */ + + 124, 16, 23, 124, 16, 23, 99, 57, 26, 20, + 18, 66, 102, 112, 34, 6, 43, 68, 39, 53, + 8, 6, 1, 1, 26, 86, 98, 72, 6, 79, + 95, 85, 19, 39, 53, 8, 53, 35, 44, 12, + 39, 63, 83, 5, 9, 45, 71, 0, 37, 71, + 3, 29, 49, 73, 14, 27, 21, 55, 34, 6, + 44, 0, 0, 0, 29, 49, 67, 9, 32, 0, + 52, 39, 107, 91, 37, 49, 15, 19, 39, 37, + 95, 11, 61, 39, 89, 43, 29, 43, 33, 22, + 0, 25, 89, 25, 69, 43, 97, 3, 29, 17, + 53, 20, 11, 7, 55, 41, 37, 31, 13, 3, + 9, 35, 1, 10, 9, 8, 11, 13, 8, 19, + 42, 9, 2, 32, 36, 48, 48, 26, 33, 19, + 27, 14, 7, 47, 7, 1, 17, 25, 17, 20, + 78, 24, 31, 31, 15, 13, 62, 74, 67, 5, + 3, 35, 4, 0, 5, 15, 66, 112, 84, 0, + 21, 97, 36, 118, 124, 105, 44, 42, 52, 58, + 56, 46, 36, 44, 52, 28, 42, 28, 14, 8, + 0, 30, 28, 28, 24, 10, 18, 40, 20, 7, + 20, 24, 11, 40, 7, 124, 124, 104, 82, 100, + 120, 124, 124, 74, 46, 32, 36, 62, 56, 41, + 38, 46, 12, 76, 40, 40, 50, 60, 56, 72, + 68, 18, 59, 0, 1, 24, 103, 79, 15, 12, + 20, 8, 29, 37, 47, 45, 97, 51, 26, 4, + 3, 17, 21, 33, 57, 53, 89, 27, 38, 12, + 2, 5, 9, 21, 29, 45, 71, 27, 10, 7, + 12, 10, 15, 29, 31, 57, 21, 56, 16, 16, + 10, 3, 15, 31, 29, 37, 124, 105, 101, 87, + 99, 81, 83, 83, 81, 75, 81, 81, 83, 51, + 61, 61, 71, 77, 51, 3, 39, 47, 43, 25, + 15, 17, 5, 27, 5, 1, 7, 9, 2, 9, + 27, 3, 9, 23, 10, 4, 5, 4, 24, 2, + 15, 23, 27, 25, 5, 1, 6, 10, 11, 5, + 15, 14, 12, 19, 25, 25, 25, 18, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 122, 72, 124, 124, 124, 124, 122, 96, 68, + 90, 86, 62, 30, 18, 4, 31, 122, 118, 124, + 92, 62, 68, 52, 48, 64, 56, 40, 12, 34, + 6, 17, 46, 56, 42, 56, 68, 42, 38, 22, + 20, 9, 8, 7, 31, 103, 0, 7, 19, 40, + 8, 11, 25, 13, 7, 7, 5, 3, 12, 7, + 
5, 3, 8, 28, 22, 32, 2, 122, 116, 104, + 96, 80, 72, 44, 20, 21, 5, 60, 46, 40, + 24, 20, 8, 2, 3, 21, 25, 13, 7, 7, + 5, 3, 12, 7, 5, 3, 8, 28, 22, 32, + 2, 122, 116, 104, 96, 80, 72, 44, 20, 21, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 9 */ + + 124, 16, 23, 124, 16, 23, 95, 55, 28, 20, + 18, 62, 98, 112, 36, 6, 39, 66, 37, 49, + 8, 6, 0, 1, 24, 84, 94, 66, 2, 79, + 91, 81, 17, 37, 49, 8, 51, 33, 46, 12, + 37, 61, 81, 7, 11, 45, 71, 0, 37, 69, + 3, 29, 49, 73, 14, 25, 19, 53, 34, 6, + 44, 0, 0, 0, 27, 49, 67, 9, 30, 0, + 52, 37, 103, 85, 35, 47, 11, 15, 37, 35, + 91, 9, 57, 37, 85, 45, 29, 43, 33, 24, + 0, 23, 85, 25, 67, 41, 93, 3, 27, 17, + 51, 22, 11, 5, 53, 39, 35, 31, 13, 1, + 7, 33, 0, 10, 9, 8, 9, 11, 8, 19, + 44, 9, 2, 32, 38, 50, 48, 28, 33, 17, + 25, 12, 7, 47, 7, 0, 17, 23, 15, 22, + 80, 28, 27, 33, 17, 11, 62, 76, 69, 5, + 3, 35, 6, 0, 5, 11, 66, 112, 84, 2, + 23, 95, 34, 114, 124, 101, 44, 42, 52, 56, + 56, 46, 36, 44, 52, 26, 40, 28, 14, 8, + 0, 30, 26, 28, 24, 10, 18, 38, 18, 9, + 20, 24, 11, 38, 7, 124, 124, 102, 80, 96, + 116, 124, 124, 72, 42, 28, 34, 58, 54, 43, + 36, 44, 8, 74, 38, 38, 48, 56, 54, 68, + 64, 16, 59, 0, 3, 22, 99, 75, 11, 14, + 20, 8, 27, 35, 45, 43, 93, 49, 28, 6, + 1, 15, 19, 31, 55, 51, 85, 25, 40, 14, + 4, 5, 7, 19, 27, 43, 67, 25, 14, 5, + 14, 10, 13, 27, 29, 55, 19, 58, 18, 18, + 12, 1, 13, 29, 27, 35, 124, 101, 97, 83, + 95, 79, 81, 79, 77, 71, 77, 77, 79, 49, + 59, 59, 69, 69, 51, 5, 39, 45, 41, 23, + 15, 17, 5, 27, 5, 3, 7, 9, 4, 9, + 27, 3, 9, 23, 10, 4, 5, 4, 22, 2, + 15, 21, 23, 25, 5, 0, 4, 10, 11, 5, + 13, 14, 12, 17, 23, 23, 23, 14, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 116, 68, 124, 124, 124, 124, 116, 92, 66, + 86, 82, 60, 28, 16, 2, 31, 118, 114, 120, + 88, 58, 64, 50, 44, 60, 52, 36, 8, 30, + 2, 19, 44, 54, 38, 54, 64, 40, 34, 20, + 18, 11, 6, 7, 31, 101, 1, 9, 23, 38, + 6, 13, 25, 11, 5, 5, 1, 1, 14, 7, + 3, 1, 10, 30, 24, 34, 4, 120, 114, 
100, + 92, 76, 68, 40, 16, 23, 3, 60, 48, 42, + 26, 22, 10, 4, 1, 19, 25, 11, 5, 5, + 1, 1, 14, 7, 3, 1, 10, 30, 24, 34, + 4, 120, 114, 100, 92, 76, 68, 40, 16, 23, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 10 */ + + 124, 16, 23, 124, 16, 23, 91, 51, 28, 20, + 16, 58, 96, 110, 38, 6, 37, 62, 37, 47, + 6, 6, 2, 3, 22, 82, 90, 60, 3, 81, + 87, 79, 17, 37, 47, 6, 51, 31, 46, 12, + 37, 59, 77, 9, 13, 45, 71, 0, 35, 67, + 5, 29, 49, 73, 14, 25, 19, 51, 36, 6, + 44, 0, 0, 0, 27, 49, 67, 7, 28, 0, + 50, 37, 101, 81, 33, 45, 9, 13, 35, 33, + 87, 7, 55, 35, 79, 45, 31, 43, 35, 24, + 0, 21, 79, 23, 67, 41, 89, 3, 27, 17, + 51, 22, 11, 5, 51, 39, 33, 29, 13, 1, + 7, 31, 0, 10, 11, 8, 9, 11, 8, 19, + 44, 9, 2, 34, 38, 50, 48, 28, 35, 17, + 25, 12, 7, 45, 7, 2, 15, 23, 11, 26, + 82, 30, 23, 33, 19, 11, 64, 76, 71, 5, + 1, 37, 8, 0, 5, 9, 66, 112, 84, 6, + 25, 91, 32, 108, 124, 99, 44, 42, 50, 56, + 54, 46, 34, 44, 50, 24, 38, 28, 14, 8, + 0, 28, 26, 26, 22, 10, 18, 36, 18, 11, + 18, 22, 11, 36, 9, 120, 124, 98, 76, 94, + 112, 124, 124, 70, 38, 24, 30, 54, 50, 45, + 34, 42, 6, 72, 36, 36, 44, 54, 50, 64, + 62, 12, 59, 1, 5, 18, 97, 73, 9, 14, + 20, 8, 25, 33, 43, 43, 89, 45, 30, 8, + 1, 13, 15, 29, 51, 49, 79, 23, 40, 14, + 4, 3, 5, 17, 27, 41, 65, 25, 16, 3, + 14, 10, 11, 27, 29, 53, 17, 58, 20, 18, + 12, 0, 11, 27, 27, 33, 124, 99, 95, 81, + 93, 77, 77, 77, 73, 69, 73, 73, 73, 47, + 57, 57, 67, 63, 49, 7, 37, 43, 39, 23, + 15, 17, 5, 25, 7, 5, 7, 9, 6, 9, + 27, 3, 9, 21, 8, 2, 5, 2, 20, 0, + 15, 21, 21, 27, 7, 2, 4, 8, 11, 5, + 13, 12, 10, 17, 23, 23, 23, 12, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 112, 64, 124, 124, 124, 124, 110, 88, 62, + 82, 76, 56, 26, 14, 0, 31, 114, 108, 114, + 84, 54, 60, 46, 40, 56, 48, 32, 4, 26, + 0, 23, 42, 52, 34, 50, 62, 36, 30, 18, + 16, 13, 4, 9, 31, 99, 5, 11, 25, 34, + 4, 15, 25, 11, 5, 3, 0, 0, 14, 5, + 1, 0, 12, 30, 26, 34, 4, 120, 110, 96, + 88, 72, 64, 34, 12, 25, 3, 62, 
48, 42, + 28, 24, 10, 4, 1, 17, 25, 11, 5, 3, + 0, 0, 14, 5, 1, 0, 12, 30, 26, 34, + 4, 120, 110, 96, 88, 72, 64, 34, 12, 25, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 11 */ + + 124, 16, 25, 124, 16, 25, 87, 49, 30, 20, + 16, 56, 92, 108, 38, 8, 33, 60, 37, 43, + 6, 4, 4, 5, 20, 80, 88, 52, 7, 83, + 83, 75, 17, 37, 43, 6, 49, 29, 48, 12, + 35, 57, 75, 13, 15, 45, 73, 0, 35, 67, + 5, 31, 47, 73, 12, 25, 19, 49, 36, 6, + 44, 0, 0, 0, 25, 49, 67, 7, 28, 1, + 50, 35, 99, 77, 31, 43, 5, 9, 33, 31, + 83, 5, 53, 35, 75, 47, 31, 43, 35, 26, + 0, 19, 75, 23, 65, 39, 85, 3, 27, 17, + 51, 24, 13, 3, 51, 39, 31, 29, 11, 1, + 5, 29, 2, 10, 11, 8, 9, 11, 8, 19, + 44, 9, 2, 34, 38, 52, 50, 30, 35, 15, + 23, 10, 7, 45, 7, 4, 15, 21, 9, 28, + 82, 34, 21, 35, 21, 9, 64, 76, 71, 7, + 1, 37, 8, 1, 3, 7, 66, 112, 86, 8, + 27, 89, 30, 102, 124, 95, 44, 42, 50, 54, + 54, 44, 34, 42, 50, 24, 38, 28, 12, 8, + 1, 28, 24, 24, 22, 10, 18, 36, 16, 15, + 18, 20, 11, 34, 9, 114, 124, 94, 74, 90, + 108, 124, 122, 66, 34, 20, 28, 50, 46, 49, + 32, 38, 2, 68, 34, 34, 42, 50, 46, 60, + 58, 10, 59, 3, 7, 16, 93, 69, 5, 16, + 20, 8, 23, 33, 43, 41, 87, 43, 32, 10, + 0, 13, 13, 27, 49, 47, 75, 23, 42, 16, + 6, 3, 3, 17, 25, 39, 63, 25, 20, 1, + 14, 10, 11, 25, 29, 53, 17, 60, 20, 20, + 12, 2, 11, 27, 25, 33, 124, 95, 91, 77, + 89, 73, 75, 73, 71, 65, 69, 69, 69, 47, + 55, 57, 67, 57, 49, 9, 37, 43, 37, 23, + 15, 17, 5, 25, 7, 7, 7, 9, 6, 9, + 27, 3, 9, 21, 8, 2, 5, 0, 18, 0, + 15, 21, 19, 29, 7, 4, 2, 8, 11, 5, + 11, 12, 10, 15, 21, 23, 23, 8, 124, 122, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 106, 60, 124, 124, 124, 124, 106, 82, 58, + 78, 72, 52, 24, 12, 1, 31, 110, 104, 110, + 80, 50, 58, 44, 36, 52, 42, 28, 0, 22, + 3, 25, 40, 48, 30, 48, 58, 34, 26, 14, + 12, 15, 2, 11, 33, 97, 7, 15, 29, 32, + 0, 17, 23, 9, 3, 3, 2, 2, 16, 5, + 0, 2, 14, 32, 28, 36, 6, 118, 106, 92, + 84, 68, 60, 30, 8, 27, 1, 62, 50, 44, + 28, 26, 12, 6, 0, 17, 23, 9, 3, 3, 
+ 2, 2, 16, 5, 0, 2, 14, 32, 28, 36, + 6, 118, 106, 92, 84, 68, 60, 30, 8, 27, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 12 */ + + 124, 16, 25, 124, 16, 25, 85, 45, 30, 20, + 14, 52, 90, 106, 40, 8, 29, 56, 37, 41, + 6, 4, 6, 7, 16, 76, 84, 46, 11, 85, + 79, 73, 17, 37, 41, 6, 49, 27, 48, 10, + 33, 55, 73, 15, 19, 47, 73, 0, 35, 65, + 5, 31, 47, 73, 12, 25, 19, 49, 38, 6, + 44, 0, 0, 0, 25, 51, 67, 5, 26, 1, + 48, 35, 97, 73, 29, 41, 3, 7, 33, 29, + 79, 3, 51, 33, 71, 49, 31, 43, 35, 26, + 1, 17, 71, 23, 63, 37, 81, 3, 27, 17, + 51, 24, 13, 3, 49, 39, 31, 29, 11, 1, + 5, 27, 2, 10, 11, 6, 9, 11, 8, 19, + 44, 9, 2, 36, 38, 52, 50, 30, 37, 15, + 21, 10, 7, 45, 7, 6, 15, 21, 7, 30, + 84, 36, 17, 35, 23, 9, 64, 76, 73, 7, + 0, 39, 10, 1, 3, 5, 66, 112, 86, 10, + 29, 85, 28, 96, 120, 93, 42, 42, 48, 52, + 52, 44, 32, 42, 48, 22, 36, 26, 12, 8, + 1, 26, 22, 22, 20, 8, 16, 34, 16, 17, + 16, 18, 11, 30, 11, 110, 124, 90, 70, 86, + 104, 124, 116, 64, 30, 14, 24, 46, 42, 51, + 28, 36, 1, 66, 32, 32, 40, 48, 42, 56, + 54, 6, 59, 5, 9, 12, 91, 67, 3, 16, + 20, 8, 21, 31, 41, 39, 83, 41, 34, 10, + 2, 11, 11, 25, 45, 45, 71, 21, 42, 16, + 6, 1, 1, 15, 25, 39, 61, 25, 22, 0, + 14, 10, 9, 25, 29, 51, 15, 60, 22, 20, + 12, 4, 9, 25, 25, 31, 124, 93, 89, 75, + 85, 71, 71, 71, 67, 63, 67, 65, 65, 45, + 53, 55, 65, 51, 49, 11, 37, 41, 37, 23, + 15, 17, 7, 23, 7, 9, 7, 9, 8, 9, + 27, 3, 9, 21, 6, 2, 5, 1, 16, 0, + 15, 21, 17, 31, 7, 4, 0, 8, 11, 5, + 11, 12, 8, 15, 19, 23, 23, 6, 124, 120, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 124, 100, 56, 124, 124, 124, 124, 100, 78, 54, + 74, 66, 48, 20, 10, 3, 31, 104, 100, 106, + 76, 46, 54, 40, 32, 48, 38, 24, 3, 16, + 7, 27, 38, 46, 24, 44, 54, 30, 22, 12, + 10, 17, 0, 13, 33, 95, 9, 17, 31, 30, + 1, 19, 23, 9, 3, 1, 4, 2, 18, 5, + 2, 4, 16, 32, 30, 36, 8, 118, 102, 88, + 80, 64, 56, 26, 4, 31, 1, 64, 50, 46, + 30, 26, 12, 6, 0, 15, 23, 9, 3, 1, + 4, 2, 18, 5, 2, 4, 16, 32, 30, 36, + 8, 118, 
102, 88, 80, 64, 56, 26, 4, 31, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 13 */ + + 124, 16, 25, 124, 16, 25, 81, 43, 30, 20, + 14, 48, 86, 104, 42, 8, 27, 52, 35, 37, + 4, 4, 8, 9, 14, 74, 80, 40, 17, 85, + 75, 71, 17, 35, 37, 4, 49, 25, 48, 10, + 33, 53, 69, 17, 21, 47, 73, 0, 33, 63, + 7, 31, 47, 73, 12, 23, 19, 47, 38, 6, + 44, 0, 0, 0, 23, 51, 67, 5, 24, 1, + 48, 33, 93, 67, 27, 39, 0, 3, 31, 27, + 75, 1, 49, 31, 65, 49, 33, 43, 37, 26, + 1, 15, 65, 21, 63, 37, 77, 3, 27, 17, + 49, 24, 13, 1, 47, 37, 29, 27, 11, 0, + 3, 25, 4, 10, 13, 6, 7, 11, 8, 19, + 46, 9, 2, 36, 40, 52, 50, 30, 37, 13, + 21, 8, 7, 43, 7, 8, 13, 19, 3, 34, + 86, 40, 13, 37, 25, 7, 66, 76, 75, 7, + 0, 39, 12, 1, 3, 1, 66, 112, 86, 14, + 31, 83, 26, 92, 114, 91, 42, 42, 48, 52, + 52, 44, 32, 42, 48, 20, 34, 26, 12, 8, + 1, 26, 22, 20, 20, 8, 16, 32, 14, 19, + 14, 18, 11, 28, 11, 106, 124, 88, 68, 84, + 100, 124, 112, 62, 26, 10, 22, 42, 38, 53, + 26, 34, 3, 64, 30, 30, 36, 44, 38, 52, + 52, 4, 59, 7, 11, 8, 87, 63, 1, 18, + 20, 8, 19, 29, 39, 39, 79, 37, 36, 12, + 2, 9, 7, 23, 43, 43, 65, 19, 44, 18, + 8, 1, 0, 13, 23, 37, 59, 23, 24, 2, + 14, 10, 7, 23, 27, 49, 13, 60, 24, 20, + 12, 6, 7, 23, 23, 29, 124, 91, 85, 71, + 83, 69, 69, 67, 63, 59, 63, 61, 59, 43, + 51, 53, 63, 45, 47, 13, 35, 39, 35, 23, + 15, 17, 7, 23, 9, 11, 7, 9, 10, 9, + 27, 3, 9, 19, 6, 0, 5, 3, 14, 1, + 15, 21, 15, 31, 9, 6, 0, 6, 11, 5, + 11, 10, 8, 15, 19, 23, 23, 2, 124, 118, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, + 120, 96, 52, 124, 124, 124, 124, 94, 74, 52, + 70, 62, 44, 18, 8, 5, 31, 100, 94, 100, + 72, 42, 50, 38, 28, 44, 34, 20, 7, 12, + 9, 31, 36, 44, 20, 42, 52, 28, 18, 10, + 8, 19, 1, 15, 33, 93, 13, 19, 35, 26, + 3, 21, 23, 7, 1, 0, 6, 4, 18, 3, + 4, 6, 18, 34, 32, 38, 8, 116, 98, 84, + 76, 60, 52, 20, 0, 33, 0, 64, 52, 46, + 32, 28, 14, 8, 0, 13, 23, 7, 1, 0, + 6, 4, 18, 3, 4, 6, 18, 34, 32, 38, + 8, 116, 98, 84, 76, 60, 52, 20, 0, 33, + }, + + { + /* Context 
Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 14 */ + + 122, 16, 25, 122, 16, 25, 77, 39, 32, 20, + 12, 44, 84, 102, 42, 8, 23, 50, 35, 35, + 4, 2, 10, 11, 12, 72, 76, 32, 21, 87, + 71, 67, 17, 35, 35, 4, 47, 23, 50, 10, + 31, 51, 67, 19, 23, 47, 75, 0, 33, 61, + 7, 33, 47, 73, 12, 23, 19, 45, 40, 6, + 44, 0, 0, 0, 23, 51, 67, 3, 22, 1, + 46, 33, 91, 63, 25, 37, 2, 1, 29, 25, + 71, 0, 47, 29, 61, 51, 33, 43, 37, 28, + 1, 13, 61, 21, 61, 35, 73, 3, 27, 17, + 49, 26, 13, 1, 45, 37, 27, 27, 11, 0, + 3, 23, 4, 10, 13, 6, 7, 11, 8, 19, + 46, 9, 2, 38, 40, 54, 50, 32, 39, 13, + 19, 8, 7, 43, 7, 10, 13, 19, 1, 36, + 88, 42, 9, 37, 27, 7, 66, 76, 77, 7, + 2, 41, 12, 1, 3, 0, 66, 112, 86, 16, + 33, 79, 24, 86, 108, 87, 42, 42, 46, 50, + 50, 42, 30, 42, 46, 18, 32, 26, 12, 8, + 3, 24, 20, 18, 18, 8, 16, 30, 14, 21, + 14, 16, 11, 26, 13, 102, 120, 84, 64, 80, + 96, 124, 106, 60, 22, 6, 18, 38, 34, 57, + 24, 32, 7, 62, 28, 28, 34, 42, 34, 48, + 48, 0, 59, 9, 13, 6, 85, 61, 2, 18, + 20, 8, 17, 27, 37, 37, 75, 35, 38, 14, + 4, 9, 5, 21, 39, 41, 61, 19, 44, 18, + 8, 0, 2, 13, 23, 35, 57, 23, 28, 4, + 14, 10, 5, 23, 27, 47, 11, 62, 26, 22, + 12, 8, 7, 21, 23, 29, 124, 87, 83, 69, + 79, 67, 65, 65, 61, 57, 59, 57, 55, 43, + 49, 53, 61, 39, 47, 15, 35, 37, 33, 23, + 15, 17, 7, 21, 9, 13, 7, 9, 10, 9, + 27, 3, 9, 19, 4, 0, 5, 5, 12, 1, + 15, 21, 13, 33, 9, 8, 1, 6, 11, 5, + 9, 10, 6, 13, 17, 23, 23, 0, 124, 116, + 122, 122, 122, 124, 124, 124, 122, 124, 124, 124, + 114, 90, 48, 124, 120, 118, 120, 88, 70, 48, + 66, 56, 40, 16, 6, 7, 31, 96, 90, 96, + 68, 38, 46, 34, 24, 40, 30, 16, 11, 8, + 13, 33, 34, 42, 16, 38, 48, 24, 14, 6, + 4, 21, 3, 17, 33, 91, 15, 23, 37, 24, + 5, 23, 23, 7, 1, 0, 8, 6, 20, 3, + 6, 8, 20, 34, 34, 38, 10, 116, 94, 80, + 72, 56, 48, 16, 3, 35, 0, 66, 52, 48, + 32, 30, 14, 8, 2, 13, 23, 7, 1, 0, + 8, 6, 20, 3, 6, 8, 20, 34, 34, 38, + 10, 116, 94, 80, 72, 56, 48, 16, 3, 35, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 
15 */ + + 120, 16, 25, 120, 16, 25, 73, 37, 32, 20, + 12, 40, 80, 100, 44, 8, 19, 46, 35, 31, + 4, 2, 12, 13, 10, 70, 72, 26, 25, 89, + 67, 65, 17, 35, 31, 4, 47, 21, 50, 10, + 29, 49, 65, 21, 25, 47, 75, 0, 33, 59, + 7, 33, 47, 73, 12, 23, 19, 43, 40, 6, + 44, 0, 0, 0, 21, 51, 67, 3, 20, 1, + 46, 31, 89, 59, 23, 35, 6, 2, 27, 23, + 67, 2, 45, 27, 57, 53, 33, 43, 37, 28, + 1, 11, 57, 21, 59, 33, 69, 3, 27, 17, + 49, 26, 13, 0, 43, 37, 25, 27, 11, 0, + 1, 21, 6, 10, 13, 6, 7, 11, 8, 19, + 46, 9, 2, 38, 40, 54, 50, 32, 39, 11, + 17, 6, 7, 43, 7, 12, 13, 17, 0, 38, + 90, 46, 5, 39, 29, 5, 66, 76, 79, 7, + 2, 41, 14, 1, 3, 2, 66, 112, 86, 18, + 35, 77, 22, 80, 102, 85, 42, 42, 46, 48, + 50, 42, 30, 42, 46, 16, 30, 26, 12, 8, + 3, 24, 18, 16, 18, 8, 16, 28, 12, 23, + 12, 14, 11, 24, 13, 98, 116, 80, 62, 76, + 92, 118, 102, 58, 18, 2, 16, 34, 30, 59, + 22, 30, 11, 60, 26, 26, 32, 38, 30, 44, + 44, 1, 59, 11, 15, 2, 81, 57, 4, 20, + 20, 8, 15, 25, 35, 35, 71, 33, 40, 16, + 6, 7, 3, 19, 37, 39, 57, 17, 46, 20, + 10, 0, 4, 11, 21, 33, 55, 23, 30, 6, + 14, 10, 3, 21, 27, 45, 9, 62, 28, 22, + 12, 10, 5, 19, 21, 27, 124, 85, 79, 65, + 75, 65, 63, 61, 57, 53, 55, 53, 51, 41, + 47, 51, 59, 33, 47, 17, 35, 35, 31, 23, + 15, 17, 7, 21, 9, 15, 7, 9, 12, 9, + 27, 3, 9, 19, 4, 0, 5, 7, 10, 1, + 15, 21, 11, 35, 9, 10, 3, 6, 11, 5, + 9, 10, 6, 13, 15, 23, 23, 3, 122, 114, + 120, 118, 118, 124, 124, 124, 118, 120, 124, 122, + 108, 84, 44, 122, 114, 110, 110, 82, 66, 44, + 62, 52, 36, 14, 4, 9, 31, 92, 86, 92, + 64, 34, 42, 32, 20, 36, 26, 12, 15, 4, + 17, 35, 32, 40, 12, 36, 44, 22, 10, 4, + 2, 23, 5, 19, 33, 89, 17, 25, 41, 22, + 7, 25, 23, 5, 0, 2, 10, 8, 22, 3, + 8, 10, 22, 36, 36, 40, 12, 114, 90, 76, + 68, 52, 44, 12, 7, 37, 2, 66, 54, 50, + 34, 32, 16, 10, 2, 11, 23, 5, 0, 2, + 10, 8, 22, 3, 8, 10, 22, 36, 36, 40, + 12, 114, 90, 76, 68, 52, 44, 12, 7, 37, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 16 */ + + 116, 14, 27, 116, 14, 27, 71, 35, 32, 
20, + 10, 36, 76, 98, 44, 8, 17, 42, 35, 29, + 2, 0, 14, 15, 6, 66, 68, 18, 31, 91, + 63, 63, 17, 35, 29, 2, 47, 21, 50, 8, + 29, 49, 63, 25, 29, 49, 77, 1, 33, 59, + 9, 35, 47, 73, 10, 23, 19, 43, 40, 4, + 44, 0, 0, 0, 21, 53, 67, 3, 18, 3, + 44, 31, 87, 55, 21, 35, 8, 4, 27, 21, + 65, 2, 43, 27, 53, 55, 35, 45, 39, 28, + 3, 11, 53, 21, 59, 33, 67, 3, 27, 17, + 49, 26, 15, 0, 43, 37, 25, 27, 11, 0, + 1, 19, 6, 10, 15, 4, 7, 11, 6, 19, + 46, 9, 2, 38, 40, 54, 50, 32, 41, 11, + 17, 4, 7, 43, 9, 12, 13, 17, 2, 40, + 90, 48, 3, 41, 33, 5, 66, 76, 81, 9, + 2, 43, 14, 3, 3, 4, 66, 110, 86, 20, + 37, 75, 18, 74, 94, 83, 40, 42, 44, 46, + 48, 40, 28, 40, 44, 14, 28, 24, 10, 6, + 5, 22, 16, 14, 16, 6, 14, 26, 10, 27, + 10, 12, 11, 20, 15, 92, 110, 76, 58, 72, + 86, 110, 96, 54, 14, 3, 12, 28, 26, 63, + 18, 26, 15, 56, 24, 24, 28, 34, 26, 40, + 40, 5, 59, 13, 17, 1, 79, 55, 6, 20, + 20, 8, 15, 25, 35, 35, 69, 31, 40, 16, + 6, 7, 1, 17, 35, 39, 53, 17, 46, 20, + 10, 0, 4, 11, 21, 33, 53, 23, 32, 8, + 14, 8, 3, 21, 27, 45, 9, 62, 28, 22, + 12, 12, 5, 19, 21, 27, 124, 83, 77, 63, + 73, 63, 61, 59, 55, 51, 53, 51, 47, 41, + 47, 51, 59, 27, 47, 21, 35, 35, 31, 23, + 15, 17, 9, 21, 11, 17, 9, 9, 12, 11, + 27, 5, 9, 19, 2, 1, 7, 9, 8, 3, + 17, 21, 9, 37, 11, 10, 5, 4, 11, 7, + 9, 8, 4, 13, 15, 23, 23, 7, 118, 112, + 116, 114, 112, 124, 124, 124, 112, 114, 124, 116, + 100, 78, 40, 114, 106, 102, 98, 76, 60, 40, + 56, 46, 32, 10, 0, 11, 31, 86, 80, 86, + 60, 28, 38, 28, 14, 32, 20, 8, 21, 1, + 21, 39, 28, 36, 6, 32, 40, 18, 6, 0, + 1, 25, 7, 21, 35, 87, 21, 29, 45, 18, + 11, 29, 23, 5, 0, 2, 12, 8, 22, 3, + 10, 10, 24, 36, 36, 40, 12, 112, 86, 72, + 62, 46, 40, 6, 11, 41, 2, 66, 54, 50, + 34, 32, 16, 10, 2, 11, 23, 5, 0, 2, + 12, 8, 22, 3, 10, 10, 24, 36, 36, 40, + 12, 112, 86, 72, 62, 46, 40, 6, 11, 41, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 17 */ + + 114, 14, 27, 114, 14, 27, 67, 31, 34, 22, + 10, 34, 74, 98, 46, 10, 13, 40, 33, 25, + 2, 0, 
18, 15, 4, 64, 66, 12, 35, 91, + 59, 59, 15, 33, 25, 2, 45, 19, 52, 8, + 27, 47, 59, 27, 31, 49, 77, 1, 31, 57, + 9, 35, 45, 71, 10, 21, 17, 41, 42, 4, + 44, 0, 0, 0, 19, 53, 67, 1, 18, 3, + 44, 29, 83, 49, 17, 33, 12, 8, 25, 17, + 61, 4, 39, 25, 47, 55, 35, 45, 39, 30, + 3, 9, 47, 19, 57, 31, 63, 3, 25, 17, + 47, 28, 15, 2, 41, 35, 23, 25, 9, 2, + 0, 17, 8, 12, 15, 4, 5, 9, 6, 17, + 48, 7, 2, 40, 42, 56, 52, 34, 41, 9, + 15, 4, 7, 41, 9, 14, 11, 15, 6, 44, + 92, 52, 0, 41, 35, 3, 68, 78, 81, 9, + 4, 43, 16, 3, 1, 8, 66, 110, 88, 24, + 39, 71, 16, 70, 88, 79, 40, 42, 44, 46, + 48, 40, 28, 40, 44, 14, 28, 24, 10, 6, + 5, 22, 16, 14, 16, 6, 14, 26, 10, 29, + 10, 12, 11, 18, 15, 88, 106, 74, 56, 70, + 82, 104, 92, 52, 12, 7, 10, 24, 24, 65, + 16, 24, 17, 54, 24, 24, 26, 32, 24, 38, + 38, 7, 59, 13, 17, 3, 75, 51, 10, 22, + 20, 8, 13, 23, 33, 33, 65, 27, 42, 18, + 8, 5, 2, 15, 31, 37, 47, 15, 48, 22, + 12, 2, 6, 9, 19, 31, 49, 21, 36, 10, + 16, 8, 1, 19, 25, 43, 7, 64, 30, 24, + 14, 14, 3, 17, 19, 25, 124, 79, 73, 59, + 69, 59, 57, 55, 51, 47, 49, 47, 41, 39, + 45, 49, 57, 19, 45, 23, 33, 33, 29, 21, + 15, 15, 9, 19, 11, 17, 9, 9, 14, 11, + 27, 5, 9, 17, 2, 1, 7, 9, 8, 3, + 17, 19, 5, 37, 11, 12, 5, 4, 11, 7, + 7, 8, 4, 11, 13, 21, 21, 9, 116, 110, + 114, 112, 108, 120, 120, 118, 108, 110, 118, 112, + 94, 74, 36, 108, 100, 96, 88, 72, 56, 38, + 52, 42, 30, 8, 1, 11, 31, 82, 76, 82, + 56, 24, 36, 26, 10, 30, 16, 6, 25, 5, + 23, 41, 26, 34, 2, 30, 38, 16, 4, 1, + 3, 27, 7, 21, 35, 85, 23, 31, 47, 16, + 13, 31, 21, 3, 2, 4, 16, 10, 24, 1, + 14, 12, 26, 38, 38, 42, 14, 112, 84, 70, + 58, 42, 38, 2, 13, 43, 4, 68, 56, 52, + 36, 34, 18, 12, 4, 9, 21, 3, 2, 4, + 16, 10, 24, 1, 14, 12, 26, 38, 38, 42, + 14, 112, 84, 70, 58, 42, 38, 2, 13, 43, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 18 */ + + 112, 14, 27, 112, 14, 27, 63, 29, 34, 22, + 10, 30, 70, 96, 48, 10, 9, 36, 33, 23, + 2, 0, 20, 17, 2, 62, 62, 6, 39, 93, + 55, 57, 15, 33, 23, 2, 45, 
17, 52, 8, + 25, 45, 57, 29, 33, 49, 77, 1, 31, 55, + 9, 35, 45, 71, 10, 21, 17, 39, 42, 4, + 44, 0, 0, 0, 17, 53, 67, 1, 16, 3, + 44, 27, 81, 45, 15, 31, 16, 12, 23, 15, + 57, 6, 37, 23, 43, 57, 35, 45, 39, 30, + 3, 7, 43, 19, 55, 29, 59, 3, 25, 17, + 47, 28, 15, 2, 39, 35, 21, 25, 9, 2, + 0, 15, 10, 12, 15, 4, 5, 9, 6, 17, + 48, 7, 2, 40, 42, 56, 52, 34, 43, 7, + 13, 2, 7, 41, 9, 16, 11, 15, 8, 46, + 94, 56, 4, 43, 37, 3, 68, 78, 83, 9, + 4, 45, 18, 3, 1, 10, 66, 110, 88, 26, + 41, 69, 14, 64, 82, 77, 40, 42, 44, 44, + 46, 40, 28, 40, 42, 12, 26, 24, 10, 6, + 5, 20, 14, 12, 16, 6, 14, 24, 8, 31, + 8, 10, 11, 16, 17, 84, 102, 70, 52, 66, + 78, 98, 88, 50, 8, 11, 6, 20, 20, 67, + 14, 22, 21, 52, 22, 22, 24, 28, 20, 34, + 34, 11, 59, 15, 19, 7, 73, 49, 12, 24, + 20, 8, 11, 21, 31, 31, 61, 25, 44, 20, + 10, 3, 4, 13, 29, 35, 43, 13, 48, 22, + 12, 2, 8, 7, 17, 29, 47, 21, 38, 12, + 16, 8, 0, 17, 25, 41, 5, 64, 32, 24, + 14, 16, 1, 15, 17, 23, 124, 77, 69, 55, + 65, 57, 55, 51, 47, 45, 45, 43, 37, 37, + 43, 47, 55, 13, 45, 25, 33, 31, 27, 21, + 15, 15, 9, 19, 11, 19, 9, 9, 16, 11, + 27, 5, 9, 17, 2, 1, 7, 11, 6, 3, + 17, 19, 3, 39, 11, 14, 7, 4, 11, 7, + 7, 8, 4, 11, 11, 21, 21, 13, 114, 108, + 112, 108, 104, 114, 114, 112, 104, 104, 112, 106, + 88, 68, 32, 100, 92, 88, 78, 66, 52, 34, + 48, 38, 26, 6, 3, 13, 31, 78, 72, 78, + 52, 20, 32, 22, 6, 26, 12, 2, 29, 9, + 27, 43, 24, 32, 1, 26, 34, 12, 0, 3, + 5, 29, 9, 23, 35, 83, 25, 33, 51, 14, + 15, 33, 21, 3, 4, 6, 18, 12, 26, 1, + 16, 14, 28, 38, 40, 44, 16, 110, 80, 66, + 54, 38, 34, 1, 17, 45, 6, 68, 56, 54, + 38, 36, 18, 14, 4, 7, 21, 3, 4, 6, + 18, 12, 26, 1, 16, 14, 28, 38, 40, 44, + 16, 110, 80, 66, 54, 38, 34, 1, 17, 45, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 19 */ + + 110, 14, 27, 110, 14, 27, 59, 25, 36, 22, + 8, 26, 68, 94, 48, 10, 5, 34, 33, 19, + 2, 1, 22, 19, 0, 60, 58, 1, 43, 95, + 51, 53, 15, 33, 19, 2, 43, 15, 54, 8, + 23, 43, 55, 31, 35, 49, 79, 1, 31, 53, + 9, 
37, 45, 71, 10, 21, 17, 37, 44, 4, + 44, 0, 0, 0, 17, 53, 67, 0, 14, 3, + 42, 27, 79, 41, 13, 29, 18, 14, 21, 13, + 53, 8, 35, 21, 39, 59, 35, 45, 39, 32, + 3, 5, 39, 19, 53, 27, 55, 3, 25, 17, + 47, 30, 15, 4, 37, 35, 19, 25, 9, 2, + 2, 13, 10, 12, 15, 4, 5, 9, 6, 17, + 48, 7, 2, 42, 42, 58, 52, 36, 43, 7, + 11, 2, 7, 41, 9, 18, 11, 13, 10, 48, + 96, 58, 8, 43, 39, 1, 68, 78, 85, 9, + 6, 45, 18, 3, 1, 12, 66, 110, 88, 28, + 43, 65, 12, 58, 76, 73, 40, 42, 42, 42, + 46, 38, 26, 40, 42, 10, 24, 24, 10, 6, + 7, 20, 12, 10, 14, 6, 14, 22, 8, 33, + 8, 8, 11, 14, 17, 80, 98, 66, 50, 62, + 74, 92, 82, 48, 4, 15, 4, 16, 16, 71, + 12, 20, 25, 50, 20, 20, 22, 26, 16, 30, + 30, 13, 59, 17, 21, 9, 69, 45, 16, 24, + 20, 8, 9, 19, 29, 29, 57, 23, 46, 22, + 12, 3, 6, 11, 25, 33, 39, 13, 50, 24, + 14, 4, 10, 7, 17, 27, 45, 21, 42, 14, + 16, 8, 2, 17, 25, 39, 3, 66, 34, 26, + 14, 18, 1, 13, 17, 23, 124, 73, 67, 53, + 61, 55, 51, 49, 45, 41, 41, 39, 33, 37, + 41, 47, 53, 7, 45, 27, 33, 29, 25, 21, + 15, 15, 9, 17, 11, 21, 9, 9, 16, 11, + 27, 5, 9, 17, 0, 1, 7, 13, 4, 3, + 17, 19, 1, 41, 11, 16, 9, 4, 11, 7, + 5, 8, 2, 9, 9, 21, 21, 15, 112, 106, + 110, 104, 100, 110, 110, 106, 98, 98, 106, 100, + 82, 62, 28, 92, 86, 80, 68, 60, 48, 30, + 44, 32, 22, 4, 5, 15, 31, 74, 68, 74, + 48, 16, 28, 20, 2, 22, 8, 1, 33, 13, + 31, 45, 22, 30, 5, 24, 30, 10, 3, 7, + 9, 31, 11, 25, 35, 81, 27, 37, 53, 12, + 17, 35, 21, 1, 4, 6, 20, 14, 28, 1, + 18, 16, 30, 40, 42, 44, 18, 110, 76, 62, + 50, 34, 30, 5, 21, 47, 6, 70, 58, 56, + 38, 38, 20, 14, 6, 7, 21, 1, 4, 6, + 20, 14, 28, 1, 18, 16, 30, 40, 42, 44, + 18, 110, 76, 62, 50, 34, 30, 5, 21, 47, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 20 */ + + 106, 14, 27, 106, 14, 27, 57, 23, 36, 22, + 8, 22, 64, 92, 50, 10, 3, 30, 33, 17, + 0, 1, 24, 21, 3, 56, 54, 7, 49, 97, + 47, 51, 15, 33, 17, 0, 43, 13, 54, 6, + 23, 41, 51, 33, 39, 51, 79, 1, 29, 51, + 11, 37, 45, 71, 10, 21, 17, 37, 44, 4, + 44, 0, 0, 0, 15, 55, 67, 0, 
12, 3, + 42, 25, 77, 37, 11, 27, 22, 18, 21, 11, + 49, 10, 33, 19, 33, 59, 37, 45, 41, 32, + 5, 3, 33, 17, 53, 27, 51, 3, 25, 17, + 47, 30, 15, 4, 35, 35, 19, 23, 9, 2, + 2, 11, 12, 12, 17, 2, 5, 9, 6, 17, + 48, 7, 2, 42, 42, 58, 52, 36, 45, 5, + 11, 0, 7, 39, 9, 20, 9, 13, 14, 52, + 98, 62, 12, 45, 41, 1, 70, 78, 87, 9, + 6, 47, 20, 3, 1, 14, 66, 110, 88, 32, + 45, 63, 10, 52, 70, 71, 38, 42, 42, 42, + 44, 38, 26, 40, 40, 8, 22, 22, 10, 6, + 7, 18, 12, 8, 14, 4, 12, 20, 6, 35, + 6, 6, 11, 10, 19, 76, 94, 62, 46, 60, + 70, 84, 78, 46, 0, 21, 0, 12, 12, 73, + 8, 18, 27, 48, 18, 18, 18, 22, 12, 26, + 28, 17, 59, 19, 23, 13, 67, 43, 18, 26, + 20, 8, 7, 17, 27, 29, 53, 19, 48, 22, + 12, 1, 10, 9, 23, 31, 33, 11, 50, 24, + 14, 4, 12, 5, 15, 27, 43, 21, 44, 16, + 16, 8, 4, 15, 25, 37, 1, 66, 36, 26, + 14, 20, 0, 11, 15, 21, 124, 71, 63, 49, + 59, 53, 49, 45, 41, 39, 39, 35, 27, 35, + 39, 45, 51, 1, 43, 29, 31, 27, 25, 21, + 15, 15, 11, 17, 13, 23, 9, 9, 18, 11, + 27, 5, 9, 15, 0, 3, 7, 15, 2, 5, + 17, 19, 0, 43, 13, 16, 9, 2, 11, 7, + 5, 6, 2, 9, 9, 21, 21, 19, 110, 104, + 108, 102, 94, 104, 104, 100, 94, 92, 98, 94, + 74, 58, 24, 84, 78, 72, 58, 54, 44, 26, + 40, 28, 18, 0, 7, 17, 31, 68, 62, 68, + 44, 12, 24, 16, 1, 18, 4, 5, 37, 19, + 33, 49, 20, 28, 11, 20, 28, 6, 7, 9, + 11, 33, 13, 27, 35, 79, 31, 39, 57, 8, + 19, 37, 21, 1, 6, 8, 22, 14, 28, 0, + 20, 18, 32, 40, 44, 46, 18, 108, 72, 58, + 46, 30, 26, 11, 25, 51, 8, 70, 58, 56, + 40, 38, 20, 16, 6, 5, 21, 1, 6, 8, + 22, 14, 28, 0, 20, 18, 32, 40, 44, 46, + 18, 108, 72, 58, 46, 30, 26, 11, 25, 51, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 21 */ + + 104, 14, 27, 104, 14, 27, 53, 19, 36, 22, + 6, 18, 62, 90, 52, 10, 0, 26, 31, 13, + 0, 1, 26, 23, 5, 54, 50, 13, 53, 97, + 43, 49, 15, 31, 13, 0, 43, 11, 54, 6, + 21, 39, 49, 35, 41, 51, 79, 1, 29, 49, + 11, 37, 45, 71, 10, 19, 17, 35, 46, 4, + 44, 0, 0, 0, 15, 55, 67, 2, 10, 3, + 40, 25, 73, 31, 9, 25, 24, 20, 19, 9, + 45, 12, 31, 17, 
29, 61, 37, 45, 41, 32, + 5, 1, 29, 17, 51, 25, 47, 3, 25, 17, + 45, 30, 15, 6, 33, 33, 17, 23, 9, 4, + 4, 9, 12, 12, 17, 2, 3, 9, 6, 17, + 50, 7, 2, 44, 44, 58, 52, 36, 45, 5, + 9, 0, 7, 39, 9, 22, 9, 11, 16, 54, + 100, 64, 16, 45, 43, 0, 70, 78, 89, 9, + 8, 47, 22, 3, 1, 18, 66, 110, 88, 34, + 47, 59, 8, 48, 64, 69, 38, 42, 40, 40, + 44, 38, 24, 40, 40, 6, 20, 22, 10, 6, + 7, 18, 10, 6, 12, 4, 12, 18, 6, 37, + 4, 6, 11, 8, 19, 72, 90, 60, 44, 56, + 66, 78, 72, 44, 3, 25, 1, 8, 8, 75, + 6, 16, 31, 46, 16, 16, 16, 20, 8, 22, + 24, 19, 59, 21, 25, 17, 63, 39, 20, 26, + 20, 8, 5, 15, 25, 27, 49, 17, 50, 24, + 14, 0, 12, 7, 19, 29, 29, 9, 52, 26, + 16, 6, 14, 3, 15, 25, 41, 19, 46, 18, + 16, 8, 6, 15, 23, 35, 0, 66, 38, 26, + 14, 22, 2, 9, 15, 19, 124, 69, 61, 47, + 55, 51, 45, 43, 37, 35, 35, 31, 23, 33, + 37, 43, 49, 4, 43, 31, 31, 25, 23, 21, + 15, 15, 11, 15, 13, 25, 9, 9, 20, 11, + 27, 5, 9, 15, 1, 3, 7, 17, 0, 5, + 17, 19, 2, 43, 13, 18, 11, 2, 11, 7, + 5, 6, 0, 9, 7, 21, 21, 21, 108, 102, + 106, 98, 90, 100, 98, 94, 88, 86, 92, 88, + 68, 52, 20, 76, 72, 64, 48, 48, 40, 24, + 36, 22, 14, 1, 9, 19, 31, 64, 58, 64, + 40, 8, 20, 14, 5, 14, 0, 9, 41, 23, + 37, 51, 18, 26, 15, 18, 24, 4, 11, 11, + 13, 35, 15, 29, 35, 77, 33, 41, 59, 6, + 21, 39, 21, 0, 6, 10, 24, 16, 30, 0, + 22, 20, 34, 42, 46, 46, 20, 108, 68, 54, + 42, 26, 22, 15, 29, 53, 8, 72, 60, 58, + 42, 40, 22, 16, 6, 3, 21, 0, 6, 10, + 24, 16, 30, 0, 22, 20, 34, 42, 46, 46, + 20, 108, 68, 54, 42, 26, 22, 15, 29, 53, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 22 */ + + 102, 14, 29, 102, 14, 29, 49, 17, 38, 22, + 6, 16, 58, 88, 52, 12, 4, 24, 31, 11, + 0, 3, 28, 25, 7, 52, 48, 21, 57, 99, + 39, 45, 15, 31, 11, 0, 41, 9, 56, 6, + 19, 37, 47, 39, 43, 51, 81, 1, 29, 49, + 11, 39, 43, 71, 8, 19, 17, 33, 46, 4, + 44, 0, 0, 0, 13, 55, 67, 2, 10, 5, + 40, 23, 71, 27, 7, 23, 28, 24, 17, 7, + 41, 14, 29, 17, 25, 63, 37, 45, 41, 34, + 5, 0, 25, 17, 49, 23, 43, 3, 25, 17, + 45, 32, 
17, 6, 33, 33, 15, 23, 7, 4, + 4, 7, 14, 12, 17, 2, 3, 9, 6, 17, + 50, 7, 2, 44, 44, 60, 54, 38, 47, 3, + 7, 1, 7, 39, 9, 24, 9, 11, 18, 56, + 100, 68, 18, 47, 45, 0, 70, 78, 89, 11, + 8, 49, 22, 5, 0, 20, 66, 110, 90, 36, + 49, 57, 6, 42, 58, 65, 38, 42, 40, 38, + 42, 36, 24, 38, 38, 6, 20, 22, 8, 6, + 9, 16, 8, 4, 12, 4, 12, 18, 4, 41, + 4, 4, 11, 6, 21, 66, 86, 56, 40, 52, + 62, 72, 68, 40, 7, 29, 5, 4, 4, 79, + 4, 12, 35, 42, 14, 14, 14, 16, 4, 18, + 20, 23, 59, 23, 27, 19, 61, 37, 24, 28, + 20, 8, 3, 15, 25, 25, 47, 15, 52, 26, + 16, 0, 14, 5, 17, 27, 25, 9, 52, 26, + 16, 6, 16, 3, 13, 23, 39, 19, 50, 20, + 16, 8, 6, 13, 23, 35, 0, 68, 38, 28, + 14, 24, 2, 9, 13, 19, 124, 65, 57, 43, + 51, 47, 43, 39, 35, 33, 31, 27, 19, 33, + 35, 43, 49, 10, 43, 33, 31, 25, 21, 21, + 15, 15, 11, 15, 13, 27, 9, 9, 20, 11, + 27, 5, 9, 15, 1, 3, 7, 19, 1, 5, + 17, 19, 4, 45, 13, 20, 13, 2, 11, 7, + 3, 6, 0, 7, 5, 21, 21, 25, 106, 100, + 104, 94, 86, 94, 94, 88, 84, 80, 86, 82, + 62, 46, 16, 70, 64, 56, 38, 44, 34, 20, + 32, 18, 10, 3, 11, 21, 31, 60, 54, 60, + 36, 4, 18, 10, 9, 10, 5, 13, 45, 27, + 41, 53, 16, 22, 19, 14, 20, 0, 15, 15, + 17, 37, 17, 31, 37, 75, 35, 45, 63, 4, + 25, 41, 19, 0, 8, 10, 26, 18, 32, 0, + 24, 22, 36, 42, 48, 48, 22, 106, 64, 50, + 38, 22, 18, 19, 33, 55, 10, 72, 60, 60, + 42, 42, 22, 18, 8, 3, 19, 0, 8, 10, + 26, 18, 32, 0, 24, 22, 36, 42, 48, 48, + 22, 106, 64, 50, 38, 22, 18, 19, 33, 55, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 23 */ + + 100, 14, 29, 100, 14, 29, 45, 13, 38, 22, + 4, 12, 56, 86, 54, 12, 6, 20, 31, 7, + 1, 3, 30, 27, 9, 50, 44, 27, 63, 101, + 35, 43, 15, 31, 7, 1, 41, 7, 56, 6, + 19, 35, 43, 41, 45, 51, 81, 1, 27, 47, + 13, 39, 43, 71, 8, 19, 17, 31, 48, 4, + 44, 0, 0, 0, 13, 55, 67, 4, 8, 5, + 38, 23, 69, 23, 5, 21, 30, 26, 15, 5, + 37, 16, 27, 15, 19, 63, 39, 45, 43, 34, + 5, 2, 19, 15, 49, 23, 39, 3, 25, 17, + 45, 32, 17, 8, 31, 33, 13, 21, 7, 4, + 6, 5, 14, 12, 19, 2, 3, 9, 6, 17, + 50, 7, 
2, 46, 44, 60, 54, 38, 47, 3, + 7, 1, 7, 37, 9, 26, 7, 9, 22, 60, + 102, 70, 22, 47, 47, 2, 72, 78, 91, 11, + 10, 49, 24, 5, 0, 22, 66, 110, 90, 40, + 51, 53, 4, 36, 52, 63, 38, 42, 38, 38, + 42, 36, 22, 38, 38, 4, 18, 22, 8, 6, + 9, 16, 8, 2, 10, 4, 12, 16, 4, 43, + 2, 2, 11, 4, 21, 62, 82, 52, 38, 50, + 58, 66, 62, 38, 11, 33, 7, 0, 0, 81, + 2, 10, 37, 40, 12, 12, 10, 14, 0, 14, + 18, 25, 59, 25, 29, 23, 57, 33, 26, 28, + 20, 8, 1, 13, 23, 25, 43, 11, 54, 28, + 16, 2, 18, 3, 13, 25, 19, 7, 54, 28, + 18, 8, 18, 1, 13, 21, 37, 19, 52, 22, + 16, 8, 8, 13, 23, 33, 2, 68, 40, 28, + 14, 26, 4, 7, 13, 17, 124, 63, 55, 41, + 49, 45, 39, 37, 31, 29, 27, 23, 13, 31, + 33, 41, 47, 16, 41, 35, 29, 23, 19, 21, + 15, 15, 11, 13, 15, 29, 9, 9, 22, 11, + 27, 5, 9, 13, 3, 5, 7, 21, 3, 7, + 17, 19, 6, 47, 15, 22, 13, 0, 11, 7, + 3, 4, 1, 7, 5, 21, 21, 27, 104, 98, + 102, 92, 80, 90, 88, 82, 78, 74, 80, 76, + 56, 42, 12, 62, 58, 48, 28, 38, 30, 16, + 28, 12, 6, 5, 13, 23, 31, 56, 48, 54, + 32, 0, 14, 8, 13, 6, 9, 17, 49, 31, + 43, 57, 14, 20, 23, 12, 18, 1, 19, 17, + 19, 39, 19, 33, 37, 73, 39, 47, 65, 0, + 27, 43, 19, 2, 8, 12, 28, 20, 32, 2, + 26, 24, 38, 44, 50, 48, 22, 106, 60, 46, + 34, 18, 14, 25, 37, 57, 10, 74, 62, 60, + 44, 44, 24, 18, 8, 1, 19, 2, 8, 12, + 28, 20, 32, 2, 26, 24, 38, 44, 50, 48, + 22, 106, 60, 46, 34, 18, 14, 25, 37, 57, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 24 */ + + 96, 12, 29, 96, 12, 29, 43, 11, 38, 22, + 4, 8, 52, 84, 54, 12, 10, 16, 31, 5, + 1, 5, 32, 29, 13, 46, 40, 35, 67, 103, + 31, 41, 15, 31, 5, 1, 41, 7, 56, 4, + 17, 33, 41, 43, 49, 53, 83, 1, 27, 45, + 13, 41, 43, 71, 8, 19, 17, 31, 48, 4, + 44, 0, 0, 0, 11, 57, 67, 4, 6, 5, + 38, 21, 67, 19, 3, 21, 34, 30, 15, 3, + 33, 16, 25, 13, 15, 65, 39, 47, 43, 34, + 7, 2, 15, 15, 47, 21, 37, 3, 25, 17, + 45, 32, 17, 8, 29, 33, 13, 21, 7, 4, + 6, 3, 16, 12, 19, 0, 3, 9, 4, 17, + 50, 7, 2, 46, 44, 60, 54, 38, 49, 1, + 5, 3, 7, 37, 9, 28, 7, 9, 24, 62, + 104, 74, 
26, 49, 49, 2, 72, 78, 93, 11, + 10, 51, 24, 5, 0, 24, 66, 110, 90, 42, + 53, 51, 2, 30, 44, 61, 36, 42, 38, 36, + 40, 34, 22, 38, 36, 2, 16, 20, 8, 4, + 11, 14, 6, 0, 10, 2, 10, 14, 2, 45, + 0, 0, 11, 0, 23, 58, 78, 48, 34, 46, + 52, 58, 58, 36, 15, 39, 11, 5, 3, 85, + 1, 8, 41, 38, 10, 10, 8, 10, 3, 10, + 14, 29, 59, 27, 31, 27, 55, 31, 28, 30, + 20, 8, 1, 11, 21, 23, 39, 9, 54, 28, + 18, 2, 20, 1, 11, 23, 15, 7, 54, 28, + 18, 8, 18, 1, 11, 21, 35, 19, 54, 24, + 16, 8, 10, 11, 23, 31, 4, 68, 42, 28, + 14, 28, 4, 5, 11, 17, 124, 61, 51, 37, + 45, 43, 37, 33, 29, 27, 25, 19, 9, 31, + 31, 41, 45, 22, 41, 37, 29, 21, 19, 21, + 15, 15, 13, 13, 15, 31, 9, 9, 22, 11, + 27, 7, 9, 13, 3, 5, 9, 23, 5, 7, + 19, 19, 8, 49, 15, 22, 15, 0, 11, 9, + 3, 4, 1, 7, 3, 21, 21, 31, 102, 96, + 100, 88, 76, 84, 82, 76, 74, 68, 72, 70, + 48, 36, 8, 54, 50, 40, 18, 32, 26, 12, + 22, 8, 2, 9, 17, 25, 31, 50, 44, 50, + 28, 5, 10, 4, 19, 2, 13, 21, 53, 37, + 47, 59, 10, 18, 29, 8, 14, 5, 23, 21, + 23, 41, 21, 35, 37, 71, 41, 51, 69, 1, + 29, 45, 19, 2, 10, 12, 30, 20, 34, 2, + 28, 24, 40, 44, 50, 50, 24, 104, 56, 42, + 30, 14, 10, 29, 41, 61, 12, 74, 62, 62, + 44, 44, 24, 20, 8, 1, 19, 2, 10, 12, + 30, 20, 34, 2, 28, 24, 40, 44, 50, 50, + 24, 104, 56, 42, 30, 14, 10, 29, 41, 61, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 25 */ + + 94, 12, 29, 94, 12, 29, 39, 9, 40, 22, + 4, 4, 48, 84, 56, 12, 14, 14, 29, 1, + 1, 5, 34, 29, 15, 44, 36, 41, 71, 103, + 27, 37, 13, 29, 1, 1, 39, 5, 58, 4, + 15, 31, 39, 45, 51, 53, 83, 1, 27, 43, + 13, 41, 43, 71, 8, 17, 15, 29, 48, 4, + 44, 0, 0, 0, 9, 57, 67, 4, 4, 5, + 38, 19, 63, 13, 1, 19, 38, 34, 13, 1, + 29, 18, 21, 11, 11, 67, 39, 47, 43, 36, + 7, 4, 11, 15, 45, 19, 33, 3, 23, 17, + 43, 34, 17, 10, 27, 31, 11, 21, 7, 6, + 8, 1, 18, 12, 19, 0, 1, 7, 4, 17, + 52, 7, 2, 46, 46, 62, 54, 40, 49, 0, + 3, 5, 7, 37, 9, 30, 7, 7, 26, 64, + 106, 78, 30, 51, 51, 4, 72, 80, 95, 11, + 10, 51, 26, 5, 0, 28, 66, 110, 90, 44, + 55, 
49, 0, 26, 38, 57, 36, 42, 38, 34, + 40, 34, 22, 38, 36, 0, 14, 20, 8, 4, + 11, 14, 4, 0, 10, 2, 10, 12, 0, 47, + 0, 0, 11, 1, 23, 54, 74, 46, 32, 42, + 48, 52, 54, 34, 19, 43, 13, 9, 5, 87, + 3, 6, 45, 36, 8, 8, 6, 6, 5, 6, + 10, 31, 59, 27, 33, 29, 51, 27, 32, 32, + 20, 8, 0, 9, 19, 21, 35, 7, 56, 30, + 20, 4, 22, 0, 9, 21, 11, 5, 56, 30, + 20, 8, 20, 0, 9, 19, 31, 17, 58, 26, + 18, 8, 12, 9, 21, 29, 6, 70, 44, 30, + 16, 30, 6, 3, 9, 15, 124, 57, 47, 33, + 41, 41, 35, 29, 25, 23, 21, 15, 5, 29, + 29, 39, 43, 30, 41, 39, 29, 19, 17, 19, + 15, 15, 13, 13, 15, 33, 9, 9, 24, 11, + 27, 7, 9, 13, 3, 5, 9, 23, 7, 7, + 19, 17, 12, 49, 15, 24, 17, 0, 11, 9, + 1, 4, 1, 5, 1, 19, 19, 35, 100, 94, + 98, 84, 72, 78, 78, 70, 70, 64, 66, 66, + 42, 30, 4, 46, 44, 34, 8, 26, 22, 10, + 18, 4, 0, 11, 19, 27, 31, 46, 40, 46, + 24, 9, 6, 2, 23, 1, 17, 25, 57, 41, + 51, 61, 8, 16, 33, 6, 10, 7, 27, 23, + 25, 43, 23, 35, 37, 69, 43, 53, 73, 3, + 31, 47, 19, 4, 12, 14, 34, 22, 36, 2, + 30, 26, 42, 46, 52, 52, 26, 102, 54, 38, + 26, 10, 6, 33, 45, 63, 14, 74, 64, 64, + 46, 46, 26, 22, 10, 0, 19, 4, 12, 14, + 34, 22, 36, 2, 30, 26, 42, 46, 52, 52, + 26, 102, 54, 38, 26, 10, 6, 33, 45, 63, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 26 */ + + 92, 12, 29, 92, 12, 29, 35, 5, 40, 22, + 2, 0, 46, 82, 58, 12, 16, 10, 29, 0, + 3, 5, 36, 31, 17, 42, 32, 47, 77, 105, + 23, 35, 13, 29, 0, 3, 39, 3, 58, 4, + 15, 29, 35, 47, 53, 53, 83, 1, 25, 41, + 15, 41, 43, 71, 8, 17, 15, 27, 50, 4, + 44, 0, 0, 0, 9, 57, 67, 6, 2, 5, + 36, 19, 61, 9, 0, 17, 40, 36, 11, 0, + 25, 20, 19, 9, 5, 67, 41, 47, 45, 36, + 7, 6, 5, 13, 45, 19, 29, 3, 23, 17, + 43, 34, 17, 10, 25, 31, 9, 19, 7, 6, + 8, 0, 18, 12, 21, 0, 1, 7, 4, 17, + 52, 7, 2, 48, 46, 62, 54, 40, 51, 0, + 3, 5, 7, 35, 9, 32, 5, 7, 30, 68, + 108, 80, 34, 51, 53, 4, 74, 80, 97, 11, + 12, 53, 28, 5, 0, 30, 66, 110, 90, 48, + 57, 45, 1, 20, 32, 55, 36, 42, 36, 34, + 38, 34, 20, 38, 34, 1, 12, 20, 8, 4, + 11, 12, 4, 1, 8, 2, 
10, 10, 0, 49, + 1, 1, 11, 3, 25, 50, 70, 42, 28, 40, + 44, 46, 48, 32, 23, 47, 17, 13, 9, 89, + 5, 4, 47, 34, 6, 6, 2, 4, 9, 2, + 8, 35, 59, 29, 35, 33, 49, 25, 34, 32, + 20, 8, 2, 7, 17, 21, 31, 3, 58, 32, + 20, 6, 26, 2, 5, 19, 5, 3, 56, 30, + 20, 10, 22, 2, 9, 17, 29, 17, 60, 28, + 18, 8, 14, 9, 21, 27, 8, 70, 46, 30, + 16, 32, 8, 1, 9, 13, 124, 55, 45, 31, + 39, 39, 31, 27, 21, 21, 17, 11, 0, 27, + 27, 37, 41, 36, 39, 41, 27, 17, 15, 19, + 15, 15, 13, 11, 17, 35, 9, 9, 26, 11, + 27, 7, 9, 11, 5, 7, 9, 25, 9, 9, + 19, 17, 14, 51, 17, 26, 17, 1, 11, 9, + 1, 2, 3, 5, 1, 19, 19, 37, 98, 92, + 96, 82, 66, 74, 72, 64, 64, 58, 60, 60, + 36, 26, 0, 38, 36, 26, 1, 20, 18, 6, + 14, 1, 3, 13, 21, 29, 31, 42, 34, 40, + 20, 13, 2, 1, 27, 5, 21, 29, 61, 45, + 53, 65, 6, 14, 37, 2, 8, 11, 31, 25, + 27, 45, 25, 37, 37, 67, 47, 55, 75, 7, + 33, 49, 19, 4, 12, 16, 36, 24, 36, 4, + 32, 28, 44, 46, 54, 52, 26, 102, 50, 34, + 22, 6, 2, 39, 49, 65, 14, 76, 64, 64, + 48, 48, 26, 22, 10, 2, 19, 4, 12, 16, + 36, 24, 36, 4, 32, 28, 44, 46, 54, 52, + 26, 102, 50, 34, 22, 6, 2, 39, 49, 65, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 27 */ + + 90, 12, 31, 90, 12, 31, 31, 3, 42, 22, + 2, 1, 42, 80, 58, 14, 20, 8, 29, 4, + 3, 7, 38, 33, 19, 40, 30, 55, 81, 107, + 19, 31, 13, 29, 4, 3, 37, 1, 60, 4, + 13, 27, 33, 51, 55, 53, 85, 1, 25, 41, + 15, 43, 41, 71, 6, 17, 15, 25, 50, 4, + 44, 0, 0, 0, 7, 57, 67, 6, 2, 7, + 36, 17, 59, 5, 2, 15, 44, 40, 9, 2, + 21, 22, 17, 9, 1, 69, 41, 47, 45, 38, + 7, 8, 1, 13, 43, 17, 25, 3, 23, 17, + 43, 36, 19, 12, 25, 31, 7, 19, 5, 6, + 10, 2, 20, 12, 21, 0, 1, 7, 4, 17, + 52, 7, 2, 48, 46, 64, 56, 42, 51, 2, + 1, 7, 7, 35, 9, 34, 5, 5, 32, 70, + 108, 84, 36, 53, 55, 6, 74, 80, 97, 13, + 12, 53, 28, 7, 2, 32, 66, 110, 92, 50, + 59, 43, 3, 14, 26, 51, 36, 42, 36, 32, + 38, 32, 20, 36, 34, 1, 12, 20, 6, 4, + 13, 12, 2, 3, 8, 2, 10, 10, 1, 53, + 1, 3, 11, 5, 25, 44, 66, 38, 26, 36, + 40, 40, 44, 28, 27, 51, 19, 17, 13, 93, + 7, 
0, 51, 30, 4, 4, 0, 0, 13, 1, + 4, 37, 59, 31, 37, 35, 45, 21, 38, 34, + 20, 8, 4, 7, 17, 19, 29, 1, 60, 34, + 22, 6, 28, 4, 3, 17, 1, 3, 58, 32, + 22, 10, 24, 2, 7, 15, 27, 17, 64, 30, + 18, 8, 14, 7, 21, 27, 8, 72, 46, 32, + 16, 34, 8, 1, 7, 13, 124, 51, 41, 27, + 35, 35, 29, 23, 19, 17, 13, 7, 4, 27, + 25, 37, 41, 42, 39, 43, 27, 17, 13, 19, + 15, 15, 13, 11, 17, 37, 9, 9, 26, 11, + 27, 7, 9, 11, 5, 7, 9, 27, 11, 9, + 19, 17, 16, 53, 17, 28, 19, 1, 11, 9, + 0, 2, 3, 3, 0, 19, 19, 41, 96, 90, + 94, 78, 62, 68, 68, 58, 60, 52, 54, 54, + 30, 20, 3, 32, 30, 18, 11, 16, 12, 2, + 10, 5, 7, 15, 23, 31, 31, 38, 30, 36, + 16, 17, 0, 3, 31, 9, 27, 33, 65, 49, + 57, 67, 4, 10, 41, 0, 4, 13, 35, 29, + 31, 47, 27, 39, 39, 65, 49, 59, 79, 9, + 37, 51, 17, 6, 14, 16, 38, 26, 38, 4, + 34, 30, 46, 48, 56, 54, 28, 100, 46, 30, + 18, 2, 1, 43, 53, 67, 16, 76, 66, 66, + 48, 50, 28, 24, 12, 2, 17, 6, 14, 16, + 38, 26, 38, 4, 34, 30, 46, 48, 56, 54, + 28, 100, 46, 30, 18, 2, 1, 43, 53, 67, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 28 */ + + 86, 12, 31, 86, 12, 31, 29, 0, 42, 22, + 0, 5, 40, 78, 60, 14, 24, 4, 29, 6, + 3, 7, 40, 35, 23, 36, 26, 61, 85, 109, + 15, 29, 13, 29, 6, 3, 37, 0, 60, 2, + 11, 25, 31, 53, 59, 55, 85, 1, 25, 39, + 15, 43, 41, 71, 6, 17, 15, 25, 52, 4, + 44, 0, 0, 0, 7, 59, 67, 8, 0, 7, + 34, 17, 57, 1, 4, 13, 46, 42, 9, 4, + 17, 24, 15, 7, 2, 71, 41, 47, 45, 38, + 9, 10, 2, 13, 41, 15, 21, 3, 23, 17, + 43, 36, 19, 12, 23, 31, 7, 19, 5, 6, + 10, 4, 20, 12, 21, 1, 1, 7, 4, 17, + 52, 7, 2, 50, 46, 64, 56, 42, 53, 2, + 0, 7, 7, 35, 9, 36, 5, 5, 34, 72, + 110, 86, 40, 53, 57, 6, 74, 80, 99, 13, + 14, 55, 30, 7, 2, 34, 66, 110, 92, 52, + 61, 39, 5, 8, 20, 49, 34, 42, 34, 30, + 36, 32, 18, 36, 32, 3, 10, 18, 6, 4, + 13, 10, 0, 5, 6, 0, 8, 8, 1, 55, + 3, 5, 11, 9, 27, 40, 62, 34, 22, 32, + 36, 32, 38, 26, 31, 57, 23, 21, 17, 95, + 11, 1, 55, 28, 2, 2, 1, 1, 17, 5, + 0, 41, 59, 33, 39, 39, 43, 19, 40, 34, + 20, 8, 6, 5, 15, 17, 25, 0, 
62, 34, + 24, 8, 30, 6, 0, 15, 2, 1, 58, 32, + 22, 12, 26, 4, 7, 15, 25, 17, 66, 32, + 18, 8, 16, 7, 21, 25, 10, 72, 48, 32, + 16, 36, 10, 0, 7, 11, 124, 49, 39, 25, + 31, 33, 25, 21, 15, 15, 11, 3, 8, 25, + 23, 35, 39, 48, 39, 45, 27, 15, 13, 19, + 15, 15, 15, 9, 17, 39, 9, 9, 28, 11, + 27, 7, 9, 11, 7, 7, 9, 29, 13, 9, + 19, 17, 18, 55, 17, 28, 21, 1, 11, 9, + 0, 2, 5, 3, 2, 19, 19, 43, 94, 88, + 92, 74, 58, 64, 62, 52, 54, 46, 46, 48, + 22, 14, 7, 24, 22, 10, 21, 10, 8, 1, + 6, 11, 11, 19, 25, 33, 31, 32, 26, 32, + 12, 21, 3, 7, 35, 13, 31, 37, 69, 55, + 61, 69, 2, 8, 47, 3, 0, 17, 39, 31, + 33, 49, 29, 41, 39, 63, 51, 61, 81, 11, + 39, 53, 17, 6, 14, 18, 40, 26, 40, 4, + 36, 32, 48, 48, 58, 54, 30, 100, 42, 26, + 14, 1, 5, 47, 57, 71, 16, 78, 66, 68, + 50, 50, 28, 24, 12, 4, 17, 6, 14, 18, + 40, 26, 40, 4, 36, 32, 48, 48, 58, 54, + 30, 100, 42, 26, 14, 1, 5, 47, 57, 71, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 29 */ + + 84, 12, 31, 84, 12, 31, 25, 2, 42, 22, + 0, 9, 36, 76, 62, 14, 26, 0, 27, 10, + 5, 7, 42, 37, 25, 34, 22, 67, 91, 109, + 11, 27, 13, 27, 10, 5, 37, 2, 60, 2, + 11, 23, 27, 55, 61, 55, 85, 1, 23, 37, + 17, 43, 41, 71, 6, 15, 15, 23, 52, 4, + 44, 0, 0, 0, 5, 59, 67, 8, 1, 7, + 34, 15, 53, 4, 6, 11, 50, 46, 7, 6, + 13, 26, 13, 5, 8, 71, 43, 47, 47, 38, + 9, 12, 8, 11, 41, 15, 17, 3, 23, 17, + 41, 36, 19, 14, 21, 29, 5, 17, 5, 8, + 12, 6, 22, 12, 23, 1, 0, 7, 4, 17, + 54, 7, 2, 50, 48, 64, 56, 42, 53, 4, + 0, 9, 7, 33, 9, 38, 3, 3, 38, 76, + 112, 90, 44, 55, 59, 8, 76, 80, 101, 13, + 14, 55, 32, 7, 2, 38, 66, 110, 92, 56, + 63, 37, 7, 4, 14, 47, 34, 42, 34, 30, + 36, 32, 18, 36, 32, 5, 8, 18, 6, 4, + 13, 10, 0, 7, 6, 0, 8, 6, 3, 57, + 5, 5, 11, 11, 27, 36, 58, 32, 20, 30, + 32, 26, 34, 24, 35, 61, 25, 25, 21, 97, + 13, 3, 57, 26, 0, 0, 5, 5, 21, 9, + 1, 43, 59, 35, 41, 43, 39, 15, 42, 36, + 20, 8, 8, 3, 13, 17, 21, 4, 64, 36, + 24, 10, 34, 8, 2, 13, 8, 0, 60, 34, + 24, 12, 28, 6, 5, 13, 23, 15, 68, 34, + 18, 8, 
18, 5, 19, 23, 12, 72, 50, 32, + 16, 38, 12, 2, 5, 9, 124, 47, 35, 21, + 29, 31, 23, 17, 11, 11, 7, 0, 14, 23, + 21, 33, 37, 54, 37, 47, 25, 13, 11, 19, + 15, 15, 15, 9, 19, 41, 9, 9, 30, 11, + 27, 7, 9, 9, 7, 9, 9, 31, 15, 11, + 19, 17, 20, 55, 19, 30, 21, 3, 11, 9, + 0, 0, 5, 3, 2, 19, 19, 47, 92, 86, + 90, 72, 52, 58, 56, 46, 50, 40, 40, 42, + 16, 10, 11, 16, 16, 2, 31, 4, 4, 3, + 2, 15, 15, 21, 27, 35, 31, 28, 20, 26, + 8, 25, 7, 9, 39, 17, 35, 41, 73, 59, + 63, 73, 0, 6, 51, 5, 1, 19, 43, 33, + 35, 51, 31, 43, 39, 61, 55, 63, 85, 15, + 41, 55, 17, 8, 16, 20, 42, 28, 40, 6, + 38, 34, 50, 50, 60, 56, 30, 98, 38, 22, + 10, 5, 9, 53, 61, 73, 18, 78, 68, 68, + 52, 52, 30, 26, 12, 6, 17, 8, 16, 20, + 42, 28, 40, 6, 38, 34, 50, 50, 60, 56, + 30, 98, 38, 22, 10, 5, 9, 53, 61, 73, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 30 */ + + 82, 12, 31, 82, 12, 31, 21, 6, 44, 22, + 1, 13, 34, 74, 62, 14, 30, 1, 27, 12, + 5, 9, 44, 39, 27, 32, 18, 75, 95, 111, + 7, 23, 13, 27, 12, 5, 35, 4, 62, 2, + 9, 21, 25, 57, 63, 55, 87, 1, 23, 35, + 17, 45, 41, 71, 6, 15, 15, 21, 54, 4, + 44, 0, 0, 0, 5, 59, 67, 10, 3, 7, + 32, 15, 51, 8, 8, 9, 52, 48, 5, 8, + 9, 28, 11, 3, 12, 73, 43, 47, 47, 40, + 9, 14, 12, 11, 39, 13, 13, 3, 23, 17, + 41, 38, 19, 14, 19, 29, 3, 17, 5, 8, + 12, 8, 22, 12, 23, 1, 0, 7, 4, 17, + 54, 7, 2, 52, 48, 66, 56, 44, 55, 4, + 2, 9, 7, 33, 9, 40, 3, 3, 40, 78, + 114, 92, 48, 55, 61, 8, 76, 80, 103, 13, + 16, 57, 32, 7, 2, 40, 66, 110, 92, 58, + 65, 33, 9, 1, 8, 43, 34, 42, 32, 28, + 34, 30, 16, 36, 30, 7, 6, 18, 6, 4, + 15, 8, 1, 9, 4, 0, 8, 4, 3, 59, + 5, 7, 11, 13, 29, 32, 54, 28, 16, 26, + 28, 20, 28, 22, 39, 65, 29, 29, 25, 101, + 15, 5, 61, 24, 1, 1, 7, 7, 25, 13, + 5, 47, 59, 37, 43, 45, 37, 13, 46, 36, + 20, 8, 10, 1, 11, 15, 17, 6, 66, 38, + 26, 10, 36, 10, 6, 11, 12, 0, 60, 34, + 24, 14, 30, 6, 5, 11, 21, 15, 72, 36, + 18, 8, 20, 5, 19, 21, 14, 74, 52, 34, + 16, 40, 12, 4, 5, 9, 124, 43, 33, 19, + 25, 29, 19, 15, 9, 9, 
3, 4, 18, 23, + 19, 33, 35, 60, 37, 49, 25, 11, 9, 19, + 15, 15, 15, 7, 19, 43, 9, 9, 30, 11, + 27, 7, 9, 9, 9, 9, 9, 33, 17, 11, + 19, 17, 22, 57, 19, 32, 23, 3, 11, 9, + 2, 0, 7, 1, 4, 19, 19, 49, 90, 84, + 88, 68, 48, 54, 52, 40, 44, 34, 34, 36, + 10, 4, 15, 8, 8, 5, 41, 1, 0, 7, + 1, 21, 19, 23, 29, 37, 31, 24, 16, 22, + 4, 29, 11, 13, 43, 21, 39, 45, 77, 63, + 67, 75, 1, 4, 55, 9, 5, 23, 47, 37, + 39, 53, 33, 45, 39, 59, 57, 67, 87, 17, + 43, 57, 17, 8, 16, 20, 44, 30, 42, 6, + 40, 36, 52, 50, 62, 56, 32, 98, 34, 18, + 6, 9, 13, 57, 65, 75, 18, 80, 68, 70, + 52, 54, 30, 26, 14, 6, 17, 8, 16, 20, + 44, 30, 42, 6, 40, 36, 52, 50, 62, 56, + 32, 98, 34, 18, 6, 9, 13, 57, 65, 75, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 31 */ + + 80, 12, 31, 80, 12, 31, 17, 8, 44, 22, + 1, 17, 30, 72, 64, 14, 34, 5, 27, 16, + 5, 9, 46, 41, 29, 30, 14, 81, 99, 113, + 3, 21, 13, 27, 16, 5, 35, 6, 62, 2, + 7, 19, 23, 59, 65, 55, 87, 1, 23, 33, + 17, 45, 41, 71, 6, 15, 15, 19, 54, 4, + 44, 0, 0, 0, 3, 59, 67, 10, 5, 7, + 32, 13, 49, 12, 10, 7, 56, 52, 3, 10, + 5, 30, 9, 1, 16, 75, 43, 47, 47, 40, + 9, 16, 16, 11, 37, 11, 9, 3, 23, 17, + 41, 38, 19, 16, 17, 29, 1, 17, 5, 8, + 14, 10, 24, 12, 23, 1, 0, 7, 4, 17, + 54, 7, 2, 52, 48, 66, 56, 44, 55, 6, + 4, 11, 7, 33, 9, 42, 3, 1, 42, 80, + 116, 96, 52, 57, 63, 10, 76, 80, 105, 13, + 16, 57, 34, 7, 2, 42, 66, 110, 92, 60, + 67, 31, 11, 7, 2, 41, 34, 42, 32, 26, + 34, 30, 16, 36, 30, 9, 4, 18, 6, 4, + 15, 8, 3, 11, 4, 0, 8, 2, 5, 61, + 7, 9, 11, 15, 29, 28, 50, 24, 14, 22, + 24, 14, 24, 20, 43, 69, 31, 33, 29, 103, + 17, 7, 65, 22, 3, 3, 9, 11, 29, 17, + 9, 49, 59, 39, 45, 49, 33, 9, 48, 38, + 20, 8, 12, 0, 9, 13, 13, 8, 68, 40, + 28, 12, 38, 12, 8, 9, 16, 2, 62, 36, + 26, 14, 32, 8, 3, 9, 19, 15, 74, 38, + 18, 8, 22, 3, 19, 19, 16, 74, 54, 34, + 16, 42, 14, 6, 3, 7, 124, 41, 29, 15, + 21, 27, 17, 11, 5, 5, 0, 8, 22, 21, + 17, 31, 33, 66, 37, 51, 25, 9, 7, 19, + 15, 15, 15, 7, 19, 45, 9, 9, 32, 11, + 
27, 7, 9, 9, 9, 9, 9, 35, 19, 11, + 19, 17, 24, 59, 19, 34, 25, 3, 11, 9, + 2, 0, 7, 1, 6, 19, 19, 53, 88, 82, + 86, 64, 44, 48, 46, 34, 40, 28, 28, 30, + 4, 1, 19, 0, 2, 13, 51, 7, 3, 11, + 5, 25, 23, 25, 31, 39, 31, 20, 12, 18, + 0, 33, 15, 15, 47, 25, 43, 49, 81, 67, + 71, 77, 3, 2, 59, 11, 9, 25, 51, 39, + 41, 55, 35, 47, 39, 57, 59, 69, 91, 19, + 45, 59, 17, 10, 18, 22, 46, 32, 44, 6, + 42, 38, 54, 52, 64, 58, 34, 96, 30, 14, + 2, 13, 17, 61, 69, 77, 20, 80, 70, 72, + 54, 56, 32, 28, 14, 8, 17, 10, 18, 22, + 46, 32, 44, 6, 42, 38, 54, 52, 64, 58, + 34, 96, 30, 14, 2, 13, 17, 61, 69, 77, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 32 */ + + 76, 10, 33, 76, 10, 33, 15, 10, 44, 22, + 3, 21, 26, 70, 64, 14, 36, 9, 27, 18, + 7, 11, 48, 43, 33, 26, 10, 89, 105, 115, + 0, 19, 13, 27, 18, 7, 35, 6, 62, 0, + 7, 19, 21, 63, 69, 57, 89, 3, 23, 33, + 19, 47, 41, 71, 4, 15, 15, 19, 54, 2, + 44, 0, 0, 0, 3, 61, 67, 10, 7, 9, + 30, 13, 47, 16, 12, 7, 58, 54, 3, 12, + 3, 30, 7, 1, 20, 77, 45, 49, 49, 40, + 11, 16, 20, 11, 37, 11, 7, 3, 23, 17, + 41, 38, 21, 16, 17, 29, 1, 17, 5, 8, + 14, 12, 24, 12, 25, 3, 0, 7, 2, 17, + 54, 7, 2, 52, 48, 66, 56, 44, 57, 6, + 4, 13, 7, 33, 11, 42, 3, 1, 44, 82, + 116, 98, 54, 59, 67, 10, 76, 80, 107, 15, + 16, 59, 34, 9, 2, 44, 66, 108, 92, 62, + 69, 29, 15, 13, 5, 39, 32, 42, 30, 24, + 32, 28, 14, 34, 28, 11, 2, 16, 4, 2, + 17, 6, 5, 13, 2, 1, 6, 0, 7, 65, + 9, 11, 11, 19, 31, 22, 44, 20, 10, 18, + 18, 6, 18, 16, 47, 75, 35, 39, 33, 107, + 21, 11, 69, 18, 5, 5, 13, 15, 33, 21, + 13, 53, 59, 41, 47, 53, 31, 7, 50, 38, + 20, 8, 12, 0, 9, 13, 11, 10, 68, 40, + 28, 12, 40, 14, 10, 9, 20, 2, 62, 36, + 26, 14, 32, 8, 3, 9, 17, 15, 76, 40, + 18, 6, 22, 3, 19, 19, 16, 74, 54, 34, + 16, 44, 14, 6, 3, 7, 124, 39, 27, 13, + 19, 25, 15, 9, 3, 3, 2, 10, 26, 21, + 17, 31, 33, 72, 37, 55, 25, 9, 7, 19, + 15, 15, 17, 7, 21, 47, 11, 9, 32, 13, + 27, 9, 9, 9, 11, 11, 11, 37, 21, 13, + 21, 17, 26, 61, 21, 34, 27, 5, 11, 11, + 
2, 1, 9, 1, 6, 19, 19, 57, 84, 80, + 82, 60, 38, 42, 40, 28, 34, 22, 20, 24, + 3, 7, 23, 7, 5, 21, 63, 13, 9, 15, + 11, 31, 27, 29, 35, 41, 31, 14, 6, 12, + 3, 39, 19, 19, 53, 29, 49, 53, 87, 73, + 75, 81, 7, 1, 65, 15, 13, 29, 55, 43, + 45, 57, 37, 49, 41, 55, 63, 73, 95, 23, + 49, 63, 17, 10, 18, 22, 48, 32, 44, 6, + 44, 38, 56, 52, 64, 58, 34, 94, 26, 10, + 3, 19, 21, 67, 73, 81, 20, 80, 70, 72, + 54, 56, 32, 28, 14, 8, 17, 10, 18, 22, + 48, 32, 44, 6, 44, 38, 56, 52, 64, 58, + 34, 94, 26, 10, 3, 19, 21, 67, 73, 81, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 33 */ + + 74, 10, 33, 74, 10, 33, 11, 14, 46, 24, + 3, 23, 24, 70, 66, 16, 40, 11, 25, 22, + 7, 11, 52, 43, 35, 24, 8, 95, 109, 115, + 4, 15, 11, 25, 22, 7, 33, 8, 64, 0, + 5, 17, 17, 65, 71, 57, 89, 3, 21, 31, + 19, 47, 39, 69, 4, 13, 13, 17, 56, 2, + 44, 0, 0, 0, 1, 61, 67, 12, 7, 9, + 30, 11, 43, 22, 16, 5, 62, 58, 1, 16, + 0, 32, 3, 0, 26, 77, 45, 49, 49, 42, + 11, 18, 26, 9, 35, 9, 3, 3, 21, 17, + 39, 40, 21, 18, 15, 27, 0, 15, 3, 10, + 16, 14, 26, 14, 25, 3, 2, 5, 2, 15, + 56, 5, 2, 54, 50, 68, 58, 46, 57, 8, + 6, 13, 7, 31, 11, 44, 1, 0, 48, 86, + 118, 102, 58, 59, 69, 12, 78, 82, 107, 15, + 18, 59, 36, 9, 4, 48, 66, 108, 94, 66, + 71, 25, 17, 17, 11, 35, 32, 42, 30, 24, + 32, 28, 14, 34, 28, 11, 2, 16, 4, 2, + 17, 6, 5, 13, 2, 1, 6, 0, 7, 67, + 9, 11, 11, 21, 31, 18, 40, 18, 8, 16, + 14, 0, 14, 14, 49, 79, 37, 43, 35, 109, + 23, 13, 71, 16, 5, 5, 15, 17, 35, 23, + 15, 55, 59, 41, 47, 55, 27, 3, 54, 40, + 20, 8, 14, 2, 7, 11, 7, 14, 70, 42, + 30, 14, 44, 16, 14, 7, 26, 4, 64, 38, + 28, 16, 34, 10, 1, 7, 13, 13, 80, 42, + 20, 6, 24, 1, 17, 17, 18, 76, 56, 36, + 18, 46, 16, 8, 1, 5, 124, 35, 23, 9, + 15, 21, 11, 5, 0, 0, 6, 14, 32, 19, + 15, 29, 31, 80, 35, 57, 23, 7, 5, 17, + 15, 13, 17, 5, 21, 47, 11, 9, 34, 13, + 27, 9, 9, 7, 11, 11, 11, 37, 21, 13, + 21, 15, 30, 61, 21, 36, 27, 5, 11, 11, + 4, 1, 9, 0, 8, 17, 17, 59, 82, 78, + 80, 58, 34, 38, 36, 22, 30, 18, 14, 
20, + 9, 11, 27, 13, 11, 27, 73, 17, 13, 17, + 15, 35, 29, 31, 37, 41, 31, 10, 2, 8, + 7, 43, 21, 21, 57, 31, 53, 55, 91, 77, + 77, 83, 9, 3, 69, 17, 15, 31, 57, 45, + 47, 59, 37, 49, 41, 53, 65, 75, 97, 25, + 51, 65, 15, 12, 20, 24, 52, 34, 46, 8, + 48, 40, 58, 54, 66, 60, 36, 94, 24, 8, + 7, 23, 23, 71, 75, 83, 22, 82, 72, 74, + 56, 58, 34, 30, 16, 10, 15, 12, 20, 24, + 52, 34, 46, 8, 48, 40, 58, 54, 66, 60, + 36, 94, 24, 8, 7, 23, 23, 71, 75, 83, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 34 */ + + 72, 10, 33, 72, 10, 33, 7, 16, 46, 24, + 3, 27, 20, 68, 68, 16, 44, 15, 25, 24, + 7, 11, 54, 45, 37, 22, 4, 101, 113, 117, + 8, 13, 11, 25, 24, 7, 33, 10, 64, 0, + 3, 15, 15, 67, 73, 57, 89, 3, 21, 29, + 19, 47, 39, 69, 4, 13, 13, 15, 56, 2, + 44, 0, 0, 0, 0, 61, 67, 12, 9, 9, + 30, 9, 41, 26, 18, 3, 66, 62, 0, 18, + 4, 34, 1, 2, 30, 79, 45, 49, 49, 42, + 11, 20, 30, 9, 33, 7, 0, 3, 21, 17, + 39, 40, 21, 18, 13, 27, 2, 15, 3, 10, + 16, 16, 28, 14, 25, 3, 2, 5, 2, 15, + 56, 5, 2, 54, 50, 68, 58, 46, 59, 10, + 8, 15, 7, 31, 11, 46, 1, 0, 50, 88, + 120, 106, 62, 61, 71, 12, 78, 82, 109, 15, + 18, 61, 38, 9, 4, 50, 66, 108, 94, 68, + 73, 23, 19, 23, 17, 33, 32, 42, 30, 22, + 30, 28, 14, 34, 26, 13, 0, 16, 4, 2, + 17, 4, 7, 15, 2, 1, 6, 1, 9, 69, + 11, 13, 11, 23, 33, 14, 36, 14, 4, 12, + 10, 5, 10, 12, 53, 83, 41, 47, 39, 111, + 25, 15, 75, 14, 7, 7, 17, 21, 39, 27, + 19, 59, 59, 43, 49, 59, 25, 1, 56, 42, + 20, 8, 16, 4, 5, 9, 3, 16, 72, 44, + 32, 16, 46, 18, 16, 5, 30, 6, 64, 38, + 28, 16, 36, 12, 0, 5, 11, 13, 82, 44, + 20, 6, 26, 0, 17, 15, 20, 76, 58, 36, + 18, 48, 18, 10, 0, 3, 124, 33, 19, 5, + 11, 19, 9, 1, 4, 2, 10, 18, 36, 17, + 13, 27, 29, 86, 35, 59, 23, 5, 3, 17, + 15, 13, 17, 5, 21, 49, 11, 9, 36, 13, + 27, 9, 9, 7, 11, 11, 11, 39, 23, 13, + 21, 15, 32, 63, 21, 38, 29, 5, 11, 11, + 4, 1, 9, 0, 10, 17, 17, 63, 80, 76, + 78, 54, 30, 32, 30, 16, 26, 12, 8, 14, + 15, 17, 31, 21, 19, 35, 83, 23, 17, 21, + 19, 39, 33, 33, 39, 
43, 31, 6, 1, 4, + 11, 47, 25, 25, 61, 35, 57, 59, 95, 81, + 81, 85, 11, 5, 73, 21, 19, 35, 61, 47, + 49, 61, 39, 51, 41, 51, 67, 77, 101, 27, + 53, 67, 15, 12, 22, 26, 54, 36, 48, 8, + 50, 42, 60, 54, 68, 62, 38, 92, 20, 4, + 11, 27, 27, 75, 79, 85, 24, 82, 72, 76, + 58, 60, 34, 32, 16, 12, 15, 12, 22, 26, + 54, 36, 48, 8, 50, 42, 60, 54, 68, 62, + 38, 92, 20, 4, 11, 27, 27, 75, 79, 85, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 35 */ + + 70, 10, 33, 70, 10, 33, 3, 20, 48, 24, + 5, 31, 18, 66, 68, 16, 48, 17, 25, 28, + 7, 13, 56, 47, 39, 20, 0, 109, 117, 119, + 12, 9, 11, 25, 28, 7, 31, 12, 66, 0, + 1, 13, 13, 69, 75, 57, 91, 3, 21, 27, + 19, 49, 39, 69, 4, 13, 13, 13, 58, 2, + 44, 0, 0, 0, 0, 61, 67, 14, 11, 9, + 28, 9, 39, 30, 20, 1, 68, 64, 2, 20, + 8, 36, 0, 4, 34, 81, 45, 49, 49, 44, + 11, 22, 34, 9, 31, 5, 4, 3, 21, 17, + 39, 42, 21, 20, 11, 27, 4, 15, 3, 10, + 18, 18, 28, 14, 25, 3, 2, 5, 2, 15, + 56, 5, 2, 56, 50, 70, 58, 48, 59, 10, + 10, 15, 7, 31, 11, 48, 1, 2, 52, 90, + 122, 108, 66, 61, 73, 14, 78, 82, 111, 15, + 20, 61, 38, 9, 4, 52, 66, 108, 94, 70, + 75, 19, 21, 29, 23, 29, 32, 42, 28, 20, + 30, 26, 12, 34, 26, 15, 1, 16, 4, 2, + 19, 4, 9, 17, 0, 1, 6, 3, 9, 71, + 11, 15, 11, 25, 33, 10, 32, 10, 2, 8, + 6, 11, 4, 10, 57, 87, 43, 51, 43, 115, + 27, 17, 79, 12, 9, 9, 19, 23, 43, 31, + 23, 61, 59, 45, 51, 61, 21, 2, 60, 42, + 20, 8, 18, 6, 3, 7, 0, 18, 74, 46, + 34, 16, 48, 20, 20, 3, 34, 6, 66, 40, + 30, 18, 38, 12, 0, 3, 9, 13, 86, 46, + 20, 6, 28, 0, 17, 13, 22, 78, 60, 38, + 18, 50, 18, 12, 0, 3, 124, 29, 17, 3, + 7, 17, 5, 0, 6, 6, 14, 22, 40, 17, + 11, 27, 27, 92, 35, 61, 23, 3, 1, 17, + 15, 13, 17, 3, 21, 51, 11, 9, 36, 13, + 27, 9, 9, 7, 13, 11, 11, 41, 25, 13, + 21, 15, 34, 65, 21, 40, 31, 5, 11, 11, + 6, 1, 11, 2, 12, 17, 17, 65, 78, 74, + 76, 50, 26, 28, 26, 10, 20, 6, 2, 8, + 21, 23, 35, 29, 25, 43, 93, 29, 21, 25, + 23, 45, 37, 35, 41, 45, 31, 2, 5, 0, + 15, 51, 29, 27, 65, 39, 61, 63, 99, 85, + 85, 87, 
13, 7, 77, 23, 23, 37, 65, 51, + 53, 63, 41, 53, 41, 49, 69, 81, 103, 29, + 55, 69, 15, 14, 22, 26, 56, 38, 50, 8, + 52, 44, 62, 56, 70, 62, 40, 92, 16, 0, + 15, 31, 31, 79, 83, 87, 24, 84, 74, 78, + 58, 62, 36, 32, 18, 12, 15, 14, 22, 26, + 56, 38, 50, 8, 52, 44, 62, 56, 70, 62, + 40, 92, 16, 0, 15, 31, 31, 79, 83, 87, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 36 */ + + 66, 10, 33, 66, 10, 33, 1, 22, 48, 24, + 5, 35, 14, 64, 70, 16, 50, 21, 25, 30, + 9, 13, 58, 49, 43, 16, 3, 115, 123, 121, + 16, 7, 11, 25, 30, 9, 31, 14, 66, 1, + 1, 11, 9, 71, 79, 59, 91, 3, 19, 25, + 21, 49, 39, 69, 4, 13, 13, 13, 58, 2, + 44, 0, 0, 0, 2, 63, 67, 14, 13, 9, + 28, 7, 37, 34, 22, 0, 72, 68, 2, 22, + 12, 38, 2, 6, 40, 81, 47, 49, 51, 44, + 13, 24, 40, 7, 31, 5, 8, 3, 21, 17, + 39, 42, 21, 20, 9, 27, 4, 13, 3, 10, + 18, 20, 30, 14, 27, 5, 2, 5, 2, 15, + 56, 5, 2, 56, 50, 70, 58, 48, 61, 12, + 10, 17, 7, 29, 11, 50, 0, 2, 56, 94, + 124, 112, 70, 63, 75, 14, 80, 82, 113, 15, + 20, 63, 40, 9, 4, 54, 66, 108, 94, 74, + 77, 17, 23, 35, 29, 27, 30, 42, 28, 20, + 28, 26, 12, 34, 24, 17, 3, 14, 4, 2, + 19, 2, 9, 19, 0, 3, 4, 5, 11, 73, + 13, 17, 11, 29, 35, 6, 28, 6, 1, 6, + 2, 19, 0, 8, 61, 93, 47, 55, 47, 117, + 31, 19, 81, 10, 11, 11, 23, 27, 47, 35, + 25, 65, 59, 47, 53, 65, 19, 4, 62, 44, + 20, 8, 20, 8, 1, 7, 4, 22, 76, 46, + 34, 18, 52, 22, 22, 1, 40, 8, 66, 40, + 30, 18, 40, 14, 2, 3, 7, 13, 88, 48, + 20, 6, 30, 2, 17, 11, 24, 78, 62, 38, + 18, 52, 20, 14, 2, 1, 124, 27, 13, 0, + 5, 15, 3, 4, 10, 8, 16, 26, 46, 15, + 9, 25, 25, 98, 33, 63, 21, 1, 1, 17, + 15, 13, 19, 3, 23, 53, 11, 9, 38, 13, + 27, 9, 9, 5, 13, 13, 11, 43, 27, 15, + 21, 15, 36, 67, 23, 40, 31, 7, 11, 11, + 6, 3, 11, 2, 12, 17, 17, 69, 76, 72, + 74, 48, 20, 22, 20, 4, 16, 0, 5, 2, + 29, 27, 39, 37, 33, 51, 103, 35, 25, 29, + 27, 49, 41, 39, 43, 47, 31, 3, 11, 5, + 19, 55, 33, 31, 69, 43, 65, 67, 103, 91, + 87, 91, 15, 9, 83, 27, 25, 41, 69, 53, + 55, 65, 43, 55, 41, 47, 73, 83, 
107, 33, + 57, 71, 15, 14, 24, 28, 58, 38, 50, 10, + 54, 46, 64, 56, 72, 64, 40, 90, 12, 3, + 19, 35, 35, 85, 87, 91, 26, 84, 74, 78, + 60, 62, 36, 34, 18, 14, 15, 14, 24, 28, + 58, 38, 50, 10, 54, 46, 64, 56, 72, 64, + 40, 90, 12, 3, 19, 35, 35, 85, 87, 91, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 37 */ + + 64, 10, 33, 64, 10, 33, 2, 26, 48, 24, + 7, 39, 12, 62, 72, 16, 54, 25, 23, 34, + 9, 13, 60, 51, 45, 14, 7, 121, 125, 121, + 20, 5, 11, 23, 34, 9, 31, 16, 66, 1, + 0, 9, 7, 73, 81, 59, 91, 3, 19, 23, + 21, 49, 39, 69, 4, 11, 13, 11, 60, 2, + 44, 0, 0, 0, 2, 63, 67, 16, 15, 9, + 26, 7, 33, 40, 24, 2, 74, 70, 4, 24, + 16, 40, 4, 8, 44, 83, 47, 49, 51, 44, + 13, 26, 44, 7, 29, 3, 12, 3, 21, 17, + 37, 42, 21, 22, 7, 25, 6, 13, 3, 12, + 20, 22, 30, 14, 27, 5, 4, 5, 2, 15, + 58, 5, 2, 58, 52, 70, 58, 48, 61, 12, + 12, 17, 7, 29, 11, 52, 0, 4, 58, 96, + 124, 114, 74, 63, 77, 16, 80, 82, 115, 15, + 22, 63, 42, 9, 4, 58, 66, 108, 94, 76, + 79, 13, 25, 39, 35, 25, 30, 42, 26, 18, + 28, 26, 10, 34, 24, 19, 5, 14, 4, 2, + 19, 2, 11, 21, 1, 3, 4, 7, 11, 75, + 15, 17, 11, 31, 35, 2, 24, 4, 3, 2, + 1, 25, 5, 6, 65, 97, 49, 59, 51, 119, + 33, 21, 85, 8, 13, 13, 25, 29, 51, 39, + 29, 67, 59, 49, 55, 69, 15, 8, 64, 44, + 20, 8, 22, 10, 0, 5, 8, 24, 78, 48, + 36, 20, 54, 24, 26, 0, 44, 10, 68, 42, + 32, 20, 42, 16, 2, 1, 5, 11, 90, 50, + 20, 6, 32, 2, 15, 9, 26, 78, 64, 38, + 18, 54, 22, 16, 2, 0, 124, 25, 11, 2, + 1, 13, 0, 6, 14, 12, 20, 30, 50, 13, + 7, 23, 23, 104, 33, 65, 21, 0, 0, 17, + 15, 13, 19, 1, 23, 55, 11, 9, 40, 13, + 27, 9, 9, 5, 15, 13, 11, 45, 29, 15, + 21, 15, 38, 67, 23, 42, 33, 7, 11, 11, + 6, 3, 13, 2, 14, 17, 17, 71, 74, 70, + 72, 44, 16, 18, 14, 1, 10, 5, 11, 3, + 35, 33, 43, 45, 39, 59, 113, 41, 29, 31, + 31, 55, 45, 41, 45, 49, 31, 7, 15, 9, + 23, 59, 37, 33, 73, 47, 69, 71, 107, 95, + 91, 93, 17, 11, 87, 29, 29, 43, 73, 55, + 57, 67, 45, 57, 41, 45, 75, 85, 109, 35, + 59, 73, 15, 16, 24, 30, 60, 40, 52, 10, + 56, 48, 
66, 58, 74, 64, 42, 90, 8, 7, + 23, 39, 39, 89, 91, 93, 26, 86, 76, 80, + 62, 64, 38, 34, 18, 16, 15, 16, 24, 30, + 60, 40, 52, 10, 56, 48, 66, 58, 74, 64, + 42, 90, 8, 7, 23, 39, 39, 89, 91, 93, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 38 */ + + 62, 10, 35, 62, 10, 35, 6, 28, 50, 24, + 7, 41, 8, 60, 72, 18, 58, 27, 23, 36, + 9, 15, 62, 53, 47, 12, 9, 125, 125, 123, + 24, 1, 11, 23, 36, 9, 29, 18, 68, 1, + 2, 7, 5, 77, 83, 59, 93, 3, 19, 23, + 21, 51, 37, 69, 2, 11, 13, 9, 60, 2, + 44, 0, 0, 0, 4, 63, 67, 16, 15, 11, + 26, 5, 31, 44, 26, 4, 78, 74, 6, 26, + 20, 42, 6, 8, 48, 85, 47, 49, 51, 46, + 13, 28, 48, 7, 27, 1, 16, 3, 21, 17, + 37, 44, 23, 22, 7, 25, 8, 13, 1, 12, + 20, 24, 32, 14, 27, 5, 4, 5, 2, 15, + 58, 5, 2, 58, 52, 72, 60, 50, 63, 14, + 14, 19, 7, 29, 11, 54, 0, 4, 60, 98, + 124, 118, 76, 65, 79, 16, 80, 82, 115, 17, + 22, 65, 42, 11, 6, 60, 66, 108, 96, 78, + 81, 11, 27, 45, 41, 21, 30, 42, 26, 16, + 26, 24, 10, 32, 22, 19, 5, 14, 2, 2, + 21, 0, 13, 23, 1, 3, 4, 7, 13, 79, + 15, 19, 11, 33, 37, 3, 20, 0, 7, 1, + 5, 31, 9, 2, 69, 101, 53, 63, 55, 123, + 35, 25, 89, 4, 15, 15, 27, 33, 55, 43, + 33, 71, 59, 51, 57, 71, 13, 10, 68, 46, + 20, 8, 24, 10, 0, 3, 10, 26, 80, 50, + 38, 20, 56, 26, 28, 2, 48, 10, 68, 42, + 32, 20, 44, 16, 4, 0, 3, 11, 94, 52, + 20, 6, 32, 4, 15, 9, 26, 80, 64, 40, + 18, 56, 22, 16, 4, 0, 124, 21, 7, 6, + 2, 9, 2, 10, 16, 14, 24, 34, 54, 13, + 5, 23, 23, 110, 33, 67, 21, 0, 2, 17, + 15, 13, 19, 1, 23, 57, 11, 9, 40, 13, + 27, 9, 9, 5, 15, 13, 11, 47, 31, 15, + 21, 15, 40, 69, 23, 44, 35, 7, 11, 11, + 8, 3, 13, 4, 16, 17, 17, 75, 72, 68, + 70, 40, 12, 12, 10, 7, 6, 11, 17, 9, + 41, 39, 47, 51, 47, 67, 123, 45, 35, 35, + 35, 59, 49, 43, 47, 51, 31, 11, 19, 13, + 27, 63, 39, 37, 77, 51, 75, 75, 111, 99, + 95, 95, 19, 15, 91, 33, 33, 47, 77, 59, + 61, 69, 47, 59, 43, 43, 77, 89, 113, 37, + 63, 75, 13, 16, 26, 30, 62, 42, 54, 10, + 58, 50, 68, 58, 76, 66, 44, 88, 4, 11, + 27, 43, 43, 93, 95, 95, 
28, 86, 76, 82, + 62, 66, 38, 36, 20, 16, 13, 16, 26, 30, + 62, 42, 54, 10, 58, 50, 68, 58, 76, 66, + 44, 88, 4, 11, 27, 43, 43, 93, 95, 95, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 39 */ + + 60, 10, 35, 60, 10, 35, 10, 32, 50, 24, + 9, 45, 6, 58, 74, 18, 60, 31, 23, 40, + 11, 15, 64, 55, 49, 10, 13, 125, 125, 125, + 28, 0, 11, 23, 40, 11, 29, 20, 68, 1, + 2, 5, 1, 79, 85, 59, 93, 3, 17, 21, + 23, 51, 37, 69, 2, 11, 13, 7, 62, 2, + 44, 0, 0, 0, 4, 63, 67, 18, 17, 11, + 24, 5, 29, 48, 28, 6, 80, 76, 8, 28, + 24, 44, 8, 10, 54, 85, 49, 49, 53, 46, + 13, 30, 54, 5, 27, 1, 20, 3, 21, 17, + 37, 44, 23, 24, 5, 25, 10, 11, 1, 12, + 22, 26, 32, 14, 29, 5, 4, 5, 2, 15, + 58, 5, 2, 60, 52, 72, 60, 50, 63, 14, + 14, 19, 7, 27, 11, 56, 2, 6, 64, 102, + 124, 120, 80, 65, 81, 18, 82, 82, 117, 17, + 24, 65, 44, 11, 6, 62, 66, 108, 96, 82, + 83, 7, 29, 51, 47, 19, 30, 42, 24, 16, + 26, 24, 8, 32, 22, 21, 7, 14, 2, 2, + 21, 0, 13, 25, 3, 3, 4, 9, 13, 81, + 17, 21, 11, 35, 37, 7, 16, 3, 9, 3, + 9, 37, 15, 0, 73, 105, 55, 67, 59, 125, + 37, 27, 91, 2, 17, 17, 31, 35, 59, 47, + 35, 73, 59, 53, 59, 75, 9, 14, 70, 46, + 20, 8, 26, 12, 2, 3, 14, 30, 82, 52, + 38, 22, 60, 28, 32, 4, 54, 12, 70, 44, + 34, 22, 46, 18, 4, 2, 1, 11, 96, 54, + 20, 6, 34, 4, 15, 7, 28, 80, 66, 40, + 18, 58, 24, 18, 4, 2, 124, 19, 5, 8, + 4, 7, 6, 12, 20, 18, 28, 38, 60, 11, + 3, 21, 21, 116, 31, 69, 19, 2, 4, 17, + 15, 13, 19, 0, 25, 59, 11, 9, 42, 13, + 27, 9, 9, 3, 17, 15, 11, 49, 33, 17, + 21, 15, 42, 71, 25, 46, 35, 9, 11, 11, + 8, 5, 15, 4, 16, 17, 17, 77, 70, 66, + 68, 38, 6, 8, 4, 13, 0, 17, 23, 15, + 47, 43, 51, 59, 53, 75, 125, 51, 39, 39, + 39, 65, 53, 45, 49, 53, 31, 15, 25, 19, + 31, 67, 43, 39, 81, 55, 79, 79, 115, 103, + 97, 99, 21, 17, 95, 35, 35, 49, 81, 61, + 63, 71, 49, 61, 43, 41, 81, 91, 115, 41, + 65, 77, 13, 18, 26, 32, 64, 44, 54, 12, + 60, 52, 70, 60, 78, 66, 44, 88, 0, 15, + 31, 47, 47, 99, 99, 97, 28, 88, 78, 82, + 64, 68, 40, 36, 20, 18, 13, 18, 
26, 32, + 64, 44, 54, 12, 60, 52, 70, 60, 78, 66, + 44, 88, 0, 15, 31, 47, 47, 99, 99, 97, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 40 */ + + 56, 8, 35, 56, 8, 35, 12, 34, 50, 24, + 9, 49, 2, 56, 74, 18, 64, 35, 23, 42, + 11, 17, 66, 57, 53, 6, 17, 125, 125, 125, + 32, 2, 11, 23, 42, 11, 29, 20, 68, 3, + 4, 3, 0, 81, 89, 61, 95, 3, 17, 19, + 23, 53, 37, 69, 2, 11, 13, 7, 62, 2, + 44, 0, 0, 0, 6, 65, 67, 18, 19, 11, + 24, 3, 27, 52, 30, 6, 84, 80, 8, 30, + 28, 44, 10, 12, 58, 87, 49, 51, 53, 46, + 15, 30, 58, 5, 25, 0, 22, 3, 21, 17, + 37, 44, 23, 24, 3, 25, 10, 11, 1, 12, + 22, 28, 34, 14, 29, 7, 4, 5, 0, 15, + 58, 5, 2, 60, 52, 72, 60, 50, 65, 16, + 16, 21, 7, 27, 11, 58, 2, 6, 66, 104, + 124, 124, 84, 67, 83, 18, 82, 82, 119, 17, + 24, 67, 44, 11, 6, 64, 66, 108, 96, 84, + 85, 5, 31, 57, 55, 17, 28, 42, 24, 14, + 24, 22, 8, 32, 20, 23, 9, 12, 2, 0, + 23, 1, 15, 27, 3, 5, 2, 11, 15, 83, + 19, 23, 11, 39, 39, 11, 12, 7, 13, 7, + 15, 45, 19, 1, 77, 111, 59, 73, 63, 125, + 41, 29, 95, 0, 19, 19, 33, 39, 63, 51, + 39, 77, 59, 55, 61, 79, 7, 16, 72, 48, + 20, 8, 26, 14, 4, 1, 18, 32, 82, 52, + 40, 22, 62, 30, 34, 6, 58, 12, 70, 44, + 34, 22, 46, 18, 6, 2, 0, 11, 98, 56, + 20, 6, 36, 6, 15, 5, 30, 80, 68, 40, + 18, 60, 24, 20, 6, 2, 124, 17, 1, 12, + 8, 5, 8, 16, 22, 20, 30, 42, 64, 11, + 1, 21, 19, 122, 31, 71, 19, 4, 4, 17, + 15, 13, 21, 0, 25, 61, 11, 9, 42, 13, + 27, 11, 9, 3, 17, 15, 13, 51, 35, 17, + 23, 15, 44, 73, 25, 46, 37, 9, 11, 13, + 8, 5, 15, 4, 18, 17, 17, 81, 68, 64, + 66, 34, 2, 2, 1, 19, 3, 23, 31, 21, + 55, 49, 55, 67, 61, 83, 125, 57, 43, 43, + 45, 69, 57, 49, 53, 55, 31, 21, 29, 23, + 35, 73, 47, 43, 87, 59, 83, 83, 119, 109, + 101, 101, 25, 19, 101, 39, 39, 53, 85, 65, + 67, 73, 51, 63, 43, 39, 83, 95, 119, 43, + 67, 79, 13, 18, 28, 32, 66, 44, 56, 12, + 62, 52, 72, 60, 78, 68, 46, 86, 3, 19, + 35, 51, 51, 103, 103, 101, 30, 88, 78, 84, + 64, 68, 40, 38, 20, 18, 13, 18, 28, 32, + 66, 44, 56, 12, 62, 52, 72, 60, 
78, 68, + 46, 86, 3, 19, 35, 51, 51, 103, 103, 101, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 41 */ + + 54, 8, 35, 54, 8, 35, 16, 36, 52, 24, + 9, 53, 1, 56, 76, 18, 68, 37, 21, 46, + 11, 17, 68, 57, 55, 4, 21, 125, 125, 125, + 36, 6, 9, 21, 46, 11, 27, 22, 70, 3, + 6, 1, 2, 83, 91, 61, 95, 3, 17, 17, + 23, 53, 37, 69, 2, 9, 11, 5, 62, 2, + 44, 0, 0, 0, 8, 65, 67, 18, 21, 11, + 24, 1, 23, 58, 32, 8, 88, 84, 10, 32, + 32, 46, 14, 14, 62, 89, 49, 51, 53, 48, + 15, 32, 62, 5, 23, 2, 26, 3, 19, 17, + 35, 46, 23, 26, 1, 23, 12, 11, 1, 14, + 24, 30, 36, 14, 29, 7, 6, 3, 0, 15, + 60, 5, 2, 60, 54, 74, 60, 52, 65, 18, + 18, 23, 7, 27, 11, 60, 2, 8, 68, 106, + 124, 124, 88, 69, 85, 20, 82, 84, 121, 17, + 24, 67, 46, 11, 6, 68, 66, 108, 96, 86, + 87, 3, 33, 61, 61, 13, 28, 42, 24, 12, + 24, 22, 8, 32, 20, 25, 11, 12, 2, 0, + 23, 1, 17, 27, 3, 5, 2, 13, 17, 85, + 19, 23, 11, 41, 39, 15, 8, 9, 15, 11, + 19, 51, 23, 3, 81, 115, 61, 77, 65, 125, + 43, 31, 99, 1, 21, 21, 35, 43, 65, 55, + 43, 79, 59, 55, 63, 81, 3, 20, 76, 50, + 20, 8, 28, 16, 6, 0, 22, 34, 84, 54, + 42, 24, 64, 32, 36, 8, 62, 14, 72, 46, + 36, 22, 48, 20, 8, 4, 4, 9, 102, 58, + 22, 6, 38, 8, 13, 3, 32, 82, 70, 42, + 20, 62, 26, 22, 8, 4, 124, 13, 2, 16, + 12, 3, 10, 20, 26, 24, 34, 46, 68, 9, + 0, 19, 17, 124, 31, 73, 19, 6, 6, 15, + 15, 13, 21, 0, 25, 63, 11, 9, 44, 13, + 27, 11, 9, 3, 17, 15, 13, 51, 37, 17, + 23, 13, 48, 73, 25, 48, 39, 9, 11, 13, + 10, 5, 15, 6, 20, 15, 15, 85, 66, 62, + 64, 30, 1, 3, 5, 25, 7, 27, 37, 25, + 61, 55, 59, 75, 67, 89, 125, 63, 47, 45, + 49, 73, 59, 51, 55, 57, 31, 25, 33, 27, + 39, 77, 51, 45, 91, 63, 87, 87, 123, 113, + 105, 103, 27, 21, 105, 41, 43, 55, 89, 67, + 69, 75, 53, 63, 43, 37, 85, 97, 123, 45, + 69, 81, 13, 20, 30, 34, 70, 46, 58, 12, + 64, 54, 74, 62, 80, 70, 48, 84, 5, 23, + 39, 55, 55, 107, 107, 103, 32, 88, 80, 86, + 66, 70, 42, 40, 22, 20, 13, 20, 30, 34, + 70, 46, 58, 12, 64, 54, 74, 62, 80, 70, + 48, 84, 5, 23, 39, 55, 55, 
107, 107, 103, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 42 */ + + 52, 8, 35, 52, 8, 35, 20, 40, 52, 24, + 11, 57, 3, 54, 78, 18, 70, 41, 21, 48, + 13, 17, 70, 59, 57, 2, 25, 125, 125, 125, + 40, 8, 9, 21, 48, 13, 27, 24, 70, 3, + 6, 0, 6, 85, 93, 61, 95, 3, 15, 15, + 25, 53, 37, 69, 2, 9, 11, 3, 64, 2, + 44, 0, 0, 0, 8, 65, 67, 20, 23, 11, + 22, 1, 21, 62, 34, 10, 90, 86, 12, 34, + 36, 48, 16, 16, 68, 89, 51, 51, 55, 48, + 15, 34, 68, 3, 23, 2, 30, 3, 19, 17, + 35, 46, 23, 26, 0, 23, 14, 9, 1, 14, + 24, 32, 36, 14, 31, 7, 6, 3, 0, 15, + 60, 5, 2, 62, 54, 74, 60, 52, 67, 18, + 18, 23, 7, 25, 11, 62, 4, 8, 72, 110, + 124, 124, 92, 69, 87, 20, 84, 84, 123, 17, + 26, 69, 48, 11, 6, 70, 66, 108, 96, 90, + 89, 0, 35, 67, 67, 11, 28, 42, 22, 12, + 22, 22, 6, 32, 18, 27, 13, 12, 2, 0, + 23, 3, 17, 29, 5, 5, 2, 15, 17, 87, + 21, 25, 11, 43, 41, 19, 4, 13, 19, 13, + 23, 57, 29, 5, 85, 119, 65, 81, 69, 125, + 45, 33, 101, 3, 23, 23, 39, 45, 69, 59, + 45, 83, 59, 57, 65, 85, 1, 22, 78, 50, + 20, 8, 30, 18, 8, 0, 26, 38, 86, 56, + 42, 26, 68, 34, 40, 10, 68, 16, 72, 46, + 36, 24, 50, 22, 8, 6, 6, 9, 104, 60, + 22, 6, 40, 8, 13, 1, 34, 82, 72, 42, + 20, 64, 28, 24, 8, 6, 124, 11, 4, 18, + 14, 1, 14, 22, 30, 26, 38, 50, 74, 7, + 2, 17, 15, 124, 29, 75, 17, 8, 8, 15, + 15, 13, 21, 2, 27, 65, 11, 9, 46, 13, + 27, 11, 9, 1, 19, 17, 13, 53, 39, 19, + 23, 13, 50, 75, 27, 50, 39, 11, 11, 13, + 10, 7, 17, 6, 20, 15, 15, 87, 64, 60, + 62, 28, 7, 7, 11, 31, 13, 33, 43, 31, + 67, 59, 63, 83, 75, 97, 125, 69, 51, 49, + 53, 79, 63, 53, 57, 59, 31, 29, 39, 33, + 43, 81, 55, 49, 95, 67, 91, 91, 125, 117, + 107, 107, 29, 23, 109, 45, 45, 59, 93, 69, + 71, 77, 55, 65, 43, 35, 89, 99, 125, 49, + 71, 83, 13, 20, 30, 36, 72, 48, 58, 14, + 66, 56, 76, 62, 82, 70, 48, 84, 9, 27, + 43, 59, 59, 113, 111, 105, 32, 90, 80, 86, + 68, 72, 42, 40, 22, 22, 13, 20, 30, 36, + 72, 48, 58, 14, 66, 56, 76, 62, 82, 70, + 48, 84, 9, 27, 43, 59, 59, 113, 111, 105, + }, + + { + 
/* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 43 */ + + 50, 8, 37, 50, 8, 37, 24, 42, 54, 24, + 11, 59, 7, 52, 78, 20, 74, 43, 21, 52, + 13, 19, 72, 61, 59, 0, 27, 125, 125, 125, + 44, 12, 9, 21, 52, 13, 25, 26, 72, 3, + 8, 2, 8, 89, 95, 61, 97, 3, 15, 15, + 25, 55, 35, 69, 0, 9, 11, 1, 64, 2, + 44, 0, 0, 0, 10, 65, 67, 20, 23, 13, + 22, 0, 19, 66, 36, 12, 94, 90, 14, 36, + 40, 50, 18, 16, 72, 91, 51, 51, 55, 50, + 15, 36, 72, 3, 21, 4, 34, 3, 19, 17, + 35, 48, 25, 28, 0, 23, 16, 9, 0, 14, + 26, 34, 38, 14, 31, 7, 6, 3, 0, 15, + 60, 5, 2, 62, 54, 76, 62, 54, 67, 20, + 20, 25, 7, 25, 11, 64, 4, 10, 74, 112, + 124, 124, 94, 71, 89, 22, 84, 84, 123, 19, + 26, 69, 48, 13, 8, 72, 66, 108, 98, 92, + 91, 2, 37, 73, 73, 7, 28, 42, 22, 10, + 22, 20, 6, 30, 18, 27, 13, 12, 0, 0, + 25, 3, 19, 31, 5, 5, 2, 15, 19, 91, + 21, 27, 11, 45, 41, 25, 0, 17, 21, 17, + 27, 63, 33, 9, 89, 123, 67, 85, 73, 125, + 47, 37, 105, 7, 25, 25, 41, 49, 73, 63, + 49, 85, 59, 59, 67, 87, 2, 26, 82, 52, + 20, 8, 32, 18, 8, 2, 28, 40, 88, 58, + 44, 26, 70, 36, 42, 12, 72, 16, 74, 48, + 38, 24, 52, 22, 10, 8, 8, 9, 108, 62, + 22, 6, 40, 10, 13, 1, 34, 84, 72, 44, + 20, 66, 28, 24, 10, 6, 124, 7, 8, 22, + 18, 2, 16, 26, 32, 30, 42, 54, 78, 7, + 4, 17, 15, 124, 29, 77, 17, 8, 10, 15, + 15, 13, 21, 2, 27, 67, 11, 9, 46, 13, + 27, 11, 9, 1, 19, 17, 13, 55, 41, 19, + 23, 13, 52, 77, 27, 52, 41, 11, 11, 13, + 12, 7, 17, 8, 22, 15, 15, 91, 62, 58, + 60, 24, 11, 13, 15, 37, 17, 39, 49, 37, + 73, 65, 67, 89, 81, 105, 125, 73, 57, 53, + 57, 83, 67, 55, 59, 61, 31, 33, 43, 37, + 47, 85, 57, 51, 99, 71, 97, 95, 125, 121, + 111, 109, 31, 27, 113, 47, 49, 61, 97, 73, + 75, 79, 57, 67, 45, 33, 91, 103, 125, 51, + 75, 85, 11, 22, 32, 36, 74, 50, 60, 14, + 68, 58, 78, 64, 84, 72, 50, 82, 13, 31, + 47, 63, 63, 117, 115, 107, 34, 90, 82, 88, + 68, 74, 44, 42, 24, 22, 11, 22, 32, 36, + 74, 50, 60, 14, 68, 58, 78, 64, 84, 72, + 50, 82, 13, 31, 47, 63, 63, 117, 115, 107, + }, + + { + /* Context Tables 
for P, SP, B Slices :: cabac_init_idc = 2, qp = 44 */ + + 46, 8, 37, 46, 8, 37, 26, 46, 54, 24, + 13, 63, 9, 50, 80, 20, 78, 47, 21, 54, + 13, 19, 74, 63, 63, 3, 31, 125, 125, 125, + 48, 14, 9, 21, 54, 13, 25, 28, 72, 5, + 10, 4, 10, 91, 99, 63, 97, 3, 15, 13, + 25, 55, 35, 69, 0, 9, 11, 1, 66, 2, + 44, 0, 0, 0, 10, 67, 67, 22, 25, 13, + 20, 0, 17, 70, 38, 14, 96, 92, 14, 38, + 44, 52, 20, 18, 76, 93, 51, 51, 55, 50, + 17, 38, 76, 3, 19, 6, 38, 3, 19, 17, + 35, 48, 25, 28, 2, 23, 16, 9, 0, 14, + 26, 36, 38, 14, 31, 9, 6, 3, 0, 15, + 60, 5, 2, 64, 54, 76, 62, 54, 69, 20, + 22, 25, 7, 25, 11, 66, 4, 10, 76, 114, + 124, 124, 98, 71, 91, 22, 84, 84, 125, 19, + 28, 71, 50, 13, 8, 74, 66, 108, 98, 94, + 93, 6, 39, 79, 79, 5, 26, 42, 20, 8, + 20, 20, 4, 30, 16, 29, 15, 10, 0, 0, + 25, 5, 21, 33, 7, 7, 0, 17, 19, 93, + 23, 29, 11, 49, 43, 29, 3, 21, 25, 21, + 31, 71, 39, 11, 93, 125, 71, 89, 77, 125, + 51, 39, 109, 9, 27, 27, 43, 51, 77, 67, + 53, 89, 59, 61, 69, 91, 4, 28, 84, 52, + 20, 8, 34, 20, 10, 4, 32, 42, 90, 58, + 46, 28, 72, 38, 46, 14, 76, 18, 74, 48, + 38, 26, 54, 24, 10, 8, 10, 9, 110, 64, + 22, 6, 42, 10, 13, 0, 36, 84, 74, 44, + 20, 68, 30, 26, 10, 8, 124, 5, 10, 24, + 22, 4, 20, 28, 36, 32, 44, 58, 82, 5, + 6, 15, 13, 124, 29, 79, 17, 10, 10, 15, + 15, 13, 23, 4, 27, 69, 11, 9, 48, 13, + 27, 11, 9, 1, 21, 17, 13, 57, 43, 19, + 23, 13, 54, 79, 27, 52, 43, 11, 11, 13, + 12, 7, 19, 8, 24, 15, 15, 93, 60, 56, + 58, 20, 15, 17, 21, 43, 23, 45, 57, 43, + 81, 71, 71, 97, 89, 113, 125, 79, 61, 57, + 61, 89, 71, 59, 61, 63, 31, 39, 47, 41, + 51, 89, 61, 55, 103, 75, 101, 99, 125, 125, + 115, 111, 33, 29, 119, 51, 53, 65, 101, 75, + 77, 81, 59, 69, 45, 31, 93, 105, 125, 53, + 77, 87, 11, 22, 32, 38, 76, 50, 62, 14, + 70, 60, 80, 64, 86, 72, 52, 82, 17, 35, + 51, 67, 67, 121, 119, 111, 34, 92, 82, 90, + 70, 74, 44, 42, 24, 24, 11, 22, 32, 38, + 76, 50, 62, 14, 70, 60, 80, 64, 86, 72, + 52, 82, 17, 35, 51, 67, 67, 121, 119, 111, + }, + + { + /* Context Tables for P, 
SP, B Slices :: cabac_init_idc = 2, qp = 45 */ + + 44, 8, 37, 44, 8, 37, 30, 48, 54, 24, + 13, 67, 13, 48, 82, 20, 80, 51, 19, 58, + 15, 19, 76, 65, 65, 5, 35, 125, 125, 125, + 52, 16, 9, 19, 58, 15, 25, 30, 72, 5, + 10, 6, 14, 93, 101, 63, 97, 3, 13, 11, + 27, 55, 35, 69, 0, 7, 11, 0, 66, 2, + 44, 0, 0, 0, 12, 67, 67, 22, 27, 13, + 20, 2, 13, 76, 40, 16, 100, 96, 16, 40, + 48, 54, 22, 20, 82, 93, 53, 51, 57, 50, + 17, 40, 82, 1, 19, 6, 42, 3, 19, 17, + 33, 48, 25, 30, 4, 21, 18, 7, 0, 16, + 28, 38, 40, 14, 33, 9, 8, 3, 0, 15, + 62, 5, 2, 64, 56, 76, 62, 54, 69, 22, + 22, 27, 7, 23, 11, 68, 6, 12, 80, 118, + 124, 124, 102, 73, 93, 24, 86, 84, 125, 19, + 28, 71, 52, 13, 8, 78, 66, 108, 98, 98, + 95, 8, 41, 83, 85, 3, 26, 42, 20, 8, + 20, 20, 4, 30, 16, 31, 17, 10, 0, 0, + 25, 5, 21, 35, 7, 7, 0, 19, 21, 95, + 25, 29, 11, 51, 43, 33, 7, 23, 27, 23, + 35, 77, 43, 13, 97, 125, 73, 93, 81, 125, + 53, 41, 111, 11, 29, 29, 47, 55, 81, 71, + 55, 91, 59, 63, 71, 95, 8, 32, 86, 54, + 20, 8, 36, 22, 12, 4, 36, 46, 92, 60, + 46, 30, 76, 40, 48, 16, 82, 20, 76, 50, + 40, 26, 56, 26, 12, 10, 12, 7, 112, 66, + 22, 6, 44, 12, 11, 2, 38, 84, 76, 44, + 20, 70, 32, 28, 12, 10, 124, 3, 14, 28, + 24, 6, 22, 32, 40, 36, 48, 62, 88, 3, + 8, 13, 11, 124, 27, 81, 15, 12, 12, 15, + 15, 13, 23, 4, 29, 71, 11, 9, 50, 13, + 27, 11, 9, 0, 21, 19, 13, 59, 45, 21, + 23, 13, 56, 79, 29, 54, 43, 13, 11, 13, + 12, 9, 19, 8, 24, 15, 15, 97, 58, 54, + 56, 18, 21, 23, 27, 49, 27, 51, 63, 49, + 87, 75, 75, 105, 95, 121, 125, 85, 65, 59, + 65, 93, 75, 61, 63, 65, 31, 43, 53, 47, + 55, 93, 65, 57, 107, 79, 105, 103, 125, 125, + 117, 115, 35, 31, 123, 53, 55, 67, 105, 77, + 79, 83, 61, 71, 45, 29, 97, 107, 125, 57, + 79, 89, 11, 24, 34, 40, 78, 52, 62, 16, + 72, 62, 82, 66, 88, 74, 52, 80, 21, 39, + 55, 71, 71, 125, 123, 113, 36, 92, 84, 90, + 72, 76, 46, 44, 24, 26, 11, 24, 34, 40, + 78, 52, 62, 16, 72, 62, 82, 66, 88, 74, + 52, 80, 21, 39, 55, 71, 71, 125, 123, 113, + }, + + { + /* Context Tables for P, 
SP, B Slices :: cabac_init_idc = 2, qp = 46 */ + + 42, 8, 37, 42, 8, 37, 34, 52, 56, 24, + 15, 71, 15, 46, 82, 20, 84, 53, 19, 60, + 15, 21, 78, 67, 67, 7, 39, 125, 125, 125, + 56, 20, 9, 19, 60, 15, 23, 32, 74, 5, + 12, 8, 16, 95, 103, 63, 99, 3, 13, 9, + 27, 57, 35, 69, 0, 7, 11, 2, 68, 2, + 44, 0, 0, 0, 12, 67, 67, 24, 29, 13, + 18, 2, 11, 80, 42, 18, 102, 98, 18, 42, + 52, 56, 24, 22, 86, 95, 53, 51, 57, 52, + 17, 42, 86, 1, 17, 8, 46, 3, 19, 17, + 33, 50, 25, 30, 6, 21, 20, 7, 0, 16, + 28, 40, 40, 14, 33, 9, 8, 3, 0, 15, + 62, 5, 2, 66, 56, 78, 62, 56, 71, 22, + 24, 27, 7, 23, 11, 70, 6, 12, 82, 120, + 124, 124, 106, 73, 95, 24, 86, 84, 125, 19, + 30, 73, 52, 13, 8, 80, 66, 108, 98, 100, + 97, 12, 43, 89, 91, 0, 26, 42, 18, 6, + 18, 18, 2, 30, 14, 33, 19, 10, 0, 0, + 27, 7, 23, 37, 9, 7, 0, 21, 21, 97, + 25, 31, 11, 53, 45, 37, 11, 27, 31, 27, + 39, 83, 49, 15, 101, 125, 77, 97, 85, 125, + 55, 43, 115, 13, 31, 31, 49, 57, 85, 75, + 59, 95, 59, 65, 73, 97, 10, 34, 90, 54, + 20, 8, 38, 24, 14, 6, 40, 48, 94, 62, + 48, 30, 78, 42, 52, 18, 86, 20, 76, 50, + 40, 28, 58, 26, 12, 12, 14, 7, 116, 68, + 22, 6, 46, 12, 11, 4, 40, 86, 78, 46, + 20, 72, 32, 30, 12, 10, 124, 0, 16, 30, + 28, 8, 26, 34, 42, 38, 52, 66, 92, 3, + 10, 13, 9, 124, 27, 83, 15, 14, 14, 15, + 15, 13, 23, 6, 29, 73, 11, 9, 50, 13, + 27, 11, 9, 0, 23, 19, 13, 61, 47, 21, + 23, 13, 58, 81, 29, 56, 45, 13, 11, 13, + 14, 9, 21, 10, 26, 15, 15, 99, 56, 52, + 54, 14, 25, 27, 31, 55, 33, 57, 69, 55, + 93, 81, 79, 113, 103, 125, 125, 91, 69, 63, + 69, 99, 79, 63, 65, 67, 31, 47, 57, 51, + 59, 97, 69, 61, 111, 83, 109, 107, 125, 125, + 121, 117, 37, 33, 125, 57, 59, 71, 109, 81, + 83, 85, 63, 73, 45, 27, 99, 111, 125, 59, + 81, 91, 11, 24, 34, 40, 80, 54, 64, 16, + 74, 64, 84, 66, 90, 74, 54, 80, 25, 43, + 59, 75, 75, 125, 125, 115, 36, 94, 84, 92, + 72, 78, 46, 44, 26, 26, 11, 24, 34, 40, + 80, 54, 64, 16, 74, 64, 84, 66, 90, 74, + 54, 80, 25, 43, 59, 75, 75, 125, 125, 115, + }, + + { + /* Context Tables 
for P, SP, B Slices :: cabac_init_idc = 2, qp = 47 */ + + 40, 8, 37, 40, 8, 37, 38, 54, 56, 24, + 15, 75, 19, 44, 84, 20, 88, 57, 19, 64, + 15, 21, 80, 69, 69, 9, 43, 125, 125, 125, + 60, 22, 9, 19, 64, 15, 23, 34, 74, 5, + 14, 10, 18, 97, 105, 63, 99, 3, 13, 7, + 27, 57, 35, 69, 0, 7, 11, 4, 68, 2, + 44, 0, 0, 0, 14, 67, 67, 24, 31, 13, + 18, 4, 9, 84, 44, 20, 106, 102, 20, 44, + 56, 58, 26, 24, 90, 97, 53, 51, 57, 52, + 17, 44, 90, 1, 15, 10, 50, 3, 19, 17, + 33, 50, 25, 32, 8, 21, 22, 7, 0, 16, + 30, 42, 42, 14, 33, 9, 8, 3, 0, 15, + 62, 5, 2, 66, 56, 78, 62, 56, 71, 24, + 26, 29, 7, 23, 11, 72, 6, 14, 84, 122, + 124, 124, 110, 75, 97, 26, 86, 84, 125, 19, + 30, 73, 54, 13, 8, 82, 66, 108, 98, 102, + 99, 14, 45, 95, 97, 2, 26, 42, 18, 4, + 18, 18, 2, 30, 14, 35, 21, 10, 0, 0, + 27, 7, 25, 39, 9, 7, 0, 23, 23, 99, + 27, 33, 11, 55, 45, 41, 15, 31, 33, 31, + 43, 89, 53, 17, 105, 125, 79, 101, 89, 125, + 57, 45, 119, 15, 33, 33, 51, 61, 89, 79, + 63, 97, 59, 67, 75, 101, 14, 38, 92, 56, + 20, 8, 40, 26, 16, 8, 44, 50, 96, 64, + 50, 32, 80, 44, 54, 20, 90, 22, 78, 52, + 42, 28, 60, 28, 14, 14, 16, 7, 118, 70, + 22, 6, 48, 14, 11, 6, 42, 86, 80, 46, + 20, 74, 34, 32, 14, 12, 124, 2, 20, 34, + 32, 10, 28, 38, 46, 42, 56, 70, 96, 1, + 12, 11, 7, 124, 27, 85, 15, 16, 16, 15, + 15, 13, 23, 6, 29, 75, 11, 9, 52, 13, + 27, 11, 9, 0, 23, 19, 13, 63, 49, 21, + 23, 13, 60, 83, 29, 58, 47, 13, 11, 13, + 14, 9, 21, 10, 28, 15, 15, 103, 54, 50, + 52, 10, 29, 33, 37, 61, 37, 63, 75, 61, + 99, 87, 83, 121, 109, 125, 125, 97, 73, 67, + 73, 103, 83, 65, 67, 69, 31, 51, 61, 55, + 63, 101, 73, 63, 115, 87, 113, 111, 125, 125, + 125, 119, 39, 35, 125, 59, 63, 73, 113, 83, + 85, 87, 65, 75, 45, 25, 101, 113, 125, 61, + 83, 93, 11, 26, 36, 42, 82, 56, 66, 16, + 76, 66, 86, 68, 92, 76, 56, 78, 29, 47, + 63, 79, 79, 125, 125, 117, 38, 94, 86, 94, + 74, 80, 48, 46, 26, 28, 11, 26, 36, 42, + 82, 56, 66, 16, 76, 66, 86, 68, 92, 76, + 56, 78, 29, 47, 63, 79, 79, 125, 125, 117, + }, + + { + /* 
Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 48 */ + + 36, 6, 39, 36, 6, 39, 40, 56, 56, 24, + 17, 79, 23, 42, 84, 20, 90, 61, 19, 66, + 17, 23, 82, 71, 73, 13, 47, 125, 125, 125, + 64, 24, 9, 19, 66, 17, 23, 34, 74, 7, + 14, 10, 20, 101, 109, 65, 101, 5, 13, 7, + 29, 59, 35, 69, 1, 7, 11, 4, 68, 0, + 44, 0, 0, 0, 14, 69, 67, 24, 33, 15, + 16, 4, 7, 88, 46, 20, 108, 104, 20, 46, + 58, 58, 28, 24, 94, 99, 55, 53, 59, 52, + 19, 44, 94, 1, 15, 10, 52, 3, 19, 17, + 33, 50, 27, 32, 8, 21, 22, 7, 0, 16, + 30, 44, 42, 14, 35, 11, 8, 3, 1, 15, + 62, 5, 2, 66, 56, 78, 62, 56, 73, 24, + 26, 31, 7, 23, 13, 72, 6, 14, 86, 124, + 124, 124, 112, 77, 101, 26, 86, 84, 125, 21, + 30, 75, 54, 15, 8, 84, 66, 106, 98, 104, + 101, 16, 49, 101, 105, 4, 24, 42, 16, 2, + 16, 16, 0, 28, 12, 37, 23, 8, 1, 1, + 29, 9, 27, 41, 11, 9, 1, 25, 25, 103, + 29, 35, 11, 59, 47, 47, 21, 35, 37, 35, + 49, 97, 59, 21, 109, 125, 83, 107, 93, 125, + 61, 49, 123, 19, 35, 35, 55, 65, 93, 83, + 67, 101, 59, 69, 77, 105, 16, 40, 94, 56, + 20, 8, 40, 26, 16, 8, 46, 52, 96, 64, + 50, 32, 82, 46, 56, 20, 94, 22, 78, 52, + 42, 28, 60, 28, 14, 14, 18, 7, 120, 72, + 22, 4, 48, 14, 11, 6, 42, 86, 80, 46, + 20, 76, 34, 32, 14, 12, 124, 4, 22, 36, + 34, 12, 30, 40, 48, 44, 58, 72, 100, 1, + 12, 11, 7, 124, 27, 89, 15, 16, 16, 15, + 15, 13, 25, 6, 31, 77, 13, 9, 52, 15, + 27, 13, 9, 0, 25, 21, 15, 65, 51, 23, + 25, 13, 62, 85, 31, 58, 49, 15, 11, 15, + 14, 11, 23, 10, 28, 15, 15, 107, 50, 48, + 48, 6, 35, 39, 43, 67, 43, 69, 83, 67, + 107, 93, 87, 125, 117, 125, 125, 103, 79, 71, + 79, 109, 87, 69, 71, 71, 31, 57, 67, 61, + 67, 107, 77, 67, 121, 91, 119, 115, 125, 125, + 125, 123, 43, 39, 125, 63, 67, 77, 117, 87, + 89, 89, 67, 77, 47, 23, 105, 117, 125, 65, + 87, 97, 11, 26, 36, 42, 84, 56, 66, 16, + 78, 66, 88, 68, 92, 76, 56, 76, 33, 51, + 69, 85, 83, 125, 125, 121, 38, 94, 86, 94, + 74, 80, 48, 46, 26, 28, 11, 26, 36, 42, + 84, 56, 66, 16, 78, 66, 88, 68, 92, 76, + 56, 76, 33, 51, 69, 85, 83, 
125, 125, 121, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 49 */ + + 34, 6, 39, 34, 6, 39, 44, 60, 58, 26, + 17, 81, 25, 42, 86, 22, 94, 63, 17, 70, + 17, 23, 86, 71, 75, 15, 49, 125, 125, 125, + 68, 28, 7, 17, 70, 17, 21, 36, 76, 7, + 16, 12, 24, 103, 111, 65, 101, 5, 11, 5, + 29, 59, 33, 67, 1, 5, 9, 6, 70, 0, + 44, 0, 0, 0, 16, 69, 67, 26, 33, 15, + 16, 6, 3, 94, 50, 22, 112, 108, 22, 50, + 62, 60, 32, 26, 100, 99, 55, 53, 59, 54, + 19, 46, 100, 0, 13, 12, 56, 3, 17, 17, + 31, 52, 27, 34, 10, 19, 24, 5, 2, 18, + 32, 46, 44, 16, 35, 11, 10, 1, 1, 13, + 64, 3, 2, 68, 58, 80, 64, 58, 73, 26, + 28, 31, 7, 21, 13, 74, 8, 16, 90, 124, + 124, 124, 116, 77, 103, 28, 88, 86, 125, 21, + 32, 75, 56, 15, 10, 88, 66, 106, 100, 108, + 103, 20, 51, 105, 111, 8, 24, 42, 16, 2, + 16, 16, 0, 28, 12, 37, 23, 8, 1, 1, + 29, 9, 27, 41, 11, 9, 1, 25, 25, 105, + 29, 35, 11, 61, 47, 51, 25, 37, 39, 37, + 53, 103, 63, 23, 111, 125, 85, 111, 95, 125, + 63, 51, 125, 21, 35, 35, 57, 67, 95, 85, + 69, 103, 59, 69, 77, 107, 20, 44, 98, 58, + 20, 8, 42, 28, 18, 10, 50, 56, 98, 66, + 52, 34, 86, 48, 60, 22, 100, 24, 80, 54, + 44, 30, 62, 30, 16, 16, 22, 5, 124, 74, + 24, 4, 50, 16, 9, 8, 44, 88, 82, 48, + 22, 78, 36, 34, 16, 14, 124, 8, 26, 40, + 38, 16, 34, 44, 52, 48, 62, 76, 106, 0, + 14, 9, 5, 124, 25, 91, 13, 18, 18, 13, + 15, 11, 25, 8, 31, 77, 13, 9, 54, 15, + 27, 13, 9, 2, 25, 21, 15, 65, 51, 23, + 25, 11, 66, 85, 31, 60, 49, 15, 11, 15, + 16, 11, 23, 12, 30, 13, 13, 109, 48, 46, + 46, 4, 39, 43, 47, 73, 47, 73, 89, 71, + 113, 97, 91, 125, 123, 125, 125, 107, 83, 73, + 83, 113, 89, 71, 73, 71, 31, 61, 71, 65, + 71, 111, 79, 69, 125, 93, 123, 117, 125, 125, + 125, 125, 45, 41, 125, 65, 69, 79, 119, 89, + 91, 91, 67, 77, 47, 21, 107, 119, 125, 67, + 89, 99, 9, 28, 38, 44, 88, 58, 68, 18, + 82, 68, 90, 70, 94, 78, 58, 76, 35, 53, + 73, 89, 85, 125, 125, 123, 40, 96, 88, 96, + 76, 82, 50, 48, 28, 30, 9, 28, 38, 44, + 88, 58, 68, 18, 82, 68, 90, 70, 94, 
78, + 58, 76, 35, 53, 73, 89, 85, 125, 125, 123, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 50 */ + + 32, 6, 39, 32, 6, 39, 48, 62, 58, 26, + 17, 85, 29, 40, 88, 22, 98, 67, 17, 72, + 17, 23, 88, 73, 77, 17, 53, 125, 125, 125, + 72, 30, 7, 17, 72, 17, 21, 38, 76, 7, + 18, 14, 26, 105, 113, 65, 101, 5, 11, 3, + 29, 59, 33, 67, 1, 5, 9, 8, 70, 0, + 44, 0, 0, 0, 18, 69, 67, 26, 35, 15, + 16, 8, 1, 98, 52, 24, 116, 112, 24, 52, + 66, 62, 34, 28, 104, 101, 55, 53, 59, 54, + 19, 48, 104, 0, 11, 14, 60, 3, 17, 17, + 31, 52, 27, 34, 12, 19, 26, 5, 2, 18, + 32, 48, 46, 16, 35, 11, 10, 1, 1, 13, + 64, 3, 2, 68, 58, 80, 64, 58, 75, 28, + 30, 33, 7, 21, 13, 76, 8, 16, 92, 124, + 124, 124, 120, 79, 105, 28, 88, 86, 125, 21, + 32, 77, 58, 15, 10, 90, 66, 106, 100, 110, + 105, 22, 53, 111, 117, 10, 24, 42, 16, 0, + 14, 16, 0, 28, 10, 39, 25, 8, 1, 1, + 29, 11, 29, 43, 11, 9, 1, 27, 27, 107, + 31, 37, 11, 63, 49, 55, 29, 41, 43, 41, + 57, 109, 67, 25, 115, 125, 89, 115, 99, 125, + 65, 53, 125, 23, 37, 37, 59, 71, 99, 89, + 73, 107, 59, 71, 79, 111, 22, 46, 100, 60, + 20, 8, 44, 30, 20, 12, 54, 58, 100, 68, + 54, 36, 88, 50, 62, 24, 104, 26, 80, 54, + 44, 30, 64, 32, 18, 18, 24, 5, 124, 76, + 24, 4, 52, 18, 9, 10, 46, 88, 84, 48, + 22, 80, 38, 36, 18, 16, 124, 10, 30, 44, + 42, 18, 36, 48, 56, 50, 66, 80, 110, 2, + 16, 7, 3, 124, 25, 93, 13, 20, 20, 13, + 15, 11, 25, 8, 31, 79, 13, 9, 56, 15, + 27, 13, 9, 2, 25, 21, 15, 67, 53, 23, + 25, 11, 68, 87, 31, 62, 51, 15, 11, 15, + 16, 11, 23, 12, 32, 13, 13, 113, 46, 44, + 44, 0, 43, 49, 53, 79, 51, 79, 95, 77, + 119, 103, 95, 125, 125, 125, 125, 113, 87, 77, + 87, 117, 93, 73, 75, 73, 31, 65, 75, 69, + 75, 115, 83, 73, 125, 97, 125, 121, 125, 125, + 125, 125, 47, 43, 125, 69, 73, 83, 123, 91, + 93, 93, 69, 79, 47, 19, 109, 121, 125, 69, + 91, 101, 9, 28, 40, 46, 90, 60, 70, 18, + 84, 70, 92, 70, 96, 80, 60, 74, 39, 57, + 77, 93, 89, 125, 125, 125, 42, 96, 88, 98, + 78, 84, 50, 50, 28, 32, 9, 28, 40, 
46, + 90, 60, 70, 18, 84, 70, 92, 70, 96, 80, + 60, 74, 39, 57, 77, 93, 89, 125, 125, 125, + }, + + { + /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 51 */ + + 30, 6, 39, 30, 6, 39, 52, 66, 60, 26, + 19, 89, 31, 38, 88, 22, 102, 69, 17, 76, + 17, 25, 90, 75, 79, 19, 57, 125, 125, 125, + 76, 34, 7, 17, 76, 17, 19, 40, 78, 7, + 20, 16, 28, 107, 115, 65, 103, 5, 11, 1, + 29, 61, 33, 67, 1, 5, 9, 10, 72, 0, + 44, 0, 0, 0, 18, 69, 67, 28, 37, 15, + 14, 8, 0, 102, 54, 26, 118, 114, 26, 54, + 70, 64, 36, 30, 108, 103, 55, 53, 59, 56, + 19, 50, 108, 0, 9, 16, 64, 3, 17, 17, + 31, 54, 27, 36, 14, 19, 28, 5, 2, 18, + 34, 50, 46, 16, 35, 11, 10, 1, 1, 13, + 64, 3, 2, 70, 58, 82, 64, 60, 75, 28, + 32, 33, 7, 21, 13, 78, 8, 18, 94, 124, + 124, 124, 124, 79, 107, 30, 88, 86, 125, 21, + 34, 77, 58, 15, 10, 92, 66, 106, 100, 112, + 107, 26, 55, 117, 123, 14, 24, 42, 14, 1, + 14, 14, 1, 28, 10, 41, 27, 8, 1, 1, + 31, 11, 31, 45, 13, 9, 1, 29, 27, 109, + 31, 39, 11, 65, 49, 59, 33, 45, 45, 45, + 61, 115, 73, 27, 119, 125, 91, 119, 103, 125, + 67, 55, 125, 25, 39, 39, 61, 73, 103, 93, + 77, 109, 59, 73, 81, 113, 26, 50, 104, 60, + 20, 8, 46, 32, 22, 14, 58, 60, 102, 70, + 56, 36, 90, 52, 66, 26, 108, 26, 82, 56, + 46, 32, 66, 32, 18, 20, 26, 5, 124, 78, + 24, 4, 54, 18, 9, 12, 48, 90, 86, 50, + 22, 82, 38, 38, 18, 16, 124, 14, 32, 46, + 46, 20, 40, 50, 58, 54, 70, 84, 114, 2, + 18, 7, 1, 124, 25, 95, 13, 22, 22, 13, + 15, 11, 25, 10, 31, 81, 13, 9, 56, 15, + 27, 13, 9, 2, 27, 21, 15, 69, 55, 23, + 25, 11, 70, 89, 31, 64, 53, 15, 11, 15, + 18, 11, 25, 14, 34, 13, 13, 115, 44, 42, + 42, 3, 47, 53, 57, 85, 57, 85, 101, 83, + 125, 109, 99, 125, 125, 125, 125, 119, 91, 81, + 91, 123, 97, 75, 77, 75, 31, 69, 79, 73, + 79, 119, 87, 75, 125, 101, 125, 125, 125, 125, + 125, 125, 49, 45, 125, 71, 77, 85, 125, 95, + 97, 95, 71, 81, 47, 17, 111, 125, 125, 71, + 93, 103, 9, 30, 40, 46, 92, 62, 72, 18, + 86, 72, 94, 72, 98, 80, 62, 74, 43, 61, + 81, 97, 93, 125, 125, 125, 42, 
98, 90, 100, + 78, 86, 52, 50, 30, 32, 9, 30, 40, 46, + 92, 62, 72, 18, 86, 72, 94, 72, 98, 80, + 62, 74, 43, 61, 81, 97, 93, 125, 125, 125, + }, + + }, + + { + + { + /* Context Tables for I, SI Slices :: qp = 0 */ + + 124, 18, 21, 124, 18, 21, 125, 81, 20, 18, + 24, 60, 122, 124, 108, 28, 109, 12, 29, 3, + 2, 28, 19, 26, 1, 40, 124, 7, 53, 81, + 125, 81, 7, 29, 3, 2, 45, 63, 4, 36, + 11, 35, 65, 16, 7, 45, 49, 10, 25, 61, + 18, 11, 35, 49, 7, 21, 21, 33, 17, 10, + 44, 0, 0, 0, 39, 45, 67, 17, 44, 2, + 104, 16, 11, 125, 77, 37, 21, 87, 125, 125, + 125, 63, 125, 101, 125, 119, 103, 117, 103, 0, + 9, 41, 81, 13, 59, 53, 125, 21, 67, 55, + 125, 14, 37, 25, 123, 59, 47, 27, 15, 0, + 9, 41, 2, 3, 4, 14, 5, 1, 4, 29, + 26, 22, 56, 38, 50, 36, 34, 38, 92, 24, + 26, 88, 60, 2, 89, 73, 75, 55, 61, 49, + 41, 45, 39, 47, 61, 13, 17, 21, 8, 77, + 73, 63, 23, 17, 23, 15, 34, 11, 2, 3, + 52, 17, 12, 18, 2, 17, 124, 108, 76, 90, + 108, 88, 52, 90, 68, 60, 66, 36, 10, 2, + 4, 50, 36, 48, 42, 38, 36, 44, 28, 58, + 42, 16, 24, 34, 51, 124, 124, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 124, 92, + 124, 120, 82, 124, 124, 124, 124, 120, 116, 124, + 94, 82, 30, 52, 6, 9, 67, 15, 42, 26, + 18, 2, 10, 0, 17, 21, 55, 7, 72, 48, + 38, 34, 1, 9, 29, 27, 45, 57, 16, 6, + 2, 3, 19, 25, 33, 49, 93, 67, 41, 31, + 19, 21, 45, 65, 67, 107, 29, 60, 30, 20, + 2, 15, 31, 45, 53, 67, 124, 59, 41, 31, + 5, 15, 2, 6, 8, 23, 2, 10, 5, 31, + 15, 9, 38, 2, 54, 46, 72, 68, 38, 54, + 62, 42, 30, 2, 34, 1, 81, 67, 65, 49, + 43, 43, 43, 49, 5, 27, 25, 25, 10, 25, + 39, 71, 63, 63, 25, 21, 13, 23, 9, 3, + 19, 2, 2, 9, 23, 16, 1, 13, 114, 88, + 94, 98, 100, 104, 96, 94, 80, 80, 86, 74, + 38, 46, 32, 92, 84, 82, 72, 68, 56, 26, + 12, 0, 27, 37, 61, 11, 91, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 122, 100, + 56, 10, 124, 124, 66, 124, 124, 124, 120, 124, + 116, 104, 116, 102, 104, 68, 74, 48, 5, 84, + 64, 26, 113, 97, 101, 43, 57, 51, 15, 35, + 33, 9, 13, 14, 9, 26, 21, 124, 124, 
124, + 124, 120, 114, 58, 18, 37, 23, 80, 58, 40, + 18, 16, 4, 1, 9, 57, 85, 67, 53, 53, + 49, 19, 31, 45, 19, 13, 11, 5, 1, 10, + 8, 124, 124, 124, 124, 120, 108, 86, 54, 7, + }, + + { + /* Context Tables for I, SI Slices :: qp = 1 */ + + 124, 18, 21, 124, 18, 21, 123, 77, 22, 20, + 24, 58, 120, 124, 108, 28, 103, 12, 27, 1, + 2, 28, 17, 24, 3, 40, 124, 9, 55, 81, + 121, 77, 7, 27, 1, 2, 43, 59, 6, 36, + 9, 33, 63, 16, 7, 43, 49, 10, 23, 59, + 18, 11, 33, 49, 5, 19, 19, 31, 15, 10, + 44, 0, 0, 0, 37, 45, 67, 15, 44, 2, + 104, 16, 11, 123, 75, 37, 19, 83, 123, 123, + 123, 59, 123, 97, 123, 115, 101, 115, 101, 2, + 7, 39, 79, 11, 57, 51, 123, 19, 65, 53, + 123, 16, 35, 23, 119, 57, 45, 25, 13, 2, + 7, 39, 4, 1, 4, 14, 3, 1, 4, 27, + 26, 22, 56, 38, 50, 36, 34, 38, 90, 24, + 26, 86, 58, 2, 87, 71, 73, 53, 59, 47, + 39, 43, 37, 45, 57, 13, 17, 19, 6, 75, + 71, 63, 21, 17, 21, 13, 34, 9, 2, 3, + 50, 15, 12, 16, 2, 17, 124, 108, 76, 90, + 108, 88, 52, 90, 68, 58, 66, 36, 10, 2, + 4, 50, 36, 48, 42, 38, 34, 44, 28, 56, + 40, 16, 22, 32, 51, 124, 124, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 120, 88, + 124, 118, 80, 124, 124, 124, 124, 116, 112, 122, + 90, 78, 30, 50, 4, 9, 67, 13, 44, 28, + 20, 4, 10, 0, 15, 19, 53, 5, 74, 50, + 40, 34, 0, 7, 27, 25, 43, 55, 18, 8, + 4, 1, 17, 23, 31, 47, 89, 65, 37, 29, + 17, 19, 43, 63, 65, 103, 27, 62, 32, 22, + 4, 13, 29, 43, 51, 65, 124, 57, 39, 29, + 5, 13, 2, 8, 10, 21, 4, 12, 3, 29, + 15, 9, 38, 4, 54, 46, 70, 68, 38, 52, + 60, 42, 30, 2, 32, 1, 79, 65, 63, 47, + 41, 41, 41, 47, 5, 25, 23, 23, 10, 23, + 37, 69, 61, 63, 25, 19, 13, 21, 9, 3, + 17, 2, 2, 7, 21, 16, 1, 13, 114, 88, + 94, 98, 98, 104, 96, 94, 80, 80, 86, 74, + 38, 44, 30, 90, 82, 80, 70, 66, 54, 26, + 12, 0, 25, 35, 59, 11, 89, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 122, 118, 96, + 54, 10, 124, 124, 64, 124, 124, 124, 116, 124, + 112, 100, 112, 98, 100, 66, 70, 46, 7, 82, + 62, 24, 109, 93, 97, 41, 55, 49, 11, 33, + 31, 9, 11, 18, 5, 30, 
19, 124, 124, 124, + 124, 116, 110, 54, 14, 39, 21, 82, 58, 40, + 18, 18, 4, 1, 9, 55, 83, 65, 51, 51, + 45, 17, 29, 43, 17, 11, 9, 3, 0, 12, + 8, 124, 124, 124, 124, 118, 106, 82, 52, 7, + }, + + { + /* Context Tables for I, SI Slices :: qp = 2 */ + + 124, 18, 21, 124, 18, 21, 119, 75, 22, 20, + 24, 56, 118, 122, 108, 28, 99, 12, 25, 0, + 2, 26, 17, 22, 5, 38, 120, 13, 57, 83, + 115, 75, 7, 25, 0, 2, 43, 57, 6, 34, + 9, 33, 61, 16, 7, 43, 49, 10, 23, 57, + 18, 11, 33, 49, 5, 19, 19, 31, 15, 10, + 44, 0, 0, 0, 35, 45, 67, 15, 42, 2, + 104, 16, 11, 121, 73, 37, 19, 81, 119, 119, + 121, 57, 119, 95, 119, 113, 99, 113, 99, 4, + 7, 37, 77, 11, 57, 49, 119, 19, 65, 53, + 121, 16, 35, 23, 117, 57, 43, 25, 13, 2, + 7, 37, 4, 1, 2, 14, 3, 1, 4, 27, + 26, 22, 54, 38, 48, 36, 34, 38, 86, 24, + 26, 82, 56, 0, 85, 69, 71, 51, 57, 45, + 37, 41, 37, 43, 55, 13, 17, 19, 4, 75, + 69, 63, 21, 17, 19, 13, 32, 7, 2, 3, + 48, 13, 10, 14, 2, 19, 120, 106, 74, 88, + 106, 86, 50, 88, 68, 56, 64, 36, 10, 2, + 4, 48, 34, 46, 40, 36, 32, 42, 26, 52, + 38, 14, 20, 30, 51, 124, 124, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 116, 82, + 124, 114, 76, 124, 124, 124, 124, 112, 108, 116, + 86, 74, 28, 46, 2, 11, 67, 13, 44, 28, + 20, 4, 10, 0, 15, 19, 51, 5, 74, 50, + 40, 34, 2, 7, 25, 25, 41, 53, 20, 10, + 4, 1, 15, 23, 31, 45, 87, 63, 35, 27, + 17, 19, 41, 61, 63, 101, 27, 62, 32, 22, + 4, 11, 27, 41, 49, 63, 124, 57, 39, 29, + 5, 13, 2, 8, 10, 21, 4, 12, 1, 29, + 15, 9, 36, 4, 52, 44, 68, 66, 38, 50, + 58, 42, 30, 0, 30, 3, 77, 63, 61, 47, + 41, 41, 39, 45, 5, 25, 23, 23, 8, 23, + 37, 69, 59, 63, 25, 19, 13, 19, 9, 3, + 15, 2, 2, 7, 19, 14, 1, 15, 112, 88, + 94, 96, 96, 102, 94, 92, 78, 78, 84, 72, + 36, 42, 28, 86, 80, 76, 66, 64, 52, 24, + 10, 0, 25, 35, 59, 13, 87, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 118, 114, 92, + 52, 8, 124, 120, 62, 124, 124, 124, 112, 120, + 108, 96, 108, 94, 96, 62, 66, 42, 9, 78, + 58, 20, 107, 91, 95, 39, 53, 47, 7, 31, + 29, 9, 9, 20, 
3, 32, 17, 124, 124, 124, + 124, 110, 104, 48, 10, 41, 21, 82, 58, 40, + 18, 18, 4, 1, 9, 53, 81, 63, 49, 49, + 43, 15, 27, 41, 15, 9, 7, 3, 2, 12, + 8, 124, 124, 124, 122, 114, 102, 78, 48, 9, + }, + + { + /* Context Tables for I, SI Slices :: qp = 3 */ + + 124, 18, 21, 124, 18, 21, 115, 71, 24, 20, + 22, 52, 114, 120, 108, 28, 95, 12, 23, 2, + 2, 24, 17, 20, 7, 38, 116, 15, 59, 83, + 109, 73, 7, 23, 2, 2, 41, 55, 8, 34, + 9, 31, 59, 14, 9, 43, 49, 10, 23, 57, + 18, 11, 33, 49, 3, 19, 19, 31, 13, 10, + 44, 0, 0, 0, 35, 45, 67, 13, 40, 2, + 104, 16, 11, 119, 71, 37, 17, 79, 115, 115, + 117, 55, 115, 93, 115, 111, 97, 111, 97, 6, + 7, 35, 75, 11, 55, 49, 115, 19, 63, 51, + 119, 16, 35, 21, 113, 55, 41, 25, 13, 2, + 7, 35, 6, 0, 2, 14, 3, 1, 4, 27, + 26, 20, 54, 38, 46, 36, 34, 38, 82, 24, + 24, 78, 54, 1, 83, 67, 69, 49, 55, 45, + 35, 41, 35, 41, 53, 13, 17, 19, 2, 73, + 67, 63, 21, 17, 17, 13, 30, 5, 2, 3, + 46, 11, 10, 12, 2, 21, 118, 104, 74, 86, + 104, 84, 50, 86, 66, 54, 62, 36, 10, 2, + 2, 46, 32, 44, 38, 34, 30, 40, 26, 48, + 36, 14, 18, 28, 51, 124, 124, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 124, 110, 78, + 124, 110, 74, 124, 122, 124, 118, 108, 102, 112, + 82, 68, 26, 42, 0, 13, 67, 13, 46, 28, + 20, 4, 10, 0, 15, 19, 51, 5, 74, 50, + 40, 34, 4, 5, 25, 23, 41, 51, 22, 10, + 6, 1, 13, 21, 29, 45, 85, 61, 33, 25, + 15, 19, 39, 59, 61, 99, 25, 62, 32, 22, + 4, 9, 27, 39, 47, 61, 124, 55, 37, 27, + 5, 13, 2, 8, 10, 21, 4, 12, 1, 29, + 15, 9, 36, 6, 50, 42, 66, 64, 38, 48, + 56, 42, 30, 0, 28, 3, 75, 61, 59, 45, + 39, 39, 39, 43, 5, 25, 23, 21, 8, 23, + 37, 67, 57, 63, 25, 19, 13, 17, 9, 3, + 13, 2, 2, 7, 17, 12, 1, 17, 110, 86, + 92, 94, 94, 100, 92, 90, 76, 76, 82, 70, + 34, 40, 26, 84, 78, 74, 62, 60, 50, 22, + 10, 1, 25, 35, 59, 13, 85, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 114, 108, 88, + 48, 6, 122, 118, 58, 124, 124, 120, 108, 116, + 104, 92, 104, 90, 90, 58, 62, 38, 11, 74, + 54, 18, 105, 89, 93, 37, 51, 45, 5, 29, + 27, 9, 7, 
24, 0, 36, 15, 124, 124, 124, + 124, 104, 98, 42, 6, 43, 21, 82, 58, 40, + 18, 18, 4, 1, 9, 53, 79, 61, 47, 47, + 41, 15, 27, 39, 15, 9, 7, 3, 2, 12, + 8, 124, 124, 124, 118, 110, 98, 74, 44, 11, + }, + + { + /* Context Tables for I, SI Slices :: qp = 4 */ + + 124, 18, 21, 124, 18, 21, 113, 69, 24, 20, + 22, 50, 112, 116, 108, 28, 89, 10, 21, 2, + 2, 22, 17, 18, 9, 36, 112, 19, 61, 85, + 103, 71, 7, 21, 2, 2, 41, 53, 8, 32, + 9, 31, 59, 14, 9, 41, 49, 10, 23, 55, + 16, 13, 33, 49, 3, 17, 19, 29, 13, 10, + 44, 0, 0, 0, 33, 47, 67, 13, 38, 2, + 104, 16, 11, 117, 69, 37, 17, 75, 113, 111, + 115, 53, 113, 89, 111, 109, 97, 109, 97, 6, + 7, 33, 73, 11, 55, 47, 111, 19, 63, 51, + 117, 16, 33, 21, 111, 55, 41, 25, 11, 2, + 7, 35, 6, 0, 0, 12, 3, 1, 4, 27, + 26, 20, 52, 38, 46, 36, 34, 36, 78, 24, + 24, 74, 52, 3, 81, 65, 67, 47, 55, 43, + 33, 39, 35, 39, 51, 13, 17, 17, 0, 73, + 65, 63, 21, 17, 17, 13, 28, 3, 2, 3, + 42, 9, 8, 10, 2, 23, 114, 102, 72, 84, + 102, 82, 48, 84, 66, 50, 60, 34, 10, 2, + 2, 44, 32, 42, 38, 32, 28, 38, 24, 44, + 34, 12, 16, 26, 51, 124, 124, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 118, 106, 72, + 124, 108, 70, 124, 118, 124, 114, 102, 98, 106, + 78, 64, 24, 40, 3, 15, 67, 13, 46, 30, + 20, 4, 10, 0, 15, 19, 49, 3, 76, 50, + 40, 34, 6, 5, 23, 23, 39, 51, 24, 12, + 6, 1, 13, 21, 29, 43, 83, 61, 31, 25, + 15, 19, 37, 57, 61, 97, 25, 64, 32, 22, + 4, 7, 25, 39, 45, 59, 124, 55, 37, 27, + 5, 13, 2, 8, 10, 19, 4, 12, 0, 29, + 15, 9, 34, 6, 48, 40, 64, 62, 38, 44, + 54, 40, 30, 1, 26, 5, 75, 61, 57, 45, + 39, 39, 37, 41, 7, 25, 23, 21, 6, 23, + 37, 67, 55, 63, 25, 17, 13, 17, 9, 3, + 11, 2, 0, 7, 15, 12, 3, 19, 108, 86, + 92, 92, 92, 98, 90, 88, 74, 74, 80, 68, + 32, 38, 24, 80, 74, 70, 58, 58, 48, 20, + 8, 1, 25, 35, 59, 15, 85, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 120, 110, 104, 84, + 46, 4, 118, 114, 56, 124, 124, 116, 104, 110, + 100, 88, 100, 86, 86, 54, 58, 34, 13, 70, + 50, 14, 103, 87, 91, 37, 49, 43, 1, 27, + 25, 9, 5, 
26, 2, 38, 15, 124, 124, 124, + 124, 98, 92, 36, 2, 45, 21, 82, 58, 40, + 18, 18, 4, 1, 9, 51, 77, 59, 45, 47, + 39, 13, 25, 37, 13, 7, 5, 1, 4, 14, + 8, 124, 124, 124, 114, 106, 94, 70, 40, 13, + }, + + { + /* Context Tables for I, SI Slices :: qp = 5 */ + + 124, 18, 21, 124, 18, 21, 109, 65, 24, 20, + 20, 46, 108, 114, 108, 28, 85, 10, 19, 4, + 2, 22, 15, 16, 11, 36, 108, 23, 63, 85, + 97, 67, 7, 19, 4, 2, 41, 51, 8, 32, + 9, 31, 57, 14, 11, 41, 49, 10, 23, 53, + 16, 13, 33, 49, 1, 17, 17, 29, 11, 10, + 44, 0, 0, 0, 33, 47, 67, 11, 36, 2, + 104, 16, 11, 115, 67, 37, 15, 73, 109, 107, + 111, 51, 109, 87, 107, 107, 95, 107, 95, 8, + 7, 31, 71, 11, 53, 45, 107, 19, 63, 49, + 113, 18, 33, 19, 109, 53, 39, 25, 11, 4, + 5, 33, 8, 2, 0, 12, 3, 1, 4, 27, + 26, 18, 50, 38, 44, 36, 34, 36, 74, 24, + 22, 72, 50, 5, 79, 63, 65, 45, 53, 41, + 31, 37, 33, 37, 49, 13, 17, 17, 1, 71, + 63, 63, 19, 17, 15, 13, 26, 1, 2, 3, + 40, 7, 8, 8, 2, 23, 112, 100, 72, 82, + 100, 80, 46, 84, 66, 48, 58, 34, 10, 2, + 0, 44, 30, 40, 36, 30, 26, 38, 22, 40, + 32, 10, 14, 24, 51, 124, 124, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 114, 102, 68, + 120, 104, 66, 124, 114, 120, 110, 98, 94, 100, + 74, 58, 22, 36, 5, 15, 67, 13, 46, 30, + 20, 4, 10, 0, 15, 19, 49, 3, 76, 50, + 40, 34, 8, 3, 21, 23, 37, 49, 26, 14, + 6, 0, 11, 19, 27, 43, 81, 59, 27, 23, + 15, 17, 35, 55, 59, 95, 23, 64, 34, 22, + 4, 5, 23, 37, 43, 57, 124, 55, 37, 25, + 5, 13, 2, 8, 10, 19, 4, 14, 0, 29, + 15, 9, 32, 8, 46, 38, 62, 62, 38, 42, + 52, 40, 30, 3, 24, 5, 73, 59, 55, 43, + 37, 37, 37, 39, 7, 25, 23, 21, 4, 23, + 37, 65, 53, 63, 25, 17, 13, 15, 9, 3, + 9, 2, 0, 7, 13, 10, 3, 19, 106, 86, + 90, 92, 90, 96, 88, 86, 74, 72, 78, 66, + 30, 36, 22, 78, 72, 68, 54, 56, 46, 18, + 6, 3, 25, 33, 59, 15, 83, 124, 124, 124, + 124, 124, 124, 124, 124, 120, 116, 106, 100, 80, + 42, 2, 114, 110, 54, 122, 124, 112, 100, 106, + 96, 84, 96, 82, 80, 50, 54, 30, 15, 66, + 46, 12, 101, 83, 89, 35, 47, 41, 2, 25, + 23, 9, 3, 30, 6, 
42, 13, 124, 124, 124, + 124, 94, 86, 32, 1, 47, 21, 82, 58, 40, + 18, 18, 4, 1, 9, 51, 75, 57, 43, 45, + 37, 11, 25, 35, 11, 5, 3, 1, 4, 14, + 8, 124, 124, 124, 112, 102, 90, 66, 36, 15, + }, + + { + /* Context Tables for I, SI Slices :: qp = 6 */ + + 124, 18, 23, 124, 18, 23, 105, 63, 26, 20, + 20, 44, 106, 112, 108, 28, 81, 10, 19, 6, + 2, 20, 15, 14, 13, 34, 106, 25, 65, 87, + 91, 65, 7, 19, 6, 2, 39, 49, 10, 30, + 7, 29, 55, 12, 11, 41, 49, 10, 21, 53, + 16, 13, 31, 49, 1, 17, 17, 29, 11, 10, + 44, 0, 0, 0, 31, 47, 67, 11, 36, 0, + 104, 16, 11, 113, 67, 37, 15, 71, 105, 103, + 109, 49, 105, 85, 103, 105, 93, 105, 93, 10, + 7, 29, 71, 9, 53, 45, 103, 19, 61, 49, + 111, 18, 33, 19, 105, 53, 37, 23, 11, 4, + 5, 31, 8, 2, 1, 12, 3, 1, 4, 27, + 26, 18, 50, 38, 42, 36, 34, 36, 70, 24, + 22, 68, 48, 7, 79, 61, 65, 45, 51, 41, + 29, 37, 33, 37, 45, 13, 17, 17, 3, 71, + 61, 63, 19, 17, 13, 11, 24, 1, 2, 3, + 38, 5, 6, 6, 2, 25, 108, 98, 70, 82, + 98, 80, 46, 82, 64, 46, 56, 34, 10, 2, + 0, 42, 28, 38, 34, 30, 24, 36, 22, 36, + 30, 10, 12, 22, 51, 124, 124, 124, 124, 124, + 124, 124, 124, 124, 124, 124, 124, 108, 96, 62, + 116, 100, 64, 124, 108, 114, 104, 94, 88, 96, + 68, 54, 20, 32, 7, 17, 67, 11, 48, 30, + 22, 4, 10, 0, 15, 19, 47, 3, 76, 52, + 40, 34, 10, 3, 21, 21, 37, 47, 28, 14, + 8, 0, 9, 19, 27, 41, 79, 57, 25, 21, + 13, 17, 35, 55, 57, 91, 23, 64, 34, 22, + 6, 5, 23, 35, 43, 55, 124, 53, 35, 25, + 5, 13, 2, 8, 10, 19, 6, 14, 2, 29, + 15, 11, 32, 8, 44, 36, 60, 60, 38, 40, + 50, 40, 30, 3, 22, 7, 71, 57, 53, 43, + 37, 37, 35, 39, 7, 23, 21, 19, 4, 23, + 37, 65, 51, 63, 25, 17, 13, 13, 9, 3, + 7, 0, 0, 7, 13, 8, 3, 21, 104, 84, + 90, 90, 88, 96, 88, 84, 72, 72, 76, 64, + 28, 34, 20, 74, 70, 64, 50, 52, 42, 16, + 6, 3, 25, 33, 57, 17, 81, 124, 124, 124, + 124, 124, 124, 124, 124, 116, 110, 102, 94, 76, + 40, 2, 112, 108, 50, 118, 124, 108, 96, 102, + 92, 80, 90, 78, 76, 46, 50, 28, 19, 62, + 42, 8, 99, 81, 87, 33, 45, 39, 4, 23, + 21, 9, 1, 32, 8, 44, 11, 124, 
124, 124, + 118, 88, 82, 26, 5, 51, 19, 82, 58, 40, + 18, 18, 4, 1, 9, 49, 73, 57, 41, 43, + 35, 11, 23, 33, 11, 5, 3, 1, 6, 14, + 8, 124, 124, 122, 108, 100, 88, 60, 34, 17, + }, + + { + /* Context Tables for I, SI Slices :: qp = 7 */ + + 124, 18, 23, 124, 18, 23, 101, 59, 26, 20, + 18, 40, 102, 108, 108, 28, 75, 8, 17, 6, + 2, 18, 15, 12, 15, 34, 102, 29, 67, 87, + 85, 63, 7, 17, 6, 2, 39, 47, 10, 30, + 7, 29, 55, 12, 13, 39, 49, 10, 21, 51, + 14, 13, 31, 49, 0, 15, 17, 27, 9, 10, + 44, 0, 0, 0, 31, 47, 67, 9, 34, 0, + 104, 16, 11, 111, 65, 37, 13, 67, 103, 99, + 105, 47, 103, 81, 99, 103, 91, 103, 93, 12, + 7, 27, 69, 9, 51, 43, 99, 19, 61, 47, + 109, 18, 31, 17, 103, 51, 37, 23, 9, 4, + 5, 29, 10, 4, 1, 10, 3, 1, 4, 27, + 26, 16, 48, 38, 42, 36, 34, 34, 66, 24, + 20, 64, 46, 9, 77, 59, 63, 43, 49, 39, + 27, 35, 31, 35, 43, 13, 17, 15, 5, 69, + 59, 63, 19, 17, 13, 11, 22, 0, 2, 3, + 34, 3, 6, 4, 2, 27, 106, 96, 70, 80, + 96, 78, 44, 80, 64, 44, 54, 34, 10, 2, + 1, 40, 28, 36, 34, 28, 22, 34, 20, 32, + 28, 8, 10, 20, 51, 124, 124, 124, 124, 124, + 124, 124, 124, 124, 124, 120, 122, 104, 92, 58, + 112, 98, 60, 124, 104, 110, 100, 88, 84, 90, + 64, 48, 18, 30, 11, 19, 67, 11, 48, 32, + 22, 4, 10, 0, 15, 19, 47, 1, 78, 52, + 40, 34, 12, 1, 19, 21, 35, 45, 30, 16, + 8, 0, 7, 17, 25, 41, 77, 57, 23, 21, + 13, 17, 33, 53, 57, 89, 21, 66, 34, 22, + 6, 3, 21, 33, 41, 53, 124, 53, 35, 23, + 5, 13, 2, 8, 10, 17, 6, 14, 2, 29, + 15, 11, 30, 10, 42, 34, 58, 58, 38, 38, + 48, 38, 30, 5, 20, 7, 69, 57, 51, 41, + 35, 35, 35, 37, 7, 23, 21, 19, 2, 23, + 37, 63, 49, 63, 25, 15, 13, 13, 9, 3, + 5, 0, 0, 7, 11, 8, 5, 23, 102, 84, + 88, 88, 86, 94, 86, 82, 70, 70, 74, 62, + 26, 32, 18, 72, 66, 62, 46, 50, 40, 14, + 4, 5, 25, 33, 57, 17, 79, 124, 124, 124, + 124, 124, 124, 124, 122, 112, 106, 98, 90, 72, + 36, 0, 108, 104, 48, 114, 124, 104, 92, 98, + 88, 76, 86, 74, 70, 42, 46, 24, 21, 58, + 38, 6, 97, 79, 85, 33, 43, 37, 8, 21, + 19, 9, 0, 36, 12, 48, 11, 124, 124, 122, + 112, 82, 
76, 20, 9, 53, 19, 82, 58, 40, + 18, 18, 4, 1, 9, 49, 71, 55, 39, 41, + 33, 9, 23, 31, 9, 3, 1, 0, 6, 16, + 8, 124, 124, 118, 104, 96, 84, 56, 30, 19, + }, + + { + /* Context Tables for I, SI Slices :: qp = 8 */ + + 124, 16, 23, 124, 16, 23, 99, 57, 26, 20, + 18, 38, 100, 106, 108, 28, 71, 8, 15, 8, + 2, 16, 15, 10, 19, 32, 98, 33, 69, 89, + 81, 61, 7, 15, 8, 2, 39, 45, 10, 28, + 7, 29, 53, 10, 13, 39, 51, 10, 21, 51, + 14, 15, 31, 49, 0, 15, 17, 27, 9, 10, + 44, 0, 0, 0, 29, 49, 67, 9, 32, 0, + 104, 16, 11, 109, 63, 37, 13, 65, 99, 95, + 103, 45, 99, 79, 97, 101, 91, 101, 91, 12, + 7, 25, 67, 9, 51, 43, 97, 19, 61, 47, + 107, 18, 31, 17, 101, 51, 35, 23, 9, 4, + 5, 29, 10, 4, 3, 10, 3, 1, 4, 27, + 26, 16, 46, 38, 40, 36, 34, 34, 62, 24, + 20, 60, 44, 11, 75, 57, 61, 41, 49, 39, + 25, 35, 31, 33, 41, 13, 17, 15, 9, 69, + 57, 63, 19, 19, 11, 11, 20, 2, 2, 3, + 32, 1, 4, 2, 2, 29, 102, 94, 68, 78, + 94, 76, 42, 78, 62, 40, 52, 32, 10, 2, + 1, 38, 26, 34, 32, 26, 20, 32, 18, 28, + 24, 6, 8, 18, 51, 124, 124, 124, 124, 124, + 124, 124, 124, 124, 122, 116, 116, 98, 86, 52, + 108, 94, 56, 122, 100, 104, 94, 84, 78, 84, + 60, 44, 16, 26, 13, 21, 69, 11, 48, 32, + 22, 4, 10, 0, 15, 19, 45, 1, 78, 52, + 40, 34, 14, 1, 19, 21, 35, 45, 32, 16, + 8, 0, 7, 17, 25, 39, 75, 55, 21, 19, + 13, 17, 31, 51, 55, 87, 21, 66, 34, 22, + 6, 1, 21, 33, 39, 53, 124, 53, 35, 23, + 5, 13, 2, 8, 10, 17, 6, 14, 4, 29, + 15, 11, 28, 10, 40, 32, 56, 56, 38, 34, + 44, 38, 30, 7, 18, 9, 69, 55, 49, 41, + 35, 35, 33, 35, 9, 23, 21, 19, 0, 23, + 37, 63, 49, 65, 25, 15, 13, 11, 9, 3, + 5, 0, 1, 7, 9, 6, 5, 25, 100, 82, + 88, 86, 82, 92, 84, 80, 68, 68, 72, 60, + 24, 30, 16, 68, 64, 58, 42, 46, 38, 12, + 2, 5, 25, 33, 57, 19, 79, 124, 124, 124, + 124, 124, 124, 122, 116, 108, 102, 94, 84, 68, + 34, 1, 104, 100, 44, 110, 122, 98, 86, 92, + 82, 72, 82, 68, 66, 38, 40, 20, 23, 54, + 34, 2, 95, 77, 83, 31, 41, 37, 10, 19, + 19, 9, 0, 38, 14, 50, 9, 124, 124, 116, + 106, 76, 70, 14, 13, 55, 19, 82, 58, 40, 
+ 18, 18, 4, 1, 9, 47, 71, 53, 37, 41, + 31, 9, 21, 31, 9, 3, 1, 0, 8, 16, + 6, 124, 124, 114, 100, 92, 80, 52, 26, 21, + }, + + { + /* Context Tables for I, SI Slices :: qp = 9 */ + + 124, 16, 23, 124, 16, 23, 95, 55, 28, 20, + 18, 36, 98, 104, 108, 28, 67, 8, 13, 10, + 2, 16, 13, 8, 21, 30, 94, 35, 71, 91, + 75, 57, 7, 13, 10, 2, 37, 43, 12, 26, + 7, 27, 51, 10, 13, 39, 51, 10, 21, 49, + 14, 15, 31, 49, 0, 15, 15, 27, 9, 10, + 44, 0, 0, 0, 27, 49, 67, 9, 30, 0, + 104, 16, 11, 107, 61, 37, 13, 63, 95, 91, + 99, 41, 95, 77, 93, 99, 89, 99, 89, 14, + 5, 23, 65, 9, 49, 41, 93, 19, 59, 47, + 103, 20, 31, 17, 97, 51, 33, 23, 9, 6, + 3, 27, 10, 4, 3, 10, 1, 1, 4, 25, + 26, 16, 46, 38, 38, 36, 34, 34, 58, 24, + 20, 58, 42, 11, 73, 55, 59, 39, 47, 37, + 23, 33, 31, 31, 39, 13, 17, 15, 11, 67, + 55, 63, 17, 19, 9, 11, 18, 4, 2, 3, + 30, 0, 2, 0, 2, 29, 100, 92, 68, 76, + 92, 74, 42, 78, 62, 38, 50, 32, 10, 2, + 1, 38, 24, 32, 30, 24, 18, 32, 18, 26, + 22, 6, 6, 16, 51, 124, 124, 124, 124, 124, + 124, 124, 124, 124, 118, 112, 112, 92, 82, 46, + 106, 90, 54, 118, 96, 100, 90, 80, 74, 80, + 56, 40, 16, 22, 15, 21, 69, 11, 50, 32, + 22, 6, 10, 0, 13, 19, 43, 1, 78, 52, + 42, 34, 16, 0, 17, 19, 33, 43, 34, 18, + 10, 2, 5, 15, 25, 37, 73, 53, 17, 17, + 11, 15, 29, 49, 53, 85, 19, 66, 36, 24, + 6, 0, 19, 31, 37, 51, 124, 51, 33, 21, + 5, 13, 2, 10, 12, 17, 6, 16, 6, 29, + 15, 11, 28, 10, 38, 32, 54, 56, 38, 32, + 42, 38, 30, 7, 16, 11, 67, 53, 47, 41, + 33, 35, 31, 33, 9, 23, 21, 17, 0, 23, + 37, 63, 47, 65, 25, 15, 13, 9, 9, 3, + 3, 0, 1, 7, 7, 4, 5, 25, 98, 82, + 88, 86, 80, 90, 82, 78, 68, 66, 70, 60, + 24, 28, 14, 66, 62, 54, 38, 44, 36, 12, + 2, 5, 23, 31, 57, 21, 77, 124, 124, 124, + 124, 124, 124, 118, 112, 104, 98, 90, 80, 64, + 32, 3, 100, 98, 42, 106, 118, 94, 82, 88, + 78, 68, 78, 64, 62, 36, 36, 16, 25, 50, + 30, 1, 93, 73, 79, 29, 39, 35, 14, 17, + 17, 9, 2, 42, 16, 54, 7, 124, 124, 112, + 100, 72, 64, 10, 17, 57, 19, 82, 58, 40, + 18, 20, 4, 1, 9, 45, 69, 51, 35, 
39, + 27, 7, 19, 29, 7, 1, 0, 0, 10, 16, + 6, 124, 122, 112, 98, 88, 76, 48, 22, 21, + }, + + { + /* Context Tables for I, SI Slices :: qp = 10 */ + + 124, 16, 23, 124, 16, 23, 91, 51, 28, 20, + 16, 32, 94, 100, 108, 28, 61, 6, 11, 10, + 2, 14, 13, 6, 23, 30, 90, 39, 73, 91, + 69, 55, 7, 11, 10, 2, 37, 41, 12, 26, + 7, 27, 51, 10, 15, 37, 51, 10, 21, 47, + 12, 15, 31, 49, 2, 13, 15, 25, 7, 10, + 44, 0, 0, 0, 27, 49, 67, 7, 28, 0, + 104, 16, 11, 105, 59, 37, 11, 59, 93, 87, + 97, 39, 93, 73, 89, 97, 87, 97, 89, 16, + 5, 21, 63, 9, 49, 39, 89, 19, 59, 45, + 101, 20, 29, 15, 95, 49, 33, 23, 7, 6, + 3, 25, 12, 6, 5, 8, 1, 1, 4, 25, + 26, 14, 44, 38, 38, 36, 34, 32, 54, 24, + 18, 54, 40, 13, 71, 53, 57, 37, 45, 35, + 21, 31, 29, 29, 37, 13, 17, 13, 13, 67, + 53, 63, 17, 19, 9, 11, 16, 6, 2, 3, + 26, 2, 2, 1, 2, 31, 96, 90, 66, 74, + 90, 72, 40, 76, 62, 36, 48, 32, 10, 2, + 3, 36, 24, 30, 30, 22, 16, 30, 16, 22, + 20, 4, 4, 14, 51, 124, 124, 124, 124, 124, + 124, 124, 124, 124, 114, 108, 106, 88, 78, 42, + 102, 88, 50, 112, 92, 96, 86, 74, 70, 74, + 52, 34, 14, 20, 19, 23, 69, 11, 50, 34, + 22, 6, 10, 0, 13, 19, 43, 0, 80, 52, + 42, 34, 18, 0, 15, 19, 31, 41, 36, 20, + 10, 2, 3, 15, 23, 37, 71, 53, 15, 17, + 11, 15, 27, 47, 53, 83, 19, 68, 36, 24, + 6, 2, 17, 29, 35, 49, 124, 51, 33, 21, + 5, 13, 2, 10, 12, 15, 6, 16, 6, 29, + 15, 11, 26, 12, 36, 30, 52, 54, 38, 30, + 40, 36, 30, 9, 14, 11, 65, 53, 45, 39, + 33, 33, 31, 31, 9, 23, 21, 17, 1, 23, + 37, 61, 45, 65, 25, 13, 13, 9, 9, 3, + 1, 0, 1, 7, 5, 4, 7, 27, 96, 82, + 86, 84, 78, 88, 80, 76, 66, 64, 68, 58, + 22, 26, 12, 62, 58, 52, 34, 42, 34, 10, + 0, 7, 23, 31, 57, 21, 75, 124, 124, 124, + 124, 124, 120, 114, 106, 100, 94, 86, 76, 60, + 28, 5, 96, 94, 40, 102, 114, 90, 78, 84, + 74, 64, 74, 60, 56, 32, 32, 12, 27, 46, + 26, 3, 91, 71, 77, 29, 37, 33, 18, 15, + 15, 9, 4, 44, 20, 56, 7, 124, 120, 106, + 94, 66, 58, 4, 21, 59, 19, 82, 58, 40, + 18, 20, 4, 1, 9, 45, 67, 49, 33, 37, + 25, 5, 19, 27, 5, 0, 2, 2, 10, 18, + 
6, 120, 118, 108, 94, 84, 72, 44, 18, 23, + }, + + { + /* Context Tables for I, SI Slices :: qp = 11 */ + + 124, 16, 25, 124, 16, 25, 87, 49, 30, 20, + 16, 30, 92, 98, 108, 28, 57, 6, 11, 12, + 2, 12, 13, 4, 25, 28, 88, 41, 75, 93, + 63, 53, 7, 11, 12, 2, 35, 39, 14, 24, + 5, 25, 49, 8, 15, 37, 51, 10, 19, 47, + 12, 15, 29, 49, 2, 13, 15, 25, 7, 10, + 44, 0, 0, 0, 25, 49, 67, 7, 28, 1, + 104, 16, 11, 103, 59, 37, 11, 57, 89, 83, + 93, 37, 89, 71, 85, 95, 85, 95, 87, 18, + 5, 19, 63, 7, 47, 39, 85, 19, 57, 45, + 99, 20, 29, 15, 91, 49, 31, 21, 7, 6, + 3, 23, 12, 6, 5, 8, 1, 1, 4, 25, + 26, 14, 44, 38, 36, 36, 34, 32, 50, 24, + 18, 50, 38, 15, 71, 51, 57, 37, 43, 35, + 19, 31, 29, 29, 33, 13, 17, 13, 15, 65, + 51, 63, 17, 19, 7, 9, 14, 6, 2, 3, + 24, 4, 0, 3, 2, 33, 94, 88, 66, 74, + 88, 72, 40, 74, 60, 34, 46, 32, 10, 2, + 3, 34, 22, 28, 28, 22, 14, 28, 16, 18, + 18, 4, 2, 12, 51, 124, 124, 124, 124, 124, + 124, 124, 124, 124, 108, 104, 102, 82, 72, 36, + 98, 84, 48, 108, 86, 90, 80, 70, 64, 70, + 46, 30, 12, 16, 21, 25, 69, 9, 52, 34, + 24, 6, 10, 0, 13, 19, 41, 0, 80, 54, + 42, 34, 20, 2, 15, 17, 31, 39, 38, 20, + 12, 2, 1, 13, 23, 35, 69, 51, 13, 15, + 9, 15, 27, 47, 51, 79, 17, 68, 36, 24, + 8, 2, 17, 27, 35, 47, 124, 49, 31, 19, + 5, 13, 2, 10, 12, 15, 8, 16, 8, 29, + 15, 13, 26, 12, 34, 28, 50, 52, 38, 28, + 38, 36, 30, 9, 12, 13, 63, 51, 43, 39, + 31, 33, 29, 31, 9, 21, 19, 15, 1, 23, + 37, 61, 43, 65, 25, 13, 13, 7, 9, 3, + 0, 1, 1, 7, 5, 2, 7, 29, 94, 80, + 86, 82, 76, 88, 80, 74, 64, 64, 66, 56, + 20, 24, 10, 60, 56, 48, 30, 38, 30, 8, + 0, 7, 23, 31, 55, 23, 73, 124, 124, 124, + 124, 124, 116, 110, 102, 96, 88, 82, 70, 56, + 26, 5, 94, 92, 36, 98, 108, 86, 74, 80, + 70, 60, 68, 56, 52, 28, 28, 10, 31, 42, + 22, 7, 89, 69, 75, 27, 35, 31, 20, 13, + 13, 9, 6, 48, 22, 60, 5, 122, 118, 102, + 88, 60, 54, 1, 25, 63, 17, 82, 58, 40, + 18, 20, 4, 1, 9, 43, 65, 49, 31, 35, + 23, 5, 17, 25, 5, 0, 2, 2, 12, 18, + 6, 118, 116, 104, 90, 82, 70, 38, 16, 25, + }, + + { 
+ /* Context Tables for I, SI Slices :: qp = 12 */ + + 124, 16, 25, 124, 16, 25, 85, 45, 30, 20, + 14, 26, 88, 96, 108, 28, 53, 6, 9, 14, + 2, 10, 13, 2, 27, 28, 84, 45, 77, 93, + 57, 51, 7, 9, 14, 2, 35, 37, 14, 24, + 5, 25, 47, 8, 17, 37, 51, 10, 19, 45, + 12, 17, 29, 49, 4, 13, 15, 25, 5, 10, + 44, 0, 0, 0, 25, 51, 67, 5, 26, 1, + 104, 16, 11, 101, 57, 37, 9, 55, 85, 79, + 91, 35, 85, 69, 81, 93, 85, 93, 85, 18, + 5, 17, 61, 7, 47, 37, 81, 19, 57, 43, + 97, 20, 29, 13, 89, 47, 29, 21, 7, 6, + 3, 23, 14, 8, 7, 8, 1, 1, 4, 25, + 26, 12, 42, 38, 34, 36, 34, 32, 46, 24, + 16, 46, 36, 17, 69, 49, 55, 35, 43, 33, + 17, 29, 27, 27, 31, 13, 17, 13, 17, 65, + 49, 63, 17, 19, 5, 9, 12, 8, 2, 3, + 22, 6, 0, 5, 2, 35, 90, 86, 64, 72, + 86, 70, 38, 72, 60, 30, 44, 30, 10, 2, + 5, 32, 20, 26, 26, 20, 12, 26, 14, 14, + 16, 2, 0, 10, 51, 124, 124, 122, 124, 124, + 124, 124, 124, 122, 104, 100, 96, 78, 68, 32, + 94, 80, 44, 104, 82, 86, 76, 66, 60, 64, + 42, 24, 10, 12, 23, 27, 69, 9, 52, 34, + 24, 6, 10, 0, 13, 19, 41, 0, 80, 54, + 42, 34, 22, 2, 13, 17, 29, 39, 40, 22, + 12, 2, 1, 13, 21, 35, 67, 49, 11, 13, + 9, 15, 25, 45, 49, 77, 17, 68, 36, 24, + 8, 4, 15, 27, 33, 45, 124, 49, 31, 19, + 5, 13, 2, 10, 12, 15, 8, 16, 8, 29, + 15, 13, 24, 14, 32, 26, 48, 50, 38, 24, + 36, 36, 30, 11, 10, 13, 63, 49, 41, 37, + 31, 31, 29, 29, 11, 21, 19, 15, 3, 23, + 37, 59, 41, 65, 25, 13, 13, 5, 9, 3, + 2, 1, 3, 7, 3, 0, 7, 31, 92, 80, + 84, 80, 74, 86, 78, 72, 62, 62, 64, 54, + 18, 22, 8, 56, 54, 46, 26, 36, 28, 6, + 1, 9, 23, 31, 55, 23, 73, 124, 124, 124, + 124, 124, 112, 106, 96, 92, 84, 78, 66, 52, + 22, 7, 90, 88, 34, 94, 104, 82, 70, 74, + 66, 56, 64, 52, 46, 24, 24, 6, 33, 38, + 18, 9, 87, 67, 73, 25, 33, 29, 24, 11, + 11, 9, 8, 50, 26, 62, 3, 118, 114, 96, + 82, 54, 48, 7, 29, 65, 17, 82, 58, 40, + 18, 20, 4, 1, 9, 43, 63, 47, 29, 35, + 21, 3, 17, 23, 3, 2, 4, 2, 12, 18, + 6, 116, 112, 100, 86, 78, 66, 34, 12, 27, + }, + + { + /* Context Tables for I, SI Slices :: qp = 13 */ + + 
124, 16, 25, 124, 16, 25, 81, 43, 30, 20, + 14, 24, 86, 92, 108, 28, 47, 4, 7, 14, + 2, 10, 11, 0, 29, 26, 80, 49, 79, 95, + 51, 47, 7, 7, 14, 2, 35, 35, 14, 22, + 5, 25, 47, 8, 17, 35, 51, 10, 19, 43, + 10, 17, 29, 49, 4, 11, 13, 23, 5, 10, + 44, 0, 0, 0, 23, 51, 67, 5, 24, 1, + 104, 16, 11, 99, 55, 37, 9, 51, 83, 75, + 87, 33, 83, 65, 77, 91, 83, 91, 85, 20, + 5, 15, 59, 7, 45, 35, 77, 19, 57, 43, + 93, 22, 27, 13, 87, 47, 29, 21, 5, 8, + 1, 21, 14, 8, 7, 6, 1, 1, 4, 25, + 26, 12, 40, 38, 34, 36, 34, 30, 42, 24, + 16, 44, 34, 19, 67, 47, 53, 33, 41, 31, + 15, 27, 27, 25, 29, 13, 17, 11, 19, 63, + 47, 63, 15, 19, 5, 9, 10, 10, 2, 3, + 18, 8, 1, 7, 2, 35, 88, 84, 64, 70, + 84, 68, 36, 72, 60, 28, 42, 30, 10, 2, + 5, 32, 20, 24, 26, 18, 10, 26, 12, 10, + 14, 0, 1, 8, 51, 122, 124, 118, 124, 122, + 120, 120, 120, 118, 100, 96, 92, 72, 64, 26, + 90, 78, 40, 98, 78, 82, 72, 60, 56, 58, + 38, 20, 8, 10, 27, 27, 69, 9, 52, 36, + 24, 6, 10, 0, 13, 19, 39, 2, 82, 54, + 42, 34, 24, 4, 11, 17, 27, 37, 42, 24, + 12, 4, 0, 11, 21, 33, 65, 49, 7, 13, + 9, 13, 23, 43, 49, 75, 15, 70, 38, 24, + 8, 6, 13, 25, 31, 43, 124, 49, 31, 17, + 5, 13, 2, 10, 12, 13, 8, 18, 10, 29, + 15, 13, 22, 14, 30, 24, 46, 50, 38, 22, + 34, 34, 30, 13, 8, 15, 61, 49, 39, 37, + 29, 31, 27, 27, 11, 21, 19, 15, 5, 23, + 37, 59, 39, 65, 25, 11, 13, 5, 9, 3, + 4, 1, 3, 7, 1, 0, 9, 31, 90, 80, + 84, 80, 72, 84, 76, 70, 62, 60, 62, 52, + 16, 20, 6, 54, 50, 42, 22, 34, 26, 4, + 3, 9, 23, 29, 55, 25, 71, 124, 124, 124, + 124, 120, 108, 102, 92, 88, 80, 74, 62, 48, + 20, 9, 86, 84, 32, 90, 100, 78, 66, 70, + 62, 52, 60, 48, 42, 20, 20, 2, 35, 34, + 14, 13, 85, 63, 71, 25, 31, 27, 28, 9, + 9, 9, 10, 54, 28, 66, 3, 116, 110, 92, + 76, 50, 42, 11, 33, 67, 17, 82, 58, 40, + 18, 20, 4, 1, 9, 41, 61, 45, 27, 33, + 19, 1, 15, 21, 1, 4, 6, 4, 14, 20, + 6, 112, 110, 98, 84, 74, 62, 30, 8, 29, + }, + + { + /* Context Tables for I, SI Slices :: qp = 14 */ + + 122, 16, 25, 122, 16, 25, 77, 39, 32, 20, + 12, 20, 82, 90, 108, 
28, 43, 4, 5, 16, + 2, 8, 11, 1, 31, 26, 76, 51, 81, 95, + 45, 45, 7, 5, 16, 2, 33, 33, 16, 22, + 5, 23, 45, 6, 19, 35, 51, 10, 19, 43, + 10, 17, 29, 49, 6, 11, 13, 23, 3, 10, + 44, 0, 0, 0, 23, 51, 67, 3, 22, 1, + 104, 16, 11, 97, 53, 37, 7, 49, 79, 71, + 85, 31, 79, 63, 73, 89, 81, 89, 83, 22, + 5, 13, 57, 7, 45, 35, 73, 19, 55, 41, + 91, 22, 27, 11, 83, 45, 27, 21, 5, 8, + 1, 19, 16, 10, 9, 6, 1, 1, 4, 25, + 26, 10, 40, 38, 32, 36, 34, 30, 38, 24, + 14, 40, 32, 21, 65, 45, 51, 31, 39, 31, + 13, 27, 25, 23, 27, 13, 17, 11, 21, 63, + 45, 63, 15, 19, 3, 9, 8, 12, 2, 3, + 16, 10, 1, 9, 2, 37, 84, 82, 62, 68, + 82, 66, 36, 70, 58, 26, 40, 30, 10, 2, + 7, 30, 18, 22, 24, 16, 8, 24, 12, 6, + 12, 0, 3, 6, 51, 120, 122, 116, 124, 118, + 116, 116, 116, 112, 94, 92, 86, 68, 58, 22, + 86, 74, 38, 94, 74, 76, 66, 56, 50, 54, + 34, 14, 6, 6, 29, 29, 69, 9, 54, 36, + 24, 6, 10, 0, 13, 19, 39, 2, 82, 54, + 42, 34, 26, 4, 11, 15, 27, 35, 44, 24, + 14, 4, 2, 11, 19, 33, 63, 47, 5, 11, + 7, 13, 21, 41, 47, 73, 15, 70, 38, 24, + 8, 8, 13, 23, 29, 41, 124, 47, 29, 17, + 5, 13, 2, 10, 12, 13, 8, 18, 10, 29, + 15, 13, 22, 16, 28, 22, 44, 48, 38, 20, + 32, 34, 30, 13, 6, 15, 59, 47, 37, 35, + 29, 29, 27, 25, 11, 21, 19, 13, 5, 23, + 37, 57, 37, 65, 25, 11, 13, 3, 9, 3, + 6, 1, 3, 7, 0, 1, 9, 33, 88, 78, + 82, 78, 70, 82, 74, 68, 60, 58, 60, 50, + 14, 18, 4, 50, 48, 40, 18, 30, 24, 2, + 3, 11, 23, 29, 55, 25, 69, 124, 124, 122, + 122, 114, 104, 98, 86, 84, 76, 70, 56, 44, + 16, 11, 82, 82, 28, 86, 96, 74, 62, 66, + 58, 48, 56, 44, 36, 16, 16, 1, 37, 30, + 10, 15, 83, 61, 69, 23, 29, 25, 30, 7, + 7, 9, 12, 56, 32, 68, 1, 112, 108, 86, + 70, 44, 36, 17, 37, 69, 17, 82, 58, 40, + 18, 20, 4, 1, 9, 41, 59, 43, 25, 31, + 17, 1, 15, 19, 1, 4, 6, 4, 14, 20, + 6, 110, 106, 94, 80, 70, 58, 26, 4, 31, + }, + + { + /* Context Tables for I, SI Slices :: qp = 15 */ + + 120, 16, 25, 120, 16, 25, 73, 37, 32, 20, + 12, 18, 80, 88, 108, 28, 39, 4, 3, 18, + 2, 6, 11, 3, 33, 24, 72, 55, 83, 97, + 39, 43, 
7, 3, 18, 2, 33, 31, 16, 20, + 5, 23, 43, 6, 19, 35, 51, 10, 19, 41, + 10, 17, 29, 49, 6, 11, 13, 23, 3, 10, + 44, 0, 0, 0, 21, 51, 67, 3, 20, 1, + 104, 16, 11, 95, 51, 37, 7, 47, 75, 67, + 81, 29, 75, 61, 69, 87, 79, 87, 81, 24, + 5, 11, 55, 7, 43, 33, 69, 19, 55, 41, + 89, 22, 27, 11, 81, 45, 25, 21, 5, 8, + 1, 17, 16, 10, 9, 6, 1, 1, 4, 25, + 26, 10, 38, 38, 30, 36, 34, 30, 34, 24, + 14, 36, 30, 23, 63, 43, 49, 29, 37, 29, + 11, 25, 25, 21, 25, 13, 17, 11, 23, 61, + 43, 63, 15, 19, 1, 9, 6, 14, 2, 3, + 14, 12, 3, 11, 2, 39, 82, 80, 62, 66, + 80, 64, 34, 68, 58, 24, 38, 30, 10, 2, + 7, 28, 16, 20, 22, 14, 6, 22, 10, 2, + 10, 1, 5, 4, 51, 116, 120, 112, 120, 114, + 112, 112, 112, 108, 90, 88, 82, 62, 54, 16, + 82, 70, 34, 90, 70, 72, 62, 52, 46, 48, + 30, 10, 4, 2, 31, 31, 69, 9, 54, 36, + 24, 6, 10, 0, 13, 19, 37, 2, 82, 54, + 42, 34, 28, 6, 9, 15, 25, 33, 46, 26, + 14, 4, 4, 9, 19, 31, 61, 45, 3, 9, + 7, 13, 19, 39, 45, 71, 13, 70, 38, 24, + 8, 10, 11, 21, 27, 39, 124, 47, 29, 15, + 5, 13, 2, 10, 12, 13, 8, 18, 12, 29, + 15, 13, 20, 16, 26, 20, 42, 46, 38, 18, + 30, 34, 30, 15, 4, 17, 57, 45, 35, 35, + 27, 29, 25, 23, 11, 21, 19, 13, 7, 23, + 37, 57, 35, 65, 25, 11, 13, 1, 9, 3, + 8, 1, 3, 7, 2, 3, 9, 35, 86, 78, + 82, 76, 68, 80, 72, 66, 58, 56, 58, 48, + 12, 16, 2, 48, 46, 36, 14, 28, 22, 0, + 5, 11, 23, 29, 55, 27, 67, 124, 124, 118, + 118, 108, 100, 94, 82, 80, 72, 66, 52, 40, + 14, 13, 78, 78, 26, 82, 92, 70, 58, 62, + 54, 44, 52, 40, 32, 12, 12, 5, 39, 26, + 6, 19, 81, 59, 67, 21, 27, 23, 34, 5, + 5, 9, 14, 60, 34, 72, 0, 110, 104, 82, + 64, 38, 30, 23, 41, 71, 17, 82, 58, 40, + 18, 20, 4, 1, 9, 39, 57, 41, 23, 29, + 15, 0, 13, 17, 0, 6, 8, 4, 16, 20, + 6, 108, 104, 90, 76, 66, 54, 22, 0, 33, + }, + + { + /* Context Tables for I, SI Slices :: qp = 16 */ + + 116, 14, 27, 116, 14, 27, 71, 35, 32, 20, + 10, 14, 76, 84, 106, 28, 35, 2, 3, 18, + 0, 4, 11, 7, 37, 22, 68, 59, 85, 99, + 35, 41, 9, 3, 18, 0, 33, 29, 16, 18, + 5, 23, 43, 4, 21, 35, 53, 10, 19, 41, + 
8, 19, 29, 49, 6, 11, 13, 23, 3, 8, + 44, 0, 0, 0, 21, 53, 67, 3, 18, 3, + 104, 14, 11, 93, 51, 37, 7, 45, 73, 65, + 79, 27, 73, 59, 67, 85, 79, 85, 81, 24, + 5, 11, 55, 7, 43, 33, 67, 19, 55, 41, + 87, 22, 27, 11, 79, 45, 25, 21, 5, 8, + 1, 17, 16, 10, 11, 4, 1, 3, 4, 25, + 24, 8, 36, 38, 28, 34, 34, 28, 30, 22, + 12, 32, 28, 25, 63, 43, 49, 29, 37, 29, + 9, 25, 25, 21, 23, 15, 17, 11, 27, 61, + 43, 63, 15, 21, 1, 9, 4, 14, 2, 3, + 10, 12, 5, 13, 2, 41, 78, 78, 60, 64, + 78, 62, 32, 66, 56, 20, 36, 28, 8, 2, + 9, 26, 14, 18, 20, 12, 4, 20, 8, 1, + 6, 3, 9, 0, 51, 112, 116, 108, 116, 110, + 106, 106, 106, 102, 84, 82, 76, 56, 48, 10, + 78, 66, 30, 84, 64, 66, 56, 46, 40, 42, + 24, 4, 2, 1, 35, 33, 71, 9, 54, 36, + 24, 6, 10, 1, 13, 19, 37, 2, 82, 54, + 42, 34, 30, 6, 9, 15, 25, 33, 46, 26, + 14, 4, 4, 9, 19, 31, 59, 45, 1, 9, + 7, 13, 19, 39, 45, 69, 13, 70, 38, 24, + 8, 10, 11, 21, 27, 39, 124, 47, 29, 15, + 5, 13, 2, 10, 12, 13, 8, 18, 12, 29, + 15, 15, 18, 16, 24, 18, 40, 44, 36, 14, + 26, 32, 28, 17, 0, 19, 57, 45, 33, 35, + 27, 29, 25, 23, 13, 21, 19, 13, 9, 23, + 37, 57, 35, 67, 25, 11, 13, 1, 11, 3, + 8, 3, 5, 7, 2, 5, 11, 37, 84, 76, + 80, 74, 64, 78, 70, 64, 56, 54, 56, 46, + 10, 12, 1, 44, 42, 32, 10, 24, 18, 1, + 7, 13, 23, 29, 55, 29, 67, 124, 122, 114, + 112, 102, 94, 88, 76, 74, 66, 60, 46, 34, + 10, 15, 74, 74, 22, 78, 86, 64, 52, 56, + 48, 40, 46, 34, 26, 8, 6, 9, 43, 22, + 2, 23, 79, 57, 65, 21, 27, 23, 36, 5, + 5, 9, 14, 62, 36, 74, 0, 106, 100, 76, + 56, 32, 24, 29, 47, 75, 17, 82, 56, 38, + 18, 20, 4, 3, 9, 39, 57, 41, 23, 29, + 13, 0, 13, 17, 0, 6, 8, 4, 16, 20, + 4, 104, 100, 86, 72, 62, 50, 16, 3, 35, + }, + + { + /* Context Tables for I, SI Slices :: qp = 17 */ + + 114, 14, 27, 114, 14, 27, 67, 31, 34, 22, + 10, 12, 74, 82, 106, 28, 29, 2, 1, 20, + 0, 4, 9, 9, 39, 22, 66, 61, 87, 99, + 29, 37, 9, 1, 20, 0, 31, 25, 18, 18, + 3, 21, 41, 4, 21, 33, 53, 10, 17, 39, + 8, 19, 27, 49, 8, 9, 11, 21, 1, 8, + 44, 0, 0, 0, 19, 53, 67, 1, 18, 3, + 104, 
14, 11, 89, 49, 37, 5, 41, 69, 61, + 75, 23, 69, 55, 63, 81, 77, 83, 79, 26, + 3, 9, 53, 5, 41, 31, 63, 17, 53, 39, + 83, 24, 25, 9, 75, 43, 23, 19, 3, 10, + 0, 15, 18, 12, 11, 4, 0, 3, 4, 23, + 24, 8, 36, 38, 28, 34, 34, 28, 28, 22, + 12, 30, 26, 25, 61, 41, 47, 27, 35, 27, + 7, 23, 23, 19, 19, 15, 17, 9, 29, 59, + 41, 63, 13, 21, 0, 7, 4, 16, 2, 3, + 8, 14, 5, 15, 2, 41, 76, 78, 60, 64, + 78, 62, 32, 66, 56, 18, 36, 28, 8, 2, + 9, 26, 14, 18, 20, 12, 2, 20, 8, 3, + 4, 3, 11, 1, 51, 110, 114, 106, 114, 108, + 102, 102, 102, 98, 80, 78, 72, 52, 44, 6, + 76, 64, 28, 80, 60, 62, 52, 42, 36, 38, + 20, 0, 2, 3, 37, 33, 71, 7, 56, 38, + 26, 8, 10, 1, 11, 17, 35, 4, 84, 56, + 44, 34, 32, 8, 7, 13, 23, 31, 48, 28, + 16, 6, 6, 7, 17, 29, 55, 43, 2, 7, + 5, 11, 17, 37, 43, 65, 11, 72, 40, 26, + 10, 12, 9, 19, 25, 37, 124, 45, 27, 13, + 5, 11, 2, 12, 14, 11, 10, 20, 14, 27, + 15, 15, 18, 18, 24, 18, 38, 44, 36, 12, + 24, 32, 28, 17, 1, 19, 55, 43, 31, 33, + 25, 27, 23, 21, 13, 19, 17, 11, 9, 21, + 35, 55, 33, 67, 25, 9, 13, 0, 11, 3, + 10, 3, 5, 5, 4, 5, 11, 37, 84, 76, + 80, 74, 62, 78, 70, 64, 56, 54, 56, 46, + 10, 10, 3, 42, 40, 30, 8, 22, 16, 1, + 7, 13, 21, 27, 53, 29, 65, 120, 118, 110, + 108, 98, 90, 84, 72, 70, 62, 56, 42, 30, + 8, 15, 72, 72, 20, 76, 82, 60, 48, 52, + 44, 36, 42, 30, 22, 6, 2, 11, 45, 20, + 0, 25, 75, 53, 61, 19, 25, 21, 40, 3, + 3, 9, 16, 66, 40, 78, 2, 104, 98, 72, + 50, 28, 20, 33, 51, 77, 15, 84, 56, 38, + 18, 22, 4, 3, 9, 37, 55, 39, 21, 27, + 9, 2, 11, 15, 2, 8, 10, 6, 18, 22, + 4, 102, 98, 84, 70, 60, 48, 12, 5, 35, + }, + + { + /* Context Tables for I, SI Slices :: qp = 18 */ + + 112, 14, 27, 112, 14, 27, 63, 29, 34, 22, + 10, 10, 72, 80, 106, 28, 25, 2, 0, 22, + 0, 2, 9, 11, 41, 20, 62, 65, 89, 101, + 23, 35, 9, 0, 22, 0, 31, 23, 18, 16, + 3, 21, 39, 4, 21, 33, 53, 10, 17, 37, + 8, 19, 27, 49, 8, 9, 11, 21, 1, 8, + 44, 0, 0, 0, 17, 53, 67, 1, 16, 3, + 104, 14, 11, 87, 47, 37, 5, 39, 65, 57, + 73, 21, 65, 53, 59, 79, 75, 81, 77, 28, + 3, 7, 
51, 5, 41, 29, 59, 17, 53, 39, + 81, 24, 25, 9, 73, 43, 21, 19, 3, 10, + 0, 13, 18, 12, 13, 4, 0, 3, 4, 23, + 24, 8, 34, 38, 26, 34, 34, 28, 24, 22, + 12, 26, 24, 27, 59, 39, 45, 25, 33, 25, + 5, 21, 23, 17, 17, 15, 17, 9, 31, 59, + 39, 63, 13, 21, 2, 7, 2, 18, 2, 3, + 6, 16, 7, 17, 2, 43, 72, 76, 58, 62, + 76, 60, 30, 64, 56, 16, 34, 28, 8, 2, + 9, 24, 12, 16, 18, 10, 0, 18, 6, 7, + 2, 5, 13, 3, 51, 106, 112, 102, 110, 104, + 98, 98, 98, 92, 76, 74, 66, 46, 40, 0, + 72, 60, 24, 76, 56, 58, 48, 38, 32, 32, + 16, 3, 0, 7, 39, 35, 71, 7, 56, 38, + 26, 8, 10, 1, 11, 17, 33, 4, 84, 56, + 44, 34, 34, 8, 5, 13, 21, 29, 50, 30, + 16, 6, 8, 7, 17, 27, 53, 41, 4, 5, + 5, 11, 15, 35, 41, 63, 11, 72, 40, 26, + 10, 14, 7, 17, 23, 35, 124, 45, 27, 13, + 5, 11, 2, 12, 14, 11, 10, 20, 16, 27, + 15, 15, 16, 18, 22, 16, 36, 42, 36, 10, + 22, 32, 28, 19, 3, 21, 53, 41, 29, 33, + 25, 27, 21, 19, 13, 19, 17, 11, 11, 21, + 35, 55, 31, 67, 25, 9, 13, 2, 11, 3, + 12, 3, 5, 5, 6, 7, 11, 39, 82, 76, + 80, 72, 60, 76, 68, 62, 54, 52, 54, 44, + 8, 8, 5, 38, 38, 26, 4, 20, 14, 3, + 9, 13, 21, 27, 53, 31, 63, 116, 114, 106, + 104, 92, 86, 80, 66, 66, 58, 52, 38, 26, + 6, 17, 68, 68, 18, 72, 78, 56, 44, 48, + 40, 32, 38, 26, 18, 2, 1, 15, 47, 16, + 3, 29, 73, 51, 59, 17, 23, 19, 44, 1, + 1, 9, 18, 68, 42, 80, 4, 102, 94, 66, + 44, 22, 14, 39, 55, 79, 15, 84, 56, 38, + 18, 22, 4, 3, 9, 35, 53, 37, 19, 25, + 7, 4, 9, 13, 4, 10, 12, 6, 20, 22, + 4, 100, 94, 80, 66, 56, 44, 8, 9, 37, + }, + + { + /* Context Tables for I, SI Slices :: qp = 19 */ + + 110, 14, 27, 110, 14, 27, 59, 25, 36, 22, + 8, 6, 68, 78, 106, 28, 21, 2, 2, 24, + 0, 0, 9, 13, 43, 20, 58, 67, 91, 101, + 17, 33, 9, 2, 24, 0, 29, 21, 20, 16, + 3, 19, 37, 2, 23, 33, 53, 10, 17, 37, + 8, 19, 27, 49, 10, 9, 11, 21, 0, 8, + 44, 0, 0, 0, 17, 53, 67, 0, 14, 3, + 104, 14, 11, 85, 45, 37, 3, 37, 61, 53, + 69, 19, 61, 51, 55, 77, 73, 79, 75, 30, + 3, 5, 49, 5, 39, 29, 55, 17, 51, 37, + 79, 24, 25, 7, 69, 41, 19, 19, 3, 10, + 0, 11, 20, 14, 13, 
4, 0, 3, 4, 23, + 24, 6, 34, 38, 24, 34, 34, 28, 20, 22, + 10, 22, 22, 29, 57, 37, 43, 23, 31, 25, + 3, 21, 21, 15, 15, 15, 17, 9, 33, 57, + 37, 63, 13, 21, 4, 7, 0, 20, 2, 3, + 4, 18, 7, 19, 2, 45, 70, 74, 58, 60, + 74, 58, 30, 62, 54, 14, 32, 28, 8, 2, + 11, 22, 10, 14, 16, 8, 1, 16, 6, 11, + 0, 5, 15, 5, 51, 104, 108, 100, 106, 100, + 94, 94, 94, 88, 70, 70, 62, 42, 34, 3, + 68, 56, 22, 72, 52, 52, 42, 34, 26, 28, + 12, 9, 1, 11, 41, 37, 71, 7, 58, 38, + 26, 8, 10, 1, 11, 17, 33, 4, 84, 56, + 44, 34, 36, 10, 5, 11, 21, 27, 52, 30, + 18, 6, 10, 5, 15, 27, 51, 39, 6, 3, + 3, 11, 13, 33, 39, 61, 9, 72, 40, 26, + 10, 16, 7, 15, 21, 33, 124, 43, 25, 11, + 5, 11, 2, 12, 14, 11, 10, 20, 16, 27, + 15, 15, 16, 20, 20, 14, 34, 40, 36, 8, + 20, 32, 28, 19, 5, 21, 51, 39, 27, 31, + 23, 25, 21, 17, 13, 19, 17, 9, 11, 21, + 35, 53, 29, 67, 25, 9, 13, 4, 11, 3, + 14, 3, 5, 5, 8, 9, 11, 41, 80, 74, + 78, 70, 58, 74, 66, 60, 52, 50, 52, 42, + 6, 6, 7, 36, 36, 24, 0, 16, 12, 5, + 9, 15, 21, 27, 53, 31, 61, 112, 110, 102, + 100, 86, 82, 76, 62, 62, 54, 48, 32, 22, + 2, 19, 64, 66, 14, 68, 74, 52, 40, 44, + 36, 28, 34, 22, 12, 1, 5, 19, 49, 12, + 7, 31, 71, 49, 57, 15, 21, 17, 46, 0, + 0, 9, 20, 72, 46, 84, 6, 98, 92, 62, + 38, 16, 8, 45, 59, 81, 15, 84, 56, 38, + 18, 22, 4, 3, 9, 35, 51, 35, 17, 23, + 5, 4, 9, 11, 4, 10, 12, 6, 20, 22, + 4, 98, 92, 76, 62, 52, 40, 4, 13, 39, + }, + + { + /* Context Tables for I, SI Slices :: qp = 20 */ + + 106, 14, 27, 106, 14, 27, 57, 23, 36, 22, + 8, 4, 66, 74, 106, 28, 15, 0, 4, 24, + 0, 1, 9, 15, 45, 18, 54, 71, 93, 103, + 11, 31, 9, 4, 24, 0, 29, 19, 20, 14, + 3, 19, 37, 2, 23, 31, 53, 10, 17, 35, + 6, 21, 27, 49, 10, 7, 11, 19, 0, 8, + 44, 0, 0, 0, 15, 55, 67, 0, 12, 3, + 104, 14, 11, 83, 43, 37, 3, 33, 59, 49, + 67, 17, 59, 47, 51, 75, 73, 77, 75, 30, + 3, 3, 47, 5, 39, 27, 51, 17, 51, 37, + 77, 24, 23, 7, 67, 41, 19, 19, 1, 10, + 0, 11, 20, 14, 15, 2, 0, 3, 4, 23, + 24, 6, 32, 38, 24, 34, 34, 26, 16, 22, + 10, 18, 20, 31, 55, 35, 41, 21, 
31, 23, + 1, 19, 21, 13, 13, 15, 17, 7, 35, 57, + 35, 63, 13, 21, 4, 7, 1, 22, 2, 3, + 0, 20, 9, 21, 2, 47, 66, 72, 56, 58, + 72, 56, 28, 60, 54, 10, 30, 26, 8, 2, + 11, 20, 10, 12, 16, 6, 3, 14, 4, 15, + 1, 7, 17, 7, 51, 100, 106, 96, 102, 96, + 90, 88, 90, 82, 66, 66, 56, 36, 30, 9, + 64, 54, 18, 66, 48, 48, 38, 28, 22, 22, + 8, 13, 3, 13, 45, 39, 71, 7, 58, 40, + 26, 8, 10, 1, 11, 17, 31, 6, 86, 56, + 44, 34, 38, 10, 3, 11, 19, 27, 54, 32, + 18, 6, 10, 5, 15, 25, 49, 39, 8, 3, + 3, 11, 11, 31, 39, 59, 9, 74, 40, 26, + 10, 18, 5, 15, 19, 31, 124, 43, 25, 11, + 5, 11, 2, 12, 14, 9, 10, 20, 18, 27, + 15, 15, 14, 20, 18, 12, 32, 38, 36, 4, + 18, 30, 28, 21, 7, 23, 51, 39, 25, 31, + 23, 25, 19, 15, 15, 19, 17, 9, 13, 21, + 35, 53, 27, 67, 25, 7, 13, 4, 11, 3, + 16, 3, 7, 5, 10, 9, 13, 43, 78, 74, + 78, 68, 56, 72, 64, 58, 50, 48, 50, 40, + 4, 4, 9, 32, 32, 20, 3, 14, 10, 7, + 11, 15, 21, 27, 53, 33, 61, 106, 104, 98, + 94, 80, 78, 72, 56, 58, 50, 44, 28, 18, + 0, 21, 60, 62, 12, 64, 70, 48, 36, 38, + 32, 24, 30, 18, 8, 5, 9, 23, 51, 8, + 11, 35, 69, 47, 55, 15, 19, 15, 50, 2, + 2, 9, 22, 74, 48, 86, 6, 96, 88, 56, + 32, 10, 2, 51, 63, 83, 15, 84, 56, 38, + 18, 22, 4, 3, 9, 33, 49, 33, 15, 23, + 3, 6, 7, 9, 6, 12, 14, 8, 22, 24, + 4, 94, 88, 72, 58, 48, 36, 0, 17, 41, + }, + + { + /* Context Tables for I, SI Slices :: qp = 21 */ + + 104, 14, 27, 104, 14, 27, 53, 19, 36, 22, + 6, 0, 62, 72, 106, 28, 11, 0, 6, 26, + 0, 1, 7, 17, 47, 18, 50, 75, 95, 103, + 5, 27, 9, 6, 26, 0, 29, 17, 20, 14, + 3, 19, 35, 2, 25, 31, 53, 10, 17, 33, + 6, 21, 27, 49, 12, 7, 9, 19, 2, 8, + 44, 0, 0, 0, 15, 55, 67, 2, 10, 3, + 104, 14, 11, 81, 41, 37, 1, 31, 55, 45, + 63, 15, 55, 45, 47, 73, 71, 75, 73, 32, + 3, 1, 45, 5, 37, 25, 47, 17, 51, 35, + 73, 26, 23, 5, 65, 39, 17, 19, 1, 12, + 2, 9, 22, 16, 15, 2, 0, 3, 4, 23, + 24, 4, 30, 38, 22, 34, 34, 26, 12, 22, + 8, 16, 18, 33, 53, 33, 39, 19, 29, 21, + 0, 17, 19, 11, 11, 15, 17, 7, 37, 55, + 33, 63, 11, 21, 6, 7, 3, 24, 2, 3, + 1, 22, 9, 23, 
2, 47, 64, 70, 56, 56, + 70, 54, 26, 60, 54, 8, 28, 26, 8, 2, + 13, 20, 8, 10, 14, 4, 5, 14, 2, 19, + 3, 9, 19, 9, 51, 96, 104, 92, 98, 94, + 86, 84, 86, 78, 62, 62, 52, 32, 26, 13, + 60, 50, 14, 62, 44, 44, 34, 24, 18, 16, + 4, 19, 5, 17, 47, 39, 71, 7, 58, 40, + 26, 8, 10, 1, 11, 17, 31, 6, 86, 56, + 44, 34, 40, 12, 1, 11, 17, 25, 56, 34, + 18, 8, 12, 3, 13, 25, 47, 37, 12, 1, + 3, 9, 9, 29, 37, 57, 7, 74, 42, 26, + 10, 20, 3, 13, 17, 29, 124, 43, 25, 9, + 5, 11, 2, 12, 14, 9, 10, 22, 18, 27, + 15, 15, 12, 22, 16, 10, 30, 38, 36, 2, + 16, 30, 28, 23, 9, 23, 49, 37, 23, 29, + 21, 23, 19, 13, 15, 19, 17, 9, 15, 21, + 35, 51, 25, 67, 25, 7, 13, 6, 11, 3, + 18, 3, 7, 5, 12, 11, 13, 43, 76, 74, + 76, 68, 54, 70, 62, 56, 50, 46, 48, 38, + 2, 2, 11, 30, 30, 18, 7, 12, 8, 9, + 13, 17, 21, 25, 53, 33, 59, 102, 100, 94, + 90, 76, 74, 68, 52, 54, 46, 40, 24, 14, + 3, 23, 56, 58, 10, 60, 66, 44, 32, 34, + 28, 20, 26, 14, 2, 9, 13, 27, 53, 4, + 15, 37, 67, 43, 53, 13, 17, 13, 54, 4, + 4, 9, 24, 78, 52, 90, 8, 92, 84, 52, + 26, 6, 3, 55, 67, 85, 15, 84, 56, 38, + 18, 22, 4, 3, 9, 33, 47, 31, 13, 21, + 1, 8, 7, 7, 8, 14, 16, 8, 22, 24, + 4, 92, 86, 70, 56, 44, 32, 3, 21, 43, + }, + + { + /* Context Tables for I, SI Slices :: qp = 22 */ + + 102, 14, 29, 102, 14, 29, 49, 17, 38, 22, + 6, 1, 60, 70, 106, 28, 7, 0, 6, 28, + 0, 3, 7, 19, 49, 16, 48, 77, 97, 105, + 0, 25, 9, 6, 28, 0, 27, 15, 22, 12, + 1, 17, 33, 0, 25, 31, 53, 10, 15, 33, + 6, 21, 25, 49, 12, 7, 9, 19, 2, 8, + 44, 0, 0, 0, 13, 55, 67, 2, 10, 5, + 104, 14, 11, 79, 41, 37, 1, 29, 51, 41, + 61, 13, 51, 43, 43, 71, 69, 73, 71, 34, + 3, 0, 45, 3, 37, 25, 43, 17, 49, 35, + 71, 26, 23, 5, 61, 39, 15, 17, 1, 12, + 2, 7, 22, 16, 17, 2, 0, 3, 4, 23, + 24, 4, 30, 38, 20, 34, 34, 26, 8, 22, + 8, 12, 16, 35, 53, 31, 39, 19, 27, 21, + 2, 17, 19, 11, 7, 15, 17, 7, 39, 55, + 31, 63, 11, 21, 8, 5, 5, 24, 2, 3, + 3, 24, 11, 25, 2, 49, 60, 68, 54, 56, + 68, 54, 26, 58, 52, 6, 26, 26, 8, 2, + 13, 18, 6, 8, 12, 4, 7, 12, 2, 23, + 5, 9, 
21, 11, 51, 94, 100, 90, 94, 90, + 82, 80, 82, 72, 56, 58, 46, 26, 20, 19, + 56, 46, 12, 58, 38, 38, 28, 20, 12, 12, + 1, 23, 7, 21, 49, 41, 71, 5, 60, 40, + 28, 8, 10, 1, 11, 17, 29, 6, 86, 58, + 44, 34, 42, 12, 1, 9, 17, 23, 58, 34, + 20, 8, 14, 3, 13, 23, 45, 35, 14, 0, + 1, 9, 9, 29, 35, 53, 7, 74, 42, 26, + 12, 20, 3, 11, 17, 27, 124, 41, 23, 9, + 5, 11, 2, 12, 14, 9, 12, 22, 20, 27, + 15, 17, 12, 22, 14, 8, 28, 36, 36, 0, + 14, 30, 28, 23, 11, 25, 47, 35, 21, 29, + 21, 23, 17, 13, 15, 17, 15, 7, 15, 21, + 35, 51, 23, 67, 25, 7, 13, 8, 11, 3, + 20, 5, 7, 5, 12, 13, 13, 45, 74, 72, + 76, 66, 52, 70, 62, 54, 48, 46, 46, 36, + 0, 0, 13, 26, 28, 14, 11, 8, 4, 11, + 13, 17, 21, 25, 51, 35, 57, 98, 96, 90, + 86, 70, 70, 64, 46, 50, 40, 36, 18, 10, + 5, 23, 54, 56, 6, 56, 60, 40, 28, 30, + 24, 16, 20, 10, 1, 13, 17, 29, 57, 0, + 19, 41, 65, 41, 51, 11, 15, 11, 56, 6, + 6, 9, 26, 80, 54, 92, 10, 90, 82, 46, + 20, 0, 7, 61, 71, 89, 13, 84, 56, 38, + 18, 22, 4, 3, 9, 31, 45, 31, 11, 19, + 0, 8, 5, 5, 8, 14, 16, 8, 24, 24, + 4, 90, 82, 66, 52, 42, 30, 9, 23, 45, + }, + + { + /* Context Tables for I, SI Slices :: qp = 23 */ + + 100, 14, 29, 100, 14, 29, 45, 13, 38, 22, + 4, 5, 56, 66, 106, 28, 1, 1, 8, 28, + 0, 5, 7, 21, 51, 16, 44, 81, 99, 105, + 6, 23, 9, 8, 28, 0, 27, 13, 22, 12, + 1, 17, 33, 0, 27, 29, 53, 10, 15, 31, + 4, 21, 25, 49, 14, 5, 9, 17, 4, 8, + 44, 0, 0, 0, 13, 55, 67, 4, 8, 5, + 104, 14, 11, 77, 39, 37, 0, 25, 49, 37, + 57, 11, 49, 39, 39, 69, 67, 71, 71, 36, + 3, 2, 43, 3, 35, 23, 39, 17, 49, 33, + 69, 26, 21, 3, 59, 37, 15, 17, 0, 12, + 2, 5, 24, 18, 17, 0, 0, 3, 4, 23, + 24, 2, 28, 38, 20, 34, 34, 24, 4, 22, + 6, 8, 14, 37, 51, 29, 37, 17, 25, 19, + 4, 15, 17, 9, 5, 15, 17, 5, 41, 53, + 29, 63, 11, 21, 8, 5, 7, 26, 2, 3, + 7, 26, 11, 27, 2, 51, 58, 66, 54, 54, + 66, 52, 24, 56, 52, 4, 24, 26, 8, 2, + 15, 16, 6, 6, 12, 2, 9, 10, 0, 27, + 7, 11, 23, 13, 51, 90, 98, 86, 90, 86, + 78, 74, 78, 68, 52, 54, 42, 22, 16, 23, + 52, 44, 8, 52, 34, 34, 24, 14, 8, 
6, + 5, 29, 9, 23, 53, 43, 71, 5, 60, 42, + 28, 8, 10, 1, 11, 17, 29, 8, 88, 58, + 44, 34, 44, 14, 0, 9, 15, 21, 60, 36, + 20, 8, 16, 1, 11, 23, 43, 35, 16, 0, + 1, 9, 7, 27, 35, 51, 5, 76, 42, 26, + 12, 22, 1, 9, 15, 25, 124, 41, 23, 7, + 5, 11, 2, 12, 14, 7, 12, 22, 20, 27, + 15, 17, 10, 24, 12, 6, 26, 34, 36, 1, + 12, 28, 28, 25, 13, 25, 45, 35, 19, 27, + 19, 21, 17, 11, 15, 17, 15, 7, 17, 21, + 35, 49, 21, 67, 25, 5, 13, 8, 11, 3, + 22, 5, 7, 5, 14, 13, 15, 47, 72, 72, + 74, 64, 50, 68, 60, 52, 46, 44, 44, 34, + 1, 1, 15, 24, 24, 12, 15, 6, 2, 13, + 15, 19, 21, 25, 51, 35, 55, 94, 92, 86, + 80, 64, 66, 60, 42, 46, 36, 32, 14, 6, + 9, 25, 50, 52, 4, 52, 56, 36, 24, 26, + 20, 12, 16, 6, 7, 17, 21, 33, 59, 3, + 23, 43, 63, 39, 49, 11, 13, 9, 60, 8, + 8, 9, 28, 84, 58, 96, 10, 86, 78, 42, + 14, 5, 13, 67, 75, 91, 13, 84, 56, 38, + 18, 22, 4, 3, 9, 31, 43, 29, 9, 17, + 2, 10, 5, 3, 10, 16, 18, 10, 24, 26, + 4, 86, 80, 62, 48, 38, 26, 13, 27, 47, + }, + + { + /* Context Tables for I, SI Slices :: qp = 24 */ + + 96, 12, 29, 96, 12, 29, 43, 11, 38, 22, + 4, 7, 54, 64, 106, 28, 2, 1, 10, 30, + 0, 7, 7, 23, 55, 14, 40, 85, 101, 107, + 10, 21, 9, 10, 30, 0, 27, 11, 22, 10, + 1, 17, 31, 1, 27, 29, 55, 10, 15, 31, + 4, 23, 25, 49, 14, 5, 9, 17, 4, 8, + 44, 0, 0, 0, 11, 57, 67, 4, 6, 5, + 104, 14, 11, 75, 37, 37, 0, 23, 45, 33, + 55, 9, 45, 37, 37, 67, 67, 69, 69, 36, + 3, 4, 41, 3, 35, 23, 37, 17, 49, 33, + 67, 26, 21, 3, 57, 37, 13, 17, 0, 12, + 2, 5, 24, 18, 19, 0, 0, 3, 4, 23, + 24, 2, 26, 38, 18, 34, 34, 24, 0, 22, + 6, 4, 12, 39, 49, 27, 35, 15, 25, 19, + 6, 15, 17, 7, 3, 15, 17, 5, 45, 53, + 27, 63, 11, 23, 10, 5, 9, 28, 2, 3, + 9, 28, 13, 29, 2, 53, 54, 64, 52, 52, + 64, 50, 22, 54, 50, 0, 22, 24, 8, 2, + 15, 14, 4, 4, 10, 0, 11, 8, 1, 31, + 11, 13, 25, 15, 51, 86, 94, 82, 86, 82, + 74, 70, 74, 62, 46, 50, 36, 16, 10, 29, + 48, 40, 4, 48, 30, 28, 18, 10, 2, 0, + 9, 33, 11, 27, 55, 45, 73, 5, 60, 42, + 28, 8, 10, 1, 11, 17, 27, 8, 88, 58, + 44, 34, 46, 14, 0, 9, 15, 
21, 62, 36, + 20, 8, 16, 1, 11, 21, 41, 33, 18, 2, + 1, 9, 5, 25, 33, 49, 5, 76, 42, 26, + 12, 24, 1, 9, 13, 25, 124, 41, 23, 7, + 5, 11, 2, 12, 14, 7, 12, 22, 22, 27, + 15, 17, 8, 24, 10, 4, 24, 32, 36, 5, + 8, 28, 28, 27, 15, 27, 45, 33, 17, 27, + 19, 21, 15, 9, 17, 17, 15, 7, 19, 21, + 35, 49, 21, 69, 25, 5, 13, 10, 11, 3, + 22, 5, 9, 5, 16, 15, 15, 49, 70, 70, + 74, 62, 46, 66, 58, 50, 44, 42, 42, 32, + 3, 3, 17, 20, 22, 8, 19, 2, 0, 15, + 17, 19, 21, 25, 51, 37, 55, 88, 86, 82, + 76, 58, 60, 54, 36, 42, 32, 28, 8, 2, + 11, 27, 46, 48, 0, 48, 52, 30, 18, 20, + 14, 8, 12, 0, 11, 21, 27, 37, 61, 7, + 27, 47, 61, 37, 47, 9, 11, 9, 62, 10, + 8, 9, 28, 86, 60, 98, 12, 84, 74, 36, + 8, 11, 19, 73, 79, 93, 13, 84, 56, 38, + 18, 22, 4, 3, 9, 29, 43, 27, 7, 17, + 4, 10, 3, 3, 10, 16, 18, 10, 26, 26, + 2, 84, 76, 58, 44, 34, 22, 17, 31, 49, + }, + + { + /* Context Tables for I, SI Slices :: qp = 25 */ + + 94, 12, 29, 94, 12, 29, 39, 9, 40, 22, + 4, 9, 52, 62, 106, 28, 6, 1, 12, 32, + 0, 7, 5, 25, 57, 12, 36, 87, 103, 109, + 16, 17, 9, 12, 32, 0, 25, 9, 24, 8, + 1, 15, 29, 1, 27, 29, 55, 10, 15, 29, + 4, 23, 25, 49, 14, 5, 7, 17, 4, 8, + 44, 0, 0, 0, 9, 57, 67, 4, 4, 5, + 104, 14, 11, 73, 35, 37, 0, 21, 41, 29, + 51, 5, 41, 35, 33, 65, 65, 67, 67, 38, + 1, 6, 39, 3, 33, 21, 33, 17, 47, 33, + 63, 28, 21, 3, 53, 37, 11, 17, 0, 14, + 4, 3, 24, 18, 19, 0, 2, 3, 4, 21, + 24, 2, 26, 38, 16, 34, 34, 24, 3, 22, + 6, 2, 10, 39, 47, 25, 33, 13, 23, 17, + 8, 13, 17, 5, 1, 15, 17, 5, 47, 51, + 25, 63, 9, 23, 12, 5, 11, 30, 2, 3, + 11, 30, 15, 31, 2, 53, 52, 62, 52, 50, + 62, 48, 22, 54, 50, 1, 20, 24, 8, 2, + 15, 14, 2, 2, 8, 1, 13, 8, 1, 33, + 13, 13, 27, 17, 51, 84, 92, 80, 84, 80, + 70, 66, 70, 58, 42, 46, 32, 10, 6, 35, + 46, 36, 2, 44, 26, 24, 14, 6, 1, 3, + 13, 37, 11, 31, 57, 45, 73, 5, 62, 42, + 28, 10, 10, 1, 9, 17, 25, 8, 88, 58, + 46, 34, 48, 16, 2, 7, 13, 19, 64, 38, + 22, 10, 18, 0, 11, 19, 39, 31, 22, 4, + 0, 7, 3, 23, 31, 47, 3, 76, 44, 28, + 12, 26, 0, 7, 11, 23, 124, 
39, 21, 5, + 5, 11, 2, 14, 16, 7, 12, 24, 24, 27, + 15, 17, 8, 24, 8, 4, 22, 32, 36, 7, + 6, 28, 28, 27, 17, 29, 43, 31, 15, 27, + 17, 21, 13, 7, 17, 17, 15, 5, 19, 21, + 35, 49, 19, 69, 25, 5, 13, 12, 11, 3, + 24, 5, 9, 5, 18, 17, 15, 49, 68, 70, + 74, 62, 44, 64, 56, 48, 44, 40, 40, 32, + 3, 5, 19, 18, 20, 4, 23, 0, 1, 15, + 17, 19, 19, 23, 51, 39, 53, 84, 82, 78, + 72, 54, 56, 50, 32, 38, 28, 24, 4, 1, + 13, 29, 42, 46, 1, 44, 48, 26, 14, 16, + 10, 4, 8, 3, 15, 23, 31, 41, 63, 11, + 31, 51, 59, 33, 43, 7, 9, 7, 66, 12, + 10, 9, 30, 90, 62, 102, 14, 82, 72, 32, + 2, 15, 25, 77, 83, 95, 13, 84, 56, 38, + 18, 24, 4, 3, 9, 27, 41, 25, 5, 15, + 8, 12, 1, 1, 12, 18, 20, 10, 28, 26, + 2, 82, 74, 56, 42, 30, 18, 21, 35, 49, + }, + + { + /* Context Tables for I, SI Slices :: qp = 26 */ + + 92, 12, 29, 92, 12, 29, 35, 5, 40, 22, + 2, 13, 48, 58, 106, 28, 12, 3, 14, 32, + 0, 9, 5, 27, 59, 12, 32, 91, 105, 109, + 22, 15, 9, 14, 32, 0, 25, 7, 24, 8, + 1, 15, 29, 1, 29, 27, 55, 10, 15, 27, + 2, 23, 25, 49, 16, 3, 7, 15, 6, 8, + 44, 0, 0, 0, 9, 57, 67, 6, 2, 5, + 104, 14, 11, 71, 33, 37, 2, 17, 39, 25, + 49, 3, 39, 31, 29, 63, 63, 65, 67, 40, + 1, 8, 37, 3, 33, 19, 29, 17, 47, 31, + 61, 28, 19, 1, 51, 35, 11, 17, 2, 14, + 4, 1, 26, 20, 21, 1, 2, 3, 4, 21, + 24, 0, 24, 38, 16, 34, 34, 22, 7, 22, + 4, 1, 8, 41, 45, 23, 31, 11, 21, 15, + 10, 11, 15, 3, 0, 15, 17, 3, 49, 51, + 23, 63, 9, 23, 12, 5, 13, 32, 2, 3, + 15, 32, 15, 33, 2, 55, 48, 60, 50, 48, + 60, 46, 20, 52, 50, 3, 18, 24, 8, 2, + 17, 12, 2, 0, 8, 3, 15, 6, 3, 37, + 15, 15, 29, 19, 51, 80, 90, 76, 80, 76, + 66, 60, 66, 52, 38, 42, 26, 6, 2, 39, + 42, 34, 1, 38, 22, 20, 10, 0, 5, 9, + 17, 43, 13, 33, 61, 47, 73, 5, 62, 44, + 28, 10, 10, 1, 9, 17, 25, 10, 90, 58, + 46, 34, 50, 16, 4, 7, 11, 17, 66, 40, + 22, 10, 20, 0, 9, 19, 37, 31, 24, 4, + 0, 7, 1, 21, 31, 45, 3, 78, 44, 28, + 12, 28, 2, 5, 9, 21, 124, 39, 21, 5, + 5, 11, 2, 14, 16, 5, 12, 24, 24, 27, + 15, 17, 6, 26, 6, 2, 20, 30, 36, 9, + 4, 26, 28, 29, 19, 29, 41, 
31, 13, 25, + 17, 19, 13, 5, 17, 17, 15, 5, 21, 21, + 35, 47, 17, 69, 25, 3, 13, 12, 11, 3, + 26, 5, 9, 5, 20, 17, 17, 51, 66, 70, + 72, 60, 42, 62, 54, 46, 42, 38, 38, 30, + 5, 7, 21, 14, 16, 2, 27, 1, 3, 17, + 19, 21, 19, 23, 51, 39, 51, 80, 78, 74, + 66, 48, 52, 46, 26, 34, 24, 20, 0, 5, + 17, 31, 38, 42, 3, 40, 44, 22, 10, 12, + 6, 0, 4, 7, 21, 27, 35, 45, 65, 15, + 35, 53, 57, 31, 41, 7, 7, 5, 70, 14, + 12, 9, 32, 92, 66, 104, 14, 78, 68, 26, + 3, 21, 31, 83, 87, 97, 13, 84, 56, 38, + 18, 24, 4, 3, 9, 27, 39, 23, 3, 13, + 10, 14, 1, 0, 14, 20, 22, 12, 28, 28, + 2, 78, 70, 52, 38, 26, 14, 25, 39, 51, + }, + + { + /* Context Tables for I, SI Slices :: qp = 27 */ + + 90, 12, 31, 90, 12, 31, 31, 3, 42, 22, + 2, 15, 46, 56, 106, 28, 16, 3, 14, 34, + 0, 11, 5, 29, 61, 10, 30, 93, 107, 111, + 28, 13, 9, 14, 34, 0, 23, 5, 26, 6, + 0, 13, 27, 3, 29, 27, 55, 10, 13, 27, + 2, 23, 23, 49, 16, 3, 7, 15, 6, 8, + 44, 0, 0, 0, 7, 57, 67, 6, 2, 7, + 104, 14, 11, 69, 33, 37, 2, 15, 35, 21, + 45, 1, 35, 29, 25, 61, 61, 63, 65, 42, + 1, 10, 37, 1, 31, 19, 25, 17, 45, 31, + 59, 28, 19, 1, 47, 35, 9, 15, 2, 14, + 4, 0, 26, 20, 21, 1, 2, 3, 4, 21, + 24, 0, 24, 38, 14, 34, 34, 22, 11, 22, + 4, 5, 6, 43, 45, 21, 31, 11, 19, 15, + 12, 11, 15, 3, 4, 15, 17, 3, 51, 49, + 21, 63, 9, 23, 14, 3, 15, 32, 2, 3, + 17, 34, 17, 35, 2, 57, 46, 58, 50, 48, + 58, 46, 20, 50, 48, 5, 16, 24, 8, 2, + 17, 10, 0, 1, 6, 3, 17, 4, 3, 41, + 17, 15, 31, 21, 51, 78, 86, 74, 76, 72, + 62, 56, 62, 48, 32, 38, 22, 0, 3, 45, + 38, 30, 3, 34, 16, 14, 4, 3, 11, 13, + 23, 47, 15, 37, 63, 49, 73, 3, 64, 44, + 30, 10, 10, 1, 9, 17, 23, 10, 90, 60, + 46, 34, 52, 18, 4, 5, 11, 15, 68, 40, + 24, 10, 22, 2, 9, 17, 35, 29, 26, 6, + 2, 7, 1, 21, 29, 41, 1, 78, 44, 28, + 14, 28, 2, 3, 9, 19, 124, 37, 19, 3, + 5, 11, 2, 14, 16, 5, 14, 24, 26, 27, + 15, 19, 6, 26, 4, 0, 18, 28, 36, 11, + 2, 26, 28, 29, 21, 31, 39, 29, 11, 25, + 15, 19, 11, 5, 17, 15, 13, 3, 21, 21, + 35, 47, 15, 69, 25, 3, 13, 14, 11, 3, + 28, 7, 9, 5, 20, 
19, 17, 53, 64, 68, + 72, 58, 40, 62, 54, 44, 40, 38, 36, 28, + 7, 9, 23, 12, 14, 1, 31, 5, 7, 19, + 19, 21, 19, 23, 49, 41, 49, 76, 74, 70, + 62, 42, 48, 42, 22, 30, 18, 16, 5, 9, + 19, 31, 36, 40, 7, 36, 38, 18, 6, 8, + 2, 3, 1, 11, 25, 31, 39, 47, 69, 19, + 39, 57, 55, 29, 39, 5, 5, 3, 72, 16, + 14, 9, 34, 96, 68, 108, 16, 76, 66, 22, + 9, 27, 35, 89, 91, 101, 11, 84, 56, 38, + 18, 24, 4, 3, 9, 25, 37, 23, 1, 11, + 12, 14, 0, 2, 14, 20, 22, 12, 30, 28, + 2, 76, 68, 48, 34, 24, 12, 31, 41, 53, + }, + + { + /* Context Tables for I, SI Slices :: qp = 28 */ + + 86, 12, 31, 86, 12, 31, 29, 0, 42, 22, + 0, 19, 42, 54, 106, 28, 20, 3, 16, 36, + 0, 13, 5, 31, 63, 10, 26, 97, 109, 111, + 34, 11, 9, 16, 36, 0, 23, 3, 26, 6, + 0, 13, 25, 3, 31, 27, 55, 10, 13, 25, + 2, 25, 23, 49, 18, 3, 7, 15, 8, 8, + 44, 0, 0, 0, 7, 59, 67, 8, 0, 7, + 104, 14, 11, 67, 31, 37, 4, 13, 31, 17, + 43, 0, 31, 27, 21, 59, 61, 61, 63, 42, + 1, 12, 35, 1, 31, 17, 21, 17, 45, 29, + 57, 28, 19, 0, 45, 33, 7, 15, 2, 14, + 4, 0, 28, 22, 23, 1, 2, 3, 4, 21, + 24, 1, 22, 38, 12, 34, 34, 22, 15, 22, + 2, 9, 4, 45, 43, 19, 29, 9, 19, 13, + 14, 9, 13, 1, 6, 15, 17, 3, 53, 49, + 19, 63, 9, 23, 16, 3, 17, 34, 2, 3, + 19, 36, 17, 37, 2, 59, 42, 56, 48, 46, + 56, 44, 18, 48, 48, 9, 14, 22, 8, 2, + 19, 8, 1, 3, 4, 5, 19, 2, 5, 45, + 19, 17, 33, 23, 51, 74, 84, 70, 72, 68, + 58, 52, 58, 42, 28, 34, 16, 3, 7, 49, + 34, 26, 7, 30, 12, 10, 0, 7, 15, 19, + 27, 53, 17, 41, 65, 51, 73, 3, 64, 44, + 30, 10, 10, 1, 9, 17, 23, 10, 90, 60, + 46, 34, 54, 18, 6, 5, 9, 15, 70, 42, + 24, 10, 22, 2, 7, 17, 33, 27, 28, 8, + 2, 7, 0, 19, 27, 39, 1, 78, 44, 28, + 14, 30, 4, 3, 7, 17, 124, 37, 19, 3, + 5, 11, 2, 14, 16, 5, 14, 24, 26, 27, + 15, 19, 4, 28, 2, 1, 16, 26, 36, 15, + 0, 26, 28, 31, 23, 31, 39, 27, 9, 23, + 15, 17, 11, 3, 19, 15, 13, 3, 23, 21, + 35, 45, 13, 69, 25, 3, 13, 16, 11, 3, + 30, 7, 11, 5, 22, 21, 17, 55, 62, 68, + 70, 56, 38, 60, 52, 42, 38, 36, 34, 26, + 9, 11, 25, 8, 12, 3, 35, 7, 9, 21, + 21, 23, 19, 23, 
49, 41, 49, 70, 68, 66, + 58, 36, 44, 38, 16, 26, 14, 12, 9, 13, + 23, 33, 32, 36, 9, 32, 34, 14, 2, 2, + 1, 7, 5, 15, 31, 35, 43, 51, 71, 23, + 43, 59, 53, 27, 37, 3, 3, 1, 76, 18, + 16, 9, 36, 98, 72, 110, 18, 72, 62, 16, + 15, 33, 41, 95, 95, 103, 11, 84, 56, 38, + 18, 24, 4, 3, 9, 25, 35, 21, 0, 11, + 14, 16, 0, 4, 16, 22, 24, 12, 30, 28, + 2, 74, 64, 44, 30, 20, 8, 35, 45, 55, + }, + + { + /* Context Tables for I, SI Slices :: qp = 29 */ + + 84, 12, 31, 84, 12, 31, 25, 2, 42, 22, + 0, 21, 40, 50, 106, 28, 26, 5, 18, 36, + 0, 13, 3, 33, 65, 8, 22, 101, 111, 113, + 40, 7, 9, 18, 36, 0, 23, 1, 26, 4, + 0, 13, 25, 3, 31, 25, 55, 10, 13, 23, + 0, 25, 23, 49, 18, 1, 5, 13, 8, 8, + 44, 0, 0, 0, 5, 59, 67, 8, 1, 7, + 104, 14, 11, 65, 29, 37, 4, 9, 29, 13, + 39, 2, 29, 23, 17, 57, 59, 59, 63, 44, + 1, 14, 33, 1, 29, 15, 17, 17, 45, 29, + 53, 30, 17, 0, 43, 33, 7, 15, 4, 16, + 6, 2, 28, 22, 23, 3, 2, 3, 4, 21, + 24, 1, 20, 38, 12, 34, 34, 20, 19, 22, + 2, 11, 2, 47, 41, 17, 27, 7, 17, 11, + 16, 7, 13, 0, 8, 15, 17, 1, 55, 47, + 17, 63, 7, 23, 16, 3, 19, 36, 2, 3, + 23, 38, 19, 39, 2, 59, 40, 54, 48, 44, + 54, 42, 16, 48, 48, 11, 12, 22, 8, 2, + 19, 8, 1, 5, 4, 7, 21, 2, 7, 49, + 21, 19, 35, 25, 51, 70, 82, 66, 68, 66, + 54, 46, 54, 38, 24, 30, 12, 9, 11, 55, + 30, 24, 11, 24, 8, 6, 3, 13, 19, 25, + 31, 57, 19, 43, 69, 51, 73, 3, 64, 46, + 30, 10, 10, 1, 9, 17, 21, 12, 92, 60, + 46, 34, 56, 20, 8, 5, 7, 13, 72, 44, + 24, 12, 24, 4, 7, 15, 31, 27, 32, 8, + 2, 5, 2, 17, 27, 37, 0, 80, 46, 28, + 14, 32, 6, 1, 5, 15, 124, 37, 19, 1, + 5, 11, 2, 14, 16, 3, 14, 26, 28, 27, + 15, 19, 2, 28, 0, 3, 14, 26, 36, 17, + 1, 24, 28, 33, 25, 33, 37, 27, 7, 23, + 13, 17, 9, 1, 19, 15, 13, 3, 25, 21, + 35, 45, 11, 69, 25, 1, 13, 16, 11, 3, + 32, 7, 11, 5, 24, 21, 19, 55, 60, 68, + 70, 56, 36, 58, 50, 40, 38, 34, 32, 24, + 11, 13, 27, 6, 8, 7, 39, 9, 11, 23, + 23, 23, 19, 21, 49, 43, 47, 66, 64, 62, + 52, 32, 40, 34, 12, 22, 10, 8, 13, 17, + 25, 35, 28, 32, 11, 28, 30, 10, 1, 1, + 5, 11, 
9, 19, 35, 39, 47, 55, 73, 27, + 47, 63, 51, 23, 35, 3, 1, 0, 80, 20, + 18, 9, 38, 102, 74, 114, 18, 70, 58, 12, + 21, 37, 47, 99, 99, 105, 11, 84, 56, 38, + 18, 24, 4, 3, 9, 23, 33, 19, 2, 9, + 16, 18, 2, 6, 18, 24, 26, 14, 32, 30, + 2, 70, 62, 42, 28, 16, 4, 39, 49, 57, + }, + + { + /* Context Tables for I, SI Slices :: qp = 30 */ + + 82, 12, 31, 82, 12, 31, 21, 6, 44, 22, + 1, 25, 36, 48, 106, 28, 30, 5, 20, 38, + 0, 15, 3, 35, 67, 8, 18, 103, 113, 113, + 46, 5, 9, 20, 38, 0, 21, 0, 28, 4, + 0, 11, 23, 5, 33, 25, 55, 10, 13, 23, + 0, 25, 23, 49, 20, 1, 5, 13, 10, 8, + 44, 0, 0, 0, 5, 59, 67, 10, 3, 7, + 104, 14, 11, 63, 27, 37, 6, 7, 25, 9, + 37, 4, 25, 21, 13, 55, 57, 57, 61, 46, + 1, 16, 31, 1, 29, 15, 13, 17, 43, 27, + 51, 30, 17, 2, 39, 31, 5, 15, 4, 16, + 6, 4, 30, 24, 25, 3, 2, 3, 4, 21, + 24, 3, 20, 38, 10, 34, 34, 20, 23, 22, + 0, 15, 0, 49, 39, 15, 25, 5, 15, 11, + 18, 7, 11, 2, 10, 15, 17, 1, 57, 47, + 15, 63, 7, 23, 18, 3, 21, 38, 2, 3, + 25, 40, 19, 41, 2, 61, 36, 52, 46, 42, + 52, 40, 16, 46, 46, 13, 10, 22, 8, 2, + 21, 6, 3, 7, 2, 9, 23, 0, 7, 53, + 23, 19, 37, 27, 51, 68, 78, 64, 64, 62, + 50, 42, 50, 32, 18, 26, 6, 13, 17, 59, + 26, 20, 13, 20, 4, 0, 9, 17, 25, 29, + 35, 63, 21, 47, 71, 53, 73, 3, 66, 46, + 30, 10, 10, 1, 9, 17, 21, 12, 92, 60, + 46, 34, 58, 20, 8, 3, 7, 11, 74, 44, + 26, 12, 26, 4, 5, 15, 29, 25, 34, 10, + 4, 5, 4, 15, 25, 35, 0, 80, 46, 28, + 14, 34, 6, 0, 3, 13, 124, 35, 17, 1, + 5, 11, 2, 14, 16, 3, 14, 26, 28, 27, + 15, 19, 2, 30, 1, 5, 12, 24, 36, 19, + 3, 24, 28, 33, 27, 33, 35, 25, 5, 21, + 13, 15, 9, 0, 19, 15, 13, 1, 25, 21, + 35, 43, 9, 69, 25, 1, 13, 18, 11, 3, + 34, 7, 11, 5, 26, 23, 19, 57, 58, 66, + 68, 54, 34, 56, 48, 38, 36, 32, 30, 22, + 13, 15, 29, 2, 6, 9, 43, 13, 13, 25, + 23, 25, 19, 21, 49, 43, 45, 62, 60, 58, + 48, 26, 36, 30, 6, 18, 6, 4, 19, 21, + 29, 37, 24, 30, 15, 24, 26, 6, 5, 5, + 9, 15, 13, 23, 41, 43, 51, 59, 75, 31, + 51, 65, 49, 21, 33, 1, 0, 2, 82, 22, + 20, 9, 40, 104, 78, 116, 20, 66, 56, 6, 
+ 27, 43, 53, 105, 103, 107, 11, 84, 56, 38, + 18, 24, 4, 3, 9, 23, 31, 17, 4, 7, + 18, 18, 2, 8, 18, 24, 26, 14, 32, 30, + 2, 68, 58, 38, 24, 12, 0, 43, 53, 59, + }, + + { + /* Context Tables for I, SI Slices :: qp = 31 */ + + 80, 12, 31, 80, 12, 31, 17, 8, 44, 22, + 1, 27, 34, 46, 106, 28, 34, 5, 22, 40, + 0, 17, 3, 37, 69, 6, 14, 107, 115, 115, + 52, 3, 9, 22, 40, 0, 21, 2, 28, 2, + 0, 11, 21, 5, 33, 25, 55, 10, 13, 21, + 0, 25, 23, 49, 20, 1, 5, 13, 10, 8, + 44, 0, 0, 0, 3, 59, 67, 10, 5, 7, + 104, 14, 11, 61, 25, 37, 6, 5, 21, 5, + 33, 6, 21, 19, 9, 53, 55, 55, 59, 48, + 1, 18, 29, 1, 27, 13, 9, 17, 43, 27, + 49, 30, 17, 2, 37, 31, 3, 15, 4, 16, + 6, 6, 30, 24, 25, 3, 2, 3, 4, 21, + 24, 3, 18, 38, 8, 34, 34, 20, 27, 22, + 0, 19, 1, 51, 37, 13, 23, 3, 13, 9, + 20, 5, 11, 4, 12, 15, 17, 1, 59, 45, + 13, 63, 7, 23, 20, 3, 23, 40, 2, 3, + 27, 42, 21, 43, 2, 63, 34, 50, 46, 40, + 50, 38, 14, 44, 46, 15, 8, 22, 8, 2, + 21, 4, 5, 9, 0, 11, 25, 1, 9, 57, + 25, 21, 39, 29, 51, 64, 76, 60, 60, 58, + 46, 38, 46, 28, 14, 22, 2, 19, 21, 65, + 22, 16, 17, 16, 0, 3, 13, 21, 29, 35, + 39, 67, 23, 51, 73, 55, 73, 3, 66, 46, + 30, 10, 10, 1, 9, 17, 19, 12, 92, 60, + 46, 34, 60, 22, 10, 3, 5, 9, 76, 46, + 26, 12, 28, 6, 5, 13, 27, 23, 36, 12, + 4, 5, 6, 13, 23, 33, 2, 80, 46, 28, + 14, 36, 8, 2, 1, 11, 124, 35, 17, 0, + 5, 11, 2, 14, 16, 3, 14, 26, 30, 27, + 15, 19, 0, 30, 3, 7, 10, 22, 36, 21, + 5, 24, 28, 35, 29, 35, 33, 23, 3, 21, + 11, 15, 7, 2, 19, 15, 13, 1, 27, 21, + 35, 43, 7, 69, 25, 1, 13, 20, 11, 3, + 36, 7, 11, 5, 28, 25, 19, 59, 56, 66, + 68, 52, 32, 54, 46, 36, 34, 30, 28, 20, + 15, 17, 31, 0, 4, 13, 47, 15, 15, 27, + 25, 25, 19, 21, 49, 45, 43, 58, 56, 54, + 44, 20, 32, 26, 2, 14, 2, 0, 23, 25, + 31, 39, 20, 26, 17, 20, 22, 2, 9, 9, + 13, 19, 17, 27, 45, 47, 55, 63, 77, 35, + 55, 69, 47, 19, 31, 0, 2, 4, 86, 24, + 22, 9, 42, 108, 80, 120, 22, 64, 52, 2, + 33, 49, 59, 111, 107, 109, 11, 84, 56, 38, + 18, 24, 4, 3, 9, 21, 29, 15, 6, 5, + 20, 20, 4, 10, 20, 26, 28, 
14, 34, 30, + 2, 66, 56, 34, 20, 8, 3, 47, 57, 61, + }, + + { + /* Context Tables for I, SI Slices :: qp = 32 */ + + 76, 10, 33, 76, 10, 33, 15, 10, 44, 22, + 3, 31, 30, 42, 104, 28, 38, 7, 22, 40, + 1, 19, 3, 41, 73, 4, 10, 111, 117, 117, + 56, 1, 11, 22, 40, 1, 21, 4, 28, 0, + 0, 11, 21, 7, 35, 25, 57, 10, 13, 21, + 1, 27, 23, 49, 20, 1, 5, 13, 10, 6, + 44, 0, 0, 0, 3, 61, 67, 10, 7, 9, + 104, 12, 11, 59, 25, 37, 6, 3, 19, 3, + 31, 8, 19, 17, 7, 51, 55, 53, 59, 48, + 1, 18, 29, 1, 27, 13, 7, 17, 43, 27, + 47, 30, 17, 2, 35, 31, 3, 15, 4, 16, + 6, 6, 30, 24, 27, 5, 2, 5, 4, 21, + 22, 5, 16, 38, 6, 32, 34, 18, 31, 20, + 1, 23, 3, 53, 37, 13, 23, 3, 13, 9, + 22, 5, 11, 4, 14, 17, 17, 1, 63, 45, + 13, 63, 7, 25, 20, 3, 25, 40, 2, 3, + 31, 42, 23, 45, 2, 65, 30, 48, 44, 38, + 48, 36, 12, 42, 44, 19, 6, 20, 6, 2, + 23, 2, 7, 11, 1, 13, 27, 3, 11, 61, + 29, 23, 43, 33, 51, 60, 72, 56, 56, 54, + 40, 32, 40, 22, 8, 16, 3, 25, 27, 71, + 18, 12, 21, 10, 5, 9, 19, 27, 35, 41, + 45, 73, 25, 55, 77, 57, 75, 3, 66, 46, + 30, 10, 10, 3, 9, 17, 19, 12, 92, 60, + 46, 34, 62, 22, 10, 3, 5, 9, 76, 46, + 26, 12, 28, 6, 5, 13, 25, 23, 38, 12, + 4, 5, 6, 13, 23, 31, 2, 80, 46, 28, + 14, 36, 8, 2, 1, 11, 124, 35, 17, 0, + 5, 11, 2, 14, 16, 3, 14, 26, 30, 27, + 15, 21, 1, 30, 5, 9, 8, 20, 34, 25, + 9, 22, 26, 37, 33, 37, 33, 23, 1, 21, + 11, 15, 7, 2, 21, 15, 13, 1, 29, 21, + 35, 43, 7, 71, 25, 1, 13, 20, 13, 3, + 36, 9, 13, 5, 28, 27, 21, 61, 54, 64, + 66, 50, 28, 52, 44, 34, 32, 28, 26, 18, + 17, 21, 35, 3, 0, 17, 51, 19, 19, 29, + 27, 27, 19, 21, 49, 47, 43, 52, 50, 50, + 38, 14, 26, 20, 3, 8, 3, 5, 29, 31, + 35, 41, 16, 22, 21, 16, 16, 3, 15, 15, + 19, 23, 23, 33, 51, 51, 61, 67, 81, 39, + 59, 73, 45, 17, 29, 0, 2, 4, 88, 24, + 22, 9, 42, 110, 82, 122, 22, 60, 48, 3, + 41, 55, 65, 117, 113, 113, 11, 84, 54, 36, + 18, 24, 4, 5, 9, 21, 29, 15, 6, 5, + 22, 20, 4, 10, 20, 26, 28, 14, 34, 30, + 0, 62, 52, 30, 16, 4, 7, 53, 61, 63, + }, + + { + /* Context Tables for I, SI Slices :: qp = 33 
*/ + + 74, 10, 33, 74, 10, 33, 11, 14, 46, 24, + 3, 33, 28, 40, 104, 28, 44, 7, 24, 42, + 1, 19, 1, 43, 75, 4, 8, 113, 119, 117, + 62, 2, 11, 24, 42, 1, 19, 8, 30, 0, + 2, 9, 19, 7, 35, 23, 57, 10, 11, 19, + 1, 27, 21, 49, 22, 0, 3, 11, 12, 6, + 44, 0, 0, 0, 1, 61, 67, 12, 7, 9, + 104, 12, 11, 55, 23, 37, 8, 0, 15, 0, + 27, 12, 15, 13, 3, 47, 53, 51, 57, 50, + 0, 20, 27, 0, 25, 11, 3, 15, 41, 25, + 43, 32, 15, 4, 31, 29, 1, 13, 6, 18, + 8, 8, 32, 26, 27, 5, 4, 5, 4, 19, + 22, 5, 16, 38, 6, 32, 34, 18, 33, 20, + 1, 25, 5, 53, 35, 11, 21, 1, 11, 7, + 24, 3, 9, 6, 18, 17, 17, 0, 65, 43, + 11, 63, 5, 25, 22, 1, 25, 42, 2, 3, + 33, 44, 23, 47, 2, 65, 28, 48, 44, 38, + 48, 36, 12, 42, 44, 21, 6, 20, 6, 2, + 23, 2, 7, 11, 1, 13, 29, 3, 11, 63, + 31, 23, 45, 35, 51, 58, 70, 54, 54, 52, + 36, 28, 36, 18, 4, 12, 7, 29, 31, 75, + 16, 10, 23, 6, 9, 13, 23, 31, 39, 45, + 49, 77, 25, 57, 79, 57, 75, 1, 68, 48, + 32, 12, 10, 3, 7, 15, 17, 14, 94, 62, + 48, 34, 64, 24, 12, 1, 3, 7, 78, 48, + 28, 14, 30, 8, 3, 11, 21, 21, 42, 14, + 6, 3, 8, 11, 21, 27, 4, 82, 48, 30, + 16, 38, 10, 4, 0, 9, 124, 33, 15, 2, + 5, 9, 2, 16, 18, 1, 16, 28, 32, 25, + 15, 21, 1, 32, 5, 9, 6, 20, 34, 27, + 11, 22, 26, 37, 35, 37, 31, 21, 0, 19, + 9, 13, 5, 4, 21, 13, 11, 0, 29, 19, + 33, 41, 5, 71, 25, 0, 13, 22, 13, 3, + 38, 9, 13, 3, 30, 27, 21, 61, 54, 64, + 66, 50, 26, 52, 44, 34, 32, 28, 26, 18, + 17, 23, 37, 5, 1, 19, 53, 21, 21, 29, + 27, 27, 17, 19, 47, 47, 41, 48, 46, 46, + 34, 10, 22, 16, 7, 4, 7, 9, 33, 35, + 37, 41, 14, 20, 23, 14, 12, 7, 19, 19, + 23, 27, 27, 37, 55, 53, 65, 69, 83, 41, + 61, 75, 41, 13, 25, 2, 4, 6, 92, 26, + 24, 9, 44, 114, 86, 124, 24, 58, 46, 7, + 47, 59, 69, 121, 117, 115, 9, 86, 54, 36, + 18, 26, 4, 5, 9, 19, 27, 13, 8, 3, + 26, 22, 6, 12, 22, 28, 30, 16, 36, 32, + 0, 60, 50, 28, 14, 2, 9, 57, 63, 63, + }, + + { + /* Context Tables for I, SI Slices :: qp = 34 */ + + 72, 10, 33, 72, 10, 33, 7, 16, 46, 24, + 3, 35, 26, 38, 104, 28, 48, 7, 26, 44, + 1, 21, 1, 45, 77, 2, 4, 
117, 121, 119, + 68, 4, 11, 26, 44, 1, 19, 10, 30, 1, + 2, 9, 17, 7, 35, 23, 57, 10, 11, 17, + 1, 27, 21, 49, 22, 0, 3, 11, 12, 6, + 44, 0, 0, 0, 0, 61, 67, 12, 9, 9, + 104, 12, 11, 53, 21, 37, 8, 2, 11, 4, + 25, 14, 11, 11, 0, 45, 51, 49, 55, 52, + 0, 22, 25, 0, 25, 9, 0, 15, 41, 25, + 41, 32, 15, 4, 29, 29, 0, 13, 6, 18, + 8, 10, 32, 26, 29, 5, 4, 5, 4, 19, + 22, 5, 14, 38, 4, 32, 34, 18, 37, 20, + 1, 29, 7, 55, 33, 9, 19, 0, 9, 5, + 26, 1, 9, 8, 20, 17, 17, 0, 67, 43, + 9, 63, 5, 25, 24, 1, 27, 44, 2, 3, + 35, 46, 25, 49, 2, 67, 24, 46, 42, 36, + 46, 34, 10, 40, 44, 23, 4, 20, 6, 2, + 23, 0, 9, 13, 3, 15, 31, 5, 13, 67, + 33, 25, 47, 37, 51, 54, 68, 50, 50, 48, + 32, 24, 32, 12, 0, 8, 13, 35, 35, 81, + 12, 6, 27, 2, 13, 17, 27, 35, 43, 51, + 53, 81, 27, 61, 81, 59, 75, 1, 68, 48, + 32, 12, 10, 3, 7, 15, 15, 14, 94, 62, + 48, 34, 66, 24, 14, 1, 1, 5, 80, 50, + 28, 14, 32, 8, 3, 9, 19, 19, 44, 16, + 6, 3, 10, 9, 19, 25, 4, 82, 48, 30, + 16, 40, 12, 6, 2, 7, 124, 33, 15, 2, + 5, 9, 2, 16, 18, 1, 16, 28, 34, 25, + 15, 21, 3, 32, 7, 11, 4, 18, 34, 29, + 13, 22, 26, 39, 37, 39, 29, 19, 2, 19, + 9, 13, 3, 6, 21, 13, 11, 0, 31, 19, + 33, 41, 3, 71, 25, 0, 13, 24, 13, 3, + 40, 9, 13, 3, 32, 29, 21, 63, 52, 64, + 66, 48, 24, 50, 42, 32, 30, 26, 24, 16, + 19, 25, 39, 9, 3, 23, 57, 23, 23, 31, + 29, 27, 17, 19, 47, 49, 39, 44, 42, 42, + 30, 4, 18, 12, 13, 0, 11, 13, 37, 39, + 39, 43, 10, 16, 25, 10, 8, 11, 23, 23, + 27, 31, 31, 41, 59, 57, 69, 73, 85, 45, + 65, 79, 39, 11, 23, 4, 6, 8, 96, 28, + 26, 9, 46, 116, 88, 124, 26, 56, 42, 13, + 53, 65, 75, 125, 121, 117, 9, 86, 54, 36, + 18, 26, 4, 5, 9, 17, 25, 11, 10, 1, + 28, 24, 8, 14, 24, 30, 32, 16, 38, 32, + 0, 58, 46, 24, 10, 1, 13, 61, 67, 65, + }, + + { + /* Context Tables for I, SI Slices :: qp = 35 */ + + 70, 10, 33, 70, 10, 33, 3, 20, 48, 24, + 5, 39, 22, 36, 104, 28, 52, 7, 28, 46, + 1, 23, 1, 47, 79, 2, 0, 119, 123, 119, + 74, 6, 11, 28, 46, 1, 17, 12, 32, 1, + 2, 7, 15, 9, 37, 23, 57, 10, 11, 17, + 1, 27, 21, 49, 
24, 0, 3, 11, 14, 6, + 44, 0, 0, 0, 0, 61, 67, 14, 11, 9, + 104, 12, 11, 51, 19, 37, 10, 4, 7, 8, + 21, 16, 7, 9, 4, 43, 49, 47, 53, 54, + 0, 24, 23, 0, 23, 9, 4, 15, 39, 23, + 39, 32, 15, 6, 25, 27, 2, 13, 6, 18, + 8, 12, 34, 28, 29, 5, 4, 5, 4, 19, + 22, 7, 14, 38, 2, 32, 34, 18, 41, 20, + 3, 33, 9, 57, 31, 7, 17, 2, 7, 5, + 28, 1, 7, 10, 22, 17, 17, 0, 69, 41, + 7, 63, 5, 25, 26, 1, 29, 46, 2, 3, + 37, 48, 25, 51, 2, 69, 22, 44, 42, 34, + 44, 32, 10, 38, 42, 25, 2, 20, 6, 2, + 25, 1, 11, 15, 5, 17, 33, 7, 13, 71, + 35, 25, 49, 39, 51, 52, 64, 48, 46, 44, + 28, 20, 28, 8, 5, 4, 17, 39, 41, 85, + 8, 2, 29, 1, 17, 23, 33, 39, 49, 55, + 57, 87, 29, 65, 83, 61, 75, 1, 70, 48, + 32, 12, 10, 3, 7, 15, 15, 14, 94, 62, + 48, 34, 68, 26, 14, 0, 1, 3, 82, 50, + 30, 14, 34, 10, 1, 9, 17, 17, 46, 18, + 8, 3, 12, 7, 17, 23, 6, 82, 48, 30, + 16, 42, 12, 8, 4, 5, 124, 31, 13, 4, + 5, 9, 2, 16, 18, 1, 16, 28, 34, 25, + 15, 21, 3, 34, 9, 13, 2, 16, 34, 31, + 15, 22, 26, 39, 39, 39, 27, 17, 4, 17, + 7, 11, 3, 8, 21, 13, 11, 2, 31, 19, + 33, 39, 1, 71, 25, 0, 13, 26, 13, 3, + 42, 9, 13, 3, 34, 31, 21, 65, 50, 62, + 64, 46, 22, 48, 40, 30, 28, 24, 22, 14, + 21, 27, 41, 11, 5, 25, 61, 27, 25, 33, + 29, 29, 17, 19, 47, 49, 37, 40, 38, 38, + 26, 1, 14, 8, 17, 3, 15, 17, 43, 43, + 43, 45, 6, 14, 29, 6, 4, 15, 27, 27, + 31, 35, 35, 45, 65, 61, 73, 77, 87, 49, + 69, 81, 37, 9, 21, 6, 8, 10, 98, 30, + 28, 9, 48, 120, 92, 124, 28, 52, 40, 17, + 59, 71, 81, 125, 125, 119, 9, 86, 54, 36, + 18, 26, 4, 5, 9, 17, 23, 9, 12, 0, + 30, 24, 8, 16, 24, 30, 32, 16, 38, 32, + 0, 56, 44, 20, 6, 5, 17, 65, 71, 67, + }, + + { + /* Context Tables for I, SI Slices :: qp = 36 */ + + 66, 10, 33, 66, 10, 33, 1, 22, 48, 24, + 5, 41, 20, 32, 104, 28, 58, 9, 30, 46, + 1, 25, 1, 49, 81, 0, 3, 123, 125, 121, + 80, 8, 11, 30, 46, 1, 17, 14, 32, 3, + 2, 7, 15, 9, 37, 21, 57, 10, 11, 15, + 3, 29, 21, 49, 24, 2, 3, 9, 14, 6, + 44, 0, 0, 0, 2, 63, 67, 14, 13, 9, + 104, 12, 11, 49, 17, 37, 10, 8, 5, 12, + 19, 18, 5, 5, 
8, 41, 49, 45, 53, 54, + 0, 26, 21, 0, 23, 7, 8, 15, 39, 23, + 37, 32, 13, 6, 23, 27, 2, 13, 8, 18, + 8, 12, 34, 28, 31, 7, 4, 5, 4, 19, + 22, 7, 12, 38, 2, 32, 34, 16, 45, 20, + 3, 37, 11, 59, 29, 5, 15, 4, 7, 3, + 30, 0, 7, 12, 24, 17, 17, 2, 71, 41, + 5, 63, 5, 25, 26, 1, 31, 48, 2, 3, + 41, 50, 27, 53, 2, 71, 18, 42, 40, 32, + 42, 30, 8, 36, 42, 29, 0, 18, 6, 2, + 25, 3, 11, 17, 5, 19, 35, 9, 15, 75, + 37, 27, 51, 41, 51, 48, 62, 44, 42, 40, + 24, 14, 24, 2, 9, 0, 23, 45, 45, 91, + 4, 0, 33, 7, 21, 27, 37, 45, 53, 61, + 61, 91, 31, 67, 87, 63, 75, 1, 70, 50, + 32, 12, 10, 3, 7, 15, 13, 16, 96, 62, + 48, 34, 70, 26, 16, 0, 0, 3, 84, 52, + 30, 14, 34, 10, 1, 7, 15, 17, 48, 18, + 8, 3, 14, 5, 17, 21, 6, 84, 48, 30, + 16, 44, 14, 8, 6, 3, 124, 31, 13, 4, + 5, 9, 2, 16, 18, 0, 16, 28, 36, 25, + 15, 21, 5, 34, 11, 15, 0, 14, 34, 35, + 17, 20, 26, 41, 41, 41, 27, 17, 6, 17, + 7, 11, 1, 10, 23, 13, 11, 2, 33, 19, + 33, 39, 0, 71, 25, 2, 13, 26, 13, 3, + 44, 9, 15, 3, 36, 31, 23, 67, 48, 62, + 64, 44, 20, 46, 38, 28, 26, 22, 20, 12, + 23, 29, 43, 15, 9, 29, 65, 29, 27, 35, + 31, 29, 17, 19, 47, 51, 37, 34, 32, 34, + 20, 7, 10, 4, 23, 7, 19, 21, 47, 47, + 45, 47, 2, 10, 31, 2, 0, 19, 31, 33, + 35, 39, 39, 49, 69, 65, 77, 81, 89, 53, + 73, 85, 35, 7, 19, 6, 10, 12, 102, 32, + 30, 9, 50, 122, 94, 124, 28, 50, 36, 23, + 65, 77, 87, 125, 125, 121, 9, 86, 54, 36, + 18, 26, 4, 5, 9, 15, 21, 7, 14, 0, + 32, 26, 10, 18, 26, 32, 34, 18, 40, 34, + 0, 52, 40, 16, 2, 9, 21, 69, 75, 69, + }, + + { + /* Context Tables for I, SI Slices :: qp = 37 */ + + 64, 10, 33, 64, 10, 33, 2, 26, 48, 24, + 7, 45, 16, 30, 104, 28, 62, 9, 32, 48, + 1, 25, 0, 51, 83, 0, 7, 125, 125, 121, + 86, 12, 11, 32, 48, 1, 17, 16, 32, 3, + 2, 7, 13, 9, 39, 21, 57, 10, 11, 13, + 3, 29, 21, 49, 26, 2, 1, 9, 16, 6, + 44, 0, 0, 0, 2, 63, 67, 16, 15, 9, + 104, 12, 11, 47, 15, 37, 12, 10, 1, 16, + 15, 20, 1, 3, 12, 39, 47, 43, 51, 56, + 0, 28, 19, 0, 21, 5, 12, 15, 39, 21, + 33, 34, 13, 8, 21, 25, 4, 13, 8, 20, + 10, 
14, 36, 30, 31, 7, 4, 5, 4, 19, + 22, 9, 10, 38, 0, 32, 34, 16, 49, 20, + 5, 39, 13, 61, 27, 3, 13, 6, 5, 1, + 32, 2, 5, 14, 26, 17, 17, 2, 73, 39, + 3, 63, 3, 25, 28, 1, 33, 50, 2, 3, + 43, 52, 27, 55, 2, 71, 16, 40, 40, 30, + 40, 28, 6, 36, 42, 31, 1, 18, 6, 2, + 27, 3, 13, 19, 7, 21, 37, 9, 17, 79, + 39, 29, 53, 43, 51, 44, 60, 40, 38, 38, + 20, 10, 20, 1, 13, 3, 27, 49, 49, 95, + 0, 3, 37, 11, 25, 31, 41, 49, 57, 67, + 65, 97, 33, 71, 89, 63, 75, 1, 70, 50, + 32, 12, 10, 3, 7, 15, 13, 16, 96, 62, + 48, 34, 72, 28, 18, 0, 2, 1, 86, 54, + 30, 16, 36, 12, 0, 7, 13, 15, 52, 20, + 8, 1, 16, 3, 15, 19, 8, 84, 50, 30, + 16, 46, 16, 10, 8, 1, 124, 31, 13, 6, + 5, 9, 2, 16, 18, 0, 16, 30, 36, 25, + 15, 21, 7, 36, 13, 17, 1, 14, 34, 37, + 19, 20, 26, 43, 43, 41, 25, 15, 8, 15, + 5, 9, 1, 12, 23, 13, 11, 2, 35, 19, + 33, 37, 2, 71, 25, 2, 13, 28, 13, 3, + 46, 9, 15, 3, 38, 33, 23, 67, 46, 62, + 62, 44, 18, 44, 36, 26, 26, 20, 18, 10, + 25, 31, 45, 17, 11, 31, 69, 31, 29, 37, + 33, 31, 17, 17, 47, 51, 35, 30, 28, 30, + 16, 11, 6, 0, 27, 11, 23, 25, 51, 51, + 49, 49, 1, 6, 33, 1, 3, 23, 35, 37, + 39, 43, 43, 53, 75, 69, 81, 85, 91, 57, + 77, 87, 33, 3, 17, 8, 12, 14, 106, 34, + 32, 9, 52, 124, 98, 124, 30, 46, 32, 27, + 71, 81, 93, 125, 125, 123, 9, 86, 54, 36, + 18, 26, 4, 5, 9, 15, 19, 5, 16, 2, + 34, 28, 10, 20, 28, 34, 36, 18, 40, 34, + 0, 50, 38, 14, 0, 13, 25, 73, 79, 71, + }, + + { + /* Context Tables for I, SI Slices :: qp = 38 */ + + 62, 10, 35, 62, 10, 35, 6, 28, 50, 24, + 7, 47, 14, 28, 104, 28, 66, 9, 32, 50, + 1, 27, 0, 53, 85, 1, 9, 125, 125, 123, + 92, 14, 11, 32, 50, 1, 15, 18, 34, 5, + 4, 5, 11, 11, 39, 21, 57, 10, 9, 13, + 3, 29, 19, 49, 26, 2, 1, 9, 16, 6, + 44, 0, 0, 0, 4, 63, 67, 16, 15, 11, + 104, 12, 11, 45, 15, 37, 12, 12, 2, 20, + 13, 22, 2, 1, 16, 37, 45, 41, 49, 58, + 0, 30, 19, 2, 21, 5, 16, 15, 37, 21, + 31, 34, 13, 8, 17, 25, 6, 11, 8, 20, + 10, 16, 36, 30, 33, 7, 4, 5, 4, 19, + 22, 9, 10, 38, 1, 32, 34, 16, 53, 20, + 5, 43, 15, 63, 27, 1, 13, 
6, 3, 1, + 34, 2, 5, 14, 30, 17, 17, 2, 75, 39, + 1, 63, 3, 25, 30, 0, 35, 50, 2, 3, + 45, 54, 29, 57, 2, 73, 12, 38, 38, 30, + 38, 28, 6, 34, 40, 33, 3, 18, 6, 2, + 27, 5, 15, 21, 9, 21, 39, 11, 17, 83, + 41, 29, 55, 45, 51, 42, 56, 38, 34, 34, + 16, 6, 16, 7, 19, 7, 33, 55, 55, 101, + 3, 7, 39, 15, 31, 37, 47, 53, 63, 71, + 71, 101, 35, 75, 91, 65, 75, 0, 72, 50, + 34, 12, 10, 3, 7, 15, 11, 16, 96, 64, + 48, 34, 74, 28, 18, 2, 2, 0, 88, 54, + 32, 16, 38, 12, 0, 5, 11, 13, 54, 22, + 10, 1, 16, 3, 13, 15, 8, 84, 50, 30, + 18, 46, 16, 12, 8, 0, 124, 29, 11, 6, + 5, 9, 2, 16, 18, 0, 18, 30, 38, 25, + 15, 23, 7, 36, 15, 19, 3, 12, 34, 39, + 21, 20, 26, 43, 45, 43, 23, 13, 10, 15, + 5, 9, 0, 12, 23, 11, 9, 4, 35, 19, + 33, 37, 4, 71, 25, 2, 13, 30, 13, 3, + 48, 11, 15, 3, 38, 35, 23, 69, 44, 60, + 62, 42, 16, 44, 36, 24, 24, 20, 16, 8, + 27, 33, 47, 21, 13, 35, 73, 35, 33, 39, + 33, 31, 17, 17, 45, 53, 33, 26, 24, 26, + 12, 17, 2, 3, 33, 15, 29, 29, 57, 55, + 51, 49, 3, 4, 37, 5, 9, 27, 39, 41, + 43, 47, 49, 57, 79, 73, 85, 87, 95, 61, + 81, 91, 31, 1, 15, 10, 14, 16, 108, 36, + 34, 9, 54, 124, 100, 124, 32, 44, 30, 33, + 77, 87, 97, 125, 125, 125, 7, 86, 54, 36, + 18, 26, 4, 5, 9, 13, 17, 5, 18, 4, + 36, 28, 12, 22, 28, 34, 36, 18, 42, 34, + 0, 48, 34, 10, 3, 15, 27, 79, 81, 73, + }, + + { + /* Context Tables for I, SI Slices :: qp = 39 */ + + 60, 10, 35, 60, 10, 35, 10, 32, 50, 24, + 9, 51, 10, 24, 104, 28, 72, 11, 34, 50, + 1, 29, 0, 55, 87, 1, 13, 125, 125, 123, + 98, 16, 11, 34, 50, 1, 15, 20, 34, 5, + 4, 5, 11, 11, 41, 19, 57, 10, 9, 11, + 5, 29, 19, 49, 28, 4, 1, 7, 18, 6, + 44, 0, 0, 0, 4, 63, 67, 18, 17, 11, + 104, 12, 11, 43, 13, 37, 14, 16, 4, 24, + 9, 24, 4, 2, 20, 35, 43, 39, 49, 60, + 0, 32, 17, 2, 19, 3, 20, 15, 37, 19, + 29, 34, 11, 10, 15, 23, 6, 11, 10, 20, + 10, 18, 38, 32, 33, 9, 4, 5, 4, 19, + 22, 11, 8, 38, 1, 32, 34, 14, 57, 20, + 7, 47, 17, 65, 25, 0, 11, 8, 1, 0, + 36, 4, 3, 16, 32, 17, 17, 4, 77, 37, + 0, 63, 3, 25, 30, 0, 37, 52, 2, 3, + 49, 
56, 29, 59, 2, 75, 10, 36, 38, 28, + 36, 26, 4, 32, 40, 35, 5, 18, 6, 2, + 29, 7, 15, 23, 9, 23, 41, 13, 19, 87, + 43, 31, 57, 47, 51, 38, 54, 34, 30, 30, + 12, 0, 12, 11, 23, 11, 37, 59, 59, 105, + 7, 9, 43, 21, 35, 41, 51, 59, 67, 77, + 75, 107, 37, 77, 95, 67, 75, 0, 72, 52, + 34, 12, 10, 3, 7, 15, 11, 18, 98, 64, + 48, 34, 76, 30, 20, 2, 4, 2, 90, 56, + 32, 16, 40, 14, 2, 5, 9, 13, 56, 22, + 10, 1, 18, 1, 13, 13, 10, 86, 50, 30, + 18, 48, 18, 14, 10, 2, 124, 29, 11, 8, + 5, 9, 2, 16, 18, 2, 18, 30, 38, 25, + 15, 23, 9, 38, 17, 21, 5, 10, 34, 41, + 23, 18, 26, 45, 47, 43, 21, 13, 12, 13, + 3, 7, 0, 14, 23, 11, 9, 4, 37, 19, + 33, 35, 6, 71, 25, 4, 13, 30, 13, 3, + 50, 11, 15, 3, 40, 35, 25, 71, 42, 60, + 60, 40, 14, 42, 34, 22, 22, 18, 14, 6, + 29, 35, 49, 23, 17, 37, 77, 37, 35, 41, + 35, 33, 17, 17, 45, 53, 31, 22, 20, 22, + 6, 23, 1, 7, 37, 19, 33, 33, 61, 59, + 55, 51, 7, 0, 39, 9, 13, 31, 43, 45, + 47, 51, 53, 61, 85, 77, 89, 91, 97, 65, + 85, 93, 29, 0, 13, 10, 16, 18, 112, 38, + 36, 9, 56, 124, 104, 124, 32, 40, 26, 37, + 83, 93, 103, 125, 125, 125, 7, 86, 54, 36, + 18, 26, 4, 5, 9, 13, 15, 3, 20, 6, + 38, 30, 12, 24, 30, 36, 38, 20, 42, 36, + 0, 44, 32, 6, 7, 19, 31, 83, 85, 75, + }, + + { + /* Context Tables for I, SI Slices :: qp = 40 */ + + 56, 8, 35, 56, 8, 35, 12, 34, 50, 24, + 9, 53, 8, 22, 104, 28, 76, 11, 36, 52, + 1, 31, 0, 57, 91, 3, 17, 125, 125, 125, + 102, 18, 11, 36, 52, 1, 15, 22, 34, 7, + 4, 5, 9, 13, 41, 19, 59, 10, 9, 11, + 5, 31, 19, 49, 28, 4, 1, 7, 18, 6, + 44, 0, 0, 0, 6, 65, 67, 18, 19, 11, + 104, 12, 11, 41, 11, 37, 14, 18, 8, 28, + 7, 26, 8, 4, 22, 33, 43, 37, 47, 60, + 0, 34, 15, 2, 19, 3, 22, 15, 37, 19, + 27, 34, 11, 10, 13, 23, 8, 11, 10, 20, + 10, 18, 38, 32, 35, 9, 4, 5, 4, 19, + 22, 11, 6, 38, 3, 32, 34, 14, 61, 20, + 7, 51, 19, 67, 23, 2, 9, 10, 1, 0, + 38, 4, 3, 18, 34, 17, 17, 4, 81, 37, + 2, 63, 3, 27, 32, 0, 39, 54, 2, 3, + 51, 58, 31, 61, 2, 77, 6, 34, 36, 26, + 34, 24, 2, 30, 38, 39, 7, 16, 6, 2, + 29, 9, 17, 25, 
11, 25, 43, 15, 21, 91, + 47, 33, 59, 49, 51, 34, 50, 30, 26, 26, + 8, 3, 8, 17, 29, 15, 43, 65, 65, 111, + 11, 13, 47, 25, 39, 47, 57, 63, 73, 83, + 79, 111, 39, 81, 97, 69, 77, 0, 72, 52, + 34, 12, 10, 3, 7, 15, 9, 18, 98, 64, + 48, 34, 78, 30, 20, 2, 4, 2, 92, 56, + 32, 16, 40, 14, 2, 3, 7, 11, 58, 24, + 10, 1, 20, 0, 11, 11, 10, 86, 50, 30, + 18, 50, 18, 14, 12, 2, 124, 29, 11, 8, + 5, 9, 2, 16, 18, 2, 18, 30, 40, 25, + 15, 23, 11, 38, 19, 23, 7, 8, 34, 45, + 27, 18, 26, 47, 49, 45, 21, 11, 14, 13, + 3, 7, 2, 16, 25, 11, 9, 4, 39, 19, + 33, 35, 6, 73, 25, 4, 13, 32, 13, 3, + 50, 11, 17, 3, 42, 37, 25, 73, 40, 58, + 60, 38, 10, 40, 32, 20, 20, 16, 12, 4, + 31, 37, 51, 27, 19, 41, 81, 41, 37, 43, + 37, 33, 17, 17, 45, 55, 31, 16, 14, 18, + 2, 29, 7, 13, 43, 23, 37, 37, 67, 63, + 57, 53, 11, 3, 43, 13, 17, 37, 49, 51, + 53, 55, 57, 67, 89, 81, 95, 95, 99, 69, + 89, 97, 27, 2, 11, 12, 18, 18, 114, 40, + 36, 9, 56, 124, 106, 124, 34, 38, 22, 43, + 89, 99, 109, 125, 125, 125, 7, 86, 54, 36, + 18, 26, 4, 5, 9, 11, 15, 1, 22, 6, + 40, 30, 14, 24, 30, 36, 38, 20, 44, 36, + 1, 42, 28, 2, 11, 23, 35, 87, 89, 77, + }, + + { + /* Context Tables for I, SI Slices :: qp = 41 */ + + 54, 8, 35, 54, 8, 35, 16, 36, 52, 24, + 9, 55, 6, 20, 104, 28, 80, 11, 38, 54, + 1, 31, 2, 59, 93, 5, 21, 125, 125, 125, + 108, 22, 11, 38, 54, 1, 13, 24, 36, 9, + 4, 3, 7, 13, 41, 19, 59, 10, 9, 9, + 5, 31, 19, 49, 28, 4, 0, 7, 18, 6, + 44, 0, 0, 0, 8, 65, 67, 18, 21, 11, + 104, 12, 11, 39, 9, 37, 14, 20, 12, 32, + 3, 30, 12, 6, 26, 31, 41, 35, 45, 62, + 2, 36, 13, 2, 17, 1, 26, 15, 35, 19, + 23, 36, 11, 10, 9, 23, 10, 11, 10, 22, + 12, 20, 38, 32, 35, 9, 6, 5, 4, 17, + 22, 11, 6, 38, 5, 32, 34, 14, 65, 20, + 7, 53, 21, 67, 21, 4, 7, 12, 0, 2, + 40, 6, 3, 20, 36, 17, 17, 4, 83, 35, + 4, 63, 1, 27, 34, 0, 41, 56, 2, 3, + 53, 60, 33, 63, 2, 77, 4, 32, 36, 24, + 32, 22, 2, 30, 38, 41, 9, 16, 6, 2, + 29, 9, 19, 27, 13, 27, 45, 15, 21, 93, + 49, 33, 61, 51, 51, 32, 48, 28, 24, 24, + 4, 7, 4, 21, 33, 19, 
47, 71, 69, 117, + 13, 17, 49, 29, 43, 51, 61, 67, 77, 87, + 83, 115, 39, 85, 99, 69, 77, 0, 74, 52, + 34, 14, 10, 3, 5, 15, 7, 18, 98, 64, + 50, 34, 80, 32, 22, 4, 6, 4, 94, 58, + 34, 18, 42, 16, 2, 1, 5, 9, 62, 26, + 12, 0, 22, 2, 9, 9, 12, 86, 52, 32, + 18, 52, 20, 16, 14, 4, 124, 27, 9, 10, + 5, 9, 2, 18, 20, 2, 18, 32, 42, 25, + 15, 23, 11, 38, 21, 23, 9, 8, 34, 47, + 29, 18, 26, 47, 51, 47, 19, 9, 16, 13, + 1, 7, 4, 18, 25, 11, 9, 6, 39, 19, + 33, 35, 8, 73, 25, 4, 13, 34, 13, 3, + 52, 11, 17, 3, 44, 39, 25, 73, 38, 58, + 60, 38, 8, 38, 30, 18, 20, 14, 10, 4, + 31, 39, 53, 29, 21, 45, 85, 43, 39, 43, + 37, 33, 15, 15, 45, 57, 29, 12, 10, 14, + 1, 33, 11, 17, 47, 27, 41, 41, 71, 67, + 59, 55, 15, 5, 45, 17, 21, 41, 53, 55, + 57, 59, 61, 71, 93, 83, 99, 99, 101, 73, + 93, 101, 25, 6, 7, 14, 20, 20, 118, 42, + 38, 9, 58, 124, 108, 124, 36, 36, 20, 47, + 95, 103, 115, 125, 125, 125, 7, 86, 54, 36, + 18, 28, 4, 5, 9, 9, 13, 0, 24, 8, + 44, 32, 16, 26, 32, 38, 40, 20, 46, 36, + 1, 40, 26, 0, 13, 27, 39, 91, 93, 77, + }, + + { + /* Context Tables for I, SI Slices :: qp = 42 */ + + 52, 8, 35, 52, 8, 35, 20, 40, 52, 24, + 11, 59, 2, 16, 104, 28, 86, 13, 40, 54, + 1, 33, 2, 61, 95, 5, 25, 125, 125, 125, + 114, 24, 11, 40, 54, 1, 13, 26, 36, 9, + 4, 3, 7, 13, 43, 17, 59, 10, 9, 7, + 7, 31, 19, 49, 30, 6, 0, 5, 20, 6, + 44, 0, 0, 0, 8, 65, 67, 20, 23, 11, + 104, 12, 11, 37, 7, 37, 16, 24, 14, 36, + 1, 32, 14, 10, 30, 29, 39, 33, 45, 64, + 2, 38, 11, 2, 17, 0, 30, 15, 35, 17, + 21, 36, 9, 12, 7, 21, 10, 11, 12, 22, + 12, 22, 40, 34, 37, 11, 6, 5, 4, 17, + 22, 13, 4, 38, 5, 32, 34, 12, 69, 20, + 9, 57, 23, 69, 19, 6, 5, 14, 2, 4, + 42, 8, 1, 22, 38, 17, 17, 6, 85, 35, + 6, 63, 1, 27, 34, 0, 43, 58, 2, 3, + 57, 62, 33, 65, 2, 79, 0, 30, 34, 22, + 30, 20, 0, 28, 38, 43, 11, 16, 6, 2, + 31, 11, 19, 29, 13, 29, 47, 17, 23, 97, + 51, 35, 63, 53, 51, 28, 46, 24, 20, 20, + 0, 13, 0, 27, 37, 23, 53, 75, 73, 121, + 17, 19, 53, 35, 47, 55, 65, 73, 81, 93, + 87, 121, 41, 87, 103, 71, 
77, 0, 74, 54, + 34, 14, 10, 3, 5, 15, 7, 20, 100, 64, + 50, 34, 82, 32, 24, 4, 8, 6, 96, 60, + 34, 18, 44, 16, 4, 1, 3, 9, 64, 26, + 12, 0, 24, 4, 9, 7, 12, 88, 52, 32, + 18, 54, 22, 18, 16, 6, 124, 27, 9, 10, + 5, 9, 2, 18, 20, 4, 18, 32, 42, 25, + 15, 23, 13, 40, 23, 25, 11, 6, 34, 49, + 31, 16, 26, 49, 53, 47, 17, 9, 18, 11, + 1, 5, 4, 20, 25, 11, 9, 6, 41, 19, + 33, 33, 10, 73, 25, 6, 13, 34, 13, 3, + 54, 11, 17, 3, 46, 39, 27, 75, 36, 58, + 58, 36, 6, 36, 28, 16, 18, 12, 8, 2, + 33, 41, 55, 33, 25, 47, 89, 45, 41, 45, + 39, 35, 15, 15, 45, 57, 27, 8, 6, 10, + 7, 39, 15, 21, 53, 31, 45, 45, 75, 71, + 63, 57, 19, 9, 47, 21, 25, 45, 57, 59, + 61, 63, 65, 75, 99, 87, 103, 103, 103, 77, + 97, 103, 23, 8, 5, 14, 22, 22, 122, 44, + 40, 9, 60, 124, 112, 124, 36, 32, 16, 53, + 101, 109, 121, 125, 125, 125, 7, 86, 54, 36, + 18, 28, 4, 5, 9, 9, 11, 2, 26, 10, + 46, 34, 16, 28, 34, 40, 42, 22, 46, 38, + 1, 36, 22, 3, 17, 31, 43, 95, 97, 79, + }, + + { + /* Context Tables for I, SI Slices :: qp = 43 */ + + 50, 8, 37, 50, 8, 37, 24, 42, 54, 24, + 11, 61, 0, 14, 104, 28, 90, 13, 40, 56, + 1, 35, 2, 63, 97, 7, 27, 125, 125, 125, + 120, 26, 11, 40, 56, 1, 11, 28, 38, 11, + 6, 1, 5, 15, 43, 17, 59, 10, 7, 7, + 7, 31, 17, 49, 30, 6, 0, 5, 20, 6, + 44, 0, 0, 0, 10, 65, 67, 20, 23, 13, + 104, 12, 11, 35, 7, 37, 16, 26, 18, 40, + 2, 34, 18, 12, 34, 27, 37, 31, 43, 66, + 2, 40, 11, 4, 15, 0, 34, 15, 33, 17, + 19, 36, 9, 12, 3, 21, 12, 9, 12, 22, + 12, 24, 40, 34, 37, 11, 6, 5, 4, 17, + 22, 13, 4, 38, 7, 32, 34, 12, 73, 20, + 9, 61, 25, 71, 19, 8, 5, 14, 4, 4, + 44, 8, 1, 22, 42, 17, 17, 6, 87, 33, + 8, 63, 1, 27, 36, 2, 45, 58, 2, 3, + 59, 64, 35, 67, 2, 81, 1, 28, 34, 22, + 28, 20, 0, 26, 36, 45, 13, 16, 6, 2, + 31, 13, 21, 31, 15, 29, 49, 19, 23, 101, + 53, 35, 65, 55, 51, 26, 42, 22, 16, 16, + 3, 17, 3, 31, 43, 27, 57, 81, 79, 125, + 21, 23, 55, 39, 53, 61, 71, 77, 87, 97, + 93, 125, 43, 91, 105, 73, 77, 2, 76, 54, + 36, 14, 10, 3, 5, 15, 5, 20, 100, 66, + 50, 34, 84, 34, 24, 6, 
8, 8, 98, 60, + 36, 18, 46, 18, 4, 0, 1, 7, 66, 28, + 14, 0, 24, 4, 7, 3, 14, 88, 52, 32, + 20, 54, 22, 20, 16, 8, 124, 25, 7, 12, + 5, 9, 2, 18, 20, 4, 20, 32, 44, 25, + 15, 25, 13, 40, 25, 27, 13, 4, 34, 51, + 33, 16, 26, 49, 55, 49, 15, 7, 20, 11, + 0, 5, 6, 20, 25, 9, 7, 8, 41, 19, + 33, 33, 12, 73, 25, 6, 13, 36, 13, 3, + 56, 13, 17, 3, 46, 41, 27, 77, 34, 56, + 58, 34, 4, 36, 28, 14, 16, 12, 6, 0, + 35, 43, 57, 35, 27, 51, 93, 49, 45, 47, + 39, 35, 15, 15, 43, 59, 25, 4, 2, 6, + 11, 45, 19, 25, 57, 35, 51, 49, 81, 75, + 65, 57, 21, 11, 51, 25, 31, 49, 61, 63, + 65, 67, 71, 79, 103, 91, 107, 105, 107, 81, + 101, 107, 21, 10, 3, 16, 24, 24, 124, 46, + 42, 9, 62, 124, 114, 124, 38, 30, 14, 57, + 107, 115, 125, 125, 125, 125, 5, 86, 54, 36, + 18, 28, 4, 5, 9, 7, 9, 2, 28, 12, + 48, 34, 18, 30, 34, 40, 42, 22, 48, 38, + 1, 34, 20, 7, 21, 33, 45, 101, 99, 81, + }, + + { + /* Context Tables for I, SI Slices :: qp = 44 */ + + 46, 8, 37, 46, 8, 37, 26, 46, 54, 24, + 13, 65, 3, 12, 104, 28, 94, 13, 42, 58, + 1, 37, 2, 65, 99, 7, 31, 125, 125, 125, + 124, 28, 11, 42, 58, 1, 11, 30, 38, 11, + 6, 1, 3, 15, 45, 17, 59, 10, 7, 5, + 7, 33, 17, 49, 32, 6, 0, 5, 22, 6, + 44, 0, 0, 0, 10, 67, 67, 22, 25, 13, + 104, 12, 11, 33, 5, 37, 18, 28, 22, 44, + 4, 36, 22, 14, 38, 25, 37, 29, 41, 66, + 2, 42, 9, 4, 15, 2, 38, 15, 33, 15, + 17, 36, 9, 14, 1, 19, 14, 9, 12, 22, + 12, 24, 42, 36, 39, 11, 6, 5, 4, 17, + 22, 15, 2, 38, 9, 32, 34, 12, 77, 20, + 11, 65, 27, 73, 17, 10, 3, 16, 4, 6, + 46, 10, 0, 24, 44, 17, 17, 6, 89, 33, + 10, 63, 1, 27, 38, 2, 47, 60, 2, 3, + 61, 66, 35, 69, 2, 83, 5, 26, 32, 20, + 26, 18, 1, 24, 36, 49, 15, 14, 6, 2, + 33, 15, 23, 33, 17, 31, 51, 21, 25, 105, + 55, 37, 67, 57, 51, 22, 40, 18, 12, 12, + 7, 21, 7, 37, 47, 31, 63, 85, 83, 125, + 25, 27, 59, 43, 57, 65, 75, 81, 91, 103, + 97, 125, 45, 95, 107, 75, 77, 2, 76, 54, + 36, 14, 10, 3, 5, 15, 5, 20, 100, 66, + 50, 34, 86, 34, 26, 6, 10, 8, 100, 62, + 36, 18, 46, 18, 6, 0, 0, 5, 68, 30, + 14, 0, 26, 6, 5, 
1, 14, 88, 52, 32, + 20, 56, 24, 20, 18, 10, 124, 25, 7, 12, + 5, 9, 2, 18, 20, 4, 20, 32, 44, 25, + 15, 25, 15, 42, 27, 29, 15, 2, 34, 55, + 35, 16, 26, 51, 57, 49, 15, 5, 22, 9, + 0, 3, 6, 22, 27, 9, 7, 8, 43, 19, + 33, 31, 14, 73, 25, 6, 13, 38, 13, 3, + 58, 13, 19, 3, 48, 43, 27, 79, 32, 56, + 56, 32, 2, 34, 26, 12, 14, 10, 4, 1, + 37, 45, 59, 39, 29, 53, 97, 51, 47, 49, + 41, 37, 15, 15, 43, 59, 25, 1, 3, 2, + 15, 51, 23, 29, 63, 39, 55, 53, 85, 79, + 69, 59, 25, 15, 53, 29, 35, 53, 65, 69, + 69, 71, 75, 83, 109, 95, 111, 109, 109, 85, + 105, 109, 19, 12, 1, 18, 26, 26, 124, 48, + 44, 9, 64, 124, 118, 124, 40, 26, 10, 63, + 113, 121, 125, 125, 125, 125, 5, 86, 54, 36, + 18, 28, 4, 5, 9, 7, 7, 4, 30, 12, + 50, 36, 18, 32, 36, 42, 44, 22, 48, 38, + 1, 32, 16, 11, 25, 37, 49, 105, 103, 83, + }, + + { + /* Context Tables for I, SI Slices :: qp = 45 */ + + 44, 8, 37, 44, 8, 37, 30, 48, 54, 24, + 13, 67, 5, 8, 104, 28, 100, 15, 44, 58, + 1, 37, 4, 67, 101, 9, 35, 125, 125, 125, + 124, 32, 11, 44, 58, 1, 11, 32, 38, 13, + 6, 1, 3, 15, 45, 15, 59, 10, 7, 3, + 9, 33, 17, 49, 32, 8, 2, 3, 22, 6, + 44, 0, 0, 0, 12, 67, 67, 22, 27, 13, + 104, 12, 11, 31, 3, 37, 18, 32, 24, 48, + 8, 38, 24, 18, 42, 23, 35, 27, 41, 68, + 2, 44, 7, 4, 13, 4, 42, 15, 33, 15, + 13, 38, 7, 14, 0, 19, 14, 9, 14, 24, + 14, 26, 42, 36, 39, 13, 6, 5, 4, 17, + 22, 15, 0, 38, 9, 32, 34, 10, 81, 20, + 11, 67, 29, 75, 15, 12, 1, 18, 6, 8, + 48, 12, 0, 26, 46, 17, 17, 8, 91, 31, + 12, 63, 0, 27, 38, 2, 49, 62, 2, 3, + 65, 68, 37, 71, 2, 83, 7, 24, 32, 18, + 24, 16, 3, 24, 36, 51, 17, 14, 6, 2, + 33, 15, 23, 35, 17, 33, 53, 21, 27, 109, + 57, 39, 69, 59, 51, 18, 38, 14, 8, 10, + 11, 27, 11, 41, 51, 35, 67, 91, 87, 125, + 29, 29, 63, 49, 61, 69, 79, 87, 95, 109, + 101, 125, 47, 97, 111, 75, 77, 2, 76, 56, + 36, 14, 10, 3, 5, 15, 3, 22, 102, 66, + 50, 34, 88, 36, 28, 6, 12, 10, 102, 64, + 36, 20, 48, 20, 6, 2, 2, 5, 72, 30, + 14, 2, 28, 8, 5, 0, 16, 90, 54, 32, + 20, 58, 26, 22, 20, 12, 124, 25, 7, 14, + 
5, 9, 2, 18, 20, 6, 20, 34, 46, 25, + 15, 25, 17, 42, 29, 31, 17, 2, 34, 57, + 37, 14, 26, 53, 59, 51, 13, 5, 24, 9, + 2, 3, 8, 24, 27, 9, 7, 8, 45, 19, + 33, 31, 16, 73, 25, 8, 13, 38, 13, 3, + 60, 13, 19, 3, 50, 43, 29, 79, 30, 56, + 56, 32, 0, 32, 24, 10, 14, 8, 2, 3, + 39, 47, 61, 41, 33, 57, 101, 53, 49, 51, + 43, 37, 15, 13, 43, 61, 23, 5, 7, 1, + 21, 55, 27, 33, 67, 43, 59, 57, 89, 83, + 71, 61, 29, 19, 55, 33, 39, 57, 69, 73, + 73, 75, 79, 87, 113, 99, 115, 113, 111, 89, + 109, 113, 17, 16, 0, 18, 28, 28, 124, 50, + 46, 9, 66, 124, 120, 124, 40, 24, 6, 67, + 119, 125, 125, 125, 125, 125, 5, 86, 54, 36, + 18, 28, 4, 5, 9, 5, 5, 6, 32, 14, + 52, 38, 20, 34, 38, 44, 46, 24, 50, 40, + 1, 28, 14, 13, 27, 41, 53, 109, 107, 85, + }, + + { + /* Context Tables for I, SI Slices :: qp = 46 */ + + 42, 8, 37, 42, 8, 37, 34, 52, 56, 24, + 15, 71, 9, 6, 104, 28, 104, 15, 46, 60, + 1, 39, 4, 69, 103, 9, 39, 125, 125, 125, + 124, 34, 11, 46, 60, 1, 9, 34, 40, 13, + 6, 0, 1, 17, 47, 15, 59, 10, 7, 3, + 9, 33, 17, 49, 34, 8, 2, 3, 24, 6, + 44, 0, 0, 0, 12, 67, 67, 24, 29, 13, + 104, 12, 11, 29, 1, 37, 20, 34, 28, 52, + 10, 40, 28, 20, 46, 21, 33, 25, 39, 70, + 2, 46, 5, 4, 13, 4, 46, 15, 31, 13, + 11, 38, 7, 16, 4, 17, 16, 9, 14, 24, + 14, 28, 44, 38, 41, 13, 6, 5, 4, 17, + 22, 17, 0, 38, 11, 32, 34, 10, 85, 20, + 13, 71, 31, 77, 13, 14, 0, 20, 8, 8, + 50, 12, 2, 28, 48, 17, 17, 8, 93, 31, + 14, 63, 0, 27, 40, 2, 51, 64, 2, 3, + 67, 70, 37, 73, 2, 85, 11, 22, 30, 16, + 22, 14, 3, 22, 34, 53, 19, 14, 6, 2, + 35, 17, 25, 37, 19, 35, 55, 23, 27, 113, + 59, 39, 71, 61, 51, 16, 34, 12, 4, 6, + 15, 31, 15, 47, 57, 39, 73, 95, 93, 125, + 33, 33, 65, 53, 65, 75, 85, 91, 101, 113, + 105, 125, 49, 101, 113, 77, 77, 2, 78, 56, + 36, 14, 10, 3, 5, 15, 3, 22, 102, 66, + 50, 34, 90, 36, 28, 8, 12, 12, 104, 64, + 38, 20, 50, 20, 8, 2, 4, 3, 74, 32, + 16, 2, 30, 10, 3, 2, 16, 90, 54, 32, + 20, 60, 26, 24, 22, 14, 124, 23, 5, 14, + 5, 9, 2, 18, 20, 6, 20, 34, 46, 25, + 15, 25, 17, 44, 31, 33, 
19, 0, 34, 59, + 39, 14, 26, 53, 61, 51, 11, 3, 26, 7, + 2, 1, 8, 26, 27, 9, 7, 10, 45, 19, + 33, 29, 18, 73, 25, 8, 13, 40, 13, 3, + 62, 13, 19, 3, 52, 45, 29, 81, 28, 54, + 54, 30, 1, 30, 22, 8, 12, 6, 0, 5, + 41, 49, 63, 45, 35, 59, 105, 57, 51, 53, + 43, 39, 15, 13, 43, 61, 21, 9, 11, 5, + 25, 61, 31, 37, 73, 47, 63, 61, 95, 87, + 75, 63, 33, 21, 59, 37, 43, 61, 73, 77, + 77, 79, 83, 91, 119, 103, 119, 117, 113, 93, + 113, 115, 15, 18, 2, 20, 30, 30, 124, 52, + 48, 9, 68, 124, 124, 124, 42, 20, 4, 73, + 125, 125, 125, 125, 125, 125, 5, 86, 54, 36, + 18, 28, 4, 5, 9, 5, 3, 8, 34, 16, + 54, 38, 20, 36, 38, 44, 46, 24, 50, 40, + 1, 26, 10, 17, 31, 45, 57, 113, 111, 87, + }, + + { + /* Context Tables for I, SI Slices :: qp = 47 */ + + 40, 8, 37, 40, 8, 37, 38, 54, 56, 24, + 15, 73, 11, 4, 104, 28, 108, 15, 48, 62, + 1, 41, 4, 71, 105, 11, 43, 125, 125, 125, + 124, 36, 11, 48, 62, 1, 9, 36, 40, 15, + 6, 0, 0, 17, 47, 15, 59, 10, 7, 1, + 9, 33, 17, 49, 34, 8, 2, 3, 24, 6, + 44, 0, 0, 0, 14, 67, 67, 24, 31, 13, + 104, 12, 11, 27, 0, 37, 20, 36, 32, 56, + 14, 42, 32, 22, 50, 19, 31, 23, 37, 72, + 2, 48, 3, 4, 11, 6, 50, 15, 31, 13, + 9, 38, 7, 16, 6, 17, 18, 9, 14, 24, + 14, 30, 44, 38, 41, 13, 6, 5, 4, 17, + 22, 17, 1, 38, 13, 32, 34, 10, 89, 20, + 13, 75, 33, 79, 11, 16, 2, 22, 10, 10, + 52, 14, 2, 30, 50, 17, 17, 8, 95, 29, + 16, 63, 0, 27, 42, 2, 53, 66, 2, 3, + 69, 72, 39, 75, 2, 87, 13, 20, 30, 14, + 20, 12, 5, 20, 34, 55, 21, 14, 6, 2, + 35, 19, 27, 39, 21, 37, 57, 25, 29, 117, + 61, 41, 73, 63, 51, 12, 32, 8, 0, 2, + 19, 35, 19, 51, 61, 43, 77, 101, 97, 125, + 37, 37, 69, 57, 69, 79, 89, 95, 105, 119, + 109, 125, 51, 105, 115, 79, 77, 2, 78, 56, + 36, 14, 10, 3, 5, 15, 1, 22, 102, 66, + 50, 34, 92, 38, 30, 8, 14, 14, 106, 66, + 38, 20, 52, 22, 8, 4, 6, 1, 76, 34, + 16, 2, 32, 12, 1, 4, 18, 90, 54, 32, + 20, 62, 28, 26, 24, 16, 124, 23, 5, 16, + 5, 9, 2, 18, 20, 6, 20, 34, 48, 25, + 15, 25, 19, 44, 33, 35, 21, 1, 34, 61, + 41, 14, 26, 55, 63, 53, 9, 1, 28, 7, + 
4, 1, 10, 28, 27, 9, 7, 10, 47, 19, + 33, 29, 20, 73, 25, 8, 13, 42, 13, 3, + 64, 13, 19, 3, 54, 47, 29, 83, 26, 54, + 54, 28, 3, 28, 20, 6, 10, 4, 1, 7, + 43, 51, 65, 47, 37, 63, 109, 59, 53, 55, + 45, 39, 15, 13, 43, 63, 19, 13, 15, 9, + 29, 67, 35, 41, 77, 51, 67, 65, 99, 91, + 77, 65, 37, 25, 61, 41, 47, 65, 77, 81, + 81, 83, 87, 95, 123, 107, 123, 121, 115, 97, + 117, 119, 13, 20, 4, 22, 32, 32, 124, 54, + 50, 9, 70, 124, 124, 124, 44, 18, 0, 77, + 125, 125, 125, 125, 125, 125, 5, 86, 54, 36, + 18, 28, 4, 5, 9, 3, 1, 10, 36, 18, + 56, 40, 22, 38, 40, 46, 48, 24, 52, 40, + 1, 24, 8, 21, 35, 49, 61, 117, 115, 89, + }, + + { + /* Context Tables for I, SI Slices :: qp = 48 */ + + 36, 6, 39, 36, 6, 39, 40, 56, 56, 24, + 17, 77, 15, 0, 102, 28, 112, 17, 48, 62, + 3, 43, 4, 75, 109, 13, 47, 125, 125, 125, + 124, 38, 13, 48, 62, 3, 9, 38, 40, 17, + 6, 0, 0, 19, 49, 15, 61, 10, 7, 1, + 11, 35, 17, 49, 34, 8, 2, 3, 24, 4, + 44, 0, 0, 0, 14, 69, 67, 24, 33, 15, + 104, 10, 11, 25, 0, 37, 20, 38, 34, 58, + 16, 44, 34, 24, 52, 17, 31, 21, 37, 72, + 2, 48, 3, 4, 11, 6, 52, 15, 31, 13, + 7, 38, 7, 16, 8, 17, 18, 9, 14, 24, + 14, 30, 44, 38, 43, 15, 6, 7, 4, 17, + 20, 19, 3, 38, 15, 30, 34, 8, 93, 18, + 15, 79, 35, 81, 11, 16, 2, 22, 10, 10, + 54, 14, 2, 30, 52, 19, 17, 8, 99, 29, + 16, 63, 0, 29, 42, 2, 55, 66, 2, 3, + 73, 72, 41, 77, 2, 89, 17, 18, 28, 12, + 18, 10, 7, 18, 32, 59, 23, 12, 4, 2, + 37, 21, 29, 41, 23, 39, 59, 27, 31, 121, + 65, 43, 77, 67, 51, 8, 28, 4, 3, 1, + 25, 41, 25, 57, 67, 49, 83, 107, 103, 125, + 41, 41, 73, 63, 75, 85, 95, 101, 111, 125, + 115, 125, 53, 109, 119, 81, 79, 2, 78, 56, + 36, 14, 10, 5, 5, 15, 1, 22, 102, 66, + 50, 34, 94, 38, 30, 8, 14, 14, 106, 66, + 38, 20, 52, 22, 8, 4, 8, 1, 78, 34, + 16, 2, 32, 12, 1, 6, 18, 90, 54, 32, + 20, 62, 28, 26, 24, 16, 124, 23, 5, 16, + 5, 9, 2, 18, 20, 6, 20, 34, 48, 25, + 15, 27, 21, 44, 35, 37, 23, 3, 32, 65, + 45, 12, 24, 57, 67, 55, 9, 1, 30, 7, + 4, 1, 10, 28, 29, 9, 7, 10, 49, 19, + 33, 29, 20, 75, 
25, 8, 13, 42, 15, 3, + 64, 15, 21, 3, 54, 49, 31, 85, 24, 52, + 52, 26, 7, 26, 18, 4, 8, 2, 3, 9, + 45, 55, 69, 51, 41, 67, 113, 63, 57, 57, + 47, 41, 15, 13, 43, 65, 19, 19, 21, 13, + 35, 73, 41, 47, 83, 57, 73, 71, 105, 97, + 81, 67, 41, 29, 65, 45, 53, 71, 83, 87, + 87, 87, 93, 101, 125, 111, 125, 125, 119, 101, + 121, 123, 11, 22, 6, 22, 32, 32, 124, 54, + 50, 9, 70, 124, 124, 124, 44, 14, 3, 83, + 125, 125, 125, 125, 125, 125, 5, 86, 52, 34, + 18, 28, 4, 7, 9, 3, 1, 10, 36, 18, + 58, 40, 22, 38, 40, 46, 48, 24, 52, 40, + 3, 20, 4, 25, 39, 53, 65, 123, 119, 91, + }, + + { + /* Context Tables for I, SI Slices :: qp = 49 */ + + 34, 6, 39, 34, 6, 39, 44, 60, 58, 26, + 17, 79, 17, 1, 102, 28, 118, 17, 50, 64, + 3, 43, 6, 77, 111, 13, 49, 125, 125, 125, + 124, 42, 13, 50, 64, 3, 7, 42, 42, 17, + 8, 2, 2, 19, 49, 13, 61, 10, 5, 0, + 11, 35, 15, 49, 36, 10, 4, 1, 26, 4, + 44, 0, 0, 0, 16, 69, 67, 26, 33, 15, + 104, 10, 11, 21, 2, 37, 22, 42, 38, 62, + 20, 48, 38, 28, 56, 13, 29, 19, 35, 74, + 4, 50, 1, 6, 9, 8, 56, 13, 29, 11, + 3, 40, 5, 18, 12, 15, 20, 7, 16, 26, + 16, 32, 46, 40, 43, 15, 8, 7, 4, 15, + 20, 19, 3, 38, 15, 30, 34, 8, 95, 18, + 15, 81, 37, 81, 9, 18, 4, 24, 12, 12, + 56, 16, 4, 32, 56, 19, 17, 10, 101, 27, + 18, 63, 2, 29, 44, 4, 55, 68, 2, 3, + 75, 74, 41, 79, 2, 89, 19, 18, 28, 12, + 18, 10, 7, 18, 32, 61, 23, 12, 4, 2, + 37, 21, 29, 41, 23, 39, 61, 27, 31, 123, + 67, 43, 79, 69, 51, 6, 26, 2, 5, 3, + 29, 45, 29, 61, 71, 53, 87, 111, 107, 125, + 43, 43, 75, 67, 79, 89, 99, 105, 115, 125, + 119, 125, 53, 111, 121, 81, 79, 4, 80, 58, + 38, 16, 10, 5, 3, 13, 0, 24, 104, 68, + 52, 34, 96, 40, 32, 10, 16, 16, 108, 68, + 40, 22, 54, 24, 10, 6, 12, 0, 82, 36, + 18, 4, 34, 14, 0, 10, 20, 92, 56, 34, + 22, 64, 30, 28, 26, 18, 124, 21, 3, 18, + 5, 7, 2, 20, 22, 8, 22, 36, 50, 23, + 15, 27, 21, 46, 35, 37, 25, 3, 32, 67, + 47, 12, 24, 57, 69, 55, 7, 0, 32, 5, + 6, 0, 12, 30, 29, 7, 5, 12, 49, 17, + 31, 27, 22, 75, 25, 10, 13, 44, 15, 3, + 66, 15, 21, 1, 56, 
49, 31, 85, 24, 52, + 52, 26, 9, 26, 18, 4, 8, 2, 3, 9, + 45, 57, 71, 53, 43, 69, 115, 65, 59, 57, + 47, 41, 13, 11, 41, 65, 17, 23, 25, 17, + 39, 77, 45, 51, 87, 61, 77, 75, 109, 101, + 83, 67, 43, 31, 67, 47, 57, 75, 87, 91, + 91, 91, 97, 105, 125, 113, 125, 125, 121, 103, + 123, 125, 7, 26, 10, 24, 34, 34, 124, 56, + 52, 9, 72, 124, 124, 124, 46, 12, 5, 87, + 125, 125, 125, 125, 125, 125, 3, 88, 52, 34, + 18, 30, 4, 7, 9, 1, 0, 12, 38, 20, + 62, 42, 24, 40, 42, 48, 50, 26, 54, 42, + 3, 18, 2, 27, 41, 55, 67, 125, 121, 91, + }, + + { + /* Context Tables for I, SI Slices :: qp = 50 */ + + 32, 6, 39, 32, 6, 39, 48, 62, 58, 26, + 17, 81, 19, 3, 102, 28, 122, 17, 52, 66, + 3, 45, 6, 79, 113, 15, 53, 125, 125, 125, + 124, 44, 13, 52, 66, 3, 7, 44, 42, 19, + 8, 2, 4, 19, 49, 13, 61, 10, 5, 2, + 11, 35, 15, 49, 36, 10, 4, 1, 26, 4, + 44, 0, 0, 0, 18, 69, 67, 26, 35, 15, + 104, 10, 11, 19, 4, 37, 22, 44, 42, 66, + 22, 50, 42, 30, 60, 11, 27, 17, 33, 76, + 4, 52, 0, 6, 9, 10, 60, 13, 29, 11, + 1, 40, 5, 18, 14, 15, 22, 7, 16, 26, + 16, 34, 46, 40, 45, 15, 8, 7, 4, 15, + 20, 19, 5, 38, 17, 30, 34, 8, 99, 18, + 15, 85, 39, 83, 7, 20, 6, 26, 14, 14, + 58, 18, 4, 34, 58, 19, 17, 10, 103, 27, + 20, 63, 2, 29, 46, 4, 57, 70, 2, 3, + 77, 76, 43, 81, 2, 91, 23, 16, 26, 10, + 16, 8, 9, 16, 32, 63, 25, 12, 4, 2, + 37, 23, 31, 43, 25, 41, 63, 29, 33, 125, + 69, 45, 81, 71, 51, 2, 24, 1, 9, 7, + 33, 49, 33, 67, 75, 57, 93, 117, 111, 125, + 47, 47, 79, 71, 83, 93, 103, 109, 119, 125, + 123, 125, 55, 115, 123, 83, 79, 4, 80, 58, + 38, 16, 10, 5, 3, 13, 2, 24, 104, 68, + 52, 34, 98, 40, 34, 10, 18, 18, 110, 70, + 40, 22, 56, 24, 10, 8, 14, 2, 84, 38, + 18, 4, 36, 16, 2, 12, 20, 92, 56, 34, + 22, 66, 32, 30, 28, 20, 124, 21, 3, 18, + 5, 7, 2, 20, 22, 8, 22, 36, 52, 23, + 15, 27, 23, 46, 37, 39, 27, 5, 32, 69, + 49, 12, 24, 59, 71, 57, 5, 2, 34, 5, + 6, 0, 14, 32, 29, 7, 5, 12, 51, 17, + 31, 27, 24, 75, 25, 10, 13, 46, 15, 3, + 68, 15, 21, 1, 58, 51, 31, 87, 22, 52, + 52, 24, 11, 24, 16, 
2, 6, 0, 5, 11, + 47, 59, 73, 57, 45, 73, 119, 67, 61, 59, + 49, 41, 13, 11, 41, 67, 15, 27, 29, 21, + 43, 83, 49, 55, 93, 65, 81, 79, 113, 105, + 85, 69, 47, 35, 69, 51, 61, 79, 91, 95, + 95, 95, 101, 109, 125, 117, 125, 125, 123, 107, + 125, 125, 5, 28, 12, 26, 36, 36, 124, 58, + 54, 9, 74, 124, 124, 124, 48, 10, 9, 93, + 125, 125, 125, 125, 125, 125, 3, 88, 52, 34, + 18, 30, 4, 7, 9, 0, 2, 14, 40, 22, + 64, 44, 26, 42, 44, 50, 52, 26, 56, 42, + 3, 16, 1, 31, 45, 59, 71, 125, 125, 93, + }, + + { + /* Context Tables for I, SI Slices :: qp = 51 */ + + 30, 6, 39, 30, 6, 39, 52, 66, 60, 26, + 19, 85, 23, 5, 102, 28, 124, 17, 54, 68, + 3, 47, 6, 81, 115, 15, 57, 125, 125, 125, + 124, 46, 13, 54, 68, 3, 5, 46, 44, 19, + 8, 4, 6, 21, 51, 13, 61, 10, 5, 2, + 11, 35, 15, 49, 38, 10, 4, 1, 28, 4, + 44, 0, 0, 0, 18, 69, 67, 28, 37, 15, + 104, 10, 11, 17, 6, 37, 24, 46, 46, 70, + 26, 52, 46, 32, 64, 9, 25, 15, 31, 78, + 4, 54, 2, 6, 7, 10, 64, 13, 27, 9, + 0, 40, 5, 20, 18, 13, 24, 7, 16, 26, + 16, 36, 48, 42, 45, 15, 8, 7, 4, 15, + 20, 21, 5, 38, 19, 30, 34, 8, 103, 18, + 17, 89, 41, 85, 5, 22, 8, 28, 16, 14, + 60, 18, 6, 36, 60, 19, 17, 10, 105, 25, + 22, 63, 2, 29, 48, 4, 59, 72, 2, 3, + 79, 78, 43, 83, 2, 93, 25, 14, 26, 8, + 14, 6, 9, 14, 30, 65, 27, 12, 4, 2, + 39, 25, 33, 45, 27, 43, 65, 31, 33, 125, + 71, 45, 83, 73, 51, 0, 20, 3, 13, 11, + 37, 53, 37, 71, 81, 61, 97, 121, 117, 125, + 51, 51, 81, 75, 87, 99, 109, 113, 125, 125, + 125, 125, 57, 119, 125, 85, 79, 4, 82, 58, + 38, 16, 10, 5, 3, 13, 2, 24, 104, 68, + 52, 34, 100, 42, 34, 12, 18, 20, 112, 70, + 42, 22, 58, 26, 12, 8, 16, 4, 86, 40, + 20, 4, 38, 18, 4, 14, 22, 92, 56, 34, + 22, 68, 32, 32, 30, 22, 124, 19, 1, 20, + 5, 7, 2, 20, 22, 8, 22, 36, 52, 23, + 15, 27, 23, 48, 39, 41, 29, 7, 32, 71, + 51, 12, 24, 59, 73, 57, 3, 4, 36, 3, + 8, 2, 14, 34, 29, 7, 5, 14, 51, 17, + 31, 25, 26, 75, 25, 10, 13, 48, 15, 3, + 70, 15, 21, 1, 60, 53, 31, 89, 20, 50, + 50, 22, 13, 22, 14, 0, 4, 1, 7, 13, + 49, 61, 75, 59, 47, 
75, 123, 71, 63, 61, + 49, 43, 13, 11, 41, 67, 13, 31, 33, 25, + 47, 89, 53, 59, 97, 69, 85, 83, 119, 109, + 89, 71, 51, 37, 73, 55, 65, 83, 95, 99, + 99, 99, 105, 113, 125, 121, 125, 125, 125, 111, + 125, 125, 3, 30, 14, 28, 38, 38, 124, 60, + 56, 9, 76, 124, 124, 124, 50, 6, 11, 97, + 125, 125, 125, 125, 125, 125, 3, 88, 52, 34, + 18, 30, 4, 7, 9, 0, 4, 16, 42, 24, + 66, 44, 26, 44, 44, 50, 52, 26, 56, 42, + 3, 14, 3, 35, 49, 63, 75, 125, 125, 95, + }, + + }, + +}; diff --git a/common/ih264_cabac_tables.h b/common/ih264_cabac_tables.h new file mode 100755 index 0000000..0cef51e --- /dev/null +++ b/common/ih264_cabac_tables.h @@ -0,0 +1,101 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** +****************************************************************************** +* @file ih264_cabac_tables.h +* +* @brief +* This file contains enumerations, macros and extern declarations of H264 +* cabac tables +* +* @author +* Ittiam +* +* @remarks +* none +****************************************************************************** +*/ + +#ifndef IH264_CABAC_TABLES_H_ +#define IH264_CABAC_TABLES_H_ + +/*****************************************************************************/ +/* Constant Macros */ +/*****************************************************************************/ + +/** +****************************************************************************** + * @brief maximum range of cabac_init_idc (0-2) +****************************************************************************** + */ +#define IH264_NUM_CABAC_INIT_IDC_PLUS_ONE 4 + +/** +****************************************************************************** + * @brief max range of qps in H264 (0-51) +****************************************************************************** + */ +#define IH264_MAX_QP 52 + +/** +****************************************************************************** + * @brief max range of cabac contexts in H264 (0-459) +****************************************************************************** + */ +#define IH264_NUM_CABAC_CTXTS 460 + +/*****************************************************************************/ +/* Extern global declarations */ +/*****************************************************************************/ + +/** + ****************************************************************************** + * @breif Table for rangeTabLPS depending on pStateIdx and qCodIRangeIdx + * input : pStateIdx(0-63) and qCodIRangeIdx(0-3) [(Range >> 6) & 0x3] + * output : RLps + * + * @remarks See Table 9-35 of H264 spec for rangeTabLPS + ******************************************************************************* + */ +extern 
const UWORD8 gau1_ih264_cabac_rlps[64][4]; + + +/** + ****************************************************************************** + * @breif probability+MPS state transition tables based on cur State and bin + * input : curpState[bits7-2] | curMPS[bit1] | decodedBin[bit0] + * output : nextpState[bits6-1] | nextMPS[bit0] + * @remarks Modified form of Table-9-36 State Transition table in H264 spec + ****************************************************************************** + */ +extern const UWORD8 gau1_ih264_next_state[128*2]; + + +/** + ****************************************************************************** + * @brief Init context tables for all combinations of qp and cabac_init_idc + * @remarks Packing format MPS in lsb and pState in bits[1-6] + ****************************************************************************** + */ +extern const UWORD8 gau1_ih264_cab_ctxts[IH264_NUM_CABAC_INIT_IDC_PLUS_ONE][IH264_MAX_QP][IH264_NUM_CABAC_CTXTS]; + + +#endif /* IH264_CABAC_TABLES_H_ */ diff --git a/common/ih264_cavlc_tables.c b/common/ih264_cavlc_tables.c new file mode 100755 index 0000000..f122ab9 --- /dev/null +++ b/common/ih264_cavlc_tables.c @@ -0,0 +1,282 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + + +/** +****************************************************************************** +* @file +* ih264_cavlc_tables.c +* +* @brief +* This file contains H264 cavlc tables for encoding coeff_tokens, levels, total +* zeros and runs before zeros +* +* @author +* Ittiam +* +* @par List of Tables +* - gu1_code_coeff_token_table +* - gu1_size_coeff_token_table +* - gu1_code_coeff_token_table_chroma +* - gu1_size_coeff_token_table_chroma +* - gu1_threshold_vlc_level +* - gu1_size_zero_table +* - gu1_code_zero_table +* - gu1_size_zero_table_chroma +* - gu1_code_zero_table_chroma +* - gu1_index_zero_table +* - gu1_size_run_table +* - gu1_code_run_table +* - gu4_codeword_level_tables +* - gu1_codesize_level_tables +* +* @remarks +* none +* +****************************************************************************** +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_cavlc_tables.h" + + +/*****************************************************************************/ +/* Extern global definitions */ +/*****************************************************************************/ + +/** + ****************************************************************************** + * @brief Assignment of cbp to a codenum for intra and inter prediction modes + * chroma format idc != 0 + * input : cbp, intra - 0/inter - 1 + * output : codenum + * @remarks Table 9-4 – Assignment of codeNum to values of coded_block_pattern + * for macroblock prediction modes in H264 spec + ****************************************************************************** + */ +const UWORD8 gu1_cbp_map_tables[48][2]= +{ + { 3, 0}, {29, 2}, 
{30, 3}, {17, 7}, {31, 4}, {18, 8}, {37, 17}, { 8, 13}, + {32, 5}, {38, 18}, {19, 9}, { 9, 14}, {20, 10}, {10, 15}, {11, 16}, { 2, 11}, + {16, 1}, {33, 32}, {34, 33}, {21, 36}, {35, 34}, {22, 37}, {39, 44}, { 4, 40}, + {36, 35}, {40, 45}, {23, 38}, { 5, 41}, {24, 39}, { 6, 42}, { 7, 43}, { 1, 19}, + {41, 6}, {42, 24}, {43, 25}, {25, 20}, {44, 26}, {26, 21}, {46, 46}, {12, 28}, + {45, 27}, {47, 47}, {27, 22}, {13, 29}, {28, 23}, {14, 30}, {15, 31}, { 0, 12}, +}; + + +/** + ****************************************************************************** + * @brief total non-zero coefficients and numbers of trailing ones of a residual + * block are mapped to coeff_token using the tables given below. + * input : VLC-Num | Trailing ones | Total coeffs + * output : coeff_token (code word, size of the code word) + * @remarks Table-9-5 coeff_token mapping to TotalCoeff( coeff_token ) + * and TrailingOnes( coeff_token ) in H264 spec + ****************************************************************************** + */ +const UWORD8 gu1_code_coeff_token_table[3][4][16] = +{ + { + { 5, 7, 7, 7, 7, 15, 11, 8, 15, 11, 15, 11, 15, 11, 7, 4, }, + { 1, 4, 6, 6, 6, 6, 14, 10, 14, 10, 14, 10, 1, 14, 10, 6, }, + { 0, 1, 5, 5, 5, 5, 5, 13, 9, 13, 9, 13, 9, 13, 9, 5, }, + { 0, 0, 3, 3, 4, 4, 4, 4, 4, 12, 12, 8, 12, 8, 12, 8, }, + }, + { + {11, 7, 7, 7, 4, 7, 15, 11, 15, 11, 8, 15, 11, 7, 9, 7, }, + { 2, 7, 10, 6, 6, 6, 6, 14, 10, 14, 10, 14, 10, 11, 8, 6, }, + { 0, 3, 9, 5, 5, 5, 5, 13, 9, 13, 9, 13, 9, 6, 10, 5, }, + { 0, 0, 5, 4, 6, 8, 4, 4, 4, 12, 8, 12, 12, 8, 1, 4, }, + }, + { + {15, 11, 8, 15, 11, 9, 8, 15, 11, 15, 11, 8, 13, 9, 5, 1, }, + {14, 15, 12, 10, 8, 14, 10, 14, 14, 10, 14, 10, 7, 12, 8, 4, }, + { 0, 13, 14, 11, 9, 13, 9, 13, 10, 13, 9, 13, 9, 11, 7, 3, }, + { 0, 0, 12, 11, 10, 9, 8, 13, 12, 12, 12, 8, 12, 10, 6, 2, }, + }, +}; + +const UWORD8 gu1_size_coeff_token_table[3][4][16] = +{ + { + { 6, 8, 9, 10, 11, 13, 13, 13, 14, 14, 15, 15, 16, 16, 16, 16, }, + { 2, 6, 8, 9, 
10, 11, 13, 13, 14, 14, 15, 15, 15, 16, 16, 16, }, + { 0, 3, 7, 8, 9, 10, 11, 13, 13, 14, 14, 15, 15, 16, 16, 16, }, + { 0, 0, 5, 6, 7, 8, 9, 10, 11, 13, 14, 14, 15, 15, 16, 16, }, + }, + { + { 6, 6, 7, 8, 8, 9, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, }, + { 2, 5, 6, 6, 7, 8, 9, 11, 11, 12, 12, 13, 13, 14, 14, 14, }, + { 0, 3, 6, 6, 7, 8, 9, 11, 11, 12, 12, 13, 13, 13, 14, 14, }, + { 0, 0, 4, 4, 5, 6, 6, 7, 9, 11, 11, 12, 13, 13, 13, 14, }, + }, + { + { 6, 6, 6, 7, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 10, }, + { 4, 5, 5, 5, 5, 6, 6, 7, 8, 8, 9, 9, 9, 10, 10, 10, }, + { 0, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 10, }, + { 0, 0, 4, 4, 4, 4, 4, 5, 6, 7, 8, 8, 9, 10, 10, 10, }, + }, +}; +const UWORD8 gu1_code_coeff_token_table_chroma[4][4] = +{ + { 7, 4, 3, 2, }, + { 1, 6, 3, 3, }, + { 0, 1, 2, 2, }, + { 0, 0, 5, 0, }, +}; + +const UWORD8 gu1_size_coeff_token_table_chroma[4][4] = +{ + { 6, 6, 6, 6, }, + { 1, 6, 7, 8, }, + { 0, 3, 7, 8, }, + { 0, 0, 6, 7, }, +}; + +/** + ****************************************************************************** + * @brief After encoding the current Level, to encode the next level, the choice + * of VLC table needs to be updated. The update is carried basing on a set of thresholds. + * These thresholds are listed in the table below for lookup. 
+ * input : suffix_length + * output : threshold + ****************************************************************************** + */ +const UWORD8 gu1_threshold_vlc_level[6] = +{ + 0, 3, 6, 12, 24, 48 +}; + + +/** + ****************************************************************************** + * @brief table for encoding total number of zeros + * input : coeff_token, total zeros + * output : code word, size of the code word + * @remarks Table-9-7, 9-8 total_zeros tables for 4x4 blocks with + * TotalCoeff( coeff_token ) in H264 spec + ****************************************************************************** + */ +const UWORD8 gu1_size_zero_table[135] = +{ + 1, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 9, + 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, + 4, 3, 3, 3, 4, 4, 3, 3, 4, 5, 5, 6, 5, 6, + 5, 3, 4, 4, 3, 3, 3, 4, 3, 4, 5, 5, 5, + 4, 4, 4, 3, 3, 3, 3, 3, 4, 5, 4, 5, + 6, 5, 3, 3, 3, 3, 3, 3, 4, 3, 6, + 6, 5, 3, 3, 3, 2, 3, 4, 3, 6, + 6, 4, 5, 3, 2, 2, 3, 3, 6, + 6, 6, 4, 2, 2, 3, 2, 5, + 5, 5, 3, 2, 2, 2, 4, + 4, 4, 3, 3, 1, 3, + 4, 4, 2, 1, 3, + 3, 3, 1, 2, + 2, 2, 1, + 1, 1, +}; +const UWORD8 gu1_code_zero_table[135] = +{ + 1, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 1, + 7, 6, 5, 4, 3, 5, 4, 3, 2, 3, 2, 3, 2, 1, 0, + 5, 7, 6, 5, 4, 3, 4, 3, 2, 3, 2, 1, 1, 0, + 3, 7, 5, 4, 6, 5, 4, 3, 3, 2, 2, 1, 0, + 5, 4, 3, 7, 6, 5, 4, 3, 2, 1, 1, 0, + 1, 1, 7, 6, 5, 4, 3, 2, 1, 1, 0, + 1, 1, 5, 4, 3, 3, 2, 1, 1, 0, + 1, 1, 1, 3, 3, 2, 2, 1, 0, + 1, 0, 1, 3, 2, 1, 1, 1, + 1, 0, 1, 3, 2, 1, 1, + 0, 1, 1, 2, 1, 3, + 0, 1, 1, 1, 1, + 0, 1, 1, 1, + 0, 1, 1, + 0, 1, +}; +const UWORD8 gu1_size_zero_table_chroma[9] = +{ + 1, 2, 3, 3, + 1, 2, 2, + 1, 1, +}; +const UWORD8 gu1_code_zero_table_chroma[9] = +{ + 1, 1, 1, 0, + 1, 1, 0, + 1, 0, +}; + +/** + ****************************************************************************** + * @brief index to access zero table (look up) + * input : TotalCoeff( coeff_token ) + * output : index to access zero table + 
****************************************************************************** + */ +const UWORD8 gu1_index_zero_table[15] = +{ + 0, 16, 31, 45, 58, 70, 81, 91, 100, 108, 115, 121, 126, 130, 133, +}; + +/** + ****************************************************************************** + * @brief table for encoding runs of zeros before + * input : zeros left, runs of zeros before + * output : code word, size of the code word + * @remarks Table-9-10 table for run_before in H264 spec + ****************************************************************************** + */ +const UWORD8 gu1_size_run_table[42] = +{ + 1, 1, + 1, 2, 2, + 2, 2, 2, 2, + 2, 2, 2, 3, 3, + 2, 2, 3, 3, 3, 3, + 2, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, +}; +const UWORD8 gu1_code_run_table[42] = +{ + 1, 0, + 1, 1, 0, + 3, 2, 1, 0, + 3, 2, 1, 1, 0, + 3, 2, 3, 2, 1, 0, + 3, 0, 1, 3, 2, 5, 4, + 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; +/** + ****************************************************************************** + * @brief index to access zero table (look up) + * input : TotalCoeff( coeff_token ) + * output : index to access zero table + ****************************************************************************** + */ +const UWORD8 gu1_index_run_table[7] = +{ + 0, 2, 5, 9, 14, 20, 27, +}; diff --git a/common/ih264_cavlc_tables.h b/common/ih264_cavlc_tables.h new file mode 100755 index 0000000..78057b5 --- /dev/null +++ b/common/ih264_cavlc_tables.h @@ -0,0 +1,133 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +****************************************************************************** +* @file ih264_cavlc_tables.h +* +* @brief +* This file contains enumerations, macros and extern declarations of H264 +* cavlc tables +* +* @author +* Ittiam +* +* @remarks +* none +****************************************************************************** +*/ + +#ifndef IH264_CAVLC_TABLES_H_ +#define IH264_CAVLC_TABLES_H_ + +/*****************************************************************************/ +/* Constant Macros */ +/*****************************************************************************/ +/** +****************************************************************************** + * @brief maximum zeros left +****************************************************************************** + */ +#define MAX_ZERO_LEFT 6 + +/*****************************************************************************/ +/* Extern global declarations */ +/*****************************************************************************/ + +/** + ****************************************************************************** + * @brief Assignment of cbp to a codenum for intra and inter prediction modes + * chroma format idc != 0 + * input : cbp, intra - 0/inter - 1 + * output : codenum + * @remarks Table 9-4 – Assignment of codeNum to values of coded_block_pattern + * for macroblock prediction modes in H264 spec 
+ ****************************************************************************** + */ +extern const UWORD8 gu1_cbp_map_tables[48][2]; + +/** + ****************************************************************************** + * @brief total non-zero coefficients and numbers of trailing ones of a residual + * block are mapped to coefftoken using the tables given below. + * input : VLC-Num | Trailing ones | Total coeffs + * output : coeff_token (code word, size of the code word) + * @remarks Table-9-5 coeff_token mapping to TotalCoeff( coeff_token ) + * and TrailingOnes( coeff_token ) in H264 spec + ****************************************************************************** + */ +extern const UWORD8 gu1_code_coeff_token_table[3][4][16]; +extern const UWORD8 gu1_size_coeff_token_table[3][4][16]; +extern const UWORD8 gu1_code_coeff_token_table_chroma[4][4]; +extern const UWORD8 gu1_size_coeff_token_table_chroma[4][4]; + +/** + ****************************************************************************** + * @brief Thresholds for determining whether to increment Level table number. 
+ * input : suffix_length + * output : threshold + ****************************************************************************** + */ +extern const UWORD8 gu1_threshold_vlc_level[6]; + +/** + ****************************************************************************** + * @brief table for encoding total number of zeros + * input : coeff_token, total zeros + * output : code word, size of the code word + * @remarks Table-9-7, 9-8 total_zeros tables for 4x4 blocks with + * TotalCoeff( coeff_token ) in H264 spec + ****************************************************************************** + */ +extern const UWORD8 gu1_size_zero_table[135]; +extern const UWORD8 gu1_code_zero_table[135]; +extern const UWORD8 gu1_size_zero_table_chroma[9]; +extern const UWORD8 gu1_code_zero_table_chroma[9]; + +/** + ****************************************************************************** + * @brief index to access zero table (for speed) + * input : TotalCoeff( coeff_token ) + * output : index to access zero table + ****************************************************************************** + */ +extern const UWORD8 gu1_index_zero_table[15]; + +/** + ****************************************************************************** + * @brief table for encoding runs of zeros before + * input : zeros left, runs of zeros before + * output : code word, size of the code word + * @remarks Table-9-10 table for run_before in H264 spec + ****************************************************************************** + */ +extern const UWORD8 gu1_size_run_table[42]; +extern const UWORD8 gu1_code_run_table[42]; + +/** + ****************************************************************************** + * @brief index to access run table (look up) + * input : zeros left + * output : index to access run table + ****************************************************************************** + */ +extern const UWORD8 gu1_index_run_table[7]; + +#endif /* IH264_CAVLC_TABLES_H_ */ diff --git 
a/common/ih264_chroma_intra_pred_filters.c b/common/ih264_chroma_intra_pred_filters.c new file mode 100755 index 0000000..ee145e5 --- /dev/null +++ b/common/ih264_chroma_intra_pred_filters.c @@ -0,0 +1,478 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_chroma_intra_pred_filters.c +* +* @brief +* Contains function definitions for chroma intra prediction filters +* +* @author +* Ittiam +* +* @par List of Functions: +* -ih264_intra_pred_chroma_8x8_mode_dc +* -ih264_intra_pred_chroma_8x8_mode_horz +* -ih264_intra_pred_chroma_8x8_mode_vert +* -ih264_intra_pred_chroma_8x8_mode_plane +* +* @remarks +* None +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> +#include <stddef.h> +#include <string.h> + +/* User include files */ +#include "ih264_defs.h" +#include "ih264_typedefs.h" +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include "ih264_intra_pred_filters.h" + +/* Global variables used only in assembly files*/ +const WORD8 ih264_gai1_intrapred_chroma_plane_coeffs1[] = +{ 0x01,0x00,0x01,0x00, + 0x02,0x00,0x02,0x00, + 0x03,0x00,0x03,0x00, + 0x04,0x00,0x04,0x00 +}; + const WORD8 ih264_gai1_intrapred_chroma_plane_coeffs2[] = + { 0xfd,0xff,0xfe,0xff, + 0xff,0xff,0x00,0x00, + 0x01,0x00,0x02,0x00, + 0x03,0x00,0x04,0x00, + }; + +/*****************************************************************************/ +/* Chroma Intra prediction 8x8 filters */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* ih264_intra_pred_chroma_8x8_mode_dc +* +* @brief +* Perform Intra prediction for chroma_8x8 mode:DC +* +* @par Description: +* Perform Intra prediction for chroma_8x8 mode:DC ,described in sec 8.3.4.1 +* +* @param[in] pu1_src +* UWORD8 pointer to the source containing alternate U and V samples +* +* 
@param[out] pu1_dst +* UWORD8 pointer to the destination with alternate U and V samples +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +** @param[in] ngbr_avail +* availability of neighbouring pixels +* +* @returns +* +* @remarks +* None +* +****************************************************************************** +*/ +void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + WORD32 left_avail, left_avail1, left_avail2; /* availability of left predictors (only for DC) */ + WORD32 top_avail; /* availability of top predictors (only for DC) */ + UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + UNUSED(src_strd); + + /* temporary variables to store accumulated first left half,second left half, + * first top half,second top half of U and V values*/ + WORD32 val_u_l1 = 0, val_u_l2 = 0, val_u_t1 = 0, val_u_t2 = 0; + WORD32 val_v_l1 = 0, val_v_l2 = 0, val_v_t1 = 0, val_v_t2 = 0; + + WORD32 val_u1 = 0, val_u2 = 0, val_v1 = 0, val_v2 = 0; + + WORD32 col, row; /*loop variables*/ + + left_avail = ngbr_avail & 0x11; + left_avail1 = ngbr_avail & 1; + left_avail2 = (ngbr_avail >> 4) & 1; + top_avail = (ngbr_avail >> 2) & 1; + + pu1_top = pu1_src + 2 * BLK8x8SIZE + 2; + pu1_left = pu1_src + 2 * BLK8x8SIZE - 2; + + if(left_avail1) + { /* First 4x4 block*/ + val_u_l1 += *pu1_left; + val_v_l1 += *(pu1_left + 1); + pu1_left -= 2; + val_u_l1 += *pu1_left; + val_v_l1 += *(pu1_left + 1); + pu1_left -= 2; + val_u_l1 += *pu1_left; + val_v_l1 += *(pu1_left + 1); + pu1_left -= 2; + val_u_l1 += *pu1_left + 2; + val_v_l1 += *(pu1_left + 1) + 2; + pu1_left -= 2; + } + else + pu1_left -= 2 * 4; + + if(left_avail2) + { + /* Second 4x4 block*/ + val_u_l2 += *pu1_left; + val_v_l2 += *(pu1_left + 1); + pu1_left -= 2; + val_u_l2 += *pu1_left; + val_v_l2 += *(pu1_left 
+ 1); + pu1_left -= 2; + val_u_l2 += *pu1_left; + val_v_l2 += *(pu1_left + 1); + pu1_left -= 2; + val_u_l2 += *pu1_left + 2; + val_v_l2 += *(pu1_left + 1) + 2; + pu1_left -= 2; + } + else + pu1_left -= 2 * 4; + + if(top_avail) + { + val_u_t1 += *pu1_top + *(pu1_top + 2) + *(pu1_top + 4) + + *(pu1_top + 6) + 2; + val_u_t2 += *(pu1_top + 8) + *(pu1_top + 10) + *(pu1_top + 12) + + *(pu1_top + 14) + 2; + val_v_t1 += *(pu1_top + 1) + *(pu1_top + 3) + *(pu1_top + 5) + + *(pu1_top + 7) + 2; + val_v_t2 += *(pu1_top + 9) + *(pu1_top + 11) + *(pu1_top + 13) + + *(pu1_top + 15) + 2; + } + + if(left_avail + top_avail) + { + val_u1 = (left_avail1 + top_avail) ? + ((val_u_l1 + val_u_t1) + >> (1 + left_avail1 + top_avail)) :128; + val_v1 = (left_avail1 + top_avail) ? + ((val_v_l1 + val_v_t1) + >> (1 + left_avail1 + top_avail)) :128; + if(top_avail) + { + val_u2 = val_u_t2 >> 2; + val_v2 = val_v_t2 >> 2; + } + else if(left_avail1) + { + val_u2 = val_u_l1 >> 2; + val_v2 = val_v_l1 >> 2; + } + else + { + val_u2 = val_v2 = 128; + } + + for(row = 0; row < 4; row++) + { + /*top left 4x4 block*/ + for(col = 0; col < 8; col += 2) + { + *(pu1_dst + row * dst_strd + col) = val_u1; + *(pu1_dst + row * dst_strd + col + 1) = val_v1; + } + /*top right 4x4 block*/ + for(col = 8; col < 16; col += 2) + { + *(pu1_dst + row * dst_strd + col) = val_u2; + *(pu1_dst + row * dst_strd + col + 1) = val_v2; + } + } + + if(left_avail2) + { + val_u1 = val_u_l2 >> 2; + val_v1 = val_v_l2 >> 2; + } + else if(top_avail) + { + val_u1 = val_u_t1 >> 2; + val_v1 = val_v_t1 >> 2; + } + else + { + val_u1 = val_v1 = 128; + } + val_u2 = (left_avail2 + top_avail) ? + ((val_u_l2 + val_u_t2) + >> (1 + left_avail2 + top_avail)) : 128; + val_v2 = (left_avail2 + top_avail) ? 
+ ((val_v_l2 + val_v_t2) + >> (1 + left_avail2 + top_avail)) : 128; + + for(row = 4; row < 8; row++) + { /*bottom left 4x4 block*/ + for(col = 0; col < 8; col += 2) + { + *(pu1_dst + row * dst_strd + col) = val_u1; + *(pu1_dst + row * dst_strd + col + 1) = val_v1; + } + /*bottom right 4x4 block*/ + for(col = 8; col < 16; col += 2) + { + *(pu1_dst + row * dst_strd + col) = val_u2; + *(pu1_dst + row * dst_strd + col + 1) = val_v2; + } + } + } + else + { + /* Both left and top are unavailable, set the block to 128 */ + for(row = 0; row < 8; row++) + { + memset(pu1_dst + row * dst_strd, 128, 8 * sizeof(UWORD16)); + } + } +} + +/** +******************************************************************************* +* +*ih264_intra_pred_chroma_8x8_mode_horz +* +* @brief +* Perform Intra prediction for chroma_8x8 mode:Horizontal +* +* @par Description: +* Perform Intra prediction for chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2 +* +* @param[in] pu1_src +* UWORD8 pointer to the source containing alternate U and V samples +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination with alternate U and V samples +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] ngbr_avail +* availability of neighbouring pixels(Not used in this function) +* +* @returns +* +* @remarks +* None +* +****************************************************************************** +*/ +void ih264_intra_pred_chroma_8x8_mode_horz(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + + UWORD8 *pu1_left = NULL; /* Pointer to start of top predictors */ + WORD32 rows, cols; /* loop variables*/ + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_left = pu1_src + 2 * BLK8x8SIZE - 2; + for(rows = 0; rows < 8; rows++) + { + for(cols = 0; cols < 16; cols += 2) + { + *(pu1_dst + rows * dst_strd + cols) = *pu1_left; + + *(pu1_dst + rows * dst_strd + cols + 1) = *(pu1_left + 1); + } + 
pu1_left -= 2; + } + +} + +/** +******************************************************************************* +* +*ih264_intra_pred_chroma_8x8_mode_vert +* +* @brief +* Perform Intra prediction for chroma_8x8 mode:vertical +* +* @par Description: +* Perform Intra prediction for chroma_8x8 mode:vertical ,described in sec 8.3.4.3 +* +* @param[in] pu1_src +* UWORD8 pointer to the source containing alternate U and V samples +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination with alternate U and V samples +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] ngbr_avail +* availability of neighbouring pixels(Not used in this function) +* +* @returns +* +* @remarks +* None +* +******************************************************************************* +*/ +void ih264_intra_pred_chroma_8x8_mode_vert(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + WORD32 row;/*loop variable*/ + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_top = pu1_src + 2 * BLK8x8SIZE + 2; + + /* 8 bytes are copied from src to dst */ + for(row = 0; row < 2; row++) + { + memcpy(pu1_dst, pu1_top, 16); + + pu1_dst += dst_strd; + memcpy(pu1_dst, pu1_top, 16); + + pu1_dst += dst_strd; + memcpy(pu1_dst, pu1_top, 16); + + pu1_dst += dst_strd; + memcpy(pu1_dst, pu1_top, 16); + + pu1_dst += dst_strd; + } +} + +/** +******************************************************************************* +* +* ih264_intra_pred_chroma_8x8_mode_plane +* +* @brief +* Perform Intra prediction for chroma_8x8 mode:PLANE +* +* @par Description: +* Perform Intra prediction for chroma_8x8 mode:PLANE ,described in sec 8.3.4.4 +* +* @param[in] pu1_src +* UWORD8 pointer to the source containing alternate U and V samples +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination with alternate U and V samples +* +* @param[in] 
src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] ngbr_avail +* availability of neighbouring pixels(Not used in this function) +* +* @returns +* +* @remarks +* None +* +****************************************************************************** +*/ +void ih264_intra_pred_chroma_8x8_mode_plane(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + + UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + WORD32 val = 0; + WORD32 rows, cols; /* loop variables*/ + WORD32 a_u, b_u, c_u, h_u, v_u; /* Implementing section 8.3.4.4 . The variables represent the corresponding variables in the section*/ + WORD32 a_v, b_v, c_v, h_v, v_v; + UNUSED(src_strd); + UNUSED(ngbr_avail); + a_u = b_u = c_u = h_u = v_u = 0; + a_v = b_v = c_v = h_v = v_v = 0; + /* As chroma format 4:2:0 is used,xCF = 4 * ( chroma_format_idc = = 3 ) = 0 and + yCF = 4 * ( chroma_format_idc != 1 ) = 0 */ + pu1_top = pu1_src + 2 * BLK8x8SIZE + 2; + pu1_left = pu1_src + 2 * BLK8x8SIZE - 2; + /* Implementing section 8.3.4.4 */ + for(cols = 0; cols < 4; cols++) + { + h_u += (cols + 1) * (pu1_top[8 + 2 * cols] - pu1_top[4 - 2 * cols]);/*section 8.3.4.4 equation (8-144)*/ + h_v += (cols + 1) * (pu1_top[8 + 2 * cols + 1] - pu1_top[4 - 2 * cols+ 1]); + + v_u += (cols + 1) * (pu1_left[(4 + cols) * (-2)] - pu1_left[(2 - cols) * (-2)]); + v_v += (cols + 1) * (pu1_left[(4 + cols) * (-2) + 1] - pu1_left[(2 - cols) * (-2) + 1]);/*section 8.3.4.4 equation (8-145)*/ + } + a_u = 16 * (pu1_left[7 * (-2)] + pu1_top[14]); + a_v = 16 * (pu1_left[7 * (-2) + 1] + pu1_top[15]);/*section 8.3.3.4 equation (8-141)*/ + b_u = (34 * h_u + 32) >> 6;/*section 8.3.3.4 equation (8-142)*/ + b_v = (34 * h_v + 32) >> 6;/*section 8.3.3.4 equation (8-142)*/ + c_u = (34 * v_u + 32) >> 6;/*section 8.3.3.4 equation (8-143)*/ + c_v = (34 * v_v + 32) >> 6;/*section 
8.3.3.4 equation (8-143)*/ + + for(rows = 0; rows < 8; rows++) + { + for(cols = 0; cols < 8; cols++) + { + val = (a_u + b_u * (cols - 3) + c_u * (rows - 3) );/*section 8.3.4.4 equation (8-140)*/ + val = (val + 16) >> 5; + *(pu1_dst + rows * dst_strd + 2 * cols) = CLIP_U8(val); + val = (a_v + b_v * (cols - 3) + c_v * (rows - 3) );/*section 8.3.4.4 equation (8-140)*/ + val = (val + 16) >> 5; + *(pu1_dst + rows * dst_strd + 2 * cols + 1) = CLIP_U8(val); + } + } +} + diff --git a/common/ih264_common_tables.c b/common/ih264_common_tables.c new file mode 100755 index 0000000..c53c276 --- /dev/null +++ b/common/ih264_common_tables.c @@ -0,0 +1,725 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_common_tables.c +* +* @brief +* Contains common global tables +* +* @author +* Harish M +* +* @par List of Functions: +* +* @remarks +* None +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_defs.h" +#include "ih264_macros.h" +#include "ih264_structs.h" +#include "ih264_common_tables.h" + + +/*****************************************************************************/ +/* Extern global definitions */ +/*****************************************************************************/ + +/** + ****************************************************************************** + * @brief while encoding, basing on the input configuration parameters, the + * the level of the bitstream is computed basing on the table below. 
 * input   : table_idx
 * output  : level_idc or cpb size
 * @remarks Table A-1 - level table limits
 ******************************************************************************
 */
const level_tables_t gas_ih264_lvl_tbl[16] =
{
    /* level          max_mbps  max_fs  max_dpb_size  max_br  max_cpb  max_mv_y */
    { IH264_LEVEL_10,     1485,     99,          297,     64,     175,       64 },
    { IH264_LEVEL_11,     1485,     99,          297,    128,     350,       64 },
    { IH264_LEVEL_1B,     3000,    396,          675,    192,     500,      128 },
    { IH264_LEVEL_12,     6000,    396,         1782,    384,    1000,      128 },
    { IH264_LEVEL_13,    11880,    396,         1782,    768,    2000,      128 },
    { IH264_LEVEL_20,    11880,    396,         1782,   2000,    2000,      128 },
    { IH264_LEVEL_21,    19800,    792,         3564,   4000,    4000,      256 },
    { IH264_LEVEL_22,    20250,   1620,         6075,   4000,    4000,      256 },
    { IH264_LEVEL_30,    40500,   1620,         6075,  10000,   10000,      256 },
    { IH264_LEVEL_31,   108000,   3600,        13500,  14000,   14000,      512 },
    { IH264_LEVEL_32,   216000,   5120,        15360,  20000,   20000,      512 },
    { IH264_LEVEL_40,   245760,   8192,        24576,  20000,   25000,      512 },
    { IH264_LEVEL_41,   245760,   8192,        24576,  50000,   62500,      512 },
    { IH264_LEVEL_42,   522240,   8704,        26112,  50000,   62500,      512 },
    { IH264_LEVEL_50,   589824,  22080,        82800, 135000,  135000,      512 },
    { IH264_LEVEL_51,   983040,  36864,       138240, 240000,  240000,      512 },
};


/**
 * Array containing supported levels.
 * Note: IH264_LEVEL_1B appears in gas_ih264_lvl_tbl but is not listed here.
 */
const WORD32 gai4_ih264_levels[] =
{
    IH264_LEVEL_10,
    IH264_LEVEL_11,
    IH264_LEVEL_12,
    IH264_LEVEL_13,
    IH264_LEVEL_20,
    IH264_LEVEL_21,
    IH264_LEVEL_22,
    IH264_LEVEL_30,
    IH264_LEVEL_31,
    IH264_LEVEL_32,
    IH264_LEVEL_40,
    IH264_LEVEL_41,
    IH264_LEVEL_42,
    IH264_LEVEL_50,
    IH264_LEVEL_51,
};


/**
 * Array giving size of max luma samples in a picture for a given level.
 * Each entry is 256 * the level's max frame size in macroblocks (u4_max_fs
 * in gas_ih264_lvl_tbl).
 */
const WORD32 gai4_ih264_max_luma_pic_size[] =
{
    /* Level 1 */
    25344,
    /* Level 1.1 */
    101376,
    /* Level 1.2 */
    101376,
    /* Level 1.3 */
    101376,
    /* Level 2 */
    101376,
    /* Level 2.1 */
    202752,
    /* Level 2.2 */
    414720,
    /* Level 3 */
    414720,
    /* Level 3.1 */
    921600,
    /* Level 3.2 */
    1310720,
    /* Level 4 */
    2097152,
    /* Level 4.1 */
    2097152,
    /* Level 4.2 */
    2228224,
    /* Level 5 */
    5652480,
    /* Level 5.1 */
    9437184
};


/** Max width and height allowed for a given level */
/** This is derived as SQRT(8 * gai4_ih264_max_luma_pic_size[]) */
const WORD32 gai4_ih264_max_wd_ht[] =
{
    /* Level 1 */
    451,
    /* Level 1.1 */
    901,
    /* Level 1.2 */
    901,
    /* Level 1.3 */
    901,
    /* Level 2 */
    901,
    /* Level 2.1 */
    1274,
    /* Level 2.2 */
    1822,
    /* Level 3 */
    1822,
    /* Level 3.1 */
    2716,
    /* Level 3.2 */
    3239,
    /* Level 4 */
    4096,
    /* Level 4.1 */
    4096,
    /* Level 4.2 */
    4223,
    /* Level 5 */
    6725,
    /* Level 5.1 */
    8689
};

/** Min width and height allowed for a given level */
/** This is derived as gai4_ih264_max_luma_pic_size[]/gai4_ih264_max_wd_ht[] */
const WORD32 gai4_ih264_min_wd_ht[] =
{
    /* Level 1 */
    57,
    /* Level 1.1 */
    113,
    /* Level 1.2 */
    113,
    /* Level 1.3 */
    113,
    /* Level 2 */
    113,
    /* Level 2.1 */
    160,
    /* Level 2.2 */
    228,
    /* Level 3 */
    228,
    /* Level 3.1 */
    340,
    /* Level 3.2 */
    405,
    /* Level 4 */
    512,
    /* Level 4.1 */
    512,
    /* Level 4.2 */
    528,
    /* Level 5 */
    841,
    /* Level 5.1 */
    1087

};


/** Table 7-11 Macroblock types for I slices */
intra_mbtype_info_t gas_ih264_i_mbtype_info[] =
{
    /* For the first entry, if transform_size_8x8_flag is 1, the mode will be
     * MBPART_I8x8 instead of MBPART_I4x4. This has to be taken care of while
     * accessing the table. */
    {0, MBPART_I4x4,   VERT_I16x16,  0, 0},
    {0, MBPART_I16x16, VERT_I16x16,  0, 0},
    {0, MBPART_I16x16, HORZ_I16x16,  0, 0},
    {0, MBPART_I16x16, DC_I16x16,    0, 0},
    {0, MBPART_I16x16, PLANE_I16x16, 0, 0},
    {0, MBPART_I16x16, VERT_I16x16,  1, 0},
    {0, MBPART_I16x16, HORZ_I16x16,  1, 0},
    {0, MBPART_I16x16, DC_I16x16,    1, 0},
    {0, MBPART_I16x16, PLANE_I16x16, 1, 0},
    {0, MBPART_I16x16, VERT_I16x16,  2, 0},
    {0, MBPART_I16x16, HORZ_I16x16,  2, 0},
    {0, MBPART_I16x16, DC_I16x16,    2, 0},
    {0, MBPART_I16x16, PLANE_I16x16, 2, 0},
    {0, MBPART_I16x16,
VERT_I16x16, 0, 15}, + {0, MBPART_I16x16, HORZ_I16x16, 0, 15}, + {0, MBPART_I16x16, DC_I16x16, 0, 15}, + {0, MBPART_I16x16, PLANE_I16x16, 0, 15}, + {0, MBPART_I16x16, VERT_I16x16, 1, 15}, + {0, MBPART_I16x16, HORZ_I16x16, 1, 15}, + {0, MBPART_I16x16, DC_I16x16, 1, 15}, + {0, MBPART_I16x16, PLANE_I16x16, 1, 15}, + {0, MBPART_I16x16, VERT_I16x16, 2, 15}, + {0, MBPART_I16x16, HORZ_I16x16, 2, 15}, + {0, MBPART_I16x16, DC_I16x16, 2, 15}, + {0, MBPART_I16x16, PLANE_I16x16, 2, 15}, + {0, MBPART_IPCM, VERT_I16x16, 0, 0} +}; + +/** Table 7-13 Macroblock types for P slices */ +inter_mbtype_info_t gas_ih264_p_mbtype_info[] = +{ + {1, MBPART_L0, MBPART_NA, 16, 16}, + {2, MBPART_L0, MBPART_L0, 16, 8}, + {2, MBPART_L0, MBPART_L0, 8, 16}, + {4, MBPART_NA, MBPART_NA, 8, 8}, + {4, MBPART_NA, MBPART_NA, 8, 8}, +}; + +/** Table 7-14 Macroblock types for B slices */ +inter_mbtype_info_t gas_ih264_b_mbtype_info[] = +{ + {0, MBPART_DIRECT, MBPART_NA, 8, 8, }, + {1, MBPART_L0, MBPART_NA, 16, 16, }, + {1, MBPART_L1, MBPART_NA, 16, 16, }, + {1, MBPART_BI, MBPART_NA, 16, 16, }, + {2, MBPART_L0, MBPART_L0, 16, 8, }, + {2, MBPART_L0, MBPART_L0, 8, 16, }, + {2, MBPART_L1, MBPART_L1, 16, 8, }, + {2, MBPART_L1, MBPART_L1, 8, 16, }, + {2, MBPART_L0, MBPART_L1, 16, 8, }, + {2, MBPART_L0, MBPART_L1, 8, 16, }, + {2, MBPART_L1, MBPART_L0, 16, 8, }, + {2, MBPART_L1, MBPART_L0, 8, 16, }, + {2, MBPART_L0, MBPART_BI, 16, 8, }, + {2, MBPART_L0, MBPART_BI, 8, 16, }, + {2, MBPART_L1, MBPART_BI, 16, 8, }, + {2, MBPART_L1, MBPART_BI, 8, 16, }, + {2, MBPART_BI, MBPART_L0, 16, 8, }, + {2, MBPART_BI, MBPART_L0, 8, 16, }, + {2, MBPART_BI, MBPART_L1, 16, 8, }, + {2, MBPART_BI, MBPART_L1, 8, 16, }, + {2, MBPART_BI, MBPART_BI, 16, 8, }, + {2, MBPART_BI, MBPART_BI, 8, 16, }, + {4, MBPART_NA, MBPART_NA, 8, 8, }, +}; + +/** Table 7-17 – Sub-macroblock types in P macroblocks */ +submbtype_info_t gas_ih264_p_submbtype_info[] = +{ + {1, MBPART_L0, 8, 8}, + {2, MBPART_L0, 8, 4}, + {2, MBPART_L0, 4, 8}, + {4, MBPART_L0, 4, 
4}, +}; + +/** Table 7-18 – Sub-macroblock types in B macroblocks */ +submbtype_info_t gas_ih264_b_submbtype_info[] = +{ + {4, MBPART_DIRECT, 4, 4}, + {1, MBPART_L0, 8, 8}, + {1, MBPART_L1, 8, 8}, + {1, MBPART_BI, 8, 8}, + {2, MBPART_L0, 8, 4}, + {2, MBPART_L0, 4, 8}, + {2, MBPART_L1, 8, 4}, + {2, MBPART_L1, 4, 8}, + {2, MBPART_BI, 8, 4}, + {2, MBPART_BI, 4, 8}, + {4, MBPART_L0, 4, 4}, + {4, MBPART_L1, 4, 4}, + {4, MBPART_BI, 4, 4}, +}; + + + + +const UWORD8 gau1_ih264_inv_scan_prog4x4[] = +{ + 0, 1, 4, 8, + 5, 2, 3, 6, + 9, 12, 13, 10, + 7, 11, 14, 15 +}; + +const UWORD8 gau1_ih264_inv_scan_int4x4[] = +{ + 0, 4, 1, 8, + 12, 5, 9, 13, + 2, 6, 10, 14, + 3, 7, 11, 15 +}; + +/** Inverse scan tables for individual 4x4 blocks of 8x8 transform coeffs of CAVLC */ +/* progressive */ +const UWORD8 gau1_ih264_inv_scan_prog8x8_cavlc[64] = +{ + 0, 9, 17, 18, 12, 40, 27, 7, + 35, 57, 29, 30, 58, 38, 53, 47, + 1, 2, 24, 11, 19, 48, 20, 14, + 42, 50, 22, 37, 59, 31, 60, 55, + 8, 3, 32, 4, 26, 41, 13, 21, + 49, 43, 15, 44, 52, 39, 61, 62, + 16, 10, 25, 5, 33, 34, 6, 28, + 56, 36, 23, 51, 45, 46, 54, 63 +}; + +/* interlace */ +const UWORD8 gau1_ih264_inv_scan_int8x8_cavlc[64] = +{ + 0, 9, 2, 56, 18, 26, 34, 27, + 35, 28, 36, 29, 45, 7, 54, 39, + 8, 24, 25, 33, 41, 11, 42, 12, + 43, 13, 44, 14, 53, 15, 62, 47, + 16, 32, 40, 10, 49, 4, 50, 5, + 51, 6, 52, 22, 61, 38, 23, 55, + 1, 17, 48, 3, 57, 19, 58, 20, + 59, 21, 60, 37, 30, 46, 31, 63 +}; + + + +/*Inverse scan tables for individual 8x8 blocks of 8x8 transform coeffs of CABAC */ +/* progressive */ + +const UWORD8 gau1_ih264_inv_scan_prog8x8_cabac[64] = +{ + 0, 1, 8, 16, 9, 2, 3, 10, + 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, + 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, + 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, + 53, 60, 61, 54, 47, 55, 62, 63 +}; + + +/* interlace */ + +const UWORD8 gau1_ih264_inv_scan_int8x8_cabac[64] = +{ + 0, 8, 16, 1, 9, 24, 32, 17, + 2, 25, 
40, 48, 56, 33, 10, 3, + 18, 41, 49, 57, 26, 11, 4, 19, + 34, 42, 50, 58, 27, 12, 5, 20, + 35, 43, 51, 59, 28, 13, 6, 21, + 36, 44, 52, 60, 29, 14, 22, 37, + 45, 53, 61, 30, 7, 15, 38, 46, + 54, 62, 23, 31, 39, 47, 55, 63 +}; + + +const UWORD8 *gpau1_ih264_inv_scan8x8[] = +{ + gau1_ih264_inv_scan_prog8x8_cavlc, + gau1_ih264_inv_scan_int8x8_cavlc, + gau1_ih264_inv_scan_prog8x8_cabac, + gau1_ih264_inv_scan_int8x8_cabac +}; + +const UWORD8 *gpau1_ih264_inv_scan4x4[] = +{ + gau1_ih264_inv_scan_prog4x4, + gau1_ih264_inv_scan_int4x4, +}; + +const UWORD8 gau1_ih264_8x8_subblk_idx[] = +{ + 0, 1, 4, 5, + 2, 3, 6, 7, + 8, 9, 12, 13, + 10, 11, 14, 15 +}; + + +/* Table 8-15 Chroma QP offset table */ +const UWORD8 gau1_ih264_chroma_qp[] = +{ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 29, 30, + 31, 32, 32, 33, 34, 34, 35, 35, + 36, 36, 37, 37, 37, 38, 38, 38, + 39, 39, 39, 39 +}; + + +/** +****************************************************************************** +* @brief look up table to compute neigbour availability of 4x4 blocks +* input : subblk idx, mb neighbor availability +* output : sub blk neighbor availability +* @remarks +****************************************************************************** +*/ +const UWORD8 gau1_ih264_4x4_ngbr_avbl[16][16] = +{ + { 0x0, 0x1, 0xc, 0x7, 0x1, 0x1, 0xf, 0x7, 0xc, 0xf, 0xc, 0x7, 0xf, 0x7, 0xf, 0x7 }, + { 0x1, 0x1, 0xf, 0x7, 0x1, 0x1, 0xf, 0x7, 0xf, 0xf, 0xf, 0x7, 0xf, 0x7, 0xf, 0x7 }, + { 0x2, 0x1, 0xc, 0x7, 0x1, 0x1, 0xf, 0x7, 0xc, 0xf, 0xc, 0x7, 0xf, 0x7, 0xf, 0x7 }, + { 0x3, 0x1, 0xf, 0x7, 0x1, 0x1, 0xf, 0x7, 0xf, 0xf, 0xf, 0x7, 0xf, 0x7, 0xf, 0x7 }, + + { 0xc, 0xf, 0xc, 0x7, 0xf, 0x7, 0xf, 0x7, 0xc, 0xf, 0xc, 0x7, 0xf, 0x7, 0xf, 0x7 }, + { 0xd, 0xf, 0xf, 0x7, 0xf, 0x7, 0xf, 0x7, 0xf, 0xf, 0xf, 0x7, 0xf, 0x7, 0xf, 0x7 }, + { 0xe, 0xf, 0xc, 0x7, 0xf, 0x7, 0xf, 0x7, 0xc, 0xf, 0xc, 0x7, 0xf, 0x7, 0xf, 0x7 }, + { 0xf, 0xf, 0xf, 0x7, 0xf, 0x7, 0xf, 0x7, 
0xf, 0xf, 0xf, 0x7, 0xf, 0x7, 0xf, 0x7 }, + + { 0x0, 0x1, 0xc, 0x7, 0x1, 0x9, 0xf, 0x7, 0xc, 0xf, 0xc, 0x7, 0xf, 0x7, 0xf, 0x7 }, + { 0x1, 0x1, 0xf, 0x7, 0x1, 0x9, 0xf, 0x7, 0xf, 0xf, 0xf, 0x7, 0xf, 0x7, 0xf, 0x7 }, + { 0x2, 0x1, 0xc, 0x7, 0x1, 0x9, 0xf, 0x7, 0xc, 0xf, 0xc, 0x7, 0xf, 0x7, 0xf, 0x7 }, + { 0x3, 0x1, 0xf, 0x7, 0x1, 0x9, 0xf, 0x7, 0xf, 0xf, 0xf, 0x7, 0xf, 0x7, 0xf, 0x7 }, + + { 0xc, 0xf, 0xc, 0x7, 0xf, 0xf, 0xf, 0x7, 0xc, 0xf, 0xc, 0x7, 0xf, 0x7, 0xf, 0x7 }, + { 0xd, 0xf, 0xf, 0x7, 0xf, 0xf, 0xf, 0x7, 0xf, 0xf, 0xf, 0x7, 0xf, 0x7, 0xf, 0x7 }, + { 0xe, 0xf, 0xc, 0x7, 0xf, 0xf, 0xf, 0x7, 0xc, 0xf, 0xc, 0x7, 0xf, 0x7, 0xf, 0x7 }, + { 0xf, 0xf, 0xf, 0x7, 0xf, 0xf, 0xf, 0x7, 0xf, 0xf, 0xf, 0x7, 0xf, 0x7, 0xf, 0x7 }, +}; + + +/** +****************************************************************************** +* @brief look up table to compute neigbour availability of 8x8 blocks +* input : subblk idx, mb neighbor availability +* output : sub blk neighbor availability +* @remarks +****************************************************************************** +*/ +const UWORD8 gau1_ih264_8x8_ngbr_avbl[16][4] = +{ + { 0x0, 0x1, 0xc, 0x7 }, + { 0x1, 0x1, 0xf, 0x7 }, + { 0x2, 0x1, 0xc, 0x7 }, + { 0x3, 0x1, 0xf, 0x7 }, + + { 0xc, 0x7, 0xc, 0x7 }, + { 0xd, 0x7, 0xf, 0x7 }, + { 0xe, 0x7, 0xc, 0x7 }, + { 0xf, 0x7, 0xf, 0x7 }, + + { 0x0, 0x9, 0xc, 0x7 }, + { 0x1, 0x9, 0xf, 0x7 }, + { 0x2, 0x9, 0xc, 0x7 }, + { 0x3, 0x9, 0xf, 0x7 }, + + { 0xc, 0xf, 0xc, 0x7 }, + { 0xd, 0xf, 0xf, 0x7 }, + { 0xe, 0xf, 0xc, 0x7 }, + { 0xf, 0xf, 0xf, 0x7 }, +}; + +/** Table 7-3 Default intra 4x4 scaling list */ +const UWORD16 gau2_ih264_default_intra4x4_scaling_list[] = +{ + 6, 13, 13, 20, + 20, 20, 28, 28, + 28, 28, 32, 32, + 32, 37, 37, 42 +}; + +/** Table 7-3 Default inter 4x4 scaling list */ +const UWORD16 gau2_ih264_default_inter4x4_scaling_list[] = +{ + 10, 14, 14, 20, + 20, 20, 24, 24, + 24, 24, 27, 27, + 27, 30, 30, 34 +}; + +/* Inverse scanned output of 
gau2_ih264_default_intra4x4_scaling_list */ +const UWORD16 gau2_ih264_default_intra4x4_weight_scale[] = +{ + 6, 13, 20, 28, + 13, 20, 28, 32, + 20, 28, 32, 37, + 28, 32, 37, 42 +}; + +/* Inverse scanned output of gau2_ih264_default_inter4x4_scaling_list */ +const UWORD16 gau2_ih264_default_inter4x4_weight_scale[] = +{ + 10, 14, 20, 24, + 14, 20, 24, 27, + 20, 24, 27, 30, + 24, 27, 30, 34 +}; + +/** Table 7-4 Default intra 8x8 scaling list */ +const UWORD16 gau2_ih264_default_intra8x8_scaling_list[] = +{ + 6, 10, 10, 13, 11, 13, 16, 16, + 16, 16, 18, 18, 18, 18, 18, 23, + 23, 23, 23, 23, 23, 25, 25, 25, + 25, 25, 25, 25, 27, 27, 27, 27, + 27, 27, 27, 27, 29, 29, 29, 29, + 29, 29, 29, 31, 31, 31, 31, 31, + 31, 33, 33, 33, 33, 33, 36, 36, + 36, 36, 38, 38, 38, 40, 40, 42 +}; + +/** Table 7-4 Default inter 8x8 scaling list */ +const UWORD16 gau2_ih264_default_inter8x8_scaling_list[] = +{ + 9, 13, 13, 15, 13, 15, 17, 17, + 17, 17, 19, 19, 19, 19, 19, 21, + 21, 21, 21, 21, 21, 22, 22, 22, + 22, 22, 22, 22, 24, 24, 24, 24, + 24, 24, 24, 24, 25, 25, 25, 25, + 25, 25, 25, 27, 27, 27, 27, 27, + 27, 28, 28, 28, 28, 28, 30, 30, + 30, 30, 32, 32, 32, 33, 33, 35 +}; + +/* Inverse scanned output of gau2_ih264_default_intra8x8_scaling_list */ +const UWORD16 gau2_ih264_default_intra8x8_weight_scale[] = +{ + 6, 10, 13, 16, 18, 23, 25, 27, + 10, 11, 16, 18, 23, 25, 27, 29, + 13, 16, 18, 23, 25, 27, 29, 31, + 16, 18, 23, 25, 27, 29, 31, 33, + 18, 23, 25, 27, 29, 31, 33, 36, + 23, 25, 27, 29, 31, 33, 36, 38, + 25, 27, 29, 31, 33, 36, 38, 40, + 27, 29, 31, 33, 36, 38, 40, 42 +}; + +/* Inverse scanned output of gau2_ih264_default_inter8x8_scaling_list */ +const UWORD16 gau2_ih264_default_inter8x8_weight_scale[] = +{ + 9, 13, 15, 17, 19, 21, 22, 24, + 13, 13, 17, 19, 21, 22, 24, 25, + 15, 17, 19, 21, 22, 24, 25, 27, + 17, 19, 21, 22, 24, 25, 27, 28, + 19, 21, 22, 24, 25, 27, 28, 30, + 21, 22, 24, 25, 27, 28, 30, 32, + 22, 24, 25, 27, 28, 30, 32, 33, + 24, 25, 27, 28, 30, 32, 33, 35 +}; 
+/* Eq 7-8 Flat scaling matrix for 4x4 */ +const UWORD16 gau2_ih264_flat_4x4_weight_scale[] = +{ + 16, 16, 16, 16, + 16, 16, 16, 16, + 16, 16, 16, 16, + 16, 16, 16, 16 +}; + +/* Eq 7-9 Flat scaling matrix for 8x8 */ +const UWORD16 gau2_ih264_flat_8x8_weight_scale[] = +{ + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16 +}; + + +/** + ****************************************************************************** + * @brief Scale Table for inverse quantizing 4x4 subblock. To inverse quantize + * a given 4x4 quantized block, the coefficient at index location (i,j) is scaled + * by one of the constants in this table and right shift the result by abs (4 - + * floor(qp/6)), here qp is the quantization parameter used to quantize the mb. + * + * input : 16 * qp%6, index location (i,j) + * output : scale constant. + * + * @remarks 16 constants for each index position of the subblock and 6 for each + * qp%6 in the range 0-5 inclusive. + ****************************************************************************** + */ + +const UWORD16 gau2_ih264_iquant_scale_matrix_4x4[96] = +{ + 10, 13, 10, 13, + 13, 16, 13, 16, + 10, 13, 10, 13, + 13, 16, 13, 16, + + 11, 14, 11, 14, + 14, 18, 14, 18, + 11, 14, 11, 14, + 14, 18, 14, 18, + + 13, 16, 13, 16, + 16, 20, 16, 20, + 13, 16, 13, 16, + 16, 20, 16, 20, + + 14, 18, 14, 18, + 18, 23, 18, 23, + 14, 18, 14, 18, + 18, 23, 18, 23, + + 16, 20, 16, 20, + 20, 25, 20, 25, + 16, 20, 16, 20, + 20, 25, 20, 25, + + 18, 23, 18, 23, + 23, 29, 23, 29, + 18, 23, 18, 23, + 23, 29, 23, 29, + +}; + +/** + ****************************************************************************** + * @brief Scale Table for inverse quantizing 8x8 subblock. 
To inverse quantize + * a given 8x8 quantized block, the coefficient at index location (i,j) is scaled + * by one of the constants in this table and right shift the result by abs (4 - + * floor(qp/6)), here qp is the quantization parameter used to quantize the mb. + * + * input : qp%6, index location (i,j) + * output : scale constant. + * + * @remarks 64 constants for each index position of the subblock and 6 for each + * qp%6 in the range 0-5 inclusive. + ****************************************************************************** + */ +const UWORD16 gau2_ih264_iquant_scale_matrix_8x8 [384] = +{ + 20, 19, 25, 19, 20, 19, 25, 19, + 19, 18, 24, 18, 19, 18, 24, 18, + 25, 24, 32, 24, 25, 24, 32, 24, + 19, 18, 24, 18, 19, 18, 24, 18, + 20, 19, 25, 19, 20, 19, 25, 19, + 19, 18, 24, 18, 19, 18, 24, 18, + 25, 24, 32, 24, 25, 24, 32, 24, + 19, 18, 24, 18, 19, 18, 24, 18, + + 22, 21, 28, 21, 22, 21, 28, 21, + 21, 19, 26, 19, 21, 19, 26, 19, + 28, 26, 35, 26, 28, 26, 35, 26, + 21, 19, 26, 19, 21, 19, 26, 19, + 22, 21, 28, 21, 22, 21, 28, 21, + 21, 19, 26, 19, 21, 19, 26, 19, + 28, 26, 35, 26, 28, 26, 35, 26, + 21, 19, 26, 19, 21, 19, 26, 19, + + 26, 24, 33, 24, 26, 24, 33, 24, + 24, 23, 31, 23, 24, 23, 31, 23, + 33, 31, 42, 31, 33, 31, 42, 31, + 24, 23, 31, 23, 24, 23, 31, 23, + 26, 24, 33, 24, 26, 24, 33, 24, + 24, 23, 31, 23, 24, 23, 31, 23, + 33, 31, 42, 31, 33, 31, 42, 31, + 24, 23, 31, 23, 24, 23, 31, 23, + + 28, 26, 35, 26, 28, 26, 35, 26, + 26, 25, 33, 25, 26, 25, 33, 25, + 35, 33, 45, 33, 35, 33, 45, 33, + 26, 25, 33, 25, 26, 25, 33, 25, + 28, 26, 35, 26, 28, 26, 35, 26, + 26, 25, 33, 25, 26, 25, 33, 25, + 35, 33, 45, 33, 35, 33, 45, 33, + 26, 25, 33, 25, 26, 25, 33, 25, + + 32, 30, 40, 30, 32, 30, 40, 30, + 30, 28, 38, 28, 30, 28, 38, 28, + 40, 38, 51, 38, 40, 38, 51, 38, + 30, 28, 38, 28, 30, 28, 38, 28, + 32, 30, 40, 30, 32, 30, 40, 30, + 30, 28, 38, 28, 30, 28, 38, 28, + 40, 38, 51, 38, 40, 38, 51, 38, + 30, 28, 38, 28, 30, 28, 38, 28, + + 36, 34, 46, 34, 36, 
34, 46, 34, + 34, 32, 43, 32, 34, 32, 43, 32, + 46, 43, 58, 43, 46, 43, 58, 43, + 34, 32, 43, 32, 34, 32, 43, 32, + 36, 34, 46, 34, 36, 34, 46, 34, + 34, 32, 43, 32, 34, 32, 43, 32, + 46, 43, 58, 43, 46, 43, 58, 43, + 34, 32, 43, 32, 34, 32, 43, 32, + +}; diff --git a/common/ih264_common_tables.h b/common/ih264_common_tables.h new file mode 100755 index 0000000..3127a2c --- /dev/null +++ b/common/ih264_common_tables.h @@ -0,0 +1,136 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_common_tables.h +* +* @brief +* Common tables +* +* @author +* Harish +* +* @par List of Functions: +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef _IH264_COMMON_TABLES_H_ +#define _IH264_COMMON_TABLES_H_ + + +/*****************************************************************************/ +/* Structures */ +/*****************************************************************************/ + +/** +****************************************************************************** + * @brief level tables +****************************************************************************** + */ +typedef struct +{ + /* level */ + IH264_LEVEL_T u4_level_idc; + + /* max macroblock processing rate */ + UWORD32 u4_max_mbps; + + /* max frame size in mbs */ + UWORD32 u4_max_fs; + + /* max dpb size / 768 */ + UWORD32 u4_max_dpb_size; + + /* max bit rate */ + UWORD32 u4_max_br; + + /* max cpb size */ + UWORD32 u4_max_cpb_size; + + /* max vertical MV component range */ + UWORD32 u4_max_mv_y; + +}level_tables_t; + +/*****************************************************************************/ +/* Extern global declarations */ +/*****************************************************************************/ + +/** + ****************************************************************************** + * @brief while encoding, basing on the input configuration parameters, the + * the level of the bitstream is computed basing on the table below. 
+ * input : table_idx + * output : level_idc or cpb size + * @remarks Table A-1 – level table limits + ****************************************************************************** + */ +extern const level_tables_t gas_ih264_lvl_tbl[16]; + +extern const WORD32 gai4_ih264_levels[]; +extern const WORD32 gai4_ih264_max_luma_pic_size[]; +extern const WORD32 gai4_ih264_max_wd_ht[]; +extern const WORD32 gai4_ih264_min_wd_ht[]; + +extern intra_mbtype_info_t gas_ih264_i_mbtype_info[]; +extern inter_mbtype_info_t gas_ih264_p_mbtype_info[]; +extern inter_mbtype_info_t gas_ih264_b_mbtype_info[]; +extern submbtype_info_t gas_ih264_p_submbtype_info[]; +extern submbtype_info_t gas_ih264_b_submbtype_info[]; + + +extern const UWORD8 gau1_ih264_inv_scan_prog4x4[]; +extern const UWORD8 gau1_ih264_inv_scan_int4x4[]; +extern const UWORD8 gau1_ih264_inv_scan_prog8x8_cavlc[64]; +extern const UWORD8 gau1_ih264_inv_scan_int8x8_cavlc[64]; +extern const UWORD8 gau1_ih264_inv_scan_prog8x8_cabac[64]; +extern const UWORD8 gau1_ih264_inv_scan_int8x8_cabac[64]; + +extern const UWORD8 *gpau1_ih264_inv_scan8x8[]; +extern const UWORD8 *gpau1_ih264_inv_scan4x4[]; + +extern const UWORD8 gau1_ih264_8x8_subblk_idx[]; + +extern const UWORD8 gau1_ih264_chroma_qp[]; + +extern const UWORD8 gau1_ih264_4x4_ngbr_avbl[16][16]; +extern const UWORD8 gau1_ih264_8x8_ngbr_avbl[16][4]; + + +extern const UWORD16 gau2_ih264_default_inter4x4_weight_scale[]; +extern const UWORD16 gau2_ih264_default_intra4x4_weight_scale[]; +extern const UWORD16 gau2_ih264_default_intra4x4_scaling_list[]; +extern const UWORD16 gau2_ih264_default_inter4x4_scaling_list[]; +extern const UWORD16 gau2_ih264_default_intra8x8_scaling_list[]; +extern const UWORD16 gau2_ih264_default_inter8x8_scaling_list[]; +extern const UWORD16 gau2_ih264_default_intra8x8_weight_scale[]; +extern const UWORD16 gau2_ih264_default_inter8x8_weight_scale[]; +extern const UWORD16 gau2_ih264_flat_4x4_weight_scale[]; +extern const UWORD16 
gau2_ih264_flat_8x8_weight_scale[]; + +extern const UWORD16 gau2_ih264_iquant_scale_matrix_4x4 [96]; +extern const UWORD16 gau2_ih264_iquant_scale_matrix_8x8 [384]; + +#endif /*_IH264_COMMON_TABLES_H_*/ diff --git a/common/ih264_deblk_edge_filters.c b/common/ih264_deblk_edge_filters.c new file mode 100755 index 0000000..d2ffefd --- /dev/null +++ b/common/ih264_deblk_edge_filters.c @@ -0,0 +1,2087 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/**************************************************************************** */ +/* */ +/* File Name : ih264_deblk_edge_filters.c */ +/* */ +/* Description : Contains function definitions for deblocking */ +/* */ +/* List of Functions : ih264_deblk_luma_vert_bs4() */ +/* ih264_deblk_luma_horz_bs4() */ +/* ih264_deblk_luma_vert_bslt4() */ +/* ih264_deblk_luma_horz_bslt4() */ +/* ih264_deblk_luma_vert_bs4_mbaff() */ +/* ih264_deblk_luma_vert_bslt4_mbaff() */ +/* ih264_deblk_chroma_vert_bs4_bp() */ +/* ih264_deblk_chroma_horz_bs4_bp() */ +/* ih264_deblk_chroma_vert_bslt4_bp() */ +/* ih264_deblk_chroma_horz_bslt4_bp() */ +/* ih264_deblk_chroma_vert_bs4_mbaff_bp() */ +/* ih264_deblk_chroma_vert_bslt4_mbaff_bp() */ +/* ih264_deblk_chroma_vert_bs4() */ +/* ih264_deblk_chroma_horz_bs4() */ +/* ih264_deblk_chroma_vert_bslt4() */ +/* ih264_deblk_chroma_horz_bslt4() */ +/* ih264_deblk_chroma_vert_bs4_mbaff() */ +/* ih264_deblk_chroma_vert_bslt4_mbaff() */ +/* */ +/* Issues / Problems : None */ +/* */ +/* Revision History : */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 28 11 2013 Ittiam Draft */ +/* 29 12 2014 Kaushik Added double-call vertical */ +/* Senthoor deblocking and high profile */ +/* deblocking functions */ +/* */ +/******************************************************************************/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_platform_macros.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264_macros.h" + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + 
/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_luma_vert_bs4()                              */
/*                                                                           */
/*  Description   : This function performs filtering of a luma block         */
/*                  vertical edge when the boundary strength is set to 4     */
/*                  (the strongest filtering mode).                          */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to the src sample q0                  */
/*                  src_strd - source stride                                 */
/*                  alpha    - alpha threshold for the boundary              */
/*                  beta     - beta threshold for the boundary               */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
/*                  title "Filtering process for edges for bS equal to 4"    */
/*                  in ITU T Rec H.264. Pixels are modified in place.        */
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         28 11 2013   Ittiam          Draft                                */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_luma_vert_bs4(UWORD8 *pu1_src,
                               WORD32 src_strd,
                               WORD32 alpha,
                               WORD32 beta)
{
    UWORD8 p3, p2, p1, p0, q0, q1, q2, q3;
    WORD32 pos_p3, pos_p2, pos_p1, pos_p0;
    WORD32 pos_q0, pos_q1, pos_q2, pos_q3;
    UWORD8 a_p, a_q;                 /* threshold variables: |p2-p0|, |q2-q0| */
    WORD32 blk_strd = src_strd << 2; /* block_increment = src_strd * 4 */
    UWORD8 *pu1_src_temp;
    WORD8 i = 0, edge;

    /* Horizontal offsets (relative to pu1_src, which points at q0) of the
     * eight samples straddling the vertical edge: p3 p2 p1 p0 | q0 q1 q2 q3 */
    pos_q0 = 0;
    pos_q1 = 1;
    pos_q2 = 2;
    pos_q3 = 3;
    pos_p0 = -1;
    pos_p1 = -2;
    pos_p2 = -3;
    pos_p3 = -4;

    /* Four 4x4-block edges of four rows each: 16 rows are filtered */
    for(edge = 0; edge < 4; edge++, pu1_src += blk_strd)
    {
        pu1_src_temp = pu1_src;
        for(i = 0; i < 4; ++i, pu1_src_temp += src_strd)
        {
            q0 = pu1_src_temp[pos_q0];
            q1 = pu1_src_temp[pos_q1];
            p0 = pu1_src_temp[pos_p0];
            p1 = pu1_src_temp[pos_p1];

            /* Filter decision: skip the row unless the edge looks like a
             * blocking artifact (small gradients around a step at the edge) */
            if((ABS(p0 - q0) >= alpha) ||
               (ABS(q1 - q0) >= beta) ||
               (ABS(p1 - p0) >= beta))
                continue;

            p2 = pu1_src_temp[pos_p2];
            p3 = pu1_src_temp[pos_p3];
            q2 = pu1_src_temp[pos_q2];
            q3 = pu1_src_temp[pos_q3];

            if(ABS(p0 - q0) < ((alpha >> 2) + 2))
            {
                /* Threshold Variables */
                a_p = (UWORD8)ABS(p2 - p0);
                a_q = (UWORD8)ABS(q2 - q0);

                if(a_p < beta)
                {
                    /* strong p-side filtering: p0', p1', p2' */
                    pu1_src_temp[pos_p0] = ((p2 + X2(p1) + X2(p0) + X2(q0) + q1 +
                                    4) >> 3);
                    pu1_src_temp[pos_p1] = ((p2 + p1 + p0 + q0 + 2) >> 2);
                    pu1_src_temp[pos_p2] =
                                    ((X2(p3) + X3(p2) + p1 + p0 + q0 +
                                    4) >> 3);
                }
                else
                {
                    /* weak p-side filtering: p0' only */
                    pu1_src_temp[pos_p0] = ((X2(p1) + p0 + q1 + 2) >> 2);
                }

                if(a_q < beta)
                {
                    /* strong q-side filtering: q0', q1', q2' */
                    pu1_src_temp[pos_q0] = (p1 + X2(p0) + X2(q0) + X2(q1) + q2 +
                                    4) >> 3;
                    pu1_src_temp[pos_q1] = (p0 + q0 + q1 + q2 + 2) >> 2;
                    pu1_src_temp[pos_q2] = (X2(q3) + X3(q2) + q1 + q0 + p0 + 4)
                                    >> 3;
                }
                else
                {
                    /* weak q-side filtering: q0' only */
                    pu1_src_temp[pos_q0] = (X2(q1) + q0 + p1 + 2) >> 2;
                }
            }
            else
            {
                /* edge step too large for strong filtering: p0', q0' only */
                pu1_src_temp[pos_p0] = ((X2(p1) + p0 + q1 + 2) >> 2);
                pu1_src_temp[pos_q0] = (X2(q1) + q0 + p1 + 2) >> 2;
            }
        }
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_luma_horz_bs4()                              */
/*                                                                           */
/*  Description   : This function performs filtering of a luma block         */
/*                  horizontal edge when the boundary strength is set to 4.  */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to the src sample q0                  */
/*                  src_strd - source stride                                 */
/*                  alpha    - alpha value for the boundary                  */
/*                  beta     - beta value for the boundary                   */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
/*                  title "Filtering process for edges for bS equal to 4"    */
/*                  in ITU T Rec H.264.
*/
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         28 11 2013   Ittiam          Draft                                */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_luma_horz_bs4(UWORD8 *pu1_src,
                               WORD32 src_strd,
                               WORD32 alpha,
                               WORD32 beta)
{
    UWORD8 p3, p2, p1, p0, q0, q1, q2, q3;
    WORD32 pos_p3, pos_p2, pos_p1, pos_p0, pos_q0, pos_q1,
                    pos_q2, pos_q3;
    UWORD8 a_p, a_q;  /* threshold variables: |p2-p0|, |q2-q0| */
    UWORD8 *pu1_p3;   /* pointer to the src sample p3 */
    UWORD8 *pu1_p3_temp;
    UWORD8 *pu1_src_temp;
    WORD8 i = 0, edge;

    /* For a horizontal edge the p samples are the four rows above pu1_src;
     * pu1_p3 points at the topmost of them (p3). Offsets below are vertical
     * (multiples of the stride), relative to pu1_src (q side) or pu1_p3
     * (p side). */
    pu1_p3 = pu1_src - (src_strd << 2);
    pos_q0 = 0;
    pos_q1 = src_strd;
    pos_q2 = X2(src_strd);
    pos_q3 = X3(src_strd);
    pos_p0 = X3(src_strd);
    pos_p1 = X2(src_strd);
    pos_p2 = src_strd;
    pos_p3 = 0;

    /* Four 4x4-block edges of four columns each: 16 columns are filtered */
    for(edge = 0; edge < 4; edge++, pu1_src += 4, pu1_p3 += 4)
    {
        pu1_src_temp = pu1_src;
        pu1_p3_temp = pu1_p3;
        for(i = 0; i < 4; ++i, pu1_src_temp++, pu1_p3_temp++)
        {
            q0 = pu1_src_temp[pos_q0];
            q1 = pu1_src_temp[pos_q1];
            p0 = pu1_p3_temp[pos_p0];
            p1 = pu1_p3_temp[pos_p1];

            /* Filter decision: skip the column unless the edge looks like
             * a blocking artifact */
            if((ABS(p0 - q0) >= alpha) ||
               (ABS(q1 - q0) >= beta) ||
               (ABS(p1 - p0) >= beta))
                continue;

            p2 = pu1_p3_temp[pos_p2];
            p3 = pu1_p3_temp[pos_p3];
            q2 = pu1_src_temp[pos_q2];
            q3 = pu1_src_temp[pos_q3];

            if(ABS(p0 - q0) < ((alpha >> 2) + 2))
            {
                /* Threshold Variables */
                a_p = ABS(p2 - p0);
                a_q = ABS(q2 - q0);

                if((a_p < beta))
                {
                    /* strong p-side filtering: p0', p1', p2' */
                    pu1_p3_temp[pos_p0] = (p2 + X2(p1) + X2(p0) + X2(q0) + q1 +
                                    4) >> 3;
                    pu1_p3_temp[pos_p1] = (p2 + p1 + p0 + q0 + 2) >> 2;
                    pu1_p3_temp[pos_p2] =
                                    (X2(p3) + X3(p2) + p1 + p0 + q0 +
                                    4) >> 3;
                }
                else
                {
                    /* weak p-side filtering: p0' only */
                    pu1_p3_temp[pos_p0] = (X2(p1) + p0 + q1 + 2) >> 2;
                }

                if(a_q < beta)
                {
                    /* strong q-side filtering: q0', q1', q2' */
                    pu1_src_temp[pos_q0] = (p1 + X2(p0) + X2(q0) + X2(q1) +
                                    q2 + 4) >> 3;
                    pu1_src_temp[pos_q1] = (p0 + q0 + q1 + q2 + 2) >> 2;
                    pu1_src_temp[pos_q2] = (X2(q3) + X3(q2) + q1 + q0 + p0 +
                                    4) >> 3;
                }
                else
                {
                    /* weak q-side filtering: q0' only */
                    pu1_src_temp[pos_q0] = (X2(q1) + q0 + p1 + 2) >> 2;
                }
            }
            else
            {
                /* edge step too large for strong filtering: p0', q0' only */
                pu1_p3_temp[pos_p0] = (X2(p1) + p0 + q1 + 2) >> 2;
                pu1_src_temp[pos_q0] = (X2(q1) + q0 + p1 + 2) >> 2;
            }
        }
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_chroma_vert_bs4_bp()                         */
/*                                                                           */
/*  Description   : This function performs filtering of a chroma block       */
/*                  vertical edge when the boundary strength is set to 4.    */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to the src sample q0 of U             */
/*                  src_strd - source stride                                 */
/*                  alpha    - alpha value for the boundary                  */
/*                  beta     - beta value for the boundary                   */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
/*                  title "Filtering process for edges for bS equal to 4"    */
/*                  in ITU T Rec H.264.
*/
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         28 11 2013   Ittiam          Draft                                */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_chroma_vert_bs4_bp(UWORD8 *pu1_src,
                                    WORD32 src_strd,
                                    WORD32 alpha,
                                    WORD32 beta)
{
    UWORD8 *pu1_row;  /* start of the current row (q0 of U) */
    UWORD8 *pu1_q0;   /* q0 of the current component on the current row */
    UWORD8 p1, p0, q0, q1;
    WORD32 row, comp;

    /* The buffer holds interleaved UV samples: U at even byte offsets, V at
     * odd ones, so samples of one component are 2 bytes apart. The four
     * 2-row block edges form 8 consecutive rows, walked in a single loop. */
    for(row = 0; row < 8; row++)
    {
        pu1_row = pu1_src + row * src_strd;

        /* comp 0 is U, comp 1 is V; U is filtered before V on every row */
        for(comp = 0; comp < 2; comp++)
        {
            pu1_q0 = pu1_row + comp;

            q0 = pu1_q0[0];
            q1 = pu1_q0[2];
            p0 = pu1_q0[-2];
            p1 = pu1_q0[-4];

            /* Filter decision: only filter edges that look like blocking
             * artifacts */
            if((ABS(p0 - q0) < alpha) &&
               (ABS(q1 - q0) < beta) &&
               (ABS(p1 - p0) < beta))
            {
                /* p0' and q0' (chroma bS=4 modifies one sample per side) */
                pu1_q0[-2] = (X2(p1) + p0 + q1 + 2) >> 2;
                pu1_q0[0] = (X2(q1) + q0 + p1 + 2) >> 2;
            }
        }
    }
}

/*
 * ih264_deblk_chroma_horz_bs4_bp()
 *
 * Filters one horizontal chroma edge (interleaved UV) when the boundary
 * strength is 4.  See Sec. 8.7.2.4, "Filtering process for edges for bS
 * equal to 4", ITU-T Rec. H.264.
 *
 * pu1_src     : pointer to the src sample q0 of U (V follows at +1)
 * src_strd    : source stride
 * alpha, beta : filter-decision thresholds for this boundary
 */
void ih264_deblk_chroma_horz_bs4_bp(UWORD8 *pu1_src,
                                    WORD32 src_strd,
                                    WORD32 alpha,
                                    WORD32 beta)
{
    UWORD8 *pu1_src_u = pu1_src;     /* pointer to the src sample q0 of U */
    UWORD8 *pu1_src_v = pu1_src + 1; /* pointer to the src sample q0 of V */
    UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v;
    WORD32 pos_p1, pos_p0, pos_q0, pos_q1;
    UWORD8 *pu1_src_temp_u, *pu1_src_temp_v;
    UWORD8 *pu1_p1_u; /* pointer to the src sample p1 of U */
    UWORD8 *pu1_p1_v; /* pointer to the src sample p1 of V */
    UWORD8 *pu1_p1_temp_u, *pu1_p1_temp_v;
    WORD8 i = 0, edge;

    /* p1/p0 are addressed relative to a pointer two rows above the edge */
    pu1_p1_u = pu1_src_u - (src_strd << 1);
    pu1_p1_v = pu1_src_v - (src_strd << 1);
    pos_q0 = 0;
    pos_q1 = src_strd;
    pos_p0 = src_strd;
    pos_p1 = 0;

    /* Four edge segments; each advances 4 bytes = 2 UV sample pairs */
    for(edge = 0; edge < 4; edge++, pu1_src_u += 4, pu1_p1_u += 4,
        pu1_src_v += 4, pu1_p1_v += 4)
    {
        pu1_src_temp_u = pu1_src_u;
        pu1_p1_temp_u = pu1_p1_u;
        pu1_src_temp_v = pu1_src_v;
        pu1_p1_temp_v = pu1_p1_v;
        for(i = 0; i < 2; ++i, pu1_src_temp_u += 2, pu1_p1_temp_u += 2,
            pu1_src_temp_v += 2, pu1_p1_temp_v += 2)
        {
            q0_u = pu1_src_temp_u[pos_q0];
            q1_u = pu1_src_temp_u[pos_q1];
            p0_u = pu1_p1_temp_u[pos_p0];
            p1_u = pu1_p1_temp_u[pos_p1];

            q0_v = pu1_src_temp_v[pos_q0];
            q1_v = pu1_src_temp_v[pos_q1];
            p0_v = pu1_p1_temp_v[pos_p0];
            p1_v = pu1_p1_temp_v[pos_p1];

            /* Filter Decision (plane U) */
            if((ABS(p0_u - q0_u) < alpha) &&
               (ABS(q1_u - q0_u) < beta) &&
               (ABS(p1_u - p0_u) < beta))
            {
                /* p0' */
                pu1_p1_temp_u[pos_p0] = (X2(p1_u) + p0_u + q1_u + 2) >> 2;
                /* q0' */
                pu1_src_temp_u[pos_q0] = (X2(q1_u) + q0_u + p1_u + 2) >> 2;
            }

            /* Filter Decision (plane V) */
            if((ABS(p0_v - q0_v) < alpha) &&
               (ABS(q1_v - q0_v) < beta) &&
               (ABS(p1_v - p0_v) < beta))
            {
                /* p0' */
                pu1_p1_temp_v[pos_p0] = (X2(p1_v) + p0_v + q1_v + 2) >> 2;
                /* q0' */
                pu1_src_temp_v[pos_q0] = (X2(q1_v) + q0_v + p1_v + 2) >> 2;
            }
        }
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name      : ih264_deblk_luma_vert_bslt4()                       */
/*                                                                           */
/*  Description        : Filters a luma vertical edge with bS < 4;
                         see Sec. 8.7.2.3 of ITU-T Rec. H.264
                                                                             */

/*
 * ih264_deblk_luma_vert_bslt4()
 *
 * Filters one vertical luma edge (4 segments of 4 rows) when the boundary
 * strength is less than 4.  See Sec. 8.7.2.3, "Filtering process for edges
 * for bS less than 4", ITU-T Rec. H.264.
 *
 * pu1_src     : pointer to the src sample q0
 * src_strd    : source stride
 * alpha, beta : filter-decision thresholds for this boundary
 * u4_bs       : packed boundary strengths, one byte per 4-row segment
 *               (most significant byte = first segment)
 * pu1_cliptab : tc0 clipping table indexed by boundary strength
 */
void ih264_deblk_luma_vert_bslt4(UWORD8 *pu1_src,
                                 WORD32 src_strd,
                                 WORD32 alpha,
                                 WORD32 beta,
                                 UWORD32 u4_bs,
                                 const UWORD8 *pu1_cliptab)
{
    WORD8 i = 0, edge;
    UWORD8 p2, p1, p0, q0, q1, q2;
    WORD32 pos_p2, pos_p1, pos_p0, pos_q0, pos_q1, pos_q2;
    UWORD8 a_p, a_q;                 /* threshold variables */
    WORD32 blk_strd = src_strd << 2; /* block_increment = src_strd * 4 */
    UWORD8 *pu1_src_temp;
    WORD8 delta;
    WORD8 tc;
    WORD16 val;
    UWORD8 tc0, u1_bs;

    pos_q0 = 0;
    pos_q1 = 1;
    pos_q2 = 2;
    pos_p0 = -1;
    pos_p1 = -2;
    pos_p2 = -3;

    for(edge = 0; edge < 4; edge++, pu1_src += blk_strd)
    {
        pu1_src_temp = pu1_src;
        /* Boundary strength of this segment; 0 means the segment is skipped */
        u1_bs = (UWORD8)((u4_bs >> ((3 - edge) << 3)) & 0x0ff);
        if(!u1_bs)
            continue;
        /* tc0 */
        tc0 = pu1_cliptab[u1_bs];
        for(i = 0; i < 4; ++i, pu1_src_temp += src_strd)
        {
            q0 = pu1_src_temp[pos_q0];
            q1 = pu1_src_temp[pos_q1];
            p0 = pu1_src_temp[pos_p0];
            p1 = pu1_src_temp[pos_p1];

            /* Filter Decision */
            if((ABS(p0 - q0) >= alpha) ||
               (ABS(q1 - q0) >= beta) ||
               (ABS(p1 - p0) >= beta))
                continue;

            q2 = pu1_src_temp[pos_q2];
            p2 = pu1_src_temp[pos_p2];

            a_p = ABS(p2 - p0);
            a_q = ABS(q2 - q0);

            /* tc: tc0 extended by one per additional filtered sample */
            tc = tc0 + (a_p < beta) + (a_q < beta);

            val = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3);
            delta = CLIP3(-tc, tc, val);

            /* p0' */
            val = p0 + delta;
            pu1_src_temp[pos_p0] = CLIP_U8(val);
            /* q0' */
            val = q0 - delta;
            pu1_src_temp[pos_q0] = CLIP_U8(val);

            /* Luma only: p1/q1 adjusted when the inner samples are smooth */
            if(a_p < beta)
            {
                /* p1' */
                val = ((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1);
                pu1_src_temp[pos_p1] += CLIP3(-tc0, tc0, val);
            }

            if(a_q < beta)
            {
                /* q1' */
                val = ((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1);
                pu1_src_temp[pos_q1] += CLIP3(-tc0, tc0, val);
            }
        }
    }
}

/*
 * ih264_deblk_chroma_vert_bslt4_bp()
 *
 * Filters one vertical chroma edge (interleaved UV) when the boundary
 * strength is less than 4.  See Sec. 8.7.2.3, ITU-T Rec. H.264.
 *
 * pu1_src     : pointer to the src sample q0 of U (V follows at +1)
 * src_strd    : source stride
 * alpha, beta : filter-decision thresholds for this boundary
 * u4_bs       : packed boundary strengths, one byte per segment
 * pu1_cliptab : tc0 clipping table indexed by boundary strength
 */
void ih264_deblk_chroma_vert_bslt4_bp(UWORD8 *pu1_src,
                                      WORD32 src_strd,
                                      WORD32 alpha,
                                      WORD32 beta,
                                      UWORD32 u4_bs,
                                      const UWORD8 *pu1_cliptab)
{
    UWORD8 *pu1_src_u = pu1_src;     /* Pointer to the src sample q0 of plane U */
    UWORD8 *pu1_src_v = pu1_src + 1; /* Pointer to the src sample q0 of plane V */
    UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v;
    WORD32 blk_strd = src_strd << 1; /* block_increment = src_strd * (4 >> 1) */
    WORD32 pos_p1, pos_p0, pos_q0, pos_q1;
    UWORD8 *pu1_src_temp_u, *pu1_src_temp_v;
    WORD8 i = 0, edge;
    WORD8 delta;
    WORD8 tc;
    WORD16 val;
    UWORD8 tc0, u1_bs;

    pos_q0 = 0;
    pos_q1 = 2;
    pos_p0 = -2;
    pos_p1 = -4;

    for(edge = 0; edge < 4;
        edge++, pu1_src_u += blk_strd, pu1_src_v += blk_strd)
    {
        pu1_src_temp_u = pu1_src_u;
        pu1_src_temp_v = pu1_src_v;
        /* Boundary strength of this segment; 0 means skip */
        u1_bs = (UWORD8)((u4_bs >> ((3 - edge) << 3)) & 0x0ff);
        if(!u1_bs)
            continue;
        /* tc0 */
        tc0 = pu1_cliptab[u1_bs];
        /* chroma always uses tc0 + 1 (no p1/q1 smoothness test) */
        tc = tc0 + 1;
        for(i = 0; i < 2; ++i, pu1_src_temp_u += src_strd, pu1_src_temp_v +=
            src_strd)
        {
            q0_u = pu1_src_temp_u[pos_q0];
            q1_u = pu1_src_temp_u[pos_q1];
            p0_u = pu1_src_temp_u[pos_p0];
            p1_u = pu1_src_temp_u[pos_p1];

            q0_v = pu1_src_temp_v[pos_q0];
            q1_v = pu1_src_temp_v[pos_q1];
            p0_v = pu1_src_temp_v[pos_p0];
            p1_v = pu1_src_temp_v[pos_p1];

            /* Filter Decision (plane U) */
            if((ABS(p0_u - q0_u) < alpha) &&
               (ABS(q1_u - q0_u) < beta) &&
               (ABS(p1_u - p0_u) < beta))
            {
                val = ((((q0_u - p0_u) << 2) + (p1_u - q1_u) + 4) >> 3);
                delta = CLIP3(-tc, tc, val);
                /* p0' */
                val = p0_u + delta;
                pu1_src_temp_u[pos_p0] = CLIP_U8(val);
                /* q0' */
                val = q0_u - delta;
                pu1_src_temp_u[pos_q0] = CLIP_U8(val);
            }

            /* Filter Decision (plane V) */
            if((ABS(p0_v - q0_v) < alpha) &&
               (ABS(q1_v - q0_v) < beta) &&
               (ABS(p1_v - p0_v) < beta))
            {
                val = ((((q0_v - p0_v) << 2) + (p1_v - q1_v) + 4) >> 3);
                delta = CLIP3(-tc, tc, val);
                /* p0' */
                val = p0_v + delta;
                pu1_src_temp_v[pos_p0] = CLIP_U8(val);
                /* q0' */
                val = q0_v - delta;
                pu1_src_temp_v[pos_q0] = CLIP_U8(val);
            }
        }
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name      : ih264_deblk_luma_horz_bslt4()                       */
/*                                                                           */
/*  Description        : Filters a luma horizontal edge with bS < 4;
                         see Sec. 8.7.2.3 of ITU-T Rec. H.264
*/ +/* */ +/* Outputs : None */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 28 11 2013 Ittiam Draft */ +/* */ +/*****************************************************************************/ +void ih264_deblk_luma_horz_bslt4(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 alpha, + WORD32 beta, + UWORD32 u4_bs, + const UWORD8 *pu1_cliptab) +{ + UWORD8 p2, p1, p0, q0, q1, q2; + WORD32 pos_p2, pos_p1, pos_p0, pos_q0, pos_q1, pos_q2; + UWORD8 a_p, a_q; /* Threshold variables */ + UWORD8 *pu1_p2; /* Pointer to the src sample p2 */ + UWORD8 *pu1_p2_temp; + UWORD8 *pu1_src_temp; + WORD8 i = 0, edge; + WORD8 delta; + WORD8 tc; + WORD16 val; + UWORD8 tc0, u1_bs; + + pu1_p2 = pu1_src - (src_strd << 2); + pos_q0 = 0; + pos_q1 = src_strd; + pos_q2 = X2(src_strd); + pos_p0 = X3(src_strd); + pos_p1 = X2(src_strd); + pos_p2 = src_strd; + + for(edge = 0; edge < 4; edge++, pu1_src += 4, pu1_p2 += 4) + { + pu1_src_temp = pu1_src; + pu1_p2_temp = pu1_p2; + + /* Filter Decision */ + u1_bs = (UWORD8)((u4_bs >> ((3 - edge) << 3)) & 0x0ff); + if(!u1_bs) + continue; + /* tc0 */ + tc0 = pu1_cliptab[u1_bs]; + + for(i = 0; i < 4; ++i, pu1_src_temp++, pu1_p2_temp++) + { + q0 = pu1_src_temp[pos_q0]; + q1 = pu1_src_temp[pos_q1]; + p0 = pu1_p2_temp[pos_p0]; + p1 = pu1_p2_temp[pos_p1]; + + /* Filter Decision */ + if((ABS(p0 - q0) >= alpha) || + (ABS(q1 - q0) >= beta) || + (ABS(p1 - p0) >= beta)) + continue; + + q2 = pu1_src_temp[pos_q2]; + p2 = pu1_p2_temp[pos_p2]; + + a_p = ABS(p2 - p0); + a_q = ABS(q2 - q0); + + /* tc */ + tc = tc0 + (a_p < beta) + (a_q < beta); + val = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3); + delta = CLIP3(-tc, tc, val); + /* p0' */ + val = p0 + delta; + pu1_p2_temp[pos_p0] = CLIP_U8(val); + /* q0' */ + val = q0 - delta; + pu1_src_temp[pos_q0] = CLIP_U8(val); + + /* Luma */ + if(a_p < beta) + { + /* p1' */ + val = ((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 
1); + pu1_p2_temp[pos_p1] += CLIP3(-tc0, tc0, val); + } + + if(a_q < beta) + { + /* q1' */ + val = ((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1); + pu1_src_temp[pos_q1] += CLIP3(-tc0, tc0, val); + } + } + } +} + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_deblk_chroma_horz_bslt4_bp() */ +/* */ +/* Description : This function performs filtering of a chroma block */ +/* horizontal edge when boundary strength is less than 4. */ +/* */ +/* Inputs : pu1_src - pointer to the src sample q0 of U */ +/* src_strd - source stride */ +/* alpha - alpha value for the boundary */ +/* beta - beta value for the boundary */ +/* u4_bs - packed Boundary strength array */ +/* pu1_cliptab - tc0_table */ +/* */ +/* Globals : None */ +/* */ +/* Processing : This operation is described in Sec. 8.7.2.3 under the */ +/* title "Filtering process for edges for bS less than 4" */ +/* in ITU T Rec H.264. */ +/* */ +/* Outputs : None */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 28 11 2013 Ittiam Draft */ +/* */ +/*****************************************************************************/ +void ih264_deblk_chroma_horz_bslt4_bp(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 alpha, + WORD32 beta, + UWORD32 u4_bs, + const UWORD8 *pu1_cliptab) +{ + UWORD8 *pu1_src_u = pu1_src; /* Pointer to the src sample q0 of plane U*/ + UWORD8 *pu1_src_v = pu1_src + 1; /* Pointer to the src sample q0 of plane V*/ + UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v; + WORD32 pos_p1, pos_p0, pos_q0, pos_q1; + UWORD8 *pu1_src_temp_u, *pu1_src_temp_v; + UWORD8 *pu1_p1_u; /* Pointer to the src sample p1 of plane U*/ + UWORD8 *pu1_p1_v; /* Pointer to the src sample p1 of plane V*/ + UWORD8 *pu1_p1_temp_u, *pu1_p1_temp_v; + WORD8 i = 0, edge; + WORD8 delta; + WORD8 tc; + WORD16 val; + UWORD8 u1_bs; + UWORD8 tc0; + + pu1_p1_u = 
pu1_src_u - (src_strd << 1); + pu1_p1_v = pu1_src_v - (src_strd << 1); + pos_q0 = 0; + pos_q1 = src_strd; + pos_p0 = src_strd; + pos_p1 = 0; + + for(edge = 0; edge < 4; edge++, pu1_src_u += 4, pu1_p1_u += 4, + pu1_src_v += 4, pu1_p1_v += 4) + { + pu1_src_temp_u = pu1_src_u; + pu1_p1_temp_u = pu1_p1_u; + pu1_src_temp_v = pu1_src_v; + pu1_p1_temp_v = pu1_p1_v; + + /* Filter Decision */ + u1_bs = (UWORD8)((u4_bs >> ((3 - edge) << 3)) & 0x0ff); + if(!u1_bs) + continue; + /* tc0 */ + tc0 = pu1_cliptab[u1_bs]; + + for(i = 0; i < 2; ++i, pu1_src_temp_u += 2, pu1_p1_temp_u += 2, + pu1_src_temp_v += 2, pu1_p1_temp_v += 2) + { + q0_u = pu1_src_temp_u[pos_q0]; + q1_u = pu1_src_temp_u[pos_q1]; + p0_u = pu1_p1_temp_u[pos_p0]; + p1_u = pu1_p1_temp_u[pos_p1]; + + q0_v = pu1_src_temp_v[pos_q0]; + q1_v = pu1_src_temp_v[pos_q1]; + p0_v = pu1_p1_temp_v[pos_p0]; + p1_v = pu1_p1_temp_v[pos_p1]; + + /* tc */ + tc = tc0 + 1; + /* Filter Decision */ + if(ABS(p0_u - q0_u) < alpha && ABS(q1_u - q0_u) < beta + && ABS(p1_u - p0_u) < beta) + { + val = ((((q0_u - p0_u) << 2) + (p1_u - q1_u) + 4) >> 3); + delta = CLIP3(-tc, tc, val); + /* p0' */ + val = p0_u + delta; + pu1_p1_temp_u[pos_p0] = CLIP_U8(val); + /* q0' */ + val = q0_u - delta; + pu1_src_temp_u[pos_q0] = CLIP_U8(val); + } + /* Filter Decision */ + if(ABS(p0_v - q0_v) < alpha && ABS(q1_v - q0_v) < beta + && ABS(p1_v - p0_v) < beta) + { + val = ((((q0_v - p0_v) << 2) + (p1_v - q1_v) + 4) >> 3); + delta = CLIP3(-tc, tc, val); + /* p0' */ + val = p0_v + delta; + pu1_p1_temp_v[pos_p0] = CLIP_U8(val); + /* q0' */ + val = q0_v - delta; + pu1_src_temp_v[pos_q0] = CLIP_U8(val); + } + } + } +} + +/*****************************************************************************/ +/* Function Definitions for vertical edge deblocking for double-call */ +/*****************************************************************************/ + +/*****************************************************************************/ +/* */ +/* Function Name : 
ih264_deblk_luma_vert_bs4_mbaff()                     */
/*                                                                           */
/*  Description        : Filters a luma vertical edge with bS = 4, MBAFF     */
/*                       variant (call twice, one field each); see           */
/*                       Sec. 8.7.2.3 of ITU-T Rec. H.264.                   */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_luma_vert_bs4_mbaff(UWORD8 *pu1_src,
                                     WORD32 src_strd,
                                     WORD32 alpha,
                                     WORD32 beta)
{
    UWORD8 p3, p2, p1, p0, q0, q1, q2, q3;
    WORD32 pos_p3, pos_p2, pos_p1, pos_p0;
    WORD32 pos_q0, pos_q1, pos_q2, pos_q3;
    UWORD8 a_p, a_q;                 /* threshold variables */
    WORD32 blk_strd = src_strd << 1; /* block_increment = src_strd * 2 */
    UWORD8 *pu1_src_temp;
    WORD8 i = 0, edge;

    pos_q0 = 0;
    pos_q1 = 1;
    pos_q2 = 2;
    pos_q3 = 3;
    pos_p0 = -1;
    pos_p1 = -2;
    pos_p2 = -3;
    pos_p3 = -4;

    /* MBAFF: 4 edge segments of 2 rows each (half of the non-MBAFF case) */
    for(edge = 0; edge < 4; edge++, pu1_src += blk_strd)
    {
        pu1_src_temp = pu1_src;
        for(i = 0; i < 2; ++i, pu1_src_temp += src_strd)
        {
            q0 = pu1_src_temp[pos_q0];
            q1 = pu1_src_temp[pos_q1];
            p0 = pu1_src_temp[pos_p0];
            p1 = pu1_src_temp[pos_p1];

            /* Filter Decision */
            if((ABS(p0 - q0) >= alpha) ||
               (ABS(q1 - q0) >= beta) ||
               (ABS(p1 - p0) >= beta))
                continue;

            p2 = pu1_src_temp[pos_p2];
            p3 = pu1_src_temp[pos_p3];
            q2 = pu1_src_temp[pos_q2];
            q3 = pu1_src_temp[pos_q3];

            /* Strong-filter eligibility: edge gradient well below alpha */
            if(ABS(p0 - q0) < ((alpha >> 2) + 2))
            {
                /* Threshold Variables */
                a_p = (UWORD8)ABS(p2 - p0);
                a_q = (UWORD8)ABS(q2 - q0);

                if(a_p < beta)
                {
                    /* p0', p1', p2' */
                    pu1_src_temp[pos_p0] = ((p2 + X2(p1) + X2(p0) + X2(q0) + q1
                                    + 4) >> 3);
                    pu1_src_temp[pos_p1] = ((p2 + p1 + p0 + q0 + 2) >> 2);
                    pu1_src_temp[pos_p2] =
                                    ((X2(p3) + X3(p2) + p1 + p0 + q0
                                    + 4) >> 3);
                }
                else
                {
                    /* p0' */
                    pu1_src_temp[pos_p0] = ((X2(p1) + p0 + q1 + 2) >> 2);
                }

                if(a_q < beta)
                {
                    /* q0', q1', q2' */
                    pu1_src_temp[pos_q0] = (p1 + X2(p0) + X2(q0) + X2(q1) + q2
                                    + 4) >> 3;
                    pu1_src_temp[pos_q1] = (p0 + q0 + q1 + q2 + 2) >> 2;
                    pu1_src_temp[pos_q2] = (X2(q3) + X3(q2) + q1 + q0 + p0 + 4)
                                    >> 3;
                }
                else
                {
                    /* q0' */
                    pu1_src_temp[pos_q0] = (X2(q1) + q0 + p1 + 2) >> 2;
                }
            }
            else
            {
                /* Weak variant: p0', q0' only */
                pu1_src_temp[pos_p0] = ((X2(p1) + p0 + q1 + 2) >> 2);
                pu1_src_temp[pos_q0] = (X2(q1) + q0 + p1 + 2) >> 2;
            }
        }
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name      : ih264_deblk_chroma_vert_bs4_mbaff_bp()              */
/*                                                                           */
/*  Description        : Filters a chroma vertical edge with bS = 4, MBAFF
                         variant; see Sec. 8.7.2.4 of ITU-T Rec. H.264
*/ +/* */ +/* Outputs : None */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 29 12 2014 Kaushik Draft */ +/* Senthoor */ +/* */ +/*****************************************************************************/ +void ih264_deblk_chroma_vert_bs4_mbaff_bp(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 alpha, + WORD32 beta) +{ + UWORD8 *pu1_src_u = pu1_src; /* Pointer to the src sample q0 of U */ + UWORD8 *pu1_src_v = pu1_src + 1; /* Pointer to the src sample q0 of V */ + UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v; + WORD32 blk_strd = src_strd; + WORD32 pos_p1, pos_p0, pos_q0, pos_q1; + UWORD8 *pu1_src_temp_u, *pu1_src_temp_v; + WORD8 edge; + + pos_q0 = 0; + pos_q1 = 2; + pos_p0 = -2; + pos_p1 = -4; + + for(edge = 0; edge < 4; + edge++, pu1_src_u += blk_strd, pu1_src_v += blk_strd) + { + pu1_src_temp_u = pu1_src_u; + pu1_src_temp_v = pu1_src_v; + + q0_u = pu1_src_temp_u[pos_q0]; + q1_u = pu1_src_temp_u[pos_q1]; + p0_u = pu1_src_temp_u[pos_p0]; + p1_u = pu1_src_temp_u[pos_p1]; + q0_v = pu1_src_temp_v[pos_q0]; + q1_v = pu1_src_temp_v[pos_q1]; + p0_v = pu1_src_temp_v[pos_p0]; + p1_v = pu1_src_temp_v[pos_p1]; + + /* Filter Decision */ + if((ABS(p0_u - q0_u) < alpha) && + (ABS(q1_u - q0_u) < beta) && + (ABS(p1_u - p0_u) < beta)) + { + /* p0' */ + pu1_src_temp_u[pos_p0] = ((X2(p1_u) + p0_u + q1_u + 2) >> 2); + /* q0' */ + pu1_src_temp_u[pos_q0] = (X2(q1_u) + q0_u + p1_u + 2) >> 2; + } + + /* Filter Decision */ + if(ABS(p0_v - q0_v) < alpha && ABS(q1_v - q0_v) < beta + && ABS(p1_v - p0_v) < beta) + { + /* p0' */ + pu1_src_temp_v[pos_p0] = ((X2(p1_v) + p0_v + q1_v + 2) >> 2); + /* q0' */ + pu1_src_temp_v[pos_q0] = (X2(q1_v) + q0_v + p1_v + 2) >> 2; + } + } +} + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_deblk_luma_vert_bslt4_mbaff() */ +/* */ +/* Description : This function performs 
filtering of a luma block vertical edge when bS < 4 (MBAFF variant).         */
/*                       See Sec. 8.7.2.3 of ITU-T Rec. H.264.               */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_luma_vert_bslt4_mbaff(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD32 alpha,
                                       WORD32 beta,
                                       UWORD32 u4_bs,
                                       const UWORD8 *pu1_cliptab)
{
    WORD8 i = 0, edge;
    UWORD8 p2, p1, p0, q0, q1, q2;
    WORD32 pos_p2, pos_p1, pos_p0, pos_q0, pos_q1, pos_q2;
    UWORD8 a_p, a_q;                 /* Threshold variables */
    WORD32 blk_strd = src_strd << 1; /* block_increment = src_strd * 2 */
    UWORD8 *pu1_src_temp;
    WORD8 delta;
    WORD8 tc;
    WORD16 val;
    UWORD8 tc0, u1_bs;

    pos_q0 = 0;
    pos_q1 = 1;
    pos_q2 = 2;
    pos_p0 = -1;
    pos_p1 = -2;
    pos_p2 = -3;

    /* MBAFF: 4 edge segments of 2 rows each */
    for(edge = 0; edge < 4; edge++, pu1_src += blk_strd)
    {
        pu1_src_temp = pu1_src;
        /* Boundary strength of this segment; 0 means skip */
        u1_bs = (UWORD8)((u4_bs >> ((3 - edge) << 3)) & 0x0ff);
        if(!u1_bs)
            continue;
        /* tc0 */
        tc0 = pu1_cliptab[u1_bs];
        for(i = 0; i < 2; ++i, pu1_src_temp += src_strd)
        {
            q0 = pu1_src_temp[pos_q0];
            q1 = pu1_src_temp[pos_q1];
            p0 = pu1_src_temp[pos_p0];
            p1 = pu1_src_temp[pos_p1];

            /* Filter Decision */
            if((ABS(p0 - q0) >= alpha) ||
               (ABS(q1 - q0) >= beta) ||
               (ABS(p1 - p0) >= beta))
                continue;

            q2 = pu1_src_temp[pos_q2];
            p2 = pu1_src_temp[pos_p2];

            a_p = ABS(p2 - p0);
            a_q = ABS(q2 - q0);

            /* tc: tc0 extended by one per additional filtered sample */
            tc = tc0 + (a_p < beta) + (a_q < beta);

            val = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3);
            delta = CLIP3(-tc, tc, val);
            /* p0' */
            val = p0 + delta;
            pu1_src_temp[pos_p0] = CLIP_U8(val);
            /* q0' */
            val = q0 - delta;
            pu1_src_temp[pos_q0] = CLIP_U8(val);

            /* Luma only: p1/q1 adjusted when the inner samples are smooth */
            if(a_p < beta)
            {
                /* p1' */
                val = ((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1);
                pu1_src_temp[pos_p1] += CLIP3(-tc0, tc0, val);
            }

            if(a_q < beta)
            {
                /* q1' */
                val = ((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1);
                pu1_src_temp[pos_q1] += CLIP3(-tc0, tc0, val);
            }
        }
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name      : ih264_deblk_chroma_vert_bslt4_mbaff_bp()            */
/*                                                                           */
/*  Description        : Filters a chroma vertical edge with bS < 4, MBAFF
                         variant; described in Sec.
8.7.2.3, "Filtering process for edges for bS less than 4", ITU-T H.264.      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_chroma_vert_bslt4_mbaff_bp(UWORD8 *pu1_src,
                                            WORD32 src_strd,
                                            WORD32 alpha,
                                            WORD32 beta,
                                            UWORD32 u4_bs,
                                            const UWORD8 *pu1_cliptab)
{
    UWORD8 *pu1_src_u = pu1_src;     /* Pointer to the src sample q0 of plane U */
    UWORD8 *pu1_src_v = pu1_src + 1; /* Pointer to the src sample q0 of plane V */
    UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v;
    WORD32 blk_strd = src_strd; /* MBAFF: one chroma row per edge segment */
    WORD32 pos_p1, pos_p0, pos_q0, pos_q1;
    UWORD8 *pu1_src_temp_u, *pu1_src_temp_v;
    WORD8 edge;
    WORD8 delta;
    WORD8 tc;
    WORD16 val;
    UWORD8 tc0, u1_bs;

    pos_q0 = 0;
    pos_q1 = 2;
    pos_p0 = -2;
    pos_p1 = -4;

    for(edge = 0; edge < 4;
        edge++, pu1_src_u += blk_strd, pu1_src_v += blk_strd)
    {
        pu1_src_temp_u = pu1_src_u;
        pu1_src_temp_v = pu1_src_v;
        /* Boundary strength of this segment; 0 means skip */
        u1_bs = (UWORD8)((u4_bs >> ((3 - edge) << 3)) & 0x0ff);
        if(!u1_bs)
            continue;
        /* tc0 */
        tc0 = pu1_cliptab[u1_bs];
        /* chroma always uses tc0 + 1 */
        tc = tc0 + 1;

        q0_u = pu1_src_temp_u[pos_q0];
        q1_u = pu1_src_temp_u[pos_q1];
        p0_u = pu1_src_temp_u[pos_p0];
        p1_u = pu1_src_temp_u[pos_p1];

        q0_v = pu1_src_temp_v[pos_q0];
        q1_v = pu1_src_temp_v[pos_q1];
        p0_v = pu1_src_temp_v[pos_p0];
        p1_v = pu1_src_temp_v[pos_p1];

        /* Filter Decision (plane U) */
        if((ABS(p0_u - q0_u) < alpha) &&
           (ABS(q1_u - q0_u) < beta) &&
           (ABS(p1_u - p0_u) < beta))
        {
            val = ((((q0_u - p0_u) << 2) + (p1_u - q1_u) + 4) >> 3);
            delta = CLIP3(-tc, tc, val);
            /* p0' */
            val = p0_u + delta;
            pu1_src_temp_u[pos_p0] = CLIP_U8(val);
            /* q0' */
            val = q0_u - delta;
            pu1_src_temp_u[pos_q0] = CLIP_U8(val);
        }

        /* Filter Decision (plane V) */
        if((ABS(p0_v - q0_v) < alpha) &&
           (ABS(q1_v - q0_v) < beta) &&
           (ABS(p1_v - p0_v) < beta))
        {
            val = ((((q0_v - p0_v) << 2) + (p1_v - q1_v) + 4) >> 3);
            delta = CLIP3(-tc, tc, val);
            /* p0' */
            val = p0_v + delta;
            pu1_src_temp_v[pos_p0] = CLIP_U8(val);
            /* q0' */
            val = q0_v - delta;
            pu1_src_temp_v[pos_q0] = CLIP_U8(val);
        }
    }
}

/*****************************************************************************/
/* Function Definitions for chroma deblocking in high profile                */
/*****************************************************************************/

/*****************************************************************************/
/*                                                                           */
/*  Function Name      : ih264_deblk_chroma_vert_bs4()                       */
/*                                                                           */
/*  Description        : Filters a chroma vertical edge with bS = 4, with
                         separate alpha/beta thresholds for U and V.
*/ +/* */ +/* Outputs : None */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 29 12 2014 Kaushik Draft */ +/* Senthoor */ +/* */ +/*****************************************************************************/ +void ih264_deblk_chroma_vert_bs4(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 alpha_cb, + WORD32 beta_cb, + WORD32 alpha_cr, + WORD32 beta_cr) +{ + UWORD8 *pu1_src_u = pu1_src; /* Pointer to the src sample q0 of U */ + UWORD8 *pu1_src_v = pu1_src + 1; /* Pointer to the src sample q0 of V */ + UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v; + WORD32 blk_strd = src_strd << 1; /* block_increment = src_strd * 2*/ + WORD32 pos_p1, pos_p0, pos_q0, pos_q1; + UWORD8 *pu1_src_temp_u, *pu1_src_temp_v; + WORD8 i = 0, edge; + + pos_q0 = 0; + pos_q1 = 2; + pos_p0 = -2; + pos_p1 = -4; + + for(edge = 0; edge < 4; + edge++, pu1_src_u += blk_strd, pu1_src_v += blk_strd) + { + pu1_src_temp_u = pu1_src_u; + pu1_src_temp_v = pu1_src_v; + for(i = 0; i < 2; ++i, pu1_src_temp_u += src_strd, pu1_src_temp_v += + src_strd) + { + q0_u = pu1_src_temp_u[pos_q0]; + q1_u = pu1_src_temp_u[pos_q1]; + p0_u = pu1_src_temp_u[pos_p0]; + p1_u = pu1_src_temp_u[pos_p1]; + q0_v = pu1_src_temp_v[pos_q0]; + q1_v = pu1_src_temp_v[pos_q1]; + p0_v = pu1_src_temp_v[pos_p0]; + p1_v = pu1_src_temp_v[pos_p1]; + + /* Filter Decision */ + if((ABS(p0_u - q0_u) < alpha_cb) && + (ABS(q1_u - q0_u) < beta_cb) && + (ABS(p1_u - p0_u) < beta_cb)) + { + /* p0' */ + pu1_src_temp_u[pos_p0] = ((X2(p1_u) + p0_u + q1_u + 2) >> 2); + /* q0' */ + pu1_src_temp_u[pos_q0] = (X2(q1_u) + q0_u + p1_u + 2) >> 2; + } + + /* Filter Decision */ + if((ABS(p0_v - q0_v) < alpha_cr) && + (ABS(q1_v - q0_v) < beta_cr) && + (ABS(p1_v - p0_v) < beta_cr)) + { + /* p0' */ + pu1_src_temp_v[pos_p0] = ((X2(p1_v) + p0_v + q1_v + 2) >> 2); + /* q0' */ + pu1_src_temp_v[pos_q0] = (X2(q1_v) + q0_v + p1_v + 2) >> 2; + } + } + } +} 

/*
 * ih264_deblk_chroma_horz_bs4()
 *
 * Filters one horizontal chroma edge (interleaved UV) with bS = 4, high
 * profile: U and V carry independent alpha/beta thresholds.  See
 * Sec. 8.7.2.4, "Filtering process for edges for bS equal to 4",
 * ITU-T Rec. H.264.
 *
 * pu1_src            : pointer to the src sample q0 of U (V follows at +1)
 * src_strd           : source stride
 * alpha_cb, beta_cb  : thresholds for plane U
 * alpha_cr, beta_cr  : thresholds for plane V
 */
void ih264_deblk_chroma_horz_bs4(UWORD8 *pu1_src,
                                 WORD32 src_strd,
                                 WORD32 alpha_cb,
                                 WORD32 beta_cb,
                                 WORD32 alpha_cr,
                                 WORD32 beta_cr)
{
    UWORD8 *pu1_src_u = pu1_src;     /* Pointer to the src sample q0 of U */
    UWORD8 *pu1_src_v = pu1_src + 1; /* Pointer to the src sample q0 of V */
    UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v;
    WORD32 pos_p1, pos_p0, pos_q0, pos_q1;
    UWORD8 *pu1_src_temp_u, *pu1_src_temp_v;
    UWORD8 *pu1_p1_u; /* Pointer to the src sample p1 of U */
    UWORD8 *pu1_p1_v; /* Pointer to the src sample p1 of V */
    UWORD8 *pu1_p1_temp_u, *pu1_p1_temp_v;
    WORD8 i = 0, edge;

    /* p1/p0 are addressed relative to a pointer two rows above the edge */
    pu1_p1_u = pu1_src_u - (src_strd << 1);
    pu1_p1_v = pu1_src_v - (src_strd << 1);
    pos_q0 = 0;
    pos_q1 = src_strd;
    pos_p0 = src_strd;
    pos_p1 = 0;

    /* Four edge segments; each advances 4 bytes = 2 UV sample pairs */
    for(edge = 0; edge < 4; edge++, pu1_src_u += 4, pu1_p1_u += 4, pu1_src_v +=
        4, pu1_p1_v += 4)
    {
        pu1_src_temp_u = pu1_src_u;
        pu1_p1_temp_u = pu1_p1_u;
        pu1_src_temp_v = pu1_src_v;
        pu1_p1_temp_v = pu1_p1_v;
        for(i = 0; i < 2; ++i, pu1_src_temp_u += 2, pu1_p1_temp_u += 2,
            pu1_src_temp_v += 2, pu1_p1_temp_v += 2)
        {
            q0_u = pu1_src_temp_u[pos_q0];
            q1_u = pu1_src_temp_u[pos_q1];
            p0_u = pu1_p1_temp_u[pos_p0];
            p1_u = pu1_p1_temp_u[pos_p1];

            q0_v = pu1_src_temp_v[pos_q0];
            q1_v = pu1_src_temp_v[pos_q1];
            p0_v = pu1_p1_temp_v[pos_p0];
            p1_v = pu1_p1_temp_v[pos_p1];

            /* Filter Decision (plane U) */
            if(ABS(p0_u - q0_u) < alpha_cb && ABS(q1_u - q0_u) < beta_cb
               && ABS(p1_u - p0_u) < beta_cb)
            {
                /* p0' */
                pu1_p1_temp_u[pos_p0] = (X2(p1_u) + p0_u + q1_u + 2) >> 2;
                /* q0' */
                pu1_src_temp_u[pos_q0] = (X2(q1_u) + q0_u + p1_u + 2) >> 2;
            }

            /* Filter Decision (plane V) */
            if(ABS(p0_v - q0_v) < alpha_cr && ABS(q1_v - q0_v) < beta_cr
               && ABS(p1_v - p0_v) < beta_cr)
            {
                /* p0' */
                pu1_p1_temp_v[pos_p0] = (X2(p1_v) + p0_v + q1_v + 2) >> 2;
                /* q0' */
                pu1_src_temp_v[pos_q0] = (X2(q1_v) + q0_v + p1_v + 2) >> 2;
            }
        }
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name      : ih264_deblk_chroma_vert_bslt4()                     */
/*                                                                           */
/*  Description        : Filters a chroma vertical edge with bS < 4, high
                         profile; this operation is described in Sec.
8.7.2.3 under the    */
/*                  title "Filtering process for edges for bS less than 4"   */
/*                  in ITU T Rec H.264 with alpha and beta values different  */
/*                  in U and V.                                              */
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         29 12 2014   Kaushik         Draft                                */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_chroma_vert_bslt4(UWORD8 *pu1_src,
                                   WORD32 src_strd,
                                   WORD32 alpha_cb,
                                   WORD32 beta_cb,
                                   WORD32 alpha_cr,
                                   WORD32 beta_cr,
                                   UWORD32 u4_bs,
                                   const UWORD8 *pu1_cliptab_cb,
                                   const UWORD8 *pu1_cliptab_cr)
{
    UWORD8 *pu1_edge_u = pu1_src;     /* q0 of U on the current edge */
    UWORD8 *pu1_edge_v = pu1_src + 1; /* q0 of V on the current edge */
    WORD32 edge_inc = src_strd << 1;  /* two rows are filtered per edge */
    WORD8 edge, row;
    WORD8 delta;
    WORD8 tcb, tcr;
    WORD16 val;
    UWORD8 u1_bs;

    for(edge = 0; edge < 4;
        edge++, pu1_edge_u += edge_inc, pu1_edge_v += edge_inc)
    {
        UWORD8 *pu1_row_u = pu1_edge_u;
        UWORD8 *pu1_row_v = pu1_edge_v;

        /* One byte of boundary strength per edge, most significant first */
        u1_bs = (UWORD8)((u4_bs >> ((3 - edge) << 3)) & 0x0ff);
        if(!u1_bs)
            continue;

        /* Chroma clipping threshold tc = tc0 + 1 (Sec. 8.7.2.3), fixed per edge */
        tcb = pu1_cliptab_cb[u1_bs] + 1;
        tcr = pu1_cliptab_cr[u1_bs] + 1;

        for(row = 0; row < 2;
            row++, pu1_row_u += src_strd, pu1_row_v += src_strd)
        {
            UWORD8 q0_u = pu1_row_u[0];
            UWORD8 q1_u = pu1_row_u[2];
            UWORD8 p0_u = pu1_row_u[-2];
            UWORD8 p1_u = pu1_row_u[-4];
            UWORD8 q0_v = pu1_row_v[0];
            UWORD8 q1_v = pu1_row_v[2];
            UWORD8 p0_v = pu1_row_v[-2];
            UWORD8 p1_v = pu1_row_v[-4];

            /* U plane filter decision and clipped delta filter */
            if(ABS(p0_u - q0_u) < alpha_cb && ABS(q1_u - q0_u) < beta_cb
               && ABS(p1_u - p0_u) < beta_cb)
            {
                val = ((((q0_u - p0_u) << 2) + (p1_u - q1_u) + 4) >> 3);
                delta = CLIP3(-tcb, tcb, val);
                val = p0_u + delta;
                pu1_row_u[-2] = CLIP_U8(val); /* p0' */
                val = q0_u - delta;
                pu1_row_u[0] = CLIP_U8(val);  /* q0' */
            }

            /* V plane filter decision and clipped delta filter */
            if(ABS(p0_v - q0_v) < alpha_cr && ABS(q1_v - q0_v) < beta_cr
               && ABS(p1_v - p0_v) < beta_cr)
            {
                val = ((((q0_v - p0_v) << 2) + (p1_v - q1_v) + 4) >> 3);
                delta = CLIP3(-tcr, tcr, val);
                val = p0_v + delta;
                pu1_row_v[-2] = CLIP_U8(val); /* p0' */
                val = q0_v - delta;
                pu1_row_v[0] = CLIP_U8(val);  /* q0' */
            }
        }
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_chroma_horz_bslt4()                          */
/*                                                                           */
/*  Description   : This function performs filtering of a chroma block       */
/*                  horizontal edge when the boundary strength is less than  */
/*                  4 in high profile.                                       */
/*                                                                           */
/*  Inputs        : pu1_src - pointer to the src sample q0 of U              */
/*                  src_strd - source stride                                 */
/*                  alpha_cb - alpha value for the boundary in U             */
/*                  beta_cb - beta value for the boundary in U               */
/*                  alpha_cr - alpha value for the boundary in V             */
/*                  beta_cr - beta value for the boundary in V               */
/*                  u4_bs - packed Boundary strength array                   */
/*                  pu1_cliptab_cb - tc0_table for U                         */
/*                  pu1_cliptab_cr - tc0_table for V                         */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
/*                  title "Filtering process for edges for bS less than 4"   */
/*                  in ITU T Rec H.264 with alpha and beta values different  */
/*                  in U and V.
*/
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         29 12 2014   Kaushik         Draft                                */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_chroma_horz_bslt4(UWORD8 *pu1_src,
                                   WORD32 src_strd,
                                   WORD32 alpha_cb,
                                   WORD32 beta_cb,
                                   WORD32 alpha_cr,
                                   WORD32 beta_cr,
                                   UWORD32 u4_bs,
                                   const UWORD8 *pu1_cliptab_cb,
                                   const UWORD8 *pu1_cliptab_cr)
{
    UWORD8 *pu1_src_u = pu1_src;     /* Pointer to the src sample q0 of plane U*/
    UWORD8 *pu1_src_v = pu1_src + 1; /* Pointer to the src sample q0 of plane V*/
    UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v;
    WORD32 pos_p1, pos_p0, pos_q0, pos_q1;
    UWORD8 *pu1_src_temp_u, *pu1_src_temp_v;
    UWORD8 *pu1_p1_u; /* Pointer to the src sample p1 of plane U*/
    UWORD8 *pu1_p1_v; /* Pointer to the src sample p1 of plane V*/
    UWORD8 *pu1_p1_temp_u, *pu1_p1_temp_v;
    WORD8 i = 0, edge;
    WORD8 delta;
    WORD8 tcb, tcr;
    WORD16 val;
    UWORD8 u1_bs;

    /* p1 sits two rows above q0; p0 is indexed from the p1 pointer */
    pu1_p1_u = pu1_src_u - (src_strd << 1);
    pu1_p1_v = pu1_src_v - (src_strd << 1);
    pos_q0 = 0;
    pos_q1 = src_strd;
    pos_p0 = src_strd;
    pos_p1 = 0;

    for(edge = 0; edge < 4; edge++, pu1_src_u += 4, pu1_p1_u += 4,
        pu1_src_v += 4, pu1_p1_v += 4)
    {
        pu1_src_temp_u = pu1_src_u;
        pu1_p1_temp_u = pu1_p1_u;
        pu1_src_temp_v = pu1_src_v;
        pu1_p1_temp_v = pu1_p1_v;

        /* Filter Decision: one boundary-strength byte per edge, MSB first */
        u1_bs = (UWORD8)((u4_bs >> ((3 - edge) << 3)) & 0x0ff);
        if(!u1_bs)
            continue;
        /* tc = tc0 + 1 is constant for the whole edge; hoisted out of the  */
        /* sample loop (it used to be recomputed for every sample pair),    */
        /* matching ih264_deblk_chroma_vert_bslt4()                         */
        tcb = pu1_cliptab_cb[u1_bs] + 1;
        tcr = pu1_cliptab_cr[u1_bs] + 1;

        for(i = 0; i < 2; ++i, pu1_src_temp_u += 2, pu1_p1_temp_u += 2,
            pu1_src_temp_v += 2, pu1_p1_temp_v += 2)
        {
            q0_u = pu1_src_temp_u[pos_q0];
            q1_u = pu1_src_temp_u[pos_q1];
            p0_u = pu1_p1_temp_u[pos_p0];
            p1_u = pu1_p1_temp_u[pos_p1];

            q0_v = pu1_src_temp_v[pos_q0];
            q1_v = pu1_src_temp_v[pos_q1];
            p0_v = pu1_p1_temp_v[pos_p0];
            p1_v = pu1_p1_temp_v[pos_p1];

            /* Filter Decision for U */
            if(ABS(p0_u - q0_u) < alpha_cb && ABS(q1_u - q0_u) < beta_cb
               && ABS(p1_u - p0_u) < beta_cb)
            {
                val = ((((q0_u - p0_u) << 2) + (p1_u - q1_u) + 4) >> 3);
                delta = CLIP3(-tcb, tcb, val);
                /* p0' */
                val = p0_u + delta;
                pu1_p1_temp_u[pos_p0] = CLIP_U8(val);
                /* q0' */
                val = q0_u - delta;
                pu1_src_temp_u[pos_q0] = CLIP_U8(val);
            }
            /* Filter Decision for V */
            if(ABS(p0_v - q0_v) < alpha_cr && ABS(q1_v - q0_v) < beta_cr
               && ABS(p1_v - p0_v) < beta_cr)
            {
                val = ((((q0_v - p0_v) << 2) + (p1_v - q1_v) + 4) >> 3);
                delta = CLIP3(-tcr, tcr, val);
                /* p0' */
                val = p0_v + delta;
                pu1_p1_temp_v[pos_p0] = CLIP_U8(val);
                /* q0' */
                val = q0_v - delta;
                pu1_src_temp_v[pos_q0] = CLIP_U8(val);
            }
        }
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_chroma_vert_bs4_mbaff()                      */
/*                                                                           */
/*  Description   : This function performs filtering of a chroma block       */
/*                  vertical edge when boundary strength is set to 4 in high */
/*                  profile.                                                 */
/*                                                                           */
/*  Inputs        : pu1_src - pointer to the src sample q0 of U              */
/*                  src_strd - source stride                                 */
/*                  alpha_cb - alpha value for the boundary in U             */
/*                  beta_cb - beta value for the boundary in U               */
/*                  alpha_cr - alpha value for the boundary in V             */
/*                  beta_cr - beta value for the boundary in V               */
/*                  u4_bs - packed Boundary strength array                   */
/*                  pu1_cliptab_cb - tc0_table for U                         */
/*                  pu1_cliptab_cr - tc0_table for V                         */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : When the function is called twice, this operation is as  */
/*                  described in Sec. 8.7.2.4 under the title "Filtering     */
/*                  process for edges for bS equal to 4" in ITU T Rec H.264  */
/*                  with alpha and beta values different in U and V.
*/
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         29 12 2014   Kaushik         Draft                                */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_chroma_vert_bs4_mbaff(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD32 alpha_cb,
                                       WORD32 beta_cb,
                                       WORD32 alpha_cr,
                                       WORD32 beta_cr)
{
    /* MBAFF variant: a single row is filtered per edge, so the edge        */
    /* pointers advance one stride at a time and there is no inner loop.    */
    UWORD8 *pu1_row_u = pu1_src;     /* q0 of U on the current edge */
    UWORD8 *pu1_row_v = pu1_src + 1; /* q0 of V on the current edge */
    WORD8 edge;

    for(edge = 0; edge < 4;
        edge++, pu1_row_u += src_strd, pu1_row_v += src_strd)
    {
        UWORD8 q0_u = pu1_row_u[0];
        UWORD8 q1_u = pu1_row_u[2];
        UWORD8 p0_u = pu1_row_u[-2];
        UWORD8 p1_u = pu1_row_u[-4];
        UWORD8 q0_v = pu1_row_v[0];
        UWORD8 q1_v = pu1_row_v[2];
        UWORD8 p0_v = pu1_row_v[-2];
        UWORD8 p1_v = pu1_row_v[-4];

        /* U plane bS=4 decision and filter */
        if((ABS(p0_u - q0_u) < alpha_cb) &&
           (ABS(q1_u - q0_u) < beta_cb) &&
           (ABS(p1_u - p0_u) < beta_cb))
        {
            pu1_row_u[-2] = (X2(p1_u) + p0_u + q1_u + 2) >> 2; /* p0' */
            pu1_row_u[0] = (X2(q1_u) + q0_u + p1_u + 2) >> 2;  /* q0' */
        }

        /* V plane bS=4 decision and filter */
        if((ABS(p0_v - q0_v) < alpha_cr) &&
           (ABS(q1_v - q0_v) < beta_cr) &&
           (ABS(p1_v - p0_v) < beta_cr))
        {
            pu1_row_v[-2] = (X2(p1_v) + p0_v + q1_v + 2) >> 2; /* p0' */
            pu1_row_v[0] = (X2(q1_v) + q0_v + p1_v + 2) >> 2;  /* q0' */
        }
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name :
ih264_deblk_chroma_vert_bslt4_mbaff() */ +/* */ +/* Description : This function performs filtering of a chroma block */ +/* vertical edge when boundary strength is less than 4 in */ +/* high profile. */ +/* */ +/* Inputs : pu1_src - pointer to the src sample q0 of U */ +/* src_strd - source stride */ +/* alpha_cb - alpha value for the boundary in U */ +/* beta_cb - beta value for the boundary in U */ +/* alpha_cr - alpha value for the boundary in V */ +/* beta_cr - beta value for the boundary in V */ +/* u4_bs - packed Boundary strength array */ +/* pu1_cliptab_cb - tc0_table for U */ +/* pu1_cliptab_cr - tc0_table for V */ +/* */ +/* Globals : None */ +/* */ +/* Processing : When the function is called twice, this operation is as */ +/* described in Sec. 8.7.2.4 under the title "Filtering */ +/* process for edges for bS less than 4" in ITU T Rec H.264 */ +/* with alpha and beta values different in U and V. */ +/* */ +/* Outputs : None */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 29 12 2014 Kaushik Draft */ +/* Senthoor */ +/* */ +/*****************************************************************************/ +void ih264_deblk_chroma_vert_bslt4_mbaff(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 alpha_cb, + WORD32 beta_cb, + WORD32 alpha_cr, + WORD32 beta_cr, + UWORD32 u4_bs, + const UWORD8 *pu1_cliptab_cb, + const UWORD8 *pu1_cliptab_cr) +{ + UWORD8 *pu1_src_u = pu1_src; /* Pointer to the src sample q0 of plane U*/ + UWORD8 *pu1_src_v = pu1_src + 1; /* Pointer to the src sample q0 of plane V*/ + UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v; + WORD32 blk_strd = src_strd; + WORD32 pos_p1, pos_p0, pos_q0, pos_q1; + UWORD8 *pu1_src_temp_u, *pu1_src_temp_v; + WORD8 edge; + WORD8 delta; + WORD8 tcb, tcr; + WORD16 val; + UWORD8 tcb0, tcr0, u1_bs; + + pos_q0 = 0; + pos_q1 = 2; + pos_p0 = -2; + pos_p1 = -4; + + for(edge = 0; edge < 4; + edge++, 
pu1_src_u += blk_strd, pu1_src_v += blk_strd) + { + pu1_src_temp_u = pu1_src_u; + pu1_src_temp_v = pu1_src_v; + /* Filter Decision */ + u1_bs = (UWORD8)((u4_bs >> ((3 - edge) << 3)) & 0x0ff); + if(!u1_bs) + continue; + /* tc0 */ + tcb0 = pu1_cliptab_cb[u1_bs]; + tcr0 = pu1_cliptab_cr[u1_bs]; + tcb = tcb0 + 1; + tcr = tcr0 + 1; + q0_u = pu1_src_temp_u[pos_q0]; + q1_u = pu1_src_temp_u[pos_q1]; + p0_u = pu1_src_temp_u[pos_p0]; + p1_u = pu1_src_temp_u[pos_p1]; + + q0_v = pu1_src_temp_v[pos_q0]; + q1_v = pu1_src_temp_v[pos_q1]; + p0_v = pu1_src_temp_v[pos_p0]; + p1_v = pu1_src_temp_v[pos_p1]; + + /* Filter Decision */ + if((ABS(p0_u - q0_u) < alpha_cb) && + (ABS(q1_u - q0_u) < beta_cb) && + (ABS(p1_u - p0_u) < beta_cb)) + { + val = ((((q0_u - p0_u) << 2) + (p1_u - q1_u) + 4) >> 3); + delta = CLIP3(-tcb, tcb, val); + /* p0' */ + val = p0_u + delta; + pu1_src_temp_u[pos_p0] = CLIP_U8(val); + /* q0' */ + val = q0_u - delta; + pu1_src_temp_u[pos_q0] = CLIP_U8(val); + } + + /* Filter Decision */ + if((ABS(p0_v - q0_v) < alpha_cr) && + (ABS(q1_v - q0_v) < beta_cr) && + (ABS(p1_v - p0_v) < beta_cr)) + { + val = ((((q0_v - p0_v) << 2) + (p1_v - q1_v) + 4) >> 3); + delta = CLIP3(-tcr, tcr, val); + /* p0' */ + val = p0_v + delta; + pu1_src_temp_v[pos_p0] = CLIP_U8(val); + /* q0' */ + val = q0_v - delta; + pu1_src_temp_v[pos_q0] = CLIP_U8(val); + } + } +} diff --git a/common/ih264_deblk_edge_filters.h b/common/ih264_deblk_edge_filters.h new file mode 100755 index 0000000..4079dd2 --- /dev/null +++ b/common/ih264_deblk_edge_filters.h @@ -0,0 +1,195 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
 *******************************************************************************
 * @file
 *  ih264_deblk_edge_filters.h
 *
 * @brief
 *  This file contains declarations of functions used for deblocking
 *
 * @author
 *  Ittiam
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

#ifndef IH264_DEBLK_H_
#define IH264_DEBLK_H_

/*****************************************************************************/
/* Extern Function Declarations                                              */
/*****************************************************************************/

/* Function-pointer-compatible signature types: each architecture-specific   */
/* implementation below is declared with one of these, so they can be        */
/* assigned interchangeably into the codec's function tables.                */

/* Edge filter for boundary strength < 4 (single alpha/beta + tc0 table) */
typedef void ih264_deblk_edge_bslt4_ft(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD32 alpha,
                                       WORD32 beta,
                                       UWORD32 u4_bs,
                                       const UWORD8 *pu1_cliptab );

/* Edge filter for boundary strength == 4 (single alpha/beta) */
typedef void ih264_deblk_edge_bs4_ft(UWORD8 *pu1_src,
                                     WORD32 src_strd,
                                     WORD32 alpha,
                                     WORD32 beta );

/* Chroma edge filter, bS < 4, with separate Cb/Cr thresholds and tc0 tables */
typedef void ih264_deblk_chroma_edge_bslt4_ft(UWORD8 *pu1_src,
                                              WORD32 src_strd,
                                              WORD32 alpha_cb,
                                              WORD32 beta_cb,
                                              WORD32 alpha_cr,
                                              WORD32 beta_cr,
                                              UWORD32 u4_bs,
                                              const UWORD8 *pu1_cliptab_cb,
                                              const UWORD8 *pu1_cliptab_cr);

/* Chroma edge filter, bS == 4, with separate Cb/Cr thresholds */
typedef void ih264_deblk_chroma_edge_bs4_ft(UWORD8 *pu1_src,
                                            WORD32 src_strd,
                                            WORD32 alpha_cb,
                                            WORD32 beta_cb,
                                            WORD32 alpha_cr,
                                            WORD32 beta_cr);


/* Portable C reference implementations */
ih264_deblk_edge_bs4_ft ih264_deblk_luma_horz_bs4;
ih264_deblk_edge_bs4_ft ih264_deblk_luma_vert_bs4;
ih264_deblk_edge_bs4_ft ih264_deblk_luma_vert_bs4_mbaff;


ih264_deblk_edge_bs4_ft ih264_deblk_chroma_horz_bs4_bp;
ih264_deblk_edge_bs4_ft ih264_deblk_chroma_vert_bs4_bp;
ih264_deblk_edge_bs4_ft ih264_deblk_chroma_vert_bs4_mbaff_bp;


ih264_deblk_edge_bslt4_ft ih264_deblk_luma_horz_bslt4;
ih264_deblk_edge_bslt4_ft ih264_deblk_luma_vert_bslt4;
ih264_deblk_edge_bslt4_ft ih264_deblk_luma_vert_bslt4_mbaff;


ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_bp;
ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_bp;
ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_mbaff_bp;

ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_vert_bs4;
ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_horz_bs4;
ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_vert_bs4_mbaff;
ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_horz_bs4_mbaff;

ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4;
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4;
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_mbaff;
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_mbaff;


/*A9*/
ih264_deblk_edge_bs4_ft ih264_deblk_luma_horz_bs4_a9;
ih264_deblk_edge_bs4_ft ih264_deblk_luma_vert_bs4_a9;
ih264_deblk_edge_bs4_ft ih264_deblk_luma_vert_bs4_mbaff_a9;


ih264_deblk_edge_bs4_ft ih264_deblk_chroma_horz_bs4_bp_a9;
ih264_deblk_edge_bs4_ft ih264_deblk_chroma_vert_bs4_bp_a9;
ih264_deblk_edge_bs4_ft ih264_deblk_chroma_vert_bs4_mbaff_bp_a9;


ih264_deblk_edge_bslt4_ft ih264_deblk_luma_horz_bslt4_a9;
ih264_deblk_edge_bslt4_ft ih264_deblk_luma_vert_bslt4_a9;
ih264_deblk_edge_bslt4_ft ih264_deblk_luma_vert_bslt4_mbaff_a9;


ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_bp_a9;
ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_bp_a9;
ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9;

ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_vert_bs4_a9;
ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_horz_bs4_a9;
ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_vert_bs4_mbaff_a9;
ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_horz_bs4_mbaff_a9;

ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_a9;
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_a9;
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_mbaff_a9;
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_mbaff_a9;

/*AV8*/
ih264_deblk_edge_bs4_ft ih264_deblk_luma_horz_bs4_av8;
ih264_deblk_edge_bs4_ft ih264_deblk_luma_vert_bs4_av8;
ih264_deblk_edge_bs4_ft ih264_deblk_luma_vert_bs4_mbaff_av8;


ih264_deblk_edge_bs4_ft ih264_deblk_chroma_horz_bs4_bp_av8;
ih264_deblk_edge_bs4_ft ih264_deblk_chroma_vert_bs4_bp_av8;
ih264_deblk_edge_bs4_ft ih264_deblk_chroma_vert_bs4_mbaff_bp_av8;


ih264_deblk_edge_bslt4_ft ih264_deblk_luma_horz_bslt4_av8;
ih264_deblk_edge_bslt4_ft ih264_deblk_luma_vert_bslt4_av8;
ih264_deblk_edge_bslt4_ft ih264_deblk_luma_vert_bslt4_mbaff_av8;


ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_bp_av8;
ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_bp_av8;
ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_mbaff_bp_av8;

ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_vert_bs4_av8;
ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_horz_bs4_av8;
ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_vert_bs4_mbaff_av8;
ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_horz_bs4_mbaff_av8;

ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_av8;
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_av8;
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_mbaff_av8;
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_mbaff_av8;

/*SSE3*/
ih264_deblk_edge_bs4_ft ih264_deblk_luma_horz_bs4_ssse3;
ih264_deblk_edge_bs4_ft ih264_deblk_luma_vert_bs4_ssse3;
ih264_deblk_edge_bs4_ft ih264_deblk_luma_vert_bs4_mbaff_ssse3;


ih264_deblk_edge_bs4_ft ih264_deblk_chroma_horz_bs4_bp_ssse3;
ih264_deblk_edge_bs4_ft ih264_deblk_chroma_vert_bs4_bp_ssse3;
ih264_deblk_edge_bs4_ft ih264_deblk_chroma_vert_bs4_mbaff_bp_ssse3;


ih264_deblk_edge_bslt4_ft ih264_deblk_luma_horz_bslt4_ssse3;
ih264_deblk_edge_bslt4_ft ih264_deblk_luma_vert_bslt4_ssse3;
ih264_deblk_edge_bslt4_ft ih264_deblk_luma_vert_bslt4_mbaff_ssse3;


ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_bp_ssse3;
ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_bp_ssse3;
ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_mbaff_bp_ssse3;

ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_vert_bs4_ssse3;
ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_horz_bs4_ssse3;
ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_vert_bs4_mbaff_ssse3;
ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_horz_bs4_mbaff_ssse3;

ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_ssse3;
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_ssse3;
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_mbaff_ssse3;
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_mbaff_ssse3;

#endif /* IH264_DEBLK_H_ */
diff --git a/common/ih264_deblk_tables.c b/common/ih264_deblk_tables.c
new file mode 100755
index 0000000..91e28e0
--- /dev/null
+++ b/common/ih264_deblk_tables.c
@@ -0,0 +1,119 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  ih264_deblk_tables.c
*
* @brief
*  Contains tables used for deblocking
*
* @author
*  Ittiam
*
* @par List of Tables:
*   - gu1_ih264_alpha_table[]
*   - gu1_ih264_beta_table[]
*   - gu1_ih264_clip_table[][]
*
* @remarks
*  None
*
*******************************************************************************
*/

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* System include files */
#include <stdio.h>

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_deblk_tables.h"

/*****************************************************************************/
/* Extern global definitions                                                 */
/*****************************************************************************/

/**
 ******************************************************************************
 * @brief  alpha & beta tables for deblocking
 * input   : indexA [0-51] & indexB [0-51]
 * output  : alpha & beta
 *
 * @remarks Table 8-16 – in H264 Specification,
 * Derivation of offset dependent threshold variables
 *  alpha and beta from indexA and indexB
 ******************************************************************************
 */
const UWORD8 gu1_ih264_alpha_table[52] =
{
    /* indexA :: 0-51 inclusive */
      0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,
      4,   4,   5,   6,   7,   8,   9,  10,
     12,  13,  15,  17,  20,  22,  25,  28,
     32,  36,  40,  45,  50,  56,  63,  71,
     80,  90, 101, 113, 127, 144, 162, 182,
    203, 226, 255, 255,
};

const UWORD8 gu1_ih264_beta_table[52] =
{
    /* indexB :: 0-51 inclusive */
      0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,
      2,   2,   2,   3,   3,   3,   3,   4,
      4,   4,   6,   6,   7,   7,   8,   8,
      9,   9,  10,  10,  11,  11,  12,  12,
     13,  13,  14,  14,  15,  15,  16,  16,
     17,  17,  18,  18,
};

/**
 ******************************************************************************
 * @brief  t'C0 table for deblocking
 * input   : indexA [0-51] and bS [1,3]
 * output  : t'C0
 *
 * @remarks Table 8-17 – in H264 Specification,
 * Value of variable t'C0 as a function of indexA and bS
 ******************************************************************************
 */
const UWORD8 gu1_ih264_clip_table[52][4] =
{
    /* indexA :: 0-51 inclusive */
    { 0, 0, 0, 0}, { 0, 0, 0, 0}, { 0, 0, 0, 0}, { 0, 0, 0, 0},
    { 0, 0, 0, 0}, { 0, 0, 0, 0}, { 0, 0, 0, 0}, { 0, 0, 0, 0},
    { 0, 0, 0, 0}, { 0, 0, 0, 0}, { 0, 0, 0, 0}, { 0, 0, 0, 0},
    { 0, 0, 0, 0}, { 0, 0, 0, 0}, { 0, 0, 0, 0}, { 0, 0, 0, 0},
    { 0, 0, 0, 0}, { 0, 0, 0, 1}, { 0, 0, 0, 1}, { 0, 0, 0, 1},
    { 0, 0, 0, 1}, { 0, 0, 1, 1}, { 0, 0, 1, 1}, { 0, 1, 1, 1},
    { 0, 1, 1, 1}, { 0, 1, 1, 1}, { 0, 1, 1, 1}, { 0, 1, 1, 2},
    { 0, 1, 1, 2}, { 0, 1, 1, 2}, { 0, 1, 1, 2}, { 0, 1, 2, 3},
    { 0, 1, 2, 3}, { 0, 2, 2, 3}, { 0, 2, 2, 4}, { 0, 2, 3, 4},
    { 0, 2, 3, 4}, { 0, 3, 3, 5}, { 0, 3, 4, 6}, { 0, 3, 4, 6},
    { 0, 4, 5, 7}, { 0, 4, 5, 8}, { 0, 4, 6, 9}, { 0, 5, 7,10},
    { 0, 6, 8,11}, { 0, 6, 8,13}, { 0, 7,10,14}, { 0, 8,11,16},
    { 0, 9,12,18}, { 0,10,13,20}, { 0,11,15,23}, { 0,13,17,25},
};
diff --git a/common/ih264_deblk_tables.h b/common/ih264_deblk_tables.h
new file mode 100755
index 0000000..3935dcb
--- /dev/null
+++ b/common/ih264_deblk_tables.h
@@ -0,0 +1,73 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
 *******************************************************************************
 * @file
 *  ih264_deblk_tables.h
 *
 * @brief
 *  This file contains declarations of tables used for deblocking
 *
 * @author
 *  Ittiam
 *
 * @par List of Functions:
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

#ifndef IH264_DEBLK_TABLES_H_
#define IH264_DEBLK_TABLES_H_

/*****************************************************************************/
/* Extern global declarations                                                */
/*****************************************************************************/

/**
 ******************************************************************************
 * @brief  alpha & beta tables for deblocking
 * input   : indexA [0-51] & indexB [0-51]
 * output  : alpha & beta
 *
 * @remarks Table 8-16 – in H264 Specification,
 * Derivation of offset dependent threshold variables
 *  alpha and beta from indexA and indexB
 ******************************************************************************
 */
extern const UWORD8 gu1_ih264_alpha_table[52];

extern const UWORD8 gu1_ih264_beta_table[52];

/**
 ******************************************************************************
 * @brief  t'C0 table for deblocking
 * input   : indexA [0-51] and bS [1,3]
 * output  : t'C0
 *
 * @remarks Table 8-17 – in H264 Specification,
 * Value of variable t'C0 as a function of indexA and bS
 ******************************************************************************
 */
extern const UWORD8 gu1_ih264_clip_table[52][4];

#endif /* IH264_DEBLK_TABLES_H_ */
diff --git a/common/ih264_debug.h b/common/ih264_debug.h
new file mode 100755
index 0000000..96ff2a7
--- /dev/null
+++ b/common/ih264_debug.h
@@ -0,0 +1,61 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  ih264_debug.h
*
* @brief
*  Definitions for codec debugging
*
* @author
*  Ittiam
*
* @par List of Functions:
*
* @remarks
*  None
*
*******************************************************************************
*/
/* NOTE(review): the guard name _IH264_DEBUG_H_ starts with an underscore    */
/* followed by an uppercase letter, which is reserved for the implementation */
/* in C — consider renaming; left unchanged here to match existing usage.    */
#ifndef _IH264_DEBUG_H_
#define _IH264_DEBUG_H_


#if DEBUG_PRINT

/* Debug builds: prefix each message with the function name and line number */
#define DEBUG(...) \
{ \
    printf("\n[H264 DBG] %s/%d:: ", __FUNCTION__, __LINE__); \
    printf(__VA_ARGS__); \
}

#else

/* Release builds: DEBUG() compiles to an empty statement block */
#define DEBUG(...) \
{}

#endif


/* NOTE(review): ASSERT relies on assert() but this header does not include */
/* <assert.h> itself — presumably users include it first; TODO confirm.     */
#define ASSERT(x) assert((x))


#endif /* _IH264_DEBUG_H_ */

diff --git a/common/ih264_defs.h b/common/ih264_defs.h
new file mode 100755
index 0000000..8d7e387
--- /dev/null
+++ b/common/ih264_defs.h
@@ -0,0 +1,690 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  ih264_defs.h
*
* @brief
*  Definitions used in the codec
*
* @author
*  Ittiam
*
*
* @remarks
*  None
*
*******************************************************************************
*/

#ifndef IH264_DEFS_H_
#define IH264_DEFS_H_

/*****************************************************************************/
/* Enums                                                                     */
/*****************************************************************************/


/*****************************************************************************/
/* Profile and Levels                                                        */
/*****************************************************************************/

/**
******************************************************************************
 *  @enum  PROFILE_IDC
 *  @brief Defines the set of possible profiles
 *         (values are the profile_idc codes from the H.264 spec, Annex A)
******************************************************************************
*/
enum
{
    IH264_PROFILE_BASELINE = 66,
    IH264_PROFILE_MAIN = 77,
    IH264_PROFILE_EXTENDED = 88,
    IH264_PROFILE_HIGH = 100,
    IH264_PROFILE_HIGH10 = 110,
    IH264_PROFILE_HIGH422 = 122,
    IH264_PROFILE_HIGH444 = 144,
};

/**
******************************************************************************
 *  @enum  LEVEL_IDC
 *  @brief Defines the set of possible levels
 *         (level_idc = level number * 10; level 1b uses the value 9)
******************************************************************************
*/
typedef enum
{
    IH264_LEVEL_10 = 10,
    IH264_LEVEL_1B = 9,
    IH264_LEVEL_11 = 11,
    IH264_LEVEL_12 = 12,
    IH264_LEVEL_13 = 13,
    IH264_LEVEL_20 = 20,
    IH264_LEVEL_21 = 21,
    IH264_LEVEL_22 = 22,
    IH264_LEVEL_30 = 30,
    IH264_LEVEL_31 = 31,
    IH264_LEVEL_32 = 32,
    IH264_LEVEL_40 = 40,
    IH264_LEVEL_41 = 41,
    IH264_LEVEL_42 = 42,
    IH264_LEVEL_50 = 50,
    IH264_LEVEL_51 = 51,
}IH264_LEVEL_T;


/**
******************************************************************************
 *  @enum  PIC TYPES
 *  @brief Defines the set of possible picture type - not
signaled in bitstream +****************************************************************************** +*/ +typedef enum +{ + PIC_NA = 0x7FFFFFFF, + PIC_IDR = 0, + PIC_I = 1, + PIC_P = 2, + PIC_B = 3, + PIC_P_NONREF = 4, + PIC_B_NONREF = 5, + PIC_MAX, +}PIC_TYPE_T; + +/** +****************************************************************************** + * @enum FRAME-FIELD types + * @brief Defines the set of possible field types. +****************************************************************************** +*/ +enum +{ + TOP_FIELD, + BOTTOM_FIELD, + FRAME, +}; + +/** +****************************************************************************** + * @enum SLICE TYPES + * @brief Defines the set of possible SLICE TYPES +****************************************************************************** +*/ +enum +{ + PSLICE = 0, + BSLICE = 1, + ISLICE = 2, + SPSLICE = 3, + SISLICE = 4, + MAXSLICE_TYPE, +}; + +/** +****************************************************************************** + * @enum NAL_UNIT_TYPE + * @brief Defines the set of possible nal unit types +****************************************************************************** +*/ +enum +{ + NAL_UNSPEC_0 = 0, + NAL_SLICE_NON_IDR = 1, + NAL_SLICE_DPA = 2, + NAL_SLICE_DPB = 3, + NAL_SLICE_DPC = 4, + NAL_SLICE_IDR = 5, + NAL_SEI = 6, + NAL_SPS = 7, + NAL_PPS = 8, + NAL_AUD = 9, + NAL_EOSEQ = 10, + NAL_EOSTR = 11, + NAL_FILLER = 12, + NAL_SPSE = 13, + NAL_RES_18 = 14, + NAL_AUX_PIC = 19, + NAL_RES_23 = 20, + NAL_UNSPEC_31 = 24, +}; + +/** +****************************************************************************** + * @enum CHROMA_FORMAT_IDC + * @brief Defines the set of possible chroma formats + * Note Chorma format Do not change enum values +****************************************************************************** +*/ +enum +{ + CHROMA_FMT_IDC_MONOCHROME = 0, + CHROMA_FMT_IDC_YUV420 = 1, + CHROMA_FMT_IDC_YUV422 = 2, + CHROMA_FMT_IDC_YUV444 = 3, + CHROMA_FMT_IDC_YUV444_PLANES = 4, +}; + + +/** 
+****************************************************************************** + * @enum MBMODES_I16x16 + * @brief Defines the set of possible intra 16x16 mb modes +****************************************************************************** +*/ +typedef enum +{ + VERT_I16x16 = 0, + HORZ_I16x16 = 1, + DC_I16x16 = 2, + PLANE_I16x16 = 3, + MAX_I16x16 = 4, +}MBMODES_I16x16; + +/** +****************************************************************************** + * @enum MBMODES_I4x4 + * @brief Defines the set of possible intra 4x4 mb modes +****************************************************************************** +*/ +typedef enum +{ + VERT_I4x4 = 0, + HORZ_I4x4 = 1, + DC_I4x4 = 2, + DIAG_DL_I4x4 = 3, + DIAG_DR_I4x4 = 4, + VERT_R_I4x4 = 5, + HORZ_D_I4x4 = 6, + VERT_L_I4x4 = 7, + HORZ_U_I4x4 = 8, + MAX_I4x4 = 9, +}MBMODES_I4x4; + +/** +****************************************************************************** + * @enum MBMODES_I8x8 + * @brief Defines the set of possible intra 8x8 mb modes +****************************************************************************** +*/ +typedef enum +{ + VERT_I8x8 = 0, + HORZ_I8x8 = 1, + DC_I8x8 = 2, + DIAG_DL_I8x8 = 3, + DIAG_DR_I8x8 = 4, + VERT_R_I8x8 = 5, + HORZ_D_I8x8 = 6, + VERT_L_I8x8 = 7, + HORZ_U_I8x8 = 8, + MAX_I8x8 = 9, +}MBMODES_I8x8; + +/** +****************************************************************************** + * @enum MBMODES_CHROMA_I8x8 (Chroma) + * @brief Defines the set of possible intra 8x8 mb modes for chroma +****************************************************************************** +*/ +typedef enum +{ + DC_CH_I8x8 = 0, + HORZ_CH_I8x8 = 1, + VERT_CH_I8x8 = 2, + PLANE_CH_I8x8 = 3, + MAX_CH_I8x8 = 4, +}MBMODES_CHROMA_I8x8; + +/** +****************************************************************************** + * @enum MBTYPES + * @brief Defines the set of possible macro block types +****************************************************************************** +*/ +typedef enum +{ + I16x16 = 
0, + I4x4 = 1, + I8x8 = 2, + P16x16 = 3, + P16x8 = 4, + P8x16 = 5, + P8x8 = 6, + PSKIP = 7, + IPCM = 8, + MAX_MBTYPES, +}MBTYPES_T; + +/* Prediction list */ +/* Do not change enum values */ +enum +{ + PRED_L0 = 0, + PRED_L1 = 1, + PRED_BI = 2 +}; + + +/** +****************************************************************************** + * @enum ENTROPY_BLK_TYPE + * @brief Defines the nature of blocks employed in entropy coding +****************************************************************************** +*/ +typedef enum +{ + ENTROPY_BLK_INVALID = -1, + CAVLC_LUMA_4x4_DC = 0, + CAVLC_LUMA_4x4_AC = 1, + CAVLC_LUMA_4x4 = 2, + CAVLC_CHROMA_4x4_DC = 3, + CAVLC_CHROMA_4x4_AC = 4, +} ENTROPY_BLK_TYPE; + +/** +****************************************************************************** + * @enum ENTROPY_MODE + * @brief Entropy coding modes +****************************************************************************** +*/ +typedef enum +{ + CAVLC = 0, + CABAC = 1, +} ENTROPY_MODE; + +/** +****************************************************************************** + * @enum COMPONENT_TYPE + * @brief components Y, U & V +****************************************************************************** +*/ +typedef enum +{ + Y, + U, + V, +} COMPONENT_TYPE; + + +/** +****************************************************************************** + * @enum MBPART_PREDMODE_T + * @brief MbPartPredMode Table 7-11 to 7-14 +****************************************************************************** +*/ +typedef enum +{ + MBPART_NA, + MBPART_I4x4, + MBPART_I8x8, + MBPART_I16x16, + MBPART_L0, + MBPART_L1, + MBPART_BI, + MBPART_DIRECT, + MBPART_IPCM, +}MBPART_PREDMODE_T; + + +typedef enum +{ + I_NxN, + I_16x16_0_0_0, + I_16x16_1_0_0, + I_16x16_2_0_0, + I_16x16_3_0_0, + I_16x16_0_1_0, + I_16x16_1_1_0, + I_16x16_2_1_0, + I_16x16_3_1_0, + I_16x16_0_2_0, + I_16x16_1_2_0, + I_16x16_2_2_0, + I_16x16_3_2_0, + I_16x16_0_0_1, + I_16x16_1_0_1, + I_16x16_2_0_1, + I_16x16_3_0_1, + 
I_16x16_0_1_1, + I_16x16_1_1_1, + I_16x16_2_1_1, + I_16x16_3_1_1, + I_16x16_0_2_1, + I_16x16_1_2_1, + I_16x16_2_2_1, + I_16x16_3_2_1, + I_PCM, +}MBTYPE_ISLICE_T; + +typedef enum +{ + P_L0_16x16, + P_L0_L0_16x8, + P_L0_L0_8x16, + P_8x8, + P_8x8REF0, + P_SKIP +}MBTYPE_PSLICE_T; + +typedef enum +{ + B_DIRECT_16x16, + B_L0_16x16, + B_L1_16x16, + B_BI_16x16, + B_L0_L0_16x8, + B_L0_L0_8x16, + B_L1_L1_16x8, + B_L1_L1_8x16, + B_L0_L1_16x8, + B_L0_L1_8x16, + B_L1_L0_16x8, + B_L1_L0_8x16, + B_L0_BI_16x8, + B_L0_BI_8x16, + B_L1_BI_16x8, + B_L1_BI_8x16, + B_BI_L0_16x8, + B_BI_L0_8x16, + B_BI_L1_16x8, + B_BI_L1_8x16, + B_BI_BI_16x8, + B_BI_BI_8x16, + B_8x8, + B_SKIP, +}MBTYPE_BSLICE_T; + + +typedef enum +{ + P_L0_8x8, + P_L0_8x4, + P_L0_4x8, + P_L0_4x4, +}SUBMBTYPE_PSLICE_T; + +typedef enum +{ + B_DIRECT_8x8, + B_L0_8x8, + B_L1_8x8, + B_BI_8x8, + B_L0_8x4, + B_L0_4x8, + B_L1_8x4, + B_L1_4x8, + B_BI_8x4, + B_BI_4x8, + B_L0_4x4, + B_L1_4x4, + B_BI_4x4, +}SUBMBTYPE_BSLICE_T; + +/** + * DC Mode pattern for 4 4x4 sub blocks in an MB row + */ +#define DC_I16X16_MB_ROW (DC_I16x16 << 24) | (DC_I16x16 << 16) | \ + (DC_I16x16 << 8) | DC_I16x16 + + + +/*****************************************************************************/ +/* Constant Macros */ +/*****************************************************************************/ + +/*****************************************************************************/ +/* Reference frame defs */ +/*****************************************************************************/ +/* Maximum DPB size */ +#define MAX_DPB_SIZE 16 + +/* Maximum mmco commands in slice header */ +#define MAX_MMCO_COMMANDS 32 + +/* Maximum reference reorder idc */ +#define MAX_MODICATION_IDC 32 + +/*****************************************************************************/ +/* SPS restrictions */ +/*****************************************************************************/ + +/* Number of SPS allowed */ +/* An extra buffer is allocated to write the parsed data + * 
It is copied to the appropriate location later */ +#define MAX_SPS_CNT (32 + 1) + +/* Maximum long term reference pics */ +#define MAX_LTREF_PICS_SPS 16 + +/* Maximum short term reference pics */ +#define MAX_STREF_PICS_SPS 64 + + +/*****************************************************************************/ +/* PPS restrictions */ +/*****************************************************************************/ + +/* Number of PPS allowed */ +/* An extra buffer is allocated to write the parsed data + * It is copied to the appropriate location later */ +#define MAX_PPS_CNT (256 + 1) + +/*****************************************************************************/ +/* Macro definitions for sizes of MB, PU, TU, CU */ +/*****************************************************************************/ +#define MB_SIZE 16 +#define BLK8x8SIZE 8 +#define BLK_SIZE 4 + + +/* TU Size Range */ +#define MAX_TU_SIZE 8 +#define MIN_TU_SIZE 4 + +/* Max Transform Size */ +#define MAX_TRANS_SIZE (MAX_TU_SIZE*MAX_TU_SIZE) + +/* PU Size Range */ +#define MAX_PU_SIZE 16 +#define MIN_PU_SIZE 4 + +/* Number of max TU in a MB row */ +#define MAX_TU_IN_MB_ROW ((MB_SIZE / MIN_TU_SIZE)) + +/* Number of max PU in a CTb row */ +#define MAX_PU_IN_MB_ROW ((MB_SIZE / MIN_PU_SIZE)) + + +/* Number of max PU in a MB */ +/*****************************************************************************/ +/* Note though for 64 x 64 MB, Max PU in MB is 128, in order to store */ +/* intra pred info, 256 entries are needed */ +/*****************************************************************************/ +#define MAX_PU_IN_MB ((MB_SIZE / MIN_PU_SIZE) * \ + (MB_SIZE / MIN_PU_SIZE)) + +/* Number of max TU in a MB */ +#define MAX_TU_IN_MB ((MB_SIZE / MIN_TU_SIZE) * \ + (MB_SIZE / MIN_TU_SIZE)) + + + +/** + * Maximum transform depths + */ +#define MAX_TRAFO_DEPTH 5 + +#define MAX_DC_4x4_SUBBLK_LUMA 1 +#define MAX_AC_4x4_SUBBLK_LUMA 16 +#define MAX_DC_4x4_SUBBLK_CHROMA 2 +#define MAX_AC_4x4_SUBBLK_CHROMA 8 + 
+#define MAX_4x4_SUBBLKS (MAX_DC_4x4_SUBBLK_LUMA + MAX_DC_4x4_SUBBLK_CHROMA +\ + MAX_AC_4x4_SUBBLK_LUMA + MAX_AC_4x4_SUBBLK_CHROMA) + +/* Max number of deblocking edges */ +#define MAX_VERT_DEBLK_EDGES ((MB_SIZE/8) * (MB_SIZE/4)) +#define MAX_HORZ_DEBLK_EDGES ((MB_SIZE/4) * (MB_SIZE/8)) + +/* Qp can not change below 8x8 level */ +#define MAX_DEBLK_QP_CNT ((MB_SIZE/8) * (MB_SIZE/8)) + +/*****************************************************************************/ +/* Parsing related macros */ +/*****************************************************************************/ +#define SUBBLK_COEFF_CNT 16 + +/* Quant and Trans defs */ + +/*****************************************************************************/ +/* Sizes for Transform functions */ +/*****************************************************************************/ +#define TRANS_SIZE_4 4 +#define TRANS_SIZE_8 8 +#define TRANS_SIZE_16 16 +#define TRANS_SIZE_32 32 + + +#define IT_SHIFT_STAGE_1 7 +#define IT_SHIFT_STAGE_2 12 + +/** + * @breif Maximum transform dynamic range (excluding sign bit) + */ +#define MAX_TR_DYNAMIC_RANGE 15 + +/** + * @brief Q(QP%6) * IQ(QP%6) = 2^20 + */ +#define QUANT_IQUANT_SHIFT 20 + +/** + * @breif Q factor for Qp%6 multiplication + */ +#define QUANT_SHIFT 14 + +/** + * @breif Q shift factor for flat rescale matrix weights + */ +#define FLAT_RESCALE_MAT_Q_SHIFT 11 + +/** + * @breif Scaling matrix is represented in Q15 format + */ +#define SCALING_Q_SHIFT 15 + +/** + * @brief rounding factor for quantization represented in Q9 format + */ +#define QUANT_ROUND_FACTOR_Q 9 + +/** + * @brief Minimum qp supported in H264 spec + */ +#define MIN_H264_QP 0 + +/** + * @brief Maximum qp supported in H264 spec + */ +#define MAX_H264_QP 51 + +/** + * @breif Total number of transform sizes + * used for sizeID while getting scale matrix + */ +#define NUM_UNIQUE_TRANS_SIZE 4 + +/** + * @breif Maximum number of bits in frameNumber signaling + */ +#define MAX_BITS_IN_FRAME_NUM 16 + +/** + * 
@breif Maximum number of bits in POC LSB signaling + */ +#define MAX_BITS_IN_POC_LSB 16 + + +/** + * @breif Maximum PIC Order Count type + */ +#define MAX_PIC_ORDER_COUNT_TYPE 2 + + +/** + * @breif Maximum Weighted bipred idc + */ +#define MAX_WEIGHT_BIPRED_IDC 2 + +/*****************************************************************************/ +/* Number of scaling matrices for each transform size */ +/*****************************************************************************/ +#define SCALE_MAT_CNT_TRANS_SIZE_4 6 +#define SCALE_MAT_CNT_TRANS_SIZE_8 6 +#define SCALE_MAT_CNT_TRANS_SIZE_16 6 +#define SCALE_MAT_CNT_TRANS_SIZE_32 2 + +/* Maximum number of scale matrices for a given transform size */ +#define SCALE_MAT_CNT_MAX_PER_TRANS_SIZE 6 + +/* Total number of scale matrices */ +#define TOTAL_SCALE_MAT_COUNT (SCALE_MAT_CNT_TRANS_SIZE_4 + \ + SCALE_MAT_CNT_TRANS_SIZE_8 + \ + SCALE_MAT_CNT_TRANS_SIZE_16 + \ + SCALE_MAT_CNT_TRANS_SIZE_32) + + +/*****************************************************************************/ +/* Intra pred Macros */ +/*****************************************************************************/ +/** Planar Intra prediction mode */ +#define INTRA_PLANAR 0 + +/** DC Intra prediction mode */ +#define INTRA_DC 1 + +/** Gives angular mode for intra prediction */ +#define INTRA_ANGULAR(x) (x) + +/** Following is used to signal no intra prediction in case of pcm blocks + */ +#define INTRA_PRED_NONE 63 + + +/** Following is used to signal no intra prediction is needed for first three + * 4x4 luma blocks in case of 4x4 TU sizes + * Also used in pcm cases + */ +#define INTRA_PRED_CHROMA_IDX_NONE 7 + + +/** +****************************************************************************** + * @brief neighbor availability masks +****************************************************************************** + */ +#define LEFT_MB_AVAILABLE_MASK 0x01 +#define TOP_LEFT_MB_AVAILABLE_MASK 0x02 +#define TOP_MB_AVAILABLE_MASK 0x04 +#define 
TOP_RIGHT_MB_AVAILABLE_MASK 0x08 + +#endif /* IH264_DEFS_H_ */ diff --git a/common/ih264_disp_mgr.c b/common/ih264_disp_mgr.c new file mode 100755 index 0000000..2bdb524 --- /dev/null +++ b/common/ih264_disp_mgr.c @@ -0,0 +1,186 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_disp_mgr.c +* +* @brief +* Contains function definitions for display management +* +* @author +* Srinivas T +* +* @par List of Functions: +* - ih264_disp_mgr_init() +* - ih264_disp_mgr_add() +* - ih264_disp_mgr_get() +* +* @remarks +* None +* +******************************************************************************* +*/ +#include <stdlib.h> +#include "ih264_typedefs.h" +#include "ih264_macros.h" +#include "ih264_disp_mgr.h" + + +/** +******************************************************************************* +* +* @brief +* Initialization function for display buffer manager +* +* @par Description: +* Initializes the display buffer management structure +* +* @param[in] ps_disp_mgr +* Pointer to the display buffer management structure +* +* @returns none +* +* @remarks +* None +* +******************************************************************************* +*/ +void ih264_disp_mgr_init(disp_mgr_t *ps_disp_mgr) +{ + WORD32 id; + + ps_disp_mgr->u4_last_abs_poc = DEFAULT_POC; + + for(id = 0; id < DISP_MGR_MAX_CNT; id++) + { + ps_disp_mgr->ai4_abs_poc[id] = DEFAULT_POC; + ps_disp_mgr->apv_ptr[id] = NULL; + } +} + + +/** +******************************************************************************* +* +* @brief +* Adds a buffer to the display manager +* +* @par Description: +* Adds a buffer to the display buffer manager +* +* @param[in] ps_disp_mgr +* Pointer to the display buffer management structure +* +* @param[in] buf_id +* ID of the display buffer +* +* @param[in] abs_poc +* Absolute POC of the display buffer +* +* @param[in] pv_ptr +* Pointer to the display buffer +* +* @returns 0 if success, -1 otherwise +* +* @remarks +* None +* +******************************************************************************* +*/ +WORD32 ih264_disp_mgr_add(disp_mgr_t *ps_disp_mgr, + WORD32 buf_id, + WORD32 abs_poc, + void *pv_ptr) +{ + if(buf_id 
>= DISP_MGR_MAX_CNT) + { + return (-1); + } + + if(ps_disp_mgr->apv_ptr[buf_id] != NULL) + { + return (-1); + } + + ps_disp_mgr->apv_ptr[buf_id] = pv_ptr; + ps_disp_mgr->ai4_abs_poc[buf_id] = abs_poc; + return 0; +} + + +/** +******************************************************************************* +* +* @brief +* Gets the next buffer +* +* @par Description: +* Gets the next display buffer +* +* @param[in] ps_disp_mgr +* Pointer to the display buffer structure +* +* @param[out] pi4_buf_id +* Pointer to hold buffer id of the display buffer being returned +* +* @returns Pointer to the next display buffer +* +* @remarks +* None +* +******************************************************************************* +*/ +void* ih264_disp_mgr_get(disp_mgr_t *ps_disp_mgr, WORD32 *pi4_buf_id) +{ + WORD32 id; + void *pv_ret_ptr; + WORD32 i4_min_poc; + WORD32 min_poc_id; + + + pv_ret_ptr = NULL; + i4_min_poc = 0x7FFFFFFF; + min_poc_id = -1; + + /* Find minimum POC */ + for(id = 0; id < DISP_MGR_MAX_CNT; id++) + { + if((DEFAULT_POC != ps_disp_mgr->ai4_abs_poc[id]) && + (ps_disp_mgr->ai4_abs_poc[id] <= i4_min_poc)) + { + i4_min_poc = ps_disp_mgr->ai4_abs_poc[id]; + min_poc_id = id; + } + } + *pi4_buf_id = min_poc_id; + /* If all pocs are still default_poc then return NULL */ + if(-1 == min_poc_id) + { + return NULL; + } + + pv_ret_ptr = ps_disp_mgr->apv_ptr[min_poc_id]; + + /* Set abs poc to default and apv_ptr to null so that the buffer is not returned again */ + ps_disp_mgr->apv_ptr[min_poc_id] = NULL; + ps_disp_mgr->ai4_abs_poc[min_poc_id] = DEFAULT_POC; + return pv_ret_ptr; +} diff --git a/common/ih264_disp_mgr.h b/common/ih264_disp_mgr.h new file mode 100755 index 0000000..6f56493 --- /dev/null +++ b/common/ih264_disp_mgr.h @@ -0,0 +1,70 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use 
this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_disp_mgr.h +* +* @brief +* Function declarations used for display management +* +* @author +* Srinivas T +* +* +* @remarks +* None +* +******************************************************************************* +*/ +#ifndef _DISP_MGR_H_ +#define _DISP_MGR_H_ + +#define DISP_MGR_MAX_CNT 64 +#define DEFAULT_POC 0x7FFFFFFF + +typedef struct +{ + /** + * last_abs_poc + */ + UWORD32 u4_last_abs_poc; + + /** + * au4_abs_poc[DISP_MGR_MAX_CNT] + */ + WORD32 ai4_abs_poc[DISP_MGR_MAX_CNT]; + + /** + * apv_ptr[DISP_MGR_MAX_CNT] + */ + void *apv_ptr[DISP_MGR_MAX_CNT]; +}disp_mgr_t; + +void ih264_disp_mgr_init(disp_mgr_t *ps_disp_mgr); + +WORD32 ih264_disp_mgr_add(disp_mgr_t *ps_disp_mgr, + WORD32 id, + WORD32 abs_poc, + void *pv_ptr); + +void* ih264_disp_mgr_get(disp_mgr_t *ps_disp_mgr, WORD32 *pi4_buf_id); + +#endif //_DISP_MGR_H_ diff --git a/common/ih264_dpb_mgr.c b/common/ih264_dpb_mgr.c new file mode 100755 index 0000000..8e087d3 --- /dev/null +++ b/common/ih264_dpb_mgr.c @@ -0,0 +1,1176 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except 
in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_dpb_mgr.c + * + * @brief + * Function definitions used for decoded picture buffer management + * + * @author + * Srinivas T + * + * @par List of Functions: + * - ih264_dpb_mgr_init() + * - ih264_dpb_mgr_sort_short_term_fields_by_frame_num() + * - ih264_dpb_mgr_sort_short_term_fields_by_poc_l0() + * - ih264_dpb_mgr_sort_short_term_fields_by_poc_l1() + * - ih264_dpb_mgr_sort_long_term_fields_by_frame_idx() + * - ih264_dpb_mgr_alternate_ref_fields() + * - ih264_dpb_mgr_insert_ref_field() + * - ih264_dpb_mgr_insert_ref_frame() + * - ih264_dpb_mgr_count_ref_frames() + * - ih264_dpb_mgr_delete_ref_frame() + * - ih264_dpb_mgr_delete_long_ref_fields_max_frame_idx() + * - ih264_dpb_mgr_delete_short_ref_frame() + * - ih264_dpb_mgr_delete_all_ref_frames() + * - ih264_dpb_mgr_reset() + * - ih264_dpb_mgr_release_pics() + * + * @remarks + * None + * + ******************************************************************************* + */ + +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> + +#include "ih264_typedefs.h" +#include "ih264_defs.h" +#include "ih264_macros.h" +#include "ih264_error.h" +#include "ih264_structs.h" +#include "ih264_buf_mgr.h" +#include "ih264_dpb_mgr.h" +#include "ih264_debug.h" + +/** + 
******************************************************************************* + * + * @brief + * DPB manager initializer + * + * @par Description: + * Initialises the DPB manager structure + * + * @param[in] ps_dpb_mgr + * Pointer to the DPB manager structure + * + * @returns + * + * @remarks + * + * + ******************************************************************************* + */ + +void ih264_dpb_mgr_init(dpb_mgr_t *ps_dpb_mgr) +{ + UWORD32 i; + dpb_info_t *ps_dpb_info = ps_dpb_mgr->as_dpb_info; + for(i = 0; i < MAX_DPB_BUFS; i++) + { + ps_dpb_info[i].ps_prev_dpb = NULL; + ps_dpb_info[i].ps_pic_buf = NULL; + ps_dpb_mgr->as_top_field_pics[i].i4_used_as_ref = INVALID; + ps_dpb_mgr->as_bottom_field_pics[i].i4_used_as_ref = INVALID; + ps_dpb_mgr->as_top_field_pics[i].i1_field_type = INVALID; + ps_dpb_mgr->as_bottom_field_pics[i].i1_field_type = INVALID; + ps_dpb_mgr->as_top_field_pics[i].i4_long_term_frame_idx = -1; + ps_dpb_mgr->as_bottom_field_pics[i].i4_long_term_frame_idx = -1; + } + + ps_dpb_mgr->u1_num_short_term_ref_bufs = 0; + ps_dpb_mgr->u1_num_long_term_ref_bufs = 0; + ps_dpb_mgr->ps_dpb_short_term_head = NULL; + ps_dpb_mgr->ps_dpb_long_term_head = NULL; +} + +/** + ******************************************************************************* + * + * @brief + * Function to sort sort term pics by frame_num. + * + * @par Description: + * Sorts short term fields by frame_num. For 2 fields having same frame_num, + * orders them based on requested first field type. 
+ * + * @param[in] ps_dpb_mgr + * Pointer to the DPB manager structure + * + * @param[in] curr_frame_num + * frame_num of the current pic + * + * @param[in] first_field_type + * For complementary fields, required first field + * + * @param[in] max_frame_num + * Maximum frame_num allowed + * + * @returns + * + * @remarks + * + * + ******************************************************************************* + */ +WORD32 ih264_dpb_mgr_sort_short_term_fields_by_frame_num(dpb_mgr_t *ps_dpb_mgr, + WORD32 curr_frame_num, + WORD32 first_field_type, + WORD32 max_frame_num) +{ + dpb_info_t *ps_dpb_node1 = ps_dpb_mgr->ps_dpb_short_term_head; + dpb_info_t *ps_dpb_node2; + WORD32 frame_num_node1; + WORD32 frame_num_node2; + pic_buf_t *ps_pic_buf; + + if(ps_dpb_node1 == NULL) + return -1; + + for (; ps_dpb_node1 != NULL; ps_dpb_node1 = ps_dpb_node1->ps_prev_dpb) + { + for (ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb; ps_dpb_node2 != NULL; ps_dpb_node2 = ps_dpb_node2->ps_prev_dpb) + { + frame_num_node1 = ps_dpb_node1->ps_pic_buf->i4_frame_num; + frame_num_node2 = ps_dpb_node2->ps_pic_buf->i4_frame_num; + + if(frame_num_node1 > curr_frame_num) + frame_num_node1 = frame_num_node1 - max_frame_num; + if(frame_num_node2 > curr_frame_num) + frame_num_node2 = frame_num_node2 - max_frame_num; + + if(frame_num_node1 < frame_num_node2) + { + ps_pic_buf = ps_dpb_node1->ps_pic_buf; + ps_dpb_node1->ps_pic_buf = ps_dpb_node2->ps_pic_buf; + ps_dpb_node2->ps_pic_buf = ps_pic_buf; + } + } + } + + /** + * For frames and complementary field pairs, + * ensure first_field_type appears first in the list + */ + ps_dpb_node1 = ps_dpb_mgr->ps_dpb_short_term_head; + ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb; + while(ps_dpb_node2 != NULL) + { + pic_buf_t *ps_pic_node1 = ps_dpb_node1->ps_pic_buf; + pic_buf_t *ps_pic_node2 = ps_dpb_node2->ps_pic_buf; + frame_num_node1 = ps_pic_node1->i4_frame_num; + frame_num_node2 = ps_pic_node2->i4_frame_num; + if(frame_num_node1 == frame_num_node2) + { + 
ASSERT(ps_pic_node1->i1_field_type != ps_pic_node2->i1_field_type); + if(ps_pic_node1->i1_field_type != first_field_type) + { + ps_dpb_node1->ps_pic_buf = ps_pic_node2; + ps_dpb_node2->ps_pic_buf = ps_pic_node1; + } + } + ps_dpb_node1 = ps_dpb_node2; + ps_dpb_node2 = ps_dpb_node2->ps_prev_dpb; + } + return 0; + +} + +/** + ******************************************************************************* + * + * @brief + * Function to sort sort term pics by poc for list 0. + * + * @par Description: + * Orders all the pocs less than current poc in the descending order. + * Then orders all the pocs greater than current poc in the ascending order. + * + * @param[in] ps_dpb_mgr + * Pointer to the DPB manager structure + * + * @param[in] curr_poc + * Poc of the current pic + * + * @param[in] first_field_type + * For complementary fields, required first field + * + * @returns + * + * @remarks + * + * + ******************************************************************************* + */ +WORD32 ih264_dpb_mgr_sort_short_term_fields_by_poc_l0(dpb_mgr_t *ps_dpb_mgr, + WORD32 curr_poc, + WORD32 first_field_type) +{ + dpb_info_t *ps_dpb_node1 = ps_dpb_mgr->ps_dpb_short_term_head; + dpb_info_t *ps_dpb_node2; + WORD32 poc_node1; + WORD32 poc_node2; + WORD32 frame_num_node1; + WORD32 frame_num_node2; + pic_buf_t *ps_pic_buf; + + if(ps_dpb_node1 == NULL) + return -1; + + /** + * Sort the fields by poc. + * All POCs less than current poc are first placed in the descending order. + * Then all POCs greater than current poc are placed in the ascending order. 
+ */ + for (; ps_dpb_node1 != NULL; ps_dpb_node1 = ps_dpb_node1->ps_prev_dpb) + { + for (ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb; ps_dpb_node2 != NULL; ps_dpb_node2 = ps_dpb_node2->ps_prev_dpb) + { + poc_node1 = ps_dpb_node1->ps_pic_buf->i4_abs_poc; + poc_node2 = ps_dpb_node2->ps_pic_buf->i4_abs_poc; + ASSERT(poc_node1 != curr_poc); + ASSERT(poc_node2 != curr_poc); + if(((poc_node1 < curr_poc) && (poc_node2 > curr_poc)) || + ((poc_node1 < curr_poc) && (poc_node2 < curr_poc) && (poc_node1 > poc_node2)) || + ((poc_node1 > curr_poc) && (poc_node2 > curr_poc) && (poc_node1 < poc_node2))) + continue; + + ps_pic_buf = ps_dpb_node1->ps_pic_buf; + ps_dpb_node1->ps_pic_buf = ps_dpb_node2->ps_pic_buf; + ps_dpb_node2->ps_pic_buf = ps_pic_buf; + } + } + + ps_dpb_node1 = ps_dpb_mgr->ps_dpb_short_term_head; + ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb; + while(ps_dpb_node2 != NULL) + { + pic_buf_t *ps_pic_node1 = ps_dpb_node1->ps_pic_buf; + pic_buf_t *ps_pic_node2 = ps_dpb_node2->ps_pic_buf; + frame_num_node1 = ps_pic_node1->i4_frame_num; + frame_num_node2 = ps_pic_node2->i4_frame_num; + if(frame_num_node1 == frame_num_node2) + { + ASSERT(ps_pic_node1->i1_field_type != ps_pic_node2->i1_field_type); + if(ps_pic_node1->i1_field_type != first_field_type) + { + ps_dpb_node1->ps_pic_buf = ps_pic_node2; + ps_dpb_node2->ps_pic_buf = ps_pic_node1; + } + } + ps_dpb_node1 = ps_dpb_node2; + ps_dpb_node2 = ps_dpb_node2->ps_prev_dpb; + } + return 0; + +} + +/** + ******************************************************************************* + * + * @brief + * Function to sort sort term pics by poc for list 1. + * + * @par Description: + * Orders all the pocs greater than current poc in the ascending order. + * Then rrders all the pocs less than current poc in the descending order. 
 *
 * @param[in] ps_dpb_mgr
 *  Pointer to the DPB manager structure
 *
 * @param[in] curr_poc
 *  Poc of the current pic
 *
 * @param[in] first_field_type
 *  For complementary fields, required first field
 *
 * @returns  -1 if the short-term list is empty, 0 otherwise
 *
 * @remarks
 *  Only the ps_pic_buf payloads are swapped between nodes; the linked-list
 *  nodes themselves are not relinked.
 *
 *******************************************************************************
 */
WORD32 ih264_dpb_mgr_sort_short_term_fields_by_poc_l1(dpb_mgr_t *ps_dpb_mgr,
                                                      WORD32 curr_poc,
                                                      WORD32 first_field_type)
{
    dpb_info_t *ps_dpb_node1 = ps_dpb_mgr->ps_dpb_short_term_head;
    dpb_info_t *ps_dpb_node2;
    WORD32 poc_node1;
    WORD32 poc_node2;
    WORD32 frame_num_node1;
    WORD32 frame_num_node2;
    pic_buf_t *ps_pic_buf;

    if(ps_dpb_node1 == NULL)
        return -1;

    /**
     * Sort the fields by poc.
     * All POCs greater than current poc are first placed in the ascending order.
     * Then all POCs less than current poc are placed in the decending order.
     */
    for(; ps_dpb_node1 != NULL; ps_dpb_node1 = ps_dpb_node1->ps_prev_dpb)
    {
        for(ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb; ps_dpb_node2 != NULL;
                        ps_dpb_node2 = ps_dpb_node2->ps_prev_dpb)
        {
            poc_node1 = ps_dpb_node1->ps_pic_buf->i4_abs_poc;
            poc_node2 = ps_dpb_node2->ps_pic_buf->i4_abs_poc;

            /* The current picture itself is never expected in the ref list */
            ASSERT(poc_node1 != curr_poc);
            ASSERT(poc_node2 != curr_poc);

            /* Skip the swap when the pair is already in the desired order:
             * (greater, less), (both less, descending) or
             * (both greater, ascending) */
            if(((poc_node1 > curr_poc) && (poc_node2 < curr_poc)) ||
               ((poc_node1 < curr_poc) && (poc_node2 < curr_poc) &&
                (poc_node1 > poc_node2)) ||
               ((poc_node1 > curr_poc) && (poc_node2 > curr_poc) &&
                (poc_node1 < poc_node2)))
                continue;

            /* Swap the picture buffers held by the two nodes */
            ps_pic_buf = ps_dpb_node1->ps_pic_buf;
            ps_dpb_node1->ps_pic_buf = ps_dpb_node2->ps_pic_buf;
            ps_dpb_node2->ps_pic_buf = ps_pic_buf;
        }
    }

    /*
     * For two fields with the same frame_num (a complementary pair), make
     * sure the field of first_field_type comes first.
     */
    ps_dpb_node1 = ps_dpb_mgr->ps_dpb_short_term_head;
    ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb;
    while(ps_dpb_node2 != NULL)
    {
        pic_buf_t *ps_pic_node1 = ps_dpb_node1->ps_pic_buf;
        pic_buf_t *ps_pic_node2 = ps_dpb_node2->ps_pic_buf;
        frame_num_node1 = ps_pic_node1->i4_frame_num;
        frame_num_node2 = ps_pic_node2->i4_frame_num;
        if(frame_num_node1 == frame_num_node2)
        {
            ASSERT(ps_pic_node1->i1_field_type != ps_pic_node2->i1_field_type);
            if(ps_pic_node1->i1_field_type != first_field_type)
            {
                ps_dpb_node1->ps_pic_buf = ps_pic_node2;
                ps_dpb_node2->ps_pic_buf = ps_pic_node1;
            }
        }
        ps_dpb_node1 = ps_dpb_node2;
        ps_dpb_node2 = ps_dpb_node2->ps_prev_dpb;
    }
    return 0;
}
/**
 *******************************************************************************
 *
 * @brief
 *  Function to sort long term pics by long term frame idx.
 *
 * @par Description:
 *  Sorts long term fields by long term frame idx. For 2 fields
 *  having same frame_num, orders them based on requested first field type.
 *
 * @param[in] ps_dpb_mgr
 *  Pointer to the DPB manager structure
 *
 * @param[in] first_field_type
 *  For complementary fields, required first field
 *
 * @returns  -1 if the long-term list is empty, 0 otherwise
 *
 * @remarks
 *
 *******************************************************************************
 */
WORD32 ih264_dpb_mgr_sort_long_term_fields_by_frame_idx(dpb_mgr_t *ps_dpb_mgr,
                                                        WORD32 first_field_type)
{
    dpb_info_t *ps_dpb_node1 = ps_dpb_mgr->ps_dpb_long_term_head;
    dpb_info_t *ps_dpb_node2;
    WORD32 frame_idx_node1;
    WORD32 frame_idx_node2;
    pic_buf_t *ps_pic_buf;

    if(ps_dpb_node1 == NULL)
        return -1;

    /* Sort the fields by frame idx (ascending), swapping payloads only */
    for(; ps_dpb_node1 != NULL; ps_dpb_node1 = ps_dpb_node1->ps_prev_dpb)
    {
        for(ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb; ps_dpb_node2 != NULL;
                        ps_dpb_node2 = ps_dpb_node2->ps_prev_dpb)
        {
            frame_idx_node1 = ps_dpb_node1->ps_pic_buf->i4_long_term_frame_idx;
            frame_idx_node2 = ps_dpb_node2->ps_pic_buf->i4_long_term_frame_idx;

            if(frame_idx_node1 > frame_idx_node2)
            {
                ps_pic_buf = ps_dpb_node1->ps_pic_buf;
                ps_dpb_node1->ps_pic_buf = ps_dpb_node2->ps_pic_buf;
                ps_dpb_node2->ps_pic_buf = ps_pic_buf;
            }
        }
    }

    /**
     * For frames and complementary field pairs,
     * ensure first_field_type appears first in the list
     */
    ps_dpb_node1 = ps_dpb_mgr->ps_dpb_long_term_head;
    ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb;
    while(ps_dpb_node2 != NULL)
    {
        pic_buf_t *ps_pic_node1 = ps_dpb_node1->ps_pic_buf;
        pic_buf_t *ps_pic_node2 = ps_dpb_node2->ps_pic_buf;
        frame_idx_node1 = ps_pic_node1->i4_long_term_frame_idx;
        frame_idx_node2 = ps_pic_node2->i4_long_term_frame_idx;
        if(frame_idx_node1 == frame_idx_node2)
        {
            ASSERT(ps_pic_node1->i1_field_type != ps_pic_node2->i1_field_type);
            if(ps_pic_node1->i1_field_type != first_field_type)
            {
                ps_dpb_node1->ps_pic_buf = ps_pic_node2;
                ps_dpb_node2->ps_pic_buf = ps_pic_node1;
            }
        }
        ps_dpb_node1 = ps_dpb_node2;
        ps_dpb_node2 = ps_dpb_node2->ps_prev_dpb;
    }
    return 0;
}

/**
 *******************************************************************************
 *
 * @brief
 *  Function to alternate fields.
 *
 * @par Description:
 *  In the ordered list of fields, alternate fields starting with
 *  first_field_type
 *
 * @param[in] ps_dpb_mgr
 *  Pointer to the DPB manager structure
 *
 * @param[in] reference_type
 *  This is used to select between short-term and long-term linked list.
 *
 * @param[in] first_field_type
 *  For complementary fields, required first field
 *
 * @returns  0
 *
 * @remarks
 *  Unlike the sort routines above, this routine relinks nodes through
 *  ps_prev_dpb instead of swapping payloads.
 *
 *******************************************************************************
 */
WORD32 ih264_dpb_mgr_alternate_ref_fields(dpb_mgr_t *ps_dpb_mgr,
                                          WORD32 reference_type,
                                          WORD32 first_field_type)
{
    dpb_info_t s_dpb_head;  /* dummy head node simplifies relinking at the front */
    dpb_info_t *ps_dpb_head;
    dpb_info_t *ps_dpb_node1;
    dpb_info_t *ps_dpb_node2;
    dpb_info_t *ps_dpb_node3;
    dpb_info_t *ps_dpb_node4;
    WORD32 expected_field;

    expected_field = first_field_type;

    ps_dpb_head = &s_dpb_head;

    ps_dpb_head->ps_prev_dpb = (reference_type == SHORT_TERM_REF) ?
                                    ps_dpb_mgr->ps_dpb_short_term_head:
                                    ps_dpb_mgr->ps_dpb_long_term_head;

    ps_dpb_node1 = ps_dpb_head;
    ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb;
    while(ps_dpb_node2 != NULL)
    {
        pic_buf_t *ps_pic_node2 = ps_dpb_node2->ps_pic_buf;
        if(ps_pic_node2->i1_field_type != expected_field)
        {
            /*
             * If it is not expected field, loop over the node till
             * the expected field.
             */
            ps_dpb_node3 = ps_dpb_node2;
            ps_dpb_node4 = ps_dpb_node2->ps_prev_dpb;
            while((ps_dpb_node4 != NULL) &&
                  (ps_dpb_node4->ps_pic_buf->i1_field_type != expected_field))
            {
                ps_dpb_node3 = ps_dpb_node4;
                ps_dpb_node4 = ps_dpb_node4->ps_prev_dpb;
            }
            if(ps_dpb_node4 != NULL)
            {
                /* Unlink node4 and move it in front of node2 */
                ps_dpb_node1->ps_prev_dpb = ps_dpb_node4;
                ps_dpb_node3->ps_prev_dpb = ps_dpb_node4->ps_prev_dpb;
                ps_dpb_node4->ps_prev_dpb = ps_dpb_node2;
            }
            else
            {
                /* node4 null means we have reached the end */
                break;
            }
        }
        ps_dpb_node1 = ps_dpb_node1->ps_prev_dpb;
        ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb;
        /* Next field must be the complement of the one just accepted */
        expected_field = (ps_dpb_node1->ps_pic_buf->i1_field_type == TOP_FIELD)?
                                    BOTTOM_FIELD:TOP_FIELD;
    }

    if((reference_type == SHORT_TERM_REF))
    {
        ps_dpb_mgr->ps_dpb_short_term_head = ps_dpb_head->ps_prev_dpb;
    }
    else
    {
        ps_dpb_mgr->ps_dpb_long_term_head = ps_dpb_head->ps_prev_dpb;
    }

    return 0;
}

/**
 *******************************************************************************
 *
 * @brief
 *  Add a ref field to short-term or long-term linked list.
 *
 * @par Description:
 *  This function adds a ref field to either short-term or long-term linked
 *  list. It picks up memory for the link from the array of dpb_info in
 *  dpb_mgr. The field is added to the beginning of the linked list and the
 *  head is set to the field.
 *
 * @param[in] ps_dpb_mgr
 *  Pointer to the DPB manager structure
 *
 * @param[in] ps_pic_buf
 *  Pic buf structure for the field being added.
 *
 * @param[in] reference_type
 *  This is used to select between short-term and long-term linked list.
 *
 * @param[in] frame_num
 *  frame_num for the field.
 *
 * @param[in] long_term_frame_idx
 *  If the ref being added is long-term, long_term_frame_idx of the field.
 *  Otherwise invalid.
 *
 * @returns  -1 if the field is already in the DPB or the DPB is full,
 *           0 otherwise
 *
 * @remarks
 *
 *******************************************************************************
 */
WORD32 ih264_dpb_mgr_insert_ref_field(dpb_mgr_t *ps_dpb_mgr,
                                      pic_buf_t *ps_pic_buf,
                                      WORD32 reference_type,
                                      UWORD32 frame_num,
                                      WORD32 long_term_frame_idx)
{
    WORD32 i;
    dpb_info_t *ps_dpb_info;
    dpb_info_t *ps_dpb_head;

    ps_dpb_info = ps_dpb_mgr->as_dpb_info;

    /* Return error if buffer is already present in the DPB */
    for(i = 0; i < MAX_DPB_BUFS; i++)
    {
        if((ps_dpb_info[i].ps_pic_buf == ps_pic_buf)
                        && (ps_dpb_info[i].ps_pic_buf->i4_used_as_ref == reference_type))
        {
            return (-1);
        }
    }

    /* Find an unused DPB location (unused slots hold a NULL ps_pic_buf) */
    for(i = 0; i < MAX_DPB_BUFS; i++)
    {
        if(NULL == ps_dpb_info[i].ps_pic_buf)
        {
            break;
        }
    }
    if(i == MAX_DPB_BUFS)
    {
        return (-1);
    }

    ps_dpb_head = (reference_type == SHORT_TERM_REF)
                    ? ps_dpb_mgr->ps_dpb_short_term_head
                    : ps_dpb_mgr->ps_dpb_long_term_head;

    /* Long-term frame idx is meaningless for a short-term ref */
    if(reference_type == SHORT_TERM_REF)
        long_term_frame_idx = -1;

    /* Create DPB info */
    ps_dpb_info[i].ps_pic_buf = ps_pic_buf;
    ps_dpb_info[i].ps_prev_dpb = ps_dpb_head;
    ps_dpb_info[i].ps_pic_buf->i4_used_as_ref = reference_type;
    ps_dpb_info[i].ps_pic_buf->i4_frame_num = frame_num;
    ps_dpb_info[i].ps_pic_buf->i4_long_term_frame_idx = long_term_frame_idx;

    /* update the head node of linked list to point to the current picture */
    if(reference_type == SHORT_TERM_REF)
    {
        ps_dpb_mgr->ps_dpb_short_term_head = ps_dpb_info + i;

        /* Increment Short term buffer count */
        ps_dpb_mgr->u1_num_short_term_ref_bufs++;
    }
    else
    {
        ps_dpb_mgr->ps_dpb_long_term_head = ps_dpb_info + i;

        /* Increment Long term buffer count */
        ps_dpb_mgr->u1_num_long_term_ref_bufs++;
    }

    return 0;
}

/**
 *******************************************************************************
 *
 * @brief
 *  Add a ref frame to short-term or long-term linked list.
 *
 * @par Description:
 *  This function adds a ref frame to either short-term or long-term linked
 *  list. Internally it calls add ref field twice to add top and bottom field.
 *
 * @param[in] ps_dpb_mgr
 *  Pointer to the DPB manager structure
 *
 * @param[in] ps_pic_buf
 *  Pic buf structure of the top field being added.
 *
 * @param[in] reference_type
 *  This is used to select between short-term and long-term linked list.
 *
 * @param[in] frame_num
 *  frame_num for the field.
 *
 * @param[in] long_term_frame_idx
 *  If the ref being added is long-term, long_term_frame_idx of the field.
 *  Otherwise invalid.
 *
 * @returns  0 on success, -1 if either field insert fails
 *
 * @remarks
 *
 *******************************************************************************
 */
WORD32 ih264_dpb_mgr_insert_ref_frame(dpb_mgr_t *ps_dpb_mgr,
                                      pic_buf_t *ps_pic_buf,
                                      WORD32 reference_type,
                                      UWORD32 frame_num,
                                      WORD32 long_term_frame_idx)
{
    WORD32 buf_id;
    pic_buf_t *ps_pic_top;
    pic_buf_t *ps_pic_bottom;
    WORD32 ret;

    /*
     * For a frame, since the ps_pic_buf passed to this function is that of top field
     * obtain bottom field using buf_id.
     */
    ps_pic_top = ps_pic_buf;
    buf_id = ps_pic_top->i4_buf_id;
    ps_pic_bottom = &ps_dpb_mgr->as_bottom_field_pics[buf_id];

    /* Insert top field */
    ret = ih264_dpb_mgr_insert_ref_field(ps_dpb_mgr,
                                         ps_pic_top,
                                         reference_type,
                                         frame_num,
                                         long_term_frame_idx);

    if(ret != 0)
        return ret;

    /* Insert bottom field */
    ret = ih264_dpb_mgr_insert_ref_field(ps_dpb_mgr,
                                         ps_pic_bottom,
                                         reference_type,
                                         frame_num,
                                         long_term_frame_idx);

    if(ret != 0)
        return ret;

    return ret;
}

/**
 *******************************************************************************
 *
 * @brief
 *  Returns the number of ref frames in both the linked list.
 *
 * @par Description:
 *  Returns the count of number of frames, number of complementary field pairs
 *  and number of unpaired fields. Two consecutive nodes sharing a frame_num
 *  are counted once.
 *
 * @param[in] ps_dpb_mgr
 *  Pointer to the DPB manager structure
 *
 * @param[in] curr_frame_num
 *  frame_num for the field.
 *
 * @param[in] max_frame_num
 *  Maximum frame_num allowed
 *
 * @returns  Total count of short-term plus long-term entries
 *
 * @remarks
 *  Sorts both lists as a side effect.
 *
 *******************************************************************************
 */
WORD32 ih264_dpb_mgr_count_ref_frames(dpb_mgr_t *ps_dpb_mgr,
                                      WORD32 curr_frame_num,
                                      WORD32 max_frame_num)
{
    WORD32 numShortTerm = 0;
    WORD32 numLongTerm = 0;
    dpb_info_t *ps_dpb_node;
    WORD32 frame_num;
    WORD32 prev_frame_num;

    /*
     * Compute the number of short-term frames/complementary field pairs/
     * unpaired fields
     */
    if(ps_dpb_mgr->ps_dpb_short_term_head != NULL)
    {
        /* Sort the short-term list by frame_num so that fields of the same
         * frame are adjacent */
        ih264_dpb_mgr_sort_short_term_fields_by_frame_num(ps_dpb_mgr,
                                                          curr_frame_num,
                                                          TOP_FIELD,
                                                          max_frame_num);

        ps_dpb_node = ps_dpb_mgr->ps_dpb_short_term_head;
        if(ps_dpb_node != NULL)
        {
            numShortTerm++;
            prev_frame_num = ps_dpb_node->ps_pic_buf->i4_frame_num;
            ps_dpb_node = ps_dpb_node->ps_prev_dpb;
        }

        while(ps_dpb_node != NULL)
        {
            frame_num = ps_dpb_node->ps_pic_buf->i4_frame_num;
            /* A new frame_num means a new frame/pair/unpaired field */
            if(frame_num != prev_frame_num)
                numShortTerm++;
            prev_frame_num = ps_dpb_node->ps_pic_buf->i4_frame_num;
            ps_dpb_node = ps_dpb_node->ps_prev_dpb;
        }
    }

    /*
     * Compute the number of long-term frames/complementary field pairs/
     * unpaired fields
     */
    if(ps_dpb_mgr->ps_dpb_long_term_head != NULL)
    {
        ih264_dpb_mgr_sort_long_term_fields_by_frame_idx(ps_dpb_mgr,
                                                         TOP_FIELD);

        ps_dpb_node = ps_dpb_mgr->ps_dpb_long_term_head;
        if(ps_dpb_node != NULL)
        {
            numLongTerm++;
            prev_frame_num = ps_dpb_node->ps_pic_buf->i4_frame_num;
            ps_dpb_node = ps_dpb_node->ps_prev_dpb;
        }

        while(ps_dpb_node != NULL)
        {
            frame_num = ps_dpb_node->ps_pic_buf->i4_frame_num;
            if(frame_num != prev_frame_num)
                numLongTerm++;
            prev_frame_num = ps_dpb_node->ps_pic_buf->i4_frame_num;
            ps_dpb_node = ps_dpb_node->ps_prev_dpb;
        }
    }
    return (numShortTerm + numLongTerm);
}

/**
 *******************************************************************************
 *
 * @brief
 *  Deletes the ref frame at the end of the linked list.
 *
 * @par Description:
 *  Deletes the ref frame at the end of the linked list. For unpaired fields,
 *  it deletes just the last node. For frame or complementary field pair, it
 *  deletes the last two nodes.
 *
 * @param[in] ps_dpb_mgr
 *  Pointer to the DPB manager structure
 *
 * @param[in] reference_type
 *  This is used to select between short-term and long-term linked list.
 *
 * @returns  0
 *
 * @remarks
 *  NOTE(review): u1_num_short/long_term_ref_bufs are only reset when the list
 *  becomes empty; they are not decremented when a frame is removed from a
 *  longer list — confirm callers recompute the counts.
 *
 *******************************************************************************
 */
WORD32 ih264_dpb_mgr_delete_ref_frame(dpb_mgr_t *ps_dpb_mgr,
                                      WORD32 reference_type)
{
    dpb_info_t *ps_dpb_node1;
    dpb_info_t *ps_dpb_node2;
    dpb_info_t *ps_dpb_node3;

    /*
     * Assumption: The nodes sorted for frame num.
     */

    /* Select bw short-term and long-term list. */
    ps_dpb_node1 = (reference_type == SHORT_TERM_REF)
                    ?ps_dpb_mgr->ps_dpb_short_term_head
                    :ps_dpb_mgr->ps_dpb_long_term_head;
    /* If null, no entries in the list. Hence return. */
    if(ps_dpb_node1 == NULL)
        return 0;

    /* If only one node in the list, set it as unused for reference and return. */
    if(ps_dpb_node1->ps_prev_dpb == NULL)
    {
        /* Set the picture as unused for reference */
        ps_dpb_node1->ps_pic_buf->i4_used_as_ref = UNUSED_FOR_REF;
        ps_dpb_node1->ps_pic_buf = NULL;

        if(reference_type == SHORT_TERM_REF)
        {
            ps_dpb_mgr->ps_dpb_short_term_head = NULL;

            /* Reset short term buffer count */
            ps_dpb_mgr->u1_num_short_term_ref_bufs = 0;
        }
        else
        {
            ps_dpb_mgr->ps_dpb_long_term_head = NULL;

            /* Reset long term buffer count */
            ps_dpb_mgr->u1_num_long_term_ref_bufs = 0;
        }
        return 0;
    }

    /**
     * If there are only 2 nodes in the list, set second node as unused for reference.
     * If the frame_num of second node and first node is same, set first node also as
     * unused for reference and set the corresponding head to NULL.
     */
    ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb;
    if(ps_dpb_node2->ps_prev_dpb == NULL)
    {
        /* Set the picture as unused for reference */
        if(ps_dpb_node2->ps_pic_buf->i4_frame_num == ps_dpb_node1->ps_pic_buf->i4_frame_num)
        {
            /* Both nodes belong to the same frame: drop the first one too */
            ps_dpb_node1->ps_pic_buf->i4_used_as_ref = UNUSED_FOR_REF;
            ps_dpb_node1->ps_pic_buf = NULL;
            if(reference_type == SHORT_TERM_REF)
            {
                ps_dpb_mgr->ps_dpb_short_term_head = NULL;

                /* Reset short term buffer count */
                ps_dpb_mgr->u1_num_short_term_ref_bufs = 0;
            }
            else
            {
                ps_dpb_mgr->ps_dpb_long_term_head = NULL;

                /* Reset long term buffer count */
                ps_dpb_mgr->u1_num_long_term_ref_bufs = 0;
            }
        }
        ps_dpb_node2->ps_pic_buf->i4_used_as_ref = UNUSED_FOR_REF;
        ps_dpb_node2->ps_pic_buf = NULL;
        ps_dpb_node1->ps_prev_dpb = NULL;
        return 0;
    }
    /*
     * If there are more than 2 nodes, run a loop to get the last 3 nodes.
     */
    ps_dpb_node3 = ps_dpb_node2->ps_prev_dpb;
    while(ps_dpb_node3->ps_prev_dpb != NULL)
    {
        ps_dpb_node1 = ps_dpb_node2;
        ps_dpb_node2 = ps_dpb_node3;
        ps_dpb_node3 = ps_dpb_node3->ps_prev_dpb;
    }
    /*
     * If node 2 and node 3 frame_nums are same (fields of one frame), set
     * node 2 also as unused for reference and unlink it from node1.
     */
    if(ps_dpb_node2->ps_pic_buf->i4_frame_num == ps_dpb_node3->ps_pic_buf->i4_frame_num)
    {
        ps_dpb_node2->ps_pic_buf->i4_used_as_ref = UNUSED_FOR_REF;
        ps_dpb_node2->ps_pic_buf = NULL;
        ps_dpb_node1->ps_prev_dpb = NULL;
    }
    /* Set the third node as unused for reference */
    ps_dpb_node3->ps_pic_buf->i4_used_as_ref = UNUSED_FOR_REF;
    ps_dpb_node3->ps_pic_buf = NULL;
    ps_dpb_node2->ps_prev_dpb = NULL;

    return 0;
}
/**
 *******************************************************************************
 *
 * @brief
 *  Delete long-term ref fields above max frame idx.
+ * + * @par Description: + * Deletes all the long-term ref fields having idx greater than max_frame_idx + * + * @param[in] ps_dpb_mgr + * Pointer to the DPB manager structure + * + * @param[in] max_frame_idx + * Max long-term frame idx allowed. + * + * @returns + * + * @remarks + * + * + ******************************************************************************* + */ +WORD32 ih264_dpb_mgr_delete_long_ref_fields_max_frame_idx(dpb_mgr_t *ps_dpb_mgr, + WORD32 max_frame_idx) +{ + dpb_info_t *ps_dpb_node1; + dpb_info_t *ps_dpb_node2; + /* + * Loop until there is node which isn't to be deleted is encountered. + */ + while(ps_dpb_mgr->ps_dpb_long_term_head != NULL) + { + if(ps_dpb_mgr->ps_dpb_long_term_head->ps_pic_buf->i4_long_term_frame_idx + <= max_frame_idx) + { + break; + } + ps_dpb_mgr->ps_dpb_long_term_head->ps_pic_buf->i4_used_as_ref = UNUSED_FOR_REF; + ps_dpb_mgr->ps_dpb_long_term_head->ps_pic_buf = NULL; + ps_dpb_mgr->ps_dpb_long_term_head = ps_dpb_mgr->ps_dpb_long_term_head->ps_prev_dpb; + } + + ps_dpb_node1 = ps_dpb_mgr->ps_dpb_long_term_head; + if(ps_dpb_node1 == NULL) + return 0; + /* + * With the node that isn't to be deleted as head, loop until the end. + */ + ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb; + while(ps_dpb_node2 != NULL) + { + if(ps_dpb_node2->ps_pic_buf->i4_long_term_frame_idx > max_frame_idx) + { + ps_dpb_node2->ps_pic_buf->i4_used_as_ref = UNUSED_FOR_REF; + ps_dpb_node2->ps_pic_buf = NULL; + ps_dpb_node1->ps_prev_dpb = ps_dpb_node2->ps_prev_dpb; + } + ps_dpb_node1 = ps_dpb_node1->ps_prev_dpb; + if(ps_dpb_node1 == NULL) + break; + ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb; + } + return 0; +} + +/** + ******************************************************************************* + * + * @brief + * Deletes the short-term with least frame_num + * + * @par Description: + * Deletes the short-term with least frame_num. 
It sorts the function the + * short-term linked list by frame-num and the function that deletes the last + * frame in the linked list. + * + * @param[in] ps_dpb_mgr + * Pointer to the DPB manager structure + * + * @param[in] curr_frame_num + * frame_num of the current pic + * + * @param[in] max_frame_num + * Maximum frame_num allowed + * + * @returns + * + * @remarks + * + * + ******************************************************************************* + */ +WORD32 ih264_dpb_mgr_delete_short_ref_frame(dpb_mgr_t *ps_dpb_mgr, + WORD32 curr_frame_num, + WORD32 max_frame_num) +{ + WORD32 ret; + /* Sort the short-term list by frame_num */ + ret = ih264_dpb_mgr_sort_short_term_fields_by_frame_num(ps_dpb_mgr, + curr_frame_num, + TOP_FIELD, + max_frame_num); + + /* Delete the last reference frame or field */ + ret = ih264_dpb_mgr_delete_ref_frame(ps_dpb_mgr,SHORT_TERM_REF); + + if(ret != 0) + { + ASSERT(0); + } + + return ret; +} +/** + ******************************************************************************* + * + * @brief + * Deletes all the ref frames. + * + * @par Description: + * Deletes all of the ref frames/fields in the short-term and long-term linked + * list. + * + * @param[in] ps_dpb_mgr + * Pointer to the DPB manager structure + * + * @returns + * + * @remarks + * + * + ******************************************************************************* + */ +WORD32 ih264_dpb_mgr_delete_all_ref_frames(dpb_mgr_t *ps_dpb_mgr) +{ + /* Loop over short-term linked list. */ + while(ps_dpb_mgr->ps_dpb_short_term_head != NULL) + { + ih264_dpb_mgr_delete_ref_frame(ps_dpb_mgr,SHORT_TERM_REF); + } + + /* Loop over long-term linked list. 
*/ + while(ps_dpb_mgr->ps_dpb_long_term_head != NULL) + { + ih264_dpb_mgr_delete_ref_frame(ps_dpb_mgr,LONG_TERM_REF); + } + return 0; +} + + +void ih264_dpb_mgr_reset(dpb_mgr_t *ps_dpb_mgr, buf_mgr_t *ps_buf_mgr) +{ + WORD32 i; + dpb_info_t *ps_dpb_info; + ASSERT(0); + + + ps_dpb_info = ps_dpb_mgr->as_dpb_info; + + for(i = 0; i < MAX_DPB_BUFS; i++) + { + if(ps_dpb_info[i].ps_pic_buf->i4_used_as_ref) + { + ps_dpb_info[i].ps_pic_buf->i4_used_as_ref = UNUSED_FOR_REF; + ps_dpb_info[i].ps_prev_dpb = NULL; + //Release physical buffer + ih264_buf_mgr_release(ps_buf_mgr, ps_dpb_info[i].ps_pic_buf->i4_buf_id, + BUF_MGR_REF); + + ps_dpb_info[i].ps_pic_buf = NULL; + } + } + ps_dpb_mgr->u1_num_short_term_ref_bufs = 0; + ps_dpb_mgr->u1_num_long_term_ref_bufs = 0; + ps_dpb_mgr->ps_dpb_short_term_head = NULL; + ps_dpb_mgr->ps_dpb_long_term_head = NULL; + +} + +/** + ******************************************************************************* + * + * @brief + * deletes all pictures from DPB + * + * @par Description: + * Deletes all pictures present in the DPB manager + * + * @param[in] ps_buf_mgr + * Pointer to buffer manager structure + * + * @param[in] u1_disp_bufs + * Number of buffers to be deleted + * + * @returns + * + * @remarks + * + * + ******************************************************************************* + */ + +void ih264_dpb_mgr_release_pics(buf_mgr_t *ps_buf_mgr, UWORD8 u1_disp_bufs) +{ + WORD8 i; + UWORD32 buf_status; + ASSERT(0); + + for(i = 0; i < u1_disp_bufs; i++) + { + buf_status = ih264_buf_mgr_get_status(ps_buf_mgr, i); + if(0 != buf_status) + { + ih264_buf_mgr_release((buf_mgr_t *)ps_buf_mgr, i, BUF_MGR_REF); + } + } +} diff --git a/common/ih264_dpb_mgr.h b/common/ih264_dpb_mgr.h new file mode 100755 index 0000000..b0cf0fd --- /dev/null +++ b/common/ih264_dpb_mgr.h @@ -0,0 +1,186 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the 
Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/

/**
 *******************************************************************************
 * @file
 *  ih264_dpb_mgr.h
 *
 * @brief
 *  Function declarations used for decoded picture buffer management
 *
 * @author
 *  Srinivas T
 *
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
#ifndef _IH264_DPB_MGR_H_
#define _IH264_DPB_MGR_H_

/* Temporary definitions. Have to be defined later */

/* Maximum number of DPB linked-list slots (4 per DPB picture) */
#define MAX_DPB_BUFS (MAX_DPB_SIZE * 4)

/* Memory-management control operation codes */
#define MARK_ST_PICNUM_AS_NONREF 1
#define MARK_LT_INDEX_AS_NONREF 2
#define MARK_ST_PICNUM_AS_LT_INDEX 3
#define RESET_REF_PICTURES 5

typedef struct dpb_info_t dpb_info_t;

/* Reference status of a picture buffer (i4_used_as_ref) */
enum
{
    INVALID = -1,
    UNUSED_FOR_REF = 0 ,
    LONG_TERM_REF ,
    SHORT_TERM_REF ,
};
struct dpb_info_t
{
    /**
     * Pointer to picture buffer structure; NULL when this slot is unused
     */
    pic_buf_t *ps_pic_buf;

    /**
     * Link to the DPB buffer with previous link
     */
    dpb_info_t *ps_prev_dpb;

};

typedef struct
{
    /**
     * Head of the short-term reference linked list (most recent pic first)
     */
    dpb_info_t *ps_dpb_short_term_head;

    /**
     * Head of the long-term reference linked list
     */
    dpb_info_t *ps_dpb_long_term_head;

    /**
     * Physical storage for dpbInfo for ref bufs
     */
    dpb_info_t as_dpb_info[MAX_DPB_BUFS];

    /**
     * Array of structures for top field.
     */
    pic_buf_t as_top_field_pics[MAX_DPB_BUFS];

    /**
     * Array of structures for bottom field.
     */
    pic_buf_t as_bottom_field_pics[MAX_DPB_BUFS];

    /**
     * Number of short-term reference buffers
     */
    UWORD8 u1_num_short_term_ref_bufs;

    /**
     * Number of long-term reference buffers
     */
    UWORD8 u1_num_long_term_ref_bufs;

    /**
     * buffer ID current frame
     */
    WORD32 i4_cur_frame_buf_id;

} dpb_mgr_t;

void ih264_dpb_mgr_init(dpb_mgr_t *ps_dpb_mgr);

WORD32 ih264_dpb_mgr_insert_ref_frame(dpb_mgr_t *ps_dpb_mgr,
                                      pic_buf_t *ps_pic_buf,
                                      WORD32 reference_type,
                                      UWORD32 frame_num,
                                      WORD32 long_term_frame_idx);

WORD32 ih264_dpb_mgr_delete_ref_frame(dpb_mgr_t *ps_dpb_mgr,
                                      WORD32 reference_type);

WORD32 ih264_dpb_mgr_delete_all_ref_frames(dpb_mgr_t *ps_dpb_mgr);

WORD32 ih264_dpb_mgr_count_ref_frames(dpb_mgr_t *ps_dpb_mgr,
                                      WORD32 curr_frame_num,
                                      WORD32 max_frame_num);

WORD32 ih264_dpb_mgr_delete_short_ref_frame(dpb_mgr_t *ps_dpb_mgr,
                                            WORD32 curr_frame_num,
                                            WORD32 max_frame_num);

WORD32 ih264_dpb_mgr_insert_ref_field(dpb_mgr_t *ps_dpb_mgr,
                                      pic_buf_t *ps_pic_buf,
                                      WORD32 reference_type,
                                      UWORD32 frame_num,
                                      WORD32 long_term_frame_idx);

WORD32 ih264_dpb_mgr_delete_ref_field(dpb_mgr_t *ps_dpb_mgr,
                                      WORD32 reference_type);

WORD32 ih264_dpb_mgr_alternate_ref_fields(dpb_mgr_t *ps_dpb_mgr,
                                          WORD32 reference_type,
                                          WORD32 first_field_type);

WORD32 ih264_dpb_mgr_sort_short_term_fields_by_frame_num(dpb_mgr_t *ps_dpb_mgr,
                                                         WORD32 curr_frame_num,
                                                         WORD32 first_field_type,
                                                         WORD32 max_frame_num);

WORD32 ih264_dpb_mgr_sort_short_term_fields_by_poc_l0(dpb_mgr_t *ps_dpb_mgr,
                                                      WORD32 curr_poc,
                                                      WORD32 first_field_type);

WORD32 ih264_dpb_mgr_sort_short_term_fields_by_poc_l1(dpb_mgr_t *ps_dpb_mgr,
                                                      WORD32 curr_poc,
                                                      WORD32 first_field_type);

WORD32 ih264_dpb_mgr_sort_long_term_fields_by_frame_idx(dpb_mgr_t *ps_dpb_mgr,
                                                        WORD32 first_field_type);

WORD32 ih264_dpb_mgr_delete_long_ref_fields_max_frame_idx(dpb_mgr_t *ps_dpb_mgr,
                                                          WORD32 max_frame_idx);

void ih264_dpb_mgr_del_ref(dpb_mgr_t *ps_dpb_mgr,
                           buf_mgr_t *ps_buf_mgr,
                           WORD32 u4_abs_poc);

pic_buf_t *ih264_dpb_mgr_get_ref_by_nearest_poc(dpb_mgr_t *ps_dpb_mgr,
                                                WORD32 cur_abs_poc);

pic_buf_t *ih264_dpb_mgr_get_ref_by_poc(dpb_mgr_t *ps_dpb_mgr, WORD32 abs_poc);

pic_buf_t *ih264_dpb_mgr_get_ref_by_poc_lsb(dpb_mgr_t *ps_dpb_mgr,
                                            WORD32 poc_lsb);

void ih264_dpb_mgr_reset(dpb_mgr_t *ps_dpb_mgr, buf_mgr_t *ps_buf_mgr);

void ih264_dpb_mgr_release_pics(buf_mgr_t *ps_buf_mgr, UWORD8 u1_disp_bufs);

#endif /* _IH264_DPB_MGR_H_ */
diff --git a/common/ih264_error.h b/common/ih264_error.h
new file mode 100755
index 0000000..ff1662d
--- /dev/null
+++ b/common/ih264_error.h
@@ -0,0 +1,68 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
* ih264_error.h
*
* @brief
* Definitions related to error handling for common modules
*
* @author
* Harish
*
* @par List of Functions:
*
* @remarks
* None
*
*******************************************************************************
*/

#ifndef _IH264_ERROR_H_
#define _IH264_ERROR_H_

/**
 * Enumerations for error codes used in the codec.
 * Not all these are expected to be returned to the application.
 * Only select few will be exported
 */
typedef enum
{
    /**
     * No error
     */
    IH264_SUCCESS = 0,
    /**
     * Start error code for decoder
     */
    IH264_DEC_ERROR_START = 0x100,

    /**
     * Start error code for encoder
     */
    IH264_ENC_ERROR_START = 0x200,
    /**
     * Generic failure
     */
    IH264_FAIL = 0x7FFFFFFF
}IH264_ERROR_T;

#endif /* _IH264_ERROR_H_ */
diff --git a/common/ih264_ihadamard_scaling.c b/common/ih264_ihadamard_scaling.c
new file mode 100755
index 0000000..e4729c8
--- /dev/null
+++ b/common/ih264_ihadamard_scaling.c
@@ -0,0 +1,216 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_ihadamard_scaling.c + * + * @brief + * Contains definition of functions for h264 inverse hadamard 4x4 transform and scaling + * + * @author + * Mohit + * + * @par List of Functions: + * - ih264_ihadamard_scaling_4x4() + * + * @remarks + * + ******************************************************************************* + */ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_defs.h" +#include "ih264_trans_macros.h" +#include "ih264_macros.h" +#include "ih264_trans_data.h" +#include "ih264_size_defs.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" + +/* + ******************************************************************************** + * + * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients + * of a 16x16 intra prediction macroblock, and then performs scaling. + * prediction buffer + * + * @par Description: + * The DC coefficients pass through a 2-stage inverse hadamard transform. + * This inverse transformed content is scaled to based on Qp value. 
+ * + * @param[in] pi2_src + * input 4x4 block of DC coefficients + * + * @param[out] pi2_out + * output 4x4 block + * + * @param[in] pu2_iscal_mat + * pointer to scaling list + * + * @param[in] pu2_weigh_mat + * pointer to weight matrix + * + * @param[in] u4_qp_div_6 + * Floor (qp/6) + * + * @param[in] pi4_tmp + * temporary buffer of size 1*16 + * + * @returns none + * + * @remarks none + * + ******************************************************************************* + */ +void ih264_ihadamard_scaling_4x4(WORD16* pi2_src, + WORD16* pi2_out, + const UWORD16 *pu2_iscal_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 u4_qp_div_6, + WORD32* pi4_tmp) +{ + WORD32 i; + WORD32 x0, x1, x2, x3, x4, x5, x6, x7; + WORD16* pi2_src_ptr, *pi2_out_ptr; + WORD32* pi4_tmp_ptr; + WORD32 rnd_fact = (u4_qp_div_6 < 6) ? (1 << (5 - u4_qp_div_6)) : 0; + pi4_tmp_ptr = pi4_tmp; + pi2_src_ptr = pi2_src; + pi2_out_ptr = pi2_out; + // Horizontal transform + for(i = 0; i < SUB_BLK_WIDTH_4x4; i++) + { + x4 = pi2_src_ptr[0]; + x5 = pi2_src_ptr[1]; + x6 = pi2_src_ptr[2]; + x7 = pi2_src_ptr[3]; + + x0 = x4 + x7; + x1 = x5 + x6; + x2 = x5 - x6; + x3 = x4 - x7; + + pi4_tmp_ptr[0] = x0 + x1; + pi4_tmp_ptr[1] = x2 + x3; + pi4_tmp_ptr[2] = x0 - x1; + pi4_tmp_ptr[3] = x3 - x2; + + pi4_tmp_ptr += SUB_BLK_WIDTH_4x4; + pi2_src_ptr += SUB_BLK_WIDTH_4x4; + } + pi4_tmp_ptr = pi4_tmp; + // Vertical Transform + for(i = 0; i < SUB_BLK_WIDTH_4x4; i++) + { + x4 = pi4_tmp_ptr[0]; + x5 = pi4_tmp_ptr[4]; + x6 = pi4_tmp_ptr[8]; + x7 = pi4_tmp_ptr[12]; + + x0 = x4 + x7; + x1 = x5 + x6; + x2 = x5 - x6; + x3 = x4 - x7; + + pi4_tmp_ptr[0] = x0 + x1; + pi4_tmp_ptr[4] = x2 + x3; + pi4_tmp_ptr[8] = x0 - x1; + pi4_tmp_ptr[12] = x3 - x2; + + pi4_tmp_ptr++; + } + pi4_tmp_ptr = pi4_tmp; + //Scaling + for(i = 0; i < (SUB_BLK_WIDTH_4x4 * SUB_BLK_WIDTH_4x4); i++) + { + INV_QUANT(pi4_tmp_ptr[i], pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, + rnd_fact, 6); + pi2_out_ptr[i] = pi4_tmp_ptr[i]; + } +} + +void 
ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src, + WORD16* pi2_out, + const UWORD16 *pu2_iscal_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 u4_qp_div_6, + WORD32* pi4_tmp) +{ + WORD32 i4_x0,i4_x1,i4_x2,i4_x3,i4_x4,i4_x5,i4_x6,i4_x7; + WORD32 i4_y0,i4_y1,i4_y2,i4_y3,i4_y4,i4_y5,i4_y6,i4_y7; + + UNUSED(pi4_tmp); + + i4_x4 = pi2_src[0]; + i4_x5 = pi2_src[1]; + i4_x6 = pi2_src[2]; + i4_x7 = pi2_src[3]; + + i4_x0 = i4_x4 + i4_x5; + i4_x1 = i4_x4 - i4_x5; + i4_x2 = i4_x6 + i4_x7; + i4_x3 = i4_x6 - i4_x7; + + i4_x4 = i4_x0+i4_x2; + i4_x5 = i4_x1+i4_x3; + i4_x6 = i4_x0-i4_x2; + i4_x7 = i4_x1-i4_x3; + + INV_QUANT(i4_x4,pu2_iscal_mat[0],pu2_weigh_mat[0],u4_qp_div_6,0,5); + INV_QUANT(i4_x5,pu2_iscal_mat[0],pu2_weigh_mat[0],u4_qp_div_6,0,5); + INV_QUANT(i4_x6,pu2_iscal_mat[0],pu2_weigh_mat[0],u4_qp_div_6,0,5); + INV_QUANT(i4_x7,pu2_iscal_mat[0],pu2_weigh_mat[0],u4_qp_div_6,0,5); + + pi2_out[0] = i4_x4; + pi2_out[1] = i4_x5; + pi2_out[2] = i4_x6; + pi2_out[3] = i4_x7; + + i4_y4 = pi2_src[4]; + i4_y5 = pi2_src[5]; + i4_y6 = pi2_src[6]; + i4_y7 = pi2_src[7]; + + i4_y0 = i4_y4 + i4_y5; + i4_y1 = i4_y4 - i4_y5; + i4_y2 = i4_y6 + i4_y7; + i4_y3 = i4_y6 - i4_y7; + + i4_y4 = i4_y0+i4_y2; + i4_y5 = i4_y1+i4_y3; + i4_y6 = i4_y0-i4_y2; + i4_y7 = i4_y1-i4_y3; + + INV_QUANT(i4_y4,pu2_iscal_mat[0],pu2_weigh_mat[0],u4_qp_div_6,0,5); + INV_QUANT(i4_y5,pu2_iscal_mat[0],pu2_weigh_mat[0],u4_qp_div_6,0,5); + INV_QUANT(i4_y6,pu2_iscal_mat[0],pu2_weigh_mat[0],u4_qp_div_6,0,5); + INV_QUANT(i4_y7,pu2_iscal_mat[0],pu2_weigh_mat[0],u4_qp_div_6,0,5); + + pi2_out[4] = i4_y4; + pi2_out[5] = i4_y5; + pi2_out[6] = i4_y6; + pi2_out[7] = i4_y7; +} diff --git a/common/ih264_inter_pred_filters.c b/common/ih264_inter_pred_filters.c new file mode 100755 index 0000000..7d1e407 --- /dev/null +++ b/common/ih264_inter_pred_filters.c @@ -0,0 +1,1042 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the 
Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_inter_pred_filters.c + * + * @brief + * Contains function definitions for inter prediction interpolation filters + * + * @author + * Ittiam + * + * @par List of Functions: + * - ih264_inter_pred_luma_copy + * - ih264_interleave_copy + * - ih264_inter_pred_luma_horz + * - ih264_inter_pred_luma_vert + * - ih264_inter_pred_luma_horz_hpel_vert_hpel + * - ih264_inter_pred_luma_horz_qpel + * - ih264_inter_pred_luma_vert_qpel + * - ih264_inter_pred_luma_horz_qpel_vert_qpel + * - ih264_inter_pred_luma_horz_hpel_vert_qpel + * - ih264_inter_pred_luma_horz_qpel_vert_hpel + * - ih264_inter_pred_luma_bilinear + * - ih264_inter_pred_chroma + * + * @remarks + * None + * + ******************************************************************************* + */ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include "ih264_inter_pred_filters.h" + + +/*****************************************************************************/ +/* 
Constant Data variables */ +/*****************************************************************************/ + +/* coefficients for 6 tap filtering*/ +const WORD32 ih264_g_six_tap[3] ={1,-5,20}; + + +/*****************************************************************************/ +/* Function definitions . */ +/*****************************************************************************/ +/** + ******************************************************************************* + * + * @brief + * Interprediction luma function for copy + * + * @par Description: + * Copies the array of width 'wd' and height 'ht' from the location pointed + * by 'src' to the location pointed by 'dst' + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * + * @param[in] ht + * integer height of the array + * + * @param[in] wd + * integer width of the array + * + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ + +void ih264_inter_pred_luma_copy(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ht, + WORD32 wd, + UWORD8* pu1_tmp, + WORD32 dydx) +{ + WORD32 row, col; + UNUSED(pu1_tmp); + UNUSED(dydx); + for(row = 0; row < ht; row++) + { + for(col = 0; col < wd; col++) + { + pu1_dst[col] = pu1_src[col]; + } + + pu1_src += src_strd; + pu1_dst += dst_strd; + } +} + +/** + ******************************************************************************* + * + * @brief + * Fucntion for copying to an interleaved destination + * + * @par Description: + * Copies the array of width 'wd' and height 'ht' from the location pointed + * by 'src' to the location pointed by 'dst' + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * 
@param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ht + * integer height of the array + * + * @param[in] wd + * integer width of the array + * + * @returns + * + * @remarks + * The alternate elements of src will be copied to alternate locations in dsr + * Other locations are not touched + * + ******************************************************************************* + */ +void ih264_interleave_copy(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ht, + WORD32 wd) +{ + WORD32 row, col; + wd *= 2; + + for(row = 0; row < ht; row++) + { + for(col = 0; col < wd; col+=2) + { + pu1_dst[col] = pu1_src[col]; + } + + pu1_src += src_strd; + pu1_dst += dst_strd; + } +} + +/** + ******************************************************************************* + * + * @brief + * Interprediction luma filter for horizontal input + * + * @par Description: + * Applies a 6 tap horizontal filter .The output is clipped to 8 bits + * sec 8.4.2.2.1 titled "Luma sample interpolation process" + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ht + * integer height of the array + * + * @param[in] wd + * integer width of the array + * + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ +void ih264_inter_pred_luma_horz(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ht, + WORD32 wd, + UWORD8* pu1_tmp, + WORD32 dydx) +{ + WORD32 row, col; + WORD16 i2_tmp; + UNUSED(pu1_tmp); + UNUSED(dydx); + + for(row = 0; row < ht; row++) + { + for(col = 0; col < wd; col++) + { + i2_tmp = 0;/*ih264_g_six_tap[] is the array containing the filter coeffs*/ + i2_tmp = ih264_g_six_tap[0] * + 
(pu1_src[col - 2] + pu1_src[col + 3]) + + ih264_g_six_tap[1] * + (pu1_src[col - 1] + pu1_src[col + 2]) + + ih264_g_six_tap[2] * + (pu1_src[col] + pu1_src[col + 1]); + i2_tmp = (i2_tmp + 16) >> 5; + pu1_dst[col] = CLIP_U8(i2_tmp); + } + + pu1_src += src_strd; + pu1_dst += dst_strd; + } + +} + +/** + ******************************************************************************* + * + * @brief + * Interprediction luma filter for vertical input + * + * @par Description: + * Applies a 6 tap vertical filter.The output is clipped to 8 bits + * sec 8.4.2.2.1 titled "Luma sample interpolation process" + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ht + * integer height of the array + * + * @param[in] wd + * integer width of the array + * + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ +void ih264_inter_pred_luma_vert(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ht, + WORD32 wd, + UWORD8* pu1_tmp, + WORD32 dydx) +{ + WORD32 row, col; + WORD16 i2_tmp; + UNUSED(pu1_tmp); + UNUSED(dydx); + + for(row = 0; row < ht; row++) + { + for(col = 0; col < wd; col++) + { + i2_tmp = 0; /*ih264_g_six_tap[] is the array containing the filter coeffs*/ + i2_tmp = ih264_g_six_tap[0] * + (pu1_src[col - 2 * src_strd] + pu1_src[col + 3 * src_strd]) + + ih264_g_six_tap[1] * + (pu1_src[col - 1 * src_strd] + pu1_src[col + 2 * src_strd]) + + ih264_g_six_tap[2] * + (pu1_src[col] + pu1_src[col + 1 * src_strd]); + i2_tmp = (i2_tmp + 16) >> 5; + pu1_dst[col] = CLIP_U8(i2_tmp); + } + pu1_src += src_strd; + pu1_dst += dst_strd; + } +} + +/*! 
+ ************************************************************************** + * \if Function name : ih264_inter_pred_luma_horz_hpel_vert_hpel \endif + * + * \brief + * This function implements a two stage cascaded six tap filter. It + * applies the six tap filter in the horizontal direction on the + * predictor values, followed by applying the same filter in the + * vertical direction on the output of the first stage. The six tap + * filtering operation is described in sec 8.4.2.2.1 titled "Luma sample + * interpolation process" + * + * \param pu1_src: Pointer to the buffer containing the predictor values. + * pu1_src could point to the frame buffer or the predictor buffer. + * \param pu1_dst: Pointer to the destination buffer where the output of + * the six tap filter is stored. + * \param ht: Height of the rectangular pixel grid to be interpolated + * \param wd: Width of the rectangular pixel grid to be interpolated + * \param src_strd: Width of the buffer pointed to by pu1_src. + * \param dst_strd: Width of the destination buffer + * \param pu1_tmp: temporary buffer. + * \param dydx: x and y reference offset for qpel calculations: UNUSED in this function. + * + * \return + * None. + * + * \note + * This function takes the 8 bit predictor values, applies the six tap + * filter in the horizontal direction and outputs the result clipped to + * 8 bit precision. The input is stored in the buffer pointed to by + * pu1_src while the output is stored in the buffer pointed by pu1_dst. + * Both pu1_src and pu1_dst could point to the same buffer i.e. the + * six tap filter could be done in place. 
/*!
 **************************************************************************
 * \if Function name : ih264_inter_pred_luma_horz_hpel_vert_hpel \endif
 *
 * \brief
 *   Two-stage cascaded 6-tap filter producing the (1/2, 1/2) luma sample:
 *   a vertical 6-tap pass over the source into a 16-bit intermediate
 *   buffer, then a horizontal 6-tap pass over that buffer with a single
 *   combined rounding shift (sec 8.4.2.2.1 "Luma sample interpolation
 *   process").
 *
 * \param pu1_src: Pointer to the buffer containing the predictor values.
 * \param pu1_dst: Pointer to the destination buffer.
 * \param src_strd: Stride of pu1_src.
 * \param dst_strd: Stride of pu1_dst.
 * \param ht: Height of the rectangular pixel grid to be interpolated.
 * \param wd: Width of the rectangular pixel grid to be interpolated.
 * \param pu1_tmp: temporary buffer; must hold at least ht * (wd + 5)
 *                 WORD16 entries (5 extra columns per row for the
 *                 horizontal filter footprint).
 * \param dydx: UNUSED in this function.
 *
 * \return
 *    None.
 **************************************************************************
 */
void ih264_inter_pred_luma_horz_hpel_vert_hpel(UWORD8 *pu1_src,
                                               UWORD8 *pu1_dst,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD32 ht,
                                               WORD32 wd,
                                               UWORD8* pu1_tmp,
                                               WORD32 dydx)
{
    WORD32 row, col;
    WORD32 tmp;
    WORD16* pi2_pred1_temp;
    WORD16* pi2_pred1;
    UNUSED(dydx);
    pi2_pred1_temp = (WORD16*)pu1_tmp;
    /* advance past the 2-column left margin so that negative col indices
       (-2, -1) in the first pass stay inside the buffer */
    pi2_pred1_temp += 2;
    pi2_pred1 = pi2_pred1_temp;
    /* Stage 1: vertical 6-tap filter over wd + 5 columns per row; results
       are kept at full precision (no rounding/shift yet) */
    for(row = 0; row < ht; row++)
    {
        for(col = -2; col < wd + 3; col++)
        {
            tmp = 0; /* ih264_g_six_tap[] holds the filter taps 1, -5, 20 */
            tmp = ih264_g_six_tap[0] *
                            (pu1_src[col - 2 * src_strd] + pu1_src[col + 3 * src_strd]) +
                            ih264_g_six_tap[1] *
                            (pu1_src[col - 1 * src_strd] + pu1_src[col + 2 * src_strd]) +
                            ih264_g_six_tap[2] *
                            (pu1_src[col] + pu1_src[col + 1 * src_strd]);
            pi2_pred1_temp[col] = tmp;
        }
        pu1_src += src_strd;
        /* intermediate buffer rows are wd + 5 entries wide */
        pi2_pred1_temp = pi2_pred1_temp + wd + 5;
    }

    /* Stage 2: horizontal 6-tap filter on the 16-bit intermediates; the
       combined rounding of both stages is (+512) >> 10 */
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            tmp = 0; /* ih264_g_six_tap[] holds the filter taps 1, -5, 20 */
            tmp = ih264_g_six_tap[0] *
                            (pi2_pred1[col - 2] + pi2_pred1[col + 3]) +
                            ih264_g_six_tap[1] *
                            (pi2_pred1[col - 1] + pi2_pred1[col + 2]) +
                            ih264_g_six_tap[2] * (pi2_pred1[col] + pi2_pred1[col + 1]);
            tmp = (tmp + 512) >> 10;
            pu1_dst[col] = CLIP_U8(tmp);
        }
        pi2_pred1 += (wd + 5);
        pu1_dst += dst_strd;
    }
}
+ * \param pu1_dst: Pointer to the destination buffer where the output of + * the six tap filter is stored. + * \param ht: Height of the rectangular pixel grid to be interpolated + * \param wd: Width of the rectangular pixel grid to be interpolated + * \param src_strd: Width of the buffer pointed to by pu1_src. + * \param dst_strd: Width of the destination buffer + * \param pu1_tmp: temporary buffer: UNUSED in this function + * \param dydx: x and y reference offset for qpel calculations. + * + * \return + * None. + * + * \note + * This function takes the 8 bit predictor values, applies the six tap + * filter in the horizontal direction and outputs the result clipped to + * 8 bit precision. The input is stored in the buffer pointed to by + * pu1_src while the output is stored in the buffer pointed by pu1_dst. + * Both pu1_src and pu1_dst could point to the same buffer i.e. the + * six tap filter could be done in place. + * + ************************************************************************** + */ +void ih264_inter_pred_luma_horz_qpel(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ht, + WORD32 wd, + UWORD8* pu1_tmp, + WORD32 dydx) +{ + WORD32 row, col; + UWORD8 *pu1_pred1; + WORD32 x_offset = dydx & 0x3; + UNUSED(pu1_tmp); + pu1_pred1 = pu1_src + (x_offset >> 1); + + for(row = 0; row < ht; row++) + { + for(col = 0; col < wd; col++, pu1_src++, pu1_dst++) + { + WORD16 i2_temp; + /* The logic below implements the following equation + i2_temp = puc_pred[-2] - 5 * (puc_pred[-1] + puc_pred[2]) + + 20 * (puc_pred[0] + puc_pred[1]) + puc_pred[3]; */ + i2_temp = pu1_src[-2] + pu1_src[3] + - (pu1_src[-1] + pu1_src[2]) + + ((pu1_src[0] + pu1_src[1] - pu1_src[-1] - pu1_src[2]) << 2) + + ((pu1_src[0] + pu1_src[1]) << 4); + i2_temp = (i2_temp + 16) >> 5; + i2_temp = CLIP_U8(i2_temp); + *pu1_dst = (i2_temp + *pu1_pred1 + 1) >> 1; + + pu1_pred1++; + } + pu1_dst += dst_strd - wd; + pu1_src += src_strd - wd; + pu1_pred1 += src_strd - wd; + } 
+} + +/*! + ************************************************************************** + * \if Function name : ih264_inter_pred_luma_vert_qpel \endif + * + * \brief + * This routine applies the six tap filter to the predictors in the + * vertical direction and interpolates them to obtain pixels at quarter vertical + * positions (0, 1/4) and (0, 3/4). The six tap filtering operation is + * described in sec 8.4.2.2.1 titled "Luma sample interpolation process" + * + * \param pu1_src: Pointer to the buffer containing the predictor values. + * pu1_src could point to the frame buffer or the predictor buffer. + * \param pu1_dst: Pointer to the destination buffer where the output of + * the six tap filter is stored. + * \param ht: Height of the rectangular pixel grid to be interpolated + * \param wd: Width of the rectangular pixel grid to be interpolated + * \param src_strd: Width of the buffer pointed to by puc_pred. + * \param dst_strd: Width of the destination buffer + * \param pu1_tmp: temporary buffer: UNUSED in this function + * \param dydx: x and y reference offset for qpel calculations. + * + * \return + * void + * + * \note + * This function takes the 8 bit predictor values, applies the six tap + * filter in the vertical direction and outputs the result clipped to + * 8 bit precision. The input is stored in the buffer pointed to by + * puc_pred while the output is stored in the buffer pointed by puc_dest. + * Both puc_pred and puc_dest could point to the same buffer i.e. the + * six tap filter could be done in place. + * + * \para <title> + * <paragraph> + * ... 
+ ************************************************************************** + */ +void ih264_inter_pred_luma_vert_qpel(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ht, + WORD32 wd, + UWORD8* pu1_tmp, + WORD32 dydx) +{ + WORD32 row, col; + WORD32 y_offset = dydx >> 2; + WORD32 off1, off2, off3; + UWORD8 *pu1_pred1; + UNUSED(pu1_tmp); + y_offset = y_offset & 0x3; + + off1 = src_strd; + off2 = src_strd << 1; + off3 = off1 + off2; + + pu1_pred1 = pu1_src + (y_offset >> 1) * src_strd; + + for(row = 0; row < ht; row++) + { + for(col = 0; col < wd; col++, pu1_dst++, pu1_src++, pu1_pred1++) + { + WORD16 i2_temp; + /* The logic below implements the following equation + i16_temp = puc_pred[-2*src_strd] + puc_pred[3*src_strd] - + 5 * (puc_pred[-1*src_strd] + puc_pred[2*src_strd]) + + 20 * (puc_pred[0] + puc_pred[src_strd]); */ + i2_temp = pu1_src[-off2] + pu1_src[off3] + - (pu1_src[-off1] + pu1_src[off2]) + + ((pu1_src[0] + pu1_src[off1] - pu1_src[-off1] - pu1_src[off2]) << 2) + + ((pu1_src[0] + pu1_src[off1]) << 4); + i2_temp = (i2_temp + 16) >> 5; + i2_temp = CLIP_U8(i2_temp); + + *pu1_dst = (i2_temp + *pu1_pred1 + 1) >> 1; + } + pu1_src += src_strd - wd; + pu1_pred1 += src_strd - wd; + pu1_dst += dst_strd - wd; + } +} + +/*! + ************************************************************************** + * \if Function name : ih264_inter_pred_luma_horz_qpel_vert_qpel \endif + * + * \brief + * This routine applies the six tap filter to the predictors in the + * vertical and horizontal direction and averages them to get pixels at locations + * (1/4,1/4), (1/4, 3/4), (3/4, 1/4) & (3/4, 3/4). The six tap filtering operation + * is described in sec 8.4.2.2.1 titled "Luma sample interpolation process" + * + * \param pu1_src: Pointer to the buffer containing the predictor values. + * pu1_src could point to the frame buffer or the predictor buffer. 
/*!
 **************************************************************************
 * \if Function name : ih264_inter_pred_luma_horz_qpel_vert_qpel \endif
 *
 * \brief
 *   Produces luma samples at the diagonal quarter-pel positions
 *   (1/4,1/4), (1/4,3/4), (3/4,1/4) and (3/4,3/4): the vertical and the
 *   horizontal half-pel values are each computed with the 6-tap filter on
 *   the appropriately offset source column/row and then averaged
 *   (sec 8.4.2.2.1 "Luma sample interpolation process").
 *
 * \param pu1_src: Pointer to the buffer containing the predictor values.
 * \param pu1_dst: Pointer to the destination buffer.
 * \param src_strd: Stride of pu1_src.
 * \param dst_strd: Stride of pu1_dst.
 * \param ht: Height of the rectangular pixel grid to be interpolated.
 * \param wd: Width of the rectangular pixel grid to be interpolated.
 * \param pu1_tmp: UNUSED in this function.
 * \param dydx: x and y reference offset for the qpel phase selection.
 *
 * \return
 *    void
 **************************************************************************
 */
void ih264_inter_pred_luma_horz_qpel_vert_qpel(UWORD8 *pu1_src,
                                               UWORD8 *pu1_dst,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD32 ht,
                                               WORD32 wd,
                                               UWORD8* pu1_tmp,
                                               WORD32 dydx)
{
    WORD32 row, col;
    /* quarter-pel phases: x in the low 2 bits, y in the next 2 bits */
    WORD32 x_offset = dydx & 0x3;
    WORD32 y_offset = dydx >> 2;

    WORD32 off1, off2, off3;
    UWORD8* pu1_pred_vert, *pu1_pred_horz;
    UNUSED(pu1_tmp);
    y_offset = y_offset & 0x3;

    /* row offsets for the vertical filter taps */
    off1 = src_strd;
    off2 = src_strd << 1;
    off3 = off1 + off2;

    /* horizontal filtering is done on the nearer half-pel row,
       vertical filtering on the nearer half-pel column */
    pu1_pred_horz = pu1_src + (y_offset >> 1) * src_strd;
    pu1_pred_vert = pu1_src + (x_offset >> 1);

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd;
                        col++, pu1_dst++, pu1_pred_vert++, pu1_pred_horz++)
        {
            WORD16 i2_temp_vert, i2_temp_horz;
            /* vertical 6-tap: a + f - 5*(b + e) + 20*(c + d), built from
               shifts: -(b+e) + 4*(c+d-b-e) + 16*(c+d) == -5*(b+e) + 20*(c+d) */
            i2_temp_vert = pu1_pred_vert[-off2] + pu1_pred_vert[off3]
                           - (pu1_pred_vert[-off1] + pu1_pred_vert[off2])
                           + ((pu1_pred_vert[0] + pu1_pred_vert[off1]
                                           - pu1_pred_vert[-off1]
                                           - pu1_pred_vert[off2]) << 2)
                           + ((pu1_pred_vert[0] + pu1_pred_vert[off1]) << 4);
            i2_temp_vert = (i2_temp_vert + 16) >> 5;
            i2_temp_vert = CLIP_U8(i2_temp_vert);

            /* horizontal 6-tap, same shift decomposition */
            i2_temp_horz = pu1_pred_horz[-2] + pu1_pred_horz[3]
                           - (pu1_pred_horz[-1] + pu1_pred_horz[2])
                           + ((pu1_pred_horz[0] + pu1_pred_horz[1]
                                           - pu1_pred_horz[-1]
                                           - pu1_pred_horz[2]) << 2)
                           + ((pu1_pred_horz[0] + pu1_pred_horz[1]) << 4);
            i2_temp_horz = (i2_temp_horz + 16) >> 5;
            i2_temp_horz = CLIP_U8(i2_temp_horz);

            /* diagonal quarter-pel = rounded average of the two half-pels */
            *pu1_dst = (i2_temp_vert + i2_temp_horz + 1) >> 1;
        }
        pu1_pred_vert += (src_strd - wd);
        pu1_pred_horz += (src_strd - wd);
        pu1_dst += (dst_strd - wd);
    }
}
/*!
 **************************************************************************
 * \if Function name : ih264_inter_pred_luma_horz_qpel_vert_hpel \endif
 *
 * \brief
 *   Produces luma samples at (1/4, 1/2) and (3/4, 1/2): the (1/2, 1/2)
 *   sample is computed with the cascaded vertical-then-horizontal 6-tap
 *   filter, and the (0, 1/2) / (1, 1/2) half-pel column (still held at
 *   intermediate precision in pu1_tmp) is rescaled and averaged with it
 *   (sec 8.4.2.2.1 "Luma sample interpolation process").
 *
 * \param pu1_src: Pointer to the buffer containing the predictor values.
 * \param pu1_dst: Pointer to the destination buffer.
 * \param src_strd: Stride of pu1_src.
 * \param dst_strd: Stride of pu1_dst.
 * \param ht: Height of the rectangular pixel grid to be interpolated.
 * \param wd: Width of the rectangular pixel grid to be interpolated.
 * \param pu1_tmp: buffer for the stage-1 vertical filter output; must hold
 *                 at least ht * (wd + 5) WORD16 entries. NOTE: its contents
 *                 are modified in place by the final averaging pass.
 * \param dydx: x and y reference offset; only (dydx & 3) is used here.
 *
 * \return
 *    void
 **************************************************************************
 */
void ih264_inter_pred_luma_horz_qpel_vert_hpel(UWORD8 *pu1_src,
                                               UWORD8 *pu1_dst,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD32 ht,
                                               WORD32 wd,
                                               UWORD8* pu1_tmp,
                                               WORD32 dydx)
{
    WORD32 row, col;
    WORD32 tmp;
    WORD16* pi2_pred1_temp, *pi2_pred1;
    UWORD8* pu1_dst_tmp;
    WORD32 x_offset = dydx & 0x3;
    WORD16 i2_macro;

    pi2_pred1_temp = (WORD16*)pu1_tmp;
    /* skip the 2-column left margin so col = -2 .. -1 stay in the buffer */
    pi2_pred1_temp += 2;
    pi2_pred1 = pi2_pred1_temp;
    pu1_dst_tmp = pu1_dst;

    /* Stage 1: vertical 6-tap filter over wd + 5 columns per row, stored
       at full precision in the intermediate buffer */
    for(row = 0; row < ht; row++)
    {
        for(col = -2; col < wd + 3; col++)
        {
            tmp = 0; /* ih264_g_six_tap[] holds the filter taps 1, -5, 20 */
            tmp = ih264_g_six_tap[0] *
                            (pu1_src[col - 2 * src_strd] + pu1_src[col + 3 * src_strd]) +
                            ih264_g_six_tap[1] *
                            (pu1_src[col - 1 * src_strd] + pu1_src[col + 2 * src_strd]) +
                            ih264_g_six_tap[2] *
                            (pu1_src[col] + pu1_src[col + 1 * src_strd]);
            pi2_pred1_temp[col] = tmp;
        }

        pu1_src += src_strd;
        pi2_pred1_temp = pi2_pred1_temp + wd + 5;
    }

    /* Stage 2: horizontal 6-tap filter on the intermediates gives the
       (1/2, 1/2) sample; combined rounding of both stages is (+512) >> 10 */
    pi2_pred1_temp = pi2_pred1;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            tmp = 0; /* ih264_g_six_tap[] holds the filter taps 1, -5, 20 */
            tmp = ih264_g_six_tap[0] *
                            (pi2_pred1[col - 2] + pi2_pred1[col + 3]) +
                            ih264_g_six_tap[1] *
                            (pi2_pred1[col - 1] + pi2_pred1[col + 2]) +
                            ih264_g_six_tap[2] *
                            (pi2_pred1[col] + pi2_pred1[col + 1]);
            tmp = (tmp + 512) >> 10;
            pu1_dst[col] = CLIP_U8(tmp);
        }
        pi2_pred1 += (wd + 5);
        pu1_dst += dst_strd;
    }

    /* Stage 3: rescale the vertical half-pel intermediates (offset by
       x_offset >> 1 to pick the nearer column) and average with the
       (1/2, 1/2) samples already written to pu1_dst */
    pu1_dst = pu1_dst_tmp;
    pi2_pred1_temp += (x_offset >> 1);
    for(row = ht; row != 0; row--)
    {
        for(col = wd; col != 0; col--, pu1_dst++, pi2_pred1_temp++)
        {
            UWORD8 uc_temp;
            /* bring the stage-1 intermediate down to pixel precision
               (modifies the temp buffer in place) */
            *pi2_pred1_temp = (*pi2_pred1_temp + 16) >> 5;
            i2_macro = (*pi2_pred1_temp);
            uc_temp = CLIP_U8(i2_macro);
            *pu1_dst = (*pu1_dst + uc_temp + 1) >> 1;
        }
        /* step over the 5 margin columns of the intermediate row */
        pi2_pred1_temp += 5;
        pu1_dst += dst_strd - wd;
    }
}
/*!
 **************************************************************************
 * \if Function name : ih264_inter_pred_luma_horz_hpel_vert_qpel \endif
 *
 * \brief
 *   Produces luma samples at (1/2, 1/4) and (1/2, 3/4): the (1/2, 1/2)
 *   sample is computed with the cascaded horizontal-then-vertical 6-tap
 *   filter, and the (1/2, 0) / (1/2, 1) half-pel row (still held at
 *   intermediate precision in pu1_tmp) is rescaled and averaged with it
 *   (sec 8.4.2.2.1 "Luma sample interpolation process").
 *
 * \param pu1_src: Pointer to the buffer containing the predictor values.
 * \param pu1_dst: Pointer to the destination buffer.
 * \param src_strd: Stride of pu1_src.
 * \param dst_strd: Stride of pu1_dst.
 * \param ht: Height of the rectangular pixel grid to be interpolated.
 * \param wd: Width of the rectangular pixel grid to be interpolated.
 * \param pu1_tmp: buffer for the stage-1 horizontal filter output; must
 *                 hold at least (ht + 5) * wd WORD16 entries. NOTE: its
 *                 contents are modified in place by the final pass.
 * \param dydx: x and y reference offset; only (dydx >> 2) & 3 is used here.
 *
 * \return
 *    void
 **************************************************************************
 */
void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src,
                                               UWORD8 *pu1_dst,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD32 ht,
                                               WORD32 wd,
                                               UWORD8* pu1_tmp,
                                               WORD32 dydx)
{

    WORD32 row, col;
    WORD32 tmp;
    WORD32 y_offset = dydx >> 2;
    WORD16* pi2_pred1_temp, *pi2_pred1;
    UWORD8* pu1_dst_tmp;
    WORD16 i2_macro;

    y_offset = y_offset & 0x3;

    pi2_pred1_temp = (WORD16*)pu1_tmp;
    /* skip the 2-row top margin so rows -2 .. -1 stay in the buffer */
    pi2_pred1_temp += 2 * wd;
    pi2_pred1 = pi2_pred1_temp;
    pu1_dst_tmp = pu1_dst;
    pu1_src -= 2 * src_strd;
    /* Stage 1: horizontal 6-tap filter over ht + 5 rows, stored at full
       precision in the intermediate buffer (row stride = wd) */
    for(row = -2; row < ht + 3; row++)
    {
        for(col = 0; col < wd; col++)
        {
            tmp = 0; /* ih264_g_six_tap[] holds the filter taps 1, -5, 20 */
            tmp = ih264_g_six_tap[0] * (pu1_src[col - 2] + pu1_src[col + 3])
                            + ih264_g_six_tap[1] * (pu1_src[col - 1] + pu1_src[col + 2])
                            + ih264_g_six_tap[2] * (pu1_src[col] + pu1_src[col + 1]);
            pi2_pred1_temp[col - 2 * wd] = tmp;
        }

        pu1_src += src_strd;
        pi2_pred1_temp += wd;
    }
    /* Stage 2: vertical 6-tap filter on the intermediates gives the
       (1/2, 1/2) sample; combined rounding of both stages is (+512) >> 10 */
    pi2_pred1_temp = pi2_pred1;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            tmp = 0; /* ih264_g_six_tap[] holds the filter taps 1, -5, 20 */
            tmp = ih264_g_six_tap[0] * (pi2_pred1[col - 2 * wd] + pi2_pred1[col + 3 * wd])
                            + ih264_g_six_tap[1] * (pi2_pred1[col - 1 * wd] + pi2_pred1[col + 2 * wd])
                            + ih264_g_six_tap[2] * (pi2_pred1[col] + pi2_pred1[col + 1 * wd]);
            tmp = (tmp + 512) >> 10;
            pu1_dst[col] = CLIP_U8(tmp);
        }
        pi2_pred1 += wd;
        pu1_dst += dst_strd;
    }
    /* Stage 3: rescale the horizontal half-pel intermediates (offset by
       (y_offset >> 1) rows to pick the nearer row) and average with the
       (1/2, 1/2) samples already written to pu1_dst */
    pu1_dst = pu1_dst_tmp;
    pi2_pred1_temp += (y_offset >> 1) * wd;
    for(row = ht; row != 0; row--)

    {
        for(col = wd; col != 0; col--, pu1_dst++, pi2_pred1_temp++)
        {
            UWORD8 u1_temp;
            /* bring the stage-1 intermediate down to pixel precision
               (modifies the temp buffer in place) */
            *pi2_pred1_temp = (*pi2_pred1_temp + 16) >> 5;
            i2_macro = (*pi2_pred1_temp);
            u1_temp = CLIP_U8(i2_macro);
            *pu1_dst = (*pu1_dst + u1_temp + 1) >> 1;
        }
        pu1_dst += dst_strd - wd;
    }
}
+ * + * @param[in] src_strd1 + * Stride of the first input buffer + * + * @param[in] src_strd2 + * Stride of the second input buffer + * + * @param[in] dst_strd + * integer destination stride of pu1_dst + * + * @param[in] ht + * integer height of the array + * + * @param[in] wd + * integer width of the array + * + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ +void ih264_inter_pred_luma_bilinear(UWORD8 *pu1_src1, + UWORD8 *pu1_src2, + UWORD8 *pu1_dst, + WORD32 src_strd1, + WORD32 src_strd2, + WORD32 dst_strd, + WORD32 ht, + WORD32 wd) +{ + WORD32 row, col; + WORD16 i2_tmp; + + for(row = 0; row < ht; row++) + { + for(col = 0; col < wd; col++) + { + i2_tmp = pu1_src1[col] + pu1_src2[col]; + i2_tmp = (i2_tmp + 1) >> 1; + pu1_dst[col] = CLIP_U8(i2_tmp); + } + pu1_src1 += src_strd1; + pu1_src2 += src_strd2; + pu1_dst += dst_strd; + } + +} + +/** + ******************************************************************************* + * + * @brief + * Interprediction chroma filter + * + * @par Description: + * Applies filtering to chroma samples as mentioned in + * sec 8.4.2.2.2 titled "chroma sample interpolation process" + * + * @param[in] pu1_src + * UWORD8 pointer to the source containing alternate U and V samples + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] u1_dx + * dx value where the sample is to be produced(refer sec 8.4.2.2.2 ) + * + * @param[in] u1_dy + * dy value where the sample is to be produced(refer sec 8.4.2.2.2 ) + * + * @param[in] ht + * integer height of the array + * + * @param[in] wd + * integer width of the array + * + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ +void ih264_inter_pred_chroma(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 
src_strd, + WORD32 dst_strd, + WORD32 dx, + WORD32 dy, + WORD32 ht, + WORD32 wd) +{ + WORD32 row, col; + WORD16 i2_tmp; + + for(row = 0; row < ht; row++) + { + for(col = 0; col < 2 * wd; col++) + { + i2_tmp = 0; /* applies equation (8-266) in section 8.4.2.2.2 */ + i2_tmp = (8 - dx) * (8 - dy) * pu1_src[col] + + (dx) * (8 - dy) * pu1_src[col + 2] + + (8 - dx) * (dy) * (pu1_src + src_strd)[col] + + (dx) * (dy) * (pu1_src + src_strd)[col + 2]; + i2_tmp = (i2_tmp + 32) >> 6; + pu1_dst[col] = CLIP_U8(i2_tmp); + } + pu1_src += src_strd; + pu1_dst += dst_strd; + } +} diff --git a/common/ih264_inter_pred_filters.h b/common/ih264_inter_pred_filters.h new file mode 100755 index 0000000..c439ab8 --- /dev/null +++ b/common/ih264_inter_pred_filters.h @@ -0,0 +1,241 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** + ******************************************************************************* + * @file + * ih264_inter_pred_filters.h + * + * @brief + * Declarations of functions used for inter prediction + * + * @author + * Ittiam + * + * @par List of Functions: + * -ih264_inter_pred_luma_copy + * -ih264_interleave_copy + * -ih264_inter_pred_luma_horz + * -ih264_inter_pred_luma_vert + * -ih264_inter_pred_luma_horz_hpel_vert_hpel + * -ih264_inter_pred_luma_vert_qpel + * -ih264_inter_pred_luma_horz_qpel + * -ih264_inter_pred_luma_horz_qpel_vert_qpel + * -ih264_inter_pred_luma_horz_qpel_vert_hpel + * -ih264_inter_pred_luma_horz_hpel_vert_qpel + * -ih264_inter_pred_luma_bilinear + * -ih264_inter_pred_chroma + * -ih264_inter_pred_luma_copy_a9q + * -ih264_interleave_copy_a9 + * -ih264_inter_pred_luma_horz_a9q + * -ih264_inter_pred_luma_vert_a9q + * -ih264_inter_pred_luma_bilinear_a9q + * -ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q + * -ih264_inter_pred_luma_horz_qpel_a9q + * -ih264_inter_pred_luma_vert_qpel_a9q + * -ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q + * -ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q + * -ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q + * -ih264_inter_pred_chroma_a9q + * -ih264_inter_pred_luma_copy_av8 + * -ih264_interleave_copy_av8 + * -ih264_inter_pred_luma_horz_av8 + * -ih264_inter_pred_luma_vert_av8 + * -ih264_inter_pred_luma_bilinear_av8 + * -ih264_inter_pred_luma_horz_hpel_vert_hpel_av8 + * -ih264_inter_pred_luma_horz_qpel_av8 + * -ih264_inter_pred_luma_vert_qpel_av8 + * -ih264_inter_pred_luma_horz_qpel_vert_qpel_av8 + * -ih264_inter_pred_luma_horz_qpel_vert_hpel_av8 + * -ih264_inter_pred_luma_horz_hpel_vert_qpel_av8 + * -ih264_inter_pred_chroma_av8 + * -ih264_inter_pred_chroma_dx_zero_av8 + * -ih264_inter_pred_chroma_dy_zero_av8 + * -ih264_inter_pred_luma_copy_ssse3 + * -ih264_inter_pred_luma_copy_ssse3 + * -ih264_inter_pred_luma_horz_ssse3 + * -ih264_inter_pred_luma_vert_ssse3 + * -ih264_inter_pred_luma_bilinear_ssse3 
+ * -ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3 + * -ih264_inter_pred_luma_horz_qpel_ssse3 + * -ih264_inter_pred_luma_vert_qpel_ssse3 + * -ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3 + * -ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3 + * -ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3 + * -ih264_inter_pred_chroma_ssse3 + * + * @remarks + * None + * + ******************************************************************************* + */ + +#ifndef _IH264_INTER_PRED_H_ +#define _IH264_INTER_PRED_H_ + +/*****************************************************************************/ +/* Constant Data variables */ +/*****************************************************************************/ + +extern const WORD32 ih264_g_six_tap[3];/* coefficients for 6 tap filtering*/ + +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ + +typedef void ih264_inter_pred_luma_ft(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ht, + WORD32 wd, + UWORD8* pu1_tmp, + WORD32 dydx); + +typedef void ih264_interleave_copy_ft(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ht, + WORD32 wd); + +typedef void ih264_inter_pred_luma_bilinear_ft(UWORD8 *pu1_src1, + UWORD8 *pu1_src2, + UWORD8 *pu1_dst, + WORD32 src_strd1, + WORD32 src_strd2, + WORD32 dst_strd, + WORD32 height, + WORD32 width); + +typedef void ih264_inter_pred_chroma_ft(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 dx, + WORD32 dy, + WORD32 ht, + WORD32 wd); + +/* No NEON Declarations */ + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_copy; + +ih264_interleave_copy_ft ih264_interleave_copy; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_vert; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_hpel_vert_hpel; + 
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_vert_qpel; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_qpel; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_hpel; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_hpel_vert_qpel; + +ih264_inter_pred_luma_bilinear_ft ih264_inter_pred_luma_bilinear; + +ih264_inter_pred_chroma_ft ih264_inter_pred_chroma; + +/* A9 NEON Declarations */ +ih264_inter_pred_luma_ft ih264_inter_pred_luma_copy_a9q; + +ih264_interleave_copy_ft ih264_interleave_copy_a9; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_a9q; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_vert_a9q; + +ih264_inter_pred_luma_bilinear_ft ih264_inter_pred_luma_bilinear_a9q; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_a9q; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_vert_qpel_a9q; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q; + +ih264_inter_pred_chroma_ft ih264_inter_pred_chroma_a9q; + +/* AV8 NEON Declarations */ +ih264_inter_pred_luma_ft ih264_inter_pred_luma_copy_av8; + +ih264_interleave_copy_ft ih264_interleave_copy_av8; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_av8; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_vert_av8; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_hpel_vert_hpel_av8; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_av8; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_vert_qpel_av8; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_qpel_av8; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_hpel_av8; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_hpel_vert_qpel_av8; + 
+ih264_inter_pred_chroma_ft ih264_inter_pred_chroma_av8; + +ih264_inter_pred_chroma_ft ih264_inter_pred_chroma_dx_zero_av8; + +ih264_inter_pred_chroma_ft ih264_inter_pred_chroma_dy_zero_av8; + + +/* SSSE3 Intrinsic Declarations */ +ih264_inter_pred_luma_ft ih264_inter_pred_luma_copy_ssse3; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_ssse3; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_vert_ssse3; + +ih264_inter_pred_luma_bilinear_ft ih264_inter_pred_luma_bilinear_ssse3; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_ssse3; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_vert_qpel_ssse3; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3; + +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3; + +ih264_inter_pred_chroma_ft ih264_inter_pred_chroma_ssse3; + +#endif + +/** Nothing past this point */ diff --git a/common/ih264_intra_pred_filters.h b/common/ih264_intra_pred_filters.h new file mode 100755 index 0000000..caf6b33 --- /dev/null +++ b/common/ih264_intra_pred_filters.h @@ -0,0 +1,331 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_intra_pred_filters.h + * + * @brief + * Declarations of functions used for intra prediction + * + * @author + * Ittiam + * + * @remarks + * None + * + ******************************************************************************* + */ + +#ifndef IH264_INTRA_PRED_FILTERS_H_ + +#define IH264_INTRA_PRED_FILTERS_H_ + +/*****************************************************************************/ +/* Macro Expansion */ +/*****************************************************************************/ + +/*! Filter (1,2,1) i.e (a + 2b + c) / 4 */ +#define FILT121(a,b,c) ((a + (b<<1) + c + 2)>>2) +/*! Filter (1,1) i.e (a + b) / 2 */ +#define FILT11(a,b) ((a + b + 1)>>1) +/*****************************************************************************/ +/* Global Variables */ +/*****************************************************************************/ + +/* Global variables used only in assembly files*/ +extern const WORD8 ih264_gai1_intrapred_luma_plane_coeffs[]; +extern const WORD8 ih264_gai1_intrapred_chroma_plane_coeffs1[]; +extern const WORD8 ih264_gai1_intrapred_chroma_plane_coeffs2[]; +extern const WORD8 ih264_gai1_intrapred_luma_8x8_horz_u[]; + +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ + + +typedef void ih264_intra_pred_ref_filtering_ft(UWORD8 *pu1_left, + UWORD8 *pu1_topleft, + UWORD8 *pu1_top, + UWORD8 *pu1_dst, + WORD32 left_strd, + WORD32 ngbr_avail); + +typedef void ih264_intra_pred_luma_ft(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail); + +/* No Neon Definitions */ + +/* Luma 4x4 Intra 
pred filters */ + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_dc; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_diag_dl; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_diag_dr; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_r; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_d; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_l; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_u; + +/* Luma 8x8 Intra pred filters */ + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_dc; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_diag_dl; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_diag_dr; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_r; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_d; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_l; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_u; + +/* Luma 16x16 Intra pred filters */ + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_vert; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_horz; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_dc; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_plane; + +/* Chroma 8x8 Intra pred filters */ + +typedef ih264_intra_pred_luma_ft ih264_intra_pred_chroma_ft; + +ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_dc; + +ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_horz; + +ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_vert; + +ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_plane; + + +ih264_intra_pred_ref_filtering_ft 
ih264_intra_pred_luma_8x8_mode_ref_filtering; + +/* A9 Definition */ + +/* Luma 4x4 Intra pred filters */ + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_a9q; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_a9q; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_dc_a9q; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_diag_dl_a9q; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_diag_dr_a9q; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_r_a9q; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_d_a9q; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_l_a9q; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_u_a9q; + +/* Luma 8x8 Intra pred filters */ + +ih264_intra_pred_ref_filtering_ft ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_a9q; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_a9q; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_dc_a9q; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_diag_dl_a9q; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_diag_dr_a9q; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_r_a9q; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_d_a9q; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_l_a9q; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_u_a9q; + +/* Luma 16x16 Intra pred filters */ + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_vert_a9q; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_horz_a9q; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_dc_a9q; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_plane_a9q; + +/* Chroma 8x8 Intra pred filters */ + +ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_dc_a9q; + +ih264_intra_pred_chroma_ft 
ih264_intra_pred_chroma_8x8_mode_horz_a9q; + +ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_vert_a9q; + +ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_plane_a9q; + +/* X86 Intrinsic Definitions */ + +/* Luma 4x4 Intra pred filters */ + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_ssse3; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_ssse3; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_dc_ssse3; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_diag_dl_ssse3; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_diag_dr_ssse3; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_r_ssse3; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_d_ssse3; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_l_ssse3; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_u_ssse3; + +/* Luma 8x8 Intra pred filters */ + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_ssse3; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_ssse3; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_dc_ssse3; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_diag_dl_ssse3; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_diag_dr_ssse3; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_r_ssse3; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_d_ssse3; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_l_ssse3; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_u_ssse3; + +/* Luma 16x16 Intra pred filters */ + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_vert_ssse3; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_horz_ssse3; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_dc_ssse3; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_plane_ssse3; + +/* Chroma 8x8 Intra pred filters */ + +ih264_intra_pred_chroma_ft 
ih264_intra_pred_chroma_8x8_mode_dc_ssse3; + +ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_horz_ssse3; + +ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_vert_ssse3; + +ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_plane_ssse3; + +/* AV8 Definition */ + +/* Luma 4x4 Intra pred filters */ +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_av8; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_av8; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_dc_av8; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_diag_dl_av8; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_diag_dr_av8; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_r_av8; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_d_av8; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_l_av8; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_u_av8; + +/* Luma 8x8 Intra pred filters */ + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_av8; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_av8; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_dc_av8; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_diag_dl_av8; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_diag_dr_av8; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_r_av8; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_d_av8; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_l_av8; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_u_av8; + +/* Luma 16x16 Intra pred filters */ + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_vert_av8; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_horz_av8; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_dc_av8; + +ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_plane_av8; + +/* Chroma 8x8 Intra pred filters */ + 
+ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_dc_av8; + +ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_horz_av8; + +ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_vert_av8; + +ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_plane_av8; + +#endif /* IH264_INTRA_PRED_FILTERS_H_ */ diff --git a/common/ih264_iquant_itrans_recon.c b/common/ih264_iquant_itrans_recon.c new file mode 100755 index 0000000..3c14046 --- /dev/null +++ b/common/ih264_iquant_itrans_recon.c @@ -0,0 +1,873 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_iquant_itrans_recon.c + * + * @brief + * Contains definition of functions for h264 inverse quantization inverse transformation and recon + * + * @author + * Ittiam + * + * @par List of Functions: + * - ih264_iquant_itrans_recon_4x4() + * - ih264_iquant_itrans_recon_8x8() + * - ih264_iquant_itrans_recon_4x4_dc() + * - ih264_iquant_itrans_recon_8x8_dc() + * - ih264_iquant_itrans_recon_chroma_4x4() + * -ih264_iquant_itrans_recon_chroma_4x4_dc() + * + * @remarks + * + ******************************************************************************* + */ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_defs.h" +#include "ih264_trans_macros.h" +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include "ih264_trans_data.h" +#include "ih264_size_defs.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" + +/* + ******************************************************************************** + * + * @brief This function reconstructs a 4x4 sub block from quantized resiude and + * prediction buffer + * + * @par Description: + * The quantized residue is first inverse quantized, then inverse transformed. 
+ * This inverse transformed content is added to the prediction buffer to recon- + * struct the end output + * + * @param[in] pi2_src + * quantized 4x4 block + * + * @param[in] pu1_pred + * prediction 4x4 block + * + * @param[out] pu1_out + * reconstructed 4x4 block + * + * @param[in] src_strd + * quantization buffer stride + * + * @param[in] pred_strd, + * Prediction buffer stride + * + * @param[in] out_strd + * recon buffer Stride + * + * @param[in] pu2_scaling_list + * pointer to scaling list + * + * @param[in] pu2_norm_adjust + * pointer to inverse scale matrix + * + * @param[in] u4_qp_div_6 + * Floor (qp/6) + * + * @param[in] pi4_tmp + * temporary buffer of size 1*16 + * + * @returns none + * + * @remarks none + * + ******************************************************************************* + */ +void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src, + UWORD8 *pu1_pred, + UWORD8 *pu1_out, + WORD32 pred_strd, + WORD32 out_strd, + const UWORD16 *pu2_iscal_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 u4_qp_div_6, + WORD16 *pi2_tmp, + WORD32 iq_start_idx, + WORD16 *pi2_dc_ld_addr +) +{ + WORD16 *pi2_src_ptr = pi2_src; + WORD16 *pi2_tmp_ptr = pi2_tmp; + UWORD8 *pu1_pred_ptr = pu1_pred; + UWORD8 *pu1_out_ptr = pu1_out; + WORD16 x0, x1, x2, x3, i; + WORD32 q0, q1, q2, q3; + WORD16 i_macro; + WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 
1 << (3 - u4_qp_div_6) : 0; + + /* inverse quant */ + /*horizontal inverse transform */ + for(i = 0; i < SUB_BLK_WIDTH_4x4; i++) + { + q0 = pi2_src_ptr[0]; + INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, + 4); + if (i==0 && iq_start_idx == 1) + q0 = pi2_dc_ld_addr[0]; // Restoring dc value for intra case + + q2 = pi2_src_ptr[2]; + INV_QUANT(q2, pu2_iscal_mat[2], pu2_weigh_mat[2], u4_qp_div_6, rnd_fact, + 4); + + x0 = q0 + q2; + x1 = q0 - q2; + + q1 = pi2_src_ptr[1]; + INV_QUANT(q1, pu2_iscal_mat[1], pu2_weigh_mat[1], u4_qp_div_6, rnd_fact, + 4); + + q3 = pi2_src_ptr[3]; + INV_QUANT(q3, pu2_iscal_mat[3], pu2_weigh_mat[3], u4_qp_div_6, rnd_fact, + 4); + + x2 = (q1 >> 1) - q3; + x3 = q1 + (q3 >> 1); + + pi2_tmp_ptr[0] = x0 + x3; + pi2_tmp_ptr[1] = x1 + x2; + pi2_tmp_ptr[2] = x1 - x2; + pi2_tmp_ptr[3] = x0 - x3; + + pi2_src_ptr += SUB_BLK_WIDTH_4x4; + pi2_tmp_ptr += SUB_BLK_WIDTH_4x4; + pu2_iscal_mat += SUB_BLK_WIDTH_4x4; + pu2_weigh_mat += SUB_BLK_WIDTH_4x4; + } + + /* vertical inverse transform */ + pi2_tmp_ptr = pi2_tmp; + for(i = 0; i < SUB_BLK_WIDTH_4x4; i++) + { + pu1_pred_ptr = pu1_pred; + pu1_out = pu1_out_ptr; + + x0 = (pi2_tmp_ptr[0] + pi2_tmp_ptr[8]); + x1 = (pi2_tmp_ptr[0] - pi2_tmp_ptr[8]); + x2 = (pi2_tmp_ptr[4] >> 1) - pi2_tmp_ptr[12]; + x3 = pi2_tmp_ptr[4] + (pi2_tmp_ptr[12] >> 1); + + /* inverse prediction */ + i_macro = x0 + x3; + i_macro = ((i_macro + 32) >> 6); + i_macro += *pu1_pred_ptr; + *pu1_out = CLIP_U8(i_macro); + pu1_pred_ptr += pred_strd; + pu1_out += out_strd; + + i_macro = x1 + x2; + i_macro = ((i_macro + 32) >> 6); + i_macro += *pu1_pred_ptr; + *pu1_out = CLIP_U8(i_macro); + pu1_pred_ptr += pred_strd; + pu1_out += out_strd; + + i_macro = x1 - x2; + i_macro = ((i_macro + 32) >> 6); + i_macro += *pu1_pred_ptr; + *pu1_out = CLIP_U8(i_macro); + pu1_pred_ptr += pred_strd; + pu1_out += out_strd; + + i_macro = x0 - x3; + i_macro = ((i_macro + 32) >> 6); + i_macro += *pu1_pred_ptr; + *pu1_out = CLIP_U8(i_macro); + + 
pi2_tmp_ptr++; + pu1_out_ptr++; + pu1_pred++; + } + +} + +void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src, + UWORD8 *pu1_pred, + UWORD8 *pu1_out, + WORD32 pred_strd, + WORD32 out_strd, + const UWORD16 *pu2_iscal_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 u4_qp_div_6, + WORD16 *pi2_tmp, + WORD32 iq_start_idx, + WORD16 *pi2_dc_ld_addr) +{ + UWORD8 *pu1_pred_ptr = pu1_pred; + UWORD8 *pu1_out_ptr = pu1_out; + WORD32 q0; + WORD16 x, i_macro, i; + WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0; + UNUSED(pi2_tmp); + + if (iq_start_idx == 0) + { + q0 = pi2_src[0]; + INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4); + } + else + { + q0 = pi2_dc_ld_addr[0]; // Restoring dc value for intra case3 + } + i_macro = ((q0 + 32) >> 6); + for(i = 0; i < SUB_BLK_WIDTH_4x4; i++) + { + pu1_pred_ptr = pu1_pred; + pu1_out = pu1_out_ptr; + + /* inverse prediction */ + + x = i_macro + *pu1_pred_ptr; + *pu1_out = CLIP_U8(x); + pu1_pred_ptr += pred_strd; + pu1_out += out_strd; + + x = i_macro + *pu1_pred_ptr; + *pu1_out = CLIP_U8(x); + pu1_pred_ptr += pred_strd; + pu1_out += out_strd; + + x = i_macro + *pu1_pred_ptr; + *pu1_out = CLIP_U8(x); + pu1_pred_ptr += pred_strd; + pu1_out += out_strd; + + x = i_macro + *pu1_pred_ptr; + *pu1_out = CLIP_U8(x); + + pu1_out_ptr++; + pu1_pred++; + } +} + +/** + ******************************************************************************* + * + * @brief + * This function performs inverse quant and Inverse transform type Ci4 for 8x8 block + * + * @par Description: + * Performs inverse transform Ci8 and adds the residue to get the + * reconstructed block + * + * @param[in] pi2_src + * Input 8x8coefficients + * + * @param[in] pu1_pred + * Prediction 8x8 block + * + * @param[out] pu1_recon + * Output 8x8 block + * + * @param[in] q_div + * QP/6 + * + * @param[in] q_rem + * QP%6 + * + * @param[in] q_lev + * Quantizer level + * + * @param[in] src_strd + * Input stride + * + * @param[in] pred_strd, + * 
Prediction stride + * + * @param[in] out_strd + * Output Stride + * + * @param[in] pi4_tmp + * temporary buffer of size 1*16 we dont need a bigger blcok since we reuse + * the tmp for each block + * + * @param[in] pu4_iquant_mat + * Pointer to the inverse quantization matrix + * + * @returns Void + * + * @remarks + * None + * + ******************************************************************************* + */ +void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src, + UWORD8 *pu1_pred, + UWORD8 *pu1_out, + WORD32 pred_strd, + WORD32 out_strd, + const UWORD16 *pu2_iscale_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 qp_div, + WORD16 *pi2_tmp, + WORD32 iq_start_idx, + WORD16 *pi2_dc_ld_addr +) +{ + WORD32 i; + WORD16 *pi2_tmp_ptr = pi2_tmp; + UWORD8 *pu1_pred_ptr = pu1_pred; + UWORD8 *pu1_out_ptr = pu1_out; + WORD16 i_z0, i_z1, i_z2, i_z3, i_z4, i_z5, i_z6, i_z7; + WORD16 i_y0, i_y1, i_y2, i_y3, i_y4, i_y5, i_y6, i_y7; + WORD16 i_macro; + WORD32 q; + WORD32 rnd_fact = (qp_div < 6) ? (1 << (5 - qp_div)) : 0; + UNUSED(iq_start_idx); + UNUSED(pi2_dc_ld_addr); + /*************************************************************/ + /* De quantization of coefficients. Will be replaced by SIMD */ + /* operations on platform. 
Note : DC coeff is not scaled */ + /*************************************************************/ + for(i = 0; i < (SUB_BLK_WIDTH_8x8 * SUB_BLK_WIDTH_8x8); i++) + { + q = pi2_src[i]; + INV_QUANT(q, pu2_iscale_mat[i], pu2_weigh_mat[i], qp_div, rnd_fact, 6); + pi2_tmp_ptr[i] = q; + } + /* Perform Inverse transform */ + /*--------------------------------------------------------------------*/ + /* IDCT [ Horizontal transformation ] */ + /*--------------------------------------------------------------------*/ + for(i = 0; i < SUB_BLK_WIDTH_8x8; i++) + { + /*------------------------------------------------------------------*/ + /* y0 = w0 + w4 */ + /* y1 = -w3 + w5 - w7 - (w7 >> 1) */ + /* y2 = w0 - w4 */ + /* y3 = w1 + w7 - w3 - (w3 >> 1) */ + /* y4 = (w2 >> 1) - w6 */ + /* y5 = -w1 + w7 + w5 + (w5 >> 1) */ + /* y6 = w2 + (w6 >> 1) */ + /* y7 = w3 + w5 + w1 + (w1 >> 1) */ + /*------------------------------------------------------------------*/ + i_y0 = (pi2_tmp_ptr[0] + pi2_tmp_ptr[4] ); + + i_y1 = ((WORD32)(-pi2_tmp_ptr[3]) + pi2_tmp_ptr[5] - pi2_tmp_ptr[7] + - (pi2_tmp_ptr[7] >> 1)); + + i_y2 = (pi2_tmp_ptr[0] - pi2_tmp_ptr[4] ); + + i_y3 = ((WORD32)pi2_tmp_ptr[1] + pi2_tmp_ptr[7] - pi2_tmp_ptr[3] + - (pi2_tmp_ptr[3] >> 1)); + + i_y4 = ((pi2_tmp_ptr[2] >> 1) - pi2_tmp_ptr[6] ); + + i_y5 = ((WORD32)(-pi2_tmp_ptr[1]) + pi2_tmp_ptr[7] + pi2_tmp_ptr[5] + + (pi2_tmp_ptr[5] >> 1)); + + i_y6 = (pi2_tmp_ptr[2] + (pi2_tmp_ptr[6] >> 1)); + + i_y7 = ((WORD32)pi2_tmp_ptr[3] + pi2_tmp_ptr[5] + pi2_tmp_ptr[1] + + (pi2_tmp_ptr[1] >> 1)); + + /*------------------------------------------------------------------*/ + /* z0 = y0 + y6 */ + /* z1 = y1 + (y7 >> 2) */ + /* z2 = y2 + y4 */ + /* z3 = y3 + (y5 >> 2) */ + /* z4 = y2 - y4 */ + /* z5 = (y3 >> 2) - y5 */ + /* z6 = y0 - y6 */ + /* z7 = y7 - (y1 >> 2) */ + /*------------------------------------------------------------------*/ + i_z0 = i_y0 + i_y6; + i_z1 = i_y1 + (i_y7 >> 2); + i_z2 = i_y2 + i_y4; + i_z3 = i_y3 + (i_y5 >> 2); + i_z4 
= i_y2 - i_y4; + i_z5 = (i_y3 >> 2) - i_y5; + i_z6 = i_y0 - i_y6; + i_z7 = i_y7 - (i_y1 >> 2); + + /*------------------------------------------------------------------*/ + /* x0 = z0 + z7 */ + /* x1 = z2 + z5 */ + /* x2 = z4 + z3 */ + /* x3 = z6 + z1 */ + /* x4 = z6 - z1 */ + /* x5 = z4 - z3 */ + /* x6 = z2 - z5 */ + /* x7 = z0 - z7 */ + /*------------------------------------------------------------------*/ + pi2_tmp_ptr[0] = i_z0 + i_z7; + pi2_tmp_ptr[1] = i_z2 + i_z5; + pi2_tmp_ptr[2] = i_z4 + i_z3; + pi2_tmp_ptr[3] = i_z6 + i_z1; + pi2_tmp_ptr[4] = i_z6 - i_z1; + pi2_tmp_ptr[5] = i_z4 - i_z3; + pi2_tmp_ptr[6] = i_z2 - i_z5; + pi2_tmp_ptr[7] = i_z0 - i_z7; + + /* move to the next row */ + //pi2_src_ptr += SUB_BLK_WIDTH_8x8; + pi2_tmp_ptr += SUB_BLK_WIDTH_8x8; + } + /*--------------------------------------------------------------------*/ + /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */ + /* */ + /* Add the prediction and store it back to reconstructed frame buffer */ + /* [Prediction buffer itself in this case] */ + /*--------------------------------------------------------------------*/ + + pi2_tmp_ptr = pi2_tmp; + for(i = 0; i < SUB_BLK_WIDTH_8x8; i++) + { + pu1_pred_ptr = pu1_pred; + pu1_out = pu1_out_ptr; + /*------------------------------------------------------------------*/ + /* y0j = w0j + w4j */ + /* y1j = -w3j + w5j -w7j -(w7j >> 1) */ + /* y2j = w0j -w4j */ + /* y3j = w1j + w7j -w3j -(w3j >> 1) */ + /* y4j = ( w2j >> 1 ) -w6j */ + /* y5j = -w1j + w7j + w5j + (w5j >> 1) */ + /* y6j = w2j + ( w6j >> 1 ) */ + /* y7j = w3j + w5j + w1j + (w1j >> 1) */ + /*------------------------------------------------------------------*/ + i_y0 = pi2_tmp_ptr[0] + pi2_tmp_ptr[32]; + + i_y1 = (WORD32)(-pi2_tmp_ptr[24]) + pi2_tmp_ptr[40] - pi2_tmp_ptr[56] + - (pi2_tmp_ptr[56] >> 1); + + i_y2 = pi2_tmp_ptr[0] - pi2_tmp_ptr[32]; + + i_y3 = (WORD32)pi2_tmp_ptr[8] + pi2_tmp_ptr[56] - pi2_tmp_ptr[24] + - (pi2_tmp_ptr[24] >> 1); + + i_y4 = (pi2_tmp_ptr[16] >> 1) - 
pi2_tmp_ptr[48]; + + i_y5 = (WORD32)(-pi2_tmp_ptr[8]) + pi2_tmp_ptr[56] + pi2_tmp_ptr[40] + + (pi2_tmp_ptr[40] >> 1); + + i_y6 = pi2_tmp_ptr[16] + (pi2_tmp_ptr[48] >> 1); + + i_y7 = (WORD32)pi2_tmp_ptr[24] + pi2_tmp_ptr[40] + pi2_tmp_ptr[8] + + (pi2_tmp_ptr[8] >> 1); + + /*------------------------------------------------------------------*/ + /* z0j = y0j + y6j */ + /* z1j = y1j + (y7j >> 2) */ + /* z2j = y2j + y4j */ + /* z3j = y3j + (y5j >> 2) */ + /* z4j = y2j -y4j */ + /* z5j = (y3j >> 2) -y5j */ + /* z6j = y0j -y6j */ + /* z7j = y7j -(y1j >> 2) */ + /*------------------------------------------------------------------*/ + i_z0 = i_y0 + i_y6; + i_z1 = i_y1 + (i_y7 >> 2); + i_z2 = i_y2 + i_y4; + i_z3 = i_y3 + (i_y5 >> 2); + i_z4 = i_y2 - i_y4; + i_z5 = (i_y3 >> 2) - i_y5; + i_z6 = i_y0 - i_y6; + i_z7 = i_y7 - (i_y1 >> 2); + + /*------------------------------------------------------------------*/ + /* x0j = z0j + z7j */ + /* x1j = z2j + z5j */ + /* x2j = z4j + z3j */ + /* x3j = z6j + z1j */ + /* x4j = z6j -z1j */ + /* x5j = z4j -z3j */ + /* x6j = z2j -z5j */ + /* x7j = z0j -z7j */ + /*------------------------------------------------------------------*/ + i_macro = ((i_z0 + i_z7 + 32) >> 6) + *pu1_pred_ptr; + *pu1_out = CLIP_U8(i_macro); + /* Change uc_recBuffer to Point to next element in the same column*/ + pu1_pred_ptr += pred_strd; + pu1_out += out_strd; + + i_macro = ((i_z2 + i_z5 + 32) >> 6) + *pu1_pred_ptr; + *pu1_out = CLIP_U8(i_macro); + pu1_pred_ptr += pred_strd; + pu1_out += out_strd; + + i_macro = ((i_z4 + i_z3 + 32) >> 6) + *pu1_pred_ptr; + *pu1_out = CLIP_U8(i_macro); + pu1_pred_ptr += pred_strd; + pu1_out += out_strd; + + i_macro = ((i_z6 + i_z1 + 32) >> 6) + *pu1_pred_ptr; + *pu1_out = CLIP_U8(i_macro); + pu1_pred_ptr += pred_strd; + pu1_out += out_strd; + + i_macro = ((i_z6 - i_z1 + 32) >> 6) + *pu1_pred_ptr; + *pu1_out = CLIP_U8(i_macro); + pu1_pred_ptr += pred_strd; + pu1_out += out_strd; + + i_macro = ((i_z4 - i_z3 + 32) >> 6) + *pu1_pred_ptr; + 
*pu1_out = CLIP_U8(i_macro); + pu1_pred_ptr += pred_strd; + pu1_out += out_strd; + + i_macro = ((i_z2 - i_z5 + 32) >> 6) + *pu1_pred_ptr; + *pu1_out = CLIP_U8(i_macro); + pu1_pred_ptr += pred_strd; + pu1_out += out_strd; + + i_macro = ((i_z0 - i_z7 + 32) >> 6) + *pu1_pred_ptr; + *pu1_out = CLIP_U8(i_macro); + + pi2_tmp_ptr++; + pu1_out_ptr++; + pu1_pred++; + } +} + +void ih264_iquant_itrans_recon_8x8_dc(WORD16 *pi2_src, + UWORD8 *pu1_pred, + UWORD8 *pu1_out, + WORD32 pred_strd, + WORD32 out_strd, + const UWORD16 *pu2_iscale_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 qp_div, + WORD16 *pi2_tmp, + WORD32 iq_start_idx, + WORD16 *pi2_dc_ld_addr) +{ + UWORD8 *pu1_pred_ptr = pu1_pred; + UWORD8 *pu1_out_ptr = pu1_out; + WORD16 x, i, i_macro; + WORD32 q; + WORD32 rnd_fact = (qp_div < 6) ? (1 << (5 - qp_div)) : 0; + UNUSED(pi2_tmp); + UNUSED(iq_start_idx); + UNUSED(pi2_dc_ld_addr); + /*************************************************************/ + /* Dequantization of coefficients. Will be replaced by SIMD */ + /* operations on platform. 
Note : DC coeff is not scaled */ + /*************************************************************/ + q = pi2_src[0]; + INV_QUANT(q, pu2_iscale_mat[0], pu2_weigh_mat[0], qp_div, rnd_fact, 6); + i_macro = (q + 32) >> 6; + /* Perform Inverse transform */ + /*--------------------------------------------------------------------*/ + /* IDCT [ Horizontal transformation ] */ + /*--------------------------------------------------------------------*/ + /*--------------------------------------------------------------------*/ + /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */ + /* */ + /* Add the prediction and store it back to reconstructed frame buffer */ + /* [Prediction buffer itself in this case] */ + /*--------------------------------------------------------------------*/ + for(i = 0; i < SUB_BLK_WIDTH_8x8; i++) + { + pu1_pred_ptr = pu1_pred; + pu1_out = pu1_out_ptr; + + x = i_macro + *pu1_pred_ptr; + *pu1_out = CLIP_U8(x); + /* Change uc_recBuffer to Point to next element in the same column*/ + pu1_pred_ptr += pred_strd; + pu1_out += out_strd; + + x = i_macro + *pu1_pred_ptr; + *pu1_out = CLIP_U8(x); + pu1_pred_ptr += pred_strd; + pu1_out += out_strd; + + x = i_macro + *pu1_pred_ptr; + *pu1_out = CLIP_U8(x); + pu1_pred_ptr += pred_strd; + pu1_out += out_strd; + + x = i_macro + *pu1_pred_ptr; + *pu1_out = CLIP_U8(x); + pu1_pred_ptr += pred_strd; + pu1_out += out_strd; + + x = i_macro + *pu1_pred_ptr; + *pu1_out = CLIP_U8(x); + pu1_pred_ptr += pred_strd; + pu1_out += out_strd; + + x = i_macro + *pu1_pred_ptr; + *pu1_out = CLIP_U8(x); + pu1_pred_ptr += pred_strd; + pu1_out += out_strd; + + x = i_macro + *pu1_pred_ptr; + *pu1_out = CLIP_U8(x); + pu1_pred_ptr += pred_strd; + pu1_out += out_strd; + + x = i_macro + *pu1_pred_ptr; + *pu1_out = CLIP_U8(x); + + pu1_out_ptr++; + pu1_pred++; + } +} + +/* + ******************************************************************************** + * + * @brief This function reconstructs a 4x4 sub block from quantized resiude and 
/*
 ********************************************************************************
 *
 * @brief This function reconstructs a 4x4 chroma sub block from quantized
 *        residue and prediction buffer
 *
 * @par Description:
 *  The quantized residue is first inverse quantized, then inverse transformed.
 *  This inverse transformed content is added to the prediction buffer to recon-
 *  struct the end output. The DC coefficient comes pre-dequantized through
 *  pi2_dc_src (chroma DC goes through a separate Hadamard path).
 *
 * @param[in] pi2_src
 *  quantized 4x4 block
 *
 * @param[in] pu1_pred
 *  prediction 4x4 block (interleaved Cb/Cr, hence stride-2 stepping below)
 *
 * @param[out] pu1_out
 *  reconstructed 4x4 block (interleaved)
 *
 * @param[in] pred_strd,
 *  Prediction buffer stride
 *
 * @param[in] out_strd
 *  recon buffer Stride
 *
 * @param[in] pu2_iscal_mat
 *  pointer to inverse scale matrix
 *
 * @param[in] pu2_weigh_mat
 *  pointer to scaling list
 *
 * @param[in] u4_qp_div_6
 *  Floor (qp/6)
 *
 * @param[in] pi2_tmp
 *  temporary buffer of size 1*16
 *
 * @param[in] pi2_dc_src
 *  pointer holding the already inverse-quantized DC coefficient
 *
 * @returns none
 *
 * @remarks INV_QUANT is a macro defined elsewhere; as seen in the 8x8 path
 *  it writes the dequantized value back into its first argument.
 *
 *******************************************************************************
 */
void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src,
                                          UWORD8 *pu1_pred,
                                          UWORD8 *pu1_out,
                                          WORD32 pred_strd,
                                          WORD32 out_strd,
                                          const UWORD16 *pu2_iscal_mat,
                                          const UWORD16 *pu2_weigh_mat,
                                          UWORD32 u4_qp_div_6,
                                          WORD16 *pi2_tmp,
                                          WORD16 *pi2_dc_src)
{
    WORD16 *pi2_src_ptr = pi2_src;
    WORD16 *pi2_tmp_ptr = pi2_tmp;
    UWORD8 *pu1_pred_ptr = pu1_pred;
    UWORD8 *pu1_out_ptr = pu1_out;
    WORD16 x0, x1, x2, x3, i;
    WORD32 q0, q1, q2, q3;
    WORD16 i_macro;
    /* Rounding factor applied inside INV_QUANT for qp_div < 4 */
    WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;

    /* inverse quant */
    /* horizontal inverse transform: one row per iteration */
    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        if(i==0)
        {
            /* DC coefficient is already inverse quantized (Hadamard path) */
            q0 = pi2_dc_src[0];
        }
        else
        {
            q0 = pi2_src_ptr[0];
            INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
        }

        q2 = pi2_src_ptr[2];
        INV_QUANT(q2, pu2_iscal_mat[2], pu2_weigh_mat[2], u4_qp_div_6, rnd_fact,
                  4);

        /* Even part of the butterfly */
        x0 = q0 + q2;
        x1 = q0 - q2;

        q1 = pi2_src_ptr[1];
        INV_QUANT(q1, pu2_iscal_mat[1], pu2_weigh_mat[1], u4_qp_div_6, rnd_fact,
                  4);

        q3 = pi2_src_ptr[3];
        INV_QUANT(q3, pu2_iscal_mat[3], pu2_weigh_mat[3], u4_qp_div_6, rnd_fact,
                  4);

        /* Odd part of the butterfly */
        x2 = (q1 >> 1) - q3;
        x3 = q1 + (q3 >> 1);

        pi2_tmp_ptr[0] = x0 + x3;
        pi2_tmp_ptr[1] = x1 + x2;
        pi2_tmp_ptr[2] = x1 - x2;
        pi2_tmp_ptr[3] = x0 - x3;

        pi2_src_ptr += SUB_BLK_WIDTH_4x4;
        pi2_tmp_ptr += SUB_BLK_WIDTH_4x4;
        pu2_iscal_mat += SUB_BLK_WIDTH_4x4;
        pu2_weigh_mat += SUB_BLK_WIDTH_4x4;
    }

    /* vertical inverse transform: one column per iteration, reading the
     * row-major temp buffer with stride 4 (offsets 0, 4, 8, 12 via [0],[8],[4],[12]) */
    pi2_tmp_ptr = pi2_tmp;
    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        pu1_pred_ptr = pu1_pred;
        pu1_out = pu1_out_ptr;

        x0 = (pi2_tmp_ptr[0] + pi2_tmp_ptr[8]);
        x1 = (pi2_tmp_ptr[0] - pi2_tmp_ptr[8]);
        x2 = (pi2_tmp_ptr[4] >> 1) - pi2_tmp_ptr[12];
        x3 = pi2_tmp_ptr[4] + (pi2_tmp_ptr[12] >> 1);

        /* inverse prediction: (residual + 32) >> 6, add pred, clip to [0,255] */
        i_macro = x0 + x3;
        i_macro = ((i_macro + 32) >> 6);
        i_macro += *pu1_pred_ptr;
        *pu1_out = CLIP_U8(i_macro);
        pu1_pred_ptr += pred_strd;
        pu1_out += out_strd;

        i_macro = x1 + x2;
        i_macro = ((i_macro + 32) >> 6);
        i_macro += *pu1_pred_ptr;
        *pu1_out = CLIP_U8(i_macro);
        pu1_pred_ptr += pred_strd;
        pu1_out += out_strd;

        i_macro = x1 - x2;
        i_macro = ((i_macro + 32) >> 6);
        i_macro += *pu1_pred_ptr;
        *pu1_out = CLIP_U8(i_macro);
        pu1_pred_ptr += pred_strd;
        pu1_out += out_strd;

        i_macro = x0 - x3;
        i_macro = ((i_macro + 32) >> 6);
        i_macro += *pu1_pred_ptr;
        *pu1_out = CLIP_U8(i_macro);

        pi2_tmp_ptr++;
        pu1_out_ptr+= 2; //Interleaved store for output
        pu1_pred+= 2; //Interleaved load for pred buffer
    }
}

/*
 ********************************************************************************
 *
 * @brief This function reconstructs a 4x4 chroma sub block from quantized
 *        residue and prediction buffer if only dc value is present for residue
 *
 * @par Description:
 *  The pre-dequantized DC value is rounded and added to the prediction buffer
 *  to reconstruct the end output. No transform is needed: a DC-only block
 *  yields a constant residual.
 *
 * @param[in] pi2_src
 *  unused (DC is read from pi2_dc_src instead)
 *
 * @param[in] pu1_pred
 *  prediction 4x4 block in interleaved format
 *
 * @param[out] pu1_out
 *  reconstructed 4x4 block in interleaved format
 *
 * @param[in] pred_strd,
 *  Prediction buffer stride in interleaved format
 *
 * @param[in] out_strd
 *  recon buffer Stride
 *
 * @param[in] pi2_dc_src
 *  pointer holding the already inverse-quantized DC coefficient
 *
 * @returns none
 *
 * @remarks none
 *
 *******************************************************************************
 */

void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src,
                                             UWORD8 *pu1_pred,
                                             UWORD8 *pu1_out,
                                             WORD32 pred_strd,
                                             WORD32 out_strd,
                                             const UWORD16 *pu2_iscal_mat,
                                             const UWORD16 *pu2_weigh_mat,
                                             UWORD32 u4_qp_div_6,
                                             WORD16 *pi2_tmp,
                                             WORD16 *pi2_dc_src)
{
    UWORD8 *pu1_pred_ptr = pu1_pred;
    UWORD8 *pu1_out_ptr = pu1_out;
    WORD32 q0;
    WORD16 x, i_macro, i;
    UNUSED(pi2_src);
    UNUSED(pu2_iscal_mat);
    UNUSED(pu2_weigh_mat);
    UNUSED(u4_qp_div_6);
    UNUSED(pi2_tmp);

    q0 = pi2_dc_src[0]; // Restoring dc value for intra case3
    /* Constant residual for the whole block after IDCT rounding */
    i_macro = ((q0 + 32) >> 6);

    /* One column per iteration; +=2 because chroma planes are interleaved */
    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        pu1_pred_ptr = pu1_pred;
        pu1_out = pu1_out_ptr;

        /* inverse prediction */
        x = i_macro + *pu1_pred_ptr;
        *pu1_out = CLIP_U8(x);
        pu1_pred_ptr += pred_strd;
        pu1_out += out_strd;

        x = i_macro + *pu1_pred_ptr;
        *pu1_out = CLIP_U8(x);
        pu1_pred_ptr += pred_strd;
        pu1_out += out_strd;

        x = i_macro + *pu1_pred_ptr;
        *pu1_out = CLIP_U8(x);
        pu1_pred_ptr += pred_strd;
        pu1_out += out_strd;

        x = i_macro + *pu1_pred_ptr;
        *pu1_out = CLIP_U8(x);

        pu1_out_ptr+=2;
        pu1_pred+=2;
    }
}
CLIP_U8(x); + + pu1_out_ptr+=2; + pu1_pred+=2; + } +} diff --git a/common/ih264_itrans_recon.h b/common/ih264_itrans_recon.h new file mode 100755 index 0000000..fd1f239 --- /dev/null +++ b/common/ih264_itrans_recon.h @@ -0,0 +1,71 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_itrans_recon.h +* +* @brief +* Contains function declarations for inverse transform and reconstruction of +* the quantized macro blocks +* +* @author +* Ittiam +* +* @par List of Functions: +* - ih264_itrans_recon_ft +* - ih264_itrans_recon_4x4 +* - ih264_itrans_recon_8x8 +* - ih264_itrans_recon_4x4_a9 +* +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef IH264_ITRANS_RECON_H_ +#define IH264_ITRANS_RECON_H_ + +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ + +typedef void ih264_itrans_recon_ft(WORD16 *pi2_src, + UWORD8 *pu1_pred, + UWORD8 *pu1_recon, + WORD32 src_strd, + WORD32 pred_strd, + WORD32 dst_strd, + UWORD32 q_lev, + WORD32 *pi4_tmp); + +/*C declarations*/ + +ih264_itrans_recon_ft ih264_itrans_recon_4x4; + +ih264_itrans_recon_ft ih264_itrans_recon_8x8; + +/*A9 declarations */ + +ih264_itrans_recon_ft ih264_itrans_recon_4x4_a9; + +#endif /* IH264_ITRANS_RECON_H_ */ diff --git a/common/ih264_list.c b/common/ih264_list.c new file mode 100755 index 0000000..736b41c --- /dev/null +++ b/common/ih264_list.c @@ -0,0 +1,574 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_list.c +* +* @brief +* Contains functions for buf queue +* +* @author +* Harish +* +* @par List of Functions: +* ih264_list_size() +* ih264_list_lock() +* ih264_list_unlock() +* ih264_list_yield() +* ih264_list_free() +* ih264_list_init() +* ih264_list_reset() +* ih264_list_deinit() +* ih264_list_terminate() +* ih264_list_queue() +* ih264_list_dequeue() +* +* @remarks +* None +* +******************************************************************************* +*/ +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +#include "ih264_typedefs.h" +#include "ithread.h" +#include "ih264_platform_macros.h" +#include "ih264_macros.h" +#include "ih264_debug.h" +#include "ih264_error.h" +#include "ih264_list.h" + +/** +******************************************************************************* +* +* @brief Returns size for buf queue context. Does not include buf queue buffer +* requirements +* +* @par Description +* Returns size for buf queue context. Does not include buf queue buffer +* requirements. Buffer size required to store the bufs should be allocated in +* addition to the value returned here. 
+* +* @returns Size of the buf queue context +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264_list_size(WORD32 num_entries, WORD32 entry_size) +{ + WORD32 size; + WORD32 clz; + size = sizeof(list_t); + size += ithread_get_mutex_lock_size(); + + /* Use next power of two number of entries*/ + clz = CLZ(num_entries); + num_entries = 1 << (32 - clz); + + size += num_entries * entry_size; + return size; +} + +/** +******************************************************************************* +* +* @brief +* Locks the list context +* +* @par Description +* Locks the list context by calling ithread_mutex_lock() +* +* @param[in] ps_list +* Job Queue context +* +* @returns IH264_FAIL if mutex lock fails else IH264_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ +IH264_ERROR_T ih264_list_lock(list_t *ps_list) +{ + WORD32 retval; + retval = ithread_mutex_lock(ps_list->pv_mutex); + if(retval) + { + return IH264_FAIL; + } + return IH264_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Unlocks the list context +* +* @par Description +* Unlocks the list context by calling ithread_mutex_unlock() +* +* @param[in] ps_list +* Job Queue context +* +* @returns IH264_FAIL if mutex unlock fails else IH264_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ + +IH264_ERROR_T ih264_list_unlock(list_t *ps_list) +{ + WORD32 retval; + retval = ithread_mutex_unlock(ps_list->pv_mutex); + if(retval) + { + return IH264_FAIL; + } + return IH264_SUCCESS; + +} +/** +******************************************************************************* +* +* @brief +* Yields the thread +* +* @par Description +* Unlocks the list context by calling +* ih264_list_unlock(), ithread_yield() and then ih264_list_lock() +* list is unlocked before to ensure the 
list can be accessed by other threads +* If unlock is not done before calling yield then no other thread can access +* the list functions and update list. +* +* @param[in] ps_list +* Job Queue context +* +* @returns IH264_FAIL if mutex lock unlock or yield fails else IH264_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ +IH264_ERROR_T ih264_list_yield(list_t *ps_list) +{ + + IH264_ERROR_T ret = IH264_SUCCESS; + + IH264_ERROR_T rettmp; + rettmp = ih264_list_unlock(ps_list); + RETURN_IF((rettmp != IH264_SUCCESS), rettmp); + + ithread_yield(); + + if(ps_list->i4_yeild_interval_us > 0) + ithread_usleep(ps_list->i4_yeild_interval_us); + + rettmp = ih264_list_lock(ps_list); + RETURN_IF((rettmp != IH264_SUCCESS), rettmp); + return ret; +} + + +/** +******************************************************************************* +* +* @brief free the buf queue pointers +* +* @par Description +* Frees the list context +* +* @param[in] pv_buf +* Memory for buf queue buffer and buf queue context +* +* @returns Pointer to buf queue context +* +* @remarks +* Since it will be called only once by master thread this is not thread safe. +* +******************************************************************************* +*/ +IH264_ERROR_T ih264_list_free(list_t *ps_list) +{ + WORD32 ret; + ret = ithread_mutex_destroy(ps_list->pv_mutex); + + if(0 == ret) + return IH264_SUCCESS; + else + return IH264_FAIL; +} + +/** +******************************************************************************* +* +* @brief Initialize the buf queue +* +* @par Description +* Initializes the list context and sets write and read pointers to start of +* buf queue buffer +* +* @param[in] pv_buf +* Memoy for buf queue buffer and buf queue context +* +* @param[in] buf_size +* Size of the total memory allocated +* +* @returns Pointer to buf queue context +* +* @remarks +* Since it will be called only once by master thread this is not thread safe. 
+* +******************************************************************************* +*/ +void* ih264_list_init(void *pv_buf, + WORD32 buf_size, + WORD32 num_entries, + WORD32 entry_size, + WORD32 yeild_interval_us) +{ + list_t *ps_list; + UWORD8 *pu1_buf; + + pu1_buf = (UWORD8 *)pv_buf; + + ps_list = (list_t *)pu1_buf; + pu1_buf += sizeof(list_t); + buf_size -= sizeof(list_t); + + ps_list->pv_mutex = pu1_buf; + pu1_buf += ithread_get_mutex_lock_size(); + buf_size -= ithread_get_mutex_lock_size(); + + if (buf_size <= 0) + return NULL; + + ithread_mutex_init(ps_list->pv_mutex); + + /* Ensure num_entries is power of two */ + ASSERT(0 == (num_entries & (num_entries - 1))); + + /* Ensure remaining buffer is large enough to hold given number of entries */ + ASSERT((num_entries * entry_size) <= buf_size); + + ps_list->pv_buf_base = pu1_buf; + ps_list->i4_terminate = 0; + ps_list->i4_entry_size = entry_size; + ps_list->i4_buf_rd_idx = 0; + ps_list->i4_buf_wr_idx = 0; + ps_list->i4_log2_buf_max_idx = 32 - CLZ(num_entries); + ps_list->i4_buf_max_idx = num_entries; + ps_list->i4_yeild_interval_us = yeild_interval_us; + + return ps_list; +} +/** +******************************************************************************* +* +* @brief +* Resets the list context +* +* @par Description +* Resets the list context by initializing buf queue context elements +* +* @param[in] ps_list +* Job Queue context +* +* @returns IH264_FAIL if lock unlock fails else IH264_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ +IH264_ERROR_T ih264_list_reset(list_t *ps_list) +{ + IH264_ERROR_T ret = IH264_SUCCESS; + ret = ih264_list_lock(ps_list); + RETURN_IF((ret != IH264_SUCCESS), ret); + + ps_list->i4_terminate = 0; + ps_list->i4_buf_rd_idx = 0; + ps_list->i4_buf_wr_idx = 0; + + ret = ih264_list_unlock(ps_list); + RETURN_IF((ret != IH264_SUCCESS), ret); + + return ret; +} + +/** 
+******************************************************************************* +* +* @brief +* Deinitializes the list context +* +* @par Description +* Deinitializes the list context by calling ih264_list_reset() +* and then destrying the mutex created +* +* @param[in] ps_list +* Job Queue context +* +* @returns IH264_FAIL if lock unlock fails else IH264_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ +IH264_ERROR_T ih264_list_deinit(list_t *ps_list) +{ + WORD32 retval; + IH264_ERROR_T ret = IH264_SUCCESS; + + ret = ih264_list_reset(ps_list); + RETURN_IF((ret != IH264_SUCCESS), ret); + + retval = ithread_mutex_destroy(ps_list->pv_mutex); + if(retval) + { + return IH264_FAIL; + } + + return IH264_SUCCESS; +} + + +/** +******************************************************************************* +* +* @brief +* Terminates the list +* +* @par Description +* Terminates the list by setting a flag in context. +* +* @param[in] ps_list +* Job Queue context +* +* @returns IH264_FAIL if lock unlock fails else IH264_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ + +IH264_ERROR_T ih264_list_terminate(list_t *ps_list) +{ + IH264_ERROR_T ret = IH264_SUCCESS; + ret = ih264_list_lock(ps_list); + RETURN_IF((ret != IH264_SUCCESS), ret); + + ps_list->i4_terminate = 1; + + ret = ih264_list_unlock(ps_list); + RETURN_IF((ret != IH264_SUCCESS), ret); + return ret; +} + + +/** +******************************************************************************* +* +* @brief Adds a buf to the queue +* +* @par Description +* Adds a buf to the queue and updates wr address to next location. +* Format/content of the buf structure is abstracted and hence size of the buf +* buffer is being passed. 
+* +* @param[in] ps_list +* Job Queue context +* +* @param[in] pv_buf +* Pointer to the location that contains details of the buf to be added +* +* @param[in] buf_size +* Size of the buf buffer +* +* @param[in] blocking +* To signal if the write is blocking or non-blocking. +* +* @returns +* +* @remarks +* Job Queue buffer is assumed to be allocated to handle worst case number of bufs +* Wrap around is not supported +* +******************************************************************************* +*/ +IH264_ERROR_T ih264_list_queue(list_t *ps_list, void *pv_buf, WORD32 blocking) +{ + IH264_ERROR_T ret = IH264_SUCCESS; + IH264_ERROR_T rettmp; + + WORD32 diff; + void *pv_buf_wr; + + volatile WORD32 *pi4_wr_idx, *pi4_rd_idx; + WORD32 buf_size = ps_list->i4_entry_size; + + + rettmp = ih264_list_lock(ps_list); + RETURN_IF((rettmp != IH264_SUCCESS), rettmp); + + + + while(1) + { + /* Ensure wr idx does not go beyond rd idx by more than number of entries + */ + pi4_wr_idx = &ps_list->i4_buf_wr_idx; + pi4_rd_idx = &ps_list->i4_buf_rd_idx; + diff = *pi4_wr_idx - *pi4_rd_idx; + + if(diff < ps_list->i4_buf_max_idx) + { + WORD32 wr_idx; + wr_idx = ps_list->i4_buf_wr_idx & (ps_list->i4_buf_max_idx - 1); + pv_buf_wr = (UWORD8 *)ps_list->pv_buf_base + wr_idx * buf_size; + + memcpy(pv_buf_wr, pv_buf, buf_size); + ps_list->i4_buf_wr_idx++; + break; + } + else + { + /* wr is ahead, so wait for rd to consume */ + if(blocking) + { + ih264_list_yield(ps_list); + } + else + { + ret = IH264_FAIL; + break; + } + } + + } + ps_list->i4_terminate = 0; + + rettmp = ih264_list_unlock(ps_list); + RETURN_IF((rettmp != IH264_SUCCESS), rettmp); + + return ret; +} +/** +******************************************************************************* +* +* @brief Gets next from the Job queue +* +* @par Description +* Gets next buf from the buf queue and updates rd address to next location. +* Format/content of the buf structure is abstracted and hence size of the buf +* buffer is being passed. 
If it is a blocking call and if there is no new buf +* then this functions unlocks the mutex and calls yield and then locks it back. +* and continues till a buf is available or terminate is set +* +* @param[in] ps_list +* Job Queue context +* +* @param[out] pv_buf +* Pointer to the location that contains details of the buf to be written +* +* @param[in] buf_size +* Size of the buf buffer +* +* @param[in] blocking +* To signal if the read is blocking or non-blocking. +* +* @returns +* +* @remarks +* Job Queue buffer is assumed to be allocated to handle worst case number of bufs +* Wrap around is not supported +* +******************************************************************************* +*/ +IH264_ERROR_T ih264_list_dequeue(list_t *ps_list, void *pv_buf, WORD32 blocking) +{ + IH264_ERROR_T ret = IH264_SUCCESS; + IH264_ERROR_T rettmp; + WORD32 buf_size = ps_list->i4_entry_size; + WORD32 diff; + + void *pv_buf_rd; + volatile WORD32 *pi4_wr_idx, *pi4_rd_idx; + + rettmp = ih264_list_lock(ps_list); + RETURN_IF((rettmp != IH264_SUCCESS), rettmp); + + while(1) + { + /* Ensure wr idx is ahead of rd idx and + * wr idx does not go beyond rd idx by more than number of entries + */ + pi4_wr_idx = &ps_list->i4_buf_wr_idx; + pi4_rd_idx = &ps_list->i4_buf_rd_idx; + diff = *pi4_wr_idx - *pi4_rd_idx; + + + if(diff > 0) + { + WORD32 rd_idx; + rd_idx = ps_list->i4_buf_rd_idx & (ps_list->i4_buf_max_idx - 1); + pv_buf_rd = (UWORD8 *)ps_list->pv_buf_base + rd_idx * buf_size; + + memcpy(pv_buf, pv_buf_rd, buf_size); + ps_list->i4_buf_rd_idx++; + break; + } + else + { + /* If terminate is signaled then break */ + if(ps_list->i4_terminate) + { + ret = IH264_FAIL; + break; + } + /* wr is ahead, so wait for rd to consume */ + if(blocking) + { + ih264_list_yield(ps_list); + } + else + { + ret = IH264_FAIL; + break; + } + } + + } + + + rettmp = ih264_list_unlock(ps_list); + RETURN_IF((rettmp != IH264_SUCCESS), rettmp); + + return ret; +} diff --git a/common/ih264_list.h 
b/common/ih264_list.h new file mode 100755 index 0000000..fc59d95 --- /dev/null +++ b/common/ih264_list.h @@ -0,0 +1,93 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_list.h +* +* @brief +* Contains functions for buf queue +* +* @author +* Harish +* +* @par List of Functions: +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef _IH264_LIST_H_ +#define _IH264_LIST_H_ + +typedef struct +{ + /** Pointer to buffer base which contains the bufs */ + void *pv_buf_base; + + /** Mutex used to keep the functions thread-safe */ + void *pv_mutex; + + /** Current write index */ + volatile WORD32 i4_buf_wr_idx; + + /** Current read index */ + volatile WORD32 i4_buf_rd_idx; + + /** Maximum index */ + WORD32 i4_buf_max_idx; + + /** Log2(buf_max_idx) - + * To ensure number of entries is power of two + * This makes it easier to wrap around by using AND with buf_max_idx - 1 + * */ + WORD32 i4_log2_buf_max_idx; + + /** Flag to indicate list has to be terminated */ + WORD32 i4_terminate; + + /** Size 
of each entry */ + WORD32 i4_entry_size; + + /** If the list is to be used frequently send this as zero, else send a large value + * to ensure cores are not loaded unnecessarily. + * For eg: For picture level queues this can be a large value like 100us + * but for jobq this will be zero. + */ + WORD32 i4_yeild_interval_us; + +}list_t; + +WORD32 ih264_list_size(WORD32 num_entries, WORD32 entry_size); +void* ih264_list_init(void *pv_buf, + WORD32 buf_size, + WORD32 num_entries, + WORD32 entry_size, + WORD32 yeild_interval_us); +IH264_ERROR_T ih264_list_free(list_t *ps_list); +IH264_ERROR_T ih264_list_reset(list_t *ps_list); +IH264_ERROR_T ih264_list_deinit(list_t *ps_list); +IH264_ERROR_T ih264_list_terminate(list_t *ps_list); +IH264_ERROR_T ih264_list_queue(list_t *ps_list, void *pv_buf, WORD32 blocking); +IH264_ERROR_T ih264_list_dequeue(list_t *ps_list, void *pv_buf, WORD32 blocking); + +#endif /* _IH264_PROCESS_SLICE_H_ */ diff --git a/common/ih264_luma_intra_pred_filters.c b/common/ih264_luma_intra_pred_filters.c new file mode 100755 index 0000000..4a5b143 --- /dev/null +++ b/common/ih264_luma_intra_pred_filters.c @@ -0,0 +1,1933 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_luma_intra_pred_filters.c + * + * @brief + * Contains function definitions for intra prediction filters + * + * @author + * Ittiam + * + * @par List of Functions: + * - ih264_intra_pred_luma_4x4_mode_vert + * - ih264_intra_pred_luma_4x4_mode_horz + * - ih264_intra_pred_luma_4x4_mode_dc + * - ih264_intra_pred_luma_4x4_mode_diag_dl + * - ih264_intra_pred_luma_4x4_mode_diag_dr + * - ih264_intra_pred_luma_4x4_mode_vert_r + * - ih264_intra_pred_luma_4x4_mode_horz_d + * - ih264_intra_pred_luma_4x4_mode_vert_l + * - ih264_intra_pred_luma_4x4_mode_horz_u + * - ih264_intra_pred_luma_8x8_mode_ref_filtering + * - ih264_intra_pred_luma_8x8_mode_vert + * - ih264_intra_pred_luma_8x8_mode_horz + * - ih264_intra_pred_luma_8x8_mode_dc + * - ih264_intra_pred_luma_8x8_mode_diag_dl + * - ih264_intra_pred_luma_8x8_mode_diag_dr + * - ih264_intra_pred_luma_8x8_mode_vert_r + * - ih264_intra_pred_luma_8x8_mode_horz_d + * - ih264_intra_pred_luma_8x8_mode_vert_l + * - ih264_intra_pred_luma_8x8_mode_horz_u + * - ih264_intra_pred_luma_16x16_mode_vert + * - ih264_intra_pred_luma_16x16_mode_horz + * - ih264_intra_pred_luma_16x16_mode_dc + * - ih264_intra_pred_luma_16x16_mode_plane + * + * + * @remarks + * None + * + ****************************************************************************** + */ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ +/* System include files */ +#include <stdio.h> +#include <stddef.h> +#include <string.h> + +/* User include files */ +#include "ih264_defs.h" +#include "ih264_typedefs.h" +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include 
"ih264_intra_pred_filters.h" + +/* Global variables used only in assembly files*/ +const WORD8 ih264_gai1_intrapred_luma_plane_coeffs[] = +{ 0x01, 0x02, 0x03, 0x04, + 0x05, 0x06, 0x07, 0x08, + 0x09, 0x0A, 0x0B, 0x0C, + 0x0D, 0x0E, 0x0F, 0x10, }; + +const WORD8 ih264_gai1_intrapred_luma_8x8_horz_u[] = +{ 0x06,0x15,0x05,0x14, + 0x04,0x13,0x03,0x12, + 0x02,0x11,0x01,0x10, + 0x00,0x1F,0x0F,0x0F +}; + +/******************* LUMA INTRAPREDICTION *******************/ + +/******************* 4x4 Modes *******************/ + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_4x4_mode_vert + * + * @brief + * Perform Intra prediction for luma_4x4 mode:vertical + * + * @par Description: + * Perform Intra prediction for luma_4x4 mode:vertical ,described in sec 8.3.1.2.1 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ +void ih264_intra_pred_luma_4x4_mode_vert(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_top = pu1_src + BLK_SIZE + 1; + + memcpy(pu1_dst, pu1_top, 4); + memcpy(pu1_dst + dst_strd, pu1_top, 4); + memcpy(pu1_dst + 2 * dst_strd, pu1_top, 4); + memcpy(pu1_dst + 3 * dst_strd, pu1_top, 4); +} + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_4x4_mode_horz + * + * @brief + * Perform Intra prediction for luma_4x4 mode:horizontal + * + * @par Description: + * Perform 
Intra prediction for luma_4x4 mode:horizontal ,described in sec 8.3.1.2.2 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ +void ih264_intra_pred_luma_4x4_mode_horz(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ + + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_left = pu1_src + BLK_SIZE - 1; + + memset(pu1_dst, *pu1_left, 4); + memset(pu1_dst + dst_strd, *(pu1_left - 1), 4); + memset(pu1_dst + 2 * dst_strd, *(pu1_left - 2), 4); + memset(pu1_dst + 3 * dst_strd, *(pu1_left - 3), 4); +} + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_4x4_mode_dc + * + * @brief + * Perform Intra prediction for luma_4x4 mode:DC + * + * @par Description: + * Perform Intra prediction for luma_4x4 mode:DC ,described in sec 8.3.1.2.3 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_4x4_mode_dc(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 u1_useleft; /* availability of left predictors (only for DC) */ + UWORD8 u1_usetop; 
/* availability of top predictors (only for DC) */ + UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + WORD32 val = 0; + UNUSED(src_strd); + UNUSED(ngbr_avail); + u1_useleft = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK); + u1_usetop = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK); + pu1_top = pu1_src + BLK_SIZE + 1; + pu1_left = pu1_src + BLK_SIZE - 1; + + if(u1_useleft) + { + val += *pu1_left--; + val += *pu1_left--; + val += *pu1_left--; + val += *pu1_left + 2; + } + if(u1_usetop) + { + val += *pu1_top + *(pu1_top + 1) + *(pu1_top + 2) + *(pu1_top + 3) + + 2; + } + /* Since 2 is added if either left/top pred is there, + val still being zero implies both preds are not there */ + val = (val) ? (val >> (1 + u1_useleft + u1_usetop)) : 128; + + /* 4 bytes are copied from src to dst */ + memset(pu1_dst, val, 4); + memset(pu1_dst + dst_strd, val, 4); + memset(pu1_dst + 2 * dst_strd, val, 4); + memset(pu1_dst + 3 * dst_strd, val, 4); +} + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_4x4_mode_diag_dl + * + * @brief + * Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left + * + * @par Description: + * Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left ,described in sec 8.3.1.2.4 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_4x4_mode_diag_dl(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_top = 
NULL; /* Pointer to start of top predictors */ + UWORD32 ui4_a, ui4_b, ui4_c, ui4_d, ui4_e, ui4_f, ui4_g, ui4_h; + UWORD8 predicted_pixels[7]; + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_top = pu1_src +BLK_SIZE + 1; + + ui4_a = *pu1_top++; + ui4_b = *pu1_top++; + ui4_c = *pu1_top++; + ui4_d = *pu1_top++; + ui4_e = *pu1_top++; + ui4_f = *pu1_top++; + ui4_g = *pu1_top++; + ui4_h = *pu1_top; + + predicted_pixels[0] = FILT121(ui4_a, ui4_b, ui4_c); + predicted_pixels[1] = FILT121(ui4_b, ui4_c, ui4_d); + predicted_pixels[2] = FILT121(ui4_c, ui4_d, ui4_e); + predicted_pixels[3] = FILT121(ui4_d, ui4_e, ui4_f); + predicted_pixels[4] = FILT121(ui4_e, ui4_f, ui4_g); + predicted_pixels[5] = FILT121(ui4_f, ui4_g, ui4_h); + predicted_pixels[6] = FILT121(ui4_g, ui4_h, ui4_h); + + memcpy(pu1_dst, predicted_pixels, 4); + memcpy(pu1_dst + dst_strd, predicted_pixels + 1, 4); + memcpy(pu1_dst + 2 * dst_strd, predicted_pixels + 2, 4); + memcpy(pu1_dst + 3 * dst_strd, predicted_pixels + 3, 4); +} + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_4x4_mode_diag_dr + * + * @brief + * Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right + * + * @par Description: + * Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right ,described in sec 8.3.1.2.5 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_4x4_mode_diag_dr(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_top = NULL; /* Pointer to start of top 
predictors */ + UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ + UWORD8 *pu1_topleft = NULL;/* Pointer to top left predictor */ + UWORD32 ui4_a, ui4_b, ui4_c, ui4_d, ui4_i, ui4_j, ui4_k, ui4_l, ui4_m; + UWORD8 predicted_pixels[7]; + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_top = pu1_src + BLK_SIZE + 1; + pu1_left = pu1_src + BLK_SIZE - 1; + pu1_topleft = pu1_src +BLK_SIZE; + + ui4_a = *pu1_top++; + ui4_b = *pu1_top++; + ui4_c = *pu1_top++; + ui4_d = *pu1_top++; + ui4_i = *pu1_left--; + ui4_j = *pu1_left--; + ui4_k = *pu1_left--; + ui4_l = *pu1_left; + ui4_m = *pu1_topleft; + + predicted_pixels[2] = FILT121(ui4_j, ui4_i, ui4_m); + predicted_pixels[1] = FILT121(ui4_k, ui4_j, ui4_i); + predicted_pixels[0] = FILT121(ui4_l, ui4_k, ui4_j); + predicted_pixels[3] = FILT121(ui4_i, ui4_m, ui4_a); + predicted_pixels[4] = FILT121(ui4_m, ui4_a, ui4_b); + predicted_pixels[5] = FILT121(ui4_a, ui4_b, ui4_c); + predicted_pixels[6] = FILT121(ui4_b, ui4_c, ui4_d); + + memcpy(pu1_dst, predicted_pixels + 3, 4); + memcpy(pu1_dst + dst_strd, predicted_pixels + 2, 4); + memcpy(pu1_dst + 2 * dst_strd, predicted_pixels + 1, 4); + memcpy(pu1_dst + 3 * dst_strd, predicted_pixels, 4); +} + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_4x4_mode_vert_r + * + * @brief + * Perform Intra prediction for luma_4x4 mode:Vertical_Right + * + * @par Description: + * Perform Intra prediction for luma_4x4 mode:Vertical_Right ,described in sec 8.3.1.2.6 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ 
+void ih264_intra_pred_luma_4x4_mode_vert_r(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + + UWORD32 ui4_a, ui4_b, ui4_c, ui4_d, ui4_i, ui4_j, ui4_k, ui4_m; + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ + UWORD8 *pu1_topleft = NULL;/* Pointer to top left predictor */ + UWORD8 predicted_pixels[10]; + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_top = pu1_src +BLK_SIZE + 1; + pu1_left = pu1_src + BLK_SIZE - 1; + pu1_topleft = pu1_src + BLK_SIZE; + + ui4_a = *pu1_top++; + ui4_b = *pu1_top++; + ui4_c = *pu1_top++; + ui4_d = *pu1_top++; + ui4_i = *pu1_left--; + ui4_j = *pu1_left--; + ui4_k = *pu1_left; + ui4_m = *pu1_topleft; + + predicted_pixels[6] = FILT11(ui4_m, ui4_a); + predicted_pixels[7] = FILT11(ui4_a, ui4_b); + predicted_pixels[8] = FILT11(ui4_b, ui4_c); + predicted_pixels[9] = FILT11(ui4_c, ui4_d); + predicted_pixels[1] = FILT121(ui4_i, ui4_m, ui4_a); + predicted_pixels[2] = FILT121(ui4_m, ui4_a, ui4_b); + predicted_pixels[3] = FILT121(ui4_a, ui4_b, ui4_c); + predicted_pixels[4] = FILT121(ui4_b, ui4_c, ui4_d); + predicted_pixels[5] = FILT121(ui4_j, ui4_i, ui4_m); + predicted_pixels[0] = FILT121(ui4_k, ui4_j, ui4_i); + + memcpy(pu1_dst, predicted_pixels + 6, 4); + memcpy(pu1_dst + dst_strd, predicted_pixels + 1, 4); + memcpy(pu1_dst + 2 * dst_strd, predicted_pixels + 5, 4); + memcpy(pu1_dst + 3 * dst_strd, predicted_pixels, 4); +} + +/* + ******************************************************************************* + * + *ih264_intra_pred_luma_4x4_mode_horz_d + * + * @brief + * Perform Intra prediction for luma_4x4 mode:Horizontal_Down + * + * @par Description: + * Perform Intra prediction for luma_4x4 mode:Horizontal_Down ,described in sec 8.3.1.2.7 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer 
source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_4x4_mode_horz_d(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ + UWORD8 *pu1_topleft = NULL;/* Pointer to top left predictor */ + UWORD32 ui4_a, ui4_b, ui4_c, ui4_i, ui4_j, ui4_k, ui4_l, ui4_m; + UWORD8 predicted_pixels[10]; + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_top = pu1_src + BLK_SIZE + 1; + pu1_left = pu1_src + BLK_SIZE - 1; + pu1_topleft = pu1_src + BLK_SIZE; + + ui4_a = *pu1_top++; + ui4_b = *pu1_top++; + ui4_c = *pu1_top++; + ui4_i = *pu1_left--; + ui4_j = *pu1_left--; + ui4_k = *pu1_left--; + ui4_l = *pu1_left--; + ui4_m = *pu1_topleft; + + predicted_pixels[6] = FILT11(ui4_i, ui4_m); + predicted_pixels[7] = FILT121(ui4_i, ui4_m, ui4_a); + predicted_pixels[8] = FILT121(ui4_m, ui4_a, ui4_b); + predicted_pixels[9] = FILT121(ui4_a, ui4_b, ui4_c); + predicted_pixels[1] = FILT121(ui4_l, ui4_k, ui4_j); + predicted_pixels[2] = FILT11(ui4_k, ui4_j); + predicted_pixels[3] = FILT121(ui4_k, ui4_j, ui4_i); + predicted_pixels[4] = FILT11(ui4_j, ui4_i); + predicted_pixels[5] = FILT121(ui4_j, ui4_i, ui4_m); + predicted_pixels[0] = FILT11(ui4_l, ui4_k); + + memcpy(pu1_dst, predicted_pixels + 6, 4); + memcpy(pu1_dst + dst_strd, predicted_pixels + 4, 4); + memcpy(pu1_dst + 2 * dst_strd, predicted_pixels + 2, 4); + memcpy(pu1_dst + 3 * dst_strd, predicted_pixels, 4); +} + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_4x4_mode_vert_l + * + * @brief + * Perform Intra prediction for luma_4x4 
mode:Vertical_Left + * + * @par Description: + * Perform Intra prediction for luma_4x4 mode:Vertical_Left ,described in sec 8.3.1.2.8 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_4x4_mode_vert_l(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + UWORD32 ui4_a, ui4_b, ui4_c, ui4_d, ui4_e, ui4_f, ui4_g; + UWORD8 predicted_pixels[10]; + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_top = pu1_src + BLK_SIZE + 1; + + ui4_a = *pu1_top++; + ui4_b = *pu1_top++; + ui4_c = *pu1_top++; + ui4_d = *pu1_top++; + ui4_e = *pu1_top++; + ui4_f = *pu1_top++; + ui4_g = *pu1_top; + + predicted_pixels[5] = FILT11(ui4_a, ui4_b); + predicted_pixels[6] = FILT11(ui4_b, ui4_c); + predicted_pixels[7] = FILT11(ui4_c, ui4_d); + predicted_pixels[8] = FILT11(ui4_d, ui4_e); + predicted_pixels[0] = FILT121(ui4_a, ui4_b, ui4_c); + predicted_pixels[1] = FILT121(ui4_b, ui4_c, ui4_d); + predicted_pixels[2] = FILT121(ui4_c, ui4_d, ui4_e); + predicted_pixels[3] = FILT121(ui4_d, ui4_e, ui4_f); + predicted_pixels[9] = FILT11(ui4_e, ui4_f); + predicted_pixels[4] = FILT121(ui4_e, ui4_f, ui4_g); + + memcpy(pu1_dst, predicted_pixels + 5, 4); + memcpy(pu1_dst + dst_strd, predicted_pixels, 4); + memcpy(pu1_dst + 2 * dst_strd, predicted_pixels + 6, 4); + memcpy(pu1_dst + 3 * dst_strd, predicted_pixels + 1, 4); +} + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_4x4_mode_horz_u + * + * @brief 
+ * Perform Intra prediction for luma_4x4 mode:Horizontal_Up + * + * @par Description: + * Perform Intra prediction for luma_4x4 mode:Horizontal_Up ,described in sec 8.3.1.2.9 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_4x4_mode_horz_u(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ + UWORD32 ui4_i, ui4_j, ui4_k, ui4_l; + UWORD8 predicted_pixels[10]; + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_left = pu1_src + BLK_SIZE - 1; + + ui4_i = *pu1_left--; + ui4_j = *pu1_left--; + ui4_k = *pu1_left--; + ui4_l = *pu1_left--; + + predicted_pixels[0] = FILT11(ui4_j, ui4_i); + predicted_pixels[1] = FILT121(ui4_k, ui4_j, ui4_i); + predicted_pixels[2] = FILT11(ui4_k, ui4_j); + predicted_pixels[3] = FILT121(ui4_l, ui4_k, ui4_j); + predicted_pixels[4] = FILT11(ui4_l, ui4_k); + predicted_pixels[5] = FILT121(ui4_l, ui4_l, ui4_k); + predicted_pixels[6] = ui4_l; + predicted_pixels[7] = ui4_l; + predicted_pixels[8] = ui4_l; + predicted_pixels[9] = ui4_l; + + memcpy(pu1_dst, predicted_pixels, 4); + memcpy(pu1_dst + dst_strd, predicted_pixels + 2, 4); + memcpy(pu1_dst + 2 * dst_strd, predicted_pixels + 4, 4); + memcpy(pu1_dst + 3 * dst_strd, predicted_pixels + 6, 4); +} + +/******************* 8x8 Modes *******************/ + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_8x8_mode_ref_filtering + * + * @brief + * Reference sample filtering process for 
/**
 *******************************************************************************
 * ih264_intra_pred_luma_8x8_mode_ref_filtering
 *
 * @brief
 *  Reference sample filtering for Intra_8x8 prediction (sec 8.3.2.2.1).
 *  Smooths the available left / top-left / top / top-right neighbours with a
 *  [1 2 1]/4 filter and packs them into pu1_dst laid out as:
 *  [0..7] left (bottom to top), [8] top-left, [9..24] top + top-right.
 *
 *  NOTE(review): the original doc header described a (pu1_src, src_strd)
 *  signature; the actual parameters are documented below.
 *
 * @param[in] pu1_left
 *  Pointer to the left neighbour column, strided by left_strd.
 *  Doubles as an availability flag together with ngbr_avail.
 *
 * @param[in] pu1_topleft
 *  Pointer to the top-left neighbour pixel. Overloaded: also used as an
 *  availability indicator (NULL means top-left not available) in the
 *  left-filtering branch below.
 *
 * @param[in] pu1_top
 *  Pointer to the 16 top / top-right neighbour pixels.
 *
 * @param[out] pu1_dst
 *  Filtered reference array (25 bytes, layout described above).
 *
 * @param[in] left_strd
 *  Stride between successive left-neighbour pixels.
 *
 * @param[in] ngbr_avail
 *  Bitmask of neighbour availability (left/top/top-left/top-right).
 *
 * @returns none
 *******************************************************************************
 */
void ih264_intra_pred_luma_8x8_mode_ref_filtering(UWORD8 *pu1_left,
                                                  UWORD8 *pu1_topleft,
                                                  UWORD8 *pu1_top,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 left_strd,
                                                  WORD32 ngbr_avail)
{
    WORD32 top_avail, left_avail, top_left_avail, top_right_avail;

    left_avail = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK);
    top_avail = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK);
    top_left_avail = BOOLEAN(ngbr_avail & TOP_LEFT_MB_AVAILABLE_MASK);
    top_right_avail = BOOLEAN(ngbr_avail & TOP_RIGHT_MB_AVAILABLE_MASK);

    if(top_avail)
    {
        WORD32 i;
        UWORD32 u4_xm1;

        /* Stage top-right samples at pu1_dst[17..24]; if unavailable they
         * are substituted by the last top pixel (sec 8.3.2.1). */
        if(!top_right_avail)
        {
            memset(pu1_dst + 8 + 1 + 8, pu1_top[7], 8);
            top_right_avail = 1;
        }
        else
        {
            memcpy(pu1_dst + 8 + 1 + 8, pu1_top + 8, 8);
        }

        /* First top sample: 3-tap when top-left exists, 2-tap otherwise */
        if(top_left_avail)
        {
            pu1_dst[8 + 1 + 0] = FILT121((*pu1_topleft), pu1_top[0],
                                         pu1_top[1]);

        }
        else
        {
            pu1_dst[8 + 1] = ((3 * pu1_top[0]) + pu1_top[1] + 2) >> 2;
        }

        for(i = 1; i <= 6; i++)
        {
            pu1_dst[8 + 1 + i] = FILT121(pu1_top[i - 1], pu1_top[i],
                                         pu1_top[i + 1]);

        }
        /* First byte of Top Right input is in pu1_dst[8 + 1 + 8]*/
        pu1_dst[8 + 1 + 7] = FILT121(pu1_top[6], pu1_top[7],
                                     pu1_dst[8 + 1 + 8]);

        /* filtered output and source in same buf, to prevent output(x - 1)
           being over written in process */
        u4_xm1 = pu1_top[7];

        for(i = 8; i <= 14; i++)
        {
            UWORD32 u4_x;
            u4_x = (u4_xm1 + (pu1_dst[8 + 1 + i] << 1) + pu1_dst[8 + 1 + i + 1]
                            + 2) >> 2;
            /* assigning u4_xm1 from the un-filtered values for the next iteration */
            u4_xm1 = pu1_dst[8 + 1 + i];
            pu1_dst[8 + 1 + i] = u4_x;
        }

        /* Last top-right sample: right edge extended by repetition */
        pu1_dst[8 + 1 + 15] = (u4_xm1 + (3 * pu1_dst[8 + 1 + 15]) + 2) >> 2;

    }

    /* pu1_topleft is overloaded. It is both: */
    /* a. A pointer for the top left pixel */
    /* b. An indicator of availability of top left. */
    /* If it is null then top left not available */
    if(top_left_avail)
    {
        /* Top-left sample filtered with whichever neighbours exist */
        if((!top_avail) || (!left_avail))
        {
            if(top_avail)
                pu1_dst[8] = (3 * pu1_topleft[0] + pu1_top[0] + 2) >> 2;
            else if(left_avail)
                pu1_dst[8] = (3 * pu1_topleft[0] + pu1_left[0] + 2) >> 2;
        }
        else
        {
            pu1_dst[8] = FILT121(pu1_top[0], (*pu1_topleft), pu1_left[0]);
        }
    }

    if(left_avail)
    {
        UWORD32 idx;
        /* NOTE(review): availability is tested via the pointer here, not via
         * top_left_avail — see the overloading comment above. */
        if(0 != pu1_topleft)
        {
            pu1_dst[7] = FILT121((*pu1_topleft), pu1_left[0],
                                 pu1_left[left_strd]);
        }
        else
        {
            pu1_dst[7] = ((3 * pu1_left[0]) + pu1_left[left_strd] + 2) >> 2;
        }

        /* Left column is stored bottom-to-top in pu1_dst[0..7] */
        for(idx = 1; idx <= 6; idx++)
        {
            pu1_dst[7 - idx] = FILT121(pu1_left[(idx - 1) * left_strd],
                                       pu1_left[idx * left_strd],
                                       pu1_left[(idx + 1) * left_strd]);

        }
        pu1_dst[0] = (pu1_left[6 * left_strd] + 3 * pu1_left[7 * left_strd] + 2)
                        >> 2;

    }
}
******************************************************************************* + */ +void ih264_intra_pred_luma_8x8_mode_vert(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_top = NULL; + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_top = pu1_src + BLK8x8SIZE + 1; + + memcpy(pu1_dst, pu1_top, 8); + memcpy(pu1_dst + dst_strd, pu1_top, 8); + memcpy(pu1_dst + 2 * dst_strd, pu1_top, 8); + memcpy(pu1_dst + 3 * dst_strd, pu1_top, 8); + memcpy(pu1_dst + 4 * dst_strd, pu1_top, 8); + memcpy(pu1_dst + 5 * dst_strd, pu1_top, 8); + memcpy(pu1_dst + 6 * dst_strd, pu1_top, 8); + memcpy(pu1_dst + 7 * dst_strd, pu1_top, 8); +} + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_8x8_mode_horz + * + * @brief + * Perform Intra prediction for luma_8x8 mode:horizontal + * + * @par Description: + * Perform Intra prediction for luma_8x8 mode:horizontal ,described in sec 8.3.2.2.2 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ + +void ih264_intra_pred_luma_8x8_mode_horz(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_left = pu1_src + BLK8x8SIZE - 1; + UNUSED(src_strd); + UNUSED(ngbr_avail); + memset(pu1_dst, *pu1_left, 8); + memset(pu1_dst + dst_strd, *(pu1_left - 1), 8); + memset(pu1_dst + 2 * dst_strd, *(pu1_left - 2), 8); + memset(pu1_dst + 3 * dst_strd, *(pu1_left - 3), 8); + memset(pu1_dst + 4 * dst_strd, *(pu1_left - 4), 8); + memset(pu1_dst + 5 * dst_strd, *(pu1_left - 5), 
8); + memset(pu1_dst + 6 * dst_strd, *(pu1_left - 6), 8); + memset(pu1_dst + 7 * dst_strd, *(pu1_left - 7), 8); +} + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_8x8_mode_dc + * + * @brief + * Perform Intra prediction for luma_8x8 mode:DC + * + * @par Description: + * Perform Intra prediction for luma_8x8 mode:DC ,described in sec 8.3.2.2.4 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_8x8_mode_dc(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 u1_useleft; /* availability of left predictors (only for DC) */ + UWORD8 u1_usetop; /* availability of top predictors (only for DC) */ + UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + WORD32 row; + WORD32 val = 0; + UNUSED(src_strd); + + u1_useleft = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK); + u1_usetop = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK); + pu1_top = pu1_src + BLK8x8SIZE + 1; + pu1_left = pu1_src + BLK8x8SIZE - 1; + + if(u1_useleft) + { + for(row = 0; row < BLK8x8SIZE; row++) + val += *(pu1_left - row); + val += 4; + } + if(u1_usetop) + { + for(row = 0; row < BLK8x8SIZE; row++) + val += *(pu1_top + row); + val += 4; + } + + /* Since 4 is added if either left/top pred is there, + val still being zero implies both preds are not there */ + val = (val) ? 
(val >> (2 + u1_useleft + u1_usetop)) : 128; + + memset(pu1_dst, val, 8); + memset(pu1_dst + dst_strd, val, 8); + memset(pu1_dst + 2 * dst_strd, val, 8); + memset(pu1_dst + 3 * dst_strd, val, 8); + memset(pu1_dst + 4 * dst_strd, val, 8); + memset(pu1_dst + 5 * dst_strd, val, 8); + memset(pu1_dst + 6 * dst_strd, val, 8); + memset(pu1_dst + 7 * dst_strd, val, 8); +} + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_8x8_mode_diag_dl + * + * @brief + * Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left + * + * @par Description: + * Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left ,described in sec 8.3.2.2.5 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_8x8_mode_diag_dl(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + UWORD32 ui4_a, ui4_b, ui4_c, ui4_d, ui4_e, ui4_f, ui4_g, ui4_h; + UWORD32 ui4_i, ui4_j, ui4_k, ui4_l, ui4_m, ui4_n, ui4_o, ui4_p; + UWORD8 predicted_pixels[15]; + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_top = pu1_src + BLK8x8SIZE + 1; + + ui4_a = *pu1_top++; + ui4_b = *pu1_top++; + ui4_c = *pu1_top++; + ui4_d = *pu1_top++; + ui4_e = *pu1_top++; + ui4_f = *pu1_top++; + ui4_g = *pu1_top++; + ui4_h = *pu1_top++; + ui4_i = *pu1_top++; + ui4_j = *pu1_top++; + ui4_k = *pu1_top++; + ui4_l = *pu1_top++; + ui4_m = *pu1_top++; + ui4_n = *pu1_top++; + ui4_o = *pu1_top++; + ui4_p = *pu1_top; + + predicted_pixels[0] = 
FILT121(ui4_a, ui4_b, ui4_c); + predicted_pixels[1] = FILT121(ui4_b, ui4_c, ui4_d); + predicted_pixels[2] = FILT121(ui4_c, ui4_d, ui4_e); + predicted_pixels[3] = FILT121(ui4_d, ui4_e, ui4_f); + predicted_pixels[4] = FILT121(ui4_e, ui4_f, ui4_g); + predicted_pixels[5] = FILT121(ui4_f, ui4_g, ui4_h); + predicted_pixels[6] = FILT121(ui4_g, ui4_h, ui4_i); + predicted_pixels[7] = FILT121(ui4_h, ui4_i, ui4_j); + predicted_pixels[8] = FILT121(ui4_i, ui4_j, ui4_k); + predicted_pixels[9] = FILT121(ui4_j, ui4_k, ui4_l); + predicted_pixels[10] = FILT121(ui4_k, ui4_l, ui4_m); + predicted_pixels[11] = FILT121(ui4_l, ui4_m, ui4_n); + predicted_pixels[12] = FILT121(ui4_m, ui4_n, ui4_o); + predicted_pixels[13] = FILT121(ui4_n, ui4_o, ui4_p); + predicted_pixels[14] = FILT121(ui4_o, ui4_p, ui4_p); + + memcpy(pu1_dst, predicted_pixels, 8); + memcpy(pu1_dst + dst_strd, predicted_pixels + 1, 8); + memcpy(pu1_dst + 2 * dst_strd, predicted_pixels + 2, 8); + memcpy(pu1_dst + 3 * dst_strd, predicted_pixels + 3, 8); + memcpy(pu1_dst + 4 * dst_strd, predicted_pixels + 4, 8); + memcpy(pu1_dst + 5 * dst_strd, predicted_pixels + 5, 8); + memcpy(pu1_dst + 6 * dst_strd, predicted_pixels + 6, 8); + memcpy(pu1_dst + 7 * dst_strd, predicted_pixels + 7, 8); +} + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_8x8_mode_diag_dr + * + * @brief + * Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right + * + * @par Description: + * Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right ,described in sec 8.3.2.2.6 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + 
*******************************************************************************/ +void ih264_intra_pred_luma_8x8_mode_diag_dr(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + UWORD8 *pu1_topleft = NULL; /* Pointer to start of top left predictors */ + UWORD32 ui4_a; + UWORD32 ui4_b, ui4_c, ui4_d, ui4_e, ui4_f, ui4_g, ui4_h, ui4_i; + UWORD32 ui4_j, ui4_k, ui4_l, ui4_m, ui4_n, ui4_o, ui4_p, ui4_q; + UWORD8 predicted_pixels[15]; + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_top = pu1_src + BLK8x8SIZE + 1; + pu1_left = pu1_src + BLK8x8SIZE - 1; + pu1_topleft = pu1_src + BLK8x8SIZE; + + ui4_a = *pu1_topleft; + ui4_b = *pu1_top++; + ui4_c = *pu1_top++; + ui4_d = *pu1_top++; + ui4_e = *pu1_top++; + ui4_f = *pu1_top++; + ui4_g = *pu1_top++; + ui4_h = *pu1_top++; + ui4_i = *pu1_top; + ui4_j = *pu1_left--; + ui4_k = *pu1_left--; + ui4_l = *pu1_left--; + ui4_m = *pu1_left--; + ui4_n = *pu1_left--; + ui4_o = *pu1_left--; + ui4_p = *pu1_left--; + ui4_q = *pu1_left; + + predicted_pixels[6] = FILT121(ui4_a, ui4_j, ui4_k); + predicted_pixels[5] = FILT121(ui4_j, ui4_k, ui4_l); + predicted_pixels[4] = FILT121(ui4_k, ui4_l, ui4_m); + predicted_pixels[3] = FILT121(ui4_l, ui4_m, ui4_n); + predicted_pixels[2] = FILT121(ui4_m, ui4_n, ui4_o); + predicted_pixels[1] = FILT121(ui4_n, ui4_o, ui4_p); + predicted_pixels[0] = FILT121(ui4_o, ui4_p, ui4_q); + predicted_pixels[7] = FILT121(ui4_b, ui4_a, ui4_j); + predicted_pixels[8] = FILT121(ui4_a, ui4_b, ui4_c); + predicted_pixels[9] = FILT121(ui4_b, ui4_c, ui4_d); + predicted_pixels[10] = FILT121(ui4_c, ui4_d, ui4_e); + predicted_pixels[11] = FILT121(ui4_d, ui4_e, ui4_f); + predicted_pixels[12] = FILT121(ui4_e, ui4_f, ui4_g); + predicted_pixels[13] = FILT121(ui4_f, ui4_g, ui4_h); + predicted_pixels[14] = FILT121(ui4_g, ui4_h, ui4_i); + + memcpy(pu1_dst, 
predicted_pixels + 7, 8); + memcpy(pu1_dst + dst_strd, predicted_pixels + 6, 8); + memcpy(pu1_dst + 2 * dst_strd, predicted_pixels + 5, 8); + memcpy(pu1_dst + 3 * dst_strd, predicted_pixels + 4, 8); + memcpy(pu1_dst + 4 * dst_strd, predicted_pixels + 3, 8); + memcpy(pu1_dst + 5 * dst_strd, predicted_pixels + 2, 8); + memcpy(pu1_dst + 6 * dst_strd, predicted_pixels + 1, 8); + memcpy(pu1_dst + 7 * dst_strd, predicted_pixels, 8); +} + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_8x8_mode_vert_r + * + * @brief + * Perform Intra prediction for luma_8x8 mode:Vertical_Right + * + * @par Description: + * Perform Intra prediction for luma_8x8 mode:Vertical_Right ,described in sec 8.3.2.2.7 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_8x8_mode_vert_r(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + UWORD8 *pu1_topleft = NULL; /* Pointer to start of top left predictors */ + UWORD32 ui4_a; + UWORD32 ui4_b, ui4_c, ui4_d, ui4_e, ui4_f, ui4_g, ui4_h, ui4_i; + UWORD32 ui4_j, ui4_k, ui4_l, ui4_m, ui4_n, ui4_o, ui4_p; + UWORD8 predicted_pixels[22]; + + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_top = pu1_src + BLK8x8SIZE + 1; + pu1_left = pu1_src + BLK8x8SIZE - 1; + pu1_topleft = pu1_src + BLK8x8SIZE; + + ui4_a = *pu1_topleft; + + ui4_b = *pu1_top++; + ui4_c = *pu1_top++; + ui4_d = 
*pu1_top++; + ui4_e = *pu1_top++; + ui4_f = *pu1_top++; + ui4_g = *pu1_top++; + ui4_h = *pu1_top++; + ui4_i = *pu1_top; + ui4_j = *pu1_left--; + ui4_k = *pu1_left--; + ui4_l = *pu1_left--; + ui4_m = *pu1_left--; + ui4_n = *pu1_left--; + ui4_o = *pu1_left--; + ui4_p = *pu1_left--; + + predicted_pixels[0] = FILT121(ui4_o, ui4_n, ui4_m); + predicted_pixels[1] = FILT121(ui4_m, ui4_l, ui4_k); + predicted_pixels[2] = FILT121(ui4_k, ui4_j, ui4_a); + predicted_pixels[3] = FILT11(ui4_a, ui4_b); + predicted_pixels[4] = FILT11(ui4_b, ui4_c); + predicted_pixels[5] = FILT11(ui4_c, ui4_d); + predicted_pixels[6] = FILT11(ui4_d, ui4_e); + predicted_pixels[7] = FILT11(ui4_e, ui4_f); + predicted_pixels[8] = FILT11(ui4_f, ui4_g); + predicted_pixels[9] = FILT11(ui4_g, ui4_h); + predicted_pixels[10] = FILT11(ui4_h, ui4_i); + predicted_pixels[11] = FILT121(ui4_p, ui4_o, ui4_n); + predicted_pixels[12] = FILT121(ui4_n, ui4_m, ui4_l); + predicted_pixels[13] = FILT121(ui4_l, ui4_k, ui4_j); + predicted_pixels[14] = FILT121(ui4_b, ui4_a, ui4_j); + predicted_pixels[15] = FILT121(ui4_a, ui4_b, ui4_c); + predicted_pixels[16] = FILT121(ui4_b, ui4_c, ui4_d); + predicted_pixels[17] = FILT121(ui4_c, ui4_d, ui4_e); + predicted_pixels[18] = FILT121(ui4_d, ui4_e, ui4_f); + predicted_pixels[19] = FILT121(ui4_e, ui4_f, ui4_g); + predicted_pixels[20] = FILT121(ui4_f, ui4_g, ui4_h); + predicted_pixels[21] = FILT121(ui4_g, ui4_h, ui4_i); + + memcpy(pu1_dst, predicted_pixels + 3, 8); + memcpy(pu1_dst + 1 * dst_strd, predicted_pixels + 14, 8); + memcpy(pu1_dst + 2 * dst_strd, predicted_pixels + 2, 8); + memcpy(pu1_dst + 3 * dst_strd, predicted_pixels + 13, 8); + memcpy(pu1_dst + 4 * dst_strd, predicted_pixels + 1, 8); + memcpy(pu1_dst + 5 * dst_strd, predicted_pixels + 12, 8); + memcpy(pu1_dst + 6 * dst_strd, predicted_pixels, 8); + memcpy(pu1_dst + 7 * dst_strd, predicted_pixels + 11, 8); + +} + +/* + ******************************************************************************* + * + 
*ih264_intra_pred_luma_8x8_mode_horz_d + * + * @brief + * Perform Intra prediction for luma_8x8 mode:Horizontal_Down + * + * @par Description: + * Perform Intra prediction for luma_8x8 mode:Horizontal_Down ,described in sec 8.3.2.2.8 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ + +void ih264_intra_pred_luma_8x8_mode_horz_d(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + UWORD8 *pu1_topleft = NULL; /* Pointer to start of top left predictors */ + UWORD32 ui4_a; + UWORD32 ui4_b, ui4_c, ui4_d, ui4_e, ui4_f, ui4_g, ui4_h, ui4_i; + UWORD32 ui4_j, ui4_k, ui4_l, ui4_m, ui4_n, ui4_o, ui4_p; + UWORD8 predicted_pixels[22]; + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_top = pu1_src + BLK8x8SIZE + 1; + pu1_left = pu1_src + BLK8x8SIZE - 1; + pu1_topleft = pu1_src + BLK8x8SIZE; + + ui4_a = *pu1_topleft; + ui4_j = *pu1_top++; + ui4_k = *pu1_top++; + ui4_l = *pu1_top++; + ui4_m = *pu1_top++; + ui4_n = *pu1_top++; + ui4_o = *pu1_top++; + ui4_p = *pu1_top++; + ui4_b = *pu1_left--; + ui4_c = *pu1_left--; + ui4_d = *pu1_left--; + ui4_e = *pu1_left--; + ui4_f = *pu1_left--; + ui4_g = *pu1_left--; + ui4_h = *pu1_left--; + ui4_i = *pu1_left; + + predicted_pixels[0] = FILT11(ui4_h, ui4_i); + predicted_pixels[1] = FILT121(ui4_g, ui4_h, ui4_i); + predicted_pixels[2] = FILT11(ui4_g, ui4_h); + predicted_pixels[3] = FILT121(ui4_f, ui4_g, ui4_h); + predicted_pixels[4] = FILT11(ui4_f, ui4_g); + 
predicted_pixels[5] = FILT121(ui4_e, ui4_f, ui4_g); + predicted_pixels[6] = FILT11(ui4_e, ui4_f); + predicted_pixels[7] = FILT121(ui4_d, ui4_e, ui4_f); + predicted_pixels[8] = FILT11(ui4_d, ui4_e); + predicted_pixels[9] = FILT121(ui4_c, ui4_d, ui4_e); + predicted_pixels[10] = FILT11(ui4_c, ui4_d); + predicted_pixels[11] = FILT121(ui4_b, ui4_c, ui4_d); + predicted_pixels[12] = FILT11(ui4_b, ui4_c); + predicted_pixels[13] = FILT121(ui4_a, ui4_b, ui4_c); + predicted_pixels[14] = FILT11(ui4_a, ui4_b); + predicted_pixels[15] = FILT121(ui4_j, ui4_a, ui4_b); + predicted_pixels[16] = FILT121(ui4_k, ui4_j, ui4_a); + predicted_pixels[17] = FILT121(ui4_l, ui4_k, ui4_j); + predicted_pixels[18] = FILT121(ui4_m, ui4_l, ui4_k); + predicted_pixels[19] = FILT121(ui4_n, ui4_m, ui4_l); + predicted_pixels[20] = FILT121(ui4_o, ui4_n, ui4_m); + predicted_pixels[21] = FILT121(ui4_p, ui4_o, ui4_n); + + memcpy(pu1_dst, predicted_pixels + 14, 8); + memcpy(pu1_dst + dst_strd, predicted_pixels + 12, 8); + memcpy(pu1_dst + 2 * dst_strd, predicted_pixels + 10, 8); + memcpy(pu1_dst + 3 * dst_strd, predicted_pixels + 8, 8); + memcpy(pu1_dst + 4 * dst_strd, predicted_pixels + 6, 8); + memcpy(pu1_dst + 5 * dst_strd, predicted_pixels + 4, 8); + memcpy(pu1_dst + 6 * dst_strd, predicted_pixels + 2, 8); + memcpy(pu1_dst + 7 * dst_strd, predicted_pixels, 8); +} + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_8x8_mode_vert_l + * + * @brief + * Perform Intra prediction for luma_8x8 mode:Vertical_Left + * + * @par Description: + * Perform Intra prediction for luma_8x8 mode:Vertical_Left ,described in sec 8.3.2.2.9 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this 
function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ + +void ih264_intra_pred_luma_8x8_mode_vert_l(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + UWORD32 ui4_a, ui4_b, ui4_c, ui4_d, ui4_e, ui4_f, ui4_g, ui4_h; + UWORD32 ui4_i, ui4_j, ui4_k, ui4_l, ui4_m; + UWORD8 predicted_pixels[22]; + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_top = pu1_src + BLK8x8SIZE + 1; + + ui4_a = *pu1_top++; + ui4_b = *pu1_top++; + ui4_c = *pu1_top++; + ui4_d = *pu1_top++; + ui4_e = *pu1_top++; + ui4_f = *pu1_top++; + ui4_g = *pu1_top++; + ui4_h = *pu1_top++; + ui4_i = *pu1_top++; + ui4_j = *pu1_top++; + ui4_k = *pu1_top++; + ui4_l = *pu1_top++; + ui4_m = *pu1_top++; + + predicted_pixels[0] = FILT11(ui4_a, ui4_b); + predicted_pixels[1] = FILT11(ui4_b, ui4_c); + predicted_pixels[2] = FILT11(ui4_c, ui4_d); + predicted_pixels[3] = FILT11(ui4_d, ui4_e); + predicted_pixels[4] = FILT11(ui4_e, ui4_f); + predicted_pixels[5] = FILT11(ui4_f, ui4_g); + predicted_pixels[6] = FILT11(ui4_g, ui4_h); + predicted_pixels[7] = FILT11(ui4_h, ui4_i); + predicted_pixels[8] = FILT11(ui4_i, ui4_j); + predicted_pixels[9] = FILT11(ui4_j, ui4_k); + predicted_pixels[10] = FILT11(ui4_k, ui4_l); + predicted_pixels[11] = FILT121(ui4_a, ui4_b, ui4_c); + predicted_pixels[12] = FILT121(ui4_b, ui4_c, ui4_d); + predicted_pixels[13] = FILT121(ui4_c, ui4_d, ui4_e); + predicted_pixels[14] = FILT121(ui4_d, ui4_e, ui4_f); + predicted_pixels[15] = FILT121(ui4_e, ui4_f, ui4_g); + predicted_pixels[16] = FILT121(ui4_f, ui4_g, ui4_h); + predicted_pixels[17] = FILT121(ui4_g, ui4_h, ui4_i); + predicted_pixels[18] = FILT121(ui4_h, ui4_i, ui4_j); + predicted_pixels[19] = FILT121(ui4_i, ui4_j, ui4_k); + predicted_pixels[20] = FILT121(ui4_j, ui4_k, ui4_l); + predicted_pixels[21] = FILT121(ui4_k, ui4_l, ui4_m); + + memcpy(pu1_dst, 
predicted_pixels, 8); + memcpy(pu1_dst + 2 * dst_strd, predicted_pixels + 1, 8); + memcpy(pu1_dst + 4 * dst_strd, predicted_pixels + 2, 8); + memcpy(pu1_dst + 6 * dst_strd, predicted_pixels + 3, 8); + memcpy(pu1_dst + 1 * dst_strd, predicted_pixels + 11, 8); + memcpy(pu1_dst + 3 * dst_strd, predicted_pixels + 12, 8); + memcpy(pu1_dst + 5 * dst_strd, predicted_pixels + 13, 8); + memcpy(pu1_dst + 7 * dst_strd, predicted_pixels + 14, 8); +} + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_8x8_mode_horz_u + * + * @brief + * Perform Intra prediction for luma_8x8 mode:Horizontal_Up + * + * @par Description: + * Perform Intra prediction for luma_8x8 mode:Horizontal_Up ,described in sec 8.3.2.2.10 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ + +void ih264_intra_pred_luma_8x8_mode_horz_u(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) + +{ + UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ + UWORD32 ui4_j, ui4_k, ui4_l, ui4_m, ui4_n, ui4_o, ui4_p, ui4_q; + UWORD8 predicted_pixels[22]; + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_left = pu1_src + BLK8x8SIZE - 1; + + ui4_j = *pu1_left--; + ui4_k = *pu1_left--; + ui4_l = *pu1_left--; + ui4_m = *pu1_left--; + ui4_n = *pu1_left--; + ui4_o = *pu1_left--; + ui4_p = *pu1_left--; + ui4_q = *pu1_left; + + pu1_left = pu1_src + BLK8x8SIZE - 1; + + predicted_pixels[0] = FILT11(ui4_j, ui4_k); + predicted_pixels[1] = FILT121(ui4_j, ui4_k, ui4_l); + predicted_pixels[2] = FILT11(ui4_k, ui4_l); 
+ predicted_pixels[3] = FILT121(ui4_k, ui4_l, ui4_m); + predicted_pixels[4] = FILT11(ui4_l, ui4_m); + predicted_pixels[5] = FILT121(ui4_l, ui4_m, ui4_n); + predicted_pixels[6] = FILT11(ui4_m, ui4_n); + predicted_pixels[7] = FILT121(ui4_m, ui4_n, ui4_o); + predicted_pixels[8] = FILT11(ui4_n, ui4_o); + predicted_pixels[9] = FILT121(ui4_n, ui4_o, ui4_p); + predicted_pixels[10] = FILT11(ui4_o, ui4_p); + predicted_pixels[11] = FILT121(ui4_o, ui4_p, ui4_q); + predicted_pixels[12] = FILT11(ui4_p, ui4_q); + predicted_pixels[13] = FILT121(ui4_p, ui4_q, ui4_q); + memset(predicted_pixels+14,ui4_q,8); + + memcpy(pu1_dst, predicted_pixels, 8); + memcpy(pu1_dst + 1 * dst_strd, predicted_pixels + 2, 8); + memcpy(pu1_dst + 2 * dst_strd, predicted_pixels + 4, 8); + memcpy(pu1_dst + 3 * dst_strd, predicted_pixels + 6, 8); + memcpy(pu1_dst + 4 * dst_strd, predicted_pixels + 8, 8); + memcpy(pu1_dst + 5 * dst_strd, predicted_pixels + 10, 8); + memcpy(pu1_dst + 6 * dst_strd, predicted_pixels + 12, 8); + memcpy(pu1_dst + 7 * dst_strd, predicted_pixels + 14, 8); +} + + +/******************* 16x16 Modes *******************/ + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_16x16_mode_vert + * + * @brief + * Perform Intra prediction for luma_16x16 mode:Vertical + * + * @par Description: + * Perform Intra prediction for luma_16x16 mode:Vertical, described in sec 8.3.3.1 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels (Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ + +void ih264_intra_pred_luma_16x16_mode_vert(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 
src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + WORD32 rows; /* loop variables*/ + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_top = pu1_src + MB_SIZE + 1; + + for(rows = 0; rows < 16; rows += 4, pu1_dst += dst_strd) + { + memcpy(pu1_dst, pu1_top, 16); + pu1_dst += dst_strd; + memcpy(pu1_dst, pu1_top, 16); + pu1_dst += dst_strd; + memcpy(pu1_dst, pu1_top, 16); + pu1_dst += dst_strd; + memcpy(pu1_dst, pu1_top, 16); + } +} + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_16x16_mode_horz + * + * @brief + * Perform Intra prediction for luma_16x16 mode:Horizontal + * + * @par Description: + * Perform Intra prediction for luma_16x16 mode:Horizontal, described in sec 8.3.3.2 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ + +void ih264_intra_pred_luma_16x16_mode_horz(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_left = NULL; /* Pointer to start of top predictors */ + WORD32 rows; + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_left = pu1_src + MB_SIZE - 1; + + for(rows = 0; rows < 16; rows += 4, pu1_dst += dst_strd, pu1_left --) + { + memset(pu1_dst, *pu1_left, 16); /* copy the left value to the entire row*/ + pu1_left --; + pu1_dst += dst_strd; + memset(pu1_dst, *pu1_left, 16); + pu1_left --; + pu1_dst += dst_strd; + memset(pu1_dst, *pu1_left, 16); + pu1_left --; + pu1_dst += dst_strd; + memset(pu1_dst, *pu1_left, 16); + } +} + +/** + 
******************************************************************************* + * + *ih264_intra_pred_luma_16x16_mode_dc + * + * @brief + * Perform Intra prediction for luma_16x16 mode:DC + * + * @par Description: + * Perform Intra prediction for luma_16x16 mode:DC, described in sec 8.3.3.3 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + ** @param[in] ngbr_avail + * availability of neighbouring pixels + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ + +void ih264_intra_pred_luma_16x16_mode_dc(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + WORD8 u1_useleft; /* availability of left predictors (only for DC) */ + UWORD8 u1_usetop; /* availability of top predictors (only for DC) */ + UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + WORD32 rows; /* loop variables*/ + WORD32 val = 0; + UNUSED(src_strd); + + u1_useleft = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK); + u1_usetop = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK); + pu1_top = pu1_src + MB_SIZE + 1; + pu1_left = pu1_src + MB_SIZE - 1; + if(u1_useleft) + { + for(rows = 0; rows < 16; rows++) + val += *(pu1_left - rows); + val += 8; + } + if(u1_usetop) + { + for(rows = 0; rows < 16; rows++) + val += *(pu1_top + rows); + val += 8; + } + /* Since 8 is added if either left/top pred is there, + val still being zero implies both preds are not there */ + val = (val) ? 
(val >> (3 + u1_useleft + u1_usetop)) : 128; + + for(rows = 0; rows < 16; rows += 4, pu1_dst += dst_strd) + { + memset(pu1_dst, val, 16); + pu1_dst += dst_strd; + memset(pu1_dst, val, 16); + pu1_dst += dst_strd; + memset(pu1_dst, val, 16); + pu1_dst += dst_strd; + memset(pu1_dst, val, 16); + } +} + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_16x16_mode_plane + * + * @brief + * Perform Intra prediction for luma_16x16 mode:PLANE + * + * @par Description: + * Perform Intra prediction for luma_16x16 mode:PLANE, described in sec 8.3.3.4 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ + +void ih264_intra_pred_luma_16x16_mode_plane(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + /*! Written with no multiplications */ + UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + UWORD8 *pu1_topleft = NULL; + WORD32 a, b, c, tmp; + UWORD8 *pu1_tmp1, *pu1_tmp2; + WORD32 shift; + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_top = pu1_src + MB_SIZE + 1; + pu1_left = pu1_src + MB_SIZE - 1; + pu1_topleft = pu1_src + MB_SIZE; + + { + a = (*(pu1_top + 15) + *(pu1_left - 15)) << 4; + + /*! 
Implement Sum(x*(P((x+7),-1) - P((x-7),-1))) x=1...8 */ + pu1_tmp1 = pu1_top + 8; + pu1_tmp2 = pu1_tmp1 - 2; + + /* Pixel diffs are only 9 bits; + so sign extension allows shifts to be used even for signed */ + b = ((*pu1_tmp1++) - (*pu1_tmp2--)); /* x=1 */ + b += ((*pu1_tmp1++) - (*pu1_tmp2--)) << 1; /* x=2 */ + tmp = ((*pu1_tmp1++) - (*pu1_tmp2--)); + b += (tmp << 1) + tmp; /* x=3 */ + b += ((*pu1_tmp1++) - (*pu1_tmp2--)) << 2; /* x=4 */ + + tmp = ((*pu1_tmp1++) - (*pu1_tmp2--)); + b += (tmp << 2) + tmp; /* x=5 */ + tmp = ((*pu1_tmp1++) - (*pu1_tmp2--)); + b += (tmp << 2) + (tmp << 1); /* x=6 */ + tmp = ((*pu1_tmp1++) - (*pu1_tmp2--)); + b += (tmp << 3) - tmp; /* x=7 */ + b += ((*pu1_tmp1) - (*pu1_topleft)) << 3; /* x=8 */ + + b = ((b << 2) + b + 32) >> 6; /*! (5*H + 32)>>6 */ + + /*! Implement Sum(y*(P(-1,(y+7)) - P(-1,(y-7)))) y=1...8 */ + pu1_tmp1 = pu1_left - 8; + pu1_tmp2 = pu1_tmp1 + 2; + + c = ((*pu1_tmp1) - (*pu1_tmp2)); /* y=1 */ + pu1_tmp1--; + pu1_tmp2++; + c += ((*pu1_tmp1) - (*pu1_tmp2)) << 1; /* y=2 */ + pu1_tmp1--; + pu1_tmp2++; + tmp = ((*pu1_tmp1) - (*pu1_tmp2)); + c += (tmp << 1) + tmp; /* y=3 */ + pu1_tmp1--; + pu1_tmp2++; + c += ((*pu1_tmp1) - (*pu1_tmp2)) << 2; /* y=4 */ + pu1_tmp1--; + pu1_tmp2++; + + tmp = ((*pu1_tmp1) - (*pu1_tmp2)); + c += (tmp << 2) + tmp; /* y=5 */ + pu1_tmp1--; + pu1_tmp2++; + tmp = ((*pu1_tmp1) - (*pu1_tmp2)); + c += (tmp << 2) + (tmp << 1); /* y=6 */ + pu1_tmp1--; + pu1_tmp2++; + tmp = ((*pu1_tmp1) - (*pu1_tmp2)); + c += (tmp << 3) - tmp; /* y=7 */ + pu1_tmp1--; //pu1_tmp2 ++; + /* Modified to get (-1,-1) location as *(pu1_top - 1) instead of (pu1_left - ui4_stride) */ + //c += ((*pu1_tmp1) - (*(pu1_top - 1)))<<3; /* y=8 */ + c += ((*pu1_tmp1) - (*pu1_topleft)) << 3; /* y=8 */ + + c = ((c << 2) + c + 32) >> 6; /*! (5*V + 32)>>32 */ + shift = 3; + } + + /*! 
Now from the plane parameters a, b, and c, + compute the fitted plane values over the block */ + { + WORD32 tmp1, tmpx, tmpx_init, j, i; + + tmpx_init = -(b << shift); /* -8b */ + tmp = a - (c << shift) + 16; /* a-((4or8)*c)+16 */ + for(i = 0; i < 16; i++) + { + tmp += c; /*increment every time by c to get c*(y-7or3)*/ + tmpx = tmpx_init; /* Init to -8b */ + for(j = 0; j < 16; j++) + { + tmpx += b; /* increment every time by b to get b*(x-7or3) */ + tmp1 = (tmp + tmpx) >> 5; + *pu1_dst++ = CLIP_U8(tmp1); + } + pu1_dst += (dst_strd - 16); + } + } +} diff --git a/common/ih264_macros.h b/common/ih264_macros.h new file mode 100755 index 0000000..6e4cb16 --- /dev/null +++ b/common/ih264_macros.h @@ -0,0 +1,110 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/********************************************************************************* +* @file +* ih264_macros.h +* +* @brief +* Macro definitions used in the codec +* +* @author +* Ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ +#ifndef _IH264_MACROS_H_ +#define _IH264_MACROS_H_ + +/*****************************************************************************/ +/* Function Macros */ +/*****************************************************************************/ +#define RETURN_IF(cond, retval) if(cond) {return (retval);} +#define UNUSED(x) ((void)(x)) + +#define ALIGN128(x) ((((x) + 127) >> 7) << 7) +#define ALIGN64(x) ((((x) + 63) >> 6) << 6) +#define ALIGN32(x) ((((x) + 31) >> 5) << 5) +#define ALIGN16(x) ((((x) + 15) >> 4) << 4) +#define ALIGN8(x) ((((x) + 7) >> 3) << 3) +#define ALIGN4(x) ((((x) + 3) >> 2) << 2) + + +/** +****************************************************************************** + * @brief Min, Max +****************************************************************************** + */ +#define MAX(a,b) ((a > b)?(a):(b)) +#define MIN(a,b) ((a < b)?(a):(b)) +#define MIN3(a,b,c) ((a) < (b)) ? (((a) < (c)) ? (a) : (c)) : (((b) < (c)) ? (b) : (c)) +#define MAX3(a,b,c) ((a) > (b)) ? (((a) > (c)) ? (a) : (c)) : (((b) > (c)) ? 
(b) : (c)) +/** +****************************************************************************** + * @brief Div, Mod +****************************************************************************** + */ +#define MOD(x,y) ((x)%(y)) +#define DIV(x,y) ((x)/(y)) + +/** +****************************************************************************** + * @brief Clip +****************************************************************************** + */ +#define CLIP3(miny, maxy, y) (((y) < (miny))?(miny):(((y) > (maxy))?(maxy):(y))) + +/** +****************************************************************************** + * @brief True, False +****************************************************************************** + */ +#define BOOLEAN(x) (!!(x)) + +/** +****************************************************************************** + * @brief Frequently used multiplications x2. x3, and x4 +****************************************************************************** + */ +#define X2(a) ((a) << 1) +#define X3(a) (((a) << 1) + (a)) +#define X4(a) ((a) << 2) + +/** +****************************************************************************** + * @brief Misc +****************************************************************************** + */ +#define ABS(x) ((x) < 0 ? (-(x)) : (x)) +#define SIGNXY(x,y) (((y) < 0) ? (-1 * (x)) : (x)) + +#define SIGN(x) (((x) >= 0) ? (((x) > 0) ? 
1 : 0) : -1) + +#define RESET_BIT(x, pos) (x) = (x) & ~(1 << pos); +#define SET_BIT(x, pos) (x) = (x) | (1 << pos); +#define GET_BIT(x, pos) ((x) >> (pos)) & 0x1 + +#define INSERT_BIT(x, pos, bit) { RESET_BIT(x, pos); (x) = (x) | (bit << pos); } +#endif /*_IH264_MACROS_H_*/ + + diff --git a/common/ih264_mem_fns.c b/common/ih264_mem_fns.c new file mode 100755 index 0000000..1c1f328 --- /dev/null +++ b/common/ih264_mem_fns.c @@ -0,0 +1,176 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_mem_fns.c + * + * @brief + * Functions used for memory operations + * + * @author + * Ittiam + * + * @par List of Functions: + * ih264_memcpy() + * ih264_memcpy_mul_8() + * ih264_memset() + * ih264_memset_mul_8() + * ih264_memset_16bit() + * ih264_memset_16bit_mul_8() + * + * @remarks + * None + * + ****************************************************************************** + */ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ +/* System include files */ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_mem_fns.h" + +/** + ******************************************************************************* + * + * @brief + * memcpy of a 8,16 or 32 bytes + * + * @par Description: + * Does memcpy of 8bit data from source to destination for 8,16 or 32 number of bytes + * + * @param[in] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[in] num_bytes + * number of bytes to copy + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ + +void ih264_memcpy(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes) +{ + memcpy(pu1_dst, pu1_src, num_bytes); +} + + +void ih264_memcpy_mul_8(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes) +{ + memcpy(pu1_dst, pu1_src, num_bytes); +} + +/** + ******************************************************************************* + * + * @brief + * memset of a 8,16 or 32 bytes + * + * @par Description: + * Does memset of 8bit data for 8,16 or 32 number of bytes + * + * @param[in] pu1_dst + * UWORD8 pointer to the 
destination + * + * @param[in] value + * UWORD8 value used for memset + * + * @param[in] num_bytes + * number of bytes to set + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ + +void ih264_memset(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes) +{ + memset(pu1_dst, value, num_bytes); +} + + +void ih264_memset_mul_8(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes) +{ + memset(pu1_dst, value, num_bytes); +} + +/** + ******************************************************************************* + * + * @brief + * memset of 16bit data of a 8,16 or 32 bytes + * + * @par Description: + * Does memset of 16bit data for 8,16 or 32 number of bytes + * + * @param[in] pu2_dst + * UWORD8 pointer to the destination + * + * @param[in] value + * UWORD16 value used for memset + * + * @param[in] num_words + * number of words to set + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ + +void ih264_memset_16bit(UWORD16 *pu2_dst, UWORD16 value, UWORD32 num_words) +{ + UWORD32 i; + for(i = 0; i < num_words; i++) + { + *pu2_dst++ = value; + } +} + +void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst, + UWORD16 value, + UWORD32 num_words) +{ + UWORD32 i; + for(i = 0; i < num_words; i++) + { + *pu2_dst++ = value; + } +} + diff --git a/common/ih264_mem_fns.h b/common/ih264_mem_fns.h new file mode 100755 index 0000000..e0167f4 --- /dev/null +++ b/common/ih264_mem_fns.h @@ -0,0 +1,126 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_mem_fns.h +* +* @brief +* Function declarations used for memory functions +* +* @author +* Ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ +#ifndef _IH264_MEM_FNS_H_ +#define _IH264_MEM_FNS_H_ + +typedef void ih264_memcpy_ft(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes); + +typedef void ih264_memcpy_mul_8_ft(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes); +/** + ******************************************************************************* + * + * @brief + * memset of a 8,16 or 32 bytes + * + * @par Description: + * Does memset of 8bit data for 8,16 or 32 number of bytes + * + * @param[in] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] value + * UWORD8 value used for memset + * + * @param[in] num_bytes + * number of bytes to set + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ +typedef void ih264_memset_ft(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes); + +typedef void ih264_memset_mul_8_ft(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes); + +/** + ******************************************************************************* + * + * @brief + * memset of 16bit data of a 8,16 or 32 bytes + * + * 
@par Description: + * Does memset of 16bit data for 8,16 or 32 number of bytes + * + * @param[in] pu2_dst + * UWORD8 pointer to the destination + * + * @param[in] value + * UWORD16 value used for memset + * + * @param[in] num_words + * number of words to set + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ +typedef void ih264_memset_16bit_ft(UWORD16 *pu2_dst, UWORD16 value, UWORD32 num_words); + +typedef void ih264_memset_16bit_mul_8_ft(UWORD16 *pu2_dst, UWORD16 value, UWORD32 num_words); + +/* C function declarations */ +ih264_memcpy_ft ih264_memcpy; +ih264_memcpy_mul_8_ft ih264_memcpy_mul_8; +ih264_memset_ft ih264_memset; +ih264_memset_mul_8_ft ih264_memset_mul_8; +ih264_memset_16bit_ft ih264_memset_16bit; +ih264_memset_16bit_mul_8_ft ih264_memset_16bit_mul_8; + +/* A9 Q function declarations */ +ih264_memcpy_ft ih264_memcpy_a9q; +ih264_memcpy_mul_8_ft ih264_memcpy_mul_8_a9q; +ih264_memset_ft ih264_memset_a9q; +ih264_memset_mul_8_ft ih264_memset_mul_8_a9q; +ih264_memset_16bit_ft ih264_memset_16bit_a9q; +ih264_memset_16bit_mul_8_ft ih264_memset_16bit_mul_8_a9q; + +/* AV8 function declarations */ +ih264_memcpy_ft ih264_memcpy_av8; +ih264_memcpy_mul_8_ft ih264_memcpy_mul_8_av8; +ih264_memset_ft ih264_memset_av8; +ih264_memset_mul_8_ft ih264_memset_mul_8_av8; +ih264_memset_16bit_ft ih264_memset_16bit_av8; +ih264_memset_16bit_mul_8_ft ih264_memset_16bit_mul_8_av8; + + +ih264_memcpy_mul_8_ft ih264_memcpy_mul_8_ssse3; +ih264_memset_mul_8_ft ih264_memset_mul_8_ssse3; +ih264_memset_16bit_mul_8_ft ih264_memset_16bit_mul_8_ssse3; +#endif //_MEM_FNS_H_ diff --git a/common/ih264_padding.c b/common/ih264_padding.c new file mode 100755 index 0000000..8e8f3e2 --- /dev/null +++ b/common/ih264_padding.c @@ -0,0 +1,331 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, 
Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264_padding.c +* +* @brief +* Contains function definitions for Padding +* +* @author +* Ittiam +* +* @par List of Functions: +* - ih264_pad_top() +* - ih264_pad_bottom() +* - ih264_pad_left_luma() +* - ih264_pad_left_chroma() +* - ih264_pad_right_luma() +* - ih264_pad_right_chroma() +* +* @remarks +* None +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stddef.h> +#include <string.h> + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_macros.h" +#include "ih264_padding.h" + + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief pad at the top of a 2d array +* +* @par Description: +* The top row of a 2d array is replicated for pad_size times at the top +* +* @param[in] 
pu1_src +* UWORD8 pointer to the source +* +* @param[in] src_strd +* integer source stride +* +* @param[in] wd +* integer width of the array +* +* @param[in] pad_size +* integer -padding size of the array +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264_pad_top(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 wd, + WORD32 pad_size) +{ + WORD32 row; + + for(row = 1; row <= pad_size; row++) + { + memcpy(pu1_src - row * src_strd, pu1_src, wd); + } +} + + + +/** +******************************************************************************* +* +* @brief pad at the bottom of a 2d array +* +* @par Description: +* The bottom row of a 2d array is replicated for pad_size times at the bottom +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[in] src_strd +* integer source stride +* +* @param[in] wd +* integer width of the array +* +* @param[in] pad_size +* integer -padding size of the array +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264_pad_bottom(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 wd, + WORD32 pad_size) +{ + WORD32 row; + + for(row = 1; row <= pad_size; row++) + { + memcpy(pu1_src + (row - 1) * src_strd, pu1_src - 1 * src_strd, wd); + } +} + +/** +******************************************************************************* +* +* @brief pad (luma block) at the left of a 2d array +* +* @par Description: +* The left column of a 2d array is replicated for pad_size times to the left +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[in] src_strd +* integer source stride +* +* @param[in] ht +* integer height of the array +* +* @param[in] pad_size +* integer -padding size of the array +* +* @returns none +* +* @remarks none +* +******************************************************************************* + */ +void ih264_pad_left_luma(UWORD8 *pu1_src, 
+ WORD32 src_strd, + WORD32 ht, + WORD32 pad_size) +{ + WORD32 row; + + for(row = 0; row < ht; row++) + { + + memset(pu1_src - pad_size, *pu1_src, pad_size); + + pu1_src += src_strd; + } +} + +/** +******************************************************************************* +* +* @brief pad (chroma block) at the left of a 2d array +* +* @par Description: +* The left column of a 2d array is replicated for pad_size times to the left +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[in] src_strd +* integer source stride +* +* @param[in] ht +* integer height of the array +* +* @param[in] pad_size +* integer -padding size of the array +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264_pad_left_chroma(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 ht, + WORD32 pad_size) +{ + /* temp var */ + WORD32 row, col; + UWORD16 u2_uv_val; + + /* pointer to src */ + UWORD16 *pu2_src = (UWORD16 *)pu1_src; + + src_strd >>= 1; + pad_size >>= 1; + + for(row = 0; row < ht; row++) + { + u2_uv_val = pu2_src[0]; + + for (col = -pad_size; col < 0; col++) + { + pu2_src[col] = u2_uv_val; + } + + pu2_src += src_strd; + } +} + +/** +******************************************************************************* +* +* @brief pad (luma block) at the right of a 2d array +* +* @par Description: +* The right column of a 2d array is replicated for pad_size times at the right +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[in] src_strd +* integer source stride +* +* @param[in] ht +* integer height of the array +* +* @param[in] pad_size +* integer -padding size of the array +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264_pad_right_luma(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 ht, + WORD32 pad_size) +{ + WORD32 row; + + for(row = 0; row < ht; row++) + { + memset(pu1_src, 
*(pu1_src -1), pad_size); + + pu1_src += src_strd; + } +} + +/** +******************************************************************************* +* +* @brief pad (chroma block) at the right of a 2d array +* +* @par Description: +* The right column of a 2d array is replicated for pad_size times at the right +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[in] src_strd +* integer source stride +* +* @param[in] ht +* integer height of the array +* +* @param[in] pad_size +* integer -padding size of the array +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264_pad_right_chroma(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 ht, + WORD32 pad_size) +{ + WORD32 row, col; + UWORD16 u2_uv_val; + UWORD16 *pu2_src = (UWORD16 *)pu1_src; + + src_strd >>= 1; + pad_size >>= 1; + + for(row = 0; row < ht; row++) + { + u2_uv_val = pu2_src[-1]; + + for (col = 0; col < pad_size; col++) + { + pu2_src[col] = u2_uv_val; + } + + pu2_src += src_strd; + } +} + diff --git a/common/ih264_padding.h b/common/ih264_padding.h new file mode 100755 index 0000000..e4e18fb --- /dev/null +++ b/common/ih264_padding.h @@ -0,0 +1,74 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264_padding.h +* +* @brief +* Declarations for padding functions +* +* @author +* Ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ +#ifndef _IH264_PADDING_H_ +#define _IH264_PADDING_H_ + +/*****************************************************************************/ +/* Function Declarations */ +/*****************************************************************************/ + +typedef void ih264_pad(UWORD8 *, WORD32, WORD32, WORD32); + +/* C function declarations */ +ih264_pad ih264_pad_top; +ih264_pad ih264_pad_bottom; +ih264_pad ih264_pad_left_luma; +ih264_pad ih264_pad_left_chroma; +ih264_pad ih264_pad_right_luma; +ih264_pad ih264_pad_right_chroma; + +/* A9 Q function declarations */ +ih264_pad ih264_pad_top_a9q; +ih264_pad ih264_pad_left_luma_a9q; +ih264_pad ih264_pad_left_chroma_a9q; +ih264_pad ih264_pad_right_luma_a9q; +ih264_pad ih264_pad_right_chroma_a9q; + +/* AV8 function declarations */ +ih264_pad ih264_pad_top_av8; +ih264_pad ih264_pad_left_luma_av8; +ih264_pad ih264_pad_left_chroma_av8; +ih264_pad ih264_pad_right_luma_av8; +ih264_pad ih264_pad_right_chroma_av8; + + +ih264_pad ih264_pad_left_luma_ssse3; +ih264_pad ih264_pad_left_chroma_ssse3; +ih264_pad ih264_pad_right_luma_ssse3; +ih264_pad ih264_pad_right_chroma_ssse3; + +#endif /*_IH264_PADDING_H_*/ diff --git a/common/ih264_resi_trans.h b/common/ih264_resi_trans.h new file mode 100755 index 0000000..ee0add3 --- /dev/null +++ b/common/ih264_resi_trans.h @@ -0,0 +1,70 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the 
"License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_resi_trans.h +* +* @brief +* Functions declarations for residue and forward transform +* +* @par List of Functions: +* - ih264_resi_trans_ft +* - ih264_resi_trans_4x4 +* - ih264_resi_trans_4x4 +* - ih264_resi_trans_4x4_a9 +* - ih264_resi_trans_4x4_a9 +* +* @author +* Ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef IH264_RESI_TRANS_H_ +#define IH264_RESI_TRANS_H_ + +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ + +typedef void ih264_resi_trans_ft(UWORD8 *pu1_src, + UWORD8 *pu1_pred, + WORD32 *pi4_out, + WORD32 src_strd, + WORD32 pred_strd, + WORD32 out_strd); + +/*C functions*/ + +ih264_resi_trans_ft ih264_resi_trans_4x4; + +ih264_resi_trans_ft ih264_resi_trans_8x8; + +/*A9 functions*/ + +ih264_resi_trans_ft ih264_resi_trans_4x4_a9; + +ih264_resi_trans_ft ih264_resi_trans_8x8_a9; + +#endif /* IH264_RESI_TRANS_H_ */ diff --git a/common/ih264_resi_trans_quant.c b/common/ih264_resi_trans_quant.c new file mode 100755 index 0000000..cf1d43c --- /dev/null +++ 
b/common/ih264_resi_trans_quant.c @@ -0,0 +1,814 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_resi_trans_quant.c + * + * @brief + * Contains function definitions single stage forward transform for H.264 + * It will calculate the residue, do the cf and then do quantization + * + * @author + * Ittiam + * + * @par List of Functions: + * - ih264_resi_trans_quant_4x4() + * - ih264_resi_trans_quant_chroma_4x4 + * - ih264_hadamard_quant_4x4 + * - ih264_hadamard_quant_2x2_uv + * - ih264_resi_trans_quant_8x8 + * + * @remarks + ******************************************************************************* + */ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stddef.h> + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_defs.h" +#include "ih264_size_defs.h" +#include "ih264_macros.h" +#include "ih264_trans_macros.h" +#include "ih264_trans_data.h" +#include 
"ih264_structs.h"
#include "ih264_trans_quant_itrans_iquant.h"

/**
 *******************************************************************************
 *
 * @brief
 *   This function performs forward transform and quantization on a 4*4 block
 *
 * @par Description:
 *   The function accepts source buffer and estimation buffer. From these, it
 *   computes the residue. This residue is then transformed and quantized.
 *   The transform and quantization are computed in place, using the residue
 *   buffer.
 *
 * @param[in] pu1_src
 *  Pointer to source sub-block
 *
 * @param[in] pu1_pred
 *  Pointer to prediction sub-block
 *
 * @param[in] pi2_out
 *  Pointer to residual sub-block
 *
 * @param[in] src_strd
 *  Source stride
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] pu2_scale_matrix
 *  Pointer to Forward Quant Scale Matrix
 *
 * @param[in] pu2_threshold_matrix
 *  Pointer to Forward Quant Threshold Matrix
 *
 * @param[in] u4_qbits
 *  QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] u4_round_factor
 *  Quantization Round factor
 *
 * @param[out] pu1_nnz
 *  Total non-zero coefficients in the current sub-block
 *
 * @param[out] pi2_alt_dc_addr
 *  Receives the UNQUANTIZED DC coefficient (stored before FWD_QUANT runs),
 *  presumably for the separate hadamard/DC path — confirm with callers
 *
 * @returns none
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ih264_resi_trans_quant_4x4(UWORD8 *pu1_src,
                                UWORD8 *pu1_pred,
                                WORD16 *pi2_out,
                                WORD32 src_strd,
                                WORD32 pred_strd,
                                const UWORD16 *pu2_scale_matrix,
                                const UWORD16 *pu2_threshold_matrix,
                                UWORD32 u4_qbits,
                                UWORD32 u4_round_factor,
                                UWORD8 *pu1_nnz,
                                WORD16 *pi2_alt_dc_addr)
{
    UWORD32 i;
    WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
    WORD32 i4_value, i4_sign;
    UWORD32 u4_abs_value;
    WORD16 *pi2_out_tmp = pi2_out;
    UWORD32 u4_nonzero_coeff = 0;

    /* Pass 1: residue + 1-D horizontal forward core transform, row by row */
    for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        /* computing prediction error (residue) */
        x4 = pu1_src[0] - pu1_pred[0];
        x5 = pu1_src[1] - pu1_pred[1];
        x6 = pu1_src[2] - pu1_pred[2];
        x7 = pu1_src[3] - pu1_pred[3];

        /* Horizontal transform (butterfly) */
        x0 = x4 + x7;
        x1 = x5 + x6;
        x2 = x5 - x6;
        x3 = x4 - x7;

        pi2_out_tmp[0] = x0 + x1;
        pi2_out_tmp[1] = (x3 <<1) + x2;
        pi2_out_tmp[2] = x0 - x1;
        pi2_out_tmp[3] = x3 - (x2<<1);

        /* pointing to next row; */
        pu1_src += src_strd;
        pu1_pred += pred_strd;
        pi2_out_tmp += 4;

    }
    pi2_out_tmp = pi2_out;
    /* Pass 2: 1-D vertical transform + quantization, column by column */
    for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {

        /* Vertical transform and quantization */
        x4 = pi2_out_tmp[0];
        x5 = pi2_out_tmp[4];
        x6 = pi2_out_tmp[8];
        x7 = pi2_out_tmp[12];


        x0 = x4 + x7;
        x1 = x5 + x6;
        x2 = x5 - x6;
        x3 = x4 - x7;

        /* quantization is done in place */

        i4_value = x0 + x1;

        /* export the raw (unquantized) DC coefficient before quantization */
        if(i==0)
        {
            (*pi2_alt_dc_addr) = i4_value;
        }

        /* FWD_QUANT (project macro, ih264_trans_macros.h) quantizes i4_value
         * in place and bumps u4_nonzero_coeff when the result is non-zero */
        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits, u4_nonzero_coeff);
        pi2_out_tmp[0] = i4_value;


        i4_value = (x3 << 1) + x2;
        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[4], pu2_scale_matrix[4], u4_round_factor, u4_qbits, u4_nonzero_coeff);
        pi2_out_tmp[4] = i4_value;


        i4_value = x0 - x1;
        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[8], pu2_scale_matrix[8], u4_round_factor, u4_qbits, u4_nonzero_coeff);
        pi2_out_tmp[8] = i4_value;


        i4_value = x3 - (x2 << 1);
        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[12], pu2_scale_matrix[12], u4_round_factor, u4_qbits, u4_nonzero_coeff);
        pi2_out_tmp[12] = i4_value;

        pi2_out_tmp ++;
        pu2_scale_matrix++;
        pu2_threshold_matrix++;
    }

    /* Return total nonzero coefficients in the current sub block */
    *pu1_nnz = u4_nonzero_coeff;
}
/**
 *******************************************************************************
 *
 * @brief
 *   This function performs forward transform and quantization on a 4*4 chroma
 *   block with interleaved (semiplanar UV) values
 *
 * @par Description:
 *   The function accepts source buffer and estimation buffer. From these, it
 *   computes the residue. This residue is then transformed and quantized.
 *   The transform and quantization are computed in place, using the residue
 *   buffer. Source/prediction samples of one plane are read at even offsets
 *   (stride 2) to skip the interleaved other plane.
 *
 * @param[in] pu1_src
 *  Pointer to source sub-block
 *
 * @param[in] pu1_pred
 *  Pointer to prediction sub-block
 *
 * @param[in] pi2_out
 *  Pointer to residual sub-block
 *
 * @param[in] src_strd
 *  Source stride
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] pu2_scale_matrix
 *  Pointer to Forward Quant Scale Matrix
 *
 * @param[in] pu2_threshold_matrix
 *  Pointer to Forward Quant Threshold Matrix
 *
 * @param[in] u4_qbits
 *  QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] u4_round_factor
 *  Quantization Round factor
 *
 * @param[out] pu1_nnz
 *  Total non-zero coefficients in the current sub-block
 *
 * @param[out] pu1_dc_alt_addr
 *  Receives the UNQUANTIZED DC coefficient (stored before FWD_QUANT runs)
 *
 * @returns none
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ih264_resi_trans_quant_chroma_4x4(UWORD8 *pu1_src,
                                       UWORD8 *pu1_pred,
                                       WORD16 *pi2_out,
                                       WORD32 src_strd,
                                       WORD32 pred_strd,
                                       const UWORD16 *pu2_scale_matrix,
                                       const UWORD16 *pu2_threshold_matrix,
                                       UWORD32 u4_qbits,
                                       UWORD32 u4_round_factor,
                                       UWORD8 *pu1_nnz,
                                       WORD16 *pu1_dc_alt_addr)
{
    UWORD32 i;
    WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
    WORD32 i4_value, i4_sign;
    UWORD32 u4_abs_value;
    WORD16 *pi2_out_tmp = pi2_out;
    UWORD32 u4_nonzero_coeff = 0;

    for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        /* computing prediction error (residue); even offsets skip the
         * interleaved companion chroma plane */
        x4 = pu1_src[0] - pu1_pred[0];
        x5 = pu1_src[2] - pu1_pred[2];
        x6 = pu1_src[4] - pu1_pred[4];
        x7 = pu1_src[6] - pu1_pred[6];

        /* Horizontal transform */
        x0 = x4 + x7;
        x1 = x5 + x6;
        x2 = x5 - x6;
        x3 = x4 - x7;

        pi2_out_tmp[0] = x0 + x1;
        pi2_out_tmp[1] = (x3 <<1) + x2;
        pi2_out_tmp[2] = x0 - x1;
        pi2_out_tmp[3] = x3 - (x2<<1);

        /* pointing to next row; */
        pu1_src += src_strd;
        pu1_pred += pred_strd;
        pi2_out_tmp += 4;

    }
    pi2_out_tmp = pi2_out;
    for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {

        /* Vertical transform and quantization */
        x4 = pi2_out_tmp[0];
        x5 = pi2_out_tmp[4];
        x6 = pi2_out_tmp[8];
        x7 = pi2_out_tmp[12];


        x0 = x4 + x7;
        x1 = x5 + x6;
        x2 = x5 - x6;
        x3 = x4 - x7;

        /* quantization is done in place */

        i4_value = x0 + x1;

        /* export the raw (unquantized) DC coefficient before quantization */
        if(i==0)
        {
            *pu1_dc_alt_addr = i4_value;
        }

        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[0] = i4_value;

        i4_value = (x3 << 1) + x2;
        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[4],
                  pu2_scale_matrix[4], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[4] = i4_value;

        i4_value = x0 - x1;
        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[8],
                  pu2_scale_matrix[8], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[8] = i4_value;

        i4_value = x3 - (x2 << 1);
        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[12],
                  pu2_scale_matrix[12], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[12] = i4_value;

        pi2_out_tmp ++;
        pu2_scale_matrix++;
        pu2_threshold_matrix++;
    }

    /* Return total nonzero coefficients in the current sub block */
    *pu1_nnz = u4_nonzero_coeff;
}

/**
 *******************************************************************************
 *
 * @brief
 *   This function performs forward hadamard transform and quantization on a
 *   4*4 block of luma DC coefficients
 *
 * @par Description:
 *   The input coefficients are transformed with the 4x4 Hadamard transform
 *   and quantized in place, using the destination
 *   buffer for this.
 *
 * @param[in] pi2_src
 *  Pointer to the 4x4 block of input (DC) coefficients
 *
 * @param[out] pi2_dst
 *  Pointer to the 4x4 output block (transformed + quantized in place here)
 *
 * @param[in] pu2_scale_matrix
 *  Pointer to Forward Quant Scale Matrix (only element [0] is used)
 *
 * @param[in] pu2_threshold_matrix
 *  Pointer to Forward Quant Threshold Matrix (only element [0] is used)
 *
 * @param[in] u4_qbits
 *  QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] u4_round_factor
 *  Quantization Round factor
 *
 * @param[out] pu1_nnz
 *  Total non-zero coefficients in the current sub-block
 *
 * @returns none
 *
 * @remarks
 *  None
 *
 */

void ih264_hadamard_quant_4x4(WORD16 *pi2_src,
                              WORD16 *pi2_dst,
                              const UWORD16 *pu2_scale_matrix,
                              const UWORD16 *pu2_threshold_matrix,
                              UWORD32 u4_qbits,
                              UWORD32 u4_round_factor,
                              UWORD8 *pu1_nnz)
{
    WORD32 i;
    WORD32 x0,x1,x2,x3,x4,x5,x6,x7,i4_value;
    UWORD32 u4_abs_value;
    WORD32 i4_sign;

    *pu1_nnz = 0;

    /* Pass 1: horizontal Hadamard butterfly, row by row, into pi2_dst */
    for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        x4 = pi2_src[0];
        x5 = pi2_src[1];
        x6 = pi2_src[2];
        x7 = pi2_src[3];

        x0 = x4 + x7;
        x1 = x5 + x6;
        x2 = x5 - x6;
        x3 = x4 - x7;

        pi2_dst[0] = x0 + x1;
        pi2_dst[1] = x3 + x2;
        pi2_dst[2] = x0 - x1;
        pi2_dst[3] = x3 - x2;

        pi2_src += 4;
        pi2_dst += 4;
    }

    /* Vertical transform and quantization */
    /* rewind pi2_dst to the start of the 16-coefficient block */
    pi2_dst -= SUB_BLK_WIDTH_4x4<<2;

    for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        x4 = pi2_dst[0];
        x5 = pi2_dst[4];
        x6 = pi2_dst[8];
        x7 = pi2_dst[12] ;

        x0 = x4 + x7;
        x1 = x5 + x6;
        x2 = x5 - x6;
        x3 = x4 - x7;


        /* >> 1 is the normalization of the DC Hadamard transform;
         * FWD_QUANT (project macro) quantizes in place and updates nnz */
        i4_value = (x0 + x1) >> 1;
        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
        pi2_dst[0] = i4_value;

        i4_value = (x3 + x2) >> 1;
        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
        pi2_dst[4] = i4_value;

        i4_value = (x0 - x1) >> 1;
        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
        pi2_dst[8] = i4_value;

        i4_value = (x3 - x2) >> 1;
        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
        pi2_dst[12] = i4_value;

        pi2_dst ++;
    }
}

/**
 *******************************************************************************
 *
 * @brief
 *   This function performs forward hadamard transform and quantization on a
 *   2*2 block of chroma DC coefficients, for both U and V planes
 *
 * @par Description:
 *   Each plane's four DC coefficients are transformed with the 2x2 Hadamard
 *   transform and quantized in place into pi2_dst.
 *
 * @param[in] pi2_src
 *  Pointer to the input DC coefficients (4 per plane, U then V)
 *
 * @param[out] pi2_dst
 *  Pointer to the output coefficients (4 per plane, U then V)
 *
 * @param[in] pu2_scale_matrix
 *  Pointer to Forward Quant Scale Matrix (only element [0] is used)
 *
 * @param[in] pu2_threshold_matrix
 *  Pointer to Forward Quant Threshold Matrix (only element [0] is used)
 *
 * @param[in] u4_qbits
 *  QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] u4_round_factor
 *  Quantization Round factor
 *
 * @param[out] pu1_nnz
 *  Non-zero coefficient counts, one per plane
 *
 * @returns none
 *
 * @remarks
 *  NNZ for dc is populated at 0 and 1st position of pu1_nnz (one per plane)
 *
 */

void ih264_hadamard_quant_2x2_uv(WORD16 *pi2_src,
                                 WORD16 *pi2_dst,
                                 const UWORD16 *pu2_scale_matrix,
                                 const UWORD16 *pu2_threshold_matrix,
                                 UWORD32 u4_qbits,
                                 UWORD32 u4_round_factor,
                                 UWORD8 *pu1_nnz)
{
    WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
    WORD32 i4_value, i4_sign, plane;
    UWORD32 u4_abs_value;

    /* process U then V; each plane contributes 4 consecutive coefficients */
    for(plane = 0; plane < 2; plane++)
    {
        pu1_nnz[plane] = 0;

        /* Horizontal transform */
        x4 = pi2_src[0];
        x5 = pi2_src[1];
        x6 = pi2_src[2];
        x7 = pi2_src[3];

        x0 = x4 + x5;
        x1 = x4 - x5;
        x2 = x6 + x7;
        x3 = x6 - x7;

        /* Vertical transform and quantization */
        i4_value = (x0 + x2);
        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[plane]);
        pi2_dst[0] = i4_value;

        i4_value = (x0 - x2);
        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[plane]);
        pi2_dst[2] = i4_value;

        i4_value = (x1 - x3);
        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[plane]);
        pi2_dst[3] = i4_value;

        i4_value = (x1 + x3);
        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[plane]);
        pi2_dst[1] = i4_value;

        pi2_dst += 4;
        pi2_src += 4;

    }
}

/*
 *******************************************************************************
 *
 * @brief
 *   This function performs single stage forward transform CF8 and
 *   quantization on 8*8 blocks for h.264
 *
 * @par Description:
 *   Performs single stage 8x8 forward transform CF8 after calculating the
 *   residue. The result is then quantized.
 *
 * @param[in] pu1_src
 *  Input 8x8 pixels
 *
 * @param[in] pu1_pred
 *  Input 8x8 prediction pixels
 *
 * @param[out] pi2_out
 *  Output 8x8 coefficients (transform + quant computed in place here)
 *
 * @param[in] src_strd
 *  Source stride
 *
 * @param[in] pred_strd
 *  stride for prediction buffer
 *
 * @param[in]
dst_strd
 *  stride for destination buffer
 *
 * @param[in] pu2_scale_matrix
 *  Pointer to the 8x8 Forward Quant Scale Matrix
 *
 * @param[in] pu2_threshold_matrix
 *  Pointer to the 8x8 Forward Quant Threshold Matrix
 *
 * @param[in] u4_qbits
 *  QP_BITS_h264_8x8 + floor(QP/6)
 *
 * @param[in] u4_round_factor
 *  Quantization Round factor
 *
 * @param[out] pu1_nnz
 *  Total non-zero coefficients in the current sub-block
 *
 * @param[in] pu1_dc_alt_addr
 *  Unused here (no separate DC path for 8x8); kept for signature parity
 *  with the 4x4 variants
 *
 * @returns Void
 *
 *
 *******************************************************************************
 */
void ih264_resi_trans_quant_8x8(UWORD8 *pu1_src,
                                UWORD8 *pu1_pred,
                                WORD16 *pi2_out,
                                WORD32 src_strd,
                                WORD32 pred_strd,
                                const UWORD16 *pu2_scale_matrix,
                                const UWORD16 *pu2_threshold_matrix,
                                UWORD32 u4_qbits,
                                UWORD32 u4_round_factor,
                                UWORD8 *pu1_nnz,
                                WORD16 *pu1_dc_alt_addr)

{
    WORD16 *pi2_out_tmp = pi2_out;
    UWORD32 i;
    WORD32 a0, a1, a2, a3, a4, a5, a6, a7;
    WORD32 r0, r1, r2, r3, r4, r5, r6, r7;
    WORD32 i4_sign;
    UWORD32 u4_abs_value;
    UWORD32 u4_nonzero_coeff = 0;

    UNUSED(pu1_dc_alt_addr);

    /*Horizontal transform */
    /* we are going to use the a's and r's in a twisted way since */
    /*i dont want to declare more variables */
    for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
    {
        /* residue = source - prediction, one 8-pixel row at a time */
        r0 = pu1_src[0];
        r0 -= pu1_pred[0];
        r1 = pu1_src[1];
        r1 -= pu1_pred[1];
        r2 = pu1_src[2];r2 -= pu1_pred[2];
        r3 = pu1_src[3];r3 -= pu1_pred[3];
        r4 = pu1_src[4];r4 -= pu1_pred[4];
        r5 = pu1_src[5];r5 -= pu1_pred[5];
        r6 = pu1_src[6];r6 -= pu1_pred[6];
        r7 = pu1_src[7];r7 -= pu1_pred[7];

        /* even part of the 8-point butterfly -> even output indices */
        a0 = r0 + r7;
        a1 = r1 + r6;
        a2 = r2 + r5;
        a3 = r3 + r4;

        a4 = a0 + a3;
        a5 = a1 + a2;
        a6 = a0 - a3;
        a7 = a1 - a2;

        pi2_out_tmp[0] = a4 + a5;

        pi2_out_tmp[2] = a6 + (a7>>1);
        pi2_out_tmp[4] = a4 - a5;
        pi2_out_tmp[6] = (a6>>1) - a7;

        /* odd part of the 8-point butterfly -> odd output indices */
        a0 = r0 - r7;
        a1 = r1 - r6;
        a2 = r2 - r5;
        a3 = r3 - r4;

        a4 = a1 + a2 + ((a0>>1) + a0);
        a5 = a0 - a3 - ((a2>>1) + a2);
        a6 = a0 + a3 - ((a1>>1) + a1);
        a7 = a1 - a2 + ((a3>>1) + a3);

        pi2_out_tmp[1] = a4 + (a7>>2);
        pi2_out_tmp[3] = a5 + (a6>>2);
        pi2_out_tmp[5] = a6 - (a5>>2);
        pi2_out_tmp[7] = (a4>>2) - a7;

        pu1_src += src_strd;
        pu1_pred += pred_strd;
        pi2_out_tmp += 8;
    }

    /*vertical transform and quant */

    pi2_out_tmp = pi2_out;

    for (i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
    {

        /* one column of the horizontally-transformed block (stride 8) */
        r0 = pi2_out_tmp[0];
        r1 = pi2_out_tmp[8];
        r2 = pi2_out_tmp[16];
        r3 = pi2_out_tmp[24];
        r4 = pi2_out_tmp[32];
        r5 = pi2_out_tmp[40];
        r6 = pi2_out_tmp[48];
        r7 = pi2_out_tmp[56];

        /* even part */
        a0 = r0 + r7;
        a1 = r1 + r6;
        a2 = r2 + r5;
        a3 = r3 + r4;

        a4 = a0 + a3;
        a5 = a1 + a2;
        a6 = a0 - a3;
        a7 = a1 - a2;

        /* odd part inputs */
        a0 = r0 - r7;
        a1 = r1 - r6;
        a2 = r2 - r5;
        a3 = r3 - r4;

        r0 = a4 + a5;
        r2 = a6 + (a7>>1);
        r4 = a4 - a5;
        r6 = (a6>>1) - a7;

        a4 = a1 + a2 + ((a0>>1) + a0);
        a5 = a0 - a3 - ((a2>>1) + a2);
        a6 = a0 + a3 - ((a1>>1) + a1);
        a7 = a1 - a2 + ((a3>>1) + a3);

        r1 = a4 + (a7>>2);
        r3 = a5 + (a6>>2);
        r5 = a6 - (a5>>2);
        r7 = (a4>>2) - a7;

        /* FWD_QUANT (project macro) quantizes each coefficient in place and
         * bumps u4_nonzero_coeff when the result is non-zero */
        FWD_QUANT(r0, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[0] = r0;

        FWD_QUANT(r1, u4_abs_value, i4_sign, pu2_threshold_matrix[8],
                  pu2_scale_matrix[8], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[8] = r1;

        FWD_QUANT(r2, u4_abs_value, i4_sign, pu2_threshold_matrix[16],
                  pu2_scale_matrix[16], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[16] = r2;

        FWD_QUANT(r3, u4_abs_value, i4_sign, pu2_threshold_matrix[24],
                  pu2_scale_matrix[24], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[24] = r3;

        FWD_QUANT(r4, u4_abs_value, i4_sign, pu2_threshold_matrix[32],
                  pu2_scale_matrix[32], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[32] = r4;

        FWD_QUANT(r5, u4_abs_value, i4_sign, pu2_threshold_matrix[40],
                  pu2_scale_matrix[40], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[40] = r5;

        FWD_QUANT(r6, u4_abs_value, i4_sign, pu2_threshold_matrix[48],
                  pu2_scale_matrix[48], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[48] = r6;

        FWD_QUANT(r7, u4_abs_value, i4_sign, pu2_threshold_matrix[56],
                  pu2_scale_matrix[56], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[56] = r7;

        pi2_out_tmp++;
        pu2_scale_matrix++;
        pu2_threshold_matrix++;
    }
    /* Return total nonzero coefficients in the current sub block */
    *pu1_nnz = u4_nonzero_coeff;
}
diff --git a/common/ih264_size_defs.h b/common/ih264_size_defs.h
new file mode 100755
index 0000000..e2a8b76
--- /dev/null
+++ b/common/ih264_size_defs.h
@@ -0,0 +1,85 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt.
Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_size_defs.h + * + * @brief + * Contains declaration of global variables for H264 transform , quant and inverse quant + * + * @author + * Ittiam + * + * @remarks + * + ********************************************************************************/ + +#ifndef IH264_SIZE_DEFS_H_ +#define IH264_SIZE_DEFS_H_ + +/*****************************************************************************/ +/* Constant Macros */ +/*****************************************************************************/ + +/*-----------------------Primary defs--------------------------*/ + +/*Width of a 4x4 block*/ +#define SUB_BLK_WIDTH_4x4 4 + +/*Width of an 8x8 block*/ +#define SUB_BLK_WIDTH_8x8 8 + +/*Number of chroma blocks in a row of coffs*/ +#define SUB_BLK_COUNT_CHROMA_4x4_420 2 + +/*Number of luma blocks in a row of coffs*/ +#define SUB_BLK_COUNT_LUMA_4x4 4 + +/*Numbr of chroma planes*/ +#define NUM_CHROMA_PLANES 2 + +/*Constant bit shifts*/ +#define QP_BITS_h264_4x4 15 +#define QP_BITS_h264_8x8 16 + + +/*---------------------------Derived defs------------------------*/ + +/*Number of coefficients ina 4x4 block*/ +#define COFF_CNT_SUB_BLK_4x4 SUB_BLK_WIDTH_4x4*SUB_BLK_WIDTH_4x4; + +/*Number of luma blocks in a row of coffs*/ +#define SUB_BLK_LUMA_4X4_CNT_MB SUB_BLK_COUNT_LUMA_4x4 * SUB_BLK_COUNT_LUMA_4x4 + +/*Number of chroma coffs in an MB*/ +#define SUB_BLK_CHROMA_4X4_CNT_MB SUB_BLK_COUNT_CHROMA_4x4_420 * SUB_BLK_COUNT_CHROMA_4x4_420 +#define SUB_BLK_CHROMA_4X4_CNT_MB_BIPLANE SUB_BLK_CHROMA_4X4_CNT_MB*NUM_CHROMA_PLANES + +/*Size of trans buff = 4x4 for DC block + 4x4 * coffs for 4x4 ac blocks*/ +#define SIZE_TRANS_BUFF (SUB_BLK_WIDTH_4x4*SUB_BLK_WIDTH_4x4*+ \ + SUB_BLK_WIDTH_4x4*SUB_BLK_WIDTH_4x4* \ + SUB_BLK_COUNT_LUMA_4x4*SUB_BLK_COUNT_LUMA_4x4) + +/*memory size = memory size of 4x4 block of resi coff + 4x4 for DC coff block */ +#define SIZE_TMP_BUFF_ITRANS 
((SUB_BLK_WIDTH_4x4*SUB_BLK_WIDTH_4x4) +\ + (SUB_BLK_WIDTH_4x4*SUB_BLK_WIDTH_4x4)) + +#endif /* IH264_DEFS_H_ */ diff --git a/common/ih264_structs.h b/common/ih264_structs.h new file mode 100755 index 0000000..fa4e142 --- /dev/null +++ b/common/ih264_structs.h @@ -0,0 +1,1722 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** + ******************************************************************************* + * @file + * ih264_structs.h + * + * @brief + * Structure definitions used in the code + * + * @author + * Ittiam + * + * @par List of Functions: + * + * @remarks + * None + * + ******************************************************************************* + */ + +#ifndef _IH264_STRUCTS_H_ +#define _IH264_STRUCTS_H_ + +/** MB Type info for Intra MBs */ +typedef struct +{ + UWORD32 u4_num_mbpart; + MBPART_PREDMODE_T e_mbpart_predmode; + MBMODES_I16x16 e_intra_predmode; + UWORD32 u4_cpb_chroma; + UWORD32 u4_cpb_luma; +}intra_mbtype_info_t; + +/** MB Type info for Inter MBs */ +typedef struct +{ + UWORD32 u4_num_mbpart; + MBPART_PREDMODE_T e_mbpart_predmode_0; + MBPART_PREDMODE_T e_mbpart_predmode_1; + UWORD32 u4_mbpart_wd; + UWORD32 u4_mbpart_ht; +}inter_mbtype_info_t; + + +/** Sub MB Type info for Inter MBs */ +typedef struct +{ + UWORD32 u4_num_mbpart; + MBPART_PREDMODE_T e_mbpart_predmode; + UWORD32 u4_mbpart_wd; + UWORD32 u4_mbpart_ht; +}submbtype_info_t; + +/** + * Picture buffer + */ +typedef struct +{ + UWORD8* pu1_luma; + UWORD8* pu1_chroma; + + WORD32 i4_abs_poc; + WORD32 i4_poc_lsb; + + + /** Lower 32 bit of time stamp */ + UWORD32 u4_timestamp_low; + + /** Upper 32 bit of time stamp */ + UWORD32 u4_timestamp_high; + + WORD32 i4_used_as_ref; + + /** + * frame_num in the slice header + */ + WORD32 i4_frame_num; + + /** + * Long-term frame idx + * TODO: store in frame_num + */ + WORD32 i4_long_term_frame_idx; + + /* + * 0: Top Field + * 1: Bottom Field + */ + WORD8 i1_field_type; + + /** + * buffer ID from frame buffer manager + */ + WORD32 i4_buf_id; + +} pic_buf_t; + + +/** + * Reference List + */ +typedef struct +{ + void *pv_pic_buf; + + void *pv_mv_buf; + +} ref_list_t; + + +/** + * Motion vector + */ +typedef struct +{ + /** + * Horizontal Motion Vector + */ + WORD16 i2_mvx; + + /** + * Vertical Motion Vector + */ + WORD16 i2_mvy; +} mv_t; + 
+/*****************************************************************************/ +/* Following results in packed 48 bit structure. If mv_t included */ +/* ref_pic_buf_id, then 8 bits will be wasted for each mv for aligning. */ +/* Also using mv_t as elements directly instead of a pointer to l0 and l1 */ +/* mvs. Since pointer takes 4 bytes and MV itself is 4 bytes. It does not */ +/* really help using pointers. */ +/*****************************************************************************/ + +/** + * PU Motion Vector info + */ +typedef struct +{ + /** + * L0 Motion Vector + */ + mv_t s_l0_mv; + + /** + * L1 Motion Vector + */ + mv_t s_l1_mv; + + /** + * L0 Ref index + */ + WORD8 i1_l0_ref_idx; + + /** + * L1 Ref index + */ + WORD8 i1_l1_ref_idx; + + /** + * L0 Ref Pic Buf ID + */ + WORD8 i1_l0_ref_pic_buf_id; + + /** + * L1 Ref Pic Buf ID + */ + WORD8 i1_l1_ref_pic_buf_id; + +} pu_mv_t; + +/** + * PU information + */ +typedef struct +{ + + /** + * Motion Vectors + */ + pu_mv_t s_mv; + + /** + * PU X position in terms of min PU (4x4) units + */ + UWORD32 b2_pos_x : 2; + + /** + * PU Y position in terms of min PU (4x4) units + */ + UWORD32 b2_pos_y : 2; + + /** + * PU width in pixels = (b2_wd + 1) << 2 + */ + UWORD32 b2_wd : 2; + + /** + * PU height in pixels = (b2_ht + 1) << 2 + */ + UWORD32 b2_ht : 2; + + /** + * Intra or Inter flag for each partition - 0 or 1 + */ + UWORD32 b1_intra_flag : 1; + + /** + * PRED_L0, PRED_L1, PRED_BI + */ + UWORD32 b2_pred_mode : 2; + +} pu_t; + + +/** + * MB information to be stored for entire frame + */ +typedef struct +{ + /** + * Transform sizes 0: 4x4, 1: 8x8, + */ + UWORD32 b1_trans_size : 1; + + /** + * CBP - 4 bits for Y, 1 for U and 1 for V + */ + UWORD32 b6_cbp: 6; + + /** + * Intra pred sizes 0: 4x4, 1: 8x8, 2: 16x16 + */ + UWORD32 b2_intra_pred_size : 2; + + /** + * Flag to signal if the current MB is IPCM + */ + UWORD32 b1_ipcm : 1; + +}mb_t; + 
+/*****************************************************************************/ +/* Info from last TU row of MB is stored in a row level neighbour buffer */ +/* , which will be used for Boundary Strength computation */ +/*****************************************************************************/ +/** + * MB neighbor info + */ +typedef struct +{ + /** + * Slice index of the mb + */ + UWORD16 u2_slice_idx; + + /*************************************************************************/ + /* CBF of bottom TU row (replicated in 4 pixel boundary) */ + /* MSB contains CBF of first TU in the last row and LSB contains CBF */ + /* of last TU in the last row */ + /*************************************************************************/ + /** + * CBF of bottom TU row + */ + UWORD16 u2_packed_cbf; + + /*************************************************************************/ + /* QP of bottom TU row (replicated at 8 pixel boundary (Since QP can */ + /* not change at less than min CU granularity) */ + /*************************************************************************/ + /** + * QP of bottom TU row + */ + UWORD8 u1_qp; + +} mb_top_ny_info_t; + +/** + * MB level context + */ +typedef struct _mb_ctxt_t +{ + /*************************************************************************/ + /* Tile boundary can be detected by looking at tile start x and tile */ + /* start y. And based on the tile, slice and frame boundary the */ + /* following will be initialized. 
*/ + /*************************************************************************/ + /** + * Pointer to left MB + */ + /* If not available, this will be set to NULL */ + struct _mb_ctxt_t *ps_mb_left; + + /** + * Pointer to top-left MB + */ + /* If not available, this will be set to NULL */ + mb_top_ny_info_t *ps_mb_ny_topleft; + + /** + * Pointer to top MB + */ + /* If not available, this will be set to NULL */ + mb_top_ny_info_t *ps_mb_ny_top; + + /** + * Pointer to top-right MB + */ + /* If not available, this will be set to NULL */ + mb_top_ny_info_t *ps_mb_ny_topright; + + /*************************************************************************/ + /* Pointer to PU data. */ + /* This points to a MV Bank stored at frame level. Though this */ + /* pointer can be derived by reading offset at frame level, it is */ + /* stored here for faster access. Can be removed if storage of MB */ + /* structure is critical */ + /*************************************************************************/ + /** + * Pointer to PU data + */ + pu_t *ps_pu; + + /*************************************************************************/ + /* Pointer to a PU map stored at frame level, */ + /* Though this pointer can be derived by multiplying MB address with */ + /* number of minTUs in a MB, it is stored here for faster access. 
*/ + /* Can be removed if storage of MB structure is critical */ + /*************************************************************************/ + /** + * Pointer to a PU map stored at frame level + */ + UWORD8 *pu1_pu_map; + + /** + * Number of TUs filled in as_tu + */ + /*************************************************************************/ + /* Having the first entry as 32 bit data, helps in keeping each of */ + /* the structures aligned to 32 bits at MB level */ + /*************************************************************************/ + WORD32 i4_tu_cnt; + + /** + * Pointer to transform coeff data + */ + /*************************************************************************/ + /* Following format is repeated for every coded TU */ + /* Luma Block */ + /* num_coeffs : 16 bits */ + /* zero_cols : 8 bits ( 1 bit per 4 columns) */ + /* sig_coeff_map : ((TU Size * TU Size) + 31) >> 5 number of WORD32s */ + /* coeff_data : Non zero coefficients */ + /* Cb Block (only for last TU in 4x4 case else for every luma TU) */ + /* num_coeffs : 16 bits */ + /* zero_cols : 8 bits ( 1 bit per 4 columns) */ + /* sig_coeff_map : ((TU Size * TU Size) + 31) >> 5 number of WORD32s */ + /* coeff_data : Non zero coefficients */ + /* Cr Block (only for last TU in 4x4 case else for every luma TU) */ + /* num_coeffs : 16 bits */ + /* zero_cols : 8 bits ( 1 bit per 4 columns) */ + /* sig_coeff_map : ((TU Size * TU Size) + 31) >> 5 number of WORD32s */ + /* coeff_data : Non zero coefficients */ + /*************************************************************************/ + void *pv_coeff_data; + + /** + * Slice to which the MB belongs to + */ + WORD32 i4_slice_idx; + + /** + * MB column position + */ + WORD32 i4_pos_x; + + /** + * MB row position + */ + WORD32 i4_pos_y; + + /** + * Number of PUs filled in ps_pu + */ + WORD32 i4_pu_cnt; + + /** + * Index of current PU being processed in ps_pu + */ + /* Scratch variable set to 0 at the start of any PU processing function */ + WORD32 
i4_pu_idx; + + /** + * Vertical Boundary strength + */ + /* Two bits per edge. + Stored in format. BS[15] | BS[14] | .. |BS[0]*/ + UWORD32 *pu4_vert_bs; + + /** + * Horizontal Boundary strength + */ + + /* Two bits per edge. + Stored in format. BS[15] | BS[14] | .. |BS[0]*/ + UWORD32 *pu4_horz_bs; + + /** + * Qp array stored for each 8x8 pixels + */ + UWORD8 *pu1_qp; + + /** + * Pointer to current frame's pu_t array + */ + pu_t *ps_frm_pu; + + /** + * Pointer to current frame's pu_t index array, which stores starting index + * of pu_t for every MB + */ + UWORD32 *pu4_frm_pu_idx; + + /** + * Pointer to current frame's pu map array + */ + UWORD8 *pu1_frm_pu_map; + + /*************************************************************************/ + /* Need to add encoder specific elements for identifying the order of */ + /* coding for CU, TU and PU if any */ + /*************************************************************************/ +} mb_ctxt_t; + +/*************************************************************************/ +/* The following describes how each of the CU cases are handled */ +/*************************************************************************/ + +/*************************************************************************/ +/* For SKIP MB */ +/* One Inter PU with appropriate MV */ +/* One TU which says CBP is zero and size is 16x16 */ +/*************************************************************************/ + +/*************************************************************************/ +/* For Inter MB */ +/* M Inter PU with appropriate MVs (M between 1 to 4) */ +/* Number of TUs derived based on transform size */ +/*************************************************************************/ + +/*************************************************************************/ +/* For Intra MB */ +/* Number of TUs derived based on transform size */ +/* N Intra Modes are signaled along with coeff data at the start */ 
+/*************************************************************************/ + +/*************************************************************************/ +/* For Intra PCM MB */ +/* One TU which says ipcm is 1 */ +/*************************************************************************/ + + + +/** + * Structure to hold quantization parameters of an mb + */ +typedef struct +{ + + /* + * mb qp + */ + UWORD8 u1_mb_qp; + + /* + * mb qp / 6 + */ + UWORD8 u1_qp_div; + + /* + * mb qp mod 6 + */ + UWORD8 u1_qp_rem; + + /* + * QP bits + */ + UWORD8 u1_qbits; + + /* + * forward scale matrix + */ + const UWORD16 *pu2_scale_mat; + + /* + * threshold matrix for quantization + */ + UWORD16 *pu2_thres_mat; + + /* + * Threshold to compare the sad with + */ + UWORD16 *pu2_sad_thrsh; + + /* + * qp dependent rounding constant + */ + UWORD32 u4_dead_zone; + + /* + * inverse scale matrix + */ + const UWORD16 *pu2_iscale_mat; + + /* + * Weight matrix in iquant + */ + UWORD16 *pu2_weigh_mat; + +}quant_params_t; + +/** + * Structure to hold Profile tier level info for a given layer + */ + +typedef struct +{ + /** + * NAL unit type + */ + WORD8 i1_nal_unit_type; + + /** + * NAL ref idc + */ + WORD8 i1_nal_ref_idc; + + +} nal_header_t; + +/** + * HRD parameters Info + */ +typedef struct +{ + /** + * Specifies the number of alternative CPB specifications in the + * bitstream + */ + UWORD8 u1_cpb_cnt_minus1; + + /** + * (together with bit_rate_value_minus1) specifies the + * maximum input bit rate of the i-th CPB + */ + UWORD32 u4_bit_rate_scale; + + /** + * (together with cpb_size_du_value_minus1) specifies + * CPB size of the i-th CPB when the CPB operates + * at the access unit level + */ + UWORD32 u4_cpb_size_scale; + + /** + * (together with bit_rate_scale) specifies the + * maximum input bit rate for the i-th CPB + */ + UWORD32 au4_bit_rate_value_minus1[32]; + /** + * together with cpb_size_scale to specify the + * CPB size when the CPB operates at the access unit level. 
+ */ + UWORD32 au4_cpb_size_value_minus1[32]; + + /** + * if 1, specifies that the HSS operates in a constant bit rate (CBR) mode + * if 0, specifies that the HSS operates in a intermittent bit rate (CBR) mode + */ + UWORD8 au1_cbr_flag[32]; + + + /** + * specifies the length, in bits for initial cpb delay (nal/vcl)syntax in bp sei + */ + UWORD8 u1_initial_cpb_removal_delay_length_minus1; + + /** + * specifies the length, in bits for the cpb delay syntax in pt_sei + */ + UWORD8 u1_cpb_removal_delay_length_minus1; + + /** + * specifies the length, in bits, of the pic_dpb_output_delay syntax element in the pt SEI message + */ + UWORD8 u1_dpb_output_delay_length_minus1; + + /** + * Specifies length of the time offset parameter + */ + UWORD8 u1_time_offset_length; + +}hrd_params_t; + + +/** + * Structure to hold VUI parameters Info + */ +typedef struct +{ + /** + * indicates the presence of aspect_ratio + */ + UWORD8 u1_aspect_ratio_info_present_flag; + + /** + * specifies the aspect ratio of the luma samples + */ + UWORD8 u1_aspect_ratio_idc; + + /** + * width of the luma samples. user dependent + */ + UWORD16 u2_sar_width; + + /** + * Height of the luma samples. user dependent + */ + UWORD16 u2_sar_height; + + /** + * if 1, specifies that the overscan_appropriate_flag is present + * if 0, the preferred display method for the video signal is unspecified + */ + UWORD8 u1_overscan_info_present_flag; + + /** + * if 1,indicates that the cropped decoded pictures output + * are suitable for display using overscan + */ + UWORD8 u1_overscan_appropriate_flag; + + /** + * if 1 specifies that video_format, video_full_range_flag and + * colour_description_present_flag are present + */ + UWORD8 u1_video_signal_type_present_flag; + + /** + * pal, secam, ntsc, ... 
+ */ + UWORD8 u1_video_format; + + /** + * indicates the black level and range of the luma and chroma signals + */ + UWORD8 u1_video_full_range_flag; + + /** + * if 1,to 1 specifies that colour_primaries, transfer_characteristics + * and matrix_coefficients are present + */ + UWORD8 u1_colour_description_present_flag; + + /** + * indicates the chromaticity coordinates of the source primaries + */ + UWORD8 u1_colour_primaries; + + /** + * indicates the opto-electronic transfer characteristic of the source picture + */ + UWORD8 u1_transfer_characteristics; + + /** + * the matrix coefficients used in deriving luma and chroma signals + * from the green, blue, and red primaries + */ + UWORD8 u1_matrix_coefficients; + + /** + * if 1, specifies that chroma_sample_loc_type_top_field and + * chroma_sample_loc_type_bottom_field are present + */ + UWORD8 u1_chroma_loc_info_present_flag; + + /** + * location of chroma samples + */ + UWORD8 u1_chroma_sample_loc_type_top_field; + + UWORD8 u1_chroma_sample_loc_type_bottom_field; + + /** + * Indicates the presence of the + * num_units_in_ticks, time_scale flag + */ + UWORD8 u1_vui_timing_info_present_flag; + + /** + * Number of units that + * correspond to one increment of the + * clock. 
Indicates the resolution + */ + UWORD32 u4_vui_num_units_in_tick; + + /** + * The number of time units that pass in one second + */ + UWORD32 u4_vui_time_scale; + + /** + * Flag indicating that time difference between two frames is a constant + */ + UWORD8 u1_fixed_frame_rate_flag; + + /** + * Indicates the presence of NAL HRD parameters + */ + UWORD8 u1_nal_hrd_parameters_present_flag; + + /** + * NAL level HRD parameters + */ + hrd_params_t s_nal_hrd_parameters; + + /** + * Indicates the presence of VCL HRD parameters + */ + UWORD8 u1_vcl_hrd_parameters_present_flag; + + /** + * VCL level HRD parameters + */ + hrd_params_t s_vcl_hrd_parameters; + + /** + * Specifies the HRD operational mode + */ + UWORD8 u1_low_delay_hrd_flag; + + /** + * Indicates presence of SEI messages which include pic_struct syntax element + */ + UWORD8 u1_pic_struct_present_flag; + + /** + * 1, specifies that the following cvs bitstream restriction parameters are present + */ + UWORD8 u1_bitstream_restriction_flag; + + /** + * if 0, indicates that no pel outside the pic boundaries and + * no sub-pels derived using pels outside the pic boundaries is used for inter prediction + */ + UWORD8 u1_motion_vectors_over_pic_boundaries_flag; + + /** + * Indicates a number of bytes not exceeded by the sum of the sizes of the VCL NAL units + * associated with any coded picture + */ + UWORD8 u1_max_bytes_per_pic_denom; + + /** + * Indicates an upper bound for the number of bits of coding_unit() data + */ + UWORD8 u1_max_bits_per_mb_denom; + + /** + * Indicate the maximum absolute value of a decoded horizontal MV component + * in quarter-pel luma units + */ + UWORD8 u1_log2_max_mv_length_horizontal; + + /** + * Indicate the maximum absolute value of a decoded vertical MV component + * in quarter-pel luma units + */ + UWORD8 u1_log2_max_mv_length_vertical; + + /** + * Max number of frames that are not synchronized in display and decode order + */ + UWORD8 u1_num_reorder_frames; + + /** + * specifies 
required size of the HRD DPB in units of frame buffers. + */ + UWORD8 u1_max_dec_frame_buffering; + +} vui_t; + + +/** + * Structure to hold SPS info + */ +typedef struct +{ + /** + * profile_idc + */ + UWORD8 u1_profile_idc; + + /** constraint_set0_flag */ + UWORD8 u1_constraint_set0_flag; + + /** constraint_set1_flag */ + UWORD8 u1_constraint_set1_flag; + + /** constraint_set2_flag */ + UWORD8 u1_constraint_set2_flag; + + /** constraint_set3_flag */ + UWORD8 u1_constraint_set3_flag; + + /** + * level_idc + */ + UWORD8 u1_level_idc; + + /** + * seq_parameter_set_id + */ + UWORD8 u1_sps_id; + + + /** + * chroma_format_idc + */ + UWORD8 u1_chroma_format_idc; + + /** + * residual_colour_transform_flag + */ + WORD8 i1_residual_colour_transform_flag; + + /** + * bit_depth_luma_minus8 + */ + WORD8 i1_bit_depth_luma; + + /** + * bit_depth_chroma_minus8 + */ + WORD8 i1_bit_depth_chroma; + + /** + * qpprime_y_zero_transform_bypass_flag + */ + WORD8 i1_qpprime_y_zero_transform_bypass_flag; + + /** + * seq_scaling_matrix_present_flag + */ + WORD8 i1_seq_scaling_matrix_present_flag; + + /** + * seq_scaling_list_present_flag + */ + WORD8 ai1_seq_scaling_list_present_flag[8]; + + /** + * log2_max_frame_num_minus4 + */ + WORD8 i1_log2_max_frame_num; + + /** + * MaxFrameNum in the standard + * 1 << i1_log2_max_frame_num + */ + WORD32 i4_max_frame_num; + + /** + * pic_order_cnt_type + */ + WORD8 i1_pic_order_cnt_type; + + /** + * log2_max_pic_order_cnt_lsb_minus4 + */ + WORD8 i1_log2_max_pic_order_cnt_lsb; + + /** + * MaxPicOrderCntLsb in the standard. 
+ * 1 << log2_max_pic_order_cnt_lsb_minus4 + */ + WORD32 i4_max_pic_order_cnt_lsb; + + /** + * delta_pic_order_always_zero_flag + */ + WORD8 i1_delta_pic_order_always_zero_flag; + + /** + * offset_for_non_ref_pic + */ + WORD32 i4_offset_for_non_ref_pic; + + /** + * offset_for_top_to_bottom_field + */ + WORD32 i4_offset_for_top_to_bottom_field; + + /** + * num_ref_frames_in_pic_order_cnt_cycle + */ + UWORD8 u1_num_ref_frames_in_pic_order_cnt_cycle; + + /** + * Offset_for_ref_frame + */ + WORD32 ai4_offset_for_ref_frame[256]; + + /** + * max_num_ref_frames + */ + UWORD8 u1_max_num_ref_frames; + + /** + * gaps_in_frame_num_value_allowed_flag + */ + WORD8 i1_gaps_in_frame_num_value_allowed_flag; + + /** + * pic_width_in_mbs_minus1 + */ + WORD16 i2_pic_width_in_mbs_minus1; + + /** + * pic_height_in_map_units_minus1 + */ + WORD16 i2_pic_height_in_map_units_minus1; + + /** + * frame_mbs_only_flag + */ + WORD8 i1_frame_mbs_only_flag; + + /** + * mb_adaptive_frame_field_flag + */ + WORD8 i1_mb_adaptive_frame_field_flag; + + /** + * direct_8x8_inference_flag + */ + WORD8 i1_direct_8x8_inference_flag; + + /** + * frame_cropping_flag + */ + WORD8 i1_frame_cropping_flag; + + /** + * frame_crop_left_offset + */ + WORD16 i2_frame_crop_left_offset; + + /** + * frame_crop_right_offset + */ + WORD16 i2_frame_crop_right_offset; + + /** + * frame_crop_top_offset + */ + WORD16 i2_frame_crop_top_offset; + + /** + * frame_crop_bottom_offset + */ + WORD16 i2_frame_crop_bottom_offset; + + /** + * vui_parameters_present_flag + */ + WORD8 i1_vui_parameters_present_flag; + + /** + * vui_parameters_Structure_info + */ + vui_t s_vui_parameters; + + /** + * Flag to give status of SPS structure + */ + WORD8 i1_sps_valid; + + /** + * Coded Picture width + */ + WORD32 i2_pic_wd; + + /** + * Coded Picture height + */ + WORD32 i2_pic_ht; + + /** + * Picture width in MB units + */ + + WORD16 i2_pic_wd_in_mb; + + /** + * Picture height in MB units + */ + + WORD16 i2_pic_ht_in_mb; + + /** + * 
useDefaultScalingMatrixFlag + */ + WORD8 ai1_use_default_scaling_matrix_flag[8]; + + /** + * 4x4 Scaling lists after inverse zig zag scan + */ + UWORD16 au2_4x4_weight_scale[6][16]; + + /** + * 4x4 Scaling lists after inverse zig zag scan + */ + UWORD16 au2_8x8_weight_scale[2][64]; + +} sps_t; + + +/** + * Structure to hold PPS info + */ +typedef struct +{ + /** + * pic_parameter_set_id + */ + UWORD8 u1_pps_id; + + /** + * seq_parameter_set_id + */ + UWORD8 u1_sps_id; + + /** + * Entropy coding : 0-VLC; 1 - CABAC + */ + UWORD8 u1_entropy_coding_mode_flag; + + /* + * Pic order present flag + */ + UWORD8 u1_pic_order_present_flag; + + /* + * Number of slice groups + */ + UWORD8 u1_num_slice_groups; + + /* + * Slice group map type + */ + UWORD8 u1_slice_group_map_type; + + /* + * Maximum reference picture index in the reference list 0 : range [0 - 31] + */ + WORD8 i1_num_ref_idx_l0_default_active; + + /* + * Maximum reference picture index in the reference list 1 : range [0 - 31] + */ + WORD8 i1_num_ref_idx_l1_default_active; + + /** + * weighted_pred_flag + */ + WORD8 i1_weighted_pred_flag; + + /** + * weighted_bipred_flag + */ + WORD8 i1_weighted_bipred_idc; + + /** + * pic_init_qp_minus26 + */ + WORD8 i1_pic_init_qp; + + /** + * pic_init_qs_minus26 + */ + WORD8 i1_pic_init_qs; + + /* + * Chroma QP offset w.r.t QPY {-12,12} + */ + WORD8 i1_chroma_qp_index_offset; + + /** + * deblocking_filter_control_present_flag + */ + WORD8 i1_deblocking_filter_control_present_flag; + + /** + * constrained_intra_pred_flag + */ + WORD8 i1_constrained_intra_pred_flag; + + /** + * redundant_pic_cnt_present_flag + */ + WORD8 i1_redundant_pic_cnt_present_flag; + + /** + * transform_8x8_mode_flag + */ + WORD8 i1_transform_8x8_mode_flag; + + /** + * pic_scaling_matrix_present_flag + */ + WORD8 i1_pic_scaling_matrix_present_flag; + + /* + * Second chroma QP offset + */ + WORD8 i1_second_chroma_qp_index_offset; + + + /** + * useDefaultScalingMatrixFlag + */ + WORD8 
ai1_use_default_scaling_matrix_flag[8]; + + /** + * 4x4 Scaling lists after inverse zig zag scan + */ + UWORD16 au2_4x4_weight_scale[6][16]; + + /** + * 4x4 Scaling lists after inverse zig zag scan + */ + UWORD16 au2_8x8_weight_scale[2][64]; + + + /** + * pic_scaling_list_present_flag + */ + WORD8 ai1_pic_scaling_list_present_flag[8]; + + /** + * Flag to give status of PPS structure + */ + WORD8 i1_pps_valid; + + +} pps_t; + +/** + * MMCO commands and params. + */ +typedef struct +{ + /* memory management control operation command */ + UWORD8 u1_memory_management_control_operation; + + /* + * Contains difference of pic nums of short-term pic/frame + * 1. To signal it as "unused for reference" if mmco = 1 + * 2. To signal it as "used for long-term reference" if mmco = 3 + */ + UWORD32 u4_difference_of_pic_nums_minus1; + + /* Long-term pic num to be set as "unused for reference" */ + UWORD8 u1_long_term_pic_num; + + /* + * Assign a long-term idx to a picture as follows + * 1. Assign to a short-term pic if mmco = 3 + * 2. Assign to the current pic if mmco = 6 + */ + UWORD8 u1_long_term_frame_idx; + + /* + * The max long-term idx. 
The long-term pics having idx above + * are set as "unused for reference + */ + UWORD8 u1_max_long_term_frame_idx_plus1; + +}mmco_prms_t; + +/** + * Structure to hold Reference picture list modification info + */ +typedef struct +{ + /* ref_pic_list_modification_flag_l0 */ + WORD8 i1_ref_pic_list_modification_flag_l0; + + /* Modification required in list0 */ + WORD8 i1_modification_of_pic_nums_idc_l0[MAX_MODICATION_IDC]; + + /* + * The absolute difference between the picture number of + * the picture being moved to the current index in + * list0 and the picture number prediction value + */ + UWORD32 u4_abs_diff_pic_num_minus1_l0[MAX_MODICATION_IDC]; + + /* + * The long-term picture number of the picture being moved + * to the current index in list0 + */ + UWORD8 u1_long_term_pic_num_l0[MAX_MODICATION_IDC]; + + /* ref_pic_list_modification_flag_l1 */ + WORD8 i1_ref_pic_list_modification_flag_l1; + + /* Modification required in list1 */ + WORD8 i1_modification_of_pic_nums_idc_l1[MAX_MODICATION_IDC]; + + /* + * The absolute difference between the picture number of + * the picture being moved to the current index in + * list1 and the picture number prediction value + */ + UWORD32 u4_abs_diff_pic_num_minus1_l1[MAX_MODICATION_IDC]; + + /* + * The long-term picture number of the picture being moved + * to the current index in list1 + */ + UWORD8 u1_long_term_pic_num_l1[MAX_MODICATION_IDC]; +}rplm_t; + +/** + * Structure to hold Slice Header info + */ +typedef struct +{ + + /* + * nal_unit_type + */ + WORD8 i1_nal_unit_type; + + /* + * nal_unit_idc + */ + WORD8 i1_nal_unit_idc; + + /* + * first_mb_in_slice + */ + UWORD16 u2_first_mb_in_slice; + + /* + * slice_type + */ + UWORD8 u1_slice_type; + + /* + * pic_parameter_set_id + */ + UWORD8 u1_pps_id; + + /* + * frame_num + */ + WORD32 i4_frame_num; + + /* + * field_pic_flag + */ + WORD8 i1_field_pic_flag; + + /* + * bottom_field_flag + */ + WORD8 i1_bottom_field_flag; + + /* + * second_field + */ + WORD8 
i1_second_field_flag; + + /* + * idr_pic_id + */ + UWORD16 u2_idr_pic_id ; + + /* + * pic_order_cnt_lsb + */ + UWORD16 i4_pic_order_cnt_lsb; + + /* + * delta_pic_order_cnt_bottom + */ + WORD32 i4_delta_pic_order_cnt_bottom; + + /* + * delta_pic_order_cnt + */ + WORD32 ai4_delta_pic_order_cnt[2]; + + /* + * redundant_pic_cnt + */ + UWORD8 u1_redundant_pic_cnt; + + /* + * direct_spatial_mv_pred_flag + */ + UWORD8 u1_direct_spatial_mv_pred_flag; + + /* + * num_ref_idx_active_override_flag + */ + UWORD8 u1_num_ref_idx_active_override_flag; + + /* + * num_ref_idx_l0_active + */ + WORD8 i1_num_ref_idx_l0_active; + + /* + * num_ref_idx_l1_active_minus1 + */ + WORD8 i1_num_ref_idx_l1_active; + + /* + * ref_pic_list_reordering_flag_l0 + */ + UWORD8 u1_ref_idx_reordering_flag_l0; + + /** + * Reference prediction list modification + */ + rplm_t s_rplm; + + /** + * L0 Reference pic lists + */ + ref_list_t as_ref_pic_list0[MAX_DPB_SIZE]; + + /** + * L1 Reference pic lists + */ + ref_list_t as_ref_pic_list1[MAX_DPB_SIZE]; + + /* + * weighted_bipred_idc + */ + WORD8 u1_weighted_bipred_idc; + + /* + * no_output_of_prior_pics_flag + */ + UWORD8 u1_no_output_of_prior_pics_flag; + + /* + * long_term_reference_flag + */ + UWORD8 u1_long_term_reference_flag; + + /* + * adaptive_ref_pic_marking_mode_flag + */ + UWORD8 u1_adaptive_ref_pic_marking_mode_flag; + + /* + * Array to structures to store mmco commands + * and parameters. 
+ */ + mmco_prms_t as_mmco_prms[MAX_MMCO_COMMANDS]; + + /* + * entropy_coding_mode_flag + */ + WORD8 u1_entropy_coding_mode_flag; + + /* + * cabac_init_idc + */ + WORD8 i1_cabac_init_idc; + + /* + * i1_slice_qp + */ + WORD8 i1_slice_qp; + + /* + * sp_for_switch_flag + */ + UWORD8 u1_sp_for_switch_flag; + + /* + * slice_qs_delta + */ + UWORD8 u1_slice_qs; + + /* + * disable_deblocking_filter_idc + */ + WORD8 u1_disable_deblocking_filter_idc; + + /* + * slice_alpha_c0_offset_div2 + */ + WORD8 i1_slice_alpha_c0_offset_div2; + + /* + * slice_beta_offset_div2 + */ + WORD8 i1_slice_beta_offset_div2; + + /* + * num_slice_groups_minus1 + */ + WORD8 u1_num_slice_groups_minus1; + + /* + * slice_group_change_cycle + */ + WORD8 u1_slice_group_change_cycle; + + /** + * Start MB X + */ + UWORD16 i2_mb_x; + + /** + * Start MB Y + */ + UWORD16 i2_mb_y; + + /** + * Absolute POC. Contains minimum of top and bottom POC. + */ + WORD32 i4_abs_pic_order_cnt; + + /** + * Absolute top POC. Contains top poc for frame or top + * field. Invalid for bottom field. + */ + WORD32 i4_abs_top_pic_order_cnt; + + /** + * Absolute top POC. Contains bottom poc for frame or bottom + * field. Invalid for top field. 
+ */ + WORD32 i4_abs_bottom_pic_order_cnt; + + /** Flag signaling if the current slice is ref slice */ + UWORD8 i1_nal_ref_idc; + + /** Flag to indicate if the current slice is MBAFF Frame */ + UWORD8 u1_mbaff_frame_flag; + + /** luma_log2_weight_denom */ + UWORD8 u1_luma_log2_weight_denom; + + /** chroma_log2_weight_denom */ + UWORD8 u1_chroma_log2_weight_denom; + + /** luma_weight_l0_flag */ + UWORD8 au1_luma_weight_l0_flag[MAX_DPB_SIZE]; + + /** luma_weight_l0 : (-128, 127 )is the range of weights + * when weighted pred is enabled, 128 is default value */ + WORD16 ai2_luma_weight_l0[MAX_DPB_SIZE]; + + /** luma_offset_l0 : (-128, 127 )is the range of offset + * when weighted pred is enabled, 0 is default value */ + WORD8 ai1_luma_offset_l0[MAX_DPB_SIZE]; + + /** chroma_weight_l0_flag */ + UWORD8 au1_chroma_weight_l0_flag[MAX_DPB_SIZE]; + + /** chroma_weight_l0 : (-128, 127 )is the range of weights + * when weighted pred is enabled, 128 is default value*/ + WORD16 ai2_chroma_weight_l0[MAX_DPB_SIZE][2]; + + /** chroma_offset_l0 : (-128, 127 )is the range of offset + * when weighted pred is enabled, 0 is default value*/ + WORD8 ai1_chroma_offset_l0[MAX_DPB_SIZE][2]; + + /** luma_weight_l0_flag */ + UWORD8 au1_luma_weight_l1_flag[MAX_DPB_SIZE]; + + /** luma_weight_l1 : (-128, 127 )is the range of weights + * when weighted pred is enabled, 128 is default value */ + WORD16 ai2_luma_weight_l1[MAX_DPB_SIZE]; + + /** luma_offset_l1 : (-128, 127 )is the range of offset + * when weighted pred is enabled, 0 is default value */ + WORD8 ai1_luma_offset_l1[MAX_DPB_SIZE]; + + /** chroma_weight_l1_flag */ + UWORD8 au1_chroma_weight_l1_flag[MAX_DPB_SIZE]; + + /** chroma_weight_l1 : (-128, 127 )is the range of weights + * when weighted pred is enabled, 128 is default value */ + WORD16 ai2_chroma_weight_l1[MAX_DPB_SIZE][2]; + + /** chroma_offset_l1 :(-128, 127 )is the range of offset + * when weighted pred is enabled, 0 is default value */ + WORD8 
ai1_chroma_offset_l1[MAX_DPB_SIZE][2]; +}slice_header_t; + + +/*****************************************************************************/ +/* The following can be used to type cast coefficient data that is stored */ +/* per subblock. Note that though i2_level is shown as an array that */ +/* holds 16 coefficients, only the first few entries will be valid. Next */ +/* subblocks data starts after the valid number of coefficients. Number */ +/* of non-zero coefficients will be derived using number of non-zero bits */ +/* in sig coeff map */ +/*****************************************************************************/ + +/** + * Structure to hold coefficient info for a 2x2 chroma DC transform + */ +typedef struct +{ + /** + * significant coefficient map + */ + UWORD8 u1_sig_coeff_map; + + /** + * sub block position + */ + UWORD8 u1_subblk_pos; + + /** + * holds coefficients + */ + WORD16 ai2_level[2 * 2]; +}tu_sblk2x2_coeff_data_t; + +/** + * Structure to hold coefficient info for a 4x4 transform + */ +typedef struct +{ + /** + * significant coefficient map + */ + UWORD16 u2_sig_coeff_map; + + /** + * sub block position + */ + UWORD16 u2_subblk_pos; + + /** + * holds coefficients + */ + WORD16 ai2_level[SUBBLK_COEFF_CNT]; +}tu_sblk4x4_coeff_data_t; + +/** + * Structure to hold coefficient info for a 8x8 transform + */ +typedef struct +{ + + /** + * significant coefficient map + */ + UWORD32 au4_sig_coeff_map[2]; + + /** + * sub block position + */ + UWORD16 u2_subblk_pos; + + /** + * holds coefficients + */ + WORD16 ai2_level[TRANS_SIZE_8 * TRANS_SIZE_8]; +}tu_blk8x8_coeff_data_t; + + +/** + * Structure to hold coefficient info for a 16x16 IPCM MB + */ +typedef struct +{ + /** + * holds coefficients + */ + UWORD8 au1_level[MB_SIZE * MB_SIZE * 3 / 2]; +}tu_ipcm_coeff_data_t; + + +typedef struct +{ + /** + * Transform sizes 0: 4x4, 1: 8x8, + */ + UWORD32 b1_trans_size : 1; + + /** + * Flag to signal if the current MB is IPCM + */ + UWORD32 b1_ipcm : 1; + + /** + * 
Intra pred sizes 0: 4x4, 1: 8x8, 2: 16x16 + */ + UWORD32 b2_intra_pred_size : 2; + + /** + * Chroma intra mode + */ + UWORD32 b2_intra_chroma_pred_mode: 2; + + /** + * Number of coded subblocks in the current MB, for which + * tu data is sent. Maximum of 27 subblocks in the following + * order. + * 1 4x4 luma DC(for intra16x16), + * 16 4x4 luma, + * 2 2x2 chroma DC, + * 8 4x4 chroma, + */ + WORD32 b5_num_coded_sblks: 5; + + /** + * Flag to signal if 4x4 subblock for DC values (in INTRA 16x16 MB) + * is coded + */ + UWORD32 b1_luma_dc_coded: 1; + + /** + * Flag to signal if 4x4 subblock for DC values (in INTRA 16x16 MB) + * is coded + */ + UWORD32 b1_chroma_dc_coded: 1; + + /** + * CSBP - 16 bits, 1 bit for each 4x4 + * for intra16x16 mb_type only ac coefficients are + */ + UWORD32 b16_luma_csbp: 16; + + /** + * CSBP - 16 bits, 1 bit for each 4x4 + * for intra16x16 mb_type only ac coefficients are + */ + UWORD32 b8_chroma_csbp: 8; + + /** + * Luma Intra pred modes, + * Based on intra pred size either 16, 4 or 1 entry will be + * populated below. + */ + UWORD8 au1_luma_intra_modes[16]; + +}intra_mb_t; + + +typedef struct +{ + /** + * Transform sizes 0: 4x4, 1: 8x8, + */ + UWORD8 b1_trans_size : 1; + + + /** + * Skip flag + */ + UWORD8 b1_skip : 1; + + + /** + * Number of coded subblocks in the current MB, for which + * tu data is sent. Maximum of 26 subblocks in the following + * order. 
+ * 16 4x4 luma, + * 2 2x2 chroma DC, + * 8 4x4 chroma, + */ + WORD32 b5_num_coded_sblks: 5; + + /** + * CSBP - 16 bits, 1 bit for each 4x4 + * for intra16x16 mb_type only ac coefficients are + */ + UWORD32 b16_luma_csbp: 16; + + /** + * CSBP - 16 bits, 1 bit for each 4x4 + * for intra16x16 mb_type only ac coefficients are + */ + UWORD32 b16_chroma_csbp: 8; +}inter_mb_t; + +#endif /* _IH264_STRUCTS_H_ */ diff --git a/common/ih264_trans_data.c b/common/ih264_trans_data.c new file mode 100755 index 0000000..a1231e6 --- /dev/null +++ b/common/ih264_trans_data.c @@ -0,0 +1,312 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_trans_data.c + * + * @brief + * Contains definition of global variables for H264 encoder + * + * @author + * Ittiam + * + * @remarks + * + ******************************************************************************* + */ + +#include "ih264_typedefs.h" +#include "ih264_trans_data.h" + +/*****************************************************************************/ +/* Extern global definitions */ +/*****************************************************************************/ + +/* + * Since we don't have a division operation in neon + * we will multiply by LCM of 16,6,10 and scale accordingly + * so care that to get the actual transform you need to divide by LCM + * LCM = 240 + */ + +const UWORD16 g_scal_coff_h264_4x4[16] ={ + 15,40,40,40, + 40,24,40,24, + 15,40,40,15, + 40,24,40,24}; + + + +const UWORD16 g_scal_coff_h264_8x8[16]= + { + 16, 15, 20, 15, + 15, 14, 19, 14, + 20, 19, 25, 19, + 15, 14, 19, 14 + }; +/* + * The scaling is by an 8x8 matrix, but due its 4x4 symmetry we can use + * a 4x4 matrix for scaling + * now since divide is to be avoided, we will compute 1/ values and scale it up + * to preserve information since our data is max 10 bit +1 sign bit we can shift a maximum of 21 bits up + * hence multiply the matrix as such +{16.000 15.059 20.227 15.059 +15.059 14.173 19.051 14.173 +20.227 19.051 25.600 19.051 +15.059 14.173 19.051 14.173}; +{512, 544, 405, 544, +544, 578, 430, 578, +405, 430, 320, 430, +544, 578, 430, 578};*/ + + +/** + ****************************************************************************** + * @brief Scale Table for quantizing 4x4 subblock. 
To quantize a given 4x4 DCT + * transformed block, the coefficient at index location (i,j) is scaled by one of + * the constants in this table and right shift the result by (QP_BITS_h264_4x4 + + * floor(qp/6)), here qp is the quantization parameter used to quantize the mb. + * + * input : qp%6, index location (i,j) + * output : scale constant. + * + * @remarks 16 constants for each index position of the subblock and 6 for each + * qp%6 in the range 0-5 inclusive. + ****************************************************************************** + */ +const UWORD16 gu2_quant_scale_matrix_4x4[96] = +{ + 13107, 8066, 13107, 8066, + 8066, 5243, 8066, 5243, + 13107, 8066, 13107, 8066, + 8066, 5243, 8066, 5243, + + 11916, 7490, 11916, 7490, + 7490, 4660, 7490, 4660, + 11916, 7490, 11916, 7490, + 7490, 4660, 7490, 4660, + + 10082, 6554, 10082, 6554, + 6554, 4194, 6554, 4194, + 10082, 6554, 10082, 6554, + 6554, 4194, 6554, 4194, + + 9362, 5825, 9362, 5825, + 5825, 3647, 5825, 3647, + 9362, 5825, 9362, 5825, + 5825, 3647, 5825, 3647, + + 8192, 5243, 8192, 5243, + 5243, 3355, 5243, 3355, + 8192, 5243, 8192, 5243, + 5243, 3355, 5243, 3355, + + 7282, 4559, 7282, 4559, + 4559, 2893, 4559, 2893, + 7282, 4559, 7282, 4559, + 4559, 2893, 4559, 2893, + +}; + +/** + ****************************************************************************** + * @brief Round Factor for quantizing subblock. While quantizing a given 4x4 DCT + * transformed block, the coefficient at index location (i,j) is scaled by one of + * the constants in the table gu2_forward_quant_scalar_4x4 and then right shift + * the result by (QP_BITS_h264_4x4 + floor(qp/6)). + * Before right shifting a round factor is added. + * The round factor can be any value [a * (1 << (QP_BITS_h264_4x4 + floor(qp/6)))] + * for 'a' lies in the range 0-0.5. + * Here qp is the quantization parameter used to quantize the mb. + * + * input : qp/6 + * output : round factor. 
+ * + * @remarks The round factor is constructed by setting a = 1/3 + * + * round factor constructed by setting a = 1/3 + { + 10922, 21845, 43690, 87381, + 174762, 349525, 699050, 1398101, + 2796202, + } + * + * round factor constructed by setting a = 0.49 + *{ + 16056, 32112, 64225, + 128450, 256901, 513802, + 1027604, 2055208, 4110417, + }; + + * round factor constructed by setting a = 0.5 + 16384, 32768, 65536, + 131072, 262144, 524288, + 1048576, 2097152, 4194304, + + ****************************************************************************** + */ +const UWORD32 gu4_forward_quant_round_factor_4x4[9] = +{ + 10922, 21845, 43690, 87381, + 174762, 349525, 699050, 1398101, + 2796202, +}; + + + +/** + ****************************************************************************** + * @brief Threshold Table. Quantizing the given DCT coefficient is done only if + * it exceeds the threshold value presented in this table. + * + * input : qp/6, qp%6, index location (i,j) + * output : Threshold constant. + * + * @remarks 16 constants for each index position of the subblock and 6 for each + * qp%6 in the range 0-5 inclusive and 9 for each qp/6 in the range 0-51. 
+ ****************************************************************************** + */ +const UWORD16 gu2_forward_quant_threshold_4x4[96] = +{ + 426, 693, 426, 693, + 693, 1066, 693, 1066, + 426, 693, 426, 693, + 693, 1066, 693, 1066, + + 469, 746, 469, 746, + 746, 1200, 746, 1200, + 469, 746, 469, 746, + 746, 1200, 746, 1200, + + 554, 853, 554, 853, + 853, 1333, 853, 1333, + 554, 853, 554, 853, + 853, 1333, 853, 1333, + + 597, 960, 597, 960, + 960, 1533, 960, 1533, + 597, 960, 597, 960, + 960, 1533, 960, 1533, + + 682, 1066, 682, 1066, + 1066, 1666, 1066, 1666, + 682, 1066, 682, 1066, + 1066, 1666, 1066, 1666, + + 767, 1226, 767, 1226, + 1226, 1933, 1226, 1933, + 767, 1226, 767, 1226, + 1226, 1933, 1226, 1933, +}; + +/** + ****************************************************************************** + * @brief Scale Table for quantizing 8x8 subblock. To quantize a given 8x8 DCT + * transformed block, the coefficient at index location (i,j) is scaled by one of + * the constants in this table and right shift the result by (QP_BITS_h264_8x8 + + * floor(qp/6)), here qp is the quantization parameter used to quantize the mb. + * + * input : qp%6, index location (i,j) + * output : scale constant. + * + * @remarks 64 constants for each index position of the subblock and 6 for each + * qp%6 in the range 0-5 inclusive. 
+ ****************************************************************************** + */ +const UWORD16 gu2_quant_scale_matrix_8x8 [384] = +{ + 13107, 12222, 16777, 12222, 13107, 12222, 16777, 12222, + 12222, 11428, 15481, 11428, 12222, 11428, 15481, 11428, + 16777, 15481, 20972, 15481, 16777, 15481, 20972, 15481, + 12222, 11428, 15481, 11428, 12222, 11428, 15481, 11428, + 13107, 12222, 16777, 12222, 13107, 12222, 16777, 12222, + 12222, 11428, 15481, 11428, 12222, 11428, 15481, 11428, + 16777, 15481, 20972, 15481, 16777, 15481, 20972, 15481, + 12222, 11428, 15481, 11428, 12222, 11428, 15481, 11428, + + 11916, 11058, 14980, 11058, 11916, 11058, 14980, 11058, + 11058, 10826, 14290, 10826, 11058, 10826, 14290, 10826, + 14980, 14290, 19174, 14290, 14980, 14290, 19174, 14290, + 11058, 10826, 14290, 10826, 11058, 10826, 14290, 10826, + 11916, 11058, 14980, 11058, 11916, 11058, 14980, 11058, + 11058, 10826, 14290, 10826, 11058, 10826, 14290, 10826, + 14980, 14290, 19174, 14290, 14980, 14290, 19174, 14290, + 11058, 10826, 14290, 10826, 11058, 10826, 14290, 10826, + + 10082, 9675, 12710, 9675, 10082, 9675, 12710, 9675, + 9675, 8943, 11985, 8943, 9675, 8943, 11985, 8943, + 12710, 11985, 15978, 11985, 12710, 11985, 15978, 11985, + 9675, 8943, 11985, 8943, 9675, 8943, 11985, 8943, + 10082, 9675, 12710, 9675, 10082, 9675, 12710, 9675, + 9675, 8943, 11985, 8943, 9675, 8943, 11985, 8943, + 12710, 11985, 15978, 11985, 12710, 11985, 15978, 11985, + 9675, 8943, 11985, 8943, 9675, 8943, 11985, 8943, + + 9362, 8931, 11984, 8931, 9362, 8931, 11984, 8931, + 8931, 8228, 11259, 8228, 8931, 8228, 11259, 8228, + 11984, 11259, 14913, 11259, 11984, 11259, 14913, 11259, + 8931, 8228, 11259, 8228, 8931, 8228, 11259, 8228, + 9362, 8931, 11984, 8931, 9362, 8931, 11984, 8931, + 8931, 8228, 11259, 8228, 8931, 8228, 11259, 8228, + 11984, 11259, 14913, 11259, 11984, 11259, 14913, 11259, + 8931, 8228, 11259, 8228, 8931, 8228, 11259, 8228, + + 8192, 7740, 10486, 7740, 8192, 7740, 10486, 7740, + 7740, 
7346, 9777, 7346, 7740, 7346, 9777, 7346, + 10486, 9777, 13159, 9777, 10486, 9777, 13159, 9777, + 7740, 7346, 9777, 7346, 7740, 7346, 9777, 7346, + 8192, 7740, 10486, 7740, 8192, 7740, 10486, 7740, + 7740, 7346, 9777, 7346, 7740, 7346, 9777, 7346, + 10486, 9777, 13159, 9777, 10486, 9777, 13159, 9777, + 7740, 7346, 9777, 7346, 7740, 7346, 9777, 7346, + + 7282, 6830, 9118, 6830, 7282, 6830, 9118, 6830, + 6830, 6428, 8640, 6428, 6830, 6428, 8640, 6428, + 9118, 8640, 11570, 8640, 9118, 8640, 11570, 8640, + 6830, 6428, 8640, 6428, 6830, 6428, 8640, 6428, + 7282, 6830, 9118, 6830, 7282, 6830, 9118, 6830, + 6830, 6428, 8640, 6428, 6830, 6428, 8640, 6428, + 9118, 8640, 11570, 8640, 9118, 8640, 11570, 8640, + 6830, 6428, 8640, 6428, 6830, 6428, 8640, 6428, + +}; + + +/** + ****************************************************************************** + * @brief Specification of QPc as a function of qPi + * + * input : qp luma + * output : qp chroma. + * + * @remarks Refer Table 8-15 of h264 specification. + ****************************************************************************** + */ +const UWORD8 gu1_qpc_fqpi[52] = +{ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 29, 30, + 31, 32, 32, 33, 34, 34, 35, 35, + 36, 36, 37, 37, 37, 38, 38, 38, + 39, 39, 39, 39, +}; diff --git a/common/ih264_trans_data.h b/common/ih264_trans_data.h new file mode 100755 index 0000000..dc77ae7 --- /dev/null +++ b/common/ih264_trans_data.h @@ -0,0 +1,125 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_trans_data.h + * + * @brief + * Contains declaration of global variables for H264 transform , qnat and inverse quant + * + * @author + * Ittiam + * + * @remarks + * + ******************************************************************************* + */ +#ifndef IH264_GLOBAL_DATA_H_ +#define IH264_GLOBAL_DATA_H_ + +/*****************************************************************************/ +/* Extern global declarations */ +/*****************************************************************************/ + +/* Scaling matrices for h264 quantization */ +extern const UWORD16 g_scal_coff_h264_4x4[16]; +extern const UWORD16 g_scal_coff_h264_8x8[16]; + + +/** + ****************************************************************************** + * @brief Scale Table for quantizing 4x4 subblock. To quantize a given 4x4 DCT + * transformed block, the coefficient at index location (i,j) is scaled by one of + * the constants in this table and right shift the result by (QP_BITS_h264_4x4 + + * floor(qp/6)), here qp is the quantization parameter used to quantize the mb. + * + * input : qp%6, index location (i,j) + * output : scale constant. + * + * @remarks 16 constants for each index position of the subblock and 6 for each + * qp%6 in the range 0-5 inclusive. 
+ ****************************************************************************** + */ +extern const UWORD16 gu2_quant_scale_matrix_4x4[96]; + +/** + ****************************************************************************** + * @brief Round Factor for quantizing subblock. While quantizing a given 4x4 DCT + * transformed block, the coefficient at index location (i,j) is scaled by one of + * the constants in the table gu2_forward_quant_scalar_4x4 and then right shift + * the result by (QP_BITS_h264_4x4 + floor(qp/6)). + * Before right shifting a round factor is added. + * The round factor can be any value [a * (1 << (QP_BITS_h264_4x4 + floor(qp/6)))] + * for 'a' lies in the range 0-0.5. + * Here qp is the quantization parameter used to quantize the mb. + * + * input : qp/6 + * output : round factor. + * + * @remarks The round factor is constructed by setting a = 1/3 + ****************************************************************************** + */ +extern const UWORD32 gu4_forward_quant_round_factor_4x4[9]; + +/** + ****************************************************************************** + * @brief Threshold Table. Quantizing the given DCT coefficient is done only if + * it exceeds the threshold value presented in this table. + * + * input : qp/6, qp%6, index location (i,j) + * output : Threshold constant. + * + * @remarks 16 constants for each index position of the subblock and 6 for each + * qp%6 in the range 0-5 inclusive and 9 for each qp/6 in the range 0-51. + ****************************************************************************** + */ +extern const UWORD16 gu2_forward_quant_threshold_4x4[96]; + +/** + ****************************************************************************** + * @brief Scale Table for quantizing 8x8 subblock. 
To quantize a given 8x8 DCT + * transformed block, the coefficient at index location (i,j) is scaled by one of + * the constants in this table and right shift the result by (QP_BITS_h264_8x8 + + * floor(qp/6)), here qp is the quantization parameter used to quantize the mb. + * + * input : qp%6, index location (i,j) + * output : scale constant. + * + * @remarks 64 constants for each index position of the subblock and 6 for each + * qp%6 in the range 0-5 inclusive. + ****************************************************************************** + */ +extern const UWORD16 gu2_quant_scale_matrix_8x8 [384]; + +/** + ****************************************************************************** + * @brief Specification of QPc as a function of qPi + * + * input : qp luma + * output : qp chroma. + * + * @remarks Refer Table 8-15 of h264 specification. + ****************************************************************************** + */ +extern const UWORD8 gu1_qpc_fqpi[52]; + + +#endif /* IH264_GLOBAL_DATA_H_ */ diff --git a/common/ih264_trans_macros.h b/common/ih264_trans_macros.h new file mode 100755 index 0000000..f114d0e --- /dev/null +++ b/common/ih264_trans_macros.h @@ -0,0 +1,124 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_trans_macros.h +* +* @brief +* The file contains definitions of macros that perform forward and inverse +* quantization +* +* @author +* Ittiam +* +* @remark +* None +* +******************************************************************************* +*/ + +#ifndef IH264_TRANS_MACROS_H_ +#define IH264_TRANS_MACROS_H_ + +/*****************************************************************************/ +/* Function Macros */ +/*****************************************************************************/ + +/** +****************************************************************************** + * @brief Macro to perform forward quantization. + * @description The value to be quantized is first compared with a threshold. + * If the value is less than the threshold, the quantization value is returned + * as zero else the value is quantized traditionally as per the rules of + * h264 specification +****************************************************************************** + */ +#define FWD_QUANT(i4_value, u4_abs_value, i4_sign, threshold, scale, rndfactor, qbits, u4_nnz) \ + {\ + if (i4_value < 0)\ + {\ + u4_abs_value = -i4_value;\ + i4_sign = -1;\ + }\ + else\ + {\ + u4_abs_value = i4_value;\ + i4_sign = 1;\ + }\ + if (u4_abs_value < threshold)\ + {\ + i4_value = 0;\ + }\ + else\ + {\ + u4_abs_value *= scale;\ + u4_abs_value += rndfactor;\ + u4_abs_value >>= qbits;\ + i4_value = u4_abs_value * i4_sign;\ + if (i4_value)\ + {\ + u4_nnz++;\ + }\ + }\ + } + +/** +****************************************************************************** + * @brief Macro to perform inverse quantization. 
+ * @remarks The value can also be de-quantized as + * if (u4_qp_div_6 < 4) + * { + * i4_value = (quant_scale * weight_scale * i4_value + (1 << (3-u4_qp_div_6))) + * i4_value >>= (4 - u4_qp_div_6) + * } + * else + * { + * i4_value = (quant_scale * weight_scale * i4_value) << (u4_qp_div_6 -4) + * } +****************************************************************************** + */ +#define INV_QUANT(i4_value, quant_scale, weight_scale, u4_qp_div_6, rndfactor, qbits)\ + {\ + i4_value *= quant_scale;\ + i4_value *= weight_scale;\ + i4_value += rndfactor;\ + i4_value <<= u4_qp_div_6;\ + i4_value >>= qbits;\ + } + +#define QUANT_H264(x,y,w,z,shft) (shft = ABS(x),\ + shft *= y,\ + shft += z,\ + shft = shft>>w,\ + shft = SIGNXY(shft,x)) + +#define IQUANT_H264(x,y,wscal,w,shft) (shft = x, \ + shft *=y, \ + shft *=wscal, \ + shft = shft<<w) + +#define IQUANT_lev_H264(x,y,wscal,add_f,w,shft) (shft = x, \ + shft *=y, \ + shft *=wscal, \ + shft+= add_f, \ + shft = shft>>w) + +#endif /* IH264_TRANS_MACROS_H_ */ diff --git a/common/ih264_trans_quant_itrans_iquant.h b/common/ih264_trans_quant_itrans_iquant.h new file mode 100755 index 0000000..83551aa --- /dev/null +++ b/common/ih264_trans_quant_itrans_iquant.h @@ -0,0 +1,232 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_trans_quant.h + * + * @brief + * Contains declarations for forward and inverse transform paths for H264 + * + * @author + * Ittiam + * + * @remarks + * + ******************************************************************************* + */ + +#ifndef IH264_TRANS_QUANT_H_ +#define IH264_TRANS_QUANT_H_ + +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ + + +typedef void ih264_resi_trans_dctrans_quant_ft(UWORD8*pu1_src, + UWORD8 *pu1_pred, + WORD16 *pi2_out, + WORD32 src_strd, + WORD32 pred_strd, + WORD32 dst_strd, + const UWORD16 *pu2_scale_mat, + const UWORD16 *pu2_thresh_mat, + UWORD32 u4_qbit, + UWORD32 u4_round_fact, + UWORD8 *pu1_nnz); + +typedef void ih264_idctrans_iquant_itrans_recon_ft(WORD16 *pi2_src, + UWORD8 *pu1_pred, + UWORD8 *pu1_out, + WORD32 src_strd, + WORD32 pred_strd, + WORD32 out_strd, + const UWORD16 *pu2_iscale_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 qp_div, + UWORD32 pi4_cntrl, + WORD32 *pi4_tmp); + + +/*Function prototype declarations*/ +typedef void ih264_resi_trans_quant_ft(UWORD8*pu1_src, + UWORD8 *pu1_pred, + WORD16 *pi2_out, + WORD32 src_strd, + WORD32 pred_strd, + const UWORD16 *pu2_scale_mat, + const UWORD16 *pu2_thresh_mat, + UWORD32 u4_qbit, + UWORD32 u4_round_fact, + UWORD8 *pu1_nnz, + WORD16 *pi2_alt_dc_addr); + +typedef void ih264_luma_16x16_resi_trans_dctrans_quant_ft(UWORD8 *pu1_src, + UWORD8 *pu1_pred, + WORD16 *pi2_out, + WORD32 src_strd, + WORD32 pred_strd, + WORD32 dst_strd, + const UWORD16 *pu2_scale_matrix, + const UWORD16 *pu2_threshold_matrix, + UWORD32 u4_qbits, + UWORD32 u4_round_factor, + UWORD8 
*pu1_nnz, + UWORD32 u4_dc_flag); + +typedef void ih264_chroma_8x8_resi_trans_dctrans_quant_ft(UWORD8 *pu1_src, + UWORD8 *pu1_pred, + WORD16 *pi2_out, + WORD32 src_strd, + WORD32 pred_strd, + WORD32 dst_strd, + const UWORD16 *pu2_scale_matrix, + const UWORD16 *pu2_threshold_matrix, + UWORD32 u4_qbits, + UWORD32 u4_round_factor, + UWORD8 *pu1_nnz); + +typedef void ih264_iquant_itrans_recon_ft(WORD16 *pi2_src, + UWORD8 *pu1_pred, + UWORD8 *pu1_out, + WORD32 pred_strd, + WORD32 out_strd, + const UWORD16 *pu2_iscale_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 qp_div, + WORD16 *pi2_tmp, + WORD32 iq_start_idx, + WORD16 *pi2_dc_ld_addr); + + +typedef void ih264_iquant_itrans_recon_chroma_ft(WORD16 *pi2_src, + UWORD8 *pu1_pred, + UWORD8 *pu1_out, + WORD32 pred_strd, + WORD32 out_strd, + const UWORD16 *pu2_iscal_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 u4_qp_div_6, + WORD16 *pi2_tmp, + WORD16 *pi2_dc_src); + + +typedef void ih264_luma_16x16_idctrans_iquant_itrans_recon_ft(WORD16 *pi2_src, + UWORD8 *pu1_pred, + UWORD8 *pu1_out, + WORD32 src_strd, + WORD32 pred_strd, + WORD32 out_strd, + const UWORD16 *pu2_iscale_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 qp_div, + UWORD32 pi4_cntrl, + UWORD32 u4_dc_trans_flag, + WORD32 *pi4_tmp); + +typedef void ih264_chroma_8x8_idctrans_iquant_itrans_recon_ft(WORD16 *pi2_src, + UWORD8 *pu1_pred, + UWORD8 *pu1_out, + WORD32 src_strd, + WORD32 pred_strd, + WORD32 out_strd, + const UWORD16 *pu2_iscale_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 qp_div, + UWORD32 pi4_cntrl, + WORD32 *pi4_tmp); + +typedef void ih264_ihadamard_scaling_ft(WORD16* pi2_src, + WORD16* pi2_out, + const UWORD16 *pu2_iscal_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 u4_qp_div_6, + WORD32* pi4_tmp); + +typedef void ih264_hadamard_quant_ft(WORD16 *pi2_src, WORD16 *pi2_dst, + const UWORD16 *pu2_scale_matrix, + const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits, + UWORD32 u4_round_factor,UWORD8 *pu1_nnz); + +ih264_resi_trans_quant_ft 
ih264_resi_trans_quant_4x4; +ih264_resi_trans_quant_ft ih264_resi_trans_quant_chroma_4x4; +ih264_resi_trans_quant_ft ih264_resi_trans_quant_8x8; +ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4; +ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_8x8; +ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4_dc; +ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_8x8_dc; +ih264_iquant_itrans_recon_chroma_ft ih264_iquant_itrans_recon_chroma_4x4; +ih264_iquant_itrans_recon_chroma_ft ih264_iquant_itrans_recon_chroma_4x4_dc; +ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_4x4; +ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_2x2_uv; +ih264_hadamard_quant_ft ih264_hadamard_quant_4x4; +ih264_hadamard_quant_ft ih264_hadamard_quant_2x2_uv; + +/*A9 Declarations*/ +ih264_resi_trans_quant_ft ih264_resi_trans_quant_4x4_a9; +ih264_resi_trans_quant_ft ih264_resi_trans_quant_chroma_4x4_a9; +ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4_a9; +ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_8x8_a9; +ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4_dc_a9; +ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_8x8_dc_a9; +ih264_iquant_itrans_recon_chroma_ft ih264_iquant_itrans_recon_chroma_4x4_a9; +ih264_iquant_itrans_recon_chroma_ft ih264_iquant_itrans_recon_chroma_4x4_dc_a9; +ih264_luma_16x16_resi_trans_dctrans_quant_ft ih264_luma_16x16_resi_trans_dctrans_quant_a9; +ih264_chroma_8x8_resi_trans_dctrans_quant_ft ih264_chroma_8x8_resi_trans_dctrans_quant_a9; +ih264_luma_16x16_idctrans_iquant_itrans_recon_ft ih264_luma_16x16_idctrans_iquant_itrans_recon_a9; +ih264_chroma_8x8_idctrans_iquant_itrans_recon_ft ih264_chroma_8x8_idctrans_iquant_itrans_recon_a9; +ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_4x4_a9; +ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_2x2_uv_a9; +ih264_hadamard_quant_ft ih264_hadamard_quant_4x4_a9; +ih264_hadamard_quant_ft ih264_hadamard_quant_2x2_uv_a9; + +/*Av8 Declarations*/ 
+ih264_resi_trans_quant_ft ih264_resi_trans_quant_4x4_av8; +ih264_resi_trans_quant_ft ih264_resi_trans_quant_chroma_4x4_av8; +ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4_av8; +ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_8x8_av8; +ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4_dc_av8; +ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_8x8_dc_av8; +ih264_iquant_itrans_recon_chroma_ft ih264_iquant_itrans_recon_chroma_4x4_av8; +ih264_iquant_itrans_recon_chroma_ft ih264_iquant_itrans_recon_chroma_4x4_dc_av8; +ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_4x4_av8; +ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_2x2_uv_av8; +ih264_hadamard_quant_ft ih264_hadamard_quant_4x4_av8; +ih264_hadamard_quant_ft ih264_hadamard_quant_2x2_uv_av8; + +/*SSSE3 Declarations*/ +ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4_ssse3; +ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_8x8_ssse3; +ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4_dc_ssse3; +ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_8x8_dc_ssse3; +ih264_iquant_itrans_recon_chroma_ft ih264_iquant_itrans_recon_chroma_4x4_dc_ssse3; +ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_4x4_ssse3; +ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_2x2_uv_ssse3; +/*SSSE42 Declarations*/ +ih264_resi_trans_quant_ft ih264_resi_trans_quant_4x4_sse42; +ih264_resi_trans_quant_ft ih264_resi_trans_quant_chroma_4x4_sse42; +ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4_sse42; +ih264_iquant_itrans_recon_chroma_ft ih264_iquant_itrans_recon_chroma_4x4_sse42; +ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_4x4_sse42; +ih264_hadamard_quant_ft ih264_hadamard_quant_4x4_sse42; +ih264_hadamard_quant_ft ih264_hadamard_quant_2x2_uv_sse42; + +#endif /* IH264_TRANS_QUANT_H_ */ diff --git a/common/ih264_typedefs.h b/common/ih264_typedefs.h new file mode 100755 index 0000000..8e4685a --- /dev/null +++ b/common/ih264_typedefs.h @@ -0,0 +1,64 @@ 
+/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_typedefs.h +* +* @brief +* Type definitions used in the code +* +* @author +* Ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef _IH264_TYPEDEFS_H_ +#define _IH264_TYPEDEFS_H_ + + +/*****************************************************************************/ +/* Unsigned data types */ +/*****************************************************************************/ +typedef unsigned char UWORD8; +typedef unsigned short UWORD16; +typedef unsigned int UWORD32; +typedef unsigned long long UWORD64; + + +/*****************************************************************************/ +/* Signed data types */ +/*****************************************************************************/ +typedef signed char WORD8; +typedef short WORD16; +typedef int WORD32; + + +/*****************************************************************************/ +/* Miscellaneous data types */ 
+/*****************************************************************************/ +typedef char CHAR; +typedef double DOUBLE; + +#endif /* _IH264_TYPEDEFS_H_ */ diff --git a/common/ih264_weighted_pred.c b/common/ih264_weighted_pred.c new file mode 100755 index 0000000..d5d73f2 --- /dev/null +++ b/common/ih264_weighted_pred.c @@ -0,0 +1,495 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/*****************************************************************************/ +/* */ +/* File Name : ih264_weighted_pred.c */ +/* */ +/* Description : Contains function definitions for weighted */ +/* prediction functions */ +/* */ +/* List of Functions : ih264_default_weighted_pred_luma() */ +/* ih264_default_weighted_pred_chroma() */ +/* ih264_weighted_pred_luma() */ +/* ih264_weighted_pred_chroma() */ +/* ih264_weighted_bipred_luma() */ +/* ih264_weighted_bipred_chroma() */ +/* */ +/* Issues / Problems : None */ +/* */ +/* Revision History : */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 07 01 2015 Kaushik Initial version */ +/* Senthoor */ +/* */ +/*****************************************************************************/ +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include "ih264_weighted_pred.h" + +/*****************************************************************************/ +/* Function definitions . */ +/*****************************************************************************/ +/*****************************************************************************/ +/* */ +/* Function Name : ih264_default_weighted_pred_luma */ +/* */ +/* Description : This function performs the default weighted prediction */ +/* as described in sec 8.4.2.3.1 titled "Default weighted */ +/* sample prediction process" for luma. The function gets */ +/* two ht x wd blocks, calculates their rounded-average and */ +/* stores it in the destination block. (ht,wd) can be */ +/* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). 
*/
/*  Inputs        : pu1_src1  - pointer to prediction block 1                */
/*                  pu1_src2  - pointer to prediction block 2                */
/*                  pu1_dst   - pointer to destination block                 */
/*                  src_strd1 - stride of source 1                           */
/*                  src_strd2 - stride of source 2                           */
/*                  dst_strd  - stride of destination                        */
/*                  ht, wd    - block height and width                       */
/*                                                                           */
/*  Outputs       : pu1_dst holds the rounded average of the two sources     */
/*****************************************************************************/
void ih264_default_weighted_pred_luma(UWORD8 *pu1_src1,
                                      UWORD8 *pu1_src2,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd1,
                                      WORD32 src_strd2,
                                      WORD32 dst_strd,
                                      WORD32 ht,
                                      WORD32 wd)
{
    WORD32 row, col;

    /* Turn the strides into end-of-row increments: the inner loop walks */
    /* the pointers one sample at a time across the row.                 */
    src_strd1 -= wd;
    src_strd2 -= wd;
    dst_strd -= wd;

    for(row = ht; row > 0; row--)
    {
        for(col = wd; col > 0; col--)
        {
            /* Rounded average of the two predictions */
            *pu1_dst++ = (UWORD8)((*pu1_src1++ + *pu1_src2++ + 1) >> 1);
        }
        pu1_src1 += src_strd1;
        pu1_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_default_weighted_pred_chroma                       */
/*                                                                           */
/*  Description   : Default weighted sample prediction (sec 8.4.2.3.1) for   */
/*                  interleaved chroma: stores the rounded average of two    */
/*                  ht x wd blocks. (ht,wd) can be (2,2), (4,2), (2,4),      */
/*                  (4,4), (8,4), (4,8) or (8,8).
*/
/*  Inputs        : pu1_src1, pu1_src2 - interleaved Cb/Cr prediction blocks */
/*                  pu1_dst    - interleaved destination block               */
/*                  src_strd1, src_strd2, dst_strd - strides in bytes        */
/*                  ht, wd     - block height and width in chroma samples    */
/*****************************************************************************/
void ih264_default_weighted_pred_chroma(UWORD8 *pu1_src1,
                                        UWORD8 *pu1_src2,
                                        UWORD8 *pu1_dst,
                                        WORD32 src_strd1,
                                        WORD32 src_strd2,
                                        WORD32 dst_strd,
                                        WORD32 ht,
                                        WORD32 wd)
{
    WORD32 row, col;

    /* Cb and Cr are interleaved, so each chroma sample occupies 2 bytes */
    wd <<= 1;

    src_strd1 -= wd;
    src_strd2 -= wd;
    dst_strd -= wd;

    for(row = ht; row > 0; row--)
    {
        for(col = wd; col > 0; col--)
        {
            *pu1_dst++ = (UWORD8)((*pu1_src1++ + *pu1_src2++ + 1) >> 1);
        }
        pu1_src1 += src_strd1;
        pu1_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_weighted_pred_luma                                 */
/*                                                                           */
/*  Description   : Weighted sample prediction (sec 8.4.2.3.2) for luma:     */
/*                  scales a ht x wd block by wt, rounds, adds ofst and      */
/*                  clips to 8 bit. (ht,wd) can be (4,4), (8,4), (4,8),      */
/*                  (8,8), (16,8), (8,16) or (16,16).
*/
/*  Inputs        : pu1_src  - source block                                  */
/*                  pu1_dst  - destination block                             */
/*                  src_strd, dst_strd - strides                             */
/*                  log_wd   - weight denominator (number of shift bits)     */
/*                  wt       - weight, packed in the low 16 bits             */
/*                  ofst     - offset, packed in the low 8 bits              */
/*                  ht, wd   - block dimensions                              */
/*****************************************************************************/
void ih264_weighted_pred_luma(UWORD8 *pu1_src,
                              UWORD8 *pu1_dst,
                              WORD32 src_strd,
                              WORD32 dst_strd,
                              WORD32 log_wd,
                              WORD32 wt,
                              WORD32 ofst,
                              WORD32 ht,
                              WORD32 wd)
{
    WORD32 row, col;

    /* Sign-extend the packed 16-bit weight and 8-bit offset */
    wt = (WORD16)(wt & 0xffff);
    ofst = (WORD8)(ofst & 0xff);

    src_strd -= wd;
    dst_strd -= wd;

    if(log_wd < 1)
    {
        /* No rounding shift: out = clip(wt * src + ofst) */
        for(row = ht; row > 0; row--)
        {
            for(col = wd; col > 0; col--)
            {
                *pu1_dst = CLIP_U8(wt * (*pu1_src) + ofst);
                pu1_src++;
                pu1_dst++;
            }
            pu1_src += src_strd;
            pu1_dst += dst_strd;
        }
    }
    else
    {
        /* Fold the rounding term and the offset into one constant so the   */
        /* inner loop is a single multiply-add-shift:                       */
        /* out = clip((wt*src + 2^(log_wd-1) + ofst*2^log_wd) >> log_wd)    */
        WORD32 round_ofst = (1 << (log_wd - 1)) + (ofst << log_wd);

        for(row = ht; row > 0; row--)
        {
            for(col = wd; col > 0; col--)
            {
                *pu1_dst = CLIP_U8((wt * (*pu1_src) + round_ofst) >> log_wd);
                pu1_src++;
                pu1_dst++;
            }
            pu1_src += src_strd;
            pu1_dst += dst_strd;
        }
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_weighted_pred_chroma                               */
/*                                                                           */
/*  Description   : Weighted sample prediction (sec 8.4.2.3.2) for           */
/*                  interleaved chroma: scales, rounds, offsets and clips    */
/*                  a ht x wd block. (ht,wd) can be (2,2), (4,2), (2,4),     */
/*                  (4,4), (8,4), (4,8) or (8,8).
*/
/*  Inputs        : pu1_src  - interleaved Cb/Cr source block                */
/*                  pu1_dst  - interleaved destination block                 */
/*                  src_strd, dst_strd - strides                             */
/*                  log_wd   - weight denominator (shift bits)               */
/*                  wt       - Cb weight in bits 0..15, Cr in bits 16..31    */
/*                  ofst     - Cb offset in bits 0..7, Cr in bits 8..15      */
/*                  ht, wd   - block dimensions in chroma samples            */
/*****************************************************************************/
void ih264_weighted_pred_chroma(UWORD8 *pu1_src,
                                UWORD8 *pu1_dst,
                                WORD32 src_strd,
                                WORD32 dst_strd,
                                WORD32 log_wd,
                                WORD32 wt,
                                WORD32 ofst,
                                WORD32 ht,
                                WORD32 wd)
{
    WORD32 row, col;
    WORD32 wt_u, wt_v;
    WORD32 ofst_u, ofst_v;

    /* Unpack per-plane weights (signed 16 bit) and offsets (signed 8 bit) */
    wt_u = (WORD16)(wt & 0xffff);
    wt_v = (WORD16)(wt >> 16);
    ofst_u = (WORD8)(ofst & 0xff);
    ofst_v = (WORD8)(ofst >> 8);

    /* Two bytes per interleaved chroma sample */
    src_strd -= wd << 1;
    dst_strd -= wd << 1;

    if(log_wd < 1)
    {
        for(row = ht; row > 0; row--)
        {
            for(col = wd; col > 0; col--)
            {
                pu1_dst[0] = CLIP_U8(wt_u * pu1_src[0] + ofst_u);
                pu1_dst[1] = CLIP_U8(wt_v * pu1_src[1] + ofst_v);
                pu1_src += 2;
                pu1_dst += 2;
            }
            pu1_src += src_strd;
            pu1_dst += dst_strd;
        }
    }
    else
    {
        /* Fold rounding term and offset into one constant per plane */
        ofst_u = (1 << (log_wd - 1)) + (ofst_u << log_wd);
        ofst_v = (1 << (log_wd - 1)) + (ofst_v << log_wd);

        for(row = ht; row > 0; row--)
        {
            for(col = wd; col > 0; col--)
            {
                pu1_dst[0] = CLIP_U8((wt_u * pu1_src[0] + ofst_u) >> log_wd);
                pu1_dst[1] = CLIP_U8((wt_v * pu1_src[1] + ofst_v) >> log_wd);
                pu1_src += 2;
                pu1_dst += 2;
            }
            pu1_src += src_strd;
            pu1_dst += dst_strd;
        }
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_weighted_bi_pred_luma                              */
/*                                                                           */
/*  Description   : Weighted bi-prediction as described in sec 8.4.2.3.2     */
/*                  titled "Weighted
sample */
/*                  prediction process" for luma: weights two ht x wd        */
/*                  blocks, adds them, rounds, offsets and clips to 8 bit.   */
/*                  (ht,wd) can be (4,4), (8,4), (4,8), (8,8), (16,8),       */
/*                  (8,16) or (16,16).                                       */
/*                                                                           */
/*  Inputs        : pu1_src1, pu1_src2 - the two prediction blocks           */
/*                  pu1_dst  - destination block                             */
/*                  src_strd1, src_strd2, dst_strd - strides                 */
/*                  log_wd   - weight denominator (shift bits)               */
/*                  wt1, wt2 - weights, packed in the low 16 bits            */
/*                  ofst1, ofst2 - offsets, packed in the low 8 bits         */
/*                  ht, wd   - block dimensions                              */
/*****************************************************************************/
void ih264_weighted_bi_pred_luma(UWORD8 *pu1_src1,
                                 UWORD8 *pu1_src2,
                                 UWORD8 *pu1_dst,
                                 WORD32 src_strd1,
                                 WORD32 src_strd2,
                                 WORD32 dst_strd,
                                 WORD32 log_wd,
                                 WORD32 wt1,
                                 WORD32 wt2,
                                 WORD32 ofst1,
                                 WORD32 ofst2,
                                 WORD32 ht,
                                 WORD32 wd)
{
    WORD32 row, col;
    WORD32 shft, round_ofst;

    /* Sign-extend the packed weights (16 bit) and offsets (8 bit) */
    ofst1 = (WORD8)(ofst1 & 0xff);
    ofst2 = (WORD8)(ofst2 & 0xff);
    wt1 = (WORD16)(wt1 & 0xffff);
    wt2 = (WORD16)(wt2 & 0xffff);

    /* Combined constant: 2^log_wd rounding term plus the averaged      */
    /* offset pre-shifted by (log_wd + 1), so the inner loop is a       */
    /* single multiply-add-shift per sample.                            */
    shft = log_wd + 1;
    round_ofst = (1 << log_wd) + (((ofst1 + ofst2 + 1) >> 1) << shft);

    src_strd1 -= wd;
    src_strd2 -= wd;
    dst_strd -= wd;

    for(row = ht; row > 0; row--)
    {
        for(col = wd; col > 0; col--)
        {
            *pu1_dst = CLIP_U8((wt1 * (*pu1_src1) + wt2 * (*pu1_src2)
                            + round_ofst) >> shft);
            pu1_src1++;
            pu1_src2++;
            pu1_dst++;
        }
        pu1_src1 += src_strd1;
        pu1_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}

+/*****************************************************************************/ +/* */ +/* Function Name : ih264_weighted_bi_pred_chroma */ +/* */ +/* Description : This function performs the weighted biprediction as */ +/* described in sec 8.4.2.3.2 titled "Weighted sample */ +/* prediction process" for chroma. The function gets two */ +/* ht x wd blocks, weights them, adds them, rounds off the */ +/* sum, offsets it, saturates it to unsigned 8-bit and */ +/* stores it in the destination block. (ht,wd) can be */ +/* (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8). */ +/* */ +/* Inputs : puc_src1 - Pointer to source 1 */ +/* puc_src2 - Pointer to source 2 */ +/* puc_dst - Pointer to destination */ +/* src_strd1 - stride for source 1 */ +/* src_strd2 - stride for source 2 */ +/* dst_strd2 - stride for destination */ +/* log_wd - number of bits to be rounded off */ +/* wt1 - weight values for u and v in source 1 */ +/* wt2 - weight values for u and v in source 2 */ +/* ofst1 - offset value for u and v in source 1 */ +/* ofst2 - offset value for u and v in source 2 */ +/* ht - height of the block */ +/* wd - width of the block */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 07 01 2015 Kaushik Initial Version */ +/* Senthoor */ +/* */ +/*****************************************************************************/ +void ih264_weighted_bi_pred_chroma(UWORD8 *pu1_src1, + UWORD8 *pu1_src2, + UWORD8 *pu1_dst, + WORD32 src_strd1, + WORD32 src_strd2, + WORD32 dst_strd, + WORD32 log_wd, + WORD32 wt1, + WORD32 wt2, + WORD32 ofst1, + WORD32 ofst2, + WORD32 ht, + WORD32 wd) +{ + WORD32 i, j; + WORD32 wt1_u, wt1_v, wt2_u, wt2_v; + WORD32 ofst1_u, ofst1_v, ofst2_u, ofst2_v; + WORD32 ofst_u, ofst_v; + WORD32 shft; + + ofst1_u = (WORD8)(ofst1 & 0xff); + ofst1_v = (WORD8)(ofst1 >> 8); + ofst2_u = (WORD8)(ofst2 & 0xff); + ofst2_v = (WORD8)(ofst2 >> 8); + wt1_u = (WORD16)(wt1 & 0xffff); + wt1_v = (WORD16)(wt1 >> 16); + 
wt2_u = (WORD16)(wt2 & 0xffff); + wt2_v = (WORD16)(wt2 >> 16); + ofst_u = (ofst1_u + ofst2_u + 1) >> 1; + ofst_v = (ofst1_v + ofst2_v + 1) >> 1; + + src_strd1 -= wd << 1; + src_strd2 -= wd << 1; + dst_strd -= wd << 1; + + shft = log_wd + 1; + ofst_u = (1 << log_wd) + (ofst_u << shft); + ofst_v = (1 << log_wd) + (ofst_v << shft); + + for(i = 0; i < ht; i++) + { + for(j = 0; j < wd; j++, pu1_src1++, pu1_src2++, pu1_dst++) + { + *pu1_dst = CLIP_U8((wt1_u * (*pu1_src1) + wt2_u * (*pu1_src2) + ofst_u) >> shft); + pu1_src1++; + pu1_src2++; + pu1_dst++; + *pu1_dst = CLIP_U8((wt1_v * (*pu1_src1) + wt2_v * (*pu1_src2) + ofst_v) >> shft); + } + pu1_src1 += src_strd1; + pu1_src2 += src_strd2; + pu1_dst += dst_strd; + } +} diff --git a/common/ih264_weighted_pred.h b/common/ih264_weighted_pred.h new file mode 100755 index 0000000..f9b93b0 --- /dev/null +++ b/common/ih264_weighted_pred.h @@ -0,0 +1,164 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264_weighted_pred.h +* +* @brief +* Declarations of functions used for weighted prediction +* +* @author +* Ittiam +* +* @par List of Functions: +* -ih264_default_weighted_pred_luma +* -ih264_default_weighted_pred_chroma +* -ih264_weighted_pred_luma +* -ih264_weighted_pred_chroma +* -ih264_weighted_bi_pred_luma +* -ih264_weighted_bi_pred_chroma +* -ih264_default_weighted_pred_luma_a9q +* -ih264_default_weighted_pred_chroma_a9q +* -ih264_weighted_pred_luma_a9q +* -ih264_weighted_pred_luma_a9q +* -ih264_weighted_bi_pred_luma_a9q +* -ih264_weighted_bi_pred_chroma_a9q +* -ih264_default_weighted_pred_luma_av8 +* -ih264_default_weighted_pred_chroma_av8 +* -ih264_weighted_pred_luma_av8 +* -ih264_weighted_pred_chroma_av8 +* -ih264_weighted_bi_pred_luma_av8 +* -ih264_weighted_bi_pred_chroma_av8 +* -ih264_default_weighted_pred_luma_sse42 +* -ih264_default_weighted_pred_chroma_sse42 +* -ih264_weighted_pred_luma_sse42 +* -ih264_weighted_pred_chroma_sse42 +* -ih264_weighted_bi_pred_luma_sse42 +* -ih264_weighted_bi_pred_chroma_sse42 +* +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef IH264_WEIGHTED_PRED_H_ +#define IH264_WEIGHTED_PRED_H_ + +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ +typedef void ih264_default_weighted_pred_ft(UWORD8 *puc_src1, + UWORD8 *puc_src2, + UWORD8 *puc_dst, + WORD32 src_strd1, + WORD32 src_strd2, + WORD32 dst_strd, + WORD32 ht, + WORD32 wd); + +typedef void ih264_weighted_pred_ft(UWORD8 *puc_src, + UWORD8 *puc_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 log_wd, + WORD32 wt, + WORD32 ofst, + WORD32 ht, + WORD32 wd); + +typedef void ih264_weighted_bi_pred_ft(UWORD8 *puc_src1, + UWORD8 *puc_src2, + UWORD8 
*puc_dst, + WORD32 src_strd1, + WORD32 src_strd2, + WORD32 dst_strd, + WORD32 log_wd, + WORD32 wt1, + WORD32 wt2, + WORD32 ofst1, + WORD32 ofst2, + WORD32 ht, + WORD32 wd); + +/* No NEON Declarations */ + +ih264_default_weighted_pred_ft ih264_default_weighted_pred_luma; + +ih264_default_weighted_pred_ft ih264_default_weighted_pred_chroma; + +ih264_weighted_pred_ft ih264_weighted_pred_luma; + +ih264_weighted_pred_ft ih264_weighted_pred_chroma; + +ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_luma; + +ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_chroma; + +/* A9 NEON Declarations */ + +ih264_default_weighted_pred_ft ih264_default_weighted_pred_luma_a9q; + +ih264_default_weighted_pred_ft ih264_default_weighted_pred_chroma_a9q; + +ih264_weighted_pred_ft ih264_weighted_pred_luma_a9q; + +ih264_weighted_pred_ft ih264_weighted_pred_chroma_a9q; + +ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_luma_a9q; + +ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_chroma_a9q; + + +/* AV8 NEON Declarations */ + +ih264_default_weighted_pred_ft ih264_default_weighted_pred_luma_av8; + +ih264_default_weighted_pred_ft ih264_default_weighted_pred_chroma_av8; + +ih264_weighted_pred_ft ih264_weighted_pred_luma_av8; + +ih264_weighted_pred_ft ih264_weighted_pred_chroma_av8; + +ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_luma_av8; + +ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_chroma_av8; + + +/* SSE42 Intrinsic Declarations */ + +ih264_default_weighted_pred_ft ih264_default_weighted_pred_luma_sse42; + +ih264_default_weighted_pred_ft ih264_default_weighted_pred_chroma_sse42; + +ih264_weighted_pred_ft ih264_weighted_pred_luma_sse42; + +ih264_weighted_pred_ft ih264_weighted_pred_chroma_sse42; + +ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_luma_sse42; + +ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_chroma_sse42; + +#endif /* IH264_WEIGHTED_PRED_H_ */ + +/** Nothing past this point */ diff --git a/common/ithread.c b/common/ithread.c new file mode 100755 index 
0000000..4ffb98a --- /dev/null +++ b/common/ithread.c @@ -0,0 +1,604 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/*****************************************************************************/ +/* */ +/* File Name : ithread.c */ +/* */ +/* Description : Contains abstraction for threads, mutex and semaphores*/ +/* */ +/* List of Functions : */ +/* */ +/* Issues / Problems : None */ +/* */ +/* Revision History : */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 07 09 2012 Harish Initial Version */ +/*****************************************************************************/ +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ +#include <string.h> +#include "ih264_typedefs.h" + +/* + * If the end target is bare metal, then there shall be no OS. + * In this case, the functions ithread_* used inside the h264 encoder library to assist multicore + * will not longer be functional. To resolve link issues, the functions are re-defined with no body. 
+ */ +#ifndef BAREMETAL + + +#include "ithread.h" +#include <sys/types.h> + + +#define UNUSED(x) ((void)(x)) + +#ifndef X86_MSVC +//#define PTHREAD_AFFINITY +//#define SYSCALL_AFFINITY + +#ifdef PTHREAD_AFFINITY +#define _GNU_SOURCE +#define __USE_GNU +#endif + +#include <pthread.h> +#include <sched.h> +#include <semaphore.h> +#include <unistd.h> +#ifdef PTHREAD_AFFINITY +#include <sys/prctl.h> +#endif + +#endif + +#ifdef X86_MSVC + +#include <windows.h> +#define SEM_MAX_COUNT 100 +#define SEM_INCREMENT_COUNT 1 + +UWORD32 ithread_get_handle_size(void) +{ + return (sizeof(HANDLE)); +} + +UWORD32 ithread_get_mutex_lock_size(void) +{ + return (sizeof(HANDLE)); +} + +WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument) +{ + HANDLE *ppv_thread_handle; + HANDLE thread_handle_value; + + UNUSED(attribute); + + if(0 == thread_handle) + return -1; + + ppv_thread_handle = (HANDLE *)thread_handle; + thread_handle_value = (void *)CreateThread + (NULL, /* Attributes */ + 1024*128, /* Stack i4_size */ + (LPTHREAD_START_ROUTINE)strt, /* Thread function */ + argument, /* Parameters */ + 0, /* Creation flags */ + NULL); /* Thread ID */ + *ppv_thread_handle = (HANDLE)thread_handle_value; + + return 0; +} + +WORD32 ithread_join(void *thread_handle, void ** val_ptr) +{ + HANDLE *ppv_thread_handle; + HANDLE thread_handle_value; + + UNUSED(val_ptr); + + if(0 == thread_handle) + return -1; + + ppv_thread_handle = (HANDLE *)thread_handle; + thread_handle_value = *ppv_thread_handle; + + if(WAIT_OBJECT_0 == WaitForSingleObject(thread_handle_value, INFINITE)) + { + CloseHandle(thread_handle_value); + } + + return 0; +} + +void ithread_exit(void *thread_handle) +{ + HANDLE *ppv_thread_handle; + HANDLE thread_handle_value; + DWORD thread_exit_code; + + if(0 == thread_handle) + return; + + ppv_thread_handle = (HANDLE *)thread_handle; + thread_handle_value = *ppv_thread_handle; + /* Get exit code for thread. 
If the return value is 0, means thread is busy */ + if( 0 != GetExitCodeThread(thread_handle_value, &thread_exit_code)) + { + TerminateThread(thread_handle_value, thread_exit_code); + } + + return; +} + +WORD32 ithread_get_mutex_struct_size(void) +{ + return (sizeof(HANDLE)); +} + +WORD32 ithread_mutex_init(void *mutex) +{ + HANDLE *ppv_mutex_handle; + HANDLE mutex_handle_value; + + if(0 == mutex) + return -1; + + ppv_mutex_handle = (HANDLE *)mutex; + mutex_handle_value = CreateSemaphore(NULL, 1, 1, NULL); + *ppv_mutex_handle = mutex_handle_value; + return 0; +} + +WORD32 ithread_mutex_destroy(void *mutex) +{ + HANDLE *ppv_mutex_handle; + HANDLE mutex_handle_value; + + if(0 == mutex) + return -1; + + ppv_mutex_handle = (HANDLE *)mutex; + mutex_handle_value = *ppv_mutex_handle; + CloseHandle(mutex_handle_value); + return 0; +} + +WORD32 ithread_mutex_lock(void *mutex) +{ + HANDLE *ppv_mutex_handle; + HANDLE mutex_handle_value; + DWORD result = 0; + + if(0 == mutex) + return -1; + + ppv_mutex_handle = (HANDLE *)mutex; + mutex_handle_value = *ppv_mutex_handle; + result = WaitForSingleObject(mutex_handle_value, INFINITE); + + if(WAIT_OBJECT_0 == result) + return 0; + + return 1; + +} + +WORD32 ithread_mutex_unlock(void *mutex) +{ + HANDLE *ppv_mutex_handle; + HANDLE mutex_handle_value; + DWORD result = 0; + + if(0 == mutex) + return -1; + + ppv_mutex_handle = (HANDLE *)mutex; + mutex_handle_value = *ppv_mutex_handle; + result = ReleaseSemaphore(mutex_handle_value, 1, NULL); + + if(0 == result) + return -1; + + return 0; +} + +void ithread_yield(void) { } + +void ithread_usleep(UWORD32 u4_time_us) +{ + UWORD32 u4_time_ms = u4_time_us / 1000; + Sleep(u4_time_ms); +} + +void ithread_msleep(UWORD32 u4_time_ms) +{ + Sleep(u4_time_ms); +} + +void ithread_sleep(UWORD32 u4_time) +{ + UWORD32 u4_time_ms = u4_time * 1000; + Sleep(u4_time_ms); +} + +UWORD32 ithread_get_sem_struct_size(void) +{ + return (sizeof(HANDLE)); +} + +WORD32 ithread_sem_init(void *sem,WORD32 
pshared,UWORD32 value) +{ + HANDLE *sem_handle = (HANDLE *)sem; + HANDLE sem_handle_value; + + if(0 == sem) + return -1; + + sem_handle_value = CreateSemaphore(NULL, /* Security Attribute*/ + value, /* Initial count */ + SEM_MAX_COUNT,/* Max value */ + NULL); /* Name, not used */ + *sem_handle = sem_handle_value; + return 0; +} + +WORD32 ithread_sem_post(void *sem) +{ + HANDLE *sem_handle = (HANDLE *)sem; + HANDLE sem_handle_value; + + if(0 == sem) + return -1; + + sem_handle_value = *sem_handle; + + /* Post on Semaphore by releasing the lock on mutex */ + if(ReleaseSemaphore(sem_handle_value, SEM_INCREMENT_COUNT, NULL)) + return 0; + + return -1; +} + +WORD32 ithread_sem_wait(void *sem) +{ + DWORD result = 0; + HANDLE *sem_handle = (HANDLE *)sem; + HANDLE sem_handle_value; + + if(0 == sem) + return -1; + + sem_handle_value = *sem_handle; + + /* Wait on Semaphore object infinitly */ + result = WaitForSingleObject(sem_handle_value, INFINITE); + + /* If lock on semaphore is acquired, return SUCCESS */ + if(WAIT_OBJECT_0 == result) + return 0; + + /* If call timeouts, return FAILURE */ + if(WAIT_TIMEOUT == result) + return -1; + + return 0; +} + +WORD32 ithread_sem_destroy(void *sem) +{ + HANDLE *sem_handle = (HANDLE *)sem; + HANDLE sem_handle_value; + + if(0 == sem) + return -1; + + sem_handle_value = *sem_handle; + + if(FALSE == CloseHandle(sem_handle_value) ) + { + return -1; + } + return 0; +} + +WORD32 ithread_set_affinity(WORD32 core_id) +{ + return 1; +} + +#else + +UWORD32 ithread_get_handle_size(void) +{ + return sizeof(pthread_t); +} + +UWORD32 ithread_get_mutex_lock_size(void) +{ + return sizeof(pthread_mutex_t); +} + + +WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument) +{ + UNUSED(attribute); + return pthread_create((pthread_t *)thread_handle, NULL,(void *(*)(void *)) strt, argument); +} + +WORD32 ithread_join(void *thread_handle, void ** val_ptr) +{ + UNUSED(val_ptr); + pthread_t *pthread_handle = (pthread_t 
*)thread_handle; + return pthread_join(*pthread_handle, NULL); +} + +void ithread_exit(void *val_ptr) +{ + return pthread_exit(val_ptr); +} + +WORD32 ithread_get_mutex_struct_size(void) +{ + return(sizeof(pthread_mutex_t)); +} +WORD32 ithread_mutex_init(void *mutex) +{ + return pthread_mutex_init((pthread_mutex_t *) mutex, NULL); +} + +WORD32 ithread_mutex_destroy(void *mutex) +{ + return pthread_mutex_destroy((pthread_mutex_t *) mutex); +} + +WORD32 ithread_mutex_lock(void *mutex) +{ + return pthread_mutex_lock((pthread_mutex_t *)mutex); +} + +WORD32 ithread_mutex_unlock(void *mutex) +{ + return pthread_mutex_unlock((pthread_mutex_t *)mutex); +} + +void ithread_yield(void) +{ + sched_yield(); +} + +void ithread_sleep(UWORD32 u4_time) +{ + usleep(u4_time * 1000 * 1000); +} + +void ithread_msleep(UWORD32 u4_time_ms) +{ + usleep(u4_time_ms * 1000); +} + +void ithread_usleep(UWORD32 u4_time_us) +{ + usleep(u4_time_us); +} + +UWORD32 ithread_get_sem_struct_size(void) +{ + return(sizeof(sem_t)); +} + + +WORD32 ithread_sem_init(void *sem,WORD32 pshared,UWORD32 value) +{ + return sem_init((sem_t *)sem,pshared,value); +} + +WORD32 ithread_sem_post(void *sem) +{ + return sem_post((sem_t *)sem); +} + + +WORD32 ithread_sem_wait(void *sem) +{ + return sem_wait((sem_t *)sem); +} + + +WORD32 ithread_sem_destroy(void *sem) +{ + return sem_destroy((sem_t *)sem); +} + +void ithread_set_name(CHAR *pc_thread_name) +{ + +#ifndef WIN32 +#ifndef QNX +#ifndef IOS + UNUSED(pc_thread_name); +//prctl(PR_SET_NAME, (unsigned long)pu1_thread_name, 0, 0, 0); +#endif +#endif +#endif + +} +WORD32 ithread_set_affinity(WORD32 core_id) +{ +#ifdef PTHREAD_AFFINITY + cpu_set_t cpuset; + int num_cores = sysconf(_SC_NPROCESSORS_ONLN); + pthread_t cur_thread = pthread_self(); + + if (core_id >= num_cores) + return -1; + + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + + return pthread_setaffinity_np(cur_thread, sizeof(cpu_set_t), &cpuset); + +#elif SYSCALL_AFFINITY + WORD32 i4_sys_res; + 
UNUSED(core_id); + + pid_t pid = gettid(); + + + i4_sys_res = syscall(__NR_sched_setaffinity, pid, sizeof(i4_mask), &i4_mask); + if (i4_sys_res) + { + //WORD32 err; + //err = errno; + //perror("Error in setaffinity syscall PERROR : "); + //LOG_ERROR("Error in the syscall setaffinity: mask=0x%x err=0x%x", i4_mask, i4_sys_res); + return -1; + } +#else + UNUSED(core_id); +#endif + return 1; + +} +#endif + +#else + +UWORD32 ithread_get_handle_size(void) +{ + return sizeof(int); +} + +UWORD32 ithread_get_mutex_lock_size(void) +{ + return sizeof(int); +} + +UWORD32 ithread_get_cond_size(void) +{ + return(sizeof(int)); +} +WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument) +{ + return 0; +} + +WORD32 ithread_join(void *thread_handle, void ** val_ptr) +{ + return 0; +} + +void ithread_exit(void *val_ptr) +{ + return; +} + +WORD32 ithread_mutex_init(void *mutex) +{ + return 0; +} + +WORD32 ithread_mutex_destroy(void *mutex) +{ + return 0; +} + +WORD32 ithread_mutex_lock(void *mutex) +{ + return 0; +} + +WORD32 ithread_mutex_unlock(void *mutex) +{ + return 0; +} + +void ithread_yield(void) +{ + return; +} + +void ithread_sleep(UWORD32 u4_time_in_us) +{ + return; +} + +void ithread_usleep(UWORD32 u4_time_us) +{ + return; +} + +UWORD32 ithread_get_sem_strcut_size(void) +{ + return(sizeof(int)); +} + + +WORD32 ithread_sem_init(void *sem,WORD32 pshared,UWORD32 value) +{ + return 0; +} + +WORD32 ithread_sem_post(void *sem) +{ + return 0; +} + + +WORD32 ithread_sem_wait(void *sem) +{ + return 0; +} + +WORD32 ithread_sem_destroy(void *sem) +{ + return 0; +} + +void ithread_set_name(UWORD8 *pu1_thread_name) +{ + return; +} + +void ithread_condition_init(void *condition) +{ + return; +} + +void ithread_condition_signal(void * condition) +{ + return; +} + + + +void ithread_condition_wait(void *condition,void *mutex) +{ + return; +} + +WORD32 ithread_set_affinity(WORD32 core_id) +{ + return 1; +} +#endif diff --git a/common/ithread.h 
b/common/ithread.h new file mode 100755 index 0000000..f926f83 --- /dev/null +++ b/common/ithread.h @@ -0,0 +1,104 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/*****************************************************************************/ +/* */ +/* File Name : ithread.h */ +/* */ +/* Description : This file contains all the necessary structure and */ +/* enumeration definitions needed for the Application */ +/* Program Interface(API) of the */ +/* Thread Abstraction Layer */ +/* */ +/* List of Functions : ithread_get_handle_size */ +/* ithread_get_mutex_lock_size */ +/* ithread_create */ +/* ithread_exit */ +/* ithread_join */ +/* ithread_get_mutex_struct_size */ +/* ithread_mutex_init */ +/* ithread_mutex_destroy */ +/* ithread_mutex_lock */ +/* ithread_mutex_unlock */ +/* ithread_yield */ +/* ithread_sleep */ +/* ithread_msleep */ +/* ithread_usleep */ +/* ithread_get_sem_struct_size */ +/* ithread_sem_init */ +/* ithread_sem_post */ +/* ithread_sem_wait */ +/* ithread_sem_destroy */ +/* ithread_set_affinity */ +/* */ +/* Issues / Problems : None */ +/* */ +/* Revision History : */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 06 09 2012 
Harish Initial Version */ +/* */ +/*****************************************************************************/ + +#ifndef _ITHREAD_H_ +#define _ITHREAD_H_ + +UWORD32 ithread_get_handle_size(void); + +UWORD32 ithread_get_mutex_lock_size(void); + +WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument); + +void ithread_exit(void *val_ptr); + +WORD32 ithread_join(void *thread_id, void ** val_ptr); + +WORD32 ithread_get_mutex_struct_size(void); + +WORD32 ithread_mutex_init(void *mutex); + +WORD32 ithread_mutex_destroy(void *mutex); + +WORD32 ithread_mutex_lock(void *mutex); + +WORD32 ithread_mutex_unlock(void *mutex); + +void ithread_yield(void); + +void ithread_sleep(UWORD32 u4_time); + +void ithread_msleep(UWORD32 u4_time_ms); + +void ithread_usleep(UWORD32 u4_time_us); + +UWORD32 ithread_get_sem_struct_size(void); + +WORD32 ithread_sem_init(void *sem,WORD32 pshared,UWORD32 value); + +WORD32 ithread_sem_post(void *sem); + +WORD32 ithread_sem_wait(void *sem); + +WORD32 ithread_sem_destroy(void *sem); + +WORD32 ithread_set_affinity(WORD32 core_id); + +void ithread_set_name(CHAR *pc_thread_name); + +#endif /* _ITHREAD_H_ */ diff --git a/common/mips/ih264_platform_macros.h b/common/mips/ih264_platform_macros.h new file mode 100755 index 0000000..d098372 --- /dev/null +++ b/common/mips/ih264_platform_macros.h @@ -0,0 +1,102 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_platform_macros.h +* +* @brief +* Platform specific Macro definitions used in the codec +* +* @author +* Ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ + + +#ifndef _IH264_PLATFORM_MACROS_H_ +#define _IH264_PLATFORM_MACROS_H_ + +#define CLIP_U8(x) CLIP3(0, 255, (x)) +#define CLIP_S8(x) CLIP3(-128, 127, (x)) + +#define CLIP_U10(x) CLIP3(0, 1023, (x)) +#define CLIP_S10(x) CLIP3(-512, 511, (x)) + +#define CLIP_U12(x) CLIP3(0, 4095, (x)) +#define CLIP_S12(x) CLIP3(-2048, 2047, (x)) + +#define CLIP_U16(x) CLIP3(0, 65535, (x)) +#define CLIP_S16(x) CLIP3(-32768, 32767, (x)) + +#define MEM_ALIGN16 __attribute__ ((aligned (16))) + +#define SHL(x,y) (((y) < 32) ? ((x) << (y)) : 0) +#define SHR(x,y) (((y) < 32) ? 
((x) >> (y)) : 0) + +#define SHR_NEG(val,shift) ((shift>0)?(val>>shift):(val<<(-shift))) +#define SHL_NEG(val,shift) ((shift<0)?(val>>(-shift)):(val<<shift)) + + +#define ITT_BIG_ENDIAN(x) ((x << 24)) | \ + ((x & 0x0000ff00) << 8) | \ + ((x & 0x00ff0000) >> 8) | \ + ((UWORD32)x >> 24); + + +#define NOP(nop_cnt) {UWORD32 nop_i; for (nop_i = 0; nop_i < nop_cnt; nop_i++);} + +#define PLD(a) + +static __inline UWORD32 CLZ(UWORD32 u4_word) +{ + if(u4_word) + return(__builtin_clz(u4_word)); + else + return 32; +} + +static __inline UWORD32 CTZ(UWORD32 u4_word) +{ + if(0 == u4_word) + return 31; + else + { + unsigned int index; + index = __builtin_ctz(u4_word); + return (UWORD32)index; + } +} + +#define DATA_SYNC() + +#define INLINE + +#define PREFETCH(ptr, type) + +#define MEM_ALIGN8 __attribute__ ((aligned (8))) +#define MEM_ALIGN16 __attribute__ ((aligned (16))) +#define MEM_ALIGN32 __attribute__ ((aligned (32))) + +#endif /* _IH264_PLATFORM_MACROS_H_ */ diff --git a/common/x86/ih264_chroma_intra_pred_filters_ssse3.c b/common/x86/ih264_chroma_intra_pred_filters_ssse3.c new file mode 100755 index 0000000..45101a4 --- /dev/null +++ b/common/x86/ih264_chroma_intra_pred_filters_ssse3.c @@ -0,0 +1,433 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_chroma_intra_pred_filters_ssse3.c +* +* @brief +* Contains function definitions for chroma intra prediction filters in x86 +* intrinsics +* +* @author +* Ittiam +* +* @par List of Functions: +* -ih264_intra_pred_chroma_8x8_mode_horz_ssse3 +* -ih264_intra_pred_chroma_8x8_mode_vert_ssse3 +* -ih264_intra_pred_chroma_8x8_mode_plane_ssse3 +* +* @remarks +* None +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> +#include <stddef.h> +#include <string.h> + +/* User include files */ +#include "ih264_defs.h" +#include "ih264_typedefs.h" +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include "ih264_intra_pred_filters.h" + + +/*****************************************************************************/ +/* Chroma Intra prediction 8x8 filters */ +/*****************************************************************************/ +/** +******************************************************************************* +* +* ih264_intra_pred_chroma_8x8_mode_horz_ssse3 +* +* @brief +* Perform Intra prediction for chroma_8x8 mode:Horizontal +* +* @par Description: +* Perform Intra prediction for chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2 +* +* @param[in] pu1_src +* UWORD8 pointer to the source containing alternate U and V samples +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination with alternate U and V samples +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride 
+* +* @param[in] ngbr_avail +* availability of neighbouring pixels(Not used in this function) +* +* @returns +* +* @remarks +* None +* +****************************************************************************** +*/ +void ih264_intra_pred_chroma_8x8_mode_horz_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + + UWORD8 *pu1_left; /* Pointer to start of top predictors */ + WORD32 dst_strd2; + + __m128i left_16x8b, left_sh_16x8b; + __m128i row1_16x8b, row2_16x8b; + __m128i const_14_15_16x8b; + + UNUSED(src_strd); + UNUSED(ngbr_avail); + + pu1_left = pu1_src + 2 * BLK8x8SIZE - 2; + + left_16x8b = _mm_loadu_si128((__m128i *)(pu1_left - 14)); + + const_14_15_16x8b = _mm_set1_epi16(0x0f0e); + + dst_strd2 = dst_strd << 1; + left_sh_16x8b = _mm_slli_si128(left_16x8b, 2); + row1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b); + row2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); + _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b); + + left_16x8b = _mm_slli_si128(left_16x8b, 4); + left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4); + pu1_dst += dst_strd2; + row1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b); + row2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); + _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b); + + left_16x8b = _mm_slli_si128(left_16x8b, 4); + left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4); + pu1_dst += dst_strd2; + row1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b); + row2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); + _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b); + + left_16x8b = _mm_slli_si128(left_16x8b, 4); + left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4); + pu1_dst += dst_strd2; + row1_16x8b = _mm_shuffle_epi8(left_16x8b, 
const_14_15_16x8b); + row2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); + _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b); +} + +/** +******************************************************************************* +* +* ih264_intra_pred_chroma_8x8_mode_vert_ssse3 +* +* @brief +* Perform Intra prediction for chroma_8x8 mode:vertical +* +* @par Description: +* Perform Intra prediction for chroma_8x8 mode:vertical ,described in sec 8.3.4.3 +* +* @param[in] pu1_src +* UWORD8 pointer to the source containing alternate U and V samples +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination with alternate U and V samples +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] ngbr_avail +* availability of neighbouring pixels(Not used in this function) +* +* @returns +* +* @remarks +* None +* +******************************************************************************* +*/ +void ih264_intra_pred_chroma_8x8_mode_vert_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_top; /* Pointer to start of top predictors */ + WORD32 dst_strd2; + + __m128i top_16x8b; + + UNUSED(src_strd); + UNUSED(ngbr_avail); + + pu1_top = pu1_src + 2 * BLK8x8SIZE + 2; + + top_16x8b = _mm_loadu_si128((__m128i *)pu1_top); + + dst_strd2 = dst_strd << 1; + _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b); + + pu1_dst += dst_strd2; + _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b); + + pu1_dst += dst_strd2; + _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b); + + pu1_dst += dst_strd2; + _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b); 
+} + +/** +******************************************************************************* +* +* ih264_intra_pred_chroma_8x8_mode_plane_ssse3 +* +* @brief +* Perform Intra prediction for chroma_8x8 mode:PLANE +* +* @par Description: +* Perform Intra prediction for chroma_8x8 mode:PLANE ,described in sec 8.3.4.4 +* +* @param[in] pu1_src +* UWORD8 pointer to the source containing alternate U and V samples +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination with alternate U and V samples +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] ngbr_avail +* availability of neighbouring pixels(Not used in this function) +* +* @returns +* +* @remarks +* None +* +****************************************************************************** +*/ +void ih264_intra_pred_chroma_8x8_mode_plane_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_left, *pu1_top; + WORD32 a_u, a_v, b_u, b_v, c_u, c_v; + + __m128i mul_8x16b, shuffle_8x16b; + + UNUSED(src_strd); + UNUSED(ngbr_avail); + + pu1_top = pu1_src + MB_SIZE + 2; + pu1_left = pu1_src + MB_SIZE - 2; + + mul_8x16b = _mm_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4); + shuffle_8x16b = _mm_setr_epi16(0xff00, 0xff02, 0xff04, 0xff06, + 0xff01, 0xff03, 0xff05, 0xff07); + + //calculating a, b and c + { + WORD32 h_u, h_v, v_u, v_v; + WORD32 temp1, temp2; + + __m128i h_val1_16x8b, h_val2_16x8b; + __m128i h_val1_8x16b, h_val2_8x16b, h_val_4x32b; + __m128i v_val1_16x8b, v_val2_16x8b; + __m128i v_val1_8x16b, v_val2_8x16b, v_val_4x32b; + __m128i hv_val_4x32b; + + h_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top + 8)); + h_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top - 2)); + v_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 14)); + v_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 4)); + + // reversing the order + h_val2_16x8b = _mm_shufflelo_epi16(h_val2_16x8b, 0x1b); + v_val1_16x8b = 
_mm_shufflelo_epi16(v_val1_16x8b, 0x1b); + + // separating u and v and 8-bit to 16-bit conversion + h_val1_8x16b = _mm_shuffle_epi8(h_val1_16x8b, shuffle_8x16b); + h_val2_8x16b = _mm_shuffle_epi8(h_val2_16x8b, shuffle_8x16b); + v_val1_8x16b = _mm_shuffle_epi8(v_val1_16x8b, shuffle_8x16b); + v_val2_8x16b = _mm_shuffle_epi8(v_val2_16x8b, shuffle_8x16b); + + h_val1_8x16b = _mm_sub_epi16(h_val1_8x16b, h_val2_8x16b); + v_val1_8x16b = _mm_sub_epi16(v_val1_8x16b, v_val2_8x16b); + + h_val_4x32b = _mm_madd_epi16(mul_8x16b, h_val1_8x16b); + v_val_4x32b = _mm_madd_epi16(mul_8x16b, v_val1_8x16b); + + temp1 = _mm_extract_epi16(h_val1_16x8b, 3); + temp2 = _mm_extract_epi16(v_val1_16x8b, 3); + + hv_val_4x32b = _mm_hadd_epi32(h_val_4x32b, v_val_4x32b); + + a_u = ((temp1 & 0xff) + (temp2 & 0xff)) << 4; + a_v = ((temp1 >> 8) + (temp2 >> 8)) << 4; + + h_u = _mm_extract_epi16(hv_val_4x32b, 0); + h_v = _mm_extract_epi16(hv_val_4x32b, 2); + v_u = _mm_extract_epi16(hv_val_4x32b, 4); + v_v = _mm_extract_epi16(hv_val_4x32b, 6); + + h_u = (h_u << 16) >> 15; // sign-extension and multiplication by 2 + h_v = (h_v << 16) >> 15; + v_u = (v_u << 16) >> 15; + v_v = (v_v << 16) >> 15; + + b_u = ((h_u << 4) + h_u + 32) >> 6; + b_v = ((h_v << 4) + h_v + 32) >> 6; + c_u = ((v_u << 4) + v_u + 32) >> 6; + c_v = ((v_v << 4) + v_v + 32) >> 6; + } + //using a, b and c to compute the fitted plane values + { + __m128i const_8x16b, c2_8x16b; + __m128i res1_l_8x16b, res1_h_8x16b; + __m128i res2_l_8x16b, res2_h_8x16b; + __m128i res1_sh_l_8x16b, res1_sh_h_8x16b, res1_16x8b; + __m128i res2_sh_l_8x16b, res2_sh_h_8x16b, res2_16x8b; + + WORD32 b_u2, b_v2, b_u3, b_v3; + WORD32 const_u, const_v; + WORD32 dst_strd2; + + const_u = a_u - (c_u << 1) - c_u + 16; + const_v = a_v - (c_v << 1) - c_v + 16; + + b_u2 = b_u << 1; + b_v2 = b_v << 1; + b_u3 = b_u + b_u2; + b_v3 = b_v + b_v2; + + const_8x16b = _mm_setr_epi16(const_u, const_v, const_u, const_v, const_u, const_v, const_u, const_v); + res1_l_8x16b = 
_mm_setr_epi16(-b_u3, -b_v3, -b_u2, -b_v2, -b_u, -b_v, 0, 0); + //contains {-b*3, -b*2, -b*1, b*0} + res1_h_8x16b = _mm_setr_epi16(b_u, b_v, b_u2, b_v2, b_u3, b_v3, b_u << 2, b_v << 2); + //contains {b*1, b*2, b*3, b*4} + c2_8x16b = _mm_setr_epi16(c_u, c_v, c_u, c_v, c_u, c_v, c_u, c_v); + + // rows 1, 2 + res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, const_8x16b); + res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, const_8x16b); + res2_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b); + res2_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b); + + res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5); + res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5); + res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5); + res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5); + + dst_strd2 = dst_strd << 1; + c2_8x16b = _mm_slli_epi16(c2_8x16b, 1); + + res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b); + res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b); + + _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b); + + // rows 3, 4 + res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b); + res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b); + res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b); + res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b); + + res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5); + res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5); + res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5); + res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5); + + pu1_dst += dst_strd2; + + res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b); + res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b); + + _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b); + + // rows 5, 6 + res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b); + res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b); + res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b); 
+ res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b); + + res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5); + res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5); + res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5); + res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5); + + pu1_dst += dst_strd2; + + res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b); + res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b); + + _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b); + + // rows 7, 8 + res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b); + res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b); + res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b); + res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b); + + res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5); + res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5); + res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5); + res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5); + + pu1_dst += dst_strd2; + + res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b); + res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b); + + _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b); + + } +} diff --git a/common/x86/ih264_deblk_chroma_ssse3.c b/common/x86/ih264_deblk_chroma_ssse3.c new file mode 100755 index 0000000..a36447a --- /dev/null +++ b/common/x86/ih264_deblk_chroma_ssse3.c @@ -0,0 +1,1087 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/*****************************************************************************/ +/* */ +/* File Name : ih264_deblk_chroma_ssse3.c */ +/* */ +/* Description : Contains function definitions for deblocking */ +/* */ +/* List of Functions : ih264_deblk_chroma_vert_bs4_ssse3() */ +/* ih264_deblk_chroma_horz_bs4_ssse3() */ +/* ih264_deblk_chroma_vert_bslt4_ssse3() */ +/* ih264_deblk_chroma_horz_bslt4_ssse3() */ +/* ih264_deblk_chroma_vert_bs4_mbaff_ssse3() */ +/* ih264_deblk_chroma_vert_bslt4_mbaff_ssse3() */ +/* */ +/* Issues / Problems : None */ +/* */ +/* Revision History : */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 12 02 2015 Naveen Kumar P Added chrom deblocking ssse3 */ +/* intrinsics */ +/* */ +/*****************************************************************************/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_platform_macros.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264_macros.h" + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ 
+ +/*****************************************************************************/ +/* */ +/* Function Name : ih264_deblk_chroma_vert_bs4_ssse3() */ +/* */ +/* Description : This function performs filtering of a chroma block */ +/* vertical edge when the boundary strength is set to 4 in */ +/* high profile. */ +/* */ +/* Inputs : pu1_src - pointer to the src sample q0 of U */ +/* src_strd - source stride */ +/* alpha_cb - alpha value for the boundary in U */ +/* beta_cb - beta value for the boundary in U */ +/* alpha_cr - alpha value for the boundary in V */ +/* beta_cr - beta value for the boundary in V */ +/* */ +/* Globals : None */ +/* */ +/* Processing : This operation is described in Sec. 8.7.2.4 under the */ +/* title "Filtering process for edges for bS equal to 4" in */ +/* ITU T Rec H.264 with alpha and beta values different in */ +/* U and V. */ +/* */ +/* Outputs : None */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 12 02 2015 Naveen Kumar P Initial version */ +/* */ +/*****************************************************************************/ +void ih264_deblk_chroma_vert_bs4_ssse3(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 alpha_cb, + WORD32 beta_cb, + WORD32 alpha_cr, + WORD32 beta_cr) +{ + UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ + WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; + WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; + __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh; + __m128i temp1, temp2, temp3, temp4; + + __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8; + __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16; + __m128i flag1, flag2; + __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8; + __m128i zero = _mm_setzero_si128(); + __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2; + + /* Load and transpose the pixel values */ + linea = 
_mm_loadl_epi64((__m128i *)(pu1_src_uv - 4)); + lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd)); + linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd)); + lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd)); + linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd)); + linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd)); + lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd)); + lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd)); + + temp1 = _mm_unpacklo_epi16(linea, lineb); + temp2 = _mm_unpacklo_epi16(linec, lined); + temp3 = _mm_unpacklo_epi16(linee, linef); + temp4 = _mm_unpacklo_epi16(lineg, lineh); + + p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2); + p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4); + q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2); + q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4); + + p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16); + p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16); + q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16); + q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16); + /* End of transpose */ + + q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero); + q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero); + p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero); + p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero); + + diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 + diff = _mm_abs_epi16(diff); + alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); + flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); + + diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 + diff = _mm_abs_epi16(diff); + beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); + flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); + + diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 + diff = _mm_abs_epi16(diff); + flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); + + temp1 = _mm_slli_epi16(p1_uv_8x16, 1); + temp2 
= _mm_add_epi16(p0_uv_8x16, q1_uv_8x16); + temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); + temp1 = _mm_add_epi16(temp1, temp2); + p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2); + + temp1 = _mm_slli_epi16(q1_uv_8x16, 1); + temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16); + temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); + temp1 = _mm_add_epi16(temp1, temp2); + q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2); + + q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero); + q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero); + p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero); + p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero); + + diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 + diff = _mm_abs_epi16(diff); + alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); + flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); + + diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 + diff = _mm_abs_epi16(diff); + beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); + flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); + + diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 + diff = _mm_abs_epi16(diff); + flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); + + temp1 = _mm_slli_epi16(p1_uv_8x16, 1); + temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16); + temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); + temp1 = _mm_add_epi16(temp1, temp2); + p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2); + + temp1 = _mm_slli_epi16(q1_uv_8x16, 1); + temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16); + temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); + temp1 = _mm_add_epi16(temp1, temp2); + q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2); + + p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2); + q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2); + + flag1 = _mm_packs_epi16(flag1, flag2); + + p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8, + _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); + p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1); + p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2); + + 
q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8, + _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); + q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1); + q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2); + + /* Inverse-transpose and store back */ + temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8); + temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8); + temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8); + temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8); + + linea = _mm_unpacklo_epi32(temp1, temp3); + lineb = _mm_srli_si128(linea, 8); + linec = _mm_unpackhi_epi32(temp1, temp3); + lined = _mm_srli_si128(linec, 8); + linee = _mm_unpacklo_epi32(temp2, temp4); + linef = _mm_srli_si128(linee, 8); + lineg = _mm_unpackhi_epi32(temp2, temp4); + lineh = _mm_srli_si128(lineg, 8); + + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh); + +} + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_deblk_chroma_horz_bs4_ssse3() */ +/* */ +/* Description : This function performs filtering of a chroma block */ +/* horizontal edge when the boundary strength is set to 4 */ +/* in high profile. 
*/ +/* */ +/* Inputs : pu1_src - pointer to the src sample q0 of U */ +/* src_strd - source stride */ +/* alpha_cb - alpha value for the boundary in U */ +/* beta_cb - beta value for the boundary in U */ +/* alpha_cr - alpha value for the boundary in V */ +/* beta_cr - beta value for the boundary in V */ +/* */ +/* Globals : None */ +/* */ +/* Processing : This operation is described in Sec. 8.7.2.4 under the */ +/* title "Filtering process for edges for bS equal to 4" in */ +/* ITU T Rec H.264 with alpha and beta values different in */ +/* U and V. */ +/* */ +/* Outputs : None */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 12 02 2015 Naveen Kumar P Initial version */ +/* */ +/*****************************************************************************/ +void ih264_deblk_chroma_horz_bs4_ssse3(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 alpha_cb, + WORD32 beta_cb, + WORD32 alpha_cr, + WORD32 beta_cr) +{ + UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ + WORD16 i16_posP1, i16_posP0, i16_posQ1; + + UWORD8 *pu1_HorzPixelUV; /*! 
< Pointer to the first pixel of the boundary */ + WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; + WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; + __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8; + __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16; + __m128i flag1, flag2; + __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8; + __m128i zero = _mm_setzero_si128(); + __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2; + __m128i temp1, temp2; + + pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1); + + i16_posQ1 = src_strd; + i16_posP0 = src_strd; + i16_posP1 = 0; + + q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv)); + q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1)); + p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1)); + p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0)); + + q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero); + q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero); + p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero); + p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero); + + diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 + diff = _mm_abs_epi16(diff); + alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); + flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); + + diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 + diff = _mm_abs_epi16(diff); + beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); + flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); + + diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 + diff = _mm_abs_epi16(diff); + flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); + + temp1 = _mm_slli_epi16(p1_uv_8x16, 1); + temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16); + temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); + temp1 = _mm_add_epi16(temp1, temp2); + p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2); + + temp1 = _mm_slli_epi16(q1_uv_8x16, 1); + temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16); + temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); + 
temp1 = _mm_add_epi16(temp1, temp2); + q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2); + + q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero); + q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero); + p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero); + p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero); + + diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 + diff = _mm_abs_epi16(diff); + alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); + flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); + + diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 + diff = _mm_abs_epi16(diff); + beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); + flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); + + diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 + diff = _mm_abs_epi16(diff); + flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); + + temp1 = _mm_slli_epi16(p1_uv_8x16, 1); + temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16); + temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); + temp1 = _mm_add_epi16(temp1, temp2); + p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2); + + temp1 = _mm_slli_epi16(q1_uv_8x16, 1); + temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16); + temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); + temp1 = _mm_add_epi16(temp1, temp2); + q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2); + + p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2); + q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2); + + flag1 = _mm_packs_epi16(flag1, flag2); + + p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8, + _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); + p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1); + p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2); + _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1); + + q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8, + _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); + q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1); + q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2); + _mm_storeu_si128((__m128i *)(pu1_src_uv), 
q0_uv_8x16_1); + +} + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_deblk_chroma_vert_bslt4_ssse3() */ +/* */ +/* Description : This function performs filtering of a chroma block */ +/* vertical edge when the boundary strength is less than 4 */ +/* in high profile. */ +/* */ +/* Inputs : pu1_src - pointer to the src sample q0 of U */ +/* src_strd - source stride */ +/* alpha_cb - alpha value for the boundary in U */ +/* beta_cb - beta value for the boundary in U */ +/* alpha_cr - alpha value for the boundary in V */ +/* beta_cr - beta value for the boundary in V */ +/* u4_bs - packed Boundary strength array */ +/* pu1_cliptab_cb - tc0_table for U */ +/* pu1_cliptab_cr - tc0_table for V */ +/* */ +/* Globals : None */ +/* */ +/* Processing : This operation is described in Sec. 8.7.2.3 under the */ +/* title "Filtering process for edges for bS less than 4" */ +/* in ITU T Rec H.264 with alpha and beta values different */ +/* in U and V. 
*/ +/* */ +/* Outputs : None */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 12 02 2015 Naveen Kumar P Initial version */ +/* */ +/*****************************************************************************/ +void ih264_deblk_chroma_vert_bslt4_ssse3(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 alpha_cb, + WORD32 beta_cb, + WORD32 alpha_cr, + WORD32 beta_cr, + UWORD32 u4_bs, + const UWORD8 *pu1_cliptab_cb, + const UWORD8 *pu1_cliptab_cr) +{ + UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ + UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; + WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; + WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; + __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh; + __m128i temp1, temp2, temp3, temp4; + + __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8; + __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16; + __m128i flag_bs, flag1, flag2; + __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro; + __m128i zero = _mm_setzero_si128(); + __m128i C0_uv_8x16; + __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2; + + u1_Bs0 = (u4_bs >> 24) & 0xff; + u1_Bs1 = (u4_bs >> 16) & 0xff; + u1_Bs2 = (u4_bs >> 8) & 0xff; + u1_Bs3 = (u4_bs >> 0) & 0xff; + + flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2, + u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1, + u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0); + flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s + flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask + + /* Load and transpose the pixel values */ + linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4)); + lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd)); + linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd)); + lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd)); + linee = 
_mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd)); + linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd)); + lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd)); + lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd)); + + temp1 = _mm_unpacklo_epi16(linea, lineb); + temp2 = _mm_unpacklo_epi16(linec, lined); + temp3 = _mm_unpacklo_epi16(linee, linef); + temp4 = _mm_unpacklo_epi16(lineg, lineh); + + p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2); + p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4); + q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2); + q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4); + + p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16); + p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16); + q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16); + q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16); + /* End of transpose */ + + q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero); + q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero); + p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero); + p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero); + + diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 + diff = _mm_abs_epi16(diff); + alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); + flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); + + diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 + diff = _mm_abs_epi16(diff); + beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); + flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); + + diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 + diff = _mm_abs_epi16(diff); + flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); + + diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16); + diff = _mm_slli_epi16(diff, 2); + diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16); + diff = _mm_add_epi16(diff, diff1); + diff = _mm_add_epi16(diff, _mm_set1_epi16(4)); + in_macro = _mm_srai_epi16(diff, 3); + + C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], 
pu1_cliptab_cb[u1_Bs1], + pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], + pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0], + pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]); + + C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1)); + + in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3 + C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16); + in_macro = _mm_max_epi16(C0_uv_8x16, in_macro); + + p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro); + q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro); + + q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero); + q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero); + p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero); + p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero); + + diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 + diff = _mm_abs_epi16(diff); + alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); + flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); + + diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 + diff = _mm_abs_epi16(diff); + beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); + flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); + + diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 + diff = _mm_abs_epi16(diff); + flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); + + diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16); + diff = _mm_slli_epi16(diff, 2); + diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16); + diff = _mm_add_epi16(diff, diff1); + diff = _mm_add_epi16(diff, _mm_set1_epi16(4)); + in_macro = _mm_srai_epi16(diff, 3); + + C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], + pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], + pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2], + pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]); + + C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1)); + + in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3 + C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16); + in_macro = _mm_max_epi16(C0_uv_8x16, in_macro); + + 
p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro); + q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro); + + p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2); + q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2); + + flag1 = _mm_packs_epi16(flag1, flag2); + flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions) + + p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8, + _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); + p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1); + p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2); + + q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8, + _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); + q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1); + q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2); + + /* Inverse-transpose and store back */ + temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8); + temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8); + temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8); + temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8); + + linea = _mm_unpacklo_epi32(temp1, temp3); + lineb = _mm_srli_si128(linea, 8); + linec = _mm_unpackhi_epi32(temp1, temp3); + lined = _mm_srli_si128(linec, 8); + linee = _mm_unpacklo_epi32(temp2, temp4); + linef = _mm_srli_si128(linee, 8); + lineg = _mm_unpackhi_epi32(temp2, temp4); + lineh = _mm_srli_si128(lineg, 8); + + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh); + +} + +/*****************************************************************************/ +/* */ +/* 
Function Name : ih264_deblk_chroma_horz_bslt4_ssse3() */ +/* */ +/* Description : This function performs filtering of a chroma block */ +/* horizontal edge when the boundary strength is less than */ +/* 4 in high profile. */ +/* */ +/* Inputs : pu1_src - pointer to the src sample q0 of U */ +/* src_strd - source stride */ +/* alpha_cb - alpha value for the boundary in U */ +/* beta_cb - beta value for the boundary in U */ +/* alpha_cr - alpha value for the boundary in V */ +/* beta_cr - beta value for the boundary in V */ +/* u4_bs - packed Boundary strength array */ +/* pu1_cliptab_cb - tc0_table for U */ +/* pu1_cliptab_cr - tc0_table for V */ +/* */ +/* Globals : None */ +/* */ +/* Processing : This operation is described in Sec. 8.7.2.3 under the */ +/* title "Filtering process for edges for bS less than 4" */ +/* in ITU T Rec H.264 with alpha and beta values different */ +/* in U and V. */ +/* */ +/* Outputs : None */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 12 02 2015 Naveen Kumar P Initial version */ +/* */ +/*****************************************************************************/ +void ih264_deblk_chroma_horz_bslt4_ssse3(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 alpha_cb, + WORD32 beta_cb, + WORD32 alpha_cr, + WORD32 beta_cr, + UWORD32 u4_bs, + const UWORD8 *pu1_cliptab_cb, + const UWORD8 *pu1_cliptab_cr) +{ + UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ + WORD16 i16_posP1, i16_posP0, i16_posQ1; + UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; + + UWORD8 *pu1_HorzPixelUV; /*! 
< Pointer to the first pixel of the boundary */ + WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; + WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; + __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8; + __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16; + __m128i flag_bs, flag1, flag2; + __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro; + __m128i zero = _mm_setzero_si128(); + __m128i C0_uv_8x16; + __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2; + + pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1); + + i16_posQ1 = src_strd; + i16_posP0 = src_strd; + i16_posP1 = 0; + + u1_Bs0 = (u4_bs >> 24) & 0xff; + u1_Bs1 = (u4_bs >> 16) & 0xff; + u1_Bs2 = (u4_bs >> 8) & 0xff; + u1_Bs3 = (u4_bs >> 0) & 0xff; + + flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2, + u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1, + u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0); + flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s + flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask + + q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv)); + q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1)); + p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1)); + p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0)); + + q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero); + q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero); + p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero); + p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero); + + diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 + diff = _mm_abs_epi16(diff); + alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); + flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); + + diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 + diff = _mm_abs_epi16(diff); + beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); + flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); + + diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 + 
diff = _mm_abs_epi16(diff); + flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); + + diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16); + diff = _mm_slli_epi16(diff, 2); + diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16); + diff = _mm_add_epi16(diff, diff1); + diff = _mm_add_epi16(diff, _mm_set1_epi16(4)); + in_macro = _mm_srai_epi16(diff, 3); + + C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], + pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], + pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0], + pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]); + + C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1)); + + in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3 + C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16); + in_macro = _mm_max_epi16(C0_uv_8x16, in_macro); + + p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro); + q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro); + + q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero); + q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero); + p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero); + p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero); + + diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 + diff = _mm_abs_epi16(diff); + alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); + flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); + + diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 + diff = _mm_abs_epi16(diff); + beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); + flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); + + diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 + diff = _mm_abs_epi16(diff); + flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); + + diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16); + diff = _mm_slli_epi16(diff, 2); + diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16); + diff = _mm_add_epi16(diff, diff1); + diff = _mm_add_epi16(diff, _mm_set1_epi16(4)); + in_macro = _mm_srai_epi16(diff, 3); + + C0_uv_8x16 = 
_mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], + pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], + pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2], + pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]); + + C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1)); + + in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3 + C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16); + in_macro = _mm_max_epi16(C0_uv_8x16, in_macro); + + p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro); + q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro); + + p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2); + q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2); + + flag1 = _mm_packs_epi16(flag1, flag2); + flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions) + + p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8, + _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); + p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1); + p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2); + _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1); + + q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8, + _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); + q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1); + q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2); + _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1); + +} + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_deblk_chroma_vert_bs4_mbaff_ssse3() */ +/* */ +/* Description : This function performs filtering of a chroma block */ +/* vertical edge when boundary strength is set to 4 in high */ +/* profile. 
*/ +/* */ +/* Inputs : pu1_src - pointer to the src sample q0 of U */ +/* src_strd - source stride */ +/* alpha_cb - alpha value for the boundary in U */ +/* beta_cb - beta value for the boundary in U */ +/* alpha_cr - alpha value for the boundary in V */ +/* beta_cr - beta value for the boundary in V */ +/* u4_bs - packed Boundary strength array */ +/* pu1_cliptab_cb - tc0_table for U */ +/* pu1_cliptab_cr - tc0_table for V */ +/* */ +/* Globals : None */ +/* */ +/* Processing : When the function is called twice, this operation is as */ +/* described in Sec. 8.7.2.4 under the title "Filtering */ +/* process for edges for bS equal to 4" in ITU T Rec H.264 */ +/* with alpha and beta values different in U and V. */ +/* */ +/* Outputs : None */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 12 02 2015 Naveen Kumar P Initial version */ +/* */ +/*****************************************************************************/ +void ih264_deblk_chroma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 alpha_cb, + WORD32 beta_cb, + WORD32 alpha_cr, + WORD32 beta_cr) +{ + UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ + WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; + WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; + __m128i linea, lineb, linec, lined; + __m128i temp1, temp2; + + __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8; + __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16; + __m128i flag1; + __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8; + __m128i zero = _mm_setzero_si128(); + __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2; + + /* Load and transpose the pixel values */ + linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4)); + lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd)); + linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd)); + lined = 
_mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd)); + + temp1 = _mm_unpacklo_epi16(linea, lineb); + temp2 = _mm_unpacklo_epi16(linec, lined); + + p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2); + p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8); + q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2); + q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8); + /* End of transpose */ + + q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero); + q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero); + p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero); + p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero); + + diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 + diff = _mm_abs_epi16(diff); + alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); + flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); + + diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 + diff = _mm_abs_epi16(diff); + beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); + flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); + + diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 + diff = _mm_abs_epi16(diff); + flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); + + temp1 = _mm_slli_epi16(p1_uv_8x16, 1); + temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16); + temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); + temp1 = _mm_add_epi16(temp1, temp2); + p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2); + + temp1 = _mm_slli_epi16(q1_uv_8x16, 1); + temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16); + temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); + temp1 = _mm_add_epi16(temp1, temp2); + q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2); + + p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1); + q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1); + + flag1 = _mm_packs_epi16(flag1, flag1); + + p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8, + _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); + p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1); + p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2); + + q0_uv_8x16_1 = 
_mm_and_si128(q0_uv_16x8, + _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); + q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1); + q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2); + + /* Inverse-transpose and store back */ + temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8); + temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8); + + linea = _mm_unpacklo_epi32(temp1, temp2); + lineb = _mm_srli_si128(linea, 8); + linec = _mm_unpackhi_epi32(temp1, temp2); + lined = _mm_srli_si128(linec, 8); + + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined); + +} + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_deblk_chroma_vert_bslt4_mbaff_ssse3() */ +/* */ +/* Description : This function performs filtering of a chroma block */ +/* vertical edge when boundary strength is less than 4 in */ +/* high profile. */ +/* */ +/* Inputs : pu1_src - pointer to the src sample q0 of U */ +/* src_strd - source stride */ +/* alpha_cb - alpha value for the boundary in U */ +/* beta_cb - beta value for the boundary in U */ +/* alpha_cr - alpha value for the boundary in V */ +/* beta_cr - beta value for the boundary in V */ +/* u4_bs - packed Boundary strength array */ +/* pu1_cliptab_cb - tc0_table for U */ +/* pu1_cliptab_cr - tc0_table for V */ +/* */ +/* Globals : None */ +/* */ +/* Processing : When the function is called twice, this operation is as */ +/* described in Sec. 8.7.2.4 under the title "Filtering */ +/* process for edges for bS less than 4" in ITU T Rec H.264 */ +/* with alpha and beta values different in U and V. 
*/ +/* */ +/* Outputs : None */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 12 02 2015 Naveen Kumar P Initial version */ +/* */ +/*****************************************************************************/ +void ih264_deblk_chroma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 alpha_cb, + WORD32 beta_cb, + WORD32 alpha_cr, + WORD32 beta_cr, + UWORD32 u4_bs, + const UWORD8 *pu1_cliptab_cb, + const UWORD8 *pu1_cliptab_cr) +{ + UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ + UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; + WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; + WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; + __m128i linea, lineb, linec, lined; + __m128i temp1, temp2; + + __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8; + __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16; + __m128i flag_bs, flag1; + __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro; + __m128i zero = _mm_setzero_si128(); + __m128i C0_uv_8x16; + __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2; + + u1_Bs0 = (u4_bs >> 24) & 0xff; + u1_Bs1 = (u4_bs >> 16) & 0xff; + u1_Bs2 = (u4_bs >> 8) & 0xff; + u1_Bs3 = (u4_bs >> 0) & 0xff; + + flag_bs = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2, + u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0); + flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s + flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask + + /* Load and transpose the pixel values */ + linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4)); + lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd)); + linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd)); + lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd)); + + temp1 = _mm_unpacklo_epi16(linea, lineb); + temp2 = _mm_unpacklo_epi16(linec, lined); + + 
p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2); + p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8); + q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2); + q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8); + /* End of transpose */ + + q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero); + q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero); + p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero); + p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero); + + diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 + diff = _mm_abs_epi16(diff); + alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); + flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); + + diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 + diff = _mm_abs_epi16(diff); + beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); + flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); + + diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 + diff = _mm_abs_epi16(diff); + flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); + + diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16); + diff = _mm_slli_epi16(diff, 2); + diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16); + diff = _mm_add_epi16(diff, diff1); + diff = _mm_add_epi16(diff, _mm_set1_epi16(4)); + in_macro = _mm_srai_epi16(diff, 3); + + C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], + pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2], + pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], + pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]); + + C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1)); + + in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3 + C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16); + in_macro = _mm_max_epi16(C0_uv_8x16, in_macro); + + p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro); + q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro); + + p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1); + q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1); + + flag1 = _mm_packs_epi16(flag1, flag1); + flag1 = 
_mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions) + + p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8, + _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); + p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1); + p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2); + + q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8, + _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); + q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1); + q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2); + + /* Inverse-transpose and store back */ + temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8); + temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8); + + linea = _mm_unpacklo_epi32(temp1, temp2); + lineb = _mm_srli_si128(linea, 8); + linec = _mm_unpackhi_epi32(temp1, temp2); + lined = _mm_srli_si128(linec, 8); + + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined); + +} + diff --git a/common/x86/ih264_deblk_luma_ssse3.c b/common/x86/ih264_deblk_luma_ssse3.c new file mode 100755 index 0000000..440d5f0 --- /dev/null +++ b/common/x86/ih264_deblk_luma_ssse3.c @@ -0,0 +1,2012 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/*****************************************************************************/ +/* */ +/* File Name : ih264_deblk_luma_ssse3.c */ +/* */ +/* Description : Contains function definitions for deblocking */ +/* */ +/* List of Functions : ih264_deblk_luma_vert_bs4_ssse3() */ +/* ih264_deblk_luma_horz_bs4_ssse3() */ +/* ih264_deblk_luma_vert_bslt4_ssse3() */ +/* ih264_deblk_luma_horz_bslt4_ssse3() */ +/* ih264_deblk_luma_vert_bs4_mbaff_ssse3() */ +/* ih264_deblk_luma_vert_bslt4_mbaff_ssse3() */ +/* */ +/* Issues / Problems : None */ +/* */ +/* Revision History : */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 12 02 2015 Naveen Kumar P Added luma deblocking ssse3 */ +/* intrinsics */ +/* */ +/*****************************************************************************/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_platform_macros.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264_macros.h" + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_deblk_luma_vert_bs4_ssse3() */ +/* */ +/* Description : This function performs filtering of a luma block */ +/* vertical edge when the boundary strength is set to 4. 
*/
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to the src sample q0                  */
/*                  src_strd - source stride                                 */
/*                  alpha    - alpha value for the boundary                  */
/*                  beta     - beta value for the boundary                   */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
/*                  title "Filtering process for edges for bS equal to 4" in */
/*                  ITU T Rec H.264.                                         */
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Added luma deblocking ssse3          */
/*                                      intrinsics                           */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_luma_vert_bs4_ssse3(UWORD8 *pu1_src,
                                     WORD32 src_strd,
                                     WORD32 alpha,
                                     WORD32 beta)
{
    __m128i zero = _mm_setzero_si128();
    /* 16 rows x 8 columns around the edge, one register per column after
       transpose: p3..p0 to the left of the edge, q0..q3 to the right */
    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
    /* 16-bit widened copies of the above (8 pixels per register) */
    __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
    __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
    /* "weak" filtered p0/q0 (used when only the basic conditions hold) */
    __m128i q0_16x8_1;
    __m128i p0_16x8_1;
    /* "strong" filtered p0..p2/q0..q2 (used when the extra conditions hold) */
    __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
    __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
    __m128i temp1, temp2, temp3, temp4, temp5, temp6;
    __m128i Alpha_8x16, Beta_8x16;
    /* per-pixel 0x00/0xFF masks for the four filtering decisions */
    __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
    __m128i const_val2_16x8 = _mm_set1_epi16(2);
    __m128i line1, line2, line3, line4, line5, line6, line7, line8;

    Alpha_8x16 = _mm_set1_epi16(alpha);
    Beta_8x16 = _mm_set1_epi16(beta);

    /* Load rows 0-7 of the 8 pixels straddling the vertical edge */
    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));

    /* 8x8 byte transpose, stage 1: interleave rows pairwise */
    temp1 = _mm_unpacklo_epi8(line1, line2);
    temp2 = _mm_unpacklo_epi8(line3, line4);
    temp3 = _mm_unpacklo_epi8(line5, line6);
    temp4 = _mm_unpacklo_epi8(line7, line8);

    line1 = _mm_unpacklo_epi16(temp1, temp2);
    line2 = _mm_unpackhi_epi16(temp1, temp2);
    line3 = _mm_unpacklo_epi16(temp3, temp4);
    line4 = _mm_unpackhi_epi16(temp3, temp4);

    /* Each register now holds two adjacent columns of rows 0-7 */
    p1_8x16 = _mm_unpacklo_epi32(line1, line3);
    p0_8x16 = _mm_unpackhi_epi32(line1, line3);
    q0_8x16 = _mm_unpacklo_epi32(line2, line4);
    q1_8x16 = _mm_unpackhi_epi32(line2, line4);

    /* Load rows 8-15 and transpose them the same way */
    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd));
    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd));
    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd));
    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd));
    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd));
    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd));
    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd));
    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd));

    temp1 = _mm_unpacklo_epi8(line1, line2);
    temp2 = _mm_unpacklo_epi8(line3, line4);
    temp3 = _mm_unpacklo_epi8(line5, line6);
    temp4 = _mm_unpacklo_epi8(line7, line8);

    line1 = _mm_unpacklo_epi16(temp1, temp2);
    line2 = _mm_unpackhi_epi16(temp1, temp2);
    line3 = _mm_unpacklo_epi16(temp3, temp4);
    line4 = _mm_unpackhi_epi16(temp3, temp4);

    temp1 = _mm_unpacklo_epi32(line1, line3);
    temp2 = _mm_unpackhi_epi32(line1, line3);
    temp3 = _mm_unpacklo_epi32(line2, line4);
    temp4 = _mm_unpackhi_epi32(line2, line4);

    /* Merge halves: one register per column, all 16 rows */
    p3_16x8 = _mm_unpacklo_epi64(p1_8x16, temp1);
    p2_16x8 = _mm_unpackhi_epi64(p1_8x16, temp1);
    q2_16x8 = _mm_unpacklo_epi64(q1_8x16, temp4);
    q3_16x8 = _mm_unpackhi_epi64(q1_8x16, temp4);
    p1_16x8 = _mm_unpacklo_epi64(p0_8x16, temp2);
    p0_16x8 = _mm_unpackhi_epi64(p0_8x16, temp2);
    q0_16x8 = _mm_unpacklo_epi64(q0_8x16, temp3);
    q1_16x8 = _mm_unpackhi_epi64(q0_8x16, temp3);

    /* Cond1 (ABS(p0 - q0) < alpha); |a-b| built from two saturating subs */
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag1_16x8 = _mm_packs_epi16(temp2, temp1);

    /* Cond2 (ABS(q1 - q0) < beta) */
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* Cond3 (ABS(p1 - p0) < beta) */
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    /* flag1 = (ABS(p0 - q0) < alpha) && (ABS(q1 - q0) < beta)
             && (ABS(p1 - p0) < beta) : filter-this-pixel mask */
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* Strong-filter gate: (ABS(p0 - q0) < ((alpha >> 2) + 2)) */
    temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);
    Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
    Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* flag3: strong filter on the p side, (ABS(p2 - p0) < beta) */
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
    flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);

    /* flag4: strong filter on the q side, (ABS(q2 - q0) < beta) */
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag4_16x8 = _mm_packs_epi16(temp2, temp1);
    flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);

    /* First 8 pixels: widen to 16-bit and compute both filter variants */
    p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
    p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
    p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
    p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
    q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
    q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
    q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
    q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);

    /* p0_1 = (2*p1 + p0 + q1 + 2) >> 2 ; q0_1 = (2*q1 + q0 + p1 + 2) >> 2 */
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    temp3 = _mm_slli_epi16(p1_8x16, 1);
    temp4 = _mm_slli_epi16(q1_8x16, 1);
    temp1 = _mm_add_epi16(temp5, temp3);
    temp2 = _mm_add_epi16(temp6, temp4);
    p0_16x8_1 = _mm_srai_epi16(temp1, 2);
    q0_16x8_1 = _mm_srai_epi16(temp2, 2);

    /* p1_2 = (p2 + p1 + p0 + q0 + 2) >> 2 ; q1_2 symmetric
       (temp5/temp6 reuse the partial sums built above) */
    temp6 = _mm_add_epi16(temp6, p0_8x16);
    temp5 = _mm_add_epi16(temp5, q0_8x16);
    temp1 = _mm_add_epi16(temp6, p2_8x16);
    temp2 = _mm_add_epi16(temp5, q2_8x16);
    p1_16x8_2 = _mm_srai_epi16(temp1, 2);
    q1_16x8_2 = _mm_srai_epi16(temp2, 2);

    /* p0_2 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3 ; q0_2 symmetric */
    temp1 = _mm_add_epi16(temp3, p2_8x16);
    temp2 = _mm_add_epi16(temp4, q2_8x16);
    temp1 = _mm_add_epi16(temp1, q1_8x16);
    temp2 = _mm_add_epi16(temp2, p1_8x16);
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    temp3 = _mm_slli_epi16(temp3, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp3);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    p0_16x8_2 = _mm_srai_epi16(temp1, 3);
    q0_16x8_2 = _mm_srai_epi16(temp2, 3);

    /* p2_2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3 ; q2_2 symmetric */
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    temp3 = _mm_slli_epi16(p2_8x16, 1);
    temp4 = _mm_slli_epi16(q2_8x16, 1);
    temp3 = _mm_add_epi16(p2_8x16, temp3);
    temp4 = _mm_add_epi16(q2_8x16, temp4);
    temp5 = _mm_slli_epi16(p3_8x16, 1);
    temp6 = _mm_slli_epi16(q3_8x16, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp4);
    temp1 = _mm_add_epi16(temp1, temp5);
    temp2 = _mm_add_epi16(temp2, temp6);
    p2_16x8_2 = _mm_srai_epi16(temp1, 3);
    q2_16x8_2 = _mm_srai_epi16(temp2, 3);

    /* Second 8 pixels: same math on the high halves, then pack with the
       first 8 results back into 16 unsigned bytes per register */
    p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
    p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
    p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
    p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
    q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
    q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
    q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
    q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);

    /* p0_1 and q0_1 */
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    temp3 = _mm_slli_epi16(p1_8x16, 1);
    temp4 = _mm_slli_epi16(q1_8x16, 1);
    temp1 = _mm_add_epi16(temp5, temp3);
    temp2 = _mm_add_epi16(temp6, temp4);
    temp1 = _mm_srai_epi16(temp1, 2);
    temp2 = _mm_srai_epi16(temp2, 2);
    p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
    q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);

    /* p1_2 and q1_2 */
    temp6 = _mm_add_epi16(temp6, p0_8x16);
    temp5 = _mm_add_epi16(temp5, q0_8x16);
    temp1 = _mm_add_epi16(temp6, p2_8x16);
    temp2 = _mm_add_epi16(temp5, q2_8x16);
    temp1 = _mm_srai_epi16(temp1, 2);
    temp2 = _mm_srai_epi16(temp2, 2);
    p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
    q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);

    /* p0_2 and q0_2 */
    temp1 = _mm_add_epi16(temp3, p2_8x16);
    temp2 = _mm_add_epi16(temp4, q2_8x16);
    temp1 = _mm_add_epi16(temp1, q1_8x16);
    temp2 = _mm_add_epi16(temp2, p1_8x16);
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    temp3 = _mm_slli_epi16(temp3, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp3);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    temp1 = _mm_srai_epi16(temp1, 3);
    temp2 = _mm_srai_epi16(temp2, 3);
    p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
    q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);

    /* p2_2 and q2_2 */
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    temp3 = _mm_slli_epi16(p2_8x16, 1);
    temp4 = _mm_slli_epi16(q2_8x16, 1);
    temp3 = _mm_add_epi16(p2_8x16, temp3);
    temp4 = _mm_add_epi16(q2_8x16, temp4);
    temp5 = _mm_slli_epi16(p3_8x16, 1);
    temp6 = _mm_slli_epi16(q3_8x16, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp4);
    temp1 = _mm_add_epi16(temp1, temp5);
    temp2 = _mm_add_epi16(temp2, temp6);
    temp1 = _mm_srai_epi16(temp1, 3);
    temp2 = _mm_srai_epi16(temp2, 3);
    p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
    q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);

    /* Select weak-filtered p0/q0 where flag1 is set (branchless blend:
       keep original bytes where mask is 0, filtered bytes where 0xFF) */
    p0_16x8 = _mm_and_si128(p0_16x8,
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
    q0_16x8 = _mm_and_si128(q0_16x8,
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);

    /* Overwrite with strong-filtered p0/q0 where flag3/flag4 are set */
    p0_16x8 = _mm_and_si128(p0_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
    q0_16x8 = _mm_and_si128(q0_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);

    /* p1 and q1 (only touched by the strong filter) */
    p1_16x8 = _mm_and_si128(p1_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
    q1_16x8 = _mm_and_si128(q1_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);

    /* p2 and q2 (only touched by the strong filter) */
    p2_16x8 = _mm_and_si128(p2_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
    p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
    q2_16x8 = _mm_and_si128(q2_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
    q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);

    /* Inverse transpose of rows 0-7 and store back */
    temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
    temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
    temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
    temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);

    p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
    p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
    q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
    q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);

    line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
    line2 = _mm_srli_si128(line1, 8);
    line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
    line4 = _mm_srli_si128(line3, 8);
    line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
    line6 = _mm_srli_si128(line5, 8);
    line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
    line8 = _mm_srli_si128(line7, 8);

    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);

    /* Inverse transpose of rows 8-15 and store back */
    temp1 = _mm_unpackhi_epi8(p3_16x8, p2_16x8);
    temp2 = _mm_unpackhi_epi8(p1_16x8, p0_16x8);
    temp3 = _mm_unpackhi_epi8(q0_16x8, q1_16x8);
    temp4 = _mm_unpackhi_epi8(q2_16x8, q3_16x8);

    p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
    p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
    q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
    q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);

    line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
    line2 = _mm_srli_si128(line1, 8);
    line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
    line4 = _mm_srli_si128(line3, 8);
    line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
    line6 = _mm_srli_si128(line5, 8);
    line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
    line8 = _mm_srli_si128(line7, 8);

    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd), line1);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd), line2);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd), line3);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd), line4);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd), line5);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd), line6);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd), line7);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd), line8);

}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_luma_horz_bs4_ssse3()                        */
/*                                                                           */
/*  Description   : This function performs filtering of a luma block         */
/*
                    horizontal edge when the boundary strength is set to 4. */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to the src sample q0                  */
/*                  src_strd - source stride                                 */
/*                  alpha    - alpha value for the boundary                  */
/*                  beta     - beta value for the boundary                   */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
/*                  title "Filtering process for edges for bS equal to 4" in */
/*                  ITU T Rec H.264.                                         */
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_luma_horz_bs4_ssse3(UWORD8 *pu1_src,
                                     WORD32 src_strd,
                                     WORD32 alpha,
                                     WORD32 beta)
{
    /* Row offsets: p-side rows are addressed from pu1_HorzPixel
       (= pu1_src - 4*src_strd), q-side rows from pu1_src itself,
       so no transpose is needed for a horizontal edge */
    WORD16 i16_posP3, i16_posP2, i16_posP1, i16_posP0;
    WORD16 i16_posQ1, i16_posQ2, i16_posQ3;
    UWORD8 *pu1_HorzPixel;
    __m128i zero = _mm_setzero_si128();
    /* one register per row: p3..p0 above the edge, q0..q3 below */
    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
    /* 16-bit widened copies (8 pixels per register) */
    __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
    __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
    /* "weak" filtered p0/q0 */
    __m128i q0_16x8_1;
    __m128i p0_16x8_1;
    /* "strong" filtered p0..p2/q0..q2 */
    __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
    __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
    __m128i temp1, temp2, temp3, temp4, temp5, temp6;
    __m128i Alpha_8x16, Beta_8x16;
    /* per-pixel 0x00/0xFF masks for the four filtering decisions */
    __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
    __m128i const_val2_16x8 = _mm_set1_epi16(2);

    pu1_HorzPixel = pu1_src - (src_strd << 2);

    i16_posQ1 = src_strd;
    i16_posQ2 = X2(src_strd);
    i16_posQ3 = X3(src_strd);
    i16_posP0 = X3(src_strd);
    i16_posP1 = X2(src_strd);
    i16_posP2 = src_strd;
    i16_posP3 = 0;

    Alpha_8x16 = _mm_set1_epi16(alpha);
    Beta_8x16 = _mm_set1_epi16(beta);

    /* Load the 8 rows (16 pixels wide) straddling the horizontal edge */
    p3_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP3));
    p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
    p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
    p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
    q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
    q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));
    q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));
    q3_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ3));

    /* Cond1 (ABS(p0 - q0) < alpha); |a-b| built from two saturating subs */
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag1_16x8 = _mm_packs_epi16(temp2, temp1);

    /* Cond2 (ABS(q1 - q0) < beta) */
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* Cond3 (ABS(p1 - p0) < beta) */
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    /* flag1 = (ABS(p0 - q0) < alpha) && (ABS(q1 - q0) < beta)
             && (ABS(p1 - p0) < beta) : filter-this-pixel mask */
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* Strong-filter gate: (ABS(p0 - q0) < ((alpha >> 2) + 2)) */
    temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);
    Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
    Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* flag3: strong filter on the p side, (ABS(p2 - p0) < beta) */
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
    flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);

    /* flag4: strong filter on the q side, (ABS(q2 - q0) < beta) */
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag4_16x8 = _mm_packs_epi16(temp2, temp1);
    flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);

    /* First 8 pixels: widen to 16-bit and compute both filter variants */
    p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
    p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
    p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
    p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
    q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
    q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
    q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
    q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);

    /* p0_1 = (2*p1 + p0 + q1 + 2) >> 2 ; q0_1 = (2*q1 + q0 + p1 + 2) >> 2 */
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    temp3 = _mm_slli_epi16(p1_8x16, 1);
    temp4 = _mm_slli_epi16(q1_8x16, 1);
    temp1 = _mm_add_epi16(temp5, temp3);
    temp2 = _mm_add_epi16(temp6, temp4);
    p0_16x8_1 = _mm_srai_epi16(temp1, 2);
    q0_16x8_1 = _mm_srai_epi16(temp2, 2);

    /* p1_2 = (p2 + p1 + p0 + q0 + 2) >> 2 ; q1_2 symmetric
       (temp5/temp6 reuse the partial sums built above) */
    temp6 = _mm_add_epi16(temp6, p0_8x16);
    temp5 = _mm_add_epi16(temp5, q0_8x16);
    temp1 = _mm_add_epi16(temp6, p2_8x16);
    temp2 = _mm_add_epi16(temp5, q2_8x16);
    p1_16x8_2 = _mm_srai_epi16(temp1, 2);
    q1_16x8_2 = _mm_srai_epi16(temp2, 2);

    /* p0_2 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3 ; q0_2 symmetric */
    temp1 = _mm_add_epi16(temp3, p2_8x16);
    temp2 = _mm_add_epi16(temp4, q2_8x16);
    temp1 = _mm_add_epi16(temp1, q1_8x16);
    temp2 = _mm_add_epi16(temp2, p1_8x16);
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    temp3 = _mm_slli_epi16(temp3, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp3);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    p0_16x8_2 = _mm_srai_epi16(temp1, 3);
    q0_16x8_2 = _mm_srai_epi16(temp2, 3);

    /* p2_2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3 ; q2_2 symmetric */
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    temp3 = _mm_slli_epi16(p2_8x16, 1);
    temp4 = _mm_slli_epi16(q2_8x16, 1);
    temp3 = _mm_add_epi16(p2_8x16, temp3);
    temp4 = _mm_add_epi16(q2_8x16, temp4);
    temp5 = _mm_slli_epi16(p3_8x16, 1);
    temp6 = _mm_slli_epi16(q3_8x16, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp4);
    temp1 = _mm_add_epi16(temp1, temp5);
    temp2 = _mm_add_epi16(temp2, temp6);
    p2_16x8_2 = _mm_srai_epi16(temp1, 3);
    q2_16x8_2 = _mm_srai_epi16(temp2, 3);

    /* Second 8 pixels: same math on the high halves, then pack with the
       first 8 results back into 16 unsigned bytes per register */
    p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
    p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
    p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
    p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
    q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
    q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
    q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
    q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);

    /* p0_1 and q0_1 */
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    temp3 = _mm_slli_epi16(p1_8x16, 1);
    temp4 = _mm_slli_epi16(q1_8x16, 1);
    temp1 = _mm_add_epi16(temp5, temp3);
    temp2 = _mm_add_epi16(temp6, temp4);
    temp1 = _mm_srai_epi16(temp1, 2);
    temp2 = _mm_srai_epi16(temp2, 2);
    p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
    q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);

    /* p1_2 and q1_2 */
    temp6 = _mm_add_epi16(temp6, p0_8x16);
    temp5 = _mm_add_epi16(temp5, q0_8x16);
    temp1 = _mm_add_epi16(temp6, p2_8x16);
    temp2 = _mm_add_epi16(temp5, q2_8x16);
    temp1 = _mm_srai_epi16(temp1, 2);
    temp2 = _mm_srai_epi16(temp2, 2);
    p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
    q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);

    /* p0_2 and q0_2 */
    temp1 = _mm_add_epi16(temp3, p2_8x16);
    temp2 = _mm_add_epi16(temp4, q2_8x16);
    temp1 = _mm_add_epi16(temp1, q1_8x16);
    temp2 = _mm_add_epi16(temp2, p1_8x16);
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    temp3 = _mm_slli_epi16(temp3, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp3);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    temp1 = _mm_srai_epi16(temp1, 3);
    temp2 = _mm_srai_epi16(temp2, 3);
    p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
    q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);

    /* p2_2 and q2_2 */
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    temp3 = _mm_slli_epi16(p2_8x16, 1);
    temp4 = _mm_slli_epi16(q2_8x16, 1);
    temp3 = _mm_add_epi16(p2_8x16, temp3);
    temp4 = _mm_add_epi16(q2_8x16, temp4);
    temp5 = _mm_slli_epi16(p3_8x16, 1);
    temp6 = _mm_slli_epi16(q3_8x16, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp4);
    temp1 = _mm_add_epi16(temp1, temp5);
    temp2 = _mm_add_epi16(temp2, temp6);
    temp1 = _mm_srai_epi16(temp1, 3);
    temp2 = _mm_srai_epi16(temp2, 3);
    p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
    q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);

    /* Select weak-filtered p0/q0 where flag1 is set (branchless blend:
       keep original bytes where mask is 0, filtered bytes where 0xFF) */
    p0_16x8 = _mm_and_si128(p0_16x8,
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
    q0_16x8 = _mm_and_si128(q0_16x8,
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);

    /* Overwrite with strong-filtered p0/q0 where flag3/flag4 are set */
    p0_16x8 = _mm_and_si128(p0_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
    q0_16x8 = _mm_and_si128(q0_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);

    /* p1 and q1 (only touched by the strong filter) */
    p1_16x8 = _mm_and_si128(p1_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
    q1_16x8 = _mm_and_si128(q1_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);

    /* p2 and q2 (only touched by the strong filter) */
    p2_16x8 = _mm_and_si128(p2_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
    p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
    q2_16x8 = _mm_and_si128(q2_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
    q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);

    /* Store back; p3/q3 are read-only inputs and are never modified */
    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP2), p2_16x8);
    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), p1_16x8);
    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), p0_16x8);

    _mm_storeu_si128((__m128i *)(pu1_src), q0_16x8);
    _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), q1_16x8);
    _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ2), q2_16x8);

}

/*****************************************************************************/
/*                                                                           */
/*
 Function Name : ih264_deblk_luma_vert_bslt4_ssse3()                        */
/*                                                                           */
/* Description : This function performs filtering of a luma block            */
/*               vertical edge when the boundary strength is less than 4.    */
/*                                                                           */
/* Inputs : pu1_src - pointer to the src sample q0                           */
/*          src_strd - source stride                                         */
/*          alpha - alpha value for the boundary                             */
/*          beta - beta value for the boundary                               */
/*          u4_bs - packed Boundary strength array                           */
/*          pu1_cliptab - tc0_table                                          */
/*                                                                           */
/* Globals : None                                                            */
/*                                                                           */
/* Processing : This operation is described in Sec. 8.7.2.3 under the        */
/*              title "Filtering process for edges for bS less than 4"       */
/*              in ITU T Rec H.264.                                          */
/*                                                                           */
/* Outputs : None                                                            */
/*                                                                           */
/* Returns : None                                                            */
/*                                                                           */
/* Issues : None                                                             */
/*                                                                           */
/* Revision History:                                                         */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_luma_vert_bslt4_ssse3(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD32 alpha,
                                       WORD32 beta,
                                       UWORD32 u4_bs,
                                       const UWORD8 *pu1_cliptab)
{
    UWORD8 u1_Bs, u1_Bs1;

    UWORD32 j = 0;

    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
    __m128i int1, int2, int3, int4, high1, high2;
    __m128i flag, flag1, i_C, i_C0;
    __m128i i_Ap, i_Aq, diff, const1, const2, in_macro, in_macrotemp, temp,
                    temp1;
    __m128i zero = _mm_setzero_si128();

    /* The loop body runs twice: j = 0 and j = 8 * src_strd, i.e. the top   */
    /* and bottom halves of the 16-row vertical edge, 8 rows at a time.     */
    for(j = 0; j <= 8 * src_strd; j += 8 * src_strd)
    {
        /* Transpose: load 8 rows of 8 bytes straddling the edge           */
        /* (pu1_src - 3 .. pu1_src + 4), widen to 16-bit, and transpose so */
        /* each register holds one pixel column across the 8 rows.         */
        linea = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + j));
        lineb = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + src_strd + j));
        linec = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j));
        lined = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j));

        /* Widen unsigned bytes to 16-bit lanes (zero in the high byte). */
        linea = _mm_unpacklo_epi8(linea, zero);
        lineb = _mm_unpacklo_epi8(lineb, zero);
        linec = _mm_unpacklo_epi8(linec, zero);
        lined = _mm_unpacklo_epi8(lined, zero);

        int1 = _mm_unpacklo_epi16(linea, lineb);
        lineb = _mm_unpackhi_epi16(linea, lineb);

        int2 = _mm_unpacklo_epi16(linec, lined);
        lined = _mm_unpackhi_epi16(linec, lined);

        linea = _mm_unpacklo_epi16(int1, int2);
        int1 = _mm_unpackhi_epi16(int1, int2);

        linec = _mm_unpacklo_epi16(lineb, lined);
        high1 = _mm_unpackhi_epi16(lineb, lined);

        linee = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j));
        linef = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j));
        lineg = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j));
        lineh = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j));

        linee = _mm_unpacklo_epi8(linee, zero);
        linef = _mm_unpacklo_epi8(linef, zero);
        lineg = _mm_unpacklo_epi8(lineg, zero);
        lineh = _mm_unpacklo_epi8(lineh, zero);

        int2 = _mm_unpacklo_epi16(linee, linef);
        linef = _mm_unpackhi_epi16(linee, linef);

        int3 = _mm_unpacklo_epi16(lineg, lineh);
        lineh = _mm_unpackhi_epi16(lineg, lineh);

        linee = _mm_unpacklo_epi16(int2, int3);
        int2 = _mm_unpackhi_epi16(int2, int3);

        lineg = _mm_unpacklo_epi16(linef, lineh);
        high2 = _mm_unpackhi_epi16(linef, lineh);

        int4 = _mm_unpacklo_epi16(linea, linee);
        lineb = _mm_unpackhi_epi16(linea, linee);

        int3 = _mm_unpacklo_epi16(int1, int2);
        lined = _mm_unpackhi_epi16(int1, int2);

        int2 = _mm_unpacklo_epi16(linec, lineg);
        linef = _mm_unpackhi_epi16(linec, lineg);

        linea = int4;
        linec = int3;
        linee = int2;

        lineg = _mm_unpacklo_epi16(high1, high2);
        lineh = _mm_unpackhi_epi16(high1, high2);

        /* End of transpose. From the condition code below: linec/lined    */
        /* hold p0/q0, lineb/linee hold p1/q1, linea/linef hold p2/q2.     */

        /* Extract the two boundary-strength values covering these 8 rows; */
        /* shift u4_bs so the next loop iteration reads the next pair.     */
        u1_Bs = (u4_bs >> 24) & 0xff;
        u1_Bs1 = (u4_bs >> 16) & 0xff;
        u4_bs <<= 16;

        /* Per-lane mask: all-ones where bS != 0 (edge may be filtered). */
        flag1 = _mm_set_epi16(u1_Bs1, u1_Bs, u1_Bs1, u1_Bs, u1_Bs1, u1_Bs,
                              u1_Bs1, u1_Bs);
        flag1 = _mm_cmpeq_epi16(flag1, zero); //Set flag to 1s and 0s
        flag1 = _mm_xor_si128(flag1, _mm_set1_epi16(0xFFFF)); //Invert for required mask

        /* Per-lane tc0 clipping values looked up from the clip table. */
        i_C0 = _mm_set_epi16(pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs]);

        /* Condition 1: ABS(p0 - q0) < alpha */
        diff = _mm_subs_epi16(linec, lined);
        diff = _mm_abs_epi16(diff);
        const1 = _mm_set1_epi16(alpha);
        flag = _mm_cmpgt_epi16(const1, diff);

        /* Condition 2: ABS(q1 - q0) < beta */
        diff = _mm_subs_epi16(linee, lined);
        diff = _mm_abs_epi16(diff);
        const1 = _mm_set1_epi16(beta);
        flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff));

        /* Condition 3: ABS(p1 - p0) < beta */
        diff = _mm_subs_epi16(lineb, linec);
        diff = _mm_abs_epi16(diff);
        flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff)); //const1 = beta from now on

        /* Final filter mask: bS != 0 AND all three threshold conditions. */
        flag = _mm_and_si128(flag, flag1);

        /* tc = tc0 + (Ap < beta) + (Aq < beta): each satisfied condition  */
        /* bumps the clip range by 1 (the cmpgt mask of -1 is negated).    */
        i_Ap = _mm_subs_epi16(linea, linec);
        i_Ap = _mm_abs_epi16(i_Ap);
        const2 = _mm_cmpgt_epi16(const1, i_Ap);
        const2 = _mm_subs_epi16(zero, const2); //Make FFFF=1 and 0000=0
        i_C = _mm_add_epi16(i_C0, const2);

        i_Aq = _mm_subs_epi16(linef, lined);
        i_Aq = _mm_abs_epi16(i_Aq);
        const2 = _mm_cmpgt_epi16(const1, i_Aq);
        const2 = _mm_subs_epi16(zero, const2);
        i_C = _mm_add_epi16(i_C, const2);

        /* delta = ((q0 - p0) << 2 + (p1 - q1) + 4) >> 3 */
        diff = _mm_subs_epi16(lined, linec);
        diff = _mm_slli_epi16(diff, 2);
        const2 = _mm_subs_epi16(lineb, linee);
        diff = _mm_add_epi16(diff, const2);
        const2 = _mm_set1_epi16(4);
        diff = _mm_add_epi16(diff, const2);
        in_macro = _mm_srai_epi16(diff, 3);

        /* CLIP3(-tc, tc, delta) via min/max against +/-i_C. */
        in_macro = _mm_min_epi16(i_C, in_macro);
        i_C = _mm_subs_epi16(zero, i_C);
        in_macro = _mm_max_epi16(i_C, in_macro);

        /* p0' = p0 + delta, blended with the unfiltered p0 via the mask. */
        in_macrotemp = _mm_add_epi16(linec, in_macro);
        in_macrotemp = _mm_and_si128(in_macrotemp, flag);
        temp = _mm_and_si128(linec,
                             _mm_xor_si128(flag, _mm_set1_epi16(0xFFFF)));
        temp = _mm_add_epi16(temp, in_macrotemp);

        /* q0' = q0 - delta, blended the same way. */
        in_macrotemp = _mm_subs_epi16(lined, in_macro);
        in_macrotemp = _mm_and_si128(in_macrotemp, flag);
        temp1 = _mm_and_si128(lined,
                              _mm_xor_si128(flag, _mm_set1_epi16(0xFFFF)));
        temp1 = _mm_add_epi16(temp1, in_macrotemp);

        /* p1 update, only where Ap < beta (and the main mask holds):      */
        /* p1' = p1 + CLIP3(-tc0, tc0, (p2 + (p0+q0+1)>>1 - 2*p1) >> 1)    */
        flag1 = _mm_cmpgt_epi16(const1, i_Ap);
        flag1 = _mm_and_si128(flag, flag1);
        in_macrotemp = _mm_add_epi16(linec, lined);
        in_macrotemp = _mm_add_epi16(in_macrotemp, _mm_set1_epi16(1));
        in_macrotemp = _mm_srai_epi16(in_macrotemp, 1);
        in_macro = _mm_add_epi16(in_macrotemp, linea);
        in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(lineb, 1));
        in_macro = _mm_srai_epi16(in_macro, 1);

        in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3
        i_C0 = _mm_subs_epi16(zero, i_C0);        /* i_C0 now holds -tc0 */
        in_macro = _mm_max_epi16(i_C0, in_macro);

        in_macro = _mm_and_si128(in_macro, flag1);
        lineb = _mm_add_epi16(lineb, in_macro);

        /* q1 update, only where Aq < beta:                                */
        /* q1' = q1 + CLIP3(-tc0, tc0, (q2 + (p0+q0+1)>>1 - 2*q1) >> 1)    */
        flag1 = _mm_cmpgt_epi16(const1, i_Aq);
        flag1 = _mm_and_si128(flag, flag1);
        in_macro = _mm_add_epi16(in_macrotemp, linef);
        in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(linee, 1));
        in_macro = _mm_srai_epi16(in_macro, 1);

        i_C0 = _mm_abs_epi16(i_C0);               /* restore +tc0        */
        in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3
        i_C0 = _mm_subs_epi16(zero, i_C0);
        in_macro = _mm_max_epi16(i_C0, in_macro);

        in_macro = _mm_and_si128(in_macro, flag1);
        linee = _mm_add_epi16(linee, in_macro);
        /* Commit the blended p0'/q0' computed above. */
        linec = temp;
        lined = temp1;
        /* End of filtering. */

        /* Inverse transpose: interleave the column registers back into    */
        /* row order (still 16-bit lanes at this point).                   */
        int1 = _mm_unpacklo_epi16(linea, linee);
        linee = _mm_unpackhi_epi16(linea, linee);

        int2 = _mm_unpacklo_epi16(linec, lineg);
        lineg = _mm_unpackhi_epi16(linec, lineg);

        linea = _mm_unpacklo_epi16(int1, int2);
        int3 = _mm_unpackhi_epi16(int1, int2);

        linec = _mm_unpacklo_epi16(linee, lineg);
        lineg = _mm_unpackhi_epi16(linee, lineg);

        int1 = _mm_unpacklo_epi16(lineb, linef);
        linef = _mm_unpackhi_epi16(lineb, linef);

        int2 = _mm_unpacklo_epi16(lined, lineh);
        lineh = _mm_unpackhi_epi16(lined, lineh);

        lineb = _mm_unpacklo_epi16(int1, int2);
        int4 = _mm_unpackhi_epi16(int1, int2);

        lined = _mm_unpacklo_epi16(linef, lineh);
        lineh = _mm_unpackhi_epi16(linef, lineh);

        int1 = _mm_unpackhi_epi16(linea, lineb);
        linea = _mm_unpacklo_epi16(linea, lineb);

        int2 = _mm_unpacklo_epi16(int3, int4);
        high1 = _mm_unpackhi_epi16(int3, int4);

        lineb = _mm_unpacklo_epi16(linec, lined);
        linef = _mm_unpackhi_epi16(linec, lined);

        lined = _mm_unpacklo_epi16(lineg, lineh);
        lineh = _mm_unpackhi_epi16(lineg, lineh);

        linee = int1;
        lineg = high1;
        linec = int2;
        /* End of inverse transpose. */

        /* Pack each row back to 8 unsigned bytes and store in place. */
        linea = _mm_packus_epi16(linea, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + j), linea);

        lineb = _mm_packus_epi16(lineb, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + src_strd + j), lineb);

        linec = _mm_packus_epi16(linec, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j), linec);

        lined = _mm_packus_epi16(lined, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j), lined);

        linee = _mm_packus_epi16(linee, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j), linee);

        linef = _mm_packus_epi16(linef, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j), linef);

        lineg = _mm_packus_epi16(lineg, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j), lineg);

        lineh = _mm_packus_epi16(lineh, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j), lineh);

    }
}

/*****************************************************************************/
/*                                                                           */
/* Function Name : ih264_deblk_luma_horz_bslt4_ssse3()                       */
/*                                                                           */
/* Description : This function performs filtering of a luma block            */
/*               horizontal edge when boundary strength is less than 4.
                                                                             */
/*                                                                           */
/* Inputs : pu1_src - pointer to the src sample q0                           */
/*          src_strd - source stride                                         */
/*          alpha - alpha value for the boundary                             */
/*          beta - beta value for the boundary                               */
/*          u4_bs - packed Boundary strength array                           */
/*          pu1_cliptab - tc0_table                                          */
/*                                                                           */
/* Globals : None                                                            */
/*                                                                           */
/* Processing : This operation is described in Sec. 8.7.2.3 under the        */
/*              title "Filtering process for edges for bS less than 4"       */
/*              in ITU T Rec H.264.                                          */
/*                                                                           */
/* Outputs : None                                                            */
/*                                                                           */
/* Returns : None                                                            */
/*                                                                           */
/* Issues : None                                                             */
/*                                                                           */
/* Revision History:                                                         */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_luma_horz_bslt4_ssse3(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD32 alpha,
                                       WORD32 beta,
                                       UWORD32 u4_bs,
                                       const UWORD8 *pu1_cliptab)
{
    WORD16 i16_posP2, i16_posP1, i16_posP0, i16_posQ1, i16_posQ2;
    UWORD8 *pu1_HorzPixel;
    __m128i zero = _mm_setzero_si128();
    __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C0_hi_8x16, C_8x16, C_hi_8x16;
    __m128i q0_16x8, q1_16x8, q2_16x8, p0_16x8, p1_16x8, p2_16x8;
    __m128i temp1, temp2;
    __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8;
    __m128i in_macro_16x8, in_macro_hi_16x8;
    __m128i const_val4_8x16;
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
    UWORD8 clip0, clip1, clip2, clip3;

    /* pu1_HorzPixel points 4 rows above the edge; the p-side rows are     */
    /* addressed relative to it, the q-side rows relative to pu1_src.      */
    pu1_HorzPixel = pu1_src - (src_strd << 2);

    i16_posQ1 = src_strd;
    i16_posQ2 = X2(src_strd);
    i16_posP0 = X3(src_strd);
    i16_posP1 = X2(src_strd);
    i16_posP2 = src_strd;

    q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
    q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));

    /* Unpack the four boundary-strength values (one per 4-pixel group)    */
    /* and look up the matching tc0 clip values.                           */
    u1_Bs0 = (u4_bs >> 24) & 0xff;
    u1_Bs1 = (u4_bs >> 16) & 0xff;
    u1_Bs2 = (u4_bs >> 8) & 0xff;
    u1_Bs3 = (u4_bs >> 0) & 0xff;
    clip0 = pu1_cliptab[u1_Bs0];
    clip1 = pu1_cliptab[u1_Bs1];
    clip2 = pu1_cliptab[u1_Bs2];
    clip3 = pu1_cliptab[u1_Bs3];

    Alpha_8x16 = _mm_set1_epi16(alpha);
    Beta_8x16 = _mm_set1_epi16(beta);

    /* Each bS / tc0 value is replicated across its 4 pixels of the edge. */
    bs_flag_16x8b = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
                                 u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
                                 u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);

    C0_16x8 = _mm_set_epi8(clip3, clip3, clip3, clip3, clip2, clip2, clip2,
                           clip2, clip1, clip1, clip1, clip1, clip0, clip0,
                           clip0, clip0);

    /* Byte mask: all-ones where bS != 0. */
    bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero);
    bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
    /* tc0 widened to 16-bit lanes, low and high halves of the 16 pixels. */
    C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero);
    C0_hi_8x16 = _mm_unpackhi_epi8(C0_16x8, zero);

    p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
    p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
    p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
    q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));

    /* Cond1 (ABS(p0 - q0) < alpha) — unsigned absolute difference via     */
    /* the two saturating subtractions, then compare per 16-bit lane.      */
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag1_16x8 = _mm_packs_epi16(temp2, temp1);
    flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b);

    /* Cond2 (ABS(q1 - q0) < beta) */
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* Cond3 (ABS(p1 - p0) < beta) */
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    /* flag1 = bS!=0 AND cond1 AND cond2 AND cond3: the master edge mask. */
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* Ap = ABS(p2 - p0) < beta; also bumps tc by 1 in qualifying lanes    */
    /* (tc = tc0 + (Ap<beta) + (Aq<beta)).                                 */
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1); /* mask for the p1 update */
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    temp2 = _mm_subs_epi16(zero, temp2); /* -1 mask -> +1 increment */
    temp1 = _mm_subs_epi16(zero, temp1);

    C_8x16 = _mm_add_epi16(C0_8x16, temp2);
    C_hi_8x16 = _mm_add_epi16(C0_hi_8x16, temp1);

    /* Aq = ABS(q2 - q0) < beta; second tc increment and q1-update mask. */
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
    flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8);

    temp2 = _mm_subs_epi16(zero, temp2);
    temp1 = _mm_subs_epi16(zero, temp1);

    C_8x16 = _mm_add_epi16(C_8x16, temp2);
    C_hi_8x16 = _mm_add_epi16(C_hi_8x16, temp1);

    /* delta = ((q0 - p0) << 2 + (p1 - q1) + 4) >> 3, low 8 pixels. */
    const_val4_8x16 = _mm_set1_epi16(4);
    temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero),
                           _mm_unpacklo_epi8(p0_16x8, zero));
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero),
                           _mm_unpacklo_epi8(q1_16x8, zero));
    temp1 = _mm_slli_epi16(temp1, 2);
    temp1 = _mm_add_epi16(temp1, temp2);
    temp1 = _mm_add_epi16(temp1, const_val4_8x16);
    in_macro_16x8 = _mm_srai_epi16(temp1, 3);

    /* Same delta for the high 8 pixels. */
    temp1 = _mm_subs_epi16(_mm_unpackhi_epi8(q0_16x8, zero),
                           _mm_unpackhi_epi8(p0_16x8, zero));
    temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p1_16x8, zero),
                           _mm_unpackhi_epi8(q1_16x8, zero));
    temp1 = _mm_slli_epi16(temp1, 2);
    temp1 = _mm_add_epi16(temp1, temp2);
    temp1 = _mm_add_epi16(temp1, const_val4_8x16);
    in_macro_hi_16x8 = _mm_srai_epi16(temp1, 3);

    /* CLIP3(-tc, tc, delta). */
    in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8);
    in_macro_hi_16x8 = _mm_min_epi16(C_hi_8x16, in_macro_hi_16x8);
    C_8x16 = _mm_subs_epi16(zero, C_8x16);
    C_hi_8x16 = _mm_subs_epi16(zero, C_hi_8x16);
    in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8);
    in_macro_hi_16x8 = _mm_max_epi16(C_hi_8x16, in_macro_hi_16x8);

    /* p0' = p0 + delta; blend with unfiltered p0 by the master mask. */
    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8);
    temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p0_16x8, zero), in_macro_hi_16x8);

    temp1 = _mm_packus_epi16(temp1, temp2);

    temp1 = _mm_and_si128(temp1, flag1_16x8);
    temp2 = _mm_and_si128(p0_16x8,
                          _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));

    temp1 = _mm_add_epi8(temp1, temp2);

    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), temp1);

    /* q0' = q0 - delta; same blend. */
    temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8);
    temp2 = _mm_sub_epi16(_mm_unpackhi_epi8(q0_16x8, zero), in_macro_hi_16x8);

    temp1 = _mm_packus_epi16(temp1, temp2);

    temp1 = _mm_and_si128(temp1, flag1_16x8);
    temp2 = _mm_and_si128(q0_16x8,
                          _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));

    temp1 = _mm_add_epi8(temp1, temp2);
    _mm_storeu_si128((__m128i *)(pu1_src), temp1);

    /* p1 update where Ap < beta:                                          */
    /* p1' = p1 + CLIP3(-tc0, tc0, (p2 + (p0+q0+1)>>1 - 2*p1) >> 1)        */
    /* _mm_avg_epu16 computes (p0 + q0 + 1) >> 1.                          */
    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
                          _mm_unpacklo_epi8(p0_16x8, zero));
    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1);
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2);
    temp2 = _mm_add_epi16(temp1, temp2);
    in_macro_16x8 = _mm_srai_epi16(temp2, 1);

    temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero),
                          _mm_unpackhi_epi8(p0_16x8, zero));
    temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(p1_16x8, zero), 1);
    temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p2_16x8, zero), temp2);
    temp2 = _mm_add_epi16(temp1, temp2);
    in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1);

    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
    in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8);
    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);       /* C0 regs now hold -tc0 */
    C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16);
    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8);
    in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8);

    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8);
    temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p1_16x8, zero), in_macro_hi_16x8);

    temp1 = _mm_packus_epi16(temp1, temp2);

    temp1 = _mm_and_si128(temp1, flag2_16x8);
    temp2 = _mm_and_si128(p1_16x8,
                          _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF)));
    temp1 = _mm_add_epi8(temp1, temp2);
    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), temp1);

    /* q1 update where Aq < beta:                                          */
    /* q1' = q1 + CLIP3(-tc0, tc0, (q2 + (p0+q0+1)>>1 - 2*q1) >> 1)        */
    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
                          _mm_unpacklo_epi8(p0_16x8, zero));
    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1);
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2);
    temp2 = _mm_add_epi16(temp1, temp2);
    in_macro_16x8 = _mm_srai_epi16(temp2, 1);

    temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero),
                          _mm_unpackhi_epi8(p0_16x8, zero));
    temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(q1_16x8, zero), 1);
    temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(q2_16x8, zero), temp2);
    temp2 = _mm_add_epi16(temp1, temp2);
    in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1);

    /* C0 regs still hold -tc0 from the p1 branch, so the clamp order is   */
    /* reversed here: max against -tc0 first, then negate back to +tc0     */
    /* and min — the net effect is the same CLIP3(-tc0, tc0, .).           */
    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
    in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8);
    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
    C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16);
    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8);
    in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8);

    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8);
    temp2 = _mm_add_epi16(_mm_unpackhi_epi8(q1_16x8, zero), in_macro_hi_16x8);

    temp1 = _mm_packus_epi16(temp1, temp2);

    temp1 = _mm_and_si128(temp1, flag3_16x8);
    temp2 = _mm_and_si128(q1_16x8,
                          _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF)));
    temp1 = _mm_add_epi8(temp1, temp2);

    _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), temp1);

}

/*****************************************************************************/
/*                                                                           */
/* Function Name : ih264_deblk_luma_vert_bs4_mbaff_ssse3()                   */
/*                                                                           */
/* Description : This function performs filtering of a luma block            */
/*               vertical edge when boundary strength is set to 4.           */
/*                                                                           */
/* Inputs : pu1_src - pointer to the src sample q0                           */
/*          src_strd - source stride                                         */
/*          alpha - alpha value for the boundary                             */
/*          beta - beta value for the boundary                               */
/*                                                                           */
/* Globals : None                                                            */
/*                                                                           */
/* Processing : When the function is called twice, this operation is as      */
/*              described in Sec. 8.7.2.3 under the title "Filtering         */
/*              process for edges for bS equal to 4" in ITU T Rec H.264.
                                                                             */
/*                                                                           */
/* Outputs : None                                                            */
/*                                                                           */
/* Returns : None                                                            */
/*                                                                           */
/* Issues : None                                                             */
/*                                                                           */
/* Revision History:                                                         */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_luma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
                                           WORD32 src_strd,
                                           WORD32 alpha,
                                           WORD32 beta)
{
    __m128i zero = _mm_setzero_si128();
    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
    __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
    __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
    __m128i q0_16x8_1;
    __m128i p0_16x8_1;
    __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
    __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
    __m128i temp1, temp2, temp3, temp4, temp5, temp6;
    __m128i Alpha_8x16, Beta_8x16;
    __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
    __m128i const_val2_16x8 = _mm_set1_epi16(2);
    __m128i line1, line2, line3, line4, line5, line6, line7, line8;

    Alpha_8x16 = _mm_set1_epi16(alpha);
    Beta_8x16 = _mm_set1_epi16(beta);

    /* Load 8 rows of 8 bytes straddling the vertical edge (p3..q3). */
    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));

    /* 8x8 byte transpose so each register ends up holding one column. */
    temp1 = _mm_unpacklo_epi8(line1, line2);
    temp2 = _mm_unpacklo_epi8(line3, line4);
    temp3 = _mm_unpacklo_epi8(line5, line6);
    temp4 = _mm_unpacklo_epi8(line7, line8);

    line1 = _mm_unpacklo_epi16(temp1, temp2);
    line2 = _mm_unpackhi_epi16(temp1, temp2);
    line3 = _mm_unpacklo_epi16(temp3, temp4);
    line4 = _mm_unpackhi_epi16(temp3, temp4);

    p1_8x16 = _mm_unpacklo_epi32(line1, line3);
    p0_8x16 = _mm_unpackhi_epi32(line1, line3);
    q0_8x16 = _mm_unpacklo_epi32(line2, line4);
    q1_8x16 = _mm_unpackhi_epi32(line2, line4);

    /* Split into one register per column: p3..p0, q0..q3 (bytes in the   */
    /* low 64 bits of each register).                                      */
    p3_16x8 = _mm_unpacklo_epi64(p1_8x16, zero);
    p2_16x8 = _mm_unpackhi_epi64(p1_8x16, zero);
    q2_16x8 = _mm_unpacklo_epi64(q1_8x16, zero);
    q3_16x8 = _mm_unpackhi_epi64(q1_8x16, zero);
    p1_16x8 = _mm_unpacklo_epi64(p0_8x16, zero);
    p0_16x8 = _mm_unpackhi_epi64(p0_8x16, zero);
    q0_16x8 = _mm_unpacklo_epi64(q0_8x16, zero);
    q1_16x8 = _mm_unpackhi_epi64(q0_8x16, zero);

    /* Cond1 (ABS(p0 - q0) < alpha) — unsigned |a-b| via two saturating    */
    /* subtractions, widened to 16-bit for the signed compare.             */
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag1_16x8 = _mm_packs_epi16(temp2, temp1);

    /* Cond2 (ABS(q1 - q0) < beta) */
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* Cond3 (ABS(p1 - p0) < beta) */
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    /* flag1 = cond1 AND cond2 AND cond3: filter-this-edge mask. */
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* Strong-filter gate: ABS(p0 - q0) < ((alpha >> 2) + 2). */
    temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);
    Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
    Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* flag3: strong gate AND (ABS(p2 - p0) < beta) — selects 3-tap p-side. */
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
    flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);

    /* flag4: strong gate AND (ABS(q2 - q0) < beta) — selects 3-tap q-side. */
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag4_16x8 = _mm_packs_epi16(temp2, temp1);
    flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);

    /* Widen the 8 usable pixels of each column to 16-bit for arithmetic. */
    p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
    p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
    p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
    p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
    q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
    q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
    q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
    q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);

    /* Weak variants: p0_1 = (2*p1 + p0 + q1 + 2) >> 2,                    */
    /*                q0_1 = (2*q1 + q0 + p1 + 2) >> 2                     */
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    temp3 = _mm_slli_epi16(p1_8x16, 1);
    temp4 = _mm_slli_epi16(q1_8x16, 1);
    temp1 = _mm_add_epi16(temp5, temp3);
    temp2 = _mm_add_epi16(temp6, temp4);
    p0_16x8_1 = _mm_srai_epi16(temp1, 2);
    q0_16x8_1 = _mm_srai_epi16(temp2, 2);

    /* Strong variants: p1_2 = (p2 + p1 + p0 + q0 + 2) >> 2,               */
    /*                  q1_2 = (q2 + q1 + q0 + p0 + 2) >> 2                */
    temp6 = _mm_add_epi16(temp6, p0_8x16);
    temp5 = _mm_add_epi16(temp5, q0_8x16);
    temp1 = _mm_add_epi16(temp6, p2_8x16);
    temp2 = _mm_add_epi16(temp5, q2_8x16);
    p1_16x8_2 = _mm_srai_epi16(temp1, 2);
    q1_16x8_2 = _mm_srai_epi16(temp2, 2);

    /* p0_2 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3, and symmetric q0_2. */
    temp1 = _mm_add_epi16(temp3, p2_8x16);
    temp2 = _mm_add_epi16(temp4, q2_8x16);
    temp1 = _mm_add_epi16(temp1, q1_8x16);
    temp2 = _mm_add_epi16(temp2, p1_8x16);
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    temp3 = _mm_slli_epi16(temp3, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp3);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    p0_16x8_2 = _mm_srai_epi16(temp1, 3);
    q0_16x8_2 = _mm_srai_epi16(temp2, 3);

    /* p2_2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3, and symmetric q2_2. */
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    temp3 = _mm_slli_epi16(p2_8x16, 1);
    temp4 = _mm_slli_epi16(q2_8x16, 1);
    temp3 = _mm_add_epi16(p2_8x16, temp3);
    temp4 = _mm_add_epi16(q2_8x16, temp4);
    temp5 = _mm_slli_epi16(p3_8x16, 1);
    temp6 = _mm_slli_epi16(q3_8x16, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp4);
    temp1 = _mm_add_epi16(temp1, temp5);
    temp2 = _mm_add_epi16(temp2, temp6);
    p2_16x8_2 = _mm_srai_epi16(temp1, 3);
    q2_16x8_2 = _mm_srai_epi16(temp2, 3);

    /* Pack all candidate outputs back to unsigned bytes. */
    p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, zero);
    q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, zero);

    p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, zero);
    q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, zero);

    p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, zero);
    q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, zero);

    p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, zero);
    q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, zero);

    /* Blend: weak p0/q0 where flag1, then overridden by the strong        */
    /* variants where flag3 (p side) / flag4 (q side) hold.                */
    p0_16x8 = _mm_and_si128(p0_16x8,
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
    q0_16x8 = _mm_and_si128(q0_16x8,
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);

    p0_16x8 = _mm_and_si128(p0_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
    q0_16x8 = _mm_and_si128(q0_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);

    /* p1/q1 only change under the strong condition. */
    p1_16x8 = _mm_and_si128(p1_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
    q1_16x8 = _mm_and_si128(q1_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);

    /* p2/q2 likewise. */
    p2_16x8 = _mm_and_si128(p2_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
    p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
    q2_16x8 = _mm_and_si128(q2_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
    q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);

    /* Inverse transpose (columns back to rows) and store the 8 rows. */
    temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
    temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
    temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
    temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);

    p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
    p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
    q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
    q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);

    line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
    line2 = _mm_srli_si128(line1, 8);
    line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
    line4 = _mm_srli_si128(line3, 8);
    line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
    line6 = _mm_srli_si128(line5, 8);
    line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
    line8 = _mm_srli_si128(line7, 8);

    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);

}

/*****************************************************************************/
/*                                                                           */
/* Function Name : ih264_deblk_luma_vert_bslt4_mbaff_ssse3()                 */
/*                                                                           */
/* Description : This function performs filtering of a luma block            */
/*               vertical edge when boundary strength is less than 4.        */
/*                                                                           */
/* Inputs : pu1_src - pointer to the src sample q0                           */
/*          src_strd - source stride                                         */
/*          alpha - alpha value for the boundary                             */
/*          beta - beta value for the boundary                               */
/*          u4_bs - packed Boundary strength array                           */
/*          pu1_cliptab - tc0_table                                          */
/*                                                                           */
/* Globals : None                                                            */
/*                                                                           */
/* Processing : When the function is called twice, this operation is as      */
/*              described in Sec.
8.7.2.3 under the title "Filtering */ +/* process for edges for bS less than 4" in ITU T Rec H.264.*/ +/* */ +/* Outputs : None */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 12 02 2015 Naveen Kumar P Initial version */ +/* */ +/*****************************************************************************/ +void ih264_deblk_luma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 alpha, + WORD32 beta, + UWORD32 u4_bs, + const UWORD8 *pu1_cliptab) +{ + __m128i zero = _mm_setzero_si128(); + __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C_8x16; + __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8; + __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8; + __m128i temp1, temp2, temp3, temp4; + __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8; + __m128i in_macro_16x8; + __m128i const_val4_8x16; + UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; + UWORD8 clip0, clip1, clip2, clip3; + __m128i line1, line2, line3, line4, line5, line6, line7, line8; + __m128i q0_16x8_1, q1_16x8_1, q0_16x8_2; + __m128i p0_16x8_1, p1_16x8_1, p0_16x8_2; + + line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd)); + line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd)); + line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd)); + line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd)); + line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd)); + line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd)); + line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd)); + line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd)); + + temp1 = _mm_unpacklo_epi8(line1, line2); + temp2 = _mm_unpacklo_epi8(line3, line4); + temp3 = _mm_unpacklo_epi8(line5, line6); + temp4 = _mm_unpacklo_epi8(line7, line8); + + line1 = _mm_unpacklo_epi16(temp1, temp2); + line2 = _mm_unpackhi_epi16(temp1, temp2); + line3 = 
_mm_unpacklo_epi16(temp3, temp4); + line4 = _mm_unpackhi_epi16(temp3, temp4); + + temp1 = _mm_unpacklo_epi32(line1, line3); + temp2 = _mm_unpackhi_epi32(line1, line3); + temp3 = _mm_unpacklo_epi32(line2, line4); + temp4 = _mm_unpackhi_epi32(line2, line4); + + p3_16x8 = _mm_unpacklo_epi64(temp1, zero); + p2_16x8 = _mm_unpackhi_epi64(temp1, zero); + q2_16x8 = _mm_unpacklo_epi64(temp4, zero); + q3_16x8 = _mm_unpackhi_epi64(temp4, zero); + p1_16x8 = _mm_unpacklo_epi64(temp2, zero); + p0_16x8 = _mm_unpackhi_epi64(temp2, zero); + q0_16x8 = _mm_unpacklo_epi64(temp3, zero); + q1_16x8 = _mm_unpackhi_epi64(temp3, zero); + + u1_Bs0 = (u4_bs >> 24) & 0xff; + u1_Bs1 = (u4_bs >> 16) & 0xff; + u1_Bs2 = (u4_bs >> 8) & 0xff; + u1_Bs3 = (u4_bs >> 0) & 0xff; + clip0 = pu1_cliptab[u1_Bs0]; + clip1 = pu1_cliptab[u1_Bs1]; + clip2 = pu1_cliptab[u1_Bs2]; + clip3 = pu1_cliptab[u1_Bs3]; + + Alpha_8x16 = _mm_set1_epi16(alpha); + Beta_8x16 = _mm_set1_epi16(beta); + + bs_flag_16x8b = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2, + u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0); + + C0_16x8 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, clip3, clip3, clip2, clip2, + clip1, clip1, clip0, clip0); + + bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero); + bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask + C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero); + + //Cond1 (ABS(p0 - q0) < alpha) + temp1 = _mm_subs_epu8(q0_16x8, p0_16x8); + temp2 = _mm_subs_epu8(p0_16x8, q0_16x8); + temp1 = _mm_add_epi8(temp1, temp2); + + temp2 = _mm_unpacklo_epi8(temp1, zero); + temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2); + + flag1_16x8 = _mm_packs_epi16(temp2, zero); + flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b); + + //Cond2 (ABS(q1 - q0) < beta) + temp1 = _mm_subs_epu8(q0_16x8, q1_16x8); + temp2 = _mm_subs_epu8(q1_16x8, q0_16x8); + temp1 = _mm_add_epi8(temp1, temp2); + + temp2 = _mm_unpacklo_epi8(temp1, zero); + temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); + + flag2_16x8 = 
_mm_packs_epi16(temp2, zero); + flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); + + //Cond3 (ABS(p1 - p0) < beta) + temp1 = _mm_subs_epu8(p0_16x8, p1_16x8); + temp2 = _mm_subs_epu8(p1_16x8, p0_16x8); + temp1 = _mm_add_epi8(temp1, temp2); + + temp2 = _mm_unpacklo_epi8(temp1, zero); + temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); + + flag2_16x8 = _mm_packs_epi16(temp2, zero); + + // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta)) + flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); + + // (ABS(p2 - p0) < beta) + temp1 = _mm_subs_epu8(p0_16x8, p2_16x8); + temp2 = _mm_subs_epu8(p2_16x8, p0_16x8); + temp1 = _mm_add_epi8(temp1, temp2); + + temp2 = _mm_unpacklo_epi8(temp1, zero); + temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); + + flag2_16x8 = _mm_packs_epi16(temp2, zero); + flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); + + temp2 = _mm_subs_epi16(zero, temp2); + + C_8x16 = _mm_add_epi16(C0_8x16, temp2); + + // (ABS(q2 - q0) < beta) + temp1 = _mm_subs_epu8(q0_16x8, q2_16x8); + temp2 = _mm_subs_epu8(q2_16x8, q0_16x8); + temp1 = _mm_add_epi8(temp1, temp2); + + temp2 = _mm_unpacklo_epi8(temp1, zero); + temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); + + flag3_16x8 = _mm_packs_epi16(temp2, zero); + flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8); + + temp2 = _mm_subs_epi16(zero, temp2); + + C_8x16 = _mm_add_epi16(C_8x16, temp2); + + const_val4_8x16 = _mm_set1_epi16(4); + temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero), + _mm_unpacklo_epi8(p0_16x8, zero)); + temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero), + _mm_unpacklo_epi8(q1_16x8, zero)); + temp1 = _mm_slli_epi16(temp1, 2); + temp1 = _mm_add_epi16(temp1, temp2); + temp1 = _mm_add_epi16(temp1, const_val4_8x16); + in_macro_16x8 = _mm_srai_epi16(temp1, 3); + + in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3 + C_8x16 = _mm_subs_epi16(zero, C_8x16); + in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3 + + // p0 + temp1 = 
_mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8); + + temp1 = _mm_packus_epi16(temp1, zero); + + p0_16x8_1 = _mm_and_si128(temp1, flag1_16x8); + p0_16x8_2 = _mm_and_si128( + p0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF))); + + p0_16x8_1 = _mm_add_epi8(p0_16x8_1, p0_16x8_2); + + // q0 + temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8); + + temp1 = _mm_packus_epi16(temp1, zero); + + q0_16x8_1 = _mm_and_si128(temp1, flag1_16x8); + q0_16x8_2 = _mm_and_si128( + q0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF))); + + q0_16x8_1 = _mm_add_epi8(q0_16x8_1, q0_16x8_2); + + //if(Ap < Beta) + temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero), + _mm_unpacklo_epi8(p0_16x8, zero)); + temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1); + //temp2 = _mm_subs_epi16(zero,temp2); + temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2); + temp2 = _mm_add_epi16(temp1, temp2); + in_macro_16x8 = _mm_srai_epi16(temp2, 1); + + in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3 + C0_8x16 = _mm_subs_epi16(zero, C0_8x16); + in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3 + + // p1 + temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8); + + temp1 = _mm_packus_epi16(temp1, zero); + + p1_16x8_1 = _mm_and_si128(temp1, flag2_16x8); + p1_16x8 = _mm_and_si128(p1_16x8, + _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF))); + p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_1); + + //if(Aq < Beta) + temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero), + _mm_unpacklo_epi8(p0_16x8, zero)); + temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1); + //temp2 = _mm_slli_epi16 (temp2, 1); + temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2); + temp2 = _mm_add_epi16(temp1, temp2); + in_macro_16x8 = _mm_srai_epi16(temp2, 1); + + in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3 + C0_8x16 = _mm_subs_epi16(zero, C0_8x16); + in_macro_16x8 = _mm_min_epi16(C0_8x16, 
in_macro_16x8); //CLIP3 + + temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8); + + // q1 + temp1 = _mm_packus_epi16(temp1, zero); + + q1_16x8_1 = _mm_and_si128(temp1, flag3_16x8); + q1_16x8 = _mm_and_si128(q1_16x8, + _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF))); + q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_1); + + temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8); + temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8_1); + temp3 = _mm_unpacklo_epi8(q0_16x8_1, q1_16x8); + temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8); + + line7 = _mm_unpacklo_epi16(temp1, temp2); + temp1 = _mm_unpackhi_epi16(temp1, temp2); + line8 = _mm_unpacklo_epi16(temp3, temp4); + temp2 = _mm_unpackhi_epi16(temp3, temp4); + + line1 = _mm_unpacklo_epi32(line7, line8); + line2 = _mm_srli_si128(line1, 8); + line3 = _mm_unpackhi_epi32(line7, line8); + line4 = _mm_srli_si128(line3, 8); + line5 = _mm_unpacklo_epi32(temp1, temp2); + line6 = _mm_srli_si128(line5, 8); + line7 = _mm_unpackhi_epi32(temp1, temp2); + line8 = _mm_srli_si128(line7, 8); + + _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1); + _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2); + _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3); + _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4); + _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5); + _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6); + _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7); + _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8); +} + diff --git a/common/x86/ih264_ihadamard_scaling_sse42.c b/common/x86/ih264_ihadamard_scaling_sse42.c new file mode 100755 index 0000000..895291b --- /dev/null +++ b/common/x86/ih264_ihadamard_scaling_sse42.c @@ -0,0 +1,238 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 
2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_ihadamard_scaling_sse42.c + * + * @brief + * Contains definition of functions for h264 inverse hadamard 4x4 transform and scaling + * + * @author + * Mohit + * + * @par List of Functions: + * - ih264_ihadamard_scaling_4x4_sse42() + * - ih264_ihadamard_scaling_2x2_uv_ssse42() + * + * @remarks + * + ******************************************************************************* + */ +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_defs.h" +#include "ih264_trans_macros.h" +#include "ih264_macros.h" +#include "ih264_trans_data.h" +#include "ih264_size_defs.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include <immintrin.h> + +/* + ******************************************************************************** + * + * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients + * of a 16x16 intra prediction macroblock, and then performs scaling. 
+ * prediction buffer + * + * @par Description: + * The DC coefficients pass through a 2-stage inverse hadamard transform. + * This inverse transformed content is scaled to based on Qp value. + * + * @param[in] pi2_src + * input 4x4 block of DC coefficients + * + * @param[out] pi2_out + * output 4x4 block + * + * @param[in] pu2_iscal_mat + * pointer to scaling list + * + * @param[in] pu2_weigh_mat + * pointer to weight matrix + * + * @param[in] u4_qp_div_6 + * Floor (qp/6) + * + * @param[in] pi4_tmp + * temporary buffer of size 1*16 + * + * @returns none + * + * @remarks none + * + ******************************************************************************* + */ +void ih264_ihadamard_scaling_4x4_sse42(WORD16* pi2_src, WORD16* pi2_out, + const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat, + UWORD32 u4_qp_div_6, WORD32* pi4_tmp) { + __m128i src_r0_r1, src_r2_r3; + __m128i src_r0, src_r1, src_r2, src_r3; + __m128i temp0, temp1, temp2, temp3; + __m128i add_rshift = _mm_set1_epi32((1 << (5 - u4_qp_div_6))); + __m128i mult_val = _mm_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]); + + src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row + src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row + //sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1); + src_r0 = _mm_cvtepi16_epi32(src_r0_r1); + src_r0_r1 = _mm_srli_si128(src_r0_r1, 8); + src_r1 = _mm_cvtepi16_epi32(src_r0_r1); + + src_r2 = _mm_cvtepi16_epi32(src_r2_r3); + src_r2_r3 = _mm_srli_si128(src_r2_r3, 8); + src_r3 = _mm_cvtepi16_epi32(src_r2_r3); + + /* Perform Inverse transform */ + /*-------------------------------------------------------------*/ + /* IDCT [ Horizontal transformation ] */ + /*-------------------------------------------------------------*/ + // Matrix transpose + /* + * a0 a1 a2 a3 + * b0 b1 b2 b3 + * c0 c1 c2 c3 + * d0 d1 d2 d3 + */ + temp0 = 
_mm_unpacklo_epi32(src_r0, src_r1); //a0 b0 a1 b1 + temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //c0 d0 c1 d1 + temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //a2 b2 a3 b3 + temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 d2 c3 d3 + src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 b0 c0 d0 + src_r1 = _mm_unpackhi_epi64(temp0, temp2); //a1 b1 c1 d1 + src_r2 = _mm_unpacklo_epi64(temp1, temp3); //a2 b2 c2 d2 + src_r3 = _mm_unpackhi_epi64(temp1, temp3); //a3 b3 c3 d3 + + temp0 = _mm_add_epi32(src_r0, src_r3); + temp1 = _mm_add_epi32(src_r1, src_r2); + temp2 = _mm_sub_epi32(src_r1, src_r2); + temp3 = _mm_sub_epi32(src_r0, src_r3); + + src_r0 = _mm_add_epi32(temp0, temp1); + src_r1 = _mm_add_epi32(temp2, temp3); + src_r2 = _mm_sub_epi32(temp0, temp1); + src_r3 = _mm_sub_epi32(temp3, temp2); + + /*-------------------------------------------------------------*/ + /* IDCT [ Vertical transformation ] */ + /*-------------------------------------------------------------*/ + // Matrix transpose + /* + * a0 b0 c0 d0 + * a1 b1 c1 d1 + * a2 b2 c2 d2 + * a3 b3 c3 d3 + */ + temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 a1 b0 b1 + temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //a2 a3 b2 b3 + temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //c0 c1 d0 d1 + temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 c3 d2 d3 + src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 a1 a2 a3 + src_r1 = _mm_unpackhi_epi64(temp0, temp2); //b0 b1 b2 b3 + src_r2 = _mm_unpacklo_epi64(temp1, temp3); //c0 c1 c2 c3 + src_r3 = _mm_unpackhi_epi64(temp1, temp3); //d0 d1 d2 d3 + + temp0 = _mm_add_epi32(src_r0, src_r3); + temp1 = _mm_add_epi32(src_r1, src_r2); + temp2 = _mm_sub_epi32(src_r1, src_r2); + temp3 = _mm_sub_epi32(src_r0, src_r3); + + src_r0 = _mm_add_epi32(temp0, temp1); + src_r1 = _mm_add_epi32(temp2, temp3); + src_r2 = _mm_sub_epi32(temp0, temp1); + src_r3 = _mm_sub_epi32(temp3, temp2); + + src_r0 = _mm_mullo_epi32(src_r0, mult_val); + src_r1 = _mm_mullo_epi32(src_r1, mult_val); + src_r2 = 
_mm_mullo_epi32(src_r2, mult_val); + src_r3 = _mm_mullo_epi32(src_r3, mult_val); + + //Scaling + if (u4_qp_div_6 >= 6) { + src_r0 = _mm_slli_epi32(src_r0, u4_qp_div_6 - 6); + src_r1 = _mm_slli_epi32(src_r1, u4_qp_div_6 - 6); + src_r2 = _mm_slli_epi32(src_r2, u4_qp_div_6 - 6); + src_r3 = _mm_slli_epi32(src_r3, u4_qp_div_6 - 6); + } else { + temp0 = _mm_add_epi32(src_r0, add_rshift); + temp1 = _mm_add_epi32(src_r1, add_rshift); + temp2 = _mm_add_epi32(src_r2, add_rshift); + temp3 = _mm_add_epi32(src_r3, add_rshift); + src_r0 = _mm_srai_epi32(temp0, 6 - u4_qp_div_6); + src_r1 = _mm_srai_epi32(temp1, 6 - u4_qp_div_6); + src_r2 = _mm_srai_epi32(temp2, 6 - u4_qp_div_6); + src_r3 = _mm_srai_epi32(temp3, 6 - u4_qp_div_6); + } + src_r0_r1 = _mm_packs_epi32(src_r0, src_r1); + src_r2_r3 = _mm_packs_epi32(src_r2, src_r3); + + _mm_storeu_si128((__m128i *) (&pi2_out[0]), src_r0_r1); + _mm_storeu_si128((__m128i *) (&pi2_out[8]), src_r2_r3); +} + +void ih264_ihadamard_scaling_2x2_uv_sse42(WORD16* pi2_src, + WORD16* pi2_out, + const UWORD16 *pu2_iscal_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 u4_qp_div_6, + WORD32* pi4_tmp) +{ + UNUSED(pi4_tmp); + __m128i src, plane_0, plane_1, temp0, temp1, sign_reg; + __m128i zero_8x16b = _mm_setzero_si128(); + __m128i scale_val = _mm_set1_epi32((WORD32)(pu2_iscal_mat[0] * pu2_weigh_mat[0])); + src = _mm_loadu_si128((__m128i *) pi2_src); //a0 a1 a2 a3 b0 b1 b2 b3 + sign_reg = _mm_cmpgt_epi16(zero_8x16b, src); + plane_0 = _mm_unpacklo_epi16(src, sign_reg); //a0 a1 a2 a3 -- 32 bits + plane_1 = _mm_unpackhi_epi16(src, sign_reg); //b0 b1 b2 b3 -- 32 bits + + temp0 = _mm_hadd_epi32(plane_0, plane_1); //a0+a1 a2+a3 b0+b1 b2+b3 + temp1 = _mm_hsub_epi32(plane_0, plane_1); //a0-a1 a2-a3 b0-b1 b2-b3 + plane_0 = _mm_hadd_epi32(temp0, temp1); //a0+a1+a2+a3 b0+b1+b2+b3 a0-a1+a2-a3 b0-b1+b2-b3 + plane_1 = _mm_hsub_epi32(temp0, temp1); //a0+a1-a2-a3 b0+b1-b2-b3 a0-a1-a2+a3 b0-b1-b2+b3 + temp0 = _mm_unpacklo_epi32(plane_0, plane_1); //a0+a1+a2+a3 
a0+a1-a2-a3 b0+b1+b2+b3 b0+b1-b2-b3 + temp1 = _mm_unpackhi_epi32(plane_0, plane_1); //a0-a1+a2-a3 a0-a1-a2+a3 b0-b1+b2-b3 b0-b1-b2+b3 + + plane_0 = _mm_unpacklo_epi64(temp0, temp1); //a0+a1+a2+a3 a0+a1-a2-a3 a0-a1+a2-a3 a0-a1-a2+a3 + plane_1 = _mm_unpackhi_epi64(temp0, temp1); //b0+b1+b2+b3 b0+b1-b2-b3 b0-b1+b2-b3 b0-b1-b2+b3 + + plane_0 = _mm_shuffle_epi32(plane_0, 0xd8); //a0+a1+a2+a3 a0-a1+a2-a3 a0+a1-a2-a3 a0-a1-a2+a3 + plane_1 = _mm_shuffle_epi32(plane_1, 0xd8); //b0+b1+b2+b3 b0-b1+b2-b3 b0+b1-b2-b3 b0-b1-b2+b3 + + temp0 = _mm_mullo_epi32(scale_val, plane_0); //multiply by pu2_iscal_mat[0] * pu2_weigh_mat[0] + temp1 = _mm_mullo_epi32(scale_val, plane_1); //multiply by pu2_iscal_mat[0] * pu2_weigh_mat[0] + + temp0 = _mm_slli_epi32(temp0, u4_qp_div_6); + temp1 = _mm_slli_epi32(temp1, u4_qp_div_6); + + temp0 = _mm_srai_epi32(temp0, 5); + temp1 = _mm_srai_epi32(temp1, 5); + + temp0 = _mm_packs_epi32(temp0, temp1); //Final values are 16-bits only. + + _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0); + +} diff --git a/common/x86/ih264_ihadamard_scaling_ssse3.c b/common/x86/ih264_ihadamard_scaling_ssse3.c new file mode 100755 index 0000000..232d9fa --- /dev/null +++ b/common/x86/ih264_ihadamard_scaling_ssse3.c @@ -0,0 +1,200 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_ihadamard_scaling_ssse3.c + * + * @brief + * Contains definition of functions for h264 inverse hadamard 4x4 transform and scaling + * + * @author + * Mohit + * + * @par List of Functions: + * - ih264_ihadamard_scaling_4x4_ssse3() + * + * @remarks + * + ******************************************************************************* + */ +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_defs.h" +#include "ih264_trans_macros.h" +#include "ih264_macros.h" +#include "ih264_trans_data.h" +#include "ih264_size_defs.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include <immintrin.h> + +/* + ******************************************************************************** + * + * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients + * of a 16x16 intra prediction macroblock, and then performs scaling. + * prediction buffer + * + * @par Description: + * The DC coefficients pass through a 2-stage inverse hadamard transform. + * This inverse transformed content is scaled to based on Qp value. 
+ * + * @param[in] pi2_src + * input 4x4 block of DC coefficients + * + * @param[out] pi2_out + * output 4x4 block + * + * @param[in] pu2_iscal_mat + * pointer to scaling list + * + * @param[in] pu2_weigh_mat + * pointer to weight matrix + * + * @param[in] u4_qp_div_6 + * Floor (qp/6) + * + * @param[in] pi4_tmp + * temporary buffer of size 1*16 + * + * @returns none + * + * @remarks none + * + ******************************************************************************* + */ +void ih264_ihadamard_scaling_4x4_ssse3(WORD16* pi2_src, WORD16* pi2_out, + const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat, + UWORD32 u4_qp_div_6, WORD32* pi4_tmp) { + int val = 0xFFFF; + __m128i src_r0_r1, src_r2_r3, sign_reg, zero_8x16b = _mm_setzero_si128(); + __m128i src_r0, src_r1, src_r2, src_r3; + __m128i temp0, temp1, temp2, temp3; + __m128i add_rshift = _mm_set1_epi32((1 << (5 - u4_qp_div_6))); + __m128i mult_val = _mm_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]); + + __m128i mask = _mm_set1_epi32(val); + mult_val = _mm_and_si128(mult_val, mask); + + src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row + src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row + sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1); + src_r0 = _mm_unpacklo_epi16(src_r0_r1, sign_reg); + src_r1 = _mm_unpackhi_epi16(src_r0_r1, sign_reg); + sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r2_r3); + src_r2 = _mm_unpacklo_epi16(src_r2_r3, sign_reg); + src_r3 = _mm_unpackhi_epi16(src_r2_r3, sign_reg); + + /* Perform Inverse transform */ + /*-------------------------------------------------------------*/ + /* IDCT [ Horizontal transformation ] */ + /*-------------------------------------------------------------*/ + // Matrix transpose + /* + * a0 a1 a2 a3 + * b0 b1 b2 b3 + * c0 c1 c2 c3 + * d0 d1 d2 d3 + */ + temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 b0 a1 b1 + temp2 
= _mm_unpacklo_epi32(src_r2, src_r3); //c0 d0 c1 d1 + temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //a2 b2 a3 b3 + temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 d2 c3 d3 + src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 b0 c0 d0 + src_r1 = _mm_unpackhi_epi64(temp0, temp2); //a1 b1 c1 d1 + src_r2 = _mm_unpacklo_epi64(temp1, temp3); //a2 b2 c2 d2 + src_r3 = _mm_unpackhi_epi64(temp1, temp3); //a3 b3 c3 d3 + + temp0 = _mm_add_epi32(src_r0, src_r3); + temp1 = _mm_add_epi32(src_r1, src_r2); + temp2 = _mm_sub_epi32(src_r1, src_r2); + temp3 = _mm_sub_epi32(src_r0, src_r3); + + src_r0 = _mm_add_epi32(temp0, temp1); + src_r1 = _mm_add_epi32(temp2, temp3); + src_r2 = _mm_sub_epi32(temp0, temp1); + src_r3 = _mm_sub_epi32(temp3, temp2); + + /*-------------------------------------------------------------*/ + /* IDCT [ Vertical transformation ] */ + /*-------------------------------------------------------------*/ + // Matrix transpose + /* + * a0 b0 c0 d0 + * a1 b1 c1 d1 + * a2 b2 c2 d2 + * a3 b3 c3 d3 + */ + temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 a1 b0 b1 + temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //a2 a3 b2 b3 + temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //c0 c1 d0 d1 + temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 c3 d2 d3 + src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 a1 a2 a3 + src_r1 = _mm_unpackhi_epi64(temp0, temp2); //b0 b1 b2 b3 + src_r2 = _mm_unpacklo_epi64(temp1, temp3); //c0 c1 c2 c3 + src_r3 = _mm_unpackhi_epi64(temp1, temp3); //d0 d1 d2 d3 + + temp0 = _mm_add_epi32(src_r0, src_r3); + temp1 = _mm_add_epi32(src_r1, src_r2); + temp2 = _mm_sub_epi32(src_r1, src_r2); + temp3 = _mm_sub_epi32(src_r0, src_r3); + + src_r0 = _mm_add_epi32(temp0, temp1); + src_r1 = _mm_add_epi32(temp2, temp3); + src_r2 = _mm_sub_epi32(temp0, temp1); + src_r3 = _mm_sub_epi32(temp3, temp2); + + src_r0 = _mm_and_si128(src_r0, mask); + src_r1 = _mm_and_si128(src_r1, mask); + src_r2 = _mm_and_si128(src_r2, mask); + src_r3 = _mm_and_si128(src_r3, mask); + + src_r0 = 
_mm_madd_epi16(src_r0, mult_val); + src_r1 = _mm_madd_epi16(src_r1, mult_val); + src_r2 = _mm_madd_epi16(src_r2, mult_val); + src_r3 = _mm_madd_epi16(src_r3, mult_val); + + //Scaling + if (u4_qp_div_6 >= 6) { + src_r0 = _mm_slli_epi32(src_r0, u4_qp_div_6 - 6); + src_r1 = _mm_slli_epi32(src_r1, u4_qp_div_6 - 6); + src_r2 = _mm_slli_epi32(src_r2, u4_qp_div_6 - 6); + src_r3 = _mm_slli_epi32(src_r3, u4_qp_div_6 - 6); + } else { + temp0 = _mm_add_epi32(src_r0, add_rshift); + temp1 = _mm_add_epi32(src_r1, add_rshift); + temp2 = _mm_add_epi32(src_r2, add_rshift); + temp3 = _mm_add_epi32(src_r3, add_rshift); + src_r0 = _mm_srai_epi32(temp0, 6 - u4_qp_div_6); + src_r1 = _mm_srai_epi32(temp1, 6 - u4_qp_div_6); + src_r2 = _mm_srai_epi32(temp2, 6 - u4_qp_div_6); + src_r3 = _mm_srai_epi32(temp3, 6 - u4_qp_div_6); + } + src_r0_r1 = _mm_packs_epi32(src_r0, src_r1); + src_r2_r3 = _mm_packs_epi32(src_r2, src_r3); + + _mm_storeu_si128((__m128i *) (&pi2_out[0]), src_r0_r1); + _mm_storeu_si128((__m128i *) (&pi2_out[8]), src_r2_r3); +} diff --git a/common/x86/ih264_inter_pred_filters_ssse3.c b/common/x86/ih264_inter_pred_filters_ssse3.c new file mode 100755 index 0000000..64e364e --- /dev/null +++ b/common/x86/ih264_inter_pred_filters_ssse3.c @@ -0,0 +1,4375 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/*****************************************************************************/ +/* */ +/* File Name : ih264_inter_pred_filters_intr_ssse3.c */ +/* */ +/* Description : Contains function definitions for weighted */ +/* prediction functions in x86 sse4 intrinsics */ +/* */ +/* List of Functions : ih264_inter_pred_luma_copy_ssse3() */ +/* ih264_inter_pred_luma_horz_ssse3() */ +/* ih264_inter_pred_luma_vert_ssse3() */ +/* ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3() */ +/* ih264_inter_pred_luma_horz_qpel_ssse3() */ +/* ih264_inter_pred_luma_vert_qpel_ssse3() */ +/* ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3() */ +/* ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3() */ +/* ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3() */ +/* ih264_inter_pred_chroma_ssse3() */ +/* */ +/* Issues / Problems : None */ +/* */ +/* Revision History : */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 13 02 2015 Kaushik Initial version */ +/* Senthoor */ +/* */ +/*****************************************************************************/ +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +#include <immintrin.h> +#include "ih264_typedefs.h" +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include "ih264_inter_pred_filters.h" + +/*****************************************************************************/ +/* Constant Data variables */ +/*****************************************************************************/ + +/* coefficients for 6 tap filtering*/ +//const WORD32 ih264_g_six_tap[3] ={1,-5,20}; +/*****************************************************************************/ +/* Function definitions . 
*/ +/*****************************************************************************/ +/*****************************************************************************/ +/* */ +/* Function Name : ih264_inter_pred_luma_copy_ssse3 */ +/* */ +/* Description : This function copies the contents of ht x wd block from */ +/* source to destination. (ht,wd) can be (4,4), (8,4), */ +/* (4,8), (8,8), (16,8), (8,16) or (16,16). */ +/* */ +/* Inputs : puc_src - pointer to source */ +/* puc_dst - pointer to destination */ +/* src_strd - stride for source */ +/* dst_strd - stride for destination */ +/* ht - height of the block */ +/* wd - width of the block */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 13 02 2015 Kaushik Initial Version */ +/* Senthoor */ +/* */ +/*****************************************************************************/ +void ih264_inter_pred_luma_copy_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ht, + WORD32 wd, + UWORD8* pu1_tmp, + WORD32 dydx) +{ + __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b; + + UNUSED(pu1_tmp); + UNUSED(dydx); + + WORD32 src_strd2, src_strd3, src_strd4, dst_strd2, dst_strd3, dst_strd4; + + src_strd2 = src_strd << 1; + dst_strd2 = dst_strd << 1; + src_strd4 = src_strd << 2; + dst_strd4 = dst_strd << 2; + src_strd3 = src_strd2 + src_strd; + dst_strd3 = dst_strd2 + dst_strd; + + if(wd == 4) + { + __m128i mask_full_128b, mask_low_32b; + + mask_full_128b = _mm_set1_epi8(0xff); + mask_low_32b = _mm_srli_si128(mask_full_128b, 12); + // mask for first four bytes + + do + { + y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); + y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd2)); + y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd3)); + + _mm_maskmoveu_si128(y_0_16x8b, mask_low_32b, (char*)pu1_dst); + _mm_maskmoveu_si128(y_1_16x8b, mask_low_32b, 
(char*)(pu1_dst + dst_strd)); + _mm_maskmoveu_si128(y_2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2)); + _mm_maskmoveu_si128(y_3_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3)); + + ht -= 4; + pu1_src += src_strd4; + pu1_dst += dst_strd4; + } + while(ht > 0); + } + else if(wd == 8) + { + do + { + y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); + y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd2)); + y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd3)); + + _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd2), y_2_16x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd3), y_3_16x8b); + + ht -= 4; + pu1_src += src_strd4; + pu1_dst += dst_strd4; + } + while(ht > 0); + } + else // wd == 16 + { + WORD32 src_strd5, src_strd6, src_strd7, src_strd8; + WORD32 dst_strd5, dst_strd6, dst_strd7, dst_strd8; + + __m128i y_4_16x8b, y_5_16x8b, y_6_16x8b, y_7_16x8b; + + src_strd5 = src_strd2 + src_strd3; + dst_strd5 = dst_strd2 + dst_strd3; + src_strd6 = src_strd3 << 1; + dst_strd6 = dst_strd3 << 1; + src_strd7 = src_strd3 + src_strd4; + dst_strd7 = dst_strd3 + dst_strd4; + src_strd8 = src_strd << 3; + dst_strd8 = dst_strd << 3; + + do + { + y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); + y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd2)); + y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd3)); + y_4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd4)); + y_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd5)); + y_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd6)); + y_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd7)); + + _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b); + _mm_storeu_si128((__m128i 
*)(pu1_dst + dst_strd2), y_2_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), y_3_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd4), y_4_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd5), y_5_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd6), y_6_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd7), y_7_16x8b); + + ht -= 8; + pu1_src += src_strd8; + pu1_dst += dst_strd8; + } + while(ht > 0); + } +} + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_inter_pred_luma_horz_ssse3 */ +/* */ +/* Description : This function applies a horizontal 6-tap filter on */ +/* ht x wd block as mentioned in sec. 8.4.2.2.1 titled */ +/* "Luma sample interpolation process". (ht,wd) can be */ +/* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */ +/* */ +/* Inputs : puc_src - pointer to source */ +/* puc_dst - pointer to destination */ +/* src_strd - stride for source */ +/* dst_strd - stride for destination */ +/* ht - height of the block */ +/* wd - width of the block */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 13 02 2015 Kaushik Initial Version */ +/* Senthoor */ +/* */ +/*****************************************************************************/ +void ih264_inter_pred_luma_horz_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ht, + WORD32 wd, + UWORD8* pu1_tmp, + WORD32 dydx) +{ + __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; + __m128i const_val16_8x16b; + + UNUSED(pu1_tmp); + UNUSED(dydx); + + pu1_src -= 2; // the filter input starts from x[-2] (till x[3]) + + coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 + coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 + coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 + 
//c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 + const_val16_8x16b = _mm_set1_epi16(16); + + if(wd == 4) + { + __m128i src_r0_16x8b, src_r1_16x8b, src_r0r1_16x8b; + __m128i src_r0_sht_16x8b, src_r1_sht_16x8b; + + __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b; + __m128i res_r0r1_16x8b; + + __m128i mask_full_16x8b, mask_low32b; + + mask_full_16x8b = _mm_set1_epi8(0xff); + mask_low32b = _mm_srli_si128(mask_full_16x8b, 12); // mask for first four bytes + + //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... + //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... + + do + { + src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 + src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 + + src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 + src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 + + src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4 + res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 + //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 + + src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6 + res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 + //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3 + + src_r0_16x8b = 
_mm_srli_si128(src_r0_16x8b, 4); //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0 + + src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8 + res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 + //b4*c4+b5*c5 b5*c4+b6*c5 b4*c6+b7*c5 b7*c4+b8*c5 + + res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b); + res_r0r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0r1_t3_8x16b); + res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b); //a0*c0+a1*c1+a2*c2+a3*c3+a4*a4+a5*c5 + 16; + //a1*c0+a2*c1+a2*c2+a3*c3+a5*a4+a6*c5 + 16; + //a2*c0+a3*c1+a4*c2+a5*c3+a6*a4+a7*c5 + 16; + //a3*c0+a4*c1+a5*c2+a6*c3+a6*a4+a8*c5 + 16; + //b0*c0+b1*c1+b2*c2+b3*c3+b4*b4+b5*c5 + 16; + //b1*c0+b2*c1+b2*c2+b3*c3+b5*b4+b6*c5 + 16; + //b2*c0+b3*c1+b4*c2+b5*c3+b6*b4+b7*c5 + 16; + //b3*c0+b4*c1+b5*c2+b6*c3+b6*b4+b8*c5 + 16; + + res_r0r1_t1_8x16b = _mm_srai_epi16(res_r0r1_t1_8x16b, 5); //shifting right by 5 bits. + + res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b, res_r0r1_t1_8x16b); + + _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)pu1_dst); + res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4); + _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd)); + + ht -= 2; + pu1_src += src_strd << 1; + pu1_dst += dst_strd << 1; + } + while(ht > 0); + } + else if(wd == 8) + { + __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b; + __m128i src_r0_t1_16x8b, src_r1_t1_16x8b; + + __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b; + __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b; + + //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... + //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... 
+ + do + { + src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 + src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 + + res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 + //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 + res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 + //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10 + + res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 + //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 + res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3 + //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3 + + src_r0_16x8b = 
_mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12 + + res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 + //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 + res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5 + //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5 + res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b); + res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b); + res_r0_t3_8x16b = _mm_add_epi16(res_r0_t3_8x16b, const_val16_8x16b); + res_r1_t3_8x16b = _mm_add_epi16(res_r1_t3_8x16b, const_val16_8x16b); + res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b); + res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b); + + res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5); //shifting right by 5 bits. 
+ res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5); + + src_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r0_t1_8x16b); + src_r1_16x8b = _mm_packus_epi16(res_r1_t1_8x16b, res_r1_t1_8x16b); + + _mm_storel_epi64((__m128i *)pu1_dst, src_r0_16x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_r1_16x8b); + + ht -= 2; + pu1_src += src_strd << 1; + pu1_dst += dst_strd << 1; + } + while(ht > 0); + } + else // wd == 16 + { + __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b; + __m128i src_r0_t1_16x8b, src_r1_t1_16x8b; + + __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b; + __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b; + + //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... + //Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... + //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels. + + do + { + src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 + src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 + + res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 + //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 + res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 + //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 
0 0 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10 + + res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 + //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 + res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3 + //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12 + + res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 + //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 + res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5 + //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5 + res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b); + res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b); + res_r0_t3_8x16b = _mm_add_epi16(res_r0_t3_8x16b, const_val16_8x16b); + res_r1_t3_8x16b = 
_mm_add_epi16(res_r1_t3_8x16b, const_val16_8x16b); + res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b); + res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b); + + res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5); //shifting right by 5 bits. + res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5); + + src_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b); + _mm_storeu_si128((__m128i *)pu1_dst, src_r0_16x8b); + + ht--; + pu1_src += src_strd; + pu1_dst += dst_strd; + } + while(ht > 0); + } +} + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_inter_pred_luma_vert_ssse3 */ +/* */ +/* Description : This function applies a vertical 6-tap filter on */ +/* ht x wd block as mentioned in sec. 8.4.2.2.1 titled */ +/* "Luma sample interpolation process". (ht,wd) can be */ +/* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */ +/* */ +/* Inputs : puc_src - pointer to source */ +/* puc_dst - pointer to destination */ +/* src_strd - stride for source */ +/* dst_strd - stride for destination */ +/* ht - height of the block */ +/* wd - width of the block */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 13 02 2015 Kaushik Initial Version */ +/* Senthoor */ +/* */ +/*****************************************************************************/ +void ih264_inter_pred_luma_vert_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ht, + WORD32 wd, + UWORD8* pu1_tmp, + WORD32 dydx) +{ + __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b; + __m128i src_r5_16x8b, src_r6_16x8b; + __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b; + + __m128i res_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b; + + __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; + __m128i const_val16_8x16b; + + UNUSED(pu1_tmp); + UNUSED(dydx); + + coeff0_1_16x8b = 
_mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 + coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 + coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 + //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 + const_val16_8x16b = _mm_set1_epi16(16); + + pu1_src -= src_strd << 1; // the filter input starts from x[-2] (till x[3]) + + if(wd == 4) + { + __m128i mask_low32b; + + mask_low32b = _mm_set1_epi8(0xff); + + //Epilogue: Load all the pred rows except sixth and seventh row + // for the first and second row processing. + src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + pu1_src += src_strd; + src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + pu1_src += src_strd; + src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + pu1_src += src_strd; + src_r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + pu1_src += src_strd; + src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + pu1_src += src_strd; + + mask_low32b = _mm_srli_si128(mask_low32b, 12); // mask for first four bytes + + src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b); + src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b); + src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b); + src_r3_16x8b = _mm_unpacklo_epi32(src_r3_16x8b, src_r4_16x8b); + + do + { + src_r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); + + src_r4_16x8b = _mm_unpacklo_epi32(src_r4_16x8b, src_r5_16x8b); + src_r5_16x8b = _mm_unpacklo_epi32(src_r5_16x8b, src_r6_16x8b); + + src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b); + src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b); + src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b); + + res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); + res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); + res_t3_8x16b = 
_mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); + res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b); + + res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. + res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b); + + _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)pu1_dst); + res_16x8b = _mm_srli_si128(res_16x8b, 4); + _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd)); + + src_r0_16x8b = src_r2_16x8b; + src_r1_16x8b = src_r3_16x8b; + src_r2_16x8b = src_r4_16x8b; + src_r3_16x8b = src_r5_16x8b; + src_r4_16x8b = src_r6_16x8b; + + ht -= 2; + pu1_src += src_strd << 1; + pu1_dst += dst_strd << 1; + } + while(ht > 0); + } + + else if(wd == 8) + { + //Epilogue: Load all the pred rows except sixth and seventh row + // for the first and second row processing. + src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + pu1_src += src_strd; + src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + pu1_src += src_strd; + src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + pu1_src += src_strd; + src_r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + pu1_src += src_strd; + src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + pu1_src += src_strd; + + src_r0_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); + src_r1_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b); + src_r2_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b); + src_r3_16x8b = _mm_unpacklo_epi64(src_r3_16x8b, src_r4_16x8b); + + do + { + src_r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); + + src_r4_16x8b = _mm_unpacklo_epi64(src_r4_16x8b, src_r5_16x8b); + src_r5_16x8b = _mm_unpacklo_epi64(src_r5_16x8b, src_r6_16x8b); + + src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b); + src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b); + 
src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b); + + res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); + res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); + res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b); + + res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. + res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b); + + _mm_storel_epi64((__m128i *)pu1_dst, res_16x8b); + + src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b); + src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b); + src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b); + + res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); + res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); + res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b); + + res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. + res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b); + + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b); + + src_r0_16x8b = src_r2_16x8b; + src_r1_16x8b = src_r3_16x8b; + src_r2_16x8b = src_r4_16x8b; + src_r3_16x8b = src_r5_16x8b; + src_r4_16x8b = src_r6_16x8b; + + ht -= 2; + pu1_src += src_strd << 1; + pu1_dst += dst_strd << 1; + } + while(ht > 0); + } + else // wd == 16 + { + __m128i res_t0_8x16b; + + //Epilogue: Load all the pred rows except sixth and seventh row + // for the first and second row processing. 
+ src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + pu1_src += src_strd; + src_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + pu1_src += src_strd; + src_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + pu1_src += src_strd; + src_r3_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + pu1_src += src_strd; + src_r4_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + pu1_src += src_strd; + + do + { + src_r5_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); + + src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b); + src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b); + src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b); + + res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); + res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); + res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b); + res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. + + src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b); + src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b); + src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b); + + res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); + res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); + res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b); + res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. 
+ + res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b); + + _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b); + + src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b); + src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b); + src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b); + + res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); + res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); + res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b); + res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. + + src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b); + src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b); + src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b); + + res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); + res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); + res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b); + res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. 
+ + res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b); + + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res_16x8b); + + src_r0_16x8b = src_r2_16x8b; + src_r1_16x8b = src_r3_16x8b; + src_r2_16x8b = src_r4_16x8b; + src_r3_16x8b = src_r5_16x8b; + src_r4_16x8b = src_r6_16x8b; + + ht -= 2; + pu1_src += src_strd << 1; + pu1_dst += dst_strd << 1; + } + while(ht > 0); + } +} + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3 */ +/* */ +/* Description : This function implements a two stage cascaded six tap */ +/* filter, horizontally and then vertically on ht x wd */ +/* block as mentioned in sec. 8.4.2.2.1 titled "Luma sample */ +/* interpolation process". (ht,wd) can be (4,4), (8,4), */ +/* (4,8), (8,8), (16,8), (8,16) or (16,16). */ +/* */ +/* Inputs : puc_src - pointer to source */ +/* puc_dst - pointer to destination */ +/* src_strd - stride for source */ +/* dst_strd - stride for destination */ +/* ht - height of the block */ +/* wd - width of the block */ +/* pu1_tmp - pointer to temporary buffer */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 13 02 2015 Kaushik Initial Version */ +/* Senthoor */ +/* */ +/*****************************************************************************/ +void ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ht, + WORD32 wd, + UWORD8* pu1_tmp, + WORD32 dydx) +{ + UNUSED(dydx); + + if(wd == 4) + { + WORD16 *pi2_temp; + + pu1_tmp += 4; + pu1_src -= src_strd << 1; + pi2_temp = (WORD16 *)pu1_tmp; + pu1_src -= 2; // the filter input starts from x[-2] (till x[3]) + + // Horizontal 6-tap filtering + { + WORD32 ht_tmp = ht + 4; + + __m128i src_r0_16x8b, src_r1_16x8b; + __m128i src_r0_sht_16x8b, src_r1_sht_16x8b; + __m128i src_r0r1_t1_16x8b; + __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, 
res_r0r1_t3_8x16b; + __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; + + coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 + coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 + coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 + //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 + //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... + //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... + + do + { + src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 + src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 + + src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 + src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 + + src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4 + res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 + //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 + + src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6 + res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 + //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a4 a5 a5 a6 
a6 a7 a7 a8 0 0 0 0 0 0 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0 + + src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8 + res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 + //b4*c4+b5*c5 b5*c4+b6*c5 b4*c6+b7*c5 b7*c4+b8*c5 + res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b); + res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t3_8x16b, res_r0r1_t1_8x16b); + + _mm_storeu_si128((__m128i *)pi2_temp, res_r0r1_t1_8x16b); + + ht_tmp -= 2; + pu1_src += src_strd << 1; + pi2_temp += 8; + } + while(ht_tmp > 0); + + src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 + src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 + + src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 + res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b,4); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 + res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b,4); //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0 + res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 + + res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b); + res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t3_8x16b, res_r0r1_t1_8x16b); + + _mm_storel_epi64((__m128i *)pi2_temp, res_r0r1_t1_8x16b); + } + + pi2_temp = (WORD16 *)pu1_tmp; + + // Vertical 6-tap filtering + { + __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b, + src_r4_8x16b; + __m128i src_r5_8x16b, src_r6_8x16b; + __m128i 
src_t1_8x16b, src_t2_8x16b; + + __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b; + __m128i res_8x16b, res_16x8b; + + __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b; + __m128i const_val512_4x32b, mask_low32b; + + mask_low32b = _mm_set1_epi8(0xff); + + coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); + coeff2_3_8x16b = _mm_set1_epi32(0x00140014); + coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); + + mask_low32b = _mm_srli_si128(mask_low32b, 12); + const_val512_4x32b = _mm_set1_epi32(512); + + src_r0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp)); + src_r1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 4)); + src_r2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 8)); + src_r3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 12)); + src_r4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 16)); + pi2_temp += 20; + + do + { + src_r5_8x16b = _mm_loadl_epi64((__m128i *)pi2_temp); + src_r6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 4)); + + src_r0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); + src_t1_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); + src_t2_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_t1_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_t2_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + + res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + src_r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b); + src_t1_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b); + src_t2_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_t1_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_t2_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = 
_mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + + res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); + res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); + + _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)pu1_dst); + res_16x8b = _mm_srli_si128(res_16x8b, 4); + _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd)); + + src_r0_8x16b = src_r2_8x16b; + src_r1_8x16b = src_r3_8x16b; + src_r2_8x16b = src_r4_8x16b; + src_r3_8x16b = src_r5_8x16b; + src_r4_8x16b = src_r6_8x16b; + + ht -= 2; + pi2_temp += 8; + pu1_dst += dst_strd << 1; + } + while(ht > 0); + } + } + else if(wd == 8) + { + WORD16 *pi2_temp; + + pu1_tmp += 4; + pu1_src -= src_strd << 1; + pi2_temp = (WORD16 *)pu1_tmp; + pu1_src -= 2; // the filter input starts from x[-2] (till x[3]) + + // Horizontal 6-tap filtering + { + WORD32 ht_tmp = ht + 4; + + __m128i src_r0_16x8b, src_r1_16x8b; + __m128i src_r0_sht_16x8b, src_r1_sht_16x8b; + __m128i src_r0_t1_16x8b, src_r1_t1_16x8b; + __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b; + __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b; + __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; + + coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 + coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 + coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 + //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 + //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... + //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... 
+ + do + { + src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 + src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 + + res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 + //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 + res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 + //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10 + + res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 + //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 + res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3 + //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3 + + 
src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12 + + res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 + //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 + res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5 + //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5 + res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b); + res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b); + + res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b); + res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b); + + _mm_storeu_si128((__m128i *)pi2_temp, res_r0_t1_8x16b); + _mm_storeu_si128((__m128i *)(pi2_temp + 8), res_r1_t1_8x16b); + + ht_tmp -= 2; + pu1_src += src_strd << 1; + pi2_temp += 16; + } + while(ht_tmp > 0); + + src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 + src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b,src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 + res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b,coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 + //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 + + src_r0_16x8b = 
_mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 + src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 + res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 + //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0 + src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 + res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 + //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 + res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b); + res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b); + + _mm_storeu_si128((__m128i *)pi2_temp, res_r0_t1_8x16b); + } + + pi2_temp = (WORD16 *)pu1_tmp; + + // Vertical 6-tap filtering + { + __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b, + src_r4_8x16b; + __m128i src_r5_8x16b, src_r6_8x16b; + __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b; + + __m128i res_t1_4x32b, res_t2_4x32b, res_t3_4x32b; + __m128i res_c0_4x32b, res_c1_4x32b; + __m128i res_8x16b, res_16x8b; + + __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b; + __m128i const_val512_4x32b; + + coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); + coeff2_3_8x16b = _mm_set1_epi32(0x00140014); + coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); + + const_val512_4x32b = _mm_set1_epi32(512); + + src_r0_8x16b = _mm_loadu_si128((__m128i *)pi2_temp); + src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 8)); + src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 16)); + src_r3_8x16b = 
_mm_loadu_si128((__m128i *)(pi2_temp + 24)); + src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 32)); + pi2_temp += 40; + + do + { + src_r5_8x16b = _mm_loadu_si128((__m128i *)pi2_temp); + src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 8)); + + src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); + src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); + src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b); + src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b); + src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b); + res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); + + _mm_storel_epi64((__m128i *)pu1_dst, res_16x8b); + + src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b); + src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b); + src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = 
_mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b); + src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b); + src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b); + res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); + + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b); + + src_r0_8x16b = src_r2_8x16b; + src_r1_8x16b = src_r3_8x16b; + src_r2_8x16b = src_r4_8x16b; + src_r3_8x16b = src_r5_8x16b; + src_r4_8x16b = src_r6_8x16b; + + ht -= 2; + pi2_temp += 16; + pu1_dst += dst_strd << 1; + } + while(ht > 0); + } + } + else // wd == 16 + { + WORD16 *pi2_temp; + WORD32 ht_tmp; + + pu1_tmp += 4; + pu1_src -= src_strd << 1; + pi2_temp = (WORD16 *)pu1_tmp; + pu1_src -= 2; // the filter input starts from x[-2] (till x[3]) + + // Horizontal 6-tap filtering + { + ht_tmp = ht + 5; + + __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b; + __m128i src_r0_t1_16x8b, src_r1_t1_16x8b; + + __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b; + __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b; + + __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; + + 
coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 + coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 + coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 + //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 + //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... + //Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... + //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels. + + do + { + src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 + src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 + + res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 + //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 + res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 + //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 + src_r1_t1_16x8b = 
_mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10 + + res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 + //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 + res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3 + //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12 + + res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 + //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 + res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5 + //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5 + res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b); + res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b); + + res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b); + res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b); + + _mm_storeu_si128((__m128i *)pi2_temp, res_r0_t1_8x16b); + _mm_storeu_si128((__m128i *)(pi2_temp + 8), res_r1_t1_8x16b); + + ht_tmp--; + pu1_src += src_strd; + pi2_temp += 16; + } + while(ht_tmp > 0); + } + + pi2_temp = (WORD16 *)pu1_tmp; + + // Vertical 6-tap filtering + { + WORD16 
*pi2_temp2; + UWORD8 *pu1_dst2; + WORD32 ht_tmp; + + __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b, src_r4_8x16b; + __m128i src_r5_8x16b, src_r6_8x16b; + __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b; + + __m128i res_t1_4x32b, res_t2_4x32b, res_t3_4x32b; + __m128i res_c0_4x32b, res_c1_4x32b; + __m128i res_8x16b, res_16x8b; + + __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b; + __m128i const_val512_4x32b; + + coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); + coeff2_3_8x16b = _mm_set1_epi32(0x00140014); + coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); + + const_val512_4x32b = _mm_set1_epi32(512); + + pi2_temp2 = pi2_temp + 8; + pu1_dst2 = pu1_dst + 8; + ht_tmp = ht; + + /**********************************************************/ + /* Do first height x 8 block */ + /**********************************************************/ + src_r0_8x16b = _mm_loadu_si128((__m128i *)pi2_temp); + src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 16)); + src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 32)); + src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 48)); + src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 64)); + pi2_temp += 80; + + do + { + src_r5_8x16b = _mm_loadu_si128((__m128i *)pi2_temp); + src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 16)); + + src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); + src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); + src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + src_r0r1_8x16b = 
_mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b); + src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b); + src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b); + res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); + + _mm_storel_epi64((__m128i *)pu1_dst, res_16x8b); + + src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b); + src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b); + src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b); + src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b); + src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, 
res_t3_4x32b); + res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b); + res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); + + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b); + + src_r0_8x16b = src_r2_8x16b; + src_r1_8x16b = src_r3_8x16b; + src_r2_8x16b = src_r4_8x16b; + src_r3_8x16b = src_r5_8x16b; + src_r4_8x16b = src_r6_8x16b; + + ht_tmp -= 2; + pi2_temp += 32; + pu1_dst += dst_strd << 1; + } + while(ht_tmp > 0); + + /**********************************************************/ + /* Do second ht x 8 block */ + /**********************************************************/ + src_r0_8x16b = _mm_loadu_si128((__m128i *)pi2_temp2); + src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16)); + src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 32)); + src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 48)); + src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 64)); + pi2_temp2 += 80; + + do + { + src_r5_8x16b = _mm_loadu_si128((__m128i *)pi2_temp2); + src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16)); + + src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); + src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); + src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b); + src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b); + src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, 
coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b); + res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); + + _mm_storel_epi64((__m128i *)pu1_dst2, res_16x8b); + + src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b); + src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b); + src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b); + src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b); + src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b); + res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); + + _mm_storel_epi64((__m128i *)(pu1_dst2 + 
dst_strd), res_16x8b);

                /* slide the 5-row vertical filter window down by two rows */
                src_r0_8x16b = src_r2_8x16b;
                src_r1_8x16b = src_r3_8x16b;
                src_r2_8x16b = src_r4_8x16b;
                src_r3_8x16b = src_r5_8x16b;
                src_r4_8x16b = src_r6_8x16b;

                ht -= 2;
                pi2_temp2 += 32;
                pu1_dst2 += dst_strd << 1;
            }
            while(ht > 0);
        }
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_luma_horz_qpel_ssse3                    */
/*                                                                           */
/*  Description   : This function implements a six-tap filter horizontally   */
/*                  on ht x wd block and averages the values with the source */
/*                  pixels to calculate horizontal quarter-pel as mentioned  */
/*                  in sec. 8.4.2.2.1 titled "Luma sample interpolation      */
/*                  process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8),     */
/*                  (16,8), (8,16) or (16,16).                               */
/*                                                                           */
/*  Inputs        : puc_src  - pointer to source                             */
/*                  puc_dst  - pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                  pu1_tmp  - pointer to temporary buffer                   */
/*                  dydx     - x and y reference offset for q-pel            */
/*                             calculations                                  */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_inter_pred_luma_horz_qpel_ssse3(UWORD8 *pu1_src,
                                           UWORD8 *pu1_dst,
                                           WORD32 src_strd,
                                           WORD32 dst_strd,
                                           WORD32 ht,
                                           WORD32 wd,
                                           UWORD8* pu1_tmp,
                                           WORD32 dydx)
{
    WORD32 x_offset;
    UWORD8 *pu1_pred1;

    __m128i src_r0_16x8b, src_r1_16x8b;
    __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
    __m128i const_val16_8x16b;

    UNUSED(pu1_tmp);

    /* low two bits of dydx give the horizontal quarter-pel phase (0..3) */
    x_offset = dydx & 3;

    coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
    coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
    coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
                                                  //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
    /* pu1_pred1 points to the nearer integer-pel column (offset 0 for     */
    /* phases 0,1; offset 1 for phases 2,3); its pixels are averaged with  */
    /* the half-pel filter output to form the quarter-pel result.          */
    /* Computed before pu1_src is rewound to the filter's x[-2] origin.    */
    pu1_pred1 = pu1_src + (x_offset >> 1);

    const_val16_8x16b = _mm_set1_epi16(16);  // rounding offset for the >>5

    pu1_src -= 2;  // the filter input starts from x[-2] (till x[3])

    if(wd == 4)
    {
        __m128i src_r0r1_16x8b;

        __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
        __m128i res_r0r1_16x8b;

        __m128i mask_full_16x8b, mask_low32b;

        mask_full_16x8b = _mm_set1_epi8(0xff);
        mask_low32b = _mm_srli_si128(mask_full_16x8b, 12);  // mask for first four bytes

        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
        // Two rows are processed per iteration, packed into one register.

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                    //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));       //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                    //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                    //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);      //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);      //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);       //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
            res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                   //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                        //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                        //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);       //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
            res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                   //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                        //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                        //b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);       //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
            res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                   //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
            /* load the integer-pel predictors for the q-pel average */
            src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred1);
            src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred1 + src_strd));

            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
            res_r0r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0r1_t3_8x16b);
            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b);
                                                  // per lane n (rows a then b):
                                                  // x(n)*c0+x(n+1)*c1+x(n+2)*c2+x(n+3)*c3+x(n+4)*c4+x(n+5)*c5 + 16
            src_r0r1_16x8b = _mm_unpacklo_epi32(src_r0_16x8b,src_r1_16x8b);

            res_r0r1_t1_8x16b = _mm_srai_epi16(res_r0r1_t1_8x16b, 5);             //shifting right by 5 bits.

            res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b, res_r0r1_t1_8x16b);
            res_r0r1_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_r0r1_16x8b);        //computing q-pel

            /* store 4 bytes per row via byte mask (rows are only 4 wide) */
            _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)pu1_dst);
            res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);
            _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_pred1 += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

        __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
        __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
        __m128i res_r0_16x8b, res_r1_16x8b;

        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
        // Two 8-wide rows are processed per iteration.

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                   //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));      //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                   //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                   //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                  //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
            res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
                                                                                  //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                       //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                       //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);               //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);               //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10

            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                  //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
            res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
                                                                                  //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                       //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                       //b4 b5 b6 b7 b8 b9....b15 0 0 0 0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);               //a5 a6 a7 a8 a9....a15 0 0 0 0 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);               //b5 b6 b7 b8 b9....b15 0 0 0 0 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                  //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
            res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
                                                                                  //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
            /* load the integer-pel predictors for the q-pel average */
            src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred1);
            src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred1 + src_strd));

            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
            res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b);
            res_r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r1_t3_8x16b);
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

            res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);
            res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);                 //shifting right by 5 bits.

            res_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r0_t1_8x16b);
            res_r1_16x8b = _mm_packus_epi16(res_r1_t1_8x16b, res_r1_t1_8x16b);

            res_r0_16x8b = _mm_avg_epu8(src_r0_16x8b, res_r0_16x8b);
            res_r1_16x8b = _mm_avg_epu8(src_r1_16x8b, res_r1_16x8b);              //computing q-pel

            _mm_storel_epi64((__m128i *)pu1_dst, res_r0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_r1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_pred1 += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

        __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
        __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
        __m128i res_16x8b;

        //Row0 (left half)  : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row0 (right half) : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
        //b0 is the same as a8; in general bn is the same pixel as a(n+8).
        // One 16-wide row is processed per iteration, as two 8-wide halves.

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                   //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));             //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                   //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                   //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                  //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
            res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
                                                                                  //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                       //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                       //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);               //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);               //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10

            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                  //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
            res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
                                                                                  //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                       //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                       //b4 b5 b6 b7 b8 b9....b15 0 0 0 0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);               //a5 a6 a7 a8 a9....a15 0 0 0 0 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);               //b5 b6 b7 b8 b9....b15 0 0 0 0 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                  //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
            res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
                                                                                  //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
            /* load the 16 integer-pel predictors for the q-pel average */
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_pred1);

            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
            res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b);
            res_r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r1_t3_8x16b);
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

            res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);
            res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);                 //shifting right by 5 bits

            res_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b);
            res_16x8b = _mm_avg_epu8(src_r0_16x8b, res_16x8b);                    //computing q-pel

            _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);

            ht--;
            pu1_src += src_strd;
            pu1_pred1 += src_strd;
            pu1_dst += dst_strd;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_luma_vert_qpel_ssse3                    */
/*                                                                           */
/*
Description : This function implements a six-tap filter vertically on */
+/* ht x wd block and averages the values with the source */
+/* pixels to calculate vertical quarter-pel as mentioned in */
+/* sec. 8.4.2.2.1 titled "Luma sample interpolation */
+/* process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8), */
+/* (16,8), (8,16) or (16,16). */
+/* */
+/* Inputs : puc_src - pointer to source */
+/* puc_dst - pointer to destination */
+/* src_strd - stride for source */
+/* dst_strd - stride for destination */
+/* ht - height of the block */
+/* wd - width of the block */
+/* pu1_tmp - pointer to temporary buffer */
+/* dydx - x and y reference offset for q-pel */
+/* calculations */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 13 02 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_inter_pred_luma_vert_qpel_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ht,
+ WORD32 wd,
+ UWORD8* pu1_tmp,
+ WORD32 dydx)
+{
+ /* Overview: half-pel values are computed with the 6-tap filter */
+ /* (1, -5, 20, 20, -5, 1) using _mm_maddubs_epi16 on byte-interleaved */
+ /* row pairs, rounded with +16 and >>5, then averaged (_mm_avg_epu8) */
+ /* with the nearer full-pel row (pu1_pred1) to give the quarter-pel. */
+ /* NOTE(review): the row loops below process two output rows per */
+ /* iteration (ht -= 2), so ht is assumed even — consistent with the */
+ /* supported sizes (4, 8, 16) listed in the header above. */
+ WORD32 y_offset;
+ UWORD8 *pu1_pred1;
+
+ UNUSED(pu1_tmp);
+
+ __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
+ __m128i src_r5_16x8b, src_r6_16x8b;
+ __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
+ __m128i res_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
+
+ __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
+ __m128i const_val16_8x16b;
+
+ y_offset = dydx & 0xf;
+
+ coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
+ coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
+ coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
+ //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
+
+ /* Full-pel row used for the quarter-pel average: (y_offset >> 3) is 0 */
+ /* or 1, selecting the integer row nearer the quarter-pel position */
+ /* (row 0 for the 1/4 position, row 1 for the 3/4 position). */
+ pu1_pred1 = pu1_src + (y_offset >> 3) * src_strd;
+
+ const_val16_8x16b = _mm_set1_epi16(16); // rounding offset for >>5
+
+ pu1_src -= src_strd << 1; // the filter input starts from x[-2] (till x[3])
+
+ /* wd == 4: two 4-pixel rows are packed into one register; masked */
+ /* stores write only the low 4 bytes of each output row. */
+ if(wd == 4)
+ {
+ __m128i mask_low32b;
+
+ mask_low32b = _mm_set1_epi8(0xff);
+
+ //Epilogue: Load all the pred rows except sixth and seventh row
+ // for the first and second row processing.
+ src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ pu1_src += src_strd;
+
+ mask_low32b = _mm_srli_si128(mask_low32b, 12); // mask for first four bytes
+
+ src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b);
+ src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b);
+ src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b);
+ src_r3_16x8b = _mm_unpacklo_epi32(src_r3_16x8b, src_r4_16x8b);
+
+ do
+ {
+ src_r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+
+ src_r4_16x8b = _mm_unpacklo_epi32(src_r4_16x8b, src_r5_16x8b);
+ src_r5_16x8b = _mm_unpacklo_epi32(src_r5_16x8b, src_r6_16x8b);
+
+ src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
+ src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
+ src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred1);
+ src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred1 + src_strd));
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+
+ src_r0r1_16x8b = _mm_unpacklo_epi32(src_r0_16x8b,src_r1_16x8b);
+
+ res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+ res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
+
+ res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel
+
+ _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)pu1_dst);
+ res_16x8b = _mm_srli_si128(res_16x8b, 4);
+ _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));
+
+ /* slide the 6-row filter window down by two rows */
+ src_r0_16x8b = src_r2_16x8b;
+ src_r1_16x8b = src_r3_16x8b;
+ src_r2_16x8b = src_r4_16x8b;
+ src_r3_16x8b = src_r5_16x8b;
+ src_r4_16x8b = src_r6_16x8b;
+
+ ht -= 2;
+ pu1_src += src_strd << 1;
+ pu1_pred1 += src_strd << 1;
+ pu1_dst += dst_strd << 1;
+ }
+ while(ht > 0);
+ }
+
+ /* wd == 8: two 8-pixel rows are packed into one register; the low and */
+ /* high halves of the interleaved pairs are filtered separately. */
+ else if(wd == 8)
+ {
+ //Epilogue: Load all the pred rows except sixth and seventh row
+ // for the first and second row processing.
+ src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ pu1_src += src_strd;
+
+ src_r0_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);
+ src_r1_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b);
+ src_r2_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b);
+ src_r3_16x8b = _mm_unpacklo_epi64(src_r3_16x8b, src_r4_16x8b);
+
+ do
+ {
+ src_r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+
+ src_r4_16x8b = _mm_unpacklo_epi64(src_r4_16x8b, src_r5_16x8b);
+ src_r5_16x8b = _mm_unpacklo_epi64(src_r5_16x8b, src_r6_16x8b);
+
+ src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
+ src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
+ src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ src_r0r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred1);
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+
+ res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+ res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
+ res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel
+
+ _mm_storel_epi64((__m128i *)pu1_dst, res_16x8b);
+
+ /* second output row: high halves of the interleaved pairs */
+ src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
+ src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
+ src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ src_r0r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred1 + src_strd));
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+
+ res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+ res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
+ res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);
+
+ /* slide the 6-row filter window down by two rows */
+ src_r0_16x8b = src_r2_16x8b;
+ src_r1_16x8b = src_r3_16x8b;
+ src_r2_16x8b = src_r4_16x8b;
+ src_r3_16x8b = src_r5_16x8b;
+ src_r4_16x8b = src_r6_16x8b;
+
+ ht -= 2;
+ pu1_src += src_strd << 1;
+ pu1_pred1 += src_strd << 1;
+ pu1_dst += dst_strd << 1;
+ }
+ while(ht > 0);
+ }
+ /* wd == 16: each register holds one full row; rows are interleaved */
+ /* pairwise (r0/r1 then r1/r2 for the next output row) and the low and */
+ /* high halves are filtered separately, then repacked per row. */
+ else // wd == 16
+ {
+ __m128i res_t0_8x16b;
+
+ //Epilogue: Load all the pred rows except sixth and seventh row
+ // for the first and second row processing.
+ src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r3_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r4_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ pu1_src += src_strd;
+
+ do
+ {
+ src_r5_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
+
+ src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
+ src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
+ src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+
+ res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+ src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
+ src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
+ src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ src_r0r1_16x8b = _mm_loadu_si128((__m128i *)pu1_pred1);
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+
+ res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+ res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
+ res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel
+
+ _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);
+
+ /* second output row: same steps on the r1..r6 window */
+ src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
+ src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
+ src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+
+ res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+ src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b);
+ src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b);
+ src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ src_r0r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred1 + src_strd));
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+
+ res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+ res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
+ res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res_16x8b);
+
+ /* slide the 6-row filter window down by two rows */
+ src_r0_16x8b = src_r2_16x8b;
+ src_r1_16x8b = src_r3_16x8b;
+ src_r2_16x8b = src_r4_16x8b;
+ src_r3_16x8b = src_r5_16x8b;
+ src_r4_16x8b = src_r6_16x8b;
+
+ ht -= 2;
+ pu1_src += src_strd << 1;
+ pu1_pred1 += src_strd << 1;
+ pu1_dst += dst_strd << 1;
+ }
+ while(ht > 0);
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3 */
+/* */
+/* Description : This function implements a six-tap filter vertically and */
+/* horizontally on ht x wd block separately and averages */
+/* the two sets of values to calculate values at (1/4,1/4), */
+/* (1/4, 3/4), (3/4, 1/4) or (3/4, 3/4) as mentioned in */
+/* sec. 8.4.2.2.1 titled "Luma sample interpolation */
+/* process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8), */
+/* (16,8), (8,16) or (16,16). 
*/ +/* */ +/* Inputs : puc_src - pointer to source */ +/* puc_dst - pointer to destination */ +/* src_strd - stride for source */ +/* dst_strd - stride for destination */ +/* ht - height of the block */ +/* wd - width of the block */ +/* pu1_tmp - pointer to temporary buffer */ +/* dydx - x and y reference offset for q-pel */ +/* calculations */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 13 02 2015 Kaushik Initial Version */ +/* Senthoor */ +/* */ +/*****************************************************************************/ +void ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ht, + WORD32 wd, + UWORD8* pu1_tmp, + WORD32 dydx) +{ + WORD32 ht_temp; + UWORD8 *pu1_pred_vert,*pu1_pred_horiz; + UWORD8 *pu1_tmp1, *pu1_tmp2; + WORD32 x_offset, y_offset; + + pu1_tmp1 = pu1_tmp; + + dydx &= 0xf; + ht_temp = ht; + x_offset = dydx & 0x3; + y_offset = dydx >> 2; + pu1_tmp2 = pu1_tmp1; + + pu1_pred_vert = pu1_src + (x_offset >> 1) - 2*src_strd; + pu1_pred_horiz = pu1_src + (y_offset >> 1) * src_strd - 2; + //the filter input starts from x[-2] (till x[3]) + + __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; + __m128i const_val16_8x16b; + + coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 + coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 + coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 + //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 + const_val16_8x16b = _mm_set1_epi16(16); + + if(wd == 4) + { + //vertical q-pel filter + { + __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b; + __m128i src_r5_16x8b, src_r6_16x8b; + __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b; + + __m128i res_r0r1_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b; + + //epilogue: Load all 
the pred rows except sixth and seventh row for the + //first and second row processing. + src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + + src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b); + + src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b); + + src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b); + + src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + src_r3_16x8b = _mm_unpacklo_epi32(src_r3_16x8b, src_r4_16x8b); + + //Core Loop: Process all the rows. + do + { + src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + src_r4_16x8b = _mm_unpacklo_epi32(src_r4_16x8b, src_r5_16x8b); + + src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert + src_strd)); + src_r5_16x8b = _mm_unpacklo_epi32(src_r5_16x8b, src_r6_16x8b); + + src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b); + src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b); + src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b); + + res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); + res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); + res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); + + res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. 
+ res_r0r1_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b); + + _mm_storel_epi64((__m128i *)pu1_tmp1, res_r0r1_16x8b); + + src_r0_16x8b = src_r2_16x8b; + src_r1_16x8b = src_r3_16x8b; + src_r2_16x8b = src_r4_16x8b; + src_r3_16x8b = src_r5_16x8b; + src_r4_16x8b = src_r6_16x8b; + + ht_temp -= 2; + pu1_pred_vert += src_strd << 1; + pu1_tmp1 += 8; + } + while(ht_temp > 0); + } + + //horizontal q-pel filter + { + __m128i src_r0_16x8b, src_r1_16x8b; + __m128i src_r0_sht_16x8b, src_r1_sht_16x8b; + __m128i src_r0r1_vpel_16x8b, src_r0r1_t1_16x8b; + + __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b; + __m128i res_r0r1_16x8b; + + __m128i mask_low32b; + + mask_low32b = _mm_set1_epi8(0xff); + mask_low32b = _mm_srli_si128(mask_low32b, 12); + + //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... + //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... + + do + { + src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_pred_horiz); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 + src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 + + src_r0r1_vpel_16x8b = _mm_loadl_epi64((__m128i *)pu1_tmp2); + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 + + src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 + src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 + + src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4 + res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 + //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 + src_r1_16x8b = 
_mm_srli_si128(src_r1_16x8b, 4); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 + + src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6 + res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 + //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0 + + src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8 + res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 + //b4*c4+b5*c5 b5*c4+b6*c5 b4*c6+b7*c5 b7*c4+b8*c5 + + res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b); + res_r0r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0r1_t3_8x16b); + res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b); //a0*c0+a1*c1+a2*c2+a3*c3+a4*a4+a5*c5 + 15; + //a1*c0+a2*c1+a2*c2+a3*c3+a5*a4+a6*c5 + 15; + //a2*c0+a3*c1+a4*c2+a5*c3+a6*a4+a7*c5 + 15; + //a3*c0+a4*c1+a5*c2+a6*c3+a6*a4+a8*c5 + 15; + //b0*c0+b1*c1+b2*c2+b3*c3+b4*b4+b5*c5 + 15; + //b1*c0+b2*c1+b2*c2+b3*c3+b5*b4+b6*c5 + 15; + //b2*c0+b3*c1+b4*c2+b5*c3+b6*b4+b7*c5 + 15; + //b3*c0+b4*c1+b5*c2+b6*c3+b6*b4+b8*c5 + 15; + + res_r0r1_t1_8x16b = _mm_srai_epi16(res_r0r1_t1_8x16b, 5); //shifting right by 5 bits. 
+ + res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b,res_r0r1_t1_8x16b); + + res_r0r1_16x8b = _mm_avg_epu8(res_r0r1_16x8b,src_r0r1_vpel_16x8b); + + _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)pu1_dst); + res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4); + _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd)); + + ht -= 2; + pu1_pred_horiz += src_strd << 1; + pu1_tmp2 += 8; + pu1_dst += dst_strd << 1; + } + while(ht > 0); + } + } + else if(wd == 8) + { + //vertical q-pel filter + { + __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b; + __m128i src_r4_16x8b, src_r5_16x8b, src_r6_16x8b; + __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b; + + __m128i res_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b; + + //epilogue: Load all the pred rows except sixth and seventh row for the + //first and second row processing. + src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + + src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + src_r0_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); + + src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + src_r1_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b); + + src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + src_r2_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b); + + src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + src_r3_16x8b = _mm_unpacklo_epi64(src_r3_16x8b, src_r4_16x8b); + + //Core Loop: Process all the rows. 
+ do + { + src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + src_r4_16x8b = _mm_unpacklo_epi64(src_r4_16x8b, src_r5_16x8b); + + src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert + src_strd)); + src_r5_16x8b = _mm_unpacklo_epi64(src_r5_16x8b, src_r6_16x8b); + + src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b); + src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b); + src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b); + + res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); + res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); + res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); + + res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. + res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b); + + _mm_storel_epi64((__m128i *)(pu1_tmp1), res_16x8b); + + src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b); + src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b); + src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b); + + res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); + res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); + res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); + + res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. 
+ res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b); + + _mm_storel_epi64((__m128i *)(pu1_tmp1 + 8), res_16x8b); + + src_r0_16x8b = src_r2_16x8b; + src_r1_16x8b = src_r3_16x8b; + src_r2_16x8b = src_r4_16x8b; + src_r3_16x8b = src_r5_16x8b; + src_r4_16x8b = src_r6_16x8b; + + ht_temp -= 2; + pu1_pred_vert += src_strd << 1; + pu1_tmp1 += 16; + } + while(ht_temp > 0); + } + + //horizontal q-pel filter + { + __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b; + __m128i src_r0_t1_16x8b, src_r1_t1_16x8b; + __m128i src_r0_vpel_16x8b, src_r1_vpel_16x8b; + + __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b; + __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b, res_16x8b; + + //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... + //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... + + do + { + src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz)); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 + src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 + + src_r0_vpel_16x8b = _mm_loadl_epi64((__m128i *)(pu1_tmp2)); //a2 a3 a4 a5 a6 a7 a8....a15 0 or + //a3 a4 a5 a6 a7 a8 a9....a15 0 + src_r1_vpel_16x8b = _mm_loadl_epi64((__m128i *)(pu1_tmp2 + 8)); + //b2 b3 b4 b5 b6 b7 b8....b15 0 or + //b3 b4 b5 b6 b7 b8 b9....b15 0 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 + + res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 + //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 + res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 
b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 + //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10 + + res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 + //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 + res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3 + //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12 + + res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 + //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 + res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5 + //b8*c4+b9*c5 b9*c4+b10*c5 
b10*c4+b11*c5 b11*c4+b12*c5 + res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b); + res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b); + res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b); + res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5); //shifting right by 5 bits. + + res_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r0_t1_8x16b); + res_16x8b = _mm_avg_epu8(res_16x8b, src_r0_vpel_16x8b); + + _mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b); + + res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b); + res_r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r1_t3_8x16b); + res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b); + res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5); //shifting right by 5 bits. + + res_16x8b = _mm_packus_epi16(res_r1_t1_8x16b, res_r1_t1_8x16b); + res_16x8b = _mm_avg_epu8(res_16x8b,src_r1_vpel_16x8b); + + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b); + + ht -= 2; + pu1_pred_horiz += src_strd << 1; + pu1_dst += dst_strd << 1; + pu1_tmp2 += 16; + } + while(ht > 0); + } + } + else // wd == 16 + { + //vertical q-pel filter + { + __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b; + __m128i src_r4_16x8b, src_r5_16x8b, src_r6_16x8b; + __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b; + + __m128i res_t0_8x16b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b; + __m128i res_16x8b; + + //epilogue: Load all the pred rows except sixth and seventh row for the + //first and second row processing. 
+ src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + src_r2_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + src_r3_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + src_r4_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + + //Core Loop: Process all the rows. + do + { + src_r5_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert)); + src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert + src_strd)); + + src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b); + src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b); + src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b); + + res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); + res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); + res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b); + res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. 
+ + src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b); + src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b); + src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b); + + res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); + res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); + res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b); + res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. + + res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b); + + _mm_storeu_si128((__m128i *)(pu1_tmp1), res_16x8b); + + src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b); + src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b); + src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b); + + res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); + res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); + res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b); + res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. 
+ + src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b); + src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b); + src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b); + + res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); + res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); + res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b); + res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. + + res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b); + + _mm_storeu_si128((__m128i *)(pu1_tmp1 + 16), res_16x8b); + + src_r0_16x8b = src_r2_16x8b; + src_r1_16x8b = src_r3_16x8b; + src_r2_16x8b = src_r4_16x8b; + src_r3_16x8b = src_r5_16x8b; + src_r4_16x8b = src_r6_16x8b; + + ht_temp -= 2; + pu1_pred_vert += src_strd << 1; + pu1_tmp1 += 32; + } + while(ht_temp > 0); + } + //horizontal q-pel filter + { + __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b; + __m128i src_r0_t1_16x8b, src_r1_t1_16x8b; + __m128i src_vpel_16x8b; + + __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b; + __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b; + __m128i res_16x8b; + + //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... + //Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... + //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels. 
+ + do + { + src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz)); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 + src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz + 8)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 + src_vpel_16x8b = _mm_loadu_si128((__m128i *)(pu1_tmp2)); + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 + + res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 + //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 + res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 + //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10 + + res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 + //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 + res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3 + //b6*c2+b7*c3 b7*c2+b8*c3 
b8*c2+b9*c3 b9*c2+b10*c3 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12 + + res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 + //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 + res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5 + //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5 + res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b); + res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b); + res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b); + res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5); //shifting right by 5 bits. + + res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b); + res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b); + res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, const_val16_8x16b); + res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5); //shifting right by 5 bits. 
                res_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b);

                res_16x8b = _mm_avg_epu8(res_16x8b, src_vpel_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst), res_16x8b);

                ht --;
                pu1_pred_horiz += src_strd;
                pu1_dst += dst_strd;
                pu1_tmp2 += 16;
            }
            while(ht > 0);
        }
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3          */
/*                                                                           */
/*  Description   : This function implements a six-tap filter vertically and */
/*                  horizontally on ht x wd block separately and averages    */
/*                  the two sets of values to calculate values at (1/4,1/2), */
/*                  or (3/4, 1/2) as mentioned in sec. 8.4.2.2.1 titled      */
/*                  "Luma sample interpolation process". (ht,wd) can be      */
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
/*                                                                           */
/*                  Stage 1 filters vertically (6-tap, coefficients          */
/*                  1,-5,20,20,-5,1) and keeps the unrounded 16-bit sums in  */
/*                  pu1_tmp. Stage 2 runs the same 6-tap horizontally over   */
/*                  those sums ((sum + 512) >> 10, saturated to 8 bit) and   */
/*                  averages the result with the rounded vertical half-pel   */
/*                  column ((val + 16) >> 5) selected by the x offset.       */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to source                             */
/*                  pu1_dst  - pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                  pu1_tmp  - pointer to temporary buffer                   */
/*                  dydx     - x and y reference offset for q-pel            */
/*                             calculations                                  */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3(UWORD8 *pu1_src,
                                                     UWORD8 *pu1_dst,
                                                     WORD32 src_strd,
                                                     WORD32 dst_strd,
                                                     WORD32 ht,
                                                     WORD32 wd,
                                                     UWORD8* pu1_tmp,
                                                     WORD32 dydx)
{
    WORD32 ht_temp;
    WORD32 x_offset;
    WORD32 off0,off1, off2, off3, off4, off5;
    WORD16 *pi2_temp1,*pi2_temp2,*pi2_temp3;

    ht_temp = ht;
    //x_offset is 1 or 3 for the (1/4,1/2) and (3/4,1/2) positions
    x_offset = dydx & 0x3;
    //pi2_temp1: write cursor for the 16-bit vertical half-pel sums
    //pi2_temp2: read cursor for the horizontal q-pel stage
    //pi2_temp3: read cursor for the half-pel column that is averaged in;
    //           (x_offset >> 1) picks the nearer of the two columns
    pi2_temp1 = (WORD16 *)pu1_tmp;
    pi2_temp2 = pi2_temp1;
    pi2_temp3 = pi2_temp1 + (x_offset >> 1);

    pu1_src -= 2 * src_strd;
    pu1_src -= 2;
    pi2_temp3 += 2;
    //the filter input starts from x[-2] (till x[3])

    if(wd == 4)
    {
        //vertical half-pel: each output row needs 4 + 5 = 9 filtered columns
        {
            __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
            __m128i src_r5_16x8b, src_r6_16x8b;
            __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;

            __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;

            __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;

            coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
            coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
            coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
            //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20

            //offsets for the scalar 6-tap that fills column 8 of each temp
            //row; they address rows -5..0 (relative to the advanced pu1_src)
            //at column +8
            off0 = -((src_strd << 2) + src_strd) + 8;
            off1 = -(src_strd << 2) + 8;
            off2 = -((src_strd << 1) + src_strd) + 8;
            off3 = -(src_strd << 1) + 8;
            off4 = -src_strd + 8;
            off5 = 8;

            //epilogue: Load all the pred rows except sixth and seventh row for the
            //first and second row processing.
            src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
            pu1_src = pu1_src + src_strd;

            src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
            pu1_src = pu1_src + src_strd;

            src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
            pu1_src = pu1_src + src_strd;

            src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
            pu1_src = pu1_src + src_strd;

            src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
            pu1_src = pu1_src + src_strd;

            //Core Loop: Process all the rows, two output rows per iteration.
            do
            {
                src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));

                //interleave rows pairwise so maddubs applies (c0,c1), (c2,c3),
                //(c4,c5) to vertically adjacent samples
                src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
                src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
                src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t2_8x16b, res_t1_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                //store the unrounded 16-bit half-pel sums (columns 0..7)
                _mm_storeu_si128((__m128i *)(pi2_temp1), res_t1_8x16b);

                //scalar 6-tap for column 8 (9th temp column):
                //p0 + p5 - 5*(p1 + p4) + 20*(p2 + p3)
                pi2_temp1[8] = pu1_src[off0] + pu1_src[off5]
                               - (pu1_src[off1] + pu1_src[off4])
                               + ((pu1_src[off2] + pu1_src[off3] - pu1_src[off1] - pu1_src[off4]) << 2)
                               + ((pu1_src[off2] + pu1_src[off3]) << 4);

                pu1_src = pu1_src + src_strd;
                pi2_temp1 = pi2_temp1 + 9;

                src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));

                //second output row: same filter shifted down by one source row
                src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
                src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
                src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t2_8x16b, res_t1_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                _mm_storeu_si128((__m128i *)(pi2_temp1), res_t1_8x16b);

                pi2_temp1[8] = pu1_src[off0] + pu1_src[off5]
                               - (pu1_src[off1] + pu1_src[off4])
                               + ((pu1_src[off2] + pu1_src[off3] - pu1_src[off1] - pu1_src[off4]) << 2)
                               + ((pu1_src[off2] + pu1_src[off3]) << 4);

                ht_temp -= 2;
                pu1_src = pu1_src + src_strd;
                pi2_temp1 = pi2_temp1 + 9;

                //rotate the row registers down by two for the next iteration
                src_r0_16x8b = src_r2_16x8b;
                src_r1_16x8b = src_r3_16x8b;
                src_r2_16x8b = src_r4_16x8b;
                src_r3_16x8b = src_r5_16x8b;
                src_r4_16x8b = src_r6_16x8b;
            }
            while(ht_temp > 0);
        }

        //horizontal q-pel on the 16-bit temp rows, one 4-wide row per iteration
        {
            __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b;
            __m128i src_r3_8x16b, src_r4_8x16b, src_r5_8x16b;
            __m128i src_r0r1_c0_8x16b, src_r2r3_c0_8x16b, src_r4r5_c0_8x16b;
            __m128i src_hpel_16x8b, src_hpel_8x16b;

            __m128i res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
            __m128i res_8x16b, res_16x8b;

            __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
            __m128i const_val512_4x32b, const_val16_8x16b;
            __m128i mask_low32b;

            mask_low32b = _mm_set1_epi8(0xff);

            //16-bit coefficient pairs (1,-5), (20,20), (-5,1) for madd
            coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
            coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
            coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);

            //keep only the low 4 bytes for the masked store of a 4-wide row
            mask_low32b = _mm_srli_si128(mask_low32b, 12);

            const_val512_4x32b = _mm_set1_epi32(512);  //rounding for (sum >> 10)
            const_val16_8x16b = _mm_set1_epi16(16);    //rounding for (hpel >> 5)

            do
            {
                //six shifted views of the 9-column temp row
                src_r0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2));
                src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 1));
                src_r2_8x16b = _mm_srli_si128(src_r1_8x16b, 2);
                src_r3_8x16b = _mm_srli_si128(src_r1_8x16b, 4);
                src_r4_8x16b = _mm_srli_si128(src_r1_8x16b, 6);
                src_r5_8x16b = _mm_srli_si128(src_r1_8x16b, 8);

                src_r0r1_c0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_c0_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_c0_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_c0_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_c0_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_c0_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(const_val512_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);  //(sum + 512) >> 10

                res_8x16b = _mm_packs_epi32(res_t1_4x32b, res_t1_4x32b);
                res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);

                //round the selected vertical half-pel column to 8 bits and average
                src_hpel_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp3));
                src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
                src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5);  //shifting right by 5 bits.
                src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);

                res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);

                //masked store writes only the 4 output bytes of this row
                _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)pu1_dst);

                ht--;
                pi2_temp2 = pi2_temp2 + 4 + 5;  //temp row stride is 9
                pi2_temp3 = pi2_temp3 + 4 + 5;
                pu1_dst = pu1_dst + dst_strd;
            }
            while(ht > 0);
        }
    }
    else if(wd == 8)
    {
        // vertical half-pel: each output row needs 8 + 5 = 13 filtered columns
        {
            __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
            __m128i src_r5_16x8b, src_r6_16x8b;
            __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;

            __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;

            __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;

            coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
            coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
            coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5

            //epilogue: Load all the pred rows except sixth and seventh row for the
            //first and second row processing.
            src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
            pu1_src = pu1_src + src_strd;

            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
            pu1_src = pu1_src + src_strd;

            src_r2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
            pu1_src = pu1_src + src_strd;

            src_r3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
            pu1_src = pu1_src + src_strd;

            src_r4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
            pu1_src = pu1_src + src_strd;

            //Core Loop: Process all the rows, two output rows per iteration.
            do
            {
                src_r5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
                src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));

                //row 0, columns 0..7
                src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
                src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
                src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                _mm_storeu_si128((__m128i *)(pi2_temp1), res_t1_8x16b);

                //row 0, columns 8..12 (upper halves of the loads)
                src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
                src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
                src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                _mm_storeu_si128((__m128i *)(pi2_temp1 + 8), res_t1_8x16b);

                //row 1, columns 0..7
                src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
                src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
                src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                //temp row stride is 13, so row 1 starts at offset 8 + 5
                _mm_storeu_si128((__m128i *)(pi2_temp1 + 8 + 5), res_t1_8x16b);

                //row 1, columns 8..12
                src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b);
                src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b);
                src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                _mm_storeu_si128((__m128i *)(pi2_temp1 + 8 + 5 + 8), res_t1_8x16b);

                //rotate the row registers down by two for the next iteration
                src_r0_16x8b = src_r2_16x8b;
                src_r1_16x8b = src_r3_16x8b;
                src_r2_16x8b = src_r4_16x8b;
                src_r3_16x8b = src_r5_16x8b;
                src_r4_16x8b = src_r6_16x8b;

                ht_temp -= 2;
                pu1_src = pu1_src + (src_strd << 1);
                pi2_temp1 = pi2_temp1 + (13 << 1);
            }
            while(ht_temp > 0);
        }
        // horizontal q-pel on the 16-bit temp rows, one 8-wide row per iteration
        {
            __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b;
            __m128i src_r4_8x16b, src_r5_8x16b;
            __m128i src_r0r1_c0_8x16b, src_r2r3_c0_8x16b, src_r4r5_c0_8x16b;
            __m128i src_r0r1_c1_8x16b, src_r2r3_c1_8x16b, src_r4r5_c1_8x16b;
            __m128i src_hpel_8x16b, src_hpel_16x8b;

            __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
            __m128i res_8x16b, res_16x8b;

            __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
            __m128i const_val512_4x32b, const_val16_8x16b;

            //16-bit coefficient pairs (1,-5), (20,20), (-5,1) for madd
            coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
            coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
            coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);

            const_val512_4x32b = _mm_set1_epi32(512);  //rounding for (sum >> 10)
            const_val16_8x16b = _mm_set1_epi16(16);    //rounding for (hpel >> 5)

            do
            {
                //six shifted views of the 13-column temp row
                src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2));
                src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 1));
                src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 2));
                src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 3));
                src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 4));
                src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 5));

                src_r0r1_c0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_c0_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_c0_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

                src_r0r1_c1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_c1_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_c1_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);

                //columns 0..3
                res_t1_4x32b = _mm_madd_epi16(src_r0r1_c0_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_c0_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_c0_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);

                res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);  //(sum + 512) >> 10

                //columns 4..7
                res_t1_4x32b = _mm_madd_epi16(src_r0r1_c1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_c1_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_c1_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);

                res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
                res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);

                //round the selected vertical half-pel column to 8 bits and average
                src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3));
                src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
                src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5);  //shifting right by 5 bits.
                src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);

                res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);

                _mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b);

                ht--;
                pi2_temp2 = pi2_temp2 + 8 + 5;  //temp row stride is 13
                pi2_temp3 = pi2_temp3 + 8 + 5;
                pu1_dst = pu1_dst + dst_strd;
            }
            while(ht > 0);
        }
    }
    else // wd == 16
    {
        // vertical half-pel: each output row needs 16 + 5 = 21 filtered columns
        {
            __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
            __m128i src_r4_16x8b, src_r5_16x8b;
            __m128i src_r0_c2_16x8b, src_r1_c2_16x8b, src_r2_c2_16x8b, src_r3_c2_16x8b;
            __m128i src_r4_c2_16x8b, src_r5_c2_16x8b;
            __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;

            __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;

            __m128i coeff0_1_16x8b,coeff2_3_16x8b,coeff4_5_16x8b;

            coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
            coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
            coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5

            //each source row is loaded in two halves; the _c2 registers carry
            //bytes 16..31, needed for the 5 extra filter columns
            src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
            src_r0_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
            pu1_src = pu1_src + src_strd;
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
            src_r1_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
            pu1_src = pu1_src + src_strd;
            src_r2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
            src_r2_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
            pu1_src = pu1_src + src_strd;
            src_r3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
            src_r3_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
            pu1_src = pu1_src + src_strd;
            src_r4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
            src_r4_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
            pu1_src = pu1_src + src_strd;

            //Core Loop: Process all the rows, one output row per iteration.
            do
            {
                src_r5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
                src_r5_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));

                //columns 0..7
                src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
                src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
                src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                _mm_storeu_si128((__m128i *)(pi2_temp1), res_t1_8x16b);

                //columns 8..15
                src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
                src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
                src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                _mm_storeu_si128((__m128i *)(pi2_temp1 + 8), res_t1_8x16b);

                //columns 16..20 (only 5 of the 8 computed values are used)
                src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_c2_16x8b, src_r1_c2_16x8b);
                src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_c2_16x8b, src_r3_c2_16x8b);
                src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_c2_16x8b, src_r5_c2_16x8b);

                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

                _mm_storeu_si128((__m128i *)(pi2_temp1 + 16), res_t1_8x16b);

                //rotate the row registers down by one for the next iteration
                src_r0_16x8b = src_r1_16x8b;
                src_r1_16x8b = src_r2_16x8b;
                src_r2_16x8b = src_r3_16x8b;
                src_r3_16x8b = src_r4_16x8b;
                src_r4_16x8b = src_r5_16x8b;

                src_r0_c2_16x8b = src_r1_c2_16x8b;
                src_r1_c2_16x8b = src_r2_c2_16x8b;
                src_r2_c2_16x8b = src_r3_c2_16x8b;
                src_r3_c2_16x8b = src_r4_c2_16x8b;
                src_r4_c2_16x8b = src_r5_c2_16x8b;

                ht_temp--;
                pu1_src = pu1_src + src_strd;
                pi2_temp1 = pi2_temp1 + 16 + 5;
            }
            while(ht_temp > 0);
        }
        // horizontal q-pel on the 16-bit temp rows, one 16-wide row per
        // iteration, processed as two 8-wide halves
        {
            __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b;
            __m128i src_r4_8x16b, src_r5_8x16b;
            __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
            __m128i src_hpel1_8x16b, src_hpel2_8x16b, src_hpel_16x8b;

            __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
            __m128i res_c0_8x16b, res_c1_8x16b, res_16x8b;

            __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
            __m128i const_val512_4x32b, const_val16_8x16b;

            //16-bit coefficient pairs (1,-5), (20,20), (-5,1) for madd
            coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
            coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
            coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);

            const_val512_4x32b = _mm_set1_epi32(512);  //rounding for (sum >> 10)
            const_val16_8x16b = _mm_set1_epi16(16);    //rounding for (hpel >> 5)

            do
            {
                //first half: output columns 0..7
                src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2));
                src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 1));
                src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 2));
                src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 3));
                src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 4));
                src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 5));

                src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);  //(sum + 512) >> 10

                src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(const_val512_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                res_c0_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);

                //second half: output columns 8..15
                src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8));
                src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 1));
                src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 2));
                src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 3));
                src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 4));
                src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 5));

                src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(const_val512_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
                src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
                src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);

                res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
                res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
                res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
                res_t3_4x32b = _mm_add_epi32(const_val512_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
                res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

                res_c1_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
                res_16x8b = _mm_packus_epi16(res_c0_8x16b, res_c1_8x16b);

                //round the selected vertical half-pel column to 8 bits and average
                src_hpel1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3));
                src_hpel1_8x16b = _mm_add_epi16(src_hpel1_8x16b, const_val16_8x16b);
                src_hpel1_8x16b = _mm_srai_epi16(src_hpel1_8x16b, 5);  //shifting right by 5 bits.

                src_hpel2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3 + 8));
                src_hpel2_8x16b = _mm_add_epi16(src_hpel2_8x16b, const_val16_8x16b);
                src_hpel2_8x16b = _mm_srai_epi16(src_hpel2_8x16b, 5);  //shifting right by 5 bits.

                src_hpel_16x8b = _mm_packus_epi16(src_hpel1_8x16b, src_hpel2_8x16b);
                res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);

                _mm_storeu_si128((__m128i *)(pu1_dst), res_16x8b);

                ht--;
                pi2_temp2 = pi2_temp2 + 16 + 5;  //temp row stride is 21
                pi2_temp3 = pi2_temp3 + 16 + 5;
                pu1_dst = pu1_dst + dst_strd;
            }
            while(ht > 0);
        }
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3          */
/*                                                                           */
/*  Description   : This function implements a six-tap filter vertically and */
/*                  horizontally on ht x wd block separately and averages    */
/*                  the two sets of values to calculate values at (1/2,1/4), */
/*                  or (1/2, 3/4) as mentioned in sec. 8.4.2.2.1 titled      */
/*                  "Luma sample interpolation process". (ht,wd) can be      */
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).
*/ +/* */ +/* Inputs : puc_src - pointer to source */ +/* puc_dst - pointer to destination */ +/* src_strd - stride for source */ +/* dst_strd - stride for destination */ +/* ht - height of the block */ +/* wd - width of the block */ +/* pu1_tmp - pointer to temporary buffer */ +/* dydx - x and y reference offset for q-pel */ +/* calculations */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 13 02 2015 Kaushik Initial Version */ +/* Senthoor */ +/* */ +/*****************************************************************************/ +void ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ht, + WORD32 wd, + UWORD8* pu1_tmp, + WORD32 dydx) +{ + WORD32 ht_temp; + WORD32 y_offset; + WORD16 *pi2_temp1,*pi2_temp2,*pi2_temp3; + + y_offset = (dydx & 0xf) >> 2; + pi2_temp1 = (WORD16 *)pu1_tmp; + pi2_temp2 = pi2_temp1; + pi2_temp3 = pi2_temp1 + (y_offset >> 1) * wd; + + ht_temp = ht + 5; + pu1_src -= src_strd << 1; + pu1_src -= 2; + pi2_temp3 += wd << 1; + //the filter input starts from x[-2] (till x[3]) + + if(wd == 4) + { + // horizontal half-pel + { + __m128i src_r0_16x8b, src_r1_16x8b, src_r0r1_t1_16x8b; + __m128i src_r0_sht_16x8b, src_r1_sht_16x8b; + __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b; + __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; + + coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 + coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 + coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 + + //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... + //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... 
+ + do + { + src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 + src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 + + src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 + src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 + + src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4 + res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 + //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 + + src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6 + res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 + //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0 + + src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8 + res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 + //b4*c4+b5*c5 b5*c4+b6*c5 b4*c6+b7*c5 b7*c4+b8*c5 + + res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b); + 
res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b); + + + _mm_storeu_si128((__m128i *)(pi2_temp1), res_r0r1_t1_8x16b); + + ht_temp -= 2; + pu1_src = pu1_src + (src_strd << 1); + pi2_temp1 = pi2_temp1 + (4 << 1); + } + while(ht_temp > 0); + } + // vertical q-pel + { + __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b; + __m128i src_r4_8x16b, src_r5_8x16b, src_r6_8x16b; + __m128i src_r0r1_c0_8x16b, src_r2r3_c0_8x16b, src_r4r5_c0_8x16b; + __m128i src_hpel_16x8b, src_hpel_8x16b; + + __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b; + __m128i res_8x16b, res_16x8b; + + __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b; + __m128i const_val512_4x32b, const_val16_8x16b; + __m128i mask_low32b; + + mask_low32b = _mm_set1_epi8(0xff); + const_val512_4x32b = _mm_set1_epi32(512); + const_val16_8x16b = _mm_set1_epi16(16); + mask_low32b = _mm_srli_si128(mask_low32b, 12); + + coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); + coeff2_3_8x16b = _mm_set1_epi32(0x00140014); + coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); + + src_r0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2)); + src_r1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 4)); + src_r2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 8)); + src_r3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 12)); + src_r4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 16)); + pi2_temp2 += 20; + + do + { + src_r5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2)); + src_r6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 4)); + + src_r0r1_c0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); + src_r2r3_c0_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); + src_r4r5_c0_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_c0_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_c0_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_c0_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, 
res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + src_r0r1_c0_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b); + src_r2r3_c0_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b); + src_r4r5_c0_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_c0_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_c0_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_c0_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); + res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); + + src_hpel_8x16b = _mm_loadu_si128((__m128i *)pi2_temp3); + src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b); + src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits. 
+ src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b); + + res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b); + + _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char *)(pu1_dst)); + res_16x8b = _mm_srli_si128(res_16x8b, 4); + _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char *)(pu1_dst + dst_strd)); + + src_r0_8x16b = src_r2_8x16b; + src_r1_8x16b = src_r3_8x16b; + src_r2_8x16b = src_r4_8x16b; + src_r3_8x16b = src_r5_8x16b; + src_r4_8x16b = src_r6_8x16b; + + ht -= 2; + pi2_temp2 = pi2_temp2 + (4 << 1); + pi2_temp3 = pi2_temp3 + (4 << 1); + pu1_dst = pu1_dst + (dst_strd << 1); + } + while(ht > 0); + } + } + else if(wd == 8) + { + // horizontal half-pel + { + __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b; + __m128i src_r0_t1_16x8b, src_r1_t1_16x8b; + + __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b; + __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b; + + __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; + + coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 + coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 + coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 + + //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... + //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... 
+ + do + { + src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 + src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 + + res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 + //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 + res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 + //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10 + + res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 + //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 + res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3 + //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3 + + src_r0_16x8b = 
_mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12 + + res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 + //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 + res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5 + //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5 + res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b); + res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b); + + res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b); + res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b); + + _mm_storeu_si128((__m128i *)(pi2_temp1), res_r0_t1_8x16b); + _mm_storeu_si128((__m128i *)(pi2_temp1 + 8), res_r1_t1_8x16b); + + ht_temp -= 2; + pu1_src = pu1_src + (src_strd << 1); + pi2_temp1 = pi2_temp1 + (8 << 1); + } + while(ht_temp > 0); + } + // vertical q-pel + { + __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b; + __m128i src_r4_8x16b, src_r5_8x16b, src_r6_8x16b; + __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b; + __m128i src_hpel_8x16b, src_hpel_16x8b; + + __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b; + __m128i res_8x16b, res_16x8b; + + __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b; + __m128i const_val512_4x32b, const_val16_8x16b; + + coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); + coeff2_3_8x16b 
= _mm_set1_epi32(0x00140014); + coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); + + const_val512_4x32b = _mm_set1_epi32(512); + const_val16_8x16b = _mm_set1_epi16(16); + + src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2)); + src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8)); + src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16)); + src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 24)); + src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 32)); + pi2_temp2 += 40; + + do + { + src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2)); + src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8)); + + src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); + src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); + src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b); + src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b); + src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); + res_16x8b = 
_mm_packus_epi16(res_8x16b, res_8x16b); + + src_hpel_8x16b = _mm_loadu_si128((__m128i *)pi2_temp3); + src_hpel_8x16b = _mm_add_epi16(const_val16_8x16b, src_hpel_8x16b); + src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits. + src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b); + + res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b); + + _mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b); + + src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b); + src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b); + src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b); + src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b); + src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); + res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); + + src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3 + 8)); + src_hpel_8x16b = _mm_add_epi16(const_val16_8x16b, src_hpel_8x16b); + src_hpel_8x16b = 
_mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits. + src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b); + + res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b); + + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b); + + src_r0_8x16b = src_r2_8x16b; + src_r1_8x16b = src_r3_8x16b; + src_r2_8x16b = src_r4_8x16b; + src_r3_8x16b = src_r5_8x16b; + src_r4_8x16b = src_r6_8x16b; + + ht -= 2; + pi2_temp2 = pi2_temp2 + (8 << 1); + pi2_temp3 = pi2_temp3 + (8 << 1); + pu1_dst = pu1_dst + (dst_strd << 1); + } + while(ht > 0); + } + } + else // wd == 16 + { + UWORD8 *pu1_dst1; + WORD16 *pi2_temp4,*pi2_temp5; + + pu1_dst1 = pu1_dst + 8; + pi2_temp4 = pi2_temp2 + 8; + pi2_temp5 = pi2_temp3 + 8; + + // horizontal half-pel + { + __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b; + __m128i src_r0_t1_16x8b, src_r1_t1_16x8b; + + __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b; + __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b; + + __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; + + coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 + coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 + coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 + + //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... + //Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... + //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels. 
+ + do + { + src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 + src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 + + res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 + //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 + res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 + //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10 + + res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 + //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 + res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3 + //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 
2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12 + + res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 + //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 + res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5 + //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5 + res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b); + res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b); + + res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b); + res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b); + + _mm_storeu_si128((__m128i *)(pi2_temp1), res_r0_t1_8x16b); + _mm_storeu_si128((__m128i *)(pi2_temp1 + 8), res_r1_t1_8x16b); + + ht_temp--; + pu1_src = pu1_src + src_strd; + pi2_temp1 = pi2_temp1 + 16; + } + while(ht_temp > 0); + } + // vertical q-pel + { + __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b, src_r4_8x16b; + __m128i src_r5_8x16b, src_r6_8x16b; + __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b; + __m128i src_hpel_8x16b, src_hpel_16x8b; + + __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b; + __m128i res_8x16b, res_16x8b; + + __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b; + __m128i const_val512_4x32b, const_val16_8x16b; + + coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); + coeff2_3_8x16b = _mm_set1_epi32(0x00140014); + 
coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); + + const_val512_4x32b = _mm_set1_epi32(512); + const_val16_8x16b = _mm_set1_epi16(16); + + /**********************************************************/ + /* Do first height x 8 block */ + /**********************************************************/ + src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2)); + src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16)); + src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 32)); + src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 48)); + src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 64)); + pi2_temp2 += 80; + + ht_temp = ht; + do + { + src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2)); + src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16)); + + src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); + src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); + src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b); + src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b); + src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 
+ res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); + res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); + + src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3)); + src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b); + src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits. + src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b); + + res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b); + _mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b); + + src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b); + src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b); + src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b); + src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b); + src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); + res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); + + src_hpel_8x16b = _mm_loadu_si128((__m128i 
*)(pi2_temp3 + 16)); + src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b); + src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits. + src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b); + + res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b); + + src_r0_8x16b = src_r2_8x16b; + src_r1_8x16b = src_r3_8x16b; + src_r2_8x16b = src_r4_8x16b; + src_r3_8x16b = src_r5_8x16b; + src_r4_8x16b = src_r6_8x16b; + + ht_temp -= 2; + pi2_temp3 = pi2_temp3 + (16 << 1); + pi2_temp2 = pi2_temp2 + (16 << 1); + pu1_dst = pu1_dst + (dst_strd << 1); + } + while(ht_temp > 0); + + /**********************************************************/ + /* Do second height * 8 block */ + /**********************************************************/ + src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4)); + src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 16)); + src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 32)); + src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 48)); + src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 64)); + pi2_temp4 += 80; + + do + { + src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4)); + src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 16)); + + src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); + src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); + src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, 
src_r1_8x16b); + src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b); + src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); + res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); + + src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp5)); + src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b); + src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits. + src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b); + + res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b); + _mm_storel_epi64((__m128i *)(pu1_dst1), res_16x8b); + + src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b); + src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b); + src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b); + src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b); + src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, 
coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); + res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); + + src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp5 + 16)); + src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b); + src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits. + src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b); + + res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b); + _mm_storel_epi64((__m128i *)(pu1_dst1 + dst_strd), res_16x8b); + + src_r0_8x16b = src_r2_8x16b; + src_r1_8x16b = src_r3_8x16b; + src_r2_8x16b = src_r4_8x16b; + src_r3_8x16b = src_r5_8x16b; + src_r4_8x16b = src_r6_8x16b; + + ht -= 2; + pi2_temp5 = pi2_temp5 + (16 << 1); + pi2_temp4 = pi2_temp4 + (16 << 1); + pu1_dst1 = pu1_dst1 + (dst_strd << 1); + } + while(ht > 0); + } + } +} + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_inter_pred_chroma_ssse3 */ +/* */ +/* Description : This function implements a four-tap 2D filter as */ +/* mentioned in sec. 8.4.2.2.2 titled "Chroma sample */ +/* "interpolation process". (ht,wd) can be (2,2), (4,2), */ +/* (2,4), (4,4), (8,4), (4,8) or (8,8). 
*/
/*                                                                           */
/* Inputs        : puc_src  - pointer to source                              */
/*                 puc_dst  - pointer to destination                         */
/*                 src_strd - stride for source                              */
/*                 dst_strd - stride for destination                         */
/*                 dx       - x position of destination value                */
/*                 dy       - y position of destination value                */
/*                 ht       - height of the block                            */
/*                 wd       - width of the block                             */
/*                                                                           */
/* Issues        : None                                                      */
/*                                                                           */
/* Revision History:                                                         */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_inter_pred_chroma_ssse3(UWORD8 *pu1_src,
                                   UWORD8 *pu1_dst,
                                   WORD32 src_strd,
                                   WORD32 dst_strd,
                                   WORD32 dx,
                                   WORD32 dy,
                                   WORD32 ht,
                                   WORD32 wd)
{
    WORD32 i, j, A, B, C, D;

    /* Bilinear interpolation weights for the four neighbouring samples,
     * derived from the fractional offset (dx, dy):
     *   A = (8-dx)*(8-dy)  top-left      B = dx*(8-dy)  top-right
     *   C = (8-dx)*dy      bottom-left   D = dx*dy      bottom-right
     * A + B + C + D == 64, so each result is renormalized below with
     * (val + 32) >> 6 (round-to-nearest). */
    i = 8 - dx;
    j = 8 - dy;

    A = i * j;
    B = dx * j;
    C = i * dy;
    D = dx * dy;

    if(wd == 2)
    {
        /* Scalar path: source/destination bytes are interleaved with U at
         * even offsets and V at odd offsets (see the //U and //V pairs
         * below); two output rows are produced per loop iteration. */
        WORD32 tmp1, tmp2, tmp3, tmp4;

        do
        {
            //U
            tmp1 = A * pu1_src[0] + B * pu1_src[2] + C * pu1_src[src_strd] + D * pu1_src[src_strd + 2];
            tmp2 = A * pu1_src[2] + B * pu1_src[4] + C * pu1_src[src_strd + 2] + D * pu1_src[src_strd + 4];
            //V
            tmp3 = A * pu1_src[1] + B * pu1_src[3] + C * pu1_src[src_strd + 1] + D * pu1_src[src_strd + 3];
            tmp4 = A * pu1_src[3] + B * pu1_src[5] + C * pu1_src[src_strd + 3] + D * pu1_src[src_strd + 5];

            /* round (weights sum to 64) and clip to [0, 255] */
            tmp1 = (tmp1 + 32) >> 6;
            tmp2 = (tmp2 + 32) >> 6;
            tmp3 = (tmp3 + 32) >> 6;
            tmp4 = (tmp4 + 32) >> 6;

            pu1_dst[0] = CLIP_U8(tmp1);
            pu1_dst[2] = CLIP_U8(tmp2);
            pu1_dst[1] = CLIP_U8(tmp3);
            pu1_dst[3] = CLIP_U8(tmp4);

            pu1_src += src_strd;
            pu1_dst += dst_strd;

            /* second row of the pair */
            tmp1 = A * pu1_src[0] + B * pu1_src[2] + C * pu1_src[src_strd] + D * pu1_src[src_strd + 2];
            tmp2 = A * pu1_src[2] + B * pu1_src[4] + C * pu1_src[src_strd + 2] + D * pu1_src[src_strd + 4];
            tmp3 = A * pu1_src[1] + B * pu1_src[3] + C * pu1_src[src_strd + 1] + D * pu1_src[src_strd + 3];
            tmp4 = A * pu1_src[3] + B * pu1_src[5] + C * pu1_src[src_strd + 3] + D * pu1_src[src_strd + 5];

            tmp1 = (tmp1 + 32) >> 6;
            tmp2 = (tmp2 + 32) >> 6;
            tmp3 = (tmp3 + 32) >> 6;
            tmp4 = (tmp4 + 32) >> 6;

            pu1_dst[0] = CLIP_U8(tmp1);
            pu1_dst[2] = CLIP_U8(tmp2);
            pu1_dst[1] = CLIP_U8(tmp3);
            pu1_dst[3] = CLIP_U8(tmp4);

            ht -= 2;
            pu1_src += src_strd;
            pu1_dst += dst_strd;
        }
        while(ht > 0);

        /* NOTE(review): disabled alternative SSSE3 implementation of the
         * wd == 2 case, kept for reference; the scalar loop above is the
         * live code path. */
        /*
        WORD32 AB, CD;

        __m128i src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
        __m128i src_r1r2_16x8b, src_r2r3_16x8b;
        __m128i res_AB_8x16b, res_CD_8x16b, res_8x16b, res_16x8b;
        __m128i mask_low32b;

        __m128i coeffAB_16x8b, coeffCD_16x8b, round_add32_8x16b;
        __m128i const_shuff_16x8b;

        AB = (B << 8) + A;
        CD = (D << 8) + C;

        coeffAB_16x8b = _mm_set1_epi16(AB);
        coeffCD_16x8b = _mm_set1_epi16(CD);

        round_add32_8x16b = _mm_set1_epi16(32);

        mask_low32b = _mm_set1_epi8(0xff);
        src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); //u1[0] v1[0] u1[1] v1[1] u1[2] v1[2] u1[3] v1[3]
        pu1_src += src_strd;

        const_shuff_16x8b = _mm_setr_epi32(0x03010200, 0x05030402, 0x0b090a08, 0x0d0b0c0a);
        mask_low32b = _mm_srli_si128(mask_low32b, 12);

        do
        {
            src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);               //u2[0] v2[0] u2[1] v2[1] u1[2] v2[2] u2[3] v2[3]
            src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));  //u3[0] v3[0] u3[1] v3[1] u3[2] v3[2] u3[3] v3[3]

            src_r1r2_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b);

            src_r1r2_16x8b = _mm_shuffle_epi8(src_r1r2_16x8b, const_shuff_16x8b); //u1[0] u1[1] v1[0] v1[1] u1[1] u1[2] v1[1] v1[2]
                                                                                  //u2[0] u2[1] v2[0] v2[1] u2[1] u2[2] v2[1] v2[2]
            src_r2r3_16x8b = _mm_shuffle_epi8(src_r2r3_16x8b, const_shuff_16x8b); //u2[0] u2[1] v2[0] v2[1] u2[1] u2[2] v2[1] v2[2]
                                                                                  //u3[0] u3[1] v3[0] v3[1] u3[1] u3[2] v3[1] v3[2]
            res_AB_8x16b = _mm_maddubs_epi16(src_r1r2_16x8b, coeffAB_16x8b);
            res_CD_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeffCD_16x8b);

            res_8x16b = _mm_add_epi16(res_AB_8x16b, round_add32_8x16b);
            res_8x16b = _mm_add_epi16(res_8x16b, res_CD_8x16b);
            res_8x16b = _mm_srai_epi16(res_8x16b, 6);
            res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);

            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char *)pu1_dst);

            ht -= 2;
            pu1_src += src_strd << 1;
            res_16x8b = _mm_srli_si128(res_16x8b, 4);
            src_r1_16x8b = src_r3_16x8b;

            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char *)(pu1_dst + dst_strd));

            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
        */
    }
    else if(wd == 4)
    {
        WORD32 AB, CD;

        __m128i src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
        __m128i res1_AB_8x16b, res1_CD_8x16b, res1_8x16b, res1_16x8b;
        __m128i res2_AB_8x16b, res2_CD_8x16b, res2_8x16b, res2_16x8b;

        __m128i coeffAB_16x8b, coeffCD_16x8b, round_add32_8x16b;
        __m128i const_shuff_16x8b;

        /* Pack the weight pairs (A,B) and (C,D) into bytes so that one
         * pmaddubsw applies both horizontal weights at once. */
        AB = (B << 8) + A;
        CD = (D << 8) + C;

        coeffAB_16x8b = _mm_set1_epi16(AB);
        coeffCD_16x8b = _mm_set1_epi16(CD);

        round_add32_8x16b = _mm_set1_epi16(32);

        /* Shuffle pairs each sample with its right neighbour of the same
         * plane, matching the byte layout expected by pmaddubsw. */
        const_shuff_16x8b = _mm_setr_epi32(0x03010200, 0x05030402, 0x07050604, 0x09070806);

        src_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        src_r1_16x8b = _mm_shuffle_epi8(src_r1_16x8b, const_shuff_16x8b);
        pu1_src += src_strd;

        /* Two output rows per iteration; the bottom row of one pair is
         * reused as the top row of the next (src_r1 = src_r3). */
        do
        {
            src_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            src_r3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));

            src_r2_16x8b = _mm_shuffle_epi8(src_r2_16x8b, const_shuff_16x8b);
            src_r3_16x8b = _mm_shuffle_epi8(src_r3_16x8b, const_shuff_16x8b);

            res1_AB_8x16b = _mm_maddubs_epi16(src_r1_16x8b, coeffAB_16x8b);
            res1_CD_8x16b = _mm_maddubs_epi16(src_r2_16x8b, coeffCD_16x8b);
            res2_AB_8x16b = _mm_maddubs_epi16(src_r2_16x8b, coeffAB_16x8b);
            res2_CD_8x16b = _mm_maddubs_epi16(src_r3_16x8b, coeffCD_16x8b);

            res1_8x16b = _mm_add_epi16(res1_AB_8x16b, res1_CD_8x16b);
            res2_8x16b = _mm_add_epi16(res2_AB_8x16b, res2_CD_8x16b);
            res1_8x16b = _mm_add_epi16(res1_8x16b, round_add32_8x16b);
            res2_8x16b = _mm_add_epi16(res2_8x16b, round_add32_8x16b);

            res1_8x16b = _mm_srai_epi16(res1_8x16b, 6);
            res2_8x16b = _mm_srai_epi16(res2_8x16b, 6);

            res1_16x8b = _mm_packus_epi16(res1_8x16b, res1_8x16b);
            res2_16x8b = _mm_packus_epi16(res2_8x16b, res2_8x16b);

            _mm_storel_epi64((__m128i *)pu1_dst, res1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

            src_r1_16x8b = src_r3_16x8b;

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 8
    {
        WORD32 AB, CD;

        __m128i src_r1l_16x8b, src_r2l_16x8b;
        __m128i src_r1h_16x8b, src_r2h_16x8b;

        __m128i res_l_AB_8x16b, res_l_CD_8x16b;
        __m128i res_h_AB_8x16b, res_h_CD_8x16b;
        __m128i res_l_8x16b, res_h_8x16b, res_16x8b;

        __m128i coeffAB_16x8b, coeffCD_16x8b, round_add32_8x16b;
        __m128i const_shuff_16x8b;

        AB = (B << 8) + A;
        CD = (D << 8) + C;

        coeffAB_16x8b = _mm_set1_epi16(AB);
        coeffCD_16x8b = _mm_set1_epi16(CD);

        round_add32_8x16b = _mm_set1_epi16(32);

        const_shuff_16x8b = _mm_setr_epi32(0x03010200, 0x05030402, 0x07050604, 0x09070806);

        /* Each row is processed as a low and a high 8-byte half. */
        src_r1l_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        src_r1h_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));

        src_r1l_16x8b = _mm_shuffle_epi8(src_r1l_16x8b, const_shuff_16x8b);
        src_r1h_16x8b = _mm_shuffle_epi8(src_r1h_16x8b, const_shuff_16x8b);

        pu1_src += src_strd;

        /* Four output rows per iteration; registers r1*/
        /* and r2* alternate between "top row" and "bottom row" roles so
         * every source row is loaded and shuffled only once. */
        do
        {
            //row 1
            src_r2l_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            src_r2h_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));

            src_r2l_16x8b = _mm_shuffle_epi8(src_r2l_16x8b, const_shuff_16x8b);
            src_r2h_16x8b = _mm_shuffle_epi8(src_r2h_16x8b, const_shuff_16x8b);

            res_l_AB_8x16b = _mm_maddubs_epi16(src_r1l_16x8b, coeffAB_16x8b);
            res_h_AB_8x16b = _mm_maddubs_epi16(src_r1h_16x8b, coeffAB_16x8b);
            res_l_CD_8x16b = _mm_maddubs_epi16(src_r2l_16x8b, coeffCD_16x8b);
            res_h_CD_8x16b = _mm_maddubs_epi16(src_r2h_16x8b, coeffCD_16x8b);

            res_l_8x16b = _mm_add_epi16(res_l_AB_8x16b, round_add32_8x16b);
            res_h_8x16b = _mm_add_epi16(res_h_AB_8x16b, round_add32_8x16b);
            res_l_8x16b = _mm_add_epi16(res_l_8x16b, res_l_CD_8x16b);
            res_h_8x16b = _mm_add_epi16(res_h_8x16b, res_h_CD_8x16b);

            res_l_8x16b = _mm_srai_epi16(res_l_8x16b, 6);
            res_h_8x16b = _mm_srai_epi16(res_h_8x16b, 6);

            res_16x8b = _mm_packus_epi16(res_l_8x16b, res_h_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);

            pu1_src += src_strd;
            pu1_dst += dst_strd;

            //row 2
            src_r1l_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            src_r1h_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));

            src_r1l_16x8b = _mm_shuffle_epi8(src_r1l_16x8b, const_shuff_16x8b);
            src_r1h_16x8b = _mm_shuffle_epi8(src_r1h_16x8b, const_shuff_16x8b);

            res_l_AB_8x16b = _mm_maddubs_epi16(src_r2l_16x8b, coeffAB_16x8b);
            res_h_AB_8x16b = _mm_maddubs_epi16(src_r2h_16x8b, coeffAB_16x8b);
            res_l_CD_8x16b = _mm_maddubs_epi16(src_r1l_16x8b, coeffCD_16x8b);
            res_h_CD_8x16b = _mm_maddubs_epi16(src_r1h_16x8b, coeffCD_16x8b);

            res_l_8x16b = _mm_add_epi16(res_l_AB_8x16b, round_add32_8x16b);
            res_h_8x16b = _mm_add_epi16(res_h_AB_8x16b, round_add32_8x16b);
            res_l_8x16b = _mm_add_epi16(res_l_8x16b, res_l_CD_8x16b);
            res_h_8x16b = _mm_add_epi16(res_h_8x16b, res_h_CD_8x16b);

            res_l_8x16b = _mm_srai_epi16(res_l_8x16b, 6);
            res_h_8x16b = _mm_srai_epi16(res_h_8x16b, 6);

            res_16x8b = _mm_packus_epi16(res_l_8x16b, res_h_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);

            pu1_src += src_strd;
            pu1_dst += dst_strd;

            //row 3
            src_r2l_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            src_r2h_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));

            src_r2l_16x8b = _mm_shuffle_epi8(src_r2l_16x8b, const_shuff_16x8b);
            src_r2h_16x8b = _mm_shuffle_epi8(src_r2h_16x8b, const_shuff_16x8b);

            res_l_AB_8x16b = _mm_maddubs_epi16(src_r1l_16x8b, coeffAB_16x8b);
            res_h_AB_8x16b = _mm_maddubs_epi16(src_r1h_16x8b, coeffAB_16x8b);
            res_l_CD_8x16b = _mm_maddubs_epi16(src_r2l_16x8b, coeffCD_16x8b);
            res_h_CD_8x16b = _mm_maddubs_epi16(src_r2h_16x8b, coeffCD_16x8b);

            res_l_8x16b = _mm_add_epi16(res_l_AB_8x16b, round_add32_8x16b);
            res_h_8x16b = _mm_add_epi16(res_h_AB_8x16b, round_add32_8x16b);
            res_l_8x16b = _mm_add_epi16(res_l_8x16b, res_l_CD_8x16b);
            res_h_8x16b = _mm_add_epi16(res_h_8x16b, res_h_CD_8x16b);

            res_l_8x16b = _mm_srai_epi16(res_l_8x16b, 6);
            res_h_8x16b = _mm_srai_epi16(res_h_8x16b, 6);

            res_16x8b = _mm_packus_epi16(res_l_8x16b, res_h_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);

            pu1_src += src_strd;
            pu1_dst += dst_strd;

            //row 4
            src_r1l_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            src_r1h_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));

            src_r1l_16x8b = _mm_shuffle_epi8(src_r1l_16x8b, const_shuff_16x8b);
            src_r1h_16x8b = _mm_shuffle_epi8(src_r1h_16x8b, const_shuff_16x8b);

            res_l_AB_8x16b = _mm_maddubs_epi16(src_r2l_16x8b, coeffAB_16x8b);
            res_h_AB_8x16b = _mm_maddubs_epi16(src_r2h_16x8b, coeffAB_16x8b);
            res_l_CD_8x16b = _mm_maddubs_epi16(src_r1l_16x8b, coeffCD_16x8b);
            res_h_CD_8x16b = _mm_maddubs_epi16(src_r1h_16x8b, coeffCD_16x8b);

            res_l_8x16b = _mm_add_epi16(res_l_AB_8x16b, round_add32_8x16b);
            res_h_8x16b = _mm_add_epi16(res_h_AB_8x16b, round_add32_8x16b);
            res_l_8x16b = _mm_add_epi16(res_l_8x16b, res_l_CD_8x16b);
            res_h_8x16b = _mm_add_epi16(res_h_8x16b, res_h_CD_8x16b);

            res_l_8x16b = _mm_srai_epi16(res_l_8x16b, 6);
            res_h_8x16b = _mm_srai_epi16(res_h_8x16b, 6);

            res_16x8b = _mm_packus_epi16(res_l_8x16b, res_h_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);

            ht -= 4;
            pu1_src += src_strd;
            pu1_dst += dst_strd;
        }
        while(ht > 0);
    }
}
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the
Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_iquant_itrans_recon_dc_ssse3.c + * + * @brief + * Contains function definitions for inverse quantization, inverse + * transform and reconstruction + * + * @author + * Mohit [100664] + * + * @par List of Functions: + * - ihevc_iquant_itrans_recon_4x4_dc_ssse3() + * - ihevc_iquant_itrans_recon_8x8_dc_ssse3() + * + * @remarks + * None + * + ******************************************************************************* + */ +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_defs.h" +#include "ih264_trans_macros.h" +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include "ih264_trans_data.h" +#include "ih264_size_defs.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include <immintrin.h> + +/* + ******************************************************************************** + * + * @brief This function reconstructs a 4x4 sub block from quantized resiude and + * prediction buffer for dc input pattern only, i.e. only the (0,0) element of the input + * 4x4 block is non-zero. 
For complete function, refer ih264_iquant_itrans_recon_ssse3.c + * + * @par Description: + * The quantized residue is first inverse quantized, then inverse transformed. + * This inverse transformed content is added to the prediction buffer to recon- + * struct the end output + * + * @param[in] pi2_src + * quantized 4x4 block + * + * @param[in] pu1_pred + * prediction 4x4 block + * + * @param[out] pu1_out + * reconstructed 4x4 block + * + * @param[in] src_strd + * quantization buffer stride + * + * @param[in] pred_strd, + * Prediction buffer stride + * + * @param[in] out_strd + * recon buffer Stride + * + * @param[in] pu2_scaling_list + * pointer to scaling list + * + * @param[in] pu2_norm_adjust + * pointer to inverse scale matrix + * + * @param[in] u4_qp_div_6 + * Floor (qp/6) + * + * @param[in] pi4_tmp + * temporary buffer of size 1*16 + * + * @returns none + * + * @remarks none + * + ******************************************************************************* + */ +void ih264_iquant_itrans_recon_4x4_dc_ssse3(WORD16 *pi2_src, + UWORD8 *pu1_pred, + UWORD8 *pu1_out, + WORD32 pred_strd, + WORD32 out_strd, + const UWORD16 *pu2_iscal_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 u4_qp_div_6, + WORD16 *pi2_tmp, + WORD32 iq_start_idx, + WORD16 *pi2_dc_ld_addr) +{ + UWORD32 *pu4_out = (UWORD32 *)pu1_out; + WORD32 q0 = pi2_src[0]; + WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 
1 << (3 - u4_qp_div_6) : 0; + INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4); + + if (iq_start_idx != 0 ) + q0 = pi2_dc_ld_addr[0]; // Restoring dc value for intra case + + i_macro = ((q0 + 32) >> 6); + + __m128i predload_r,pred_r0, pred_r1, pred_r2, pred_r3; + __m128i sign_reg; + __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero + __m128i temp4, temp5, temp6, temp7; + __m128i value_add = _mm_set1_epi16(i_macro); + + zero_8x16b = _mm_setzero_si128(); // all bits reset to zero + //Load pred buffer + predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r0 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p00 p01 p02 p03 0 0 0 0 -- all 16 bits + predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p10 p11 p12 p13 0 0 0 0 -- all 16 bits + predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[2*pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r2 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p20 p21 p22 p23 0 0 0 0 -- all 16 bits + predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[3*pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r3 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p30 p31 p32 p33 0 0 0 0 -- all 16 bits + + pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1); //p00 p01 p02 p03 p10 p11 p12 p13 + pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3); //p20 p21 p22p p23 p30 p31 p32 p33 + + temp4 = _mm_add_epi16(value_add, pred_r0); + temp5 = _mm_add_epi16(value_add, pred_r2); + /*------------------------------------------------------------------*/ + //Clipping the results to 8 bits + sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b); // sign check + temp4 = _mm_and_si128(temp4, sign_reg); + sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b); // sign check + temp5 = _mm_and_si128(temp5, sign_reg); + + temp4 = 
_mm_packus_epi16(temp4,temp5); + temp5 = _mm_srli_si128(temp4,4); + temp6 = _mm_srli_si128(temp5,4); + temp7 = _mm_srli_si128(temp6,4); + + *pu4_out = _mm_cvtsi128_si32(temp4); + pu1_out += out_strd; + pu4_out = (UWORD32 *)(pu1_out); + *(pu4_out) = _mm_cvtsi128_si32(temp5); + pu1_out += out_strd; + pu4_out = (UWORD32 *)(pu1_out); + *(pu4_out) = _mm_cvtsi128_si32(temp6); + pu1_out += out_strd; + pu4_out = (UWORD32 *)(pu1_out); + *(pu4_out) = _mm_cvtsi128_si32(temp7); +} +/** + ******************************************************************************* + * + * @brief + * This function performs inverse quant and Inverse transform type Ci4 for 8x8 block + * for dc input pattern only, i.e. only the (0,0) element of the input 8x8 block is + * non-zero. For complete function, refer ih264_iquant_itrans_recon_ssse3.c + * + * @par Description: + * Performs inverse transform Ci8 and adds the residue to get the + * reconstructed block + * + * @param[in] pi2_src + * Input 8x8coefficients + * + * @param[in] pu1_pred + * Prediction 8x8 block + * + * @param[out] pu1_recon + * Output 8x8 block + * + * @param[in] q_div + * QP/6 + * + * @param[in] q_rem + * QP%6 + * + * @param[in] q_lev + * Quantizer level + * + * @param[in] u4_src_stride + * Input stride + * + * @param[in] u4_pred_stride, + * Prediction stride + * + * @param[in] u4_out_stride + * Output Stride + * + * @param[in] pi4_tmp + * temporary buffer of size 1*64 + * the tmp for each block + * + * @param[in] pu4_iquant_mat + * Pointer to the inverse quantization matrix + * + * @returns Void + * + * @remarks + * None + * + ******************************************************************************* + */ + +void ih264_iquant_itrans_recon_8x8_dc_ssse3 (WORD16 *pi2_src, + UWORD8 *pu1_pred, + UWORD8 *pu1_out, + WORD32 pred_strd, + WORD32 out_strd, + const UWORD16 *pu2_iscale_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 qp_div, + WORD16 *pi2_tmp, + WORD32 iq_start_idx, + WORD16 *pi2_dc_ld_addr) +{ + WORD32 q0 = pi2_src[0]; 
+ WORD16 i_macro, rnd_fact = (qp_div < 6) ? 1 << (5 - qp_div) : 0; + INV_QUANT(q0, pu2_iscale_mat[0], pu2_weigh_mat[0], qp_div, rnd_fact, 6); + i_macro = ((q0 + 32) >> 6); + + __m128i predload_r,pred_r0, pred_r1, pred_r2, pred_r3,pred_r4,pred_r5,pred_r6,pred_r7; + __m128i sign_reg; + __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero + __m128i temp1,temp2,temp3,temp4, temp5, temp6, temp7,temp8; + __m128i value_add = _mm_set1_epi16(i_macro); + + //Load pred buffer row 0 + predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[0])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r0 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits + //Load pred buffer row 1 + predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits + //Load pred buffer row 2 + predload_r = _mm_loadl_epi64( + (__m128i *)(&pu1_pred[2 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r2 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits + //Load pred buffer row 3 + predload_r = _mm_loadl_epi64( + (__m128i *)(&pu1_pred[3 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r3 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits + //Load pred buffer row 4 + predload_r = _mm_loadl_epi64( + (__m128i *)(&pu1_pred[4 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r4 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits + //Load pred buffer row 5 + predload_r = _mm_loadl_epi64( + (__m128i *)(&pu1_pred[5 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bit + pred_r5 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits + //Load pred buffer row 6 + predload_r 
= _mm_loadl_epi64( + (__m128i *)(&pu1_pred[6 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r6 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits + //Load pred buffer row 7 + predload_r = _mm_loadl_epi64( + (__m128i *)(&pu1_pred[7 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r7 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits + + temp1 = _mm_add_epi16(value_add, pred_r0); + + temp2 = _mm_add_epi16(value_add, pred_r1); + + temp3 = _mm_add_epi16(value_add, pred_r2); + + temp4 = _mm_add_epi16(value_add, pred_r3); + + temp5 = _mm_add_epi16(value_add, pred_r4); + + temp6 = _mm_add_epi16(value_add, pred_r5); + + temp7 = _mm_add_epi16(value_add, pred_r6); + + temp8 = _mm_add_epi16(value_add, pred_r7); + /*------------------------------------------------------------------*/ + //Clipping the results to 8 bits + sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b); // sign check + temp1 = _mm_and_si128(temp1, sign_reg); + sign_reg = _mm_cmpgt_epi16(temp2, zero_8x16b); // sign check + temp2 = _mm_and_si128(temp2, sign_reg); + sign_reg = _mm_cmpgt_epi16(temp3, zero_8x16b); // sign check + temp3 = _mm_and_si128(temp3, sign_reg); + sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b); // sign check + temp4 = _mm_and_si128(temp4, sign_reg); + sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b); // sign check + temp5 = _mm_and_si128(temp5, sign_reg); + sign_reg = _mm_cmpgt_epi16(temp6, zero_8x16b); // sign check + temp6 = _mm_and_si128(temp6, sign_reg); + sign_reg = _mm_cmpgt_epi16(temp7, zero_8x16b); // sign check + temp7 = _mm_and_si128(temp7, sign_reg); + sign_reg = _mm_cmpgt_epi16(temp8, zero_8x16b); // sign check + temp8 = _mm_and_si128(temp8, sign_reg); + + temp1 = _mm_packus_epi16(temp1, zero_8x16b); + temp2 = _mm_packus_epi16(temp2, zero_8x16b); + temp3 = _mm_packus_epi16(temp3, zero_8x16b); + temp4 = _mm_packus_epi16(temp4, zero_8x16b); + temp5 = 
/*
 ********************************************************************************
 *
 * @brief This function reconstructs a 4x4 sub block from quantized chroma
 * residue and prediction buffer
 *
 * @par Description:
 *  The quantized residue is first inverse quantized, then inverse transformed.
 *  This inverse transformed content is added to the prediction buffer to recon-
 *  struct the end output
 *
 * @param[in] pi2_src
 *  quantized 4x4 block (unused here; the DC value comes from pi2_dc_src)
 *
 * @param[in] pu1_pred
 *  prediction 4x4 block, with U and V samples interleaved byte-wise
 *
 * @param[out] pu1_out
 *  reconstructed 4x4 block; only alternate (same-plane) bytes are written
 *
 * @param[in] pred_strd
 *  Prediction buffer stride
 *
 * @param[in] out_strd
 *  recon buffer Stride
 *
 * @param[in] pu2_iscal_mat
 *  pointer to inverse scaling list (unused in this DC-only path)
 *
 * @param[in] pu2_weigh_mat
 *  pointer to inverse scale matrix (unused in this DC-only path)
 *
 * @param[in] u4_qp_div_6
 *  Floor (qp/6) (unused in this DC-only path)
 *
 * @param[in] pi2_tmp
 *  temporary buffer of size 1*16 (unused in this DC-only path)
 *
 * @param[in] pi2_dc_src
 *  address of the precomputed (already dequantized) DC value
 *
 * @returns none
 *
 * @remarks none
 *
 *******************************************************************************
 */
void ih264_iquant_itrans_recon_chroma_4x4_dc_ssse3(WORD16 *pi2_src,
                                                   UWORD8 *pu1_pred,
                                                   UWORD8 *pu1_out,
                                                   WORD32 pred_strd,
                                                   WORD32 out_strd,
                                                   const UWORD16 *pu2_iscal_mat,
                                                   const UWORD16 *pu2_weigh_mat,
                                                   UWORD32 u4_qp_div_6,
                                                   WORD16 *pi2_tmp,
                                                   WORD16 *pi2_dc_src)
 {
    WORD16 q0 = pi2_dc_src[0]; // DC value won't be dequantized for chroma inverse transform
    /* DC-only inverse transform: a single constant added to every pixel */
    WORD16 i_macro = ((q0 + 32) >> 6);

    __m128i pred_r0, pred_r1, pred_r2, pred_r3, sign_reg;
    __m128i zero_8x16b = _mm_setzero_si128();
    /* 0x00FF per 16-bit lane: keeps only the low byte of each interleaved
       sample pair, i.e. one chroma plane */
    __m128i chroma_mask = _mm_set1_epi16 (0xFF);
    __m128i value_add = _mm_set1_epi16(i_macro);

    //Load pred buffer (interleaved; only alternate bytes belong to this plane)
    pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));             //p00 p01 p02 p03 -- 8 bits
    pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd]));     //p10 p11 p12 p13 -- 8 bits
    pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 -- 8 bits
    pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 -- 8 bits

    /* Drop the other plane's bytes; lanes are now 16-bit pixel values */
    pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
    pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
    pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
    pred_r3 = _mm_and_si128(pred_r3, chroma_mask);

    pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1); //p00 p01 p02 p03 p10 p11 p12 p13
    pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3); //p20 p21 p22 p23 p30 p31 p32 p33

    pred_r0 = _mm_add_epi16(value_add, pred_r0);
    pred_r2 = _mm_add_epi16(value_add, pred_r2);

    /* Clip to [0, 255]: zero out negative lanes, then saturating pack */
    sign_reg = _mm_cmpgt_epi16(pred_r0, zero_8x16b);
    pred_r0 = _mm_and_si128(pred_r0, sign_reg);
    sign_reg = _mm_cmpgt_epi16(pred_r2, zero_8x16b);
    pred_r2 = _mm_and_si128(pred_r2, sign_reg);

    pred_r0 = _mm_packus_epi16(pred_r0, pred_r2); //all 16 pixels as bytes
    pred_r1 = _mm_srli_si128(pred_r0, 4);         //rows 1..3 into the low dword
    pred_r2 = _mm_srli_si128(pred_r1, 4);
    pred_r3 = _mm_srli_si128(pred_r2, 4);

    /* Re-widen so each output pixel lands on an alternate byte position,
       matching the interleaved layout of pu1_out */
    pred_r0 = _mm_unpacklo_epi8(pred_r0, zero_8x16b); //p00 p01 p02 p03 -- 16 bits
    pred_r1 = _mm_unpacklo_epi8(pred_r1, zero_8x16b); //p10 p11 p12 p13 -- 16 bits
    pred_r2 = _mm_unpacklo_epi8(pred_r2, zero_8x16b); //p20 p21 p22 p23 -- 16 bits
    pred_r3 = _mm_unpacklo_epi8(pred_r3, zero_8x16b); //p30 p31 p32 p33 -- 16 bits

    chroma_mask = _mm_unpacklo_epi64(chroma_mask, zero_8x16b); //1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 -- 8 bits

    /* Masked store writes only this plane's bytes, leaving the other
       plane's interleaved bytes untouched */
    _mm_maskmoveu_si128(pred_r0, chroma_mask, (char *)(&pu1_out[0]));
    _mm_maskmoveu_si128(pred_r1, chroma_mask, (char *)(&pu1_out[out_strd]));
    _mm_maskmoveu_si128(pred_r2, chroma_mask, (char *)(&pu1_out[2*out_strd]));
    _mm_maskmoveu_si128(pred_r3, chroma_mask, (char *)(&pu1_out[3*out_strd]));
}

/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt.
Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_iquant_itrans_recon_sse42.c + * + * @brief + * Contains function definitions for inverse quantization, inverse + * transform and reconstruction + * + * @author + * Mohit [100664] + * + * @par List of Functions: + * - ihevc_iquant_itrans_recon_4x4_sse42() + * - ihevc_iquant_itrans_recon_chroma_4x4_sse42() + * + * @remarks + * None + * + ******************************************************************************* + */ +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_defs.h" +#include "ih264_trans_macros.h" +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include "ih264_trans_data.h" +#include "ih264_size_defs.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include <immintrin.h> + +/* + ******************************************************************************** + * + * @brief This function reconstructs a 4x4 sub block from quantized resiude and + * prediction buffer + * + * @par Description: + * The quantized residue is first inverse quantized, then inverse transformed. 
+ * This inverse transformed content is added to the prediction buffer to recon- + * struct the end output + * + * @param[in] pi2_src + * quantized 4x4 block + * + * @param[in] pu1_pred + * prediction 4x4 block + * + * @param[out] pu1_out + * reconstructed 4x4 block + * + * @param[in] src_strd + * quantization buffer stride + * + * @param[in] pred_strd, + * Prediction buffer stride + * + * @param[in] out_strd + * recon buffer Stride + * + * @param[in] pu2_scaling_list + * pointer to scaling list + * + * @param[in] pu2_norm_adjust + * pointer to inverse scale matrix + * + * @param[in] u4_qp_div_6 + * Floor (qp/6) + * + * @param[in] pi4_tmp + * temporary buffer of size 1*16 + * + * @returns none + * + * @remarks none + * + ******************************************************************************* + */ +void ih264_iquant_itrans_recon_4x4_sse42(WORD16 *pi2_src, + UWORD8 *pu1_pred, + UWORD8 *pu1_out, + WORD32 pred_strd, + WORD32 out_strd, + const UWORD16 *pu2_iscal_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 u4_qp_div_6, + WORD16 *pi2_tmp, + WORD32 iq_start_idx, + WORD16 *pi2_dc_ld_addr) + { + UWORD32 *pu4_out = (UWORD32 *) pu1_out; + __m128i src_r0_r1, src_r2_r3; + __m128i src_r0, src_r1, src_r2, src_r3; + __m128i scalemat_r0_r1, scalemat_r2_r3; + __m128i pred_r0, pred_r1, pred_r2, pred_r3; + __m128i sign_reg, dequant_r0_r1, dequant_r2_r3; + __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero + __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + __m128i resq_r0, resq_r1, resq_r2, resq_r3; + __m128i add_rshift = _mm_set1_epi32((1 << (3 - u4_qp_div_6))); + __m128i value_32 = _mm_set1_epi32(32); + + /*************************************************************/ + /* Dequantization of coefficients. 
Will be replaced by SIMD */ + /* operations on platform */ + /*************************************************************/ + src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row + src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row + scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row + scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row + dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat)); //q00 q01 q02 q03 q10 q11 q12 q13 -- all 16 bits + dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8)); //q20 q21 q22 q23 q30 q31 q32 q33 -- all 16 bits + + temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1); //b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11 b12*q12 b13*q13 -- 16 bit result + temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3); //b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11 b12*q12 b13*q13 -- 16 bit result + + temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b); // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long + temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b); // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long + temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b); // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long + temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b); // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long + + src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit long + src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b); // a10 0 a11 0 a12 0 a13 0 -- 16 bit long + src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b); // a20 0 a21 0 a22 0 a23 0 -- 16 bit long + src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b); // a30 0 a31 0 a32 0 a33 0 -- 16 bit long + + temp4 = 
_mm_madd_epi16(src_r0, temp4); //a00*b00*q00 a10*b10*q10 a20*b20*q20 a30*b30 q30 -- 32 bits long + temp5 = _mm_madd_epi16(src_r1, temp5); + temp6 = _mm_madd_epi16(src_r2, temp6); + temp7 = _mm_madd_epi16(src_r3, temp7); + + if (u4_qp_div_6 >= 4) { + resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4); + resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4); + resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4); + resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4); + } else { + temp4 = _mm_add_epi32(temp4, add_rshift); + temp5 = _mm_add_epi32(temp5, add_rshift); + temp6 = _mm_add_epi32(temp6, add_rshift); + temp7 = _mm_add_epi32(temp7, add_rshift); + resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6); + resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6); + resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6); + resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6); + } + + if (iq_start_idx == 1) + resq_r0 = _mm_insert_epi32(resq_r0,(WORD32)pi2_dc_ld_addr[0],0); + /* Perform Inverse transform */ + /*-------------------------------------------------------------*/ + /* IDCT [ Horizontal transformation ] */ + /*-------------------------------------------------------------*/ + // Matrix transpose + /* + * a0 a1 a2 a3 + * b0 b1 b2 b3 + * c0 c1 c2 c3 + * d0 d1 d2 d3 + */ + temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1); //a0 b0 a1 b1 + temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3); //c0 d0 c1 d1 + temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1); //a2 b2 a3 b3 + temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3); //c2 d2 c3 d3 + resq_r0 = _mm_unpacklo_epi64(temp1, temp3); //a0 b0 c0 d0 + resq_r1 = _mm_unpackhi_epi64(temp1, temp3); //a1 b1 c1 d1 + resq_r2 = _mm_unpacklo_epi64(temp2, temp4); //a2 b2 c2 d2 + resq_r3 = _mm_unpackhi_epi64(temp2, temp4); //a3 b3 c3 d3 + //Transform starts -- horizontal transform + /*------------------------------------------------------------------*/ + /* z0 = w0 + w2 */ + temp0 = _mm_add_epi32(resq_r0, resq_r2); + /* z1 = w0 - w2 */ + temp1 = _mm_sub_epi32(resq_r0, resq_r2); + /* z2 
= (w1 >> 1) - w3 */ + temp2 = _mm_srai_epi32(resq_r1, 1); //(w1>>1) + temp2 = _mm_sub_epi32(temp2, resq_r3); //(w1>>1) - w3 + /* z3 = w1 + (w3 >> 1) */ + temp3 = _mm_srai_epi32(resq_r3, 1); //(w3>>1) + w1 + temp3 = _mm_add_epi32(temp3, resq_r1); + /*----------------------------------------------------------*/ + /* x0 = z0 + z3 */ + resq_r0 = _mm_add_epi32(temp0, temp3); + /* x1 = z1 + z2 */ + resq_r1 = _mm_add_epi32(temp1, temp2); + /* x2 = z1 - z2 */ + resq_r2 = _mm_sub_epi32(temp1, temp2); + /* x3 = z0 - z3 */ + resq_r3 = _mm_sub_epi32(temp0, temp3); + // Matrix transpose + /* + * a0 b0 c0 d0 + * a1 b1 c1 d1 + * a2 b2 c2 d2 + * a3 b3 c3 d3 + */ + temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1); //a0 a1 b0 b1 + temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3); //a2 a3 b2 b3 + temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1); //c0 c1 d0 d1 + temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3); //c2 c3 d2 d3 + resq_r0 = _mm_unpacklo_epi64(temp1, temp3); //a0 a1 a2 a3 + resq_r1 = _mm_unpackhi_epi64(temp1, temp3); //b0 b1 b2 b3 + resq_r2 = _mm_unpacklo_epi64(temp2, temp4); //c0 c1 c2 c3 + resq_r3 = _mm_unpackhi_epi64(temp2, temp4); //d0 d1 d2 d3 + //Transform ends -- horizontal transform + + //Load pred buffer + pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits + + pred_r0 = _mm_cvtepu8_epi32(pred_r0); //p00 p01 p02 p03 -- all 32 bits + pred_r1 = _mm_cvtepu8_epi32(pred_r1); //p10 p11 p12 p13 -- all 32 bits + pred_r2 = _mm_cvtepu8_epi32(pred_r2); //p20 p21 p22 p23 -- all 32 bits + pred_r3 = _mm_cvtepu8_epi32(pred_r3); //p30 p31 p32 p33 -- all 32 bits + + 
/*--------------------------------------------------------------*/ + /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */ + /* */ + /* Add the prediction and store it back to same buffer */ + /*--------------------------------------------------------------*/ + /* z0j = y0j + y2j */ + temp0 = _mm_add_epi32(resq_r0, resq_r2); + /* z1j = y0j - y2j */ + temp1 = _mm_sub_epi32(resq_r0, resq_r2); + /* z2j = (y1j>>1) - y3j */ + temp2 = _mm_srai_epi32(resq_r1, 1); //(y1j>>1) + temp2 = _mm_sub_epi32(temp2, resq_r3); + /* z3j = y1j + (y3j>>1) */ + temp3 = _mm_srai_epi32(resq_r3, 1); //(y3j>>1) + temp3 = _mm_add_epi32(temp3, resq_r1); + + /* x0j = z0j + z3j */ + temp4 = _mm_add_epi32(temp0, temp3); + temp4 = _mm_add_epi32(temp4, value_32); + temp4 = _mm_srai_epi32(temp4, 6); + temp4 = _mm_add_epi32(temp4, pred_r0); + /* x1j = z1j + z2j */ + temp5 = _mm_add_epi32(temp1, temp2); + temp5 = _mm_add_epi32(temp5, value_32); + temp5 = _mm_srai_epi32(temp5, 6); + temp5 = _mm_add_epi32(temp5, pred_r1); + /* x2j = z1j - z2j */ + temp6 = _mm_sub_epi32(temp1, temp2); + temp6 = _mm_add_epi32(temp6, value_32); + temp6 = _mm_srai_epi32(temp6, 6); + temp6 = _mm_add_epi32(temp6, pred_r2); + /* x3j = z0j - z3j */ + temp7 = _mm_sub_epi32(temp0, temp3); + temp7 = _mm_add_epi32(temp7, value_32); + temp7 = _mm_srai_epi32(temp7, 6); + temp7 = _mm_add_epi32(temp7, pred_r3); + + // 32-bit to 16-bit conversion + temp0 = _mm_packs_epi32(temp4, temp5); + temp1 = _mm_packs_epi32(temp6, temp7); + /*------------------------------------------------------------------*/ + //Clipping the results to 8 bits + sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b); // sign check + temp0 = _mm_and_si128(temp0, sign_reg); + sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b); + temp1 = _mm_and_si128(temp1, sign_reg); + + resq_r0 = _mm_packus_epi16(temp0, temp1); + resq_r1 = _mm_srli_si128(resq_r0, 4); + resq_r2 = _mm_srli_si128(resq_r1, 4); + resq_r3 = _mm_srli_si128(resq_r2, 4); + + *pu4_out = _mm_cvtsi128_si32(resq_r0); 
+ pu1_out += out_strd; + pu4_out = (UWORD32 *) (pu1_out); + *(pu4_out) = _mm_cvtsi128_si32(resq_r1); + pu1_out += out_strd; + pu4_out = (UWORD32 *) (pu1_out); + *(pu4_out) = _mm_cvtsi128_si32(resq_r2); + pu1_out += out_strd; + pu4_out = (UWORD32 *) (pu1_out); + *(pu4_out) = _mm_cvtsi128_si32(resq_r3); +} + +/* + ******************************************************************************** + * + * @brief This function reconstructs a 4x4 sub block from quantized chroma resiude and + * prediction buffer + * + * @par Description: + * The quantized residue is first inverse quantized, then inverse transformed. + * This inverse transformed content is added to the prediction buffer to recon- + * struct the end output + * + * @param[in] pi2_src + * quantized 4x4 block + * + * @param[in] pu1_pred + * prediction 4x4 block + * + * @param[out] pu1_out + * reconstructed 4x4 block + * + * @param[in] src_strd + * quantization buffer stride + * + * @param[in] pred_strd, + * Prediction buffer stride + * + * @param[in] out_strd + * recon buffer Stride + * + * @param[in] pu2_scaling_list + * pointer to scaling list + * + * @param[in] pu2_norm_adjust + * pointer to inverse scale matrix + * + * @param[in] u4_qp_div_6 + * Floor (qp/6) + * + * @param[in] pi4_tmp + * temporary buffer of size 1*16 + * + * @returns none + * + * @remarks none + * + ******************************************************************************* + */ +void ih264_iquant_itrans_recon_chroma_4x4_sse42(WORD16 *pi2_src, + UWORD8 *pu1_pred, + UWORD8 *pu1_out, + WORD32 pred_strd, + WORD32 out_strd, + const UWORD16 *pu2_iscal_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 u4_qp_div_6, + WORD16 *pi2_tmp, + WORD16 *pi2_dc_ld_addr) + { + __m128i src_r0_r1, src_r2_r3; + __m128i src_r0, src_r1, src_r2, src_r3; + __m128i scalemat_r0_r1, scalemat_r2_r3; + __m128i pred_r0, pred_r1, pred_r2, pred_r3; + __m128i sign_reg, dequant_r0_r1, dequant_r2_r3; + __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero + 
__m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + __m128i resq_r0, resq_r1, resq_r2, resq_r3; + __m128i add_rshift = _mm_set1_epi32((1 << (3 - u4_qp_div_6))); + __m128i value_32 = _mm_set1_epi32(32); + __m128i chroma_mask = _mm_set1_epi16 (0xFF); + /*************************************************************/ + /* Dequantization of coefficients. Will be replaced by SIMD */ + /* operations on platform */ + /*************************************************************/ + src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row + src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row + scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row + scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row + dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat)); //q00 q01 q02 q03 q10 q11 q12 q13 -- all 16 bits + dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8)); //q20 q21 q22 q23 q30 q31 q32 q33 -- all 16 bits + + temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1); //b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11 b12*q12 b13*q13 -- 16 bit result + temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3); //b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11 b12*q12 b13*q13 -- 16 bit result + + temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b); // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long + temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b); // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long + temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b); // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long + temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b); // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long + + src_r0 = _mm_unpacklo_epi16(src_r0_r1, 
zero_8x16b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit long + src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b); // a10 0 a11 0 a12 0 a13 0 -- 16 bit long + src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b); // a20 0 a21 0 a22 0 a23 0 -- 16 bit long + src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b); // a30 0 a31 0 a32 0 a33 0 -- 16 bit long + + temp4 = _mm_madd_epi16(src_r0, temp4); //a00*b00*q00 a10*b10*q10 a20*b20*q20 a30*b30 q30 -- 32 bits long + temp5 = _mm_madd_epi16(src_r1, temp5); + temp6 = _mm_madd_epi16(src_r2, temp6); + temp7 = _mm_madd_epi16(src_r3, temp7); + + if (u4_qp_div_6 >= 4) { + resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4); + resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4); + resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4); + resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4); + } else { + temp4 = _mm_add_epi32(temp4, add_rshift); + temp5 = _mm_add_epi32(temp5, add_rshift); + temp6 = _mm_add_epi32(temp6, add_rshift); + temp7 = _mm_add_epi32(temp7, add_rshift); + resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6); + resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6); + resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6); + resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6); + } + + resq_r0 = _mm_insert_epi32(resq_r0,(WORD32)pi2_dc_ld_addr[0],0); + /* Perform Inverse transform */ + /*-------------------------------------------------------------*/ + /* IDCT [ Horizontal transformation ] */ + /*-------------------------------------------------------------*/ + // Matrix transpose + /* + * a0 a1 a2 a3 + * b0 b1 b2 b3 + * c0 c1 c2 c3 + * d0 d1 d2 d3 + */ + temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1); //a0 b0 a1 b1 + temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3); //c0 d0 c1 d1 + temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1); //a2 b2 a3 b3 + temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3); //c2 d2 c3 d3 + resq_r0 = _mm_unpacklo_epi64(temp1, temp3); //a0 b0 c0 d0 + resq_r1 = _mm_unpackhi_epi64(temp1, temp3); //a1 b1 c1 d1 + resq_r2 = _mm_unpacklo_epi64(temp2, 
temp4); //a2 b2 c2 d2 + resq_r3 = _mm_unpackhi_epi64(temp2, temp4); //a3 b3 c3 d3 + //Transform starts -- horizontal transform + /*------------------------------------------------------------------*/ + /* z0 = w0 + w2 */ + temp0 = _mm_add_epi32(resq_r0, resq_r2); + /* z1 = w0 - w2 */ + temp1 = _mm_sub_epi32(resq_r0, resq_r2); + /* z2 = (w1 >> 1) - w3 */ + temp2 = _mm_srai_epi32(resq_r1, 1); //(w1>>1) + temp2 = _mm_sub_epi32(temp2, resq_r3); //(w1>>1) - w3 + /* z3 = w1 + (w3 >> 1) */ + temp3 = _mm_srai_epi32(resq_r3, 1); //(w3>>1) + w1 + temp3 = _mm_add_epi32(temp3, resq_r1); + /*----------------------------------------------------------*/ + /* x0 = z0 + z3 */ + resq_r0 = _mm_add_epi32(temp0, temp3); + /* x1 = z1 + z2 */ + resq_r1 = _mm_add_epi32(temp1, temp2); + /* x2 = z1 - z2 */ + resq_r2 = _mm_sub_epi32(temp1, temp2); + /* x3 = z0 - z3 */ + resq_r3 = _mm_sub_epi32(temp0, temp3); + // Matrix transpose + /* + * a0 b0 c0 d0 + * a1 b1 c1 d1 + * a2 b2 c2 d2 + * a3 b3 c3 d3 + */ + temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1); //a0 a1 b0 b1 + temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3); //a2 a3 b2 b3 + temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1); //c0 c1 d0 d1 + temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3); //c2 c3 d2 d3 + resq_r0 = _mm_unpacklo_epi64(temp1, temp3); //a0 a1 a2 a3 + resq_r1 = _mm_unpackhi_epi64(temp1, temp3); //b0 b1 b2 b3 + resq_r2 = _mm_unpacklo_epi64(temp2, temp4); //c0 c1 c2 c3 + resq_r3 = _mm_unpackhi_epi64(temp2, temp4); //d0 d1 d2 d3 + //Transform ends -- horizontal transform + + //Load pred buffer + pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits + 
+ pred_r0 = _mm_and_si128(pred_r0, chroma_mask); + pred_r1 = _mm_and_si128(pred_r1, chroma_mask); + pred_r2 = _mm_and_si128(pred_r2, chroma_mask); + pred_r3 = _mm_and_si128(pred_r3, chroma_mask); + + pred_r0 = _mm_cvtepu16_epi32(pred_r0); //p00 p01 p02 p03 -- all 32 bits + pred_r1 = _mm_cvtepu16_epi32(pred_r1); //p10 p11 p12 p13 -- all 32 bits + pred_r2 = _mm_cvtepu16_epi32(pred_r2); //p20 p21 p22 p23 -- all 32 bits + pred_r3 = _mm_cvtepu16_epi32(pred_r3); //p30 p31 p32 p33 -- all 32 bits + + /*--------------------------------------------------------------*/ + /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */ + /* */ + /* Add the prediction and store it back to same buffer */ + /*--------------------------------------------------------------*/ + /* z0j = y0j + y2j */ + temp0 = _mm_add_epi32(resq_r0, resq_r2); + /* z1j = y0j - y2j */ + temp1 = _mm_sub_epi32(resq_r0, resq_r2); + /* z2j = (y1j>>1) - y3j */ + temp2 = _mm_srai_epi32(resq_r1, 1); //(y1j>>1) + temp2 = _mm_sub_epi32(temp2, resq_r3); + /* z3j = y1j + (y3j>>1) */ + temp3 = _mm_srai_epi32(resq_r3, 1); //(y3j>>1) + temp3 = _mm_add_epi32(temp3, resq_r1); + + /* x0j = z0j + z3j */ + temp4 = _mm_add_epi32(temp0, temp3); + temp4 = _mm_add_epi32(temp4, value_32); + temp4 = _mm_srai_epi32(temp4, 6); + temp4 = _mm_add_epi32(temp4, pred_r0); + /* x1j = z1j + z2j */ + temp5 = _mm_add_epi32(temp1, temp2); + temp5 = _mm_add_epi32(temp5, value_32); + temp5 = _mm_srai_epi32(temp5, 6); + temp5 = _mm_add_epi32(temp5, pred_r1); + /* x2j = z1j - z2j */ + temp6 = _mm_sub_epi32(temp1, temp2); + temp6 = _mm_add_epi32(temp6, value_32); + temp6 = _mm_srai_epi32(temp6, 6); + temp6 = _mm_add_epi32(temp6, pred_r2); + /* x3j = z0j - z3j */ + temp7 = _mm_sub_epi32(temp0, temp3); + temp7 = _mm_add_epi32(temp7, value_32); + temp7 = _mm_srai_epi32(temp7, 6); + temp7 = _mm_add_epi32(temp7, pred_r3); + + // 32-bit to 16-bit conversion + temp0 = _mm_packs_epi32(temp4, temp5); + temp1 = _mm_packs_epi32(temp6, temp7); + 
/*------------------------------------------------------------------*/ + //Clipping the results to 8 bits + sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b); // sign check + temp0 = _mm_and_si128(temp0, sign_reg); + sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b); + temp1 = _mm_and_si128(temp1, sign_reg); + + resq_r0 = _mm_packus_epi16(temp0, temp1); + resq_r1 = _mm_srli_si128(resq_r0, 4); + resq_r2 = _mm_srli_si128(resq_r1, 4); + resq_r3 = _mm_srli_si128(resq_r2, 4); + + resq_r0 = _mm_cvtepu8_epi16(resq_r0); //p00 p01 p02 p03 -- all 16 bits + resq_r1 = _mm_cvtepu8_epi16(resq_r1); //p10 p11 p12 p13 -- all 16 bits + resq_r2 = _mm_cvtepu8_epi16(resq_r2); //p20 p21 p22 p23 -- all 16 bits + resq_r3 = _mm_cvtepu8_epi16(resq_r3); //p30 p31 p32 p33 -- all 16 bits + + chroma_mask = _mm_unpacklo_epi64(chroma_mask, zero_8x16b); + + _mm_maskmoveu_si128(resq_r0, chroma_mask, (char *)(&pu1_out[0])); + _mm_maskmoveu_si128(resq_r1, chroma_mask, (char *)(&pu1_out[out_strd])); + _mm_maskmoveu_si128(resq_r2, chroma_mask, (char *)(&pu1_out[2*out_strd])); + _mm_maskmoveu_si128(resq_r3, chroma_mask, (char *)(&pu1_out[3*out_strd])); +} diff --git a/common/x86/ih264_iquant_itrans_recon_ssse3.c b/common/x86/ih264_iquant_itrans_recon_ssse3.c new file mode 100755 index 0000000..ca1397e --- /dev/null +++ b/common/x86/ih264_iquant_itrans_recon_ssse3.c @@ -0,0 +1,1035 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_iquant_itrans_recon_ssse3.c + * + * @brief + * Contains function definitions for inverse quantization, inverse + * transform and reconstruction + * + * @author + * Mohit [100664] + * + * @par List of Functions: + * - ihevc_iquant_itrans_recon_4x4_ssse3() + * - ihevc_iquant_itrans_recon_8x8_ssse3() + * + * @remarks + * None + * + ******************************************************************************* + */ +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_defs.h" +#include "ih264_trans_macros.h" +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include "ih264_trans_data.h" +#include "ih264_size_defs.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include <immintrin.h> + +/* + ******************************************************************************** + * + * @brief This function reconstructs a 4x4 sub block from quantized resiude and + * prediction buffer + * + * @par Description: + * The quantized residue is first inverse quantized, then inverse transformed. 
+ * This inverse transformed content is added to the prediction buffer to recon- + * struct the end output + * + * @param[in] pi2_src + * quantized 4x4 block + * + * @param[in] pu1_pred + * prediction 4x4 block + * + * @param[out] pu1_out + * reconstructed 4x4 block + * + * @param[in] src_strd + * quantization buffer stride + * + * @param[in] pred_strd, + * Prediction buffer stride + * + * @param[in] out_strd + * recon buffer Stride + * + * @param[in] pu2_scaling_list + * pointer to scaling list + * + * @param[in] pu2_norm_adjust + * pointer to inverse scale matrix + * + * @param[in] u4_qp_div_6 + * Floor (qp/6) + * + * @param[in] pi4_tmp + * temporary buffer of size 1*16 + * + * @returns none + * + * @remarks none + * + ******************************************************************************* + */ +void ih264_iquant_itrans_recon_4x4_ssse3(WORD16 *pi2_src, + UWORD8 *pu1_pred, + UWORD8 *pu1_out, + WORD32 pred_strd, + WORD32 out_strd, + const UWORD16 *pu2_iscal_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 u4_qp_div_6, + WORD16 *pi2_tmp, + WORD32 iq_start_idx, + WORD16 *pi2_dc_ld_addr) +{ + UWORD32 *pu4_out = (UWORD32 *) pu1_out; + __m128i src_r0_r1, src_r2_r3; + __m128i src_r0, src_r1, src_r2, src_r3; + __m128i scalemat_r0_r1, scalemat_r2_r3, predload_r; + __m128i pred_r0, pred_r1, pred_r2, pred_r3; + __m128i sign_reg, dequant_r0_r1, dequant_r2_r3; + __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero + __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + __m128i resq_r0, resq_r1, resq_r2, resq_r3; + __m128i add_rshift = _mm_set1_epi32((1 << (3 - u4_qp_div_6))); + __m128i value_32 = _mm_set1_epi32(32); + + /*************************************************************/ + /* Dequantization of coefficients. 
Will be replaced by SIMD */ + /* operations on platform */ + /*************************************************************/ + src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row + src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row + scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row + scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row + dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat)); //q00 q01 q02 q03 q10 q11 q12 q13 -- all 16 bits + dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8)); //q20 q21 q22 q23 q30 q31 q32 q33 -- all 16 bits + + temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1); //b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11 b12*q12 b13*q13 -- 16 bit result + temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3); //b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11 b12*q12 b13*q13 -- 16 bit result + + temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b); // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long + temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b); // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long + temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b); // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long + temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b); // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long + + src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit long + src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b); // a10 0 a11 0 a12 0 a13 0 -- 16 bit long + src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b); // a20 0 a21 0 a22 0 a23 0 -- 16 bit long + src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b); // a30 0 a31 0 a32 0 a33 0 -- 16 bit long + + temp4 = 
_mm_madd_epi16(src_r0, temp4); //a00*b00*q00 a10*b10*q10 a20*b20*q20 a30*b30 q30 -- 32 bits long + temp5 = _mm_madd_epi16(src_r1, temp5); + temp6 = _mm_madd_epi16(src_r2, temp6); + temp7 = _mm_madd_epi16(src_r3, temp7); + + if (u4_qp_div_6 >= 4) { + resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4); + resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4); + resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4); + resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4); + } else { + temp4 = _mm_add_epi32(temp4, add_rshift); + temp5 = _mm_add_epi32(temp5, add_rshift); + temp6 = _mm_add_epi32(temp6, add_rshift); + temp7 = _mm_add_epi32(temp7, add_rshift); + resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6); + resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6); + resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6); + resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6); + } + + if (iq_start_idx == 1) + { + resq_r0 = _mm_insert_epi16(resq_r0,(WORD32)pi2_src[0],0); + if (pi2_src[0] >= 0) + resq_r0 = _mm_insert_epi16(resq_r0,0,1); + else + resq_r0 = _mm_insert_epi16(resq_r0,-1,1); + } + /* Perform Inverse transform */ + /*-------------------------------------------------------------*/ + /* IDCT [ Horizontal transformation ] */ + /*-------------------------------------------------------------*/ + // Matrix transpose + /* + * a0 a1 a2 a3 + * b0 b1 b2 b3 + * c0 c1 c2 c3 + * d0 d1 d2 d3 + */ + temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1); //a0 b0 a1 b1 + temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3); //c0 d0 c1 d1 + temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1); //a2 b2 a3 b3 + temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3); //c2 d2 c3 d3 + resq_r0 = _mm_unpacklo_epi64(temp1, temp3); //a0 b0 c0 d0 + resq_r1 = _mm_unpackhi_epi64(temp1, temp3); //a1 b1 c1 d1 + resq_r2 = _mm_unpacklo_epi64(temp2, temp4); //a2 b2 c2 d2 + resq_r3 = _mm_unpackhi_epi64(temp2, temp4); //a3 b3 c3 d3 + //Transform starts -- horizontal transform + /*------------------------------------------------------------------*/ + /* z0 = w0 + w2 
*/ + temp0 = _mm_add_epi32(resq_r0, resq_r2); + /* z1 = w0 - w2 */ + temp1 = _mm_sub_epi32(resq_r0, resq_r2); + /* z2 = (w1 >> 1) - w3 */ + temp2 = _mm_srai_epi32(resq_r1, 1); //(w1>>1) + temp2 = _mm_sub_epi32(temp2, resq_r3); //(w1>>1) - w3 + /* z3 = w1 + (w3 >> 1) */ + temp3 = _mm_srai_epi32(resq_r3, 1); //(w3>>1) + w1 + temp3 = _mm_add_epi32(temp3, resq_r1); + /*----------------------------------------------------------*/ + /* x0 = z0 + z3 */ + resq_r0 = _mm_add_epi32(temp0, temp3); + /* x1 = z1 + z2 */ + resq_r1 = _mm_add_epi32(temp1, temp2); + /* x2 = z1 - z2 */ + resq_r2 = _mm_sub_epi32(temp1, temp2); + /* x3 = z0 - z3 */ + resq_r3 = _mm_sub_epi32(temp0, temp3); + // Matrix transpose + /* + * a0 b0 c0 d0 + * a1 b1 c1 d1 + * a2 b2 c2 d2 + * a3 b3 c3 d3 + */ + temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1); //a0 a1 b0 b1 + temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3); //a2 a3 b2 b3 + temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1); //c0 c1 d0 d1 + temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3); //c2 c3 d2 d3 + resq_r0 = _mm_unpacklo_epi64(temp1, temp3); //a0 a1 a2 a3 + resq_r1 = _mm_unpackhi_epi64(temp1, temp3); //b0 b1 b2 b3 + resq_r2 = _mm_unpacklo_epi64(temp2, temp4); //c0 c1 c2 c3 + resq_r3 = _mm_unpackhi_epi64(temp2, temp4); //d0 d1 d2 d3 + //Transform ends -- horizontal transform + + zero_8x16b = _mm_setzero_si128(); // all bits reset to zero + //Load pred buffer + predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r0 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p00 p01 p02 p03 0 0 0 0 -- all 16 bits + + predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p10 p11 p12 p13 0 0 0 0 -- all 16 bits + + predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r2 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p20 p21 p22 
p23 0 0 0 0 -- all 16 bits + + predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r3 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p30 p31 p32 p33 0 0 0 0 -- all 16 bits + pred_r0 = _mm_unpacklo_epi16(pred_r0, zero_8x16b); //p00 p01 p02 p03 -- 32 bits sign extended + pred_r1 = _mm_unpacklo_epi16(pred_r1, zero_8x16b); //p10 p11 p12 p13 -- 32 bits sign extended + pred_r2 = _mm_unpacklo_epi16(pred_r2, zero_8x16b); //p20 p21 p22 p23 -- 32 bits sign extended + pred_r3 = _mm_unpacklo_epi16(pred_r3, zero_8x16b); //p30 p31 p32 p33 -- 32 bits sign extended + + /*--------------------------------------------------------------*/ + /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */ + /* */ + /* Add the prediction and store it back to same buffer */ + /*--------------------------------------------------------------*/ + /* z0j = y0j + y2j */ + temp0 = _mm_add_epi32(resq_r0, resq_r2); + /* z1j = y0j - y2j */ + temp1 = _mm_sub_epi32(resq_r0, resq_r2); + /* z2j = (y1j>>1) - y3j */ + temp2 = _mm_srai_epi32(resq_r1, 1); //(y1j>>1) + temp2 = _mm_sub_epi32(temp2, resq_r3); + /* z3j = y1j + (y3j>>1) */ + temp3 = _mm_srai_epi32(resq_r3, 1); //(y3j>>1) + temp3 = _mm_add_epi32(temp3, resq_r1); + + /* x0j = z0j + z3j */ + temp4 = _mm_add_epi32(temp0, temp3); + temp4 = _mm_add_epi32(temp4, value_32); + temp4 = _mm_srai_epi32(temp4, 6); + temp4 = _mm_add_epi32(temp4, pred_r0); + /* x1j = z1j + z2j */ + temp5 = _mm_add_epi32(temp1, temp2); + temp5 = _mm_add_epi32(temp5, value_32); + temp5 = _mm_srai_epi32(temp5, 6); + temp5 = _mm_add_epi32(temp5, pred_r1); + /* x2j = z1j - z2j */ + temp6 = _mm_sub_epi32(temp1, temp2); + temp6 = _mm_add_epi32(temp6, value_32); + temp6 = _mm_srai_epi32(temp6, 6); + temp6 = _mm_add_epi32(temp6, pred_r2); + /* x3j = z0j - z3j */ + temp7 = _mm_sub_epi32(temp0, temp3); + temp7 = _mm_add_epi32(temp7, value_32); + temp7 = _mm_srai_epi32(temp7, 6); + temp7 = _mm_add_epi32(temp7, 
pred_r3); + + // 32-bit to 16-bit conversion + temp0 = _mm_packs_epi32(temp4, temp5); + temp1 = _mm_packs_epi32(temp6, temp7); + /*------------------------------------------------------------------*/ + //Clipping the results to 8 bits + sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b); // sign check + temp0 = _mm_and_si128(temp0, sign_reg); + sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b); + temp1 = _mm_and_si128(temp1, sign_reg); + + resq_r0 = _mm_packus_epi16(temp0, temp1); + resq_r1 = _mm_srli_si128(resq_r0, 4); + resq_r2 = _mm_srli_si128(resq_r1, 4); + resq_r3 = _mm_srli_si128(resq_r2, 4); + + *pu4_out = _mm_cvtsi128_si32(resq_r0); + pu1_out += out_strd; + pu4_out = (UWORD32 *) (pu1_out); + *(pu4_out) = _mm_cvtsi128_si32(resq_r1); + pu1_out += out_strd; + pu4_out = (UWORD32 *) (pu1_out); + *(pu4_out) = _mm_cvtsi128_si32(resq_r2); + pu1_out += out_strd; + pu4_out = (UWORD32 *) (pu1_out); + *(pu4_out) = _mm_cvtsi128_si32(resq_r3); +} +/** + ******************************************************************************* + * + * @brief + * This function performs inverse quant and Inverse transform type Ci4 for 8x8 block + * + * @par Description: + * Performs inverse transform Ci8 and adds the residue to get the + * reconstructed block + * + * @param[in] pi2_src + * Input 8x8coefficients + * + * @param[in] pu1_pred + * Prediction 8x8 block + * + * @param[out] pu1_recon + * Output 8x8 block + * + * @param[in] q_div + * QP/6 + * + * @param[in] q_rem + * QP%6 + * + * @param[in] q_lev + * Quantizer level + * + * @param[in] u4_src_stride + * Input stride + * + * @param[in] u4_pred_stride, + * Prediction stride + * + * @param[in] u4_out_stride + * Output Stride + * + * @param[in] pi4_tmp + * temporary buffer of size 1*64 + * the tmp for each block + * + * @param[in] pu4_iquant_mat + * Pointer to the inverse quantization matrix + * + * @returns Void + * + * @remarks + * None + * + ******************************************************************************* + */ + +void 
ih264_iquant_itrans_recon_8x8_ssse3(WORD16 *pi2_src, + UWORD8 *pu1_pred, + UWORD8 *pu1_out, + WORD32 pred_strd, + WORD32 out_strd, + const UWORD16 *pu2_iscale_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 qp_div, + WORD16 *pi2_tmp, + WORD32 iq_start_idx, + WORD16 *pi2_dc_ld_addr) +{ + __m128i src_r0; + __m128i scalemat_r0; + __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero + // __m128i one_8x16b = _mm_set1_epi8(255); // all bits set to 1 + // __m128i one_zero_mask = _mm_unpacklo_epi16(one_8x16b, zero_8x16b); // 1 0 1 0 1 0 1 0 --- 16 bits size + __m128i value_32 = _mm_set1_epi32(32); + __m128i add_rshift = _mm_set1_epi32((1 << (5 - qp_div))); + __m128i dequant_r0; + __m128i predload_r; + __m128i pred_r0_1, pred_r1_1, pred_r2_1, pred_r3_1, pred_r4_1, pred_r5_1, + pred_r6_1, pred_r7_1; + __m128i sign_reg; + __m128i src_r0_1, src_r0_2; + __m128i scalemat_r0_1, scalemat_r0_2; + __m128i temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; + __m128i temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, + temp18, temp19, temp20; + // To store dequantization results + __m128i resq_r0_1, resq_r0_2, resq_r1_1, resq_r1_2, resq_r2_1, resq_r2_2, + resq_r3_1, resq_r3_2, resq_r4_1, resq_r4_2, resq_r5_1, resq_r5_2, + resq_r6_1, resq_r6_2, resq_r7_1, resq_r7_2; + + /*************************************************************/ + /* Dequantization of coefficients. Will be replaced by SIMD */ + /* operations on platform. 
Note : DC coeff is not scaled */ + /*************************************************************/ + + // Row 0 processing + src_r0 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a04 a05 a06 a07 -- the source matrix 0th row + scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat)); //b00 b01 b02 b03 b04 b05 b06 b07 -- the scaling matrix 0th row + dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[0])); //q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits + src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); //a00 0 a01 0 a02 0 a03 0 -- 16 bit long + src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long + temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0); //b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result + scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b); // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long + scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b); // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long + + temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1); // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long + temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2); // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long + + if (qp_div >= 6) { + resq_r0_1 = _mm_slli_epi32(temp5, qp_div - 6); + resq_r0_2 = _mm_slli_epi32(temp7, qp_div - 6); + } else { + temp5 = _mm_add_epi32(temp5, add_rshift); + temp7 = _mm_add_epi32(temp7, add_rshift); + resq_r0_1 = _mm_srai_epi32(temp5, 6 - qp_div); + resq_r0_2 = _mm_srai_epi32(temp7, 6 - qp_div); + } + resq_r0_1 = _mm_packs_epi32(resq_r0_1, resq_r0_2); //a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long + // Row 1 processing + src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a00 a01 a02 a03 a04 a05 a06 a07 a08 -- the source matrix 1st row + scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 8)); //b00 b01 b02 b03 b04 b05 b06 b07 b08 -- the scaling matrix 1st row 
+ dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[8])); //q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits + src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); //a00 0 a01 0 a02 0 a03 0 -- 16 bit long + src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long + temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0); //b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result + scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b); // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long + scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b); // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long + temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1); // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long + temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2); // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long + if (qp_div >= 6) { + resq_r1_1 = _mm_slli_epi32(temp5, qp_div - 6); + resq_r1_2 = _mm_slli_epi32(temp7, qp_div - 6); + } else { + temp5 = _mm_add_epi32(temp5, add_rshift); + temp7 = _mm_add_epi32(temp7, add_rshift); + resq_r1_1 = _mm_srai_epi32(temp5, 6 - qp_div); + resq_r1_2 = _mm_srai_epi32(temp7, 6 - qp_div); + } + resq_r1_1 = _mm_packs_epi32(resq_r1_1, resq_r1_2); //a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long + // Row 2 processing + src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 16)); //a00 a01 a02 a03 a04 a05 a06 a07 a08 -- the source matrix 2nd row + scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 16)); //b00 b01 b02 b03 b04 b05 b06 b07 b08 -- the scaling matrix 2nd row + dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[16])); //q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits + src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); //a00 0 a01 0 a02 0 a03 0 -- 16 bit long + src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long + temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0); //b00*q0 b01*q1 
b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result + scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b); // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long + scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b); // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long + temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1); // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long + temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2); // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long + if (qp_div >= 6) { + resq_r2_1 = _mm_slli_epi32(temp5, qp_div - 6); + resq_r2_2 = _mm_slli_epi32(temp7, qp_div - 6); + } else { + temp5 = _mm_add_epi32(temp5, add_rshift); + temp7 = _mm_add_epi32(temp7, add_rshift); + resq_r2_1 = _mm_srai_epi32(temp5, 6 - qp_div); + resq_r2_2 = _mm_srai_epi32(temp7, 6 - qp_div); + } + resq_r2_1 = _mm_packs_epi32(resq_r2_1, resq_r2_2); //a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long + // Row 3 processing + src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 24)); //a00 a01 a02 a03 a04 a05 a06 a07 a08 -- the source matrix 3rd row + scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 24)); //b00 b01 b02 b03 b04 b05 b06 b07 b08 -- the scaling matrix 3rd row + dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[24])); //q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits + src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); //a00 0 a01 0 a02 0 a03 0 -- 16 bit long + src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long + temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0); //b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result + scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b); // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long + scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b); // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long + temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1); // a00*b00*q0 a01*b01*q1 
a02*b02*q2 a03*b03*q3 - 32 bits long + temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2); // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long + if (qp_div >= 6) { + resq_r3_1 = _mm_slli_epi32(temp5, qp_div - 6); + resq_r3_2 = _mm_slli_epi32(temp7, qp_div - 6); + } else { + temp5 = _mm_add_epi32(temp5, add_rshift); + temp7 = _mm_add_epi32(temp7, add_rshift); + resq_r3_1 = _mm_srai_epi32(temp5, 6 - qp_div); + resq_r3_2 = _mm_srai_epi32(temp7, 6 - qp_div); + } + resq_r3_1 = _mm_packs_epi32(resq_r3_1, resq_r3_2); //a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long + // Row 4 processing + src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 32)); //a00 a01 a02 a03 a04 a05 a06 a07 a08 -- the source matrix 4th row + scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 32)); //b00 b01 b02 b03 b04 b05 b06 b07 b08 -- the scaling matrix 4th row + dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[32])); //q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits + src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); //a00 0 a01 0 a02 0 a03 0 -- 16 bit long + src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long + temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0); //b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result + scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b); // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long + scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b); // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long + temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1); // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long + temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2); // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long + if (qp_div >= 6) { + resq_r4_1 = _mm_slli_epi32(temp5, qp_div - 6); + resq_r4_2 = _mm_slli_epi32(temp7, qp_div - 6); + + } else { + temp5 = _mm_add_epi32(temp5, add_rshift); + temp7 = _mm_add_epi32(temp7, 
add_rshift); + resq_r4_1 = _mm_srai_epi32(temp5, 6 - qp_div); + resq_r4_2 = _mm_srai_epi32(temp7, 6 - qp_div); + } + resq_r4_1 = _mm_packs_epi32(resq_r4_1, resq_r4_2); //a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long + // Row 5 processing + src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 40)); //a00 a01 a02 a03 a04 a05 a06 a07 a08 -- the source matrix 5th row + scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 40)); //b00 b01 b02 b03 b04 b05 b06 b07 b08 -- the scaling matrix 5th row + dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[40])); //q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits + src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); //a00 0 a01 0 a02 0 a03 0 -- 16 bit long + src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long + temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0); //b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result + scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b); // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long + scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b); // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long + temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1); // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long + temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2); // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long + if (qp_div >= 6) { + resq_r5_1 = _mm_slli_epi32(temp5, qp_div - 6); + resq_r5_2 = _mm_slli_epi32(temp7, qp_div - 6); + //resq_r5_1 = _mm_and_si128(resq_r5_1,one_zero_mask); + //resq_r5_2 = _mm_and_si128(resq_r5_2,one_zero_mask); + } else { + temp5 = _mm_add_epi32(temp5, add_rshift); + temp7 = _mm_add_epi32(temp7, add_rshift); + resq_r5_1 = _mm_srai_epi32(temp5, 6 - qp_div); + resq_r5_2 = _mm_srai_epi32(temp7, 6 - qp_div); + } + resq_r5_1 = _mm_packs_epi32(resq_r5_1, resq_r5_2); //a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 
a07*b07*q7 -- 16 bit long + // Row 6 processing + src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 48)); //a00 a01 a02 a03 a04 a05 a06 a07 a08 -- the source matrix 6th row + scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 48)); //b00 b01 b02 b03 b04 b05 b06 b07 b08 -- the scaling matrix 6th row + dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[48])); //q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits + src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); //a00 0 a01 0 a02 0 a03 0 -- 16 bit long + src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long + temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0); //b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result + scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b); // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long + scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b); // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long + temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1); // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long + temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2); // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long + if (qp_div >= 6) { + resq_r6_1 = _mm_slli_epi32(temp5, qp_div - 6); + resq_r6_2 = _mm_slli_epi32(temp7, qp_div - 6); + //resq_r6_1 = _mm_and_si128(resq_r6_1,one_zero_mask); + //resq_r6_2 = _mm_and_si128(resq_r6_2,one_zero_mask); + } else { + temp5 = _mm_add_epi32(temp5, add_rshift); + temp7 = _mm_add_epi32(temp7, add_rshift); + resq_r6_1 = _mm_srai_epi32(temp5, 6 - qp_div); + resq_r6_2 = _mm_srai_epi32(temp7, 6 - qp_div); + //resq_r6_1 = _mm_and_si128(resq_r6_1,one_zero_mask); + //resq_r6_2 = _mm_and_si128(resq_r6_2,one_zero_mask); + } + resq_r6_1 = _mm_packs_epi32(resq_r6_1, resq_r6_2); //a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long + // Row 7 processing + src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 56)); //a00 a01 a02 a03 a04 a05 a06 a07 
a08 -- the source matrix 7th row + scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 56)); //b00 b01 b02 b03 b04 b05 b06 b07 b08 -- the scaling matrix 7th row + dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[56])); //q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits + src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); //a00 0 a01 0 a02 0 a03 0 -- 16 bit long + src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long + temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0); //b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result + scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b); // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long + scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b); // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long + temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1); // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long + temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2); // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long + if (qp_div >= 6) { + resq_r7_1 = _mm_slli_epi32(temp5, qp_div - 6); + resq_r7_2 = _mm_slli_epi32(temp7, qp_div - 6); + } else { + temp5 = _mm_add_epi32(temp5, add_rshift); + temp7 = _mm_add_epi32(temp7, add_rshift); + resq_r7_1 = _mm_srai_epi32(temp5, 6 - qp_div); + resq_r7_2 = _mm_srai_epi32(temp7, 6 - qp_div); + } + resq_r7_1 = _mm_packs_epi32(resq_r7_1, resq_r7_2); //a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long + /* Perform Inverse transform */ + /*--------------------------------------------------------------------*/ + /* IDCT [ Horizontal transformation ] */ + /*--------------------------------------------------------------------*/ + // Matrix transpose + /* + * a0 a1 a2 a3 a4 a5 a6 a7 + * b0 b1 b2 b3 b4 b5 b6 b7 + * c0 c1 c2 c3 c4 c5 c6 c7 + * d0 d1 d2 d3 d4 d5 d6 d7 + */ + temp1 = _mm_unpacklo_epi16(resq_r0_1, resq_r1_1); //a0 b0 a1 b1 a2 b2 a3 b3 + temp3 = 
_mm_unpacklo_epi16(resq_r2_1, resq_r3_1); //c0 d0 c1 d1 c2 d2 c3 d3 + temp2 = _mm_unpackhi_epi16(resq_r0_1, resq_r1_1); //a4 b4 a5 b5 a6 b6 a7 b7 + temp4 = _mm_unpackhi_epi16(resq_r2_1, resq_r3_1); //c4 d4 c5 d5 c6 d6 c7 d7 + resq_r0_1 = _mm_unpacklo_epi32(temp1, temp3); //a0 b0 c0 d0 a1 b1 c1 d1 + resq_r1_1 = _mm_unpackhi_epi32(temp1, temp3); //a2 b2 c2 d2 a3 b3 c3 d3 + resq_r2_1 = _mm_unpacklo_epi32(temp2, temp4); //a4 b4 c4 d4 a5 b5 c5 d5 + resq_r3_1 = _mm_unpackhi_epi32(temp2, temp4); //a6 b6 c6 d6 a7 b7 c7 d7 + /* + * e0 e1 e2 e3 e4 e5 e6 e7 + * f0 f1 f2 f3 f4 f5 f6 f7 + * g0 g1 g2 g3 g4 g5 g6 g7 + * h0 h1 h2 h3 h4 h5 h6 h7 + */ + temp1 = _mm_unpacklo_epi16(resq_r4_1, resq_r5_1); //e0 f0 e1 f1 e2 f2 e2 f3 + temp3 = _mm_unpacklo_epi16(resq_r6_1, resq_r7_1); //g0 h0 g1 h1 g2 h2 g3 h3 + temp2 = _mm_unpackhi_epi16(resq_r4_1, resq_r5_1); //e4 f4 e5 f5 e6 f6 e7 f7 + temp4 = _mm_unpackhi_epi16(resq_r6_1, resq_r7_1); //g4 h4 g5 h5 g6 h6 g7 h7 + resq_r4_1 = _mm_unpacklo_epi32(temp1, temp3); //e0 f0 g0 h0 e1 f1 g1 h1 + resq_r5_1 = _mm_unpackhi_epi32(temp1, temp3); //e2 f2 g2 h2 e3 f3 g3 h3 + resq_r6_1 = _mm_unpacklo_epi32(temp2, temp4); //e4 f4 g4 h4 e5 f5 g5 h5 + resq_r7_1 = _mm_unpackhi_epi32(temp2, temp4); //e6 f6 g6 h6 e7 f7 g7 h7 + /* + * a0 b0 c0 d0 a1 b1 c1 d1 + * a2 b2 c2 d2 a3 b3 c3 d3 + * a4 b4 c4 d4 a5 b5 c5 d5 + * a6 b6 c6 d6 a7 b7 c7 d7 + * e0 f0 g0 h0 e1 f1 g1 h1 + * e2 f2 g2 h2 e3 f3 g3 h3 + * e4 f4 g4 h4 e5 f5 g5 h5 + * e6 f6 g6 h6 e7 f7 g7 h7 + */ + resq_r0_2 = _mm_unpacklo_epi64(resq_r0_1, resq_r4_1); //a0 b0 c0 d0 e0 f0 g0 h0 + resq_r1_2 = _mm_unpackhi_epi64(resq_r0_1, resq_r4_1); //a1 b1 c1 d1 e1 f1 g1 h1 + resq_r2_2 = _mm_unpacklo_epi64(resq_r1_1, resq_r5_1); //a2 b2 c2 d2 e2 f2 g2 h2 + resq_r3_2 = _mm_unpackhi_epi64(resq_r1_1, resq_r5_1); //a3 b3 c3 d3 e3 f3 g3 h3 + resq_r4_2 = _mm_unpacklo_epi64(resq_r2_1, resq_r6_1); //a4 b4 c4 d4 e4 f4 g4 h4 + resq_r5_2 = _mm_unpackhi_epi64(resq_r2_1, resq_r6_1); //a5 b5 c5 d5 e5 f5 g5 h5 + resq_r6_2 = 
_mm_unpacklo_epi64(resq_r3_1, resq_r7_1); //a6 b6 c6 d6 e6 f6 g6 h6 + resq_r7_2 = _mm_unpackhi_epi64(resq_r3_1, resq_r7_1); //a7 b7 c7 d7 e7 f7 g7 h7 + + sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r1_2); + resq_r1_1 = _mm_unpacklo_epi16(resq_r1_2, sign_reg); //a1 b1 c1 d1 -- 32 bit + resq_r1_2 = _mm_unpackhi_epi16(resq_r1_2, sign_reg); //e1 f1 g1 h1 -- 32 bit + sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r3_2); + resq_r3_1 = _mm_unpacklo_epi16(resq_r3_2, sign_reg); //a3 b3 c3 d3 -- 32 bit + resq_r3_2 = _mm_unpackhi_epi16(resq_r3_2, sign_reg); //e3 f3 g3 h3 -- 32 bit + sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r5_2); + resq_r5_1 = _mm_unpacklo_epi16(resq_r5_2, sign_reg); //a5 b5 c5 d5 -- 32 bit + resq_r5_2 = _mm_unpackhi_epi16(resq_r5_2, sign_reg); //e5 f5 g5 h5 -- 32 bit + sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r7_2); + resq_r7_1 = _mm_unpacklo_epi16(resq_r7_2, sign_reg); //a7 b7 c7 d7 -- 32 bit + resq_r7_2 = _mm_unpackhi_epi16(resq_r7_2, sign_reg); //e7 f7 g7 h7 -- 32 bit + //Transform starts -- horizontal transform + /*------------------------------------------------------------------*/ + /* y0 = w0 + w4 */ + temp1 = _mm_add_epi16(resq_r0_2, resq_r4_2); + /* y2 = w0 - w4 */ + temp3 = _mm_sub_epi16(resq_r0_2, resq_r4_2); + /* y1 = -w3 + w5 - w7 - (w7 >> 1) */ + temp2 = _mm_sub_epi32(resq_r5_1, resq_r3_1); //-w3+w5 + temp10 = _mm_sub_epi32(resq_r5_2, resq_r3_2); + temp4 = _mm_sub_epi32(temp2, resq_r7_1); //-w3+w5-w7 + temp12 = _mm_sub_epi32(temp10, resq_r7_2); + temp5 = _mm_srai_epi32(resq_r7_1, 1); //w7>>1 + temp13 = _mm_srai_epi32(resq_r7_2, 1); + temp2 = _mm_sub_epi32(temp4, temp5); //-w3+w5-w7 -(w7>>1) + temp10 = _mm_sub_epi32(temp12, temp13); + temp2 = _mm_packs_epi32(temp2, temp10); + /* y3 = w1 + w7 - w3 - (w3 >> 1) */ + temp4 = _mm_add_epi32(resq_r1_1, resq_r7_1); //w1+w7 + temp12 = _mm_add_epi32(resq_r1_2, resq_r7_2); + temp4 = _mm_sub_epi32(temp4, resq_r3_1); //w1+w7-w3 + temp12 = _mm_sub_epi32(temp12, resq_r3_2); + temp5 = 
_mm_srai_epi32(resq_r3_1, 1); //w3>>1 + temp13 = _mm_srai_epi32(resq_r3_2, 1); + temp4 = _mm_sub_epi32(temp4, temp5); //w1+w7-w3-(w3>>1) + temp12 = _mm_sub_epi32(temp12, temp13); + temp4 = _mm_packs_epi32(temp4, temp12); + /* y4 = (w2 >> 1) - w6 */ + temp5 = _mm_srai_epi16(resq_r2_2, 1); //w2>>1 + temp5 = _mm_sub_epi16(temp5, resq_r6_2); //(w2>>1)-w6 + /* y5 = -w1 + w7 + w5 + (w5 >> 1) */ + temp6 = _mm_sub_epi32(resq_r7_1, resq_r1_1); //w7-w1 + temp14 = _mm_sub_epi32(resq_r7_2, resq_r1_2); + temp6 = _mm_add_epi32(temp6, resq_r5_1); //w7-w1+w5 + temp14 = _mm_add_epi32(temp14, resq_r5_2); + temp7 = _mm_srai_epi32(resq_r5_1, 1); //w5>>1 + temp15 = _mm_srai_epi32(resq_r5_2, 1); + temp6 = _mm_add_epi32(temp6, temp7); //w7-w1_w5+(w5>>1) + temp14 = _mm_add_epi32(temp14, temp15); + temp6 = _mm_packs_epi32(temp6, temp14); + /* y6 = w2 + (w6 >> 1) */ + temp7 = _mm_srai_epi16(resq_r6_2, 1); //w6>>1 + temp7 = _mm_add_epi16(temp7, resq_r2_2); //(w6>>1)+w2 + /* y7 = w3 + w5 + w1 + (w1 >> 1) */ + temp8 = _mm_add_epi32(resq_r3_1, resq_r5_1); //w3+w5 + temp16 = _mm_add_epi32(resq_r3_2, resq_r5_2); + temp8 = _mm_add_epi32(temp8, resq_r1_1); //w3+w5+w1 + temp16 = _mm_add_epi32(temp16, resq_r1_2); + temp17 = _mm_srai_epi32(resq_r1_1, 1); //w1>>1 + temp18 = _mm_srai_epi32(resq_r1_2, 1); + temp8 = _mm_add_epi32(temp8, temp17); //w3+w5+w1+(w1>>1) + temp16 = _mm_add_epi32(temp16, temp18); + temp8 = _mm_packs_epi32(temp8, temp16); + /*------------------------------------------------------------------*/ + /*------------------------------------------------------------------*/ + /* z0 = y0 + y6 */ + resq_r0_1 = _mm_add_epi16(temp1, temp7); + /* z1 = y1 + (y7 >> 2) */ + resq_r1_1 = _mm_srai_epi16(temp8, 2); + resq_r1_1 = _mm_add_epi16(resq_r1_1, temp2); + /* z2 = y2 + y4 */ + resq_r2_1 = _mm_add_epi16(temp3, temp5); + /* z3 = y3 + (y5 >> 2) */ + resq_r3_1 = _mm_srai_epi16(temp6, 2); + resq_r3_1 = _mm_add_epi16(resq_r3_1, temp4); + /* z4 = y2 - y4 */ + resq_r4_1 = _mm_sub_epi16(temp3, temp5); + 
/* z5 = (y3 >> 2) - y5 */ + resq_r5_1 = _mm_srai_epi16(temp4, 2); + resq_r5_1 = _mm_sub_epi16(resq_r5_1, temp6); + /* z6 = y0 - y6 */ + resq_r6_1 = _mm_sub_epi16(temp1, temp7); + /* z7 = y7 - (y1 >> 2) */ + resq_r7_1 = _mm_srai_epi16(temp2, 2); + resq_r7_1 = _mm_sub_epi16(temp8, resq_r7_1); + /*------------------------------------------------------------------*/ + /*------------------------------------------------------------------*/ + /* x0 = z0 + z7 */ + temp1 = _mm_add_epi16(resq_r0_1, resq_r7_1); + /* x1 = z2 + z5 */ + temp2 = _mm_add_epi16(resq_r2_1, resq_r5_1); + /* x2 = z4 + z3 */ + temp3 = _mm_add_epi16(resq_r4_1, resq_r3_1); + /* x3 = z6 + z1 */ + temp4 = _mm_add_epi16(resq_r6_1, resq_r1_1); + /* x4 = z6 - z1 */ + temp5 = _mm_sub_epi16(resq_r6_1, resq_r1_1); + /* x5 = z4 - z3 */ + temp6 = _mm_sub_epi16(resq_r4_1, resq_r3_1); + /* x6 = z2 - z5 */ + temp7 = _mm_sub_epi16(resq_r2_1, resq_r5_1); + /* x7 = z0 - z7 */ + temp8 = _mm_sub_epi16(resq_r0_1, resq_r7_1); + /*------------------------------------------------------------------*/ + // Matrix transpose + /* + * a0 b0 c0 d0 e0 f0 g0 h0 + * a1 b1 c1 d1 e1 f1 g1 h1 + * a2 b2 c2 d2 e2 f2 g2 h2 + * a3 b3 c3 d3 e3 f3 g3 h3 + */ + temp17 = _mm_unpacklo_epi16(temp1, temp2); //a0 a1 b0 b1 c0 c1 d0 d1 + temp19 = _mm_unpacklo_epi16(temp3, temp4); //a2 a3 b2 b3 c2 c3 d2 d3 + temp18 = _mm_unpackhi_epi16(temp1, temp2); //e0 e1 f0 f1 g0 g1 h0 h1 + temp20 = _mm_unpackhi_epi16(temp3, temp4); //e2 e3 f2 f3 g2 g3 h2 h3 + + resq_r0_1 = _mm_unpacklo_epi32(temp17, temp19); //a0 a1 a2 a3 b0 b1 b2 b3 + resq_r1_1 = _mm_unpackhi_epi32(temp17, temp19); //c0 c1 c2 c3 d0 d1 d2 d3 + resq_r2_1 = _mm_unpacklo_epi32(temp18, temp20); //e0 e1 e2 e3 f0 f1 f2 f3 + resq_r3_1 = _mm_unpackhi_epi32(temp18, temp20); //g0 g2 g2 g3 h0 h1 h2 h3 + /* + * a4 b4 c4 d4 e4 f4 g4 h4 + * a5 b5 c5 d5 e5 f5 g5 h5 + * a6 b6 c6 d6 e6 f6 g6 h6 + * a7 b7 c7 d7 e7 f7 g7 h7 + */ + temp17 = _mm_unpacklo_epi16(temp5, temp6); //a4 a5 b4 b5 c4 c5 d4 d5 + temp19 = 
_mm_unpacklo_epi16(temp7, temp8); //a6 a7 b6 b7 c6 c7 d6 d7 + temp18 = _mm_unpackhi_epi16(temp5, temp6); //e4 e5 f4 f5 g4 g5 h4 h5 + temp20 = _mm_unpackhi_epi16(temp7, temp8); //e6 e7 f6 f7 g6 g7 h6 h7 + + resq_r4_1 = _mm_unpacklo_epi32(temp17, temp19); //a4 a5 a6 a7 b4 b5 b6 b7 + resq_r5_1 = _mm_unpackhi_epi32(temp17, temp19); //c4 c5 c6 c7 d4 d5 d6 d7 + resq_r6_1 = _mm_unpacklo_epi32(temp18, temp20); //e4 e5 e6 e7 f4 f5 f6 f7 + resq_r7_1 = _mm_unpackhi_epi32(temp18, temp20); //g4 g5 g6 g7 h4 h5 h6 h7 + /* a0 a1 a2 a3 b0 b1 b2 b3 + * c0 c1 c2 c3 d0 d1 d2 d3 + * e0 e1 e2 e3 f0 f1 f2 f3 + * g0 g2 g2 g3 h0 h1 h2 h3 + * a4 a5 a6 a7 b4 b5 b6 b7 + * c4 c5 c6 c7 d4 d5 d6 d7 + * e4 e5 e6 e7 f4 f5 f6 f7 + * g4 g5 g6 g7 h4 h5 h6 h7 + */ + resq_r0_2 = _mm_unpacklo_epi64(resq_r0_1, resq_r4_1); //a0 a1 a2 a3 a4 a5 a6 a7 + resq_r1_2 = _mm_unpackhi_epi64(resq_r0_1, resq_r4_1); //b0 b1 b2 b3 b4 b5 b6 b7 + resq_r2_2 = _mm_unpacklo_epi64(resq_r1_1, resq_r5_1); //c0 c1 c2 c3 c4 c5 c6 c7 + resq_r3_2 = _mm_unpackhi_epi64(resq_r1_1, resq_r5_1); //d0 d1 d2 d3 d4 d5 d6 d7 + resq_r4_2 = _mm_unpacklo_epi64(resq_r2_1, resq_r6_1); //e0 e1 e2 e3 e4 e5 e6 e7 + resq_r5_2 = _mm_unpackhi_epi64(resq_r2_1, resq_r6_1); //f0 f1 f2 f3 f4 f5 f6 f7 + resq_r6_2 = _mm_unpacklo_epi64(resq_r3_1, resq_r7_1); //g0 g1 g2 g3 g4 g5 g6 g7 + resq_r7_2 = _mm_unpackhi_epi64(resq_r3_1, resq_r7_1); //h0 h1 h2 h3 h4 h5 h6 h7 + + sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r1_2); + resq_r1_1 = _mm_unpacklo_epi16(resq_r1_2, sign_reg); //a1 b1 c1 d1 -- 32 bit + resq_r1_2 = _mm_unpackhi_epi16(resq_r1_2, sign_reg); //e1 f1 g1 h1 -- 32 bit + sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r3_2); + resq_r3_1 = _mm_unpacklo_epi16(resq_r3_2, sign_reg); //a3 b3 c3 d3 -- 32 bit + resq_r3_2 = _mm_unpackhi_epi16(resq_r3_2, sign_reg); //e3 f3 g3 h3 -- 32 bit + sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r5_2); + resq_r5_1 = _mm_unpacklo_epi16(resq_r5_2, sign_reg); //a5 b5 c5 d5 -- 32 bit + resq_r5_2 = _mm_unpackhi_epi16(resq_r5_2, 
sign_reg); //e5 f5 g5 h5 -- 32 bit + sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r7_2); + resq_r7_1 = _mm_unpacklo_epi16(resq_r7_2, sign_reg); //a7 b7 c7 d7 -- 32 bit + resq_r7_2 = _mm_unpackhi_epi16(resq_r7_2, sign_reg); //e7 f7 g7 h7 -- 32 bit + + zero_8x16b = _mm_setzero_si128(); // all bits reset to zero + //Load pred buffer row 0 + predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r0_1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits + //Load pred buffer row 1 + predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r1_1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits + //Load pred buffer row 2 + predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r2_1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits + //Load pred buffer row 3 + predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r3_1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits + //Load pred buffer row 4 + predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[4 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r4_1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits + //Load pred buffer row 5 + predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[5 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bit + pred_r5_1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits + //Load pred buffer row 6 + predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[6 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r6_1 = 
_mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits + //Load pred buffer row 7 + predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[7 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r7_1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits + + /*--------------------------------------------------------------------*/ + /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */ + /* */ + /* Add the prediction and store it back to reconstructed frame buffer */ + /* [Prediction buffer itself in this case] */ + /*--------------------------------------------------------------------*/ + + /* y0j = w0j + w4j */ + temp1 = _mm_add_epi16(resq_r0_2, resq_r4_2); + /* y2j = w0j - w4j */ + temp3 = _mm_sub_epi16(resq_r0_2, resq_r4_2); + /* y1j = -w3j + w5j - w7j - (w7j >> 1) */ + temp2 = _mm_sub_epi32(resq_r5_1, resq_r3_1); //-w3+w5 + temp10 = _mm_sub_epi32(resq_r5_2, resq_r3_2); + temp4 = _mm_sub_epi32(temp2, resq_r7_1); //-w3+w5-w7 + temp12 = _mm_sub_epi32(temp10, resq_r7_2); + temp5 = _mm_srai_epi32(resq_r7_1, 1); //w7>>1 + temp13 = _mm_srai_epi32(resq_r7_2, 1); + temp2 = _mm_sub_epi32(temp4, temp5); //-w3+w5-w7 -(w7>>1) + temp10 = _mm_sub_epi32(temp12, temp13); + temp2 = _mm_packs_epi32(temp2, temp10); + /* y3j = w1j + w7j - w3j - (w3j >> 1) */ + temp4 = _mm_add_epi32(resq_r1_1, resq_r7_1); //w1+w7 + temp12 = _mm_add_epi32(resq_r1_2, resq_r7_2); + temp4 = _mm_sub_epi32(temp4, resq_r3_1); //w1+w7-w3 + temp12 = _mm_sub_epi32(temp12, resq_r3_2); + temp5 = _mm_srai_epi32(resq_r3_1, 1); //w3>>1 + temp13 = _mm_srai_epi32(resq_r3_2, 1); + temp4 = _mm_sub_epi32(temp4, temp5); //w1+w7-w3-(w3>>1) + temp12 = _mm_sub_epi32(temp12, temp13); + temp4 = _mm_packs_epi32(temp4, temp12); + /* y4j = (w2j >> 1) - w6j */ + temp5 = _mm_srai_epi16(resq_r2_2, 1); //w2>>1 + temp5 = _mm_sub_epi16(temp5, resq_r6_2); //(w2>>1)-w6 + /* y5j = -w1j + w7j + w5j + (w5j >> 1) */ + temp6 = _mm_sub_epi32(resq_r7_1, 
resq_r1_1); //w7-w1 + temp14 = _mm_sub_epi32(resq_r7_2, resq_r1_2); + temp6 = _mm_add_epi32(temp6, resq_r5_1); //w7-w1+w5 + temp14 = _mm_add_epi32(temp14, resq_r5_2); + temp7 = _mm_srai_epi32(resq_r5_1, 1); //w5>>1 + temp15 = _mm_srai_epi32(resq_r5_2, 1); + temp6 = _mm_add_epi32(temp6, temp7); //w7-w1_w5+(w5>>1) + temp14 = _mm_add_epi32(temp14, temp15); + temp6 = _mm_packs_epi32(temp6, temp14); + /* y6j = w2j + (w6j >> 1) */ + temp7 = _mm_srai_epi16(resq_r6_2, 1); //w6>>1 + temp7 = _mm_add_epi16(temp7, resq_r2_2); //(w6>>1)+w2 + /* y7j = w3j + w5j + w1j + (w1j >> 1) */ + temp8 = _mm_add_epi32(resq_r3_1, resq_r5_1); //w3+w5 + temp16 = _mm_add_epi32(resq_r3_2, resq_r5_2); + temp8 = _mm_add_epi32(temp8, resq_r1_1); //w3+w5+w1 + temp16 = _mm_add_epi32(temp16, resq_r1_2); + temp17 = _mm_srai_epi32(resq_r1_1, 1); //w1>>1 + temp18 = _mm_srai_epi32(resq_r1_2, 1); + temp8 = _mm_add_epi32(temp8, temp17); //w3+w5+w1+(w1>>1) + temp16 = _mm_add_epi32(temp16, temp18); + temp8 = _mm_packs_epi32(temp8, temp16); + /*------------------------------------------------------------------*/ + /*------------------------------------------------------------------*/ + /* z0j = y0j + y6j */ + resq_r0_1 = _mm_add_epi16(temp1, temp7); + /* z1j = y1j + (y7j >> 2) */ + resq_r1_1 = _mm_srai_epi16(temp8, 2); + resq_r1_1 = _mm_add_epi16(resq_r1_1, temp2); + /* z2j = y2j + y4j */ + resq_r2_1 = _mm_add_epi16(temp3, temp5); + /* z3j = y3j + (y5j >> 2) */ + resq_r3_1 = _mm_srai_epi16(temp6, 2); + resq_r3_1 = _mm_add_epi16(resq_r3_1, temp4); + /* z4j = y2j - y4j */ + resq_r4_1 = _mm_sub_epi16(temp3, temp5); + /* z5j = (y3j >> 2) - y5j */ + resq_r5_1 = _mm_srai_epi16(temp4, 2); + resq_r5_1 = _mm_sub_epi16(resq_r5_1, temp6); + /* z6j = y0j - y6j */ + resq_r6_1 = _mm_sub_epi16(temp1, temp7); + /* z7j = y7j - (y1j >> 2) */ + resq_r7_1 = _mm_srai_epi16(temp2, 2); + resq_r7_1 = _mm_sub_epi16(temp8, resq_r7_1); + /*------------------------------------------------------------------*/ + + 
/*------------------------------------------------------------------*/ + /* x0j = z0j + z7j */ + temp1 = _mm_add_epi16(resq_r0_1, resq_r7_1); + sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp1); + temp10 = _mm_unpacklo_epi16(temp1, sign_reg); + temp11 = _mm_unpackhi_epi16(temp1, sign_reg); + temp10 = _mm_add_epi32(temp10, value_32); + temp11 = _mm_add_epi32(temp11, value_32); + temp10 = _mm_srai_epi32(temp10, 6); + temp11 = _mm_srai_epi32(temp11, 6); + temp10 = _mm_packs_epi32(temp10, temp11); + temp1 = _mm_add_epi16(temp10, pred_r0_1); + /* x1j = z2j + z5j */ + temp2 = _mm_add_epi16(resq_r2_1, resq_r5_1); + sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp2); + temp10 = _mm_unpacklo_epi16(temp2, sign_reg); + temp11 = _mm_unpackhi_epi16(temp2, sign_reg); + temp10 = _mm_add_epi32(temp10, value_32); + temp11 = _mm_add_epi32(temp11, value_32); + temp10 = _mm_srai_epi32(temp10, 6); + temp11 = _mm_srai_epi32(temp11, 6); + temp10 = _mm_packs_epi32(temp10, temp11); + temp2 = _mm_add_epi16(temp10, pred_r1_1); + /* x2j = z4j + z3j */ + temp3 = _mm_add_epi16(resq_r4_1, resq_r3_1); + sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp3); + temp10 = _mm_unpacklo_epi16(temp3, sign_reg); + temp11 = _mm_unpackhi_epi16(temp3, sign_reg); + temp10 = _mm_add_epi32(temp10, value_32); + temp11 = _mm_add_epi32(temp11, value_32); + temp10 = _mm_srai_epi32(temp10, 6); + temp11 = _mm_srai_epi32(temp11, 6); + temp10 = _mm_packs_epi32(temp10, temp11); + temp3 = _mm_add_epi16(temp10, pred_r2_1); + /* x3j = z6j + z1j */ + temp4 = _mm_add_epi16(resq_r6_1, resq_r1_1); + sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp4); + temp10 = _mm_unpacklo_epi16(temp4, sign_reg); + temp11 = _mm_unpackhi_epi16(temp4, sign_reg); + temp10 = _mm_add_epi32(temp10, value_32); + temp11 = _mm_add_epi32(temp11, value_32); + temp10 = _mm_srai_epi32(temp10, 6); + temp11 = _mm_srai_epi32(temp11, 6); + temp10 = _mm_packs_epi32(temp10, temp11); + temp4 = _mm_add_epi16(temp10, pred_r3_1); + /* x4j = z6j - z1j */ + temp5 = 
_mm_sub_epi16(resq_r6_1, resq_r1_1); + sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp5); + temp10 = _mm_unpacklo_epi16(temp5, sign_reg); + temp11 = _mm_unpackhi_epi16(temp5, sign_reg); + temp10 = _mm_add_epi32(temp10, value_32); + temp11 = _mm_add_epi32(temp11, value_32); + temp10 = _mm_srai_epi32(temp10, 6); + temp11 = _mm_srai_epi32(temp11, 6); + temp10 = _mm_packs_epi32(temp10, temp11); + temp5 = _mm_add_epi16(temp10, pred_r4_1); + /* x5j = z4j - z3j */ + temp6 = _mm_sub_epi16(resq_r4_1, resq_r3_1); + sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp6); + temp10 = _mm_unpacklo_epi16(temp6, sign_reg); + temp11 = _mm_unpackhi_epi16(temp6, sign_reg); + temp10 = _mm_add_epi32(temp10, value_32); + temp11 = _mm_add_epi32(temp11, value_32); + temp10 = _mm_srai_epi32(temp10, 6); + temp11 = _mm_srai_epi32(temp11, 6); + temp10 = _mm_packs_epi32(temp10, temp11); + temp6 = _mm_add_epi16(temp10, pred_r5_1); + /* x6j = z2j - z5j */ + temp7 = _mm_sub_epi16(resq_r2_1, resq_r5_1); + sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp7); + temp10 = _mm_unpacklo_epi16(temp7, sign_reg); + temp11 = _mm_unpackhi_epi16(temp7, sign_reg); + temp10 = _mm_add_epi32(temp10, value_32); + temp11 = _mm_add_epi32(temp11, value_32); + temp10 = _mm_srai_epi32(temp10, 6); + temp11 = _mm_srai_epi32(temp11, 6); + temp10 = _mm_packs_epi32(temp10, temp11); + temp7 = _mm_add_epi16(temp10, pred_r6_1); + /* x7j = z0j - z7j */ + temp8 = _mm_sub_epi16(resq_r0_1, resq_r7_1); + sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp8); + temp10 = _mm_unpacklo_epi16(temp8, sign_reg); + temp11 = _mm_unpackhi_epi16(temp8, sign_reg); + temp10 = _mm_add_epi32(temp10, value_32); + temp11 = _mm_add_epi32(temp11, value_32); + temp10 = _mm_srai_epi32(temp10, 6); + temp11 = _mm_srai_epi32(temp11, 6); + temp10 = _mm_packs_epi32(temp10, temp11); + temp8 = _mm_add_epi16(temp10, pred_r7_1); + /*------------------------------------------------------------------*/ + //Clipping the results to 8 bits + sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b); // 
sign check + temp1 = _mm_and_si128(temp1, sign_reg); + sign_reg = _mm_cmpgt_epi16(temp2, zero_8x16b); // sign check + temp2 = _mm_and_si128(temp2, sign_reg); + sign_reg = _mm_cmpgt_epi16(temp3, zero_8x16b); // sign check + temp3 = _mm_and_si128(temp3, sign_reg); + sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b); // sign check + temp4 = _mm_and_si128(temp4, sign_reg); + sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b); // sign check + temp5 = _mm_and_si128(temp5, sign_reg); + sign_reg = _mm_cmpgt_epi16(temp6, zero_8x16b); // sign check + temp6 = _mm_and_si128(temp6, sign_reg); + sign_reg = _mm_cmpgt_epi16(temp7, zero_8x16b); // sign check + temp7 = _mm_and_si128(temp7, sign_reg); + sign_reg = _mm_cmpgt_epi16(temp8, zero_8x16b); // sign check + temp8 = _mm_and_si128(temp8, sign_reg); + + resq_r0_2 = _mm_packus_epi16(temp1, zero_8x16b); + resq_r1_2 = _mm_packus_epi16(temp2, zero_8x16b); + resq_r2_2 = _mm_packus_epi16(temp3, zero_8x16b); + resq_r3_2 = _mm_packus_epi16(temp4, zero_8x16b); + resq_r4_2 = _mm_packus_epi16(temp5, zero_8x16b); + resq_r5_2 = _mm_packus_epi16(temp6, zero_8x16b); + resq_r6_2 = _mm_packus_epi16(temp7, zero_8x16b); + resq_r7_2 = _mm_packus_epi16(temp8, zero_8x16b); + + _mm_storel_epi64((__m128i *) (&pu1_out[0]), resq_r0_2); + _mm_storel_epi64((__m128i *) (&pu1_out[out_strd]), resq_r1_2); + _mm_storel_epi64((__m128i *) (&pu1_out[2 * out_strd]), resq_r2_2); + _mm_storel_epi64((__m128i *) (&pu1_out[3 * out_strd]), resq_r3_2); + _mm_storel_epi64((__m128i *) (&pu1_out[4 * out_strd]), resq_r4_2); + _mm_storel_epi64((__m128i *) (&pu1_out[5 * out_strd]), resq_r5_2); + _mm_storel_epi64((__m128i *) (&pu1_out[6 * out_strd]), resq_r6_2); + _mm_storel_epi64((__m128i *) (&pu1_out[7 * out_strd]), resq_r7_2); +} + diff --git a/common/x86/ih264_luma_intra_pred_filters_ssse3.c b/common/x86/ih264_luma_intra_pred_filters_ssse3.c new file mode 100755 index 0000000..5a35372 --- /dev/null +++ b/common/x86/ih264_luma_intra_pred_filters_ssse3.c @@ -0,0 +1,2282 @@ 
+/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_luma_intra_pred_filters_ssse3.c + * + * @brief + * Contains function definitions for luma intra prediction filters in x86 + * intrinsics + * + * @author + * Ittiam + * + * @par List of Functions: + * - ih264_intra_pred_luma_4x4_mode_vert_ssse3 + * - ih264_intra_pred_luma_4x4_mode_horz_ssse3 + * - ih264_intra_pred_luma_4x4_mode_dc_ssse3 + * - ih264_intra_pred_luma_4x4_mode_diag_dl_ssse3 + * - ih264_intra_pred_luma_4x4_mode_diag_dr_ssse3 + * - ih264_intra_pred_luma_4x4_mode_vert_r_ssse3 + * - ih264_intra_pred_luma_4x4_mode_horz_d_ssse3 + * - ih264_intra_pred_luma_4x4_mode_vert_l_ssse3 + * - ih264_intra_pred_luma_4x4_mode_horz_u_ssse3 + * - ih264_intra_pred_luma_8x8_mode_vert_ssse3 + * - ih264_intra_pred_luma_8x8_mode_horz_ssse3 + * - ih264_intra_pred_luma_8x8_mode_dc_ssse3 + * - ih264_intra_pred_luma_8x8_mode_diag_dl_ssse3 + * - ih264_intra_pred_luma_8x8_mode_diag_dr_ssse3 + * - ih264_intra_pred_luma_8x8_mode_vert_r_ssse3 + * - ih264_intra_pred_luma_8x8_mode_horz_d_ssse3 + * - 
ih264_intra_pred_luma_8x8_mode_vert_l_ssse3 + * - ih264_intra_pred_luma_8x8_mode_horz_u_ssse3 + * - ih264_intra_pred_luma_16x16_mode_vert_ssse3 + * - ih264_intra_pred_luma_16x16_mode_horz_ssse3 + * - ih264_intra_pred_luma_16x16_mode_dc_ssse3 + * - ih264_intra_pred_luma_16x16_mode_plane_ssse3 + * + * @remarks + * None + * + ****************************************************************************** + */ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ +/* System include files */ +#include <stdio.h> +#include <stddef.h> +#include <string.h> +#include <immintrin.h> + +/* User include files */ +#include "ih264_defs.h" +#include "ih264_typedefs.h" +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include "ih264_intra_pred_filters.h" + + + +/******************* LUMA INTRAPREDICTION *******************/ + +/******************* 4x4 Modes *******************/ + +/** + ******************************************************************************* + * + * ih264_intra_pred_luma_4x4_mode_vert_ssse3 + * + * @brief + * Perform Intra prediction for luma_4x4 mode:vertical + * + * @par Description: + * Perform Intra prediction for luma_4x4 mode:vertical ,described in sec 8.3.1.2.1 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ +void ih264_intra_pred_luma_4x4_mode_vert_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_top; + WORD32 dst_strd2, 
dst_strd3; + + __m128i top_16x8b; + __m128i mask_full_128b, mask_low_32b; + + UNUSED(src_strd); + UNUSED(ngbr_avail); + + mask_full_128b = _mm_set1_epi8(0xff); + + pu1_top = pu1_src + BLK_SIZE + 1; + + mask_low_32b = _mm_srli_si128(mask_full_128b, 12); + + top_16x8b = _mm_loadl_epi64((__m128i *)pu1_top); + + dst_strd2 = dst_strd << 1; + dst_strd3 = dst_strd + dst_strd2; + + _mm_maskmoveu_si128(top_16x8b, mask_low_32b, (char*)pu1_dst); + _mm_maskmoveu_si128(top_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd)); + _mm_maskmoveu_si128(top_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2)); + _mm_maskmoveu_si128(top_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3)); +} + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_4x4_mode_horz_ssse3 + * + * @brief + * Perform Intra prediction for luma_4x4 mode:horizontal + * + * @par Description: + * Perform Intra prediction for luma_4x4 mode:horizontal ,described in sec 8.3.1.2.2 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ +void ih264_intra_pred_luma_4x4_mode_horz_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_left; + WORD32 dst_strd2, dst_strd3; + WORD32 val1, val2; + + __m128i left_16x8b; + __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b; + __m128i mask_full_128b, mask_low_32b; + + UNUSED(src_strd); + UNUSED(ngbr_avail); + + mask_full_128b = _mm_set1_epi8(0xff); + + pu1_left = pu1_src + BLK_SIZE - 1; + + mask_low_32b = _mm_srli_si128(mask_full_128b, 12); + 
left_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 3)); + + val1 = _mm_extract_epi16(left_16x8b, 1); + val2 = _mm_extract_epi16(left_16x8b, 0); + + row1_16x8b = _mm_set1_epi8(val1 >> 8); + row2_16x8b = _mm_set1_epi8(val1 & 0xff); + row3_16x8b = _mm_set1_epi8(val2 >> 8); + row4_16x8b = _mm_set1_epi8(val2 & 0xff); + + dst_strd2 = dst_strd << 1; + dst_strd3 = dst_strd + dst_strd2; + + _mm_maskmoveu_si128(row1_16x8b, mask_low_32b, (char*)pu1_dst); + _mm_maskmoveu_si128(row2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd)); + _mm_maskmoveu_si128(row3_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2)); + _mm_maskmoveu_si128(row4_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3)); +} + +/** + ******************************************************************************* + * + * ih264_intra_pred_luma_4x4_mode_dc_ssse3 + * + * @brief + * Perform Intra prediction for luma_4x4 mode:DC + * + * @par Description: + * Perform Intra prediction for luma_4x4 mode:DC ,described in sec 8.3.1.2.3 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_4x4_mode_dc_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 u1_useleft; /* availability of left predictors (only for DC) */ + UWORD8 u1_usetop; /* availability of top predictors (only for DC) */ + UWORD8 *pu1_left, *pu1_top; + WORD32 dc_val, flag; + WORD32 dst_strd2, dst_strd3; + + __m128i mask_full_128b, mask_low_32b; + __m128i dcval_16x8b; + + UNUSED(src_strd); + UNUSED(ngbr_avail); + + mask_full_128b = _mm_set1_epi8(0xff); + + u1_useleft = BOOLEAN(ngbr_avail & 
LEFT_MB_AVAILABLE_MASK); + u1_usetop = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK); + + pu1_left = pu1_src + BLK_SIZE - 1; + pu1_top = pu1_src + BLK_SIZE + 1; + + mask_low_32b = _mm_srli_si128(mask_full_128b, 12); + + flag = u1_useleft + u1_usetop; + + if(flag) + { + WORD32 shft, ofst = 0; + + __m128i left_16x8b, top_16x8b, val_16x8b, tmp_8x16b, zero_vector; + + if(u1_useleft) + { + left_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 3)); + ofst += 2; + } + else + left_16x8b = _mm_setzero_si128(); + + zero_vector = _mm_setzero_si128(); + + if(u1_usetop) + { + top_16x8b = _mm_loadl_epi64((__m128i *)pu1_top); + ofst += 2; + } + else + top_16x8b = _mm_setzero_si128(); + + shft = flag + 1; + val_16x8b = _mm_unpacklo_epi32(left_16x8b, top_16x8b); + tmp_8x16b = _mm_sad_epu8(val_16x8b, zero_vector); + + dc_val = _mm_extract_epi16(tmp_8x16b, 0); + dc_val = (dc_val + ofst) >> shft; + } + else + dc_val = 128; + + dst_strd2 = dst_strd << 1; + dst_strd3 = dst_strd + dst_strd2; + + dcval_16x8b = _mm_set1_epi8(dc_val); + + _mm_maskmoveu_si128(dcval_16x8b, mask_low_32b, (char*)pu1_dst); + _mm_maskmoveu_si128(dcval_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd)); + _mm_maskmoveu_si128(dcval_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2)); + _mm_maskmoveu_si128(dcval_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3)); +} + +/** + ******************************************************************************* + * + * ih264_intra_pred_luma_4x4_mode_diag_dl_ssse3 + * + * @brief + * Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left + * + * @par Description: + * Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left ,described in sec 8.3.1.2.4 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this 
function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_4x4_mode_diag_dl_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_top; + WORD32 dst_strd2, dst_strd3; + + __m128i top_16x8b, top_8x16b, top_sh_8x16b; + __m128i res1_8x16b, res2_8x16b, res_16x8b; + __m128i zero_vector, const_2_8x16b; + __m128i mask_full_128b, mask_low_32b; + + UNUSED(src_strd); + UNUSED(ngbr_avail); + + pu1_top = pu1_src + BLK_SIZE + 1; + + top_16x8b = _mm_loadl_epi64((__m128i *)pu1_top); + zero_vector = _mm_setzero_si128(); + top_8x16b = _mm_unpacklo_epi8(top_16x8b, zero_vector); //t0 t1 t2 t3 t4 t5 t6 t7 + + mask_full_128b = _mm_set1_epi8(0xff); + top_sh_8x16b = _mm_srli_si128(top_8x16b, 2); //t1 t2 t3 t4 t5 t6 t7 0 + const_2_8x16b = _mm_set1_epi16(2); + + top_sh_8x16b = _mm_shufflehi_epi16(top_sh_8x16b, 0xa4); //t1 t2 t3 t4 t5 t6 t7 t7 + res1_8x16b = _mm_add_epi16(top_8x16b, top_sh_8x16b); + mask_low_32b = _mm_srli_si128(mask_full_128b, 12); + res2_8x16b = _mm_srli_si128(res1_8x16b, 2); + + res1_8x16b = _mm_add_epi16(res1_8x16b, const_2_8x16b); + res1_8x16b = _mm_add_epi16(res2_8x16b, res1_8x16b); + res1_8x16b = _mm_srai_epi16(res1_8x16b, 2); + + dst_strd2 = dst_strd << 1; + dst_strd3 = dst_strd + dst_strd2; + + res_16x8b = _mm_packus_epi16(res1_8x16b, res1_8x16b); + _mm_maskmoveu_si128(res_16x8b, mask_low_32b, (char*)pu1_dst); + res_16x8b = _mm_srli_si128(res_16x8b, 1); + _mm_maskmoveu_si128(res_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd)); + res_16x8b = _mm_srli_si128(res_16x8b, 1); + _mm_maskmoveu_si128(res_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2)); + res_16x8b = _mm_srli_si128(res_16x8b, 1); + _mm_maskmoveu_si128(res_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3)); +} + +/** + ******************************************************************************* + * + * 
ih264_intra_pred_luma_4x4_mode_diag_dr_ssse3 + * + * @brief + * Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right + * + * @par Description: + * Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right ,described in sec 8.3.1.2.5 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_4x4_mode_diag_dr_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_left; + WORD32 dst_strd2, dst_strd3; + + __m128i top_left_16x8b, top_left_8x16b; + __m128i top_left_sh_16x8b, top_left_sh_8x16b; + __m128i res1_8x16b, res2_8x16b; + __m128i res1_16x8b, res2_16x8b; + __m128i zero_vector, const_2_8x16b; + __m128i mask_full_128b, mask_low_32b; + + UNUSED(src_strd); + UNUSED(ngbr_avail); + + pu1_left = pu1_src + BLK_SIZE - 1; + + top_left_16x8b = _mm_loadu_si128((__m128i *)(pu1_left - 3)); //l3 l2 l1 l0 tl t0 t1 t2... + zero_vector = _mm_setzero_si128(); + top_left_sh_16x8b = _mm_srli_si128(top_left_16x8b, 1); //l2 l1 l0 tl t0 t1 t2 t3... + + top_left_8x16b = _mm_unpacklo_epi8(top_left_16x8b, zero_vector); + top_left_sh_8x16b = _mm_unpacklo_epi8(top_left_sh_16x8b, zero_vector); + + mask_full_128b = _mm_set1_epi8(0xff); + res1_8x16b = _mm_add_epi16(top_left_8x16b, top_left_sh_8x16b); //l3+l2 l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2+t3... + const_2_8x16b = _mm_set1_epi16(2); + res2_8x16b = _mm_srli_si128(res1_8x16b, 2); //l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2+t3... 
+ + res1_8x16b = _mm_add_epi16(res1_8x16b, const_2_8x16b); + mask_low_32b = _mm_srli_si128(mask_full_128b, 12); + res1_8x16b = _mm_add_epi16(res2_8x16b, res1_8x16b); //l3+2*l2+l1+2 l2+2*l1+l0+2... + res1_8x16b = _mm_srai_epi16(res1_8x16b, 2); + res1_16x8b = _mm_packus_epi16(res1_8x16b, res1_8x16b); + + dst_strd2 = dst_strd << 1; + dst_strd3 = dst_strd + dst_strd2; + + res2_16x8b = _mm_srli_si128(res1_16x8b, 3); + _mm_maskmoveu_si128(res2_16x8b, mask_low_32b, (char*)pu1_dst); + res2_16x8b = _mm_srli_si128(res1_16x8b, 2); + _mm_maskmoveu_si128(res2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd)); + res2_16x8b = _mm_srli_si128(res1_16x8b, 1); + _mm_maskmoveu_si128(res2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2)); + _mm_maskmoveu_si128(res1_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3)); +} + +/** + ******************************************************************************* + * + * ih264_intra_pred_luma_4x4_mode_vert_r_ssse3 + * + * @brief + * Perform Intra prediction for luma_4x4 mode:Vertical_Right + * + * @par Description: + * Perform Intra prediction for luma_4x4 mode:Vertical_Right ,described in sec 8.3.1.2.6 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_4x4_mode_vert_r_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_left; + WORD32 dst_strd2, dst_strd3; + + __m128i val_16x8b, temp_16x8b; + __m128i w11_a1_16x8b, w11_a2_16x8b; + __m128i w121_a1_8x16b, w121_a2_8x16b, w121_sh_8x16b; + __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b; + 
__m128i zero_vector, const_2_8x16b; + __m128i mask_full_128b, mask_low_32b; + + UNUSED(src_strd); + UNUSED(ngbr_avail); + + mask_full_128b = _mm_set1_epi8(0xff); + mask_low_32b = _mm_srli_si128(mask_full_128b, 12); + + pu1_left = pu1_src + BLK_SIZE - 1; + + val_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 2)); + zero_vector = _mm_setzero_si128(); + + w121_a1_8x16b = _mm_unpacklo_epi8(val_16x8b, zero_vector); //l2 l1 l0 tl t0 t1 t2 t3 + w11_a1_16x8b = _mm_srli_si128(val_16x8b, 3); + w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2); //l1 l0 tl t0 t1 t2 t3 0 + w11_a2_16x8b = _mm_srli_si128(val_16x8b, 4); + + w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2+t3 t3 + row1_16x8b = _mm_avg_epu8(w11_a1_16x8b, w11_a2_16x8b); + w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2); //l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2+t3 t3 0 + + const_2_8x16b = _mm_set1_epi16(2); + w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //l2+2*l1+l0 l1+2*l0+tl ... 
+ w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, const_2_8x16b); + w121_a1_8x16b = _mm_srai_epi16(w121_a1_8x16b, 2); + + w121_sh_8x16b = _mm_shufflelo_epi16(w121_a1_8x16b, 0xe1); + w121_sh_8x16b = _mm_srli_si128(w121_sh_8x16b, 2); + + row4_16x8b = _mm_packus_epi16(w121_sh_8x16b, w121_sh_8x16b); + temp_16x8b = _mm_slli_si128(w121_a1_8x16b, 13); + row2_16x8b = _mm_srli_si128(row4_16x8b, 1); + row3_16x8b = _mm_alignr_epi8(row1_16x8b, temp_16x8b, 15); + + dst_strd2 = dst_strd << 1; + dst_strd3 = dst_strd + dst_strd2; + + _mm_maskmoveu_si128(row1_16x8b, mask_low_32b, (char*)pu1_dst); + _mm_maskmoveu_si128(row2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd)); + _mm_maskmoveu_si128(row3_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2)); + _mm_maskmoveu_si128(row4_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3)); +} + +/* + ******************************************************************************* + * + * ih264_intra_pred_luma_4x4_mode_horz_d_ssse3 + * + * @brief + * Perform Intra prediction for luma_4x4 mode:Horizontal_Down + * + * @par Description: + * Perform Intra prediction for luma_4x4 mode:Horizontal_Down ,described in sec 8.3.1.2.7 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_4x4_mode_horz_d_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_left; + WORD32 dst_strd2, dst_strd3; + WORD32 val_121_t0t1; + + __m128i val_16x8b, val_sh_16x8b; + __m128i w11_16x8b; + __m128i w121_a1_8x16b, w121_a2_8x16b, w121_16x8b; + __m128i row1_16x8b, row2_16x8b, 
row3_16x8b, row4_16x8b; + + __m128i zero_vector, const_2_8x16b; + __m128i mask_full_128b, mask_low_32b; + + UNUSED(src_strd); + UNUSED(ngbr_avail); + + mask_full_128b = _mm_set1_epi8(0xff); + mask_low_32b = _mm_srli_si128(mask_full_128b, 12); + + pu1_left = pu1_src + BLK_SIZE - 1; + + val_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 3)); + zero_vector = _mm_setzero_si128(); + val_sh_16x8b = _mm_srli_si128(val_16x8b, 1); + w11_16x8b = _mm_avg_epu8(val_16x8b, val_sh_16x8b); + + w121_a1_8x16b = _mm_unpacklo_epi8(val_16x8b, zero_vector); //l3 l2 l1 l0 tl t0 t1 t2 + w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2); //l2 l1 l0 tl t0 t1 t2 0 + w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //l3+l2 l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2 + w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2); //l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2 0 + + zero_vector = _mm_setzero_si128(); + const_2_8x16b = _mm_set1_epi16(2); + + w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //l3+2*l2+l1 l2+2*l1+l0 l1+2*l0+tl ... 
+ w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, const_2_8x16b); + w121_a1_8x16b = _mm_srai_epi16(w121_a1_8x16b, 2); + + w121_16x8b = _mm_packus_epi16(w121_a1_8x16b, w121_a1_8x16b); + + row4_16x8b = _mm_unpacklo_epi8(w11_16x8b, w121_16x8b); + val_121_t0t1 = _mm_extract_epi16(w121_16x8b, 2); + row4_16x8b = _mm_insert_epi16(row4_16x8b, val_121_t0t1, 4); + + dst_strd2 = dst_strd << 1; + dst_strd3 = dst_strd + dst_strd2; + + row1_16x8b = _mm_srli_si128(row4_16x8b, 6); + row2_16x8b = _mm_srli_si128(row4_16x8b, 4); + row3_16x8b = _mm_srli_si128(row4_16x8b, 2); + + _mm_maskmoveu_si128(row1_16x8b, mask_low_32b, (char*)pu1_dst); + _mm_maskmoveu_si128(row2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd)); + _mm_maskmoveu_si128(row3_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2)); + _mm_maskmoveu_si128(row4_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3)); +} + +/** + ******************************************************************************* + * + * ih264_intra_pred_luma_4x4_mode_vert_l_ssse3 + * + * @brief + * Perform Intra prediction for luma_4x4 mode:Vertical_Left + * + * @par Description: + * Perform Intra prediction for luma_4x4 mode:Vertical_Left ,described in sec 8.3.1.2.8 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_4x4_mode_vert_l_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_top; + WORD32 dst_strd2, dst_strd3; + + __m128i val_16x8b, val_sh_16x8b; + __m128i w121_a1_8x16b, w121_a2_8x16b; + __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b; + + 
__m128i zero_vector, const_2_8x16b; + __m128i mask_full_128b, mask_low_32b; + + UNUSED(src_strd); + UNUSED(ngbr_avail); + + mask_full_128b = _mm_set1_epi8(0xff); + mask_low_32b = _mm_srli_si128(mask_full_128b, 12); + + pu1_top = pu1_src +BLK_SIZE + 1; + + val_16x8b = _mm_loadl_epi64((__m128i *)pu1_top); + zero_vector = _mm_setzero_si128(); + val_sh_16x8b = _mm_srli_si128(val_16x8b, 1); + row1_16x8b = _mm_avg_epu8(val_16x8b, val_sh_16x8b); + + w121_a1_8x16b = _mm_unpacklo_epi8(val_16x8b, zero_vector); //t0 t1 t2 t3 t4 t5... + w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2); //t1 t2 t3 t4 t5 t6... + w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //t0+t1 t1+t2 t2+t3 t3+t4 t4+t5... + w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2); //t1+t2 t2+t3 t3+t4 t4+t5 t5+t6... + + zero_vector = _mm_setzero_si128(); + const_2_8x16b = _mm_set1_epi16(2); + + w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //t0+2*t1+t2 t1+2*t2+t3 t2+2*t3+t4... + w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, const_2_8x16b); + w121_a1_8x16b = _mm_srai_epi16(w121_a1_8x16b, 2); + + row2_16x8b = _mm_packus_epi16(w121_a1_8x16b, w121_a1_8x16b); + + dst_strd2 = dst_strd << 1; + dst_strd3 = dst_strd + dst_strd2; + + row3_16x8b = _mm_srli_si128(row1_16x8b, 1); + row4_16x8b = _mm_srli_si128(row2_16x8b, 1); + + _mm_maskmoveu_si128(row1_16x8b, mask_low_32b, (char*)pu1_dst); + _mm_maskmoveu_si128(row2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd)); + _mm_maskmoveu_si128(row3_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2)); + _mm_maskmoveu_si128(row4_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3)); +} + +/** + ******************************************************************************* + * + * ih264_intra_pred_luma_4x4_mode_horz_u_ssse3 + * + * @brief + * Perform Intra prediction for luma_4x4 mode:Horizontal_Up + * + * @par Description: + * Perform Intra prediction for luma_4x4 mode:Horizontal_Up ,described in sec 8.3.1.2.9 + * + * @param[in] pu1_src + * UWORD8 pointer to the 
source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_4x4_mode_horz_u_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_left; + WORD32 dst_strd2, dst_strd3; + + __m128i val_16x8b, val_sh_16x8b; + __m128i w11_16x8b; + __m128i w121_a1_8x16b, w121_a2_8x16b, w121_16x8b; + __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b; + + __m128i zero_vector, const_2_8x16b, rev_16x8b; + __m128i mask_full_128b, mask_low_32b; + + UNUSED(src_strd); + UNUSED(ngbr_avail); + + mask_full_128b = _mm_set1_epi8(0xff); + mask_low_32b = _mm_srli_si128(mask_full_128b, 12); + + pu1_left = pu1_src + BLK_SIZE - 1; + + zero_vector = _mm_setzero_si128(); + rev_16x8b = _mm_setr_epi8(3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + + val_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 3)); //l3 l2 l1 l0 0 0 0... + val_16x8b = _mm_shuffle_epi8(val_16x8b, rev_16x8b); //l0 l1 l2 l3 l3 l3 l3... + + val_sh_16x8b = _mm_srli_si128(val_16x8b, 1); + w11_16x8b = _mm_avg_epu8(val_16x8b, val_sh_16x8b); + + w121_a1_8x16b = _mm_unpacklo_epi8(val_16x8b, zero_vector); //l0 l1 l2 l3 l3 l3... + w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2); //l1 l2 l3 l3 l3 l3... + + w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //l0+t1 l1+l2 l2+l3 2*l3 2*l3... + w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2); //l1+t2 l2+l3 2*l3 2*l3 2*l3... + + zero_vector = _mm_setzero_si128(); + const_2_8x16b = _mm_set1_epi16(2); + + w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //l0+2*l1+l2 l1+2*l2+l3 l2+3*l3 4*l3 4*l3... 
+ w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, const_2_8x16b); + w121_a1_8x16b = _mm_srai_epi16(w121_a1_8x16b, 2); + + w121_16x8b = _mm_packus_epi16(w121_a1_8x16b, w121_a1_8x16b); + + dst_strd2 = dst_strd << 1; + dst_strd3 = dst_strd + dst_strd2; + + row1_16x8b = _mm_unpacklo_epi8(w11_16x8b, w121_16x8b); + row2_16x8b = _mm_srli_si128(row1_16x8b, 2); + row3_16x8b = _mm_srli_si128(row1_16x8b, 4); + row4_16x8b = _mm_srli_si128(row1_16x8b, 6); + + _mm_maskmoveu_si128(row1_16x8b, mask_low_32b, (char*)pu1_dst); + _mm_maskmoveu_si128(row2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd)); + _mm_maskmoveu_si128(row3_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2)); + _mm_maskmoveu_si128(row4_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3)); +} + +/******************* 8x8 Modes *******************/ + +/** + ******************************************************************************* + * + * ih264_intra_pred_luma_8x8_mode_vert_ssse3 + * + * @brief + * Perform Intra prediction for luma_8x8 mode:vertical + * + * @par Description: + * Perform Intra prediction for luma_8x8 mode:vertical ,described in sec 8.3.2.2.2 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ +void ih264_intra_pred_luma_8x8_mode_vert_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_top = NULL; + __m128i top_8x8b; + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_top = pu1_src + BLK8x8SIZE + 1; + + top_8x8b = _mm_loadl_epi64((__m128i *)pu1_top); + + _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), top_8x8b); + 
_mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), top_8x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), top_8x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), top_8x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), top_8x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), top_8x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), top_8x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), top_8x8b); +} + +/** + ******************************************************************************* + * + * ih264_intra_pred_luma_8x8_mode_horz_ssse3 + * + * @brief + * Perform Intra prediction for luma_8x8 mode:horizontal + * + * @par Description: + * Perform Intra prediction for uma_8x8 mode:horizontal ,described in sec 8.3.2.2.2 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ +void ih264_intra_pred_luma_8x8_mode_horz_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_left = pu1_src + BLK8x8SIZE - 1; + __m128i row1_8x8b, row2_8x8b, row3_8x8b, row4_8x8b; + __m128i row5_8x8b, row6_8x8b, row7_8x8b, row8_8x8b; + + UNUSED(src_strd); + UNUSED(ngbr_avail); + + row1_8x8b = _mm_set1_epi8(pu1_left[0]); + row2_8x8b = _mm_set1_epi8(pu1_left[-1]); + row3_8x8b = _mm_set1_epi8(pu1_left[-2]); + row4_8x8b = _mm_set1_epi8(pu1_left[-3]); + row5_8x8b = _mm_set1_epi8(pu1_left[-4]); + row6_8x8b = _mm_set1_epi8(pu1_left[-5]); + row7_8x8b = _mm_set1_epi8(pu1_left[-6]); + row8_8x8b = _mm_set1_epi8(pu1_left[-7]); + + _mm_storel_epi64((__m128i *)(pu1_dst + 0 * 
dst_strd), row1_8x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), row2_8x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), row3_8x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), row4_8x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), row5_8x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), row6_8x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), row7_8x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), row8_8x8b); +} + +/** + ******************************************************************************* + * + * ih264_intra_pred_luma_8x8_mode_dc_ssse3 + * + * @brief + * Perform Intra prediction for luma_8x8 mode:DC + * + * @par Description: + * Perform Intra prediction for luma_8x8 mode:DC ,described in sec 8.3.2.2.4 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_8x8_mode_dc_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 u1_useleft; /* availability of left predictors (only for DC) */ + UWORD8 u1_usetop; /* availability of top predictors (only for DC) */ + UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + __m128i dc_val_8x8b; + WORD32 dc_val = 0; + UNUSED(src_strd); + + u1_useleft = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK); + u1_usetop = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK); + pu1_top = pu1_src + BLK8x8SIZE + 1; + pu1_left = pu1_src + BLK8x8SIZE - 1; + + if(u1_useleft || u1_usetop) + { + WORD32 shft = 2; + __m128i 
val_8x8b, zero_8x8b, sum_8x16b; + + zero_8x8b = _mm_setzero_si128(); + + if(u1_useleft) + { + val_8x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 7)); + sum_8x16b = _mm_sad_epu8(zero_8x8b, val_8x8b); + + shft++; + dc_val += 4; + dc_val += _mm_extract_epi16(sum_8x16b, 0); + } + if(u1_usetop) + { + val_8x8b = _mm_loadl_epi64((__m128i *)pu1_top); + sum_8x16b = _mm_sad_epu8(zero_8x8b, val_8x8b); + + shft++; + dc_val += 4; + dc_val += _mm_extract_epi16(sum_8x16b, 0); + } + dc_val = dc_val >> shft; + } + else + dc_val = 128; + + dc_val_8x8b = _mm_set1_epi8(dc_val); + + _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), dc_val_8x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), dc_val_8x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), dc_val_8x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), dc_val_8x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), dc_val_8x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), dc_val_8x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), dc_val_8x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), dc_val_8x8b); +} + +/** + ******************************************************************************* + * + * ih264_intra_pred_luma_8x8_mode_diag_dl_ssse3 + * + * @brief + * Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left + * + * @par Description: + * Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left ,described in sec 8.3.2.2.5 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_8x8_mode_diag_dl_ssse3(UWORD8 
*pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + __m128i top_16x8; + __m128i out_15x16; + __m128i a0_8x16, a1_8x16, a2_8x16; + __m128i temp1, temp2; + __m128i res1_8x16, res2_8x16; + __m128i zero = _mm_setzero_si128(); + __m128i const_val2_8x16 = _mm_set1_epi16(2); + + UNUSED(src_strd); + UNUSED(ngbr_avail); + + pu1_top = pu1_src + BLK8x8SIZE + 1; + + top_16x8 = _mm_loadu_si128((__m128i *)(pu1_top)); + + temp1 = _mm_srli_si128(top_16x8, 1); + temp2 = _mm_srli_si128(top_16x8, 2); + a0_8x16 = _mm_unpacklo_epi8(top_16x8, zero); + a1_8x16 = _mm_unpacklo_epi8(temp1, zero); + a2_8x16 = _mm_unpacklo_epi8(temp2, zero); + + a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16); + a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16); + res1_8x16 = _mm_srai_epi16(a0_8x16, 2); + + temp2 = _mm_srli_si128(top_16x8, 2); + temp1 = _mm_srli_si128(top_16x8, 1); + a2_8x16 = _mm_unpackhi_epi8(temp2, zero); + a0_8x16 = _mm_unpackhi_epi8(top_16x8, zero); + a2_8x16 = _mm_shufflehi_epi16(a2_8x16, 0x14); + a1_8x16 = _mm_unpackhi_epi8(temp1, zero); + + a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16); + a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16); + res2_8x16 = _mm_srai_epi16(a0_8x16, 2); + + out_15x16 = _mm_packus_epi16(res1_8x16, res2_8x16); + + _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), out_15x16); + out_15x16 = _mm_srli_si128(out_15x16, 1); + _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), out_15x16); + out_15x16 = _mm_srli_si128(out_15x16, 1); + _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), out_15x16); + out_15x16 = _mm_srli_si128(out_15x16, 1); + _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), out_15x16); + out_15x16 = _mm_srli_si128(out_15x16, 1); + _mm_storel_epi64((__m128i 
*)(pu1_dst + 4 * dst_strd), out_15x16); + out_15x16 = _mm_srli_si128(out_15x16, 1); + _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), out_15x16); + out_15x16 = _mm_srli_si128(out_15x16, 1); + _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), out_15x16); + out_15x16 = _mm_srli_si128(out_15x16, 1); + _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), out_15x16); +} + +/** + ******************************************************************************* + * + * ih264_intra_pred_luma_8x8_mode_diag_dr_ssse3 + * + * @brief + * Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right + * + * @par Description: + * Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right ,described in sec 8.3.2.2.6 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_8x8_mode_diag_dr_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + __m128i top_8x8, left_16x8; + __m128i out_15x16; + __m128i a0_8x16, a1_8x16, a2_8x16; + __m128i temp1, temp2; + __m128i res1_8x16, res2_8x16; + __m128i zero = _mm_setzero_si128(); + __m128i const_val2_8x16 = _mm_set1_epi16(2); + __m128i str_8x8; + + UNUSED(src_strd); + UNUSED(ngbr_avail); + + pu1_left = pu1_src + BLK8x8SIZE - 1; + pu1_top = pu1_src + BLK8x8SIZE + 1; + + left_16x8 = _mm_loadu_si128((__m128i *)(pu1_left - 7)); + + temp1 = _mm_srli_si128(left_16x8, 1); + temp2 = _mm_srli_si128(left_16x8, 2); + a0_8x16 = 
_mm_unpacklo_epi8(left_16x8, zero); + a1_8x16 = _mm_unpacklo_epi8(temp1, zero); + a2_8x16 = _mm_unpacklo_epi8(temp2, zero); + + a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16); + a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16); + res1_8x16 = _mm_srai_epi16(a0_8x16, 2); + + top_8x8 = _mm_loadu_si128((__m128i *)(pu1_top - 1)); + + temp1 = _mm_srli_si128(top_8x8, 1); + temp2 = _mm_srli_si128(top_8x8, 2); + a0_8x16 = _mm_unpacklo_epi8(top_8x8, zero); + a1_8x16 = _mm_unpacklo_epi8(temp1, zero); + a2_8x16 = _mm_unpacklo_epi8(temp2, zero); + + a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16); + a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16); + res2_8x16 = _mm_srai_epi16(a0_8x16, 2); + + out_15x16 = _mm_packus_epi16(res1_8x16, res2_8x16); + + str_8x8 = _mm_srli_si128(out_15x16, 7); + _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), str_8x8); + str_8x8 = _mm_srli_si128(out_15x16, 6); + _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), str_8x8); + str_8x8 = _mm_srli_si128(out_15x16, 5); + _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), str_8x8); + str_8x8 = _mm_srli_si128(out_15x16, 4); + _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), str_8x8); + str_8x8 = _mm_srli_si128(out_15x16, 3); + _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), str_8x8); + str_8x8 = _mm_srli_si128(out_15x16, 2); + _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), str_8x8); + str_8x8 = _mm_srli_si128(out_15x16, 1); + _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), str_8x8); + _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), out_15x16); +} + +/** + ******************************************************************************* + * + * ih264_intra_pred_luma_8x8_mode_vert_r_ssse3 + * + * @brief + * Perform Intra prediction for luma_8x8 mode:Vertical_Right + * + * @par Description: + * Perform 
Intra prediction for luma_8x8 mode:Vertical_Right ,described in sec 8.3.2.2.7 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_8x8_mode_vert_r_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + __m128i top_8x8, left_16x8; + __m128i out1_16x16, out2_16x16; + __m128i a0_8x16, a1_8x16, a2_8x16; + __m128i temp1, temp2; + __m128i res1_8x16, res2_8x16, res3_8x16; + __m128i zero = _mm_setzero_si128(); + __m128i const_val2_8x16 = _mm_set1_epi16(2); + __m128i str_8x8; + __m128i mask = _mm_set1_epi32(0xFFFF); + + UNUSED(src_strd); + UNUSED(ngbr_avail); + + pu1_left = pu1_src + BLK8x8SIZE - 1; + pu1_top = pu1_src + BLK8x8SIZE + 1; + + left_16x8 = _mm_loadu_si128((__m128i *)(pu1_left - 6)); + + temp1 = _mm_srli_si128(left_16x8, 1); + temp2 = _mm_srli_si128(left_16x8, 2); + a0_8x16 = _mm_unpacklo_epi8(left_16x8, zero); + a1_8x16 = _mm_unpacklo_epi8(temp1, zero); + a2_8x16 = _mm_unpacklo_epi8(temp2, zero); + + a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16); + a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16); + res1_8x16 = _mm_srai_epi16(a0_8x16, 2); + + top_8x8 = _mm_loadu_si128((__m128i *)(pu1_top - 1)); + + temp1 = _mm_srli_si128(top_8x8, 1); + temp2 = _mm_srli_si128(top_8x8, 2); + a0_8x16 = _mm_unpacklo_epi8(top_8x8, zero); + a1_8x16 = _mm_unpacklo_epi8(temp1, zero); 
+ a2_8x16 = _mm_unpacklo_epi8(temp2, zero); + + res3_8x16 = _mm_avg_epu16(a0_8x16, a1_8x16); + + a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16); + a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16); + res2_8x16 = _mm_srai_epi16(a0_8x16, 2); + + str_8x8 = _mm_packus_epi16(res3_8x16, zero); + _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), str_8x8); + + temp1 = _mm_and_si128(res1_8x16, mask); + temp1 = _mm_packs_epi32(temp1, temp1); + out1_16x16 = _mm_packus_epi16(temp1, res2_8x16); + + res1_8x16 = _mm_slli_si128(res1_8x16, 2); + temp1 = _mm_and_si128(res1_8x16, mask); + temp1 = _mm_packs_epi32(temp1, temp1); + out2_16x16 = _mm_packus_epi16(temp1, res3_8x16); + + str_8x8 = _mm_srli_si128(out1_16x16, 7); + _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), str_8x8); + + str_8x8 = _mm_srli_si128(out2_16x16, 7); + _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), str_8x8); + + str_8x8 = _mm_srli_si128(out1_16x16, 6); + _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), str_8x8); + + str_8x8 = _mm_srli_si128(out2_16x16, 6); + _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), str_8x8); + + str_8x8 = _mm_srli_si128(out1_16x16, 5); + _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), str_8x8); + + str_8x8 = _mm_srli_si128(out2_16x16, 5); + _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), str_8x8); + + str_8x8 = _mm_srli_si128(out1_16x16, 4); + _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), str_8x8); +} + +/* + ******************************************************************************* + * + * ih264_intra_pred_luma_8x8_mode_horz_d_ssse3 + * + * @brief + * Perform Intra prediction for luma_8x8 mode:Horizontal_Down + * + * @par Description: + * Perform Intra prediction for luma_8x8 mode:Horizontal_Down ,described in sec 8.3.2.2.8 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + 
* + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_8x8_mode_horz_d_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ + __m128i pels_16x16; + __m128i temp1, temp2, temp3, temp4; + __m128i a0_8x16, a1_8x16, a2_8x16; + __m128i zero = _mm_setzero_si128(); + __m128i const_val2_8x16 = _mm_set1_epi16(2); + __m128i res1_8x16, res2_8x16; + __m128i out1_16x16, out2_16x16; + __m128i str_8x8; + UNUSED(src_strd); + UNUSED(ngbr_avail); + + pu1_left = pu1_src + BLK8x8SIZE - 1; + + pels_16x16 = _mm_loadu_si128((__m128i *)(pu1_left - 7)); + + temp1 = _mm_srli_si128(pels_16x16, 1); + temp2 = _mm_srli_si128(pels_16x16, 2); + a0_8x16 = _mm_unpacklo_epi8(pels_16x16, zero); + a1_8x16 = _mm_unpacklo_epi8(temp1, zero); + a2_8x16 = _mm_unpacklo_epi8(temp2, zero); + + res1_8x16 = _mm_avg_epu16(a0_8x16, a1_8x16); + + a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16); + a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16); + res2_8x16 = _mm_srai_epi16(a0_8x16, 2); + + temp3 = _mm_unpacklo_epi16(res1_8x16, res2_8x16); + temp4 = _mm_unpackhi_epi16(res1_8x16, res2_8x16); + out2_16x16 = _mm_packus_epi16(temp3, temp4); + + a0_8x16 = _mm_unpackhi_epi8(pels_16x16, zero); + a1_8x16 = _mm_unpackhi_epi8(temp1, zero); + a2_8x16 = _mm_unpackhi_epi8(temp2, zero); + + a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16); + a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16); + res2_8x16 = _mm_srai_epi16(a0_8x16, 2); + + 
out1_16x16 = _mm_packus_epi16(res2_8x16, zero); + temp1 = _mm_srli_si128(out2_16x16, 8); + out1_16x16 = _mm_unpacklo_epi64(temp1, out1_16x16); + + str_8x8 = _mm_srli_si128(out1_16x16, 6); + _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), str_8x8); + str_8x8 = _mm_srli_si128(out1_16x16, 4); + _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), str_8x8); + str_8x8 = _mm_srli_si128(out1_16x16, 2); + _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), str_8x8); + _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), out1_16x16); + + str_8x8 = _mm_srli_si128(out2_16x16, 6); + _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), str_8x8); + str_8x8 = _mm_srli_si128(out2_16x16, 4); + _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), str_8x8); + str_8x8 = _mm_srli_si128(out2_16x16, 2); + _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), str_8x8); + _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), out2_16x16); +} + +/** + ******************************************************************************* + * + * ih264_intra_pred_luma_8x8_mode_vert_l_ssse3 + * + * @brief + * Perform Intra prediction for luma_8x8 mode:Vertical_Left + * + * @par Description: + * Perform Intra prediction for luma_8x8 mode:Vertical_Left ,described in sec 8.3.2.2.9 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ + +void ih264_intra_pred_luma_8x8_mode_vert_l_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ + __m128i top_16x16; + __m128i temp1, 
temp2; + __m128i a0_8x16, a1_8x16, a2_8x16; + __m128i zero = _mm_setzero_si128(); + __m128i const_val2_8x16 = _mm_set1_epi16(2); + __m128i res1_8x16, res2_8x16, res3_8x16, res4_8x16; + __m128i out1_16x16, out2_16x16; + UNUSED(src_strd); + UNUSED(ngbr_avail); + pu1_top = pu1_src + BLK8x8SIZE + 1; + + top_16x16 = _mm_loadu_si128((__m128i *)(pu1_top)); + temp1 = _mm_srli_si128(top_16x16, 1); + temp2 = _mm_srli_si128(top_16x16, 2); + a0_8x16 = _mm_unpacklo_epi8(top_16x16, zero); + a1_8x16 = _mm_unpacklo_epi8(temp1, zero); + a2_8x16 = _mm_unpacklo_epi8(temp2, zero); + + res1_8x16 = _mm_avg_epu16(a0_8x16, a1_8x16); + + a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16); + a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16); + res2_8x16 = _mm_srai_epi16(a0_8x16, 2); + + a0_8x16 = _mm_unpackhi_epi8(top_16x16, zero); + a1_8x16 = _mm_unpackhi_epi8(temp1, zero); + a2_8x16 = _mm_unpackhi_epi8(temp2, zero); + + res3_8x16 = _mm_avg_epu16(a0_8x16, a1_8x16); + + a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16); + a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16); + res4_8x16 = _mm_srai_epi16(a0_8x16, 2); + + out1_16x16 = _mm_packus_epi16(res1_8x16, res3_8x16); + out2_16x16 = _mm_packus_epi16(res2_8x16, res4_8x16); + + _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), out1_16x16); + _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), out2_16x16); + out1_16x16 = _mm_srli_si128(out1_16x16, 1); + out2_16x16 = _mm_srli_si128(out2_16x16, 1); + _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), out1_16x16); + _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), out2_16x16); + out1_16x16 = _mm_srli_si128(out1_16x16, 1); + out2_16x16 = _mm_srli_si128(out2_16x16, 1); + _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), out1_16x16); + _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), out2_16x16); + out1_16x16 = 
_mm_srli_si128(out1_16x16, 1); + out2_16x16 = _mm_srli_si128(out2_16x16, 1); + _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), out1_16x16); + _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), out2_16x16); +} + +/** + ******************************************************************************* + * + * ih264_intra_pred_luma_8x8_mode_horz_u_ssse3 + * + * @brief + * Perform Intra prediction for luma_8x8 mode:Horizontal_Up + * + * @par Description: + * Perform Intra prediction for luma_8x8 mode:Horizontal_Up ,described in sec 8.3.2.2.10 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_8x8_mode_horz_u_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ + __m128i left_16x16; + __m128i temp1, temp2; + __m128i a0_8x16, a1_8x16, a2_8x16; + __m128i zero = _mm_setzero_si128(); + __m128i const_val2_8x16 = _mm_set1_epi16(2); + __m128i res1_8x16, res2_8x16; + __m128i out1_16x16; + __m128i str_8x8; + __m128i shuffle_16x16; + UNUSED(src_strd); + UNUSED(ngbr_avail); + + pu1_left = pu1_src + BLK8x8SIZE - 1; + shuffle_16x16 = _mm_set_epi8(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, + 0x0F); + + left_16x16 = _mm_loadu_si128((__m128i *)(pu1_left - 7)); + temp1 = _mm_srli_si128(left_16x16, 1); + a0_8x16 = _mm_unpacklo_epi8(left_16x16, zero); + a0_8x16 = _mm_slli_si128(a0_8x16, 2); + a1_8x16 = _mm_unpacklo_epi8(left_16x16, zero); + a0_8x16 = _mm_shufflelo_epi16(a0_8x16, 
0xE5); + a2_8x16 = _mm_unpacklo_epi8(temp1, zero); + + res1_8x16 = _mm_avg_epu16(a0_8x16, a1_8x16); + + a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16); + a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16); + a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16); + res2_8x16 = _mm_srai_epi16(a0_8x16, 2); + + temp1 = _mm_unpacklo_epi16(res1_8x16, res2_8x16); + temp2 = _mm_unpackhi_epi16(res1_8x16, res2_8x16); + out1_16x16 = _mm_packus_epi16(temp1, temp2); + out1_16x16 = _mm_shuffle_epi8(out1_16x16, shuffle_16x16); + + str_8x8 = _mm_srli_si128(out1_16x16, 1); + _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), str_8x8); + str_8x8 = _mm_srli_si128(out1_16x16, 3); + _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), str_8x8); + str_8x8 = _mm_srli_si128(out1_16x16, 5); + _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), str_8x8); + str_8x8 = _mm_srli_si128(out1_16x16, 7); + _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), str_8x8); + temp1 = _mm_set1_epi8(pu1_left[-7]); + str_8x8 = _mm_unpacklo_epi64(str_8x8, temp1); + str_8x8 = _mm_srli_si128(str_8x8, 2); + _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), str_8x8); + str_8x8 = _mm_srli_si128(str_8x8, 2); + _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), str_8x8); + str_8x8 = _mm_srli_si128(str_8x8, 2); + _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), str_8x8); + str_8x8 = _mm_srli_si128(str_8x8, 2); + _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), str_8x8); + +} + + +/******************* 16x16 Modes *******************/ + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_16x16_mode_vert_ssse3 + * + * @brief + * Perform Intra prediction for luma_16x16 mode:Vertical + * + * @par Description: + * Perform Intra prediction for luma_16x16 mode:Vertical, described in sec 8.3.3.1 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the 
destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels (Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_16x16_mode_vert_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_top; + WORD32 dst_strd2, dst_strd3, dst_strd4; + + __m128i top_16x8b; + + UNUSED(src_strd); + UNUSED(ngbr_avail); + + pu1_top = pu1_src + MB_SIZE + 1; + + dst_strd2 = dst_strd << 1; + dst_strd4 = dst_strd << 2; + + top_16x8b = _mm_loadu_si128((__m128i *)pu1_top); + + dst_strd3 = dst_strd + dst_strd2; + + _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), top_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), top_16x8b); + pu1_dst += dst_strd4; + + _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), top_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), top_16x8b); + pu1_dst += dst_strd4; + + _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), top_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), top_16x8b); + pu1_dst += dst_strd4; + + _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), top_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), top_16x8b); +} + +/** + ******************************************************************************* + * + 
*ih264_intra_pred_luma_16x16_mode_horz_ssse3 + * + * @brief + * Perform Intra prediction for luma_16x16 mode:Horizontal + * + * @par Description: + * Perform Intra prediction for luma_16x16 mode:Horizontal, described in sec 8.3.3.2 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_16x16_mode_horz_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_left; + WORD32 dst_strd2, dst_strd3, dst_strd4; + WORD32 val1, val2; + + __m128i val_16x8b; + __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b; + + UNUSED(src_strd); + UNUSED(ngbr_avail); + + pu1_left = pu1_src + MB_SIZE - 1; + + dst_strd4 = dst_strd << 2; + + val_16x8b = _mm_loadu_si128((__m128i *)(pu1_left - 15)); + + dst_strd2 = dst_strd << 1; + dst_strd3 = dst_strd4 - dst_strd; + + val1 = _mm_extract_epi16(val_16x8b, 7); + val2 = _mm_extract_epi16(val_16x8b, 6); + + row1_16x8b = _mm_set1_epi8(val1 >> 8); + row2_16x8b = _mm_set1_epi8(val1 & 0xff); + row3_16x8b = _mm_set1_epi8(val2 >> 8); + row4_16x8b = _mm_set1_epi8(val2 & 0xff); + + _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b); + + val1 = _mm_extract_epi16(val_16x8b, 5); + val2 = _mm_extract_epi16(val_16x8b, 4); + + pu1_dst += dst_strd4; + row1_16x8b = _mm_set1_epi8(val1 >> 8); + row2_16x8b = _mm_set1_epi8(val1 & 0xff); + row3_16x8b = _mm_set1_epi8(val2 >> 8); + 
row4_16x8b = _mm_set1_epi8(val2 & 0xff); + + _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b); + + val1 = _mm_extract_epi16(val_16x8b, 3); + val2 = _mm_extract_epi16(val_16x8b, 2); + + pu1_dst += dst_strd4; + row1_16x8b = _mm_set1_epi8(val1 >> 8); + row2_16x8b = _mm_set1_epi8(val1 & 0xff); + row3_16x8b = _mm_set1_epi8(val2 >> 8); + row4_16x8b = _mm_set1_epi8(val2 & 0xff); + + _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b); + + val1 = _mm_extract_epi16(val_16x8b, 1); + val2 = _mm_extract_epi16(val_16x8b, 0); + + pu1_dst += dst_strd4; + row1_16x8b = _mm_set1_epi8(val1 >> 8); + row2_16x8b = _mm_set1_epi8(val1 & 0xff); + row3_16x8b = _mm_set1_epi8(val2 >> 8); + row4_16x8b = _mm_set1_epi8(val2 & 0xff); + + _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b); +} + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_16x16_mode_dc_ssse3 + * + * @brief + * Perform Intra prediction for luma_16x16 mode:DC + * + * @par Description: + * Perform Intra prediction for luma_16x16 mode:DC, described in sec 8.3.3.3 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + ** @param[in] ngbr_avail + * availability of neighbouring pixels + * + * @returns + * + 
* @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_16x16_mode_dc_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + WORD8 u1_useleft, u1_usetop; + WORD32 dc_val; + + WORD32 dst_strd2, dst_strd3, dst_strd4; + + __m128i dc_val_16x8b; + + UNUSED(src_strd); + + u1_useleft = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK); + u1_usetop = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK); + + if(u1_useleft || u1_usetop) + { + WORD32 shft; + __m128i val_16x8b, zero_16x8b, sum_8x16b; + + dc_val = 0; + shft = 3; + + zero_16x8b = _mm_setzero_si128(); + + if(u1_useleft) + { + UWORD8 *pu1_left; + + pu1_left = pu1_src + MB_SIZE - 1; + + val_16x8b = _mm_loadu_si128((__m128i *)(pu1_left - 15)); + sum_8x16b = _mm_sad_epu8(zero_16x8b, val_16x8b); + + shft++; + dc_val += 8; + dc_val += _mm_extract_epi16(sum_8x16b, 0); + dc_val += _mm_extract_epi16(sum_8x16b, 4); + } + if(u1_usetop) + { + UWORD8 *pu1_top; + + pu1_top = pu1_src + MB_SIZE + 1; + + val_16x8b = _mm_loadu_si128((__m128i *)pu1_top); + sum_8x16b = _mm_sad_epu8(zero_16x8b, val_16x8b); + + shft++; + dc_val += 8; + dc_val += _mm_extract_epi16(sum_8x16b, 0); + dc_val += _mm_extract_epi16(sum_8x16b, 4); + } + dc_val = dc_val >> shft; + } + else + dc_val = 128; + + dc_val_16x8b = _mm_set1_epi8(dc_val); + + dst_strd2 = dst_strd << 1; + dst_strd4 = dst_strd << 2; + dst_strd3 = dst_strd + dst_strd2; + + _mm_storeu_si128((__m128i *)pu1_dst, dc_val_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), dc_val_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), dc_val_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), dc_val_16x8b); + pu1_dst += dst_strd4; + + _mm_storeu_si128((__m128i *)pu1_dst, dc_val_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), dc_val_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), dc_val_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + 
dst_strd3), dc_val_16x8b); + pu1_dst += dst_strd4; + + _mm_storeu_si128((__m128i *)pu1_dst, dc_val_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), dc_val_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), dc_val_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), dc_val_16x8b); + pu1_dst += dst_strd4; + + _mm_storeu_si128((__m128i *)pu1_dst, dc_val_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), dc_val_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), dc_val_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), dc_val_16x8b); +} + +/** + ******************************************************************************* + * + *ih264_intra_pred_luma_16x16_mode_plane_ssse3 + * + * @brief + * Perform Intra prediction for luma_16x16 mode:PLANE + * + * @par Description: + * Perform Intra prediction for luma_16x16 mode:PLANE, described in sec 8.3.3.4 + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[out] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] src_strd + * integer source stride + * + * @param[in] dst_strd + * integer destination stride + * + * @param[in] ngbr_avail + * availability of neighbouring pixels(Not used in this function) + * + * @returns + * + * @remarks + * None + * + *******************************************************************************/ +void ih264_intra_pred_luma_16x16_mode_plane_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ngbr_avail) +{ + UWORD8 *pu1_left, *pu1_top; + WORD32 a, b, c; + + __m128i rev_8x16b, mul_8x16b, zero_16x8b; + + UNUSED(src_strd); + UNUSED(ngbr_avail); + + pu1_top = pu1_src + MB_SIZE + 1; + pu1_left = pu1_src + MB_SIZE - 1; + + rev_8x16b = _mm_setr_epi16(0x0f0e, 0x0d0c, 0x0b0a, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100); + //used to reverse the order of 16-bit values in a vector + + mul_8x16b = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + zero_16x8b = _mm_setzero_si128(); + + 
//calculating a, b and c + { + WORD32 h, v; + + __m128i h_val1_16x8b, h_val2_16x8b; + __m128i h_val1_8x16b, h_val2_8x16b, h_val_4x32b; + __m128i v_val1_16x8b, v_val2_16x8b; + __m128i v_val1_8x16b, v_val2_8x16b, v_val_4x32b; + __m128i hv_val_4x32b; + + a = (pu1_top[15] + pu1_left[-15]) << 4; + + h_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top + 8)); + h_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top - 1)); + v_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 15)); + v_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 6)); + + h_val1_8x16b = _mm_unpacklo_epi8(h_val1_16x8b, zero_16x8b); + h_val2_8x16b = _mm_unpacklo_epi8(h_val2_16x8b, zero_16x8b); + v_val1_8x16b = _mm_unpacklo_epi8(v_val1_16x8b, zero_16x8b); + v_val2_8x16b = _mm_unpacklo_epi8(v_val2_16x8b, zero_16x8b); + + h_val2_8x16b = _mm_shuffle_epi8(h_val2_8x16b, rev_8x16b); + v_val1_8x16b = _mm_shuffle_epi8(v_val1_8x16b, rev_8x16b); + + h_val1_8x16b = _mm_sub_epi16(h_val1_8x16b, h_val2_8x16b); + v_val1_8x16b = _mm_sub_epi16(v_val1_8x16b, v_val2_8x16b); + + h_val_4x32b = _mm_madd_epi16(mul_8x16b, h_val1_8x16b); + v_val_4x32b = _mm_madd_epi16(mul_8x16b, v_val1_8x16b); + + hv_val_4x32b = _mm_hadd_epi32(h_val_4x32b, v_val_4x32b); + hv_val_4x32b = _mm_hadd_epi32(hv_val_4x32b, hv_val_4x32b); + + h = _mm_extract_epi16(hv_val_4x32b, 0); + v = _mm_extract_epi16(hv_val_4x32b, 2); + h = (h << 16) >> 16; + v = (v << 16) >> 16; + + b = ((h << 2) + h + 32) >> 6; + c = ((v << 2) + v + 32) >> 6; + } + + //using a, b and c to compute the fitted plane values + { + __m128i const_8x16b, b_8x16b, c_8x16b, c2_8x16b; + __m128i res1_l_8x16b, res1_h_8x16b; + __m128i res2_l_8x16b, res2_h_8x16b; + __m128i res1_sh_l_8x16b, res1_sh_h_8x16b, res1_16x8b; + __m128i res2_sh_l_8x16b, res2_sh_h_8x16b, res2_16x8b; + + b_8x16b = _mm_set1_epi16(b); + c_8x16b = _mm_set1_epi16(c); + c2_8x16b = _mm_set1_epi16(c << 1); + const_8x16b = _mm_set1_epi16(a - c*7 + 16); + + res1_h_8x16b = _mm_mullo_epi16(mul_8x16b, b_8x16b); + //contains {b*1, b*2, 
b*3,... b*8} + + res1_l_8x16b = _mm_shuffle_epi8(res1_h_8x16b, rev_8x16b); + res1_l_8x16b = _mm_srli_si128(res1_l_8x16b, 2); + res1_l_8x16b = _mm_sub_epi16(zero_16x8b, res1_l_8x16b); + //contains {-b*7, -b*6,... -b*1, b*0} + + // rows 1, 2 + res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, const_8x16b); + res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, const_8x16b); + res2_h_8x16b = _mm_add_epi16(res1_h_8x16b, c_8x16b); + res2_l_8x16b = _mm_add_epi16(res1_l_8x16b, c_8x16b); + + res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5); + res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5); + res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5); + res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5); + + res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b); + res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b); + + _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b); + + // rows 3, 4 + res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b); + res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b); + res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b); + res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b); + + res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5); + res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5); + res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5); + res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5); + + pu1_dst += dst_strd << 1; + + res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b); + res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b); + + _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b); + + // rows 5, 6 + res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b); + res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b); + res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b); + res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b); + + res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5); + res1_sh_l_8x16b = 
_mm_srai_epi16(res1_l_8x16b, 5); + res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5); + res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5); + + pu1_dst += dst_strd << 1; + + res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b); + res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b); + + _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b); + + // rows 7, 8 + res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b); + res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b); + res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b); + res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b); + + res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5); + res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5); + res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5); + res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5); + + pu1_dst += dst_strd << 1; + + res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b); + res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b); + + _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b); + + // rows 9, 10 + res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b); + res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b); + res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b); + res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b); + + res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5); + res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5); + res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5); + res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5); + + pu1_dst += dst_strd << 1; + + res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b); + res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b); + + _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b); + + // rows 11, 12 + res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b); + res1_l_8x16b = 
_mm_add_epi16(res1_l_8x16b, c2_8x16b); + res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b); + res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b); + + res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5); + res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5); + res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5); + res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5); + + pu1_dst += dst_strd << 1; + + res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b); + res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b); + + _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b); + + // rows 13, 14 + res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b); + res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b); + res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b); + res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b); + + res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5); + res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5); + res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5); + res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5); + + pu1_dst += dst_strd << 1; + + res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b); + res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b); + + _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b); + + // rows 15, 16 + res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b); + res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b); + res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b); + res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b); + + res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5); + res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5); + res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5); + res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5); + + pu1_dst += dst_strd << 1; + + res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b); + res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, 
res2_sh_h_8x16b); + + _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b); + } +} diff --git a/common/x86/ih264_mem_fns_ssse3.c b/common/x86/ih264_mem_fns_ssse3.c new file mode 100755 index 0000000..8ca1f3e --- /dev/null +++ b/common/x86/ih264_mem_fns_ssse3.c @@ -0,0 +1,169 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_mem_fns_atom_intr.c + * + * @brief + * Functions used for memory operations + * + * @author + * Ittiam + * + * @par List of Functions: + * + * @remarks + * None + * + ******************************************************************************* + */ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +#include "ih264_typedefs.h" +#include "ih264_mem_fns.h" + +#include <immintrin.h> + +/** + ******************************************************************************* + * + * @brief + * memcpy of a 8,16 or 32 bytes + * + * @par Description: + * Does memcpy of 8bit data from source to destination for 8,16 or 32 number of bytes + * + * @param[in] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] pu1_src + * UWORD8 pointer to the source + * + * @param[in] num_bytes + * number of bytes to copy + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ + + + + +void ih264_memcpy_mul_8_ssse3(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes) +{ + int col; + for(col = num_bytes; col >= 8; col -= 8) + { + __m128i src_temp16x8b; + src_temp16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); + pu1_src += 8; + _mm_storel_epi64((__m128i *)(pu1_dst), src_temp16x8b); + pu1_dst += 8; + } +} + +/** + ******************************************************************************* + * + * @brief + * memset of a 8,16 or 32 bytes + * + * @par Description: + * Does memset of 8bit data for 8,16 or 32 number of bytes + * + * @param[in] pu1_dst + * UWORD8 pointer to the destination + * + * @param[in] value + * UWORD8 value used for memset + * + * 
@param[in] num_bytes + * number of bytes to set + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ + + +void ih264_memset_mul_8_ssse3(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes) +{ + int col; + __m128i src_temp16x8b; + src_temp16x8b = _mm_set1_epi8(value); + for(col = num_bytes; col >= 8; col -= 8) + { + _mm_storel_epi64((__m128i *)(pu1_dst), src_temp16x8b); + pu1_dst += 8; + } +} + +/** + ******************************************************************************* + * + * @brief + * memset of 16bit data of a 8,16 or 32 bytes + * + * @par Description: + * Does memset of 16bit data for 8,16 or 32 number of bytes + * + * @param[in] pu2_dst + * UWORD8 pointer to the destination + * + * @param[in] value + * UWORD16 value used for memset + * + * @param[in] num_words + * number of words to set + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ + + +void ih264_memset_16bit_mul_8_ssse3(UWORD16 *pu2_dst, UWORD16 value, UWORD32 num_words) +{ + int col; + __m128i src_temp16x8b; + src_temp16x8b = _mm_set1_epi16(value); + for(col = num_words; col >= 8; col -= 8) + { + _mm_storeu_si128((__m128i *)(pu2_dst), src_temp16x8b); + pu2_dst += 8; + } +} + diff --git a/common/x86/ih264_padding_ssse3.c b/common/x86/ih264_padding_ssse3.c new file mode 100755 index 0000000..6dadd39 --- /dev/null +++ b/common/x86/ih264_padding_ssse3.c @@ -0,0 +1,335 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_padding_atom_intr.c +* +* @brief +* Contains function definitions for Padding +* +* @author +* Srinivas T +* +* @par List of Functions: +* - ih264_pad_left_luma_ssse3() +* - ih264_pad_left_chroma_ssse3() +* - ih264_pad_right_luma_ssse3() +* - ih264_pad_right_chroma_ssse3() +* +* @remarks +* None +* +******************************************************************************* +*/ + +#include <string.h> +#include <assert.h> +#include "ih264_typedefs.h" +#include "ih264_platform_macros.h" +#include "ih264_mem_fns.h" +#include "ih264_debug.h" + +#include <immintrin.h> + + +/** +******************************************************************************* +* +* @brief +* Padding (luma block) at the left of a 2d array +* +* @par Description: +* The left column of a 2d array is replicated for pad_size times at the left +* +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[in] src_strd +* integer source stride +* +* @param[in] ht +* integer height of the array +* +* @param[in] wd +* integer width of the array +* +* @param[in] pad_size +* integer -padding size of the array +* +* @param[in] ht +* integer height of the array +* +* @param[in] wd +* integer width of the array +* +* @returns +* +* @remarks +* None +* 
+******************************************************************************* +*/ + +void ih264_pad_left_luma_ssse3(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 ht, + WORD32 pad_size) +{ + WORD32 row; + WORD32 i; + UWORD8 *pu1_dst; + __m128i const0_16x8b; + + const0_16x8b = _mm_setzero_si128(); + + ASSERT(pad_size % 8 == 0); + + for(row = 0; row < ht; row++) + { + __m128i src_temp0_16x8b; + + src_temp0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + pu1_dst = pu1_src - pad_size; + src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b); + for(i = 0; i < pad_size; i += 8) + { + _mm_storel_epi64((__m128i *)(pu1_dst + i), src_temp0_16x8b); + } + pu1_src += src_strd; + } + +} + + + +/** +******************************************************************************* +* +* @brief +* Padding (chroma block) at the left of a 2d array +* +* @par Description: +* The left column of a 2d array is replicated for pad_size times at the left +* +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[in] src_strd +* integer source stride +* +* @param[in] ht +* integer height of the array +* +* @param[in] wd +* integer width of the array (each colour component) +* +* @param[in] pad_size +* integer -padding size of the array +* +* @param[in] ht +* integer height of the array +* +* @param[in] wd +* integer width of the array +* +* @returns +* +* @remarks +* None +* +******************************************************************************* +*/ + +void ih264_pad_left_chroma_ssse3(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 ht, + WORD32 pad_size) +{ + WORD32 row; + WORD32 col; + UWORD8 *pu1_dst; + __m128i const0_16x8b, const1_16x8b; + const0_16x8b = _mm_setzero_si128(); + const1_16x8b = _mm_set1_epi8(1); + const0_16x8b = _mm_unpacklo_epi8(const0_16x8b, const1_16x8b); + + ASSERT(pad_size % 8 == 0); + for(row = 0; row < ht; row++) + { + __m128i src_temp0_16x8b; + + src_temp0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + pu1_dst = pu1_src - pad_size; + 
src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b); + + for(col = 0; col < pad_size; col += 8) + { + _mm_storel_epi64((__m128i *)(pu1_dst + col), src_temp0_16x8b); + } + pu1_src += src_strd; + } + +} + + + +/** +******************************************************************************* +* +* @brief +* Padding (luma block) at the right of a 2d array +* +* @par Description: +* The right column of a 2d array is replicated for pad_size times at the right +* +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[in] src_strd +* integer source stride +* +* @param[in] ht +* integer height of the array +* +* @param[in] wd +* integer width of the array +* +* @param[in] pad_size +* integer -padding size of the array +* +* @param[in] ht +* integer height of the array +* +* @param[in] wd +* integer width of the array +* +* @returns +* +* @remarks +* None +* +******************************************************************************* +*/ + +void ih264_pad_right_luma_ssse3(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 ht, + WORD32 pad_size) +{ + WORD32 row; + WORD32 col; + UWORD8 *pu1_dst; + __m128i const0_16x8b; + + ASSERT(pad_size % 8 == 0); + + for(row = 0; row < ht; row++) + { + __m128i src_temp0_16x8b; + + src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 1)); + const0_16x8b = _mm_setzero_si128(); + pu1_dst = pu1_src; + src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b); + for(col = 0; col < pad_size; col += 8) + { + _mm_storel_epi64((__m128i *)(pu1_dst + col), src_temp0_16x8b); + } + pu1_src += src_strd; + } + +} + + + +/** +******************************************************************************* +* +* @brief +* Padding (chroma block) at the right of a 2d array +* +* @par Description: +* The right column of a 2d array is replicated for pad_size times at the right +* +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[in] src_strd +* integer source stride +* +* @param[in] ht +* integer 
height of the array +* +* @param[in] wd +* integer width of the array (each colour component) +* +* @param[in] pad_size +* integer -padding size of the array +* +* @param[in] ht +* integer height of the array +* +* @param[in] wd +* integer width of the array +* +* @returns +* +* @remarks +* None +* +******************************************************************************* +*/ + +void ih264_pad_right_chroma_ssse3(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 ht, + WORD32 pad_size) +{ + WORD32 row; + WORD32 col; + UWORD8 *pu1_dst; + __m128i const0_16x8b, const1_16x8b; + const0_16x8b = _mm_setzero_si128(); + const1_16x8b = _mm_set1_epi8(1); + const0_16x8b = _mm_unpacklo_epi8(const0_16x8b, const1_16x8b); + + ASSERT(pad_size % 8 == 0); + + for(row = 0; row < ht; row++) + { + __m128i src_temp0_16x8b; + + src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2)); + pu1_dst = pu1_src; + src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b); + for(col = 0; col < pad_size; col += 8) + { + _mm_storel_epi64((__m128i *)(pu1_dst + col), src_temp0_16x8b); + } + + pu1_src += src_strd; + } +} + diff --git a/common/x86/ih264_platform_macros.h b/common/x86/ih264_platform_macros.h new file mode 100755 index 0000000..e4b9821 --- /dev/null +++ b/common/x86/ih264_platform_macros.h @@ -0,0 +1,114 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_platform_macros.h +* +* @brief +* Platform specific Macro definitions used in the codec +* +* @author +* Ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ + + +#ifndef _IH264_PLATFORM_MACROS_H_ +#define _IH264_PLATFORM_MACROS_H_ + +#include <immintrin.h> + + +#define CLIP_U8(x) CLIP3(0, 255, (x)) +#define CLIP_S8(x) CLIP3(-128, 127, (x)) + +#define CLIP_U10(x) CLIP3(0, 1023, (x)) +#define CLIP_S10(x) CLIP3(-512, 511, (x)) + +#define CLIP_U12(x) CLIP3(0, 4095, (x)) +#define CLIP_S12(x) CLIP3(-2048, 2047, (x)) + +#define CLIP_U16(x) CLIP3(0, 65535, (x)) +#define CLIP_S16(x) CLIP3(-32768, 32767, (x)) + +#define MEM_ALIGN16 __attribute__ ((aligned (16))) + +#define SHL(x,y) (((y) < 32) ? ((x) << (y)) : 0) +#define SHR(x,y) (((y) < 32) ? 
((x) >> (y)) : 0) + +#define SHR_NEG(val,shift) ((shift>0)?(val>>shift):(val<<(-shift))) +#define SHL_NEG(val,shift) ((shift<0)?(val>>(-shift)):(val<<shift)) + + +#define ITT_BIG_ENDIAN(x) ((x << 24)) | \ + ((x & 0x0000ff00) << 8) | \ + ((x & 0x00ff0000) >> 8) | \ + ((UWORD32)x >> 24); + + +#define NOP(nop_cnt) {UWORD32 nop_i; for (nop_i = 0; nop_i < nop_cnt; nop_i++);} + +#define PLD(a) + +static __inline UWORD32 CLZ(UWORD32 u4_word) +{ + if(u4_word) + return(__builtin_clz(u4_word)); + else + return 32; +} + +static __inline UWORD32 CTZ(UWORD32 u4_word) +{ + if(0 == u4_word) + return 31; + else + { + unsigned int index; + index = __builtin_ctz(u4_word); + return (UWORD32)index; + } +} + +#define DATA_SYNC() __sync_synchronize() + + + +//#define INLINE __inline +#define INLINE + +#define PREFETCH_ENABLE 1 + +#if PREFETCH_ENABLE +#define PREFETCH(ptr, type) _mm_prefetch(ptr, type); +#else +#define PREFETCH(ptr, type) +#endif + +#define MEM_ALIGN8 __attribute__ ((aligned (8))) +#define MEM_ALIGN16 __attribute__ ((aligned (16))) +#define MEM_ALIGN32 __attribute__ ((aligned (32))) + +#endif /* _IH264_PLATFORM_MACROS_H_ */ diff --git a/common/x86/ih264_resi_trans_quant_sse42.c b/common/x86/ih264_resi_trans_quant_sse42.c new file mode 100755 index 0000000..c267651 --- /dev/null +++ b/common/x86/ih264_resi_trans_quant_sse42.c @@ -0,0 +1,984 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_resi_trans_quant_sse42.c + * + * @brief + * Contains function definitions single stage forward transform for H.264 + * It will calculate the residue, do the cf and then do quantization + * + * @author + * Mohit [100664] + * + * @par List of Functions: + * - ih264_resi_trans_quant_4x4_sse42() + * - ih264_resi_trans_quant_chroma_4x4_sse42() + * + * @remarks + * None + * + ******************************************************************************* + */ +/* System include files */ +#include <stddef.h> + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_defs.h" +#include "ih264_size_defs.h" +#include "ih264_macros.h" +#include "ih264_trans_macros.h" +#include "ih264_trans_data.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include <immintrin.h> +/** + ******************************************************************************* + * + * @brief + * This function performs forward transform and quantization on a 4*4 block + * + * @par Description: + * The function accepts source buffer and estimation buffer. From these, it + * computes the residue. This is residue is then transformed and quantized. + * The transform and quantization are in placed computed. They use the residue + * buffer for this. 
+ * + * @param[in] pu1_src + * Pointer to source sub-block + * + * @param[in] pu1_pred + * Pointer to prediction sub-block + * + * @param[in] pi2_out + * Pointer to residual sub-block + * + * @param[in] src_strd + * Source stride + * + * @param[in] pred_strd + * Prediction stride + * + * @param[in] dst_strd + * Destination stride + * + * @param[in] u4_qbits + * QP_BITS_h264_4x4 + floor(QP/6) + * + * @param[in] pu2_threshold_matrix + * Pointer to Forward Quant Threshold Matrix + * + * @param[in] pu2_scale_matrix + * Pointer to Forward Quant Scale Matrix + * + * @param[in] u4_round_factor + * Quantization Round factor + * + * @param[out] pu1_nnz + * Total non-zero coefficients in the current sub-block + * + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ +void ih264_resi_trans_quant_4x4_sse42(UWORD8 *pu1_src, UWORD8 *pu1_pred, + WORD16 *pi2_out, WORD32 src_strd, WORD32 pred_strd, + const UWORD16 *pu2_scale_matrix, const UWORD16 *pu2_threshold_matrix, + UWORD32 u4_qbits, UWORD32 u4_round_factor, UWORD8 *pu1_nnz, + WORD16 *pi2_alt_dc_addr) +{ + WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0; + WORD32 mask0, mask1; + __m128i sum0, sum1, sum2, cmp0, cmp1; + __m128i rnd_fact = _mm_set1_epi32(u4_round_factor); + __m128i temp_2 = _mm_set1_epi16(2); + __m128i temp_1 = _mm_set1_epi16(1); + __m128i src_r0, src_r1, src_r2, src_r3; + __m128i pred_r0, pred_r1, pred_r2, pred_r3; + __m128i temp0, temp1, temp2, temp3; + __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero + __m128i sign_reg0, sign_reg2; + __m128i scalemat_r0_r1, scalemat_r2_r3; + scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row + scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row + src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 
a03 0 0 0 0 0 0 0 0 -- all 8 bits + src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[src_strd])); //a10 a11 a12 a13 0 0 0 0 0 0 0 0 -- all 8 bits + src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * src_strd])); //a20 a21 a22 a23 0 0 0 0 0 0 0 0 -- all 8 bits + src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * src_strd])); //a30 a31 a32 a33 0 0 0 0 0 0 0 0 -- all 8 bits + + src_r0 = _mm_cvtepu8_epi16(src_r0); + src_r1 = _mm_cvtepu8_epi16(src_r1); + src_r2 = _mm_cvtepu8_epi16(src_r2); + src_r3 = _mm_cvtepu8_epi16(src_r3); + + pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits + + pred_r0 = _mm_cvtepu8_epi16(pred_r0); //p00 p01 p02 p03 -- all 16 bits + pred_r1 = _mm_cvtepu8_epi16(pred_r1); //p10 p11 p12 p13 -- all 16 bits + pred_r2 = _mm_cvtepu8_epi16(pred_r2); //p20 p21 p22 p23 -- all 16 bits + pred_r3 = _mm_cvtepu8_epi16(pred_r3); //p30 p31 p32 p33 -- all 16 bits + + src_r0 = _mm_sub_epi16(src_r0, pred_r0); + src_r1 = _mm_sub_epi16(src_r1, pred_r1); + src_r2 = _mm_sub_epi16(src_r2, pred_r2); + src_r3 = _mm_sub_epi16(src_r3, pred_r3); + + /* Perform Forward transform */ + /*-------------------------------------------------------------*/ + /* DCT [ Horizontal transformation ] */ + /*-------------------------------------------------------------*/ + // Matrix transpose + /* + * a0 a1 a2 a3 + * b0 b1 b2 b3 + * c0 c1 c2 c3 + * d0 d1 d2 d3 + */ + temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 b0 a1 b1 a2 b2 a3 b3 + temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //c0 d0 c1 d1 c2 d2 c3 d3 + temp1 = _mm_unpacklo_epi32(temp0, temp2); //a0 b0 c0 d0 a1 b1 c1 d1 + temp3 = 
_mm_unpackhi_epi32(temp0, temp2); //a2 b2 c2 d2 a3 b3 c3 d3 + + src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 b0 c0 d0 + src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //a1 b1 c1 d1 + src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //a2 b2 c2 d2 + src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //a3 b3 c3 d3 + + /*----------------------------------------------------------*/ + /* x0 = z0 + z3 */ + temp0 = _mm_add_epi16(src_r0, src_r3); + /* x1 = z1 + z2 */ + temp1 = _mm_add_epi16(src_r1, src_r2); + /* x2 = z1 - z2 */ + temp2 = _mm_sub_epi16(src_r1, src_r2); + /* x3 = z0 - z3 */ + temp3 = _mm_sub_epi16(src_r0, src_r3); + + /* z0 = x0 + x1 */ + src_r0 = _mm_add_epi16(temp0, temp1); + /* z1 = (x3 << 1) + x2 */ + src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1) + src_r1 = _mm_add_epi16(src_r1, temp2); + /* z2 = x0 - x1 */ + src_r2 = _mm_sub_epi16(temp0, temp1); + /* z3 = x3 - (x2 << 1) */ + src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1) + src_r3 = _mm_sub_epi16(temp3, src_r3); + + // Matrix transpose + /* + * a0 b0 c0 d0 + * a1 b1 c1 d1 + * a2 b2 c2 d2 + * a3 b3 c3 d3 + */ + temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 a1 b0 b1 c0 c1 d0 d1 + temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //a2 a3 b2 b3 c2 c3 d2 d3 + temp1 = _mm_unpacklo_epi32(temp0, temp2); //a0 a1 a2 a3 b0 b1 b2 b3 + temp3 = _mm_unpackhi_epi32(temp0, temp2); //c0 c1 c2 c3 d0 d1 d2 d3 + + src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 a1 a2 a3 + src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //b0 b1 b2 b3 + src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //c0 c1 c2 c3 + src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //d0 d1 d2 d3 + + /*----------------------------------------------------------*/ + /* x0 = z0 + z3 */ + temp0 = _mm_add_epi16(src_r0, src_r3); + /* x1 = z1 + z2 */ + temp1 = _mm_add_epi16(src_r1, src_r2); + /* x2 = z1 - z2 */ + temp2 = _mm_sub_epi16(src_r1, src_r2); + /* x3 = z0 - z3 */ + temp3 = _mm_sub_epi16(src_r0, src_r3); + + /* z0 = x0 + x1 */ + src_r0 = _mm_add_epi16(temp0, 
temp1); + /* z1 = (x3 << 1) + x2 */ + src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1) + src_r1 = _mm_add_epi16(src_r1, temp2); + /* z2 = x0 - x1 */ + src_r2 = _mm_sub_epi16(temp0, temp1); + /* z3 = x3 - (x2 << 1) */ + src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1) + src_r3 = _mm_sub_epi16(temp3, src_r3); + + tmp_dc = _mm_extract_epi16(src_r0,0); //a0 + *pi2_alt_dc_addr = tmp_dc; + + src_r0 = _mm_unpacklo_epi64(src_r0, src_r1); //a0 a1 a2 a3 b0 b1 b2 b3 + src_r2 = _mm_unpacklo_epi64(src_r2, src_r3); //c0 c1 c2 c3 d0 d1 d2 d3 + sign_reg0 = _mm_cmpgt_epi16(zero_8x16b,src_r0); + sign_reg2 = _mm_cmpgt_epi16(zero_8x16b,src_r2); + + sign_reg0 = _mm_mullo_epi16(temp_2,sign_reg0); + sign_reg2 = _mm_mullo_epi16(temp_2,sign_reg2); + + sign_reg0 = _mm_add_epi16(temp_1,sign_reg0); + sign_reg2 = _mm_add_epi16(temp_1,sign_reg2); + + src_r0 = _mm_abs_epi16(src_r0); + src_r2 = _mm_abs_epi16(src_r2); + + src_r1 = _mm_srli_si128(src_r0, 8); + src_r0 = _mm_cvtepu16_epi32(src_r0); + src_r1 = _mm_cvtepu16_epi32(src_r1); + src_r3 = _mm_srli_si128(src_r2, 8); + src_r2 = _mm_cvtepu16_epi32(src_r2); + src_r3 = _mm_cvtepu16_epi32(src_r3); + + temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1); + scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8); + temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3); + scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8); + temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1); + temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3); + + temp0 = _mm_mullo_epi32(temp0, src_r0); + temp1 = _mm_mullo_epi32(temp1, src_r1); + temp2 = _mm_mullo_epi32(temp2, src_r2); + temp3 = _mm_mullo_epi32(temp3, src_r3); + + temp0 = _mm_add_epi32(temp0,rnd_fact); + temp1 = _mm_add_epi32(temp1,rnd_fact); + temp2 = _mm_add_epi32(temp2,rnd_fact); + temp3 = _mm_add_epi32(temp3,rnd_fact); + + temp0 = _mm_srli_epi32(temp0,u4_qbits); + temp1 = _mm_srli_epi32(temp1,u4_qbits); + temp2 = _mm_srli_epi32(temp2,u4_qbits); + temp3 = _mm_srli_epi32(temp3,u4_qbits); + + temp0 = _mm_packs_epi32 (temp0,temp1); + temp2 = _mm_packs_epi32 
(temp2,temp3); + + temp0 = _mm_sign_epi16(temp0, sign_reg0); + temp2 = _mm_sign_epi16(temp2, sign_reg2); + + _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0); + _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2); + + cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b); + cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b); + + mask0 = _mm_movemask_epi8(cmp0); + mask1 = _mm_movemask_epi8(cmp1); + u4_zero_coeff = 0; + if(mask0) + { + if(mask0 == 0xffff) + u4_zero_coeff+=8; + else + { + cmp0 = _mm_and_si128(temp_1, cmp0); + sum0 = _mm_hadd_epi16(cmp0, zero_8x16b); + sum1 = _mm_hadd_epi16(sum0, zero_8x16b); + sum2 = _mm_hadd_epi16(sum1, zero_8x16b); + u4_zero_coeff += _mm_cvtsi128_si32(sum2); + } + } + if(mask1) + { + if(mask1 == 0xffff) + u4_zero_coeff+=8; + else + { + cmp1 = _mm_and_si128(temp_1, cmp1); + sum0 = _mm_hadd_epi16(cmp1, zero_8x16b); + sum1 = _mm_hadd_epi16(sum0, zero_8x16b); + sum2 = _mm_hadd_epi16(sum1, zero_8x16b); + u4_zero_coeff += _mm_cvtsi128_si32(sum2); + } + } + + /* Return total nonzero coefficients in the current sub block */ + u4_nonzero_coeff = 16 - u4_zero_coeff; + *pu1_nnz = u4_nonzero_coeff; +} + +/** + ******************************************************************************* + * + * @brief + * This function performs forward transform and quantization on a 4*4 chroma block + * + * @par Description: + * The function accepts source buffer and estimation buffer. From these, it + * computes the residue. This is residue is then transformed and quantized. + * The transform and quantization are in placed computed. They use the residue + * buffer for this. 
+ * + * @param[in] pu1_src + * Pointer to source sub-block + * + * @param[in] pu1_pred + * Pointer to prediction sub-block + * + * @param[in] pi2_out + * Pointer to residual sub-block + * + * @param[in] src_strd + * Source stride + * + * @param[in] pred_strd + * Prediction stride + * + * @param[in] dst_strd + * Destination stride + * + * @param[in] u4_qbits + * QP_BITS_h264_4x4 + floor(QP/6) + * + * @param[in] pu2_threshold_matrix + * Pointer to Forward Quant Threshold Matrix + * + * @param[in] pu2_scale_matrix + * Pointer to Forward Quant Scale Matrix + * + * @param[in] u4_round_factor + * Quantization Round factor + * + * @param[out] pu1_nnz + * Total non-zero coefficients in the current sub-block + * + * @returns + * + * @remarks + * None + * + ******************************************************************************* + */ +void ih264_resi_trans_quant_chroma_4x4_sse42(UWORD8 *pu1_src,UWORD8 *pu1_pred,WORD16 *pi2_out, + WORD32 src_strd,WORD32 pred_strd, + const UWORD16 *pu2_scale_matrix, + const UWORD16 *pu2_threshold_matrix, + UWORD32 u4_qbits,UWORD32 u4_round_factor, + UWORD8 *pu1_nnz, WORD16 *pi2_alt_dc_addr) +{ + WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0; + WORD32 mask0, mask1; + __m128i cmp0, cmp1, sum0, sum1, sum2; + __m128i rnd_fact = _mm_set1_epi32(u4_round_factor); + __m128i temp_2 = _mm_set1_epi16(2); + __m128i temp_1 = _mm_set1_epi16(1); + __m128i src_r0, src_r1, src_r2, src_r3; + __m128i pred_r0, pred_r1, pred_r2, pred_r3; + __m128i temp0, temp1, temp2, temp3; + __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero + __m128i sign_reg0, sign_reg2; + __m128i scalemat_r0_r1, scalemat_r2_r3; + __m128i chroma_mask = _mm_set1_epi16 (0xFF); + + scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row + scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row + src_r0 = 
_mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits + src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[src_strd])); //a10 a11 a12 a13 0 0 0 0 0 0 0 0 -- all 8 bits + src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * src_strd])); //a20 a21 a22 a23 0 0 0 0 0 0 0 0 -- all 8 bits + src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * src_strd])); //a30 a31 a32 a33 0 0 0 0 0 0 0 0 -- all 8 bits + + src_r0 = _mm_and_si128(src_r0, chroma_mask); + src_r1 = _mm_and_si128(src_r1, chroma_mask); + src_r2 = _mm_and_si128(src_r2, chroma_mask); + src_r3 = _mm_and_si128(src_r3, chroma_mask); +// src_r0 = _mm_cvtepu8_epi16(src_r0); +// src_r1 = _mm_cvtepu8_epi16(src_r1); +// src_r2 = _mm_cvtepu8_epi16(src_r2); +// src_r3 = _mm_cvtepu8_epi16(src_r3); + + pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits + + pred_r0 = _mm_and_si128(pred_r0, chroma_mask); + pred_r1 = _mm_and_si128(pred_r1, chroma_mask); + pred_r2 = _mm_and_si128(pred_r2, chroma_mask); + pred_r3 = _mm_and_si128(pred_r3, chroma_mask); +// pred_r0 = _mm_cvtepu8_epi16(pred_r0); //p00 p01 p02 p03 -- all 16 bits +// pred_r1 = _mm_cvtepu8_epi16(pred_r1); //p10 p11 p12 p13 -- all 16 bits +// pred_r2 = _mm_cvtepu8_epi16(pred_r2); //p20 p21 p22 p23 -- all 16 bits +// pred_r3 = _mm_cvtepu8_epi16(pred_r3); //p30 p31 p32 p33 -- all 16 bits + + src_r0 = _mm_sub_epi16(src_r0, pred_r0); + src_r1 = _mm_sub_epi16(src_r1, pred_r1); + src_r2 = _mm_sub_epi16(src_r2, pred_r2); + src_r3 = _mm_sub_epi16(src_r3, pred_r3); + + /* Perform Forward transform */ + 
/*-------------------------------------------------------------*/ + /* DCT [ Horizontal transformation ] */ + /*-------------------------------------------------------------*/ + // Matrix transpose + /* + * a0 a1 a2 a3 + * b0 b1 b2 b3 + * c0 c1 c2 c3 + * d0 d1 d2 d3 + */ + temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 b0 a1 b1 a2 b2 a3 b3 + temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //c0 d0 c1 d1 c2 d2 c3 d3 + temp1 = _mm_unpacklo_epi32(temp0, temp2); //a0 b0 c0 d0 a1 b1 c1 d1 + temp3 = _mm_unpackhi_epi32(temp0, temp2); //a2 b2 c2 d2 a3 b3 c3 d3 + + src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 b0 c0 d0 + src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //a1 b1 c1 d1 + src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //a2 b2 c2 d2 + src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //a3 b3 c3 d3 + + /*----------------------------------------------------------*/ + /* x0 = z0 + z3 */ + temp0 = _mm_add_epi16(src_r0, src_r3); + /* x1 = z1 + z2 */ + temp1 = _mm_add_epi16(src_r1, src_r2); + /* x2 = z1 - z2 */ + temp2 = _mm_sub_epi16(src_r1, src_r2); + /* x3 = z0 - z3 */ + temp3 = _mm_sub_epi16(src_r0, src_r3); + + /* z0 = x0 + x1 */ + src_r0 = _mm_add_epi16(temp0, temp1); + /* z1 = (x3 << 1) + x2 */ + src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1) + src_r1 = _mm_add_epi16(src_r1, temp2); + /* z2 = x0 - x1 */ + src_r2 = _mm_sub_epi16(temp0, temp1); + /* z3 = x3 - (x2 << 1) */ + src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1) + src_r3 = _mm_sub_epi16(temp3, src_r3); + + // Matrix transpose + /* + * a0 b0 c0 d0 + * a1 b1 c1 d1 + * a2 b2 c2 d2 + * a3 b3 c3 d3 + */ + temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 a1 b0 b1 c0 c1 d0 d1 + temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //a2 a3 b2 b3 c2 c3 d2 d3 + temp1 = _mm_unpacklo_epi32(temp0, temp2); //a0 a1 a2 a3 b0 b1 b2 b3 + temp3 = _mm_unpackhi_epi32(temp0, temp2); //c0 c1 c2 c3 d0 d1 d2 d3 + + src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 a1 a2 a3 + src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //b0 b1 b2 b3 + 
src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //c0 c1 c2 c3 + src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //d0 d1 d2 d3 + + /*----------------------------------------------------------*/ + /* x0 = z0 + z3 */ + temp0 = _mm_add_epi16(src_r0, src_r3); + /* x1 = z1 + z2 */ + temp1 = _mm_add_epi16(src_r1, src_r2); + /* x2 = z1 - z2 */ + temp2 = _mm_sub_epi16(src_r1, src_r2); + /* x3 = z0 - z3 */ + temp3 = _mm_sub_epi16(src_r0, src_r3); + + /* z0 = x0 + x1 */ + src_r0 = _mm_add_epi16(temp0, temp1); + /* z1 = (x3 << 1) + x2 */ + src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1) + src_r1 = _mm_add_epi16(src_r1, temp2); + /* z2 = x0 - x1 */ + src_r2 = _mm_sub_epi16(temp0, temp1); + /* z3 = x3 - (x2 << 1) */ + src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1) + src_r3 = _mm_sub_epi16(temp3, src_r3); + + tmp_dc = _mm_extract_epi16(src_r0,0); //a0 + *pi2_alt_dc_addr = tmp_dc; + + src_r0 = _mm_unpacklo_epi64(src_r0, src_r1); //a0 a1 a2 a3 b0 b1 b2 b3 + src_r2 = _mm_unpacklo_epi64(src_r2, src_r3); //c0 c1 c2 c3 d0 d1 d2 d3 + sign_reg0 = _mm_cmpgt_epi16(zero_8x16b,src_r0); + sign_reg2 = _mm_cmpgt_epi16(zero_8x16b,src_r2); + + sign_reg0 = _mm_mullo_epi16(temp_2,sign_reg0); + sign_reg2 = _mm_mullo_epi16(temp_2,sign_reg2); + + sign_reg0 = _mm_add_epi16(temp_1,sign_reg0); + sign_reg2 = _mm_add_epi16(temp_1,sign_reg2); + + src_r0 = _mm_abs_epi16(src_r0); + src_r2 = _mm_abs_epi16(src_r2); + + src_r1 = _mm_srli_si128(src_r0, 8); + src_r0 = _mm_cvtepu16_epi32(src_r0); + src_r1 = _mm_cvtepu16_epi32(src_r1); + src_r3 = _mm_srli_si128(src_r2, 8); + src_r2 = _mm_cvtepu16_epi32(src_r2); + src_r3 = _mm_cvtepu16_epi32(src_r3); + + temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1); + scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8); + temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3); + scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8); + temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1); + temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3); + + temp0 = _mm_mullo_epi32(temp0, src_r0); + temp1 = _mm_mullo_epi32(temp1, 
src_r1); + temp2 = _mm_mullo_epi32(temp2, src_r2); + temp3 = _mm_mullo_epi32(temp3, src_r3); + + temp0 = _mm_add_epi32(temp0,rnd_fact); + temp1 = _mm_add_epi32(temp1,rnd_fact); + temp2 = _mm_add_epi32(temp2,rnd_fact); + temp3 = _mm_add_epi32(temp3,rnd_fact); + + temp0 = _mm_srli_epi32(temp0,u4_qbits); + temp1 = _mm_srli_epi32(temp1,u4_qbits); + temp2 = _mm_srli_epi32(temp2,u4_qbits); + temp3 = _mm_srli_epi32(temp3,u4_qbits); + + temp0 = _mm_packs_epi32 (temp0,temp1); + temp2 = _mm_packs_epi32 (temp2,temp3); + + temp0 = _mm_sign_epi16(temp0, sign_reg0); + temp2 = _mm_sign_epi16(temp2, sign_reg2); + + //temp0 = _mm_insert_epi16(temp0, tmp_dc, 0); + + _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0); + _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2); + + cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b); + cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b); + + mask0 = _mm_movemask_epi8(cmp0); + mask1 = _mm_movemask_epi8(cmp1); + u4_zero_coeff = 0; + if(mask0) + { + if(mask0 == 0xffff) + u4_zero_coeff+=8; + else + { + cmp0 = _mm_and_si128(temp_1, cmp0); + sum0 = _mm_hadd_epi16(cmp0, zero_8x16b); + sum1 = _mm_hadd_epi16(sum0, zero_8x16b); + sum2 = _mm_hadd_epi16(sum1, zero_8x16b); + u4_zero_coeff += _mm_cvtsi128_si32(sum2); + } + } + if(mask1) + { + if(mask1 == 0xffff) + u4_zero_coeff+=8; + else + { + cmp1 = _mm_and_si128(temp_1, cmp1); + sum0 = _mm_hadd_epi16(cmp1, zero_8x16b); + sum1 = _mm_hadd_epi16(sum0, zero_8x16b); + sum2 = _mm_hadd_epi16(sum1, zero_8x16b); + u4_zero_coeff += _mm_cvtsi128_si32(sum2); + } + } + + /* Return total nonzero coefficients in the current sub block */ + u4_nonzero_coeff = 16 - u4_zero_coeff; + *pu1_nnz = u4_nonzero_coeff; + +} + + +/** + ******************************************************************************* + * + * @brief + * This function performs forward hadamard transform and quantization on a 4*4 block + * + * @par Description: + * The function accepts source buffer and estimation buffer. From these, it + * computes the residue. 
This is residue is then transformed and quantized. + * The transform and quantization are in placed computed. They use the residue + * buffer for this. + * + * @param[in] pu1_src + * Pointer to source sub-block + * + * @param[in] pu1_pred + * Pointer to prediction sub-block + * + * @param[in] pi2_out + * Pointer to residual sub-block + * + * @param[in] src_strd + * Source stride + * + * @param[in] pred_strd + * Prediction stride + * + * @param[in] dst_strd + * Destination stride + * + * @param[in] u4_qbits + * QP_BITS_h264_4x4 + floor(QP/6) + * + * @param[in] pu2_threshold_matrix + * Pointer to Forward Quant Threshold Matrix + * + * @param[in] pu2_scale_matrix + * Pointer to Forward Quant Scale Matrix + * + * @param[in] u4_round_factor + * Quantization Round factor + * + * @param[out] pu1_nnz + * Total non-zero coefficients in the current sub-block + * + * @returns + * + * @remarks + * None + * + */ + +void ih264_hadamard_quant_4x4_sse42(WORD16 *pi2_src, WORD16 *pi2_dst, + const UWORD16 *pu2_scale_matrix, + const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits, + UWORD32 u4_round_factor,UWORD8 *pu1_nnz + ) +{ + WORD32 u4_zero_coeff,u4_nonzero_coeff=0; + __m128i cmp0, cmp1, sum0, sum1, sum2; + WORD32 mask0, mask1; + __m128i src_r0_r1, src_r2_r3, sign_reg; + __m128i src_r0, src_r1, src_r2, src_r3; + __m128i zero_8x16b = _mm_setzero_si128(); + __m128i temp0, temp1, temp2, temp3; + __m128i sign_reg0, sign_reg1, sign_reg2, sign_reg3; + __m128i temp_1 = _mm_set1_epi16(1); + __m128i rnd_fact = _mm_set1_epi32(u4_round_factor); + __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]); + + src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row + src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row + sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1); + src_r0 = _mm_unpacklo_epi16(src_r0_r1, sign_reg); //a0 a1 a2 a3 + src_r1 = 
_mm_unpackhi_epi16(src_r0_r1, sign_reg); //b0 b1 b2 b3 + sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r2_r3); + src_r2 = _mm_unpacklo_epi16(src_r2_r3, sign_reg); //c0 c1 c2 c3 + src_r3 = _mm_unpackhi_epi16(src_r2_r3, sign_reg); //d0 d1 d2 d3 + + /* Perform Inverse transform */ + /*-------------------------------------------------------------*/ + /* Forward DC transform [ Horizontal transformation ] */ + /*-------------------------------------------------------------*/ + // Matrix transpose + /* + * a0 a1 a2 a3 + * b0 b1 b2 b3 + * c0 c1 c2 c3 + * d0 d1 d2 d3 + */ + temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 b0 a1 b1 + temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //c0 d0 c1 d1 + temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //a2 b2 a3 b3 + temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 d2 c3 d3 + src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 b0 c0 d0 + src_r1 = _mm_unpackhi_epi64(temp0, temp2); //a1 b1 c1 d1 + src_r2 = _mm_unpacklo_epi64(temp1, temp3); //a2 b2 c2 d2 + src_r3 = _mm_unpackhi_epi64(temp1, temp3); //a3 b3 c3 d3 + + temp0 = _mm_add_epi32(src_r0, src_r3); + temp1 = _mm_add_epi32(src_r1, src_r2); + temp2 = _mm_sub_epi32(src_r1, src_r2); + temp3 = _mm_sub_epi32(src_r0, src_r3); + + src_r0 = _mm_add_epi32(temp0, temp1); + src_r1 = _mm_add_epi32(temp2, temp3); + src_r2 = _mm_sub_epi32(temp0, temp1); + src_r3 = _mm_sub_epi32(temp3, temp2); + + /*-------------------------------------------------------------*/ + /* Forward DC transform [ Vertical transformation ] */ + /*-------------------------------------------------------------*/ + // Matrix transpose + /* + * a0 b0 c0 d0 + * a1 b1 c1 d1 + * a2 b2 c2 d2 + * a3 b3 c3 d3 + */ + temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 a1 b0 b1 + temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //a2 a3 b2 b3 + temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //c0 c1 d0 d1 + temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 c3 d2 d3 + src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 a1 a2 a3 + src_r1 = _mm_unpackhi_epi64(temp0, 
temp2); //b0 b1 b2 b3 + src_r2 = _mm_unpacklo_epi64(temp1, temp3); //c0 c1 c2 c3 + src_r3 = _mm_unpackhi_epi64(temp1, temp3); //d0 d1 d2 d3 + + temp0 = _mm_add_epi32(src_r0, src_r3); + temp1 = _mm_add_epi32(src_r1, src_r2); + temp2 = _mm_sub_epi32(src_r1, src_r2); + temp3 = _mm_sub_epi32(src_r0, src_r3); + + src_r0 = _mm_add_epi32(temp0, temp1); + src_r1 = _mm_add_epi32(temp2, temp3); + src_r2 = _mm_sub_epi32(temp0, temp1); + src_r3 = _mm_sub_epi32(temp3, temp2); + + src_r0 = _mm_srai_epi32(src_r0, 1); + src_r1 = _mm_srai_epi32(src_r1, 1); + src_r2 = _mm_srai_epi32(src_r2, 1); + src_r3 = _mm_srai_epi32(src_r3, 1); + + // Quantization + sign_reg0 = _mm_cmpgt_epi32(zero_8x16b, src_r0); //Find sign of each value for later restoration + sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, src_r1); + sign_reg2 = _mm_cmpgt_epi32(zero_8x16b, src_r2); + sign_reg3 = _mm_cmpgt_epi32(zero_8x16b, src_r3); + + sign_reg0 = _mm_packs_epi32(sign_reg0, sign_reg1); //Sign = -1 or 0 depending on <0 or >0 respectively + sign_reg2 = _mm_packs_epi32(sign_reg2, sign_reg3); + + sign_reg0 = _mm_slli_epi16(sign_reg0, 1); //Sign = -2 or 0 depending on <0 or >0 respectively + sign_reg2 = _mm_slli_epi16(sign_reg2, 1); + + sign_reg0 = _mm_add_epi16(temp_1,sign_reg0); //Sign = -1 or 1 depending on <0 or >0 respectively + sign_reg2 = _mm_add_epi16(temp_1,sign_reg2); + + src_r0 = _mm_abs_epi32(src_r0); //Absolute values + src_r1 = _mm_abs_epi32(src_r1); + src_r2 = _mm_abs_epi32(src_r2); + src_r3 = _mm_abs_epi32(src_r3); + + temp0 = _mm_mullo_epi32(scale_val, src_r0); //multiply by pu2_scale_matrix[0] + temp1 = _mm_mullo_epi32(scale_val, src_r1); + temp2 = _mm_mullo_epi32(scale_val, src_r2); + temp3 = _mm_mullo_epi32(scale_val, src_r3); + + temp0 = _mm_add_epi32(temp0,rnd_fact); //Add round factor + temp1 = _mm_add_epi32(temp1,rnd_fact); + temp2 = _mm_add_epi32(temp2,rnd_fact); + temp3 = _mm_add_epi32(temp3,rnd_fact); + + temp0 = _mm_srli_epi32(temp0,u4_qbits); //RIght shift by qbits, unsigned variable, so 
shift right immediate works + temp1 = _mm_srli_epi32(temp1,u4_qbits); + temp2 = _mm_srli_epi32(temp2,u4_qbits); + temp3 = _mm_srli_epi32(temp3,u4_qbits); + + temp0 = _mm_packs_epi32 (temp0,temp1); //Final values are 16-bits only. + temp2 = _mm_packs_epi32 (temp2,temp3); + + temp0 = _mm_sign_epi16(temp0, sign_reg0); //Sign restoration + temp2 = _mm_sign_epi16(temp2, sign_reg2); + + _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0); + _mm_storeu_si128((__m128i *) (&pi2_dst[8]), temp2); + + cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b); + cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b); + + mask0 = _mm_movemask_epi8(cmp0); + mask1 = _mm_movemask_epi8(cmp1); + u4_zero_coeff = 0; + if(mask0) + { + if(mask0 == 0xffff) + u4_zero_coeff+=8; + else + { + cmp0 = _mm_and_si128(temp_1, cmp0); + sum0 = _mm_hadd_epi16(cmp0, zero_8x16b); + sum1 = _mm_hadd_epi16(sum0, zero_8x16b); + sum2 = _mm_hadd_epi16(sum1, zero_8x16b); + u4_zero_coeff += _mm_cvtsi128_si32(sum2); + } + } + if(mask1) + { + if(mask1 == 0xffff) + u4_zero_coeff+=8; + else + { + cmp1 = _mm_and_si128(temp_1, cmp1); + sum0 = _mm_hadd_epi16(cmp1, zero_8x16b); + sum1 = _mm_hadd_epi16(sum0, zero_8x16b); + sum2 = _mm_hadd_epi16(sum1, zero_8x16b); + u4_zero_coeff += _mm_cvtsi128_si32(sum2); + } + } + + /* Return total nonzero coefficients in the current sub block */ + u4_nonzero_coeff = 16 - u4_zero_coeff; + pu1_nnz[0] = u4_nonzero_coeff; +} + + +/** + ******************************************************************************* + * + * @brief + * This function performs forward hadamard transform and quantization on a 2*2 block + * for both U and V planes + * + * @par Description: + * The function accepts source buffer and estimation buffer. From these, it + * computes the residue. This is residue is then transformed and quantized. + * The transform and quantization are in placed computed. They use the residue + * buffer for this. 
+ * + * @param[in] pu1_src + * Pointer to source sub-block + * + * @param[in] pu1_pred + * Pointer to prediction sub-block + * + * @param[in] pi2_out + * Pointer to residual sub-block + * + * @param[in] src_strd + * Source stride + * + * @param[in] pred_strd + * Prediction stride + * + * @param[in] dst_strd + * Destination stride + * + * @param[in] u4_qbits + * QP_BITS_h264_4x4 + floor(QP/6) + * + * @param[in] pu2_threshold_matrix + * Pointer to Forward Quant Threshold Matrix + * + * @param[in] pu2_scale_matrix + * Pointer to Forward Quant Scale Matrix + * + * @param[in] u4_round_factor + * Quantization Round factor + * + * @param[out] pu1_nnz + * Total non-zero coefficients in the current sub-block + * + * @returns + * + * @remarks + * NNZ for dc is populated at 0 and 5th position of pu1_nnz + * + */ + +void ih264_hadamard_quant_2x2_uv_sse42(WORD16 *pi2_src, WORD16 *pi2_dst, + const UWORD16 *pu2_scale_matrix, + const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits, + UWORD32 u4_round_factor,UWORD8 *pu1_nnz) +{ + WORD32 val, nonzero_coeff_0, nonzero_coeff_1=0; + nonzero_coeff_0 = 0; + __m128i cmp, cmp0, cmp1; + __m128i sum0, sum1; + WORD32 mask, mask0, mask1; + __m128i src, plane_0, plane_1, temp0, temp1, sign_reg; + __m128i zero_8x16b = _mm_setzero_si128(); + __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]); + __m128i sign_reg0, sign_reg1; + __m128i temp_1 = _mm_set1_epi16(1); + __m128i rnd_fact = _mm_set1_epi32(u4_round_factor); + + src = _mm_loadu_si128((__m128i *)pi2_src); //a0 a1 a2 a3 b0 b1 b2 b3 + sign_reg = _mm_cmpgt_epi16(zero_8x16b, src); + plane_0 = _mm_unpacklo_epi16(src, sign_reg); //a0 a1 a2 a3 -- 32 bits + plane_1 = _mm_unpackhi_epi16(src, sign_reg); //b0 b1 b2 b3 -- 32 bits + + temp0 = _mm_hadd_epi32(plane_0, plane_1); //a0+a1 a2+a3 b0+b1 b2+b3 + temp1 = _mm_hsub_epi32(plane_0, plane_1); //a0-a1 a2-a3 b0-b1 b2-b3 + + plane_0 = _mm_hadd_epi32(temp0, temp1); //a0+a1+a2+a3 b0+b1+b2+b3 a0-a1+a2-a3 b0-b1+b2-b3 + plane_1 = _mm_hsub_epi32(temp0, 
temp1); //a0+a1-a2-a3 b0+b1-b2-b3 a0-a1-a2+a3 b0-b1-b2+b3 + + temp0 = _mm_unpacklo_epi32(plane_0, plane_1); //a0+a1+a2+a3 a0+a1-a2-a3 b0+b1+b2+b3 b0+b1-b2-b3 + temp1 = _mm_unpackhi_epi32(plane_0, plane_1); //a0-a1+a2-a3 a0-a1-a2+a3 b0-b1+b2-b3 b0-b1-b2+b3 + + plane_0 = _mm_unpacklo_epi64(temp0, temp1); //a0+a1+a2+a3 a0+a1-a2-a3 a0-a1+a2-a3 a0-a1-a2+a3 + plane_1 = _mm_unpackhi_epi64(temp0, temp1); //b0+b1+b2+b3 b0+b1-b2-b3 b0-b1+b2-b3 b0-b1-b2+b3 + + plane_0 = _mm_shuffle_epi32(plane_0, 0xd8); //a0+a1+a2+a3 a0-a1+a2-a3 a0+a1-a2-a3 a0-a1-a2+a3 + plane_1 = _mm_shuffle_epi32(plane_1, 0xd8); //b0+b1+b2+b3 b0-b1+b2-b3 b0+b1-b2-b3 b0-b1-b2+b3 + // Quantization + sign_reg0 = _mm_cmpgt_epi32(zero_8x16b, plane_0); //Find sign of each value for later restoration + sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, plane_1); + + sign_reg0 = _mm_packs_epi32(sign_reg0, sign_reg1); //Sign = -1 or 0 depending on <0 or >0 respectively + sign_reg0 = _mm_slli_epi16(sign_reg0, 1); //Sign = -2 or 0 depending on <0 or >0 respectively + sign_reg0 = _mm_add_epi16(temp_1,sign_reg0); //Sign = -1 or 1 depending on <0 or >0 respectively + + plane_0 = _mm_abs_epi32(plane_0); //Absolute values + plane_1 = _mm_abs_epi32(plane_1); + + temp0 = _mm_mullo_epi32(scale_val, plane_0); //multiply by pu2_scale_matrix[0] + temp1 = _mm_mullo_epi32(scale_val, plane_1); //multiply by pu2_scale_matrix[0] + + temp0 = _mm_add_epi32(temp0,rnd_fact); //Add round factor + temp1 = _mm_add_epi32(temp1,rnd_fact); + + temp0 = _mm_srli_epi32(temp0,u4_qbits); //RIght shift by qbits, unsigned variable, so shift right immediate works + temp1 = _mm_srli_epi32(temp1,u4_qbits); + + temp0 = _mm_packs_epi32 (temp0,temp1); //Final values are 16-bits only. 
+ temp0 = _mm_sign_epi16(temp0, sign_reg0); //Sign restoration + + _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0); + + cmp = _mm_cmpeq_epi16(temp0, zero_8x16b); + mask = _mm_movemask_epi8(cmp); + mask0 = mask & 0xff; + mask1 = mask>>8; + if(mask0) + { + if(mask0 == 0xff) + nonzero_coeff_0 += 4; + else + { + cmp0 = _mm_and_si128(temp_1, cmp); + sum0 = _mm_hadd_epi16(cmp0, zero_8x16b); + sum1 = _mm_hadd_epi16(sum0, zero_8x16b); + val = _mm_cvtsi128_si32(sum1); + val = val & 0xffff; + nonzero_coeff_0 += val; + } + } + if(mask1) + { + if(mask1 == 0xff) + nonzero_coeff_1 += 4; + else + { + cmp1 = _mm_srli_si128(cmp, 8); + cmp1 = _mm_and_si128(temp_1, cmp1); + sum0 = _mm_hadd_epi16(cmp1, zero_8x16b); + sum1 = _mm_hadd_epi16(sum0, zero_8x16b); + nonzero_coeff_1 += _mm_cvtsi128_si32(sum1); + } + } + + pu1_nnz[0] = 4 - nonzero_coeff_0; + pu1_nnz[1] = 4 - nonzero_coeff_1; + +} diff --git a/common/x86/ih264_weighted_pred_sse42.c b/common/x86/ih264_weighted_pred_sse42.c new file mode 100755 index 0000000..b1684b7 --- /dev/null +++ b/common/x86/ih264_weighted_pred_sse42.c @@ -0,0 +1,1349 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/*****************************************************************************/ +/* */ +/* File Name : ih264_weighted_pred_intr_sse42.c */ +/* */ +/* Description : Contains function definitions for weighted */ +/* prediction functions in x86 sse4 intrinsics */ +/* */ +/* List of Functions : ih264_default_weighted_pred_luma_sse42() */ +/* ih264_default_weighted_pred_chroma_sse42() */ +/* ih264_weighted_pred_luma_sse42() */ +/* ih264_weighted_pred_chroma_sse42() */ +/* ih264_weighted_bipred_luma_sse42() */ +/* ih264_weighted_bipred_chroma_sse42() */ +/* */ +/* Issues / Problems : None */ +/* */ +/* Revision History : */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 30 01 2015 Kaushik Initial version */ +/* Senthoor */ +/* */ +/*****************************************************************************/ +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +#include <immintrin.h> +#include "ih264_typedefs.h" +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include "ih264_weighted_pred.h" + +/*****************************************************************************/ +/* Function definitions . */ +/*****************************************************************************/ +/*****************************************************************************/ +/* */ +/* Function Name : ih264_default_weighted_pred_luma_sse42 */ +/* */ +/* Description : This function performs the default weighted prediction */ +/* as described in sec 8.4.2.3.1 titled "Default weighted */ +/* sample prediction process" for luma. The function gets */ +/* two ht x wd blocks, calculates their rounded-average and */ +/* stores it in the destination block. (ht,wd) can be */ +/* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). 
*/ +/* */ +/* Inputs : pu1_src1 - Pointer to source 1 */ +/* pu1_src2 - Pointer to source 2 */ +/* pu1_dst - Pointer to destination */ +/* src_strd1 - stride for source 1 */ +/* src_strd1 - stride for source 2 */ +/* dst_strd - stride for destination */ +/* ht - height of the block */ +/* wd - width of the block */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 04 02 2015 Kaushik Initial Version */ +/* Senthoor */ +/* */ +/*****************************************************************************/ +void ih264_default_weighted_pred_luma_sse42(UWORD8 *pu1_src1, + UWORD8 *pu1_src2, + UWORD8 *pu1_dst, + WORD32 src_strd1, + WORD32 src_strd2, + WORD32 dst_strd, + WORD32 ht, + WORD32 wd) +{ + __m128i y0_0_16x8b, y0_1_16x8b, y0_2_16x8b, y0_3_16x8b; + __m128i y1_0_16x8b, y1_1_16x8b, y1_2_16x8b, y1_3_16x8b; + + if(wd == 4) + { + __m128i mask_full_16x8b, mask_ll4B_16x8b; + + mask_full_16x8b = _mm_set1_epi8(0xff); + mask_ll4B_16x8b = _mm_srli_si128(mask_full_16x8b, 12); + // mask for first four bytes + + do + { + y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); + y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1)); + y0_2_16x8b = _mm_loadl_epi64( + (__m128i *)(pu1_src1 + (src_strd1 << 1))); + y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3)); + + y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2); + y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2)); + y1_2_16x8b = _mm_loadl_epi64( + (__m128i *)(pu1_src2 + (src_strd2 << 1))); + y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3)); + + y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b); + y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b); + y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b); + y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b); + + _mm_maskmoveu_si128(y0_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst); + _mm_maskmoveu_si128(y0_1_16x8b, mask_ll4B_16x8b, + (char*)(pu1_dst + dst_strd)); + 
_mm_maskmoveu_si128(y0_2_16x8b, mask_ll4B_16x8b, + (char*)(pu1_dst + (dst_strd << 1))); + _mm_maskmoveu_si128(y0_3_16x8b, mask_ll4B_16x8b, + (char*)(pu1_dst + dst_strd * 3)); + + ht -= 4; + pu1_src1 += src_strd1 << 2; + pu1_src2 += src_strd2 << 2; + pu1_dst += dst_strd << 2; + } + while(ht > 0); + } + else if(wd == 8) + { + do + { + y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); + y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1)); + y0_2_16x8b = _mm_loadl_epi64( + (__m128i *)(pu1_src1 + (src_strd1 << 1))); + y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3)); + + y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2); + y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2)); + y1_2_16x8b = _mm_loadl_epi64( + (__m128i *)(pu1_src2 + (src_strd2 << 1))); + y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3)); + + y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b); + y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b); + y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b); + y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b); + + _mm_storel_epi64((__m128i *)pu1_dst, y0_0_16x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b); + + ht -= 4; + pu1_src1 += src_strd1 << 2; + pu1_src2 += src_strd2 << 2; + pu1_dst += dst_strd << 2; + } + while(ht > 0); + } + else // wd == 16 + { + __m128i y0_4_16x8b, y0_5_16x8b, y0_6_16x8b, y0_7_16x8b; + __m128i y1_4_16x8b, y1_5_16x8b, y1_6_16x8b, y1_7_16x8b; + + do + { + y0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1); + y0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1)); + y0_2_16x8b = _mm_loadu_si128( + (__m128i *)(pu1_src1 + (src_strd1 << 1))); + y0_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 3)); + y0_4_16x8b = _mm_loadu_si128( + (__m128i *)(pu1_src1 + (src_strd1 << 2))); + y0_5_16x8b = 
_mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 5)); + y0_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 6)); + y0_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 7)); + + y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2); + y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2)); + y1_2_16x8b = _mm_loadu_si128( + (__m128i *)(pu1_src2 + (src_strd2 << 1))); + y1_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 3)); + y1_4_16x8b = _mm_loadu_si128( + (__m128i *)(pu1_src2 + (src_strd2 << 2))); + y1_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 5)); + y1_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 6)); + y1_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 7)); + + y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b); + y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b); + y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b); + y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b); + y0_4_16x8b = _mm_avg_epu8(y0_4_16x8b, y1_4_16x8b); + y0_5_16x8b = _mm_avg_epu8(y0_5_16x8b, y1_5_16x8b); + y0_6_16x8b = _mm_avg_epu8(y0_6_16x8b, y1_6_16x8b); + y0_7_16x8b = _mm_avg_epu8(y0_7_16x8b, y1_7_16x8b); + + _mm_storeu_si128((__m128i *)pu1_dst, y0_0_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 2)), y0_4_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 5), y0_5_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 6), y0_6_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 7), y0_7_16x8b); + + ht -= 8; + pu1_src1 += src_strd1 << 3; + pu1_src2 += src_strd2 << 3; + pu1_dst += dst_strd << 3; + } + while(ht > 0); + } +} + +/*****************************************************************************/ +/* */ +/* Function Name : 
ih264_default_weighted_pred_chroma_sse42                 */
/*                                                                           */
/*  Description   : This function performs the default weighted prediction   */
/*                  as described in sec 8.4.2.3.1 titled "Default weighted   */
/*                  sample prediction process" for chroma. The function gets */
/*                  two ht x wd blocks, calculates their rounded-average and */
/*                  stores it in the destination block. (ht,wd) can be       */
/*                  (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8).       */
/*                  wd counts interleaved U,V pairs: the wd == 8 branch      */
/*                  below reads/writes 16 bytes per row.                     */
/*                                                                           */
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
/*                  pu1_src2  - Pointer to source 2                          */
/*                  pu1_dst   - Pointer to destination                       */
/*                  src_strd1 - stride for source 1                          */
/*                  src_strd2 - stride for source 2                          */
/*                  dst_strd  - stride for destination                       */
/*                  ht        - height of the block                          */
/*                  wd        - width of the block                           */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         04 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_default_weighted_pred_chroma_sse42(UWORD8 *pu1_src1,
                                              UWORD8 *pu1_src2,
                                              UWORD8 *pu1_dst,
                                              WORD32 src_strd1,
                                              WORD32 src_strd2,
                                              WORD32 dst_strd,
                                              WORD32 ht,
                                              WORD32 wd)
{
    __m128i uv0_0_16x8b, uv0_1_16x8b;
    __m128i uv1_0_16x8b, uv1_1_16x8b;

    if(wd == 2)
    {
        __m128i mask_full_16x8b, mask_ll4B_16x8b;

        /* Only 4 bytes (two U,V pairs) are valid per row; build a mask so
           the masked store touches just those bytes. */
        mask_full_16x8b = _mm_set1_epi8(0xff);
        mask_ll4B_16x8b = _mm_srli_si128(mask_full_16x8b, 12);
        // mask for first four bytes

        /* Two rows per iteration; _mm_avg_epu8 gives (a + b + 1) >> 1,
           the rounded average required by the default prediction. */
        do
        {
            uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);

            /* Masked store writes only the low 4 bytes of each row. */
            _mm_maskmoveu_si128(uv0_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst);
            _mm_maskmoveu_si128(uv0_1_16x8b, mask_ll4B_16x8b,
                                (char*)(pu1_dst + dst_strd));

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 4)
    {
        /* 8 valid bytes per row: plain 64-bit loads/stores suffice. */
        do
        {
            uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);

            _mm_storel_epi64((__m128i *)pu1_dst, uv0_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 8
    {
        __m128i uv0_2_16x8b, uv0_3_16x8b;
        __m128i uv1_2_16x8b, uv1_3_16x8b;

        /* Full 16-byte rows; process four rows per iteration. */
        do
        {
            uv0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            uv0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            uv0_2_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            uv0_3_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src1 + src_strd1 * 3));

            uv1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            uv1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
            uv1_2_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            uv1_3_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src2 + src_strd2 * 3));

            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
            uv0_2_16x8b = _mm_avg_epu8(uv0_2_16x8b, uv1_2_16x8b);
            uv0_3_16x8b = _mm_avg_epu8(uv0_3_16x8b, uv1_3_16x8b);

            _mm_storeu_si128((__m128i *)pu1_dst, uv0_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);
            _mm_storeu_si128(
                (__m128i *)(pu1_dst + (dst_strd << 1)), uv0_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), uv0_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd <<
2; + } + while(ht > 0); + } +} + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_weighted_pred_luma_sse42 */ +/* */ +/* Description : This function performs the weighted prediction as */ +/* described in sec 8.4.2.3.2 titled "Weighted sample */ +/* prediction process" for luma. The function gets one */ +/* ht x wd block, weights it, rounds it off, offsets it, */ +/* saturates it to unsigned 8-bit and stores it in the */ +/* destination block. (ht,wd) can be (4,4), (8,4), (4,8), */ +/* (8,8), (16,8), (8,16) or (16,16). */ +/* */ +/* Inputs : pu1_src - Pointer to source */ +/* pu1_dst - Pointer to destination */ +/* src_strd - stride for source */ +/* dst_strd - stride for destination */ +/* log_wd - number of bits to be rounded off */ +/* wt - weight value */ +/* ofst - offset value */ +/* ht - height of the block */ +/* wd - width of the block */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 04 02 2015 Kaushik Initial Version */ +/* Senthoor */ +/* */ +/*****************************************************************************/ +void ih264_weighted_pred_luma_sse42(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 log_wd, + WORD32 wt, + WORD32 ofst, + WORD32 ht, + WORD32 wd) +{ + __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b; + + __m128i wt_8x16b, round_8x16b, ofst_8x16b; + + WORD32 round_val; + + wt = (WORD16)(wt & 0xffff); + round_val = 1 << (log_wd - 1); + ofst = (WORD8)(ofst & 0xff); + + wt_8x16b = _mm_set1_epi16(wt); + round_8x16b = _mm_set1_epi16(round_val); + ofst_8x16b = _mm_set1_epi16(ofst); + + if(wd == 4) + { + __m128i y_0_8x16b, y_2_8x16b; + + __m128i mask_full_16x8b, mask_ll4B_16x8b; + + mask_full_16x8b = _mm_set1_epi8(0xff); + mask_ll4B_16x8b = _mm_srli_si128(mask_full_16x8b, 12); + // mask for first four bytes + + do + { + y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + y_1_16x8b = 
_mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); + y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1))); + y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3)); + + y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b); + y_2_16x8b = _mm_unpacklo_epi32(y_2_16x8b, y_3_16x8b); + + y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b); + y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b); + + y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b); + y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b); + + y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b); + y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b); + + y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd); + y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd); + + y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b); + y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b); + + y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_2_8x16b); + y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4); + y_2_16x8b = _mm_srli_si128(y_0_16x8b, 8); + y_3_16x8b = _mm_srli_si128(y_0_16x8b, 12); + + _mm_maskmoveu_si128(y_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst); + _mm_maskmoveu_si128(y_1_16x8b, mask_ll4B_16x8b, + (char*)(pu1_dst + dst_strd)); + _mm_maskmoveu_si128(y_2_16x8b, mask_ll4B_16x8b, + (char*)(pu1_dst + (dst_strd << 1))); + _mm_maskmoveu_si128(y_3_16x8b, mask_ll4B_16x8b, + (char*)(pu1_dst + dst_strd * 3)); + + ht -= 4; + pu1_src += src_strd << 2; + pu1_dst += dst_strd << 2; + } + while(ht > 0); + } + else if(wd == 8) + { + __m128i y_0_8x16b, y_1_8x16b, y_2_8x16b, y_3_8x16b; + + do + { + y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); + y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1))); + y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3)); + + y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b); + y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b); + y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b); + y_3_8x16b = _mm_cvtepu8_epi16(y_3_16x8b); + + y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b); 
+ y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b); + y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b); + y_3_8x16b = _mm_mullo_epi16(y_3_8x16b, wt_8x16b); + + y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b); + y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b); + y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b); + y_3_8x16b = _mm_adds_epi16(round_8x16b, y_3_8x16b); + + y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd); + y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd); + y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd); + y_3_8x16b = _mm_srai_epi16(y_3_8x16b, log_wd); + + y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b); + y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b); + y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b); + y_3_8x16b = _mm_adds_epi16(ofst_8x16b, y_3_8x16b); + + y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b); + y_2_16x8b = _mm_packus_epi16(y_2_8x16b, y_3_8x16b); + y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8); + y_3_16x8b = _mm_srli_si128(y_2_16x8b, 8); + + _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b); + + ht -= 4; + pu1_src += src_strd << 2; + pu1_dst += dst_strd << 2; + } + while(ht > 0); + } + else // wd == 16 + { + __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b; + __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b; + + __m128i zero_16x8b; + zero_16x8b = _mm_set1_epi8(0); + + do + { + y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); + y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1))); + y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3)); + + y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b); + y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b); + y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b); + y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, 
zero_16x8b); + y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b); + y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b); + y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b); + y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b); + + y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b); + y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b); + y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b); + y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b); + y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b); + y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b); + y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b); + y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b); + + y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b); + y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b); + y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b); + y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b); + y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b); + y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b); + y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b); + y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b); + + y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd); + y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd); + y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd); + y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd); + y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd); + y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd); + y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd); + y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd); + + y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b); + y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b); + y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b); + y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b); + y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b); + y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b); + y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b); + y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b); + + y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, 
y_0H_8x16b); + y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b); + y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b); + y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b); + + _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b); + + ht -= 4; + pu1_src += src_strd << 2; + pu1_dst += dst_strd << 2; + } + while(ht > 0); + } +} + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_weighted_pred_chroma_sse42 */ +/* */ +/* Description : This function performs the weighted prediction as */ +/* described in sec 8.4.2.3.2 titled "Weighted sample */ +/* prediction process" for chroma. The function gets one */ +/* ht x wd block, weights it, rounds it off, offsets it, */ +/* saturates it to unsigned 8-bit and stores it in the */ +/* destination block. (ht,wd) can be (2,2), (4,2), (2,4), */ +/* (4,4), (8,4), (4,8) or (8,8). 
*/ +/* */ +/* Inputs : pu1_src - Pointer to source */ +/* pu1_dst - Pointer to destination */ +/* src_strd - stride for source */ +/* dst_strd - stride for destination */ +/* log_wd - number of bits to be rounded off */ +/* wt - weight values for u and v */ +/* ofst - offset values for u and v */ +/* ht - height of the block */ +/* wd - width of the block */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 04 02 2015 Kaushik Initial Version */ +/* Senthoor */ +/* */ +/*****************************************************************************/ +void ih264_weighted_pred_chroma_sse42(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 log_wd, + WORD32 wt, + WORD32 ofst, + WORD32 ht, + WORD32 wd) +{ + __m128i y_0_16x8b, y_1_16x8b; + + __m128i wt_8x16b, round_8x16b, ofst_8x16b; + + WORD32 ofst_u, ofst_v; + WORD32 round_val; + + ofst_u = (WORD8)(ofst & 0xff); + ofst_v = (WORD8)(ofst >> 8); + round_val = 1 << (log_wd - 1); + ofst = (ofst_u & 0xffff) | (ofst_v << 16); + + wt_8x16b = _mm_set1_epi32(wt); + round_8x16b = _mm_set1_epi16(round_val); + ofst_8x16b = _mm_set1_epi32(ofst); + + if(wd == 2) + { + __m128i y_0_8x16b; + + __m128i mask_full_16x8b, mask_ll4B_16x8b; + + mask_full_16x8b = _mm_set1_epi8(0xff); + mask_ll4B_16x8b = _mm_srli_si128(mask_full_16x8b, 12); + // mask for first four bytes + + do + { + y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); + + y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b); + + y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b); + + y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b); + + y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b); + + y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd); + + y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b); + + y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_0_8x16b); + y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4); + + _mm_maskmoveu_si128(y_0_16x8b, mask_ll4B_16x8b, 
(char*)pu1_dst); + _mm_maskmoveu_si128(y_1_16x8b, mask_ll4B_16x8b, + (char*)(pu1_dst + dst_strd)); + + ht -= 2; + pu1_src += src_strd << 1; + pu1_dst += dst_strd << 1; + } + while(ht > 0); + } + else if(wd == 4) + { + __m128i y_0_8x16b, y_1_8x16b; + + do + { + y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); + + y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b); + y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b); + + y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b); + y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b); + + y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b); + y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b); + + y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd); + y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd); + + y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b); + y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b); + + y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b); + y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8); + + _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b); + + ht -= 2; + pu1_src += src_strd << 1; + pu1_dst += dst_strd << 1; + } + while(ht > 0); + } + else // wd == 16 + { + __m128i y_2_16x8b, y_3_16x8b; + __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b; + __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b; + + __m128i zero_16x8b; + zero_16x8b = _mm_set1_epi8(0); + + do + { + y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); + y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1))); + y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3)); + + y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b); + y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b); + y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b); + y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b); + y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b); + y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b); 
+ y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b); + y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b); + + y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b); + y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b); + y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b); + y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b); + y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b); + y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b); + y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b); + y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b); + + y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b); + y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b); + y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b); + y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b); + y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b); + y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b); + y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b); + y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b); + + y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd); + y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd); + y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd); + y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd); + y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd); + y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd); + y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd); + y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd); + + y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b); + y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b); + y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b); + y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b); + y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b); + y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b); + y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b); + y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b); + + y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b); + y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b); + y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, 
y_2H_8x16b); + y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b); + + _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b); + + ht -= 4; + pu1_src += src_strd << 2; + pu1_dst += dst_strd << 2; + } + while(ht > 0); + } +} + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_weighted_bi_pred_luma_sse42 */ +/* */ +/* Description : This function performs the weighted biprediction as */ +/* described in sec 8.4.2.3.2 titled "Weighted sample */ +/* prediction process" for luma. The function gets two */ +/* ht x wd blocks, weights them, adds them, rounds off the */ +/* sum, offsets it, saturates it to unsigned 8-bit and */ +/* stores it in the destination block. (ht,wd) can be */ +/* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). 
*/
/*                                                                           */
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
/*                  pu1_src2  - Pointer to source 2                          */
/*                  pu1_dst   - Pointer to destination                       */
/*                  src_strd1 - stride for source 1                          */
/*                  src_strd2 - stride for source 2                          */
/*                  dst_strd  - stride for destination                       */
/*                  log_wd    - number of bits to be rounded off             */
/*                  wt1       - weight value for source 1                    */
/*                  wt2       - weight value for source 2                    */
/*                  ofst1     - offset value for source 1                    */
/*                  ofst2     - offset value for source 2                    */
/*                  ht        - height of the block                          */
/*                  wd        - width of the block                           */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         04 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_weighted_bi_pred_luma_sse42(UWORD8 *pu1_src1,
                                       UWORD8 *pu1_src2,
                                       UWORD8 *pu1_dst,
                                       WORD32 src_strd1,
                                       WORD32 src_strd2,
                                       WORD32 dst_strd,
                                       WORD32 log_wd,
                                       WORD32 wt1,
                                       WORD32 wt2,
                                       WORD32 ofst1,
                                       WORD32 ofst2,
                                       WORD32 ht,
                                       WORD32 wd)
{
    __m128i y1_0_16x8b, y1_1_16x8b;
    __m128i y2_0_16x8b, y2_1_16x8b;

    __m128i wt1_8x16b, wt2_8x16b;
    __m128i ofst_8x16b, round_8x16b;

    WORD32 ofst;
    WORD32 round_val, shft;

    /* Sign-extend the packed 16-bit weights and 8-bit offsets.
       Bipred rounds by 2^log_wd and shifts by (log_wd + 1); the combined
       offset is the rounded mean of the two per-source offsets. */
    wt1 = (WORD16)(wt1 & 0xffff);
    wt2 = (WORD16)(wt2 & 0xffff);
    round_val = 1 << log_wd;
    shft = log_wd + 1;
    ofst1 = (WORD8)(ofst1 & 0xff);
    ofst2 = (WORD8)(ofst2 & 0xff);
    ofst = (ofst1 + ofst2 + 1) >> 1;

    wt1_8x16b = _mm_set1_epi16(wt1);
    wt2_8x16b = _mm_set1_epi16(wt2);
    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi16(ofst);

    if(wd == 4)
    {
        __m128i y1_2_16x8b, y1_3_16x8b;
        __m128i y2_2_16x8b, y2_3_16x8b;

        __m128i y1_0_8x16b, y1_2_8x16b;
        __m128i y2_0_8x16b, y2_2_8x16b;

        __m128i mask_ll4B_16x8b;

        /* Only 4 bytes are valid per row; masked stores write just those. */
        mask_ll4B_16x8b = _mm_set1_epi8(0xff);
        mask_ll4B_16x8b = _mm_srli_si128(mask_ll4B_16x8b, 12);
        // mask for first four bytes

        /* Four 4-pixel rows of each source are packed into two 8x16-bit
           vectors per source, weighted, summed, rounded and offset. */
        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y2_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
            y1_2_16x8b = _mm_unpacklo_epi32(y1_2_16x8b, y1_3_16x8b);
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);
            y2_2_16x8b = _mm_unpacklo_epi32(y2_2_16x8b, y2_3_16x8b);

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);

            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);

            /* Saturate to u8 and peel the four rows back apart. */
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_2_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
            y1_2_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
            y1_3_16x8b = _mm_srli_si128(y1_0_16x8b, 12);

            _mm_maskmoveu_si128(y1_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst);
            _mm_maskmoveu_si128(y1_1_16x8b, mask_ll4B_16x8b,
                                (char*)(pu1_dst + dst_strd));
            _mm_maskmoveu_si128(y1_2_16x8b, mask_ll4B_16x8b,
                                (char*)(pu1_dst + (dst_strd << 1)));
            _mm_maskmoveu_si128(y1_3_16x8b, mask_ll4B_16x8b,
                                (char*)(pu1_dst + dst_strd * 3));

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        __m128i y1_2_16x8b, y1_3_16x8b;
        __m128i y2_2_16x8b, y2_3_16x8b;

        __m128i y1_0_8x16b, y1_1_8x16b, y1_2_8x16b, y1_3_8x16b;
        __m128i y2_0_8x16b, y2_1_8x16b, y2_2_8x16b, y2_3_8x16b;

        /* Four 8-pixel rows of each source per iteration. */
        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y2_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
            y1_3_8x16b = _mm_cvtepu8_epi16(y1_3_16x8b);

            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);
            y2_3_8x16b = _mm_cvtepu8_epi16(y2_3_16x8b);

            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);

            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);
            y1_3_8x16b = _mm_mullo_epi16(y1_3_8x16b, wt1_8x16b);
            y2_3_8x16b = _mm_mullo_epi16(y2_3_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(y1_3_8x16b, y2_3_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(round_8x16b, y1_3_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);
            y1_3_8x16b = _mm_srai_epi16(y1_3_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(ofst_8x16b, y1_3_8x16b);

            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
            y1_2_16x8b = _mm_packus_epi16(y1_2_8x16b, y1_3_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
            y1_3_16x8b = _mm_srli_si128(y1_2_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y1_2_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y1_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        /* 16-pixel rows, two rows per iteration; each row is widened into
           low (L) and high (H) 8x16-bit halves. */
        do
        {
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));

            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);

            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);

            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);

            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);

            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);

            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);

            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);

            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);

            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name :
ih264_weighted_bi_pred_chroma_sse42 */ +/* */ +/* Description : This function performs the weighted biprediction as */ +/* described in sec 8.4.2.3.2 titled "Weighted sample */ +/* prediction process" for chroma. The function gets two */ +/* ht x wd blocks, weights them, adds them, rounds off the */ +/* sum, offsets it, saturates it to unsigned 8-bit and */ +/* stores it in the destination block. (ht,wd) can be */ +/* (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8). */ +/* */ +/* Inputs : pu1_src1 - Pointer to source 1 */ +/* pu1_src2 - Pointer to source 2 */ +/* pu1_dst - Pointer to destination */ +/* src_strd1 - stride for source 1 */ +/* src_strd2 - stride for source 2 */ +/* dst_strd2 - stride for destination */ +/* log_wd - number of bits to be rounded off */ +/* wt1 - weight values for u and v in source 1 */ +/* wt2 - weight values for u and v in source 2 */ +/* ofst1 - offset value for u and v in source 1 */ +/* ofst2 - offset value for u and v in source 2 */ +/* ht - height of the block */ +/* wd - width of the block */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 04 02 2015 Kaushik Initial Version */ +/* Senthoor */ +/* */ +/*****************************************************************************/ +void ih264_weighted_bi_pred_chroma_sse42(UWORD8 *pu1_src1, + UWORD8 *pu1_src2, + UWORD8 *pu1_dst, + WORD32 src_strd1, + WORD32 src_strd2, + WORD32 dst_strd, + WORD32 log_wd, + WORD32 wt1, + WORD32 wt2, + WORD32 ofst1, + WORD32 ofst2, + WORD32 ht, + WORD32 wd) +{ + __m128i y1_0_16x8b, y1_1_16x8b; + __m128i y2_0_16x8b, y2_1_16x8b; + + __m128i wt1_8x16b, wt2_8x16b; + __m128i ofst_8x16b, round_8x16b; + + WORD32 ofst1_u, ofst2_u, ofst_u; + WORD32 ofst1_v, ofst2_v, ofst_v; + WORD32 round_val, shft, ofst_val; + + round_val = 1 << log_wd; + shft = log_wd + 1; + + ofst1_u = (WORD8)(ofst1 & 0xff); + ofst1_v = (WORD8)(ofst1 >> 8); + ofst2_u = (WORD8)(ofst2 & 0xff); + ofst2_v = (WORD8)(ofst2 >> 8); + + 
wt1_8x16b = _mm_set1_epi32(wt1); + wt2_8x16b = _mm_set1_epi32(wt2); + + ofst_u = (ofst1_u + ofst2_u + 1) >> 1; + ofst_v = (ofst1_v + ofst2_v + 1) >> 1; + ofst_val = (ofst_u & 0xffff) | (ofst_v << 16); + + round_8x16b = _mm_set1_epi16(round_val); + ofst_8x16b = _mm_set1_epi32(ofst_val); + + if(wd == 2) + { + __m128i y1_0_8x16b, y2_0_8x16b; + + __m128i mask_full_16x8b, mask_ll4B_16x8b; + + mask_full_16x8b = _mm_set1_epi8(0xff); + mask_ll4B_16x8b = _mm_srli_si128(mask_full_16x8b, 12); + + do + { + y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); + y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1)); + + y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2); + y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2)); + + y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b); + y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b); + + y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b); + y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b); + + y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b); + y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b); + + y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b); + y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b); + + y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft); + y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b); + + y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b); + y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4); + + _mm_maskmoveu_si128(y1_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst); + _mm_maskmoveu_si128(y1_1_16x8b, mask_ll4B_16x8b, + (char*)(pu1_dst + dst_strd)); + + ht -= 2; + pu1_src1 += src_strd1 << 1; + pu1_src2 += src_strd2 << 1; + pu1_dst += dst_strd << 1; + } + while(ht > 0); + } + else if(wd == 4) + { + __m128i y1_0_8x16b, y1_1_8x16b; + __m128i y2_0_8x16b, y2_1_8x16b; + + do + { + y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); + y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1)); + + y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2); + y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + 
src_strd2)); + + y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b); + y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b); + + y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b); + y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b); + + y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b); + y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b); + y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b); + y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b); + + y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b); + y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b); + + y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b); + y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b); + + y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft); + y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft); + + y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b); + y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b); + + y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b); + y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8); + + _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b); + + ht -= 2; + pu1_src1 += src_strd1 << 1; + pu1_src2 += src_strd2 << 1; + pu1_dst += dst_strd << 1; + } + while(ht > 0); + } + else // wd == 8 + { + __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b; + __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b; + + __m128i zero_16x8b; + zero_16x8b = _mm_set1_epi8(0); + + do + { + y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1); + y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1)); + y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2); + y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2)); + + y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b); + y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b); + y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b); + y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b); + + y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b); + y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b); + 
y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b); + y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b); + + y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b); + y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b); + y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b); + y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b); + + y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b); + y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b); + y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b); + y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b); + + y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b); + y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b); + y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b); + y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b); + + y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b); + y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b); + y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b); + y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b); + + y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft); + y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft); + y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft); + y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft); + + y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b); + y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b); + y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b); + y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b); + + y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b); + y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b); + + _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b); + + ht -= 2; + pu1_src1 += src_strd1 << 1; + pu1_src2 += src_strd2 << 1; + pu1_dst += dst_strd << 1; + } + while(ht > 0); + } +} |