summaryrefslogtreecommitdiffstats
path: root/common
diff options
context:
space:
mode:
authorHamsalekha S <hamsalekha.s@ittiam.com>2015-03-13 21:24:58 +0530
committerHamsalekha S <hamsalekha.s@ittiam.com>2015-04-02 15:59:02 +0530
commit8d3d303c7942ced6a987a52db8977d768dc3605f (patch)
treecc806c96794356996b13ba9970941d0aed74a97e /common
parent3956d913d37327dcb340f836e604b04bd478b158 (diff)
downloadandroid_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.tar.gz
android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.tar.bz2
android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.zip
Initial version
Change-Id: I7efe9a589cd24edf86e8d086b40c27cbbf8b4017
Diffstat (limited to 'common')
-rwxr-xr-xcommon/arm/ih264_arm_memory_barrier.s77
-rwxr-xr-xcommon/arm/ih264_deblk_chroma_a9.s1337
-rwxr-xr-xcommon/arm/ih264_deblk_luma_a9.s1092
-rwxr-xr-xcommon/arm/ih264_default_weighted_pred_a9q.s359
-rwxr-xr-xcommon/arm/ih264_ihadamard_scaling_a9.s250
-rwxr-xr-xcommon/arm/ih264_inter_pred_chroma_a9q.s254
-rwxr-xr-xcommon/arm/ih264_inter_pred_filters_luma_horz_a9q.s245
-rwxr-xr-xcommon/arm/ih264_inter_pred_filters_luma_vert_a9q.s301
-rwxr-xr-xcommon/arm/ih264_inter_pred_luma_bilinear_a9q.s398
-rwxr-xr-xcommon/arm/ih264_inter_pred_luma_copy_a9q.s253
-rwxr-xr-xcommon/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s441
-rwxr-xr-xcommon/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s1044
-rwxr-xr-xcommon/arm/ih264_inter_pred_luma_horz_qpel_a9q.s266
-rwxr-xr-xcommon/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s505
-rwxr-xr-xcommon/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s355
-rwxr-xr-xcommon/arm/ih264_inter_pred_luma_vert_qpel_a9q.s330
-rwxr-xr-xcommon/arm/ih264_intra_pred_chroma_a9q.s551
-rwxr-xr-xcommon/arm/ih264_intra_pred_luma_16x16_a9q.s520
-rwxr-xr-xcommon/arm/ih264_intra_pred_luma_4x4_a9q.s842
-rwxr-xr-xcommon/arm/ih264_intra_pred_luma_8x8_a9q.s1037
-rwxr-xr-xcommon/arm/ih264_iquant_itrans_recon_a9.s871
-rwxr-xr-xcommon/arm/ih264_iquant_itrans_recon_dc_a9.s399
-rwxr-xr-xcommon/arm/ih264_itrans_recon_a9.s216
-rwxr-xr-xcommon/arm/ih264_mem_fns_neon.s268
-rwxr-xr-xcommon/arm/ih264_padding_neon.s646
-rwxr-xr-xcommon/arm/ih264_platform_macros.h152
-rwxr-xr-xcommon/arm/ih264_resi_trans_a9.s604
-rwxr-xr-xcommon/arm/ih264_resi_trans_quant_a9.s694
-rwxr-xr-xcommon/arm/ih264_weighted_bi_pred_a9q.s642
-rwxr-xr-xcommon/arm/ih264_weighted_pred_a9q.s479
-rwxr-xr-xcommon/armv8/ih264_deblk_chroma_av8.s585
-rwxr-xr-xcommon/armv8/ih264_deblk_luma_av8.s1084
-rwxr-xr-xcommon/armv8/ih264_default_weighted_pred_av8.s353
-rwxr-xr-xcommon/armv8/ih264_ihadamard_scaling_av8.s250
-rwxr-xr-xcommon/armv8/ih264_inter_pred_chroma_av8.s392
-rwxr-xr-xcommon/armv8/ih264_inter_pred_filters_luma_horz_av8.s530
-rwxr-xr-xcommon/armv8/ih264_inter_pred_filters_luma_vert_av8.s452
-rwxr-xr-xcommon/armv8/ih264_inter_pred_luma_copy_av8.s267
-rwxr-xr-xcommon/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s820
-rwxr-xr-xcommon/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s1120
-rwxr-xr-xcommon/armv8/ih264_inter_pred_luma_horz_qpel_av8.s597
-rwxr-xr-xcommon/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s910
-rwxr-xr-xcommon/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s958
-rwxr-xr-xcommon/armv8/ih264_inter_pred_luma_vert_qpel_av8.s511
-rwxr-xr-xcommon/armv8/ih264_intra_pred_chroma_av8.s574
-rwxr-xr-xcommon/armv8/ih264_intra_pred_luma_16x16_av8.s606
-rwxr-xr-xcommon/armv8/ih264_intra_pred_luma_4x4_av8.s876
-rwxr-xr-xcommon/armv8/ih264_intra_pred_luma_8x8_av8.s1084
-rwxr-xr-xcommon/armv8/ih264_iquant_itrans_recon_av8.s778
-rwxr-xr-xcommon/armv8/ih264_iquant_itrans_recon_dc_av8.s397
-rwxr-xr-xcommon/armv8/ih264_mem_fns_neon_av8.s274
-rwxr-xr-xcommon/armv8/ih264_neon_macros.s41
-rwxr-xr-xcommon/armv8/ih264_padding_neon_av8.s784
-rwxr-xr-xcommon/armv8/ih264_platform_macros.h152
-rwxr-xr-xcommon/armv8/ih264_resi_trans_quant_av8.s731
-rwxr-xr-xcommon/armv8/ih264_weighted_bi_pred_av8.s574
-rwxr-xr-xcommon/armv8/ih264_weighted_pred_av8.s471
-rwxr-xr-xcommon/ih264_buf_mgr.c696
-rwxr-xr-xcommon/ih264_buf_mgr.h122
-rwxr-xr-xcommon/ih264_cabac_tables.c10869
-rwxr-xr-xcommon/ih264_cabac_tables.h101
-rwxr-xr-xcommon/ih264_cavlc_tables.c282
-rwxr-xr-xcommon/ih264_cavlc_tables.h133
-rwxr-xr-xcommon/ih264_chroma_intra_pred_filters.c478
-rwxr-xr-xcommon/ih264_common_tables.c725
-rwxr-xr-xcommon/ih264_common_tables.h136
-rwxr-xr-xcommon/ih264_deblk_edge_filters.c2087
-rwxr-xr-xcommon/ih264_deblk_edge_filters.h195
-rwxr-xr-xcommon/ih264_deblk_tables.c119
-rwxr-xr-xcommon/ih264_deblk_tables.h73
-rwxr-xr-xcommon/ih264_debug.h61
-rwxr-xr-xcommon/ih264_defs.h690
-rwxr-xr-xcommon/ih264_disp_mgr.c186
-rwxr-xr-xcommon/ih264_disp_mgr.h70
-rwxr-xr-xcommon/ih264_dpb_mgr.c1176
-rwxr-xr-xcommon/ih264_dpb_mgr.h186
-rwxr-xr-xcommon/ih264_error.h68
-rwxr-xr-xcommon/ih264_ihadamard_scaling.c216
-rwxr-xr-xcommon/ih264_inter_pred_filters.c1042
-rwxr-xr-xcommon/ih264_inter_pred_filters.h241
-rwxr-xr-xcommon/ih264_intra_pred_filters.h331
-rwxr-xr-xcommon/ih264_iquant_itrans_recon.c873
-rwxr-xr-xcommon/ih264_itrans_recon.h71
-rwxr-xr-xcommon/ih264_list.c574
-rwxr-xr-xcommon/ih264_list.h93
-rwxr-xr-xcommon/ih264_luma_intra_pred_filters.c1933
-rwxr-xr-xcommon/ih264_macros.h110
-rwxr-xr-xcommon/ih264_mem_fns.c176
-rwxr-xr-xcommon/ih264_mem_fns.h126
-rwxr-xr-xcommon/ih264_padding.c331
-rwxr-xr-xcommon/ih264_padding.h74
-rwxr-xr-xcommon/ih264_resi_trans.h70
-rwxr-xr-xcommon/ih264_resi_trans_quant.c814
-rwxr-xr-xcommon/ih264_size_defs.h85
-rwxr-xr-xcommon/ih264_structs.h1722
-rwxr-xr-xcommon/ih264_trans_data.c312
-rwxr-xr-xcommon/ih264_trans_data.h125
-rwxr-xr-xcommon/ih264_trans_macros.h124
-rwxr-xr-xcommon/ih264_trans_quant_itrans_iquant.h232
-rwxr-xr-xcommon/ih264_typedefs.h64
-rwxr-xr-xcommon/ih264_weighted_pred.c495
-rwxr-xr-xcommon/ih264_weighted_pred.h164
-rwxr-xr-xcommon/ithread.c604
-rwxr-xr-xcommon/ithread.h104
-rwxr-xr-xcommon/mips/ih264_platform_macros.h102
-rwxr-xr-xcommon/x86/ih264_chroma_intra_pred_filters_ssse3.c433
-rwxr-xr-xcommon/x86/ih264_deblk_chroma_ssse3.c1087
-rwxr-xr-xcommon/x86/ih264_deblk_luma_ssse3.c2012
-rwxr-xr-xcommon/x86/ih264_ihadamard_scaling_sse42.c238
-rwxr-xr-xcommon/x86/ih264_ihadamard_scaling_ssse3.c200
-rwxr-xr-xcommon/x86/ih264_inter_pred_filters_ssse3.c4375
-rwxr-xr-xcommon/x86/ih264_iquant_itrans_recon_dc_ssse3.c437
-rwxr-xr-xcommon/x86/ih264_iquant_itrans_recon_sse42.c554
-rwxr-xr-xcommon/x86/ih264_iquant_itrans_recon_ssse3.c1035
-rwxr-xr-xcommon/x86/ih264_luma_intra_pred_filters_ssse3.c2282
-rwxr-xr-xcommon/x86/ih264_mem_fns_ssse3.c169
-rwxr-xr-xcommon/x86/ih264_padding_ssse3.c335
-rwxr-xr-xcommon/x86/ih264_platform_macros.h114
-rwxr-xr-xcommon/x86/ih264_resi_trans_quant_sse42.c984
-rwxr-xr-xcommon/x86/ih264_weighted_pred_sse42.c1349
120 files changed, 76864 insertions, 0 deletions
diff --git a/common/arm/ih264_arm_memory_barrier.s b/common/arm/ih264_arm_memory_barrier.s
new file mode 100755
index 0000000..523218f
--- /dev/null
+++ b/common/arm/ih264_arm_memory_barrier.s
@@ -0,0 +1,77 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@*******************************************************************************
+@* @file
+@* ih264_arm_memory_barrier.s
+@*
+@* @brief
+@* Contains function definitions for data synchronization.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+
+.text
+.p2align 2
+
+
+@*****************************************************************************
+@*
+@* Function Name : ih264_arm_dsb
+@* Description : Adds DSB
+@* Revision History :
+@* DD MM YYYY Author(s) Changes
+@* 03 07 2008 100355 First version
+@*
+@*****************************************************************************
+
+ .global ih264_arm_dsb
+ih264_arm_dsb:
+ dsb @Data Synchronization Barrier: waits until all pending memory accesses complete
+ bx lr @return to caller
+
+
+
+@*****************************************************************************
+@*
+@* Function Name : ih264_arm_dmb
+@* Description : Adds DMB
+@* Revision History :
+@* DD MM YYYY Author(s) Changes
+@* 03 07 2008 100355 First version
+@*
+@*****************************************************************************
+
+ .global ih264_arm_dmb
+
+ih264_arm_dmb:
+ dmb @Data Memory Barrier: orders memory accesses before/after this point
+ bx lr @return to caller
+
+
+
diff --git a/common/arm/ih264_deblk_chroma_a9.s b/common/arm/ih264_deblk_chroma_a9.s
new file mode 100755
index 0000000..66102a7
--- /dev/null
+++ b/common/arm/ih264_deblk_chroma_a9.s
@@ -0,0 +1,1337 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/*****************************************************************************/
+@/* */
+@/* File Name : ih264_deblk_chroma_a9.s */
+@/* */
+@/* Description : Contains function definitions for deblocking chroma */
+@/* edge. Functions are coded in NEON assembly and can */
+@/* be compiled using ARM RVDS. */
+@/* */
+@/* List of Functions : ih264_deblk_chroma_vert_bs4_bp_a9() */
+@/* ih264_deblk_chroma_vert_bslt4_bp_a9() */
+@/* ih264_deblk_chroma_horz_bs4_bp_a9() */
+@/* ih264_deblk_chroma_horz_bslt4_bp_a9() */
+@/* ih264_deblk_chroma_vert_bs4_mbaff_bp_a9() */
+@/* ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9() */
+@/* ih264_deblk_chroma_vert_bs4_a9() */
+@/* ih264_deblk_chroma_vert_bslt4_a9() */
+@/* ih264_deblk_chroma_horz_bs4_a9() */
+@/* ih264_deblk_chroma_horz_bslt4_a9() */
+@/* ih264_deblk_chroma_vert_bs4_mbaff_a9() */
+@/* ih264_deblk_chroma_vert_bslt4_mbaff_a9() */
+@/* */
+@/* Issues / Problems : None */
+@/* */
+@/* Revision History : */
+@/* */
+@/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+@/* 28 11 2013 Ittiam Draft */
+@/* 05 01 2015 Kaushik Added double-call functions for */
+@/* Senthoor vertical deblocking, and high */
+@/* profile functions. */
+@/* */
+@/*****************************************************************************/
+
+
+.text
+.p2align 2
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block horizontal edge when the
+@* boundary strength is set to 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_horz_bs4_bp_a9
+
+ih264_deblk_chroma_horz_bs4_bp_a9:
+
+ stmfd sp!, {r4, lr} @save r4 and the return address
+ vpush {d8 - d15}
+ sub r0, r0, r1, lsl #1 @R0 = uc_edgePixel pointing to p1 of chroma
+ vld2.8 {d6, d7}, [r0], r1 @D6 = p1u , D7 = p1v
+ mov r4, r0 @Keeping a backup of the pointer p0 of chroma
+ vld2.8 {d4, d5}, [r0], r1 @D4 = p0u , D5 = p0v
+ vdup.8 q10, r2 @Q10 contains alpha
+ vld2.8 {d0, d1}, [r0], r1 @D0 = q0u , D1 = q0v
+ vaddl.u8 q4, d6, d0 @
+ vaddl.u8 q5, d7, d1 @Q4,Q5 = q0 + p1
+ vmov.i8 d31, #2 @constant 2 for the weighted sums below
+ vld2.8 {d2, d3}, [r0] @D2 = q1u , D3 = q1v
+ vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0)
+ vmlal.u8 q4, d2, d31 @
+ vmlal.u8 q5, d3, d31 @Q5,Q4 = (X2(q1U) + q0U + p1U)
+ vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0)
+ vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0)
+ vaddl.u8 q7, d4, d2 @
+ vaddl.u8 q14, d5, d3 @Q14,Q7 = P0 + Q1
+ vdup.8 q8, r3 @Q8 contains beta
+ vmlal.u8 q7, d6, d31 @
+ vmlal.u8 q14, d7, d31 @Q14,Q7 = (X2(p1U) + p0U + q1U)
+ vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha )
+ vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta )
+ vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta )
+ vrshrn.u16 d8, q4, #2 @
+ vrshrn.u16 d9, q5, #2 @Q4 = (X2(q1U) + q0U + p1U + 2) >> 2
+ vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
+ vrshrn.u16 d10, q7, #2 @
+ vrshrn.u16 d11, q14, #2 @Q5 = (X2(p1U) + p0U + q1U + 2) >> 2
+ vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
+ vbit q5, q2, q9 @keep original p0 where the skip condition (Q9) is set
+ vbit q4, q0, q9 @keep original q0 where the skip condition (Q9) is set
+ vst2.8 {d10, d11}, [r4], r1 @store filtered p0 row
+ vst2.8 {d8, d9}, [r4] @store filtered q0 row
+ vpop {d8 - d15}
+ ldmfd sp!, {r4, pc} @restore r4 and return
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge when the
+@* boundary strength is set to 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_vert_bs4_bp_a9
+
+ih264_deblk_chroma_vert_bs4_bp_a9:
+
+ stmfd sp!, {r12, r14}
+ vpush {d8 - d15}
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ mov r12, r0 @keep a back up of r0 for buffer write
+
+ vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+ vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+
+ vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+ vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+
+ vdup.8 q11, r2 @Q11 = alpha
+ vdup.8 q12, r3 @Q12 = beta
+ vmov.i8 d31, #2
+
+ vabd.u8 q4, q1, q2 @|p0-q0|
+ vabd.u8 q5, q3, q2 @|q1-q0|
+ vabd.u8 q6, q0, q1 @|p1-p0|
+ vaddl.u8 q7, d2, d6
+ vaddl.u8 q8, d3, d7 @(p0 + q1)
+ vclt.u8 q4, q4, q11 @|p0-q0| < alpha ?
+ vclt.u8 q5, q5, q12 @|q1-q0| < beta ?
+ vclt.u8 q6, q6, q12 @|p1-p0| < beta ?
+ vmlal.u8 q7, d0, d31
+ vmlal.u8 q8, d1, d31 @2*p1 + (p0 + q1)
+ vaddl.u8 q9, d0, d4
+ vaddl.u8 q10, d1, d5 @(p1 + q0)
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta
+ vmlal.u8 q9, d6, d31
+ vmlal.u8 q10, d7, d31 @2*q1 + (p1 + q0)
+
+ vrshrn.i16 d14, q7, #2
+ vrshrn.i16 d15, q8, #2 @(2*p1 + (p0 + q1) + 2) >> 2
+ vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+ vrshrn.i16 d18, q9, #2
+ vrshrn.i16 d19, q10, #2 @(2*q1 + (p1 + q0) + 2) >> 2
+
+ vbit q1, q7, q4 @insert filtered p0 where the filter condition (Q4) holds
+ vbit q2, q9, q4 @insert filtered q0 where the filter condition (Q4) holds
+
+ vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1
+ vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1
+ vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1
+ vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1
+
+ vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1
+ vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1
+ vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1
+ vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block horizontal edge for cases where the
+@* boundary strength is less than 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @param[in] sp(0) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(4) - pu1_cliptab
+@* tc0_table
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_horz_bslt4_bp_a9
+
+ih264_deblk_chroma_horz_bslt4_bp_a9:
+
+ stmfd sp!, {r4-r6, lr} @save scratch registers and the return address
+
+ ldrd r4, r5, [sp, #0x10] @r4 = u4_bs , r5 = pu1_cliptab
+ vpush {d8 - d15}
+ sub r0, r0, r1, lsl #1 @R0 = uc_edgePixelU pointing to p1 of chroma U
+ rev r4, r4 @reverse byte order of the packed boundary strengths
+ vmov.32 d12[0], r4 @d12[0] = ui_Bs
+ vld1.32 d16[0], [r5] @D16[0] contains cliptab
+ vld2.8 {d6, d7}, [r0], r1 @Q3=p1
+ vtbl.8 d14, {d16}, d12 @look up tC0 for each boundary strength
+ vmovl.u8 q6, d12 @q6 = uc_Bs in each 16 bit scalar
+ mov r6, r0 @Keeping a backup of the pointer to chroma U P0
+ vld2.8 {d4, d5}, [r0], r1 @Q2=p0
+ vmov.i8 d30, #1 @constant 1 for C = C0 + 1
+ vdup.8 q10, r2 @Q10 contains alpha
+ vld2.8 {d0, d1}, [r0], r1 @Q0=q0
+ vmovl.u8 q7, d14 @widen tC0 to 16 bits
+ vld2.8 {d2, d3}, [r0] @Q1=q1
+ vsubl.u8 q5, d1, d5 @
+ vsubl.u8 q4, d0, d4 @Q5,Q4 = (q0 - p0)
+ vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0)
+ vshl.i16 q5, q5, #2 @Q5 = (q0 - p0)<<2
+ vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0)
+ vshl.i16 q4, q4, #2 @Q4 = (q0 - p0)<<2
+ vsli.16 q7, q7, #8 @duplicate tC0 into both bytes of each halfword
+ vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0)
+ vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha )
+ vsubl.u8 q10, d6, d2 @Q10 = (p1 - q1)L
+ vsubl.u8 q3, d7, d3 @Q3 = (p1 - q1)H
+ vdup.8 q8, r3 @Q8 contains beta
+ vadd.i16 q4, q4, q10 @
+ vadd.i16 q5, q5, q3 @Q5,Q4 = [ (q0 - p0)<<2 ] + (p1 - q1)
+ vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta )
+ vcgt.s16 d12, d12, #0 @Q6 = (us_Bs > 0)
+ vqrshrn.s16 d8, q4, #3 @
+ vqrshrn.s16 d9, q5, #3 @Q4 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
+ vadd.i8 d14, d14, d30 @Q7 = C = C0+1
+ vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta )
+ vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
+ vabs.s8 q3, q4 @Q4 = ABS (i_macro)
+ vmov.i8 d15, d14 @replicate C to the high half of Q7
+ vmov.i8 d13, d12 @replicate the Bs>0 mask to the high half of Q6
+ vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
+ vmin.u8 q7, q3, q7 @Q7 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
+ vbic q6, q6, q9 @final condition
+ vcge.s8 q4, q4, #0 @Q4 = (i_macro >= 0)
+ vand q7, q7, q6 @Making delta zero in places where values shouldn't be filtered
+ vqadd.u8 q8, q2, q7 @Q8 = p0 + delta
+ vqsub.u8 q2, q2, q7 @Q2 = p0 - delta
+ vqadd.u8 q9, q0, q7 @Q9 = q0 + delta
+ vqsub.u8 q0, q0, q7 @Q0 = q0 - delta
+ vbif q8, q2, q4 @Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
+ vbif q0, q9, q4 @Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
+ vst2.8 {d16, d17}, [r6], r1 @store filtered p0 row
+ vst2.8 {d0, d1}, [r6] @store filtered q0 row
+ vpop {d8 - d15}
+ ldmfd sp!, {r4-r6, pc} @restore registers and return
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge for cases where the
+@* boundary strength is less than 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @param[in] sp(0) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(4) - pu1_cliptab
+@* tc0_table
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_vert_bslt4_bp_a9
+
+ih264_deblk_chroma_vert_bslt4_bp_a9:
+
+ stmfd sp!, {r10-r12, r14}
+
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ ldr r11, [sp, #16] @r11 = u4_bs
+
+ ldr r10, [sp, #20] @r10 = puc_ClipTab
+ mov r12, r0 @keep a back up of r0 for buffer write
+ vpush {d8 - d15}
+ vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+ vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+
+ vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+ vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+
+
+ vdup.8 q11, r2 @Q11 = alpha
+ vabd.u8 q4, q1, q2 @|p0-q0|
+ vdup.8 q12, r3 @Q12 = beta
+ vabd.u8 q5, q3, q2 @|q1-q0|
+ vabd.u8 q6, q0, q1 @|p1-p0|
+ vclt.u8 q4, q4, q11 @|p0-q0| < alpha ?
+ vsubl.u8 q7, d0, d6
+ vclt.u8 q5, q5, q12 @|q1-q0| < beta ?
+ vsubl.u8 q8, d1, d7 @(p1 - q1)
+ vclt.u8 q6, q6, q12 @|p1-p0| < beta ?
+ vsubl.u8 q9, d4, d2
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta
+ vsubl.u8 q10, d5, d3 @(q0 - p0)
+ vmov.u16 q14, #4
+ vld1.32 {d24[0]}, [r10] @Load ClipTable
+ rev r11, r11 @Blocking strengths
+ vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+
+ vmov.32 d10[0], r11
+
+ vmla.s16 q7, q9, q14
+ vmla.s16 q8, q10, q14 @4*(q0 - p0) + (p1 - q1)
+
+ vmovl.u8 q5, d10
+
+
+ vsli.u16 d10, d10, #8
+ vmovl.u16 q5, d10
+ vsli.u32 q5, q5, #16
+ vtbl.8 d12, {d24}, d10
+ vtbl.8 d13, {d24}, d11 @tC0
+ vmov.u8 q12, #1
+ vadd.u8 q6, q6, q12 @tC0 + 1
+ vcge.u8 q5, q5, q12 @u4_bS > 0 ?
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
+
+ @ Q0 - Q3(inputs),
+ @ Q4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
+ @ Q6 (tC)
+
+ vrshr.s16 q7, q7, #3
+ vrshr.s16 q8, q8, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
+
+ vcgt.s16 q9, q7, #0
+ vcgt.s16 q10, q8, #0
+ vmovn.i16 d18, q9
+ vmovn.i16 d19, q10 @Q9 = sign(delta)
+ vabs.s16 q7, q7
+ vabs.s16 q8, q8
+ vmovn.u16 d14, q7
+ vmovn.u16 d15, q8
+ vmin.u8 q7, q7, q6 @Q7 = |delta|
+
+ vqadd.u8 q10, q1, q7 @p0+|delta|
+ vqadd.u8 q11, q2, q7 @q0+|delta|
+ vqsub.u8 q12, q1, q7 @p0-|delta|
+ vqsub.u8 q13, q2, q7 @q0-|delta|
+
+ vbit q12, q10, q9 @p0 + delta
+ vbit q11, q13, q9 @q0 - delta
+
+ vbit q1, q12, q4 @update p0 where the filter condition (Q4) holds
+ vbit q2, q11, q4 @update q0 where the filter condition (Q4) holds
+
+ vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1
+ vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1
+ vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1
+ vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1
+
+ vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1
+ vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1
+ vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1
+ vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r10-r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge when the
+@* boundary strength is set to 4 on calling twice
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_vert_bs4_mbaff_bp_a9
+
+ih264_deblk_chroma_vert_bs4_mbaff_bp_a9:
+
+ stmfd sp!, {r12, r14}
+ vpush {d8 - d15}
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ mov r12, r0 @keep a back up of r0 for buffer write
+
+ vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
+ vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
+ vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
+ vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1
+
+ vdup.8 d11, r2 @D11 = alpha
+ vdup.8 d12, r3 @D12 = beta
+ vmov.i8 d31, #2
+
+ vabd.u8 d4, d1, d2 @|p0-q0|
+ vabd.u8 d5, d3, d2 @|q1-q0|
+ vabd.u8 d6, d0, d1 @|p1-p0|
+ vaddl.u8 q14, d1, d3 @(p0 + q1)
+ vclt.u8 d4, d4, d11 @|p0-q0| < alpha ?
+ vclt.u8 d5, d5, d12 @|q1-q0| < beta ?
+ vclt.u8 d6, d6, d12 @|p1-p0| < beta ?
+ vmlal.u8 q14, d0, d31 @2*p1 + (p0 + q1)
+ vaddl.u8 q13, d0, d2 @(p1 + q0)
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta
+ vmlal.u8 q13, d3, d31 @2*q1 + (p1 + q0)
+
+ vrshrn.i16 d7, q14, #2 @(2*p1 + (p0 + q1) + 2) >> 2
+ vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+ vrshrn.i16 d9, q13, #2 @(2*q1 + (p1 + q0) + 2) >> 2
+
+ vbit d1, d7, d4 @insert filtered p0 where the filter condition (D4) holds
+ vbit d2, d9, d4 @insert filtered q0 where the filter condition (D4) holds
+
+ vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1
+ vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1
+ vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1
+ vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge for cases where the
+@* boundary strength is less than 4 on calling twice
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @param[in] sp(0) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(4) - pu1_cliptab
+@* tc0_table
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9
+
+ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9:
+
+ stmfd sp!, {r10-r12, r14}
+
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ ldr r11, [sp, #16] @r11 = u4_bs
+
+ ldr r10, [sp, #20] @r10 = puc_ClipTab
+ mov r12, r0 @keep a back up of r0 for buffer write
+ vpush {d8 - d15}
+ vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
+ vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
+ vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
+ vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1
+
+ vdup.8 d11, r2 @D11 = alpha
+ vabd.u8 d4, d1, d2 @|p0-q0|
+ vdup.8 d12, r3 @D12 = beta
+ vabd.u8 d5, d3, d2 @|q1-q0|
+ vabd.u8 d6, d0, d1 @|p1-p0|
+ vclt.u8 d4, d4, d11 @|p0-q0| < alpha ?
+ vclt.u8 d5, d5, d12 @|q1-q0| < beta ?
+ vsubl.u8 q14, d0, d3 @(p1 - q1)
+ vclt.u8 d6, d6, d12 @|p1-p0| < beta ?
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta
+ vsubl.u8 q12, d2, d1 @(q0 - p0)
+ vmov.u16 q10, #4
+
+ vld1.32 {d31[0]}, [r10] @Load ClipTable
+ rev r11, r11 @Blocking strengths
+ vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+ vmov.32 d22[0], r11
+ vmla.s16 q14, q12, q10 @4*(q0 - p0) + (p1 - q1)
+ vmovl.u8 q11, d22
+ vsli.u16 d22, d22, #8
+ vtbl.8 d6, {d31}, d22 @tC0
+ vmov.u8 d12, #1
+ vadd.u8 d6, d6, d12 @tC0 + 1
+ vcge.u8 d5, d22, d12 @u4_bS > 0 ?
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
+
+ @ D0 - D3(inputs),
+ @ D4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
+ @ D6 (tC)
+
+ vrshr.s16 q14, q14, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
+
+ vcgt.s16 q13, q14, #0
+ vmovn.i16 d9, q13 @D9 = sign(delta)
+ vabs.s16 q14, q14
+ vmovn.u16 d7, q14
+ vmin.u8 d7, d7, d6 @D7 = |delta|
+
+ vqadd.u8 d10, d1, d7 @p0+|delta|
+ vqadd.u8 d11, d2, d7 @q0+|delta|
+ vqsub.u8 d12, d1, d7 @p0-|delta|
+ vqsub.u8 d13, d2, d7 @q0-|delta|
+
+ vbit d12, d10, d9 @p0 + delta
+ vbit d11, d13, d9 @q0 - delta
+
+ vbit d1, d12, d4 @update p0 where the filter condition (D4) holds
+ vbit d2, d11, d4 @update q0 where the filter condition (D4) holds
+
+ vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1
+ vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1
+ vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1
+ vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r10-r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block horizontal edge when the
+@* boundary strength is set to 4 in high profile
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha_cb
+@* Alpha Value for the boundary in U
+@*
+@* @param[in] r3 - beta_cb
+@* Beta Value for the boundary in U
+@*
+@* @param[in] sp(0) - alpha_cr
+@* Alpha Value for the boundary in V
+@*
+@* @param[in] sp(4) - beta_cr
+@* Beta Value for the boundary in V
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_horz_bs4_a9
+
+ih264_deblk_chroma_horz_bs4_a9:
+
+ stmfd sp!, {r4-r6, lr} @save scratch registers and the return address
+
+ ldr r5, [sp, #16] @R5 = alpha_cr
+ ldr r6, [sp, #20] @R6 = beta_cr
+ vpush {d8 - d15}
+ sub r0, r0, r1, lsl #1 @R0 = uc_edgePixel pointing to p1 of chroma
+ vld2.8 {d6, d7}, [r0], r1 @D6 = p1u , D7 = p1v
+ mov r4, r0 @Keeping a backup of the pointer p0 of chroma
+ vld2.8 {d4, d5}, [r0], r1 @D4 = p0u , D5 = p0v
+ vdup.8 d20, r2 @D20 contains alpha_cb
+ vdup.8 d21, r5 @D21 contains alpha_cr
+ vld2.8 {d0, d1}, [r0], r1 @D0 = q0u , D1 = q0v
+ vaddl.u8 q4, d6, d0 @
+ vaddl.u8 q5, d7, d1 @Q4,Q5 = q0 + p1
+ vmov.i8 d31, #2 @constant 2 for the weighted sums below
+ vld2.8 {d2, d3}, [r0] @D2 = q1u , D3 = q1v
+ vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0)
+ vmlal.u8 q4, d2, d31 @
+ vmlal.u8 q5, d3, d31 @Q5,Q4 = (X2(q1U) + q0U + p1U)
+ vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0)
+ vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0)
+ vaddl.u8 q7, d4, d2 @
+ vaddl.u8 q14, d5, d3 @Q14,Q7 = P0 + Q1
+ vdup.8 d16, r3 @D16 contains beta_cb
+ vdup.8 d17, r6 @D17 contains beta_cr
+ vmlal.u8 q7, d6, d31 @
+ vmlal.u8 q14, d7, d31 @Q14,Q7 = (X2(p1U) + p0U + q1U)
+ vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha )
+ vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta )
+ vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta )
+ vrshrn.u16 d8, q4, #2 @
+ vrshrn.u16 d9, q5, #2 @Q4 = (X2(q1U) + q0U + p1U + 2) >> 2
+ vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
+ vrshrn.u16 d10, q7, #2 @
+ vrshrn.u16 d11, q14, #2 @Q5 = (X2(p1U) + p0U + q1U + 2) >> 2
+ vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
+ vbit q5, q2, q9 @keep original p0 where the skip condition (Q9) is set
+ vbit q4, q0, q9 @keep original q0 where the skip condition (Q9) is set
+ vst2.8 {d10, d11}, [r4], r1 @store filtered p0 row
+ vst2.8 {d8, d9}, [r4] @store filtered q0 row
+ vpop {d8 - d15}
+ ldmfd sp!, {r4-r6, pc} @restore registers and return
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge when the
+@* boundary strength is set to 4 in high profile
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha_cb
+@* Alpha Value for the boundary in U
+@*
+@* @param[in] r3 - beta_cb
+@* Beta Value for the boundary in U
+@*
+@* @param[in] sp(0) - alpha_cr
+@* Alpha Value for the boundary in V
+@*
+@* @param[in] sp(4) - beta_cr
+@* Beta Value for the boundary in V
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_vert_bs4_a9
+
+ih264_deblk_chroma_vert_bs4_a9: @BS=4 chroma vertical-edge filter; U and V processed interleaved
+
+ stmfd sp!, {r4, r5, r12, r14}
+
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ mov r12, r0 @keep a back up of r0 for buffer write
+
+ ldr r4, [sp, #16] @r4 = alpha_cr
+ ldr r5, [sp, #20] @r5 = beta_cr
+ add r2, r2, r4, lsl #8 @r2 = (alpha_cr,alpha_cb)
+ add r3, r3, r5, lsl #8 @r3 = (beta_cr,beta_cb)
+ vpush {d8 - d15}
+ vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 @load+transpose rows 0-3: Q0=p1, Q1=p0, Q2=q0, Q3=q1 (each 16-bit lane is one UV pair)
+ vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+
+ vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 @rows 4-7 into the high halves of Q0-Q3
+ vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+
+ vdup.16 q11, r2 @Q11 = alpha
+ vdup.16 q12, r3 @Q12 = beta
+ vmov.i8 d31, #2
+
+ vabd.u8 q4, q1, q2 @|p0-q0|
+ vabd.u8 q5, q3, q2 @|q1-q0|
+ vabd.u8 q6, q0, q1 @|p1-p0|
+ vaddl.u8 q7, d2, d6
+ vaddl.u8 q8, d3, d7 @(p0 + q1)
+ vclt.u8 q4, q4, q11 @|p0-q0| < alpha ?
+ vclt.u8 q5, q5, q12 @|q1-q0| < beta ?
+ vclt.u8 q6, q6, q12 @|p1-p0| < beta ?
+ vmlal.u8 q7, d0, d31
+ vmlal.u8 q8, d1, d31 @2*p1 + (p0 + q1)
+ vaddl.u8 q9, d0, d4
+ vaddl.u8 q10, d1, d5 @(p1 + q0)
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta
+ vmlal.u8 q9, d6, d31
+ vmlal.u8 q10, d7, d31 @2*q1 + (p1 + q0)
+
+ vrshrn.i16 d14, q7, #2
+ vrshrn.i16 d15, q8, #2 @(2*p1 + (p0 + q1) + 2) >> 2
+ vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+ vrshrn.i16 d18, q9, #2
+ vrshrn.i16 d19, q10, #2 @(2*q1 + (p1 + q0) + 2) >> 2
+
+ vbit q1, q7, q4 @p0' where the filter condition holds, original p0 elsewhere
+ vbit q2, q9, q4 @q0' where the filter condition holds, original q0 elsewhere
+
+ vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1 @transpose back and store rows 0-7
+ vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1
+ vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1
+ vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1
+
+ vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1
+ vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1
+ vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1
+ vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r4, r5, r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block horizontal edge for cases where the
+@* boundary strength is less than 4 in high profile
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha_cb
+@* Alpha Value for the boundary in U
+@*
+@* @param[in] r3 - beta_cb
+@* Beta Value for the boundary in U
+@*
+@* @param[in] sp(0) - alpha_cr
+@* Alpha Value for the boundary in V
+@*
+@* @param[in] sp(4) - beta_cr
+@* Beta Value for the boundary in V
+@*
+@* @param[in] sp(8) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(12) - pu1_cliptab_cb
+@* tc0_table for U
+@*
+@* @param[in] sp(16) - pu1_cliptab_cr
+@* tc0_table for V
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_horz_bslt4_a9
+
+ih264_deblk_chroma_horz_bslt4_a9: @BS<4 chroma horizontal-edge filter; U and V processed interleaved
+
+ stmfd sp!, {r4-r9, lr} @
+
+ ldrd r4, r5, [sp, #28] @R4 = alpha_cr , R5 = beta_cr
+ ldr r7, [sp, #36] @R7 = u4_bs
+ ldrd r8, r9, [sp, #40] @R8 = pu1_cliptab_cb , R9 = pu1_cliptab_cr
+ sub r0, r0, r1, lsl #1 @R0 = uc_edgePixelU pointing to p1 of chroma U
+ vpush {d8 - d15}
+ rev r7, r7 @reverse the packed BS bytes into pixel order
+ vmov.32 d12[0], r7 @D12[0] = ui_Bs
+
+ vld1.32 d16[0], [r8] @D16[0] contains cliptab_cb
+ vld1.32 d17[0], [r9] @D17[0] contains cliptab_cr
+ vld2.8 {d6, d7}, [r0], r1 @Q3=p1
+ vtbl.8 d14, {d16}, d12 @Retrieving cliptab values for U
+ vtbl.8 d28, {d17}, d12 @Retrieving cliptab values for V
+ vmovl.u8 q6, d12 @Q6 = uc_Bs in each 16 bit scalar
+ mov r6, r0 @Keeping a backup of the pointer to chroma U P0
+ vld2.8 {d4, d5}, [r0], r1 @Q2=p0
+ vmov.i8 d30, #1 @
+ vdup.8 d20, r2 @D20 contains alpha_cb
+ vdup.8 d21, r4 @D21 contains alpha_cr
+ vld2.8 {d0, d1}, [r0], r1 @Q0=q0
+ vmovl.u8 q7, d14 @
+ vmovl.u8 q14, d28 @
+ vmov.i16 d15, d28 @D14 has cliptab values for U, D15 for V
+ vld2.8 {d2, d3}, [r0] @Q1=q1
+ vsubl.u8 q5, d1, d5 @
+ vsubl.u8 q4, d0, d4 @Q5,Q4 = (q0 - p0)
+ vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0)
+ vshl.i16 q5, q5, #2 @Q5 = (q0 - p0)<<2
+ vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0)
+ vshl.i16 q4, q4, #2 @Q4 = (q0 - p0)<<2
+ vsli.16 q7, q7, #8 @replicate the clip value into both bytes of each lane
+ vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0)
+ vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha )
+ vsubl.u8 q10, d6, d2 @Q10 = (p1 - q1)L
+ vsubl.u8 q3, d7, d3 @Q3 = (p1 - q1)H
+ vdup.8 d16, r3 @Q8 contains beta_cb
+ vdup.8 d17, r5 @Q8 contains beta_cr
+ vadd.i16 q4, q4, q10 @
+ vadd.i16 q5, q5, q3 @Q5,Q4 = [ (q0 - p0)<<2 ] + (p1 - q1)
+ vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta )
+ vcgt.s16 d12, d12, #0 @Q6 = (us_Bs > 0)
+ vqrshrn.s16 d8, q4, #3 @
+ vqrshrn.s16 d9, q5, #3 @Q4 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
+ vadd.i8 d14, d14, d30 @D14 = C = C0+1 for U
+ vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta )
+ vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
+ vabs.s8 q3, q4 @Q4 = ABS (i_macro)
+ vadd.i8 d15, d15, d30 @D15 = C = C0+1 for V
+ vmov.i8 d13, d12 @
+ vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
+ vmin.u8 q7, q3, q7 @Q7 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
+ vbic q6, q6, q9 @final condition
+ vcge.s8 q4, q4, #0 @Q4 = (i_macro >= 0)
+ vand q7, q7, q6 @Making delta zero in places where values shouldn't be filtered
+ vqadd.u8 q8, q2, q7 @Q8 = p0 + delta
+ vqsub.u8 q2, q2, q7 @Q2 = p0 - delta
+ vqadd.u8 q9, q0, q7 @Q9 = q0 + delta
+ vqsub.u8 q0, q0, q7 @Q0 = q0 - delta
+ vbif q8, q2, q4 @Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
+ vbif q0, q9, q4 @Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
+ vst2.8 {d16, d17}, [r6], r1 @store filtered p0 row
+ vst2.8 {d0, d1}, [r6] @store filtered q0 row
+ vpop {d8 - d15}
+ ldmfd sp!, {r4-r9, pc} @
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge for cases where the
+@* boundary strength is less than 4 in high profile
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha_cb
+@* Alpha Value for the boundary in U
+@*
+@* @param[in] r3 - beta_cb
+@* Beta Value for the boundary in U
+@*
+@* @param[in] sp(0) - alpha_cr
+@* Alpha Value for the boundary in V
+@*
+@* @param[in] sp(4) - beta_cr
+@* Beta Value for the boundary in V
+@*
+@* @param[in] sp(8) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(12) - pu1_cliptab_cb
+@* tc0_table for U
+@*
+@* @param[in] sp(16) - pu1_cliptab_cr
+@* tc0_table for V
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_vert_bslt4_a9
+
+ih264_deblk_chroma_vert_bslt4_a9: @BS<4 chroma vertical-edge filter; U and V processed interleaved
+
+ stmfd sp!, {r4-r7, r10-r12, r14}
+
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ ldrd r4, r5, [sp, #32] @R4 = alpha_cr , R5 = beta_cr
+ add r2, r2, r4, lsl #8 @r2 = (alpha_cr,alpha_cb)
+ add r3, r3, r5, lsl #8 @r3 = (beta_cr,beta_cb)
+ ldr r6, [sp, #40] @R6 = u4_bs
+ ldrd r10, r11, [sp, #44] @R10 = pu1_cliptab_cb , R11 = pu1_cliptab_cr
+ vpush {d8 - d15}
+ mov r12, r0 @keep a back up of R0 for buffer write
+
+ vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 @load+transpose rows 0-3: Q0=p1, Q1=p0, Q2=q0, Q3=q1 (UV pairs)
+ vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+ vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+ vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+
+ vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 @rows 4-7 into the high halves of Q0-Q3
+ vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+ vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+ vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+
+
+ vdup.16 q11, r2 @Q11 = alpha
+ vabd.u8 q4, q1, q2 @|p0-q0|
+ vdup.16 q12, r3 @Q12 = beta
+ vabd.u8 q5, q3, q2 @|q1-q0|
+ vabd.u8 q6, q0, q1 @|p1-p0|
+ vclt.u8 q4, q4, q11 @|p0-q0| < alpha ?
+ vsubl.u8 q7, d0, d6
+ vclt.u8 q5, q5, q12 @|q1-q0| < beta ?
+ vsubl.u8 q8, d1, d7 @(p1 - q1)
+ vclt.u8 q6, q6, q12 @|p1-p0| < beta ?
+ vsubl.u8 q9, d4, d2
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta
+ vsubl.u8 q10, d5, d3 @(q0 - p0)
+ vmov.u16 q14, #4
+ vld1.32 {d24[0]}, [r10] @Load ClipTable for U
+ vld1.32 {d25[0]}, [r11] @Load ClipTable for V
+ rev r6, r6 @reverse packed blocking strengths into pixel order
+ vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+
+ vmov.32 d10[0], r6 @D10[0] = u4_bs
+
+ vmla.s16 q7, q9, q14
+ vmla.s16 q8, q10, q14 @4*(q0 - p0) + (p1 - q1)
+
+ vmovl.u8 q5, d10 @widen bs to 16-bit lanes
+ vsli.u16 d10, d10, #8 @duplicate bs into both bytes of each lane for the table lookup
+ vtbl.8 d12, {d24}, d10 @tC0 for U
+ vtbl.8 d13, {d25}, d10 @tC0 for V
+ vzip.8 d12, d13 @interleave U/V tC0 values to match the UV pixel layout
+ vmovl.u16 q5, d10
+ vsli.u32 q5, q5, #16 @replicate bs across all four bytes of each 32-bit lane
+ vmov.u8 q12, #1
+ vadd.u8 q6, q6, q12 @tC0 + 1
+ vcge.u8 q5, q5, q12 @u4_bS > 0 ?
+ vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
+
+ @ Q0 - Q3(inputs),
+ @ Q4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
+ @ Q6 (tC)
+
+ vrshr.s16 q7, q7, #3
+ vrshr.s16 q8, q8, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
+
+ vcgt.s16 q9, q7, #0
+ vcgt.s16 q10, q8, #0
+ vmovn.i16 d18, q9
+ vmovn.i16 d19, q10 @Q9 = sign(delta)
+ vabs.s16 q7, q7
+ vabs.s16 q8, q8
+ vmovn.u16 d14, q7
+ vmovn.u16 d15, q8
+ vmin.u8 q7, q7, q6 @Q7 = |delta| clipped to tC
+
+ vqadd.u8 q10, q1, q7 @p0+|delta|
+ vqadd.u8 q11, q2, q7 @q0+|delta|
+ vqsub.u8 q12, q1, q7 @p0-|delta|
+ vqsub.u8 q13, q2, q7 @q0-|delta|
+
+ vbit q12, q10, q9 @p0 + delta
+ vbit q11, q13, q9 @q0 - delta
+
+ vbit q1, q12, q4 @commit p0' only where the filter condition holds
+ vbit q2, q11, q4 @commit q0' only where the filter condition holds
+
+ vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1 @transpose back and store rows 0-7
+ vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1
+ vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1
+ vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1
+
+ vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1
+ vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1
+ vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1
+ vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r4-r7, r10-r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge when the
+@* boundary strength is set to 4 on calling twice in high profile
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha_cb
+@* Alpha Value for the boundary in U
+@*
+@* @param[in] r3 - beta_cb
+@* Beta Value for the boundary in U
+@*
+@* @param[in] sp(0) - alpha_cr
+@* Alpha Value for the boundary in V
+@*
+@* @param[in] sp(4) - beta_cr
+@* Beta Value for the boundary in V
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_vert_bs4_mbaff_a9
+
+ih264_deblk_chroma_vert_bs4_mbaff_a9: @BS=4 chroma vertical edge, mbaff (double-call) variant: filters 4 rows
+
+ stmfd sp!, {r4, r5, r12, r14}
+
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ mov r12, r0 @keep a back up of r0 for buffer write
+ ldrd r4, r5, [sp, #16] @R4 = alpha_cr , R5 = beta_cr
+ add r2, r2, r4, lsl #8 @r2 = (alpha_cr,alpha_cb)
+ add r3, r3, r5, lsl #8 @r3 = (beta_cr,beta_cb)
+ vpush {d8 - d15}
+ vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1 @load+transpose rows 0-3: D0=p1, D1=p0, D2=q0, D3=q1 (UV pairs)
+ vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
+ vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
+ vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1
+
+ vdup.16 d11, r2 @D11 = alpha
+ vdup.16 d12, r3 @D12 = beta
+ vmov.i8 d31, #2
+
+ vabd.u8 d4, d1, d2 @|p0-q0|
+ vabd.u8 d5, d3, d2 @|q1-q0|
+ vabd.u8 d6, d0, d1 @|p1-p0|
+ vaddl.u8 q14, d1, d3 @(p0 + q1)
+ vclt.u8 d4, d4, d11 @|p0-q0| < alpha ?
+ vclt.u8 d5, d5, d12 @|q1-q0| < beta ?
+ vclt.u8 d6, d6, d12 @|p1-p0| < beta ?
+ vmlal.u8 q14, d0, d31 @2*p1 + (p0 + q1)
+ vaddl.u8 q13, d0, d2 @(p1 + q0)
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta
+ vmlal.u8 q13, d3, d31 @2*q1 + (p1 + q0)
+
+ vrshrn.i16 d7, q14, #2 @(2*p1 + (p0 + q1) + 2) >> 2
+ vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+ vrshrn.i16 d9, q13, #2 @(2*q1 + (p1 + q0) + 2) >> 2
+
+ vbit d1, d7, d4 @p0' where the filter condition holds
+ vbit d2, d9, d4 @q0' where the filter condition holds
+
+ vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1 @transpose back and store rows 0-3
+ vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1
+ vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1
+ vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r4, r5, r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a chroma block vertical edge for cases where the
+@* boundary strength is less than 4 on calling twice in high profile
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha_cb
+@* Alpha Value for the boundary in U
+@*
+@* @param[in] r3 - beta_cb
+@* Beta Value for the boundary in U
+@*
+@* @param[in] sp(0) - alpha_cr
+@* Alpha Value for the boundary in V
+@*
+@* @param[in] sp(4) - beta_cr
+@* Beta Value for the boundary in V
+@*
+@* @param[in] sp(8) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(12) - pu1_cliptab_cb
+@* tc0_table for U
+@*
+@* @param[in] sp(16) - pu1_cliptab_cr
+@* tc0_table for V
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_chroma_vert_bslt4_mbaff_a9
+
+ih264_deblk_chroma_vert_bslt4_mbaff_a9: @BS<4 chroma vertical edge, mbaff (double-call) variant: filters 4 rows
+
+ stmfd sp!, {r4-r6, r10-r12, r14}
+
+ sub r0, r0, #4 @point r0 to p1u of row0.
+ mov r12, r0 @keep a back up of r0 for buffer write
+
+ ldrd r4, r5, [sp, #28] @R4 = alpha_cr , R5 = beta_cr
+ add r2, r2, r4, lsl #8 @r2 = (alpha_cr,alpha_cb)
+ add r3, r3, r5, lsl #8 @r3 = (beta_cr,beta_cb)
+ ldr r6, [sp, #36] @R6 = u4_bs
+ ldrd r10, r11, [sp, #40] @R10 = pu1_cliptab_cb , R11 = pu1_cliptab_cr
+ vpush {d8 - d15}
+ vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1 @load+transpose rows 0-3: D0=p1, D1=p0, D2=q0, D3=q1 (UV pairs)
+ vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
+ vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
+ vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1
+
+ vdup.16 d11, r2 @D11 = alpha
+ vabd.u8 d4, d1, d2 @|p0-q0|
+ vdup.16 d12, r3 @D12 = beta
+ vabd.u8 d5, d3, d2 @|q1-q0|
+ vabd.u8 d6, d0, d1 @|p1-p0|
+ vclt.u8 d4, d4, d11 @|p0-q0| < alpha ?
+ vclt.u8 d5, d5, d12 @|q1-q0| < beta ?
+ vsubl.u8 q14, d0, d3 @(p1 - q1)
+ vclt.u8 d6, d6, d12 @|p1-p0| < beta ?
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta
+ vsubl.u8 q12, d2, d1 @(q0 - p0)
+ vmov.u16 q10, #4
+
+ vld1.32 {d31[1]}, [r10] @Load ClipTable for U into the upper half of D31
+ vld1.32 {d31[0]}, [r11] @Load ClipTable for V into the lower half of D31
+ rev r6, r6 @reverse packed blocking strengths into pixel order
+ vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+ vmov.32 d22[0], r6 @D22[0] = u4_bs
+ vmla.s16 q14, q12, q10 @4*(q0 - p0) + (p1 - q1)
+ vmovl.u8 q11, d22 @widen bs to 16-bit lanes
+ vsli.u16 d22, d22, #8 @duplicate bs into both bytes of each lane
+ vmov.u16 d13, #4
+ vadd.u8 d22, d22, d13 @+4 on alternate bytes so those lookups index the upper (U) half of D31 — NOTE(review): confirm U/V lane mapping
+ vtbl.8 d6, {d31}, d22 @tC0
+ vmov.u8 d12, #1
+ vsub.u8 d22, d22, d13 @undo the +4 offset to recover the raw bs values
+ vadd.u8 d6, d6, d12 @tC0 + 1
+ vcge.u8 d5, d22, d12 @u4_bS > 0 ?
+ vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
+
+ @ D0 - D3(inputs),
+ @ D4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
+ @ D6 (tC)
+
+ vrshr.s16 q14, q14, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
+
+ vcgt.s16 q13, q14, #0
+ vmovn.i16 d9, q13 @D9 = sign(delta)
+ vabs.s16 q14, q14
+ vmovn.u16 d7, q14
+ vmin.u8 d7, d7, d6 @D7 = |delta| clipped to tC
+
+ vqadd.u8 d10, d1, d7 @p0+|delta|
+ vqadd.u8 d11, d2, d7 @q0+|delta|
+ vqsub.u8 d12, d1, d7 @p0-|delta|
+ vqsub.u8 d13, d2, d7 @q0-|delta|
+
+ vbit d12, d10, d9 @p0 + delta
+ vbit d11, d13, d9 @q0 - delta
+
+ vbit d1, d12, d4 @commit p0' only where the filter condition holds
+ vbit d2, d11, d4 @commit q0' only where the filter condition holds
+
+ vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1 @transpose back and store rows 0-3
+ vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1
+ vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1
+ vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1
+ vpop {d8 - d15}
+ ldmfd sp!, {r4-r6, r10-r12, pc}
+
+
+
diff --git a/common/arm/ih264_deblk_luma_a9.s b/common/arm/ih264_deblk_luma_a9.s
new file mode 100755
index 0000000..3e6a4d9
--- /dev/null
+++ b/common/arm/ih264_deblk_luma_a9.s
@@ -0,0 +1,1092 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/*****************************************************************************/
+@/* */
+@/* File Name : ih264_deblk_luma_a9.s */
+@/* */
+@/* Description : Contains function definitions for deblocking luma */
+@/* edge. Functions are coded in NEON assembly and can */
+@/* be compiled using ARM RVDS. */
+@/* */
+@/* List of Functions : ih264_deblk_luma_vert_bs4_a9() */
+@/* ih264_deblk_luma_vert_bslt4_a9() */
+@/* ih264_deblk_luma_horz_bs4_a9() */
+@/* ih264_deblk_luma_horz_bslt4_a9() */
+@/* ih264_deblk_luma_vert_bs4_mbaff_a9() */
+@/* ih264_deblk_luma_vert_bslt4_mbaff_a9() */
+@/* */
+@/* Issues / Problems : None */
+@/* */
+@/* Revision History : */
+@/* */
+@/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+@/* 28 11 2013 Ittiam Draft */
+@/* 05 01 2015 Kaushik Added double-call functions for */
+@/* Senthoor vertical deblocking. */
+@/* */
+@/*****************************************************************************/
+
+
+.text
+.p2align 2
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a luma block horizontal edge for cases where the
+@* boundary strength is less than 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @param[in] sp(0) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(4) - pu1_cliptab
+@* tc0_table
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_luma_horz_bslt4_a9
+
+ih264_deblk_luma_horz_bslt4_a9: @BS<4 luma horizontal-edge filter; 16 pixels per call
+
+ stmfd sp!, {r4-r7, lr}
+
+ ldrd r4, r5, [sp, #0x14] @r4 = ui_Bs , r5 = *puc_ClpTab
+ vpush {d8 - d15}
+ sub r0, r0, r1, lsl #1 @r0 = q0 - 2*src_strd
+ sub r0, r0, r1 @r0 pointer to p2
+ rev r4, r4 @reverse the packed BS bytes into pixel order
+ vld1.8 {q5}, [r0], r1 @p2 values are loaded into q5
+ vmov.32 d12[0], r4 @d12[0] = ui_Bs
+ mov r6, r0 @keeping backup of pointer to p1
+ vld1.8 {q4}, [r0], r1 @p1 values are loaded into q4
+ mov r7, r0 @keeping backup of pointer to p0
+ vld1.8 {q3}, [r0], r1 @p0 values are loaded into q3
+ vmovl.u8 q6, d12 @q6 = uc_Bs in each 16 bit scalar
+ vld1.8 {q0}, [r0], r1 @q0 values are loaded into q0
+ vabd.u8 q13, q4, q3 @Q13 = ABS(p1 - p0)
+ vld1.8 {q1}, [r0], r1 @q1 values are loaded into q1
+ vabd.u8 q11, q3, q0 @Q11 = ABS(p0 - q0)
+ vld1.32 d16[0], [r5] @D16[0] contains cliptab
+ vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0)
+ vld1.8 {q2}, [r0], r1 @q2 values are loaded into q2
+ vtbl.8 d14, {d16}, d12 @C0 = puc_ClpTab[uc_Bs]
+ vdup.8 q10, r2 @Q10 contains alpha
+ vdup.8 q8, r3 @Q8 contains beta
+ vmovl.u16 q6, d12 @
+ vmovl.u16 q7, d14 @
+ vabd.u8 q14, q5, q3 @Q14 = Ap = ABS(p2 - p0)
+ vabd.u8 q15, q2, q0 @Q15 = Aq = ABS(q2 - q0)
+ vcgt.s32 q6, q6, #0 @Q6 = (us_Bs > 0)
+ vsli.32 q7, q7, #8 @replicate C0 across bytes
+ vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha )
+ vcge.u8 q12, q12, q8 @Q12=( ABS(q1 - q0) >= Beta )
+ vcge.u8 q13, q13, q8 @Q13=( ABS(p1 - p0) >= Beta )
+ vcgt.u8 q10, q8, q14 @Q10=(Ap<Beta)
+ vcgt.u8 q11, q8, q15 @Q11=(Aq<Beta)
+ vsli.32 q7, q7, #16 @Q7 = C0
+ vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
+ vsubl.u8 q15, d1, d7 @
+ vsubl.u8 q12, d0, d6 @Q15,Q12 = (q0 - p0)
+ vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
+ vsubl.u8 q14, d8, d2 @Q14 = (p1 - q1)L
+ vshl.i16 q13, q15, #2 @Q13 = (q0 - p0)<<2
+ vshl.i16 q12, q12, #2 @Q12 = (q0 - p0)<<2
+ vsubl.u8 q15, d9, d3 @Q15 = (p1 - q1)H
+ vbic q6, q6, q9 @final condition
+ vadd.i16 q12, q12, q14 @
+ vadd.i16 q13, q13, q15 @Q13,Q12 = [ (q0 - p0)<<2 ] + (p1 - q1)
+ vsub.i8 q9, q7, q10 @Q9 = C0 + (Ap < Beta)
+ vrhadd.u8 q8, q3, q0 @Q8 = ((p0+q0+1) >> 1)
+ vqrshrn.s16 d24, q12, #3 @
+ vqrshrn.s16 d25, q13, #3 @Q12 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
+ vsub.i8 q9, q9, q11 @Q9 = C0 + (Ap < Beta) + (Aq < Beta)
+ vand.i8 q10, q10, q6 @
+ vand.i8 q11, q11, q6 @
+ vabs.s8 q13, q12 @Q13 = ABS (i_macro)
+ vaddl.u8 q14, d17, d11 @
+ vaddl.u8 q5, d16, d10 @Q14,Q5 = p2 + (p0+q0+1)>>1
+ vaddl.u8 q15, d17, d5 @
+ vmin.u8 q9, q13, q9 @Q9 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
+ vshll.u8 q13, d9, #1 @
+ vaddl.u8 q2, d16, d4 @Q15,Q2 = q2 + (p0+q0+1)>>1
+ vshll.u8 q8, d8, #1 @Q13,Q8 = (p1<<1)
+ vand q9, q9, q6 @Making delta zero in places where values shouldn't be filtered
+ vsub.i16 q14, q14, q13 @Q14,Q5 = [p2 + (p0+q0+1)>>1] - (p1<<1)
+ vsub.i16 q5, q5, q8 @
+ vshll.u8 q8, d2, #1 @
+ vshll.u8 q13, d3, #1 @Q13,Q8 = (q1<<1)
+ vqshrn.s16 d29, q14, #1 @
+ vqshrn.s16 d28, q5, #1 @Q14 = i_macro_p1
+ vsub.i16 q2, q2, q8 @
+ vsub.i16 q15, q15, q13 @Q15,Q2 = [q2 + (p0+q0+1)>>1] - (q1<<1)
+ vneg.s8 q13, q7 @Q13 = -C0
+ vmin.s8 q14, q14, q7 @Q14 = min(C0,i_macro_p1)
+ vcge.s8 q12, q12, #0 @Q12 = (i_macro >= 0)
+ vqshrn.s16 d31, q15, #1 @
+ vqshrn.s16 d30, q2, #1 @Q15 = i_macro_q1
+ vmax.s8 q14, q14, q13 @Q14 = max( - C0 , min(C0, i_macro_p1) )
+ vqadd.u8 q8, q3, q9 @Q8 = p0 + delta
+ vqsub.u8 q3, q3, q9 @Q3 = p0 - delta
+ vmin.s8 q15, q15, q7 @Q15 = min(C0,i_macro_q1)
+ vand.i8 q14, q10, q14 @condition check Ap<beta
+ vqadd.u8 q7, q0, q9 @Q7 = q0 + delta
+ vqsub.u8 q0, q0, q9 @Q0 = q0 - delta
+ vmax.s8 q15, q15, q13 @Q15 = max( - C0 , min(C0, i_macro_q1) )
+ vbif q8, q3, q12 @Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
+ vbif q0, q7, q12 @Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
+ vadd.i8 q14, q14, q4 @
+ vand.i8 q15, q11, q15 @condition check Aq<beta
+ vst1.8 {q8}, [r7], r1 @writing back filtered value of p0
+ vadd.i8 q15, q15, q1 @
+ vst1.8 {q0}, [r7], r1 @writing back filtered value of q0
+ vst1.8 {q14}, [r6] @writing back filtered value of p1
+ vst1.8 {q15}, [r7], r1 @writing back filtered value of q1
+ vpop {d8 - d15}
+ ldmfd sp!, {r4-r7, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a luma block horizontal edge when the
+@* boundary strength is set to 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+ .global ih264_deblk_luma_horz_bs4_a9
+
+ih264_deblk_luma_horz_bs4_a9: @BS=4 luma horizontal-edge filter; 16 pixels per call
+
+ @ Back up necessary registers on stack
+ stmfd sp!, {r12, r14}
+ vpush {d8 - d15}
+ @ Init
+ vdup.8 q0, r2 @duplicate alpha
+ sub r12, r0, r1 @pointer to p0 = q0 - src_strd
+ vdup.8 q1, r3 @duplicate beta
+ sub r14, r0, r1, lsl#1 @pointer to p1 = q0 - src_strd*2
+ sub r2, r0, r1, lsl#2 @pointer to p3 = q0 - src_strd*4
+ sub r3, r14, r1 @pointer to p2 = p1 - src_strd
+
+ @ Load Data
+ vld1.8 {d4, d5}, [r0], r1 @load q0 to Q2, q0 = q0 + src_strd
+ vld1.8 {d6, d7}, [r12] @load p0 to Q3
+ vld1.8 {d8, d9}, [r0], r1 @load q1 to Q4, q0 = q0 + src_strd
+ vld1.8 {d10, d11}, [r14] @load p1 to Q5
+
+ @ Filter Decision
+ vabd.u8 q6, q2, q3 @ABS(p0 - q0)
+ vabd.u8 q7, q4, q2 @ABS(q1 - q0)
+ vabd.u8 q8, q5, q3 @ABS(p1 - p0)
+ vcge.u8 q9, q6, q0 @ABS(p0 - q0) >= Alpha
+ vcge.u8 q7, q7, q1 @ABS(q1 - q0) >= Beta
+ vcge.u8 q8, q8, q1 @ABS(p1 - p0) >= Beta
+ vmov.i8 q10, #2
+ vorr q9, q9, q7 @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta
+ vld1.8 {d14, d15}, [r0], r1 @load q2 to Q7, q0 = q0 + src_strd
+ vorr q9, q9, q8 @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta
+ vsra.u8 q10, q0, #2 @((Alpha >> 2) + 2)
+ vabd.u8 q11, q7, q2 @Aq = ABS(q2 - q0)
+ vaddl.u8 q12, d4, d6 @p0+q0 L
+ vaddl.u8 q13, d5, d7 @p0+q0 H
+ vclt.u8 q11, q11, q1 @Aq < Beta
+ vclt.u8 q10, q6, q10 @(ABS(p0 - q0) <((Alpha >>2) + 2))
+
+ @ Deblock Filtering q0', q1', q2'
+ vaddw.u8 q14, q12, d8 @p0+q0+q1 L
+ vaddw.u8 q15, q13, d9 @p0+q0+q1 H
+ vand q11, q11, q10 @(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
+ @ q0' if (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) TRUE
+ vadd.i16 q8, q14, q14 @2*(p0+q0+q1)L
+ vadd.i16 q0, q15, q15 @2*(p0+q0+q1)H
+ vaddw.u8 q8, q8, d14 @2*(p0+q0+q1)+q2 L
+ vaddw.u8 q0, q0, d15 @2*(p0+q0+q1)+q2 H
+ vaddw.u8 q8, q8, d10 @2*(p0+q0+q1)+q2 +p1 L
+ vaddw.u8 q0, q0, d11 @2*(p0+q0+q1)+q2 +p1 H
+ vrshrn.u16 d12, q8, #3 @(2*(p0+q0+q1)+q2 +p1 +4)>> 3 L [q0']
+ vrshrn.u16 d13, q0, #3 @(2*(p0+q0+q1)+q2 +p1 +4)>> 3 H [q0']
+ @ q0" if (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) FALSE
+ vaddl.u8 q8, d8, d8 @2*q1 L
+ vaddl.u8 q0, d9, d9 @2*q1 H
+ vaddw.u8 q8, q8, d4 @2*q1+q0 L
+ vaddw.u8 q0, q0, d5 @2*q1+q0 H
+ vaddw.u8 q8, q8, d10 @2*q1+q0+p1 L
+ vaddw.u8 q0, q0, d11 @2*q1+q0+p1 H
+ vrshrn.u16 d16, q8, #2 @(2*q1+q0+p1+2)>>2 L [q0"]
+ vrshrn.u16 d17, q0, #2 @(2*q1+q0+p1+2)>>2 H [q0"]
+ @ q1'
+ vaddw.u8 q14, q14, d14 @p0+q0+q1+q2 L
+ vaddw.u8 q15, q15, d15 @p0+q0+q1+q2 H
+ vld1.8 {q0}, [r0], r1 @load q3 to Q0, q0 = q0 + src_strd
+ vbit q8, q6, q11 @choosing between q0' and q0" depending on condition
+ sub r0, r0, r1, lsl #2 @pointer to q0
+ vbic q11, q11, q9 @((ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta))
+ @ && (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
+ vrshrn.u16 d12, q14, #2 @(p0+q0+q1+q2+2)>>2 L [q1']
+ vrshrn.u16 d13, q15, #2 @(p0+q0+q1+q2+2)>>2 H [q1']
+ vbif q2, q8, q9 @choose q0 or filtered q0
+ @ q2'
+ vaddl.u8 q8, d14, d0 @q2+q3,L
+ vaddl.u8 q0, d15, d1 @q2+q3,H
+ vadd.i16 q14, q14, q8 @p0+q0+q1+2*q2+q3 L
+ vst1.8 {d4, d5}, [r0], r1 @store q0
+ vadd.i16 q15, q15, q0 @p0+q0+q1+2*q2+q3 H
+ vadd.i16 q14, q14, q8 @p0+q0+q1+3*q2+2*q3 L
+ vadd.i16 q15, q15, q0 @p0+q0+q1+3*q2+2*q3 H
+ vrshrn.u16 d0, q14, #3 @(p0+q0+q1+3*q2+2*q3+4)>>3 L [q2']
+ vrshrn.u16 d1, q15, #3 @(p0+q0+q1+3*q2+2*q3+4)>>3 H [q2']
+ vld1.8 {d30, d31}, [r3] @load p2 to Q15
+ vbif q6, q4, q11 @choose q1 or filtered value of q1
+
+ vabd.u8 q8, q15, q3 @Ap,ABS(p2 - p0)
+ vaddw.u8 q12, q12, d10 @p0+q0+p1 L
+ vbif q0, q7, q11 @choose q2 or filtered q2
+ vaddw.u8 q13, q13, d11 @p0+q0+p1 H
+ vst1.8 {d12, d13}, [r0], r1 @store q1
+ vclt.u8 q8, q8, q1 @Ap < Beta
+ vadd.i16 q14, q12, q12 @2*(p0+q0+p1) L
+ vadd.i16 q2, q13, q13 @2*(p0+q0+p1) H
+ vst1.8 {d0, d1}, [r0], r1 @store q2
+ vand q10, q10, q8 @((Ap < Beta) && (ABS(p0 - q0) <((Alpha >>2) + 2)))
+ vaddw.u8 q14, q14, d30 @2*(p0+q0+p1)+p2 l
+ vaddw.u8 q2, q2, d31 @2*(p0+q0+p1)+p2 H
+ vaddw.u8 q14, q14, d8 @2*(p0+q0+p1)+p2+q1 L
+ vaddw.u8 q2, q2, d9 @2*(p0+q0+p1)+p2+q1 H
+ vrshrn.u16 d28, q14, #3 @(2*(p0+q0+p1)+p2+q1+4)>>3 L,p0'
+ vrshrn.u16 d29, q2, #3 @(2*(p0+q0+p1)+p2+q1+4)>>3 H,p0'
+ vmov.i8 d0, #2
+ vmov.i16 d1, #2
+ vaddl.u8 q1, d6, d8 @p0+q1 L
+ vmlal.u8 q1, d10, d0 @2*p1+p0+q1 L
+ vaddl.u8 q8, d7, d9 @p0+q1 H
+ vmlal.u8 q8, d11, d0 @2*p1+p0+q1 H
+ vaddw.u8 q6, q12, d30 @(p0+q0+p1) +p2 L
+ vld1.8 {d24, d25}, [r2] @load p3,Q12
+ vaddw.u8 q2, q13, d31 @(p0+q0+p1) +p2 H
+ vaddl.u8 q4, d30, d24 @p2+p3 L
+ vrshrn.u16 d26, q6, #2 @((p0+q0+p1)+p2 +2)>>2,p1' L
+ vrshrn.u16 d2, q1, #2 @(2*p1+p0+q1+2)>>2,p0"L
+ vrshrn.u16 d27, q2, #2 @((p0+q0+p1)+p2 +2)>>2,p1' H
+ vrshrn.u16 d3, q8, #2 @(2*p1+p0+q1+2)>>2,p0" H
+ vaddl.u8 q8, d31, d25 @p2+p3 H
+ vmla.u16 q6, q4, d1[0] @(p0+q0+p1)+3*p2+2*p3 L
+ vmla.u16 q2, q8, d1[0] @(p0+q0+p1)+3*p2+2*p3 H
+ vbic q8, q10, q9 @((ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta))
+ @&& (Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
+ vbit q1, q14, q10 @choosing between p0' and p0"
+ vrshrn.u16 d12, q6, #3 @((p0+q0+p1)+3*p2+2*p3+4)>>3 L p2'
+ vrshrn.u16 d13, q2, #3 @((p0+q0+p1)+3*p2+2*p3+4)>>3 H p2'
+ vbif q3, q1, q9 @choosing between p0 and filtered value of p0
+ vbit q5, q13, q8 @choosing between p1 and p1'
+ vbit q15, q6, q8 @choosing between p2 and p2'
+ vst1.8 {d6, d7}, [r12] @store p0
+ vst1.8 {d10, d11}, [r14] @store p1
+ vst1.8 {d30, d31}, [r3] @store p2
+ vpop {d8 - d15}
+ ldmfd sp!, {r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a luma block vertical edge for cases where the
+@* boundary strength is less than 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @param[in] sp(0) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(4) - pu1_cliptab
+@* tc0_table
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+    .global ih264_deblk_luma_vert_bslt4_a9
+
+ih264_deblk_luma_vert_bslt4_a9:
+
+    stmfd sp!, {r12, lr}
+
+    sub r0, r0, #4 @pointer uc_edgePixel-4
+    ldr r12, [sp, #8] @r12 = ui_Bs
+    ldr r14, [sp, #12] @r14 = *puc_ClpTab
+    vpush {d8 - d15}
+    @loading p3:p2:p1:p0:q0:q1:q2:q3 for every row
+    vld1.8 {d0}, [r0], r1 @row1
+    vld1.8 d2, [r0], r1 @row2
+    vld1.8 d4, [r0], r1 @row3
+    rev r12, r12 @reversing ui_bs
+    vld1.8 d6, [r0], r1 @row4
+    vmov.32 d18[0], r12 @d18[0] = ui_Bs
+    vld1.32 d16[0], [r14] @D16[0] contains cliptab
+    vld1.8 d8, [r0], r1 @row5
+    vmovl.u8 q9, d18 @q9 = ui_Bs in each 16-bit lane
+    vld1.8 d10, [r0], r1 @row6
+    vld1.8 d12, [r0], r1 @row7
+    vtbl.8 d16, {d16}, d18 @puc_ClipTab[uc_Bs]
+    vld1.8 d14, [r0], r1 @row8
+    vld1.8 d1, [r0], r1 @row9
+    vmovl.u16 q8, d16 @widen the four clip values to 32-bit lanes
+    vld1.8 d3, [r0], r1 @row10
+    vld1.8 d5, [r0], r1 @row11
+    vld1.8 d7, [r0], r1 @row12
+    vsli.32 q8, q8, #8 @replicate clip value into byte 1 of each word
+    vld1.8 d9, [r0], r1 @row13
+    vld1.8 d11, [r0], r1 @row14
+    vld1.8 d13, [r0], r1 @row15
+    vsli.32 q8, q8, #16 @Q8 = C0, clip value replicated to all 4 bytes of each word
+    vld1.8 d15, [r0], r1 @row16
+
+    @taking two 8x8 transposes
+    @2X2 transposes
+    vtrn.8 d0, d2 @row1 &2
+    vtrn.8 d4, d6 @row3&row4
+    vtrn.8 d8, d10 @row5&6
+    vtrn.8 d12, d14 @row7 & 8
+    vtrn.8 d1, d3 @row9 &10
+    vtrn.8 d5, d7 @row11 & 12
+    vtrn.8 d9, d11 @row13 &14
+    vtrn.8 d13, d15 @row15 & 16
+    @4x4 transposes
+    vtrn.16 d2, d6 @row2 & row4
+    vtrn.16 d10, d14 @row6 & row8
+    vtrn.16 d3, d7 @row10 & 12
+    vtrn.16 d11, d15 @row14 & row16
+    vtrn.32 d6, d14 @row4 & 8
+    vtrn.32 d7, d15 @row 12 & 16
+
+    @now Q3 ->p0 and Q7->q3
+    vtrn.16 d0, d4 @row1 & 3
+    vtrn.16 d8, d12 @row 5 & 7
+    vtrn.16 d1, d5 @row9 & row11
+    vtrn.16 d9, d13 @row13 & row15
+    vtrn.32 d0, d8 @row1 & row5
+    vtrn.32 d1, d9 @row9 & 13
+
+    @now Q0->p3 & Q4->q0
+    @starting processing as p0 and q0 are now ready
+    vtrn.32 d2, d10 @row2 &6
+    vrhadd.u8 q10, q3, q4 @((p0 + q0 + 1) >> 1)
+    vtrn.32 d3, d11 @row10&row14
+    vmov.i8 d19, #2
+    @now Q1->p2 & Q5->q1
+    vtrn.32 d4, d12 @row3 & 7
+    vabd.u8 q11, q3, q4 @ABS(p0 - q0)
+    vtrn.32 d5, d13 @row11 & row15
+    vaddl.u8 q12, d20, d2 @(p2 + ((p0 + q0 + 1) >> 1) L
+    @now Q2->p1,Q6->q2
+    vaddl.u8 q13, d21, d3 @(p2 + ((p0 + q0 + 1) >> 1) H
+    vmlsl.u8 q12, d4, d19 @(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) L
+    vmlsl.u8 q13, d5, d19 @(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) H
+    vdup.8 q14, r2 @alpha
+    vcle.u8 q11, q14, q11 @ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
+    vdup.i8 q14, r3 @beta
+    vabd.u8 q15, q5, q4 @ABS(q1 - q0)
+    vqshrn.s16 d24, q12, #1 @((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) L
+    vqshrn.s16 d25 , q13, #1 @((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) H
+    vcge.u8 q15, q15, q14 @ABS(q1 - q0) >= Beta
+    vabd.u8 q13, q2, q3 @ABS(p1 - p0)
+    vmin.s8 q12, q12, q8 @min(deltap1, C0)
+    vorr q11, q11, q15 @ABS(q1 - q0) >= Beta ||ABS(p0 - q0) >= Alpha
+    vneg.s8 q15, q8 @-C0
+    vcge.u8 q13, q13, q14 @ABS(p1 - p0) >= Beta
+    vmax.s8 q12, q12, q15 @max(deltap1,-C0)
+    vorr q11, q11, q13 @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)
+    vmovl.u16 q13, d18 @ui_bs
+    vaddl.u8 q9, d20, d12 @q2 + ((p0 + q0 + 1) >> 1) L
+    vceq.u32 q13, q13, #0 @ui_bs == 0
+    vsubw.u8 q9, q9, d10 @(q2 + ((p0 + q0 + 1) >> 1) - q1) L
+    vaddl.u8 q10, d21, d13 @q2 + ((p0 + q0 + 1) >> 1) H
+    vsubw.u8 q9, q9, d10 @(q2 + ((p0 + q0 + 1) >> 1) - 2*q1)L
+    vsubw.u8 q10, q10, d11 @(q2 + ((p0 + q0 + 1) >> 1) - q1) H
+    vorr q13, q13, q11 @(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)) &&(ui_bs)
+    vsubw.u8 q10, q10, d11 @(q2 + ((p0 + q0 + 1) >> 1) - 2*q1) H
+    vqshrn.s16 d18, q9, #1 @((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) L
+    vabd.u8 q11, q1, q3 @Ap = ABS(p2 - p0)
+    vqshrn.s16 d19, q10, #1 @((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) H
+    vabd.u8 q10, q6, q4 @Aq= ABS(q2 - q0)
+    vclt.u8 q11, q11, q14 @Ap < Beta
+    vmin.s8 q9, q9, q8 @min(deltaq1, C0)
+    vclt.u8 q10, q10, q14 @Aq <Beta
+    vsubl.u8 q14, d8, d6 @(q0 - p0) L
+    vmax.s8 q9, q9, q15 @max(deltaq1,-C0)
+    vsubl.u8 q15, d9, d7 @(q0 - p0) H
+    vshl.s16 q14, q14, #2 @(q0 - p0)<<2 L
+    vsub.u8 q8, q8, q11 @C0 + (Ap < Beta)
+    vshl.s16 q15, q15, #2 @(q0 - p0) << 2) H
+    vaddw.u8 q14, q14, d4 @((q0 - p0) << 2) + (p1 L
+    vaddw.u8 q15, q15, d5 @((q0 - p0) << 2) + (p1 H
+    vsubw.u8 q14, q14, d10 @((q0 - p0) << 2) + (p1 - q1) L
+    vsubw.u8 q15, q15, d11 @((q0 - p0) << 2) + (p1 - q1) H
+    vbic q11, q11, q13 @final condition for p1
+    vrshrn.s16 d28, q14, #3 @delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3); L
+    vrshrn.s16 d29, q15, #3 @delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) H
+    vsub.u8 q8, q8, q10 @C0 + (Ap < Beta) + (Aq < Beta)
+    vbic q10, q10, q13 @final condition for q1
+    vabs.s8 q15, q14 @abs(delta)
+    vand q12, q12, q11 @delatp1
+    vand q9, q9, q10 @delta q1
+    vmin.u8 q15, q15, q8 @min((abs(delta),C)
+    vadd.i8 q2, q2, q12 @p1+deltap1
+    vadd.i8 q5, q5, q9 @q1+deltaq1
+    vbic q15, q15, q13 @abs(delta) of pixels to be changed only
+    vcge.s8 q14, q14, #0 @sign(delta)
+    vqsub.u8 q11, q3, q15 @clip(p0-delta)
+    vtrn.8 d0, d2 @row1 &2
+    vqadd.u8 q3, q3, q15 @clip(p0+delta)
+    vtrn.8 d1, d3 @row9 &10
+    vqadd.u8 q12, q4, q15 @clip(q0+delta)
+    vtrn.8 d12, d14 @row7 & 8
+    vqsub.u8 q4, q4, q15 @clip(q0-delta)
+    vtrn.8 d13, d15 @row15 & 16
+    vbif q3, q11, q14 @p0
+    vbif q4, q12, q14 @q0
+    vtrn.8 d4, d6 @row3&row4
+    vtrn.8 d8, d10 @row5&6
+    vtrn.8 d5, d7 @row11 & 12
+    vtrn.8 d9, d11 @row13 &14
+    vtrn.16 d2, d6 @row2 & row4
+    vtrn.16 d10, d14 @row6 & row8
+    vtrn.16 d3, d7 @row10 & 12
+    vtrn.16 d11, d15 @row14 & row16
+    vtrn.32 d6, d14 @row4 & 8
+    vtrn.32 d7, d15 @row 12 & 16
+    @now Q3 ->p0 and Q7->q3
+    vtrn.16 d0, d4 @row1 & 3
+    vtrn.16 d8, d12 @row 5 & 7
+    vtrn.16 d1, d5 @row9 & row11
+    vtrn.16 d9, d13 @row13 & row15
+    sub r0, r0, r1, lsl#4 @restore pointer
+    vtrn.32 d0, d8 @row1 & row5
+    vtrn.32 d1, d9 @row9 & 13
+    vtrn.32 d2, d10 @row2 &6
+    vtrn.32 d3, d11 @row10&row14
+    vtrn.32 d4, d12 @row3 & 7
+    vtrn.32 d5, d13 @row11 & row15
+    vst1.8 {d0}, [r0], r1 @store row1
+    vst1.8 d2, [r0], r1 @store row2
+    vst1.8 d4, [r0], r1 @store row3
+    vst1.8 d6, [r0], r1 @store row4
+    vst1.8 d8, [r0], r1 @store row5
+    vst1.8 d10, [r0], r1 @store row6
+    vst1.8 d12, [r0], r1 @store row7
+    vst1.8 d14, [r0], r1 @store row8
+    vst1.8 d1, [r0], r1 @store row9
+    vst1.8 d3, [r0], r1 @store row10
+    vst1.8 d5, [r0], r1 @store row11
+    vst1.8 d7, [r0], r1 @store row12
+    vst1.8 d9, [r0], r1 @store row13
+    vst1.8 d11, [r0], r1 @store row14
+    vst1.8 d13, [r0], r1 @store row15
+    vst1.8 d15, [r0], r1 @store row16
+    vpop {d8 - d15}
+    ldmfd sp!, {r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a luma block vertical edge when the
+@* boundary strength is set to 4
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+    .global ih264_deblk_luma_vert_bs4_a9
+
+ih264_deblk_luma_vert_bs4_a9:
+
+    stmfd sp!, {r12, lr}
+    vpush {d8 - d15}
+    sub r0, r0, #4 @pointer uc_edgePixel-4
+    @loading p3:p2:p1:p0:q0:q1:q2:q3 for every row
+    vld1.8 d0, [r0], r1 @row1
+    vld1.8 d2, [r0], r1 @row2
+    vld1.8 d4, [r0], r1 @row3
+    vld1.8 d6, [r0], r1 @row4
+    vld1.8 d8, [r0], r1 @row5
+    vld1.8 d10, [r0], r1 @row6
+    vld1.8 d12, [r0], r1 @row7
+    vld1.8 d14, [r0], r1 @row8
+    vld1.8 d1, [r0], r1 @row9
+    vld1.8 d3, [r0], r1 @row10
+    vld1.8 d5, [r0], r1 @row11
+    vld1.8 d7, [r0], r1 @row12
+    vld1.8 d9, [r0], r1 @row13
+    vld1.8 d11, [r0], r1 @row14
+    vld1.8 d13, [r0], r1 @row15
+    vld1.8 d15, [r0], r1 @row16
+    @taking two 8x8 transposes
+    @2X2 transposes
+    vtrn.8 d0, d2 @row1 &2
+    vtrn.8 d4, d6 @row3&row4
+    vtrn.8 d8, d10 @row5&6
+    vtrn.8 d12, d14 @row7 & 8
+    vtrn.8 d1, d3 @row9 &10
+    vtrn.8 d5, d7 @row11 & 12
+    vtrn.8 d9, d11 @row13 &14
+    vtrn.8 d13, d15 @row15 & 16
+    @4x4 transposes
+    vtrn.16 d2, d6 @row2 & row4
+    vtrn.16 d10, d14 @row6 & row8
+    vtrn.16 d3, d7 @row10 & 12
+    vtrn.16 d11, d15 @row14 & row16
+    vtrn.32 d6, d14 @row4 & 8
+    vtrn.32 d7, d15 @row 12 & 16
+    @now Q3 ->p0 and Q7->q3
+    vtrn.16 d0, d4 @row1 & 3
+    vtrn.16 d8, d12 @row 5 & 7
+    vtrn.16 d1, d5 @row9 & row11
+    vtrn.16 d9, d13 @row13 & row15
+    vtrn.32 d0, d8 @row1 & row5
+    vtrn.32 d1, d9 @row9 & 13
+    @now Q0->p3 & Q4->q0
+    @starting processing as p0 and q0 are now ready
+    @now Q1->p2 & Q5->q1
+    vpush {q7} @saving q7 in stack; it is reused as scratch below
+    vtrn.32 d4, d12 @row3 & 7
+    vmov.i16 q14, #2
+    vtrn.32 d5, d13 @row11 & row15
+    vaddl.u8 q8, d6, d8 @p0+q0 L
+    vtrn.32 d2, d10 @row2 &6
+    vaddl.u8 q9, d7, d9 @p0+q0 H
+    vtrn.32 d3, d11 @row10&row14
+    vaddw.u8 q10, q8, d4 @p0+q0+p1 L
+    vaddw.u8 q11, q9, d5 @p0+q0+p1 H
+    vaddl.u8 q12, d2, d10 @p2+q1 L
+    vaddl.u8 q13, d3, d11 @p2+q1 H
+    vmla.u16 q12, q10, q14 @p2 + X2(p1) + X2(p0) + X2(q0) + q1 L
+    vmla.u16 q13, q11, q14 @p2 + X2(p1) + X2(p0) + X2(q0) + q1 H
+    vmov.i8 q14, #2
+    vaddw.u8 q8, q10, d2 @p0+q0+p1+p2 L
+    vaddw.u8 q9, q11, d3 @p0+q0+p1+p2 H
+    vdup.i8 q15, r2 @duplicate alpha
+    vrshrn.u16 d20, q8, #2 @(p2 + p1 + p0 + q0 + 2) >> 2)L p1'
+    vrshrn.u16 d21, q9, #2 @(p2 + p1 + p0 + q0 + 2) >> 2)H p1'
+    vabd.u8 q11, q3, q4 @ABD(p0-q0)
+    vsra.u8 q14, q15, #2 @(alpha >> 2) + 2
+    vabd.u8 q15, q1, q3 @Ap = ABD(p2-p0)
+    vrshrn.u16 d24, q12, #3 @((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) L p0'
+    vrshrn.u16 d25, q13, #3 @((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) H p0'
+    vdup.i8 q13, r3 @beta
+    vcgt.u8 q14, q14, q11 @ABS(p0 - q0) <((Alpha >>2) + 2)
+    vaddl.u8 q11, d6, d10 @p0+q1 L
+    vcgt.u8 q7, q13, q15 @beta>Ap
+    vaddl.u8 q15, d7, d11 @p0+q1 H
+    vaddw.u8 q11, q11, d4 @p0+q1+p1 L
+    vaddw.u8 q15, q15, d5 @p0+q1+p1 H
+    vaddw.u8 q11, q11, d4 @p0+q1+2*p1 L
+    vaddw.u8 q15, q15, d5 @p0+q1+2*p1 H
+    vand q7, q7, q14 @(Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)
+    vrshrn.u16 d22, q11, #2 @((X2(p1) + p0 + q1 + 2) >> 2) L p0"
+    vrshrn.u16 d23, q15, #2 @((X2(p1) + p0 + q1 + 2) >> 2) H p0"
+    vaddl.u8 q15, d2, d0 @p2+p3 L
+    vbif q12, q11, q7 @p0' or p0 "
+    vaddl.u8 q11, d3, d1 @p2+p3 H
+    vadd.u16 q15, q15, q15 @2*(p2+p3) L
+    vadd.u16 q11, q11, q11 @2*(p2+p3)H
+    vadd.u16 q8, q8, q15 @(X2(p3) + X3(p2) + p1 + p0 + q0) L
+    vadd.u16 q9, q9, q11 @(X2(p3) + X3(p2) + p1 + p0 + q0) H
+    vabd.u8 q15, q6, q4 @Aq = abs(q2-q0)
+    vabd.u8 q11, q5, q4 @ABS(Q1-Q0)
+    vrshrn.u16 d16, q8, #3 @((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); L p2'
+    vrshrn.u16 d17, q9, #3 @((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); H p2'
+    vabd.u8 q9, q2, q3 @ABS(p1-p0)
+    vcgt.u8 q15, q13, q15 @Aq < Beta
+    vcge.u8 q11, q11, q13 @ABS(q1 - q0) >= Beta
+    vcge.u8 q9, q9, q13 @ABS(p1 - p0) >= beta
+    vdup.i8 q13, r2 @duplicate alpha
+    vand q15, q15, q14 @(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
+    vabd.u8 q14, q3, q4 @abs(p0-q0)
+    vorr q11, q11, q9 @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta
+    vaddl.u8 q9, d6, d8 @p0+q0 L
+    vcge.u8 q14, q14, q13 @ABS(p0 - q0) >= Alpha
+    vaddl.u8 q13, d7, d9 @p0+q0 H
+    vaddw.u8 q9, q9, d10 @p0+q0+q1 L
+    vorr q11, q11, q14 @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta||ABS(p0 - q0) >= Alpha
+    vaddw.u8 q13, q13, d11 @p0+q0+q1 H
+    vbic q7, q7, q11 @final condn for p's
+    vmov.i8 q14, #2
+    vbif q3, q12, q11 @final p0
+    vbit q1, q8, q7 @final p2
+    vbif q10, q2, q7 @final p1
+    vaddl.u8 q12, d8, d4 @q0+p1 L
+    vmlal.u8 q12, d10, d28 @X2(q1) + q0 + p1 L
+    vaddl.u8 q8, d9, d5 @q0+p1 H
+    vmlal.u8 q8, d11, d28 @X2(q1) + q0 + p1 H
+    vmov.i16 q14, #2
+    vaddl.u8 q7, d4, d12 @p1+q2 L
+    vmla.u16 q7, q9, q14 @p1 + X2(p0) + X2(q0) + X2(q1) + q2L
+    vaddl.u8 q2, d5, d13 @p1+q2H
+    vmla.u16 q2, q13, q14 @p1 + X2(p0) + X2(q0) + X2(q1) + q2H
+    vrshrn.u16 d24, q12, #2 @(X2(q1) + q0 + p1 + 2) >> 2; L q0'
+    vrshrn.u16 d25, q8, #2 @(X2(q1) + q0 + p1 + 2) >> 2; H q0'
+    vaddw.u8 q9, q9, d12 @p0 + q0 + q1 + q2 L
+    vaddw.u8 q13, q13, d13 @p0 + q0 + q1 + q2 H
+    vrshrn.u16 d16, q7, #3 @(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 L q0"
+    vpop {q7} @restore saved q7
+    vrshrn.u16 d17, q2, #3 @(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 H q0"
+    vrshrn.u16 d4, q9, #2 @p0 + q0 + q1 + q2 + 2)>>2 L q1'
+    vrshrn.u16 d5, q13, #2 @p0 + q0 + q1 + q2 + 2)>>2 H q1'
+    vbit q12, q8, q15 @q0' or q0"
+    vbic q15, q15, q11 @final condn for q's
+    vtrn.8 d0, d2 @row1 &2
+    vbit q5, q2, q15 @final q1
+    vtrn.8 d1, d3 @row9 &10
+    vaddl.u8 q8, d12, d14 @q2+q3 L
+    vtrn.8 d20, d6 @row3&row4
+    vaddl.u8 q2, d13, d15 @q2+q3 H
+    vtrn.8 d21, d7 @row11 & 12
+    vmla.u16 q9, q8, q14 @X2(q3) + X3(q2) + q1 + q0 + p0 L
+    vtrn.16 d2, d6 @row2 & row4
+    vmla.u16 q13, q2, q14 @X2(q3) + X3(q2) + q1 + q0 + p0 H
+    vtrn.16 d3, d7 @row10 & 12
+    vbif q4, q12, q11 @final q0
+    vtrn.16 d0, d20 @row1 & 3
+    vrshrn.u16 d18, q9, #3 @(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; L
+    vtrn.16 d1, d21 @row9 & row11
+    vrshrn.u16 d19, q13, #3 @(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; H
+    vtrn.8 d8, d10 @row5&6
+    vbit q6, q9, q15 @final q2
+    vtrn.8 d9, d11 @row13 &14
+    vtrn.8 d12, d14 @row7 & 8
+    vtrn.8 d13, d15 @row15 & 16
+    vtrn.16 d10, d14 @row6 & row8
+    vtrn.16 d11, d15 @row14 & row16
+    @now Q3 ->p0 and Q7->q3
+    vtrn.16 d8, d12 @row 5 & 7
+    vtrn.16 d9, d13 @row13 & row15
+    sub r0, r0, r1, lsl#4 @restore pointer
+    vtrn.32 d6, d14 @row4 & 8
+    vtrn.32 d7, d15 @row 12 & 16
+    vtrn.32 d0, d8 @row1 & row5
+    vtrn.32 d1, d9 @row9 & 13
+    vtrn.32 d2, d10 @row2 &6
+    vtrn.32 d3, d11 @row10&row14
+    vtrn.32 d20, d12 @row3 & 7
+    vtrn.32 d21, d13 @row11 & row15
+    vst1.8 d0, [r0], r1 @store row1
+    vst1.8 d2, [r0], r1 @store row2
+    vst1.8 d20, [r0], r1 @store row3
+    vst1.8 d6, [r0], r1 @store row4
+    vst1.8 d8, [r0], r1 @store row5
+    vst1.8 d10, [r0], r1 @store row6
+    vst1.8 d12, [r0], r1 @store row7
+    vst1.8 d14, [r0], r1 @store row8
+    vst1.8 d1, [r0], r1 @store row9
+    vst1.8 d3, [r0], r1 @store row10
+    vst1.8 d21, [r0], r1 @store row11
+    vst1.8 d7, [r0], r1 @store row12
+    vst1.8 d9, [r0], r1 @store row13
+    vst1.8 d11, [r0], r1 @store row14
+    vst1.8 d13, [r0], r1 @store row15
+    vst1.8 d15, [r0], r1 @store row16
+    vpop {d8 - d15}
+    ldmfd sp!, {r12, pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a luma block vertical edge when the
+@* boundary strength is set to 4 on calling twice
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.4 under the title
+@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+    .global ih264_deblk_luma_vert_bs4_mbaff_a9
+
+ih264_deblk_luma_vert_bs4_mbaff_a9:
+
+    stmfd sp!, {lr}
+
+    sub r0, r0, #4 @pointer uc_edgePixel-4
+    vpush {d8 - d15}
+    @loading [p3:p2],[p1:p0]:[q0:q1]:[q2:q3] for every row
+    vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+    vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+    vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+    vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+    vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+    vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+    vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+    vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+
+    vuzp.8 d0, d1 @D0->p3, D1->p2
+    vuzp.8 d2, d3 @D2->p1, D3->p0
+    vuzp.8 d4, d5 @D4->q0, D5->q1
+    vuzp.8 d6, d7 @D6->q2, D7->q3
+
+    vmov.i16 q14, #2
+    vaddl.u8 q4, d3, d4 @p0+q0
+    vaddw.u8 q5, q4, d2 @p0+q0+p1
+    vaddl.u8 q6, d1, d5 @p2+q1
+    vmla.u16 q6, q5, q14 @p2 + X2(p1) + X2(p0) + X2(q0) + q1
+
+    vmov.i8 d14, #2
+    vaddw.u8 q4, q5, d1 @p0+q0+p1+p2
+    vdup.i8 d15, r2 @duplicate alpha
+    vrshrn.u16 d10, q4, #2 @(p2 + p1 + p0 + q0 + 2) >> 2) p1'
+    vabd.u8 d11, d3, d4 @ABD(p0-q0)
+    vsra.u8 d14, d15, #2 @(alpha >> 2) + 2
+    vabd.u8 d15, d1, d3 @Ap = ABD(p2-p0)
+    vrshrn.u16 d12, q6, #3 @((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) p0'
+    vdup.i8 d13, r3 @beta
+    vcgt.u8 d14, d14, d11 @ABS(p0 - q0) <((Alpha >>2) + 2)
+    vaddl.u8 q8, d3, d5 @p0+q1
+    vcgt.u8 d26, d13, d15 @beta>Ap
+    vaddw.u8 q8, q8, d2 @p0+q1+p1
+    vaddw.u8 q8, q8, d2 @p0+q1+2*p1
+    vand d26, d26, d14 @(Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)
+    vrshrn.u16 d11, q8, #2 @((X2(p1) + p0 + q1 + 2) >> 2) p0"
+    vbif d12, d11, d26 @p0' or p0 "
+    vaddl.u8 q9, d1, d0 @p2+p3
+    vadd.u16 q9, q9, q9 @2*(p2+p3)
+    vadd.u16 q4, q4, q9 @(X2(p3) + X3(p2) + p1 + p0 + q0)
+    vabd.u8 d15, d6, d4 @Aq = abs(q2-q0)
+    vabd.u8 d11, d5, d4 @ABS(q1-q0)
+    vrshrn.u16 d8, q4, #3 @((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); p2'
+    vabd.u8 d9, d2, d3 @ABS(p1-p0)
+    vcgt.u8 d15, d13, d15 @Aq < Beta
+    vcge.u8 d11, d11, d13 @ABS(q1 - q0) >= Beta
+    vcge.u8 d9, d9, d13 @ABS(p1 - p0) >= beta
+    vdup.i8 d13, r2 @duplicate alpha
+    vand d15, d15, d14 @(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
+    vabd.u8 d14, d3, d4 @abs(p0-q0)
+    vorr d11, d11, d9 @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta
+    vcge.u8 d14, d14, d13 @ABS(p0 - q0) >= Alpha
+    vaddl.u8 q10, d3, d4 @p0+q0
+    vorr d11, d11, d14 @ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta||ABS(p0 - q0) >= Alpha
+    vaddw.u8 q10, q10, d5 @p0+q0+q1
+    vbic d26, d26, d11 @final condn for p's
+    vmov.i8 d14, #2
+    vbif d3, d12, d11 @final p0
+    vbit d1, d8, d26 @final p2
+    vbif d10, d2, d26 @final p1
+    vaddl.u8 q6, d4, d2 @q0+p1
+    vmlal.u8 q6, d5, d14 @X2(q1) + q0 + p1
+
+    vaddl.u8 q11, d2, d6 @p1+q2
+    vmla.u16 q11, q10, q14 @p1 + X2(p0) + X2(q0) + X2(q1) + q2
+    vrshrn.u16 d12, q6, #2 @(X2(q1) + q0 + p1 + 2) >> 2; q0'
+    vaddw.u8 q10, q10, d6 @p0 + q0 + q1 + q2
+    vrshrn.u16 d8, q11, #3 @(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 q0"
+
+    vrshrn.u16 d2, q10, #2 @p0 + q0 + q1 + q2 + 2)>>2 q1'
+    vbit d12, d8, d15 @q0' or q0"
+    vbic d15, d15, d11 @final condn for q's
+    vbit d5, d2, d15 @final q1
+    vaddl.u8 q12, d6, d7 @q2+q3
+    vmla.u16 q10, q12, q14 @X2(q3) + X3(q2) + q1 + q0 + p0
+    vbif d4, d12, d11 @final q0
+    vrshrn.u16 d9, q10, #3 @(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3;
+    vbit d6, d9, d15 @final q2
+    vand d2, d10, d10 @copy final p1 into d2; now D0->p3, D1->p2, D2->p1, D3->p0, D4->q0, D5->q1, D6->q2, D7->q3
+
+    vzip.8 d0, d1 @D0,D1 -> [p3:p2]
+    vzip.8 d2, d3 @D2,D3 -> [p1:p0]
+    vzip.8 d4, d5 @D4,D5 -> [q0:q1]
+    vzip.8 d6, d7 @D6,D7 -> [q2:q3]
+
+    sub r0, r0, r1, lsl#3 @restore pointer
+
+    @storing [p3:p2],[p1:p0]:[q0:q1]:[q2:q3] in every row
+    vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+    vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+    vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+    vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+    vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+    vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+    vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+    vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+    vpop {d8 - d15}
+    ldmfd sp!, {pc}
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Performs filtering of a luma block vertical edge for cases where the
+@* boundary strength is less than 4 on calling twice
+@*
+@* @par Description:
+@* This operation is described in Sec. 8.7.2.3 under the title
+@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+@*
+@* @param[in] r0 - pu1_src
+@* Pointer to the src sample q0
+@*
+@* @param[in] r1 - src_strd
+@* Source stride
+@*
+@* @param[in] r2 - alpha
+@* Alpha Value for the boundary
+@*
+@* @param[in] r3 - beta
+@* Beta Value for the boundary
+@*
+@* @param[in] sp(0) - u4_bs
+@* Packed Boundary strength array
+@*
+@* @param[in] sp(4) - pu1_cliptab
+@* tc0_table
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+    .global ih264_deblk_luma_vert_bslt4_mbaff_a9
+
+ih264_deblk_luma_vert_bslt4_mbaff_a9:
+
+    stmfd sp!, {r12, lr}
+
+    sub r0, r0, #4 @pointer uc_edgePixel-4
+    ldr r12, [sp, #8] @r12 = ui_Bs
+    ldr r14, [sp, #12] @r14 = pu1_ClipTab
+    vpush {d8 - d15}
+    @loading [p3:p2],[p1:p0]:[q0:q1]:[q2:q3] for every row
+    vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+    vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+    vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+    vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+    vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+    vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+    vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+    vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+
+    vuzp.8 d0, d1 @D0->p3, D1->p2
+    vuzp.8 d2, d3 @D2->p1, D3->p0
+    vuzp.8 d4, d5 @D4->q0, D5->q1
+    vuzp.8 d6, d7 @D6->q2, D7->q3
+
+    rev r12, r12 @reversing ui_bs
+    vmov.32 d8[0], r12 @D8[0] = ui_Bs
+    vld1.32 d9[0], [r14] @D9[0] contains cliptab
+    vmovl.u8 q15, d8 @q15 = ui_Bs in each 16-bit lane
+    vtbl.8 d8, {d9}, d30 @puc_ClipTab[ui_Bs]
+    vsli.16 d8, d8, #8 @D8 = C0, clip value replicated in both bytes of each halfword
+
+    vrhadd.u8 d10, d3, d4 @((p0 + q0 + 1) >> 1)
+    vmov.i8 d31, #2
+    vabd.u8 d11, d3, d4 @ABS(p0 - q0)
+    vaddl.u8 q6, d10, d1 @(p2 + ((p0 + q0 + 1) >> 1)
+    vmlsl.u8 q6, d2, d31 @(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1))
+    vdup.8 d14, r2 @alpha
+    vcle.u8 d11, d14, d11 @ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
+    vdup.i8 d14, r3 @beta
+    vabd.u8 d15, d5, d4 @ABS(q1 - q0)
+    vqshrn.s16 d12, q6, #1 @((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1)
+    vcge.u8 d15, d15, d14 @ABS(q1 - q0) >= Beta
+    vabd.u8 d13, d2, d3 @ABS(p1 - p0)
+    vmin.s8 d12, d12, d8 @min(deltap1 ,C0)
+    vorr d11, d11, d15 @ABS(q1 - q0) >= Beta ||ABS(p0 - q0) >= Alpha
+    vneg.s8 d15, d8 @-C0
+    vcge.u8 d13, d13, d14 @ABS(p1 - p0) >= Beta
+    vmax.s8 d12, d12, d15 @max(deltap1,-C0)
+    vorr d11, d11, d13 @ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)
+    vceq.u16 d13, d30, #0 @ui_bs == 0
+    vaddl.u8 q14, d10, d6 @q2 + ((p0 + q0 + 1) >> 1)
+    vsubw.u8 q14, q14, d5 @q2 + ((p0 + q0 + 1) >> 1) - q1
+    vsubw.u8 q14, q14, d5 @q2 + ((p0 + q0 + 1) >> 1) - 2*q1
+    vorr d13, d13, d11 @(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta))
+    @|| (ui_bs == 0)
+    vqshrn.s16 d9, q14, #1 @(q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1
+    vabd.u8 d11, d1, d3 @Ap = ABS(p2 - p0)
+    vabd.u8 d10, d6, d4 @Aq= ABS(q2 - q0)
+    vclt.u8 d11, d11, d14 @Ap < Beta
+    vmin.s8 d9, d9, d8 @min(deltaq1,C0)
+    vclt.u8 d10, d10, d14 @Aq < Beta
+    vmax.s8 d9, d9, d15 @max(deltaq1,-C0)
+    vsubl.u8 q7, d4, d3 @q0 - p0
+    vshl.s16 q7, q7, #2 @(q0 - p0) << 2
+    vsub.u8 d8, d8, d11 @C0 + (Ap < Beta)
+    vaddw.u8 q7, q7, d2 @((q0 - p0) << 2) + p1
+    vsubw.u8 q7, q7, d5 @((q0 - p0) << 2) + (p1 - q1)
+    vbic d11, d11, d13 @final condition for p1
+    vrshr.s16 q15, q7, #3 @delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3
+    vsub.u8 d8, d8, d10 @C0 + (Ap < Beta) + (Aq < Beta)
+    vbic d10, d10, d13 @final condition for q1
+    vabs.s16 q14, q15 @abs(delta) in 16-bit lanes
+    vmovn.i16 d15, q14 @abs(delta)
+    vand d12, d12, d11 @delatp1
+    vand d9, d9, d10 @deltaq1
+    vmin.u8 d15, d15, d8 @min((abs(delta),C)
+    vadd.i8 d2, d2, d12 @p1+deltap1
+    vadd.i8 d5, d5, d9 @q1+deltaq1
+    vbic d15, d15, d13 @abs(delta) of pixels to be changed only
+    vcge.s16 q14, q15, #0 @sign mask: delta >= 0, per 16-bit lane
+    vmovn.i16 d14, q14 @sign(delta)
+    vqsub.u8 d11, d3, d15 @clip(p0-delta)
+    vqadd.u8 d3, d3, d15 @clip(p0+delta)
+    vqadd.u8 d12, d4, d15 @clip(q0+delta)
+    vqsub.u8 d4, d4, d15 @clip(q0-delta)
+    vbif d3, d11, d14 @p0
+    vbif d4, d12, d14 @q0
+
+    sub r0, r0, r1, lsl#3 @restore pointer
+    @D0->p3, D1->p2, D2->p1, D3->p0, D4->q0, D5->q1, D6->q2, D7->q3
+    vzip.8 d0, d1 @D0,D1 -> [p3:p2]
+    vzip.8 d2, d3 @D2,D3 -> [p1:p0]
+    vzip.8 d4, d5 @D4,D5 -> [q0:q1]
+    vzip.8 d6, d7 @D6,D7 -> [q2:q3]
+
+    @storing [p3:p2],[p1:p0]:[q0:q1]:[q2:q3] in every row
+    vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
+    vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
+    vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
+    vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1
+    vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
+    vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
+    vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
+    vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1
+    vpop {d8 - d15}
+    ldmfd sp!, {r12, pc}
+
+
+
diff --git a/common/arm/ih264_default_weighted_pred_a9q.s b/common/arm/ih264_default_weighted_pred_a9q.s
new file mode 100755
index 0000000..94cda46
--- /dev/null
+++ b/common/arm/ih264_default_weighted_pred_a9q.s
@@ -0,0 +1,359 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_default_weighted_pred_a9q.s
+@*
+@* @brief
+@* Contains function definitions for default weighted prediction.
+@* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT
+@*
+@* @author
+@* Kaushik Senthoor R
+@*
+@* @par List of Functions:
+@*
+@* - ih264_default_weighted_pred_luma_a9q()
+@* - ih264_default_weighted_pred_chroma_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@*******************************************************************************
+@* @function
+@* ih264_default_weighted_pred_luma_a9q()
+@*
+@* @brief
+@* This routine performs the default weighted prediction as described in sec
+@* 8.4.2.3.1 titled "Default weighted sample prediction process" for luma.
+@*
+@* @par Description:
+@* This function gets two ht x wd blocks, calculates their rounded-average and
+@* stores it in the destination block.
+@*
+@* @param[in] pu1_src1:
+@* UWORD8 Pointer to the buffer containing the first input block.
+@*
+@* @param[in] pu1_src2:
+@* UWORD8 Pointer to the buffer containing the second input block.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd1
+@* Stride of the first input buffer
+@*
+@* @param[in] src_strd2
+@* Stride of the second input buffer
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
+@*
+@*******************************************************************************
+@*/
+@void ih264_default_weighted_pred_luma_a9q(UWORD8 *pu1_src1,
+@ UWORD8 *pu1_src2,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd1,
+@ WORD32 src_strd2,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src1
+@ r1 => pu1_src2
+@ r2 => pu1_dst
+@ r3 => src_strd1
+@ [sp] => src_strd2 (r4)
+@ [sp+4] => dst_strd (r5)
+@ [sp+8] => ht (r6)
+@ [sp+12] => wd (r7)
+@
+.text
+.p2align 2
+
+    .global ih264_default_weighted_pred_luma_a9q
+
+ih264_default_weighted_pred_luma_a9q:
+
+    stmfd sp!, {r4-r7, r14} @stack stores the values of the arguments
+    ldr r7, [sp, #32] @Load wd
+    ldr r4, [sp, #20] @Load src_strd2
+    ldr r5, [sp, #24] @Load dst_strd
+    cmp r7, #16
+    ldr r6, [sp, #28] @Load ht
+    vpush {d8-d15}
+    beq loop_16 @branch if wd is 16
+    cmp r7, #8
+    beq loop_8 @branch if wd is 8
+
+loop_4: @each iteration processes four rows
+
+    vld1.32 d0[0], [r0], r3 @load row 1 in source 1
+    vld1.32 d0[1], [r0], r3 @load row 2 in source 1
+    vld1.32 d2[0], [r1], r4 @load row 1 in source 2
+    vld1.32 d2[1], [r1], r4 @load row 2 in source 2
+
+    vld1.32 d1[0], [r0], r3 @load row 3 in source 1
+    vld1.32 d1[1], [r0], r3 @load row 4 in source 1
+    vrhadd.u8 d0, d0, d2 @rounded average of rows 1-2
+    vld1.32 d3[0], [r1], r4 @load row 3 in source 2
+    vld1.32 d3[1], [r1], r4 @load row 4 in source 2
+
+    subs r6, r6, #4 @decrement ht by 4
+    vst1.32 d0[0], [r2], r5 @store row 1 to destination
+    vst1.32 d0[1], [r2], r5 @store row 2 to destination
+    vrhadd.u8 d1, d1, d3 @rounded average of rows 3-4
+    vst1.32 d1[0], [r2], r5 @store row 3 to destination
+    vst1.32 d1[1], [r2], r5 @store row 4 to destination
+
+    bgt loop_4 @if greater than 0 repeat the loop again
+
+    b end_loops
+
+loop_8: @each iteration processes four rows
+
+    vld1.8 d0, [r0], r3 @load row 1 in source 1
+    vld1.8 d4, [r1], r4 @load row 1 in source 2
+    vld1.8 d1, [r0], r3 @load row 2 in source 1
+    vld1.8 d5, [r1], r4 @load row 2 in source 2
+    vld1.8 d2, [r0], r3 @load row 3 in source 1
+    vrhadd.u8 q0, q0, q2 @rounded average of rows 1-2
+    vld1.8 d6, [r1], r4 @load row 3 in source 2
+    vld1.8 d3, [r0], r3 @load row 4 in source 1
+    vrhadd.u8 d2, d2, d6 @rounded average of row 3
+    vld1.8 d7, [r1], r4 @load row 4 in source 2
+
+    subs r6, r6, #4 @decrement ht by 4
+    vst1.8 d0, [r2], r5 @store row 1 to destination
+    vrhadd.u8 d3, d3, d7 @rounded average of row 4
+    vst1.8 d1, [r2], r5 @store row 2 to destination
+    vst1.8 d2, [r2], r5 @store row 3 to destination
+    vst1.8 d3, [r2], r5 @store row 4 to destination
+
+    bgt loop_8 @if greater than 0 repeat the loop again
+
+    b end_loops
+
+loop_16: @each iteration processes eight rows
+
+    vld1.8 {q0}, [r0], r3 @load row 1 in source 1
+    vld1.8 {q8}, [r1], r4 @load row 1 in source 2
+    vld1.8 {q1}, [r0], r3 @load row 2 in source 1
+    vld1.8 {q9}, [r1], r4 @load row 2 in source 2
+    vrhadd.u8 q0, q0, q8 @rounded average of row 1
+    vld1.8 {q2}, [r0], r3 @load row 3 in source 1
+    vld1.8 {q10}, [r1], r4 @load row 3 in source 2
+    vrhadd.u8 q1, q1, q9 @rounded average of row 2
+    vld1.8 {q3}, [r0], r3 @load row 4 in source 1
+    vld1.8 {q11}, [r1], r4 @load row 4 in source 2
+    vrhadd.u8 q2, q2, q10 @rounded average of row 3
+    vld1.8 {q4}, [r0], r3 @load row 5 in source 1
+    vld1.8 {q12}, [r1], r4 @load row 5 in source 2
+    vrhadd.u8 q3, q3, q11 @rounded average of row 4
+    vld1.8 {q5}, [r0], r3 @load row 6 in source 1
+    vld1.8 {q13}, [r1], r4 @load row 6 in source 2
+    vrhadd.u8 q4, q4, q12 @rounded average of row 5
+    vld1.8 {q6}, [r0], r3 @load row 7 in source 1
+    vld1.8 {q14}, [r1], r4 @load row 7 in source 2
+    vrhadd.u8 q5, q5, q13 @rounded average of row 6
+    vld1.8 {q7}, [r0], r3 @load row 8 in source 1
+    vld1.8 {q15}, [r1], r4 @load row 8 in source 2
+
+    vrhadd.u8 q6, q6, q14 @rounded average of row 7
+    vst1.8 {q0}, [r2], r5 @store row 1 to destination
+    vst1.8 {q1}, [r2], r5 @store row 2 to destination
+    vrhadd.u8 q7, q7, q15 @rounded average of row 8
+    vst1.8 {q2}, [r2], r5 @store row 3 to destination
+    vst1.8 {q3}, [r2], r5 @store row 4 to destination
+    subs r6, r6, #8 @decrement ht by 8
+    vst1.8 {q4}, [r2], r5 @store row 5 to destination
+    vst1.8 {q5}, [r2], r5 @store row 6 to destination
+    vst1.8 {q6}, [r2], r5 @store row 7 to destination
+    vst1.8 {q7}, [r2], r5 @store row 8 to destination
+
+    bgt loop_16 @if greater than 0 repeat the loop again
+
+end_loops:
+
+    vpop {d8-d15}
+    ldmfd sp!, {r4-r7, r15} @Reload the registers from sp and return
+
+
+@*******************************************************************************
+@* @function
+@* ih264_default_weighted_pred_chroma_a9q()
+@*
+@* @brief
+@* This routine performs the default weighted prediction as described in sec
+@* 8.4.2.3.1 titled "Default weighted sample prediction process" for chroma.
+@*
+@* @par Description:
+@* This function gets two ht x wd blocks, calculates their rounded-average and
+@* stores it in the destination block for U and V.
+@*
+@* @param[in] pu1_src1:
+@* UWORD8 Pointer to the buffer containing the first input block.
+@*
+@* @param[in] pu1_src2:
+@* UWORD8 Pointer to the buffer containing the second input block.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd1
+@* Stride of the first input buffer
+@*
+@* @param[in] src_strd2
+@* Stride of the second input buffer
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
+@*
+@*******************************************************************************
+@*/
+@void ih264_default_weighted_pred_chroma_a9q(UWORD8 *pu1_src1,
+@ UWORD8 *pu1_src2,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd1,
+@ WORD32 src_strd2,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src1
+@ r1 => pu1_src2
+@ r2 => pu1_dst
+@ r3 => src_strd1
+@ [sp] => src_strd2 (r4)
+@ [sp+4] => dst_strd (r5)
+@ [sp+8] => ht (r6)
+@ [sp+12] => wd (r7)
+@
+
+
+    .global ih264_default_weighted_pred_chroma_a9q
+
+ih264_default_weighted_pred_chroma_a9q:
+
+    stmfd sp!, {r4-r7, r14} @stack stores the values of the arguments
+    ldr r7, [sp, #32] @Load wd
+    ldr r4, [sp, #20] @Load src_strd2
+    ldr r5, [sp, #24] @Load dst_strd
+    cmp r7, #8
+    ldr r6, [sp, #28] @Load ht
+    vpush {d8-d15}
+    beq loop_8_uv @branch if wd is 8
+    cmp r7, #4
+    beq loop_4_uv @branch if wd is 4
+
+loop_2_uv: @each iteration processes two rows
+
+    vld1.32 d0[0], [r0], r3 @load row 1 in source 1
+    vld1.32 d0[1], [r0], r3 @load row 2 in source 1
+
+    vld1.32 d1[0], [r1], r4 @load row 1 in source 2
+    vld1.32 d1[1], [r1], r4 @load row 2 in source 2
+
+    vrhadd.u8 d0, d0, d1 @rounded average of rows 1-2
+
+    subs r6, r6, #2 @decrement ht by 2
+    vst1.32 d0[0], [r2], r5 @store row 1 to destination
+    vst1.32 d0[1], [r2], r5 @store row 2 to destination
+
+    bgt loop_2_uv @if greater than 0 repeat the loop again
+
+    b end_loops_uv
+
+loop_4_uv: @each iteration processes two rows
+
+    vld1.8 d0, [r0], r3 @load row 1 in source 1
+    vld1.8 d2, [r1], r4 @load row 1 in source 2
+    vld1.8 d1, [r0], r3 @load row 2 in source 1
+    vrhadd.u8 d0, d0, d2 @rounded average of row 1
+    vld1.8 d3, [r1], r4 @load row 2 in source 2
+
+    vrhadd.u8 d1, d1, d3 @rounded average of row 2
+    vst1.8 d0, [r2], r5 @store row 1 to destination
+    subs r6, r6, #2 @decrement ht by 2
+    vst1.8 d1, [r2], r5 @store row 2 to destination
+
+    bgt loop_4_uv @if greater than 0 repeat the loop again
+
+    b end_loops_uv
+
+loop_8_uv: @each iteration processes four rows
+
+    vld1.8 {q0}, [r0], r3 @load row 1 in source 1
+    vld1.8 {q4}, [r1], r4 @load row 1 in source 2
+    vld1.8 {q1}, [r0], r3 @load row 2 in source 1
+    vrhadd.u8 q0, q0, q4 @rounded average of row 1
+    vld1.8 {q5}, [r1], r4 @load row 2 in source 2
+    vld1.8 {q2}, [r0], r3 @load row 3 in source 1
+    vrhadd.u8 q1, q1, q5 @rounded average of row 2
+    vld1.8 {q6}, [r1], r4 @load row 3 in source 2
+    vld1.8 {q3}, [r0], r3 @load row 4 in source 1
+    vrhadd.u8 q2, q2, q6 @rounded average of row 3
+    vld1.8 {q7}, [r1], r4 @load row 4 in source 2
+
+    vst1.8 {q0}, [r2], r5 @store row 1 to destination
+    vrhadd.u8 q3, q3, q7 @rounded average of row 4
+    vst1.8 {q1}, [r2], r5 @store row 2 to destination
+    subs r6, r6, #4 @decrement ht by 4
+    vst1.8 {q2}, [r2], r5 @store row 3 to destination
+    vst1.8 {q3}, [r2], r5 @store row 4 to destination
+
+    bgt loop_8_uv @if greater than 0 repeat the loop again
+
+end_loops_uv:
+
+    vpop {d8-d15}
+    ldmfd sp!, {r4-r7, r15} @Reload the registers from sp and return
+
+
diff --git a/common/arm/ih264_ihadamard_scaling_a9.s b/common/arm/ih264_ihadamard_scaling_a9.s
new file mode 100755
index 0000000..687099a
--- /dev/null
+++ b/common/arm/ih264_ihadamard_scaling_a9.s
@@ -0,0 +1,250 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ih264_ihadamard_scaling_a9.s
+@ *
+@ * @brief
+@ * Contains function definitions for inverse hadamard transform on 4x4 DC outputs
+@ * of 16x16 intra-prediction
+@ *
+@ * @author
+@ * Mohit
+@ *
+@ * @par List of Functions:
+@ * - ih264_ihadamard_scaling_4x4_a9()
+@ * - ih264_ihadamard_scaling_2x2_uv_a9()
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+@ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
+@ * of a 16x16 intra prediction macroblock, and then performs scaling.
+@ * prediction buffer
+@ *
+@ * @par Description:
+@ * The DC coefficients pass through a 2-stage inverse hadamard transform.
+@ * This inverse transformed content is scaled based on the Qp value.
+@ *
+@ * @param[in] pi2_src
+@ * input 4x4 block of DC coefficients
+@ *
+@ * @param[out] pi2_out
+@ * output 4x4 block
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * pointer to scaling list
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * pointer to weight matrix
+@ *
+@ * @param[in] u4_qp_div_6
+@ * Floor (qp/6)
+@ *
+@ * @param[in] pi4_tmp
+@ * temporary buffer of size 1*16
+@ *
+@ * @returns none
+@ *
+@ * @remarks none
+@ *
+@ *******************************************************************************
+@ */
+@ *
+@ *******************************************************************************
+@ */
+@ void ih264_ihadamard_scaling_4x4(WORD16* pi2_src,
+@ WORD16* pi2_out,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD32* pi4_tmp)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pi2_out
+@r2 => *pu2_iscal_mat
+@r3 => *pu2_weigh_mat
+@r4 => u4_qp_div_6
+
+.text
+.p2align 2
+
+ .global ih264_ihadamard_scaling_4x4_a9
+
+ih264_ihadamard_scaling_4x4_a9:
+
+@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4
+@If the macro value changes need to change the instruction according to it.
+@Only one shift is done in horizontal inverse because,
+@if u4_qp_div_6 is lesser than 4 then shift value will be negative and do negative left shift, in this case rnd_factor has value
+@if u4_qp_div_6 is greater than 4 then shift value will be positive and do left shift, here rnd_factor is 0
+
+    stmfd sp!, {r4-r12, r14}          @ stack stores the values of the arguments
+    ldr r4, [sp, #40]                 @ Loads u4_qp_div_6 (offset 40 = 10 saved core registers)
+    vdup.s32 q10, r4                  @ Populate the u4_qp_div_6 in Q10
+    ldrh r6, [r3]                     @ load pu2_weight_mat[0] , H for unsigned halfword load
+    ldrh r7, [r2]                     @ load pu2_iscal_mat[0] , H for unsigned halfword load
+    mul r6, r6, r7                    @ pu2_iscal_mat[0]*pu2_weigh_mat[0]
+    vdup.s32 q9, r6                   @ Populate pu2_iscal_mat[0]*pu2_weigh_mat[0] 32-bit in Q9
+    vpush {d8-d15}
+@=======================INVERSE HADAMARD TRANSFORM================================
+
+    vld4.s16 {d0, d1, d2, d3}, [r0]   @load x4,x5,x6,x7 (de-interleaved columns of the 4x4 block)
+    vaddl.s16 q12, d0, d3             @x0 = x4 + x7
+    vaddl.s16 q13, d1, d2             @x1 = x5 + x6
+    vsubl.s16 q14, d1, d2             @x2 = x5 - x6
+    vsubl.s16 q15, d0, d3             @x3 = x4 - x7
+
+    vadd.s32 q2, q12, q13             @pi4_tmp_ptr[0] = x0 + x1
+    vadd.s32 q3, q15, q14             @pi4_tmp_ptr[1] = x3 + x2
+    vsub.s32 q4, q12, q13             @pi4_tmp_ptr[2] = x0 - x1
+    vsub.s32 q5, q15, q14             @pi4_tmp_ptr[3] = x3 - x2
+
+    vtrn.32 q2, q3                    @Transpose the register for vertical transform
+    vtrn.32 q4, q5
+
+    vswp d5, d8                       @Q2 = x4, Q4 = x6
+    vswp d7, d10                      @Q3 = x5, Q5 = x7
+
+
+    vadd.s32 q12, q2, q5              @x0 = x4+x7
+    vadd.s32 q13, q3, q4              @x1 = x5+x6
+    vsub.s32 q14, q3, q4              @x2 = x5-x6
+    vsub.s32 q15, q2, q5              @x3 = x4-x7
+
+    vadd.s32 q0, q12, q13             @pi4_tmp_ptr[0] = x0 + x1
+    vadd.s32 q1, q15, q14             @pi4_tmp_ptr[1] = x3 + x2
+    vsub.s32 q2, q12, q13             @pi4_tmp_ptr[2] = x0 - x1
+    vsub.s32 q3, q15, q14             @pi4_tmp_ptr[3] = x3 - x2
+
+
+    vmul.s32 q0, q0, q9               @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
+    vmul.s32 q1, q1, q9               @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
+    vmul.s32 q2, q2, q9               @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
+    vmul.s32 q3, q3, q9               @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
+
+    vshl.s32 q0, q0, q10              @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3
+    vshl.s32 q1, q1, q10              @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7
+    vshl.s32 q2, q2, q10              @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11
+    vshl.s32 q3, q3, q10              @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15
+
+    vqrshrn.s32 d0, q0, #0x6          @ D0 = c[i] = ((q[i] + 32) >> 6) where i = 0..3 (rounding narrow)
+    vqrshrn.s32 d1, q1, #0x6          @ D1 = c[i] = ((q[i] + 32) >> 6) where i = 4..7
+    vqrshrn.s32 d2, q2, #0x6          @ D2 = c[i] = ((q[i] + 32) >> 6) where i = 8..11
+    vqrshrn.s32 d3, q3, #0x6          @ D3 = c[i] = ((q[i] + 32) >> 6) where i = 12..15
+
+    vst1.s16 {d0, d1, d2, d3}, [r1]   @store all 4 rows of the transformed and scaled output
+
+    vpop {d8-d15}
+    ldmfd sp!, {r4-r12, r15}          @Reload the registers from SP
+
+
+
+
+
+
+@ *******************************************************************************
+@ */
+@ * @brief This function performs a 2x2 inverse hadamard transform for chroma block
+@ *
+@ * @par Description:
+@ * The DC coefficients pass through a 2-stage inverse hadamard transform.
+@ * This inverse transformed content is scaled based on the Qp value.
+@ * Both DC blocks of the U and V planes are processed
+@ *
+@ * @param[in] pi2_src
+@ * input 1x8 block of coefficients. The first 4 are from U and the next 4 from V
+@ *
+@ * @param[out] pi2_out
+@ * output 1x8 block
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * pointer to scaling list
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * pointer to weight matrix
+@ *
+@ * @param[in] u4_qp_div_6
+@ * Floor (qp/6)
+@ *
+@ * @returns none
+@ *
+@ * @remarks none
+@ *
+@ *******************************************************************************
+@ */
+@ *
+@ *******************************************************************************
+@ */
+@ void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
+@ WORD16* pi2_out,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+
+ .global ih264_ihadamard_scaling_2x2_uv_a9
+ih264_ihadamard_scaling_2x2_uv_a9:
+
+@Registers used
+@   r0 : *pi2_src
+@   r1 : *pi2_out
+@   r2 : *pu2_iscal_mat
+@   r3 : *pu2_weigh_mat
+
+    vld1.u16 d26[0], [r2]             @ load pu2_iscal_mat[0]
+    vld1.u16 d27[0], [r3]             @ load pu2_weigh_mat[0]
+    vmull.u16 q15, d26, d27           @pu2_iscal_mat[0] * pu2_weigh_mat[0]
+    vdup.u32 q15, d30[0]              @ broadcast the combined scale factor to all lanes
+
+    vld1.u16 d28[0], [sp]             @load qp/6
+
+    vpush {d8-d15}
+
+    vmov.u16 d29, #5
+    vsubl.u16 q14, d28, d29           @(qp/6) - 5 : net shift, combines << (qp/6) with the >> 5 of scaling
+    vdup.s32 q14, d28[0]              @ broadcast shift amount (negative value => right shift in vshl)
+
+    vld2.s16 {d0, d1}, [r0]           @load 8 dc coeffs
+                                      @i2_x4,i2_x6,i2_y4,i2_y6 -> d0
+                                      @i2_x5,i2_x7,i2_y5,i2_y7 -> d1
+
+    vaddl.s16 q1, d0, d1              @ i4_x0 = i4_x4 + i4_x5;...x2
+    vsubl.s16 q2, d0, d1              @ i4_x1 = i4_x4 - i4_x5;...x3
+
+    vtrn.s32 q1, q2                   @i4_x0 i4_x1 -> q1
+
+    vadd.s32 q3, q1, q2               @i4_x4 = i4_x0+i4_x2;.. i4_x5
+    vsub.s32 q1, q1, q2               @i4_x6 = i4_x0-i4_x2;.. i4_x7
+
+    vmul.s32 q5, q3, q15              @ scale by iscal*weight
+    vmul.s32 q6, q1, q15
+
+    vshl.s32 q7, q5, q14              @ apply the (qp/6 - 5) shift
+    vshl.s32 q8, q6, q14
+
+    vmovn.s32 d18, q7                 @i4_x4 i4_x5 i4_y4 i4_y5
+    vmovn.s32 d19, q8                 @i4_x6 i4_x7 i4_y6 i4_y7
+
+    vst2.s32 {d18-d19}, [r1]          @ interleaved store restores U then V coefficient order
+
+    vpop {d8-d15}
+    bx lr
+
+
+
+
diff --git a/common/arm/ih264_inter_pred_chroma_a9q.s b/common/arm/ih264_inter_pred_chroma_a9q.s
new file mode 100755
index 0000000..afd2860
--- /dev/null
+++ b/common/arm/ih264_inter_pred_chroma_a9q.s
@@ -0,0 +1,254 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_chroma_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_chroma_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@
+
+@/**
+@/**
+@/**
+@
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Interprediction chroma filter
+@*
+@* @par Description:
+@* Applies filtering to chroma samples as mentioned in
+@* sec 8.4.2.2.2 titled "chroma sample interpolation process"
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source containing alternate U and V samples
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in]uc_dx
+@* dx value where the sample is to be produced(refer sec 8.4.2.2.2 )
+@*
+@* @param[in] uc_dy
+@* dy value where the sample is to be produced(refer sec 8.4.2.2.2 )
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_chroma(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ UWORD8 u1_dx,
+@ UWORD8 u1_dy,
+@ WORD32 ht,
+@ WORD32 wd)
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => u1_dx
+@ r5 => u1_dy
+@ r6 => height
+@ r7 => width
+@
+.text
+.p2align 2
+
+ .global ih264_inter_pred_chroma_a9q
+
+ih264_inter_pred_chroma_a9q:
+
+
+
+    stmfd sp!, {r4-r12, r14}          @store register values to stack
+    vstmdb sp!, {d8-d15}              @push neon registers to stack
+    ldr r4, [sp, #104]                @ load u1_dx (offset 104 = 40 core + 64 neon bytes saved)
+    ldr r5, [sp, #108]                @ load u1_dy
+    ldr r6, [sp, #112]                @ load ht
+    ldr r7, [sp, #116]                @ load wd
+
+    rsb r8, r4, #8                    @8-u1_dx
+    rsb r9, r5, #8                    @8-u1_dy
+    mul r10, r8, r9                   @ A = (8-dx)*(8-dy)   (sec 8.4.2.2.2 bilinear weights)
+    mul r11, r4, r9                   @ B = dx*(8-dy)
+
+    vdup.u8 d28, r10                  @ weight A for top-left samples
+    vdup.u8 d29, r11                  @ weight B for top-right samples
+
+    mul r10, r8, r5                   @ C = (8-dx)*dy
+    mul r11, r4, r5                   @ D = dx*dy
+
+    vdup.u8 d30, r10                  @ weight C for bottom-left samples
+    vdup.u8 d31, r11                  @ weight D for bottom-right samples
+
+    subs r12, r7, #2                  @if wd=2 branch to loop_2
+    beq loop_2
+    subs r12, r7, #4                  @if wd=4 branch to loop_4
+    beq loop_4
+
+loop_8:
+    sub r6, #1                        @ prologue below produces one output row
+    vld1.8 {d0, d1, d2}, [r0], r2     @ Load row0
+    vld1.8 {d5, d6, d7}, [r0], r2     @ Load row1
+    vext.8 d3, d0, d1, #2             @ row0 shifted right by one chroma pair (2 bytes, U/V interleaved)
+    vext.8 d8, d5, d6, #2             @ row1 shifted right by one chroma pair
+
+    vmull.u8 q5, d0, d28              @ A * row0
+    vmlal.u8 q5, d5, d30              @ + C * row1
+    vmlal.u8 q5, d3, d29              @ + B * row0 shifted
+    vmlal.u8 q5, d8, d31              @ + D * row1 shifted
+    vext.8 d9, d6, d7, #2
+    vext.8 d4, d1, d2, #2
+
+inner_loop_8:
+    vmull.u8 q6, d6, d30              @ second half of the 16-wide row
+    vmlal.u8 q6, d1, d28
+    vmlal.u8 q6, d9, d31
+    vmlal.u8 q6, d4, d29
+    vmov d0, d5                       @ current row becomes previous row for next iteration
+    vmov d3, d8
+
+    vqrshrun.s16 d14, q5, #6          @ (sum + 32) >> 6, saturated to 8 bits
+    vmov d1, d6
+    vmov d4, d9
+
+    vld1.8 {d5, d6, d7}, [r0], r2     @ Load next source row
+    vqrshrun.s16 d15, q6, #6
+
+    vext.8 d8, d5, d6, #2
+    subs r6, #1
+    vext.8 d9, d6, d7, #2
+    vst1.8 {q7}, [r1], r3             @ Store dest row
+
+    vmull.u8 q5, d0, d28
+    vmlal.u8 q5, d5, d30
+    vmlal.u8 q5, d3, d29
+    vmlal.u8 q5, d8, d31
+    bne inner_loop_8
+
+    vmull.u8 q6, d6, d30              @ epilogue: finish the last row started above
+    vmlal.u8 q6, d1, d28
+    vmlal.u8 q6, d9, d31
+    vmlal.u8 q6, d4, d29
+
+    vqrshrun.s16 d14, q5, #6
+    vqrshrun.s16 d15, q6, #6
+
+    vst1.8 {q7}, [r1], r3             @ Store last dest row
+
+    b end_func
+
+loop_4:
+    sub r6, #1                        @ prologue below produces one output row
+    vld1.8 {d0, d1}, [r0], r2         @ Load row0
+    vld1.8 {d2, d3}, [r0], r2         @ Load row1
+    vext.8 d1, d0, d1, #2             @ row0 shifted by one chroma pair
+    vext.8 d3, d2, d3, #2             @ row1 shifted by one chroma pair
+
+    vmull.u8 q2, d2, d30              @ C * row1
+    vmlal.u8 q2, d0, d28              @ + A * row0
+    vmlal.u8 q2, d3, d31              @ + D * row1 shifted
+    vmlal.u8 q2, d1, d29              @ + B * row0 shifted
+
+inner_loop_4:
+    subs r6, #1
+    vmov d0, d2                       @ current row becomes previous row
+    vmov d1, d3
+
+    vld1.8 {d2, d3}, [r0], r2         @ Load next source row
+    vqrshrun.s16 d6, q2, #6           @ (sum + 32) >> 6, saturated to 8 bits
+
+    vext.8 d3, d2, d3, #2
+    vst1.8 {d6}, [r1], r3             @ Store dest row
+
+    vmull.u8 q2, d0, d28
+    vmlal.u8 q2, d2, d30
+    vmlal.u8 q2, d1, d29
+    vmlal.u8 q2, d3, d31
+    bne inner_loop_4
+
+    vqrshrun.s16 d6, q2, #6           @ epilogue: last row
+    vst1.8 {d6}, [r1], r3             @ Store last dest row
+
+    b end_func
+
+loop_2:
+    vld1.8 {d0}, [r0], r2             @ Load row0
+    vext.8 d1, d0, d0, #2             @ row0 shifted by one chroma pair
+    vld1.8 {d2}, [r0], r2             @ Load row1
+    vext.8 d3, d2, d2, #2
+    vmull.u8 q2, d0, d28              @ A*row0 + B*row0' + C*row1 + D*row1'
+    vmlal.u8 q2, d1, d29
+    vmlal.u8 q2, d2, d30
+    vmlal.u8 q2, d3, d31
+    vld1.8 {d6}, [r0]                 @ Load row2 (no post-increment: re-read as row0 next iteration)
+    vqrshrun.s16 d4, q2, #6
+    vext.8 d7, d6, d6, #2
+    vst1.32 d4[0], [r1], r3           @ Store dest row0
+    vmull.u8 q4, d2, d28              @ same filter for row1/row2 pair
+    vmlal.u8 q4, d3, d29
+    vmlal.u8 q4, d6, d30
+    vmlal.u8 q4, d7, d31
+    subs r6, #2                       @ two rows done
+    vqrshrun.s16 d8, q4, #6
+    vst1.32 d8[0], [r1], r3           @ Store dest row1
+    bne loop_2                        @ loop until all rows are done
+
+end_func:
+    vldmia sp!, {d8-d15}              @ Restore neon registers that were saved
+    ldmfd sp!, {r4-r12, pc}           @ Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s b/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s
new file mode 100755
index 0000000..ea6bba0
--- /dev/null
+++ b/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s
@@ -0,0 +1,245 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_horz_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_horz_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@
+
+@/**
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Interprediction luma filter for horizontal input
+@*
+@* @par Description:
+@* Applies a 6 tap horizontal filter .The output is clipped to 8 bits
+@* sec 8.4.2.2.1 titled "Luma sample interpolation process"
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@ @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_luma_horz (
+@ UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd )
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r5 => ht
+@ r6 => wd
+
+.text
+.p2align 2
+
+
+ .global ih264_inter_pred_luma_horz_a9q
+
+ih264_inter_pred_luma_horz_a9q:
+
+    stmfd sp!, {r4-r12, r14}          @store register values to stack
+    vstmdb sp!, {d8-d15}              @push neon registers to stack
+    ldr r5, [sp, #104]                @Loads ht (offset 104 = 40 core + 64 neon bytes saved)
+    sub r0, r0, #2                    @pu1_src-2 : filter taps start two samples left of the output position
+    ldr r6, [sp, #108]                @Loads wd
+    vmov.i8 d0, #5                    @filter coeff
+    subs r12, r6, #8                  @if wd=8 branch to loop_8
+    vmov.i8 d1, #20                   @filter coeff
+    beq loop_8
+
+    subs r12, r6, #4                  @if wd=4 branch to loop_4
+    beq loop_4
+
+loop_16:    @when wd=16
+    @// Processing row0 and row1
+    vld1.8 {d2, d3, d4}, [r0], r2     @// Load row0 ;for checking loop
+    vext.8 d31, d2, d3, #5            @//extract a[5] (column1,row0)
+    vld1.8 {d5, d6, d7}, [r0], r2     @// Load row1
+    vext.8 d30, d3, d4, #5            @//extract a[5] (column2,row0)
+    vaddl.u8 q4, d31, d2              @// a0 + a5 (column1,row0)
+    vext.8 d28, d5, d6, #5            @//extract a[5] (column1,row1)
+    vaddl.u8 q5, d30, d3              @// a0 + a5 (column2,row0)
+    vext.8 d27, d6, d7, #5            @//extract a[5] (column2,row1)
+    vaddl.u8 q7, d28, d5              @// a0 + a5 (column1,row1)
+    vext.8 d31, d2, d3, #2            @//extract a[2] (column1,row0)
+    vaddl.u8 q8, d27, d6              @// a0 + a5 (column2,row1)
+    vext.8 d30, d3, d4, #2            @//extract a[2] (column2,row0)
+    vmlal.u8 q4, d31, d1              @// a0 + a5 + 20a2 (column1,row0)
+    vext.8 d28, d5, d6, #2            @//extract a[2] (column1,row1)
+    vmlal.u8 q5, d30, d1              @// a0 + a5 + 20a2 (column2,row0)
+    vext.8 d27, d6, d7, #2            @//extract a[2] (column2,row1)
+    vmlal.u8 q7, d28, d1              @// a0 + a5 + 20a2 (column1,row1)
+    vext.8 d31, d2, d3, #3            @//extract a[3] (column1,row0)
+    vmlal.u8 q8, d27, d1              @// a0 + a5 + 20a2 (column2,row1)
+    vext.8 d30, d3, d4, #3            @//extract a[3] (column2,row0)
+    vmlal.u8 q4, d31, d1              @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+    vext.8 d28, d5, d6, #3            @//extract a[3] (column1,row1)
+    vmlal.u8 q5, d30, d1              @// a0 + a5 + 20a2 + 20a3 (column2,row0)
+    vext.8 d27, d6, d7, #3            @//extract a[3] (column2,row1)
+    vmlal.u8 q7, d28, d1              @// a0 + a5 + 20a2 + 20a3 (column1,row1)
+    vext.8 d31, d2, d3, #1            @//extract a[1] (column1,row0)
+    vmlal.u8 q8, d27, d1              @// a0 + a5 + 20a2 + 20a3 (column2,row1)
+    vext.8 d30, d3, d4, #1            @//extract a[1] (column2,row0)
+    vmlsl.u8 q4, d31, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+    vext.8 d28, d5, d6, #1            @//extract a[1] (column1,row1)
+    vmlsl.u8 q5, d30, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
+    vext.8 d27, d6, d7, #1            @//extract a[1] (column2,row1)
+    vmlsl.u8 q7, d28, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+    vext.8 d31, d2, d3, #4            @//extract a[4] (column1,row0)
+    vmlsl.u8 q8, d27, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1)
+    vext.8 d30, d3, d4, #4            @//extract a[4] (column2,row0)
+    vmlsl.u8 q4, d31, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+    vext.8 d28, d5, d6, #4            @//extract a[4] (column1,row1)
+    vmlsl.u8 q5, d30, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
+    vext.8 d27, d6, d7, #4            @//extract a[4] (column2,row1)
+    vmlsl.u8 q7, d28, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+    vmlsl.u8 q8, d27, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1)
+    vqrshrun.s16 d20, q4, #5          @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+    vqrshrun.s16 d21, q5, #5          @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
+    vext.8 d31, d2, d3, #5            @//extract a[5] (column1) - recomputed at loop head; harmless filler
+    vst1.8 {d20, d21}, [r1], r3       @//Store dest row0
+    vqrshrun.s16 d23, q7, #5          @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+    vext.8 d30, d3, d4, #5            @//extract a[5] (column2) - recomputed at loop head; harmless filler
+    vqrshrun.s16 d24, q8, #5          @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1)
+    vst1.8 {d23, d24}, [r1], r3       @//Store dest row1
+    subs r5, r5, #2                   @ 2 rows done, decrement by 2
+
+    beq end_func
+    b loop_16                         @ loop if height == 8 or 16
+
+loop_8:
+@// Processing row0 and row1 (row0 is loaded and stored first)
+    vld1.8 {d5, d6}, [r0], r2         @// Load row0
+    vext.8 d28, d5, d6, #5            @//extract a[5] (column1,row0)
+    vld1.8 {d2, d3}, [r0], r2         @// Load row1
+    vext.8 d25, d5, d6, #2            @//extract a[2] (column1,row0)
+    vext.8 d31, d2, d3, #5            @//extract a[5] (column1,row1)
+    vext.8 d24, d5, d6, #3            @//extract a[3] (column1,row0)
+    vext.8 d23, d5, d6, #1            @//extract a[1] (column1,row0)
+    vext.8 d22, d5, d6, #4            @//extract a[4] (column1,row0)
+    vaddl.u8 q7, d28, d5              @// a0 + a5 (column1,row0)
+    vext.8 d29, d2, d3, #3            @//extract a[3] (column1,row1)
+    vmlal.u8 q7, d25, d1              @// a0 + a5 + 20a2 (column1,row0)
+    vmlal.u8 q7, d24, d1              @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+    vmlsl.u8 q7, d23, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+    vmlsl.u8 q7, d22, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+    vext.8 d30, d2, d3, #2            @//extract a[2] (column1,row1)
+    vaddl.u8 q4, d31, d2              @// a0 + a5 (column1,row1)
+    vext.8 d27, d2, d3, #1            @//extract a[1] (column1,row1)
+    vext.8 d26, d2, d3, #4            @//extract a[4] (column1,row1)
+    vmlal.u8 q4, d29, d1              @// a0 + a5 + 20a3 (column1,row1)
+    vmlal.u8 q4, d30, d1              @// a0 + a5 + 20a2 + 20a3 (column1,row1)
+    vmlsl.u8 q4, d27, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+    vmlsl.u8 q4, d26, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+    vqrshrun.s16 d23, q7, #5          @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+    vst1.8 {d23}, [r1], r3            @//Store dest row0
+    vqrshrun.s16 d20, q4, #5          @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+    vst1.8 {d20}, [r1], r3            @//Store dest row1
+    subs r5, r5, #2                   @ 2 rows done, decrement by 2
+
+    beq end_func                      @ Branch if height==4
+
+    b loop_8                          @looping if height =8 or 16
+
+loop_4:
+@// Processing row0 and row1 (row0 is loaded and stored first)
+    vld1.8 {d5, d6}, [r0], r2         @// Load row0
+    vext.8 d28, d5, d6, #5            @//extract a[5] (column1,row0)
+    vld1.8 {d2, d3}, [r0], r2         @// Load row1
+    vext.8 d25, d5, d6, #2            @//extract a[2] (column1,row0)
+    vext.8 d31, d2, d3, #5            @//extract a[5] (column1,row1)
+    vaddl.u8 q7, d28, d5              @// a0 + a5 (column1,row0)
+    vext.8 d24, d5, d6, #3            @//extract a[3] (column1,row0)
+    vext.8 d23, d5, d6, #1            @//extract a[1] (column1,row0)
+    vext.8 d22, d5, d6, #4            @//extract a[4] (column1,row0)
+    vext.8 d29, d2, d3, #3            @//extract a[3] (column1,row1)
+    vmlal.u8 q7, d25, d1              @// a0 + a5 + 20a2 (column1,row0)
+    vmlal.u8 q7, d24, d1              @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+    vmlsl.u8 q7, d23, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+    vmlsl.u8 q7, d22, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+    vaddl.u8 q4, d31, d2              @// a0 + a5 (column1,row1)
+    vext.8 d30, d2, d3, #2            @//extract a[2] (column1,row1)
+    vext.8 d27, d2, d3, #1            @//extract a[1] (column1,row1)
+    vext.8 d26, d2, d3, #4            @//extract a[4] (column1,row1)
+    vmlal.u8 q4, d29, d1              @// a0 + a5 + 20a3 (column1,row1)
+    vmlal.u8 q4, d30, d1              @// a0 + a5 + 20a2 + 20a3 (column1,row1)
+    vmlsl.u8 q4, d27, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+    vmlsl.u8 q4, d26, d0              @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+    vqrshrun.s16 d23, q7, #5          @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+    vst1.32 d23[0], [r1], r3          @//Store dest row0
+    vqrshrun.s16 d20, q4, #5          @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+    vst1.32 d20[0], [r1], r3          @//Store dest row1
+    subs r5, r5, #2                   @ 2 rows done, decrement by 2
+    beq end_func
+
+    b loop_4
+
+end_func:
+    vldmia sp!, {d8-d15}              @ Restore neon registers that were saved
+    ldmfd sp!, {r4-r12, pc}           @Restoring registers from stack
+
+
+
+
diff --git a/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s b/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s
new file mode 100755
index 0000000..5b29e02
--- /dev/null
+++ b/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s
@@ -0,0 +1,301 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_vert_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_vert_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@
+
+@/**
+@/**
+@/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * Interprediction luma filter for vertical input
+@ *
+@ * @par Description:
+@ * Applies a 6 tap vertical filter. The output is clipped to 8 bits
+@ * sec 8.4.2.2.1 titled "Luma sample interpolation process"
+@ *
+@ * @param[in] pu1_src
+@ * UWORD8 pointer to the source
+@ *
+@ * @param[out] pu1_dst
+@ * UWORD8 pointer to the destination
+@ *
+@ * @param[in] src_strd
+@ * integer source stride
+@ *
+@ * @param[in] dst_strd
+@ * integer destination stride
+@ *
+@ * @param[in] ht
+@ * integer height of the array
+@ *
+@ * @param[in] wd
+@ * integer width of the array
+@ *
+@ * @returns
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+
+@void ih264_inter_pred_luma_vert (
+@ UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd )
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r5 => ht
+@ r6 => wd
+
+.text
+.p2align 2
+
+
+ .global ih264_inter_pred_luma_vert_a9q
+
+ih264_inter_pred_luma_vert_a9q:
+
+    stmfd sp!, {r4-r12, r14}          @store register values to stack
+    vstmdb sp!, {d8-d15}              @push neon registers to stack
+    ldr r5, [sp, #104]                @Loads ht (offset 104 = 40 core + 64 neon bytes saved)
+    sub r0, r0, r2, lsl #1            @pu1_src-2*src_strd : filter taps start two rows above
+    ldr r6, [sp, #108]                @Loads wd
+    vmov.u16 q11, #20                 @ Filter coeff 0x14 into Q11
+
+    subs r12, r6, #8                  @if wd=8 branch to loop_8
+    vmov.u16 q12, #5                  @ Filter coeff 0x5 into Q12
+    beq loop_8
+
+    subs r12, r6, #4                  @if wd=4 branch to loop_4
+    beq loop_4
+
+loop_16:    @when wd=16, 4 output rows per iteration (software pipelined)
+
+    vld1.u32 {q0}, [r0], r2           @ Vector load from src[0_0]
+    vld1.u32 {q1}, [r0], r2           @ Vector load from src[1_0]
+    vld1.u32 {q2}, [r0], r2           @ Vector load from src[2_0]
+    vld1.u32 {q3}, [r0], r2           @ Vector load from src[3_0]
+    vld1.u32 {q4}, [r0], r2           @ Vector load from src[4_0]
+    vaddl.u8 q6, d4, d6               @ temp1 = src[2_0] + src[3_0]
+    vld1.u32 {q5}, [r0], r2           @ Vector load from src[5_0]
+
+    vaddl.u8 q7, d0, d10              @ temp = src[0_0] + src[5_0]
+    vaddl.u8 q8, d2, d8               @ temp2 = src[1_0] + src[4_0]
+    vmla.u16 q7, q6, q11              @ temp += temp1 * 20
+    vaddl.u8 q10, d1, d11             @ temp4 = src[0_8] + src[5_8]
+    vaddl.u8 q9, d5, d7               @ temp3 = src[2_8] + src[3_8]
+    vmla.u16 q10, q9, q11             @ temp4 += temp3 * 20
+    vld1.u32 {q0}, [r0], r2           @ reuse q0 for src row 6 (rows rotate through q0-q5)
+    vaddl.u8 q13, d3, d9              @ temp5 = src[1_8] + src[4_8]
+    vaddl.u8 q6, d6, d8
+    vmls.u16 q7, q8, q12              @ temp -= temp2 * 5
+    vaddl.u8 q8, d2, d0
+    vaddl.u8 q9, d4, d10
+    vmla.u16 q8, q6, q11
+    vmls.u16 q10, q13, q12            @ temp4 -= temp5 * 5
+    vaddl.u8 q13, d5, d11
+    vaddl.u8 q6, d7, d9
+    vqrshrun.s16 d30, q7, #5          @ dst[0_0] = CLIP_U8((temp +16) >> 5)
+    vaddl.u8 q7, d3, d1
+    vld1.u32 {q1}, [r0], r2           @ src row 7 into q1
+    vmla.u16 q7, q6, q11
+    vmls.u16 q8, q9, q12
+    vqrshrun.s16 d31, q10, #5         @ dst[0_8] = CLIP_U8((temp4 +16) >> 5)
+    vaddl.u8 q9, d4, d2
+    vaddl.u8 q6, d8, d10
+
+    vst1.u32 {q15}, [r1], r3          @ Vector store to dst[0_0]
+    vmla.u16 q9, q6, q11
+    vaddl.u8 q10, d6, d0
+    vmls.u16 q7, q13, q12
+    vqrshrun.s16 d30, q8, #5
+    vaddl.u8 q6, d9, d11
+    vaddl.u8 q8, d5, d3
+    vaddl.u8 q13, d7, d1
+    vmla.u16 q8, q6, q11
+    vmls.u16 q9, q10, q12
+    vld1.u32 {q2}, [r0], r2           @ src row 8 into q2
+
+    vqrshrun.s16 d31, q7, #5
+    vaddl.u8 q6, d10, d0
+    vaddl.u8 q7, d6, d4
+    vaddl.u8 q10, d8, d2
+    vmla.u16 q7, q6, q11
+    vmls.u16 q8, q13, q12
+    vst1.u32 {q15}, [r1], r3          @store row 1
+    vqrshrun.s16 d30, q9, #5
+    vaddl.u8 q9, d7, d5
+    vaddl.u8 q6, d11, d1
+    vmla.u16 q9, q6, q11
+    vaddl.u8 q13, d9, d3
+    vmls.u16 q7, q10, q12
+
+    vqrshrun.s16 d31, q8, #5
+    vmls.u16 q9, q13, q12
+    vaddl.u8 q6, d0, d2               @ temp1 for next block (registers have rotated)
+    vst1.u32 {q15}, [r1], r3          @store row 2
+    vaddl.u8 q8, d10, d4              @ temp2 for next block
+    vaddl.u8 q10, d9, d7              @ temp4 for next block
+    vqrshrun.s16 d30, q7, #5
+    vaddl.u8 q13, d5, d11             @ temp5 for next block
+    vaddl.u8 q7, d8, d6               @ temp for next block
+    vqrshrun.s16 d31, q9, #5
+    vmla.u16 q7, q6, q11              @ temp += temp1 * 20
+    vaddl.u8 q9, d1, d3               @ temp3 for next block
+    vst1.u32 {q15}, [r1], r3          @store row 3
+    subs r5, r5, #4                   @ 4 rows processed, decrement by 4
+    subne r0, r0 , r2, lsl #2         @ rewind src by 4 rows ...
+    subne r0, r0, r2                  @ ... plus 1 more: re-read the 5 overlap rows next pass
+    beq end_func                      @ Branch if height==4
+
+    b loop_16                         @ looping if height = 8 or 16
+
+loop_8:
+@// Processing row0 and row1
+
+    vld1.u32 d0, [r0], r2             @ Vector load from src[0_0]
+    vld1.u32 d1, [r0], r2             @ Vector load from src[1_0]
+    vld1.u32 d2, [r0], r2             @ Vector load from src[2_0]
+    vld1.u32 d3, [r0], r2             @ Vector load from src[3_0]
+    vld1.u32 d4, [r0], r2             @ Vector load from src[4_0]
+    vld1.u32 d5, [r0], r2             @ Vector load from src[5_0]
+
+    vaddl.u8 q3, d2, d3               @ temp1 = src[2_0] + src[3_0]
+    vaddl.u8 q4, d0, d5               @ temp = src[0_0] + src[5_0]
+    vaddl.u8 q5, d1, d4               @ temp2 = src[1_0] + src[4_0]
+    vmla.u16 q4, q3, q11              @ temp += temp1 * 20
+    vld1.u32 d6, [r0], r2             @ src row 6
+    vaddl.u8 q7, d3, d4
+    vaddl.u8 q8, d1, d6
+    vaddl.u8 q9, d2, d5
+    vmls.u16 q4, q5, q12              @ temp -= temp2 * 5
+    vmla.u16 q8, q7, q11
+    vld1.u32 d7, [r0], r2             @ src row 7
+    vaddl.u8 q10, d4, d5
+    vaddl.u8 q6, d2, d7
+    vaddl.u8 q5, d3, d6
+    vmls.u16 q8, q9, q12
+    vqrshrun.s16 d26, q4, #5          @ dst[0_0] = CLIP_U8( (temp + 16) >> 5)
+    vmla.u16 q6, q10, q11
+    vld1.u32 d0, [r0], r2             @ src row 8 (reuses d0)
+    vaddl.u8 q7, d5, d6
+    vqrshrun.s16 d27, q8, #5
+    vaddl.u8 q10, d3, d0
+    vmls.u16 q6, q5, q12
+    vst1.u32 d26, [r1], r3            @ Vector store to dst[0_0]
+    vaddl.u8 q9, d4, d7
+    vmla.u16 q10, q7, q11
+    vst1.u32 d27, [r1], r3
+    vqrshrun.s16 d28, q6, #5
+    vst1.u32 d28, [r1], r3
+    vmls.u16 q10, q9, q12
+    vqrshrun.s16 d29, q10, #5
+    vst1.u32 d29, [r1], r3            @store row 3
+
+    subs r5, r5, #4                   @ 4 rows processed, decrement by 4
+    subne r0, r0 , r2, lsl #2         @ rewind src by 4 rows ...
+    subne r0, r0, r2                  @ ... plus 1 more: re-read the 5 overlap rows next pass
+    beq end_func                      @ Branch if height==4
+
+    b loop_8                          @looping if height == 8 or 16
+
+
+loop_4:
+@// Processing row0 and row1
+
+    vld1.u32 d0[0], [r0], r2          @ Vector load from src[0_0]
+    vld1.u32 d1[0], [r0], r2          @ Vector load from src[1_0]
+    vld1.u32 d2[0], [r0], r2          @ Vector load from src[2_0]
+    vld1.u32 d3[0], [r0], r2          @ Vector load from src[3_0]
+    vld1.u32 d4[0], [r0], r2          @ Vector load from src[4_0]
+    vld1.u32 d5[0], [r0], r2          @ Vector load from src[5_0]
+
+    vaddl.u8 q3, d2, d3               @ temp1 = src[2_0] + src[3_0]
+    vaddl.u8 q4, d0, d5               @ temp = src[0_0] + src[5_0]
+    vaddl.u8 q5, d1, d4               @ temp2 = src[1_0] + src[4_0]
+    vmla.u16 q4, q3, q11              @ temp += temp1 * 20
+    vld1.u32 d6[0], [r0], r2          @ src row 6
+    vaddl.u8 q7, d3, d4
+    vaddl.u8 q8, d1, d6
+    vaddl.u8 q9, d2, d5
+    vmls.u16 q4, q5, q12              @ temp -= temp2 * 5
+    vld1.u32 d7[0], [r0], r2          @ src row 7
+    vmla.u16 q8, q7, q11
+    vaddl.u8 q10, d4, d5
+    vaddl.u8 q6, d2, d7
+    vaddl.u8 q5, d3, d6
+    vmls.u16 q8, q9, q12
+    vqrshrun.s16 d26, q4, #5          @ dst[0_0] = CLIP_U8( (temp + 16) >> 5)
+    vmla.u16 q6, q10, q11
+    vld1.u32 d0[0], [r0], r2          @ src row 8 (reuses d0)
+    vaddl.u8 q7, d5, d6
+    vqrshrun.s16 d27, q8, #5
+    vaddl.u8 q10, d3, d0
+    vmls.u16 q6, q5, q12
+    vst1.u32 d26[0], [r1], r3         @ Vector store to dst[0_0]
+    vaddl.u8 q9, d4, d7
+    vmla.u16 q10, q7, q11
+    vst1.u32 d27[0], [r1], r3
+    vqrshrun.s16 d28, q6, #5
+    vst1.u32 d28[0], [r1], r3
+    vmls.u16 q10, q9, q12
+    vqrshrun.s16 d29, q10, #5
+    vst1.u32 d29[0], [r1], r3         @store row 3
+
+    subs r5, r5, #8                   @ 4 rows done; result is 0 only when ht==8
+    subeq r0, r0, r2, lsl #2          @ rewind src by 4 rows before the second pass ...
+    subeq r0, r0, r2                  @ ... plus 1 more (5 overlap rows)
+    beq loop_4                        @ Loop if height==8
+
+end_func:
+    vldmia sp!, {d8-d15}              @ Restore neon registers that were saved
+    ldmfd sp!, {r4-r12, pc}           @Restoring registers from stack
+
+
+
+
diff --git a/common/arm/ih264_inter_pred_luma_bilinear_a9q.s b/common/arm/ih264_inter_pred_luma_bilinear_a9q.s
new file mode 100755
index 0000000..6a3c83d
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_bilinear_a9q.s
@@ -0,0 +1,398 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_bilinear_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_bilinear_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@
+
+@/**
+@/**
+@/**
+@ *******************************************************************************
+@ * function:ih264_inter_pred_luma_bilinear
+@ *
+@* @brief
+@* This routine applies the bilinear filter to the predictors .
+@* The filtering operation is described in
+@* sec 8.4.2.2.1 titled "Luma sample interpolation process"
+@*
+@* @par Description:
+@\note
+@* This function is called to obtain pixels lying at the following
+@* locations: (1/4,1), (3/4,1), (1,1/4), (1,3/4), (1/4,1/2), (3/4,1/2), (1/2,1/4), (1/2,3/4), (3/4,1/4), (1/4,3/4), (3/4,3/4) and (1/4,1/4).
+@* The function averages each pair of corresponding values from the two input arrays.
+@*
+@*
+@* @param[in] pu1_src1:
+@* UWORD8 Pointer to the buffer containing the first input array.
+@*
+@* @param[in] pu1_src2:
+@* UWORD8 Pointer to the buffer containing the second input array.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output of bilinear filter is stored.
+@*
+@* @param[in] src_strd1
+@* Stride of the first input buffer
+@*
+@* @param[in] src_strd2
+@* Stride of the second input buffer
+@*
+@* @param[in] dst_strd
+@* integer destination stride of pu1_dst
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_luma_bilinear(UWORD8 *pu1_src1,
+@ UWORD8 *pu1_src2,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd1,
+@ WORD32 src_strd2,
+@ WORD32 dst_strd,
+@ WORD32 height,
+@ WORD32 width)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src1
+@ r1 => *pu1_src2
+@ r2 => *pu1_dst
+@ r3 => src_strd1
+@ r4 => src_strd2
+@ r5 => dst_strd
+@ r6 => height
+@ r7 => width
+@
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_bilinear_a9q
+
+ih264_inter_pred_luma_bilinear_a9q: @ dst = (src1 + src2 + 1) >> 1, pointwise over a wd x ht block
+
+
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack (AAPCS callee-saved d8-d15)
+ ldr r4, [sp, #104] @ src_strd2 (first stack arg: 40B core regs + 64B neon regs pushed)
+ ldr r5, [sp, #108] @ dst_strd
+ ldr r6, [sp, #112] @ height
+ ldr r7, [sp, #116] @ width
+
+ subs r12, r7, #4 @if wd=4 branch to loop_4
+ beq loop_4
+ subs r12, r7, #8 @if wd=8 branch to loop_8
+ beq loop_8
+
+loop_16: @ wd == 16: average two 16-wide arrays, 8 rows per pass (ht is 8 or 16)
+
+ vld1.8 {q0}, [r0], r3 @// Load row0 ;src1
+ vld1.8 {q2}, [r1], r4 @// Load row0 ;src2
+ vld1.8 {q1}, [r0], r3 @// Load row1 ;src1
+ vaddl.u8 q10, d0, d4 @ row0: src1 + src2, widened to u16 (low 8 pixels)
+ vld1.8 {q3}, [r1], r4 @// Load row1 ;src2
+ vaddl.u8 q11, d1, d5 @ row0: src1 + src2 (high 8 pixels)
+ vld1.8 {q4}, [r0], r3 @// Load row2 ;src1
+ vaddl.u8 q12, d2, d6
+ vld1.8 {q5}, [r0], r3 @// Load row3 ;src1
+ vaddl.u8 q13, d3, d7
+ vld1.8 {q6}, [r1], r4 @// Load row2 ;src2
+ vaddl.u8 q8, d8, d12
+ vld1.8 {q7}, [r1], r4 @// Load row3 ;src2
+ vaddl.u8 q9, d9, d13
+ vqrshrun.s16 d28, q10, #1 @ (sum + 1) >> 1, saturating narrow to u8
+ vqrshrun.s16 d29, q11, #1
+ vaddl.u8 q10, d10, d14
+ vqrshrun.s16 d30, q12, #1
+ vqrshrun.s16 d31, q13, #1
+ vst1.8 {q14}, [r2], r5 @//Store dest row0
+ vaddl.u8 q11, d11, d15
+ vst1.8 {q15}, [r2], r5 @//Store dest row1
+ vqrshrun.s16 d28, q8, #1
+ vld1.8 {q0}, [r0], r3 @// Load row4 ;src1
+ vqrshrun.s16 d29, q9, #1
+ vld1.8 {q1}, [r0], r3 @// Load row5 ;src1
+ vqrshrun.s16 d30, q10, #1
+ vld1.8 {q2}, [r1], r4 @// Load row4 ;src2
+ vqrshrun.s16 d31, q11, #1
+ vld1.8 {q3}, [r1], r4 @// Load row5 ;src2
+ vaddl.u8 q10, d0, d4
+ vst1.8 {q14}, [r2], r5 @//Store dest row2
+ vaddl.u8 q13, d3, d7
+ vst1.8 {q15}, [r2], r5 @//Store dest row3
+ vaddl.u8 q11, d1, d5
+ vld1.8 {q4}, [r0], r3 @// Load row6 ;src1
+ vaddl.u8 q12, d2, d6
+ vld1.8 {q5}, [r0], r3 @// Load row7 ;src1
+ vqrshrun.s16 d28, q10, #1
+ vld1.8 {q6}, [r1], r4 @// Load row6 ;src2
+ vqrshrun.s16 d29, q11, #1
+ vld1.8 {q7}, [r1], r4 @// Load row7 ;src2
+ vaddl.u8 q8, d8, d12
+ vaddl.u8 q9, d9, d13
+ vaddl.u8 q10, d10, d14
+ vqrshrun.s16 d30, q12, #1
+ vqrshrun.s16 d31, q13, #1
+ vst1.8 {q14}, [r2], r5 @//Store dest row4
+ vaddl.u8 q11, d11, d15
+ vst1.8 {q15}, [r2], r5 @//Store dest row5
+ vqrshrun.s16 d28, q8, #1
+ vqrshrun.s16 d30, q10, #1
+ vqrshrun.s16 d29, q9, #1
+ vld1.8 {q2}, [r1], r4 @// Load row8 ;src2 (speculative: loaded before the ht check below)
+ vqrshrun.s16 d31, q11, #1
+ vst1.8 {q14}, [r2], r5 @//Store dest row6
+ subs r12, r6, #8 @ ht - 8 == 0 ?
+ vst1.8 {q15}, [r2], r5 @//Store dest row7
+
+ beq end_func @ end function if ht=8
+
+ vld1.8 {q0}, [r0], r3 @// Load row8 ;src1
+ vaddl.u8 q10, d0, d4
+ vld1.8 {q1}, [r0], r3 @// Load row9 ;src1
+ vaddl.u8 q11, d1, d5
+ vld1.8 {q3}, [r1], r4 @// Load row9 ;src2
+ vqrshrun.s16 d28, q10, #1
+ vld1.8 {q4}, [r0], r3 @// Load row10 ;src1
+ vqrshrun.s16 d29, q11, #1
+ vld1.8 {q5}, [r0], r3 @// Load row11 ;src1
+ vaddl.u8 q12, d2, d6
+ vld1.8 {q6}, [r1], r4 @// Load row10 ;src2
+ vaddl.u8 q13, d3, d7
+ vld1.8 {q7}, [r1], r4 @// Load row11 ;src2
+ vaddl.u8 q8, d8, d12
+ vaddl.u8 q9, d9, d13
+ vaddl.u8 q10, d10, d14
+ vqrshrun.s16 d30, q12, #1
+ vst1.8 {q14}, [r2], r5 @//Store dest row8
+ vqrshrun.s16 d31, q13, #1
+ vst1.8 {q15}, [r2], r5 @//Store dest row9
+ vqrshrun.s16 d28, q8, #1
+ vld1.8 {q0}, [r0], r3 @// Load row12 ;src1
+ vaddl.u8 q11, d11, d15
+ vld1.8 {q1}, [r0], r3 @// Load row13 ;src1
+ vqrshrun.s16 d29, q9, #1
+ vld1.8 {q2}, [r1], r4 @// Load row12 ;src2
+ vqrshrun.s16 d30, q10, #1
+ vld1.8 {q3}, [r1], r4 @// Load row13 ;src2
+ vqrshrun.s16 d31, q11, #1
+ vst1.8 {q14}, [r2], r5 @//Store dest row10
+ vaddl.u8 q10, d0, d4
+ vst1.8 {q15}, [r2], r5 @//Store dest row11
+ vaddl.u8 q11, d1, d5
+ vld1.8 {q4}, [r0], r3 @// Load row14 ;src1
+ vaddl.u8 q13, d3, d7
+ vld1.8 {q5}, [r0], r3 @// Load row15 ;src1
+ vaddl.u8 q12, d2, d6
+ vld1.8 {q6}, [r1], r4 @// Load row14 ;src2
+ vaddl.u8 q8, d8, d12
+ vld1.8 {q7}, [r1], r4 @// Load row15 ;src2
+ vaddl.u8 q9, d9, d13
+ vqrshrun.s16 d28, q10, #1
+ vqrshrun.s16 d29, q11, #1
+ vaddl.u8 q10, d10, d14
+ vst1.8 {q14}, [r2], r5 @//Store dest row12
+ vqrshrun.s16 d30, q12, #1
+ vqrshrun.s16 d31, q13, #1
+ vaddl.u8 q11, d11, d15
+ vst1.8 {q15}, [r2], r5 @//Store dest row13
+ vqrshrun.s16 d28, q8, #1
+ vqrshrun.s16 d29, q9, #1
+ vqrshrun.s16 d30, q10, #1
+ vst1.8 {q14}, [r2], r5 @//Store dest row14
+ vqrshrun.s16 d31, q11, #1
+ vst1.8 {q15}, [r2], r5 @//Store dest row15
+ b end_func
+
+
+
+loop_8: @ wd == 8: average two 8-wide arrays, ht rows (4, 8 or 16)
+ vld1.8 {d0}, [r0], r3 @// Load row0 ;src1
+ vld1.8 {d4}, [r1], r4 @// Load row0 ;src2
+ vld1.8 {d1}, [r0], r3 @// Load row1 ;src1
+ vaddl.u8 q10, d0, d4 @ row0: src1 + src2, widened to u16
+ vld1.8 {d5}, [r1], r4 @// Load row1 ;src2
+ vld1.8 {d2}, [r0], r3 @// Load row2 ;src1
+ vqrshrun.s16 d28, q10, #1 @ (sum + 1) >> 1, saturating narrow to u8
+ vld1.8 {d6}, [r1], r4 @// Load row2 ;src2
+ vaddl.u8 q11, d1, d5
+ vld1.8 {d3}, [r0], r3 @// Load row3 ;src1
+ vaddl.u8 q12, d2, d6
+ vst1.8 {d28}, [r2], r5 @//Store dest row0
+ vqrshrun.s16 d29, q11, #1
+ vld1.8 {d7}, [r1], r4 @// Load row3 ;src2
+ vqrshrun.s16 d30, q12, #1
+ vst1.8 {d29}, [r2], r5 @//Store dest row1
+ vaddl.u8 q13, d3, d7
+ vst1.8 {d30}, [r2], r5 @//Store dest row2
+ vqrshrun.s16 d31, q13, #1
+ subs r12, r6, #4 @ ht - 4 == 0 ?
+ vst1.8 {d31}, [r2], r5 @//Store dest row3
+ beq end_func @ end function if ht=4
+
+ vld1.8 {d12}, [r1], r4 @// Load row4 ;src2
+ vld1.8 {d8}, [r0], r3 @// Load row4 ;src1
+ vld1.8 {d9}, [r0], r3 @// Load row5 ;src1
+ vaddl.u8 q8, d8, d12
+ vld1.8 {d13}, [r1], r4 @// Load row5 ;src2
+ vld1.8 {d10}, [r0], r3 @// Load row6;src1
+ vaddl.u8 q9, d9, d13
+ vld1.8 {d14}, [r1], r4 @// Load row6 ;src2
+ vqrshrun.s16 d28, q8, #1
+ vld1.8 {d11}, [r0], r3 @// Load row7 ;src1
+ vqrshrun.s16 d29, q9, #1
+ vst1.8 {d28}, [r2], r5 @//Store dest row4
+ vaddl.u8 q10, d10, d14
+ vst1.8 {d29}, [r2], r5 @//Store dest row5
+ vqrshrun.s16 d30, q10, #1
+ vld1.8 {d15}, [r1], r4 @// Load row7 ;src2
+ vaddl.u8 q11, d11, d15
+ vst1.8 {d30}, [r2], r5 @//Store dest row6
+ vqrshrun.s16 d31, q11, #1
+ subs r12, r6, #8 @ ht - 8 == 0 ?
+ vst1.8 {d31}, [r2], r5 @//Store dest row7
+ beq end_func @ end function if ht=8
+
+ vld1.8 {d0}, [r0], r3 @// Load row8 ;src1
+ vld1.8 {d4}, [r1], r4 @// Load row8 ;src2
+ vld1.8 {d1}, [r0], r3 @// Load row9 ;src1
+ vaddl.u8 q10, d0, d4
+ vld1.8 {d5}, [r1], r4 @// Load row9 ;src2
+ vld1.8 {d2}, [r0], r3 @// Load row10 ;src1
+ vaddl.u8 q11, d1, d5
+ vld1.8 {d6}, [r1], r4 @// Load row10 ;src2
+ vqrshrun.s16 d28, q10, #1
+ vld1.8 {d3}, [r0], r3 @// Load row11 ;src1
+ vaddl.u8 q12, d2, d6
+ vld1.8 {d7}, [r1], r4 @// Load row11 ;src2
+ vqrshrun.s16 d29, q11, #1
+ vld1.8 {d8}, [r0], r3 @// Load row12 ;src1
+ vaddl.u8 q13, d3, d7
+ vst1.8 {d28}, [r2], r5 @//Store dest row8
+ vqrshrun.s16 d30, q12, #1
+ vld1.8 {d12}, [r1], r4 @// Load row12 ;src2
+ vqrshrun.s16 d31, q13, #1
+ vst1.8 {d29}, [r2], r5 @//Store dest row9
+ vaddl.u8 q8, d8, d12
+ vld1.8 {d9}, [r0], r3 @// Load row13 ;src1
+ vqrshrun.s16 d28, q8, #1
+ vld1.8 {d13}, [r1], r4 @// Load row13 ;src2
+ vld1.8 {d10}, [r0], r3 @// Load row14;src1
+ vaddl.u8 q9, d9, d13
+ vld1.8 {d11}, [r0], r3 @// Load row15 ;src1
+ vld1.8 {d14}, [r1], r4 @// Load row14 ;src2
+ vqrshrun.s16 d29, q9, #1
+ vld1.8 {d15}, [r1], r4 @// Load row15 ;src2
+ vaddl.u8 q10, d10, d14
+ vst1.8 {d30}, [r2], r5 @//Store dest row10
+ vaddl.u8 q11, d11, d15
+ vst1.8 {d31}, [r2], r5 @//Store dest row11
+ vqrshrun.s16 d30, q10, #1
+ vst1.8 {d28}, [r2], r5 @//Store dest row12
+ vqrshrun.s16 d31, q11, #1
+ vst1.8 {d29}, [r2], r5 @//Store dest row13
+ vst1.8 {d30}, [r2], r5 @//Store dest row14
+ vst1.8 {d31}, [r2], r5 @//Store dest row15
+
+ b end_func
+
+
+
<br>
+loop_4: @ wd == 4: one 32-bit lane per row; ht is 4 or 8
+ vld1.32 d0[0], [r0], r3 @// Load row0 ;src1
+ vld1.32 d4[0], [r1], r4 @// Load row0 ;src2
+ vld1.32 d1[0], [r0], r3 @// Load row1 ;src1
+ vaddl.u8 q10, d0, d4 @ row0: src1 + src2, widened to u16
+ vld1.32 d5[0], [r1], r4 @// Load row1 ;src2
+ vld1.32 d2[0], [r0], r3 @// Load row2 ;src1
+ vqrshrun.s16 d28, q10, #1 @ (sum + 1) >> 1, saturating narrow to u8
+ vld1.32 d6[0], [r1], r4 @// Load row2 ;src2
+ vaddl.u8 q11, d1, d5
+ vld1.32 d3[0], [r0], r3 @// Load row3 ;src1
+ vaddl.u8 q12, d2, d6
+ vst1.32 d28[0], [r2], r5 @//Store dest row0
+ vqrshrun.s16 d29, q11, #1
+ vld1.32 d7[0], [r1], r4 @// Load row3 ;src2
+ vqrshrun.s16 d30, q12, #1
+ vst1.32 d29[0], [r2], r5 @//Store dest row1
+ vaddl.u8 q13, d3, d7
+ vst1.32 d30[0], [r2], r5 @//Store dest row2
+ vqrshrun.s16 d31, q13, #1
+ subs r12, r6, #4 @ ht - 4 == 0 ?
+ vst1.32 d31[0], [r2], r5 @//Store dest row3
+ beq end_func @ end function if ht=4
+
+ vld1.32 d12[0], [r1], r4 @// Load row4 ;src2
+ vld1.32 d8[0], [r0], r3 @// Load row4 ;src1
+ vld1.32 d9[0], [r0], r3 @// Load row5 ;src1
+ vaddl.u8 q8, d8, d12
+ vld1.32 d13[0], [r1], r4 @// Load row5 ;src2
+ vld1.32 d10[0], [r0], r3 @// Load row6;src1
+ vaddl.u8 q9, d9, d13
+ vld1.32 d14[0], [r1], r4 @// Load row6 ;src2
+ vqrshrun.s16 d28, q8, #1
+ vld1.32 d11[0], [r0], r3 @// Load row7 ;src1
+ vqrshrun.s16 d29, q9, #1
+ vst1.32 d28[0], [r2], r5 @//Store dest row4
+ vaddl.u8 q10, d10, d14
+ vst1.32 d29[0], [r2], r5 @//Store dest row5
+ vqrshrun.s16 d30, q10, #1
+ vld1.32 d15[0], [r1], r4 @// Load row7 ;src2
+ vaddl.u8 q11, d11, d15
+ vst1.32 d30[0], [r2], r5 @//Store dest row6
+ vqrshrun.s16 d31, q11, #1
+ vst1.32 d31[0], [r2], r5 @//Store dest row7
+
+end_func: @ common epilogue for all width paths
+
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack and return (pop into pc)
+
+
diff --git a/common/arm/ih264_inter_pred_luma_copy_a9q.s b/common/arm/ih264_inter_pred_luma_copy_a9q.s
new file mode 100755
index 0000000..8ba2fbf
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_copy_a9q.s
@@ -0,0 +1,253 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Interprediction luma function for copy
+@*
+@* @par Description:
+@* Copies the array of width 'wd' and height 'ht' from the location pointed
+@* by 'src' to the location pointed by 'dst'
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ih264_inter_pred_luma_copy (
+@ UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd )
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r7 => ht
+@ r12 => wd
+
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_copy_a9q
+
+ih264_inter_pred_luma_copy_a9q: @ plain wd x ht byte copy, dispatched by widest multiple of wd
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ vstmdb sp!, {d8-d15} @push neon registers to stack (AAPCS callee-saved d8-d15)
+ ldr r12, [sp, #108] @Loads wd (40B core regs + 64B neon regs pushed above)
+ ldr r7, [sp, #104] @Loads ht
+ cmp r7, #0 @checks ht == 0
+ ble end_loops
+ tst r12, #15 @wd a multiple of 16? take the 16-wide loop
+ beq core_loop_wd_16
+ tst r12, #7 @wd a multiple of 8? take the 8-wide loop
+ beq core_loop_wd_8
+ sub r11, r12, #4 @r11 = wd - 4: outer-loop pointer rewind amount
+
+outer_loop_wd_4: @ copy a strip of 4 rows, 4 pixels at a time across the row
+ subs r4, r12, #0 @checks wd == 0
+ ble end_inner_loop_wd_4
+
+inner_loop_wd_4:
+ vld1.32 {d0[0]}, [r0] @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ add r5, r0, r2 @pu1_src_tmp += src_strd (r5/r6 walk down the column)
+ add r6, r1, r3 @pu1_dst_tmp += dst_strd
+ vst1.32 {d0[0]}, [r1] @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ vld1.32 {d0[0]}, [r5], r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ add r0, r0, #4 @pu1_src += 4
+ vst1.32 {d0[0]}, [r6], r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ vld1.32 {d0[0]}, [r5], r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ subs r4, r4, #4 @(wd -4)
+ vst1.32 {d0[0]}, [r6], r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ vld1.32 {d0[0]}, [r5], r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ add r1, r1, #4 @pu1_dst += 4
+ vst1.32 {d0[0]}, [r6], r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4:
+ subs r7, r7, #4 @ht - 4
+ sub r0, r5, r11 @pu1_src = start of next 4-row strip (rewind by wd - 4)
+ sub r1, r6, r11 @pu1_dst = start of next 4-row strip
+ bgt outer_loop_wd_4
+
+end_loops:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from SP and return (pop into pc)
+
+
+
+core_loop_wd_8: @ copy in 8-pixel columns, 4 rows per outer iteration
+ sub r11, r12, #8 @r11 = wd - 8: outer-loop pointer rewind amount
+
+outer_loop_wd_8:
+ subs r4, r12, #0 @checks wd
+ ble end_inner_loop_wd_8
+
+inner_loop_wd_8:
+ add r5, r0, r2 @pu1_src_tmp += src_strd
+ vld1.8 {d0}, [r0]! @vld1_u8(pu1_src_tmp)
+ add r6, r1, r3 @pu1_dst_tmp += dst_strd
+ vst1.8 {d0}, [r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {d1}, [r5], r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {d1}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ subs r4, r4, #8 @wd - 8 (loop condition)
+ vld1.8 {d2}, [r5], r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {d2}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {d3}, [r5], r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {d3}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ bgt inner_loop_wd_8
+
+end_inner_loop_wd_8:
+ subs r7, r7, #4 @ht -= 4
+ sub r0, r5, r11 @pu1_src = start of next 4-row strip (rewind by wd - 8)
+ sub r1, r6, r11 @pu1_dst = start of next 4-row strip
+ bgt outer_loop_wd_8
+
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from SP and return (pop into pc)
+
+core_loop_wd_16: @ copy in 16-pixel columns, 4 rows per outer iteration
+ sub r11, r12, #16 @r11 = wd - 16: outer-loop pointer rewind amount
+
+outer_loop_wd_16:
+ subs r4, r12, #0 @checks wd
+ ble end_inner_loop_wd_16
+
+inner_loop_wd_16:
+ add r5, r0, r2 @pu1_src_tmp += src_strd
+ vld1.8 {q0}, [r0]! @vld1_u8(pu1_src_tmp)
+ add r6, r1, r3 @pu1_dst_tmp += dst_strd
+ vst1.8 {q0}, [r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {q1}, [r5], r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {q1}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ subs r4, r4, #16 @wd - 16 (loop condition)
+ vld1.8 {q2}, [r5], r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {q2}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {q3}, [r5], r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {q3}, [r6], r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ bgt inner_loop_wd_16
+
+end_inner_loop_wd_16:
+ subs r7, r7, #4 @ht -= 4
+ sub r0, r5, r11 @pu1_src = start of next 4-row strip (rewind by wd - 16)
+ sub r1, r6, r11 @pu1_dst = start of next 4-row strip
+ bgt outer_loop_wd_16
+
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from SP and return (pop into pc)
+
+
+@ /*
+@ ********************************************************************************
+@ *
+@ * @brief This function copies a 4x4 block to destination
+@ *
+@ * @par Description:
+@ * Copies a 4x4 block to destination, where both src and dst are interleaved
+@ *
+@ * @param[in] pi2_src
+@ * Source
+@ *
+@ * @param[in] pu1_out
+@ * Output pointer
+@ *
+@ * @param[in] pred_strd,
+@ * Prediction buffer stride
+@ *
+@ * @param[in] out_strd
+@ * Output buffer stride
+@ *
+@ * @returns none
+@ *
+@ * @remarks none
+@ * Currently wd and ht are not used, i.e. a 4x4 block is always copied
+@ *
+@ *******************************************************************************
+@ */
+@ void ih264_interleave_copy(WORD16 *pi2_src,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd
+@ WORD32 wd
+@ WORD32 ht)
+@ Register Usage
+@ r0 : pi2_src
+@ r1 : pu1_out
+@ r2 : src_strd
+@ r3 : out_strd
+@ Neon registers d0-d7, d16-d30 are used
+@ No need for pushing arm and neon registers
+
+ .global ih264_interleave_copy_a9
+ih264_interleave_copy_a9: @ insert a 4x4 block of src bytes into alternate bytes of dst
+
+ vld1.u8 d2, [r0], r2 @load src plane 1 => d2 & pred plane 2 => d3
+ vld1.u8 d3, [r0], r2
+ vld1.u8 d4, [r0], r2
+ vld1.u8 d5, [r0], r2
+
+ mov r0, r1 @ remember dst start for the stores below
+
+ vld1.u8 d18, [r1], r3 @load out [8 bit size) -8 coeffs
+ vld1.u8 d19, [r1], r3
+ vmov.u16 q15, #0x00ff @ per-halfword mask: select low byte of each 16-bit lane
+ vld1.u8 d20, [r1], r3
+ vld1.u8 d21, [r1], r3
+
+ vbit.u8 q9, q1, q15 @ insert src bytes where mask bits are set; other bytes keep out values
+ vbit.u8 q10, q2, q15
+
+ vst1.u8 d18, [r0], r3 @store out
+ vst1.u8 d19, [r0], r3
+ vst1.u8 d20, [r0], r3
+ vst1.u8 d21, [r0], r3
+
+ bx lr
+
+
+
diff --git a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
new file mode 100755
index 0000000..43321a8
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
@@ -0,0 +1,441 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Mohit
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@
+
+@/**
+@/**
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* This function implements a two stage cascaded six tap filter. It
+@* applies the six tap filter in the vertical direction on the
+@* predictor values, followed by applying the same filter in the
+@* horizontal direction on the output of the first stage. The six tap
+@* filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
+@* interpolation process"
+@*
+@* @par Description:
+@* This function is called to obtain pixels lying at the following
+@* location (1/2,1/2). The function interpolates
+@* the predictors first in the horizontal direction and then in the
+@* vertical direction to output the (1/2,1/2).
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pu1_tmp: temporary buffer
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations: UNUSED in this function.
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/;
+
+@void ih264_inter_pred_luma_horz_hpel_vert_hpel(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd,
+@ UWORD8* pu1_tmp,
+@ UWORD32 dydx)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r8 => ht
+@ r9 => wd
+
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q
+
+ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r8, [sp, #104] @ loads ht
+ sub r0, r0, r2, lsl #1 @pu1_src-2*src_strd
+ sub r0, r0, #2 @pu1_src-2
+ ldr r9, [sp, #108] @ loads wd
+
+ vmov.s16 d0, #20 @ Filter coeff 20
+ vmov.s16 d1, #5 @ Filter coeff 5
+ subs r12, r9, #4 @if wd=4 branch to loop_4
+ beq loop_4
+ subs r12, r9, #8 @if wd=8 branch to loop_8
+ beq loop_8
+
+ mov r10, #8
+ sub r7, r3, r10
+ @when wd=16
+
+loop_16:
+ vld1.u32 {d2, d3, d4}, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {d5, d6, d7}, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {d8, d9, d10}, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {d11, d12, d13}, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {d14, d15, d16}, [r0], r2 @ Vector load from src[4_0]
+ vld1.u32 {d17, d18, d19}, [r0], r2 @ Vector load from src[5_0]
+
+ @ vERTICAL FILTERING FOR ROW 0
+ vaddl.u8 q10, d8, d11 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q12, d2, d17 @ temp2 = src[0_0] + src[5_0]
+ vaddl.u8 q11, d5, d14 @ temp = src[1_0] + src[4_0]
+ vaddl.u8 q13, d3, d18 @ temp2 = src[0_0] + src[5_0]
+ vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20
+ vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q10, d6, d15 @ temp = src[1_0] + src[4_0]
+ vaddl.u8 q11, d9, d12 @ temp3 = src[2_0] + src[3_0]
+ vaddl.u8 q14, d4, d19 @ temp2 = src[0_0] + src[5_0]
+ vmla.u16 q13, q11, d0[0] @ temp4 += temp3 * 20
+ vmls.s16 q13, q10, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q11, d10, d13 @ temp3 = src[2_0] + src[3_0]
+ vaddl.u8 q10, d7, d16 @ temp = src[1_0] + src[4_0]
+ vmla.u16 q14, q11, d0[0] @ temp4 += temp3 * 20
+ vmls.s16 q14, q10, d1[0] @ temp -= temp2 * 5
+ vext.16 q10, q12, q13, #5 @//extract a[5] (column1)
+
+ @Q12,Q13,Q14 HAVE VERTICAL FILTERED VALUES
+ @CASCADED FILTERING FOR ROW 0
+ vext.16 q11, q12, q13, #2 @//extract a[2] (column1)
+ vaddl.s16 q1, d20, d24 @// a0 + a5 (column1)
+ vaddl.s16 q15, d21, d25 @// a0 + a5 (column1)
+ vmlal.s16 q1, d22, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1)
+ vext.16 q11, q12, q13, #1 @//extract a[1] (column1)
+ vext.16 q10, q12, q13, #3 @//extract a[3] (column1)
+ vmlsl.s16 q1, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlal.s16 q1, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vext.16 q11, q12, q13, #4 @//extract a[4] (column1)
+ vext.16 q10, q13, q14, #5 @//extract a[5] (column2)
+ vmlsl.s16 q1, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vqrshrun.s32 d22, q1, #10
+ vqrshrun.s32 d23, q15, #10
+ vqshrun.s16 d22, q11, #0
+ vst1.u8 {d22}, [r1], r10 @//Store dest row0, column 1; (1/2,1/2)
+ vext.16 q11, q13, q14, #2 @//extract a[2] (column2)
+ vaddl.s16 q1, d20, d26 @// a0 + a5 (column2)
+ vaddl.s16 q15, d21, d27 @// a0 + a5 (column2)
+ vmlal.s16 q1, d22, d0[0] @// a0 + a5 + 20a2 (column2)
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column2)
+ vext.16 q10, q13, q14, #3 @//extract a[3] (column2)
+ vext.16 q11, q13, q14, #1 @//extract a[1] (column2)
+ vmlal.s16 q1, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column2)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column2)
+ vext.16 q10, q13, q14, #4 @//extract a[4] (column2)
+ vmlsl.s16 q1, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2)
+ vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2)
+ vmlsl.s16 q1, d20, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2)
+ vmlsl.s16 q15, d21, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2)
+ vqrshrun.s32 d20, q1, #10
+ vqrshrun.s32 d21, q15, #10
+ vld1.u32 {d2, d3, d4}, [r0], r2 @ Vector load from src[6_0]
+ vqshrun.s16 d22, q10, #0
+ vst1.u8 {d22}, [r1], r7 @//Store dest row0 ,column 2; (1/2,1/2)
+
+ @ vERTICAL FILTERING FOR ROW 1
+ vaddl.u8 q10, d11, d14 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q12, d5, d2 @ temp2 = src[0_0] + src[5_0]
+ vaddl.u8 q11, d8, d17 @ temp = src[1_0] + src[4_0]
+ vaddl.u8 q13, d6, d3 @ temp2 = src[0_0] + src[5_0]
+ vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20
+ vaddl.u8 q10, d9, d18 @ temp = src[1_0] + src[4_0]
+ vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q11, d12, d15 @ temp3 = src[2_0] + src[3_0]
+ vaddl.u8 q14, d7, d4 @ temp2 = src[0_0] + src[5_0]
+ vmla.u16 q13, q11, d0[0] @ temp4 += temp3 * 20
+ vaddl.u8 q11, d13, d16 @ temp3 = src[2_0] + src[3_0]
+ vmls.s16 q13, q10, d1[0] @ temp -= temp2 * 5
+ vmla.u16 q14, q11, d0[0] @ temp4 += temp3 * 20
+ vaddl.u8 q10, d10, d19 @ temp = src[1_0] + src[4_0]
+ vmls.s16 q14, q10, d1[0] @ temp -= temp2 * 5
+ vext.16 q10, q12, q13, #5 @//extract a[5] (column1)
+
+ @Q12,Q13,Q14 HAVE VERTICAL FILTERED VALUES
+ @CASCADED FILTERING FOR ROW 1
+ vext.16 q11, q12, q13, #2 @//extract a[2] (column1)
+ vaddl.s16 q3, d20, d24 @// a0 + a5 (column1)
+ vaddl.s16 q15, d21, d25 @// a0 + a5 (column1)
+ vmlal.s16 q3, d22, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1)
+ vext.16 q11, q12, q13, #1 @//extract a[1] (column1)
+ vext.16 q10, q12, q13, #3 @//extract a[3] (column1)
+ vmlsl.s16 q3, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlal.s16 q3, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vext.16 q11, q12, q13, #4 @//extract a[4] (column1)
+ vext.16 q10, q13, q14, #5 @//extract a[5] (column2)
+ vmlsl.s16 q3, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vqrshrun.s32 d22, q3, #10
+ vqrshrun.s32 d23, q15, #10
+ vqshrun.s16 d22, q11, #0
+ vst1.u8 {d22}, [r1], r10 @//Store dest row1, column 1; (1/2,1/2)
+ vext.16 q11, q13, q14, #2 @//extract a[2] (column2)
+ vaddl.s16 q3, d20, d26 @// a0 + a5 (column2)
+ vaddl.s16 q15, d21, d27 @// a0 + a5 (column2)
+ vmlal.s16 q3, d22, d0[0] @// a0 + a5 + 20a2 (column2)
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column2)
+ vext.16 q10, q13, q14, #3 @//extract a[3] (column2)
+ vext.16 q11, q13, q14, #1 @//extract a[1] (column2)
+ vmlal.s16 q3, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column2)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column2)
+ vext.16 q10, q13, q14, #4 @//extract a[4] (column2)
+ vmlsl.s16 q3, d22, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2)
+ vmlsl.s16 q15, d23, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2)
+ vmlsl.s16 q3, d20, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2)
+ vmlsl.s16 q15, d21, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2)
+ vqrshrun.s32 d20, q3, #10
+ vqrshrun.s32 d21, q15, #10
+ vqshrun.s16 d22, q10, #0
+ vst1.u8 {d22}, [r1], r7 @//Store dest row1 ,column 2; (1/2,1/2)
+
+ subs r8, r8, #2 @ 2 rows processed, decrement by 2
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+ beq end_func @ Branch if height==4
+
+ b loop_16 @ looping if height = 8 or 16
+
+loop_8:
+ vld1.u32 {d2, d3}, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {d4, d5}, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {d6, d7}, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {d8, d9}, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {d10, d11}, [r0], r2 @ Vector load from src[4_0]
+ vld1.u32 {d12, d13}, [r0], r2 @ Vector load from src[5_0]
+
+ @ vERTICAL FILTERING FOR ROW 0
+ vaddl.u8 q10, d6, d8 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q11, d4, d10 @ temp2 = src[1_0] + src4_0]
+ vaddl.u8 q12, d2, d12 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q13, d3, d13 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q14, d7, d9 @ temp1 = src[2_0] + src[3_0]
+ vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20
+ vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q15, d5, d11 @ temp2 = src[1_0] + src4_0]
+ vmla.u16 q13, q14, d0[0] @ temp += temp1 * 20
+ vmls.s16 q13, q15, d1[0] @ temp -= temp2 * 5
+ @Q12,Q13 HAVE VERTICAL FILTERED VALUES
+ @CASCADED FILTERING FOR ROW 0
+
+ vext.16 q10, q12, q13, #5 @//extract a[5] (column1)
+ vext.16 q11, q12, q13, #2 @//extract a[2] (column1)
+ vaddl.s16 q14, d20, d24 @// a0 + a5 (column1)
+ vaddl.s16 q15, d21, d25 @// a0 + a5 (column1)
+ vext.16 q9, q12, q13, #1 @//extract a[1] (column1)
+ vext.16 q10, q12, q13, #3 @//extract a[3] (column1)
+ vext.16 q1, q12, q13, #4 @//extract a[4] (column1)
+ vmlal.s16 q14, d22, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlsl.s16 q14, d18, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlal.s16 q14, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q14, d2, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vld1.u32 {d14, d15}, [r0], r2 @ Vector load from src[6_0]
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q15, d19, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlsl.s16 q15, d3, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+
+ vaddl.u8 q12, d4, d14 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q13, d5, d15 @ temp = src[0_0] + src[5_0]
+ vqrshrun.s32 d18, q14, #10
+ vaddl.u8 q14, d9, d11 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q10, d8, d10 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q11, d6, d12 @ temp2 = src[1_0] + src4_0]
+ vqrshrun.s32 d19, q15, #10
+ vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20
+ vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q15, d7, d13 @ temp2 = src[1_0] + src4_0]
+ vmla.u16 q13, q14, d0[0] @ temp += temp1 * 20
+ vmls.s16 q13, q15, d1[0] @ temp -= temp2 * 5
+ vqshrun.s16 d2, q9, #0
+ @ vERTICAL FILTERING FOR ROW 1
+
+ @Q12,Q13 HAVE VERTICAL FILTERED VALUES
+ @CASCADED FILTERING FOR ROW 1
+ vext.16 q10, q12, q13, #5 @//extract a[5] (column1)
+ vext.16 q11, q12, q13, #2 @//extract a[2] (column1)
+ vaddl.s16 q14, d20, d24 @// a0 + a5 (column1)
+ vaddl.s16 q15, d21, d25 @// a0 + a5 (column1)
+ vst1.u8 {d2}, [r1], r3 @//Store dest row0, column 1; (1/2,1/2)
+ vext.16 q9, q12, q13, #1 @//extract a[1] (column1)
+ vext.16 q10, q12, q13, #3 @//extract a[3] (column1)
+ vext.16 q2, q12, q13, #4 @//extract a[4] (column1)
+ vmlal.s16 q14, d22, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlsl.s16 q14, d18, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlal.s16 q14, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q14, d4, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q15, d19, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlsl.s16 q15, d5, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vqrshrun.s32 d18, q14, #10
+ vqrshrun.s32 d19, q15, #10
+ vqshrun.s16 d3, q9, #0
+ vst1.u8 {d3}, [r1], r3 @//Store dest row1, column 1; (1/2,1/2)
+
+ subs r8, r8, #2 @ 2 rows processed, decrement by 2
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+ beq end_func @ Branch if height==4
+
+ b loop_8 @looping if height == 8 or 16
+
+loop_4:
+ vld1.u32 {d2, d3}, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {d4, d5}, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {d6, d7}, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {d8, d9}, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {d10, d11}, [r0], r2 @ Vector load from src[4_0]
+ vld1.u32 {d12, d13}, [r0], r2 @ Vector load from src[5_0]
+
+ @ vERTICAL FILTERING FOR ROW 0
+ vaddl.u8 q10, d6, d8 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q11, d4, d10 @ temp2 = src[1_0] + src4_0]
+ vaddl.u8 q12, d2, d12 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q13, d3, d13 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q14, d7, d9 @ temp1 = src[2_0] + src[3_0]
+ vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20
+ vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q15, d5, d11 @ temp2 = src[1_0] + src4_0]
+ vmla.u16 q13, q14, d0[0] @ temp += temp1 * 20
+ vmls.s16 q13, q15, d1[0] @ temp -= temp2 * 5
+ @Q12,Q13 HAVE VERTICAL FILTERED VALUES
+ @CASCADED FILTERING FOR ROW 0
+
+ vext.16 q10, q12, q13, #5 @//extract a[5] (column1)
+ vext.16 q11, q12, q13, #2 @//extract a[2] (column1)
+ vaddl.s16 q14, d20, d24 @// a0 + a5 (column1)
+ vaddl.s16 q15, d21, d25 @// a0 + a5 (column1)
+
+ vext.16 q1, q12, q13, #4 @//extract a[4] (column1)
+ vext.16 q9, q12, q13, #1 @//extract a[1] (column1)
+ vext.16 q10, q12, q13, #3 @//extract a[3] (column1)
+
+ vmlal.s16 q14, d22, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlsl.s16 q14, d18, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlal.s16 q14, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q14, d2, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vld1.u32 {d14, d15}, [r0], r2 @ Vector load from src[6_0]
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q15, d19, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlsl.s16 q15, d3, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vaddl.u8 q12, d4, d14 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q13, d5, d15 @ temp = src[0_0] + src[5_0]
+ vqrshrun.s32 d18, q14, #10
+ vaddl.u8 q14, d9, d11 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q11, d6, d12 @ temp2 = src[1_0] + src4_0]
+ vaddl.u8 q10, d8, d10 @ temp1 = src[2_0] + src[3_0]
+ vqrshrun.s32 d19, q15, #10
+ vmla.u16 q12, q10, d0[0] @ temp += temp1 * 20
+ vmls.s16 q12, q11, d1[0] @ temp -= temp2 * 5
+ vaddl.u8 q15, d7, d13 @ temp2 = src[1_0] + src4_0]
+ vqshrun.s16 d2, q9, #0
+ vmla.u16 q13, q14, d0[0] @ temp += temp1 * 20
+ vmls.s16 q13, q15, d1[0] @ temp -= temp2 * 5
+
+ @ vERTICAL FILTERING FOR ROW 1
+
+ @Q12,Q13 HAVE VERTICAL FILTERED VALUES
+ @CASCADED FILTERING FOR ROW 1
+ vext.16 q10, q12, q13, #5 @//extract a[5] (column1)
+ vext.16 q11, q12, q13, #2 @//extract a[2] (column1)
+ vst1.u32 {d2[0]}, [r1], r3 @//Store dest row0, column 1; (1/2,1/2)
+ vaddl.s16 q14, d20, d24 @// a0 + a5 (column1)
+ vaddl.s16 q15, d21, d25 @// a0 + a5 (column1)
+ vext.16 q9, q12, q13, #1 @//extract a[1] (column1)
+ vext.16 q10, q12, q13, #3 @//extract a[3] (column1)
+ vext.16 q2, q12, q13, #4 @//extract a[4] (column1)
+ vmlal.s16 q14, d22, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlsl.s16 q14, d18, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlal.s16 q14, d20, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q14, d4, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vmlal.s16 q15, d23, d0[0] @// a0 + a5 + 20a2 (column1)
+ vmlal.s16 q15, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 (column1)
+ vmlsl.s16 q15, d19, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1)
+ vmlsl.s16 q15, d5, d1[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1)
+ vqrshrun.s32 d18, q14, #10
+ vqrshrun.s32 d19, q15, #10
+ vqshrun.s16 d4, q9, #0
+ vst1.u32 {d4[0]}, [r1], r3 @//Store dest row1, column 1; (1/2,1/2)
+
+ subs r8, r8, #2 @ 2 rows processed, decrement by 2
+ subne r0, r0 , r2, lsl #2
+ subne r0, r0, r2
+ beq end_func @ Branch if height==4
+
+ b loop_4 @looping if height == 8 or 16
+
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s
new file mode 100755
index 0000000..65a6de7
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s
@@ -0,0 +1,1044 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Mohit
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@
+
+@/**
+@/**
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* This function implements a two stage cascaded six tap filter. It
+@* applies the six tap filter in the horizontal direction on the
+@* predictor values, followed by applying the same filter in the
+@* vertical direction on the output of the first stage. It then averages
+@* the output of the 1st stage and the output of the 2nd stage to obtain
+@* the quarter pel values. The six tap filtering operation is described
+@* in sec 8.4.2.2.1 titled "Luma sample interpolation process".
+@*
+@* @par Description:
+@* This function is called to obtain pixels lying at the following
+@* location (1/2,1/4) or (1/2,3/4). The function interpolates
+@* the predictors first in the horizontal direction and then in the
+@* vertical direction to output the (1/2,1/2). It then averages
+@* the output of the 2nd stage and (1/2,1/2) value to obtain (1/2,1/4)
+@* or (1/2,3/4) depending on the offset.
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pu1_tmp: temporary buffer
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/;
+
+@void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@                                       WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd,
+@ UWORD8* pu1_tmp,
+@ UWORD32 dydx)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ht
+@ r5 => wd
+@ r7 => dydx
+@ r9 => *pu1_tmp
+
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q
+
+ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q:
+
+    stmfd sp!, {r4-r12, r14}    @ store register values to stack
+    vstmdb sp!, {d8-d15}        @push neon registers to stack
+    ldr r4, [sp, #104]          @ loads ht
+    sub r0, r0, r2, lsl #1      @ pu1_src-2*src_strd
+    sub r0, r0, #2              @ pu1_src-2
+    ldr r5, [sp, #108]          @ loads wd
+    ldr r7, [sp, #116]          @ loads dydx
+    lsr r7, r7, #3              @ dydx >> 3, i.e. ((dydx >> 2) & 0x3) >> 1, to obtain the deciding bit
+    ldr r9, [sp, #112]          @ pu1_tmp
+    add r7, r7, #2
+    mov r6, #48                 @ temp buffer row stride (bytes)
+    mla r7, r7, r6, r9          @ r7 = pu1_tmp + (deciding bit + 2)*48: temp row later averaged with the vertical output
+
+    subs r12, r5, #4            @if wd=4 branch to loop_4
+    beq loop_4_start
+
+    subs r12, r5, #8            @if wd=8 branch to loop_8
+    beq loop_8_start
+
+    @when wd=16
+    vmov.u16 q11, #20           @ Filter coeff 0x14 into Q11
+    vmov.u16 q12, #5            @ Filter coeff 0x5 into Q12
+    add r8, r0, #8              @ src pointer for the right (high) half
+    add r14, r1, #8             @ dst pointer for the right (high) half
+    add r10, r9, #8             @ temp-buffer store pointer for the right half
+    mov r12, r4                 @ copy of ht used as the right-half loop counter
+    add r11, r7, #8             @ temp-buffer load (averaging) pointer for the right half
+
+loop_16_lowhalf_start:          @ prologue: horizontal filter of rows -2..2, left 8 columns
+    vld1.32 {q0}, [r0], r2      @ row -2 load for horizontal filter
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q3, d0, d5
+
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q4, d2, d3
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q3, q4, q11
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q4, d1, d4
+    vld1.32 {q0}, [r0], r2      @ row -1 load for horizontal filter
+    vmls.u16 q3, q4, q12
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q4, d0, d5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q5, d2, d3
+
+    vst1.32 {q3}, [r9], r6      @ store temp buffer 0
+
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q4, q5, q11
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q5, d1, d4
+    vld1.32 {q0}, [r0], r2      @ row 0 load for horizontal filter
+    vmls.u16 q4, q5, q12
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q5, d0, d5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q6, d2, d3
+
+    vst1.32 {q4}, [r9], r6      @ store temp buffer 1
+
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q5, q6, q11
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q6, d1, d4
+    vld1.32 {q0}, [r0], r2      @ row 1 load for horizontal filter
+    vmls.u16 q5, q6, q12
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q6, d0, d5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q7, d2, d3
+
+    vst1.32 {q5}, [r9], r6      @ store temp buffer 2
+
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q6, q7, q11
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q7, d1, d4
+    vld1.32 {q0}, [r0], r2      @ row 2 load for horizontal filter
+    vmls.u16 q6, q7, q12
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q7, d0, d5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q8, d2, d3
+
+    vst1.32 {q6}, [r9], r6      @ store temp buffer 3
+
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q7, q8, q11
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q8, d1, d4
+
+    vmls.u16 q7, q8, q12
+loop_16_lowhalf:                @ steady state: 4 output rows per iteration, left 8 columns
+
+    vld1.32 {q0}, [r0], r2      @ row 3 load for horizontal filter
+    vext.8 d5, d0, d1, #5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q8, d0, d5
+
+    vst1.32 {q7}, [r9], r6      @ store temp buffer 4
+    vaddl.u8 q9, d2, d3
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q8, q9, q11
+    vext.8 d1, d0, d1, #1
+    vadd.s16 q14, q4, q7
+    vaddl.u8 q9, d1, d4
+    vadd.s16 q15, q5, q6
+    vmls.u16 q8, q9, q12
+    vld1.32 {q0}, [r0], r2      @ row 4 load for horizontal filter
+    vext.8 d5, d0, d1, #5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q10, d0, d5
+
+    vst1.32 {q8}, [r9], r6      @ store temp buffer r5
+
+    vaddl.s16 q9, d6, d16
+
+    vld1.32 {q13}, [r7], r6     @ load from temp buffer 0
+
+    vaddl.s16 q3, d7, d17
+
+    vqrshrun.s16 d26, q13, #5
+
+    vmlal.s16 q9, d30, d22
+    vmlsl.s16 q9, d28, d24
+    vmlal.s16 q3, d31, d22
+    vmlsl.s16 q3, d29, d24
+    vaddl.u8 q1, d2, d3
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q10, q1, q11
+    vqrshrun.s32 d18, q9, #10
+    vext.8 d1, d0, d1, #1
+    vqrshrun.s32 d19, q3, #10
+    vadd.s16 q14, q5, q8
+    vaddl.u8 q1, d1, d4
+    vadd.s16 q15, q6, q7
+    vmls.u16 q10, q1, q12
+    vqmovn.u16 d18, q9
+    vld1.32 {q0}, [r0], r2      @ row 5 load for horizontal filter
+
+    vrhadd.u8 d26, d18, d26     @ average half-pel temp row with vertical output -> quarter-pel
+
+    vext.8 d5, d0, d1, #5
+    vext.8 d2, d0, d1, #2
+
+    vst1.32 {q10}, [r9], r6     @ store temp buffer r6
+
+    vaddl.s16 q9, d8, d20
+
+    vaddl.s16 q3, d9, d21
+
+    vld1.32 {q4}, [r7], r6      @load from temp buffer 1
+
+
+    vst1.32 d26, [r1], r3       @ store row 0
+
+    vmlal.s16 q9, d30, d22
+    vmlsl.s16 q9, d28, d24
+
+    vqrshrun.s16 d28, q4, #5
+
+    vmlal.s16 q3, d31, d22
+    vmlsl.s16 q3, d29, d24
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q4, d0, d5
+    vaddl.u8 q1, d2, d3
+    vqrshrun.s32 d18, q9, #10
+    vext.8 d4, d0, d1, #4
+    vqrshrun.s32 d19, q3, #10
+    vmla.u16 q4, q1, q11
+    vext.8 d1, d0, d1, #1
+    vadd.s16 q13, q6, q10
+    vaddl.u8 q1, d1, d4
+    vqmovn.u16 d18, q9
+    vadd.s16 q15, q7, q8
+    vmls.u16 q4, q1, q12
+    vld1.32 {q0}, [r0], r2      @ row 6 load for horizontal filter
+
+    vrhadd.u8 d28, d28, d18
+
+    vext.8 d5, d0, d1, #5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+
+    vst1.32 d28, [r1], r3       @ store row 1
+
+    vaddl.u8 q14, d0, d5
+
+    vst1.32 {q4}, [r9], r6      @ store temp buffer r7
+
+    vaddl.s16 q9, d10, d8
+    vaddl.s16 q3, d11, d9
+
+    vld1.32 {q5}, [r7], r6      @ load from temp buffer 2
+
+    vmlal.s16 q9, d30, d22
+    vmlsl.s16 q9, d26, d24
+    vmlal.s16 q3, d31, d22
+
+    vqrshrun.s16 d26, q5, #5
+
+    vmlsl.s16 q3, d27, d24
+    vaddl.u8 q1, d2, d3
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q14, q1, q11
+    vqrshrun.s32 d18, q9, #10
+    vext.8 d1, d0, d1, #1
+    vqrshrun.s32 d19, q3, #10
+    vadd.s16 q5, q7, q4
+    vaddl.u8 q1, d1, d4
+    vadd.s16 q15, q8, q10
+    vmls.u16 q14, q1, q12
+    vqmovn.u16 d27, q9
+
+    vaddl.s16 q9, d12, d28
+    vaddl.s16 q3, d13, d29
+
+    vrhadd.u8 d26, d26, d27
+
+    vmlal.s16 q9, d30, d22
+    vmlsl.s16 q9, d10, d24
+    vmlal.s16 q3, d31, d22
+    vmlsl.s16 q3, d11, d24
+
+    vst1.32 d26, [r1], r3       @ store row 2
+
+    vst1.32 {q14}, [r9]
+
+
+    vqrshrun.s32 d18, q9, #10
+    vmov q5, q10                @ rotate the software-pipelined row registers for the next iteration
+    vld1.32 {q15}, [r7], r6     @ load from temp buffer 3
+
+    vqrshrun.s32 d19, q3, #10
+    subs r4, r4, #4
+
+    vqrshrun.s16 d30, q15, #5
+
+    vqmovn.u16 d18, q9
+    vmov q6, q4
+    vmov q3, q7
+    vrhadd.u8 d30, d18, d30
+    vmov q4, q8
+    vmov q7, q14
+    vst1.32 d30, [r1], r3       @ store row 3
+
+    bgt loop_16_lowhalf         @ looping if height = 16
+
+
+loop_16_highhalf_start:         @ prologue: horizontal filter of rows -2..2, right 8 columns
+    vld1.32 {q0}, [r8], r2
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q3, d0, d5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q4, d2, d3
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q3, q4, q11
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q4, d1, d4
+    vld1.32 {q0}, [r8], r2
+    vmls.u16 q3, q4, q12
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q4, d0, d5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q5, d2, d3
+
+    vst1.32 {q3}, [r10], r6
+
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q4, q5, q11
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q5, d1, d4
+    vld1.32 {q0}, [r8], r2
+    vmls.u16 q4, q5, q12
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q5, d0, d5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q6, d2, d3
+
+    vst1.32 {q4}, [r10], r6
+
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q5, q6, q11
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q6, d1, d4
+    vld1.32 {q0}, [r8], r2
+    vmls.u16 q5, q6, q12
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q6, d0, d5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q7, d2, d3
+
+    vst1.32 {q5}, [r10], r6
+
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q6, q7, q11
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q7, d1, d4
+    vld1.32 {q0}, [r8], r2
+    vmls.u16 q6, q7, q12
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q7, d0, d5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q8, d2, d3
+
+    vst1.32 {q6}, [r10], r6
+
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q7, q8, q11
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q8, d1, d4
+
+    vmls.u16 q7, q8, q12
+
+loop_16_highhalf:               @ steady state: 4 output rows per iteration, right 8 columns
+
+    vld1.32 {q0}, [r8], r2
+    vext.8 d5, d0, d1, #5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q8, d0, d5
+
+    vst1.32 {q7}, [r10], r6
+
+    vaddl.u8 q9, d2, d3
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q8, q9, q11
+    vext.8 d1, d0, d1, #1
+    vadd.s16 q14, q4, q7
+    vaddl.u8 q9, d1, d4
+    vadd.s16 q15, q5, q6
+    vmls.u16 q8, q9, q12
+    vld1.32 {q0}, [r8], r2
+    vext.8 d5, d0, d1, #5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q10, d0, d5
+
+    vst1.32 {q8}, [r10], r6
+
+    vaddl.s16 q9, d6, d16
+
+    vld1.32 {q13}, [r11], r6
+
+    vaddl.s16 q3, d7, d17
+
+    vqrshrun.s16 d26, q13, #5
+
+    vmlal.s16 q9, d30, d22
+    vmlsl.s16 q9, d28, d24
+    vmlal.s16 q3, d31, d22
+    vmlsl.s16 q3, d29, d24
+    vaddl.u8 q1, d2, d3
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q10, q1, q11
+    vqrshrun.s32 d18, q9, #10
+    vext.8 d1, d0, d1, #1
+    vqrshrun.s32 d19, q3, #10
+    vadd.s16 q14, q5, q8
+    vaddl.u8 q1, d1, d4
+    vadd.s16 q15, q6, q7
+    vmls.u16 q10, q1, q12
+    vqmovn.u16 d18, q9
+    vld1.32 {q0}, [r8], r2
+
+    vrhadd.u8 d26, d18, d26     @ average half-pel temp row with vertical output -> quarter-pel
+
+    vext.8 d5, d0, d1, #5
+    vext.8 d2, d0, d1, #2
+
+    vst1.32 {q10}, [r10], r6
+
+    vaddl.s16 q9, d8, d20
+    vaddl.s16 q3, d9, d21
+
+    vld1.32 {q4}, [r11], r6
+
+
+    vst1.32 d26, [r14], r3      @ store row 0
+
+    vmlal.s16 q9, d30, d22
+    vmlsl.s16 q9, d28, d24
+
+    vqrshrun.s16 d28, q4, #5
+
+    vmlal.s16 q3, d31, d22
+    vmlsl.s16 q3, d29, d24
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q4, d0, d5
+    vaddl.u8 q1, d2, d3
+    vqrshrun.s32 d18, q9, #10
+    vext.8 d4, d0, d1, #4
+    vqrshrun.s32 d19, q3, #10
+    vmla.u16 q4, q1, q11
+    vext.8 d1, d0, d1, #1
+    vadd.s16 q13, q6, q10
+    vaddl.u8 q1, d1, d4
+    vqmovn.u16 d18, q9
+    vadd.s16 q15, q7, q8
+    vmls.u16 q4, q1, q12
+    vld1.32 {q0}, [r8], r2
+
+    vrhadd.u8 d28, d28, d18
+
+    vext.8 d5, d0, d1, #5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+
+    vst1.32 d28, [r14], r3      @store row 1
+
+    vaddl.u8 q14, d0, d5
+
+    vst1.32 {q4}, [r10], r6
+
+    vaddl.s16 q9, d10, d8
+    vaddl.s16 q3, d11, d9
+
+    vld1.32 {q5}, [r11], r6
+
+    vmlal.s16 q9, d30, d22
+    vmlsl.s16 q9, d26, d24
+    vmlal.s16 q3, d31, d22
+
+    vqrshrun.s16 d26, q5, #5
+
+    vmlsl.s16 q3, d27, d24
+    vaddl.u8 q1, d2, d3
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q14, q1, q11
+    vqrshrun.s32 d18, q9, #10
+    vext.8 d1, d0, d1, #1
+    vqrshrun.s32 d19, q3, #10
+    vadd.s16 q5, q7, q4
+    vaddl.u8 q1, d1, d4
+    vadd.s16 q15, q8, q10
+    vmls.u16 q14, q1, q12
+    vqmovn.u16 d27, q9
+
+
+    vaddl.s16 q9, d12, d28
+    vaddl.s16 q3, d13, d29
+
+    vrhadd.u8 d26, d26, d27
+
+    vmlal.s16 q9, d30, d22
+    vmlsl.s16 q9, d10, d24
+    vmlal.s16 q3, d31, d22
+    vmlsl.s16 q3, d11, d24
+
+    vst1.32 d26, [r14], r3      @ store row 2
+
+    vst1.32 {q14}, [r10]
+
+    vqrshrun.s32 d18, q9, #10
+    vmov q5, q10                @ rotate the software-pipelined row registers for the next iteration
+    vld1.32 {q15}, [r11], r6
+
+    vqrshrun.s32 d19, q3, #10
+    subs r12, r12, #4
+
+    vqrshrun.s16 d30, q15, #5
+
+    vqmovn.u16 d18, q9
+    vmov q6, q4
+    vmov q3, q7
+    vrhadd.u8 d30, d18, d30
+    vmov q4, q8
+    vmov q7, q14
+    vst1.32 d30, [r14], r3      @ store row 3
+
+    bgt loop_16_highhalf        @ looping if height = 8 or 16
+    b end_func
+
+loop_8_start:                   @ wd == 8: same cascade as the 16-wide low half
+
+    vmov.u16 q11, #20           @ Filter coeff 20 into Q11
+    vmov.u16 q12, #5            @ Filter coeff 5 into Q12
+    vld1.32 {q0}, [r0], r2      @ row -2 load for horizontal filter
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q3, d0, d5
+
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q4, d2, d3
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q3, q4, q11
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q4, d1, d4
+    vld1.32 {q0}, [r0], r2      @ row -1 load for horizontal filter
+    vmls.u16 q3, q4, q12
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q4, d0, d5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q5, d2, d3
+
+    vst1.32 {q3}, [r9], r6      @ store temp buffer 0
+
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q4, q5, q11
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q5, d1, d4
+    vld1.32 {q0}, [r0], r2      @ row 0 load for horizontal filter
+    vmls.u16 q4, q5, q12
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q5, d0, d5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q6, d2, d3
+
+    vst1.32 {q4}, [r9], r6      @ store temp buffer 1
+
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q5, q6, q11
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q6, d1, d4
+    vld1.32 {q0}, [r0], r2      @ row 1 load for horizontal filter
+    vmls.u16 q5, q6, q12
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q6, d0, d5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q7, d2, d3
+
+    vst1.32 {q5}, [r9], r6      @ store temp buffer 2
+
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q6, q7, q11
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q7, d1, d4
+    vld1.32 {q0}, [r0], r2      @ row 2 load for horizontal filter
+    vmls.u16 q6, q7, q12
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q7, d0, d5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q8, d2, d3
+
+    vst1.32 {q6}, [r9], r6      @ store temp buffer 3
+
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q7, q8, q11
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q8, d1, d4
+
+    vmls.u16 q7, q8, q12
+loop_8:                         @ steady state: 4 output rows per iteration
+
+    vld1.32 {q0}, [r0], r2      @ row 3 load for horizontal filter
+    vext.8 d5, d0, d1, #5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q8, d0, d5
+
+    vst1.32 {q7}, [r9], r6      @ store temp buffer 4
+
+    vaddl.u8 q9, d2, d3
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q8, q9, q11
+    vext.8 d1, d0, d1, #1
+    vadd.s16 q14, q4, q7
+    vaddl.u8 q9, d1, d4
+    vadd.s16 q15, q5, q6
+    vmls.u16 q8, q9, q12
+    vld1.32 {q0}, [r0], r2      @ row 4 load for horizontal filter
+    vext.8 d5, d0, d1, #5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q10, d0, d5
+
+    vst1.32 {q8}, [r9], r6      @ store temp buffer r5
+
+    vaddl.s16 q9, d6, d16
+
+    vld1.32 {q13}, [r7], r6     @ load from temp buffer 0
+
+    vaddl.s16 q3, d7, d17
+
+    vqrshrun.s16 d26, q13, #5
+
+    vmlal.s16 q9, d30, d22
+    vmlsl.s16 q9, d28, d24
+    vmlal.s16 q3, d31, d22
+    vmlsl.s16 q3, d29, d24
+    vaddl.u8 q1, d2, d3
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q10, q1, q11
+    vqrshrun.s32 d18, q9, #10
+    vext.8 d1, d0, d1, #1
+    vqrshrun.s32 d19, q3, #10
+    vadd.s16 q14, q5, q8
+    vaddl.u8 q1, d1, d4
+    vadd.s16 q15, q6, q7
+    vmls.u16 q10, q1, q12
+    vqmovn.u16 d18, q9
+    vld1.32 {q0}, [r0], r2      @ row 5 load for horizontal filter
+
+    vrhadd.u8 d26, d18, d26     @ average half-pel temp row with vertical output -> quarter-pel
+
+    vext.8 d5, d0, d1, #5
+    vext.8 d2, d0, d1, #2
+
+    vst1.32 {q10}, [r9], r6     @ store temp buffer r6
+
+    vaddl.s16 q9, d8, d20
+
+    vaddl.s16 q3, d9, d21
+
+    vld1.32 {q4}, [r7], r6      @load from temp buffer 1
+
+
+    vst1.32 d26, [r1], r3       @ store row 0
+
+    vmlal.s16 q9, d30, d22
+    vmlsl.s16 q9, d28, d24
+
+    vqrshrun.s16 d28, q4, #5
+
+    vmlal.s16 q3, d31, d22
+    vmlsl.s16 q3, d29, d24
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q4, d0, d5
+    vaddl.u8 q1, d2, d3
+    vqrshrun.s32 d18, q9, #10
+    vext.8 d4, d0, d1, #4
+    vqrshrun.s32 d19, q3, #10
+    vmla.u16 q4, q1, q11
+    vext.8 d1, d0, d1, #1
+    vadd.s16 q13, q6, q10
+    vaddl.u8 q1, d1, d4
+    vqmovn.u16 d18, q9
+    vadd.s16 q15, q7, q8
+    vmls.u16 q4, q1, q12
+    vld1.32 {q0}, [r0], r2      @ row 6 load for horizontal filter
+
+    vrhadd.u8 d28, d28, d18
+
+    vext.8 d5, d0, d1, #5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+
+    vst1.32 d28, [r1], r3       @ store row 1
+
+    vaddl.u8 q14, d0, d5
+
+    vst1.32 {q4}, [r9], r6      @ store temp buffer r7
+
+    vaddl.s16 q9, d10, d8
+    vaddl.s16 q3, d11, d9
+
+    vld1.32 {q5}, [r7], r6      @ load from temp buffer 2
+
+    vmlal.s16 q9, d30, d22
+    vmlsl.s16 q9, d26, d24
+    vmlal.s16 q3, d31, d22
+
+    vqrshrun.s16 d26, q5, #5
+
+    vmlsl.s16 q3, d27, d24
+    vaddl.u8 q1, d2, d3
+    vext.8 d4, d0, d1, #4
+    vmla.u16 q14, q1, q11
+    vqrshrun.s32 d18, q9, #10
+    vext.8 d1, d0, d1, #1
+    vqrshrun.s32 d19, q3, #10
+    vadd.s16 q5, q7, q4
+    vaddl.u8 q1, d1, d4
+    vadd.s16 q15, q8, q10
+    vmls.u16 q14, q1, q12
+    vqmovn.u16 d27, q9
+
+    vaddl.s16 q9, d12, d28
+    vaddl.s16 q3, d13, d29
+
+    vrhadd.u8 d26, d26, d27
+
+    vmlal.s16 q9, d30, d22
+    vmlsl.s16 q9, d10, d24
+    vmlal.s16 q3, d31, d22
+    vmlsl.s16 q3, d11, d24
+
+    vst1.32 d26, [r1], r3       @ store row 2
+
+    vst1.32 {q14}, [r9]
+
+
+    vqrshrun.s32 d18, q9, #10
+    vmov q5, q10                @ rotate the software-pipelined row registers for the next iteration
+    vld1.32 {q15}, [r7], r6     @ load from temp buffer 3
+
+    vqrshrun.s32 d19, q3, #10
+    subs r4, r4, #4
+
+    vqrshrun.s16 d30, q15, #5
+
+    vqmovn.u16 d18, q9
+    vmov q6, q4
+    vmov q3, q7
+    vrhadd.u8 d30, d18, d30
+    vmov q4, q8
+    vmov q7, q14
+    vst1.32 d30, [r1], r3       @ store row 3
+
+    bgt loop_8                  @if height = 8 or 16, loop
+    b end_func
+
+loop_4_start:                   @ wd == 4: narrow variant working on D registers
+    vmov.u16 d22, #20           @ Filter coeff 20 into D22
+    vmov.u16 d23, #5            @ Filter coeff 5 into D23
+
+    vld1.32 {q0}, [r0], r2      @row -2 load
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q3, d0, d5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q4, d2, d3
+    vext.8 d4, d0, d1, #4
+    vmla.u16 d6, d8, d22
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q4, d1, d4
+    vld1.32 {q0}, [r0], r2      @ row -1 load
+    vmls.u16 d6, d8, d23
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q4, d0, d5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q5, d2, d3
+
+    vst1.32 d6, [r9], r6        @ store temp buffer 0
+
+    vext.8 d4, d0, d1, #4
+    vmla.u16 d8, d10, d22
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q5, d1, d4
+    vld1.32 {q0}, [r0], r2      @ row 0 load
+    vmls.u16 d8, d10, d23
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q5, d0, d5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q6, d2, d3
+
+    vst1.32 d8, [r9], r6        @ store temp buffer 1
+
+    vext.8 d4, d0, d1, #4
+    vmla.u16 d10, d12, d22
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q6, d1, d4
+    vld1.32 {q0}, [r0], r2      @ row 1 load
+    vmls.u16 d10, d12, d23
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q6, d0, d5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q7, d2, d3
+
+    vst1.32 d10, [r9], r6       @ store temp buffer 2
+
+    vext.8 d4, d0, d1, #4
+    vmla.u16 d12, d14, d22
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q7, d1, d4
+    vld1.32 {q0}, [r0], r2      @ row 2 load
+    vmls.u16 d12, d14, d23
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q7, d0, d5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q8, d2, d3
+    vext.8 d4, d0, d1, #4
+    vmla.u16 d14, d16, d22
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q8, d1, d4
+
+    vst1.32 d12, [r9], r6       @ store temp buffer 3
+
+    vmls.u16 d14, d16, d23
+
+loop_4:                         @ steady state: 4 output rows per iteration
+
+    vld1.32 {q0}, [r0], r2      @ row 3 load
+    vext.8 d5, d0, d1, #5
+    vaddl.u8 q8, d0, d5
+    vext.8 d2, d0, d1, #2
+    vext.8 d3, d0, d1, #3
+    vaddl.u8 q9, d2, d3
+    vst1.32 d14, [r9], r6       @ store temp buffer 4
+    vext.8 d4, d0, d1, #4
+    vmla.u16 d16, d18, d22
+    vext.8 d1, d0, d1, #1
+    vaddl.u8 q9, d1, d4
+    vadd.s16 d2, d10, d12
+    vmls.u16 d16, d18, d23
+    vadd.s16 d3, d8, d14
+    vld1.32 {q9}, [r0], r2      @ row 4 load
+    vext.8 d25, d18, d19, #5
+    vaddl.u8 q13, d18, d25
+    vext.8 d20, d18, d19, #2
+
+    vst1.32 d16, [r9], r6       @ store temp buffer 5
+
+    vaddl.s16 q0, d6, d16
+    vmlal.s16 q0, d2, d22
+    vext.8 d21, d18, d19, #3
+    vaddl.u8 q14, d20, d21
+    vext.8 d24, d18, d19, #4
+    vmlsl.s16 q0, d3, d23
+    vmla.u16 d26, d28, d22
+    vext.8 d19, d18, d19, #1
+    vaddl.u8 q14, d19, d24
+    vadd.s16 d2, d12, d14
+    vmls.u16 d26, d28, d23
+    vqrshrun.s32 d0, q0, #0xa
+    vadd.s16 d3, d10, d16
+    vld1.32 {q9}, [r0], r2      @ row 5 load
+    vext.8 d25, d18, d19, #5
+    vqmovn.u16 d11, q0
+    vaddl.u8 q14, d18, d25
+
+    vst1.32 d26, [r9], r6       @ store temp buffer 6
+
+    @Q3 available here
+    vld1.32 d6, [r7], r6        @ load from temp buffer 0
+    vld1.32 d7, [r7], r6        @ load from temp buffer 1
+    vqrshrun.s16 d9, q3, #5
+
+    vext.8 d20, d18, d19, #2
+
+    vaddl.s16 q0, d8, d26
+    vmlal.s16 q0, d2, d22
+    vext.8 d21, d18, d19, #3
+    vaddl.u8 q3, d20, d21
+    vext.8 d24, d18, d19, #4
+    vmlsl.s16 q0, d3, d23
+    vmla.u16 d28, d6, d22
+    vext.8 d19, d18, d19, #1
+    vaddl.u8 q3, d19, d24
+    vadd.s16 d2, d14, d16
+    vmls.u16 d28, d6, d23
+    vqrshrun.s32 d0, q0, #0xa
+    vadd.s16 d3, d12, d26
+    vld1.32 {q9}, [r0], r2      @ row 6 load
+    vext.8 d25, d18, d19, #5
+    vqmovn.u16 d13, q0
+
+    vtrn.32 d11, d13            @ interleave rows 0 and 1 of the vertical output
+    vaddl.s16 q0, d10, d28
+    vrhadd.u8 d9, d9, d11       @ average half-pel temp rows with vertical output -> quarter-pel
+
+    vst1.32 d28, [r9], r6       @ store temp buffer 7
+
+    vmlal.s16 q0, d2, d22
+    vaddl.u8 q15, d18, d25
+
+    vst1.32 d9[0], [r1], r3     @ store row 0
+
+    vext.8 d20, d18, d19, #2
+
+    vst1.32 d9[1], [r1], r3     @ store row 1
+
+    vext.8 d21, d18, d19, #3
+    vmlsl.s16 q0, d3, d23
+    vaddl.u8 q4, d20, d21
+    vext.8 d24, d18, d19, #4
+    vmla.u16 d30, d8, d22
+    vext.8 d19, d18, d19, #1
+    vaddl.u8 q4, d19, d24
+    vqrshrun.s32 d0, q0, #0xa
+    vadd.s16 d2, d16, d26
+    vmls.u16 d30, d8, d23
+    vqmovn.u16 d4, q0
+
+    vadd.s16 d3, d14, d28
+
+
+    vaddl.s16 q0, d12, d30
+
+    vst1.32 d30, [r9]
+
+    vmlal.s16 q0, d2, d22
+
+    vld1.32 d8, [r7], r6        @ load from temp buffer 2
+    vld1.32 d9, [r7], r6        @ load from temp buffer 3
+    vmlsl.s16 q0, d3, d23
+    subs r4, r4, #4
+    vqrshrun.s16 d10, q4, #5
+
+    vmov d12, d28               @ rotate the software-pipelined row registers for the next iteration
+
+    vqrshrun.s32 d0, q0, #0xa
+    vmov d6, d14
+    vmov d8, d16
+
+    vqmovn.u16 d5, q0
+
+    vtrn.32 d4, d5              @ interleave rows 2 and 3 of the vertical output
+    vrhadd.u8 d4, d4, d10
+    vmov d10, d26
+    vmov d14, d30
+
+    vst1.32 d4[0], [r1], r3     @ store row 2
+    vst1.32 d4[1], [r1], r3     @ store row 3
+
+    bgt loop_4
+
+end_func:                       @ common epilogue for all widths
+    vldmia sp!, {d8-d15}        @ Restore neon registers that were saved
+    ldmfd sp!, {r4-r12, pc}     @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s
new file mode 100755
index 0000000..c39ae01
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s
@@ -0,0 +1,266 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_horz_qpel_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction horizontal quarter pel interpolation.
+@*
+@* @author
+@* Mohit
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_horz_qpel_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@
+
+@/**
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Quarter pel interprediction luma filter for horizontal input
+@*
+@* @par Description:
+@* Applies a 6 tap horizontal filter .The output is clipped to 8 bits
+@* sec 8.4.2.2.1 titled "Luma sample interpolation process"
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@ @param[in] pu1_tmp: temporary buffer: UNUSED in this function
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations.
+@* @returns
+@*
+@ @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_luma_horz_qpel (
+@ UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd,
+@ UWORD8* pu1_tmp,
+@ UWORD32 dydx)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r5 => ht
+@ r6 => wd
+@ r7 => dydx
+
+.text
+.p2align 2
+
+
+ .global ih264_inter_pred_luma_horz_qpel_a9q
+
+ih264_inter_pred_luma_horz_qpel_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r5, [sp, #104] @Loads ht
+ ldr r6, [sp, #108] @Loads wd
+ ldr r7, [sp, #116] @Loads dydx
+ and r7, r7, #3 @Finds x-offset (dydx & 3)
+ add r7, r0, r7, lsr #1 @pu1_src + (x_offset>>1): full/half-pel column averaged in for the qpel result
+ sub r0, r0, #2 @pu1_src-2 (filter needs 2 samples to the left)
+ vmov.i8 d0, #5 @filter coeff 5 (subtracted taps a1, a4)
+ subs r12, r6, #8 @if wd=8 branch to loop_8
+ vmov.i8 d1, #20 @filter coeff 20 (center taps a2, a3)
+
+ beq loop_8
+
+ subs r12, r6, #4 @if wd=4 branch to loop_4
+ beq loop_4
+
+loop_16: @when wd=16
+ @// Processing row0 and row1
+ vld1.8 {d2, d3, d4}, [r0], r2 @// Load row0
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0)
+ vld1.8 {d5, d6, d7}, [r0], r2 @// Load row1
+ vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row0)
+ vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0)
+ vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1)
+ vaddl.u8 q5, d30, d3 @// a0 + a5 (column2,row0)
+ vext.8 d27, d6, d7, #5 @//extract a[5] (column2,row1)
+ vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1)
+ vext.8 d31, d2, d3, #2 @//extract a[2] (column1,row0)
+ vaddl.u8 q8, d27, d6 @// a0 + a5 (column2,row1)
+ vext.8 d30, d3, d4, #2 @//extract a[2] (column2,row0)
+ vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vext.8 d28, d5, d6, #2 @//extract a[2] (column1,row1)
+ vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 (column2,row0)
+ vext.8 d27, d6, d7, #2 @//extract a[2] (column2,row1)
+ vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 (column1,row1)
+ vext.8 d31, d2, d3, #3 @//extract a[3] (column1,row0)
+ vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 (column2,row1)
+ vext.8 d30, d3, d4, #3 @//extract a[3] (column2,row0)
+ vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vext.8 d28, d5, d6, #3 @//extract a[3] (column1,row1)
+ vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0)
+ vext.8 d27, d6, d7, #3 @//extract a[3] (column2,row1)
+ vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ vext.8 d31, d2, d3, #1 @//extract a[1] (column1,row0)
+ vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row1)
+ vext.8 d30, d3, d4, #1 @//extract a[1] (column2,row0)
+ vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vext.8 d28, d5, d6, #1 @//extract a[1] (column1,row1)
+ vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
+ vext.8 d27, d6, d7, #1 @//extract a[1] (column2,row1)
+ vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ vext.8 d31, d2, d3, #4 @//extract a[4] (column1,row0)
+ vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1)
+ vext.8 d30, d3, d4, #4 @//extract a[4] (column2,row0)
+ vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ vext.8 d28, d5, d6, #4 @//extract a[4] (column1,row1)
+ vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
+ vext.8 d27, d6, d7, #4 @//extract a[4] (column2,row1)
+ vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1)
+ vld1.32 {d12, d13}, [r7], r2 @Load value for interpolation (column1,row0)
+ vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vqrshrun.s16 d21, q5, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row2)
+ vrhadd.u8 q10, q6, q10 @Average 6-tap result with the (x_offset>>1) sample for the qpel value
+ vqrshrun.s16 d18, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ vst1.8 {d20, d21}, [r1], r3 @//Store dest row0
+ vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row2)
+ vqrshrun.s16 d19, q8, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1)
+ vld1.32 {d12, d13}, [r7], r2 @Load value for interpolation (column1,row1)
+ vrhadd.u8 q9, q6, q9 @Average 6-tap result with the (x_offset>>1) sample for the qpel value
+ vst1.8 {d18, d19}, [r1], r3 @//Store dest row1
+ subs r5, r5, #2 @ 2 rows done, decrement by 2
+
+ beq end_func
+ b loop_16
+
+loop_8:
+@// Processing row0 and row1
+
+ vld1.8 {d5, d6}, [r0], r2 @// Load first row (NOTE(review): tap comments below call it row1, but it is stored to dest row0)
+ vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1)
+ vld1.8 {d2, d3}, [r0], r2 @// Load second row (tap comments below call it row0; stored to dest row1)
+ vext.8 d25, d5, d6, #2 @//extract a[2] (column1,row1)
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0)
+ vext.8 d24, d5, d6, #3 @//extract a[3] (column1,row1)
+ vext.8 d23, d5, d6, #1 @//extract a[1] (column1,row1)
+ vext.8 d22, d5, d6, #4 @//extract a[4] (column1,row1)
+ vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1)
+ vext.8 d29, d2, d3, #3 @//extract a[3] (column1,row0)
+ vmlal.u8 q7, d25, d1 @// a0 + a5 + 20a2 (column1,row1)
+ vmlal.u8 q7, d24, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ vmlsl.u8 q7, d23, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ vmlsl.u8 q7, d22, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ vext.8 d30, d2, d3, #2 @//extract a[2] (column1,row0)
+ vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0)
+ vext.8 d27, d2, d3, #1 @//extract a[1] (column1,row0)
+ vext.8 d26, d2, d3, #4 @//extract a[4] (column1,row0)
+ vmlal.u8 q4, d29, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vmlal.u8 q4, d30, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vmlsl.u8 q4, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vmlsl.u8 q4, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ vqrshrun.s16 d18, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vld1.32 d12, [r7], r2 @Load value for interpolation (column1,row0)
+ vld1.32 d13, [r7], r2 @Load value for interpolation (column1,row1)
+ vqrshrun.s16 d19, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ vrhadd.u8 q9, q6, q9 @Average 6-tap result with the (x_offset>>1) samples for the qpel values
+ vst1.8 {d18}, [r1], r3 @//Store dest row0
+ vst1.8 {d19}, [r1], r3 @//Store dest row1
+ subs r5, r5, #2 @ 2 rows done, decrement by 2
+
+ beq end_func @ Exit when all rows are processed
+ b loop_8 @loop for the remaining rows
+
+loop_4:
+ vld1.8 {d5, d6}, [r0], r2 @// Load first row (NOTE(review): tap comments below call it row1, but it is stored to dest row0)
+ vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1)
+ vld1.8 {d2, d3}, [r0], r2 @// Load second row (tap comments below call it row0; stored to dest row1)
+ vext.8 d25, d5, d6, #2 @//extract a[2] (column1,row1)
+ vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0)
+ vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1)
+ vext.8 d24, d5, d6, #3 @//extract a[3] (column1,row1)
+ vext.8 d23, d5, d6, #1 @//extract a[1] (column1,row1)
+ vext.8 d22, d5, d6, #4 @//extract a[4] (column1,row1)
+ vext.8 d29, d2, d3, #3 @//extract a[3] (column1,row0)
+ vmlal.u8 q7, d25, d1 @// a0 + a5 + 20a2 (column1,row1)
+ vmlal.u8 q7, d24, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ vmlsl.u8 q7, d23, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ vmlsl.u8 q7, d22, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0)
+ vext.8 d30, d2, d3, #2 @//extract a[2] (column1,row0)
+ vld1.32 d12, [r7], r2 @Load value for interpolation (column1,row0)
+ vld1.32 d13, [r7], r2 @Load value for interpolation (column1,row1)
+ vext.8 d27, d2, d3, #1 @//extract a[1] (column1,row0)
+ vext.8 d26, d2, d3, #4 @//extract a[4] (column1,row0)
+ vmlal.u8 q4, d29, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ vmlal.u8 q4, d30, d1 @// a0 + a5 + 20a2 (column1,row0)
+ vmlsl.u8 q4, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ vmlsl.u8 q4, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ vqrshrun.s16 d18, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ vqrshrun.s16 d19, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ vrhadd.u8 q9, q6, q9 @Average 6-tap result with the (x_offset>>1) samples for the qpel values
+ vst1.32 d18[0], [r1], r3 @//Store dest row0
+ vst1.32 d19[0], [r1], r3 @//Store dest row1
+
+ subs r5, r5, #2 @ 2 rows done, decrement by 2
+ beq end_func
+
+ b loop_4
+
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s
new file mode 100755
index 0000000..565cc80
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s
@@ -0,0 +1,505 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Mohit
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@
+
+@/**
+@/**
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* This function implements a two stage cascaded six tap filter. It
+@* applies the six tap filter in the vertical direction on the
+@* predictor values, followed by applying the same filter in the
+@* horizontal direction on the output of the first stage. It then averages
+@* the output of the 1st stage and the final stage to obtain the quarter
+@* pel values.The six tap filtering operation is described in sec 8.4.2.2.1
+@* titled "Luma sample interpolation process".
+@*
+@* @par Description:
+@* This function is called to obtain pixels lying at the following
+@* location (1/4,1/2) or (3/4,1/2). The function interpolates
+@* the predictors first in the verical direction and then in the
+@* horizontal direction to output the (1/2,1/2). It then averages
+@* the output of the 2nd stage and (1/2,1/2) value to obtain (1/4,1/2)
+@* or (3/4,1/2) depending on the offset.
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pu1_tmp: temporary buffer
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/;
+
+@void ih264_inter_pred_luma_horz_qpel_vert_hpel(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd,
+@ UWORD8* pu1_tmp,
+@ UWORD32 dydx)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ht
+@ r5 => wd
+@ r6 => dydx
+@ r9 => *pu1_tmp
+
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q
+
+ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r4, [sp, #104] @ loads ht
+ sub r0, r0, r2, lsl #1 @pu1_src-2*src_strd (vertical filter needs 2 rows above)
+ sub r0, r0, #2 @pu1_src-2 (horizontal filter needs 2 samples to the left)
+ ldr r5, [sp, #108] @ loads wd
+ ldr r6, [sp, #116] @ loads dydx
+ and r6, r6, #2 @ (dydx & 3) >> 1 scaled by 2: byte offset of x_offset>>1 in the 16-bit temp buffer
+ ldr r9, [sp, #112] @pu1_tmp
+ add r7, r9, #4
+ add r6, r7, r6 @ pi16_pred1_temp += (x_offset>>1)
+
+ vmov.u16 q13, #0x14 @ Filter coeff 20 into Q13
+ vmov.u16 q12, #0x5 @ Filter coeff 5 into Q12
+ mov r7, #0x20 @ post-increment step (bytes) used on the temp buffer below
+ mov r8, #0x30 @ post-increment step (bytes) for temp-buffer reads in the wd=4/8 paths
+ subs r12, r5, #4 @if wd=4 branch to loop_4
+ beq loop_4
+
+ subs r12, r5, #8 @if wd=8 branch to loop_8
+ beq loop_8
+
+ @when wd=16
+ vmov.u16 q14, #0x14 @ Filter coeff 20 into Q14
+ vmov.u16 q15, #0x5 @ Filter coeff 5 into Q15
+ add r14, r2, #0 @ keep the original src_strd in r14
+ sub r2, r2, #16 @ r2 compensates for the 16 bytes already consumed by the post-indexed loads
+
+
+loop_16:
+
+ vld1.u32 {q0}, [r0]! @ Vector load from src[0_0]
+ vld1.u32 d12, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {q1}, [r0]! @ Vector load from src[1_0]
+ vld1.u32 d13, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {q2}, [r0]! @ Vector load from src[2_0]
+ vld1.u32 d14, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {q3}, [r0]! @ Vector load from src[3_0]
+ vld1.u32 d15, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {q4}, [r0]! @ Vector load from src[4_0]
+ vld1.u32 d16, [r0], r2 @ Vector load from src[4_0]
+
+ vld1.u32 {q5}, [r0]! @ Vector load from src[5_0]
+ vld1.u32 d17, [r0], r2 @ Vector load from src[5_0]
+
+ vaddl.u8 q10, d4, d6 @ vertical 6-tap: a2 + a3 terms
+ vaddl.u8 q9, d0, d10 @ vertical 6-tap: a0 + a5 terms
+ vaddl.u8 q11, d2, d8 @ vertical 6-tap: a1 + a4 terms
+ vmla.u16 q9, q10, q14
+ vaddl.u8 q12, d5, d7
+ vaddl.u8 q10, d1, d11
+ vaddl.u8 q13, d3, d9
+ vmla.u16 q10, q12, q14
+ vaddl.u8 q12, d14, d15
+ vmls.u16 q9, q11, q15
+ vaddl.u8 q11, d12, d17
+ vmls.u16 q10, q13, q15
+ vaddl.u8 q13, d13, d16
+ vmla.u16 q11, q12, q14
+ vmls.u16 q11, q13, q15
+ vst1.32 {q9}, [r9]! @ store vertical half-pel (16-bit) to temp buffer
+ vst1.32 {q10}, [r9]!
+ vext.16 q12, q9, q10, #2
+ vext.16 q13, q9, q10, #3
+ vst1.32 {q11}, [r9]
+ vext.16 q11, q9, q10, #5
+ vadd.s16 q0, q12, q13 @ horizontal 6-tap on 16-bit data: a2 + a3
+ vext.16 q12, q9, q10, #1
+ vext.16 q13, q9, q10, #4
+ vadd.s16 q12, q12, q13 @ horizontal 6-tap: a1 + a4
+
+ vaddl.s16 q13, d18, d22
+ vmlal.s16 q13, d0, d28
+ vmlsl.s16 q13, d24, d30
+
+ vaddl.s16 q11, d19, d23
+ vmlal.s16 q11, d1, d28
+ vmlsl.s16 q11, d25, d30
+
+ vqrshrun.s32 d18, q13, #10 @ second-stage rounding: (sum + 512) >> 10
+ vqrshrun.s32 d19, q11, #10
+ vld1.32 {q11}, [r9]!
+ vqmovn.u16 d18, q9
+
+ vext.16 q12, q10, q11, #2
+ vext.16 q13, q10, q11, #3
+ vext.16 q0, q10, q11, #5
+ vst1.32 d18, [r1] @ park half of row 0 in dest (reloaded below before averaging)
+ vadd.s16 q9, q12, q13
+ vext.16 q12, q10, q11, #1
+ vext.16 q13, q10, q11, #4
+ vadd.s16 q12, q12, q13
+
+ vaddl.s16 q13, d0, d20
+ vmlal.s16 q13, d18, d28
+ vmlsl.s16 q13, d24, d30
+
+ vaddl.s16 q11, d1, d21
+ vmlal.s16 q11, d19, d28
+ vmlsl.s16 q11, d25, d30
+
+ vqrshrun.s32 d18, q13, #10
+ vqrshrun.s32 d19, q11, #10
+
+ vaddl.u8 q12, d7, d9
+ vld1.32 {q10}, [r6]! @ reload vertical half-pel at x_offset>>1 from temp buffer
+ vld1.32 {q11}, [r6], r7
+
+ vqmovn.u16 d19, q9
+
+ vld1.32 d18, [r1]
+ vqrshrun.s16 d20, q10, #5 @ round vertical half-pel to 8 bits: (v + 16) >> 5
+ vqrshrun.s16 d21, q11, #5
+ vaddl.u8 q11, d4, d10
+ vld1.u32 {q0}, [r0]! @ Vector load from src[6_0]
+ vrhadd.u8 q9, q9, q10 @ average (1/2,1/2) with vertical half-pel to get the qpel result
+ vld1.u32 d12, [r0], r2 @ Vector load from src[6_0]
+ vaddl.u8 q10, d6, d8
+ vaddl.u8 q13, d5, d11
+ vst1.32 {q9}, [r1], r3 @ store row 0
+
+@ROW_2
+
+ vaddl.u8 q9, d2, d0
+
+ vmla.u16 q9, q10, q14
+
+ vaddl.u8 q10, d3, d1
+
+ vmla.u16 q10, q12, q14
+ vaddl.u8 q12, d15, d16
+ vmls.u16 q9, q11, q15
+ vaddl.u8 q11, d13, d12
+ vmls.u16 q10, q13, q15
+ vaddl.u8 q13, d14, d17
+ vmla.u16 q11, q12, q14
+ vmls.u16 q11, q13, q15
+ vst1.32 {q9}, [r9]!
+ vst1.32 {q10}, [r9]!
+ vext.16 q12, q9, q10, #2
+ vext.16 q13, q9, q10, #3
+ vst1.32 {q11}, [r9]
+ vext.16 q11, q9, q10, #5
+ vadd.s16 q1, q12, q13
+ vext.16 q12, q9, q10, #1
+ vext.16 q13, q9, q10, #4
+ vadd.s16 q12, q12, q13
+
+ vaddl.s16 q13, d18, d22
+ vmlal.s16 q13, d2, d28
+ vmlsl.s16 q13, d24, d30
+
+ vaddl.s16 q11, d19, d23
+ vmlal.s16 q11, d3, d28
+ vmlsl.s16 q11, d25, d30
+
+ vqrshrun.s32 d18, q13, #10
+ vqrshrun.s32 d19, q11, #10
+ vld1.32 {q11}, [r9]!
+ vqmovn.u16 d18, q9
+
+ vext.16 q12, q10, q11, #2
+ vext.16 q13, q10, q11, #3
+ vext.16 q1, q10, q11, #5
+ vst1.32 d18, [r1] @ park half of row 1 in dest (reloaded below before averaging)
+ vadd.s16 q9, q12, q13
+ vext.16 q12, q10, q11, #1
+ vext.16 q13, q10, q11, #4
+ vadd.s16 q12, q12, q13
+
+ vaddl.s16 q13, d2, d20
+ vmlal.s16 q13, d18, d28
+ vmlsl.s16 q13, d24, d30
+
+ vaddl.s16 q11, d3, d21
+ vmlal.s16 q11, d19, d28
+ vmlsl.s16 q11, d25, d30
+
+ vqrshrun.s32 d18, q13, #10
+ vqrshrun.s32 d19, q11, #10
+ vaddl.u8 q12, d9, d11
+ vld1.32 {q10}, [r6]!
+ vld1.32 {q11}, [r6], r7
+ vqmovn.u16 d19, q9
+ vld1.32 d18, [r1]
+ vqrshrun.s16 d20, q10, #5
+ vqrshrun.s16 d21, q11, #5
+
+ vrhadd.u8 q9, q9, q10 @ average (1/2,1/2) with vertical half-pel to get the qpel result
+
+ vst1.32 {q9}, [r1], r3 @ store row 1
+
+ subs r4, r4, #2
+ subne r0, r0 , r14, lsl #2 @ rewind r0 by 4*src_strd ...
+ subne r0, r0, r14 @ ... total 5*src_strd: successive iterations reuse overlapping rows
+
+ beq end_func @ Exit when all rows are processed
+ b loop_16 @ Loop for the remaining rows
+
+loop_8:
+ vld1.u32 {q0}, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {q1}, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {q2}, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {q3}, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {q4}, [r0], r2 @ Vector load from src[4_0]
+
+ vld1.u32 {q5}, [r0], r2 @ Vector load from src[5_0]
+ vaddl.u8 q7, d4, d6 @ vertical 6-tap: a2 + a3
+ vaddl.u8 q6, d0, d10 @ vertical 6-tap: a0 + a5
+ vaddl.u8 q8, d2, d8 @ vertical 6-tap: a1 + a4
+ vmla.u16 q6, q7, q13
+ vaddl.u8 q9, d5, d7
+ vaddl.u8 q7, d1, d11
+ vaddl.u8 q11, d3, d9
+ vmla.u16 q7, q9, q13
+ vmls.u16 q6, q8, q12
+ vld1.32 {q0}, [r0], r2 @ Vector load from src[6_0]
+ vaddl.u8 q8, d6, d8
+ vmls.u16 q7, q11, q12
+ vaddl.u8 q14, d2, d0
+ vst1.32 {q6}, [r9]! @ store row 0 to temp buffer: col 0
+ vext.16 q11, q6, q7, #5
+ vaddl.u8 q9, d4, d10
+ vmla.u16 q14, q8, q13
+ vaddl.s16 q15, d12, d22
+ vst1.32 {q7}, [r9], r7 @ store row 0 to temp buffer: col 1
+ vaddl.s16 q11, d13, d23
+ vext.16 q8, q6, q7, #2
+ vmls.u16 q14, q9, q12
+ vext.16 q9, q6, q7, #3
+ vext.16 q10, q6, q7, #4
+ vext.16 q7, q6, q7, #1
+ vadd.s16 q8, q8, q9 @ horizontal 6-tap on 16-bit data: a2 + a3
+ vadd.s16 q9, q7, q10 @ horizontal 6-tap: a1 + a4
+ vaddl.u8 q10, d7, d9
+ vmlal.s16 q15, d16, d26
+ vmlsl.s16 q15, d18, d24
+ vmlal.s16 q11, d17, d26
+ vmlsl.s16 q11, d19, d24
+ vaddl.u8 q7, d3, d1
+ vst1.32 {q14}, [r9]! @ store row 1 to temp buffer: col 0
+ vmla.u16 q7, q10, q13
+ vqrshrun.s32 d12, q15, #10 @ second-stage rounding: (sum + 512) >> 10
+ vaddl.u8 q8, d5, d11
+ vqrshrun.s32 d13, q11, #10
+ vmls.u16 q7, q8, q12
+@ vld1.32 {q1},[r0],r2 ; Vector load from src[7_0]
+ vqmovn.u16 d25, q6
+ vaddl.u8 q8, d8, d10
+
+
+ vext.16 q11, q14, q7, #5
+ vaddl.u8 q10, d4, d2
+ vaddl.s16 q15, d28, d22
+ vmla.u16 q10, q8, q13
+ vst1.32 {q7}, [r9], r7 @ store row 1 to temp buffer: col 1
+ vaddl.s16 q11, d29, d23
+ vext.16 q8, q14, q7, #2
+ vext.16 q9, q14, q7, #3
+ vext.16 q6, q14, q7, #4
+ vext.16 q7, q14, q7, #1
+ vadd.s16 q8, q8, q9
+ vadd.s16 q9, q6, q7
+ vld1.32 {q7}, [r6], r8 @ load row 0 from temp buffer
+ vmlal.s16 q15, d16, d26
+ vmlsl.s16 q15, d18, d24
+ vmlal.s16 q11, d17, d26
+ vmlsl.s16 q11, d19, d24
+ vqrshrun.s16 d14, q7, #0x5 @ round vertical half-pel to 8 bits: (v + 16) >> 5
+ vld1.32 {q14}, [r6], r8 @ load row 1 from temp buffer
+ vaddl.u8 q9, d6, d0
+ vqrshrun.s32 d16, q15, #10
+ vqrshrun.s16 d15, q14, #0x5
+ vqrshrun.s32 d17, q11, #10
+ vmov d12, d25
+ vmov d25, d24
+
+ vqmovn.u16 d13, q8
+ vrhadd.u8 q6, q6, q7 @ average (1/2,1/2) with vertical half-pel to get the qpel result
+
+ vst1.32 d12, [r1], r3 @ store row 0
+ vst1.32 d13, [r1], r3 @ store row 1
+
+ subs r4, r4, #2
+ subne r0, r0 , r2, lsl #2 @ rewind r0 by 4*src_strd ...
+ subne r0, r0, r2 @ ... total 5*src_strd: successive iterations reuse overlapping rows
+
+ beq end_func @ Exit when all rows are processed
+ b loop_8 @ Loop for the remaining rows
+
+loop_4:
+ vld1.u32 {q0}, [r0], r2 @ Vector load from src[0_0]
+ vld1.u32 {q1}, [r0], r2 @ Vector load from src[1_0]
+ vld1.u32 {q2}, [r0], r2 @ Vector load from src[2_0]
+ vld1.u32 {q3}, [r0], r2 @ Vector load from src[3_0]
+ vld1.u32 {q4}, [r0], r2 @ Vector load from src[4_0]
+ vld1.u32 {q5}, [r0], r2 @ Vector load from src[5_0]
+
+ vaddl.u8 q7, d4, d6 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q6, d0, d10 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q8, d2, d8 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q6, q7, q13 @ temp += temp1 * 20
+ vaddl.u8 q9, d5, d7 @ temp1 = src[2_0] + src[3_0]
+ vaddl.u8 q7, d1, d11 @ temp = src[0_0] + src[5_0]
+ vaddl.u8 q11, d3, d9 @ temp2 = src[1_0] + src[4_0]
+ vmla.u16 q7, q9, q13 @ temp += temp1 * 20
+ vmls.u16 q6, q8, q12 @ temp -= temp2 * 5
+ vld1.32 {q0}, [r0], r2 @ Vector load from src[6_0]
+ vaddl.u8 q8, d6, d8
+ vmls.u16 q7, q11, q12 @ temp -= temp2 * 5
+ @Q6 and Q7 have filtered values
+ vaddl.u8 q14, d2, d0
+ vst1.32 {q6}, [r9]! @ store row 0 to temp buffer: col 0
+ vext.16 q11, q6, q7, #5
+ vaddl.u8 q9, d4, d10
+ vmla.u16 q14, q8, q13
+ vaddl.s16 q15, d12, d22
+ vst1.32 {q7}, [r9], r7 @ store row 0 to temp buffer: col 1
+ vaddl.s16 q11, d13, d23
+ vext.16 q8, q6, q7, #2
+ vmls.u16 q14, q9, q12
+ vext.16 q9, q6, q7, #3
+ vext.16 q10, q6, q7, #4
+ vext.16 q7, q6, q7, #1
+ vadd.s16 q8, q8, q9
+ vadd.s16 q9, q7, q10
+ vaddl.u8 q10, d7, d9
+ vmlal.s16 q15, d16, d26
+ vmlsl.s16 q15, d18, d24
+ vmlal.s16 q11, d17, d26
+ vmlsl.s16 q11, d19, d24
+ vaddl.u8 q7, d3, d1
+ vst1.32 {q14}, [r9]! @ store row 1 to temp buffer: col 0
+ vmla.u16 q7, q10, q13
+ vqrshrun.s32 d12, q15, #10
+ vaddl.u8 q8, d5, d11
+ vqrshrun.s32 d13, q11, #10
+ vmls.u16 q7, q8, q12
+ vqmovn.u16 d25, q6
+ vaddl.u8 q8, d8, d10
+
+ vext.16 q11, q14, q7, #5
+ vaddl.u8 q10, d4, d2
+ vaddl.s16 q15, d28, d22
+ vmla.u16 q10, q8, q13
+ vst1.32 {q7}, [r9], r7 @ store row 1 to temp buffer: col 1
+ vaddl.s16 q11, d29, d23
+ vext.16 q8, q14, q7, #2
+ vext.16 q9, q14, q7, #3
+ vext.16 q6, q14, q7, #4
+ vext.16 q7, q14, q7, #1
+ vadd.s16 q8, q8, q9
+ vadd.s16 q9, q6, q7
+ vld1.32 d14, [r6], r8 @load row 0 from temp buffer
+ vmlal.s16 q15, d16, d26
+ vmlsl.s16 q15, d18, d24
+ vmlal.s16 q11, d17, d26
+ vmlsl.s16 q11, d19, d24
+ vqrshrun.s16 d14, q7, #0x5 @ round vertical half-pel to 8 bits: (v + 16) >> 5
+ vld1.32 d28, [r6], r8 @load row 1 from temp buffer
+ vaddl.u8 q9, d6, d0
+ vqrshrun.s32 d16, q15, #10
+ vqrshrun.s16 d15, q14, #0x5
+ vqrshrun.s32 d17, q11, #10
+ vmov d12, d25
+ vmov d25, d24
+
+ vqmovn.u16 d13, q8
+ vrhadd.u8 q6, q6, q7 @ average (1/2,1/2) with vertical half-pel to get the qpel result
+ vst1.32 d12[0], [r1], r3 @ store row 0
+ vst1.32 d13[0], [r1], r3 @store row 1
+
+ subs r4, r4, #2
+ subne r0, r0 , r2, lsl #2 @ rewind r0 by 4*src_strd ...
+ subne r0, r0, r2 @ ... total 5*src_strd: successive iterations reuse overlapping rows
+
+ beq end_func @ Exit when all rows are processed
+ b loop_4 @ Loop for the remaining rows
+
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s
new file mode 100755
index 0000000..3c8b60a
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s
@@ -0,0 +1,355 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction interpolation.
+@*
+@* @author
+@* Mohit
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@
+
+@/**
+@/**
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* This function implements two six tap filters. It
+@* applies the six tap filter in the horizontal direction on the
+@* predictor values, then applies the same filter in the
+@* vertical direction on the predictor values. It then averages these
+@* two outputs to obtain quarter pel values in horizontal and vertical direction.
+@* The six tap filtering operation is described in sec 8.4.2.2.1 titled
+@* "Luma sample interpolation process"
+@*
+@* @par Description:
+@* This function is called to obtain pixels lying at the following
+@* location (1/4,1/4) or (3/4,1/4) or (1/4,3/4) or (3/4,3/4).
+@* The function interpolates the predictors first in the horizontal direction
+@* and then in the vertical direction, and then averages these two
+@* values.
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pu1_tmp: temporary buffer
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/;
+
+@void ih264_inter_pred_luma_horz_qpel_vert_qpel(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd,
+@ UWORD8* pu1_tmp,
+@ UWORD32 dydx)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ht
+@ r5 => wd
+@ r6 => dydx
+
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q
+
+ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q:
+
+ stmfd sp!, {r4-r12, r14} @store register values to stack
+ vstmdb sp!, {d8-d15} @push neon registers to stack
+ ldr r4, [sp, #104] @ loads ht
+ ldr r5, [sp, #108] @ loads wd
+ ldr r6, [sp, #116] @dydx
+ and r7, r6, #3 @ x_offset = dydx & 3
+ add r7, r0, r7, lsr #1 @pu1_pred_vert = pu1_src + (x_offset>>1)
+
+ and r6, r6, #12 @Finds y-offset
+ lsr r6, r6, #3 @ (dydx & 0xC) >> 3 = y_offset>>1
+ mul r6, r2, r6
+ add r6, r0, r6 @pu1_pred_horz = pu1_src + (y_offset>>1)*src_strd
+ sub r7, r7, r2, lsl #1 @pu1_pred_vert-2*src_strd (vertical filter needs 2 rows above)
+ sub r6, r6, #2 @pu1_pred_horz-2 (horizontal filter needs 2 samples to the left)
+ vmov.u8 d30, #20 @ Filter coeff 20
+ vmov.u8 d31, #5 @ Filter coeff 5
+
+ subs r12, r5, #4 @if wd=4 branch to loop_4
+ beq loop_4
+ subs r12, r5, #8 @if wd=8 branch to loop_8
+ beq loop_8
+
+loop_16:
+ vld1.32 {q0}, [r7], r2 @ Vector load from src[0_0]
+ vld1.32 {q1}, [r7], r2 @ Vector load from src[1_0]
+ vld1.32 {q2}, [r7], r2 @ Vector load from src[2_0]
+ vld1.32 {q3}, [r7], r2 @ Vector load from src[3_0]
+ vld1.32 {q4}, [r7], r2 @ Vector load from src[4_0]
+ add r11, r6, #8 @ second 8-byte column of the horizontal source
+ vld1.32 {q5}, [r7], r2 @ Vector load from src[5_0]
+ vld1.32 {q9}, [r6], r2 @ horz row0, col 0
+ vaddl.u8 q12, d0, d10 @ vertical 6-tap: a0 + a5
+ vmlal.u8 q12, d4, d30
+ vmlal.u8 q12, d6, d30
+ vmlsl.u8 q12, d2, d31
+ vmlsl.u8 q12, d8, d31
+ vext.8 d23, d18, d19, #5
+ vext.8 d20, d18, d19, #2
+ vext.8 d21, d18, d19, #3
+ vext.8 d22, d18, d19, #4
+ vext.8 d19, d18, d19, #1
+ vqrshrun.s16 d26, q12, #5 @ vertical qpel contribution: (v + 16) >> 5
+ vaddl.u8 q14, d18, d23 @ horizontal 6-tap: a0 + a5
+ vmlal.u8 q14, d20, d30
+ vmlal.u8 q14, d21, d30
+ vmlsl.u8 q14, d19, d31
+ vmlsl.u8 q14, d22, d31
+ vld1.32 {q9}, [r11], r2 @ horz row 0, col 1
+ vaddl.u8 q12, d1, d11
+ vmlal.u8 q12, d5, d30
+ vmlal.u8 q12, d7, d30
+ vmlsl.u8 q12, d3, d31
+ vmlsl.u8 q12, d9, d31
+ vqrshrun.s16 d28, q14, #5
+ vext.8 d23, d18, d19, #5
+ vext.8 d20, d18, d19, #2
+ vext.8 d21, d18, d19, #3
+ vext.8 d22, d18, d19, #4
+ vext.8 d19, d18, d19, #1
+ vqrshrun.s16 d27, q12, #5
+ vld1.32 {q6}, [r7], r2 @ src[6_0]
+
+ vaddl.u8 q12, d18, d23
+ vmlal.u8 q12, d20, d30
+ vmlal.u8 q12, d21, d30
+ vmlsl.u8 q12, d19, d31
+ vmlsl.u8 q12, d22, d31
+
+ vaddl.u8 q8, d2, d12 @ vertical 6-tap for the next output row
+ vmlal.u8 q8, d6, d30
+ vmlal.u8 q8, d8, d30
+ vmlsl.u8 q8, d4, d31
+ vmlsl.u8 q8, d10, d31
+ vqrshrun.s16 d29, q12, #5
+ vld1.32 {q9}, [r6], r2 @ horz row 1, col 0
+
+ vaddl.u8 q12, d3, d13
+ vmlal.u8 q12, d7, d30
+ vmlal.u8 q12, d9, d30
+ vmlsl.u8 q12, d5, d31
+ vmlsl.u8 q12, d11, d31
+ vrhadd.u8 q14, q14, q13 @ average horizontal and vertical filter outputs for the qpel value
+ vqrshrun.s16 d26, q8, #5
+ vext.8 d23, d18, d19, #5
+ vext.8 d20, d18, d19, #2
+ vext.8 d21, d18, d19, #3
+ vext.8 d22, d18, d19, #4
+ vst1.32 {q14}, [r1], r3 @ store row 0
+ vext.8 d19, d18, d19, #1
+ vqrshrun.s16 d27, q12, #5
+
+ vaddl.u8 q14, d18, d23
+ vmlal.u8 q14, d20, d30
+ vmlal.u8 q14, d21, d30
+ vmlsl.u8 q14, d19, d31
+ vmlsl.u8 q14, d22, d31
+
+ vld1.32 {q9}, [r11], r2 @ horz row 1, col 1
+
+ vext.8 d23, d18, d19, #5
+ vext.8 d20, d18, d19, #2
+ vext.8 d21, d18, d19, #3
+ vext.8 d22, d18, d19, #4
+ vext.8 d19, d18, d19, #1
+
+ vqrshrun.s16 d28, q14, #5
+ vaddl.u8 q12, d18, d23
+ vmlal.u8 q12, d20, d30
+ vmlal.u8 q12, d21, d30
+ vmlsl.u8 q12, d19, d31
+ vmlsl.u8 q12, d22, d31
+
+ vqrshrun.s16 d29, q12, #5
+ vrhadd.u8 q14, q14, q13 @ average horizontal and vertical filter outputs for the qpel value
+ vst1.32 {q14}, [r1], r3 @ store row 1
+
+ subs r4, r4, #2 @ 2 rows processed, decrement by 2
+ subne r7, r7 , r2, lsl #2 @ rewind r7 by 4*src_strd ...
+ subne r7, r7, r2 @ ... total 5*src_strd: successive iterations reuse overlapping rows
+ beq end_func @ Exit when all rows are processed
+
+ b loop_16 @ looping if height = 8 or 16
+
+
+loop_8:
+ vld1.32 d0, [r7], r2 @ Vector load from src[0_0]
+ vld1.32 d1, [r7], r2 @ Vector load from src[1_0]
+ vld1.32 d2, [r7], r2 @ Vector load from src[2_0]
+ vld1.32 d3, [r7], r2 @ Vector load from src[3_0]
+ vld1.32 d4, [r7], r2 @ Vector load from src[4_0]
+ vld1.32 d5, [r7], r2 @ Vector load from src[5_0]
+ vaddl.u8 q5, d0, d5 @ vertical 6-tap: a0 + a5
+ vmlal.u8 q5, d2, d30
+ vmlal.u8 q5, d3, d30
+ vmlsl.u8 q5, d1, d31
+ vmlsl.u8 q5, d4, d31
+ vld1.32 {q6}, [r6], r2 @horz row 0
+ vext.8 d17, d12, d13, #5
+ vext.8 d14, d12, d13, #2
+ vext.8 d15, d12, d13, #3
+ vext.8 d16, d12, d13, #4
+ vext.8 d13, d12, d13, #1
+ vqrshrun.s16 d26, q5, #5 @ vertical qpel contribution: (v + 16) >> 5
+ vld1.32 d6, [r7], r2 @ src[6_0]
+ vaddl.u8 q5, d12, d17 @ horizontal 6-tap: a0 + a5
+ vmlal.u8 q5, d14, d30
+ vmlal.u8 q5, d15, d30
+ vmlsl.u8 q5, d13, d31
+ vmlsl.u8 q5, d16, d31
+ vld1.32 {q6}, [r6], r2 @ horz row 1
+ vaddl.u8 q9, d1, d6
+ vmlal.u8 q9, d3, d30
+ vmlal.u8 q9, d4, d30
+ vmlsl.u8 q9, d2, d31
+ vmlsl.u8 q9, d5, d31
+ vqrshrun.s16 d28, q5, #5
+ vext.8 d17, d12, d13, #5
+ vext.8 d14, d12, d13, #2
+ vext.8 d15, d12, d13, #3
+ vext.8 d16, d12, d13, #4
+ vext.8 d13, d12, d13, #1
+ vqrshrun.s16 d27, q9, #5
+ vaddl.u8 q5, d12, d17
+ vmlal.u8 q5, d14, d30
+ vmlal.u8 q5, d15, d30
+ vmlsl.u8 q5, d13, d31
+ vmlsl.u8 q5, d16, d31
+ vqrshrun.s16 d29, q5, #5
+ vrhadd.u8 q13, q13, q14 @ average horizontal and vertical filter outputs for the qpel value
+ vst1.32 d26, [r1], r3
+ vst1.32 d27, [r1], r3
+
+ subs r4, r4, #2 @ 2 rows processed, decrement by 2
+ subne r7, r7 , r2, lsl #2 @ rewind r7 by 4*src_strd ...
+ subne r7, r7, r2 @ ... total 5*src_strd: successive iterations reuse overlapping rows
+ beq end_func @ Exit when all rows are processed
+ b loop_8 @looping if height == 8 or 16
+
+loop_4:
+ vld1.32 d0[0], [r7], r2 @ Vector load from src[0_0]
+ vld1.32 d1[0], [r7], r2 @ Vector load from src[1_0]
+ vld1.32 d2[0], [r7], r2 @ Vector load from src[2_0]
+ vld1.32 d3[0], [r7], r2 @ Vector load from src[3_0]
+ vld1.32 d4[0], [r7], r2 @ Vector load from src[4_0]
+ vld1.32 d5[0], [r7], r2 @ Vector load from src[5_0]
+ vaddl.u8 q5, d0, d5 @ vertical 6-tap: a0 + a5
+ vmlal.u8 q5, d2, d30
+ vmlal.u8 q5, d3, d30
+ vmlsl.u8 q5, d1, d31
+ vmlsl.u8 q5, d4, d31
+ vld1.32 {q6}, [r6], r2 @load for horz filter row 0
+ vext.8 d17, d12, d13, #5
+ vext.8 d14, d12, d13, #2
+ vext.8 d15, d12, d13, #3
+ vext.8 d16, d12, d13, #4
+ vext.8 d13, d12, d13, #1
+ vqrshrun.s16 d26, q5, #5 @ vertical qpel contribution: (v + 16) >> 5
+ vld1.32 d6[0], [r7], r2 @ Vector load from src[6_0]
+ vaddl.u8 q5, d12, d17 @ horizontal 6-tap: a0 + a5
+ vmlal.u8 q5, d14, d30
+ vmlal.u8 q5, d15, d30
+ vmlsl.u8 q5, d13, d31
+ vmlsl.u8 q5, d16, d31
+ vld1.32 {q6}, [r6], r2 @horz row 1
+ vaddl.u8 q9, d1, d6
+ vmlal.u8 q9, d3, d30
+ vmlal.u8 q9, d4, d30
+ vmlsl.u8 q9, d2, d31
+ vmlsl.u8 q9, d5, d31
+ vqrshrun.s16 d28, q5, #5
+ vext.8 d17, d12, d13, #5
+ vext.8 d14, d12, d13, #2
+ vext.8 d15, d12, d13, #3
+ vext.8 d16, d12, d13, #4
+ vext.8 d13, d12, d13, #1
+ vqrshrun.s16 d27, q9, #5
+ vaddl.u8 q5, d12, d17
+ vmlal.u8 q5, d14, d30
+ vmlal.u8 q5, d15, d30
+ vmlsl.u8 q5, d13, d31
+ vmlsl.u8 q5, d16, d31
+ vqrshrun.s16 d29, q5, #5
+ vrhadd.u8 q13, q13, q14 @ average horizontal and vertical filter outputs for the qpel value
+ vst1.32 d26[0], [r1], r3
+ vst1.32 d27[0], [r1], r3
+
+ subs r4, r4, #2 @ 2 rows processed, decrement by 2
+ subne r7, r7 , r2, lsl #2 @ rewind r7 by 4*src_strd ...
+ subne r7, r7, r2 @ ... total 5*src_strd: successive iterations reuse overlapping rows
+ beq end_func @ Exit when all rows are processed
+ b loop_4 @ Loop for the remaining rows
+end_func:
+ vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+ ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s
new file mode 100755
index 0000000..d45055e
--- /dev/null
+++ b/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s
@@ -0,0 +1,330 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_inter_pred_luma_vert_qpel_a9q.s
+@*
+@* @brief
+@* Contains function definitions for inter prediction vertical quarter pel interpolation.
+@*
+@* @author
+@* Mohit
+@*
+@* @par List of Functions:
+@*
+@* - ih264_inter_pred_luma_vert_qpel_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@
+
+@/**
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Quarter pel interprediction luma filter for vertical input
+@*
+@* @par Description:
+@* Applies a 6 tap vertical filter, then averages with the nearest half/full
+@* pel vertical sample to produce the quarter pel output. The output is clipped to 8 bits
+@* sec 8.4.2.2.1 titled "Luma sample interpolation process"
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pu1_tmp: temporary buffer: UNUSED in this function
+@*
+@* @param[in] dydx: x and y reference offset for qpel calculations.
+@* @returns
+@*
+@ @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@void ih264_inter_pred_luma_vert_qpel (
+@ UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ht,
+@ WORD32 wd,
+@ UWORD8* pu1_tmp,
+@ UWORD32 dydx)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r5 => ht
+@ r6 => wd
+@ r7 => dydx
+
+.text
+.p2align 2
+
+ .global ih264_inter_pred_luma_vert_qpel_a9q
+
+ih264_inter_pred_luma_vert_qpel_a9q:
+
+    stmfd sp!, {r4-r12, r14} @store register values to stack
+    vstmdb sp!, {d8-d15} @push neon registers to stack
+    ldr r5, [sp, #104] @Loads ht
+
+    ldr r6, [sp, #108] @Loads wd
+    ldr r7, [sp, #116] @Loads dydx
+    and r7, r7, #12 @Finds y-offset
+    lsr r7, r7, #3 @dydx>>3
+    mul r7, r2, r7 @(dydx>>3)*src_strd: 0 or 1 row offset for the qpel average
+    add r7, r0, r7 @pu1_src + (y_offset>>1)*src_strd : r7 points at rows averaged with the hpel output
+    vmov.u16 q11, #20 @ Filter coeff 0x14 into Q11
+    sub r0, r0, r2, lsl #1 @pu1_src-2*src_strd
+    subs r12, r6, #8 @if wd=8 branch to loop_8
+    vmov.u16 q12, #5 @ Filter coeff 0x5 into Q12
+    beq loop_8
+
+    subs r12, r6, #4 @if wd=4 branch to loop_4
+    beq loop_4
+
+loop_16: @when wd=16
+
+    @ 6-tap vertical filter (1,-5,20,20,-5,1) over rows 0..5 produces hpel row 0;
+    @ each subsequent output row reuses 5 of the previous context rows.
+    vld1.u32 {q0}, [r0], r2 @ Vector load from src[0_0]
+    vld1.u32 {q1}, [r0], r2 @ Vector load from src[1_0]
+    vld1.u32 {q2}, [r0], r2 @ Vector load from src[2_0]
+    vld1.u32 {q3}, [r0], r2 @ Vector load from src[3_0]
+    vld1.u32 {q4}, [r0], r2 @ Vector load from src[4_0]
+    vaddl.u8 q6, d4, d6 @ temp1 = src[2_0] + src[3_0]
+    vld1.u32 {q5}, [r0], r2 @ Vector load from src[5_0]
+    vaddl.u8 q7, d0, d10 @ temp = src[0_0] + src[5_0]
+    vaddl.u8 q8, d2, d8 @ temp2 = src[1_0] + src[4_0]
+    vmla.u16 q7, q6, q11 @ temp += temp1 * 20
+    vaddl.u8 q10, d1, d11 @ temp4 = src[0_8] + src[5_8]
+    vaddl.u8 q9, d5, d7 @ temp3 = src[2_8] + src[3_8]
+    vmla.u16 q10, q9, q11 @ temp4 += temp3 * 20
+    vld1.u32 {q0}, [r0], r2 @ Vector load from src[6_0] (q0 recycled as new context row)
+    vaddl.u8 q13, d3, d9 @ temp5 = src[1_8] + src[4_8]
+    vaddl.u8 q6, d6, d8
+    vmls.u16 q7, q8, q12 @ temp -= temp2 * 5
+    vaddl.u8 q8, d2, d0
+    vaddl.u8 q9, d4, d10
+    vmla.u16 q8, q6, q11
+    vmls.u16 q10, q13, q12 @ temp4 -= temp5 * 5
+    vaddl.u8 q13, d5, d11
+    vaddl.u8 q6, d7, d9
+    vqrshrun.s16 d30, q7, #5 @ dst[0_0] = CLIP_U8((temp +16) >> 5)
+    vaddl.u8 q7, d3, d1
+    vld1.u32 {q1}, [r0], r2 @ Vector load from src[7_0] (q1 recycled)
+    vmla.u16 q7, q6, q11
+    vmls.u16 q8, q9, q12
+    vqrshrun.s16 d31, q10, #5 @ dst[0_8] = CLIP_U8((temp4 +16) >> 5)
+    vld1.u32 {q10}, [r7], r2 @ Load for interpolation row 0
+    vrhadd.u8 q15, q10, q15 @ Interpolation to obtain qpel value
+    vaddl.u8 q9, d4, d2
+    vaddl.u8 q6, d8, d10
+
+    vst1.u32 {q15}, [r1], r3 @ Vector store to dst[0_0]
+    vmla.u16 q9, q6, q11
+    vaddl.u8 q10, d6, d0
+    vmls.u16 q7, q13, q12
+    vqrshrun.s16 d30, q8, #5
+    vaddl.u8 q6, d9, d11
+    vaddl.u8 q8, d5, d3
+    vaddl.u8 q13, d7, d1
+    vmla.u16 q8, q6, q11
+    vmls.u16 q9, q10, q12
+    vld1.u32 {q2}, [r0], r2 @ Vector load from src[8_0] (q2 recycled)
+
+    vqrshrun.s16 d31, q7, #5
+    vld1.u32 {q7}, [r7], r2 @ Load for interpolation row 1
+    vaddl.u8 q6, d10, d0
+    vrhadd.u8 q15, q7, q15 @ Interpolation to obtain qpel value
+    vaddl.u8 q7, d6, d4
+    vaddl.u8 q10, d8, d2
+    vmla.u16 q7, q6, q11
+    vmls.u16 q8, q13, q12
+    vst1.u32 {q15}, [r1], r3 @store row 1
+    vqrshrun.s16 d30, q9, #5
+    vaddl.u8 q9, d7, d5
+    vaddl.u8 q6, d11, d1
+    vmla.u16 q9, q6, q11
+    vaddl.u8 q13, d9, d3
+    vmls.u16 q7, q10, q12
+    vqrshrun.s16 d31, q8, #5
+    vld1.u32 {q8}, [r7], r2 @ Load for interpolation row 2
+    vmls.u16 q9, q13, q12
+    vrhadd.u8 q15, q8, q15 @ Interpolation to obtain qpel value
+    vaddl.u8 q6, d0, d2 @ temp1 = src[2_0] + src[3_0]
+    vst1.u32 {q15}, [r1], r3 @store row 2
+    vaddl.u8 q8, d10, d4 @ temp2 = src[1_0] + src[4_0]
+    vaddl.u8 q10, d9, d7 @ temp4 = src[0_8] + src[5_8]
+    vqrshrun.s16 d30, q7, #5
+    vaddl.u8 q13, d5, d11 @ temp5 = src[1_8] + src[4_8]
+    vaddl.u8 q7, d8, d6 @ temp = src[0_0] + src[5_0]
+    vqrshrun.s16 d31, q9, #5
+    vld1.u32 {q9}, [r7], r2 @ Load for interpolation row 3
+    vmla.u16 q7, q6, q11 @ temp += temp1 * 20
+    vrhadd.u8 q15, q9, q15 @ Interpolation to obtain qpel value
+    vaddl.u8 q9, d1, d3 @ temp3 = src[2_8] + src[3_8]
+    vst1.u32 {q15}, [r1], r3 @store row 3
+    subs r5, r5, #4 @ 4 rows processed, decrement by 4
+    subne r0, r0 , r2, lsl #2 @ rewind src by 5 rows (loaded 9, output 4) ...
+    subne r0, r0, r2 @ ... so next iteration re-reads the 5 context rows
+    beq end_func @ Branch if height==4
+
+    b loop_16 @ looping if height = 8 or 16
+
+
+loop_8:
+
+    @// Processing row0 and row1
+    vld1.u32 d0, [r0], r2 @ Vector load from src[0_0]
+    vld1.u32 d1, [r0], r2 @ Vector load from src[1_0]
+    vld1.u32 d2, [r0], r2 @ Vector load from src[2_0]
+    vld1.u32 d3, [r0], r2 @ Vector load from src[3_0]
+    vld1.u32 d4, [r0], r2 @ Vector load from src[4_0]
+    vld1.u32 d5, [r0], r2 @ Vector load from src[5_0]
+
+    vaddl.u8 q3, d2, d3 @ temp1 = src[2_0] + src[3_0]
+    vaddl.u8 q4, d0, d5 @ temp = src[0_0] + src[5_0]
+    vaddl.u8 q5, d1, d4 @ temp2 = src[1_0] + src[4_0]
+    vmla.u16 q4, q3, q11 @ temp += temp1 * 20
+    vld1.u32 d6, [r0], r2 @ Vector load from src[6_0]
+    vaddl.u8 q7, d3, d4
+    vaddl.u8 q8, d1, d6
+    vaddl.u8 q9, d2, d5
+    vmls.u16 q4, q5, q12 @ temp -= temp2 * 5
+    vmla.u16 q8, q7, q11
+    vld1.u32 d7, [r0], r2 @ Vector load from src[7_0]
+    vaddl.u8 q10, d4, d5
+    vaddl.u8 q6, d2, d7
+    vaddl.u8 q5, d3, d6
+    vmls.u16 q8, q9, q12
+    vqrshrun.s16 d26, q4, #5 @ dst[0_0] = CLIP_U8( (temp + 16) >> 5)
+    vmla.u16 q6, q10, q11
+    vld1.32 d8, [r7], r2 @Load value for interpolation (row0)
+    vld1.32 d9, [r7], r2 @Load value for interpolation (row1)
+    vld1.u32 d0, [r0], r2 @ Vector load from src[8_0] (d0 recycled)
+    vaddl.u8 q7, d5, d6
+    vqrshrun.s16 d27, q8, #5
+    vrhadd.u8 q13, q4, q13 @ Interpolation step for qpel calculation
+    vaddl.u8 q10, d3, d0
+    vmls.u16 q6, q5, q12
+    vst1.u32 d26, [r1], r3 @ Vector store to dst[0_0]
+    vaddl.u8 q9, d4, d7
+    vmla.u16 q10, q7, q11
+    vst1.u32 d27, [r1], r3 @ Vector store to dst[1_0]
+    vqrshrun.s16 d28, q6, #5
+    vmls.u16 q10, q9, q12
+    vld1.32 d12, [r7], r2 @Load value for interpolation (row2)
+    vld1.32 d13, [r7], r2 @Load value for interpolation (row3)
+    vqrshrun.s16 d29, q10, #5
+    subs r9, r5, #4 @ NOTE(review): r9 appears unused and flags are overwritten below - confirm dead
+    vrhadd.u8 q14, q6, q14
+    vst1.u32 d28, [r1], r3 @store row 2
+    vst1.u32 d29, [r1], r3 @store row 3
+
+    subs r5, r5, #4 @ 4 rows processed, decrement by 4
+    subne r0, r0 , r2, lsl #2 @ rewind src by 5 rows to keep filter context
+    subne r0, r0, r2
+    beq end_func @ Branch if height==4
+    b loop_8 @looping if height == 8 or 16
+
+loop_4:
+@// Processing row0 and row1
+
+    vld1.u32 d0[0], [r0], r2 @ Vector load from src[0_0]
+    vld1.u32 d1[0], [r0], r2 @ Vector load from src[1_0]
+    vld1.u32 d2[0], [r0], r2 @ Vector load from src[2_0]
+    vld1.u32 d3[0], [r0], r2 @ Vector load from src[3_0]
+    vld1.u32 d4[0], [r0], r2 @ Vector load from src[4_0]
+    vld1.u32 d5[0], [r0], r2 @ Vector load from src[5_0]
+
+    vaddl.u8 q3, d2, d3 @ temp1 = src[2_0] + src[3_0]
+    vaddl.u8 q4, d0, d5 @ temp = src[0_0] + src[5_0]
+    vaddl.u8 q5, d1, d4 @ temp2 = src[1_0] + src[4_0]
+    vmla.u16 q4, q3, q11 @ temp += temp1 * 20
+    vld1.u32 d6, [r0], r2 @ Vector load from src[6_0]
+    vaddl.u8 q7, d3, d4
+    vaddl.u8 q8, d1, d6
+    vaddl.u8 q9, d2, d5
+    vmls.u16 q4, q5, q12 @ temp -= temp2 * 5
+    vld1.u32 d7[0], [r0], r2 @ Vector load from src[7_0]
+    vmla.u16 q8, q7, q11
+    vaddl.u8 q10, d4, d5
+    vaddl.u8 q6, d2, d7
+    vaddl.u8 q5, d3, d6
+    vmls.u16 q8, q9, q12
+    vqrshrun.s16 d26, q4, #5 @ dst[0_0] = CLIP_U8( (temp + 16) >> 5)
+    vld1.u32 d8[0], [r7], r2 @Load value for interpolation - row 0
+    vld1.u32 d9[0], [r7], r2 @Load value for interpolation - row 1
+    vmla.u16 q6, q10, q11
+    vld1.u32 d0[0], [r0], r2 @ Vector load from src[8_0] (d0 recycled)
+    vaddl.u8 q7, d5, d6
+    vqrshrun.s16 d27, q8, #5
+    vaddl.u8 q10, d3, d0
+    vrhadd.u8 q13, q13, q4 @Interpolation step for qpel calculation
+    vmls.u16 q6, q5, q12
+    vst1.u32 d26[0], [r1], r3 @ Vector store to dst[0_0]
+    vaddl.u8 q9, d4, d7
+    vmla.u16 q10, q7, q11
+    vst1.u32 d27[0], [r1], r3 @ store row 1
+    vqrshrun.s16 d28, q6, #5
+    vld1.u32 d12[0], [r7], r2 @Load value for interpolation - row 2
+    vld1.u32 d13[0], [r7], r2 @Load value for interpolation - row 3
+
+    vmls.u16 q10, q9, q12
+    vqrshrun.s16 d29, q10, #5
+    vrhadd.u8 q14, q6, q14 @Interpolation step for qpel calculation
+    vst1.u32 d28[0], [r1], r3 @store row 2
+    vst1.u32 d29[0], [r1], r3 @store row 3
+
+    subs r5, r5, #8 @ reaches zero only when ht==8: one extra pass of 4 rows
+    subeq r0, r0, r2, lsl #2 @ rewind src by 5 rows for the second pass
+    subeq r0, r0, r2
+    beq loop_4 @ Loop if height==8
+
+end_func:
+    vldmia sp!, {d8-d15} @ Restore neon registers that were saved
+    ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_intra_pred_chroma_a9q.s b/common/arm/ih264_intra_pred_chroma_a9q.s
new file mode 100755
index 0000000..d03fc55
--- /dev/null
+++ b/common/arm/ih264_intra_pred_chroma_a9q.s
@@ -0,0 +1,551 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_intra_pred_chroma_a9q.s
+@*
+@* @brief
+@* Contains function definitions for intra chroma prediction .
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* - ih264_intra_pred_chroma_mode_horz_a9q()
+@* - ih264_intra_pred_chroma_8x8_mode_vert_a9q()
+@* - ih264_intra_pred_chroma_mode_dc_a9q()
+@* - ih264_intra_pred_chroma_mode_plane_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_chroma_intra_pred_filters.c
+@
+
+@/**
+@/**
+@/**
+@
+.text
+.p2align 2
+
+ .extern ih264_gai1_intrapred_chroma_plane_coeffs1
+.hidden ih264_gai1_intrapred_chroma_plane_coeffs1
+ .extern ih264_gai1_intrapred_chroma_plane_coeffs2
+.hidden ih264_gai1_intrapred_chroma_plane_coeffs2
+scratch_chroma_intrapred_addr1:
+ .long ih264_gai1_intrapred_chroma_plane_coeffs1 - scrlblc1 - 8
+
+scratch_intrapred_chroma_plane_addr1:
+ .long ih264_gai1_intrapred_chroma_plane_coeffs2 - scrlblc2 - 8
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_chroma_8x8_mode_dc
+@*
+@* @brief
+@* Perform Intra prediction for chroma_8x8 mode:DC
+@*
+@* @par Description:
+@* Perform Intra prediction for chroma_8x8 mode:DC ,described in sec 8.3.4.1
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source containing alternate U and V samples
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination with alternate U and V samples
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@** @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+
+ .global ih264_intra_pred_chroma_8x8_mode_dc_a9q
+
+ih264_intra_pred_chroma_8x8_mode_dc_a9q:
+
+    stmfd sp!, {r4, r14} @store register values to stack
+    ldr r4, [sp, #8] @r4 => ui_neighboravailability
+    vpush {d8-d15}
+
+    ands r2, r4, #0x01 @CHECKING IF LEFT_AVAILABLE ELSE BRANCHING TO ONLY TOP AVAILABLE
+    beq top_available
+    ands r2, r4, #0x04 @CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
+    beq left_available
+
+    vld1.u8 {q0}, [r0] @BOTH LEFT AND TOP AVAILABLE: q0 = 16 left UV bytes
+    add r0, r0, #18 @ skip 16 left bytes + 2-byte corner to reach top UV samples (assumes caller layout - TODO confirm)
+    vld1.u8 {q1}, [r0] @ q1 = 16 top UV bytes
+    vaddl.u8 q2, d1, d2 @ widen+add halves feeding the per-quadrant DC sums
+    vaddl.u8 q3, d0, d3
+    vmovl.u8 q1, d3
+    vmovl.u8 q0, d0
+
+    vadd.u16 d12, d4, d5
+    vadd.u16 d13, d2, d3
+    vadd.u16 d15, d6, d7
+    vadd.u16 d14, d0, d1
+
+    vpadd.u32 d12, d12, d15 @ horizontal reduction to per-quadrant sums
+    vpadd.u32 d14, d13, d14
+    vqrshrun.s16 d12, q6, #3 @ quadrants with both borders: (sum + 4) >> 3
+    vqrshrun.s16 d14, q7, #2 @ quadrants with one border: (sum + 2) >> 2
+    vdup.u16 d8, d12[0] @ replicate each quadrant's UV DC across its 4x4 area
+    vdup.u16 d9, d14[0]
+    vdup.u16 d10, d14[1]
+    vdup.u16 d11, d12[1]
+    b str_pred
+
+top_available: @ONLY TOP AVAILABLE
+    ands r2, r4, #0x04 @CHECKING TOP AVAILABILITY OR ELSE BRANCH TO NONE AVAILABLE
+    beq none_available
+
+    add r0, r0, #18 @ advance to top neighbour UV samples
+    vld1.u8 {q0}, [r0]
+    vmovl.u8 q1, d0
+    vmovl.u8 q2, d1
+    vadd.u16 d0, d2, d3
+    vadd.u16 d1, d4, d5
+    vpaddl.u32 q0, q0
+    vqrshrun.s16 d0, q0, #2 @ DC = (sum of 4 UV pairs + 2) >> 2 per half
+    vdup.u16 d8, d0[0]
+    vdup.u16 d9, d0[2]
+    vmov q5, q4 @ same prediction for lower 4 rows
+    b str_pred
+
+left_available: @ONLY LEFT AVAILABLE
+    vld1.u8 {q0}, [r0] @ left neighbour UV samples at pu1_src[0..15]
+    vmovl.u8 q1, d0
+    vmovl.u8 q2, d1
+    vadd.u16 d0, d2, d3
+    vadd.u16 d1, d4, d5
+    vpaddl.u32 q0, q0
+    vqrshrun.s16 d0, q0, #2
+    vdup.u16 q5, d0[0] @ lower half DC from first 8 left bytes (left stored bottom-up - TODO confirm)
+    vdup.u16 q4, d0[2] @ upper half DC from last 8 left bytes
+    b str_pred
+
+none_available: @NONE AVAILABLE: predict mid-grey
+    vmov.u8 q4, #128
+    vmov.u8 q5, #128
+
+str_pred: @ q4 = rows 0-3, q5 = rows 4-7
+    vst1.8 {q4}, [r1], r3
+    vst1.8 {q4}, [r1], r3
+    vst1.8 {q4}, [r1], r3
+    vst1.8 {q4}, [r1], r3
+    vst1.8 {q5}, [r1], r3
+    vst1.8 {q5}, [r1], r3
+    vst1.8 {q5}, [r1], r3
+    vst1.8 {q5}, [r1], r3
+
+    vpop {d8-d15}
+    ldmfd sp!, {r4, pc} @Restoring registers from stack
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_chroma_8x8_mode_horz
+@*
+@* @brief
+@* Perform Intra prediction for chroma_8x8 mode:Horizontal
+@*
+@* @par Description:
+@* Perform Intra prediction for chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source containing alternate U and V samples
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination with alternate U and V samples
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ih264_intra_pred_chroma_8x8_mode_horz(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_chroma_8x8_mode_horz_a9q
+
+ih264_intra_pred_chroma_8x8_mode_horz_a9q:
+
+    stmfd sp!, {r14} @store register values to stack
+
+    vld1.u8 {q0}, [r0] @ q0 = 8 left-neighbour UV pairs (16 bytes)
+    mov r2, #6 @ 6 more rows emitted inside the loop (2 per iteration)
+
+    vdup.u16 q1, d1[3] @ replicate one left UV pair across a full row
+    vdup.u16 q2, d1[2] @ replicate the next left UV pair
+    vst1.8 {q1}, [r1], r3
+
+loop_8x8_horz:
+    vext.8 q0, q0, q0, #12 @ rotate 12 bytes so the next two UV pairs land in d1[2]/d1[3]
+    vst1.8 {q2}, [r1], r3
+    vdup.u16 q1, d1[3]
+    subs r2, #2 @ two rows per iteration
+    vdup.u16 q2, d1[2]
+    vst1.8 {q1}, [r1], r3
+    bne loop_8x8_horz
+
+    vext.8 q0, q0, q0, #12
+    vst1.8 {q2}, [r1], r3 @ final (8th) row
+
+    ldmfd sp!, {pc} @restoring registers from stack
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_chroma_8x8_mode_vert
+@*
+@* @brief
+@* Perform Intra prediction for chroma_8x8 mode:vertical
+@*
+@* @par Description:
+@*Perform Intra prediction for chroma_8x8 mode:vertical ,described in sec 8.3.4.3
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source containing alternate U and V samples
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination with alternate U and V samples
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@void ih264_intra_pred_chroma_8x8_mode_vert(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_chroma_8x8_mode_vert_a9q
+
+ih264_intra_pred_chroma_8x8_mode_vert_a9q:
+
+    stmfd sp!, {r4-r12, r14} @store register values to stack
+
+    add r0, r0, #18 @ skip 16 left bytes + 2-byte corner to reach top UV samples (assumes caller layout - TODO confirm)
+    vld1.8 {q0}, [r0] @ q0 = 16 top UV bytes
+
+    vst1.8 {q0}, [r1], r3 @ copy the top row into all 8 prediction rows
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+
+    ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_chroma_8x8_mode_plane
+@*
+@* @brief
+@* Perform Intra prediction for chroma_8x8 mode:PLANE
+@*
+@* @par Description:
+@* Perform Intra prediction for chroma_8x8 mode:PLANE ,described in sec 8.3.4.4
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source containing alternate U and V samples
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination with alternate U and V samples
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_chroma_8x8_mode_plane(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+ .global ih264_intra_pred_chroma_8x8_mode_plane_a9q
+ih264_intra_pred_chroma_8x8_mode_plane_a9q:
+
+    stmfd sp!, {r4-r10, r12, lr}
+    vpush {d8-d15}
+
+
+    vld1.32 d0, [r0] @ gather left/top border UV samples for the plane gradients
+    add r10, r0, #10
+    vld1.32 d1, [r10]
+    add r10, r10, #6
+    vrev64.16 d5, d0 @ reverse UV pair order to pair samples symmetric about the centre
+    vld1.32 d2, [r10]!
+    add r10, r10, #2
+    vrev64.16 d7, d2
+    vld1.32 d3, [r10]
+    sub r5, r3, #8 @ NOTE(review): r5 appears unused after this - confirm dead
+    ldr r12, scratch_chroma_intrapred_addr1 @ PIC: literal holds coeff-table offset relative to pc at scrlblc1
+scrlblc1:
+    add r12, r12, pc
+    vsubl.u8 q5, d5, d1 @ paired differences feeding H (horizontal gradient)
+    vld1.64 {q4}, [r12] @ Load multiplication factors 1 to 4 into q4
+    vsubl.u8 q6, d3, d7 @ paired differences feeding V (vertical gradient)
+    vmul.s16 q7, q5, q4
+    vmul.s16 q8, q6, q4
+    vuzp.16 q7, q8 @ de-interleave U and V weighted terms
+
+    vpadd.s16 d14, d14 @ horizontal reduction of weighted sums
+    vpadd.s16 d15, d15
+    vpadd.s16 d16, d16
+    vpadd.s16 d17, d17
+    vpadd.s16 d14, d14
+    vpadd.s16 d15, d15
+    vpadd.s16 d16, d16
+    vpadd.s16 d17, d17
+
+    mov r6, #34 @ plane scaling constant (34 = 2*17, per sec 8.3.4.4 b/c derivation)
+    vdup.16 q9, r6
+
+    vmull.s16 q11, d14, d18
+    vmull.s16 q12, d15, d18
+    vmull.s16 q13, d16, d18
+    vmull.s16 q14, d17, d18
+
+    vrshrn.s32 d10, q11, #6 @ b/c slopes: (sum*34 + 32) >> 6
+    vrshrn.s32 d12, q12, #6
+    vrshrn.s32 d13, q13, #6
+    vrshrn.s32 d14, q14, #6
+
+
+    ldrb r6, [r0], #1 @ corner samples for the DC (a) term
+    add r10, r0, #31
+    ldrb r8, [r0], #1
+    ldrb r7, [r10], #1
+    ldrb r9, [r10], #1
+
+    add r6, r6, r7
+    add r8, r8, r9
+    lsl r6, r6, #4 @ a = 16 * (left + top corner sample)
+    lsl r8, r8, #4
+
+    vdup.16 q0, r6
+    vdup.16 q1, r8
+    vdup.16 q2, d12[0]
+    vdup.16 q3, d10[0]
+
+    vdup.16 q12, d14[0]
+    vdup.16 q13, d13[0]
+    vzip.16 q2, q12 @ interleave U/V slope terms back into UV order
+    vzip.16 q3, q13
+    vzip.16 q0, q1
+
+    ldr r12, scratch_intrapred_chroma_plane_addr1 @ PIC: offset of x-position coeffs relative to pc at scrlblc2
+scrlblc2:
+    add r12, r12, pc
+    vld1.64 {q4}, [r12]
+    vmov.16 q5, q4
+    vmov q11, q4
+    vzip.16 q4, q5 @ duplicate x offsets for interleaved U/V lanes
+
+    vmul.s16 q6, q2, q4 @ q6/q8 = a + b*(x-3) for left/right half of each row
+    vmul.s16 q8, q2, q5
+    vadd.s16 q6, q0, q6
+    vadd.s16 q8, q0, q8
+
+
+    @ Per row y: prediction = CLIP_U8((a + b*(x-3) + c*(y-3) + 16) >> 5); two rows in flight.
+    vdup.16 q10, d22[0]
+    vmul.s16 q2, q3, q10
+    vdup.16 q15, d22[1]
+    vmul.s16 q9, q3, q10
+    vmul.s16 q7, q3, q15
+    vmul.s16 q4, q3, q15
+    vadd.s16 q12, q6, q2
+    vadd.s16 q0, q8, q9
+    vadd.s16 q1, q6, q7
+    vqrshrun.s16 d28, q12, #5
+    vadd.s16 q13, q8, q4
+    vqrshrun.s16 d29, q0, #5
+    vdup.16 q10, d22[2]
+    vst1.8 {q14}, [r1], r3 @ row 0
+    vqrshrun.s16 d28, q1, #5
+    vqrshrun.s16 d29, q13, #5
+    vmul.s16 q2, q3, q10
+    vmul.s16 q9, q3, q10
+    vst1.8 {q14}, [r1], r3 @ row 1
+    vadd.s16 q12, q6, q2
+    vadd.s16 q0, q8, q9
+    vdup.16 q15, d22[3]
+    vqrshrun.s16 d28, q12, #5
+    vqrshrun.s16 d29, q0, #5
+    vmul.s16 q7, q3, q15
+    vmul.s16 q4, q3, q15
+    vst1.8 {q14}, [r1], r3 @ row 2
+    vadd.s16 q1, q6, q7
+    vadd.s16 q13, q8, q4
+    vdup.16 q10, d23[0]
+    vqrshrun.s16 d28, q1, #5
+    vqrshrun.s16 d29, q13, #5
+    vmul.s16 q2, q3, q10
+    vmul.s16 q9, q3, q10
+    vst1.8 {q14}, [r1], r3 @ row 3
+    vadd.s16 q12, q6, q2
+    vadd.s16 q0, q8, q9
+    vdup.16 q15, d23[1]
+    vqrshrun.s16 d28, q12, #5
+    vqrshrun.s16 d29, q0, #5
+    vmul.s16 q7, q3, q15
+    vmul.s16 q4, q3, q15
+    vst1.8 {q14}, [r1], r3 @ row 4
+    vadd.s16 q1, q6, q7
+    vadd.s16 q13, q8, q4
+    vdup.16 q10, d23[2]
+    vqrshrun.s16 d28, q1, #5
+    vqrshrun.s16 d29, q13, #5
+    vmul.s16 q2, q3, q10
+    vmul.s16 q9, q3, q10
+    vst1.8 {q14}, [r1], r3 @ row 5
+    vadd.s16 q12, q6, q2
+    vadd.s16 q0, q8, q9
+    vdup.16 q15, d23[3]
+    vqrshrun.s16 d28, q12, #5
+    vqrshrun.s16 d29, q0, #5
+    vmul.s16 q7, q3, q15
+    vmul.s16 q4, q3, q15
+    vst1.8 {q14}, [r1], r3 @ row 6
+    vadd.s16 q1, q6, q7
+    vadd.s16 q13, q8, q4
+    vqrshrun.s16 d28, q1, #5
+    vqrshrun.s16 d29, q13, #5
+    vst1.8 {q14}, [r1], r3 @ row 7
+
+
+
+end_func_plane:
+
+
+    vpop {d8-d15}
+    ldmfd sp!, {r4-r10, r12, pc}
+
+
+
+
diff --git a/common/arm/ih264_intra_pred_luma_16x16_a9q.s b/common/arm/ih264_intra_pred_luma_16x16_a9q.s
new file mode 100755
index 0000000..e38e203
--- /dev/null
+++ b/common/arm/ih264_intra_pred_luma_16x16_a9q.s
@@ -0,0 +1,520 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_intra_pred_luma_16x16_a9q.s
+@*
+@* @brief
+@* Contains function definitions for intra 16x16 Luma prediction .
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* - ih264_intra_pred_luma_16x16_mode_vert_a9q()
+@* - ih264_intra_pred_luma_16x16_mode_horz_a9q()
+@* - ih264_intra_pred_luma_16x16_mode_dc_a9q()
+@* - ih264_intra_pred_luma_16x16_mode_plane_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_intra_pred_filters.c
+@
+
+@/**
+@/**
+@/**
+@
+
+.text
+.p2align 2
+
+
+ .extern ih264_gai1_intrapred_luma_plane_coeffs
+.hidden ih264_gai1_intrapred_luma_plane_coeffs
+scratch_intrapred_addr1:
+ .long ih264_gai1_intrapred_luma_plane_coeffs - scrlbl1 - 8
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_16x16_mode_vert_a9q
+@*
+@* @brief
+@* Perform Intra prediction for luma_16x16 mode:vertical
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_16x16 mode:Vertical ,described in sec 8.3.3.1
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@void ih264_intra_pred_luma_16x16_mode_vert(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_16x16_mode_vert_a9q
+
+ih264_intra_pred_luma_16x16_mode_vert_a9q:
+
+    stmfd sp!, {r4-r12, r14} @store register values to stack
+
+    add r0, r0, #17 @ skip 16 left samples + 1 corner byte to reach top row (assumes caller layout - TODO confirm)
+    vld1.8 {q0}, [r0] @ q0 = 16 top neighbour samples
+
+    vst1.8 {q0}, [r1], r3 @ copy the top row into all 16 prediction rows
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+
+    ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
+
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_16x16_mode_horz_a9q
+@*
+@* @brief
+@* Perform Intra prediction for luma_16x16 mode:horizontal
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_16x16 mode:horizontal ,described in sec 8.3.3.2
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ih264_intra_pred_luma_16x16_mode_horz(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+ .global ih264_intra_pred_luma_16x16_mode_horz_a9q
+
+ih264_intra_pred_luma_16x16_mode_horz_a9q:
+
+    stmfd sp!, {r14} @store register values to stack
+
+    vld1.u8 {q0}, [r0] @ q0 = 16 left neighbour samples
+    mov r2, #14 @ 14 more rows emitted inside the loop (2 per iteration)
+
+    vdup.u8 q1, d1[7] @ replicate one left sample across a full row
+    vdup.u8 q2, d1[6] @ replicate the next left sample
+    vst1.8 {q1}, [r1], r3
+
+loop_16x16_horz:
+    vext.8 q0, q0, q0, #14 @ rotate 14 bytes so the next two samples land in d1[6]/d1[7]
+    vst1.8 {q2}, [r1], r3
+    vdup.u8 q1, d1[7]
+    subs r2, #2 @ two rows per iteration
+    vdup.u8 q2, d1[6]
+    vst1.8 {q1}, [r1], r3
+    bne loop_16x16_horz
+
+    vext.8 q0, q0, q0, #14
+    vst1.8 {q2}, [r1], r3 @ final (16th) row
+
+    ldmfd sp!, {pc} @Restoring registers from stack
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_16x16_mode_dc_a9q
+@*
+@* @brief
+@* Perform Intra prediction for luma_16x16 mode:DC
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_16x16 mode:DC ,described in sec 8.3.3.3
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_16x16_mode_dc(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+ .global ih264_intra_pred_luma_16x16_mode_dc_a9q
+
+ih264_intra_pred_luma_16x16_mode_dc_a9q:
+
+    stmfd sp!, {r4, r14} @store register values to stack
+    ldr r4, [sp, #8] @r4 => ui_neighboravailability
+
+    ands r2, r4, #0x01 @CHECKING IF LEFT_AVAILABLE ELSE BRANCHING TO ONLY TOP AVAILABLE
+    beq top_available
+    ands r2, r4, #0x04 @CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
+    beq left_available
+
+    vld1.u8 {q0}, [r0] @BOTH LEFT AND TOP AVAILABLE: q0 = 16 left samples
+    add r0, r0, #17 @ skip 16 left samples + 1 corner byte to reach top row
+    vpaddl.u8 q0, q0
+    vld1.u8 {q1}, [r0] @ q1 = 16 top samples
+    vpaddl.u8 q1, q1
+    vadd.u16 q0, q0, q1
+    vadd.u16 d0, d0, d1
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0 @ d0[0] = sum of all 32 neighbours
+    vqrshrun.s16 d0, q0, #5 @ DC = (sum + 16) >> 5
+    vdup.u8 q0, d0[0]
+    b str_pred
+
+top_available: @ONLY TOP AVAILABLE
+    ands r2, r4, #0x04 @CHECKING TOP AVAILABILITY OR ELSE BRANCH TO NONE AVAILABLE
+    beq none_available
+
+    add r0, r0, #17 @ advance to top neighbour samples
+    vld1.u8 {q0}, [r0]
+    vpaddl.u8 q0, q0
+    vadd.u16 d0, d0, d1
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0
+    vqrshrun.s16 d0, q0, #4 @ DC = (sum of 16 + 8) >> 4
+    vdup.u8 q0, d0[0]
+    b str_pred
+
+left_available: @ONLY LEFT AVAILABLE
+    vld1.u8 {q0}, [r0]
+    vpaddl.u8 q0, q0
+    vadd.u16 d0, d0, d1
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0
+    vqrshrun.s16 d0, q0, #4 @ DC = (sum of 16 + 8) >> 4
+    vdup.u8 q0, d0[0]
+    b str_pred
+
+none_available: @NONE AVAILABLE: predict mid-grey
+    vmov.u8 q0, #128
+
+str_pred: @ broadcast the DC value to all 16 rows
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+    vst1.8 {q0}, [r1], r3
+
+    ldmfd sp!, {r4, pc} @Restoring registers from stack
+
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_16x16_mode_plane_a9q
+@*
+@* @brief
+@* Perform Intra prediction for luma_16x16 mode:PLANE
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_16x16 mode:PLANE ,described in sec 8.3.3.4
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_16x16_mode_plane(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+    .global ih264_intra_pred_luma_16x16_mode_plane_a9q
+ih264_intra_pred_luma_16x16_mode_plane_a9q:
+
+    stmfd sp!, {r4-r10, r12, lr}
+
+    mov r2, r1                    @r2 => pu1_dst (r1 is reused as a source pointer below)
+    add r1, r0, #17
+    add r0, r0, #15               @r0 => last left neighbour in the source array
+
+    mov r8, #9
+    sub r1, r1, #1                @r1 => top-left neighbour (pu1_src + 16)
+    mov r10, r1                   @top_left
+    mov r4, #-1                   @step for walking the left neighbours downwards
+    vld1.32 d2, [r1], r8          @load 8 bytes from top-left, then advance r1 to top row + 1
+    ldr r7, scratch_intrapred_addr1 @PC-relative offset of the {1..8} multiplier table
+scrlbl1:
+    add r7, r7, pc
+
+    vld1.32 d0, [r1]              @upper half of the top row
+    vrev64.8 d2, d2               @reverse so lanes pair p[x,-1] with p[-x,-1] style mirror samples
+    vld1.32 {q3}, [r7]            @multiplier table for the H/V gradient accumulation
+    vsubl.u8 q0, d0, d2           @mirrored differences along the top row (for i_h)
+    vmovl.u8 q8, d6
+    vmul.s16 q0, q0, q8           @weight differences by the table entries
+    vmovl.u8 q9, d7
+
+    add r7, r0, r4, lsl #3        @r7 walks the lower half of the left column (downwards)
+    sub r0, r7, r4, lsl #1        @r0 walks the upper half (mirror partner)
+    rsb lr, r4, #0x0              @lr = +1, opposite step to r4
+
+    vpadd.s16 d0, d0, d1
+
+    ldrb r8, [r7], r4
+    ldrb r9, [r0], lr
+
+    vpaddl.s16 d0, d0
+    sub r12, r8, r9               @r12 accumulates the weighted vertical differences
+
+    ldrb r8, [r7], r4
+
+    vpaddl.s32 d0, d0             @d0[0] = sum of weighted horizontal differences (i_h)
+    ldrb r9, [r0], lr
+    sub r8, r8, r9
+    vshl.s32 d2, d0, #2
+    add r12, r12, r8, lsl #1      @weight 2
+
+    vadd.s32 d0, d0, d2           @i_h * 5
+    ldrb r8, [r7], r4
+    ldrb r9, [r0], lr
+    vrshr.s32 d0, d0, #6          @ i_b = D0[0]  ((5 * i_h + 32) >> 6)
+    sub r8, r8, r9
+    ldrb r5, [r7], r4
+    add r8, r8, r8, lsl #1        @weight 3
+
+    vdup.16 q2, d0[0]             @broadcast i_b
+    add r12, r12, r8
+    ldrb r9, [r0], lr
+    vmul.s16 q0, q2, q8           @i_b * {column weights}, left half
+    sub r5, r5, r9
+    vmul.s16 q1, q2, q9           @i_b * {column weights}, right half
+    add r12, r12, r5, lsl #2      @weight 4
+
+    ldrb r8, [r7], r4
+    ldrb r9, [r0], lr
+    sub r8, r8, r9
+    ldrb r5, [r7], r4
+    add r8, r8, r8, lsl #2        @weight 5
+    ldrb r6, [r0], lr
+    add r12, r12, r8
+    ldrb r8, [r7], r4
+    ldrb r9, [r0], lr
+
+    sub r5, r5, r6
+    sub r8, r8, r9
+    add r5, r5, r5, lsl #1
+    rsb r8, r8, r8, lsl #3        @weight 7
+    add r12, r12, r5, lsl #1      @weight 6
+    ldrb r5, [r7], r4             @p[-1, 15] (bottom-most left neighbour)
+    ldrb r6, [r10]                @top_left
+    add r12, r12, r8
+    sub r9, r5, r6
+    ldrb r6, [r1, #7]             @p[15, -1] (right-most top neighbour)
+    add r12, r12, r9, lsl #3      @ i_c = r12  (weight 8 term added)
+    add r8, r5, r6
+
+    add r12, r12, r12, lsl #2     @i_v * 5
+    lsl r8, r8, #4                @ i_a = r8 = 16 * (p[-1,15] + p[15,-1])
+
+    add r12, r12, #0x20
+    lsr r12, r12, #6              @i_c = (5 * i_v + 32) >> 6
+
+    vshl.s16 q14, q2, #3          @8 * i_b
+    vdup.16 q3, r12               @broadcast i_c (per-row increment)
+
+    vdup.16 q15, r8               @broadcast i_a
+    vshl.s16 q13, q3, #3          @8 * i_c
+    vsub.s16 q15, q15, q14
+    vsub.s16 q15, q15, q13        @i_a - 8*i_b - 8*i_c
+    vadd.s16 q14, q15, q3         @base value for row 0
+
+    mov r0, #14                   @14 middle rows handled in the loop; first/last done outside
+    vadd.s16 q13, q14, q0         @row accumulator, columns 0..7
+    vadd.s16 q14, q14, q1         @row accumulator, columns 8..15
+    vqrshrun.s16 d20, q13, #5     @clip1((val + 16) >> 5)
+    vqrshrun.s16 d21, q14, #5
+
+loop_16x16_plane:
+
+    vadd.s16 q13, q13, q3         @advance one row: += i_c
+    vadd.s16 q14, q14, q3
+    vqrshrun.s16 d22, q13, #5
+    vst1.32 {q10}, [r2], r3
+    vqrshrun.s16 d23, q14, #5
+
+    vadd.s16 q13, q13, q3
+    subs r0, #2                   @two rows per iteration
+    vadd.s16 q14, q14, q3
+    vqrshrun.s16 d20, q13, #5
+    vst1.32 {q11}, [r2], r3
+    vqrshrun.s16 d21, q14, #5
+    bne loop_16x16_plane
+
+    vadd.s16 q13, q13, q3         @epilogue: final two rows
+    vadd.s16 q14, q14, q3
+    vqrshrun.s16 d22, q13, #5
+    vst1.32 {q10}, [r2], r3
+    vqrshrun.s16 d23, q14, #5
+    vst1.32 {q11}, [r2], r3
+
+    ldmfd sp!, {r4-r10, r12, pc}
+
+
+
diff --git a/common/arm/ih264_intra_pred_luma_4x4_a9q.s b/common/arm/ih264_intra_pred_luma_4x4_a9q.s
new file mode 100755
index 0000000..cb386ea
--- /dev/null
+++ b/common/arm/ih264_intra_pred_luma_4x4_a9q.s
@@ -0,0 +1,842 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_intra_pred_luma_4x4_a9q.s
+@*
+@* @brief
+@* Contains function definitions for intra 4x4 Luma prediction .
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* -ih264_intra_pred_luma_4x4_mode_vert_a9q
+@* -ih264_intra_pred_luma_4x4_mode_horz_a9q
+@* -ih264_intra_pred_luma_4x4_mode_dc_a9q
+@* -ih264_intra_pred_luma_4x4_mode_diag_dl_a9q
+@* -ih264_intra_pred_luma_4x4_mode_diag_dr_a9q
+@* -ih264_intra_pred_luma_4x4_mode_vert_r_a9q
+@* -ih264_intra_pred_luma_4x4_mode_horz_d_a9q
+@* -ih264_intra_pred_luma_4x4_mode_vert_l_a9q
+@* -ih264_intra_pred_luma_4x4_mode_horz_u_a9q
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_intra_pred_filters.c
+@
+
+@/**
+@/**
+@/**
+@
+
+.text
+.p2align 2
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_vert
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:vertical
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:vertical ,described in sec 8.3.1.2.1
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@void ih264_intra_pred_luma_4x4_mode_vert(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+    .global ih264_intra_pred_luma_4x4_mode_vert_a9q
+
+ih264_intra_pred_luma_4x4_mode_vert_a9q:
+
+
+
+    stmfd sp!, {r4-r12, r14}      @store register values to stack
+
+    add r0, r0, #5                @point r0 at the 4 top neighbours (past 4 left + 1 top-left)
+
+    vld1.32 d0[0], [r0]           @load the 4 top neighbour pixels
+
+    vst1.32 d0[0], [r1], r3       @copy the top row into each of the 4 destination rows
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+
+
+
+    ldmfd sp!, {r4-r12, pc}       @Restoring registers from stack
+
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_horz
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:horizontal
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:horizontal ,described in sec 8.3.1.2.2
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ih264_intra_pred_luma_4x4_mode_horz(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+
+    .global ih264_intra_pred_luma_4x4_mode_horz_a9q
+
+ih264_intra_pred_luma_4x4_mode_horz_a9q:
+
+
+
+    stmfd sp!, {r4-r12, r14}      @store register values to stack
+    add r0, r0, #3                @point r0 at the left neighbour of row 0
+    mov r2 , #-1                  @walk the left neighbours downwards (they are stored reversed)
+
+    ldrb r5, [r0], r2             @left neighbour of row 0
+    vdup.u8 d0, r5                @replicate it across the row
+    ldrb r6, [r0], r2             @left neighbour of row 1
+    vst1.32 d0[0], [r1], r3
+    vdup.u8 d1, r6
+    ldrb r7, [r0], r2             @left neighbour of row 2
+    vst1.32 d1[0], [r1], r3
+    vdup.u8 d2, r7
+    ldrb r8, [r0], r2             @left neighbour of row 3
+    vst1.32 d2[0], [r1], r3
+    vdup.u8 d3, r8
+    vst1.32 d3[0], [r1], r3
+
+
+    ldmfd sp!, {r4-r12, pc}       @Restoring registers from stack
+
+
+
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_dc
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:DC
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:DC ,described in sec 8.3.1.2.3
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_dc(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+
+    .global ih264_intra_pred_luma_4x4_mode_dc_a9q
+
+ih264_intra_pred_luma_4x4_mode_dc_a9q:
+
+
+
+    stmfd sp!, {r4-r12, r14}      @store register values to stack
+    ldr r4, [sp, #40]             @ r4 => ui_neighboravailability (5th arg; 10 regs * 4 bytes saved)
+
+    ands r5, r4, #0x01            @test bit 0: left-neighbour availability
+    beq top_available             @LEFT NOT AVAILABLE
+
+    add r10, r0, #3               @r10 => left neighbour of row 0
+    mov r2, #-1                   @left neighbours are stored in reverse order
+    ldrb r5, [r10], r2
+    ldrb r6, [r10], r2
+    ldrb r7, [r10], r2
+    add r5, r5, r6
+    ldrb r8, [r10], r2
+    add r5, r5, r7
+    ands r11, r4, #0x04           @ CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
+    add r5, r5, r8                @r5 = sum of the 4 left neighbours
+    beq left_available
+    add r10, r0, #5               @r10 => first of the 4 top neighbours
+    @ BOTH LEFT AND TOP AVAILABLE
+    ldrb r6, [r10], #1
+    ldrb r7, [r10], #1
+    add r5, r5, r6
+    ldrb r8, [r10], #1
+    add r5, r5, r7
+    ldrb r9, [r10], #1
+    add r5, r5, r8
+    add r5, r5, r9                @r5 = sum of all 8 neighbours
+    add r5, r5, #4
+    lsr r5, r5, #3                @DC = (sum + 4) >> 3
+    vdup.u8 d0, r5
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    b end_func
+
+top_available: @ ONLY TOP AVAILABLE
+    ands r11, r4, #0x04           @ CHECKING TOP AVAILABILITY OR ELSE BRANCH TO NONE AVAILABLE
+    beq none_available
+
+    add r10, r0, #5               @r10 => first of the 4 top neighbours
+    ldrb r6, [r10], #1
+    ldrb r7, [r10], #1
+    ldrb r8, [r10], #1
+    add r5, r6, r7
+    ldrb r9, [r10], #1
+    add r5, r5, r8
+    add r5, r5, r9                @r5 = sum of the 4 top neighbours
+    add r5, r5, #2
+    lsr r5, r5, #2                @DC = (sum + 2) >> 2
+    vdup.u8 d0, r5
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    b end_func
+
+left_available: @ONLY LEFT AVAILABLE
+    add r5, r5, #2                @r5 already holds the 4-left-neighbour sum
+    lsr r5, r5, #2                @DC = (sum + 2) >> 2
+    vdup.u8 d0, r5
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    b end_func
+
+none_available: @NONE AVAILABLE
+    mov r5, #128                  @no neighbours: DC defaults to 128
+    vdup.u8 d0, r5
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    vst1.32 d0[0], [r1], r3
+    b end_func
+
+
+end_func:
+    ldmfd sp!, {r4-r12, pc}       @Restoring registers from stack
+
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_diag_dl
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left ,described in sec 8.3.1.2.4
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_diag_dl(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+    .global ih264_intra_pred_luma_4x4_mode_diag_dl_a9q
+
+ih264_intra_pred_luma_4x4_mode_diag_dl_a9q:
+
+    stmfd sp!, {r4-r12, r14}      @store register values to stack
+
+    add r0, r0, #5                @r0 => top neighbours (incl. top-right extension)
+    sub r5, r3, #2                @stride remainder after two 16-bit stores
+    add r6, r0, #7                @r6 => last available top-right sample
+    vld1.8 {d0}, [r0]             @d0 = p[0..7]
+    vext.8 d1, d0, d0, #1         @d1 = p[1..8] (shift by one)
+    vext.8 d2, d0, d0, #2         @d2 = p[2..9] (shift by two)
+    vld1.8 {d2[6]}, [r6]          @duplicate the last sample so the edge uses p[7] twice
+    vaddl.u8 q10, d0, d1
+    vaddl.u8 q11, d1, d2
+    vadd.u16 q12, q10, q11        @p[x] + 2*p[x+1] + p[x+2]
+    vqrshrun.s16 d3, q12, #2      @three-tap filter: (a + 2b + c + 2) >> 2
+    vst1.32 {d3[0]}, [r1], r3     @row 0 = filtered[0..3]
+    vext.8 d4, d3, d3, #1
+    vst1.32 {d4[0]}, [r1], r3     @row 1 = filtered[1..4]
+    vst1.16 {d3[1]}, [r1]!        @row 2 = filtered[2..5] (two halfword stores)
+    vst1.16 {d3[2]}, [r1], r5
+    vst1.16 {d4[1]}, [r1]!        @row 3 = filtered[3..6]
+    vst1.16 {d4[2]}, [r1]
+
+end_func_diag_dl:
+    ldmfd sp!, {r4-r12, pc}       @Restoring registers from stack
+
+
+
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_diag_dr
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right ,described in sec 8.3.1.2.5
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_diag_dr(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+    .global ih264_intra_pred_luma_4x4_mode_diag_dr_a9q
+
+ih264_intra_pred_luma_4x4_mode_diag_dr_a9q:
+
+    stmfd sp!, {r4-r12, r14}      @store register values to stack
+
+
+    vld1.u8 {d0}, [r0]            @d0 = neighbours starting at p[-1,3] (left column upwards)
+    add r0, r0, #1
+    vld1.u8 {d1}, [r0]            @d1 = same run shifted by one
+    vext.8 d2, d1, d1, #1         @d2 = shifted by two
+    vaddl.u8 q10, d0, d1
+    vaddl.u8 q11, d1, d2
+    vadd.u16 q12, q10, q11        @a + 2b + c across the combined left/top edge
+    vqrshrun.s16 d3, q12, #2      @three-tap filter: (a + 2b + c + 2) >> 2
+
+    vext.8 d4, d3, d3, #1         @shifted copy: each row reuses the previous row's diagonal
+    sub r5, r3, #2                @stride remainder after two 16-bit stores
+    vst1.16 {d4[1]}, [r1]!        @rows assembled from overlapping halves of d3/d4
+    vst1.16 {d4[2]}, [r1], r5
+    vst1.16 {d3[1]}, [r1]!
+    vst1.16 {d3[2]}, [r1], r5
+    vst1.32 {d4[0]}, [r1], r3
+    vst1.32 {d3[0]}, [r1], r3
+
+end_func_diag_dr:
+    ldmfd sp!, {r4-r12, pc}       @Restoring registers from stack
+
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_vert_r
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:Vertical_Right
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:Vertical_Right ,described in sec 8.3.1.2.6
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_vert_r(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+    .global ih264_intra_pred_luma_4x4_mode_vert_r_a9q
+
+ih264_intra_pred_luma_4x4_mode_vert_r_a9q:
+
+    stmfd sp!, {r4-r12, r14}      @store register values to stack
+
+
+    vld1.u8 {d0}, [r0]            @neighbour run (left column + top-left + top row)
+    add r0, r0, #1
+    vld1.u8 {d1}, [r0]            @same run shifted by one
+    vext.8 d2, d1, d1, #1         @shifted by two
+    vaddl.u8 q10, d0, d1
+    vaddl.u8 q11, d1, d2
+    vadd.u16 q12, q10, q11
+    vqrshrun.s16 d4, q10, #1      @two-tap filter: (a + b + 1) >> 1 (even rows)
+    vqrshrun.s16 d3, q12, #2      @three-tap filter: (a + 2b + c + 2) >> 2 (odd rows)
+    sub r5, r3, #2
+    vext.8 d5, d3, d3, #3
+    vst1.32 {d4[1]}, [r1], r3     @row 0 from the half-pel (two-tap) values
+    vst1.32 {d5[0]}, [r1], r3     @row 1 from the three-tap values
+    sub r8, r3, #3                @stride remainder after 1 byte + 2x halfword stores
+    vst1.u8 {d3[2]}, [r1]!        @rows 2/3: leading diagonal byte then shifted row copy
+    vst1.16 {d4[2]}, [r1]!
+    vst1.u8 {d4[6]}, [r1], r8
+    vst1.u8 {d3[1]}, [r1]!
+    vst1.16 {d5[0]}, [r1]!
+    vst1.u8 {d5[2]}, [r1]
+
+
+end_func_vert_r:
+    ldmfd sp!, {r4-r12, pc}       @Restoring registers from stack
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_horz_d
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:Horizontal_Down
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:Horizontal_Down ,described in sec 8.3.1.2.7
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_horz_d(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+    .global ih264_intra_pred_luma_4x4_mode_horz_d_a9q
+
+ih264_intra_pred_luma_4x4_mode_horz_d_a9q:
+
+    stmfd sp!, {r4-r12, r14}      @store register values to stack
+
+    vld1.u8 {d0}, [r0]            @neighbour run (left column + top-left + top row)
+    add r0, r0, #1
+    vld1.u8 {d1}, [r0]            @same run shifted by one
+    vext.8 d2, d1, d0, #1         @shifted by two
+    vaddl.u8 q10, d0, d1
+    vaddl.u8 q11, d1, d2
+    vadd.u16 q12, q10, q11
+    vqrshrun.s16 d4, q10, #1      @two-tap filter: (a + b + 1) >> 1
+    vqrshrun.s16 d5, q12, #2      @three-tap filter: (a + 2b + c + 2) >> 2
+    sub r5, r3, #2                @stride remainder after two 16-bit stores
+    vmov.8 d6, d5
+    vtrn.8 d4, d5                 @interleave two-tap/three-tap values into row order
+    vst1.u16 {d5[1]}, [r1]!       @each row: interleaved pair + pair from the row above
+    vst1.16 {d6[2]}, [r1], r5
+    vst1.u16 {d4[1]}, [r1]!
+    vst1.16 {d5[1]}, [r1], r5
+    vst1.u16 {d5[0]}, [r1]!
+    vst1.16 {d4[1]}, [r1], r5
+    vst1.u16 {d4[0]}, [r1]!
+    vst1.16 {d5[0]}, [r1], r5
+
+end_func_horz_d:
+    ldmfd sp!, {r4-r12, pc}       @Restoring registers from stack
+
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_vert_l
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:Vertical_Left
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:Vertical_Left ,described in sec 8.3.1.2.8
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_vert_l(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+    .global ih264_intra_pred_luma_4x4_mode_vert_l_a9q
+
+ih264_intra_pred_luma_4x4_mode_vert_l_a9q:
+
+    stmfd sp!, {r4-r12, r14}      @store register values to stack
+    add r0, r0, #4                @r0 => one before the top neighbours
+    vld1.u8 {d0}, [r0]
+    add r0, r0, #1
+    vld1.u8 {d1}, [r0]            @top neighbours shifted by one
+    vext.8 d2, d1, d0, #1         @shifted by two
+    vaddl.u8 q10, d0, d1
+    vaddl.u8 q11, d1, d2
+    vadd.u16 q12, q10, q11
+    vqrshrun.s16 d4, q10, #1      @two-tap filter: (a + b + 1) >> 1 (even rows)
+    vqrshrun.s16 d5, q12, #2      @three-tap filter: (a + 2b + c + 2) >> 2 (odd rows)
+    vext.8 d6, d4, d4, #1
+    vext.8 d7, d5, d5, #1
+    vst1.32 {d6[0]}, [r1], r3     @row 0: two-tap values [1..4]
+    vext.8 d16, d4, d4, #2
+    vext.8 d17, d5, d5, #2
+    vst1.32 {d7[0]}, [r1], r3     @row 1: three-tap values [1..4]
+    vst1.32 {d16[0]}, [r1], r3    @row 2: two-tap values [2..5]
+    vst1.32 {d17[0]}, [r1], r3    @row 3: three-tap values [2..5]
+
+
+
+end_func_vert_l:
+    ldmfd sp!, {r4-r12, pc}       @Restoring registers from stack
+
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_4x4_mode_horz_u
+@*
+@* @brief
+@* Perform Intra prediction for luma_4x4 mode:Horizontal_Up
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_4x4 mode:Horizontal_Up ,described in sec 8.3.1.2.9
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_4x4_mode_horz_u(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+    .global ih264_intra_pred_luma_4x4_mode_horz_u_a9q
+
+ih264_intra_pred_luma_4x4_mode_horz_u_a9q:
+
+    stmfd sp!, {r4-r12, r14}      @store register values to stack
+    mov r10, r0                   @keep the base pointer for the edge-sample reload
+    vld1.u8 {d0}, [r0]            @left neighbours (stored bottom-up)
+    ldrb r9, [r0], #1             @bottom-most left neighbour: fills the final rows
+    vext.8 d1, d0, d0, #1
+    vld1.u8 {d0[7]}, [r10]        @replicate the edge sample for the boundary taps
+    vext.8 d2, d1, d1, #1
+    vaddl.u8 q10, d0, d1
+    vaddl.u8 q11, d1, d2
+    vadd.u16 q12, q10, q11
+    vqrshrun.s16 d4, q10, #1      @two-tap filter: (a + b + 1) >> 1
+    vqrshrun.s16 d5, q12, #2      @three-tap filter: (a + 2b + c + 2) >> 2
+    vmov d6, d4
+    vext.8 d6, d5, d4, #1
+    vst1.8 {d4[2]}, [r1]!         @row 0 assembled byte/halfword-wise
+    vst1.8 {d6[0]}, [r1]!
+    vtrn.8 d6, d5                 @interleave filtered values into zig-zag row order
+    sub r5, r3, #2
+    vtrn.8 d4, d6
+    vdup.8 d7, r9                 @d7 = bottom edge value, repeated (rows past the edge)
+    vst1.16 {d6[0]}, [r1], r5
+    vst1.16 {d6[0]}, [r1]!
+    vst1.16 {d5[3]}, [r1], r5
+    vst1.16 {d5[3]}, [r1]!
+    vst1.16 {d7[3]}, [r1], r5
+    vst1.32 {d7[0]}, [r1], r3
+
+end_func_horz_u:
+    ldmfd sp!, {r4-r12, pc}       @Restoring registers from stack
+
+
diff --git a/common/arm/ih264_intra_pred_luma_8x8_a9q.s b/common/arm/ih264_intra_pred_luma_8x8_a9q.s
new file mode 100755
index 0000000..6da1c95
--- /dev/null
+++ b/common/arm/ih264_intra_pred_luma_8x8_a9q.s
@@ -0,0 +1,1037 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_intra_pred_luma_8x8_a9q.s
+@*
+@* @brief
+@* Contains function definitions for intra 8x8 Luma prediction .
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@*
+@* -ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q
+@* -ih264_intra_pred_luma_8x8_mode_vert_a9q
+@* -ih264_intra_pred_luma_8x8_mode_horz_a9q
+@* -ih264_intra_pred_luma_8x8_mode_dc_a9q
+@* -ih264_intra_pred_luma_8x8_mode_diag_dl_a9q
+@* -ih264_intra_pred_luma_8x8_mode_diag_dr_a9q
+@* -ih264_intra_pred_luma_8x8_mode_vert_r_a9q
+@* -ih264_intra_pred_luma_8x8_mode_horz_d_a9q
+@* -ih264_intra_pred_luma_8x8_mode_vert_l_a9q
+@* -ih264_intra_pred_luma_8x8_mode_horz_u_a9q
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+
+@/* All the functions here are replicated from ih264_intra_pred_filters.c
+@
+
+@/**
+@/**
+@/**
+@
+
+
+.text
+.p2align 2
+
+ .extern ih264_gai1_intrapred_luma_8x8_horz_u
+.hidden ih264_gai1_intrapred_luma_8x8_horz_u
+scratch_intrapred_addr_8x8:
+ .long ih264_gai1_intrapred_luma_8x8_horz_u - scrlb8x8l2 - 8
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_ref_filtering
+@*
+@* @brief
+@* Reference sample filtering process for Intra_8x8 sample prediction
+@*
+@* @par Description:
+@* Perform Reference sample filtering process for Intra_8x8 sample prediction ,described in sec 8.3.2.2.1
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride [Not used]
+@*
+@* @param[in] dst_strd
+@* integer destination stride[Not used]
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels[Not used]
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_ref_filtering(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+
+
+    .global ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q
+
+ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q:
+
+    stmfd sp!, {r4-r12, r14}      @store register values to stack
+    vpush {d8-d15}                @q4-q7 are callee-saved under the AAPCS VFP convention
+
+
+    vld1.u8 {q0}, [r0]!           @first 16 reference samples
+    vld1.u8 {q1}, [r0]            @next 16 reference samples
+    add r0, r0, #8                @
+    vext.8 q2, q0, q1, #1         @reference run shifted by 1
+    vext.8 q3, q1, q1, #1
+    vext.8 q4, q2, q3, #1         @reference run shifted by 2
+    vext.8 q5, q3, q3, #1
+    vld1.8 {d10[7]}, [r0]         @ LOADING SRC[24] AGAIN TO THE END FOR p'[ 15, -1 ] = ( p[ 14, -1 ] + 3 * p[ 15, -1 ] + 2 ) >> 2
+    vaddl.u8 q10, d0, d4
+    vaddl.u8 q7, d0, d0           @ SPECIAL CASE FOR p'[ -1 ,7 ] = ( p[ -1, 6 ] + 3 * p[ -1, 7 ] + 2 ) >> 2
+    vadd.u16 q7, q10, q7
+    vaddl.u8 q11, d1, d5
+    vqrshrun.s16 d14, q7, #2      @edge sample: (a + 3b + 2) >> 2
+    vaddl.u8 q12, d4, d8
+    vaddl.u8 q13, d5, d9
+    vst1.8 {d14[0]}, [r1]!        @store the filtered edge sample first
+    vadd.u16 q12, q10, q12
+    vadd.u16 q13, q11, q13
+    vaddl.u8 q9, d2, d6
+    vaddl.u8 q8, d6, d10
+    vqrshrun.s16 d4, q12, #2      @interior samples: (a + 2b + c + 2) >> 2
+    vqrshrun.s16 d5, q13, #2
+    vadd.u16 q6, q8, q9
+    vst1.8 {q2}, [r1]!
+    vqrshrun.s16 d6, q6, #2
+    vst1.8 {d6}, [r1]
+
+
+end_func_ref_filt:
+    vpop {d8-d15}
+    ldmfd sp!, {r4-r12, pc}       @Restoring registers from stack
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_vert
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:vertical
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:vertical ,described in sec 8.3.2.2.2
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@void ih264_intra_pred_luma_8x8_mode_vert(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+    .global ih264_intra_pred_luma_8x8_mode_vert_a9q
+
+ih264_intra_pred_luma_8x8_mode_vert_a9q:
+
+    stmfd sp!, {r4-r12, r14}      @store register values to stack
+
+    add r0, r0, #9                @point r0 at the 8 top neighbours (past 8 left + 1 top-left)
+    vld1.8 d0, [r0]               @load the 8 top neighbour pixels
+
+    vst1.8 d0, [r1], r3           @copy the top row into each of the 8 destination rows
+    vst1.8 d0, [r1], r3
+    vst1.8 d0, [r1], r3
+    vst1.8 d0, [r1], r3
+    vst1.8 d0, [r1], r3
+    vst1.8 d0, [r1], r3
+    vst1.8 d0, [r1], r3
+    vst1.8 d0, [r1], r3
+
+    ldmfd sp!, {r4-r12, pc}       @Restoring registers from stack
+
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_horz
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:horizontal
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:horizontal ,described in sec 8.3.2.2.2
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels(Not used in this function)
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ih264_intra_pred_luma_8x8_mode_horz(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+    .global ih264_intra_pred_luma_8x8_mode_horz_a9q
+
+ih264_intra_pred_luma_8x8_mode_horz_a9q:
+
+    stmfd sp!, {r14}              @store register values to stack
+
+    vld1.u8 {d0}, [r0]            @8 left neighbours, stored bottom-up (lane 7 = row 0)
+    mov r2, #6                    @loop counter: 6 middle rows, 2 per iteration
+
+    vdup.u8 d1, d0[7]             @row 0 value replicated
+    vdup.u8 d2, d0[6]             @row 1 value replicated
+    vst1.8 {d1}, [r1], r3
+
+loop_8x8_horz:
+    vext.8 d0, d0, d0, #6         @rotate so the next two neighbours reach lanes 7/6
+    vst1.8 {d2}, [r1], r3
+    vdup.u8 d1, d0[7]
+    subs r2, #2                   @two rows emitted per iteration
+    vdup.u8 d2, d0[6]
+    vst1.8 {d1}, [r1], r3
+    bne loop_8x8_horz
+
+    vext.8 d0, d0, d0, #6
+    vst1.8 {d2}, [r1], r3         @final (8th) row
+
+    ldmfd sp!, {pc}               @restoring registers from stack
+
+
+
+
+
+@/******************************************************************************
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_dc
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:DC
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:DC ,described in sec 8.3.2.2.3
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_dc(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_8x8_mode_dc_a9q
+
+ih264_intra_pred_luma_8x8_mode_dc_a9q:
+
+    @ DC prediction: fill the 8x8 block with the rounded average of the
+    @ available neighbours. From the offsets used below, pu1_src[0..7]
+    @ holds the left neighbours and pu1_src[9..16] the top neighbours
+    @ (presumably pu1_src[8] is the top-left pixel -- confirm with caller).
+    @ Falls back to the constant 128 when no neighbour is available.
+    stmfd sp!, {r4, r14}          @store register values to stack
+    ldr r4, [sp, #8]              @r4 => ui_neighboravailability (5th arg; 2 regs pushed above)
+
+    ands r2, r4, #0x01            @CHECKING IF LEFT_AVAILABLE ELSE BRANCHING TO ONLY TOP AVAILABLE
+    beq top_available
+    ands r2, r4, #0x04            @CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
+    beq left_available
+
+    vld1.u8 {d0}, [r0]            @BOTH LEFT AND TOP AVAILABLE: d0 = left pixels
+    add r0, r0, #9
+    vld1.u8 {d1}, [r0]            @d1 = top pixels
+    vpaddl.u8 q0, q0              @pairwise widening add of left and top halves
+    vadd.u16 d0, d0, d1
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0             @d0[0] = sum of all 16 neighbour pixels
+    vqrshrun.s16 d0, q0, #4       @(sum + 8) >> 4 : rounded average of 16
+    vdup.u8 d0, d0[0]             @replicate DC value across the vector
+    b str_pred
+
+top_available:                    @ONLY TOP AVAILABLE
+    ands r2, r4, #0x04            @CHECKING TOP AVAILABILITY OR ELSE BRANCH TO NONE AVAILABLE
+    beq none_available
+
+    add r0, r0, #9                @skip to the top-neighbour row
+    vld1.u8 {d0}, [r0]
+    vpaddl.u8 d0, d0
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0             @d0[0] = sum of the 8 top pixels
+    vqrshrun.s16 d0, q0, #3       @(sum + 4) >> 3 : rounded average of 8
+    vdup.u8 d0, d0[0]
+    b str_pred
+
+left_available:                   @ONLY LEFT AVAILABLE
+    vld1.u8 {d0}, [r0]
+    vpaddl.u8 d0, d0
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0             @d0[0] = sum of the 8 left pixels
+    vqrshrun.s16 d0, q0, #3       @(sum + 4) >> 3 : rounded average of 8
+    vdup.u8 d0, d0[0]
+    b str_pred
+
+none_available:                   @NONE AVAILABLE
+    vmov.u8 q0, #128              @default DC value when no neighbours exist
+
+str_pred:
+    vst1.8 {d0}, [r1], r3         @store the DC value to all 8 destination rows
+    vst1.8 {d0}, [r1], r3
+    vst1.8 {d0}, [r1], r3
+    vst1.8 {d0}, [r1], r3
+    vst1.8 {d0}, [r1], r3
+    vst1.8 {d0}, [r1], r3
+    vst1.8 {d0}, [r1], r3
+    vst1.8 {d0}, [r1], r3
+
+    ldmfd sp!, {r4, pc}           @Restoring registers from stack
+
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_diag_dl
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left ,described in sec 8.3.2.2.4
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_diag_dl(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+ .global ih264_intra_pred_luma_8x8_mode_diag_dl_a9q
+
+ih264_intra_pred_luma_8x8_mode_diag_dl_a9q:
+
+    @ Diagonal-down-left: FILT121-filter ((a + 2b + c + 2) >> 2) the 16
+    @ top / top-right neighbours, then each successive output row is the
+    @ filtered row shifted left (toward top-right) by one pixel.
+    stmfd sp!, {r4-r12, r14}      @store register values to stack
+
+    add r0, r0, #9                @point r0 at the top-neighbour row (see layout note -- confirm)
+    sub r5, r3, #4                @stride adjustment for the split 4+4 byte stores below
+    add r6, r0, #15               @address of the last neighbour pixel
+    vld1.8 {q0}, [r0]
+    vext.8 q2, q0, q0, #2
+    vext.8 q1, q0, q0, #1
+    vld1.8 {d5[6]}, [r6]          @patch the last lane of the 2-shifted copy with the edge pixel
+    @ q1 = q0 shifted to left once
+    @ q2 = q1 shifted to left once
+    vaddl.u8 q10, d0, d2          @Adding for FILT121
+    vaddl.u8 q11, d1, d3
+    vaddl.u8 q12, d2, d4
+    vaddl.u8 q13, d3, d5
+    vadd.u16 q12, q10, q12        @a + 2b + c
+    vadd.u16 q13, q11, q13
+
+    vqrshrun.s16 d4, q12, #2      @(a + 2b + c + 2) >> 2 with rounding
+    vqrshrun.s16 d5, q13, #2
+    @Q2 has all FILT121 values
+    vst1.8 {d4}, [r1], r3         @row 0
+    vext.8 q9, q2, q2, #1         @shift left by 1 for the next row
+    vext.8 q8, q9, q9, #1
+    vst1.8 {d18}, [r1], r3        @row 1
+    vext.8 q15, q8, q8, #1
+    vst1.8 {d16}, [r1], r3        @row 2
+    vst1.8 {d30}, [r1], r3        @row 3
+    vst1.32 {d4[1]}, [r1]!        @rows 4-7 straddle the d-register halves: store 4+4 bytes
+    vst1.32 {d5[0]}, [r1], r5
+    vst1.32 {d18[1]}, [r1]!
+    vst1.32 {d19[0]}, [r1], r5
+    vst1.32 {d16[1]}, [r1]!
+    vst1.32 {d17[0]}, [r1], r5
+    vst1.32 {d30[1]}, [r1]!
+    vst1.32 {d31[0]}, [r1], r5
+
+
+end_func_diag_dl:
+    ldmfd sp!, {r4-r12, pc}       @Restoring registers from stack
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_diag_dr
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right ,described in sec 8.3.2.2.5
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_diag_dr(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_8x8_mode_diag_dr_a9q
+
+ih264_intra_pred_luma_8x8_mode_diag_dr_a9q:
+
+    @ Diagonal-down-right: FILT121-filter ((a + 2b + c + 2) >> 2) the full
+    @ neighbour run (left + top-left + top), then emit successive rows as
+    @ one-pixel right rotations (vext #15) of the filtered vector.
+    stmfd sp!, {r4-r12, r14}      @store register values to stack
+
+
+    vld1.u8 {q0}, [r0]            @q0 = 16 neighbour pixels
+    add r0, r0, #1
+    vld1.u8 {q1}, [r0]
+    vext.8 q2, q1, q1, #1
+    @ q1 = q0 shifted to left once
+    @ q2 = q1 shifted to left once
+    vaddl.u8 q10, d0, d2          @Adding for FILT121
+    vaddl.u8 q11, d1, d3
+    vaddl.u8 q12, d2, d4
+    vaddl.u8 q13, d3, d5
+    vadd.u16 q12, q10, q12        @a + 2b + c
+    vadd.u16 q13, q11, q13
+    vqrshrun.s16 d4, q12, #2      @(a + 2b + c + 2) >> 2 with rounding
+    vqrshrun.s16 d5, q13, #2
+    @Q2 has all FILT121 values
+    sub r5, r3, #4                @stride adjustment for the split 4+4 byte stores
+    vext.8 q9, q2, q2, #15        @rotate right by one pixel
+    vst1.8 {d19}, [r1], r3        @row 0
+    vext.8 q8, q9, q9, #15
+    vst1.8 {d17}, [r1], r3        @row 1
+    vext.8 q15, q8, q8, #15
+    vst1.8 {d31}, [r1], r3        @row 2
+    vst1.32 {d4[1]}, [r1]!        @rows 3-6 straddle the d-register halves: store 4+4 bytes
+    vst1.32 {d5[0]}, [r1], r5
+    vst1.32 {d18[1]}, [r1]!
+    vst1.32 {d19[0]}, [r1], r5
+    vst1.32 {d16[1]}, [r1]!
+    vst1.32 {d17[0]}, [r1], r5
+    vst1.32 {d30[1]}, [r1]!
+    vst1.32 {d31[0]}, [r1], r5
+    vst1.8 {d4}, [r1], r3         @row 7
+
+end_func_diag_dr:
+    ldmfd sp!, {r4-r12, pc}       @Restoring registers from stack
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_vert_r
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:Vertical_Right
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:Vertical_Right ,described in sec 8.3.2.2.6
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_vert_r(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_8x8_mode_vert_r_a9q
+
+ih264_intra_pred_luma_8x8_mode_vert_r_a9q:
+
+    @ Vertical-right: builds both the 2-tap FILT11 ((a + b + 1) >> 1) and
+    @ 3-tap FILT121 ((a + 2b + c + 2) >> 2) filtered neighbour vectors, then
+    @ assembles each output row from lane-level pieces of those vectors.
+    @ The stores are a hand-scheduled mix of whole-, half- and single-lane
+    @ writes; the exact lane choices follow the spec's per-pixel zVR table.
+    stmfd sp!, {r4-r12, r14}      @store register values to stack
+
+    vld1.u8 {q0}, [r0]            @q0 = 16 neighbour pixels
+    add r0, r0, #1
+    vld1.u8 {q1}, [r0]
+    vext.8 q2, q1, q1, #1
+    @ q1 = q0 shifted to left once
+    @ q2 = q1 shifted to left once
+    vaddl.u8 q10, d0, d2          @a + b (for both filters)
+    vaddl.u8 q11, d1, d3
+    vaddl.u8 q12, d2, d4
+    vaddl.u8 q13, d3, d5
+    vadd.u16 q12, q10, q12        @a + 2b + c
+    vadd.u16 q13, q11, q13
+
+    vqrshrun.s16 d4, q10, #1      @FILT11: (a + b + 1) >> 1
+    vqrshrun.s16 d5, q11, #1
+    vqrshrun.s16 d6, q12, #2      @FILT121: (a + 2b + c + 2) >> 2
+    vqrshrun.s16 d7, q13, #2
+    @Q2 has all FILT11 values
+    @Q3 has all FILT121 values
+    sub r5, r3, #6                @stride adjustment after 2+4 (or 4+2) byte partial stores
+    sub r6, r3, #4                @stride adjustment after a 4-byte partial store
+    vst1.8 {d5}, [r1], r3 @ row 0
+    vext.8 q9, q3, q3, #15        @FILT121 rotated right by one
+    vmov.8 q11, q9
+    vext.8 q8, q2, q2, #1
+    vst1.8 {d19}, [r1], r3 @row 1
+
+    vmov.8 q15, q8
+    vext.8 q10, q2, q2, #15       @FILT11 rotated right by one
+    vuzp.8 q8, q9                 @de-interleave even/odd lanes of the two filter vectors
+    @row 2
+    vext.8 q14, q8, q8, #1
+    vst1.8 {d21}, [r1]
+    vst1.8 {d6[6]}, [r1], r3
+    @row 3
+
+    vst1.16 {d29[1]}, [r1]!
+    vst1.32 {d7[0]}, [r1]!
+    vst1.16 {d7[2]}, [r1], r5
+@row 4
+    vst1.16 {d19[1]}, [r1]!
+    vst1.32 {d5[0]}, [r1]!
+    vst1.16 {d5[2]}, [r1], r5
+
+@row 5
+    vext.8 q13, q9, q9, #1
+    vst1.16 {d17[1]}, [r1]!
+    vst1.32 {d23[0]}, [r1]!
+    vst1.16 {d23[2]}, [r1], r5
+
+
+@row 6
+    vst1.16 {d27[0]}, [r1]!
+    vst1.8 {d27[2]}, [r1]!
+    vst1.8 {d5[0]}, [r1]!
+    vst1.32 {d31[0]}, [r1], r6
+@row 7
+    vst1.32 {d29[0]}, [r1]!
+    vst1.32 {d7[0]}, [r1]!
+
+
+
+end_func_vert_r:
+    ldmfd sp!, {r4-r12, pc}       @Restoring registers from stack
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_horz_d
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:Horizontal_Down
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:Horizontal_Down ,described in sec 8.3.2.2.7
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_horz_d(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+ .global ih264_intra_pred_luma_8x8_mode_horz_d_a9q
+
+ih264_intra_pred_luma_8x8_mode_horz_d_a9q:
+
+    @ Horizontal-down: computes the FILT11 ((a + b + 1) >> 1) and FILT121
+    @ ((a + 2b + c + 2) >> 2) neighbour vectors, interleaves them with
+    @ vtrn so that alternating half/full-word lane stores can compose each
+    @ output row per the spec's zHD index table. Uses d8-d15, so the VFP
+    @ callee-saved registers are preserved with vpush/vpop.
+    stmfd sp!, {r4-r12, r14}      @store register values to stack
+    vpush {d8-d15}
+
+    vld1.u8 {q0}, [r0]            @q0 = 16 neighbour pixels
+    add r0, r0, #1
+    vld1.u8 {q1}, [r0]
+    vext.8 q2, q1, q1, #1
+    @ q1 = q0 shifted to left once
+    @ q2 = q1 shifted to left once
+    vaddl.u8 q10, d0, d2          @a + b (for both filters)
+    vaddl.u8 q11, d1, d3
+    vaddl.u8 q12, d2, d4
+    vaddl.u8 q13, d3, d5
+    vadd.u16 q12, q10, q12        @a + 2b + c
+    vadd.u16 q13, q11, q13
+
+    vqrshrun.s16 d4, q10, #1      @FILT11: (a + b + 1) >> 1
+    vqrshrun.s16 d5, q11, #1
+    vqrshrun.s16 d6, q12, #2      @FILT121: (a + 2b + c + 2) >> 2
+    vqrshrun.s16 d7, q13, #2
+    @Q2 has all FILT11 values
+    @Q3 has all FILT121 values
+    vmov.8 q4, q2
+    vmov.8 q5, q3
+    sub r6, r3, #6                @stride adjustment after 2-byte partial stores
+    vtrn.8 q4, q5 @               interleave FILT11/FILT121 byte pairs
+    vmov.8 q6, q4
+    vmov.8 q7, q5
+    sub r5, r3, #4                @stride adjustment after 4-byte partial stores
+    vtrn.16 q6, q7                @interleave again at halfword granularity
+    vext.8 q8, q3, q3, #14
+    @ROW 0
+    vst1.8 {d17}, [r1]
+    vst1.16 {d10[3]}, [r1], r3
+
+    @ROW 1
+    vst1.32 {d14[1]}, [r1]!
+    vst1.32 {d7[0]}, [r1], r5
+    @ROW 2
+    vst1.16 {d10[2]}, [r1]!
+    vst1.32 {d14[1]}, [r1]!
+    vst1.16 {d7[0]}, [r1], r6
+    @ROW 3
+    vst1.32 {d12[1]}, [r1]!
+    vst1.32 {d14[1]}, [r1], r5
+    @ROW 4
+    vst1.16 {d14[1]}, [r1]!
+    vst1.32 {d12[1]}, [r1]!
+    vst1.16 {d14[2]}, [r1], r6
+    @ROW 5
+    vst1.32 {d14[0]}, [r1]!
+    vst1.32 {d12[1]}, [r1], r5
+    @ROW 6
+    vst1.16 {d10[0]}, [r1]!
+    vst1.16 {d8[1]}, [r1]!
+    vst1.16 {d14[1]}, [r1]!
+    vst1.16 {d12[2]}, [r1], r6
+    @ROW 7
+    vst1.32 {d12[0]}, [r1]!
+    vst1.32 {d14[0]}, [r1], r5
+
+end_func_horz_d:
+    vpop {d8-d15}
+    ldmfd sp!, {r4-r12, pc}       @Restoring registers from stack
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_vert_l
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:Vertical_Left
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:Vertical_Left ,described in sec 8.3.2.2.8
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_vert_l(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_8x8_mode_vert_l_a9q
+
+ih264_intra_pred_luma_8x8_mode_vert_l_a9q:
+
+    @ Vertical-left: filters the top / top-right neighbours into FILT11
+    @ ((a + b + 1) >> 1) and FILT121 ((a + 2b + c + 2) >> 2) vectors; even
+    @ rows come from FILT11 and odd rows from FILT121, each pair shifted
+    @ left by one pixel relative to the previous pair. Uses d8-d15, hence
+    @ the vpush/vpop of the VFP callee-saved registers.
+    stmfd sp!, {r4-r12, r14}      @store register values to stack
+    vpush {d8-d15}
+    add r0, r0, #9                @point r0 at the top-neighbour row (confirm layout with caller)
+    vld1.u8 {q0}, [r0]
+    add r0, r0, #1
+    vld1.u8 {q1}, [r0]
+    vext.8 q2, q1, q1, #1
+    vaddl.u8 q10, d0, d2          @a + b (for both filters)
+    vaddl.u8 q11, d1, d3
+    vaddl.u8 q12, d2, d4
+    vaddl.u8 q13, d3, d5
+    vadd.u16 q12, q10, q12        @a + 2b + c
+    vadd.u16 q13, q11, q13
+
+    vqrshrun.s16 d4, q10, #1      @FILT11: (a + b + 1) >> 1
+    vqrshrun.s16 d5, q11, #1
+    vqrshrun.s16 d6, q12, #2      @FILT121: (a + 2b + c + 2) >> 2
+    vext.8 q4, q2, q2, #1
+    vqrshrun.s16 d7, q13, #2
+    @Q2 has all FILT11 values
+    @Q3 has all FILT121 values
+
+    vext.8 q5, q3, q3, #1
+    @ROW 0,1
+    vst1.8 {d4}, [r1], r3
+    vst1.8 {d6}, [r1], r3
+
+    vext.8 q6, q4, q4, #1
+    vext.8 q7, q5, q5, #1
+    @ROW 2,3
+    vst1.8 {d8}, [r1], r3
+    vst1.8 {d10}, [r1], r3
+
+    vext.8 q8, q6, q6, #1
+    vext.8 q9, q7, q7, #1
+    @ROW 4,5
+    vst1.8 {d12}, [r1], r3
+    vst1.8 {d14}, [r1], r3
+    @ROW 6,7
+    vst1.8 {d16}, [r1], r3
+    vst1.8 {d18}, [r1], r3
+
+end_func_vert_l:
+    vpop {d8-d15}
+    ldmfd sp!, {r4-r12, pc}       @Restoring registers from stack
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@*ih264_intra_pred_luma_8x8_mode_horz_u
+@*
+@* @brief
+@* Perform Intra prediction for luma_8x8 mode:Horizontal_Up
+@*
+@* @par Description:
+@* Perform Intra prediction for luma_8x8 mode:Horizontal_Up ,described in sec 8.3.2.2.9
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] ui_neighboravailability
+@* availability of neighbouring pixels
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************/
+@void ih264_intra_pred_luma_8x8_mode_horz_u(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 ui_neighboravailability)
+
+@**************Variables Vs Registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => ui_neighboravailability
+
+ .global ih264_intra_pred_luma_8x8_mode_horz_u_a9q
+
+ih264_intra_pred_luma_8x8_mode_horz_u_a9q:
+
+    @ Horizontal-up: filters the left neighbours into FILT11 and FILT121
+    @ vectors, then uses a table lookup (vtbl with a scratch index table
+    @ loaded PC-relative via scratch_intrapred_addr_8x8 -- the literal and
+    @ table are defined elsewhere in this file, outside this excerpt) to
+    @ pick the interleaved zHU ordering; rows past the last pixel are padded
+    @ with the final FILT11 value per the spec. Uses d8-d15 (vpush/vpop).
+    stmfd sp!, {r4-r12, r14}      @store register values to stack
+    vpush {d8-d15}
+
+    vld1.u8 {q0}, [r0]            @q0 = left neighbours
+    vld1.u8 {d1[7]}, [r0]         @replicate the edge pixel into the top lane
+    vext.8 q1, q0, q0, #1
+    vext.8 q2, q1, q1, #1
+    @ LOADING V TABLE
+    ldr r12, scratch_intrapred_addr_8x8
+scrlb8x8l2:
+    add r12, r12, pc              @PC-relative fixup of the table address
+    vaddl.u8 q10, d0, d2          @a + b (for both filters)
+    vaddl.u8 q11, d1, d3
+    vaddl.u8 q12, d2, d4
+    vaddl.u8 q13, d3, d5
+    vadd.u16 q12, q10, q12        @a + 2b + c
+    vadd.u16 q13, q11, q13
+    vld1.u8 {q5}, [r12]           @q5 = lookup indices for vtbl
+    vqrshrun.s16 d4, q10, #1      @FILT11: (a + b + 1) >> 1
+    vqrshrun.s16 d5, q11, #1
+    vqrshrun.s16 d6, q12, #2      @FILT121: (a + 2b + c + 2) >> 2
+    vqrshrun.s16 d7, q13, #2
+    @Q2 has all FILT11 values
+    @Q3 has all FILT121 values
+    vtbl.u8 d12, {q2, q3}, d10    @gather interleaved FILT11/FILT121 samples
+    vdup.u8 q7, d5[7] @          padding value: last FILT11 sample
+    vtbl.u8 d13, {q2, q3}, d11
+    vext.8 q8, q6, q7, #2         @each later row shifts in two padded pixels
+    vext.8 q9, q8, q7, #2
+    vst1.8 {d12}, [r1], r3        @row 0
+    vext.8 q10, q9, q7, #2
+    vst1.8 {d16}, [r1], r3        @row 1
+    vst1.8 {d18}, [r1], r3        @row 2
+    vst1.8 {d20}, [r1], r3        @row 3
+    vst1.8 {d13}, [r1], r3        @row 4
+    vst1.8 {d17}, [r1], r3        @row 5
+    vst1.8 {d19}, [r1], r3        @row 6
+    vst1.8 {d21}, [r1], r3        @row 7
+
+
+end_func_horz_u:
+    vpop {d8-d15}
+    ldmfd sp!, {r4-r12, pc}       @Restoring registers from stack
+
+
+
+
+
+
+
+
diff --git a/common/arm/ih264_iquant_itrans_recon_a9.s b/common/arm/ih264_iquant_itrans_recon_a9.s
new file mode 100755
index 0000000..f71ca69
--- /dev/null
+++ b/common/arm/ih264_iquant_itrans_recon_a9.s
@@ -0,0 +1,871 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ih264_iquant_itrans_recon_a9.s
+@ *
+@ * @brief
+@ * Contains function definitions for single stage inverse transform
+@ *
+@ * @author
+@ * Mohit
+@ * Harinarayanaan
+@ *
+@ * @par List of Functions:
+@ * - ih264_iquant_itrans_recon_4x4_a9()
+@ * - ih264_iquant_itrans_recon_8x8_a9()
+@ * - ih264_iquant_itrans_recon_chroma_4x4_a9()
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@*/
+@/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
+@ *
+@ * @par Description:
+@ * Performs inverse transform Ci4 and adds the residue to get the
+@ * reconstructed block
+@ *
+@ * @param[in] pi2_src
+@ * Input 4x4 coefficients
+@ *
+@ * @param[in] pu1_pred
+@ * Prediction 4x4 block
+@ *
+@ * @param[out] pu1_out
+@ * Output 4x4 block
+@ *
+@ * @param[in] u4_qp_div_6
+@ * QP
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * Pointer to weight matrix
+@ *
+@ * @param[in] pred_strd,
+@ * Prediction stride
+@ *
+@ * @param[in] out_strd
+@ * Output Stride
+@ *
+@ *@param[in] pi2_tmp
+@ * temporary buffer of size 1*16
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * Pointer to the inverse quantization matrix
+@ *
+@ * @returns Void
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+@void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD32 *pi4_tmp,
+@ WORD32 iq_start_idx
+@ WORD16 *pi2_dc_ld_addr)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pu1_pred
+@r2 => *pu1_out
+@r3 => pred_strd
+@r4 => out_strd
+@r5 => *pu2_iscal_mat
+@r6 => *pu2_weigh_mat
+@r7 => u4_qp_div_6
+@r8 => iq_start_idx
+@r10=> pi2_dc_ld_addr
+.text
+.p2align 2
+
+ .global ih264_iquant_itrans_recon_4x4_a9
+
+ih264_iquant_itrans_recon_4x4_a9:
+
+@ Inverse-quantizes a 4x4 coefficient block, applies the 4x4 inverse core
+@ transform (two butterfly stages with a transpose in between), adds the
+@ prediction and stores the clipped 8-bit reconstruction.
+@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4
+@If the macro value changes need to change the instruction according to it.
+@Only one shift is done in horizontal inverse because,
+@if u4_qp_div_6 is lesser than 4 then shift value will be negative and do negative left shift, in this case rnd_factor has value
+@if u4_qp_div_6 is greater than 4 then shift value will be positive and do left shift, here rnd_factor is 0
+
+    stmfd sp!, {r4-r12, r14}      @stack stores the values of the arguments
+    ldr r7, [sp, #52]             @Loads u4_qp_div_6
+    ldr r4, [sp, #40]             @Loads out_strd
+    vdup.s32 q15, r7              @Populate the u4_qp_div_6 in Q15
+    ldr r5, [sp, #44]             @Loads *pu2_iscal_mat
+
+    ldr r6, [sp, #48]             @Loads *pu2_weigh_mat
+
+    ldr r8, [sp, #60]             @Loads iq_start_idx
+
+    ldr r10, [sp, #64]            @Load alternate dc address
+
+    vpush {d8-d15}
+@=======================DEQUANT FROM HERE===================================
+
+    vld4.s16 {d20, d21, d22, d23}, [r5]    @Load pu2_iscal_mat[i], i =0..15
+    vld4.s16 {d26, d27, d28, d29}, [r6]    @pu2_weigh_mat[i], i =0..15
+    vmul.s16 q10, q10, q13        @x[i]=(scale[i] * dequant[i]) where i = 0..7
+    vld4.s16 {d16, d17, d18, d19}, [r0]    @pi2_src_tmp[i], i =0..15
+
+    vmul.s16 q11, q11, q14        @x[i]=(scale[i] * dequant[i]) where i = 8..15
+
+    subs r8, r8, #1               @ if r8 == 1 => intra case , so result of subtraction is zero and Z flag is set
+    ldreqsh r9, [r10]             @ Loads signed halfword pi2_dc_ld_addr[0], if r8==1
+
+    vmull.s16 q0, d16, d20        @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
+    vmull.s16 q1, d17, d21        @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
+    vmull.s16 q2, d18, d22        @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
+    vmull.s16 q3, d19, d23        @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
+
+    vshl.s32 q0, q0, q15          @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3
+    vshl.s32 q1, q1, q15          @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7
+    vshl.s32 q2, q2, q15          @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11
+    vshl.s32 q3, q3, q15          @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15
+
+    vqrshrn.s32 d0, q0, #0x4      @ D0 = c[i] = ((q[i] + 32) >> 4) where i = 0..3
+    vqrshrn.s32 d1, q1, #0x4      @ D1 = c[i] = ((q[i] + 32) >> 4) where i = 4..7
+    vqrshrn.s32 d2, q2, #0x4      @ D2 = c[i] = ((q[i] + 32) >> 4) where i = 8..11
+    vqrshrn.s32 d3, q3, #0x4      @ D3 = c[i] = ((q[i] + 32) >> 4) where i = 12..15
+
+    vmoveq.16 d0[0], r9           @ Restore dc value in case of intra, i.e. r8 == 1
+
+@========= PROCESS IDCT FROM HERE =======
+@Steps for Stage 1:
+@------------------
+    vld1.32 d30[0], [r1], r3      @I row Load pu1_pred buffer
+    vadd.s16 d4, d0, d2           @x0 = q0 + q1;
+
+    vsub.s16 d5, d0, d2           @x1 = q0 - q1;
+
+    vshr.s16 d8, d1, #1           @q0>>1
+    vshr.s16 d9, d3, #1           @q1>>1
+
+    vsub.s16 d6, d8, d3           @x2 = (q0 >> 1) - q1;
+    vadd.s16 d7, d1, d9           @x3 = q0+ (q1 >> 1);
+    vld1.32 d30[1], [r1], r3      @II row Load pu1_pred buffer
+
+    vswp d6, d7                   @Reverse positions of x2 and x3
+
+    vsub.s16 q6, q2, q3           @x0-x3 and x1-x2 combined
+    vadd.s16 q5, q2, q3           @x0 + x3 and x1+x2 combined
+
+    vld1.32 d31[0], [r1], r3      @III row Load pu1_pred buf
+
+    vswp d12, d13
+@Steps for Stage 2:
+@------------------
+    vtrn.16 d10, d11              @4x4 transpose between the two stages
+    vtrn.16 d12, d13
+    vtrn.32 d10, d12
+    vtrn.32 d11, d13
+    vadd.s16 d14, d10, d12        @x0 = q0 + q1;
+
+    vsub.s16 d15, d10, d12        @x1 = q0 - q1;
+
+    vshr.s16 d18, d11, #1         @q0>>1
+    vshr.s16 d19, d13, #1         @q1>>1
+
+    vsub.s16 d16, d18, d13        @x2 = (q0 >> 1) - q1;
+    vadd.s16 d17, d11, d19        @x3 = q0+ (q1 >> 1);
+
+    vld1.32 d31[1], [r1], r3      @IV row Load pu1_pred buffer
+    vswp d16, d17                 @Reverse positions of x2 and x3
+
+    vsub.s16 q11, q7, q8          @x0-x3 and x1-x2 combined
+    vadd.s16 q10, q7, q8          @x0 + x3 and x1+x2 combined
+
+    vswp d22, d23
+
+    vrshr.s16 q10, q10, #6        @final rounded shift: (res + 32) >> 6
+    vrshr.s16 q11, q11, #6
+
+    vaddw.u8 q10, q10, d30        @add prediction (widened) to the residual
+    vaddw.u8 q11, q11, d31
+
+    vqmovun.s16 d0, q10           @saturate to unsigned 8 bit
+    vqmovun.s16 d1, q11
+
+    vst1.32 d0[0], [r2], r4       @I row store the value
+    vst1.32 d0[1], [r2], r4       @II row store the value
+    vst1.32 d1[0], [r2], r4       @III row store the value
+    vst1.32 d1[1], [r2]           @IV row store the value
+
+    vpop {d8-d15}
+    ldmfd sp!, {r4-r12, r15}      @Reload the registers from SP
+
+
+ @/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
+@ *
+@ * @par Description:
+@ * Performs inverse transform Ci4 and adds the residue to get the
+@ * reconstructed block
+@ *
+@ * @param[in] pi2_src
+@ * Input 4x4 coefficients
+@ *
+@ * @param[in] pu1_pred
+@ * Prediction 4x4 block
+@ *
+@ * @param[out] pu1_out
+@ * Output 4x4 block
+@ *
+@ * @param[in] u4_qp_div_6
+@ * QP
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * Pointer to weight matrix
+@ *
+@ * @param[in] pred_strd,
+@ * Prediction stride
+@ *
+@ * @param[in] out_strd
+@ * Output Stride
+@ *
+@ *@param[in] pi2_tmp
+@ * temporary buffer of size 1*16
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * Pointer to the inverse quantization matrix
+@ *
+@ * @returns Void
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+@void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD32 *pi4_tmp
+@ WORD16 *pi2_dc_src)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pu1_pred
+@r2 => *pu1_out
+@r3 => pred_strd
+@r4 => out_strd
+@r5 => *pu2_iscal_mat
+@r6 => *pu2_weigh_mat
+@r7 => u4_qp_div_6
+
+ .global ih264_iquant_itrans_recon_chroma_4x4_a9
+ih264_iquant_itrans_recon_chroma_4x4_a9:
+
+@ Chroma variant of the 4x4 inverse-quant + inverse-transform + recon:
+@ the DC coefficient is always taken from *pi2_dc_src (chroma DC is
+@ transformed separately), the prediction is loaded de-interleaved (vld2)
+@ because chroma is stored as interleaved Cb/Cr, and the result is merged
+@ back into the interleaved output with a vbit byte mask so the other
+@ chroma component's bytes are left untouched.
+@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4
+@If the macro value changes need to change the instruction according to it.
+@Only one shift is done in horizontal inverse because,
+@if u4_qp_div_6 is lesser than 4 then shift value will be negative and do negative left shift, in this case rnd_factor has value
+@if u4_qp_div_6 is greater than 4 then shift value will be positive and do left shift, here rnd_factor is 0
+
+    stmfd sp!, {r4-r12, r14}      @stack stores the values of the arguments
+    ldr r7, [sp, #52]             @Loads u4_qp_div_6
+    ldr r4, [sp, #40]             @Loads out_strd
+    vdup.s32 q15, r7              @Populate the u4_qp_div_6 in Q15
+    ldr r5, [sp, #44]             @Loads *pu2_iscal_mat
+    ldr r6, [sp, #48]             @Loads *pu2_weigh_mat
+    ldr r8, [sp, #60]             @loads *pi2_dc_src
+
+    vpush {d8-d15}
+@=======================DEQUANT FROM HERE===================================
+
+    vld4.s16 {d20, d21, d22, d23}, [r5]    @Load pu2_iscal_mat[i], i =0..15
+    vld4.s16 {d26, d27, d28, d29}, [r6]    @pu2_weigh_mat[i], i =0..15
+    vmul.s16 q10, q10, q13        @x[i]=(scale[i] * dequant[i]) where i = 0..7
+    vld4.s16 {d16, d17, d18, d19}, [r0]    @pi2_src_tmp[i], i =0..15
+
+    vmul.s16 q11, q11, q14        @x[i]=(scale[i] * dequant[i]) where i = 8..15
+
+    vmull.s16 q0, d16, d20        @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
+    vmull.s16 q1, d17, d21        @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
+    vmull.s16 q2, d18, d22        @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
+    vmull.s16 q3, d19, d23        @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
+
+    vshl.s32 q0, q0, q15          @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3
+    vshl.s32 q1, q1, q15          @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7
+    vshl.s32 q2, q2, q15          @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11
+    vshl.s32 q3, q3, q15          @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15
+
+    vqrshrn.s32 d0, q0, #0x4      @ D0 = c[i] = ((q[i] + 32) >> 4) where i = 0..3
+    vqrshrn.s32 d1, q1, #0x4      @ D1 = c[i] = ((q[i] + 32) >> 4) where i = 4..7
+    vqrshrn.s32 d2, q2, #0x4      @ D2 = c[i] = ((q[i] + 32) >> 4) where i = 8..11
+    vqrshrn.s32 d3, q3, #0x4      @ D3 = c[i] = ((q[i] + 32) >> 4) where i = 12..15
+
+    ldrsh r9, [r8]                @ Loads signed halfword pi2_dc_src[0]
+    vmov.16 d0[0], r9             @ Restore dc value since its chroma iq-it
+
+@========= PROCESS IDCT FROM HERE =======
+@Steps for Stage 1:
+@------------------
+    vld2.8 {d28, d29}, [r1], r3   @I row Load pu1_pred buffer (de-interleave Cb/Cr)
+    vadd.s16 d4, d0, d2           @x0 = q0 + q1;
+
+    vsub.s16 d5, d0, d2           @x1 = q0 - q1;
+
+    vshr.s16 d8, d1, #1           @q0>>1
+    vshr.s16 d9, d3, #1           @q1>>1
+
+    vsub.s16 d6, d8, d3           @x2 = (q0 >> 1) - q1;
+    vadd.s16 d7, d1, d9           @x3 = q0+ (q1 >> 1);
+    vld2.8 {d29, d30}, [r1], r3   @II row Load pu1_pred buffer
+
+    vswp d6, d7                   @Reverse positions of x2 and x3
+
+    vsub.s16 q6, q2, q3           @x0-x3 and x1-x2 combined
+    vtrn.32 d28, d29              @ D28 -- row I and II of pu1_pred_buffer
+    vadd.s16 q5, q2, q3           @x0 + x3 and x1+x2 combined
+
+    vld2.8 {d29, d30}, [r1], r3   @III row Load pu1_pred buf
+
+    vswp d12, d13
+@Steps for Stage 2:
+@------------------
+    vtrn.16 d10, d11              @4x4 transpose between the two stages
+    vtrn.16 d12, d13
+    vtrn.32 d10, d12
+    vtrn.32 d11, d13
+    vadd.s16 d14, d10, d12        @x0 = q0 + q1;
+
+    vsub.s16 d15, d10, d12        @x1 = q0 - q1;
+
+    vshr.s16 d18, d11, #1         @q0>>1
+    vshr.s16 d19, d13, #1         @q1>>1
+
+    vsub.s16 d16, d18, d13        @x2 = (q0 >> 1) - q1;
+    vadd.s16 d17, d11, d19       @x3 = q0+ (q1 >> 1);
+
+    vld2.8 {d30, d31}, [r1], r3   @IV row Load pu1_pred buffer
+    vswp d16, d17                 @Reverse positions of x2 and x3
+
+    vsub.s16 q11, q7, q8          @x0-x3 and x1-x2 combined
+    vtrn.32 d29, d30              @ D29 -- row III and IV of pu1_pred_buf
+    vadd.s16 q10, q7, q8          @x0 + x3 and x1+x2 combined
+
+    vswp d22, d23
+
+    vrshr.s16 q10, q10, #6        @final rounded shift: (res + 32) >> 6
+    vrshr.s16 q11, q11, #6
+
+    vaddw.u8 q10, q10, d28        @add prediction (widened) to the residual
+    vaddw.u8 q11, q11, d29
+
+    vld1.u8 d0, [r2], r4          @Loading out buffer 16 coeffs (interleaved Cb/Cr)
+    vld1.u8 d1, [r2], r4
+    vld1.u8 d2, [r2], r4
+    vld1.u8 d3, [r2], r4
+
+    sub r2, r2, r4, lsl #2        @rewind out pointer to the first row
+
+    vqmovun.s16 d20, q10          @Getting quantized coeffs
+    vqmovun.s16 d22, q11
+
+    vmovl.u8 q10, d20             @Move the coffs into 16 bit
+    vmovl.u8 q11, d22             @so that we can use vbit to copy
+
+    vmov.u16 q14, #0x00ff         @Copy lsb from quantized(long)coeffs
+
+    vbit.u8 q0, q10, q14          @merge only this component's bytes into the output
+    vbit.u8 q1, q11, q14
+
+    vst1.u8 d0, [r2], r4
+    vst1.u8 d1, [r2], r4
+    vst1.u8 d2, [r2], r4
+    vst1.u8 d3, [r2]
+
+    vpop {d8-d15}
+    ldmfd sp!, {r4-r12, r15}      @Reload the registers from SP
+
+
+@/*
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * This function performs inverse quant and Inverse transform type Ci4 for 8*8 block
+@ *
+@ * @par Description:
+@ * Performs inverse transform Ci8 and adds the residue to get the
+@ * reconstructed block
+@ *
+@ * @param[in] pi2_src
+@ * Input 4x4 coefficients
+@ *
+@ * @param[in] pu1_pred
+@ * Prediction 4x4 block
+@ *
+@ * @param[out] pu1_out
+@ * Output 4x4 block
+@ *
+@ * @param[in] u4_qp_div_6
+@ * QP
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * Pointer to weight matrix
+@ *
+@ * @param[in] pred_strd,
+@ * Prediction stride
+@ *
+@ * @param[in] out_strd
+@ * Output Stride
+@ *
+@ *@param[in] pi2_tmp
+@ * temporary buffer of size 1*64
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * Pointer to the inverse quantization matrix
+@ *
+@ * @returns Void
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+@void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD32 *pi4_tmp,
+@ WORD32 iq_start_idx)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pu1_pred
+@r2 => *pu1_out
+@r3 => pred_strd
+@r4 => out_strd
+@r5 => *pu2_iscal_mat
+@r6 => *pu2_weigh_mat
+@r7 => u4_qp_div_6
+
+
+ .global ih264_iquant_itrans_recon_8x8_a9
+ih264_iquant_itrans_recon_8x8_a9:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ ldr r7, [sp, #52] @Loads u4_qp_div_6
+ ldr r4, [sp, #40] @Loads out_strd
+
+ ldr r5, [sp, #44] @Loads *pu2_iscal_mat
+ ldr r6, [sp, #48] @Loads *pu2_weigh_mat
+ vdup.s32 q15, r7 @Populate the u4_qp_div_6 in Q15 (used as per-lane shift amount for VSHL)
+ vpush {d8-d15}
+
+idct_8x8_begin:
+
+@========= DEQUANT FROM HERE ===========
+
+ vld1.32 {q13}, [r5]! @ Q13 = dequant values row 0
+ vld1.32 {q10}, [r6]! @ Q10 = scaling factors row 0
+ vld1.32 {q14}, [r5]! @ Q14 = dequant values row 1
+ vmul.s16 q10, q10, q13 @ Q10 = x[i] = (scale[i] * dequant[i]) where i = 0..7
+ vld1.32 {q11}, [r6]! @ Q11 = scaling factors row 1
+ vld1.32 {q8}, [r0]! @ Q8 = Source row 0
+ vmul.s16 q11, q11, q14 @ Q11 = x[i] = (scale[i] * dequant[i]) where i = 8..15
+ vmull.s16 q0, d16, d20 @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
+ vld1.32 {q9}, [r0]! @ Q9 = Source row 1
+ vmull.s16 q1, d17, d21 @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
+ vmull.s16 q2, d18, d22 @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
+ vld1.32 {q13}, [r6]! @ Scaling factors row 2
+ vmull.s16 q3, d19, d23 @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
+ vld1.32 {q14}, [r6]! @ Scaling factors row 3
+ vshl.s32 q0, q0, q15 @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3
+ vld1.32 {q10}, [r5]! @ Q10 = Dequant values row 2
+ vshl.s32 q1, q1, q15 @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7
+ vld1.32 {q8}, [r0]! @ Source Row 2
+ vshl.s32 q2, q2, q15 @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11
+ vld1.32 {q11}, [r5]! @ Q11 = Dequant values row 3
+ vshl.s32 q3, q3, q15 @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15
+ vld1.32 {q9}, [r0]! @ Source Row 3
+ vmul.s16 q10, q10, q13 @ Dequant row2*scale matrix row 2
+ vmul.s16 q11, q11, q14 @ Dequant row 3*scale matrix row 3
+ vld1.32 {q4}, [r6]! @ Scaling factors row 4
+ vqrshrn.s32 d0, q0, #0x6 @ D0 = c[i] = ((q[i] + 32) >> 6) where i = 0..3
+ vqrshrn.s32 d1, q1, #0x6 @ D1 = c[i] = ((q[i] + 32) >> 6) where i = 4..7
+ vld1.32 {q5}, [r6]! @ Scaling factors row 5
+ vqrshrn.s32 d2, q2, #0x6 @ D2 = c[i] = ((q[i] + 32) >> 6) where i = 8..11
+ vqrshrn.s32 d3, q3, #0x6 @ D3 = c[i] = ((q[i] + 32) >> 6) where i = 12..15
+ vld1.32 {q13}, [r5]! @ Q13 = Dequant values row 4
+ vmull.s16 q2, d16, d20 @ p[i] = (x[i] * trns_coeff[i]) where i=16..19
+ vmull.s16 q3, d17, d21 @ p[i] = (x[i] * trns_coeff[i]) where i=20..23
+ vld1.32 {q12}, [r5]! @ Q12 = Dequant values row 5
+ vmull.s16 q6, d18, d22 @ p[i] = (x[i] * trns_coeff[i]) where i=24..27
+ vmull.s16 q7, d19, d23 @ p[i] = (x[i] * trns_coeff[i]) where i=28..31
+
+ vld1.32 {q14}, [r0]! @ Source row 4
+ vmul.s16 q10, q4, q13 @ Dequant row4*scale matrix row 4
+ vmul.s16 q11, q5, q12 @ Dequant row5*scale matrix row 5
+ vld1.32 {q9}, [r0]! @ Source row 5
+ vshl.s32 q2, q2, q15 @ q[i] = p[i] << (qP/6), i = 16..19
+ vshl.s32 q3, q3, q15 @ q[i] = p[i] << (qP/6), i = 20..23
+ vld1.32 {q13}, [r6]! @ Scaling factors row 6
+ vshl.s32 q6, q6, q15 @ q[i] = p[i] << (qP/6), i = 24..27
+ vshl.s32 q7, q7, q15 @ q[i] = p[i] << (qP/6), i = 28..31
+ vmull.s16 q4, d28, d20 @ i = 32..35
+ vqrshrn.s32 d4, q2, #0x6 @ D4 = c[i] = ((q[i] + 32) >> 6) where i = 16..19
+ vqrshrn.s32 d5, q3, #0x6 @ D5 = c[i] = ((q[i] + 32) >> 6) where i = 20..23
+ vmull.s16 q5, d29, d21 @ i =36..39
+ vld1.32 {q10}, [r5]! @ Dequant values row 6
+ vqrshrn.s32 d6, q6, #0x6 @ D6 = c[i] = ((q[i] + 32) >> 6) where i = 24..27
+ vqrshrn.s32 d7, q7, #0x6 @ D7 = c[i] = ((q[i] + 32) >> 6) where i = 28..31
+ vld1.32 {q14}, [r6]! @ Scaling factors row 7
+ vmull.s16 q6, d18, d22 @ p[i], i = 40..43 (source row 5 * row-5 factors)
+ vld1.32 {q8}, [r0]! @ Source row 6
+ vmull.s16 q7, d19, d23 @ p[i], i = 44..47
+ vld1.32 {q11}, [r5]! @ Dequant values row 7
+ vshl.s32 q4, q4, q15 @ q[i] = p[i] << (qP/6), i = 32..35
+ vld1.32 {q9}, [r0]! @ Source row 7
+ vshl.s32 q5, q5, q15 @ q[i] = p[i] << (qP/6), i = 36..39
+
+ vshl.s32 q6, q6, q15 @ q[i] = p[i] << (qP/6), i = 40..43
+ vshl.s32 q7, q7, q15 @ q[i] = p[i] << (qP/6), i = 44..47
+ vmul.s16 q10, q10, q13 @ Dequant*scaling row 6
+ vmul.s16 q11, q11, q14 @ Dequant*scaling row 7
+ vqrshrn.s32 d8, q4, #0x6 @ D8 = c[i] = ((q[i] + 32) >> 6) where i = 32..35
+ vqrshrn.s32 d9, q5, #0x6 @ D9 = c[i] = ((q[i] + 32) >> 6) where i = 36..39
+ vqrshrn.s32 d10, q6, #0x6 @ D10 = c[i] = ((q[i] + 32) >> 6) where i = 40..43
+ vqrshrn.s32 d11, q7, #0x6 @ D11 = c[i] = ((q[i] + 32) >> 6) where i = 44..47
+ vmull.s16 q6, d16, d20 @ i= 48..51
+ vmull.s16 q7, d17, d21 @ i= 52..55
+ vmull.s16 q8, d18, d22 @ i=56..59
+ vmull.s16 q9, d19, d23 @ i=60..63
+ vshl.s32 q6, q6, q15 @ q[i] = p[i] << (qP/6), i = 48..51
+ vzip.s16 q0, q1 @Transpose (interleaved with remaining dequant to hide latency)
+ vshl.s32 q7, q7, q15 @ q[i] = p[i] << (qP/6), i = 52..55
+ vshl.s32 q8, q8, q15 @ q[i] = p[i] << (qP/6), i = 56..59
+ vzip.s16 q2, q3 @Transpose step
+ vshl.s32 q9, q9, q15 @ q[i] = p[i] << (qP/6), i = 60..63
+ vqrshrn.s32 d12, q6, #0x6 @ D12 = c[i] = ((q[i] + 32) >> 6) where i = 48..51
+ vzip.s16 q4, q5 @Transpose
+ vqrshrn.s32 d13, q7, #0x6 @ D13 = c[i] = ((q[i] + 32) >> 6) where i = 52..55
+ vqrshrn.s32 d14, q8, #0x6 @ D14 = c[i] = ((q[i] + 32) >> 6) where i = 56..59
+ vzip.s32 q0, q2 @Transpose
+ vqrshrn.s32 d15, q9, #0x6 @ D15 = c[i] = ((q[i] + 32) >> 6) where i = 60..63
+
+@========= PROCESS IDCT FROM HERE =======
+
+@Steps for Stage 2:
+@------------------
+
+@ TRANSPOSE 8x8 coeffs to actual order
+
+ vzip.s16 q6, q7 @ 16-bit zip, part of 8x8 transpose
+
+ vzip.s32 q1, q3 @ 32-bit zip, part of 8x8 transpose
+ vzip.s32 q4, q6 @
+ vzip.s32 q5, q7 @
+
+ vswp d1, d8 @ Q0/Q1 = Row order x0/x1
+ vswp d3, d10 @ Q2/Q3 = Row order x2/x3
+ vswp d5, d12 @ Q4/Q5 = Row order x4/x5
+ vswp d7, d14 @ Q6/Q7 = Row order x6/x7
+
+ vswp q1, q4 @ finish transpose: rows now in Q0..Q7
+ vshr.s16 q10, q2, #0x1 @ Q10 = x2 >> 1 (needed for y4 below)
+ vswp q3, q6 @
+
+@Steps for Stage 1:
+@------------------
+
+ vadd.s16 q8, q0, q4 @ Q8 = y0
+ vsub.s16 q9, q0, q4 @ Q9 = y2
+
+ vsra.s16 q2, q6, #0x1 @ Q2 = y6
+ vsub.s16 q6, q10, q6 @ Q6 = y4
+
+ vaddl.s16 q12, d14, d2 @ y3 (0-3) 1+7 (widened to 32 bit for intermediate precision)
+ vaddl.s16 q13, d15, d3 @ y3 (4-7) 1+7
+
+ vsubl.s16 q10, d14, d2 @ y5 (0-3) 7-1
+ vsubl.s16 q11, d15, d3 @ y5 (4-7) 7-1
+
+ vadd.s16 q0, q8, q2 @ Q0 = z0
+ vsub.s16 q4, q8, q2 @ Q4 = z6
+
+ vadd.s16 q8, q9, q6 @ Q8 = z2
+ vsub.s16 q2, q9, q6 @ Q2 = z4
+
+ vsubw.s16 q12, q12, d6 @ y3 (0-3) 1+7-3
+ vsubw.s16 q13, q13, d7 @ y3 (4-7) 1+7-3
+
+ vshr.s16 q6, q3, #0x1 @ x3 >> 1
+
+ vaddw.s16 q10, q10, d10 @ y5 += x5
+ vaddw.s16 q11, q11, d11 @
+
+ vshr.s16 q9, q5, #0x1 @ x5 >> 1
+
+ vsubw.s16 q12, q12, d12 @ y3 -= (x3 >> 1)
+ vsubw.s16 q13, q13, d13 @
+
+ vaddw.s16 q10, q10, d18 @ y5 += (x5 >> 1)
+ vaddw.s16 q11, q11, d19 @
+
+ vqmovn.s32 d12, q12 @ saturating narrow back to 16 bit
+ vaddl.s16 q12, d10, d6 @ start y7: x5 + x3
+ vqmovn.s32 d13, q13 @ Q6 = y3
+ vaddl.s16 q13, d11, d7 @
+ vqmovn.s32 d18, q10 @
+ vsubl.s16 q10, d10, d6 @ start y1: x5 - x3
+ vqmovn.s32 d19, q11 @ Q9 = y5
+ vsubl.s16 q11, d11, d7 @
+
+ vshr.s16 q3, q6, #0x2 @ y3 >> 2
+
+ vsra.s16 q6, q9, #0x2 @ Q6 = z3
+
+ vaddw.s16 q12, q12, d2 @ y7 += x1
+ vaddw.s16 q13, q13, d3 @
+
+ vshr.s16 q1, #0x1 @ x1 >>= 1 (in place)
+
+ vsub.s16 q5, q3, q9 @ Q5 = z5
+
+ vsubw.s16 q10, q10, d14 @ y1 -= x7
+ vsubw.s16 q11, q11, d15 @
+
+ vshr.s16 q7, #0x1 @ x7 >>= 1 (in place)
+
+ vaddw.s16 q12, q12, d2 @ y7 += (x1 >> 1)
+ vaddw.s16 q13, q13, d3 @
+
+ vsubw.s16 q10, q10, d14 @ y1 -= (x7 >> 1)
+ vsubw.s16 q11, q11, d15 @
+
+
+ vqmovn.s32 d14, q12 @ narrow y7
+ vadd.s16 q1, q8, q5 @ Q1 = x1
+ vqmovn.s32 d15, q13 @ Q7 = y7
+ vsub.s16 q3, q8, q5 @ Q3 = x6
+ vqmovn.s32 d18, q10 @ narrow y1
+ vsub.s16 q5, q2, q6 @ Q5 = x5
+ vqmovn.s32 d19, q11 @ Q9 = y1
+ vadd.s16 q2, q2, q6 @ Q2 = x2
+
+ vshr.s16 q12, q9, #0x2 @ y1 >> 2
+ vsra.s16 q9, q7, #0x2 @ Q9 = z1
+
+ vsub.s16 q11, q7, q12 @ Q11 = z7
+
+ vadd.s16 q6, q4, q9 @ Q6 = x3
+ vsub.s16 q4, q4, q9 @ Q4 = x4
+
+ vsub.s16 q7, q0, q11 @ Q7 = x7
+ vadd.s16 q0, q0, q11 @ Q0 = x0
+
+ vswp.s16 q3, q6 @ Q3 = x3, Q6 = x6
+
+
+@Steps for Stage 2:
+@------------------
+
+@ TRANSPOSE 8x8 coeffs to actual order
+
+ vzip.s16 q0, q1 @ 16-bit zip, part of 8x8 transpose
+ vzip.s16 q2, q3 @
+ vzip.s16 q4, q5 @
+ vzip.s16 q6, q7 @
+
+ vzip.s32 q0, q2 @ 32-bit zip, part of 8x8 transpose
+ vzip.s32 q1, q3 @
+ vzip.s32 q4, q6 @
+ vzip.s32 q5, q7 @
+
+ vswp d1, d8 @ Q0/Q1 = Row order x0/x1
+ vswp d3, d10 @ Q2/Q3 = Row order x2/x3
+ vswp d5, d12 @ Q4/Q5 = Row order x4/x5
+ vswp d7, d14 @ Q6/Q7 = Row order x6/x7
+
+ vswp q1, q4 @ finish transpose
+ vshr.s16 q10, q2, #0x1 @ Q10 = x2 >> 1 (needed for y4 below)
+ vswp q3, q6 @
+
+@Steps for Stage 3:
+@------------------
+
+@Repeat stage 1 again for vertical transform
+
+ vadd.s16 q8, q0, q4 @ Q8 = y0
+ vld1.32 d28, [r1], r3 @ load pred row 0 into D28 (interleaved with transform)
+ vsub.s16 q9, q0, q4 @ Q9 = y2
+
+ vsra.s16 q2, q6, #0x1 @ Q2 = y6
+ vsub.s16 q6, q10, q6 @ Q6 = y4
+
+ vaddl.s16 q12, d14, d2 @ y3 (0-3) 1+7
+ vld1.32 d29, [r1], r3 @ load pred row 1 into D29
+ vaddl.s16 q13, d15, d3 @ y3 (4-7)
+
+ vsubl.s16 q10, d14, d2 @ y5 (0-3) 7-1
+ vld1.32 d30, [r1], r3 @ load pred row 2 into D30
+ vsubl.s16 q11, d15, d3 @ y5 (4-7)
+
+ vadd.s16 q0, q8, q2 @ Q0 = z0
+ vld1.32 d31, [r1], r3 @ load pred row 3 into D31
+ vsub.s16 q4, q8, q2 @ Q4 = z6
+
+ vadd.s16 q8, q9, q6 @ Q8 = z2
+ vsub.s16 q2, q9, q6 @ Q2 = z4
+
+ vsubw.s16 q12, q12, d6 @ y3 -= x3
+ vsubw.s16 q13, q13, d7 @
+
+ vshr.s16 q6, q3, #0x1 @ x3 >> 1
+
+ vaddw.s16 q10, q10, d10 @ y5 += x5
+ vaddw.s16 q11, q11, d11 @
+
+ vshr.s16 q9, q5, #0x1 @ x5 >> 1
+
+ vsubw.s16 q12, q12, d12 @ y3 -= (x3 >> 1)
+ vsubw.s16 q13, q13, d13 @
+
+ vaddw.s16 q10, q10, d18 @ y5 += (x5 >> 1)
+ vaddw.s16 q11, q11, d19 @
+
+ vqmovn.s32 d12, q12 @ narrow
+ vaddl.s16 q12, d10, d6 @ start y7: x5 + x3
+ vqmovn.s32 d13, q13 @ Q6 = y3
+ vaddl.s16 q13, d11, d7 @
+ vqmovn.s32 d18, q10 @ narrow
+ vsubl.s16 q10, d10, d6 @ start y1: x5 - x3
+ vqmovn.s32 d19, q11 @ Q9 = y5
+ vsubl.s16 q11, d11, d7 @
+
+ vshr.s16 q3, q6, #0x2 @ y3 >> 2
+
+ vsra.s16 q6, q9, #0x2 @ Q6 = z3
+
+ vaddw.s16 q12, q12, d2 @ y7 += x1
+ vaddw.s16 q13, q13, d3 @
+
+ vshr.s16 q1, #0x1 @ x1 >>= 1 (in place)
+
+ vsub.s16 q5, q3, q9 @ Q5 = z5
+
+ vsubw.s16 q10, q10, d14 @ y1 -= x7
+ vsubw.s16 q11, q11, d15 @
+
+ vshr.s16 q7, #0x1 @ x7 >>= 1 (in place)
+
+ vaddw.s16 q12, q12, d2 @ y7 += (x1 >> 1)
+ vaddw.s16 q13, q13, d3 @
+
+ vsubw.s16 q10, q10, d14 @ y1 -= (x7 >> 1)
+ vsubw.s16 q11, q11, d15 @
+
+ vqmovn.s32 d14, q12 @ narrow y7
+ vadd.s16 q1, q8, q5 @ Q1 = x1
+ vqmovn.s32 d15, q13 @ Q7 = y7
+ vsub.s16 q3, q8, q5 @ Q3 = x6
+ vqmovn.s32 d18, q10 @ narrow y1
+ vsub.s16 q5, q2, q6 @ Q5 = x5
+ vqmovn.s32 d19, q11 @ Q9 = y1
+ vadd.s16 q2, q2, q6 @ Q2 = x2
+
+ vshr.s16 q12, q9, #0x2 @ y1 >> 2
+ vsra.s16 q9, q7, #0x2 @ Q9 = z1
+
+ vsub.s16 q11, q7, q12 @ Q11 = z7
+
+ vadd.s16 q6, q4, q9 @ Q6 = x3
+ vsub.s16 q4, q4, q9 @ Q4 = x4
+
+ vsub.s16 q7, q0, q11 @ Q7 = x7
+ vadd.s16 q0, q0, q11 @ Q0 = x0
+
+ vswp.s16 q3, q6 @ Q3 <-> Q6
+
+ vrshr.s16 q1, q1, #6 @ final rounding shift: (x + 32) >> 6
+ vld1.32 d16, [r1], r3 @ load pred row 4 into D16
+ vrshr.s16 q2, q2, #6 @
+ vrshr.s16 q4, q4, #6 @
+ vld1.32 d17, [r1], r3 @ load pred row 5 into D17
+ vrshr.s16 q5, q5, #6 @
+ vrshr.s16 q7, q7, #6 @
+ vld1.32 d18, [r1], r3 @ load pred row 6 into D18
+ vrshr.s16 q0, q0, #6 @
+ vrshr.s16 q3, q3, #6 @
+ vld1.32 d19, [r1], r3 @ load pred row 7 into D19
+ vrshr.s16 q6, q6, #6 @
+
+@ Add prediction, saturate to 8 bit and store
+
+ vaddw.u8 q0, q0, d28 @ residue row 0 + pred row 0
+ vaddw.u8 q1, q1, d29 @ row 1
+ vaddw.u8 q2, q2, d30 @ row 2
+ vaddw.u8 q3, q3, d31 @ row 3
+ vqmovun.s16 d0, q0 @ clip row 0 to [0,255]
+ vaddw.u8 q4, q4, d16 @ row 4
+ vqmovun.s16 d1, q1 @ clip row 1
+ vaddw.u8 q5, q5, d17 @ row 5
+ vqmovun.s16 d2, q2 @ clip row 2
+ vaddw.u8 q6, q6, d18 @ row 6
+ vqmovun.s16 d3, q3 @ clip row 3
+ vaddw.u8 q7, q7, d19 @ row 7
+
+ vqmovun.s16 d4, q4 @ clip row 4
+ vst1.32 d0, [r2], r4 @ store recon row 0
+ vqmovun.s16 d5, q5 @ clip row 5
+ vst1.32 d1, [r2], r4 @ store recon row 1
+ vqmovun.s16 d6, q6 @ clip row 6
+ vst1.32 d2, [r2], r4 @ store recon row 2
+ vqmovun.s16 d7, q7 @ clip row 7
+ vst1.32 d3, [r2], r4 @ store recon row 3
+ vst1.32 d4, [r2], r4 @ store recon row 4
+
+ vst1.32 d5, [r2], r4 @ store recon row 5
+
+
+ vst1.32 d6, [r2], r4 @ store recon row 6
+
+
+ vst1.32 d7, [r2], r4 @ store recon row 7
+
+idct_8x8_end:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15} @ restore registers and return (pops saved LR into PC)
+
diff --git a/common/arm/ih264_iquant_itrans_recon_dc_a9.s b/common/arm/ih264_iquant_itrans_recon_dc_a9.s
new file mode 100755
index 0000000..8d71bdb
--- /dev/null
+++ b/common/arm/ih264_iquant_itrans_recon_dc_a9.s
@@ -0,0 +1,399 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ih264_iquant_itrans_recon_dc_a9.s
+@ *
+@ * @brief
+@ * Contains function definitions for single stage inverse transform
+@ *
+@ * @author
+@ * Mohit
+@ *
+@ * @par List of Functions:
+@ * - ih264_iquant_itrans_recon_4x4_dc_a9()
+@ * - ih264_iquant_itrans_recon_8x8_dc_a9()
+@ * - ih264_iquant_itrans_recon_chroma_4x4_dc_a9()
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@*/
+@/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
+@ * for dc input pattern only, i.e. only the (0,0) element of the input 4x4 block is
+@ * non-zero. For complete function, refer ih264_iquant_itrans_recon_a9.s
+@ *
+@ * @par Description:
+@ * Performs inverse transform Ci4 and adds the residue to get the
+@ * reconstructed block
+@ *
+@ * @param[in] pi2_src
+@ * Input 4x4 coefficients
+@ *
+@ * @param[in] pu1_pred
+@ * Prediction 4x4 block
+@ *
+@ * @param[out] pu1_out
+@ * Output 4x4 block
+@ *
+@ * @param[in] u4_qp_div_6
+@ * QP
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * Pointer to weight matrix
+@ *
+@ * @param[in] pred_strd,
+@ * Prediction stride
+@ *
+@ * @param[in] out_strd
+@ * Output Stride
+@ *
+@ *@param[in] pi2_tmp
+@ * temporary buffer of size 1*16
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * Pointer to the inverse quantization matrix
+@ *
+@ * @returns Void
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+@void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD32 *pi4_tmp,
+@ WORD32 iq_start_idx
+@ WORD16 *pi2_dc_ld_addr)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pu1_pred
+@r2 => *pu1_out
+@r3 => pred_strd
+@r4 => out_strd
+@r5 => *pu2_iscal_mat
+@r6 => *pu2_weigh_mat
+@r7 => u4_qp_div_6
+@r9 => iq_start_idx
+@unused => pi2_dc_ld_addr
+
+.text
+.p2align 2
+
+ .global ih264_iquant_itrans_recon_4x4_dc_a9
+
+ih264_iquant_itrans_recon_4x4_dc_a9:
+
+@Only one shift is done in the horizontal inverse stage because,
+@if u4_qp_div_6 is less than 4 the net shift would be negative (a right shift), so a rounding factor (8) is added before the >>4
+@if u4_qp_div_6 is greater than or equal to 4 the net shift is a left shift and the rounding factor contributes nothing
+
+ stmfd sp!, {r4-r10, r14} @stack stores the values of the arguments
+ ldr r5, [sp, #36] @Loads *pu2_iscal_mat
+ ldr r6, [sp, #40] @Loads *pu2_weigh_mat
+ ldrsh r8, [r0] @load pi2_src[0], SH for signed halfword load
+ ldrh r6, [r6] @load pu2_weight_mat[0] , H for unsigned halfword load
+ ldrh r5, [r5] @load pu2_iscal_mat[0] , H for unsigned halfword load
+@=======================DEQUANT FROM HERE===================================
+ mul r6, r6, r5 @pu2_iscal_mat[0]*pu2_weigh_mat[0]
+ ldr r7, [sp, #44] @Loads u4_qp_div_6
+ mul r6, r6, r8 @pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0]
+ ldr r4, [sp, #32] @Loads out_strd
+ ldr r9, [sp, #52] @Loads iq_start_idx
+
+ lsl r6, r6, r7 @(pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0])<<u4_qp_div_6
+ add r6, r6, #8 @(pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0])<<u4_qp_div_6 + rnd_fact
+ asr r6, r6, #4 @q0 = (pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0] + rnd_fact) >> 4, net effect (u4_qp_div_6 - 4) shift with rounding
+
+ subs r9, r9, #1 @ if iq_start_idx == 1 => intra case , so result of subtraction is zero and Z flag is set
+ ldreqsh r10, [r0] @ Loads signed halfword pi2_src[0], if iq_start_idx==1 (DC already dequantized for intra)
+ moveq r6, r10 @ Restore dc value in case of intra, i.e. iq_start_idx == 1
+
+ add r6, r6, #32 @i_macro = q0 + 32
+ asr r6, r6, #6 @i_macro >>6 = DC output of 2-stage transform
+ vdup.s16 q0, r6 @copy transform output to Q0 (same DC residue for all 16 pixels)
+
+ vld1.32 d30[0], [r1], r3 @I row Load pu1_pred buffer
+
+ vld1.32 d30[1], [r1], r3 @II row Load pu1_pred buffer
+
+ vld1.32 d31[0], [r1], r3 @III row Load pu1_pred buf
+
+ vld1.32 d31[1], [r1], r3 @IV row Load pu1_pred buffer
+ vaddw.u8 q10, q0, d30 @ rows I,II: pred + DC residue (widened to 16 bit)
+
+ vaddw.u8 q11, q0, d31 @ rows III,IV: pred + DC residue
+
+ vqmovun.s16 d0, q10 @ clip rows I,II to [0,255]
+
+ vst1.32 d0[0], [r2], r4 @I row store the value
+ vqmovun.s16 d1, q11 @ clip rows III,IV to [0,255]
+ vst1.32 d0[1], [r2], r4 @II row store the value
+ vst1.32 d1[0], [r2], r4 @III row store the value
+ vst1.32 d1[1], [r2] @IV row store the value
+
+ ldmfd sp!, {r4-r10, r15} @Reload the registers from SP and return (pops saved LR into PC)
+
+
+
+
+@/*
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * This function performs inverse quant and Inverse transform type Ci4 for 8*8 block
+@ * for dc input pattern only, i.e. only the (0,0) element of the input 8x8 block is
+@ * non-zero. For complete function, refer ih264_iquant_itrans_recon_a9.s
+@ *
+@ * @par Description:
+@ * Performs inverse transform Ci8 and adds the residue to get the
+@ * reconstructed block
+@ *
+@ * @param[in] pi2_src
+@ * Input 8x8 coefficients
+@ *
+@ * @param[in] pu1_pred
+@ * Prediction 8x8 block
+@ *
+@ * @param[out] pu1_out
+@ * Output 8x8 block
+@ *
+@ * @param[in] u4_qp_div_6
+@ * QP
+@ *
+@ * @param[in] pu2_weigh_mat
+@ * Pointer to weight matrix
+@ *
+@ * @param[in] pred_strd,
+@ * Prediction stride
+@ *
+@ * @param[in] out_strd
+@ * Output Stride
+@ *
+@ *@param[in] pi2_tmp
+@ * temporary buffer of size 1*64
+@ *
+@ * @param[in] pu2_iscal_mat
+@ * Pointer to the inverse quantization matrix
+@ *
+@ * @returns Void
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+@void ih264_iquant_itrans_recon_8x8_dc(WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD32 *pi4_tmp,
+@ WORD32 iq_start_idx)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pu1_pred
+@r2 => *pu1_out
+@r3 => pred_strd
+@r4 => out_strd
+@r5 => *pu2_iscal_mat
+@r6 => *pu2_weigh_mat
+@r7 => u4_qp_div_6
+
+
+ .global ih264_iquant_itrans_recon_8x8_dc_a9
+ih264_iquant_itrans_recon_8x8_dc_a9:
+
+ stmfd sp!, {r4-r8, r14} @stack stores the values of the arguments
+ ldr r5, [sp, #28] @Loads *pu2_iscal_mat
+ ldr r6, [sp, #32] @Loads *pu2_weigh_mat
+ ldrsh r8, [r0] @load pi2_src[0], SH for signed halfword load
+ ldrh r6, [r6] @load pu2_weight_mat[0] , H for unsigned halfword load
+ ldrh r5, [r5] @load pu2_iscal_mat[0] , H for unsigned halfword load
+@=======================DEQUANT FROM HERE===================================
+ mul r6, r6, r5 @pu2_iscal_mat[0]*pu2_weigh_mat[0]
+ ldr r7, [sp, #36] @Loads u4_qp_div_6
+ mul r6, r6, r8 @pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0]
+ ldr r4, [sp, #24] @Loads out_strd
+
+ vpush {d8-d15}
+ lsl r6, r6, r7 @(pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0])<<u4_qp_div_6
+ add r6, r6, #32 @(pi2_src[0]*pu2_iscal_mat[0]*pu2_weigh_mat[0])<<u4_qp_div_6 + rnd_fact
+ asr r6, r6, #6 @q0 = dequantized DC: (value + rnd_fact) >> 6 (8x8 dequant rounding)
+ add r6, r6, #32 @i_macro = q0 + 32
+ asr r6, r6, #6 @i_macro >>6 = DC output of 2-stage transform
+ vdup.s16 q8, r6 @copy transform output to Q8 (same DC residue for all 64 pixels)
+
+ vld1.32 d24, [r1], r3 @ load pred row 0 into D24
+
+ vld1.32 d25, [r1], r3 @ load pred row 1 into D25
+
+ vld1.32 d26, [r1], r3 @ load pred row 2 into D26
+ vaddw.u8 q0, q8, d24 @ row 0: pred + DC residue (widened to 16 bit)
+ vld1.32 d27, [r1], r3 @ load pred row 3 into D27
+ vaddw.u8 q1, q8, d25 @ row 1
+ vld1.32 d28, [r1], r3 @ load pred row 4 into D28
+ vaddw.u8 q2, q8, d26 @ row 2
+ vld1.32 d29, [r1], r3 @ load pred row 5 into D29
+ vaddw.u8 q3, q8, d27 @ row 3
+ vld1.32 d30, [r1], r3 @ load pred row 6 into D30
+ vaddw.u8 q4, q8, d28 @ row 4
+ vld1.32 d31, [r1], r3 @ load pred row 7 into D31
+
+@ Saturate to 8 bit and store
+
+
+ vqmovun.s16 d0, q0 @ clip row 0 to [0,255]
+ vaddw.u8 q5, q8, d29 @ row 5
+ vqmovun.s16 d1, q1 @ clip row 1
+ vaddw.u8 q6, q8, d30 @ row 6
+ vqmovun.s16 d2, q2 @ clip row 2
+ vqmovun.s16 d3, q3 @ clip row 3
+ vaddw.u8 q7, q8, d31 @ row 7
+ vqmovun.s16 d4, q4 @ clip row 4
+ vqmovun.s16 d5, q5 @ clip row 5
+ vst1.32 d0, [r2], r4 @ store recon row 0
+ vqmovun.s16 d6, q6 @ clip row 6
+ vst1.32 d1, [r2], r4 @ store recon row 1
+ vqmovun.s16 d7, q7 @ clip row 7
+ vst1.32 d2, [r2], r4 @ store recon row 2
+ vst1.32 d3, [r2], r4 @ store recon row 3
+ vst1.32 d4, [r2], r4 @ store recon row 4
+ vst1.32 d5, [r2], r4 @ store recon row 5
+ vst1.32 d6, [r2], r4 @ store recon row 6
+ vst1.32 d7, [r2], r4 @ store recon row 7
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r8, r15} @ restore registers and return (pops saved LR into PC)
+
+
+@ /*
+@ ********************************************************************************
+@ *
+@ * @brief This function reconstructs a 4x4 sub block from quantized residue and
+@ * prediction buffer if only dc value is present for residue
+@ *
+@ * @par Description:
+@ * The quantized residue is first inverse quantized,
+@ * This inverse quantized content is added to the prediction buffer to recon-
+@ * struct the end output
+@ *
+@ * @param[in] pi2_src
+@ * quantized dc coefficient
+@ *
+@ * @param[in] pu1_pred
+@ * prediction 4x4 block in interleaved format
+@ *
+@ * @param[in] pred_strd,
+@ * Prediction buffer stride in interleaved format
+@ *
+@ * @param[in] out_strd
+@ * recon buffer Stride
+@ *
+@ * @returns none
+@ *
+@ * @remarks none
+@ *
+@ *******************************************************************************
+@ */
+@ void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_out,
+@ WORD32 pred_strd,
+@ WORD32 out_strd,
+@ const UWORD16 *pu2_iscal_mat,
+@ const UWORD16 *pu2_weigh_mat,
+@ UWORD32 u4_qp_div_6,
+@ WORD16 *pi2_tmp,
+@ WORD16 *pi2_dc_src)
+@ Register Usage
+@ r0 : pi2_src
+@ r1 : pu1_pred
+@ r2 : pu1_out
+@ r3 : pred_strd
+@ Neon registers d0-d7, d16-d30 are used
+@ No need for pushing arm and neon registers
+ .global ih264_iquant_itrans_recon_chroma_4x4_dc_a9
+ih264_iquant_itrans_recon_chroma_4x4_dc_a9:
+
+ ldr r0, [sp, #20] @ load pi2_dc_src (10th argument; DC is already dequantized by the caller)
+ vld1.s16 d0, [r0] @load pi2_dc_src[0]
+
+ ldr r0, [sp] @load out_strd
+
+ vld2.s8 {d2, d3}, [r1], r3 @load pred row 0 de-interleaved: plane 1 => d2 & pred plane 2 => d3
+ vld2.s8 {d3, d4}, [r1], r3 @load pred row 1: plane 1 => d3, plane 2 => d4 (d3 of previous row is overwritten intentionally)
+ vrshr.s16 d0, d0, #6 @i_macro = ((q0 + 32) >> 6);
+ vld2.s8 {d4, d5}, [r1], r3 @load pred row 2
+ vld2.s8 {d5, d6}, [r1], r3 @load pred row 3
+
+ vdup.s16 q0, d0[0] @duplicate i_macro (from pi2_dc_src[0]) across Q0
+ mov r1, r2 @backup pu1_out
+
+ vtrn.32 d2, d3 @mov the 4 coeffs of current block to d2 (gather rows 0/1 plane-1 pixels)
+ vtrn.32 d4, d5 @gather rows 2/3 plane-1 pixels in d4
+
+ vmov.u16 q15, #0x00ff @mask: low byte of each halfword, i.e. the current plane of the interleaved pair
+
+ vld1.u8 d18, [r2], r0 @load out [8 bit size) -8 coeffs (both planes, rows 0/1)
+ vaddw.u8 q1, q0, d2 @Add pred
+ vld1.u8 d19, [r2], r0
+ vaddw.u8 q2, q0, d4 @Add pred (rows 2/3)
+ vld1.u8 d20, [r2], r0
+ vld1.u8 d21, [r2], r0
+
+ vqmovun.s16 d2, q1 @ clip rows 0/1 to [0,255]
+ vqmovun.s16 d4, q2 @ clip rows 2/3 to [0,255]
+
+ vmovl.u8 q1, d2 @ widen so each result pixel sits in the low byte of a halfword
+ vmovl.u8 q2, d4 @
+
+ vbit.u8 q9, q1, q15 @ insert current-plane bytes into out, keep the other plane's bytes
+ vbit.u8 q10, q2, q15 @
+
+ vst1.u8 d18, [r1], r0 @store out (both planes, rows 0/1)
+ vst1.u8 d19, [r1], r0
+ vst1.u8 d20, [r1], r0
+ vst1.u8 d21, [r1], r0
+
+ bx lr
+
+
+
+
+
+
+
diff --git a/common/arm/ih264_itrans_recon_a9.s b/common/arm/ih264_itrans_recon_a9.s
new file mode 100755
index 0000000..1d74da5
--- /dev/null
+++ b/common/arm/ih264_itrans_recon_a9.s
@@ -0,0 +1,216 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ih264_itrans_recon_a9.s
+@ *
+@ * @brief
+@ * Contains function definitions for single stage inverse transform
+@ *
+@ *
+@ * @par List of Functions:
+@ * - ih264_itrans_recon_4x4_a9()
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@*/
+@/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * This function performs Inverse transform type Ci4 for 4*4 block
+@ *
+@ * @par Description:
+@ * Performs inverse transform Ci4 and adds the residue to get the
+@ * reconstructed block
+@ *
+@ * @param[in] pi16_levelBlock
+@ * Input 4x4 coefficients
+@ *
+@ * @param[in] puc_predBuffer
+@ * Prediction 4x4 block
+@ *
+@ * @param[out] puc_reconPic
+@ * Output 4x4 block
+@ *
+@ * @param[in] ui16_picWidth
+@ * Input stride
+@ *
+@ * @param[in] pred_strd
+@ * Prediction stride
+@ *
+@ * @param[in] dst_strd
+@ * Output Stride
+@ *
+@ * @param[in] zero_cols
+@ * Zero columns in pi2_src
+@ *
+@ * @returns Void
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *
+@ *******************************************************************************
+@ */
+@void ih264_itrans_recon_4x4(
+@ WORD16 *pi2_src,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_recon,
+@ WORD32 src_strd,
+@ WORD32 pred_strd,
+@ WORD32 dst_strd,
+@ UWORD32 q_lev, //quantizer level
+@ WORD32 *pi4_tmp)
+@**************Variables Vs Registers*****************************************
+@r0 => *pi2_src
+@r1 => *pu1_pred
+@r2 => *pu1_recon
+@r3 => src_strd
+@r4 => pred_strd
+@r5 => dst_strd
+@r6 => q_lev
+@r7 => *pi4_tmp
+
+.text
+.p2align 2
+
+
+ .global ih264_itrans_recon_4x4_a9
+
+ih264_itrans_recon_4x4_a9:
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ lsl r3, r3, #1 @src_strd in bytes (source is WORD16, so stride * 2)
+
+ vld1.16 d0, [r0], r3 @0th row pi2_src_tmp[0]
+ ldr r4, [sp, #40] @Loads pred_strd
+
+ vld1.16 d1, [r0], r3 @I row pi2_src_tmp[0]
+ ldr r5, [sp, #44] @Loads *dst_strd
+
+ vld1.16 d2, [r0], r3 @II row pi2_src_tmp[0]
+
+ vld1.16 d3, [r0] @III row pi2_src_tmp[0]
+ ldr r7, [sp, #52] @Loads *pi4_tmp
+
+ vpush {d8-d15}
+
+ vtrn.16 d0, d1 @Transpose to get all the 0th element in the single D register
+ vtrn.16 d2, d3
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3 @D0 --> pi2_src_tmp[0], D1 --> pi2_src_tmp[1]
+ @D2 --> pi2_src_tmp[2], D3 --> pi2_src_tmp[3]
+
+ vaddl.s16 q3, d0, d2 @x0 = (pi2_src_tmp[0] + pi2_src_tmp[2])
+ vsubl.s16 q4, d0, d2 @x1 = (pi2_src_tmp[0] - pi2_src_tmp[2])
+ vshr.s16 d4, d1, #1 @pi2_src_tmp[1] >> 1
+ vshr.s16 d5, d3, #1 @pi2_src_tmp[3] >> 1
+
+ vsubl.s16 q5, d4, d3 @x2 = D_SHIFT(pi2_src_tmp[1],1,shft) - pi2_src_tmp[3]
+
+ vaddl.s16 q6, d1, d5 @x3 = pi2_src_tmp[1] + D_SHIFT(pi2_src_tmp[3],1,shft)
+
+ vadd.s32 q8, q4, q5 @x1 + x2
+ vsub.s32 q9, q4, q5 @x1 - x2
+
+ vadd.s32 q7, q3, q6 @x0 + x3
+ vsub.s32 q10, q3, q6 @x0 - x3
+
+ vtrn.32 q7, q8 @Transpose the register to have the adjacent values
+
+ vtrn.32 q9, q10
+ vadd.s32 d6, d14, d15 @x0(0,1) = (pi4_tblk[0,1] + pi4_tblk[8,9])
+
+ vsub.s32 d7, d14, d15 @x1(0,1) = (pi4_tblk[0,1] - pi4_tblk[8,9])
+
+ vshr.s32 d4, d16, #1 @pi4_tblk[4,5] >> 1
+ vshr.s32 d5, d17, #1 @pi4_tblk[12,13] >> 1
+
+ vsub.s32 d8, d4, d17 @x2(0,1) = D_SHIFT(pi4_tblk[4,5],1,shft) - pi4_tblk[12,13]
+ vadd.s32 d9, d16, d5 @x3(0,1) = pi4_tblk[4,5] + D_SHIFT(pi4_tblk[12,13],1,shft)
+
+ vadd.s32 d10, d18, d19 @x0(2,3) = (pi4_tblk[2,3] + pi4_tblk[10,11])
+ vsub.s32 d11, d18, d19 @x1(2,3) = (pi4_tblk[2,3] - pi4_tblk[10,11])
+ vshr.s32 d4, d20, #1 @pi4_tblk[6,7] >> 1
+ vshr.s32 d5, d21, #1 @pi4_tblk[14,15] >> 1
+
+ vld1.32 d30[0], [r1], r4 @I row Load pu1_pred buffer
+ vsub.s32 d12, d4, d21 @x2(2,3) = D_SHIFT(pi4_tblk[6,7],1,shft) - pi4_tblk[14,15]
+
+ vmovl.u8 q15, d30 @I row Convert 8 bit pred buffer to 16 bit
+ vadd.s32 d13, d20, d5 @x3(2,3) = pi4_tblk[6,7] + D_SHIFT(pi4_tblk[14,15],1,shft)
+
+ vadd.s32 d16, d6, d9 @I row i_macro(0,1) = x0(0,1) + x3(0,1)
+
+ vld1.32 d28[0], [r1], r4 @II row Load pu1_pred buffer
+ vadd.s32 d17, d10, d13 @I row i_macro(2,3) = x0(2,3) + x3(2,3)
+
+ vqrshrn.s32 d16, q8, #6 @I row i_macro = D_SHIFT(i_macro,6,shft)
+
+ vmovl.u8 q14, d28 @II row Convert 8 bit pred buffer to 16 bit
+ vadd.u16 d16, d16, d30 @I row i_macro += *pu1_pred_tmp
+
+ vqmovun.s16 d16, q8 @I row CLIP_U8(i_macro)
+ vadd.s32 d18, d7, d8 @II row i_macro(0,1) = x1(0,1) + x2(0,1)
+
+ vld1.32 d26[0], [r1], r4 @III row Load pu1_pred buffer
+ vadd.s32 d19, d11, d12 @II row i_macro(2,3) = x1(2,3) + x2(2,3)
+
+ vqrshrn.s32 d18, q9, #6 @II row i_macro = D_SHIFT(i_macro,6,shft)
+
+ vmovl.u8 q13, d26 @III row Convert 8 bit pred buffer to 16 bit
+ vadd.u16 d18, d18, d28 @II row i_macro += *pu1_pred_tmp
+
+ vst1.32 d16[0], [r2], r5 @I row store the value
+ vsub.s32 d20, d7, d8 @III row i_macro(0,1) = x1(0,1) - x2(0,1)
+
+ vqmovun.s16 d18, q9 @II row CLIP_U8(i_macro)
+ vsub.s32 d21, d11, d12 @III row i_macro(2,3) = x1(2,3) - x2(2,3)
+
+ vld1.32 d24[0], [r1], r4 @IV row Load pu1_pred buffer
+ vqrshrn.s32 d20, q10, #6 @III row i_macro = D_SHIFT(i_macro,6,shft)
+
+ vmovl.u8 q12, d24 @IV row Convert 8 bit pred buffer to 16 bit
+ vadd.u16 d20, d20, d26 @III row i_macro += *pu1_pred_tmp
+
+ vqmovun.s16 d20, q10 @III row CLIP_U8(i_macro)
+ vsub.s32 d22, d6, d9 @IV row i_macro(0,1) = x0(0,1) - x3(0,1)
+
+ vst1.32 d18[0], [r2], r5 @II row store the value
+ vsub.s32 d23, d10, d13 @IV row i_macro(2,3) = x0(2,3) - x3(2,3)
+
+ vqrshrn.s32 d22, q11, #6 @IV row i_macro = D_SHIFT(i_macro,6,shft)
+
+ vst1.32 d20[0], [r2], r5 @III row store the value
+ vadd.u16 d22, d22, d24 @IV row i_macro += *pu1_pred_tmp
+
+ vqmovun.s16 d22, q11 @IV row CLIP_U8(i_macro)
+ vst1.32 d22[0], [r2], r5 @IV row store the value
+
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from SP and return (pops saved LR into PC)
+
+
+
+
diff --git a/common/arm/ih264_mem_fns_neon.s b/common/arm/ih264_mem_fns_neon.s
new file mode 100755
index 0000000..2808897
--- /dev/null
+++ b/common/arm/ih264_mem_fns_neon.s
@@ -0,0 +1,268 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ih264_mem_fns_neon.s
+@ *
+@ * @brief
+@ * Contains function definitions for memory manipulation
+@ *
+@ * @author
+@ * Naveen SR
+@ *
+@ * @par List of Functions:
+@ * - ih264_memcpy_mul_8_a9q()
+@ * - ih264_memcpy_a9q()
+@ * - ih264_memset_mul_8_a9q()
+@ * - ih264_memset_a9q()
+@ * - ih264_memset_16bit_mul_8_a9q()
+@ * - ih264_memset_16bit_a9q()
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@*/
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* memcpy of a 1d array
+@*
+@* @par Description:
+@* Does memcpy of 8bit data from source to destination for 8,16 or 32 number of bytes
+@*
+@* @param[in] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] num_bytes
+@* number of bytes to copy
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ih264_memcpy_mul_8(UWORD8 *pu1_dst,
+@ UWORD8 *pu1_src,
+@ UWORD8 num_bytes)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_dst
+@ r1 => *pu1_src
+@ r2 => num_bytes
+
+.text
+.p2align 2
+
+
+ .global ih264_memcpy_mul_8_a9q
+
+ih264_memcpy_mul_8_a9q: @ copies num_bytes (r2) from src (r1) to dst (r0), 8 bytes per iteration
+
+loop_neon_memcpy_mul_8:
+ @ Memcpy 8 bytes
+ vld1.8 d0, [r1]!
+ vst1.8 d0, [r0]!
+
+ subs r2, r2, #8 @ assumes num_bytes is a non-zero multiple of 8; otherwise this never hits zero and the loop runs away
+ bne loop_neon_memcpy_mul_8
+ bx lr
+
+
+
+@*******************************************************************************
+@*/
+@void ih264_memcpy(UWORD8 *pu1_dst,
+@ UWORD8 *pu1_src,
+@ UWORD8 num_bytes)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_dst
+@ r1 => *pu1_src
+@ r2 => num_bytes
+
+
+
+ .global ih264_memcpy_a9q
+
+ih264_memcpy_a9q: @ general memcpy: NEON 8-byte chunks, then a scalar byte loop for the tail
+ subs r2, #8 @ fewer than 8 bytes left? skip straight to the byte loop
+ blt memcpy
+loop_neon_memcpy:
+ @ Memcpy 8 bytes
+ vld1.8 d0, [r1]!
+ vst1.8 d0, [r0]!
+
+ subs r2, #8
+ bge loop_neon_memcpy
+ cmp r2, #-8 @ r2 == -8 means num_bytes was an exact multiple of 8: done
+ bxeq lr
+
+memcpy:
+ add r2, #8 @ restore remaining byte count (1..7)
+
+loop_memcpy:
+ ldrb r3, [r1], #1
+ strb r3, [r0], #1
+ subs r2, #1 @ NOTE(review): num_bytes == 0 is not handled; the byte loop would copy once then wrap r2 -- confirm callers never pass 0
+ bne loop_memcpy
+ bx lr
+
+
+
+
+@void ih264_memset_mul_8(UWORD8 *pu1_dst,
+@ UWORD8 value,
+@ UWORD8 num_bytes)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_dst
+@ r1 => value
+@ r2 => num_bytes
+
+
+
+ .global ih264_memset_mul_8_a9q
+
+ih264_memset_mul_8_a9q: @ fill num_bytes (r2) at dst (r0) with byte value (r1), 8 bytes per iteration
+
+@ Assumptions: numbytes is either 8, 16 or 32
+ vdup.8 d0, r1 @ replicate the fill byte across all 8 lanes of d0
+loop_memset_mul_8:
+ @ Memset 8 bytes
+ vst1.8 d0, [r0]!
+
+ subs r2, r2, #8 @ loops forever if num_bytes is not a multiple of 8
+ bne loop_memset_mul_8
+
+ bx lr
+
+
+
+
+@void ih264_memset(UWORD8 *pu1_dst,
+@ UWORD8 value,
+@ UWORD8 num_bytes)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_dst
+@ r1 => value
+@ r2 => num_bytes
+
+
+
+ .global ih264_memset_a9q
+
+ih264_memset_a9q: @ general memset: NEON 8-byte chunks, then a scalar byte loop for the tail
+ subs r2, #8 @ fewer than 8 bytes left? skip straight to the byte loop
+ blt memset
+ vdup.8 d0, r1 @ replicate the fill byte across all 8 lanes of d0
+loop_neon_memset:
+ @ Memset 8 bytes
+ vst1.8 d0, [r0]!
+
+ subs r2, #8
+ bge loop_neon_memset
+ cmp r2, #-8 @ r2 == -8 means num_bytes was an exact multiple of 8: done
+ bxeq lr
+
+memset:
+ add r2, #8 @ restore remaining byte count (1..7)
+
+loop_memset:
+ strb r1, [r0], #1
+ subs r2, #1 @ NOTE(review): num_bytes == 0 is not handled; loop would store once then wrap r2 -- confirm callers never pass 0
+ bne loop_memset
+ bx lr
+
+
+
+
+@void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst,
+@ UWORD16 value,
+@ UWORD8 num_words)
+@**************Variables Vs Registers*************************
+@ r0 => *pu2_dst
+@ r1 => value
+@ r2 => num_words
+
+
+
+ .global ih264_memset_16bit_mul_8_a9q
+
+ih264_memset_16bit_mul_8_a9q: @ fill num_words (r2) 16-bit words at dst (r0) with value (r1), 8 words per iteration
+
+@ Assumptions: num_words is either 8, 16 or 32
+
+ @ Memset 8 words
+ vdup.16 d0, r1 @ replicate the 16-bit value across all 4 lanes of d0
+loop_memset_16bit_mul_8:
+ vst1.16 d0, [r0]! @ two 4-word stores = 8 words per iteration
+ vst1.16 d0, [r0]!
+
+ subs r2, r2, #8 @ loops forever if num_words is not a multiple of 8
+ bne loop_memset_16bit_mul_8
+
+ bx lr
+
+
+
+
+@void ih264_memset_16bit(UWORD16 *pu2_dst,
+@ UWORD16 value,
+@ UWORD8 num_words)
+@**************Variables Vs Registers*************************
+@ r0 => *pu2_dst
+@ r1 => value
+@ r2 => num_words
+
+
+
+ .global ih264_memset_16bit_a9q
+
+ih264_memset_16bit_a9q: @ general 16-bit memset: NEON 8-word chunks, then a scalar halfword loop for the tail
+ subs r2, #8 @ fewer than 8 words left? skip straight to the halfword loop
+ blt memset_16bit
+ vdup.16 d0, r1 @ replicate the 16-bit value across all 4 lanes of d0
+loop_neon_memset_16bit:
+ @ Memset 8 words
+ vst1.16 d0, [r0]! @ two 4-word stores = 8 words per iteration
+ vst1.16 d0, [r0]!
+
+ subs r2, #8
+ bge loop_neon_memset_16bit
+ cmp r2, #-8 @ r2 == -8 means num_words was an exact multiple of 8: done
+ bxeq lr
+
+memset_16bit:
+ add r2, #8 @ restore remaining word count (1..7)
+
+loop_memset_16bit:
+ strh r1, [r0], #2
+ subs r2, #1 @ NOTE(review): num_words == 0 is not handled; loop would store once then wrap r2 -- confirm callers never pass 0
+ bne loop_memset_16bit
+ bx lr
+
+
+
+
diff --git a/common/arm/ih264_padding_neon.s b/common/arm/ih264_padding_neon.s
new file mode 100755
index 0000000..9bab268
--- /dev/null
+++ b/common/arm/ih264_padding_neon.s
@@ -0,0 +1,646 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ih264_padding_neon.s
+@ *
+@ * @brief
+@ * Contains function definitions for padding
+@ *
+@ * @author
+@ * Ittiam
+@ *
+@ * @par List of Functions:
+@ * - ih264_pad_top_a9q()
+@ * - ih264_pad_left_luma_a9q()
+@ * - ih264_pad_left_chroma_a9q()
+@ * - ih264_pad_right_luma_a9q()
+@ * - ih264_pad_right_chroma_a9q()
+@ *
+@ * @remarks
+@ * None
+@ *
+@ *******************************************************************************
+@*/
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief pad at the top of a 2d array
+@*
+@* @par Description:
+@* The top row of a 2d array is replicated for pad_size times at the top
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pad_size
+@* integer -padding size of the array
+@*
+@* @returns none
+@*
+@* @remarks none
+@*
+@*******************************************************************************
+@*/
+@void ih264_pad_top(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ WORD32 wd,
+@ WORD32 pad_size)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_src
+@ r1 => src_strd
+@ r2 => wd
+@ r3 => pad_size
+
+.text
+.p2align 2
+
+ .global ih264_pad_top_a9q
+
+ih264_pad_top_a9q: @ replicate the top row of the array (r0, width r2) for pad_size (r3) rows above it
+
+ stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments
+
+ sub r5, r0, r1 @ r5 = first pad row (one stride above the top source row)
+ rsb r6, r1, #0 @ r6 = -src_strd, so stores walk upwards through the pad area
+
+loop_neon_memcpy_mul_16:
+ @ Load 16 bytes
+ vld1.8 {d0, d1}, [r0]!
+ mov r4, r5
+ mov r7, r3 @ r7 = pad_size rows to fill for this 16-byte column
+ add r5, r5, #16
+
+loop_neon_pad_top:
+ vst1.8 {d0, d1}, [r4], r6 @ store the same 16 source bytes one row higher each iteration
+ subs r7, r7, #1
+ bne loop_neon_pad_top
+
+ subs r2, r2, #16 @ assumes wd is a non-zero multiple of 16
+ bne loop_neon_memcpy_mul_16
+
+ ldmfd sp!, {r4-r11, pc} @Reload the registers from SP
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Padding (luma block) at the left of a 2d array
+@*
+@* @par Description:
+@* The left column of a 2d array is replicated for pad_size times at the left
+@*
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pad_size
+@* integer -padding size of the array
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@#if PAD_LEFT_LUMA == C
+@void ih264_pad_left_luma(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ WORD32 ht,
+@ WORD32 pad_size)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_src
+@ r1 => src_strd
+@ r2 => ht
+@ r3 => pad_size
+
+
+ .global ih264_pad_left_luma_a9q
+
+ih264_pad_left_luma_a9q: @ replicate the left edge pixel of each row into pad_size (r3) columns on the left
+
+ stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments
+
+
+ sub r4, r0, r3 @ r4 = start of the pad area for the first row
+ sub r6, r1, #16 @ r6 = stride minus the 16 bytes already advanced by the first store
+ subs r5, r3, #16 @ pad_size 16 -> 16-wide loop, otherwise assume 32-wide loop
+ bne loop_32
+loop_16: @ /*hard coded for width=16 ,height =8,16*/
+ ldrb r8, [r0], r1
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8 @ broadcast the row's left edge pixel across 16 bytes
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4], r1 @ 16 bytes store
+ vdup.u8 q1, r9
+ vst1.8 {q1}, [r4], r1 @ 16 bytes store
+ ldrb r11, [r0], r1
+ vdup.u8 q2, r10
+ vdup.u8 q3, r11
+ vst1.8 {q2}, [r4], r1 @ 16 bytes store
+ ldrb r8, [r0], r1
+ vst1.8 {q3}, [r4], r1 @ 16 bytes store
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4], r1 @ 16 bytes store
+ vdup.u8 q1, r9
+ ldrb r11, [r0], r1
+ vst1.8 {q1}, [r4], r1 @ 16 bytes store
+ vdup.u8 q2, r10
+ vdup.u8 q3, r11
+ subs r2, r2, #8 @ 8 rows per iteration; assumes ht is a multiple of 8
+ vst1.8 {q2}, [r4], r1 @ 16 bytes store
+ vst1.8 {q3}, [r4], r1 @ 16 bytes store
+ bne loop_16
+ b end_func
+
+loop_32: @ /*hard coded for width=32 ,height =8,16*/
+ ldrb r8, [r0], r1
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8 @ broadcast the row's left edge pixel; stored twice for 32 bytes
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u8 q1, r9
+ vst1.8 {q0}, [r4], r6
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u8 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ ldrb r11, [r0], r1
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u8 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ ldrb r8, [r0], r1
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vdup.u8 q0, r8
+ ldrb r9, [r0], r1
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u8 q1, r9
+ vst1.8 {q0}, [r4], r6 @ 16 bytes store
+ ldrb r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u8 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u8 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ subs r2, r2, #8 @ 8 rows per iteration; assumes ht is a multiple of 8
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+ bne loop_32
+
+
+
+end_func:
+ ldmfd sp!, {r4-r11, pc} @Reload the registers from SP
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Padding (chroma block) at the left of a 2d array
+@*
+@* @par Description:
+@* The left column of a 2d array is replicated for pad_size times at the left
+@*
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array (each colour component)
+@*
+@* @param[in] pad_size
+@* integer -padding size of the array
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@#if PAD_LEFT_CHROMA == C
+@void ih264_pad_left_chroma(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ WORD32 ht,
+@ WORD32 pad_size)
+@{
+@ r0 => *pu1_src
+@ r1 => src_strd
+@ r2 => ht
+@ r3 => pad_size
+
+
+
+ .global ih264_pad_left_chroma_a9q
+
+ih264_pad_left_chroma_a9q: @ replicate the left edge UV pair of each row into pad_size (r3) columns on the left
+
+ stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments
+
+ sub r4, r0, r3 @ r4 = start of the pad area for the first row
+ sub r6, r1, #16 @ r6 = stride minus the 16 bytes already advanced by the first store
+
+
+loop_32_l_c: @ /*hard coded for width=32 ,height =4,8,12*/
+ ldrh r8, [r0], r1 @ load the interleaved UV pair at the row's left edge
+ ldrh r9, [r0], r1
+ vdup.u16 q0, r8 @ broadcast the UV pair across 16 bytes; stored twice for 32
+ ldrh r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u16 q1, r9
+ vst1.8 {q0}, [r4], r6 @ 16 bytes store
+ ldrh r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u16 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vdup.u16 q3, r11
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ subs r2, r2, #4 @ 4 rows per iteration
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+
+
+ beq end_func_l_c @/* Branching when ht=4*/
+
+ ldrh r8, [r0], r1
+ ldrh r9, [r0], r1
+ vdup.u16 q0, r8
+ ldrh r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u16 q1, r9
+ vst1.8 {q0}, [r4], r6
+ ldrh r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u16 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vdup.u16 q3, r11
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ subs r2, r2, #4 @ 4 rows per iteration
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+
+ beq end_func_l_c @/* Branching when ht=8*/
+ bne loop_32_l_c @ NOTE(review): beq+bne cover all flag states, so the block below is unreachable dead code
+
+ ldrh r8, [r0], r1
+ ldrh r9, [r0], r1
+ vdup.u16 q0, r8
+ ldrh r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store (unreachable)
+ vdup.u16 q1, r9
+ vst1.8 {q0}, [r4], r6
+ ldrh r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u16 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vdup.u16 q3, r11
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+
+end_func_l_c:
+ ldmfd sp!, {r4-r11, pc} @Reload the registers from SP
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Padding (luma block) at the right of a 2d array
+@*
+@* @par Description:
+@* The right column of a 2d array is replicated for pad_size times at the right
+@*
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @param[in] pad_size
+@* integer -padding size of the array
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@#if PAD_RIGHT_LUMA == C
+@void ih264_pad_right_luma(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ WORD32 ht,
+@ WORD32 pad_size)
+@{
+@ WORD32 row;
+@
+@ for(row = 0; row < ht; row++)
+@ {
+@ memset(pu1_src, *(pu1_src -1), pad_size);
+@
+@ pu1_src += src_strd;
+@ }
+@}
+@
+@ r0 => *pu1_src
+@ r1 => src_strd
+@ r2 => ht
+@ r3 => pad_size
+
+
+
+ .global ih264_pad_right_luma_a9q
+
+ih264_pad_right_luma_a9q: @ replicate the right edge pixel of each row into pad_size (r3) columns on the right
+
+ stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments
+
+ mov r4, r0 @ r4 = start of the pad area (pu1_src points just past the row)
+ sub r6, r1, #16 @ r6 = stride minus the 16 bytes already advanced by the first store
+ sub r0, r0, #1 @ r0 = the row's last real pixel
+ subs r5, r3, #16 @ pad_size 16 -> 16-wide loop, otherwise 32-wide loop
+ bne loop_32_r @ BUGFIX: was "bne loop_32", which is a label inside ih264_pad_left_luma_a9q and would run the LEFT-pad code for pad_size != 16
+loop_16_r: @ /*hard coded for width=16 ,height =8,16*/
+ ldrb r8, [r0], r1
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8 @ broadcast the row's right edge pixel across 16 bytes
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4], r1 @ 16 bytes store
+ vdup.u8 q1, r9
+ vst1.8 {q1}, [r4], r1 @ 16 bytes store
+ ldrb r11, [r0], r1
+ vdup.u8 q2, r10
+ vdup.u8 q3, r11
+ vst1.8 {q2}, [r4], r1 @ 16 bytes store
+ ldrb r8, [r0], r1
+ vst1.8 {q3}, [r4], r1 @ 16 bytes store
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4], r1 @ 16 bytes store
+ vdup.u8 q1, r9
+ ldrb r11, [r0], r1
+ vst1.8 {q1}, [r4], r1 @ 16 bytes store
+ vdup.u8 q2, r10
+ vdup.u8 q3, r11
+ subs r2, r2, #8 @ 8 rows per iteration; assumes ht is a multiple of 8
+ vst1.8 {q2}, [r4], r1 @ 16 bytes store
+ vst1.8 {q3}, [r4], r1 @ 16 bytes store
+ bne loop_16_r
+ b end_func_r
+
+loop_32_r: @ /*hard coded for width=32 ,height =8,16*/
+ ldrb r8, [r0], r1
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8 @ broadcast the right edge pixel; stored twice for 32 bytes
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u8 q1, r9
+ vst1.8 {q0}, [r4], r6
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u8 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ ldrb r11, [r0], r1
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u8 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ ldrb r8, [r0], r1
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ ldrb r9, [r0], r1
+ vdup.u8 q0, r8
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+ ldrb r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u8 q1, r9
+ vst1.8 {q0}, [r4], r6 @ 16 bytes store
+ ldrb r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u8 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u8 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ subs r2, r2, #8 @ 8 rows per iteration; assumes ht is a multiple of 8
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+ bne loop_32_r
+
+
+
+end_func_r:
+ ldmfd sp!, {r4-r11, pc} @Reload the registers from SP
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* Padding (chroma block) at the right of a 2d array
+@*
+@* @par Description:
+@* The right column of a 2d array is replicated for pad_size times at the right
+@*
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array (each colour component)
+@*
+@* @param[in] pad_size
+@* integer -padding size of the array
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@#if PAD_RIGHT_CHROMA == C
+@void ih264_pad_right_chroma(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ WORD32 ht,
+@ WORD32 pad_size)
+@ r0 => *pu1_src
+@ r1 => src_strd
+@ r2 => ht
+@ r3 => pad_size
+
+
+
+ .global ih264_pad_right_chroma_a9q
+
+ih264_pad_right_chroma_a9q: @ replicate the right edge UV pair of each row into pad_size (r3) columns on the right
+
+ stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments
+
+ mov r4, r0 @ r4 = start of the pad area (pu1_src points just past the row)
+ sub r6, r1, #16 @ r6 = stride minus the 16 bytes already advanced by the first store
+ sub r0, r0, #2 @ r0 = the row's last real UV pair
+loop_32_r_c: @ /*hard coded for width=32 ,height =8,4*/
+ ldrh r8, [r0], r1 @ load the interleaved UV pair at the row's right edge
+ ldrh r9, [r0], r1
+ vdup.u16 q0, r8 @ broadcast the UV pair across 16 bytes; stored twice for 32
+ ldrh r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u16 q1, r9
+ vst1.8 {q0}, [r4], r6
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u16 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ subs r2, r2, #4 @ 4 rows per iteration
+ ldrh r11, [r0], r1
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u16 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+
+ beq end_func_r_c @/* Branching when ht=4*/
+
+ ldrh r8, [r0], r1
+ vdup.u16 q0, r8
+ ldrh r9, [r0], r1
+ ldrh r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store
+ vdup.u16 q1, r9
+ vst1.8 {q0}, [r4], r6 @ 16 bytes store
+ ldrh r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u16 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u16 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ subs r2, r2, #4 @ 4 rows per iteration
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+
+ beq end_func_r_c @/* Branching when ht=8*/
+ bne loop_32_r_c @ NOTE(review): beq+bne cover all flag states, so the block below is unreachable dead code
+
+ ldrh r8, [r0], r1
+ vdup.u16 q0, r8
+ ldrh r9, [r0], r1
+ ldrh r10, [r0], r1
+ vst1.8 {q0}, [r4]! @ 16 bytes store (unreachable)
+ vdup.u16 q1, r9
+ vst1.8 {q0}, [r4], r6 @ 16 bytes store
+ ldrh r11, [r0], r1
+ vst1.8 {q1}, [r4]! @ 16 bytes store
+ vdup.u16 q2, r10
+ vst1.8 {q1}, [r4], r6 @ 16 bytes store
+ vst1.8 {q2}, [r4]! @ 16 bytes store
+ vdup.u16 q3, r11
+ vst1.8 {q2}, [r4], r6 @ 16 bytes store
+ vst1.8 {q3}, [r4]! @ 16 bytes store
+ vst1.8 {q3}, [r4], r6 @ 16 bytes store
+
+end_func_r_c:
+ ldmfd sp!, {r4-r11, pc} @Reload the registers from SP
+
+
+
+
+
diff --git a/common/arm/ih264_platform_macros.h b/common/arm/ih264_platform_macros.h
new file mode 100755
index 0000000..1f67403
--- /dev/null
+++ b/common/arm/ih264_platform_macros.h
@@ -0,0 +1,152 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_platform_macros.h
+*
+* @brief
+* Platform specific Macro definitions used in the codec
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef IH264_PLATFORM_MACROS_H_ /* was _IHEVC_PLATFORM_MACROS_H_: wrong codec name, a reserved identifier (C11 7.1.3), and would collide with libhevc's guard */
+#define IH264_PLATFORM_MACROS_H_
+
+#ifndef ARMV8
+void ih264_arm_dsb(void);
+
+#define DATA_SYNC() ih264_arm_dsb()
+static __inline WORD32 CLIP_U8(WORD32 x)
+{
+ asm("usat %0, #8, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_S8(WORD32 x)
+{
+ asm("ssat %0, #8, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_U10(WORD32 x)
+{
+ asm("usat %0, #10, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_S10(WORD32 x)
+{
+ asm("ssat %0, #10, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_U12(WORD32 x)
+{
+ asm("usat %0, #12, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_S12(WORD32 x)
+{
+ asm("ssat %0, #12, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_U16(WORD32 x)
+{
+ asm("usat %0, #16, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+static __inline WORD32 CLIP_S16(WORD32 x)
+{
+ asm("ssat %0, #16, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+
+static __inline UWORD32 ITT_BIG_ENDIAN(UWORD32 x)
+{
+ asm("rev %0, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+#else
+#define DATA_SYNC() ;
+
+#define CLIP_U8(x) CLIP3(0, 255, (x))
+#define CLIP_S8(x) CLIP3(-128, 127, (x))
+
+#define CLIP_U10(x) CLIP3(0, 1023, (x))
+#define CLIP_S10(x) CLIP3(-512, 511, (x))
+
+#define CLIP_U12(x) CLIP3(0, 4095, (x))
+#define CLIP_S12(x) CLIP3(-2048, 2047, (x))
+
+#define CLIP_U16(x) CLIP3(0, 65535, (x))
+#define CLIP_S16(x) CLIP3(-32768, 32767, (x))
+
+#define ITT_BIG_ENDIAN(x) ((((x) & 0x000000ff) << 24) | \
+ (((x) & 0x0000ff00) << 8) | \
+ (((x) & 0x00ff0000) >> 8) | \
+ (((UWORD32)(x)) >> 24)) /* fully parenthesized, trailing ';' removed so the macro works inside expressions */
+#endif
+
+#define SHL(x,y) (((y) < 32) ? ((x) << (y)) : 0)
+#define SHR(x,y) (((y) < 32) ? ((x) >> (y)) : 0)
+
+#define SHR_NEG(val,shift) (((shift) > 0) ? ((val) >> (shift)) : ((val) << (-(shift))))
+#define SHL_NEG(val,shift) (((shift) < 0) ? ((val) >> (-(shift))) : ((val) << (shift)))
+
+#define INLINE inline
+
+static INLINE UWORD32 CLZ(UWORD32 u4_word)
+{
+ if(u4_word)
+ return (__builtin_clz(u4_word));
+ else
+ return 32;
+}
+static INLINE UWORD32 CTZ(UWORD32 u4_word)
+{
+ if(0 == u4_word)
+ return 31; /* NOTE(review): CTZ(0) is conventionally 32 (cf. CLZ above) -- confirm callers rely on 31 */
+ else
+ {
+ unsigned int index;
+ index = __builtin_ctz(u4_word);
+ return (UWORD32)index;
+ }
+}
+
+
+#define NOP(nop_cnt) {UWORD32 nop_i; for (nop_i = 0; nop_i < (nop_cnt); nop_i++);}
+
+
+#define MEM_ALIGN8 __attribute__ ((aligned (8)))
+#define MEM_ALIGN16 __attribute__ ((aligned (16)))
+#define MEM_ALIGN32 __attribute__ ((aligned (32)))
+
+#endif /* IH264_PLATFORM_MACROS_H_ */
diff --git a/common/arm/ih264_resi_trans_a9.s b/common/arm/ih264_resi_trans_a9.s
new file mode 100755
index 0000000..08821f5
--- /dev/null
+++ b/common/arm/ih264_resi_trans_a9.s
@@ -0,0 +1,604 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@*******************************************************************************
+@* @file
+@* ih264_resi_trans_a9.s
+@*
+@* @brief
+@* Contains function definitions for residual and forward trans
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@* ih264_resi_trans_4x4_a9
+@* ih264_resi_trans_8x8_a9
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+
+
+.text
+.p2align 2
+@*****************************************************************************
+@*
+@* Function Name : ih264_resi_trans_4x4_a9
+@* Description : This function does cf4 of H264 followed by an approximate scaling
+@*
+@* Arguments :
+@ R0 :pointer to src buffer
+@ R1 :pointer to pred buffer
+@ R2 :pointer to dst buffer
+@ R3 :src_stride
+@ STACk :pred_stride,dst_stride
+
+@* Values Returned : NONE
+@*
+@* Register Usage :
+@* Stack Usage :
+@* Cycles : Around
+@* Interruptibility : Interruptible
+@*
+@* Known Limitations
+@* \Assumptions :
+@*
+@* Revision History :
+@* DD MM YYYY Author(s) Changes
+@* 30 12 2009 100633 First version
+@*
+@*****************************************************************************
+
+
+ .global ih264_resi_trans_4x4_a9
+ .extern g_scal_coff_h264_4x4
+g_scal_coff_h264_4x4_addr:
+ .long g_scal_coff_h264_4x4 - 4x4lbl - 8
+
+ih264_resi_trans_4x4_a9: @ 4x4 residual = src - pred, forward H.264 core transform, approximate scaling, 32-bit output
+
+ @R0 :pointer to src buffer
+ @R1 :pointer to pred buffer
+ @R2 :pointer to dst buffer
+ @R3 :src_stride
+ @STACk :pred_stride,dst_stride
+
+ push {r4-r12, lr} @push all the variables first @ NOTE(review): q4-q7 (d8-d15) are AAPCS callee-saved but are clobbered below without vpush/vpop -- confirm callers
+
+ mov r6, sp
+ add r6, r6, #40 @point r6 at the stacked arguments (10 pushed regs * 4 bytes)
+ ldmfd r6, {r4-r5} @load the strides into registers
+ @R4 pred_stride
+ @R5 dst_stride
+
+
+ @we have to give the stride as post inrement in VLDR1
+ @but since thr stride is from end of row 1 to start of row 2,
+ @we need to add the size of the curent row to strides ie we need to add 4 to it (4 bytes)
+ @ADD R3,#4
+ @ADD R4,#4
+ @ADD R5,#4
+ @in case of dst the stride represnts 16 bit ie 2*8bits
+ @hence we need to add #4 to it and thenm multiply by 2
+ @--------------------function loading done------------------------
+
+ @lets find residual
+ @data is like 1a -> d0[1:31] d0[32:64]
+ @ a b c d # # # #
+ vld1.u8 d30, [r0], r3 @load 4 pixels of row1 current buffer
+ vld1.u8 d31, [r1], r4 @load 4 pixels of row1 pred buffer
+ @ data is like 1a -> q4[1:63] q4[64:148]
+ @ d8[1:63] d9[1:63]
+ @ a b c d # # # #
+
+ vld1.u8 d28, [r0], r3 @load row 2 of src to d28[0]
+ vld1.u8 d29, [r1], r4 @load row2 of pred to d29[0]
+
+ vld1.u8 d26, [r0], r3 @load row 3 of src to d26[0]
+ vsubl.u8 q0, d30, d31 @curr - pred for row one
+
+ vld1.u8 d27, [r1], r4 @load row 3 of pred to d27[0]
+ vsubl.u8 q1, d28, d29 @find row 2 of src -pred to d0
+
+ vld1.u8 d24, [r0], r3 @load row 4 of src to d24[0]
+
+ vld1.u8 d25, [r1], r4 @load row 4 of pred to d25[0]
+ vsubl.u8 q2, d26, d27 @load src-pred row 3 to d[2]
+
+ lsl r5, r5, #2 @ multiply dst stride by 4 since we are storing 32 bit values
+ ldr r6, g_scal_coff_h264_4x4_addr @ PC-relative offset of the scaling table (literal holds table - 4x4lbl - 8)
+4x4lbl:
+ add r6, r6, pc @ load the address of global array @ NOTE(review): label name starts with a digit -- GNU as symbols conventionally start with a letter/._ -- confirm this assembles on all toolchains
+
+ vsubl.u8 q3, d24, d25 @load row 4 of src - pred to q6
+
+ @after this
+ @D0 -> 1a
+ @D2 -> 2a
+ @D4 -> 3a
+ @D6 -> 4a
+
+ @transpose the matrix so that we can do the horizontal transform first
+ @#1 #2 #3 #4
+ @a b c d ---- D0
+ @e f g h -----D2
+ @i j k l -----D4
+ @m n o p -----D6
+ @transpose the inner 2x2 blocks
+ vtrn.16 d0, d2
+ vld1.s16 {q10}, [r6]! @ load the scaling values 0-7;
+ vtrn.16 d4, d6
+ @a e c g
+ @b f d h
+ @i m k o
+ @j n l p
+ vtrn.32 d0, d4
+ vtrn.32 d2, d6
+ @a e i m #1 -- D0 --- x4
+ @b f j n #2 -- D2 --- x5
+ @c g k o #3 -- D4 ----x6
+ @d h l p #4 -- D6 ----x7
+
+ @we have loaded the residuals into the registers , now we need to add and subtract them
+ @let us do the horiz transform first
+
+ vsub.s16 d5, d2, d4 @x2 = x5-x6
+ vsub.s16 d7, d0, d6 @x3 = x4-x7;
+
+ vadd.s16 d3, d2, d4 @x1 = x5+x6
+ vadd.s16 d1, d0, d6 @x0 = x4+x7
+
+
+ vshl.s16 d31, d7, #1 @ U_SHIFT(x3,1,shft)
+ vshl.s16 d30, d5, #1 @ U_SHIFT(x2,1,shft)
+
+ vadd.s16 d0, d1, d3 @x0 + x1;
+ vsub.s16 d4, d1, d3 @x0 - x1;
+
+ vadd.s16 d2, d31, d5 @U_SHIFT(x3,1,shft) + x2;
+ vsub.s16 d6, d7, d30 @x3 - U_SHIFT(x2,1,shft);
+
+ @taking transpose again so as to do the vert transform
+ vtrn.16 d0, d2
+ vtrn.16 d4, d6
+
+ vtrn.32 d0, d4
+ vtrn.32 d2, d6
+
+ @let us do vertical transform
+ @same code as horiz
+
+ vadd.s16 d1, d0, d6 @x0 = x4+x7
+ vadd.s16 d3, d2, d4 @x1 = x5+x6
+ vsub.s16 d7, d0, d6 @x3 = x4-x7;
+ vsub.s16 d5, d2, d4 @x2 = x5-x6
+
+
+@Since we are going to do scal / quant or whatever, we are going to divide by
+@a 32 bit number. So we have to expand the values
+
+ @VADDL.S16 Q12,D1,D3;x0 + x1
+ @VSUBL.S16 Q14,D1,D3;x0 - x1
+
+ @VSHL.S16 D8,D5,#1;
+ @VSHL.S16 D9,D7,#1;
+
+ @VADDL.S16 Q13,D9,D5 ; + x2
+ @VSUBL.S16 Q15,D7,D8 ;x3 - U_SHIFT(x2,1,shft)
+
+@scaling follows
+
+@now we need to do the scaling,so load the scaling matrix
+@mutliplying by the scaling coeffient; store the results from q5-q8 ;
+
+ vadd.s16 d24, d3, d1 @x4 = x0 + x1
+ vsub.s16 d28, d1, d3 @x6 = x0 - x1
+
+ vshl.s16 d0, d7, #1 @ U_SHIFT(x3,1,shft)
+ vmull.s16 q4, d24, d20 @x4*s0
+
+ vshl.s16 d2, d5, #1 @ U_SHIFT(x2,1,shft)
+
+ vadd.s16 d26, d0, d5 @x5 = U_SHIFT(x3,1,shft) + x2
+ vmull.s16 q5, d26, d21 @x5*s1
+
+ vst1.s32 {q4}, [r2], r5 @save 4 pixels of row1 current buffer and increment pointer by stride
+
+ vld1.s16 {q10}, [r6] @load 8-16 scaling coeffcients
+
+ vsub.s16 d30, d7, d2 @x7 = x3 - U_SHIFT(x2,1,shft)
+
+ vmull.s16 q6, d28, d20 @x6*s2
+ vst1.s32 {q5}, [r2], r5 @save row 2
+
+ vmull.s16 q7, d30, d21 @x7*s3
+
+
+ vst1.s32 {q6}, [r2], r5 @save row 3
+ vst1.s32 {q7}, [r2] @save row 4
+
+ pop {r4-r12, pc} @pop back all variables
+
+
+
+
+@*****************************************************************************
+@* Function Name : ih264_resi_trans_8x8_a9
+@* Description : This function does cf8 followed by an approximate normalization of H264
+@*
+@* Arguments :
+@* R0 :pointer to src buffer
+@ R1 :pointer to pred buffer
+@ R2 :pointer to dst buffer
+@ R3 :src_stride
+@ STACk :pred_stride,dst_st
+@*
+@*
+@* Values Returned : NONE
+@*
+@* Register Usage :
+@* Stack Usage :
+@* Cycles : Around
+@* Interruptibility : Interruptible
+@*
+@* Known Limitations
+@* \Assumptions :
+@*
+@* Revision History :
+@* DD MM YYYY Author(s) Changes
+@* 30 12 2009 100633 First version
+@*
+@*****************************************************************************
+
+
+ .global ih264_resi_trans_8x8_a9
+ .extern g_scal_coff_h264_8x8
+g_scal_coff_h264_8x8_addr:
+ .long g_scal_coff_h264_8x8 - 8x8lbl - 8
+
+
+ih264_resi_trans_8x8_a9:
+
+ @R0 :pointer to src buffer
+ @R1 :pointer to pred buffer
+ @R2 :pointer to dst buffer
+ @R3 :src_stride
+ @STACk :pred_stride,dst_stride
+
+ push {r4-r12, lr} @push all the variables first
+
+ mov r6, sp
+ add r6, r6, #40 @decrement stack pointer,to accomodate two variables
+ ldmfd r6, {r4-r5} @load the strides into registers
+ @R4 pred_stride
+ @R5 dst_stride
+
+ @we have to give the stride as post inrement in vst1
+ @in case of dst the stride represnts 16 bit ie 2*8bits
+ @hence we need to add #4 to it and thenm multiply by 2
+ @--------------------function loading done------------------------
+
+ @lets find residual
+ @data is like 1a -> d0[1:31] d0[32:64]
+ @ a b c d # # # #
+ vld1.u8 d30, [r0], r3 @load 4 pixels of row1 current buffer
+ vld1.u8 d31, [r1], r4 @load 4 pixels of row1 pred buffer
+
+ vld1.u8 d28, [r0], r3 @src rw2
+ vld1.u8 d29, [r1], r4 @pred rw2
+ vsubl.u8 q0, d30, d31 @src-pred rw1
+
+ vld1.u8 d26, [r0], r3
+ vld1.u8 d27, [r1], r4
+ vsubl.u8 q1, d28, d29
+
+ vld1.u8 d24, [r0], r3
+ vld1.u8 d25, [r1], r4
+ vsubl.u8 q2, d26, d27
+
+ vld1.u8 d22, [r0], r3
+ vld1.u8 d23, [r1], r4
+ vsubl.u8 q3, d24, d25
+
+ vld1.u8 d20, [r0], r3
+ vld1.u8 d21, [r1], r4
+ vsubl.u8 q4, d22, d23
+
+ vld1.u8 d18, [r0], r3
+ vld1.u8 d19, [r1], r4
+ vsubl.u8 q5, d20, d21
+
+ vld1.u8 d16, [r0], r3
+ vld1.u8 d17, [r1], r4
+ vsubl.u8 q6, d18, d19
+
+ lsl r5, r5, #2
+
+
+ vsubl.u8 q7, d16, d17
+
+ @after this
+ @Q0 -> 1a
+ @Q1 -> 2a
+ @Q2 -> 3a
+ @Q3 -> 4a
+ @Q4 -> 5a
+ @Q5 -> 6a
+ @Q6 -> 7a
+ @Q7 -> 8a
+
+ @transpose the matrix so that we can do the horizontal transform first
+
+ @transpose the inner 2x2 blocks
+ vtrn.16 q0, q1
+ vtrn.16 q2, q3
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ @transpose the inner 4x4 blocks
+ vtrn.32 q0, q2
+ vtrn.32 q1, q3
+
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+
+ @transpose the outer 8x8 blocks
+ vswp d1, d8
+ vswp d7, d14
+ vswp d3, d10
+ vswp d5, d12
+ @transpose done
+
+@@this point we will have data in Q0-Q7
+@Q7 will be populated within 2 clock cycle
+@all others are availabe @ this clock cycle
+
+ @we have loaded the residuals into the registers , now we need to add and subtract them
+ @let us do the horiz transform first
+
+ vadd.s16 q8, q0, q7 @ a0 = r0 + r7;
+ vadd.s16 q9, q1, q6 @ a1 = r1 + r6;
+ vadd.s16 q10, q2, q5 @ a2 = r2 + r5;
+ vadd.s16 q11, q3, q4 @ a3 = r3 + r4;
+
+ vsub.s16 q12, q0, q7 @ b0 = r0 - r7;
+ vsub.s16 q13, q1, q6 @ b1 = r1 - r6;
+ vsub.s16 q15, q3, q4 @ b3 = r3 - r4;
+ vsub.s16 q14, q2, q5 @ b2 = r2 - r5;
+
+ vadd.s16 q1, q8, q11 @ a4 = a0 + a3;
+ vadd.s16 q3, q9, q10 @ a5 = a1 + a2;
+ vsub.s16 q7, q9, q10 @ a7 = a1 - a2;
+ vsub.s16 q5, q8, q11 @ a6 = a0 - a3;
+
+ ldr r6, g_scal_coff_h264_8x8_addr
+8x8lbl:
+ add r6, r6, pc @ load the address of global array
+
+ vadd.s16 q0, q1, q3 @ pi2_res[0] = a4 + a5;
+ vshr.s16 q8, q7, #1 @ pi2_res[2] = a6 + D_SHIFT(a7,1,shft);
+
+ vsub.s16 q4, q1, q3 @ pi2_res[4] = a4 - a5;
+
+ vadd.s16 q2, q5, q8 @
+
+
+ vshr.s16 q9, q5, #1 @ pi2_res[6] = D_SHIFT(a6,1,shft) - a7;
+ vsub.s16 q6, q9, q7 @
+
+@do not change Q0,Q2.Q4,Q6 they contain results
+@Q1,Q3,Q5,Q7 TO STORE RESULTS
+@Q8 Q9 Q10 Q11 USE @WILL
+
+ vshr.s16 q1, q12, #1 @ D_SHIFT(b0,1,shft)
+ vshr.s16 q3, q13, #1 @ D_SHIFT(b1,1,shft)
+ vshr.s16 q5, q14, #1 @ D_SHIFT(b2,1,shft)
+ vshr.s16 q7, q15, #1 @ D_SHIFT(b3,1,shft)
+
+ vadd.s16 q8, q1, q12 @ (D_SHIFT(b0,1,shft) + b0);
+ vadd.s16 q9, q3, q13 @ (D_SHIFT(b1,1,shft) + b1);
+ vadd.s16 q10, q5, q14 @ (D_SHIFT(b2,1,shft) + b2);
+ vadd.s16 q11, q7, q15 @ (D_SHIFT(b3,1,shft) + b3);
+
+ vadd.s16 q1, q14, q8 @ b2 + (D_SHIFT(b0,1,shft) + b0);
+ vsub.s16 q5, q15, q9 @ b3 - (D_SHIFT(b1,1,shft) + b1);
+ vadd.s16 q3, q15, q10 @ b3 + (D_SHIFT(b2,1,shft) + b2);
+ vsub.s16 q7, q11, q14 @ -b2 + (D_SHIFT(b3,1,shft) + b3);
+
+ vadd.s16 q8, q13, q1 @ b4 = b1 + b2 + (D_SHIFT(b0,1,shft) + b0);
+ vsub.s16 q9, q12, q3 @ b5 = b0 - b3 - (D_SHIFT(b2,1,shft) + b2);
+ vadd.s16 q10, q12, q5 @ b6 = b0 + b3 - (D_SHIFT(b1,1,shft) + b1);
+ vadd.s16 q11, q13, q7 @ b7 = b1 - b2 + (D_SHIFT(b3,1,shft) + b3);
+
+ vshr.s16 q15, q8, #2 @ D_SHIFT(b4,2,shft)
+ vshr.s16 q14, q9, #2 @ D_SHIFT(b5,2,shft);
+ vshr.s16 q13, q10, #2 @ D_SHIFT(b6,2,shft);
+ vshr.s16 q12, q11, #2 @ D_SHIFT(b7,2,shft);
+
+
+ vadd.s16 q3, q9, q13 @ pi2_res[3] = b5 + D_SHIFT(b6,2,shft);
+ vsub.s16 q5, q10, q14 @ pi2_res[5] = b6 - D_SHIFT(b5,2,shft);
+ vadd.s16 q1, q8, q12 @ pi2_res[1] = b4 + D_SHIFT(b7,2,shft);
+ vsub.s16 q7, q15, q11 @ pi2_res[7] = D_SHIFT(b4,2,shft) - b7;
+
+ @------------horiz transform done-------------------------
+ @results are in Q0-Q7
+ @all other neon registes can be used at will
+
+@doing vertical transform
+@code exact copy of horiz transform above
+
+ @transpose the inner 2x2 blocks
+ vtrn.16 q0, q1
+ vtrn.16 q2, q3
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ @transpose the inner 4x4 blocks
+ vtrn.32 q0, q2
+ vtrn.32 q1, q3
+
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+
+ @transpose the outer 8x8 blocks
+ vswp d1, d8
+ vswp d3, d10
+ vswp d5, d12
+ vswp d7, d14
+
+ @transpose done
+
+ vadd.s16 q8, q0, q7 @ a0 = r0 + r7;
+ vadd.s16 q9, q1, q6 @ a1 = r1 + r6;
+ vadd.s16 q10, q2, q5 @ a2 = r2 + r5;
+ vadd.s16 q11, q3, q4 @ a3 = r3 + r4;
+
+ vsub.s16 q12, q0, q7 @ b0 = r0 - r7;
+ vsub.s16 q13, q1, q6 @ b1 = r1 - r6;
+ vsub.s16 q14, q2, q5 @ b2 = r2 - r5;
+ vsub.s16 q15, q3, q4 @ b3 = r3 - r4;
+
+ vadd.s16 q1, q8, q11 @ a4 = a0 + a3;
+ vadd.s16 q3, q9, q10 @ a5 = a1 + a2;
+ vsub.s16 q5, q8, q11 @ a6 = a0 - a3;
+ vsub.s16 q7, q9, q10 @ a7 = a1 - a2;
+
+
+ vadd.s16 q0, q1, q3 @ pi2_res[0] = a4 + a5;
+
+ vshr.s16 q8, q7, #1 @ pi2_res[2] = a6 + D_SHIFT(a7,1,shft);
+ @DSHIFT_TO_0 Q8,Q7,#1,#0
+ vadd.s16 q2, q5, q8 @
+
+ vsub.s16 q4, q1, q3 @ pi2_res[4] = a4 - a5;
+
+ vshr.s16 q9, q5, #1 @ pi2_res[6] = D_SHIFT(a6,1,shft) - a7;
+ vsub.s16 q6, q9, q7 @
+
+@do not change Q0,Q2.Q4,Q6 they contain results
+@Q1,Q3,Q5,Q7 TO STORE RESULTS
+@Q8 Q9 Q10 Q11 USE @WILL
+
+ vshr.s16 q1, q12, #1 @ D_SHIFT(b0,1,shft)
+ vshr.s16 q3, q13, #1 @ D_SHIFT(b1,1,shft)
+ vshr.s16 q5, q14, #1 @ D_SHIFT(b2,1,shft)
+ vshr.s16 q7, q15, #1 @ D_SHIFT(b3,1,shft)
+
+
+ vadd.s16 q8, q1, q12 @ (D_SHIFT(b0,1,shft) + b0);
+ vadd.s16 q9, q3, q13 @ (D_SHIFT(b1,1,shft) + b1);
+ vadd.s16 q10, q5, q14 @ (D_SHIFT(b2,1,shft) + b2);
+ vadd.s16 q11, q7, q15 @ (D_SHIFT(b3,1,shft) + b3);
+
+ vadd.s16 q1, q14, q8 @ b2 + (D_SHIFT(b0,1,shft) + b0);
+ vadd.s16 q3, q15, q10 @ b3 + (D_SHIFT(b2,1,shft) + b2);
+ vsub.s16 q5, q15, q9 @ b3 - (D_SHIFT(b1,1,shft) + b1);
+ vsub.s16 q7, q11, q14 @ -b2 + (D_SHIFT(b3,1,shft) + b3);
+
+ vadd.s16 q8, q13, q1 @ b4 = b1 + b2 + (D_SHIFT(b0,1,shft) + b0);
+ vsub.s16 q9, q12, q3 @ b5 = b0 - b3 - (D_SHIFT(b2,1,shft) + b2);
+ vadd.s16 q10, q12, q5 @ b6 = b0 + b3 - (D_SHIFT(b1,1,shft) + b1);
+ vadd.s16 q11, q13, q7 @ b7 = b1 - b2 + (D_SHIFT(b3,1,shft) + b3);
+
+ vshr.s16 q15, q8, #2 @ D_SHIFT(b4,2,shft)
+ vshr.s16 q14, q9, #2 @ D_SHIFT(b5,2,shft);
+ vshr.s16 q13, q10, #2 @ D_SHIFT(b6,2,shft);
+ vshr.s16 q12, q11, #2 @ D_SHIFT(b7,2,shft);
+
+
+@since we are going to scal by small values, we need not expand the guys to 32 bit bit values
+ vsub.s16 q5, q10, q14 @ pi2_res[5] = b6 - D_SHIFT(b5,2,shft);
+ vsub.s16 q7, q15, q11 @ pi2_res[7] = D_SHIFT(b4,2,shft) - b7;
+ vadd.s16 q3, q9, q13 @ pi2_res[3] = b5 + D_SHIFT(b6,2,shft);
+ vadd.s16 q1, q8, q12 @ pi2_res[1] = b4 + D_SHIFT(b7,2,shft);
+
+ @------------vert transform done-------------------------
+ @results are in Q0-Q7
+ @all other neon registes can be used at will
+
+ @scaling
+ @since the 8x8 scaling matrix repeats in 1x4,1x4 block ,
+ @we need only load 4 values for each row and in total 4 rows
+ vld1.s16 {q14-q15}, [r6] @
+
+ @since we need to get a 32 bit o/p for two 16 bit multiplications
+ @we need a VMULL instruction
+@-----------------------------first and second row
+
+ vmull.s16 q8, d0, d28 @scale the first row first 4 elem
+ vmull.s16 q9, d28, d1 @scale the second row last 4 elemts
+
+ vmull.s16 q10, d2, d29 @ scale second row first 4 elem
+ vmull.s16 q11, d29, d3 @scale the second row last 4 elem
+ vmull.s16 q12, d4, d30 @scale third row first 4 elem
+
+ vst1.s32 {q8, q9}, [r2], r5 @ write the first row complete
+
+ vmull.s16 q13, d30, d5 @scale the third row last 4 elem
+ vmull.s16 q8, d6, d31 @scale the fourth row first 4 elem
+
+
+ vst1.s32 {q10, q11}, [r2], r5 @store the second row complete
+
+@------------------------------- 3rd and 4th row
+
+ vmull.s16 q9, d31, d7 @scale the fourth row second column
+
+ vst1.s32 {q12, q13}, [r2], r5 @store the third row complete
+
+ vmull.s16 q10, d8, d28 @scale the 5th row fisrst 4 elms
+ vmull.s16 q11, d28, d9 @scale the 5th row second 4 elems
+
+ vmull.s16 q12, d10, d29 @scale the 6th row first4 elements
+
+
+ vst1.s32 {q8, q9}, [r2], r5 @store fifth row
+
+@--------------------------------5th and 6th row
+
+ vmull.s16 q13, d29, d11 @scale 6th row sendond 4 elems
+
+ vmull.s16 q8, d12, d30 @scale 7th rw first 4 elms
+
+ vst1.s32 {q10, q11}, [r2], r5 @store 6th row second 4 elements
+
+ vmull.s16 q9, d30, d13 @scale 7th rw second 4 elms
+ vmull.s16 q10, d14, d31 @scale 8th rw forst 4 elms
+
+
+ vst1.s32 {q12, q13}, [r2], r5 @store 6th row
+
+@----------------------------------7th and 8th row
+ vmull.s16 q11, d31, d15 @scale 8th row second 4 elms
+
+ vst1.s32 {q8, q9}, [r2], r5 @store 7th row
+ vst1.s32 {q10, q11}, [r2], r5 @store 8th row
+
+@----------------------------------done writing
+
+ pop {r4-r12, pc} @pop back all variables
+
+
+
+
+
+
diff --git a/common/arm/ih264_resi_trans_quant_a9.s b/common/arm/ih264_resi_trans_quant_a9.s
new file mode 100755
index 0000000..caf362e
--- /dev/null
+++ b/common/arm/ih264_resi_trans_quant_a9.s
@@ -0,0 +1,694 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@*******************************************************************************
+@* @file
+@* ih264_resi_trans_quant_a9.s
+@*
+@* @brief
+@* Contains function definitions for residual computation, forward transform
+@* and quantization
+@*
+@* @author
+@* Ittiam
+@*
+@* @par List of Functions:
+@* ih264_resi_trans_quant_4x4_a9
+@* ih264_resi_trans_quant_8x8_a9
+@* ih264_resi_trans_quant_chroma_4x4_a9
+@* ih264_hadamard_quant_4x4_a9
+@* ih264_hadamard_quant_2x2_uv_a9
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+
+
+.text
+.p2align 2
+@*****************************************************************************
+@*
+@* Function Name : ih264_resi_trans_quant_4x4_a9
+@* Description : This function does cf4 of H264
+@*
+@* Arguments : R0 :pointer to src buffer
+@ R1 :pointer to pred buffer
+@ R2 :pointer to dst buffer
+@ R3 :source stride
+@ STACK : pred stride,
+@ dst stride,
+@ pointer to scaling matrix,
+@ pointer to threshold matrix,
+@ qbits,
+@ rounding factor,
+@ pointer to store nnz
+@ pointer to store non quantized dc value
+@ Values Returned : NONE
+@
+@ Register Usage :
+@ Stack Usage : 40 bytes
+@ Cycles : Around
+@ Interruptibility : Interruptible
+@
+@ Known Limitations
+@ \Assumptions :
+@
+@ Revision History :
+@ DD MM YYYY Author(s) Changes
+@ 1 12 2013 100633 First version
+@ 20 1 2014 100633 Changes the API, Optimization
+@
+@*****************************************************************************
+
+ .global ih264_resi_trans_quant_4x4_a9
+ih264_resi_trans_quant_4x4_a9:
+
+ @R0 :pointer to src buffer
+ @R1 :pointer to pred buffer
+ @R2 :pointer to dst buffer
+ @R3 :Source stride
+ @STACK :pred stride
+ @ :scale matrix,
+ @ :threshold matrix
+ @ :qbits
+ @ :round factor
+ @ :nnz
+
+ push {r4-r12, lr} @push all the variables first
+
+ add r11, sp, #40 @r11 points to the stack arguments (past the 10 saved registers = 40 bytes)
+ ldmfd r11, {r4-r10} @load the strides into registers
+
+ @R0 :pointer to src buffer
+ @R1 :pointer to pred buffer
+ @R2 :pointer to dst buffer
+ @R3 :Source stride
+ @R4 :Pred stride
+ @R5 :scale matrix,
+ @R6 :threshold matrix
+ @R7 :qbits
+ @R8 :round factor
+ @R9 :nnz
+
+ vpush {d8-d15}
+
+ mov r11, #0
+ sub r7, r11, r7 @Negate the qbit value for using LSL
+
+ @------------Function Loading done----------------;
+
+ vld1.u8 d30, [r0], r3 @load first 8 pix src row 1
+
+ vld1.u8 d31, [r1], r4 @load first 8 pix pred row 1
+
+ vld1.u8 d28, [r0], r3 @load first 8 pix src row 2
+
+ vld1.u8 d29, [r1], r4 @load first 8 pix pred row 2
+
+ vld1.u8 d26, [r0], r3 @load first 8 pix src row 3
+
+ vld1.u8 d27, [r1], r4 @load first 8 pix pred row 3
+ vsubl.u8 q0, d30, d31 @find residue row 1
+
+ vld1.u8 d24, [r0], r3 @load first 8 pix src row 4
+
+ vld1.u8 d25, [r1], r4 @load first 8 pix pred row 4
+ vsubl.u8 q1, d28, d29 @find residue row 2
+
+ vsubl.u8 q2, d26, d27 @find residue row 3
+ vsubl.u8 q3, d24, d25 @find residue row 4
+
+ vtrn.16 d0, d2 @T12
+ vtrn.16 d4, d6 @T23
+ vtrn.32 d0, d4 @T13
+ vtrn.32 d2, d6 @T14
+
+ vadd.s16 d8 , d0, d6 @x0 = x4+x7
+ vadd.s16 d9 , d2, d4 @x1 = x5+x6
+ vsub.s16 d10, d2, d4 @x2 = x5-x6
+ vsub.s16 d11, d0, d6 @x3 = x4-x7
+
+ vshl.s16 d12, d10, #1 @U_SHIFT(x2,1,shft)
+ vshl.s16 d13, d11, #1 @U_SHIFT(x3,1,shft)
+
+ vadd.s16 d14, d8, d9 @x4 = x0 + x1;
+ vsub.s16 d16, d8, d9 @x6 = x0 - x1;
+ vadd.s16 d15, d13, d10 @x5 = U_SHIFT(x3,1,shft) + x2;
+ vsub.s16 d17, d11, d12 @x7 = x3 - U_SHIFT(x2,1,shft);
+
+ @taking transpose again so as to make do vert transform
+ vtrn.16 d14, d15 @T12
+ vtrn.16 d16, d17 @T23
+ vtrn.32 d14, d16 @T13
+ vtrn.32 d15, d17 @T24
+
+ @let us do vertical transform
+ @same code as horiz
+ vadd.s16 d18, d14, d17 @x0 = x4+x7
+ vadd.s16 d19, d15, d16 @x1 = x5+x6
+ vsub.s16 d20, d15, d16 @x2 = x5-x6
+ vsub.s16 d21, d14, d17 @x3 = x4-x7
+
+ vshl.s16 d22, d20, #1 @U_SHIFT(x2,1,shft)
+ vshl.s16 d23, d21, #1 @U_SHIFT(x3,1,shft)
+
+ vdup.s32 q4, r8 @Load rounding value row 1
+
+ vadd.s16 d24, d18, d19 @x5 = x0 + x1;
+ vsub.s16 d26, d18, d19 @x7 = x0 - x1;
+ vadd.s16 d25, d23, d20 @x6 = U_SHIFT(x3,1,shft) + x2;
+ vsub.s16 d27, d21, d22 @x8 = x3 - U_SHIFT(x2,1,shft);
+ vdup.s32 q10, r7 @Load negated qbit value (right shift via vshl)
+
+ vst1.s16 d24[0], [r10] @Store the dc value to alternate dc address
+
+@core tranform is done for 4x8 block 1
+ vld1.s16 {q14-q15}, [r5] @load the scaling values
+
+ vabs.s16 q0, q12 @Abs val of rows 1 and 2
+
+ vabs.s16 q1, q13 @Abs val of rows 3 and 4
+
+ vmov.s32 q5, q4 @copy round fact for row 2
+
+ vmov.s32 q6, q4 @copy round fact for row 3
+ vclt.s16 q2, q12, #0 @Get the sign of rows 1 and 2
+
+ vmov.s32 q7, q4 @copy round fact for row 4
+ vclt.s16 q3, q13, #0 @Get the sign of rows 3 and 4
+
+ vmlal.s16 q4, d0, d28 @Multiply and add row 1
+ vmlal.s16 q5, d1, d29 @Multiply and add row 2
+ vmlal.s16 q6, d2, d30 @Multiply and add row 3
+ vmlal.s16 q7, d3, d31 @Multiply and add row 4
+
+ vshl.s32 q11, q4, q10 @Shift row 1
+ vshl.s32 q12, q5, q10 @Shift row 2
+ vshl.s32 q13, q6, q10 @Shift row 3
+ vshl.s32 q14, q7, q10 @Shift row 4
+
+ vmovn.s32 d30, q11 @Narrow row 1
+ vmovn.s32 d31, q12 @Narrow row 2
+ vmovn.s32 d0 , q13 @Narrow row 3
+ vmovn.s32 d1 , q14 @Narrow row 4
+
+ vneg.s16 q1, q15 @Get negative
+ vneg.s16 q4, q0 @Get negative
+
+ vceq.s16 q5, q15, #0 @I compare with zero rows 1 and 2
+ vceq.s16 q6, q0 , #0 @I compare with zero rows 3 and 4
+
+ vbsl.s16 q2, q1, q15 @Restore sign of row 1 and 2
+ vbsl.s16 q3, q4, q0 @Restore sign of row 3 and 4
+
+
+ vmovn.u16 d14, q5 @I Narrow the comparison for rows 1 and 2
+ vmovn.u16 d15, q6 @I Narrow the comparison for rows 3 and 4
+
+ vshr.u8 q8, q7, #7 @I Reduce comparison bit to a single bit for all rows [ keep the value for later use ]
+
+ vpadd.u8 d18, d16, d17 @I pair add nnz 1
+ vpadd.u8 d20, d18, d19 @I Pair add nnz 2
+ vpadd.u8 d22, d20, d21 @I Pair add nnz 3
+ vpadd.u8 d24, d22, d23 @I Pair add nnz4
+ vst1.s16 {q2-q3}, [r2] @Store blk
+
+ vmov.u8 d25, #16 @I Get max nnz
+ vsub.u8 d26, d25, d24 @I invert current nnz
+
+ vst1.u8 d26[0], [r9] @I Write nnz
+
+ vpop {d8-d15}
+ pop {r4-r12, pc}
+
+
+
+@*****************************************************************************
+@*
+@* Function Name : ih264_resi_trans_quant_chroma_4x4_a9
+@* Description : This function does residue calculation, forward transform
+@* and quantization for 4x4 chroma block.
+@*
+@* Arguments : R0 :pointer to src buffer
+@ R1 :pointer to pred buffer
+@ R2 :pointer to dst buffer
+@ R3 :source stride
+@ STACK : pred stride,
+@ dst stride,
+@ pointer to scaling matrix,
+@ pointer to threshold matrix,
+@ qbits,
+@ rounding factor,
+@ pointer to store nnz
+@ pointer to store unquantized dc values
+@ Values Returned : NONE
+@
+@ Register Usage :
+@ Stack Usage : 40 bytes
+@ Cycles : Around
+@ Interruptibility : Interruptible
+@
+@ Known Limitations
+@ \Assumptions :
+@
+@ Revision History :
+@ DD MM YYYY Author(s) Changes
+@ 11 2 2015 100664 First version
+@
+@*****************************************************************************
+
+ .global ih264_resi_trans_quant_chroma_4x4_a9
+ih264_resi_trans_quant_chroma_4x4_a9:
+
+ @R0 :pointer to src buffer
+ @R1 :pointer to pred buffer
+ @R2 :pointer to dst buffer
+ @R3 :Source stride
+ @STACK :pred stride
+ @ :scale matrix,
+ @ :threshold matrix
+ @ :qbits
+ @ :round factor
+ @ :nnz
+ @ :pu1_dc_alt_addr
+ push {r4-r12, lr} @push all the variables first
+
+ add r11, sp, #40 @r11 points to the stack arguments (past the 10 saved registers = 40 bytes)
+ ldmfd r11, {r4-r10} @load the strides into registers
+
+ @R0 :pointer to src buffer
+ @R1 :pointer to pred buffer
+ @R2 :pointer to dst buffer
+ @R3 :Source stride
+ @R4 :Pred stride
+ @R5 :scale matrix,
+ @R6 :threshold matrix
+ @R7 :qbits
+ @R8 :round factor
+ @R9 :nnz
+ vpush {d8-d15}
+ mov r11, #0
+ sub r7, r11, r7 @Negate the qbit value for using LSL
+
+ @------------Function Loading done----------------;
+
+ @vld2 de-interleaves the two chroma planes (even bytes into the first
+ @register, odd into the second); each pred load deliberately reuses the
+ @src's second register, so the subtracts below operate on a single plane.
+ @NOTE(review): presumably the even bytes are the plane being coded --
+ @confirm against the caller's src/pred pointers.
+ vld2.u8 {d10, d11}, [r0], r3 @load first 8 pix src row 1
+
+ vld2.u8 {d11, d12}, [r1], r4 @load first 8 pix pred row 1
+
+ vld2.u8 {d28, d29}, [r0], r3 @load first 8 pix src row 2
+
+ vld2.u8 {d29, d30}, [r1], r4 @load first 8 pix pred row 2
+
+ vld2.u8 {d25, d26}, [r0], r3 @load first 8 pix src row 3
+
+ vld2.u8 {d26, d27}, [r1], r4 @load first 8 pix pred row 3
+ vsubl.u8 q0, d10, d11 @find residue row 1
+
+ vld2.u8 {d22, d23}, [r0], r3 @load first 8 pix src row 4
+
+ vld2.u8 {d23, d24}, [r1], r4 @load first 8 pix pred row 4
+ vsubl.u8 q1, d28, d29 @find residue row 2
+
+ vsubl.u8 q2, d25, d26 @find residue row 3
+ vsubl.u8 q3, d22, d23 @find residue row 4
+
+ vtrn.16 d0, d2 @T12
+ vtrn.16 d4, d6 @T23
+ vtrn.32 d0, d4 @T13
+ vtrn.32 d2, d6 @T14
+
+ vadd.s16 d8 , d0, d6 @x0 = x4+x7
+ vadd.s16 d9 , d2, d4 @x1 = x5+x6
+ vsub.s16 d10, d2, d4 @x2 = x5-x6
+ vsub.s16 d11, d0, d6 @x3 = x4-x7
+
+ vshl.s16 d12, d10, #1 @U_SHIFT(x2,1,shft)
+ vshl.s16 d13, d11, #1 @U_SHIFT(x3,1,shft)
+
+ vadd.s16 d14, d8, d9 @x4 = x0 + x1;
+ vsub.s16 d16, d8, d9 @x6 = x0 - x1;
+ vadd.s16 d15, d13, d10 @x5 = U_SHIFT(x3,1,shft) + x2;
+ vsub.s16 d17, d11, d12 @x7 = x3 - U_SHIFT(x2,1,shft);
+
+ @taking transpose again so as to make do vert transform
+ vtrn.16 d14, d15 @T12
+ vtrn.16 d16, d17 @T23
+ vtrn.32 d14, d16 @T13
+ vtrn.32 d15, d17 @T24
+
+ @let us do vertical transform
+ @same code as horiz
+ vadd.s16 d18, d14, d17 @x0 = x4+x7
+ vadd.s16 d19, d15, d16 @x1 = x5+x6
+ vsub.s16 d20, d15, d16 @x2 = x5-x6
+ vsub.s16 d21, d14, d17 @x3 = x4-x7
+
+ vshl.s16 d22, d20, #1 @U_SHIFT(x2,1,shft)
+ vshl.s16 d23, d21, #1 @U_SHIFT(x3,1,shft)
+
+ vdup.s32 q4, r8 @Load rounding value row 1
+
+ vadd.s16 d24, d18, d19 @x5 = x0 + x1;
+ vsub.s16 d26, d18, d19 @x7 = x0 - x1;
+ vadd.s16 d25, d23, d20 @x6 = U_SHIFT(x3,1,shft) + x2;
+ vsub.s16 d27, d21, d22 @x8 = x3 - U_SHIFT(x2,1,shft);
+ vdup.s32 q10, r7 @Load negated qbit value (right shift via vshl)
+
+ vst1.s16 d24[0], [r10] @Store Unquantized dc value to alternate dc address
+
+@core tranform is done for 4x8 block 1
+ vld1.s16 {q14-q15}, [r5] @load the scaling values
+
+ vabs.s16 q0, q12 @Abs val of rows 1 and 2
+
+ vabs.s16 q1, q13 @Abs val of rows 3 and 4
+
+ vmov.s32 q5, q4 @copy round fact for row 2
+
+ vmov.s32 q6, q4 @copy round fact for row 3
+ vclt.s16 q2, q12, #0 @Get the sign of rows 1 and 2
+
+ vmov.s32 q7, q4 @copy round fact for row 4
+ vclt.s16 q3, q13, #0 @Get the sign of rows 3 and 4
+
+ vmlal.s16 q4, d0, d28 @Multiply and add row 1
+ vmlal.s16 q5, d1, d29 @Multiply and add row 2
+ vmlal.s16 q6, d2, d30 @Multiply and add row 3
+ vmlal.s16 q7, d3, d31 @Multiply and add row 4
+
+ vshl.s32 q11, q4, q10 @Shift row 1
+ vshl.s32 q12, q5, q10 @Shift row 2
+ vshl.s32 q13, q6, q10 @Shift row 3
+ vshl.s32 q14, q7, q10 @Shift row 4
+
+ vmovn.s32 d30, q11 @Narrow row 1
+ vmovn.s32 d31, q12 @Narrow row 2
+ vmovn.s32 d0 , q13 @Narrow row 3
+ vmovn.s32 d1 , q14 @Narrow row 4
+
+ vneg.s16 q1, q15 @Get negative
+ vneg.s16 q4, q0 @Get negative
+
+ vceq.s16 q5, q15, #0 @I compare with zero rows 1 and 2
+ vceq.s16 q6, q0 , #0 @I compare with zero rows 3 and 4
+
+ vbsl.s16 q2, q1, q15 @Restore sign of row 1 and 2
+ vbsl.s16 q3, q4, q0 @Restore sign of row 3 and 4
+
+ vmovn.u16 d14, q5 @I Narrow the comparison for rows 1 and 2
+ vmovn.u16 d15, q6 @I Narrow the comparison for rows 3 and 4
+
+ vshr.u8 q8, q7, #7 @I Reduce comparison bit to a single bit for all rows [ keep the value for later use ]
+
+ vpadd.u8 d18, d16, d17 @I pair add nnz 1
+ vpadd.u8 d20, d18, d19 @I Pair add nnz 2
+ vpadd.u8 d22, d20, d21 @I Pair add nnz 3
+ vpadd.u8 d24, d22, d23 @I Pair add nnz4
+ vst1.s16 {q2-q3}, [r2] @Store blk
+
+ vmov.u8 d25, #16 @I Get max nnz
+ vsub.u8 d26, d25, d24 @I invert current nnz
+
+ vst1.u8 d26[0], [r9] @I Write nnz
+
+ vpop {d8-d15}
+ pop {r4-r12, pc}
+
+
+
+@*****************************************************************************
+@*
+@* Function Name : ih264_hadamard_quant_4x4_a9
+@* Description : This function does forward hadamard transform and
+@* quantization for luma dc block
+@*
+@* Arguments : R0 :pointer to src buffer
+@ R1 :pointer to dst buffer
+@ R2 :pu2_scale_matrix
+@ R3 :pu2_threshold_matrix
+@ STACK : u4_qbits
+@ u4_round_factor
+@ pu1_nnz
+@ Values Returned : NONE
+@
+@ Register Usage :
+@ Stack Usage : 0 bytes
+@ Cycles : Around
+@ Interruptibility : Interruptible
+@
+@ Known Limitations
+@ \Assumptions :
+@
+@ Revision History :
+@ DD MM YYYY Author(s) Changes
+@ 20 2 2015 100633 First version
+@
+@*****************************************************************************
+@ih264_hadamard_quant_4x4_a9(WORD16 *pi2_src, WORD16 *pi2_dst,
+@ const UWORD16 *pu2_scale_matrix,
+@ const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
+@ UWORD32 u4_round_factor,UWORD8 *pu1_nnz
+@ )
+ .global ih264_hadamard_quant_4x4_a9
+ih264_hadamard_quant_4x4_a9:
+
+@Registert usage
+@ r0 : src
+@ r1 : dst
+@ r2 : *pu2_scale_matrix
+@ r3 : *pu2_threshold_matrix
+
+ vld4.s16 {d0, d1, d2, d3}, [r0]! @Load 4x4 block
+ vpush {d8-d15}
+
+ vld1.u16 d30[0], [r2] @load pu2_scale_matrix[0]
+
+ vaddl.s16 q3, d0, d3 @x0 = x4 + x7;
+ vaddl.s16 q4, d1, d2 @x1 = x5 + x6;
+ vsubl.s16 q5, d1, d2 @x2 = x5 - x6;
+ vsubl.s16 q6, d0, d3 @x3 = x4 - x7;
+
+ vdup.u16 d30, d30[0] @pu2_scale_matrix[0]
+
+ vadd.s32 q7, q3, q4 @pi2_dst[0] = x0 + x1;
+ vadd.s32 q8, q6, q5 @pi2_dst[1] = x3 + x2;
+ add r3, sp, #68 @Get address of u4_round_factor (stack arg, past 64 bytes of vpush)
+ vsub.s32 q9, q3, q4 @pi2_dst[2] = x0 - x1;
+ vsub.s32 q10, q6, q5 @pi2_dst[3] = x3 - x2;
+
+ vtrn.s32 q7, q8 @transpose 4x4 block
+ vtrn.s32 q9, q10
+ vld1.s32 d0[0], [r3] @load u4_round_factor
+ vswp d15, d18
+ vswp d17, d20
+
+ add r3, sp, #64 @Get address of u4_qbits
+ vadd.s32 q11, q7, q10 @x0 = x4 + x7;
+ vadd.s32 q12, q8, q9 @x1 = x5 + x6;
+ vld1.s32 d31[0], [r3] @load u4_qbits
+ vsub.s32 q13, q8, q9 @x2 = x5 - x6;
+ vsub.s32 q14, q7, q10 @x3 = x4 - x7;
+
+ vdup.s32 q7, d0[0] @u4_round_factor
+
+ vadd.s32 q0, q11, q12 @(x0 + x1)
+ vadd.s32 q1, q14, q13 @(x3 + x2)
+ vsub.s32 q2, q11, q12 @(x0 - x1)
+ vsub.s32 q3, q14, q13 @(x3 - x2)
+
+ vdup.s32 q11, d31[0] @u4_qbits
+
+ vshrn.s32 d0, q0, #1 @i4_value = (x0 + x1) >> 1;
+ vshrn.s32 d1, q1, #1 @i4_value = (x3 + x2) >> 1;
+ vshrn.s32 d2, q2, #1 @i4_value = (x0 - x1) >> 1;
+ vshrn.s32 d3, q3, #1 @i4_value = (x3 - x2) >> 1;
+
+ vabs.s16 q5, q0
+ vabs.s16 q6, q1
+
+ vmov.s32 q8, q7 @Get the round fact
+ vmov.s32 q9, q7
+ vmov.s32 q10, q7
+
+ vclt.s16 q3, q0, #0 @get the sign row 1,2
+ vclt.s16 q4, q1, #0
+
+ vneg.s32 q11, q11 @negate u4_qbits so the vshl below performs a right shift
+
+ vmlal.u16 q7, d10, d30
+ vmlal.u16 q8, d11, d30
+ vmlal.u16 q9, d12, d30
+ vmlal.u16 q10, d13, d30
+
+ vshl.u32 q7, q7, q11
+ vshl.u32 q8, q8, q11
+ vshl.u32 q9, q9, q11
+ vshl.u32 q10, q10, q11
+
+ vqmovn.u32 d22, q7
+ vqmovn.u32 d23, q8
+ vqmovn.u32 d24, q9
+ vqmovn.u32 d25, q10
+
+ vneg.s16 q13, q11
+ vneg.s16 q14, q12
+
+ vbsl.s16 q3, q13, q11
+ vbsl.s16 q4, q14, q12
+
+ vceq.s16 q5, q11, #0
+ vceq.s16 q6, q12, #0
+
+ vst1.s16 {q3}, [r1]!
+
+ vshrn.u16 d14, q5, #8
+ vshrn.u16 d15, q6, #8
+
+ ldr r3, [sp, #72] @Load *pu1_nnz
+
+ vshr.u8 q7, q7, #7
+
+ vst1.s16 {q4}, [r1]!
+
+ vadd.u8 d16, d14, d15
+ vmov.u8 d20, #16
+ vpadd.u8 d17, d16, d16
+ vpadd.u8 d18, d17, d17
+ vpadd.u8 d19, d18, d18
+ vsub.u8 d20, d20, d19
+ vst1.u8 d20[0], [r3]
+
+ vpop {d8-d15}
+ bx lr
+
+
+
+
+@*****************************************************************************
+@*
+@* Function Name : ih264_hadamard_quant_2x2_uv_a9
+@* Description : This function does forward hadamard transform and
+@* quantization for dc block of chroma for both planes
+@*
+@* Arguments : R0 :pointer to src buffer
+@ R1 :pointer to dst buffer
+@ R2 :pu2_scale_matrix
+@ R3 :pu2_threshold_matrix
+@ STACK : u4_qbits
+@ u4_round_factor
+@ pu1_nnz
+@ Values Returned : NONE
+@
+@ Register Usage :
+@ Stack Usage : 0 bytes
+@ Cycles : Around
+@ Interruptibility : Interruptible
+@
+@ Known Limitations
+@ \Assumptions :
+@
+@ Revision History :
+@ DD MM YYYY Author(s) Changes
+@ 20 2 2015 100633 First version
+@
+@*****************************************************************************
+@ ih264_hadamard_quant_2x2_uv_a9(WORD16 *pi2_src, WORD16 *pi2_dst,
+@ const UWORD16 *pu2_scale_matrix,
+@ const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
+@ UWORD32 u4_round_factor,UWORD8 *pu1_nnz
+@ )
+
+ .global ih264_hadamard_quant_2x2_uv_a9
+ih264_hadamard_quant_2x2_uv_a9:
+
+ vpush {d8-d15}
+ vld2.s16 {d0-d1}, [r0] @load src
+
+ add r3, sp, #68 @Get address of u4_round_factor (stack arg, past 64 bytes of vpush)
+
+ vaddl.s16 q3, d0, d1 @x0 = x4 + x5;, x2 = x6 + x7;
+ vld1.u16 d30[0], [r2] @load pu2_scale_matrix[0]
+ vsubl.s16 q4, d0, d1 @x1 = x4 - x5; x3 = x6 - x7;
+
+ add r0, sp, #64 @Get address of u4_qbits
+ vld1.s32 d28[0], [r3] @load u4_round_factor
+ vtrn.s32 q3, q4 @q1 -> x0 x1, q2 -> x2 x3
+
+ vadd.s32 q0, q3, q4 @ (x0 + x2) (x1 + x3) (y0 + y2); (y1 + y3);
+ vld1.s32 d24[0], [r0] @load u4_qbits
+ vsub.s32 q1, q3, q4 @ (x0 - x2) (x1 - x3) (y0 - y2); (y1 - y3);
+
+ vdup.u16 d30, d30[0] @pu2_scale_matrix
+
+ vabs.s32 q2, q0
+ vabs.s32 q3, q1
+
+ vdup.s32 q14, d28[0] @u4_round_factor
+
+ vmovl.u16 q15, d30 @pu2_scale_matrix
+
+ vclt.s32 q4, q0, #0 @get the sign row 1,2
+ vdup.s32 q12, d24[0] @u4_qbits
+ vclt.s32 q5, q1, #0
+
+ vqmovn.u32 d8, q4
+ vqmovn.s32 d9, q5
+
+ vmov.s32 q13, q14 @Get the round fact
+ vneg.s32 q12, q12 @negate u4_qbits so vshl performs a right shift
+
+ vmla.u32 q13, q2, q15
+ vmla.u32 q14, q3, q15
+
+ vshl.u32 q13, q13, q12 @>>qbit
+ vshl.u32 q14, q14, q12 @>>qbit
+
+ vqmovn.u32 d10, q13
+ vqmovn.u32 d11, q14
+
+ vneg.s16 q6, q5
+
+ vbsl.s16 q4, q6, q5 @*sign
+
+ vtrn.s32 d8, d9
+
+ vceq.s16 q7, q4, #0 @Compute nnz
+
+ vshrn.u16 d14, q7, #8 @reduce nnz comparison to 1 bit
+
+ ldr r3, [sp, #72] @Load *pu1_nnz
+ vshr.u8 d14, d14, #7 @reduce nnz comparison to 1 bit
+ vmov.u8 d20, #4 @Since we add zeros, we need to subtract from 4 to get nnz
+ vpadd.u8 d17, d14, d14 @Sum up nnz
+
+ vst1.s16 {q4}, [r1]! @Store the block
+
+ vpadd.u8 d17, d17, d17 @Sum up nnz
+ vsub.u8 d20, d20, d17 @4- numzeros
+ vst1.u16 d20[0], [r3] @store nnz
+
+ vpop {d8-d15}
+ bx lr
+
+
+
+
+
diff --git a/common/arm/ih264_weighted_bi_pred_a9q.s b/common/arm/ih264_weighted_bi_pred_a9q.s
new file mode 100755
index 0000000..ccae779
--- /dev/null
+++ b/common/arm/ih264_weighted_bi_pred_a9q.s
@@ -0,0 +1,642 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_weighted_bi_pred_a9q.s
+@*
+@* @brief
+@* Contains function definitions for weighted biprediction.
+@*
+@* @author
+@* Kaushik Senthoor R
+@*
+@* @par List of Functions:
+@*
+@* - ih264_weighted_bi_pred_luma_a9q()
+@* - ih264_weighted_bi_pred_chroma_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@*******************************************************************************
+@* @function
+@* ih264_weighted_bi_pred_luma_a9q()
+@*
+@* @brief
+@* This routine performs the weighted biprediction as described in sec
+@* 8.4.2.3.2 titled "Weighted sample prediction process" for luma.
+@*
+@* @par Description:
+@* This function gets two ht x wd blocks, calculates the weighted samples,
+@* rounds off, adds offset and stores it in the destination block.
+@*
+@* @param[in] pu1_src1
+@* UWORD8 Pointer to the buffer containing the input block 1.
+@*
+@* @param[in] pu1_src2
+@* UWORD8 Pointer to the buffer containing the input block 2.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd1
+@* Stride of the input buffer 1
+@*
+@* @param[in] src_strd2
+@* Stride of the input buffer 2
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] log_wd
+@* number of bits to be rounded off
+@*
+@* @param[in] wt1
+@* weight for the weighted prediction
+@*
+@* @param[in] wt2
+@* weight for the weighted prediction
+@*
+@* @param[in] ofst1
+@* offset 1 used after rounding off
+@*
+@* @param[in] ofst2
+@* offset 2 used after rounding off
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
+@*
+@*******************************************************************************
+@*/
+@void ih264_weighted_bi_pred_luma_a9q(UWORD8 *pu1_src1,
+@ UWORD8 *pu1_src2,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd1,
+@ WORD32 src_strd2,
+@ WORD32 dst_strd,
+@ WORD32 log_wd,
+@ WORD32 wt1,
+@ WORD32 wt2,
+@ WORD32 ofst1,
+@ WORD32 ofst2,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src1
+@ r1 => pu1_src2
+@ r2 => pu1_dst
+@ r3 => src_strd1
+@ [sp] => src_strd2 (r4)
+@ [sp+4] => dst_strd (r5)
+@ [sp+8] => log_wd (r6)
+@ [sp+12] => wt1 (r7)
+@ [sp+16] => wt2 (r8)
+@ [sp+20] => ofst1 (r9)
+@ [sp+24] => ofst2 (r10)
+@ [sp+28] => ht (r11)
+@ [sp+32] => wd (r12)
+@
+.text
+.p2align 2
+
+ .global ih264_weighted_bi_pred_luma_a9q
+
+ih264_weighted_bi_pred_luma_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ ldr r6, [sp, #48] @Load log_wd in r6
+ ldr r7, [sp, #52] @Load wt1 in r7
+ ldr r8, [sp, #56] @Load wt2 in r8
+ ldr r9, [sp, #60] @Load ofst1 in r9
+
+ add r6, r6, #1 @r6 = log_wd + 1
+ sxtb r7, r7 @sign-extend 8-bit wt1 to 32-bit
+ ldr r4, [sp, #40] @Load src_strd2 in r4
+ ldr r5, [sp, #44] @Load dst_strd in r5
+ sxtb r9, r9 @sign-extend 8-bit ofst1 to 32-bit
+ rsb r10, r6, #0 @r10 = -(log_wd + 1)
+ ldr r11, [sp, #68] @Load ht in r11
+ ldr r12, [sp, #72] @Load wd in r12
+ vdup.16 q0, r10 @Q0 = -(log_wd + 1) (16-bit)
+ add r9, r9, #1 @r9 = ofst1 + 1
+
+ ldr r10, [sp, #64] @Load ofst2 in r10
+ sxtb r8, r8 @sign-extend 8-bit wt2 to 32-bit
+ cmp r12, #16 @check if wd is 16
+ vpush {d8-d15}
+ sxtb r10, r10 @sign-extend 8-bit ofst2 to 32-bit
+ add r9, r9, r10 @r9 = ofst1 + ofst2 + 1
+ vmov d2, r7, r8 @D2 = {wt1(32-bit), wt2(32-bit)}
+ asr r9, r9, #1 @r9 = ofst = (ofst1 + ofst2 + 1) >> 1
+ vdup.8 d3, r9 @D3 = ofst (8-bit)
+ beq loop_16 @branch if wd is 16
+
+ cmp r12, #8 @check if wd is 8
+ beq loop_8 @branch if wd is 8
+
+loop_4: @each iteration processes four rows
+
+ vld1.32 d4[0], [r0], r3 @load row 1 in source 1
+ vld1.32 d4[1], [r0], r3 @load row 2 in source 1
+ vld1.32 d6[0], [r1], r4 @load row 1 in source 2
+ vld1.32 d6[1], [r1], r4 @load row 2 in source 2
+
+ vmovl.u8 q2, d4 @converting rows 1,2 in source 1 to 16-bit
+ vld1.32 d8[0], [r0], r3 @load row 3 in source 1
+ vld1.32 d8[1], [r0], r3 @load row 4 in source 1
+ vmovl.u8 q3, d6 @converting rows 1,2 in source 2 to 16-bit
+ vld1.32 d10[0], [r1], r4 @load row 3 in source 2
+ vld1.32 d10[1], [r1], r4 @load row 4 in source 2
+
+ vmovl.u8 q4, d8 @converting rows 3,4 in source 1 to 16-bit
+ vmovl.u8 q5, d10 @converting rows 3,4 in source 2 to 16-bit
+
+ vmul.s16 q2, q2, d2[0] @weight 1 mult. for rows 1,2
+ vmla.s16 q2, q3, d2[2] @weight 2 mult. for rows 1,2
+ vmul.s16 q4, q4, d2[0] @weight 1 mult. for rows 3,4
+ vmla.s16 q4, q5, d2[2] @weight 2 mult. for rows 3,4
+
+ subs r11, r11, #4 @decrement ht by 4
+ vrshl.s16 q2, q2, q0 @rounds off the weighted samples from rows 1,2
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from rows 3,4
+
+ vaddw.s8 q2, q2, d3 @adding offset for rows 1,2
+ vaddw.s8 q4, q4, d3 @adding offset for rows 3,4
+
+ vqmovun.s16 d4, q2 @saturating rows 1,2 to unsigned 8-bit
+ vqmovun.s16 d8, q4 @saturating rows 3,4 to unsigned 8-bit
+
+ vst1.32 d4[0], [r2], r5 @store row 1 in destination
+ vst1.32 d4[1], [r2], r5 @store row 2 in destination
+ vst1.32 d8[0], [r2], r5 @store row 3 in destination
+ vst1.32 d8[1], [r2], r5 @store row 4 in destination
+
+ bgt loop_4 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_8: @each iteration processes four rows
+
+ vld1.8 d4, [r0], r3 @load row 1 in source 1
+ vld1.8 d6, [r1], r4 @load row 1 in source 2
+ vld1.8 d8, [r0], r3 @load row 2 in source 1
+ vld1.8 d10, [r1], r4 @load row 2 in source 2
+ vmovl.u8 q2, d4 @converting row 1 in source 1 to 16-bit
+ vld1.8 d12, [r0], r3 @load row 3 in source 1
+ vld1.8 d14, [r1], r4 @load row 3 in source 2
+ vmovl.u8 q3, d6 @converting row 1 in source 2 to 16-bit
+ vld1.8 d16, [r0], r3 @load row 4 in source 1
+ vld1.8 d18, [r1], r4 @load row 4 in source 2
+
+ vmovl.u8 q4, d8 @converting row 2 in source 1 to 16-bit
+ vmovl.u8 q5, d10 @converting row 2 in source 2 to 16-bit
+
+ vmul.s16 q2, q2, d2[0] @weight 1 mult. for row 1
+ vmla.s16 q2, q3, d2[2] @weight 2 mult. for row 1
+ vmovl.u8 q6, d12 @converting row 3 in source 1 to 16-bit
+ vmovl.u8 q7, d14 @converting row 3 in source 2 to 16-bit
+ vmul.s16 q4, q4, d2[0] @weight 1 mult. for row 2
+ vmla.s16 q4, q5, d2[2] @weight 2 mult. for row 2
+ vmovl.u8 q8, d16 @converting row 4 in source 1 to 16-bit
+ vmovl.u8 q9, d18 @converting row 4 in source 2 to 16-bit
+
+ vmul.s16 q6, q6, d2[0] @weight 1 mult. for row 3
+ vmla.s16 q6, q7, d2[2] @weight 2 mult. for row 3
+ vmul.s16 q8, q8, d2[0] @weight 1 mult. for row 4
+ vmla.s16 q8, q9, d2[2] @weight 2 mult. for row 4
+
+ vrshl.s16 q2, q2, q0 @rounds off the weighted samples from row 1
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 2
+ vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 3
+ vaddw.s8 q2, q2, d3 @adding offset for row 1
+ vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 4
+ vaddw.s8 q4, q4, d3 @adding offset for row 2
+
+ vaddw.s8 q6, q6, d3 @adding offset for row 3
+ vqmovun.s16 d4, q2 @saturating row 1 to unsigned 8-bit
+ vaddw.s8 q8, q8, d3 @adding offset for row 4
+ vqmovun.s16 d8, q4 @saturating row 2 to unsigned 8-bit
+
+ vqmovun.s16 d12, q6 @saturating row 3 to unsigned 8-bit
+ vqmovun.s16 d16, q8 @saturating row 4 to unsigned 8-bit
+
+ vst1.8 d4, [r2], r5 @store row 1 in destination
+ vst1.8 d8, [r2], r5 @store row 2 in destination
+ subs r11, r11, #4 @decrement ht by 4
+ vst1.8 d12, [r2], r5 @store row 3 in destination
+ vst1.8 d16, [r2], r5 @store row 4 in destination
+
+ bgt loop_8 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_16: @each iteration processes two rows
+
+ vld1.8 {q2}, [r0], r3 @load row 1 in source 1
+ vld1.8 {q3}, [r1], r4 @load row 1 in source 2
+ vld1.8 {q4}, [r0], r3 @load row 2 in source 1
+ vld1.8 {q5}, [r1], r4 @load row 2 in source 2
+ vmovl.u8 q10, d4 @converting row 1L in source 1 to 16-bit
+ vld1.8 {q6}, [r0], r3 @load row 3 in source 1
+ vld1.8 {q7}, [r1], r4 @load row 3 in source 2
+ vmovl.u8 q11, d6 @converting row 1L in source 2 to 16-bit
+ vld1.8 {q8}, [r0], r3 @load row 4 in source 1
+ vld1.8 {q9}, [r1], r4 @load row 4 in source 2
+
+ vmovl.u8 q2, d5 @converting row 1H in source 1 to 16-bit
+ vmovl.u8 q3, d7 @converting row 1H in source 2 to 16-bit
+
+ vmul.s16 q10, q10, d2[0] @weight 1 mult. for row 1L
+ vmla.s16 q10, q11, d2[2] @weight 2 mult. for row 1L
+ vmovl.u8 q12, d8 @converting row 2L in source 1 to 16-bit
+ vmovl.u8 q13, d10 @converting row 2L in source 2 to 16-bit
+
+ vmul.s16 q2, q2, d2[0] @weight 1 mult. for row 1H
+ vmla.s16 q2, q3, d2[2] @weight 2 mult. for row 1H
+ vmovl.u8 q4, d9 @converting row 2H in source 1 to 16-bit
+ vmovl.u8 q5, d11 @converting row 2H in source 2 to 16-bit
+
+ vmul.s16 q12, q12, d2[0] @weight 1 mult. for row 2L
+ vmla.s16 q12, q13, d2[2] @weight 2 mult. for row 2L
+ vmovl.u8 q14, d12 @converting row 3L in source 1 to 16-bit
+ vmovl.u8 q15, d14 @converting row 3L in source 2 to 16-bit
+
+ vmul.s16 q4, q4, d2[0] @weight 1 mult. for row 2H
+ vmla.s16 q4, q5, d2[2] @weight 2 mult. for row 2H
+ vmovl.u8 q6, d13 @converting row 3H in source 1 to 16-bit
+ vmovl.u8 q7, d15 @converting row 3H in source 2 to 16-bit
+
+ vmul.s16 q14, q14, d2[0] @weight 1 mult. for row 3L
+ vmla.s16 q14, q15, d2[2] @weight 2 mult. for row 3L
+ vmovl.u8 q11, d16 @converting row 4L in source 1 to 16-bit
+ vmovl.u8 q3, d18 @converting row 4L in source 2 to 16-bit
+
+ vmul.s16 q6, q6, d2[0] @weight 1 mult. for row 3H
+ vmla.s16 q6, q7, d2[2] @weight 2 mult. for row 3H
+ vmovl.u8 q8, d17 @converting row 4H in source 1 to 16-bit
+ vmovl.u8 q9, d19 @converting row 4H in source 2 to 16-bit
+
+ vmul.s16 q11, q11, d2[0] @weight 1 mult. for row 4L
+ vmla.s16 q11, q3, d2[2] @weight 2 mult. for row 4L
+ vrshl.s16 q10, q10, q0 @rounds off the weighted samples from row 1L
+
+ vmul.s16 q8, q8, d2[0] @weight 1 mult. for row 4H
+ vmla.s16 q8, q9, d2[2] @weight 2 mult. for row 4H
+ vrshl.s16 q2, q2, q0 @rounds off the weighted samples from row 1H
+
+ vrshl.s16 q12, q12, q0 @rounds off the weighted samples from row 2L
+ vaddw.s8 q10, q10, d3 @adding offset for row 1L
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 2H
+ vaddw.s8 q2, q2, d3 @adding offset for row 1H
+ vrshl.s16 q14, q14, q0 @rounds off the weighted samples from row 3L
+ vaddw.s8 q12, q12, d3 @adding offset for row 2L
+ vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 3H
+ vaddw.s8 q4, q4, d3 @adding offset for row 2H
+ vrshl.s16 q11, q11, q0 @rounds off the weighted samples from row 4L
+ vaddw.s8 q14, q14, d3 @adding offset for row 3L
+ vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 4H
+ vaddw.s8 q6, q6, d3 @adding offset for row 3H
+
+ vqmovun.s16 d26, q10 @saturating row 1L to unsigned 8-bit
+ vaddw.s8 q11, q11, d3 @adding offset for row 4L
+ vqmovun.s16 d27, q2 @saturating row 1H to unsigned 8-bit
+ vaddw.s8 q8, q8, d3 @adding offset for row 4H
+
+ vqmovun.s16 d10, q12 @saturating row 2L to unsigned 8-bit
+ vqmovun.s16 d11, q4 @saturating row 2H to unsigned 8-bit
+ vqmovun.s16 d30, q14 @saturating row 3L to unsigned 8-bit
+ vqmovun.s16 d31, q6 @saturating row 3H to unsigned 8-bit
+ vst1.8 {q13}, [r2], r5 @store row 1 in destination
+ vqmovun.s16 d14, q11 @saturating row 4L to unsigned 8-bit
+ vqmovun.s16 d15, q8 @saturating row 4H to unsigned 8-bit
+
+ vst1.8 {q5}, [r2], r5 @store row 2 in destination
+ subs r11, r11, #4 @decrement ht by 4
+ vst1.8 {q15}, [r2], r5 @store row 3 in destination
+ vst1.8 {q7}, [r2], r5 @store row 4 in destination
+
+ bgt loop_16 @if greater than 0 repeat the loop again
+
+end_loops:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from sp and return via pc
+
+
+@*******************************************************************************
+@* @function
+@* ih264_weighted_bi_pred_chroma_a9q()
+@*
+@* @brief
+@* This routine performs the weighted biprediction as described in sec
+@* 8.4.2.3.2 titled "Weighted sample prediction process" for chroma.
+@*
+@* @par Description:
+@* This function gets two ht x wd blocks, calculates the weighted samples,
+@* rounds off, adds offset and stores it in the destination block for U and V.
+@*
+@* @param[in] pu1_src1
+@* UWORD8 Pointer to the buffer containing the input block 1.
+@*
+@* @param[in] pu1_src2
+@* UWORD8 Pointer to the buffer containing the input block 2.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd1
+@* Stride of the input buffer 1
+@*
+@* @param[in] src_strd2
+@* Stride of the input buffer 2
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] log_wd
+@* number of bits to be rounded off
+@*
+@* @param[in] wt1
+@* weights for the weighted prediction in U and V
+@*
+@* @param[in] wt2
+@* weights for the weighted prediction in U and V
+@*
+@* @param[in] ofst1
+@* offset 1 used after rounding off for U and V
+@*
+@* @param[in] ofst2
+@* offset 2 used after rounding off for U and V
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
+@*
+@*******************************************************************************
+@*/
+@void ih264_weighted_bi_pred_chroma_a9q(UWORD8 *pu1_src1,
+@ UWORD8 *pu1_src2,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd1,
+@ WORD32 src_strd2,
+@ WORD32 dst_strd,
+@ WORD32 log_wd,
+@ WORD32 wt1,
+@ WORD32 wt2,
+@ WORD32 ofst1,
+@ WORD32 ofst2,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src1
+@ r1 => pu1_src2
+@ r2 => pu1_dst
+@ r3 => src_strd1
+@ [sp] => src_strd2 (r4)
+@ [sp+4] => dst_strd (r5)
+@ [sp+8] => log_wd (r6)
+@ [sp+12] => wt1 (r7)
+@ [sp+16] => wt2 (r8)
+@ [sp+20] => ofst1 (r9)
+@ [sp+24] => ofst2 (r10)
+@ [sp+28] => ht (r11)
+@ [sp+32] => wd (r12)
+@
+
+
+ .global ih264_weighted_bi_pred_chroma_a9q
+
+ih264_weighted_bi_pred_chroma_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r6, [sp, #48] @Load log_wd in r6
+ ldr r7, [sp, #52] @Load wt1 in r7
+ ldr r8, [sp, #56] @Load wt2 in r8
+ add r6, r6, #1 @r6 = log_wd + 1
+ ldr r9, [sp, #60] @Load ofst1 in r9
+ ldr r10, [sp, #64] @Load ofst2 in r10
+
+ rsb r12, r6, #0 @r12 = -(log_wd + 1)
+ ldr r4, [sp, #40] @Load src_strd2 in r4
+ ldr r5, [sp, #44] @Load dst_strd in r5
+ vdup.16 q0, r12 @Q0 = -(log_wd + 1) (16-bit)
+
+ ldr r11, [sp, #68] @Load ht in r11
+ vdup.32 q1, r7 @Q1 = (wt1_u, wt1_v) (32-bit)
+ ldr r12, [sp, #72] @Load wd in r12
+ vdup.32 q2, r8 @Q2 = (wt2_u, wt2_v) (32-bit)
+ asr r7, r9, #8 @r7 = ofst1_v
+ asr r8, r10, #8 @r8 = ofst2_v
+ vpush {d8-d15}
+ sxtb r9, r9 @sign-extend 8-bit ofst1_u to 32-bit
+ sxtb r10, r10 @sign-extend 8-bit ofst2_u to 32-bit
+ sxtb r7, r7 @sign-extend 8-bit ofst1_v to 32-bit
+ sxtb r8, r8 @sign-extend 8-bit ofst2_v to 32-bit
+
+ add r9, r9, #1 @r9 = ofst1_u + 1
+ add r7, r7, #1 @r7 = ofst1_v + 1
+ add r9, r9, r10 @r9 = ofst1_u + ofst2_u + 1
+ add r7, r7, r8 @r7 = ofst1_v + ofst2_v + 1
+ asr r9, r9, #1 @r9 = ofst_u = (ofst1_u + ofst2_u + 1) >> 1
+ asr r7, r7, #1 @r7 = ofst_v = (ofst1_v + ofst2_v + 1) >> 1
+ cmp r12, #8 @check if wd is 8
+ pkhbt r9, r9, r7, lsl #16 @r9 = {ofst_u(16-bit), ofst_v(16-bit)}
+ vdup.32 q3, r9 @Q3 = {ofst_u(16-bit), ofst_v(16-bit)} replicated
+ beq loop_8_uv @branch if wd is 8
+
+ cmp r12, #4 @check if wd is 4
+ beq loop_4_uv @branch if wd is 4
+
+loop_2_uv: @each iteration processes two rows
+
+ vld1.32 d8[0], [r0], r3 @load row 1 in source 1
+ vld1.32 d8[1], [r0], r3 @load row 2 in source 1
+ vld1.32 d10[0], [r1], r4 @load row 1 in source 2
+ vld1.32 d10[1], [r1], r4 @load row 2 in source 2
+
+ vmovl.u8 q4, d8 @converting rows 1,2 in source 1 to 16-bit
+ vmovl.u8 q5, d10 @converting rows 1,2 in source 2 to 16-bit
+
+ vmul.s16 q4, q4, q1 @weight 1 mult. for rows 1,2
+ vmla.s16 q4, q5, q2 @weight 2 mult. for rows 1,2
+
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from rows 1,2
+
+ vadd.s16 q4, q4, q3 @adding offset for rows 1,2
+
+ vqmovun.s16 d8, q4 @saturating rows 1,2 to unsigned 8-bit
+
+ vst1.32 d8[0], [r2], r5 @store row 1 in destination
+ vst1.32 d8[1], [r2], r5 @store row 2 in destination
+
+ subs r11, r11, #2 @decrement ht by 2
+ bgt loop_2_uv @if greater than 0 repeat the loop again
+
+ b end_loops_uv
+
+loop_4_uv: @each iteration processes two rows
+
+ vld1.8 d8, [r0], r3 @load row 1 in source 1
+ vld1.8 d10, [r1], r4 @load row 1 in source 2
+ vmovl.u8 q4, d8 @converting row 1 in source 1 to 16-bit
+ vld1.8 d12, [r0], r3 @load row 2 in source 1
+ vmovl.u8 q5, d10 @converting row 1 in source 2 to 16-bit
+ vld1.8 d14, [r1], r4 @load row 2 in source 2
+
+ vmovl.u8 q6, d12 @converting row 2 in source 1 to 16-bit
+ vmul.s16 q4, q4, q1 @weight 1 mult. for row 1
+ vmla.s16 q4, q5, q2 @weight 2 mult. for row 1
+ vmovl.u8 q7, d14 @converting row 2 in source 2 to 16-bit
+
+ vmul.s16 q6, q6, q1 @weight 1 mult. for row 2
+ vmla.s16 q6, q7, q2 @weight 2 mult. for row 2
+
+ subs r11, r11, #2 @decrement ht by 2
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 1
+ vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 2
+ vadd.s16 q4, q4, q3 @adding offset for row 1
+ vadd.s16 q6, q6, q3 @adding offset for row 2
+
+ vqmovun.s16 d8, q4 @saturating row 1 to unsigned 8-bit
+ vqmovun.s16 d12, q6 @saturating row 2 to unsigned 8-bit
+
+ vst1.8 d8, [r2], r5 @store row 1 in destination
+ vst1.8 d12, [r2], r5 @store row 2 in destination
+
+ bgt loop_4_uv @if greater than 0 repeat the loop again
+
+ b end_loops_uv
+
+loop_8_uv: @each iteration processes four rows
+
+ vld1.8 {q4}, [r0], r3 @load row 1 in source 1
+ vld1.8 {q5}, [r1], r4 @load row 1 in source 2
+ vld1.8 {q6}, [r0], r3 @load row 2 in source 1
+ vld1.8 {q7}, [r1], r4 @load row 2 in source 2
+ vmovl.u8 q12, d8 @converting row 1L in source 1 to 16-bit
+ vld1.8 {q8}, [r0], r3 @load row 3 in source 1
+ vld1.8 {q9}, [r1], r4 @load row 3 in source 2
+ vmovl.u8 q13, d10 @converting row 1L in source 2 to 16-bit
+ vld1.8 {q10}, [r0], r3 @load row 4 in source 1
+ vld1.8 {q11}, [r1], r4 @load row 4 in source 2
+
+ vmovl.u8 q4, d9 @converting row 1H in source 1 to 16-bit
+ vmovl.u8 q5, d11 @converting row 1H in source 2 to 16-bit
+
+ vmul.s16 q12, q12, q1 @weight 1 mult. for row 1L
+ vmla.s16 q12, q13, q2 @weight 2 mult. for row 1L
+ vmovl.u8 q14, d12 @converting row 2L in source 1 to 16-bit
+ vmovl.u8 q15, d14 @converting row 2L in source 2 to 16-bit
+
+ vmul.s16 q4, q4, q1 @weight 1 mult. for row 1H
+ vmla.s16 q4, q5, q2 @weight 2 mult. for row 1H
+ vmovl.u8 q6, d13 @converting row 2H in source 1 to 16-bit
+ vmovl.u8 q7, d15 @converting row 2H in source 2 to 16-bit
+
+ vmul.s16 q14, q14, q1 @weight 1 mult. for row 2L
+ vmla.s16 q14, q15, q2 @weight 2 mult. for row 2L
+ vmovl.u8 q13, d16 @converting row 3L in source 1 to 16-bit
+ vmovl.u8 q5, d18 @converting row 3L in source 2 to 16-bit
+
+ vmul.s16 q6, q6, q1 @weight 1 mult. for row 2H
+ vmla.s16 q6, q7, q2 @weight 2 mult. for row 2H
+ vmovl.u8 q8, d17 @converting row 3H in source 1 to 16-bit
+ vmovl.u8 q9, d19 @converting row 3H in source 2 to 16-bit
+
+ vmul.s16 q13, q13, q1 @weight 1 mult. for row 3L
+ vmla.s16 q13, q5, q2 @weight 2 mult. for row 3L
+ vmovl.u8 q15, d20 @converting row 4L in source 1 to 16-bit
+ vmovl.u8 q7, d22 @converting row 4L in source 2 to 16-bit
+
+ vmul.s16 q8, q8, q1 @weight 1 mult. for row 3H
+ vmla.s16 q8, q9, q2 @weight 2 mult. for row 3H
+ vmovl.u8 q10, d21 @converting row 4H in source 1 to 16-bit
+ vmovl.u8 q11, d23 @converting row 4H in source 2 to 16-bit
+
+ vmul.s16 q15, q15, q1 @weight 1 mult. for row 4L
+ vmla.s16 q15, q7, q2 @weight 2 mult. for row 4L
+ vrshl.s16 q12, q12, q0 @rounds off the weighted samples from row 1L
+
+ vmul.s16 q10, q10, q1 @weight 1 mult. for row 4H
+ vmla.s16 q10, q11, q2 @weight 2 mult. for row 4H
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 1H
+
+ vrshl.s16 q14, q14, q0 @rounds off the weighted samples from row 2L
+ vadd.s16 q12, q12, q3 @adding offset for row 1L
+ vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 2H
+ vadd.s16 q4, q4, q3 @adding offset for row 1H
+ vrshl.s16 q13, q13, q0 @rounds off the weighted samples from row 3L
+ vadd.s16 q14, q14, q3 @adding offset for row 2L
+ vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 3H
+ vadd.s16 q6, q6, q3 @adding offset for row 2H
+ vrshl.s16 q15, q15, q0 @rounds off the weighted samples from row 4L
+ vadd.s16 q13, q13, q3 @adding offset for row 3L
+ vrshl.s16 q10, q10, q0 @rounds off the weighted samples from row 4H
+ vadd.s16 q8, q8, q3 @adding offset for row 3H
+
+ vqmovun.s16 d10, q12 @saturating row 1L to unsigned 8-bit
+ vadd.s16 q15, q15, q3 @adding offset for row 4L
+ vqmovun.s16 d11, q4 @saturating row 1H to unsigned 8-bit
+ vadd.s16 q10, q10, q3 @adding offset for row 4H
+
+ vqmovun.s16 d18, q14 @saturating row 2L to unsigned 8-bit
+ vqmovun.s16 d19, q6 @saturating row 2H to unsigned 8-bit
+ vqmovun.s16 d14, q13 @saturating row 3L to unsigned 8-bit
+ vqmovun.s16 d15, q8 @saturating row 3H to unsigned 8-bit
+ vst1.8 {q5}, [r2], r5 @store row 1 in destination
+ vqmovun.s16 d22, q15 @saturating row 4L to unsigned 8-bit
+ vqmovun.s16 d23, q10 @saturating row 4H to unsigned 8-bit
+
+ vst1.8 {q9}, [r2], r5 @store row 2 in destination
+ subs r11, r11, #4 @decrement ht by 4
+ vst1.8 {q7}, [r2], r5 @store row 3 in destination
+ vst1.8 {q11}, [r2], r5 @store row 4 in destination
+
+ bgt loop_8_uv @if greater than 0 repeat the loop again
+
+end_loops_uv:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r12, r15} @Reload the registers from sp and return via pc
+
+
diff --git a/common/arm/ih264_weighted_pred_a9q.s b/common/arm/ih264_weighted_pred_a9q.s
new file mode 100755
index 0000000..1ce94d0
--- /dev/null
+++ b/common/arm/ih264_weighted_pred_a9q.s
@@ -0,0 +1,479 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+@/**
+@******************************************************************************
+@* @file
+@* ih264_weighted_pred_a9q.s
+@*
+@* @brief
+@* Contains function definitions for weighted prediction.
+@*
+@* @author
+@* Kaushik Senthoor R
+@*
+@* @par List of Functions:
+@*
+@* - ih264_weighted_pred_luma_a9q()
+@* - ih264_weighted_pred_chroma_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@*******************************************************************************
+@* @function
+@* ih264_weighted_pred_luma_a9q()
+@*
+@* @brief
+@* This routine performs the default weighted prediction as described in sec
+@* 8.4.2.3.2 titled "Weighted sample prediction process" for luma.
+@*
+@* @par Description:
+@* This function gets a ht x wd block, calculates the weighted sample, rounds
+@* off, adds offset and stores it in the destination block.
+@*
+@* @param[in] pu1_src:
+@* UWORD8 Pointer to the buffer containing the input block.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd
+@* Stride of the input buffer
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] log_wd
+@* number of bits to be rounded off
+@*
+@* @param[in] wt
+@* weight for the weighted prediction
+@*
+@* @param[in] ofst
+@* offset used after rounding off
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
+@*
+@*******************************************************************************
+@*/
+@void ih264_weighted_pred_luma_a9q(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 log_wd,
+@ WORD32 wt,
+@ WORD32 ofst,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src
+@ r1 => pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ [sp] => log_wd (r4)
+@ [sp+4] => wt (r5)
+@ [sp+8] => ofst (r6)
+@ [sp+12] => ht (r7)
+@ [sp+16] => wd (r8)
+@
+.text
+.p2align 2
+
+ .global ih264_weighted_pred_luma_a9q
+
+ih264_weighted_pred_luma_a9q:
+
+ stmfd sp!, {r4-r9, r14} @stack stores the values of the arguments
+ ldr r5, [sp, #32] @Load wt
+ ldr r4, [sp, #28] @Load log_wd in r4
+ ldr r6, [sp, #36] @Load ofst
+ ldr r7, [sp, #40] @Load ht
+ ldr r8, [sp, #44] @Load wd
+ vpush {d8-d15}
+
+ vdup.16 d2, r5 @D2 = wt (16-bit)
+ rsb r9, r4, #0 @r9 = -log_wd
+ vdup.8 d3, r6 @D3 = ofst (8-bit)
+ cmp r8, #16 @check if wd is 16
+ vdup.16 q0, r9 @Q0 = -log_wd (16-bit)
+ beq loop_16 @branch if wd is 16
+
+ cmp r8, #8 @check if wd is 8
+ beq loop_8 @branch if wd is 8
+
+loop_4: @each iteration processes four rows
+
+ vld1.32 d4[0], [r0], r2 @load row 1 in source
+ vld1.32 d4[1], [r0], r2 @load row 2 in source
+ vld1.32 d6[0], [r0], r2 @load row 3 in source
+ vld1.32 d6[1], [r0], r2 @load row 4 in source
+
+ vmovl.u8 q2, d4 @converting rows 1,2 to 16-bit
+ vmovl.u8 q3, d6 @converting rows 3,4 to 16-bit
+
+ vmul.s16 q2, q2, d2[0] @weight mult. for rows 1,2
+ vmul.s16 q3, q3, d2[0] @weight mult. for rows 3,4
+
+ subs r7, r7, #4 @decrement ht by 4
+ vrshl.s16 q2, q2, q0 @rounds off the weighted samples from rows 1,2
+ vrshl.s16 q3, q3, q0 @rounds off the weighted samples from rows 3,4
+
+ vaddw.s8 q2, q2, d3 @adding offset for rows 1,2
+ vaddw.s8 q3, q3, d3 @adding offset for rows 3,4
+
+ vqmovun.s16 d4, q2 @saturating rows 1,2 to unsigned 8-bit
+ vqmovun.s16 d6, q3 @saturating rows 3,4 to unsigned 8-bit
+
+ vst1.32 d4[0], [r1], r3 @store row 1 in destination
+ vst1.32 d4[1], [r1], r3 @store row 2 in destination
+ vst1.32 d6[0], [r1], r3 @store row 3 in destination
+ vst1.32 d6[1], [r1], r3 @store row 4 in destination
+
+ bgt loop_4 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_8: @each iteration processes four rows
+
+ vld1.8 d4, [r0], r2 @load row 1 in source
+ vld1.8 d6, [r0], r2 @load row 2 in source
+ vld1.8 d8, [r0], r2 @load row 3 in source
+ vmovl.u8 q2, d4 @converting row 1 to 16-bit
+ vld1.8 d10, [r0], r2 @load row 4 in source
+ vmovl.u8 q3, d6 @converting row 2 to 16-bit
+
+ vmovl.u8 q4, d8 @converting row 3 to 16-bit
+ vmul.s16 q2, q2, d2[0] @weight mult. for row 1
+ vmovl.u8 q5, d10 @converting row 4 to 16-bit
+ vmul.s16 q3, q3, d2[0] @weight mult. for row 2
+ vmul.s16 q4, q4, d2[0] @weight mult. for row 3
+ vmul.s16 q5, q5, d2[0] @weight mult. for row 4
+
+ vrshl.s16 q2, q2, q0 @rounds off the weighted samples from row 1
+ vrshl.s16 q3, q3, q0 @rounds off the weighted samples from row 2
+ vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 3
+ vaddw.s8 q2, q2, d3 @adding offset for row 1
+ vrshl.s16 q5, q5, q0 @rounds off the weighted samples from row 4
+ vaddw.s8 q3, q3, d3 @adding offset for row 2
+
+ vaddw.s8 q4, q4, d3 @adding offset for row 3
+ vqmovun.s16 d4, q2 @saturating row 1 to unsigned 8-bit
+ vaddw.s8 q5, q5, d3 @adding offset for row 4
+ vqmovun.s16 d6, q3 @saturating row 2 to unsigned 8-bit
+ vqmovun.s16 d8, q4 @saturating row 3 to unsigned 8-bit
+ vqmovun.s16 d10, q5 @saturating row 4 to unsigned 8-bit
+
+ vst1.8 d4, [r1], r3 @store row 1 in destination
+ vst1.8 d6, [r1], r3 @store row 2 in destination
+ subs r7, r7, #4 @decrement ht by 4
+ vst1.8 d8, [r1], r3 @store row 3 in destination
+ vst1.8 d10, [r1], r3 @store row 4 in destination
+
+ bgt loop_8 @if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_16: @each iteration processes four rows
+
+ vld1.8 {q2}, [r0], r2 @load row 1 in source
+ vld1.8 {q3}, [r0], r2 @load row 2 in source
+ vmovl.u8 q6, d4 @converting row 1L to 16-bit
+ vld1.8 {q4}, [r0], r2 @load row 3 in source
+ vmovl.u8 q7, d5 @converting row 1H to 16-bit
+ vld1.8 {q5}, [r0], r2 @load row 4 in source
+
+ vmovl.u8 q8, d6 @converting row 2L to 16-bit
+ vmul.s16 q6, q6, d2[0] @weight mult. for row 1L
+ vmovl.u8 q9, d7 @converting row 2H to 16-bit
+ vmul.s16 q7, q7, d2[0] @weight mult. for row 1H
+ vmovl.u8 q10, d8 @converting row 3L to 16-bit
+ vmul.s16 q8, q8, d2[0] @weight mult. for row 2L
+ vmovl.u8 q11, d9 @converting row 3H to 16-bit
+ vmul.s16 q9, q9, d2[0] @weight mult. for row 2H
+ vmovl.u8 q12, d10 @converting row 4L to 16-bit
+ vmul.s16 q10, q10, d2[0] @weight mult. for row 3L
+ vmovl.u8 q13, d11 @converting row 4H to 16-bit
+ vmul.s16 q11, q11, d2[0] @weight mult. for row 3H
+
+ vmul.s16 q12, q12, d2[0] @weight mult. for row 4L
+ vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 1L
+ vmul.s16 q13, q13, d2[0] @weight mult. for row 4H
+
+ vrshl.s16 q7, q7, q0 @rounds off the weighted samples from row 1H
+ vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 2L
+ vaddw.s8 q6, q6, d3 @adding offset for row 1L
+ vrshl.s16 q9, q9, q0 @rounds off the weighted samples from row 2H
+ vaddw.s8 q7, q7, d3 @adding offset for row 1H
+ vqmovun.s16 d4, q6 @saturating row 1L to unsigned 8-bit
+ vrshl.s16 q10, q10, q0 @rounds off the weighted samples from row 3L
+ vaddw.s8 q8, q8, d3 @adding offset for row 2L
+ vqmovun.s16 d5, q7 @saturating row 1H to unsigned 8-bit
+ vrshl.s16 q11, q11, q0 @rounds off the weighted samples from row 3H
+ vaddw.s8 q9, q9, d3 @adding offset for row 2H
+ vqmovun.s16 d6, q8 @saturating row 2L to unsigned 8-bit
+ vrshl.s16 q12, q12, q0 @rounds off the weighted samples from row 4L
+ vaddw.s8 q10, q10, d3 @adding offset for row 3L
+ vqmovun.s16 d7, q9 @saturating row 2H to unsigned 8-bit
+ vrshl.s16 q13, q13, q0 @rounds off the weighted samples from row 4H
+ vaddw.s8 q11, q11, d3 @adding offset for row 3H
+
+ vqmovun.s16 d8, q10 @saturating row 3L to unsigned 8-bit
+ vaddw.s8 q12, q12, d3 @adding offset for row 4L
+ vqmovun.s16 d9, q11 @saturating row 3H to unsigned 8-bit
+ vaddw.s8 q13, q13, d3 @adding offset for row 4H
+
+ vqmovun.s16 d10, q12 @saturating row 4L to unsigned 8-bit
+ vst1.8 {q2}, [r1], r3 @store row 1 in destination
+ vqmovun.s16 d11, q13 @saturating row 4H to unsigned 8-bit
+ vst1.8 {q3}, [r1], r3 @store row 2 in destination
+ subs r7, r7, #4 @decrement ht by 4
+ vst1.8 {q4}, [r1], r3 @store row 3 in destination
+ vst1.8 {q5}, [r1], r3 @store row 4 in destination
+
+ bgt loop_16 @if greater than 0 repeat the loop again
+
+end_loops:
+
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r9, r15} @Reload the registers from sp and return via pc
+
+
+@*******************************************************************************
+@* @function
+@* ih264_weighted_pred_chroma_a9q()
+@*
+@* @brief
+@* This routine performs the weighted prediction as described in sec
+@* 8.4.2.3.2 titled "Weighted sample prediction process" for chroma.
+@*
+@* @par Description:
+@* This function gets a ht x wd block, calculates the weighted sample, rounds
+@* off, adds offset and stores it in the destination block for U and V.
+@*
+@* @param[in] pu1_src:
+@* UWORD8 Pointer to the buffer containing the input block.
+@*
+@* @param[out] pu1_dst
+@* UWORD8 pointer to the destination where the output block is stored.
+@*
+@* @param[in] src_strd
+@* Stride of the input buffer
+@*
+@* @param[in] dst_strd
+@* Stride of the destination buffer
+@*
+@* @param[in] log_wd
+@* number of bits to be rounded off
+@*
+@* @param[in] wt
+@* weights for the weighted prediction for U and V
+@*
+@* @param[in] ofst
+@* offsets used after rounding off for U and V
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@* None
+@*
+@* @remarks
+@* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
+@*
+@*******************************************************************************
+@*/
+@void ih264_weighted_pred_chroma_a9q(UWORD8 *pu1_src,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 dst_strd,
+@ WORD32 log_wd,
+@ WORD32 wt,
+@ WORD32 ofst,
+@ WORD32 ht,
+@ WORD32 wd)
+@
+@**************Variables Vs Registers*****************************************
+@ r0 => pu1_src
+@ r1 => pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ [sp] => log_wd (r4)
+@ [sp+4] => wt (r5)
+@ [sp+8] => ofst (r6)
+@ [sp+12] => ht (r7)
+@ [sp+16] => wd (r8)
+@
+
+
+    .global ih264_weighted_pred_chroma_a9q
+
+ih264_weighted_pred_chroma_a9q:
+
+    stmfd sp!, {r4-r9, r14}         @stack stores the values of the arguments
+
+    ldr r4, [sp, #28]               @Load log_wd in r4
+    ldr r5, [sp, #32]               @Load wt = {wt_u (16-bit), wt_v (16-bit)}
+    ldr r6, [sp, #36]               @Load ofst = {ofst_u (8-bit), ofst_v (8-bit)}
+    ldr r8, [sp, #44]               @Load wd
+
+    rsb r9, r4, #0                  @r9 = -log_wd
+    vdup.32 q1, r5                  @Q1 = {wt_u (16-bit), wt_v (16-bit)} replicated per 32-bit lane
+    ldr r7, [sp, #40]               @Load ht
+    vpush {d8-d15}                  @save callee-saved NEON registers d8-d15
+    vdup.16 d4, r6                  @D4 = {ofst_u (8-bit), ofst_v (8-bit)} replicated per 16-bit lane
+    cmp r8, #8                      @check if wd is 8
+    vdup.16 q0, r9                  @Q0 = -log_wd (16-bit); negative shift => right shift in vrshl
+    beq loop_8_uv                   @branch if wd is 8
+
+    cmp r8, #4                      @check if wd is 4
+    beq loop_4_uv                   @branch if wd is 4
+
+loop_2_uv:                          @each iteration processes two rows
+
+    vld1.32 d6[0], [r0], r2         @load row 1 in source
+    vld1.32 d6[1], [r0], r2         @load row 2 in source
+
+    vmovl.u8 q3, d6                 @converting rows 1,2 to 16-bit
+
+    vmul.s16 q3, q3, q1             @weight mult. for rows 1,2
+
+    vrshl.s16 q3, q3, q0            @rounds off the weighted samples from rows 1,2
+
+    vaddw.s8 q3, q3, d4             @adding offset for rows 1,2
+
+    vqmovun.s16 d6, q3              @saturating rows 1,2 to unsigned 8-bit
+
+    subs r7, r7, #2                 @decrement ht by 2
+    vst1.32 d6[0], [r1], r3         @store row 1 in destination
+    vst1.32 d6[1], [r1], r3         @store row 2 in destination
+
+    bgt loop_2_uv                   @if greater than 0 repeat the loop again
+
+    b end_loops_uv
+
+loop_4_uv:                          @each iteration processes two rows
+
+    vld1.8 d6, [r0], r2             @load row 1 in source
+    vld1.8 d8, [r0], r2             @load row 2 in source
+
+    vmovl.u8 q3, d6                 @converting row 1 to 16-bit
+    vmovl.u8 q4, d8                 @converting row 2 to 16-bit
+
+    vmul.s16 q3, q3, q1             @weight mult. for row 1
+    vmul.s16 q4, q4, q1             @weight mult. for row 2
+
+    subs r7, r7, #2                 @decrement ht by 2
+    vrshl.s16 q3, q3, q0            @rounds off the weighted samples from row 1
+    vrshl.s16 q4, q4, q0            @rounds off the weighted samples from row 2
+
+    vaddw.s8 q3, q3, d4             @adding offset for row 1
+    vaddw.s8 q4, q4, d4             @adding offset for row 2
+
+    vqmovun.s16 d6, q3              @saturating row 1 to unsigned 8-bit
+    vqmovun.s16 d8, q4              @saturating row 2 to unsigned 8-bit
+
+    vst1.8 d6, [r1], r3             @store row 1 in destination
+    vst1.8 d8, [r1], r3             @store row 2 in destination
+
+    bgt loop_4_uv                   @if greater than 0 repeat the loop again
+
+    b end_loops_uv
+
+loop_8_uv:                          @each iteration processes four rows (ht decremented by 4)
+
+    vld1.8 {q3}, [r0], r2           @load row 1 in source
+    vld1.8 {q4}, [r0], r2           @load row 2 in source
+    vmovl.u8 q7, d6                 @converting row 1L to 16-bit
+    vld1.8 {q5}, [r0], r2           @load row 3 in source
+    vmovl.u8 q8, d7                 @converting row 1H to 16-bit
+    vld1.8 {q6}, [r0], r2           @load row 4 in source
+
+    vmul.s16 q7, q7, q1             @weight mult. for row 1L
+    vmovl.u8 q9, d8                 @converting row 2L to 16-bit
+    vmul.s16 q8, q8, q1             @weight mult. for row 1H
+    vmovl.u8 q10, d9                @converting row 2H to 16-bit
+    vmul.s16 q9, q9, q1             @weight mult. for row 2L
+    vmovl.u8 q11, d10               @converting row 3L to 16-bit
+    vmul.s16 q10, q10, q1           @weight mult. for row 2H
+    vmovl.u8 q12, d11               @converting row 3H to 16-bit
+    vmul.s16 q11, q11, q1           @weight mult. for row 3L
+    vmovl.u8 q13, d12               @converting row 4L to 16-bit
+    vmul.s16 q12, q12, q1           @weight mult. for row 3H
+    vmovl.u8 q14, d13               @converting row 4H to 16-bit
+
+    vmul.s16 q13, q13, q1           @weight mult. for row 4L
+    vrshl.s16 q7, q7, q0            @rounds off the weighted samples from row 1L
+    vmul.s16 q14, q14, q1           @weight mult. for row 4H
+
+    vrshl.s16 q8, q8, q0            @rounds off the weighted samples from row 1H
+    vrshl.s16 q9, q9, q0            @rounds off the weighted samples from row 2L
+    vaddw.s8 q7, q7, d4             @adding offset for row 1L
+    vrshl.s16 q10, q10, q0          @rounds off the weighted samples from row 2H
+    vaddw.s8 q8, q8, d4             @adding offset for row 1H
+    vqmovun.s16 d6, q7              @saturating row 1L to unsigned 8-bit
+    vrshl.s16 q11, q11, q0          @rounds off the weighted samples from row 3L
+    vaddw.s8 q9, q9, d4             @adding offset for row 2L
+    vqmovun.s16 d7, q8              @saturating row 1H to unsigned 8-bit
+    vrshl.s16 q12, q12, q0          @rounds off the weighted samples from row 3H
+    vaddw.s8 q10, q10, d4           @adding offset for row 2H
+    vqmovun.s16 d8, q9              @saturating row 2L to unsigned 8-bit
+    vrshl.s16 q13, q13, q0          @rounds off the weighted samples from row 4L
+    vaddw.s8 q11, q11, d4           @adding offset for row 3L
+    vqmovun.s16 d9, q10             @saturating row 2H to unsigned 8-bit
+    vrshl.s16 q14, q14, q0          @rounds off the weighted samples from row 4H
+    vaddw.s8 q12, q12, d4           @adding offset for row 3H
+
+    vqmovun.s16 d10, q11            @saturating row 3L to unsigned 8-bit
+    vaddw.s8 q13, q13, d4           @adding offset for row 4L
+    vqmovun.s16 d11, q12            @saturating row 3H to unsigned 8-bit
+    vaddw.s8 q14, q14, d4           @adding offset for row 4H
+
+    vqmovun.s16 d12, q13            @saturating row 4L to unsigned 8-bit
+    vst1.8 {q3}, [r1], r3           @store row 1 in destination
+    vqmovun.s16 d13, q14            @saturating row 4H to unsigned 8-bit
+    vst1.8 {q4}, [r1], r3           @store row 2 in destination
+    subs r7, r7, #4                 @decrement ht by 4
+    vst1.8 {q5}, [r1], r3           @store row 3 in destination
+    vst1.8 {q6}, [r1], r3           @store row 4 in destination
+
+    bgt loop_8_uv                   @if greater than 0 repeat the loop again
+
+end_loops_uv:
+
+    vpop {d8-d15}                   @restore callee-saved NEON registers
+    ldmfd sp!, {r4-r9, r15}         @Reload the registers from sp and return (pc loaded from saved lr)
+
+
+
+
diff --git a/common/armv8/ih264_deblk_chroma_av8.s b/common/armv8/ih264_deblk_chroma_av8.s
new file mode 100755
index 0000000..3021556
--- /dev/null
+++ b/common/armv8/ih264_deblk_chroma_av8.s
@@ -0,0 +1,585 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///*****************************************************************************/
+///* */
+///* File Name : ih264_deblk_chroma_av8.s */
+///* */
+///* Description : Contains function definitions for deblocking chroma      */
+///* edge. Functions are coded in NEON assembly and can */
+///* be compiled using ARM RVDS. */
+///* */
+///* List of Functions : ih264_deblk_chroma_vert_bs4_av8() */
+///* ih264_deblk_chroma_vert_bslt4_av8() */
+///* ih264_deblk_chroma_horz_bs4_av8() */
+///* ih264_deblk_chroma_horz_bslt4_av8() */
+///* Issues / Problems : None */
+///* */
+///* Revision History : */
+///* */
+///* DD MM YYYY Author(s) Changes (Describe the changes made) */
+///* 28 11 2013 Ittiam Draft */
+///*****************************************************************************/
+
+
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* Performs filtering of a chroma block horizontal edge when the
+//* boundary strength is set to 4 in high profile
+//*
+//* @par Description:
+//* This operation is described in Sec. 8.7.2.4 under the title
+//* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+//*
+//* @param[in] x0 - pu1_src
+//* Pointer to the src sample q0
+//*
+//* @param[in] x1 - src_strd
+//* Source stride
+//*
+//* @param[in] x2 - alpha_cb
+//* Alpha Value for the boundary in U
+//*
+//* @param[in] x3 - beta_cb
+//* Beta Value for the boundary in U
+//*
+//* @param[in] sp(0) - alpha_cr
+//* Alpha Value for the boundary in V
+//*
+//* @param[in] sp(4) - beta_cr
+//* Beta Value for the boundary in V
+//*
+//* @returns
+//* None
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+    .global ih264_deblk_chroma_horz_bs4_av8
+
+ih264_deblk_chroma_horz_bs4_av8:
+
+    // STMFD sp!,{x4-x6,x14}            //
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+    mov x6, x5                          //x6 = beta_cr (6th argument, arrives in x5)
+    mov x5, x4                          //x5 = alpha_cr (5th argument, arrives in x4)
+    sub x0, x0, x1, lsl #1              //x0 = uc_edgePixel pointing to p1 of chroma
+    ld2 {v6.8b, v7.8b}, [x0], x1        //D6 = p1u , D7 = p1v
+    mov x4, x0                          //Keeping a backup of the pointer p0 of chroma
+    ld2 {v4.8b, v5.8b}, [x0], x1        //D4 = p0u , D5 = p0v
+    dup v20.8b, w2                      //D20 contains alpha_cb
+    dup v21.8b, w5                      //D21 contains alpha_cr
+    mov v20.d[1], v21.d[0]              //v20 = {alpha_cb (low), alpha_cr (high)}
+    ld2 {v0.8b, v1.8b}, [x0], x1        //D0 = q0u , D1 = q0v
+    uaddl v8.8h, v6.8b, v0.8b           //Q4 = q0 + p1 (U)
+    uaddl v10.8h, v7.8b, v1.8b          //Q4,Q5 = q0 + p1
+    movi v31.8b, #2                     //constant 2 for the multiply-accumulate below
+    ld2 {v2.8b, v3.8b}, [x0]            //D2 = q1u , D3 = q1v
+    mov v0.d[1], v1.d[0]                //pack U (low) / V (high) halves into one 128-bit reg: q0
+    mov v2.d[1], v3.d[0]                //q1
+    mov v4.d[1], v5.d[0]                //p0
+    mov v6.d[1], v7.d[0]                //p1
+    uabd v26.16b, v6.16b , v4.16b       //Q13 = ABS(p1 - p0)
+    umlal v8.8h, v2.8b, v31.8b          //Q4 += 2*q1 (U)
+    umlal v10.8h, v3.8b, v31.8b         //Q5,Q4 = (X2(q1U) + q0U + p1U)
+    uabd v22.16b, v4.16b , v0.16b       //Q11 = ABS(p0 - q0)
+    uabd v24.16b, v2.16b , v0.16b       //Q12 = ABS(q1 - q0)
+    uaddl v14.8h, v4.8b, v2.8b          //Q7 = p0 + q1 (U)
+    uaddl v28.8h, v5.8b, v3.8b          //Q14,Q7 = P0 + Q1
+    dup v16.8b, w3                      //D16 contains beta_cb
+    dup v17.8b, w6                      //D17 contains beta_cr
+    mov v16.d[1], v17.d[0]              //v16 = {beta_cb (low), beta_cr (high)}
+    umlal v14.8h, v6.8b, v31.8b         //Q7 += 2*p1 (U)
+    umlal v28.8h, v7.8b, v31.8b         //Q14,Q7 = (X2(p1U) + p0U + q1U)
+    cmhs v18.16b, v22.16b, v20.16b      //ABS(p0 - q0) >= Alpha ?
+    cmhs v24.16b, v24.16b, v16.16b      //ABS(q1 - q0) >= Beta ?
+    cmhs v26.16b, v26.16b, v16.16b      //ABS(p1 - p0) >= Beta ?
+    rshrn v8.8b, v8.8h, #2              //rounded narrowing shift: U half
+    rshrn v9.8b, v10.8h, #2             //Q4 = (X2(q1U) + q0U + p1U + 2) >> 2
+    mov v8.d[1], v9.d[0]
+    orr v18.16b, v18.16b , v24.16b      //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
+    rshrn v10.8b, v14.8h, #2            //rounded narrowing shift: U half
+    rshrn v11.8b, v28.8h, #2            //Q5 = (X2(p1U) + p0U + q1U + 2) >> 2
+    mov v10.d[1], v11.d[0]
+    orr v18.16b, v18.16b , v26.16b      //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
+    bit v10.16b, v4.16b , v18.16b       //keep the original p0 where the filter condition fails
+    bit v8.16b, v0.16b , v18.16b        //keep the original q0 where the filter condition fails
+    mov v11.d[0], v10.d[1]              //split U/V halves back for interleaved st2
+    mov v9.d[0], v8.d[1]
+    st2 {v10.8b, v11.8b}, [x4], x1      //store filtered p0 row
+    st2 {v8.8b, v9.8b}, [x4]            //store filtered q0 row
+    // LDMFD sp!,{x4-x6,pc}             //
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* Performs filtering of a chroma block vertical edge when the
+//* boundary strength is set to 4 in high profile
+//*
+//* @par Description:
+//* This operation is described in Sec. 8.7.2.4 under the title
+//* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+//*
+//* @param[in] x0 - pu1_src
+//* Pointer to the src sample q0
+//*
+//* @param[in] x1 - src_strd
+//* Source stride
+//*
+//* @param[in] x2 - alpha_cb
+//* Alpha Value for the boundary in U
+//*
+//* @param[in] x3 - beta_cb
+//* Beta Value for the boundary in U
+//*
+//* @param[in] sp(0) - alpha_cr
+//* Alpha Value for the boundary in V
+//*
+//* @param[in] sp(4) - beta_cr
+//* Beta Value for the boundary in V
+//*
+//* @returns
+//* None
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+    .global ih264_deblk_chroma_vert_bs4_av8
+
+ih264_deblk_chroma_vert_bs4_av8:
+
+    // STMFD sp!,{x4,x5,x12,x14}
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    sub x0, x0, #4                  //point x0 to p1u of row0.
+    mov x12, x0                     //keep a back up of x0 for buffer write
+
+    add x2, x2, x4, lsl #8          //x2 = (alpha_cr,alpha_cb)
+    add x3, x3, x5, lsl #8          //x3 = (beta_cr,beta_cb)
+
+    ld4 {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1    //gather one UV pair per column (p1,p0,q0,q1), rows 0-3
+    ld4 {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1
+    ld4 {v0.h, v1.h, v2.h, v3.h}[2], [x0], x1
+    ld4 {v0.h, v1.h, v2.h, v3.h}[3], [x0], x1
+
+    ld4 {v4.h, v5.h, v6.h, v7.h}[0], [x0], x1    //rows 4-7
+    ld4 {v4.h, v5.h, v6.h, v7.h}[1], [x0], x1
+    ld4 {v4.h, v5.h, v6.h, v7.h}[2], [x0], x1
+    ld4 {v4.h, v5.h, v6.h, v7.h}[3], [x0], x1
+
+    mov v10.16b, v2.16b             //rearrange so v0,v1 = p1, v2,v3 = p0, v4,v5 = q0, v6,v7 = q1
+    mov v2.16b, v1.16b
+    mov v1.16b, v4.16b
+    mov v4.16b, v10.16b
+    mov v10.16b, v6.16b
+    mov v6.16b, v3.16b
+    mov v3.16b, v5.16b
+    mov v5.16b, v10.16b
+
+    dup v22.8h, w2                  //Q11 = alpha ({alpha_cb,alpha_cr} per 16-bit lane)
+    dup v24.8h, w3                  //Q12 = beta ({beta_cb,beta_cr} per 16-bit lane)
+    movi v31.8b, #2                 //constant 2 for the multiply-accumulate below
+
+    mov v0.d[1], v1.d[0]            //pack rows 0-3 (low) and 4-7 (high) into 128-bit regs
+    mov v2.d[1], v3.d[0]
+    mov v4.d[1], v5.d[0]
+    mov v6.d[1], v7.d[0]
+
+    uabd v8.16b, v2.16b , v4.16b    //|p0-q0|
+    uabd v10.16b, v6.16b , v4.16b   //|q1-q0|
+    uabd v12.16b, v0.16b , v2.16b   //|p1-p0|
+    uaddl v14.8h, v2.8b, v6.8b      //(p0 + q1), low half
+    uaddl v16.8h, v3.8b, v7.8b      //(p0 + q1)
+    cmhi v8.16b, v22.16b , v8.16b   //|p0-q0| < alpha ?
+    cmhi v10.16b, v24.16b , v10.16b //|q1-q0| < beta ?
+    cmhi v12.16b, v24.16b , v12.16b //|p1-p0| < beta ?
+    umlal v14.8h, v0.8b, v31.8b     //2*p1 + (p0 + q1), low half
+    umlal v16.8h, v1.8b, v31.8b     //2*p1 + (p0 + q1)
+    uaddl v18.8h, v0.8b, v4.8b      //(p1 + q0), low half
+    uaddl v20.8h, v1.8b, v5.8b      //(p1 + q0)
+    and v8.16b, v8.16b , v10.16b    //|p0-q0| < alpha && |q1-q0| < beta
+    umlal v18.8h, v6.8b, v31.8b     //2*q1 + (p1 + q0), low half
+    umlal v20.8h, v7.8b, v31.8b     //2*q1 + (p1 + q0)
+
+    rshrn v14.8b, v14.8h, #2        //rounded narrowing shift, low half
+    rshrn v15.8b, v16.8h, #2        //(2*p1 + (p0 + q1) + 2) >> 2
+    mov v14.d[1], v15.d[0]
+    and v8.16b, v8.16b , v12.16b    //|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+    rshrn v18.8b, v18.8h, #2        //rounded narrowing shift, low half
+    rshrn v19.8b, v20.8h, #2        //(2*q1 + (p1 + q0) + 2) >> 2
+    mov v18.d[1], v19.d[0]
+    bit v2.16b, v14.16b , v8.16b    //p0' written only where the filter condition holds
+    bit v4.16b, v18.16b , v8.16b    //q0' written only where the filter condition holds
+
+    mov v1.d[0], v0.d[1]            //unpack 128-bit regs back into per-row-group halves
+    mov v3.d[0], v2.d[1]
+    mov v5.d[0], v4.d[1]
+    mov v7.d[0], v6.d[1]
+
+    mov v10.16b, v1.16b             //restore the p1,p0,q0,q1 column order expected by st4
+    mov v1.16b, v2.16b
+    mov v2.16b, v4.16b
+    mov v4.16b, v10.16b
+    mov v10.16b, v3.16b
+    mov v3.16b, v6.16b
+    mov v6.16b, v5.16b
+    mov v5.16b, v10.16b
+
+    st4 {v0.h, v1.h, v2.h, v3.h}[0], [x12], x1   //write back rows 0-3
+    st4 {v0.h, v1.h, v2.h, v3.h}[1], [x12], x1
+    st4 {v0.h, v1.h, v2.h, v3.h}[2], [x12], x1
+    st4 {v0.h, v1.h, v2.h, v3.h}[3], [x12], x1
+
+    st4 {v4.h, v5.h, v6.h, v7.h}[0], [x12], x1   //write back rows 4-7
+    st4 {v4.h, v5.h, v6.h, v7.h}[1], [x12], x1
+    st4 {v4.h, v5.h, v6.h, v7.h}[2], [x12], x1
+    st4 {v4.h, v5.h, v6.h, v7.h}[3], [x12], x1
+
+    // LDMFD sp!,{x4,x5,x12,pc}
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* Performs filtering of a chroma block horizontal edge for cases where the
+//* boundary strength is less than 4 in high profile
+//*
+//* @par Description:
+//* This operation is described in Sec. 8.7.2.3 under the title
+//* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+//*
+//* @param[in] x0 - pu1_src
+//* Pointer to the src sample q0
+//*
+//* @param[in] x1 - src_strd
+//* Source stride
+//*
+//* @param[in] x2 - alpha_cb
+//* Alpha Value for the boundary in U
+//*
+//* @param[in] x3 - beta_cb
+//* Beta Value for the boundary in U
+//*
+//* @param[in] sp(0) - alpha_cr
+//* Alpha Value for the boundary in V
+//*
+//* @param[in] sp(4) - beta_cr
+//* Beta Value for the boundary in V
+//*
+//* @param[in] sp(8) - u4_bs
+//* Packed Boundary strength array
+//*
+//* @param[in] sp(12) - pu1_cliptab_cb
+//* tc0_table for U
+//*
+//* @param[in] sp(16) - pu1_cliptab_cr
+//* tc0_table for V
+//*
+//* @returns
+//* None
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+    .global ih264_deblk_chroma_horz_bslt4_av8
+
+ih264_deblk_chroma_horz_bslt4_av8:
+
+    // STMFD sp!,{x4-x9,x14}            //
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+    mov x8, x7                          //x8 = pu1_cliptab_cb (8th argument, arrives in x7)
+    mov x7, x6                          //x7 = u4_bs (7th argument, arrives in x6)
+    ldr x9, [sp, #80]                   //x9 = pu1_cliptab_cr (9th argument, on the stack above push_v_regs + stp)
+    sub x0, x0, x1, lsl #1              //x0 = uc_edgePixelU pointing to p1 of chroma U
+    rev w7, w7                          //reverse the bytes of the packed boundary-strength word
+    mov v12.2s[0], w7                   //D12[0] = ui_Bs
+    ld1 {v16.s}[0], [x8]                //D16[0] contains cliptab_cb
+    ld1 {v17.s}[0], [x9]                //D17[0] contains cliptab_cr
+    ld2 {v6.8b, v7.8b}, [x0], x1        //Q3=p1
+    tbl v14.8b, {v16.16b}, v12.8b       //Retrieving cliptab values for U
+    tbl v28.8b, {v17.16b}, v12.8b       //Retrieving cliptab values for V
+    uxtl v12.8h, v12.8b                 //Q6 = uc_Bs in each 16 bit scalar
+    mov x6, x0                          //Keeping a backup of the pointer to chroma U P0
+    ld2 {v4.8b, v5.8b}, [x0], x1        //Q2=p0
+    movi v30.8b, #1                     //constant 1: tC = tC0 + 1
+    dup v20.8b, w2                      //D20 contains alpha_cb
+    dup v21.8b, w4                      //D21 contains alpha_cr
+    mov v20.d[1], v21.d[0]              //v20 = {alpha_cb (low), alpha_cr (high)}
+    ld2 {v0.8b, v1.8b}, [x0], x1        //Q0=q0
+    uxtl v14.8h, v14.8b                 //widen cliptab_cb values to 16-bit
+    uxtl v28.8h, v28.8b                 //widen cliptab_cr values to 16-bit
+    mov v15.d[0], v28.d[0]              //D14 has cliptab values for U, D15 for V
+    mov v14.d[1], v28.d[0]
+    ld2 {v2.8b, v3.8b}, [x0]            //Q1=q1
+    usubl v10.8h, v1.8b, v5.8b          //(q0 - p0), V half
+    usubl v8.8h, v0.8b, v4.8b           //Q5,Q4 = (q0 - p0)
+    mov v6.d[1], v7.d[0]
+    mov v4.d[1], v5.d[0]
+    uabd v26.16b, v6.16b , v4.16b       //Q13 = ABS(p1 - p0)
+    shl v10.8h, v10.8h, #2              //Q5 = (q0 - p0)<<2
+    mov v0.d[1], v1.d[0]
+    uabd v22.16b, v4.16b , v0.16b       //Q11 = ABS(p0 - q0)
+    shl v8.8h, v8.8h, #2                //Q4 = (q0 - p0)<<2
+    mov v14.d[1], v15.d[0]
+    sli v14.8h, v14.8h, #8              //duplicate each 8-bit tC0 into both bytes of its 16-bit lane
+    mov v15.d[0], v14.d[1]
+    mov v2.d[1], v3.d[0]
+    uabd v24.16b, v2.16b , v0.16b       //Q12 = ABS(q1 - q0)
+    cmhs v18.16b, v22.16b, v20.16b      //ABS(p0 - q0) >= Alpha ?
+    usubl v20.8h, v6.8b, v2.8b          //Q10 = (p1 - q1)L
+    usubl v6.8h, v7.8b, v3.8b           //Q3 = (p1 - q1)H
+    dup v16.8b, w3                      //D16 contains beta_cb
+    dup v17.8b, w5                      //D17 contains beta_cr
+    mov v16.d[1], v17.d[0]              //v16 = {beta_cb (low), beta_cr (high)}
+    add v8.8h, v8.8h , v20.8h           //low half of the sum below
+    add v10.8h, v10.8h , v6.8h          //Q5,Q4 = [ (q0 - p0)<<2 ] + (p1 - q1)
+    cmhs v24.16b, v24.16b, v16.16b      //ABS(q1 - q0) >= Beta ?
+    cmgt v12.4h, v12.4h, #0             //u4_bs != 0 ?
+    sqrshrn v8.8b, v8.8h, #3            //saturating rounded narrowing shift, low half
+    sqrshrn v9.8b, v10.8h, #3           //Q4 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
+    mov v8.d[1], v9.d[0]
+    add v14.8b, v14.8b , v30.8b         //D14 = C = C0+1 for U
+    cmhs v26.16b, v26.16b, v16.16b      //ABS(p1 - p0) >= Beta ?
+    orr v18.16b, v18.16b , v24.16b      //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
+    abs v6.16b, v8.16b                  //Q4 = ABS (i_macro)
+    add v15.8b, v15.8b , v30.8b         //D15 = C = C0+1 for V
+    mov v14.d[1], v15.d[0]
+    mov v13.8b, v12.8b
+    mov v12.d[1], v13.d[0]              //replicate the bs!=0 mask to both halves
+    orr v18.16b, v18.16b , v26.16b      //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
+    umin v14.16b, v6.16b , v14.16b      //Q7 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
+    bic v12.16b, v12.16b , v18.16b      //final condition: bs != 0 && none of the thresholds exceeded
+    cmge v8.16b, v8.16b, #0             //i_macro >= 0 ?
+    and v14.16b, v14.16b , v12.16b      //Making delta zero in places where values shouldn't be filtered
+    uqadd v16.16b, v4.16b , v14.16b     //Q8 = p0 + delta
+    uqsub v4.16b, v4.16b , v14.16b      //Q2 = p0 - delta
+    uqadd v18.16b, v0.16b , v14.16b     //Q9 = q0 + delta
+    uqsub v0.16b, v0.16b , v14.16b      //Q0 = q0 - delta
+    bif v16.16b, v4.16b , v8.16b        //Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
+    bif v0.16b, v18.16b , v8.16b        //Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
+    mov v17.d[0], v16.d[1]              //split U/V halves back for interleaved st2
+    mov v1.d[0], v0.d[1]
+    st2 {v16.8b, v17.8b}, [x6], x1      //store filtered p0 row
+    st2 {v0.8b, v1.8b}, [x6]            //store filtered q0 row
+
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* Performs filtering of a chroma block vertical edge for cases where the
+//* boundary strength is less than 4 in high profile
+//*
+//* @par Description:
+//* This operation is described in Sec. 8.7.2.3 under the title
+//* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+//*
+//* @param[in] x0 - pu1_src
+//* Pointer to the src sample q0
+//*
+//* @param[in] x1 - src_strd
+//* Source stride
+//*
+//* @param[in] x2 - alpha_cb
+//* Alpha Value for the boundary in U
+//*
+//* @param[in] x3 - beta_cb
+//* Beta Value for the boundary in U
+//*
+//* @param[in] sp(0) - alpha_cr
+//* Alpha Value for the boundary in V
+//*
+//* @param[in] sp(4) - beta_cr
+//* Beta Value for the boundary in V
+//*
+//* @param[in] sp(8) - u4_bs
+//* Packed Boundary strength array
+//*
+//* @param[in] sp(12) - pu1_cliptab_cb
+//* tc0_table for U
+//*
+//* @param[in] sp(16) - pu1_cliptab_cr
+//* tc0_table for V
+//*
+//* @returns
+//* None
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+    .global ih264_deblk_chroma_vert_bslt4_av8
+
+ih264_deblk_chroma_vert_bslt4_av8:
+
+    // STMFD sp!,{x4-x7,x10-x12,x14}
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+    mov x10, x7                     //x10 = pu1_cliptab_cb (8th argument, arrives in x7)
+    ldr x11, [sp, #80]              //x11 = pu1_cliptab_cr (9th argument, on the stack above push_v_regs + stp)
+    sub x0, x0, #4                  //point x0 to p1u of row0.
+    add x2, x2, x4, lsl #8          //x2 = (alpha_cr,alpha_cb)
+    add x3, x3, x5, lsl #8          //x3 = (beta_cr,beta_cb)
+    mov x12, x0                     //keep a back up of x0 for buffer write
+    ld4 {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1    //gather one UV pair per column (p1,p0,q0,q1), rows 0-3
+    ld4 {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1
+    ld4 {v0.h, v1.h, v2.h, v3.h}[2], [x0], x1
+    ld4 {v0.h, v1.h, v2.h, v3.h}[3], [x0], x1
+
+    ld4 {v4.h, v5.h, v6.h, v7.h}[0], [x0], x1    //rows 4-7
+    ld4 {v4.h, v5.h, v6.h, v7.h}[1], [x0], x1
+    ld4 {v4.h, v5.h, v6.h, v7.h}[2], [x0], x1
+    ld4 {v4.h, v5.h, v6.h, v7.h}[3], [x0], x1
+
+    mov v10.16b, v2.16b             //rearrange so v0,v1 = p1, v2,v3 = p0, v4,v5 = q0, v6,v7 = q1
+    mov v2.16b, v1.16b
+    mov v1.16b, v4.16b
+    mov v4.16b, v10.16b
+    mov v10.16b, v6.16b
+    mov v6.16b, v3.16b
+    mov v3.16b, v5.16b
+    mov v5.16b, v10.16b
+    dup v22.8h, w2                  //Q11 = alpha ({alpha_cb,alpha_cr} per 16-bit lane)
+    mov v2.d[1], v3.d[0]            //pack rows 0-3 (low) and 4-7 (high) into 128-bit regs
+    mov v4.d[1], v5.d[0]
+    uabd v8.16b, v2.16b , v4.16b    //|p0-q0|
+    dup v24.8h, w3                  //Q12 = beta ({beta_cb,beta_cr} per 16-bit lane)
+    mov v25.d[0], v24.d[1]
+    mov v6.d[1], v7.d[0]
+    mov v0.d[1], v1.d[0]
+    uabd v10.16b, v6.16b , v4.16b   //|q1-q0|
+    uabd v12.16b, v0.16b , v2.16b   //|p1-p0|
+    cmhi v8.16b, v22.16b , v8.16b   //|p0-q0| < alpha ?
+    usubl v14.8h, v0.8b, v6.8b      //(p1 - q1), low half
+    cmhi v10.16b, v24.16b , v10.16b //|q1-q0| < beta ?
+    usubl v16.8h, v1.8b, v7.8b      //(p1 - q1)
+    cmhi v12.16b, v24.16b , v12.16b //|p1-p0| < beta ?
+    usubl v18.8h, v4.8b, v2.8b      //(q0 - p0), low half
+    and v8.16b, v8.16b , v10.16b    //|p0-q0| < alpha && |q1-q0| < beta
+    usubl v20.8h, v5.8b, v3.8b      //(q0 - p0)
+    movi v28.8h, #4                 //constant 4 for 4*(q0 - p0)
+    ld1 {v24.s}[0], [x10]           //Load ClipTable for U
+    ld1 {v25.s}[0], [x11]           //Load ClipTable for V
+    rev w6, w6                      //Blocking strengths: reverse the bytes of the packed word
+    and v8.16b, v8.16b , v12.16b    //|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
+    mov v10.s[0], w6
+    mla v14.8h, v18.8h , v28.8h     //4*(q0 - p0) + (p1 - q1), low half
+    mla v16.8h, v20.8h , v28.8h     //4*(q0 - p0) + (p1 - q1)
+    uxtl v10.8h, v10.8b
+    sli v10.4h, v10.4h, #8          //duplicate each bs byte into both bytes of its 16-bit lane
+    tbl v12.8b, {v24.16b}, v10.8b   //tC0 for U
+    tbl v13.8b, {v25.16b}, v10.8b   //tC0 for V
+    zip1 v31.8b, v12.8b, v13.8b     //interleave U/V tC0 values (pixels are interleaved UV pairs)
+    zip2 v13.8b, v12.8b, v13.8b
+    mov v12.8b, v31.8b
+    mov v12.d[1], v13.d[0]
+    uxtl v10.4s, v10.4h
+    sli v10.4s, v10.4s, #16         //spread each bs value across all four bytes of its 32-bit lane
+    movi v24.16b, #1
+    add v12.16b, v12.16b , v24.16b  //tC0 + 1
+    cmhs v10.16b, v10.16b , v24.16b //u4_bs != 0 ?
+    and v8.16b, v8.16b , v10.16b    //|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
+    // Q0 - Q3(inputs),
+    // Q4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
+    // Q6 (tC)
+    srshr v14.8h, v14.8h, #3        //rounded shift, low half
+    srshr v16.8h, v16.8h, #3        //(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
+    cmgt v18.8h, v14.8h , #0        //delta > 0 ?, low half
+    cmgt v20.8h, v16.8h , #0        //delta > 0 ?
+    xtn v18.8b, v18.8h
+    xtn v19.8b, v20.8h              //Q9 = sign(delta)
+    mov v18.d[1], v19.d[0]
+    abs v14.8h, v14.8h              //|delta|, low half
+    abs v16.8h, v16.8h              //|delta|
+    xtn v14.8b, v14.8h              //narrow |delta| back to bytes
+    xtn v15.8b, v16.8h
+    mov v14.d[1], v15.d[0]
+    umin v14.16b, v14.16b , v12.16b //Q7 = |delta| clipped to tC
+    uqadd v20.16b, v2.16b , v14.16b //p0+|delta|
+    uqadd v22.16b, v4.16b , v14.16b //q0+|delta|
+    uqsub v24.16b, v2.16b , v14.16b //p0-|delta|
+    uqsub v26.16b, v4.16b , v14.16b //q0-|delta|
+    bit v24.16b, v20.16b , v18.16b  //p0 + delta
+    bit v22.16b, v26.16b , v18.16b  //q0 - delta
+    bit v2.16b, v24.16b , v8.16b    //write p0' only where the final condition holds
+    bit v4.16b, v22.16b , v8.16b    //write q0' only where the final condition holds
+    mov v1.d[0], v0.d[1]            //unpack 128-bit regs back into per-row-group halves
+    mov v3.d[0], v2.d[1]
+    mov v5.d[0], v4.d[1]
+    mov v7.d[0], v6.d[1]
+    mov v10.16b, v1.16b             //restore the p1,p0,q0,q1 column order expected by st4
+    mov v1.16b, v2.16b
+    mov v2.16b, v4.16b
+    mov v4.16b, v10.16b
+    mov v10.16b, v3.16b
+    mov v3.16b, v6.16b
+    mov v6.16b, v5.16b
+    mov v5.16b, v10.16b
+    st4 {v0.h, v1.h, v2.h, v3.h}[0], [x12], x1   //write back rows 0-3
+    st4 {v0.h, v1.h, v2.h, v3.h}[1], [x12], x1
+    st4 {v0.h, v1.h, v2.h, v3.h}[2], [x12], x1
+    st4 {v0.h, v1.h, v2.h, v3.h}[3], [x12], x1
+
+    st4 {v4.h, v5.h, v6.h, v7.h}[0], [x12], x1   //write back rows 4-7
+    st4 {v4.h, v5.h, v6.h, v7.h}[1], [x12], x1
+    st4 {v4.h, v5.h, v6.h, v7.h}[2], [x12], x1
+    st4 {v4.h, v5.h, v6.h, v7.h}[3], [x12], x1
+
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
diff --git a/common/armv8/ih264_deblk_luma_av8.s b/common/armv8/ih264_deblk_luma_av8.s
new file mode 100755
index 0000000..bcdb03f
--- /dev/null
+++ b/common/armv8/ih264_deblk_luma_av8.s
@@ -0,0 +1,1084 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///*****************************************************************************/
+///* */
+///* File Name : ih264_deblk_luma_av8.s */
+///* */
+///* Description : Contains function definitions for deblocking luma */
+///* edge. Functions are coded in NEON assembly and can */
+///* be compiled using ARM RVDS. */
+///* */
+///* List of Functions : ih264_deblk_luma_vert_bs4_av8() */
+///* ih264_deblk_luma_vert_bslt4_av8() */
+///* ih264_deblk_luma_horz_bs4_av8() */
+///* ih264_deblk_luma_horz_bslt4_av8() */
+///* */
+///* Issues / Problems : None */
+///* */
+///* Revision History : */
+///* */
+///* DD MM YYYY Author(s) Changes (Describe the changes made) */
+///* 28 11 2013 Ittiam Draft */
+///* */
+///*****************************************************************************/
+
+
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* Performs filtering of a luma block horizontal edge for cases where the
+//* boundary strength is less than 4
+//*
+//* @par Description:
+//* This operation is described in Sec. 8.7.2.3 under the title
+//* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+//*
+//* @param[in] x0 - pu1_src
+//* Pointer to the src sample q0
+//*
+//* @param[in] x1 - src_strd
+//* Source stride
+//*
+//* @param[in] x2 - alpha
+//* Alpha Value for the boundary
+//*
+//* @param[in] x3 - beta
+//* Beta Value for the boundary
+//*
+//* @param[in] sp(0) - u4_bs
+//* Packed Boundary strength array
+//*
+//* @param[in] sp(4) - pu1_cliptab
+//* tc0_table
+//*
+//* @returns
+//* None
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+    .global ih264_deblk_luma_horz_bslt4_av8
+
+ih264_deblk_luma_horz_bslt4_av8:
+
+    // STMFD sp!,{x4-x7,x14}
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    //LDRD x4,x5,[SP,#0x14] //x4 = ui_Bs , x5 = *puc_ClpTab
+    sub x0, x0, x1, lsl #1 //x0 = q0 - 2*src_strd (pointer to p1 row)
+    sub x0, x0, x1 //x0 pointer to p2
+    rev w4, w4 //reverse byte order of packed boundary strengths u4_bs
+    ld1 {v10.8b, v11.8b}, [x0], x1 //p2 values are loaded into q5
+    mov v12.2s[0], w4 //d12[0] = ui_Bs
+    mov x6, x0 //keeping backup of pointer to p1
+    ld1 {v8.8b, v9.8b}, [x0], x1 //p1 values are loaded into q4
+    mov x7, x0 //keeping backup of pointer to p0
+    ld1 {v6.8b, v7.8b}, [x0], x1 //p0 values are loaded into q3
+    uxtl v12.8h, v12.8b //q6 = uc_Bs in each 16 bit scalar
+    ld1 {v0.8b, v1.8b}, [x0], x1 //q0 values are loaded into q0
+    mov v10.d[1], v11.d[0]
+    mov v8.d[1], v9.d[0]
+    mov v6.d[1], v7.d[0]
+    uabd v26.16b, v8.16b, v6.16b //ABS(p1 - p0)
+    ld1 {v2.8b, v3.8b}, [x0], x1 //q1 values are loaded into q1
+    mov v0.d[1], v1.d[0]
+    mov v2.d[1], v3.d[0]
+    uabd v22.16b, v6.16b, v0.16b //ABS(p0 - q0)
+    ld1 {v16.s}[0], [x5] //D16[0] contains cliptab
+    uabd v24.16b, v2.16b, v0.16b //ABS(q1 - q0)
+    ld1 {v4.8b, v5.8b}, [x0], x1 //q2 values are loaded into q2
+    tbl v14.8b, {v16.16b}, v12.8b //tC0 = puc_ClipTab[uc_Bs] per edge
+    mov v4.d[1], v5.d[0]
+    dup v20.16b, w2 //Q10 contains alpha
+    dup v16.16b, w3 //Q8 contains beta
+    uxtl v12.4s, v12.4h //widen bS to one 32-bit lane per 4-pixel edge
+    uxtl v14.4s, v14.4h //widen tC0 to one 32-bit lane per 4-pixel edge
+    uabd v28.16b, v10.16b, v6.16b //Ap = ABS(p2 - p0)
+    uabd v30.16b, v4.16b, v0.16b //Aq = ABS(q2 - q0)
+    cmgt v12.4s, v12.4s, #0 //u4_bs != 0 per edge
+    sli v14.4s, v14.4s, #8 //replicate tC0 into adjacent byte
+    cmhs v18.16b, v22.16b, v20.16b //ABS(p0 - q0) >= Alpha
+    cmhs v24.16b, v24.16b, v16.16b //ABS(q1 - q0) >= Beta
+    cmhs v26.16b, v26.16b, v16.16b //ABS(p1 - p0) >= Beta
+    cmhi v20.16b, v16.16b , v28.16b //Q10=(Ap<Beta)
+    cmhi v22.16b, v16.16b , v30.16b //Q11=(Aq<Beta)
+    sli v14.4s, v14.4s, #16 //tC0 now replicated across all 4 bytes of each edge
+    orr v18.16b, v18.16b , v24.16b //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
+    usubl v30.8h, v1.8b, v7.8b //(q0 - p0) H
+    usubl v24.8h, v0.8b, v6.8b //Q15,Q12 = (q0 - p0)
+    orr v18.16b, v18.16b , v26.16b //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
+    usubl v28.8h, v8.8b, v2.8b //Q14 = (p1 - q1)L
+    shl v26.8h, v30.8h, #2 //Q13 = (q0 - p0)<<2
+    shl v24.8h, v24.8h, #2 //Q12 = (q0 - p0)<<2
+    usubl v30.8h, v9.8b, v3.8b //Q15 = (p1 - q1)H
+    bic v12.16b, v12.16b , v18.16b //final condition: bS != 0 && none of the thresholds exceeded
+    add v24.8h, v24.8h , v28.8h //L half
+    add v26.8h, v26.8h , v30.8h //Q13,Q12 = [ (q0 - p0)<<2 ] + (p1 - q1)
+    sub v18.16b, v14.16b , v20.16b //Q9 = C0 + (Ap < Beta)
+    urhadd v16.16b, v6.16b , v0.16b //Q8 = ((p0+q0+1) >> 1)
+    mov v17.d[0], v16.d[1]
+    sqrshrn v24.8b, v24.8h, #3 //L half
+    sqrshrn v25.8b, v26.8h, #3 //Q12 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
+    mov v24.d[1], v25.d[0]
+    sub v18.16b, v18.16b , v22.16b //Q9 = C0 + (Ap < Beta) + (Aq < Beta)
+    and v20.16b, v20.16b , v12.16b //(Ap < Beta) gated by final condition
+    and v22.16b, v22.16b , v12.16b //(Aq < Beta) gated by final condition
+    abs v26.16b, v24.16b //Q13 = ABS (i_macro)
+    uaddl v28.8h, v17.8b, v11.8b //H half
+    uaddl v10.8h, v16.8b, v10.8b //Q14,Q5 = p2 + (p0+q0+1)>>1
+    uaddl v30.8h, v17.8b, v5.8b //H half
+    umin v18.16b, v26.16b , v18.16b //Q9 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
+    ushll v26.8h, v9.8b, #1 //H half
+    uaddl v4.8h, v16.8b, v4.8b //Q15,Q2 = q2 + (p0+q0+1)>>1
+    ushll v16.8h, v8.8b, #1 //Q13,Q8 = (p1<<1)
+    and v18.16b, v18.16b , v12.16b //make delta zero where pixels should not be filtered
+    sub v28.8h, v28.8h , v26.8h //Q14,Q5 = [p2 + (p0+q0+1)>>1] - (p1<<1)
+    sub v10.8h, v10.8h , v16.8h //L half
+    ushll v16.8h, v2.8b, #1 //L half
+    ushll v26.8h, v3.8b, #1 //Q13,Q8 = (q1<<1)
+    sqshrn v29.8b, v28.8h, #1 //H half
+    sqshrn v28.8b, v10.8h, #1 //Q14 = i_macro_p1
+    mov v28.d[1], v29.d[0]
+    sub v4.8h, v4.8h , v16.8h //L half
+    sub v30.8h, v30.8h , v26.8h //Q15,Q2 = [q2 + (p0+q0+1)>>1] - (q1<<1)
+    neg v26.16b, v14.16b //Q13 = -C0
+    smin v28.16b, v28.16b , v14.16b //Q14 = min(C0,i_macro_p1)
+    cmge v24.16b, v24.16b, #0 //sign(i_macro): 0xFF where i_macro >= 0
+    sqshrn v31.8b, v30.8h, #1 //H half
+    sqshrn v30.8b, v4.8h, #1 //Q15 = i_macro_q1
+    mov v30.d[1], v31.d[0]
+    smax v28.16b, v28.16b , v26.16b //Q14 = max( - C0 , min(C0, i_macro_p1) )
+    uqadd v16.16b, v6.16b , v18.16b //Q8 = p0 + delta
+    uqsub v6.16b, v6.16b , v18.16b //Q3 = p0 - delta
+    smin v30.16b, v30.16b , v14.16b //Q15 = min(C0,i_macro_q1)
+    and v28.16b, v20.16b , v28.16b //condition check Ap<beta
+    uqadd v14.16b, v0.16b , v18.16b //Q7 = q0 + delta
+    uqsub v0.16b, v0.16b , v18.16b //Q0 = q0 - delta
+    smax v30.16b, v30.16b , v26.16b //Q15 = max( - C0 , min(C0, i_macro_q1) )
+    bif v16.16b, v6.16b , v24.16b //Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
+    bif v0.16b, v14.16b , v24.16b //Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
+    add v28.16b, v28.16b , v8.16b //p1 + clipped p1 delta
+    and v30.16b, v22.16b , v30.16b //condition check Aq<beta
+    st1 {v16.16b}, [x7], x1 //writing back filtered value of p0
+    add v30.16b, v30.16b , v2.16b //q1 + clipped q1 delta
+    st1 {v0.16b}, [x7], x1 //writing back filtered value of q0
+    st1 {v28.16b}, [x6] //writing back filtered value of p1
+    st1 {v30.16b}, [x7], x1 //writing back filtered value of q1
+
+    // LDMFD sp!,{x4-x7,pc}
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* Performs filtering of a luma block horizontal edge when the
+//* boundary strength is set to 4
+//*
+//* @par Description:
+//* This operation is described in Sec. 8.7.2.4 under the title
+//* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+//*
+//* @param[in] x0 - pu1_src
+//* Pointer to the src sample q0
+//*
+//* @param[in] x1 - src_strd
+//* Source stride
+//*
+//* @param[in] x2 - alpha
+//* Alpha Value for the boundary
+//*
+//* @param[in] x3 - beta
+//* Beta Value for the boundary
+//*
+//* @returns
+//* None
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+    .global ih264_deblk_luma_horz_bs4_av8
+
+ih264_deblk_luma_horz_bs4_av8:
+
+    // Back up necessary registers on stack
+    // STMFD sp!,{x12,x14}
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    // Init
+    dup v0.16b, w2 //duplicate alpha
+    sub x12, x0, x1 //pointer to p0 = q0 - src_strd
+    dup v2.16b, w3 //duplicate beta
+    sub x14, x0, x1, lsl#1 //pointer to p1 = q0 - src_strd*2
+    sub x2, x0, x1, lsl#2 //pointer to p3 = q0 - src_strd*4
+    sub x3, x14, x1 //pointer to p2 = p1 - src_strd
+
+    // Load Data
+    ld1 {v4.8b, v5.8b}, [x0], x1 //load q0 to Q2, q0 = q0 + src_strd
+    ld1 {v6.8b, v7.8b}, [x12] //load p0 to Q3
+    ld1 {v8.8b, v9.8b}, [x0], x1 //load q1 to Q4, q0 = q0 + src_strd
+    ld1 {v10.8b, v11.8b}, [x14] //load p1 to Q5
+    mov v4.d[1] , v5.d[0]
+    mov v6.d[1] , v7.d[0]
+    mov v8.d[1] , v9.d[0]
+    mov v10.d[1] , v11.d[0]
+
+    // Filter Decision
+    uabd v12.16b , v4.16b, v6.16b //ABS(p0 - q0)
+    uabd v14.16b , v8.16b, v4.16b //ABS(q1 - q0)
+    uabd v16.16b , v10.16b, v6.16b //ABS(p1 - p0)
+    cmhs v18.16b, v12.16b , v0.16b //ABS(p0 - q0) >= Alpha
+    cmhs v14.16b, v14.16b , v2.16b //ABS(q1 - q0) >= Beta
+    cmhs v16.16b, v16.16b , v2.16b //ABS(p1 - p0) >= Beta
+    movi v20.16b, #2
+    orr v18.16b, v18.16b , v14.16b //ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta
+    ld1 {v14.8b, v15.8b}, [x0], x1 //load q2 to Q7, q0 = q0 + src_strd
+    mov v14.d[1] , v15.d[0]
+    orr v18.16b, v18.16b , v16.16b //ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta
+    usra v20.16b, v0.16b, #2 //(alpha >> 2) + 2
+    uabd v22.16b , v14.16b, v4.16b //Aq = ABS(q2 - q0)
+    uaddl v24.8h, v4.8b, v6.8b //p0+q0 L
+    uaddl v26.8h, v5.8b, v7.8b //p0+q0 H
+    cmhi v22.16b, v2.16b , v22.16b //Aq < Beta
+    cmhi v20.16b, v20.16b , v12.16b //(ABS(p0 - q0) <((Alpha >>2) + 2))
+    // Deblock Filtering q0', q1', q2'
+    uaddw v28.8h, v24.8h , v8.8b //p0+q0+q1 L
+    uaddw v30.8h, v26.8h , v9.8b //p0+q0+q1 H
+    and v22.16b, v22.16b , v20.16b //(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
+    // q0' if (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) TRUE
+    add v16.8h, v28.8h , v28.8h //2*(p0+q0+q1)L
+    add v0.8h, v30.8h , v30.8h //2*(p0+q0+q1)H
+    uaddw v16.8h, v16.8h , v14.8b //2*(p0+q0+q1)+q2 L
+    uaddw v0.8h, v0.8h , v15.8b //2*(p0+q0+q1)+q2 H
+    uaddw v16.8h, v16.8h , v10.8b //2*(p0+q0+q1)+q2 +p1 L
+    uaddw v0.8h, v0.8h , v11.8b //2*(p0+q0+q1)+q2 +p1 H
+    rshrn v12.8b, v16.8h, #3 //(2*(p0+q0+q1)+q2 +p1 +4)>> 3 L [q0']
+    rshrn v13.8b, v0.8h, #3 //(2*(p0+q0+q1)+q2 +p1 +4)>> 3 H [q0']
+    mov v12.d[1] , v13.d[0]
+    // q0" if (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) FALSE
+    uaddl v16.8h, v8.8b, v8.8b //2*q1 L
+    uaddl v0.8h, v9.8b, v9.8b //2*q1 H
+    uaddw v16.8h, v16.8h , v4.8b //2*q1+q0 L
+    uaddw v0.8h, v0.8h , v5.8b //2*q1+q0 H
+    uaddw v16.8h, v16.8h , v10.8b //2*q1+q0+p1 L
+    uaddw v0.8h, v0.8h , v11.8b //2*q1+q0+p1 H
+    rshrn v16.8b, v16.8h, #2 //(2*q1+q0+p1+2)>>2 L [q0"]
+    rshrn v17.8b, v0.8h, #2 //(2*q1+q0+p1+2)>>2 H [q0"]
+    mov v16.d[1] , v17.d[0]
+    uaddw v28.8h, v28.8h , v14.8b //p0+q0+q1+q2 L
+    uaddw v30.8h, v30.8h , v15.8b //p0+q0+q1+q2 H
+    ld1 {v0.8b, v1.8b}, [x0], x1 //load q3 to Q0, q0 = q0 + src_strd
+    mov v0.d[1] , v1.d[0]
+    bit v16.16b, v12.16b , v22.16b //choosing between q0' and q0" depending on condn
+    sub x0, x0, x1, lsl #2 //pointer to q0
+    bic v22.16b, v22.16b , v18.16b //!((ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta))
+    // && (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
+    rshrn v12.8b, v28.8h, #2 //(p0+q0+q1+q2+2)>>2 L [q1']
+    rshrn v13.8b, v30.8h, #2 //(p0+q0+q1+q2+2)>>2 H [q1']
+    mov v12.d[1] , v13.d[0]
+    bif v4.16b, v16.16b , v18.16b //choose q0 or filtered q0
+    mov v5.d[0] , v4.d[1]
+    uaddl v16.8h, v14.8b, v0.8b //q2+q3,L
+    uaddl v0.8h, v15.8b, v1.8b //q2+q3,H
+    add v28.8h, v28.8h , v16.8h //p0+q0+q1+2*q2+q3 L
+    st1 {v4.8b, v5.8b}, [x0], x1 //store q0
+    add v30.8h, v30.8h , v0.8h //p0+q0+q1+2*q2+q3 H
+    add v28.8h, v28.8h , v16.8h //p0+q0+q1+3*q2+2*q3 L
+    add v30.8h, v30.8h , v0.8h //p0+q0+q1+3*q2+2*q3 H
+    rshrn v0.8b, v28.8h, #3 //(p0+q0+q1+3*q2+2*q3+4)>>3 L [q2']
+    rshrn v1.8b, v30.8h, #3 //(p0+q0+q1+3*q2+2*q3+4)>>3 H [q2']
+    mov v0.d[1] , v1.d[0]
+    ld1 {v30.8b, v31.8b}, [x3] //load p2 to Q15
+    mov v30.d[1] , v31.d[0]
+    bif v12.16b, v8.16b , v22.16b //choose q1 or filtered value of q1
+    mov v13.d[0] , v12.d[1]
+    uabd v16.16b , v30.16b, v6.16b //Ap = ABS(p2 - p0)
+    uaddw v24.8h, v24.8h , v10.8b //p0+q0+p1 L
+    bif v0.16b, v14.16b , v22.16b //choose q2 or filtered q2
+    mov v1.d[0] , v0.d[1]
+    uaddw v26.8h, v26.8h , v11.8b //p0+q0+p1 H
+    st1 {v12.8b, v13.8b}, [x0], x1 //store q1
+    cmhi v16.16b, v2.16b , v16.16b //Ap < Beta
+    add v28.8h, v24.8h , v24.8h //2*(p0+q0+p1) L
+    add v4.8h, v26.8h , v26.8h //2*(p0+q0+p1) H
+    st1 {v0.8b, v1.8b}, [x0], x1 //store q2
+    and v20.16b, v20.16b , v16.16b //((Ap < Beta) && (ABS(p0 - q0) <((Alpha >>2) + 2)))
+    uaddw v28.8h, v28.8h , v30.8b //2*(p0+q0+p1)+p2 L
+    uaddw v4.8h, v4.8h , v31.8b //2*(p0+q0+p1)+p2 H
+    uaddw v28.8h, v28.8h , v8.8b //2*(p0+q0+p1)+p2+q1 L
+    uaddw v4.8h, v4.8h , v9.8b //2*(p0+q0+p1)+p2+q1 H
+    rshrn v28.8b, v28.8h, #3 //(2*(p0+q0+p1)+p2+q1+4)>>3 L,p0'
+    rshrn v29.8b, v4.8h, #3 //(2*(p0+q0+p1)+p2+q1+4)>>3 H,p0'
+    mov v28.d[1] , v29.d[0]
+    movi v0.8b, #2
+    movi v1.4h, #2
+    uaddl v2.8h, v6.8b, v8.8b //p0+q1 L
+    umlal v2.8h, v10.8b, v0.8b //2*p1+p0+q1 L
+    uaddl v16.8h, v7.8b, v9.8b //p0+q1 H
+    umlal v16.8h, v11.8b, v0.8b //2*p1+p0+q1 H
+    uaddw v12.8h, v24.8h , v30.8b //(p0+q0+p1) +p2 L
+    ld1 {v24.8b, v25.8b}, [x2] //load p3,Q12
+    mov v24.d[1] , v25.d[0]
+    uaddw v4.8h, v26.8h , v31.8b //(p0+q0+p1) +p2 H
+    uaddl v8.8h, v30.8b, v24.8b //p2+p3 L
+    rshrn v26.8b, v12.8h, #2 //((p0+q0+p1)+p2 +2)>>2,p1' L
+    rshrn v2.8b, v2.8h, #2 //(2*p1+p0+q1+2)>>2,p0"L
+    rshrn v27.8b, v4.8h, #2 //((p0+q0+p1)+p2 +2)>>2,p1' H
+    rshrn v3.8b, v16.8h, #2 //(2*p1+p0+q1+2)>>2,p0" H
+    mov v26.d[1] , v27.d[0]
+    mov v2.d[1] , v3.d[0]
+    uaddl v16.8h, v31.8b, v25.8b //p2+p3 H
+    mla v12.8h, v8.8h , v1.4h[0] //(p0+q0+p1)+3*p2+2*p3 L
+    mla v4.8h, v16.8h , v1.4h[0] //(p0+q0+p1)+3*p2+2*p3 H
+    bic v16.16b, v20.16b , v18.16b //!((ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta))
+    mov v17.d[0] , v16.d[1] //&& (Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
+    bit v2.16b, v28.16b , v20.16b //choosing between p0' and p0"
+    mov v3.d[0] , v2.d[1]
+    rshrn v12.8b, v12.8h, #3 //((p0+q0+p1)+3*p2+2*p3+4)>>3 L p2'
+    rshrn v13.8b, v4.8h, #3 //((p0+q0+p1)+3*p2+2*p3+4)>>3 H p2'
+    mov v12.d[1] , v13.d[0]
+    bif v6.16b, v2.16b , v18.16b //choosing between p0 and filtered value of p0
+    bit v10.16b, v26.16b , v16.16b //choosing between p1 and p1'
+    bit v30.16b, v12.16b , v16.16b //choosing between p2 and p2'
+    st1 {v6.16b}, [x12] //store p0
+    st1 {v10.16b}, [x14] //store p1
+    st1 {v30.16b}, [x3] //store p2
+
+    // LDMFD sp!,{x12,pc}
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* Performs filtering of a luma block vertical edge for cases where the
+//* boundary strength is less than 4
+//*
+//* @par Description:
+//* This operation is described in Sec. 8.7.2.3 under the title
+//* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
+//*
+//* @param[in] x0 - pu1_src
+//* Pointer to the src sample q0
+//*
+//* @param[in] x1 - src_strd
+//* Source stride
+//*
+//* @param[in] x2 - alpha
+//* Alpha Value for the boundary
+//*
+//* @param[in] x3 - beta
+//* Beta Value for the boundary
+//*
+//* @param[in] sp(0) - u4_bs
+//* Packed Boundary strength array
+//*
+//* @param[in] sp(4) - pu1_cliptab
+//* tc0_table
+//*
+//* @returns
+//* None
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+    .global ih264_deblk_luma_vert_bslt4_av8
+
+ih264_deblk_luma_vert_bslt4_av8:
+
+    // STMFD sp!,{x12,x14}
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    sub x0, x0, #4 //pointer uc_edgePixel-4
+    mov x12, x4 //x12 = u4_bs (packed boundary strengths)
+    mov x14, x5 //x14 = pu1_cliptab
+    mov x17, x0 //backup of row pointer
+    //loading p3:p2:p1:p0:q0:q1:q2:q3 for every row
+    ld1 {v0.8b}, [x0], x1 //row1
+    ld1 {v2.8b}, [x0], x1 //row2
+    ld1 {v4.8b}, [x0], x1 //row3
+    rev w12, w12 //reversing ui_bs
+    ld1 {v6.8b}, [x0], x1 //row4
+    mov v18.2s[0], w12 //d12[0] = ui_Bs
+    ld1 {v16.s}[0], [x14] //D16[0] contains cliptab
+    ld1 {v8.8b}, [x0], x1 //row5
+    uxtl v18.8h, v18.8b //q6 = uc_Bs in each 16 bit scalar
+    ld1 {v10.8b}, [x0], x1 //row6
+    ld1 {v12.8b}, [x0], x1 //row7
+    tbl v16.8b, {v16.16b}, v18.8b //puc_ClipTab[uc_Bs]
+    ld1 {v14.8b}, [x0], x1 //row8
+    ld1 {v1.8b}, [x0], x1 //row9
+    uxtl v16.4s, v16.4h //widen tC0 to one 32-bit lane per 4-row edge
+    ld1 {v3.8b}, [x0], x1 //row10
+    ld1 {v5.8b}, [x0], x1 //row11
+    ld1 {v7.8b}, [x0], x1 //row12
+    sli v16.4s, v16.4s, #8 //replicate tC0 into adjacent byte
+    ld1 {v9.8b}, [x0], x1 //row13
+    ld1 {v11.8b}, [x0], x1 //row14
+    ld1 {v13.8b}, [x0], x1 //row15
+    sli v16.4s, v16.4s, #16 //tC0 now replicated across all 4 bytes of each edge
+    ld1 {v15.8b}, [x0], x1 //row16
+
+
+    //taking two 8x8 transposes
+    //2X2 transposes
+    trn1 v21.8b, v0.8b, v2.8b
+    trn2 v2.8b, v0.8b, v2.8b //row1 &2
+    mov v0.8b, v21.8b
+    trn1 v21.8b, v4.8b, v6.8b
+    trn2 v6.8b, v4.8b, v6.8b //row3&row4
+    mov v4.8b, v21.8b
+    trn1 v21.8b, v8.8b, v10.8b
+    trn2 v10.8b, v8.8b, v10.8b //row5&6
+    mov v8.8b, v21.8b
+    trn1 v21.8b, v12.8b, v14.8b
+    trn2 v14.8b, v12.8b, v14.8b //row7 & 8
+    mov v12.8b, v21.8b
+    trn1 v21.8b, v1.8b, v3.8b
+    trn2 v3.8b, v1.8b, v3.8b //row9 &10
+    mov v1.8b, v21.8b
+    trn1 v21.8b, v5.8b, v7.8b
+    trn2 v7.8b, v5.8b, v7.8b //row11 & 12
+    mov v5.8b, v21.8b
+    trn1 v21.8b, v9.8b, v11.8b
+    trn2 v11.8b, v9.8b, v11.8b //row13 &14
+    mov v9.8b, v21.8b
+    trn1 v21.8b, v13.8b, v15.8b
+    trn2 v15.8b, v13.8b, v15.8b //row15 & 16
+    mov v13.8b, v21.8b
+    //4x4 transposes
+    trn1 v21.4h, v2.4h, v6.4h
+    trn2 v6.4h, v2.4h, v6.4h //row2 & row4
+    mov v2.8b, v21.8b
+    trn1 v21.4h, v10.4h, v14.4h
+    trn2 v14.4h, v10.4h, v14.4h //row6 & row8
+    mov v10.8b, v21.8b
+    trn1 v21.4h, v3.4h, v7.4h
+    trn2 v7.4h, v3.4h, v7.4h //row10 & 12
+    mov v3.8b, v21.8b
+    trn1 v21.4h, v11.4h, v15.4h
+    trn2 v15.4h, v11.4h, v15.4h //row14 & row16
+    mov v11.8b, v21.8b
+    trn1 v21.2s, v6.2s, v14.2s
+    trn2 v14.2s, v6.2s, v14.2s //row4 & 8
+    mov v6.8b, v21.8b
+    trn1 v21.2s, v7.2s, v15.2s
+    trn2 v15.2s, v7.2s, v15.2s //row 12 & 16
+    mov v7.8b, v21.8b
+    //now Q3 ->p0 and Q7->q3
+    trn1 v21.4h, v0.4h, v4.4h
+    trn2 v4.4h, v0.4h, v4.4h //row1 & 3
+    mov v0.8b, v21.8b
+    trn1 v21.4h, v8.4h, v12.4h
+    trn2 v12.4h, v8.4h, v12.4h //row 5 & 7
+    mov v8.8b, v21.8b
+    trn1 v21.4h, v1.4h, v5.4h
+    trn2 v5.4h, v1.4h, v5.4h //row9 & row11
+    mov v1.8b, v21.8b
+    trn1 v21.4h, v9.4h, v13.4h
+    trn2 v13.4h, v9.4h, v13.4h //row13 & row15
+    mov v9.8b, v21.8b
+    trn1 v21.2s, v0.2s, v8.2s
+    trn2 v8.2s, v0.2s, v8.2s //row1 & row5
+    mov v0.8b, v21.8b
+    trn1 v21.2s, v1.2s, v9.2s
+    trn2 v9.2s, v1.2s, v9.2s //row9 & 13
+    mov v1.8b, v21.8b
+    //now Q0->p3 & Q4->q0
+    //starting processing as p0 and q0 are now ready
+    trn1 v21.2s, v2.2s, v10.2s
+    trn2 v10.2s, v2.2s, v10.2s //row2 &6
+    mov v2.8b, v21.8b
+    mov v6.d[1] , v7.d[0]
+    mov v8.d[1] , v9.d[0]
+    urhadd v20.16b, v6.16b , v8.16b //((p0 + q0 + 1) >> 1)
+    mov v21.d[0], v20.d[1]
+    trn1 v31.2s, v3.2s, v11.2s
+    trn2 v11.2s, v3.2s, v11.2s //row10&row14
+    mov v3.8b, v31.8b
+    movi v19.8b, #2
+    mov v18.d[1], v19.d[0]
+    //now Q1->p2 & Q5->q1
+    trn1 v31.2s, v4.2s, v12.2s
+    trn2 v12.2s, v4.2s, v12.2s //row3 & 7
+    mov v4.8b, v31.8b
+    uabd v22.16b , v6.16b, v8.16b //ABS(p0 - q0)
+    trn1 v31.2s, v5.2s, v13.2s
+    trn2 v13.2s, v5.2s, v13.2s //row11 & row15
+    mov v5.8b, v31.8b
+    mov v0.d[1] , v1.d[0]
+    mov v2.d[1] , v3.d[0]
+    mov v4.d[1] , v5.d[0]
+    mov v10.d[1] , v11.d[0]
+    mov v12.d[1] , v13.d[0]
+    mov v14.d[1] , v15.d[0]
+    uaddl v24.8h, v20.8b, v2.8b //(p2 + ((p0 + q0 + 1) >> 1) L
+    //now Q2->p1,Q6->q2
+    uaddl v26.8h, v21.8b, v3.8b //(p2 + ((p0 + q0 + 1) >> 1) H
+    umlsl v24.8h, v4.8b, v19.8b //(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) L
+    umlsl v26.8h, v5.8b, v19.8b //(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) H
+    dup v28.16b, w2 //alpha
+    cmhs v22.16b, v22.16b , v28.16b //ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
+    dup v28.16b, w3 //beta
+    uabd v30.16b , v10.16b, v8.16b //ABS(q1 - q0)
+    sqshrn v24.8b, v24.8h, #1 //((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) L
+    sqshrn v25.8b, v26.8h, #1 //((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) H
+    mov v24.d[1], v25.d[0]
+    cmhs v30.16b, v30.16b , v28.16b //ABS(q1 - q0) >= Beta
+    uabd v26.16b , v4.16b, v6.16b //ABS(p1 - p0)
+
+    smin v24.16b, v24.16b , v16.16b //min(deltap1 ,C0)
+    orr v22.16b, v22.16b , v30.16b //ABS(q1 - q0) >= Beta ||ABS(p0 - q0) >= Alpha
+    neg v30.16b, v16.16b //-C0
+    cmhs v26.16b, v26.16b , v28.16b //ABS(p1 - p0) >= Beta
+    smax v24.16b, v24.16b , v30.16b //max(deltap1,-C0)
+    orr v22.16b, v22.16b , v26.16b //ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)
+    uxtl v26.4s, v18.4h //ui_bs
+    uaddl v18.8h, v20.8b, v12.8b //q2 + ((p0 + q0 + 1) >> 1) L
+    cmeq v26.4s, v26.4s , #0 //ui_bs == 0 per edge
+    usubw v18.8h, v18.8h , v10.8b //(q2 + ((p0 + q0 + 1) >> 1) - q1) L
+    uaddl v20.8h, v21.8b, v13.8b //q2 + ((p0 + q0 + 1) >> 1) H
+    usubw v18.8h, v18.8h , v10.8b //(q2 + ((p0 + q0 + 1) >> 1) - 2*q1)L
+    usubw v20.8h, v20.8h , v11.8b //(q2 + ((p0 + q0 + 1) >> 1) - q1) H
+    orr v26.16b, v26.16b , v22.16b //(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)) &&(ui_bs)
+    usubw v20.8h, v20.8h , v11.8b //(q2 + ((p0 + q0 + 1) >> 1) - 2*q1) H
+    sqshrn v18.8b, v18.8h, #1 //((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) L
+    uabd v22.16b , v2.16b, v6.16b //Ap = ABS(p2 - p0)
+    sqshrn v19.8b, v20.8h, #1 //((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) H
+    mov v18.d[1], v19.d[0]
+    uabd v20.16b , v12.16b, v8.16b //Aq = ABS(q2 - q0)
+    cmhi v22.16b, v28.16b , v22.16b //Ap < Beta
+    smin v18.16b, v18.16b , v16.16b //min(delatq1,C0)
+    cmhi v20.16b, v28.16b , v20.16b //Aq <Beta
+    usubl v28.8h, v8.8b, v6.8b //(q0 - p0) L
+    smax v18.16b, v18.16b , v30.16b //max(deltaq1,-C0)
+    usubl v30.8h, v9.8b, v7.8b //(q0 - p0) H
+    shl v28.8h, v28.8h, #2 //(q0 - p0)<<2 L
+    sub v16.16b, v16.16b , v22.16b //C0 + (Ap < Beta)
+    shl v30.8h, v30.8h, #2 //(q0 - p0) << 2) H
+    uaddw v28.8h, v28.8h , v4.8b //((q0 - p0) << 2) + (p1 L
+    uaddw v30.8h, v30.8h , v5.8b //((q0 - p0) << 2) + (p1 H
+    usubw v28.8h, v28.8h , v10.8b //((q0 - p0) << 2) + (p1 - q1) L
+    usubw v30.8h, v30.8h , v11.8b //((q0 - p0) << 2) + (p1 - q1) H
+    bic v22.16b, v22.16b , v26.16b //final condition for p1
+    rshrn v28.8b, v28.8h, #3 //delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3); L
+    rshrn v29.8b, v30.8h, #3 //delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) H
+    mov v28.d[1], v29.d[0]
+    sub v16.16b, v16.16b , v20.16b //C0 + (Ap < Beta) + (Aq < Beta)
+    bic v20.16b, v20.16b , v26.16b //final condition for q1
+    abs v30.16b, v28.16b //abs(delta)
+    and v24.16b, v24.16b , v22.16b //delatp1
+    and v18.16b, v18.16b , v20.16b //delta q1
+    umin v30.16b, v30.16b , v16.16b //min((abs(delta),C)
+    add v4.16b, v4.16b , v24.16b //p1+deltap1
+    add v10.16b, v10.16b , v18.16b //q1+deltaq1
+    mov v5.d[0], v4.d[1]
+    mov v11.d[0], v10.d[1]
+    bic v30.16b, v30.16b , v26.16b //abs(delta) of pixels to be changed only
+    // VCGE.S8 Q14, Q14,#0 //sign(delta)
+    cmge v28.16b, v28.16b , #0
+    uqsub v22.16b, v6.16b , v30.16b //clip(p0-delta)
+
+    trn1 v21.8b, v0.8b, v2.8b
+    trn2 v2.8b, v0.8b, v2.8b //row1 &2
+    mov v0.8b, v21.8b
+    uqadd v6.16b, v6.16b , v30.16b //clip(p0+delta)
+
+    trn1 v21.8b, v1.8b, v3.8b
+    trn2 v3.8b, v1.8b, v3.8b //row9 &10
+    mov v1.8b, v21.8b
+    uqadd v24.16b, v8.16b , v30.16b //clip(q0+delta)
+    trn1 v21.8b, v12.8b, v14.8b
+    trn2 v14.8b, v12.8b, v14.8b //row7 & 8
+    mov v12.8b, v21.8b
+    uqsub v8.16b, v8.16b , v30.16b //clip(q0-delta)
+    trn1 v21.8b, v13.8b, v15.8b
+    trn2 v15.8b, v13.8b, v15.8b //row15 & 16
+    mov v13.8b, v21.8b
+    bif v6.16b, v22.16b , v28.16b //p0
+    bif v8.16b, v24.16b , v28.16b //q0
+    mov v7.d[0], v6.d[1]
+    mov v9.d[0], v8.d[1]
+    trn1 v21.8b, v4.8b, v6.8b
+    trn2 v6.8b, v4.8b, v6.8b //row3&row4
+    mov v4.8b, v21.8b
+    trn1 v21.8b, v8.8b, v10.8b
+    trn2 v10.8b, v8.8b, v10.8b //row5&6
+    mov v8.8b, v21.8b
+    trn1 v21.8b, v5.8b, v7.8b
+    trn2 v7.8b, v5.8b, v7.8b //row11 & 12
+    mov v5.8b, v21.8b
+    trn1 v21.8b, v9.8b, v11.8b
+    trn2 v11.8b, v9.8b, v11.8b //row13 &14
+    mov v9.8b, v21.8b
+    trn1 v21.4h, v2.4h, v6.4h
+    trn2 v6.4h, v2.4h, v6.4h //row2 & row4
+    mov v2.8b, v21.8b
+    trn1 v21.4h, v10.4h, v14.4h
+    trn2 v14.4h, v10.4h, v14.4h //row6 & row8
+    mov v10.8b, v21.8b
+    trn1 v21.4h, v3.4h, v7.4h
+    trn2 v7.4h, v3.4h, v7.4h //row10 & 12
+    mov v3.8b, v21.8b
+    trn1 v21.4h, v11.4h, v15.4h
+    trn2 v15.4h, v11.4h, v15.4h //row14 & row16
+    mov v11.8b, v21.8b
+    trn1 v21.2s, v6.2s, v14.2s
+    trn2 v14.2s, v6.2s, v14.2s //row4 & 8
+    mov v6.8b, v21.8b
+    trn1 v21.2s, v7.2s, v15.2s
+    trn2 v15.2s, v7.2s, v15.2s //row 12 & 16
+    mov v7.8b, v21.8b
+    //now Q3 ->p0 and Q7->q3
+    trn1 v21.4h, v0.4h, v4.4h
+    trn2 v4.4h, v0.4h, v4.4h //row1 & 3
+    mov v0.8b, v21.8b
+    trn1 v21.4h, v8.4h, v12.4h
+    trn2 v12.4h, v8.4h, v12.4h //row 5 & 7
+    mov v8.8b, v21.8b
+    trn1 v21.4h, v1.4h, v5.4h
+    trn2 v5.4h, v1.4h, v5.4h //row9 & row11
+    mov v1.8b, v21.8b
+    trn1 v21.4h, v9.4h, v13.4h
+    trn2 v13.4h, v9.4h, v13.4h //row13 & row15
+    mov v9.8b, v21.8b
+    sub x0, x0, x1, lsl#4 //restore pointer
+    trn1 v21.2s, v0.2s, v8.2s
+    trn2 v8.2s, v0.2s, v8.2s //row1 & row5
+    mov v0.8b, v21.8b
+    trn1 v21.2s, v1.2s, v9.2s
+    trn2 v9.2s, v1.2s, v9.2s //row9 & 13
+    mov v1.8b, v21.8b
+    trn1 v21.2s, v2.2s, v10.2s
+    trn2 v10.2s, v2.2s, v10.2s //row2 &6
+    mov v2.8b, v21.8b
+    trn1 v21.2s, v3.2s, v11.2s
+    trn2 v11.2s, v3.2s, v11.2s //row10&row14
+    mov v3.8b, v21.8b
+    trn1 v21.2s, v4.2s, v12.2s
+    trn2 v12.2s, v4.2s, v12.2s //row3 & 7
+    mov v4.8b, v21.8b
+    trn1 v21.2s, v5.2s, v13.2s
+    trn2 v13.2s, v5.2s, v13.2s //row11 & row15
+    mov v5.8b, v21.8b
+    st1 {v0.8b}, [x0], x1 //row1
+    st1 {v2.8b}, [x0], x1 //row2
+    st1 {v4.8b}, [x0], x1 //row3
+    st1 {v6.8b}, [x0], x1 //row4
+    st1 {v8.8b}, [x0], x1 //row5
+    st1 {v10.8b}, [x0], x1 //row6
+    st1 {v12.8b}, [x0], x1 //row7
+    st1 {v14.8b}, [x0], x1 //row8
+    st1 {v1.8b}, [x0], x1 //row9
+    st1 {v3.8b}, [x0], x1 //row10
+    st1 {v5.8b}, [x0], x1 //row11
+    st1 {v7.8b}, [x0], x1 //row12
+    st1 {v9.8b}, [x0], x1 //row13
+    st1 {v11.8b}, [x0], x1 //row14
+    st1 {v13.8b}, [x0], x1 //row15
+    st1 {v15.8b}, [x0], x1 //row16
+
+    // LDMFD sp!,{x12,pc}
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* Performs filtering of a luma block vertical edge when the
+//* boundary strength is set to 4
+//*
+//* @par Description:
+//* This operation is described in Sec. 8.7.2.4 under the title
+//* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
+//*
+//* @param[in] x0 - pu1_src
+//* Pointer to the src sample q0
+//*
+//* @param[in] x1 - src_strd
+//* Source stride
+//*
+//* @param[in] x2 - alpha
+//* Alpha Value for the boundary
+//*
+//* @param[in] x3 - beta
+//* Beta Value for the boundary
+//*
+//* @returns
+//* None
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+ .global ih264_deblk_luma_vert_bs4_av8
+
+ih264_deblk_luma_vert_bs4_av8:
+
+ // STMFD sp!,{x12,x14}
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+
+ sub x0, x0, #4 //pointer uc_edgePixel-4
+ mov x17, x0
+ //loading p3:p2:p1:p0:q0:q1:q2:q3 for every row
+ ld1 {v0.8b}, [x0], x1 //row1
+ ld1 {v2.8b}, [x0], x1 //row2
+ ld1 {v4.8b}, [x0], x1 //row3
+ ld1 {v6.8b}, [x0], x1 //row4
+ ld1 {v8.8b}, [x0], x1 //row5
+ ld1 {v10.8b}, [x0], x1 //row6
+ ld1 {v12.8b}, [x0], x1 //row7
+ ld1 {v14.8b}, [x0], x1 //row8
+ ld1 {v1.8b}, [x0], x1 //row9
+ ld1 {v3.8b}, [x0], x1 //row10
+ ld1 {v5.8b}, [x0], x1 //row11
+ ld1 {v7.8b}, [x0], x1 //row12
+ ld1 {v9.8b}, [x0], x1 //row13
+ ld1 {v11.8b}, [x0], x1 //row14
+ ld1 {v13.8b}, [x0], x1 //row15
+ ld1 {v15.8b}, [x0], x1 //row16
+
+ //taking two 8x8 transposes
+ //2X2 transposes
+ trn1 v21.8b, v0.8b, v2.8b
+ trn2 v2.8b, v0.8b, v2.8b //row1 &2
+ mov v0.8b, v21.8b
+ trn1 v21.8b, v4.8b, v6.8b
+ trn2 v6.8b, v4.8b, v6.8b //row3&row4
+ mov v4.8b, v21.8b
+ trn1 v21.8b, v8.8b, v10.8b
+ trn2 v10.8b, v8.8b, v10.8b //row5&6
+ mov v8.8b, v21.8b
+ trn1 v21.8b, v12.8b, v14.8b
+ trn2 v14.8b, v12.8b, v14.8b //row7 & 8
+ mov v12.8b, v21.8b
+ trn1 v21.8b, v1.8b, v3.8b
+ trn2 v3.8b, v1.8b, v3.8b //row9 &10
+ mov v1.8b , v21.8b
+ trn1 v21.8b, v5.8b, v7.8b
+ trn2 v7.8b, v5.8b, v7.8b //row11 & 12
+ mov v5.8b , v21.8b
+ trn1 v21.8b, v9.8b, v11.8b
+ trn2 v11.8b, v9.8b, v11.8b //row13 &14
+ mov v9.8b , v21.8b
+ trn1 v21.8b, v13.8b, v15.8b
+ trn2 v15.8b, v13.8b, v15.8b //row15 & 16
+ mov v13.8b , v21.8b
+ //4x4 transposes
+ trn1 v21.4h, v2.4h, v6.4h
+ trn2 v6.4h, v2.4h, v6.4h //row2 & row4
+ mov v2.8b, v21.8b
+ trn1 v21.4h, v10.4h, v14.4h
+ trn2 v14.4h, v10.4h, v14.4h //row6 & row8
+ mov v10.8b , v21.8b
+ trn1 v21.4h, v3.4h, v7.4h
+ trn2 v7.4h, v3.4h, v7.4h //row10 & 12
+ mov v3.8b, v21.8b
+ trn1 v21.4h, v11.4h, v15.4h
+ trn2 v15.4h, v11.4h, v15.4h //row14 & row16
+ mov v11.8b, v21.8b
+ trn1 v21.2s, v6.2s, v14.2s
+ trn2 v14.2s, v6.2s, v14.2s //row4 & 8
+ mov v6.8b, v21.8b
+ trn1 v21.2s, v7.2s, v15.2s
+ trn2 v15.2s, v7.2s, v15.2s //row 12 & 16
+ mov v7.8b, v21.8b
+ //now Q3 ->p0 and Q7->q3
+ trn1 v21.4h, v0.4h, v4.4h
+ trn2 v4.4h, v0.4h, v4.4h //row1 & 3
+ mov v0.8b , v21.8b
+ trn1 v21.4h, v8.4h, v12.4h
+ trn2 v12.4h, v8.4h, v12.4h //row 5 & 7
+ mov v8.8b, v21.8b
+ trn1 v21.4h, v1.4h, v5.4h
+ trn2 v5.4h, v1.4h, v5.4h //row9 & row11
+ mov v1.8b, v21.8b
+ trn1 v21.4h, v9.4h, v13.4h
+ trn2 v13.4h, v9.4h, v13.4h //row13 & row15
+ mov v9.8b , v21.8b
+ trn1 v21.2s, v0.2s, v8.2s
+ trn2 v8.2s, v0.2s, v8.2s //row1 & row5
+ mov v0.8b, v21.8b
+ trn1 v21.2s, v1.2s, v9.2s
+ trn2 v9.2s, v1.2s, v9.2s //row9 & 13
+ mov v1.8b, v21.8b
+ //now Q0->p3 & Q4->q0
+ //starting processing as p0 and q0 are now ready
+ //now Q1->p2 & Q5->q1
+ mov v31.d[0], v14.d[0]
+ mov v31.d[1], v15.d[0]
+ trn1 v21.2s, v4.2s, v12.2s
+ trn2 v12.2s, v4.2s, v12.2s //row3 & 7
+ mov v4.8b, v21.8b
+ movi v28.8h, #2
+ trn1 v21.2s, v5.2s, v13.2s
+ trn2 v13.2s, v5.2s, v13.2s //row11 & row15
+ mov v5.8b, v21.8b
+ uaddl v16.8h, v6.8b, v8.8b //p0+q0 L
+ trn1 v21.2s, v2.2s, v10.2s
+ trn2 v10.2s, v2.2s, v10.2s //row2 &6
+ mov v2.8b, v21.8b
+ uaddl v18.8h, v7.8b, v9.8b //p0+q0 H
+ trn1 v21.2s, v3.2s, v11.2s
+ trn2 v11.2s, v3.2s, v11.2s //row10&row14
+ mov v3.8b, v21.8b
+ uaddw v20.8h, v16.8h , v4.8b //p0+q0+p1 L
+ uaddw v22.8h, v18.8h , v5.8b //p0+q0+p1 H
+ uaddl v24.8h, v2.8b, v10.8b //p2+q1 L
+ uaddl v26.8h, v3.8b, v11.8b //p2+q1 H
+ mla v24.8h, v20.8h , v28.8h //p2 + X2(p1) + X2(p0) + X2(q0) + q1 L
+ mla v26.8h, v22.8h , v28.8h //p2 + X2(p1) + X2(p0) + X2(q0) + q1 H
+ movi v28.16b, #2
+ uaddw v16.8h, v20.8h , v2.8b //p0+q0+p1+p2 L
+ uaddw v18.8h, v22.8h , v3.8b //p0+q0+p1+p2 H
+ dup v30.16b, w2 //duplicate alpha
+ rshrn v20.8b, v16.8h, #2 //(p2 + p1 + p0 + q0 + 2) >> 2)L p1'
+ rshrn v21.8b, v18.8h, #2 //(p2 + p1 + p0 + q0 + 2) >> 2)H p1'
+ mov v20.d[1] , v21.d[0]
+ mov v0.d[1] , v1.d[0]
+ mov v2.d[1] , v3.d[0]
+ mov v4.d[1] , v5.d[0]
+ mov v6.d[1] , v7.d[0]
+ mov v8.d[1] , v9.d[0]
+ mov v10.d[1] , v11.d[0]
+ mov v12.d[1] , v13.d[0]
+ mov v14.d[1] , v15.d[0]
+ uabd v22.16b , v6.16b, v8.16b
+ usra v28.16b, v30.16b, #2 //alpha >>2 +2
+ uabd v30.16b , v2.16b, v6.16b
+ rshrn v24.8b, v24.8h, #3 //((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) L p0'
+ rshrn v25.8b, v26.8h, #3 //((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) H p0'
+ mov v24.d[1] , v25.d[0]
+ dup v26.16b, w3 //beta
+ cmhi v28.16b, v28.16b , v22.16b //ABS(p0 - q0) <((Alpha >>2) + 2)
+ uaddl v22.8h, v6.8b, v10.8b //p0+q1 L
+ cmhi v14.16b, v26.16b , v30.16b //beta>Ap
+ uaddl v30.8h, v7.8b, v11.8b //p0+q1 H
+ uaddw v22.8h, v22.8h , v4.8b //p0+q1+p1 L
+ uaddw v30.8h, v30.8h , v5.8b //p0+q1+p1 H
+ uaddw v22.8h, v22.8h , v4.8b //p0+q1+2*p1 L
+ uaddw v30.8h, v30.8h , v5.8b //p0+q1+2*p1 H
+ and v14.16b, v14.16b , v28.16b //(Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)
+ rshrn v22.8b, v22.8h, #2 //((X2(p1) + p0 + q1 + 2) >> 2) L p0"
+ rshrn v23.8b, v30.8h, #2 //((X2(p1) + p0 + q1 + 2) >> 2) H p0"
+ mov v22.d[1] , v23.d[0]
+ uaddl v30.8h, v2.8b, v0.8b //p2+p3 L
+ bif v24.16b, v22.16b , v14.16b //p0' or p0 "
+ uaddl v22.8h, v3.8b, v1.8b //p2+p3 H
+ add v30.8h, v30.8h , v30.8h //2*(p2+p3) L
+ add v22.8h, v22.8h , v22.8h //2*(p2+p3)H
+ add v16.8h, v16.8h , v30.8h //(X2(p3) + X3(p2) + p1 + p0 + q0) L
+ add v18.8h, v18.8h , v22.8h //(X2(p3) + X3(p2) + p1 + p0 + q0) H
+ uabd v30.16b , v12.16b, v8.16b
+ uabd v22.16b , v10.16b, v8.16b
+ rshrn v16.8b, v16.8h, #3 //((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); L p2'
+ rshrn v17.8b, v18.8h, #3 //((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); H p2'
+ mov v16.d[1] , v17.d[0]
+ uabd v18.16b , v4.16b, v6.16b
+ cmhi v30.16b, v26.16b , v30.16b //Aq < Beta
+ cmhs v22.16b, v22.16b, v26.16b
+ cmhs v18.16b, v18.16b, v26.16b
+ dup v26.16b, w2 //duplicate alpha
+ and v30.16b, v30.16b , v28.16b //(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
+ uabd v28.16b , v6.16b, v8.16b
+ orr v22.16b, v22.16b , v18.16b //ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta
+ uaddl v18.8h, v6.8b, v8.8b //p0+q0 L
+ cmhs v28.16b, v28.16b, v26.16b
+ uaddl v26.8h, v7.8b, v9.8b //p0+q0 H
+ uaddw v18.8h, v18.8h , v10.8b //p0+q0+q1 L
+ orr v22.16b, v22.16b , v28.16b //ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta||ABS(p0 - q0) >= Alpha
+ uaddw v26.8h, v26.8h , v11.8b //p0+q0+q1 H
+ bic v14.16b, v14.16b , v22.16b //final condn for p's
+ movi v28.16b, #2
+ bif v6.16b, v24.16b , v22.16b //final p0
+ bit v2.16b, v16.16b , v14.16b //final p2
+ bif v20.16b, v4.16b , v14.16b //final p1
+ mov v7.d[0] , v6.d[1]
+ mov v3.d[0] , v2.d[1]
+ mov v21.d[0] , v20.d[1]
+ uaddl v24.8h, v8.8b, v4.8b //q0+p1 L
+ umlal v24.8h, v10.8b, v28.8b //X2(q1) + q0 + p1 L
+ uaddl v16.8h, v9.8b, v5.8b //q0+p1 H
+ umlal v16.8h, v11.8b, v28.8b //X2(q1) + q0 + p1 H
+ movi v28.8h, #2
+ uaddl v14.8h, v4.8b, v12.8b //p1+q2 L
+ mla v14.8h, v18.8h , v28.8h //p1 + X2(p0) + X2(q0) + X2(q1) + q2L
+ uaddl v4.8h, v5.8b, v13.8b //p1+q2H
+ mla v4.8h, v26.8h , v28.8h //p1 + X2(p0) + X2(q0) + X2(q1) + q2H
+ rshrn v24.8b, v24.8h, #2 //(X2(q1) + q0 + p1 + 2) >> 2; L q0'
+ rshrn v25.8b, v16.8h, #2 //(X2(q1) + q0 + p1 + 2) >> 2; H q0'
+ mov v24.d[1] , v25.d[0]
+ uaddw v18.8h, v18.8h , v12.8b //p0 + q0 + q1 + q2 L
+ uaddw v26.8h, v26.8h , v13.8b //p0 + q0 + q1 + q2 H
+ rshrn v16.8b, v14.8h, #3 //(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 L qo"
+ mov v14.16b, v31.16b
+ rshrn v17.8b, v4.8h, #3 //(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 H qo"
+ mov v16.d[1] , v17.d[0]
+ rshrn v4.8b, v18.8h, #2 //p0 + q0 + q1 + q2 + 2)>>2 L q1'
+ rshrn v5.8b, v26.8h, #2 //p0 + q0 + q1 + q2 + 2)>>2 H q1'
+ mov v4.d[1] , v5.d[0]
+ bit v24.16b, v16.16b , v30.16b //q0' or q0"
+ bic v30.16b, v30.16b , v22.16b //final condn for q's
+ trn1 v31.8b, v0.8b, v2.8b
+ trn2 v2.8b, v0.8b, v2.8b //row1 &2
+ mov v0.8b, v31.8b
+ bit v10.16b, v4.16b , v30.16b
+ mov v11.d[0] , v10.d[1]
+ mov v25.d[0] , v24.d[1]
+ mov v31.d[0] , v30.d[1]
+ trn1 v31.8b, v1.8b, v3.8b
+ trn2 v3.8b, v1.8b, v3.8b //row9 &10
+ mov v1.8b, v31.8b
+ uaddl v16.8h, v12.8b, v14.8b //q2+q3 L
+ trn1 v31.8b, v20.8b, v6.8b
+ trn2 v6.8b, v20.8b, v6.8b //row3&row4
+ mov v20.8b , v31.8b
+ uaddl v4.8h, v13.8b, v15.8b //q2+q3 H
+ trn1 v31.8b, v21.8b, v7.8b
+ trn2 v7.8b, v21.8b, v7.8b //row11 & 12
+ mov v21.8b , v31.8b
+ mla v18.8h, v16.8h , v28.8h //X2(q3) + X3(q2) + q1 + q0 + p0 L
+ trn1 v31.4h, v2.4h, v6.4h
+ trn2 v6.4h, v2.4h, v6.4h //row2 & row4
+ mov v2.8b, v31.8b
+ mla v26.8h, v4.8h , v28.8h //X2(q3) + X3(q2) + q1 + q0 + p0 H
+ trn1 v31.4h, v3.4h, v7.4h
+ trn2 v7.4h, v3.4h, v7.4h //row10 & 12
+ mov v3.8b , v31.8b
+ bif v8.16b, v24.16b , v22.16b //final q0
+ mov v9.d[0] , v8.d[1]
+ trn1 v31.4h, v0.4h, v20.4h
+ trn2 v20.4h, v0.4h, v20.4h //row1 & 3
+ mov v0.8b , v31.8b
+ rshrn v18.8b, v18.8h, #3 //(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; L
+ trn1 v31.4h, v1.4h, v21.4h
+ trn2 v21.4h, v1.4h, v21.4h //row9 & row11
+ mov v1.8b, v31.8b
+ rshrn v19.8b, v26.8h, #3 //(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; H
+ mov v18.d[1] , v19.d[0]
+ trn1 v31.8b, v8.8b, v10.8b
+ trn2 v10.8b, v8.8b, v10.8b //row5&6
+ mov v8.8b, v31.8b
+ bit v12.16b, v18.16b , v30.16b //final q2
+ mov v13.d[0] , v12.d[1]
+ trn1 v31.8b, v9.8b, v11.8b
+ trn2 v11.8b, v9.8b, v11.8b //row13 &14
+ mov v9.8b, v31.8b
+ trn1 v31.8b, v12.8b, v14.8b
+ trn2 v14.8b, v12.8b, v14.8b //row7 & 8
+ mov v12.8b, v31.8b
+ trn1 v31.8b, v13.8b, v15.8b
+ trn2 v15.8b, v13.8b, v15.8b //row15 & 16
+ mov v13.8b , v31.8b
+ trn1 v31.4h, v10.4h, v14.4h
+ trn2 v14.4h, v10.4h, v14.4h //row6 & row8
+ mov v10.8b, v31.8b
+ trn1 v31.4h, v11.4h, v15.4h
+ trn2 v15.4h, v11.4h, v15.4h //row14 & row16
+ mov v11.8b, v31.8b
+ //now Q3 ->p0 and Q7->q3
+ trn1 v31.4h, v8.4h, v12.4h
+ trn2 v12.4h, v8.4h, v12.4h //row 5 & 7
+ mov v8.8b, v31.8b
+ trn1 v31.4h, v9.4h, v13.4h
+ trn2 v13.4h, v9.4h, v13.4h //row13 & row15
+ mov v9.8b, v31.8b
+ sub x0, x0, x1, lsl#4 //restore pointer
+ trn1 v31.2s, v6.2s, v14.2s
+ trn2 v14.2s, v6.2s, v14.2s //row4 & 8
+ mov v6.8b , v31.8b
+ trn1 v31.2s, v7.2s, v15.2s
+ trn2 v15.2s, v7.2s, v15.2s //row 12 & 16
+ mov v7.8b, v31.8b
+ trn1 v31.2s, v0.2s, v8.2s
+ trn2 v8.2s, v0.2s, v8.2s //row1 & row5
+ mov v0.8b , v31.8b
+ trn1 v31.2s, v1.2s, v9.2s
+ trn2 v9.2s, v1.2s, v9.2s //row9 & 13
+ mov v1.8b , v31.8b
+ trn1 v31.2s, v2.2s, v10.2s
+ trn2 v10.2s, v2.2s, v10.2s //row2 &6
+ mov v2.8b , v31.8b
+ trn1 v31.2s, v3.2s, v11.2s
+ trn2 v11.2s, v3.2s, v11.2s //row10&row14
+ mov v3.8b , v31.8b
+ trn1 v31.2s, v20.2s, v12.2s
+ trn2 v12.2s, v20.2s, v12.2s //row3 & 7
+ mov v20.8b , v31.8b
+ trn1 v31.2s, v21.2s, v13.2s
+ trn2 v13.2s, v21.2s, v13.2s //row11 & row15
+ mov v21.8b, v31.8b
+ st1 {v0.8b}, [x0], x1 //row1
+ st1 {v2.8b}, [x0], x1 //row2
+ st1 {v20.8b}, [x0], x1 //row3
+ st1 {v6.8b}, [x0], x1 //row4
+ st1 {v8.8b}, [x0], x1 //row5
+ st1 {v10.8b}, [x0], x1 //row6
+ st1 {v12.8b}, [x0], x1 //row7
+ st1 {v14.8b}, [x0], x1 //row8
+ st1 {v1.8b}, [x0], x1 //row9
+ st1 {v3.8b}, [x0], x1 //row10
+ st1 {v21.8b}, [x0], x1 //row11
+ st1 {v7.8b}, [x0], x1 //row12
+ st1 {v9.8b}, [x0], x1 //row13
+ st1 {v11.8b}, [x0], x1 //row14
+ st1 {v13.8b}, [x0], x1 //row15
+ st1 {v15.8b}, [x0], x1 //row16
+
+ // LDMFD sp!,{x12,pc}
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
diff --git a/common/armv8/ih264_default_weighted_pred_av8.s b/common/armv8/ih264_default_weighted_pred_av8.s
new file mode 100755
index 0000000..aefb902
--- /dev/null
+++ b/common/armv8/ih264_default_weighted_pred_av8.s
@@ -0,0 +1,353 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+//******************************************************************************
+//* @file
+//* ih264_default_weighted_pred_av8.s
+//*
+//* @brief
+//* Contains function definitions for default weighted prediction.
+//* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT
+//*
+//* @author
+//* Kaushik Senthoor R
+//*
+//* @par List of Functions:
+//*
+//* - ih264_default_weighted_pred_luma_av8()
+//* - ih264_default_weighted_pred_chroma_av8()
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+//*******************************************************************************
+//* @function
+//* ih264_default_weighted_pred_luma_av8()
+//*
+//* @brief
+//* This routine performs the default weighted prediction as described in sec
+//* 8.4.2.3.1 titled "Default weighted sample prediction process" for luma.
+//*
+//* @par Description:
+//* This function gets two ht x wd blocks, calculates their rounded-average and
+//* stores it in the destination block.
+//*
+//* @param[in] puc_src1:
+//* UWORD8 Pointer to the buffer containing the first input block.
+//*
+//* @param[in] puc_src2:
+//* UWORD8 Pointer to the buffer containing the second input block.
+//*
+//* @param[out] puc_dst
+//* UWORD8 pointer to the destination where the output block is stored.
+//*
+//* @param[in] src_strd1
+//* Stride of the first input buffer
+//*
+//* @param[in] src_strd2
+//* Stride of the second input buffer
+//*
+//* @param[in] dst_strd
+//* Stride of the destination buffer
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @returns
+//* None
+//*
+//* @remarks
+//* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
+//*
+//*******************************************************************************
+//*/
+//void ih264_default_weighted_pred_luma_av8(UWORD8 *puc_src1,
+// UWORD8 *puc_src2,
+// UWORD8 *puc_dst,
+// WORD32 src_strd1,
+// WORD32 src_strd2,
+// WORD32 dst_strd,
+// UWORD8 ht,
+// UWORD8 wd)
+//
+//**************Variables Vs Registers*****************************************
+// x0 => puc_src1
+// x1 => puc_src2
+// x2 => puc_dst
+// x3 => src_strd1
+// x4 => src_strd2
+// x5 => dst_strd
+// w6 => ht
+// w7 => wd
+// (on AArch64 all eight arguments arrive in registers x0-x7, not on the stack)
+//
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+
+
+ .global ih264_default_weighted_pred_luma_av8
+
+ih264_default_weighted_pred_luma_av8:
+
+ //dst = (src1 + src2 + 1) >> 1 for an ht x wd luma block (sec 8.4.2.3.1)
+ //x0 = puc_src1, x1 = puc_src2, x2 = puc_dst, x3 = src_strd1,
+ //x4 = src_strd2, x5 = dst_strd, w6 = ht, w7 = wd
+ push_v_regs
+ stp x19, x20, [sp, #-16]! //save callee-saved x19, x20
+ cmp w7, #16
+ beq loop_16 //branch if wd is 16
+ cmp w7, #8
+ beq loop_8 //branch if wd is 8
+
+loop_4: //each iteration processes four rows of width 4
+
+ ld1 {v0.s}[0], [x0], x3 //load row 1 in source 1
+ ld1 {v0.s}[1], [x0], x3 //load row 2 in source 1
+ ld1 {v2.s}[0], [x1], x4 //load row 1 in source 2
+ ld1 {v2.s}[1], [x1], x4 //load row 2 in source 2
+ ld1 {v1.s}[0], [x0], x3 //load row 3 in source 1
+ ld1 {v1.s}[1], [x0], x3 //load row 4 in source 1
+ urhadd v0.8b, v0.8b , v2.8b //rows 1-2: rounded average
+ ld1 {v3.s}[0], [x1], x4 //load row 3 in source 2
+ ld1 {v3.s}[1], [x1], x4 //load row 4 in source 2
+ subs w6, w6, #4 //decrement ht by 4
+ st1 {v0.s}[0], [x2], x5 //store row 1 to destination
+ st1 {v0.s}[1], [x2], x5 //store row 2 to destination
+ urhadd v1.8b, v1.8b , v3.8b //rows 3-4: rounded average
+ st1 {v1.s}[0], [x2], x5 //store row 3 to destination
+ st1 {v1.s}[1], [x2], x5 //store row 4 to destination
+ bgt loop_4 //if greater than 0 repeat the loop again
+ b end_loops
+
+loop_8: //each iteration processes four rows of width 8
+
+ ld1 {v0.8b}, [x0], x3 //load row 1 in source 1
+ ld1 {v4.8b}, [x1], x4 //load row 1 in source 2
+ ld1 {v1.8b}, [x0], x3 //load row 2 in source 1
+ ld1 {v5.8b}, [x1], x4 //load row 2 in source 2
+ ld1 {v2.8b}, [x0], x3 //load row 3 in source 1
+ urhadd v0.16b, v0.16b , v4.16b //row 1 average (only low 8 bytes are stored)
+ urhadd v1.16b, v1.16b , v5.16b //row 2 average (only low 8 bytes are stored)
+ ld1 {v6.8b}, [x1], x4 //load row 3 in source 2
+ ld1 {v3.8b}, [x0], x3 //load row 4 in source 1
+ urhadd v2.8b, v2.8b , v6.8b //row 3 average
+ ld1 {v7.8b}, [x1], x4 //load row 4 in source 2
+ subs w6, w6, #4 //decrement ht by 4
+ st1 {v0.8b}, [x2], x5 //store row 1 to destination
+ urhadd v3.8b, v3.8b , v7.8b //row 4 average
+ st1 {v1.8b}, [x2], x5 //store row 2 to destination
+ st1 {v2.8b}, [x2], x5 //store row 3 to destination
+ st1 {v3.8b}, [x2], x5 //store row 4 to destination
+ bgt loop_8 //if greater than 0 repeat the loop again
+ b end_loops
+
+loop_16: //each iteration processes eight rows of width 16
+
+ ld1 {v0.8b, v1.8b}, [x0], x3 //load row 1 in source 1
+ ld1 {v16.8b, v17.8b}, [x1], x4 //load row 1 in source 2
+ ld1 {v2.8b, v3.8b}, [x0], x3 //load row 2 in source 1
+ ld1 {v18.8b, v19.8b}, [x1], x4 //load row 2 in source 2
+ urhadd v0.16b, v0.16b , v16.16b //row 1 average
+ urhadd v1.16b, v1.16b , v17.16b
+ ld1 {v4.8b, v5.8b}, [x0], x3 //load row 3 in source 1
+ ld1 {v20.8b, v21.8b}, [x1], x4 //load row 3 in source 2
+ urhadd v2.16b, v2.16b , v18.16b //row 2 average
+ urhadd v3.16b, v3.16b , v19.16b
+ ld1 {v6.8b, v7.8b}, [x0], x3 //load row 4 in source 1
+ ld1 {v22.8b, v23.8b}, [x1], x4 //load row 4 in source 2
+ urhadd v4.16b, v4.16b , v20.16b //row 3 average
+ urhadd v5.16b, v5.16b , v21.16b
+ ld1 {v8.8b, v9.8b}, [x0], x3 //load row 5 in source 1
+ ld1 {v24.8b, v25.8b}, [x1], x4 //load row 5 in source 2
+ urhadd v6.16b, v6.16b , v22.16b //row 4 average
+ urhadd v7.16b, v7.16b , v23.16b
+ ld1 {v10.8b, v11.8b}, [x0], x3 //load row 6 in source 1
+ ld1 {v26.8b, v27.8b}, [x1], x4 //load row 6 in source 2
+ urhadd v8.16b, v8.16b , v24.16b //row 5 average
+ urhadd v9.16b, v9.16b , v25.16b
+ ld1 {v12.8b, v13.8b}, [x0], x3 //load row 7 in source 1
+ ld1 {v28.8b, v29.8b}, [x1], x4 //load row 7 in source 2
+ urhadd v10.16b, v10.16b , v26.16b //row 6 average
+ urhadd v11.16b, v11.16b , v27.16b
+ ld1 {v14.8b, v15.8b}, [x0], x3 //load row 8 in source 1
+ ld1 {v30.8b, v31.8b}, [x1], x4 //load row 8 in source 2
+ urhadd v12.16b, v12.16b , v28.16b //row 7 average
+ urhadd v13.16b, v13.16b , v29.16b
+ st1 {v0.8b, v1.8b}, [x2], x5 //store row 1 to destination
+ st1 {v2.8b, v3.8b}, [x2], x5 //store row 2 to destination
+ urhadd v14.16b, v14.16b , v30.16b //row 8 average
+ urhadd v15.16b, v15.16b , v31.16b
+ st1 {v4.8b, v5.8b}, [x2], x5 //store row 3 to destination
+ st1 {v6.8b, v7.8b}, [x2], x5 //store row 4 to destination
+ subs w6, w6, #8 //decrement ht by 8
+ st1 {v8.8b, v9.8b}, [x2], x5 //store row 5 to destination
+ st1 {v10.8b, v11.8b}, [x2], x5 //store row 6 to destination
+ st1 {v12.8b, v13.8b}, [x2], x5 //store row 7 to destination
+ st1 {v14.8b, v15.8b}, [x2], x5 //store row 8 to destination
+ bgt loop_16 //if greater than 0 repeat the loop again
+
+end_loops:
+
+ // LDMFD sp!,{x4-x7,x15} //Reload the registers from sp
+ ldp x19, x20, [sp], #16 //restore callee-saved registers
+ pop_v_regs
+ ret
+
+
+//*******************************************************************************
+//* @function
+//* ih264_default_weighted_pred_chroma_av8()
+//*
+//* @brief
+//* This routine performs the default weighted prediction as described in sec
+//* 8.4.2.3.1 titled "Default weighted sample prediction process" for chroma.
+//*
+//* @par Description:
+//* This function gets two ht x wd blocks, calculates their rounded-average and
+//* stores it in the destination block for U and V.
+//*
+//* @param[in] puc_src1:
+//* UWORD8 Pointer to the buffer containing the first input block.
+//*
+//* @param[in] puc_src2:
+//* UWORD8 Pointer to the buffer containing the second input block.
+//*
+//* @param[out] puc_dst
+//* UWORD8 pointer to the destination where the output block is stored.
+//*
+//* @param[in] src_strd1
+//* Stride of the first input buffer
+//*
+//* @param[in] src_strd2
+//* Stride of the second input buffer
+//*
+//* @param[in] dst_strd
+//* Stride of the destination buffer
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @returns
+//* None
+//*
+//* @remarks
+//* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
+//*
+//*******************************************************************************
+//*/
+//void ih264_default_weighted_pred_chroma_av8(UWORD8 *puc_src1,
+// UWORD8 *puc_src2,
+// UWORD8 *puc_dst,
+// WORD32 src_strd1,
+// WORD32 src_strd2,
+// WORD32 dst_strd,
+// UWORD8 ht,
+// UWORD8 wd)
+//
+//**************Variables Vs Registers*****************************************
+// x0 => puc_src1
+// x1 => puc_src2
+// x2 => puc_dst
+// x3 => src_strd1
+// x4 => src_strd2
+// x5 => dst_strd
+// w6 => ht
+// w7 => wd
+// (on AArch64 all eight arguments arrive in registers x0-x7, not on the stack)
+//
+
+
+
+
+ .global ih264_default_weighted_pred_chroma_av8
+
+ih264_default_weighted_pred_chroma_av8:
+
+ //dst = (src1 + src2 + 1) >> 1 for an ht x wd chroma block of interleaved
+ //U/V samples (each row spans 2*wd bytes)
+ //x0 = puc_src1, x1 = puc_src2, x2 = puc_dst, x3 = src_strd1,
+ //x4 = src_strd2, x5 = dst_strd, w6 = ht, w7 = wd
+ push_v_regs
+ stp x19, x20, [sp, #-16]! //save callee-saved x19, x20
+ cmp w7, #8
+ beq loop_8_uv //branch if wd is 8
+ cmp w7, #4
+ beq loop_4_uv //branch if wd is 4
+
+loop_2_uv: //each iteration processes two rows of width 2 (4 bytes UV)
+
+ ld1 {v0.s}[0], [x0], x3 //load row 1 in source 1
+ ld1 {v0.s}[1], [x0], x3 //load row 2 in source 1
+ ld1 {v1.s}[0], [x1], x4 //load row 1 in source 2
+ ld1 {v1.s}[1], [x1], x4 //load row 2 in source 2
+ urhadd v0.8b, v0.8b , v1.8b //rows 1-2: rounded average
+ subs w6, w6, #2 //decrement ht by 2
+ st1 {v0.s}[0], [x2], x5 //store row 1 to destination
+ st1 {v0.s}[1], [x2], x5 //store row 2 to destination
+ bgt loop_2_uv //if greater than 0 repeat the loop again
+ b end_loops_uv
+
+loop_4_uv: //each iteration processes two rows of width 4 (8 bytes UV)
+
+ ld1 {v0.8b}, [x0], x3 //load row 1 in source 1
+ ld1 {v2.8b}, [x1], x4 //load row 1 in source 2
+ ld1 {v1.8b}, [x0], x3 //load row 2 in source 1
+ urhadd v0.8b, v0.8b , v2.8b //row 1 average
+ ld1 {v3.8b}, [x1], x4 //load row 2 in source 2
+ urhadd v1.8b, v1.8b , v3.8b //row 2 average
+ st1 {v0.8b}, [x2], x5 //store row 1 to destination
+ subs w6, w6, #2 //decrement ht by 2
+ st1 {v1.8b}, [x2], x5 //store row 2 to destination
+ bgt loop_4_uv //if greater than 0 repeat the loop again
+ b end_loops_uv
+
+loop_8_uv: //each iteration processes four rows of width 8 (16 bytes UV)
+
+ ld1 {v0.8b, v1.8b}, [x0], x3 //load row 1 in source 1
+ ld1 {v8.8b, v9.8b}, [x1], x4 //load row 1 in source 2
+ ld1 {v2.8b, v3.8b}, [x0], x3 //load row 2 in source 1
+ urhadd v0.16b, v0.16b , v8.16b //row 1 average (only low 8 bytes of each reg stored)
+ urhadd v1.16b, v1.16b , v9.16b
+ ld1 {v10.8b, v11.8b}, [x1], x4 //load row 2 in source 2
+ ld1 {v4.8b, v5.8b}, [x0], x3 //load row 3 in source 1
+ urhadd v2.16b, v2.16b , v10.16b //row 2 average
+ urhadd v3.16b, v3.16b , v11.16b
+ ld1 {v12.8b, v13.8b}, [x1], x4 //load row 3 in source 2
+ ld1 {v6.8b, v7.8b}, [x0], x3 //load row 4 in source 1
+ urhadd v4.16b, v4.16b , v12.16b //row 3 average
+ urhadd v5.16b, v5.16b , v13.16b
+ ld1 {v14.8b, v15.8b}, [x1], x4 //load row 4 in source 2
+ st1 {v0.8b, v1.8b}, [x2], x5 //store row 1 to destination
+ urhadd v6.16b, v6.16b , v14.16b //row 4 average
+ urhadd v7.16b, v7.16b , v15.16b
+ st1 {v2.8b, v3.8b}, [x2], x5 //store row 2 to destination
+ subs w6, w6, #4 //decrement ht by 4
+ st1 {v4.8b, v5.8b}, [x2], x5 //store row 3 to destination
+ st1 {v6.8b, v7.8b}, [x2], x5 //store row 4 to destination
+ bgt loop_8_uv //if greater than 0 repeat the loop again
+
+end_loops_uv:
+ ldp x19, x20, [sp], #16 //restore callee-saved registers
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/armv8/ih264_ihadamard_scaling_av8.s b/common/armv8/ih264_ihadamard_scaling_av8.s
new file mode 100755
index 0000000..712c9ae
--- /dev/null
+++ b/common/armv8/ih264_ihadamard_scaling_av8.s
@@ -0,0 +1,250 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+// *******************************************************************************
+// * @file
+// * ih264_ihadamard_scaling_av8.s
+// *
+// * @brief
+// * Contains function definitions for inverse hadamard transform on 4x4 DC outputs
+// * of 16x16 intra-prediction
+// *
+// * @author
+// * Mohit
+// *
+// * @par List of Functions:
+// * - ih264_ihadamard_scaling_4x4_av8()
+// *
+// * @remarks
+// * None
+// *
+.include "ih264_neon_macros.s"
+
+// *******************************************************************************
+// */
+// * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
+// * of a 16x16 intra prediction macroblock, and then performs scaling.
+// * prediction buffer
+// *
+// * @par Description:
+// * The DC coefficients pass through a 2-stage inverse hadamard transform.
+// * This inverse transformed content is scaled to based on Qp value.
+// *
+// * @param[in] pi2_src
+// * input 4x4 block of DC coefficients
+// *
+// * @param[out] pi2_out
+// * output 4x4 block
+// *
+// * @param[in] pu2_iscal_mat
+// * pointer to scaling list
+// *
+// * @param[in] pu2_weigh_mat
+// * pointer to weight matrix
+// *
+// * @param[in] u4_qp_div_6
+// * Floor (qp/6)
+// *
+// * @param[in] pi4_tmp
+// * temporary buffer of size 1*16
+// *
+// * @returns none
+// *
+// * @remarks none
+// *
+// *******************************************************************************
+// */
+// *
+// *******************************************************************************
+// */
+// void ih264_ihadamard_scaling_4x4(word16* pi2_src,
+// word16* pi2_out,
+// const uword16 *pu2_iscal_mat,
+// const uword16 *pu2_weigh_mat,
+// uword32 u4_qp_div_6,
+// word32* pi4_tmp)
+//**************variables vs registers*****************************************
+//x0 => *pi2_src
+//x1 => *pi2_out
+//x2 => *pu2_iscal_mat
+//x3 => *pu2_weigh_mat
+//x4=> u4_qp_div_6
+
+.text
+.p2align 2
+
+ .global ih264_ihadamard_scaling_4x4_av8
+ih264_ihadamard_scaling_4x4_av8:
+
+//two-stage 4x4 inverse hadamard on the 16 luma DC coefficients, then scaling:
+//out[i] = sat16((((x[i] * pu2_iscal_mat[0] * pu2_weigh_mat[0]) << (qp/6)) + 32) >> 6)
+//the sshl applies the (non-negative) qp/6 left shift and sqrshrn #6 supplies the
+//rounded right shift by 6, so a single explicit shift stage suffices after scaling
+ push_v_regs
+
+//=======================inverse hadamard transform================================
+
+ ld4 {v0.4h-v3.4h}, [x0] //load x4,x5,x6,x7 (de-interleaved columns)
+
+ dup v14.4s, w4 // populate the u4_qp_div_6 shift amount
+ ld1 {v15.h}[0], [x3] // pu2_weigh_mat[0]
+ ld1 {v16.h}[0], [x2] //pu2_iscal_mat[0]
+
+ saddl v4.4s, v0.4h, v3.4h //x0 = x4 + x7
+ saddl v5.4s, v1.4h, v2.4h //x1 = x5 + x6
+ ssubl v6.4s, v1.4h, v2.4h //x2 = x5 - x6
+ ssubl v7.4s, v0.4h, v3.4h //x3 = x4 - x7
+
+ add v0.4s, v4.4s, v5.4s //pi4_tmp_ptr[0] = x0 + x1
+ add v1.4s, v7.4s, v6.4s //pi4_tmp_ptr[1] = x3 + x2
+ sub v2.4s, v4.4s, v5.4s //pi4_tmp_ptr[2] = x0 - x1
+ sub v3.4s, v7.4s, v6.4s //pi4_tmp_ptr[3] = x3 - x2
+
+ umull v15.4s, v15.4h, v16.4h
+ dup v15.4s, v15.s[0] //broadcast pu2_weigh_mat[0]*pu2_iscal_mat[0]
+
+ //transpose (switch from row pass to column pass)
+ trn1 v4.4s, v0.4s, v1.4s
+ trn2 v5.4s, v0.4s, v1.4s
+ trn1 v6.4s, v2.4s, v3.4s
+ trn2 v7.4s, v2.4s, v3.4s
+
+ trn1 v0.2d, v4.2d, v6.2d
+ trn2 v2.2d, v4.2d, v6.2d
+ trn1 v1.2d, v5.2d, v7.2d
+ trn2 v3.2d, v5.2d, v7.2d
+ //end transpose
+
+ add v4.4s, v0.4s, v3.4s //x0 = x4+x7
+ add v5.4s, v1.4s, v2.4s //x1 = x5+x6
+ sub v6.4s, v1.4s, v2.4s //x2 = x5-x6
+ sub v7.4s, v0.4s, v3.4s //x3 = x4-x7
+
+ add v0.4s, v4.4s, v5.4s //pi4_tmp_ptr[0] = x0 + x1
+ add v1.4s, v7.4s, v6.4s //pi4_tmp_ptr[1] = x3 + x2
+ sub v2.4s, v4.4s, v5.4s //pi4_tmp_ptr[2] = x0 - x1
+ sub v3.4s, v7.4s, v6.4s //pi4_tmp_ptr[3] = x3 - x2
+
+ mul v0.4s, v0.4s, v15.4s // q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
+ mul v1.4s, v1.4s, v15.4s // q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
+ mul v2.4s, v2.4s, v15.4s // q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
+ mul v3.4s, v3.4s, v15.4s // q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
+
+ sshl v0.4s, v0.4s, v14.4s // q0 = q[i] = (p[i] << (qp/6)) where i = 0..3
+ sshl v1.4s, v1.4s, v14.4s // q1 = q[i] = (p[i] << (qp/6)) where i = 4..7
+ sshl v2.4s, v2.4s, v14.4s // q2 = q[i] = (p[i] << (qp/6)) where i = 8..11
+ sshl v3.4s, v3.4s, v14.4s // q3 = q[i] = (p[i] << (qp/6)) where i = 12..15
+
+ sqrshrn v0.4h, v0.4s, #6 // d0 = c[i] = sat16((q[i] + 32) >> 6) where i = 0..3
+ sqrshrn v1.4h, v1.4s, #6 // d1 = c[i] = sat16((q[i] + 32) >> 6) where i = 4..7
+ sqrshrn v2.4h, v2.4s, #6 // d2 = c[i] = sat16((q[i] + 32) >> 6) where i = 8..11
+ sqrshrn v3.4h, v3.4s, #6 // d3 = c[i] = sat16((q[i] + 32) >> 6) where i = 12..15
+
+ st1 {v0.4h-v3.4h}, [x1] //store the result
+
+ pop_v_regs
+ ret
+
+
+// *******************************************************************************
+// */
+// * @brief This function performs a 2x2 inverse hadamard transform for chroma block
+// *
+// * @par Description:
+// * The DC coefficients pass through a 2-stage inverse hadamard transform.
+// * This inverse transformed content is scaled to based on Qp value.
+// * Both DC blocks of the U and V planes are processed
+// *
+// * @param[in] pi2_src
+// * input 1x8 block of coeffs. First 4 are from U and next from V
+// *
+// * @param[out] pi2_out
+// * output 1x8 block
+// *
+// * @param[in] pu2_iscal_mat
+// * pointer to scaling list
+// *
+// * @param[in] pu2_weigh_mat
+// * pointer to weight matrix
+// *
+// * @param[in] u4_qp_div_6
+// * Floor (qp/6)
+// *
+// * @returns none
+// *
+// * @remarks none
+// *
+// *******************************************************************************
+// */
+// *
+// *******************************************************************************
+// */
+// void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
+// WORD16* pi2_out,
+// const UWORD16 *pu2_iscal_mat,
+// const UWORD16 *pu2_weigh_mat,
+// UWORD32 u4_qp_div_6,
+
+ .global ih264_ihadamard_scaling_2x2_uv_av8
+ih264_ihadamard_scaling_2x2_uv_av8:
+
+//2x2 inverse hadamard on the U and V chroma DC blocks (4 coeffs each),
+//then scaling by iscal[0]*weigh[0] with a net shift of (qp/6 - 5)
+//Registers used
+// x0 : *pi2_src
+// x1 : *pi2_out
+// x2 : *pu2_iscal_mat
+// x3 : *pu2_weigh_mat
+// x4 : u4_qp_div_6
+ push_v_regs
+ ld1 {v26.h}[0], [x2] //pu2_iscal_mat[0]
+ ld1 {v27.h}[0], [x3] //pu2_weigh_mat[0]
+
+ sub w4, w4, #5 //qp/6 - 5 (sshl with a negative amount shifts right)
+ dup v28.4s, w4 //broadcast the net shift amount
+
+ ld2 {v0.4h, v1.4h}, [x0] //load 8 dc coeffs, de-interleaved:
+ //i2_x4,i2_x6,i2_y4,i2_y6 -> d0
+ //i2_x5,i2_x7,i2_y5,i2_y7 -> d1
+
+ saddl v2.4s, v0.4h, v1.4h //i4_x0 = i4_x4 + i4_x5;...x2
+ ssubl v4.4s, v0.4h, v1.4h //i4_x1 = i4_x4 - i4_x5;...x3
+
+ umull v30.4s, v26.4h, v27.4h //pu2_iscal_mat[0]*pu2_weigh_mat[0]
+ dup v30.4s, v30.s[0] //broadcast the combined scale factor
+
+ trn1 v0.4s, v2.4s, v4.4s
+ trn2 v1.4s, v2.4s, v4.4s //i4_x0 i4_x1 -> q1
+
+ add v2.4s, v0.4s, v1.4s //i4_x4 = i4_x0+i4_x2;.. i4_x5
+ sub v3.4s, v0.4s, v1.4s //i4_x6 = i4_x0-i4_x2;.. i4_x7
+
+ mul v2.4s, v2.4s, v30.4s //scale by iscal[0]*weigh[0]
+ mul v3.4s, v3.4s, v30.4s
+
+ sshl v2.4s, v2.4s, v28.4s //apply the net (qp/6 - 5) shift
+ sshl v3.4s, v3.4s, v28.4s
+
+ xtn v0.4h, v2.4s //i4_x4 i4_x5 i4_y4 i4_y5
+ xtn v1.4h, v3.4s //i4_x6 i4_x7 i4_y6 i4_y7
+
+ //NOTE(review): st2 with .4s lanes writes 32 bytes, but the 8 result
+ //halfwords occupy only the low 64 bits of v0/v1 (xtn zeroes the upper
+ //halves) - confirm pi2_out has room for the extra 16 zero bytes, or
+ //whether st2 {v0.4h, v1.4h} was intended here
+ st2 {v0.4s-v1.4s}, [x1]
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/armv8/ih264_inter_pred_chroma_av8.s b/common/armv8/ih264_inter_pred_chroma_av8.s
new file mode 100755
index 0000000..714e271
--- /dev/null
+++ b/common/armv8/ih264_inter_pred_chroma_av8.s
@@ -0,0 +1,392 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+//******************************************************************************
+//* @file
+//* ih264_inter_pred_chroma_av8.s
+//*
+//* @brief
+//* Contains function definitions for inter prediction interpolation.
+//*
+//* @author
+//* Ittiam
+//*
+//* @par List of Functions:
+//*
+//* - ih264_inter_pred_chroma_av8()
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+///* All the functions here are replicated from ih264_inter_pred_filters.c
+//
+
+///**
+///**
+///**
+//
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* Interprediction chroma filter
+//*
+//* @par Description:
+//* Applies filtering to chroma samples as mentioned in
+//* sec 8.4.2.2.2 titled "chroma sample interpolation process"
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source containing alternate U and V samples
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in]uc_dx
+//* dx value where the sample is to be produced(refer sec 8.4.2.2.2 )
+//*
+//* @param[in] uc_dy
+//* dy value where the sample is to be produced(refer sec 8.4.2.2.2 )
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+//void ih264_inter_pred_chroma(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// UWORD8 u1_dx,
+// UWORD8 u1_dy,
+// WORD32 ht,
+// WORD32 wd)
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => u1_dx
+// x5 => u1_dy
+// x6 => height
+// x7 => width
+//
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+
+
+ .global ih264_inter_pred_chroma_av8
+
+ih264_inter_pred_chroma_av8:
+
+
+
+ // STMFD sp!, {x4-x12, x14} //store register values to stack
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+
+
+
+
+
+ sub x20, x4, #8 //8-u1_dx
+ neg x8, x20
+ sub x20, x5, #8 //8-u1_dy
+ neg x9, x20
+ mul x10, x8, x9 //
+ mul x11, x4, x9 //
+
+ dup v28.8b, w10
+ dup v29.8b, w11
+
+ mul x10, x8, x5 //
+ mul x11, x4, x5 //
+
+ dup v30.8b, w10
+ dup v31.8b, w11
+
+ subs x12, x7, #2 //if wd=4 branch to loop_4
+ beq loop_2
+ subs x12, x7, #4 //if wd=8 branch to loop_8
+ beq loop_4
+
+loop_8:
+ ld1 {v0.8b, v1.8b, v2.8b}, [x0], x2 //// Load row0 ;
+ ext v3.8b, v0.8b , v1.8b , #2
+ ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row1;
+ umull v20.8h, v0.8b, v28.8b
+ ext v8.8b, v5.8b , v6.8b , #2
+ umlal v20.8h, v3.8b, v29.8b
+ ext v9.8b, v6.8b , v7.8b , #2
+ umlal v20.8h, v5.8b, v30.8b
+ ext v4.8b, v1.8b , v2.8b , #2
+ umlal v20.8h, v8.8b, v31.8b
+ sqrshrun v26.8b, v20.8h, #6
+ umull v22.8h, v1.8b, v28.8b
+ ld1 {v10.8b, v11.8b, v12.8b}, [x0], x2 //// Load row2 ;
+ umlal v22.8h, v4.8b, v29.8b
+ ext v13.8b, v10.8b , v11.8b , #2
+ umlal v22.8h, v6.8b, v30.8b
+ ext v14.8b, v11.8b , v12.8b , #2
+ umlal v22.8h, v9.8b, v31.8b
+ sqrshrun v27.8b, v22.8h, #6
+ umull v24.8h, v5.8b, v28.8b
+ st1 { v26.8b, v27.8b}, [x1], x3 ////Store dest row
+ umlal v24.8h, v8.8b, v29.8b
+ ld1 {v0.8b, v1.8b, v2.8b}, [x0], x2 //// Load row3 ;
+ umlal v24.8h, v10.8b, v30.8b
+ ext v3.8b, v0.8b , v1.8b , #2
+ umlal v24.8h, v13.8b, v31.8b
+ ext v4.8b, v1.8b , v2.8b , #2
+ umull v16.8h, v6.8b, v28.8b
+ sqrshrun v18.8b, v24.8h, #6
+ umlal v16.8h, v9.8b, v29.8b
+ umlal v16.8h, v11.8b, v30.8b
+ umlal v16.8h, v14.8b, v31.8b
+ sqrshrun v19.8b, v16.8h, #6
+ st1 {v18.8b, v19.8b}, [x1], x3 // store row 1
+ umull v20.8h, v10.8b, v28.8b
+ umlal v20.8h, v13.8b, v29.8b
+ umlal v20.8h, v0.8b, v30.8b
+ umlal v20.8h, v3.8b, v31.8b
+ sqrshrun v26.8b, v20.8h, #6
+ umull v24.8h, v11.8b, v28.8b
+ ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row4;
+ umlal v24.8h, v14.8b, v29.8b
+ ext v8.8b, v5.8b , v6.8b , #2
+ umlal v24.8h, v1.8b, v30.8b
+ ext v9.8b, v6.8b , v7.8b , #2
+ umlal v24.8h, v4.8b, v31.8b
+ umull v20.8h, v0.8b, v28.8b
+ sqrshrun v27.8b, v24.8h, #6
+ umlal v20.8h, v3.8b, v29.8b
+ st1 { v26.8b, v27.8b}, [x1], x3 ////Store dest row2
+ umlal v20.8h, v5.8b, v30.8b
+ umlal v20.8h, v8.8b, v31.8b
+ umull v22.8h, v1.8b, v28.8b
+ umlal v22.8h, v4.8b, v29.8b
+ umlal v22.8h, v6.8b, v30.8b
+ sqrshrun v26.8b, v20.8h, #6
+ umlal v22.8h, v9.8b, v31.8b
+ subs x12, x6, #4
+ sqrshrun v27.8b, v22.8h, #6
+ st1 { v26.8b, v27.8b}, [x1], x3 ////Store dest row3
+
+ beq end_func //If ht=4
+
+ ld1 {v10.8b, v11.8b, v12.8b}, [x0], x2 //// Load row5
+ ext v13.8b, v10.8b , v11.8b , #2
+ umull v24.8h, v5.8b, v28.8b
+ ext v14.8b, v11.8b , v12.8b , #2
+ ld1 {v0.8b, v1.8b, v2.8b}, [x0], x2 //// Load row6;
+ umlal v24.8h, v8.8b, v29.8b
+ umlal v24.8h, v10.8b, v30.8b
+ umlal v24.8h, v13.8b, v31.8b
+ ext v3.8b, v0.8b , v1.8b , #2
+ umull v16.8h, v6.8b, v28.8b
+ sqrshrun v18.8b, v24.8h, #6
+ umlal v16.8h, v9.8b, v29.8b
+ umlal v16.8h, v11.8b, v30.8b
+ umlal v16.8h, v14.8b, v31.8b
+ ext v4.8b, v1.8b , v2.8b , #2
+ sqrshrun v19.8b, v16.8h, #6
+ st1 { v18.8b, v19.8b}, [x1], x3 // store row 4
+ umull v20.8h, v10.8b, v28.8b
+ umlal v20.8h, v13.8b, v29.8b
+ umlal v20.8h, v0.8b, v30.8b
+ umlal v20.8h, v3.8b, v31.8b
+ ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row7;
+ sqrshrun v26.8b, v20.8h, #6
+ umull v24.8h, v11.8b, v28.8b
+ umlal v24.8h, v14.8b, v29.8b
+ ext v8.8b, v5.8b , v6.8b , #2
+ umlal v24.8h, v1.8b, v30.8b
+ umlal v24.8h, v4.8b, v31.8b
+ ext v9.8b, v6.8b , v7.8b , #2
+ sqrshrun v27.8b, v24.8h, #6
+ st1 {v26.8b, v27.8b}, [x1], x3 ////Store dest row5
+ umull v20.8h, v0.8b, v28.8b
+ umlal v20.8h, v3.8b, v29.8b
+ umlal v20.8h, v5.8b, v30.8b
+ umlal v20.8h, v8.8b, v31.8b
+ ld1 {v10.8b, v11.8b, v12.8b}, [x0], x2 //// Load row8 ;
+ sqrshrun v26.8b, v20.8h, #6
+ umull v22.8h, v1.8b, v28.8b
+ umlal v22.8h, v4.8b, v29.8b
+ umlal v22.8h, v6.8b, v30.8b
+ ext v13.8b, v10.8b , v11.8b , #2
+ umlal v22.8h, v9.8b, v31.8b
+ ext v14.8b, v11.8b , v12.8b , #2
+ sqrshrun v27.8b, v22.8h, #6
+ st1 { v26.8b, v27.8b}, [x1], x3 ////Store dest row6
+ umull v24.8h, v5.8b, v28.8b
+ umlal v24.8h, v8.8b, v29.8b
+ umlal v24.8h, v10.8b, v30.8b
+ umlal v24.8h, v13.8b, v31.8b
+ umull v16.8h, v6.8b, v28.8b
+ sqrshrun v18.8b, v24.8h, #6
+ umlal v16.8h, v9.8b, v29.8b
+ umlal v16.8h, v11.8b, v30.8b
+ umlal v16.8h, v14.8b, v31.8b
+ sqrshrun v19.8b, v16.8h, #6
+ st1 { v18.8b, v19.8b}, [x1], x3 // store row 7
+ b end_func
+
+loop_4:
+ ld1 {v0.8b, v1.8b}, [x0], x2 //// Load row0 ;
+ ext v2.8b, v0.8b , v1.8b , #2
+ ld1 {v3.8b, v4.8b}, [x0], x2 //// Load row1;
+ ext v5.8b, v3.8b , v4.8b , #2
+ umull v20.8h, v0.8b, v28.8b
+ umlal v20.8h, v2.8b, v29.8b
+ umlal v20.8h, v3.8b, v30.8b
+ umlal v20.8h, v5.8b, v31.8b
+ ld1 {v6.8b, v7.8b}, [x0], x2 //// Load row2
+ sqrshrun v26.8b, v20.8h, #6
+ ext v8.8b, v6.8b , v7.8b , #2
+ st1 {v26.8b}, [x1], x3 ////Store dest row0
+ umull v22.8h, v3.8b, v28.8b
+ umlal v22.8h, v5.8b, v29.8b
+ umlal v22.8h, v6.8b, v30.8b
+ umlal v22.8h, v8.8b, v31.8b
+ subs x12, x6, #2
+ sqrshrun v27.8b, v22.8h, #6
+ st1 {v27.8b}, [x1], x3 ////Store dest row1
+ beq end_func //If ht=2
+
+ ld1 {v9.8b, v10.8b}, [x0], x2 //// Load row3;
+ ext v11.8b, v9.8b , v10.8b , #2
+ umull v24.8h, v6.8b, v28.8b
+ umlal v24.8h, v8.8b, v29.8b
+ umlal v24.8h, v9.8b, v30.8b
+ umlal v24.8h, v11.8b, v31.8b
+ ld1 {v0.8b, v1.8b}, [x0], x2 //// Load row4 ;
+ sqrshrun v16.8b, v24.8h, #6
+ ext v2.8b, v0.8b , v1.8b , #2
+ st1 {v16.8b}, [x1], x3 ////Store dest row2
+ umull v18.8h, v9.8b, v28.8b
+ umlal v18.8h, v11.8b, v29.8b
+ umlal v18.8h, v0.8b, v30.8b
+ umlal v18.8h, v2.8b, v31.8b
+ subs x12, x6, #4
+ sqrshrun v17.8b, v18.8h, #6
+ st1 {v17.8b}, [x1], x3 ////Store dest row3
+ beq end_func //If ht=4
+
+ ld1 {v3.8b, v4.8b}, [x0], x2 //// Load row5;
+ ext v5.8b, v3.8b , v4.8b , #2
+ umull v20.8h, v0.8b, v28.8b
+ umlal v20.8h, v2.8b, v29.8b
+ umlal v20.8h, v3.8b, v30.8b
+ umlal v20.8h, v5.8b, v31.8b
+ ld1 {v6.8b, v7.8b}, [x0], x2 //// Load row6 ;
+ sqrshrun v26.8b, v20.8h, #6
+ ext v8.8b, v6.8b , v7.8b , #2
+ st1 {v26.8b}, [x1], x3 ////Store dest row4
+ umull v22.8h, v3.8b, v28.8b
+ umlal v22.8h, v5.8b, v29.8b
+ umlal v22.8h, v6.8b, v30.8b
+ umlal v22.8h, v8.8b, v31.8b
+ ld1 {v9.8b, v10.8b}, [x0], x2 //// Load row7;
+ sqrshrun v27.8b, v22.8h, #6
+ ext v11.8b, v9.8b , v10.8b , #2
+ st1 {v27.8b}, [x1], x3 ////Store dest row5
+ umull v24.8h, v6.8b, v28.8b
+ umlal v24.8h, v8.8b, v29.8b
+ umlal v24.8h, v9.8b, v30.8b
+ umlal v24.8h, v11.8b, v31.8b
+ ld1 {v0.8b, v1.8b}, [x0], x2 //// Load row8;
+ sqrshrun v16.8b, v24.8h, #6
+ ext v2.8b, v0.8b , v1.8b , #2
+ st1 {v16.8b}, [x1], x3 ////Store dest row6
+ umull v18.8h, v9.8b, v28.8b
+ umlal v18.8h, v11.8b, v29.8b
+ umlal v18.8h, v0.8b, v30.8b
+ umlal v18.8h, v2.8b, v31.8b
+ sqrshrun v17.8b, v18.8h, #6
+ st1 {v17.8b}, [x1], x3 ////Store dest row7
+ b end_func
+
+loop_2:
+ ld1 {v0.8b}, [x0], x2 //// Load row0 ;
+ ext v2.8b, v0.8b , v0.8b , #2
+ ld1 {v3.8b}, [x0], x2 //// Load row1;
+ ext v5.8b, v3.8b , v3.8b , #2
+ umull v20.8h, v0.8b, v28.8b
+ umlal v20.8h, v2.8b, v29.8b
+ umlal v20.8h, v3.8b, v30.8b
+ umlal v20.8h, v5.8b, v31.8b
+ ld1 {v6.8b}, [x0], x2 //// Load row2
+ sqrshrun v26.8b, v20.8h, #6
+ ext v8.8b, v6.8b , v6.8b , #2
+ st1 {v26.s}[0], [x1], x3 ////Store dest row0
+ umull v22.8h, v3.8b, v28.8b
+ umlal v22.8h, v5.8b, v29.8b
+ umlal v22.8h, v6.8b, v30.8b
+ umlal v22.8h, v8.8b, v31.8b
+ subs x12, x6, #2
+ sqrshrun v27.8b, v22.8h, #6
+ st1 {v27.s}[0], [x1], x3 ////Store dest row1
+ beq end_func //If ht=2
+
+ ld1 {v9.8b}, [x0], x2 //// Load row3;
+ ext v11.8b, v9.8b , v9.8b , #2
+ umull v24.8h, v6.8b, v28.8b
+ umlal v24.8h, v8.8b, v29.8b
+ umlal v24.8h, v9.8b, v30.8b
+ umlal v24.8h, v11.8b, v31.8b
+ ld1 {v0.8b}, [x0], x2 //// Load row4 ;
+ sqrshrun v16.8b, v24.8h, #6
+ ext v2.8b, v0.8b , v0.8b , #2
+ st1 {v16.s}[0], [x1], x3 ////Store dest row2
+ umull v18.8h, v9.8b, v28.8b
+ umlal v18.8h, v11.8b, v29.8b
+ umlal v18.8h, v0.8b, v30.8b
+ umlal v18.8h, v2.8b, v31.8b
+ sqrshrun v17.8b, v18.8h, #6
+ st1 {v17.s}[0], [x1], x3 ////Store dest row3
+
+
+end_func:
+ // LDMFD sp!,{x4-x12,PC} //Restoring registers from stack
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
diff --git a/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s b/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s
new file mode 100755
index 0000000..6ad463a
--- /dev/null
+++ b/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s
@@ -0,0 +1,530 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+//******************************************************************************
+//* @file
+//* ih264_inter_pred_luma_horz_av8.s
+//*
+//* @brief
+//* Contains function definitions for inter prediction interpolation.
+//*
+//* @author
+//* Ittiam
+//*
+//* @par List of Functions:
+//*
+//* - ih264_inter_pred_luma_horz_av8()
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+///* All the functions here are replicated from ih264_inter_pred_filters.c
+//
+
+///**
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* Interprediction luma filter for horizontal input
+//*
+//* @par Description:
+//* Applies a 6 tap horizontal filter .The output is clipped to 8 bits
+//* sec 8.4.2.2.1 titled "Luma sample interpolation process"
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @returns
+//*
+// @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+//void ih264_inter_pred_luma_horz (
+// UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ht,
+// WORD32 wd )
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ht
+// x5 => wd
+
+.text
+.p2align 2
+
+.include "ih264_neon_macros.s"
+
+
+
+ .global ih264_inter_pred_luma_horz_av8
+
+ih264_inter_pred_luma_horz_av8:
+
+
+
+
+ // STMFD sp!, {x4-x12, x14} //store register values to stack
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+ sub x0, x0, #2 //pu1_src-2
+ sub x14, x4, #16
+ movi v0.8b, #5 //filter coeff
+ subs x12, x5, #8 //if wd=8 branch to loop_8
+ movi v1.8b, #20 //filter coeff
+ beq loop_8
+
+ subs x12, x5, #4 //if wd=4 branch to loop_4
+ beq loop_4
+
+loop_16: //when wd=16
+ //// Processing row0 and row1
+ ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row0
+ add x14, x14, #1 //for checking loop
+ ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row0)
+ ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row1
+ ext v30.8b, v3.8b , v4.8b, #5 ////extract a[5] (column2,row0)
+ uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row0)
+ ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row1)
+ uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row0)
+ ext v27.8b, v6.8b , v7.8b, #5 ////extract a[5] (column2,row1)
+ uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row1)
+ ext v31.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row0)
+ uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row1)
+ ext v30.8b, v3.8b , v4.8b, #2 ////extract a[2] (column2,row0)
+ umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0)
+ ext v28.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row1)
+ umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0)
+ ext v27.8b, v6.8b , v7.8b, #2 ////extract a[2] (column2,row1)
+ umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row1)
+ ext v31.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row0)
+ umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row1)
+ ext v30.8b, v3.8b , v4.8b, #3 ////extract a[3] (column2,row0)
+ umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ ext v28.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row1)
+ umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0)
+ ext v27.8b, v6.8b , v7.8b, #3 ////extract a[3] (column2,row1)
+ umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ ext v31.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row0)
+ umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row1)
+ ext v30.8b, v3.8b , v4.8b, #1 ////extract a[1] (column2,row0)
+ umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ ext v28.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row1)
+ umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
+ ext v27.8b, v6.8b , v7.8b, #1 ////extract a[1] (column2,row1)
+ umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ ext v31.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row0)
+ umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1)
+ ext v30.8b, v3.8b , v4.8b, #4 ////extract a[4] (column2,row0)
+ umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ ext v28.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row1)
+ umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
+ ext v27.8b, v6.8b , v7.8b, #4 ////extract a[4] (column2,row1)
+ umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row2
+ umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1)
+
+ sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row3
+ sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
+ ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row2)
+ st1 {v20.8b, v21.8b}, [x1], x3 ////Store dest row0
+ sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ ext v30.8b, v3.8b , v4.8b, #5 ////extract a[5] (column2,row2)
+ sqrshrun v24.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1)
+
+
+
+//// Processing row2 and row3
+ ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row3)
+ uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row2)
+ st1 {v23.8b, v24.8b}, [x1], x3 ////Store dest row1
+ uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row2)
+ ext v27.8b, v6.8b , v7.8b, #5 ////extract a[5] (column2,row3)
+ uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row3)
+ ext v31.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row2)
+ uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row3)
+ ext v30.8b, v3.8b , v4.8b, #2 ////extract a[2] (column2,row2)
+ umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row2)
+ ext v27.8b, v6.8b , v7.8b, #2 ////extract a[2] (column2,row3)
+ umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row2)
+ ext v28.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row3)
+ umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row3)
+ ext v31.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row2)
+ umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row3)
+ ext v30.8b, v3.8b , v4.8b, #3 ////extract a[3] (column2,row2)
+ umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row2)
+ ext v28.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row3)
+ umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row2)
+ ext v27.8b, v6.8b , v7.8b, #3 ////extract a[3] (column2,row3)
+ umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row3)
+ ext v31.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row2)
+ umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row3)
+ ext v30.8b, v3.8b , v4.8b, #1 ////extract a[1] (column2,row2)
+ umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row2)
+ ext v28.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row3)
+ umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row2)
+ ext v27.8b, v6.8b , v7.8b, #1 ////extract a[1] (column2,row3)
+ umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row3)
+ ext v31.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row2)
+ umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row3)
+ ext v30.8b, v3.8b , v4.8b, #4 ////extract a[4] (column2,row2)
+ umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row2)
+ ext v28.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row3)
+ umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row2)
+ ext v27.8b, v6.8b , v7.8b, #4 ////extract a[4] (column2,row3)
+ umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row3)
+ ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row4
+ umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row3)
+
+ sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row2)
+ ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row5
+ sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row2)
+ ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row4)
+ st1 {v20.8b, v21.8b}, [x1], x3 ////Store dest row2
+ sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row3)
+ ext v30.8b, v3.8b , v4.8b, #5 ////extract a[5] (column2,row4)
+ sqrshrun v24.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row3)
+
+
+//// Processing row4 and row5
+ ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row5)
+ uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row4)
+ st1 {v23.8b, v24.8b}, [x1], x3 ////Store dest row3
+ uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row4)
+ ext v27.8b, v6.8b , v7.8b, #5 ////extract a[5] (column2,row5)
+ uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row5)
+ ext v31.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row4)
+ uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row5)
+ ext v30.8b, v3.8b , v4.8b, #2 ////extract a[2] (column2,row4)
+ umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row4)
+ ext v27.8b, v6.8b , v7.8b, #2 ////extract a[2] (column2,row5)
+ umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row4)
+ ext v28.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row5)
+ umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row5)
+ ext v31.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row4)
+ umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row5)
+ ext v30.8b, v3.8b , v4.8b, #3 ////extract a[3] (column2,row4)
+ umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row4)
+ ext v28.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row5)
+ umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row4)
+ ext v27.8b, v6.8b , v7.8b, #3 ////extract a[3] (column2,row5)
+ umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row5)
+ ext v31.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row4)
+ umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row5)
+ ext v30.8b, v3.8b , v4.8b, #1 ////extract a[1] (column2,row4)
+ umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row4)
+ ext v28.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row5)
+ umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row4)
+ ext v27.8b, v6.8b , v7.8b, #1 ////extract a[1] (column2,row5)
+ umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row5)
+ ext v31.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row4)
+ umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row5)
+ ext v30.8b, v3.8b , v4.8b, #4 ////extract a[4] (column2,row4)
+ umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row4)
+ ext v28.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row5)
+ umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row4)
+ ext v27.8b, v6.8b , v7.8b, #4 ////extract a[4] (column2,row5)
+ umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row5)
+ ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row6
+ umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row5)
+
+ sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row4)
+ ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row7
+ sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row4)
+ ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row6)
+ st1 {v20.8b, v21.8b}, [x1], x3 ////Store dest row4
+ sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row5)
+ ext v30.8b, v3.8b , v4.8b, #5 ////extract a[5] (column2,row6)
+ sqrshrun v24.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row5)
+
+
+
+ //// Processing row6 and row7
+
+ ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row7)
+ uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row6)
+ st1 {v23.8b, v24.8b}, [x1], x3 ////Store dest row5
+ uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row6)
+ ext v27.8b, v6.8b , v7.8b, #5 ////extract a[5] (column2,row7)
+ uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row7)
+ ext v31.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row6)
+ uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row7)
+ ext v30.8b, v3.8b , v4.8b, #2 ////extract a[2] (column2,row6)
+ umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row6)
+ ext v27.8b, v6.8b , v7.8b, #2 ////extract a[2] (column2,row7)
+ umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row6)
+ ext v28.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row7)
+ umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row7)
+ ext v31.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row6)
+ umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row7)
+ ext v30.8b, v3.8b , v4.8b, #3 ////extract a[3] (column2,row6)
+ umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row6)
+ ext v28.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row7)
+ umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row6)
+ ext v27.8b, v6.8b , v7.8b, #3 ////extract a[3] (column2,row7)
+ umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row7)
+ ext v31.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row6)
+ umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row7)
+ ext v30.8b, v3.8b , v4.8b, #1 ////extract a[1] (column2,row6)
+ umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row6)
+ ext v28.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row7)
+ umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row6)
+ ext v27.8b, v6.8b , v7.8b, #1 ////extract a[1] (column2,row7)
+ umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row7)
+ ext v31.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row6)
+ umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row7)
+ ext v30.8b, v3.8b , v4.8b, #4 ////extract a[4] (column2,row6)
+ umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row6)
+ ext v28.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row7)
+ umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row6)
+ ext v27.8b, v6.8b , v7.8b, #4 ////extract a[4] (column2,row7)
+
+ sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row6)
+ umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row7)
+ sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row6)
+ umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row7)
+ sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row7)
+ st1 {v20.8b, v21.8b}, [x1], x3 ////Store dest row6
+ sqrshrun v24.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row7)
+ subs x12, x14, #1 // if height==16 - looping
+ st1 {v23.8b, v24.8b}, [x1], x3 ////Store dest row7
+
+
+
+ beq loop_16
+ b end_func
+
+
+
+loop_8:
+//// Processing row0 and row1
+
+
+ ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row1
+ add x14, x14, #1 //for checking loop
+ ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row1)
+ ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row0
+ ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row1)
+ ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row0)
+ ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row1)
+ ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row1)
+ ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row1)
+ uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row1)
+ ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row0)
+ umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row1)
+ umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row0)
+ uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row0)
+ ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row0)
+ ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row0)
+ ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row2
+ umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0)
+ umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row3
+ sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+
+ //// Processing row2 and row3
+ ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row3)
+ ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row3)
+ ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row2)
+ uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row3)
+ st1 {v23.8b}, [x1], x3 ////Store dest row0
+ ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row3)
+ ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row3)
+ sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row3)
+ ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row2)
+ umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row3)
+ umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row3)
+ umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row3)
+ umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row3)
+ st1 {v20.8b}, [x1], x3 ////Store dest row1
+ uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row2)
+ ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row2)
+ ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row2)
+ ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row2)
+ ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row4
+ umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row2)
+ umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row2)
+ umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row2)
+ umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row2)
+ ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row5
+ subs x9, x4, #4
+ sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row3)
+ ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row5)
+ ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row5)
+ ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row4)
+ uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row5)
+ ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row5)
+ sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row2)
+ ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row5)
+ ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row4)
+ st1 {v20.8b}, [x1], x3 ////Store dest row2
+ ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row4)
+ uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row4)
+ st1 {v23.8b}, [x1], x3 ////Store dest row3
+ beq end_func // Branch if height==4
+
+//// Processing row4 and row5
+ ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row5)
+ umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row5)
+ umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row5)
+ umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row5)
+ umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row5)
+ ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row4)
+ ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row4)
+ ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row6
+ umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row4)
+ umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row4)
+ umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row4)
+ umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row4)
+ sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row5)
+ ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row7
+ ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row6)
+ ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row7)
+ ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row7)
+ uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row7)
+ ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row7)
+ ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row7)
+ sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row4)
+ ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row6)
+ ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row6)
+ st1 {v20.8b}, [x1], x3 ////Store dest row4
+ ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row6)
+ uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row6)
+ ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row6)
+ umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row6)
+ umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row6)
+ umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row6)
+ umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row6)
+ //// Processing row6 and row7
+ st1 {v23.8b}, [x1], x3 ////Store dest row5
+ ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row7)
+ umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row7)
+ umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row7)
+ umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row7)
+ umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row7)
+ sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row6)
+ subs x12, x14, #1
+ st1 {v20.8b}, [x1], x3 ////Store dest row6
+ sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row7)
+ st1 {v23.8b}, [x1], x3 ////Store dest row7
+
+ beq loop_8 //looping if height ==16
+
+ b end_func
+loop_4:
+ ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row1
+ ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row1)
+ ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row0
+ ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row1)
+ ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row0)
+ uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row1)
+ ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row1)
+ ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row1)
+ ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row1)
+ ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row0)
+ umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row1)
+ umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row0)
+ ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row0)
+ ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row0)
+ ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row0)
+ ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row2
+ umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0)
+ umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row3
+ ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row3)
+ ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row3)
+ sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row2)
+ ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row3)
+ st1 {v23.s}[0], [x1], x3 ////Store dest row0
+ ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row3)
+ ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row3)
+ ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row2)
+ sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row2)
+ ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row2)
+
+ //// Processing row2 and row3
+ st1 {v20.s}[0], [x1], x3 ////Store dest row1
+ uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row3)
+ ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row2)
+ umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row3)
+ umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row3)
+ umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row3)
+ umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row3)
+ uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row2)
+ umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row2)
+ umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row2)
+ umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row2)
+ umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row2)
+ sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row3)
+ sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row2)
+ st1 {v20.s}[0], [x1], x3 ////Store dest row2
+ subs x4, x4, #8 // Loop if height =8
+ st1 {v23.s}[0], [x1], x3 ////Store dest row3
+ beq loop_4
+
+end_func:
+ // LDMFD sp!,{x4-x12,PC} //Restoring registers from stack
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s b/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s
new file mode 100755
index 0000000..38934c9
--- /dev/null
+++ b/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s
@@ -0,0 +1,452 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+//******************************************************************************
+//* @file
+//* ih264_inter_pred_luma_vert_av8.s
+//*
+//* @brief
+//* Contains function definitions for inter prediction interpolation.
+//*
+//* @author
+//* Ittiam
+//*
+//* @par List of Functions:
+//*
+//* - ih264_inter_pred_luma_vert_av8()
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+///* All the functions here are replicated from ih264_inter_pred_filters.c
+//
+
+///**
+///**
+///**
+// *******************************************************************************
+// *
+// * @brief
+// * Interprediction luma filter for vertical input
+// *
+// * @par Description:
+// * Applies a 6 tap vertical filter. The output is clipped to 8 bits
+// * sec 8.4.2.2.1 titled "Luma sample interpolation process"
+// *
+// * @param[in] pu1_src
+// * UWORD8 pointer to the source
+// *
+// * @param[out] pu1_dst
+// * UWORD8 pointer to the destination
+// *
+// * @param[in] src_strd
+// * integer source stride
+// *
+// * @param[in] dst_strd
+// * integer destination stride
+// *
+// * @param[in] ht
+// * integer height of the array
+// *
+// * @param[in] wd
+// * integer width of the array
+// *
+// * @returns
+// *
+// * @remarks
+// * None
+// *
+// *******************************************************************************
+
+//void ih264_inter_pred_luma_vert (
+// UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ht,
+// WORD32 wd )
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ht
+// x5 => wd
+
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+
+
+
+ .global ih264_inter_pred_luma_vert_av8
+
+ih264_inter_pred_luma_vert_av8:
+
+    // STMFD sp!, {x4-x12, x14}    //store register values to stack
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    sub x0, x0, x2, lsl #1    //pu1_src-2*src_strd
+
+    sub x14, x4, #16
+    movi v22.8h, #20    // Filter coeff 20 (0x14) into Q11
+
+    subs x12, x5, #8    //if wd=8 branch to loop_8
+    movi v24.8h, #5    // Filter coeff 5 (0x5) into Q12
+    beq loop_8_start
+
+    subs x12, x5, #4    //if wd=4 branch to loop_4
+    beq loop_4_start
+
+
+    ld1 {v0.2s, v1.2s}, [x0], x2    // Vector load from src[0_0]
+    ld1 {v2.2s, v3.2s}, [x0], x2    // Vector load from src[1_0]
+    ld1 {v4.2s, v5.2s}, [x0], x2    // Vector load from src[2_0]
+    ld1 {v6.2s, v7.2s}, [x0], x2    // Vector load from src[3_0]
+    add x14, x14, #1    //for checking loop
+    ld1 {v8.2s, v9.2s}, [x0], x2    // Vector load from src[4_0]
+    uaddl v12.8h, v4.8b, v6.8b    // temp1 = src[2_0] + src[3_0]
+    ld1 {v10.2s, v11.2s}, [x0], x2    // Vector load from src[5_0]
+
+loop_16:    //when wd=16
+
+    uaddl v14.8h, v0.8b, v10.8b    // temp = src[0_0] + src[5_0]
+    uaddl v16.8h, v2.8b, v8.8b    // temp2 = src[1_0] + src[4_0]
+    mla v14.8h, v12.8h, v22.8h    // temp += temp1 * 20
+    uaddl v20.8h, v1.8b, v11.8b    // temp4 = src[0_8] + src[5_8]
+    uaddl v18.8h, v5.8b, v7.8b    // temp3 = src[2_8] + src[3_8]
+    mla v20.8h, v18.8h , v22.8h    // temp4 += temp3 * 20
+    ld1 {v0.2s, v1.2s}, [x0], x2
+    uaddl v26.8h, v3.8b, v9.8b    // temp5 = src[1_8] + src[4_8]
+    uaddl v12.8h, v6.8b, v8.8b
+    mls v14.8h, v16.8h , v24.8h    // temp -= temp2 * 5
+    uaddl v16.8h, v2.8b, v0.8b
+    uaddl v18.8h, v4.8b, v10.8b
+    mla v16.8h, v12.8h , v22.8h
+    mls v20.8h, v26.8h , v24.8h    // temp4 -= temp5 * 5
+    uaddl v26.8h, v5.8b, v11.8b
+    uaddl v12.8h, v7.8b, v9.8b
+    sqrshrun v30.8b, v14.8h, #5    // dst[0_0] = CLIP_U8((temp +16) >> 5)
+    uaddl v14.8h, v3.8b, v1.8b
+    ld1 {v2.2s, v3.2s}, [x0], x2
+    mla v14.8h, v12.8h , v22.8h
+    mls v16.8h, v18.8h , v24.8h
+    sqrshrun v31.8b, v20.8h, #5    // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
+    uaddl v18.8h, v4.8b, v2.8b
+    uaddl v12.8h, v8.8b, v10.8b
+
+    st1 {v30.2s, v31.2s}, [x1], x3    // Vector store to dst[0_0]
+    mla v18.8h, v12.8h , v22.8h
+    uaddl v20.8h, v6.8b, v0.8b
+    mls v14.8h, v26.8h , v24.8h
+    sqrshrun v30.8b, v16.8h, #5
+    uaddl v12.8h, v9.8b, v11.8b
+    uaddl v16.8h, v5.8b, v3.8b
+    uaddl v26.8h, v7.8b, v1.8b
+    mla v16.8h, v12.8h , v22.8h
+    mls v18.8h, v20.8h , v24.8h
+    ld1 {v4.2s, v5.2s}, [x0], x2
+
+    sqrshrun v31.8b, v14.8h, #5
+    uaddl v12.8h, v10.8b, v0.8b
+    uaddl v14.8h, v6.8b, v4.8b
+    uaddl v20.8h, v8.8b, v2.8b
+    mla v14.8h, v12.8h , v22.8h
+    mls v16.8h, v26.8h , v24.8h
+    st1 {v30.2s, v31.2s}, [x1], x3    //store row 1
+    sqrshrun v30.8b, v18.8h, #5
+    uaddl v18.8h, v7.8b, v5.8b
+    uaddl v12.8h, v11.8b, v1.8b
+    mla v18.8h, v12.8h , v22.8h
+    uaddl v26.8h, v9.8b, v3.8b
+    mls v14.8h, v20.8h , v24.8h
+    ld1 {v6.2s, v7.2s}, [x0], x2
+    sqrshrun v31.8b, v16.8h, #5
+    mls v18.8h, v26.8h , v24.8h
+    uaddl v12.8h, v0.8b, v2.8b    // temp1 = src[2_0] + src[3_0]
+    st1 {v30.2s, v31.2s}, [x1], x3    //store row 2
+    uaddl v16.8h, v10.8b, v4.8b    // temp2 = src[1_0] + src[4_0]
+    uaddl v20.8h, v9.8b, v7.8b    // temp4 = src[0_8] + src[5_8]
+    sqrshrun v30.8b, v14.8h, #5
+    uaddl v26.8h, v5.8b, v11.8b    // temp5 = src[1_8] + src[4_8]
+    uaddl v14.8h, v8.8b, v6.8b    // temp = src[0_0] + src[5_0]
+    sqrshrun v31.8b, v18.8h, #5
+    mla v14.8h, v12.8h , v22.8h    // temp += temp1 * 20
+    uaddl v18.8h, v1.8b, v3.8b    // temp3 = src[2_8] + src[3_8]
+    st1 {v30.2s, v31.2s}, [x1], x3    //store row 3
+    // 4 rows processed
+    mla v20.8h, v18.8h , v22.8h    // temp4 += temp3 * 20
+    ld1 {v8.2s, v9.2s}, [x0], x2
+    uaddl v12.8h, v2.8b, v4.8b
+    uaddl v18.8h, v3.8b, v5.8b
+    mls v14.8h, v16.8h , v24.8h    // temp -= temp2 * 5
+    uaddl v28.8h, v9.8b, v11.8b
+    uaddl v16.8h, v6.8b, v0.8b
+    mla v28.8h, v18.8h , v22.8h    // temp4 += temp3 * 20
+    mls v20.8h, v26.8h , v24.8h    // temp4 -= temp5 * 5
+    uaddl v26.8h, v1.8b, v7.8b
+    uaddl v18.8h, v5.8b, v7.8b
+    sqrshrun v30.8b, v14.8h, #5    // dst[0_0] = CLIP_U8((temp +16) >> 5)
+    uaddl v14.8h, v8.8b, v10.8b
+
+    sqrshrun v31.8b, v20.8h, #5    // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
+    ld1 {v10.2s, v11.2s}, [x0], x2
+    mls v28.8h, v26.8h , v24.8h    // temp4 -= temp5 * 5
+    st1 {v30.2s, v31.2s}, [x1], x3    // store row 4
+    mla v14.8h, v12.8h , v22.8h    // temp += temp1 * 20
+    uaddl v20.8h, v11.8b, v1.8b
+    uaddl v26.8h, v3.8b, v9.8b
+    mla v20.8h, v18.8h , v22.8h    // temp4 += temp3 * 20
+    uaddl v12.8h, v6.8b, v4.8b
+    uaddl v18.8h, v7.8b, v9.8b
+    sqrshrun v31.8b, v28.8h, #5    // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
+    mls v14.8h, v16.8h , v24.8h    // temp -= temp2 * 5
+    uaddl v16.8h, v8.8b, v2.8b
+    sqrshrun v30.8b, v14.8h, #5    // dst[0_0] = CLIP_U8((temp +16) >> 5)
+    mls v20.8h, v26.8h , v24.8h    // temp4 -= temp5 * 5
+    uaddl v14.8h, v10.8b, v0.8b
+    st1 {v30.2s, v31.2s}, [x1], x3    // store row 5
+    mla v14.8h, v12.8h , v22.8h    // temp += temp1 * 20
+    ld1 {v0.2s, v1.2s}, [x0], x2
+    uaddl v26.8h, v5.8b, v11.8b
+    uaddl v12.8h, v8.8b, v6.8b
+    uaddl v28.8h, v0.8b, v2.8b
+    sqrshrun v31.8b, v20.8h, #5    // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
+    mla v28.8h, v12.8h , v22.8h    // temp += temp1 * 20
+    uaddl v20.8h, v1.8b, v3.8b
+    mls v14.8h, v16.8h , v24.8h    // temp -= temp2 * 5
+    mla v20.8h, v18.8h , v22.8h    // temp4 += temp3 * 20
+    uaddl v16.8h, v10.8b, v4.8b
+    sqrshrun v30.8b, v14.8h, #5    // dst[0_0] = CLIP_U8((temp +16) >> 5)
+    mov v2.8b, v6.8b
+    mov v3.8b, v7.8b
+    mls v28.8h, v16.8h , v24.8h    // temp -= temp2 * 5
+    st1 {v30.2s, v31.2s}, [x1], x3    // store row 6
+    sqrshrun v30.8b, v28.8h, #5    // dst[0_0] = CLIP_U8((temp +16) >> 5)
+
+    swp v0.8b v4.8b
+    swp v1.8b v5.8b
+
+
+
+    mls v20.8h, v26.8h , v24.8h    // temp4 -= temp5 * 5
+    mov v6.8b, v10.8b
+    mov v7.8b, v11.8b
+    subs x12, x14, #1    // if height==16 - looping
+
+    swp v4.8b v8.8b
+    swp v5.8b v9.8b
+
+
+    sqrshrun v31.8b, v20.8h, #5    // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
+    st1 {v30.2s, v31.2s}, [x1], x3    // store row 7
+    bne end_func    //if height == 8, end function
+    add x14, x14, #1    //for checking loop
+    ld1 {v10.2s, v11.2s}, [x0], x2
+    uaddl v12.8h, v4.8b, v6.8b    // temp1 = src[2_0] + src[3_0]
+
+    b loop_16    // looping if height =16
+
+loop_8_start:
+//// Processing row0 and row1
+
+    ld1 {v0.2s}, [x0], x2    // Vector load from src[0_0]
+    ld1 {v1.2s}, [x0], x2    // Vector load from src[1_0]
+    ld1 {v2.2s}, [x0], x2    // Vector load from src[2_0]
+    ld1 {v3.2s}, [x0], x2    // Vector load from src[3_0]
+    add x14, x14, #1    //for checking loop
+    ld1 {v4.2s}, [x0], x2    // Vector load from src[4_0]
+    ld1 {v5.2s}, [x0], x2    // Vector load from src[5_0]
+
+loop_8:
+    //for checking loop
+    uaddl v6.8h, v2.8b, v3.8b    // temp1 = src[2_0] + src[3_0]
+    uaddl v8.8h, v0.8b, v5.8b    // temp = src[0_0] + src[5_0]
+    uaddl v10.8h, v1.8b, v4.8b    // temp2 = src[1_0] + src[4_0]
+    mla v8.8h, v6.8h , v22.8h    // temp += temp1 * 20
+    ld1 {v6.2s}, [x0], x2
+    uaddl v14.8h, v3.8b, v4.8b
+    uaddl v16.8h, v1.8b, v6.8b
+    uaddl v18.8h, v2.8b, v5.8b
+    mls v8.8h, v10.8h , v24.8h    // temp -= temp2 * 5
+    mla v16.8h, v14.8h , v22.8h
+    ld1 {v7.2s}, [x0], x2
+    uaddl v20.8h, v4.8b, v5.8b
+    uaddl v12.8h, v2.8b, v7.8b
+    uaddl v10.8h, v3.8b, v6.8b
+    mls v16.8h, v18.8h , v24.8h
+    sqrshrun v26.8b, v8.8h, #5    // dst[0_0] = CLIP_U8( (temp + 16) >> 5)
+    mla v12.8h, v20.8h , v22.8h
+    ld1 {v0.2s}, [x0], x2
+    uaddl v14.8h, v5.8b, v6.8b
+    sqrshrun v27.8b, v16.8h, #5
+    uaddl v20.8h, v3.8b, v0.8b
+    mls v12.8h, v10.8h , v24.8h
+    st1 {v26.2s}, [x1], x3    // Vector store to dst[0_0]
+    uaddl v18.8h, v4.8b, v7.8b
+    mla v20.8h, v14.8h , v22.8h
+    st1 {v27.2s}, [x1], x3
+    sqrshrun v28.8b, v12.8h, #5
+    st1 {v28.2s}, [x1], x3
+    mls v20.8h, v18.8h , v24.8h
+    ld1 {v1.2s}, [x0], x2
+    sqrshrun v29.8b, v20.8h, #5
+    subs x9, x4, #4
+    st1 {v29.2s}, [x1], x3    //store row 3
+
+
+    beq end_func    // Branch if height==4
+
+
+    uaddl v14.8h, v6.8b, v7.8b    // temp1 = src[2_0] + src[3_0]
+    uaddl v16.8h, v0.8b, v5.8b    // temp = src[0_0] + src[5_0]
+    uaddl v18.8h, v1.8b, v4.8b    // temp2 = src[1_0] + src[4_0]
+    mla v18.8h, v14.8h , v22.8h    // temp += temp1 * 20
+    ld1 {v2.2s}, [x0], x2
+    mls v18.8h, v16.8h , v24.8h    // temp -= temp2 * 5
+    uaddl v8.8h, v0.8b, v7.8b
+    uaddl v10.8h, v1.8b, v6.8b
+    uaddl v12.8h, v2.8b, v5.8b
+    sqrshrun v26.8b, v18.8h, #5
+    mla v12.8h, v8.8h , v22.8h
+    ld1 {v3.2s}, [x0], x2
+    mls v12.8h, v10.8h , v24.8h
+    st1 {v26.2s}, [x1], x3
+    sqrshrun v27.8b, v12.8h, #5
+    st1 {v27.2s}, [x1], x3
+    uaddl v14.8h, v0.8b, v1.8b    // temp1 = src[2_0] + src[3_0]
+    uaddl v16.8h, v2.8b, v7.8b    // temp = src[0_0] + src[5_0]
+    uaddl v18.8h, v3.8b, v6.8b    // temp2 = src[1_0] + src[4_0]
+    mla v18.8h, v14.8h , v22.8h    // temp += temp1 * 20
+    ld1 {v4.2s}, [x0], x2
+    mls v18.8h, v16.8h , v24.8h    // temp -= temp2 * 5
+    uaddl v8.8h, v2.8b, v1.8b
+    uaddl v10.8h, v3.8b, v0.8b
+    uaddl v12.8h, v4.8b, v7.8b
+    sqrshrun v26.8b, v18.8h, #5
+    mla v12.8h, v8.8h , v22.8h
+    ld1 {v5.2s}, [x0], x2
+    mls v12.8h, v10.8h , v24.8h
+    st1 {v26.2s}, [x1], x3
+    sqrshrun v27.8b, v12.8h, #5
+    subs x12, x14, #1
+    st1 {v27.2s}, [x1], x3
+    add x14, x14, #1
+    beq loop_8    //looping if height ==16
+
+    b end_func
+
+
+loop_4_start:
+//// Processing row0 and row1
+
+
+    ld1 {v0.s}[0], [x0], x2    // Vector load from src[0_0]
+    ld1 {v1.s}[0], [x0], x2    // Vector load from src[1_0]
+    ld1 {v2.s}[0], [x0], x2    // Vector load from src[2_0]
+    ld1 {v3.s}[0], [x0], x2    // Vector load from src[3_0]
+    ld1 {v4.s}[0], [x0], x2    // Vector load from src[4_0]
+    ld1 {v5.s}[0], [x0], x2    // Vector load from src[5_0]
+
+    uaddl v6.8h, v2.8b, v3.8b    // temp1 = src[2_0] + src[3_0]
+    uaddl v8.8h, v0.8b, v5.8b    // temp = src[0_0] + src[5_0]
+    uaddl v10.8h, v1.8b, v4.8b    // temp2 = src[1_0] + src[4_0]
+    mla v8.8h, v6.8h , v22.8h    // temp += temp1 * 20
+    ld1 {v6.2s}, [x0], x2
+    uaddl v14.8h, v3.8b, v4.8b
+    uaddl v16.8h, v1.8b, v6.8b
+    uaddl v18.8h, v2.8b, v5.8b
+    mls v8.8h, v10.8h , v24.8h    // temp -= temp2 * 5
+    ld1 {v7.s}[0], [x0], x2
+    mla v16.8h, v14.8h , v22.8h
+    uaddl v20.8h, v4.8b, v5.8b
+    uaddl v12.8h, v2.8b, v7.8b
+    uaddl v10.8h, v3.8b, v6.8b
+    mls v16.8h, v18.8h , v24.8h
+    sqrshrun v26.8b, v8.8h, #5    // dst[0_0] = CLIP_U8( (temp + 16) >> 5)
+    mla v12.8h, v20.8h , v22.8h
+    ld1 {v0.s}[0], [x0], x2
+    uaddl v14.8h, v5.8b, v6.8b
+    sqrshrun v27.8b, v16.8h, #5
+    uaddl v20.8h, v3.8b, v0.8b
+    mls v12.8h, v10.8h , v24.8h
+    st1 {v26.s}[0], [x1], x3    // Vector store to dst[0_0]
+    uaddl v18.8h, v4.8b, v7.8b
+    mla v20.8h, v14.8h , v22.8h
+    st1 {v27.s}[0], [x1], x3
+    sqrshrun v28.8b, v12.8h, #5
+    st1 {v28.s}[0], [x1], x3
+    mls v20.8h, v18.8h , v24.8h
+    ld1 {v1.s}[0], [x0], x2
+    sqrshrun v29.8b, v20.8h, #5
+    st1 {v29.s}[0], [x1], x3    //store row 3
+
+    subs x9, x4, #4
+    beq end_func    // Branch if height==4
+
+
+    uaddl v14.8h, v6.8b, v7.8b    // temp1 = src[2_0] + src[3_0]
+    uaddl v16.8h, v0.8b, v5.8b    // temp = src[0_0] + src[5_0]
+    uaddl v18.8h, v1.8b, v4.8b    // temp2 = src[1_0] + src[4_0]
+    mla v18.8h, v14.8h , v22.8h    // temp += temp1 * 20
+    ld1 {v2.s}[0], [x0], x2
+    mls v18.8h, v16.8h , v24.8h    // temp -= temp2 * 5
+    uaddl v8.8h, v0.8b, v7.8b
+    uaddl v10.8h, v1.8b, v6.8b
+    uaddl v12.8h, v2.8b, v5.8b
+    sqrshrun v26.8b, v18.8h, #5
+    mla v12.8h, v8.8h , v22.8h
+    ld1 {v3.s}[0], [x0], x2
+    mls v12.8h, v10.8h , v24.8h
+    st1 {v26.s}[0], [x1], x3
+    sqrshrun v27.8b, v12.8h, #5
+    st1 {v27.s}[0], [x1], x3
+    uaddl v14.8h, v0.8b, v1.8b    // temp1 = src[2_0] + src[3_0]
+    uaddl v16.8h, v2.8b, v7.8b    // temp = src[0_0] + src[5_0]
+    uaddl v18.8h, v3.8b, v6.8b    // temp2 = src[1_0] + src[4_0]
+    mla v18.8h, v14.8h , v22.8h    // temp += temp1 * 20
+    ld1 {v4.s}[0], [x0], x2
+    mls v18.8h, v16.8h , v24.8h    // temp -= temp2 * 5
+    uaddl v8.8h, v2.8b, v1.8b
+    uaddl v10.8h, v3.8b, v0.8b
+    uaddl v12.8h, v4.8b, v7.8b
+    sqrshrun v26.8b, v18.8h, #5
+    mla v12.8h, v8.8h , v22.8h
+    ld1 {v5.s}[0], [x0], x2
+    mls v12.8h, v10.8h , v24.8h
+    st1 {v26.s}[0], [x1], x3
+    sqrshrun v27.8b, v12.8h, #5
+    st1 {v27.s}[0], [x1], x3
+
+
+end_func:
+    // LDMFD sp!,{x4-x12,PC}    //Restoring registers from stack
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
diff --git a/common/armv8/ih264_inter_pred_luma_copy_av8.s b/common/armv8/ih264_inter_pred_luma_copy_av8.s
new file mode 100755
index 0000000..1a76c1c
--- /dev/null
+++ b/common/armv8/ih264_inter_pred_luma_copy_av8.s
@@ -0,0 +1,267 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* Interprediction luma function for copy
+//*
+//* @par Description:
+//* Copies the array of width 'wd' and height 'ht' from the location pointed
+//* by 'src' to the location pointed by 'dst'
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+//void ih264_inter_pred_luma_copy (
+// UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ht,
+// WORD32 wd )
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x7 => ht
+// x12 => wd
+
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+
+
+ .global ih264_inter_pred_luma_copy_av8
+
+ih264_inter_pred_luma_copy_av8:
+
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    mov x12, x5
+    mov x7, x4
+    cmp x7, #0    //checks ht == 0
+    ble end_loops
+    tst x12, #15    //checks wd for multiples for 4 & 8
+    beq core_loop_wd_16
+    tst x12, #7    //checks wd for multiples for 4 & 8
+    beq core_loop_wd_8
+    sub x11, x12, #4
+
+outer_loop_wd_4:
+    subs x4, x12, #0    //checks wd == 0
+    ble end_inner_loop_wd_4
+
+inner_loop_wd_4:
+    ld1 {v0.s}[0], [x0]    //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    add x5, x0, x2    //pu1_src_tmp += src_strd
+    add x6, x1, x3    //pu1_dst_tmp += dst_strd
+    st1 {v0.s}[0], [x1]    //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    ld1 {v0.s}[0], [x5], x2    //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    add x0, x0, #4    //pu1_src += 4
+    st1 {v0.s}[0], [x6], x3    //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    ld1 {v0.s}[0], [x5], x2    //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    subs x4, x4, #4    //(wd -4)
+    st1 {v0.s}[0], [x6], x3    //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+    ld1 {v0.s}[0], [x5], x2    //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+    add x1, x1, #4    //pu1_dst += 4
+    st1 {v0.s}[0], [x6], x3    //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+
+    bgt inner_loop_wd_4
+
+end_inner_loop_wd_4:
+    subs x7, x7, #4    //ht - 4
+    sub x0, x5, x11    //pu1_src = pu1_src_tmp
+    sub x1, x6, x11    //pu1_dst = pu1_dst_tmp
+    bgt outer_loop_wd_4
+
+end_loops:
+    // LDMFD sp!,{x4-x12,x15}    //Reload the registers from SP
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+core_loop_wd_8:
+    sub x11, x12, #8
+
+outer_loop_wd_8:
+    subs x4, x12, #0    //checks wd
+    ble end_inner_loop_wd_8
+
+inner_loop_wd_8:
+    add x5, x0, x2    //pu1_src_tmp += src_strd
+    ld1 {v0.8b}, [x0], #8    //vld1_u8(pu1_src_tmp)
+    add x6, x1, x3    //pu1_dst_tmp += dst_strd
+    st1 {v0.8b}, [x1], #8    //vst1_u8(pu1_dst_tmp, tmp_src)
+    ld1 {v1.8b}, [x5], x2    //vld1_u8(pu1_src_tmp)
+    st1 {v1.8b}, [x6], x3    //vst1_u8(pu1_dst_tmp, tmp_src)
+    subs x4, x4, #8    //wd - 8(Loop condition)
+    ld1 {v2.8b}, [x5], x2    //vld1_u8(pu1_src_tmp)
+    st1 {v2.8b}, [x6], x3    //vst1_u8(pu1_dst_tmp, tmp_src)
+    ld1 {v3.8b}, [x5], x2    //vld1_u8(pu1_src_tmp)
+    st1 {v3.8b}, [x6], x3    //vst1_u8(pu1_dst_tmp, tmp_src)
+    bgt inner_loop_wd_8
+
+end_inner_loop_wd_8:
+    subs x7, x7, #4    //ht -= 4
+    sub x0, x5, x11    //pu1_src = pu1_src_tmp
+    sub x1, x6, x11    //pu1_dst = pu1_dst_tmp
+    bgt outer_loop_wd_8
+
+    // LDMFD sp!,{x4-x12,x15}    //Reload the registers from SP
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+core_loop_wd_16:
+    sub x11, x12, #16
+
+outer_loop_wd_16:
+    subs x4, x12, #0    //checks wd
+    ble end_inner_loop_wd_16
+
+inner_loop_wd_16:
+    add x5, x0, x2    //pu1_src_tmp += src_strd
+    ld1 { v0.16b}, [x0], #16    //vld1_u8(pu1_src_tmp)
+    add x6, x1, x3    //pu1_dst_tmp += dst_strd
+    st1 { v0.16b}, [x1], #16    //vst1_u8(pu1_dst_tmp, tmp_src)
+    ld1 { v2.16b}, [x5], x2    //vld1_u8(pu1_src_tmp)
+    st1 { v2.16b}, [x6], x3    //vst1_u8(pu1_dst_tmp, tmp_src)
+    subs x4, x4, #16    //wd - 16 (loop condition)
+    ld1 { v4.16b}, [x5], x2    //vld1_u8(pu1_src_tmp)
+    st1 { v4.16b}, [x6], x3    //vst1_u8(pu1_dst_tmp, tmp_src)
+    ld1 { v6.16b}, [x5], x2    //vld1_u8(pu1_src_tmp)
+    st1 { v6.16b}, [x6], x3    //vst1_u8(pu1_dst_tmp, tmp_src)
+    bgt inner_loop_wd_16
+
+end_inner_loop_wd_16:
+    subs x7, x7, #4    //ht -= 4
+    sub x0, x5, x11    //pu1_src = pu1_src_tmp
+    sub x1, x6, x11    //pu1_dst = pu1_dst_tmp
+    bgt outer_loop_wd_16
+
+
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+// /*
+// ********************************************************************************
+// *
+// * @brief This function copies a 4x4 block to destination
+// *
+// * @par Description:
+// * Copies a 4x4 block to destination, where both src and dst are interleaved
+// *
+// * @param[in] pi2_src
+// * Source
+// *
+// * @param[in] pu1_out
+// * Output pointer
+// *
+// * @param[in] pred_strd,
+// * Prediction buffer stride
+// *
+// * @param[in] out_strd
+// * output buffer stride
+// *
+// * @returns none
+// *
+// * @remarks none
+// * Currently wd and height is not used, ie a 4x4 block is always copied
+// *
+// *******************************************************************************
+// */
+// void ih264_interleave_copy(WORD16 *pi2_src,
+// UWORD8 *pu1_out,
+// WORD32 pred_strd,
+//                          WORD32 out_strd,
+//                          WORD32 wd,
+//                          WORD32 ht)
+// Register Usage
+// x0 : pi2_src
+// x1 : pu1_out
+// x2 : src_strd
+// x3 : out_strd
+// Neon registers d0-d7, d16-d30 are used
+// No need for pushing arm and neon registers
+
+ .global ih264_interleave_copy_av8
+ih264_interleave_copy_av8:
+    push_v_regs
+    ld1 {v2.8b}, [x0], x2    //load pred plane 1 => d2 & pred plane 2 => d3
+    ld1 {v3.8b}, [x0], x2
+    mov v2.d[1], v3.d[0]
+    ld1 {v4.8b}, [x0], x2
+    ld1 {v5.8b}, [x0], x2
+    mov v4.d[1], v5.d[0]
+
+    mov x0, x1
+
+    ld1 {v18.8b}, [x1], x3    //load out (8 bit size) - 8 coeffs
+    ld1 {v19.8b}, [x1], x3
+    mov v18.d[1], v19.d[0]
+    movi v30.8h, #0x00ff
+    ld1 {v20.8b}, [x1], x3
+    ld1 {v21.8b}, [x1], x3
+    mov v20.d[1], v21.d[0]
+
+    bit v18.16b, v2.16b , v30.16b
+    bit v20.16b, v4.16b , v30.16b
+
+    st1 {v18.8b}, [x0], x3    //store out
+    st1 {v18.d}[1], [x0], x3
+    st1 {v20.8b}, [x0], x3
+    st1 {v20.d}[1], [x0], x3
+
+    pop_v_regs
+    ret
+
+
diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
new file mode 100755
index 0000000..ea7645e
--- /dev/null
+++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
@@ -0,0 +1,820 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+//******************************************************************************
+//* @file
+//* ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
+//*
+//* @brief
+//* Contains function definitions for inter prediction interpolation.
+//*
+//* @author
+//* Mohit
+//*
+//* @par List of Functions:
+//*
+//* - ih264_inter_pred_luma_horz_hpel_vert_hpel_av8()
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+
+
+//void ih264_inter_pred_luma_horz_hpel_vert_hpel(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+//                                 WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ht,
+// WORD32 wd,
+// UWORD8* pu1_tmp,
+// UWORD32 dydx)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ht
+// x5 => wd
+
+
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+
+
+ .global ih264_inter_pred_luma_horz_hpel_vert_hpel_av8
+
+ih264_inter_pred_luma_horz_hpel_vert_hpel_av8:
+
+ //store register values to stack
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+
+ sub x0, x0, x2, lsl #1 //pu1_src-2*src_strd
+ sub x0, x0, #2 //pu1_src-2
+
+ movi v26.8h, #0x14 // Filter coeff 20 into Q13
+ movi v24.8h, #0x5 // Filter coeff 5 into Q12
+ movi v27.8h, #0x14 // Filter coeff 20 into Q13
+ movi v25.8h, #0x5 // Filter coeff 5 into Q12
+ mov x7, #0x20
+ mov x8, #0x30
+ subs x12, x5, #4 //if wd=4 branch to loop_4
+ beq loop_4_start
+
+ subs x12, x5, #8 //if wd=8 branch to loop_8
+ beq loop_8_start
+
+ //when wd=16
+ movi v28.8h, #0x14 // Filter coeff 20 into Q13
+ movi v30.8h, #0x5 // Filter coeff 5 into Q12
+ sub x2, x2, #16
+ ld1 {v0.2s, v1.2s}, [x0], #16 // Vector load from src[0_0]
+ ld1 {v12.2s}, [x0], x2 // Vector load from src[0_0]
+ ld1 {v2.2s, v3.2s}, [x0], #16 // Vector load from src[1_0]
+ ld1 {v13.2s}, [x0], x2 // Vector load from src[1_0]
+ ld1 {v4.2s, v5.2s}, [x0], #16 // Vector load from src[2_0]
+ ld1 {v14.2s}, [x0], x2 // Vector load from src[2_0]
+ ld1 {v6.2s, v7.2s}, [x0], #16 // Vector load from src[3_0]
+ ld1 {v15.2s}, [x0], x2 // Vector load from src[3_0]
+ ld1 {v8.2s, v9.2s}, [x0], #16 // Vector load from src[4_0]
+ ld1 {v16.2s}, [x0], x2 // Vector load from src[4_0]
+loop_16:
+
+ ld1 {v10.2s, v11.2s}, [x0], #16 // Vector load from src[5_0]
+ ld1 {v17.2s}, [x0], x2 // Vector load from src[5_0]
+
+
+ uaddl v20.8h, v4.8b, v6.8b
+ uaddl v18.8h, v0.8b, v10.8b
+ uaddl v22.8h, v2.8b, v8.8b
+ mla v18.8h, v20.8h , v28.8h
+ uaddl v24.8h, v5.8b, v7.8b
+ uaddl v20.8h, v1.8b, v11.8b
+ uaddl v26.8h, v3.8b, v9.8b
+ mla v20.8h, v24.8h , v28.8h
+ uaddl v24.8h, v14.8b, v15.8b
+ mls v18.8h, v22.8h , v30.8h
+ uaddl v22.8h, v12.8b, v17.8b
+ mls v20.8h, v26.8h , v30.8h
+ uaddl v26.8h, v13.8b, v16.8b
+ mla v22.8h, v24.8h , v28.8h
+ mls v22.8h, v26.8h , v30.8h
+
+ ext v24.16b, v18.16b , v20.16b , #4
+ ext v26.16b, v18.16b , v20.16b , #6
+
+ ext v23.16b, v18.16b , v20.16b , #10
+ add v0.8h, v24.8h , v26.8h
+ ext v24.16b, v18.16b , v20.16b , #2
+ ext v26.16b, v18.16b , v20.16b , #8
+ add v24.8h, v24.8h , v26.8h
+
+ saddl v26.4s, v18.4h, v23.4h
+ smlal v26.4s, v0.4h, v28.4h
+ smlsl v26.4s, v24.4h, v30.4h
+
+ saddl2 v23.4s, v18.8h, v23.8h
+ smlal2 v23.4s, v0.8h, v28.8h
+ smlsl2 v23.4s, v24.8h, v30.8h
+
+ sqrshrun v18.4h, v26.4s, #10
+ sqrshrun v19.4h, v23.4s, #10
+
+
+ uqxtn v18.8b, v18.8h
+ uqxtn v19.8b, v19.8h
+ mov v18.2s[1], v19.2s[0]
+
+ ext v24.16b, v20.16b , v22.16b , #4
+ ext v26.16b, v20.16b , v22.16b , #6
+ ext v0.16b, v20.16b , v22.16b , #10
+
+ add v25.8h, v24.8h , v26.8h
+ ext v24.16b, v20.16b , v22.16b , #2
+ ext v26.16b, v20.16b , v22.16b , #8
+ add v24.8h, v24.8h , v26.8h
+
+ saddl v26.4s, v0.4h, v20.4h
+ smlal v26.4s, v25.4h, v28.4h
+ smlsl v26.4s, v24.4h, v30.4h
+
+ saddl2 v22.4s, v0.8h, v20.8h
+ smlal2 v22.4s, v25.8h, v28.8h
+ smlsl2 v22.4s, v24.8h, v30.8h
+
+ sqrshrun v19.4h, v26.4s, #10
+ sqrshrun v25.4h, v22.4s, #10
+
+ uaddl v24.8h, v7.8b, v9.8b
+
+
+
+ uqxtn v19.8b, v19.8h
+ uqxtn v25.8b, v25.8h
+ mov v19.2s[1], v25.2s[0]
+
+ uaddl v22.8h, v4.8b, v10.8b
+ ld1 {v0.2s, v1.2s}, [x0], #16 // Vector load from src[6_0]
+
+
+ ld1 {v12.2s}, [x0], x2 // Vector load from src[6_0]
+ uaddl v20.8h, v6.8b, v8.8b
+ uaddl v26.8h, v5.8b, v11.8b
+ st1 {v18.2s, v19.2s}, [x1], x3 // store row 0
+
+
+//ROW_2
+
+
+ uaddl v18.8h, v2.8b, v0.8b
+
+ mla v18.8h, v20.8h , v28.8h
+
+ uaddl v20.8h, v3.8b, v1.8b
+
+ mla v20.8h, v24.8h , v28.8h
+ uaddl v24.8h, v15.8b, v16.8b
+ mls v18.8h, v22.8h , v30.8h
+ uaddl v22.8h, v13.8b, v12.8b
+ mls v20.8h, v26.8h , v30.8h
+ uaddl v26.8h, v14.8b, v17.8b
+ mla v22.8h, v24.8h , v28.8h
+ mls v22.8h, v26.8h , v30.8h
+
+ ext v24.16b, v18.16b , v20.16b , #4
+ ext v26.16b, v18.16b , v20.16b , #6
+
+ ext v23.16b, v18.16b , v20.16b , #10
+ add v2.8h, v24.8h , v26.8h
+ ext v24.16b, v18.16b , v20.16b , #2
+ ext v26.16b, v18.16b , v20.16b , #8
+ add v24.8h, v24.8h , v26.8h
+
+ saddl v26.4s, v18.4h, v23.4h
+ smlal v26.4s, v2.4h, v28.4h
+ smlsl v26.4s, v24.4h, v30.4h
+
+ saddl2 v23.4s, v18.8h, v23.8h
+ smlal2 v23.4s, v2.8h, v28.8h
+ smlsl2 v23.4s, v24.8h, v30.8h
+
+ sqrshrun v18.4h, v26.4s, #10
+ sqrshrun v19.4h, v23.4s, #10
+
+
+
+ uqxtn v18.8b, v18.8h
+ uqxtn v19.8b, v19.8h
+ mov v18.2s[1], v19.2s[0]
+
+ ext v24.16b, v20.16b , v22.16b , #4
+ ext v26.16b, v20.16b , v22.16b , #6
+ ext v2.16b, v20.16b , v22.16b , #10
+
+ add v25.8h, v24.8h , v26.8h
+ ext v24.16b, v20.16b , v22.16b , #2
+ ext v26.16b, v20.16b , v22.16b , #8
+ add v24.8h, v24.8h , v26.8h
+
+ saddl v26.4s, v2.4h, v20.4h
+ smlal v26.4s, v25.4h, v28.4h
+ smlsl v26.4s, v24.4h, v30.4h
+
+ saddl2 v22.4s, v2.8h, v20.8h
+ smlal2 v22.4s, v25.8h, v28.8h
+ smlsl2 v22.4s, v24.8h, v30.8h
+
+ sqrshrun v19.4h, v26.4s, #10
+ sqrshrun v25.4h, v22.4s, #10
+ uaddl v24.8h, v9.8b, v11.8b
+
+ uqxtn v19.8b, v19.8h
+ uqxtn v25.8b, v25.8h
+ mov v19.2s[1], v25.2s[0]
+
+
+ uaddl v22.8h, v6.8b, v0.8b
+ ld1 {v2.2s, v3.2s}, [x0], #16 // Vector load from src[7_0]
+
+
+ ld1 {v13.2s}, [x0], x2 // Vector load from src[7_0]
+ uaddl v20.8h, v8.8b, v10.8b
+ uaddl v26.8h, v7.8b, v1.8b
+ st1 {v18.2s, v19.2s}, [x1], x3 // store row 1
+
+//ROW_3
+
+
+ uaddl v18.8h, v4.8b, v2.8b
+
+ mla v18.8h, v20.8h , v28.8h
+
+ uaddl v20.8h, v5.8b, v3.8b
+
+ mla v20.8h, v24.8h , v28.8h
+ uaddl v24.8h, v16.8b, v17.8b
+ mls v18.8h, v22.8h , v30.8h
+ uaddl v22.8h, v14.8b, v13.8b
+ mls v20.8h, v26.8h , v30.8h
+ uaddl v26.8h, v15.8b, v12.8b
+ mla v22.8h, v24.8h , v28.8h
+ mls v22.8h, v26.8h , v30.8h
+
+ ext v24.16b, v18.16b , v20.16b , #4
+ ext v26.16b, v18.16b , v20.16b , #6
+
+ ext v23.16b, v18.16b , v20.16b , #10
+ add v4.8h, v24.8h , v26.8h
+ ext v24.16b, v18.16b , v20.16b , #2
+ ext v26.16b, v18.16b , v20.16b , #8
+ add v24.8h, v24.8h , v26.8h
+
+ saddl v26.4s, v18.4h, v23.4h
+ smlal v26.4s, v4.4h, v28.4h
+ smlsl v26.4s, v24.4h, v30.4h
+
+ saddl2 v23.4s, v18.8h, v23.8h
+ smlal2 v23.4s, v4.8h, v28.8h
+ smlsl2 v23.4s, v24.8h, v30.8h
+
+ sqrshrun v18.4h, v26.4s, #10
+ sqrshrun v19.4h, v23.4s, #10
+
+
+ uqxtn v18.8b, v18.8h
+ uqxtn v19.8b, v19.8h
+ mov v18.2s[1], v19.2s[0]
+
+
+ ext v24.16b, v20.16b , v22.16b , #4
+ ext v26.16b, v20.16b , v22.16b , #6
+ ext v4.16b, v20.16b , v22.16b , #10
+
+ add v25.8h, v24.8h , v26.8h
+ ext v24.16b, v20.16b , v22.16b , #2
+ ext v26.16b, v20.16b , v22.16b , #8
+ add v24.8h, v24.8h , v26.8h
+
+ saddl v26.4s, v4.4h, v20.4h
+ smlal v26.4s, v25.4h, v28.4h
+ smlsl v26.4s, v24.4h, v30.4h
+
+ saddl2 v22.4s, v4.8h, v20.8h
+ smlal2 v22.4s, v25.8h, v28.8h
+ smlsl2 v22.4s, v24.8h, v30.8h
+
+ sqrshrun v19.4h, v26.4s, #10
+ sqrshrun v25.4h, v22.4s, #10
+
+ uaddl v24.8h, v11.8b, v1.8b
+
+
+ uqxtn v19.8b, v19.8h
+ uqxtn v25.8b, v25.8h
+ mov v19.2s[1], v25.2s[0]
+
+
+
+ uaddl v22.8h, v8.8b, v2.8b
+ ld1 {v4.2s, v5.2s}, [x0], #16 // Vector load from src[8_0]
+
+
+ ld1 {v14.2s}, [x0], x2 // Vector load from src[8_0]
+ uaddl v20.8h, v10.8b, v0.8b
+ uaddl v26.8h, v9.8b, v3.8b
+ st1 {v18.2s, v19.2s}, [x1], x3 // store row 2
+
+
+//ROW_4
+
+ uaddl v18.8h, v6.8b, v4.8b
+
+ mla v18.8h, v20.8h , v28.8h
+
+ uaddl v20.8h, v7.8b, v5.8b
+
+ mla v20.8h, v24.8h , v28.8h
+ uaddl v24.8h, v17.8b, v12.8b
+ mls v18.8h, v22.8h , v30.8h
+ uaddl v22.8h, v15.8b, v14.8b
+ mls v20.8h, v26.8h , v30.8h
+ uaddl v26.8h, v16.8b, v13.8b
+ mla v22.8h, v24.8h , v28.8h
+ mls v22.8h, v26.8h , v30.8h
+
+ ext v24.16b, v18.16b , v20.16b , #4
+ ext v26.16b, v18.16b , v20.16b , #6
+
+ ext v23.16b, v18.16b , v20.16b , #10
+ add v6.8h, v24.8h , v26.8h
+ ext v24.16b, v18.16b , v20.16b , #2
+ ext v26.16b, v18.16b , v20.16b , #8
+ add v24.8h, v24.8h , v26.8h
+
+ saddl v26.4s, v18.4h, v23.4h
+ smlal v26.4s, v6.4h, v28.4h
+ smlsl v26.4s, v24.4h, v30.4h
+
+ saddl2 v23.4s, v18.8h, v23.8h
+ smlal2 v23.4s, v6.8h, v28.8h
+ smlsl2 v23.4s, v24.8h, v30.8h
+
+ sqrshrun v18.4h, v26.4s, #10
+ sqrshrun v19.4h, v23.4s, #10
+
+ uqxtn v18.8b, v18.8h
+ uqxtn v19.8b, v19.8h
+ mov v18.2s[1], v19.2s[0]
+
+
+ ext v24.16b, v20.16b , v22.16b , #4
+ ext v26.16b, v20.16b , v22.16b , #6
+ ext v6.16b, v20.16b , v22.16b , #10
+
+ add v25.8h, v24.8h , v26.8h
+ ext v24.16b, v20.16b , v22.16b , #2
+ ext v26.16b, v20.16b , v22.16b , #8
+ add v24.8h, v24.8h , v26.8h
+
+ saddl v26.4s, v6.4h, v20.4h
+ smlal v26.4s, v25.4h, v28.4h
+ smlsl v26.4s, v24.4h, v30.4h
+
+ saddl2 v22.4s, v6.8h, v20.8h
+ smlal2 v22.4s, v25.8h, v28.8h
+ smlsl2 v22.4s, v24.8h, v30.8h
+
+ mov v6.16b, v2.16b
+ mov v7.16b, v3.16b
+
+ mov v2.16b, v10.16b
+ mov v3.16b, v11.16b
+
+ subs x4, x4, #4
+ sqrshrun v19.4h, v26.4s, #10
+ sqrshrun v25.4h, v22.4s, #10
+ mov v10.16b, v0.16b
+ mov v11.16b, v1.16b
+
+ mov v24.8b, v14.8b
+
+ mov v14.16b, v12.16b
+ mov v15.16b, v13.16b
+
+
+ uqxtn v19.8b, v19.8h
+ uqxtn v25.8b, v25.8h
+ mov v19.2s[1], v25.2s[0]
+
+
+
+ mov v0.16b, v8.16b
+ mov v1.16b, v9.16b
+
+ mov v8.16b, v4.16b
+ mov v9.16b, v5.16b
+
+ mov v12.16b, v16.16b
+ mov v13.16b, v17.16b
+
+ mov v4.16b, v10.16b
+ mov v5.16b, v11.16b
+
+ mov v16.8b, v24.8b
+ st1 {v18.2s, v19.2s}, [x1], x3 // store row 3
+
+ bgt loop_16 // looping if height =16
+ b end_func
+
+loop_8_start:
+ ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[0_0]
+ ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[1_0]
+ ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[2_0]
+ ld1 {v6.2s, v7.2s}, [x0], x2 // Vector load from src[3_0]
+ ld1 {v8.2s, v9.2s}, [x0], x2 // Vector load from src[4_0]
+
+loop_8:
+
+ ld1 {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0]
+ uaddl v14.8h, v4.8b, v6.8b
+ uaddl v12.8h, v0.8b, v10.8b
+ uaddl v16.8h, v2.8b, v8.8b
+ mla v12.8h, v14.8h , v26.8h
+ uaddl v18.8h, v5.8b, v7.8b
+ uaddl v14.8h, v1.8b, v11.8b
+ uaddl v22.8h, v3.8b, v9.8b
+ mla v14.8h, v18.8h , v26.8h
+ mls v12.8h, v16.8h , v24.8h
+ ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[6_0]
+ uaddl v16.8h, v6.8b, v8.8b
+ mls v14.8h, v22.8h , v24.8h
+ uaddl v28.8h, v2.8b, v0.8b
+
+ ext v22.16b, v12.16b , v14.16b , #10
+ uaddl v18.8h, v4.8b, v10.8b
+ mla v28.8h, v16.8h , v26.8h
+ saddl v30.4s, v12.4h, v22.4h
+
+ saddl2 v22.4s, v12.8h, v22.8h
+ ext v16.16b, v12.16b , v14.16b , #4
+ mls v28.8h, v18.8h , v24.8h
+ ext v18.16b, v12.16b , v14.16b , #6
+ ext v20.16b, v12.16b , v14.16b , #8
+ ext v14.16b, v12.16b , v14.16b , #2
+ add v16.8h, v16.8h , v18.8h
+ add v18.8h, v14.8h , v20.8h
+ uaddl v20.8h, v7.8b, v9.8b
+ smlal v30.4s, v16.4h, v26.4h
+ smlsl v30.4s, v18.4h, v24.4h
+ smlal2 v22.4s, v16.8h, v26.8h
+ smlsl2 v22.4s, v18.8h, v24.8h
+ uaddl v14.8h, v3.8b, v1.8b
+
+ mla v14.8h, v20.8h , v26.8h
+ sqrshrun v12.4h, v30.4s, #10
+ uaddl v16.8h, v5.8b, v11.8b
+ sqrshrun v13.4h, v22.4s, #10
+ mls v14.8h, v16.8h , v24.8h
+ ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[7_0]
+ uqxtn v25.8b, v12.8h
+ uqxtn v13.8b, v13.8h
+ mov v25.2s[1], v13.2s[0]
+ uaddl v16.8h, v8.8b, v10.8b
+
+
+ ext v22.16b, v28.16b , v14.16b , #10
+ uaddl v20.8h, v4.8b, v2.8b
+ saddl v30.4s, v28.4h, v22.4h
+ mla v20.8h, v16.8h , v26.8h
+
+ saddl2 v22.4s, v28.8h, v22.8h
+ ext v16.16b, v28.16b , v14.16b , #4
+ ext v18.16b, v28.16b , v14.16b , #6
+ ext v12.16b, v28.16b , v14.16b , #8
+ ext v14.16b, v28.16b , v14.16b , #2
+ add v16.8h, v16.8h , v18.8h
+ add v18.8h, v12.8h , v14.8h
+
+ smlal v30.4s, v16.4h, v26.4h
+ smlsl v30.4s, v18.4h, v24.4h
+ smlal2 v22.4s, v16.8h, v26.8h
+ smlsl2 v22.4s, v18.8h, v24.8h
+
+
+ uaddl v18.8h, v6.8b, v0.8b
+ sqrshrun v16.4h, v30.4s, #10
+
+ sqrshrun v17.4h, v22.4s, #10
+
+ mov v12.8b, v25.8b
+ mov v25.8b, v24.8b
+
+ uaddl v28.8h, v9.8b, v11.8b
+ uqxtn v13.8b, v16.8h
+ uqxtn v17.8b, v17.8h
+ mov v13.2s[1], v17.2s[0]
+
+
+ uaddl v14.8h, v5.8b, v3.8b
+ uaddl v22.8h, v7.8b, v1.8b
+ mls v20.8h, v18.8h , v24.8h
+ st1 {v12.2s}, [x1], x3 // store row 0
+ mla v14.8h, v28.8h , v26.8h
+ ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[8_0]
+ uaddl v30.8h, v10.8b, v0.8b
+ uaddl v28.8h, v6.8b, v4.8b
+ mls v14.8h, v22.8h , v24.8h
+ st1 {v13.2s}, [x1], x3 // store row 1
+ mla v28.8h, v30.8h , v26.8h
+
+ ext v22.16b, v20.16b , v14.16b , #10
+ saddl v30.4s, v20.4h, v22.4h
+
+ saddl2 v22.4s, v20.8h, v22.8h
+ ext v16.16b, v20.16b , v14.16b , #4
+ ext v18.16b, v20.16b , v14.16b , #6
+ ext v12.16b, v20.16b , v14.16b , #8
+ ext v14.16b, v20.16b , v14.16b , #2
+ add v16.8h, v16.8h , v18.8h
+ add v18.8h, v14.8h , v12.8h
+ uaddl v20.8h, v8.8b, v2.8b
+ smlal v30.4s, v16.4h, v26.4h
+ smlsl v30.4s, v18.4h, v24.4h
+ smlal2 v22.4s, v16.8h, v26.8h
+ smlsl2 v22.4s, v18.8h, v24.8h
+ uaddl v18.8h, v11.8b, v1.8b
+ uaddl v16.8h, v7.8b, v5.8b
+ sqrshrun v12.4h, v30.4s, #10
+ uaddl v30.8h, v9.8b, v3.8b
+ mla v16.8h, v18.8h , v26.8h
+ sqrshrun v13.4h, v22.4s, #10
+ mls v28.8h, v20.8h , v24.8h
+
+ mls v16.8h, v30.8h , v24.8h
+ uqxtn v27.8b, v12.8h
+ uqxtn v13.8b, v13.8h
+ mov v27.2s[1], v13.2s[0]
+
+
+ ext v22.16b, v28.16b , v16.16b , #10
+
+ saddl v30.4s, v28.4h, v22.4h
+
+ saddl2 v22.4s, v28.8h, v22.8h
+ ext v12.16b, v28.16b , v16.16b , #4
+ ext v18.16b, v28.16b , v16.16b , #6
+ ext v20.16b, v28.16b , v16.16b , #8
+ ext v28.16b, v28.16b , v16.16b , #2
+ add v12.8h, v12.8h , v18.8h
+ add v18.8h, v28.8h , v20.8h
+
+ smlal v30.4s, v12.4h, v26.4h
+ smlsl v30.4s, v18.4h, v24.4h
+ smlal2 v22.4s, v12.8h, v26.8h
+ smlsl2 v22.4s, v18.8h, v24.8h
+
+
+ mov v12.8b, v27.8b
+ mov v27.8b, v26.8b
+
+ sqrshrun v16.4h, v30.4s, #10
+
+ mov v6.16b, v2.16b
+ mov v7.16b, v3.16b
+
+ sqrshrun v17.4h, v22.4s, #10
+
+ mov v2.16b, v10.16b
+ mov v3.16b, v11.16b
+
+ mov v10.16b, v0.16b
+ mov v11.16b, v1.16b
+
+ subs x4, x4, #4
+ uqxtn v13.8b, v16.8h
+ uqxtn v17.8b, v17.8h
+ mov v13.2s[1], v17.2s[0]
+
+
+ mov v0.16b, v8.16b
+ mov v1.16b, v9.16b
+
+ mov v8.16b, v4.16b
+ mov v9.16b, v5.16b
+
+ mov v4.16b, v10.16b
+ mov v5.16b, v11.16b
+
+ st1 {v12.2s}, [x1], x3 // store row 2
+ st1 {v13.2s}, [x1], x3 // store row 3
+
+ bgt loop_8 //if height =8 loop
+ b end_func
+
+loop_4_start:
+ ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[0_0]
+ ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[1_0]
+ ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[2_0]
+ ld1 {v6.2s, v7.2s}, [x0], x2 // Vector load from src[3_0]
+ ld1 {v8.2s, v9.2s}, [x0], x2 // Vector load from src[4_0]
+
+loop_4:
+ ld1 {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0]
+ uaddl v14.8h, v4.8b, v6.8b // temp1 = src[2_0] + src[3_0]
+ uaddl v12.8h, v0.8b, v10.8b // temp = src[0_0] + src[5_0]
+ uaddl v16.8h, v2.8b, v8.8b // temp2 = src[1_0] + src[4_0]
+ mla v12.8h, v14.8h , v26.8h // temp += temp1 * 20
+ uaddl v18.8h, v5.8b, v7.8b // temp1 = src[2_0] + src[3_0]
+ uaddl v14.8h, v1.8b, v11.8b // temp = src[0_0] + src[5_0]
+ uaddl v22.8h, v3.8b, v9.8b // temp2 = src[1_0] + src[4_0]
+ mla v14.8h, v18.8h , v26.8h // temp += temp1 * 20
+ mls v12.8h, v16.8h , v24.8h // temp -= temp2 * 5
+ ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[6_0]
+ uaddl v16.8h, v6.8b, v8.8b
+ mls v14.8h, v22.8h , v24.8h // temp -= temp2 * 5
+ //Q6 and Q7 have filtered values
+ uaddl v28.8h, v2.8b, v0.8b
+
+ ext v22.16b, v12.16b , v14.16b , #10
+ uaddl v18.8h, v4.8b, v10.8b
+ mla v28.8h, v16.8h , v26.8h
+ saddl v30.4s, v12.4h, v22.4h
+
+ saddl v22.4s, v13.4h, v23.4h
+ ext v16.16b, v12.16b , v14.16b , #4
+ mls v28.8h, v18.8h , v24.8h
+ ext v18.16b, v12.16b , v14.16b , #6
+ ext v20.16b, v12.16b , v14.16b , #8
+ ext v14.16b, v12.16b , v14.16b , #2
+ add v16.8h, v16.8h , v18.8h
+ add v18.8h, v14.8h , v20.8h
+ uaddl v20.8h, v7.8b, v9.8b
+ smlal v30.4s, v16.4h, v26.4h
+ smlsl v30.4s, v18.4h, v24.4h
+ smlal v22.4s, v17.4h, v26.4h
+ smlsl v22.4s, v19.4h, v24.4h
+ uaddl v14.8h, v3.8b, v1.8b
+
+ mla v14.8h, v20.8h , v26.8h
+ sqrshrun v12.4h, v30.4s, #10
+ uaddl v16.8h, v5.8b, v11.8b
+ sqrshrun v13.4h, v22.4s, #10
+ mls v14.8h, v16.8h , v24.8h
+ ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[7_0]
+ uqxtn v25.8b, v12.8h
+ uaddl v16.8h, v8.8b, v10.8b
+
+ ext v22.16b, v28.16b , v14.16b , #10
+ uaddl v20.8h, v4.8b, v2.8b
+ saddl v30.4s, v28.4h, v22.4h
+ mla v20.8h, v16.8h , v26.8h
+
+ saddl v22.4s, v29.4h, v23.4h
+ ext v16.16b, v28.16b , v14.16b , #4
+ ext v18.16b, v28.16b , v14.16b , #6
+ ext v12.16b, v28.16b , v14.16b , #8
+ ext v14.16b, v28.16b , v14.16b , #2
+ add v16.8h, v16.8h , v18.8h
+ add v18.8h, v12.8h , v14.8h
+
+ smlal v30.4s, v16.4h, v26.4h
+ smlsl v30.4s, v18.4h, v24.4h
+ smlal v22.4s, v17.4h, v26.4h
+ smlsl v22.4s, v19.4h, v24.4h
+
+
+ uaddl v18.8h, v6.8b, v0.8b
+ sqrshrun v16.4h, v30.4s, #10
+
+ sqrshrun v17.4h, v22.4s, #10
+
+ mov v12.8b, v25.8b
+ mov v25.8b, v24.8b
+
+ uaddl v28.8h, v9.8b, v11.8b
+ uqxtn v13.8b, v16.8h
+
+
+
+ uaddl v14.8h, v5.8b, v3.8b
+ uaddl v22.8h, v7.8b, v1.8b
+ mls v20.8h, v18.8h , v24.8h
+ st1 {v12.s}[0], [x1], x3 // store row 0
+ mla v14.8h, v28.8h , v26.8h
+ ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[8_0]
+ uaddl v30.8h, v10.8b, v0.8b
+ uaddl v28.8h, v6.8b, v4.8b
+ mls v14.8h, v22.8h , v24.8h
+ st1 {v13.s}[0], [x1], x3 //store row 1
+ mla v28.8h, v30.8h , v26.8h
+
+ ext v22.16b, v20.16b , v14.16b , #10
+ saddl v30.4s, v20.4h, v22.4h
+
+ saddl v22.4s, v21.4h, v23.4h
+ ext v16.16b, v20.16b , v14.16b , #4
+ ext v18.16b, v20.16b , v14.16b , #6
+ ext v12.16b, v20.16b , v14.16b , #8
+ ext v14.16b, v20.16b , v14.16b , #2
+ add v16.8h, v16.8h , v18.8h
+ add v18.8h, v14.8h , v12.8h
+ uaddl v20.8h, v8.8b, v2.8b
+ smlal v30.4s, v16.4h, v26.4h
+ smlsl v30.4s, v18.4h, v24.4h
+ smlal v22.4s, v17.4h, v26.4h
+ smlsl v22.4s, v19.4h, v24.4h
+ uaddl v18.8h, v11.8b, v1.8b
+ uaddl v16.8h, v7.8b, v5.8b
+ sqrshrun v12.4h, v30.4s, #10
+ uaddl v30.8h, v9.8b, v3.8b
+ mla v16.8h, v18.8h , v26.8h
+ sqrshrun v13.4h, v22.4s, #10
+ mls v28.8h, v20.8h , v24.8h
+
+ mls v16.8h, v30.8h , v24.8h
+ uqxtn v27.8b, v12.8h
+
+ ext v22.16b, v28.16b , v16.16b , #10
+
+ saddl v30.4s, v28.4h, v22.4h
+
+ saddl v22.4s, v29.4h, v23.4h
+ ext v12.16b, v28.16b , v16.16b , #4
+ ext v18.16b, v28.16b , v16.16b , #6
+ ext v20.16b, v28.16b , v16.16b , #8
+ ext v28.16b, v28.16b , v16.16b , #2
+ add v12.8h, v12.8h , v18.8h
+ add v18.8h, v28.8h , v20.8h
+
+ smlal v30.4s, v12.4h, v26.4h
+ smlsl v30.4s, v18.4h, v24.4h
+ smlal v22.4s, v13.4h, v26.4h
+ smlsl v22.4s, v19.4h, v24.4h
+
+
+ mov v12.8b, v27.8b
+ mov v27.8b, v26.8b
+
+ sqrshrun v16.4h, v30.4s, #10
+
+ mov v6.16b, v2.16b
+ mov v7.16b, v3.16b
+
+ sqrshrun v17.4h, v22.4s, #10
+
+ mov v2.16b, v10.16b
+ mov v3.16b, v11.16b
+
+ mov v10.16b, v0.16b
+ mov v11.16b, v1.16b
+
+ subs x4, x4, #4
+ uqxtn v13.8b, v16.8h
+
+ mov v0.16b, v8.16b
+ mov v1.16b, v9.16b
+
+ mov v8.16b, v4.16b
+ mov v9.16b, v5.16b
+
+
+ mov v4.16b, v10.16b
+ mov v5.16b, v11.16b
+
+
+ st1 {v12.s}[0], [x1], x3 // store row 2
+ st1 {v13.s}[0], [x1], x3 // store row 3
+
+ bgt loop_4
+
+end_func:
+ //Restoring registers from stack
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
new file mode 100755
index 0000000..3737e3f
--- /dev/null
+++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
@@ -0,0 +1,1120 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+//******************************************************************************
+//* @file
+//* ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
+//*
+//* @brief
+//* Contains function definitions for inter prediction interpolation.
+//*
+//* @author
+//* Mohit
+//*
+//* @par List of Functions:
+//*
+//* - ih264_inter_pred_luma_horz_hpel_vert_qpel_av8()
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+///* All the functions here are replicated from ih264_inter_pred_filters.c
+//
+
+///**
+///**
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* This function implements a two stage cascaded six tap filter. It
+//* applies the six tap filter in the horizontal direction on the
+//* predictor values, followed by applying the same filter in the
+//* vertical direction on the output of the first stage. It then averages
+//* the output of the 1st stage and the output of the 2nd stage to obtain
+//* the quarter pel values. The six tap filtering operation is described
+//* in sec 8.4.2.2.1 titled "Luma sample interpolation process".
+//*
+//* @par Description:
+//* This function is called to obtain pixels lying at the following
+//* location (1/2,1/4) or (1/2,3/4). The function interpolates
+//* the predictors first in the horizontal direction and then in the
+//* vertical direction to output the (1/2,1/2). It then averages
+//* the output of the 2nd stage and (1/2,1/2) value to obtain (1/2,1/4)
+//* or (1/2,3/4) depending on the offset.
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @param[in] pu1_tmp: temporary buffer
+//*
+//* @param[in] dydx: x and y reference offset for qpel calculations
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/;
+
+//void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ht,
+// WORD32 wd,
+// UWORD8* pu1_tmp,
+// UWORD32 dydx)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ht
+// x5 => wd
+// x7 => dydx
+// x9 => *pu1_tmp
+
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+
+
+ .global ih264_inter_pred_luma_horz_hpel_vert_qpel_av8
+
+ih264_inter_pred_luma_horz_hpel_vert_qpel_av8:
+
+
+ // store register values to stack
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+
+
+
+ sub x0, x0, x2, lsl #1 // pu1_src-2*src_strd
+ sub x0, x0, #2 // pu1_src-2
+
+ mov x9, x6
+
+ lsr x7, x7, #3 // dydx >> 2 followed by dydx & 0x3 and dydx>>1 to obtain the deciding bit
+
+ add x7, x7, #2
+ mov x6, #48
+ madd x7, x7, x6, x9
+
+ subs x12, x5, #4 //if wd=4 branch to loop_4
+ beq loop_4_start
+
+ subs x12, x5, #8 //if wd=8 branch to loop_8
+ beq loop_8_start
+
+ //when wd=16
+ movi v22.8h, #20 // Filter coeff 0x14 into Q11
+ movi v24.8h, #5 // Filter coeff 0x5 into Q12
+ add x8, x0, #8
+ add x14, x1, #8
+ add x10, x9, #8
+ mov x12, x4
+ add x11, x7, #8
+loop_16_lowhalf_start:
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row -2 load for horizontal filter
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v6.8h, v0.8b, v5.8b
+
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v8.8h, v2.8b, v3.8b
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v6.8h, v8.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v8.8h, v1.8b, v4.8b
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row -1 load for horizontal filter
+ mls v6.8h, v8.8h , v24.8h
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v8.8h, v0.8b, v5.8b
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v10.8h, v2.8b, v3.8b
+
+ st1 {v6.4s}, [x9], x6 // store temp buffer 0
+
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v8.8h, v10.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v10.8h, v1.8b, v4.8b
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row 0 load for horizontal filter
+ mls v8.8h, v10.8h , v24.8h
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v10.8h, v0.8b, v5.8b
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v12.8h, v2.8b, v3.8b
+
+ st1 {v8.4s}, [x9], x6 // store temp buffer 1
+
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v10.8h, v12.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v12.8h, v1.8b, v4.8b
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row 1 load for horizontal filter
+ mls v10.8h, v12.8h , v24.8h
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v12.8h, v0.8b, v5.8b
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v14.8h, v2.8b, v3.8b
+
+ st1 {v10.4s}, [x9], x6 // store temp buffer 2
+
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v12.8h, v14.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v14.8h, v1.8b, v4.8b
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row 2 load for horizontal filter
+ mls v12.8h, v14.8h , v24.8h
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v14.8h, v0.8b, v5.8b
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v16.8h, v2.8b, v3.8b
+
+ st1 {v12.4s}, [x9], x6 // store temp buffer 3
+
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v14.8h, v16.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v16.8h, v1.8b, v4.8b
+
+ mls v14.8h, v16.8h , v24.8h
+loop_16_lowhalf:
+
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row 3 load for horizontal filter
+ ext v5.8b, v0.8b , v1.8b , #5
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v16.8h, v0.8b, v5.8b
+
+ st1 {v14.4s}, [x9], x6 // store temp buffer 4
+
+ uaddl v18.8h, v2.8b, v3.8b
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v16.8h, v18.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ add v28.8h, v8.8h , v14.8h
+ uaddl v18.8h, v1.8b, v4.8b
+ add v30.8h, v10.8h , v12.8h
+ mls v16.8h, v18.8h , v24.8h
+    ld1       {v0.2s, v1.2s}, [x0], x2 // row 4 load for horizontal filter
+ ext v5.8b, v0.8b , v1.8b , #5
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v20.8h, v0.8b, v5.8b
+
+ st1 {v16.4s}, [x9], x6 // store temp buffer x5
+
+ saddl v18.4s, v6.4h, v16.4h
+
+ ld1 {v26.4s}, [x7], x6 // load from temp buffer 0
+
+ saddl2 v6.4s, v6.8h, v16.8h
+
+ sqrshrun v26.8b, v26.8h, #5
+
+ smlal v18.4s, v30.4h, v22.4h
+ smlsl v18.4s, v28.4h, v24.4h
+ smlal2 v6.4s, v30.8h, v22.8h
+ smlsl2 v6.4s, v28.8h, v24.8h
+ uaddl v2.8h, v2.8b, v3.8b
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v20.8h, v2.8h , v22.8h
+ sqrshrun v18.4h, v18.4s, #10
+ ext v1.8b, v0.8b , v1.8b , #1
+ sqrshrun v19.4h, v6.4s, #10
+ add v28.8h, v10.8h , v16.8h
+ uaddl v2.8h, v1.8b, v4.8b
+ add v30.8h, v12.8h , v14.8h
+ mls v20.8h, v2.8h , v24.8h
+
+ uqxtn v18.8b, v18.8h
+ uqxtn v19.8b, v19.8h
+ mov v18.2s[1], v19.2s[0]
+
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row 5 load for horizontal filter
+
+ urhadd v26.8b, v18.8b , v26.8b
+
+ ext v5.8b, v0.8b , v1.8b , #5
+ ext v2.8b, v0.8b , v1.8b , #2
+
+ st1 {v20.4s}, [x9], x6 // store temp buffer x6
+
+ saddl v18.4s, v8.4h, v20.4h
+
+ saddl2 v6.4s, v8.8h, v20.8h
+
+ ld1 {v8.4s}, [x7], x6 //load from temp buffer 1
+
+
+ st1 {v26.2s}, [x1], x3 // store row 0
+
+ smlal v18.4s, v30.4h, v22.4h
+ smlsl v18.4s, v28.4h, v24.4h
+ smlal2 v6.4s, v30.8h, v22.8h
+ smlsl2 v6.4s, v28.8h, v24.8h
+
+ sqrshrun v28.8b, v8.8h, #5
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v8.8h, v0.8b, v5.8b
+ uaddl v2.8h, v2.8b, v3.8b
+ sqrshrun v18.4h, v18.4s, #10
+ ext v4.8b, v0.8b , v1.8b , #4
+ sqrshrun v19.4h, v6.4s, #10
+ mla v8.8h, v2.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ add v26.8h, v12.8h , v20.8h
+ uaddl v2.8h, v1.8b, v4.8b
+ uqxtn v18.8b, v18.8h
+ uqxtn v19.8b, v19.8h
+ mov v18.2s[1], v19.2s[0]
+ add v30.8h, v14.8h , v16.8h
+ mls v8.8h, v2.8h , v24.8h
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row 6 load for horizontal filter
+
+ urhadd v28.8b, v28.8b , v18.8b
+
+ ext v5.8b, v0.8b , v1.8b , #5
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+
+ st1 {v28.2s}, [x1], x3 // store row 1
+
+ uaddl v28.8h, v0.8b, v5.8b
+
+ st1 {v8.4s}, [x9], x6 // store temp buffer x7
+
+ saddl v18.4s, v10.4h, v8.4h
+ saddl2 v6.4s, v10.8h, v8.8h
+
+ ld1 {v10.4s}, [x7], x6 // load from temp buffer 2
+
+ smlal v18.4s, v30.4h, v22.4h
+ smlsl v18.4s, v26.4h, v24.4h
+
+ smlal2 v6.4s, v30.8h, v22.8h
+ smlsl2 v6.4s, v26.8h, v24.8h
+
+ sqrshrun v26.8b, v10.8h, #5
+
+ uaddl v2.8h, v2.8b, v3.8b
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v28.8h, v2.8h , v22.8h
+ sqrshrun v18.4h, v18.4s, #10
+ ext v1.8b, v0.8b , v1.8b , #1
+ sqrshrun v19.4h, v6.4s, #10
+ add v10.8h, v14.8h , v8.8h
+ uaddl v2.8h, v1.8b, v4.8b
+ add v30.8h, v16.8h , v20.8h
+ mls v28.8h, v2.8h , v24.8h
+ uqxtn v27.8b, v18.8h
+ uqxtn v19.8b, v19.8h
+ mov v27.2s[1], v19.2s[0]
+ saddl v18.4s, v12.4h, v28.4h
+ saddl2 v6.4s, v12.8h, v28.8h
+
+ urhadd v26.8b, v26.8b , v27.8b
+
+ smlal v18.4s, v30.4h, v22.4h
+ smlsl v18.4s, v10.4h, v24.4h
+ smlal2 v6.4s, v30.8h, v22.8h
+ smlsl2 v6.4s, v10.8h, v24.8h
+
+ st1 {v26.2s}, [x1], x3 // store row 2
+
+ st1 {v28.2s, v29.2s}, [x9]
+
+
+ sqrshrun v18.4h, v18.4s, #10
+
+ mov v10.16b, v20.16b
+ mov v11.16b, v21.16b
+ ld1 {v30.4s}, [x7], x6 // load from temp buffer 3
+
+ sqrshrun v19.4h, v6.4s, #10
+ subs x4, x4, #4
+
+ sqrshrun v30.8b, v30.8h, #5
+
+ uqxtn v18.8b, v18.8h
+ uqxtn v19.8b, v19.8h
+ mov v18.2s[1], v19.2s[0]
+
+ mov v12.16b, v8.16b
+ mov v13.16b, v9.16b
+ mov v6.16b, v14.16b
+ mov v7.16b, v15.16b
+
+ urhadd v30.8b, v18.8b , v30.8b
+
+ mov v8.16b, v16.16b
+ mov v9.16b, v17.16b
+ mov v14.16b, v28.16b
+ mov v15.16b, v29.16b
+
+ st1 {v30.2s}, [x1], x3 // store row 3
+
+ bgt loop_16_lowhalf // looping if height =16
+
+
+loop_16_highhalf_start:
+ ld1 {v0.2s, v1.2s}, [x8], x2
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v6.8h, v0.8b, v5.8b
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v8.8h, v2.8b, v3.8b
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v6.8h, v8.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v8.8h, v1.8b, v4.8b
+ ld1 {v0.2s, v1.2s}, [x8], x2
+ mls v6.8h, v8.8h , v24.8h
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v8.8h, v0.8b, v5.8b
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v10.8h, v2.8b, v3.8b
+
+ st1 {v6.4s}, [x10], x6
+
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v8.8h, v10.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v10.8h, v1.8b, v4.8b
+ ld1 {v0.2s, v1.2s}, [x8], x2
+ mls v8.8h, v10.8h , v24.8h
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v10.8h, v0.8b, v5.8b
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v12.8h, v2.8b, v3.8b
+
+ st1 {v8.4s}, [x10], x6
+
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v10.8h, v12.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v12.8h, v1.8b, v4.8b
+ ld1 {v0.2s, v1.2s}, [x8], x2
+ mls v10.8h, v12.8h , v24.8h
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v12.8h, v0.8b, v5.8b
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v14.8h, v2.8b, v3.8b
+
+ st1 {v10.4s}, [x10], x6
+
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v12.8h, v14.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v14.8h, v1.8b, v4.8b
+ ld1 {v0.2s, v1.2s}, [x8], x2
+ mls v12.8h, v14.8h , v24.8h
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v14.8h, v0.8b, v5.8b
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v16.8h, v2.8b, v3.8b
+
+ st1 {v12.4s}, [x10], x6
+
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v14.8h, v16.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v16.8h, v1.8b, v4.8b
+
+ mls v14.8h, v16.8h , v24.8h
+
+loop_16_highhalf:
+
+ ld1 {v0.2s, v1.2s}, [x8], x2
+ ext v5.8b, v0.8b , v1.8b , #5
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v16.8h, v0.8b, v5.8b
+
+ st1 {v14.4s}, [x10], x6
+
+ uaddl v18.8h, v2.8b, v3.8b
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v16.8h, v18.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ add v28.8h, v8.8h , v14.8h
+ uaddl v18.8h, v1.8b, v4.8b
+ add v30.8h, v10.8h , v12.8h
+ mls v16.8h, v18.8h , v24.8h
+ ld1 {v0.2s, v1.2s}, [x8], x2
+ ext v5.8b, v0.8b , v1.8b , #5
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v20.8h, v0.8b, v5.8b
+
+ st1 {v16.4s}, [x10], x6
+
+ saddl v18.4s, v6.4h, v16.4h
+
+ ld1 {v26.4s}, [x11], x6
+
+ saddl2 v6.4s, v6.8h, v16.8h
+
+ sqrshrun v26.8b, v26.8h, #5
+
+ smlal v18.4s, v30.4h, v22.4h
+ smlsl v18.4s, v28.4h, v24.4h
+ smlal2 v6.4s, v30.8h, v22.8h
+ smlsl2 v6.4s, v28.8h, v24.8h
+ uaddl v2.8h, v2.8b, v3.8b
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v20.8h, v2.8h , v22.8h
+ sqrshrun v18.4h, v18.4s, #10
+ ext v1.8b, v0.8b , v1.8b , #1
+ sqrshrun v19.4h, v6.4s, #10
+ add v28.8h, v10.8h , v16.8h
+ uaddl v2.8h, v1.8b, v4.8b
+ add v30.8h, v12.8h , v14.8h
+ mls v20.8h, v2.8h , v24.8h
+ uqxtn v18.8b, v18.8h
+ uqxtn v19.8b, v19.8h
+ mov v18.2s[1], v19.2s[0]
+ ld1 {v0.2s, v1.2s}, [x8], x2
+
+ urhadd v26.8b, v18.8b , v26.8b
+
+ ext v5.8b, v0.8b , v1.8b , #5
+ ext v2.8b, v0.8b , v1.8b , #2
+
+ st1 {v20.4s}, [x10], x6
+
+ saddl v18.4s, v8.4h, v20.4h
+ saddl2 v6.4s, v8.8h, v20.8h
+
+ ld1 {v8.4s}, [x11], x6
+
+
+ st1 {v26.2s}, [x14], x3 //store row 0
+
+ smlal v18.4s, v30.4h, v22.4h
+ smlsl v18.4s, v28.4h, v24.4h
+ smlal2 v6.4s, v30.8h, v22.8h
+ smlsl2 v6.4s, v28.8h, v24.8h
+ sqrshrun v28.8b, v8.8h, #5
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v8.8h, v0.8b, v5.8b
+ uaddl v2.8h, v2.8b, v3.8b
+ sqrshrun v18.4h, v18.4s, #10
+ ext v4.8b, v0.8b , v1.8b , #4
+ sqrshrun v19.4h, v6.4s, #10
+ mla v8.8h, v2.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ add v26.8h, v12.8h , v20.8h
+ uaddl v2.8h, v1.8b, v4.8b
+ uqxtn v18.8b, v18.8h
+ uqxtn v19.8b, v19.8h
+ mov v18.2s[1], v19.2s[0]
+ add v30.8h, v14.8h , v16.8h
+ mls v8.8h, v2.8h , v24.8h
+ ld1 {v0.2s, v1.2s}, [x8], x2
+
+ urhadd v28.8b, v28.8b , v18.8b
+
+ ext v5.8b, v0.8b , v1.8b , #5
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+
+ st1 {v28.2s}, [x14], x3 //store row 1
+
+ uaddl v28.8h, v0.8b, v5.8b
+
+ st1 {v8.4s}, [x10], x6
+
+ saddl v18.4s, v10.4h, v8.4h
+ saddl2 v6.4s, v10.8h, v8.8h
+
+ ld1 {v10.4s}, [x11], x6
+
+ smlal v18.4s, v30.4h, v22.4h
+ smlsl v18.4s, v26.4h, v24.4h
+ smlal2 v6.4s, v30.8h, v22.8h
+ smlsl2 v6.4s, v26.8h, v24.8h
+
+ sqrshrun v26.8b, v10.8h, #5
+ uaddl v2.8h, v2.8b, v3.8b
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v28.8h, v2.8h , v22.8h
+ sqrshrun v18.4h, v18.4s, #10
+ ext v1.8b, v0.8b , v1.8b , #1
+ sqrshrun v19.4h, v6.4s, #10
+ add v10.8h, v14.8h , v8.8h
+ uaddl v2.8h, v1.8b, v4.8b
+ add v30.8h, v16.8h , v20.8h
+ mls v28.8h, v2.8h , v24.8h
+ uqxtn v27.8b, v18.8h
+ uqxtn v19.8b, v19.8h
+ mov v27.2s[1], v19.2s[0]
+
+
+ saddl v18.4s, v12.4h, v28.4h
+ saddl2 v6.4s, v12.8h, v28.8h
+
+ urhadd v26.8b, v26.8b , v27.8b
+
+ smlal v18.4s, v30.4h, v22.4h
+ smlsl v18.4s, v10.4h, v24.4h
+ smlal2 v6.4s, v30.8h, v22.8h
+ smlsl2 v6.4s, v10.8h, v24.8h
+
+ st1 {v26.2s}, [x14], x3 // store row 2
+
+ st1 {v28.4s}, [x10]
+
+ sqrshrun v18.4h, v18.4s, #10
+ mov v10.16b, v20.16b
+ mov v11.16b, v21.16b
+ ld1 {v30.4s}, [x11], x6
+
+ sqrshrun v19.4h, v6.4s, #10
+ subs x12, x12, #4
+
+ sqrshrun v30.8b, v30.8h, #5
+
+ uqxtn v18.8b, v18.8h
+ uqxtn v19.8b, v19.8h
+ mov v18.2s[1], v19.2s[0]
+
+ mov v12.16b, v8.16b
+ mov v13.16b, v9.16b
+ mov v6.16b, v14.16b
+ mov v7.16b, v15.16b
+ urhadd v30.8b, v18.8b , v30.8b
+
+ mov v8.16b, v16.16b
+ mov v9.16b, v17.16b
+ mov v14.16b, v28.16b
+ mov v15.16b, v29.16b
+ st1 {v30.2s}, [x14], x3 // store row 3
+
+ bgt loop_16_highhalf // looping if height = 8 or 16
+ b end_func
+
+loop_8_start:
+
+ movi v22.8h, #0x14 // Filter coeff 20 into Q11
+ movi v24.8h, #5 // Filter coeff 5 into Q12
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row -2 load for horizontal filter
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v6.8h, v0.8b, v5.8b
+
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v8.8h, v2.8b, v3.8b
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v6.8h, v8.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v8.8h, v1.8b, v4.8b
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row -1 load for horizontal filter
+ mls v6.8h, v8.8h , v24.8h
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v8.8h, v0.8b, v5.8b
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v10.8h, v2.8b, v3.8b
+
+ st1 {v6.4s}, [x9], x6 // store temp buffer 0
+
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v8.8h, v10.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v10.8h, v1.8b, v4.8b
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row 0 load for horizontal filter
+ mls v8.8h, v10.8h , v24.8h
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v10.8h, v0.8b, v5.8b
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v12.8h, v2.8b, v3.8b
+
+ st1 {v8.4s}, [x9], x6 // store temp buffer 1
+
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v10.8h, v12.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v12.8h, v1.8b, v4.8b
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row 1 load for horizontal filter
+ mls v10.8h, v12.8h , v24.8h
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v12.8h, v0.8b, v5.8b
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v14.8h, v2.8b, v3.8b
+
+ st1 {v10.4s}, [x9], x6 // store temp buffer 2
+
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v12.8h, v14.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v14.8h, v1.8b, v4.8b
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row 2 load for horizontal filter
+ mls v12.8h, v14.8h , v24.8h
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v14.8h, v0.8b, v5.8b
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v16.8h, v2.8b, v3.8b
+
+ st1 {v12.4s}, [x9], x6 // store temp buffer 3
+
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v14.8h, v16.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v16.8h, v1.8b, v4.8b
+
+ mls v14.8h, v16.8h , v24.8h
+loop_8:
+
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row 3 load for horizontal filter
+ ext v5.8b, v0.8b , v1.8b , #5
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v16.8h, v0.8b, v5.8b
+
+ st1 {v14.4s}, [x9], x6 // store temp buffer 4
+
+ uaddl v18.8h, v2.8b, v3.8b
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v16.8h, v18.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ add v28.8h, v8.8h , v14.8h
+ uaddl v18.8h, v1.8b, v4.8b
+ add v30.8h, v10.8h , v12.8h
+ mls v16.8h, v18.8h , v24.8h
+    ld1       {v0.2s, v1.2s} , [x0], x2 // row 4 load for horizontal filter
+ ext v5.8b, v0.8b , v1.8b , #5
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v20.8h, v0.8b, v5.8b
+
+ st1 {v16.4s}, [x9], x6 // store temp buffer x5
+
+ saddl v18.4s, v6.4h, v16.4h
+
+ ld1 {v26.4s}, [x7], x6 // load from temp buffer 0
+
+ saddl2 v6.4s, v6.8h, v16.8h
+
+ sqrshrun v26.8b, v26.8h, #5
+
+ smlal v18.4s, v30.4h, v22.4h
+ smlsl v18.4s, v28.4h, v24.4h
+ smlal2 v6.4s, v30.8h, v22.8h
+ smlsl2 v6.4s, v28.8h, v24.8h
+ uaddl v2.8h, v2.8b, v3.8b
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v20.8h, v2.8h , v22.8h
+ sqrshrun v18.4h, v18.4s, #10
+ ext v1.8b, v0.8b , v1.8b , #1
+ sqrshrun v19.4h, v6.4s, #10
+ add v28.8h, v10.8h , v16.8h
+ uaddl v2.8h, v1.8b, v4.8b
+ add v30.8h, v12.8h , v14.8h
+ mls v20.8h, v2.8h , v24.8h
+
+ uqxtn v18.8b, v18.8h
+ uqxtn v19.8b, v19.8h
+ mov v18.2s[1], v19.2s[0]
+
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row 5 load for horizontal filter
+
+ urhadd v26.8b, v18.8b , v26.8b
+
+ ext v5.8b, v0.8b , v1.8b , #5
+ ext v2.8b, v0.8b , v1.8b , #2
+
+ st1 {v20.4s}, [x9], x6 // store temp buffer x6
+
+ saddl v18.4s, v8.4h, v20.4h
+
+ saddl2 v6.4s, v8.8h, v20.8h
+
+ ld1 {v8.4s}, [x7], x6 //load from temp buffer 1
+
+
+ st1 {v26.2s}, [x1], x3 // store row 0
+
+ smlal v18.4s, v30.4h, v22.4h
+ smlsl v18.4s, v28.4h, v24.4h
+
+
+
+ smlal2 v6.4s, v30.8h, v22.8h
+ smlsl2 v6.4s, v28.8h, v24.8h
+
+ sqrshrun v28.8b, v8.8h, #5
+
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v8.8h, v0.8b, v5.8b
+ uaddl v2.8h, v2.8b, v3.8b
+ sqrshrun v18.4h, v18.4s, #10
+ ext v4.8b, v0.8b , v1.8b , #4
+ sqrshrun v19.4h, v6.4s, #10
+ mla v8.8h, v2.8h , v22.8h
+ ext v1.8b, v0.8b , v1.8b , #1
+ add v26.8h, v12.8h , v20.8h
+ uaddl v2.8h, v1.8b, v4.8b
+
+
+ uqxtn v18.8b, v18.8h
+ uqxtn v19.8b, v19.8h
+ mov v18.2s[1], v19.2s[0]
+
+ add v30.8h, v14.8h , v16.8h
+ mls v8.8h, v2.8h , v24.8h
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row 6 load for horizontal filter
+
+ urhadd v28.8b, v28.8b , v18.8b
+
+ ext v5.8b, v0.8b , v1.8b , #5
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+
+ st1 {v28.2s}, [x1], x3 // store row 1
+
+ uaddl v28.8h, v0.8b, v5.8b
+
+ st1 {v8.4s}, [x9], x6 // store temp buffer x7
+
+ saddl v18.4s, v10.4h, v8.4h
+ saddl2 v6.4s, v10.8h, v8.8h
+
+ ld1 {v10.4s}, [x7], x6 // load from temp buffer 2
+
+ smlal v18.4s, v30.4h, v22.4h
+ smlsl v18.4s, v26.4h, v24.4h
+ smlal2 v6.4s, v30.8h, v22.8h
+ smlsl2 v6.4s, v26.8h, v24.8h
+
+ sqrshrun v26.8b, v10.8h, #5
+ uaddl v2.8h, v2.8b, v3.8b
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v28.8h, v2.8h , v22.8h
+ sqrshrun v18.4h, v18.4s, #10
+ ext v1.8b, v0.8b , v1.8b , #1
+ sqrshrun v19.4h, v6.4s, #10
+ add v10.8h, v14.8h , v8.8h
+ uaddl v2.8h, v1.8b, v4.8b
+ add v30.8h, v16.8h , v20.8h
+ mls v28.8h, v2.8h , v24.8h
+
+ uqxtn v27.8b, v18.8h
+ uqxtn v19.8b, v19.8h
+
+ mov v27.2s[1], v19.2s[0]
+
+ saddl v18.4s, v12.4h, v28.4h
+ saddl2 v6.4s, v12.8h, v28.8h
+
+ urhadd v26.8b, v26.8b , v27.8b
+
+ smlal v18.4s, v30.4h, v22.4h
+ smlsl v18.4s, v10.4h, v24.4h
+ smlal2 v6.4s, v30.8h, v22.8h
+ smlsl2 v6.4s, v10.8h, v24.8h
+
+ st1 {v26.2s}, [x1], x3 // store row 2
+
+ st1 {v28.2s, v29.2s}, [x9]
+
+
+ sqrshrun v18.4h, v18.4s, #10
+ mov v10.16b, v20.16b
+ mov v11.16b, v21.16b
+ ld1 {v30.4s}, [x7], x6 // load from temp buffer 3
+
+ sqrshrun v19.4h, v6.4s, #10
+ subs x4, x4, #4
+
+ sqrshrun v30.8b, v30.8h, #5
+
+
+ uqxtn v18.8b, v18.8h
+ uqxtn v19.8b, v19.8h
+ mov v18.2s[1], v19.2s[0]
+
+
+ mov v12.16b, v8.16b
+ mov v13.16b, v9.16b
+ mov v6.16b, v14.16b
+ mov v7.16b, v15.16b
+
+ urhadd v30.8b, v18.8b , v30.8b
+ mov v8.16b, v16.16b
+ mov v9.16b, v17.16b
+ mov v14.16b, v28.16b
+ mov v15.16b, v29.16b
+ st1 {v30.2s}, [x1], x3 // store row 3
+
+ bgt loop_8 //if height =8 or 16 loop
+ b end_func
+
+loop_4_start:
+ movi v22.8h, #20 // Filter coeff 20 into D22
+ movi v23.8h, #5 // Filter coeff 5 into D23
+
+ ld1 {v0.2s, v1.2s}, [x0], x2 //row -2 load
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v6.8h, v0.8b, v5.8b
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v8.8h, v2.8b, v3.8b
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v6.4h, v8.4h , v22.4h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v8.8h, v1.8b, v4.8b
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row -1 load
+ mls v6.4h, v8.4h , v23.4h
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v8.8h, v0.8b, v5.8b
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v10.8h, v2.8b, v3.8b
+
+ st1 {v6.2s}, [x9], x6 // store temp buffer 0
+
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v8.4h, v10.4h , v22.4h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v10.8h, v1.8b, v4.8b
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row 0 load
+ mls v8.4h, v10.4h , v23.4h
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v10.8h, v0.8b, v5.8b
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v12.8h, v2.8b, v3.8b
+
+ st1 {v8.2s}, [x9], x6 // store temp buffer 1
+
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v10.4h, v12.4h , v22.4h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v12.8h, v1.8b, v4.8b
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row 1 load
+ mls v10.4h, v12.4h , v23.4h
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v12.8h, v0.8b, v5.8b
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v14.8h, v2.8b, v3.8b
+
+ st1 {v10.2s}, [x9], x6 // store temp buffer 2
+
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v12.4h, v14.4h , v22.4h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v14.8h, v1.8b, v4.8b
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row 2 load
+ mls v12.4h, v14.4h , v23.4h
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v14.8h, v0.8b, v5.8b
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v16.8h, v2.8b, v3.8b
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v14.4h, v16.4h , v22.4h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v16.8h, v1.8b, v4.8b
+
+ st1 {v12.2s}, [x9], x6 // store temp buffer 3
+
+ mls v14.4h, v16.4h , v23.4h
+
+loop_4:
+
+ ld1 {v0.2s, v1.2s}, [x0], x2 // row 3 load
+ ext v5.8b, v0.8b , v1.8b , #5
+ uaddl v16.8h, v0.8b, v5.8b
+ ext v2.8b, v0.8b , v1.8b , #2
+ ext v3.8b, v0.8b , v1.8b , #3
+ uaddl v18.8h, v2.8b, v3.8b
+ st1 {v14.2s}, [x9], x6 // store temp buffer 4
+ ext v4.8b, v0.8b , v1.8b , #4
+ mla v16.4h, v18.4h , v22.4h
+ ext v1.8b, v0.8b , v1.8b , #1
+ uaddl v18.8h, v1.8b, v4.8b
+ add v2.4h, v10.4h , v12.4h
+ mls v16.4h, v18.4h , v23.4h
+ add v3.4h, v8.4h , v14.4h
+ ld1 {v18.2s, v19.2s}, [x0], x2 // row 4 load
+ ext v25.8b, v18.8b , v19.8b , #5
+ uaddl v26.8h, v18.8b, v25.8b
+ ext v20.8b, v18.8b , v19.8b , #2
+
+ st1 {v16.2s}, [x9], x6 // store temp buffer 5
+
+ saddl v0.4s, v6.4h, v16.4h
+ smlal v0.4s, v2.4h, v22.4h
+ ext v21.8b, v18.8b , v19.8b , #3
+ uaddl v28.8h, v20.8b, v21.8b
+ ext v24.8b, v18.8b , v19.8b , #4
+ smlsl v0.4s, v3.4h, v23.4h
+ mla v26.4h, v28.4h , v22.4h
+ ext v19.8b, v18.8b , v19.8b , #1
+ uaddl v28.8h, v19.8b, v24.8b
+ add v2.4h, v12.4h , v14.4h
+ mls v26.4h, v28.4h , v23.4h
+ sqrshrun v0.4h, v0.4s, #0xa
+ add v3.4h, v10.4h , v16.4h
+ ld1 {v18.2s, v19.2s}, [x0], x2 // row 5 load
+ ext v25.8b, v18.8b , v19.8b , #5
+ uqxtn v11.8b, v0.8h
+ uaddl v28.8h, v18.8b, v25.8b
+
+ st1 {v26.2s}, [x9], x6 // store temp buffer 6
+
+ //Q3 available here
+ ld1 {v6.2s}, [x7], x6 // load from temp buffer 0
+ ld1 {v7.2s}, [x7], x6 // load from temp buffer 1
+
+ sqrshrun v9.8b, v6.8h, #5
+ sqrshrun v7.8b, v7.8h, #5
+ mov v9.2s[1], v7.2s[0]
+
+ ext v20.8b, v18.8b , v19.8b , #2
+
+ saddl v0.4s, v8.4h, v26.4h
+ smlal v0.4s, v2.4h, v22.4h
+ ext v21.8b, v18.8b , v19.8b , #3
+ uaddl v6.8h, v20.8b, v21.8b
+ ext v24.8b, v18.8b , v19.8b , #4
+ smlsl v0.4s, v3.4h, v23.4h
+ mla v28.4h, v6.4h , v22.4h
+ ext v19.8b, v18.8b , v19.8b , #1
+ uaddl v6.8h, v19.8b, v24.8b
+ add v2.4h, v14.4h , v16.4h
+ mls v28.4h, v6.4h , v23.4h
+ sqrshrun v0.4h, v0.4s, #0xa
+ add v3.4h, v12.4h , v26.4h
+ ld1 {v18.2s, v19.2s}, [x0], x2 // row 6 load
+ ext v25.8b, v18.8b , v19.8b , #5
+ uqxtn v13.8b, v0.8h
+
+ trn1 v11.2s, v11.2s, v13.2s
+ trn2 v13.2s, v11.2s, v13.2s
+ saddl v0.4s, v10.4h, v28.4h
+ urhadd v9.8b, v9.8b , v11.8b
+
+ st1 {v28.2s}, [x9], x6 // store temp buffer 7
+
+ smlal v0.4s, v2.4h, v22.4h
+ uaddl v30.8h, v18.8b, v25.8b
+
+ st1 {v9.s}[0], [x1], x3 // store row 0
+
+ ext v20.8b, v18.8b , v19.8b , #2
+
+ st1 {v9.s}[1], [x1], x3 // store row 1
+
+ ext v21.8b, v18.8b , v19.8b , #3
+ smlsl v0.4s, v3.4h, v23.4h
+ uaddl v8.8h, v20.8b, v21.8b
+ ext v24.8b, v18.8b , v19.8b , #4
+ mla v30.4h, v8.4h , v22.4h
+ ext v19.8b, v18.8b , v19.8b , #1
+ uaddl v8.8h, v19.8b, v24.8b
+ sqrshrun v0.4h, v0.4s, #0xa
+ add v2.4h, v16.4h , v26.4h
+ mls v30.4h, v8.4h , v23.4h
+ uqxtn v4.8b, v0.8h
+
+ add v3.4h, v14.4h , v28.4h
+
+
+ saddl v0.4s, v12.4h, v30.4h
+
+ st1 {v30.2s}, [x9]
+
+ smlal v0.4s, v2.4h, v22.4h
+
+ ld1 {v8.2s}, [x7], x6 // load from temp buffer 2
+ ld1 {v9.2s}, [x7], x6 // load from temp buffer 3
+ smlsl v0.4s, v3.4h, v23.4h
+ subs x4, x4, #4
+
+ sqrshrun v10.8b, v8.8h, #5
+ sqrshrun v9.8b, v9.8h, #5
+ mov v10.2s[1], v9.2s[0]
+
+ mov v12.8b, v28.8b
+
+ sqrshrun v0.4h, v0.4s, #0xa
+ mov v6.8b, v14.8b
+ mov v8.8b, v16.8b
+
+ uqxtn v5.8b, v0.8h
+
+ trn1 v4.2s, v4.2s, v5.2s
+ trn2 v5.2s, v4.2s, v5.2s
+ urhadd v4.8b, v4.8b , v10.8b
+ mov v10.8b, v26.8b
+ mov v14.8b, v30.8b
+
+ st1 {v4.s}[0], [x1], x3 // store row 2
+ st1 {v4.s}[1], [x1], x3 // store row 3
+
+ bgt loop_4
+
+end_func:
+ //Restoring registers from stack
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s
new file mode 100755
index 0000000..39e3253
--- /dev/null
+++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s
@@ -0,0 +1,597 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+//******************************************************************************
+//* @file
+//* ih264_inter_pred_luma_horz_qpel_av8.s
+//*
+//* @brief
+//* Contains function definitions for inter prediction horizontal quarter pel interpolation.
+//*
+//* @author
+//* Mohit
+//*
+//* @par List of Functions:
+//*
+//* - ih264_inter_pred_luma_horz_qpel_av8()
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+///* All the functions here are replicated from ih264_inter_pred_filters.c
+//
+
+///**
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* Quarter pel inter prediction luma filter for horizontal input
+//*
+//* @par Description:
+//* Applies a 6 tap horizontal filter .The output is clipped to 8 bits
+//* sec 8.4.2.2.1 titled "Luma sample interpolation process"
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+// @param[in] pu1_tmp: temporary buffer: UNUSED in this function
+//*
+//* @param[in] dydx: x and y reference offset for qpel calculations.
+//* @returns
+//*
+// @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+//void ih264_inter_pred_luma_horz_qpel_av8 (
+// UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ht,
+// WORD32 wd,
+// UWORD8* pu1_tmp,
+// UWORD32 dydx)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ht
+// x5 => wd
+// x7 => dydx
+
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+
+
+
+ .global ih264_inter_pred_luma_horz_qpel_av8
+
+ih264_inter_pred_luma_horz_qpel_av8:
+
+
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+
+
+ and x7, x7, #3 //Finds x-offset
+ add x7, x0, x7, lsr #1 //pu1_src + (x_offset>>1)
+ sub x0, x0, #2 //pu1_src-2
+ sub x14, x4, #16 //x14 = ht - 16; with the add/subs below, makes the row loop body run twice when ht==16
+ movi v0.16b, #5 //filter coeff
+ subs x12, x5, #8 //if wd=8 branch to loop_8
+ movi v1.16b, #20 //filter coeff
+
+ beq loop_8
+
+ subs x12, x5, #4 //if wd=4 branch to loop_4
+ beq loop_4
+
+loop_16: //when wd=16
+ //// Processing row0 and row1
+ ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row0
+ add x14, x14, #1 //for checking loop
+ ext v31.8b, v2.8b , v3.8b , #5
+ ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row1
+ ext v30.8b, v3.8b , v4.8b , #5
+ uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row0)
+ ext v28.8b, v5.8b , v6.8b , #5
+ uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row0)
+ ext v27.8b, v6.8b , v7.8b , #5
+ uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row1)
+ ext v31.8b, v2.8b , v3.8b , #2
+ uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row1)
+ ext v30.8b, v3.8b , v4.8b , #2
+ umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0)
+ ext v28.8b, v5.8b , v6.8b , #2
+ umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0)
+ ext v27.8b, v6.8b , v7.8b , #2
+ umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row1)
+ ext v31.8b, v2.8b , v3.8b , #3
+ umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row1)
+ ext v30.8b, v3.8b , v4.8b , #3
+ umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ ext v28.8b, v5.8b , v6.8b , #3
+ umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0)
+ ext v27.8b, v6.8b , v7.8b , #3
+ umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ ext v31.8b, v2.8b , v3.8b , #1
+ umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row1)
+ ext v30.8b, v3.8b , v4.8b , #1
+ umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ ext v28.8b, v5.8b , v6.8b , #1
+ umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
+ ext v27.8b, v6.8b , v7.8b , #1
+ umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ ext v31.8b, v2.8b , v3.8b , #4
+ umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1)
+ ext v30.8b, v3.8b , v4.8b , #4
+ umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ ext v28.8b, v5.8b , v6.8b , #4
+ umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
+ ext v27.8b, v6.8b , v7.8b , #4
+ umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row2
+ umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1)
+
+ ld1 {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation (column1,row0)
+ sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row3
+ sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
+ ext v31.8b, v2.8b , v3.8b , #5
+ urhadd v20.16b, v12.16b , v20.16b //Interpolation step for qpel calculation
+ urhadd v21.16b, v13.16b , v21.16b //Interpolation step for qpel calculation
+
+ sqrshrun v18.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ st1 {v20.8b, v21.8b}, [x1], x3 ////Store dest row0
+ ext v30.8b, v3.8b , v4.8b , #5
+ sqrshrun v19.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1)
+
+
+
+//// Processing row2 and row3
+ ld1 {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation (column1,row1)
+ ext v28.8b, v5.8b , v6.8b , #5
+ urhadd v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
+ urhadd v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
+
+ uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row2)
+ st1 {v18.8b, v19.8b}, [x1], x3 ////Store dest row1
+ uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row2)
+ ext v27.8b, v6.8b , v7.8b , #5
+ uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row3)
+ ext v31.8b, v2.8b , v3.8b , #2
+ uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row3)
+ ext v30.8b, v3.8b , v4.8b , #2
+ umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row2)
+ ext v27.8b, v6.8b , v7.8b , #2
+ umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row2)
+ ext v28.8b, v5.8b , v6.8b , #2
+ umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row3)
+ ext v31.8b, v2.8b , v3.8b , #3
+ umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row3)
+ ext v30.8b, v3.8b , v4.8b , #3
+ umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row2)
+ ext v28.8b, v5.8b , v6.8b , #3
+ umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row2)
+ ext v27.8b, v6.8b , v7.8b , #3
+ umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row3)
+ ext v31.8b, v2.8b , v3.8b , #1
+ umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row3)
+ ext v30.8b, v3.8b , v4.8b , #1
+ umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row2)
+ ext v28.8b, v5.8b , v6.8b , #1
+ umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row2)
+ ext v27.8b, v6.8b , v7.8b , #1
+ umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row3)
+ ext v31.8b, v2.8b , v3.8b , #4
+ umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row3)
+ ext v30.8b, v3.8b , v4.8b , #4
+ umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row2)
+ ext v28.8b, v5.8b , v6.8b , #4
+ umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row2)
+ ext v27.8b, v6.8b , v7.8b , #4
+ umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row3)
+ ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row4
+ umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row3)
+
+ ld1 {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation (column1,row2)
+ sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row2)
+ ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row5
+ sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row2)
+ ext v31.8b, v2.8b , v3.8b , #5
+ urhadd v20.16b, v12.16b , v20.16b //Interpolation step for qpel calculation
+ urhadd v21.16b, v13.16b , v21.16b //Interpolation step for qpel calculation
+
+ sqrshrun v18.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row3)
+ ext v30.8b, v3.8b , v4.8b , #5
+ st1 {v20.8b, v21.8b}, [x1], x3 ////Store dest row2
+ sqrshrun v19.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row3)
+ ld1 {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation (column1,row3)
+
+//// Processing row4 and row5
+ ext v28.8b, v5.8b , v6.8b , #5
+ urhadd v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
+ urhadd v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
+
+ uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row4)
+ st1 {v18.8b, v19.8b}, [x1], x3 ////Store dest row3
+ uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row4)
+ ext v27.8b, v6.8b , v7.8b , #5
+ uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row5)
+ ext v31.8b, v2.8b , v3.8b , #2
+ uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row5)
+ ext v30.8b, v3.8b , v4.8b , #2
+ umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row4)
+ ext v27.8b, v6.8b , v7.8b , #2
+ umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row4)
+ ext v28.8b, v5.8b , v6.8b , #2
+ umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row5)
+ ext v31.8b, v2.8b , v3.8b , #3
+ umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row5)
+ ext v30.8b, v3.8b , v4.8b , #3
+ umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row4)
+ ext v28.8b, v5.8b , v6.8b , #3
+ umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row4)
+ ext v27.8b, v6.8b , v7.8b , #3
+ umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row5)
+ ext v31.8b, v2.8b , v3.8b , #1
+ umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row5)
+ ext v30.8b, v3.8b , v4.8b , #1
+ umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row4)
+ ext v28.8b, v5.8b , v6.8b , #1
+ umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row4)
+ ext v27.8b, v6.8b , v7.8b , #1
+ umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row5)
+ ext v31.8b, v2.8b , v3.8b , #4
+ umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row5)
+ ext v30.8b, v3.8b , v4.8b , #4
+ umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row4)
+ ext v28.8b, v5.8b , v6.8b , #4
+ umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row4)
+ ext v27.8b, v6.8b , v7.8b , #4
+ umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row5)
+ ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row6
+ umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row5)
+ ld1 {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation (column1,row4)
+ sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row4)
+ ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row7
+ sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row4)
+ ext v31.8b, v2.8b , v3.8b , #5
+ urhadd v20.16b, v12.16b , v20.16b //Interpolation step for qpel calculation
+ urhadd v21.16b, v13.16b , v21.16b //Interpolation step for qpel calculation
+
+ sqrshrun v18.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row5)
+ st1 {v20.8b, v21.8b}, [x1], x3 ////Store dest row4
+ ext v30.8b, v3.8b , v4.8b , #5
+ sqrshrun v19.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row5)
+ ld1 {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation (column1,row5)
+
+
+ //// Processing row6 and row7
+
+ ext v28.8b, v5.8b , v6.8b , #5
+ urhadd v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
+ urhadd v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
+
+ uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row6)
+ st1 {v18.8b, v19.8b}, [x1], x3 ////Store dest row5
+ uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row6)
+ ext v27.8b, v6.8b , v7.8b , #5
+ uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row7)
+ ext v31.8b, v2.8b , v3.8b , #2
+ uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row7)
+ ext v30.8b, v3.8b , v4.8b , #2
+ umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row6)
+ ext v27.8b, v6.8b , v7.8b , #2
+ umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row6)
+ ext v28.8b, v5.8b , v6.8b , #2
+ umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row7)
+ ext v31.8b, v2.8b , v3.8b , #3
+ umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row7)
+ ext v30.8b, v3.8b , v4.8b , #3
+ umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row6)
+ ext v28.8b, v5.8b , v6.8b , #3
+ umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row6)
+ ext v27.8b, v6.8b , v7.8b , #3
+ umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row7)
+ ext v31.8b, v2.8b , v3.8b , #1
+ umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row7)
+ ext v30.8b, v3.8b , v4.8b , #1
+ umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row6)
+ ext v28.8b, v5.8b , v6.8b , #1
+ umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row6)
+ ext v27.8b, v6.8b , v7.8b , #1
+ umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row7)
+ ext v31.8b, v2.8b , v3.8b , #4
+ umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row7)
+ ext v30.8b, v3.8b , v4.8b , #4
+ umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row6)
+ ext v28.8b, v5.8b , v6.8b , #4
+ umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row6)
+ ext v27.8b, v6.8b , v7.8b , #4
+ ld1 {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation (column1,row6)
+ sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row6)
+ umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row7)
+ sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row6)
+ umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row7)
+ urhadd v20.16b, v12.16b , v20.16b //Interpolation step for qpel calculation
+ urhadd v21.16b, v13.16b , v21.16b //Interpolation step for qpel calculation
+
+ ld1 {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation (column1,row7)
+ sqrshrun v18.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row7)
+ st1 {v20.8b, v21.8b}, [x1], x3 ////Store dest row6
+ sqrshrun v19.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row7)
+ urhadd v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
+ urhadd v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
+
+ subs x12, x14, #1 // if height==16 - looping
+ st1 {v18.8b, v19.8b}, [x1], x3 ////Store dest row7
+
+
+
+ beq loop_16
+ b end_func
+
+loop_8:
+//// Processing row0 and row1
+
+ ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row0 (first row loaded; filtered into v14)
+ add x14, x14, #1 //for checking loop
+ ext v28.8b, v5.8b , v6.8b , #5
+ ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row1 (filtered into v8)
+ ext v25.8b, v5.8b , v6.8b , #2
+ ext v31.8b, v2.8b , v3.8b , #5
+ ext v24.8b, v5.8b , v6.8b , #3
+ ext v23.8b, v5.8b , v6.8b , #1
+ ext v22.8b, v5.8b , v6.8b , #4
+ uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row0)
+ ext v29.8b, v2.8b , v3.8b , #3
+ umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0)
+ umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ ext v30.8b, v2.8b , v3.8b , #2
+ uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row1)
+ ext v27.8b, v2.8b , v3.8b , #1
+ ext v26.8b, v2.8b , v3.8b , #4
+ ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row2
+ umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row1)
+ umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row3
+ sqrshrun v18.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+
+ //// Processing row2 and row3
+ ext v28.8b, v5.8b , v6.8b , #5
+ ext v25.8b, v5.8b , v6.8b , #2
+ ext v31.8b, v2.8b , v3.8b , #5
+ uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row3)
+ ld1 {v12.2s}, [x7], x2 //Load value for interpolation (column1,row0)
+ ld1 {v13.2s}, [x7], x2 //Load value for interpolation (column1,row1)
+ ext v24.8b, v5.8b , v6.8b , #3
+ ext v23.8b, v5.8b , v6.8b , #1
+ sqrshrun v19.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ ext v22.8b, v5.8b , v6.8b , #4
+ ext v29.8b, v2.8b , v3.8b , #3
+ umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row3)
+ umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row3)
+ umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row3)
+ umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row3)
+ urhadd v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
+ urhadd v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
+
+ st1 {v18.8b}, [x1], x3 ////Store dest row0
+ st1 {v19.8b}, [x1], x3 ////Store dest row1
+ uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row2)
+ ext v30.8b, v2.8b , v3.8b , #2
+ ext v27.8b, v2.8b , v3.8b , #1
+ ext v26.8b, v2.8b , v3.8b , #4
+ ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row4
+ umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row2)
+ umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row2)
+ umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row2)
+ umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row2)
+ ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row5
+ subs x9, x4, #4
+ sqrshrun v19.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row3)
+ ld1 {v12.2s}, [x7], x2 //Load value for interpolation (column1,row2)
+ ld1 {v13.2s}, [x7], x2 //Load value for interpolation (column1,row3)
+ ext v28.8b, v5.8b , v6.8b , #5
+ ext v25.8b, v5.8b , v6.8b , #2
+ ext v31.8b, v2.8b , v3.8b , #5
+ uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row5)
+ ext v24.8b, v5.8b , v6.8b , #3
+ sqrshrun v18.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row2)
+ ext v22.8b, v5.8b , v6.8b , #4
+ ext v29.8b, v2.8b , v3.8b , #3
+ urhadd v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
+ urhadd v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
+
+ st1 {v18.8b}, [x1], x3 ////Store dest row2
+ ext v30.8b, v2.8b , v3.8b , #2
+ uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row4)
+ st1 {v19.8b}, [x1], x3 ////Store dest row3
+ beq end_func // Branch if height==4
+
+//// Processing row4 and row5
+ ext v23.8b, v5.8b , v6.8b , #1
+ umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row5)
+ umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row5)
+ umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row5)
+ umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row5)
+ ext v27.8b, v2.8b , v3.8b , #1
+ ext v26.8b, v2.8b , v3.8b , #4
+ ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row6
+ umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row4)
+ umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row4)
+ umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row4)
+ umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row4)
+ sqrshrun v19.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row5)
+ ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row7
+ ext v31.8b, v2.8b , v3.8b , #5
+ ext v28.8b, v5.8b , v6.8b , #5
+ ld1 {v12.2s}, [x7], x2 //Load value for interpolation (column1,row4)
+ ld1 {v13.2s}, [x7], x2 //Load value for interpolation (column1,row5)
+ ext v25.8b, v5.8b , v6.8b , #2
+ uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row7)
+ ext v24.8b, v5.8b , v6.8b , #3
+ ext v22.8b, v5.8b , v6.8b , #4
+ sqrshrun v18.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row4)
+ ext v29.8b, v2.8b , v3.8b , #3
+ ext v30.8b, v2.8b , v3.8b , #2
+ urhadd v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
+ urhadd v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
+
+ st1 {v18.8b}, [x1], x3 ////Store dest row4
+ ext v27.8b, v2.8b , v3.8b , #1
+ uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row6)
+ ext v26.8b, v2.8b , v3.8b , #4
+ umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row6)
+ umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row6)
+ umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row6)
+ umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row6)
+ //// Processing row6 and row7
+ st1 {v19.8b}, [x1], x3 ////Store dest row5
+ ext v23.8b, v5.8b , v6.8b , #1
+ umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row7)
+ umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row7)
+ umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row7)
+ umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row7)
+ ld1 {v12.2s}, [x7], x2 //Load value for interpolation (column1,row6)
+ ld1 {v13.2s}, [x7], x2 //Load value for interpolation (column1,row7)
+ sqrshrun v18.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row6)
+ subs x12, x14, #1
+ sqrshrun v19.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row7)
+ urhadd v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
+ urhadd v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
+
+ st1 {v18.8b}, [x1], x3 ////Store dest row6
+ st1 {v19.8b}, [x1], x3 ////Store dest row7
+
+ beq loop_8 //looping if height ==16
+
+ b end_func
+
+loop_4:
+ ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row0 (first row loaded; filtered into v14)
+ ext v28.8b, v5.8b , v6.8b , #5
+ ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row1 (filtered into v8)
+ ext v25.8b, v5.8b , v6.8b , #2
+ ext v31.8b, v2.8b , v3.8b , #5
+ uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row0)
+ ext v24.8b, v5.8b , v6.8b , #3
+ ext v23.8b, v5.8b , v6.8b , #1
+ ext v22.8b, v5.8b , v6.8b , #4
+ ext v29.8b, v2.8b , v3.8b , #3
+ umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0)
+ umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0)
+ umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
+ umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
+ uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row1)
+ ext v30.8b, v2.8b , v3.8b , #2
+ ld1 {v12.2s}, [x7], x2 //Load value for interpolation (column1,row0)
+ ld1 {v13.2s}, [x7], x2 //Load value for interpolation (column1,row1)
+ ext v27.8b, v2.8b , v3.8b , #1
+ ext v26.8b, v2.8b , v3.8b , #4
+ ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row2
+ umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row1)
+ umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row1)
+ umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
+ umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
+ ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row3
+ ext v28.8b, v5.8b , v6.8b , #5
+ ext v25.8b, v5.8b , v6.8b , #2
+ sqrshrun v18.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
+ ext v31.8b, v2.8b , v3.8b , #5
+ ext v24.8b, v5.8b , v6.8b , #3
+
+ ext v23.8b, v5.8b , v6.8b , #1
+ ext v22.8b, v5.8b , v6.8b , #4
+ ext v29.8b, v2.8b , v3.8b , #3
+ sqrshrun v19.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
+ ext v30.8b, v2.8b , v3.8b , #2
+ ext v27.8b, v2.8b , v3.8b , #1
+
+ //// Processing row2 and row3
+ urhadd v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
+ urhadd v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
+
+ st1 {v18.s}[0], [x1], x3 ////Store dest row0
+ st1 {v19.s}[0], [x1], x3 ////Store dest row1
+ uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row3)
+ ext v26.8b, v2.8b , v3.8b , #4
+ ld1 {v12.2s}, [x7], x2 //Load value for interpolation (column1,row2)
+ ld1 {v13.2s}, [x7], x2 //Load value for interpolation (column1,row3)
+
+ umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row3)
+ umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row3)
+ umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row3)
+ umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row3)
+ uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row2)
+ umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row2)
+ umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column1,row2)
+ umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row2)
+ umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row2)
+ sqrshrun v19.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row3)
+ sqrshrun v18.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row2)
+ urhadd v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
+ urhadd v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
+
+ st1 {v18.s}[0], [x1], x3 ////Store dest row2
+ subs x4, x4, #8 // Loop if height =8
+ st1 {v19.s}[0], [x1], x3 ////Store dest row3
+
+ beq loop_4
+
+end_func:
+
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s
new file mode 100755
index 0000000..b1e4866
--- /dev/null
+++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s
@@ -0,0 +1,910 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+//******************************************************************************
+//* @file
+//* ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s
+//*
+//* @brief
+//* Contains function definitions for inter prediction interpolation.
+//*
+//* @author
+//* Mohit
+//*
+//* @par List of Functions:
+//*
+//* - ih264_inter_pred_luma_horz_qpel_vert_hpel_av8()
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+///* All the functions here are replicated from ih264_inter_pred_filters.c
+//
+
+///**
+///**
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* This function implements a two stage cascaded six tap filter. It
+//* applies the six tap filter in the vertical direction on the
+//* predictor values, followed by applying the same filter in the
+//* horizontal direction on the output of the first stage. It then averages
+//* the output of the 1st stage and the final stage to obtain the quarter
+//* pel values.The six tap filtering operation is described in sec 8.4.2.2.1
+//* titled "Luma sample interpolation process".
+//*
+//* @par Description:
+//* This function is called to obtain pixels lying at the following
+//* location (1/4,1/2) or (3/4,1/2). The function interpolates
+//* the predictors first in the vertical direction and then in the
+//* horizontal direction to output the (1/2,1/2). It then averages
+//* the output of the 2nd stage and (1/2,1/2) value to obtain (1/4,1/2)
+//* or (3/4,1/2) depending on the offset.
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @param[in] pu1_tmp: temporary buffer
+//*
+//* @param[in] dydx: x and y reference offset for qpel calculations
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/;
+
+//void ih264_inter_pred_luma_horz_qpel_vert_hpel(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ht,
+// WORD32 wd,
+// UWORD8* pu1_tmp,
+// UWORD32 dydx)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ht
+// x5 => wd
+// x6 => dydx
+// x9 => *pu1_tmp
+
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+
+
+ .global ih264_inter_pred_luma_horz_qpel_vert_hpel_av8
+
+ih264_inter_pred_luma_horz_qpel_vert_hpel_av8:
+
+ // STMFD sp!, {x4-x12, x14} //store register values to stack
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+
+ sub x0, x0, x2, lsl #1 //pu1_src-2*src_strd
+ sub x0, x0, #2 //pu1_src-2
+ mov x9, x6
+ mov x6, x7
+
+    and x6, x6, #2 // (dydx & 0x2) == ((dydx & 0x3) >> 1) << 1 : byte offset of (x_offset>>1) into the 16-bit temp row
+
+ add x7, x9, #4
+ add x6, x7, x6 // pi16_pred1_temp += (x_offset>>1)
+
+    movi v26.8h, #0x14 // Filter coeff 20 into v26
+    movi v24.8h, #0x5 // Filter coeff 5 into v24
+    movi v27.8h, #0x14 // Filter coeff 20 into v27
+    movi v25.8h, #0x5 // Filter coeff 5 into v25
+ mov x7, #0x20
+ mov x8, #0x30
+ subs x12, x5, #4 //if wd=4 branch to loop_4
+ beq loop_4_start
+
+ subs x12, x5, #8 //if wd=8 branch to loop_8
+ beq loop_8_start
+
+ //when wd=16
+    movi v28.8h, #0x14 // Filter coeff 20 into v28
+    movi v30.8h, #0x5 // Filter coeff 5 into v30
+ sub x2, x2, #16
+ ld1 {v0.2s, v1.2s}, [x0], #16 // Vector load from src[0_0]
+ ld1 {v12.2s}, [x0], x2 // Vector load from src[0_0]
+ ld1 {v2.2s, v3.2s}, [x0], #16 // Vector load from src[1_0]
+ ld1 {v13.2s}, [x0], x2 // Vector load from src[1_0]
+ ld1 {v4.2s, v5.2s}, [x0], #16 // Vector load from src[2_0]
+ ld1 {v14.2s}, [x0], x2 // Vector load from src[2_0]
+ ld1 {v6.2s, v7.2s}, [x0], #16 // Vector load from src[3_0]
+ ld1 {v15.2s}, [x0], x2 // Vector load from src[3_0]
+ ld1 {v8.2s, v9.2s}, [x0], #16 // Vector load from src[4_0]
+ ld1 {v16.2s}, [x0], x2 // Vector load from src[4_0]
+
+loop_16:
+
+ ld1 {v10.2s, v11.2s}, [x0], #16 // Vector load from src[5_0]
+ ld1 {v17.2s}, [x0], x2 // Vector load from src[5_0]
+
+
+ uaddl v20.8h, v4.8b, v6.8b
+ uaddl v18.8h, v0.8b, v10.8b
+ uaddl v22.8h, v2.8b, v8.8b
+ mla v18.8h, v20.8h , v28.8h
+ uaddl v24.8h, v5.8b, v7.8b
+ uaddl v20.8h, v1.8b, v11.8b
+ uaddl v26.8h, v3.8b, v9.8b
+ mla v20.8h, v24.8h , v28.8h
+ uaddl v24.8h, v14.8b, v15.8b
+ mls v18.8h, v22.8h , v30.8h
+ uaddl v22.8h, v12.8b, v17.8b
+ mls v20.8h, v26.8h , v30.8h
+ uaddl v26.8h, v13.8b, v16.8b
+ mla v22.8h, v24.8h , v28.8h
+ mls v22.8h, v26.8h , v30.8h
+ st1 {v18.4s }, [x9], #16
+ st1 {v20.4s}, [x9], #16
+ ext v24.16b, v18.16b , v20.16b , #4
+ ext v26.16b, v18.16b , v20.16b , #6
+ st1 {v22.4s}, [x9]
+ ext v22.16b, v18.16b , v20.16b , #10
+ add v0.8h, v24.8h , v26.8h
+ ext v24.16b, v18.16b , v20.16b , #2
+ ext v26.16b, v18.16b , v20.16b , #8
+ add v24.8h, v24.8h , v26.8h
+
+ saddl v26.4s, v18.4h, v22.4h
+ smlal v26.4s, v0.4h, v28.4h
+ smlsl v26.4s, v24.4h, v30.4h
+
+ saddl2 v22.4s, v18.8h, v22.8h
+ smlal2 v22.4s, v0.8h, v28.8h
+ smlsl2 v22.4s, v24.8h, v30.8h
+
+ sqrshrun v18.4h, v26.4s, #10
+ sqrshrun v19.4h, v22.4s, #10
+ ld1 {v22.4s}, [x9], #16
+
+ uqxtn v18.8b, v18.8h
+ uqxtn v19.8b, v19.8h
+ mov v18.2s[1], v19.2s[0]
+
+ ext v24.16b, v20.16b , v22.16b , #4
+ ext v26.16b, v20.16b , v22.16b , #6
+ ext v0.16b, v20.16b , v22.16b , #10
+ st1 {v18.2s}, [x1]
+ add v18.8h, v24.8h , v26.8h
+ ext v24.16b, v20.16b , v22.16b , #2
+ ext v26.16b, v20.16b , v22.16b , #8
+ add v24.8h, v24.8h , v26.8h
+
+ saddl v26.4s, v0.4h, v20.4h
+ smlal v26.4s, v18.4h, v28.4h
+ smlsl v26.4s, v24.4h, v30.4h
+
+ saddl2 v22.4s, v0.8h, v20.8h
+ smlal2 v22.4s, v18.8h, v28.8h
+ smlsl2 v22.4s, v24.8h, v30.8h
+
+ sqrshrun v19.4h, v26.4s, #10
+ sqrshrun v18.4h, v22.4s, #10
+
+ uaddl v24.8h, v7.8b, v9.8b
+ ld1 {v20.4s}, [x6], #16
+ ld1 {v22.4s}, [x6], x7
+
+
+ uqxtn v19.8b, v19.8h
+ uqxtn v18.8b, v18.8h
+ mov v19.2s[1], v18.2s[0]
+
+ ld1 {v18.2s}, [x1]
+ sqrshrun v20.8b, v20.8h, #5
+ sqrshrun v21.8b, v22.8h, #5
+ uaddl v22.8h, v4.8b, v10.8b
+ ld1 {v0.2s, v1.2s}, [x0], #16 // Vector load from src[6_0]
+ urhadd v18.16b, v18.16b , v20.16b
+ urhadd v19.16b, v19.16b , v21.16b
+
+ ld1 {v12.2s}, [x0], x2 // Vector load from src[6_0]
+ uaddl v20.8h, v6.8b, v8.8b
+ uaddl v26.8h, v5.8b, v11.8b
+ st1 {v18.2s, v19.2s}, [x1], x3 // store row 0
+
+
+//ROW_2
+
+
+ uaddl v18.8h, v2.8b, v0.8b
+
+ mla v18.8h, v20.8h , v28.8h
+
+ uaddl v20.8h, v3.8b, v1.8b
+
+ mla v20.8h, v24.8h , v28.8h
+ uaddl v24.8h, v15.8b, v16.8b
+ mls v18.8h, v22.8h , v30.8h
+ uaddl v22.8h, v13.8b, v12.8b
+ mls v20.8h, v26.8h , v30.8h
+ uaddl v26.8h, v14.8b, v17.8b
+ mla v22.8h, v24.8h , v28.8h
+ mls v22.8h, v26.8h , v30.8h
+ st1 {v18.4s}, [x9], #16
+ st1 {v20.4s}, [x9], #16
+ ext v24.16b, v18.16b , v20.16b , #4
+ ext v26.16b, v18.16b , v20.16b , #6
+ st1 {v22.4s}, [x9]
+ ext v22.16b, v18.16b , v20.16b , #10
+ add v2.8h, v24.8h , v26.8h
+ ext v24.16b, v18.16b , v20.16b , #2
+ ext v26.16b, v18.16b , v20.16b , #8
+ add v24.8h, v24.8h , v26.8h
+
+ saddl v26.4s, v18.4h, v22.4h
+ smlal v26.4s, v2.4h, v28.4h
+ smlsl v26.4s, v24.4h, v30.4h
+
+ saddl2 v22.4s, v18.8h, v22.8h
+ smlal2 v22.4s, v2.8h, v28.8h
+ smlsl2 v22.4s, v24.8h, v30.8h
+
+ sqrshrun v18.4h, v26.4s, #10
+ sqrshrun v19.4h, v22.4s, #10
+
+ ld1 {v22.4s}, [x9], #16
+
+ uqxtn v18.8b, v18.8h
+ uqxtn v19.8b, v19.8h
+ mov v18.2s[1], v19.2s[0]
+
+ ext v24.16b, v20.16b , v22.16b , #4
+ ext v26.16b, v20.16b , v22.16b , #6
+ ext v2.16b, v20.16b , v22.16b , #10
+ st1 {v18.2s}, [x1]
+ add v18.8h, v24.8h , v26.8h
+ ext v24.16b, v20.16b , v22.16b , #2
+ ext v26.16b, v20.16b , v22.16b , #8
+ add v24.8h, v24.8h , v26.8h
+
+ saddl v26.4s, v2.4h, v20.4h
+ smlal v26.4s, v18.4h, v28.4h
+ smlsl v26.4s, v24.4h, v30.4h
+
+ saddl2 v22.4s, v2.8h, v20.8h
+ smlal2 v22.4s, v18.8h, v28.8h
+ smlsl2 v22.4s, v24.8h, v30.8h
+
+ sqrshrun v19.4h, v26.4s, #10
+ sqrshrun v18.4h, v22.4s, #10
+ uaddl v24.8h, v9.8b, v11.8b
+ ld1 {v20.4s}, [x6], #16
+ ld1 {v22.4s}, [x6], x7
+ uqxtn v19.8b, v19.8h
+ uqxtn v18.8b, v18.8h
+ mov v19.2s[1], v18.2s[0]
+ ld1 {v18.4s}, [x1]
+ sqrshrun v20.8b, v20.8h, #5
+ sqrshrun v21.8b, v22.8h, #5
+
+ uaddl v22.8h, v6.8b, v0.8b
+ ld1 {v2.2s, v3.2s}, [x0], #16 // Vector load from src[7_0]
+
+ urhadd v18.16b, v18.16b , v20.16b
+ urhadd v19.16b, v19.16b , v21.16b
+ ld1 {v13.2s}, [x0], x2 // Vector load from src[7_0]
+ uaddl v20.8h, v8.8b, v10.8b
+ uaddl v26.8h, v7.8b, v1.8b
+ st1 {v18.2s, v19.2s}, [x1], x3 // store row 1
+
+//ROW_3
+
+
+ uaddl v18.8h, v4.8b, v2.8b
+
+ mla v18.8h, v20.8h , v28.8h
+
+ uaddl v20.8h, v5.8b, v3.8b
+
+ mla v20.8h, v24.8h , v28.8h
+ uaddl v24.8h, v16.8b, v17.8b
+ mls v18.8h, v22.8h , v30.8h
+ uaddl v22.8h, v14.8b, v13.8b
+ mls v20.8h, v26.8h , v30.8h
+ uaddl v26.8h, v15.8b, v12.8b
+ mla v22.8h, v24.8h , v28.8h
+ mls v22.8h, v26.8h , v30.8h
+ st1 {v18.4s}, [x9], #16
+ st1 {v20.4s}, [x9], #16
+ ext v24.16b, v18.16b , v20.16b , #4
+ ext v26.16b, v18.16b , v20.16b , #6
+ st1 {v22.4s}, [x9]
+ ext v22.16b, v18.16b , v20.16b , #10
+ add v4.8h, v24.8h , v26.8h
+ ext v24.16b, v18.16b , v20.16b , #2
+ ext v26.16b, v18.16b , v20.16b , #8
+ add v24.8h, v24.8h , v26.8h
+
+ saddl v26.4s, v18.4h, v22.4h
+ smlal v26.4s, v4.4h, v28.4h
+ smlsl v26.4s, v24.4h, v30.4h
+
+ saddl2 v22.4s, v18.8h, v22.8h
+ smlal2 v22.4s, v4.8h, v28.8h
+ smlsl2 v22.4s, v24.8h, v30.8h
+
+ sqrshrun v18.4h, v26.4s, #10
+ sqrshrun v19.4h, v22.4s, #10
+ ld1 {v22.4s}, [x9], #16
+
+ uqxtn v18.8b, v18.8h
+ uqxtn v19.8b, v19.8h
+ mov v18.2s[1], v19.2s[0]
+
+
+ ext v24.16b, v20.16b , v22.16b , #4
+ ext v26.16b, v20.16b , v22.16b , #6
+ ext v4.16b, v20.16b , v22.16b , #10
+ st1 {v18.2s}, [x1]
+ add v18.8h, v24.8h , v26.8h
+ ext v24.16b, v20.16b , v22.16b , #2
+ ext v26.16b, v20.16b , v22.16b , #8
+ add v24.8h, v24.8h , v26.8h
+
+ saddl v26.4s, v4.4h, v20.4h
+ smlal v26.4s, v18.4h, v28.4h
+ smlsl v26.4s, v24.4h, v30.4h
+
+ saddl2 v22.4s, v4.8h, v20.8h
+ smlal2 v22.4s, v18.8h, v28.8h
+ smlsl2 v22.4s, v24.8h, v30.8h
+
+ sqrshrun v19.4h, v26.4s, #10
+ sqrshrun v18.4h, v22.4s, #10
+
+ uaddl v24.8h, v11.8b, v1.8b
+ ld1 {v20.4s}, [x6], #16
+ ld1 {v22.4s}, [x6], x7
+
+ uqxtn v19.8b, v19.8h
+ uqxtn v18.8b, v18.8h
+ mov v19.2s[1], v18.2s[0]
+
+ ld1 {v18.2s}, [x1]
+ sqrshrun v20.8b, v20.8h, #5
+ sqrshrun v21.8b, v22.8h, #5
+
+ uaddl v22.8h, v8.8b, v2.8b
+ ld1 {v4.2s, v5.2s}, [x0], #16 // Vector load from src[8_0]
+
+ urhadd v18.16b, v18.16b , v20.16b
+ urhadd v19.16b, v19.16b , v21.16b
+ ld1 {v14.2s}, [x0], x2 // Vector load from src[8_0]
+ uaddl v20.8h, v10.8b, v0.8b
+ uaddl v26.8h, v9.8b, v3.8b
+ st1 {v18.2s, v19.2s}, [x1], x3 // store row 2
+
+
+//ROW_4
+
+ uaddl v18.8h, v6.8b, v4.8b
+
+ mla v18.8h, v20.8h , v28.8h
+
+ uaddl v20.8h, v7.8b, v5.8b
+
+ mla v20.8h, v24.8h , v28.8h
+ uaddl v24.8h, v17.8b, v12.8b
+ mls v18.8h, v22.8h , v30.8h
+ uaddl v22.8h, v15.8b, v14.8b
+ mls v20.8h, v26.8h , v30.8h
+ uaddl v26.8h, v16.8b, v13.8b
+ mla v22.8h, v24.8h , v28.8h
+ mls v22.8h, v26.8h , v30.8h
+ st1 {v18.4s}, [x9], #16
+ st1 {v20.4s}, [x9], #16
+ ext v24.16b, v18.16b , v20.16b , #4
+ ext v26.16b, v18.16b , v20.16b , #6
+ st1 {v22.4s}, [x9]
+ ext v22.16b, v18.16b , v20.16b , #10
+ add v6.8h, v24.8h , v26.8h
+ ext v24.16b, v18.16b , v20.16b , #2
+ ext v26.16b, v18.16b , v20.16b , #8
+ add v24.8h, v24.8h , v26.8h
+
+ saddl v26.4s, v18.4h, v22.4h
+ smlal v26.4s, v6.4h, v28.4h
+ smlsl v26.4s, v24.4h, v30.4h
+
+ saddl2 v22.4s, v18.8h, v22.8h
+ smlal2 v22.4s, v6.8h, v28.8h
+ smlsl2 v22.4s, v24.8h, v30.8h
+
+ sqrshrun v18.4h, v26.4s, #10
+ sqrshrun v19.4h, v22.4s, #10
+ ld1 {v22.4s}, [x9], #16
+ uqxtn v18.8b, v18.8h
+ uqxtn v19.8b, v19.8h
+ mov v18.2s[1], v19.2s[0]
+
+
+ ext v24.16b, v20.16b , v22.16b , #4
+ ext v26.16b, v20.16b , v22.16b , #6
+ ext v6.16b, v20.16b , v22.16b , #10
+ st1 {v18.2s}, [x1]
+ add v18.8h, v24.8h , v26.8h
+ ext v24.16b, v20.16b , v22.16b , #2
+ ext v26.16b, v20.16b , v22.16b , #8
+ add v24.8h, v24.8h , v26.8h
+
+ saddl v26.4s, v6.4h, v20.4h
+ smlal v26.4s, v18.4h, v28.4h
+ smlsl v26.4s, v24.4h, v30.4h
+
+ saddl2 v22.4s, v6.8h, v20.8h
+ smlal2 v22.4s, v18.8h, v28.8h
+ smlsl2 v22.4s, v24.8h, v30.8h
+
+ mov v6.16b, v2.16b
+ mov v7.16b, v3.16b
+
+ mov v2.16b, v10.16b
+ mov v3.16b, v11.16b
+
+ subs x4, x4, #4
+ sqrshrun v19.4h, v26.4s, #10
+ sqrshrun v18.4h, v22.4s, #10
+ mov v10.16b, v0.16b
+ mov v11.16b, v1.16b
+
+ mov v24.8b, v14.8b
+
+ mov v14.16b, v12.16b
+ mov v15.16b, v13.16b
+
+
+ uqxtn v19.8b, v19.8h
+ uqxtn v18.8b, v18.8h
+ mov v19.2s[1], v18.2s[0]
+
+ ld1 {v20.4s}, [x6], #16
+ ld1 {v22.4s}, [x6], x7
+ ld1 {v18.2s}, [x1]
+ sqrshrun v20.8b, v20.8h, #5
+ sqrshrun v21.8b, v22.8h, #5
+
+ mov v0.16b, v8.16b
+ mov v1.16b, v9.16b
+
+ mov v8.16b, v4.16b
+ mov v9.16b, v5.16b
+
+ mov v12.16b, v16.16b
+ mov v13.16b, v17.16b
+ urhadd v18.16b, v18.16b , v20.16b
+ urhadd v19.16b, v19.16b , v21.16b
+
+ mov v4.16b, v10.16b
+ mov v5.16b, v11.16b
+
+ mov v16.8b, v24.8b
+ st1 {v18.2s, v19.2s}, [x1], x3 // store row 3
+
+ bgt loop_16 // looping if height =16
+ b end_func
+
+loop_8_start:
+ ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[0_0]
+ ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[1_0]
+ ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[2_0]
+ ld1 {v6.2s, v7.2s}, [x0], x2 // Vector load from src[3_0]
+ ld1 {v8.2s, v9.2s}, [x0], x2 // Vector load from src[4_0]
+
+loop_8:
+
+ ld1 {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0]
+ uaddl v14.8h, v4.8b, v6.8b
+ uaddl v12.8h, v0.8b, v10.8b
+ uaddl v16.8h, v2.8b, v8.8b
+ mla v12.8h, v14.8h , v26.8h
+ uaddl v18.8h, v5.8b, v7.8b
+ uaddl v14.8h, v1.8b, v11.8b
+ uaddl v22.8h, v3.8b, v9.8b
+ mla v14.8h, v18.8h , v26.8h
+ mls v12.8h, v16.8h , v24.8h
+ ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[6_0]
+ uaddl v16.8h, v6.8b, v8.8b
+ mls v14.8h, v22.8h , v24.8h
+ uaddl v28.8h, v2.8b, v0.8b
+ st1 {v12.4s}, [x9], #16 // store row 0 to temp buffer: col 0
+ ext v22.16b, v12.16b , v14.16b , #10
+ uaddl v18.8h, v4.8b, v10.8b
+ mla v28.8h, v16.8h , v26.8h
+ saddl v30.4s, v12.4h, v22.4h
+ st1 {v14.4s}, [x9], x7 // store row 0 to temp buffer: col 1
+ saddl2 v22.4s, v12.8h, v22.8h
+ ext v16.16b, v12.16b , v14.16b , #4
+ mls v28.8h, v18.8h , v24.8h
+ ext v18.16b, v12.16b , v14.16b , #6
+ ext v20.16b, v12.16b , v14.16b , #8
+ ext v14.16b, v12.16b , v14.16b , #2
+ add v16.8h, v16.8h , v18.8h
+ add v18.8h, v14.8h , v20.8h
+ uaddl v20.8h, v7.8b, v9.8b
+ smlal v30.4s, v16.4h, v26.4h
+ smlsl v30.4s, v18.4h, v24.4h
+ smlal2 v22.4s, v16.8h, v26.8h
+ smlsl2 v22.4s, v18.8h, v24.8h
+ uaddl v14.8h, v3.8b, v1.8b
+ st1 {v28.4s}, [x9], #16 // store row 1 to temp buffer: col 0
+ mla v14.8h, v20.8h , v26.8h
+ sqrshrun v12.4h, v30.4s, #10
+ uaddl v16.8h, v5.8b, v11.8b
+ sqrshrun v13.4h, v22.4s, #10
+ mls v14.8h, v16.8h , v24.8h
+ ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[7_0]
+ uqxtn v25.8b, v12.8h
+ uqxtn v13.8b, v13.8h
+ mov v25.2s[1], v13.2s[0]
+ uaddl v16.8h, v8.8b, v10.8b
+
+
+ ext v22.16b, v28.16b , v14.16b , #10
+ uaddl v20.8h, v4.8b, v2.8b
+ saddl v30.4s, v28.4h, v22.4h
+ mla v20.8h, v16.8h , v26.8h
+ st1 {v14.4s}, [x9], x7 // store row 1 to temp buffer: col 1
+ saddl2 v22.4s, v28.8h, v22.8h
+ ext v16.16b, v28.16b , v14.16b , #4
+ ext v18.16b, v28.16b , v14.16b , #6
+ ext v12.16b, v28.16b , v14.16b , #8
+ ext v14.16b, v28.16b , v14.16b , #2
+ add v16.8h, v16.8h , v18.8h
+ add v18.8h, v12.8h , v14.8h
+ ld1 {v14.4s, v15.4s}, [x6], x8 // load row 0 from temp buffer
+ smlal v30.4s, v16.4h, v26.4h
+ smlsl v30.4s, v18.4h, v24.4h
+ smlal2 v22.4s, v16.8h, v26.8h
+ smlsl2 v22.4s, v18.8h, v24.8h
+ sqrshrun v14.8b, v14.8h, #0x5
+ ld1 {v28.4s, v29.4s}, [x6], x8 // load row 1 from temp buffer
+ uaddl v18.8h, v6.8b, v0.8b
+ sqrshrun v16.4h, v30.4s, #10
+ sqrshrun v15.8b, v28.8h, #0x5
+ sqrshrun v17.4h, v22.4s, #10
+
+ mov v12.8b, v25.8b
+ mov v25.8b, v24.8b
+
+ uaddl v28.8h, v9.8b, v11.8b
+ uqxtn v13.8b, v16.8h
+ uqxtn v17.8b, v17.8h
+ mov v13.2s[1], v17.2s[0]
+
+ urhadd v12.16b, v12.16b , v14.16b
+ urhadd v13.16b, v13.16b , v15.16b
+ uaddl v14.8h, v5.8b, v3.8b
+ uaddl v22.8h, v7.8b, v1.8b
+ mls v20.8h, v18.8h , v24.8h
+ st1 {v12.2s}, [x1], x3 // store row 0
+ mla v14.8h, v28.8h , v26.8h
+ ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[8_0]
+ uaddl v30.8h, v10.8b, v0.8b
+ uaddl v28.8h, v6.8b, v4.8b
+ mls v14.8h, v22.8h , v24.8h
+ st1 {v13.2s}, [x1], x3 // store row 1
+ mla v28.8h, v30.8h , v26.8h
+ st1 {v20.4s}, [x9], #16 // store row 2 to temp buffer: col 0
+ ext v22.16b, v20.16b , v14.16b , #10
+ saddl v30.4s, v20.4h, v22.4h
+    st1 {v14.2s, v15.2s}, [x9], x7 // store row 2 to temp buffer: col 1
+ saddl2 v22.4s, v20.8h, v22.8h
+ ext v16.16b, v20.16b , v14.16b , #4
+ ext v18.16b, v20.16b , v14.16b , #6
+ ext v12.16b, v20.16b , v14.16b , #8
+ ext v14.16b, v20.16b , v14.16b , #2
+ add v16.8h, v16.8h , v18.8h
+ add v18.8h, v14.8h , v12.8h
+ uaddl v20.8h, v8.8b, v2.8b
+ smlal v30.4s, v16.4h, v26.4h
+ smlsl v30.4s, v18.4h, v24.4h
+ smlal2 v22.4s, v16.8h, v26.8h
+ smlsl2 v22.4s, v18.8h, v24.8h
+ uaddl v18.8h, v11.8b, v1.8b
+ uaddl v16.8h, v7.8b, v5.8b
+ sqrshrun v12.4h, v30.4s, #10
+ uaddl v30.8h, v9.8b, v3.8b
+ mla v16.8h, v18.8h , v26.8h
+ sqrshrun v13.4h, v22.4s, #10
+ mls v28.8h, v20.8h , v24.8h
+ ld1 {v14.4s, v15.4s}, [x6], x8 // load row 2 from temp buffer
+ mls v16.8h, v30.8h , v24.8h
+ uqxtn v27.8b, v12.8h
+ uqxtn v13.8b, v13.8h
+ mov v27.2s[1], v13.2s[0]
+
+ sqrshrun v14.8b, v14.8h, #5
+ ext v22.16b, v28.16b , v16.16b , #10
+ st1 {v28.4s}, [x9], #16 // store row 3 to temp buffer: col 0
+ saddl v30.4s, v28.4h, v22.4h
+ st1 {v16.2s, v17.2s}, [x9], x7 // store row 3 to temp buffer: col 1
+ saddl2 v22.4s, v28.8h, v22.8h
+ ext v12.16b, v28.16b , v16.16b , #4
+ ext v18.16b, v28.16b , v16.16b , #6
+ ext v20.16b, v28.16b , v16.16b , #8
+ ext v28.16b, v28.16b , v16.16b , #2
+ add v12.8h, v12.8h , v18.8h
+ add v18.8h, v28.8h , v20.8h
+ ld1 {v16.4s, v17.4s}, [x6], x8 // load row 3 from temp buffer
+ smlal v30.4s, v12.4h, v26.4h
+ smlsl v30.4s, v18.4h, v24.4h
+ smlal2 v22.4s, v12.8h, v26.8h
+ smlsl2 v22.4s, v18.8h, v24.8h
+ sqrshrun v15.8b, v16.8h, #0x5
+
+ mov v12.8b, v27.8b
+ mov v27.8b, v26.8b
+
+ sqrshrun v16.4h, v30.4s, #10
+
+ mov v6.16b, v2.16b
+ mov v7.16b, v3.16b
+
+ sqrshrun v17.4h, v22.4s, #10
+
+ mov v2.16b, v10.16b
+ mov v3.16b, v11.16b
+
+ mov v10.16b, v0.16b
+ mov v11.16b, v1.16b
+
+ subs x4, x4, #4
+ uqxtn v13.8b, v16.8h
+ uqxtn v17.8b, v17.8h
+ mov v13.2s[1], v17.2s[0]
+ urhadd v12.16b, v12.16b , v14.16b
+ urhadd v13.16b, v13.16b , v15.16b
+
+ mov v0.16b, v8.16b
+ mov v1.16b, v9.16b
+
+ mov v8.16b, v4.16b
+ mov v9.16b, v5.16b
+
+ mov v4.16b, v10.16b
+ mov v5.16b, v11.16b
+
+ st1 {v12.2s}, [x1], x3 // store row 2
+ st1 {v13.2s}, [x1], x3 // store row 3
+
+ bgt loop_8 //if height =8 loop
+ b end_func
+
+loop_4_start:
+ ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[0_0]
+ ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[1_0]
+ ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[2_0]
+ ld1 {v6.2s, v7.2s}, [x0], x2 // Vector load from src[3_0]
+ ld1 {v8.2s, v9.2s}, [x0], x2 // Vector load from src[4_0]
+
+loop_4:
+ ld1 {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0]
+ uaddl v14.8h, v4.8b, v6.8b // temp1 = src[2_0] + src[3_0]
+ uaddl v12.8h, v0.8b, v10.8b // temp = src[0_0] + src[5_0]
+ uaddl v16.8h, v2.8b, v8.8b // temp2 = src[1_0] + src[4_0]
+ mla v12.8h, v14.8h , v26.8h // temp += temp1 * 20
+ uaddl v18.8h, v5.8b, v7.8b // temp1 = src[2_0] + src[3_0]
+ uaddl v14.8h, v1.8b, v11.8b // temp = src[0_0] + src[5_0]
+ uaddl v22.8h, v3.8b, v9.8b // temp2 = src[1_0] + src[4_0]
+ mla v14.8h, v18.8h , v26.8h // temp += temp1 * 20
+ mls v12.8h, v16.8h , v24.8h // temp -= temp2 * 5
+ ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[6_0]
+ uaddl v16.8h, v6.8b, v8.8b
+ mls v14.8h, v22.8h , v24.8h // temp -= temp2 * 5
+ //Q6 and Q7 have filtered values
+ uaddl v28.8h, v2.8b, v0.8b
+ st1 {v12.4s}, [x9], #16 // store row 0 to temp buffer: col 0
+ ext v22.16b, v12.16b , v14.16b , #10
+ uaddl v18.8h, v4.8b, v10.8b
+ mla v28.8h, v16.8h , v26.8h
+ saddl v30.4s, v12.4h, v22.4h
+ st1 {v14.4s}, [x9], x7 // store row 0 to temp buffer: col 1
+ saddl v22.4s, v13.4h, v23.4h
+ ext v16.16b, v12.16b , v14.16b , #4
+ mls v28.8h, v18.8h , v24.8h
+ ext v18.16b, v12.16b , v14.16b , #6
+ ext v20.16b, v12.16b , v14.16b , #8
+ ext v14.16b, v12.16b , v14.16b , #2
+ add v16.8h, v16.8h , v18.8h
+ add v18.8h, v14.8h , v20.8h
+ uaddl v20.8h, v7.8b, v9.8b
+ smlal v30.4s, v16.4h, v26.4h
+ smlsl v30.4s, v18.4h, v24.4h
+ smlal v22.4s, v17.4h, v26.4h
+ smlsl v22.4s, v19.4h, v24.4h
+ uaddl v14.8h, v3.8b, v1.8b
+ st1 {v28.4s}, [x9], #16 // store row 1 to temp buffer: col 0
+ mla v14.8h, v20.8h , v26.8h
+ sqrshrun v12.4h, v30.4s, #10
+ uaddl v16.8h, v5.8b, v11.8b
+ sqrshrun v13.4h, v22.4s, #10
+ mls v14.8h, v16.8h , v24.8h
+ ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[7_0]
+ uqxtn v25.8b, v12.8h
+ uaddl v16.8h, v8.8b, v10.8b
+
+ ext v22.16b, v28.16b , v14.16b , #10
+ uaddl v20.8h, v4.8b, v2.8b
+ saddl v30.4s, v28.4h, v22.4h
+ mla v20.8h, v16.8h , v26.8h
+ st1 {v14.4s}, [x9], x7 // store row 1 to temp buffer: col 1
+ saddl v22.4s, v29.4h, v23.4h
+ ext v16.16b, v28.16b , v14.16b , #4
+ ext v18.16b, v28.16b , v14.16b , #6
+ ext v12.16b, v28.16b , v14.16b , #8
+ ext v14.16b, v28.16b , v14.16b , #2
+ add v16.8h, v16.8h , v18.8h
+ add v18.8h, v12.8h , v14.8h
+ ld1 {v14.2s}, [x6], x8 //load row 0 from temp buffer
+ smlal v30.4s, v16.4h, v26.4h
+ smlsl v30.4s, v18.4h, v24.4h
+ smlal v22.4s, v17.4h, v26.4h
+ smlsl v22.4s, v19.4h, v24.4h
+ sqrshrun v14.8b, v14.8h, #0x5
+ ld1 {v28.2s}, [x6], x8 //load row 1 from temp buffer
+ uaddl v18.8h, v6.8b, v0.8b
+ sqrshrun v16.4h, v30.4s, #10
+ sqrshrun v15.8b, v28.8h, #0x5
+ sqrshrun v17.4h, v22.4s, #10
+
+ mov v12.8b, v25.8b
+ mov v25.8b, v24.8b
+
+ uaddl v28.8h, v9.8b, v11.8b
+ uqxtn v13.8b, v16.8h
+
+ urhadd v12.16b, v12.16b , v14.16b
+ urhadd v13.16b, v13.16b , v15.16b
+
+ uaddl v14.8h, v5.8b, v3.8b
+ uaddl v22.8h, v7.8b, v1.8b
+ mls v20.8h, v18.8h , v24.8h
+ st1 {v12.s}[0], [x1], x3 // store row 0
+ mla v14.8h, v28.8h , v26.8h
+ ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[8_0]
+ uaddl v30.8h, v10.8b, v0.8b
+ uaddl v28.8h, v6.8b, v4.8b
+ mls v14.8h, v22.8h , v24.8h
+ st1 {v13.s}[0], [x1], x3 //store row 1
+ mla v28.8h, v30.8h , v26.8h
+ st1 {v20.4s}, [x9], #16 // store row 2 to temp buffer: col 0
+ ext v22.16b, v20.16b , v14.16b , #10
+ saddl v30.4s, v20.4h, v22.4h
+ st1 {v14.4s}, [x9], x7 // store row 2 to temp buffer: col 1
+ saddl v22.4s, v21.4h, v23.4h
+ ext v16.16b, v20.16b , v14.16b , #4
+ ext v18.16b, v20.16b , v14.16b , #6
+ ext v12.16b, v20.16b , v14.16b , #8
+ ext v14.16b, v20.16b , v14.16b , #2
+ add v16.8h, v16.8h , v18.8h
+ add v18.8h, v14.8h , v12.8h
+ uaddl v20.8h, v8.8b, v2.8b
+ smlal v30.4s, v16.4h, v26.4h
+ smlsl v30.4s, v18.4h, v24.4h
+ smlal v22.4s, v17.4h, v26.4h
+ smlsl v22.4s, v19.4h, v24.4h
+ uaddl v18.8h, v11.8b, v1.8b
+ uaddl v16.8h, v7.8b, v5.8b
+ sqrshrun v12.4h, v30.4s, #10
+ uaddl v30.8h, v9.8b, v3.8b
+ mla v16.8h, v18.8h , v26.8h
+ sqrshrun v13.4h, v22.4s, #10
+ mls v28.8h, v20.8h , v24.8h
+    ld1 {v14.2s}, [x6], x8 //load row 2 from temp buffer
+ mls v16.8h, v30.8h , v24.8h
+ uqxtn v27.8b, v12.8h
+ sqrshrun v14.8b, v14.8h, #5
+ ext v22.16b, v28.16b , v16.16b , #10
+ st1 {v28.4s}, [x9], #16 // store row 3 to temp buffer: col 0
+ saddl v30.4s, v28.4h, v22.4h
+ st1 {v16.4s}, [x9], x7 // store row 3 to temp buffer: col 1
+ saddl v22.4s, v29.4h, v23.4h
+ ext v12.16b, v28.16b , v16.16b , #4
+ ext v18.16b, v28.16b , v16.16b , #6
+ ext v20.16b, v28.16b , v16.16b , #8
+ ext v28.16b, v28.16b , v16.16b , #2
+ add v12.8h, v12.8h , v18.8h
+ add v18.8h, v28.8h , v20.8h
+    ld1 {v16.2s}, [x6], x8 //load row 3 from temp buffer
+ smlal v30.4s, v12.4h, v26.4h
+ smlsl v30.4s, v18.4h, v24.4h
+ smlal v22.4s, v13.4h, v26.4h
+ smlsl v22.4s, v19.4h, v24.4h
+ sqrshrun v15.8b, v16.8h, #0x5
+
+ mov v12.8b, v27.8b
+ mov v27.8b, v26.8b
+
+ sqrshrun v16.4h, v30.4s, #10
+
+ mov v6.16b, v2.16b
+ mov v7.16b, v3.16b
+
+ sqrshrun v17.4h, v22.4s, #10
+
+ mov v2.16b, v10.16b
+ mov v3.16b, v11.16b
+
+ mov v10.16b, v0.16b
+ mov v11.16b, v1.16b
+
+ subs x4, x4, #4
+ uqxtn v13.8b, v16.8h
+ urhadd v12.16b, v12.16b , v14.16b
+ urhadd v13.16b, v13.16b , v15.16b
+
+ mov v0.16b, v8.16b
+ mov v1.16b, v9.16b
+
+ mov v8.16b, v4.16b
+ mov v9.16b, v5.16b
+
+
+ mov v4.16b, v10.16b
+ mov v5.16b, v11.16b
+
+
+ st1 {v12.s}[0], [x1], x3 // store row 2
+ st1 {v13.s}[0], [x1], x3 // store row 3
+
+ bgt loop_4
+
+end_func:
+ // LDMFD sp!,{x4-x12,PC} //Restoring registers from stack
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s
new file mode 100755
index 0000000..ab663d0
--- /dev/null
+++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s
@@ -0,0 +1,958 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+//******************************************************************************
+//* @file
+//* ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s
+//*
+//* @brief
+//* Contains function definitions for inter prediction interpolation.
+//*
+//* @author
+//* Mohit
+//*
+//* @par List of Functions:
+//*
+//* - ih264_inter_pred_luma_horz_qpel_vert_qpel_av8()
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+///* All the functions here are replicated from ih264_inter_pred_filters.c
+//
+
+///**
+///**
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* This function implements two six tap filters. It
+//* applies the six tap filter in the horizontal direction on the
+//* predictor values, then applies the same filter in the
+//* vertical direction on the predictor values. It then averages these
+//* two outputs to obtain quarter pel values in horizontal and vertical direction.
+//* The six tap filtering operation is described in sec 8.4.2.2.1 titled
+//* "Luma sample interpolation process"
+//*
+//* @par Description:
+//* This function is called to obtain pixels lying at the following
+//* location (1/4,1/4) or (3/4,1/4) or (1/4,3/4) or (3/4,3/4).
+//* The function interpolates the predictors first in the horizontal direction
+//* and then in the vertical direction, and then averages these two
+//* values.
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @param[in] pu1_tmp: temporary buffer
+//*
+//* @param[in] dydx: x and y reference offset for qpel calculations
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/;
+
+//void ih264_inter_pred_luma_horz_qpel_vert_qpel(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ht,
+// WORD32 wd,
+// UWORD8* pu1_tmp,
+// UWORD32 dydx)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ht
+// x5 => wd
+// x6 => dydx
+
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+
+
+ .global ih264_inter_pred_luma_horz_qpel_vert_qpel_av8
+
+ih264_inter_pred_luma_horz_qpel_vert_qpel_av8:
+
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+ mov x6, x7
+ and x7, x6, #3
+ add x7, x0, x7, lsr #1 //pu1_pred_vert = pu1_src + (x_offset>>1)
+
+ and x6, x6, #12 //Finds y-offset
+    lsr       x6, x6, #3 //(dydx & 12) >> 3 == y_offset >> 1
+ mul x6, x2, x6
+ add x6, x0, x6 //pu1_pred_horz = pu1_src + (y_offset>>1)*src_strd
+ sub x7, x7, x2, lsl #1 //pu1_pred_vert-2*src_strd
+ sub x6, x6, #2 //pu1_pred_horz-2
+ movi v30.8b, #20 // Filter coeff 20
+ movi v31.8b, #5 // Filter coeff 5
+
+ subs x12, x5, #4 //if wd=4 branch to loop_4
+ beq loop_4_start
+ subs x12, x5, #8 //if wd=8 branch to loop_8
+ beq loop_8_start
+
+ ld1 {v0.2s, v1.2s}, [x7], x2 // Vector load from src[0_0]
+ ld1 {v2.2s, v3.2s}, [x7], x2 // Vector load from src[1_0]
+
+ ld1 {v4.2s, v5.2s}, [x7], x2 // Vector load from src[2_0]
+ ld1 {v6.2s, v7.2s}, [x7], x2 // Vector load from src[3_0]
+ ld1 {v8.2s, v9.2s}, [x7], x2 // Vector load from src[4_0]
+ add x11, x6, #8
+loop_16:
+ ld1 {v10.2s, v11.2s}, [x7], x2 // Vector load from src[5_0]
+ ld1 {v18.2s, v19.2s}, [x6], x2 // horz row0, col 0
+ uaddl v24.8h, v0.8b, v10.8b
+ umlal v24.8h, v4.8b, v30.8b
+ umlal v24.8h, v6.8b, v30.8b
+ umlsl v24.8h, v2.8b, v31.8b
+ umlsl v24.8h, v8.8b, v31.8b
+ ext v23.8b, v18.8b , v19.8b , #5
+ ext v20.8b, v18.8b , v19.8b , #2
+ ext v21.8b, v18.8b , v19.8b , #3
+ ext v22.8b, v18.8b , v19.8b , #4
+ ext v19.8b, v18.8b , v19.8b , #1
+ sqrshrun v26.8b, v24.8h, #5
+ uaddl v28.8h, v18.8b, v23.8b
+ umlal v28.8h, v20.8b, v30.8b
+ umlal v28.8h, v21.8b, v30.8b
+ umlsl v28.8h, v19.8b, v31.8b
+ umlsl v28.8h, v22.8b, v31.8b
+ ld1 {v18.2s, v19.2s}, [x11], x2 // horz row 0, col 1
+ uaddl v24.8h, v1.8b, v11.8b
+ umlal v24.8h, v5.8b, v30.8b
+ umlal v24.8h, v7.8b, v30.8b
+ umlsl v24.8h, v3.8b, v31.8b
+ umlsl v24.8h, v9.8b, v31.8b
+ sqrshrun v28.8b, v28.8h, #5
+ ext v23.8b, v18.8b , v19.8b , #5
+ ext v20.8b, v18.8b , v19.8b , #2
+ ext v21.8b, v18.8b , v19.8b , #3
+ ext v22.8b, v18.8b , v19.8b , #4
+ ext v19.8b, v18.8b , v19.8b , #1
+
+ sqrshrun v27.8b, v24.8h, #5
+ ld1 {v12.2s, v13.2s}, [x7], x2 // src[6_0]
+
+ uaddl v24.8h, v18.8b, v23.8b
+ umlal v24.8h, v20.8b, v30.8b
+ umlal v24.8h, v21.8b, v30.8b
+ umlsl v24.8h, v19.8b, v31.8b
+ umlsl v24.8h, v22.8b, v31.8b
+
+ uaddl v16.8h, v2.8b, v12.8b
+ umlal v16.8h, v6.8b, v30.8b
+ umlal v16.8h, v8.8b, v30.8b
+ umlsl v16.8h, v4.8b, v31.8b
+ umlsl v16.8h, v10.8b, v31.8b
+
+ sqrshrun v29.8b, v24.8h, #5
+ ld1 {v18.2s, v19.2s}, [x6], x2 // horz row 1, col 0
+
+ uaddl v24.8h, v3.8b, v13.8b
+ umlal v24.8h, v7.8b, v30.8b
+ umlal v24.8h, v9.8b, v30.8b
+ umlsl v24.8h, v5.8b, v31.8b
+ umlsl v24.8h, v11.8b, v31.8b
+ urhadd v28.16b, v28.16b , v26.16b
+ urhadd v29.16b, v29.16b , v27.16b
+ sqrshrun v26.8b, v16.8h, #5
+ ext v23.8b, v18.8b , v19.8b , #5
+ ext v20.8b, v18.8b , v19.8b , #2
+ ext v21.8b, v18.8b , v19.8b , #3
+ ext v22.8b, v18.8b , v19.8b , #4
+ st1 {v28.2s, v29.2s}, [x1], x3 // store row 0
+ ext v19.8b, v18.8b , v19.8b , #1
+
+ sqrshrun v27.8b, v24.8h, #5
+
+ uaddl v28.8h, v18.8b, v23.8b
+ umlal v28.8h, v20.8b, v30.8b
+ umlal v28.8h, v21.8b, v30.8b
+ umlsl v28.8h, v19.8b, v31.8b
+ umlsl v28.8h, v22.8b, v31.8b
+
+ ld1 {v18.2s, v19.2s}, [x11], x2 // horz row 1, col 1
+ ld1 {v14.2s, v15.2s}, [x7], x2 // src[7_0]
+
+ ext v23.8b, v18.8b , v19.8b , #5
+ ext v20.8b, v18.8b , v19.8b , #2
+ ext v21.8b, v18.8b , v19.8b , #3
+ ext v22.8b, v18.8b , v19.8b , #4
+ ext v19.8b, v18.8b , v19.8b , #1
+
+ sqrshrun v28.8b, v28.8h, #5
+ uaddl v24.8h, v18.8b, v23.8b
+ umlal v24.8h, v20.8b, v30.8b
+ umlal v24.8h, v21.8b, v30.8b
+ umlsl v24.8h, v19.8b, v31.8b
+ umlsl v24.8h, v22.8b, v31.8b
+
+ ld1 {v18.2s, v19.2s}, [x6], x2 // horz row 2, col 0
+ uaddl v16.8h, v4.8b, v14.8b
+ umlal v16.8h, v8.8b, v30.8b
+ umlal v16.8h, v10.8b, v30.8b
+ umlsl v16.8h, v6.8b, v31.8b
+ umlsl v16.8h, v12.8b, v31.8b
+
+ sqrshrun v29.8b, v24.8h, #5
+ ext v23.8b, v18.8b , v19.8b , #5
+ ext v20.8b, v18.8b , v19.8b , #2
+ ext v21.8b, v18.8b , v19.8b , #3
+ ext v22.8b, v18.8b , v19.8b , #4
+ ext v19.8b, v18.8b , v19.8b , #1
+ urhadd v28.16b, v28.16b , v26.16b
+ urhadd v29.16b, v29.16b , v27.16b
+ sqrshrun v26.8b, v16.8h, #5
+
+ uaddl v24.8h, v5.8b, v15.8b
+ umlal v24.8h, v9.8b, v30.8b
+ umlal v24.8h, v11.8b, v30.8b
+ umlsl v24.8h, v7.8b, v31.8b
+ umlsl v24.8h, v13.8b, v31.8b
+
+ st1 {v28.2s, v29.2s}, [x1], x3 // store row 1
+
+ uaddl v28.8h, v18.8b, v23.8b
+ umlal v28.8h, v20.8b, v30.8b
+ umlal v28.8h, v21.8b, v30.8b
+ umlsl v28.8h, v19.8b, v31.8b
+ umlsl v28.8h, v22.8b, v31.8b
+
+ ld1 {v18.2s, v19.2s}, [x11], x2 // horz row 2, col 1
+ sqrshrun v27.8b, v24.8h, #5
+ ext v23.8b, v18.8b , v19.8b , #5
+ ext v20.8b, v18.8b , v19.8b , #2
+ ext v21.8b, v18.8b , v19.8b , #3
+ ext v22.8b, v18.8b , v19.8b , #4
+ ext v19.8b, v18.8b , v19.8b , #1
+
+ sqrshrun v28.8b, v28.8h, #5
+ ld1 {v16.2s, v17.2s}, [x7], x2 // src[8_0]
+ uaddl v24.8h, v18.8b, v23.8b
+ umlal v24.8h, v20.8b, v30.8b
+ umlal v24.8h, v21.8b, v30.8b
+ umlsl v24.8h, v19.8b, v31.8b
+ umlsl v24.8h, v22.8b, v31.8b
+
+ ld1 {v18.2s, v19.2s}, [x6], x2 // horz row 3, col 0
+ uaddl v0.8h, v6.8b, v16.8b
+ umlal v0.8h, v10.8b, v30.8b
+ umlal v0.8h, v12.8b, v30.8b
+ umlsl v0.8h, v8.8b, v31.8b
+ umlsl v0.8h, v14.8b, v31.8b
+
+ sqrshrun v29.8b, v24.8h, #5
+
+ ext v23.8b, v18.8b , v19.8b , #5
+ ext v20.8b, v18.8b , v19.8b , #2
+ ext v21.8b, v18.8b , v19.8b , #3
+ urhadd v28.16b, v28.16b , v26.16b
+ urhadd v29.16b, v29.16b , v27.16b
+ ext v22.8b, v18.8b , v19.8b , #4
+ ext v19.8b, v18.8b , v19.8b , #1
+ sqrshrun v26.8b, v0.8h, #5
+ st1 {v28.2s, v29.2s}, [x1], x3 // store row 2
+
+ uaddl v24.8h, v18.8b, v23.8b
+ umlal v24.8h, v20.8b, v30.8b
+ umlal v24.8h, v21.8b, v30.8b
+ umlsl v24.8h, v19.8b, v31.8b
+ umlsl v24.8h, v22.8b, v31.8b
+
+ ld1 {v18.2s, v19.2s}, [x11], x2 // horz row 3, col 1
+
+ uaddl v0.8h, v7.8b, v17.8b
+ umlal v0.8h, v11.8b, v30.8b
+ umlal v0.8h, v13.8b, v30.8b
+ umlsl v0.8h, v9.8b, v31.8b
+ umlsl v0.8h, v15.8b, v31.8b
+
+ sqrshrun v28.8b, v24.8h, #5
+
+ ext v23.8b, v18.8b , v19.8b , #5
+ ext v20.8b, v18.8b , v19.8b , #2
+ ext v21.8b, v18.8b , v19.8b , #3
+ ext v22.8b, v18.8b , v19.8b , #4
+ ext v19.8b, v18.8b , v19.8b , #1
+
+ sqrshrun v27.8b, v0.8h, #5
+
+ uaddl v24.8h, v18.8b, v23.8b
+ umlal v24.8h, v20.8b, v30.8b
+ umlal v24.8h, v21.8b, v30.8b
+ umlsl v24.8h, v19.8b, v31.8b
+ umlsl v24.8h, v22.8b, v31.8b
+
+ mov v0.16b, v8.16b
+ mov v1.16b, v9.16b
+
+ mov v2.16b, v10.16b
+ mov v3.16b, v11.16b
+
+ mov v4.16b, v12.16b
+ mov v5.16b, v13.16b
+
+ mov v6.16b, v14.16b
+ mov v7.16b, v15.16b
+
+ mov v8.16b, v16.16b
+ mov v9.16b, v17.16b
+
+ sqrshrun v29.8b, v24.8h, #5
+ urhadd v28.16b, v28.16b , v26.16b
+ urhadd v29.16b, v29.16b , v27.16b
+ st1 {v28.2s, v29.2s}, [x1], x3 // store row 3
+
+ ld1 {v10.2s, v11.2s}, [x7], x2 // Vector load from src[9_0]
+ ld1 {v18.2s, v19.2s}, [x6], x2 // horz row4, col 0
+ uaddl v24.8h, v0.8b, v10.8b
+ umlal v24.8h, v4.8b, v30.8b
+ umlal v24.8h, v6.8b, v30.8b
+ umlsl v24.8h, v2.8b, v31.8b
+ umlsl v24.8h, v8.8b, v31.8b
+ ext v23.8b, v18.8b , v19.8b , #5
+ ext v20.8b, v18.8b , v19.8b , #2
+ ext v21.8b, v18.8b , v19.8b , #3
+ ext v22.8b, v18.8b , v19.8b , #4
+ ext v19.8b, v18.8b , v19.8b , #1
+ sqrshrun v26.8b, v24.8h, #5
+ uaddl v28.8h, v18.8b, v23.8b
+ umlal v28.8h, v20.8b, v30.8b
+ umlal v28.8h, v21.8b, v30.8b
+ umlsl v28.8h, v19.8b, v31.8b
+ umlsl v28.8h, v22.8b, v31.8b
+ ld1 {v18.2s, v19.2s}, [x11], x2 // horz row 4, col 1
+ uaddl v24.8h, v1.8b, v11.8b
+ umlal v24.8h, v5.8b, v30.8b
+ umlal v24.8h, v7.8b, v30.8b
+ umlsl v24.8h, v3.8b, v31.8b
+ umlsl v24.8h, v9.8b, v31.8b
+ sqrshrun v28.8b, v28.8h, #5
+ ext v23.8b, v18.8b , v19.8b , #5
+ ext v20.8b, v18.8b , v19.8b , #2
+ ext v21.8b, v18.8b , v19.8b , #3
+ ext v22.8b, v18.8b , v19.8b , #4
+ ext v19.8b, v18.8b , v19.8b , #1
+
+ sqrshrun v27.8b, v24.8h, #5
+ ld1 {v12.2s, v13.2s}, [x7], x2 // src[10_0]
+ uaddl v24.8h, v18.8b, v23.8b
+ umlal v24.8h, v20.8b, v30.8b
+ umlal v24.8h, v21.8b, v30.8b
+ umlsl v24.8h, v19.8b, v31.8b
+ umlsl v24.8h, v22.8b, v31.8b
+ uaddl v16.8h, v2.8b, v12.8b
+ umlal v16.8h, v6.8b, v30.8b
+ umlal v16.8h, v8.8b, v30.8b
+ umlsl v16.8h, v4.8b, v31.8b
+ umlsl v16.8h, v10.8b, v31.8b
+ sqrshrun v29.8b, v24.8h, #5
+ ld1 {v18.2s, v19.2s}, [x6], x2 // horz row 5, col 0
+ uaddl v24.8h, v3.8b, v13.8b
+ umlal v24.8h, v7.8b, v30.8b
+ umlal v24.8h, v9.8b, v30.8b
+ umlsl v24.8h, v5.8b, v31.8b
+ umlsl v24.8h, v11.8b, v31.8b
+ urhadd v28.16b, v28.16b , v26.16b
+ urhadd v29.16b, v29.16b , v27.16b
+ sqrshrun v26.8b, v16.8h, #5
+ ext v23.8b, v18.8b , v19.8b , #5
+ ext v20.8b, v18.8b , v19.8b , #2
+ ext v21.8b, v18.8b , v19.8b , #3
+ ext v22.8b, v18.8b , v19.8b , #4
+ st1 {v28.2s, v29.2s}, [x1], x3 // store row 4
+ ext v19.8b, v18.8b , v19.8b , #1
+
+ sqrshrun v27.8b, v24.8h, #5
+
+ uaddl v28.8h, v18.8b, v23.8b
+ umlal v28.8h, v20.8b, v30.8b
+ umlal v28.8h, v21.8b, v30.8b
+ umlsl v28.8h, v19.8b, v31.8b
+ umlsl v28.8h, v22.8b, v31.8b
+
+ ld1 {v18.2s, v19.2s}, [x11], x2 // horz row 5, col 1
+ ld1 {v14.2s, v15.2s}, [x7], x2 // src[11_0]
+
+ ext v23.8b, v18.8b , v19.8b , #5
+ ext v20.8b, v18.8b , v19.8b , #2
+ ext v21.8b, v18.8b , v19.8b , #3
+ ext v22.8b, v18.8b , v19.8b , #4
+ ext v19.8b, v18.8b , v19.8b , #1
+
+ sqrshrun v28.8b, v28.8h, #5
+ uaddl v24.8h, v18.8b, v23.8b
+ umlal v24.8h, v20.8b, v30.8b
+ umlal v24.8h, v21.8b, v30.8b
+ umlsl v24.8h, v19.8b, v31.8b
+ umlsl v24.8h, v22.8b, v31.8b
+
+ ld1 {v18.2s, v19.2s}, [x6], x2 // horz row 6, col 0
+ uaddl v16.8h, v4.8b, v14.8b
+ umlal v16.8h, v8.8b, v30.8b
+ umlal v16.8h, v10.8b, v30.8b
+ umlsl v16.8h, v6.8b, v31.8b
+ umlsl v16.8h, v12.8b, v31.8b
+
+ sqrshrun v29.8b, v24.8h, #5
+ ext v23.8b, v18.8b , v19.8b , #5
+ ext v20.8b, v18.8b , v19.8b , #2
+ ext v21.8b, v18.8b , v19.8b , #3
+ ext v22.8b, v18.8b , v19.8b , #4
+ ext v19.8b, v18.8b , v19.8b , #1
+ urhadd v28.16b, v28.16b , v26.16b
+ urhadd v29.16b, v29.16b , v27.16b
+ sqrshrun v26.8b, v16.8h, #5
+
+ uaddl v24.8h, v5.8b, v15.8b
+ umlal v24.8h, v9.8b, v30.8b
+ umlal v24.8h, v11.8b, v30.8b
+ umlsl v24.8h, v7.8b, v31.8b
+ umlsl v24.8h, v13.8b, v31.8b
+
+ st1 {v28.2s, v29.2s}, [x1], x3 // store row 5
+
+ uaddl v28.8h, v18.8b, v23.8b
+ umlal v28.8h, v20.8b, v30.8b
+ umlal v28.8h, v21.8b, v30.8b
+ umlsl v28.8h, v19.8b, v31.8b
+ umlsl v28.8h, v22.8b, v31.8b
+
+ ld1 {v18.2s, v19.2s}, [x11], x2 // horz row 6, col 1
+ sqrshrun v27.8b, v24.8h, #5
+ ext v23.8b, v18.8b , v19.8b , #5
+ ext v20.8b, v18.8b , v19.8b , #2
+ ext v21.8b, v18.8b , v19.8b , #3
+ ext v22.8b, v18.8b , v19.8b , #4
+ ext v19.8b, v18.8b , v19.8b , #1
+
+ sqrshrun v28.8b, v28.8h, #5
+ ld1 {v16.2s, v17.2s}, [x7], x2 // src[12_0]
+ uaddl v24.8h, v18.8b, v23.8b
+ umlal v24.8h, v20.8b, v30.8b
+ umlal v24.8h, v21.8b, v30.8b
+ umlsl v24.8h, v19.8b, v31.8b
+ umlsl v24.8h, v22.8b, v31.8b
+
+ ld1 {v18.2s, v19.2s}, [x6], x2 // horz row 7, col 0
+ uaddl v0.8h, v6.8b, v16.8b
+ umlal v0.8h, v10.8b, v30.8b
+ umlal v0.8h, v12.8b, v30.8b
+ umlsl v0.8h, v8.8b, v31.8b
+ umlsl v0.8h, v14.8b, v31.8b
+
+ sqrshrun v29.8b, v24.8h, #5
+
+ ext v23.8b, v18.8b , v19.8b , #5
+ ext v20.8b, v18.8b , v19.8b , #2
+ ext v21.8b, v18.8b , v19.8b , #3
+ urhadd v28.16b, v28.16b , v26.16b
+ urhadd v29.16b, v29.16b , v27.16b
+ ext v22.8b, v18.8b , v19.8b , #4
+ ext v19.8b, v18.8b , v19.8b , #1
+ sqrshrun v26.8b, v0.8h, #5
+ st1 {v28.2s, v29.2s}, [x1], x3 // store row 6
+
+ uaddl v24.8h, v18.8b, v23.8b
+ umlal v24.8h, v20.8b, v30.8b
+ umlal v24.8h, v21.8b, v30.8b
+ umlsl v24.8h, v19.8b, v31.8b
+ umlsl v24.8h, v22.8b, v31.8b
+
+ ld1 {v18.2s, v19.2s}, [x11], x2 // horz row 7, col 1
+
+ uaddl v0.8h, v7.8b, v17.8b
+ umlal v0.8h, v11.8b, v30.8b
+ umlal v0.8h, v13.8b, v30.8b
+ umlsl v0.8h, v9.8b, v31.8b
+ umlsl v0.8h, v15.8b, v31.8b
+
+ sqrshrun v28.8b, v24.8h, #5
+
+ ext v23.8b, v18.8b , v19.8b , #5
+ ext v20.8b, v18.8b , v19.8b , #2
+ ext v21.8b, v18.8b , v19.8b , #3
+ ext v22.8b, v18.8b , v19.8b , #4
+ ext v19.8b, v18.8b , v19.8b , #1
+
+ sqrshrun v27.8b, v0.8h, #5
+
+ uaddl v24.8h, v18.8b, v23.8b
+ umlal v24.8h, v20.8b, v30.8b
+ umlal v24.8h, v21.8b, v30.8b
+ umlsl v24.8h, v19.8b, v31.8b
+ umlsl v24.8h, v22.8b, v31.8b
+
+ mov v0.16b, v8.16b
+ mov v1.16b, v9.16b
+
+ mov v2.16b, v10.16b
+ mov v3.16b, v11.16b
+
+ mov v4.16b, v12.16b
+ mov v5.16b, v13.16b
+
+ mov v6.16b, v14.16b
+ mov v7.16b, v15.16b
+
+ mov v8.16b, v16.16b
+ mov v9.16b, v17.16b
+
+ sqrshrun v29.8b, v24.8h, #5
+ subs x4, x4, #8
+ urhadd v28.16b, v28.16b , v26.16b
+ urhadd v29.16b, v29.16b , v27.16b
+ st1 {v28.2s, v29.2s}, [x1], x3 // store row 7
+
+    beq       end_func           // stop when all rows are done (row counter x4 reached 0)
+ b loop_16
+
+
+loop_8_start:
+ ld1 {v0.2s}, [x7], x2 // Vector load from src[0_0]
+ ld1 {v1.2s}, [x7], x2 // Vector load from src[1_0]
+ ld1 {v2.2s}, [x7], x2 // Vector load from src[2_0]
+ ld1 {v3.2s}, [x7], x2 // Vector load from src[3_0]
+ ld1 {v4.2s}, [x7], x2 // Vector load from src[4_0]
+
+loop_8:
+ ld1 {v5.2s}, [x7], x2 // Vector load from src[5_0]
+ uaddl v10.8h, v0.8b, v5.8b
+ umlal v10.8h, v2.8b, v30.8b
+ umlal v10.8h, v3.8b, v30.8b
+ umlsl v10.8h, v1.8b, v31.8b
+ umlsl v10.8h, v4.8b, v31.8b
+ ld1 {v12.2s, v13.2s}, [x6], x2 //horz row 0
+ ext v17.8b, v12.8b , v13.8b , #5
+ ext v14.8b, v12.8b , v13.8b , #2
+ ext v15.8b, v12.8b , v13.8b , #3
+ ext v16.8b, v12.8b , v13.8b , #4
+ ext v13.8b, v12.8b , v13.8b , #1
+ sqrshrun v26.8b, v10.8h, #5
+ ld1 {v6.2s}, [x7], x2 // src[6_0]
+ uaddl v10.8h, v12.8b, v17.8b
+ umlal v10.8h, v14.8b, v30.8b
+ umlal v10.8h, v15.8b, v30.8b
+ umlsl v10.8h, v13.8b, v31.8b
+ umlsl v10.8h, v16.8b, v31.8b
+ ld1 {v12.2s, v13.2s}, [x6], x2 // horz row 1
+ uaddl v18.8h, v1.8b, v6.8b
+ umlal v18.8h, v3.8b, v30.8b
+ umlal v18.8h, v4.8b, v30.8b
+ umlsl v18.8h, v2.8b, v31.8b
+ umlsl v18.8h, v5.8b, v31.8b
+ sqrshrun v28.8b, v10.8h, #5
+ ext v17.8b, v12.8b , v13.8b , #5
+ ext v14.8b, v12.8b , v13.8b , #2
+ ext v15.8b, v12.8b , v13.8b , #3
+ ext v16.8b, v12.8b , v13.8b , #4
+ ext v13.8b, v12.8b , v13.8b , #1
+ sqrshrun v27.8b, v18.8h, #5
+ ld1 {v7.2s}, [x7], x2 // src[7_0]
+ uaddl v10.8h, v12.8b, v17.8b
+ umlal v10.8h, v14.8b, v30.8b
+ umlal v10.8h, v15.8b, v30.8b
+ umlsl v10.8h, v13.8b, v31.8b
+ umlsl v10.8h, v16.8b, v31.8b
+ ld1 {v12.2s, v13.2s}, [x6], x2 // horz row 2
+ uaddl v18.8h, v2.8b, v7.8b
+ umlal v18.8h, v4.8b, v30.8b
+ umlal v18.8h, v5.8b, v30.8b
+ umlsl v18.8h, v3.8b, v31.8b
+ umlsl v18.8h, v6.8b, v31.8b
+ sqrshrun v29.8b, v10.8h, #5
+ ext v17.8b, v12.8b , v13.8b , #5
+ ext v14.8b, v12.8b , v13.8b , #2
+ ext v15.8b, v12.8b , v13.8b , #3
+ ext v16.8b, v12.8b , v13.8b , #4
+ ext v13.8b, v12.8b , v13.8b , #1
+ urhadd v26.16b, v26.16b , v28.16b
+ urhadd v27.16b, v27.16b , v29.16b
+ sqrshrun v28.8b, v18.8h, #5
+ ld1 {v8.2s}, [x7], x2 // src[8_0]
+ uaddl v10.8h, v12.8b, v17.8b
+ umlal v10.8h, v14.8b, v30.8b
+ umlal v10.8h, v15.8b, v30.8b
+ umlsl v10.8h, v13.8b, v31.8b
+ umlsl v10.8h, v16.8b, v31.8b
+ ld1 {v12.2s, v13.2s}, [x6], x2 // horz row 3
+ uaddl v18.8h, v3.8b, v8.8b
+ umlal v18.8h, v5.8b, v30.8b
+ umlal v18.8h, v6.8b, v30.8b
+ umlsl v18.8h, v4.8b, v31.8b
+ umlsl v18.8h, v7.8b, v31.8b
+ sqrshrun v24.8b, v10.8h, #5
+ ext v17.8b, v12.8b , v13.8b , #5
+ ext v14.8b, v12.8b , v13.8b , #2
+ ext v15.8b, v12.8b , v13.8b , #3
+ ext v16.8b, v12.8b , v13.8b , #4
+ ext v13.8b, v12.8b , v13.8b , #1
+ sqrshrun v29.8b, v18.8h, #5
+ uaddl v10.8h, v12.8b, v17.8b
+ umlal v10.8h, v14.8b, v30.8b
+ umlal v10.8h, v15.8b, v30.8b
+ umlsl v10.8h, v13.8b, v31.8b
+ umlsl v10.8h, v16.8b, v31.8b
+ st1 {v26.2s}, [x1], x3
+
+ mov v0.16b, v4.16b
+ mov v1.16b, v5.16b
+
+ st1 {v27.2s}, [x1], x3
+
+ mov v2.16b, v6.16b
+ mov v3.16b, v7.16b
+
+ mov v4.8b, v8.8b
+
+ sqrshrun v25.8b, v10.8h, #5
+ subs x9, x4, #4
+ urhadd v24.16b, v24.16b , v28.16b
+ urhadd v25.16b, v25.16b , v29.16b
+ st1 {v24.2s}, [x1], x3
+ st1 {v25.2s}, [x1], x3
+ beq end_func // Branch if height==4
+
+ ld1 {v5.2s}, [x7], x2 // Vector load from src[9_0]
+ uaddl v10.8h, v0.8b, v5.8b
+ umlal v10.8h, v2.8b, v30.8b
+ umlal v10.8h, v3.8b, v30.8b
+ umlsl v10.8h, v1.8b, v31.8b
+ umlsl v10.8h, v4.8b, v31.8b
+ ld1 {v12.2s, v13.2s}, [x6], x2 //horz row 4
+ ext v17.8b, v12.8b , v13.8b , #5
+ ext v14.8b, v12.8b , v13.8b , #2
+ ext v15.8b, v12.8b , v13.8b , #3
+ ext v16.8b, v12.8b , v13.8b , #4
+ ext v13.8b, v12.8b , v13.8b , #1
+ sqrshrun v26.8b, v10.8h, #5
+ ld1 {v6.2s}, [x7], x2 // src[10_0]
+ uaddl v10.8h, v12.8b, v17.8b
+ umlal v10.8h, v14.8b, v30.8b
+ umlal v10.8h, v15.8b, v30.8b
+ umlsl v10.8h, v13.8b, v31.8b
+ umlsl v10.8h, v16.8b, v31.8b
+ ld1 {v12.2s, v13.2s}, [x6], x2 // horz row 5
+ uaddl v18.8h, v1.8b, v6.8b
+ umlal v18.8h, v3.8b, v30.8b
+ umlal v18.8h, v4.8b, v30.8b
+ umlsl v18.8h, v2.8b, v31.8b
+ umlsl v18.8h, v5.8b, v31.8b
+ sqrshrun v28.8b, v10.8h, #5
+ ext v17.8b, v12.8b , v13.8b , #5
+ ext v14.8b, v12.8b , v13.8b , #2
+ ext v15.8b, v12.8b , v13.8b , #3
+ ext v16.8b, v12.8b , v13.8b , #4
+ ext v13.8b, v12.8b , v13.8b , #1
+ sqrshrun v27.8b, v18.8h, #5
+ ld1 {v7.2s}, [x7], x2 // src[11_0]
+ uaddl v10.8h, v12.8b, v17.8b
+ umlal v10.8h, v14.8b, v30.8b
+ umlal v10.8h, v15.8b, v30.8b
+ umlsl v10.8h, v13.8b, v31.8b
+ umlsl v10.8h, v16.8b, v31.8b
+ ld1 {v12.2s, v13.2s}, [x6], x2 // horz row 6
+ uaddl v18.8h, v2.8b, v7.8b
+ umlal v18.8h, v4.8b, v30.8b
+ umlal v18.8h, v5.8b, v30.8b
+ umlsl v18.8h, v3.8b, v31.8b
+ umlsl v18.8h, v6.8b, v31.8b
+ sqrshrun v29.8b, v10.8h, #5
+ ext v17.8b, v12.8b , v13.8b , #5
+ ext v14.8b, v12.8b , v13.8b , #2
+ ext v15.8b, v12.8b , v13.8b , #3
+ ext v16.8b, v12.8b , v13.8b , #4
+ ext v13.8b, v12.8b , v13.8b , #1
+ urhadd v26.16b, v26.16b , v28.16b
+ urhadd v27.16b, v27.16b , v29.16b
+ sqrshrun v28.8b, v18.8h, #5
+ ld1 {v8.2s}, [x7], x2 // src[12_0]
+ uaddl v10.8h, v12.8b, v17.8b
+ umlal v10.8h, v14.8b, v30.8b
+ umlal v10.8h, v15.8b, v30.8b
+ umlsl v10.8h, v13.8b, v31.8b
+ umlsl v10.8h, v16.8b, v31.8b
+ ld1 {v12.2s, v13.2s}, [x6], x2 // horz row 7
+ uaddl v18.8h, v3.8b, v8.8b
+ umlal v18.8h, v5.8b, v30.8b
+ umlal v18.8h, v6.8b, v30.8b
+ umlsl v18.8h, v4.8b, v31.8b
+ umlsl v18.8h, v7.8b, v31.8b
+ sqrshrun v24.8b, v10.8h, #5
+ ext v17.8b, v12.8b , v13.8b , #5
+ ext v14.8b, v12.8b , v13.8b , #2
+ ext v15.8b, v12.8b , v13.8b , #3
+ ext v16.8b, v12.8b , v13.8b , #4
+ ext v13.8b, v12.8b , v13.8b , #1
+ sqrshrun v29.8b, v18.8h, #5
+ uaddl v10.8h, v12.8b, v17.8b
+ umlal v10.8h, v14.8b, v30.8b
+ umlal v10.8h, v15.8b, v30.8b
+ umlsl v10.8h, v13.8b, v31.8b
+ umlsl v10.8h, v16.8b, v31.8b
+ st1 {v26.2s}, [x1], x3
+
+ mov v0.16b, v4.16b
+ mov v1.16b, v5.16b
+ st1 {v27.2s}, [x1], x3
+
+ mov v2.16b, v6.16b
+ mov v3.16b, v7.16b
+
+ mov v4.8b, v8.8b
+ mov v5.8b, v9.8b
+
+ sqrshrun v25.8b, v10.8h, #5
+ subs x4, x4, #8
+ urhadd v24.16b, v24.16b , v28.16b
+ urhadd v25.16b, v25.16b , v29.16b
+ st1 {v24.2s}, [x1], x3
+ st1 {v25.2s}, [x1], x3
+    bgt       loop_8             // loop again while rows remain (taken when ht == 16)
+ b end_func
+
+loop_4_start:
+ ld1 {v0.s}[0], [x7], x2 // Vector load from src[0_0]
+ ld1 {v1.s}[0], [x7], x2 // Vector load from src[1_0]
+
+ ld1 {v2.s}[0], [x7], x2 // Vector load from src[2_0]
+ ld1 {v3.s}[0], [x7], x2 // Vector load from src[3_0]
+ ld1 {v4.s}[0], [x7], x2 // Vector load from src[4_0]
+
+ ld1 {v5.s}[0], [x7], x2 // Vector load from src[5_0]
+ uaddl v10.8h, v0.8b, v5.8b
+ umlal v10.8h, v2.8b, v30.8b
+ umlal v10.8h, v3.8b, v30.8b
+ umlsl v10.8h, v1.8b, v31.8b
+ umlsl v10.8h, v4.8b, v31.8b
+ ld1 {v12.2s, v13.2s}, [x6], x2 //load for horz filter row 0
+ ext v17.8b, v12.8b , v13.8b , #5
+ ext v14.8b, v12.8b , v13.8b , #2
+ ext v15.8b, v12.8b , v13.8b , #3
+ ext v16.8b, v12.8b , v13.8b , #4
+ ext v13.8b, v12.8b , v13.8b , #1
+ sqrshrun v26.8b, v10.8h, #5
+ ld1 {v6.s}[0], [x7], x2 // Vector load from src[6_0]
+ uaddl v10.8h, v12.8b, v17.8b
+ umlal v10.8h, v14.8b, v30.8b
+ umlal v10.8h, v15.8b, v30.8b
+ umlsl v10.8h, v13.8b, v31.8b
+ umlsl v10.8h, v16.8b, v31.8b
+ ld1 {v12.2s, v13.2s}, [x6], x2 //horz row 1
+ uaddl v18.8h, v1.8b, v6.8b
+ umlal v18.8h, v3.8b, v30.8b
+ umlal v18.8h, v4.8b, v30.8b
+ umlsl v18.8h, v2.8b, v31.8b
+ umlsl v18.8h, v5.8b, v31.8b
+ sqrshrun v28.8b, v10.8h, #5
+ ext v17.8b, v12.8b , v13.8b , #5
+ ext v14.8b, v12.8b , v13.8b , #2
+ ext v15.8b, v12.8b , v13.8b , #3
+ ext v16.8b, v12.8b , v13.8b , #4
+ ext v13.8b, v12.8b , v13.8b , #1
+ sqrshrun v27.8b, v18.8h, #5
+ ld1 {v7.s}[0], [x7], x2 // Vector load from src[7_0]
+ uaddl v10.8h, v12.8b, v17.8b
+ umlal v10.8h, v14.8b, v30.8b
+ umlal v10.8h, v15.8b, v30.8b
+ umlsl v10.8h, v13.8b, v31.8b
+ umlsl v10.8h, v16.8b, v31.8b
+ ld1 {v12.2s, v13.2s}, [x6], x2 //horz row 2
+ uaddl v18.8h, v2.8b, v7.8b
+ umlal v18.8h, v4.8b, v30.8b
+ umlal v18.8h, v5.8b, v30.8b
+ umlsl v18.8h, v3.8b, v31.8b
+ umlsl v18.8h, v6.8b, v31.8b
+ sqrshrun v29.8b, v10.8h, #5
+ ext v17.8b, v12.8b , v13.8b , #5
+ ext v14.8b, v12.8b , v13.8b , #2
+ ext v15.8b, v12.8b , v13.8b , #3
+ ext v16.8b, v12.8b , v13.8b , #4
+ ext v13.8b, v12.8b , v13.8b , #1
+ urhadd v26.16b, v26.16b , v28.16b
+ urhadd v27.16b, v27.16b , v29.16b
+ sqrshrun v28.8b, v18.8h, #5
+ ld1 {v8.s}[0], [x7], x2 // Vector load from src[8_0]
+ uaddl v10.8h, v12.8b, v17.8b
+ umlal v10.8h, v14.8b, v30.8b
+ umlal v10.8h, v15.8b, v30.8b
+ umlsl v10.8h, v13.8b, v31.8b
+ umlsl v10.8h, v16.8b, v31.8b
+ ld1 {v12.2s, v13.2s}, [x6], x2 //horz row 3
+ uaddl v18.8h, v3.8b, v8.8b
+ umlal v18.8h, v5.8b, v30.8b
+ umlal v18.8h, v6.8b, v30.8b
+ umlsl v18.8h, v4.8b, v31.8b
+ umlsl v18.8h, v7.8b, v31.8b
+ sqrshrun v24.8b, v10.8h, #5
+ ext v17.8b, v12.8b , v13.8b , #5
+ ext v14.8b, v12.8b , v13.8b , #2
+ ext v15.8b, v12.8b , v13.8b , #3
+ ext v16.8b, v12.8b , v13.8b , #4
+ ext v13.8b, v12.8b , v13.8b , #1
+ sqrshrun v29.8b, v18.8h, #5
+ uaddl v10.8h, v12.8b, v17.8b
+ umlal v10.8h, v14.8b, v30.8b
+ umlal v10.8h, v15.8b, v30.8b
+ umlsl v10.8h, v13.8b, v31.8b
+ umlsl v10.8h, v16.8b, v31.8b
+ st1 {v26.s}[0], [x1], x3
+
+ mov v0.16b, v4.16b
+ mov v1.16b, v5.16b
+
+ st1 {v27.s}[0], [x1], x3
+
+ mov v2.16b, v6.16b
+ mov v3.16b, v7.16b
+ mov v4.8b, v8.8b
+
+ sqrshrun v25.8b, v10.8h, #5
+ subs x4, x4, #4
+ urhadd v24.16b, v24.16b , v28.16b
+ urhadd v25.16b, v25.16b , v29.16b
+ st1 {v24.s}[0], [x1], x3
+ st1 {v25.s}[0], [x1], x3
+ beq end_func // Branch if height==4
+
+    ld1       {v5.s}[0], [x7], x2 // Vector load from src[9_0]
+ uaddl v10.8h, v0.8b, v5.8b
+ umlal v10.8h, v2.8b, v30.8b
+ umlal v10.8h, v3.8b, v30.8b
+ umlsl v10.8h, v1.8b, v31.8b
+ umlsl v10.8h, v4.8b, v31.8b
+ ld1 {v12.2s, v13.2s}, [x6], x2 //load for horz filter row 4
+ ext v17.8b, v12.8b , v13.8b , #5
+ ext v14.8b, v12.8b , v13.8b , #2
+ ext v15.8b, v12.8b , v13.8b , #3
+ ext v16.8b, v12.8b , v13.8b , #4
+ ext v13.8b, v12.8b , v13.8b , #1
+ sqrshrun v26.8b, v10.8h, #5
+ ld1 {v6.s}[0], [x7], x2
+ uaddl v10.8h, v12.8b, v17.8b
+ umlal v10.8h, v14.8b, v30.8b
+ umlal v10.8h, v15.8b, v30.8b
+ umlsl v10.8h, v13.8b, v31.8b
+ umlsl v10.8h, v16.8b, v31.8b
+ ld1 {v12.2s, v13.2s}, [x6], x2 //horz row 5
+ uaddl v18.8h, v1.8b, v6.8b
+ umlal v18.8h, v3.8b, v30.8b
+ umlal v18.8h, v4.8b, v30.8b
+ umlsl v18.8h, v2.8b, v31.8b
+ umlsl v18.8h, v5.8b, v31.8b
+ sqrshrun v28.8b, v10.8h, #5
+ ext v17.8b, v12.8b , v13.8b , #5
+ ext v14.8b, v12.8b , v13.8b , #2
+ ext v15.8b, v12.8b , v13.8b , #3
+ ext v16.8b, v12.8b , v13.8b , #4
+ ext v13.8b, v12.8b , v13.8b , #1
+ sqrshrun v27.8b, v18.8h, #5
+ ld1 {v7.s}[0], [x7], x2
+ uaddl v10.8h, v12.8b, v17.8b
+ umlal v10.8h, v14.8b, v30.8b
+ umlal v10.8h, v15.8b, v30.8b
+ umlsl v10.8h, v13.8b, v31.8b
+ umlsl v10.8h, v16.8b, v31.8b
+ ld1 {v12.2s, v13.2s}, [x6], x2 //horz row 6
+ uaddl v18.8h, v2.8b, v7.8b
+ umlal v18.8h, v4.8b, v30.8b
+ umlal v18.8h, v5.8b, v30.8b
+ umlsl v18.8h, v3.8b, v31.8b
+ umlsl v18.8h, v6.8b, v31.8b
+ sqrshrun v29.8b, v10.8h, #5
+ ext v17.8b, v12.8b , v13.8b , #5
+ ext v14.8b, v12.8b , v13.8b , #2
+ ext v15.8b, v12.8b , v13.8b , #3
+ ext v16.8b, v12.8b , v13.8b , #4
+ ext v13.8b, v12.8b , v13.8b , #1
+ urhadd v26.16b, v26.16b , v28.16b
+ urhadd v27.16b, v27.16b , v29.16b
+ sqrshrun v28.8b, v18.8h, #5
+ ld1 {v8.s}[0], [x7], x2
+ uaddl v10.8h, v12.8b, v17.8b
+ umlal v10.8h, v14.8b, v30.8b
+ umlal v10.8h, v15.8b, v30.8b
+ umlsl v10.8h, v13.8b, v31.8b
+ umlsl v10.8h, v16.8b, v31.8b
+ ld1 {v12.2s, v13.2s}, [x6], x2 //horz row 7
+ uaddl v18.8h, v3.8b, v8.8b
+ umlal v18.8h, v5.8b, v30.8b
+ umlal v18.8h, v6.8b, v30.8b
+ umlsl v18.8h, v4.8b, v31.8b
+ umlsl v18.8h, v7.8b, v31.8b
+ sqrshrun v24.8b, v10.8h, #5
+ ext v17.8b, v12.8b , v13.8b , #5
+ ext v14.8b, v12.8b , v13.8b , #2
+ ext v15.8b, v12.8b , v13.8b , #3
+ ext v16.8b, v12.8b , v13.8b , #4
+ ext v13.8b, v12.8b , v13.8b , #1
+ sqrshrun v29.8b, v18.8h, #5
+ uaddl v10.8h, v12.8b, v17.8b
+ umlal v10.8h, v14.8b, v30.8b
+ umlal v10.8h, v15.8b, v30.8b
+ umlsl v10.8h, v13.8b, v31.8b
+ umlsl v10.8h, v16.8b, v31.8b
+ st1 {v26.s}[0], [x1], x3
+ st1 {v27.s}[0], [x1], x3
+ sqrshrun v25.8b, v10.8h, #5
+ urhadd v24.16b, v24.16b , v28.16b
+ urhadd v25.16b, v25.16b , v29.16b
+ st1 {v24.s}[0], [x1], x3
+ st1 {v25.s}[0], [x1], x3
+
+end_func:
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s
new file mode 100755
index 0000000..9d19a2d
--- /dev/null
+++ b/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s
@@ -0,0 +1,511 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+//******************************************************************************
+//* @file
+//* ih264_inter_pred_luma_vert_qpel_av8.s
+//*
+//* @brief
+//* Contains function definitions for inter prediction vertical quarter pel interpolation.
+//*
+//* @author
+//* Mohit
+//*
+//* @par List of Functions:
+//*
+//* - ih264_inter_pred_luma_vert_qpel_av8()
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+///* All the functions here are replicated from ih264_inter_pred_filters.c
+//
+
+///**
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* Quarter pel interprediction luma filter for vertical input
+//*
+//* @par Description:
+//* Applies a 6 tap vertical filter. The output is clipped to 8 bits
+//* sec 8.4.2.2.1 titled "Luma sample interpolation process"
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @param[in] pu1_tmp: temporary buffer: UNUSED in this function
+//*
+//* @param[in] dydx: x and y reference offset for qpel calculations.
+//* @returns
+//*
+// @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+//void ih264_inter_pred_luma_vert (
+// UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ht,
+// WORD32 wd,
+// UWORD8* pu1_tmp,
+// UWORD32 dydx)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ht
+// x5 => wd
+// x7 => dydx
+
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+
+
+ .global ih264_inter_pred_luma_vert_qpel_av8
+
+ih264_inter_pred_luma_vert_qpel_av8: // 6-tap vertical half-pel filter, then average with nearest full-pel row (qpel)
+
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+
+    and x7, x7, #12 //Finds y-offset (keep y bits of dydx)
+    lsr x7, x7, #3 //dydx>>3 -> 0 or 1: which full-pel row to average with
+    mul x7, x2, x7 //(dydx>>3) * src_strd
+    add x7, x0, x7 //pu1_src + (y_offset>>1)*src_strd : full-pel row for qpel averaging
+    sub x14, x4, #16 //ht - 16: loop-control base (negative for ht<16)
+    movi v22.8h, #20 // Filter coeff 20 into v22 (centre taps)
+    sub x0, x0, x2, lsl #1 //pu1_src-2*src_strd : start 2 rows above for the 6-tap filter
+    subs x12, x5, #8 //if wd=8 branch to loop_8
+    movi v24.8h, #5 // Filter coeff 5 into v24 (outer taps, subtracted)
+    beq loop_8_start
+
+    subs x12, x5, #4 //if wd=4 branch to loop_4
+    beq loop_4_start
+
+    // wd == 16: prime the 6-row sliding window (v0..v11 = rows 0..5, 16 bytes each)
+    ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[0_0]
+    ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[1_0]
+    ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[2_0]
+    ld1 {v6.2s, v7.2s}, [x0], x2 // Vector load from src[3_0]
+    add x14, x14, #1 //for checking loop (x14 becomes 1 when ht==16)
+    ld1 {v8.2s, v9.2s}, [x0], x2 // Vector load from src[4_0]
+    uaddl v12.8h, v4.8b, v6.8b // temp1 = src[2_0] + src[3_0]
+    ld1 {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0]
+
+loop_16: //when wd=16: produces 8 output rows per iteration, software-pipelined
+
+    uaddl v14.8h, v0.8b, v10.8b // temp = src[0_0] + src[5_0]
+    uaddl v16.8h, v2.8b, v8.8b // temp2 = src[1_0] + src[4_0]
+    mla v14.8h, v12.8h , v22.8h // temp += temp1 * 20
+    uaddl v20.8h, v1.8b, v11.8b // temp4 = src[0_8] + src[5_8]
+    uaddl v18.8h, v5.8b, v7.8b // temp3 = src[2_8] + src[3_8]
+    mla v20.8h, v18.8h , v22.8h // temp4 += temp3 * 20
+    ld1 {v0.2s, v1.2s}, [x0], x2 // prefetch next source row
+    uaddl v26.8h, v3.8b, v9.8b // temp5 = src[1_8] + src[4_8]
+    uaddl v12.8h, v6.8b, v8.8b
+    mls v14.8h, v16.8h , v24.8h // temp -= temp2 * 5
+    uaddl v16.8h, v2.8b, v0.8b
+    uaddl v18.8h, v4.8b, v10.8b
+    mla v16.8h, v12.8h , v22.8h
+    mls v20.8h, v26.8h , v24.8h // temp4 -= temp5 * 5
+    uaddl v26.8h, v5.8b, v11.8b
+    uaddl v12.8h, v7.8b, v9.8b
+    sqrshrun v30.8b, v14.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5)
+    uaddl v14.8h, v3.8b, v1.8b
+    ld1 {v2.2s, v3.2s}, [x0], x2
+    mla v14.8h, v12.8h , v22.8h
+    mls v16.8h, v18.8h , v24.8h
+    sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
+    ld1 {v20.2s, v21.2s}, [x7], x2 // Load for interpolation row 0
+    urhadd v30.16b, v20.16b , v30.16b // Interpolation to obtain qpel value
+    urhadd v31.16b, v21.16b , v31.16b // Interpolation to obtain qpel value
+    uaddl v18.8h, v4.8b, v2.8b
+    uaddl v12.8h, v8.8b, v10.8b
+    st1 {v30.2s, v31.2s}, [x1], x3 // Vector store to dst[0_0]
+    mla v18.8h, v12.8h , v22.8h
+    uaddl v20.8h, v6.8b, v0.8b
+    mls v14.8h, v26.8h , v24.8h
+    sqrshrun v30.8b, v16.8h, #5
+    uaddl v12.8h, v9.8b, v11.8b
+    uaddl v16.8h, v5.8b, v3.8b
+    uaddl v26.8h, v7.8b, v1.8b
+    mla v16.8h, v12.8h , v22.8h
+    mls v18.8h, v20.8h , v24.8h
+    ld1 {v4.2s, v5.2s}, [x0], x2
+    sqrshrun v31.8b, v14.8h, #5
+    ld1 {v14.2s, v15.2s}, [x7], x2 // Load for interpolation row 1
+    uaddl v12.8h, v10.8b, v0.8b
+    urhadd v30.16b, v14.16b , v30.16b // Interpolation to obtain qpel value
+    urhadd v31.16b, v15.16b , v31.16b // Interpolation to obtain qpel value
+    uaddl v14.8h, v6.8b, v4.8b
+    uaddl v20.8h, v8.8b, v2.8b
+    mla v14.8h, v12.8h , v22.8h
+    mls v16.8h, v26.8h , v24.8h
+    st1 {v30.2s, v31.2s}, [x1], x3 //store row 1
+    sqrshrun v30.8b, v18.8h, #5
+    uaddl v18.8h, v7.8b, v5.8b
+    uaddl v12.8h, v11.8b, v1.8b
+    mla v18.8h, v12.8h , v22.8h
+    uaddl v26.8h, v9.8b, v3.8b
+    mls v14.8h, v20.8h , v24.8h
+    ld1 {v6.2s, v7.2s}, [x0], x2
+    sqrshrun v31.8b, v16.8h, #5
+    ld1 {v16.2s, v17.2s}, [x7], x2 // Load for interpolation row 2
+    mls v18.8h, v26.8h , v24.8h
+    urhadd v30.16b, v16.16b , v30.16b // Interpolation to obtain qpel value
+    urhadd v31.16b, v17.16b , v31.16b // Interpolation to obtain qpel value
+    uaddl v12.8h, v0.8b, v2.8b // temp1 = src[2_0] + src[3_0]
+    st1 {v30.2s, v31.2s}, [x1], x3 //store row 2
+    uaddl v16.8h, v10.8b, v4.8b // temp2 = src[1_0] + src[4_0]
+    uaddl v20.8h, v9.8b, v7.8b // temp4 = src[0_8] + src[5_8]
+    sqrshrun v30.8b, v14.8h, #5
+    uaddl v26.8h, v5.8b, v11.8b // temp5 = src[1_8] + src[4_8]
+    uaddl v14.8h, v8.8b, v6.8b // temp = src[0_0] + src[5_0]
+    sqrshrun v31.8b, v18.8h, #5
+    ld1 {v18.2s, v19.2s}, [x7], x2 // Load for interpolation row 3
+    mla v14.8h, v12.8h , v22.8h // temp += temp1 * 20
+    urhadd v30.16b, v18.16b , v30.16b // Interpolation to obtain qpel value
+    urhadd v31.16b, v19.16b , v31.16b // Interpolation to obtain qpel value
+    uaddl v18.8h, v1.8b, v3.8b // temp3 = src[2_8] + src[3_8]
+    st1 {v30.2s, v31.2s}, [x1], x3 //store row 3
+    // 4 rows processed
+    mla v20.8h, v18.8h , v22.8h // temp4 += temp3 * 20
+    ld1 {v8.2s, v9.2s}, [x0], x2
+    uaddl v12.8h, v2.8b, v4.8b
+    uaddl v18.8h, v3.8b, v5.8b
+    mls v14.8h, v16.8h , v24.8h // temp -= temp2 * 5
+    uaddl v28.8h, v9.8b, v11.8b
+    uaddl v16.8h, v6.8b, v0.8b
+    mla v28.8h, v18.8h , v22.8h // temp4 += temp3 * 20
+    mls v20.8h, v26.8h , v24.8h // temp4 -= temp5 * 5
+    uaddl v26.8h, v1.8b, v7.8b
+    uaddl v18.8h, v5.8b, v7.8b
+    sqrshrun v30.8b, v14.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5)
+    uaddl v14.8h, v8.8b, v10.8b
+    sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
+    ld1 {v20.2s, v21.2s}, [x7], x2 // Load for interpolation row 4
+    ld1 {v10.2s, v11.2s}, [x0], x2
+    urhadd v30.16b, v20.16b , v30.16b // Interpolation to obtain qpel value
+    urhadd v31.16b, v21.16b , v31.16b // Interpolation to obtain qpel value
+    mls v28.8h, v26.8h , v24.8h // temp4 -= temp5 * 5
+    st1 {v30.2s, v31.2s}, [x1], x3 // store row 4
+    mla v14.8h, v12.8h , v22.8h // temp += temp1 * 20
+    uaddl v20.8h, v11.8b, v1.8b
+    uaddl v26.8h, v3.8b, v9.8b
+    mla v20.8h, v18.8h , v22.8h // temp4 += temp3 * 20
+    uaddl v12.8h, v6.8b, v4.8b
+    uaddl v18.8h, v7.8b, v9.8b
+    sqrshrun v31.8b, v28.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
+    mls v14.8h, v16.8h , v24.8h // temp -= temp2 * 5
+    uaddl v16.8h, v8.8b, v2.8b
+    sqrshrun v30.8b, v14.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5)
+    ld1 {v14.2s, v15.2s}, [x7], x2 // Load for interpolation row 5
+    mls v20.8h, v26.8h , v24.8h // temp4 -= temp5 * 5
+    urhadd v30.16b, v14.16b , v30.16b // Interpolation to obtain qpel value
+    urhadd v31.16b, v15.16b , v31.16b // Interpolation to obtain qpel value
+    uaddl v14.8h, v10.8b, v0.8b
+    st1 {v30.2s, v31.2s}, [x1], x3 // store row 5
+    mla v14.8h, v12.8h , v22.8h // temp += temp1 * 20
+    ld1 {v0.2s, v1.2s}, [x0], x2
+    uaddl v26.8h, v5.8b, v11.8b
+    uaddl v12.8h, v8.8b, v6.8b
+    uaddl v28.8h, v0.8b, v2.8b
+    sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
+    mla v28.8h, v12.8h , v22.8h // temp += temp1 * 20
+    uaddl v20.8h, v1.8b, v3.8b
+    mls v14.8h, v16.8h , v24.8h // temp -= temp2 * 5
+    mla v20.8h, v18.8h , v22.8h // temp4 += temp3 * 20
+    uaddl v16.8h, v10.8b, v4.8b
+    sqrshrun v30.8b, v14.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5)
+    ld1 {v14.2s, v15.2s}, [x7], x2 // Load for interpolation row 6
+    mov v2.8b, v6.8b // slide the 6-row window down for the next iteration
+    mov v3.8b, v7.8b
+    urhadd v30.16b, v14.16b , v30.16b // Interpolation to obtain qpel value
+    urhadd v31.16b, v15.16b , v31.16b // Interpolation to obtain qpel value
+
+    mls v28.8h, v16.8h , v24.8h // temp -= temp2 * 5
+    st1 {v30.2s, v31.2s}, [x1], x3 // store row 6
+    sqrshrun v30.8b, v28.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5)
+    swp v0.8b, v4.8b // swapping registers to put it in order (swp is a macro from ih264_neon_macros.s)
+    swp v1.8b, v5.8b // swapping registers to put it in order
+
+    mls v20.8h, v26.8h , v24.8h // temp4 -= temp5 * 5
+    mov v6.8b, v10.8b
+    mov v7.8b, v11.8b
+    subs x12, x14, #1 // if height==16 - looping
+    swp v4.8b, v8.8b
+    swp v5.8b, v9.8b
+    sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
+    ld1 {v20.2s, v21.2s}, [x7], x2 // Load for interpolation row 7
+    urhadd v30.16b, v20.16b , v30.16b // Interpolation to obtain qpel value
+    urhadd v31.16b, v21.16b , v31.16b // Interpolation to obtain qpel value
+    st1 {v30.2s, v31.2s}, [x1], x3 // store row 7
+    bne end_func //if height =8 end function; fall through to iterate once more for ht=16
+    add x14, x14, #1 //for checking loop
+    ld1 {v10.2s, v11.2s}, [x0], x2
+    uaddl v12.8h, v4.8b, v6.8b // temp1 = src[2_0] + src[3_0]
+
+    b loop_16 // looping if height =16
+
+loop_8_start: // wd == 8: 8 bytes per row
+//// Processing row0 and row1
+
+    ld1 {v0.2s}, [x0], x2 // Vector load from src[0_0]
+    ld1 {v1.2s}, [x0], x2 // Vector load from src[1_0]
+    ld1 {v2.2s}, [x0], x2 // Vector load from src[2_0]
+    ld1 {v3.2s}, [x0], x2 // Vector load from src[3_0]
+    add x14, x14, #1 //for checking loop
+    ld1 {v4.2s}, [x0], x2 // Vector load from src[4_0]
+    ld1 {v5.2s}, [x0], x2 // Vector load from src[5_0]
+
+loop_8: // one iteration emits 8 output rows
+    //for checking loop
+    uaddl v6.8h, v2.8b, v3.8b // temp1 = src[2_0] + src[3_0]
+    uaddl v8.8h, v0.8b, v5.8b // temp = src[0_0] + src[5_0]
+    uaddl v10.8h, v1.8b, v4.8b // temp2 = src[1_0] + src[4_0]
+    mla v8.8h, v6.8h , v22.8h // temp += temp1 * 20
+    ld1 {v6.2s}, [x0], x2
+    uaddl v14.8h, v3.8b, v4.8b
+    uaddl v16.8h, v1.8b, v6.8b
+    uaddl v18.8h, v2.8b, v5.8b
+    mls v8.8h, v10.8h , v24.8h // temp -= temp2 * 5
+    mla v16.8h, v14.8h , v22.8h
+    ld1 {v7.2s}, [x0], x2
+    uaddl v20.8h, v4.8b, v5.8b
+    uaddl v12.8h, v2.8b, v7.8b
+    uaddl v10.8h, v3.8b, v6.8b
+    mls v16.8h, v18.8h , v24.8h
+    sqrshrun v26.8b, v8.8h, #5 // dst[0_0] = CLIP_U8( (temp + 16) >> 5)
+    mla v12.8h, v20.8h , v22.8h
+    ld1 {v8.2s}, [x7], x2 //Load value for interpolation (row0)
+    ld1 {v9.2s}, [x7], x2 //Load value for interpolation (row1)
+    ld1 {v0.2s}, [x0], x2
+    uaddl v14.8h, v5.8b, v6.8b
+    sqrshrun v27.8b, v16.8h, #5
+    urhadd v26.16b, v8.16b , v26.16b // Interpolation step for qpel calculation
+    urhadd v27.16b, v9.16b , v27.16b // Interpolation step for qpel calculation
+
+    uaddl v20.8h, v3.8b, v0.8b
+    mls v12.8h, v10.8h , v24.8h
+    st1 {v26.2s}, [x1], x3 // Vector store to dst[0_0]
+    uaddl v18.8h, v4.8b, v7.8b
+    mla v20.8h, v14.8h , v22.8h
+    st1 {v27.2s}, [x1], x3 // Vector store to dst[1_0]
+    sqrshrun v28.8b, v12.8h, #5
+    mls v20.8h, v18.8h , v24.8h
+    ld1 {v12.2s}, [x7], x2 //Load value for interpolation (row2)
+    ld1 {v13.2s}, [x7], x2 //Load value for interpolation (row3)
+    ld1 {v1.2s}, [x0], x2
+    sqrshrun v29.8b, v20.8h, #5
+    subs x9, x4, #4 // ht == 4?
+    urhadd v28.16b, v12.16b , v28.16b
+    urhadd v29.16b, v13.16b , v29.16b
+    st1 {v28.2s}, [x1], x3 //store row 2
+    st1 {v29.2s}, [x1], x3 //store row 3
+    beq end_func // Branch if height==4
+    uaddl v14.8h, v6.8b, v7.8b // temp1 = src[2_0] + src[3_0]
+    uaddl v16.8h, v0.8b, v5.8b // temp = src[0_0] + src[5_0]
+    uaddl v18.8h, v1.8b, v4.8b // temp2 = src[1_0] + src[4_0]
+    mla v18.8h, v14.8h , v22.8h // temp += temp1 * 20
+    ld1 {v2.2s}, [x0], x2
+    mls v18.8h, v16.8h , v24.8h // temp -= temp2 * 5
+    uaddl v8.8h, v0.8b, v7.8b
+    uaddl v10.8h, v1.8b, v6.8b
+    uaddl v12.8h, v2.8b, v5.8b
+    sqrshrun v26.8b, v18.8h, #5
+    mla v12.8h, v8.8h , v22.8h
+    ld1 {v18.2s}, [x7], x2 //Load value for interpolation (row4)
+    ld1 {v19.2s}, [x7], x2 //Load value for interpolation (row5)
+    ld1 {v3.2s}, [x0], x2
+    mls v12.8h, v10.8h , v24.8h
+    sqrshrun v27.8b, v12.8h, #5
+    urhadd v26.16b, v18.16b , v26.16b // Interpolation step for qpel calculation
+    urhadd v27.16b, v19.16b , v27.16b // Interpolation step for qpel calculation
+
+    st1 {v26.2s}, [x1], x3 // store row 4
+    st1 {v27.2s}, [x1], x3 // store row 5
+    uaddl v14.8h, v0.8b, v1.8b // temp1 = src[2_0] + src[3_0]
+    uaddl v16.8h, v2.8b, v7.8b // temp = src[0_0] + src[5_0]
+    uaddl v18.8h, v3.8b, v6.8b // temp2 = src[1_0] + src[4_0]
+    mla v18.8h, v14.8h , v22.8h // temp += temp1 * 20
+    ld1 {v4.2s}, [x0], x2
+    mls v18.8h, v16.8h , v24.8h // temp -= temp2 * 5
+    uaddl v8.8h, v2.8b, v1.8b
+    uaddl v10.8h, v3.8b, v0.8b
+    uaddl v12.8h, v4.8b, v7.8b
+    sqrshrun v26.8b, v18.8h, #5
+    mla v12.8h, v8.8h , v22.8h
+    ld1 {v18.2s}, [x7], x2 //Load value for interpolation (row6)
+    ld1 {v19.2s}, [x7], x2 //Load value for interpolation (row7)
+    ld1 {v5.2s}, [x0], x2
+    mls v12.8h, v10.8h , v24.8h
+    sqrshrun v27.8b, v12.8h, #5
+    urhadd v26.16b, v18.16b , v26.16b // Interpolation step for qpel calculation
+    urhadd v27.16b, v19.16b , v27.16b // Interpolation step for qpel calculation
+
+    subs x12, x14, #1
+    st1 {v26.2s}, [x1], x3 // store row 6
+    st1 {v27.2s}, [x1], x3 // store row 7
+    add x14, x14, #1
+    beq loop_8 //looping if height ==16
+
+    b end_func
+
+
+loop_4_start: // wd == 4: 4 bytes per row, lane loads/stores
+//// Processing row0 and row1
+
+
+    ld1 {v0.s}[0], [x0], x2 // Vector load from src[0_0]
+    ld1 {v1.s}[0], [x0], x2 // Vector load from src[1_0]
+    ld1 {v2.s}[0], [x0], x2 // Vector load from src[2_0]
+    ld1 {v3.s}[0], [x0], x2 // Vector load from src[3_0]
+    ld1 {v4.s}[0], [x0], x2 // Vector load from src[4_0]
+    ld1 {v5.s}[0], [x0], x2 // Vector load from src[5_0]
+
+    uaddl v6.8h, v2.8b, v3.8b // temp1 = src[2_0] + src[3_0]
+    uaddl v8.8h, v0.8b, v5.8b // temp = src[0_0] + src[5_0]
+    uaddl v10.8h, v1.8b, v4.8b // temp2 = src[1_0] + src[4_0]
+    mla v8.8h, v6.8h , v22.8h // temp += temp1 * 20
+    ld1 {v6.2s}, [x0], x2
+    uaddl v14.8h, v3.8b, v4.8b
+    uaddl v16.8h, v1.8b, v6.8b
+    uaddl v18.8h, v2.8b, v5.8b
+    mls v8.8h, v10.8h , v24.8h // temp -= temp2 * 5
+    ld1 {v7.s}[0], [x0], x2
+    mla v16.8h, v14.8h , v22.8h
+    uaddl v20.8h, v4.8b, v5.8b
+    uaddl v12.8h, v2.8b, v7.8b
+    uaddl v10.8h, v3.8b, v6.8b
+    mls v16.8h, v18.8h , v24.8h
+    sqrshrun v26.8b, v8.8h, #5 // dst[0_0] = CLIP_U8( (temp + 16) >> 5)
+    ld1 {v8.s}[0], [x7], x2 //Load value for interpolation - row 0
+    ld1 {v9.s}[0], [x7], x2 //Load value for interpolation - row 1
+    mla v12.8h, v20.8h , v22.8h
+    ld1 {v0.s}[0], [x0], x2
+    uaddl v14.8h, v5.8b, v6.8b
+    sqrshrun v27.8b, v16.8h, #5
+    uaddl v20.8h, v3.8b, v0.8b
+    urhadd v26.16b, v26.16b , v8.16b //Interpolation step for qpel calculation
+    urhadd v27.16b, v27.16b , v9.16b //Interpolation step for qpel calculation
+
+    mls v12.8h, v10.8h , v24.8h
+    st1 {v26.s}[0], [x1], x3 // Vector store to dst[0_0]
+    uaddl v18.8h, v4.8b, v7.8b
+    mla v20.8h, v14.8h , v22.8h
+    st1 {v27.s}[0], [x1], x3 // store row 1
+    sqrshrun v28.8b, v12.8h, #5
+    ld1 {v12.s}[0], [x7], x2 //Load value for interpolation - row 2
+    ld1 {v13.s}[0], [x7], x2 //Load value for interpolation - row 3
+
+    mls v20.8h, v18.8h , v24.8h
+    ld1 {v1.s}[0], [x0], x2
+    sqrshrun v29.8b, v20.8h, #5
+    urhadd v28.16b, v12.16b , v28.16b //Interpolation step for qpel calculation
+    urhadd v29.16b, v13.16b , v29.16b //Interpolation step for qpel calculation
+
+    st1 {v28.s}[0], [x1], x3 //store row 2
+    st1 {v29.s}[0], [x1], x3 //store row 3
+
+    subs x9, x4, #4 // ht == 4?
+    beq end_func // Branch if height==4
+
+
+    uaddl v14.8h, v6.8b, v7.8b // temp1 = src[2_0] + src[3_0]
+    uaddl v16.8h, v0.8b, v5.8b // temp = src[0_0] + src[5_0]
+    uaddl v18.8h, v1.8b, v4.8b // temp2 = src[1_0] + src[4_0]
+    mla v18.8h, v14.8h , v22.8h // temp += temp1 * 20
+    ld1 {v2.s}[0], [x0], x2
+    mls v18.8h, v16.8h , v24.8h // temp -= temp2 * 5
+    uaddl v8.8h, v0.8b, v7.8b
+    uaddl v10.8h, v1.8b, v6.8b
+    uaddl v12.8h, v2.8b, v5.8b
+    sqrshrun v26.8b, v18.8h, #5
+    ld1 {v18.s}[0], [x7], x2 //Load value for interpolation - row 4
+    ld1 {v19.s}[0], [x7], x2 //Load value for interpolation - row 5
+    mla v12.8h, v8.8h , v22.8h
+    ld1 {v3.s}[0], [x0], x2
+    mls v12.8h, v10.8h , v24.8h
+    sqrshrun v27.8b, v12.8h, #5
+    urhadd v26.16b, v18.16b , v26.16b //Interpolation step for qpel calculation
+    urhadd v27.16b, v27.16b , v19.16b //Interpolation step for qpel calculation
+
+    st1 {v26.s}[0], [x1], x3 //store row 4
+    st1 {v27.s}[0], [x1], x3 // store row 5
+    uaddl v14.8h, v0.8b, v1.8b // temp1 = src[2_0] + src[3_0]
+    uaddl v16.8h, v2.8b, v7.8b // temp = src[0_0] + src[5_0]
+    uaddl v18.8h, v3.8b, v6.8b // temp2 = src[1_0] + src[4_0]
+    mla v18.8h, v14.8h , v22.8h // temp += temp1 * 20
+    ld1 {v4.s}[0], [x0], x2
+    mls v18.8h, v16.8h , v24.8h // temp -= temp2 * 5
+    uaddl v8.8h, v2.8b, v1.8b
+    uaddl v10.8h, v3.8b, v0.8b
+    uaddl v12.8h, v4.8b, v7.8b
+    sqrshrun v26.8b, v18.8h, #5
+    ld1 {v18.s}[0], [x7], x2 //Load value for interpolation - row 6
+    ld1 {v19.s}[0], [x7], x2 //Load value for interpolation - row 7
+    mla v12.8h, v8.8h , v22.8h
+    ld1 {v5.s}[0], [x0], x2
+    mls v12.8h, v10.8h , v24.8h
+    sqrshrun v27.8b, v12.8h, #5
+    urhadd v26.16b, v18.16b , v26.16b //Interpolation step for qpel calculation
+    urhadd v27.16b, v19.16b , v27.16b //Interpolation step for qpel calculation
+
+    st1 {v26.s}[0], [x1], x3 // store row 6
+    st1 {v27.s}[0], [x1], x3 // store row 7
+
+
+end_func:
+    // LDMFD sp!,{x4-x12,PC} //Restoring registers from stack
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
diff --git a/common/armv8/ih264_intra_pred_chroma_av8.s b/common/armv8/ih264_intra_pred_chroma_av8.s
new file mode 100755
index 0000000..62edfdc
--- /dev/null
+++ b/common/armv8/ih264_intra_pred_chroma_av8.s
@@ -0,0 +1,574 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+//******************************************************************************
+//* @file
+//* ih264_intra_pred_chroma_av8.s
+//*
+//* @brief
+//* Contains function definitions for intra chroma prediction .
+//*
+//* @author
+//* Ittiam
+//*
+//* @par List of Functions:
+//*
+//* - ih264_intra_pred_chroma_8x8_mode_vert_av8()
+//* - ih264_intra_pred_chroma_8x8_mode_horz_av8()
+//* - ih264_intra_pred_chroma_8x8_mode_dc_av8()
+//* - ih264_intra_pred_chroma_8x8_mode_plane_av8()
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+///* All the functions here are replicated from ih264_chroma_intra_pred_filters.c
+//
+
+///**
+///**
+///**
+//
+
+
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+.extern ih264_gai1_intrapred_chroma_plane_coeffs1
+.extern ih264_gai1_intrapred_chroma_plane_coeffs2
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_chroma_8x8_mode_dc
+//*
+//* @brief
+//* Perform Intra prediction for chroma_8x8 mode:DC
+//*
+//* @par Description:
+//* Perform Intra prediction for chroma_8x8 mode:DC ,described in sec 8.3.4.1
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source containing alternate U and V samples
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination with alternate U and V samples
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//** @param[in] ui_neighboravailability
+//* availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+
+
+ .global ih264_intra_pred_chroma_8x8_mode_dc_av8
+
+ih264_intra_pred_chroma_8x8_mode_dc_av8: // 8x8 chroma DC prediction; dispatch on neighbour availability
+
+
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    mov x19, #5 // mask: bit0 (left) | bit2 (top) of ui_neighboravailability
+    ands x6, x4, x19
+    beq none_available
+    cmp x6, #1
+    beq left_only_available
+    cmp x6, #4
+    beq top_only_available
+
+all_available: // both left and top neighbours present
+    ld1 {v0.8b, v1.8b}, [x0] // left neighbours (interleaved U/V)
+    add x6, x0, #18 // skip to top neighbours in the source buffer
+    ld1 {v2.8b, v3.8b}, [x6] // top neighbours (interleaved U/V)
+    uxtl v0.8h, v0.8b
+    uxtl v1.8h, v1.8b
+    addp v0.4s, v0.4s , v0.4s // pairwise sums to accumulate U and V separately
+    addp v1.4s, v1.4s , v1.4s
+    addp v0.4s, v0.4s , v0.4s
+    addp v1.4s, v1.4s , v1.4s
+    uxtl v2.8h, v2.8b
+    uxtl v3.8h, v3.8b
+    addp v2.4s, v2.4s , v2.4s
+    addp v3.4s, v3.4s , v3.4s
+    addp v2.4s, v2.4s , v2.4s
+    addp v3.4s, v3.4s , v3.4s
+    rshrn v5.8b, v0.8h, #2 // (sum + 2) >> 2 : 4-sample average
+    dup v21.8h, v5.h[0]
+    rshrn v6.8b, v3.8h, #2
+    dup v20.8h, v6.h[0]
+    add v1.8h, v1.8h, v2.8h
+    rshrn v1.8b, v1.8h, #3 // (sum + 4) >> 3 : 8-sample average (left + top halves)
+    dup v23.8h, v1.h[0]
+    mov v20.d[0], v23.d[0]
+    add v0.8h, v0.8h, v3.8h
+    rshrn v0.8b, v0.8h, #3
+    dup v23.8h, v0.h[0]
+    mov v21.d[1], v23.d[0]
+    b store
+left_only_available: // average the left column only
+    ld1 {v0.8b, v1.8b}, [x0]
+    uxtl v0.8h, v0.8b
+    uxtl v1.8h, v1.8b
+    addp v0.4s, v0.4s , v0.4s
+    addp v1.4s, v1.4s , v1.4s
+    addp v0.4s, v0.4s , v0.4s
+    addp v1.4s, v1.4s , v1.4s
+    rshrn v0.8b, v0.8h, #2
+    rshrn v1.8b, v1.8h, #2
+    dup v20.8h , v1.h[0]
+    dup v21.8h, v0.h[0]
+    b store
+
+top_only_available: // average the top row only
+    add x6, x0, #18
+    ld1 {v0.8b, v1.8b}, [x6]
+    uxtl v0.8h, v0.8b
+    uxtl v1.8h, v1.8b
+    addp v0.4s, v0.4s , v0.4s
+    addp v1.4s, v1.4s , v1.4s
+    addp v0.4s, v0.4s , v0.4s
+    addp v1.4s, v1.4s , v1.4s
+    rshrn v0.8b, v0.8h, #2
+    rshrn v1.8b, v1.8h, #2
+    dup v20.8h , v0.h[0]
+    dup v21.8h, v1.h[0]
+    mov v20.d[1], v21.d[1]
+    mov v21.d[0], v20.d[0]
+    b store
+none_available: // no neighbours: predict mid-grey (128) everywhere
+    mov w15, #128
+    dup v20.16b, w15
+    dup v21.16b, w15
+
+
+store: // v20 = rows 0-3 pattern, v21 = rows 4-7 pattern (16 interleaved U/V bytes each)
+
+    st1 { v20.16b}, [x1], x3
+    st1 { v20.16b}, [x1], x3
+    st1 { v20.16b}, [x1], x3
+    st1 { v20.16b}, [x1], x3
+    st1 { v21.16b}, [x1], x3
+    st1 { v21.16b}, [x1], x3
+    st1 { v21.16b}, [x1], x3
+    st1 { v21.16b}, [x1], x3
+end_func:
+
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+
+///******************************************************************************
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_chroma_8x8_mode_horz
+//*
+//* @brief
+//* Perform Intra prediction for chroma_8x8 mode:Horizontal
+//*
+//* @par Description:
+//* Perform Intra prediction for chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source containing alternate U and V samples
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination with alternate U and V samples
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels(Not used in this function)
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+//void ih264_intra_pred_chroma_8x8_mode_horz(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_chroma_8x8_mode_horz_av8
+
+ih264_intra_pred_chroma_8x8_mode_horz_av8: // 8x8 chroma horizontal prediction: each row replicates one left U/V pair
+
+
+
+    push_v_regs
+    ld1 {v0.8h}, [x0] // 8 left-neighbour U/V pairs, one 16-bit lane per pair
+
+    dup v10.8h, v0.h[7] // lane 7 = topmost left pair -> dst row 0
+    dup v11.8h, v0.h[6]
+    dup v12.8h, v0.h[5]
+    dup v13.8h, v0.h[4]
+    st1 {v10.8h}, [x1], x3
+    dup v14.8h, v0.h[3]
+    st1 {v11.8h}, [x1], x3
+    dup v15.8h, v0.h[2]
+    st1 {v12.8h}, [x1], x3
+    dup v16.8h, v0.h[1]
+    st1 {v13.8h}, [x1], x3
+    dup v17.8h, v0.h[0] // lane 0 = bottommost left pair -> dst row 7
+    st1 {v14.8h}, [x1], x3
+    st1 {v15.8h}, [x1], x3
+    st1 {v16.8h}, [x1], x3
+    st1 {v17.8h}, [x1], x3
+
+
+    pop_v_regs
+    ret
+
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_chroma_8x8_mode_vert
+//*
+//* @brief
+//* Perform Intra prediction for chroma_8x8 mode:vertical
+//*
+//* @par Description:
+//*Perform Intra prediction for chroma_8x8 mode:vertical ,described in sec 8.3.4.3
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source containing alternate U and V samples
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination with alternate U and V samples
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels(Not used in this function)
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//void ih264_intra_pred_chroma_8x8_mode_vert(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_chroma_8x8_mode_vert_av8
+
+ih264_intra_pred_chroma_8x8_mode_vert_av8: // 8x8 chroma vertical prediction: copy the top row to all 8 rows
+
+    push_v_regs
+
+    add x0, x0, #18 // skip to the top-neighbour samples in the source buffer
+    ld1 {v0.8b, v1.8b}, [x0] // 16 bytes = 8 interleaved U/V top pairs
+
+    st1 {v0.8b, v1.8b}, [x1], x3
+    st1 {v0.8b, v1.8b}, [x1], x3
+    st1 {v0.8b, v1.8b}, [x1], x3
+    st1 {v0.8b, v1.8b}, [x1], x3
+    st1 {v0.8b, v1.8b}, [x1], x3
+    st1 {v0.8b, v1.8b}, [x1], x3
+    st1 {v0.8b, v1.8b}, [x1], x3
+    st1 {v0.8b, v1.8b}, [x1], x3
+
+    pop_v_regs
+    ret
+
+
+
+
+///******************************************************************************
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_chroma_8x8_mode_plane
+//*
+//* @brief
+//* Perform Intra prediction for chroma_8x8 mode:PLANE
+//*
+//* @par Description:
+//* Perform Intra prediction for chroma_8x8 mode:PLANE ,described in sec 8.3.4.4
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source containing alternate U and V samples
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination with alternate U and V samples
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_chroma_8x8_mode_plane(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+ .global ih264_intra_pred_chroma_8x8_mode_plane_av8
+ih264_intra_pred_chroma_8x8_mode_plane_av8: // 8x8 chroma plane prediction: fit a linear gradient from the border samples
+
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    ld1 {v0.2s}, [x0] // border samples below/left of the block (interleaved U/V)
+    add x10, x0, #10
+    ld1 {v1.2s}, [x10]
+    add x10, x10, #6
+    rev64 v5.4h, v0.4h // reverse pair order for the H/V gradient differences
+    ld1 {v2.2s}, [x10], #8
+    add x10, x10, #2
+    rev64 v7.4h, v2.4h
+    ld1 {v3.2s}, [x10]
+    sub x5, x3, #8
+    adrp x12, :got:ih264_gai1_intrapred_chroma_plane_coeffs1 // table of weights 1..4 for the gradient sums
+    ldr x12, [x12, #:got_lo12:ih264_gai1_intrapred_chroma_plane_coeffs1]
+    usubl v10.8h, v5.8b, v1.8b // weighted differences -> horizontal gradient terms
+    ld1 {v8.8b, v9.8b}, [x12] // Load multiplication factors into v8/v9
+    mov v8.d[1], v9.d[0]
+    usubl v12.8h, v3.8b, v7.8b // weighted differences -> vertical gradient terms
+    mul v14.8h, v10.8h , v8.8h
+    mul v16.8h, v12.8h , v8.8h
+    uzp1 v15.8h, v14.8h, v16.8h // de-interleave so U and V terms can be summed separately
+    uzp2 v16.8h, v14.8h, v16.8h
+    mov v14.16b, v15.16b
+    mov v15.d[0], v14.d[1]
+    mov v17.d[0], v16.d[1]
+    addp v14.4h, v14.4h, v14.4h // horizontal reductions: sum the 4 weighted terms
+    addp v15.4h, v15.4h, v15.4h
+    addp v16.4h, v16.4h, v16.4h
+    addp v17.4h, v17.4h, v17.4h
+    addp v14.4h, v14.4h, v14.4h
+    addp v15.4h, v15.4h, v15.4h
+    addp v16.4h, v16.4h, v16.4h
+    addp v17.4h, v17.4h, v17.4h
+    mov x6, #34 // scale factor for b/c: (34 * H + 32) >> 6
+    dup v18.8h, w6
+    smull v22.4s, v14.4h, v18.4h
+    smull v24.4s, v15.4h, v18.4h
+    smull v26.4s, v16.4h, v18.4h
+    smull v28.4s, v17.4h, v18.4h
+    rshrn v10.4h, v22.4s, #6
+    rshrn v12.4h, v24.4s, #6
+    rshrn v13.4h, v26.4s, #6
+    rshrn v14.4h, v28.4s, #6
+    ldrb w6, [x0], #1 // corner samples for the DC term a = 16*(top-right + bottom-left)
+    sxtw x6, w6
+    add x10, x0, #31
+    ldrb w8, [x0], #1
+    sxtw x8, w8
+    ldrb w7, [x10], #1
+    sxtw x7, w7
+    ldrb w9, [x10], #1
+    sxtw x9, w9
+    add x6, x6, x7
+    add x8, x8, x9
+    lsl x6, x6, #4 // *16
+    lsl x8, x8, #4
+    dup v0.8h, w6
+    dup v2.8h, w8
+    dup v4.8h, v12.h[0]
+    dup v6.8h, v10.h[0]
+    dup v24.8h, v14.h[0]
+    dup v26.8h, v13.h[0]
+    zip1 v5.8h, v4.8h, v24.8h // re-interleave U and V plane parameters
+    zip2 v24.8h, v4.8h, v24.8h
+    mov v4.16b, v5.16b
+    zip1 v7.8h, v6.8h, v26.8h
+    zip2 v26.8h, v6.8h, v26.8h
+    mov v6.16b, v7.16b
+    zip1 v1.8h, v0.8h, v2.8h
+    zip2 v2.8h, v0.8h, v2.8h
+    mov v0.16b, v1.16b
+
+    adrp x12, :got:ih264_gai1_intrapred_chroma_plane_coeffs2 // table of (x-3)/(y-3) position offsets
+    ldr x12, [x12, #:got_lo12:ih264_gai1_intrapred_chroma_plane_coeffs2]
+
+    ld1 {v8.2s, v9.2s}, [x12]
+    mov v8.d[1], v9.d[0]
+    mov v10.16b, v8.16b
+    mov v22.16b, v8.16b
+    zip1 v9.8h, v8.8h, v10.8h
+    zip2 v10.8h, v8.8h, v10.8h
+    mov v8.16b, v9.16b
+    mul v12.8h, v4.8h , v8.8h // horizontal term b*(x-3) for the left / right half-rows
+    mul v16.8h, v4.8h , v10.8h
+    add v12.8h, v0.8h , v12.8h // + DC term a
+    add v16.8h, v0.8h , v16.8h
+    dup v20.8h, v22.h[0] // per-row vertical offsets c*(y-3), one lane per row below
+    mul v4.8h, v6.8h , v20.8h
+    dup v30.8h, v22.4h[1]
+    mul v18.8h, v6.8h , v20.8h
+    mul v14.8h, v6.8h , v30.8h
+    mul v8.8h, v6.8h , v30.8h
+    add v24.8h, v12.8h , v4.8h
+    add v0.8h, v16.8h , v18.8h
+    add v2.8h, v12.8h , v14.8h
+    sqrshrun v28.8b, v24.8h, #5 // clip to 8 bits: (pred + 16) >> 5
+    add v26.8h, v16.8h , v8.8h
+    sqrshrun v29.8b, v0.8h, #5
+    dup v20.8h, v22.4h[2]
+    st1 {v28.8b, v29.8b}, [x1], x3 // row 0
+    sqrshrun v28.8b, v2.8h, #5
+    sqrshrun v29.8b, v26.8h, #5
+    mul v4.8h, v6.8h , v20.8h
+    mul v18.8h, v6.8h , v20.8h
+    st1 {v28.8b, v29.8b}, [x1], x3 // row 1
+    add v24.8h, v12.8h , v4.8h
+    add v0.8h, v16.8h , v18.8h
+    dup v30.8h, v22.4h[3]
+    sqrshrun v28.8b, v24.8h, #5
+    sqrshrun v29.8b, v0.8h, #5
+    mul v14.8h, v6.8h , v30.8h
+    mul v8.8h, v6.8h , v30.8h
+    st1 {v28.8b, v29.8b}, [x1], x3 // row 2
+    add v2.8h, v12.8h , v14.8h
+    add v26.8h, v16.8h , v8.8h
+    dup v20.8h, v22.h[4]
+    sqrshrun v28.8b, v2.8h, #5
+    sqrshrun v29.8b, v26.8h, #5
+    mul v4.8h, v6.8h , v20.8h
+    mul v18.8h, v6.8h , v20.8h
+    st1 {v28.8b, v29.8b}, [x1], x3 // row 3
+    add v24.8h, v12.8h , v4.8h
+    add v0.8h, v16.8h , v18.8h
+    dup v30.8h, v22.h[5]
+    sqrshrun v28.8b, v24.8h, #5
+    sqrshrun v29.8b, v0.8h, #5
+    mul v14.8h, v6.8h , v30.8h
+    mul v8.8h, v6.8h , v30.8h
+    st1 {v28.8b, v29.8b}, [x1], x3 // row 4
+    add v2.8h, v12.8h , v14.8h
+    add v26.8h, v16.8h , v8.8h
+    dup v20.8h, v22.h[6]
+    sqrshrun v28.8b, v2.8h, #5
+    sqrshrun v29.8b, v26.8h, #5
+    mul v4.8h, v6.8h , v20.8h
+    mul v18.8h, v6.8h , v20.8h
+    st1 {v28.8b, v29.8b}, [x1], x3 // row 5
+    add v24.8h, v12.8h , v4.8h
+    add v0.8h, v16.8h , v18.8h
+    dup v30.8h, v22.h[7]
+    sqrshrun v28.8b, v24.8h, #5
+    sqrshrun v29.8b, v0.8h, #5
+    mul v14.8h, v6.8h , v30.8h
+    mul v8.8h, v6.8h , v30.8h
+    st1 {v28.8b, v29.8b}, [x1], x3 // row 6
+    add v2.8h, v12.8h , v14.8h
+    add v26.8h, v16.8h , v8.8h
+    sqrshrun v28.8b, v2.8h, #5
+    sqrshrun v29.8b, v26.8h, #5
+    st1 {v28.8b, v29.8b}, [x1], x3 // row 7
+
+end_func_plane:
+
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
diff --git a/common/armv8/ih264_intra_pred_luma_16x16_av8.s b/common/armv8/ih264_intra_pred_luma_16x16_av8.s
new file mode 100755
index 0000000..a9eb165
--- /dev/null
+++ b/common/armv8/ih264_intra_pred_luma_16x16_av8.s
@@ -0,0 +1,606 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+//******************************************************************************
+//* @file
+//* ih264_intra_pred_luma_16x16_av8.s
+//*
+//* @brief
+//* Contains function definitions for intra 16x16 Luma prediction .
+//*
+//* @author
+//* Ittiam
+//*
+//* @par List of Functions:
+//*
+//* - ih264_intra_pred_luma_16x16_mode_vert_av8()
+//* - ih264_intra_pred_luma_16x16_mode_horz_av8()
+//* - ih264_intra_pred_luma_16x16_mode_dc_av8()
+//* - ih264_intra_pred_luma_16x16_mode_plane_av8()
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+///* All the functions here are replicated from ih264_intra_pred_filters.c
+//
+
+///**
+///**
+///**
+//
+
+
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+.extern ih264_gai1_intrapred_luma_plane_coeffs
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_16x16_mode_vert
+//*
+//* @brief
+//* Perform Intra prediction for luma_16x16 mode:vertical
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_16x16 mode:Vertical ,described in sec 8.3.3.1
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels(Not used in this function)
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//void ih264_intra_pred_luma_16x16_mode_vert(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_16x16_mode_vert_av8
+
+ih264_intra_pred_luma_16x16_mode_vert_av8:
+
+ push_v_regs
+
+ // Skip past the 17 neighbour bytes that precede the top row
+ // (presumably 16 left pixels + 1 top-left, per the source layout used
+ // by the C reference — TODO confirm); x0 then points at the 16 top pixels.
+ add x0, x0, #17
+ ld1 {v0.8b, v1.8b}, [x0]
+
+ // Replicate the 16 top-neighbour pixels into all 16 destination rows,
+ // advancing by dst_strd (x3) after each store.
+ st1 {v0.8b, v1.8b}, [x1], x3
+ st1 {v0.8b, v1.8b}, [x1], x3
+ st1 {v0.8b, v1.8b}, [x1], x3
+ st1 {v0.8b, v1.8b}, [x1], x3
+ st1 {v0.8b, v1.8b}, [x1], x3
+ st1 {v0.8b, v1.8b}, [x1], x3
+ st1 {v0.8b, v1.8b}, [x1], x3
+ st1 {v0.8b, v1.8b}, [x1], x3
+ st1 {v0.8b, v1.8b}, [x1], x3
+ st1 {v0.8b, v1.8b}, [x1], x3
+ st1 {v0.8b, v1.8b}, [x1], x3
+ st1 {v0.8b, v1.8b}, [x1], x3
+ st1 {v0.8b, v1.8b}, [x1], x3
+ st1 {v0.8b, v1.8b}, [x1], x3
+ st1 {v0.8b, v1.8b}, [x1], x3
+ st1 {v0.8b, v1.8b}, [x1], x3
+
+ pop_v_regs
+ ret
+
+
+
+
+
+///******************************************************************************
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_16x16_mode_horz
+//*
+//* @brief
+//* Perform Intra prediction for luma_16x16 mode:horizontal
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_16x16 mode:horizontal ,described in sec 8.3.3.2
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels(Not used in this function)
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+//void ih264_intra_pred_luma_16x16_mode_horz(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+ .global ih264_intra_pred_luma_16x16_mode_horz_av8
+
+ih264_intra_pred_luma_16x16_mode_horz_av8:
+
+
+
+ push_v_regs
+
+ // Load the 16 left-neighbour pixels. They are stored bottom-to-top
+ // (b[15] is the pixel adjacent to the first output row — presumably,
+ // matching the reversed-left layout of the C reference; verify there).
+ ld1 {v0.16b}, [x0]
+
+
+
+ // Each output row is the corresponding left pixel broadcast across
+ // 16 bytes. dup/st1 are interleaved to hide latency.
+ dup v10.16b, v0.b[15]
+ dup v11.16b, v0.b[14]
+ dup v12.16b, v0.b[13]
+ dup v13.16b, v0.b[12]
+ st1 {v10.16b}, [x1], x3
+ dup v14.16b, v0.b[11]
+ st1 {v11.16b}, [x1], x3
+ dup v15.16b, v0.b[10]
+ st1 {v12.16b}, [x1], x3
+ dup v16.16b, v0.b[9]
+ st1 {v13.16b}, [x1], x3
+ dup v17.16b, v0.b[8]
+ st1 {v14.16b}, [x1], x3
+ dup v18.16b, v0.b[7]
+ st1 {v15.16b}, [x1], x3
+ dup v19.16b, v0.b[6]
+ st1 {v16.16b}, [x1], x3
+ dup v20.16b, v0.b[5]
+ st1 {v17.16b}, [x1], x3
+ dup v21.16b, v0.b[4]
+ st1 {v18.16b}, [x1], x3
+ dup v22.16b, v0.b[3]
+ st1 {v19.16b}, [x1], x3
+ dup v23.16b, v0.b[2]
+ st1 {v20.16b}, [x1], x3
+ dup v24.16b, v0.b[1]
+ st1 {v21.16b}, [x1], x3
+ dup v25.16b, v0.b[0]
+ st1 {v22.16b}, [x1], x3
+ st1 {v23.16b}, [x1], x3
+ st1 {v24.16b}, [x1], x3
+ st1 {v25.16b}, [x1], x3
+
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
+///******************************************************************************
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_16x16_mode_dc
+//*
+//* @brief
+//* Perform Intra prediction for luma_16x16 mode:DC
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_16x16 mode:DC ,described in sec 8.3.3.3
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_16x16_mode_dc(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+ .global ih264_intra_pred_luma_16x16_mode_dc_av8
+
+ih264_intra_pred_luma_16x16_mode_dc_av8:
+
+
+
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+
+ // Zero v0 (left pixels) and v1 (top pixels) so an unavailable side
+ // contributes nothing to the sum.
+ sub v0.16b, v0.16b, v0.16b
+ sub v1.16b, v1.16b, v1.16b
+ // w10 = rounding term (8 per available side), w11 = log2 divisor
+ // (3 + 1 per available side, i.e. >>4 for one side, >>5 for both).
+ mov w10, #0
+ mov w11 , #3
+ ands x6, x4, #0x01 // bit 0 of availability = left
+ beq top_available //LEFT NOT AVAILABLE
+ ld1 {v0.16b}, [x0] // 16 left-neighbour pixels
+ add w10, w10, #8
+ add w11, w11, #1
+top_available:
+ ands x6, x4, #0x04 // bit 2 of availability = top
+ beq none_available
+ add x6, x0, #17 // skip left pixels + top-left to reach top row
+ ld1 {v1.16b}, [x6]
+ add w10, w10, #8
+ add w11, w11, #1
+ b summation
+none_available:
+ cmp x4, #0
+ bne summation
+ // NOTE(review): if x4 is nonzero but has neither bit 0 nor bit 2 set,
+ // control reaches summation with zeroed sums and predicts 0, not 128 —
+ // confirm this matches the C reference for such availability values.
+ mov w15, #128 // no neighbours at all: DC value is 128
+ dup v20.16b, w15
+ b store
+summation:
+ // Widen and sum all available neighbour pixels into one 16-bit total.
+ uaddl v2.8h, v0.8b, v1.8b
+ uaddl2 v3.8h, v0.16b, v1.16b
+ dup v10.8h, w10
+ neg w11, w11 // negative shift count => uqshl performs a right shift
+ dup v20.8h, w11
+ add v0.8h, v2.8h, v3.8h
+ mov v1.d[0], v0.d[1]
+ add v0.4h, v0.4h, v1.4h
+ addp v0.4h, v0.4h , v0.4h
+ addp v0.4h, v0.4h , v0.4h
+ add v0.4h, v0.4h, v10.4h // add rounding term
+ uqshl v0.8h, v0.8h, v20.8h // (sum + round) >> (4 or 5)
+ sqxtun v0.8b, v0.8h
+ dup v20.16b, v0.b[0] // broadcast DC value to all 16 lanes
+
+store:
+
+ // Fill all 16 rows of the 16x16 block with the DC value.
+ st1 { v20.16b}, [x1], x3
+ st1 { v20.16b}, [x1], x3
+ st1 { v20.16b}, [x1], x3
+ st1 { v20.16b}, [x1], x3
+ st1 { v20.16b}, [x1], x3
+ st1 { v20.16b}, [x1], x3
+ st1 { v20.16b}, [x1], x3
+ st1 { v20.16b}, [x1], x3
+ st1 { v20.16b}, [x1], x3
+ st1 { v20.16b}, [x1], x3
+ st1 { v20.16b}, [x1], x3
+ st1 { v20.16b}, [x1], x3
+ st1 { v20.16b}, [x1], x3
+ st1 { v20.16b}, [x1], x3
+ st1 { v20.16b}, [x1], x3
+ st1 { v20.16b}, [x1], x3
+
+
+
+end_func:
+
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
+
+
+
+///******************************************************************************
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_16x16_mode_plane
+//*
+//* @brief
+//* Perform Intra prediction for luma_16x16 mode:PLANE
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_16x16 mode:PLANE ,described in sec 8.3.3.4
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_16x16_mode_plane(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+ .global ih264_intra_pred_luma_16x16_mode_plane_av8
+ih264_intra_pred_luma_16x16_mode_plane_av8:
+
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+ // x2 = pu1_dst (dst pointer is moved out of x1 so x1 can address source),
+ // x1 -> top-left neighbour, x0 -> last left neighbour.
+ mov x2, x1
+ add x1, x0, #17
+ add x0, x0, #15
+ mov x8, #9
+ sub x1, x1, #1
+ mov x10, x1 //top_left
+ mov x4, #-1
+ ld1 {v2.2s}, [x1], x8 // top-left..top[6]; x1 advances to top[8]
+
+ // Plane multiplier table {1,2,...,8} (and its second half) from the
+ // shared constant ih264_gai1_intrapred_luma_plane_coeffs.
+ adrp x7, :got:ih264_gai1_intrapred_luma_plane_coeffs
+ ldr x7, [x7, #:got_lo12:ih264_gai1_intrapred_luma_plane_coeffs]
+
+ // H gradient: sum of coeff[i] * (top[8+i] - top[6-i]); rev64 aligns the
+ // mirrored lower-half samples with the upper-half ones.
+ ld1 {v0.2s}, [x1]
+ rev64 v2.8b, v2.8b
+ ld1 {v6.2s, v7.2s}, [x7]
+ usubl v0.8h, v0.8b, v2.8b
+ uxtl v16.8h, v6.8b
+ mul v0.8h, v0.8h , v16.8h
+ uxtl v18.8h, v7.8b
+ // x7 walks the left column downward, x0 upward; their byte-wise
+ // differences accumulate the V gradient (i_c) in x12 on the scalar side
+ // while the NEON side reduces the H gradient.
+ add x7, x0, x4, lsl #3
+ sub x0, x7, x4, lsl #1
+ sub x20, x4, #0x0
+ neg x14, x20
+ addp v0.8h, v0.8h, v1.8h
+ ldrb w8, [x7], #-1
+ sxtw x8, w8
+ ldrb w9, [x0], #1
+ sxtw x9, w9
+ saddlp v0.2s, v0.4h
+ sub x12, x8, x9
+ ldrb w8, [x7], #-1
+ sxtw x8, w8
+ saddlp v0.1d, v0.2s
+ ldrb w9, [x0], #1
+ sxtw x9, w9
+ sub x8, x8, x9
+ // i_b = (5 * H + 32) >> 6, computed as (H + 4*H + 32) >> 6.
+ shl v2.2s, v0.2s, #2
+ add x12, x12, x8, lsl #1
+ add v0.2s, v0.2s , v2.2s
+ ldrb w8, [x7], #-1
+ sxtw x8, w8
+ ldrb w9, [x0], #1
+ sxtw x9, w9
+ srshr v0.2s, v0.2s, #6 // i_b = D0[0]
+ sub x8, x8, x9
+ ldrb w5, [x7], #-1
+ sxtw x5, w5
+ add x8, x8, x8, lsl #1
+ dup v4.8h, v0.4h[0] // broadcast i_b
+ add x12, x12, x8
+ ldrb w9, [x0], #1
+ sxtw x9, w9
+ mul v0.8h, v4.8h , v16.8h // i_b * {1..8}  (left half of row)
+ sub x5, x5, x9
+ mul v2.8h, v4.8h , v18.8h // i_b * {coeffs} (right half of row)
+ add x12, x12, x5, lsl #2
+ ldrb w8, [x7], #-1
+ sxtw x8, w8
+ ldrb w9, [x0], #1
+ sxtw x9, w9
+ sub x8, x8, x9
+ ldrb w5, [x7], #-1
+ sxtw x5, w5
+ add x8, x8, x8, lsl #2
+ ldrb w6, [x0], #1
+ sxtw x6, w6
+ add x12, x12, x8
+ ldrb w8, [x7], #-1
+ sxtw x8, w8
+ ldrb w9, [x0], #1
+ sxtw x9, w9
+ sub x5, x5, x6
+ sub x8, x8, x9
+ add x5, x5, x5, lsl #1
+ sub x20, x8, x8, lsl #3
+ neg x8, x20
+ add x12, x12, x5, lsl #1
+ ldrb w5, [x7], #-1
+ sxtw x5, w5
+ ldrb w6, [x10] //top_left
+ sxtw x6, w6
+ add x12, x12, x8
+ sub x9, x5, x6
+ ldrb w6, [x1, #7]
+ sxtw x6, w6
+ add x12, x12, x9, lsl #3 // i_c = x12
+ add x8, x5, x6
+ // i_c = (5 * V + 32) >> 6 ; i_a = 16 * (left[15] + top[15]).
+ add x12, x12, x12, lsl #2
+ lsl x8, x8, #4 // i_a = x8
+ add x12, x12, #0x20
+ lsr x12, x12, #6
+ // Row 0 base = i_a - 8*i_b - 8*i_c + i_c; each subsequent row adds i_c,
+ // and sqrshrun #5 performs clip1((... + 16) >> 5).
+ shl v28.8h, v4.8h, #3
+ dup v6.8h, w12
+ dup v30.8h, w8
+ shl v26.8h, v6.8h, #3
+ sub v30.8h, v30.8h , v28.8h
+ sub v30.8h, v30.8h , v26.8h
+ add v28.8h, v30.8h , v6.8h
+ add v26.8h, v28.8h , v0.8h
+ add v28.8h, v28.8h , v2.8h
+ sqrshrun v20.8b, v26.8h, #5
+ sqrshrun v21.8b, v28.8h, #5
+ add v26.8h, v26.8h , v6.8h
+ add v28.8h, v28.8h , v6.8h
+ sqrshrun v22.8b, v26.8h, #5
+ st1 {v20.2s, v21.2s}, [x2], x3
+ sqrshrun v23.8b, v28.8h, #5
+ add v26.8h, v26.8h , v6.8h
+ add v28.8h, v28.8h , v6.8h
+ sqrshrun v20.8b, v26.8h, #5
+ st1 {v22.2s, v23.2s}, [x2], x3
+ sqrshrun v21.8b, v28.8h, #5
+ add v26.8h, v26.8h , v6.8h
+ add v28.8h, v28.8h , v6.8h
+ sqrshrun v22.8b, v26.8h, #5
+ st1 {v20.2s, v21.2s}, [x2], x3
+ sqrshrun v23.8b, v28.8h, #5
+ add v26.8h, v26.8h , v6.8h
+ add v28.8h, v28.8h , v6.8h
+ sqrshrun v20.8b, v26.8h, #5
+ st1 {v22.2s, v23.2s}, [x2], x3
+ sqrshrun v21.8b, v28.8h, #5
+ add v26.8h, v26.8h , v6.8h
+ add v28.8h, v28.8h , v6.8h
+ sqrshrun v22.8b, v26.8h, #5
+ st1 {v20.2s, v21.2s}, [x2], x3
+ sqrshrun v23.8b, v28.8h, #5
+ add v26.8h, v26.8h , v6.8h
+ add v28.8h, v28.8h , v6.8h
+ sqrshrun v20.8b, v26.8h, #5
+ st1 {v22.2s, v23.2s}, [x2], x3
+ sqrshrun v21.8b, v28.8h, #5
+ add v26.8h, v26.8h , v6.8h
+ add v28.8h, v28.8h , v6.8h
+ sqrshrun v22.8b, v26.8h, #5
+ st1 {v20.2s, v21.2s}, [x2], x3
+ sqrshrun v23.8b, v28.8h, #5
+ add v26.8h, v26.8h , v6.8h
+ add v28.8h, v28.8h , v6.8h
+ sqrshrun v20.8b, v26.8h, #5
+ st1 {v22.2s, v23.2s}, [x2], x3
+ sqrshrun v21.8b, v28.8h, #5
+ add v26.8h, v26.8h , v6.8h
+ add v28.8h, v28.8h , v6.8h
+ sqrshrun v22.8b, v26.8h, #5
+ st1 {v20.2s, v21.2s}, [x2], x3
+ sqrshrun v23.8b, v28.8h, #5
+ add v26.8h, v26.8h , v6.8h
+ add v28.8h, v28.8h , v6.8h
+ sqrshrun v20.8b, v26.8h, #5
+ st1 {v22.2s, v23.2s}, [x2], x3
+ sqrshrun v21.8b, v28.8h, #5
+ add v26.8h, v26.8h , v6.8h
+ add v28.8h, v28.8h , v6.8h
+ sqrshrun v22.8b, v26.8h, #5
+ st1 {v20.2s, v21.2s}, [x2], x3
+ sqrshrun v23.8b, v28.8h, #5
+ add v26.8h, v26.8h , v6.8h
+ add v28.8h, v28.8h , v6.8h
+ sqrshrun v20.8b, v26.8h, #5
+ st1 {v22.2s, v23.2s}, [x2], x3
+ sqrshrun v21.8b, v28.8h, #5
+ add v26.8h, v26.8h , v6.8h
+ add v28.8h, v28.8h , v6.8h
+ sqrshrun v22.8b, v26.8h, #5
+ st1 {v20.2s, v21.2s}, [x2], x3
+ sqrshrun v23.8b, v28.8h, #5
+ add v26.8h, v26.8h , v6.8h
+ add v28.8h, v28.8h , v6.8h
+ sqrshrun v20.8b, v26.8h, #5
+ st1 {v22.2s, v23.2s}, [x2], x3
+ sqrshrun v21.8b, v28.8h, #5
+ add v26.8h, v26.8h , v6.8h
+ add v28.8h, v28.8h , v6.8h
+ sqrshrun v22.8b, v26.8h, #5
+ st1 {v20.2s, v21.2s}, [x2], x3
+ sqrshrun v23.8b, v28.8h, #5
+ st1 {v22.2s, v23.2s}, [x2], x3
+
+end_func_plane:
+
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
diff --git a/common/armv8/ih264_intra_pred_luma_4x4_av8.s b/common/armv8/ih264_intra_pred_luma_4x4_av8.s
new file mode 100755
index 0000000..62e8cee
--- /dev/null
+++ b/common/armv8/ih264_intra_pred_luma_4x4_av8.s
@@ -0,0 +1,876 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+//******************************************************************************
+//* @file
+//* ih264_intra_pred_luma_4x4_av8.s
+//*
+//* @brief
+//* Contains function definitions for intra 4x4 Luma prediction .
+//*
+//* @author
+//* Ittiam
+//*
+//* @par List of Functions:
+//*
+//* -ih264_intra_pred_luma_4x4_mode_vert_av8
+//* -ih264_intra_pred_luma_4x4_mode_horz_av8
+//* -ih264_intra_pred_luma_4x4_mode_dc_av8
+//* -ih264_intra_pred_luma_4x4_mode_diag_dl_av8
+//* -ih264_intra_pred_luma_4x4_mode_diag_dr_av8
+//* -ih264_intra_pred_luma_4x4_mode_vert_r_av8
+//* -ih264_intra_pred_luma_4x4_mode_horz_d_av8
+//* -ih264_intra_pred_luma_4x4_mode_vert_l_av8
+//* -ih264_intra_pred_luma_4x4_mode_horz_u_av8
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+///* All the functions here are replicated from ih264_intra_pred_filters.c
+//
+
+///**
+///**
+///**
+//
+
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_4x4_mode_vert
+//*
+//* @brief
+//* Perform Intra prediction for luma_4x4 mode:vertical
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_4x4 mode:vertical ,described in sec 8.3.1.2.1
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels(Not used in this function)
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//void ih264_intra_pred_luma_4x4_mode_vert(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+ .global ih264_intra_pred_luma_4x4_mode_vert_av8
+
+ih264_intra_pred_luma_4x4_mode_vert_av8:
+
+ push_v_regs
+
+ // Skip the 5 preceding neighbour bytes (presumably 4 left pixels +
+ // top-left, per the 4x4 source layout — confirm in C reference);
+ // x0 then points at the 4 top pixels.
+ add x0, x0, #5
+
+ // Copy the 4 top pixels into each of the 4 destination rows.
+ ld1 {v0.s}[0], [x0]
+ st1 {v0.s}[0], [x1], x3
+ st1 {v0.s}[0], [x1], x3
+ st1 {v0.s}[0], [x1], x3
+ st1 {v0.s}[0], [x1], x3
+
+ pop_v_regs
+ ret
+
+
+
+
+
+///******************************************************************************
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_4x4_mode_horz
+//*
+//* @brief
+//* Perform Intra prediction for luma_4x4 mode:horizontal
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_4x4 mode:horizontal ,described in sec 8.3.1.2.2
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels(Not used in this function)
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+//void ih264_intra_pred_luma_4x4_mode_horz(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+
+
+ .global ih264_intra_pred_luma_4x4_mode_horz_av8
+
+ih264_intra_pred_luma_4x4_mode_horz_av8:
+
+ push_v_regs
+
+ // Load the 4 left-neighbour pixels (stored bottom-to-top, so b[3]
+ // belongs to the first output row) and broadcast one per row.
+ ld1 {v1.s}[0], [x0]
+ dup v0.8b, v1.b[3]
+ dup v2.8b, v1.b[2]
+ st1 {v0.s}[0], [x1], x3
+ dup v3.8b, v1.b[1]
+ st1 {v2.s}[0], [x1], x3
+ dup v4.8b, v1.b[0]
+ st1 {v3.s}[0], [x1], x3
+ st1 {v4.s}[0], [x1], x3
+
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
+///******************************************************************************
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_4x4_mode_dc
+//*
+//* @brief
+//* Perform Intra prediction for luma_4x4 mode:DC
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_4x4 mode:DC ,described in sec 8.3.1.2.3
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_4x4_mode_dc(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+
+
+ .global ih264_intra_pred_luma_4x4_mode_dc_av8
+
+ih264_intra_pred_luma_4x4_mode_dc_av8:
+
+
+
+
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+
+ ands x5, x4, #0x01 // bit 0 of availability = left
+ beq top_available //LEFT NOT AVAILABLE
+
+ // Sum the 4 left-neighbour pixels (read from offset 3 downward).
+ add x10, x0, #3
+ mov x2, #-1
+ ldrb w5, [x10], #-1
+ sxtw x5, w5
+ ldrb w6, [x10], #-1
+ sxtw x6, w6
+ ldrb w7, [x10], #-1
+ sxtw x7, w7
+ add x5, x5, x6
+ ldrb w8, [x10], #-1
+ sxtw x8, w8
+ add x5, x5, x7
+ ands x11, x4, #0x04 // CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
+ add x5, x5, x8
+ beq left_available
+ add x10, x0, #5 // offset 5 = first top-neighbour pixel
+ // BOTH LEFT AND TOP AVAILABLE: dc = (left_sum + top_sum + 4) >> 3
+ ldrb w6, [x10], #1
+ sxtw x6, w6
+ ldrb w7, [x10], #1
+ sxtw x7, w7
+ add x5, x5, x6
+ ldrb w8, [x10], #1
+ sxtw x8, w8
+ add x5, x5, x7
+ ldrb w9, [x10], #1
+ sxtw x9, w9
+ add x5, x5, x8
+ add x5, x5, x9
+ add x5, x5, #4
+ lsr x5, x5, #3
+ dup v0.8b, w5
+ st1 {v0.s}[0], [x1], x3
+ st1 {v0.s}[0], [x1], x3
+ st1 {v0.s}[0], [x1], x3
+ st1 {v0.s}[0], [x1], x3
+ b end_func
+
+top_available: // ONLY TOP AVAILABLE
+ ands x11, x4, #0x04 // CHECKING TOP AVAILABILITY, ELSE BRANCH TO NONE AVAILABLE
+ beq none_available
+
+ // dc = (top_sum + 2) >> 2
+ add x10, x0, #5
+ ldrb w6, [x10], #1
+ sxtw x6, w6
+ ldrb w7, [x10], #1
+ sxtw x7, w7
+ ldrb w8, [x10], #1
+ sxtw x8, w8
+ add x5, x6, x7
+ ldrb w9, [x10], #1
+ sxtw x9, w9
+ add x5, x5, x8
+ add x5, x5, x9
+ add x5, x5, #2
+ lsr x5, x5, #2
+ dup v0.8b, w5
+ st1 {v0.s}[0], [x1], x3
+ st1 {v0.s}[0], [x1], x3
+ st1 {v0.s}[0], [x1], x3
+ st1 {v0.s}[0], [x1], x3
+ b end_func
+
+left_available: //ONLY LEFT AVAILABLE
+ // dc = (left_sum + 2) >> 2 (left_sum already in x5)
+ add x5, x5, #2
+ lsr x5, x5, #2
+ dup v0.8b, w5
+ st1 {v0.s}[0], [x1], x3
+ st1 {v0.s}[0], [x1], x3
+ st1 {v0.s}[0], [x1], x3
+ st1 {v0.s}[0], [x1], x3
+ b end_func
+
+none_available: //NONE AVAILABLE
+ // No neighbours: predict the mid-grey value 128.
+ mov x5, #128
+ dup v0.8b, w5
+ st1 {v0.s}[0], [x1], x3
+ st1 {v0.s}[0], [x1], x3
+ st1 {v0.s}[0], [x1], x3
+ st1 {v0.s}[0], [x1], x3
+ b end_func
+
+
+end_func:
+
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_4x4_mode_diag_dl
+//*
+//* @brief
+//* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left ,described in sec 8.3.1.2.4
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_4x4_mode_diag_dl(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_4x4_mode_diag_dl_av8
+
+ih264_intra_pred_luma_4x4_mode_diag_dl_av8:
+
+
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+
+ // x0 -> 8 top / top-right neighbour pixels (offset 5 past left +
+ // top-left). The last sample is duplicated into v2 lane 6 so the
+ // 3-tap filter has a valid right neighbour at the edge.
+ add x0, x0, #5
+ sub x5, x3, #2
+ add x6, x0, #7
+ ld1 {v0.8b}, [x0]
+ ext v1.8b, v0.8b , v0.8b , #1
+ ext v2.8b, v0.8b , v0.8b , #2
+ ld1 {v2.b}[6], [x6]
+ // 3-tap smoothing: (p[i] + 2*p[i+1] + p[i+2] + 2) >> 2 via two
+ // widening adds and a rounding narrow.
+ uaddl v20.8h, v0.8b, v1.8b
+ uaddl v22.8h, v1.8b, v2.8b
+ add v24.8h, v20.8h , v22.8h
+ sqrshrun v3.8b, v24.8h, #2
+ // Rows are overlapping 4-byte windows of the filtered vector.
+ st1 {v3.s}[0], [x1], x3
+ ext v4.8b, v3.8b , v3.8b , #1
+ st1 {v4.s}[0], [x1], x3
+ st1 {v3.h}[1], [x1], #2
+ st1 {v3.h}[2], [x1], x5
+ st1 {v4.h}[1], [x1], #2
+ st1 {v4.h}[2], [x1]
+
+end_func_diag_dl:
+
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_4x4_mode_diag_dr
+//*
+//* @brief
+//* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right ,described in sec 8.3.1.2.5
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_4x4_mode_diag_dr(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_4x4_mode_diag_dr_av8
+
+ih264_intra_pred_luma_4x4_mode_diag_dr_av8:
+
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+
+
+ // Load the neighbour array twice at a 1-byte offset and build a third
+ // shifted copy, then apply the 3-tap rounding filter
+ // (p[i] + 2*p[i+1] + p[i+2] + 2) >> 2 across left/top-left/top.
+ ld1 {v0.8b}, [x0]
+ add x0, x0, #1
+ ld1 {v1.8b}, [x0]
+ ext v2.8b, v1.8b , v1.8b , #1
+ uaddl v20.8h, v0.8b, v1.8b
+ uaddl v22.8h, v1.8b, v2.8b
+ add v24.8h, v20.8h , v22.8h
+ sqrshrun v3.8b, v24.8h, #2
+
+ // Each lower row is the row above shifted one sample to the left
+ // along the down-right diagonal; stores pick overlapping windows.
+ ext v4.8b, v3.8b , v3.8b , #1
+ sub x5, x3, #2
+ st1 {v4.h}[1], [x1], #2
+ st1 {v4.h}[2], [x1], x5
+ st1 {v3.h}[1], [x1], #2
+ st1 {v3.h}[2], [x1], x5
+ st1 {v4.s}[0], [x1], x3
+ st1 {v3.s}[0], [x1], x3
+
+end_func_diag_dr:
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_4x4_mode_vert_r
+//*
+//* @brief
+//* Perform Intra prediction for luma_4x4 mode:Vertical_Right
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_4x4 mode:Vertical_Right ,described in sec 8.3.1.2.6
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_4x4_mode_vert_r(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_4x4_mode_vert_r_av8
+
+ih264_intra_pred_luma_4x4_mode_vert_r_av8:
+
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+
+
+ // Build both filter outputs over the neighbour array:
+ //   v4 = 2-tap averages  (p[i] + p[i+1] + 1) >> 1
+ //   v3 = 3-tap smoothing (p[i] + 2*p[i+1] + p[i+2] + 2) >> 2
+ ld1 {v0.8b}, [x0]
+ add x0, x0, #1
+ ld1 {v1.8b}, [x0]
+ ext v2.8b, v1.8b , v1.8b , #1
+ uaddl v20.8h, v0.8b, v1.8b
+ uaddl v22.8h, v1.8b, v2.8b
+ add v24.8h, v20.8h , v22.8h
+ sqrshrun v4.8b, v20.8h, #1
+ sqrshrun v3.8b, v24.8h, #2
+ // Rows alternate between the 2-tap and 3-tap vectors; odd rows are
+ // additionally prefixed with filtered left-column samples, hence the
+ // byte/halfword piecewise stores with adjusted strides.
+ sub x5, x3, #2
+ ext v5.8b, v3.8b , v3.8b , #3
+ st1 {v4.s}[1], [x1], x3
+ st1 {v5.s}[0], [x1], x3
+ sub x8, x3, #3
+ st1 {v3.b}[2], [x1], #1
+ st1 {v4.h}[2], [x1], #2
+ st1 {v4.b}[6], [x1], x8
+ st1 {v3.b}[1], [x1], #1
+ st1 {v5.h}[0], [x1], #2
+ st1 {v5.b}[2], [x1]
+
+
+end_func_vert_r:
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_4x4_mode_horz_d
+//*
+//* @brief
+//* Perform Intra prediction for luma_4x4 mode:Horizontal_Down
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_4x4 mode:Horizontal_Down ,described in sec 8.3.1.2.7
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_4x4_mode_horz_d(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_4x4_mode_horz_d_av8
+
+ih264_intra_pred_luma_4x4_mode_horz_d_av8:
+
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+
+ // v4 = 2-tap averages, v5 = 3-tap smoothed samples over the
+ // left / top-left / top neighbour array.
+ ld1 {v0.8b}, [x0]
+ add x0, x0, #1
+ ld1 {v1.8b}, [x0]
+ ext v2.8b, v1.8b , v0.8b , #1
+ uaddl v20.8h, v0.8b, v1.8b
+ uaddl v22.8h, v1.8b, v2.8b
+ add v24.8h, v20.8h , v22.8h
+ sqrshrun v4.8b, v20.8h, #1
+ sqrshrun v5.8b, v24.8h, #2
+ sub x5, x3, #2
+ mov v6.8b, v5.8b // keep unmodified 3-tap copy for the first row
+ // Interleave the 2-tap and 3-tap results so each row's pixel pairs
+ // come out in diagonal order.
+ trn1 v10.8b, v4.8b, v5.8b
+ trn2 v5.8b, v4.8b, v5.8b //
+ mov v4.8b, v10.8b
+ st1 {v5.h}[1], [x1], #2
+ st1 {v6.h}[2], [x1], x5
+ st1 {v4.h}[1], [x1], #2
+ st1 {v5.h}[1], [x1], x5
+ st1 {v5.h}[0], [x1], #2
+ st1 {v4.h}[1], [x1], x5
+ st1 {v4.h}[0], [x1], #2
+ st1 {v5.h}[0], [x1], x5
+
+end_func_horz_d:
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_4x4_mode_vert_l
+//*
+//* @brief
+//* Perform Intra prediction for luma_4x4 mode:Vertical_Left
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_4x4 mode:Vertical_Left ,described in sec 8.3.1.2.8
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_4x4_mode_vert_l(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_4x4_mode_vert_l_av8
+
+ih264_intra_pred_luma_4x4_mode_vert_l_av8:
+
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+ // x0 + 4 -> top-left, then +1 -> top/top-right neighbour run used
+ // by the filters.
+ add x0, x0, #4
+ ld1 {v0.8b}, [x0]
+ add x0, x0, #1
+ ld1 {v1.8b}, [x0]
+ ext v2.8b, v1.8b , v0.8b , #1
+ // v4 = 2-tap averages (even rows), v5 = 3-tap smoothing (odd rows).
+ uaddl v20.8h, v0.8b, v1.8b
+ uaddl v22.8h, v1.8b, v2.8b
+ add v24.8h, v20.8h , v22.8h
+ sqrshrun v4.8b, v20.8h, #1
+ sqrshrun v5.8b, v24.8h, #2
+ // Successive rows shift one sample to the left (ext by #1 then #2).
+ ext v6.8b, v4.8b , v4.8b , #1
+ ext v7.8b, v5.8b , v5.8b , #1
+ st1 {v6.s}[0], [x1], x3
+ ext v8.8b, v4.8b , v4.8b , #2
+ ext v9.8b, v5.8b , v5.8b , #2
+ st1 {v7.s}[0], [x1], x3
+ st1 {v8.s}[0], [x1], x3
+ st1 {v9.s}[0], [x1], x3
+
+end_func_vert_l:
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_4x4_mode_horz_u
+//*
+//* @brief
+//* Perform Intra prediction for luma_4x4 mode:Horizontal_Up
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_4x4 mode:Horizontal_Up ,described in sec 8.3.1.2.9
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_4x4_mode_horz_u(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+
+ .global ih264_intra_pred_luma_4x4_mode_horz_u_av8
+
+ih264_intra_pred_luma_4x4_mode_horz_u_av8:
+
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+ // w9 = pu1_src[0] (presumably the bottom-most left neighbour, used to
+ // pad the last rows — TODO confirm against the C reference).
+ mov x10, x0
+ ld1 {v0.8b}, [x0]
+ ldrb w9, [x0], #1
+ sxtw x9, w9
+ // Rotate the left-neighbour bytes and re-insert the first byte so the
+ // 2-tap (v4) and 3-tap (v5) filters see the edge-replicated sequence.
+ ext v1.8b, v0.8b , v0.8b , #1
+ ld1 {v0.b}[7], [x10]
+ ext v2.8b, v1.8b , v1.8b , #1
+ uaddl v20.8h, v0.8b, v1.8b
+ uaddl v22.8h, v1.8b, v2.8b
+ add v24.8h, v20.8h , v22.8h
+ sqrshrun v4.8b, v20.8h, #1
+ sqrshrun v5.8b, v24.8h, #2
+ // NOTE(review): the following mov is dead — v6 is fully overwritten by
+ // the ext on the next line before any read; it can be removed.
+ mov v6.8b, v4.8b
+ ext v6.8b, v5.8b , v4.8b , #1
+ st1 {v4.b}[2], [x1], #1
+ st1 {v6.b}[0], [x1], #1
+ // Transpose steps interleave the 2-tap/3-tap outputs into the
+ // up-diagonal row order required by the horizontal-up mode.
+ trn1 v10.8b, v6.8b, v5.8b
+ trn2 v5.8b, v6.8b, v5.8b //
+ mov v6.8b , v10.8b
+ sub x5, x3, #2
+ trn1 v10.8b, v4.8b, v6.8b
+ trn2 v6.8b, v4.8b, v6.8b //
+ mov v4.8b , v10.8b
+ dup v7.8b, w9 // replicate the padding pixel for the bottom rows
+ st1 {v6.h}[0], [x1], x5
+ st1 {v6.h}[0], [x1], #2
+ st1 {v5.h}[3], [x1], x5
+ st1 {v5.h}[3], [x1], #2
+ st1 {v7.h}[3], [x1], x5
+ st1 {v7.s}[0], [x1], x3
+
+end_func_horz_u:
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/armv8/ih264_intra_pred_luma_8x8_av8.s b/common/armv8/ih264_intra_pred_luma_8x8_av8.s
new file mode 100755
index 0000000..2b972ca
--- /dev/null
+++ b/common/armv8/ih264_intra_pred_luma_8x8_av8.s
@@ -0,0 +1,1084 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+//******************************************************************************
+//* @file
+//* ih264_intra_pred_luma_8x8_av8.s
+//*
+//* @brief
+//* Contains function definitions for intra 8x8 Luma prediction .
+//*
+//* @author
+//* Ittiam
+//*
+//* @par List of Functions:
+//*
+//* -ih264_intra_pred_luma_8x8_mode_vert_av8
+//* -ih264_intra_pred_luma_8x8_mode_horz_av8
+//* -ih264_intra_pred_luma_8x8_mode_dc_av8
+//* -ih264_intra_pred_luma_8x8_mode_diag_dl_av8
+//* -ih264_intra_pred_luma_8x8_mode_diag_dr_av8
+//* -ih264_intra_pred_luma_8x8_mode_vert_r_av8
+//* -ih264_intra_pred_luma_8x8_mode_horz_d_av8
+//* -ih264_intra_pred_luma_8x8_mode_vert_l_av8
+//* -ih264_intra_pred_luma_8x8_mode_horz_u_av8
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+///* All the functions here are replicated from ih264_intra_pred_filters.c
+//
+
+///**
+///**
+///**
+
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+.extern ih264_gai1_intrapred_luma_8x8_horz_u
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_8x8_mode_vert
+//*
+//* @brief
+//* Perform Intra prediction for luma_8x8 mode:vertical
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_8x8 mode:vertical ,described in sec 8.3.2.2.2
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels(Not used in this function)
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//void ih264_intra_pred_luma_8x8_mode_vert(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+
+    .global ih264_intra_pred_luma_8x8_mode_vert_av8
+
+//* Intra 8x8 vertical prediction: the 8 top neighbour samples are copied
+//* unchanged into every one of the 8 destination rows.
+ih264_intra_pred_luma_8x8_mode_vert_av8:
+
+    // STMFD sp!, {x4-x12, x14}        //store register values to stack
+    push_v_regs
+    //stp x19, x20,[sp,#-16]!
+
+    add x0, x0, #9                      // x0+9 -> first top neighbour (left samples at [0..7], top-left at [8] -- layout matches the dc kernel below)
+    ld1 {v0.8b}, [x0]                   // load the 8 top neighbours
+
+    st1 {v0.8b}, [x1], x3               // write the same row 8 times
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+
+    // LDMFD sp!,{x4-x12,PC}         //Restoring registers from stack
+    //ldp x19, x20,[sp],#16
+    pop_v_regs
+    ret
+
+
+
+
+
+///******************************************************************************
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_8x8_mode_horz
+//*
+//* @brief
+//* Perform Intra prediction for luma_8x8 mode:horizontal
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_8x8 mode:horizontal ,described in sec 8.3.2.2.2
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels(Not used in this function)
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+//void ih264_intra_pred_luma_8x8_mode_horz(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+
+    .global ih264_intra_pred_luma_8x8_mode_horz_av8
+
+//* Intra 8x8 horizontal prediction: each left neighbour (read from x0+7
+//* downwards) is broadcast across its whole destination row.
+ih264_intra_pred_luma_8x8_mode_horz_av8:
+
+
+
+    // STMFD sp!, {x4-x12, x14}    //store register values to stack
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+    add x0, x0, #7                      // x0+7 -> top-most left neighbour; walk downwards
+    mov x2 , #-1                        // NOTE(review): x2 is never used afterwards (post-index below uses the immediate); looks like dead code
+
+    ldrb w5, [x0], #-1                  // left neighbour for row 0
+    sxtw x5, w5
+    ldrb w6, [x0], #-1                  // row 1
+    sxtw x6, w6
+    dup v0.8b, w5                       // broadcast sample across the row
+    st1 {v0.8b}, [x1], x3
+    ldrb w7, [x0], #-1                  // row 2
+    sxtw x7, w7
+    dup v1.8b, w6
+    st1 {v1.8b}, [x1], x3
+    dup v2.8b, w7
+    ldrb w8, [x0], #-1                  // row 3
+    sxtw x8, w8
+    dup v3.8b, w8
+    st1 {v2.8b}, [x1], x3
+    ldrb w5, [x0], #-1                  // row 4
+    sxtw x5, w5
+    st1 {v3.8b}, [x1], x3
+    dup v0.8b, w5
+    ldrb w6, [x0], #-1                  // row 5
+    sxtw x6, w6
+    st1 {v0.8b}, [x1], x3
+    ldrb w7, [x0], #-1                  // row 6
+    sxtw x7, w7
+    dup v1.8b, w6
+    dup v2.8b, w7
+    st1 {v1.8b}, [x1], x3
+    ldrb w8, [x0], #-1                  // row 7
+    sxtw x8, w8
+    dup v3.8b, w8
+    st1 {v2.8b}, [x1], x3
+    st1 {v3.8b}, [x1], x3
+
+    // LDMFD sp!,{x4-x12,PC}         //Restoring registers from stack
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+
+
+
+///******************************************************************************
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_8x8_mode_dc
+//*
+//* @brief
+//* Perform Intra prediction for luma_8x8 mode:DC
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_8x8 mode:DC ,described in sec 8.3.2.2.3
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_8x8_mode_dc(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+
+    .global ih264_intra_pred_luma_8x8_mode_dc_av8
+
+//* Intra 8x8 DC prediction. ui_neighboravailability (x4) bit0 = left
+//* neighbours available, bit2 = top neighbours available. The DC value is
+//* (sum_left + sum_top + 8) >> 4, (sum + 4) >> 3 when only one side is
+//* available, or 128 when neither is.
+ih264_intra_pred_luma_8x8_mode_dc_av8:
+
+
+
+    // STMFD sp!, {x4-x12, x14}    //store register values to stack
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    ands x6, x4, #0x01                  // bit0: left availability
+    beq top_available //LEFT NOT AVAILABLE
+
+    add x10, x0, #7                     // x0+7 -> top-most left neighbour; walk downwards
+    mov x2, #-1                         // NOTE(review): x2 appears unused afterwards (post-index uses the immediate)
+    ldrb w5, [x10], -1
+    sxtw x5, w5
+    ldrb w6, [x10], -1
+    sxtw x6, w6
+    ldrb w7, [x10], -1
+    sxtw x7, w7
+    add x5, x5, x6                      // accumulate the 8 left samples in x5
+    ldrb w8, [x10], -1
+    sxtw x8, w8
+    add x5, x5, x7
+    ldrb w6, [x10], -1
+    sxtw x6, w6
+    add x5, x5, x8
+    ldrb w7, [x10], -1
+    sxtw x7, w7
+    add x5, x5, x6
+    ldrb w8, [x10], -1
+    sxtw x8, w8
+    add x5, x5, x7
+    ands x11, x4, #0x04 // CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
+    add x5, x5, x8
+    ldrb w6, [x10], -1
+    sxtw x6, w6
+    add x5, x5, x6                      // x5 = sum of the 8 left neighbours
+    beq left_available
+    add x10, x0, #9                     // x0+9 -> first top neighbour
+    // BOTH LEFT AND TOP AVAILABLE
+    ld1 {v0.8b}, [x10]
+    uaddlp v1.4h, v0.8b                 // pairwise-add the top row down to a single sum
+    uaddlp v3.2s, v1.4h
+    uaddlp v2.1d, v3.2s
+    dup v10.8h, w5                      // left sum in every lane
+    dup v8.8h, v2.4h[0]                 // top sum in every lane
+    add v12.8h, v8.8h , v10.8h
+    sqrshrun v31.8b, v12.8h, #4         // (sum_left + sum_top + 8) >> 4
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    b end_func
+
+top_available: // ONLY TOP AVAILABLE
+    ands x11, x4, #0x04 // CHECKING TOP AVAILABILITY OR ELSE BRANCH TO NONE AVAILABLE
+    beq none_available
+
+    add x10, x0, #9
+    ld1 {v10.8b}, [x10]                 // 8 top neighbours
+    uaddlp v14.4h, v10.8b               // reduce to a single 16-bit sum in lane 0
+    uaddlp v13.2s, v14.4h
+    uaddlp v12.1d, v13.2s
+    rshrn v4.8b, v12.8h, #3             // (sum_top + 4) >> 3
+    dup v31.8b, v4.8b[0]
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    st1 {v31.8b}, [x1], x3
+    b end_func
+
+
+left_available: //ONLY LEFT AVAILABLE
+    add x5, x5, #4                      // (sum_left + 4) >> 3
+    lsr x5, x5, #3
+    dup v0.8b, w5
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    b end_func
+
+none_available: //NONE AVAILABLE
+    mov x9, #128                        // DC default when no neighbours exist
+    dup v0.8b, w9
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+    st1 {v0.8b}, [x1], x3
+
+
+end_func:
+
+    // LDMFD sp!,{x4-x12,PC}         //Restoring registers from stack
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_8x8_mode_diag_dl
+//*
+//* @brief
+//* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left ,described in sec 8.3.2.2.4
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_8x8_mode_diag_dl(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+    .global ih264_intra_pred_luma_8x8_mode_diag_dl_av8
+
+//* Intra 8x8 Diagonal-Down-Left prediction. Filters the 16 top/top-right
+//* neighbours with the 1-2-1 kernel; each successive row is the filtered
+//* vector shifted left by one sample.
+ih264_intra_pred_luma_8x8_mode_diag_dl_av8:
+
+    // STMFD sp!, {x4-x12, x14}    //store register values to stack
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    add x0, x0, #9                      // x0+9 -> first top neighbour
+    sub x5, x3, #4                      // stride remainder after a 4-byte partial store
+    add x6, x0, #15
+    ld1 { v0.16b}, [x0]                 // 16 top / top-right neighbours
+    mov v1.d[0], v0.d[1]
+    ext v4.16b, v0.16b , v0.16b , #2
+    mov v5.d[0], v4.d[1]
+    ext v2.16b, v0.16b , v0.16b , #1
+    mov v3.d[0], v2.d[1]
+    ld1 {v5.b}[6], [x6]                 // replicate the last neighbour past the end
+    // q1 = q0 shifted to left once
+    // q2 = q1 shifted to left once
+    uaddl v20.8h, v0.8b, v2.8b //Adding for FILT121
+    uaddl v22.8h, v1.8b, v3.8b
+    uaddl v24.8h, v2.8b, v4.8b
+    uaddl v26.8h, v3.8b, v5.8b
+    add v24.8h, v20.8h , v24.8h         // a + 2b + c
+    add v26.8h, v22.8h , v26.8h
+
+    sqrshrun v4.8b, v24.8h, #2          // (a+2b+c+2)>>2
+    sqrshrun v5.8b, v26.8h, #2
+    mov v4.d[1], v5.d[0]
+    //Q2 has all FILT121 values
+    st1 {v4.8b}, [x1], x3               // row 0
+    ext v18.16b, v4.16b , v4.16b , #1   // rows 1..3: shift one sample per row
+    ext v16.16b, v18.16b , v18.16b , #1
+    st1 {v18.8b}, [x1], x3
+    ext v14.16b, v16.16b , v16.16b , #1
+    st1 {v16.8b}, [x1], x3
+    st1 {v14.8b}, [x1], x3
+    st1 {v4.s}[1], [x1], #4             // rows 4..7 assembled from two 4-byte halves
+    st1 {v5.s}[0], [x1], x5
+    st1 {v18.s}[1], [x1], #4
+    st1 {v18.s}[2], [x1], x5
+    st1 {v16.s}[1], [x1], #4
+    st1 {v16.s}[2], [x1], x5
+    st1 {v14.s}[1], [x1], #4
+    st1 {v14.s}[2], [x1], x5
+
+
+end_func_diag_dl:
+    // LDMFD sp!,{x4-x12,PC}         //Restoring registers from stack
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_8x8_mode_diag_dr
+//*
+//* @brief
+//* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right ,described in sec 8.3.2.2.5
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_8x8_mode_diag_dr(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+
+    .global ih264_intra_pred_luma_8x8_mode_diag_dr_av8
+
+//* Intra 8x8 Diagonal-Down-Right prediction. 1-2-1 filters the full
+//* left + top-left + top neighbour run; each successive row is the filtered
+//* vector shifted right by one sample.
+ih264_intra_pred_luma_8x8_mode_diag_dr_av8:
+
+    // STMFD sp!, {x4-x12, x14}    //store register values to stack
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+
+    ld1 { v0.16b}, [x0]                 // 16 neighbours starting at the source base
+    mov v1.d[0], v0.d[1]
+    add x0, x0, #1
+    ld1 { v2.16b}, [x0]                 // same run shifted by one sample
+    mov v3.d[0], v2.d[1]
+    ext v4.16b, v2.16b , v2.16b , #1    // shifted by two samples
+    mov v5.d[0], v4.d[1]
+    // q1 = q0 shifted to left once
+    // q2 = q1 shifted to left once
+    uaddl v20.8h, v0.8b, v2.8b //Adding for FILT121
+    uaddl v22.8h, v1.8b, v3.8b
+    uaddl v24.8h, v2.8b, v4.8b
+    uaddl v26.8h, v3.8b, v5.8b
+    add v24.8h, v20.8h , v24.8h         // a + 2b + c
+    add v26.8h, v22.8h , v26.8h
+    sqrshrun v4.8b, v24.8h, #2          // (a+2b+c+2)>>2
+    sqrshrun v5.8b, v26.8h, #2
+    mov v4.d[1], v5.d[0]
+    //Q2 has all FILT121 values
+    sub x5, x3, #4                      // stride remainder after a 4-byte partial store
+    ext v18.16b, v4.16b , v4.16b , #15  // rotate right one sample per row
+    st1 {v18.d}[1], [x1], x3            // row 0
+    ext v16.16b, v18.16b , v18.16b , #15
+    st1 {v16.d}[1], [x1], x3            // row 1
+    ext v14.16b, v16.16b , v16.16b , #15
+    st1 {v14.d}[1], [x1], x3            // row 2
+    st1 {v4.s}[1], [x1], #4             // rows 3..6 assembled from two 4-byte halves
+    st1 {v5.s}[0], [x1], x5
+    st1 {v18.s}[1], [x1], #4
+    st1 {v18.s}[2], [x1], x5
+    st1 {v16.s}[1], [x1], #4
+    st1 {v16.s}[2], [x1], x5
+    st1 {v14.s}[1], [x1], #4
+    st1 {v14.s}[2], [x1], x5
+    st1 {v4.8b}, [x1], x3               // row 7
+
+end_func_diag_dr:
+    // LDMFD sp!,{x4-x12,PC}         //Restoring registers from stack
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_8x8_mode_vert_r
+//*
+//* @brief
+//* Perform Intra prediction for luma_8x8 mode:Vertical_Right
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_8x8 mode:Vertical_Right ,described in sec 8.3.2.2.6
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_8x8_mode_vert_r(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+
+    .global ih264_intra_pred_luma_8x8_mode_vert_r_av8
+
+//* Intra 8x8 Vertical-Right prediction. Computes both the 2-tap FILT11
+//* ((a+b+1)>>1) and 3-tap FILT121 ((a+2b+c+2)>>2) vectors over the
+//* neighbour run, then scatters lane-selected pieces row by row.
+ih264_intra_pred_luma_8x8_mode_vert_r_av8:
+
+    // STMFD sp!, {x4-x12, x14}    //store register values to stack
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    ld1 { v0.16b}, [x0]                 // neighbour run
+    mov v1.d[0], v0.d[1]
+    add x0, x0, #1
+    ld1 { v2.16b}, [x0]                 // shifted by one
+    mov v3.d[0], v2.d[1]
+    ext v4.16b, v2.16b , v2.16b , #1    // shifted by two
+    mov v5.d[0], v4.d[1]
+    // q1 = q0 shifted to left once
+    // q2 = q1 shifted to left once
+    uaddl v20.8h, v0.8b, v2.8b
+    uaddl v22.8h, v1.8b, v3.8b
+    uaddl v24.8h, v2.8b, v4.8b
+    uaddl v26.8h, v3.8b, v5.8b
+    add v24.8h, v20.8h , v24.8h
+    add v26.8h, v22.8h , v26.8h
+
+    sqrshrun v4.8b, v20.8h, #1          // FILT11
+    sqrshrun v5.8b, v22.8h, #1
+    mov v4.d[1], v5.d[0]
+    sqrshrun v6.8b, v24.8h, #2          // FILT121
+    sqrshrun v7.8b, v26.8h, #2
+    mov v6.d[1], v7.d[0]
+    //Q2 has all FILT11 values
+    //Q3 has all FILT121 values
+    sub x5, x3, #6                      // stride remainders after partial stores
+    sub x6, x3, #4
+    st1 {v5.8b}, [x1], x3 // row 0
+    ext v18.16b, v6.16b , v6.16b , #15  // FILT121 rotated right by one
+    mov v22.16b , v18.16b
+    ext v16.16b, v4.16b , v4.16b , #1
+    st1 {v18.d}[1], [x1], x3 //row 1
+    mov v14.16b , v16.16b
+    ext v20.16b, v4.16b , v4.16b , #15
+    uzp1 v17.16b, v16.16b, v18.16b      // de-interleave even/odd samples for the zig-zag rows
+    uzp2 v18.16b, v16.16b, v18.16b
+    mov v16.16b , v17.16b
+    //row 2
+    ext v12.16b, v16.16b , v16.16b , #1
+    st1 {v20.d}[1], [x1]
+    st1 {v6.b}[6], [x1], x3
+    //row 3
+
+    st1 {v12.h}[5], [x1], #2
+    st1 {v6.s}[2], [x1], #4
+    st1 {v6.h}[6], [x1], x5
+    //row 4
+    st1 {v18.h}[5], [x1], #2
+    st1 {v4.s}[2], [x1], #4
+    st1 {v4.h}[6], [x1], x5
+    //row 5
+    ext v26.16b, v18.16b , v18.16b , #1
+    st1 {v16.h}[5], [x1], #2
+    st1 {v22.s}[2], [x1], #4
+    st1 {v22.h}[6], [x1], x5
+    //row 6
+    st1 {v26.h}[4], [x1], #2
+    st1 {v26.b}[10], [x1], #1
+    st1 {v4.b}[8], [x1], #1
+    st1 {v14.s}[2], [x1], x6
+    //row 7
+    st1 {v12.s}[2], [x1], #4
+    st1 {v6.s}[2], [x1], #4
+
+end_func_vert_r:
+    // LDMFD sp!,{x4-x12,PC}         //Restoring registers from stack
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_8x8_mode_horz_d
+//*
+//* @brief
+//* Perform Intra prediction for luma_8x8 mode:Horizontal_Down
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_8x8 mode:Horizontal_Down ,described in sec 8.3.2.2.7
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_8x8_mode_horz_d(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+    .global ih264_intra_pred_luma_8x8_mode_horz_d_av8
+
+//* Intra 8x8 Horizontal-Down prediction. Builds FILT11 and FILT121 vectors,
+//* interleaves them with trn1/trn2 into the zig-zag row layout, then stores
+//* each row from lane-selected fragments.
+ih264_intra_pred_luma_8x8_mode_horz_d_av8:
+
+    // STMFD sp!, {x4-x12, x14}    //store register values to stack
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    ld1 { v0.16b}, [x0]                 // neighbour run
+    mov v1.d[0], v0.d[1]
+    add x0, x0, #1
+    ld1 { v2.16b}, [x0]                 // shifted by one
+    mov v3.d[0], v2.d[1]
+    ext v4.16b, v2.16b , v2.16b , #1    // shifted by two
+    mov v5.d[0], v4.d[1]
+    // q1 = q0 shifted to left once
+    // q2 = q1 shifted to left once
+    uaddl v20.8h, v0.8b, v2.8b
+    uaddl v22.8h, v1.8b, v3.8b
+    uaddl v24.8h, v2.8b, v4.8b
+    uaddl v26.8h, v3.8b, v5.8b
+    add v24.8h, v20.8h , v24.8h
+    add v26.8h, v22.8h , v26.8h
+
+    sqrshrun v4.8b, v20.8h, #1          // FILT11: (a+b+1)>>1
+    sqrshrun v5.8b, v22.8h, #1
+    mov v4.d[1], v5.d[0]
+    sqrshrun v6.8b, v24.8h, #2          // FILT121: (a+2b+c+2)>>2
+    sqrshrun v7.8b, v26.8h, #2
+    mov v6.d[1], v7.d[0]
+    //Q2 has all FILT11 values
+    //Q3 has all FILT121 values
+    mov v8.16b, v4.16b
+    mov v10.16b, v6.16b
+    sub x6, x3, #6                      // stride remainders after partial stores
+    trn1 v9.16b, v8.16b, v10.16b        // byte-interleave FILT11 with FILT121
+    trn2 v10.16b, v8.16b, v10.16b //
+    mov v8.16b, v9.16b
+    mov v12.16b, v8.16b
+    mov v14.16b, v10.16b
+    sub x5, x3, #4
+    trn1 v13.8h, v12.8h, v14.8h         // halfword-interleave for the pair pattern
+    trn2 v14.8h, v12.8h, v14.8h
+    mov v12.16b, v13.16b
+    ext v16.16b, v6.16b , v6.16b , #14
+    //ROW 0
+    st1 {v16.d}[1], [x1]
+    st1 {v10.h}[3], [x1], x3
+
+    //ROW 1
+    st1 {v14.s}[1], [x1], #4
+    st1 {v6.s}[2], [x1], x5
+    //ROW 2
+    st1 {v10.h}[2], [x1], #2
+    st1 {v14.s}[1], [x1], #4
+    st1 {v7.h}[0], [x1], x6
+    //ROW 3
+    st1 {v12.s}[1], [x1], #4
+    st1 {v14.s}[1], [x1], x5
+    //ROW 4
+    st1 {v14.h}[1], [x1], #2
+    st1 {v12.s}[1], [x1], #4
+    st1 {v14.h}[2], [x1], x6
+    //ROW 5
+    st1 {v14.s}[0], [x1], #4
+    st1 {v12.s}[1], [x1], x5
+    //ROW 6
+    st1 {v10.h}[0], [x1], #2
+    st1 {v8.h}[1], [x1], #2
+    st1 {v14.h}[1], [x1], #2
+    st1 {v12.h}[2], [x1], x6
+    //ROW 7
+    st1 {v12.s}[0], [x1], #4
+    st1 {v14.s}[0], [x1], x5
+
+end_func_horz_d:
+    // LDMFD sp!,{x4-x12,PC}         //Restoring registers from stack
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_8x8_mode_vert_l
+//*
+//* @brief
+//* Perform Intra prediction for luma_8x8 mode:Vertical_Left
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_8x8 mode:Vertical_Left ,described in sec 8.3.2.2.8
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_8x8_mode_vert_l(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+
+    .global ih264_intra_pred_luma_8x8_mode_vert_l_av8
+
+//* Intra 8x8 Vertical-Left prediction. Even rows come from the FILT11
+//* vector, odd rows from the FILT121 vector, each pair shifted left by one
+//* additional sample.
+ih264_intra_pred_luma_8x8_mode_vert_l_av8:
+
+    // STMFD sp!, {x4-x12, x14}    //store register values to stack
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+    add x0, x0, #9                      // x0+9 -> first top neighbour
+    ld1 { v0.16b}, [x0]                 // top / top-right run
+    mov v1.d[0], v0.d[1]
+    add x0, x0, #1
+    ld1 { v2.16b}, [x0]                 // shifted by one
+    mov v3.d[0], v2.d[1]
+    ext v4.16b, v2.16b , v2.16b , #1    // shifted by two
+    mov v5.d[0], v4.d[1]
+    uaddl v20.8h, v0.8b, v2.8b
+    uaddl v22.8h, v1.8b, v3.8b
+    uaddl v24.8h, v2.8b, v4.8b
+    uaddl v26.8h, v3.8b, v5.8b
+    add v24.8h, v20.8h , v24.8h
+    add v26.8h, v22.8h , v26.8h
+
+    sqrshrun v4.8b, v20.8h, #1          // FILT11: (a+b+1)>>1
+    sqrshrun v5.8b, v22.8h, #1
+    mov v4.d[1], v5.d[0]
+    sqrshrun v6.8b, v24.8h, #2          // FILT121: (a+2b+c+2)>>2
+    ext v8.16b, v4.16b , v4.16b , #1
+    sqrshrun v7.8b, v26.8h, #2
+    mov v6.d[1], v7.d[0]
+    //Q2 has all FILT11 values
+    //Q3 has all FILT121 values
+
+    ext v10.16b, v6.16b , v6.16b , #1
+    //ROW 0,1
+    st1 {v4.8b}, [x1], x3
+    st1 {v6.8b}, [x1], x3
+
+    ext v12.16b, v8.16b , v8.16b , #1
+    ext v14.16b, v10.16b , v10.16b , #1
+    //ROW 2,3
+    st1 {v8.8b}, [x1], x3
+    st1 {v10.8b}, [x1], x3
+
+    ext v16.16b, v12.16b , v12.16b , #1
+    ext v18.16b, v14.16b , v14.16b , #1
+    //ROW 4,5
+    st1 {v12.8b}, [x1], x3
+    st1 {v14.8b}, [x1], x3
+    //ROW 6,7
+    st1 {v16.8b}, [x1], x3
+    st1 {v18.8b}, [x1], x3
+
+end_func_vert_l:
+    // LDMFD sp!,{x4-x12,PC}         //Restoring registers from stack
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//*ih264_intra_pred_luma_8x8_mode_horz_u
+//*
+//* @brief
+//* Perform Intra prediction for luma_8x8 mode:Horizontal_Up
+//*
+//* @par Description:
+//* Perform Intra prediction for luma_8x8 mode:Horizontal_Up ,described in sec 8.3.2.2.9
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] ui_neighboravailability
+//* availability of neighbouring pixels
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************/
+//void ih264_intra_pred_luma_8x8_mode_horz_u(UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD32 ui_neighboravailability)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => ui_neighboravailability
+
+    .global ih264_intra_pred_luma_8x8_mode_horz_u_av8
+
+//* Intra 8x8 Horizontal-Up prediction. Uses the lookup table
+//* ih264_gai1_intrapred_luma_8x8_horz_u with tbl to interleave the FILT11
+//* and FILT121 vectors; the last left sample (v14) pads the trailing rows.
+ih264_intra_pred_luma_8x8_mode_horz_u_av8:
+
+    // STMFD sp!, {x4-x12, x14}    //store register values to stack
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    ld1 {v0.8b}, [x0]                   // 8 left neighbours
+    ld1 {v1.b}[7], [x0]                 // replicate the first source byte as the run terminator
+    mov v0.d[1], v1.d[0]
+    ext v2.16b, v0.16b , v0.16b , #1    // run shifted by one
+    mov v3.d[0], v2.d[1]
+    ext v4.16b, v2.16b , v2.16b , #1    // run shifted by two
+    mov v5.d[0], v4.d[1]
+
+    adrp x12, :got:ih264_gai1_intrapred_luma_8x8_horz_u
+    ldr x12, [x12, #:got_lo12:ih264_gai1_intrapred_luma_8x8_horz_u]
+    uaddl v20.8h, v0.8b, v2.8b
+    uaddl v22.8h, v1.8b, v3.8b
+    uaddl v24.8h, v2.8b, v4.8b
+    uaddl v26.8h, v3.8b, v5.8b
+    add v24.8h, v20.8h , v24.8h
+    add v26.8h, v22.8h , v26.8h
+    ld1 { v10.16b}, [x12]               // tbl indices for the interleave pattern
+    mov v11.d[0], v10.d[1]
+    sqrshrun v4.8b, v20.8h, #1          // FILT11: (a+b+1)>>1
+    sqrshrun v5.8b, v22.8h, #1
+    mov v4.d[1], v5.d[0]
+    sqrshrun v6.8b, v24.8h, #2          // FILT121: (a+2b+c+2)>>2
+    sqrshrun v7.8b, v26.8h, #2
+    mov v6.d[1], v7.d[0]
+    //Q2 has all FILT11 values
+    //Q3 has all FILT121 values
+    mov v30.16b, v4.16b
+    mov v31.16b, v6.16b
+    tbl v12.8b, {v30.16b, v31.16b}, v10.8b      // gather the interleaved row-0 pattern
+    dup v14.16b, v5.8b[7] //
+    tbl v13.8b, {v30.16b, v31.16b}, v11.8b
+    mov v12.d[1], v13.d[0]
+    ext v16.16b, v12.16b , v14.16b , #2 // shift in padding bytes, two per row
+    ext v18.16b, v16.16b , v14.16b , #2
+    st1 {v12.8b}, [x1], x3 //0
+    ext v20.16b, v18.16b , v14.16b , #2
+    st1 {v16.8b}, [x1], x3 //1
+    st1 {v18.8b}, [x1], x3 //2
+    st1 {v20.8b}, [x1], x3 //3
+    st1 {v13.8b}, [x1], x3 //4
+    st1 {v16.d}[1], [x1], x3 //5
+    st1 {v18.d}[1], [x1], x3 //6
+    st1 {v20.d}[1], [x1], x3 //7
+
+
+end_func_horz_u:
+    // LDMFD sp!,{x4-x12,PC}         //Restoring registers from stack
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
diff --git a/common/armv8/ih264_iquant_itrans_recon_av8.s b/common/armv8/ih264_iquant_itrans_recon_av8.s
new file mode 100755
index 0000000..4c83036
--- /dev/null
+++ b/common/armv8/ih264_iquant_itrans_recon_av8.s
@@ -0,0 +1,778 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+///*******************************************************************************
+// * //file
+// * ih264_iquant_itrans_recon_av8.s
+// *
+// * //brief
+// * Contains function definitions for single stage inverse transform
+// *
+// * //author
+// * Parthiban V
+// * Mohit
+// * Harinarayanaan
+// *
+// * //par List of Functions:
+// * - ih264_iquant_itrans_recon_4x4_av8()
+// * - ih264_iquant_itrans_recon_8x8_av8()
+// * - ih264_iquant_itrans_recon_chroma_4x4_av8()
+// *
+// * //remarks
+// * None
+// *
+// *******************************************************************************
+
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+///*
+// *******************************************************************************
+// *
+// * //brief
+// * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
+// *
+// * //par Description:
+// * Performs inverse transform Ci4 and adds the residue to get the
+// * reconstructed block
+// *
+// * //param[in] pi2_src
+// * Input 4x4 coefficients
+// *
+// * //param[in] pu1_pred
+// * Prediction 4x4 block
+// *
+// * //param[out] pu1_out
+// * Output 4x4 block
+// *
+// * //param[in] u4_qp_div_6
+// * QP
+// *
+// * //param[in] pu2_weigh_mat
+// * Pointer to weight matrix
+// *
+// * //param[in] pred_strd,
+// * Prediction stride
+// *
+// * //param[in] out_strd
+// * Output Stride
+// *
+// *//param[in] pi2_tmp
+// * temporary buffer of size 1*16
+// *
+// * //param[in] pu2_iscal_mat
+// * Pointer to the inverse quantization matrix
+// *
+// * //returns Void
+// *
+// * //remarks
+// * None
+// *
+// *******************************************************************************
+// */
+//void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src,
+// UWORD8 *pu1_pred,
+// UWORD8 *pu1_out,
+// WORD32 pred_strd,
+// WORD32 out_strd,
+// const UWORD16 *pu2_iscal_mat,
+// const UWORD16 *pu2_weigh_mat,
+// UWORD32 u4_qp_div_6,
+// WORD32 *pi4_tmp,
+// WORD32 iq_start_idx
+// WORD16 *pi2_dc_ld_addr)
+//**************Variables Vs Registers*****************************************
+//x0 => *pi2_src
+//x1 => *pu1_pred
+//x2 => *pu1_out
+//x3 => pred_strd
+//x4 => out_strd
+//x5 => *pu2_iscal_mat
+//x6 => *pu2_weigh_mat
+//x7 => u4_qp_div_6
+// => pi4_tmp
+// => iq_start_idx
+// => pi2_dc_ld_addr
+//Only one shift is done in horizontal inverse because,
+//if u4_qp_div_6 is less than 4 then the shift value will be negative, giving a negative left shift; in this case rnd_factor has a value
+//if u4_qp_div_6 is greater than 4 then the shift value will be positive, giving a left shift; here rnd_factor is 0
+
+    .global ih264_iquant_itrans_recon_4x4_av8
+ih264_iquant_itrans_recon_4x4_av8:
+
+    // 4x4 inverse quant + inverse transform + reconstruction.
+    // Stage 1 operates on the dequantized rows, stage 2 on the transposed
+    // result; the rounded residue is then added to pu1_pred and clipped.
+
+    push_v_regs
+
+    dup v30.4s, w7    //Populate the u4_qp_div_6 in Q15
+
+    ldr w8, [sp, #72]    //Loads iq_start_idx
+    sxtw x8, w8
+
+    ldr x10, [sp, #80]    //Load alternate dc address
+
+    subs x8, x8, #1    // if x8 == 1 => intra case , so result of subtraction is zero and z flag is set
+
+
+//=======================DEQUANT FROM HERE===================================
+
+    ld4 {v20.4h - v23.4h}, [x5]    // load pu2_iscal_mat[i], i =0..15
+    ld4 {v26.4h - v29.4h}, [x6]    // pu2_weigh_mat[i], i =0..15
+    ld4 {v16.4h - v19.4h}, [x0]    // pi2_src_tmp[i], i =0..15
+
+
+    mul v20.4h, v20.4h, v26.4h    // x[i]=(scale[i] * dequant[i]) where i = 0..3
+    mul v21.4h, v21.4h, v27.4h    // x[i]=(scale[i] * dequant[i]) where i = 4..7
+    mul v22.4h, v22.4h, v28.4h    // x[i]=(scale[i] * dequant[i]) where i = 8..11
+    mul v23.4h, v23.4h, v29.4h    // x[i]=(scale[i] * dequant[i]) where i = 12..15
+
+    smull v0.4s, v16.4h, v20.4h    // q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
+    smull v2.4s, v17.4h, v21.4h    // q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
+    smull v4.4s, v18.4h, v22.4h    // q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
+    smull v6.4s, v19.4h, v23.4h    // q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
+
+    sshl v0.4s, v0.4s, v30.4s    // q0 = q[i] = (p[i] << (qp/6)) where i = 0..3
+    sshl v2.4s, v2.4s, v30.4s    // q1 = q[i] = (p[i] << (qp/6)) where i = 4..7
+    sshl v4.4s, v4.4s, v30.4s    // q2 = q[i] = (p[i] << (qp/6)) where i = 8..11
+    sshl v6.4s, v6.4s, v30.4s    // q3 = q[i] = (p[i] << (qp/6)) where i = 12..15
+
+    sqrshrn v0.4h, v0.4s, #0x4    // d0 = c[i] = ((q[i] + 8) >> 4) where i = 0..3
+    sqrshrn v1.4h, v2.4s, #0x4    // d1 = c[i] = ((q[i] + 8) >> 4) where i = 4..7
+    sqrshrn v2.4h, v4.4s, #0x4    // d2 = c[i] = ((q[i] + 8) >> 4) where i = 8..11
+    sqrshrn v3.4h, v6.4s, #0x4    // d3 = c[i] = ((q[i] + 8) >> 4) where i = 12..15
+
+    bne skip_loading_luma_dc_src
+    ld1 {v0.h}[0], [x10]    // loads signed halfword pi2_dc_ld_addr[0], if x8==1
+skip_loading_luma_dc_src:
+
+    //========= PROCESS IDCT FROM HERE =======
+    //Steps for Stage 1:
+    //------------------
+    ld1 {v30.s}[0], [x1], x3    // i row load pu1_pred buffer
+
+    sshr v8.4h, v1.4h, #1    // d1>>1
+    sshr v9.4h, v3.4h, #1    // d3>>1
+
+    add v4.4h, v0.4h, v2.4h    // x0 = d0 + d2//
+    sub v5.4h, v0.4h, v2.4h    // x1 = d0 - d2//
+    sub v6.4h, v8.4h, v3.4h    // x2 = (d1 >> 1) - d3//
+    add v7.4h, v1.4h, v9.4h    // x3 = d1 + (d3 >> 1)//
+
+    ld1 {v30.s}[1], [x1], x3    // ii row load pu1_pred buffer
+
+    add v10.4h, v4.4h , v7.4h    // x0+x3
+    add v11.4h, v5.4h , v6.4h    // x1+x2
+    sub v12.4h, v5.4h , v6.4h    // x1-x2
+    sub v13.4h, v4.4h , v7.4h    // x0-x3
+
+    ld1 {v31.s}[0], [x1], x3    // iii row load pu1_pred buf
+
+
+    //Steps for Stage 2:
+    //transpose
+    trn1 v4.4h, v10.4h, v11.4h
+    trn2 v5.4h, v10.4h, v11.4h
+    trn1 v6.4h, v12.4h, v13.4h
+    trn2 v7.4h, v12.4h, v13.4h
+
+    trn1 v10.2s, v4.2s, v6.2s    // 0
+    trn1 v11.2s, v5.2s, v7.2s    // 8
+    trn2 v12.2s, v4.2s, v6.2s    // 4
+    trn2 v13.2s, v5.2s, v7.2s
+    //end transpose
+
+    sshr v18.4h, v11.4h, #1    // q1>>1
+    sshr v19.4h, v13.4h, #1    // q3>>1
+
+    add v14.4h, v10.4h, v12.4h    // x0 = q0 + q2//
+    sub v15.4h, v10.4h, v12.4h    // x1 = q0 - q2//
+    sub v16.4h, v18.4h, v13.4h    // x2 = (q1 >> 1) - q3//
+    add v17.4h, v11.4h, v19.4h    // x3 = q1 + (q3 >> 1)//
+
+
+    ld1 {v31.s}[1], [x1], x3    // iv row load pu1_pred buffer
+
+    add v20.4h, v14.4h, v17.4h    // x0 + x3
+    add v21.4h, v15.4h, v16.4h    // x1 + x2
+    sub v22.4h, v15.4h, v16.4h    // x1 - x2
+    sub v23.4h, v14.4h, v17.4h    // x0 - x3
+
+    mov v20.d[1], v21.d[0]    // pack rows i/ii and iii/iv for 8-lane ops
+    mov v22.d[1], v23.d[0]
+
+    srshr v20.8h, v20.8h, #6    // (residue + 32) >> 6
+    srshr v22.8h, v22.8h, #6
+
+    uaddw v20.8h, v20.8h , v30.8b    // add prediction
+    uaddw v22.8h, v22.8h , v31.8b
+
+    sqxtun v0.8b, v20.8h    // saturate to [0, 255]
+    sqxtun v1.8b, v22.8h
+
+    st1 {v0.s}[0], [x2], x4    //i row store the value
+    st1 {v0.s}[1], [x2], x4    //ii row store the value
+    st1 {v1.s}[0], [x2], x4    //iii row store the value
+    st1 {v1.s}[1], [x2]    //iv row store the value
+
+    pop_v_regs
+    ret
+
+
+
+///**
+// *******************************************************************************
+// *
+// * @brief
+// * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
+// *
+// * @par Description:
+// * Performs inverse transform Ci4 and adds the residue to get the
+// * reconstructed block
+// *
+// * @param[in] pi2_src
+// * Input 4x4 coefficients
+// *
+// * @param[in] pu1_pred
+// * Prediction 4x4 block
+// *
+// * @param[out] pu1_out
+// * Output 4x4 block
+// *
+// * @param[in] u4_qp_div_6
+// * QP
+// *
+// * @param[in] pu2_weigh_mat
+// * Pointer to weight matrix
+// *
+// * @param[in] pred_strd,
+// * Prediction stride
+// *
+// * @param[in] out_strd
+// * Output Stride
+// *
+// *@param[in] pi2_tmp
+// * temporary buffer of size 1*16
+// *
+// * @param[in] pu2_iscal_mat
+// * Pointer to the inverse quantization matrix
+// *
+// * @returns Void
+// *
+// * @remarks
+// * None
+// *
+// *******************************************************************************
+// */
+//void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src,
+// UWORD8 *pu1_pred,
+// UWORD8 *pu1_out,
+// WORD32 pred_strd,
+// WORD32 out_strd,
+// const UWORD16 *pu2_iscal_mat,
+// const UWORD16 *pu2_weigh_mat,
+// UWORD32 u4_qp_div_6,
+// WORD32 *pi4_tmp
+// WORD16 *pi2_dc_src)
+//**************Variables Vs Registers*****************************************
+//x0 => *pi2_src
+//x1 => *pu1_pred
+//x2 => *pu1_out
+//x3 => pred_strd
+//x4 => out_strd
+//x5 => *pu2_iscal_mat
+//x6 => *pu2_weigh_mat
+//x7 => u4_qp_div_6
+//sp => pi4_tmp
+//sp#8 => *pi2_dc_src
+
+    .global ih264_iquant_itrans_recon_chroma_4x4_av8
+ih264_iquant_itrans_recon_chroma_4x4_av8:
+
+// Chroma 4x4 inverse quant + inverse transform + reconstruction. The DC
+// coefficient always comes from pi2_dc_src; prediction and output are in
+// interleaved (UV) format, so one plane is extracted, processed, and merged
+// back with a byte mask.
+//
+//VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4
+//If the macro value changes need to change the instruction according to it.
+//Only one shift is done in horizontal inverse because,
+//if u4_qp_div_6 is less than 4 then the shift value will be negative, giving a negative left shift; in this case rnd_factor has a value
+//if u4_qp_div_6 is greater than 4 then the shift value will be positive, giving a left shift; here rnd_factor is 0
+
+//at the end of the function, we could have moved 64 bits into the higher 64 bits of a register and done further processing
+//but it seemed to only reduce the number of instructions by 1. [Since A15 we saw add and sub to be very high throughput
+//all instructions were taken as equal]
+
+    //reduce sp by 64
+    push_v_regs
+
+    dup v30.4s, w7    //Populate the u4_qp_div_6 in Q15
+
+    //was at sp + 8, hence now at sp+64+8 = sp+72
+    ldr x10, [sp, #72]    //Load alternate dc address
+
+//=======================DEQUANT FROM HERE===================================
+
+    ld4 {v20.4h - v23.4h}, [x5]    // load pu2_iscal_mat[i], i =0..15
+    ld4 {v26.4h - v29.4h}, [x6]    // pu2_weigh_mat[i], i =0..15
+    ld4 {v16.4h - v19.4h}, [x0]    // pi2_src_tmp[i], i =0..15
+
+
+    mul v20.4h, v20.4h, v26.4h    // x[i]=(scale[i] * dequant[i]) where i = 0..3
+    mul v21.4h, v21.4h, v27.4h    // x[i]=(scale[i] * dequant[i]) where i = 4..7
+    mul v22.4h, v22.4h, v28.4h    // x[i]=(scale[i] * dequant[i]) where i = 8..11
+    mul v23.4h, v23.4h, v29.4h    // x[i]=(scale[i] * dequant[i]) where i = 12..15
+
+    smull v0.4s, v16.4h, v20.4h    // q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
+    smull v2.4s, v17.4h, v21.4h    // q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
+    smull v4.4s, v18.4h, v22.4h    // q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
+    smull v6.4s, v19.4h, v23.4h    // q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
+
+    sshl v0.4s, v0.4s, v30.4s    // q0 = q[i] = (p[i] << (qp/6)) where i = 0..3
+    sshl v2.4s, v2.4s, v30.4s    // q1 = q[i] = (p[i] << (qp/6)) where i = 4..7
+    sshl v4.4s, v4.4s, v30.4s    // q2 = q[i] = (p[i] << (qp/6)) where i = 8..11
+    sshl v6.4s, v6.4s, v30.4s    // q3 = q[i] = (p[i] << (qp/6)) where i = 12..15
+
+    sqrshrn v0.4h, v0.4s, #0x4    // d0 = c[i] = ((q[i] + 8) >> 4) where i = 0..3
+    sqrshrn v1.4h, v2.4s, #0x4    // d1 = c[i] = ((q[i] + 8) >> 4) where i = 4..7
+    sqrshrn v2.4h, v4.4s, #0x4    // d2 = c[i] = ((q[i] + 8) >> 4) where i = 8..11
+    sqrshrn v3.4h, v6.4s, #0x4    // d3 = c[i] = ((q[i] + 8) >> 4) where i = 12..15
+
+    ld1 {v0.h}[0], [x10]    // loads signed halfword pi2_dc_src[0]
+
+    //========= PROCESS IDCT FROM HERE =======
+    //Steps for Stage 1:
+    //------------------
+
+    sshr v8.4h, v1.4h, #1    // d1>>1
+    sshr v9.4h, v3.4h, #1    // d3>>1
+
+    add v4.4h, v0.4h, v2.4h    // x0 = d0 + d2//
+    sub v5.4h, v0.4h, v2.4h    // x1 = d0 - d2//
+    sub v6.4h, v8.4h, v3.4h    // x2 = (d1 >> 1) - d3//
+    add v7.4h, v1.4h, v9.4h    // x3 = d1 + (d3 >> 1)//
+
+
+    add v10.4h, v4.4h , v7.4h    // x0+x3
+    add v11.4h, v5.4h , v6.4h    // x1+x2
+    sub v12.4h, v5.4h , v6.4h    // x1-x2
+    sub v13.4h, v4.4h , v7.4h    // x0-x3
+
+    ld1 {v26.8b}, [x1], x3    // i row load pu1_pred buffer
+    ld1 {v27.8b}, [x1], x3    // ii row load pu1_pred buffer
+    ld1 {v28.8b}, [x1], x3    // iii row load pu1_pred buf
+    ld1 {v29.8b}, [x1], x3    // iv row load pu1_pred buffer
+
+    //Steps for Stage 2:
+    //transpose
+    trn1 v4.4h, v10.4h, v11.4h
+    trn2 v5.4h, v10.4h, v11.4h
+    trn1 v6.4h, v12.4h, v13.4h
+    trn2 v7.4h, v12.4h, v13.4h
+
+    trn1 v10.2s, v4.2s, v6.2s    // 0
+    trn1 v11.2s, v5.2s, v7.2s    // 8
+    trn2 v12.2s, v4.2s, v6.2s    // 4
+    trn2 v13.2s, v5.2s, v7.2s
+    //end transpose
+
+    sshr v18.4h, v11.4h, #1    // q1>>1
+    sshr v19.4h, v13.4h, #1    // q3>>1
+
+    add v14.4h, v10.4h, v12.4h    // x0 = q0 + q2//
+    sub v15.4h, v10.4h, v12.4h    // x1 = q0 - q2//
+    sub v16.4h, v18.4h, v13.4h    // x2 = (q1 >> 1) - q3//
+    add v17.4h, v11.4h, v19.4h    // x3 = q1 + (q3 >> 1)//
+
+    //Backup the output addr
+    mov x0, x2
+
+    //load output buffer for interleaving
+    ld1 {v10.8b}, [x2], x4
+    ld1 {v11.8b}, [x2], x4
+    ld1 {v12.8b}, [x2], x4
+    ld1 {v13.8b}, [x2]
+
+    add v20.4h, v14.4h, v17.4h    // x0 + x3
+    add v21.4h, v15.4h, v16.4h    // x1 + x2
+    sub v22.4h, v15.4h, v16.4h    // x1 - x2
+    sub v23.4h, v14.4h, v17.4h    // x0 - x3
+
+    srshr v20.4h, v20.4h, #6    // (residue + 32) >> 6
+    srshr v21.4h, v21.4h, #6
+    srshr v22.4h, v22.4h, #6
+    srshr v23.4h, v23.4h, #6
+
+    //nop v30.8b    //dummy for deinterleaving
+    movi v31.4h, #0x00ff    //mask for interleaving [copy lower 8 bits]
+
+    //Extract u/v plane from interleaved data; v30's contents are don't-care
+    uzp1 v26.8b, v26.8b, v30.8b
+    uzp1 v27.8b, v27.8b, v30.8b
+    uzp1 v28.8b, v28.8b, v30.8b
+    uzp1 v29.8b, v29.8b, v30.8b
+
+    uaddw v20.8h, v20.8h, v26.8b    // add prediction
+    uaddw v21.8h, v21.8h, v27.8b
+    uaddw v22.8h, v22.8h, v28.8b
+    uaddw v23.8h, v23.8h, v29.8b
+
+    sqxtun v0.8b, v20.8h    // saturate to [0, 255]
+    sqxtun v1.8b, v21.8h
+    sqxtun v2.8b, v22.8h
+    sqxtun v3.8b, v23.8h
+
+    //widen the output so that we have 0 at msb and value at lsb
+    uxtl v6.8h, v0.8b
+    uxtl v7.8h, v1.8b
+    uxtl v8.8h, v2.8b
+    uxtl v9.8h, v3.8b
+
+    //select lsbs from processed data and msbs from pu1_out loaded data
+    bit v10.8b, v6.8b, v31.8b
+    bit v11.8b, v7.8b, v31.8b
+    bit v12.8b, v8.8b, v31.8b
+    bit v13.8b, v9.8b, v31.8b
+
+    //store the interleaved result
+    st1 {v10.8b}, [x0], x4
+    st1 {v11.8b}, [x0], x4
+    st1 {v12.8b}, [x0], x4
+    st1 {v13.8b}, [x0]
+
+    pop_v_regs
+    ret
+
+
+///*
+// *******************************************************************************
+// *
+// * //brief
+// * This function performs inverse quant and Inverse transform type Ci4 for 8*8 block
+// *
+// * //par Description:
+// * Performs inverse transform Ci8 and adds the residue to get the
+// * reconstructed block
+// *
+// * //param[in] pi2_src
+// * Input 4x4 coefficients
+// *
+// * //param[in] pu1_pred
+// * Prediction 4x4 block
+// *
+// * //param[out] pu1_out
+// * Output 4x4 block
+// *
+// * //param[in] u4_qp_div_6
+// * QP
+// *
+// * //param[in] pu2_weigh_mat
+// * Pointer to weight matrix
+// *
+// * //param[in] pred_strd,
+// * Prediction stride
+// *
+// * //param[in] out_strd
+// * Output Stride
+// *
+// *//param[in] pi2_tmp
+// * temporary buffer of size 1*64
+// *
+// * //param[in] pu2_iscal_mat
+// * Pointer to the inverse quantization matrix
+// *
+// * //returns Void
+// *
+// * //remarks
+// * None
+// *
+// *******************************************************************************
+// */
+//void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src,
+// UWORD8 *pu1_pred,
+// UWORD8 *pu1_out,
+// WORD32 pred_strd,
+// WORD32 out_strd,
+// const UWORD16 *pu2_iscal_mat,
+// const UWORD16 *pu2_weigh_mat,
+// UWORD32 u4_qp_div_6,
+// WORD32 *pi4_tmp,
+// WORD32 iq_start_idx
+// WORD16 *pi2_dc_ld_addr)
+//**************Variables Vs Registers*****************************************
+//x0 => *pi2_src
+//x1 => *pu1_pred
+//x2 => *pu1_out
+//x3 => pred_strd
+//x4 => out_strd
+//x5 => *pu2_iscal_mat
+//x6 => *pu2_weigh_mat
+//x7 => u4_qp_div_6
+//NOT USED => pi4_tmp
+//NOT USED => iq_start_idx
+//NOT USED => pi2_dc_ld_addr
+
+    .global ih264_iquant_itrans_recon_8x8_av8
+ih264_iquant_itrans_recon_8x8_av8:
+
+    // 8x8 inverse quant + inverse transform + reconstruction.
+    // The 1-D transform loop below runs twice (x8 = 2), once per direction,
+    // with a full 8x8 transpose at the top of each pass.
+
+    push_v_regs
+
+    ld1 {v8.8h -v11.8h}, [x5], #64    // pu2_iscal_mat rows 0..3
+    ld1 {v12.8h-v15.8h}, [x5]    // pu2_iscal_mat rows 4..7
+
+    ld1 {v16.8h -v19.8h}, [x6], #64    // pu2_weigh_mat rows 0..3
+    ld1 {v20.8h -v23.8h}, [x6]    // pu2_weigh_mat rows 4..7
+
+    mov x8, #16    // source row stride in bytes
+    ld1 {v0.8h}, [x0], x8
+    ld1 {v1.8h}, [x0], x8
+    ld1 {v2.8h}, [x0], x8
+    ld1 {v3.8h}, [x0], x8
+    ld1 {v4.8h}, [x0], x8
+    ld1 {v5.8h}, [x0], x8
+    ld1 {v6.8h}, [x0], x8
+    ld1 {v7.8h}, [x0]
+
+    mul v8.8h, v8.8h, v16.8h    // scale[i] * weight[i]
+    mul v9.8h, v9.8h, v17.8h
+    mul v10.8h, v10.8h, v18.8h
+    mul v11.8h, v11.8h, v19.8h
+    mul v12.8h, v12.8h, v20.8h
+    mul v13.8h, v13.8h, v21.8h
+    mul v14.8h, v14.8h, v22.8h
+    mul v15.8h, v15.8h, v23.8h
+
+    smull v16.4s, v0.4h, v8.4h    // src[i] * scale[i] * weight[i], widened to 32 bit
+    smull2 v17.4s, v0.8h, v8.8h
+    smull v18.4s, v1.4h, v9.4h
+    smull2 v19.4s, v1.8h, v9.8h
+    smull v20.4s, v2.4h, v10.4h
+    smull2 v21.4s, v2.8h, v10.8h
+    smull v22.4s, v3.4h, v11.4h
+    smull2 v23.4s, v3.8h, v11.8h
+    smull v24.4s, v4.4h, v12.4h
+    smull2 v25.4s, v4.8h, v12.8h
+    smull v26.4s, v5.4h, v13.4h
+    smull2 v27.4s, v5.8h, v13.8h
+    smull v28.4s, v6.4h, v14.4h
+    smull2 v29.4s, v6.8h, v14.8h
+    smull v30.4s, v7.4h, v15.4h
+    smull2 v31.4s, v7.8h, v15.8h
+
+    dup v0.4s, w7    // u4_qp_div_6 as the variable shift amount
+
+    sshl v16.4s, v16.4s, v0.4s    // << (qp / 6)
+    sshl v17.4s, v17.4s, v0.4s
+    sshl v18.4s, v18.4s, v0.4s
+    sshl v19.4s, v19.4s, v0.4s
+    sshl v20.4s, v20.4s, v0.4s
+    sshl v21.4s, v21.4s, v0.4s
+    sshl v22.4s, v22.4s, v0.4s
+    sshl v23.4s, v23.4s, v0.4s
+    sshl v24.4s, v24.4s, v0.4s
+    sshl v25.4s, v25.4s, v0.4s
+    sshl v26.4s, v26.4s, v0.4s
+    sshl v27.4s, v27.4s, v0.4s
+    sshl v28.4s, v28.4s, v0.4s
+    sshl v29.4s, v29.4s, v0.4s
+    sshl v30.4s, v30.4s, v0.4s
+    sshl v31.4s, v31.4s, v0.4s
+
+    sqrshrn v0.4h, v16.4s, #6    // narrow with rounding: (q + 32) >> 6
+    sqrshrn2 v0.8h, v17.4s, #6
+    sqrshrn v1.4h, v18.4s, #6
+    sqrshrn2 v1.8h, v19.4s, #6
+    sqrshrn v2.4h, v20.4s, #6
+    sqrshrn2 v2.8h, v21.4s, #6
+    sqrshrn v3.4h, v22.4s, #6
+    sqrshrn2 v3.8h, v23.4s, #6
+    sqrshrn v4.4h, v24.4s, #6
+    sqrshrn2 v4.8h, v25.4s, #6
+    sqrshrn v5.4h, v26.4s, #6
+    sqrshrn2 v5.8h, v27.4s, #6
+    sqrshrn v6.4h, v28.4s, #6
+    sqrshrn2 v6.8h, v29.4s, #6
+    sqrshrn v7.4h, v30.4s, #6
+    sqrshrn2 v7.8h, v31.4s, #6
+
+    //loop counter
+    mov x8, #2
+//1x8 transform
+trans_1x8_1d:
+
+    //transpose 8x8
+    trn1 v8.8h, v0.8h, v1.8h
+    trn2 v9.8h, v0.8h, v1.8h
+    trn1 v10.8h, v2.8h, v3.8h
+    trn2 v11.8h, v2.8h, v3.8h
+    trn1 v12.8h, v4.8h, v5.8h
+    trn2 v13.8h, v4.8h, v5.8h
+    trn1 v14.8h, v6.8h, v7.8h
+    trn2 v15.8h, v6.8h, v7.8h
+
+    trn1 v0.4s, v8.4s, v10.4s
+    trn2 v2.4s, v8.4s, v10.4s
+    trn1 v1.4s, v9.4s, v11.4s
+    trn2 v3.4s, v9.4s, v11.4s
+    trn1 v4.4s, v12.4s, v14.4s
+    trn2 v6.4s, v12.4s, v14.4s
+    trn1 v5.4s, v13.4s, v15.4s
+    trn2 v7.4s, v13.4s, v15.4s
+
+    trn1 v8.2d, v0.2d, v4.2d    //0
+    trn2 v12.2d, v0.2d, v4.2d    //4
+    trn1 v9.2d, v1.2d, v5.2d    //1
+    trn2 v13.2d, v1.2d, v5.2d    //5
+    trn1 v10.2d, v2.2d, v6.2d    //2
+    trn2 v14.2d, v2.2d, v6.2d    //6
+    trn1 v11.2d, v3.2d, v7.2d    //3
+    trn2 v15.2d, v3.2d, v7.2d    //7
+
+    // half-sample terms of rows 1, 2, 3, 5, 6, 7
+    sshr v16.8h, v9.8h, #1    //(pi2_tmp_ptr[1] >> 1)
+    sshr v17.8h, v10.8h, #1    //(pi2_tmp_ptr[2] >> 1)
+    sshr v18.8h, v11.8h, #1    //(pi2_tmp_ptr[3] >> 1)
+    sshr v19.8h, v13.8h, #1    //(pi2_tmp_ptr[5] >> 1)
+    sshr v20.8h, v14.8h, #1    //(pi2_tmp_ptr[6] >> 1)
+    sshr v21.8h, v15.8h, #1    //(pi2_tmp_ptr[7] >> 1)
+
+    add v0.8h, v8.8h, v12.8h    // i_y0 = (pi2_tmp_ptr[0] + pi2_tmp_ptr[4] );
+    sub v2.8h, v8.8h, v12.8h    // i_y2 = (pi2_tmp_ptr[0] - pi2_tmp_ptr[4] );
+
+    sub v4.8h, v17.8h, v14.8h    //i_y4 = ((pi2_tmp_ptr[2] >> 1) - pi2_tmp_ptr[6] );
+    add v6.8h, v10.8h, v20.8h    //i_y6 = (pi2_tmp_ptr[2] + (pi2_tmp_ptr[6] >> 1));
+
+    //-w3 + w5
+    ssubl v22.4s, v13.4h, v11.4h
+    ssubl2 v23.4s, v13.8h, v11.8h
+    //w3 + w5
+    saddl v24.4s, v13.4h, v11.4h
+    saddl2 v25.4s, v13.8h, v11.8h
+    //-w1 + w7
+    ssubl v26.4s, v15.4h, v9.4h
+    ssubl2 v27.4s, v15.8h, v9.8h
+    //w1 + w7
+    saddl v28.4s, v15.4h, v9.4h
+    saddl2 v29.4s, v15.8h, v9.8h
+
+    //-w3 + w5 - w7
+    ssubw v22.4s, v22.4s, v15.4h
+    ssubw2 v23.4s, v23.4s, v15.8h
+    //w3 + w5 + w1
+    saddw v24.4s, v24.4s, v9.4h
+    saddw2 v25.4s, v25.4s, v9.8h
+    //-w1 + w7 + w5
+    saddw v26.4s, v26.4s, v13.4h
+    saddw2 v27.4s, v27.4s, v13.8h
+    //w1 + w7 - w3
+    ssubw v28.4s, v28.4s, v11.4h
+    ssubw2 v29.4s, v29.4s, v11.8h
+
+    //-w3 + w5 - w7 - (w7 >> 1)
+    ssubw v22.4s, v22.4s, v21.4h
+    ssubw2 v23.4s, v23.4s, v21.8h
+    //w3 + w5 + w1 + (w1 >> 1)
+    saddw v24.4s, v24.4s, v16.4h
+    saddw2 v25.4s, v25.4s, v16.8h
+    //-w1 + w7 + w5 + (w5 >> 1)
+    saddw v26.4s, v26.4s, v19.4h
+    saddw2 v27.4s, v27.4s, v19.8h
+    //w1 + w7 - w3 - (w3 >> 1)
+    ssubw v28.4s, v28.4s, v18.4h
+    ssubw2 v29.4s, v29.4s, v18.8h
+
+    // narrow the odd terms back to 16 bit
+    xtn v1.4h, v22.4s
+    xtn2 v1.8h, v23.4s
+    xtn v3.4h, v28.4s
+    xtn2 v3.8h, v29.4s
+    xtn v5.4h, v26.4s
+    xtn2 v5.8h, v27.4s
+    xtn v7.4h, v24.4s
+    xtn2 v7.8h, v25.4s
+
+    sshr v16.8h, v1.8h, #2    //(y1 >> 2)
+    sshr v17.8h, v3.8h, #2    //(y3 >> 2)
+    sshr v18.8h, v5.8h, #2    //(y5 >> 2)
+    sshr v19.8h, v7.8h, #2    //(y7 >> 2)
+
+    add v8.8h, v0.8h, v6.8h    // i_z0 = i_y0 + i_y6
+    add v9.8h, v1.8h, v19.8h    // i_z1 = i_y1 + (i_y7 >> 2)
+    add v10.8h, v2.8h, v4.8h    // i_z2 = i_y2 + i_y4
+    add v11.8h, v3.8h, v18.8h    // i_z3 = i_y3 + (i_y5 >> 2)
+    sub v12.8h, v2.8h, v4.8h    // i_z4 = i_y2 - i_y4
+    sub v13.8h, v17.8h, v5.8h    // i_z5 = (i_y3 >> 2) - i_y5
+    sub v14.8h, v0.8h, v6.8h    // i_z6 = i_y0 - i_y6
+    sub v15.8h, v7.8h, v16.8h    // i_z7 = i_y7 - (i_y1 >> 2)
+
+    add v0.8h, v8.8h, v15.8h    // i_z0 + i_z7
+    add v1.8h, v10.8h, v13.8h    // i_z2 + i_z5
+    add v2.8h, v12.8h, v11.8h    // i_z4 + i_z3
+    add v3.8h, v14.8h, v9.8h    // i_z6 + i_z1
+    sub v4.8h, v14.8h, v9.8h    // i_z6 - i_z1
+    sub v5.8h, v12.8h, v11.8h    // i_z4 - i_z3
+    sub v6.8h, v10.8h, v13.8h    // i_z2 - i_z5
+    sub v7.8h, v8.8h, v15.8h    // i_z0 - i_z7
+
+    subs x8, x8, #1    // second pass operates on the transposed first-pass output
+    bne trans_1x8_1d
+
+    ld1 {v22.8b}, [x1], x3    // load 8 rows of pu1_pred
+    ld1 {v23.8b}, [x1], x3
+    ld1 {v24.8b}, [x1], x3
+    ld1 {v25.8b}, [x1], x3
+    ld1 {v26.8b}, [x1], x3
+    ld1 {v27.8b}, [x1], x3
+    ld1 {v28.8b}, [x1], x3
+    ld1 {v29.8b}, [x1]
+
+    srshr v0.8h, v0.8h, #6    // (residue + 32) >> 6
+    srshr v1.8h, v1.8h, #6
+    srshr v2.8h, v2.8h, #6
+    srshr v3.8h, v3.8h, #6
+    srshr v4.8h, v4.8h, #6
+    srshr v5.8h, v5.8h, #6
+    srshr v6.8h, v6.8h, #6
+    srshr v7.8h, v7.8h, #6
+
+    uaddw v0.8h, v0.8h, v22.8b    // add prediction
+    uaddw v1.8h, v1.8h, v23.8b
+    uaddw v2.8h, v2.8h, v24.8b
+    uaddw v3.8h, v3.8h, v25.8b
+    uaddw v4.8h, v4.8h, v26.8b
+    uaddw v5.8h, v5.8h, v27.8b
+    uaddw v6.8h, v6.8h, v28.8b
+    uaddw v7.8h, v7.8h, v29.8b
+
+    sqxtun v0.8b, v0.8h    // saturate to [0, 255]
+    sqxtun v1.8b, v1.8h
+    sqxtun v2.8b, v2.8h
+    sqxtun v3.8b, v3.8h
+    sqxtun v4.8b, v4.8h
+    sqxtun v5.8b, v5.8h
+    sqxtun v6.8b, v6.8h
+    sqxtun v7.8b, v7.8h
+
+    st1 {v0.8b}, [x2], x4
+    st1 {v1.8b}, [x2], x4
+    st1 {v2.8b}, [x2], x4
+    st1 {v3.8b}, [x2], x4
+    st1 {v4.8b}, [x2], x4
+    st1 {v5.8b}, [x2], x4
+    st1 {v6.8b}, [x2], x4
+    st1 {v7.8b}, [x2]
+
+    pop_v_regs
+    ret
+
+
+
+
+
diff --git a/common/armv8/ih264_iquant_itrans_recon_dc_av8.s b/common/armv8/ih264_iquant_itrans_recon_dc_av8.s
new file mode 100755
index 0000000..8bb9c32
--- /dev/null
+++ b/common/armv8/ih264_iquant_itrans_recon_dc_av8.s
@@ -0,0 +1,397 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+// *******************************************************************************
+// * @file
+// * ih264_iquant_itrans_recon_dc_av8.s
+// *
+// * @brief
+// * Contains function definitions for single stage inverse transform
+// *
+// * @author
+// * Mohit
+// *
+// * @par List of Functions:
+// * - ih264_iquant_itrans_recon_4x4_dc_av8()
+// * - ih264_iquant_itrans_recon_8x8_dc_av8()
+// * - ih264_iquant_itrans_recon_chroma_4x4_dc_av8()
+// *
+// * @remarks
+// * None
+// *
+// *******************************************************************************
+//*/
+
+
+.include "ih264_neon_macros.s"
+
+
+///**
+// *******************************************************************************
+// *
+// * @brief
+// * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
+// * for dc input pattern only, i.e. only the (0,0) element of the input 4x4 block is
+// * non-zero. For complete function, refer ih264_iquant_itrans_recon_a9.s
+// *
+// * @par Description:
+// * Performs inverse transform Ci4 and adds the residue to get the
+// * reconstructed block
+// *
+// * @param[in] pi2_src
+// * Input 4x4 coefficients
+// *
+// * @param[in] pu1_pred
+// * Prediction 4x4 block
+// *
+// * @param[out] pu1_out
+// * Output 4x4 block
+// *
+// * @param[in] u4_qp_div_6
+// * QP
+// *
+// * @param[in] pu2_weigh_mat
+// * Pointer to weight matrix
+// *
+// * @param[in] pred_strd,
+// * Prediction stride
+// *
+// * @param[in] out_strd
+// * Output Stride
+// *
+// *@param[in] pi2_tmp
+// * temporary buffer of size 1*16
+// *
+// * @param[in] pu2_iscal_mat
+// * Pointer to the inverse quantization matrix
+// *
+// * @returns Void
+// *
+// * @remarks
+// * None
+// *
+// *******************************************************************************
+// */
+//void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src,
+// UWORD8 *pu1_pred,
+// UWORD8 *pu1_out,
+// WORD32 pred_strd,
+// WORD32 out_strd,
+// const UWORD16 *pu2_iscal_mat,
+// const UWORD16 *pu2_weigh_mat,
+// UWORD32 u4_qp_div_6,
+// WORD32 *pi4_tmp,
+// WORD32 iq_start_idx
+// WORD16 *pi2_dc_ld_addr)
+//**************Variables Vs Registers*****************************************
+//x0 => *pi2_src
+//x1 => *pu1_pred
+//x2 => *pu1_out
+//x3 => pred_strd
+//x4 => out_strd
+//x5 => *pu2_iscal_mat
+//x6 => *pu2_weigh_mat
+//x7 => u4_qp_div_6
+// => pi4_tmp
+// => iq_start_idx
+// => pi2_dc_ld_addr
+
+.text
+.p2align 2
+
+    .global ih264_iquant_itrans_recon_4x4_dc_av8
+ih264_iquant_itrans_recon_4x4_dc_av8:
+
+    // DC-only 4x4 path: with a single DC coefficient the inverse transform
+    // is a constant, so dequantize the DC value, round, and add it to every
+    // prediction sample.
+
+    ldr w8, [sp, #8]    //Loads iq_start_idx
+    subs w8, w8, #1    // if x8 == 1 => intra case , so result of subtraction is zero and z flag is set
+
+    ldr x10, [sp, #16]    //Load alternate dc address
+    push_v_regs
+    dup v30.4s, w7    //Populate the u4_qp_div_6 in Q15
+
+
+    bne donot_use_pi2_dc_ld_addr_luma_dc
+    ld1 {v0.h}[0], [x10]    // x8==1: take the DC value from pi2_dc_ld_addr
+donot_use_pi2_dc_ld_addr_luma_dc:
+
+    beq donot_use_pi2_src_luma_dc
+    ld1 {v0.h}[0], [x5]    // otherwise dequantize pi2_src[0] here
+    ld1 {v1.h}[0], [x6]
+    ld1 {v2.h}[0], [x0]
+    mul v0.4h, v1.4h, v0.4h    // scale * weight
+    smull v0.4s, v0.4h, v2.4h    // * src DC
+    sshl v0.4s, v0.4s, v30.4s    // << (qp / 6)
+    sqrshrn v0.4h, v0.4s, #4    // (q + 8) >> 4
+donot_use_pi2_src_luma_dc:
+
+
+    dup v0.8h, v0.h[0]    // broadcast the DC residue to all lanes
+    srshr v0.8h, v0.8h, #6    // final rounding: (dc + 32) >> 6
+
+    ld1 {v1.s}[0], [x1], x3    // load 4 rows of pu1_pred, 4 bytes each
+    ld1 {v1.s}[1], [x1], x3
+    ld1 {v2.s}[0], [x1], x3
+    ld1 {v2.s}[1], [x1]
+
+    uxtl v1.8h, v1.8b
+    uxtl v2.8h, v2.8b
+
+    add v1.8h, v0.8h, v1.8h    // pred + dc residue
+    add v2.8h, v0.8h, v2.8h
+
+    sqxtun v1.8b, v1.8h    // saturate to [0, 255]
+    sqxtun v2.8b, v2.8h
+
+    st1 {v1.s}[0], [x2], x4
+    st1 {v1.s}[1], [x2], x4
+    st1 {v2.s}[0], [x2], x4
+    st1 {v2.s}[1], [x2]
+    pop_v_regs
+    ret
+
+
+// /*
+// ********************************************************************************
+// *
+// * @brief This function reconstructs a 4x4 sub block from quantized resiude and
+// * prediction buffer if only dc value is present for residue
+// *
+// * @par Description:
+// * The quantized residue is first inverse quantized,
+// * This inverse quantized content is added to the prediction buffer to recon-
+// * struct the end output
+// *
+// * @param[in] pi2_src
+// * quantized dc coeffiient
+// *
+// * @param[in] pu1_pred
+// * prediction 4x4 block in interleaved format
+// *
+// * @param[in] pred_strd,
+// * Prediction buffer stride in interleaved format
+// *
+// * @param[in] out_strd
+// * recon buffer Stride
+// *
+// * @returns none
+// *
+// * @remarks none
+// *
+// *******************************************************************************
+// */
+// void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src,
+// UWORD8 *pu1_pred,
+// UWORD8 *pu1_out,
+// WORD32 pred_strd,
+// WORD32 out_strd,
+// const UWORD16 *pu2_iscal_mat,
+// const UWORD16 *pu2_weigh_mat,
+// UWORD32 u4_qp_div_6,
+// WORD16 *pi2_tmp,
+// WORD16 *pi2_dc_src)
+// Register Usage
+// x0 : pi2_src
+// x1 : pu1_pred
+// x2 : pu1_out
+// x3 : pred_strd
+// x4 : out_strd
+// x5 : pu2_iscal_mat
+// x6 : pu2_weigh_mat
+// x7 : u4_qp_div_6
+// : pi2_tmp
+// : pi2_dc_src
+// Neon registers d0-d7, d16-d30 are used
+// No need for pushing arm and neon registers
+
+
+    .global ih264_iquant_itrans_recon_chroma_4x4_dc_av8
+ih264_iquant_itrans_recon_chroma_4x4_dc_av8:
+
+    // DC-only chroma path: the dequantized DC is read from pi2_dc_src,
+    // rounded, and added to one de-interleaved chroma plane; the results
+    // are merged back into the interleaved output with a byte mask.
+
+    ldr x0, [sp, #8]    // x0 = pi2_dc_src
+    push_v_regs
+    ld1 {v0.h}[0], [x0]
+    dup v0.8h, v0.h[0]    // broadcast the DC residue
+    srshr v0.8h, v0.8h, #6    // (dc + 32) >> 6
+
+
+    //backup pu1_out
+    mov x0, x2
+
+    //nop v3.16b    //dummy for deinterleaving
+    movi v31.8h, #0x00ff    //mask for interleaving [copy lower 8 bits]
+
+    ld1 {v1.d}[0], [x1], x3    // load 4 rows of interleaved pu1_pred
+    ld1 {v1.d}[1], [x1], x3
+    ld1 {v2.d}[0], [x1], x3
+    ld1 {v2.d}[1], [x1], x3
+
+    ld1 {v11.d}[0], [x2], x4    //load pu1_out for interleaving
+    ld1 {v11.d}[1], [x2], x4
+    ld1 {v12.d}[0], [x2], x4
+    ld1 {v12.d}[1], [x2]
+
+    uzp1 v1.16b, v1.16b, v3.16b    // keep even bytes (one chroma plane); v3's contents are don't-care
+    uzp1 v2.16b, v2.16b, v3.16b
+
+    uaddw v1.8h, v0.8h, v1.8b    // pred + dc residue
+    uaddw v2.8h, v0.8h, v2.8b
+
+    sqxtun v1.8b, v1.8h    // saturate to [0, 255]
+    sqxtun v2.8b, v2.8h
+
+    uxtl v1.8h, v1.8b    // widen so each result byte sits in the low byte of a halfword
+    uxtl v2.8h, v2.8b
+
+    bit v11.16b, v1.16b, v31.16b    // insert plane bytes into the interleaved output
+    bit v12.16b, v2.16b, v31.16b
+
+    st1 {v11.d}[0], [x0], x4
+    st1 {v11.d}[1], [x0], x4
+    st1 {v12.d}[0], [x0], x4
+    st1 {v12.d}[1], [x0]
+    pop_v_regs
+    ret
+
+
+///*
+// *******************************************************************************
+// *
+// * //brief
+// * This function performs inverse quant and Inverse transform type Ci4 for 8*8 block
+// * [Only for Dc coeff]
+// * //par Description:
+// * Performs inverse transform Ci8 and adds the residue to get the
+// * reconstructed block
+// *
+// * //param[in] pi2_src
+// * Input 4x4 coefficients
+// *
+// * //param[in] pu1_pred
+// * Prediction 4x4 block
+// *
+// * //param[out] pu1_out
+// * Output 4x4 block
+// *
+// * //param[in] u4_qp_div_6
+// * QP
+// *
+// * //param[in] pu2_weigh_mat
+// * Pointer to weight matrix
+// *
+// * //param[in] pred_strd,
+// * Prediction stride
+// *
+// * //param[in] out_strd
+// * Output Stride
+// *
+// *//param[in] pi2_tmp
+// * temporary buffer of size 1*64
+// *
+// * //param[in] pu2_iscal_mat
+// * Pointer to the inverse quantization matrix
+// *
+// * //returns Void
+// *
+// * //remarks
+// * None
+// *
+// *******************************************************************************
+// */
+//void ih264_iquant_itrans_recon_dc_8x8(WORD16 *pi2_src,
+// UWORD8 *pu1_pred,
+// UWORD8 *pu1_out,
+// WORD32 pred_strd,
+// WORD32 out_strd,
+// const UWORD16 *pu2_iscal_mat,
+// const UWORD16 *pu2_weigh_mat,
+// UWORD32 u4_qp_div_6,
+// WORD32 *pi4_tmp,
+// WORD32 iq_start_idx
+// WORD16 *pi2_dc_ld_addr)
+//**************Variables Vs Registers*****************************************
+//x0 => *pi2_src
+//x1 => *pu1_pred
+//x2 => *pu1_out
+//x3 => pred_strd
+//x4 => out_strd
+//x5 => *pu2_iscal_mat
+//x6 => *pu2_weigh_mat
+//x7 => u4_qp_div_6
+//NOT USED => pi4_tmp
+//NOT USED => iq_start_idx
+//NOT USED => pi2_dc_ld_addr
+
+    .global ih264_iquant_itrans_recon_8x8_dc_av8
+ih264_iquant_itrans_recon_8x8_dc_av8:
+
+    // DC-only 8x8 path: dequantize the single DC coefficient, round, and
+    // add it to all 64 prediction samples.
+
+    push_v_regs
+
+    ld1 {v1.h}[0], [x5]    // iscal[0]
+    ld1 {v2.h}[0], [x6]    // weight[0]
+    ld1 {v0.h}[0], [x0]    // src DC coefficient
+    dup v3.4s, w7    // u4_qp_div_6 as the variable shift amount
+
+
+    mul v1.8h, v1.8h, v2.8h    // scale * weight (only lane 0 is meaningful)
+    smull v0.4s, v0.4h, v1.4h    // * src DC, widened to 32 bit
+    sshl v0.4s, v0.4s, v3.4s    // << (qp / 6)
+
+    sqrshrn v0.4h, v0.4s, #6    // dequant rounding: (q + 32) >> 6
+    srshr v0.8h, v0.8h, #6    // transform rounding: (dc + 32) >> 6
+    dup v0.8h, v0.h[0]    // broadcast the DC residue
+
+    ld1 {v22.8b}, [x1], x3    // load 8 rows of pu1_pred
+    ld1 {v23.8b}, [x1], x3
+    ld1 {v24.8b}, [x1], x3
+    ld1 {v25.8b}, [x1], x3
+    ld1 {v26.8b}, [x1], x3
+    ld1 {v27.8b}, [x1], x3
+    ld1 {v28.8b}, [x1], x3
+    ld1 {v29.8b}, [x1]
+
+    uaddw v1.8h, v0.8h, v22.8b    // pred + dc residue
+    uaddw v2.8h, v0.8h, v23.8b
+    uaddw v3.8h, v0.8h, v24.8b
+    uaddw v8.8h, v0.8h, v25.8b
+    uaddw v9.8h, v0.8h, v26.8b
+    uaddw v10.8h, v0.8h, v27.8b
+    uaddw v11.8h, v0.8h, v28.8b
+    uaddw v12.8h, v0.8h, v29.8b
+
+    sqxtun v1.8b, v1.8h    // saturate to [0, 255]
+    sqxtun v2.8b, v2.8h
+    sqxtun v3.8b, v3.8h
+    sqxtun v8.8b, v8.8h
+    sqxtun v9.8b, v9.8h
+    sqxtun v10.8b, v10.8h
+    sqxtun v11.8b, v11.8h
+    sqxtun v12.8b, v12.8h
+
+    st1 {v1.8b}, [x2], x4
+    st1 {v2.8b}, [x2], x4
+    st1 {v3.8b}, [x2], x4
+    st1 {v8.8b}, [x2], x4
+    st1 {v9.8b}, [x2], x4
+    st1 {v10.8b}, [x2], x4
+    st1 {v11.8b}, [x2], x4
+    st1 {v12.8b}, [x2]
+
+    pop_v_regs
+    ret
+
+
+
diff --git a/common/armv8/ih264_mem_fns_neon_av8.s b/common/armv8/ih264_mem_fns_neon_av8.s
new file mode 100755
index 0000000..f5c2e29
--- /dev/null
+++ b/common/armv8/ih264_mem_fns_neon_av8.s
@@ -0,0 +1,274 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+// *******************************************************************************
+// * @file
+// * ih264_mem_fns_neon.s
+// *
+// * @brief
+// * Contains function definitions for memory manipulation
+// *
+// * @author
+// * Naveen SR
+// *
+// * @par List of Functions:
+// * - ih264_memcpy_av8()
+// * - ih264_memcpy_mul_8_av8()
+// * - ih264_memset_mul_8_av8()
+// * - ih264_memset_16bit_mul_8_av8()
+// * - ih264_memset_16bit_av8()
+// *
+// * @remarks
+// * None
+// *
+// *******************************************************************************
+//*/
+
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* memcpy of a 1d array
+//*
+//* @par Description:
+//* Does memcpy of 8bit data from source to destination for 8,16 or 32 number of bytes
+//*
+//* @param[in] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[in] num_bytes
+//* number of bytes to copy
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+//void ih264_memcpy_mul_8(UWORD8 *pu1_dst,
+// UWORD8 *pu1_src,
+// UWORD8 num_bytes)
+//**************Variables Vs Registers*************************
+// x0 => *pu1_dst
+// x1 => *pu1_src
+// x2 => num_bytes
+
+
+
+
+
+    .global ih264_memcpy_mul_8_av8
+
+// Copies num_bytes (assumed a multiple of 8) from pu1_src (x1) to
+// pu1_dst (x0), eight bytes per iteration.
+ih264_memcpy_mul_8_av8:
+
+loop_neon_memcpy_mul_8:
+    ld1 {v7.8b}, [x1], #8         // fetch 8 source bytes
+    subs x2, x2, #8               // decrement count first; st1 does not touch flags
+    st1 {v7.8b}, [x0], #8         // write 8 destination bytes
+    bne loop_neon_memcpy_mul_8
+    ret
+
+
+
+//*******************************************************************************
+//*/
+//void ih264_memcpy(UWORD8 *pu1_dst,
+// UWORD8 *pu1_src,
+// UWORD8 num_bytes)
+//**************Variables Vs Registers*************************
+// x0 => *pu1_dst
+// x1 => *pu1_src
+// x2 => num_bytes
+
+
+
+    .global ih264_memcpy_av8
+
+// General memcpy: 8 bytes at a time with NEON, then a scalar byte loop
+// for the 0..7 byte remainder.
+ih264_memcpy_av8:
+    subs x2, x2, #8
+    blt arm_memcpy                // fewer than 8 bytes: scalar loop only
+loop_neon_memcpy:
+    // Memcpy 8 bytes
+    ld1 {v0.8b}, [x1], #8
+    st1 {v0.8b}, [x0], #8
+
+    subs x2, x2, #8
+    bge loop_neon_memcpy
+    cmp x2, #-8                   // x2 == -8 means the count was an exact multiple of 8
+    beq end_func1
+
+arm_memcpy:
+    add x2, x2, #8                // restore the remainder count (1..7)
+
+loop_arm_memcpy:
+    ldrb w3, [x1], #1             // ldrb zero-extends into x3; the old 'sxtw x3, w3'
+    strb w3, [x0], #1             // after each access was a no-op and has been dropped
+    subs x2, x2, #1
+    bne loop_arm_memcpy
+    ret
+end_func1:
+    ret
+
+
+//void ih264_memset_mul_8(UWORD8 *pu1_dst,
+// UWORD8 value,
+// UWORD8 num_bytes)
+//**************Variables Vs Registers*************************
+// x0 => *pu1_dst
+// x1 => value
+// x2 => num_bytes
+
+
+    .global ih264_memset_mul_8_av8
+
+// Fills num_bytes (assumed 8, 16 or 32) destination bytes with 'value'.
+ih264_memset_mul_8_av8:
+
+// Assumptions: numbytes is either 8, 16 or 32
+    dup v7.8b, w1                 // replicate the byte value across one D register
+loop_memset_mul_8:
+    subs x2, x2, #8               // one 8-byte store per iteration; st1 leaves flags alone
+    st1 {v7.8b}, [x0], #8
+    bne loop_memset_mul_8
+
+    ret
+
+
+//void ih264_memset(UWORD8 *pu1_dst,
+// UWORD8 value,
+// UWORD8 num_bytes)
+//**************Variables Vs Registers*************************
+// x0 => *pu1_dst
+// x1 => value
+// x2 => num_bytes
+
+
+
+    .global ih264_memset_av8
+
+// General memset: 8 bytes at a time with NEON, then a scalar byte loop
+// for the 0..7 byte remainder.
+ih264_memset_av8:
+    subs x2, x2, #8
+    blt arm_memset                // fewer than 8 bytes: scalar loop only
+    dup v0.8b, w1                 // replicate the byte value
+loop_neon_memset:
+    st1 {v0.8b}, [x0], #8
+
+    subs x2, x2, #8
+    bge loop_neon_memset
+    cmp x2, #-8                   // exact multiple of 8: nothing left to do
+    beq end_func2
+
+arm_memset:
+    add x2, x2, #8                // restore the remainder count (1..7)
+
+loop_arm_memset:
+    strb w1, [x0], #1             // strb reads only the low byte of w1; the old
+    subs x2, x2, #1               // 'sxtw x1, w1' was dead and has been dropped
+    bne loop_arm_memset
+    ret
+end_func2:
+    ret
+
+
+
+
+
+//void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst,
+// UWORD16 value,
+// UWORD8 num_words)
+//**************Variables Vs Registers*************************
+// x0 => *pu2_dst
+// x1 => value
+// x2 => num_words
+
+
+    .global ih264_memset_16bit_mul_8_av8
+
+// Fills num_words (assumed a multiple of 8) halfwords with 'value'.
+ih264_memset_16bit_mul_8_av8:
+
+// Assumptions: num_words is either 8, 16 or 32
+
+    // Memset 8 words per iteration
+    dup v0.8h, w1                 // fill all 8 lanes (the old dup v0.4h defined only the
+                                  // low 64 bits, which forced two 8-byte stores per loop)
+loop_memset_16bit_mul_8:
+    st1 {v0.8h}, [x0], #16        // single 16-byte store, identical bytes written
+
+    subs x2, x2, #8
+    bne loop_memset_16bit_mul_8
+
+    ret
+
+
+
+//void ih264_memset_16bit(UWORD16 *pu2_dst,
+// UWORD16 value,
+// UWORD8 num_words)
+//**************Variables Vs Registers*************************
+// x0 => *pu2_dst
+// x1 => value
+// x2 => num_words
+
+
+
+    .global ih264_memset_16bit_av8
+
+// General 16-bit memset: 8 halfwords at a time with NEON, then a scalar
+// loop for the 0..7 halfword remainder.
+ih264_memset_16bit_av8:
+    subs x2, x2, #8
+    blt arm_memset_16bit          // fewer than 8 halfwords: scalar loop only
+    dup v0.8h, w1                 // fill all 8 lanes (dup v0.4h only defined the low half,
+                                  // which forced two 8-byte stores per iteration before)
+loop_neon_memset_16bit:
+    // Memset 8 words
+    st1 {v0.8h}, [x0], #16
+
+    subs x2, x2, #8
+    bge loop_neon_memset_16bit
+    cmp x2, #-8                   // exact multiple of 8: nothing left to do
+    beq end_func3
+
+arm_memset_16bit:
+    add x2, x2, #8                // restore the remainder count (1..7)
+
+loop_arm_memset_16bit:
+    strh w1, [x0], #2             // strh reads only the low 16 bits of w1; the old
+    subs x2, x2, #1               // 'sxtw x1, w1' was dead and has been dropped
+    bne loop_arm_memset_16bit
+    ret
+
+end_func3:
+    ret
+
+
+
diff --git a/common/armv8/ih264_neon_macros.s b/common/armv8/ih264_neon_macros.s
new file mode 100755
index 0000000..6ff5b91
--- /dev/null
+++ b/common/armv8/ih264_neon_macros.s
@@ -0,0 +1,41 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+//*******************************************************************************
+
+
+.macro push_v_regs
+// Saves the low 64 bits (d8-d15) of the callee-saved SIMD registers
+// v8-v15, as required by AAPCS64 before clobbering them.
+    stp d8, d9, [sp, #-16]!
+    stp d10, d11, [sp, #-16]!
+    stp d12, d13, [sp, #-16]!
+    stp d14, d15, [sp, #-16]!
+.endm
+.macro pop_v_regs
+// Restores exactly what push_v_regs saved, in reverse order.
+    ldp d14, d15, [sp], #16
+    ldp d12, d13, [sp], #16
+    ldp d10, d11, [sp], #16
+    ldp d8, d9, [sp], #16
+.endm
+
+.macro swp reg1, reg2
+// XOR-swap of two general registers without a scratch register.
+// NOTE(review): like any XOR swap, invoking this with \reg1 == \reg2
+// zeroes the register instead of swapping - never use aliased operands.
+    eor \reg1, \reg1, \reg2
+    eor \reg2, \reg1, \reg2
+    eor \reg1, \reg1, \reg2
+.endm
+
diff --git a/common/armv8/ih264_padding_neon_av8.s b/common/armv8/ih264_padding_neon_av8.s
new file mode 100755
index 0000000..35d9c8a
--- /dev/null
+++ b/common/armv8/ih264_padding_neon_av8.s
@@ -0,0 +1,784 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+// *******************************************************************************
+// * @file
+// * ih264_padding_neon.s
+// *
+// * @brief
+// * Contains function definitions for padding
+// *
+// * @author
+// * Ittiam
+// *
+// * @par List of Functions:
+// * - ih264_pad_top_av8()
+// * - ih264_pad_left_luma_av8()
+// * - ih264_pad_left_chroma_av8()
+// * - ih264_pad_right_luma_av8()
+// * - ih264_pad_right_chroma_av8()
+// *
+// * @remarks
+// * None
+// *
+// *******************************************************************************
+//*/
+
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+///**
+//*******************************************************************************
+//*
+//* @brief pad at the top of a 2d array
+//*
+//* @par Description:
+//* The top row of a 2d array is replicated for pad_size times at the top
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @param[in] pad_size
+//* integer -padding size of the array
+//*
+//* @returns none
+//*
+//* @remarks none
+//*
+//*******************************************************************************
+//*/
+//void ih264_pad_top(UWORD8 *pu1_src,
+// WORD32 src_strd,
+// WORD32 wd,
+// WORD32 pad_size)
+//**************Variables Vs Registers*************************
+// x0 => *pu1_src
+// x1 => src_strd
+// x2 => wd
+// x3 => pad_size
+
+    .global ih264_pad_top_av8
+
+// Replicates the top row of the 2d array pad_size times above it,
+// 16 columns per outer iteration (wd is assumed a multiple of 16).
+ih264_pad_top_av8:
+
+    // STMFD sp!, {x4-x11,x14} //stack stores the values of the arguments
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    sub x5, x0, x1                // x5 = first padding row (one row above pu1_src)
+    neg x6, x1                    // x6 = -src_strd, to step upwards row by row
+                                  // (replaces the pointless 'sub x20, x1, #0 ; neg x6, x20')
+
+loop_neon_memcpy_mul_16:
+    // Load 16 bytes of the top row
+    ld1 {v0.8b, v1.8b}, [x0], #16
+    mov x4, x5                    // start of the padding column strip
+    mov x7, x3                    // pad_size rows to fill
+    add x5, x5, #16
+
+loop_neon_pad_top:
+    st1 {v0.8b, v1.8b}, [x4], x6  // store the same 16 bytes into every row above
+    subs x7, x7, #1
+    bne loop_neon_pad_top
+
+    subs x2, x2, #16
+    bne loop_neon_memcpy_mul_16
+
+    // LDMFD sp!,{x4-x11,pc} //Reload the registers from SP
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* Padding (luma block) at the left of a 2d array
+//*
+//* @par Description:
+//* The left column of a 2d array is replicated for pad_size times at the left
+//*
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @param[in] pad_size
+//* integer -padding size of the array
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+//#if PAD_LEFT_LUMA == C
+//void ih264_pad_left_luma(UWORD8 *pu1_src,
+// WORD32 src_strd,
+// WORD32 ht,
+// WORD32 pad_size)
+//**************Variables Vs Registers*************************
+// x0 => *pu1_src
+// x1 => src_strd
+// x2 => ht
+// x3 => pad_size
+
+
+
+    .global ih264_pad_left_luma_av8
+
+// Replicates the left column pad_size times to the left of the array.
+// Two unrolled paths: pad_size == 16 (loop_16) and pad_size == 32
+// (loop_32), each processing 8 rows per iteration (ht assumed 8 or 16).
+ih264_pad_left_luma_av8:
+
+    // STMFD sp!, {x4-x11,x14} //stack stores the values of the arguments
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+
+    sub x4, x0, x3               // x4 = start of the left padding region
+    sub x6, x1, #16              // stride minus the 16 bytes already advanced (32-wide path)
+    subs x5, x3, #16
+    bne loop_32                  // pad_size != 16 -> assume 32
+loop_16: // /*hard coded for width=16 ,height =8,16*/
+    ldrb w8, [x0]                // read the leftmost pixel of each row...
+    add x0, x0, x1
+    sxtw x8, w8
+    ldrb w9, [x0]
+    add x0, x0, x1
+    sxtw x9, w9
+    dup v0.16b, w8               // ...and splat it across 16 padding bytes
+    ldrb w10, [x0]
+    add x0, x0, x1
+    sxtw x10, w10
+    st1 {v0.16b}, [x4], x1 // 16 bytes store
+    dup v2.16b, w9
+    st1 {v2.16b}, [x4], x1 // 16 bytes store
+    ldrb w11, [x0]
+    add x0, x0, x1
+    sxtw x11, w11
+    dup v4.16b, w10
+    dup v6.16b, w11
+    st1 {v4.16b}, [x4], x1 // 16 bytes store
+    ldrb w8, [x0]
+    add x0, x0, x1
+    sxtw x8, w8
+    st1 {v6.16b}, [x4], x1 // 16 bytes store
+    ldrb w9, [x0]
+    add x0, x0, x1
+    sxtw x9, w9
+    dup v0.16b, w8
+    ldrb w10, [x0]
+    add x0, x0, x1
+    sxtw x10, w10
+    st1 {v0.16b}, [x4], x1 // 16 bytes store
+    dup v2.16b, w9
+    ldrb w11, [x0]
+    add x0, x0, x1
+    sxtw x11, w11
+    st1 {v2.16b}, [x4], x1 // 16 bytes store
+    dup v4.16b, w10
+    dup v6.16b, w11
+    subs x2, x2, #8              // 8 rows done per iteration
+    st1 {v4.16b}, [x4], x1 // 16 bytes store
+    st1 {v6.16b}, [x4], x1 // 16 bytes store
+    bne loop_16
+    b end_func
+
+loop_32: // /*hard coded for width=32 ,height =8,16*/
+    ldrb w8, [x0]                // same scheme, but two 16-byte stores per padded row
+    add x0, x0, x1
+    sxtw x8, w8
+    ldrb w9, [x0]
+    add x0, x0, x1
+    sxtw x9, w9
+    dup v0.16b, w8
+    ldrb w10, [x0]
+    add x0, x0, x1
+    sxtw x10, w10
+    st1 {v0.16b}, [x4], #16 // 16 bytes store
+    dup v2.16b, w9
+    st1 {v0.16b}, [x4], x6
+    st1 {v2.16b}, [x4], #16 // 16 bytes store
+    dup v4.16b, w10
+    st1 {v2.16b}, [x4], x6 // 16 bytes store
+    ldrb w11, [x0]
+    add x0, x0, x1
+    sxtw x11, w11
+    st1 {v4.16b}, [x4], #16 // 16 bytes store
+    dup v6.16b, w11
+    st1 {v4.16b}, [x4], x6 // 16 bytes store
+    ldrb w8, [x0]
+    add x0, x0, x1
+    sxtw x8, w8
+    st1 {v6.16b}, [x4], #16 // 16 bytes store
+    dup v0.16b, w8
+    ldrb w9, [x0]
+    add x0, x0, x1
+    sxtw x9, w9
+    st1 {v6.16b}, [x4], x6 // 16 bytes store
+    ldrb w10, [x0]
+    add x0, x0, x1
+    sxtw x10, w10
+    st1 {v0.16b}, [x4], #16 // 16 bytes store
+    dup v2.16b, w9
+    st1 {v0.16b}, [x4], x6 // 16 bytes store
+    ldrb w11, [x0]
+    add x0, x0, x1
+    sxtw x11, w11
+    st1 {v2.16b}, [x4], #16 // 16 bytes store
+    dup v4.16b, w10
+    st1 {v2.16b}, [x4], x6 // 16 bytes store
+    st1 {v4.16b}, [x4], #16 // 16 bytes store
+    dup v6.16b, w11
+    st1 {v4.16b}, [x4], x6 // 16 bytes store
+    subs x2, x2, #8              // 8 rows done per iteration
+    st1 {v6.16b}, [x4], #16 // 16 bytes store
+    st1 {v6.16b}, [x4], x6 // 16 bytes store
+    bne loop_32
+
+
+
+end_func:
+    // LDMFD sp!,{x4-x11,pc} //Reload the registers from SP
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* Padding (chroma block) at the left of a 2d array
+//*
+//* @par Description:
+//* The left column of a 2d array is replicated for pad_size times at the left
+//*
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array (each colour component)
+//*
+//* @param[in] pad_size
+//* integer -padding size of the array
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+//#if PAD_LEFT_CHROMA == C
+//void ih264_pad_left_chroma(UWORD8 *pu1_src,
+// WORD32 src_strd,
+// WORD32 ht,
+// WORD32 pad_size)
+//{
+// x0 => *pu1_src
+// x1 => src_strd
+// x2 => ht
+// x3 => pad_size
+
+
+
+    .global ih264_pad_left_chroma_av8
+
+// Replicates the leftmost interleaved UV pair pad_size times to the left
+// of the chroma array; hard coded for pad_size == 32, 4 rows per block.
+ih264_pad_left_chroma_av8:
+
+    // STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    sub x4, x0, x3               // x4 = start of the left padding region
+    sub x6, x1, #16              // stride minus the 16 bytes already advanced
+
+
+loop_32_l_c: // /*hard coded for width=32 ,height =4,8,12*/
+    ldrh w8, [x0]                // read the left UV pair of each row...
+    add x0, x0, x1
+    sxtw x8, w8
+    ldrh w9, [x0]
+    add x0, x0, x1
+    sxtw x9, w9
+    dup v0.8h, w8                // ...and splat it as 8 repeated UV pairs
+    ldrh w10, [x0]
+    add x0, x0, x1
+    sxtw x10, w10
+    st1 {v0.16b}, [x4], #16 // 16 bytes store
+    dup v2.8h, w9
+    st1 {v0.16b}, [x4], x6 // 16 bytes store
+    ldrh w11, [x0]
+    add x0, x0, x1
+    sxtw x11, w11
+    st1 {v2.16b}, [x4], #16 // 16 bytes store
+    dup v4.8h, w10
+    st1 {v2.16b}, [x4], x6 // 16 bytes store
+    dup v6.8h, w11
+    st1 {v4.16b}, [x4], #16 // 16 bytes store
+    st1 {v4.16b}, [x4], x6 // 16 bytes store
+    subs x2, x2, #4              // 4 rows done
+    st1 {v6.16b}, [x4], #16 // 16 bytes store
+    st1 {v6.16b}, [x4], x6 // 16 bytes store
+
+
+    beq end_func_l_c ///* Branching when ht=4*/
+
+    ldrh w8, [x0]
+    add x0, x0, x1
+    sxtw x8, w8
+    ldrh w9, [x0]
+    add x0, x0, x1
+    sxtw x9, w9
+    dup v0.8h, w8
+    ldrh w10, [x0]
+    add x0, x0, x1
+    sxtw x10, w10
+    st1 {v0.16b}, [x4], #16 // 16 bytes store
+    dup v2.8h, w9
+    st1 {v0.16b}, [x4], x6
+    ldrh w11, [x0]
+    add x0, x0, x1
+    sxtw x11, w11
+    st1 {v2.16b}, [x4], #16 // 16 bytes store
+    dup v4.8h, w10
+    st1 {v2.16b}, [x4], x6 // 16 bytes store
+    dup v6.8h, w11
+    st1 {v4.16b}, [x4], #16 // 16 bytes store
+    st1 {v4.16b}, [x4], x6 // 16 bytes store
+    subs x2, x2, #4              // 4 more rows done
+    st1 {v6.16b}, [x4], #16 // 16 bytes store
+    st1 {v6.16b}, [x4], x6 // 16 bytes store
+
+    beq end_func_l_c ///* Branching when ht=8*/
+    bne loop_32_l_c
+
+// NOTE(review): the block below is unreachable - when the beq above is not
+// taken Z is clear, so the bne always loops. Presumably intended as an
+// unrolled tail for ht=12; confirm before relying on it.
+    ldrh w8, [x0]
+    add x0, x0, x1
+    sxtw x8, w8
+    ldrh w9, [x0]
+    add x0, x0, x1
+    sxtw x9, w9
+    dup v0.8h, w8
+    ldrh w10, [x0]
+    add x0, x0, x1
+    sxtw x10, w10
+    st1 {v0.16b}, [x4], #16 // 16 bytes store
+    dup v2.8h, w9
+    st1 {v0.16b}, [x4], x6
+    ldrh w11, [x0]
+    add x0, x0, x1
+    sxtw x11, w11
+    st1 {v2.16b}, [x4], #16 // 16 bytes store
+    dup v4.8h, w10
+    st1 {v2.16b}, [x4], x6 // 16 bytes store
+    dup v6.8h, w11
+    st1 {v4.16b}, [x4], #16 // 16 bytes store
+    st1 {v4.16b}, [x4], x6 // 16 bytes store
+    st1 {v6.16b}, [x4], #16 // 16 bytes store
+    st1 {v6.16b}, [x4], x6 // 16 bytes store
+
+end_func_l_c:
+    // LDMFD sp!,{x4-x11,pc} //Reload the registers from SP
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* Padding (luma block) at the right of a 2d array
+//*
+//* @par Description:
+//* The right column of a 2d array is replicated for pad_size times at the right
+//*
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @param[in] pad_size
+//* integer -padding size of the array
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+//#if PAD_RIGHT_LUMA == C
+//void ih264_pad_right_luma(UWORD8 *pu1_src,
+// WORD32 src_strd,
+// WORD32 ht,
+// WORD32 pad_size)
+//{
+// WORD32 row;
+//
+// for(row = 0; row < ht; row++)
+// {
+// memset(pu1_src, *(pu1_src -1), pad_size);
+//
+// pu1_src += src_strd;
+// }
+//}
+//
+// x0 => *pu1_src
+// x1 => src_strd
+// x2 => ht
+// x3 => pad_size
+
+
+
+    .global ih264_pad_right_luma_av8
+
+// Replicates the rightmost column pad_size times to the right of the
+// array. Two unrolled paths: pad_size == 16 (loop_16_r) and pad_size == 32
+// (loop_32_r), 8 rows per iteration (ht assumed 8 or 16).
+ih264_pad_right_luma_av8:
+
+    // STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    mov x4, x0                   // x4 = start of the right padding region
+    sub x6, x1, #16              // stride minus the 16 bytes already advanced (32-wide path)
+    sub x0, x0, #1               // x0 = rightmost valid pixel of each row
+    subs x5, x3, #16
+    bne loop_32_r                // FIX: was 'bne loop_32', which branched into the body of
+                                 // ih264_pad_left_luma_av8 and left loop_32_r dead; it only
+                                 // worked because the two loop bodies happened to be identical
+loop_16_r: // /*hard coded for width=16 ,height =8,16*/
+    ldrb w8, [x0]                // read the rightmost pixel of each row...
+    add x0, x0, x1
+    sxtw x8, w8
+    ldrb w9, [x0]
+    add x0, x0, x1
+    sxtw x9, w9
+    dup v0.16b, w8               // ...and splat it across 16 padding bytes
+    ldrb w10, [x0]
+    add x0, x0, x1
+    sxtw x10, w10
+    st1 {v0.16b}, [x4], x1 // 16 bytes store
+    dup v2.16b, w9
+    st1 {v2.16b}, [x4], x1 // 16 bytes store
+    ldrb w11, [x0]
+    add x0, x0, x1
+    sxtw x11, w11
+    dup v4.16b, w10
+    dup v6.16b, w11
+    st1 {v4.16b}, [x4], x1 // 16 bytes store
+    ldrb w8, [x0]
+    add x0, x0, x1
+    sxtw x8, w8
+    st1 {v6.16b}, [x4], x1 // 16 bytes store
+    ldrb w9, [x0]
+    add x0, x0, x1
+    sxtw x9, w9
+    dup v0.16b, w8
+    ldrb w10, [x0]
+    add x0, x0, x1
+    sxtw x10, w10
+    st1 {v0.16b}, [x4], x1 // 16 bytes store
+    dup v2.16b, w9
+    ldrb w11, [x0]
+    add x0, x0, x1
+    sxtw x11, w11
+    st1 {v2.16b}, [x4], x1 // 16 bytes store
+    dup v4.16b, w10
+    dup v6.16b, w11
+    subs x2, x2, #8              // 8 rows done per iteration
+    st1 {v4.16b}, [x4], x1 // 16 bytes store
+    st1 {v6.16b}, [x4], x1 // 16 bytes store
+    bne loop_16_r
+    b end_func_r
+
+loop_32_r: // /*hard coded for width=32 ,height =8,16*/
+    ldrb w8, [x0]                // same scheme, two 16-byte stores per padded row
+    add x0, x0, x1
+    sxtw x8, w8
+    ldrb w9, [x0]
+    add x0, x0, x1
+    sxtw x9, w9
+    dup v0.16b, w8
+    ldrb w10, [x0]
+    add x0, x0, x1
+    sxtw x10, w10
+    st1 {v0.16b}, [x4], #16 // 16 bytes store
+    dup v2.16b, w9
+    st1 {v0.16b}, [x4], x6
+    st1 {v2.16b}, [x4], #16 // 16 bytes store
+    dup v4.16b, w10
+    st1 {v2.16b}, [x4], x6 // 16 bytes store
+    ldrb w11, [x0]
+    add x0, x0, x1
+    sxtw x11, w11
+    st1 {v4.16b}, [x4], #16 // 16 bytes store
+    dup v6.16b, w11
+    st1 {v4.16b}, [x4], x6 // 16 bytes store
+    ldrb w8, [x0]
+    add x0, x0, x1
+    sxtw x8, w8
+    st1 {v6.16b}, [x4], #16 // 16 bytes store
+    ldrb w9, [x0]
+    add x0, x0, x1
+    sxtw x9, w9
+    dup v0.16b, w8
+    st1 {v6.16b}, [x4], x6 // 16 bytes store
+    ldrb w10, [x0]
+    add x0, x0, x1
+    sxtw x10, w10
+    st1 {v0.16b}, [x4], #16 // 16 bytes store
+    dup v2.16b, w9
+    st1 {v0.16b}, [x4], x6 // 16 bytes store
+    ldrb w11, [x0]
+    add x0, x0, x1
+    sxtw x11, w11
+    st1 {v2.16b}, [x4], #16 // 16 bytes store
+    dup v4.16b, w10
+    st1 {v2.16b}, [x4], x6 // 16 bytes store
+    st1 {v4.16b}, [x4], #16 // 16 bytes store
+    dup v6.16b, w11
+    st1 {v4.16b}, [x4], x6 // 16 bytes store
+    subs x2, x2, #8              // 8 rows done per iteration
+    st1 {v6.16b}, [x4], #16 // 16 bytes store
+    st1 {v6.16b}, [x4], x6 // 16 bytes store
+    bne loop_32_r
+
+
+
+end_func_r:
+    // LDMFD sp!,{x4-x11,pc} //Reload the registers from SP
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* Padding (chroma block) at the right of a 2d array
+//*
+//* @par Description:
+//* The right column of a 2d array is replicated for pad_size times at the right
+//*
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array (each colour component)
+//*
+//* @param[in] pad_size
+//* integer -padding size of the array
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+//#if PAD_RIGHT_CHROMA == C
+//void ih264_pad_right_chroma(UWORD8 *pu1_src,
+// WORD32 src_strd,
+// WORD32 ht,
+// WORD32 pad_size)
+// x0 => *pu1_src
+// x1 => src_strd
+// x2 => ht
+// x3 => pad_size
+
+
+
+    .global ih264_pad_right_chroma_av8
+
+// Replicates the rightmost interleaved UV pair pad_size times to the right
+// of the chroma array; hard coded for pad_size == 32, 4 rows per block.
+ih264_pad_right_chroma_av8:
+
+    // STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments
+    push_v_regs
+    stp x19, x20, [sp, #-16]!
+
+    mov x4, x0                   // x4 = start of the right padding region
+    sub x6, x1, #16              // stride minus the 16 bytes already advanced
+    sub x0, x0, #2               // x0 = rightmost valid UV pair of each row
+loop_32_r_c: // /*hard coded for width=32 ,height =8,4*/
+    ldrh w8, [x0]                // read the right UV pair of each row...
+    add x0, x0, x1
+    sxtw x8, w8
+    ldrh w9, [x0]
+    add x0, x0, x1
+    sxtw x9, w9
+    dup v0.8h, w8                // ...and splat it as 8 repeated UV pairs
+    ldrh w10, [x0]
+    add x0, x0, x1
+    sxtw x10, w10
+    st1 {v0.16b}, [x4], #16 // 16 bytes store
+    dup v2.8h, w9
+    st1 {v0.16b}, [x4], x6
+    st1 {v2.16b}, [x4], #16 // 16 bytes store
+    dup v4.8h, w10
+    st1 {v2.16b}, [x4], x6 // 16 bytes store
+    subs x2, x2, #4              // 4 rows done
+    ldrh w11, [x0]
+    add x0, x0, x1
+    sxtw x11, w11
+    st1 {v4.16b}, [x4], #16 // 16 bytes store
+    dup v6.8h, w11
+    st1 {v4.16b}, [x4], x6 // 16 bytes store
+    st1 {v6.16b}, [x4], #16 // 16 bytes store
+    st1 {v6.16b}, [x4], x6 // 16 bytes store
+
+    beq end_func_r_c ///* Branching when ht=4*/
+
+    ldrh w8, [x0]
+    add x0, x0, x1
+    sxtw x8, w8
+    dup v0.8h, w8
+    ldrh w9, [x0]
+    add x0, x0, x1
+    sxtw x9, w9
+    ldrh w10, [x0]
+    add x0, x0, x1
+    sxtw x10, w10
+    st1 {v0.16b}, [x4], #16 // 16 bytes store
+    dup v2.8h, w9
+    st1 {v0.16b}, [x4], x6 // 16 bytes store
+    ldrh w11, [x0]
+    add x0, x0, x1
+    sxtw x11, w11
+    st1 {v2.16b}, [x4], #16 // 16 bytes store
+    dup v4.8h, w10
+    st1 {v2.16b}, [x4], x6 // 16 bytes store
+    st1 {v4.16b}, [x4], #16 // 16 bytes store
+    dup v6.8h, w11
+    st1 {v4.16b}, [x4], x6 // 16 bytes store
+    subs x2, x2, #4              // 4 more rows done
+    st1 {v6.16b}, [x4], #16 // 16 bytes store
+    st1 {v6.16b}, [x4], x6 // 16 bytes store
+
+    beq end_func_r_c ///* Branching when ht=8*/
+    bne loop_32_r_c
+// NOTE(review): the block below is unreachable - when the beq above is not
+// taken Z is clear, so the bne always loops. Presumably intended as an
+// unrolled tail for ht=12; confirm before relying on it.
+    ldrh w8, [x0]
+    add x0, x0, x1
+    sxtw x8, w8
+    dup v0.8h, w8
+    ldrh w9, [x0]
+    add x0, x0, x1
+    sxtw x9, w9
+    ldrh w10, [x0]
+    add x0, x0, x1
+    sxtw x10, w10
+    st1 {v0.16b}, [x4], #16 // 16 bytes store
+    dup v2.8h, w9
+    st1 {v0.16b}, [x4], x6 // 16 bytes store
+    ldrh w11, [x0]
+    add x0, x0, x1
+    sxtw x11, w11
+    st1 {v2.16b}, [x4], #16 // 16 bytes store
+    dup v4.8h, w10
+    st1 {v2.16b}, [x4], x6 // 16 bytes store
+    st1 {v4.16b}, [x4], #16 // 16 bytes store
+    dup v6.8h, w11
+    st1 {v4.16b}, [x4], x6 // 16 bytes store
+    st1 {v6.16b}, [x4], #16 // 16 bytes store
+    st1 {v6.16b}, [x4], x6 // 16 bytes store
+
+end_func_r_c:
+    // LDMFD sp!,{x4-x11,pc} //Reload the registers from SP
+    ldp x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
+
+
+
+
diff --git a/common/armv8/ih264_platform_macros.h b/common/armv8/ih264_platform_macros.h
new file mode 100755
index 0000000..1f67403
--- /dev/null
+++ b/common/armv8/ih264_platform_macros.h
@@ -0,0 +1,152 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_platform_macros.h
+*
+* @brief
+* Platform specific Macro definitions used in the codec
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_PLATFORM_MACROS_H_ /* NOTE(review): guard keeps an IHEVC name and a
+                                     reserved leading-underscore identifier; presumably
+                                     inherited from the HEVC codebase - confirm before renaming */
+#define _IHEVC_PLATFORM_MACROS_H_
+
+#ifndef ARMV8
+/* Non-ARMv8 branch: relies on ARMv7 saturating instructions via inline asm. */
+void ih264_arm_dsb(void);
+
+#define DATA_SYNC() ih264_arm_dsb()
+/* Clamp x to [0, 255] using the ARMv7 'usat' saturate instruction. */
+static __inline WORD32 CLIP_U8(WORD32 x)
+{
+    asm("usat %0, #8, %1" : "=r"(x) : "r"(x));
+    return x;
+}
+
+/* Clamp x to [-128, 127] ('ssat'). */
+static __inline WORD32 CLIP_S8(WORD32 x)
+{
+    asm("ssat %0, #8, %1" : "=r"(x) : "r"(x));
+    return x;
+}
+
+/* Clamp x to [0, 1023]. */
+static __inline WORD32 CLIP_U10(WORD32 x)
+{
+    asm("usat %0, #10, %1" : "=r"(x) : "r"(x));
+    return x;
+}
+
+/* Clamp x to [-512, 511]. */
+static __inline WORD32 CLIP_S10(WORD32 x)
+{
+    asm("ssat %0, #10, %1" : "=r"(x) : "r"(x));
+    return x;
+}
+
+/* Clamp x to [0, 4095]. */
+static __inline WORD32 CLIP_U12(WORD32 x)
+{
+    asm("usat %0, #12, %1" : "=r"(x) : "r"(x));
+    return x;
+}
+
+/* Clamp x to [-2048, 2047]. */
+static __inline WORD32 CLIP_S12(WORD32 x)
+{
+    asm("ssat %0, #12, %1" : "=r"(x) : "r"(x));
+    return x;
+}
+
+/* Clamp x to [0, 65535]. */
+static __inline WORD32 CLIP_U16(WORD32 x)
+{
+    asm("usat %0, #16, %1" : "=r"(x) : "r"(x));
+    return x;
+}
+/* Clamp x to [-32768, 32767]. */
+static __inline WORD32 CLIP_S16(WORD32 x)
+{
+    asm("ssat %0, #16, %1" : "=r"(x) : "r"(x));
+    return x;
+}
+
+
+/* Byte-swap a 32-bit value ('rev'). */
+static __inline UWORD32 ITT_BIG_ENDIAN(UWORD32 x)
+{
+    asm("rev %0, %1" : "=r"(x) : "r"(x));
+    return x;
+}
+#else
+/* ARMv8 branch: plain C equivalents built on the CLIP3 macro. */
+#define DATA_SYNC() ;
+
+#define CLIP_U8(x) CLIP3(0, 255, (x))
+#define CLIP_S8(x) CLIP3(-128, 127, (x))
+
+#define CLIP_U10(x) CLIP3(0, 1023, (x))
+#define CLIP_S10(x) CLIP3(-512, 511, (x))
+
+#define CLIP_U12(x) CLIP3(0, 4095, (x))
+#define CLIP_S12(x) CLIP3(-2048, 2047, (x))
+
+#define CLIP_U16(x) CLIP3(0, 65535, (x))
+#define CLIP_S16(x) CLIP3(-32768, 32767, (x))
+/* Byte-swap a 32-bit value. Fixed: the macro argument is now fully
+ * parenthesized, the operand is cast to UWORD32 before shifting (left-
+ * shifting a negative signed value is undefined), and the stray trailing
+ * semicolon that made the macro unusable inside expressions is removed. */
+#define ITT_BIG_ENDIAN(x) (((((UWORD32)(x)) & 0x000000ff) << 24) | \
+                           ((((UWORD32)(x)) & 0x0000ff00) <<  8) | \
+                           ((((UWORD32)(x)) & 0x00ff0000) >>  8) | \
+                           (((UWORD32)(x)) >> 24))
+#endif
+
+/* Shift helpers that return 0 when the shift count would be >= 32
+ * (a plain C shift by >= width is undefined behavior). */
+#define SHL(x,y) (((y) < 32) ? ((x) << (y)) : 0)
+#define SHR(x,y) (((y) < 32) ? ((x) >> (y)) : 0)
+
+/* Signed-direction shifts: a negative count shifts the other way.
+ * NOTE(review): val/shift are not parenthesized here - callers must pass
+ * simple expressions, or the macros should be parenthesized like SHL/SHR. */
+#define SHR_NEG(val,shift) ((shift>0)?(val>>shift):(val<<(-shift)))
+#define SHL_NEG(val,shift) ((shift<0)?(val>>(-shift)):(val<<shift))
+
+#define INLINE inline
+
+/* Count leading zeros; defined as 32 for a zero input
+ * (__builtin_clz(0) is undefined). */
+static INLINE UWORD32 CLZ(UWORD32 u4_word)
+{
+    if(u4_word)
+        return (__builtin_clz(u4_word));
+    else
+        return 32;
+}
+/* Count trailing zeros.
+ * NOTE(review): returns 31 (not 32) for a zero input, asymmetric with
+ * CLZ(0) == 32 - confirm callers depend on this before changing. */
+static INLINE UWORD32 CTZ(UWORD32 u4_word)
+{
+    if(0 == u4_word)
+        return 31;
+    else
+    {
+        unsigned int index;
+        index = __builtin_ctz(u4_word);
+        return (UWORD32)index;
+    }
+}
+
+
+/* Busy-wait delay loop; the empty loop may be optimized away at -O2. */
+#define NOP(nop_cnt) {UWORD32 nop_i; for (nop_i = 0; nop_i < nop_cnt; nop_i++);}
+
+
+#define MEM_ALIGN8 __attribute__ ((aligned (8)))
+#define MEM_ALIGN16 __attribute__ ((aligned (16)))
+#define MEM_ALIGN32 __attribute__ ((aligned (32)))
+
+#endif /* _IHEVC_PLATFORM_MACROS_H_ */
diff --git a/common/armv8/ih264_resi_trans_quant_av8.s b/common/armv8/ih264_resi_trans_quant_av8.s
new file mode 100755
index 0000000..dc1c680
--- /dev/null
+++ b/common/armv8/ih264_resi_trans_quant_av8.s
@@ -0,0 +1,731 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ih264_resi_trans_quant_av8.c
+//*
+//* @brief
+//* contains function definitions for residual and forward trans
+//*
+//* @author
+//* ittiam
+//*
+//* @par list of functions:
+//* ih264_resi_trans_quant_4x4_av8
+//* ih264_resi_trans_quant_8x8_av8
+//* ih264_resi_trans_quant_chroma_4x4_av8
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+.include "ih264_neon_macros.s"
+.text
+.p2align 2
+//*****************************************************************************
+//*
+//* function name : ih264_resi_trans_quant_4x4
+//* description : this function does cf4 of h264
+//*
+//* arguments : x0 :pointer to src buffer
+// x1 :pointer to pred buffer
+// x2 :pointer to dst buffer
+// x3 :source stride
+// x4 :pred stride
+// x5 :pointer to scaling matrix
+// x6 :pointer to threshold matrix
+// x7 :qbits
+// stack rounding factor,
+// pointer to store nnz
+// pointer to store non quantized dc value
+// (dst is written contiguously; there is no dst stride argument)
+// values returned : none
+//
+// register usage :
+// stack usage : 64 bytes
+// cycles :
+// interruptiaility : interruptable
+//
+// known limitations
+// \assumptions :
+//
+// revision history :
+// dd mm yyyy author(s) changes
+// 1 12 2013 100633 first version
+// 20 1 2014 100633 changes the api, optimization
+//
+//*****************************************************************************
+
+ .global ih264_resi_trans_quant_4x4_av8
+ih264_resi_trans_quant_4x4_av8:
+
+ //register/stack mapping on entry (AArch64 AAPCS):
+ //x0 :pointer to src buffer
+ //x1 :pointer to pred buffer
+ //x2 :pointer to dst buffer
+ //x3 :source stride
+ //x4 :pred stride
+ //x5 :pointer to scale matrix
+ //x6 :pointer to threshold matrix (not referenced by this implementation)
+ //x7 :qbits
+ //stack :round factor
+ //stack :pointer to store nnz
+ //stack :pointer to store non quantized dc value
+ push_v_regs
+ //push_v_regs saves the callee-saved d registers (64 bytes), so the
+ //stack arguments now sit at sp+64 / sp+72 / sp+80
+ //x8 :round factor
+ //x9 :nnz
+ //x10 :pointer to store non quantized dc value
+
+ ldr w8, [sp, #64] //load round factor
+ ldr x10, [sp, #80] //load address for non quant val
+ neg x7, x7 //negate the qbit value for using lsl
+ ldr x9, [sp, #72] //load pointer to store nnz
+
+ //------------function loading done----------------;
+
+ ld1 {v30.8b}, [x0], x3 //load first 8 pix src row 1
+ ld1 {v31.8b}, [x1], x4 //load first 8 pix pred row 1
+ ld1 {v28.8b}, [x0], x3 //load first 8 pix src row 2
+ ld1 {v29.8b}, [x1], x4 //load first 8 pix pred row 2
+ ld1 {v26.8b}, [x0], x3 //load first 8 pix src row 3
+ ld1 {v27.8b}, [x1], x4 //load first 8 pix pred row 3
+ ld1 {v24.8b}, [x0] //load first 8 pix src row 4
+ ld1 {v25.8b}, [x1] //load first 8 pix pred row 4
+
+ //usubl zero-extends both u8 operands, so the 16-bit results hold the
+ //correct signed residues in [-255, 255]
+ usubl v0.8h, v30.8b, v31.8b //find residue row 1
+ usubl v2.8h, v28.8b, v29.8b //find residue row 2
+ usubl v4.8h, v26.8b, v27.8b //find residue row 3
+ usubl v6.8h, v24.8b, v25.8b //find residue row 4
+
+ //4x4 transpose (trn on halfwords then on words)
+ trn1 v1.4h, v0.4h, v2.4h
+ trn2 v3.4h, v0.4h, v2.4h //t12
+ trn1 v5.4h, v4.4h, v6.4h
+ trn2 v7.4h, v4.4h, v6.4h //t23
+
+ trn1 v0.2s, v1.2s, v5.2s
+ trn2 v4.2s, v1.2s, v5.2s //t13
+ trn1 v2.2s, v3.2s, v7.2s
+ trn2 v6.2s, v3.2s, v7.2s //t14
+
+ //horizontal forward transform (H.264 4x4 core transform butterfly)
+ add v8.4h, v0.4h, v6.4h //x0 = x4+x7
+ add v9.4h, v2.4h, v4.4h //x1 = x5+x6
+ sub v10.4h, v2.4h, v4.4h //x2 = x5-x6
+ sub v11.4h, v0.4h, v6.4h //x3 = x4-x7
+
+ shl v12.4h, v10.4h, #1 //u_shift(x2,1,shft)
+ shl v13.4h, v11.4h, #1 //u_shift(x3,1,shft)
+
+ add v14.4h, v8.4h, v9.4h //x4 = x0 + x1;
+ sub v16.4h, v8.4h, v9.4h //x6 = x0 - x1;
+ add v15.4h, v13.4h, v10.4h //x5 = u_shift(x3,1,shft) + x2;
+ sub v17.4h, v11.4h, v12.4h //x7 = x3 - u_shift(x2,1,shft);
+
+ //taking transpose again so as to make do vert transform
+ trn1 v0.4h, v14.4h, v15.4h
+ trn2 v1.4h, v14.4h, v15.4h //t12
+ trn1 v2.4h, v16.4h, v17.4h
+ trn2 v3.4h, v16.4h, v17.4h //t23
+
+ trn1 v14.2s, v0.2s, v2.2s
+ trn2 v16.2s, v0.2s, v2.2s //t13
+ trn1 v15.2s, v1.2s, v3.2s
+ trn2 v17.2s, v1.2s, v3.2s //t24
+
+ //let us do vertical transform
+ //same code as horiz
+ add v18.4h, v14.4h , v17.4h //x0 = x4+x7
+ add v19.4h, v15.4h , v16.4h //x1 = x5+x6
+ sub v20.4h, v15.4h , v16.4h //x2 = x5-x6
+ sub v21.4h, v14.4h , v17.4h //x3 = x4-x7
+
+ shl v22.4h, v20.4h, #1 //u_shift(x2,1,shft)
+ shl v23.4h, v21.4h, #1 //u_shift(x3,1,shft)
+
+ dup v8.4s, w8 //load rounding value row 1 (NOTE(review): v8 is never read below -- v23 holds the round factor; confirm this is dead code)
+
+ add v24.4h, v18.4h , v19.4h //x5 = x0 + x1;
+ sub v26.4h, v18.4h , v19.4h //x7 = x0 - x1;
+ add v25.4h, v23.4h , v20.4h //x6 = u_shift(x3,1,shft) + x2;
+ sub v27.4h, v21.4h , v22.4h //x8 = x3 - u_shift(x2,1,shft);
+
+ dup v23.4s, w8 //load round factor values
+
+ st1 {v24.h}[0], [x10] //store the unquantized dc value to alternate dc address
+//core transform is done for the 4x4 block
+ ld1 {v28.4h-v31.4h}, [x5] //load the 16 scaling values
+
+ //quantize |coeff|: (|c| * scale + round) >> qbits, sign restored after
+ abs v0.4h, v24.4h //abs val of row 1
+ abs v1.4h, v25.4h //abs val of row 2
+ abs v2.4h, v26.4h //abs val of row 3
+ abs v3.4h, v27.4h //abs val of row 4
+
+ //sign masks: lanes are all-ones where the coefficient is > 0
+ cmgt v4.4h, v24.4h, #0
+ cmgt v5.4h, v25.4h, #0
+ cmgt v6.4h, v26.4h, #0
+ cmgt v7.4h, v27.4h, #0
+
+ smull v0.4s, v0.4h, v28.4h //multiply by scale row 1
+ smull v1.4s, v1.4h, v29.4h //multiply by scale row 2
+ smull v2.4s, v2.4h, v30.4h //multiply by scale row 3
+ smull v3.4s, v3.4h, v31.4h //multiply by scale row 4
+
+ add v20.4s, v0.4s, v23.4s //add rounding factor row 1
+ add v21.4s, v1.4s, v23.4s //add rounding factor row 2
+ add v22.4s, v2.4s, v23.4s //add rounding factor row 3
+ add v23.4s, v3.4s, v23.4s //add rounding factor row 4
+
+ dup v24.4s, w7 //w7 holds -qbits, so sshl performs a right shift
+
+ sshl v20.4s, v20.4s, v24.4s //shift row 1
+ sshl v21.4s, v21.4s, v24.4s //shift row 2
+ sshl v22.4s, v22.4s, v24.4s //shift row 3
+ sshl v23.4s, v23.4s, v24.4s //shift row 4
+
+ xtn v20.4h, v20.4s //narrow row 1
+ xtn v21.4h, v21.4s //narrow row 2
+ xtn v22.4h, v22.4s //narrow row 3
+ xtn v23.4h, v23.4s //narrow row 4
+
+ neg v24.8h, v20.8h //get negative
+ neg v25.8h, v21.8h //get negative
+ neg v26.8h, v22.8h //get negative
+ neg v27.8h, v23.8h //get negative
+
+ //compare with zero for computing nnz
+ cmeq v0.4h, v20.4h, #0
+ cmeq v1.4h, v21.4h, #0
+ cmeq v2.4h, v22.4h, #0
+ cmeq v3.4h, v23.4h, #0
+
+ //select +level where coeff > 0, else -level (zero stays zero)
+ bsl v4.8b, v20.8b, v24.8b //restore sign of row 1
+ bsl v5.8b, v21.8b, v25.8b //restore sign of row 2
+ bsl v6.8b, v22.8b, v26.8b //restore sign of row 3
+ bsl v7.8b, v23.8b, v27.8b //restore sign of row 4
+
+ //narrow the comparison result
+ mov v0.d[1], v2.d[0]
+ mov v1.d[1], v3.d[0]
+
+ xtn v0.8b, v0.8h
+ xtn v1.8b, v1.8h
+
+ ushr v0.8b, v0.8b, #7 //reduce each comparison lane to a single bit (1 per zero coefficient)
+ ushr v1.8b, v1.8b, #7 //reduce each comparison lane to a single bit (1 per zero coefficient)
+
+ //horizontal sum of the 16 zero flags into lane 0
+ add v0.8b, v0.8b, v1.8b //i pair add nnz 1
+ addp v0.8b, v0.8b, v0.8b //i pair add nnz 1
+ addp v0.8b, v0.8b, v0.8b //i pair add nnz 1
+ addp v0.8b, v0.8b, v0.8b //i pair add nnz 1
+
+ st1 {v4.4h-v7.4h}, [x2] //store blk
+
+ movi v25.8b, #16 //get max nnz
+ sub v26.8b, v25.8b , v0.8b //nnz = 16 - zero count
+ st1 {v26.b}[0], [x9] //write nnz
+
+ pop_v_regs
+ ret
+
+
+//*****************************************************************************
+//*
+//* function name : ih264_resi_trans_quant_chroma_4x4
+//* description : this function does residue calculation, forward transform
+//* and quantization for 4x4 chroma block.
+//*
+//* arguments : x0 :pointer to src buffer
+// x1 :pointer to pred buffer
+// x2 :pointer to dst buffer
+// x3 :source stride
+// x4 :pred stride
+// x5 :pointer to scaling matrix
+// x6 :pointer to threshold matrix
+// x7 :qbits
+// stack rounding factor,
+// pointer to store nnz
+// pointer to store unquantized dc values
+// (dst is written contiguously; there is no dst stride argument)
+// values returned : none
+//
+// register usage :
+// stack usage : 64 bytes
+// cycles :
+// interruptiaility : interruptable
+//
+// known limitations
+// \assumptions :
+//
+// revision history :
+// dd mm yyyy author(s) changes
+// 11 2 2015 100664 first version
+// 25 2 2015 100633 first av8 version
+//*****************************************************************************
+
+ .global ih264_resi_trans_quant_chroma_4x4_av8
+ih264_resi_trans_quant_chroma_4x4_av8:
+
+ //register/stack mapping on entry (AArch64 AAPCS):
+ //x0 :pointer to src buffer (interleaved chroma)
+ //x1 :pointer to pred buffer (interleaved chroma)
+ //x2 :pointer to dst buffer
+ //x3 :source stride
+ //x4 :pred stride
+ //x5 :pointer to scale matrix
+ //x6 :pointer to threshold matrix (not referenced by this implementation)
+ //x7 :qbits
+ //stack :round factor
+ //stack :pointer to store nnz
+ //stack :pointer to store non quantized dc value
+ push_v_regs
+ //push_v_regs saves the callee-saved d registers (64 bytes), so the
+ //stack arguments now sit at sp+64 / sp+72 / sp+80
+ //x8 :round factor
+ //x9 :nnz
+ //x10 :pointer to store non quantized dc value
+
+ ldr w8, [sp, #64] //load round factor
+ ldr x10, [sp, #80] //load address for non quant val
+ neg x7, x7 //negate the qbit value for using lsl
+ ldr x9, [sp, #72] //load pointer to store nnz
+ //------------function loading done----------------;
+
+ ld1 {v30.8b}, [x0], x3 //load first 8 pix src row 1
+ ld1 {v31.8b}, [x1], x4 //load first 8 pix pred row 1
+ ld1 {v28.8b}, [x0], x3 //load first 8 pix src row 2
+ ld1 {v29.8b}, [x1], x4 //load first 8 pix pred row 2
+ ld1 {v26.8b}, [x0], x3 //load first 8 pix src row 3
+ ld1 {v27.8b}, [x1], x4 //load first 8 pix pred row 3
+ ld1 {v24.8b}, [x0] //load first 8 pix src row 4
+ ld1 {v25.8b}, [x1] //load first 8 pix pred row 4
+
+
+ //deinterleave the loaded values: uzp1 keeps the even (one plane's)
+ //bytes in the low half of each register
+ uzp1 v30.8b, v30.8b, v30.8b
+ uzp1 v31.8b, v31.8b, v31.8b
+ uzp1 v28.8b, v28.8b, v28.8b
+ uzp1 v29.8b, v29.8b, v29.8b
+ uzp1 v26.8b, v26.8b, v26.8b
+ uzp1 v27.8b, v27.8b, v27.8b
+ uzp1 v24.8b, v24.8b, v24.8b
+ uzp1 v25.8b, v25.8b, v25.8b
+ //this deinterleaving is the only difference between the chroma and luma functions
+
+ //usubl zero-extends both u8 operands, so the 16-bit results hold the
+ //correct signed residues in [-255, 255]
+ usubl v0.8h, v30.8b, v31.8b //find residue row 1
+ usubl v2.8h, v28.8b, v29.8b //find residue row 2
+ usubl v4.8h, v26.8b, v27.8b //find residue row 3
+ usubl v6.8h, v24.8b, v25.8b //find residue row 4
+
+ //4x4 transpose (trn on halfwords then on words)
+ trn1 v1.4h, v0.4h, v2.4h
+ trn2 v3.4h, v0.4h, v2.4h //t12
+ trn1 v5.4h, v4.4h, v6.4h
+ trn2 v7.4h, v4.4h, v6.4h //t23
+
+ trn1 v0.2s, v1.2s, v5.2s
+ trn2 v4.2s, v1.2s, v5.2s //t13
+ trn1 v2.2s, v3.2s, v7.2s
+ trn2 v6.2s, v3.2s, v7.2s //t14
+
+ //horizontal forward transform (H.264 4x4 core transform butterfly)
+ add v8.4h, v0.4h, v6.4h //x0 = x4+x7
+ add v9.4h, v2.4h, v4.4h //x1 = x5+x6
+ sub v10.4h, v2.4h, v4.4h //x2 = x5-x6
+ sub v11.4h, v0.4h, v6.4h //x3 = x4-x7
+
+ shl v12.4h, v10.4h, #1 //u_shift(x2,1,shft)
+ shl v13.4h, v11.4h, #1 //u_shift(x3,1,shft)
+
+ add v14.4h, v8.4h, v9.4h //x4 = x0 + x1;
+ sub v16.4h, v8.4h, v9.4h //x6 = x0 - x1;
+ add v15.4h, v13.4h, v10.4h //x5 = u_shift(x3,1,shft) + x2;
+ sub v17.4h, v11.4h, v12.4h //x7 = x3 - u_shift(x2,1,shft);
+
+ //taking transpose again so as to make do vert transform
+ trn1 v0.4h, v14.4h, v15.4h
+ trn2 v1.4h, v14.4h, v15.4h //t12
+ trn1 v2.4h, v16.4h, v17.4h
+ trn2 v3.4h, v16.4h, v17.4h //t23
+
+ trn1 v14.2s, v0.2s, v2.2s
+ trn2 v16.2s, v0.2s, v2.2s //t13
+ trn1 v15.2s, v1.2s, v3.2s
+ trn2 v17.2s, v1.2s, v3.2s //t24
+
+ //let us do vertical transform
+ //same code as horiz
+ add v18.4h, v14.4h , v17.4h //x0 = x4+x7
+ add v19.4h, v15.4h , v16.4h //x1 = x5+x6
+ sub v20.4h, v15.4h , v16.4h //x2 = x5-x6
+ sub v21.4h, v14.4h , v17.4h //x3 = x4-x7
+
+ shl v22.4h, v20.4h, #1 //u_shift(x2,1,shft)
+ shl v23.4h, v21.4h, #1 //u_shift(x3,1,shft)
+
+ dup v8.4s, w8 //load rounding value row 1 (NOTE(review): v8 is never read below -- v23 holds the round factor; confirm this is dead code)
+
+ add v24.4h, v18.4h , v19.4h //x5 = x0 + x1;
+ sub v26.4h, v18.4h , v19.4h //x7 = x0 - x1;
+ add v25.4h, v23.4h , v20.4h //x6 = u_shift(x3,1,shft) + x2;
+ sub v27.4h, v21.4h , v22.4h //x8 = x3 - u_shift(x2,1,shft);
+
+ dup v23.4s, w8 //load round factor values
+
+ st1 {v24.h}[0], [x10] //store the unquantized dc value to alternate dc address
+//core transform is done for the 4x4 block
+ ld1 {v28.4h-v31.4h}, [x5] //load the 16 scaling values
+
+ //quantize |coeff|: (|c| * scale + round) >> qbits, sign restored after
+ abs v0.4h, v24.4h //abs val of row 1
+ abs v1.4h, v25.4h //abs val of row 2
+ abs v2.4h, v26.4h //abs val of row 3
+ abs v3.4h, v27.4h //abs val of row 4
+
+ //sign masks: lanes are all-ones where the coefficient is > 0
+ cmgt v4.4h, v24.4h, #0
+ cmgt v5.4h, v25.4h, #0
+ cmgt v6.4h, v26.4h, #0
+ cmgt v7.4h, v27.4h, #0
+
+ smull v0.4s, v0.4h, v28.4h //multiply by scale row 1
+ smull v1.4s, v1.4h, v29.4h //multiply by scale row 2
+ smull v2.4s, v2.4h, v30.4h //multiply by scale row 3
+ smull v3.4s, v3.4h, v31.4h //multiply by scale row 4
+
+ add v20.4s, v0.4s, v23.4s //add rounding factor row 1
+ add v21.4s, v1.4s, v23.4s //add rounding factor row 2
+ add v22.4s, v2.4s, v23.4s //add rounding factor row 3
+ add v23.4s, v3.4s, v23.4s //add rounding factor row 4
+
+ dup v24.4s, w7 //w7 holds -qbits, so sshl performs a right shift
+
+ sshl v20.4s, v20.4s, v24.4s //shift row 1
+ sshl v21.4s, v21.4s, v24.4s //shift row 2
+ sshl v22.4s, v22.4s, v24.4s //shift row 3
+ sshl v23.4s, v23.4s, v24.4s //shift row 4
+
+ xtn v20.4h, v20.4s //narrow row 1
+ xtn v21.4h, v21.4s //narrow row 2
+ xtn v22.4h, v22.4s //narrow row 3
+ xtn v23.4h, v23.4s //narrow row 4
+
+ neg v24.8h, v20.8h //get negative
+ neg v25.8h, v21.8h //get negative
+ neg v26.8h, v22.8h //get negative
+ neg v27.8h, v23.8h //get negative
+
+ //compare with zero for computing nnz
+ cmeq v0.4h, v20.4h, #0
+ cmeq v1.4h, v21.4h, #0
+ cmeq v2.4h, v22.4h, #0
+ cmeq v3.4h, v23.4h, #0
+
+ //select +level where coeff > 0, else -level (zero stays zero)
+ bsl v4.8b, v20.8b, v24.8b //restore sign of row 1
+ bsl v5.8b, v21.8b, v25.8b //restore sign of row 2
+ bsl v6.8b, v22.8b, v26.8b //restore sign of row 3
+ bsl v7.8b, v23.8b, v27.8b //restore sign of row 4
+
+ //narrow the comparison result
+ mov v0.d[1], v2.d[0]
+ mov v1.d[1], v3.d[0]
+
+ xtn v0.8b, v0.8h
+ xtn v1.8b, v1.8h
+
+ ushr v0.8b, v0.8b, #7 //reduce each comparison lane to a single bit (1 per zero coefficient)
+ ushr v1.8b, v1.8b, #7 //reduce each comparison lane to a single bit (1 per zero coefficient)
+
+ //horizontal sum of the 16 zero flags into lane 0
+ add v0.8b, v0.8b, v1.8b //i pair add nnz 1
+ addp v0.8b, v0.8b, v0.8b //i pair add nnz 1
+ addp v0.8b, v0.8b, v0.8b //i pair add nnz 1
+ addp v0.8b, v0.8b, v0.8b //i pair add nnz 1
+
+ st1 {v4.4h-v7.4h}, [x2] //store blk
+
+ movi v25.8b, #16 //get max nnz
+ sub v26.8b, v25.8b , v0.8b //nnz = 16 - zero count
+ st1 {v26.b}[0], [x9] //write nnz
+
+ pop_v_regs
+ ret
+
+
+//*****************************************************************************
+//*
+//* function name : ih264_hadamard_quant_4x4_av8
+//* description : this function does forward hadamard transform and
+//* quantization for luma dc block
+//*
+//* arguments : x0 :pointer to src buffer
+// x1 :pointer to dst buffer
+// x2 :pu2_scale_matrix
+// x3 :pu2_threshold_matrix
+// x4 :u4_qbits
+// x5 :u4_round_factor
+// x6 :pu1_nnz
+// values returned : none
+//
+// register usage :
+// stack usage : 0 bytes
+// cycles : around
+// interruptiaility : interruptable
+//
+// known limitations
+// \assumptions :
+//
+// revision history :
+// dd mm yyyy author(s) changes
+// 20 2 2015 100633 first version
+//
+//*****************************************************************************
+//ih264_hadamard_quant_4x4_av8(word16 *pi2_src, word16 *pi2_dst,
+// const uword16 *pu2_scale_matrix,
+// const uword16 *pu2_threshold_matrix, uword32 u4_qbits,
+// uword32 u4_round_factor,uword8 *pu1_nnz
+// )
+ .global ih264_hadamard_quant_4x4_av8
+ih264_hadamard_quant_4x4_av8:
+
+//x0 :pointer to src buffer
+//x1 :pointer to dst buffer
+//x2 :pu2_scale_matrix
+//x3 :pu2_threshold_matrix (not referenced by this implementation)
+//x4 :u4_qbits
+//x5 :u4_round_factor
+//x6 :pu1_nnz
+
+ push_v_regs
+
+ ld4 {v0.4h-v3.4h}, [x0] //load 4x4 block; ld4 de-interleaves, so v0..v3 each hold one column
+ ld1 {v30.h}[0], [x2] //load pu2_scale_matrix[0]
+
+ //first (column) hadamard butterfly, widened to 32 bits
+ saddl v4.4s, v0.4h, v3.4h //x0 = x4 + x7;
+ saddl v5.4s, v1.4h, v2.4h //x1 = x5 + x6;
+ ssubl v6.4s, v1.4h, v2.4h //x2 = x5 - x6;
+ ssubl v7.4s, v0.4h, v3.4h //x3 = x4 - x7;
+
+ dup v30.8h, v30.h[0] //broadcast pu2_scale_matrix[0] to all lanes
+
+ add v14.4s, v4.4s, v5.4s //pi2_dst[0] = x0 + x1;
+ add v15.4s, v7.4s, v6.4s //pi2_dst[1] = x3 + x2;
+ sub v16.4s, v4.4s, v5.4s //pi2_dst[2] = x0 - x1;
+ sub v17.4s, v7.4s, v6.4s //pi2_dst[3] = x3 - x2;
+
+ //transpose 4x4 block
+ trn1 v18.4s, v14.4s, v15.4s
+ trn2 v19.4s, v14.4s, v15.4s
+ trn1 v20.4s, v16.4s, v17.4s
+ trn2 v21.4s, v16.4s, v17.4s
+
+ trn1 v14.2d, v18.2d, v20.2d
+ trn2 v16.2d, v18.2d, v20.2d
+ trn1 v15.2d, v19.2d, v21.2d
+ trn2 v17.2d, v19.2d, v21.2d
+ //end transpose
+
+ //second (row) hadamard butterfly
+ add v18.4s, v14.4s, v17.4s //x0 = x4 + x7;
+ add v19.4s, v15.4s, v16.4s //x1 = x5 + x6;
+ sub v20.4s, v15.4s, v16.4s //x2 = x5 - x6;
+ sub v21.4s, v14.4s, v17.4s //x3 = x4 - x7;
+
+ dup v14.4s, w5 //round factor, replicated into the four accumulators
+ dup v15.4s, v14.s[0]
+ dup v16.4s, v14.s[0]
+ dup v17.4s, v14.s[0]
+
+ add v22.4s, v18.4s, v19.4s //(x0 + x1)
+ add v23.4s, v21.4s, v20.4s //(x3 + x2)
+ sub v24.4s, v18.4s, v19.4s //(x0 - x1)
+ sub v25.4s, v21.4s, v20.4s //(x3 - x2)
+
+ //hadamard output is halved and narrowed back to 16 bits
+ shrn v0.4h, v22.4s, #1 //i4_value = (x0 + x1) >> 1;
+ shrn2 v0.8h, v23.4s, #1 //i4_value = (x3 + x2) >> 1;
+ shrn v1.4h, v24.4s, #1 //i4_value = (x0 - x1) >> 1;
+ shrn2 v1.8h, v25.4s, #1 //i4_value = (x3 - x2) >> 1;
+
+ //quantize |coeff|: (|c| * scale + round) >> qbits, sign restored after
+ abs v2.8h, v0.8h
+ abs v3.8h, v1.8h
+
+ cmgt v4.8h, v0.8h, #0 //get the sign row 1,2 (all-ones where coeff > 0)
+ cmgt v5.8h, v1.8h, #0
+
+ neg w4, w4 //-u4_qbits, so ushl performs a right shift
+ dup v22.4s, w4 //load -u4_qbits
+
+ umlal v14.4s, v2.4h, v30.4h //|c| * scale + round
+ umlal2 v15.4s, v2.8h, v30.8h
+ umlal v16.4s, v3.4h, v30.4h
+ umlal2 v17.4s, v3.8h, v30.8h
+
+ ushl v14.4s, v14.4s, v22.4s //>> qbits
+ ushl v15.4s, v15.4s, v22.4s
+ ushl v16.4s, v16.4s, v22.4s
+ ushl v17.4s, v17.4s, v22.4s
+
+ uqxtn v14.4h, v14.4s //saturating narrow back to 16 bits
+ uqxtn2 v14.8h, v15.4s
+ uqxtn v16.4h, v16.4s
+ uqxtn2 v16.8h, v17.4s
+
+ neg v15.8h, v14.8h
+ neg v17.8h, v16.8h
+
+ //select +level where coeff > 0, else -level (zero stays zero)
+ bsl v4.16b, v14.16b, v15.16b
+ bsl v5.16b, v16.16b, v17.16b
+
+ cmeq v0.8h, v14.8h, #0 //per-lane zero flags for nnz
+ cmeq v1.8h, v16.8h, #0
+
+ st1 {v4.8h-v5.8h}, [x1]
+
+ movi v20.8b, #16 //max nnz
+
+ xtn v2.8b, v0.8h
+ xtn v3.8b, v1.8h
+
+ ushr v2.8b, v2.8b, #7 //reduce each zero flag to a single bit
+ ushr v3.8b, v3.8b, #7
+
+ //horizontal sum of the 16 zero flags into lane 0
+ add v2.8b, v2.8b, v3.8b
+ addp v2.8b, v2.8b, v2.8b
+ addp v2.8b, v2.8b, v2.8b
+ addp v2.8b, v2.8b, v2.8b
+ sub v20.8b, v20.8b, v2.8b //nnz = 16 - zero count
+ st1 {v20.b}[0], [x6]
+
+ pop_v_regs
+ ret
+
+
+//*****************************************************************************
+//*
+//* function name : ih264_hadamard_quant_2x2_uv
+//* description : this function does forward hadamard transform and
+//* quantization for dc block of chroma for both planes
+//*
+//* arguments : x0 :pointer to src buffer
+// x1 :pointer to dst buffer
+// x2 :pu2_scale_matrix
+// x3 :pu2_threshold_matrix
+// x4 :u4_qbits
+// x5 :u4_round_factor
+// x6 :pu1_nnz
+// values returned : none
+//
+// register usage :
+// stack usage : 0 bytes
+// cycles : around
+// interruptiaility : interruptable
+//
+// known limitations
+// \assumptions :
+//
+// revision history :
+// dd mm yyyy author(s) changes
+// 20 2 2015 100633 first version
+//
+//*****************************************************************************
+// ih264_hadamard_quant_2x2_uv_av8(word16 *pi2_src, word16 *pi2_dst,
+// const uword16 *pu2_scale_matrix,
+// const uword16 *pu2_threshold_matrix, uword32 u4_qbits,
+// uword32 u4_round_factor,uword8 *pu1_nnz
+// )
+
+ .global ih264_hadamard_quant_2x2_uv_av8
+ih264_hadamard_quant_2x2_uv_av8:
+
+//Performs the 2x2 hadamard transform and quantization of the chroma dc
+//coefficients for both planes at once.
+//x0 :pi2_src (2x2 dc coefficients of both chroma planes)
+//x1 :pi2_dst
+//x2 :pu2_scale_matrix (only element [0] is used)
+//x3 :pu2_threshold_matrix (not referenced by this implementation)
+//x4 :u4_qbits
+//x5 :u4_round_factor
+//x6 :pu1_nnz (one byte written per plane)
+
+ push_v_regs
+
+ ld2 {v0.4h-v1.4h}, [x0] //load src (ld2 de-interleaves the coefficient pairs)
+
+ ld1 {v30.h}[0], [x2] //load pu2_scale_matrix[0]
+ dup v30.4h, v30.4h[0] //pu2_scale_matrix
+ uxtl v30.4s, v30.4h //pu2_scale_matrix widened to 32 bits
+
+ neg w4, w4 //-u4_qbits, so ushl performs a right shift
+ dup v24.4s, w4 //-u4_qbits
+
+ dup v25.4s, w5 //round factor
+ dup v26.4s, v25.s[0]
+
+ //2x2 hadamard butterfly for both planes
+ saddl v2.4s, v0.4h, v1.4h //x0 = x4 + x5;, x2 = x6 + x7;
+ ssubl v3.4s, v0.4h, v1.4h //x1 = x4 - x5; x3 = x6 - x7;
+
+ trn1 v4.4s, v2.4s, v3.4s
+ trn2 v5.4s, v2.4s, v3.4s //q1 -> x0 x1, q2 -> x2 x3
+
+ add v0.4s, v4.4s , v5.4s // (x0 + x2) (x1 + x3) (y0 + y2); (y1 + y3);
+ sub v1.4s, v4.4s , v5.4s // (x0 - x2) (x1 - x3) (y0 - y2); (y1 - y3);
+
+ //quantize |coeff|: (|c| * scale + round) >> qbits, sign restored after
+ abs v2.4s, v0.4s
+ abs v3.4s, v1.4s
+
+ cmgt v4.4s, v0.4s, #0 //get the sign row 1,2 (all-ones where coeff > 0)
+ cmgt v5.4s, v1.4s, #0
+
+ uqxtn v4.4h, v4.4s //narrow the sign masks to 16 bits
+ sqxtn2 v4.8h, v5.4s
+
+ mla v25.4s, v2.4s, v30.4s //|c| * scale + round
+ mla v26.4s, v3.4s, v30.4s
+
+ ushl v2.4s, v25.4s, v24.4s //>>qbit
+ ushl v3.4s, v26.4s, v24.4s //>>qbit
+
+ uqxtn v2.4h, v2.4s //saturating narrow back to 16 bits
+ uqxtn2 v2.8h, v3.4s
+
+ neg v5.8h, v2.8h
+
+ bsl v4.16b, v2.16b, v5.16b //select +level where coeff > 0, else -level
+
+ //rearrange such that we get each plane's coeffs as continuous lanes
+ mov v5.s[0], v4.s[1]
+ mov v4.s[1], v4.s[2]
+ mov v4.s[2], v5.s[0]
+
+ cmeq v5.8h, v4.8h, #0 //compute nnz
+ xtn v5.8b, v5.8h //reduce nnz comparison to 1 bit
+ ushr v5.8b, v5.8b, #7 //reduce nnz comparison to 1 bit
+ movi v20.8b, #4 //since we count zeros, we need to subtract from 4 to get nnz
+ addp v5.8b, v5.8b, v5.8b //sum up nnz; after two pairwise adds byte0/byte1 hold each plane's zero count
+ addp v5.8b, v5.8b, v5.8b //sum up nnz
+
+ st1 {v4.8h}, [x1] //store the block (the original stored it twice; the second identical store was redundant and has been removed)
+ sub v20.8b, v20.8b, v5.8b //4 - numzeros
+
+ st1 {v20.h}[0], [x6] //store nnz for both planes
+
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/armv8/ih264_weighted_bi_pred_av8.s b/common/armv8/ih264_weighted_bi_pred_av8.s
new file mode 100755
index 0000000..f7d0846
--- /dev/null
+++ b/common/armv8/ih264_weighted_bi_pred_av8.s
@@ -0,0 +1,574 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+//******************************************************************************
+//* @file
+//* ih264_weighted_bi_pred_av8.s
+//*
+//* @brief
+//* Contains function definitions for weighted biprediction.
+//* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT
+//*
+//* @author
+//* Kaushik Senthoor R
+//*
+//* @par List of Functions:
+//*
+//* - ih264_weighted_bi_pred_luma_av8()
+//* - ih264_weighted_bi_pred_chroma_av8()
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+//*******************************************************************************
+//* @function
+//* ih264_weighted_bi_pred_luma_av8()
+//*
+//* @brief
+//* This routine performs the default weighted prediction as described in sec
+//* 8.4.2.3.2 titled "Weighted sample prediction process" for luma.
+//*
+//* @par Description:
+//* This function gets two ht x wd blocks, calculates the weighted samples,
+//* rounds off, adds offset and stores it in the destination block.
+//*
+//* @param[in] puc_src1
+//* UWORD8 Pointer to the buffer containing the input block 1.
+//*
+//* @param[in] puc_src2
+//* UWORD8 Pointer to the buffer containing the input block 2.
+//*
+//* @param[out] puc_dst
+//* UWORD8 pointer to the destination where the output block is stored.
+//*
+//* @param[in] src_strd1
+//* Stride of the input buffer 1
+//*
+//* @param[in] src_strd2
+//* Stride of the input buffer 2
+//*
+//* @param[in] dst_strd
+//* Stride of the destination buffer
+//*
+//* @param[in] log_WD
+//* number of bits to be rounded off
+//*
+//* @param[in] wt1
+//* weight for the weighted prediction
+//*
+//* @param[in] wt2
+//* weight for the weighted prediction
+//*
+//* @param[in] ofst1
+//* offset 1 used after rounding off
+//*
+//* @param[in] ofst2
+//* offset 2 used after rounding off
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @returns
+//* None
+//*
+//* @remarks
+//* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
+//*
+//*******************************************************************************
+//*/
+//void ih264_weighted_bi_pred_luma_av8(UWORD8 *puc_src1,
+// UWORD8 *puc_src2,
+// UWORD8 *puc_dst,
+// WORD32 src_strd1,
+// WORD32 src_strd2,
+// WORD32 dst_strd,
+// UWORD16 log_WD,
+// UWORD32 wt1,
+// UWORD32 wt2,
+// UWORD16 ofst1,
+// UWORD16 ofst2,
+// UWORD8 ht,
+// UWORD8 wd)
+//
+//**************Variables Vs Registers*****************************************
+// x0 => puc_src1
+// x1 => puc_src2
+// x2 => puc_dst
+// x3 => src_strd1
+// x4 => src_strd2
+// x5 => dst_strd
+// x6 => log_WD
+// x7 => wt1
+// [sp] => wt2 (loaded into x8)
+// [sp+8] => ofst1 (loaded into x9)
+// [sp+16] => ofst2 (loaded into x10)
+// [sp+24] => ht (loaded into x11)
+// [sp+32] => wd (loaded into x12)
+//
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+
+
+ .global ih264_weighted_bi_pred_luma_av8
+
+ih264_weighted_bi_pred_luma_av8:
+
+ // STMFD sp!, {x4-x12,x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+ ldr x8, [sp, #80] //Load wt2 in x8
+ ldr x9, [sp, #88] //Load ofst1 in x9
+ add x6, x6, #1 //x6 = log_WD + 1
+ sub x20, x6, #0 //x13 = -(log_WD + 1)
+ neg x10, x20
+ dup v0.8h, w10 //Q0 = -(log_WD + 1) (32-bit)
+ ldr x10, [sp, #96] //Load ofst2 in x10
+ ldr x11, [sp, #104] //Load ht in x11
+ ldr x12, [sp, #112] //Load wd in x12
+ add x9, x9, #1 //x9 = ofst1 + 1
+ add x9, x9, x10 //x9 = ofst1 + ofst2 + 1
+ mov v2.s[0], w7
+ mov v2.s[1], w8 //D2 = {wt1(32-bit), wt2(32-bit)}
+ asr x9, x9, #1 //x9 = ofst = (ofst1 + ofst2 + 1) >> 1
+ dup v3.8b, w9 //D3 = ofst (8-bit)
+ cmp w12, #16
+ beq loop_16 //branch if wd is 16
+ cmp w12, #8 //check if wd is 8
+ beq loop_8 //branch if wd is 8
+
+loop_4: //each iteration processes four rows
+
+ ld1 {v4.s}[0], [x0], x3 //load row 1 in source 1
+ ld1 {v4.s}[1], [x0], x3 //load row 2 in source 1
+ ld1 {v6.s}[0], [x1], x4 //load row 1 in source 2
+ ld1 {v6.s}[1], [x1], x4 //load row 2 in source 2
+ uxtl v4.8h, v4.8b //converting rows 1,2 in source 1 to 16-bit
+ ld1 {v8.s}[0], [x0], x3 //load row 3 in source 1
+ ld1 {v8.s}[1], [x0], x3 //load row 4 in source 1
+ uxtl v6.8h, v6.8b //converting rows 1,2 in source 2 to 16-bit
+ ld1 {v10.s}[0], [x1], x4 //load row 3 in source 2
+ ld1 {v10.s}[1], [x1], x4 //load row 4 in source 2
+ uxtl v8.8h, v8.8b //converting rows 3,4 in source 1 to 16-bit
+ uxtl v10.8h, v10.8b //converting rows 3,4 in source 2 to 16-bit
+ mul v4.8h, v4.8h , v2.4h[0] //weight 1 mult. for rows 1,2
+ mla v4.8h, v6.8h , v2.4h[2] //weight 2 mult. for rows 1,2
+ mul v8.8h, v8.8h , v2.4h[0] //weight 1 mult. for rows 3,4
+ mla v8.8h, v10.8h , v2.4h[2] //weight 2 mult. for rows 3,4
+ subs w11, w11, #4 //decrement ht by 4
+ srshl v4.8h, v4.8h , v0.8h //rounds off the weighted samples from rows 1,2
+ srshl v8.8h, v8.8h , v0.8h //rounds off the weighted samples from rows 3,4
+ saddw v4.8h, v4.8h , v3.8b //adding offset for rows 1,2
+ saddw v8.8h, v8.8h , v3.8b //adding offset for rows 3,4
+ sqxtun v4.8b, v4.8h //saturating rows 1,2 to unsigned 8-bit
+ sqxtun v8.8b, v8.8h //saturating rows 3,4 to unsigned 8-bit
+ st1 {v4.s}[0], [x2], x5 //store row 1 in destination
+ st1 {v4.s}[1], [x2], x5 //store row 2 in destination
+ st1 {v8.s}[0], [x2], x5 //store row 3 in destination
+ st1 {v8.s}[1], [x2], x5 //store row 4 in destination
+ bgt loop_4 //if greater than 0 repeat the loop again
+ b end_loops
+
+loop_8: //each iteration processes four rows
+
+ ld1 {v4.8b}, [x0], x3 //load row 1 in source 1
+ ld1 {v6.8b}, [x1], x4 //load row 1 in source 2
+ ld1 {v8.8b}, [x0], x3 //load row 2 in source 1
+ ld1 {v10.8b}, [x1], x4 //load row 2 in source 2
+ uxtl v4.8h, v4.8b //converting row 1 in source 1 to 16-bit
+ ld1 {v12.8b}, [x0], x3 //load row 3 in source 1
+ ld1 {v14.8b}, [x1], x4 //load row 3 in source 2
+ uxtl v6.8h, v6.8b //converting row 1 in source 2 to 16-bit
+ ld1 {v16.8b}, [x0], x3 //load row 4 in source 1
+ ld1 {v18.8b}, [x1], x4 //load row 4 in source 2
+ uxtl v8.8h, v8.8b //converting row 2 in source 1 to 16-bit
+ uxtl v10.8h, v10.8b //converting row 2 in source 2 to 16-bit
+ mul v4.8h, v4.8h , v2.4h[0] //weight 1 mult. for row 1
+ mla v4.8h, v6.8h , v2.4h[2] //weight 2 mult. for row 1
+ uxtl v12.8h, v12.8b //converting row 3 in source 1 to 16-bit
+ uxtl v14.8h, v14.8b //converting row 3 in source 2 to 16-bit
+ mul v8.8h, v8.8h , v2.4h[0] //weight 1 mult. for row 2
+ mla v8.8h, v10.8h , v2.4h[2] //weight 2 mult. for row 2
+ uxtl v16.8h, v16.8b //converting row 4 in source 1 to 16-bit
+ uxtl v18.8h, v18.8b //converting row 4 in source 2 to 16-bit
+ mul v12.8h, v12.8h , v2.4h[0] //weight 1 mult. for row 3
+ mla v12.8h, v14.8h , v2.4h[2] //weight 2 mult. for row 3
+ mul v16.8h, v16.8h , v2.4h[0] //weight 1 mult. for row 4
+ mla v16.8h, v18.8h , v2.4h[2] //weight 2 mult. for row 4
+ srshl v4.8h, v4.8h , v0.8h //rounds off the weighted samples from row 1
+ srshl v8.8h, v8.8h , v0.8h //rounds off the weighted samples from row 2
+ srshl v12.8h, v12.8h , v0.8h //rounds off the weighted samples from row 3
+ saddw v4.8h, v4.8h , v3.8b //adding offset for row 1
+ srshl v16.8h, v16.8h , v0.8h //rounds off the weighted samples from row 4
+ saddw v8.8h, v8.8h , v3.8b //adding offset for row 2
+ saddw v12.8h, v12.8h , v3.8b //adding offset for row 3
+ sqxtun v4.8b, v4.8h //saturating row 1 to unsigned 8-bit
+ saddw v16.8h, v16.8h , v3.8b //adding offset for row 4
+ sqxtun v8.8b, v8.8h //saturating row 2 to unsigned 8-bit
+ sqxtun v12.8b, v12.8h //saturating row 3 to unsigned 8-bit
+ sqxtun v16.8b, v16.8h //saturating row 4 to unsigned 8-bit
+ st1 {v4.8b}, [x2], x5 //store row 1 in destination
+ st1 {v8.8b}, [x2], x5 //store row 2 in destination
+ subs w11, w11, #4 //decrement ht by 4
+ st1 {v12.8b}, [x2], x5 //store row 3 in destination
+ st1 {v16.8b}, [x2], x5 //store row 4 in destination
+ bgt loop_8 //if greater than 0 repeat the loop again
+ b end_loops
+
+loop_16: //each iteration processes two rows
+
+ ld1 {v4.8b, v5.8b}, [x0], x3 //load row 1 in source 1
+ ld1 {v6.8b, v7.8b}, [x1], x4 //load row 1 in source 2
+ ld1 {v8.8b, v9.8b}, [x0], x3 //load row 2 in source 1
+ ld1 {v10.8b, v11.8b}, [x1], x4 //load row 2 in source 2
+ uxtl v20.8h, v4.8b //converting row 1L in source 1 to 16-bit
+ ld1 {v12.8b, v13.8b}, [x0], x3 //load row 3 in source 1
+ ld1 {v14.8b, v15.8b}, [x1], x4 //load row 3 in source 2
+ uxtl v22.8h, v6.8b //converting row 1L in source 2 to 16-bit
+ ld1 {v16.8b, v17.8b}, [x0], x3 //load row 4 in source 1
+ ld1 {v18.8b, v19.8b}, [x1], x4 //load row 4 in source 2
+ uxtl v4.8h, v5.8b //converting row 1H in source 1 to 16-bit
+ uxtl v6.8h, v7.8b //converting row 1H in source 2 to 16-bit
+ mul v20.8h, v20.8h , v2.4h[0] //weight 1 mult. for row 1L
+ mla v20.8h, v22.8h , v2.4h[2] //weight 2 mult. for row 1L
+ uxtl v24.8h, v8.8b //converting row 2L in source 1 to 16-bit
+ uxtl v26.8h, v10.8b //converting row 2L in source 2 to 16-bit
+ mul v4.8h, v4.8h , v2.4h[0] //weight 1 mult. for row 1H
+ mla v4.8h, v6.8h , v2.4h[2] //weight 2 mult. for row 1H
+ uxtl v8.8h, v9.8b //converting row 2H in source 1 to 16-bit
+ uxtl v10.8h, v11.8b //converting row 2H in source 2 to 16-bit
+ mul v24.8h, v24.8h , v2.4h[0] //weight 1 mult. for row 2L
+ mla v24.8h, v26.8h , v2.4h[2] //weight 2 mult. for row 2L
+ uxtl v28.8h, v12.8b //converting row 3L in source 1 to 16-bit
+ uxtl v30.8h, v14.8b //converting row 3L in source 2 to 16-bit
+ mul v8.8h, v8.8h , v2.4h[0] //weight 1 mult. for row 2H
+ mla v8.8h, v10.8h , v2.4h[2] //weight 2 mult. for row 2H
+ uxtl v12.8h, v13.8b //converting row 3H in source 1 to 16-bit
+ uxtl v14.8h, v15.8b //converting row 3H in source 2 to 16-bit
+ mul v28.8h, v28.8h , v2.4h[0] //weight 1 mult. for row 3L
+ mla v28.8h, v30.8h , v2.4h[2] //weight 2 mult. for row 3L
+ uxtl v22.8h, v16.8b //converting row 4L in source 1 to 16-bit
+ uxtl v6.8h, v18.8b //converting row 4L in source 2 to 16-bit
+ mul v12.8h, v12.8h , v2.4h[0] //weight 1 mult. for row 3H
+ mla v12.8h, v14.8h , v2.4h[2] //weight 2 mult. for row 3H
+ uxtl v16.8h, v17.8b //converting row 4H in source 1 to 16-bit
+ uxtl v18.8h, v19.8b //converting row 4H in source 2 to 16-bit
+ mul v22.8h, v22.8h , v2.4h[0] //weight 1 mult. for row 4L
+ mla v22.8h, v6.8h , v2.4h[2] //weight 2 mult. for row 4L
+ srshl v20.8h, v20.8h , v0.8h //rounds off the weighted samples from row 1L
+ mul v16.8h, v16.8h , v2.4h[0] //weight 1 mult. for row 4H
+ mla v16.8h, v18.8h , v2.4h[2] //weight 2 mult. for row 4H
+ srshl v4.8h, v4.8h , v0.8h //rounds off the weighted samples from row 1H
+ srshl v24.8h, v24.8h , v0.8h //rounds off the weighted samples from row 2L
+ saddw v20.8h, v20.8h , v3.8b //adding offset for row 1L
+ srshl v8.8h, v8.8h , v0.8h //rounds off the weighted samples from row 2H
+ saddw v4.8h, v4.8h , v3.8b //adding offset for row 1H
+ srshl v28.8h, v28.8h , v0.8h //rounds off the weighted samples from row 3L
+ saddw v24.8h, v24.8h , v3.8b //adding offset for row 2L
+ srshl v12.8h, v12.8h , v0.8h //rounds off the weighted samples from row 3H
+ saddw v8.8h, v8.8h , v3.8b //adding offset for row 2H
+ srshl v22.8h, v22.8h , v0.8h //rounds off the weighted samples from row 4L
+ saddw v28.8h, v28.8h , v3.8b //adding offset for row 3L
+ srshl v16.8h, v16.8h , v0.8h //rounds off the weighted samples from row 4H
+ saddw v12.8h, v12.8h , v3.8b //adding offset for row 3H
+ sqxtun v26.8b, v20.8h //saturating row 1L to unsigned 8-bit
+ saddw v22.8h, v22.8h , v3.8b //adding offset for row 4L
+ sqxtun v27.8b, v4.8h //saturating row 1H to unsigned 8-bit
+ saddw v16.8h, v16.8h , v3.8b //adding offset for row 4H
+ sqxtun v10.8b, v24.8h //saturating row 2L to unsigned 8-bit
+ sqxtun v11.8b, v8.8h //saturating row 2H to unsigned 8-bit
+ sqxtun v30.8b, v28.8h //saturating row 3L to unsigned 8-bit
+ sqxtun v31.8b, v12.8h //saturating row 3H to unsigned 8-bit
+ st1 {v26.8b, v27.8b}, [x2], x5 //store row 1 in destination
+ sqxtun v14.8b, v22.8h //saturating row 4L to unsigned 8-bit
+ sqxtun v15.8b, v16.8h //saturating row 4H to unsigned 8-bit
+ st1 {v10.8b, v11.8b}, [x2], x5 //store row 2 in destination
+ subs w11, w11, #4 //decrement ht by 4
+ st1 {v30.8b, v31.8b}, [x2], x5 //store row 3 in destination
+ st1 {v14.8b, v15.8b}, [x2], x5 //store row 4 in destination
+ bgt loop_16 //if greater than 0 repeat the loop again
+
+end_loops:
+
+ // LDMFD sp!,{x4-x12,x15} //Reload the registers from sp
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
+//*******************************************************************************
+//* @function
+//* ih264_weighted_bi_pred_chroma_av8()
+//*
+//* @brief
+//* This routine performs the default weighted prediction as described in sec
+//* 8.4.2.3.2 titled "Weighted sample prediction process" for chroma.
+//*
+//* @par Description:
+//* This function gets two ht x wd blocks, calculates the weighted samples,
+//* rounds off, adds offset and stores it in the destination block for U and V.
+//*
+//* @param[in] puc_src1
+//* UWORD8 Pointer to the buffer containing the input block 1.
+//*
+//* @param[in] puc_src2
+//* UWORD8 Pointer to the buffer containing the input block 2.
+//*
+//* @param[out] puc_dst
+//* UWORD8 pointer to the destination where the output block is stored.
+//*
+//* @param[in] src_strd1
+//* Stride of the input buffer 1
+//*
+//* @param[in] src_strd2
+//* Stride of the input buffer 2
+//*
+//* @param[in] dst_strd
+//* Stride of the destination buffer
+//*
+//* @param[in] log_WD
+//* number of bits to be rounded off
+//*
+//* @param[in] wt1
+//* weights for the weighted prediction in U and V
+//*
+//* @param[in] wt2
+//* weights for the weighted prediction in U and V
+//*
+//* @param[in] ofst1
+//* offset 1 used after rounding off for U and V
+//*
+//* @param[in] ofst2
+//* offset 2 used after rounding off for U and V
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @returns
+//* None
+//*
+//* @remarks
+//* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
+//*
+//*******************************************************************************
+//*/
+//void ih264_weighted_bi_pred_chroma_av8(UWORD8 *puc_src1,
+// UWORD8 *puc_src2,
+// UWORD8 *puc_dst,
+// WORD32 src_strd1,
+// WORD32 src_strd2,
+// WORD32 dst_strd,
+// UWORD16 log_WD,
+// UWORD32 wt1,
+// UWORD32 wt2,
+// UWORD16 ofst1,
+// UWORD16 ofst2,
+// UWORD8 ht,
+// UWORD8 wd)
+//
+//**************Variables Vs Registers*****************************************
+// x0 => puc_src1
+// x1 => puc_src2
+// x2 => puc_dst
+// x3 => src_strd1
+// [sp] => src_strd2 (x4)
+// [sp+4] => dst_strd (x5)
+// [sp+8] => log_WD (x6)
+// [sp+12] => wt1 (x7)
+// [sp+16] => wt2 (x8)
+// [sp+20] => ofst1 (x9)
+// [sp+24] => ofst2 (x10)
+// [sp+28] => ht (x11)
+// [sp+32] => wd (x12)
+//
+
+
+
+
+
+ .global ih264_weighted_bi_pred_chroma_av8
+
+ih264_weighted_bi_pred_chroma_av8:
+
+ // STMFD sp!, {x4-x12,x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+
+
+ ldr x8, [sp, #80] //Load wt2 in x8
+ dup v4.4s, w8 //Q2 = (wt2_u, wt2_v) (32-bit)
+ dup v2.4s, w7 //Q1 = (wt1_u, wt1_v) (32-bit)
+ add x6, x6, #1 //x6 = log_WD + 1
+ ldr w9, [sp, #88] //Load ofst1 in x9
+ sxtw x9, w9
+ ldr w10, [sp, #96] //Load ofst2 in x10
+ sxtw x10, w10
+ sub x20, x6, #0 //x20 = log_WD + 1 (copy; negated below)
+ neg x20, x20 //x20 = -(log_WD + 1)
+ dup v0.8h, w20 //Q0 = -(log_WD + 1) (16-bit)
+ ldr w11, [sp, #104] //Load ht in x11
+ ldr w12, [sp, #112] //Load wd in x12
+ sxtw x11, w11
+ sxtw x12, w12
+ dup v20.8h, w9 //offset1 (ofst1) replicated
+ dup v21.8h, w10 //offset2 (ofst2) replicated
+ srhadd v6.8b, v20.8b, v21.8b //(ofst1 + ofst2 + 1) >> 1 per lane
+ sxtl v6.8h, v6.8b //widen averaged offset to 16-bit
+ cmp w12, #8 //check if wd is 8
+ beq loop_8_uv //branch if wd is 8
+ cmp w12, #4 //check if wd is 4
+ beq loop_4_uv //branch if wd is 4
+
+loop_2_uv: //each iteration processes two rows
+
+ ld1 {v8.s}[0], [x0], x3 //load row 1 in source 1
+ ld1 {v8.s}[1], [x0], x3 //load row 2 in source 1
+ ld1 {v10.s}[0], [x1], x4 //load row 1 in source 2
+ ld1 {v10.s}[1], [x1], x4 //load row 2 in source 2
+ uxtl v8.8h, v8.8b //converting rows 1,2 in source 1 to 16-bit
+ uxtl v10.8h, v10.8b //converting rows 1,2 in source 2 to 16-bit
+ mul v8.8h, v8.8h , v2.8h //weight 1 mult. for rows 1,2
+ mla v8.8h, v10.8h , v4.8h //weight 2 mult. for rows 1,2
+ srshl v8.8h, v8.8h , v0.8h //rounds off the weighted samples from rows 1,2
+ add v8.8h, v8.8h , v6.8h //adding offset for rows 1,2
+ sqxtun v8.8b, v8.8h //saturating rows 1,2 to unsigned 8-bit
+ st1 {v8.s}[0], [x2], x5 //store row 1 in destination
+ st1 {v8.s}[1], [x2], x5 //store row 2 in destination
+ subs w11, w11, #2 //decrement ht by 2
+ bgt loop_2_uv //if greater than 0 repeat the loop again
+ b end_loops_uv
+
+loop_4_uv: //each iteration processes two rows
+
+ ld1 {v8.8b}, [x0], x3 //load row 1 in source 1
+ ld1 {v10.8b}, [x1], x4 //load row 1 in source 2
+ uxtl v8.8h, v8.8b //converting row 1 in source 1 to 16-bit
+ ld1 {v12.8b}, [x0], x3 //load row 2 in source 1
+ uxtl v10.8h, v10.8b //converting row 1 in source 2 to 16-bit
+ ld1 {v14.8b}, [x1], x4 //load row 2 in source 2
+ uxtl v12.8h, v12.8b //converting row 2 in source 1 to 16-bit
+ mul v8.8h, v8.8h , v2.8h //weight 1 mult. for row 1
+ mla v8.8h, v10.8h , v4.8h //weight 2 mult. for row 1
+ uxtl v14.8h, v14.8b //converting row 2 in source 2 to 16-bit
+ mul v12.8h, v12.8h , v2.8h //weight 1 mult. for row 2
+ mla v12.8h, v14.8h , v4.8h //weight 2 mult. for row 2
+ subs w11, w11, #2 //decrement ht by 2
+ srshl v8.8h, v8.8h , v0.8h //rounds off the weighted samples from row 1
+ srshl v12.8h, v12.8h , v0.8h //rounds off the weighted samples from row 2
+ add v8.8h, v8.8h , v6.8h //adding offset for row 1
+ add v12.8h, v12.8h , v6.8h //adding offset for row 2
+ sqxtun v8.8b, v8.8h //saturating row 1 to unsigned 8-bit
+ sqxtun v12.8b, v12.8h //saturating row 2 to unsigned 8-bit
+ st1 {v8.8b}, [x2], x5 //store row 1 in destination
+ st1 {v12.8b}, [x2], x5 //store row 2 in destination
+ bgt loop_4_uv //if greater than 0 repeat the loop again
+ b end_loops_uv
+
+loop_8_uv: //each iteration processes two rows
+
+ ld1 {v8.8b, v9.8b}, [x0], x3 //load row 1 in source 1
+ ld1 {v10.8b, v11.8b}, [x1], x4 //load row 1 in source 2
+ ld1 {v12.8b, v13.8b}, [x0], x3 //load row 2 in source 1
+ ld1 {v14.8b, v15.8b}, [x1], x4 //load row 2 in source 2
+ uxtl v24.8h, v8.8b //converting row 1L in source 1 to 16-bit
+ ld1 {v16.8b, v17.8b}, [x0], x3 //load row 3 in source 1
+ ld1 {v18.8b, v19.8b}, [x1], x4 //load row 3 in source 2
+ uxtl v26.8h, v10.8b //converting row 1L in source 2 to 16-bit
+ ld1 {v20.8b, v21.8b}, [x0], x3 //load row 4 in source 1
+ ld1 {v22.8b, v23.8b}, [x1], x4 //load row 4 in source 2
+ uxtl v8.8h, v9.8b //converting row 1H in source 1 to 16-bit
+ uxtl v10.8h, v11.8b //converting row 1H in source 2 to 16-bit
+ mul v24.8h, v24.8h , v2.8h //weight 1 mult. for row 1L
+ mla v24.8h, v26.8h , v4.8h //weight 2 mult. for row 1L
+ uxtl v28.8h, v12.8b //converting row 2L in source 1 to 16-bit
+ uxtl v30.8h, v14.8b //converting row 2L in source 2 to 16-bit
+ mul v8.8h, v8.8h , v2.8h //weight 1 mult. for row 1H
+ mla v8.8h, v10.8h , v4.8h //weight 2 mult. for row 1H
+ uxtl v12.8h, v13.8b //converting row 2H in source 1 to 16-bit
+ uxtl v14.8h, v15.8b //converting row 2H in source 2 to 16-bit
+ mul v28.8h, v28.8h , v2.8h //weight 1 mult. for row 2L
+ mla v28.8h, v30.8h , v4.8h //weight 2 mult. for row 2L
+ uxtl v26.8h, v16.8b //converting row 3L in source 1 to 16-bit
+ uxtl v10.8h, v18.8b //converting row 3L in source 2 to 16-bit
+ mul v12.8h, v12.8h , v2.8h //weight 1 mult. for row 2H
+ mla v12.8h, v14.8h , v4.8h //weight 2 mult. for row 2H
+ uxtl v16.8h, v17.8b //converting row 3H in source 1 to 16-bit
+ uxtl v18.8h, v19.8b //converting row 3H in source 2 to 16-bit
+ mul v26.8h, v26.8h , v2.8h //weight 1 mult. for row 3L
+ mla v26.8h, v10.8h , v4.8h //weight 2 mult. for row 3L
+ uxtl v30.8h, v20.8b //converting row 4L in source 1 to 16-bit
+ uxtl v14.8h, v22.8b //converting row 4L in source 2 to 16-bit
+ mul v16.8h, v16.8h , v2.8h //weight 1 mult. for row 3H
+ mla v16.8h, v18.8h , v4.8h //weight 2 mult. for row 3H
+ uxtl v20.8h, v21.8b //converting row 4H in source 1 to 16-bit
+ uxtl v22.8h, v23.8b //converting row 4H in source 2 to 16-bit
+ mul v30.8h, v30.8h , v2.8h //weight 1 mult. for row 4L
+ mla v30.8h, v14.8h , v4.8h //weight 2 mult. for row 4L
+ srshl v24.8h, v24.8h , v0.8h //rounds off the weighted samples from row 1L
+ mul v20.8h, v20.8h , v2.8h //weight 1 mult. for row 4H
+ mla v20.8h, v22.8h , v4.8h //weight 2 mult. for row 4H
+ srshl v8.8h, v8.8h , v0.8h //rounds off the weighted samples from row 1H
+ srshl v28.8h, v28.8h , v0.8h //rounds off the weighted samples from row 2L
+ add v24.8h, v24.8h , v6.8h //adding offset for row 1L
+ srshl v12.8h, v12.8h , v0.8h //rounds off the weighted samples from row 2H
+ add v8.8h, v8.8h , v6.8h //adding offset for row 1H
+ srshl v26.8h, v26.8h , v0.8h //rounds off the weighted samples from row 3L
+ add v28.8h, v28.8h , v6.8h //adding offset for row 2L
+ srshl v16.8h, v16.8h , v0.8h //rounds off the weighted samples from row 3H
+ add v12.8h, v12.8h , v6.8h //adding offset for row 2H
+ srshl v30.8h, v30.8h , v0.8h //rounds off the weighted samples from row 4L
+ add v26.8h, v26.8h , v6.8h //adding offset for row 3L
+ srshl v20.8h, v20.8h , v0.8h //rounds off the weighted samples from row 4H
+ add v16.8h, v16.8h , v6.8h //adding offset for row 3H
+ sqxtun v10.8b, v24.8h //saturating row 1L to unsigned 8-bit
+ add v30.8h, v30.8h , v6.8h //adding offset for row 4L
+ sqxtun v11.8b, v8.8h //saturating row 1H to unsigned 8-bit
+ add v20.8h, v20.8h , v6.8h //adding offset for row 4H
+ sqxtun v18.8b, v28.8h //saturating row 2L to unsigned 8-bit
+ sqxtun v19.8b, v12.8h //saturating row 2H to unsigned 8-bit
+ sqxtun v14.8b, v26.8h //saturating row 3L to unsigned 8-bit
+ sqxtun v15.8b, v16.8h //saturating row 3H to unsigned 8-bit
+ st1 {v10.8b, v11.8b}, [x2], x5 //store row 1 in destination
+ sqxtun v22.8b, v30.8h //saturating row 4L to unsigned 8-bit
+ sqxtun v23.8b, v20.8h //saturating row 4H to unsigned 8-bit
+ st1 {v18.8b, v19.8b}, [x2], x5 //store row 2 in destination
+ subs w11, w11, #4 //decrement ht by 4
+ st1 {v14.8b, v15.8b}, [x2], x5 //store row 3 in destination
+ st1 {v22.8b, v23.8b}, [x2], x5 //store row 4 in destination
+ bgt loop_8_uv //if greater than 0 repeat the loop again
+
+end_loops_uv:
+
+ // LDMFD sp!,{x4-x12,x15} //Reload the registers from sp
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/armv8/ih264_weighted_pred_av8.s b/common/armv8/ih264_weighted_pred_av8.s
new file mode 100755
index 0000000..6a03875
--- /dev/null
+++ b/common/armv8/ih264_weighted_pred_av8.s
@@ -0,0 +1,471 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+//******************************************************************************
+//* @file
+//* ih264_weighted_pred_av8.s
+//*
+//* @brief
+//* Contains function definitions for weighted prediction.
+//* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT
+//*
+//* @author
+//* Kaushik Senthoor R
+//*
+//* @par List of Functions:
+//*
+//* - ih264_weighted_pred_luma_av8()
+//* - ih264_weighted_pred_chroma_av8()
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+//*******************************************************************************
+//* @function
+//* ih264_weighted_pred_luma_av8()
+//*
+//* @brief
+//* This routine performs the default weighted prediction as described in sec
+//* 8.4.2.3.2 titled "Weighted sample prediction process" for luma.
+//*
+//* @par Description:
+//* This function gets a ht x wd block, calculates the weighted sample, rounds
+//* off, adds offset and stores it in the destination block.
+//*
+//* @param[in] puc_src:
+//* UWORD8 Pointer to the buffer containing the input block.
+//*
+//* @param[out] puc_dst
+//* UWORD8 pointer to the destination where the output block is stored.
+//*
+//* @param[in] src_strd
+//* Stride of the input buffer
+//*
+//* @param[in] dst_strd
+//* Stride of the destination buffer
+//*
+//* @param[in] log_WD
+//* number of bits to be rounded off
+//*
+//* @param[in] wt
+//* weight for the weighted prediction
+//*
+//* @param[in] ofst
+//* offset used after rounding off
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @returns
+//* None
+//*
+//* @remarks
+//* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
+//*
+//*******************************************************************************
+//*/
+//void ih264_weighted_pred_luma_av8(UWORD8 *puc_src,
+// UWORD8 *puc_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// UWORD8 log_WD,
+// UWORD32 wt,
+// UWORD16 ofst,
+// UWORD8 ht,
+// UWORD8 wd)
+//
+//**************Variables Vs Registers*****************************************
+// x0 => puc_src
+// x1 => puc_dst
+// x2 => src_strd
+// x3 => dst_strd
+// [sp] => log_WD (x4)
+// [sp+4] => wt (x5)
+// [sp+8] => ofst (x6)
+// [sp+12] => ht (x7)
+// [sp+16] => wd (x8)
+//
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+
+
+ .global ih264_weighted_pred_luma_av8
+
+ih264_weighted_pred_luma_av8:
+
+ // STMFD sp!, {x4-x9,x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+ ldr w8, [sp, #80] //Load wd
+ sxtw x8, w8
+
+ dup v2.4h, w5 //D2 = wt (16-bit)
+ sub x20, x4, #0 //x20 = log_WD (copy; negated into x9 below)
+ neg x9, x20 //x9 = -log_WD
+ dup v3.8b, w6 //D3 = ofst (8-bit)
+ cmp w8, #16 //check if wd is 16
+ dup v0.8h, w9 //Q0 = -log_WD (16-bit)
+ beq loop_16 //branch if wd is 16
+
+ cmp w8, #8 //check if wd is 8
+ beq loop_8 //branch if wd is 8
+
+loop_4: //each iteration processes four rows
+
+ ld1 {v4.s}[0], [x0], x2 //load row 1 in source
+ ld1 {v4.s}[1], [x0], x2 //load row 2 in source
+ ld1 {v6.s}[0], [x0], x2 //load row 3 in source
+ ld1 {v6.s}[1], [x0], x2 //load row 4 in source
+
+ uxtl v4.8h, v4.8b //converting rows 1,2 to 16-bit
+ uxtl v6.8h, v6.8b //converting rows 3,4 to 16-bit
+
+ mul v4.8h, v4.8h , v2.4h[0] //weight mult. for rows 1,2
+ mul v6.8h, v6.8h , v2.4h[0] //weight mult. for rows 3,4
+
+ subs w7, w7, #4 //decrement ht by 4
+ srshl v4.8h, v4.8h , v0.8h //rounds off the weighted samples from rows 1,2
+ srshl v6.8h, v6.8h , v0.8h //rounds off the weighted samples from rows 3,4
+
+ saddw v4.8h, v4.8h , v3.8b //adding offset for rows 1,2
+ saddw v6.8h, v6.8h , v3.8b //adding offset for rows 3,4
+
+ sqxtun v4.8b, v4.8h //saturating rows 1,2 to unsigned 8-bit
+ sqxtun v6.8b, v6.8h //saturating rows 3,4 to unsigned 8-bit
+
+ st1 {v4.s}[0], [x1], x3 //store row 1 in destination
+ st1 {v4.s}[1], [x1], x3 //store row 2 in destination
+ st1 {v6.s}[0], [x1], x3 //store row 3 in destination
+ st1 {v6.s}[1], [x1], x3 //store row 4 in destination
+
+ bgt loop_4 //if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_8: //each iteration processes four rows
+
+ ld1 {v4.8b}, [x0], x2 //load row 1 in source
+ ld1 {v6.8b}, [x0], x2 //load row 2 in source
+ ld1 {v8.8b}, [x0], x2 //load row 3 in source
+ uxtl v4.8h, v4.8b //converting row 1 to 16-bit
+ ld1 {v10.8b}, [x0], x2 //load row 4 in source
+ uxtl v6.8h, v6.8b //converting row 2 to 16-bit
+
+ uxtl v8.8h, v8.8b //converting row 3 to 16-bit
+ mul v4.8h, v4.8h , v2.4h[0] //weight mult. for row 1
+ uxtl v10.8h, v10.8b //converting row 4 to 16-bit
+ mul v6.8h, v6.8h , v2.4h[0] //weight mult. for row 2
+ mul v8.8h, v8.8h , v2.4h[0] //weight mult. for row 3
+ mul v10.8h, v10.8h , v2.4h[0] //weight mult. for row 4
+
+ srshl v4.8h, v4.8h , v0.8h //rounds off the weighted samples from row 1
+ srshl v6.8h, v6.8h , v0.8h //rounds off the weighted samples from row 2
+ srshl v8.8h, v8.8h , v0.8h //rounds off the weighted samples from row 3
+ saddw v4.8h, v4.8h , v3.8b //adding offset for row 1
+ srshl v10.8h, v10.8h , v0.8h //rounds off the weighted samples from row 4
+ saddw v6.8h, v6.8h , v3.8b //adding offset for row 2
+
+ saddw v8.8h, v8.8h , v3.8b //adding offset for row 3
+ sqxtun v4.8b, v4.8h //saturating row 1 to unsigned 8-bit
+ saddw v10.8h, v10.8h , v3.8b //adding offset for row 4
+ sqxtun v6.8b, v6.8h //saturating row 2 to unsigned 8-bit
+ sqxtun v8.8b, v8.8h //saturating row 3 to unsigned 8-bit
+ sqxtun v10.8b, v10.8h //saturating row 4 to unsigned 8-bit
+
+ st1 {v4.8b}, [x1], x3 //store row 1 in destination
+ st1 {v6.8b}, [x1], x3 //store row 2 in destination
+ subs w7, w7, #4 //decrement ht by 4
+ st1 {v8.8b}, [x1], x3 //store row 3 in destination
+ st1 {v10.8b}, [x1], x3 //store row 4 in destination
+
+ bgt loop_8 //if greater than 0 repeat the loop again
+
+ b end_loops
+
+loop_16: //each iteration processes two rows
+
+ ld1 {v4.8b, v5.8b}, [x0], x2 //load row 1 in source
+ ld1 {v6.8b, v7.8b}, [x0], x2 //load row 2 in source
+ uxtl v12.8h, v4.8b //converting row 1L to 16-bit
+ ld1 {v8.8b, v9.8b}, [x0], x2 //load row 3 in source
+ uxtl v14.8h, v5.8b //converting row 1H to 16-bit
+ ld1 {v10.8b, v11.8b}, [x0], x2 //load row 4 in source
+ uxtl v16.8h, v6.8b //converting row 2L to 16-bit
+ mul v12.8h, v12.8h , v2.4h[0] //weight mult. for row 1L
+ uxtl v18.8h, v7.8b //converting row 2H to 16-bit
+ mul v14.8h, v14.8h , v2.4h[0] //weight mult. for row 1H
+ uxtl v20.8h, v8.8b //converting row 3L to 16-bit
+ mul v16.8h, v16.8h , v2.4h[0] //weight mult. for row 2L
+ uxtl v22.8h, v9.8b //converting row 3H to 16-bit
+ mul v18.8h, v18.8h , v2.4h[0] //weight mult. for row 2H
+ uxtl v24.8h, v10.8b //converting row 4L to 16-bit
+ mul v20.8h, v20.8h , v2.4h[0] //weight mult. for row 3L
+ uxtl v26.8h, v11.8b //converting row 4H to 16-bit
+ mul v22.8h, v22.8h , v2.4h[0] //weight mult. for row 3H
+ mul v24.8h, v24.8h , v2.4h[0] //weight mult. for row 4L
+ srshl v12.8h, v12.8h , v0.8h //rounds off the weighted samples from row 1L
+ mul v26.8h, v26.8h , v2.4h[0] //weight mult. for row 4H
+ srshl v14.8h, v14.8h , v0.8h //rounds off the weighted samples from row 1H
+ srshl v16.8h, v16.8h , v0.8h //rounds off the weighted samples from row 2L
+ saddw v12.8h, v12.8h , v3.8b //adding offset for row 1L
+ srshl v18.8h, v18.8h , v0.8h //rounds off the weighted samples from row 2H
+ saddw v14.8h, v14.8h , v3.8b //adding offset for row 1H
+ sqxtun v4.8b, v12.8h //saturating row 1L to unsigned 8-bit
+ srshl v20.8h, v20.8h , v0.8h //rounds off the weighted samples from row 3L
+ saddw v16.8h, v16.8h , v3.8b //adding offset for row 2L
+ sqxtun v5.8b, v14.8h //saturating row 1H to unsigned 8-bit
+ srshl v22.8h, v22.8h , v0.8h //rounds off the weighted samples from row 3H
+ saddw v18.8h, v18.8h , v3.8b //adding offset for row 2H
+ sqxtun v6.8b, v16.8h //saturating row 2L to unsigned 8-bit
+ srshl v24.8h, v24.8h , v0.8h //rounds off the weighted samples from row 4L
+ saddw v20.8h, v20.8h , v3.8b //adding offset for row 3L
+ sqxtun v7.8b, v18.8h //saturating row 2H to unsigned 8-bit
+ srshl v26.8h, v26.8h , v0.8h //rounds off the weighted samples from row 4H
+ saddw v22.8h, v22.8h , v3.8b //adding offset for row 3H
+ sqxtun v8.8b, v20.8h //saturating row 3L to unsigned 8-bit
+ saddw v24.8h, v24.8h , v3.8b //adding offset for row 4L
+ sqxtun v9.8b, v22.8h //saturating row 3H to unsigned 8-bit
+ saddw v26.8h, v26.8h , v3.8b //adding offset for row 4H
+ sqxtun v10.8b, v24.8h //saturating row 4L to unsigned 8-bit
+ st1 {v4.8b, v5.8b}, [x1], x3 //store row 1 in destination
+ sqxtun v11.8b, v26.8h //saturating row 4H to unsigned 8-bit
+ st1 {v6.8b, v7.8b}, [x1], x3 //store row 2 in destination
+ subs w7, w7, #4 //decrement ht by 4
+ st1 {v8.8b, v9.8b}, [x1], x3 //store row 3 in destination
+ st1 {v10.8b, v11.8b}, [x1], x3 //store row 4 in destination
+
+ bgt loop_16 //if greater than 0 repeat the loop again
+
+end_loops:
+
+ // LDMFD sp!,{x4-x9,x15} //Reload the registers from sp
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+//*******************************************************************************
+//* @function
+//* ih264_weighted_pred_chroma_av8()
+//*
+//* @brief
+//* This routine performs the default weighted prediction as described in sec
+//* 8.4.2.3.2 titled "Weighted sample prediction process" for chroma.
+//*
+//* @par Description:
+//* This function gets a ht x wd block, calculates the weighted sample, rounds
+//* off, adds offset and stores it in the destination block for U and V.
+//*
+//* @param[in] puc_src:
+//* UWORD8 Pointer to the buffer containing the input block.
+//*
+//* @param[out] puc_dst
+//* UWORD8 pointer to the destination where the output block is stored.
+//*
+//* @param[in] src_strd
+//* Stride of the input buffer
+//*
+//* @param[in] dst_strd
+//* Stride of the destination buffer
+//*
+//* @param[in] log_WD
+//* number of bits to be rounded off
+//*
+//* @param[in] wt
+//* weights for the weighted prediction for U and V
+//*
+//* @param[in] ofst
+//* offsets used after rounding off for U and V
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @returns
+//* None
+//*
+//* @remarks
+//* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
+//*
+//*******************************************************************************
+//*/
+//void ih264_weighted_pred_chroma_av8(UWORD8 *puc_src,
+// UWORD8 *puc_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// UWORD8 log_WD,
+// UWORD32 wt,
+// UWORD16 ofst,
+// UWORD8 ht,
+// UWORD8 wd)
+//
+//**************Variables Vs Registers*****************************************
+// x0 => puc_src
+// x1 => puc_dst
+// x2 => src_strd
+// x3 => dst_strd
+// [sp] => log_WD (x4)
+// [sp+4] => wt (x5)
+// [sp+8] => ofst (x6)
+// [sp+12] => ht (x7)
+// [sp+16] => wd (x8)
+//
+
+
+
+
+ .global ih264_weighted_pred_chroma_av8
+
+ih264_weighted_pred_chroma_av8:
+
+ // STMFD sp!, {x4-x9,x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+
+ ldr w8, [sp, #80] //Load wd
+ sxtw x8, w8
+
+ sub x20, x4, #0 //x20 = log_WD (copy; negated into x9 below)
+ neg x9, x20 //x9 = -log_WD
+ dup v2.4s, w5 //Q1 = {wt_u (16-bit), wt_v (16-bit)}
+
+
+ dup v4.4h, w6 //D4 = {ofst_u (8-bit), ofst_v (8-bit)}
+ cmp w8, #8 //check if wd is 8
+ dup v0.8h, w9 //Q0 = -log_WD (16-bit)
+ beq loop_8_uv //branch if wd is 8
+
+ cmp w8, #4 //check if wd is 4
+ beq loop_4_uv //branch if wd is 4
+
+loop_2_uv: //each iteration processes two rows
+
+ ld1 {v6.s}[0], [x0], x2 //load row 1 in source
+ ld1 {v6.s}[1], [x0], x2 //load row 2 in source
+ uxtl v6.8h, v6.8b //converting rows 1,2 to 16-bit
+ mul v6.8h, v6.8h , v2.8h //weight mult. for rows 1,2
+ srshl v6.8h, v6.8h , v0.8h //rounds off the weighted samples from rows 1,2
+ saddw v6.8h, v6.8h , v4.8b //adding offset for rows 1,2
+ sqxtun v6.8b, v6.8h //saturating rows 1,2 to unsigned 8-bit
+ subs w7, w7, #2 //decrement ht by 2
+ st1 {v6.s}[0], [x1], x3 //store row 1 in destination
+ st1 {v6.s}[1], [x1], x3 //store row 2 in destination
+ bgt loop_2_uv //if greater than 0 repeat the loop again
+ b end_loops_uv
+
+loop_4_uv: //each iteration processes two rows
+
+ ld1 {v6.8b}, [x0], x2 //load row 1 in source
+ ld1 {v8.8b}, [x0], x2 //load row 2 in source
+ uxtl v6.8h, v6.8b //converting row 1 to 16-bit
+ uxtl v8.8h, v8.8b //converting row 2 to 16-bit
+ mul v6.8h, v6.8h , v2.8h //weight mult. for row 1
+ mul v8.8h, v8.8h , v2.8h //weight mult. for row 2
+ subs w7, w7, #2 //decrement ht by 2
+ srshl v6.8h, v6.8h , v0.8h //rounds off the weighted samples from row 1
+ srshl v8.8h, v8.8h , v0.8h //rounds off the weighted samples from row 2
+ saddw v6.8h, v6.8h , v4.8b //adding offset for row 1
+ saddw v8.8h, v8.8h , v4.8b //adding offset for row 2
+ sqxtun v6.8b, v6.8h //saturating row 1 to unsigned 8-bit
+ sqxtun v8.8b, v8.8h //saturating row 2 to unsigned 8-bit
+ st1 {v6.8b}, [x1], x3 //store row 1 in destination
+ st1 {v8.8b}, [x1], x3 //store row 2 in destination
+
+ bgt loop_4_uv //if greater than 0 repeat the loop again
+
+ b end_loops_uv
+
+loop_8_uv: //each iteration processes two rows
+
+ ld1 {v6.8b, v7.8b}, [x0], x2 //load row 1 in source
+ ld1 {v8.8b, v9.8b}, [x0], x2 //load row 2 in source
+ uxtl v14.8h, v6.8b //converting row 1L to 16-bit
+ ld1 {v10.8b, v11.8b}, [x0], x2 //load row 3 in source
+ uxtl v16.8h, v7.8b //converting row 1H to 16-bit
+ ld1 {v12.8b, v13.8b}, [x0], x2 //load row 4 in source
+
+ mul v14.8h, v14.8h , v2.8h //weight mult. for row 1L
+ uxtl v18.8h, v8.8b //converting row 2L to 16-bit
+ mul v16.8h, v16.8h , v2.8h //weight mult. for row 1H
+ uxtl v20.8h, v9.8b //converting row 2H to 16-bit
+ mul v18.8h, v18.8h , v2.8h //weight mult. for row 2L
+ uxtl v22.8h, v10.8b //converting row 3L to 16-bit
+ mul v20.8h, v20.8h , v2.8h //weight mult. for row 2H
+ uxtl v24.8h, v11.8b //converting row 3H to 16-bit
+ mul v22.8h, v22.8h , v2.8h //weight mult. for row 3L
+ uxtl v26.8h, v12.8b //converting row 4L to 16-bit
+ mul v24.8h, v24.8h , v2.8h //weight mult. for row 3H
+ uxtl v28.8h, v13.8b //converting row 4H to 16-bit
+
+ mul v26.8h, v26.8h , v2.8h //weight mult. for row 4L
+ srshl v14.8h, v14.8h , v0.8h //rounds off the weighted samples from row 1L
+ mul v28.8h, v28.8h , v2.8h //weight mult. for row 4H
+
+ srshl v16.8h, v16.8h , v0.8h //rounds off the weighted samples from row 1H
+ srshl v18.8h, v18.8h , v0.8h //rounds off the weighted samples from row 2L
+ saddw v14.8h, v14.8h , v4.8b //adding offset for row 1L
+ srshl v20.8h, v20.8h , v0.8h //rounds off the weighted samples from row 2H
+ saddw v16.8h, v16.8h , v4.8b //adding offset for row 1H
+ sqxtun v6.8b, v14.8h //saturating row 1L to unsigned 8-bit
+ srshl v22.8h, v22.8h , v0.8h //rounds off the weighted samples from row 3L
+ saddw v18.8h, v18.8h , v4.8b //adding offset for row 2L
+ sqxtun v7.8b, v16.8h //saturating row 1H to unsigned 8-bit
+ srshl v24.8h, v24.8h , v0.8h //rounds off the weighted samples from row 3H
+ saddw v20.8h, v20.8h , v4.8b //adding offset for row 2H
+ sqxtun v8.8b, v18.8h //saturating row 2L to unsigned 8-bit
+ srshl v26.8h, v26.8h , v0.8h //rounds off the weighted samples from row 4L
+ saddw v22.8h, v22.8h , v4.8b //adding offset for row 3L
+ sqxtun v9.8b, v20.8h //saturating row 2H to unsigned 8-bit
+ srshl v28.8h, v28.8h , v0.8h //rounds off the weighted samples from row 4H
+ saddw v24.8h, v24.8h , v4.8b //adding offset for row 3H
+
+ sqxtun v10.8b, v22.8h //saturating row 3L to unsigned 8-bit
+ saddw v26.8h, v26.8h , v4.8b //adding offset for row 4L
+ sqxtun v11.8b, v24.8h //saturating row 3H to unsigned 8-bit
+ saddw v28.8h, v28.8h , v4.8b //adding offset for row 4H
+
+ sqxtun v12.8b, v26.8h //saturating row 4L to unsigned 8-bit
+ st1 {v6.8b, v7.8b}, [x1], x3 //store row 1 in destination
+ sqxtun v13.8b, v28.8h //saturating row 4H to unsigned 8-bit
+ st1 {v8.8b, v9.8b}, [x1], x3 //store row 2 in destination
+ subs w7, w7, #4 //decrement ht by 4
+ st1 {v10.8b, v11.8b}, [x1], x3 //store row 3 in destination
+ st1 {v12.8b, v13.8b}, [x1], x3 //store row 4 in destination
+
+ bgt loop_8_uv //if greater than 0 repeat the loop again
+
+end_loops_uv:
+
+ // LDMFD sp!,{x4-x9,x15} //Reload the registers from sp
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
diff --git a/common/ih264_buf_mgr.c b/common/ih264_buf_mgr.c
new file mode 100755
index 0000000..ea4333e
--- /dev/null
+++ b/common/ih264_buf_mgr.c
@@ -0,0 +1,696 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_buf_mgr.c
+*
+* @brief
+* Contains function definitions for buffer management
+*
+* @author
+* Srinivas T
+*
+* @par List of Functions:
+* - ih264_buf_mgr_size()
+* - ih264_buf_mgr_lock()
+* - ih264_buf_mgr_unlock()
+* - ih264_buf_mgr_yield()
+* - ih264_buf_mgr_free()
+* - ih264_buf_mgr_init()
+* - ih264_buf_mgr_add()
+* - ih264_buf_mgr_get_next_free()
+* - ih264_buf_mgr_check_free()
+* - ih264_buf_mgr_set_status()
+* - ih264_buf_mgr_get_status()
+* - ih264_buf_mgr_get_buf()
+* - ih264_buf_mgr_get_bufid()
+* - ih264_buf_mgr_get_num_active_buf()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include "ih264_typedefs.h"
+#include "ih264_macros.h"
+#include "ih264_defs.h"
+#include "ih264_error.h"
+#include "ih264_buf_mgr.h"
+
+#include "ithread.h"
+
+/**
+*******************************************************************************
+*
+* @brief Returns size for buf queue context. Does not include buf queue buffer
+* requirements
+*
+* @par Description
+* Returns size for buf queue context. Does not include buf queue buffer
+* requirements. Buffer size required to store the bufs should be allocated in
+* addition to the value returned here.
+*
+* @returns Size of the buf queue context
+*
+* @remarks
+*
+*******************************************************************************
+*/
+WORD32 ih264_buf_mgr_size(void)
+{
+    /* The context structure itself plus the platform mutex that guards it;
+     * the caller allocates this much and hands it to ih264_buf_mgr_init() */
+    return (WORD32)(sizeof(buf_mgr_t) + ithread_get_mutex_lock_size());
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Locks the buf_mgr context
+*
+* @par Description
+* Locks the buf_mgr context by calling ithread_mutex_lock()
+*
+* @param[in] ps_buf_mgr
+* Job Queue context
+*
+* @returns IH264_FAIL if mutex lock fails else IH264_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IH264_ERROR_T ih264_buf_mgr_lock(buf_mgr_t *ps_buf_mgr)
+{
+    /* ithread_mutex_lock() returns non-zero on failure */
+    return ithread_mutex_lock(ps_buf_mgr->pv_mutex) ? IH264_FAIL
+                                                    : IH264_SUCCESS;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Unlocks the buf_mgr context
+*
+* @par Description
+* Unlocks the buf_mgr context by calling ithread_mutex_unlock()
+*
+* @param[in] ps_buf_mgr
+* Job Queue context
+*
+* @returns IH264_FAIL if mutex unlock fails else IH264_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+
+IH264_ERROR_T ih264_buf_mgr_unlock(buf_mgr_t *ps_buf_mgr)
+{
+    /* ithread_mutex_unlock() returns non-zero on failure */
+    return ithread_mutex_unlock(ps_buf_mgr->pv_mutex) ? IH264_FAIL
+                                                      : IH264_SUCCESS;
+}
+/**
+*******************************************************************************
+*
+* @brief
+*  Yields the thread
+*
+* @par Description
+* Unlocks the buf_mgr context by calling
+* ih264_buf_mgr_unlock(), ithread_yield() and then ih264_buf_mgr_lock()
+* buf_mgr is unlocked before to ensure the buf_mgr can be accessed by other threads
+* If unlock is not done before calling yield then no other thread can access
+* the buf_mgr functions and update buf_mgr.
+*
+* @param[in] ps_buf_mgr
+* Job Queue context
+*
+* @returns IH264_FAIL if mutex lock unlock or yield fails else IH264_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IH264_ERROR_T ih264_buf_mgr_yield(buf_mgr_t *ps_buf_mgr)
+{
+    IH264_ERROR_T e;
+
+    /* Drop the lock before yielding so other threads can access and update
+     * the buffer manager while this thread gives up the processor */
+    e = ih264_buf_mgr_unlock(ps_buf_mgr);
+    RETURN_IF((e != IH264_SUCCESS), e);
+
+    ithread_yield();
+
+    /* Re-acquire before returning so the caller's locking invariant holds */
+    e = ih264_buf_mgr_lock(ps_buf_mgr);
+    RETURN_IF((e != IH264_SUCCESS), e);
+
+    return IH264_SUCCESS;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief Frees the buffer manager context
+*
+* @par Description
+*    Destroys the mutex owned by the buf_mgr context
+*
+* @param[in] ps_buf_mgr
+*  Pointer to the buffer manager context
+*
+* @returns IH264_SUCCESS on success, IH264_FAIL if mutex destruction fails
+*
+* @remarks
+* Since it will be called only once by master thread this is not thread safe.
+*
+*******************************************************************************
+*/
+IH264_ERROR_T ih264_buf_mgr_free(buf_mgr_t *ps_buf_mgr)
+{
+    /* Destroy the mutex carved out of the context memory; zero = success.
+     * The memory itself is owned and released by the caller. */
+    return (0 == ithread_mutex_destroy(ps_buf_mgr->pv_mutex)) ? IH264_SUCCESS
+                                                              : IH264_FAIL;
+}
+/**
+*******************************************************************************
+*
+* @brief
+* Buffer manager initialization function.
+*
+* @par Description:
+* Initializes the buffer manager structure
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void *ih264_buf_mgr_init(void *pv_buf)
+{
+    WORD32 i;
+    UWORD8 *pu1_mem = (UWORD8 *)pv_buf;
+    buf_mgr_t *ps_buf_mgr = (buf_mgr_t *)pu1_mem;
+
+    /* Memory layout inside pv_buf: buf_mgr_t first, then the mutex storage
+     * (ithread_get_mutex_lock_size() bytes) — see ih264_buf_mgr_size() */
+    pu1_mem += sizeof(buf_mgr_t);
+    ps_buf_mgr->pv_mutex = pu1_mem;
+
+    ithread_mutex_init(ps_buf_mgr->pv_mutex);
+
+    ps_buf_mgr->i4_max_buf_cnt = BUF_MGR_MAX_CNT;
+    ps_buf_mgr->i4_active_buf_cnt = 0;
+
+    /* Every slot starts out empty with all status bits clear */
+    for(i = 0; i < BUF_MGR_MAX_CNT; i++)
+    {
+        ps_buf_mgr->au4_status[i] = 0;
+        ps_buf_mgr->apv_ptr[i] = NULL;
+    }
+
+    return ps_buf_mgr;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Adds and increments the buffer and buffer count.
+*
+* @par Description:
+* Adds a buffer to the buffer manager if it is not already present and
+* increments the active buffer count
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @param[in] pv_ptr
+* Pointer to the buffer to be added
+*
+* @returns Returns 0 on success, -1 otherwise
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/* Registers pv_ptr under slot buf_id and bumps the active buffer count.
+ * Fails if buf_id is out of range or the slot already holds a different
+ * buffer. Re-adding the same pointer at the same id is a harmless no-op. */
+IH264_ERROR_T ih264_buf_mgr_add(buf_mgr_t *ps_buf_mgr,
+                                void *pv_ptr,
+                                WORD32 buf_id)
+{
+    IH264_ERROR_T ret = IH264_SUCCESS;
+
+    ret = ih264_buf_mgr_lock(ps_buf_mgr);
+    RETURN_IF((ret != IH264_SUCCESS), ret);
+
+    /* Check if buffer ID is within allowed range */
+    if(buf_id >= ps_buf_mgr->i4_max_buf_cnt)
+    {
+        ret = ih264_buf_mgr_unlock(ps_buf_mgr);
+        RETURN_IF((ret != IH264_SUCCESS), ret);
+
+        return IH264_FAIL;
+    }
+
+    /* Check if the current ID is being used to hold some other buffer */
+    if((ps_buf_mgr->apv_ptr[buf_id] != NULL) &&
+       (ps_buf_mgr->apv_ptr[buf_id] != pv_ptr))
+    {
+        ret = ih264_buf_mgr_unlock(ps_buf_mgr);
+        RETURN_IF((ret != IH264_SUCCESS), ret);
+
+        return IH264_FAIL;
+    }
+
+    /* Bug fix: count the buffer only on first registration. The original
+     * unconditionally incremented i4_active_buf_cnt, so re-adding the same
+     * pointer at the same id inflated the count and widened the scan range
+     * used by get_next_free/check_free/get_bufid into unregistered slots. */
+    if(ps_buf_mgr->apv_ptr[buf_id] == NULL)
+    {
+        ps_buf_mgr->i4_active_buf_cnt++;
+    }
+    ps_buf_mgr->apv_ptr[buf_id] = pv_ptr;
+
+    ret = ih264_buf_mgr_unlock(ps_buf_mgr);
+    RETURN_IF((ret != IH264_SUCCESS), ret);
+
+    return ret;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Gets the next free buffer.
+*
+* @par Description:
+* Returns the next free buffer available and sets the corresponding status
+* to DEC
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @param[in] pi4_buf_id
+* Pointer to the id of the free buffer
+*
+* @returns Pointer to the free buffer
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+void* ih264_buf_mgr_get_next_free(buf_mgr_t *ps_buf_mgr, WORD32 *pi4_buf_id)
+{
+    WORD32 i;
+    void *pv_free = NULL;
+    IH264_ERROR_T ret;
+
+    ret = ih264_buf_mgr_lock(ps_buf_mgr);
+    RETURN_IF((ret != IH264_SUCCESS), NULL);
+
+    /* Scan the registered slots for one that holds a buffer and has all
+     * status bits clear */
+    for(i = 0; i < ps_buf_mgr->i4_active_buf_cnt; i++)
+    {
+        if((0 == ps_buf_mgr->au4_status[i]) && (NULL != ps_buf_mgr->apv_ptr[i]))
+        {
+            *pi4_buf_id = i;
+            /* Mark the slot in use (DEC bit) before the lock is released */
+            ps_buf_mgr->au4_status[i] = 1;
+            pv_free = ps_buf_mgr->apv_ptr[i];
+            break;
+        }
+    }
+
+    ret = ih264_buf_mgr_unlock(ps_buf_mgr);
+    RETURN_IF((ret != IH264_SUCCESS), NULL);
+
+    /* NULL when no free buffer was found (or a mutex call failed) */
+    return pv_free;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Checks the buffer manager for free buffers available.
+*
+* @par Description:
+* Checks if there are any free buffers available
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @returns Returns 0 if available, -1 otherwise
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/* Reports whether at least one registered buffer currently has all of its
+ * status bits clear. Returns IH264_SUCCESS if a free buffer exists,
+ * IH264_FAIL otherwise (or on mutex failure). */
+IH264_ERROR_T ih264_buf_mgr_check_free(buf_mgr_t *ps_buf_mgr)
+{
+    WORD32 id;
+    IH264_ERROR_T ret;
+    IH264_ERROR_T rettmp;
+
+    rettmp = ih264_buf_mgr_lock(ps_buf_mgr);
+    /* Bug fix: propagate the lock failure. The original returned 'ret',
+     * which was still initialized to IH264_SUCCESS at this point, so a
+     * failed lock falsely reported that a free buffer was available. */
+    RETURN_IF((rettmp != IH264_SUCCESS), rettmp);
+
+    ret = IH264_FAIL;
+    for(id = 0; id < ps_buf_mgr->i4_active_buf_cnt; id++)
+    {
+        /* Free == slot holds a buffer and every status bit is zero */
+        if((ps_buf_mgr->au4_status[id] == 0) &&
+           (ps_buf_mgr->apv_ptr[id]))
+        {
+            ret = IH264_SUCCESS;
+            break;
+        }
+    }
+    rettmp = ih264_buf_mgr_unlock(ps_buf_mgr);
+    RETURN_IF((rettmp != IH264_SUCCESS), ret);
+
+    return ret;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Resets the status bits.
+*
+* @par Description:
+* resets the status bits that the mask contains (status corresponding to
+* the id)
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @param[in] buf_id
+* ID of the buffer status to be released
+*
+* @param[in] mask
+* Contains the bits that are to be reset
+*
+* @returns 0 if success, -1 otherwise
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+IH264_ERROR_T ih264_buf_mgr_release(buf_mgr_t *ps_buf_mgr,
+                                    WORD32 buf_id,
+                                    UWORD32 mask)
+{
+    IH264_ERROR_T ret = IH264_SUCCESS;
+    ret = ih264_buf_mgr_lock(ps_buf_mgr);
+    RETURN_IF((ret != IH264_SUCCESS), ret);
+
+
+    /* If the given id is pointing to an id which is not yet added */
+    if(buf_id >= ps_buf_mgr->i4_active_buf_cnt)
+    {
+        ret = ih264_buf_mgr_unlock(ps_buf_mgr);
+        RETURN_IF((ret != IH264_SUCCESS), ret);
+        return IH264_FAIL;
+    }
+
+    /* Clear the ownership bits named by the caller's mask (DISP/REF/...) */
+    ps_buf_mgr->au4_status[buf_id] &= ~mask;
+
+
+/* If, after clearing, only the in-use (DEC) bit remains set — i.e. both the
+ * REF and DISP bits are zero — drop the in-use bit as well so the slot
+ * becomes reclaimable by ih264_buf_mgr_get_next_free() */
+    if(ps_buf_mgr->au4_status[buf_id] == 1)
+    {
+        ps_buf_mgr->au4_status[buf_id] = 0;
+    }
+
+
+    ret = ih264_buf_mgr_unlock(ps_buf_mgr);
+    RETURN_IF((ret != IH264_SUCCESS), ret);
+
+    return ret;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Sets the status bit.
+*
+* @par Description:
+* sets the status bits that the mask contains (status corresponding to the
+* id)
+*
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @param[in] buf_id
+* ID of the buffer whose status needs to be modified
+*
+*
+* @param[in] mask
+* Contains the bits that are to be set
+*
+* @returns 0 if success, -1 otherwise
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+IH264_ERROR_T ih264_buf_mgr_set_status(buf_mgr_t *ps_buf_mgr,
+                                       WORD32 buf_id,
+                                       UWORD32 mask)
+{
+    IH264_ERROR_T ret;
+
+    ret = ih264_buf_mgr_lock(ps_buf_mgr);
+    RETURN_IF((ret != IH264_SUCCESS), ret);
+
+    /* Reject ids beyond the buffers registered so far */
+    if(buf_id >= ps_buf_mgr->i4_active_buf_cnt)
+    {
+        ret = ih264_buf_mgr_unlock(ps_buf_mgr);
+        RETURN_IF((ret != IH264_SUCCESS), ret);
+        return IH264_FAIL;
+    }
+
+    /* Setting a bit that is already set indicates a caller error */
+    if(0 != (ps_buf_mgr->au4_status[buf_id] & mask))
+    {
+        ret = ih264_buf_mgr_unlock(ps_buf_mgr);
+        RETURN_IF((ret != IH264_SUCCESS), ret);
+        return IH264_FAIL;
+    }
+
+    ps_buf_mgr->au4_status[buf_id] |= mask;
+
+    ret = ih264_buf_mgr_unlock(ps_buf_mgr);
+    RETURN_IF((ret != IH264_SUCCESS), ret);
+
+    return ret;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Returns the status of the buffer.
+*
+* @par Description:
+* Returns the status of the buffer corresponding to the id
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @param[in] buf_id
+* ID of the buffer status required
+*
+* @returns Status of the buffer corresponding to the id
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+WORD32 ih264_buf_mgr_get_status( buf_mgr_t *ps_buf_mgr, WORD32 buf_id )
+{
+    UWORD32 u4_status;
+    IH264_ERROR_T ret;
+
+    ret = ih264_buf_mgr_lock(ps_buf_mgr);
+    RETURN_IF((ret != IH264_SUCCESS), ret);
+
+    /* NOTE(review): buf_id is not range-checked here; callers must pass a
+     * valid id. On mutex failure the error code is returned in place of a
+     * status and is indistinguishable from one — TODO confirm acceptable. */
+    u4_status = ps_buf_mgr->au4_status[buf_id];
+
+    ret = ih264_buf_mgr_unlock(ps_buf_mgr);
+    RETURN_IF((ret != IH264_SUCCESS), ret);
+
+    return u4_status;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Gets the buffer from the buffer manager
+*
+* @par Description:
+* Returns the pointer to the buffer corresponding to the id
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @param[in] buf_id
+* ID of the buffer required
+*
+* @returns Pointer to the buffer required
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+void* ih264_buf_mgr_get_buf(buf_mgr_t *ps_buf_mgr, WORD32 buf_id)
+{
+    void *pv_ret;
+    IH264_ERROR_T ret;
+
+    ret = ih264_buf_mgr_lock(ps_buf_mgr);
+    RETURN_IF((ret != IH264_SUCCESS), NULL);
+
+    /* NOTE(review): buf_id is not range-checked; callers must pass a valid
+     * id. NULL is returned for an empty slot or on mutex failure. */
+    pv_ret = ps_buf_mgr->apv_ptr[buf_id];
+
+    ret = ih264_buf_mgr_unlock(ps_buf_mgr);
+    RETURN_IF((ret != IH264_SUCCESS), NULL);
+
+    return pv_ret;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Gets the buffer id from the buffer manager if the buffer is added to the
+* buffer manager
+*
+* @par Description:
+* Returns the buffer id corresponding to the given buffer if it exists
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @param[in] pv_buf
+* Pointer to the buffer
+*
+* @returns Buffer id if exists, else -1
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+WORD32 ih264_buf_mgr_get_bufid(buf_mgr_t *ps_buf_mgr, void *pv_buf)
+{
+    WORD32 i;
+    WORD32 found_id = -1;
+    IH264_ERROR_T ret;
+
+    ret = ih264_buf_mgr_lock(ps_buf_mgr);
+    RETURN_IF((ret != IH264_SUCCESS), ret);
+
+    /* Linear search over the registered slots for a pointer match */
+    for(i = 0; i < ps_buf_mgr->i4_active_buf_cnt; i++)
+    {
+        if(pv_buf == ps_buf_mgr->apv_ptr[i])
+        {
+            found_id = i;
+            break;
+        }
+    }
+
+    ret = ih264_buf_mgr_unlock(ps_buf_mgr);
+    RETURN_IF((ret != IH264_SUCCESS), ret);
+
+    /* -1 when pv_buf was never added to this manager */
+    return found_id;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Gets the no.of active buffer
+*
+* @par Description:
+* Return the number of active buffers in the buffer manager
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @returns number of active buffers
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+UWORD32 ih264_buf_mgr_get_num_active_buf(buf_mgr_t *ps_buf_mgr)
+{
+    UWORD32 u4_cnt = 0;
+    IH264_ERROR_T ret;
+
+    ret = ih264_buf_mgr_lock(ps_buf_mgr);
+    RETURN_IF((ret != IH264_SUCCESS), ret);
+
+    /* Snapshot of the count of buffers registered via ih264_buf_mgr_add() */
+    u4_cnt = ps_buf_mgr->i4_active_buf_cnt;
+
+    ret = ih264_buf_mgr_unlock(ps_buf_mgr);
+    RETURN_IF((ret != IH264_SUCCESS), ret);
+
+    return u4_cnt;
+}
diff --git a/common/ih264_buf_mgr.h b/common/ih264_buf_mgr.h
new file mode 100755
index 0000000..52efa70
--- /dev/null
+++ b/common/ih264_buf_mgr.h
@@ -0,0 +1,122 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_buf_mgr.h
+*
+* @brief
+* Function declarations used for buffer management
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IH264_BUF_MGR_H_
+#define _IH264_BUF_MGR_H_
+
+/** Maximum number of buffers a buf_mgr_t can track (size of its arrays) */
+#define BUF_MGR_MAX_CNT 64
+
+/** Status flag: buffer is held by the codec (encoder/decoder) itself */
+#define BUF_MGR_CODEC (1 << 1)
+
+/** Status flag: buffer is held as a reference picture */
+#define BUF_MGR_REF (1 << 2)
+
+/** Status flag: buffer is held for I/O — display/output for a decoder, capture/input for an encoder */
+#define BUF_MGR_IO (1 << 3)
+
+typedef struct
+{
+    /**
+     * Mutex used to keep the functions thread-safe; storage lives
+     * immediately after this structure in the memory passed to init
+     */
+    void *pv_mutex;
+
+    /**
+     * Maximum number of buffers this manager can hold (BUF_MGR_MAX_CNT)
+     */
+    WORD32 i4_max_buf_cnt;
+
+    /**
+     * Number of buffers registered so far via ih264_buf_mgr_add();
+     * scan loops iterate ids in [0, i4_active_buf_cnt)
+     */
+    WORD32 i4_active_buf_cnt;
+
+    /**
+     * Per-buffer status bit-mask; 0 means the buffer is free
+     */
+    UWORD32 au4_status[BUF_MGR_MAX_CNT];
+
+    /* The low bits of each status word are:    */
+
+    /* Bit 0 - IN USE                           */
+    /* Bit 1 - CODEC                            */
+    /* Bit 2 - REF                              */
+    /* Bit 3 - DISP/IO/RECON                    */
+    void *apv_ptr[BUF_MGR_MAX_CNT];
+
+}buf_mgr_t;
+
+// Returns the number of bytes needed for a buffer manager context
+WORD32 ih264_buf_mgr_size(void);
+
+// Destroys the mutex owned by the buffer manager context
+IH264_ERROR_T ih264_buf_mgr_free(buf_mgr_t *ps_buf_mgr);
+
+// Initializes the buffer manager inside caller-provided memory of
+// ih264_buf_mgr_size() bytes; returns the context pointer
+void *ih264_buf_mgr_init(void *pv_buf);
+
+// Registers a buffer under buf_id; IH264_SUCCESS on success, IH264_FAIL if
+// buf_id is out of range or the slot holds a different buffer
+IH264_ERROR_T ih264_buf_mgr_add(buf_mgr_t *ps_buf_mgr,
+                                void *pv_ptr,
+                                WORD32 buf_id);
+
+// Returns the first buffer whose status is clear (marking it in use) and
+// writes its id to *pi4_id; NULL if none is free
+void* ih264_buf_mgr_get_next_free(buf_mgr_t *ps_buf_mgr, WORD32 *pi4_id);
+
+// IH264_SUCCESS if at least one registered buffer is free
+IH264_ERROR_T ih264_buf_mgr_check_free(buf_mgr_t *ps_buf_mgr);
+
+// Clears the status bits in mask (DISP/REF/DEC) for buffer id; frees the
+// slot when only the in-use bit remains
+IH264_ERROR_T ih264_buf_mgr_release(buf_mgr_t *ps_buf_mgr,
+                                    WORD32 id,
+                                    UWORD32 mask);
+
+// Sets the status bits in mask (DISP/REF/DEC) for buffer id; fails if any
+// of those bits are already set
+IH264_ERROR_T ih264_buf_mgr_set_status(buf_mgr_t *ps_buf_mgr,
+                                       WORD32 id,
+                                       UWORD32 mask);
+
+// Returns the status bit-mask of buffer id
+WORD32 ih264_buf_mgr_get_status(buf_mgr_t *ps_buf_mgr, WORD32 id);
+
+// Returns the buffer registered under id (NULL if the slot is empty)
+void* ih264_buf_mgr_get_buf(buf_mgr_t *ps_buf_mgr, WORD32 id);
+// Returns the id under which pv_buf was registered, or -1 if not found
+WORD32 ih264_buf_mgr_get_bufid(buf_mgr_t *ps_buf_mgr, void *pv_buf);
+
+// Returns the number of buffers currently registered
+UWORD32 ih264_buf_mgr_get_num_active_buf(buf_mgr_t *ps_buf_mgr);
+
+
+
+#endif /* _IH264_BUF_MGR_H_ */
diff --git a/common/ih264_cabac_tables.c b/common/ih264_cabac_tables.c
new file mode 100755
index 0000000..118ca12
--- /dev/null
+++ b/common/ih264_cabac_tables.c
@@ -0,0 +1,10869 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+
+/**
+******************************************************************************
+* @file
+* ih264_cabac_tables.c
+*
+* @brief
+*  This file contains H264 cabac tables for init contexts, rlps and
+*  cabac state transitions
+*
+* @author
+*  Ittiam
+*
+* @par List of Tables
+*  - gau1_ih264_cabac_rlps[][]
+*  - gau1_ih264_next_state[]
+*  - gau1_ih264_cab_ctxts[][][]
+*
+******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* User include files */
+#include "ih264_typedefs.h"
+#include "ih264_cabac_tables.h"
+
+
+/*****************************************************************************/
+/* Extern global definitions */
+/*****************************************************************************/
+
+/**
+ ******************************************************************************
+ * @brief Table for rangeTabLPS depending on pStateIdx and qCodIRangeIdx
+ * input : pStateIdx(0-63) and qCodIRangeIdx(0-3) [(Range >> 6) & 0x3]
+ * output : RLPS
+ *
+ * @remarks See Table 9-35 of H264 spec for rangeTabLPS
+ ******************************************************************************
+ */
+const UWORD8 gau1_ih264_cabac_rlps[64][4] =
+{
+    /* rows: pStateIdx 0..63; cols: qCodIRangeIdx 0..3 (spec Table 9-35) */
+    { 128, 176, 208, 240},
+    { 128, 167, 197, 227},
+    { 128, 158, 187, 216},
+    { 123, 150, 178, 205},
+    { 116, 142, 169, 195},
+    { 111, 135, 160, 185},
+    { 105, 128, 152, 175},
+    { 100, 122, 144, 166},
+    {  95, 116, 137, 158},
+    {  90, 110, 130, 150},
+    {  85, 104, 123, 142},
+    {  81,  99, 117, 135},
+    {  77,  94, 111, 128},
+    {  73,  89, 105, 122},
+    {  69,  85, 100, 116},
+    {  66,  80,  95, 110},
+    {  62,  76,  90, 104},
+    {  59,  72,  86,  99},
+    {  56,  69,  81,  94},
+    {  53,  65,  77,  89},
+    {  51,  62,  73,  85},
+    {  48,  59,  69,  80},
+    {  46,  56,  66,  76},
+    {  43,  53,  63,  72},
+    {  41,  50,  59,  69},
+    {  39,  48,  56,  65},
+    {  37,  45,  54,  62},
+    {  35,  43,  51,  59},
+    {  33,  41,  48,  56},
+    {  32,  39,  46,  53},
+    {  30,  37,  43,  50},
+    {  29,  35,  41,  48},
+    {  27,  33,  39,  45},
+    {  26,  31,  37,  43},
+    {  24,  30,  35,  41},
+    {  23,  28,  33,  39},
+    {  22,  27,  32,  37},
+    {  21,  26,  30,  35},
+    {  20,  24,  29,  33},
+    {  19,  23,  27,  31},
+    {  18,  22,  26,  30},
+    {  17,  21,  25,  28},
+    {  16,  20,  23,  27},
+    {  15,  19,  22,  25},
+    {  14,  18,  21,  24},
+    {  14,  17,  20,  23},
+    {  13,  16,  19,  22},
+    {  12,  15,  18,  21},
+    {  12,  14,  17,  20},
+    {  11,  14,  16,  19},
+    {  11,  13,  15,  18},
+    {  10,  12,  15,  17},
+    {  10,  12,  14,  16},
+    {   9,  11,  13,  15},
+    {   9,  11,  12,  14},
+    {   8,  10,  12,  14},
+    {   8,   9,  11,  13},
+    {   7,   9,  11,  12},
+    {   7,   9,  10,  12},
+    {   7,   8,  10,  11},
+    {   6,   8,   9,  11},
+    {   6,   7,   9,  10},
+    {   6,   7,   8,   9},
+    {   2,   2,   2,   2}
+};
+
+/**
+ ******************************************************************************
+ * @brief  probability+MPS state transition tables based on cur State and bin
+ * input : curpState[bits7-2] | curMPS[bit1] | decodedBin[bit0]
+ * output : nextpState[bits6-1] | nextMPS[bit0]
+ * @remarks Modified form of Table-9-36 State Transition table in H264 spec
+ ******************************************************************************
+ */
+const UWORD8 gau1_ih264_next_state[64 * 2 * 2] =
+{
+/*****************************************************************************/
+/* index: curpState<<2 | curMPS<<1 | bin                                     */
+/*   m=0,b=0  |  m=0,b=1  |  m=1,b=0  |  m=1,b=1                             */
+/*****************************************************************************/
+      2,   1,   0,   3,/* mps reversal for m=0,b=1 / m=1,b=0 (state 0) */
+      4,   0,   1,   5,
+      6,   2,   3,   7,
+      8,   4,   5,   9,
+     10,   4,   5,  11,
+     12,   8,   9,  13,
+     14,   8,   9,  15,
+     16,  10,  11,  17,
+     18,  12,  13,  19,
+     20,  14,  15,  21,
+     22,  16,  17,  23,
+     24,  18,  19,  25,
+     26,  18,  19,  27,
+     28,  22,  23,  29,
+     30,  22,  23,  31,
+     32,  24,  25,  33,
+     34,  26,  27,  35,
+     36,  26,  27,  37,
+     38,  30,  31,  39,
+     40,  30,  31,  41,
+     42,  32,  33,  43,
+     44,  32,  33,  45,
+     46,  36,  37,  47,
+     48,  36,  37,  49,
+     50,  38,  39,  51,
+     52,  38,  39,  53,
+     54,  42,  43,  55,
+     56,  42,  43,  57,
+     58,  44,  45,  59,
+     60,  44,  45,  61,
+     62,  46,  47,  63,
+     64,  48,  49,  65,
+     66,  48,  49,  67,
+     68,  50,  51,  69,
+     70,  52,  53,  71,
+     72,  52,  53,  73,
+     74,  54,  55,  75,
+     76,  54,  55,  77,
+     78,  56,  57,  79,
+     80,  58,  59,  81,
+     82,  58,  59,  83,
+     84,  60,  61,  85,
+     86,  60,  61,  87,
+     88,  60,  61,  89,
+     90,  62,  63,  91,
+     92,  64,  65,  93,
+     94,  64,  65,  95,
+     96,  66,  67,  97,
+     98,  66,  67,  99,
+    100,  66,  67, 101,
+    102,  68,  69, 103,
+    104,  68,  69, 105,
+    106,  70,  71, 107,
+    108,  70,  71, 109,
+    110,  70,  71, 111,
+    112,  72,  73, 113,
+    114,  72,  73, 115,
+    116,  72,  73, 117,
+    118,  74,  75, 119,
+    120,  74,  75, 121,
+    122,  74,  75, 123,
+    124,  76,  77, 125,
+    124,  76,  77, 125,/* state 62 saturates: pState stays at 62 on MPS */
+    126, 126, 127, 127 /* state 63 (terminal) never transitions */
+};
+
+
+/*
+******************************************************************************
+* As per H264 standard the cabac initialization of context variables
+* are generated using following logic
+* (ref: section 9.3.1.1 of ITU-T Rec. H.264 (03/2005))
+*
+* The two values assigned to pStateIdx and valMPS during this initialization
+* are derived from SliceQPY
+*
+* Given the two table entries [m, n] (for a given slice type, context index and
+* cabac_init_idc), the initialization is specified by the following pseudo-code process
+*
+* preCtxState = Clip3( 1, 126, ( ( m * Clip3( 0, 51, SliceQPY ) ) >> 4 ) + n )
+* if( preCtxState <= 63 ) {
+* pStateIdx = 63 - preCtxState
+* valMPS = 0
+* } else {
+* pStateIdx = preCtxState - 64
+* valMPS = 1
+* }
+******************************************************************************
+*/
+
+/**
+ ******************************************************************************
+ * @brief Init context tables for all combinations of qp and cabac_init_idc
+ * @remarks Packing format MPS in lsb and pState in bits[1-6]
+ ******************************************************************************
+ */
+const UWORD8 gau1_ih264_cab_ctxts[IH264_NUM_CABAC_INIT_IDC_PLUS_ONE][IH264_MAX_QP][IH264_NUM_CABAC_CTXTS] =
+{
+
+ {
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 0 */
+
+ 124, 18, 21, 124, 18, 21, 125, 81, 20, 18,
+ 24, 60, 122, 124, 108, 28, 109, 12, 29, 3,
+ 2, 28, 19, 26, 1, 40, 124, 7, 53, 81,
+ 125, 81, 7, 29, 3, 2, 45, 63, 4, 36,
+ 11, 35, 65, 16, 7, 45, 49, 10, 25, 61,
+ 18, 11, 35, 49, 7, 21, 21, 33, 17, 10,
+ 44, 0, 0, 0, 39, 45, 67, 17, 44, 2,
+ 36, 29, 65, 125, 69, 75, 7, 37, 61, 39,
+ 93, 55, 77, 59, 125, 57, 51, 65, 89, 34,
+ 3, 12, 59, 21, 57, 47, 125, 18, 6, 8,
+ 11, 30, 9, 11, 49, 43, 29, 23, 27, 18,
+ 26, 9, 26, 42, 35, 0, 13, 7, 12, 25,
+ 56, 1, 4, 56, 76, 78, 68, 54, 59, 19,
+ 19, 34, 28, 73, 20, 20, 20, 4, 14, 14,
+ 0, 6, 2, 12, 11, 12, 48, 24, 9, 1,
+ 4, 0, 26, 48, 38, 22, 30, 6, 8, 8,
+ 60, 38, 40, 29, 6, 11, 70, 46, 38, 28,
+ 34, 38, 24, 32, 48, 2, 34, 18, 18, 10,
+ 0, 24, 12, 20, 22, 16, 36, 54, 20, 37,
+ 16, 29, 34, 64, 41, 112, 124, 120, 118, 124,
+ 124, 114, 114, 108, 88, 72, 66, 86, 58, 13,
+ 7, 8, 7, 66, 62, 56, 68, 64, 50, 40,
+ 44, 0, 8, 1, 61, 51, 89, 25, 38, 36,
+ 22, 1, 8, 13, 23, 37, 77, 27, 78, 42,
+ 30, 16, 8, 15, 39, 47, 111, 10, 68, 54,
+ 50, 40, 16, 10, 1, 21, 53, 13, 68, 64,
+ 42, 8, 10, 17, 35, 67, 10, 116, 98, 90,
+ 72, 46, 10, 13, 31, 43, 124, 85, 85, 47,
+ 101, 93, 69, 93, 85, 79, 87, 89, 97, 65,
+ 63, 55, 59, 61, 45, 7, 33, 43, 13, 6,
+ 10, 4, 26, 26, 28, 18, 44, 34, 24, 28,
+ 22, 44, 32, 16, 44, 38, 26, 20, 28, 0,
+ 1, 11, 8, 13, 38, 64, 40, 20, 58, 50,
+ 22, 46, 62, 38, 50, 26, 12, 40, 104, 98,
+ 104, 104, 108, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 68, 124, 124, 124, 124, 124, 124, 108,
+ 74, 72, 12, 37, 23, 67, 123, 124, 124, 124,
+ 114, 110, 106, 82, 88, 62, 64, 44, 38, 32,
+ 3, 15, 6, 0, 3, 78, 86, 80, 62, 80,
+ 78, 46, 62, 68, 42, 12, 20, 4, 45, 46,
+ 24, 8, 31, 15, 11, 13, 5, 9, 19, 11,
+ 13, 7, 2, 13, 5, 3, 0, 124, 124, 124,
+ 124, 124, 120, 108, 72, 8, 5, 56, 42, 36,
+ 30, 14, 6, 2, 5, 25, 43, 35, 27, 35,
+ 33, 19, 21, 39, 15, 7, 4, 5, 5, 8,
+ 8, 124, 124, 124, 124, 122, 114, 92, 58, 2,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 1 */
+
+ 124, 18, 21, 124, 18, 21, 123, 77, 22, 20,
+ 24, 58, 120, 124, 108, 28, 103, 12, 27, 1,
+ 2, 28, 17, 24, 3, 40, 124, 9, 55, 81,
+ 121, 77, 7, 27, 1, 2, 43, 59, 6, 36,
+ 9, 33, 63, 16, 7, 43, 49, 10, 23, 59,
+ 18, 11, 33, 49, 5, 19, 19, 31, 15, 10,
+ 44, 0, 0, 0, 37, 45, 67, 15, 44, 2,
+ 36, 27, 63, 121, 65, 71, 3, 33, 57, 37,
+ 89, 51, 73, 57, 123, 55, 49, 63, 87, 36,
+ 1, 14, 57, 19, 55, 45, 121, 18, 6, 8,
+ 11, 32, 9, 9, 47, 41, 27, 21, 25, 18,
+ 26, 7, 26, 42, 33, 0, 11, 7, 12, 23,
+ 56, 1, 4, 56, 74, 78, 68, 54, 57, 17,
+ 17, 34, 28, 71, 20, 20, 20, 6, 14, 14,
+ 2, 8, 4, 12, 9, 12, 48, 24, 9, 1,
+ 4, 0, 26, 46, 38, 22, 30, 8, 10, 8,
+ 58, 38, 40, 27, 6, 11, 70, 46, 38, 28,
+ 34, 38, 24, 32, 48, 2, 34, 18, 18, 10,
+ 0, 24, 12, 20, 22, 16, 36, 54, 20, 35,
+ 16, 27, 34, 62, 39, 110, 124, 118, 116, 122,
+ 124, 112, 112, 104, 86, 70, 64, 82, 56, 15,
+ 7, 8, 7, 64, 60, 54, 66, 62, 48, 38,
+ 42, 0, 8, 1, 59, 49, 87, 23, 40, 36,
+ 22, 0, 10, 11, 21, 35, 73, 25, 78, 42,
+ 30, 16, 10, 13, 37, 45, 107, 10, 70, 56,
+ 50, 40, 18, 10, 1, 19, 51, 13, 70, 64,
+ 42, 8, 12, 15, 33, 65, 10, 116, 98, 90,
+ 72, 46, 10, 11, 29, 41, 124, 83, 83, 45,
+ 97, 89, 67, 89, 81, 75, 83, 85, 93, 63,
+ 61, 53, 57, 57, 43, 7, 31, 41, 11, 6,
+ 10, 4, 26, 26, 26, 16, 44, 34, 26, 28,
+ 22, 44, 32, 16, 44, 38, 26, 20, 28, 0,
+ 1, 9, 10, 13, 38, 64, 40, 20, 58, 50,
+ 24, 46, 60, 38, 50, 26, 12, 38, 104, 98,
+ 104, 102, 106, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 66, 124, 124, 124, 124, 124, 124, 106,
+ 72, 70, 12, 35, 21, 63, 117, 124, 124, 124,
+ 112, 106, 104, 80, 84, 60, 62, 42, 36, 30,
+ 5, 15, 6, 0, 5, 76, 84, 78, 60, 78,
+ 76, 44, 60, 66, 40, 10, 18, 2, 45, 46,
+ 24, 8, 29, 13, 9, 11, 3, 7, 15, 9,
+ 11, 5, 6, 9, 3, 0, 4, 124, 124, 124,
+ 124, 124, 116, 102, 68, 4, 3, 58, 44, 38,
+ 32, 16, 8, 4, 3, 23, 41, 33, 25, 33,
+ 29, 15, 19, 37, 13, 5, 6, 3, 3, 8,
+ 8, 124, 124, 124, 124, 116, 108, 86, 52, 1,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 2 */
+
+ 124, 18, 21, 124, 18, 21, 119, 75, 22, 20,
+ 24, 56, 118, 122, 108, 28, 99, 12, 25, 0,
+ 2, 26, 17, 22, 5, 38, 120, 13, 57, 83,
+ 115, 75, 7, 25, 0, 2, 43, 57, 6, 34,
+ 9, 33, 61, 16, 7, 43, 49, 10, 23, 57,
+ 18, 11, 33, 49, 5, 19, 19, 31, 15, 10,
+ 44, 0, 0, 0, 35, 45, 67, 15, 42, 2,
+ 36, 27, 63, 117, 61, 67, 1, 29, 55, 35,
+ 87, 49, 71, 55, 119, 55, 49, 63, 85, 36,
+ 1, 14, 55, 19, 53, 45, 119, 18, 6, 8,
+ 11, 32, 9, 9, 47, 41, 27, 21, 25, 18,
+ 26, 7, 26, 42, 33, 0, 11, 7, 12, 23,
+ 54, 1, 4, 54, 72, 76, 66, 52, 55, 17,
+ 17, 32, 26, 71, 18, 20, 20, 6, 14, 14,
+ 4, 8, 4, 12, 9, 12, 46, 24, 11, 1,
+ 4, 1, 26, 44, 38, 22, 28, 8, 10, 8,
+ 56, 38, 38, 27, 6, 13, 68, 46, 38, 28,
+ 34, 38, 24, 32, 48, 2, 34, 18, 18, 10,
+ 0, 24, 12, 20, 22, 16, 34, 52, 18, 35,
+ 16, 27, 32, 60, 39, 106, 124, 114, 112, 118,
+ 120, 108, 108, 100, 82, 66, 60, 78, 52, 17,
+ 7, 8, 9, 62, 58, 52, 64, 58, 46, 36,
+ 40, 1, 6, 3, 59, 49, 85, 23, 40, 36,
+ 22, 0, 10, 11, 21, 35, 71, 23, 78, 42,
+ 30, 16, 10, 13, 35, 43, 103, 10, 70, 56,
+ 50, 40, 18, 10, 1, 19, 49, 13, 70, 64,
+ 42, 8, 12, 15, 33, 63, 10, 114, 96, 88,
+ 70, 46, 10, 11, 29, 41, 124, 81, 81, 43,
+ 95, 87, 65, 87, 79, 73, 81, 83, 89, 61,
+ 59, 53, 55, 55, 43, 9, 31, 39, 11, 6,
+ 8, 4, 24, 24, 24, 14, 42, 34, 26, 28,
+ 20, 42, 32, 16, 42, 36, 26, 20, 26, 0,
+ 1, 9, 10, 13, 36, 62, 38, 20, 56, 48,
+ 24, 44, 58, 38, 50, 24, 10, 34, 102, 96,
+ 102, 100, 104, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 64, 124, 124, 124, 124, 124, 124, 102,
+ 70, 68, 12, 33, 21, 61, 113, 120, 120, 124,
+ 108, 102, 100, 76, 80, 58, 58, 40, 32, 28,
+ 7, 17, 4, 0, 7, 74, 82, 74, 56, 74,
+ 72, 42, 56, 62, 38, 8, 16, 0, 47, 44,
+ 22, 6, 29, 13, 9, 9, 3, 5, 13, 7,
+ 9, 3, 8, 7, 1, 2, 6, 124, 124, 124,
+ 124, 120, 110, 96, 62, 0, 3, 58, 44, 38,
+ 32, 18, 8, 4, 3, 23, 41, 33, 23, 33,
+ 27, 13, 19, 35, 11, 3, 6, 3, 1, 8,
+ 8, 124, 124, 124, 120, 110, 100, 78, 46, 7,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 3 */
+
+ 124, 18, 21, 124, 18, 21, 115, 71, 24, 20,
+ 22, 52, 114, 120, 108, 28, 95, 12, 23, 2,
+ 2, 24, 17, 20, 7, 38, 116, 15, 59, 83,
+ 109, 73, 7, 23, 2, 2, 41, 55, 8, 34,
+ 9, 31, 59, 14, 9, 43, 49, 10, 23, 57,
+ 18, 11, 33, 49, 3, 19, 19, 31, 13, 10,
+ 44, 0, 0, 0, 35, 45, 67, 13, 40, 2,
+ 36, 27, 63, 113, 57, 65, 2, 25, 53, 33,
+ 83, 47, 69, 53, 115, 53, 49, 61, 83, 36,
+ 1, 14, 55, 19, 53, 43, 115, 18, 4, 6,
+ 13, 32, 9, 9, 45, 41, 25, 21, 23, 18,
+ 26, 7, 26, 40, 33, 0, 11, 7, 12, 23,
+ 52, 1, 4, 52, 70, 74, 64, 50, 55, 15,
+ 17, 30, 26, 69, 18, 20, 20, 6, 14, 14,
+ 6, 8, 4, 12, 7, 12, 44, 24, 13, 1,
+ 4, 1, 24, 42, 38, 22, 26, 8, 10, 8,
+ 52, 38, 36, 27, 6, 13, 66, 46, 38, 28,
+ 34, 38, 24, 32, 48, 2, 32, 18, 18, 10,
+ 0, 22, 10, 18, 20, 14, 32, 50, 18, 35,
+ 14, 27, 30, 56, 39, 104, 124, 110, 108, 114,
+ 116, 104, 104, 96, 78, 64, 58, 74, 48, 19,
+ 7, 8, 9, 60, 56, 50, 60, 56, 42, 34,
+ 38, 3, 6, 3, 59, 49, 85, 21, 40, 36,
+ 22, 0, 10, 11, 21, 33, 69, 23, 78, 42,
+ 30, 16, 12, 11, 33, 41, 99, 10, 70, 56,
+ 50, 40, 20, 10, 1, 19, 49, 13, 70, 64,
+ 40, 8, 12, 15, 33, 61, 10, 114, 96, 86,
+ 68, 46, 10, 11, 27, 39, 124, 79, 79, 43,
+ 93, 85, 63, 83, 77, 71, 79, 79, 87, 61,
+ 57, 53, 55, 51, 43, 9, 31, 39, 11, 4,
+ 8, 4, 22, 22, 22, 12, 42, 32, 26, 26,
+ 20, 42, 30, 16, 40, 36, 24, 20, 24, 0,
+ 3, 9, 10, 15, 36, 62, 36, 20, 54, 48,
+ 24, 42, 56, 36, 48, 22, 10, 32, 100, 94,
+ 102, 98, 102, 122, 124, 124, 124, 124, 124, 124,
+ 124, 124, 62, 124, 124, 124, 124, 124, 124, 98,
+ 68, 66, 12, 31, 21, 59, 109, 116, 116, 124,
+ 104, 98, 96, 74, 76, 54, 56, 38, 30, 24,
+ 9, 19, 4, 1, 9, 72, 78, 72, 52, 70,
+ 68, 38, 54, 58, 34, 6, 12, 3, 49, 42,
+ 20, 4, 29, 11, 9, 9, 1, 5, 11, 5,
+ 7, 1, 10, 5, 0, 6, 8, 124, 124, 124,
+ 124, 116, 104, 90, 56, 3, 1, 60, 46, 40,
+ 32, 20, 10, 4, 1, 21, 41, 31, 23, 31,
+ 25, 11, 19, 35, 11, 3, 6, 1, 0, 8,
+ 8, 124, 124, 124, 114, 104, 92, 70, 38, 11,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 4 */
+
+ 124, 18, 21, 124, 18, 21, 113, 69, 24, 20,
+ 22, 50, 112, 116, 108, 28, 89, 10, 21, 2,
+ 2, 22, 17, 18, 9, 36, 112, 19, 61, 85,
+ 103, 71, 7, 21, 2, 2, 41, 53, 8, 32,
+ 9, 31, 59, 14, 9, 41, 49, 10, 23, 55,
+ 16, 13, 33, 49, 3, 17, 19, 29, 13, 10,
+ 44, 0, 0, 0, 33, 47, 67, 13, 38, 2,
+ 36, 27, 63, 111, 55, 61, 4, 23, 51, 31,
+ 81, 43, 67, 51, 111, 53, 47, 61, 81, 36,
+ 1, 14, 53, 19, 51, 43, 113, 16, 4, 6,
+ 13, 32, 9, 9, 45, 41, 25, 21, 23, 18,
+ 24, 7, 26, 40, 33, 0, 11, 7, 12, 23,
+ 52, 3, 4, 52, 68, 72, 62, 48, 53, 15,
+ 17, 28, 24, 69, 16, 20, 18, 6, 14, 14,
+ 8, 10, 4, 10, 7, 10, 42, 22, 15, 1,
+ 4, 3, 24, 40, 36, 20, 26, 10, 10, 8,
+ 50, 36, 34, 27, 6, 15, 66, 46, 38, 28,
+ 34, 38, 24, 32, 46, 2, 32, 18, 18, 10,
+ 1, 22, 10, 18, 20, 14, 32, 48, 16, 35,
+ 14, 27, 28, 54, 39, 100, 124, 106, 104, 110,
+ 112, 100, 100, 92, 74, 60, 54, 68, 44, 21,
+ 7, 6, 11, 58, 54, 48, 58, 52, 40, 32,
+ 34, 3, 4, 5, 59, 49, 83, 21, 40, 36,
+ 22, 0, 10, 11, 21, 33, 67, 21, 78, 42,
+ 30, 16, 12, 11, 33, 41, 95, 10, 70, 56,
+ 50, 40, 20, 10, 1, 19, 47, 13, 70, 62,
+ 40, 8, 12, 15, 33, 61, 10, 112, 94, 84,
+ 66, 46, 10, 11, 27, 39, 124, 77, 77, 41,
+ 89, 83, 61, 81, 73, 69, 75, 77, 83, 59,
+ 57, 51, 53, 49, 41, 11, 31, 37, 11, 4,
+ 6, 2, 20, 20, 20, 10, 40, 32, 26, 26,
+ 18, 40, 30, 16, 38, 34, 24, 18, 22, 1,
+ 3, 9, 10, 15, 34, 60, 34, 20, 52, 46,
+ 24, 40, 54, 36, 48, 20, 8, 28, 98, 94,
+ 100, 96, 98, 120, 124, 124, 124, 124, 124, 124,
+ 124, 124, 58, 124, 124, 124, 124, 124, 124, 94,
+ 66, 62, 12, 29, 19, 57, 105, 114, 112, 120,
+ 102, 94, 92, 70, 72, 52, 52, 34, 26, 22,
+ 11, 21, 2, 1, 11, 68, 76, 68, 50, 66,
+ 64, 36, 50, 54, 32, 4, 10, 5, 49, 40,
+ 20, 2, 29, 11, 7, 7, 1, 3, 9, 5,
+ 5, 0, 12, 3, 2, 8, 10, 124, 124, 124,
+ 122, 110, 98, 84, 50, 9, 1, 60, 46, 40,
+ 34, 20, 10, 6, 1, 21, 39, 31, 21, 31,
+ 23, 9, 19, 33, 9, 1, 6, 1, 2, 8,
+ 8, 124, 124, 122, 108, 98, 84, 62, 32, 17,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 5 */
+
+ 124, 18, 21, 124, 18, 21, 109, 65, 24, 20,
+ 20, 46, 108, 114, 108, 28, 85, 10, 19, 4,
+ 2, 22, 15, 16, 11, 36, 108, 23, 63, 85,
+ 97, 67, 7, 19, 4, 2, 41, 51, 8, 32,
+ 9, 31, 57, 14, 11, 41, 49, 10, 23, 53,
+ 16, 13, 33, 49, 1, 17, 17, 29, 11, 10,
+ 44, 0, 0, 0, 33, 47, 67, 11, 36, 2,
+ 36, 25, 63, 107, 51, 59, 8, 19, 47, 29,
+ 79, 41, 65, 49, 107, 51, 47, 59, 79, 36,
+ 1, 14, 53, 19, 51, 41, 109, 16, 4, 6,
+ 13, 32, 9, 7, 43, 41, 25, 21, 21, 18,
+ 24, 7, 26, 40, 31, 0, 9, 7, 12, 23,
+ 50, 3, 4, 50, 66, 72, 60, 46, 51, 13,
+ 17, 26, 24, 67, 16, 20, 18, 6, 14, 14,
+ 10, 10, 4, 10, 7, 10, 40, 22, 17, 1,
+ 4, 3, 22, 38, 36, 20, 24, 10, 10, 8,
+ 48, 36, 32, 27, 6, 15, 64, 46, 38, 28,
+ 34, 38, 24, 32, 46, 2, 32, 18, 18, 10,
+ 1, 22, 10, 16, 20, 14, 30, 46, 16, 35,
+ 12, 27, 26, 52, 39, 98, 122, 104, 102, 106,
+ 108, 96, 96, 88, 70, 56, 50, 64, 42, 23,
+ 7, 6, 11, 56, 52, 46, 56, 50, 36, 30,
+ 32, 5, 4, 5, 59, 49, 83, 21, 40, 36,
+ 22, 0, 10, 9, 19, 31, 65, 21, 78, 42,
+ 30, 16, 12, 9, 31, 39, 91, 10, 70, 56,
+ 50, 40, 20, 10, 1, 19, 45, 13, 72, 62,
+ 38, 8, 12, 15, 33, 59, 10, 112, 92, 82,
+ 64, 46, 10, 11, 27, 37, 124, 75, 75, 39,
+ 87, 81, 59, 79, 71, 67, 73, 73, 79, 57,
+ 55, 51, 53, 47, 41, 11, 29, 35, 11, 2,
+ 6, 2, 20, 18, 18, 8, 38, 30, 26, 24,
+ 18, 40, 30, 16, 36, 32, 24, 18, 20, 1,
+ 3, 9, 10, 15, 32, 60, 34, 20, 50, 44,
+ 24, 38, 52, 34, 46, 18, 6, 24, 96, 92,
+ 100, 94, 96, 116, 124, 124, 124, 124, 124, 124,
+ 124, 124, 56, 124, 124, 124, 124, 124, 122, 90,
+ 64, 60, 12, 27, 19, 55, 101, 110, 110, 116,
+ 98, 90, 88, 68, 68, 50, 48, 32, 22, 18,
+ 13, 23, 2, 1, 13, 66, 72, 64, 46, 64,
+ 62, 32, 48, 52, 28, 2, 8, 7, 51, 40,
+ 18, 0, 27, 9, 7, 7, 0, 1, 7, 3,
+ 3, 2, 16, 1, 4, 10, 14, 124, 124, 124,
+ 116, 106, 92, 78, 44, 13, 1, 62, 48, 42,
+ 34, 22, 10, 6, 0, 19, 39, 31, 19, 29,
+ 21, 7, 17, 31, 9, 1, 6, 0, 4, 8,
+ 8, 124, 124, 116, 102, 92, 78, 54, 24, 23,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 6 */
+
+ 124, 18, 23, 124, 18, 23, 105, 63, 26, 20,
+ 20, 44, 106, 112, 108, 28, 81, 10, 19, 6,
+ 2, 20, 15, 14, 13, 34, 106, 25, 65, 87,
+ 91, 65, 7, 19, 6, 2, 39, 49, 10, 30,
+ 7, 29, 55, 12, 11, 41, 49, 10, 21, 53,
+ 16, 13, 31, 49, 1, 17, 17, 29, 11, 10,
+ 44, 0, 0, 0, 31, 47, 67, 11, 36, 0,
+ 36, 25, 61, 103, 47, 55, 10, 15, 45, 27,
+ 75, 39, 63, 49, 105, 51, 47, 59, 79, 38,
+ 1, 14, 51, 17, 49, 41, 107, 16, 2, 4,
+ 15, 32, 9, 7, 43, 41, 23, 21, 21, 18,
+ 24, 5, 26, 38, 31, 0, 9, 7, 12, 23,
+ 48, 3, 4, 48, 64, 70, 60, 46, 51, 13,
+ 17, 26, 22, 67, 14, 20, 18, 6, 14, 14,
+ 10, 10, 4, 10, 5, 10, 38, 22, 17, 3,
+ 4, 5, 22, 36, 36, 20, 22, 10, 10, 8,
+ 44, 36, 30, 27, 6, 17, 62, 46, 36, 28,
+ 34, 38, 24, 32, 46, 2, 30, 18, 16, 10,
+ 1, 20, 8, 16, 18, 12, 28, 44, 14, 35,
+ 12, 25, 24, 48, 39, 94, 118, 100, 98, 102,
+ 104, 92, 92, 84, 66, 54, 48, 60, 38, 25,
+ 7, 6, 13, 54, 50, 44, 52, 46, 34, 28,
+ 30, 7, 2, 7, 59, 49, 81, 19, 40, 36,
+ 22, 2, 10, 9, 19, 31, 63, 19, 76, 42,
+ 30, 16, 14, 9, 29, 37, 87, 10, 72, 56,
+ 50, 40, 22, 10, 1, 17, 45, 13, 72, 62,
+ 38, 8, 12, 13, 31, 57, 10, 110, 92, 80,
+ 64, 46, 10, 9, 25, 37, 124, 75, 73, 39,
+ 85, 79, 57, 75, 69, 65, 71, 71, 77, 57,
+ 53, 51, 51, 43, 41, 13, 29, 35, 11, 2,
+ 4, 2, 18, 16, 16, 6, 38, 30, 26, 24,
+ 16, 38, 28, 16, 36, 32, 22, 18, 20, 1,
+ 5, 9, 10, 17, 32, 58, 32, 18, 48, 44,
+ 26, 38, 50, 34, 46, 18, 6, 22, 94, 90,
+ 98, 92, 94, 114, 124, 124, 124, 124, 124, 124,
+ 124, 122, 54, 124, 124, 124, 124, 124, 118, 86,
+ 62, 58, 12, 25, 19, 51, 95, 106, 106, 112,
+ 94, 86, 84, 64, 64, 46, 46, 30, 20, 16,
+ 15, 25, 0, 3, 15, 64, 70, 62, 42, 60,
+ 58, 30, 44, 48, 26, 1, 4, 11, 53, 38,
+ 16, 1, 27, 9, 7, 5, 0, 1, 3, 1,
+ 1, 4, 18, 2, 6, 14, 16, 124, 124, 120,
+ 112, 100, 88, 72, 40, 17, 0, 62, 48, 42,
+ 34, 24, 12, 6, 0, 19, 39, 29, 19, 29,
+ 19, 5, 17, 31, 7, 0, 6, 0, 6, 8,
+ 8, 124, 124, 112, 96, 84, 70, 48, 18, 27,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 7 */
+
+ 124, 18, 23, 124, 18, 23, 101, 59, 26, 20,
+ 18, 40, 102, 108, 108, 28, 75, 8, 17, 6,
+ 2, 18, 15, 12, 15, 34, 102, 29, 67, 87,
+ 85, 63, 7, 17, 6, 2, 39, 47, 10, 30,
+ 7, 29, 55, 12, 13, 39, 49, 10, 21, 51,
+ 14, 13, 31, 49, 0, 15, 17, 27, 9, 10,
+ 44, 0, 0, 0, 31, 47, 67, 9, 34, 0,
+ 36, 25, 61, 101, 43, 53, 14, 11, 43, 25,
+ 73, 35, 61, 47, 101, 49, 45, 57, 77, 38,
+ 1, 14, 51, 17, 49, 39, 103, 14, 2, 4,
+ 15, 32, 9, 7, 41, 41, 23, 21, 19, 18,
+ 22, 5, 26, 38, 31, 0, 9, 7, 12, 23,
+ 48, 3, 4, 48, 62, 68, 58, 44, 49, 11,
+ 17, 24, 22, 65, 14, 20, 16, 6, 14, 14,
+ 12, 12, 4, 10, 5, 10, 36, 22, 19, 3,
+ 4, 5, 20, 34, 34, 20, 22, 12, 10, 8,
+ 42, 34, 28, 27, 6, 17, 62, 46, 36, 28,
+ 34, 38, 24, 32, 46, 2, 30, 18, 16, 10,
+ 1, 20, 8, 14, 18, 12, 28, 42, 14, 35,
+ 10, 25, 22, 46, 39, 92, 114, 96, 94, 98,
+ 100, 88, 88, 80, 62, 50, 44, 54, 34, 27,
+ 7, 4, 13, 52, 48, 42, 50, 44, 30, 26,
+ 28, 7, 2, 7, 59, 49, 81, 19, 40, 36,
+ 22, 2, 10, 9, 19, 29, 61, 19, 76, 42,
+ 30, 16, 14, 7, 27, 37, 83, 10, 72, 56,
+ 50, 40, 22, 10, 1, 17, 43, 13, 72, 60,
+ 36, 8, 12, 13, 31, 57, 10, 110, 90, 78,
+ 62, 46, 10, 9, 25, 35, 124, 73, 71, 37,
+ 81, 77, 55, 73, 65, 63, 67, 67, 73, 55,
+ 51, 49, 51, 41, 39, 13, 29, 33, 11, 0,
+ 4, 0, 16, 14, 14, 4, 36, 28, 26, 22,
+ 16, 38, 28, 16, 34, 30, 22, 16, 18, 1,
+ 5, 9, 10, 17, 30, 58, 30, 18, 46, 42,
+ 26, 36, 48, 32, 44, 16, 4, 18, 92, 90,
+ 98, 90, 90, 110, 124, 124, 124, 124, 124, 124,
+ 124, 118, 50, 124, 124, 124, 124, 124, 112, 82,
+ 60, 56, 12, 23, 17, 49, 91, 104, 102, 108,
+ 92, 82, 80, 62, 60, 44, 42, 26, 16, 12,
+ 17, 27, 0, 3, 17, 60, 66, 58, 40, 56,
+ 54, 26, 42, 44, 22, 3, 2, 13, 53, 36,
+ 16, 3, 27, 7, 5, 5, 2, 0, 1, 0,
+ 0, 6, 20, 4, 8, 16, 18, 124, 122, 116,
+ 106, 96, 82, 66, 34, 21, 0, 64, 50, 44,
+ 36, 26, 12, 8, 2, 17, 37, 29, 17, 27,
+ 17, 3, 17, 29, 7, 0, 6, 2, 8, 8,
+ 8, 124, 124, 106, 90, 78, 62, 40, 10, 33,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 8 */
+
+ 124, 16, 23, 124, 16, 23, 99, 57, 26, 20,
+ 18, 38, 100, 106, 108, 28, 71, 8, 15, 8,
+ 2, 16, 15, 10, 19, 32, 98, 33, 69, 89,
+ 81, 61, 7, 15, 8, 2, 39, 45, 10, 28,
+ 7, 29, 53, 10, 13, 39, 51, 10, 21, 51,
+ 14, 15, 31, 49, 0, 15, 17, 27, 9, 10,
+ 44, 0, 0, 0, 29, 49, 67, 9, 32, 0,
+ 36, 25, 61, 97, 41, 49, 16, 9, 41, 23,
+ 71, 33, 59, 45, 97, 49, 45, 57, 75, 38,
+ 1, 14, 49, 17, 47, 39, 101, 14, 0, 2,
+ 17, 32, 9, 7, 41, 41, 23, 21, 19, 16,
+ 22, 5, 26, 36, 31, 0, 9, 7, 10, 23,
+ 46, 5, 4, 46, 58, 66, 56, 42, 49, 11,
+ 17, 22, 20, 65, 12, 18, 16, 6, 14, 14,
+ 14, 12, 4, 8, 5, 8, 34, 20, 21, 3,
+ 4, 7, 20, 32, 34, 18, 20, 12, 10, 8,
+ 38, 34, 26, 27, 6, 19, 60, 44, 36, 28,
+ 34, 36, 22, 32, 44, 0, 28, 18, 16, 8,
+ 3, 18, 6, 14, 16, 10, 26, 40, 12, 35,
+ 10, 25, 20, 42, 39, 88, 110, 92, 90, 94,
+ 94, 84, 84, 76, 58, 46, 40, 50, 30, 29,
+ 7, 4, 15, 50, 44, 38, 46, 40, 28, 22,
+ 24, 9, 0, 9, 59, 49, 79, 19, 40, 36,
+ 22, 2, 10, 9, 19, 29, 59, 17, 76, 42,
+ 30, 16, 14, 7, 27, 35, 81, 10, 72, 56,
+ 50, 38, 22, 10, 1, 17, 43, 13, 72, 60,
+ 36, 8, 12, 13, 31, 55, 10, 108, 88, 76,
+ 60, 44, 10, 9, 25, 35, 124, 71, 69, 37,
+ 79, 75, 55, 71, 63, 61, 65, 65, 71, 55,
+ 51, 49, 49, 39, 39, 15, 29, 33, 11, 0,
+ 2, 0, 14, 12, 10, 2, 34, 28, 26, 22,
+ 14, 36, 26, 14, 32, 28, 20, 16, 16, 3,
+ 7, 9, 10, 19, 28, 56, 28, 18, 44, 40,
+ 26, 34, 46, 32, 44, 14, 2, 14, 90, 88,
+ 96, 86, 88, 108, 124, 124, 124, 124, 124, 124,
+ 124, 112, 48, 124, 124, 124, 124, 122, 108, 78,
+ 56, 52, 12, 23, 17, 47, 87, 100, 98, 104,
+ 88, 76, 76, 58, 56, 40, 38, 24, 12, 10,
+ 19, 29, 1, 5, 19, 58, 64, 54, 36, 52,
+ 50, 24, 38, 40, 20, 5, 1, 17, 55, 34,
+ 14, 5, 27, 7, 5, 3, 2, 0, 0, 0,
+ 2, 8, 22, 6, 10, 18, 20, 122, 118, 112,
+ 102, 90, 76, 60, 28, 27, 0, 64, 50, 44,
+ 36, 26, 12, 8, 2, 17, 37, 29, 17, 27,
+ 15, 1, 17, 29, 5, 2, 6, 2, 8, 8,
+ 6, 124, 122, 102, 84, 72, 54, 32, 4, 39,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 9 */
+
+ 124, 16, 23, 124, 16, 23, 95, 55, 28, 20,
+ 18, 36, 98, 104, 108, 28, 67, 8, 13, 10,
+ 2, 16, 13, 8, 21, 30, 94, 35, 71, 91,
+ 75, 57, 7, 13, 10, 2, 37, 43, 12, 26,
+ 7, 27, 51, 10, 13, 39, 51, 10, 21, 49,
+ 14, 15, 31, 49, 0, 15, 15, 27, 9, 10,
+ 44, 0, 0, 0, 27, 49, 67, 9, 30, 0,
+ 36, 23, 61, 93, 37, 45, 18, 5, 37, 21,
+ 67, 31, 55, 43, 93, 49, 45, 57, 73, 38,
+ 1, 14, 47, 17, 45, 37, 99, 14, 0, 2,
+ 17, 32, 9, 5, 39, 39, 21, 21, 19, 16,
+ 22, 5, 26, 36, 29, 0, 7, 7, 10, 21,
+ 44, 5, 4, 44, 56, 66, 54, 40, 47, 11,
+ 15, 20, 18, 65, 10, 18, 16, 8, 14, 14,
+ 16, 12, 4, 8, 3, 8, 34, 20, 23, 3,
+ 4, 9, 20, 30, 34, 18, 18, 12, 10, 8,
+ 36, 34, 26, 27, 6, 21, 58, 44, 36, 28,
+ 34, 36, 22, 32, 44, 0, 28, 18, 16, 8,
+ 3, 18, 6, 14, 16, 10, 24, 40, 12, 35,
+ 10, 25, 18, 40, 39, 84, 108, 90, 88, 90,
+ 90, 82, 82, 72, 54, 44, 38, 46, 28, 31,
+ 7, 4, 17, 48, 42, 36, 44, 38, 26, 20,
+ 22, 11, 1, 11, 59, 47, 77, 17, 42, 36,
+ 22, 2, 12, 7, 17, 27, 57, 15, 76, 42,
+ 30, 16, 16, 7, 25, 33, 77, 10, 72, 56,
+ 50, 38, 24, 10, 1, 17, 41, 13, 74, 60,
+ 36, 8, 14, 13, 31, 53, 10, 108, 88, 76,
+ 58, 44, 10, 9, 23, 33, 124, 69, 67, 35,
+ 77, 71, 53, 67, 61, 57, 63, 63, 67, 53,
+ 49, 49, 47, 35, 39, 17, 27, 31, 11, 0,
+ 0, 0, 14, 10, 8, 0, 34, 28, 26, 22,
+ 14, 34, 26, 14, 30, 28, 20, 16, 14, 3,
+ 7, 7, 12, 19, 28, 54, 28, 18, 44, 40,
+ 26, 32, 44, 32, 44, 12, 2, 12, 90, 86,
+ 94, 84, 86, 106, 120, 120, 124, 124, 124, 124,
+ 124, 108, 46, 124, 124, 124, 124, 116, 104, 76,
+ 54, 50, 12, 21, 17, 45, 83, 96, 96, 100,
+ 84, 72, 74, 56, 52, 38, 36, 22, 10, 8,
+ 21, 29, 1, 5, 21, 56, 62, 52, 32, 50,
+ 48, 22, 36, 38, 18, 7, 3, 19, 57, 34,
+ 12, 5, 25, 7, 5, 1, 4, 2, 2, 2,
+ 4, 10, 26, 8, 12, 22, 24, 120, 116, 108,
+ 98, 84, 70, 54, 22, 31, 2, 64, 50, 46,
+ 36, 28, 14, 8, 4, 15, 37, 27, 15, 27,
+ 13, 2, 15, 27, 3, 4, 6, 4, 10, 8,
+ 6, 124, 118, 98, 80, 66, 48, 24, 1, 43,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 10 */
+
+ 124, 16, 23, 124, 16, 23, 91, 51, 28, 20,
+ 16, 32, 94, 100, 108, 28, 61, 6, 11, 10,
+ 2, 14, 13, 6, 23, 30, 90, 39, 73, 91,
+ 69, 55, 7, 11, 10, 2, 37, 41, 12, 26,
+ 7, 27, 51, 10, 15, 37, 51, 10, 21, 47,
+ 12, 15, 31, 49, 2, 13, 15, 25, 7, 10,
+ 44, 0, 0, 0, 27, 49, 67, 7, 28, 0,
+ 36, 23, 61, 91, 33, 43, 22, 1, 35, 19,
+ 65, 27, 53, 41, 89, 47, 43, 55, 71, 38,
+ 1, 14, 47, 17, 45, 37, 95, 12, 0, 2,
+ 17, 32, 9, 5, 39, 39, 21, 21, 17, 16,
+ 20, 5, 26, 36, 29, 0, 7, 7, 10, 21,
+ 44, 5, 4, 44, 54, 64, 52, 38, 45, 9,
+ 15, 18, 18, 63, 10, 18, 14, 8, 14, 14,
+ 18, 14, 4, 8, 3, 8, 32, 20, 25, 3,
+ 4, 9, 18, 28, 32, 18, 18, 14, 10, 8,
+ 34, 32, 24, 27, 6, 21, 58, 44, 36, 28,
+ 34, 36, 22, 32, 44, 0, 28, 18, 16, 8,
+ 3, 18, 6, 12, 16, 10, 24, 38, 10, 35,
+ 8, 25, 16, 38, 39, 82, 104, 86, 84, 86,
+ 86, 78, 78, 68, 50, 40, 34, 40, 24, 33,
+ 7, 2, 17, 46, 40, 34, 42, 34, 22, 18,
+ 20, 11, 1, 11, 59, 47, 77, 17, 42, 36,
+ 22, 2, 12, 7, 17, 27, 55, 15, 76, 42,
+ 30, 16, 16, 5, 23, 33, 73, 10, 72, 56,
+ 50, 38, 24, 10, 1, 17, 39, 13, 74, 58,
+ 34, 8, 14, 13, 31, 53, 10, 106, 86, 74,
+ 56, 44, 10, 9, 23, 33, 124, 67, 65, 33,
+ 73, 69, 51, 65, 57, 55, 59, 59, 63, 51,
+ 47, 47, 47, 33, 37, 17, 27, 29, 11, 1,
+ 0, 1, 12, 8, 6, 1, 32, 26, 26, 20,
+ 12, 34, 26, 14, 28, 26, 20, 14, 12, 3,
+ 7, 7, 12, 19, 26, 54, 26, 18, 42, 38,
+ 26, 30, 42, 30, 42, 10, 0, 8, 88, 86,
+ 94, 82, 82, 102, 116, 116, 124, 124, 124, 124,
+ 124, 104, 42, 118, 124, 118, 124, 112, 98, 72,
+ 52, 48, 12, 19, 15, 43, 79, 94, 92, 96,
+ 82, 68, 70, 52, 48, 36, 32, 18, 6, 4,
+ 23, 31, 3, 5, 23, 52, 58, 48, 30, 46,
+ 44, 18, 32, 34, 14, 9, 5, 21, 57, 32,
+ 12, 7, 25, 5, 3, 1, 4, 4, 4, 4,
+ 6, 12, 28, 10, 14, 24, 26, 120, 112, 104,
+ 92, 80, 64, 48, 16, 35, 2, 66, 52, 46,
+ 38, 30, 14, 10, 4, 15, 35, 27, 13, 25,
+ 11, 4, 15, 25, 3, 4, 6, 4, 12, 8,
+ 6, 124, 114, 92, 74, 60, 40, 16, 9, 49,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 11 */
+
+ 124, 16, 25, 124, 16, 25, 87, 49, 30, 20,
+ 16, 30, 92, 98, 108, 28, 57, 6, 11, 12,
+ 2, 12, 13, 4, 25, 28, 88, 41, 75, 93,
+ 63, 53, 7, 11, 12, 2, 35, 39, 14, 24,
+ 5, 25, 49, 8, 15, 37, 51, 10, 19, 47,
+ 12, 15, 29, 49, 2, 13, 15, 25, 7, 10,
+ 44, 0, 0, 0, 25, 49, 67, 7, 28, 1,
+ 36, 23, 59, 87, 29, 39, 24, 2, 33, 17,
+ 61, 25, 51, 41, 87, 47, 43, 55, 71, 40,
+ 1, 14, 45, 15, 43, 35, 93, 12, 1, 0,
+ 19, 32, 9, 5, 37, 39, 19, 21, 17, 16,
+ 20, 3, 26, 34, 29, 0, 7, 7, 10, 21,
+ 42, 5, 4, 42, 52, 62, 52, 38, 45, 9,
+ 15, 18, 16, 63, 8, 18, 14, 8, 14, 14,
+ 18, 14, 4, 8, 1, 8, 30, 20, 25, 5,
+ 4, 11, 18, 26, 32, 18, 16, 14, 10, 8,
+ 30, 32, 22, 27, 6, 23, 56, 44, 34, 28,
+ 34, 36, 22, 32, 44, 0, 26, 18, 14, 8,
+ 3, 16, 4, 12, 14, 8, 22, 36, 10, 35,
+ 8, 23, 14, 34, 39, 78, 100, 82, 80, 82,
+ 82, 74, 74, 64, 46, 38, 32, 36, 20, 35,
+ 7, 2, 19, 44, 38, 32, 38, 32, 20, 16,
+ 18, 13, 3, 13, 59, 47, 75, 15, 42, 36,
+ 22, 4, 12, 7, 17, 25, 53, 13, 74, 42,
+ 30, 16, 18, 5, 21, 31, 69, 10, 74, 56,
+ 50, 38, 26, 10, 1, 15, 39, 13, 74, 58,
+ 34, 8, 14, 11, 29, 51, 10, 106, 86, 72,
+ 56, 44, 10, 7, 21, 31, 124, 67, 63, 33,
+ 71, 67, 49, 61, 55, 53, 57, 57, 61, 51,
+ 45, 47, 45, 29, 37, 19, 27, 29, 11, 1,
+ 1, 1, 10, 6, 4, 3, 32, 26, 26, 20,
+ 12, 32, 24, 14, 28, 26, 18, 14, 12, 3,
+ 9, 7, 12, 21, 26, 52, 24, 16, 40, 38,
+ 28, 30, 40, 30, 42, 10, 0, 6, 86, 84,
+ 92, 80, 80, 100, 112, 112, 122, 120, 124, 124,
+ 120, 98, 40, 114, 124, 112, 124, 106, 94, 68,
+ 50, 46, 12, 17, 15, 39, 73, 90, 88, 92,
+ 78, 64, 66, 50, 44, 32, 30, 16, 4, 2,
+ 25, 33, 3, 7, 25, 50, 56, 46, 26, 42,
+ 40, 16, 30, 30, 12, 13, 9, 25, 59, 30,
+ 10, 9, 25, 5, 3, 0, 6, 4, 8, 6,
+ 8, 14, 30, 14, 16, 28, 28, 118, 110, 100,
+ 88, 74, 60, 42, 12, 39, 4, 66, 52, 48,
+ 38, 32, 16, 10, 6, 13, 35, 25, 13, 25,
+ 9, 6, 15, 25, 1, 6, 6, 6, 14, 8,
+ 6, 124, 110, 88, 68, 52, 32, 10, 15, 53,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 12 */
+
+ 124, 16, 25, 124, 16, 25, 85, 45, 30, 20,
+ 14, 26, 88, 96, 108, 28, 53, 6, 9, 14,
+ 2, 10, 13, 2, 27, 28, 84, 45, 77, 93,
+ 57, 51, 7, 9, 14, 2, 35, 37, 14, 24,
+ 5, 25, 47, 8, 17, 37, 51, 10, 19, 45,
+ 12, 17, 29, 49, 4, 13, 15, 25, 5, 10,
+ 44, 0, 0, 0, 25, 51, 67, 5, 26, 1,
+ 36, 23, 59, 83, 27, 37, 28, 4, 31, 15,
+ 59, 23, 49, 39, 83, 45, 43, 53, 69, 40,
+ 1, 14, 45, 15, 43, 35, 89, 12, 1, 0,
+ 19, 32, 9, 5, 37, 39, 19, 21, 15, 16,
+ 20, 3, 26, 34, 29, 0, 7, 7, 10, 21,
+ 40, 7, 4, 40, 50, 60, 50, 36, 43, 7,
+ 15, 16, 16, 61, 8, 18, 14, 8, 14, 14,
+ 20, 14, 4, 6, 1, 6, 28, 18, 27, 5,
+ 4, 11, 16, 24, 32, 16, 14, 14, 10, 8,
+ 28, 32, 20, 27, 6, 23, 54, 44, 34, 28,
+ 34, 36, 22, 32, 42, 0, 26, 18, 14, 8,
+ 5, 16, 4, 10, 14, 8, 20, 34, 8, 35,
+ 6, 23, 12, 32, 39, 76, 96, 78, 76, 78,
+ 78, 70, 70, 60, 42, 34, 28, 32, 16, 37,
+ 7, 2, 19, 42, 36, 30, 36, 28, 16, 14,
+ 14, 15, 3, 13, 59, 47, 75, 15, 42, 36,
+ 22, 4, 12, 7, 17, 25, 51, 13, 74, 42,
+ 30, 16, 18, 3, 21, 29, 65, 10, 74, 56,
+ 50, 38, 26, 10, 1, 15, 37, 13, 74, 58,
+ 32, 8, 14, 11, 29, 49, 10, 104, 84, 70,
+ 54, 44, 10, 7, 21, 31, 124, 65, 61, 31,
+ 69, 65, 47, 59, 53, 51, 55, 53, 57, 49,
+ 45, 47, 45, 27, 37, 19, 27, 27, 11, 3,
+ 1, 1, 8, 4, 2, 5, 30, 24, 26, 18,
+ 10, 32, 24, 14, 26, 24, 18, 14, 10, 5,
+ 9, 7, 12, 21, 24, 52, 22, 16, 38, 36,
+ 28, 28, 38, 28, 40, 8, 1, 2, 84, 82,
+ 92, 78, 78, 96, 108, 108, 118, 114, 124, 124,
+ 114, 94, 38, 108, 124, 106, 116, 100, 88, 64,
+ 48, 42, 12, 15, 15, 37, 69, 86, 84, 88,
+ 74, 60, 62, 46, 40, 30, 26, 14, 0, 1,
+ 27, 35, 5, 7, 27, 48, 52, 42, 22, 38,
+ 36, 12, 26, 26, 8, 15, 11, 27, 61, 28,
+ 8, 11, 25, 3, 3, 0, 6, 6, 10, 6,
+ 10, 16, 32, 16, 18, 30, 30, 118, 106, 96,
+ 82, 70, 54, 36, 6, 45, 4, 68, 54, 48,
+ 38, 32, 16, 10, 6, 13, 35, 25, 11, 23,
+ 7, 8, 15, 23, 1, 6, 6, 6, 16, 8,
+ 6, 122, 106, 82, 62, 46, 24, 2, 23, 59,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 13 */
+
+ 124, 16, 25, 124, 16, 25, 81, 43, 30, 20,
+ 14, 24, 86, 92, 108, 28, 47, 4, 7, 14,
+ 2, 10, 11, 0, 29, 26, 80, 49, 79, 95,
+ 51, 47, 7, 7, 14, 2, 35, 35, 14, 22,
+ 5, 25, 47, 8, 17, 35, 51, 10, 19, 43,
+ 10, 17, 29, 49, 4, 11, 13, 23, 5, 10,
+ 44, 0, 0, 0, 23, 51, 67, 5, 24, 1,
+ 36, 21, 59, 81, 23, 33, 30, 8, 27, 13,
+ 57, 19, 47, 37, 79, 45, 41, 53, 67, 40,
+ 1, 14, 43, 15, 41, 33, 87, 10, 1, 0,
+ 19, 32, 9, 3, 35, 39, 19, 21, 15, 16,
+ 18, 3, 26, 34, 27, 0, 5, 7, 10, 21,
+ 40, 7, 4, 40, 48, 60, 48, 34, 41, 7,
+ 15, 14, 14, 61, 6, 18, 12, 8, 14, 14,
+ 22, 16, 4, 6, 1, 6, 26, 18, 29, 5,
+ 4, 13, 16, 22, 30, 16, 14, 16, 10, 8,
+ 26, 30, 18, 27, 6, 25, 54, 44, 34, 28,
+ 34, 36, 22, 32, 42, 0, 26, 18, 14, 8,
+ 5, 16, 4, 10, 14, 8, 20, 32, 8, 35,
+ 6, 23, 10, 30, 39, 72, 92, 76, 74, 74,
+ 74, 66, 66, 56, 38, 30, 24, 26, 14, 39,
+ 7, 0, 21, 40, 34, 28, 34, 26, 14, 12,
+ 12, 15, 5, 15, 59, 47, 73, 15, 42, 36,
+ 22, 4, 12, 5, 15, 23, 49, 11, 74, 42,
+ 30, 16, 18, 3, 19, 29, 61, 10, 74, 56,
+ 50, 38, 26, 10, 1, 15, 35, 13, 76, 56,
+ 32, 8, 14, 11, 29, 49, 10, 104, 82, 68,
+ 52, 44, 10, 7, 21, 29, 124, 63, 59, 29,
+ 65, 63, 45, 57, 49, 49, 51, 51, 53, 47,
+ 43, 45, 43, 25, 35, 21, 25, 25, 11, 3,
+ 3, 3, 8, 2, 0, 7, 28, 24, 26, 18,
+ 10, 30, 24, 14, 24, 22, 18, 12, 8, 5,
+ 9, 7, 12, 21, 22, 50, 22, 16, 36, 34,
+ 28, 26, 36, 28, 40, 6, 3, 1, 82, 82,
+ 90, 76, 74, 94, 104, 104, 114, 110, 124, 122,
+ 108, 90, 34, 102, 124, 100, 108, 96, 84, 60,
+ 46, 40, 12, 13, 13, 35, 65, 84, 82, 84,
+ 72, 56, 58, 44, 36, 28, 22, 10, 3, 3,
+ 29, 37, 5, 7, 29, 44, 50, 38, 20, 36,
+ 34, 10, 24, 24, 6, 17, 13, 29, 61, 28,
+ 8, 13, 23, 3, 1, 2, 8, 8, 12, 8,
+ 12, 18, 36, 18, 20, 32, 34, 116, 102, 92,
+ 78, 64, 48, 30, 0, 49, 4, 68, 54, 50,
+ 40, 34, 16, 12, 8, 11, 33, 25, 9, 23,
+ 5, 10, 13, 21, 0, 8, 6, 8, 18, 8,
+ 6, 118, 102, 78, 56, 40, 18, 5, 29, 65,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 14 */
+
+ 122, 16, 25, 122, 16, 25, 77, 39, 32, 20,
+ 12, 20, 82, 90, 108, 28, 43, 4, 5, 16,
+ 2, 8, 11, 1, 31, 26, 76, 51, 81, 95,
+ 45, 45, 7, 5, 16, 2, 33, 33, 16, 22,
+ 5, 23, 45, 6, 19, 35, 51, 10, 19, 43,
+ 10, 17, 29, 49, 6, 11, 13, 23, 3, 10,
+ 44, 0, 0, 0, 23, 51, 67, 3, 22, 1,
+ 36, 21, 59, 77, 19, 31, 34, 12, 25, 11,
+ 53, 17, 45, 35, 75, 43, 41, 51, 65, 40,
+ 1, 14, 43, 15, 41, 33, 83, 10, 3, 1,
+ 21, 32, 9, 3, 35, 39, 17, 21, 13, 16,
+ 18, 3, 26, 32, 27, 0, 5, 7, 10, 21,
+ 38, 7, 4, 38, 46, 58, 46, 32, 41, 5,
+ 15, 12, 14, 59, 6, 18, 12, 8, 14, 14,
+ 24, 16, 4, 6, 0, 6, 24, 18, 31, 5,
+ 4, 13, 14, 20, 30, 16, 12, 16, 10, 8,
+ 22, 30, 16, 27, 6, 25, 52, 44, 34, 28,
+ 34, 36, 22, 32, 42, 0, 24, 18, 14, 8,
+ 5, 14, 2, 8, 12, 6, 18, 30, 6, 35,
+ 4, 23, 8, 26, 39, 70, 88, 72, 70, 70,
+ 70, 62, 62, 52, 34, 28, 22, 22, 10, 41,
+ 7, 0, 21, 38, 32, 26, 30, 22, 10, 10,
+ 10, 17, 5, 15, 59, 47, 73, 13, 42, 36,
+ 22, 4, 12, 5, 15, 23, 47, 11, 74, 42,
+ 30, 16, 20, 1, 17, 27, 57, 10, 74, 56,
+ 50, 38, 28, 10, 1, 15, 35, 13, 76, 56,
+ 30, 8, 14, 11, 29, 47, 10, 102, 82, 66,
+ 50, 44, 10, 7, 19, 29, 124, 61, 57, 29,
+ 63, 61, 43, 53, 47, 47, 49, 47, 51, 47,
+ 41, 45, 43, 21, 35, 21, 25, 25, 11, 5,
+ 3, 3, 6, 0, 1, 9, 28, 22, 26, 16,
+ 8, 30, 22, 14, 22, 22, 16, 12, 6, 5,
+ 11, 7, 12, 23, 22, 50, 20, 16, 34, 34,
+ 28, 24, 34, 26, 38, 4, 3, 3, 80, 80,
+ 90, 74, 72, 90, 100, 100, 110, 104, 120, 118,
+ 102, 84, 32, 96, 124, 94, 100, 90, 78, 56,
+ 44, 38, 12, 11, 13, 33, 61, 80, 78, 80,
+ 68, 52, 54, 40, 32, 24, 20, 8, 5, 7,
+ 31, 39, 7, 9, 31, 42, 46, 36, 16, 32,
+ 30, 6, 20, 20, 2, 19, 17, 33, 63, 26,
+ 6, 15, 23, 1, 1, 2, 8, 8, 14, 10,
+ 14, 20, 38, 20, 22, 36, 36, 116, 100, 88,
+ 72, 60, 42, 24, 5, 53, 6, 70, 56, 50,
+ 40, 36, 18, 12, 8, 11, 33, 23, 9, 21,
+ 3, 12, 13, 21, 0, 8, 6, 8, 20, 8,
+ 6, 116, 98, 72, 50, 34, 10, 13, 37, 69,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 15 */
+
+ 120, 16, 25, 120, 16, 25, 73, 37, 32, 20,
+ 12, 18, 80, 88, 108, 28, 39, 4, 3, 18,
+ 2, 6, 11, 3, 33, 24, 72, 55, 83, 97,
+ 39, 43, 7, 3, 18, 2, 33, 31, 16, 20,
+ 5, 23, 43, 6, 19, 35, 51, 10, 19, 41,
+ 10, 17, 29, 49, 6, 11, 13, 23, 3, 10,
+ 44, 0, 0, 0, 21, 51, 67, 3, 20, 1,
+ 36, 21, 59, 73, 15, 27, 36, 16, 23, 9,
+ 51, 15, 43, 33, 71, 43, 41, 51, 63, 40,
+ 1, 14, 41, 15, 39, 31, 81, 10, 3, 1,
+ 21, 32, 9, 3, 33, 39, 17, 21, 13, 16,
+ 18, 3, 26, 32, 27, 0, 5, 7, 10, 21,
+ 36, 7, 4, 36, 44, 56, 44, 30, 39, 5,
+ 15, 10, 12, 59, 4, 18, 12, 8, 14, 14,
+ 26, 16, 4, 6, 0, 6, 22, 18, 33, 5,
+ 4, 15, 14, 18, 30, 16, 10, 16, 10, 8,
+ 20, 30, 14, 27, 6, 27, 50, 44, 34, 28,
+ 34, 36, 22, 32, 42, 0, 24, 18, 14, 8,
+ 5, 14, 2, 8, 12, 6, 16, 28, 6, 35,
+ 4, 23, 6, 24, 39, 66, 84, 68, 66, 66,
+ 66, 58, 58, 48, 30, 24, 18, 18, 6, 43,
+ 7, 0, 23, 36, 30, 24, 28, 20, 8, 8,
+ 8, 19, 7, 17, 59, 47, 71, 13, 42, 36,
+ 22, 4, 12, 5, 15, 21, 45, 9, 74, 42,
+ 30, 16, 20, 1, 15, 25, 53, 10, 74, 56,
+ 50, 38, 28, 10, 1, 15, 33, 13, 76, 56,
+ 30, 8, 14, 11, 29, 45, 10, 102, 80, 64,
+ 48, 44, 10, 7, 19, 27, 124, 59, 55, 27,
+ 61, 59, 41, 51, 45, 45, 47, 45, 47, 45,
+ 39, 45, 41, 19, 35, 23, 25, 23, 11, 5,
+ 5, 3, 4, 1, 3, 11, 26, 22, 26, 16,
+ 8, 28, 22, 14, 20, 20, 16, 12, 4, 5,
+ 11, 7, 12, 23, 20, 48, 18, 16, 32, 32,
+ 28, 22, 32, 26, 38, 2, 5, 7, 78, 78,
+ 88, 72, 70, 88, 96, 96, 106, 100, 114, 112,
+ 96, 80, 30, 90, 118, 88, 92, 84, 74, 52,
+ 42, 36, 12, 9, 13, 31, 57, 76, 74, 76,
+ 64, 48, 50, 38, 28, 22, 16, 6, 9, 9,
+ 33, 41, 7, 9, 33, 40, 44, 32, 12, 28,
+ 26, 4, 18, 16, 0, 21, 19, 35, 65, 24,
+ 4, 17, 23, 1, 1, 4, 10, 10, 16, 12,
+ 16, 22, 40, 22, 24, 38, 38, 114, 96, 84,
+ 68, 54, 36, 18, 11, 57, 6, 70, 56, 52,
+ 40, 38, 18, 12, 10, 9, 33, 23, 7, 21,
+ 1, 14, 13, 19, 2, 10, 6, 10, 22, 8,
+ 6, 114, 94, 68, 44, 28, 2, 21, 43, 75,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 16 */
+
+ 116, 14, 27, 116, 14, 27, 71, 35, 32, 20,
+ 10, 14, 76, 84, 106, 28, 35, 2, 3, 18,
+ 0, 4, 11, 7, 37, 22, 68, 59, 85, 99,
+ 35, 41, 9, 3, 18, 0, 33, 29, 16, 18,
+ 5, 23, 43, 4, 21, 35, 53, 10, 19, 41,
+ 8, 19, 29, 49, 6, 11, 13, 23, 3, 8,
+ 44, 0, 0, 0, 21, 53, 67, 3, 18, 3,
+ 36, 21, 59, 71, 13, 25, 38, 18, 21, 7,
+ 49, 13, 41, 33, 69, 43, 41, 51, 63, 40,
+ 1, 14, 41, 15, 39, 31, 79, 8, 5, 3,
+ 23, 32, 9, 3, 33, 39, 17, 21, 13, 14,
+ 16, 3, 24, 30, 27, 1, 5, 7, 8, 21,
+ 34, 9, 2, 34, 40, 54, 42, 28, 39, 5,
+ 15, 8, 10, 59, 2, 16, 10, 8, 14, 14,
+ 26, 16, 4, 4, 0, 4, 20, 16, 35, 7,
+ 2, 17, 12, 16, 28, 14, 8, 16, 10, 8,
+ 16, 28, 12, 27, 6, 29, 48, 42, 32, 28,
+ 34, 34, 20, 32, 40, 1, 22, 18, 12, 6,
+ 7, 12, 0, 6, 10, 4, 14, 26, 4, 35,
+ 2, 23, 4, 20, 39, 62, 80, 64, 62, 62,
+ 60, 54, 54, 44, 26, 20, 14, 12, 2, 47,
+ 9, 1, 25, 34, 26, 20, 24, 16, 4, 4,
+ 4, 21, 9, 19, 59, 47, 71, 13, 42, 36,
+ 22, 4, 12, 5, 15, 21, 43, 9, 72, 42,
+ 30, 16, 20, 1, 15, 25, 51, 8, 74, 56,
+ 48, 36, 28, 10, 1, 15, 33, 13, 76, 54,
+ 28, 6, 14, 11, 29, 45, 10, 100, 78, 62,
+ 46, 42, 10, 7, 19, 27, 124, 59, 53, 27,
+ 59, 57, 41, 49, 43, 43, 45, 43, 45, 45,
+ 39, 45, 41, 17, 35, 25, 25, 23, 11, 7,
+ 7, 5, 2, 3, 7, 15, 24, 20, 26, 14,
+ 6, 26, 20, 12, 18, 18, 14, 10, 2, 7,
+ 13, 7, 12, 25, 18, 46, 16, 14, 30, 30,
+ 28, 20, 28, 24, 36, 0, 7, 11, 76, 76,
+ 86, 68, 66, 84, 92, 92, 100, 94, 108, 106,
+ 90, 74, 26, 84, 110, 82, 82, 78, 68, 48,
+ 38, 32, 12, 9, 13, 29, 53, 72, 70, 72,
+ 60, 42, 46, 34, 22, 18, 12, 2, 13, 13,
+ 35, 43, 9, 11, 37, 36, 40, 28, 8, 24,
+ 22, 0, 14, 12, 3, 25, 23, 39, 67, 22,
+ 2, 19, 23, 1, 1, 4, 10, 10, 18, 12,
+ 18, 22, 42, 24, 26, 40, 40, 112, 92, 78,
+ 62, 48, 30, 10, 17, 63, 6, 70, 56, 52,
+ 40, 38, 18, 12, 10, 9, 33, 23, 7, 21,
+ 0, 16, 13, 19, 2, 10, 6, 10, 22, 8,
+ 4, 110, 88, 62, 38, 20, 5, 29, 51, 81,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 17 */
+
+ 114, 14, 27, 114, 14, 27, 67, 31, 34, 22,
+ 10, 12, 74, 82, 106, 28, 29, 2, 1, 20,
+ 0, 4, 9, 9, 39, 22, 66, 61, 87, 99,
+ 29, 37, 9, 1, 20, 0, 31, 25, 18, 18,
+ 3, 21, 41, 4, 21, 33, 53, 10, 17, 39,
+ 8, 19, 27, 49, 8, 9, 11, 21, 1, 8,
+ 44, 0, 0, 0, 19, 53, 67, 1, 18, 3,
+ 36, 19, 57, 67, 9, 21, 42, 22, 17, 5,
+ 45, 9, 37, 31, 65, 41, 39, 49, 61, 42,
+ 0, 16, 39, 13, 37, 29, 75, 8, 5, 3,
+ 23, 34, 9, 1, 31, 37, 15, 19, 11, 14,
+ 16, 1, 24, 30, 25, 1, 3, 7, 8, 19,
+ 34, 9, 2, 34, 38, 54, 42, 28, 37, 3,
+ 13, 8, 10, 57, 2, 16, 10, 10, 14, 14,
+ 28, 18, 6, 4, 2, 4, 20, 16, 35, 7,
+ 2, 17, 12, 14, 28, 14, 8, 18, 12, 8,
+ 14, 28, 12, 25, 6, 29, 48, 42, 32, 28,
+ 34, 34, 20, 32, 40, 1, 22, 18, 12, 6,
+ 7, 12, 0, 6, 10, 4, 14, 26, 4, 33,
+ 2, 21, 4, 18, 37, 60, 78, 62, 60, 58,
+ 56, 52, 52, 40, 24, 18, 12, 8, 0, 49,
+ 9, 1, 25, 32, 24, 18, 22, 14, 2, 2,
+ 2, 21, 9, 19, 57, 45, 69, 11, 44, 36,
+ 22, 6, 14, 3, 13, 19, 39, 7, 72, 42,
+ 30, 16, 22, 0, 13, 23, 47, 8, 76, 58,
+ 48, 36, 30, 10, 1, 13, 31, 13, 78, 54,
+ 28, 6, 16, 9, 27, 43, 10, 100, 78, 62,
+ 46, 42, 10, 5, 17, 25, 124, 57, 51, 25,
+ 55, 53, 39, 45, 39, 39, 41, 39, 41, 43,
+ 37, 43, 39, 13, 33, 25, 23, 21, 9, 7,
+ 7, 5, 2, 3, 9, 17, 24, 20, 28, 14,
+ 6, 26, 20, 12, 18, 18, 14, 10, 2, 7,
+ 13, 5, 14, 25, 18, 46, 16, 14, 30, 30,
+ 30, 20, 26, 24, 36, 0, 7, 13, 76, 76,
+ 86, 66, 64, 82, 88, 88, 96, 90, 104, 102,
+ 86, 70, 24, 80, 104, 76, 74, 74, 64, 46,
+ 36, 30, 12, 7, 11, 25, 47, 70, 68, 70,
+ 58, 38, 44, 32, 18, 16, 10, 0, 15, 15,
+ 37, 43, 9, 11, 39, 34, 38, 26, 6, 22,
+ 20, 1, 12, 10, 5, 27, 25, 41, 67, 22,
+ 2, 19, 21, 0, 0, 6, 12, 12, 22, 14,
+ 20, 24, 46, 28, 28, 44, 44, 112, 90, 74,
+ 58, 44, 26, 4, 21, 67, 8, 72, 58, 54,
+ 42, 40, 20, 14, 12, 7, 31, 21, 5, 19,
+ 4, 20, 11, 17, 4, 12, 8, 12, 24, 8,
+ 4, 108, 84, 58, 34, 14, 11, 35, 57, 85,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 18 */
+
+ 112, 14, 27, 112, 14, 27, 63, 29, 34, 22,
+ 10, 10, 72, 80, 106, 28, 25, 2, 0, 22,
+ 0, 2, 9, 11, 41, 20, 62, 65, 89, 101,
+ 23, 35, 9, 0, 22, 0, 31, 23, 18, 16,
+ 3, 21, 39, 4, 21, 33, 53, 10, 17, 37,
+ 8, 19, 27, 49, 8, 9, 11, 21, 1, 8,
+ 44, 0, 0, 0, 17, 53, 67, 1, 16, 3,
+ 36, 19, 57, 63, 5, 17, 44, 26, 15, 3,
+ 43, 7, 35, 29, 61, 41, 39, 49, 59, 42,
+ 0, 16, 37, 13, 35, 29, 73, 8, 5, 3,
+ 23, 34, 9, 1, 31, 37, 15, 19, 11, 14,
+ 16, 1, 24, 30, 25, 1, 3, 7, 8, 19,
+ 32, 9, 2, 32, 36, 52, 40, 26, 35, 3,
+ 13, 6, 8, 57, 0, 16, 10, 10, 14, 14,
+ 30, 18, 6, 4, 2, 4, 18, 16, 37, 7,
+ 2, 19, 12, 12, 28, 14, 6, 18, 12, 8,
+ 12, 28, 10, 25, 6, 31, 46, 42, 32, 28,
+ 34, 34, 20, 32, 40, 1, 22, 18, 12, 6,
+ 7, 12, 0, 6, 10, 4, 12, 24, 2, 33,
+ 2, 21, 2, 16, 37, 56, 74, 58, 56, 54,
+ 52, 48, 48, 36, 20, 14, 8, 4, 3, 51,
+ 9, 1, 27, 30, 22, 16, 20, 10, 0, 0,
+ 0, 23, 11, 21, 57, 45, 67, 11, 44, 36,
+ 22, 6, 14, 3, 13, 19, 37, 5, 72, 42,
+ 30, 16, 22, 0, 11, 21, 43, 8, 76, 58,
+ 48, 36, 30, 10, 1, 13, 29, 13, 78, 54,
+ 28, 6, 16, 9, 27, 41, 10, 98, 76, 60,
+ 44, 42, 10, 5, 17, 25, 124, 55, 49, 23,
+ 53, 51, 37, 43, 37, 37, 39, 37, 37, 41,
+ 35, 43, 37, 11, 33, 27, 23, 19, 9, 7,
+ 9, 5, 0, 5, 11, 19, 22, 20, 28, 14,
+ 4, 24, 20, 12, 16, 16, 14, 10, 0, 7,
+ 13, 5, 14, 25, 16, 44, 14, 14, 28, 28,
+ 30, 18, 24, 24, 36, 1, 9, 17, 74, 74,
+ 84, 64, 62, 80, 84, 84, 92, 86, 98, 96,
+ 80, 66, 22, 74, 98, 70, 66, 68, 60, 42,
+ 34, 28, 12, 5, 11, 23, 43, 66, 64, 66,
+ 54, 34, 40, 28, 14, 14, 6, 1, 19, 17,
+ 39, 45, 11, 11, 41, 32, 36, 22, 2, 18,
+ 16, 3, 8, 6, 7, 29, 27, 43, 69, 20,
+ 0, 21, 21, 0, 0, 8, 12, 14, 24, 16,
+ 22, 26, 48, 30, 30, 46, 46, 110, 86, 70,
+ 54, 38, 20, 1, 27, 71, 8, 72, 58, 54,
+ 42, 42, 20, 14, 12, 7, 31, 21, 3, 19,
+ 6, 22, 11, 15, 6, 14, 8, 12, 26, 8,
+ 4, 106, 80, 54, 28, 8, 19, 43, 63, 91,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 19 */
+
+ 110, 14, 27, 110, 14, 27, 59, 25, 36, 22,
+ 8, 6, 68, 78, 106, 28, 21, 2, 2, 24,
+ 0, 0, 9, 13, 43, 20, 58, 67, 91, 101,
+ 17, 33, 9, 2, 24, 0, 29, 21, 20, 16,
+ 3, 19, 37, 2, 23, 33, 53, 10, 17, 37,
+ 8, 19, 27, 49, 10, 9, 11, 21, 0, 8,
+ 44, 0, 0, 0, 17, 53, 67, 0, 14, 3,
+ 36, 19, 57, 59, 1, 15, 48, 30, 13, 1,
+ 39, 5, 33, 27, 57, 39, 39, 47, 57, 42,
+ 0, 16, 37, 13, 35, 27, 69, 8, 7, 5,
+ 25, 34, 9, 1, 29, 37, 13, 19, 9, 14,
+ 16, 1, 24, 28, 25, 1, 3, 7, 8, 19,
+ 30, 9, 2, 30, 34, 50, 38, 24, 35, 1,
+ 13, 4, 8, 55, 0, 16, 10, 10, 14, 14,
+ 32, 18, 6, 4, 4, 4, 16, 16, 39, 7,
+ 2, 19, 10, 10, 28, 14, 4, 18, 12, 8,
+ 8, 28, 8, 25, 6, 31, 44, 42, 32, 28,
+ 34, 34, 20, 32, 40, 1, 20, 18, 12, 6,
+ 7, 10, 1, 4, 8, 2, 10, 22, 2, 33,
+ 0, 21, 0, 12, 37, 54, 70, 54, 52, 50,
+ 48, 44, 44, 32, 16, 12, 6, 0, 7, 53,
+ 9, 1, 27, 28, 20, 14, 16, 8, 3, 1,
+ 1, 25, 11, 21, 57, 45, 67, 9, 44, 36,
+ 22, 6, 14, 3, 13, 17, 35, 5, 72, 42,
+ 30, 16, 24, 2, 9, 19, 39, 8, 76, 58,
+ 48, 36, 32, 10, 1, 13, 29, 13, 78, 54,
+ 26, 6, 16, 9, 27, 39, 10, 98, 76, 58,
+ 42, 42, 10, 5, 15, 23, 124, 53, 47, 23,
+ 51, 49, 35, 39, 35, 35, 37, 33, 35, 41,
+ 33, 43, 37, 7, 33, 27, 23, 19, 9, 9,
+ 9, 5, 1, 7, 13, 21, 22, 18, 28, 12,
+ 4, 24, 18, 12, 14, 16, 12, 10, 1, 7,
+ 15, 5, 14, 27, 16, 44, 12, 14, 26, 28,
+ 30, 16, 22, 22, 34, 3, 9, 19, 72, 72,
+ 84, 62, 60, 76, 80, 80, 88, 80, 94, 92,
+ 74, 60, 20, 68, 92, 64, 58, 62, 54, 38,
+ 32, 26, 12, 3, 11, 21, 39, 62, 60, 62,
+ 50, 30, 36, 26, 10, 10, 4, 3, 21, 21,
+ 41, 47, 11, 13, 43, 30, 32, 20, 1, 14,
+ 12, 7, 6, 2, 11, 31, 31, 47, 71, 18,
+ 1, 23, 21, 2, 0, 8, 14, 14, 26, 18,
+ 24, 28, 50, 32, 32, 50, 48, 110, 84, 66,
+ 48, 34, 14, 7, 33, 75, 10, 74, 60, 56,
+ 42, 44, 22, 14, 14, 5, 31, 19, 3, 17,
+ 8, 24, 11, 15, 6, 14, 8, 14, 28, 8,
+ 4, 104, 76, 48, 22, 2, 27, 51, 71, 95,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 20 */
+
+ 106, 14, 27, 106, 14, 27, 57, 23, 36, 22,
+ 8, 4, 66, 74, 106, 28, 15, 0, 4, 24,
+ 0, 1, 9, 15, 45, 18, 54, 71, 93, 103,
+ 11, 31, 9, 4, 24, 0, 29, 19, 20, 14,
+ 3, 19, 37, 2, 23, 31, 53, 10, 17, 35,
+ 6, 21, 27, 49, 10, 7, 11, 19, 0, 8,
+ 44, 0, 0, 0, 15, 55, 67, 0, 12, 3,
+ 36, 19, 57, 57, 0, 11, 50, 32, 11, 0,
+ 37, 1, 31, 25, 53, 39, 37, 47, 55, 42,
+ 0, 16, 35, 13, 33, 27, 67, 6, 7, 5,
+ 25, 34, 9, 1, 29, 37, 13, 19, 9, 14,
+ 14, 1, 24, 28, 25, 1, 3, 7, 8, 19,
+ 30, 11, 2, 30, 32, 48, 36, 22, 33, 1,
+ 13, 2, 6, 55, 1, 16, 8, 10, 14, 14,
+ 34, 20, 6, 2, 4, 2, 14, 14, 41, 7,
+ 2, 21, 10, 8, 26, 12, 4, 20, 12, 8,
+ 6, 26, 6, 25, 6, 33, 44, 42, 32, 28,
+ 34, 34, 20, 32, 38, 1, 20, 18, 12, 6,
+ 9, 10, 1, 4, 8, 2, 10, 20, 0, 33,
+ 0, 21, 1, 10, 37, 50, 66, 50, 48, 46,
+ 44, 40, 40, 28, 12, 8, 2, 5, 11, 55,
+ 9, 3, 29, 26, 18, 12, 14, 4, 5, 3,
+ 5, 25, 13, 23, 57, 45, 65, 9, 44, 36,
+ 22, 6, 14, 3, 13, 17, 33, 3, 72, 42,
+ 30, 16, 24, 2, 9, 19, 35, 8, 76, 58,
+ 48, 36, 32, 10, 1, 13, 27, 13, 78, 52,
+ 26, 6, 16, 9, 27, 39, 10, 96, 74, 56,
+ 40, 42, 10, 5, 15, 23, 124, 51, 45, 21,
+ 47, 47, 33, 37, 31, 33, 33, 31, 31, 39,
+ 33, 41, 35, 5, 31, 29, 23, 17, 9, 9,
+ 11, 7, 3, 9, 15, 23, 20, 18, 28, 12,
+ 2, 22, 18, 12, 12, 14, 12, 8, 3, 9,
+ 15, 5, 14, 27, 14, 42, 10, 14, 24, 26,
+ 30, 14, 20, 22, 34, 5, 11, 23, 70, 72,
+ 82, 60, 56, 74, 76, 76, 84, 76, 88, 86,
+ 68, 56, 16, 62, 84, 58, 50, 58, 50, 34,
+ 30, 22, 12, 1, 9, 19, 35, 60, 56, 58,
+ 48, 26, 32, 22, 6, 8, 0, 7, 25, 23,
+ 43, 49, 13, 13, 45, 26, 30, 16, 3, 10,
+ 8, 9, 2, 1, 13, 33, 33, 49, 71, 16,
+ 1, 25, 21, 2, 2, 10, 14, 16, 28, 18,
+ 26, 30, 52, 34, 34, 52, 50, 108, 80, 62,
+ 44, 28, 8, 13, 39, 81, 10, 74, 60, 56,
+ 44, 44, 22, 16, 14, 5, 29, 19, 1, 17,
+ 10, 26, 11, 13, 8, 16, 8, 14, 30, 8,
+ 4, 100, 72, 44, 16, 3, 35, 59, 77, 101,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 21 */
+
+ 104, 14, 27, 104, 14, 27, 53, 19, 36, 22,
+ 6, 0, 62, 72, 106, 28, 11, 0, 6, 26,
+ 0, 1, 7, 17, 47, 18, 50, 75, 95, 103,
+ 5, 27, 9, 6, 26, 0, 29, 17, 20, 14,
+ 3, 19, 35, 2, 25, 31, 53, 10, 17, 33,
+ 6, 21, 27, 49, 12, 7, 9, 19, 2, 8,
+ 44, 0, 0, 0, 15, 55, 67, 2, 10, 3,
+ 36, 17, 57, 53, 4, 9, 54, 36, 7, 2,
+ 35, 0, 29, 23, 49, 37, 37, 45, 53, 42,
+ 0, 16, 35, 13, 33, 25, 63, 6, 7, 5,
+ 25, 34, 9, 0, 27, 37, 13, 19, 7, 14,
+ 14, 1, 24, 28, 23, 1, 1, 7, 8, 19,
+ 28, 11, 2, 28, 30, 48, 34, 20, 31, 0,
+ 13, 0, 6, 53, 1, 16, 8, 10, 14, 14,
+ 36, 20, 6, 2, 4, 2, 12, 14, 43, 7,
+ 2, 21, 8, 6, 26, 12, 2, 20, 12, 8,
+ 4, 26, 4, 25, 6, 33, 42, 42, 32, 28,
+ 34, 34, 20, 32, 38, 1, 20, 18, 12, 6,
+ 9, 10, 1, 2, 8, 2, 8, 18, 0, 33,
+ 1, 21, 3, 8, 37, 48, 62, 48, 46, 42,
+ 40, 36, 36, 24, 8, 4, 1, 9, 13, 57,
+ 9, 3, 29, 24, 16, 10, 12, 2, 9, 5,
+ 7, 27, 13, 23, 57, 45, 65, 9, 44, 36,
+ 22, 6, 14, 1, 11, 15, 31, 3, 72, 42,
+ 30, 16, 24, 4, 7, 17, 31, 8, 76, 58,
+ 48, 36, 32, 10, 1, 13, 25, 13, 80, 52,
+ 24, 6, 16, 9, 27, 37, 10, 96, 72, 54,
+ 38, 42, 10, 5, 15, 21, 124, 49, 43, 19,
+ 45, 45, 31, 35, 29, 31, 31, 27, 27, 37,
+ 31, 41, 35, 3, 31, 29, 21, 15, 9, 11,
+ 11, 7, 3, 11, 17, 25, 18, 16, 28, 10,
+ 2, 22, 18, 12, 10, 12, 12, 8, 5, 9,
+ 15, 5, 14, 27, 12, 42, 10, 14, 22, 24,
+ 30, 12, 18, 20, 32, 7, 13, 27, 68, 70,
+ 82, 58, 54, 70, 72, 72, 80, 70, 82, 82,
+ 62, 52, 14, 56, 78, 52, 42, 52, 44, 30,
+ 28, 20, 12, 0, 9, 17, 31, 56, 54, 54,
+ 44, 22, 28, 20, 2, 6, 3, 9, 29, 27,
+ 45, 51, 13, 13, 47, 24, 26, 12, 7, 8,
+ 6, 13, 0, 3, 17, 35, 35, 51, 73, 16,
+ 3, 27, 19, 4, 2, 10, 16, 18, 30, 20,
+ 28, 32, 56, 36, 36, 54, 54, 108, 76, 58,
+ 38, 24, 2, 19, 45, 85, 10, 76, 62, 58,
+ 44, 46, 22, 16, 16, 3, 29, 19, 0, 15,
+ 12, 28, 9, 11, 8, 16, 8, 16, 32, 8,
+ 4, 98, 68, 38, 10, 9, 41, 67, 85, 107,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 22 */
+
+ 102, 14, 29, 102, 14, 29, 49, 17, 38, 22,
+ 6, 1, 60, 70, 106, 28, 7, 0, 6, 28,
+ 0, 3, 7, 19, 49, 16, 48, 77, 97, 105,
+ 0, 25, 9, 6, 28, 0, 27, 15, 22, 12,
+ 1, 17, 33, 0, 25, 31, 53, 10, 15, 33,
+ 6, 21, 25, 49, 12, 7, 9, 19, 2, 8,
+ 44, 0, 0, 0, 13, 55, 67, 2, 10, 5,
+ 36, 17, 55, 49, 8, 5, 56, 40, 5, 4,
+ 31, 2, 27, 23, 47, 37, 37, 45, 53, 44,
+ 0, 16, 33, 11, 31, 25, 61, 6, 9, 7,
+ 27, 34, 9, 0, 27, 37, 11, 19, 7, 14,
+ 14, 0, 24, 26, 23, 1, 1, 7, 8, 19,
+ 26, 11, 2, 26, 28, 46, 34, 20, 31, 0,
+ 13, 0, 4, 53, 3, 16, 8, 10, 14, 14,
+ 36, 20, 6, 2, 6, 2, 10, 14, 43, 9,
+ 2, 23, 8, 4, 26, 12, 0, 20, 12, 8,
+ 0, 26, 2, 25, 6, 35, 40, 42, 30, 28,
+ 34, 34, 20, 32, 38, 1, 18, 18, 10, 6,
+ 9, 8, 3, 2, 6, 0, 6, 16, 1, 33,
+ 1, 19, 5, 4, 37, 44, 58, 44, 42, 38,
+ 36, 32, 32, 20, 4, 2, 3, 13, 17, 59,
+ 9, 3, 31, 22, 14, 8, 8, 1, 11, 7,
+ 9, 29, 15, 25, 57, 45, 63, 7, 44, 36,
+ 22, 8, 14, 1, 11, 15, 29, 1, 70, 42,
+ 30, 16, 26, 4, 5, 15, 27, 8, 78, 58,
+ 48, 36, 34, 10, 1, 11, 25, 13, 80, 52,
+ 24, 6, 16, 7, 25, 35, 10, 94, 72, 52,
+ 38, 42, 10, 3, 13, 21, 124, 49, 41, 19,
+ 43, 43, 29, 31, 27, 29, 29, 25, 25, 37,
+ 29, 41, 33, 0, 31, 31, 21, 15, 9, 11,
+ 13, 7, 5, 13, 19, 27, 18, 16, 28, 10,
+ 0, 20, 16, 12, 10, 12, 10, 8, 5, 9,
+ 17, 5, 14, 29, 12, 40, 8, 12, 20, 24,
+ 32, 12, 16, 20, 32, 7, 13, 29, 66, 68,
+ 80, 56, 52, 68, 68, 68, 76, 66, 78, 76,
+ 56, 46, 12, 52, 72, 46, 34, 46, 40, 26,
+ 26, 18, 12, 2, 9, 13, 25, 52, 50, 50,
+ 40, 18, 24, 16, 1, 2, 5, 11, 31, 29,
+ 47, 53, 15, 15, 49, 22, 24, 10, 11, 4,
+ 2, 15, 3, 7, 19, 39, 39, 55, 75, 14,
+ 5, 29, 19, 4, 2, 12, 16, 18, 34, 22,
+ 30, 34, 58, 40, 38, 58, 56, 106, 74, 54,
+ 34, 18, 1, 25, 49, 89, 12, 76, 62, 58,
+ 44, 48, 24, 16, 16, 3, 29, 17, 0, 15,
+ 14, 30, 9, 11, 10, 18, 8, 16, 34, 8,
+ 4, 96, 64, 34, 4, 17, 49, 73, 91, 111,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 23 */
+
+ 100, 14, 29, 100, 14, 29, 45, 13, 38, 22,
+ 4, 5, 56, 66, 106, 28, 1, 1, 8, 28,
+ 0, 5, 7, 21, 51, 16, 44, 81, 99, 105,
+ 6, 23, 9, 8, 28, 0, 27, 13, 22, 12,
+ 1, 17, 33, 0, 27, 29, 53, 10, 15, 31,
+ 4, 21, 25, 49, 14, 5, 9, 17, 4, 8,
+ 44, 0, 0, 0, 13, 55, 67, 4, 8, 5,
+ 36, 17, 55, 47, 12, 3, 60, 44, 3, 6,
+ 29, 6, 25, 21, 43, 35, 35, 43, 51, 44,
+ 0, 16, 33, 11, 31, 23, 57, 4, 9, 7,
+ 27, 34, 9, 0, 25, 37, 11, 19, 5, 14,
+ 12, 0, 24, 26, 23, 1, 1, 7, 8, 19,
+ 26, 11, 2, 26, 26, 44, 32, 18, 29, 2,
+ 13, 1, 4, 51, 3, 16, 6, 10, 14, 14,
+ 38, 22, 6, 2, 6, 2, 8, 14, 45, 9,
+ 2, 23, 6, 2, 24, 12, 0, 22, 12, 8,
+ 1, 24, 0, 25, 6, 35, 40, 42, 30, 28,
+ 34, 34, 20, 32, 38, 1, 18, 18, 10, 6,
+ 9, 8, 3, 0, 6, 0, 6, 14, 1, 33,
+ 3, 19, 7, 2, 37, 42, 54, 40, 38, 34,
+ 32, 28, 28, 16, 0, 1, 7, 19, 21, 61,
+ 9, 5, 31, 20, 12, 6, 6, 3, 15, 9,
+ 11, 29, 15, 25, 57, 45, 63, 7, 44, 36,
+ 22, 8, 14, 1, 11, 13, 27, 1, 70, 42,
+ 30, 16, 26, 6, 3, 15, 23, 8, 78, 58,
+ 48, 36, 34, 10, 1, 11, 23, 13, 80, 50,
+ 22, 6, 16, 7, 25, 35, 10, 94, 70, 50,
+ 36, 42, 10, 3, 13, 19, 124, 47, 39, 17,
+ 39, 41, 27, 29, 23, 27, 25, 21, 21, 35,
+ 27, 39, 33, 2, 29, 31, 21, 13, 9, 13,
+ 13, 9, 7, 15, 21, 29, 16, 14, 28, 8,
+ 0, 20, 16, 12, 8, 10, 10, 6, 7, 9,
+ 17, 5, 14, 29, 10, 40, 6, 12, 18, 22,
+ 32, 10, 14, 18, 30, 9, 15, 33, 64, 68,
+ 80, 54, 48, 64, 64, 64, 72, 60, 72, 72,
+ 50, 42, 8, 46, 64, 40, 26, 42, 34, 22,
+ 24, 16, 12, 4, 7, 11, 21, 50, 46, 46,
+ 38, 14, 20, 14, 5, 0, 9, 15, 35, 33,
+ 49, 55, 15, 15, 51, 18, 20, 6, 13, 0,
+ 1, 19, 5, 11, 23, 41, 41, 57, 75, 12,
+ 5, 31, 19, 6, 4, 12, 18, 20, 36, 24,
+ 32, 36, 60, 42, 40, 60, 58, 106, 70, 50,
+ 28, 14, 7, 31, 55, 93, 12, 78, 64, 60,
+ 46, 50, 24, 18, 18, 1, 27, 17, 2, 13,
+ 16, 32, 9, 9, 10, 18, 8, 18, 36, 8,
+ 4, 92, 60, 28, 1, 23, 57, 81, 99, 117,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 24 */
+
+ 96, 12, 29, 96, 12, 29, 43, 11, 38, 22,
+ 4, 7, 54, 64, 106, 28, 2, 1, 10, 30,
+ 0, 7, 7, 23, 55, 14, 40, 85, 101, 107,
+ 10, 21, 9, 10, 30, 0, 27, 11, 22, 10,
+ 1, 17, 31, 1, 27, 29, 55, 10, 15, 31,
+ 4, 23, 25, 49, 14, 5, 9, 17, 4, 8,
+ 44, 0, 0, 0, 11, 57, 67, 4, 6, 5,
+ 36, 17, 55, 43, 14, 0, 62, 46, 1, 8,
+ 27, 8, 23, 19, 39, 35, 35, 43, 49, 44,
+ 0, 16, 31, 11, 29, 23, 55, 4, 11, 9,
+ 29, 34, 9, 0, 25, 37, 11, 19, 5, 12,
+ 12, 0, 24, 24, 23, 1, 1, 7, 6, 19,
+ 24, 13, 2, 24, 22, 42, 30, 16, 29, 2,
+ 13, 3, 2, 51, 5, 14, 6, 10, 14, 14,
+ 40, 22, 6, 0, 6, 0, 6, 12, 47, 9,
+ 2, 25, 6, 0, 24, 10, 1, 22, 12, 8,
+ 5, 24, 1, 25, 6, 37, 38, 40, 30, 28,
+ 34, 32, 18, 32, 36, 3, 16, 18, 10, 4,
+ 11, 6, 5, 0, 4, 1, 4, 12, 3, 33,
+ 3, 19, 9, 1, 37, 38, 50, 36, 34, 30,
+ 26, 24, 24, 12, 3, 5, 11, 23, 25, 63,
+ 9, 5, 33, 18, 8, 2, 2, 7, 17, 13,
+ 15, 31, 17, 27, 57, 45, 61, 7, 44, 36,
+ 22, 8, 14, 1, 11, 13, 25, 0, 70, 42,
+ 30, 16, 26, 6, 3, 13, 21, 8, 78, 58,
+ 48, 34, 34, 10, 1, 11, 23, 13, 80, 50,
+ 22, 6, 16, 7, 25, 33, 10, 92, 68, 48,
+ 34, 40, 10, 3, 13, 19, 124, 45, 37, 17,
+ 37, 39, 27, 27, 21, 25, 23, 19, 19, 35,
+ 27, 39, 31, 4, 29, 33, 21, 13, 9, 13,
+ 15, 9, 9, 17, 25, 31, 14, 14, 28, 8,
+ 1, 18, 14, 10, 6, 8, 8, 6, 9, 11,
+ 19, 5, 14, 31, 8, 38, 4, 12, 16, 20,
+ 32, 8, 12, 18, 30, 11, 17, 37, 62, 66,
+ 78, 50, 46, 62, 60, 60, 66, 56, 66, 66,
+ 44, 36, 6, 40, 58, 34, 18, 36, 30, 18,
+ 20, 12, 12, 4, 7, 9, 17, 46, 42, 42,
+ 34, 8, 16, 10, 9, 3, 13, 17, 39, 35,
+ 51, 57, 17, 17, 53, 16, 18, 2, 17, 3,
+ 5, 21, 9, 15, 25, 43, 45, 61, 77, 10,
+ 7, 33, 19, 6, 4, 14, 18, 20, 38, 24,
+ 34, 38, 62, 44, 42, 62, 60, 104, 66, 46,
+ 24, 8, 13, 37, 61, 99, 12, 78, 64, 60,
+ 46, 50, 24, 18, 18, 1, 27, 17, 2, 13,
+ 18, 34, 9, 9, 12, 20, 8, 18, 36, 8,
+ 2, 90, 56, 24, 7, 29, 65, 89, 105, 123,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 25 */
+
+ 94, 12, 29, 94, 12, 29, 39, 9, 40, 22,
+ 4, 9, 52, 62, 106, 28, 6, 1, 12, 32,
+ 0, 7, 5, 25, 57, 12, 36, 87, 103, 109,
+ 16, 17, 9, 12, 32, 0, 25, 9, 24, 8,
+ 1, 15, 29, 1, 27, 29, 55, 10, 15, 29,
+ 4, 23, 25, 49, 14, 5, 7, 17, 4, 8,
+ 44, 0, 0, 0, 9, 57, 67, 4, 4, 5,
+ 36, 15, 55, 39, 18, 4, 64, 50, 2, 10,
+ 23, 10, 19, 17, 35, 35, 35, 43, 47, 44,
+ 0, 16, 29, 11, 27, 21, 53, 4, 11, 9,
+ 29, 34, 9, 2, 23, 35, 9, 19, 5, 12,
+ 12, 0, 24, 24, 21, 1, 0, 7, 6, 17,
+ 22, 13, 2, 22, 20, 42, 28, 14, 27, 2,
+ 11, 5, 0, 51, 7, 14, 6, 12, 14, 14,
+ 42, 22, 6, 0, 8, 0, 6, 12, 49, 9,
+ 2, 27, 6, 1, 24, 10, 3, 22, 12, 8,
+ 7, 24, 1, 25, 6, 39, 36, 40, 30, 28,
+ 34, 32, 18, 32, 36, 3, 16, 18, 10, 4,
+ 11, 6, 5, 0, 4, 1, 2, 12, 3, 33,
+ 3, 19, 11, 3, 37, 34, 48, 34, 32, 26,
+ 22, 22, 22, 8, 7, 7, 13, 27, 27, 65,
+ 9, 5, 35, 16, 6, 0, 0, 9, 19, 15,
+ 17, 33, 19, 29, 57, 43, 59, 5, 46, 36,
+ 22, 8, 16, 0, 9, 11, 23, 2, 70, 42,
+ 30, 16, 28, 6, 1, 11, 17, 8, 78, 58,
+ 48, 34, 36, 10, 1, 11, 21, 13, 82, 50,
+ 22, 6, 18, 7, 25, 31, 10, 92, 68, 48,
+ 32, 40, 10, 3, 11, 17, 124, 43, 35, 15,
+ 35, 35, 25, 23, 19, 21, 21, 17, 15, 33,
+ 25, 39, 29, 8, 29, 35, 19, 11, 9, 13,
+ 17, 9, 9, 19, 27, 33, 14, 14, 28, 8,
+ 1, 16, 14, 10, 4, 8, 8, 6, 11, 11,
+ 19, 3, 16, 31, 8, 36, 4, 12, 16, 20,
+ 32, 6, 10, 18, 30, 13, 17, 39, 62, 64,
+ 76, 48, 44, 60, 56, 56, 62, 52, 62, 60,
+ 40, 32, 4, 34, 52, 28, 10, 30, 26, 16,
+ 18, 10, 12, 6, 7, 7, 13, 42, 40, 38,
+ 30, 4, 14, 8, 13, 5, 15, 19, 41, 37,
+ 53, 57, 17, 17, 55, 14, 16, 0, 21, 5,
+ 7, 23, 11, 17, 27, 45, 47, 63, 79, 10,
+ 9, 33, 17, 6, 4, 16, 20, 22, 40, 26,
+ 36, 40, 66, 46, 44, 66, 64, 102, 64, 42,
+ 20, 2, 19, 43, 67, 103, 14, 78, 64, 62,
+ 46, 52, 26, 18, 20, 0, 27, 15, 4, 13,
+ 20, 38, 7, 7, 14, 22, 8, 20, 38, 8,
+ 2, 88, 52, 20, 11, 35, 71, 97, 111, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 26 */
+
+ 92, 12, 29, 92, 12, 29, 35, 5, 40, 22,
+ 2, 13, 48, 58, 106, 28, 12, 3, 14, 32,
+ 0, 9, 5, 27, 59, 12, 32, 91, 105, 109,
+ 22, 15, 9, 14, 32, 0, 25, 7, 24, 8,
+ 1, 15, 29, 1, 29, 27, 55, 10, 15, 27,
+ 2, 23, 25, 49, 16, 3, 7, 15, 6, 8,
+ 44, 0, 0, 0, 9, 57, 67, 6, 2, 5,
+ 36, 15, 55, 37, 22, 6, 68, 54, 4, 12,
+ 21, 14, 17, 15, 31, 33, 33, 41, 45, 44,
+ 0, 16, 29, 11, 27, 21, 49, 2, 11, 9,
+ 29, 34, 9, 2, 23, 35, 9, 19, 3, 12,
+ 10, 0, 24, 24, 21, 1, 0, 7, 6, 17,
+ 22, 13, 2, 22, 18, 40, 26, 12, 25, 4,
+ 11, 7, 0, 49, 7, 14, 4, 12, 14, 14,
+ 44, 24, 6, 0, 8, 0, 4, 12, 51, 9,
+ 2, 27, 4, 3, 22, 10, 3, 24, 12, 8,
+ 9, 22, 3, 25, 6, 39, 36, 40, 30, 28,
+ 34, 32, 18, 32, 36, 3, 16, 18, 10, 4,
+ 11, 6, 5, 1, 4, 1, 2, 10, 5, 33,
+ 5, 19, 13, 5, 37, 32, 44, 30, 28, 22,
+ 18, 18, 18, 4, 11, 11, 17, 33, 31, 67,
+ 9, 7, 35, 14, 4, 1, 1, 13, 23, 17,
+ 19, 33, 19, 29, 57, 43, 59, 5, 46, 36,
+ 22, 8, 16, 0, 9, 11, 21, 2, 70, 42,
+ 30, 16, 28, 8, 0, 11, 13, 8, 78, 58,
+ 48, 34, 36, 10, 1, 11, 19, 13, 82, 48,
+ 20, 6, 18, 7, 25, 31, 10, 90, 66, 46,
+ 30, 40, 10, 3, 11, 17, 124, 41, 33, 13,
+ 31, 33, 23, 21, 15, 19, 17, 13, 11, 31,
+ 23, 37, 29, 10, 27, 35, 19, 9, 9, 15,
+ 17, 11, 11, 21, 29, 35, 12, 12, 28, 6,
+ 3, 16, 14, 10, 2, 6, 8, 4, 13, 11,
+ 19, 3, 16, 31, 6, 36, 2, 12, 14, 18,
+ 32, 4, 8, 16, 28, 15, 19, 43, 60, 64,
+ 76, 46, 40, 56, 52, 52, 58, 46, 56, 56,
+ 34, 28, 0, 28, 44, 22, 2, 26, 20, 12,
+ 16, 8, 12, 8, 5, 5, 9, 40, 36, 34,
+ 28, 0, 10, 4, 17, 7, 19, 23, 45, 41,
+ 55, 59, 19, 17, 57, 10, 12, 3, 23, 9,
+ 11, 27, 15, 21, 31, 47, 49, 65, 79, 8,
+ 9, 35, 17, 8, 6, 16, 20, 24, 42, 28,
+ 38, 42, 68, 48, 46, 68, 66, 102, 60, 38,
+ 14, 1, 25, 49, 73, 107, 14, 80, 66, 62,
+ 48, 54, 26, 20, 20, 0, 25, 15, 6, 11,
+ 22, 40, 7, 5, 14, 22, 8, 20, 40, 8,
+ 2, 84, 48, 14, 17, 41, 79, 105, 119, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 27 */
+
+ 90, 12, 31, 90, 12, 31, 31, 3, 42, 22,
+ 2, 15, 46, 56, 106, 28, 16, 3, 14, 34,
+ 0, 11, 5, 29, 61, 10, 30, 93, 107, 111,
+ 28, 13, 9, 14, 34, 0, 23, 5, 26, 6,
+ 0, 13, 27, 3, 29, 27, 55, 10, 13, 27,
+ 2, 23, 23, 49, 16, 3, 7, 15, 6, 8,
+ 44, 0, 0, 0, 7, 57, 67, 6, 2, 7,
+ 36, 15, 53, 33, 26, 10, 70, 58, 6, 14,
+ 17, 16, 15, 15, 29, 33, 33, 41, 45, 46,
+ 0, 16, 27, 9, 25, 19, 47, 2, 13, 11,
+ 31, 34, 9, 2, 21, 35, 7, 19, 3, 12,
+ 10, 2, 24, 22, 21, 1, 0, 7, 6, 17,
+ 20, 13, 2, 20, 16, 38, 26, 12, 25, 4,
+ 11, 7, 1, 49, 9, 14, 4, 12, 14, 14,
+ 44, 24, 6, 0, 10, 0, 2, 12, 51, 11,
+ 2, 29, 4, 5, 22, 10, 5, 24, 12, 8,
+ 13, 22, 5, 25, 6, 41, 34, 40, 28, 28,
+ 34, 32, 18, 32, 36, 3, 14, 18, 8, 4,
+ 11, 4, 7, 1, 2, 3, 0, 8, 5, 33,
+ 5, 17, 15, 9, 37, 28, 40, 26, 24, 18,
+ 14, 14, 14, 0, 15, 13, 19, 37, 35, 69,
+ 9, 7, 37, 12, 2, 3, 5, 15, 25, 19,
+ 21, 35, 21, 31, 57, 43, 57, 3, 46, 36,
+ 22, 10, 16, 0, 9, 9, 19, 4, 68, 42,
+ 30, 16, 30, 8, 2, 9, 9, 8, 80, 58,
+ 48, 34, 38, 10, 1, 9, 19, 13, 82, 48,
+ 20, 6, 18, 5, 23, 29, 10, 90, 66, 44,
+ 30, 40, 10, 1, 9, 15, 124, 41, 31, 13,
+ 29, 31, 21, 17, 13, 17, 15, 11, 9, 31,
+ 21, 37, 27, 14, 27, 37, 19, 9, 9, 15,
+ 19, 11, 13, 23, 31, 37, 12, 12, 28, 6,
+ 3, 14, 12, 10, 2, 6, 6, 4, 13, 11,
+ 21, 3, 16, 33, 6, 34, 0, 10, 12, 18,
+ 34, 4, 6, 16, 28, 15, 19, 45, 58, 62,
+ 74, 44, 38, 54, 48, 48, 54, 42, 52, 50,
+ 28, 22, 1, 24, 38, 16, 5, 20, 16, 8,
+ 14, 6, 12, 10, 5, 1, 3, 36, 32, 30,
+ 24, 3, 6, 2, 21, 11, 21, 25, 47, 43,
+ 57, 61, 19, 19, 59, 8, 10, 5, 27, 13,
+ 15, 29, 17, 25, 33, 51, 53, 69, 81, 6,
+ 11, 37, 17, 8, 6, 18, 22, 24, 46, 30,
+ 40, 44, 70, 52, 48, 72, 68, 100, 58, 34,
+ 10, 7, 29, 55, 77, 111, 16, 80, 66, 64,
+ 48, 56, 28, 20, 22, 2, 25, 13, 6, 11,
+ 24, 42, 7, 5, 16, 24, 8, 22, 42, 8,
+ 2, 82, 44, 10, 23, 49, 87, 111, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 28 */
+
+ 86, 12, 31, 86, 12, 31, 29, 0, 42, 22,
+ 0, 19, 42, 54, 106, 28, 20, 3, 16, 36,
+ 0, 13, 5, 31, 63, 10, 26, 97, 109, 111,
+ 34, 11, 9, 16, 36, 0, 23, 3, 26, 6,
+ 0, 13, 25, 3, 31, 27, 55, 10, 13, 25,
+ 2, 25, 23, 49, 18, 3, 7, 15, 8, 8,
+ 44, 0, 0, 0, 7, 59, 67, 8, 0, 7,
+ 36, 15, 53, 29, 28, 12, 74, 60, 8, 16,
+ 15, 18, 13, 13, 25, 31, 33, 39, 43, 46,
+ 0, 16, 27, 9, 25, 19, 43, 2, 13, 11,
+ 31, 34, 9, 2, 21, 35, 7, 19, 1, 12,
+ 10, 2, 24, 22, 21, 1, 0, 7, 6, 17,
+ 18, 15, 2, 18, 14, 36, 24, 10, 23, 6,
+ 11, 9, 1, 47, 9, 14, 4, 12, 14, 14,
+ 46, 24, 6, 1, 10, 1, 0, 10, 53, 11,
+ 2, 29, 2, 7, 22, 8, 7, 24, 12, 8,
+ 15, 22, 7, 25, 6, 41, 32, 40, 28, 28,
+ 34, 32, 18, 32, 34, 3, 14, 18, 8, 4,
+ 13, 4, 7, 3, 2, 3, 1, 6, 7, 33,
+ 7, 17, 17, 11, 37, 26, 36, 22, 20, 14,
+ 10, 10, 10, 3, 19, 17, 23, 41, 39, 71,
+ 9, 7, 37, 10, 0, 5, 7, 19, 29, 21,
+ 25, 37, 21, 31, 57, 43, 57, 3, 46, 36,
+ 22, 10, 16, 0, 9, 9, 17, 4, 68, 42,
+ 30, 16, 30, 10, 2, 7, 5, 8, 80, 58,
+ 48, 34, 38, 10, 1, 9, 17, 13, 82, 48,
+ 18, 6, 18, 5, 23, 27, 10, 88, 64, 42,
+ 28, 40, 10, 1, 9, 15, 124, 39, 29, 11,
+ 27, 29, 19, 15, 11, 15, 13, 7, 5, 29,
+ 21, 37, 27, 16, 27, 37, 19, 7, 9, 17,
+ 19, 11, 15, 25, 33, 39, 10, 10, 28, 4,
+ 5, 14, 12, 10, 0, 4, 6, 4, 15, 13,
+ 21, 3, 16, 33, 4, 34, 1, 10, 10, 16,
+ 34, 2, 4, 14, 26, 17, 21, 49, 56, 60,
+ 74, 42, 36, 50, 44, 44, 50, 36, 46, 46,
+ 22, 18, 3, 18, 32, 10, 13, 14, 10, 4,
+ 12, 2, 12, 12, 5, 0, 0, 32, 28, 26,
+ 20, 7, 2, 1, 25, 13, 25, 27, 51, 47,
+ 59, 63, 21, 19, 61, 6, 6, 9, 31, 17,
+ 19, 33, 21, 29, 37, 53, 55, 71, 83, 4,
+ 13, 39, 17, 10, 6, 18, 22, 26, 48, 30,
+ 42, 46, 72, 54, 50, 74, 70, 100, 54, 30,
+ 4, 11, 35, 61, 83, 117, 16, 82, 68, 64,
+ 48, 56, 28, 20, 22, 2, 25, 13, 8, 9,
+ 26, 44, 7, 3, 16, 24, 8, 22, 44, 8,
+ 2, 80, 40, 4, 29, 55, 95, 119, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 29 */
+
+ 84, 12, 31, 84, 12, 31, 25, 2, 42, 22,
+ 0, 21, 40, 50, 106, 28, 26, 5, 18, 36,
+ 0, 13, 3, 33, 65, 8, 22, 101, 111, 113,
+ 40, 7, 9, 18, 36, 0, 23, 1, 26, 4,
+ 0, 13, 25, 3, 31, 25, 55, 10, 13, 23,
+ 0, 25, 23, 49, 18, 1, 5, 13, 8, 8,
+ 44, 0, 0, 0, 5, 59, 67, 8, 1, 7,
+ 36, 13, 53, 27, 32, 16, 76, 64, 12, 18,
+ 13, 22, 11, 11, 21, 31, 31, 39, 41, 46,
+ 0, 16, 25, 9, 23, 17, 41, 0, 13, 11,
+ 31, 34, 9, 4, 19, 35, 7, 19, 1, 12,
+ 8, 2, 24, 22, 19, 1, 2, 7, 6, 17,
+ 18, 15, 2, 18, 12, 36, 22, 8, 21, 6,
+ 11, 11, 3, 47, 11, 14, 2, 12, 14, 14,
+ 48, 26, 6, 1, 10, 1, 1, 10, 55, 11,
+ 2, 31, 2, 9, 20, 8, 7, 26, 12, 8,
+ 17, 20, 9, 25, 6, 43, 32, 40, 28, 28,
+ 34, 32, 18, 32, 34, 3, 14, 18, 8, 4,
+ 13, 4, 7, 3, 2, 3, 1, 4, 7, 33,
+ 7, 17, 19, 13, 37, 22, 32, 20, 18, 10,
+ 6, 6, 6, 7, 23, 21, 27, 47, 41, 73,
+ 9, 9, 39, 8, 1, 7, 9, 21, 31, 23,
+ 27, 37, 23, 33, 57, 43, 55, 3, 46, 36,
+ 22, 10, 16, 2, 7, 7, 15, 6, 68, 42,
+ 30, 16, 30, 10, 4, 7, 1, 8, 80, 58,
+ 48, 34, 38, 10, 1, 9, 15, 13, 84, 46,
+ 18, 6, 18, 5, 23, 27, 10, 88, 62, 40,
+ 26, 40, 10, 1, 9, 13, 124, 37, 27, 9,
+ 23, 27, 17, 13, 7, 13, 9, 5, 1, 27,
+ 19, 35, 25, 18, 25, 39, 17, 5, 9, 17,
+ 21, 13, 15, 27, 35, 41, 8, 10, 28, 4,
+ 5, 12, 12, 10, 1, 2, 6, 2, 17, 13,
+ 21, 3, 16, 33, 2, 32, 1, 10, 8, 14,
+ 34, 0, 2, 14, 26, 19, 23, 53, 54, 60,
+ 72, 40, 32, 48, 40, 40, 46, 32, 40, 40,
+ 16, 14, 7, 12, 24, 4, 21, 10, 6, 0,
+ 10, 0, 12, 14, 3, 2, 4, 30, 26, 22,
+ 18, 11, 1, 3, 29, 15, 29, 31, 55, 49,
+ 61, 65, 21, 19, 63, 2, 4, 13, 33, 19,
+ 21, 35, 23, 31, 39, 55, 57, 73, 83, 4,
+ 13, 41, 15, 10, 8, 20, 24, 28, 50, 32,
+ 44, 48, 76, 56, 52, 76, 74, 98, 50, 26,
+ 0, 17, 41, 67, 89, 121, 16, 82, 68, 66,
+ 50, 58, 28, 22, 24, 4, 23, 13, 10, 9,
+ 28, 46, 5, 1, 18, 26, 8, 24, 46, 8,
+ 2, 76, 36, 0, 35, 61, 101, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 30 */
+
+ 82, 12, 31, 82, 12, 31, 21, 6, 44, 22,
+ 1, 25, 36, 48, 106, 28, 30, 5, 20, 38,
+ 0, 15, 3, 35, 67, 8, 18, 103, 113, 113,
+ 46, 5, 9, 20, 38, 0, 21, 0, 28, 4,
+ 0, 11, 23, 5, 33, 25, 55, 10, 13, 23,
+ 0, 25, 23, 49, 20, 1, 5, 13, 10, 8,
+ 44, 0, 0, 0, 5, 59, 67, 10, 3, 7,
+ 36, 13, 53, 23, 36, 18, 80, 68, 14, 20,
+ 9, 24, 9, 9, 17, 29, 31, 37, 39, 46,
+ 0, 16, 25, 9, 23, 17, 37, 0, 15, 13,
+ 33, 34, 9, 4, 19, 35, 5, 19, 0, 12,
+ 8, 2, 24, 20, 19, 1, 2, 7, 6, 17,
+ 16, 15, 2, 16, 10, 34, 20, 6, 21, 8,
+ 11, 13, 3, 45, 11, 14, 2, 12, 14, 14,
+ 50, 26, 6, 1, 12, 1, 3, 10, 57, 11,
+ 2, 31, 0, 11, 20, 8, 9, 26, 12, 8,
+ 21, 20, 11, 25, 6, 43, 30, 40, 28, 28,
+ 34, 32, 18, 32, 34, 3, 12, 18, 8, 4,
+ 13, 2, 9, 5, 0, 5, 3, 2, 9, 33,
+ 9, 17, 21, 17, 37, 20, 28, 16, 14, 6,
+ 2, 2, 2, 11, 27, 23, 29, 51, 45, 75,
+ 9, 9, 39, 6, 3, 9, 13, 25, 35, 25,
+ 29, 39, 23, 33, 57, 43, 55, 1, 46, 36,
+ 22, 10, 16, 2, 7, 7, 13, 6, 68, 42,
+ 30, 16, 32, 12, 6, 5, 2, 8, 80, 58,
+ 48, 34, 40, 10, 1, 9, 15, 13, 84, 46,
+ 16, 6, 18, 5, 23, 25, 10, 86, 62, 38,
+ 24, 40, 10, 1, 7, 13, 124, 35, 25, 9,
+ 21, 25, 15, 9, 5, 11, 7, 1, 0, 27,
+ 17, 35, 25, 22, 25, 39, 17, 5, 9, 19,
+ 21, 13, 17, 29, 37, 43, 8, 8, 28, 2,
+ 7, 12, 10, 10, 3, 2, 4, 2, 19, 13,
+ 23, 3, 16, 35, 2, 32, 3, 10, 6, 14,
+ 34, 1, 0, 12, 24, 21, 23, 55, 52, 58,
+ 72, 38, 30, 44, 36, 36, 42, 26, 36, 36,
+ 10, 8, 9, 6, 18, 1, 29, 4, 0, 3,
+ 8, 1, 12, 16, 3, 4, 8, 26, 22, 18,
+ 14, 15, 5, 7, 33, 19, 31, 33, 57, 53,
+ 63, 67, 23, 21, 65, 0, 0, 15, 37, 23,
+ 25, 39, 27, 35, 43, 57, 61, 77, 85, 2,
+ 15, 43, 15, 12, 8, 20, 24, 28, 52, 34,
+ 46, 50, 78, 58, 54, 80, 76, 98, 48, 22,
+ 5, 21, 47, 73, 95, 125, 18, 84, 70, 66,
+ 50, 60, 30, 22, 24, 4, 23, 11, 10, 7,
+ 30, 48, 5, 1, 18, 26, 8, 24, 48, 8,
+ 2, 74, 32, 5, 41, 67, 109, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 31 */
+
+ 80, 12, 31, 80, 12, 31, 17, 8, 44, 22,
+ 1, 27, 34, 46, 106, 28, 34, 5, 22, 40,
+ 0, 17, 3, 37, 69, 6, 14, 107, 115, 115,
+ 52, 3, 9, 22, 40, 0, 21, 2, 28, 2,
+ 0, 11, 21, 5, 33, 25, 55, 10, 13, 21,
+ 0, 25, 23, 49, 20, 1, 5, 13, 10, 8,
+ 44, 0, 0, 0, 3, 59, 67, 10, 5, 7,
+ 36, 13, 53, 19, 40, 22, 82, 72, 16, 22,
+ 7, 26, 7, 7, 13, 29, 31, 37, 37, 46,
+ 0, 16, 23, 9, 21, 15, 35, 0, 15, 13,
+ 33, 34, 9, 4, 17, 35, 5, 19, 0, 12,
+ 8, 2, 24, 20, 19, 1, 2, 7, 6, 17,
+ 14, 15, 2, 14, 8, 32, 18, 4, 19, 8,
+ 11, 15, 5, 45, 13, 14, 2, 12, 14, 14,
+ 52, 26, 6, 1, 12, 1, 5, 10, 59, 11,
+ 2, 33, 0, 13, 20, 8, 11, 26, 12, 8,
+ 23, 20, 13, 25, 6, 45, 28, 40, 28, 28,
+ 34, 32, 18, 32, 34, 3, 12, 18, 8, 4,
+ 13, 2, 9, 5, 0, 5, 5, 0, 9, 33,
+ 9, 17, 23, 19, 37, 16, 24, 12, 10, 2,
+ 1, 1, 1, 15, 31, 27, 33, 55, 49, 77,
+ 9, 9, 41, 4, 5, 11, 15, 27, 37, 27,
+ 31, 41, 25, 35, 57, 43, 53, 1, 46, 36,
+ 22, 10, 16, 2, 7, 5, 11, 8, 68, 42,
+ 30, 16, 32, 12, 8, 3, 6, 8, 80, 58,
+ 48, 34, 40, 10, 1, 9, 13, 13, 84, 46,
+ 16, 6, 18, 5, 23, 23, 10, 86, 60, 36,
+ 22, 40, 10, 1, 7, 11, 124, 33, 23, 7,
+ 19, 23, 13, 7, 3, 9, 5, 0, 4, 25,
+ 15, 35, 23, 24, 25, 41, 17, 3, 9, 19,
+ 23, 13, 19, 31, 39, 45, 6, 8, 28, 2,
+ 7, 10, 10, 10, 5, 0, 4, 2, 21, 13,
+ 23, 3, 16, 35, 0, 30, 5, 10, 4, 12,
+ 34, 3, 1, 12, 24, 23, 25, 59, 50, 56,
+ 70, 36, 28, 42, 32, 32, 38, 22, 30, 30,
+ 4, 4, 11, 0, 12, 7, 37, 1, 3, 7,
+ 6, 3, 12, 18, 3, 6, 12, 22, 18, 14,
+ 10, 19, 9, 9, 37, 21, 35, 35, 61, 55,
+ 65, 69, 23, 21, 67, 1, 1, 19, 41, 27,
+ 29, 41, 29, 39, 45, 59, 63, 79, 87, 0,
+ 17, 45, 15, 12, 8, 22, 26, 30, 54, 36,
+ 48, 52, 80, 60, 56, 82, 78, 96, 44, 18,
+ 9, 27, 53, 79, 101, 125, 18, 84, 70, 68,
+ 50, 62, 30, 22, 26, 6, 23, 11, 12, 7,
+ 32, 50, 5, 0, 20, 28, 8, 26, 50, 8,
+ 2, 72, 28, 9, 47, 73, 117, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 32 */
+
+ 76, 10, 33, 76, 10, 33, 15, 10, 44, 22,
+ 3, 31, 30, 42, 104, 28, 38, 7, 22, 40,
+ 1, 19, 3, 41, 73, 4, 10, 111, 117, 117,
+ 56, 1, 11, 22, 40, 1, 21, 4, 28, 0,
+ 0, 11, 21, 7, 35, 25, 57, 10, 13, 21,
+ 1, 27, 23, 49, 20, 1, 5, 13, 10, 6,
+ 44, 0, 0, 0, 3, 61, 67, 10, 7, 9,
+ 36, 13, 53, 17, 42, 24, 84, 74, 18, 24,
+ 5, 28, 5, 7, 11, 29, 31, 37, 37, 46,
+ 0, 16, 23, 9, 21, 15, 33, 1, 17, 15,
+ 35, 34, 9, 4, 17, 35, 5, 19, 0, 10,
+ 6, 2, 22, 18, 19, 3, 2, 7, 4, 17,
+ 12, 17, 0, 12, 4, 30, 16, 2, 19, 8,
+ 11, 17, 7, 45, 15, 12, 0, 12, 14, 14,
+ 52, 26, 6, 3, 12, 3, 7, 8, 61, 13,
+ 0, 35, 1, 15, 18, 6, 13, 26, 12, 8,
+ 27, 18, 15, 25, 6, 47, 26, 38, 26, 28,
+ 34, 30, 16, 32, 32, 5, 10, 18, 6, 2,
+ 15, 0, 11, 7, 1, 7, 7, 1, 11, 33,
+ 11, 17, 25, 23, 37, 12, 20, 8, 6, 1,
+ 7, 5, 5, 19, 35, 31, 37, 61, 53, 81,
+ 11, 11, 43, 2, 9, 15, 19, 31, 41, 31,
+ 35, 43, 27, 37, 57, 43, 53, 1, 46, 36,
+ 22, 10, 16, 2, 7, 5, 9, 8, 66, 42,
+ 30, 16, 32, 12, 8, 3, 8, 6, 80, 58,
+ 46, 32, 40, 10, 1, 9, 13, 13, 84, 44,
+ 14, 4, 18, 5, 23, 23, 10, 84, 58, 34,
+ 20, 38, 10, 1, 7, 11, 124, 33, 21, 7,
+ 17, 21, 13, 5, 1, 7, 3, 2, 6, 25,
+ 15, 35, 23, 26, 25, 43, 17, 3, 9, 21,
+ 25, 15, 21, 33, 43, 49, 4, 6, 28, 0,
+ 9, 8, 8, 8, 7, 1, 2, 0, 23, 15,
+ 25, 3, 16, 37, 1, 28, 7, 8, 2, 10,
+ 34, 5, 5, 10, 22, 25, 27, 63, 48, 54,
+ 68, 32, 24, 38, 28, 28, 32, 16, 24, 24,
+ 1, 1, 15, 5, 4, 13, 47, 7, 9, 11,
+ 2, 7, 12, 18, 3, 8, 16, 18, 14, 10,
+ 6, 25, 13, 13, 43, 25, 39, 39, 65, 59,
+ 67, 71, 25, 23, 71, 5, 5, 23, 45, 31,
+ 33, 45, 33, 43, 49, 63, 67, 83, 89, 1,
+ 19, 47, 15, 12, 8, 22, 26, 30, 56, 36,
+ 50, 52, 82, 62, 58, 84, 80, 94, 40, 12,
+ 15, 33, 59, 87, 107, 125, 18, 84, 70, 68,
+ 50, 62, 30, 22, 26, 6, 23, 11, 12, 7,
+ 34, 52, 5, 0, 20, 28, 8, 26, 50, 8,
+ 0, 68, 22, 15, 53, 81, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 33 */
+
+ 74, 10, 33, 74, 10, 33, 11, 14, 46, 24,
+ 3, 33, 28, 40, 104, 28, 44, 7, 24, 42,
+ 1, 19, 1, 43, 75, 4, 8, 113, 119, 117,
+ 62, 2, 11, 24, 42, 1, 19, 8, 30, 0,
+ 2, 9, 19, 7, 35, 23, 57, 10, 11, 19,
+ 1, 27, 21, 49, 22, 0, 3, 11, 12, 6,
+ 44, 0, 0, 0, 1, 61, 67, 12, 7, 9,
+ 36, 11, 51, 13, 46, 28, 88, 78, 22, 26,
+ 1, 32, 1, 5, 7, 27, 29, 35, 35, 48,
+ 2, 18, 21, 7, 19, 13, 29, 1, 17, 15,
+ 35, 36, 9, 6, 15, 33, 3, 17, 2, 10,
+ 6, 4, 22, 18, 17, 3, 4, 7, 4, 15,
+ 12, 17, 0, 12, 2, 30, 16, 2, 17, 10,
+ 9, 17, 7, 43, 15, 12, 0, 14, 14, 14,
+ 54, 28, 8, 3, 14, 3, 7, 8, 61, 13,
+ 0, 35, 1, 17, 18, 6, 13, 28, 14, 8,
+ 29, 18, 15, 23, 6, 47, 26, 38, 26, 28,
+ 34, 30, 16, 32, 32, 5, 10, 18, 6, 2,
+ 15, 0, 11, 7, 1, 7, 7, 1, 11, 31,
+ 11, 15, 25, 25, 35, 10, 18, 6, 4, 5,
+ 11, 7, 7, 23, 37, 33, 39, 65, 55, 83,
+ 11, 11, 43, 0, 11, 17, 21, 33, 43, 33,
+ 37, 43, 27, 37, 55, 41, 51, 0, 48, 36,
+ 22, 12, 18, 4, 5, 3, 5, 10, 66, 42,
+ 30, 16, 34, 14, 10, 1, 12, 6, 82, 60,
+ 46, 32, 42, 10, 1, 7, 11, 13, 86, 44,
+ 14, 4, 20, 3, 21, 21, 10, 84, 58, 34,
+ 20, 38, 10, 0, 5, 9, 124, 31, 19, 5,
+ 13, 17, 11, 1, 2, 3, 0, 6, 10, 23,
+ 13, 33, 21, 30, 23, 43, 15, 1, 7, 21,
+ 25, 15, 21, 33, 45, 51, 4, 6, 30, 0,
+ 9, 8, 8, 8, 7, 1, 2, 0, 23, 15,
+ 25, 1, 18, 37, 1, 28, 7, 8, 2, 10,
+ 36, 5, 7, 10, 22, 25, 27, 65, 48, 54,
+ 68, 30, 22, 36, 24, 24, 28, 12, 20, 20,
+ 5, 5, 17, 9, 1, 19, 55, 11, 13, 13,
+ 0, 9, 12, 20, 1, 12, 22, 16, 12, 8,
+ 4, 29, 15, 15, 47, 27, 41, 41, 67, 61,
+ 69, 71, 25, 23, 73, 7, 7, 25, 47, 33,
+ 35, 47, 35, 45, 51, 65, 69, 85, 89, 1,
+ 19, 47, 13, 14, 10, 24, 28, 32, 60, 38,
+ 52, 54, 86, 66, 60, 88, 84, 94, 38, 8,
+ 19, 37, 63, 93, 111, 125, 20, 86, 72, 70,
+ 52, 64, 32, 24, 28, 8, 21, 9, 14, 5,
+ 38, 56, 3, 2, 22, 30, 10, 28, 52, 8,
+ 0, 66, 18, 19, 57, 87, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 34 */
+
+ 72, 10, 33, 72, 10, 33, 7, 16, 46, 24,
+ 3, 35, 26, 38, 104, 28, 48, 7, 26, 44,
+ 1, 21, 1, 45, 77, 2, 4, 117, 121, 119,
+ 68, 4, 11, 26, 44, 1, 19, 10, 30, 1,
+ 2, 9, 17, 7, 35, 23, 57, 10, 11, 17,
+ 1, 27, 21, 49, 22, 0, 3, 11, 12, 6,
+ 44, 0, 0, 0, 0, 61, 67, 12, 9, 9,
+ 36, 11, 51, 9, 50, 32, 90, 82, 24, 28,
+ 0, 34, 0, 3, 3, 27, 29, 35, 33, 48,
+ 2, 18, 19, 7, 17, 13, 27, 1, 17, 15,
+ 35, 36, 9, 6, 15, 33, 3, 17, 2, 10,
+ 6, 4, 22, 18, 17, 3, 4, 7, 4, 15,
+ 10, 17, 0, 10, 0, 28, 14, 0, 15, 10,
+ 9, 19, 9, 43, 17, 12, 0, 14, 14, 14,
+ 56, 28, 8, 3, 14, 3, 9, 8, 63, 13,
+ 0, 37, 1, 19, 18, 6, 15, 28, 14, 8,
+ 31, 18, 17, 23, 6, 49, 24, 38, 26, 28,
+ 34, 30, 16, 32, 32, 5, 10, 18, 6, 2,
+ 15, 0, 11, 7, 1, 7, 9, 3, 13, 31,
+ 11, 15, 27, 27, 35, 6, 14, 2, 0, 9,
+ 15, 11, 11, 27, 41, 37, 43, 69, 59, 85,
+ 11, 11, 45, 1, 13, 19, 23, 37, 45, 35,
+ 39, 45, 29, 39, 55, 41, 49, 0, 48, 36,
+ 22, 12, 18, 4, 5, 3, 3, 12, 66, 42,
+ 30, 16, 34, 14, 12, 0, 16, 6, 82, 60,
+ 46, 32, 42, 10, 1, 7, 9, 13, 86, 44,
+ 14, 4, 20, 3, 21, 19, 10, 82, 56, 32,
+ 18, 38, 10, 0, 5, 9, 124, 29, 17, 3,
+ 11, 15, 9, 0, 4, 1, 2, 8, 14, 21,
+ 11, 33, 19, 32, 23, 45, 15, 0, 7, 21,
+ 27, 15, 23, 35, 47, 53, 2, 6, 30, 0,
+ 11, 6, 8, 8, 9, 3, 2, 0, 25, 15,
+ 25, 1, 18, 37, 3, 26, 9, 8, 0, 8,
+ 36, 7, 9, 10, 22, 27, 29, 69, 46, 52,
+ 66, 28, 20, 34, 20, 20, 24, 8, 14, 14,
+ 11, 9, 19, 15, 7, 25, 63, 17, 17, 17,
+ 1, 11, 12, 22, 1, 14, 26, 12, 8, 4,
+ 0, 33, 19, 19, 51, 29, 45, 43, 71, 63,
+ 71, 73, 27, 23, 75, 9, 9, 29, 51, 37,
+ 39, 49, 39, 49, 53, 67, 71, 87, 91, 3,
+ 21, 49, 13, 14, 10, 26, 28, 34, 62, 40,
+ 54, 56, 88, 68, 62, 90, 86, 92, 34, 4,
+ 23, 43, 69, 99, 117, 125, 20, 86, 72, 70,
+ 52, 66, 32, 24, 28, 8, 21, 9, 16, 5,
+ 40, 58, 3, 4, 24, 32, 10, 28, 54, 8,
+ 0, 64, 14, 23, 63, 93, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 35 */
+
+ 70, 10, 33, 70, 10, 33, 3, 20, 48, 24,
+ 5, 39, 22, 36, 104, 28, 52, 7, 28, 46,
+ 1, 23, 1, 47, 79, 2, 0, 119, 123, 119,
+ 74, 6, 11, 28, 46, 1, 17, 12, 32, 1,
+ 2, 7, 15, 9, 37, 23, 57, 10, 11, 17,
+ 1, 27, 21, 49, 24, 0, 3, 11, 14, 6,
+ 44, 0, 0, 0, 0, 61, 67, 14, 11, 9,
+ 36, 11, 51, 5, 54, 34, 94, 86, 26, 30,
+ 4, 36, 2, 1, 0, 25, 29, 33, 31, 48,
+ 2, 18, 19, 7, 17, 11, 23, 1, 19, 17,
+ 37, 36, 9, 6, 13, 33, 1, 17, 4, 10,
+ 6, 4, 22, 16, 17, 3, 4, 7, 4, 15,
+ 8, 17, 0, 8, 1, 26, 12, 1, 15, 12,
+ 9, 21, 9, 41, 17, 12, 0, 14, 14, 14,
+ 58, 28, 8, 3, 16, 3, 11, 8, 65, 13,
+ 0, 37, 3, 21, 18, 6, 17, 28, 14, 8,
+ 35, 18, 19, 23, 6, 49, 22, 38, 26, 28,
+ 34, 30, 16, 32, 32, 5, 8, 18, 6, 2,
+ 15, 1, 13, 9, 3, 9, 11, 5, 13, 31,
+ 13, 15, 29, 31, 35, 4, 10, 1, 3, 13,
+ 19, 15, 15, 31, 45, 39, 45, 73, 63, 87,
+ 11, 11, 45, 3, 15, 21, 27, 39, 49, 37,
+ 41, 47, 29, 39, 55, 41, 49, 2, 48, 36,
+ 22, 12, 18, 4, 5, 1, 1, 12, 66, 42,
+ 30, 16, 36, 16, 14, 2, 20, 6, 82, 60,
+ 46, 32, 44, 10, 1, 7, 9, 13, 86, 44,
+ 12, 4, 20, 3, 21, 17, 10, 82, 56, 30,
+ 16, 38, 10, 0, 3, 7, 124, 27, 15, 3,
+ 9, 13, 7, 4, 6, 0, 4, 12, 16, 21,
+ 9, 33, 19, 36, 23, 45, 15, 0, 7, 23,
+ 27, 15, 25, 37, 49, 55, 2, 4, 30, 1,
+ 11, 6, 6, 8, 11, 3, 0, 0, 27, 15,
+ 27, 1, 18, 39, 3, 26, 11, 8, 1, 8,
+ 36, 9, 11, 8, 20, 29, 29, 71, 44, 50,
+ 66, 26, 18, 30, 16, 16, 20, 2, 10, 10,
+ 17, 15, 21, 21, 13, 31, 71, 23, 23, 21,
+ 3, 13, 12, 24, 1, 16, 30, 8, 4, 0,
+ 3, 37, 23, 21, 55, 33, 47, 45, 73, 67,
+ 73, 75, 27, 25, 77, 11, 13, 31, 55, 41,
+ 43, 53, 41, 53, 57, 69, 75, 91, 93, 5,
+ 23, 51, 13, 16, 10, 26, 30, 34, 64, 42,
+ 56, 58, 90, 70, 64, 94, 88, 92, 32, 0,
+ 29, 47, 75, 105, 123, 125, 22, 88, 74, 72,
+ 52, 68, 34, 24, 30, 10, 21, 7, 16, 3,
+ 42, 60, 3, 4, 24, 32, 10, 30, 56, 8,
+ 0, 62, 10, 29, 69, 99, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 36 */
+
+ 66, 10, 33, 66, 10, 33, 1, 22, 48, 24,
+ 5, 41, 20, 32, 104, 28, 58, 9, 30, 46,
+ 1, 25, 1, 49, 81, 0, 3, 123, 125, 121,
+ 80, 8, 11, 30, 46, 1, 17, 14, 32, 3,
+ 2, 7, 15, 9, 37, 21, 57, 10, 11, 15,
+ 3, 29, 21, 49, 24, 2, 3, 9, 14, 6,
+ 44, 0, 0, 0, 2, 63, 67, 14, 13, 9,
+ 36, 11, 51, 3, 56, 38, 96, 88, 28, 32,
+ 6, 40, 4, 0, 4, 25, 27, 33, 29, 48,
+ 2, 18, 17, 7, 15, 11, 21, 3, 19, 17,
+ 37, 36, 9, 6, 13, 33, 1, 17, 4, 10,
+ 4, 4, 22, 16, 17, 3, 4, 7, 4, 15,
+ 8, 19, 0, 8, 3, 24, 10, 3, 13, 12,
+ 9, 23, 11, 41, 19, 12, 1, 14, 14, 14,
+ 60, 30, 8, 5, 16, 5, 13, 6, 67, 13,
+ 0, 39, 3, 23, 16, 4, 17, 30, 14, 8,
+ 37, 16, 21, 23, 6, 51, 22, 38, 26, 28,
+ 34, 30, 16, 32, 30, 5, 8, 18, 6, 2,
+ 17, 1, 13, 9, 3, 9, 11, 7, 15, 31,
+ 13, 15, 31, 33, 35, 0, 6, 5, 7, 17,
+ 23, 19, 19, 35, 49, 43, 49, 79, 67, 89,
+ 11, 13, 47, 5, 17, 23, 29, 43, 51, 39,
+ 45, 47, 31, 41, 55, 41, 47, 2, 48, 36,
+ 22, 12, 18, 4, 5, 1, 0, 14, 66, 42,
+ 30, 16, 36, 16, 14, 2, 24, 6, 82, 60,
+ 46, 32, 44, 10, 1, 7, 7, 13, 86, 42,
+ 12, 4, 20, 3, 21, 17, 10, 80, 54, 28,
+ 14, 38, 10, 0, 3, 7, 124, 25, 13, 1,
+ 5, 11, 5, 6, 10, 2, 8, 14, 20, 19,
+ 9, 31, 17, 38, 21, 47, 15, 2, 7, 23,
+ 29, 17, 27, 39, 51, 57, 0, 4, 30, 1,
+ 13, 4, 6, 8, 13, 5, 0, 1, 29, 17,
+ 27, 1, 18, 39, 5, 24, 13, 8, 3, 6,
+ 36, 11, 13, 8, 20, 31, 31, 75, 42, 50,
+ 64, 24, 14, 28, 12, 12, 16, 1, 4, 4,
+ 23, 19, 25, 27, 21, 37, 79, 27, 27, 25,
+ 5, 17, 12, 26, 0, 18, 34, 6, 0, 3,
+ 5, 41, 27, 25, 59, 35, 51, 49, 77, 69,
+ 75, 77, 29, 25, 79, 15, 15, 35, 57, 45,
+ 47, 55, 45, 57, 59, 71, 77, 93, 93, 7,
+ 23, 53, 13, 16, 12, 28, 30, 36, 66, 42,
+ 58, 60, 92, 72, 66, 96, 90, 90, 28, 3,
+ 33, 53, 81, 111, 125, 125, 22, 88, 74, 72,
+ 54, 68, 34, 26, 30, 10, 19, 7, 18, 3,
+ 44, 62, 3, 6, 26, 34, 10, 30, 58, 8,
+ 0, 58, 6, 33, 75, 105, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 37 */
+
+ 64, 10, 33, 64, 10, 33, 2, 26, 48, 24,
+ 7, 45, 16, 30, 104, 28, 62, 9, 32, 48,
+ 1, 25, 0, 51, 83, 0, 7, 125, 125, 121,
+ 86, 12, 11, 32, 48, 1, 17, 16, 32, 3,
+ 2, 7, 13, 9, 39, 21, 57, 10, 11, 13,
+ 3, 29, 21, 49, 26, 2, 1, 9, 16, 6,
+ 44, 0, 0, 0, 2, 63, 67, 16, 15, 9,
+ 36, 9, 51, 0, 60, 40, 100, 92, 32, 34,
+ 8, 42, 6, 2, 8, 23, 27, 31, 27, 48,
+ 2, 18, 17, 7, 15, 9, 17, 3, 19, 17,
+ 37, 36, 9, 8, 11, 33, 1, 17, 6, 10,
+ 4, 4, 22, 16, 15, 3, 6, 7, 4, 15,
+ 6, 19, 0, 6, 5, 24, 8, 5, 11, 14,
+ 9, 25, 11, 39, 19, 12, 1, 14, 14, 14,
+ 62, 30, 8, 5, 16, 5, 15, 6, 69, 13,
+ 0, 39, 5, 25, 16, 4, 19, 30, 14, 8,
+ 39, 16, 23, 23, 6, 51, 20, 38, 26, 28,
+ 34, 30, 16, 32, 30, 5, 8, 18, 6, 2,
+ 17, 1, 13, 11, 3, 9, 13, 9, 15, 31,
+ 15, 15, 33, 35, 35, 1, 2, 7, 9, 21,
+ 27, 23, 23, 39, 53, 47, 53, 83, 69, 91,
+ 11, 13, 47, 7, 19, 25, 31, 45, 55, 41,
+ 47, 49, 31, 41, 55, 41, 47, 2, 48, 36,
+ 22, 12, 18, 6, 3, 0, 2, 14, 66, 42,
+ 30, 16, 36, 18, 16, 4, 28, 6, 82, 60,
+ 46, 32, 44, 10, 1, 7, 5, 13, 88, 42,
+ 10, 4, 20, 3, 21, 15, 10, 80, 52, 26,
+ 12, 38, 10, 0, 3, 5, 124, 23, 11, 0,
+ 3, 9, 3, 8, 12, 4, 10, 18, 24, 17,
+ 7, 31, 17, 40, 21, 47, 13, 4, 7, 25,
+ 29, 17, 27, 41, 53, 59, 1, 2, 30, 3,
+ 13, 4, 6, 8, 15, 7, 0, 1, 31, 17,
+ 27, 1, 18, 39, 7, 24, 13, 8, 5, 4,
+ 36, 13, 15, 6, 18, 33, 33, 79, 40, 48,
+ 64, 22, 12, 24, 8, 8, 12, 7, 1, 0,
+ 29, 23, 27, 33, 27, 43, 87, 33, 33, 29,
+ 7, 19, 12, 28, 0, 20, 38, 2, 1, 7,
+ 9, 45, 31, 27, 63, 37, 55, 51, 81, 73,
+ 77, 79, 29, 25, 81, 17, 19, 39, 61, 47,
+ 49, 59, 47, 59, 63, 73, 79, 95, 95, 7,
+ 25, 55, 11, 18, 12, 28, 32, 38, 68, 44,
+ 60, 62, 96, 74, 68, 98, 94, 90, 24, 7,
+ 39, 57, 87, 117, 125, 125, 22, 90, 76, 74,
+ 54, 70, 34, 26, 32, 12, 19, 7, 20, 1,
+ 46, 64, 1, 8, 26, 34, 10, 32, 60, 8,
+ 0, 56, 2, 39, 81, 111, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 38 */
+
+ 62, 10, 35, 62, 10, 35, 6, 28, 50, 24,
+ 7, 47, 14, 28, 104, 28, 66, 9, 32, 50,
+ 1, 27, 0, 53, 85, 1, 9, 125, 125, 123,
+ 92, 14, 11, 32, 50, 1, 15, 18, 34, 5,
+ 4, 5, 11, 11, 39, 21, 57, 10, 9, 13,
+ 3, 29, 19, 49, 26, 2, 1, 9, 16, 6,
+ 44, 0, 0, 0, 4, 63, 67, 16, 15, 11,
+ 36, 9, 49, 4, 64, 44, 102, 96, 34, 36,
+ 12, 44, 8, 2, 10, 23, 27, 31, 27, 50,
+ 2, 18, 15, 5, 13, 9, 15, 3, 21, 19,
+ 39, 36, 9, 8, 11, 33, 0, 17, 6, 10,
+ 4, 6, 22, 14, 15, 3, 6, 7, 4, 15,
+ 4, 19, 0, 4, 7, 22, 8, 5, 11, 14,
+ 9, 25, 13, 39, 21, 12, 1, 14, 14, 14,
+ 62, 30, 8, 5, 18, 5, 17, 6, 69, 15,
+ 0, 41, 5, 27, 16, 4, 21, 30, 14, 8,
+ 43, 16, 25, 23, 6, 53, 18, 38, 24, 28,
+ 34, 30, 16, 32, 30, 5, 6, 18, 4, 2,
+ 17, 3, 15, 11, 5, 11, 15, 11, 17, 31,
+ 15, 13, 35, 39, 35, 5, 1, 11, 13, 25,
+ 31, 27, 27, 43, 57, 49, 55, 87, 73, 93,
+ 11, 13, 49, 9, 21, 27, 35, 49, 57, 43,
+ 49, 51, 33, 43, 55, 41, 45, 4, 48, 36,
+ 22, 14, 18, 6, 3, 0, 4, 16, 64, 42,
+ 30, 16, 38, 18, 18, 6, 32, 6, 84, 60,
+ 46, 32, 46, 10, 1, 5, 5, 13, 88, 42,
+ 10, 4, 20, 1, 19, 13, 10, 78, 52, 24,
+ 12, 38, 10, 2, 1, 5, 124, 23, 9, 0,
+ 1, 7, 1, 12, 14, 6, 12, 20, 26, 17,
+ 5, 31, 15, 44, 21, 49, 13, 4, 7, 25,
+ 31, 17, 29, 43, 55, 61, 1, 2, 30, 3,
+ 15, 2, 4, 8, 15, 7, 1, 1, 31, 17,
+ 29, 1, 18, 41, 7, 22, 15, 6, 7, 4,
+ 38, 13, 17, 6, 18, 33, 33, 81, 38, 46,
+ 62, 20, 10, 22, 4, 4, 8, 11, 5, 5,
+ 35, 29, 29, 37, 33, 49, 95, 39, 37, 33,
+ 9, 21, 12, 30, 0, 24, 44, 1, 5, 11,
+ 13, 49, 35, 31, 67, 41, 57, 53, 83, 75,
+ 79, 81, 31, 27, 83, 19, 21, 41, 65, 51,
+ 53, 61, 51, 63, 65, 77, 83, 99, 97, 9,
+ 27, 57, 11, 18, 12, 30, 32, 38, 72, 46,
+ 62, 64, 98, 78, 70, 102, 96, 88, 22, 11,
+ 43, 63, 91, 123, 125, 125, 24, 90, 76, 74,
+ 54, 72, 36, 26, 32, 12, 19, 5, 20, 1,
+ 48, 66, 1, 8, 28, 36, 10, 32, 62, 8,
+ 0, 54, 1, 43, 87, 119, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 39 */
+
+ 60, 10, 35, 60, 10, 35, 10, 32, 50, 24,
+ 9, 51, 10, 24, 104, 28, 72, 11, 34, 50,
+ 1, 29, 0, 55, 87, 1, 13, 125, 125, 123,
+ 98, 16, 11, 34, 50, 1, 15, 20, 34, 5,
+ 4, 5, 11, 11, 41, 19, 57, 10, 9, 11,
+ 5, 29, 19, 49, 28, 4, 1, 7, 18, 6,
+ 44, 0, 0, 0, 4, 63, 67, 18, 17, 11,
+ 36, 9, 49, 6, 68, 46, 106, 100, 36, 38,
+ 14, 48, 10, 4, 14, 21, 25, 29, 25, 50,
+ 2, 18, 15, 5, 13, 7, 11, 5, 21, 19,
+ 39, 36, 9, 8, 9, 33, 0, 17, 8, 10,
+ 2, 6, 22, 14, 15, 3, 6, 7, 4, 15,
+ 4, 19, 0, 4, 9, 20, 6, 7, 9, 16,
+ 9, 27, 13, 37, 21, 12, 3, 14, 14, 14,
+ 64, 32, 8, 5, 18, 5, 19, 6, 71, 15,
+ 0, 41, 7, 29, 14, 4, 21, 32, 14, 8,
+ 45, 14, 27, 23, 6, 53, 18, 38, 24, 28,
+ 34, 30, 16, 32, 30, 5, 6, 18, 4, 2,
+ 17, 3, 15, 13, 5, 11, 15, 13, 17, 31,
+ 17, 13, 37, 41, 35, 7, 5, 15, 17, 29,
+ 35, 31, 31, 47, 61, 53, 59, 93, 77, 95,
+ 11, 15, 49, 11, 23, 29, 37, 51, 61, 45,
+ 51, 51, 33, 43, 55, 41, 45, 4, 48, 36,
+ 22, 14, 18, 6, 3, 2, 6, 16, 64, 42,
+ 30, 16, 38, 20, 20, 6, 36, 6, 84, 60,
+ 46, 32, 46, 10, 1, 5, 3, 13, 88, 40,
+ 8, 4, 20, 1, 19, 13, 10, 78, 50, 22,
+ 10, 38, 10, 2, 1, 3, 124, 21, 7, 2,
+ 2, 5, 0, 14, 18, 8, 16, 24, 30, 15,
+ 3, 29, 15, 46, 19, 49, 13, 6, 7, 27,
+ 31, 19, 31, 45, 57, 63, 3, 0, 30, 5,
+ 15, 2, 4, 8, 17, 9, 1, 3, 33, 17,
+ 29, 1, 18, 41, 9, 22, 17, 6, 9, 2,
+ 38, 15, 19, 4, 16, 35, 35, 85, 36, 46,
+ 62, 18, 6, 18, 0, 0, 4, 17, 11, 9,
+ 41, 33, 33, 43, 41, 55, 103, 43, 43, 37,
+ 11, 23, 12, 32, 2, 26, 48, 3, 9, 15,
+ 15, 53, 39, 33, 71, 43, 61, 57, 87, 79,
+ 81, 83, 31, 27, 85, 23, 25, 45, 67, 55,
+ 57, 65, 53, 67, 69, 79, 85, 101, 97, 11,
+ 27, 59, 11, 20, 14, 30, 34, 40, 74, 48,
+ 64, 66, 100, 80, 72, 104, 98, 88, 18, 15,
+ 49, 67, 97, 125, 125, 125, 24, 92, 78, 76,
+ 56, 74, 36, 28, 34, 14, 17, 5, 22, 0,
+ 50, 68, 1, 10, 28, 36, 10, 34, 64, 8,
+ 0, 50, 5, 49, 93, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 40 */
+
+ 56, 8, 35, 56, 8, 35, 12, 34, 50, 24,
+ 9, 53, 8, 22, 104, 28, 76, 11, 36, 52,
+ 1, 31, 0, 57, 91, 3, 17, 125, 125, 125,
+ 102, 18, 11, 36, 52, 1, 15, 22, 34, 7,
+ 4, 5, 9, 13, 41, 19, 59, 10, 9, 11,
+ 5, 31, 19, 49, 28, 4, 1, 7, 18, 6,
+ 44, 0, 0, 0, 6, 65, 67, 18, 19, 11,
+ 36, 9, 49, 10, 70, 50, 108, 102, 38, 40,
+ 16, 50, 12, 6, 18, 21, 25, 29, 23, 50,
+ 2, 18, 13, 5, 11, 7, 9, 5, 23, 21,
+ 41, 36, 9, 8, 9, 33, 0, 17, 8, 8,
+ 2, 6, 22, 12, 15, 3, 6, 7, 2, 15,
+ 2, 21, 0, 2, 13, 18, 4, 9, 9, 16,
+ 9, 29, 15, 37, 23, 10, 3, 14, 14, 14,
+ 66, 32, 8, 7, 18, 7, 21, 4, 73, 15,
+ 0, 43, 7, 31, 14, 2, 23, 32, 14, 8,
+ 49, 14, 29, 23, 6, 55, 16, 36, 24, 28,
+ 34, 28, 14, 32, 28, 7, 4, 18, 4, 0,
+ 19, 5, 17, 13, 7, 13, 17, 15, 19, 31,
+ 17, 13, 39, 45, 35, 11, 9, 19, 21, 33,
+ 41, 35, 35, 51, 65, 57, 63, 97, 81, 97,
+ 11, 15, 51, 13, 27, 33, 41, 55, 63, 49,
+ 55, 53, 35, 45, 55, 41, 43, 4, 48, 36,
+ 22, 14, 18, 6, 3, 2, 8, 18, 64, 42,
+ 30, 16, 38, 20, 20, 8, 38, 6, 84, 60,
+ 46, 30, 46, 10, 1, 5, 3, 13, 88, 40,
+ 8, 4, 20, 1, 19, 11, 10, 76, 48, 20,
+ 8, 36, 10, 2, 1, 3, 124, 19, 5, 2,
+ 4, 3, 0, 16, 20, 10, 18, 26, 32, 15,
+ 3, 29, 13, 48, 19, 51, 13, 6, 7, 27,
+ 33, 19, 33, 47, 61, 65, 5, 0, 30, 5,
+ 17, 0, 2, 6, 19, 11, 3, 3, 35, 19,
+ 31, 1, 18, 43, 11, 20, 19, 6, 11, 0,
+ 38, 17, 21, 4, 16, 37, 37, 89, 34, 44,
+ 60, 14, 4, 16, 3, 3, 1, 21, 17, 15,
+ 47, 39, 35, 49, 47, 61, 111, 49, 47, 41,
+ 15, 27, 12, 32, 2, 28, 52, 7, 13, 19,
+ 19, 59, 43, 37, 75, 47, 65, 59, 91, 81,
+ 83, 85, 33, 29, 87, 25, 27, 49, 71, 59,
+ 61, 67, 57, 71, 71, 81, 89, 105, 99, 13,
+ 29, 61, 11, 20, 14, 32, 34, 40, 76, 48,
+ 66, 68, 102, 82, 74, 106, 100, 86, 14, 19,
+ 53, 73, 103, 125, 125, 125, 24, 92, 78, 76,
+ 56, 74, 36, 28, 34, 14, 17, 5, 22, 0,
+ 52, 70, 1, 10, 30, 38, 10, 34, 64, 8,
+ 1, 48, 9, 53, 99, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 41 */
+
+ 54, 8, 35, 54, 8, 35, 16, 36, 52, 24,
+ 9, 55, 6, 20, 104, 28, 80, 11, 38, 54,
+ 1, 31, 2, 59, 93, 5, 21, 125, 125, 125,
+ 108, 22, 11, 38, 54, 1, 13, 24, 36, 9,
+ 4, 3, 7, 13, 41, 19, 59, 10, 9, 9,
+ 5, 31, 19, 49, 28, 4, 0, 7, 18, 6,
+ 44, 0, 0, 0, 8, 65, 67, 18, 21, 11,
+ 36, 7, 49, 14, 74, 54, 110, 106, 42, 42,
+ 20, 52, 16, 8, 22, 21, 25, 29, 21, 50,
+ 2, 18, 11, 5, 9, 5, 7, 5, 23, 21,
+ 41, 36, 9, 10, 7, 31, 2, 17, 8, 8,
+ 2, 6, 22, 12, 13, 3, 8, 7, 2, 13,
+ 0, 21, 0, 0, 15, 18, 2, 11, 7, 16,
+ 7, 31, 17, 37, 25, 10, 3, 16, 14, 14,
+ 68, 32, 8, 7, 20, 7, 21, 4, 75, 15,
+ 0, 45, 7, 33, 14, 2, 25, 32, 14, 8,
+ 51, 14, 29, 23, 6, 57, 14, 36, 24, 28,
+ 34, 28, 14, 32, 28, 7, 4, 18, 4, 0,
+ 19, 5, 17, 13, 7, 13, 19, 15, 19, 31,
+ 17, 13, 41, 47, 35, 15, 11, 21, 23, 37,
+ 45, 37, 37, 55, 69, 59, 65, 101, 83, 99,
+ 11, 15, 53, 15, 29, 35, 43, 57, 65, 51,
+ 57, 55, 37, 47, 55, 39, 41, 6, 50, 36,
+ 22, 14, 20, 8, 1, 4, 10, 20, 64, 42,
+ 30, 16, 40, 20, 22, 10, 42, 6, 84, 60,
+ 46, 30, 48, 10, 1, 5, 1, 13, 90, 40,
+ 8, 4, 22, 1, 19, 9, 10, 76, 48, 20,
+ 6, 36, 10, 2, 0, 1, 124, 17, 3, 4,
+ 6, 0, 2, 20, 22, 14, 20, 28, 36, 13,
+ 1, 29, 11, 52, 19, 53, 11, 8, 7, 27,
+ 35, 19, 33, 49, 63, 67, 5, 0, 30, 5,
+ 17, 1, 2, 6, 21, 11, 3, 3, 37, 19,
+ 31, 0, 20, 43, 11, 18, 19, 6, 11, 0,
+ 38, 19, 23, 4, 16, 39, 37, 91, 34, 42,
+ 58, 12, 2, 14, 7, 7, 5, 25, 21, 21,
+ 51, 43, 37, 55, 53, 67, 119, 55, 51, 43,
+ 17, 29, 12, 34, 2, 30, 56, 11, 15, 23,
+ 23, 63, 45, 39, 79, 49, 67, 61, 93, 83,
+ 85, 85, 33, 29, 89, 27, 29, 51, 75, 61,
+ 63, 69, 59, 73, 73, 83, 91, 107, 101, 13,
+ 31, 61, 9, 20, 14, 34, 36, 42, 78, 50,
+ 68, 70, 106, 84, 76, 110, 104, 84, 12, 23,
+ 57, 79, 109, 125, 125, 125, 26, 92, 78, 78,
+ 56, 76, 38, 28, 36, 16, 17, 3, 24, 0,
+ 54, 74, 0, 12, 32, 40, 10, 36, 66, 8,
+ 1, 46, 13, 57, 103, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 42 */
+
+ 52, 8, 35, 52, 8, 35, 20, 40, 52, 24,
+ 11, 59, 2, 16, 104, 28, 86, 13, 40, 54,
+ 1, 33, 2, 61, 95, 5, 25, 125, 125, 125,
+ 114, 24, 11, 40, 54, 1, 13, 26, 36, 9,
+ 4, 3, 7, 13, 43, 17, 59, 10, 9, 7,
+ 7, 31, 19, 49, 30, 6, 0, 5, 20, 6,
+ 44, 0, 0, 0, 8, 65, 67, 20, 23, 11,
+ 36, 7, 49, 16, 78, 56, 114, 110, 44, 44,
+ 22, 56, 18, 10, 26, 19, 23, 27, 19, 50,
+ 2, 18, 11, 5, 9, 5, 3, 7, 23, 21,
+ 41, 36, 9, 10, 7, 31, 2, 17, 10, 8,
+ 0, 6, 22, 12, 13, 3, 8, 7, 2, 13,
+ 0, 21, 0, 0, 17, 16, 0, 13, 5, 18,
+ 7, 33, 17, 35, 25, 10, 5, 16, 14, 14,
+ 70, 34, 8, 7, 20, 7, 23, 4, 77, 15,
+ 0, 45, 9, 35, 12, 2, 25, 34, 14, 8,
+ 53, 12, 31, 23, 6, 57, 14, 36, 24, 28,
+ 34, 28, 14, 32, 28, 7, 4, 18, 4, 0,
+ 19, 5, 17, 15, 7, 13, 19, 17, 21, 31,
+ 19, 13, 43, 49, 35, 17, 15, 25, 27, 41,
+ 49, 41, 41, 59, 73, 63, 69, 107, 87, 101,
+ 11, 17, 53, 17, 31, 37, 45, 61, 69, 53,
+ 59, 55, 37, 47, 55, 39, 41, 6, 50, 36,
+ 22, 14, 20, 8, 1, 4, 12, 20, 64, 42,
+ 30, 16, 40, 22, 24, 10, 46, 6, 84, 60,
+ 46, 30, 48, 10, 1, 5, 0, 13, 90, 38,
+ 6, 4, 22, 1, 19, 9, 10, 74, 46, 18,
+ 4, 36, 10, 2, 0, 1, 124, 15, 1, 6,
+ 10, 2, 4, 22, 26, 16, 24, 32, 40, 11,
+ 0, 27, 11, 54, 17, 53, 11, 10, 7, 29,
+ 35, 21, 35, 51, 65, 69, 7, 1, 30, 7,
+ 19, 1, 2, 6, 23, 13, 3, 5, 39, 19,
+ 31, 0, 20, 43, 13, 18, 21, 6, 13, 1,
+ 38, 21, 25, 2, 14, 41, 39, 95, 32, 42,
+ 58, 10, 1, 10, 11, 11, 9, 31, 27, 25,
+ 57, 47, 41, 61, 61, 73, 125, 59, 57, 47,
+ 19, 31, 12, 36, 4, 32, 60, 13, 19, 27,
+ 25, 67, 49, 43, 83, 51, 71, 65, 97, 87,
+ 87, 87, 35, 29, 91, 31, 33, 55, 77, 65,
+ 67, 73, 63, 77, 77, 85, 93, 109, 101, 15,
+ 31, 63, 9, 22, 16, 34, 36, 44, 80, 52,
+ 70, 72, 108, 86, 78, 112, 106, 84, 8, 27,
+ 63, 83, 115, 125, 125, 125, 26, 94, 80, 78,
+ 58, 78, 38, 30, 36, 16, 15, 3, 26, 2,
+ 56, 76, 0, 14, 32, 40, 10, 36, 68, 8,
+ 1, 42, 17, 63, 109, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 43 */
+
+ 50, 8, 37, 50, 8, 37, 24, 42, 54, 24,
+ 11, 61, 0, 14, 104, 28, 90, 13, 40, 56,
+ 1, 35, 2, 63, 97, 7, 27, 125, 125, 125,
+ 120, 26, 11, 40, 56, 1, 11, 28, 38, 11,
+ 6, 1, 5, 15, 43, 17, 59, 10, 7, 7,
+ 7, 31, 17, 49, 30, 6, 0, 5, 20, 6,
+ 44, 0, 0, 0, 10, 65, 67, 20, 23, 13,
+ 36, 7, 47, 20, 82, 60, 116, 114, 46, 46,
+ 26, 58, 20, 10, 28, 19, 23, 27, 19, 52,
+ 2, 18, 9, 3, 7, 3, 1, 7, 25, 23,
+ 43, 36, 9, 10, 5, 31, 4, 17, 10, 8,
+ 0, 8, 22, 10, 13, 3, 8, 7, 2, 13,
+ 1, 21, 0, 1, 19, 14, 0, 13, 5, 18,
+ 7, 33, 19, 35, 27, 10, 5, 16, 14, 14,
+ 70, 34, 8, 7, 22, 7, 25, 4, 77, 17,
+ 0, 47, 9, 37, 12, 2, 27, 34, 14, 8,
+ 57, 12, 33, 23, 6, 59, 12, 36, 22, 28,
+ 34, 28, 14, 32, 28, 7, 2, 18, 2, 0,
+ 19, 7, 19, 15, 9, 15, 21, 19, 21, 31,
+ 19, 11, 45, 53, 35, 21, 19, 29, 31, 45,
+ 53, 45, 45, 63, 77, 65, 71, 111, 91, 103,
+ 11, 17, 55, 19, 33, 39, 49, 63, 71, 55,
+ 61, 57, 39, 49, 55, 39, 39, 8, 50, 36,
+ 22, 16, 20, 8, 1, 6, 14, 22, 62, 42,
+ 30, 16, 42, 22, 26, 12, 50, 6, 86, 60,
+ 46, 30, 50, 10, 1, 3, 0, 13, 90, 38,
+ 6, 4, 22, 0, 17, 7, 10, 74, 46, 16,
+ 4, 36, 10, 4, 2, 0, 124, 15, 0, 6,
+ 12, 4, 6, 26, 28, 18, 26, 34, 42, 11,
+ 2, 27, 9, 58, 17, 55, 11, 10, 7, 29,
+ 37, 21, 37, 53, 67, 71, 7, 1, 30, 7,
+ 19, 3, 0, 6, 23, 13, 5, 5, 39, 19,
+ 33, 0, 20, 45, 13, 16, 23, 4, 15, 1,
+ 40, 21, 27, 2, 14, 41, 39, 97, 30, 40,
+ 56, 8, 3, 8, 15, 15, 13, 35, 31, 31,
+ 63, 53, 43, 65, 67, 79, 125, 65, 61, 51,
+ 21, 33, 12, 38, 4, 36, 66, 17, 23, 31,
+ 29, 71, 53, 45, 87, 55, 73, 67, 99, 89,
+ 89, 89, 35, 31, 93, 33, 35, 57, 81, 69,
+ 71, 75, 65, 81, 79, 89, 97, 113, 103, 17,
+ 33, 65, 9, 22, 16, 36, 38, 44, 84, 54,
+ 72, 74, 110, 90, 80, 116, 108, 82, 6, 31,
+ 67, 89, 119, 125, 125, 125, 28, 94, 80, 80,
+ 58, 80, 40, 30, 38, 18, 15, 1, 26, 2,
+ 58, 78, 0, 14, 34, 42, 10, 38, 70, 8,
+ 1, 40, 21, 67, 115, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 44 */
+
+ 46, 8, 37, 46, 8, 37, 26, 46, 54, 24,
+ 13, 65, 3, 12, 104, 28, 94, 13, 42, 58,
+ 1, 37, 2, 65, 99, 7, 31, 125, 125, 125,
+ 124, 28, 11, 42, 58, 1, 11, 30, 38, 11,
+ 6, 1, 3, 15, 45, 17, 59, 10, 7, 5,
+ 7, 33, 17, 49, 32, 6, 0, 5, 22, 6,
+ 44, 0, 0, 0, 10, 67, 67, 22, 25, 13,
+ 36, 7, 47, 24, 84, 62, 120, 116, 48, 48,
+ 28, 60, 22, 12, 32, 17, 23, 25, 17, 52,
+ 2, 18, 9, 3, 7, 3, 2, 7, 25, 23,
+ 43, 36, 9, 10, 5, 31, 4, 17, 12, 8,
+ 0, 8, 22, 10, 13, 3, 8, 7, 2, 13,
+ 3, 23, 0, 3, 21, 12, 1, 15, 3, 20,
+ 7, 35, 19, 33, 27, 10, 5, 16, 14, 14,
+ 72, 34, 8, 9, 22, 9, 27, 2, 79, 17,
+ 0, 47, 11, 39, 12, 0, 29, 34, 14, 8,
+ 59, 12, 35, 23, 6, 59, 10, 36, 22, 28,
+ 34, 28, 14, 32, 26, 7, 2, 18, 2, 0,
+ 21, 7, 19, 17, 9, 15, 23, 21, 23, 31,
+ 21, 11, 47, 55, 35, 23, 23, 33, 35, 49,
+ 57, 49, 49, 67, 81, 69, 75, 115, 95, 105,
+ 11, 17, 55, 21, 35, 41, 51, 67, 75, 57,
+ 65, 59, 39, 49, 55, 39, 39, 8, 50, 36,
+ 22, 16, 20, 8, 1, 6, 16, 22, 62, 42,
+ 30, 16, 42, 24, 26, 14, 54, 6, 86, 60,
+ 46, 30, 50, 10, 1, 3, 2, 13, 90, 38,
+ 4, 4, 22, 0, 17, 5, 10, 72, 44, 14,
+ 2, 36, 10, 4, 2, 0, 124, 13, 2, 8,
+ 14, 6, 8, 28, 30, 20, 28, 38, 46, 9,
+ 2, 27, 9, 60, 17, 55, 11, 12, 7, 31,
+ 37, 21, 39, 55, 69, 73, 9, 3, 30, 9,
+ 21, 3, 0, 6, 25, 15, 5, 5, 41, 21,
+ 33, 0, 20, 45, 15, 16, 25, 4, 17, 3,
+ 40, 23, 29, 0, 12, 43, 41, 101, 28, 38,
+ 56, 6, 5, 4, 19, 19, 17, 41, 37, 35,
+ 69, 57, 45, 71, 73, 85, 125, 71, 67, 55,
+ 23, 37, 12, 40, 4, 38, 70, 21, 27, 35,
+ 33, 75, 57, 49, 91, 57, 77, 69, 103, 93,
+ 91, 91, 37, 31, 95, 35, 39, 61, 85, 73,
+ 75, 79, 69, 85, 83, 91, 99, 115, 105, 19,
+ 35, 67, 9, 24, 16, 36, 38, 46, 86, 54,
+ 74, 76, 112, 92, 82, 118, 110, 82, 2, 35,
+ 73, 93, 125, 125, 125, 125, 28, 96, 82, 80,
+ 58, 80, 40, 30, 38, 18, 15, 1, 28, 4,
+ 60, 80, 0, 16, 34, 42, 10, 38, 72, 8,
+ 1, 38, 25, 73, 121, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 45 */
+
+ 44, 8, 37, 44, 8, 37, 30, 48, 54, 24,
+ 13, 67, 5, 8, 104, 28, 100, 15, 44, 58,
+ 1, 37, 4, 67, 101, 9, 35, 125, 125, 125,
+ 124, 32, 11, 44, 58, 1, 11, 32, 38, 13,
+ 6, 1, 3, 15, 45, 15, 59, 10, 7, 3,
+ 9, 33, 17, 49, 32, 8, 2, 3, 22, 6,
+ 44, 0, 0, 0, 12, 67, 67, 22, 27, 13,
+ 36, 5, 47, 26, 88, 66, 122, 120, 52, 50,
+ 30, 64, 24, 14, 36, 17, 21, 25, 15, 52,
+ 2, 18, 7, 3, 5, 1, 4, 9, 25, 23,
+ 43, 36, 9, 12, 3, 31, 4, 17, 12, 8,
+ 1, 8, 22, 10, 11, 3, 10, 7, 2, 13,
+ 3, 23, 0, 3, 23, 12, 3, 17, 1, 20,
+ 7, 37, 21, 33, 29, 10, 7, 16, 14, 14,
+ 74, 36, 8, 9, 22, 9, 29, 2, 81, 17,
+ 0, 49, 11, 41, 10, 0, 29, 36, 14, 8,
+ 61, 10, 37, 23, 6, 61, 10, 36, 22, 28,
+ 34, 28, 14, 32, 26, 7, 2, 18, 2, 0,
+ 21, 7, 19, 17, 9, 15, 23, 23, 23, 31,
+ 21, 11, 49, 57, 35, 27, 27, 35, 37, 53,
+ 61, 53, 53, 71, 85, 73, 79, 121, 97, 107,
+ 11, 19, 57, 23, 37, 43, 53, 69, 77, 59,
+ 67, 59, 41, 51, 55, 39, 37, 8, 50, 36,
+ 22, 16, 20, 10, 0, 8, 18, 24, 62, 42,
+ 30, 16, 42, 24, 28, 14, 58, 6, 86, 60,
+ 46, 30, 50, 10, 1, 3, 4, 13, 92, 36,
+ 4, 4, 22, 0, 17, 5, 10, 72, 42, 12,
+ 0, 36, 10, 4, 2, 2, 124, 11, 4, 10,
+ 18, 8, 10, 30, 34, 22, 32, 40, 50, 7,
+ 4, 25, 7, 62, 15, 57, 9, 14, 7, 31,
+ 39, 23, 39, 57, 71, 75, 11, 3, 30, 9,
+ 21, 5, 0, 6, 27, 17, 5, 7, 43, 21,
+ 33, 0, 20, 45, 17, 14, 25, 4, 19, 5,
+ 40, 25, 31, 0, 12, 45, 43, 105, 26, 38,
+ 54, 4, 9, 2, 23, 23, 21, 45, 43, 41,
+ 75, 61, 49, 77, 81, 91, 125, 75, 71, 59,
+ 25, 39, 12, 42, 6, 40, 74, 23, 29, 39,
+ 35, 79, 61, 51, 95, 59, 81, 73, 107, 95,
+ 93, 93, 37, 31, 97, 39, 41, 65, 87, 75,
+ 77, 81, 71, 87, 85, 93, 101, 117, 105, 19,
+ 35, 69, 7, 24, 18, 38, 40, 48, 88, 56,
+ 76, 78, 116, 94, 84, 120, 114, 80, 1, 39,
+ 77, 99, 125, 125, 125, 125, 28, 96, 82, 82,
+ 60, 82, 40, 32, 40, 20, 13, 1, 30, 4,
+ 62, 82, 2, 18, 36, 44, 10, 40, 74, 8,
+ 1, 34, 29, 77, 125, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 46 */
+
+ 42, 8, 37, 42, 8, 37, 34, 52, 56, 24,
+ 15, 71, 9, 6, 104, 28, 104, 15, 46, 60,
+ 1, 39, 4, 69, 103, 9, 39, 125, 125, 125,
+ 124, 34, 11, 46, 60, 1, 9, 34, 40, 13,
+ 6, 0, 1, 17, 47, 15, 59, 10, 7, 3,
+ 9, 33, 17, 49, 34, 8, 2, 3, 24, 6,
+ 44, 0, 0, 0, 12, 67, 67, 24, 29, 13,
+ 36, 5, 47, 30, 92, 68, 124, 124, 54, 52,
+ 34, 66, 26, 16, 40, 15, 21, 23, 13, 52,
+ 2, 18, 7, 3, 5, 1, 8, 9, 27, 25,
+ 45, 36, 9, 12, 3, 31, 6, 17, 14, 8,
+ 1, 8, 22, 8, 11, 3, 10, 7, 2, 13,
+ 5, 23, 0, 5, 25, 10, 5, 19, 1, 22,
+ 7, 39, 21, 31, 29, 10, 7, 16, 14, 14,
+ 76, 36, 8, 9, 24, 9, 31, 2, 83, 17,
+ 0, 49, 13, 43, 10, 0, 31, 36, 14, 8,
+ 65, 10, 39, 23, 6, 61, 8, 36, 22, 28,
+ 34, 28, 14, 32, 26, 7, 0, 18, 2, 0,
+ 21, 9, 21, 19, 11, 17, 25, 25, 25, 31,
+ 23, 11, 51, 61, 35, 29, 31, 39, 41, 57,
+ 65, 57, 57, 75, 89, 75, 81, 125, 101, 109,
+ 11, 19, 57, 25, 39, 45, 57, 73, 81, 61,
+ 69, 61, 41, 51, 55, 39, 37, 10, 50, 36,
+ 22, 16, 20, 10, 0, 8, 20, 24, 62, 42,
+ 30, 16, 44, 26, 30, 16, 62, 6, 86, 60,
+ 46, 30, 52, 10, 1, 3, 4, 13, 92, 36,
+ 2, 4, 22, 0, 17, 3, 10, 70, 42, 10,
+ 1, 36, 10, 4, 4, 2, 124, 9, 6, 10,
+ 20, 10, 12, 34, 36, 24, 34, 44, 52, 7,
+ 6, 25, 7, 66, 15, 57, 9, 14, 7, 33,
+ 39, 23, 41, 59, 73, 77, 11, 5, 30, 11,
+ 23, 5, 1, 6, 29, 17, 7, 7, 45, 21,
+ 35, 0, 20, 47, 17, 14, 27, 4, 21, 5,
+ 40, 27, 33, 1, 10, 47, 43, 107, 24, 36,
+ 54, 2, 11, 1, 27, 27, 25, 51, 47, 45,
+ 81, 67, 51, 83, 87, 97, 125, 81, 77, 63,
+ 27, 41, 12, 44, 6, 42, 78, 27, 33, 43,
+ 39, 83, 65, 55, 99, 63, 83, 75, 109, 99,
+ 95, 95, 39, 33, 99, 41, 45, 67, 91, 79,
+ 81, 85, 75, 91, 89, 95, 105, 121, 107, 21,
+ 37, 71, 7, 26, 18, 38, 40, 48, 90, 58,
+ 78, 80, 118, 96, 86, 124, 116, 80, 3, 43,
+ 83, 103, 125, 125, 125, 125, 30, 98, 84, 82,
+ 60, 84, 42, 32, 40, 20, 13, 0, 30, 6,
+ 64, 84, 2, 18, 36, 44, 10, 40, 76, 8,
+ 1, 32, 33, 83, 125, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 47 */
+
+ 40, 8, 37, 40, 8, 37, 38, 54, 56, 24,
+ 15, 73, 11, 4, 104, 28, 108, 15, 48, 62,
+ 1, 41, 4, 71, 105, 11, 43, 125, 125, 125,
+ 124, 36, 11, 48, 62, 1, 9, 36, 40, 15,
+ 6, 0, 0, 17, 47, 15, 59, 10, 7, 1,
+ 9, 33, 17, 49, 34, 8, 2, 3, 24, 6,
+ 44, 0, 0, 0, 14, 67, 67, 24, 31, 13,
+ 36, 5, 47, 34, 96, 72, 124, 124, 56, 54,
+ 36, 68, 28, 18, 44, 15, 21, 23, 11, 52,
+ 2, 18, 5, 3, 3, 0, 10, 9, 27, 25,
+ 45, 36, 9, 12, 1, 31, 6, 17, 14, 8,
+ 1, 8, 22, 8, 11, 3, 10, 7, 2, 13,
+ 7, 23, 0, 7, 27, 8, 7, 21, 0, 22,
+ 7, 41, 23, 31, 31, 10, 7, 16, 14, 14,
+ 78, 36, 8, 9, 24, 9, 33, 2, 85, 17,
+ 0, 51, 13, 45, 10, 0, 33, 36, 14, 8,
+ 67, 10, 41, 23, 6, 63, 6, 36, 22, 28,
+ 34, 28, 14, 32, 26, 7, 0, 18, 2, 0,
+ 21, 9, 21, 19, 11, 17, 27, 27, 25, 31,
+ 23, 11, 53, 63, 35, 33, 35, 43, 45, 61,
+ 69, 61, 61, 79, 93, 79, 85, 125, 105, 111,
+ 11, 19, 59, 27, 41, 47, 59, 75, 83, 63,
+ 71, 63, 43, 53, 55, 39, 35, 10, 50, 36,
+ 22, 16, 20, 10, 0, 10, 22, 26, 62, 42,
+ 30, 16, 44, 26, 32, 18, 66, 6, 86, 60,
+ 46, 30, 52, 10, 1, 3, 6, 13, 92, 36,
+ 2, 4, 22, 0, 17, 1, 10, 70, 40, 8,
+ 3, 36, 10, 4, 4, 4, 124, 7, 8, 12,
+ 22, 12, 14, 36, 38, 26, 36, 46, 56, 5,
+ 8, 25, 5, 68, 15, 59, 9, 16, 7, 33,
+ 41, 23, 43, 61, 75, 79, 13, 5, 30, 11,
+ 23, 7, 1, 6, 31, 19, 7, 7, 47, 21,
+ 35, 0, 20, 47, 19, 12, 29, 4, 23, 7,
+ 40, 29, 35, 1, 10, 49, 45, 111, 22, 34,
+ 52, 0, 13, 3, 31, 31, 29, 55, 53, 51,
+ 87, 71, 53, 89, 93, 103, 125, 87, 81, 67,
+ 29, 43, 12, 46, 6, 44, 82, 31, 37, 47,
+ 43, 87, 69, 57, 103, 65, 87, 77, 113, 101,
+ 97, 97, 39, 33, 101, 43, 47, 71, 95, 83,
+ 85, 87, 77, 95, 91, 97, 107, 123, 109, 23,
+ 39, 73, 7, 26, 18, 40, 42, 50, 92, 60,
+ 80, 82, 120, 98, 88, 124, 118, 78, 7, 47,
+ 87, 109, 125, 125, 125, 125, 30, 98, 84, 84,
+ 60, 86, 42, 32, 42, 22, 13, 0, 32, 6,
+ 66, 86, 2, 20, 38, 46, 10, 42, 78, 8,
+ 1, 30, 37, 87, 125, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 48 */
+
+ 36, 6, 39, 36, 6, 39, 40, 56, 56, 24,
+ 17, 77, 15, 0, 102, 28, 112, 17, 48, 62,
+ 3, 43, 4, 75, 109, 13, 47, 125, 125, 125,
+ 124, 38, 13, 48, 62, 3, 9, 38, 40, 17,
+ 6, 0, 0, 19, 49, 15, 61, 10, 7, 1,
+ 11, 35, 17, 49, 34, 8, 2, 3, 24, 4,
+ 44, 0, 0, 0, 14, 69, 67, 24, 33, 15,
+ 36, 5, 47, 36, 98, 74, 124, 124, 58, 56,
+ 38, 70, 30, 18, 46, 15, 21, 23, 11, 52,
+ 2, 18, 5, 3, 3, 0, 12, 11, 29, 27,
+ 47, 36, 9, 12, 1, 31, 6, 17, 14, 6,
+ 3, 8, 20, 6, 11, 5, 10, 7, 0, 13,
+ 9, 25, 1, 9, 31, 6, 9, 23, 0, 22,
+ 7, 43, 25, 31, 33, 8, 9, 16, 14, 14,
+ 78, 36, 8, 11, 24, 11, 35, 0, 87, 19,
+ 1, 53, 15, 47, 8, 1, 35, 36, 14, 8,
+ 71, 8, 43, 23, 6, 65, 4, 34, 20, 28,
+ 34, 26, 12, 32, 24, 9, 1, 18, 0, 1,
+ 23, 11, 23, 21, 13, 19, 29, 29, 27, 31,
+ 25, 11, 55, 67, 35, 37, 39, 47, 49, 65,
+ 75, 65, 65, 83, 97, 83, 89, 125, 109, 115,
+ 13, 21, 61, 29, 45, 51, 63, 79, 87, 67,
+ 75, 65, 45, 55, 55, 39, 35, 10, 50, 36,
+ 22, 16, 20, 10, 0, 10, 24, 26, 60, 42,
+ 30, 16, 44, 26, 32, 18, 68, 4, 86, 60,
+ 44, 28, 52, 10, 1, 3, 6, 13, 92, 34,
+ 0, 2, 22, 0, 17, 1, 10, 68, 38, 6,
+ 5, 34, 10, 4, 4, 4, 124, 7, 10, 12,
+ 24, 14, 14, 38, 40, 28, 38, 48, 58, 5,
+ 8, 25, 5, 70, 15, 61, 9, 16, 7, 35,
+ 43, 25, 45, 63, 79, 83, 15, 7, 30, 13,
+ 25, 9, 3, 4, 33, 21, 9, 9, 49, 23,
+ 37, 0, 20, 49, 21, 10, 31, 2, 25, 9,
+ 40, 31, 39, 3, 8, 51, 47, 115, 20, 32,
+ 50, 3, 17, 7, 35, 35, 35, 61, 59, 57,
+ 93, 77, 57, 95, 101, 109, 125, 93, 87, 71,
+ 33, 47, 12, 46, 6, 46, 86, 35, 41, 51,
+ 47, 93, 73, 61, 109, 69, 91, 81, 117, 105,
+ 99, 99, 41, 35, 105, 47, 51, 75, 99, 87,
+ 89, 91, 81, 99, 95, 101, 111, 125, 111, 25,
+ 41, 75, 7, 26, 18, 40, 42, 50, 94, 60,
+ 82, 82, 122, 100, 90, 124, 120, 76, 11, 53,
+ 93, 115, 125, 125, 125, 125, 30, 98, 84, 84,
+ 60, 86, 42, 32, 42, 22, 13, 0, 32, 6,
+ 68, 88, 2, 20, 38, 46, 10, 42, 78, 8,
+ 3, 26, 43, 93, 125, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 49 */
+
+ 34, 6, 39, 34, 6, 39, 44, 60, 58, 26,
+ 17, 79, 17, 1, 102, 28, 118, 17, 50, 64,
+ 3, 43, 6, 77, 111, 13, 49, 125, 125, 125,
+ 124, 42, 13, 50, 64, 3, 7, 42, 42, 17,
+ 8, 2, 2, 19, 49, 13, 61, 10, 5, 0,
+ 11, 35, 15, 49, 36, 10, 4, 1, 26, 4,
+ 44, 0, 0, 0, 16, 69, 67, 26, 33, 15,
+ 36, 3, 45, 40, 102, 78, 124, 124, 62, 58,
+ 42, 74, 34, 20, 50, 13, 19, 21, 9, 54,
+ 4, 20, 3, 1, 1, 2, 16, 11, 29, 27,
+ 47, 38, 9, 14, 0, 29, 8, 15, 16, 6,
+ 3, 10, 20, 6, 9, 5, 12, 7, 0, 11,
+ 9, 25, 1, 9, 33, 6, 9, 23, 2, 24,
+ 5, 43, 25, 29, 33, 8, 9, 18, 14, 14,
+ 80, 38, 10, 11, 26, 11, 35, 0, 87, 19,
+ 1, 53, 15, 49, 8, 1, 35, 38, 16, 8,
+ 73, 8, 43, 21, 6, 65, 4, 34, 20, 28,
+ 34, 26, 12, 32, 24, 9, 1, 18, 0, 1,
+ 23, 11, 23, 21, 13, 19, 29, 29, 27, 29,
+ 25, 9, 55, 69, 33, 39, 41, 49, 51, 69,
+ 79, 67, 67, 87, 99, 85, 91, 125, 111, 117,
+ 13, 21, 61, 31, 47, 53, 65, 81, 89, 69,
+ 77, 65, 45, 55, 53, 37, 33, 12, 52, 36,
+ 22, 18, 22, 12, 2, 12, 28, 28, 60, 42,
+ 30, 16, 46, 28, 34, 20, 72, 4, 88, 62,
+ 44, 28, 54, 10, 1, 1, 8, 13, 94, 34,
+ 0, 2, 24, 2, 15, 0, 10, 68, 38, 6,
+ 5, 34, 10, 6, 6, 6, 124, 5, 12, 14,
+ 28, 18, 16, 42, 44, 32, 42, 52, 62, 3,
+ 10, 23, 3, 74, 13, 61, 7, 18, 5, 35,
+ 43, 25, 45, 63, 81, 85, 15, 7, 32, 13,
+ 25, 9, 3, 4, 33, 21, 9, 9, 49, 23,
+ 37, 2, 22, 49, 21, 10, 31, 2, 25, 9,
+ 42, 31, 41, 3, 8, 51, 47, 117, 20, 32,
+ 50, 5, 19, 9, 39, 39, 39, 65, 63, 61,
+ 97, 81, 59, 99, 107, 115, 125, 97, 91, 73,
+ 35, 49, 12, 48, 8, 50, 92, 37, 43, 53,
+ 49, 97, 75, 63, 113, 71, 93, 83, 119, 107,
+ 101, 99, 41, 35, 107, 49, 53, 77, 101, 89,
+ 91, 93, 83, 101, 97, 103, 113, 125, 111, 25,
+ 41, 75, 5, 28, 20, 42, 44, 52, 98, 62,
+ 84, 84, 124, 104, 92, 124, 124, 76, 13, 57,
+ 97, 119, 125, 125, 125, 125, 32, 100, 86, 86,
+ 62, 88, 44, 34, 44, 24, 11, 2, 34, 8,
+ 72, 92, 4, 22, 40, 48, 12, 44, 80, 8,
+ 3, 24, 47, 97, 125, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 50 */
+
+ 32, 6, 39, 32, 6, 39, 48, 62, 58, 26,
+ 17, 81, 19, 3, 102, 28, 122, 17, 52, 66,
+ 3, 45, 6, 79, 113, 15, 53, 125, 125, 125,
+ 124, 44, 13, 52, 66, 3, 7, 44, 42, 19,
+ 8, 2, 4, 19, 49, 13, 61, 10, 5, 2,
+ 11, 35, 15, 49, 36, 10, 4, 1, 26, 4,
+ 44, 0, 0, 0, 18, 69, 67, 26, 35, 15,
+ 36, 3, 45, 44, 106, 82, 124, 124, 64, 60,
+ 44, 76, 36, 22, 54, 13, 19, 21, 7, 54,
+ 4, 20, 1, 1, 0, 2, 18, 11, 29, 27,
+ 47, 38, 9, 14, 0, 29, 8, 15, 16, 6,
+ 3, 10, 20, 6, 9, 5, 12, 7, 0, 11,
+ 11, 25, 1, 11, 35, 4, 11, 25, 4, 24,
+ 5, 45, 27, 29, 35, 8, 9, 18, 14, 14,
+ 82, 38, 10, 11, 26, 11, 37, 0, 89, 19,
+ 1, 55, 15, 51, 8, 1, 37, 38, 16, 8,
+ 75, 8, 45, 21, 6, 67, 2, 34, 20, 28,
+ 34, 26, 12, 32, 24, 9, 1, 18, 0, 1,
+ 23, 11, 23, 21, 13, 19, 31, 31, 29, 29,
+ 25, 9, 57, 71, 33, 43, 45, 53, 55, 73,
+ 83, 71, 71, 91, 103, 89, 95, 125, 115, 119,
+ 13, 21, 63, 33, 49, 55, 67, 85, 91, 71,
+ 79, 67, 47, 57, 53, 37, 31, 12, 52, 36,
+ 22, 18, 22, 12, 2, 12, 30, 30, 60, 42,
+ 30, 16, 46, 28, 36, 22, 76, 4, 88, 62,
+ 44, 28, 54, 10, 1, 1, 10, 13, 94, 34,
+ 0, 2, 24, 2, 15, 2, 10, 66, 36, 4,
+ 7, 34, 10, 6, 6, 6, 124, 3, 14, 16,
+ 30, 20, 18, 44, 46, 34, 44, 54, 66, 1,
+ 12, 23, 1, 76, 13, 63, 7, 20, 5, 35,
+ 45, 25, 47, 65, 83, 87, 17, 7, 32, 13,
+ 27, 11, 3, 4, 35, 23, 9, 9, 51, 23,
+ 37, 2, 22, 49, 23, 8, 33, 2, 27, 11,
+ 42, 33, 43, 3, 8, 53, 49, 121, 18, 30,
+ 48, 7, 21, 11, 43, 43, 43, 69, 69, 67,
+ 103, 85, 61, 105, 113, 121, 125, 103, 95, 77,
+ 37, 51, 12, 50, 8, 52, 96, 41, 47, 57,
+ 53, 101, 79, 67, 117, 73, 97, 85, 123, 109,
+ 103, 101, 43, 35, 109, 51, 55, 81, 105, 93,
+ 95, 95, 87, 105, 99, 105, 115, 125, 113, 27,
+ 43, 77, 5, 28, 20, 44, 44, 54, 100, 64,
+ 86, 86, 124, 106, 94, 124, 124, 74, 17, 61,
+ 101, 125, 125, 125, 125, 125, 32, 100, 86, 86,
+ 62, 90, 44, 34, 44, 24, 11, 2, 36, 8,
+ 74, 94, 4, 24, 42, 50, 12, 44, 82, 8,
+ 3, 22, 51, 101, 125, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 0, qp = 51 */
+
+ 30, 6, 39, 30, 6, 39, 52, 66, 60, 26,
+ 19, 85, 23, 5, 102, 28, 124, 17, 54, 68,
+ 3, 47, 6, 81, 115, 15, 57, 125, 125, 125,
+ 124, 46, 13, 54, 68, 3, 5, 46, 44, 19,
+ 8, 4, 6, 21, 51, 13, 61, 10, 5, 2,
+ 11, 35, 15, 49, 38, 10, 4, 1, 28, 4,
+ 44, 0, 0, 0, 18, 69, 67, 28, 37, 15,
+ 36, 3, 45, 48, 110, 84, 124, 124, 66, 62,
+ 48, 78, 38, 24, 58, 11, 19, 19, 5, 54,
+ 4, 20, 1, 1, 0, 4, 22, 11, 31, 29,
+ 49, 38, 9, 14, 2, 29, 10, 15, 18, 6,
+ 3, 10, 20, 4, 9, 5, 12, 7, 0, 11,
+ 13, 25, 1, 13, 37, 2, 13, 27, 4, 26,
+ 5, 47, 27, 27, 35, 8, 9, 18, 14, 14,
+ 84, 38, 10, 11, 28, 11, 39, 0, 91, 19,
+ 1, 55, 17, 53, 8, 1, 39, 38, 16, 8,
+ 79, 8, 47, 21, 6, 67, 0, 34, 20, 28,
+ 34, 26, 12, 32, 24, 9, 3, 18, 0, 1,
+ 23, 13, 25, 23, 15, 21, 33, 33, 29, 29,
+ 27, 9, 59, 75, 33, 45, 49, 57, 59, 77,
+ 87, 75, 75, 95, 107, 91, 97, 125, 119, 121,
+ 13, 21, 63, 35, 51, 57, 71, 87, 95, 73,
+ 81, 69, 47, 57, 53, 37, 31, 14, 52, 36,
+ 22, 18, 22, 12, 2, 14, 32, 30, 60, 42,
+ 30, 16, 48, 30, 38, 24, 80, 4, 88, 62,
+ 44, 28, 56, 10, 1, 1, 10, 13, 94, 34,
+ 1, 2, 24, 2, 15, 4, 10, 66, 36, 2,
+ 9, 34, 10, 6, 8, 8, 124, 1, 16, 16,
+ 32, 22, 20, 48, 48, 36, 46, 58, 68, 1,
+ 14, 23, 1, 80, 13, 63, 7, 20, 5, 37,
+ 45, 25, 49, 67, 85, 89, 17, 9, 32, 15,
+ 27, 11, 5, 4, 37, 23, 11, 9, 53, 23,
+ 39, 2, 22, 51, 23, 8, 35, 2, 29, 11,
+ 42, 35, 45, 5, 6, 55, 49, 123, 16, 28,
+ 48, 9, 23, 15, 47, 47, 47, 75, 73, 71,
+ 109, 91, 63, 111, 119, 125, 125, 109, 101, 81,
+ 39, 53, 12, 52, 8, 54, 100, 45, 51, 61,
+ 57, 105, 83, 69, 121, 77, 99, 87, 125, 113,
+ 105, 103, 43, 37, 111, 53, 59, 83, 109, 97,
+ 99, 99, 89, 109, 103, 107, 119, 125, 115, 29,
+ 45, 79, 5, 30, 20, 44, 46, 54, 102, 66,
+ 88, 88, 124, 108, 96, 124, 124, 74, 19, 65,
+ 107, 125, 125, 125, 125, 125, 34, 102, 88, 88,
+ 62, 92, 46, 34, 46, 26, 11, 4, 36, 10,
+ 76, 96, 4, 24, 42, 50, 12, 46, 84, 8,
+ 3, 20, 55, 107, 125, 125, 125, 125, 125, 125,
+ },
+
+ },
+
+ {
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 0 */
+
+ 124, 18, 21, 124, 18, 21, 125, 81, 20, 18,
+ 24, 76, 124, 124, 108, 44, 109, 3, 15, 31,
+ 22, 26, 13, 18, 58, 82, 124, 122, 54, 11,
+ 125, 75, 25, 15, 31, 22, 11, 53, 22, 40,
+ 11, 37, 65, 8, 23, 47, 73, 14, 21, 43,
+ 8, 35, 45, 63, 5, 27, 13, 45, 17, 4,
+ 44, 0, 0, 0, 39, 45, 67, 17, 44, 2,
+ 96, 24, 33, 125, 55, 65, 35, 69, 77, 67,
+ 111, 71, 93, 77, 125, 33, 51, 61, 57, 48,
+ 3, 41, 125, 19, 81, 55, 125, 16, 14, 16,
+ 4, 20, 9, 21, 49, 79, 55, 51, 57, 25,
+ 47, 93, 83, 29, 97, 71, 125, 125, 125, 125,
+ 5, 29, 15, 17, 8, 16, 13, 23, 51, 111,
+ 23, 86, 82, 125, 18, 4, 10, 6, 4, 7,
+ 41, 21, 3, 22, 12, 4, 11, 13, 16, 15,
+ 10, 4, 44, 76, 62, 40, 32, 38, 24, 34,
+ 50, 5, 50, 42, 58, 51, 36, 70, 64, 124,
+ 124, 96, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 106, 124, 124, 124, 124, 124, 124, 124,
+ 112, 124, 124, 124, 54, 124, 124, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 106, 90, 76, 44,
+ 23, 17, 27, 56, 64, 56, 66, 36, 42, 36,
+ 74, 18, 5, 14, 19, 7, 105, 97, 15, 4,
+ 20, 5, 27, 33, 41, 47, 125, 75, 48, 20,
+ 4, 23, 27, 55, 87, 95, 117, 25, 38, 22,
+ 12, 10, 17, 11, 11, 21, 45, 5, 58, 62,
+ 64, 22, 16, 7, 19, 51, 22, 118, 110, 110,
+ 88, 52, 4, 19, 13, 29, 124, 125, 121, 93,
+ 125, 121, 83, 115, 107, 77, 107, 105, 117, 63,
+ 73, 63, 95, 101, 51, 33, 37, 43, 35, 17,
+ 1, 7, 14, 11, 11, 11, 11, 7, 27, 1,
+ 4, 7, 1, 12, 3, 5, 2, 24, 5, 15,
+ 23, 13, 17, 6, 52, 32, 56, 52, 44, 44,
+ 30, 44, 44, 8, 26, 46, 5, 26, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 108, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 98, 74, 52, 16, 3, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 86,
+ 66, 38, 30, 28, 36, 82, 82, 84, 86, 70,
+ 78, 58, 42, 48, 26, 13, 18, 15, 39, 62,
+ 28, 18, 43, 35, 27, 35, 33, 19, 21, 39,
+ 15, 7, 4, 5, 5, 8, 8, 124, 124, 124,
+ 124, 124, 120, 106, 72, 12, 15, 78, 54, 42,
+ 22, 12, 0, 3, 7, 37, 35, 25, 17, 29,
+ 17, 9, 13, 25, 5, 2, 12, 4, 6, 18,
+ 10, 124, 124, 124, 124, 124, 120, 106, 72, 12,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 1 */
+
+ 124, 18, 21, 124, 18, 21, 123, 77, 22, 20,
+ 24, 74, 122, 124, 110, 44, 105, 3, 13, 29,
+ 22, 26, 11, 18, 56, 80, 122, 116, 50, 13,
+ 121, 73, 23, 13, 29, 22, 11, 51, 22, 40,
+ 9, 35, 63, 8, 23, 45, 71, 14, 19, 41,
+ 8, 33, 43, 61, 3, 25, 13, 43, 15, 4,
+ 44, 0, 0, 0, 37, 45, 67, 15, 44, 2,
+ 96, 24, 33, 121, 51, 61, 31, 63, 73, 63,
+ 107, 67, 89, 73, 121, 33, 49, 59, 55, 48,
+ 3, 39, 121, 17, 79, 53, 123, 16, 14, 16,
+ 4, 22, 9, 19, 47, 77, 53, 49, 55, 23,
+ 45, 89, 79, 27, 93, 67, 117, 117, 119, 121,
+ 3, 27, 13, 15, 8, 18, 11, 21, 49, 105,
+ 21, 82, 80, 121, 18, 6, 10, 8, 6, 5,
+ 37, 19, 1, 22, 12, 4, 9, 11, 14, 13,
+ 10, 4, 44, 74, 62, 40, 32, 38, 24, 34,
+ 48, 3, 50, 42, 58, 51, 36, 70, 64, 124,
+ 124, 94, 124, 124, 124, 122, 124, 124, 124, 124,
+ 124, 124, 104, 124, 124, 124, 124, 124, 124, 124,
+ 108, 124, 120, 124, 52, 124, 124, 124, 124, 124,
+ 124, 124, 124, 124, 124, 122, 104, 88, 74, 42,
+ 23, 17, 27, 56, 62, 54, 64, 34, 40, 34,
+ 72, 16, 5, 12, 19, 7, 103, 93, 13, 6,
+ 20, 3, 25, 31, 39, 45, 121, 71, 50, 22,
+ 6, 21, 25, 51, 83, 91, 113, 23, 40, 24,
+ 14, 12, 15, 9, 9, 19, 43, 5, 60, 62,
+ 64, 22, 18, 5, 19, 49, 22, 118, 110, 108,
+ 86, 52, 6, 17, 11, 27, 124, 121, 117, 89,
+ 121, 117, 79, 111, 103, 73, 103, 101, 111, 61,
+ 71, 61, 91, 97, 49, 31, 35, 41, 33, 15,
+ 1, 7, 14, 11, 11, 11, 9, 5, 25, 0,
+ 4, 5, 0, 12, 1, 3, 2, 24, 3, 13,
+ 21, 11, 15, 6, 50, 32, 54, 52, 44, 44,
+ 30, 44, 44, 8, 26, 44, 5, 24, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 104, 124, 124, 124, 124, 124, 124, 124,
+ 122, 124, 96, 72, 50, 16, 3, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 84,
+ 64, 36, 30, 28, 34, 80, 80, 82, 82, 68,
+ 76, 56, 40, 46, 24, 13, 16, 15, 39, 60,
+ 26, 16, 41, 33, 25, 33, 29, 15, 19, 37,
+ 13, 5, 6, 3, 3, 8, 8, 124, 124, 124,
+ 124, 120, 112, 98, 64, 8, 13, 78, 56, 44,
+ 24, 14, 2, 1, 5, 35, 33, 23, 15, 27,
+ 15, 7, 11, 23, 3, 4, 12, 6, 8, 18,
+ 10, 124, 124, 124, 124, 120, 112, 98, 64, 8,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 2 */
+
+ 124, 18, 21, 124, 18, 21, 119, 75, 22, 20,
+ 24, 72, 118, 122, 110, 44, 101, 3, 13, 27,
+ 22, 24, 11, 16, 52, 78, 116, 108, 44, 17,
+ 115, 71, 23, 13, 27, 22, 11, 49, 22, 38,
+ 9, 35, 61, 8, 23, 45, 71, 14, 19, 41,
+ 8, 33, 43, 61, 3, 25, 13, 43, 15, 4,
+ 44, 0, 0, 0, 35, 45, 67, 15, 42, 2,
+ 94, 24, 33, 117, 49, 59, 27, 59, 71, 61,
+ 103, 65, 87, 71, 117, 33, 49, 59, 55, 48,
+ 3, 37, 117, 17, 77, 51, 119, 16, 14, 16,
+ 2, 22, 9, 19, 45, 75, 51, 47, 53, 23,
+ 43, 87, 77, 25, 91, 65, 107, 109, 113, 115,
+ 3, 27, 13, 15, 8, 18, 11, 21, 49, 101,
+ 21, 78, 76, 115, 18, 6, 10, 8, 6, 5,
+ 33, 17, 1, 22, 12, 4, 7, 9, 12, 13,
+ 10, 4, 42, 72, 60, 40, 30, 38, 24, 34,
+ 46, 3, 48, 40, 56, 51, 36, 68, 62, 124,
+ 124, 92, 120, 124, 124, 118, 124, 124, 124, 124,
+ 124, 124, 100, 124, 124, 124, 124, 124, 124, 124,
+ 104, 124, 116, 124, 48, 124, 124, 124, 124, 124,
+ 124, 124, 124, 124, 124, 118, 100, 84, 70, 38,
+ 23, 17, 29, 54, 60, 52, 62, 32, 38, 32,
+ 68, 14, 5, 10, 21, 9, 101, 91, 11, 6,
+ 20, 3, 23, 29, 37, 43, 117, 69, 50, 22,
+ 6, 19, 23, 49, 79, 87, 109, 21, 42, 26,
+ 16, 14, 13, 9, 9, 19, 41, 5, 62, 62,
+ 62, 22, 18, 5, 19, 49, 22, 118, 108, 106,
+ 84, 52, 6, 17, 11, 27, 124, 119, 115, 87,
+ 117, 113, 77, 107, 99, 71, 99, 97, 107, 59,
+ 69, 61, 89, 93, 49, 31, 35, 39, 33, 15,
+ 1, 7, 12, 11, 11, 11, 9, 5, 23, 0,
+ 4, 5, 0, 12, 1, 3, 2, 22, 3, 13,
+ 21, 11, 13, 4, 48, 32, 52, 50, 42, 42,
+ 30, 42, 42, 8, 26, 42, 5, 22, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 100, 124, 124, 124, 124, 124, 124, 124,
+ 118, 118, 92, 68, 48, 14, 5, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 80,
+ 60, 32, 28, 26, 30, 78, 78, 78, 78, 64,
+ 72, 52, 38, 42, 22, 15, 14, 17, 41, 56,
+ 24, 14, 41, 33, 23, 33, 27, 13, 19, 35,
+ 11, 3, 6, 3, 1, 8, 8, 124, 124, 124,
+ 124, 114, 104, 90, 56, 2, 13, 78, 56, 44,
+ 24, 16, 2, 1, 5, 35, 33, 23, 15, 27,
+ 13, 5, 11, 23, 3, 4, 12, 6, 10, 18,
+ 10, 124, 124, 124, 124, 114, 104, 90, 56, 2,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 3 */
+
+ 124, 18, 21, 124, 18, 21, 115, 71, 24, 20,
+ 22, 68, 114, 120, 110, 44, 97, 3, 11, 25,
+ 22, 24, 11, 16, 50, 76, 112, 102, 40, 19,
+ 109, 69, 23, 11, 25, 22, 13, 47, 22, 38,
+ 9, 35, 61, 8, 23, 45, 71, 14, 19, 39,
+ 8, 33, 41, 61, 3, 25, 13, 43, 15, 4,
+ 44, 0, 0, 0, 35, 45, 67, 13, 40, 2,
+ 92, 22, 33, 111, 47, 57, 25, 55, 67, 57,
+ 99, 61, 85, 69, 113, 33, 49, 57, 55, 48,
+ 3, 35, 113, 17, 75, 51, 115, 16, 12, 14,
+ 2, 22, 9, 17, 45, 73, 49, 47, 51, 21,
+ 41, 83, 73, 25, 89, 63, 97, 99, 107, 109,
+ 3, 27, 13, 13, 8, 18, 9, 19, 47, 97,
+ 21, 74, 72, 109, 18, 6, 10, 8, 6, 3,
+ 31, 15, 1, 22, 12, 4, 7, 7, 10, 13,
+ 10, 2, 42, 70, 60, 40, 30, 38, 24, 34,
+ 44, 3, 46, 38, 56, 51, 36, 68, 62, 124,
+ 124, 90, 116, 124, 124, 114, 124, 124, 124, 124,
+ 124, 122, 96, 124, 124, 124, 124, 124, 124, 120,
+ 100, 124, 112, 124, 44, 124, 124, 124, 124, 124,
+ 124, 124, 124, 124, 124, 114, 96, 80, 68, 34,
+ 23, 17, 29, 52, 58, 50, 60, 30, 36, 30,
+ 64, 12, 7, 8, 23, 9, 101, 87, 9, 8,
+ 20, 3, 21, 29, 37, 43, 113, 67, 50, 22,
+ 8, 17, 21, 47, 77, 85, 105, 19, 42, 26,
+ 16, 14, 11, 7, 9, 19, 41, 5, 62, 62,
+ 60, 22, 18, 5, 19, 47, 22, 116, 108, 104,
+ 82, 52, 6, 17, 11, 27, 124, 117, 111, 85,
+ 115, 111, 75, 103, 95, 69, 97, 93, 103, 59,
+ 67, 59, 87, 89, 47, 31, 35, 39, 31, 15,
+ 1, 7, 12, 11, 11, 13, 7, 3, 21, 0,
+ 4, 3, 0, 12, 1, 3, 2, 22, 3, 13,
+ 21, 11, 13, 2, 46, 32, 50, 48, 40, 42,
+ 30, 40, 40, 8, 26, 40, 5, 20, 124, 124,
+ 122, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 96, 124, 124, 124, 124, 124, 124, 124,
+ 114, 114, 88, 64, 44, 12, 7, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 118, 120, 76,
+ 56, 30, 26, 24, 28, 74, 74, 74, 74, 62,
+ 68, 48, 36, 40, 20, 17, 12, 19, 43, 54,
+ 22, 12, 41, 31, 23, 31, 25, 11, 19, 35,
+ 11, 3, 6, 1, 0, 8, 8, 124, 124, 124,
+ 118, 108, 96, 82, 48, 3, 13, 78, 56, 44,
+ 24, 16, 4, 1, 5, 33, 33, 23, 13, 25,
+ 11, 3, 11, 21, 3, 4, 12, 6, 10, 18,
+ 10, 124, 124, 124, 118, 108, 96, 82, 48, 3,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 4 */
+
+ 124, 18, 21, 124, 18, 21, 113, 69, 24, 20,
+ 22, 66, 110, 118, 110, 42, 93, 3, 11, 23,
+ 20, 22, 11, 14, 46, 74, 106, 94, 34, 23,
+ 103, 67, 23, 11, 23, 20, 13, 45, 22, 36,
+ 9, 33, 59, 8, 23, 45, 71, 14, 19, 39,
+ 8, 33, 41, 59, 3, 25, 13, 43, 13, 4,
+ 44, 0, 0, 0, 33, 47, 67, 13, 38, 2,
+ 90, 22, 33, 107, 45, 55, 21, 51, 65, 55,
+ 97, 59, 81, 67, 109, 33, 47, 57, 55, 48,
+ 3, 33, 109, 17, 75, 49, 111, 16, 12, 14,
+ 0, 22, 9, 17, 43, 71, 47, 45, 49, 21,
+ 41, 81, 71, 23, 87, 61, 87, 91, 101, 103,
+ 3, 25, 13, 13, 8, 18, 9, 19, 47, 93,
+ 21, 70, 68, 105, 18, 8, 10, 8, 6, 3,
+ 27, 13, 0, 20, 12, 4, 5, 7, 8, 13,
+ 10, 2, 40, 68, 58, 38, 28, 38, 24, 34,
+ 42, 3, 44, 36, 54, 51, 34, 66, 60, 124,
+ 124, 88, 112, 124, 124, 110, 124, 124, 124, 124,
+ 124, 118, 92, 118, 124, 124, 124, 124, 124, 114,
+ 96, 124, 108, 124, 42, 124, 124, 124, 124, 124,
+ 124, 124, 124, 124, 120, 110, 92, 76, 64, 30,
+ 23, 17, 31, 50, 56, 48, 56, 28, 32, 28,
+ 62, 10, 7, 6, 23, 11, 99, 85, 7, 8,
+ 20, 1, 21, 27, 35, 41, 109, 63, 50, 24,
+ 8, 17, 19, 45, 73, 81, 103, 19, 44, 28,
+ 18, 16, 9, 7, 9, 17, 39, 5, 64, 62,
+ 60, 20, 18, 5, 19, 47, 22, 116, 106, 102,
+ 80, 52, 6, 15, 11, 27, 124, 113, 109, 83,
+ 111, 107, 73, 101, 93, 67, 93, 91, 99, 57,
+ 65, 59, 85, 87, 47, 31, 35, 37, 31, 15,
+ 3, 7, 10, 11, 11, 13, 7, 3, 19, 0,
+ 4, 3, 0, 12, 1, 3, 2, 20, 3, 13,
+ 21, 11, 11, 0, 44, 32, 48, 48, 38, 40,
+ 30, 38, 38, 8, 26, 38, 5, 18, 124, 124,
+ 120, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 92, 124, 124, 124, 124, 124, 124, 124,
+ 108, 108, 84, 60, 42, 10, 7, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 114, 114, 72,
+ 52, 26, 24, 24, 24, 72, 72, 72, 70, 58,
+ 64, 46, 34, 36, 18, 19, 8, 21, 43, 50,
+ 18, 8, 39, 31, 21, 31, 23, 9, 19, 33,
+ 9, 1, 6, 1, 2, 8, 8, 124, 124, 124,
+ 112, 100, 88, 72, 40, 9, 11, 78, 56, 44,
+ 24, 18, 4, 1, 5, 33, 33, 23, 13, 25,
+ 11, 1, 11, 21, 1, 6, 12, 6, 12, 18,
+ 10, 124, 124, 124, 112, 100, 88, 72, 40, 9,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 5 */
+
+ 124, 18, 21, 124, 18, 21, 109, 65, 24, 20,
+ 20, 64, 106, 116, 110, 42, 89, 3, 11, 21,
+ 20, 22, 11, 12, 42, 72, 102, 88, 30, 27,
+ 97, 65, 21, 11, 21, 20, 13, 43, 22, 36,
+ 9, 33, 57, 8, 23, 45, 71, 14, 19, 39,
+ 8, 33, 39, 59, 3, 25, 13, 43, 13, 4,
+ 44, 0, 0, 0, 33, 47, 67, 11, 36, 2,
+ 88, 20, 33, 101, 43, 53, 17, 47, 61, 51,
+ 93, 55, 79, 65, 103, 33, 47, 55, 53, 48,
+ 3, 31, 105, 17, 73, 49, 107, 16, 10, 12,
+ 0, 22, 9, 15, 43, 69, 45, 45, 47, 19,
+ 39, 77, 67, 21, 83, 59, 77, 83, 95, 97,
+ 1, 25, 11, 11, 8, 18, 7, 19, 45, 89,
+ 21, 66, 64, 99, 18, 8, 10, 8, 6, 1,
+ 25, 11, 0, 20, 12, 4, 5, 5, 6, 11,
+ 10, 0, 40, 66, 58, 38, 28, 38, 24, 34,
+ 40, 1, 42, 36, 54, 51, 34, 64, 58, 124,
+ 124, 86, 110, 124, 124, 106, 124, 124, 124, 124,
+ 122, 114, 88, 114, 124, 120, 124, 124, 124, 110,
+ 92, 124, 104, 124, 38, 124, 124, 124, 124, 124,
+ 124, 124, 124, 122, 116, 106, 88, 74, 60, 26,
+ 23, 17, 31, 48, 54, 46, 54, 26, 30, 26,
+ 58, 8, 9, 4, 25, 13, 97, 81, 5, 10,
+ 20, 1, 19, 27, 35, 39, 105, 61, 50, 24,
+ 10, 15, 17, 43, 71, 79, 99, 17, 46, 30,
+ 20, 16, 7, 5, 7, 17, 39, 5, 64, 62,
+ 58, 20, 18, 5, 19, 45, 22, 114, 104, 100,
+ 78, 52, 6, 15, 11, 25, 124, 111, 105, 79,
+ 107, 105, 71, 97, 89, 65, 89, 87, 95, 55,
+ 63, 57, 83, 83, 47, 31, 33, 37, 29, 15,
+ 3, 7, 10, 11, 11, 15, 5, 3, 17, 0,
+ 4, 3, 0, 12, 1, 3, 2, 20, 3, 13,
+ 21, 11, 11, 1, 42, 32, 46, 46, 38, 38,
+ 30, 38, 36, 8, 26, 36, 5, 16, 124, 124,
+ 118, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 88, 124, 124, 124, 124, 124, 124, 122,
+ 104, 104, 80, 58, 38, 10, 9, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 122, 110, 108, 68,
+ 48, 24, 24, 22, 20, 70, 68, 68, 66, 54,
+ 60, 42, 32, 34, 16, 19, 6, 23, 45, 48,
+ 16, 6, 39, 31, 19, 29, 21, 7, 17, 31,
+ 9, 1, 6, 0, 4, 8, 8, 124, 124, 118,
+ 106, 94, 80, 64, 32, 15, 11, 78, 56, 44,
+ 24, 18, 4, 0, 3, 31, 33, 23, 11, 25,
+ 9, 0, 11, 21, 1, 6, 12, 8, 12, 18,
+ 10, 124, 124, 118, 106, 94, 80, 64, 32, 15,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 6 */
+
+ 124, 18, 23, 124, 18, 23, 105, 63, 26, 20,
+ 20, 60, 102, 114, 110, 42, 87, 3, 9, 21,
+ 20, 20, 9, 12, 40, 68, 96, 80, 24, 29,
+ 93, 63, 21, 9, 21, 20, 15, 43, 22, 34,
+ 9, 33, 57, 8, 23, 43, 69, 14, 17, 37,
+ 8, 31, 39, 59, 3, 25, 13, 43, 13, 4,
+ 44, 0, 0, 0, 31, 47, 67, 11, 36, 0,
+ 88, 20, 33, 97, 41, 51, 15, 41, 59, 49,
+ 89, 53, 77, 63, 99, 33, 47, 55, 53, 48,
+ 3, 29, 99, 17, 71, 47, 103, 14, 10, 12,
+ 1, 24, 9, 15, 41, 69, 45, 43, 45, 19,
+ 37, 75, 65, 21, 81, 57, 67, 73, 89, 91,
+ 1, 25, 11, 11, 8, 18, 7, 17, 45, 85,
+ 19, 62, 60, 93, 18, 8, 10, 8, 8, 1,
+ 21, 9, 0, 20, 12, 4, 3, 3, 4, 11,
+ 10, 0, 38, 64, 56, 38, 26, 38, 24, 34,
+ 36, 1, 40, 34, 52, 51, 34, 64, 58, 124,
+ 124, 84, 106, 124, 124, 102, 124, 124, 124, 124,
+ 114, 110, 86, 110, 124, 116, 124, 124, 124, 104,
+ 88, 124, 100, 124, 34, 124, 124, 124, 124, 124,
+ 124, 124, 124, 118, 112, 100, 84, 70, 58, 24,
+ 23, 17, 33, 46, 52, 44, 52, 24, 28, 24,
+ 54, 6, 9, 2, 27, 13, 97, 79, 3, 10,
+ 20, 1, 17, 25, 33, 39, 101, 59, 52, 24,
+ 10, 13, 15, 41, 67, 75, 95, 15, 46, 30,
+ 20, 18, 5, 5, 7, 17, 37, 5, 66, 62,
+ 56, 20, 18, 5, 19, 45, 20, 114, 104, 98,
+ 76, 50, 6, 15, 11, 25, 124, 109, 103, 77,
+ 105, 101, 69, 93, 85, 63, 87, 83, 91, 55,
+ 61, 57, 81, 79, 45, 31, 33, 35, 29, 15,
+ 3, 7, 8, 11, 11, 15, 5, 1, 15, 0,
+ 4, 1, 2, 12, 0, 1, 2, 18, 3, 13,
+ 21, 11, 9, 3, 40, 32, 44, 44, 36, 38,
+ 30, 36, 36, 8, 24, 32, 7, 14, 124, 124,
+ 116, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 84, 124, 124, 124, 124, 124, 124, 116,
+ 100, 98, 76, 54, 36, 8, 11, 124, 124, 124,
+ 124, 124, 124, 124, 124, 122, 116, 104, 102, 64,
+ 46, 20, 22, 20, 18, 66, 66, 64, 62, 52,
+ 56, 38, 30, 30, 14, 21, 4, 25, 47, 44,
+ 14, 4, 39, 29, 19, 29, 19, 5, 17, 31,
+ 7, 0, 6, 0, 6, 8, 8, 124, 124, 114,
+ 100, 88, 72, 56, 24, 21, 11, 78, 56, 44,
+ 24, 20, 6, 0, 3, 31, 31, 21, 11, 23,
+ 7, 2, 9, 19, 1, 6, 12, 8, 14, 18,
+ 10, 124, 124, 114, 100, 88, 72, 56, 24, 21,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 7 */
+
+ 124, 18, 23, 124, 18, 23, 101, 59, 26, 20,
+ 18, 58, 98, 112, 110, 42, 83, 3, 9, 19,
+ 18, 20, 9, 10, 36, 66, 92, 74, 20, 33,
+ 87, 61, 21, 9, 19, 18, 15, 41, 22, 34,
+ 9, 31, 55, 8, 23, 43, 69, 14, 17, 37,
+ 8, 31, 37, 57, 3, 25, 13, 43, 11, 4,
+ 44, 0, 0, 0, 31, 47, 67, 9, 34, 0,
+ 86, 18, 33, 91, 39, 49, 11, 37, 55, 45,
+ 87, 49, 73, 61, 95, 33, 45, 53, 53, 48,
+ 3, 27, 95, 17, 69, 47, 99, 14, 8, 10,
+ 1, 24, 9, 13, 41, 67, 43, 43, 43, 17,
+ 35, 71, 61, 19, 79, 55, 57, 65, 83, 85,
+ 1, 23, 11, 9, 8, 18, 5, 17, 43, 81,
+ 19, 58, 56, 87, 18, 10, 10, 8, 8, 0,
+ 19, 7, 2, 18, 12, 4, 3, 3, 2, 11,
+ 10, 1, 38, 62, 56, 36, 26, 38, 24, 34,
+ 34, 1, 38, 32, 52, 51, 34, 62, 56, 120,
+ 124, 82, 102, 124, 124, 98, 124, 122, 124, 124,
+ 108, 106, 82, 104, 124, 110, 124, 124, 124, 98,
+ 84, 124, 96, 124, 32, 124, 124, 124, 124, 124,
+ 124, 124, 124, 114, 106, 96, 80, 66, 54, 20,
+ 23, 17, 33, 44, 50, 42, 48, 22, 26, 22,
+ 52, 4, 11, 0, 27, 15, 95, 75, 1, 12,
+ 20, 0, 17, 25, 33, 37, 97, 55, 52, 26,
+ 12, 13, 13, 39, 65, 73, 91, 15, 48, 32,
+ 22, 18, 3, 3, 7, 15, 37, 5, 66, 62,
+ 56, 18, 18, 5, 19, 43, 20, 112, 102, 96,
+ 74, 50, 6, 13, 11, 25, 124, 105, 99, 75,
+ 101, 99, 67, 91, 83, 61, 83, 81, 87, 53,
+ 59, 55, 79, 75, 45, 31, 33, 35, 27, 15,
+ 5, 7, 8, 11, 11, 17, 3, 1, 13, 0,
+ 4, 1, 2, 12, 0, 1, 2, 18, 3, 13,
+ 21, 11, 9, 5, 38, 32, 42, 44, 34, 36,
+ 30, 34, 34, 8, 24, 30, 7, 12, 122, 124,
+ 114, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 80, 124, 124, 124, 124, 124, 124, 112,
+ 96, 94, 72, 50, 32, 6, 11, 124, 124, 124,
+ 124, 124, 124, 124, 124, 118, 112, 100, 96, 60,
+ 42, 18, 20, 20, 14, 64, 62, 62, 58, 48,
+ 52, 36, 28, 28, 12, 23, 0, 27, 47, 42,
+ 10, 0, 37, 29, 17, 27, 17, 3, 17, 29,
+ 7, 0, 6, 2, 8, 8, 8, 124, 124, 108,
+ 94, 80, 64, 48, 16, 27, 9, 78, 56, 44,
+ 24, 20, 6, 0, 3, 29, 31, 21, 9, 23,
+ 5, 4, 9, 19, 0, 8, 12, 8, 14, 18,
+ 10, 124, 124, 108, 94, 80, 64, 48, 16, 27,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 8 */
+
+ 124, 16, 23, 124, 16, 23, 99, 57, 26, 20,
+ 18, 54, 92, 110, 110, 40, 79, 5, 9, 17,
+ 18, 18, 9, 8, 32, 64, 86, 66, 14, 37,
+ 81, 59, 21, 9, 17, 18, 17, 39, 22, 32,
+ 9, 31, 55, 6, 25, 43, 69, 14, 17, 37,
+ 8, 31, 37, 57, 3, 25, 13, 43, 11, 4,
+ 44, 0, 0, 0, 29, 49, 67, 9, 32, 0,
+ 84, 18, 35, 87, 37, 47, 9, 33, 53, 43,
+ 83, 47, 71, 59, 91, 33, 45, 53, 53, 48,
+ 3, 25, 91, 17, 69, 45, 95, 14, 8, 10,
+ 3, 24, 9, 13, 39, 65, 41, 41, 43, 17,
+ 35, 69, 59, 19, 77, 53, 49, 57, 77, 81,
+ 1, 23, 11, 9, 6, 18, 5, 17, 43, 77,
+ 19, 54, 52, 83, 18, 10, 10, 8, 8, 0,
+ 15, 7, 2, 18, 10, 4, 1, 1, 1, 11,
+ 10, 1, 36, 58, 54, 36, 24, 38, 24, 32,
+ 32, 1, 36, 30, 50, 51, 32, 60, 54, 116,
+ 124, 78, 98, 124, 124, 92, 124, 118, 124, 124,
+ 100, 102, 78, 100, 124, 106, 124, 124, 124, 92,
+ 80, 124, 92, 124, 28, 124, 124, 124, 124, 124,
+ 124, 124, 120, 110, 102, 92, 76, 62, 50, 16,
+ 23, 19, 35, 42, 46, 40, 46, 20, 22, 18,
+ 48, 2, 11, 1, 29, 17, 95, 73, 0, 12,
+ 20, 0, 15, 23, 31, 37, 93, 53, 52, 26,
+ 12, 11, 11, 37, 61, 69, 89, 13, 48, 32,
+ 22, 20, 1, 3, 7, 15, 35, 7, 68, 62,
+ 54, 18, 18, 5, 19, 43, 20, 112, 100, 94,
+ 72, 50, 6, 13, 11, 25, 124, 103, 97, 73,
+ 99, 95, 65, 87, 79, 59, 81, 77, 83, 53,
+ 59, 55, 77, 73, 45, 31, 33, 33, 27, 15,
+ 5, 7, 6, 11, 11, 17, 3, 1, 11, 0,
+ 2, 1, 2, 10, 0, 1, 2, 16, 3, 13,
+ 21, 11, 7, 7, 36, 32, 38, 42, 32, 34,
+ 28, 32, 32, 8, 24, 28, 7, 8, 120, 120,
+ 112, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 120, 76, 124, 124, 124, 124, 124, 124, 106,
+ 90, 88, 68, 46, 30, 4, 13, 124, 124, 124,
+ 124, 124, 124, 124, 124, 112, 106, 94, 90, 56,
+ 38, 14, 18, 18, 10, 60, 60, 58, 54, 44,
+ 48, 32, 24, 24, 8, 25, 1, 29, 49, 38,
+ 8, 1, 37, 29, 17, 27, 15, 1, 17, 29,
+ 5, 2, 6, 2, 8, 8, 6, 124, 120, 102,
+ 88, 74, 56, 38, 6, 33, 9, 78, 56, 44,
+ 24, 22, 6, 0, 3, 29, 31, 21, 9, 23,
+ 5, 4, 9, 19, 0, 8, 12, 8, 16, 18,
+ 8, 124, 120, 102, 88, 74, 56, 38, 6, 33,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 9 */
+
+ 124, 16, 23, 124, 16, 23, 95, 55, 28, 20,
+ 18, 52, 88, 108, 112, 40, 75, 5, 7, 15,
+ 18, 16, 9, 8, 30, 62, 82, 58, 8, 39,
+ 75, 57, 19, 7, 15, 18, 17, 37, 22, 32,
+ 7, 31, 53, 6, 25, 43, 69, 14, 17, 35,
+ 8, 31, 37, 57, 3, 25, 13, 41, 11, 4,
+ 44, 0, 0, 0, 27, 49, 67, 9, 30, 0,
+ 82, 18, 35, 83, 33, 45, 5, 29, 49, 41,
+ 79, 43, 69, 55, 85, 33, 45, 53, 51, 48,
+ 3, 23, 87, 15, 67, 43, 91, 14, 8, 10,
+ 3, 24, 9, 13, 37, 63, 39, 39, 41, 15,
+ 33, 67, 55, 17, 73, 51, 39, 47, 69, 75,
+ 0, 23, 9, 7, 6, 18, 5, 15, 41, 71,
+ 19, 50, 50, 77, 18, 10, 10, 8, 8, 2,
+ 11, 5, 2, 18, 10, 4, 0, 0, 3, 9,
+ 10, 1, 34, 56, 52, 36, 22, 38, 24, 32,
+ 30, 0, 34, 30, 48, 51, 32, 60, 54, 112,
+ 124, 76, 96, 124, 124, 88, 120, 114, 124, 124,
+ 94, 98, 74, 96, 124, 102, 124, 124, 124, 88,
+ 76, 124, 88, 124, 24, 124, 124, 124, 124, 124,
+ 124, 120, 116, 106, 98, 88, 74, 60, 48, 12,
+ 23, 19, 35, 42, 44, 38, 44, 18, 20, 16,
+ 44, 0, 11, 3, 31, 17, 93, 71, 2, 12,
+ 20, 0, 13, 21, 29, 35, 87, 51, 52, 26,
+ 12, 9, 9, 35, 57, 65, 85, 11, 50, 34,
+ 24, 22, 0, 3, 5, 15, 33, 7, 70, 62,
+ 52, 18, 20, 3, 19, 41, 20, 112, 100, 92,
+ 70, 50, 6, 13, 11, 23, 124, 101, 95, 69,
+ 95, 91, 63, 83, 75, 57, 77, 73, 79, 51,
+ 57, 53, 75, 69, 43, 29, 31, 31, 25, 15,
+ 5, 7, 4, 11, 11, 17, 3, 0, 9, 2,
+ 2, 0, 2, 10, 0, 1, 2, 14, 3, 11,
+ 19, 11, 5, 7, 34, 32, 36, 40, 32, 34,
+ 28, 32, 30, 8, 24, 26, 7, 6, 118, 118,
+ 112, 122, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 114, 72, 124, 124, 124, 124, 124, 124, 100,
+ 86, 84, 66, 44, 28, 4, 15, 124, 124, 124,
+ 124, 124, 124, 124, 124, 108, 102, 90, 86, 52,
+ 34, 10, 18, 16, 8, 58, 58, 54, 50, 42,
+ 46, 28, 22, 20, 6, 25, 3, 29, 51, 34,
+ 6, 3, 37, 27, 15, 27, 13, 2, 15, 27,
+ 3, 4, 6, 4, 10, 8, 6, 124, 116, 98,
+ 82, 68, 48, 30, 1, 39, 9, 78, 56, 46,
+ 26, 24, 8, 2, 1, 29, 31, 21, 9, 21,
+ 3, 6, 9, 17, 0, 8, 12, 10, 18, 18,
+ 8, 124, 116, 98, 82, 68, 48, 30, 1, 39,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 10 */
+
+ 124, 16, 23, 124, 16, 23, 91, 51, 28, 20,
+ 16, 50, 84, 106, 112, 40, 71, 5, 7, 13,
+ 16, 16, 9, 6, 26, 60, 76, 52, 4, 43,
+ 69, 55, 19, 7, 13, 16, 17, 35, 22, 30,
+ 7, 29, 51, 6, 25, 43, 69, 14, 17, 35,
+ 8, 31, 35, 55, 3, 25, 13, 41, 9, 4,
+ 44, 0, 0, 0, 27, 49, 67, 7, 28, 0,
+ 80, 16, 35, 77, 31, 43, 1, 25, 47, 37,
+ 77, 41, 65, 53, 81, 33, 43, 51, 51, 48,
+ 3, 21, 83, 15, 65, 43, 87, 14, 6, 8,
+ 5, 24, 9, 11, 37, 61, 37, 39, 39, 15,
+ 31, 63, 53, 15, 71, 49, 29, 39, 63, 69,
+ 0, 21, 9, 7, 6, 18, 3, 15, 41, 67,
+ 19, 46, 46, 71, 18, 12, 10, 8, 8, 2,
+ 9, 3, 4, 16, 10, 4, 0, 0, 5, 9,
+ 10, 3, 34, 54, 52, 34, 22, 38, 24, 32,
+ 28, 0, 32, 28, 48, 51, 32, 58, 52, 108,
+ 124, 74, 92, 124, 124, 84, 114, 110, 124, 124,
+ 86, 94, 70, 90, 122, 96, 124, 124, 124, 82,
+ 72, 116, 84, 124, 22, 124, 124, 124, 124, 124,
+ 120, 116, 112, 102, 92, 84, 70, 56, 44, 8,
+ 23, 19, 37, 40, 42, 36, 40, 16, 18, 14,
+ 42, 1, 13, 5, 31, 19, 91, 67, 4, 14,
+ 20, 2, 13, 21, 29, 33, 83, 47, 52, 28,
+ 14, 9, 7, 33, 55, 63, 81, 11, 52, 36,
+ 26, 22, 2, 1, 5, 13, 33, 7, 70, 62,
+ 52, 16, 20, 3, 19, 41, 20, 110, 98, 90,
+ 68, 50, 6, 11, 11, 23, 124, 97, 91, 67,
+ 91, 89, 61, 81, 73, 55, 73, 71, 75, 49,
+ 55, 53, 73, 65, 43, 29, 31, 31, 25, 15,
+ 7, 7, 4, 11, 11, 19, 1, 0, 7, 2,
+ 2, 0, 2, 10, 0, 1, 2, 14, 3, 11,
+ 19, 11, 5, 9, 32, 32, 34, 40, 30, 32,
+ 28, 30, 28, 8, 24, 24, 7, 4, 116, 116,
+ 110, 118, 120, 124, 124, 124, 124, 124, 124, 124,
+ 124, 110, 68, 124, 124, 124, 124, 124, 124, 96,
+ 82, 78, 62, 40, 24, 2, 15, 124, 124, 124,
+ 124, 124, 124, 124, 124, 104, 96, 86, 80, 48,
+ 30, 8, 16, 16, 4, 56, 54, 52, 46, 38,
+ 42, 26, 20, 18, 4, 27, 7, 31, 51, 32,
+ 2, 7, 35, 27, 13, 25, 11, 4, 15, 25,
+ 3, 4, 6, 4, 12, 8, 6, 124, 112, 92,
+ 76, 60, 40, 22, 9, 45, 7, 78, 56, 46,
+ 26, 24, 8, 2, 1, 27, 31, 21, 7, 21,
+ 1, 8, 9, 17, 2, 10, 12, 10, 18, 18,
+ 8, 124, 112, 92, 76, 60, 40, 22, 9, 45,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 11 */
+
+ 124, 16, 25, 124, 16, 25, 87, 49, 30, 20,
+ 16, 46, 80, 104, 112, 40, 69, 5, 5, 13,
+ 16, 14, 7, 6, 24, 56, 72, 44, 1, 45,
+ 65, 53, 19, 5, 13, 16, 19, 35, 22, 30,
+ 7, 29, 51, 6, 25, 41, 67, 14, 15, 33,
+ 8, 29, 35, 55, 3, 25, 13, 41, 9, 4,
+ 44, 0, 0, 0, 25, 49, 67, 7, 28, 1,
+ 80, 16, 35, 73, 29, 41, 0, 19, 43, 35,
+ 73, 37, 63, 51, 77, 33, 43, 51, 51, 48,
+ 3, 19, 77, 15, 63, 41, 83, 12, 6, 8,
+ 5, 26, 9, 11, 35, 61, 37, 37, 37, 13,
+ 29, 61, 49, 15, 69, 47, 19, 29, 57, 63,
+ 0, 21, 9, 5, 6, 18, 3, 13, 39, 63,
+ 17, 42, 42, 65, 18, 12, 10, 8, 10, 4,
+ 5, 1, 4, 16, 10, 4, 2, 2, 7, 9,
+ 10, 3, 32, 52, 50, 34, 20, 38, 24, 32,
+ 24, 0, 30, 26, 46, 51, 32, 58, 52, 104,
+ 124, 72, 88, 122, 124, 80, 110, 106, 124, 124,
+ 80, 90, 68, 86, 114, 92, 124, 124, 124, 76,
+ 68, 110, 80, 124, 18, 124, 124, 124, 124, 124,
+ 116, 110, 108, 98, 88, 78, 66, 52, 42, 6,
+ 23, 19, 37, 38, 40, 34, 38, 14, 16, 12,
+ 38, 3, 13, 7, 33, 19, 91, 65, 6, 14,
+ 20, 2, 11, 19, 27, 33, 79, 45, 54, 28,
+ 14, 7, 5, 31, 51, 59, 77, 9, 52, 36,
+ 26, 24, 4, 1, 5, 13, 31, 7, 72, 62,
+ 50, 16, 20, 3, 19, 39, 18, 110, 98, 88,
+ 66, 48, 6, 11, 11, 23, 124, 95, 89, 65,
+ 89, 85, 59, 77, 69, 53, 71, 67, 71, 49,
+ 53, 51, 71, 61, 41, 29, 31, 29, 23, 15,
+ 7, 7, 2, 11, 11, 19, 1, 2, 5, 2,
+ 2, 2, 4, 10, 2, 0, 2, 12, 3, 11,
+ 19, 11, 3, 11, 30, 32, 32, 38, 28, 32,
+ 28, 28, 28, 8, 22, 20, 9, 2, 112, 114,
+ 108, 116, 116, 124, 124, 124, 124, 124, 124, 124,
+ 124, 104, 64, 124, 124, 124, 124, 124, 124, 90,
+ 78, 74, 58, 36, 22, 0, 17, 124, 124, 124,
+ 124, 124, 124, 120, 118, 98, 92, 80, 74, 44,
+ 28, 4, 14, 14, 2, 52, 52, 48, 42, 36,
+ 38, 22, 18, 14, 2, 29, 9, 33, 53, 28,
+ 0, 9, 35, 25, 13, 25, 9, 6, 15, 25,
+ 1, 6, 6, 6, 14, 8, 6, 124, 108, 88,
+ 70, 54, 32, 14, 17, 51, 7, 78, 56, 46,
+ 26, 26, 10, 2, 1, 27, 29, 19, 7, 19,
+ 0, 10, 7, 15, 2, 10, 12, 10, 20, 18,
+ 8, 124, 108, 88, 70, 54, 32, 14, 17, 51,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 12 */
+
+ 124, 16, 25, 124, 16, 25, 85, 45, 30, 20,
+ 14, 44, 76, 102, 112, 38, 65, 5, 5, 11,
+ 16, 14, 7, 4, 20, 54, 66, 38, 5, 49,
+ 59, 51, 19, 5, 11, 16, 19, 33, 22, 28,
+ 7, 29, 49, 6, 25, 41, 67, 14, 15, 33,
+ 8, 29, 33, 55, 3, 25, 13, 41, 9, 4,
+ 44, 0, 0, 0, 25, 51, 67, 5, 26, 1,
+ 78, 14, 35, 67, 27, 39, 4, 15, 41, 31,
+ 69, 35, 61, 49, 73, 33, 43, 49, 51, 48,
+ 3, 17, 73, 15, 63, 41, 79, 12, 4, 6,
+ 7, 26, 9, 9, 35, 59, 35, 37, 35, 13,
+ 29, 57, 47, 13, 67, 45, 9, 21, 51, 57,
+ 0, 21, 9, 5, 6, 18, 1, 13, 39, 59,
+ 17, 38, 38, 61, 18, 12, 10, 8, 10, 4,
+ 3, 0, 4, 16, 10, 4, 2, 4, 9, 9,
+ 10, 5, 32, 50, 50, 34, 20, 38, 24, 32,
+ 22, 0, 28, 24, 46, 51, 30, 56, 50, 100,
+ 124, 70, 84, 118, 120, 76, 104, 102, 124, 124,
+ 72, 86, 64, 82, 108, 86, 116, 124, 124, 70,
+ 64, 102, 76, 124, 14, 124, 124, 124, 124, 124,
+ 112, 106, 104, 94, 84, 74, 62, 48, 38, 2,
+ 23, 19, 39, 36, 38, 32, 36, 12, 12, 10,
+ 34, 5, 15, 9, 35, 21, 89, 61, 8, 16,
+ 20, 2, 9, 19, 27, 31, 75, 43, 54, 28,
+ 16, 5, 3, 29, 49, 57, 75, 7, 54, 38,
+ 28, 24, 6, 0, 5, 13, 31, 7, 72, 62,
+ 48, 16, 20, 3, 19, 39, 18, 108, 96, 86,
+ 64, 48, 6, 11, 11, 23, 124, 93, 85, 63,
+ 85, 83, 57, 73, 65, 51, 67, 63, 67, 47,
+ 51, 51, 69, 59, 41, 29, 31, 29, 23, 15,
+ 7, 7, 2, 11, 11, 21, 0, 2, 3, 2,
+ 2, 2, 4, 10, 2, 0, 2, 12, 3, 11,
+ 19, 11, 3, 13, 28, 32, 30, 36, 26, 30,
+ 28, 26, 26, 8, 22, 18, 9, 0, 110, 112,
+ 106, 112, 112, 124, 122, 124, 124, 124, 124, 124,
+ 122, 100, 60, 124, 124, 124, 124, 124, 118, 86,
+ 72, 68, 54, 32, 18, 1, 19, 124, 124, 124,
+ 124, 124, 124, 114, 112, 94, 86, 76, 68, 40,
+ 24, 2, 12, 12, 1, 50, 48, 44, 38, 32,
+ 34, 18, 16, 12, 0, 31, 11, 35, 55, 26,
+ 1, 11, 35, 25, 11, 23, 7, 8, 15, 23,
+ 1, 6, 6, 6, 16, 8, 6, 122, 104, 82,
+ 64, 48, 24, 4, 25, 57, 7, 78, 56, 46,
+ 26, 26, 10, 2, 1, 25, 29, 19, 5, 19,
+ 0, 12, 7, 15, 2, 10, 12, 10, 20, 18,
+ 8, 122, 104, 82, 64, 48, 24, 4, 25, 57,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 13 */
+
+ 124, 16, 25, 124, 16, 25, 81, 43, 30, 20,
+ 14, 42, 72, 100, 112, 38, 61, 5, 5, 9,
+ 14, 12, 7, 2, 16, 52, 62, 30, 11, 53,
+ 53, 49, 17, 5, 9, 14, 19, 31, 22, 28,
+ 7, 27, 47, 6, 25, 41, 67, 14, 15, 33,
+ 8, 29, 33, 53, 3, 25, 13, 41, 7, 4,
+ 44, 0, 0, 0, 23, 51, 67, 5, 24, 1,
+ 76, 14, 35, 63, 25, 37, 8, 11, 37, 29,
+ 67, 31, 57, 47, 67, 33, 41, 49, 49, 48,
+ 3, 15, 69, 15, 61, 39, 75, 12, 4, 6,
+ 7, 26, 9, 9, 33, 57, 33, 35, 33, 11,
+ 27, 55, 43, 11, 63, 43, 0, 13, 45, 51,
+ 2, 19, 7, 3, 6, 18, 1, 13, 37, 55,
+ 17, 34, 34, 55, 18, 14, 10, 8, 10, 6,
+ 0, 2, 6, 14, 10, 4, 4, 4, 11, 7,
+ 10, 5, 30, 48, 48, 32, 18, 38, 24, 32,
+ 20, 2, 26, 24, 44, 51, 30, 54, 48, 96,
+ 124, 68, 82, 114, 116, 72, 100, 98, 124, 124,
+ 66, 82, 60, 76, 102, 82, 110, 124, 124, 66,
+ 60, 96, 72, 124, 12, 124, 124, 124, 122, 120,
+ 108, 102, 100, 90, 78, 70, 58, 46, 34, 1,
+ 23, 19, 39, 34, 36, 30, 32, 10, 10, 8,
+ 32, 7, 15, 11, 35, 23, 87, 59, 10, 16,
+ 20, 4, 9, 17, 25, 29, 71, 39, 54, 30,
+ 16, 5, 1, 27, 45, 53, 71, 7, 56, 40,
+ 30, 26, 8, 0, 3, 11, 29, 7, 74, 62,
+ 48, 14, 20, 3, 19, 37, 18, 108, 94, 84,
+ 62, 48, 6, 9, 11, 21, 124, 89, 83, 59,
+ 81, 79, 55, 71, 63, 49, 63, 61, 63, 45,
+ 49, 49, 67, 55, 41, 29, 29, 27, 21, 15,
+ 9, 7, 0, 11, 11, 21, 0, 2, 1, 2,
+ 2, 2, 4, 10, 2, 0, 2, 10, 3, 11,
+ 19, 11, 1, 15, 26, 32, 28, 36, 26, 28,
+ 28, 26, 24, 8, 22, 16, 9, 1, 108, 110,
+ 104, 108, 108, 124, 118, 122, 124, 118, 124, 124,
+ 116, 94, 56, 124, 124, 124, 124, 118, 112, 80,
+ 68, 64, 50, 30, 16, 1, 19, 124, 124, 124,
+ 124, 118, 118, 110, 106, 90, 82, 72, 62, 36,
+ 20, 1, 12, 12, 5, 48, 46, 42, 34, 28,
+ 30, 16, 14, 8, 1, 31, 15, 37, 55, 22,
+ 5, 15, 33, 25, 9, 23, 5, 10, 13, 21,
+ 0, 8, 6, 8, 18, 8, 6, 120, 100, 76,
+ 58, 40, 16, 3, 33, 63, 5, 78, 56, 46,
+ 26, 28, 10, 4, 0, 25, 29, 19, 5, 19,
+ 2, 14, 7, 15, 4, 12, 12, 12, 22, 18,
+ 8, 120, 100, 76, 58, 40, 16, 3, 33, 63,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 14 */
+
+ 122, 16, 25, 122, 16, 25, 77, 39, 32, 20,
+ 12, 38, 68, 98, 112, 38, 57, 5, 3, 7,
+ 14, 12, 7, 2, 14, 50, 56, 24, 15, 55,
+ 47, 47, 17, 3, 7, 14, 21, 29, 22, 26,
+ 7, 27, 47, 6, 25, 41, 67, 14, 15, 31,
+ 8, 29, 31, 53, 3, 25, 13, 41, 7, 4,
+ 44, 0, 0, 0, 23, 51, 67, 3, 22, 1,
+ 74, 12, 35, 57, 23, 35, 10, 7, 35, 25,
+ 63, 29, 55, 45, 63, 33, 41, 47, 49, 48,
+ 3, 13, 65, 15, 59, 39, 71, 12, 2, 4,
+ 9, 26, 9, 7, 33, 55, 31, 35, 31, 11,
+ 25, 51, 41, 11, 61, 41, 10, 3, 39, 45,
+ 2, 19, 7, 3, 6, 18, 0, 11, 37, 51,
+ 17, 30, 30, 49, 18, 14, 10, 8, 10, 6,
+ 2, 4, 6, 14, 10, 4, 4, 6, 13, 7,
+ 10, 7, 30, 46, 48, 32, 18, 38, 24, 32,
+ 18, 2, 24, 22, 44, 51, 30, 54, 48, 92,
+ 122, 66, 78, 110, 110, 68, 94, 94, 124, 124,
+ 58, 78, 56, 72, 96, 76, 104, 122, 124, 60,
+ 56, 88, 68, 124, 8, 120, 124, 120, 116, 114,
+ 104, 98, 96, 86, 74, 66, 54, 42, 32, 5,
+ 23, 19, 41, 32, 34, 28, 30, 8, 8, 6,
+ 28, 9, 17, 13, 37, 23, 87, 55, 12, 18,
+ 20, 4, 7, 17, 25, 29, 67, 37, 54, 30,
+ 18, 3, 0, 25, 43, 51, 67, 5, 56, 40,
+ 30, 26, 10, 2, 3, 11, 29, 7, 74, 62,
+ 46, 14, 20, 3, 19, 37, 18, 106, 94, 82,
+ 60, 48, 6, 9, 11, 21, 124, 87, 79, 57,
+ 79, 77, 53, 67, 59, 47, 61, 57, 59, 45,
+ 47, 49, 65, 51, 39, 29, 29, 27, 21, 15,
+ 9, 7, 0, 11, 11, 23, 2, 4, 0, 2,
+ 2, 4, 4, 10, 2, 0, 2, 10, 3, 11,
+ 19, 11, 1, 17, 24, 32, 26, 34, 24, 28,
+ 28, 24, 22, 8, 22, 14, 9, 3, 106, 108,
+ 102, 106, 104, 120, 114, 118, 118, 114, 124, 120,
+ 110, 90, 52, 124, 124, 124, 124, 110, 106, 76,
+ 64, 58, 46, 26, 12, 3, 21, 124, 124, 124,
+ 120, 112, 114, 104, 100, 84, 76, 66, 56, 32,
+ 16, 3, 10, 10, 7, 44, 42, 38, 30, 26,
+ 26, 12, 12, 6, 3, 33, 17, 39, 57, 20,
+ 7, 17, 33, 23, 9, 21, 3, 12, 13, 21,
+ 0, 8, 6, 8, 20, 8, 6, 118, 96, 72,
+ 52, 34, 8, 11, 41, 69, 5, 78, 56, 46,
+ 26, 28, 12, 4, 0, 23, 29, 19, 3, 17,
+ 4, 16, 7, 13, 4, 12, 12, 12, 22, 18,
+ 8, 118, 96, 72, 52, 34, 8, 11, 41, 69,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 15 */
+
+ 120, 16, 25, 120, 16, 25, 73, 37, 32, 20,
+ 12, 36, 64, 96, 112, 38, 53, 5, 3, 5,
+ 14, 10, 7, 0, 10, 48, 52, 16, 21, 59,
+ 41, 45, 17, 3, 5, 14, 21, 27, 22, 26,
+ 7, 27, 45, 6, 25, 41, 67, 14, 15, 31,
+ 8, 29, 31, 53, 3, 25, 13, 41, 7, 4,
+ 44, 0, 0, 0, 21, 51, 67, 3, 20, 1,
+ 72, 12, 35, 53, 21, 33, 14, 3, 31, 23,
+ 59, 25, 53, 43, 59, 33, 41, 47, 49, 48,
+ 3, 11, 61, 15, 57, 37, 67, 12, 2, 4,
+ 9, 26, 9, 7, 31, 53, 29, 33, 29, 9,
+ 23, 49, 37, 9, 59, 39, 20, 4, 33, 39,
+ 2, 19, 7, 1, 6, 18, 0, 11, 35, 47,
+ 17, 26, 26, 43, 18, 14, 10, 8, 10, 8,
+ 6, 6, 6, 14, 10, 4, 6, 8, 15, 7,
+ 10, 7, 28, 44, 46, 32, 16, 38, 24, 32,
+ 16, 2, 22, 20, 42, 51, 30, 52, 46, 88,
+ 116, 64, 74, 106, 106, 64, 90, 90, 124, 124,
+ 52, 74, 52, 68, 90, 72, 98, 114, 124, 54,
+ 52, 82, 64, 124, 4, 116, 124, 116, 112, 110,
+ 100, 94, 92, 82, 70, 62, 50, 38, 28, 9,
+ 23, 19, 41, 30, 32, 26, 28, 6, 6, 4,
+ 24, 11, 17, 15, 39, 25, 85, 53, 14, 18,
+ 20, 4, 5, 15, 23, 27, 63, 35, 54, 30,
+ 18, 1, 2, 23, 39, 47, 63, 3, 58, 42,
+ 32, 28, 12, 2, 3, 11, 27, 7, 76, 62,
+ 44, 14, 20, 3, 19, 35, 18, 106, 92, 80,
+ 58, 48, 6, 9, 11, 21, 124, 85, 77, 55,
+ 75, 73, 51, 63, 55, 45, 57, 53, 55, 43,
+ 45, 47, 63, 47, 39, 29, 29, 25, 19, 15,
+ 9, 7, 1, 11, 11, 23, 2, 4, 2, 2,
+ 2, 4, 4, 10, 2, 0, 2, 8, 3, 11,
+ 19, 11, 0, 19, 22, 32, 24, 32, 22, 26,
+ 28, 22, 20, 8, 22, 12, 9, 5, 104, 106,
+ 100, 102, 100, 116, 110, 114, 114, 108, 122, 114,
+ 104, 84, 48, 124, 124, 124, 124, 104, 100, 70,
+ 60, 54, 42, 22, 10, 5, 23, 124, 124, 124,
+ 116, 106, 108, 100, 94, 80, 72, 62, 50, 28,
+ 12, 7, 8, 8, 11, 42, 40, 34, 26, 22,
+ 22, 8, 10, 2, 5, 35, 19, 41, 59, 16,
+ 9, 19, 33, 23, 7, 21, 1, 14, 13, 19,
+ 2, 10, 6, 10, 22, 8, 6, 116, 92, 66,
+ 46, 28, 0, 19, 49, 75, 5, 78, 56, 46,
+ 26, 30, 12, 4, 0, 23, 29, 19, 3, 17,
+ 6, 18, 7, 13, 4, 12, 12, 12, 24, 18,
+ 8, 116, 92, 66, 46, 28, 0, 19, 49, 75,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 16 */
+
+ 116, 14, 27, 116, 14, 27, 71, 35, 32, 20,
+ 10, 32, 58, 94, 112, 36, 51, 7, 3, 5,
+ 12, 8, 7, 1, 6, 44, 46, 8, 27, 63,
+ 37, 45, 17, 3, 5, 12, 23, 27, 22, 24,
+ 7, 27, 45, 4, 27, 41, 67, 12, 15, 31,
+ 8, 29, 31, 53, 3, 25, 15, 41, 7, 4,
+ 44, 0, 0, 0, 21, 53, 67, 3, 18, 3,
+ 70, 10, 37, 49, 19, 31, 16, 0, 29, 21,
+ 57, 23, 51, 41, 55, 33, 41, 47, 49, 48,
+ 3, 11, 57, 15, 57, 37, 65, 10, 0, 2,
+ 11, 26, 9, 7, 31, 53, 29, 33, 29, 9,
+ 23, 47, 35, 9, 57, 37, 28, 12, 27, 35,
+ 2, 19, 7, 1, 4, 18, 0, 11, 35, 43,
+ 17, 22, 22, 39, 18, 14, 10, 8, 10, 8,
+ 8, 6, 6, 12, 8, 4, 6, 8, 19, 7,
+ 10, 9, 26, 40, 44, 30, 14, 38, 24, 30,
+ 12, 2, 20, 18, 40, 51, 28, 50, 44, 82,
+ 108, 60, 70, 100, 100, 58, 84, 86, 110, 124,
+ 44, 68, 48, 62, 82, 66, 90, 104, 118, 48,
+ 48, 74, 60, 124, 0, 110, 118, 110, 106, 104,
+ 94, 88, 86, 78, 64, 56, 46, 34, 24, 13,
+ 23, 21, 43, 28, 28, 22, 24, 2, 2, 0,
+ 20, 13, 19, 17, 41, 27, 85, 51, 14, 18,
+ 20, 4, 5, 15, 23, 27, 59, 33, 54, 30,
+ 18, 1, 2, 21, 37, 45, 61, 3, 58, 42,
+ 32, 28, 14, 2, 3, 11, 27, 9, 76, 60,
+ 42, 12, 20, 3, 19, 35, 16, 104, 90, 76,
+ 56, 46, 6, 9, 11, 21, 124, 83, 75, 53,
+ 73, 71, 49, 61, 53, 43, 55, 51, 51, 43,
+ 45, 47, 61, 45, 39, 29, 29, 25, 19, 15,
+ 11, 9, 3, 11, 13, 25, 2, 4, 4, 2,
+ 0, 4, 4, 8, 2, 0, 2, 6, 3, 11,
+ 19, 11, 0, 21, 20, 32, 20, 30, 20, 24,
+ 26, 20, 18, 8, 20, 8, 11, 9, 100, 102,
+ 98, 98, 96, 110, 104, 108, 108, 102, 116, 108,
+ 96, 78, 44, 124, 124, 122, 120, 96, 92, 64,
+ 54, 48, 38, 18, 6, 7, 25, 118, 120, 120,
+ 110, 100, 102, 94, 86, 74, 66, 56, 44, 24,
+ 8, 11, 6, 6, 15, 38, 36, 30, 20, 18,
+ 18, 4, 6, 1, 9, 37, 23, 43, 61, 12,
+ 13, 23, 33, 23, 7, 21, 0, 16, 13, 19,
+ 2, 10, 6, 10, 22, 8, 4, 112, 88, 60,
+ 38, 20, 7, 29, 59, 81, 5, 78, 56, 46,
+ 26, 30, 12, 4, 0, 23, 29, 19, 3, 17,
+ 6, 18, 7, 13, 4, 12, 12, 12, 24, 16,
+ 6, 112, 88, 60, 38, 20, 7, 29, 59, 81,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 17 */
+
+ 114, 14, 27, 114, 14, 27, 67, 31, 34, 22,
+ 10, 30, 54, 92, 114, 36, 47, 7, 1, 3,
+ 12, 8, 5, 1, 4, 42, 42, 2, 31, 65,
+ 31, 43, 15, 1, 3, 12, 23, 25, 22, 24,
+ 5, 25, 43, 4, 27, 39, 65, 12, 13, 29,
+ 8, 27, 29, 51, 1, 23, 15, 39, 5, 4,
+ 44, 0, 0, 0, 19, 53, 67, 1, 18, 3,
+ 70, 10, 37, 43, 15, 27, 20, 6, 25, 17,
+ 53, 19, 47, 37, 49, 33, 39, 45, 47, 48,
+ 3, 9, 51, 13, 55, 35, 61, 10, 0, 2,
+ 11, 28, 9, 5, 29, 51, 27, 31, 27, 7,
+ 21, 43, 31, 7, 53, 33, 38, 22, 19, 29,
+ 4, 17, 5, 0, 4, 20, 2, 9, 33, 37,
+ 15, 18, 20, 33, 18, 16, 10, 10, 12, 10,
+ 12, 8, 8, 12, 8, 4, 8, 10, 21, 5,
+ 10, 9, 26, 38, 44, 30, 14, 38, 24, 30,
+ 10, 4, 20, 18, 40, 51, 28, 50, 44, 78,
+ 102, 58, 68, 96, 96, 54, 80, 82, 98, 124,
+ 38, 64, 46, 58, 76, 62, 84, 96, 110, 44,
+ 44, 68, 56, 124, 1, 106, 114, 106, 102, 100,
+ 90, 84, 82, 74, 60, 52, 44, 32, 22, 15,
+ 23, 21, 43, 28, 26, 20, 22, 0, 0, 1,
+ 18, 15, 19, 19, 41, 27, 83, 47, 16, 20,
+ 20, 6, 3, 13, 21, 25, 53, 29, 56, 32,
+ 20, 0, 4, 17, 33, 41, 57, 1, 60, 44,
+ 34, 30, 16, 4, 1, 9, 25, 9, 78, 60,
+ 42, 12, 22, 1, 19, 33, 16, 104, 90, 74,
+ 54, 46, 8, 7, 9, 19, 124, 79, 71, 49,
+ 69, 67, 45, 57, 49, 39, 51, 47, 45, 41,
+ 43, 45, 57, 41, 37, 27, 27, 23, 17, 13,
+ 11, 9, 3, 11, 13, 25, 4, 6, 6, 4,
+ 0, 6, 6, 8, 4, 2, 2, 6, 1, 9,
+ 17, 9, 2, 21, 18, 32, 18, 30, 20, 24,
+ 26, 20, 18, 8, 20, 6, 11, 11, 98, 100,
+ 98, 96, 94, 106, 100, 104, 104, 98, 112, 104,
+ 90, 74, 40, 122, 120, 114, 112, 90, 86, 60,
+ 50, 44, 36, 16, 4, 7, 25, 114, 116, 116,
+ 106, 96, 98, 90, 80, 70, 62, 52, 40, 22,
+ 6, 13, 6, 6, 17, 36, 34, 28, 16, 16,
+ 16, 2, 4, 3, 11, 37, 25, 43, 61, 10,
+ 15, 25, 31, 21, 5, 19, 4, 20, 11, 17,
+ 4, 12, 8, 12, 24, 8, 4, 110, 84, 56,
+ 32, 14, 15, 37, 67, 85, 3, 78, 58, 48,
+ 28, 32, 14, 6, 2, 21, 27, 17, 1, 15,
+ 8, 20, 5, 11, 6, 14, 12, 14, 26, 16,
+ 6, 110, 84, 56, 32, 14, 15, 37, 67, 85,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 18 */
+
+ 112, 14, 27, 112, 14, 27, 63, 29, 34, 22,
+ 10, 28, 50, 90, 114, 36, 43, 7, 1, 1,
+ 12, 6, 5, 3, 0, 40, 36, 5, 37, 69,
+ 25, 41, 15, 1, 1, 12, 23, 23, 22, 22,
+ 5, 25, 41, 4, 27, 39, 65, 12, 13, 29,
+ 8, 27, 29, 51, 1, 23, 15, 39, 5, 4,
+ 44, 0, 0, 0, 17, 53, 67, 1, 16, 3,
+ 68, 10, 37, 39, 13, 25, 24, 10, 23, 15,
+ 49, 17, 45, 35, 45, 33, 39, 45, 47, 48,
+ 3, 7, 47, 13, 53, 33, 57, 10, 0, 2,
+ 13, 28, 9, 5, 27, 49, 25, 29, 25, 7,
+ 19, 41, 29, 5, 51, 31, 48, 30, 13, 23,
+ 4, 17, 5, 0, 4, 20, 2, 9, 33, 33,
+ 15, 14, 16, 27, 18, 16, 10, 10, 12, 10,
+ 16, 10, 8, 12, 8, 4, 10, 12, 23, 5,
+ 10, 9, 24, 36, 42, 30, 12, 38, 24, 30,
+ 8, 4, 18, 16, 38, 51, 28, 48, 42, 74,
+ 96, 56, 64, 92, 92, 50, 76, 78, 86, 124,
+ 30, 60, 42, 54, 70, 58, 78, 88, 102, 38,
+ 40, 62, 52, 124, 5, 102, 110, 102, 98, 96,
+ 86, 80, 78, 70, 56, 48, 40, 28, 18, 19,
+ 23, 21, 45, 26, 24, 18, 20, 1, 1, 3,
+ 14, 17, 19, 21, 43, 29, 81, 45, 18, 20,
+ 20, 6, 1, 11, 19, 23, 49, 27, 56, 32,
+ 20, 2, 6, 15, 29, 37, 53, 0, 62, 46,
+ 36, 32, 18, 4, 1, 9, 23, 9, 80, 60,
+ 40, 12, 22, 1, 19, 33, 16, 104, 88, 72,
+ 52, 46, 8, 7, 9, 19, 124, 77, 69, 47,
+ 65, 63, 43, 53, 45, 37, 47, 43, 41, 39,
+ 41, 45, 55, 37, 37, 27, 27, 21, 17, 13,
+ 11, 9, 5, 11, 13, 25, 4, 6, 8, 4,
+ 0, 6, 6, 8, 4, 2, 2, 4, 1, 9,
+ 17, 9, 4, 23, 16, 32, 16, 28, 18, 22,
+ 26, 18, 16, 8, 20, 4, 11, 13, 96, 98,
+ 96, 92, 90, 102, 96, 100, 100, 92, 106, 98,
+ 84, 68, 36, 114, 112, 106, 102, 84, 80, 54,
+ 46, 38, 32, 12, 2, 9, 27, 110, 112, 110,
+ 102, 90, 92, 84, 74, 66, 56, 48, 34, 18,
+ 2, 17, 4, 4, 21, 34, 32, 24, 12, 12,
+ 12, 1, 2, 7, 13, 39, 27, 45, 63, 6,
+ 17, 27, 31, 21, 3, 19, 6, 22, 11, 15,
+ 6, 14, 8, 12, 26, 8, 4, 108, 80, 50,
+ 26, 8, 23, 45, 75, 91, 3, 78, 58, 48,
+ 28, 34, 14, 6, 2, 21, 27, 17, 1, 15,
+ 10, 22, 5, 11, 6, 14, 12, 14, 28, 16,
+ 6, 108, 80, 50, 26, 8, 23, 45, 75, 91,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 19 */
+
+ 110, 14, 27, 110, 14, 27, 59, 25, 36, 22,
+ 8, 24, 46, 88, 114, 36, 39, 7, 0, 0,
+ 12, 6, 5, 3, 1, 38, 32, 11, 41, 71,
+ 19, 39, 15, 0, 0, 12, 25, 21, 22, 22,
+ 5, 25, 41, 4, 27, 39, 65, 12, 13, 27,
+ 8, 27, 27, 51, 1, 23, 15, 39, 5, 4,
+ 44, 0, 0, 0, 17, 53, 67, 0, 14, 3,
+ 66, 8, 37, 33, 11, 23, 26, 14, 19, 11,
+ 45, 13, 43, 33, 41, 33, 39, 43, 47, 48,
+ 3, 5, 43, 13, 51, 33, 53, 10, 1, 0,
+ 13, 28, 9, 3, 27, 47, 23, 29, 23, 5,
+ 17, 37, 25, 5, 49, 29, 58, 40, 7, 17,
+ 4, 17, 5, 2, 4, 20, 4, 7, 31, 29,
+ 15, 10, 12, 21, 18, 16, 10, 10, 12, 12,
+ 18, 12, 8, 12, 8, 4, 10, 14, 25, 5,
+ 10, 11, 24, 34, 42, 30, 12, 38, 24, 30,
+ 6, 4, 16, 14, 38, 51, 28, 48, 42, 70,
+ 90, 54, 60, 88, 86, 46, 70, 74, 72, 124,
+ 24, 56, 38, 50, 64, 52, 72, 80, 94, 32,
+ 36, 54, 48, 124, 9, 98, 106, 98, 92, 90,
+ 82, 76, 74, 66, 52, 44, 36, 24, 16, 23,
+ 23, 21, 45, 24, 22, 16, 18, 3, 3, 5,
+ 10, 19, 21, 23, 45, 29, 81, 41, 20, 22,
+ 20, 6, 0, 11, 19, 23, 45, 25, 56, 32,
+ 22, 4, 8, 13, 27, 35, 49, 2, 62, 46,
+ 36, 32, 20, 6, 1, 9, 23, 9, 80, 60,
+ 38, 12, 22, 1, 19, 31, 16, 102, 88, 70,
+ 50, 46, 8, 7, 9, 19, 124, 75, 65, 45,
+ 63, 61, 41, 49, 41, 35, 45, 39, 37, 39,
+ 39, 43, 53, 33, 35, 27, 27, 21, 15, 13,
+ 11, 9, 5, 11, 13, 27, 6, 8, 10, 4,
+ 0, 8, 6, 8, 4, 2, 2, 4, 1, 9,
+ 17, 9, 4, 25, 14, 32, 14, 26, 16, 22,
+ 26, 16, 14, 8, 20, 2, 11, 15, 94, 96,
+ 94, 90, 86, 98, 92, 96, 94, 88, 100, 92,
+ 78, 64, 32, 106, 104, 98, 92, 76, 74, 50,
+ 42, 34, 28, 8, 1, 11, 29, 106, 106, 106,
+ 96, 84, 88, 80, 68, 60, 52, 42, 28, 14,
+ 1, 19, 2, 2, 23, 30, 28, 20, 8, 10,
+ 8, 5, 0, 9, 15, 41, 29, 47, 65, 4,
+ 19, 29, 31, 19, 3, 17, 8, 24, 11, 15,
+ 6, 14, 8, 14, 28, 8, 4, 106, 76, 46,
+ 20, 2, 31, 53, 83, 97, 3, 78, 58, 48,
+ 28, 34, 16, 6, 2, 19, 27, 17, 0, 13,
+ 12, 24, 5, 9, 6, 14, 12, 14, 28, 16,
+ 6, 106, 76, 46, 20, 2, 31, 53, 83, 97,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 20 */
+
+ 106, 14, 27, 106, 14, 27, 57, 23, 36, 22,
+ 8, 22, 42, 86, 114, 34, 35, 7, 0, 2,
+ 10, 4, 5, 5, 5, 36, 26, 19, 47, 75,
+ 13, 37, 15, 0, 2, 10, 25, 19, 22, 20,
+ 5, 23, 39, 4, 27, 39, 65, 12, 13, 27,
+ 8, 27, 27, 49, 1, 23, 15, 39, 3, 4,
+ 44, 0, 0, 0, 15, 55, 67, 0, 12, 3,
+ 64, 8, 37, 29, 9, 21, 30, 18, 17, 9,
+ 43, 11, 39, 31, 37, 33, 37, 43, 47, 48,
+ 3, 3, 39, 13, 51, 31, 49, 10, 1, 0,
+ 15, 28, 9, 3, 25, 45, 21, 27, 21, 5,
+ 17, 35, 23, 3, 47, 27, 68, 48, 1, 11,
+ 4, 15, 5, 2, 4, 20, 4, 7, 31, 25,
+ 15, 6, 8, 17, 18, 18, 10, 10, 12, 12,
+ 22, 14, 10, 10, 8, 4, 12, 14, 27, 5,
+ 10, 11, 22, 32, 40, 28, 10, 38, 24, 30,
+ 4, 4, 14, 12, 36, 51, 26, 46, 40, 66,
+ 82, 52, 56, 84, 82, 42, 66, 70, 60, 124,
+ 16, 52, 34, 44, 58, 48, 64, 70, 86, 26,
+ 32, 48, 44, 124, 11, 94, 102, 92, 88, 86,
+ 78, 72, 70, 62, 46, 40, 32, 20, 12, 27,
+ 23, 21, 47, 22, 20, 14, 14, 5, 7, 7,
+ 8, 21, 21, 25, 45, 31, 79, 39, 22, 22,
+ 20, 8, 0, 9, 17, 21, 41, 21, 56, 34,
+ 22, 4, 10, 11, 23, 31, 47, 2, 64, 48,
+ 38, 34, 22, 6, 1, 7, 21, 9, 82, 60,
+ 38, 10, 22, 1, 19, 31, 16, 102, 86, 68,
+ 48, 46, 8, 5, 9, 19, 124, 71, 63, 43,
+ 59, 57, 39, 47, 39, 33, 41, 37, 33, 37,
+ 37, 43, 51, 31, 35, 27, 27, 19, 15, 13,
+ 13, 9, 7, 11, 13, 27, 6, 8, 12, 4,
+ 0, 8, 6, 8, 4, 2, 2, 2, 1, 9,
+ 17, 9, 6, 27, 12, 32, 12, 26, 14, 20,
+ 26, 14, 12, 8, 20, 0, 11, 17, 92, 94,
+ 92, 86, 82, 94, 88, 90, 90, 82, 94, 86,
+ 72, 58, 28, 96, 96, 90, 82, 70, 66, 44,
+ 36, 28, 24, 4, 3, 13, 29, 100, 102, 100,
+ 92, 78, 82, 74, 62, 56, 46, 38, 22, 10,
+ 5, 23, 0, 2, 27, 28, 26, 18, 4, 6,
+ 4, 7, 1, 13, 17, 43, 33, 49, 65, 0,
+ 23, 33, 29, 19, 1, 17, 10, 26, 11, 13,
+ 8, 16, 8, 14, 30, 8, 4, 104, 72, 40,
+ 14, 5, 39, 63, 91, 103, 1, 78, 58, 48,
+ 28, 36, 16, 6, 2, 19, 27, 17, 0, 13,
+ 12, 26, 5, 9, 8, 16, 12, 14, 30, 16,
+ 6, 104, 72, 40, 14, 5, 39, 63, 91, 103,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 21 */
+
+ 104, 14, 27, 104, 14, 27, 53, 19, 36, 22,
+ 6, 20, 38, 84, 114, 34, 31, 7, 0, 4,
+ 10, 4, 5, 7, 9, 34, 22, 25, 51, 79,
+ 7, 35, 13, 0, 4, 10, 25, 17, 22, 20,
+ 5, 23, 37, 4, 27, 39, 65, 12, 13, 27,
+ 8, 27, 25, 49, 1, 23, 15, 39, 3, 4,
+ 44, 0, 0, 0, 15, 55, 67, 2, 10, 3,
+ 62, 6, 37, 23, 7, 19, 34, 22, 13, 5,
+ 39, 7, 37, 29, 31, 33, 37, 41, 45, 48,
+ 3, 1, 35, 13, 49, 31, 45, 10, 3, 1,
+ 15, 28, 9, 1, 25, 43, 19, 27, 19, 3,
+ 15, 31, 19, 1, 43, 25, 78, 56, 4, 5,
+ 6, 15, 3, 4, 4, 20, 6, 7, 29, 21,
+ 15, 2, 4, 11, 18, 18, 10, 10, 12, 14,
+ 24, 16, 10, 10, 8, 4, 12, 16, 29, 3,
+ 10, 13, 22, 30, 40, 28, 10, 38, 24, 30,
+ 2, 6, 12, 12, 36, 51, 26, 44, 38, 62,
+ 76, 50, 54, 80, 78, 38, 60, 66, 48, 124,
+ 10, 48, 30, 40, 52, 42, 58, 62, 78, 22,
+ 28, 40, 40, 124, 15, 90, 98, 88, 84, 82,
+ 74, 68, 66, 58, 42, 36, 28, 18, 8, 31,
+ 23, 21, 47, 20, 18, 12, 12, 7, 9, 9,
+ 4, 23, 23, 27, 47, 33, 77, 35, 24, 24,
+ 20, 8, 2, 9, 17, 19, 37, 19, 56, 34,
+ 24, 6, 12, 9, 21, 29, 43, 4, 66, 50,
+ 40, 34, 24, 8, 0, 7, 21, 9, 82, 60,
+ 36, 10, 22, 1, 19, 29, 16, 100, 84, 66,
+ 46, 46, 8, 5, 9, 17, 124, 69, 59, 39,
+ 55, 55, 37, 43, 35, 31, 37, 33, 29, 35,
+ 35, 41, 49, 27, 35, 27, 25, 19, 13, 13,
+ 13, 9, 7, 11, 13, 29, 8, 8, 14, 4,
+ 0, 8, 6, 8, 4, 2, 2, 2, 1, 9,
+ 17, 9, 6, 29, 10, 32, 10, 24, 14, 18,
+ 26, 14, 10, 8, 20, 1, 11, 19, 90, 92,
+ 90, 82, 78, 90, 84, 86, 84, 76, 88, 80,
+ 66, 54, 24, 88, 88, 82, 72, 64, 60, 40,
+ 32, 24, 20, 2, 7, 13, 31, 96, 96, 96,
+ 88, 72, 76, 70, 56, 52, 42, 34, 16, 6,
+ 9, 25, 0, 0, 31, 26, 22, 14, 0, 2,
+ 0, 11, 3, 15, 19, 43, 35, 51, 67, 1,
+ 25, 35, 29, 19, 0, 15, 12, 28, 9, 11,
+ 8, 16, 8, 16, 32, 8, 4, 102, 68, 34,
+ 8, 11, 47, 71, 99, 109, 1, 78, 58, 48,
+ 28, 36, 16, 8, 4, 17, 27, 17, 2, 13,
+ 14, 28, 5, 9, 8, 16, 12, 16, 30, 16,
+ 6, 102, 68, 34, 8, 11, 47, 71, 99, 109,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 22 */
+
+ 102, 14, 29, 102, 14, 29, 49, 17, 38, 22,
+ 6, 16, 34, 82, 114, 34, 29, 7, 2, 4,
+ 10, 2, 3, 7, 11, 30, 16, 33, 57, 81,
+ 3, 33, 13, 2, 4, 10, 27, 17, 22, 18,
+ 5, 23, 37, 4, 27, 37, 63, 12, 11, 25,
+ 8, 25, 25, 49, 1, 23, 15, 39, 3, 4,
+ 44, 0, 0, 0, 13, 55, 67, 2, 10, 5,
+ 62, 6, 37, 19, 5, 17, 36, 28, 11, 3,
+ 35, 5, 35, 27, 27, 33, 37, 41, 45, 48,
+ 3, 0, 29, 13, 47, 29, 41, 8, 3, 1,
+ 17, 30, 9, 1, 23, 43, 19, 25, 17, 3,
+ 13, 29, 17, 1, 41, 23, 88, 66, 10, 0,
+ 6, 15, 3, 4, 4, 20, 6, 5, 29, 17,
+ 13, 1, 0, 5, 18, 18, 10, 10, 14, 14,
+ 28, 18, 10, 10, 8, 4, 14, 18, 31, 3,
+ 10, 13, 20, 28, 38, 28, 8, 38, 24, 30,
+ 1, 6, 10, 10, 34, 51, 26, 44, 38, 58,
+ 70, 48, 50, 74, 72, 34, 56, 62, 34, 124,
+ 2, 44, 28, 36, 44, 38, 52, 54, 68, 16,
+ 24, 34, 36, 124, 19, 86, 94, 84, 78, 76,
+ 70, 62, 62, 54, 38, 30, 24, 14, 6, 33,
+ 23, 21, 49, 18, 16, 10, 10, 9, 11, 11,
+ 0, 25, 23, 29, 49, 33, 77, 33, 26, 24,
+ 20, 8, 4, 7, 15, 19, 33, 17, 58, 34,
+ 24, 8, 14, 7, 17, 25, 39, 6, 66, 50,
+ 40, 36, 26, 8, 0, 7, 19, 9, 84, 60,
+ 34, 10, 22, 1, 19, 29, 14, 100, 84, 64,
+ 44, 44, 8, 5, 9, 17, 124, 67, 57, 37,
+ 53, 51, 35, 39, 31, 29, 35, 29, 25, 35,
+ 33, 41, 47, 23, 33, 27, 25, 17, 13, 13,
+ 13, 9, 9, 11, 13, 29, 8, 10, 16, 4,
+ 0, 10, 8, 8, 6, 4, 2, 0, 1, 9,
+ 17, 9, 8, 31, 8, 32, 8, 22, 12, 18,
+ 26, 12, 10, 8, 18, 5, 13, 21, 86, 90,
+ 88, 80, 74, 86, 80, 82, 80, 72, 82, 76,
+ 60, 48, 20, 80, 80, 74, 64, 56, 54, 34,
+ 28, 18, 16, 1, 9, 15, 33, 92, 92, 90,
+ 82, 66, 72, 64, 50, 46, 36, 28, 10, 2,
+ 11, 29, 1, 1, 33, 22, 20, 10, 3, 0,
+ 3, 15, 5, 19, 21, 45, 37, 53, 69, 5,
+ 27, 37, 29, 17, 0, 15, 14, 30, 9, 11,
+ 10, 18, 8, 16, 34, 8, 4, 100, 64, 30,
+ 2, 17, 55, 79, 107, 115, 1, 78, 58, 48,
+ 28, 38, 18, 8, 4, 17, 25, 15, 2, 11,
+ 16, 30, 3, 7, 8, 16, 12, 16, 32, 16,
+ 6, 100, 64, 30, 2, 17, 55, 79, 107, 115,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 23 */
+
+ 100, 14, 29, 100, 14, 29, 45, 13, 38, 22,
+ 4, 14, 30, 80, 114, 34, 25, 7, 2, 6,
+ 8, 2, 3, 9, 15, 28, 12, 39, 61, 85,
+ 2, 31, 13, 2, 6, 8, 27, 15, 22, 18,
+ 5, 21, 35, 4, 27, 37, 63, 12, 11, 25,
+ 8, 25, 23, 47, 1, 23, 15, 39, 1, 4,
+ 44, 0, 0, 0, 13, 55, 67, 4, 8, 5,
+ 60, 4, 37, 13, 3, 15, 40, 32, 7, 0,
+ 33, 1, 31, 25, 23, 33, 35, 39, 45, 48,
+ 3, 2, 25, 13, 45, 29, 37, 8, 5, 3,
+ 17, 30, 9, 0, 23, 41, 17, 25, 15, 1,
+ 11, 25, 13, 0, 39, 21, 98, 74, 16, 6,
+ 6, 13, 3, 6, 4, 20, 8, 5, 27, 13,
+ 13, 5, 3, 0, 18, 20, 10, 10, 14, 16,
+ 30, 20, 12, 8, 8, 4, 14, 18, 33, 3,
+ 10, 15, 20, 26, 38, 26, 8, 38, 24, 30,
+ 3, 6, 8, 8, 34, 51, 26, 42, 36, 54,
+ 64, 46, 46, 70, 68, 30, 50, 58, 22, 124,
+ 3, 40, 24, 30, 38, 32, 46, 44, 60, 10,
+ 20, 26, 32, 124, 21, 82, 90, 80, 74, 72,
+ 66, 58, 58, 50, 32, 26, 20, 10, 2, 37,
+ 23, 21, 49, 16, 14, 8, 6, 11, 13, 13,
+ 1, 27, 25, 31, 49, 35, 75, 29, 28, 26,
+ 20, 10, 4, 7, 15, 17, 29, 13, 58, 36,
+ 26, 8, 16, 5, 15, 23, 35, 6, 68, 52,
+ 42, 36, 28, 10, 0, 5, 19, 9, 84, 60,
+ 34, 8, 22, 1, 19, 27, 14, 98, 82, 62,
+ 42, 44, 8, 3, 9, 17, 124, 63, 53, 35,
+ 49, 49, 33, 37, 29, 27, 31, 27, 21, 33,
+ 31, 39, 45, 19, 33, 27, 25, 17, 11, 13,
+ 15, 9, 9, 11, 13, 31, 10, 10, 18, 4,
+ 0, 10, 8, 8, 6, 4, 2, 0, 1, 9,
+ 17, 9, 8, 33, 6, 32, 6, 22, 10, 16,
+ 26, 10, 8, 8, 18, 7, 13, 23, 84, 88,
+ 86, 76, 70, 82, 76, 76, 74, 66, 76, 70,
+ 54, 44, 16, 70, 72, 66, 54, 50, 48, 30,
+ 24, 14, 12, 5, 13, 17, 33, 86, 86, 86,
+ 78, 60, 66, 60, 44, 42, 32, 24, 4, 1,
+ 15, 31, 3, 1, 37, 20, 16, 8, 7, 3,
+ 7, 17, 7, 21, 23, 47, 41, 55, 69, 7,
+ 31, 41, 27, 17, 2, 13, 16, 32, 9, 9,
+ 10, 18, 8, 18, 36, 8, 4, 98, 60, 24,
+ 3, 25, 63, 87, 115, 121, 0, 78, 58, 48,
+ 28, 38, 18, 8, 4, 15, 25, 15, 4, 11,
+ 18, 32, 3, 7, 10, 18, 12, 16, 32, 16,
+ 6, 98, 60, 24, 3, 25, 63, 87, 115, 121,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 24 */
+
+ 96, 12, 29, 96, 12, 29, 43, 11, 38, 22,
+ 4, 10, 24, 78, 114, 32, 21, 9, 2, 8,
+ 8, 0, 3, 11, 19, 26, 6, 47, 67, 89,
+ 8, 29, 13, 2, 8, 8, 29, 13, 22, 16,
+ 5, 21, 35, 2, 29, 37, 63, 12, 11, 25,
+ 8, 25, 23, 47, 1, 23, 15, 39, 1, 4,
+ 44, 0, 0, 0, 11, 57, 67, 4, 6, 5,
+ 58, 4, 39, 9, 1, 13, 42, 36, 5, 2,
+ 29, 0, 29, 23, 19, 33, 35, 39, 45, 48,
+ 3, 4, 21, 13, 45, 27, 33, 8, 5, 3,
+ 19, 30, 9, 0, 21, 39, 15, 23, 15, 1,
+ 11, 23, 11, 0, 37, 19, 106, 82, 22, 10,
+ 6, 13, 3, 6, 2, 20, 8, 5, 27, 9,
+ 13, 9, 7, 4, 18, 20, 10, 10, 14, 16,
+ 34, 20, 12, 8, 6, 4, 16, 20, 37, 3,
+ 10, 15, 18, 22, 36, 26, 6, 38, 24, 28,
+ 5, 6, 6, 6, 32, 51, 24, 40, 34, 50,
+ 56, 42, 42, 66, 62, 24, 46, 54, 8, 124,
+ 11, 36, 20, 26, 32, 28, 38, 36, 52, 4,
+ 16, 20, 28, 124, 25, 78, 84, 74, 68, 66,
+ 60, 54, 52, 46, 28, 22, 16, 6, 1, 41,
+ 23, 23, 51, 14, 10, 6, 4, 13, 17, 17,
+ 5, 29, 25, 33, 51, 37, 75, 27, 30, 26,
+ 20, 10, 6, 5, 13, 17, 25, 11, 58, 36,
+ 26, 10, 18, 3, 11, 19, 33, 8, 68, 52,
+ 42, 38, 30, 10, 0, 5, 17, 11, 86, 60,
+ 32, 8, 22, 1, 19, 27, 14, 98, 80, 60,
+ 40, 44, 8, 3, 9, 17, 124, 61, 51, 33,
+ 47, 45, 31, 33, 25, 25, 29, 23, 17, 33,
+ 31, 39, 43, 17, 33, 27, 25, 15, 11, 13,
+ 15, 9, 11, 11, 13, 31, 10, 10, 20, 4,
+ 1, 10, 8, 6, 6, 4, 2, 1, 1, 9,
+ 17, 9, 10, 35, 4, 32, 2, 20, 8, 14,
+ 24, 8, 6, 8, 18, 9, 13, 27, 82, 84,
+ 84, 72, 66, 78, 72, 72, 70, 60, 70, 64,
+ 48, 38, 12, 62, 64, 56, 44, 42, 40, 24,
+ 18, 8, 8, 9, 15, 19, 35, 82, 82, 80,
+ 72, 54, 60, 54, 38, 36, 26, 18, 1, 5,
+ 19, 35, 5, 3, 41, 16, 14, 4, 11, 7,
+ 11, 21, 11, 25, 27, 49, 43, 57, 71, 11,
+ 33, 43, 27, 17, 2, 13, 18, 34, 9, 9,
+ 12, 20, 8, 18, 36, 8, 2, 96, 56, 18,
+ 9, 31, 71, 97, 125, 125, 0, 78, 58, 48,
+ 28, 40, 18, 8, 4, 15, 25, 15, 4, 11,
+ 18, 32, 3, 7, 10, 18, 12, 16, 34, 16,
+ 4, 96, 56, 18, 9, 31, 71, 97, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 25 */
+
+ 94, 12, 29, 94, 12, 29, 39, 9, 40, 22,
+ 4, 8, 20, 76, 116, 32, 17, 9, 4, 10,
+ 8, 1, 3, 11, 21, 24, 2, 55, 73, 91,
+ 14, 27, 11, 4, 10, 8, 29, 11, 22, 16,
+ 3, 21, 33, 2, 29, 37, 63, 12, 11, 23,
+ 8, 25, 23, 47, 1, 23, 15, 37, 1, 4,
+ 44, 0, 0, 0, 9, 57, 67, 4, 4, 5,
+ 56, 4, 39, 5, 2, 11, 46, 40, 1, 4,
+ 25, 4, 27, 19, 13, 33, 35, 39, 43, 48,
+ 3, 6, 17, 11, 43, 25, 29, 8, 5, 3,
+ 19, 30, 9, 0, 19, 37, 13, 21, 13, 0,
+ 9, 21, 7, 2, 33, 17, 116, 92, 30, 16,
+ 8, 13, 1, 8, 2, 20, 8, 3, 25, 3,
+ 13, 13, 9, 10, 18, 20, 10, 10, 14, 18,
+ 38, 22, 12, 8, 6, 4, 18, 22, 39, 1,
+ 10, 15, 16, 20, 34, 26, 4, 38, 24, 28,
+ 7, 8, 4, 6, 30, 51, 24, 40, 34, 46,
+ 50, 40, 40, 62, 58, 20, 42, 50, 3, 124,
+ 17, 32, 16, 22, 26, 24, 32, 28, 44, 0,
+ 12, 14, 24, 124, 29, 74, 80, 70, 64, 62,
+ 56, 50, 48, 42, 24, 18, 14, 4, 3, 45,
+ 23, 23, 51, 14, 8, 4, 2, 15, 19, 19,
+ 9, 31, 25, 35, 53, 37, 73, 25, 32, 26,
+ 20, 10, 8, 3, 11, 15, 19, 9, 58, 36,
+ 26, 12, 20, 1, 7, 15, 29, 10, 70, 54,
+ 44, 40, 32, 10, 2, 5, 15, 11, 88, 60,
+ 30, 8, 24, 0, 19, 25, 14, 98, 80, 58,
+ 38, 44, 8, 3, 9, 15, 124, 59, 49, 29,
+ 43, 41, 29, 29, 21, 23, 25, 19, 13, 31,
+ 29, 37, 41, 13, 31, 25, 23, 13, 9, 13,
+ 15, 9, 13, 11, 13, 31, 10, 12, 22, 6,
+ 1, 12, 8, 6, 6, 4, 2, 3, 1, 7,
+ 15, 9, 12, 35, 2, 32, 0, 18, 8, 14,
+ 24, 8, 4, 8, 18, 11, 13, 29, 80, 82,
+ 84, 70, 62, 74, 68, 68, 66, 56, 64, 58,
+ 42, 32, 8, 54, 56, 48, 34, 36, 34, 18,
+ 14, 4, 6, 11, 17, 19, 37, 78, 78, 76,
+ 68, 50, 56, 50, 32, 32, 22, 14, 5, 9,
+ 23, 39, 5, 5, 43, 14, 12, 0, 15, 9,
+ 13, 25, 13, 29, 29, 49, 45, 57, 73, 15,
+ 35, 45, 27, 15, 4, 13, 20, 38, 7, 7,
+ 14, 22, 8, 20, 38, 8, 2, 94, 52, 14,
+ 15, 37, 79, 105, 125, 125, 0, 78, 58, 50,
+ 30, 42, 20, 10, 6, 15, 25, 15, 4, 9,
+ 20, 34, 3, 5, 10, 18, 12, 18, 36, 16,
+ 4, 94, 52, 14, 15, 37, 79, 105, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 26 */
+
+ 92, 12, 29, 92, 12, 29, 35, 5, 40, 22,
+ 2, 6, 16, 74, 116, 32, 13, 9, 4, 12,
+ 6, 1, 3, 13, 25, 22, 3, 61, 77, 95,
+ 20, 25, 11, 4, 12, 6, 29, 9, 22, 14,
+ 3, 19, 31, 2, 29, 37, 63, 12, 11, 23,
+ 8, 25, 21, 45, 1, 23, 15, 37, 0, 4,
+ 44, 0, 0, 0, 9, 57, 67, 6, 2, 5,
+ 54, 2, 39, 0, 4, 9, 50, 44, 0, 8,
+ 23, 6, 23, 17, 9, 33, 33, 37, 43, 48,
+ 3, 8, 13, 11, 41, 25, 25, 8, 7, 5,
+ 21, 30, 9, 2, 19, 35, 11, 21, 11, 0,
+ 7, 17, 5, 4, 31, 15, 124, 100, 36, 22,
+ 8, 11, 1, 8, 2, 20, 10, 3, 25, 0,
+ 13, 17, 13, 16, 18, 22, 10, 10, 14, 18,
+ 40, 24, 14, 6, 6, 4, 18, 22, 41, 1,
+ 10, 17, 16, 18, 34, 24, 4, 38, 24, 28,
+ 9, 8, 2, 4, 30, 51, 24, 38, 32, 42,
+ 44, 38, 36, 58, 54, 16, 36, 46, 15, 124,
+ 25, 28, 12, 16, 20, 18, 26, 18, 36, 5,
+ 8, 6, 20, 124, 31, 70, 76, 66, 60, 58,
+ 52, 46, 44, 38, 18, 14, 10, 0, 7, 49,
+ 23, 23, 53, 12, 6, 2, 1, 17, 21, 21,
+ 11, 33, 27, 37, 53, 39, 71, 21, 34, 28,
+ 20, 12, 8, 3, 11, 13, 15, 5, 58, 38,
+ 28, 12, 22, 0, 5, 13, 25, 10, 72, 56,
+ 46, 40, 34, 12, 2, 3, 15, 11, 88, 60,
+ 30, 6, 24, 0, 19, 25, 14, 96, 78, 56,
+ 36, 44, 8, 1, 9, 15, 124, 55, 45, 27,
+ 39, 39, 27, 27, 19, 21, 21, 17, 9, 29,
+ 27, 37, 39, 9, 31, 25, 23, 13, 9, 13,
+ 17, 9, 13, 11, 13, 33, 12, 12, 24, 6,
+ 1, 12, 8, 6, 6, 4, 2, 3, 1, 7,
+ 15, 9, 12, 37, 0, 32, 1, 18, 6, 12,
+ 24, 6, 2, 8, 18, 13, 13, 31, 78, 80,
+ 82, 66, 58, 70, 64, 62, 60, 50, 58, 52,
+ 36, 28, 4, 44, 48, 40, 24, 30, 28, 14,
+ 10, 1, 2, 15, 21, 21, 37, 72, 72, 70,
+ 64, 44, 50, 44, 26, 28, 16, 10, 11, 13,
+ 27, 41, 7, 5, 47, 12, 8, 1, 19, 13,
+ 17, 27, 15, 31, 31, 51, 49, 59, 73, 17,
+ 39, 49, 25, 15, 6, 11, 22, 40, 7, 5,
+ 14, 22, 8, 20, 40, 8, 2, 92, 48, 8,
+ 21, 45, 87, 113, 125, 125, 2, 78, 58, 50,
+ 30, 42, 20, 10, 6, 13, 25, 15, 6, 9,
+ 22, 36, 3, 5, 12, 20, 12, 18, 36, 16,
+ 4, 92, 48, 8, 21, 45, 87, 113, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 27 */
+
+ 90, 12, 31, 90, 12, 31, 31, 3, 42, 22,
+ 2, 2, 12, 72, 116, 32, 11, 9, 6, 12,
+ 6, 3, 1, 13, 27, 18, 7, 69, 83, 97,
+ 24, 23, 11, 6, 12, 6, 31, 9, 22, 14,
+ 3, 19, 31, 2, 29, 35, 61, 12, 9, 21,
+ 8, 23, 21, 45, 1, 23, 15, 37, 0, 4,
+ 44, 0, 0, 0, 7, 57, 67, 6, 2, 7,
+ 54, 2, 39, 4, 6, 7, 52, 50, 4, 10,
+ 19, 10, 21, 15, 5, 33, 33, 37, 43, 48,
+ 3, 10, 7, 11, 39, 23, 21, 6, 7, 5,
+ 21, 32, 9, 2, 17, 35, 11, 19, 9, 2,
+ 5, 15, 1, 4, 29, 13, 124, 110, 42, 28,
+ 8, 11, 1, 10, 2, 20, 10, 1, 23, 4,
+ 11, 21, 17, 22, 18, 22, 10, 10, 16, 20,
+ 44, 26, 14, 6, 6, 4, 20, 24, 43, 1,
+ 10, 17, 14, 16, 32, 24, 2, 38, 24, 28,
+ 13, 8, 0, 2, 28, 51, 24, 38, 32, 38,
+ 38, 36, 32, 52, 48, 12, 32, 42, 29, 124,
+ 31, 24, 10, 12, 12, 14, 20, 10, 26, 11,
+ 4, 0, 16, 124, 35, 66, 72, 62, 54, 52,
+ 48, 40, 40, 34, 14, 8, 6, 3, 9, 51,
+ 23, 23, 53, 10, 4, 0, 3, 19, 23, 23,
+ 15, 35, 27, 39, 55, 39, 71, 19, 36, 28,
+ 20, 12, 10, 1, 9, 13, 11, 3, 60, 38,
+ 28, 14, 24, 2, 1, 9, 21, 12, 72, 56,
+ 46, 42, 36, 12, 2, 3, 13, 11, 90, 60,
+ 28, 6, 24, 0, 19, 23, 12, 96, 78, 54,
+ 34, 42, 8, 1, 9, 15, 124, 53, 43, 25,
+ 37, 35, 25, 23, 15, 19, 19, 13, 5, 29,
+ 25, 35, 37, 5, 29, 25, 23, 11, 7, 13,
+ 17, 9, 15, 11, 13, 33, 12, 14, 26, 6,
+ 1, 14, 10, 6, 8, 6, 2, 5, 1, 7,
+ 15, 9, 14, 39, 1, 32, 3, 16, 4, 12,
+ 24, 4, 2, 8, 16, 17, 15, 33, 74, 78,
+ 80, 64, 54, 66, 60, 58, 56, 46, 52, 48,
+ 30, 22, 0, 36, 40, 32, 16, 22, 22, 8,
+ 6, 5, 1, 19, 23, 23, 39, 68, 68, 66,
+ 58, 38, 46, 40, 20, 22, 12, 4, 17, 17,
+ 29, 45, 9, 7, 49, 8, 6, 5, 23, 15,
+ 21, 31, 17, 35, 33, 53, 51, 61, 75, 21,
+ 41, 51, 25, 13, 6, 11, 24, 42, 7, 5,
+ 16, 24, 8, 22, 42, 8, 2, 90, 44, 4,
+ 27, 51, 95, 121, 125, 125, 2, 78, 58, 50,
+ 30, 44, 22, 10, 6, 13, 23, 13, 6, 7,
+ 24, 38, 1, 3, 12, 20, 12, 18, 38, 16,
+ 4, 90, 44, 4, 27, 51, 95, 121, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 28 */
+
+ 86, 12, 31, 86, 12, 31, 29, 0, 42, 22,
+ 0, 0, 8, 70, 116, 30, 7, 9, 6, 14,
+ 6, 3, 1, 15, 31, 16, 13, 75, 87, 101,
+ 30, 21, 11, 6, 14, 6, 31, 7, 22, 12,
+ 3, 19, 29, 2, 29, 35, 61, 12, 9, 21,
+ 8, 23, 19, 45, 1, 23, 15, 37, 0, 4,
+ 44, 0, 0, 0, 7, 59, 67, 8, 0, 7,
+ 52, 0, 39, 10, 8, 5, 56, 54, 6, 14,
+ 15, 12, 19, 13, 1, 33, 33, 35, 43, 48,
+ 3, 12, 3, 11, 39, 23, 17, 6, 9, 7,
+ 23, 32, 9, 4, 17, 33, 9, 19, 7, 2,
+ 5, 11, 0, 6, 27, 11, 124, 118, 48, 34,
+ 8, 11, 1, 10, 2, 20, 12, 1, 23, 8,
+ 11, 25, 21, 26, 18, 22, 10, 10, 16, 20,
+ 46, 28, 14, 6, 6, 4, 20, 26, 45, 1,
+ 10, 19, 14, 14, 32, 24, 2, 38, 24, 28,
+ 15, 8, 1, 0, 28, 51, 22, 36, 30, 34,
+ 30, 34, 28, 48, 44, 8, 26, 38, 41, 124,
+ 39, 20, 6, 8, 6, 8, 12, 2, 18, 17,
+ 0, 7, 12, 124, 39, 62, 68, 56, 50, 48,
+ 44, 36, 36, 30, 10, 4, 2, 7, 13, 55,
+ 23, 23, 55, 8, 2, 1, 5, 21, 27, 25,
+ 19, 37, 29, 41, 57, 41, 69, 15, 38, 30,
+ 20, 12, 12, 1, 9, 11, 7, 1, 60, 38,
+ 30, 16, 26, 4, 0, 7, 19, 14, 74, 58,
+ 48, 42, 38, 14, 2, 3, 13, 11, 90, 60,
+ 26, 6, 24, 0, 19, 23, 12, 94, 76, 52,
+ 32, 42, 8, 1, 9, 15, 124, 51, 39, 23,
+ 33, 33, 23, 19, 11, 17, 15, 9, 1, 27,
+ 23, 35, 35, 3, 29, 25, 23, 11, 7, 13,
+ 17, 9, 15, 11, 13, 35, 14, 14, 28, 6,
+ 1, 14, 10, 6, 8, 6, 2, 5, 1, 7,
+ 15, 9, 14, 41, 3, 32, 5, 14, 2, 10,
+ 24, 2, 0, 8, 16, 19, 15, 35, 72, 76,
+ 78, 60, 50, 62, 56, 54, 50, 40, 46, 42,
+ 24, 18, 3, 28, 32, 24, 6, 16, 14, 4,
+ 0, 11, 5, 23, 27, 25, 41, 64, 62, 60,
+ 54, 32, 40, 34, 14, 18, 6, 0, 23, 21,
+ 33, 47, 11, 9, 53, 6, 2, 9, 27, 19,
+ 25, 35, 19, 37, 35, 55, 53, 63, 77, 23,
+ 43, 53, 25, 13, 8, 9, 26, 44, 7, 3,
+ 16, 24, 8, 22, 44, 8, 2, 88, 40, 1,
+ 33, 57, 103, 125, 125, 125, 2, 78, 58, 50,
+ 30, 44, 22, 10, 6, 11, 23, 13, 8, 7,
+ 24, 40, 1, 3, 12, 20, 12, 18, 38, 16,
+ 4, 88, 40, 1, 33, 57, 103, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 29 */
+
+ 84, 12, 31, 84, 12, 31, 25, 2, 42, 22,
+ 0, 1, 4, 68, 116, 30, 3, 9, 6, 16,
+ 4, 5, 1, 17, 35, 14, 17, 83, 93, 105,
+ 36, 19, 9, 6, 16, 4, 31, 5, 22, 12,
+ 3, 17, 27, 2, 29, 35, 61, 12, 9, 21,
+ 8, 23, 19, 43, 1, 23, 15, 37, 2, 4,
+ 44, 0, 0, 0, 5, 59, 67, 8, 1, 7,
+ 50, 0, 39, 14, 10, 3, 60, 58, 10, 16,
+ 13, 16, 15, 11, 4, 33, 31, 35, 41, 48,
+ 3, 14, 0, 11, 37, 21, 13, 6, 9, 7,
+ 23, 32, 9, 4, 15, 31, 7, 17, 5, 4,
+ 3, 9, 4, 8, 23, 9, 124, 124, 54, 40,
+ 10, 9, 0, 12, 2, 20, 12, 1, 21, 12,
+ 11, 29, 25, 32, 18, 24, 10, 10, 16, 22,
+ 50, 30, 16, 4, 6, 4, 22, 26, 47, 0,
+ 10, 19, 12, 12, 30, 22, 0, 38, 24, 28,
+ 17, 10, 3, 0, 26, 51, 22, 34, 28, 30,
+ 24, 32, 26, 44, 40, 4, 22, 34, 53, 124,
+ 45, 16, 2, 2, 0, 4, 6, 7, 10, 21,
+ 3, 13, 8, 124, 41, 58, 64, 52, 46, 44,
+ 40, 32, 32, 26, 4, 0, 1, 9, 17, 59,
+ 23, 23, 55, 6, 0, 3, 9, 23, 29, 27,
+ 21, 39, 29, 43, 57, 43, 67, 13, 40, 30,
+ 20, 14, 12, 0, 7, 9, 3, 2, 60, 40,
+ 30, 16, 28, 6, 4, 3, 15, 14, 76, 60,
+ 50, 44, 40, 14, 4, 1, 11, 11, 92, 60,
+ 26, 4, 24, 0, 19, 21, 12, 94, 74, 50,
+ 30, 42, 8, 0, 9, 13, 124, 47, 37, 19,
+ 29, 29, 21, 17, 9, 15, 11, 7, 2, 25,
+ 21, 33, 33, 0, 29, 25, 21, 9, 5, 13,
+ 19, 9, 17, 11, 13, 35, 14, 14, 30, 6,
+ 1, 14, 10, 6, 8, 6, 2, 7, 1, 7,
+ 15, 9, 16, 43, 5, 32, 7, 14, 2, 8,
+ 24, 2, 1, 8, 16, 21, 15, 37, 70, 74,
+ 76, 56, 46, 58, 52, 48, 46, 34, 40, 36,
+ 18, 12, 7, 18, 24, 16, 3, 10, 8, 1,
+ 3, 15, 9, 25, 29, 25, 41, 58, 58, 56,
+ 50, 26, 34, 30, 8, 14, 2, 3, 29, 25,
+ 37, 51, 11, 9, 57, 4, 0, 11, 31, 23,
+ 29, 37, 21, 41, 37, 55, 57, 65, 77, 27,
+ 47, 57, 23, 13, 10, 9, 28, 46, 5, 1,
+ 18, 26, 8, 24, 46, 8, 2, 86, 36, 7,
+ 39, 65, 111, 125, 125, 125, 4, 78, 58, 50,
+ 30, 46, 22, 12, 8, 11, 23, 13, 8, 7,
+ 26, 42, 1, 3, 14, 22, 12, 20, 40, 16,
+ 4, 86, 36, 7, 39, 65, 111, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 30 */
+
+ 82, 12, 31, 82, 12, 31, 21, 6, 44, 22,
+ 1, 5, 0, 66, 116, 30, 0, 9, 8, 18,
+ 4, 5, 1, 17, 37, 12, 23, 89, 97, 107,
+ 42, 17, 9, 8, 18, 4, 33, 3, 22, 10,
+ 3, 17, 27, 2, 29, 35, 61, 12, 9, 19,
+ 8, 23, 17, 43, 1, 23, 15, 37, 2, 4,
+ 44, 0, 0, 0, 5, 59, 67, 10, 3, 7,
+ 48, 1, 39, 20, 12, 1, 62, 62, 12, 20,
+ 9, 18, 13, 9, 8, 33, 31, 33, 41, 48,
+ 3, 16, 4, 11, 35, 21, 9, 6, 11, 9,
+ 25, 32, 9, 6, 15, 29, 5, 17, 3, 4,
+ 1, 5, 6, 8, 21, 7, 124, 124, 60, 46,
+ 10, 9, 0, 12, 2, 20, 14, 0, 21, 16,
+ 11, 33, 29, 38, 18, 24, 10, 10, 16, 22,
+ 52, 32, 16, 4, 6, 4, 22, 28, 49, 0,
+ 10, 21, 12, 10, 30, 22, 0, 38, 24, 28,
+ 19, 10, 5, 1, 26, 51, 22, 34, 28, 26,
+ 18, 30, 22, 40, 34, 0, 16, 30, 67, 124,
+ 53, 12, 1, 1, 5, 1, 0, 15, 2, 27,
+ 7, 21, 4, 124, 45, 54, 60, 48, 40, 38,
+ 36, 28, 28, 22, 0, 3, 5, 13, 19, 63,
+ 23, 23, 57, 4, 1, 5, 11, 25, 31, 29,
+ 25, 41, 31, 45, 59, 43, 67, 9, 42, 32,
+ 20, 14, 14, 0, 7, 9, 0, 4, 60, 40,
+ 32, 18, 30, 8, 6, 1, 11, 16, 76, 60,
+ 50, 44, 42, 16, 4, 1, 11, 11, 92, 60,
+ 24, 4, 24, 0, 19, 21, 12, 92, 74, 48,
+ 28, 42, 8, 0, 9, 13, 124, 45, 33, 17,
+ 27, 27, 19, 13, 5, 13, 9, 3, 6, 25,
+ 19, 33, 31, 4, 27, 25, 21, 9, 5, 13,
+ 19, 9, 17, 11, 13, 37, 16, 16, 32, 6,
+ 1, 16, 10, 6, 8, 6, 2, 7, 1, 7,
+ 15, 9, 16, 45, 7, 32, 9, 12, 0, 8,
+ 24, 0, 3, 8, 16, 23, 15, 39, 68, 72,
+ 74, 54, 42, 54, 48, 44, 40, 30, 34, 30,
+ 12, 8, 11, 10, 16, 8, 13, 2, 2, 5,
+ 7, 21, 13, 29, 33, 27, 43, 54, 52, 50,
+ 44, 20, 30, 24, 2, 8, 3, 9, 35, 29,
+ 41, 53, 13, 11, 59, 0, 3, 15, 35, 25,
+ 33, 41, 23, 43, 39, 57, 59, 67, 79, 29,
+ 49, 59, 23, 11, 10, 7, 30, 48, 5, 1,
+ 18, 26, 8, 24, 48, 8, 2, 84, 32, 11,
+ 45, 71, 119, 125, 125, 125, 4, 78, 58, 50,
+ 30, 46, 24, 12, 8, 9, 23, 13, 10, 5,
+ 28, 44, 1, 1, 14, 22, 12, 20, 40, 16,
+ 4, 84, 32, 11, 45, 71, 119, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 31 */
+
+ 80, 12, 31, 80, 12, 31, 17, 8, 44, 22,
+ 1, 7, 3, 64, 116, 30, 4, 9, 8, 20,
+ 4, 7, 1, 19, 41, 10, 27, 97, 103, 111,
+ 48, 15, 9, 8, 20, 4, 33, 1, 22, 10,
+ 3, 17, 25, 2, 29, 35, 61, 12, 9, 19,
+ 8, 23, 17, 43, 1, 23, 15, 37, 2, 4,
+ 44, 0, 0, 0, 3, 59, 67, 10, 5, 7,
+ 46, 1, 39, 24, 14, 0, 66, 66, 16, 22,
+ 5, 22, 11, 7, 12, 33, 31, 33, 41, 48,
+ 3, 18, 8, 11, 33, 19, 5, 6, 11, 9,
+ 25, 32, 9, 6, 13, 27, 3, 15, 1, 6,
+ 0, 3, 10, 10, 19, 5, 124, 124, 66, 52,
+ 10, 9, 0, 14, 2, 20, 14, 0, 19, 20,
+ 11, 37, 33, 44, 18, 24, 10, 10, 16, 24,
+ 56, 34, 16, 4, 6, 4, 24, 30, 51, 0,
+ 10, 21, 10, 8, 28, 22, 1, 38, 24, 28,
+ 21, 10, 7, 3, 24, 51, 22, 32, 26, 22,
+ 12, 28, 18, 36, 30, 3, 12, 26, 79, 124,
+ 59, 8, 5, 5, 11, 5, 5, 23, 5, 33,
+ 11, 27, 0, 124, 49, 50, 56, 44, 36, 34,
+ 32, 24, 24, 18, 3, 7, 9, 17, 23, 67,
+ 23, 23, 57, 2, 3, 7, 13, 27, 33, 31,
+ 29, 43, 31, 47, 61, 45, 65, 7, 44, 32,
+ 20, 14, 16, 2, 5, 7, 4, 6, 60, 40,
+ 32, 20, 32, 10, 10, 2, 7, 18, 78, 62,
+ 52, 46, 44, 16, 4, 1, 9, 11, 94, 60,
+ 22, 4, 24, 0, 19, 19, 12, 92, 72, 46,
+ 26, 42, 8, 0, 9, 13, 124, 43, 31, 15,
+ 23, 23, 17, 9, 1, 11, 5, 0, 10, 23,
+ 17, 31, 29, 8, 27, 25, 21, 7, 3, 13,
+ 19, 9, 19, 11, 13, 37, 16, 16, 34, 6,
+ 1, 16, 10, 6, 8, 6, 2, 9, 1, 7,
+ 15, 9, 18, 47, 9, 32, 11, 10, 1, 6,
+ 24, 1, 5, 8, 16, 25, 15, 41, 66, 70,
+ 72, 50, 38, 50, 44, 40, 36, 24, 28, 24,
+ 6, 2, 15, 2, 8, 0, 23, 3, 3, 11,
+ 11, 25, 17, 33, 35, 29, 45, 50, 48, 46,
+ 40, 14, 24, 20, 3, 4, 7, 13, 41, 33,
+ 45, 57, 15, 13, 63, 1, 5, 19, 39, 29,
+ 37, 45, 25, 47, 41, 59, 61, 69, 81, 33,
+ 51, 61, 23, 11, 12, 7, 32, 50, 5, 0,
+ 20, 28, 8, 26, 50, 8, 2, 82, 28, 17,
+ 51, 77, 125, 125, 125, 125, 4, 78, 58, 50,
+ 30, 48, 24, 12, 8, 9, 23, 13, 10, 5,
+ 30, 46, 1, 1, 14, 22, 12, 20, 42, 16,
+ 4, 82, 28, 17, 51, 77, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 32 */
+
+ 76, 10, 33, 76, 10, 33, 15, 10, 44, 22,
+ 3, 11, 9, 62, 116, 28, 6, 11, 8, 20,
+ 2, 9, 1, 21, 45, 6, 33, 105, 109, 115,
+ 52, 15, 9, 8, 20, 2, 35, 1, 22, 8,
+ 3, 17, 25, 0, 31, 35, 61, 10, 9, 19,
+ 8, 23, 17, 43, 1, 23, 17, 37, 2, 4,
+ 44, 0, 0, 0, 3, 61, 67, 10, 7, 9,
+ 44, 3, 41, 28, 16, 2, 68, 70, 18, 24,
+ 3, 24, 9, 5, 16, 33, 31, 33, 41, 48,
+ 3, 18, 12, 11, 33, 19, 3, 4, 13, 11,
+ 27, 32, 9, 6, 13, 27, 3, 15, 1, 6,
+ 0, 1, 12, 10, 17, 3, 124, 124, 72, 56,
+ 10, 9, 0, 14, 0, 20, 14, 0, 19, 24,
+ 11, 41, 37, 48, 18, 24, 10, 10, 16, 24,
+ 58, 34, 16, 2, 4, 4, 24, 30, 55, 0,
+ 10, 23, 8, 4, 26, 20, 3, 38, 24, 26,
+ 25, 10, 9, 5, 22, 51, 20, 30, 24, 16,
+ 4, 24, 14, 30, 24, 9, 6, 22, 93, 124,
+ 67, 2, 9, 11, 19, 11, 13, 33, 15, 39,
+ 15, 35, 3, 124, 53, 44, 50, 38, 30, 28,
+ 26, 18, 18, 14, 9, 13, 13, 21, 27, 71,
+ 23, 25, 59, 0, 7, 11, 17, 31, 37, 35,
+ 33, 45, 33, 49, 63, 47, 65, 5, 44, 32,
+ 20, 14, 16, 2, 5, 7, 8, 8, 60, 40,
+ 32, 20, 32, 12, 12, 4, 5, 18, 78, 62,
+ 52, 46, 46, 16, 4, 1, 9, 13, 94, 58,
+ 20, 2, 24, 0, 19, 19, 10, 90, 70, 42,
+ 24, 40, 8, 0, 9, 13, 124, 41, 29, 13,
+ 21, 21, 15, 7, 0, 9, 3, 2, 14, 23,
+ 17, 31, 27, 10, 27, 25, 21, 7, 3, 13,
+ 21, 11, 21, 11, 15, 39, 16, 16, 36, 6,
+ 3, 16, 10, 4, 8, 6, 2, 11, 1, 7,
+ 15, 9, 18, 49, 11, 32, 15, 8, 3, 4,
+ 22, 3, 7, 8, 14, 29, 17, 45, 62, 66,
+ 70, 46, 34, 44, 38, 34, 30, 18, 22, 18,
+ 1, 3, 19, 7, 0, 9, 33, 11, 11, 17,
+ 17, 31, 21, 37, 39, 31, 47, 44, 42, 40,
+ 34, 8, 18, 14, 11, 1, 13, 19, 47, 37,
+ 49, 61, 17, 15, 67, 5, 9, 23, 45, 33,
+ 41, 49, 29, 51, 45, 61, 65, 71, 83, 37,
+ 55, 65, 23, 11, 12, 7, 34, 52, 5, 0,
+ 20, 28, 8, 26, 50, 8, 0, 78, 24, 23,
+ 59, 85, 125, 125, 125, 125, 4, 78, 58, 50,
+ 30, 48, 24, 12, 8, 9, 23, 13, 10, 5,
+ 30, 46, 1, 1, 14, 22, 12, 20, 42, 14,
+ 2, 78, 24, 23, 59, 85, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 33 */
+
+ 74, 10, 33, 74, 10, 33, 11, 14, 46, 24,
+ 3, 13, 13, 60, 118, 28, 10, 11, 10, 22,
+ 2, 9, 0, 21, 47, 4, 37, 111, 113, 117,
+ 58, 13, 7, 10, 22, 2, 35, 0, 22, 8,
+ 1, 15, 23, 0, 31, 33, 59, 10, 7, 17,
+ 8, 21, 15, 41, 0, 21, 17, 35, 4, 4,
+ 44, 0, 0, 0, 1, 61, 67, 12, 7, 9,
+ 44, 3, 41, 34, 20, 6, 72, 76, 22, 28,
+ 0, 28, 5, 1, 22, 33, 29, 31, 39, 48,
+ 3, 20, 18, 9, 31, 17, 0, 4, 13, 11,
+ 27, 34, 9, 8, 11, 25, 1, 13, 0, 8,
+ 2, 2, 16, 12, 13, 0, 124, 124, 80, 62,
+ 12, 7, 2, 16, 0, 22, 16, 2, 17, 30,
+ 9, 45, 39, 54, 18, 26, 10, 12, 18, 26,
+ 62, 36, 18, 2, 4, 4, 26, 32, 57, 2,
+ 10, 23, 8, 2, 26, 20, 3, 38, 24, 26,
+ 27, 12, 9, 5, 22, 51, 20, 30, 24, 12,
+ 1, 22, 12, 26, 20, 13, 2, 18, 105, 124,
+ 73, 1, 11, 15, 25, 15, 19, 41, 23, 43,
+ 19, 41, 7, 124, 55, 40, 46, 34, 26, 24,
+ 22, 14, 14, 10, 13, 17, 15, 23, 29, 73,
+ 23, 25, 59, 0, 9, 13, 19, 33, 39, 37,
+ 35, 47, 33, 51, 63, 47, 63, 1, 46, 34,
+ 20, 16, 18, 4, 3, 5, 14, 12, 62, 42,
+ 34, 22, 34, 16, 16, 8, 1, 20, 80, 64,
+ 54, 48, 48, 18, 6, 0, 7, 13, 96, 58,
+ 20, 2, 26, 2, 19, 17, 10, 90, 70, 40,
+ 22, 40, 10, 2, 7, 11, 124, 37, 25, 9,
+ 17, 17, 11, 3, 4, 5, 0, 6, 20, 21,
+ 15, 29, 23, 14, 25, 23, 19, 5, 1, 11,
+ 21, 11, 21, 11, 15, 39, 18, 18, 38, 8,
+ 3, 18, 12, 4, 10, 8, 2, 11, 0, 5,
+ 13, 7, 20, 49, 13, 32, 17, 8, 3, 4,
+ 22, 3, 7, 8, 14, 31, 17, 47, 60, 64,
+ 70, 44, 32, 40, 34, 30, 26, 14, 18, 14,
+ 7, 7, 23, 15, 5, 17, 41, 17, 17, 21,
+ 21, 35, 23, 39, 41, 31, 47, 40, 38, 36,
+ 30, 4, 14, 10, 17, 5, 17, 23, 51, 39,
+ 51, 63, 17, 15, 69, 7, 11, 25, 49, 35,
+ 43, 51, 31, 53, 47, 61, 67, 71, 83, 39,
+ 57, 67, 21, 9, 14, 5, 38, 56, 3, 2,
+ 22, 30, 10, 28, 52, 8, 0, 76, 20, 27,
+ 65, 91, 125, 125, 125, 125, 6, 78, 60, 52,
+ 32, 50, 26, 14, 10, 7, 21, 11, 12, 3,
+ 32, 48, 0, 0, 16, 24, 12, 22, 44, 14,
+ 2, 76, 20, 27, 65, 91, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 34 */
+
+ 72, 10, 33, 72, 10, 33, 7, 16, 46, 24,
+ 3, 15, 17, 58, 118, 28, 14, 11, 10, 24,
+ 2, 11, 0, 23, 51, 2, 43, 119, 119, 121,
+ 64, 11, 7, 10, 24, 2, 35, 2, 22, 6,
+ 1, 15, 21, 0, 31, 33, 59, 10, 7, 17,
+ 8, 21, 15, 41, 0, 21, 17, 35, 4, 4,
+ 44, 0, 0, 0, 0, 61, 67, 12, 9, 9,
+ 42, 3, 41, 38, 22, 8, 76, 80, 24, 30,
+ 4, 30, 3, 0, 26, 33, 29, 31, 39, 48,
+ 3, 22, 22, 9, 29, 15, 4, 4, 13, 11,
+ 29, 34, 9, 8, 9, 23, 0, 11, 2, 8,
+ 4, 4, 18, 14, 11, 2, 124, 124, 86, 68,
+ 12, 7, 2, 16, 0, 22, 16, 2, 17, 34,
+ 9, 49, 43, 60, 18, 26, 10, 12, 18, 26,
+ 66, 38, 18, 2, 4, 4, 28, 34, 59, 2,
+ 10, 23, 6, 0, 24, 20, 5, 38, 24, 26,
+ 29, 12, 11, 7, 20, 51, 20, 28, 22, 8,
+ 7, 20, 8, 22, 16, 17, 1, 14, 117, 124,
+ 81, 5, 15, 19, 31, 19, 25, 49, 31, 49,
+ 23, 47, 11, 124, 59, 36, 42, 30, 22, 20,
+ 18, 10, 10, 6, 17, 21, 19, 27, 33, 77,
+ 23, 25, 61, 1, 11, 15, 21, 35, 41, 39,
+ 39, 49, 33, 53, 65, 49, 61, 0, 48, 34,
+ 20, 16, 20, 6, 1, 3, 18, 14, 62, 42,
+ 34, 24, 36, 18, 20, 12, 2, 22, 82, 66,
+ 56, 50, 50, 18, 6, 0, 5, 13, 98, 58,
+ 18, 2, 26, 2, 19, 17, 10, 90, 68, 38,
+ 20, 40, 10, 2, 7, 11, 124, 35, 23, 7,
+ 13, 13, 9, 0, 8, 3, 4, 10, 24, 19,
+ 13, 29, 21, 18, 25, 23, 19, 3, 1, 11,
+ 21, 11, 23, 11, 15, 39, 18, 18, 40, 8,
+ 3, 18, 12, 4, 10, 8, 2, 13, 0, 5,
+ 13, 7, 22, 51, 15, 32, 19, 6, 5, 2,
+ 22, 5, 9, 8, 14, 33, 17, 49, 58, 62,
+ 68, 40, 28, 36, 30, 26, 22, 8, 12, 8,
+ 13, 13, 27, 23, 13, 25, 51, 23, 23, 27,
+ 25, 41, 27, 43, 43, 33, 49, 36, 34, 30,
+ 26, 1, 8, 4, 23, 9, 23, 27, 57, 43,
+ 55, 67, 19, 17, 73, 9, 13, 29, 53, 39,
+ 47, 55, 33, 57, 49, 63, 69, 73, 85, 43,
+ 59, 69, 21, 9, 16, 5, 40, 58, 3, 4,
+ 24, 32, 10, 28, 54, 8, 0, 74, 16, 33,
+ 71, 97, 125, 125, 125, 125, 6, 78, 60, 52,
+ 32, 52, 26, 14, 10, 7, 21, 11, 12, 3,
+ 34, 50, 0, 0, 16, 24, 12, 22, 46, 14,
+ 2, 74, 16, 33, 71, 97, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 35 */
+
+ 70, 10, 33, 70, 10, 33, 3, 20, 48, 24,
+ 5, 19, 21, 56, 118, 28, 18, 11, 12, 26,
+ 2, 11, 0, 23, 53, 0, 47, 125, 123, 123,
+ 70, 9, 7, 12, 26, 2, 37, 4, 22, 6,
+ 1, 15, 21, 0, 31, 33, 59, 10, 7, 15,
+ 8, 21, 13, 41, 0, 21, 17, 35, 4, 4,
+ 44, 0, 0, 0, 0, 61, 67, 14, 11, 9,
+ 40, 5, 41, 44, 24, 10, 78, 84, 28, 34,
+ 8, 34, 1, 2, 30, 33, 29, 29, 39, 48,
+ 3, 24, 26, 9, 27, 15, 8, 4, 15, 13,
+ 29, 34, 9, 10, 9, 21, 2, 11, 4, 10,
+ 6, 8, 22, 14, 9, 4, 124, 124, 92, 74,
+ 12, 7, 2, 18, 0, 22, 18, 4, 15, 38,
+ 9, 53, 47, 66, 18, 26, 10, 12, 18, 28,
+ 68, 40, 18, 2, 4, 4, 28, 36, 61, 2,
+ 10, 25, 6, 1, 24, 20, 5, 38, 24, 26,
+ 31, 12, 13, 9, 20, 51, 20, 28, 22, 4,
+ 13, 18, 4, 18, 10, 21, 7, 10, 125, 124,
+ 87, 9, 19, 23, 37, 25, 31, 57, 39, 55,
+ 27, 55, 15, 124, 63, 32, 38, 26, 16, 14,
+ 14, 6, 6, 2, 21, 25, 23, 31, 35, 81,
+ 23, 25, 61, 3, 13, 17, 23, 37, 43, 41,
+ 43, 51, 35, 55, 67, 49, 61, 4, 50, 36,
+ 20, 16, 22, 6, 1, 3, 22, 16, 62, 42,
+ 36, 26, 38, 20, 22, 14, 6, 24, 82, 66,
+ 56, 50, 52, 20, 6, 0, 5, 13, 98, 58,
+ 16, 2, 26, 2, 19, 15, 10, 88, 68, 36,
+ 18, 40, 10, 2, 7, 11, 124, 33, 19, 5,
+ 11, 11, 7, 4, 12, 1, 6, 14, 28, 19,
+ 11, 27, 19, 22, 23, 23, 19, 3, 0, 11,
+ 21, 11, 23, 11, 15, 41, 20, 20, 42, 8,
+ 3, 20, 12, 4, 10, 8, 2, 13, 0, 5,
+ 13, 7, 22, 53, 17, 32, 21, 4, 7, 2,
+ 22, 7, 11, 8, 14, 35, 17, 51, 56, 60,
+ 66, 38, 24, 32, 26, 22, 16, 4, 6, 2,
+ 19, 17, 31, 31, 21, 33, 61, 31, 29, 31,
+ 29, 45, 31, 47, 47, 35, 51, 32, 28, 26,
+ 20, 7, 4, 0, 29, 15, 27, 33, 63, 47,
+ 59, 69, 21, 19, 75, 13, 17, 33, 57, 41,
+ 51, 59, 35, 59, 51, 65, 71, 75, 87, 45,
+ 61, 71, 21, 7, 16, 3, 42, 60, 3, 4,
+ 24, 32, 10, 30, 56, 8, 0, 72, 12, 37,
+ 77, 103, 125, 125, 125, 125, 6, 78, 60, 52,
+ 32, 52, 28, 14, 10, 5, 21, 11, 14, 1,
+ 36, 52, 0, 2, 16, 24, 12, 22, 46, 14,
+ 2, 72, 12, 37, 77, 103, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 36 */
+
+ 66, 10, 33, 66, 10, 33, 1, 22, 48, 24,
+ 5, 21, 25, 54, 118, 26, 22, 11, 12, 28,
+ 0, 13, 0, 25, 57, 1, 53, 125, 125, 125,
+ 76, 7, 7, 12, 28, 0, 37, 6, 22, 4,
+ 1, 13, 19, 0, 31, 33, 59, 10, 7, 15,
+ 8, 21, 13, 39, 0, 21, 17, 35, 6, 4,
+ 44, 0, 0, 0, 2, 63, 67, 14, 13, 9,
+ 38, 5, 41, 48, 26, 12, 82, 88, 30, 36,
+ 10, 36, 2, 4, 34, 33, 27, 29, 39, 48,
+ 3, 26, 30, 9, 27, 13, 12, 4, 15, 13,
+ 31, 34, 9, 10, 7, 19, 4, 9, 6, 10,
+ 6, 10, 24, 16, 7, 6, 124, 124, 98, 80,
+ 12, 5, 2, 18, 0, 22, 18, 4, 15, 42,
+ 9, 57, 51, 70, 18, 28, 10, 12, 18, 28,
+ 72, 42, 20, 0, 4, 4, 30, 36, 63, 2,
+ 10, 25, 4, 3, 22, 18, 7, 38, 24, 26,
+ 33, 12, 15, 11, 18, 51, 18, 26, 20, 0,
+ 21, 16, 0, 14, 6, 25, 11, 6, 125, 124,
+ 95, 13, 23, 29, 43, 29, 39, 67, 47, 61,
+ 31, 61, 19, 124, 65, 28, 34, 20, 12, 10,
+ 10, 2, 2, 1, 27, 29, 27, 35, 39, 85,
+ 23, 25, 63, 5, 15, 19, 27, 39, 47, 43,
+ 45, 53, 35, 57, 67, 51, 59, 6, 52, 36,
+ 20, 18, 22, 8, 0, 1, 26, 20, 62, 44,
+ 36, 26, 40, 22, 26, 18, 8, 24, 84, 68,
+ 58, 52, 54, 20, 6, 2, 3, 13, 100, 58,
+ 16, 0, 26, 2, 19, 15, 10, 88, 66, 34,
+ 16, 40, 10, 4, 7, 11, 124, 29, 17, 3,
+ 7, 7, 5, 6, 14, 0, 10, 16, 32, 17,
+ 9, 27, 17, 24, 23, 23, 19, 1, 0, 11,
+ 23, 11, 25, 11, 15, 41, 20, 20, 44, 8,
+ 3, 20, 12, 4, 10, 8, 2, 15, 0, 5,
+ 13, 7, 24, 55, 19, 32, 23, 4, 9, 0,
+ 22, 9, 13, 8, 14, 37, 17, 53, 54, 58,
+ 64, 34, 20, 28, 22, 16, 12, 1, 0, 3,
+ 25, 23, 35, 41, 29, 41, 71, 37, 37, 37,
+ 35, 51, 35, 51, 49, 37, 51, 26, 24, 20,
+ 16, 13, 1, 5, 35, 19, 33, 37, 69, 51,
+ 63, 73, 23, 19, 79, 15, 19, 35, 61, 45,
+ 55, 61, 37, 63, 53, 67, 75, 77, 87, 49,
+ 65, 75, 19, 7, 18, 3, 44, 62, 3, 6,
+ 26, 34, 10, 30, 58, 8, 0, 70, 8, 43,
+ 83, 111, 125, 125, 125, 125, 8, 78, 60, 52,
+ 32, 54, 28, 14, 10, 5, 21, 11, 14, 1,
+ 36, 54, 0, 2, 18, 26, 12, 22, 48, 14,
+ 2, 70, 8, 43, 83, 111, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 37 */
+
+ 64, 10, 33, 64, 10, 33, 2, 26, 48, 24,
+ 7, 23, 29, 52, 118, 26, 26, 11, 12, 30,
+ 0, 13, 0, 27, 61, 3, 57, 125, 125, 125,
+ 82, 5, 5, 12, 30, 0, 37, 8, 22, 4,
+ 1, 13, 17, 0, 31, 33, 59, 10, 7, 15,
+ 8, 21, 11, 39, 0, 21, 17, 35, 6, 4,
+ 44, 0, 0, 0, 2, 63, 67, 16, 15, 9,
+ 36, 7, 41, 54, 28, 14, 86, 92, 34, 40,
+ 14, 40, 4, 6, 40, 33, 27, 27, 37, 48,
+ 3, 28, 34, 9, 25, 13, 16, 4, 17, 15,
+ 31, 34, 9, 12, 7, 17, 6, 9, 8, 12,
+ 8, 14, 28, 18, 3, 8, 124, 124, 104, 86,
+ 14, 5, 4, 20, 0, 22, 20, 4, 13, 46,
+ 9, 61, 55, 76, 18, 28, 10, 12, 18, 30,
+ 74, 44, 20, 0, 4, 4, 30, 38, 65, 4,
+ 10, 27, 4, 5, 22, 18, 7, 38, 24, 26,
+ 35, 14, 17, 11, 18, 51, 18, 24, 18, 3,
+ 27, 14, 1, 10, 2, 29, 17, 2, 125, 124,
+ 101, 17, 27, 33, 49, 35, 45, 75, 55, 65,
+ 35, 69, 23, 124, 69, 24, 30, 16, 8, 6,
+ 6, 1, 1, 5, 31, 33, 31, 37, 43, 89,
+ 23, 25, 63, 7, 17, 21, 29, 41, 49, 45,
+ 49, 55, 37, 59, 69, 53, 57, 10, 54, 38,
+ 20, 18, 24, 8, 0, 0, 30, 22, 62, 44,
+ 38, 28, 42, 24, 28, 20, 12, 26, 86, 70,
+ 60, 52, 56, 22, 8, 2, 3, 13, 100, 58,
+ 14, 0, 26, 2, 19, 13, 10, 86, 64, 32,
+ 14, 40, 10, 4, 7, 9, 124, 27, 13, 0,
+ 3, 5, 3, 10, 18, 2, 14, 20, 36, 15,
+ 7, 25, 15, 28, 23, 23, 17, 1, 2, 11,
+ 23, 11, 25, 11, 15, 43, 22, 20, 46, 8,
+ 3, 20, 12, 4, 10, 8, 2, 15, 0, 5,
+ 13, 7, 24, 57, 21, 32, 25, 2, 9, 1,
+ 22, 9, 15, 8, 14, 39, 17, 55, 52, 56,
+ 62, 30, 16, 24, 18, 12, 6, 7, 5, 9,
+ 31, 27, 39, 49, 37, 49, 81, 43, 43, 41,
+ 39, 55, 39, 53, 53, 37, 53, 22, 18, 16,
+ 12, 19, 7, 9, 41, 23, 37, 41, 75, 55,
+ 67, 75, 23, 21, 83, 17, 23, 39, 65, 49,
+ 59, 65, 39, 65, 55, 67, 77, 79, 89, 51,
+ 67, 77, 19, 7, 20, 1, 46, 64, 1, 8,
+ 26, 34, 10, 32, 60, 8, 0, 68, 4, 49,
+ 89, 117, 125, 125, 125, 125, 8, 78, 60, 52,
+ 32, 54, 28, 16, 12, 3, 21, 11, 16, 1,
+ 38, 56, 0, 2, 18, 26, 12, 24, 48, 14,
+ 2, 68, 4, 49, 89, 117, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 38 */
+
+ 62, 10, 35, 62, 10, 35, 6, 28, 50, 24,
+ 7, 27, 33, 50, 118, 26, 28, 11, 14, 30,
+ 0, 15, 2, 27, 63, 7, 63, 125, 125, 125,
+ 86, 3, 5, 14, 30, 0, 39, 8, 22, 2,
+ 1, 13, 17, 0, 31, 31, 57, 10, 5, 13,
+ 8, 19, 11, 39, 0, 21, 17, 35, 6, 4,
+ 44, 0, 0, 0, 4, 63, 67, 16, 15, 11,
+ 36, 7, 41, 58, 30, 16, 88, 98, 36, 42,
+ 18, 42, 6, 8, 44, 33, 27, 27, 37, 48,
+ 3, 30, 40, 9, 23, 11, 20, 2, 17, 15,
+ 33, 36, 9, 12, 5, 17, 6, 7, 10, 12,
+ 10, 16, 30, 18, 1, 10, 124, 124, 110, 92,
+ 14, 5, 4, 20, 0, 22, 20, 6, 13, 50,
+ 7, 65, 59, 82, 18, 28, 10, 12, 20, 30,
+ 78, 46, 20, 0, 4, 4, 32, 40, 67, 4,
+ 10, 27, 2, 7, 20, 18, 9, 38, 24, 26,
+ 39, 14, 19, 13, 16, 51, 18, 24, 18, 7,
+ 33, 12, 5, 4, 3, 33, 21, 1, 125, 124,
+ 109, 21, 29, 37, 57, 39, 51, 83, 65, 71,
+ 39, 75, 27, 124, 73, 20, 26, 12, 2, 0,
+ 2, 7, 5, 9, 35, 39, 35, 41, 45, 91,
+ 23, 25, 65, 9, 19, 23, 31, 43, 51, 47,
+ 53, 57, 37, 61, 71, 53, 57, 12, 56, 38,
+ 20, 18, 26, 10, 2, 0, 34, 24, 64, 44,
+ 38, 30, 44, 26, 32, 24, 16, 28, 86, 70,
+ 60, 54, 58, 22, 8, 2, 1, 13, 102, 58,
+ 12, 0, 26, 2, 19, 13, 8, 86, 64, 30,
+ 12, 38, 10, 4, 7, 9, 124, 25, 11, 2,
+ 1, 1, 1, 14, 22, 4, 16, 24, 40, 15,
+ 5, 25, 13, 32, 21, 23, 17, 0, 2, 11,
+ 23, 11, 27, 11, 15, 43, 22, 22, 48, 8,
+ 3, 22, 14, 4, 12, 10, 2, 17, 0, 5,
+ 13, 7, 26, 59, 23, 32, 27, 0, 11, 1,
+ 22, 11, 15, 8, 12, 43, 19, 57, 48, 54,
+ 60, 28, 12, 20, 14, 8, 2, 11, 11, 13,
+ 37, 33, 43, 57, 45, 57, 89, 51, 49, 47,
+ 43, 61, 43, 57, 55, 39, 55, 18, 14, 10,
+ 6, 25, 11, 15, 47, 29, 43, 47, 81, 59,
+ 69, 79, 25, 23, 85, 21, 25, 43, 69, 51,
+ 63, 69, 41, 69, 57, 69, 79, 81, 91, 55,
+ 69, 79, 19, 5, 20, 1, 48, 66, 1, 8,
+ 28, 36, 10, 32, 62, 8, 0, 66, 0, 53,
+ 95, 123, 125, 125, 125, 125, 8, 78, 60, 52,
+ 32, 56, 30, 16, 12, 3, 19, 9, 16, 0,
+ 40, 58, 2, 4, 18, 26, 12, 24, 50, 14,
+ 2, 66, 0, 53, 95, 123, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 39 */
+
+ 60, 10, 35, 60, 10, 35, 10, 32, 50, 24,
+ 9, 29, 37, 48, 118, 26, 32, 11, 14, 32,
+ 1, 15, 2, 29, 67, 9, 67, 125, 125, 125,
+ 92, 1, 5, 14, 32, 1, 39, 10, 22, 2,
+ 1, 11, 15, 0, 31, 31, 57, 10, 5, 13,
+ 8, 19, 9, 37, 0, 21, 17, 35, 8, 4,
+ 44, 0, 0, 0, 4, 63, 67, 18, 17, 11,
+ 34, 9, 41, 64, 32, 18, 92, 102, 40, 46,
+ 20, 46, 10, 10, 48, 33, 25, 25, 37, 48,
+ 3, 32, 44, 9, 21, 11, 24, 2, 19, 17,
+ 33, 36, 9, 14, 5, 15, 8, 7, 12, 14,
+ 12, 20, 34, 20, 0, 12, 124, 124, 116, 98,
+ 14, 3, 4, 22, 0, 22, 22, 6, 11, 54,
+ 7, 69, 63, 88, 18, 30, 10, 12, 20, 32,
+ 80, 48, 22, 1, 4, 4, 32, 40, 69, 4,
+ 10, 29, 2, 9, 20, 16, 9, 38, 24, 26,
+ 41, 14, 21, 15, 16, 51, 18, 22, 16, 11,
+ 39, 10, 9, 0, 7, 37, 27, 5, 125, 124,
+ 115, 25, 33, 43, 63, 45, 57, 93, 73, 77,
+ 43, 83, 31, 124, 75, 16, 22, 8, 1, 3,
+ 1, 11, 9, 13, 41, 43, 39, 45, 49, 95,
+ 23, 25, 65, 11, 21, 25, 35, 45, 53, 49,
+ 55, 59, 39, 63, 71, 55, 55, 16, 58, 40,
+ 20, 20, 26, 10, 2, 2, 38, 28, 64, 46,
+ 40, 30, 46, 28, 34, 26, 20, 28, 88, 72,
+ 62, 54, 60, 24, 8, 4, 1, 13, 102, 58,
+ 12, 1, 26, 2, 19, 11, 8, 84, 62, 28,
+ 10, 38, 10, 6, 7, 9, 124, 21, 7, 4,
+ 2, 0, 0, 16, 24, 6, 20, 26, 44, 13,
+ 3, 23, 11, 36, 21, 23, 17, 0, 4, 11,
+ 25, 11, 27, 11, 15, 45, 24, 22, 50, 8,
+ 3, 22, 14, 4, 12, 10, 2, 17, 0, 5,
+ 13, 7, 26, 61, 25, 32, 29, 0, 13, 3,
+ 22, 13, 17, 8, 12, 45, 19, 59, 46, 52,
+ 58, 24, 8, 16, 10, 2, 3, 17, 17, 19,
+ 43, 37, 47, 67, 53, 65, 99, 57, 55, 51,
+ 47, 65, 47, 61, 59, 41, 55, 12, 8, 6,
+ 2, 31, 17, 19, 53, 33, 47, 51, 87, 63,
+ 73, 81, 27, 23, 89, 23, 29, 45, 73, 55,
+ 67, 71, 43, 71, 59, 71, 83, 83, 91, 57,
+ 73, 83, 17, 5, 22, 0, 50, 68, 1, 10,
+ 28, 36, 10, 34, 64, 8, 0, 64, 3, 59,
+ 101, 125, 125, 125, 125, 125, 10, 78, 60, 52,
+ 32, 56, 30, 16, 12, 1, 19, 9, 18, 0,
+ 42, 60, 2, 4, 20, 28, 12, 24, 50, 14,
+ 2, 64, 3, 59, 101, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 40 */
+
+ 56, 8, 35, 56, 8, 35, 12, 34, 50, 24,
+ 9, 33, 43, 46, 118, 24, 36, 13, 14, 34,
+ 1, 17, 2, 31, 71, 11, 73, 125, 125, 125,
+ 98, 0, 5, 14, 34, 1, 41, 12, 22, 0,
+ 1, 11, 15, 1, 33, 31, 57, 10, 5, 13,
+ 8, 19, 9, 37, 0, 21, 17, 35, 8, 4,
+ 44, 0, 0, 0, 6, 65, 67, 18, 19, 11,
+ 32, 9, 43, 68, 34, 20, 94, 106, 42, 48,
+ 24, 48, 12, 12, 52, 33, 25, 25, 37, 48,
+ 3, 34, 48, 9, 21, 9, 28, 2, 19, 17,
+ 35, 36, 9, 14, 3, 13, 10, 5, 12, 14,
+ 12, 22, 36, 20, 2, 14, 124, 124, 122, 102,
+ 14, 3, 4, 22, 1, 22, 22, 6, 11, 58,
+ 7, 73, 67, 92, 18, 30, 10, 12, 20, 32,
+ 84, 48, 22, 1, 2, 4, 34, 42, 73, 4,
+ 10, 29, 0, 13, 18, 16, 11, 38, 24, 24,
+ 43, 14, 23, 17, 14, 51, 16, 20, 14, 15,
+ 47, 6, 13, 3, 13, 43, 31, 9, 125, 124,
+ 123, 29, 37, 47, 69, 49, 65, 101, 81, 83,
+ 47, 89, 35, 124, 79, 12, 16, 2, 7, 9,
+ 7, 15, 15, 17, 45, 47, 43, 49, 53, 99,
+ 23, 27, 67, 13, 25, 27, 37, 47, 57, 53,
+ 59, 61, 39, 65, 73, 57, 55, 18, 60, 40,
+ 20, 20, 28, 12, 4, 2, 42, 30, 64, 46,
+ 40, 32, 48, 30, 38, 30, 22, 30, 88, 72,
+ 62, 56, 62, 24, 8, 4, 0, 15, 104, 58,
+ 10, 1, 26, 2, 19, 11, 8, 84, 60, 26,
+ 8, 38, 10, 6, 7, 9, 124, 19, 5, 6,
+ 4, 4, 2, 20, 28, 8, 22, 30, 48, 13,
+ 3, 23, 9, 38, 21, 23, 17, 2, 4, 11,
+ 25, 11, 29, 11, 15, 45, 24, 22, 52, 8,
+ 5, 22, 14, 2, 12, 10, 2, 19, 0, 5,
+ 13, 7, 28, 63, 27, 32, 33, 1, 15, 5,
+ 20, 15, 19, 8, 12, 47, 19, 63, 44, 48,
+ 56, 20, 4, 12, 6, 1, 7, 23, 23, 25,
+ 49, 43, 51, 75, 61, 75, 109, 65, 63, 57,
+ 53, 71, 51, 65, 61, 43, 57, 8, 4, 0,
+ 3, 37, 23, 25, 59, 39, 53, 57, 93, 67,
+ 77, 85, 29, 25, 93, 27, 31, 49, 77, 59,
+ 71, 75, 47, 75, 63, 73, 85, 85, 93, 61,
+ 75, 85, 17, 5, 22, 0, 52, 70, 1, 10,
+ 30, 38, 10, 34, 64, 8, 1, 62, 7, 65,
+ 107, 125, 125, 125, 125, 125, 10, 78, 60, 52,
+ 32, 58, 30, 16, 12, 1, 19, 9, 18, 0,
+ 42, 60, 2, 4, 20, 28, 12, 24, 52, 14,
+ 0, 62, 7, 65, 107, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 41 */
+
+ 54, 8, 35, 54, 8, 35, 16, 36, 52, 24,
+ 9, 35, 47, 44, 120, 24, 40, 13, 16, 36,
+ 1, 19, 2, 31, 73, 13, 77, 125, 125, 125,
+ 104, 2, 3, 16, 36, 1, 41, 14, 22, 0,
+ 0, 11, 13, 1, 33, 31, 57, 10, 5, 11,
+ 8, 19, 9, 37, 0, 21, 17, 33, 8, 4,
+ 44, 0, 0, 0, 8, 65, 67, 18, 21, 11,
+ 30, 9, 43, 72, 38, 22, 98, 110, 46, 50,
+ 28, 52, 14, 16, 58, 33, 25, 25, 35, 48,
+ 3, 36, 52, 7, 19, 7, 32, 2, 19, 17,
+ 35, 36, 9, 14, 1, 11, 12, 3, 14, 16,
+ 14, 24, 40, 22, 6, 16, 124, 124, 124, 108,
+ 16, 3, 6, 24, 1, 22, 22, 8, 9, 64,
+ 7, 77, 69, 98, 18, 30, 10, 12, 20, 34,
+ 88, 50, 22, 1, 2, 4, 36, 44, 75, 6,
+ 10, 29, 1, 15, 16, 16, 13, 38, 24, 24,
+ 45, 16, 25, 17, 12, 51, 16, 20, 14, 19,
+ 53, 4, 15, 7, 17, 47, 35, 13, 125, 124,
+ 125, 33, 41, 51, 75, 53, 71, 109, 89, 87,
+ 51, 95, 39, 124, 83, 8, 12, 1, 11, 13,
+ 11, 19, 19, 21, 49, 51, 45, 51, 55, 103,
+ 23, 27, 67, 13, 27, 29, 39, 49, 59, 55,
+ 63, 63, 39, 67, 75, 57, 53, 20, 62, 40,
+ 20, 20, 30, 14, 6, 4, 48, 32, 64, 46,
+ 40, 34, 50, 32, 42, 34, 26, 32, 90, 74,
+ 64, 58, 64, 24, 10, 4, 2, 15, 106, 58,
+ 8, 1, 28, 4, 19, 9, 8, 84, 60, 24,
+ 6, 38, 10, 6, 7, 7, 124, 17, 3, 10,
+ 8, 8, 4, 24, 32, 10, 26, 34, 52, 11,
+ 1, 21, 7, 42, 19, 21, 15, 4, 6, 11,
+ 25, 11, 31, 11, 15, 45, 24, 24, 54, 10,
+ 5, 24, 14, 2, 12, 10, 2, 21, 0, 3,
+ 11, 7, 30, 63, 29, 32, 35, 3, 15, 5,
+ 20, 15, 21, 8, 12, 49, 19, 65, 42, 46,
+ 56, 18, 0, 8, 2, 5, 11, 27, 29, 31,
+ 55, 49, 55, 83, 69, 83, 119, 71, 69, 63,
+ 57, 75, 53, 67, 63, 43, 59, 4, 0, 3,
+ 7, 41, 27, 29, 65, 43, 57, 61, 97, 71,
+ 81, 89, 29, 27, 95, 29, 33, 53, 81, 61,
+ 73, 79, 49, 79, 65, 73, 87, 85, 95, 65,
+ 77, 87, 17, 3, 24, 0, 54, 74, 0, 12,
+ 32, 40, 10, 36, 66, 8, 1, 60, 11, 69,
+ 113, 125, 125, 125, 125, 125, 10, 78, 60, 54,
+ 34, 60, 32, 18, 14, 1, 19, 9, 18, 2,
+ 44, 62, 2, 6, 20, 28, 12, 26, 54, 14,
+ 0, 60, 11, 69, 113, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 42 */
+
+ 52, 8, 35, 52, 8, 35, 20, 40, 52, 24,
+ 11, 37, 51, 42, 120, 24, 44, 13, 16, 38,
+ 3, 19, 2, 33, 77, 15, 83, 125, 125, 125,
+ 110, 4, 3, 16, 38, 3, 41, 16, 22, 1,
+ 0, 9, 11, 1, 33, 31, 57, 10, 5, 11,
+ 8, 19, 7, 35, 0, 21, 17, 33, 10, 4,
+ 44, 0, 0, 0, 8, 65, 67, 20, 23, 11,
+ 28, 11, 43, 78, 40, 24, 102, 114, 48, 54,
+ 30, 54, 18, 18, 62, 33, 23, 23, 35, 48,
+ 3, 38, 56, 7, 17, 7, 36, 2, 21, 19,
+ 37, 36, 9, 16, 1, 9, 14, 3, 16, 16,
+ 16, 28, 42, 24, 8, 18, 124, 124, 124, 114,
+ 16, 1, 6, 24, 1, 22, 24, 8, 9, 68,
+ 7, 81, 73, 104, 18, 32, 10, 12, 20, 34,
+ 90, 52, 24, 3, 2, 4, 36, 44, 77, 6,
+ 10, 31, 1, 17, 16, 14, 13, 38, 24, 24,
+ 47, 16, 27, 19, 12, 51, 16, 18, 12, 23,
+ 59, 2, 19, 11, 21, 51, 41, 17, 125, 124,
+ 125, 37, 45, 57, 81, 59, 77, 119, 97, 93,
+ 55, 103, 43, 124, 85, 4, 8, 5, 15, 17,
+ 15, 23, 23, 25, 55, 55, 49, 55, 59, 107,
+ 23, 27, 69, 15, 29, 31, 43, 51, 61, 57,
+ 65, 65, 41, 69, 75, 59, 51, 24, 64, 42,
+ 20, 22, 30, 14, 6, 6, 52, 36, 64, 48,
+ 42, 34, 52, 34, 44, 36, 30, 32, 92, 76,
+ 66, 58, 66, 26, 10, 6, 2, 15, 106, 58,
+ 8, 3, 28, 4, 19, 9, 8, 82, 58, 22,
+ 4, 38, 10, 8, 7, 7, 124, 13, 0, 12,
+ 12, 10, 6, 26, 34, 12, 30, 36, 56, 9,
+ 0, 21, 5, 46, 19, 21, 15, 4, 6, 11,
+ 27, 11, 31, 11, 15, 47, 26, 24, 56, 10,
+ 5, 24, 14, 2, 12, 10, 2, 21, 0, 3,
+ 11, 7, 30, 65, 31, 32, 37, 3, 17, 7,
+ 20, 17, 23, 8, 12, 51, 19, 67, 40, 44,
+ 54, 14, 3, 4, 1, 11, 17, 33, 35, 37,
+ 61, 53, 59, 93, 77, 91, 125, 77, 75, 67,
+ 61, 81, 57, 71, 67, 45, 59, 1, 5, 9,
+ 11, 47, 33, 35, 71, 47, 63, 65, 103, 75,
+ 85, 91, 31, 27, 99, 31, 37, 55, 85, 65,
+ 77, 81, 51, 81, 67, 75, 91, 87, 95, 67,
+ 81, 91, 15, 3, 26, 2, 56, 76, 0, 14,
+ 32, 40, 10, 36, 68, 8, 1, 58, 15, 75,
+ 119, 125, 125, 125, 125, 125, 12, 78, 60, 54,
+ 34, 60, 32, 18, 14, 0, 19, 9, 20, 2,
+ 46, 64, 2, 6, 22, 30, 12, 26, 54, 14,
+ 0, 58, 15, 75, 119, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 43 */
+
+ 50, 8, 37, 50, 8, 37, 24, 42, 54, 24,
+ 11, 41, 55, 40, 120, 24, 46, 13, 18, 38,
+ 3, 21, 4, 33, 79, 19, 87, 125, 125, 125,
+ 114, 6, 3, 18, 38, 3, 43, 16, 22, 1,
+ 0, 9, 11, 1, 33, 29, 55, 10, 3, 9,
+ 8, 17, 7, 35, 0, 21, 17, 33, 10, 4,
+ 44, 0, 0, 0, 10, 65, 67, 20, 23, 13,
+ 28, 11, 43, 82, 42, 26, 104, 120, 52, 56,
+ 34, 58, 20, 20, 66, 33, 23, 23, 35, 48,
+ 3, 40, 62, 7, 15, 5, 40, 0, 21, 19,
+ 37, 38, 9, 16, 0, 9, 14, 1, 18, 18,
+ 18, 30, 46, 24, 10, 20, 124, 124, 124, 120,
+ 16, 1, 6, 26, 1, 22, 24, 10, 7, 72,
+ 5, 85, 77, 110, 18, 32, 10, 12, 22, 36,
+ 94, 54, 24, 3, 2, 4, 38, 46, 79, 6,
+ 10, 31, 3, 19, 14, 14, 15, 38, 24, 24,
+ 51, 16, 29, 21, 10, 51, 16, 18, 12, 27,
+ 65, 0, 23, 17, 27, 55, 45, 21, 125, 124,
+ 125, 41, 47, 61, 89, 63, 83, 125, 107, 99,
+ 59, 109, 47, 124, 89, 0, 4, 9, 21, 23,
+ 19, 29, 27, 29, 59, 61, 53, 59, 61, 109,
+ 23, 27, 69, 17, 31, 33, 45, 53, 63, 59,
+ 69, 67, 41, 71, 77, 59, 51, 26, 66, 42,
+ 20, 22, 32, 16, 8, 6, 56, 38, 66, 48,
+ 42, 36, 54, 36, 48, 40, 34, 34, 92, 76,
+ 66, 60, 68, 26, 10, 6, 4, 15, 108, 58,
+ 6, 3, 28, 4, 19, 7, 6, 82, 58, 20,
+ 2, 36, 10, 8, 7, 7, 124, 11, 2, 14,
+ 14, 14, 8, 30, 38, 14, 32, 40, 60, 9,
+ 2, 19, 3, 50, 17, 21, 15, 6, 8, 11,
+ 27, 11, 33, 11, 15, 47, 26, 26, 58, 10,
+ 5, 26, 16, 2, 14, 12, 2, 23, 0, 3,
+ 11, 7, 32, 67, 33, 32, 39, 5, 19, 7,
+ 20, 19, 23, 8, 10, 55, 21, 69, 36, 42,
+ 52, 12, 7, 0, 5, 15, 21, 37, 41, 41,
+ 67, 59, 63, 101, 85, 99, 125, 85, 81, 73,
+ 65, 85, 61, 75, 69, 47, 61, 5, 9, 13,
+ 17, 53, 37, 39, 77, 53, 67, 71, 109, 79,
+ 87, 95, 33, 29, 101, 35, 39, 59, 89, 67,
+ 81, 85, 53, 85, 69, 77, 93, 89, 97, 71,
+ 83, 93, 15, 1, 26, 2, 58, 78, 0, 14,
+ 34, 42, 10, 38, 70, 8, 1, 56, 19, 79,
+ 125, 125, 125, 125, 125, 125, 12, 78, 60, 54,
+ 34, 62, 34, 18, 14, 0, 17, 7, 20, 4,
+ 48, 66, 4, 8, 22, 30, 12, 26, 56, 14,
+ 0, 56, 19, 79, 125, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 44 */
+
+ 46, 8, 37, 46, 8, 37, 26, 46, 54, 24,
+ 13, 43, 59, 38, 120, 22, 50, 13, 18, 40,
+ 3, 21, 4, 35, 83, 21, 93, 125, 125, 125,
+ 120, 8, 3, 18, 40, 3, 43, 18, 22, 3,
+ 0, 9, 9, 1, 33, 29, 55, 10, 3, 9,
+ 8, 17, 5, 35, 0, 21, 17, 33, 10, 4,
+ 44, 0, 0, 0, 10, 67, 67, 22, 25, 13,
+ 26, 13, 43, 88, 44, 28, 108, 124, 54, 60,
+ 38, 60, 22, 22, 70, 33, 23, 21, 35, 48,
+ 3, 42, 66, 7, 15, 5, 44, 0, 23, 21,
+ 39, 38, 9, 18, 0, 7, 16, 1, 20, 18,
+ 18, 34, 48, 26, 12, 22, 124, 124, 124, 124,
+ 16, 1, 6, 26, 1, 22, 26, 10, 7, 76,
+ 5, 89, 81, 114, 18, 32, 10, 12, 22, 36,
+ 96, 56, 24, 3, 2, 4, 38, 48, 81, 6,
+ 10, 33, 3, 21, 14, 14, 15, 38, 24, 24,
+ 53, 16, 31, 23, 10, 51, 14, 16, 10, 31,
+ 73, 1, 27, 21, 31, 59, 51, 25, 125, 124,
+ 125, 45, 51, 65, 95, 69, 91, 125, 115, 105,
+ 63, 117, 51, 124, 93, 3, 0, 15, 25, 27,
+ 23, 33, 31, 33, 63, 65, 57, 63, 65, 113,
+ 23, 27, 71, 19, 33, 35, 47, 55, 67, 61,
+ 73, 69, 43, 73, 79, 61, 49, 30, 68, 44,
+ 20, 22, 34, 16, 8, 8, 60, 40, 66, 48,
+ 44, 38, 56, 38, 50, 42, 36, 36, 94, 78,
+ 68, 60, 70, 28, 10, 6, 4, 15, 108, 58,
+ 4, 3, 28, 4, 19, 7, 6, 80, 56, 18,
+ 0, 36, 10, 8, 7, 7, 124, 9, 6, 16,
+ 18, 16, 10, 34, 42, 16, 36, 44, 64, 7,
+ 4, 19, 1, 52, 17, 21, 15, 6, 8, 11,
+ 27, 11, 33, 11, 15, 49, 28, 26, 60, 10,
+ 5, 26, 16, 2, 14, 12, 2, 23, 0, 3,
+ 11, 7, 32, 69, 35, 32, 41, 7, 21, 9,
+ 20, 21, 25, 8, 10, 57, 21, 71, 34, 40,
+ 50, 8, 11, 3, 9, 19, 27, 43, 47, 47,
+ 73, 63, 67, 109, 93, 107, 125, 91, 89, 77,
+ 71, 91, 65, 79, 73, 49, 63, 9, 15, 19,
+ 21, 59, 43, 45, 83, 57, 73, 75, 115, 83,
+ 91, 97, 35, 31, 105, 37, 43, 63, 93, 71,
+ 85, 89, 55, 87, 71, 79, 95, 91, 99, 73,
+ 85, 95, 15, 1, 28, 4, 60, 80, 0, 16,
+ 34, 42, 10, 38, 72, 8, 1, 54, 23, 85,
+ 125, 125, 125, 125, 125, 125, 12, 78, 60, 54,
+ 34, 62, 34, 18, 14, 2, 17, 7, 22, 4,
+ 48, 68, 4, 8, 22, 30, 12, 26, 56, 14,
+ 0, 54, 23, 85, 125, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 45 */
+
+ 44, 8, 37, 44, 8, 37, 30, 48, 54, 24,
+ 13, 45, 63, 36, 120, 22, 54, 13, 18, 42,
+ 5, 23, 4, 37, 87, 23, 97, 125, 125, 125,
+ 124, 10, 1, 18, 42, 5, 43, 20, 22, 3,
+ 0, 7, 7, 1, 33, 29, 55, 10, 3, 9,
+ 8, 17, 5, 33, 0, 21, 17, 33, 12, 4,
+ 44, 0, 0, 0, 12, 67, 67, 22, 27, 13,
+ 24, 13, 43, 92, 46, 30, 112, 124, 58, 62,
+ 40, 64, 26, 24, 76, 33, 21, 21, 33, 48,
+ 3, 44, 70, 7, 13, 3, 48, 0, 23, 21,
+ 39, 38, 9, 18, 2, 5, 18, 0, 22, 20,
+ 20, 36, 52, 28, 16, 24, 124, 124, 124, 124,
+ 18, 0, 8, 28, 1, 22, 26, 10, 5, 80,
+ 5, 93, 85, 120, 18, 34, 10, 12, 22, 38,
+ 100, 58, 26, 5, 2, 4, 40, 48, 83, 8,
+ 10, 33, 5, 23, 12, 12, 17, 38, 24, 24,
+ 55, 18, 33, 23, 8, 51, 14, 14, 8, 35,
+ 79, 3, 29, 25, 35, 63, 55, 29, 125, 124,
+ 125, 49, 55, 71, 101, 73, 97, 125, 123, 109,
+ 67, 123, 55, 124, 95, 7, 3, 19, 29, 31,
+ 27, 37, 35, 37, 69, 69, 61, 65, 69, 117,
+ 23, 27, 71, 21, 35, 37, 51, 57, 69, 63,
+ 75, 71, 43, 75, 79, 63, 47, 32, 70, 44,
+ 20, 24, 34, 18, 10, 10, 64, 44, 66, 50,
+ 44, 38, 58, 40, 54, 46, 40, 36, 96, 80,
+ 70, 62, 72, 28, 12, 8, 6, 15, 110, 58,
+ 4, 5, 28, 4, 19, 5, 6, 80, 54, 16,
+ 1, 36, 10, 10, 7, 5, 124, 5, 8, 20,
+ 22, 20, 12, 36, 44, 18, 40, 46, 68, 5,
+ 6, 17, 0, 56, 17, 21, 13, 8, 10, 11,
+ 29, 11, 35, 11, 15, 49, 28, 26, 62, 10,
+ 5, 26, 16, 2, 14, 12, 2, 25, 0, 3,
+ 11, 7, 34, 71, 37, 32, 43, 7, 21, 11,
+ 20, 21, 27, 8, 10, 59, 21, 73, 32, 38,
+ 48, 4, 15, 7, 13, 25, 31, 49, 53, 53,
+ 79, 69, 71, 119, 101, 115, 125, 97, 95, 83,
+ 75, 95, 69, 81, 75, 49, 63, 15, 19, 23,
+ 25, 65, 49, 49, 89, 61, 77, 79, 121, 87,
+ 95, 101, 35, 31, 109, 39, 45, 65, 97, 75,
+ 89, 91, 57, 91, 73, 79, 99, 93, 99, 77,
+ 89, 99, 13, 1, 30, 4, 62, 82, 2, 18,
+ 36, 44, 10, 40, 74, 8, 1, 52, 27, 91,
+ 125, 125, 125, 125, 125, 125, 14, 78, 60, 54,
+ 34, 64, 34, 20, 16, 2, 17, 7, 22, 4,
+ 50, 70, 4, 8, 24, 32, 12, 28, 58, 14,
+ 0, 52, 27, 91, 125, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 46 */
+
+ 42, 8, 37, 42, 8, 37, 34, 52, 56, 24,
+ 15, 49, 67, 34, 120, 22, 58, 13, 20, 44,
+ 5, 23, 4, 37, 89, 25, 103, 125, 125, 125,
+ 124, 12, 1, 20, 44, 5, 45, 22, 22, 5,
+ 0, 7, 7, 1, 33, 29, 55, 10, 3, 7,
+ 8, 17, 3, 33, 0, 21, 17, 33, 12, 4,
+ 44, 0, 0, 0, 12, 67, 67, 24, 29, 13,
+ 22, 15, 43, 98, 48, 32, 114, 124, 60, 66,
+ 44, 66, 28, 26, 80, 33, 21, 19, 33, 48,
+ 3, 46, 74, 7, 11, 3, 52, 0, 25, 23,
+ 41, 38, 9, 20, 2, 3, 20, 0, 24, 20,
+ 22, 40, 54, 28, 18, 26, 124, 124, 124, 124,
+ 18, 0, 8, 28, 1, 22, 28, 12, 5, 84,
+ 5, 97, 89, 124, 18, 34, 10, 12, 22, 38,
+ 102, 60, 26, 5, 2, 4, 40, 50, 85, 8,
+ 10, 35, 5, 25, 12, 12, 17, 38, 24, 24,
+ 57, 18, 35, 25, 8, 51, 14, 14, 8, 39,
+ 85, 5, 33, 29, 41, 67, 61, 33, 125, 124,
+ 125, 53, 59, 75, 107, 79, 103, 125, 125, 115,
+ 71, 125, 59, 124, 99, 11, 7, 23, 35, 37,
+ 31, 41, 39, 41, 73, 73, 65, 69, 71, 121,
+ 23, 27, 73, 23, 37, 39, 53, 59, 71, 65,
+ 79, 73, 45, 77, 81, 63, 47, 36, 72, 46,
+ 20, 24, 36, 18, 10, 10, 68, 46, 66, 50,
+ 46, 40, 60, 42, 56, 48, 44, 38, 96, 80,
+ 70, 62, 74, 30, 12, 8, 6, 15, 110, 58,
+ 2, 5, 28, 4, 19, 5, 6, 78, 54, 14,
+ 3, 36, 10, 10, 7, 5, 124, 3, 12, 22,
+ 24, 22, 14, 40, 48, 20, 42, 50, 72, 5,
+ 8, 17, 2, 60, 15, 21, 13, 8, 10, 11,
+ 29, 11, 35, 11, 15, 51, 30, 28, 64, 10,
+ 5, 28, 16, 2, 14, 12, 2, 25, 0, 3,
+ 11, 7, 34, 73, 39, 32, 45, 9, 23, 11,
+ 20, 23, 29, 8, 10, 61, 21, 75, 30, 36,
+ 46, 2, 19, 11, 17, 29, 37, 53, 59, 59,
+ 85, 73, 75, 125, 109, 123, 125, 105, 101, 87,
+ 79, 101, 73, 85, 79, 51, 65, 19, 25, 29,
+ 31, 71, 53, 55, 95, 67, 83, 85, 125, 91,
+ 99, 103, 37, 33, 111, 43, 49, 69, 101, 77,
+ 93, 95, 59, 93, 75, 81, 101, 95, 101, 79,
+ 91, 101, 13, 0, 30, 6, 64, 84, 2, 18,
+ 36, 44, 10, 40, 76, 8, 1, 50, 31, 95,
+ 125, 125, 125, 125, 125, 125, 14, 78, 60, 54,
+ 34, 64, 36, 20, 16, 4, 17, 7, 24, 6,
+ 52, 72, 4, 10, 24, 32, 12, 28, 58, 14,
+ 0, 50, 31, 95, 125, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 47 */
+
+ 40, 8, 37, 40, 8, 37, 38, 54, 56, 24,
+ 15, 51, 71, 32, 120, 22, 62, 13, 20, 46,
+ 5, 25, 4, 39, 93, 27, 107, 125, 125, 125,
+ 124, 14, 1, 20, 46, 5, 45, 24, 22, 5,
+ 0, 7, 5, 1, 33, 29, 55, 10, 3, 7,
+ 8, 17, 3, 33, 0, 21, 17, 33, 12, 4,
+ 44, 0, 0, 0, 14, 67, 67, 24, 31, 13,
+ 20, 15, 43, 102, 50, 34, 118, 124, 64, 68,
+ 48, 70, 30, 28, 84, 33, 21, 19, 33, 48,
+ 3, 48, 78, 7, 9, 1, 56, 0, 25, 23,
+ 41, 38, 9, 20, 4, 1, 22, 2, 26, 22,
+ 24, 42, 58, 30, 20, 28, 124, 124, 124, 124,
+ 18, 0, 8, 30, 1, 22, 28, 12, 3, 88,
+ 5, 101, 93, 124, 18, 34, 10, 12, 22, 40,
+ 106, 62, 26, 5, 2, 4, 42, 52, 87, 8,
+ 10, 35, 7, 27, 10, 12, 19, 38, 24, 24,
+ 59, 18, 37, 27, 6, 51, 14, 12, 6, 43,
+ 91, 7, 37, 33, 45, 71, 65, 37, 125, 124,
+ 125, 57, 63, 79, 113, 83, 109, 125, 125, 121,
+ 75, 125, 63, 124, 103, 15, 11, 27, 39, 41,
+ 35, 45, 43, 45, 77, 77, 69, 73, 75, 125,
+ 23, 27, 73, 25, 39, 41, 55, 61, 73, 67,
+ 83, 75, 45, 79, 83, 65, 45, 38, 74, 46,
+ 20, 24, 38, 20, 12, 12, 72, 48, 66, 50,
+ 46, 42, 62, 44, 60, 52, 48, 40, 98, 82,
+ 72, 64, 76, 30, 12, 8, 8, 15, 112, 58,
+ 0, 5, 28, 4, 19, 3, 6, 78, 52, 12,
+ 5, 36, 10, 10, 7, 5, 124, 1, 14, 24,
+ 28, 26, 16, 44, 52, 22, 46, 54, 76, 3,
+ 10, 15, 4, 64, 15, 21, 13, 10, 12, 11,
+ 29, 11, 37, 11, 15, 51, 30, 28, 66, 10,
+ 5, 28, 16, 2, 14, 12, 2, 27, 0, 3,
+ 11, 7, 36, 75, 41, 32, 47, 11, 25, 13,
+ 20, 25, 31, 8, 10, 63, 21, 77, 28, 34,
+ 44, 1, 23, 15, 21, 33, 41, 59, 65, 65,
+ 91, 79, 79, 125, 117, 125, 125, 111, 107, 93,
+ 83, 105, 77, 89, 81, 53, 67, 23, 29, 33,
+ 35, 77, 59, 59, 101, 71, 87, 89, 125, 95,
+ 103, 107, 39, 35, 115, 45, 51, 73, 105, 81,
+ 97, 99, 61, 97, 77, 83, 103, 97, 103, 83,
+ 93, 103, 13, 0, 32, 6, 66, 86, 2, 20,
+ 38, 46, 10, 42, 78, 8, 1, 48, 35, 101,
+ 125, 125, 125, 125, 125, 125, 14, 78, 60, 54,
+ 34, 66, 36, 20, 16, 4, 17, 7, 24, 6,
+ 54, 74, 4, 10, 24, 32, 12, 28, 60, 14,
+ 0, 48, 35, 101, 125, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 48 */
+
+ 36, 6, 39, 36, 6, 39, 40, 56, 56, 24,
+ 17, 55, 77, 30, 120, 20, 64, 15, 20, 46,
+ 7, 27, 4, 41, 97, 31, 113, 125, 125, 125,
+ 124, 14, 1, 20, 46, 7, 47, 24, 22, 7,
+ 0, 7, 5, 3, 35, 29, 55, 8, 3, 7,
+ 8, 17, 3, 33, 0, 21, 19, 33, 12, 4,
+ 44, 0, 0, 0, 14, 69, 67, 24, 33, 15,
+ 18, 17, 45, 106, 52, 36, 120, 124, 66, 70,
+ 50, 72, 32, 30, 88, 33, 21, 19, 33, 48,
+ 3, 48, 82, 7, 9, 1, 58, 1, 27, 25,
+ 43, 38, 9, 20, 4, 1, 22, 2, 26, 22,
+ 24, 44, 60, 30, 22, 30, 124, 124, 124, 124,
+ 18, 0, 8, 30, 3, 22, 28, 12, 3, 92,
+ 5, 105, 97, 124, 18, 34, 10, 12, 22, 40,
+ 108, 62, 26, 7, 0, 4, 42, 52, 91, 8,
+ 10, 37, 9, 31, 8, 10, 21, 38, 24, 22,
+ 63, 18, 39, 29, 4, 51, 12, 10, 4, 49,
+ 99, 11, 41, 39, 51, 77, 71, 41, 125, 124,
+ 125, 63, 67, 85, 121, 89, 117, 125, 125, 125,
+ 79, 125, 67, 124, 107, 21, 17, 33, 45, 47,
+ 41, 51, 49, 49, 83, 83, 73, 77, 79, 125,
+ 23, 29, 75, 27, 43, 45, 59, 65, 77, 71,
+ 87, 77, 47, 81, 85, 67, 45, 40, 74, 46,
+ 20, 24, 38, 20, 12, 12, 76, 50, 66, 50,
+ 46, 42, 62, 46, 62, 54, 50, 40, 98, 82,
+ 72, 64, 78, 30, 12, 8, 8, 17, 112, 56,
+ 1, 7, 28, 4, 19, 3, 4, 76, 50, 8,
+ 7, 34, 10, 10, 7, 5, 124, 0, 16, 26,
+ 30, 28, 18, 46, 54, 24, 48, 56, 80, 3,
+ 10, 15, 6, 66, 15, 21, 13, 10, 12, 11,
+ 31, 13, 39, 11, 17, 53, 30, 28, 68, 10,
+ 7, 28, 16, 0, 14, 12, 2, 29, 0, 3,
+ 11, 7, 36, 77, 43, 32, 51, 13, 27, 15,
+ 18, 27, 33, 8, 8, 67, 23, 81, 24, 30,
+ 42, 5, 27, 21, 27, 39, 47, 65, 71, 71,
+ 99, 85, 83, 125, 125, 125, 125, 119, 115, 99,
+ 89, 111, 81, 93, 85, 55, 69, 29, 35, 39,
+ 41, 83, 65, 65, 109, 77, 93, 95, 125, 99,
+ 107, 111, 41, 37, 119, 49, 55, 77, 111, 85,
+ 101, 103, 65, 101, 81, 85, 107, 99, 105, 87,
+ 97, 107, 13, 0, 32, 6, 68, 88, 2, 20,
+ 38, 46, 10, 42, 78, 8, 3, 44, 39, 107,
+ 125, 125, 125, 125, 125, 125, 14, 78, 60, 54,
+ 34, 66, 36, 20, 16, 4, 17, 7, 24, 6,
+ 54, 74, 4, 10, 24, 32, 12, 28, 60, 12,
+ 1, 44, 39, 107, 125, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 49 */
+
+ 34, 6, 39, 34, 6, 39, 44, 60, 58, 26,
+ 17, 57, 81, 28, 122, 20, 68, 15, 22, 48,
+ 7, 27, 6, 41, 99, 33, 117, 125, 125, 125,
+ 124, 16, 0, 22, 48, 7, 47, 26, 22, 7,
+ 2, 5, 3, 3, 35, 27, 53, 8, 1, 5,
+ 8, 15, 1, 31, 2, 19, 19, 31, 14, 4,
+ 44, 0, 0, 0, 16, 69, 67, 26, 33, 15,
+ 18, 17, 45, 112, 56, 40, 124, 124, 70, 74,
+ 54, 76, 36, 34, 94, 33, 19, 17, 31, 48,
+ 3, 50, 88, 5, 7, 0, 62, 1, 27, 25,
+ 43, 40, 9, 22, 6, 0, 24, 4, 28, 24,
+ 26, 48, 64, 32, 26, 34, 124, 124, 124, 124,
+ 20, 2, 10, 32, 3, 24, 30, 14, 1, 98,
+ 3, 109, 99, 124, 18, 36, 10, 14, 24, 42,
+ 112, 64, 28, 7, 0, 4, 44, 54, 93, 10,
+ 10, 37, 9, 33, 8, 10, 21, 38, 24, 22,
+ 65, 20, 39, 29, 4, 51, 12, 10, 4, 53,
+ 105, 13, 43, 43, 55, 81, 75, 45, 125, 124,
+ 125, 67, 69, 89, 125, 93, 123, 125, 125, 125,
+ 83, 125, 71, 124, 109, 25, 21, 37, 49, 51,
+ 45, 55, 53, 53, 87, 87, 75, 79, 81, 125,
+ 23, 29, 75, 27, 45, 47, 61, 67, 79, 73,
+ 89, 79, 47, 83, 85, 67, 43, 44, 76, 48,
+ 20, 26, 40, 22, 14, 14, 82, 54, 68, 52,
+ 48, 44, 64, 50, 66, 58, 54, 42, 100, 84,
+ 74, 66, 80, 32, 14, 10, 10, 17, 114, 56,
+ 1, 7, 30, 6, 19, 1, 4, 76, 50, 6,
+ 9, 34, 12, 12, 5, 3, 124, 4, 20, 30,
+ 34, 32, 22, 50, 58, 28, 52, 60, 86, 1,
+ 12, 13, 10, 70, 13, 19, 11, 12, 14, 9,
+ 31, 13, 39, 11, 17, 53, 32, 30, 70, 12,
+ 7, 30, 18, 0, 16, 14, 2, 29, 2, 1,
+ 9, 5, 38, 77, 45, 32, 53, 13, 27, 15,
+ 18, 27, 33, 8, 8, 69, 23, 83, 22, 28,
+ 42, 7, 29, 25, 31, 43, 51, 69, 75, 75,
+ 105, 89, 87, 125, 125, 125, 125, 125, 121, 103,
+ 93, 115, 83, 95, 87, 55, 69, 33, 39, 43,
+ 45, 87, 69, 69, 115, 81, 97, 99, 125, 101,
+ 109, 113, 41, 37, 121, 51, 57, 79, 115, 87,
+ 103, 105, 67, 103, 83, 85, 109, 99, 105, 89,
+ 99, 109, 11, 2, 34, 8, 72, 92, 4, 22,
+ 40, 48, 12, 44, 80, 8, 3, 42, 43, 111,
+ 125, 125, 125, 125, 125, 125, 16, 78, 62, 56,
+ 36, 68, 38, 22, 18, 6, 15, 5, 26, 8,
+ 56, 76, 6, 12, 26, 34, 12, 30, 62, 12,
+ 1, 42, 43, 111, 125, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 50 */
+
+ 32, 6, 39, 32, 6, 39, 48, 62, 58, 26,
+ 17, 59, 85, 26, 122, 20, 72, 15, 22, 50,
+ 7, 29, 6, 43, 103, 35, 123, 125, 125, 125,
+ 124, 18, 0, 22, 50, 7, 47, 28, 22, 9,
+ 2, 5, 1, 3, 35, 27, 53, 8, 1, 5,
+ 8, 15, 1, 31, 2, 19, 19, 31, 14, 4,
+ 44, 0, 0, 0, 18, 69, 67, 26, 35, 15,
+ 16, 17, 45, 116, 58, 42, 124, 124, 72, 76,
+ 58, 78, 38, 36, 98, 33, 19, 17, 31, 48,
+ 3, 52, 92, 5, 5, 2, 66, 1, 27, 25,
+ 45, 40, 9, 22, 8, 2, 26, 6, 30, 24,
+ 28, 50, 66, 34, 28, 36, 124, 124, 124, 124,
+ 20, 2, 10, 32, 3, 24, 30, 14, 1, 102,
+ 3, 113, 103, 124, 18, 36, 10, 14, 24, 42,
+ 116, 66, 28, 7, 0, 4, 46, 56, 95, 10,
+ 10, 37, 11, 35, 6, 10, 23, 38, 24, 22,
+ 67, 20, 41, 31, 2, 51, 12, 8, 2, 57,
+ 111, 15, 47, 47, 59, 85, 79, 49, 125, 124,
+ 125, 71, 73, 93, 125, 97, 125, 125, 125, 125,
+ 87, 125, 75, 124, 113, 29, 25, 41, 53, 55,
+ 49, 59, 57, 57, 91, 91, 79, 83, 85, 125,
+ 23, 29, 77, 29, 47, 49, 63, 69, 81, 75,
+ 93, 81, 47, 85, 87, 69, 41, 46, 78, 48,
+ 20, 26, 42, 24, 16, 16, 86, 56, 68, 52,
+ 48, 46, 66, 52, 70, 62, 58, 44, 102, 86,
+ 76, 68, 82, 32, 14, 10, 12, 17, 116, 56,
+ 3, 7, 30, 6, 19, 1, 4, 76, 48, 4,
+ 11, 34, 12, 12, 5, 3, 124, 6, 22, 32,
+ 38, 36, 24, 54, 62, 30, 56, 64, 90, 0,
+ 14, 13, 12, 74, 13, 19, 11, 14, 14, 9,
+ 31, 13, 41, 11, 17, 53, 32, 30, 72, 12,
+ 7, 30, 18, 0, 16, 14, 2, 31, 2, 1,
+ 9, 5, 40, 79, 47, 32, 55, 15, 29, 17,
+ 18, 29, 35, 8, 8, 71, 23, 85, 20, 26,
+ 40, 11, 33, 29, 35, 47, 55, 75, 81, 81,
+ 111, 95, 91, 125, 125, 125, 125, 125, 125, 109,
+ 97, 121, 87, 99, 89, 57, 71, 37, 43, 49,
+ 49, 93, 75, 75, 121, 85, 103, 103, 125, 105,
+ 113, 117, 43, 39, 125, 53, 59, 83, 119, 91,
+ 107, 109, 69, 107, 85, 87, 111, 101, 107, 93,
+ 101, 111, 11, 2, 36, 8, 74, 94, 4, 24,
+ 42, 50, 12, 44, 82, 8, 3, 40, 47, 117,
+ 125, 125, 125, 125, 125, 125, 16, 78, 62, 56,
+ 36, 70, 38, 22, 18, 6, 15, 5, 26, 8,
+ 58, 78, 6, 12, 26, 34, 12, 30, 64, 12,
+ 1, 40, 47, 117, 125, 125, 125, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 1, qp = 51 */
+
+ 30, 6, 39, 30, 6, 39, 52, 66, 60, 26,
+ 19, 63, 89, 24, 122, 20, 76, 15, 24, 52,
+ 7, 29, 6, 43, 105, 37, 125, 125, 125, 125,
+ 124, 20, 0, 24, 52, 7, 49, 30, 22, 9,
+ 2, 5, 1, 3, 35, 27, 53, 8, 1, 3,
+ 8, 15, 0, 31, 2, 19, 19, 31, 14, 4,
+ 44, 0, 0, 0, 18, 69, 67, 28, 37, 15,
+ 14, 19, 45, 122, 60, 44, 124, 124, 76, 80,
+ 62, 82, 40, 38, 102, 33, 19, 15, 31, 48,
+ 3, 54, 96, 5, 3, 2, 70, 1, 29, 27,
+ 45, 40, 9, 24, 8, 4, 28, 6, 32, 26,
+ 30, 54, 70, 34, 30, 38, 124, 124, 124, 124,
+ 20, 2, 10, 34, 3, 24, 32, 16, 0, 106,
+ 3, 117, 107, 124, 18, 36, 10, 14, 24, 44,
+ 118, 68, 28, 7, 0, 4, 46, 58, 97, 10,
+ 10, 39, 11, 37, 6, 10, 23, 38, 24, 22,
+ 69, 20, 43, 33, 2, 51, 12, 8, 2, 61,
+ 117, 17, 51, 51, 65, 89, 85, 53, 125, 124,
+ 125, 75, 77, 97, 125, 103, 125, 125, 125, 125,
+ 91, 125, 79, 124, 117, 33, 29, 45, 59, 61,
+ 53, 63, 61, 61, 95, 95, 83, 87, 87, 125,
+ 23, 29, 77, 31, 49, 51, 65, 71, 83, 77,
+ 97, 83, 49, 87, 89, 69, 41, 50, 80, 50,
+ 20, 26, 44, 24, 16, 16, 90, 58, 68, 52,
+ 50, 48, 68, 54, 72, 64, 62, 46, 102, 86,
+ 76, 68, 84, 34, 14, 10, 12, 17, 116, 56,
+ 5, 7, 30, 6, 19, 0, 4, 74, 48, 2,
+ 13, 34, 12, 12, 5, 3, 124, 8, 26, 34,
+ 40, 38, 26, 58, 66, 32, 58, 68, 94, 0,
+ 16, 11, 14, 78, 11, 19, 11, 14, 16, 9,
+ 31, 13, 41, 11, 17, 55, 34, 32, 74, 12,
+ 7, 32, 18, 0, 16, 14, 2, 31, 2, 1,
+ 9, 5, 40, 81, 49, 32, 57, 17, 31, 17,
+ 18, 31, 37, 8, 8, 73, 23, 87, 18, 24,
+ 38, 13, 37, 33, 39, 51, 61, 79, 87, 87,
+ 117, 99, 95, 125, 125, 125, 125, 125, 125, 113,
+ 101, 125, 91, 103, 93, 59, 73, 41, 49, 53,
+ 55, 99, 79, 79, 125, 91, 107, 109, 125, 109,
+ 117, 119, 45, 41, 125, 57, 63, 87, 123, 93,
+ 111, 113, 71, 109, 87, 89, 113, 103, 109, 95,
+ 103, 113, 11, 4, 36, 10, 76, 96, 4, 24,
+ 42, 50, 12, 46, 84, 8, 3, 38, 51, 121,
+ 125, 125, 125, 125, 125, 125, 16, 78, 62, 56,
+ 36, 70, 40, 22, 18, 8, 15, 5, 28, 10,
+ 60, 80, 6, 14, 26, 34, 12, 30, 64, 12,
+ 1, 38, 51, 121, 125, 125, 125, 125, 125, 125,
+ },
+
+ },
+
+ {
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 0 */
+
+ 124, 18, 21, 124, 18, 21, 125, 81, 20, 18,
+ 24, 94, 124, 124, 24, 2, 71, 94, 43, 77,
+ 12, 12, 19, 12, 46, 106, 124, 124, 42, 67,
+ 125, 107, 21, 43, 77, 12, 59, 49, 38, 16,
+ 51, 79, 105, 12, 10, 41, 65, 0, 43, 85,
+ 0, 23, 53, 75, 16, 31, 23, 67, 26, 6,
+ 44, 0, 0, 0, 39, 45, 67, 17, 44, 2,
+ 58, 49, 125, 125, 55, 63, 41, 45, 51, 55,
+ 125, 25, 79, 53, 125, 33, 25, 41, 29, 16,
+ 4, 39, 125, 31, 81, 55, 125, 3, 31, 17,
+ 57, 14, 9, 15, 69, 45, 49, 37, 17, 7,
+ 17, 51, 11, 8, 5, 12, 15, 15, 10, 21,
+ 38, 11, 2, 24, 32, 42, 44, 20, 25, 29,
+ 39, 22, 7, 53, 7, 17, 23, 33, 39, 1,
+ 64, 1, 61, 23, 0, 21, 56, 72, 55, 3,
+ 11, 27, 5, 2, 9, 35, 66, 112, 80, 21,
+ 5, 121, 52, 124, 124, 125, 48, 42, 58, 68,
+ 64, 52, 42, 46, 60, 40, 54, 32, 16, 10,
+ 6, 38, 38, 42, 30, 14, 22, 52, 28, 10,
+ 30, 36, 11, 60, 0, 124, 124, 124, 106, 124,
+ 124, 124, 124, 92, 76, 68, 60, 96, 86, 19,
+ 58, 64, 38, 94, 54, 54, 70, 84, 86, 102,
+ 94, 42, 59, 14, 12, 50, 125, 103, 37, 2,
+ 20, 8, 43, 51, 61, 57, 125, 73, 12, 7,
+ 15, 27, 43, 49, 81, 69, 125, 37, 30, 4,
+ 5, 13, 23, 31, 39, 57, 89, 31, 11, 23,
+ 10, 10, 29, 39, 35, 71, 35, 50, 2, 10,
+ 8, 19, 25, 45, 39, 47, 124, 125, 125, 113,
+ 125, 101, 107, 109, 107, 99, 109, 113, 121, 61,
+ 77, 71, 85, 125, 57, 12, 45, 61, 55, 27,
+ 15, 19, 1, 35, 1, 12, 7, 9, 7, 9,
+ 27, 1, 9, 29, 16, 8, 3, 18, 38, 6,
+ 13, 25, 45, 13, 1, 13, 16, 14, 11, 3,
+ 21, 18, 18, 25, 37, 27, 27, 42, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 104, 124, 124, 124, 124, 124, 124, 96,
+ 124, 124, 92, 50, 36, 18, 31, 124, 124, 124,
+ 124, 96, 96, 76, 82, 94, 90, 70, 44, 70,
+ 32, 2, 64, 74, 78, 80, 94, 66, 68, 44,
+ 42, 6, 22, 6, 29, 119, 20, 14, 4, 60,
+ 26, 4, 29, 21, 17, 17, 23, 15, 0, 13,
+ 23, 17, 7, 20, 8, 22, 9, 124, 124, 124,
+ 124, 112, 102, 80, 50, 1, 15, 52, 38, 28,
+ 14, 8, 0, 7, 9, 31, 29, 21, 17, 17,
+ 23, 15, 0, 13, 23, 17, 7, 20, 8, 22,
+ 9, 124, 124, 124, 124, 112, 102, 80, 50, 1,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 1 */
+
+ 124, 18, 21, 124, 18, 21, 123, 77, 22, 20,
+ 24, 92, 124, 124, 26, 4, 67, 92, 41, 73,
+ 12, 12, 15, 12, 44, 104, 124, 120, 38, 67,
+ 123, 103, 19, 41, 73, 12, 57, 47, 40, 16,
+ 49, 77, 101, 10, 8, 41, 65, 0, 41, 83,
+ 0, 23, 51, 73, 16, 29, 21, 65, 28, 6,
+ 44, 0, 0, 0, 37, 45, 67, 15, 44, 2,
+ 58, 47, 123, 121, 51, 61, 37, 41, 49, 51,
+ 123, 23, 75, 51, 121, 33, 25, 41, 29, 18,
+ 4, 37, 121, 29, 79, 53, 123, 3, 29, 17,
+ 55, 16, 9, 13, 67, 43, 47, 35, 15, 5,
+ 15, 49, 9, 10, 5, 12, 13, 13, 10, 19,
+ 40, 9, 2, 26, 34, 44, 46, 22, 25, 27,
+ 37, 22, 7, 51, 7, 15, 21, 31, 35, 2,
+ 66, 2, 57, 23, 1, 19, 58, 74, 55, 3,
+ 9, 27, 3, 2, 7, 31, 66, 112, 82, 17,
+ 7, 117, 50, 124, 124, 123, 48, 42, 58, 68,
+ 64, 52, 42, 46, 60, 40, 54, 32, 16, 10,
+ 6, 38, 38, 42, 30, 14, 22, 52, 28, 8,
+ 30, 36, 11, 58, 0, 124, 124, 124, 104, 124,
+ 124, 124, 124, 90, 74, 64, 58, 92, 84, 21,
+ 56, 62, 36, 92, 54, 54, 68, 82, 84, 100,
+ 92, 40, 59, 14, 12, 48, 123, 99, 33, 4,
+ 20, 8, 41, 49, 59, 55, 123, 69, 14, 5,
+ 13, 25, 39, 47, 77, 67, 121, 35, 32, 6,
+ 3, 11, 21, 29, 37, 55, 85, 29, 7, 21,
+ 12, 10, 27, 37, 33, 69, 33, 52, 4, 12,
+ 10, 17, 23, 43, 37, 45, 124, 123, 123, 109,
+ 123, 97, 103, 105, 103, 95, 105, 109, 115, 59,
+ 75, 69, 83, 119, 55, 10, 43, 59, 53, 25,
+ 15, 17, 1, 33, 1, 12, 7, 9, 5, 9,
+ 27, 1, 9, 27, 16, 8, 3, 18, 38, 6,
+ 13, 23, 41, 13, 1, 11, 16, 14, 11, 3,
+ 19, 18, 18, 23, 35, 25, 25, 40, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 100, 124, 124, 124, 124, 124, 124, 94,
+ 120, 120, 90, 48, 34, 18, 31, 124, 124, 124,
+ 120, 92, 94, 74, 78, 92, 86, 68, 40, 66,
+ 30, 0, 62, 72, 74, 78, 92, 64, 66, 42,
+ 40, 4, 22, 6, 29, 117, 18, 12, 2, 58,
+ 24, 2, 27, 19, 15, 15, 19, 13, 2, 11,
+ 19, 15, 5, 22, 10, 24, 7, 124, 124, 124,
+ 124, 108, 100, 76, 48, 3, 13, 54, 40, 30,
+ 16, 10, 2, 5, 7, 29, 27, 19, 15, 15,
+ 19, 13, 2, 11, 19, 15, 5, 22, 10, 24,
+ 7, 124, 124, 124, 124, 108, 100, 76, 48, 3,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 2 */
+
+ 124, 18, 21, 124, 18, 21, 119, 75, 22, 20,
+ 24, 88, 120, 124, 28, 4, 63, 88, 41, 71,
+ 12, 12, 13, 10, 42, 102, 120, 114, 34, 69,
+ 119, 101, 19, 41, 71, 12, 57, 45, 40, 16,
+ 47, 75, 99, 8, 6, 41, 65, 0, 41, 81,
+ 0, 23, 51, 73, 16, 29, 21, 63, 28, 6,
+ 44, 0, 0, 0, 35, 45, 67, 15, 42, 2,
+ 58, 45, 121, 117, 49, 59, 33, 37, 47, 49,
+ 119, 21, 73, 49, 117, 35, 25, 41, 29, 18,
+ 4, 35, 117, 29, 77, 51, 119, 3, 29, 17,
+ 55, 16, 9, 13, 65, 43, 45, 35, 15, 5,
+ 15, 47, 7, 10, 5, 12, 13, 13, 10, 19,
+ 40, 9, 2, 26, 34, 44, 46, 22, 27, 25,
+ 35, 20, 7, 51, 7, 13, 21, 31, 33, 4,
+ 68, 6, 53, 25, 3, 19, 58, 74, 57, 3,
+ 9, 29, 1, 2, 7, 29, 66, 112, 82, 15,
+ 9, 115, 48, 124, 124, 121, 48, 42, 58, 66,
+ 62, 52, 42, 46, 58, 38, 52, 32, 16, 10,
+ 6, 36, 36, 40, 30, 14, 22, 50, 26, 6,
+ 28, 34, 11, 56, 1, 124, 124, 124, 100, 120,
+ 124, 124, 124, 88, 70, 60, 54, 88, 80, 23,
+ 54, 60, 32, 90, 52, 52, 66, 78, 80, 96,
+ 88, 36, 59, 12, 10, 44, 121, 97, 31, 6,
+ 20, 8, 39, 47, 57, 53, 119, 67, 16, 3,
+ 11, 23, 37, 45, 75, 65, 117, 33, 32, 6,
+ 3, 11, 19, 27, 35, 53, 83, 29, 5, 19,
+ 12, 10, 25, 35, 33, 67, 31, 52, 6, 12,
+ 10, 15, 21, 41, 35, 43, 124, 121, 119, 105,
+ 119, 95, 101, 101, 99, 93, 101, 105, 111, 57,
+ 73, 67, 81, 113, 55, 8, 43, 57, 51, 25,
+ 15, 17, 1, 33, 1, 10, 7, 9, 3, 9,
+ 27, 1, 9, 27, 16, 8, 3, 16, 36, 6,
+ 13, 23, 39, 15, 1, 9, 14, 14, 11, 3,
+ 19, 18, 18, 23, 33, 25, 25, 36, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 96, 124, 124, 124, 124, 124, 122, 90,
+ 116, 116, 86, 46, 32, 16, 31, 124, 124, 124,
+ 116, 88, 90, 70, 74, 88, 82, 64, 36, 62,
+ 26, 1, 60, 70, 70, 74, 88, 60, 62, 40,
+ 38, 2, 20, 4, 29, 115, 16, 10, 1, 56,
+ 22, 0, 27, 19, 13, 13, 17, 11, 4, 11,
+ 17, 13, 3, 22, 12, 26, 5, 124, 124, 124,
+ 120, 104, 96, 72, 44, 5, 11, 54, 40, 32,
+ 18, 12, 2, 3, 7, 27, 27, 19, 13, 13,
+ 17, 11, 4, 11, 17, 13, 3, 22, 12, 26,
+ 5, 124, 124, 124, 120, 104, 96, 72, 44, 5,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 3 */
+
+ 124, 18, 21, 124, 18, 21, 115, 71, 24, 20,
+ 22, 84, 118, 122, 28, 4, 59, 86, 41, 67,
+ 12, 10, 11, 8, 40, 100, 116, 106, 30, 71,
+ 115, 97, 19, 41, 67, 12, 55, 43, 42, 16,
+ 45, 73, 97, 6, 4, 41, 67, 0, 41, 79,
+ 0, 25, 51, 73, 16, 29, 21, 61, 30, 6,
+ 44, 0, 0, 0, 35, 45, 67, 13, 40, 2,
+ 56, 45, 119, 113, 47, 57, 31, 35, 45, 47,
+ 115, 19, 71, 47, 113, 37, 25, 41, 29, 20,
+ 4, 33, 113, 29, 75, 49, 115, 3, 29, 17,
+ 55, 18, 9, 11, 63, 43, 43, 35, 15, 5,
+ 13, 45, 7, 10, 5, 12, 13, 13, 10, 19,
+ 40, 9, 2, 28, 34, 46, 46, 24, 27, 25,
+ 33, 20, 7, 51, 7, 11, 21, 29, 31, 6,
+ 70, 8, 49, 25, 5, 17, 58, 74, 59, 3,
+ 7, 29, 1, 2, 7, 27, 66, 112, 82, 13,
+ 11, 111, 46, 124, 124, 117, 48, 42, 56, 64,
+ 62, 50, 40, 46, 58, 36, 50, 32, 16, 10,
+ 4, 36, 34, 38, 28, 14, 22, 48, 26, 4,
+ 28, 32, 11, 54, 1, 124, 124, 122, 98, 116,
+ 124, 124, 124, 86, 66, 56, 52, 84, 76, 27,
+ 52, 58, 28, 88, 50, 50, 64, 76, 76, 92,
+ 84, 34, 59, 10, 8, 42, 117, 93, 27, 6,
+ 20, 8, 37, 45, 55, 51, 115, 65, 18, 1,
+ 9, 23, 35, 43, 71, 63, 113, 33, 34, 8,
+ 1, 9, 17, 27, 35, 51, 81, 29, 1, 17,
+ 12, 10, 23, 35, 33, 65, 29, 54, 8, 14,
+ 10, 13, 21, 39, 35, 43, 124, 117, 117, 103,
+ 115, 93, 97, 99, 97, 89, 97, 101, 107, 57,
+ 71, 67, 79, 107, 55, 6, 43, 55, 49, 25,
+ 15, 17, 1, 31, 1, 8, 7, 9, 3, 9,
+ 27, 1, 9, 27, 14, 8, 3, 14, 34, 6,
+ 13, 23, 37, 17, 1, 7, 12, 14, 11, 3,
+ 17, 18, 16, 21, 31, 25, 25, 34, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 92, 124, 124, 124, 124, 124, 118, 86,
+ 112, 110, 82, 44, 30, 14, 31, 124, 124, 124,
+ 112, 84, 86, 68, 70, 84, 78, 60, 32, 58,
+ 22, 3, 58, 68, 66, 72, 84, 58, 58, 36,
+ 34, 0, 18, 2, 29, 113, 14, 6, 3, 54,
+ 20, 1, 27, 17, 13, 13, 15, 9, 6, 11,
+ 15, 11, 1, 24, 14, 26, 3, 124, 124, 124,
+ 116, 100, 92, 68, 40, 7, 11, 56, 42, 34,
+ 18, 14, 4, 3, 5, 27, 27, 17, 13, 13,
+ 15, 9, 6, 11, 15, 11, 1, 24, 14, 26,
+ 3, 124, 124, 124, 116, 100, 92, 68, 40, 7,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 4 */
+
+ 124, 18, 21, 124, 18, 21, 113, 69, 24, 20,
+ 22, 80, 114, 120, 30, 4, 57, 82, 41, 65,
+ 10, 10, 9, 6, 36, 96, 112, 100, 24, 73,
+ 111, 95, 19, 41, 65, 10, 55, 41, 42, 14,
+ 45, 71, 93, 4, 0, 43, 67, 0, 39, 77,
+ 1, 25, 51, 73, 16, 29, 21, 61, 30, 6,
+ 44, 0, 0, 0, 33, 47, 67, 13, 38, 2,
+ 56, 43, 117, 109, 45, 55, 27, 31, 45, 45,
+ 111, 17, 69, 45, 107, 37, 27, 41, 31, 20,
+ 2, 31, 107, 27, 75, 49, 111, 3, 29, 17,
+ 55, 18, 9, 11, 61, 43, 43, 33, 15, 5,
+ 13, 43, 5, 10, 7, 10, 13, 13, 10, 19,
+ 40, 9, 2, 28, 34, 46, 46, 24, 29, 23,
+ 33, 18, 7, 49, 7, 9, 19, 29, 27, 10,
+ 72, 12, 45, 27, 7, 17, 60, 74, 61, 3,
+ 7, 31, 0, 2, 7, 25, 66, 112, 82, 9,
+ 13, 109, 44, 124, 124, 115, 46, 42, 56, 64,
+ 60, 50, 40, 46, 56, 34, 48, 30, 16, 10,
+ 4, 34, 34, 36, 28, 12, 20, 46, 24, 2,
+ 26, 30, 11, 50, 3, 124, 124, 118, 94, 114,
+ 124, 124, 124, 84, 62, 50, 48, 80, 72, 29,
+ 48, 56, 26, 86, 48, 48, 60, 72, 72, 88,
+ 82, 30, 59, 8, 6, 38, 115, 91, 25, 8,
+ 20, 8, 35, 43, 53, 51, 111, 61, 20, 1,
+ 9, 21, 31, 41, 69, 61, 107, 31, 34, 8,
+ 1, 9, 15, 25, 33, 51, 79, 29, 0, 15,
+ 12, 10, 21, 33, 33, 63, 27, 54, 10, 14,
+ 10, 11, 19, 37, 33, 41, 124, 115, 113, 99,
+ 113, 91, 95, 95, 93, 87, 95, 97, 101, 55,
+ 69, 65, 77, 101, 53, 4, 41, 53, 49, 25,
+ 15, 17, 3, 31, 3, 6, 7, 9, 1, 9,
+ 27, 1, 9, 25, 14, 6, 3, 12, 32, 4,
+ 13, 23, 35, 19, 3, 7, 12, 12, 11, 3,
+ 17, 16, 16, 21, 31, 25, 25, 30, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 88, 124, 124, 124, 124, 124, 114, 82,
+ 108, 106, 78, 40, 28, 12, 31, 124, 124, 124,
+ 108, 80, 82, 64, 66, 80, 74, 56, 28, 52,
+ 20, 7, 56, 66, 60, 68, 82, 54, 54, 34,
+ 32, 1, 16, 0, 29, 111, 10, 4, 7, 50,
+ 18, 3, 27, 17, 11, 11, 13, 9, 6, 9,
+ 13, 9, 0, 24, 16, 28, 3, 124, 124, 120,
+ 112, 96, 88, 62, 36, 11, 9, 56, 42, 34,
+ 20, 14, 4, 1, 5, 25, 27, 17, 11, 11,
+ 13, 9, 6, 9, 13, 9, 0, 24, 16, 28,
+ 3, 124, 124, 120, 112, 96, 88, 62, 36, 11,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 5 */
+
+ 124, 18, 21, 124, 18, 21, 109, 65, 24, 20,
+ 20, 76, 112, 118, 32, 4, 53, 78, 39, 61,
+ 10, 10, 7, 4, 34, 94, 108, 94, 20, 73,
+ 107, 93, 19, 39, 61, 10, 55, 39, 42, 14,
+ 43, 69, 91, 2, 1, 43, 67, 0, 39, 75,
+ 1, 25, 51, 73, 16, 27, 21, 59, 32, 6,
+ 44, 0, 0, 0, 33, 47, 67, 11, 36, 2,
+ 54, 43, 113, 103, 43, 53, 25, 29, 43, 43,
+ 107, 15, 67, 43, 103, 39, 27, 41, 31, 20,
+ 2, 29, 103, 27, 73, 47, 107, 3, 29, 17,
+ 53, 18, 9, 9, 59, 41, 41, 33, 15, 3,
+ 11, 41, 5, 10, 7, 10, 11, 13, 10, 19,
+ 42, 9, 2, 30, 36, 46, 46, 24, 29, 23,
+ 31, 18, 7, 49, 7, 7, 19, 27, 25, 12,
+ 74, 14, 41, 27, 9, 15, 60, 74, 63, 3,
+ 5, 31, 2, 2, 7, 21, 66, 112, 82, 7,
+ 15, 105, 42, 124, 124, 113, 46, 42, 54, 62,
+ 60, 50, 38, 46, 56, 32, 46, 30, 16, 10,
+ 4, 34, 32, 34, 26, 12, 20, 44, 24, 0,
+ 24, 30, 11, 48, 3, 124, 124, 116, 92, 110,
+ 124, 124, 124, 82, 58, 46, 46, 76, 68, 31,
+ 46, 54, 22, 84, 46, 46, 58, 70, 68, 84,
+ 78, 28, 59, 6, 4, 34, 111, 87, 23, 8,
+ 20, 8, 33, 41, 51, 49, 107, 59, 22, 0,
+ 7, 19, 29, 39, 65, 59, 103, 29, 36, 10,
+ 0, 7, 13, 23, 33, 49, 77, 27, 2, 13,
+ 12, 10, 19, 33, 31, 61, 25, 54, 12, 14,
+ 10, 9, 17, 35, 33, 39, 124, 113, 111, 97,
+ 109, 89, 91, 93, 89, 83, 91, 93, 97, 53,
+ 67, 63, 75, 95, 53, 2, 41, 51, 47, 25,
+ 15, 17, 3, 29, 3, 4, 7, 9, 0, 9,
+ 27, 1, 9, 25, 12, 6, 3, 10, 30, 4,
+ 13, 23, 33, 19, 3, 5, 10, 12, 11, 3,
+ 17, 16, 14, 21, 29, 25, 25, 28, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 84, 124, 124, 124, 124, 124, 110, 80,
+ 104, 100, 74, 38, 26, 10, 31, 124, 124, 124,
+ 104, 76, 78, 62, 62, 76, 70, 52, 24, 48,
+ 16, 9, 54, 64, 56, 66, 78, 52, 50, 32,
+ 30, 3, 14, 1, 29, 109, 8, 2, 9, 48,
+ 16, 5, 27, 15, 11, 9, 11, 7, 8, 9,
+ 11, 7, 2, 26, 18, 28, 1, 124, 124, 116,
+ 108, 92, 84, 58, 32, 13, 9, 58, 44, 36,
+ 22, 16, 6, 1, 5, 23, 27, 15, 11, 9,
+ 11, 7, 8, 9, 11, 7, 2, 26, 18, 28,
+ 1, 124, 124, 116, 108, 92, 84, 58, 32, 13,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 6 */
+
+ 124, 18, 23, 124, 18, 23, 105, 63, 26, 20,
+ 20, 74, 108, 116, 32, 6, 49, 76, 39, 59,
+ 10, 8, 5, 2, 32, 92, 106, 86, 16, 75,
+ 103, 89, 19, 39, 59, 10, 53, 37, 44, 14,
+ 41, 67, 89, 1, 3, 43, 69, 0, 39, 75,
+ 1, 27, 49, 73, 14, 27, 21, 57, 32, 6,
+ 44, 0, 0, 0, 31, 47, 67, 11, 36, 0,
+ 54, 41, 111, 99, 41, 51, 21, 25, 41, 41,
+ 103, 13, 65, 43, 99, 41, 27, 41, 31, 22,
+ 2, 27, 99, 27, 71, 45, 103, 3, 29, 17,
+ 53, 20, 11, 9, 59, 41, 39, 33, 13, 3,
+ 11, 39, 3, 10, 7, 10, 11, 13, 10, 19,
+ 42, 9, 2, 30, 36, 48, 48, 26, 31, 21,
+ 29, 16, 7, 49, 7, 5, 19, 27, 23, 14,
+ 74, 18, 39, 29, 11, 15, 60, 74, 63, 5,
+ 5, 33, 2, 0, 5, 19, 66, 112, 84, 5,
+ 17, 103, 40, 124, 124, 109, 46, 42, 54, 60,
+ 58, 48, 38, 44, 54, 32, 46, 30, 14, 10,
+ 2, 32, 30, 32, 26, 12, 20, 44, 22, 3,
+ 24, 28, 11, 46, 5, 124, 124, 112, 88, 106,
+ 124, 124, 124, 78, 54, 42, 42, 72, 64, 35,
+ 44, 50, 18, 80, 44, 44, 56, 66, 64, 80,
+ 74, 24, 59, 4, 2, 32, 109, 85, 19, 10,
+ 20, 8, 31, 41, 51, 47, 105, 57, 24, 2,
+ 5, 19, 27, 37, 63, 57, 99, 29, 36, 10,
+ 0, 7, 11, 23, 31, 47, 75, 27, 6, 11,
+ 12, 10, 19, 31, 31, 61, 25, 56, 12, 16,
+ 10, 7, 17, 35, 31, 39, 124, 109, 107, 93,
+ 105, 85, 89, 89, 87, 81, 87, 89, 93, 53,
+ 65, 63, 75, 89, 53, 0, 41, 51, 45, 25,
+ 15, 17, 3, 29, 3, 2, 7, 9, 0, 9,
+ 27, 1, 9, 25, 12, 6, 3, 8, 28, 4,
+ 13, 23, 31, 21, 3, 3, 8, 12, 11, 3,
+ 15, 16, 14, 19, 27, 25, 25, 24, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 80, 124, 124, 124, 124, 124, 104, 76,
+ 100, 96, 70, 36, 24, 8, 31, 124, 124, 124,
+ 100, 72, 76, 58, 58, 72, 64, 48, 20, 44,
+ 12, 11, 52, 60, 52, 62, 74, 48, 46, 28,
+ 26, 5, 12, 3, 31, 107, 6, 1, 13, 46,
+ 12, 7, 25, 15, 9, 9, 9, 5, 10, 9,
+ 9, 5, 4, 26, 20, 30, 0, 124, 124, 112,
+ 104, 88, 80, 54, 28, 15, 7, 58, 44, 38,
+ 22, 18, 6, 0, 3, 23, 25, 15, 9, 9,
+ 9, 5, 10, 9, 9, 5, 4, 26, 20, 30,
+ 0, 124, 124, 112, 104, 88, 80, 54, 28, 15,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 7 */
+
+ 124, 18, 23, 124, 18, 23, 101, 59, 26, 20,
+ 18, 70, 106, 114, 34, 6, 47, 72, 39, 55,
+ 8, 8, 3, 0, 30, 90, 102, 80, 10, 77,
+ 99, 87, 19, 39, 55, 8, 53, 35, 44, 14,
+ 41, 65, 85, 3, 5, 43, 69, 0, 37, 73,
+ 3, 27, 49, 73, 14, 27, 21, 55, 34, 6,
+ 44, 0, 0, 0, 31, 47, 67, 9, 34, 0,
+ 52, 41, 109, 95, 39, 49, 19, 23, 39, 39,
+ 99, 11, 63, 41, 93, 41, 29, 41, 33, 22,
+ 2, 25, 93, 25, 71, 45, 99, 3, 29, 17,
+ 53, 20, 11, 7, 57, 41, 37, 31, 13, 3,
+ 9, 37, 3, 10, 9, 10, 11, 13, 10, 19,
+ 42, 9, 2, 32, 36, 48, 48, 26, 31, 21,
+ 29, 16, 7, 47, 7, 3, 17, 25, 19, 18,
+ 76, 20, 35, 29, 13, 13, 62, 74, 65, 5,
+ 3, 33, 4, 0, 5, 17, 66, 112, 84, 1,
+ 19, 99, 38, 124, 124, 107, 46, 42, 52, 60,
+ 58, 48, 36, 44, 54, 30, 44, 30, 14, 10,
+ 2, 32, 30, 30, 24, 12, 20, 42, 22, 5,
+ 22, 26, 11, 44, 5, 124, 124, 108, 86, 104,
+ 124, 124, 124, 76, 50, 38, 40, 68, 60, 37,
+ 42, 48, 16, 78, 42, 42, 52, 64, 60, 76,
+ 72, 22, 59, 2, 0, 28, 105, 81, 17, 10,
+ 20, 8, 29, 39, 49, 47, 101, 53, 26, 4,
+ 5, 17, 23, 35, 59, 55, 93, 27, 38, 12,
+ 2, 5, 9, 21, 31, 45, 73, 27, 8, 9,
+ 12, 10, 17, 31, 31, 59, 23, 56, 14, 16,
+ 10, 5, 15, 33, 31, 37, 124, 107, 105, 91,
+ 103, 83, 85, 87, 83, 77, 83, 85, 87, 51,
+ 63, 61, 73, 83, 51, 1, 39, 49, 43, 25,
+ 15, 17, 3, 27, 5, 0, 7, 9, 2, 9,
+ 27, 1, 9, 23, 10, 4, 3, 6, 26, 2,
+ 13, 23, 29, 23, 5, 1, 8, 10, 11, 3,
+ 15, 14, 12, 19, 27, 25, 25, 22, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 76, 124, 124, 124, 124, 124, 100, 72,
+ 96, 90, 66, 34, 22, 6, 31, 124, 122, 124,
+ 96, 68, 72, 56, 54, 68, 60, 44, 16, 40,
+ 10, 15, 50, 58, 48, 60, 72, 46, 42, 26,
+ 24, 7, 10, 5, 31, 105, 2, 3, 15, 42,
+ 10, 9, 25, 13, 9, 7, 7, 3, 10, 7,
+ 7, 3, 6, 28, 22, 30, 0, 124, 120, 108,
+ 100, 84, 76, 48, 24, 17, 7, 60, 46, 38,
+ 24, 20, 8, 0, 3, 21, 25, 13, 9, 7,
+ 7, 3, 10, 7, 7, 3, 6, 28, 22, 30,
+ 0, 124, 120, 108, 100, 84, 76, 48, 24, 17,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 8 */
+
+ 124, 16, 23, 124, 16, 23, 99, 57, 26, 20,
+ 18, 66, 102, 112, 34, 6, 43, 68, 39, 53,
+ 8, 6, 1, 1, 26, 86, 98, 72, 6, 79,
+ 95, 85, 19, 39, 53, 8, 53, 35, 44, 12,
+ 39, 63, 83, 5, 9, 45, 71, 0, 37, 71,
+ 3, 29, 49, 73, 14, 27, 21, 55, 34, 6,
+ 44, 0, 0, 0, 29, 49, 67, 9, 32, 0,
+ 52, 39, 107, 91, 37, 49, 15, 19, 39, 37,
+ 95, 11, 61, 39, 89, 43, 29, 43, 33, 22,
+ 0, 25, 89, 25, 69, 43, 97, 3, 29, 17,
+ 53, 20, 11, 7, 55, 41, 37, 31, 13, 3,
+ 9, 35, 1, 10, 9, 8, 11, 13, 8, 19,
+ 42, 9, 2, 32, 36, 48, 48, 26, 33, 19,
+ 27, 14, 7, 47, 7, 1, 17, 25, 17, 20,
+ 78, 24, 31, 31, 15, 13, 62, 74, 67, 5,
+ 3, 35, 4, 0, 5, 15, 66, 112, 84, 0,
+ 21, 97, 36, 118, 124, 105, 44, 42, 52, 58,
+ 56, 46, 36, 44, 52, 28, 42, 28, 14, 8,
+ 0, 30, 28, 28, 24, 10, 18, 40, 20, 7,
+ 20, 24, 11, 40, 7, 124, 124, 104, 82, 100,
+ 120, 124, 124, 74, 46, 32, 36, 62, 56, 41,
+ 38, 46, 12, 76, 40, 40, 50, 60, 56, 72,
+ 68, 18, 59, 0, 1, 24, 103, 79, 15, 12,
+ 20, 8, 29, 37, 47, 45, 97, 51, 26, 4,
+ 3, 17, 21, 33, 57, 53, 89, 27, 38, 12,
+ 2, 5, 9, 21, 29, 45, 71, 27, 10, 7,
+ 12, 10, 15, 29, 31, 57, 21, 56, 16, 16,
+ 10, 3, 15, 31, 29, 37, 124, 105, 101, 87,
+ 99, 81, 83, 83, 81, 75, 81, 81, 83, 51,
+ 61, 61, 71, 77, 51, 3, 39, 47, 43, 25,
+ 15, 17, 5, 27, 5, 1, 7, 9, 2, 9,
+ 27, 3, 9, 23, 10, 4, 5, 4, 24, 2,
+ 15, 23, 27, 25, 5, 1, 6, 10, 11, 5,
+ 15, 14, 12, 19, 25, 25, 25, 18, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 122, 72, 124, 124, 124, 124, 122, 96, 68,
+ 90, 86, 62, 30, 18, 4, 31, 122, 118, 124,
+ 92, 62, 68, 52, 48, 64, 56, 40, 12, 34,
+ 6, 17, 46, 56, 42, 56, 68, 42, 38, 22,
+ 20, 9, 8, 7, 31, 103, 0, 7, 19, 40,
+ 8, 11, 25, 13, 7, 7, 5, 3, 12, 7,
+ 5, 3, 8, 28, 22, 32, 2, 122, 116, 104,
+ 96, 80, 72, 44, 20, 21, 5, 60, 46, 40,
+ 24, 20, 8, 2, 3, 21, 25, 13, 7, 7,
+ 5, 3, 12, 7, 5, 3, 8, 28, 22, 32,
+ 2, 122, 116, 104, 96, 80, 72, 44, 20, 21,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 9 */
+
+ 124, 16, 23, 124, 16, 23, 95, 55, 28, 20,
+ 18, 62, 98, 112, 36, 6, 39, 66, 37, 49,
+ 8, 6, 0, 1, 24, 84, 94, 66, 2, 79,
+ 91, 81, 17, 37, 49, 8, 51, 33, 46, 12,
+ 37, 61, 81, 7, 11, 45, 71, 0, 37, 69,
+ 3, 29, 49, 73, 14, 25, 19, 53, 34, 6,
+ 44, 0, 0, 0, 27, 49, 67, 9, 30, 0,
+ 52, 37, 103, 85, 35, 47, 11, 15, 37, 35,
+ 91, 9, 57, 37, 85, 45, 29, 43, 33, 24,
+ 0, 23, 85, 25, 67, 41, 93, 3, 27, 17,
+ 51, 22, 11, 5, 53, 39, 35, 31, 13, 1,
+ 7, 33, 0, 10, 9, 8, 9, 11, 8, 19,
+ 44, 9, 2, 32, 38, 50, 48, 28, 33, 17,
+ 25, 12, 7, 47, 7, 0, 17, 23, 15, 22,
+ 80, 28, 27, 33, 17, 11, 62, 76, 69, 5,
+ 3, 35, 6, 0, 5, 11, 66, 112, 84, 2,
+ 23, 95, 34, 114, 124, 101, 44, 42, 52, 56,
+ 56, 46, 36, 44, 52, 26, 40, 28, 14, 8,
+ 0, 30, 26, 28, 24, 10, 18, 38, 18, 9,
+ 20, 24, 11, 38, 7, 124, 124, 102, 80, 96,
+ 116, 124, 124, 72, 42, 28, 34, 58, 54, 43,
+ 36, 44, 8, 74, 38, 38, 48, 56, 54, 68,
+ 64, 16, 59, 0, 3, 22, 99, 75, 11, 14,
+ 20, 8, 27, 35, 45, 43, 93, 49, 28, 6,
+ 1, 15, 19, 31, 55, 51, 85, 25, 40, 14,
+ 4, 5, 7, 19, 27, 43, 67, 25, 14, 5,
+ 14, 10, 13, 27, 29, 55, 19, 58, 18, 18,
+ 12, 1, 13, 29, 27, 35, 124, 101, 97, 83,
+ 95, 79, 81, 79, 77, 71, 77, 77, 79, 49,
+ 59, 59, 69, 69, 51, 5, 39, 45, 41, 23,
+ 15, 17, 5, 27, 5, 3, 7, 9, 4, 9,
+ 27, 3, 9, 23, 10, 4, 5, 4, 22, 2,
+ 15, 21, 23, 25, 5, 0, 4, 10, 11, 5,
+ 13, 14, 12, 17, 23, 23, 23, 14, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 116, 68, 124, 124, 124, 124, 116, 92, 66,
+ 86, 82, 60, 28, 16, 2, 31, 118, 114, 120,
+ 88, 58, 64, 50, 44, 60, 52, 36, 8, 30,
+ 2, 19, 44, 54, 38, 54, 64, 40, 34, 20,
+ 18, 11, 6, 7, 31, 101, 1, 9, 23, 38,
+ 6, 13, 25, 11, 5, 5, 1, 1, 14, 7,
+ 3, 1, 10, 30, 24, 34, 4, 120, 114, 100,
+ 92, 76, 68, 40, 16, 23, 3, 60, 48, 42,
+ 26, 22, 10, 4, 1, 19, 25, 11, 5, 5,
+ 1, 1, 14, 7, 3, 1, 10, 30, 24, 34,
+ 4, 120, 114, 100, 92, 76, 68, 40, 16, 23,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 10 */
+
+ 124, 16, 23, 124, 16, 23, 91, 51, 28, 20,
+ 16, 58, 96, 110, 38, 6, 37, 62, 37, 47,
+ 6, 6, 2, 3, 22, 82, 90, 60, 3, 81,
+ 87, 79, 17, 37, 47, 6, 51, 31, 46, 12,
+ 37, 59, 77, 9, 13, 45, 71, 0, 35, 67,
+ 5, 29, 49, 73, 14, 25, 19, 51, 36, 6,
+ 44, 0, 0, 0, 27, 49, 67, 7, 28, 0,
+ 50, 37, 101, 81, 33, 45, 9, 13, 35, 33,
+ 87, 7, 55, 35, 79, 45, 31, 43, 35, 24,
+ 0, 21, 79, 23, 67, 41, 89, 3, 27, 17,
+ 51, 22, 11, 5, 51, 39, 33, 29, 13, 1,
+ 7, 31, 0, 10, 11, 8, 9, 11, 8, 19,
+ 44, 9, 2, 34, 38, 50, 48, 28, 35, 17,
+ 25, 12, 7, 45, 7, 2, 15, 23, 11, 26,
+ 82, 30, 23, 33, 19, 11, 64, 76, 71, 5,
+ 1, 37, 8, 0, 5, 9, 66, 112, 84, 6,
+ 25, 91, 32, 108, 124, 99, 44, 42, 50, 56,
+ 54, 46, 34, 44, 50, 24, 38, 28, 14, 8,
+ 0, 28, 26, 26, 22, 10, 18, 36, 18, 11,
+ 18, 22, 11, 36, 9, 120, 124, 98, 76, 94,
+ 112, 124, 124, 70, 38, 24, 30, 54, 50, 45,
+ 34, 42, 6, 72, 36, 36, 44, 54, 50, 64,
+ 62, 12, 59, 1, 5, 18, 97, 73, 9, 14,
+ 20, 8, 25, 33, 43, 43, 89, 45, 30, 8,
+ 1, 13, 15, 29, 51, 49, 79, 23, 40, 14,
+ 4, 3, 5, 17, 27, 41, 65, 25, 16, 3,
+ 14, 10, 11, 27, 29, 53, 17, 58, 20, 18,
+ 12, 0, 11, 27, 27, 33, 124, 99, 95, 81,
+ 93, 77, 77, 77, 73, 69, 73, 73, 73, 47,
+ 57, 57, 67, 63, 49, 7, 37, 43, 39, 23,
+ 15, 17, 5, 25, 7, 5, 7, 9, 6, 9,
+ 27, 3, 9, 21, 8, 2, 5, 2, 20, 0,
+ 15, 21, 21, 27, 7, 2, 4, 8, 11, 5,
+ 13, 12, 10, 17, 23, 23, 23, 12, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 112, 64, 124, 124, 124, 124, 110, 88, 62,
+ 82, 76, 56, 26, 14, 0, 31, 114, 108, 114,
+ 84, 54, 60, 46, 40, 56, 48, 32, 4, 26,
+ 0, 23, 42, 52, 34, 50, 62, 36, 30, 18,
+ 16, 13, 4, 9, 31, 99, 5, 11, 25, 34,
+ 4, 15, 25, 11, 5, 3, 0, 0, 14, 5,
+ 1, 0, 12, 30, 26, 34, 4, 120, 110, 96,
+ 88, 72, 64, 34, 12, 25, 3, 62, 48, 42,
+ 28, 24, 10, 4, 1, 17, 25, 11, 5, 3,
+ 0, 0, 14, 5, 1, 0, 12, 30, 26, 34,
+ 4, 120, 110, 96, 88, 72, 64, 34, 12, 25,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 11 */
+
+ 124, 16, 25, 124, 16, 25, 87, 49, 30, 20,
+ 16, 56, 92, 108, 38, 8, 33, 60, 37, 43,
+ 6, 4, 4, 5, 20, 80, 88, 52, 7, 83,
+ 83, 75, 17, 37, 43, 6, 49, 29, 48, 12,
+ 35, 57, 75, 13, 15, 45, 73, 0, 35, 67,
+ 5, 31, 47, 73, 12, 25, 19, 49, 36, 6,
+ 44, 0, 0, 0, 25, 49, 67, 7, 28, 1,
+ 50, 35, 99, 77, 31, 43, 5, 9, 33, 31,
+ 83, 5, 53, 35, 75, 47, 31, 43, 35, 26,
+ 0, 19, 75, 23, 65, 39, 85, 3, 27, 17,
+ 51, 24, 13, 3, 51, 39, 31, 29, 11, 1,
+ 5, 29, 2, 10, 11, 8, 9, 11, 8, 19,
+ 44, 9, 2, 34, 38, 52, 50, 30, 35, 15,
+ 23, 10, 7, 45, 7, 4, 15, 21, 9, 28,
+ 82, 34, 21, 35, 21, 9, 64, 76, 71, 7,
+ 1, 37, 8, 1, 3, 7, 66, 112, 86, 8,
+ 27, 89, 30, 102, 124, 95, 44, 42, 50, 54,
+ 54, 44, 34, 42, 50, 24, 38, 28, 12, 8,
+ 1, 28, 24, 24, 22, 10, 18, 36, 16, 15,
+ 18, 20, 11, 34, 9, 114, 124, 94, 74, 90,
+ 108, 124, 122, 66, 34, 20, 28, 50, 46, 49,
+ 32, 38, 2, 68, 34, 34, 42, 50, 46, 60,
+ 58, 10, 59, 3, 7, 16, 93, 69, 5, 16,
+ 20, 8, 23, 33, 43, 41, 87, 43, 32, 10,
+ 0, 13, 13, 27, 49, 47, 75, 23, 42, 16,
+ 6, 3, 3, 17, 25, 39, 63, 25, 20, 1,
+ 14, 10, 11, 25, 29, 53, 17, 60, 20, 20,
+ 12, 2, 11, 27, 25, 33, 124, 95, 91, 77,
+ 89, 73, 75, 73, 71, 65, 69, 69, 69, 47,
+ 55, 57, 67, 57, 49, 9, 37, 43, 37, 23,
+ 15, 17, 5, 25, 7, 7, 7, 9, 6, 9,
+ 27, 3, 9, 21, 8, 2, 5, 0, 18, 0,
+ 15, 21, 19, 29, 7, 4, 2, 8, 11, 5,
+ 11, 12, 10, 15, 21, 23, 23, 8, 124, 122,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 106, 60, 124, 124, 124, 124, 106, 82, 58,
+ 78, 72, 52, 24, 12, 1, 31, 110, 104, 110,
+ 80, 50, 58, 44, 36, 52, 42, 28, 0, 22,
+ 3, 25, 40, 48, 30, 48, 58, 34, 26, 14,
+ 12, 15, 2, 11, 33, 97, 7, 15, 29, 32,
+ 0, 17, 23, 9, 3, 3, 2, 2, 16, 5,
+ 0, 2, 14, 32, 28, 36, 6, 118, 106, 92,
+ 84, 68, 60, 30, 8, 27, 1, 62, 50, 44,
+ 28, 26, 12, 6, 0, 17, 23, 9, 3, 3,
+ 2, 2, 16, 5, 0, 2, 14, 32, 28, 36,
+ 6, 118, 106, 92, 84, 68, 60, 30, 8, 27,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 12 */
+
+ 124, 16, 25, 124, 16, 25, 85, 45, 30, 20,
+ 14, 52, 90, 106, 40, 8, 29, 56, 37, 41,
+ 6, 4, 6, 7, 16, 76, 84, 46, 11, 85,
+ 79, 73, 17, 37, 41, 6, 49, 27, 48, 10,
+ 33, 55, 73, 15, 19, 47, 73, 0, 35, 65,
+ 5, 31, 47, 73, 12, 25, 19, 49, 38, 6,
+ 44, 0, 0, 0, 25, 51, 67, 5, 26, 1,
+ 48, 35, 97, 73, 29, 41, 3, 7, 33, 29,
+ 79, 3, 51, 33, 71, 49, 31, 43, 35, 26,
+ 1, 17, 71, 23, 63, 37, 81, 3, 27, 17,
+ 51, 24, 13, 3, 49, 39, 31, 29, 11, 1,
+ 5, 27, 2, 10, 11, 6, 9, 11, 8, 19,
+ 44, 9, 2, 36, 38, 52, 50, 30, 37, 15,
+ 21, 10, 7, 45, 7, 6, 15, 21, 7, 30,
+ 84, 36, 17, 35, 23, 9, 64, 76, 73, 7,
+ 0, 39, 10, 1, 3, 5, 66, 112, 86, 10,
+ 29, 85, 28, 96, 120, 93, 42, 42, 48, 52,
+ 52, 44, 32, 42, 48, 22, 36, 26, 12, 8,
+ 1, 26, 22, 22, 20, 8, 16, 34, 16, 17,
+ 16, 18, 11, 30, 11, 110, 124, 90, 70, 86,
+ 104, 124, 116, 64, 30, 14, 24, 46, 42, 51,
+ 28, 36, 1, 66, 32, 32, 40, 48, 42, 56,
+ 54, 6, 59, 5, 9, 12, 91, 67, 3, 16,
+ 20, 8, 21, 31, 41, 39, 83, 41, 34, 10,
+ 2, 11, 11, 25, 45, 45, 71, 21, 42, 16,
+ 6, 1, 1, 15, 25, 39, 61, 25, 22, 0,
+ 14, 10, 9, 25, 29, 51, 15, 60, 22, 20,
+ 12, 4, 9, 25, 25, 31, 124, 93, 89, 75,
+ 85, 71, 71, 71, 67, 63, 67, 65, 65, 45,
+ 53, 55, 65, 51, 49, 11, 37, 41, 37, 23,
+ 15, 17, 7, 23, 7, 9, 7, 9, 8, 9,
+ 27, 3, 9, 21, 6, 2, 5, 1, 16, 0,
+ 15, 21, 17, 31, 7, 4, 0, 8, 11, 5,
+ 11, 12, 8, 15, 19, 23, 23, 6, 124, 120,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 100, 56, 124, 124, 124, 124, 100, 78, 54,
+ 74, 66, 48, 20, 10, 3, 31, 104, 100, 106,
+ 76, 46, 54, 40, 32, 48, 38, 24, 3, 16,
+ 7, 27, 38, 46, 24, 44, 54, 30, 22, 12,
+ 10, 17, 0, 13, 33, 95, 9, 17, 31, 30,
+ 1, 19, 23, 9, 3, 1, 4, 2, 18, 5,
+ 2, 4, 16, 32, 30, 36, 8, 118, 102, 88,
+ 80, 64, 56, 26, 4, 31, 1, 64, 50, 46,
+ 30, 26, 12, 6, 0, 15, 23, 9, 3, 1,
+ 4, 2, 18, 5, 2, 4, 16, 32, 30, 36,
+ 8, 118, 102, 88, 80, 64, 56, 26, 4, 31,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 13 */
+
+ 124, 16, 25, 124, 16, 25, 81, 43, 30, 20,
+ 14, 48, 86, 104, 42, 8, 27, 52, 35, 37,
+ 4, 4, 8, 9, 14, 74, 80, 40, 17, 85,
+ 75, 71, 17, 35, 37, 4, 49, 25, 48, 10,
+ 33, 53, 69, 17, 21, 47, 73, 0, 33, 63,
+ 7, 31, 47, 73, 12, 23, 19, 47, 38, 6,
+ 44, 0, 0, 0, 23, 51, 67, 5, 24, 1,
+ 48, 33, 93, 67, 27, 39, 0, 3, 31, 27,
+ 75, 1, 49, 31, 65, 49, 33, 43, 37, 26,
+ 1, 15, 65, 21, 63, 37, 77, 3, 27, 17,
+ 49, 24, 13, 1, 47, 37, 29, 27, 11, 0,
+ 3, 25, 4, 10, 13, 6, 7, 11, 8, 19,
+ 46, 9, 2, 36, 40, 52, 50, 30, 37, 13,
+ 21, 8, 7, 43, 7, 8, 13, 19, 3, 34,
+ 86, 40, 13, 37, 25, 7, 66, 76, 75, 7,
+ 0, 39, 12, 1, 3, 1, 66, 112, 86, 14,
+ 31, 83, 26, 92, 114, 91, 42, 42, 48, 52,
+ 52, 44, 32, 42, 48, 20, 34, 26, 12, 8,
+ 1, 26, 22, 20, 20, 8, 16, 32, 14, 19,
+ 14, 18, 11, 28, 11, 106, 124, 88, 68, 84,
+ 100, 124, 112, 62, 26, 10, 22, 42, 38, 53,
+ 26, 34, 3, 64, 30, 30, 36, 44, 38, 52,
+ 52, 4, 59, 7, 11, 8, 87, 63, 1, 18,
+ 20, 8, 19, 29, 39, 39, 79, 37, 36, 12,
+ 2, 9, 7, 23, 43, 43, 65, 19, 44, 18,
+ 8, 1, 0, 13, 23, 37, 59, 23, 24, 2,
+ 14, 10, 7, 23, 27, 49, 13, 60, 24, 20,
+ 12, 6, 7, 23, 23, 29, 124, 91, 85, 71,
+ 83, 69, 69, 67, 63, 59, 63, 61, 59, 43,
+ 51, 53, 63, 45, 47, 13, 35, 39, 35, 23,
+ 15, 17, 7, 23, 9, 11, 7, 9, 10, 9,
+ 27, 3, 9, 19, 6, 0, 5, 3, 14, 1,
+ 15, 21, 15, 31, 9, 6, 0, 6, 11, 5,
+ 11, 10, 8, 15, 19, 23, 23, 2, 124, 118,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 120, 96, 52, 124, 124, 124, 124, 94, 74, 52,
+ 70, 62, 44, 18, 8, 5, 31, 100, 94, 100,
+ 72, 42, 50, 38, 28, 44, 34, 20, 7, 12,
+ 9, 31, 36, 44, 20, 42, 52, 28, 18, 10,
+ 8, 19, 1, 15, 33, 93, 13, 19, 35, 26,
+ 3, 21, 23, 7, 1, 0, 6, 4, 18, 3,
+ 4, 6, 18, 34, 32, 38, 8, 116, 98, 84,
+ 76, 60, 52, 20, 0, 33, 0, 64, 52, 46,
+ 32, 28, 14, 8, 0, 13, 23, 7, 1, 0,
+ 6, 4, 18, 3, 4, 6, 18, 34, 32, 38,
+ 8, 116, 98, 84, 76, 60, 52, 20, 0, 33,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 14 */
+
+ 122, 16, 25, 122, 16, 25, 77, 39, 32, 20,
+ 12, 44, 84, 102, 42, 8, 23, 50, 35, 35,
+ 4, 2, 10, 11, 12, 72, 76, 32, 21, 87,
+ 71, 67, 17, 35, 35, 4, 47, 23, 50, 10,
+ 31, 51, 67, 19, 23, 47, 75, 0, 33, 61,
+ 7, 33, 47, 73, 12, 23, 19, 45, 40, 6,
+ 44, 0, 0, 0, 23, 51, 67, 3, 22, 1,
+ 46, 33, 91, 63, 25, 37, 2, 1, 29, 25,
+ 71, 0, 47, 29, 61, 51, 33, 43, 37, 28,
+ 1, 13, 61, 21, 61, 35, 73, 3, 27, 17,
+ 49, 26, 13, 1, 45, 37, 27, 27, 11, 0,
+ 3, 23, 4, 10, 13, 6, 7, 11, 8, 19,
+ 46, 9, 2, 38, 40, 54, 50, 32, 39, 13,
+ 19, 8, 7, 43, 7, 10, 13, 19, 1, 36,
+ 88, 42, 9, 37, 27, 7, 66, 76, 77, 7,
+ 2, 41, 12, 1, 3, 0, 66, 112, 86, 16,
+ 33, 79, 24, 86, 108, 87, 42, 42, 46, 50,
+ 50, 42, 30, 42, 46, 18, 32, 26, 12, 8,
+ 3, 24, 20, 18, 18, 8, 16, 30, 14, 21,
+ 14, 16, 11, 26, 13, 102, 120, 84, 64, 80,
+ 96, 124, 106, 60, 22, 6, 18, 38, 34, 57,
+ 24, 32, 7, 62, 28, 28, 34, 42, 34, 48,
+ 48, 0, 59, 9, 13, 6, 85, 61, 2, 18,
+ 20, 8, 17, 27, 37, 37, 75, 35, 38, 14,
+ 4, 9, 5, 21, 39, 41, 61, 19, 44, 18,
+ 8, 0, 2, 13, 23, 35, 57, 23, 28, 4,
+ 14, 10, 5, 23, 27, 47, 11, 62, 26, 22,
+ 12, 8, 7, 21, 23, 29, 124, 87, 83, 69,
+ 79, 67, 65, 65, 61, 57, 59, 57, 55, 43,
+ 49, 53, 61, 39, 47, 15, 35, 37, 33, 23,
+ 15, 17, 7, 21, 9, 13, 7, 9, 10, 9,
+ 27, 3, 9, 19, 4, 0, 5, 5, 12, 1,
+ 15, 21, 13, 33, 9, 8, 1, 6, 11, 5,
+ 9, 10, 6, 13, 17, 23, 23, 0, 124, 116,
+ 122, 122, 122, 124, 124, 124, 122, 124, 124, 124,
+ 114, 90, 48, 124, 120, 118, 120, 88, 70, 48,
+ 66, 56, 40, 16, 6, 7, 31, 96, 90, 96,
+ 68, 38, 46, 34, 24, 40, 30, 16, 11, 8,
+ 13, 33, 34, 42, 16, 38, 48, 24, 14, 6,
+ 4, 21, 3, 17, 33, 91, 15, 23, 37, 24,
+ 5, 23, 23, 7, 1, 0, 8, 6, 20, 3,
+ 6, 8, 20, 34, 34, 38, 10, 116, 94, 80,
+ 72, 56, 48, 16, 3, 35, 0, 66, 52, 48,
+ 32, 30, 14, 8, 2, 13, 23, 7, 1, 0,
+ 8, 6, 20, 3, 6, 8, 20, 34, 34, 38,
+ 10, 116, 94, 80, 72, 56, 48, 16, 3, 35,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 15 */
+
+ 120, 16, 25, 120, 16, 25, 73, 37, 32, 20,
+ 12, 40, 80, 100, 44, 8, 19, 46, 35, 31,
+ 4, 2, 12, 13, 10, 70, 72, 26, 25, 89,
+ 67, 65, 17, 35, 31, 4, 47, 21, 50, 10,
+ 29, 49, 65, 21, 25, 47, 75, 0, 33, 59,
+ 7, 33, 47, 73, 12, 23, 19, 43, 40, 6,
+ 44, 0, 0, 0, 21, 51, 67, 3, 20, 1,
+ 46, 31, 89, 59, 23, 35, 6, 2, 27, 23,
+ 67, 2, 45, 27, 57, 53, 33, 43, 37, 28,
+ 1, 11, 57, 21, 59, 33, 69, 3, 27, 17,
+ 49, 26, 13, 0, 43, 37, 25, 27, 11, 0,
+ 1, 21, 6, 10, 13, 6, 7, 11, 8, 19,
+ 46, 9, 2, 38, 40, 54, 50, 32, 39, 11,
+ 17, 6, 7, 43, 7, 12, 13, 17, 0, 38,
+ 90, 46, 5, 39, 29, 5, 66, 76, 79, 7,
+ 2, 41, 14, 1, 3, 2, 66, 112, 86, 18,
+ 35, 77, 22, 80, 102, 85, 42, 42, 46, 48,
+ 50, 42, 30, 42, 46, 16, 30, 26, 12, 8,
+ 3, 24, 18, 16, 18, 8, 16, 28, 12, 23,
+ 12, 14, 11, 24, 13, 98, 116, 80, 62, 76,
+ 92, 118, 102, 58, 18, 2, 16, 34, 30, 59,
+ 22, 30, 11, 60, 26, 26, 32, 38, 30, 44,
+ 44, 1, 59, 11, 15, 2, 81, 57, 4, 20,
+ 20, 8, 15, 25, 35, 35, 71, 33, 40, 16,
+ 6, 7, 3, 19, 37, 39, 57, 17, 46, 20,
+ 10, 0, 4, 11, 21, 33, 55, 23, 30, 6,
+ 14, 10, 3, 21, 27, 45, 9, 62, 28, 22,
+ 12, 10, 5, 19, 21, 27, 124, 85, 79, 65,
+ 75, 65, 63, 61, 57, 53, 55, 53, 51, 41,
+ 47, 51, 59, 33, 47, 17, 35, 35, 31, 23,
+ 15, 17, 7, 21, 9, 15, 7, 9, 12, 9,
+ 27, 3, 9, 19, 4, 0, 5, 7, 10, 1,
+ 15, 21, 11, 35, 9, 10, 3, 6, 11, 5,
+ 9, 10, 6, 13, 15, 23, 23, 3, 122, 114,
+ 120, 118, 118, 124, 124, 124, 118, 120, 124, 122,
+ 108, 84, 44, 122, 114, 110, 110, 82, 66, 44,
+ 62, 52, 36, 14, 4, 9, 31, 92, 86, 92,
+ 64, 34, 42, 32, 20, 36, 26, 12, 15, 4,
+ 17, 35, 32, 40, 12, 36, 44, 22, 10, 4,
+ 2, 23, 5, 19, 33, 89, 17, 25, 41, 22,
+ 7, 25, 23, 5, 0, 2, 10, 8, 22, 3,
+ 8, 10, 22, 36, 36, 40, 12, 114, 90, 76,
+ 68, 52, 44, 12, 7, 37, 2, 66, 54, 50,
+ 34, 32, 16, 10, 2, 11, 23, 5, 0, 2,
+ 10, 8, 22, 3, 8, 10, 22, 36, 36, 40,
+ 12, 114, 90, 76, 68, 52, 44, 12, 7, 37,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 16 */
+
+ 116, 14, 27, 116, 14, 27, 71, 35, 32, 20,
+ 10, 36, 76, 98, 44, 8, 17, 42, 35, 29,
+ 2, 0, 14, 15, 6, 66, 68, 18, 31, 91,
+ 63, 63, 17, 35, 29, 2, 47, 21, 50, 8,
+ 29, 49, 63, 25, 29, 49, 77, 1, 33, 59,
+ 9, 35, 47, 73, 10, 23, 19, 43, 40, 4,
+ 44, 0, 0, 0, 21, 53, 67, 3, 18, 3,
+ 44, 31, 87, 55, 21, 35, 8, 4, 27, 21,
+ 65, 2, 43, 27, 53, 55, 35, 45, 39, 28,
+ 3, 11, 53, 21, 59, 33, 67, 3, 27, 17,
+ 49, 26, 15, 0, 43, 37, 25, 27, 11, 0,
+ 1, 19, 6, 10, 15, 4, 7, 11, 6, 19,
+ 46, 9, 2, 38, 40, 54, 50, 32, 41, 11,
+ 17, 4, 7, 43, 9, 12, 13, 17, 2, 40,
+ 90, 48, 3, 41, 33, 5, 66, 76, 81, 9,
+ 2, 43, 14, 3, 3, 4, 66, 110, 86, 20,
+ 37, 75, 18, 74, 94, 83, 40, 42, 44, 46,
+ 48, 40, 28, 40, 44, 14, 28, 24, 10, 6,
+ 5, 22, 16, 14, 16, 6, 14, 26, 10, 27,
+ 10, 12, 11, 20, 15, 92, 110, 76, 58, 72,
+ 86, 110, 96, 54, 14, 3, 12, 28, 26, 63,
+ 18, 26, 15, 56, 24, 24, 28, 34, 26, 40,
+ 40, 5, 59, 13, 17, 1, 79, 55, 6, 20,
+ 20, 8, 15, 25, 35, 35, 69, 31, 40, 16,
+ 6, 7, 1, 17, 35, 39, 53, 17, 46, 20,
+ 10, 0, 4, 11, 21, 33, 53, 23, 32, 8,
+ 14, 8, 3, 21, 27, 45, 9, 62, 28, 22,
+ 12, 12, 5, 19, 21, 27, 124, 83, 77, 63,
+ 73, 63, 61, 59, 55, 51, 53, 51, 47, 41,
+ 47, 51, 59, 27, 47, 21, 35, 35, 31, 23,
+ 15, 17, 9, 21, 11, 17, 9, 9, 12, 11,
+ 27, 5, 9, 19, 2, 1, 7, 9, 8, 3,
+ 17, 21, 9, 37, 11, 10, 5, 4, 11, 7,
+ 9, 8, 4, 13, 15, 23, 23, 7, 118, 112,
+ 116, 114, 112, 124, 124, 124, 112, 114, 124, 116,
+ 100, 78, 40, 114, 106, 102, 98, 76, 60, 40,
+ 56, 46, 32, 10, 0, 11, 31, 86, 80, 86,
+ 60, 28, 38, 28, 14, 32, 20, 8, 21, 1,
+ 21, 39, 28, 36, 6, 32, 40, 18, 6, 0,
+ 1, 25, 7, 21, 35, 87, 21, 29, 45, 18,
+ 11, 29, 23, 5, 0, 2, 12, 8, 22, 3,
+ 10, 10, 24, 36, 36, 40, 12, 112, 86, 72,
+ 62, 46, 40, 6, 11, 41, 2, 66, 54, 50,
+ 34, 32, 16, 10, 2, 11, 23, 5, 0, 2,
+ 12, 8, 22, 3, 10, 10, 24, 36, 36, 40,
+ 12, 112, 86, 72, 62, 46, 40, 6, 11, 41,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 17 */
+
+ 114, 14, 27, 114, 14, 27, 67, 31, 34, 22,
+ 10, 34, 74, 98, 46, 10, 13, 40, 33, 25,
+ 2, 0, 18, 15, 4, 64, 66, 12, 35, 91,
+ 59, 59, 15, 33, 25, 2, 45, 19, 52, 8,
+ 27, 47, 59, 27, 31, 49, 77, 1, 31, 57,
+ 9, 35, 45, 71, 10, 21, 17, 41, 42, 4,
+ 44, 0, 0, 0, 19, 53, 67, 1, 18, 3,
+ 44, 29, 83, 49, 17, 33, 12, 8, 25, 17,
+ 61, 4, 39, 25, 47, 55, 35, 45, 39, 30,
+ 3, 9, 47, 19, 57, 31, 63, 3, 25, 17,
+ 47, 28, 15, 2, 41, 35, 23, 25, 9, 2,
+ 0, 17, 8, 12, 15, 4, 5, 9, 6, 17,
+ 48, 7, 2, 40, 42, 56, 52, 34, 41, 9,
+ 15, 4, 7, 41, 9, 14, 11, 15, 6, 44,
+ 92, 52, 0, 41, 35, 3, 68, 78, 81, 9,
+ 4, 43, 16, 3, 1, 8, 66, 110, 88, 24,
+ 39, 71, 16, 70, 88, 79, 40, 42, 44, 46,
+ 48, 40, 28, 40, 44, 14, 28, 24, 10, 6,
+ 5, 22, 16, 14, 16, 6, 14, 26, 10, 29,
+ 10, 12, 11, 18, 15, 88, 106, 74, 56, 70,
+ 82, 104, 92, 52, 12, 7, 10, 24, 24, 65,
+ 16, 24, 17, 54, 24, 24, 26, 32, 24, 38,
+ 38, 7, 59, 13, 17, 3, 75, 51, 10, 22,
+ 20, 8, 13, 23, 33, 33, 65, 27, 42, 18,
+ 8, 5, 2, 15, 31, 37, 47, 15, 48, 22,
+ 12, 2, 6, 9, 19, 31, 49, 21, 36, 10,
+ 16, 8, 1, 19, 25, 43, 7, 64, 30, 24,
+ 14, 14, 3, 17, 19, 25, 124, 79, 73, 59,
+ 69, 59, 57, 55, 51, 47, 49, 47, 41, 39,
+ 45, 49, 57, 19, 45, 23, 33, 33, 29, 21,
+ 15, 15, 9, 19, 11, 17, 9, 9, 14, 11,
+ 27, 5, 9, 17, 2, 1, 7, 9, 8, 3,
+ 17, 19, 5, 37, 11, 12, 5, 4, 11, 7,
+ 7, 8, 4, 11, 13, 21, 21, 9, 116, 110,
+ 114, 112, 108, 120, 120, 118, 108, 110, 118, 112,
+ 94, 74, 36, 108, 100, 96, 88, 72, 56, 38,
+ 52, 42, 30, 8, 1, 11, 31, 82, 76, 82,
+ 56, 24, 36, 26, 10, 30, 16, 6, 25, 5,
+ 23, 41, 26, 34, 2, 30, 38, 16, 4, 1,
+ 3, 27, 7, 21, 35, 85, 23, 31, 47, 16,
+ 13, 31, 21, 3, 2, 4, 16, 10, 24, 1,
+ 14, 12, 26, 38, 38, 42, 14, 112, 84, 70,
+ 58, 42, 38, 2, 13, 43, 4, 68, 56, 52,
+ 36, 34, 18, 12, 4, 9, 21, 3, 2, 4,
+ 16, 10, 24, 1, 14, 12, 26, 38, 38, 42,
+ 14, 112, 84, 70, 58, 42, 38, 2, 13, 43,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 18 */
+
+ 112, 14, 27, 112, 14, 27, 63, 29, 34, 22,
+ 10, 30, 70, 96, 48, 10, 9, 36, 33, 23,
+ 2, 0, 20, 17, 2, 62, 62, 6, 39, 93,
+ 55, 57, 15, 33, 23, 2, 45, 17, 52, 8,
+ 25, 45, 57, 29, 33, 49, 77, 1, 31, 55,
+ 9, 35, 45, 71, 10, 21, 17, 39, 42, 4,
+ 44, 0, 0, 0, 17, 53, 67, 1, 16, 3,
+ 44, 27, 81, 45, 15, 31, 16, 12, 23, 15,
+ 57, 6, 37, 23, 43, 57, 35, 45, 39, 30,
+ 3, 7, 43, 19, 55, 29, 59, 3, 25, 17,
+ 47, 28, 15, 2, 39, 35, 21, 25, 9, 2,
+ 0, 15, 10, 12, 15, 4, 5, 9, 6, 17,
+ 48, 7, 2, 40, 42, 56, 52, 34, 43, 7,
+ 13, 2, 7, 41, 9, 16, 11, 15, 8, 46,
+ 94, 56, 4, 43, 37, 3, 68, 78, 83, 9,
+ 4, 45, 18, 3, 1, 10, 66, 110, 88, 26,
+ 41, 69, 14, 64, 82, 77, 40, 42, 44, 44,
+ 46, 40, 28, 40, 42, 12, 26, 24, 10, 6,
+ 5, 20, 14, 12, 16, 6, 14, 24, 8, 31,
+ 8, 10, 11, 16, 17, 84, 102, 70, 52, 66,
+ 78, 98, 88, 50, 8, 11, 6, 20, 20, 67,
+ 14, 22, 21, 52, 22, 22, 24, 28, 20, 34,
+ 34, 11, 59, 15, 19, 7, 73, 49, 12, 24,
+ 20, 8, 11, 21, 31, 31, 61, 25, 44, 20,
+ 10, 3, 4, 13, 29, 35, 43, 13, 48, 22,
+ 12, 2, 8, 7, 17, 29, 47, 21, 38, 12,
+ 16, 8, 0, 17, 25, 41, 5, 64, 32, 24,
+ 14, 16, 1, 15, 17, 23, 124, 77, 69, 55,
+ 65, 57, 55, 51, 47, 45, 45, 43, 37, 37,
+ 43, 47, 55, 13, 45, 25, 33, 31, 27, 21,
+ 15, 15, 9, 19, 11, 19, 9, 9, 16, 11,
+ 27, 5, 9, 17, 2, 1, 7, 11, 6, 3,
+ 17, 19, 3, 39, 11, 14, 7, 4, 11, 7,
+ 7, 8, 4, 11, 11, 21, 21, 13, 114, 108,
+ 112, 108, 104, 114, 114, 112, 104, 104, 112, 106,
+ 88, 68, 32, 100, 92, 88, 78, 66, 52, 34,
+ 48, 38, 26, 6, 3, 13, 31, 78, 72, 78,
+ 52, 20, 32, 22, 6, 26, 12, 2, 29, 9,
+ 27, 43, 24, 32, 1, 26, 34, 12, 0, 3,
+ 5, 29, 9, 23, 35, 83, 25, 33, 51, 14,
+ 15, 33, 21, 3, 4, 6, 18, 12, 26, 1,
+ 16, 14, 28, 38, 40, 44, 16, 110, 80, 66,
+ 54, 38, 34, 1, 17, 45, 6, 68, 56, 54,
+ 38, 36, 18, 14, 4, 7, 21, 3, 4, 6,
+ 18, 12, 26, 1, 16, 14, 28, 38, 40, 44,
+ 16, 110, 80, 66, 54, 38, 34, 1, 17, 45,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 19 */
+
+ 110, 14, 27, 110, 14, 27, 59, 25, 36, 22,
+ 8, 26, 68, 94, 48, 10, 5, 34, 33, 19,
+ 2, 1, 22, 19, 0, 60, 58, 1, 43, 95,
+ 51, 53, 15, 33, 19, 2, 43, 15, 54, 8,
+ 23, 43, 55, 31, 35, 49, 79, 1, 31, 53,
+ 9, 37, 45, 71, 10, 21, 17, 37, 44, 4,
+ 44, 0, 0, 0, 17, 53, 67, 0, 14, 3,
+ 42, 27, 79, 41, 13, 29, 18, 14, 21, 13,
+ 53, 8, 35, 21, 39, 59, 35, 45, 39, 32,
+ 3, 5, 39, 19, 53, 27, 55, 3, 25, 17,
+ 47, 30, 15, 4, 37, 35, 19, 25, 9, 2,
+ 2, 13, 10, 12, 15, 4, 5, 9, 6, 17,
+ 48, 7, 2, 42, 42, 58, 52, 36, 43, 7,
+ 11, 2, 7, 41, 9, 18, 11, 13, 10, 48,
+ 96, 58, 8, 43, 39, 1, 68, 78, 85, 9,
+ 6, 45, 18, 3, 1, 12, 66, 110, 88, 28,
+ 43, 65, 12, 58, 76, 73, 40, 42, 42, 42,
+ 46, 38, 26, 40, 42, 10, 24, 24, 10, 6,
+ 7, 20, 12, 10, 14, 6, 14, 22, 8, 33,
+ 8, 8, 11, 14, 17, 80, 98, 66, 50, 62,
+ 74, 92, 82, 48, 4, 15, 4, 16, 16, 71,
+ 12, 20, 25, 50, 20, 20, 22, 26, 16, 30,
+ 30, 13, 59, 17, 21, 9, 69, 45, 16, 24,
+ 20, 8, 9, 19, 29, 29, 57, 23, 46, 22,
+ 12, 3, 6, 11, 25, 33, 39, 13, 50, 24,
+ 14, 4, 10, 7, 17, 27, 45, 21, 42, 14,
+ 16, 8, 2, 17, 25, 39, 3, 66, 34, 26,
+ 14, 18, 1, 13, 17, 23, 124, 73, 67, 53,
+ 61, 55, 51, 49, 45, 41, 41, 39, 33, 37,
+ 41, 47, 53, 7, 45, 27, 33, 29, 25, 21,
+ 15, 15, 9, 17, 11, 21, 9, 9, 16, 11,
+ 27, 5, 9, 17, 0, 1, 7, 13, 4, 3,
+ 17, 19, 1, 41, 11, 16, 9, 4, 11, 7,
+ 5, 8, 2, 9, 9, 21, 21, 15, 112, 106,
+ 110, 104, 100, 110, 110, 106, 98, 98, 106, 100,
+ 82, 62, 28, 92, 86, 80, 68, 60, 48, 30,
+ 44, 32, 22, 4, 5, 15, 31, 74, 68, 74,
+ 48, 16, 28, 20, 2, 22, 8, 1, 33, 13,
+ 31, 45, 22, 30, 5, 24, 30, 10, 3, 7,
+ 9, 31, 11, 25, 35, 81, 27, 37, 53, 12,
+ 17, 35, 21, 1, 4, 6, 20, 14, 28, 1,
+ 18, 16, 30, 40, 42, 44, 18, 110, 76, 62,
+ 50, 34, 30, 5, 21, 47, 6, 70, 58, 56,
+ 38, 38, 20, 14, 6, 7, 21, 1, 4, 6,
+ 20, 14, 28, 1, 18, 16, 30, 40, 42, 44,
+ 18, 110, 76, 62, 50, 34, 30, 5, 21, 47,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 20 */
+
+ 106, 14, 27, 106, 14, 27, 57, 23, 36, 22,
+ 8, 22, 64, 92, 50, 10, 3, 30, 33, 17,
+ 0, 1, 24, 21, 3, 56, 54, 7, 49, 97,
+ 47, 51, 15, 33, 17, 0, 43, 13, 54, 6,
+ 23, 41, 51, 33, 39, 51, 79, 1, 29, 51,
+ 11, 37, 45, 71, 10, 21, 17, 37, 44, 4,
+ 44, 0, 0, 0, 15, 55, 67, 0, 12, 3,
+ 42, 25, 77, 37, 11, 27, 22, 18, 21, 11,
+ 49, 10, 33, 19, 33, 59, 37, 45, 41, 32,
+ 5, 3, 33, 17, 53, 27, 51, 3, 25, 17,
+ 47, 30, 15, 4, 35, 35, 19, 23, 9, 2,
+ 2, 11, 12, 12, 17, 2, 5, 9, 6, 17,
+ 48, 7, 2, 42, 42, 58, 52, 36, 45, 5,
+ 11, 0, 7, 39, 9, 20, 9, 13, 14, 52,
+ 98, 62, 12, 45, 41, 1, 70, 78, 87, 9,
+ 6, 47, 20, 3, 1, 14, 66, 110, 88, 32,
+ 45, 63, 10, 52, 70, 71, 38, 42, 42, 42,
+ 44, 38, 26, 40, 40, 8, 22, 22, 10, 6,
+ 7, 18, 12, 8, 14, 4, 12, 20, 6, 35,
+ 6, 6, 11, 10, 19, 76, 94, 62, 46, 60,
+ 70, 84, 78, 46, 0, 21, 0, 12, 12, 73,
+ 8, 18, 27, 48, 18, 18, 18, 22, 12, 26,
+ 28, 17, 59, 19, 23, 13, 67, 43, 18, 26,
+ 20, 8, 7, 17, 27, 29, 53, 19, 48, 22,
+ 12, 1, 10, 9, 23, 31, 33, 11, 50, 24,
+ 14, 4, 12, 5, 15, 27, 43, 21, 44, 16,
+ 16, 8, 4, 15, 25, 37, 1, 66, 36, 26,
+ 14, 20, 0, 11, 15, 21, 124, 71, 63, 49,
+ 59, 53, 49, 45, 41, 39, 39, 35, 27, 35,
+ 39, 45, 51, 1, 43, 29, 31, 27, 25, 21,
+ 15, 15, 11, 17, 13, 23, 9, 9, 18, 11,
+ 27, 5, 9, 15, 0, 3, 7, 15, 2, 5,
+ 17, 19, 0, 43, 13, 16, 9, 2, 11, 7,
+ 5, 6, 2, 9, 9, 21, 21, 19, 110, 104,
+ 108, 102, 94, 104, 104, 100, 94, 92, 98, 94,
+ 74, 58, 24, 84, 78, 72, 58, 54, 44, 26,
+ 40, 28, 18, 0, 7, 17, 31, 68, 62, 68,
+ 44, 12, 24, 16, 1, 18, 4, 5, 37, 19,
+ 33, 49, 20, 28, 11, 20, 28, 6, 7, 9,
+ 11, 33, 13, 27, 35, 79, 31, 39, 57, 8,
+ 19, 37, 21, 1, 6, 8, 22, 14, 28, 0,
+ 20, 18, 32, 40, 44, 46, 18, 108, 72, 58,
+ 46, 30, 26, 11, 25, 51, 8, 70, 58, 56,
+ 40, 38, 20, 16, 6, 5, 21, 1, 6, 8,
+ 22, 14, 28, 0, 20, 18, 32, 40, 44, 46,
+ 18, 108, 72, 58, 46, 30, 26, 11, 25, 51,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 21 */
+
+ 104, 14, 27, 104, 14, 27, 53, 19, 36, 22,
+ 6, 18, 62, 90, 52, 10, 0, 26, 31, 13,
+ 0, 1, 26, 23, 5, 54, 50, 13, 53, 97,
+ 43, 49, 15, 31, 13, 0, 43, 11, 54, 6,
+ 21, 39, 49, 35, 41, 51, 79, 1, 29, 49,
+ 11, 37, 45, 71, 10, 19, 17, 35, 46, 4,
+ 44, 0, 0, 0, 15, 55, 67, 2, 10, 3,
+ 40, 25, 73, 31, 9, 25, 24, 20, 19, 9,
+ 45, 12, 31, 17, 29, 61, 37, 45, 41, 32,
+ 5, 1, 29, 17, 51, 25, 47, 3, 25, 17,
+ 45, 30, 15, 6, 33, 33, 17, 23, 9, 4,
+ 4, 9, 12, 12, 17, 2, 3, 9, 6, 17,
+ 50, 7, 2, 44, 44, 58, 52, 36, 45, 5,
+ 9, 0, 7, 39, 9, 22, 9, 11, 16, 54,
+ 100, 64, 16, 45, 43, 0, 70, 78, 89, 9,
+ 8, 47, 22, 3, 1, 18, 66, 110, 88, 34,
+ 47, 59, 8, 48, 64, 69, 38, 42, 40, 40,
+ 44, 38, 24, 40, 40, 6, 20, 22, 10, 6,
+ 7, 18, 10, 6, 12, 4, 12, 18, 6, 37,
+ 4, 6, 11, 8, 19, 72, 90, 60, 44, 56,
+ 66, 78, 72, 44, 3, 25, 1, 8, 8, 75,
+ 6, 16, 31, 46, 16, 16, 16, 20, 8, 22,
+ 24, 19, 59, 21, 25, 17, 63, 39, 20, 26,
+ 20, 8, 5, 15, 25, 27, 49, 17, 50, 24,
+ 14, 0, 12, 7, 19, 29, 29, 9, 52, 26,
+ 16, 6, 14, 3, 15, 25, 41, 19, 46, 18,
+ 16, 8, 6, 15, 23, 35, 0, 66, 38, 26,
+ 14, 22, 2, 9, 15, 19, 124, 69, 61, 47,
+ 55, 51, 45, 43, 37, 35, 35, 31, 23, 33,
+ 37, 43, 49, 4, 43, 31, 31, 25, 23, 21,
+ 15, 15, 11, 15, 13, 25, 9, 9, 20, 11,
+ 27, 5, 9, 15, 1, 3, 7, 17, 0, 5,
+ 17, 19, 2, 43, 13, 18, 11, 2, 11, 7,
+ 5, 6, 0, 9, 7, 21, 21, 21, 108, 102,
+ 106, 98, 90, 100, 98, 94, 88, 86, 92, 88,
+ 68, 52, 20, 76, 72, 64, 48, 48, 40, 24,
+ 36, 22, 14, 1, 9, 19, 31, 64, 58, 64,
+ 40, 8, 20, 14, 5, 14, 0, 9, 41, 23,
+ 37, 51, 18, 26, 15, 18, 24, 4, 11, 11,
+ 13, 35, 15, 29, 35, 77, 33, 41, 59, 6,
+ 21, 39, 21, 0, 6, 10, 24, 16, 30, 0,
+ 22, 20, 34, 42, 46, 46, 20, 108, 68, 54,
+ 42, 26, 22, 15, 29, 53, 8, 72, 60, 58,
+ 42, 40, 22, 16, 6, 3, 21, 0, 6, 10,
+ 24, 16, 30, 0, 22, 20, 34, 42, 46, 46,
+ 20, 108, 68, 54, 42, 26, 22, 15, 29, 53,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 22 */
+
+ 102, 14, 29, 102, 14, 29, 49, 17, 38, 22,
+ 6, 16, 58, 88, 52, 12, 4, 24, 31, 11,
+ 0, 3, 28, 25, 7, 52, 48, 21, 57, 99,
+ 39, 45, 15, 31, 11, 0, 41, 9, 56, 6,
+ 19, 37, 47, 39, 43, 51, 81, 1, 29, 49,
+ 11, 39, 43, 71, 8, 19, 17, 33, 46, 4,
+ 44, 0, 0, 0, 13, 55, 67, 2, 10, 5,
+ 40, 23, 71, 27, 7, 23, 28, 24, 17, 7,
+ 41, 14, 29, 17, 25, 63, 37, 45, 41, 34,
+ 5, 0, 25, 17, 49, 23, 43, 3, 25, 17,
+ 45, 32, 17, 6, 33, 33, 15, 23, 7, 4,
+ 4, 7, 14, 12, 17, 2, 3, 9, 6, 17,
+ 50, 7, 2, 44, 44, 60, 54, 38, 47, 3,
+ 7, 1, 7, 39, 9, 24, 9, 11, 18, 56,
+ 100, 68, 18, 47, 45, 0, 70, 78, 89, 11,
+ 8, 49, 22, 5, 0, 20, 66, 110, 90, 36,
+ 49, 57, 6, 42, 58, 65, 38, 42, 40, 38,
+ 42, 36, 24, 38, 38, 6, 20, 22, 8, 6,
+ 9, 16, 8, 4, 12, 4, 12, 18, 4, 41,
+ 4, 4, 11, 6, 21, 66, 86, 56, 40, 52,
+ 62, 72, 68, 40, 7, 29, 5, 4, 4, 79,
+ 4, 12, 35, 42, 14, 14, 14, 16, 4, 18,
+ 20, 23, 59, 23, 27, 19, 61, 37, 24, 28,
+ 20, 8, 3, 15, 25, 25, 47, 15, 52, 26,
+ 16, 0, 14, 5, 17, 27, 25, 9, 52, 26,
+ 16, 6, 16, 3, 13, 23, 39, 19, 50, 20,
+ 16, 8, 6, 13, 23, 35, 0, 68, 38, 28,
+ 14, 24, 2, 9, 13, 19, 124, 65, 57, 43,
+ 51, 47, 43, 39, 35, 33, 31, 27, 19, 33,
+ 35, 43, 49, 10, 43, 33, 31, 25, 21, 21,
+ 15, 15, 11, 15, 13, 27, 9, 9, 20, 11,
+ 27, 5, 9, 15, 1, 3, 7, 19, 1, 5,
+ 17, 19, 4, 45, 13, 20, 13, 2, 11, 7,
+ 3, 6, 0, 7, 5, 21, 21, 25, 106, 100,
+ 104, 94, 86, 94, 94, 88, 84, 80, 86, 82,
+ 62, 46, 16, 70, 64, 56, 38, 44, 34, 20,
+ 32, 18, 10, 3, 11, 21, 31, 60, 54, 60,
+ 36, 4, 18, 10, 9, 10, 5, 13, 45, 27,
+ 41, 53, 16, 22, 19, 14, 20, 0, 15, 15,
+ 17, 37, 17, 31, 37, 75, 35, 45, 63, 4,
+ 25, 41, 19, 0, 8, 10, 26, 18, 32, 0,
+ 24, 22, 36, 42, 48, 48, 22, 106, 64, 50,
+ 38, 22, 18, 19, 33, 55, 10, 72, 60, 60,
+ 42, 42, 22, 18, 8, 3, 19, 0, 8, 10,
+ 26, 18, 32, 0, 24, 22, 36, 42, 48, 48,
+ 22, 106, 64, 50, 38, 22, 18, 19, 33, 55,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 23 */
+
+ 100, 14, 29, 100, 14, 29, 45, 13, 38, 22,
+ 4, 12, 56, 86, 54, 12, 6, 20, 31, 7,
+ 1, 3, 30, 27, 9, 50, 44, 27, 63, 101,
+ 35, 43, 15, 31, 7, 1, 41, 7, 56, 6,
+ 19, 35, 43, 41, 45, 51, 81, 1, 27, 47,
+ 13, 39, 43, 71, 8, 19, 17, 31, 48, 4,
+ 44, 0, 0, 0, 13, 55, 67, 4, 8, 5,
+ 38, 23, 69, 23, 5, 21, 30, 26, 15, 5,
+ 37, 16, 27, 15, 19, 63, 39, 45, 43, 34,
+ 5, 2, 19, 15, 49, 23, 39, 3, 25, 17,
+ 45, 32, 17, 8, 31, 33, 13, 21, 7, 4,
+ 6, 5, 14, 12, 19, 2, 3, 9, 6, 17,
+ 50, 7, 2, 46, 44, 60, 54, 38, 47, 3,
+ 7, 1, 7, 37, 9, 26, 7, 9, 22, 60,
+ 102, 70, 22, 47, 47, 2, 72, 78, 91, 11,
+ 10, 49, 24, 5, 0, 22, 66, 110, 90, 40,
+ 51, 53, 4, 36, 52, 63, 38, 42, 38, 38,
+ 42, 36, 22, 38, 38, 4, 18, 22, 8, 6,
+ 9, 16, 8, 2, 10, 4, 12, 16, 4, 43,
+ 2, 2, 11, 4, 21, 62, 82, 52, 38, 50,
+ 58, 66, 62, 38, 11, 33, 7, 0, 0, 81,
+ 2, 10, 37, 40, 12, 12, 10, 14, 0, 14,
+ 18, 25, 59, 25, 29, 23, 57, 33, 26, 28,
+ 20, 8, 1, 13, 23, 25, 43, 11, 54, 28,
+ 16, 2, 18, 3, 13, 25, 19, 7, 54, 28,
+ 18, 8, 18, 1, 13, 21, 37, 19, 52, 22,
+ 16, 8, 8, 13, 23, 33, 2, 68, 40, 28,
+ 14, 26, 4, 7, 13, 17, 124, 63, 55, 41,
+ 49, 45, 39, 37, 31, 29, 27, 23, 13, 31,
+ 33, 41, 47, 16, 41, 35, 29, 23, 19, 21,
+ 15, 15, 11, 13, 15, 29, 9, 9, 22, 11,
+ 27, 5, 9, 13, 3, 5, 7, 21, 3, 7,
+ 17, 19, 6, 47, 15, 22, 13, 0, 11, 7,
+ 3, 4, 1, 7, 5, 21, 21, 27, 104, 98,
+ 102, 92, 80, 90, 88, 82, 78, 74, 80, 76,
+ 56, 42, 12, 62, 58, 48, 28, 38, 30, 16,
+ 28, 12, 6, 5, 13, 23, 31, 56, 48, 54,
+ 32, 0, 14, 8, 13, 6, 9, 17, 49, 31,
+ 43, 57, 14, 20, 23, 12, 18, 1, 19, 17,
+ 19, 39, 19, 33, 37, 73, 39, 47, 65, 0,
+ 27, 43, 19, 2, 8, 12, 28, 20, 32, 2,
+ 26, 24, 38, 44, 50, 48, 22, 106, 60, 46,
+ 34, 18, 14, 25, 37, 57, 10, 74, 62, 60,
+ 44, 44, 24, 18, 8, 1, 19, 2, 8, 12,
+ 28, 20, 32, 2, 26, 24, 38, 44, 50, 48,
+ 22, 106, 60, 46, 34, 18, 14, 25, 37, 57,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 24 */
+
+ 96, 12, 29, 96, 12, 29, 43, 11, 38, 22,
+ 4, 8, 52, 84, 54, 12, 10, 16, 31, 5,
+ 1, 5, 32, 29, 13, 46, 40, 35, 67, 103,
+ 31, 41, 15, 31, 5, 1, 41, 7, 56, 4,
+ 17, 33, 41, 43, 49, 53, 83, 1, 27, 45,
+ 13, 41, 43, 71, 8, 19, 17, 31, 48, 4,
+ 44, 0, 0, 0, 11, 57, 67, 4, 6, 5,
+ 38, 21, 67, 19, 3, 21, 34, 30, 15, 3,
+ 33, 16, 25, 13, 15, 65, 39, 47, 43, 34,
+ 7, 2, 15, 15, 47, 21, 37, 3, 25, 17,
+ 45, 32, 17, 8, 29, 33, 13, 21, 7, 4,
+ 6, 3, 16, 12, 19, 0, 3, 9, 4, 17,
+ 50, 7, 2, 46, 44, 60, 54, 38, 49, 1,
+ 5, 3, 7, 37, 9, 28, 7, 9, 24, 62,
+ 104, 74, 26, 49, 49, 2, 72, 78, 93, 11,
+ 10, 51, 24, 5, 0, 24, 66, 110, 90, 42,
+ 53, 51, 2, 30, 44, 61, 36, 42, 38, 36,
+ 40, 34, 22, 38, 36, 2, 16, 20, 8, 4,
+ 11, 14, 6, 0, 10, 2, 10, 14, 2, 45,
+ 0, 0, 11, 0, 23, 58, 78, 48, 34, 46,
+ 52, 58, 58, 36, 15, 39, 11, 5, 3, 85,
+ 1, 8, 41, 38, 10, 10, 8, 10, 3, 10,
+ 14, 29, 59, 27, 31, 27, 55, 31, 28, 30,
+ 20, 8, 1, 11, 21, 23, 39, 9, 54, 28,
+ 18, 2, 20, 1, 11, 23, 15, 7, 54, 28,
+ 18, 8, 18, 1, 11, 21, 35, 19, 54, 24,
+ 16, 8, 10, 11, 23, 31, 4, 68, 42, 28,
+ 14, 28, 4, 5, 11, 17, 124, 61, 51, 37,
+ 45, 43, 37, 33, 29, 27, 25, 19, 9, 31,
+ 31, 41, 45, 22, 41, 37, 29, 21, 19, 21,
+ 15, 15, 13, 13, 15, 31, 9, 9, 22, 11,
+ 27, 7, 9, 13, 3, 5, 9, 23, 5, 7,
+ 19, 19, 8, 49, 15, 22, 15, 0, 11, 9,
+ 3, 4, 1, 7, 3, 21, 21, 31, 102, 96,
+ 100, 88, 76, 84, 82, 76, 74, 68, 72, 70,
+ 48, 36, 8, 54, 50, 40, 18, 32, 26, 12,
+ 22, 8, 2, 9, 17, 25, 31, 50, 44, 50,
+ 28, 5, 10, 4, 19, 2, 13, 21, 53, 37,
+ 47, 59, 10, 18, 29, 8, 14, 5, 23, 21,
+ 23, 41, 21, 35, 37, 71, 41, 51, 69, 1,
+ 29, 45, 19, 2, 10, 12, 30, 20, 34, 2,
+ 28, 24, 40, 44, 50, 50, 24, 104, 56, 42,
+ 30, 14, 10, 29, 41, 61, 12, 74, 62, 62,
+ 44, 44, 24, 20, 8, 1, 19, 2, 10, 12,
+ 30, 20, 34, 2, 28, 24, 40, 44, 50, 50,
+ 24, 104, 56, 42, 30, 14, 10, 29, 41, 61,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 25 */
+
+ 94, 12, 29, 94, 12, 29, 39, 9, 40, 22,
+ 4, 4, 48, 84, 56, 12, 14, 14, 29, 1,
+ 1, 5, 34, 29, 15, 44, 36, 41, 71, 103,
+ 27, 37, 13, 29, 1, 1, 39, 5, 58, 4,
+ 15, 31, 39, 45, 51, 53, 83, 1, 27, 43,
+ 13, 41, 43, 71, 8, 17, 15, 29, 48, 4,
+ 44, 0, 0, 0, 9, 57, 67, 4, 4, 5,
+ 38, 19, 63, 13, 1, 19, 38, 34, 13, 1,
+ 29, 18, 21, 11, 11, 67, 39, 47, 43, 36,
+ 7, 4, 11, 15, 45, 19, 33, 3, 23, 17,
+ 43, 34, 17, 10, 27, 31, 11, 21, 7, 6,
+ 8, 1, 18, 12, 19, 0, 1, 7, 4, 17,
+ 52, 7, 2, 46, 46, 62, 54, 40, 49, 0,
+ 3, 5, 7, 37, 9, 30, 7, 7, 26, 64,
+ 106, 78, 30, 51, 51, 4, 72, 80, 95, 11,
+ 10, 51, 26, 5, 0, 28, 66, 110, 90, 44,
+ 55, 49, 0, 26, 38, 57, 36, 42, 38, 34,
+ 40, 34, 22, 38, 36, 0, 14, 20, 8, 4,
+ 11, 14, 4, 0, 10, 2, 10, 12, 0, 47,
+ 0, 0, 11, 1, 23, 54, 74, 46, 32, 42,
+ 48, 52, 54, 34, 19, 43, 13, 9, 5, 87,
+ 3, 6, 45, 36, 8, 8, 6, 6, 5, 6,
+ 10, 31, 59, 27, 33, 29, 51, 27, 32, 32,
+ 20, 8, 0, 9, 19, 21, 35, 7, 56, 30,
+ 20, 4, 22, 0, 9, 21, 11, 5, 56, 30,
+ 20, 8, 20, 0, 9, 19, 31, 17, 58, 26,
+ 18, 8, 12, 9, 21, 29, 6, 70, 44, 30,
+ 16, 30, 6, 3, 9, 15, 124, 57, 47, 33,
+ 41, 41, 35, 29, 25, 23, 21, 15, 5, 29,
+ 29, 39, 43, 30, 41, 39, 29, 19, 17, 19,
+ 15, 15, 13, 13, 15, 33, 9, 9, 24, 11,
+ 27, 7, 9, 13, 3, 5, 9, 23, 7, 7,
+ 19, 17, 12, 49, 15, 24, 17, 0, 11, 9,
+ 1, 4, 1, 5, 1, 19, 19, 35, 100, 94,
+ 98, 84, 72, 78, 78, 70, 70, 64, 66, 66,
+ 42, 30, 4, 46, 44, 34, 8, 26, 22, 10,
+ 18, 4, 0, 11, 19, 27, 31, 46, 40, 46,
+ 24, 9, 6, 2, 23, 1, 17, 25, 57, 41,
+ 51, 61, 8, 16, 33, 6, 10, 7, 27, 23,
+ 25, 43, 23, 35, 37, 69, 43, 53, 73, 3,
+ 31, 47, 19, 4, 12, 14, 34, 22, 36, 2,
+ 30, 26, 42, 46, 52, 52, 26, 102, 54, 38,
+ 26, 10, 6, 33, 45, 63, 14, 74, 64, 64,
+ 46, 46, 26, 22, 10, 0, 19, 4, 12, 14,
+ 34, 22, 36, 2, 30, 26, 42, 46, 52, 52,
+ 26, 102, 54, 38, 26, 10, 6, 33, 45, 63,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 26 */
+
+ 92, 12, 29, 92, 12, 29, 35, 5, 40, 22,
+ 2, 0, 46, 82, 58, 12, 16, 10, 29, 0,
+ 3, 5, 36, 31, 17, 42, 32, 47, 77, 105,
+ 23, 35, 13, 29, 0, 3, 39, 3, 58, 4,
+ 15, 29, 35, 47, 53, 53, 83, 1, 25, 41,
+ 15, 41, 43, 71, 8, 17, 15, 27, 50, 4,
+ 44, 0, 0, 0, 9, 57, 67, 6, 2, 5,
+ 36, 19, 61, 9, 0, 17, 40, 36, 11, 0,
+ 25, 20, 19, 9, 5, 67, 41, 47, 45, 36,
+ 7, 6, 5, 13, 45, 19, 29, 3, 23, 17,
+ 43, 34, 17, 10, 25, 31, 9, 19, 7, 6,
+ 8, 0, 18, 12, 21, 0, 1, 7, 4, 17,
+ 52, 7, 2, 48, 46, 62, 54, 40, 51, 0,
+ 3, 5, 7, 35, 9, 32, 5, 7, 30, 68,
+ 108, 80, 34, 51, 53, 4, 74, 80, 97, 11,
+ 12, 53, 28, 5, 0, 30, 66, 110, 90, 48,
+ 57, 45, 1, 20, 32, 55, 36, 42, 36, 34,
+ 38, 34, 20, 38, 34, 1, 12, 20, 8, 4,
+ 11, 12, 4, 1, 8, 2, 10, 10, 0, 49,
+ 1, 1, 11, 3, 25, 50, 70, 42, 28, 40,
+ 44, 46, 48, 32, 23, 47, 17, 13, 9, 89,
+ 5, 4, 47, 34, 6, 6, 2, 4, 9, 2,
+ 8, 35, 59, 29, 35, 33, 49, 25, 34, 32,
+ 20, 8, 2, 7, 17, 21, 31, 3, 58, 32,
+ 20, 6, 26, 2, 5, 19, 5, 3, 56, 30,
+ 20, 10, 22, 2, 9, 17, 29, 17, 60, 28,
+ 18, 8, 14, 9, 21, 27, 8, 70, 46, 30,
+ 16, 32, 8, 1, 9, 13, 124, 55, 45, 31,
+ 39, 39, 31, 27, 21, 21, 17, 11, 0, 27,
+ 27, 37, 41, 36, 39, 41, 27, 17, 15, 19,
+ 15, 15, 13, 11, 17, 35, 9, 9, 26, 11,
+ 27, 7, 9, 11, 5, 7, 9, 25, 9, 9,
+ 19, 17, 14, 51, 17, 26, 17, 1, 11, 9,
+ 1, 2, 3, 5, 1, 19, 19, 37, 98, 92,
+ 96, 82, 66, 74, 72, 64, 64, 58, 60, 60,
+ 36, 26, 0, 38, 36, 26, 1, 20, 18, 6,
+ 14, 1, 3, 13, 21, 29, 31, 42, 34, 40,
+ 20, 13, 2, 1, 27, 5, 21, 29, 61, 45,
+ 53, 65, 6, 14, 37, 2, 8, 11, 31, 25,
+ 27, 45, 25, 37, 37, 67, 47, 55, 75, 7,
+ 33, 49, 19, 4, 12, 16, 36, 24, 36, 4,
+ 32, 28, 44, 46, 54, 52, 26, 102, 50, 34,
+ 22, 6, 2, 39, 49, 65, 14, 76, 64, 64,
+ 48, 48, 26, 22, 10, 2, 19, 4, 12, 16,
+ 36, 24, 36, 4, 32, 28, 44, 46, 54, 52,
+ 26, 102, 50, 34, 22, 6, 2, 39, 49, 65,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 27 */
+
+ 90, 12, 31, 90, 12, 31, 31, 3, 42, 22,
+ 2, 1, 42, 80, 58, 14, 20, 8, 29, 4,
+ 3, 7, 38, 33, 19, 40, 30, 55, 81, 107,
+ 19, 31, 13, 29, 4, 3, 37, 1, 60, 4,
+ 13, 27, 33, 51, 55, 53, 85, 1, 25, 41,
+ 15, 43, 41, 71, 6, 17, 15, 25, 50, 4,
+ 44, 0, 0, 0, 7, 57, 67, 6, 2, 7,
+ 36, 17, 59, 5, 2, 15, 44, 40, 9, 2,
+ 21, 22, 17, 9, 1, 69, 41, 47, 45, 38,
+ 7, 8, 1, 13, 43, 17, 25, 3, 23, 17,
+ 43, 36, 19, 12, 25, 31, 7, 19, 5, 6,
+ 10, 2, 20, 12, 21, 0, 1, 7, 4, 17,
+ 52, 7, 2, 48, 46, 64, 56, 42, 51, 2,
+ 1, 7, 7, 35, 9, 34, 5, 5, 32, 70,
+ 108, 84, 36, 53, 55, 6, 74, 80, 97, 13,
+ 12, 53, 28, 7, 2, 32, 66, 110, 92, 50,
+ 59, 43, 3, 14, 26, 51, 36, 42, 36, 32,
+ 38, 32, 20, 36, 34, 1, 12, 20, 6, 4,
+ 13, 12, 2, 3, 8, 2, 10, 10, 1, 53,
+ 1, 3, 11, 5, 25, 44, 66, 38, 26, 36,
+ 40, 40, 44, 28, 27, 51, 19, 17, 13, 93,
+ 7, 0, 51, 30, 4, 4, 0, 0, 13, 1,
+ 4, 37, 59, 31, 37, 35, 45, 21, 38, 34,
+ 20, 8, 4, 7, 17, 19, 29, 1, 60, 34,
+ 22, 6, 28, 4, 3, 17, 1, 3, 58, 32,
+ 22, 10, 24, 2, 7, 15, 27, 17, 64, 30,
+ 18, 8, 14, 7, 21, 27, 8, 72, 46, 32,
+ 16, 34, 8, 1, 7, 13, 124, 51, 41, 27,
+ 35, 35, 29, 23, 19, 17, 13, 7, 4, 27,
+ 25, 37, 41, 42, 39, 43, 27, 17, 13, 19,
+ 15, 15, 13, 11, 17, 37, 9, 9, 26, 11,
+ 27, 7, 9, 11, 5, 7, 9, 27, 11, 9,
+ 19, 17, 16, 53, 17, 28, 19, 1, 11, 9,
+ 0, 2, 3, 3, 0, 19, 19, 41, 96, 90,
+ 94, 78, 62, 68, 68, 58, 60, 52, 54, 54,
+ 30, 20, 3, 32, 30, 18, 11, 16, 12, 2,
+ 10, 5, 7, 15, 23, 31, 31, 38, 30, 36,
+ 16, 17, 0, 3, 31, 9, 27, 33, 65, 49,
+ 57, 67, 4, 10, 41, 0, 4, 13, 35, 29,
+ 31, 47, 27, 39, 39, 65, 49, 59, 79, 9,
+ 37, 51, 17, 6, 14, 16, 38, 26, 38, 4,
+ 34, 30, 46, 48, 56, 54, 28, 100, 46, 30,
+ 18, 2, 1, 43, 53, 67, 16, 76, 66, 66,
+ 48, 50, 28, 24, 12, 2, 17, 6, 14, 16,
+ 38, 26, 38, 4, 34, 30, 46, 48, 56, 54,
+ 28, 100, 46, 30, 18, 2, 1, 43, 53, 67,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 28 */
+
+ 86, 12, 31, 86, 12, 31, 29, 0, 42, 22,
+ 0, 5, 40, 78, 60, 14, 24, 4, 29, 6,
+ 3, 7, 40, 35, 23, 36, 26, 61, 85, 109,
+ 15, 29, 13, 29, 6, 3, 37, 0, 60, 2,
+ 11, 25, 31, 53, 59, 55, 85, 1, 25, 39,
+ 15, 43, 41, 71, 6, 17, 15, 25, 52, 4,
+ 44, 0, 0, 0, 7, 59, 67, 8, 0, 7,
+ 34, 17, 57, 1, 4, 13, 46, 42, 9, 4,
+ 17, 24, 15, 7, 2, 71, 41, 47, 45, 38,
+ 9, 10, 2, 13, 41, 15, 21, 3, 23, 17,
+ 43, 36, 19, 12, 23, 31, 7, 19, 5, 6,
+ 10, 4, 20, 12, 21, 1, 1, 7, 4, 17,
+ 52, 7, 2, 50, 46, 64, 56, 42, 53, 2,
+ 0, 7, 7, 35, 9, 36, 5, 5, 34, 72,
+ 110, 86, 40, 53, 57, 6, 74, 80, 99, 13,
+ 14, 55, 30, 7, 2, 34, 66, 110, 92, 52,
+ 61, 39, 5, 8, 20, 49, 34, 42, 34, 30,
+ 36, 32, 18, 36, 32, 3, 10, 18, 6, 4,
+ 13, 10, 0, 5, 6, 0, 8, 8, 1, 55,
+ 3, 5, 11, 9, 27, 40, 62, 34, 22, 32,
+ 36, 32, 38, 26, 31, 57, 23, 21, 17, 95,
+ 11, 1, 55, 28, 2, 2, 1, 1, 17, 5,
+ 0, 41, 59, 33, 39, 39, 43, 19, 40, 34,
+ 20, 8, 6, 5, 15, 17, 25, 0, 62, 34,
+ 24, 8, 30, 6, 0, 15, 2, 1, 58, 32,
+ 22, 12, 26, 4, 7, 15, 25, 17, 66, 32,
+ 18, 8, 16, 7, 21, 25, 10, 72, 48, 32,
+ 16, 36, 10, 0, 7, 11, 124, 49, 39, 25,
+ 31, 33, 25, 21, 15, 15, 11, 3, 8, 25,
+ 23, 35, 39, 48, 39, 45, 27, 15, 13, 19,
+ 15, 15, 15, 9, 17, 39, 9, 9, 28, 11,
+ 27, 7, 9, 11, 7, 7, 9, 29, 13, 9,
+ 19, 17, 18, 55, 17, 28, 21, 1, 11, 9,
+ 0, 2, 5, 3, 2, 19, 19, 43, 94, 88,
+ 92, 74, 58, 64, 62, 52, 54, 46, 46, 48,
+ 22, 14, 7, 24, 22, 10, 21, 10, 8, 1,
+ 6, 11, 11, 19, 25, 33, 31, 32, 26, 32,
+ 12, 21, 3, 7, 35, 13, 31, 37, 69, 55,
+ 61, 69, 2, 8, 47, 3, 0, 17, 39, 31,
+ 33, 49, 29, 41, 39, 63, 51, 61, 81, 11,
+ 39, 53, 17, 6, 14, 18, 40, 26, 40, 4,
+ 36, 32, 48, 48, 58, 54, 30, 100, 42, 26,
+ 14, 1, 5, 47, 57, 71, 16, 78, 66, 68,
+ 50, 50, 28, 24, 12, 4, 17, 6, 14, 18,
+ 40, 26, 40, 4, 36, 32, 48, 48, 58, 54,
+ 30, 100, 42, 26, 14, 1, 5, 47, 57, 71,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 29 */
+
+ 84, 12, 31, 84, 12, 31, 25, 2, 42, 22,
+ 0, 9, 36, 76, 62, 14, 26, 0, 27, 10,
+ 5, 7, 42, 37, 25, 34, 22, 67, 91, 109,
+ 11, 27, 13, 27, 10, 5, 37, 2, 60, 2,
+ 11, 23, 27, 55, 61, 55, 85, 1, 23, 37,
+ 17, 43, 41, 71, 6, 15, 15, 23, 52, 4,
+ 44, 0, 0, 0, 5, 59, 67, 8, 1, 7,
+ 34, 15, 53, 4, 6, 11, 50, 46, 7, 6,
+ 13, 26, 13, 5, 8, 71, 43, 47, 47, 38,
+ 9, 12, 8, 11, 41, 15, 17, 3, 23, 17,
+ 41, 36, 19, 14, 21, 29, 5, 17, 5, 8,
+ 12, 6, 22, 12, 23, 1, 0, 7, 4, 17,
+ 54, 7, 2, 50, 48, 64, 56, 42, 53, 4,
+ 0, 9, 7, 33, 9, 38, 3, 3, 38, 76,
+ 112, 90, 44, 55, 59, 8, 76, 80, 101, 13,
+ 14, 55, 32, 7, 2, 38, 66, 110, 92, 56,
+ 63, 37, 7, 4, 14, 47, 34, 42, 34, 30,
+ 36, 32, 18, 36, 32, 5, 8, 18, 6, 4,
+ 13, 10, 0, 7, 6, 0, 8, 6, 3, 57,
+ 5, 5, 11, 11, 27, 36, 58, 32, 20, 30,
+ 32, 26, 34, 24, 35, 61, 25, 25, 21, 97,
+ 13, 3, 57, 26, 0, 0, 5, 5, 21, 9,
+ 1, 43, 59, 35, 41, 43, 39, 15, 42, 36,
+ 20, 8, 8, 3, 13, 17, 21, 4, 64, 36,
+ 24, 10, 34, 8, 2, 13, 8, 0, 60, 34,
+ 24, 12, 28, 6, 5, 13, 23, 15, 68, 34,
+ 18, 8, 18, 5, 19, 23, 12, 72, 50, 32,
+ 16, 38, 12, 2, 5, 9, 124, 47, 35, 21,
+ 29, 31, 23, 17, 11, 11, 7, 0, 14, 23,
+ 21, 33, 37, 54, 37, 47, 25, 13, 11, 19,
+ 15, 15, 15, 9, 19, 41, 9, 9, 30, 11,
+ 27, 7, 9, 9, 7, 9, 9, 31, 15, 11,
+ 19, 17, 20, 55, 19, 30, 21, 3, 11, 9,
+ 0, 0, 5, 3, 2, 19, 19, 47, 92, 86,
+ 90, 72, 52, 58, 56, 46, 50, 40, 40, 42,
+ 16, 10, 11, 16, 16, 2, 31, 4, 4, 3,
+ 2, 15, 15, 21, 27, 35, 31, 28, 20, 26,
+ 8, 25, 7, 9, 39, 17, 35, 41, 73, 59,
+ 63, 73, 0, 6, 51, 5, 1, 19, 43, 33,
+ 35, 51, 31, 43, 39, 61, 55, 63, 85, 15,
+ 41, 55, 17, 8, 16, 20, 42, 28, 40, 6,
+ 38, 34, 50, 50, 60, 56, 30, 98, 38, 22,
+ 10, 5, 9, 53, 61, 73, 18, 78, 68, 68,
+ 52, 52, 30, 26, 12, 6, 17, 8, 16, 20,
+ 42, 28, 40, 6, 38, 34, 50, 50, 60, 56,
+ 30, 98, 38, 22, 10, 5, 9, 53, 61, 73,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 30 */
+
+ 82, 12, 31, 82, 12, 31, 21, 6, 44, 22,
+ 1, 13, 34, 74, 62, 14, 30, 1, 27, 12,
+ 5, 9, 44, 39, 27, 32, 18, 75, 95, 111,
+ 7, 23, 13, 27, 12, 5, 35, 4, 62, 2,
+ 9, 21, 25, 57, 63, 55, 87, 1, 23, 35,
+ 17, 45, 41, 71, 6, 15, 15, 21, 54, 4,
+ 44, 0, 0, 0, 5, 59, 67, 10, 3, 7,
+ 32, 15, 51, 8, 8, 9, 52, 48, 5, 8,
+ 9, 28, 11, 3, 12, 73, 43, 47, 47, 40,
+ 9, 14, 12, 11, 39, 13, 13, 3, 23, 17,
+ 41, 38, 19, 14, 19, 29, 3, 17, 5, 8,
+ 12, 8, 22, 12, 23, 1, 0, 7, 4, 17,
+ 54, 7, 2, 52, 48, 66, 56, 44, 55, 4,
+ 2, 9, 7, 33, 9, 40, 3, 3, 40, 78,
+ 114, 92, 48, 55, 61, 8, 76, 80, 103, 13,
+ 16, 57, 32, 7, 2, 40, 66, 110, 92, 58,
+ 65, 33, 9, 1, 8, 43, 34, 42, 32, 28,
+ 34, 30, 16, 36, 30, 7, 6, 18, 6, 4,
+ 15, 8, 1, 9, 4, 0, 8, 4, 3, 59,
+ 5, 7, 11, 13, 29, 32, 54, 28, 16, 26,
+ 28, 20, 28, 22, 39, 65, 29, 29, 25, 101,
+ 15, 5, 61, 24, 1, 1, 7, 7, 25, 13,
+ 5, 47, 59, 37, 43, 45, 37, 13, 46, 36,
+ 20, 8, 10, 1, 11, 15, 17, 6, 66, 38,
+ 26, 10, 36, 10, 6, 11, 12, 0, 60, 34,
+ 24, 14, 30, 6, 5, 11, 21, 15, 72, 36,
+ 18, 8, 20, 5, 19, 21, 14, 74, 52, 34,
+ 16, 40, 12, 4, 5, 9, 124, 43, 33, 19,
+ 25, 29, 19, 15, 9, 9, 3, 4, 18, 23,
+ 19, 33, 35, 60, 37, 49, 25, 11, 9, 19,
+ 15, 15, 15, 7, 19, 43, 9, 9, 30, 11,
+ 27, 7, 9, 9, 9, 9, 9, 33, 17, 11,
+ 19, 17, 22, 57, 19, 32, 23, 3, 11, 9,
+ 2, 0, 7, 1, 4, 19, 19, 49, 90, 84,
+ 88, 68, 48, 54, 52, 40, 44, 34, 34, 36,
+ 10, 4, 15, 8, 8, 5, 41, 1, 0, 7,
+ 1, 21, 19, 23, 29, 37, 31, 24, 16, 22,
+ 4, 29, 11, 13, 43, 21, 39, 45, 77, 63,
+ 67, 75, 1, 4, 55, 9, 5, 23, 47, 37,
+ 39, 53, 33, 45, 39, 59, 57, 67, 87, 17,
+ 43, 57, 17, 8, 16, 20, 44, 30, 42, 6,
+ 40, 36, 52, 50, 62, 56, 32, 98, 34, 18,
+ 6, 9, 13, 57, 65, 75, 18, 80, 68, 70,
+ 52, 54, 30, 26, 14, 6, 17, 8, 16, 20,
+ 44, 30, 42, 6, 40, 36, 52, 50, 62, 56,
+ 32, 98, 34, 18, 6, 9, 13, 57, 65, 75,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 31 */
+
+ 80, 12, 31, 80, 12, 31, 17, 8, 44, 22,
+ 1, 17, 30, 72, 64, 14, 34, 5, 27, 16,
+ 5, 9, 46, 41, 29, 30, 14, 81, 99, 113,
+ 3, 21, 13, 27, 16, 5, 35, 6, 62, 2,
+ 7, 19, 23, 59, 65, 55, 87, 1, 23, 33,
+ 17, 45, 41, 71, 6, 15, 15, 19, 54, 4,
+ 44, 0, 0, 0, 3, 59, 67, 10, 5, 7,
+ 32, 13, 49, 12, 10, 7, 56, 52, 3, 10,
+ 5, 30, 9, 1, 16, 75, 43, 47, 47, 40,
+ 9, 16, 16, 11, 37, 11, 9, 3, 23, 17,
+ 41, 38, 19, 16, 17, 29, 1, 17, 5, 8,
+ 14, 10, 24, 12, 23, 1, 0, 7, 4, 17,
+ 54, 7, 2, 52, 48, 66, 56, 44, 55, 6,
+ 4, 11, 7, 33, 9, 42, 3, 1, 42, 80,
+ 116, 96, 52, 57, 63, 10, 76, 80, 105, 13,
+ 16, 57, 34, 7, 2, 42, 66, 110, 92, 60,
+ 67, 31, 11, 7, 2, 41, 34, 42, 32, 26,
+ 34, 30, 16, 36, 30, 9, 4, 18, 6, 4,
+ 15, 8, 3, 11, 4, 0, 8, 2, 5, 61,
+ 7, 9, 11, 15, 29, 28, 50, 24, 14, 22,
+ 24, 14, 24, 20, 43, 69, 31, 33, 29, 103,
+ 17, 7, 65, 22, 3, 3, 9, 11, 29, 17,
+ 9, 49, 59, 39, 45, 49, 33, 9, 48, 38,
+ 20, 8, 12, 0, 9, 13, 13, 8, 68, 40,
+ 28, 12, 38, 12, 8, 9, 16, 2, 62, 36,
+ 26, 14, 32, 8, 3, 9, 19, 15, 74, 38,
+ 18, 8, 22, 3, 19, 19, 16, 74, 54, 34,
+ 16, 42, 14, 6, 3, 7, 124, 41, 29, 15,
+ 21, 27, 17, 11, 5, 5, 0, 8, 22, 21,
+ 17, 31, 33, 66, 37, 51, 25, 9, 7, 19,
+ 15, 15, 15, 7, 19, 45, 9, 9, 32, 11,
+ 27, 7, 9, 9, 9, 9, 9, 35, 19, 11,
+ 19, 17, 24, 59, 19, 34, 25, 3, 11, 9,
+ 2, 0, 7, 1, 6, 19, 19, 53, 88, 82,
+ 86, 64, 44, 48, 46, 34, 40, 28, 28, 30,
+ 4, 1, 19, 0, 2, 13, 51, 7, 3, 11,
+ 5, 25, 23, 25, 31, 39, 31, 20, 12, 18,
+ 0, 33, 15, 15, 47, 25, 43, 49, 81, 67,
+ 71, 77, 3, 2, 59, 11, 9, 25, 51, 39,
+ 41, 55, 35, 47, 39, 57, 59, 69, 91, 19,
+ 45, 59, 17, 10, 18, 22, 46, 32, 44, 6,
+ 42, 38, 54, 52, 64, 58, 34, 96, 30, 14,
+ 2, 13, 17, 61, 69, 77, 20, 80, 70, 72,
+ 54, 56, 32, 28, 14, 8, 17, 10, 18, 22,
+ 46, 32, 44, 6, 42, 38, 54, 52, 64, 58,
+ 34, 96, 30, 14, 2, 13, 17, 61, 69, 77,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 32 */
+
+ 76, 10, 33, 76, 10, 33, 15, 10, 44, 22,
+ 3, 21, 26, 70, 64, 14, 36, 9, 27, 18,
+ 7, 11, 48, 43, 33, 26, 10, 89, 105, 115,
+ 0, 19, 13, 27, 18, 7, 35, 6, 62, 0,
+ 7, 19, 21, 63, 69, 57, 89, 3, 23, 33,
+ 19, 47, 41, 71, 4, 15, 15, 19, 54, 2,
+ 44, 0, 0, 0, 3, 61, 67, 10, 7, 9,
+ 30, 13, 47, 16, 12, 7, 58, 54, 3, 12,
+ 3, 30, 7, 1, 20, 77, 45, 49, 49, 40,
+ 11, 16, 20, 11, 37, 11, 7, 3, 23, 17,
+ 41, 38, 21, 16, 17, 29, 1, 17, 5, 8,
+ 14, 12, 24, 12, 25, 3, 0, 7, 2, 17,
+ 54, 7, 2, 52, 48, 66, 56, 44, 57, 6,
+ 4, 13, 7, 33, 11, 42, 3, 1, 44, 82,
+ 116, 98, 54, 59, 67, 10, 76, 80, 107, 15,
+ 16, 59, 34, 9, 2, 44, 66, 108, 92, 62,
+ 69, 29, 15, 13, 5, 39, 32, 42, 30, 24,
+ 32, 28, 14, 34, 28, 11, 2, 16, 4, 2,
+ 17, 6, 5, 13, 2, 1, 6, 0, 7, 65,
+ 9, 11, 11, 19, 31, 22, 44, 20, 10, 18,
+ 18, 6, 18, 16, 47, 75, 35, 39, 33, 107,
+ 21, 11, 69, 18, 5, 5, 13, 15, 33, 21,
+ 13, 53, 59, 41, 47, 53, 31, 7, 50, 38,
+ 20, 8, 12, 0, 9, 13, 11, 10, 68, 40,
+ 28, 12, 40, 14, 10, 9, 20, 2, 62, 36,
+ 26, 14, 32, 8, 3, 9, 17, 15, 76, 40,
+ 18, 6, 22, 3, 19, 19, 16, 74, 54, 34,
+ 16, 44, 14, 6, 3, 7, 124, 39, 27, 13,
+ 19, 25, 15, 9, 3, 3, 2, 10, 26, 21,
+ 17, 31, 33, 72, 37, 55, 25, 9, 7, 19,
+ 15, 15, 17, 7, 21, 47, 11, 9, 32, 13,
+ 27, 9, 9, 9, 11, 11, 11, 37, 21, 13,
+ 21, 17, 26, 61, 21, 34, 27, 5, 11, 11,
+ 2, 1, 9, 1, 6, 19, 19, 57, 84, 80,
+ 82, 60, 38, 42, 40, 28, 34, 22, 20, 24,
+ 3, 7, 23, 7, 5, 21, 63, 13, 9, 15,
+ 11, 31, 27, 29, 35, 41, 31, 14, 6, 12,
+ 3, 39, 19, 19, 53, 29, 49, 53, 87, 73,
+ 75, 81, 7, 1, 65, 15, 13, 29, 55, 43,
+ 45, 57, 37, 49, 41, 55, 63, 73, 95, 23,
+ 49, 63, 17, 10, 18, 22, 48, 32, 44, 6,
+ 44, 38, 56, 52, 64, 58, 34, 94, 26, 10,
+ 3, 19, 21, 67, 73, 81, 20, 80, 70, 72,
+ 54, 56, 32, 28, 14, 8, 17, 10, 18, 22,
+ 48, 32, 44, 6, 44, 38, 56, 52, 64, 58,
+ 34, 94, 26, 10, 3, 19, 21, 67, 73, 81,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 33 */
+
+ 74, 10, 33, 74, 10, 33, 11, 14, 46, 24,
+ 3, 23, 24, 70, 66, 16, 40, 11, 25, 22,
+ 7, 11, 52, 43, 35, 24, 8, 95, 109, 115,
+ 4, 15, 11, 25, 22, 7, 33, 8, 64, 0,
+ 5, 17, 17, 65, 71, 57, 89, 3, 21, 31,
+ 19, 47, 39, 69, 4, 13, 13, 17, 56, 2,
+ 44, 0, 0, 0, 1, 61, 67, 12, 7, 9,
+ 30, 11, 43, 22, 16, 5, 62, 58, 1, 16,
+ 0, 32, 3, 0, 26, 77, 45, 49, 49, 42,
+ 11, 18, 26, 9, 35, 9, 3, 3, 21, 17,
+ 39, 40, 21, 18, 15, 27, 0, 15, 3, 10,
+ 16, 14, 26, 14, 25, 3, 2, 5, 2, 15,
+ 56, 5, 2, 54, 50, 68, 58, 46, 57, 8,
+ 6, 13, 7, 31, 11, 44, 1, 0, 48, 86,
+ 118, 102, 58, 59, 69, 12, 78, 82, 107, 15,
+ 18, 59, 36, 9, 4, 48, 66, 108, 94, 66,
+ 71, 25, 17, 17, 11, 35, 32, 42, 30, 24,
+ 32, 28, 14, 34, 28, 11, 2, 16, 4, 2,
+ 17, 6, 5, 13, 2, 1, 6, 0, 7, 67,
+ 9, 11, 11, 21, 31, 18, 40, 18, 8, 16,
+ 14, 0, 14, 14, 49, 79, 37, 43, 35, 109,
+ 23, 13, 71, 16, 5, 5, 15, 17, 35, 23,
+ 15, 55, 59, 41, 47, 55, 27, 3, 54, 40,
+ 20, 8, 14, 2, 7, 11, 7, 14, 70, 42,
+ 30, 14, 44, 16, 14, 7, 26, 4, 64, 38,
+ 28, 16, 34, 10, 1, 7, 13, 13, 80, 42,
+ 20, 6, 24, 1, 17, 17, 18, 76, 56, 36,
+ 18, 46, 16, 8, 1, 5, 124, 35, 23, 9,
+ 15, 21, 11, 5, 0, 0, 6, 14, 32, 19,
+ 15, 29, 31, 80, 35, 57, 23, 7, 5, 17,
+ 15, 13, 17, 5, 21, 47, 11, 9, 34, 13,
+ 27, 9, 9, 7, 11, 11, 11, 37, 21, 13,
+ 21, 15, 30, 61, 21, 36, 27, 5, 11, 11,
+ 4, 1, 9, 0, 8, 17, 17, 59, 82, 78,
+ 80, 58, 34, 38, 36, 22, 30, 18, 14, 20,
+ 9, 11, 27, 13, 11, 27, 73, 17, 13, 17,
+ 15, 35, 29, 31, 37, 41, 31, 10, 2, 8,
+ 7, 43, 21, 21, 57, 31, 53, 55, 91, 77,
+ 77, 83, 9, 3, 69, 17, 15, 31, 57, 45,
+ 47, 59, 37, 49, 41, 53, 65, 75, 97, 25,
+ 51, 65, 15, 12, 20, 24, 52, 34, 46, 8,
+ 48, 40, 58, 54, 66, 60, 36, 94, 24, 8,
+ 7, 23, 23, 71, 75, 83, 22, 82, 72, 74,
+ 56, 58, 34, 30, 16, 10, 15, 12, 20, 24,
+ 52, 34, 46, 8, 48, 40, 58, 54, 66, 60,
+ 36, 94, 24, 8, 7, 23, 23, 71, 75, 83,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 34 */
+
+ 72, 10, 33, 72, 10, 33, 7, 16, 46, 24,
+ 3, 27, 20, 68, 68, 16, 44, 15, 25, 24,
+ 7, 11, 54, 45, 37, 22, 4, 101, 113, 117,
+ 8, 13, 11, 25, 24, 7, 33, 10, 64, 0,
+ 3, 15, 15, 67, 73, 57, 89, 3, 21, 29,
+ 19, 47, 39, 69, 4, 13, 13, 15, 56, 2,
+ 44, 0, 0, 0, 0, 61, 67, 12, 9, 9,
+ 30, 9, 41, 26, 18, 3, 66, 62, 0, 18,
+ 4, 34, 1, 2, 30, 79, 45, 49, 49, 42,
+ 11, 20, 30, 9, 33, 7, 0, 3, 21, 17,
+ 39, 40, 21, 18, 13, 27, 2, 15, 3, 10,
+ 16, 16, 28, 14, 25, 3, 2, 5, 2, 15,
+ 56, 5, 2, 54, 50, 68, 58, 46, 59, 10,
+ 8, 15, 7, 31, 11, 46, 1, 0, 50, 88,
+ 120, 106, 62, 61, 71, 12, 78, 82, 109, 15,
+ 18, 61, 38, 9, 4, 50, 66, 108, 94, 68,
+ 73, 23, 19, 23, 17, 33, 32, 42, 30, 22,
+ 30, 28, 14, 34, 26, 13, 0, 16, 4, 2,
+ 17, 4, 7, 15, 2, 1, 6, 1, 9, 69,
+ 11, 13, 11, 23, 33, 14, 36, 14, 4, 12,
+ 10, 5, 10, 12, 53, 83, 41, 47, 39, 111,
+ 25, 15, 75, 14, 7, 7, 17, 21, 39, 27,
+ 19, 59, 59, 43, 49, 59, 25, 1, 56, 42,
+ 20, 8, 16, 4, 5, 9, 3, 16, 72, 44,
+ 32, 16, 46, 18, 16, 5, 30, 6, 64, 38,
+ 28, 16, 36, 12, 0, 5, 11, 13, 82, 44,
+ 20, 6, 26, 0, 17, 15, 20, 76, 58, 36,
+ 18, 48, 18, 10, 0, 3, 124, 33, 19, 5,
+ 11, 19, 9, 1, 4, 2, 10, 18, 36, 17,
+ 13, 27, 29, 86, 35, 59, 23, 5, 3, 17,
+ 15, 13, 17, 5, 21, 49, 11, 9, 36, 13,
+ 27, 9, 9, 7, 11, 11, 11, 39, 23, 13,
+ 21, 15, 32, 63, 21, 38, 29, 5, 11, 11,
+ 4, 1, 9, 0, 10, 17, 17, 63, 80, 76,
+ 78, 54, 30, 32, 30, 16, 26, 12, 8, 14,
+ 15, 17, 31, 21, 19, 35, 83, 23, 17, 21,
+ 19, 39, 33, 33, 39, 43, 31, 6, 1, 4,
+ 11, 47, 25, 25, 61, 35, 57, 59, 95, 81,
+ 81, 85, 11, 5, 73, 21, 19, 35, 61, 47,
+ 49, 61, 39, 51, 41, 51, 67, 77, 101, 27,
+ 53, 67, 15, 12, 22, 26, 54, 36, 48, 8,
+ 50, 42, 60, 54, 68, 62, 38, 92, 20, 4,
+ 11, 27, 27, 75, 79, 85, 24, 82, 72, 76,
+ 58, 60, 34, 32, 16, 12, 15, 12, 22, 26,
+ 54, 36, 48, 8, 50, 42, 60, 54, 68, 62,
+ 38, 92, 20, 4, 11, 27, 27, 75, 79, 85,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 35 */
+
+ 70, 10, 33, 70, 10, 33, 3, 20, 48, 24,
+ 5, 31, 18, 66, 68, 16, 48, 17, 25, 28,
+ 7, 13, 56, 47, 39, 20, 0, 109, 117, 119,
+ 12, 9, 11, 25, 28, 7, 31, 12, 66, 0,
+ 1, 13, 13, 69, 75, 57, 91, 3, 21, 27,
+ 19, 49, 39, 69, 4, 13, 13, 13, 58, 2,
+ 44, 0, 0, 0, 0, 61, 67, 14, 11, 9,
+ 28, 9, 39, 30, 20, 1, 68, 64, 2, 20,
+ 8, 36, 0, 4, 34, 81, 45, 49, 49, 44,
+ 11, 22, 34, 9, 31, 5, 4, 3, 21, 17,
+ 39, 42, 21, 20, 11, 27, 4, 15, 3, 10,
+ 18, 18, 28, 14, 25, 3, 2, 5, 2, 15,
+ 56, 5, 2, 56, 50, 70, 58, 48, 59, 10,
+ 10, 15, 7, 31, 11, 48, 1, 2, 52, 90,
+ 122, 108, 66, 61, 73, 14, 78, 82, 111, 15,
+ 20, 61, 38, 9, 4, 52, 66, 108, 94, 70,
+ 75, 19, 21, 29, 23, 29, 32, 42, 28, 20,
+ 30, 26, 12, 34, 26, 15, 1, 16, 4, 2,
+ 19, 4, 9, 17, 0, 1, 6, 3, 9, 71,
+ 11, 15, 11, 25, 33, 10, 32, 10, 2, 8,
+ 6, 11, 4, 10, 57, 87, 43, 51, 43, 115,
+ 27, 17, 79, 12, 9, 9, 19, 23, 43, 31,
+ 23, 61, 59, 45, 51, 61, 21, 2, 60, 42,
+ 20, 8, 18, 6, 3, 7, 0, 18, 74, 46,
+ 34, 16, 48, 20, 20, 3, 34, 6, 66, 40,
+ 30, 18, 38, 12, 0, 3, 9, 13, 86, 46,
+ 20, 6, 28, 0, 17, 13, 22, 78, 60, 38,
+ 18, 50, 18, 12, 0, 3, 124, 29, 17, 3,
+ 7, 17, 5, 0, 6, 6, 14, 22, 40, 17,
+ 11, 27, 27, 92, 35, 61, 23, 3, 1, 17,
+ 15, 13, 17, 3, 21, 51, 11, 9, 36, 13,
+ 27, 9, 9, 7, 13, 11, 11, 41, 25, 13,
+ 21, 15, 34, 65, 21, 40, 31, 5, 11, 11,
+ 6, 1, 11, 2, 12, 17, 17, 65, 78, 74,
+ 76, 50, 26, 28, 26, 10, 20, 6, 2, 8,
+ 21, 23, 35, 29, 25, 43, 93, 29, 21, 25,
+ 23, 45, 37, 35, 41, 45, 31, 2, 5, 0,
+ 15, 51, 29, 27, 65, 39, 61, 63, 99, 85,
+ 85, 87, 13, 7, 77, 23, 23, 37, 65, 51,
+ 53, 63, 41, 53, 41, 49, 69, 81, 103, 29,
+ 55, 69, 15, 14, 22, 26, 56, 38, 50, 8,
+ 52, 44, 62, 56, 70, 62, 40, 92, 16, 0,
+ 15, 31, 31, 79, 83, 87, 24, 84, 74, 78,
+ 58, 62, 36, 32, 18, 12, 15, 14, 22, 26,
+ 56, 38, 50, 8, 52, 44, 62, 56, 70, 62,
+ 40, 92, 16, 0, 15, 31, 31, 79, 83, 87,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 36 */
+
+ 66, 10, 33, 66, 10, 33, 1, 22, 48, 24,
+ 5, 35, 14, 64, 70, 16, 50, 21, 25, 30,
+ 9, 13, 58, 49, 43, 16, 3, 115, 123, 121,
+ 16, 7, 11, 25, 30, 9, 31, 14, 66, 1,
+ 1, 11, 9, 71, 79, 59, 91, 3, 19, 25,
+ 21, 49, 39, 69, 4, 13, 13, 13, 58, 2,
+ 44, 0, 0, 0, 2, 63, 67, 14, 13, 9,
+ 28, 7, 37, 34, 22, 0, 72, 68, 2, 22,
+ 12, 38, 2, 6, 40, 81, 47, 49, 51, 44,
+ 13, 24, 40, 7, 31, 5, 8, 3, 21, 17,
+ 39, 42, 21, 20, 9, 27, 4, 13, 3, 10,
+ 18, 20, 30, 14, 27, 5, 2, 5, 2, 15,
+ 56, 5, 2, 56, 50, 70, 58, 48, 61, 12,
+ 10, 17, 7, 29, 11, 50, 0, 2, 56, 94,
+ 124, 112, 70, 63, 75, 14, 80, 82, 113, 15,
+ 20, 63, 40, 9, 4, 54, 66, 108, 94, 74,
+ 77, 17, 23, 35, 29, 27, 30, 42, 28, 20,
+ 28, 26, 12, 34, 24, 17, 3, 14, 4, 2,
+ 19, 2, 9, 19, 0, 3, 4, 5, 11, 73,
+ 13, 17, 11, 29, 35, 6, 28, 6, 1, 6,
+ 2, 19, 0, 8, 61, 93, 47, 55, 47, 117,
+ 31, 19, 81, 10, 11, 11, 23, 27, 47, 35,
+ 25, 65, 59, 47, 53, 65, 19, 4, 62, 44,
+ 20, 8, 20, 8, 1, 7, 4, 22, 76, 46,
+ 34, 18, 52, 22, 22, 1, 40, 8, 66, 40,
+ 30, 18, 40, 14, 2, 3, 7, 13, 88, 48,
+ 20, 6, 30, 2, 17, 11, 24, 78, 62, 38,
+ 18, 52, 20, 14, 2, 1, 124, 27, 13, 0,
+ 5, 15, 3, 4, 10, 8, 16, 26, 46, 15,
+ 9, 25, 25, 98, 33, 63, 21, 1, 1, 17,
+ 15, 13, 19, 3, 23, 53, 11, 9, 38, 13,
+ 27, 9, 9, 5, 13, 13, 11, 43, 27, 15,
+ 21, 15, 36, 67, 23, 40, 31, 7, 11, 11,
+ 6, 3, 11, 2, 12, 17, 17, 69, 76, 72,
+ 74, 48, 20, 22, 20, 4, 16, 0, 5, 2,
+ 29, 27, 39, 37, 33, 51, 103, 35, 25, 29,
+ 27, 49, 41, 39, 43, 47, 31, 3, 11, 5,
+ 19, 55, 33, 31, 69, 43, 65, 67, 103, 91,
+ 87, 91, 15, 9, 83, 27, 25, 41, 69, 53,
+ 55, 65, 43, 55, 41, 47, 73, 83, 107, 33,
+ 57, 71, 15, 14, 24, 28, 58, 38, 50, 10,
+ 54, 46, 64, 56, 72, 64, 40, 90, 12, 3,
+ 19, 35, 35, 85, 87, 91, 26, 84, 74, 78,
+ 60, 62, 36, 34, 18, 14, 15, 14, 24, 28,
+ 58, 38, 50, 10, 54, 46, 64, 56, 72, 64,
+ 40, 90, 12, 3, 19, 35, 35, 85, 87, 91,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 37 */
+
+ 64, 10, 33, 64, 10, 33, 2, 26, 48, 24,
+ 7, 39, 12, 62, 72, 16, 54, 25, 23, 34,
+ 9, 13, 60, 51, 45, 14, 7, 121, 125, 121,
+ 20, 5, 11, 23, 34, 9, 31, 16, 66, 1,
+ 0, 9, 7, 73, 81, 59, 91, 3, 19, 23,
+ 21, 49, 39, 69, 4, 11, 13, 11, 60, 2,
+ 44, 0, 0, 0, 2, 63, 67, 16, 15, 9,
+ 26, 7, 33, 40, 24, 2, 74, 70, 4, 24,
+ 16, 40, 4, 8, 44, 83, 47, 49, 51, 44,
+ 13, 26, 44, 7, 29, 3, 12, 3, 21, 17,
+ 37, 42, 21, 22, 7, 25, 6, 13, 3, 12,
+ 20, 22, 30, 14, 27, 5, 4, 5, 2, 15,
+ 58, 5, 2, 58, 52, 70, 58, 48, 61, 12,
+ 12, 17, 7, 29, 11, 52, 0, 4, 58, 96,
+ 124, 114, 74, 63, 77, 16, 80, 82, 115, 15,
+ 22, 63, 42, 9, 4, 58, 66, 108, 94, 76,
+ 79, 13, 25, 39, 35, 25, 30, 42, 26, 18,
+ 28, 26, 10, 34, 24, 19, 5, 14, 4, 2,
+ 19, 2, 11, 21, 1, 3, 4, 7, 11, 75,
+ 15, 17, 11, 31, 35, 2, 24, 4, 3, 2,
+ 1, 25, 5, 6, 65, 97, 49, 59, 51, 119,
+ 33, 21, 85, 8, 13, 13, 25, 29, 51, 39,
+ 29, 67, 59, 49, 55, 69, 15, 8, 64, 44,
+ 20, 8, 22, 10, 0, 5, 8, 24, 78, 48,
+ 36, 20, 54, 24, 26, 0, 44, 10, 68, 42,
+ 32, 20, 42, 16, 2, 1, 5, 11, 90, 50,
+ 20, 6, 32, 2, 15, 9, 26, 78, 64, 38,
+ 18, 54, 22, 16, 2, 0, 124, 25, 11, 2,
+ 1, 13, 0, 6, 14, 12, 20, 30, 50, 13,
+ 7, 23, 23, 104, 33, 65, 21, 0, 0, 17,
+ 15, 13, 19, 1, 23, 55, 11, 9, 40, 13,
+ 27, 9, 9, 5, 15, 13, 11, 45, 29, 15,
+ 21, 15, 38, 67, 23, 42, 33, 7, 11, 11,
+ 6, 3, 13, 2, 14, 17, 17, 71, 74, 70,
+ 72, 44, 16, 18, 14, 1, 10, 5, 11, 3,
+ 35, 33, 43, 45, 39, 59, 113, 41, 29, 31,
+ 31, 55, 45, 41, 45, 49, 31, 7, 15, 9,
+ 23, 59, 37, 33, 73, 47, 69, 71, 107, 95,
+ 91, 93, 17, 11, 87, 29, 29, 43, 73, 55,
+ 57, 67, 45, 57, 41, 45, 75, 85, 109, 35,
+ 59, 73, 15, 16, 24, 30, 60, 40, 52, 10,
+ 56, 48, 66, 58, 74, 64, 42, 90, 8, 7,
+ 23, 39, 39, 89, 91, 93, 26, 86, 76, 80,
+ 62, 64, 38, 34, 18, 16, 15, 16, 24, 30,
+ 60, 40, 52, 10, 56, 48, 66, 58, 74, 64,
+ 42, 90, 8, 7, 23, 39, 39, 89, 91, 93,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 38 */
+
+ 62, 10, 35, 62, 10, 35, 6, 28, 50, 24,
+ 7, 41, 8, 60, 72, 18, 58, 27, 23, 36,
+ 9, 15, 62, 53, 47, 12, 9, 125, 125, 123,
+ 24, 1, 11, 23, 36, 9, 29, 18, 68, 1,
+ 2, 7, 5, 77, 83, 59, 93, 3, 19, 23,
+ 21, 51, 37, 69, 2, 11, 13, 9, 60, 2,
+ 44, 0, 0, 0, 4, 63, 67, 16, 15, 11,
+ 26, 5, 31, 44, 26, 4, 78, 74, 6, 26,
+ 20, 42, 6, 8, 48, 85, 47, 49, 51, 46,
+ 13, 28, 48, 7, 27, 1, 16, 3, 21, 17,
+ 37, 44, 23, 22, 7, 25, 8, 13, 1, 12,
+ 20, 24, 32, 14, 27, 5, 4, 5, 2, 15,
+ 58, 5, 2, 58, 52, 72, 60, 50, 63, 14,
+ 14, 19, 7, 29, 11, 54, 0, 4, 60, 98,
+ 124, 118, 76, 65, 79, 16, 80, 82, 115, 17,
+ 22, 65, 42, 11, 6, 60, 66, 108, 96, 78,
+ 81, 11, 27, 45, 41, 21, 30, 42, 26, 16,
+ 26, 24, 10, 32, 22, 19, 5, 14, 2, 2,
+ 21, 0, 13, 23, 1, 3, 4, 7, 13, 79,
+ 15, 19, 11, 33, 37, 3, 20, 0, 7, 1,
+ 5, 31, 9, 2, 69, 101, 53, 63, 55, 123,
+ 35, 25, 89, 4, 15, 15, 27, 33, 55, 43,
+ 33, 71, 59, 51, 57, 71, 13, 10, 68, 46,
+ 20, 8, 24, 10, 0, 3, 10, 26, 80, 50,
+ 38, 20, 56, 26, 28, 2, 48, 10, 68, 42,
+ 32, 20, 44, 16, 4, 0, 3, 11, 94, 52,
+ 20, 6, 32, 4, 15, 9, 26, 80, 64, 40,
+ 18, 56, 22, 16, 4, 0, 124, 21, 7, 6,
+ 2, 9, 2, 10, 16, 14, 24, 34, 54, 13,
+ 5, 23, 23, 110, 33, 67, 21, 0, 2, 17,
+ 15, 13, 19, 1, 23, 57, 11, 9, 40, 13,
+ 27, 9, 9, 5, 15, 13, 11, 47, 31, 15,
+ 21, 15, 40, 69, 23, 44, 35, 7, 11, 11,
+ 8, 3, 13, 4, 16, 17, 17, 75, 72, 68,
+ 70, 40, 12, 12, 10, 7, 6, 11, 17, 9,
+ 41, 39, 47, 51, 47, 67, 123, 45, 35, 35,
+ 35, 59, 49, 43, 47, 51, 31, 11, 19, 13,
+ 27, 63, 39, 37, 77, 51, 75, 75, 111, 99,
+ 95, 95, 19, 15, 91, 33, 33, 47, 77, 59,
+ 61, 69, 47, 59, 43, 43, 77, 89, 113, 37,
+ 63, 75, 13, 16, 26, 30, 62, 42, 54, 10,
+ 58, 50, 68, 58, 76, 66, 44, 88, 4, 11,
+ 27, 43, 43, 93, 95, 95, 28, 86, 76, 82,
+ 62, 66, 38, 36, 20, 16, 13, 16, 26, 30,
+ 62, 42, 54, 10, 58, 50, 68, 58, 76, 66,
+ 44, 88, 4, 11, 27, 43, 43, 93, 95, 95,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 39 */
+
+ 60, 10, 35, 60, 10, 35, 10, 32, 50, 24,
+ 9, 45, 6, 58, 74, 18, 60, 31, 23, 40,
+ 11, 15, 64, 55, 49, 10, 13, 125, 125, 125,
+ 28, 0, 11, 23, 40, 11, 29, 20, 68, 1,
+ 2, 5, 1, 79, 85, 59, 93, 3, 17, 21,
+ 23, 51, 37, 69, 2, 11, 13, 7, 62, 2,
+ 44, 0, 0, 0, 4, 63, 67, 18, 17, 11,
+ 24, 5, 29, 48, 28, 6, 80, 76, 8, 28,
+ 24, 44, 8, 10, 54, 85, 49, 49, 53, 46,
+ 13, 30, 54, 5, 27, 1, 20, 3, 21, 17,
+ 37, 44, 23, 24, 5, 25, 10, 11, 1, 12,
+ 22, 26, 32, 14, 29, 5, 4, 5, 2, 15,
+ 58, 5, 2, 60, 52, 72, 60, 50, 63, 14,
+ 14, 19, 7, 27, 11, 56, 2, 6, 64, 102,
+ 124, 120, 80, 65, 81, 18, 82, 82, 117, 17,
+ 24, 65, 44, 11, 6, 62, 66, 108, 96, 82,
+ 83, 7, 29, 51, 47, 19, 30, 42, 24, 16,
+ 26, 24, 8, 32, 22, 21, 7, 14, 2, 2,
+ 21, 0, 13, 25, 3, 3, 4, 9, 13, 81,
+ 17, 21, 11, 35, 37, 7, 16, 3, 9, 3,
+ 9, 37, 15, 0, 73, 105, 55, 67, 59, 125,
+ 37, 27, 91, 2, 17, 17, 31, 35, 59, 47,
+ 35, 73, 59, 53, 59, 75, 9, 14, 70, 46,
+ 20, 8, 26, 12, 2, 3, 14, 30, 82, 52,
+ 38, 22, 60, 28, 32, 4, 54, 12, 70, 44,
+ 34, 22, 46, 18, 4, 2, 1, 11, 96, 54,
+ 20, 6, 34, 4, 15, 7, 28, 80, 66, 40,
+ 18, 58, 24, 18, 4, 2, 124, 19, 5, 8,
+ 4, 7, 6, 12, 20, 18, 28, 38, 60, 11,
+ 3, 21, 21, 116, 31, 69, 19, 2, 4, 17,
+ 15, 13, 19, 0, 25, 59, 11, 9, 42, 13,
+ 27, 9, 9, 3, 17, 15, 11, 49, 33, 17,
+ 21, 15, 42, 71, 25, 46, 35, 9, 11, 11,
+ 8, 5, 15, 4, 16, 17, 17, 77, 70, 66,
+ 68, 38, 6, 8, 4, 13, 0, 17, 23, 15,
+ 47, 43, 51, 59, 53, 75, 125, 51, 39, 39,
+ 39, 65, 53, 45, 49, 53, 31, 15, 25, 19,
+ 31, 67, 43, 39, 81, 55, 79, 79, 115, 103,
+ 97, 99, 21, 17, 95, 35, 35, 49, 81, 61,
+ 63, 71, 49, 61, 43, 41, 81, 91, 115, 41,
+ 65, 77, 13, 18, 26, 32, 64, 44, 54, 12,
+ 60, 52, 70, 60, 78, 66, 44, 88, 0, 15,
+ 31, 47, 47, 99, 99, 97, 28, 88, 78, 82,
+ 64, 68, 40, 36, 20, 18, 13, 18, 26, 32,
+ 64, 44, 54, 12, 60, 52, 70, 60, 78, 66,
+ 44, 88, 0, 15, 31, 47, 47, 99, 99, 97,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 40 */
+
+ 56, 8, 35, 56, 8, 35, 12, 34, 50, 24,
+ 9, 49, 2, 56, 74, 18, 64, 35, 23, 42,
+ 11, 17, 66, 57, 53, 6, 17, 125, 125, 125,
+ 32, 2, 11, 23, 42, 11, 29, 20, 68, 3,
+ 4, 3, 0, 81, 89, 61, 95, 3, 17, 19,
+ 23, 53, 37, 69, 2, 11, 13, 7, 62, 2,
+ 44, 0, 0, 0, 6, 65, 67, 18, 19, 11,
+ 24, 3, 27, 52, 30, 6, 84, 80, 8, 30,
+ 28, 44, 10, 12, 58, 87, 49, 51, 53, 46,
+ 15, 30, 58, 5, 25, 0, 22, 3, 21, 17,
+ 37, 44, 23, 24, 3, 25, 10, 11, 1, 12,
+ 22, 28, 34, 14, 29, 7, 4, 5, 0, 15,
+ 58, 5, 2, 60, 52, 72, 60, 50, 65, 16,
+ 16, 21, 7, 27, 11, 58, 2, 6, 66, 104,
+ 124, 124, 84, 67, 83, 18, 82, 82, 119, 17,
+ 24, 67, 44, 11, 6, 64, 66, 108, 96, 84,
+ 85, 5, 31, 57, 55, 17, 28, 42, 24, 14,
+ 24, 22, 8, 32, 20, 23, 9, 12, 2, 0,
+ 23, 1, 15, 27, 3, 5, 2, 11, 15, 83,
+ 19, 23, 11, 39, 39, 11, 12, 7, 13, 7,
+ 15, 45, 19, 1, 77, 111, 59, 73, 63, 125,
+ 41, 29, 95, 0, 19, 19, 33, 39, 63, 51,
+ 39, 77, 59, 55, 61, 79, 7, 16, 72, 48,
+ 20, 8, 26, 14, 4, 1, 18, 32, 82, 52,
+ 40, 22, 62, 30, 34, 6, 58, 12, 70, 44,
+ 34, 22, 46, 18, 6, 2, 0, 11, 98, 56,
+ 20, 6, 36, 6, 15, 5, 30, 80, 68, 40,
+ 18, 60, 24, 20, 6, 2, 124, 17, 1, 12,
+ 8, 5, 8, 16, 22, 20, 30, 42, 64, 11,
+ 1, 21, 19, 122, 31, 71, 19, 4, 4, 17,
+ 15, 13, 21, 0, 25, 61, 11, 9, 42, 13,
+ 27, 11, 9, 3, 17, 15, 13, 51, 35, 17,
+ 23, 15, 44, 73, 25, 46, 37, 9, 11, 13,
+ 8, 5, 15, 4, 18, 17, 17, 81, 68, 64,
+ 66, 34, 2, 2, 1, 19, 3, 23, 31, 21,
+ 55, 49, 55, 67, 61, 83, 125, 57, 43, 43,
+ 45, 69, 57, 49, 53, 55, 31, 21, 29, 23,
+ 35, 73, 47, 43, 87, 59, 83, 83, 119, 109,
+ 101, 101, 25, 19, 101, 39, 39, 53, 85, 65,
+ 67, 73, 51, 63, 43, 39, 83, 95, 119, 43,
+ 67, 79, 13, 18, 28, 32, 66, 44, 56, 12,
+ 62, 52, 72, 60, 78, 68, 46, 86, 3, 19,
+ 35, 51, 51, 103, 103, 101, 30, 88, 78, 84,
+ 64, 68, 40, 38, 20, 18, 13, 18, 28, 32,
+ 66, 44, 56, 12, 62, 52, 72, 60, 78, 68,
+ 46, 86, 3, 19, 35, 51, 51, 103, 103, 101,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 41 */
+
+ 54, 8, 35, 54, 8, 35, 16, 36, 52, 24,
+ 9, 53, 1, 56, 76, 18, 68, 37, 21, 46,
+ 11, 17, 68, 57, 55, 4, 21, 125, 125, 125,
+ 36, 6, 9, 21, 46, 11, 27, 22, 70, 3,
+ 6, 1, 2, 83, 91, 61, 95, 3, 17, 17,
+ 23, 53, 37, 69, 2, 9, 11, 5, 62, 2,
+ 44, 0, 0, 0, 8, 65, 67, 18, 21, 11,
+ 24, 1, 23, 58, 32, 8, 88, 84, 10, 32,
+ 32, 46, 14, 14, 62, 89, 49, 51, 53, 48,
+ 15, 32, 62, 5, 23, 2, 26, 3, 19, 17,
+ 35, 46, 23, 26, 1, 23, 12, 11, 1, 14,
+ 24, 30, 36, 14, 29, 7, 6, 3, 0, 15,
+ 60, 5, 2, 60, 54, 74, 60, 52, 65, 18,
+ 18, 23, 7, 27, 11, 60, 2, 8, 68, 106,
+ 124, 124, 88, 69, 85, 20, 82, 84, 121, 17,
+ 24, 67, 46, 11, 6, 68, 66, 108, 96, 86,
+ 87, 3, 33, 61, 61, 13, 28, 42, 24, 12,
+ 24, 22, 8, 32, 20, 25, 11, 12, 2, 0,
+ 23, 1, 17, 27, 3, 5, 2, 13, 17, 85,
+ 19, 23, 11, 41, 39, 15, 8, 9, 15, 11,
+ 19, 51, 23, 3, 81, 115, 61, 77, 65, 125,
+ 43, 31, 99, 1, 21, 21, 35, 43, 65, 55,
+ 43, 79, 59, 55, 63, 81, 3, 20, 76, 50,
+ 20, 8, 28, 16, 6, 0, 22, 34, 84, 54,
+ 42, 24, 64, 32, 36, 8, 62, 14, 72, 46,
+ 36, 22, 48, 20, 8, 4, 4, 9, 102, 58,
+ 22, 6, 38, 8, 13, 3, 32, 82, 70, 42,
+ 20, 62, 26, 22, 8, 4, 124, 13, 2, 16,
+ 12, 3, 10, 20, 26, 24, 34, 46, 68, 9,
+ 0, 19, 17, 124, 31, 73, 19, 6, 6, 15,
+ 15, 13, 21, 0, 25, 63, 11, 9, 44, 13,
+ 27, 11, 9, 3, 17, 15, 13, 51, 37, 17,
+ 23, 13, 48, 73, 25, 48, 39, 9, 11, 13,
+ 10, 5, 15, 6, 20, 15, 15, 85, 66, 62,
+ 64, 30, 1, 3, 5, 25, 7, 27, 37, 25,
+ 61, 55, 59, 75, 67, 89, 125, 63, 47, 45,
+ 49, 73, 59, 51, 55, 57, 31, 25, 33, 27,
+ 39, 77, 51, 45, 91, 63, 87, 87, 123, 113,
+ 105, 103, 27, 21, 105, 41, 43, 55, 89, 67,
+ 69, 75, 53, 63, 43, 37, 85, 97, 123, 45,
+ 69, 81, 13, 20, 30, 34, 70, 46, 58, 12,
+ 64, 54, 74, 62, 80, 70, 48, 84, 5, 23,
+ 39, 55, 55, 107, 107, 103, 32, 88, 80, 86,
+ 66, 70, 42, 40, 22, 20, 13, 20, 30, 34,
+ 70, 46, 58, 12, 64, 54, 74, 62, 80, 70,
+ 48, 84, 5, 23, 39, 55, 55, 107, 107, 103,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 42 */
+
+ 52, 8, 35, 52, 8, 35, 20, 40, 52, 24,
+ 11, 57, 3, 54, 78, 18, 70, 41, 21, 48,
+ 13, 17, 70, 59, 57, 2, 25, 125, 125, 125,
+ 40, 8, 9, 21, 48, 13, 27, 24, 70, 3,
+ 6, 0, 6, 85, 93, 61, 95, 3, 15, 15,
+ 25, 53, 37, 69, 2, 9, 11, 3, 64, 2,
+ 44, 0, 0, 0, 8, 65, 67, 20, 23, 11,
+ 22, 1, 21, 62, 34, 10, 90, 86, 12, 34,
+ 36, 48, 16, 16, 68, 89, 51, 51, 55, 48,
+ 15, 34, 68, 3, 23, 2, 30, 3, 19, 17,
+ 35, 46, 23, 26, 0, 23, 14, 9, 1, 14,
+ 24, 32, 36, 14, 31, 7, 6, 3, 0, 15,
+ 60, 5, 2, 62, 54, 74, 60, 52, 67, 18,
+ 18, 23, 7, 25, 11, 62, 4, 8, 72, 110,
+ 124, 124, 92, 69, 87, 20, 84, 84, 123, 17,
+ 26, 69, 48, 11, 6, 70, 66, 108, 96, 90,
+ 89, 0, 35, 67, 67, 11, 28, 42, 22, 12,
+ 22, 22, 6, 32, 18, 27, 13, 12, 2, 0,
+ 23, 3, 17, 29, 5, 5, 2, 15, 17, 87,
+ 21, 25, 11, 43, 41, 19, 4, 13, 19, 13,
+ 23, 57, 29, 5, 85, 119, 65, 81, 69, 125,
+ 45, 33, 101, 3, 23, 23, 39, 45, 69, 59,
+ 45, 83, 59, 57, 65, 85, 1, 22, 78, 50,
+ 20, 8, 30, 18, 8, 0, 26, 38, 86, 56,
+ 42, 26, 68, 34, 40, 10, 68, 16, 72, 46,
+ 36, 24, 50, 22, 8, 6, 6, 9, 104, 60,
+ 22, 6, 40, 8, 13, 1, 34, 82, 72, 42,
+ 20, 64, 28, 24, 8, 6, 124, 11, 4, 18,
+ 14, 1, 14, 22, 30, 26, 38, 50, 74, 7,
+ 2, 17, 15, 124, 29, 75, 17, 8, 8, 15,
+ 15, 13, 21, 2, 27, 65, 11, 9, 46, 13,
+ 27, 11, 9, 1, 19, 17, 13, 53, 39, 19,
+ 23, 13, 50, 75, 27, 50, 39, 11, 11, 13,
+ 10, 7, 17, 6, 20, 15, 15, 87, 64, 60,
+ 62, 28, 7, 7, 11, 31, 13, 33, 43, 31,
+ 67, 59, 63, 83, 75, 97, 125, 69, 51, 49,
+ 53, 79, 63, 53, 57, 59, 31, 29, 39, 33,
+ 43, 81, 55, 49, 95, 67, 91, 91, 125, 117,
+ 107, 107, 29, 23, 109, 45, 45, 59, 93, 69,
+ 71, 77, 55, 65, 43, 35, 89, 99, 125, 49,
+ 71, 83, 13, 20, 30, 36, 72, 48, 58, 14,
+ 66, 56, 76, 62, 82, 70, 48, 84, 9, 27,
+ 43, 59, 59, 113, 111, 105, 32, 90, 80, 86,
+ 68, 72, 42, 40, 22, 22, 13, 20, 30, 36,
+ 72, 48, 58, 14, 66, 56, 76, 62, 82, 70,
+ 48, 84, 9, 27, 43, 59, 59, 113, 111, 105,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 43 */
+
+ 50, 8, 37, 50, 8, 37, 24, 42, 54, 24,
+ 11, 59, 7, 52, 78, 20, 74, 43, 21, 52,
+ 13, 19, 72, 61, 59, 0, 27, 125, 125, 125,
+ 44, 12, 9, 21, 52, 13, 25, 26, 72, 3,
+ 8, 2, 8, 89, 95, 61, 97, 3, 15, 15,
+ 25, 55, 35, 69, 0, 9, 11, 1, 64, 2,
+ 44, 0, 0, 0, 10, 65, 67, 20, 23, 13,
+ 22, 0, 19, 66, 36, 12, 94, 90, 14, 36,
+ 40, 50, 18, 16, 72, 91, 51, 51, 55, 50,
+ 15, 36, 72, 3, 21, 4, 34, 3, 19, 17,
+ 35, 48, 25, 28, 0, 23, 16, 9, 0, 14,
+ 26, 34, 38, 14, 31, 7, 6, 3, 0, 15,
+ 60, 5, 2, 62, 54, 76, 62, 54, 67, 20,
+ 20, 25, 7, 25, 11, 64, 4, 10, 74, 112,
+ 124, 124, 94, 71, 89, 22, 84, 84, 123, 19,
+ 26, 69, 48, 13, 8, 72, 66, 108, 98, 92,
+ 91, 2, 37, 73, 73, 7, 28, 42, 22, 10,
+ 22, 20, 6, 30, 18, 27, 13, 12, 0, 0,
+ 25, 3, 19, 31, 5, 5, 2, 15, 19, 91,
+ 21, 27, 11, 45, 41, 25, 0, 17, 21, 17,
+ 27, 63, 33, 9, 89, 123, 67, 85, 73, 125,
+ 47, 37, 105, 7, 25, 25, 41, 49, 73, 63,
+ 49, 85, 59, 59, 67, 87, 2, 26, 82, 52,
+ 20, 8, 32, 18, 8, 2, 28, 40, 88, 58,
+ 44, 26, 70, 36, 42, 12, 72, 16, 74, 48,
+ 38, 24, 52, 22, 10, 8, 8, 9, 108, 62,
+ 22, 6, 40, 10, 13, 1, 34, 84, 72, 44,
+ 20, 66, 28, 24, 10, 6, 124, 7, 8, 22,
+ 18, 2, 16, 26, 32, 30, 42, 54, 78, 7,
+ 4, 17, 15, 124, 29, 77, 17, 8, 10, 15,
+ 15, 13, 21, 2, 27, 67, 11, 9, 46, 13,
+ 27, 11, 9, 1, 19, 17, 13, 55, 41, 19,
+ 23, 13, 52, 77, 27, 52, 41, 11, 11, 13,
+ 12, 7, 17, 8, 22, 15, 15, 91, 62, 58,
+ 60, 24, 11, 13, 15, 37, 17, 39, 49, 37,
+ 73, 65, 67, 89, 81, 105, 125, 73, 57, 53,
+ 57, 83, 67, 55, 59, 61, 31, 33, 43, 37,
+ 47, 85, 57, 51, 99, 71, 97, 95, 125, 121,
+ 111, 109, 31, 27, 113, 47, 49, 61, 97, 73,
+ 75, 79, 57, 67, 45, 33, 91, 103, 125, 51,
+ 75, 85, 11, 22, 32, 36, 74, 50, 60, 14,
+ 68, 58, 78, 64, 84, 72, 50, 82, 13, 31,
+ 47, 63, 63, 117, 115, 107, 34, 90, 82, 88,
+ 68, 74, 44, 42, 24, 22, 11, 22, 32, 36,
+ 74, 50, 60, 14, 68, 58, 78, 64, 84, 72,
+ 50, 82, 13, 31, 47, 63, 63, 117, 115, 107,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 44 */
+
+ 46, 8, 37, 46, 8, 37, 26, 46, 54, 24,
+ 13, 63, 9, 50, 80, 20, 78, 47, 21, 54,
+ 13, 19, 74, 63, 63, 3, 31, 125, 125, 125,
+ 48, 14, 9, 21, 54, 13, 25, 28, 72, 5,
+ 10, 4, 10, 91, 99, 63, 97, 3, 15, 13,
+ 25, 55, 35, 69, 0, 9, 11, 1, 66, 2,
+ 44, 0, 0, 0, 10, 67, 67, 22, 25, 13,
+ 20, 0, 17, 70, 38, 14, 96, 92, 14, 38,
+ 44, 52, 20, 18, 76, 93, 51, 51, 55, 50,
+ 17, 38, 76, 3, 19, 6, 38, 3, 19, 17,
+ 35, 48, 25, 28, 2, 23, 16, 9, 0, 14,
+ 26, 36, 38, 14, 31, 9, 6, 3, 0, 15,
+ 60, 5, 2, 64, 54, 76, 62, 54, 69, 20,
+ 22, 25, 7, 25, 11, 66, 4, 10, 76, 114,
+ 124, 124, 98, 71, 91, 22, 84, 84, 125, 19,
+ 28, 71, 50, 13, 8, 74, 66, 108, 98, 94,
+ 93, 6, 39, 79, 79, 5, 26, 42, 20, 8,
+ 20, 20, 4, 30, 16, 29, 15, 10, 0, 0,
+ 25, 5, 21, 33, 7, 7, 0, 17, 19, 93,
+ 23, 29, 11, 49, 43, 29, 3, 21, 25, 21,
+ 31, 71, 39, 11, 93, 125, 71, 89, 77, 125,
+ 51, 39, 109, 9, 27, 27, 43, 51, 77, 67,
+ 53, 89, 59, 61, 69, 91, 4, 28, 84, 52,
+ 20, 8, 34, 20, 10, 4, 32, 42, 90, 58,
+ 46, 28, 72, 38, 46, 14, 76, 18, 74, 48,
+ 38, 26, 54, 24, 10, 8, 10, 9, 110, 64,
+ 22, 6, 42, 10, 13, 0, 36, 84, 74, 44,
+ 20, 68, 30, 26, 10, 8, 124, 5, 10, 24,
+ 22, 4, 20, 28, 36, 32, 44, 58, 82, 5,
+ 6, 15, 13, 124, 29, 79, 17, 10, 10, 15,
+ 15, 13, 23, 4, 27, 69, 11, 9, 48, 13,
+ 27, 11, 9, 1, 21, 17, 13, 57, 43, 19,
+ 23, 13, 54, 79, 27, 52, 43, 11, 11, 13,
+ 12, 7, 19, 8, 24, 15, 15, 93, 60, 56,
+ 58, 20, 15, 17, 21, 43, 23, 45, 57, 43,
+ 81, 71, 71, 97, 89, 113, 125, 79, 61, 57,
+ 61, 89, 71, 59, 61, 63, 31, 39, 47, 41,
+ 51, 89, 61, 55, 103, 75, 101, 99, 125, 125,
+ 115, 111, 33, 29, 119, 51, 53, 65, 101, 75,
+ 77, 81, 59, 69, 45, 31, 93, 105, 125, 53,
+ 77, 87, 11, 22, 32, 38, 76, 50, 62, 14,
+ 70, 60, 80, 64, 86, 72, 52, 82, 17, 35,
+ 51, 67, 67, 121, 119, 111, 34, 92, 82, 90,
+ 70, 74, 44, 42, 24, 24, 11, 22, 32, 38,
+ 76, 50, 62, 14, 70, 60, 80, 64, 86, 72,
+ 52, 82, 17, 35, 51, 67, 67, 121, 119, 111,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 45 */
+
+ 44, 8, 37, 44, 8, 37, 30, 48, 54, 24,
+ 13, 67, 13, 48, 82, 20, 80, 51, 19, 58,
+ 15, 19, 76, 65, 65, 5, 35, 125, 125, 125,
+ 52, 16, 9, 19, 58, 15, 25, 30, 72, 5,
+ 10, 6, 14, 93, 101, 63, 97, 3, 13, 11,
+ 27, 55, 35, 69, 0, 7, 11, 0, 66, 2,
+ 44, 0, 0, 0, 12, 67, 67, 22, 27, 13,
+ 20, 2, 13, 76, 40, 16, 100, 96, 16, 40,
+ 48, 54, 22, 20, 82, 93, 53, 51, 57, 50,
+ 17, 40, 82, 1, 19, 6, 42, 3, 19, 17,
+ 33, 48, 25, 30, 4, 21, 18, 7, 0, 16,
+ 28, 38, 40, 14, 33, 9, 8, 3, 0, 15,
+ 62, 5, 2, 64, 56, 76, 62, 54, 69, 22,
+ 22, 27, 7, 23, 11, 68, 6, 12, 80, 118,
+ 124, 124, 102, 73, 93, 24, 86, 84, 125, 19,
+ 28, 71, 52, 13, 8, 78, 66, 108, 98, 98,
+ 95, 8, 41, 83, 85, 3, 26, 42, 20, 8,
+ 20, 20, 4, 30, 16, 31, 17, 10, 0, 0,
+ 25, 5, 21, 35, 7, 7, 0, 19, 21, 95,
+ 25, 29, 11, 51, 43, 33, 7, 23, 27, 23,
+ 35, 77, 43, 13, 97, 125, 73, 93, 81, 125,
+ 53, 41, 111, 11, 29, 29, 47, 55, 81, 71,
+ 55, 91, 59, 63, 71, 95, 8, 32, 86, 54,
+ 20, 8, 36, 22, 12, 4, 36, 46, 92, 60,
+ 46, 30, 76, 40, 48, 16, 82, 20, 76, 50,
+ 40, 26, 56, 26, 12, 10, 12, 7, 112, 66,
+ 22, 6, 44, 12, 11, 2, 38, 84, 76, 44,
+ 20, 70, 32, 28, 12, 10, 124, 3, 14, 28,
+ 24, 6, 22, 32, 40, 36, 48, 62, 88, 3,
+ 8, 13, 11, 124, 27, 81, 15, 12, 12, 15,
+ 15, 13, 23, 4, 29, 71, 11, 9, 50, 13,
+ 27, 11, 9, 0, 21, 19, 13, 59, 45, 21,
+ 23, 13, 56, 79, 29, 54, 43, 13, 11, 13,
+ 12, 9, 19, 8, 24, 15, 15, 97, 58, 54,
+ 56, 18, 21, 23, 27, 49, 27, 51, 63, 49,
+ 87, 75, 75, 105, 95, 121, 125, 85, 65, 59,
+ 65, 93, 75, 61, 63, 65, 31, 43, 53, 47,
+ 55, 93, 65, 57, 107, 79, 105, 103, 125, 125,
+ 117, 115, 35, 31, 123, 53, 55, 67, 105, 77,
+ 79, 83, 61, 71, 45, 29, 97, 107, 125, 57,
+ 79, 89, 11, 24, 34, 40, 78, 52, 62, 16,
+ 72, 62, 82, 66, 88, 74, 52, 80, 21, 39,
+ 55, 71, 71, 125, 123, 113, 36, 92, 84, 90,
+ 72, 76, 46, 44, 24, 26, 11, 24, 34, 40,
+ 78, 52, 62, 16, 72, 62, 82, 66, 88, 74,
+ 52, 80, 21, 39, 55, 71, 71, 125, 123, 113,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 46 */
+
+ 42, 8, 37, 42, 8, 37, 34, 52, 56, 24,
+ 15, 71, 15, 46, 82, 20, 84, 53, 19, 60,
+ 15, 21, 78, 67, 67, 7, 39, 125, 125, 125,
+ 56, 20, 9, 19, 60, 15, 23, 32, 74, 5,
+ 12, 8, 16, 95, 103, 63, 99, 3, 13, 9,
+ 27, 57, 35, 69, 0, 7, 11, 2, 68, 2,
+ 44, 0, 0, 0, 12, 67, 67, 24, 29, 13,
+ 18, 2, 11, 80, 42, 18, 102, 98, 18, 42,
+ 52, 56, 24, 22, 86, 95, 53, 51, 57, 52,
+ 17, 42, 86, 1, 17, 8, 46, 3, 19, 17,
+ 33, 50, 25, 30, 6, 21, 20, 7, 0, 16,
+ 28, 40, 40, 14, 33, 9, 8, 3, 0, 15,
+ 62, 5, 2, 66, 56, 78, 62, 56, 71, 22,
+ 24, 27, 7, 23, 11, 70, 6, 12, 82, 120,
+ 124, 124, 106, 73, 95, 24, 86, 84, 125, 19,
+ 30, 73, 52, 13, 8, 80, 66, 108, 98, 100,
+ 97, 12, 43, 89, 91, 0, 26, 42, 18, 6,
+ 18, 18, 2, 30, 14, 33, 19, 10, 0, 0,
+ 27, 7, 23, 37, 9, 7, 0, 21, 21, 97,
+ 25, 31, 11, 53, 45, 37, 11, 27, 31, 27,
+ 39, 83, 49, 15, 101, 125, 77, 97, 85, 125,
+ 55, 43, 115, 13, 31, 31, 49, 57, 85, 75,
+ 59, 95, 59, 65, 73, 97, 10, 34, 90, 54,
+ 20, 8, 38, 24, 14, 6, 40, 48, 94, 62,
+ 48, 30, 78, 42, 52, 18, 86, 20, 76, 50,
+ 40, 28, 58, 26, 12, 12, 14, 7, 116, 68,
+ 22, 6, 46, 12, 11, 4, 40, 86, 78, 46,
+ 20, 72, 32, 30, 12, 10, 124, 0, 16, 30,
+ 28, 8, 26, 34, 42, 38, 52, 66, 92, 3,
+ 10, 13, 9, 124, 27, 83, 15, 14, 14, 15,
+ 15, 13, 23, 6, 29, 73, 11, 9, 50, 13,
+ 27, 11, 9, 0, 23, 19, 13, 61, 47, 21,
+ 23, 13, 58, 81, 29, 56, 45, 13, 11, 13,
+ 14, 9, 21, 10, 26, 15, 15, 99, 56, 52,
+ 54, 14, 25, 27, 31, 55, 33, 57, 69, 55,
+ 93, 81, 79, 113, 103, 125, 125, 91, 69, 63,
+ 69, 99, 79, 63, 65, 67, 31, 47, 57, 51,
+ 59, 97, 69, 61, 111, 83, 109, 107, 125, 125,
+ 121, 117, 37, 33, 125, 57, 59, 71, 109, 81,
+ 83, 85, 63, 73, 45, 27, 99, 111, 125, 59,
+ 81, 91, 11, 24, 34, 40, 80, 54, 64, 16,
+ 74, 64, 84, 66, 90, 74, 54, 80, 25, 43,
+ 59, 75, 75, 125, 125, 115, 36, 94, 84, 92,
+ 72, 78, 46, 44, 26, 26, 11, 24, 34, 40,
+ 80, 54, 64, 16, 74, 64, 84, 66, 90, 74,
+ 54, 80, 25, 43, 59, 75, 75, 125, 125, 115,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 47 */
+
+ 40, 8, 37, 40, 8, 37, 38, 54, 56, 24,
+ 15, 75, 19, 44, 84, 20, 88, 57, 19, 64,
+ 15, 21, 80, 69, 69, 9, 43, 125, 125, 125,
+ 60, 22, 9, 19, 64, 15, 23, 34, 74, 5,
+ 14, 10, 18, 97, 105, 63, 99, 3, 13, 7,
+ 27, 57, 35, 69, 0, 7, 11, 4, 68, 2,
+ 44, 0, 0, 0, 14, 67, 67, 24, 31, 13,
+ 18, 4, 9, 84, 44, 20, 106, 102, 20, 44,
+ 56, 58, 26, 24, 90, 97, 53, 51, 57, 52,
+ 17, 44, 90, 1, 15, 10, 50, 3, 19, 17,
+ 33, 50, 25, 32, 8, 21, 22, 7, 0, 16,
+ 30, 42, 42, 14, 33, 9, 8, 3, 0, 15,
+ 62, 5, 2, 66, 56, 78, 62, 56, 71, 24,
+ 26, 29, 7, 23, 11, 72, 6, 14, 84, 122,
+ 124, 124, 110, 75, 97, 26, 86, 84, 125, 19,
+ 30, 73, 54, 13, 8, 82, 66, 108, 98, 102,
+ 99, 14, 45, 95, 97, 2, 26, 42, 18, 4,
+ 18, 18, 2, 30, 14, 35, 21, 10, 0, 0,
+ 27, 7, 25, 39, 9, 7, 0, 23, 23, 99,
+ 27, 33, 11, 55, 45, 41, 15, 31, 33, 31,
+ 43, 89, 53, 17, 105, 125, 79, 101, 89, 125,
+ 57, 45, 119, 15, 33, 33, 51, 61, 89, 79,
+ 63, 97, 59, 67, 75, 101, 14, 38, 92, 56,
+ 20, 8, 40, 26, 16, 8, 44, 50, 96, 64,
+ 50, 32, 80, 44, 54, 20, 90, 22, 78, 52,
+ 42, 28, 60, 28, 14, 14, 16, 7, 118, 70,
+ 22, 6, 48, 14, 11, 6, 42, 86, 80, 46,
+ 20, 74, 34, 32, 14, 12, 124, 2, 20, 34,
+ 32, 10, 28, 38, 46, 42, 56, 70, 96, 1,
+ 12, 11, 7, 124, 27, 85, 15, 16, 16, 15,
+ 15, 13, 23, 6, 29, 75, 11, 9, 52, 13,
+ 27, 11, 9, 0, 23, 19, 13, 63, 49, 21,
+ 23, 13, 60, 83, 29, 58, 47, 13, 11, 13,
+ 14, 9, 21, 10, 28, 15, 15, 103, 54, 50,
+ 52, 10, 29, 33, 37, 61, 37, 63, 75, 61,
+ 99, 87, 83, 121, 109, 125, 125, 97, 73, 67,
+ 73, 103, 83, 65, 67, 69, 31, 51, 61, 55,
+ 63, 101, 73, 63, 115, 87, 113, 111, 125, 125,
+ 125, 119, 39, 35, 125, 59, 63, 73, 113, 83,
+ 85, 87, 65, 75, 45, 25, 101, 113, 125, 61,
+ 83, 93, 11, 26, 36, 42, 82, 56, 66, 16,
+ 76, 66, 86, 68, 92, 76, 56, 78, 29, 47,
+ 63, 79, 79, 125, 125, 117, 38, 94, 86, 94,
+ 74, 80, 48, 46, 26, 28, 11, 26, 36, 42,
+ 82, 56, 66, 16, 76, 66, 86, 68, 92, 76,
+ 56, 78, 29, 47, 63, 79, 79, 125, 125, 117,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 48 */
+
+ 36, 6, 39, 36, 6, 39, 40, 56, 56, 24,
+ 17, 79, 23, 42, 84, 20, 90, 61, 19, 66,
+ 17, 23, 82, 71, 73, 13, 47, 125, 125, 125,
+ 64, 24, 9, 19, 66, 17, 23, 34, 74, 7,
+ 14, 10, 20, 101, 109, 65, 101, 5, 13, 7,
+ 29, 59, 35, 69, 1, 7, 11, 4, 68, 0,
+ 44, 0, 0, 0, 14, 69, 67, 24, 33, 15,
+ 16, 4, 7, 88, 46, 20, 108, 104, 20, 46,
+ 58, 58, 28, 24, 94, 99, 55, 53, 59, 52,
+ 19, 44, 94, 1, 15, 10, 52, 3, 19, 17,
+ 33, 50, 27, 32, 8, 21, 22, 7, 0, 16,
+ 30, 44, 42, 14, 35, 11, 8, 3, 1, 15,
+ 62, 5, 2, 66, 56, 78, 62, 56, 73, 24,
+ 26, 31, 7, 23, 13, 72, 6, 14, 86, 124,
+ 124, 124, 112, 77, 101, 26, 86, 84, 125, 21,
+ 30, 75, 54, 15, 8, 84, 66, 106, 98, 104,
+ 101, 16, 49, 101, 105, 4, 24, 42, 16, 2,
+ 16, 16, 0, 28, 12, 37, 23, 8, 1, 1,
+ 29, 9, 27, 41, 11, 9, 1, 25, 25, 103,
+ 29, 35, 11, 59, 47, 47, 21, 35, 37, 35,
+ 49, 97, 59, 21, 109, 125, 83, 107, 93, 125,
+ 61, 49, 123, 19, 35, 35, 55, 65, 93, 83,
+ 67, 101, 59, 69, 77, 105, 16, 40, 94, 56,
+ 20, 8, 40, 26, 16, 8, 46, 52, 96, 64,
+ 50, 32, 82, 46, 56, 20, 94, 22, 78, 52,
+ 42, 28, 60, 28, 14, 14, 18, 7, 120, 72,
+ 22, 4, 48, 14, 11, 6, 42, 86, 80, 46,
+ 20, 76, 34, 32, 14, 12, 124, 4, 22, 36,
+ 34, 12, 30, 40, 48, 44, 58, 72, 100, 1,
+ 12, 11, 7, 124, 27, 89, 15, 16, 16, 15,
+ 15, 13, 25, 6, 31, 77, 13, 9, 52, 15,
+ 27, 13, 9, 0, 25, 21, 15, 65, 51, 23,
+ 25, 13, 62, 85, 31, 58, 49, 15, 11, 15,
+ 14, 11, 23, 10, 28, 15, 15, 107, 50, 48,
+ 48, 6, 35, 39, 43, 67, 43, 69, 83, 67,
+ 107, 93, 87, 125, 117, 125, 125, 103, 79, 71,
+ 79, 109, 87, 69, 71, 71, 31, 57, 67, 61,
+ 67, 107, 77, 67, 121, 91, 119, 115, 125, 125,
+ 125, 123, 43, 39, 125, 63, 67, 77, 117, 87,
+ 89, 89, 67, 77, 47, 23, 105, 117, 125, 65,
+ 87, 97, 11, 26, 36, 42, 84, 56, 66, 16,
+ 78, 66, 88, 68, 92, 76, 56, 76, 33, 51,
+ 69, 85, 83, 125, 125, 121, 38, 94, 86, 94,
+ 74, 80, 48, 46, 26, 28, 11, 26, 36, 42,
+ 84, 56, 66, 16, 78, 66, 88, 68, 92, 76,
+ 56, 76, 33, 51, 69, 85, 83, 125, 125, 121,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 49 */
+
+ 34, 6, 39, 34, 6, 39, 44, 60, 58, 26,
+ 17, 81, 25, 42, 86, 22, 94, 63, 17, 70,
+ 17, 23, 86, 71, 75, 15, 49, 125, 125, 125,
+ 68, 28, 7, 17, 70, 17, 21, 36, 76, 7,
+ 16, 12, 24, 103, 111, 65, 101, 5, 11, 5,
+ 29, 59, 33, 67, 1, 5, 9, 6, 70, 0,
+ 44, 0, 0, 0, 16, 69, 67, 26, 33, 15,
+ 16, 6, 3, 94, 50, 22, 112, 108, 22, 50,
+ 62, 60, 32, 26, 100, 99, 55, 53, 59, 54,
+ 19, 46, 100, 0, 13, 12, 56, 3, 17, 17,
+ 31, 52, 27, 34, 10, 19, 24, 5, 2, 18,
+ 32, 46, 44, 16, 35, 11, 10, 1, 1, 13,
+ 64, 3, 2, 68, 58, 80, 64, 58, 73, 26,
+ 28, 31, 7, 21, 13, 74, 8, 16, 90, 124,
+ 124, 124, 116, 77, 103, 28, 88, 86, 125, 21,
+ 32, 75, 56, 15, 10, 88, 66, 106, 100, 108,
+ 103, 20, 51, 105, 111, 8, 24, 42, 16, 2,
+ 16, 16, 0, 28, 12, 37, 23, 8, 1, 1,
+ 29, 9, 27, 41, 11, 9, 1, 25, 25, 105,
+ 29, 35, 11, 61, 47, 51, 25, 37, 39, 37,
+ 53, 103, 63, 23, 111, 125, 85, 111, 95, 125,
+ 63, 51, 125, 21, 35, 35, 57, 67, 95, 85,
+ 69, 103, 59, 69, 77, 107, 20, 44, 98, 58,
+ 20, 8, 42, 28, 18, 10, 50, 56, 98, 66,
+ 52, 34, 86, 48, 60, 22, 100, 24, 80, 54,
+ 44, 30, 62, 30, 16, 16, 22, 5, 124, 74,
+ 24, 4, 50, 16, 9, 8, 44, 88, 82, 48,
+ 22, 78, 36, 34, 16, 14, 124, 8, 26, 40,
+ 38, 16, 34, 44, 52, 48, 62, 76, 106, 0,
+ 14, 9, 5, 124, 25, 91, 13, 18, 18, 13,
+ 15, 11, 25, 8, 31, 77, 13, 9, 54, 15,
+ 27, 13, 9, 2, 25, 21, 15, 65, 51, 23,
+ 25, 11, 66, 85, 31, 60, 49, 15, 11, 15,
+ 16, 11, 23, 12, 30, 13, 13, 109, 48, 46,
+ 46, 4, 39, 43, 47, 73, 47, 73, 89, 71,
+ 113, 97, 91, 125, 123, 125, 125, 107, 83, 73,
+ 83, 113, 89, 71, 73, 71, 31, 61, 71, 65,
+ 71, 111, 79, 69, 125, 93, 123, 117, 125, 125,
+ 125, 125, 45, 41, 125, 65, 69, 79, 119, 89,
+ 91, 91, 67, 77, 47, 21, 107, 119, 125, 67,
+ 89, 99, 9, 28, 38, 44, 88, 58, 68, 18,
+ 82, 68, 90, 70, 94, 78, 58, 76, 35, 53,
+ 73, 89, 85, 125, 125, 123, 40, 96, 88, 96,
+ 76, 82, 50, 48, 28, 30, 9, 28, 38, 44,
+ 88, 58, 68, 18, 82, 68, 90, 70, 94, 78,
+ 58, 76, 35, 53, 73, 89, 85, 125, 125, 123,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 50 */
+
+ 32, 6, 39, 32, 6, 39, 48, 62, 58, 26,
+ 17, 85, 29, 40, 88, 22, 98, 67, 17, 72,
+ 17, 23, 88, 73, 77, 17, 53, 125, 125, 125,
+ 72, 30, 7, 17, 72, 17, 21, 38, 76, 7,
+ 18, 14, 26, 105, 113, 65, 101, 5, 11, 3,
+ 29, 59, 33, 67, 1, 5, 9, 8, 70, 0,
+ 44, 0, 0, 0, 18, 69, 67, 26, 35, 15,
+ 16, 8, 1, 98, 52, 24, 116, 112, 24, 52,
+ 66, 62, 34, 28, 104, 101, 55, 53, 59, 54,
+ 19, 48, 104, 0, 11, 14, 60, 3, 17, 17,
+ 31, 52, 27, 34, 12, 19, 26, 5, 2, 18,
+ 32, 48, 46, 16, 35, 11, 10, 1, 1, 13,
+ 64, 3, 2, 68, 58, 80, 64, 58, 75, 28,
+ 30, 33, 7, 21, 13, 76, 8, 16, 92, 124,
+ 124, 124, 120, 79, 105, 28, 88, 86, 125, 21,
+ 32, 77, 58, 15, 10, 90, 66, 106, 100, 110,
+ 105, 22, 53, 111, 117, 10, 24, 42, 16, 0,
+ 14, 16, 0, 28, 10, 39, 25, 8, 1, 1,
+ 29, 11, 29, 43, 11, 9, 1, 27, 27, 107,
+ 31, 37, 11, 63, 49, 55, 29, 41, 43, 41,
+ 57, 109, 67, 25, 115, 125, 89, 115, 99, 125,
+ 65, 53, 125, 23, 37, 37, 59, 71, 99, 89,
+ 73, 107, 59, 71, 79, 111, 22, 46, 100, 60,
+ 20, 8, 44, 30, 20, 12, 54, 58, 100, 68,
+ 54, 36, 88, 50, 62, 24, 104, 26, 80, 54,
+ 44, 30, 64, 32, 18, 18, 24, 5, 124, 76,
+ 24, 4, 52, 18, 9, 10, 46, 88, 84, 48,
+ 22, 80, 38, 36, 18, 16, 124, 10, 30, 44,
+ 42, 18, 36, 48, 56, 50, 66, 80, 110, 2,
+ 16, 7, 3, 124, 25, 93, 13, 20, 20, 13,
+ 15, 11, 25, 8, 31, 79, 13, 9, 56, 15,
+ 27, 13, 9, 2, 25, 21, 15, 67, 53, 23,
+ 25, 11, 68, 87, 31, 62, 51, 15, 11, 15,
+ 16, 11, 23, 12, 32, 13, 13, 113, 46, 44,
+ 44, 0, 43, 49, 53, 79, 51, 79, 95, 77,
+ 119, 103, 95, 125, 125, 125, 125, 113, 87, 77,
+ 87, 117, 93, 73, 75, 73, 31, 65, 75, 69,
+ 75, 115, 83, 73, 125, 97, 125, 121, 125, 125,
+ 125, 125, 47, 43, 125, 69, 73, 83, 123, 91,
+ 93, 93, 69, 79, 47, 19, 109, 121, 125, 69,
+ 91, 101, 9, 28, 40, 46, 90, 60, 70, 18,
+ 84, 70, 92, 70, 96, 80, 60, 74, 39, 57,
+ 77, 93, 89, 125, 125, 125, 42, 96, 88, 98,
+ 78, 84, 50, 50, 28, 32, 9, 28, 40, 46,
+ 90, 60, 70, 18, 84, 70, 92, 70, 96, 80,
+ 60, 74, 39, 57, 77, 93, 89, 125, 125, 125,
+ },
+
+ {
+ /* Context Tables for P, SP, B Slices :: cabac_init_idc = 2, qp = 51 */
+
+ 30, 6, 39, 30, 6, 39, 52, 66, 60, 26,
+ 19, 89, 31, 38, 88, 22, 102, 69, 17, 76,
+ 17, 25, 90, 75, 79, 19, 57, 125, 125, 125,
+ 76, 34, 7, 17, 76, 17, 19, 40, 78, 7,
+ 20, 16, 28, 107, 115, 65, 103, 5, 11, 1,
+ 29, 61, 33, 67, 1, 5, 9, 10, 72, 0,
+ 44, 0, 0, 0, 18, 69, 67, 28, 37, 15,
+ 14, 8, 0, 102, 54, 26, 118, 114, 26, 54,
+ 70, 64, 36, 30, 108, 103, 55, 53, 59, 56,
+ 19, 50, 108, 0, 9, 16, 64, 3, 17, 17,
+ 31, 54, 27, 36, 14, 19, 28, 5, 2, 18,
+ 34, 50, 46, 16, 35, 11, 10, 1, 1, 13,
+ 64, 3, 2, 70, 58, 82, 64, 60, 75, 28,
+ 32, 33, 7, 21, 13, 78, 8, 18, 94, 124,
+ 124, 124, 124, 79, 107, 30, 88, 86, 125, 21,
+ 34, 77, 58, 15, 10, 92, 66, 106, 100, 112,
+ 107, 26, 55, 117, 123, 14, 24, 42, 14, 1,
+ 14, 14, 1, 28, 10, 41, 27, 8, 1, 1,
+ 31, 11, 31, 45, 13, 9, 1, 29, 27, 109,
+ 31, 39, 11, 65, 49, 59, 33, 45, 45, 45,
+ 61, 115, 73, 27, 119, 125, 91, 119, 103, 125,
+ 67, 55, 125, 25, 39, 39, 61, 73, 103, 93,
+ 77, 109, 59, 73, 81, 113, 26, 50, 104, 60,
+ 20, 8, 46, 32, 22, 14, 58, 60, 102, 70,
+ 56, 36, 90, 52, 66, 26, 108, 26, 82, 56,
+ 46, 32, 66, 32, 18, 20, 26, 5, 124, 78,
+ 24, 4, 54, 18, 9, 12, 48, 90, 86, 50,
+ 22, 82, 38, 38, 18, 16, 124, 14, 32, 46,
+ 46, 20, 40, 50, 58, 54, 70, 84, 114, 2,
+ 18, 7, 1, 124, 25, 95, 13, 22, 22, 13,
+ 15, 11, 25, 10, 31, 81, 13, 9, 56, 15,
+ 27, 13, 9, 2, 27, 21, 15, 69, 55, 23,
+ 25, 11, 70, 89, 31, 64, 53, 15, 11, 15,
+ 18, 11, 25, 14, 34, 13, 13, 115, 44, 42,
+ 42, 3, 47, 53, 57, 85, 57, 85, 101, 83,
+ 125, 109, 99, 125, 125, 125, 125, 119, 91, 81,
+ 91, 123, 97, 75, 77, 75, 31, 69, 79, 73,
+ 79, 119, 87, 75, 125, 101, 125, 125, 125, 125,
+ 125, 125, 49, 45, 125, 71, 77, 85, 125, 95,
+ 97, 95, 71, 81, 47, 17, 111, 125, 125, 71,
+ 93, 103, 9, 30, 40, 46, 92, 62, 72, 18,
+ 86, 72, 94, 72, 98, 80, 62, 74, 43, 61,
+ 81, 97, 93, 125, 125, 125, 42, 98, 90, 100,
+ 78, 86, 52, 50, 30, 32, 9, 30, 40, 46,
+ 92, 62, 72, 18, 86, 72, 94, 72, 98, 80,
+ 62, 74, 43, 61, 81, 97, 93, 125, 125, 125,
+ },
+
+ },
+
+ {
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 0 */
+
+ 124, 18, 21, 124, 18, 21, 125, 81, 20, 18,
+ 24, 60, 122, 124, 108, 28, 109, 12, 29, 3,
+ 2, 28, 19, 26, 1, 40, 124, 7, 53, 81,
+ 125, 81, 7, 29, 3, 2, 45, 63, 4, 36,
+ 11, 35, 65, 16, 7, 45, 49, 10, 25, 61,
+ 18, 11, 35, 49, 7, 21, 21, 33, 17, 10,
+ 44, 0, 0, 0, 39, 45, 67, 17, 44, 2,
+ 104, 16, 11, 125, 77, 37, 21, 87, 125, 125,
+ 125, 63, 125, 101, 125, 119, 103, 117, 103, 0,
+ 9, 41, 81, 13, 59, 53, 125, 21, 67, 55,
+ 125, 14, 37, 25, 123, 59, 47, 27, 15, 0,
+ 9, 41, 2, 3, 4, 14, 5, 1, 4, 29,
+ 26, 22, 56, 38, 50, 36, 34, 38, 92, 24,
+ 26, 88, 60, 2, 89, 73, 75, 55, 61, 49,
+ 41, 45, 39, 47, 61, 13, 17, 21, 8, 77,
+ 73, 63, 23, 17, 23, 15, 34, 11, 2, 3,
+ 52, 17, 12, 18, 2, 17, 124, 108, 76, 90,
+ 108, 88, 52, 90, 68, 60, 66, 36, 10, 2,
+ 4, 50, 36, 48, 42, 38, 36, 44, 28, 58,
+ 42, 16, 24, 34, 51, 124, 124, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 92,
+ 124, 120, 82, 124, 124, 124, 124, 120, 116, 124,
+ 94, 82, 30, 52, 6, 9, 67, 15, 42, 26,
+ 18, 2, 10, 0, 17, 21, 55, 7, 72, 48,
+ 38, 34, 1, 9, 29, 27, 45, 57, 16, 6,
+ 2, 3, 19, 25, 33, 49, 93, 67, 41, 31,
+ 19, 21, 45, 65, 67, 107, 29, 60, 30, 20,
+ 2, 15, 31, 45, 53, 67, 124, 59, 41, 31,
+ 5, 15, 2, 6, 8, 23, 2, 10, 5, 31,
+ 15, 9, 38, 2, 54, 46, 72, 68, 38, 54,
+ 62, 42, 30, 2, 34, 1, 81, 67, 65, 49,
+ 43, 43, 43, 49, 5, 27, 25, 25, 10, 25,
+ 39, 71, 63, 63, 25, 21, 13, 23, 9, 3,
+ 19, 2, 2, 9, 23, 16, 1, 13, 114, 88,
+ 94, 98, 100, 104, 96, 94, 80, 80, 86, 74,
+ 38, 46, 32, 92, 84, 82, 72, 68, 56, 26,
+ 12, 0, 27, 37, 61, 11, 91, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 122, 100,
+ 56, 10, 124, 124, 66, 124, 124, 124, 120, 124,
+ 116, 104, 116, 102, 104, 68, 74, 48, 5, 84,
+ 64, 26, 113, 97, 101, 43, 57, 51, 15, 35,
+ 33, 9, 13, 14, 9, 26, 21, 124, 124, 124,
+ 124, 120, 114, 58, 18, 37, 23, 80, 58, 40,
+ 18, 16, 4, 1, 9, 57, 85, 67, 53, 53,
+ 49, 19, 31, 45, 19, 13, 11, 5, 1, 10,
+ 8, 124, 124, 124, 124, 120, 108, 86, 54, 7,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 1 */
+
+ 124, 18, 21, 124, 18, 21, 123, 77, 22, 20,
+ 24, 58, 120, 124, 108, 28, 103, 12, 27, 1,
+ 2, 28, 17, 24, 3, 40, 124, 9, 55, 81,
+ 121, 77, 7, 27, 1, 2, 43, 59, 6, 36,
+ 9, 33, 63, 16, 7, 43, 49, 10, 23, 59,
+ 18, 11, 33, 49, 5, 19, 19, 31, 15, 10,
+ 44, 0, 0, 0, 37, 45, 67, 15, 44, 2,
+ 104, 16, 11, 123, 75, 37, 19, 83, 123, 123,
+ 123, 59, 123, 97, 123, 115, 101, 115, 101, 2,
+ 7, 39, 79, 11, 57, 51, 123, 19, 65, 53,
+ 123, 16, 35, 23, 119, 57, 45, 25, 13, 2,
+ 7, 39, 4, 1, 4, 14, 3, 1, 4, 27,
+ 26, 22, 56, 38, 50, 36, 34, 38, 90, 24,
+ 26, 86, 58, 2, 87, 71, 73, 53, 59, 47,
+ 39, 43, 37, 45, 57, 13, 17, 19, 6, 75,
+ 71, 63, 21, 17, 21, 13, 34, 9, 2, 3,
+ 50, 15, 12, 16, 2, 17, 124, 108, 76, 90,
+ 108, 88, 52, 90, 68, 58, 66, 36, 10, 2,
+ 4, 50, 36, 48, 42, 38, 34, 44, 28, 56,
+ 40, 16, 22, 32, 51, 124, 124, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 120, 88,
+ 124, 118, 80, 124, 124, 124, 124, 116, 112, 122,
+ 90, 78, 30, 50, 4, 9, 67, 13, 44, 28,
+ 20, 4, 10, 0, 15, 19, 53, 5, 74, 50,
+ 40, 34, 0, 7, 27, 25, 43, 55, 18, 8,
+ 4, 1, 17, 23, 31, 47, 89, 65, 37, 29,
+ 17, 19, 43, 63, 65, 103, 27, 62, 32, 22,
+ 4, 13, 29, 43, 51, 65, 124, 57, 39, 29,
+ 5, 13, 2, 8, 10, 21, 4, 12, 3, 29,
+ 15, 9, 38, 4, 54, 46, 70, 68, 38, 52,
+ 60, 42, 30, 2, 32, 1, 79, 65, 63, 47,
+ 41, 41, 41, 47, 5, 25, 23, 23, 10, 23,
+ 37, 69, 61, 63, 25, 19, 13, 21, 9, 3,
+ 17, 2, 2, 7, 21, 16, 1, 13, 114, 88,
+ 94, 98, 98, 104, 96, 94, 80, 80, 86, 74,
+ 38, 44, 30, 90, 82, 80, 70, 66, 54, 26,
+ 12, 0, 25, 35, 59, 11, 89, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 122, 118, 96,
+ 54, 10, 124, 124, 64, 124, 124, 124, 116, 124,
+ 112, 100, 112, 98, 100, 66, 70, 46, 7, 82,
+ 62, 24, 109, 93, 97, 41, 55, 49, 11, 33,
+ 31, 9, 11, 18, 5, 30, 19, 124, 124, 124,
+ 124, 116, 110, 54, 14, 39, 21, 82, 58, 40,
+ 18, 18, 4, 1, 9, 55, 83, 65, 51, 51,
+ 45, 17, 29, 43, 17, 11, 9, 3, 0, 12,
+ 8, 124, 124, 124, 124, 118, 106, 82, 52, 7,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 2 */
+
+ 124, 18, 21, 124, 18, 21, 119, 75, 22, 20,
+ 24, 56, 118, 122, 108, 28, 99, 12, 25, 0,
+ 2, 26, 17, 22, 5, 38, 120, 13, 57, 83,
+ 115, 75, 7, 25, 0, 2, 43, 57, 6, 34,
+ 9, 33, 61, 16, 7, 43, 49, 10, 23, 57,
+ 18, 11, 33, 49, 5, 19, 19, 31, 15, 10,
+ 44, 0, 0, 0, 35, 45, 67, 15, 42, 2,
+ 104, 16, 11, 121, 73, 37, 19, 81, 119, 119,
+ 121, 57, 119, 95, 119, 113, 99, 113, 99, 4,
+ 7, 37, 77, 11, 57, 49, 119, 19, 65, 53,
+ 121, 16, 35, 23, 117, 57, 43, 25, 13, 2,
+ 7, 37, 4, 1, 2, 14, 3, 1, 4, 27,
+ 26, 22, 54, 38, 48, 36, 34, 38, 86, 24,
+ 26, 82, 56, 0, 85, 69, 71, 51, 57, 45,
+ 37, 41, 37, 43, 55, 13, 17, 19, 4, 75,
+ 69, 63, 21, 17, 19, 13, 32, 7, 2, 3,
+ 48, 13, 10, 14, 2, 19, 120, 106, 74, 88,
+ 106, 86, 50, 88, 68, 56, 64, 36, 10, 2,
+ 4, 48, 34, 46, 40, 36, 32, 42, 26, 52,
+ 38, 14, 20, 30, 51, 124, 124, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 116, 82,
+ 124, 114, 76, 124, 124, 124, 124, 112, 108, 116,
+ 86, 74, 28, 46, 2, 11, 67, 13, 44, 28,
+ 20, 4, 10, 0, 15, 19, 51, 5, 74, 50,
+ 40, 34, 2, 7, 25, 25, 41, 53, 20, 10,
+ 4, 1, 15, 23, 31, 45, 87, 63, 35, 27,
+ 17, 19, 41, 61, 63, 101, 27, 62, 32, 22,
+ 4, 11, 27, 41, 49, 63, 124, 57, 39, 29,
+ 5, 13, 2, 8, 10, 21, 4, 12, 1, 29,
+ 15, 9, 36, 4, 52, 44, 68, 66, 38, 50,
+ 58, 42, 30, 0, 30, 3, 77, 63, 61, 47,
+ 41, 41, 39, 45, 5, 25, 23, 23, 8, 23,
+ 37, 69, 59, 63, 25, 19, 13, 19, 9, 3,
+ 15, 2, 2, 7, 19, 14, 1, 15, 112, 88,
+ 94, 96, 96, 102, 94, 92, 78, 78, 84, 72,
+ 36, 42, 28, 86, 80, 76, 66, 64, 52, 24,
+ 10, 0, 25, 35, 59, 13, 87, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 118, 114, 92,
+ 52, 8, 124, 120, 62, 124, 124, 124, 112, 120,
+ 108, 96, 108, 94, 96, 62, 66, 42, 9, 78,
+ 58, 20, 107, 91, 95, 39, 53, 47, 7, 31,
+ 29, 9, 9, 20, 3, 32, 17, 124, 124, 124,
+ 124, 110, 104, 48, 10, 41, 21, 82, 58, 40,
+ 18, 18, 4, 1, 9, 53, 81, 63, 49, 49,
+ 43, 15, 27, 41, 15, 9, 7, 3, 2, 12,
+ 8, 124, 124, 124, 122, 114, 102, 78, 48, 9,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 3 */
+
+ 124, 18, 21, 124, 18, 21, 115, 71, 24, 20,
+ 22, 52, 114, 120, 108, 28, 95, 12, 23, 2,
+ 2, 24, 17, 20, 7, 38, 116, 15, 59, 83,
+ 109, 73, 7, 23, 2, 2, 41, 55, 8, 34,
+ 9, 31, 59, 14, 9, 43, 49, 10, 23, 57,
+ 18, 11, 33, 49, 3, 19, 19, 31, 13, 10,
+ 44, 0, 0, 0, 35, 45, 67, 13, 40, 2,
+ 104, 16, 11, 119, 71, 37, 17, 79, 115, 115,
+ 117, 55, 115, 93, 115, 111, 97, 111, 97, 6,
+ 7, 35, 75, 11, 55, 49, 115, 19, 63, 51,
+ 119, 16, 35, 21, 113, 55, 41, 25, 13, 2,
+ 7, 35, 6, 0, 2, 14, 3, 1, 4, 27,
+ 26, 20, 54, 38, 46, 36, 34, 38, 82, 24,
+ 24, 78, 54, 1, 83, 67, 69, 49, 55, 45,
+ 35, 41, 35, 41, 53, 13, 17, 19, 2, 73,
+ 67, 63, 21, 17, 17, 13, 30, 5, 2, 3,
+ 46, 11, 10, 12, 2, 21, 118, 104, 74, 86,
+ 104, 84, 50, 86, 66, 54, 62, 36, 10, 2,
+ 2, 46, 32, 44, 38, 34, 30, 40, 26, 48,
+ 36, 14, 18, 28, 51, 124, 124, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 110, 78,
+ 124, 110, 74, 124, 122, 124, 118, 108, 102, 112,
+ 82, 68, 26, 42, 0, 13, 67, 13, 46, 28,
+ 20, 4, 10, 0, 15, 19, 51, 5, 74, 50,
+ 40, 34, 4, 5, 25, 23, 41, 51, 22, 10,
+ 6, 1, 13, 21, 29, 45, 85, 61, 33, 25,
+ 15, 19, 39, 59, 61, 99, 25, 62, 32, 22,
+ 4, 9, 27, 39, 47, 61, 124, 55, 37, 27,
+ 5, 13, 2, 8, 10, 21, 4, 12, 1, 29,
+ 15, 9, 36, 6, 50, 42, 66, 64, 38, 48,
+ 56, 42, 30, 0, 28, 3, 75, 61, 59, 45,
+ 39, 39, 39, 43, 5, 25, 23, 21, 8, 23,
+ 37, 67, 57, 63, 25, 19, 13, 17, 9, 3,
+ 13, 2, 2, 7, 17, 12, 1, 17, 110, 86,
+ 92, 94, 94, 100, 92, 90, 76, 76, 82, 70,
+ 34, 40, 26, 84, 78, 74, 62, 60, 50, 22,
+ 10, 1, 25, 35, 59, 13, 85, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 114, 108, 88,
+ 48, 6, 122, 118, 58, 124, 124, 120, 108, 116,
+ 104, 92, 104, 90, 90, 58, 62, 38, 11, 74,
+ 54, 18, 105, 89, 93, 37, 51, 45, 5, 29,
+ 27, 9, 7, 24, 0, 36, 15, 124, 124, 124,
+ 124, 104, 98, 42, 6, 43, 21, 82, 58, 40,
+ 18, 18, 4, 1, 9, 53, 79, 61, 47, 47,
+ 41, 15, 27, 39, 15, 9, 7, 3, 2, 12,
+ 8, 124, 124, 124, 118, 110, 98, 74, 44, 11,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 4 */
+
+ 124, 18, 21, 124, 18, 21, 113, 69, 24, 20,
+ 22, 50, 112, 116, 108, 28, 89, 10, 21, 2,
+ 2, 22, 17, 18, 9, 36, 112, 19, 61, 85,
+ 103, 71, 7, 21, 2, 2, 41, 53, 8, 32,
+ 9, 31, 59, 14, 9, 41, 49, 10, 23, 55,
+ 16, 13, 33, 49, 3, 17, 19, 29, 13, 10,
+ 44, 0, 0, 0, 33, 47, 67, 13, 38, 2,
+ 104, 16, 11, 117, 69, 37, 17, 75, 113, 111,
+ 115, 53, 113, 89, 111, 109, 97, 109, 97, 6,
+ 7, 33, 73, 11, 55, 47, 111, 19, 63, 51,
+ 117, 16, 33, 21, 111, 55, 41, 25, 11, 2,
+ 7, 35, 6, 0, 0, 12, 3, 1, 4, 27,
+ 26, 20, 52, 38, 46, 36, 34, 36, 78, 24,
+ 24, 74, 52, 3, 81, 65, 67, 47, 55, 43,
+ 33, 39, 35, 39, 51, 13, 17, 17, 0, 73,
+ 65, 63, 21, 17, 17, 13, 28, 3, 2, 3,
+ 42, 9, 8, 10, 2, 23, 114, 102, 72, 84,
+ 102, 82, 48, 84, 66, 50, 60, 34, 10, 2,
+ 2, 44, 32, 42, 38, 32, 28, 38, 24, 44,
+ 34, 12, 16, 26, 51, 124, 124, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 118, 106, 72,
+ 124, 108, 70, 124, 118, 124, 114, 102, 98, 106,
+ 78, 64, 24, 40, 3, 15, 67, 13, 46, 30,
+ 20, 4, 10, 0, 15, 19, 49, 3, 76, 50,
+ 40, 34, 6, 5, 23, 23, 39, 51, 24, 12,
+ 6, 1, 13, 21, 29, 43, 83, 61, 31, 25,
+ 15, 19, 37, 57, 61, 97, 25, 64, 32, 22,
+ 4, 7, 25, 39, 45, 59, 124, 55, 37, 27,
+ 5, 13, 2, 8, 10, 19, 4, 12, 0, 29,
+ 15, 9, 34, 6, 48, 40, 64, 62, 38, 44,
+ 54, 40, 30, 1, 26, 5, 75, 61, 57, 45,
+ 39, 39, 37, 41, 7, 25, 23, 21, 6, 23,
+ 37, 67, 55, 63, 25, 17, 13, 17, 9, 3,
+ 11, 2, 0, 7, 15, 12, 3, 19, 108, 86,
+ 92, 92, 92, 98, 90, 88, 74, 74, 80, 68,
+ 32, 38, 24, 80, 74, 70, 58, 58, 48, 20,
+ 8, 1, 25, 35, 59, 15, 85, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 120, 110, 104, 84,
+ 46, 4, 118, 114, 56, 124, 124, 116, 104, 110,
+ 100, 88, 100, 86, 86, 54, 58, 34, 13, 70,
+ 50, 14, 103, 87, 91, 37, 49, 43, 1, 27,
+ 25, 9, 5, 26, 2, 38, 15, 124, 124, 124,
+ 124, 98, 92, 36, 2, 45, 21, 82, 58, 40,
+ 18, 18, 4, 1, 9, 51, 77, 59, 45, 47,
+ 39, 13, 25, 37, 13, 7, 5, 1, 4, 14,
+ 8, 124, 124, 124, 114, 106, 94, 70, 40, 13,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 5 */
+
+ 124, 18, 21, 124, 18, 21, 109, 65, 24, 20,
+ 20, 46, 108, 114, 108, 28, 85, 10, 19, 4,
+ 2, 22, 15, 16, 11, 36, 108, 23, 63, 85,
+ 97, 67, 7, 19, 4, 2, 41, 51, 8, 32,
+ 9, 31, 57, 14, 11, 41, 49, 10, 23, 53,
+ 16, 13, 33, 49, 1, 17, 17, 29, 11, 10,
+ 44, 0, 0, 0, 33, 47, 67, 11, 36, 2,
+ 104, 16, 11, 115, 67, 37, 15, 73, 109, 107,
+ 111, 51, 109, 87, 107, 107, 95, 107, 95, 8,
+ 7, 31, 71, 11, 53, 45, 107, 19, 63, 49,
+ 113, 18, 33, 19, 109, 53, 39, 25, 11, 4,
+ 5, 33, 8, 2, 0, 12, 3, 1, 4, 27,
+ 26, 18, 50, 38, 44, 36, 34, 36, 74, 24,
+ 22, 72, 50, 5, 79, 63, 65, 45, 53, 41,
+ 31, 37, 33, 37, 49, 13, 17, 17, 1, 71,
+ 63, 63, 19, 17, 15, 13, 26, 1, 2, 3,
+ 40, 7, 8, 8, 2, 23, 112, 100, 72, 82,
+ 100, 80, 46, 84, 66, 48, 58, 34, 10, 2,
+ 0, 44, 30, 40, 36, 30, 26, 38, 22, 40,
+ 32, 10, 14, 24, 51, 124, 124, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 114, 102, 68,
+ 120, 104, 66, 124, 114, 120, 110, 98, 94, 100,
+ 74, 58, 22, 36, 5, 15, 67, 13, 46, 30,
+ 20, 4, 10, 0, 15, 19, 49, 3, 76, 50,
+ 40, 34, 8, 3, 21, 23, 37, 49, 26, 14,
+ 6, 0, 11, 19, 27, 43, 81, 59, 27, 23,
+ 15, 17, 35, 55, 59, 95, 23, 64, 34, 22,
+ 4, 5, 23, 37, 43, 57, 124, 55, 37, 25,
+ 5, 13, 2, 8, 10, 19, 4, 14, 0, 29,
+ 15, 9, 32, 8, 46, 38, 62, 62, 38, 42,
+ 52, 40, 30, 3, 24, 5, 73, 59, 55, 43,
+ 37, 37, 37, 39, 7, 25, 23, 21, 4, 23,
+ 37, 65, 53, 63, 25, 17, 13, 15, 9, 3,
+ 9, 2, 0, 7, 13, 10, 3, 19, 106, 86,
+ 90, 92, 90, 96, 88, 86, 74, 72, 78, 66,
+ 30, 36, 22, 78, 72, 68, 54, 56, 46, 18,
+ 6, 3, 25, 33, 59, 15, 83, 124, 124, 124,
+ 124, 124, 124, 124, 124, 120, 116, 106, 100, 80,
+ 42, 2, 114, 110, 54, 122, 124, 112, 100, 106,
+ 96, 84, 96, 82, 80, 50, 54, 30, 15, 66,
+ 46, 12, 101, 83, 89, 35, 47, 41, 2, 25,
+ 23, 9, 3, 30, 6, 42, 13, 124, 124, 124,
+ 124, 94, 86, 32, 1, 47, 21, 82, 58, 40,
+ 18, 18, 4, 1, 9, 51, 75, 57, 43, 45,
+ 37, 11, 25, 35, 11, 5, 3, 1, 4, 14,
+ 8, 124, 124, 124, 112, 102, 90, 66, 36, 15,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 6 */
+
+ 124, 18, 23, 124, 18, 23, 105, 63, 26, 20,
+ 20, 44, 106, 112, 108, 28, 81, 10, 19, 6,
+ 2, 20, 15, 14, 13, 34, 106, 25, 65, 87,
+ 91, 65, 7, 19, 6, 2, 39, 49, 10, 30,
+ 7, 29, 55, 12, 11, 41, 49, 10, 21, 53,
+ 16, 13, 31, 49, 1, 17, 17, 29, 11, 10,
+ 44, 0, 0, 0, 31, 47, 67, 11, 36, 0,
+ 104, 16, 11, 113, 67, 37, 15, 71, 105, 103,
+ 109, 49, 105, 85, 103, 105, 93, 105, 93, 10,
+ 7, 29, 71, 9, 53, 45, 103, 19, 61, 49,
+ 111, 18, 33, 19, 105, 53, 37, 23, 11, 4,
+ 5, 31, 8, 2, 1, 12, 3, 1, 4, 27,
+ 26, 18, 50, 38, 42, 36, 34, 36, 70, 24,
+ 22, 68, 48, 7, 79, 61, 65, 45, 51, 41,
+ 29, 37, 33, 37, 45, 13, 17, 17, 3, 71,
+ 61, 63, 19, 17, 13, 11, 24, 1, 2, 3,
+ 38, 5, 6, 6, 2, 25, 108, 98, 70, 82,
+ 98, 80, 46, 82, 64, 46, 56, 34, 10, 2,
+ 0, 42, 28, 38, 34, 30, 24, 36, 22, 36,
+ 30, 10, 12, 22, 51, 124, 124, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 108, 96, 62,
+ 116, 100, 64, 124, 108, 114, 104, 94, 88, 96,
+ 68, 54, 20, 32, 7, 17, 67, 11, 48, 30,
+ 22, 4, 10, 0, 15, 19, 47, 3, 76, 52,
+ 40, 34, 10, 3, 21, 21, 37, 47, 28, 14,
+ 8, 0, 9, 19, 27, 41, 79, 57, 25, 21,
+ 13, 17, 35, 55, 57, 91, 23, 64, 34, 22,
+ 6, 5, 23, 35, 43, 55, 124, 53, 35, 25,
+ 5, 13, 2, 8, 10, 19, 6, 14, 2, 29,
+ 15, 11, 32, 8, 44, 36, 60, 60, 38, 40,
+ 50, 40, 30, 3, 22, 7, 71, 57, 53, 43,
+ 37, 37, 35, 39, 7, 23, 21, 19, 4, 23,
+ 37, 65, 51, 63, 25, 17, 13, 13, 9, 3,
+ 7, 0, 0, 7, 13, 8, 3, 21, 104, 84,
+ 90, 90, 88, 96, 88, 84, 72, 72, 76, 64,
+ 28, 34, 20, 74, 70, 64, 50, 52, 42, 16,
+ 6, 3, 25, 33, 57, 17, 81, 124, 124, 124,
+ 124, 124, 124, 124, 124, 116, 110, 102, 94, 76,
+ 40, 2, 112, 108, 50, 118, 124, 108, 96, 102,
+ 92, 80, 90, 78, 76, 46, 50, 28, 19, 62,
+ 42, 8, 99, 81, 87, 33, 45, 39, 4, 23,
+ 21, 9, 1, 32, 8, 44, 11, 124, 124, 124,
+ 118, 88, 82, 26, 5, 51, 19, 82, 58, 40,
+ 18, 18, 4, 1, 9, 49, 73, 57, 41, 43,
+ 35, 11, 23, 33, 11, 5, 3, 1, 6, 14,
+ 8, 124, 124, 122, 108, 100, 88, 60, 34, 17,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 7 */
+
+ 124, 18, 23, 124, 18, 23, 101, 59, 26, 20,
+ 18, 40, 102, 108, 108, 28, 75, 8, 17, 6,
+ 2, 18, 15, 12, 15, 34, 102, 29, 67, 87,
+ 85, 63, 7, 17, 6, 2, 39, 47, 10, 30,
+ 7, 29, 55, 12, 13, 39, 49, 10, 21, 51,
+ 14, 13, 31, 49, 0, 15, 17, 27, 9, 10,
+ 44, 0, 0, 0, 31, 47, 67, 9, 34, 0,
+ 104, 16, 11, 111, 65, 37, 13, 67, 103, 99,
+ 105, 47, 103, 81, 99, 103, 91, 103, 93, 12,
+ 7, 27, 69, 9, 51, 43, 99, 19, 61, 47,
+ 109, 18, 31, 17, 103, 51, 37, 23, 9, 4,
+ 5, 29, 10, 4, 1, 10, 3, 1, 4, 27,
+ 26, 16, 48, 38, 42, 36, 34, 34, 66, 24,
+ 20, 64, 46, 9, 77, 59, 63, 43, 49, 39,
+ 27, 35, 31, 35, 43, 13, 17, 15, 5, 69,
+ 59, 63, 19, 17, 13, 11, 22, 0, 2, 3,
+ 34, 3, 6, 4, 2, 27, 106, 96, 70, 80,
+ 96, 78, 44, 80, 64, 44, 54, 34, 10, 2,
+ 1, 40, 28, 36, 34, 28, 22, 34, 20, 32,
+ 28, 8, 10, 20, 51, 124, 124, 124, 124, 124,
+ 124, 124, 124, 124, 124, 120, 122, 104, 92, 58,
+ 112, 98, 60, 124, 104, 110, 100, 88, 84, 90,
+ 64, 48, 18, 30, 11, 19, 67, 11, 48, 32,
+ 22, 4, 10, 0, 15, 19, 47, 1, 78, 52,
+ 40, 34, 12, 1, 19, 21, 35, 45, 30, 16,
+ 8, 0, 7, 17, 25, 41, 77, 57, 23, 21,
+ 13, 17, 33, 53, 57, 89, 21, 66, 34, 22,
+ 6, 3, 21, 33, 41, 53, 124, 53, 35, 23,
+ 5, 13, 2, 8, 10, 17, 6, 14, 2, 29,
+ 15, 11, 30, 10, 42, 34, 58, 58, 38, 38,
+ 48, 38, 30, 5, 20, 7, 69, 57, 51, 41,
+ 35, 35, 35, 37, 7, 23, 21, 19, 2, 23,
+ 37, 63, 49, 63, 25, 15, 13, 13, 9, 3,
+ 5, 0, 0, 7, 11, 8, 5, 23, 102, 84,
+ 88, 88, 86, 94, 86, 82, 70, 70, 74, 62,
+ 26, 32, 18, 72, 66, 62, 46, 50, 40, 14,
+ 4, 5, 25, 33, 57, 17, 79, 124, 124, 124,
+ 124, 124, 124, 124, 122, 112, 106, 98, 90, 72,
+ 36, 0, 108, 104, 48, 114, 124, 104, 92, 98,
+ 88, 76, 86, 74, 70, 42, 46, 24, 21, 58,
+ 38, 6, 97, 79, 85, 33, 43, 37, 8, 21,
+ 19, 9, 0, 36, 12, 48, 11, 124, 124, 122,
+ 112, 82, 76, 20, 9, 53, 19, 82, 58, 40,
+ 18, 18, 4, 1, 9, 49, 71, 55, 39, 41,
+ 33, 9, 23, 31, 9, 3, 1, 0, 6, 16,
+ 8, 124, 124, 118, 104, 96, 84, 56, 30, 19,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 8 */
+
+ 124, 16, 23, 124, 16, 23, 99, 57, 26, 20,
+ 18, 38, 100, 106, 108, 28, 71, 8, 15, 8,
+ 2, 16, 15, 10, 19, 32, 98, 33, 69, 89,
+ 81, 61, 7, 15, 8, 2, 39, 45, 10, 28,
+ 7, 29, 53, 10, 13, 39, 51, 10, 21, 51,
+ 14, 15, 31, 49, 0, 15, 17, 27, 9, 10,
+ 44, 0, 0, 0, 29, 49, 67, 9, 32, 0,
+ 104, 16, 11, 109, 63, 37, 13, 65, 99, 95,
+ 103, 45, 99, 79, 97, 101, 91, 101, 91, 12,
+ 7, 25, 67, 9, 51, 43, 97, 19, 61, 47,
+ 107, 18, 31, 17, 101, 51, 35, 23, 9, 4,
+ 5, 29, 10, 4, 3, 10, 3, 1, 4, 27,
+ 26, 16, 46, 38, 40, 36, 34, 34, 62, 24,
+ 20, 60, 44, 11, 75, 57, 61, 41, 49, 39,
+ 25, 35, 31, 33, 41, 13, 17, 15, 9, 69,
+ 57, 63, 19, 19, 11, 11, 20, 2, 2, 3,
+ 32, 1, 4, 2, 2, 29, 102, 94, 68, 78,
+ 94, 76, 42, 78, 62, 40, 52, 32, 10, 2,
+ 1, 38, 26, 34, 32, 26, 20, 32, 18, 28,
+ 24, 6, 8, 18, 51, 124, 124, 124, 124, 124,
+ 124, 124, 124, 124, 122, 116, 116, 98, 86, 52,
+ 108, 94, 56, 122, 100, 104, 94, 84, 78, 84,
+ 60, 44, 16, 26, 13, 21, 69, 11, 48, 32,
+ 22, 4, 10, 0, 15, 19, 45, 1, 78, 52,
+ 40, 34, 14, 1, 19, 21, 35, 45, 32, 16,
+ 8, 0, 7, 17, 25, 39, 75, 55, 21, 19,
+ 13, 17, 31, 51, 55, 87, 21, 66, 34, 22,
+ 6, 1, 21, 33, 39, 53, 124, 53, 35, 23,
+ 5, 13, 2, 8, 10, 17, 6, 14, 4, 29,
+ 15, 11, 28, 10, 40, 32, 56, 56, 38, 34,
+ 44, 38, 30, 7, 18, 9, 69, 55, 49, 41,
+ 35, 35, 33, 35, 9, 23, 21, 19, 0, 23,
+ 37, 63, 49, 65, 25, 15, 13, 11, 9, 3,
+ 5, 0, 1, 7, 9, 6, 5, 25, 100, 82,
+ 88, 86, 82, 92, 84, 80, 68, 68, 72, 60,
+ 24, 30, 16, 68, 64, 58, 42, 46, 38, 12,
+ 2, 5, 25, 33, 57, 19, 79, 124, 124, 124,
+ 124, 124, 124, 122, 116, 108, 102, 94, 84, 68,
+ 34, 1, 104, 100, 44, 110, 122, 98, 86, 92,
+ 82, 72, 82, 68, 66, 38, 40, 20, 23, 54,
+ 34, 2, 95, 77, 83, 31, 41, 37, 10, 19,
+ 19, 9, 0, 38, 14, 50, 9, 124, 124, 116,
+ 106, 76, 70, 14, 13, 55, 19, 82, 58, 40,
+ 18, 18, 4, 1, 9, 47, 71, 53, 37, 41,
+ 31, 9, 21, 31, 9, 3, 1, 0, 8, 16,
+ 6, 124, 124, 114, 100, 92, 80, 52, 26, 21,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 9 */
+
+ 124, 16, 23, 124, 16, 23, 95, 55, 28, 20,
+ 18, 36, 98, 104, 108, 28, 67, 8, 13, 10,
+ 2, 16, 13, 8, 21, 30, 94, 35, 71, 91,
+ 75, 57, 7, 13, 10, 2, 37, 43, 12, 26,
+ 7, 27, 51, 10, 13, 39, 51, 10, 21, 49,
+ 14, 15, 31, 49, 0, 15, 15, 27, 9, 10,
+ 44, 0, 0, 0, 27, 49, 67, 9, 30, 0,
+ 104, 16, 11, 107, 61, 37, 13, 63, 95, 91,
+ 99, 41, 95, 77, 93, 99, 89, 99, 89, 14,
+ 5, 23, 65, 9, 49, 41, 93, 19, 59, 47,
+ 103, 20, 31, 17, 97, 51, 33, 23, 9, 6,
+ 3, 27, 10, 4, 3, 10, 1, 1, 4, 25,
+ 26, 16, 46, 38, 38, 36, 34, 34, 58, 24,
+ 20, 58, 42, 11, 73, 55, 59, 39, 47, 37,
+ 23, 33, 31, 31, 39, 13, 17, 15, 11, 67,
+ 55, 63, 17, 19, 9, 11, 18, 4, 2, 3,
+ 30, 0, 2, 0, 2, 29, 100, 92, 68, 76,
+ 92, 74, 42, 78, 62, 38, 50, 32, 10, 2,
+ 1, 38, 24, 32, 30, 24, 18, 32, 18, 26,
+ 22, 6, 6, 16, 51, 124, 124, 124, 124, 124,
+ 124, 124, 124, 124, 118, 112, 112, 92, 82, 46,
+ 106, 90, 54, 118, 96, 100, 90, 80, 74, 80,
+ 56, 40, 16, 22, 15, 21, 69, 11, 50, 32,
+ 22, 6, 10, 0, 13, 19, 43, 1, 78, 52,
+ 42, 34, 16, 0, 17, 19, 33, 43, 34, 18,
+ 10, 2, 5, 15, 25, 37, 73, 53, 17, 17,
+ 11, 15, 29, 49, 53, 85, 19, 66, 36, 24,
+ 6, 0, 19, 31, 37, 51, 124, 51, 33, 21,
+ 5, 13, 2, 10, 12, 17, 6, 16, 6, 29,
+ 15, 11, 28, 10, 38, 32, 54, 56, 38, 32,
+ 42, 38, 30, 7, 16, 11, 67, 53, 47, 41,
+ 33, 35, 31, 33, 9, 23, 21, 17, 0, 23,
+ 37, 63, 47, 65, 25, 15, 13, 9, 9, 3,
+ 3, 0, 1, 7, 7, 4, 5, 25, 98, 82,
+ 88, 86, 80, 90, 82, 78, 68, 66, 70, 60,
+ 24, 28, 14, 66, 62, 54, 38, 44, 36, 12,
+ 2, 5, 23, 31, 57, 21, 77, 124, 124, 124,
+ 124, 124, 124, 118, 112, 104, 98, 90, 80, 64,
+ 32, 3, 100, 98, 42, 106, 118, 94, 82, 88,
+ 78, 68, 78, 64, 62, 36, 36, 16, 25, 50,
+ 30, 1, 93, 73, 79, 29, 39, 35, 14, 17,
+ 17, 9, 2, 42, 16, 54, 7, 124, 124, 112,
+ 100, 72, 64, 10, 17, 57, 19, 82, 58, 40,
+ 18, 20, 4, 1, 9, 45, 69, 51, 35, 39,
+ 27, 7, 19, 29, 7, 1, 0, 0, 10, 16,
+ 6, 124, 122, 112, 98, 88, 76, 48, 22, 21,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 10 */
+
+ 124, 16, 23, 124, 16, 23, 91, 51, 28, 20,
+ 16, 32, 94, 100, 108, 28, 61, 6, 11, 10,
+ 2, 14, 13, 6, 23, 30, 90, 39, 73, 91,
+ 69, 55, 7, 11, 10, 2, 37, 41, 12, 26,
+ 7, 27, 51, 10, 15, 37, 51, 10, 21, 47,
+ 12, 15, 31, 49, 2, 13, 15, 25, 7, 10,
+ 44, 0, 0, 0, 27, 49, 67, 7, 28, 0,
+ 104, 16, 11, 105, 59, 37, 11, 59, 93, 87,
+ 97, 39, 93, 73, 89, 97, 87, 97, 89, 16,
+ 5, 21, 63, 9, 49, 39, 89, 19, 59, 45,
+ 101, 20, 29, 15, 95, 49, 33, 23, 7, 6,
+ 3, 25, 12, 6, 5, 8, 1, 1, 4, 25,
+ 26, 14, 44, 38, 38, 36, 34, 32, 54, 24,
+ 18, 54, 40, 13, 71, 53, 57, 37, 45, 35,
+ 21, 31, 29, 29, 37, 13, 17, 13, 13, 67,
+ 53, 63, 17, 19, 9, 11, 16, 6, 2, 3,
+ 26, 2, 2, 1, 2, 31, 96, 90, 66, 74,
+ 90, 72, 40, 76, 62, 36, 48, 32, 10, 2,
+ 3, 36, 24, 30, 30, 22, 16, 30, 16, 22,
+ 20, 4, 4, 14, 51, 124, 124, 124, 124, 124,
+ 124, 124, 124, 124, 114, 108, 106, 88, 78, 42,
+ 102, 88, 50, 112, 92, 96, 86, 74, 70, 74,
+ 52, 34, 14, 20, 19, 23, 69, 11, 50, 34,
+ 22, 6, 10, 0, 13, 19, 43, 0, 80, 52,
+ 42, 34, 18, 0, 15, 19, 31, 41, 36, 20,
+ 10, 2, 3, 15, 23, 37, 71, 53, 15, 17,
+ 11, 15, 27, 47, 53, 83, 19, 68, 36, 24,
+ 6, 2, 17, 29, 35, 49, 124, 51, 33, 21,
+ 5, 13, 2, 10, 12, 15, 6, 16, 6, 29,
+ 15, 11, 26, 12, 36, 30, 52, 54, 38, 30,
+ 40, 36, 30, 9, 14, 11, 65, 53, 45, 39,
+ 33, 33, 31, 31, 9, 23, 21, 17, 1, 23,
+ 37, 61, 45, 65, 25, 13, 13, 9, 9, 3,
+ 1, 0, 1, 7, 5, 4, 7, 27, 96, 82,
+ 86, 84, 78, 88, 80, 76, 66, 64, 68, 58,
+ 22, 26, 12, 62, 58, 52, 34, 42, 34, 10,
+ 0, 7, 23, 31, 57, 21, 75, 124, 124, 124,
+ 124, 124, 120, 114, 106, 100, 94, 86, 76, 60,
+ 28, 5, 96, 94, 40, 102, 114, 90, 78, 84,
+ 74, 64, 74, 60, 56, 32, 32, 12, 27, 46,
+ 26, 3, 91, 71, 77, 29, 37, 33, 18, 15,
+ 15, 9, 4, 44, 20, 56, 7, 124, 120, 106,
+ 94, 66, 58, 4, 21, 59, 19, 82, 58, 40,
+ 18, 20, 4, 1, 9, 45, 67, 49, 33, 37,
+ 25, 5, 19, 27, 5, 0, 2, 2, 10, 18,
+ 6, 120, 118, 108, 94, 84, 72, 44, 18, 23,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 11 */
+
+ 124, 16, 25, 124, 16, 25, 87, 49, 30, 20,
+ 16, 30, 92, 98, 108, 28, 57, 6, 11, 12,
+ 2, 12, 13, 4, 25, 28, 88, 41, 75, 93,
+ 63, 53, 7, 11, 12, 2, 35, 39, 14, 24,
+ 5, 25, 49, 8, 15, 37, 51, 10, 19, 47,
+ 12, 15, 29, 49, 2, 13, 15, 25, 7, 10,
+ 44, 0, 0, 0, 25, 49, 67, 7, 28, 1,
+ 104, 16, 11, 103, 59, 37, 11, 57, 89, 83,
+ 93, 37, 89, 71, 85, 95, 85, 95, 87, 18,
+ 5, 19, 63, 7, 47, 39, 85, 19, 57, 45,
+ 99, 20, 29, 15, 91, 49, 31, 21, 7, 6,
+ 3, 23, 12, 6, 5, 8, 1, 1, 4, 25,
+ 26, 14, 44, 38, 36, 36, 34, 32, 50, 24,
+ 18, 50, 38, 15, 71, 51, 57, 37, 43, 35,
+ 19, 31, 29, 29, 33, 13, 17, 13, 15, 65,
+ 51, 63, 17, 19, 7, 9, 14, 6, 2, 3,
+ 24, 4, 0, 3, 2, 33, 94, 88, 66, 74,
+ 88, 72, 40, 74, 60, 34, 46, 32, 10, 2,
+ 3, 34, 22, 28, 28, 22, 14, 28, 16, 18,
+ 18, 4, 2, 12, 51, 124, 124, 124, 124, 124,
+ 124, 124, 124, 124, 108, 104, 102, 82, 72, 36,
+ 98, 84, 48, 108, 86, 90, 80, 70, 64, 70,
+ 46, 30, 12, 16, 21, 25, 69, 9, 52, 34,
+ 24, 6, 10, 0, 13, 19, 41, 0, 80, 54,
+ 42, 34, 20, 2, 15, 17, 31, 39, 38, 20,
+ 12, 2, 1, 13, 23, 35, 69, 51, 13, 15,
+ 9, 15, 27, 47, 51, 79, 17, 68, 36, 24,
+ 8, 2, 17, 27, 35, 47, 124, 49, 31, 19,
+ 5, 13, 2, 10, 12, 15, 8, 16, 8, 29,
+ 15, 13, 26, 12, 34, 28, 50, 52, 38, 28,
+ 38, 36, 30, 9, 12, 13, 63, 51, 43, 39,
+ 31, 33, 29, 31, 9, 21, 19, 15, 1, 23,
+ 37, 61, 43, 65, 25, 13, 13, 7, 9, 3,
+ 0, 1, 1, 7, 5, 2, 7, 29, 94, 80,
+ 86, 82, 76, 88, 80, 74, 64, 64, 66, 56,
+ 20, 24, 10, 60, 56, 48, 30, 38, 30, 8,
+ 0, 7, 23, 31, 55, 23, 73, 124, 124, 124,
+ 124, 124, 116, 110, 102, 96, 88, 82, 70, 56,
+ 26, 5, 94, 92, 36, 98, 108, 86, 74, 80,
+ 70, 60, 68, 56, 52, 28, 28, 10, 31, 42,
+ 22, 7, 89, 69, 75, 27, 35, 31, 20, 13,
+ 13, 9, 6, 48, 22, 60, 5, 122, 118, 102,
+ 88, 60, 54, 1, 25, 63, 17, 82, 58, 40,
+ 18, 20, 4, 1, 9, 43, 65, 49, 31, 35,
+ 23, 5, 17, 25, 5, 0, 2, 2, 12, 18,
+ 6, 118, 116, 104, 90, 82, 70, 38, 16, 25,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 12 */
+
+ 124, 16, 25, 124, 16, 25, 85, 45, 30, 20,
+ 14, 26, 88, 96, 108, 28, 53, 6, 9, 14,
+ 2, 10, 13, 2, 27, 28, 84, 45, 77, 93,
+ 57, 51, 7, 9, 14, 2, 35, 37, 14, 24,
+ 5, 25, 47, 8, 17, 37, 51, 10, 19, 45,
+ 12, 17, 29, 49, 4, 13, 15, 25, 5, 10,
+ 44, 0, 0, 0, 25, 51, 67, 5, 26, 1,
+ 104, 16, 11, 101, 57, 37, 9, 55, 85, 79,
+ 91, 35, 85, 69, 81, 93, 85, 93, 85, 18,
+ 5, 17, 61, 7, 47, 37, 81, 19, 57, 43,
+ 97, 20, 29, 13, 89, 47, 29, 21, 7, 6,
+ 3, 23, 14, 8, 7, 8, 1, 1, 4, 25,
+ 26, 12, 42, 38, 34, 36, 34, 32, 46, 24,
+ 16, 46, 36, 17, 69, 49, 55, 35, 43, 33,
+ 17, 29, 27, 27, 31, 13, 17, 13, 17, 65,
+ 49, 63, 17, 19, 5, 9, 12, 8, 2, 3,
+ 22, 6, 0, 5, 2, 35, 90, 86, 64, 72,
+ 86, 70, 38, 72, 60, 30, 44, 30, 10, 2,
+ 5, 32, 20, 26, 26, 20, 12, 26, 14, 14,
+ 16, 2, 0, 10, 51, 124, 124, 122, 124, 124,
+ 124, 124, 124, 122, 104, 100, 96, 78, 68, 32,
+ 94, 80, 44, 104, 82, 86, 76, 66, 60, 64,
+ 42, 24, 10, 12, 23, 27, 69, 9, 52, 34,
+ 24, 6, 10, 0, 13, 19, 41, 0, 80, 54,
+ 42, 34, 22, 2, 13, 17, 29, 39, 40, 22,
+ 12, 2, 1, 13, 21, 35, 67, 49, 11, 13,
+ 9, 15, 25, 45, 49, 77, 17, 68, 36, 24,
+ 8, 4, 15, 27, 33, 45, 124, 49, 31, 19,
+ 5, 13, 2, 10, 12, 15, 8, 16, 8, 29,
+ 15, 13, 24, 14, 32, 26, 48, 50, 38, 24,
+ 36, 36, 30, 11, 10, 13, 63, 49, 41, 37,
+ 31, 31, 29, 29, 11, 21, 19, 15, 3, 23,
+ 37, 59, 41, 65, 25, 13, 13, 5, 9, 3,
+ 2, 1, 3, 7, 3, 0, 7, 31, 92, 80,
+ 84, 80, 74, 86, 78, 72, 62, 62, 64, 54,
+ 18, 22, 8, 56, 54, 46, 26, 36, 28, 6,
+ 1, 9, 23, 31, 55, 23, 73, 124, 124, 124,
+ 124, 124, 112, 106, 96, 92, 84, 78, 66, 52,
+ 22, 7, 90, 88, 34, 94, 104, 82, 70, 74,
+ 66, 56, 64, 52, 46, 24, 24, 6, 33, 38,
+ 18, 9, 87, 67, 73, 25, 33, 29, 24, 11,
+ 11, 9, 8, 50, 26, 62, 3, 118, 114, 96,
+ 82, 54, 48, 7, 29, 65, 17, 82, 58, 40,
+ 18, 20, 4, 1, 9, 43, 63, 47, 29, 35,
+ 21, 3, 17, 23, 3, 2, 4, 2, 12, 18,
+ 6, 116, 112, 100, 86, 78, 66, 34, 12, 27,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 13 */
+
+ 124, 16, 25, 124, 16, 25, 81, 43, 30, 20,
+ 14, 24, 86, 92, 108, 28, 47, 4, 7, 14,
+ 2, 10, 11, 0, 29, 26, 80, 49, 79, 95,
+ 51, 47, 7, 7, 14, 2, 35, 35, 14, 22,
+ 5, 25, 47, 8, 17, 35, 51, 10, 19, 43,
+ 10, 17, 29, 49, 4, 11, 13, 23, 5, 10,
+ 44, 0, 0, 0, 23, 51, 67, 5, 24, 1,
+ 104, 16, 11, 99, 55, 37, 9, 51, 83, 75,
+ 87, 33, 83, 65, 77, 91, 83, 91, 85, 20,
+ 5, 15, 59, 7, 45, 35, 77, 19, 57, 43,
+ 93, 22, 27, 13, 87, 47, 29, 21, 5, 8,
+ 1, 21, 14, 8, 7, 6, 1, 1, 4, 25,
+ 26, 12, 40, 38, 34, 36, 34, 30, 42, 24,
+ 16, 44, 34, 19, 67, 47, 53, 33, 41, 31,
+ 15, 27, 27, 25, 29, 13, 17, 11, 19, 63,
+ 47, 63, 15, 19, 5, 9, 10, 10, 2, 3,
+ 18, 8, 1, 7, 2, 35, 88, 84, 64, 70,
+ 84, 68, 36, 72, 60, 28, 42, 30, 10, 2,
+ 5, 32, 20, 24, 26, 18, 10, 26, 12, 10,
+ 14, 0, 1, 8, 51, 122, 124, 118, 124, 122,
+ 120, 120, 120, 118, 100, 96, 92, 72, 64, 26,
+ 90, 78, 40, 98, 78, 82, 72, 60, 56, 58,
+ 38, 20, 8, 10, 27, 27, 69, 9, 52, 36,
+ 24, 6, 10, 0, 13, 19, 39, 2, 82, 54,
+ 42, 34, 24, 4, 11, 17, 27, 37, 42, 24,
+ 12, 4, 0, 11, 21, 33, 65, 49, 7, 13,
+ 9, 13, 23, 43, 49, 75, 15, 70, 38, 24,
+ 8, 6, 13, 25, 31, 43, 124, 49, 31, 17,
+ 5, 13, 2, 10, 12, 13, 8, 18, 10, 29,
+ 15, 13, 22, 14, 30, 24, 46, 50, 38, 22,
+ 34, 34, 30, 13, 8, 15, 61, 49, 39, 37,
+ 29, 31, 27, 27, 11, 21, 19, 15, 5, 23,
+ 37, 59, 39, 65, 25, 11, 13, 5, 9, 3,
+ 4, 1, 3, 7, 1, 0, 9, 31, 90, 80,
+ 84, 80, 72, 84, 76, 70, 62, 60, 62, 52,
+ 16, 20, 6, 54, 50, 42, 22, 34, 26, 4,
+ 3, 9, 23, 29, 55, 25, 71, 124, 124, 124,
+ 124, 120, 108, 102, 92, 88, 80, 74, 62, 48,
+ 20, 9, 86, 84, 32, 90, 100, 78, 66, 70,
+ 62, 52, 60, 48, 42, 20, 20, 2, 35, 34,
+ 14, 13, 85, 63, 71, 25, 31, 27, 28, 9,
+ 9, 9, 10, 54, 28, 66, 3, 116, 110, 92,
+ 76, 50, 42, 11, 33, 67, 17, 82, 58, 40,
+ 18, 20, 4, 1, 9, 41, 61, 45, 27, 33,
+ 19, 1, 15, 21, 1, 4, 6, 4, 14, 20,
+ 6, 112, 110, 98, 84, 74, 62, 30, 8, 29,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 14 */
+
+ 122, 16, 25, 122, 16, 25, 77, 39, 32, 20,
+ 12, 20, 82, 90, 108, 28, 43, 4, 5, 16,
+ 2, 8, 11, 1, 31, 26, 76, 51, 81, 95,
+ 45, 45, 7, 5, 16, 2, 33, 33, 16, 22,
+ 5, 23, 45, 6, 19, 35, 51, 10, 19, 43,
+ 10, 17, 29, 49, 6, 11, 13, 23, 3, 10,
+ 44, 0, 0, 0, 23, 51, 67, 3, 22, 1,
+ 104, 16, 11, 97, 53, 37, 7, 49, 79, 71,
+ 85, 31, 79, 63, 73, 89, 81, 89, 83, 22,
+ 5, 13, 57, 7, 45, 35, 73, 19, 55, 41,
+ 91, 22, 27, 11, 83, 45, 27, 21, 5, 8,
+ 1, 19, 16, 10, 9, 6, 1, 1, 4, 25,
+ 26, 10, 40, 38, 32, 36, 34, 30, 38, 24,
+ 14, 40, 32, 21, 65, 45, 51, 31, 39, 31,
+ 13, 27, 25, 23, 27, 13, 17, 11, 21, 63,
+ 45, 63, 15, 19, 3, 9, 8, 12, 2, 3,
+ 16, 10, 1, 9, 2, 37, 84, 82, 62, 68,
+ 82, 66, 36, 70, 58, 26, 40, 30, 10, 2,
+ 7, 30, 18, 22, 24, 16, 8, 24, 12, 6,
+ 12, 0, 3, 6, 51, 120, 122, 116, 124, 118,
+ 116, 116, 116, 112, 94, 92, 86, 68, 58, 22,
+ 86, 74, 38, 94, 74, 76, 66, 56, 50, 54,
+ 34, 14, 6, 6, 29, 29, 69, 9, 54, 36,
+ 24, 6, 10, 0, 13, 19, 39, 2, 82, 54,
+ 42, 34, 26, 4, 11, 15, 27, 35, 44, 24,
+ 14, 4, 2, 11, 19, 33, 63, 47, 5, 11,
+ 7, 13, 21, 41, 47, 73, 15, 70, 38, 24,
+ 8, 8, 13, 23, 29, 41, 124, 47, 29, 17,
+ 5, 13, 2, 10, 12, 13, 8, 18, 10, 29,
+ 15, 13, 22, 16, 28, 22, 44, 48, 38, 20,
+ 32, 34, 30, 13, 6, 15, 59, 47, 37, 35,
+ 29, 29, 27, 25, 11, 21, 19, 13, 5, 23,
+ 37, 57, 37, 65, 25, 11, 13, 3, 9, 3,
+ 6, 1, 3, 7, 0, 1, 9, 33, 88, 78,
+ 82, 78, 70, 82, 74, 68, 60, 58, 60, 50,
+ 14, 18, 4, 50, 48, 40, 18, 30, 24, 2,
+ 3, 11, 23, 29, 55, 25, 69, 124, 124, 122,
+ 122, 114, 104, 98, 86, 84, 76, 70, 56, 44,
+ 16, 11, 82, 82, 28, 86, 96, 74, 62, 66,
+ 58, 48, 56, 44, 36, 16, 16, 1, 37, 30,
+ 10, 15, 83, 61, 69, 23, 29, 25, 30, 7,
+ 7, 9, 12, 56, 32, 68, 1, 112, 108, 86,
+ 70, 44, 36, 17, 37, 69, 17, 82, 58, 40,
+ 18, 20, 4, 1, 9, 41, 59, 43, 25, 31,
+ 17, 1, 15, 19, 1, 4, 6, 4, 14, 20,
+ 6, 110, 106, 94, 80, 70, 58, 26, 4, 31,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 15 */
+
+ 120, 16, 25, 120, 16, 25, 73, 37, 32, 20,
+ 12, 18, 80, 88, 108, 28, 39, 4, 3, 18,
+ 2, 6, 11, 3, 33, 24, 72, 55, 83, 97,
+ 39, 43, 7, 3, 18, 2, 33, 31, 16, 20,
+ 5, 23, 43, 6, 19, 35, 51, 10, 19, 41,
+ 10, 17, 29, 49, 6, 11, 13, 23, 3, 10,
+ 44, 0, 0, 0, 21, 51, 67, 3, 20, 1,
+ 104, 16, 11, 95, 51, 37, 7, 47, 75, 67,
+ 81, 29, 75, 61, 69, 87, 79, 87, 81, 24,
+ 5, 11, 55, 7, 43, 33, 69, 19, 55, 41,
+ 89, 22, 27, 11, 81, 45, 25, 21, 5, 8,
+ 1, 17, 16, 10, 9, 6, 1, 1, 4, 25,
+ 26, 10, 38, 38, 30, 36, 34, 30, 34, 24,
+ 14, 36, 30, 23, 63, 43, 49, 29, 37, 29,
+ 11, 25, 25, 21, 25, 13, 17, 11, 23, 61,
+ 43, 63, 15, 19, 1, 9, 6, 14, 2, 3,
+ 14, 12, 3, 11, 2, 39, 82, 80, 62, 66,
+ 80, 64, 34, 68, 58, 24, 38, 30, 10, 2,
+ 7, 28, 16, 20, 22, 14, 6, 22, 10, 2,
+ 10, 1, 5, 4, 51, 116, 120, 112, 120, 114,
+ 112, 112, 112, 108, 90, 88, 82, 62, 54, 16,
+ 82, 70, 34, 90, 70, 72, 62, 52, 46, 48,
+ 30, 10, 4, 2, 31, 31, 69, 9, 54, 36,
+ 24, 6, 10, 0, 13, 19, 37, 2, 82, 54,
+ 42, 34, 28, 6, 9, 15, 25, 33, 46, 26,
+ 14, 4, 4, 9, 19, 31, 61, 45, 3, 9,
+ 7, 13, 19, 39, 45, 71, 13, 70, 38, 24,
+ 8, 10, 11, 21, 27, 39, 124, 47, 29, 15,
+ 5, 13, 2, 10, 12, 13, 8, 18, 12, 29,
+ 15, 13, 20, 16, 26, 20, 42, 46, 38, 18,
+ 30, 34, 30, 15, 4, 17, 57, 45, 35, 35,
+ 27, 29, 25, 23, 11, 21, 19, 13, 7, 23,
+ 37, 57, 35, 65, 25, 11, 13, 1, 9, 3,
+ 8, 1, 3, 7, 2, 3, 9, 35, 86, 78,
+ 82, 76, 68, 80, 72, 66, 58, 56, 58, 48,
+ 12, 16, 2, 48, 46, 36, 14, 28, 22, 0,
+ 5, 11, 23, 29, 55, 27, 67, 124, 124, 118,
+ 118, 108, 100, 94, 82, 80, 72, 66, 52, 40,
+ 14, 13, 78, 78, 26, 82, 92, 70, 58, 62,
+ 54, 44, 52, 40, 32, 12, 12, 5, 39, 26,
+ 6, 19, 81, 59, 67, 21, 27, 23, 34, 5,
+ 5, 9, 14, 60, 34, 72, 0, 110, 104, 82,
+ 64, 38, 30, 23, 41, 71, 17, 82, 58, 40,
+ 18, 20, 4, 1, 9, 39, 57, 41, 23, 29,
+ 15, 0, 13, 17, 0, 6, 8, 4, 16, 20,
+ 6, 108, 104, 90, 76, 66, 54, 22, 0, 33,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 16 */
+
+ 116, 14, 27, 116, 14, 27, 71, 35, 32, 20,
+ 10, 14, 76, 84, 106, 28, 35, 2, 3, 18,
+ 0, 4, 11, 7, 37, 22, 68, 59, 85, 99,
+ 35, 41, 9, 3, 18, 0, 33, 29, 16, 18,
+ 5, 23, 43, 4, 21, 35, 53, 10, 19, 41,
+ 8, 19, 29, 49, 6, 11, 13, 23, 3, 8,
+ 44, 0, 0, 0, 21, 53, 67, 3, 18, 3,
+ 104, 14, 11, 93, 51, 37, 7, 45, 73, 65,
+ 79, 27, 73, 59, 67, 85, 79, 85, 81, 24,
+ 5, 11, 55, 7, 43, 33, 67, 19, 55, 41,
+ 87, 22, 27, 11, 79, 45, 25, 21, 5, 8,
+ 1, 17, 16, 10, 11, 4, 1, 3, 4, 25,
+ 24, 8, 36, 38, 28, 34, 34, 28, 30, 22,
+ 12, 32, 28, 25, 63, 43, 49, 29, 37, 29,
+ 9, 25, 25, 21, 23, 15, 17, 11, 27, 61,
+ 43, 63, 15, 21, 1, 9, 4, 14, 2, 3,
+ 10, 12, 5, 13, 2, 41, 78, 78, 60, 64,
+ 78, 62, 32, 66, 56, 20, 36, 28, 8, 2,
+ 9, 26, 14, 18, 20, 12, 4, 20, 8, 1,
+ 6, 3, 9, 0, 51, 112, 116, 108, 116, 110,
+ 106, 106, 106, 102, 84, 82, 76, 56, 48, 10,
+ 78, 66, 30, 84, 64, 66, 56, 46, 40, 42,
+ 24, 4, 2, 1, 35, 33, 71, 9, 54, 36,
+ 24, 6, 10, 1, 13, 19, 37, 2, 82, 54,
+ 42, 34, 30, 6, 9, 15, 25, 33, 46, 26,
+ 14, 4, 4, 9, 19, 31, 59, 45, 1, 9,
+ 7, 13, 19, 39, 45, 69, 13, 70, 38, 24,
+ 8, 10, 11, 21, 27, 39, 124, 47, 29, 15,
+ 5, 13, 2, 10, 12, 13, 8, 18, 12, 29,
+ 15, 15, 18, 16, 24, 18, 40, 44, 36, 14,
+ 26, 32, 28, 17, 0, 19, 57, 45, 33, 35,
+ 27, 29, 25, 23, 13, 21, 19, 13, 9, 23,
+ 37, 57, 35, 67, 25, 11, 13, 1, 11, 3,
+ 8, 3, 5, 7, 2, 5, 11, 37, 84, 76,
+ 80, 74, 64, 78, 70, 64, 56, 54, 56, 46,
+ 10, 12, 1, 44, 42, 32, 10, 24, 18, 1,
+ 7, 13, 23, 29, 55, 29, 67, 124, 122, 114,
+ 112, 102, 94, 88, 76, 74, 66, 60, 46, 34,
+ 10, 15, 74, 74, 22, 78, 86, 64, 52, 56,
+ 48, 40, 46, 34, 26, 8, 6, 9, 43, 22,
+ 2, 23, 79, 57, 65, 21, 27, 23, 36, 5,
+ 5, 9, 14, 62, 36, 74, 0, 106, 100, 76,
+ 56, 32, 24, 29, 47, 75, 17, 82, 56, 38,
+ 18, 20, 4, 3, 9, 39, 57, 41, 23, 29,
+ 13, 0, 13, 17, 0, 6, 8, 4, 16, 20,
+ 4, 104, 100, 86, 72, 62, 50, 16, 3, 35,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 17 */
+
+ 114, 14, 27, 114, 14, 27, 67, 31, 34, 22,
+ 10, 12, 74, 82, 106, 28, 29, 2, 1, 20,
+ 0, 4, 9, 9, 39, 22, 66, 61, 87, 99,
+ 29, 37, 9, 1, 20, 0, 31, 25, 18, 18,
+ 3, 21, 41, 4, 21, 33, 53, 10, 17, 39,
+ 8, 19, 27, 49, 8, 9, 11, 21, 1, 8,
+ 44, 0, 0, 0, 19, 53, 67, 1, 18, 3,
+ 104, 14, 11, 89, 49, 37, 5, 41, 69, 61,
+ 75, 23, 69, 55, 63, 81, 77, 83, 79, 26,
+ 3, 9, 53, 5, 41, 31, 63, 17, 53, 39,
+ 83, 24, 25, 9, 75, 43, 23, 19, 3, 10,
+ 0, 15, 18, 12, 11, 4, 0, 3, 4, 23,
+ 24, 8, 36, 38, 28, 34, 34, 28, 28, 22,
+ 12, 30, 26, 25, 61, 41, 47, 27, 35, 27,
+ 7, 23, 23, 19, 19, 15, 17, 9, 29, 59,
+ 41, 63, 13, 21, 0, 7, 4, 16, 2, 3,
+ 8, 14, 5, 15, 2, 41, 76, 78, 60, 64,
+ 78, 62, 32, 66, 56, 18, 36, 28, 8, 2,
+ 9, 26, 14, 18, 20, 12, 2, 20, 8, 3,
+ 4, 3, 11, 1, 51, 110, 114, 106, 114, 108,
+ 102, 102, 102, 98, 80, 78, 72, 52, 44, 6,
+ 76, 64, 28, 80, 60, 62, 52, 42, 36, 38,
+ 20, 0, 2, 3, 37, 33, 71, 7, 56, 38,
+ 26, 8, 10, 1, 11, 17, 35, 4, 84, 56,
+ 44, 34, 32, 8, 7, 13, 23, 31, 48, 28,
+ 16, 6, 6, 7, 17, 29, 55, 43, 2, 7,
+ 5, 11, 17, 37, 43, 65, 11, 72, 40, 26,
+ 10, 12, 9, 19, 25, 37, 124, 45, 27, 13,
+ 5, 11, 2, 12, 14, 11, 10, 20, 14, 27,
+ 15, 15, 18, 18, 24, 18, 38, 44, 36, 12,
+ 24, 32, 28, 17, 1, 19, 55, 43, 31, 33,
+ 25, 27, 23, 21, 13, 19, 17, 11, 9, 21,
+ 35, 55, 33, 67, 25, 9, 13, 0, 11, 3,
+ 10, 3, 5, 5, 4, 5, 11, 37, 84, 76,
+ 80, 74, 62, 78, 70, 64, 56, 54, 56, 46,
+ 10, 10, 3, 42, 40, 30, 8, 22, 16, 1,
+ 7, 13, 21, 27, 53, 29, 65, 120, 118, 110,
+ 108, 98, 90, 84, 72, 70, 62, 56, 42, 30,
+ 8, 15, 72, 72, 20, 76, 82, 60, 48, 52,
+ 44, 36, 42, 30, 22, 6, 2, 11, 45, 20,
+ 0, 25, 75, 53, 61, 19, 25, 21, 40, 3,
+ 3, 9, 16, 66, 40, 78, 2, 104, 98, 72,
+ 50, 28, 20, 33, 51, 77, 15, 84, 56, 38,
+ 18, 22, 4, 3, 9, 37, 55, 39, 21, 27,
+ 9, 2, 11, 15, 2, 8, 10, 6, 18, 22,
+ 4, 102, 98, 84, 70, 60, 48, 12, 5, 35,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 18 */
+
+ 112, 14, 27, 112, 14, 27, 63, 29, 34, 22,
+ 10, 10, 72, 80, 106, 28, 25, 2, 0, 22,
+ 0, 2, 9, 11, 41, 20, 62, 65, 89, 101,
+ 23, 35, 9, 0, 22, 0, 31, 23, 18, 16,
+ 3, 21, 39, 4, 21, 33, 53, 10, 17, 37,
+ 8, 19, 27, 49, 8, 9, 11, 21, 1, 8,
+ 44, 0, 0, 0, 17, 53, 67, 1, 16, 3,
+ 104, 14, 11, 87, 47, 37, 5, 39, 65, 57,
+ 73, 21, 65, 53, 59, 79, 75, 81, 77, 28,
+ 3, 7, 51, 5, 41, 29, 59, 17, 53, 39,
+ 81, 24, 25, 9, 73, 43, 21, 19, 3, 10,
+ 0, 13, 18, 12, 13, 4, 0, 3, 4, 23,
+ 24, 8, 34, 38, 26, 34, 34, 28, 24, 22,
+ 12, 26, 24, 27, 59, 39, 45, 25, 33, 25,
+ 5, 21, 23, 17, 17, 15, 17, 9, 31, 59,
+ 39, 63, 13, 21, 2, 7, 2, 18, 2, 3,
+ 6, 16, 7, 17, 2, 43, 72, 76, 58, 62,
+ 76, 60, 30, 64, 56, 16, 34, 28, 8, 2,
+ 9, 24, 12, 16, 18, 10, 0, 18, 6, 7,
+ 2, 5, 13, 3, 51, 106, 112, 102, 110, 104,
+ 98, 98, 98, 92, 76, 74, 66, 46, 40, 0,
+ 72, 60, 24, 76, 56, 58, 48, 38, 32, 32,
+ 16, 3, 0, 7, 39, 35, 71, 7, 56, 38,
+ 26, 8, 10, 1, 11, 17, 33, 4, 84, 56,
+ 44, 34, 34, 8, 5, 13, 21, 29, 50, 30,
+ 16, 6, 8, 7, 17, 27, 53, 41, 4, 5,
+ 5, 11, 15, 35, 41, 63, 11, 72, 40, 26,
+ 10, 14, 7, 17, 23, 35, 124, 45, 27, 13,
+ 5, 11, 2, 12, 14, 11, 10, 20, 16, 27,
+ 15, 15, 16, 18, 22, 16, 36, 42, 36, 10,
+ 22, 32, 28, 19, 3, 21, 53, 41, 29, 33,
+ 25, 27, 21, 19, 13, 19, 17, 11, 11, 21,
+ 35, 55, 31, 67, 25, 9, 13, 2, 11, 3,
+ 12, 3, 5, 5, 6, 7, 11, 39, 82, 76,
+ 80, 72, 60, 76, 68, 62, 54, 52, 54, 44,
+ 8, 8, 5, 38, 38, 26, 4, 20, 14, 3,
+ 9, 13, 21, 27, 53, 31, 63, 116, 114, 106,
+ 104, 92, 86, 80, 66, 66, 58, 52, 38, 26,
+ 6, 17, 68, 68, 18, 72, 78, 56, 44, 48,
+ 40, 32, 38, 26, 18, 2, 1, 15, 47, 16,
+ 3, 29, 73, 51, 59, 17, 23, 19, 44, 1,
+ 1, 9, 18, 68, 42, 80, 4, 102, 94, 66,
+ 44, 22, 14, 39, 55, 79, 15, 84, 56, 38,
+ 18, 22, 4, 3, 9, 35, 53, 37, 19, 25,
+ 7, 4, 9, 13, 4, 10, 12, 6, 20, 22,
+ 4, 100, 94, 80, 66, 56, 44, 8, 9, 37,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 19 */
+
+ 110, 14, 27, 110, 14, 27, 59, 25, 36, 22,
+ 8, 6, 68, 78, 106, 28, 21, 2, 2, 24,
+ 0, 0, 9, 13, 43, 20, 58, 67, 91, 101,
+ 17, 33, 9, 2, 24, 0, 29, 21, 20, 16,
+ 3, 19, 37, 2, 23, 33, 53, 10, 17, 37,
+ 8, 19, 27, 49, 10, 9, 11, 21, 0, 8,
+ 44, 0, 0, 0, 17, 53, 67, 0, 14, 3,
+ 104, 14, 11, 85, 45, 37, 3, 37, 61, 53,
+ 69, 19, 61, 51, 55, 77, 73, 79, 75, 30,
+ 3, 5, 49, 5, 39, 29, 55, 17, 51, 37,
+ 79, 24, 25, 7, 69, 41, 19, 19, 3, 10,
+ 0, 11, 20, 14, 13, 4, 0, 3, 4, 23,
+ 24, 6, 34, 38, 24, 34, 34, 28, 20, 22,
+ 10, 22, 22, 29, 57, 37, 43, 23, 31, 25,
+ 3, 21, 21, 15, 15, 15, 17, 9, 33, 57,
+ 37, 63, 13, 21, 4, 7, 0, 20, 2, 3,
+ 4, 18, 7, 19, 2, 45, 70, 74, 58, 60,
+ 74, 58, 30, 62, 54, 14, 32, 28, 8, 2,
+ 11, 22, 10, 14, 16, 8, 1, 16, 6, 11,
+ 0, 5, 15, 5, 51, 104, 108, 100, 106, 100,
+ 94, 94, 94, 88, 70, 70, 62, 42, 34, 3,
+ 68, 56, 22, 72, 52, 52, 42, 34, 26, 28,
+ 12, 9, 1, 11, 41, 37, 71, 7, 58, 38,
+ 26, 8, 10, 1, 11, 17, 33, 4, 84, 56,
+ 44, 34, 36, 10, 5, 11, 21, 27, 52, 30,
+ 18, 6, 10, 5, 15, 27, 51, 39, 6, 3,
+ 3, 11, 13, 33, 39, 61, 9, 72, 40, 26,
+ 10, 16, 7, 15, 21, 33, 124, 43, 25, 11,
+ 5, 11, 2, 12, 14, 11, 10, 20, 16, 27,
+ 15, 15, 16, 20, 20, 14, 34, 40, 36, 8,
+ 20, 32, 28, 19, 5, 21, 51, 39, 27, 31,
+ 23, 25, 21, 17, 13, 19, 17, 9, 11, 21,
+ 35, 53, 29, 67, 25, 9, 13, 4, 11, 3,
+ 14, 3, 5, 5, 8, 9, 11, 41, 80, 74,
+ 78, 70, 58, 74, 66, 60, 52, 50, 52, 42,
+ 6, 6, 7, 36, 36, 24, 0, 16, 12, 5,
+ 9, 15, 21, 27, 53, 31, 61, 112, 110, 102,
+ 100, 86, 82, 76, 62, 62, 54, 48, 32, 22,
+ 2, 19, 64, 66, 14, 68, 74, 52, 40, 44,
+ 36, 28, 34, 22, 12, 1, 5, 19, 49, 12,
+ 7, 31, 71, 49, 57, 15, 21, 17, 46, 0,
+ 0, 9, 20, 72, 46, 84, 6, 98, 92, 62,
+ 38, 16, 8, 45, 59, 81, 15, 84, 56, 38,
+ 18, 22, 4, 3, 9, 35, 51, 35, 17, 23,
+ 5, 4, 9, 11, 4, 10, 12, 6, 20, 22,
+ 4, 98, 92, 76, 62, 52, 40, 4, 13, 39,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 20 */
+
+ 106, 14, 27, 106, 14, 27, 57, 23, 36, 22,
+ 8, 4, 66, 74, 106, 28, 15, 0, 4, 24,
+ 0, 1, 9, 15, 45, 18, 54, 71, 93, 103,
+ 11, 31, 9, 4, 24, 0, 29, 19, 20, 14,
+ 3, 19, 37, 2, 23, 31, 53, 10, 17, 35,
+ 6, 21, 27, 49, 10, 7, 11, 19, 0, 8,
+ 44, 0, 0, 0, 15, 55, 67, 0, 12, 3,
+ 104, 14, 11, 83, 43, 37, 3, 33, 59, 49,
+ 67, 17, 59, 47, 51, 75, 73, 77, 75, 30,
+ 3, 3, 47, 5, 39, 27, 51, 17, 51, 37,
+ 77, 24, 23, 7, 67, 41, 19, 19, 1, 10,
+ 0, 11, 20, 14, 15, 2, 0, 3, 4, 23,
+ 24, 6, 32, 38, 24, 34, 34, 26, 16, 22,
+ 10, 18, 20, 31, 55, 35, 41, 21, 31, 23,
+ 1, 19, 21, 13, 13, 15, 17, 7, 35, 57,
+ 35, 63, 13, 21, 4, 7, 1, 22, 2, 3,
+ 0, 20, 9, 21, 2, 47, 66, 72, 56, 58,
+ 72, 56, 28, 60, 54, 10, 30, 26, 8, 2,
+ 11, 20, 10, 12, 16, 6, 3, 14, 4, 15,
+ 1, 7, 17, 7, 51, 100, 106, 96, 102, 96,
+ 90, 88, 90, 82, 66, 66, 56, 36, 30, 9,
+ 64, 54, 18, 66, 48, 48, 38, 28, 22, 22,
+ 8, 13, 3, 13, 45, 39, 71, 7, 58, 40,
+ 26, 8, 10, 1, 11, 17, 31, 6, 86, 56,
+ 44, 34, 38, 10, 3, 11, 19, 27, 54, 32,
+ 18, 6, 10, 5, 15, 25, 49, 39, 8, 3,
+ 3, 11, 11, 31, 39, 59, 9, 74, 40, 26,
+ 10, 18, 5, 15, 19, 31, 124, 43, 25, 11,
+ 5, 11, 2, 12, 14, 9, 10, 20, 18, 27,
+ 15, 15, 14, 20, 18, 12, 32, 38, 36, 4,
+ 18, 30, 28, 21, 7, 23, 51, 39, 25, 31,
+ 23, 25, 19, 15, 15, 19, 17, 9, 13, 21,
+ 35, 53, 27, 67, 25, 7, 13, 4, 11, 3,
+ 16, 3, 7, 5, 10, 9, 13, 43, 78, 74,
+ 78, 68, 56, 72, 64, 58, 50, 48, 50, 40,
+ 4, 4, 9, 32, 32, 20, 3, 14, 10, 7,
+ 11, 15, 21, 27, 53, 33, 61, 106, 104, 98,
+ 94, 80, 78, 72, 56, 58, 50, 44, 28, 18,
+ 0, 21, 60, 62, 12, 64, 70, 48, 36, 38,
+ 32, 24, 30, 18, 8, 5, 9, 23, 51, 8,
+ 11, 35, 69, 47, 55, 15, 19, 15, 50, 2,
+ 2, 9, 22, 74, 48, 86, 6, 96, 88, 56,
+ 32, 10, 2, 51, 63, 83, 15, 84, 56, 38,
+ 18, 22, 4, 3, 9, 33, 49, 33, 15, 23,
+ 3, 6, 7, 9, 6, 12, 14, 8, 22, 24,
+ 4, 94, 88, 72, 58, 48, 36, 0, 17, 41,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 21 */
+
+ 104, 14, 27, 104, 14, 27, 53, 19, 36, 22,
+ 6, 0, 62, 72, 106, 28, 11, 0, 6, 26,
+ 0, 1, 7, 17, 47, 18, 50, 75, 95, 103,
+ 5, 27, 9, 6, 26, 0, 29, 17, 20, 14,
+ 3, 19, 35, 2, 25, 31, 53, 10, 17, 33,
+ 6, 21, 27, 49, 12, 7, 9, 19, 2, 8,
+ 44, 0, 0, 0, 15, 55, 67, 2, 10, 3,
+ 104, 14, 11, 81, 41, 37, 1, 31, 55, 45,
+ 63, 15, 55, 45, 47, 73, 71, 75, 73, 32,
+ 3, 1, 45, 5, 37, 25, 47, 17, 51, 35,
+ 73, 26, 23, 5, 65, 39, 17, 19, 1, 12,
+ 2, 9, 22, 16, 15, 2, 0, 3, 4, 23,
+ 24, 4, 30, 38, 22, 34, 34, 26, 12, 22,
+ 8, 16, 18, 33, 53, 33, 39, 19, 29, 21,
+ 0, 17, 19, 11, 11, 15, 17, 7, 37, 55,
+ 33, 63, 11, 21, 6, 7, 3, 24, 2, 3,
+ 1, 22, 9, 23, 2, 47, 64, 70, 56, 56,
+ 70, 54, 26, 60, 54, 8, 28, 26, 8, 2,
+ 13, 20, 8, 10, 14, 4, 5, 14, 2, 19,
+ 3, 9, 19, 9, 51, 96, 104, 92, 98, 94,
+ 86, 84, 86, 78, 62, 62, 52, 32, 26, 13,
+ 60, 50, 14, 62, 44, 44, 34, 24, 18, 16,
+ 4, 19, 5, 17, 47, 39, 71, 7, 58, 40,
+ 26, 8, 10, 1, 11, 17, 31, 6, 86, 56,
+ 44, 34, 40, 12, 1, 11, 17, 25, 56, 34,
+ 18, 8, 12, 3, 13, 25, 47, 37, 12, 1,
+ 3, 9, 9, 29, 37, 57, 7, 74, 42, 26,
+ 10, 20, 3, 13, 17, 29, 124, 43, 25, 9,
+ 5, 11, 2, 12, 14, 9, 10, 22, 18, 27,
+ 15, 15, 12, 22, 16, 10, 30, 38, 36, 2,
+ 16, 30, 28, 23, 9, 23, 49, 37, 23, 29,
+ 21, 23, 19, 13, 15, 19, 17, 9, 15, 21,
+ 35, 51, 25, 67, 25, 7, 13, 6, 11, 3,
+ 18, 3, 7, 5, 12, 11, 13, 43, 76, 74,
+ 76, 68, 54, 70, 62, 56, 50, 46, 48, 38,
+ 2, 2, 11, 30, 30, 18, 7, 12, 8, 9,
+ 13, 17, 21, 25, 53, 33, 59, 102, 100, 94,
+ 90, 76, 74, 68, 52, 54, 46, 40, 24, 14,
+ 3, 23, 56, 58, 10, 60, 66, 44, 32, 34,
+ 28, 20, 26, 14, 2, 9, 13, 27, 53, 4,
+ 15, 37, 67, 43, 53, 13, 17, 13, 54, 4,
+ 4, 9, 24, 78, 52, 90, 8, 92, 84, 52,
+ 26, 6, 3, 55, 67, 85, 15, 84, 56, 38,
+ 18, 22, 4, 3, 9, 33, 47, 31, 13, 21,
+ 1, 8, 7, 7, 8, 14, 16, 8, 22, 24,
+ 4, 92, 86, 70, 56, 44, 32, 3, 21, 43,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 22 */
+
+ 102, 14, 29, 102, 14, 29, 49, 17, 38, 22,
+ 6, 1, 60, 70, 106, 28, 7, 0, 6, 28,
+ 0, 3, 7, 19, 49, 16, 48, 77, 97, 105,
+ 0, 25, 9, 6, 28, 0, 27, 15, 22, 12,
+ 1, 17, 33, 0, 25, 31, 53, 10, 15, 33,
+ 6, 21, 25, 49, 12, 7, 9, 19, 2, 8,
+ 44, 0, 0, 0, 13, 55, 67, 2, 10, 5,
+ 104, 14, 11, 79, 41, 37, 1, 29, 51, 41,
+ 61, 13, 51, 43, 43, 71, 69, 73, 71, 34,
+ 3, 0, 45, 3, 37, 25, 43, 17, 49, 35,
+ 71, 26, 23, 5, 61, 39, 15, 17, 1, 12,
+ 2, 7, 22, 16, 17, 2, 0, 3, 4, 23,
+ 24, 4, 30, 38, 20, 34, 34, 26, 8, 22,
+ 8, 12, 16, 35, 53, 31, 39, 19, 27, 21,
+ 2, 17, 19, 11, 7, 15, 17, 7, 39, 55,
+ 31, 63, 11, 21, 8, 5, 5, 24, 2, 3,
+ 3, 24, 11, 25, 2, 49, 60, 68, 54, 56,
+ 68, 54, 26, 58, 52, 6, 26, 26, 8, 2,
+ 13, 18, 6, 8, 12, 4, 7, 12, 2, 23,
+ 5, 9, 21, 11, 51, 94, 100, 90, 94, 90,
+ 82, 80, 82, 72, 56, 58, 46, 26, 20, 19,
+ 56, 46, 12, 58, 38, 38, 28, 20, 12, 12,
+ 1, 23, 7, 21, 49, 41, 71, 5, 60, 40,
+ 28, 8, 10, 1, 11, 17, 29, 6, 86, 58,
+ 44, 34, 42, 12, 1, 9, 17, 23, 58, 34,
+ 20, 8, 14, 3, 13, 23, 45, 35, 14, 0,
+ 1, 9, 9, 29, 35, 53, 7, 74, 42, 26,
+ 12, 20, 3, 11, 17, 27, 124, 41, 23, 9,
+ 5, 11, 2, 12, 14, 9, 12, 22, 20, 27,
+ 15, 17, 12, 22, 14, 8, 28, 36, 36, 0,
+ 14, 30, 28, 23, 11, 25, 47, 35, 21, 29,
+ 21, 23, 17, 13, 15, 17, 15, 7, 15, 21,
+ 35, 51, 23, 67, 25, 7, 13, 8, 11, 3,
+ 20, 5, 7, 5, 12, 13, 13, 45, 74, 72,
+ 76, 66, 52, 70, 62, 54, 48, 46, 46, 36,
+ 0, 0, 13, 26, 28, 14, 11, 8, 4, 11,
+ 13, 17, 21, 25, 51, 35, 57, 98, 96, 90,
+ 86, 70, 70, 64, 46, 50, 40, 36, 18, 10,
+ 5, 23, 54, 56, 6, 56, 60, 40, 28, 30,
+ 24, 16, 20, 10, 1, 13, 17, 29, 57, 0,
+ 19, 41, 65, 41, 51, 11, 15, 11, 56, 6,
+ 6, 9, 26, 80, 54, 92, 10, 90, 82, 46,
+ 20, 0, 7, 61, 71, 89, 13, 84, 56, 38,
+ 18, 22, 4, 3, 9, 31, 45, 31, 11, 19,
+ 0, 8, 5, 5, 8, 14, 16, 8, 24, 24,
+ 4, 90, 82, 66, 52, 42, 30, 9, 23, 45,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 23 */
+
+ 100, 14, 29, 100, 14, 29, 45, 13, 38, 22,
+ 4, 5, 56, 66, 106, 28, 1, 1, 8, 28,
+ 0, 5, 7, 21, 51, 16, 44, 81, 99, 105,
+ 6, 23, 9, 8, 28, 0, 27, 13, 22, 12,
+ 1, 17, 33, 0, 27, 29, 53, 10, 15, 31,
+ 4, 21, 25, 49, 14, 5, 9, 17, 4, 8,
+ 44, 0, 0, 0, 13, 55, 67, 4, 8, 5,
+ 104, 14, 11, 77, 39, 37, 0, 25, 49, 37,
+ 57, 11, 49, 39, 39, 69, 67, 71, 71, 36,
+ 3, 2, 43, 3, 35, 23, 39, 17, 49, 33,
+ 69, 26, 21, 3, 59, 37, 15, 17, 0, 12,
+ 2, 5, 24, 18, 17, 0, 0, 3, 4, 23,
+ 24, 2, 28, 38, 20, 34, 34, 24, 4, 22,
+ 6, 8, 14, 37, 51, 29, 37, 17, 25, 19,
+ 4, 15, 17, 9, 5, 15, 17, 5, 41, 53,
+ 29, 63, 11, 21, 8, 5, 7, 26, 2, 3,
+ 7, 26, 11, 27, 2, 51, 58, 66, 54, 54,
+ 66, 52, 24, 56, 52, 4, 24, 26, 8, 2,
+ 15, 16, 6, 6, 12, 2, 9, 10, 0, 27,
+ 7, 11, 23, 13, 51, 90, 98, 86, 90, 86,
+ 78, 74, 78, 68, 52, 54, 42, 22, 16, 23,
+ 52, 44, 8, 52, 34, 34, 24, 14, 8, 6,
+ 5, 29, 9, 23, 53, 43, 71, 5, 60, 42,
+ 28, 8, 10, 1, 11, 17, 29, 8, 88, 58,
+ 44, 34, 44, 14, 0, 9, 15, 21, 60, 36,
+ 20, 8, 16, 1, 11, 23, 43, 35, 16, 0,
+ 1, 9, 7, 27, 35, 51, 5, 76, 42, 26,
+ 12, 22, 1, 9, 15, 25, 124, 41, 23, 7,
+ 5, 11, 2, 12, 14, 7, 12, 22, 20, 27,
+ 15, 17, 10, 24, 12, 6, 26, 34, 36, 1,
+ 12, 28, 28, 25, 13, 25, 45, 35, 19, 27,
+ 19, 21, 17, 11, 15, 17, 15, 7, 17, 21,
+ 35, 49, 21, 67, 25, 5, 13, 8, 11, 3,
+ 22, 5, 7, 5, 14, 13, 15, 47, 72, 72,
+ 74, 64, 50, 68, 60, 52, 46, 44, 44, 34,
+ 1, 1, 15, 24, 24, 12, 15, 6, 2, 13,
+ 15, 19, 21, 25, 51, 35, 55, 94, 92, 86,
+ 80, 64, 66, 60, 42, 46, 36, 32, 14, 6,
+ 9, 25, 50, 52, 4, 52, 56, 36, 24, 26,
+ 20, 12, 16, 6, 7, 17, 21, 33, 59, 3,
+ 23, 43, 63, 39, 49, 11, 13, 9, 60, 8,
+ 8, 9, 28, 84, 58, 96, 10, 86, 78, 42,
+ 14, 5, 13, 67, 75, 91, 13, 84, 56, 38,
+ 18, 22, 4, 3, 9, 31, 43, 29, 9, 17,
+ 2, 10, 5, 3, 10, 16, 18, 10, 24, 26,
+ 4, 86, 80, 62, 48, 38, 26, 13, 27, 47,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 24 */
+
+ 96, 12, 29, 96, 12, 29, 43, 11, 38, 22,
+ 4, 7, 54, 64, 106, 28, 2, 1, 10, 30,
+ 0, 7, 7, 23, 55, 14, 40, 85, 101, 107,
+ 10, 21, 9, 10, 30, 0, 27, 11, 22, 10,
+ 1, 17, 31, 1, 27, 29, 55, 10, 15, 31,
+ 4, 23, 25, 49, 14, 5, 9, 17, 4, 8,
+ 44, 0, 0, 0, 11, 57, 67, 4, 6, 5,
+ 104, 14, 11, 75, 37, 37, 0, 23, 45, 33,
+ 55, 9, 45, 37, 37, 67, 67, 69, 69, 36,
+ 3, 4, 41, 3, 35, 23, 37, 17, 49, 33,
+ 67, 26, 21, 3, 57, 37, 13, 17, 0, 12,
+ 2, 5, 24, 18, 19, 0, 0, 3, 4, 23,
+ 24, 2, 26, 38, 18, 34, 34, 24, 0, 22,
+ 6, 4, 12, 39, 49, 27, 35, 15, 25, 19,
+ 6, 15, 17, 7, 3, 15, 17, 5, 45, 53,
+ 27, 63, 11, 23, 10, 5, 9, 28, 2, 3,
+ 9, 28, 13, 29, 2, 53, 54, 64, 52, 52,
+ 64, 50, 22, 54, 50, 0, 22, 24, 8, 2,
+ 15, 14, 4, 4, 10, 0, 11, 8, 1, 31,
+ 11, 13, 25, 15, 51, 86, 94, 82, 86, 82,
+ 74, 70, 74, 62, 46, 50, 36, 16, 10, 29,
+ 48, 40, 4, 48, 30, 28, 18, 10, 2, 0,
+ 9, 33, 11, 27, 55, 45, 73, 5, 60, 42,
+ 28, 8, 10, 1, 11, 17, 27, 8, 88, 58,
+ 44, 34, 46, 14, 0, 9, 15, 21, 62, 36,
+ 20, 8, 16, 1, 11, 21, 41, 33, 18, 2,
+ 1, 9, 5, 25, 33, 49, 5, 76, 42, 26,
+ 12, 24, 1, 9, 13, 25, 124, 41, 23, 7,
+ 5, 11, 2, 12, 14, 7, 12, 22, 22, 27,
+ 15, 17, 8, 24, 10, 4, 24, 32, 36, 5,
+ 8, 28, 28, 27, 15, 27, 45, 33, 17, 27,
+ 19, 21, 15, 9, 17, 17, 15, 7, 19, 21,
+ 35, 49, 21, 69, 25, 5, 13, 10, 11, 3,
+ 22, 5, 9, 5, 16, 15, 15, 49, 70, 70,
+ 74, 62, 46, 66, 58, 50, 44, 42, 42, 32,
+ 3, 3, 17, 20, 22, 8, 19, 2, 0, 15,
+ 17, 19, 21, 25, 51, 37, 55, 88, 86, 82,
+ 76, 58, 60, 54, 36, 42, 32, 28, 8, 2,
+ 11, 27, 46, 48, 0, 48, 52, 30, 18, 20,
+ 14, 8, 12, 0, 11, 21, 27, 37, 61, 7,
+ 27, 47, 61, 37, 47, 9, 11, 9, 62, 10,
+ 8, 9, 28, 86, 60, 98, 12, 84, 74, 36,
+ 8, 11, 19, 73, 79, 93, 13, 84, 56, 38,
+ 18, 22, 4, 3, 9, 29, 43, 27, 7, 17,
+ 4, 10, 3, 3, 10, 16, 18, 10, 26, 26,
+ 2, 84, 76, 58, 44, 34, 22, 17, 31, 49,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 25 */
+
+ 94, 12, 29, 94, 12, 29, 39, 9, 40, 22,
+ 4, 9, 52, 62, 106, 28, 6, 1, 12, 32,
+ 0, 7, 5, 25, 57, 12, 36, 87, 103, 109,
+ 16, 17, 9, 12, 32, 0, 25, 9, 24, 8,
+ 1, 15, 29, 1, 27, 29, 55, 10, 15, 29,
+ 4, 23, 25, 49, 14, 5, 7, 17, 4, 8,
+ 44, 0, 0, 0, 9, 57, 67, 4, 4, 5,
+ 104, 14, 11, 73, 35, 37, 0, 21, 41, 29,
+ 51, 5, 41, 35, 33, 65, 65, 67, 67, 38,
+ 1, 6, 39, 3, 33, 21, 33, 17, 47, 33,
+ 63, 28, 21, 3, 53, 37, 11, 17, 0, 14,
+ 4, 3, 24, 18, 19, 0, 2, 3, 4, 21,
+ 24, 2, 26, 38, 16, 34, 34, 24, 3, 22,
+ 6, 2, 10, 39, 47, 25, 33, 13, 23, 17,
+ 8, 13, 17, 5, 1, 15, 17, 5, 47, 51,
+ 25, 63, 9, 23, 12, 5, 11, 30, 2, 3,
+ 11, 30, 15, 31, 2, 53, 52, 62, 52, 50,
+ 62, 48, 22, 54, 50, 1, 20, 24, 8, 2,
+ 15, 14, 2, 2, 8, 1, 13, 8, 1, 33,
+ 13, 13, 27, 17, 51, 84, 92, 80, 84, 80,
+ 70, 66, 70, 58, 42, 46, 32, 10, 6, 35,
+ 46, 36, 2, 44, 26, 24, 14, 6, 1, 3,
+ 13, 37, 11, 31, 57, 45, 73, 5, 62, 42,
+ 28, 10, 10, 1, 9, 17, 25, 8, 88, 58,
+ 46, 34, 48, 16, 2, 7, 13, 19, 64, 38,
+ 22, 10, 18, 0, 11, 19, 39, 31, 22, 4,
+ 0, 7, 3, 23, 31, 47, 3, 76, 44, 28,
+ 12, 26, 0, 7, 11, 23, 124, 39, 21, 5,
+ 5, 11, 2, 14, 16, 7, 12, 24, 24, 27,
+ 15, 17, 8, 24, 8, 4, 22, 32, 36, 7,
+ 6, 28, 28, 27, 17, 29, 43, 31, 15, 27,
+ 17, 21, 13, 7, 17, 17, 15, 5, 19, 21,
+ 35, 49, 19, 69, 25, 5, 13, 12, 11, 3,
+ 24, 5, 9, 5, 18, 17, 15, 49, 68, 70,
+ 74, 62, 44, 64, 56, 48, 44, 40, 40, 32,
+ 3, 5, 19, 18, 20, 4, 23, 0, 1, 15,
+ 17, 19, 19, 23, 51, 39, 53, 84, 82, 78,
+ 72, 54, 56, 50, 32, 38, 28, 24, 4, 1,
+ 13, 29, 42, 46, 1, 44, 48, 26, 14, 16,
+ 10, 4, 8, 3, 15, 23, 31, 41, 63, 11,
+ 31, 51, 59, 33, 43, 7, 9, 7, 66, 12,
+ 10, 9, 30, 90, 62, 102, 14, 82, 72, 32,
+ 2, 15, 25, 77, 83, 95, 13, 84, 56, 38,
+ 18, 24, 4, 3, 9, 27, 41, 25, 5, 15,
+ 8, 12, 1, 1, 12, 18, 20, 10, 28, 26,
+ 2, 82, 74, 56, 42, 30, 18, 21, 35, 49,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 26 */
+
+ 92, 12, 29, 92, 12, 29, 35, 5, 40, 22,
+ 2, 13, 48, 58, 106, 28, 12, 3, 14, 32,
+ 0, 9, 5, 27, 59, 12, 32, 91, 105, 109,
+ 22, 15, 9, 14, 32, 0, 25, 7, 24, 8,
+ 1, 15, 29, 1, 29, 27, 55, 10, 15, 27,
+ 2, 23, 25, 49, 16, 3, 7, 15, 6, 8,
+ 44, 0, 0, 0, 9, 57, 67, 6, 2, 5,
+ 104, 14, 11, 71, 33, 37, 2, 17, 39, 25,
+ 49, 3, 39, 31, 29, 63, 63, 65, 67, 40,
+ 1, 8, 37, 3, 33, 19, 29, 17, 47, 31,
+ 61, 28, 19, 1, 51, 35, 11, 17, 2, 14,
+ 4, 1, 26, 20, 21, 1, 2, 3, 4, 21,
+ 24, 0, 24, 38, 16, 34, 34, 22, 7, 22,
+ 4, 1, 8, 41, 45, 23, 31, 11, 21, 15,
+ 10, 11, 15, 3, 0, 15, 17, 3, 49, 51,
+ 23, 63, 9, 23, 12, 5, 13, 32, 2, 3,
+ 15, 32, 15, 33, 2, 55, 48, 60, 50, 48,
+ 60, 46, 20, 52, 50, 3, 18, 24, 8, 2,
+ 17, 12, 2, 0, 8, 3, 15, 6, 3, 37,
+ 15, 15, 29, 19, 51, 80, 90, 76, 80, 76,
+ 66, 60, 66, 52, 38, 42, 26, 6, 2, 39,
+ 42, 34, 1, 38, 22, 20, 10, 0, 5, 9,
+ 17, 43, 13, 33, 61, 47, 73, 5, 62, 44,
+ 28, 10, 10, 1, 9, 17, 25, 10, 90, 58,
+ 46, 34, 50, 16, 4, 7, 11, 17, 66, 40,
+ 22, 10, 20, 0, 9, 19, 37, 31, 24, 4,
+ 0, 7, 1, 21, 31, 45, 3, 78, 44, 28,
+ 12, 28, 2, 5, 9, 21, 124, 39, 21, 5,
+ 5, 11, 2, 14, 16, 5, 12, 24, 24, 27,
+ 15, 17, 6, 26, 6, 2, 20, 30, 36, 9,
+ 4, 26, 28, 29, 19, 29, 41, 31, 13, 25,
+ 17, 19, 13, 5, 17, 17, 15, 5, 21, 21,
+ 35, 47, 17, 69, 25, 3, 13, 12, 11, 3,
+ 26, 5, 9, 5, 20, 17, 17, 51, 66, 70,
+ 72, 60, 42, 62, 54, 46, 42, 38, 38, 30,
+ 5, 7, 21, 14, 16, 2, 27, 1, 3, 17,
+ 19, 21, 19, 23, 51, 39, 51, 80, 78, 74,
+ 66, 48, 52, 46, 26, 34, 24, 20, 0, 5,
+ 17, 31, 38, 42, 3, 40, 44, 22, 10, 12,
+ 6, 0, 4, 7, 21, 27, 35, 45, 65, 15,
+ 35, 53, 57, 31, 41, 7, 7, 5, 70, 14,
+ 12, 9, 32, 92, 66, 104, 14, 78, 68, 26,
+ 3, 21, 31, 83, 87, 97, 13, 84, 56, 38,
+ 18, 24, 4, 3, 9, 27, 39, 23, 3, 13,
+ 10, 14, 1, 0, 14, 20, 22, 12, 28, 28,
+ 2, 78, 70, 52, 38, 26, 14, 25, 39, 51,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 27 */
+
+ 90, 12, 31, 90, 12, 31, 31, 3, 42, 22,
+ 2, 15, 46, 56, 106, 28, 16, 3, 14, 34,
+ 0, 11, 5, 29, 61, 10, 30, 93, 107, 111,
+ 28, 13, 9, 14, 34, 0, 23, 5, 26, 6,
+ 0, 13, 27, 3, 29, 27, 55, 10, 13, 27,
+ 2, 23, 23, 49, 16, 3, 7, 15, 6, 8,
+ 44, 0, 0, 0, 7, 57, 67, 6, 2, 7,
+ 104, 14, 11, 69, 33, 37, 2, 15, 35, 21,
+ 45, 1, 35, 29, 25, 61, 61, 63, 65, 42,
+ 1, 10, 37, 1, 31, 19, 25, 17, 45, 31,
+ 59, 28, 19, 1, 47, 35, 9, 15, 2, 14,
+ 4, 0, 26, 20, 21, 1, 2, 3, 4, 21,
+ 24, 0, 24, 38, 14, 34, 34, 22, 11, 22,
+ 4, 5, 6, 43, 45, 21, 31, 11, 19, 15,
+ 12, 11, 15, 3, 4, 15, 17, 3, 51, 49,
+ 21, 63, 9, 23, 14, 3, 15, 32, 2, 3,
+ 17, 34, 17, 35, 2, 57, 46, 58, 50, 48,
+ 58, 46, 20, 50, 48, 5, 16, 24, 8, 2,
+ 17, 10, 0, 1, 6, 3, 17, 4, 3, 41,
+ 17, 15, 31, 21, 51, 78, 86, 74, 76, 72,
+ 62, 56, 62, 48, 32, 38, 22, 0, 3, 45,
+ 38, 30, 3, 34, 16, 14, 4, 3, 11, 13,
+ 23, 47, 15, 37, 63, 49, 73, 3, 64, 44,
+ 30, 10, 10, 1, 9, 17, 23, 10, 90, 60,
+ 46, 34, 52, 18, 4, 5, 11, 15, 68, 40,
+ 24, 10, 22, 2, 9, 17, 35, 29, 26, 6,
+ 2, 7, 1, 21, 29, 41, 1, 78, 44, 28,
+ 14, 28, 2, 3, 9, 19, 124, 37, 19, 3,
+ 5, 11, 2, 14, 16, 5, 14, 24, 26, 27,
+ 15, 19, 6, 26, 4, 0, 18, 28, 36, 11,
+ 2, 26, 28, 29, 21, 31, 39, 29, 11, 25,
+ 15, 19, 11, 5, 17, 15, 13, 3, 21, 21,
+ 35, 47, 15, 69, 25, 3, 13, 14, 11, 3,
+ 28, 7, 9, 5, 20, 19, 17, 53, 64, 68,
+ 72, 58, 40, 62, 54, 44, 40, 38, 36, 28,
+ 7, 9, 23, 12, 14, 1, 31, 5, 7, 19,
+ 19, 21, 19, 23, 49, 41, 49, 76, 74, 70,
+ 62, 42, 48, 42, 22, 30, 18, 16, 5, 9,
+ 19, 31, 36, 40, 7, 36, 38, 18, 6, 8,
+ 2, 3, 1, 11, 25, 31, 39, 47, 69, 19,
+ 39, 57, 55, 29, 39, 5, 5, 3, 72, 16,
+ 14, 9, 34, 96, 68, 108, 16, 76, 66, 22,
+ 9, 27, 35, 89, 91, 101, 11, 84, 56, 38,
+ 18, 24, 4, 3, 9, 25, 37, 23, 1, 11,
+ 12, 14, 0, 2, 14, 20, 22, 12, 30, 28,
+ 2, 76, 68, 48, 34, 24, 12, 31, 41, 53,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 28 */
+
+ 86, 12, 31, 86, 12, 31, 29, 0, 42, 22,
+ 0, 19, 42, 54, 106, 28, 20, 3, 16, 36,
+ 0, 13, 5, 31, 63, 10, 26, 97, 109, 111,
+ 34, 11, 9, 16, 36, 0, 23, 3, 26, 6,
+ 0, 13, 25, 3, 31, 27, 55, 10, 13, 25,
+ 2, 25, 23, 49, 18, 3, 7, 15, 8, 8,
+ 44, 0, 0, 0, 7, 59, 67, 8, 0, 7,
+ 104, 14, 11, 67, 31, 37, 4, 13, 31, 17,
+ 43, 0, 31, 27, 21, 59, 61, 61, 63, 42,
+ 1, 12, 35, 1, 31, 17, 21, 17, 45, 29,
+ 57, 28, 19, 0, 45, 33, 7, 15, 2, 14,
+ 4, 0, 28, 22, 23, 1, 2, 3, 4, 21,
+ 24, 1, 22, 38, 12, 34, 34, 22, 15, 22,
+ 2, 9, 4, 45, 43, 19, 29, 9, 19, 13,
+ 14, 9, 13, 1, 6, 15, 17, 3, 53, 49,
+ 19, 63, 9, 23, 16, 3, 17, 34, 2, 3,
+ 19, 36, 17, 37, 2, 59, 42, 56, 48, 46,
+ 56, 44, 18, 48, 48, 9, 14, 22, 8, 2,
+ 19, 8, 1, 3, 4, 5, 19, 2, 5, 45,
+ 19, 17, 33, 23, 51, 74, 84, 70, 72, 68,
+ 58, 52, 58, 42, 28, 34, 16, 3, 7, 49,
+ 34, 26, 7, 30, 12, 10, 0, 7, 15, 19,
+ 27, 53, 17, 41, 65, 51, 73, 3, 64, 44,
+ 30, 10, 10, 1, 9, 17, 23, 10, 90, 60,
+ 46, 34, 54, 18, 6, 5, 9, 15, 70, 42,
+ 24, 10, 22, 2, 7, 17, 33, 27, 28, 8,
+ 2, 7, 0, 19, 27, 39, 1, 78, 44, 28,
+ 14, 30, 4, 3, 7, 17, 124, 37, 19, 3,
+ 5, 11, 2, 14, 16, 5, 14, 24, 26, 27,
+ 15, 19, 4, 28, 2, 1, 16, 26, 36, 15,
+ 0, 26, 28, 31, 23, 31, 39, 27, 9, 23,
+ 15, 17, 11, 3, 19, 15, 13, 3, 23, 21,
+ 35, 45, 13, 69, 25, 3, 13, 16, 11, 3,
+ 30, 7, 11, 5, 22, 21, 17, 55, 62, 68,
+ 70, 56, 38, 60, 52, 42, 38, 36, 34, 26,
+ 9, 11, 25, 8, 12, 3, 35, 7, 9, 21,
+ 21, 23, 19, 23, 49, 41, 49, 70, 68, 66,
+ 58, 36, 44, 38, 16, 26, 14, 12, 9, 13,
+ 23, 33, 32, 36, 9, 32, 34, 14, 2, 2,
+ 1, 7, 5, 15, 31, 35, 43, 51, 71, 23,
+ 43, 59, 53, 27, 37, 3, 3, 1, 76, 18,
+ 16, 9, 36, 98, 72, 110, 18, 72, 62, 16,
+ 15, 33, 41, 95, 95, 103, 11, 84, 56, 38,
+ 18, 24, 4, 3, 9, 25, 35, 21, 0, 11,
+ 14, 16, 0, 4, 16, 22, 24, 12, 30, 28,
+ 2, 74, 64, 44, 30, 20, 8, 35, 45, 55,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 29 */
+
+ 84, 12, 31, 84, 12, 31, 25, 2, 42, 22,
+ 0, 21, 40, 50, 106, 28, 26, 5, 18, 36,
+ 0, 13, 3, 33, 65, 8, 22, 101, 111, 113,
+ 40, 7, 9, 18, 36, 0, 23, 1, 26, 4,
+ 0, 13, 25, 3, 31, 25, 55, 10, 13, 23,
+ 0, 25, 23, 49, 18, 1, 5, 13, 8, 8,
+ 44, 0, 0, 0, 5, 59, 67, 8, 1, 7,
+ 104, 14, 11, 65, 29, 37, 4, 9, 29, 13,
+ 39, 2, 29, 23, 17, 57, 59, 59, 63, 44,
+ 1, 14, 33, 1, 29, 15, 17, 17, 45, 29,
+ 53, 30, 17, 0, 43, 33, 7, 15, 4, 16,
+ 6, 2, 28, 22, 23, 3, 2, 3, 4, 21,
+ 24, 1, 20, 38, 12, 34, 34, 20, 19, 22,
+ 2, 11, 2, 47, 41, 17, 27, 7, 17, 11,
+ 16, 7, 13, 0, 8, 15, 17, 1, 55, 47,
+ 17, 63, 7, 23, 16, 3, 19, 36, 2, 3,
+ 23, 38, 19, 39, 2, 59, 40, 54, 48, 44,
+ 54, 42, 16, 48, 48, 11, 12, 22, 8, 2,
+ 19, 8, 1, 5, 4, 7, 21, 2, 7, 49,
+ 21, 19, 35, 25, 51, 70, 82, 66, 68, 66,
+ 54, 46, 54, 38, 24, 30, 12, 9, 11, 55,
+ 30, 24, 11, 24, 8, 6, 3, 13, 19, 25,
+ 31, 57, 19, 43, 69, 51, 73, 3, 64, 46,
+ 30, 10, 10, 1, 9, 17, 21, 12, 92, 60,
+ 46, 34, 56, 20, 8, 5, 7, 13, 72, 44,
+ 24, 12, 24, 4, 7, 15, 31, 27, 32, 8,
+ 2, 5, 2, 17, 27, 37, 0, 80, 46, 28,
+ 14, 32, 6, 1, 5, 15, 124, 37, 19, 1,
+ 5, 11, 2, 14, 16, 3, 14, 26, 28, 27,
+ 15, 19, 2, 28, 0, 3, 14, 26, 36, 17,
+ 1, 24, 28, 33, 25, 33, 37, 27, 7, 23,
+ 13, 17, 9, 1, 19, 15, 13, 3, 25, 21,
+ 35, 45, 11, 69, 25, 1, 13, 16, 11, 3,
+ 32, 7, 11, 5, 24, 21, 19, 55, 60, 68,
+ 70, 56, 36, 58, 50, 40, 38, 34, 32, 24,
+ 11, 13, 27, 6, 8, 7, 39, 9, 11, 23,
+ 23, 23, 19, 21, 49, 43, 47, 66, 64, 62,
+ 52, 32, 40, 34, 12, 22, 10, 8, 13, 17,
+ 25, 35, 28, 32, 11, 28, 30, 10, 1, 1,
+ 5, 11, 9, 19, 35, 39, 47, 55, 73, 27,
+ 47, 63, 51, 23, 35, 3, 1, 0, 80, 20,
+ 18, 9, 38, 102, 74, 114, 18, 70, 58, 12,
+ 21, 37, 47, 99, 99, 105, 11, 84, 56, 38,
+ 18, 24, 4, 3, 9, 23, 33, 19, 2, 9,
+ 16, 18, 2, 6, 18, 24, 26, 14, 32, 30,
+ 2, 70, 62, 42, 28, 16, 4, 39, 49, 57,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 30 */
+
+ 82, 12, 31, 82, 12, 31, 21, 6, 44, 22,
+ 1, 25, 36, 48, 106, 28, 30, 5, 20, 38,
+ 0, 15, 3, 35, 67, 8, 18, 103, 113, 113,
+ 46, 5, 9, 20, 38, 0, 21, 0, 28, 4,
+ 0, 11, 23, 5, 33, 25, 55, 10, 13, 23,
+ 0, 25, 23, 49, 20, 1, 5, 13, 10, 8,
+ 44, 0, 0, 0, 5, 59, 67, 10, 3, 7,
+ 104, 14, 11, 63, 27, 37, 6, 7, 25, 9,
+ 37, 4, 25, 21, 13, 55, 57, 57, 61, 46,
+ 1, 16, 31, 1, 29, 15, 13, 17, 43, 27,
+ 51, 30, 17, 2, 39, 31, 5, 15, 4, 16,
+ 6, 4, 30, 24, 25, 3, 2, 3, 4, 21,
+ 24, 3, 20, 38, 10, 34, 34, 20, 23, 22,
+ 0, 15, 0, 49, 39, 15, 25, 5, 15, 11,
+ 18, 7, 11, 2, 10, 15, 17, 1, 57, 47,
+ 15, 63, 7, 23, 18, 3, 21, 38, 2, 3,
+ 25, 40, 19, 41, 2, 61, 36, 52, 46, 42,
+ 52, 40, 16, 46, 46, 13, 10, 22, 8, 2,
+ 21, 6, 3, 7, 2, 9, 23, 0, 7, 53,
+ 23, 19, 37, 27, 51, 68, 78, 64, 64, 62,
+ 50, 42, 50, 32, 18, 26, 6, 13, 17, 59,
+ 26, 20, 13, 20, 4, 0, 9, 17, 25, 29,
+ 35, 63, 21, 47, 71, 53, 73, 3, 66, 46,
+ 30, 10, 10, 1, 9, 17, 21, 12, 92, 60,
+ 46, 34, 58, 20, 8, 3, 7, 11, 74, 44,
+ 26, 12, 26, 4, 5, 15, 29, 25, 34, 10,
+ 4, 5, 4, 15, 25, 35, 0, 80, 46, 28,
+ 14, 34, 6, 0, 3, 13, 124, 35, 17, 1,
+ 5, 11, 2, 14, 16, 3, 14, 26, 28, 27,
+ 15, 19, 2, 30, 1, 5, 12, 24, 36, 19,
+ 3, 24, 28, 33, 27, 33, 35, 25, 5, 21,
+ 13, 15, 9, 0, 19, 15, 13, 1, 25, 21,
+ 35, 43, 9, 69, 25, 1, 13, 18, 11, 3,
+ 34, 7, 11, 5, 26, 23, 19, 57, 58, 66,
+ 68, 54, 34, 56, 48, 38, 36, 32, 30, 22,
+ 13, 15, 29, 2, 6, 9, 43, 13, 13, 25,
+ 23, 25, 19, 21, 49, 43, 45, 62, 60, 58,
+ 48, 26, 36, 30, 6, 18, 6, 4, 19, 21,
+ 29, 37, 24, 30, 15, 24, 26, 6, 5, 5,
+ 9, 15, 13, 23, 41, 43, 51, 59, 75, 31,
+ 51, 65, 49, 21, 33, 1, 0, 2, 82, 22,
+ 20, 9, 40, 104, 78, 116, 20, 66, 56, 6,
+ 27, 43, 53, 105, 103, 107, 11, 84, 56, 38,
+ 18, 24, 4, 3, 9, 23, 31, 17, 4, 7,
+ 18, 18, 2, 8, 18, 24, 26, 14, 32, 30,
+ 2, 68, 58, 38, 24, 12, 0, 43, 53, 59,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 31 */
+
+ 80, 12, 31, 80, 12, 31, 17, 8, 44, 22,
+ 1, 27, 34, 46, 106, 28, 34, 5, 22, 40,
+ 0, 17, 3, 37, 69, 6, 14, 107, 115, 115,
+ 52, 3, 9, 22, 40, 0, 21, 2, 28, 2,
+ 0, 11, 21, 5, 33, 25, 55, 10, 13, 21,
+ 0, 25, 23, 49, 20, 1, 5, 13, 10, 8,
+ 44, 0, 0, 0, 3, 59, 67, 10, 5, 7,
+ 104, 14, 11, 61, 25, 37, 6, 5, 21, 5,
+ 33, 6, 21, 19, 9, 53, 55, 55, 59, 48,
+ 1, 18, 29, 1, 27, 13, 9, 17, 43, 27,
+ 49, 30, 17, 2, 37, 31, 3, 15, 4, 16,
+ 6, 6, 30, 24, 25, 3, 2, 3, 4, 21,
+ 24, 3, 18, 38, 8, 34, 34, 20, 27, 22,
+ 0, 19, 1, 51, 37, 13, 23, 3, 13, 9,
+ 20, 5, 11, 4, 12, 15, 17, 1, 59, 45,
+ 13, 63, 7, 23, 20, 3, 23, 40, 2, 3,
+ 27, 42, 21, 43, 2, 63, 34, 50, 46, 40,
+ 50, 38, 14, 44, 46, 15, 8, 22, 8, 2,
+ 21, 4, 5, 9, 0, 11, 25, 1, 9, 57,
+ 25, 21, 39, 29, 51, 64, 76, 60, 60, 58,
+ 46, 38, 46, 28, 14, 22, 2, 19, 21, 65,
+ 22, 16, 17, 16, 0, 3, 13, 21, 29, 35,
+ 39, 67, 23, 51, 73, 55, 73, 3, 66, 46,
+ 30, 10, 10, 1, 9, 17, 19, 12, 92, 60,
+ 46, 34, 60, 22, 10, 3, 5, 9, 76, 46,
+ 26, 12, 28, 6, 5, 13, 27, 23, 36, 12,
+ 4, 5, 6, 13, 23, 33, 2, 80, 46, 28,
+ 14, 36, 8, 2, 1, 11, 124, 35, 17, 0,
+ 5, 11, 2, 14, 16, 3, 14, 26, 30, 27,
+ 15, 19, 0, 30, 3, 7, 10, 22, 36, 21,
+ 5, 24, 28, 35, 29, 35, 33, 23, 3, 21,
+ 11, 15, 7, 2, 19, 15, 13, 1, 27, 21,
+ 35, 43, 7, 69, 25, 1, 13, 20, 11, 3,
+ 36, 7, 11, 5, 28, 25, 19, 59, 56, 66,
+ 68, 52, 32, 54, 46, 36, 34, 30, 28, 20,
+ 15, 17, 31, 0, 4, 13, 47, 15, 15, 27,
+ 25, 25, 19, 21, 49, 45, 43, 58, 56, 54,
+ 44, 20, 32, 26, 2, 14, 2, 0, 23, 25,
+ 31, 39, 20, 26, 17, 20, 22, 2, 9, 9,
+ 13, 19, 17, 27, 45, 47, 55, 63, 77, 35,
+ 55, 69, 47, 19, 31, 0, 2, 4, 86, 24,
+ 22, 9, 42, 108, 80, 120, 22, 64, 52, 2,
+ 33, 49, 59, 111, 107, 109, 11, 84, 56, 38,
+ 18, 24, 4, 3, 9, 21, 29, 15, 6, 5,
+ 20, 20, 4, 10, 20, 26, 28, 14, 34, 30,
+ 2, 66, 56, 34, 20, 8, 3, 47, 57, 61,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 32 */
+
+ 76, 10, 33, 76, 10, 33, 15, 10, 44, 22,
+ 3, 31, 30, 42, 104, 28, 38, 7, 22, 40,
+ 1, 19, 3, 41, 73, 4, 10, 111, 117, 117,
+ 56, 1, 11, 22, 40, 1, 21, 4, 28, 0,
+ 0, 11, 21, 7, 35, 25, 57, 10, 13, 21,
+ 1, 27, 23, 49, 20, 1, 5, 13, 10, 6,
+ 44, 0, 0, 0, 3, 61, 67, 10, 7, 9,
+ 104, 12, 11, 59, 25, 37, 6, 3, 19, 3,
+ 31, 8, 19, 17, 7, 51, 55, 53, 59, 48,
+ 1, 18, 29, 1, 27, 13, 7, 17, 43, 27,
+ 47, 30, 17, 2, 35, 31, 3, 15, 4, 16,
+ 6, 6, 30, 24, 27, 5, 2, 5, 4, 21,
+ 22, 5, 16, 38, 6, 32, 34, 18, 31, 20,
+ 1, 23, 3, 53, 37, 13, 23, 3, 13, 9,
+ 22, 5, 11, 4, 14, 17, 17, 1, 63, 45,
+ 13, 63, 7, 25, 20, 3, 25, 40, 2, 3,
+ 31, 42, 23, 45, 2, 65, 30, 48, 44, 38,
+ 48, 36, 12, 42, 44, 19, 6, 20, 6, 2,
+ 23, 2, 7, 11, 1, 13, 27, 3, 11, 61,
+ 29, 23, 43, 33, 51, 60, 72, 56, 56, 54,
+ 40, 32, 40, 22, 8, 16, 3, 25, 27, 71,
+ 18, 12, 21, 10, 5, 9, 19, 27, 35, 41,
+ 45, 73, 25, 55, 77, 57, 75, 3, 66, 46,
+ 30, 10, 10, 3, 9, 17, 19, 12, 92, 60,
+ 46, 34, 62, 22, 10, 3, 5, 9, 76, 46,
+ 26, 12, 28, 6, 5, 13, 25, 23, 38, 12,
+ 4, 5, 6, 13, 23, 31, 2, 80, 46, 28,
+ 14, 36, 8, 2, 1, 11, 124, 35, 17, 0,
+ 5, 11, 2, 14, 16, 3, 14, 26, 30, 27,
+ 15, 21, 1, 30, 5, 9, 8, 20, 34, 25,
+ 9, 22, 26, 37, 33, 37, 33, 23, 1, 21,
+ 11, 15, 7, 2, 21, 15, 13, 1, 29, 21,
+ 35, 43, 7, 71, 25, 1, 13, 20, 13, 3,
+ 36, 9, 13, 5, 28, 27, 21, 61, 54, 64,
+ 66, 50, 28, 52, 44, 34, 32, 28, 26, 18,
+ 17, 21, 35, 3, 0, 17, 51, 19, 19, 29,
+ 27, 27, 19, 21, 49, 47, 43, 52, 50, 50,
+ 38, 14, 26, 20, 3, 8, 3, 5, 29, 31,
+ 35, 41, 16, 22, 21, 16, 16, 3, 15, 15,
+ 19, 23, 23, 33, 51, 51, 61, 67, 81, 39,
+ 59, 73, 45, 17, 29, 0, 2, 4, 88, 24,
+ 22, 9, 42, 110, 82, 122, 22, 60, 48, 3,
+ 41, 55, 65, 117, 113, 113, 11, 84, 54, 36,
+ 18, 24, 4, 5, 9, 21, 29, 15, 6, 5,
+ 22, 20, 4, 10, 20, 26, 28, 14, 34, 30,
+ 0, 62, 52, 30, 16, 4, 7, 53, 61, 63,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 33 */
+
+ 74, 10, 33, 74, 10, 33, 11, 14, 46, 24,
+ 3, 33, 28, 40, 104, 28, 44, 7, 24, 42,
+ 1, 19, 1, 43, 75, 4, 8, 113, 119, 117,
+ 62, 2, 11, 24, 42, 1, 19, 8, 30, 0,
+ 2, 9, 19, 7, 35, 23, 57, 10, 11, 19,
+ 1, 27, 21, 49, 22, 0, 3, 11, 12, 6,
+ 44, 0, 0, 0, 1, 61, 67, 12, 7, 9,
+ 104, 12, 11, 55, 23, 37, 8, 0, 15, 0,
+ 27, 12, 15, 13, 3, 47, 53, 51, 57, 50,
+ 0, 20, 27, 0, 25, 11, 3, 15, 41, 25,
+ 43, 32, 15, 4, 31, 29, 1, 13, 6, 18,
+ 8, 8, 32, 26, 27, 5, 4, 5, 4, 19,
+ 22, 5, 16, 38, 6, 32, 34, 18, 33, 20,
+ 1, 25, 5, 53, 35, 11, 21, 1, 11, 7,
+ 24, 3, 9, 6, 18, 17, 17, 0, 65, 43,
+ 11, 63, 5, 25, 22, 1, 25, 42, 2, 3,
+ 33, 44, 23, 47, 2, 65, 28, 48, 44, 38,
+ 48, 36, 12, 42, 44, 21, 6, 20, 6, 2,
+ 23, 2, 7, 11, 1, 13, 29, 3, 11, 63,
+ 31, 23, 45, 35, 51, 58, 70, 54, 54, 52,
+ 36, 28, 36, 18, 4, 12, 7, 29, 31, 75,
+ 16, 10, 23, 6, 9, 13, 23, 31, 39, 45,
+ 49, 77, 25, 57, 79, 57, 75, 1, 68, 48,
+ 32, 12, 10, 3, 7, 15, 17, 14, 94, 62,
+ 48, 34, 64, 24, 12, 1, 3, 7, 78, 48,
+ 28, 14, 30, 8, 3, 11, 21, 21, 42, 14,
+ 6, 3, 8, 11, 21, 27, 4, 82, 48, 30,
+ 16, 38, 10, 4, 0, 9, 124, 33, 15, 2,
+ 5, 9, 2, 16, 18, 1, 16, 28, 32, 25,
+ 15, 21, 1, 32, 5, 9, 6, 20, 34, 27,
+ 11, 22, 26, 37, 35, 37, 31, 21, 0, 19,
+ 9, 13, 5, 4, 21, 13, 11, 0, 29, 19,
+ 33, 41, 5, 71, 25, 0, 13, 22, 13, 3,
+ 38, 9, 13, 3, 30, 27, 21, 61, 54, 64,
+ 66, 50, 26, 52, 44, 34, 32, 28, 26, 18,
+ 17, 23, 37, 5, 1, 19, 53, 21, 21, 29,
+ 27, 27, 17, 19, 47, 47, 41, 48, 46, 46,
+ 34, 10, 22, 16, 7, 4, 7, 9, 33, 35,
+ 37, 41, 14, 20, 23, 14, 12, 7, 19, 19,
+ 23, 27, 27, 37, 55, 53, 65, 69, 83, 41,
+ 61, 75, 41, 13, 25, 2, 4, 6, 92, 26,
+ 24, 9, 44, 114, 86, 124, 24, 58, 46, 7,
+ 47, 59, 69, 121, 117, 115, 9, 86, 54, 36,
+ 18, 26, 4, 5, 9, 19, 27, 13, 8, 3,
+ 26, 22, 6, 12, 22, 28, 30, 16, 36, 32,
+ 0, 60, 50, 28, 14, 2, 9, 57, 63, 63,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 34 */
+
+ 72, 10, 33, 72, 10, 33, 7, 16, 46, 24,
+ 3, 35, 26, 38, 104, 28, 48, 7, 26, 44,
+ 1, 21, 1, 45, 77, 2, 4, 117, 121, 119,
+ 68, 4, 11, 26, 44, 1, 19, 10, 30, 1,
+ 2, 9, 17, 7, 35, 23, 57, 10, 11, 17,
+ 1, 27, 21, 49, 22, 0, 3, 11, 12, 6,
+ 44, 0, 0, 0, 0, 61, 67, 12, 9, 9,
+ 104, 12, 11, 53, 21, 37, 8, 2, 11, 4,
+ 25, 14, 11, 11, 0, 45, 51, 49, 55, 52,
+ 0, 22, 25, 0, 25, 9, 0, 15, 41, 25,
+ 41, 32, 15, 4, 29, 29, 0, 13, 6, 18,
+ 8, 10, 32, 26, 29, 5, 4, 5, 4, 19,
+ 22, 5, 14, 38, 4, 32, 34, 18, 37, 20,
+ 1, 29, 7, 55, 33, 9, 19, 0, 9, 5,
+ 26, 1, 9, 8, 20, 17, 17, 0, 67, 43,
+ 9, 63, 5, 25, 24, 1, 27, 44, 2, 3,
+ 35, 46, 25, 49, 2, 67, 24, 46, 42, 36,
+ 46, 34, 10, 40, 44, 23, 4, 20, 6, 2,
+ 23, 0, 9, 13, 3, 15, 31, 5, 13, 67,
+ 33, 25, 47, 37, 51, 54, 68, 50, 50, 48,
+ 32, 24, 32, 12, 0, 8, 13, 35, 35, 81,
+ 12, 6, 27, 2, 13, 17, 27, 35, 43, 51,
+ 53, 81, 27, 61, 81, 59, 75, 1, 68, 48,
+ 32, 12, 10, 3, 7, 15, 15, 14, 94, 62,
+ 48, 34, 66, 24, 14, 1, 1, 5, 80, 50,
+ 28, 14, 32, 8, 3, 9, 19, 19, 44, 16,
+ 6, 3, 10, 9, 19, 25, 4, 82, 48, 30,
+ 16, 40, 12, 6, 2, 7, 124, 33, 15, 2,
+ 5, 9, 2, 16, 18, 1, 16, 28, 34, 25,
+ 15, 21, 3, 32, 7, 11, 4, 18, 34, 29,
+ 13, 22, 26, 39, 37, 39, 29, 19, 2, 19,
+ 9, 13, 3, 6, 21, 13, 11, 0, 31, 19,
+ 33, 41, 3, 71, 25, 0, 13, 24, 13, 3,
+ 40, 9, 13, 3, 32, 29, 21, 63, 52, 64,
+ 66, 48, 24, 50, 42, 32, 30, 26, 24, 16,
+ 19, 25, 39, 9, 3, 23, 57, 23, 23, 31,
+ 29, 27, 17, 19, 47, 49, 39, 44, 42, 42,
+ 30, 4, 18, 12, 13, 0, 11, 13, 37, 39,
+ 39, 43, 10, 16, 25, 10, 8, 11, 23, 23,
+ 27, 31, 31, 41, 59, 57, 69, 73, 85, 45,
+ 65, 79, 39, 11, 23, 4, 6, 8, 96, 28,
+ 26, 9, 46, 116, 88, 124, 26, 56, 42, 13,
+ 53, 65, 75, 125, 121, 117, 9, 86, 54, 36,
+ 18, 26, 4, 5, 9, 17, 25, 11, 10, 1,
+ 28, 24, 8, 14, 24, 30, 32, 16, 38, 32,
+ 0, 58, 46, 24, 10, 1, 13, 61, 67, 65,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 35 */
+
+ 70, 10, 33, 70, 10, 33, 3, 20, 48, 24,
+ 5, 39, 22, 36, 104, 28, 52, 7, 28, 46,
+ 1, 23, 1, 47, 79, 2, 0, 119, 123, 119,
+ 74, 6, 11, 28, 46, 1, 17, 12, 32, 1,
+ 2, 7, 15, 9, 37, 23, 57, 10, 11, 17,
+ 1, 27, 21, 49, 24, 0, 3, 11, 14, 6,
+ 44, 0, 0, 0, 0, 61, 67, 14, 11, 9,
+ 104, 12, 11, 51, 19, 37, 10, 4, 7, 8,
+ 21, 16, 7, 9, 4, 43, 49, 47, 53, 54,
+ 0, 24, 23, 0, 23, 9, 4, 15, 39, 23,
+ 39, 32, 15, 6, 25, 27, 2, 13, 6, 18,
+ 8, 12, 34, 28, 29, 5, 4, 5, 4, 19,
+ 22, 7, 14, 38, 2, 32, 34, 18, 41, 20,
+ 3, 33, 9, 57, 31, 7, 17, 2, 7, 5,
+ 28, 1, 7, 10, 22, 17, 17, 0, 69, 41,
+ 7, 63, 5, 25, 26, 1, 29, 46, 2, 3,
+ 37, 48, 25, 51, 2, 69, 22, 44, 42, 34,
+ 44, 32, 10, 38, 42, 25, 2, 20, 6, 2,
+ 25, 1, 11, 15, 5, 17, 33, 7, 13, 71,
+ 35, 25, 49, 39, 51, 52, 64, 48, 46, 44,
+ 28, 20, 28, 8, 5, 4, 17, 39, 41, 85,
+ 8, 2, 29, 1, 17, 23, 33, 39, 49, 55,
+ 57, 87, 29, 65, 83, 61, 75, 1, 70, 48,
+ 32, 12, 10, 3, 7, 15, 15, 14, 94, 62,
+ 48, 34, 68, 26, 14, 0, 1, 3, 82, 50,
+ 30, 14, 34, 10, 1, 9, 17, 17, 46, 18,
+ 8, 3, 12, 7, 17, 23, 6, 82, 48, 30,
+ 16, 42, 12, 8, 4, 5, 124, 31, 13, 4,
+ 5, 9, 2, 16, 18, 1, 16, 28, 34, 25,
+ 15, 21, 3, 34, 9, 13, 2, 16, 34, 31,
+ 15, 22, 26, 39, 39, 39, 27, 17, 4, 17,
+ 7, 11, 3, 8, 21, 13, 11, 2, 31, 19,
+ 33, 39, 1, 71, 25, 0, 13, 26, 13, 3,
+ 42, 9, 13, 3, 34, 31, 21, 65, 50, 62,
+ 64, 46, 22, 48, 40, 30, 28, 24, 22, 14,
+ 21, 27, 41, 11, 5, 25, 61, 27, 25, 33,
+ 29, 29, 17, 19, 47, 49, 37, 40, 38, 38,
+ 26, 1, 14, 8, 17, 3, 15, 17, 43, 43,
+ 43, 45, 6, 14, 29, 6, 4, 15, 27, 27,
+ 31, 35, 35, 45, 65, 61, 73, 77, 87, 49,
+ 69, 81, 37, 9, 21, 6, 8, 10, 98, 30,
+ 28, 9, 48, 120, 92, 124, 28, 52, 40, 17,
+ 59, 71, 81, 125, 125, 119, 9, 86, 54, 36,
+ 18, 26, 4, 5, 9, 17, 23, 9, 12, 0,
+ 30, 24, 8, 16, 24, 30, 32, 16, 38, 32,
+ 0, 56, 44, 20, 6, 5, 17, 65, 71, 67,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 36 */
+
+ 66, 10, 33, 66, 10, 33, 1, 22, 48, 24,
+ 5, 41, 20, 32, 104, 28, 58, 9, 30, 46,
+ 1, 25, 1, 49, 81, 0, 3, 123, 125, 121,
+ 80, 8, 11, 30, 46, 1, 17, 14, 32, 3,
+ 2, 7, 15, 9, 37, 21, 57, 10, 11, 15,
+ 3, 29, 21, 49, 24, 2, 3, 9, 14, 6,
+ 44, 0, 0, 0, 2, 63, 67, 14, 13, 9,
+ 104, 12, 11, 49, 17, 37, 10, 8, 5, 12,
+ 19, 18, 5, 5, 8, 41, 49, 45, 53, 54,
+ 0, 26, 21, 0, 23, 7, 8, 15, 39, 23,
+ 37, 32, 13, 6, 23, 27, 2, 13, 8, 18,
+ 8, 12, 34, 28, 31, 7, 4, 5, 4, 19,
+ 22, 7, 12, 38, 2, 32, 34, 16, 45, 20,
+ 3, 37, 11, 59, 29, 5, 15, 4, 7, 3,
+ 30, 0, 7, 12, 24, 17, 17, 2, 71, 41,
+ 5, 63, 5, 25, 26, 1, 31, 48, 2, 3,
+ 41, 50, 27, 53, 2, 71, 18, 42, 40, 32,
+ 42, 30, 8, 36, 42, 29, 0, 18, 6, 2,
+ 25, 3, 11, 17, 5, 19, 35, 9, 15, 75,
+ 37, 27, 51, 41, 51, 48, 62, 44, 42, 40,
+ 24, 14, 24, 2, 9, 0, 23, 45, 45, 91,
+ 4, 0, 33, 7, 21, 27, 37, 45, 53, 61,
+ 61, 91, 31, 67, 87, 63, 75, 1, 70, 50,
+ 32, 12, 10, 3, 7, 15, 13, 16, 96, 62,
+ 48, 34, 70, 26, 16, 0, 0, 3, 84, 52,
+ 30, 14, 34, 10, 1, 7, 15, 17, 48, 18,
+ 8, 3, 14, 5, 17, 21, 6, 84, 48, 30,
+ 16, 44, 14, 8, 6, 3, 124, 31, 13, 4,
+ 5, 9, 2, 16, 18, 0, 16, 28, 36, 25,
+ 15, 21, 5, 34, 11, 15, 0, 14, 34, 35,
+ 17, 20, 26, 41, 41, 41, 27, 17, 6, 17,
+ 7, 11, 1, 10, 23, 13, 11, 2, 33, 19,
+ 33, 39, 0, 71, 25, 2, 13, 26, 13, 3,
+ 44, 9, 15, 3, 36, 31, 23, 67, 48, 62,
+ 64, 44, 20, 46, 38, 28, 26, 22, 20, 12,
+ 23, 29, 43, 15, 9, 29, 65, 29, 27, 35,
+ 31, 29, 17, 19, 47, 51, 37, 34, 32, 34,
+ 20, 7, 10, 4, 23, 7, 19, 21, 47, 47,
+ 45, 47, 2, 10, 31, 2, 0, 19, 31, 33,
+ 35, 39, 39, 49, 69, 65, 77, 81, 89, 53,
+ 73, 85, 35, 7, 19, 6, 10, 12, 102, 32,
+ 30, 9, 50, 122, 94, 124, 28, 50, 36, 23,
+ 65, 77, 87, 125, 125, 121, 9, 86, 54, 36,
+ 18, 26, 4, 5, 9, 15, 21, 7, 14, 0,
+ 32, 26, 10, 18, 26, 32, 34, 18, 40, 34,
+ 0, 52, 40, 16, 2, 9, 21, 69, 75, 69,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 37 */
+
+ 64, 10, 33, 64, 10, 33, 2, 26, 48, 24,
+ 7, 45, 16, 30, 104, 28, 62, 9, 32, 48,
+ 1, 25, 0, 51, 83, 0, 7, 125, 125, 121,
+ 86, 12, 11, 32, 48, 1, 17, 16, 32, 3,
+ 2, 7, 13, 9, 39, 21, 57, 10, 11, 13,
+ 3, 29, 21, 49, 26, 2, 1, 9, 16, 6,
+ 44, 0, 0, 0, 2, 63, 67, 16, 15, 9,
+ 104, 12, 11, 47, 15, 37, 12, 10, 1, 16,
+ 15, 20, 1, 3, 12, 39, 47, 43, 51, 56,
+ 0, 28, 19, 0, 21, 5, 12, 15, 39, 21,
+ 33, 34, 13, 8, 21, 25, 4, 13, 8, 20,
+ 10, 14, 36, 30, 31, 7, 4, 5, 4, 19,
+ 22, 9, 10, 38, 0, 32, 34, 16, 49, 20,
+ 5, 39, 13, 61, 27, 3, 13, 6, 5, 1,
+ 32, 2, 5, 14, 26, 17, 17, 2, 73, 39,
+ 3, 63, 3, 25, 28, 1, 33, 50, 2, 3,
+ 43, 52, 27, 55, 2, 71, 16, 40, 40, 30,
+ 40, 28, 6, 36, 42, 31, 1, 18, 6, 2,
+ 27, 3, 13, 19, 7, 21, 37, 9, 17, 79,
+ 39, 29, 53, 43, 51, 44, 60, 40, 38, 38,
+ 20, 10, 20, 1, 13, 3, 27, 49, 49, 95,
+ 0, 3, 37, 11, 25, 31, 41, 49, 57, 67,
+ 65, 97, 33, 71, 89, 63, 75, 1, 70, 50,
+ 32, 12, 10, 3, 7, 15, 13, 16, 96, 62,
+ 48, 34, 72, 28, 18, 0, 2, 1, 86, 54,
+ 30, 16, 36, 12, 0, 7, 13, 15, 52, 20,
+ 8, 1, 16, 3, 15, 19, 8, 84, 50, 30,
+ 16, 46, 16, 10, 8, 1, 124, 31, 13, 6,
+ 5, 9, 2, 16, 18, 0, 16, 30, 36, 25,
+ 15, 21, 7, 36, 13, 17, 1, 14, 34, 37,
+ 19, 20, 26, 43, 43, 41, 25, 15, 8, 15,
+ 5, 9, 1, 12, 23, 13, 11, 2, 35, 19,
+ 33, 37, 2, 71, 25, 2, 13, 28, 13, 3,
+ 46, 9, 15, 3, 38, 33, 23, 67, 46, 62,
+ 62, 44, 18, 44, 36, 26, 26, 20, 18, 10,
+ 25, 31, 45, 17, 11, 31, 69, 31, 29, 37,
+ 33, 31, 17, 17, 47, 51, 35, 30, 28, 30,
+ 16, 11, 6, 0, 27, 11, 23, 25, 51, 51,
+ 49, 49, 1, 6, 33, 1, 3, 23, 35, 37,
+ 39, 43, 43, 53, 75, 69, 81, 85, 91, 57,
+ 77, 87, 33, 3, 17, 8, 12, 14, 106, 34,
+ 32, 9, 52, 124, 98, 124, 30, 46, 32, 27,
+ 71, 81, 93, 125, 125, 123, 9, 86, 54, 36,
+ 18, 26, 4, 5, 9, 15, 19, 5, 16, 2,
+ 34, 28, 10, 20, 28, 34, 36, 18, 40, 34,
+ 0, 50, 38, 14, 0, 13, 25, 73, 79, 71,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 38 */
+
+ 62, 10, 35, 62, 10, 35, 6, 28, 50, 24,
+ 7, 47, 14, 28, 104, 28, 66, 9, 32, 50,
+ 1, 27, 0, 53, 85, 1, 9, 125, 125, 123,
+ 92, 14, 11, 32, 50, 1, 15, 18, 34, 5,
+ 4, 5, 11, 11, 39, 21, 57, 10, 9, 13,
+ 3, 29, 19, 49, 26, 2, 1, 9, 16, 6,
+ 44, 0, 0, 0, 4, 63, 67, 16, 15, 11,
+ 104, 12, 11, 45, 15, 37, 12, 12, 2, 20,
+ 13, 22, 2, 1, 16, 37, 45, 41, 49, 58,
+ 0, 30, 19, 2, 21, 5, 16, 15, 37, 21,
+ 31, 34, 13, 8, 17, 25, 6, 11, 8, 20,
+ 10, 16, 36, 30, 33, 7, 4, 5, 4, 19,
+ 22, 9, 10, 38, 1, 32, 34, 16, 53, 20,
+ 5, 43, 15, 63, 27, 1, 13, 6, 3, 1,
+ 34, 2, 5, 14, 30, 17, 17, 2, 75, 39,
+ 1, 63, 3, 25, 30, 0, 35, 50, 2, 3,
+ 45, 54, 29, 57, 2, 73, 12, 38, 38, 30,
+ 38, 28, 6, 34, 40, 33, 3, 18, 6, 2,
+ 27, 5, 15, 21, 9, 21, 39, 11, 17, 83,
+ 41, 29, 55, 45, 51, 42, 56, 38, 34, 34,
+ 16, 6, 16, 7, 19, 7, 33, 55, 55, 101,
+ 3, 7, 39, 15, 31, 37, 47, 53, 63, 71,
+ 71, 101, 35, 75, 91, 65, 75, 0, 72, 50,
+ 34, 12, 10, 3, 7, 15, 11, 16, 96, 64,
+ 48, 34, 74, 28, 18, 2, 2, 0, 88, 54,
+ 32, 16, 38, 12, 0, 5, 11, 13, 54, 22,
+ 10, 1, 16, 3, 13, 15, 8, 84, 50, 30,
+ 18, 46, 16, 12, 8, 0, 124, 29, 11, 6,
+ 5, 9, 2, 16, 18, 0, 18, 30, 38, 25,
+ 15, 23, 7, 36, 15, 19, 3, 12, 34, 39,
+ 21, 20, 26, 43, 45, 43, 23, 13, 10, 15,
+ 5, 9, 0, 12, 23, 11, 9, 4, 35, 19,
+ 33, 37, 4, 71, 25, 2, 13, 30, 13, 3,
+ 48, 11, 15, 3, 38, 35, 23, 69, 44, 60,
+ 62, 42, 16, 44, 36, 24, 24, 20, 16, 8,
+ 27, 33, 47, 21, 13, 35, 73, 35, 33, 39,
+ 33, 31, 17, 17, 45, 53, 33, 26, 24, 26,
+ 12, 17, 2, 3, 33, 15, 29, 29, 57, 55,
+ 51, 49, 3, 4, 37, 5, 9, 27, 39, 41,
+ 43, 47, 49, 57, 79, 73, 85, 87, 95, 61,
+ 81, 91, 31, 1, 15, 10, 14, 16, 108, 36,
+ 34, 9, 54, 124, 100, 124, 32, 44, 30, 33,
+ 77, 87, 97, 125, 125, 125, 7, 86, 54, 36,
+ 18, 26, 4, 5, 9, 13, 17, 5, 18, 4,
+ 36, 28, 12, 22, 28, 34, 36, 18, 42, 34,
+ 0, 48, 34, 10, 3, 15, 27, 79, 81, 73,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 39 */
+
+ 60, 10, 35, 60, 10, 35, 10, 32, 50, 24,
+ 9, 51, 10, 24, 104, 28, 72, 11, 34, 50,
+ 1, 29, 0, 55, 87, 1, 13, 125, 125, 123,
+ 98, 16, 11, 34, 50, 1, 15, 20, 34, 5,
+ 4, 5, 11, 11, 41, 19, 57, 10, 9, 11,
+ 5, 29, 19, 49, 28, 4, 1, 7, 18, 6,
+ 44, 0, 0, 0, 4, 63, 67, 18, 17, 11,
+ 104, 12, 11, 43, 13, 37, 14, 16, 4, 24,
+ 9, 24, 4, 2, 20, 35, 43, 39, 49, 60,
+ 0, 32, 17, 2, 19, 3, 20, 15, 37, 19,
+ 29, 34, 11, 10, 15, 23, 6, 11, 10, 20,
+ 10, 18, 38, 32, 33, 9, 4, 5, 4, 19,
+ 22, 11, 8, 38, 1, 32, 34, 14, 57, 20,
+ 7, 47, 17, 65, 25, 0, 11, 8, 1, 0,
+ 36, 4, 3, 16, 32, 17, 17, 4, 77, 37,
+ 0, 63, 3, 25, 30, 0, 37, 52, 2, 3,
+ 49, 56, 29, 59, 2, 75, 10, 36, 38, 28,
+ 36, 26, 4, 32, 40, 35, 5, 18, 6, 2,
+ 29, 7, 15, 23, 9, 23, 41, 13, 19, 87,
+ 43, 31, 57, 47, 51, 38, 54, 34, 30, 30,
+ 12, 0, 12, 11, 23, 11, 37, 59, 59, 105,
+ 7, 9, 43, 21, 35, 41, 51, 59, 67, 77,
+ 75, 107, 37, 77, 95, 67, 75, 0, 72, 52,
+ 34, 12, 10, 3, 7, 15, 11, 18, 98, 64,
+ 48, 34, 76, 30, 20, 2, 4, 2, 90, 56,
+ 32, 16, 40, 14, 2, 5, 9, 13, 56, 22,
+ 10, 1, 18, 1, 13, 13, 10, 86, 50, 30,
+ 18, 48, 18, 14, 10, 2, 124, 29, 11, 8,
+ 5, 9, 2, 16, 18, 2, 18, 30, 38, 25,
+ 15, 23, 9, 38, 17, 21, 5, 10, 34, 41,
+ 23, 18, 26, 45, 47, 43, 21, 13, 12, 13,
+ 3, 7, 0, 14, 23, 11, 9, 4, 37, 19,
+ 33, 35, 6, 71, 25, 4, 13, 30, 13, 3,
+ 50, 11, 15, 3, 40, 35, 25, 71, 42, 60,
+ 60, 40, 14, 42, 34, 22, 22, 18, 14, 6,
+ 29, 35, 49, 23, 17, 37, 77, 37, 35, 41,
+ 35, 33, 17, 17, 45, 53, 31, 22, 20, 22,
+ 6, 23, 1, 7, 37, 19, 33, 33, 61, 59,
+ 55, 51, 7, 0, 39, 9, 13, 31, 43, 45,
+ 47, 51, 53, 61, 85, 77, 89, 91, 97, 65,
+ 85, 93, 29, 0, 13, 10, 16, 18, 112, 38,
+ 36, 9, 56, 124, 104, 124, 32, 40, 26, 37,
+ 83, 93, 103, 125, 125, 125, 7, 86, 54, 36,
+ 18, 26, 4, 5, 9, 13, 15, 3, 20, 6,
+ 38, 30, 12, 24, 30, 36, 38, 20, 42, 36,
+ 0, 44, 32, 6, 7, 19, 31, 83, 85, 75,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 40 */
+
+ 56, 8, 35, 56, 8, 35, 12, 34, 50, 24,
+ 9, 53, 8, 22, 104, 28, 76, 11, 36, 52,
+ 1, 31, 0, 57, 91, 3, 17, 125, 125, 125,
+ 102, 18, 11, 36, 52, 1, 15, 22, 34, 7,
+ 4, 5, 9, 13, 41, 19, 59, 10, 9, 11,
+ 5, 31, 19, 49, 28, 4, 1, 7, 18, 6,
+ 44, 0, 0, 0, 6, 65, 67, 18, 19, 11,
+ 104, 12, 11, 41, 11, 37, 14, 18, 8, 28,
+ 7, 26, 8, 4, 22, 33, 43, 37, 47, 60,
+ 0, 34, 15, 2, 19, 3, 22, 15, 37, 19,
+ 27, 34, 11, 10, 13, 23, 8, 11, 10, 20,
+ 10, 18, 38, 32, 35, 9, 4, 5, 4, 19,
+ 22, 11, 6, 38, 3, 32, 34, 14, 61, 20,
+ 7, 51, 19, 67, 23, 2, 9, 10, 1, 0,
+ 38, 4, 3, 18, 34, 17, 17, 4, 81, 37,
+ 2, 63, 3, 27, 32, 0, 39, 54, 2, 3,
+ 51, 58, 31, 61, 2, 77, 6, 34, 36, 26,
+ 34, 24, 2, 30, 38, 39, 7, 16, 6, 2,
+ 29, 9, 17, 25, 11, 25, 43, 15, 21, 91,
+ 47, 33, 59, 49, 51, 34, 50, 30, 26, 26,
+ 8, 3, 8, 17, 29, 15, 43, 65, 65, 111,
+ 11, 13, 47, 25, 39, 47, 57, 63, 73, 83,
+ 79, 111, 39, 81, 97, 69, 77, 0, 72, 52,
+ 34, 12, 10, 3, 7, 15, 9, 18, 98, 64,
+ 48, 34, 78, 30, 20, 2, 4, 2, 92, 56,
+ 32, 16, 40, 14, 2, 3, 7, 11, 58, 24,
+ 10, 1, 20, 0, 11, 11, 10, 86, 50, 30,
+ 18, 50, 18, 14, 12, 2, 124, 29, 11, 8,
+ 5, 9, 2, 16, 18, 2, 18, 30, 40, 25,
+ 15, 23, 11, 38, 19, 23, 7, 8, 34, 45,
+ 27, 18, 26, 47, 49, 45, 21, 11, 14, 13,
+ 3, 7, 2, 16, 25, 11, 9, 4, 39, 19,
+ 33, 35, 6, 73, 25, 4, 13, 32, 13, 3,
+ 50, 11, 17, 3, 42, 37, 25, 73, 40, 58,
+ 60, 38, 10, 40, 32, 20, 20, 16, 12, 4,
+ 31, 37, 51, 27, 19, 41, 81, 41, 37, 43,
+ 37, 33, 17, 17, 45, 55, 31, 16, 14, 18,
+ 2, 29, 7, 13, 43, 23, 37, 37, 67, 63,
+ 57, 53, 11, 3, 43, 13, 17, 37, 49, 51,
+ 53, 55, 57, 67, 89, 81, 95, 95, 99, 69,
+ 89, 97, 27, 2, 11, 12, 18, 18, 114, 40,
+ 36, 9, 56, 124, 106, 124, 34, 38, 22, 43,
+ 89, 99, 109, 125, 125, 125, 7, 86, 54, 36,
+ 18, 26, 4, 5, 9, 11, 15, 1, 22, 6,
+ 40, 30, 14, 24, 30, 36, 38, 20, 44, 36,
+ 1, 42, 28, 2, 11, 23, 35, 87, 89, 77,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 41 */
+
+ 54, 8, 35, 54, 8, 35, 16, 36, 52, 24,
+ 9, 55, 6, 20, 104, 28, 80, 11, 38, 54,
+ 1, 31, 2, 59, 93, 5, 21, 125, 125, 125,
+ 108, 22, 11, 38, 54, 1, 13, 24, 36, 9,
+ 4, 3, 7, 13, 41, 19, 59, 10, 9, 9,
+ 5, 31, 19, 49, 28, 4, 0, 7, 18, 6,
+ 44, 0, 0, 0, 8, 65, 67, 18, 21, 11,
+ 104, 12, 11, 39, 9, 37, 14, 20, 12, 32,
+ 3, 30, 12, 6, 26, 31, 41, 35, 45, 62,
+ 2, 36, 13, 2, 17, 1, 26, 15, 35, 19,
+ 23, 36, 11, 10, 9, 23, 10, 11, 10, 22,
+ 12, 20, 38, 32, 35, 9, 6, 5, 4, 17,
+ 22, 11, 6, 38, 5, 32, 34, 14, 65, 20,
+ 7, 53, 21, 67, 21, 4, 7, 12, 0, 2,
+ 40, 6, 3, 20, 36, 17, 17, 4, 83, 35,
+ 4, 63, 1, 27, 34, 0, 41, 56, 2, 3,
+ 53, 60, 33, 63, 2, 77, 4, 32, 36, 24,
+ 32, 22, 2, 30, 38, 41, 9, 16, 6, 2,
+ 29, 9, 19, 27, 13, 27, 45, 15, 21, 93,
+ 49, 33, 61, 51, 51, 32, 48, 28, 24, 24,
+ 4, 7, 4, 21, 33, 19, 47, 71, 69, 117,
+ 13, 17, 49, 29, 43, 51, 61, 67, 77, 87,
+ 83, 115, 39, 85, 99, 69, 77, 0, 74, 52,
+ 34, 14, 10, 3, 5, 15, 7, 18, 98, 64,
+ 50, 34, 80, 32, 22, 4, 6, 4, 94, 58,
+ 34, 18, 42, 16, 2, 1, 5, 9, 62, 26,
+ 12, 0, 22, 2, 9, 9, 12, 86, 52, 32,
+ 18, 52, 20, 16, 14, 4, 124, 27, 9, 10,
+ 5, 9, 2, 18, 20, 2, 18, 32, 42, 25,
+ 15, 23, 11, 38, 21, 23, 9, 8, 34, 47,
+ 29, 18, 26, 47, 51, 47, 19, 9, 16, 13,
+ 1, 7, 4, 18, 25, 11, 9, 6, 39, 19,
+ 33, 35, 8, 73, 25, 4, 13, 34, 13, 3,
+ 52, 11, 17, 3, 44, 39, 25, 73, 38, 58,
+ 60, 38, 8, 38, 30, 18, 20, 14, 10, 4,
+ 31, 39, 53, 29, 21, 45, 85, 43, 39, 43,
+ 37, 33, 15, 15, 45, 57, 29, 12, 10, 14,
+ 1, 33, 11, 17, 47, 27, 41, 41, 71, 67,
+ 59, 55, 15, 5, 45, 17, 21, 41, 53, 55,
+ 57, 59, 61, 71, 93, 83, 99, 99, 101, 73,
+ 93, 101, 25, 6, 7, 14, 20, 20, 118, 42,
+ 38, 9, 58, 124, 108, 124, 36, 36, 20, 47,
+ 95, 103, 115, 125, 125, 125, 7, 86, 54, 36,
+ 18, 28, 4, 5, 9, 9, 13, 0, 24, 8,
+ 44, 32, 16, 26, 32, 38, 40, 20, 46, 36,
+ 1, 40, 26, 0, 13, 27, 39, 91, 93, 77,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 42 */
+
+ 52, 8, 35, 52, 8, 35, 20, 40, 52, 24,
+ 11, 59, 2, 16, 104, 28, 86, 13, 40, 54,
+ 1, 33, 2, 61, 95, 5, 25, 125, 125, 125,
+ 114, 24, 11, 40, 54, 1, 13, 26, 36, 9,
+ 4, 3, 7, 13, 43, 17, 59, 10, 9, 7,
+ 7, 31, 19, 49, 30, 6, 0, 5, 20, 6,
+ 44, 0, 0, 0, 8, 65, 67, 20, 23, 11,
+ 104, 12, 11, 37, 7, 37, 16, 24, 14, 36,
+ 1, 32, 14, 10, 30, 29, 39, 33, 45, 64,
+ 2, 38, 11, 2, 17, 0, 30, 15, 35, 17,
+ 21, 36, 9, 12, 7, 21, 10, 11, 12, 22,
+ 12, 22, 40, 34, 37, 11, 6, 5, 4, 17,
+ 22, 13, 4, 38, 5, 32, 34, 12, 69, 20,
+ 9, 57, 23, 69, 19, 6, 5, 14, 2, 4,
+ 42, 8, 1, 22, 38, 17, 17, 6, 85, 35,
+ 6, 63, 1, 27, 34, 0, 43, 58, 2, 3,
+ 57, 62, 33, 65, 2, 79, 0, 30, 34, 22,
+ 30, 20, 0, 28, 38, 43, 11, 16, 6, 2,
+ 31, 11, 19, 29, 13, 29, 47, 17, 23, 97,
+ 51, 35, 63, 53, 51, 28, 46, 24, 20, 20,
+ 0, 13, 0, 27, 37, 23, 53, 75, 73, 121,
+ 17, 19, 53, 35, 47, 55, 65, 73, 81, 93,
+ 87, 121, 41, 87, 103, 71, 77, 0, 74, 54,
+ 34, 14, 10, 3, 5, 15, 7, 20, 100, 64,
+ 50, 34, 82, 32, 24, 4, 8, 6, 96, 60,
+ 34, 18, 44, 16, 4, 1, 3, 9, 64, 26,
+ 12, 0, 24, 4, 9, 7, 12, 88, 52, 32,
+ 18, 54, 22, 18, 16, 6, 124, 27, 9, 10,
+ 5, 9, 2, 18, 20, 4, 18, 32, 42, 25,
+ 15, 23, 13, 40, 23, 25, 11, 6, 34, 49,
+ 31, 16, 26, 49, 53, 47, 17, 9, 18, 11,
+ 1, 5, 4, 20, 25, 11, 9, 6, 41, 19,
+ 33, 33, 10, 73, 25, 6, 13, 34, 13, 3,
+ 54, 11, 17, 3, 46, 39, 27, 75, 36, 58,
+ 58, 36, 6, 36, 28, 16, 18, 12, 8, 2,
+ 33, 41, 55, 33, 25, 47, 89, 45, 41, 45,
+ 39, 35, 15, 15, 45, 57, 27, 8, 6, 10,
+ 7, 39, 15, 21, 53, 31, 45, 45, 75, 71,
+ 63, 57, 19, 9, 47, 21, 25, 45, 57, 59,
+ 61, 63, 65, 75, 99, 87, 103, 103, 103, 77,
+ 97, 103, 23, 8, 5, 14, 22, 22, 122, 44,
+ 40, 9, 60, 124, 112, 124, 36, 32, 16, 53,
+ 101, 109, 121, 125, 125, 125, 7, 86, 54, 36,
+ 18, 28, 4, 5, 9, 9, 11, 2, 26, 10,
+ 46, 34, 16, 28, 34, 40, 42, 22, 46, 38,
+ 1, 36, 22, 3, 17, 31, 43, 95, 97, 79,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 43 */
+
+ 50, 8, 37, 50, 8, 37, 24, 42, 54, 24,
+ 11, 61, 0, 14, 104, 28, 90, 13, 40, 56,
+ 1, 35, 2, 63, 97, 7, 27, 125, 125, 125,
+ 120, 26, 11, 40, 56, 1, 11, 28, 38, 11,
+ 6, 1, 5, 15, 43, 17, 59, 10, 7, 7,
+ 7, 31, 17, 49, 30, 6, 0, 5, 20, 6,
+ 44, 0, 0, 0, 10, 65, 67, 20, 23, 13,
+ 104, 12, 11, 35, 7, 37, 16, 26, 18, 40,
+ 2, 34, 18, 12, 34, 27, 37, 31, 43, 66,
+ 2, 40, 11, 4, 15, 0, 34, 15, 33, 17,
+ 19, 36, 9, 12, 3, 21, 12, 9, 12, 22,
+ 12, 24, 40, 34, 37, 11, 6, 5, 4, 17,
+ 22, 13, 4, 38, 7, 32, 34, 12, 73, 20,
+ 9, 61, 25, 71, 19, 8, 5, 14, 4, 4,
+ 44, 8, 1, 22, 42, 17, 17, 6, 87, 33,
+ 8, 63, 1, 27, 36, 2, 45, 58, 2, 3,
+ 59, 64, 35, 67, 2, 81, 1, 28, 34, 22,
+ 28, 20, 0, 26, 36, 45, 13, 16, 6, 2,
+ 31, 13, 21, 31, 15, 29, 49, 19, 23, 101,
+ 53, 35, 65, 55, 51, 26, 42, 22, 16, 16,
+ 3, 17, 3, 31, 43, 27, 57, 81, 79, 125,
+ 21, 23, 55, 39, 53, 61, 71, 77, 87, 97,
+ 93, 125, 43, 91, 105, 73, 77, 2, 76, 54,
+ 36, 14, 10, 3, 5, 15, 5, 20, 100, 66,
+ 50, 34, 84, 34, 24, 6, 8, 8, 98, 60,
+ 36, 18, 46, 18, 4, 0, 1, 7, 66, 28,
+ 14, 0, 24, 4, 7, 3, 14, 88, 52, 32,
+ 20, 54, 22, 20, 16, 8, 124, 25, 7, 12,
+ 5, 9, 2, 18, 20, 4, 20, 32, 44, 25,
+ 15, 25, 13, 40, 25, 27, 13, 4, 34, 51,
+ 33, 16, 26, 49, 55, 49, 15, 7, 20, 11,
+ 0, 5, 6, 20, 25, 9, 7, 8, 41, 19,
+ 33, 33, 12, 73, 25, 6, 13, 36, 13, 3,
+ 56, 13, 17, 3, 46, 41, 27, 77, 34, 56,
+ 58, 34, 4, 36, 28, 14, 16, 12, 6, 0,
+ 35, 43, 57, 35, 27, 51, 93, 49, 45, 47,
+ 39, 35, 15, 15, 43, 59, 25, 4, 2, 6,
+ 11, 45, 19, 25, 57, 35, 51, 49, 81, 75,
+ 65, 57, 21, 11, 51, 25, 31, 49, 61, 63,
+ 65, 67, 71, 79, 103, 91, 107, 105, 107, 81,
+ 101, 107, 21, 10, 3, 16, 24, 24, 124, 46,
+ 42, 9, 62, 124, 114, 124, 38, 30, 14, 57,
+ 107, 115, 125, 125, 125, 125, 5, 86, 54, 36,
+ 18, 28, 4, 5, 9, 7, 9, 2, 28, 12,
+ 48, 34, 18, 30, 34, 40, 42, 22, 48, 38,
+ 1, 34, 20, 7, 21, 33, 45, 101, 99, 81,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 44 */
+
+ 46, 8, 37, 46, 8, 37, 26, 46, 54, 24,
+ 13, 65, 3, 12, 104, 28, 94, 13, 42, 58,
+ 1, 37, 2, 65, 99, 7, 31, 125, 125, 125,
+ 124, 28, 11, 42, 58, 1, 11, 30, 38, 11,
+ 6, 1, 3, 15, 45, 17, 59, 10, 7, 5,
+ 7, 33, 17, 49, 32, 6, 0, 5, 22, 6,
+ 44, 0, 0, 0, 10, 67, 67, 22, 25, 13,
+ 104, 12, 11, 33, 5, 37, 18, 28, 22, 44,
+ 4, 36, 22, 14, 38, 25, 37, 29, 41, 66,
+ 2, 42, 9, 4, 15, 2, 38, 15, 33, 15,
+ 17, 36, 9, 14, 1, 19, 14, 9, 12, 22,
+ 12, 24, 42, 36, 39, 11, 6, 5, 4, 17,
+ 22, 15, 2, 38, 9, 32, 34, 12, 77, 20,
+ 11, 65, 27, 73, 17, 10, 3, 16, 4, 6,
+ 46, 10, 0, 24, 44, 17, 17, 6, 89, 33,
+ 10, 63, 1, 27, 38, 2, 47, 60, 2, 3,
+ 61, 66, 35, 69, 2, 83, 5, 26, 32, 20,
+ 26, 18, 1, 24, 36, 49, 15, 14, 6, 2,
+ 33, 15, 23, 33, 17, 31, 51, 21, 25, 105,
+ 55, 37, 67, 57, 51, 22, 40, 18, 12, 12,
+ 7, 21, 7, 37, 47, 31, 63, 85, 83, 125,
+ 25, 27, 59, 43, 57, 65, 75, 81, 91, 103,
+ 97, 125, 45, 95, 107, 75, 77, 2, 76, 54,
+ 36, 14, 10, 3, 5, 15, 5, 20, 100, 66,
+ 50, 34, 86, 34, 26, 6, 10, 8, 100, 62,
+ 36, 18, 46, 18, 6, 0, 0, 5, 68, 30,
+ 14, 0, 26, 6, 5, 1, 14, 88, 52, 32,
+ 20, 56, 24, 20, 18, 10, 124, 25, 7, 12,
+ 5, 9, 2, 18, 20, 4, 20, 32, 44, 25,
+ 15, 25, 15, 42, 27, 29, 15, 2, 34, 55,
+ 35, 16, 26, 51, 57, 49, 15, 5, 22, 9,
+ 0, 3, 6, 22, 27, 9, 7, 8, 43, 19,
+ 33, 31, 14, 73, 25, 6, 13, 38, 13, 3,
+ 58, 13, 19, 3, 48, 43, 27, 79, 32, 56,
+ 56, 32, 2, 34, 26, 12, 14, 10, 4, 1,
+ 37, 45, 59, 39, 29, 53, 97, 51, 47, 49,
+ 41, 37, 15, 15, 43, 59, 25, 1, 3, 2,
+ 15, 51, 23, 29, 63, 39, 55, 53, 85, 79,
+ 69, 59, 25, 15, 53, 29, 35, 53, 65, 69,
+ 69, 71, 75, 83, 109, 95, 111, 109, 109, 85,
+ 105, 109, 19, 12, 1, 18, 26, 26, 124, 48,
+ 44, 9, 64, 124, 118, 124, 40, 26, 10, 63,
+ 113, 121, 125, 125, 125, 125, 5, 86, 54, 36,
+ 18, 28, 4, 5, 9, 7, 7, 4, 30, 12,
+ 50, 36, 18, 32, 36, 42, 44, 22, 48, 38,
+ 1, 32, 16, 11, 25, 37, 49, 105, 103, 83,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 45 */
+
+ 44, 8, 37, 44, 8, 37, 30, 48, 54, 24,
+ 13, 67, 5, 8, 104, 28, 100, 15, 44, 58,
+ 1, 37, 4, 67, 101, 9, 35, 125, 125, 125,
+ 124, 32, 11, 44, 58, 1, 11, 32, 38, 13,
+ 6, 1, 3, 15, 45, 15, 59, 10, 7, 3,
+ 9, 33, 17, 49, 32, 8, 2, 3, 22, 6,
+ 44, 0, 0, 0, 12, 67, 67, 22, 27, 13,
+ 104, 12, 11, 31, 3, 37, 18, 32, 24, 48,
+ 8, 38, 24, 18, 42, 23, 35, 27, 41, 68,
+ 2, 44, 7, 4, 13, 4, 42, 15, 33, 15,
+ 13, 38, 7, 14, 0, 19, 14, 9, 14, 24,
+ 14, 26, 42, 36, 39, 13, 6, 5, 4, 17,
+ 22, 15, 0, 38, 9, 32, 34, 10, 81, 20,
+ 11, 67, 29, 75, 15, 12, 1, 18, 6, 8,
+ 48, 12, 0, 26, 46, 17, 17, 8, 91, 31,
+ 12, 63, 0, 27, 38, 2, 49, 62, 2, 3,
+ 65, 68, 37, 71, 2, 83, 7, 24, 32, 18,
+ 24, 16, 3, 24, 36, 51, 17, 14, 6, 2,
+ 33, 15, 23, 35, 17, 33, 53, 21, 27, 109,
+ 57, 39, 69, 59, 51, 18, 38, 14, 8, 10,
+ 11, 27, 11, 41, 51, 35, 67, 91, 87, 125,
+ 29, 29, 63, 49, 61, 69, 79, 87, 95, 109,
+ 101, 125, 47, 97, 111, 75, 77, 2, 76, 56,
+ 36, 14, 10, 3, 5, 15, 3, 22, 102, 66,
+ 50, 34, 88, 36, 28, 6, 12, 10, 102, 64,
+ 36, 20, 48, 20, 6, 2, 2, 5, 72, 30,
+ 14, 2, 28, 8, 5, 0, 16, 90, 54, 32,
+ 20, 58, 26, 22, 20, 12, 124, 25, 7, 14,
+ 5, 9, 2, 18, 20, 6, 20, 34, 46, 25,
+ 15, 25, 17, 42, 29, 31, 17, 2, 34, 57,
+ 37, 14, 26, 53, 59, 51, 13, 5, 24, 9,
+ 2, 3, 8, 24, 27, 9, 7, 8, 45, 19,
+ 33, 31, 16, 73, 25, 8, 13, 38, 13, 3,
+ 60, 13, 19, 3, 50, 43, 29, 79, 30, 56,
+ 56, 32, 0, 32, 24, 10, 14, 8, 2, 3,
+ 39, 47, 61, 41, 33, 57, 101, 53, 49, 51,
+ 43, 37, 15, 13, 43, 61, 23, 5, 7, 1,
+ 21, 55, 27, 33, 67, 43, 59, 57, 89, 83,
+ 71, 61, 29, 19, 55, 33, 39, 57, 69, 73,
+ 73, 75, 79, 87, 113, 99, 115, 113, 111, 89,
+ 109, 113, 17, 16, 0, 18, 28, 28, 124, 50,
+ 46, 9, 66, 124, 120, 124, 40, 24, 6, 67,
+ 119, 125, 125, 125, 125, 125, 5, 86, 54, 36,
+ 18, 28, 4, 5, 9, 5, 5, 6, 32, 14,
+ 52, 38, 20, 34, 38, 44, 46, 24, 50, 40,
+ 1, 28, 14, 13, 27, 41, 53, 109, 107, 85,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 46 */
+
+ 42, 8, 37, 42, 8, 37, 34, 52, 56, 24,
+ 15, 71, 9, 6, 104, 28, 104, 15, 46, 60,
+ 1, 39, 4, 69, 103, 9, 39, 125, 125, 125,
+ 124, 34, 11, 46, 60, 1, 9, 34, 40, 13,
+ 6, 0, 1, 17, 47, 15, 59, 10, 7, 3,
+ 9, 33, 17, 49, 34, 8, 2, 3, 24, 6,
+ 44, 0, 0, 0, 12, 67, 67, 24, 29, 13,
+ 104, 12, 11, 29, 1, 37, 20, 34, 28, 52,
+ 10, 40, 28, 20, 46, 21, 33, 25, 39, 70,
+ 2, 46, 5, 4, 13, 4, 46, 15, 31, 13,
+ 11, 38, 7, 16, 4, 17, 16, 9, 14, 24,
+ 14, 28, 44, 38, 41, 13, 6, 5, 4, 17,
+ 22, 17, 0, 38, 11, 32, 34, 10, 85, 20,
+ 13, 71, 31, 77, 13, 14, 0, 20, 8, 8,
+ 50, 12, 2, 28, 48, 17, 17, 8, 93, 31,
+ 14, 63, 0, 27, 40, 2, 51, 64, 2, 3,
+ 67, 70, 37, 73, 2, 85, 11, 22, 30, 16,
+ 22, 14, 3, 22, 34, 53, 19, 14, 6, 2,
+ 35, 17, 25, 37, 19, 35, 55, 23, 27, 113,
+ 59, 39, 71, 61, 51, 16, 34, 12, 4, 6,
+ 15, 31, 15, 47, 57, 39, 73, 95, 93, 125,
+ 33, 33, 65, 53, 65, 75, 85, 91, 101, 113,
+ 105, 125, 49, 101, 113, 77, 77, 2, 78, 56,
+ 36, 14, 10, 3, 5, 15, 3, 22, 102, 66,
+ 50, 34, 90, 36, 28, 8, 12, 12, 104, 64,
+ 38, 20, 50, 20, 8, 2, 4, 3, 74, 32,
+ 16, 2, 30, 10, 3, 2, 16, 90, 54, 32,
+ 20, 60, 26, 24, 22, 14, 124, 23, 5, 14,
+ 5, 9, 2, 18, 20, 6, 20, 34, 46, 25,
+ 15, 25, 17, 44, 31, 33, 19, 0, 34, 59,
+ 39, 14, 26, 53, 61, 51, 11, 3, 26, 7,
+ 2, 1, 8, 26, 27, 9, 7, 10, 45, 19,
+ 33, 29, 18, 73, 25, 8, 13, 40, 13, 3,
+ 62, 13, 19, 3, 52, 45, 29, 81, 28, 54,
+ 54, 30, 1, 30, 22, 8, 12, 6, 0, 5,
+ 41, 49, 63, 45, 35, 59, 105, 57, 51, 53,
+ 43, 39, 15, 13, 43, 61, 21, 9, 11, 5,
+ 25, 61, 31, 37, 73, 47, 63, 61, 95, 87,
+ 75, 63, 33, 21, 59, 37, 43, 61, 73, 77,
+ 77, 79, 83, 91, 119, 103, 119, 117, 113, 93,
+ 113, 115, 15, 18, 2, 20, 30, 30, 124, 52,
+ 48, 9, 68, 124, 124, 124, 42, 20, 4, 73,
+ 125, 125, 125, 125, 125, 125, 5, 86, 54, 36,
+ 18, 28, 4, 5, 9, 5, 3, 8, 34, 16,
+ 54, 38, 20, 36, 38, 44, 46, 24, 50, 40,
+ 1, 26, 10, 17, 31, 45, 57, 113, 111, 87,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 47 */
+
+ 40, 8, 37, 40, 8, 37, 38, 54, 56, 24,
+ 15, 73, 11, 4, 104, 28, 108, 15, 48, 62,
+ 1, 41, 4, 71, 105, 11, 43, 125, 125, 125,
+ 124, 36, 11, 48, 62, 1, 9, 36, 40, 15,
+ 6, 0, 0, 17, 47, 15, 59, 10, 7, 1,
+ 9, 33, 17, 49, 34, 8, 2, 3, 24, 6,
+ 44, 0, 0, 0, 14, 67, 67, 24, 31, 13,
+ 104, 12, 11, 27, 0, 37, 20, 36, 32, 56,
+ 14, 42, 32, 22, 50, 19, 31, 23, 37, 72,
+ 2, 48, 3, 4, 11, 6, 50, 15, 31, 13,
+ 9, 38, 7, 16, 6, 17, 18, 9, 14, 24,
+ 14, 30, 44, 38, 41, 13, 6, 5, 4, 17,
+ 22, 17, 1, 38, 13, 32, 34, 10, 89, 20,
+ 13, 75, 33, 79, 11, 16, 2, 22, 10, 10,
+ 52, 14, 2, 30, 50, 17, 17, 8, 95, 29,
+ 16, 63, 0, 27, 42, 2, 53, 66, 2, 3,
+ 69, 72, 39, 75, 2, 87, 13, 20, 30, 14,
+ 20, 12, 5, 20, 34, 55, 21, 14, 6, 2,
+ 35, 19, 27, 39, 21, 37, 57, 25, 29, 117,
+ 61, 41, 73, 63, 51, 12, 32, 8, 0, 2,
+ 19, 35, 19, 51, 61, 43, 77, 101, 97, 125,
+ 37, 37, 69, 57, 69, 79, 89, 95, 105, 119,
+ 109, 125, 51, 105, 115, 79, 77, 2, 78, 56,
+ 36, 14, 10, 3, 5, 15, 1, 22, 102, 66,
+ 50, 34, 92, 38, 30, 8, 14, 14, 106, 66,
+ 38, 20, 52, 22, 8, 4, 6, 1, 76, 34,
+ 16, 2, 32, 12, 1, 4, 18, 90, 54, 32,
+ 20, 62, 28, 26, 24, 16, 124, 23, 5, 16,
+ 5, 9, 2, 18, 20, 6, 20, 34, 48, 25,
+ 15, 25, 19, 44, 33, 35, 21, 1, 34, 61,
+ 41, 14, 26, 55, 63, 53, 9, 1, 28, 7,
+ 4, 1, 10, 28, 27, 9, 7, 10, 47, 19,
+ 33, 29, 20, 73, 25, 8, 13, 42, 13, 3,
+ 64, 13, 19, 3, 54, 47, 29, 83, 26, 54,
+ 54, 28, 3, 28, 20, 6, 10, 4, 1, 7,
+ 43, 51, 65, 47, 37, 63, 109, 59, 53, 55,
+ 45, 39, 15, 13, 43, 63, 19, 13, 15, 9,
+ 29, 67, 35, 41, 77, 51, 67, 65, 99, 91,
+ 77, 65, 37, 25, 61, 41, 47, 65, 77, 81,
+ 81, 83, 87, 95, 123, 107, 123, 121, 115, 97,
+ 117, 119, 13, 20, 4, 22, 32, 32, 124, 54,
+ 50, 9, 70, 124, 124, 124, 44, 18, 0, 77,
+ 125, 125, 125, 125, 125, 125, 5, 86, 54, 36,
+ 18, 28, 4, 5, 9, 3, 1, 10, 36, 18,
+ 56, 40, 22, 38, 40, 46, 48, 24, 52, 40,
+ 1, 24, 8, 21, 35, 49, 61, 117, 115, 89,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 48 */
+
+ 36, 6, 39, 36, 6, 39, 40, 56, 56, 24,
+ 17, 77, 15, 0, 102, 28, 112, 17, 48, 62,
+ 3, 43, 4, 75, 109, 13, 47, 125, 125, 125,
+ 124, 38, 13, 48, 62, 3, 9, 38, 40, 17,
+ 6, 0, 0, 19, 49, 15, 61, 10, 7, 1,
+ 11, 35, 17, 49, 34, 8, 2, 3, 24, 4,
+ 44, 0, 0, 0, 14, 69, 67, 24, 33, 15,
+ 104, 10, 11, 25, 0, 37, 20, 38, 34, 58,
+ 16, 44, 34, 24, 52, 17, 31, 21, 37, 72,
+ 2, 48, 3, 4, 11, 6, 52, 15, 31, 13,
+ 7, 38, 7, 16, 8, 17, 18, 9, 14, 24,
+ 14, 30, 44, 38, 43, 15, 6, 7, 4, 17,
+ 20, 19, 3, 38, 15, 30, 34, 8, 93, 18,
+ 15, 79, 35, 81, 11, 16, 2, 22, 10, 10,
+ 54, 14, 2, 30, 52, 19, 17, 8, 99, 29,
+ 16, 63, 0, 29, 42, 2, 55, 66, 2, 3,
+ 73, 72, 41, 77, 2, 89, 17, 18, 28, 12,
+ 18, 10, 7, 18, 32, 59, 23, 12, 4, 2,
+ 37, 21, 29, 41, 23, 39, 59, 27, 31, 121,
+ 65, 43, 77, 67, 51, 8, 28, 4, 3, 1,
+ 25, 41, 25, 57, 67, 49, 83, 107, 103, 125,
+ 41, 41, 73, 63, 75, 85, 95, 101, 111, 125,
+ 115, 125, 53, 109, 119, 81, 79, 2, 78, 56,
+ 36, 14, 10, 5, 5, 15, 1, 22, 102, 66,
+ 50, 34, 94, 38, 30, 8, 14, 14, 106, 66,
+ 38, 20, 52, 22, 8, 4, 8, 1, 78, 34,
+ 16, 2, 32, 12, 1, 6, 18, 90, 54, 32,
+ 20, 62, 28, 26, 24, 16, 124, 23, 5, 16,
+ 5, 9, 2, 18, 20, 6, 20, 34, 48, 25,
+ 15, 27, 21, 44, 35, 37, 23, 3, 32, 65,
+ 45, 12, 24, 57, 67, 55, 9, 1, 30, 7,
+ 4, 1, 10, 28, 29, 9, 7, 10, 49, 19,
+ 33, 29, 20, 75, 25, 8, 13, 42, 15, 3,
+ 64, 15, 21, 3, 54, 49, 31, 85, 24, 52,
+ 52, 26, 7, 26, 18, 4, 8, 2, 3, 9,
+ 45, 55, 69, 51, 41, 67, 113, 63, 57, 57,
+ 47, 41, 15, 13, 43, 65, 19, 19, 21, 13,
+ 35, 73, 41, 47, 83, 57, 73, 71, 105, 97,
+ 81, 67, 41, 29, 65, 45, 53, 71, 83, 87,
+ 87, 87, 93, 101, 125, 111, 125, 125, 119, 101,
+ 121, 123, 11, 22, 6, 22, 32, 32, 124, 54,
+ 50, 9, 70, 124, 124, 124, 44, 14, 3, 83,
+ 125, 125, 125, 125, 125, 125, 5, 86, 52, 34,
+ 18, 28, 4, 7, 9, 3, 1, 10, 36, 18,
+ 58, 40, 22, 38, 40, 46, 48, 24, 52, 40,
+ 3, 20, 4, 25, 39, 53, 65, 123, 119, 91,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 49 */
+
+ 34, 6, 39, 34, 6, 39, 44, 60, 58, 26,
+ 17, 79, 17, 1, 102, 28, 118, 17, 50, 64,
+ 3, 43, 6, 77, 111, 13, 49, 125, 125, 125,
+ 124, 42, 13, 50, 64, 3, 7, 42, 42, 17,
+ 8, 2, 2, 19, 49, 13, 61, 10, 5, 0,
+ 11, 35, 15, 49, 36, 10, 4, 1, 26, 4,
+ 44, 0, 0, 0, 16, 69, 67, 26, 33, 15,
+ 104, 10, 11, 21, 2, 37, 22, 42, 38, 62,
+ 20, 48, 38, 28, 56, 13, 29, 19, 35, 74,
+ 4, 50, 1, 6, 9, 8, 56, 13, 29, 11,
+ 3, 40, 5, 18, 12, 15, 20, 7, 16, 26,
+ 16, 32, 46, 40, 43, 15, 8, 7, 4, 15,
+ 20, 19, 3, 38, 15, 30, 34, 8, 95, 18,
+ 15, 81, 37, 81, 9, 18, 4, 24, 12, 12,
+ 56, 16, 4, 32, 56, 19, 17, 10, 101, 27,
+ 18, 63, 2, 29, 44, 4, 55, 68, 2, 3,
+ 75, 74, 41, 79, 2, 89, 19, 18, 28, 12,
+ 18, 10, 7, 18, 32, 61, 23, 12, 4, 2,
+ 37, 21, 29, 41, 23, 39, 61, 27, 31, 123,
+ 67, 43, 79, 69, 51, 6, 26, 2, 5, 3,
+ 29, 45, 29, 61, 71, 53, 87, 111, 107, 125,
+ 43, 43, 75, 67, 79, 89, 99, 105, 115, 125,
+ 119, 125, 53, 111, 121, 81, 79, 4, 80, 58,
+ 38, 16, 10, 5, 3, 13, 0, 24, 104, 68,
+ 52, 34, 96, 40, 32, 10, 16, 16, 108, 68,
+ 40, 22, 54, 24, 10, 6, 12, 0, 82, 36,
+ 18, 4, 34, 14, 0, 10, 20, 92, 56, 34,
+ 22, 64, 30, 28, 26, 18, 124, 21, 3, 18,
+ 5, 7, 2, 20, 22, 8, 22, 36, 50, 23,
+ 15, 27, 21, 46, 35, 37, 25, 3, 32, 67,
+ 47, 12, 24, 57, 69, 55, 7, 0, 32, 5,
+ 6, 0, 12, 30, 29, 7, 5, 12, 49, 17,
+ 31, 27, 22, 75, 25, 10, 13, 44, 15, 3,
+ 66, 15, 21, 1, 56, 49, 31, 85, 24, 52,
+ 52, 26, 9, 26, 18, 4, 8, 2, 3, 9,
+ 45, 57, 71, 53, 43, 69, 115, 65, 59, 57,
+ 47, 41, 13, 11, 41, 65, 17, 23, 25, 17,
+ 39, 77, 45, 51, 87, 61, 77, 75, 109, 101,
+ 83, 67, 43, 31, 67, 47, 57, 75, 87, 91,
+ 91, 91, 97, 105, 125, 113, 125, 125, 121, 103,
+ 123, 125, 7, 26, 10, 24, 34, 34, 124, 56,
+ 52, 9, 72, 124, 124, 124, 46, 12, 5, 87,
+ 125, 125, 125, 125, 125, 125, 3, 88, 52, 34,
+ 18, 30, 4, 7, 9, 1, 0, 12, 38, 20,
+ 62, 42, 24, 40, 42, 48, 50, 26, 54, 42,
+ 3, 18, 2, 27, 41, 55, 67, 125, 121, 91,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 50 */
+
+ 32, 6, 39, 32, 6, 39, 48, 62, 58, 26,
+ 17, 81, 19, 3, 102, 28, 122, 17, 52, 66,
+ 3, 45, 6, 79, 113, 15, 53, 125, 125, 125,
+ 124, 44, 13, 52, 66, 3, 7, 44, 42, 19,
+ 8, 2, 4, 19, 49, 13, 61, 10, 5, 2,
+ 11, 35, 15, 49, 36, 10, 4, 1, 26, 4,
+ 44, 0, 0, 0, 18, 69, 67, 26, 35, 15,
+ 104, 10, 11, 19, 4, 37, 22, 44, 42, 66,
+ 22, 50, 42, 30, 60, 11, 27, 17, 33, 76,
+ 4, 52, 0, 6, 9, 10, 60, 13, 29, 11,
+ 1, 40, 5, 18, 14, 15, 22, 7, 16, 26,
+ 16, 34, 46, 40, 45, 15, 8, 7, 4, 15,
+ 20, 19, 5, 38, 17, 30, 34, 8, 99, 18,
+ 15, 85, 39, 83, 7, 20, 6, 26, 14, 14,
+ 58, 18, 4, 34, 58, 19, 17, 10, 103, 27,
+ 20, 63, 2, 29, 46, 4, 57, 70, 2, 3,
+ 77, 76, 43, 81, 2, 91, 23, 16, 26, 10,
+ 16, 8, 9, 16, 32, 63, 25, 12, 4, 2,
+ 37, 23, 31, 43, 25, 41, 63, 29, 33, 125,
+ 69, 45, 81, 71, 51, 2, 24, 1, 9, 7,
+ 33, 49, 33, 67, 75, 57, 93, 117, 111, 125,
+ 47, 47, 79, 71, 83, 93, 103, 109, 119, 125,
+ 123, 125, 55, 115, 123, 83, 79, 4, 80, 58,
+ 38, 16, 10, 5, 3, 13, 2, 24, 104, 68,
+ 52, 34, 98, 40, 34, 10, 18, 18, 110, 70,
+ 40, 22, 56, 24, 10, 8, 14, 2, 84, 38,
+ 18, 4, 36, 16, 2, 12, 20, 92, 56, 34,
+ 22, 66, 32, 30, 28, 20, 124, 21, 3, 18,
+ 5, 7, 2, 20, 22, 8, 22, 36, 52, 23,
+ 15, 27, 23, 46, 37, 39, 27, 5, 32, 69,
+ 49, 12, 24, 59, 71, 57, 5, 2, 34, 5,
+ 6, 0, 14, 32, 29, 7, 5, 12, 51, 17,
+ 31, 27, 24, 75, 25, 10, 13, 46, 15, 3,
+ 68, 15, 21, 1, 58, 51, 31, 87, 22, 52,
+ 52, 24, 11, 24, 16, 2, 6, 0, 5, 11,
+ 47, 59, 73, 57, 45, 73, 119, 67, 61, 59,
+ 49, 41, 13, 11, 41, 67, 15, 27, 29, 21,
+ 43, 83, 49, 55, 93, 65, 81, 79, 113, 105,
+ 85, 69, 47, 35, 69, 51, 61, 79, 91, 95,
+ 95, 95, 101, 109, 125, 117, 125, 125, 123, 107,
+ 125, 125, 5, 28, 12, 26, 36, 36, 124, 58,
+ 54, 9, 74, 124, 124, 124, 48, 10, 9, 93,
+ 125, 125, 125, 125, 125, 125, 3, 88, 52, 34,
+ 18, 30, 4, 7, 9, 0, 2, 14, 40, 22,
+ 64, 44, 26, 42, 44, 50, 52, 26, 56, 42,
+ 3, 16, 1, 31, 45, 59, 71, 125, 125, 93,
+ },
+
+ {
+ /* Context Tables for I, SI Slices :: qp = 51 */
+
+ 30, 6, 39, 30, 6, 39, 52, 66, 60, 26,
+ 19, 85, 23, 5, 102, 28, 124, 17, 54, 68,
+ 3, 47, 6, 81, 115, 15, 57, 125, 125, 125,
+ 124, 46, 13, 54, 68, 3, 5, 46, 44, 19,
+ 8, 4, 6, 21, 51, 13, 61, 10, 5, 2,
+ 11, 35, 15, 49, 38, 10, 4, 1, 28, 4,
+ 44, 0, 0, 0, 18, 69, 67, 28, 37, 15,
+ 104, 10, 11, 17, 6, 37, 24, 46, 46, 70,
+ 26, 52, 46, 32, 64, 9, 25, 15, 31, 78,
+ 4, 54, 2, 6, 7, 10, 64, 13, 27, 9,
+ 0, 40, 5, 20, 18, 13, 24, 7, 16, 26,
+ 16, 36, 48, 42, 45, 15, 8, 7, 4, 15,
+ 20, 21, 5, 38, 19, 30, 34, 8, 103, 18,
+ 17, 89, 41, 85, 5, 22, 8, 28, 16, 14,
+ 60, 18, 6, 36, 60, 19, 17, 10, 105, 25,
+ 22, 63, 2, 29, 48, 4, 59, 72, 2, 3,
+ 79, 78, 43, 83, 2, 93, 25, 14, 26, 8,
+ 14, 6, 9, 14, 30, 65, 27, 12, 4, 2,
+ 39, 25, 33, 45, 27, 43, 65, 31, 33, 125,
+ 71, 45, 83, 73, 51, 0, 20, 3, 13, 11,
+ 37, 53, 37, 71, 81, 61, 97, 121, 117, 125,
+ 51, 51, 81, 75, 87, 99, 109, 113, 125, 125,
+ 125, 125, 57, 119, 125, 85, 79, 4, 82, 58,
+ 38, 16, 10, 5, 3, 13, 2, 24, 104, 68,
+ 52, 34, 100, 42, 34, 12, 18, 20, 112, 70,
+ 42, 22, 58, 26, 12, 8, 16, 4, 86, 40,
+ 20, 4, 38, 18, 4, 14, 22, 92, 56, 34,
+ 22, 68, 32, 32, 30, 22, 124, 19, 1, 20,
+ 5, 7, 2, 20, 22, 8, 22, 36, 52, 23,
+ 15, 27, 23, 48, 39, 41, 29, 7, 32, 71,
+ 51, 12, 24, 59, 73, 57, 3, 4, 36, 3,
+ 8, 2, 14, 34, 29, 7, 5, 14, 51, 17,
+ 31, 25, 26, 75, 25, 10, 13, 48, 15, 3,
+ 70, 15, 21, 1, 60, 53, 31, 89, 20, 50,
+ 50, 22, 13, 22, 14, 0, 4, 1, 7, 13,
+ 49, 61, 75, 59, 47, 75, 123, 71, 63, 61,
+ 49, 43, 13, 11, 41, 67, 13, 31, 33, 25,
+ 47, 89, 53, 59, 97, 69, 85, 83, 119, 109,
+ 89, 71, 51, 37, 73, 55, 65, 83, 95, 99,
+ 99, 99, 105, 113, 125, 121, 125, 125, 125, 111,
+ 125, 125, 3, 30, 14, 28, 38, 38, 124, 60,
+ 56, 9, 76, 124, 124, 124, 50, 6, 11, 97,
+ 125, 125, 125, 125, 125, 125, 3, 88, 52, 34,
+ 18, 30, 4, 7, 9, 0, 4, 16, 42, 24,
+ 66, 44, 26, 44, 44, 50, 52, 26, 56, 42,
+ 3, 14, 3, 35, 49, 63, 75, 125, 125, 95,
+ },
+
+ },
+
+};
diff --git a/common/ih264_cabac_tables.h b/common/ih264_cabac_tables.h
new file mode 100755
index 0000000..0cef51e
--- /dev/null
+++ b/common/ih264_cabac_tables.h
@@ -0,0 +1,101 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+******************************************************************************
+* @file ih264_cabac_tables.h
+*
+* @brief
+* This file contains enumerations, macros and extern declarations of H264
+* cabac tables
+*
+* @author
+* Ittiam
+*
+* @remarks
+* none
+******************************************************************************
+*/
+
+#ifndef IH264_CABAC_TABLES_H_
+#define IH264_CABAC_TABLES_H_
+
+/*****************************************************************************/
+/* Constant Macros */
+/*****************************************************************************/
+
+/**
+******************************************************************************
+ * @brief maximum range of cabac_init_idc (0-2)
+******************************************************************************
+ */
+#define IH264_NUM_CABAC_INIT_IDC_PLUS_ONE 4
+
+/**
+******************************************************************************
+ * @brief max range of qps in H264 (0-51)
+******************************************************************************
+ */
+#define IH264_MAX_QP 52
+
+/**
+******************************************************************************
+ * @brief max range of cabac contexts in H264 (0-459)
+******************************************************************************
+ */
+#define IH264_NUM_CABAC_CTXTS 460
+
+/*****************************************************************************/
+/* Extern global declarations */
+/*****************************************************************************/
+
+/**
+ ******************************************************************************
+ * @brief Table for rangeTabLPS depending on pStateIdx and qCodIRangeIdx
+ * input : pStateIdx(0-63) and qCodIRangeIdx(0-3) [(Range >> 6) & 0x3]
+ * output : RLps
+ *
+ * @remarks See Table 9-35 of H264 spec for rangeTabLPS
+ *******************************************************************************
+ */
+extern const UWORD8 gau1_ih264_cabac_rlps[64][4];
+
+
+/**
+ ******************************************************************************
+ * @brief probability+MPS state transition tables based on current state and bin
+ * input : curpState[bits7-2] | curMPS[bit1] | decodedBin[bit0]
+ * output : nextpState[bits6-1] | nextMPS[bit0]
+ * @remarks Modified form of Table-9-36 State Transition table in H264 spec
+ ******************************************************************************
+ */
+extern const UWORD8 gau1_ih264_next_state[128*2];
+
+
+/**
+ ******************************************************************************
+ * @brief Init context tables for all combinations of qp and cabac_init_idc
+ * @remarks Packing format MPS in lsb and pState in bits[1-6]
+ ******************************************************************************
+ */
+extern const UWORD8 gau1_ih264_cab_ctxts[IH264_NUM_CABAC_INIT_IDC_PLUS_ONE][IH264_MAX_QP][IH264_NUM_CABAC_CTXTS];
+
+
+#endif /* IH264_CABAC_TABLES_H_ */
diff --git a/common/ih264_cavlc_tables.c b/common/ih264_cavlc_tables.c
new file mode 100755
index 0000000..f122ab9
--- /dev/null
+++ b/common/ih264_cavlc_tables.c
@@ -0,0 +1,282 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+
+/**
+******************************************************************************
+* @file
+* ih264_cavlc_tables.c
+*
+* @brief
+* This file contains H264 cavlc tables for encoding coeff_tokens, levels, total
+* zeros and runs before zeros
+*
+* @author
+* Ittiam
+*
+* @par List of Tables
+* - gu1_code_coeff_token_table
+* - gu1_size_coeff_token_table
+* - gu1_code_coeff_token_table_chroma
+* - gu1_size_coeff_token_table_chroma
+* - gu1_threshold_vlc_level
+* - gu1_size_zero_table
+* - gu1_code_zero_table
+* - gu1_size_zero_table_chroma
+* - gu1_code_zero_table_chroma
+* - gu1_index_zero_table
+* - gu1_size_run_table
+* - gu1_code_run_table
+* - gu4_codeword_level_tables
+* - gu1_codesize_level_tables
+*
+* @remarks
+* none
+*
+******************************************************************************
+*/
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* User include files */
+#include "ih264_typedefs.h"
+#include "ih264_cavlc_tables.h"
+
+
+/*****************************************************************************/
+/* Extern global definitions */
+/*****************************************************************************/
+
+/**
+ ******************************************************************************
+ * @brief Assignment of cbp to a codenum for intra and inter prediction modes
+ * chroma format idc != 0
+ * input : cbp, intra - 0/inter - 1
+ * output : codenum
+ * @remarks Table 9-4 – Assignment of codeNum to values of coded_block_pattern
+ * for macroblock prediction modes in H264 spec
+ ******************************************************************************
+ */
+const UWORD8 gu1_cbp_map_tables[48][2]=
+{
+ { 3, 0}, {29, 2}, {30, 3}, {17, 7}, {31, 4}, {18, 8}, {37, 17}, { 8, 13},
+ {32, 5}, {38, 18}, {19, 9}, { 9, 14}, {20, 10}, {10, 15}, {11, 16}, { 2, 11},
+ {16, 1}, {33, 32}, {34, 33}, {21, 36}, {35, 34}, {22, 37}, {39, 44}, { 4, 40},
+ {36, 35}, {40, 45}, {23, 38}, { 5, 41}, {24, 39}, { 6, 42}, { 7, 43}, { 1, 19},
+ {41, 6}, {42, 24}, {43, 25}, {25, 20}, {44, 26}, {26, 21}, {46, 46}, {12, 28},
+ {45, 27}, {47, 47}, {27, 22}, {13, 29}, {28, 23}, {14, 30}, {15, 31}, { 0, 12},
+};
+
+
+/**
+ ******************************************************************************
+ * @brief total non-zero coefficients and numbers of trailing ones of a residual
+ * block are mapped to coeff_token using the tables given below.
+ * input : VLC-Num | Trailing ones | Total coeffs
+ * output : coeff_token (code word, size of the code word)
+ * @remarks Table-9-5 coeff_token mapping to TotalCoeff( coeff_token )
+ * and TrailingOnes( coeff_token ) in H264 spec
+ ******************************************************************************
+ */
+const UWORD8 gu1_code_coeff_token_table[3][4][16] =
+{
+ {
+ { 5, 7, 7, 7, 7, 15, 11, 8, 15, 11, 15, 11, 15, 11, 7, 4, },
+ { 1, 4, 6, 6, 6, 6, 14, 10, 14, 10, 14, 10, 1, 14, 10, 6, },
+ { 0, 1, 5, 5, 5, 5, 5, 13, 9, 13, 9, 13, 9, 13, 9, 5, },
+ { 0, 0, 3, 3, 4, 4, 4, 4, 4, 12, 12, 8, 12, 8, 12, 8, },
+ },
+ {
+ {11, 7, 7, 7, 4, 7, 15, 11, 15, 11, 8, 15, 11, 7, 9, 7, },
+ { 2, 7, 10, 6, 6, 6, 6, 14, 10, 14, 10, 14, 10, 11, 8, 6, },
+ { 0, 3, 9, 5, 5, 5, 5, 13, 9, 13, 9, 13, 9, 6, 10, 5, },
+ { 0, 0, 5, 4, 6, 8, 4, 4, 4, 12, 8, 12, 12, 8, 1, 4, },
+ },
+ {
+ {15, 11, 8, 15, 11, 9, 8, 15, 11, 15, 11, 8, 13, 9, 5, 1, },
+ {14, 15, 12, 10, 8, 14, 10, 14, 14, 10, 14, 10, 7, 12, 8, 4, },
+ { 0, 13, 14, 11, 9, 13, 9, 13, 10, 13, 9, 13, 9, 11, 7, 3, },
+ { 0, 0, 12, 11, 10, 9, 8, 13, 12, 12, 12, 8, 12, 10, 6, 2, },
+ },
+};
+
+const UWORD8 gu1_size_coeff_token_table[3][4][16] =
+{
+ {
+ { 6, 8, 9, 10, 11, 13, 13, 13, 14, 14, 15, 15, 16, 16, 16, 16, },
+ { 2, 6, 8, 9, 10, 11, 13, 13, 14, 14, 15, 15, 15, 16, 16, 16, },
+ { 0, 3, 7, 8, 9, 10, 11, 13, 13, 14, 14, 15, 15, 16, 16, 16, },
+ { 0, 0, 5, 6, 7, 8, 9, 10, 11, 13, 14, 14, 15, 15, 16, 16, },
+ },
+ {
+ { 6, 6, 7, 8, 8, 9, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, },
+ { 2, 5, 6, 6, 7, 8, 9, 11, 11, 12, 12, 13, 13, 14, 14, 14, },
+ { 0, 3, 6, 6, 7, 8, 9, 11, 11, 12, 12, 13, 13, 13, 14, 14, },
+ { 0, 0, 4, 4, 5, 6, 6, 7, 9, 11, 11, 12, 13, 13, 13, 14, },
+ },
+ {
+ { 6, 6, 6, 7, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 10, },
+ { 4, 5, 5, 5, 5, 6, 6, 7, 8, 8, 9, 9, 9, 10, 10, 10, },
+ { 0, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 10, },
+ { 0, 0, 4, 4, 4, 4, 4, 5, 6, 7, 8, 8, 9, 10, 10, 10, },
+ },
+};
+const UWORD8 gu1_code_coeff_token_table_chroma[4][4] =
+{
+ { 7, 4, 3, 2, },
+ { 1, 6, 3, 3, },
+ { 0, 1, 2, 2, },
+ { 0, 0, 5, 0, },
+};
+
+const UWORD8 gu1_size_coeff_token_table_chroma[4][4] =
+{
+ { 6, 6, 6, 6, },
+ { 1, 6, 7, 8, },
+ { 0, 3, 7, 8, },
+ { 0, 0, 6, 7, },
+};
+
+/**
+ ******************************************************************************
+ * @brief After encoding the current Level, to encode the next level, the choice
+ * of VLC table needs to be updated. The update is carried out based on a set of thresholds.
+ * These thresholds are listed in the table below for lookup.
+ * input : suffix_length
+ * output : threshold
+ ******************************************************************************
+ */
+const UWORD8 gu1_threshold_vlc_level[6] =
+{
+ 0, 3, 6, 12, 24, 48
+};
+
+
+/**
+ ******************************************************************************
+ * @brief table for encoding total number of zeros
+ * input : coeff_token, total zeros
+ * output : code word, size of the code word
+ * @remarks Table-9-7, 9-8 total_zeros tables for 4x4 blocks with
+ * TotalCoeff( coeff_token ) in H264 spec
+ ******************************************************************************
+ */
+const UWORD8 gu1_size_zero_table[135] =
+{
+ 1, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 9,
+ 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6,
+ 4, 3, 3, 3, 4, 4, 3, 3, 4, 5, 5, 6, 5, 6,
+ 5, 3, 4, 4, 3, 3, 3, 4, 3, 4, 5, 5, 5,
+ 4, 4, 4, 3, 3, 3, 3, 3, 4, 5, 4, 5,
+ 6, 5, 3, 3, 3, 3, 3, 3, 4, 3, 6,
+ 6, 5, 3, 3, 3, 2, 3, 4, 3, 6,
+ 6, 4, 5, 3, 2, 2, 3, 3, 6,
+ 6, 6, 4, 2, 2, 3, 2, 5,
+ 5, 5, 3, 2, 2, 2, 4,
+ 4, 4, 3, 3, 1, 3,
+ 4, 4, 2, 1, 3,
+ 3, 3, 1, 2,
+ 2, 2, 1,
+ 1, 1,
+};
+const UWORD8 gu1_code_zero_table[135] =
+{
+ 1, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 1,
+ 7, 6, 5, 4, 3, 5, 4, 3, 2, 3, 2, 3, 2, 1, 0,
+ 5, 7, 6, 5, 4, 3, 4, 3, 2, 3, 2, 1, 1, 0,
+ 3, 7, 5, 4, 6, 5, 4, 3, 3, 2, 2, 1, 0,
+ 5, 4, 3, 7, 6, 5, 4, 3, 2, 1, 1, 0,
+ 1, 1, 7, 6, 5, 4, 3, 2, 1, 1, 0,
+ 1, 1, 5, 4, 3, 3, 2, 1, 1, 0,
+ 1, 1, 1, 3, 3, 2, 2, 1, 0,
+ 1, 0, 1, 3, 2, 1, 1, 1,
+ 1, 0, 1, 3, 2, 1, 1,
+ 0, 1, 1, 2, 1, 3,
+ 0, 1, 1, 1, 1,
+ 0, 1, 1, 1,
+ 0, 1, 1,
+ 0, 1,
+};
+const UWORD8 gu1_size_zero_table_chroma[9] =
+{
+ 1, 2, 3, 3,
+ 1, 2, 2,
+ 1, 1,
+};
+const UWORD8 gu1_code_zero_table_chroma[9] =
+{
+ 1, 1, 1, 0,
+ 1, 1, 0,
+ 1, 0,
+};
+
+/**
+ ******************************************************************************
+ * @brief index to access zero table (look up)
+ * input : TotalCoeff( coeff_token )
+ * output : index to access zero table
+ ******************************************************************************
+ */
+const UWORD8 gu1_index_zero_table[15] =
+{
+ 0, 16, 31, 45, 58, 70, 81, 91, 100, 108, 115, 121, 126, 130, 133,
+};
+
+/**
+ ******************************************************************************
+ * @brief table for encoding the run of zeros before each coefficient (run_before)
+ * input : zeros left, runs of zeros before
+ * output : code word, size of the code word
+ * @remarks Table-9-10 table for run_before in H264 spec
+ ******************************************************************************
+ */
+const UWORD8 gu1_size_run_table[42] =
+{
+ 1, 1,
+ 1, 2, 2,
+ 2, 2, 2, 2,
+ 2, 2, 2, 3, 3,
+ 2, 2, 3, 3, 3, 3,
+ 2, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+};
+const UWORD8 gu1_code_run_table[42] =
+{
+ 1, 0,
+ 1, 1, 0,
+ 3, 2, 1, 0,
+ 3, 2, 1, 1, 0,
+ 3, 2, 3, 2, 1, 0,
+ 3, 0, 1, 3, 2, 5, 4,
+ 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+};
+/**
+ ******************************************************************************
+ * @brief index to access the run_before tables (look up)
+ * input : zeros left
+ * output : index to access the run_before tables
+ ******************************************************************************
+ */
+const UWORD8 gu1_index_run_table[7] =
+{
+ 0, 2, 5, 9, 14, 20, 27,
+};
diff --git a/common/ih264_cavlc_tables.h b/common/ih264_cavlc_tables.h
new file mode 100755
index 0000000..78057b5
--- /dev/null
+++ b/common/ih264_cavlc_tables.h
@@ -0,0 +1,133 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+******************************************************************************
+* @file ih264_cavlc_tables.h
+*
+* @brief
+* This file contains enumerations, macros and extern declarations of H264
+* cavlc tables
+*
+* @author
+* Ittiam
+*
+* @remarks
+* none
+******************************************************************************
+*/
+
+#ifndef IH264_CAVLC_TABLES_H_
+#define IH264_CAVLC_TABLES_H_
+
+/*****************************************************************************/
+/* Constant Macros */
+/*****************************************************************************/
+/**
+******************************************************************************
+ * @brief maximum zeros left
+******************************************************************************
+ */
+#define MAX_ZERO_LEFT 6
+
+/*****************************************************************************/
+/* Extern global declarations */
+/*****************************************************************************/
+
+/**
+ ******************************************************************************
+ * @brief Assignment of cbp to a codenum for intra and inter prediction modes
+ * chroma format idc != 0
+ * input : cbp, intra - 0/inter - 1
+ * output : codenum
+ * @remarks Table 9-4 – Assignment of codeNum to values of coded_block_pattern
+ * for macroblock prediction modes in H264 spec
+ ******************************************************************************
+ */
+extern const UWORD8 gu1_cbp_map_tables[48][2];
+
+/**
+ ******************************************************************************
+ * @brief total non-zero coefficients and numbers of trailing ones of a residual
+ * block are mapped to coefftoken using the tables given below.
+ * input : VLC-Num | Trailing ones | Total coeffs
+ * output : coeff_token (code word, size of the code word)
+ * @remarks Table-9-5 coeff_token mapping to TotalCoeff( coeff_token )
+ * and TrailingOnes( coeff_token ) in H264 spec
+ ******************************************************************************
+ */
+extern const UWORD8 gu1_code_coeff_token_table[3][4][16];
+extern const UWORD8 gu1_size_coeff_token_table[3][4][16];
+extern const UWORD8 gu1_code_coeff_token_table_chroma[4][4];
+extern const UWORD8 gu1_size_coeff_token_table_chroma[4][4];
+
+/**
+ ******************************************************************************
+ * @brief Thresholds for determining whether to increment Level table number.
+ * input : suffix_length
+ * output : threshold
+ ******************************************************************************
+ */
+extern const UWORD8 gu1_threshold_vlc_level[6];
+
+/**
+ ******************************************************************************
+ * @brief table for encoding total number of zeros
+ * input : coeff_token, total zeros
+ * output : code word, size of the code word
+ * @remarks Table-9-7, 9-8 total_zeros tables for 4x4 blocks with
+ * TotalCoeff( coeff_token ) in H264 spec
+ ******************************************************************************
+ */
+extern const UWORD8 gu1_size_zero_table[135];
+extern const UWORD8 gu1_code_zero_table[135];
+extern const UWORD8 gu1_size_zero_table_chroma[9];
+extern const UWORD8 gu1_code_zero_table_chroma[9];
+
+/**
+ ******************************************************************************
+ * @brief index to access zero table (for speed)
+ * input : TotalCoeff( coeff_token )
+ * output : index to access zero table
+ ******************************************************************************
+ */
+extern const UWORD8 gu1_index_zero_table[15];
+
+/**
+ ******************************************************************************
+ * @brief table for encoding runs of zeros before
+ * input : zeros left, runs of zeros before
+ * output : code word, size of the code word
+ * @remarks Table-9-10 table for run_before in H264 spec
+ ******************************************************************************
+ */
+extern const UWORD8 gu1_size_run_table[42];
+extern const UWORD8 gu1_code_run_table[42];
+
+/**
+ ******************************************************************************
+ * @brief index to access run table (look up)
+ * input : zeros left
+ * output : index to access run table
+ ******************************************************************************
+ */
+extern const UWORD8 gu1_index_run_table[7];
+
+#endif /* IH264_CAVLC_TABLES_H_ */
diff --git a/common/ih264_chroma_intra_pred_filters.c b/common/ih264_chroma_intra_pred_filters.c
new file mode 100755
index 0000000..ee145e5
--- /dev/null
+++ b/common/ih264_chroma_intra_pred_filters.c
@@ -0,0 +1,478 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_chroma_intra_pred_filters.c
+*
+* @brief
+* Contains function definitions for chroma intra prediction filters
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+* -ih264_intra_pred_chroma_8x8_mode_dc
+* -ih264_intra_pred_chroma_8x8_mode_horz
+* -ih264_intra_pred_chroma_8x8_mode_vert
+* -ih264_intra_pred_chroma_8x8_mode_plane
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* System include files */
+#include <stdio.h>
+#include <stddef.h>
+#include <string.h>
+
+/* User include files */
+#include "ih264_defs.h"
+#include "ih264_typedefs.h"
+#include "ih264_macros.h"
+#include "ih264_platform_macros.h"
+#include "ih264_intra_pred_filters.h"
+
+/* Global variables used only in assembly files*/
+const WORD8 ih264_gai1_intrapred_chroma_plane_coeffs1[] =
+{ 0x01,0x00,0x01,0x00,
+ 0x02,0x00,0x02,0x00,
+ 0x03,0x00,0x03,0x00,
+ 0x04,0x00,0x04,0x00
+};
+ const WORD8 ih264_gai1_intrapred_chroma_plane_coeffs2[] =
+ { 0xfd,0xff,0xfe,0xff,
+ 0xff,0xff,0x00,0x00,
+ 0x01,0x00,0x02,0x00,
+ 0x03,0x00,0x04,0x00,
+ };
+
+/*****************************************************************************/
+/* Chroma Intra prediction 8x8 filters */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* ih264_intra_pred_chroma_8x8_mode_dc
+*
+* @brief
+* Perform Intra prediction for chroma_8x8 mode:DC
+*
+* @par Description:
+* Perform Intra prediction for chroma_8x8 mode:DC ,described in sec 8.3.4.1
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source containing alternate U and V samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination with alternate U and V samples
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+** @param[in] ngbr_avail
+* availability of neighbouring pixels
+*
+* @returns
+*
+* @remarks
+* None
+*
+******************************************************************************
+*/
+void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ngbr_avail)
+{
+ WORD32 left_avail, left_avail1, left_avail2; /* availability of left predictors (only for DC) */
+ WORD32 top_avail; /* availability of top predictors (only for DC) */
+ UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
+ UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
+ UNUSED(src_strd);
+
+ /* temporary variables to store accumulated first left half,second left half,
+ * first top half,second top half of U and V values*/
+ WORD32 val_u_l1 = 0, val_u_l2 = 0, val_u_t1 = 0, val_u_t2 = 0;
+ WORD32 val_v_l1 = 0, val_v_l2 = 0, val_v_t1 = 0, val_v_t2 = 0;
+
+ WORD32 val_u1 = 0, val_u2 = 0, val_v1 = 0, val_v2 = 0;
+
+ WORD32 col, row; /*loop variables*/
+
+ left_avail = ngbr_avail & 0x11;
+ left_avail1 = ngbr_avail & 1;
+ left_avail2 = (ngbr_avail >> 4) & 1;
+ top_avail = (ngbr_avail >> 2) & 1;
+
+ pu1_top = pu1_src + 2 * BLK8x8SIZE + 2;
+ pu1_left = pu1_src + 2 * BLK8x8SIZE - 2;
+
+ if(left_avail1)
+ { /* First 4x4 block*/
+ val_u_l1 += *pu1_left;
+ val_v_l1 += *(pu1_left + 1);
+ pu1_left -= 2;
+ val_u_l1 += *pu1_left;
+ val_v_l1 += *(pu1_left + 1);
+ pu1_left -= 2;
+ val_u_l1 += *pu1_left;
+ val_v_l1 += *(pu1_left + 1);
+ pu1_left -= 2;
+ val_u_l1 += *pu1_left + 2;
+ val_v_l1 += *(pu1_left + 1) + 2;
+ pu1_left -= 2;
+ }
+ else
+ pu1_left -= 2 * 4;
+
+ if(left_avail2)
+ {
+ /* Second 4x4 block*/
+ val_u_l2 += *pu1_left;
+ val_v_l2 += *(pu1_left + 1);
+ pu1_left -= 2;
+ val_u_l2 += *pu1_left;
+ val_v_l2 += *(pu1_left + 1);
+ pu1_left -= 2;
+ val_u_l2 += *pu1_left;
+ val_v_l2 += *(pu1_left + 1);
+ pu1_left -= 2;
+ val_u_l2 += *pu1_left + 2;
+ val_v_l2 += *(pu1_left + 1) + 2;
+ pu1_left -= 2;
+ }
+ else
+ pu1_left -= 2 * 4;
+
+ if(top_avail)
+ {
+ val_u_t1 += *pu1_top + *(pu1_top + 2) + *(pu1_top + 4)
+ + *(pu1_top + 6) + 2;
+ val_u_t2 += *(pu1_top + 8) + *(pu1_top + 10) + *(pu1_top + 12)
+ + *(pu1_top + 14) + 2;
+ val_v_t1 += *(pu1_top + 1) + *(pu1_top + 3) + *(pu1_top + 5)
+ + *(pu1_top + 7) + 2;
+ val_v_t2 += *(pu1_top + 9) + *(pu1_top + 11) + *(pu1_top + 13)
+ + *(pu1_top + 15) + 2;
+ }
+
+ if(left_avail + top_avail)
+ {
+ val_u1 = (left_avail1 + top_avail) ?
+ ((val_u_l1 + val_u_t1)
+ >> (1 + left_avail1 + top_avail)) :128;
+ val_v1 = (left_avail1 + top_avail) ?
+ ((val_v_l1 + val_v_t1)
+ >> (1 + left_avail1 + top_avail)) :128;
+ if(top_avail)
+ {
+ val_u2 = val_u_t2 >> 2;
+ val_v2 = val_v_t2 >> 2;
+ }
+ else if(left_avail1)
+ {
+ val_u2 = val_u_l1 >> 2;
+ val_v2 = val_v_l1 >> 2;
+ }
+ else
+ {
+ val_u2 = val_v2 = 128;
+ }
+
+ for(row = 0; row < 4; row++)
+ {
+ /*top left 4x4 block*/
+ for(col = 0; col < 8; col += 2)
+ {
+ *(pu1_dst + row * dst_strd + col) = val_u1;
+ *(pu1_dst + row * dst_strd + col + 1) = val_v1;
+ }
+ /*top right 4x4 block*/
+ for(col = 8; col < 16; col += 2)
+ {
+ *(pu1_dst + row * dst_strd + col) = val_u2;
+ *(pu1_dst + row * dst_strd + col + 1) = val_v2;
+ }
+ }
+
+ if(left_avail2)
+ {
+ val_u1 = val_u_l2 >> 2;
+ val_v1 = val_v_l2 >> 2;
+ }
+ else if(top_avail)
+ {
+ val_u1 = val_u_t1 >> 2;
+ val_v1 = val_v_t1 >> 2;
+ }
+ else
+ {
+ val_u1 = val_v1 = 128;
+ }
+ val_u2 = (left_avail2 + top_avail) ?
+ ((val_u_l2 + val_u_t2)
+ >> (1 + left_avail2 + top_avail)) : 128;
+ val_v2 = (left_avail2 + top_avail) ?
+ ((val_v_l2 + val_v_t2)
+ >> (1 + left_avail2 + top_avail)) : 128;
+
+ for(row = 4; row < 8; row++)
+ { /*bottom left 4x4 block*/
+ for(col = 0; col < 8; col += 2)
+ {
+ *(pu1_dst + row * dst_strd + col) = val_u1;
+ *(pu1_dst + row * dst_strd + col + 1) = val_v1;
+ }
+ /*bottom right 4x4 block*/
+ for(col = 8; col < 16; col += 2)
+ {
+ *(pu1_dst + row * dst_strd + col) = val_u2;
+ *(pu1_dst + row * dst_strd + col + 1) = val_v2;
+ }
+ }
+ }
+ else
+ {
+ /* Both left and top are unavailable, set the block to 128 */
+ for(row = 0; row < 8; row++)
+ {
+ memset(pu1_dst + row * dst_strd, 128, 8 * sizeof(UWORD16));
+ }
+ }
+}
+
+/**
+*******************************************************************************
+*
+*ih264_intra_pred_chroma_8x8_mode_horz
+*
+* @brief
+* Perform Intra prediction for chroma_8x8 mode:Horizontal
+*
+* @par Description:
+* Perform Intra prediction for chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source containing alternate U and V samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination with alternate U and V samples
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] ngbr_avail
+* availability of neighbouring pixels(Not used in this function)
+*
+* @returns
+*
+* @remarks
+* None
+*
+******************************************************************************
+*/
+void ih264_intra_pred_chroma_8x8_mode_horz(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ngbr_avail)
+{
+
+ UWORD8 *pu1_left = NULL; /* Pointer to start of top predictors */
+ WORD32 rows, cols; /* loop variables*/
+ UNUSED(src_strd);
+ UNUSED(ngbr_avail);
+ pu1_left = pu1_src + 2 * BLK8x8SIZE - 2;
+ for(rows = 0; rows < 8; rows++)
+ {
+ for(cols = 0; cols < 16; cols += 2)
+ {
+ *(pu1_dst + rows * dst_strd + cols) = *pu1_left;
+
+ *(pu1_dst + rows * dst_strd + cols + 1) = *(pu1_left + 1);
+ }
+ pu1_left -= 2;
+ }
+
+}
+
+/**
+*******************************************************************************
+*
+*ih264_intra_pred_chroma_8x8_mode_vert
+*
+* @brief
+* Perform Intra prediction for chroma_8x8 mode:vertical
+*
+* @par Description:
+* Perform Intra prediction for chroma_8x8 mode:vertical ,described in sec 8.3.4.3
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source containing alternate U and V samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination with alternate U and V samples
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] ngbr_avail
+* availability of neighbouring pixels(Not used in this function)
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+void ih264_intra_pred_chroma_8x8_mode_vert(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ngbr_avail)
+{
+
+ UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
+ WORD32 row;/*loop variable*/
+ UNUSED(src_strd);
+ UNUSED(ngbr_avail);
+ pu1_top = pu1_src + 2 * BLK8x8SIZE + 2;
+
+ /* 8 bytes are copied from src to dst */
+ for(row = 0; row < 2; row++)
+ {
+ memcpy(pu1_dst, pu1_top, 16);
+
+ pu1_dst += dst_strd;
+ memcpy(pu1_dst, pu1_top, 16);
+
+ pu1_dst += dst_strd;
+ memcpy(pu1_dst, pu1_top, 16);
+
+ pu1_dst += dst_strd;
+ memcpy(pu1_dst, pu1_top, 16);
+
+ pu1_dst += dst_strd;
+ }
+}
+
+/**
+*******************************************************************************
+*
+* ih264_intra_pred_chroma_8x8_mode_plane
+*
+* @brief
+* Perform Intra prediction for chroma_8x8 mode:PLANE
+*
+* @par Description:
+* Perform Intra prediction for chroma_8x8 mode:PLANE ,described in sec 8.3.4.4
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source containing alternate U and V samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination with alternate U and V samples
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] ngbr_avail
+* availability of neighbouring pixels(Not used in this function)
+*
+* @returns
+*
+* @remarks
+* None
+*
+******************************************************************************
+*/
+void ih264_intra_pred_chroma_8x8_mode_plane(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ngbr_avail)
+{
+
+ UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
+ UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
+ WORD32 val = 0;
+ WORD32 rows, cols; /* loop variables*/
+ WORD32 a_u, b_u, c_u, h_u, v_u; /* Implementing section 8.3.4.4 . The variables represent the corresponding variables in the section*/
+ WORD32 a_v, b_v, c_v, h_v, v_v;
+ UNUSED(src_strd);
+ UNUSED(ngbr_avail);
+ a_u = b_u = c_u = h_u = v_u = 0;
+ a_v = b_v = c_v = h_v = v_v = 0;
+ /* As chroma format 4:2:0 is used,xCF = 4 * ( chroma_format_idc = = 3 ) = 0 and
+ yCF = 4 * ( chroma_format_idc != 1 ) = 0 */
+ pu1_top = pu1_src + 2 * BLK8x8SIZE + 2;
+ pu1_left = pu1_src + 2 * BLK8x8SIZE - 2;
+ /* Implementing section 8.3.4.4 */
+ for(cols = 0; cols < 4; cols++)
+ {
+ h_u += (cols + 1) * (pu1_top[8 + 2 * cols] - pu1_top[4 - 2 * cols]);/*section 8.3.4.4 equation (8-144)*/
+ h_v += (cols + 1) * (pu1_top[8 + 2 * cols + 1] - pu1_top[4 - 2 * cols+ 1]);
+
+ v_u += (cols + 1) * (pu1_left[(4 + cols) * (-2)] - pu1_left[(2 - cols) * (-2)]);
+ v_v += (cols + 1) * (pu1_left[(4 + cols) * (-2) + 1] - pu1_left[(2 - cols) * (-2) + 1]);/*section 8.3.4.4 equation (8-145)*/
+ }
+ a_u = 16 * (pu1_left[7 * (-2)] + pu1_top[14]);
+ a_v = 16 * (pu1_left[7 * (-2) + 1] + pu1_top[15]);/*section 8.3.3.4 equation (8-141)*/
+ b_u = (34 * h_u + 32) >> 6;/*section 8.3.3.4 equation (8-142)*/
+ b_v = (34 * h_v + 32) >> 6;/*section 8.3.3.4 equation (8-142)*/
+ c_u = (34 * v_u + 32) >> 6;/*section 8.3.3.4 equation (8-143)*/
+ c_v = (34 * v_v + 32) >> 6;/*section 8.3.3.4 equation (8-143)*/
+
+ for(rows = 0; rows < 8; rows++)
+ {
+ for(cols = 0; cols < 8; cols++)
+ {
+ val = (a_u + b_u * (cols - 3) + c_u * (rows - 3) );/*section 8.3.4.4 equation (8-140)*/
+ val = (val + 16) >> 5;
+ *(pu1_dst + rows * dst_strd + 2 * cols) = CLIP_U8(val);
+ val = (a_v + b_v * (cols - 3) + c_v * (rows - 3) );/*section 8.3.4.4 equation (8-140)*/
+ val = (val + 16) >> 5;
+ *(pu1_dst + rows * dst_strd + 2 * cols + 1) = CLIP_U8(val);
+ }
+ }
+}
+
diff --git a/common/ih264_common_tables.c b/common/ih264_common_tables.c
new file mode 100755
index 0000000..c53c276
--- /dev/null
+++ b/common/ih264_common_tables.c
@@ -0,0 +1,725 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_common_tables.c
+*
+* @brief
+* Contains common global tables
+*
+* @author
+* Harish M
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* User include files */
+#include "ih264_typedefs.h"
+#include "ih264_defs.h"
+#include "ih264_macros.h"
+#include "ih264_structs.h"
+#include "ih264_common_tables.h"
+
+
+/*****************************************************************************/
+/* Extern global definitions */
+/*****************************************************************************/
+
+/**
+ ******************************************************************************
+ * @brief while encoding, basing on the input configuration parameters, the
+ * the level of the bitstream is computed basing on the table below.
+ * input : table_idx
+ * output : level_idc or cpb size
+ * @remarks Table A-1 – level table limits
+ ******************************************************************************
+ */
+const level_tables_t gas_ih264_lvl_tbl[16] =
+{
+ { IH264_LEVEL_10, 1485, 99, 297, 64, 175, 64 },
+ { IH264_LEVEL_11, 1485, 99, 297, 128, 350, 64 },
+ { IH264_LEVEL_1B, 3000, 396, 675, 192, 500, 128 },
+ { IH264_LEVEL_12, 6000, 396, 1782, 384, 1000, 128 },
+ { IH264_LEVEL_13, 11880, 396, 1782, 768, 2000, 128 },
+ { IH264_LEVEL_20, 11880, 396, 1782, 2000, 2000, 128 },
+ { IH264_LEVEL_21, 19800, 792, 3564, 4000, 4000, 256 },
+ { IH264_LEVEL_22, 20250, 1620, 6075, 4000, 4000, 256 },
+ { IH264_LEVEL_30, 40500, 1620, 6075, 10000, 10000, 256 },
+ { IH264_LEVEL_31, 108000, 3600, 13500, 14000, 14000, 512 },
+ { IH264_LEVEL_32, 216000, 5120, 15360, 20000, 20000, 512 },
+ { IH264_LEVEL_40, 245760, 8192, 24576, 20000, 25000, 512 },
+ { IH264_LEVEL_41, 245760, 8192, 24576, 50000, 62500, 512 },
+ { IH264_LEVEL_42, 522240, 8704, 26112, 50000, 62500, 512 },
+ { IH264_LEVEL_50, 589824, 22080, 82800, 135000, 135000, 512 },
+ { IH264_LEVEL_51, 983040, 36864, 138240, 240000, 240000, 512 },
+};
+
+
+/**
+ * Array containing supported levels
+ */
+const WORD32 gai4_ih264_levels[] =
+{
+ IH264_LEVEL_10,
+ IH264_LEVEL_11,
+ IH264_LEVEL_12,
+ IH264_LEVEL_13,
+ IH264_LEVEL_20,
+ IH264_LEVEL_21,
+ IH264_LEVEL_22,
+ IH264_LEVEL_30,
+ IH264_LEVEL_31,
+ IH264_LEVEL_32,
+ IH264_LEVEL_40,
+ IH264_LEVEL_41,
+ IH264_LEVEL_42,
+ IH264_LEVEL_50,
+ IH264_LEVEL_51,
+};
+
+
+/**
+ * Array giving size of max luma samples in a picture for a given level
+ */
+const WORD32 gai4_ih264_max_luma_pic_size[] =
+{
+ /* Level 1 */
+ 25344,
+ /* Level 1.1 */
+ 101376,
+ /* Level 1.2 */
+ 101376,
+ /* Level 1.3 */
+ 101376,
+ /* Level 2 */
+ 101376,
+ /* Level 2.1 */
+ 202752,
+ /* Level 2.2 */
+ 414720,
+ /* Level 3 */
+ 414720,
+ /* Level 3.1 */
+ 921600,
+ /* Level 3.1 */
+ 1310720,
+ /* Level 4 */
+ 2097152,
+ /* Level 4.1 */
+ 2097152,
+ /* Level 4.2 */
+ 2228224,
+ /* Level 5 */
+ 5652480,
+ /* Level 5.1 */
+ 9437184
+};
+
+
+/** Max width and height allowed for a given level */
+/** This is derived as SQRT(8 * gai4_ih264_max_luma_pic_size[]) */
+const WORD32 gai4_ih264_max_wd_ht[] =
+{
+ /* Level 1 */
+ 451,
+ /* Level 1.1 */
+ 901,
+ /* Level 1.2 */
+ 901,
+ /* Level 1.3 */
+ 901,
+ /* Level 2 */
+ 901,
+ /* Level 2.1 */
+ 1274,
+ /* Level 2.2 */
+ 1822,
+ /* Level 3 */
+ 1822,
+ /* Level 3.1 */
+ 2716,
+ /* Level 3.2 */
+ 3239,
+ /* Level 4 */
+ 4096,
+ /* Level 4.1 */
+ 4096,
+ /* Level 4.2 */
+ 4223,
+ /* Level 5 */
+ 6725,
+ /* Level 5.1 */
+ 8689
+};
+
+/** Min width and height allowed for a given level */
+/** This is derived as gai4_ih264_max_luma_pic_size[]/gai4_ih264_max_wd_ht[] */
+const WORD32 gai4_ih264_min_wd_ht[] =
+{
+ /* Level 1 */
+ 57,
+ /* Level 1.1 */
+ 113,
+ /* Level 1.2 */
+ 113,
+ /* Level 1.3 */
+ 113,
+ /* Level 2 */
+ 113,
+ /* Level 2.1 */
+ 160,
+ /* Level 2.2 */
+ 228,
+ /* Level 3 */
+ 228,
+ /* Level 3.1 */
+ 340,
+ /* Level 3.2 */
+ 405,
+ /* Level 4 */
+ 512,
+ /* Level 4.1 */
+ 512,
+ /* Level 4.2 */
+ 528,
+ /* Level 5 */
+ 841,
+ /* Level 5.1 */
+ 1087
+
+};
+
+
+/** Table 7-11 Macroblock types for I slices */
+intra_mbtype_info_t gas_ih264_i_mbtype_info[] =
+{
+ /* For first entry, if transform_size_8x8_flag is 1, mode will be MBPART_I8x8 */
+ /* This has to be taken care while accessing the table */
+ {0, MBPART_I4x4, VERT_I16x16, 0, 0},
+ {0, MBPART_I16x16, VERT_I16x16, 0, 0},
+ {0, MBPART_I16x16, HORZ_I16x16, 0, 0},
+ {0, MBPART_I16x16, DC_I16x16, 0, 0},
+ {0, MBPART_I16x16, PLANE_I16x16, 0, 0},
+ {0, MBPART_I16x16, VERT_I16x16, 1, 0},
+ {0, MBPART_I16x16, HORZ_I16x16, 1, 0},
+ {0, MBPART_I16x16, DC_I16x16, 1, 0},
+ {0, MBPART_I16x16, PLANE_I16x16, 1, 0},
+ {0, MBPART_I16x16, VERT_I16x16, 2, 0},
+ {0, MBPART_I16x16, HORZ_I16x16, 2, 0},
+ {0, MBPART_I16x16, DC_I16x16, 2, 0},
+ {0, MBPART_I16x16, PLANE_I16x16, 2, 0},
+ {0, MBPART_I16x16, VERT_I16x16, 0, 15},
+ {0, MBPART_I16x16, HORZ_I16x16, 0, 15},
+ {0, MBPART_I16x16, DC_I16x16, 0, 15},
+ {0, MBPART_I16x16, PLANE_I16x16, 0, 15},
+ {0, MBPART_I16x16, VERT_I16x16, 1, 15},
+ {0, MBPART_I16x16, HORZ_I16x16, 1, 15},
+ {0, MBPART_I16x16, DC_I16x16, 1, 15},
+ {0, MBPART_I16x16, PLANE_I16x16, 1, 15},
+ {0, MBPART_I16x16, VERT_I16x16, 2, 15},
+ {0, MBPART_I16x16, HORZ_I16x16, 2, 15},
+ {0, MBPART_I16x16, DC_I16x16, 2, 15},
+ {0, MBPART_I16x16, PLANE_I16x16, 2, 15},
+ {0, MBPART_IPCM, VERT_I16x16, 0, 0}
+};
+
+/** Table 7-13 Macroblock types for P slices */
+inter_mbtype_info_t gas_ih264_p_mbtype_info[] =
+{
+ {1, MBPART_L0, MBPART_NA, 16, 16},
+ {2, MBPART_L0, MBPART_L0, 16, 8},
+ {2, MBPART_L0, MBPART_L0, 8, 16},
+ {4, MBPART_NA, MBPART_NA, 8, 8},
+ {4, MBPART_NA, MBPART_NA, 8, 8},
+};
+
+/** Table 7-14 Macroblock types for B slices */
+inter_mbtype_info_t gas_ih264_b_mbtype_info[] =
+{
+ {0, MBPART_DIRECT, MBPART_NA, 8, 8, },
+ {1, MBPART_L0, MBPART_NA, 16, 16, },
+ {1, MBPART_L1, MBPART_NA, 16, 16, },
+ {1, MBPART_BI, MBPART_NA, 16, 16, },
+ {2, MBPART_L0, MBPART_L0, 16, 8, },
+ {2, MBPART_L0, MBPART_L0, 8, 16, },
+ {2, MBPART_L1, MBPART_L1, 16, 8, },
+ {2, MBPART_L1, MBPART_L1, 8, 16, },
+ {2, MBPART_L0, MBPART_L1, 16, 8, },
+ {2, MBPART_L0, MBPART_L1, 8, 16, },
+ {2, MBPART_L1, MBPART_L0, 16, 8, },
+ {2, MBPART_L1, MBPART_L0, 8, 16, },
+ {2, MBPART_L0, MBPART_BI, 16, 8, },
+ {2, MBPART_L0, MBPART_BI, 8, 16, },
+ {2, MBPART_L1, MBPART_BI, 16, 8, },
+ {2, MBPART_L1, MBPART_BI, 8, 16, },
+ {2, MBPART_BI, MBPART_L0, 16, 8, },
+ {2, MBPART_BI, MBPART_L0, 8, 16, },
+ {2, MBPART_BI, MBPART_L1, 16, 8, },
+ {2, MBPART_BI, MBPART_L1, 8, 16, },
+ {2, MBPART_BI, MBPART_BI, 16, 8, },
+ {2, MBPART_BI, MBPART_BI, 8, 16, },
+ {4, MBPART_NA, MBPART_NA, 8, 8, },
+};
+
+/** Table 7-17 – Sub-macroblock types in P macroblocks */
+submbtype_info_t gas_ih264_p_submbtype_info[] =
+{
+ {1, MBPART_L0, 8, 8},
+ {2, MBPART_L0, 8, 4},
+ {2, MBPART_L0, 4, 8},
+ {4, MBPART_L0, 4, 4},
+};
+
+/** Table 7-18 – Sub-macroblock types in B macroblocks */
+submbtype_info_t gas_ih264_b_submbtype_info[] =
+{
+ {4, MBPART_DIRECT, 4, 4},
+ {1, MBPART_L0, 8, 8},
+ {1, MBPART_L1, 8, 8},
+ {1, MBPART_BI, 8, 8},
+ {2, MBPART_L0, 8, 4},
+ {2, MBPART_L0, 4, 8},
+ {2, MBPART_L1, 8, 4},
+ {2, MBPART_L1, 4, 8},
+ {2, MBPART_BI, 8, 4},
+ {2, MBPART_BI, 4, 8},
+ {4, MBPART_L0, 4, 4},
+ {4, MBPART_L1, 4, 4},
+ {4, MBPART_BI, 4, 4},
+};
+
+
+
+
+const UWORD8 gau1_ih264_inv_scan_prog4x4[] =
+{
+ 0, 1, 4, 8,
+ 5, 2, 3, 6,
+ 9, 12, 13, 10,
+ 7, 11, 14, 15
+};
+
+const UWORD8 gau1_ih264_inv_scan_int4x4[] =
+{
+ 0, 4, 1, 8,
+ 12, 5, 9, 13,
+ 2, 6, 10, 14,
+ 3, 7, 11, 15
+};
+
+/** Inverse scan tables for individual 4x4 blocks of 8x8 transform coeffs of CAVLC */
+/* progressive */
+const UWORD8 gau1_ih264_inv_scan_prog8x8_cavlc[64] =
+{
+ 0, 9, 17, 18, 12, 40, 27, 7,
+ 35, 57, 29, 30, 58, 38, 53, 47,
+ 1, 2, 24, 11, 19, 48, 20, 14,
+ 42, 50, 22, 37, 59, 31, 60, 55,
+ 8, 3, 32, 4, 26, 41, 13, 21,
+ 49, 43, 15, 44, 52, 39, 61, 62,
+ 16, 10, 25, 5, 33, 34, 6, 28,
+ 56, 36, 23, 51, 45, 46, 54, 63
+};
+
+/* interlace */
+const UWORD8 gau1_ih264_inv_scan_int8x8_cavlc[64] =
+{
+ 0, 9, 2, 56, 18, 26, 34, 27,
+ 35, 28, 36, 29, 45, 7, 54, 39,
+ 8, 24, 25, 33, 41, 11, 42, 12,
+ 43, 13, 44, 14, 53, 15, 62, 47,
+ 16, 32, 40, 10, 49, 4, 50, 5,
+ 51, 6, 52, 22, 61, 38, 23, 55,
+ 1, 17, 48, 3, 57, 19, 58, 20,
+ 59, 21, 60, 37, 30, 46, 31, 63
+};
+
+
+
+/*Inverse scan tables for individual 8x8 blocks of 8x8 transform coeffs of CABAC */
+/* progressive */
+
+const UWORD8 gau1_ih264_inv_scan_prog8x8_cabac[64] =
+{
+ 0, 1, 8, 16, 9, 2, 3, 10,
+ 17, 24, 32, 25, 18, 11, 4, 5,
+ 12, 19, 26, 33, 40, 48, 41, 34,
+ 27, 20, 13, 6, 7, 14, 21, 28,
+ 35, 42, 49, 56, 57, 50, 43, 36,
+ 29, 22, 15, 23, 30, 37, 44, 51,
+ 58, 59, 52, 45, 38, 31, 39, 46,
+ 53, 60, 61, 54, 47, 55, 62, 63
+};
+
+
+/* interlace */
+
+const UWORD8 gau1_ih264_inv_scan_int8x8_cabac[64] =
+{
+ 0, 8, 16, 1, 9, 24, 32, 17,
+ 2, 25, 40, 48, 56, 33, 10, 3,
+ 18, 41, 49, 57, 26, 11, 4, 19,
+ 34, 42, 50, 58, 27, 12, 5, 20,
+ 35, 43, 51, 59, 28, 13, 6, 21,
+ 36, 44, 52, 60, 29, 14, 22, 37,
+ 45, 53, 61, 30, 7, 15, 38, 46,
+ 54, 62, 23, 31, 39, 47, 55, 63
+};
+
+
+const UWORD8 *gpau1_ih264_inv_scan8x8[] =
+{
+ gau1_ih264_inv_scan_prog8x8_cavlc,
+ gau1_ih264_inv_scan_int8x8_cavlc,
+ gau1_ih264_inv_scan_prog8x8_cabac,
+ gau1_ih264_inv_scan_int8x8_cabac
+};
+
+const UWORD8 *gpau1_ih264_inv_scan4x4[] =
+{
+ gau1_ih264_inv_scan_prog4x4,
+ gau1_ih264_inv_scan_int4x4,
+};
+
+const UWORD8 gau1_ih264_8x8_subblk_idx[] =
+{
+ 0, 1, 4, 5,
+ 2, 3, 6, 7,
+ 8, 9, 12, 13,
+ 10, 11, 14, 15
+};
+
+
+/* Table 8-15 Chroma QP offset table */
+const UWORD8 gau1_ih264_chroma_qp[] =
+{
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 29, 30,
+ 31, 32, 32, 33, 34, 34, 35, 35,
+ 36, 36, 37, 37, 37, 38, 38, 38,
+ 39, 39, 39, 39
+};
+
+
+/**
+******************************************************************************
+* @brief look up table to compute neigbour availability of 4x4 blocks
+* input : subblk idx, mb neighbor availability
+* output : sub blk neighbor availability
+* @remarks
+******************************************************************************
+*/
+const UWORD8 gau1_ih264_4x4_ngbr_avbl[16][16] =
+{
+ { 0x0, 0x1, 0xc, 0x7, 0x1, 0x1, 0xf, 0x7, 0xc, 0xf, 0xc, 0x7, 0xf, 0x7, 0xf, 0x7 },
+ { 0x1, 0x1, 0xf, 0x7, 0x1, 0x1, 0xf, 0x7, 0xf, 0xf, 0xf, 0x7, 0xf, 0x7, 0xf, 0x7 },
+ { 0x2, 0x1, 0xc, 0x7, 0x1, 0x1, 0xf, 0x7, 0xc, 0xf, 0xc, 0x7, 0xf, 0x7, 0xf, 0x7 },
+ { 0x3, 0x1, 0xf, 0x7, 0x1, 0x1, 0xf, 0x7, 0xf, 0xf, 0xf, 0x7, 0xf, 0x7, 0xf, 0x7 },
+
+ { 0xc, 0xf, 0xc, 0x7, 0xf, 0x7, 0xf, 0x7, 0xc, 0xf, 0xc, 0x7, 0xf, 0x7, 0xf, 0x7 },
+ { 0xd, 0xf, 0xf, 0x7, 0xf, 0x7, 0xf, 0x7, 0xf, 0xf, 0xf, 0x7, 0xf, 0x7, 0xf, 0x7 },
+ { 0xe, 0xf, 0xc, 0x7, 0xf, 0x7, 0xf, 0x7, 0xc, 0xf, 0xc, 0x7, 0xf, 0x7, 0xf, 0x7 },
+ { 0xf, 0xf, 0xf, 0x7, 0xf, 0x7, 0xf, 0x7, 0xf, 0xf, 0xf, 0x7, 0xf, 0x7, 0xf, 0x7 },
+
+ { 0x0, 0x1, 0xc, 0x7, 0x1, 0x9, 0xf, 0x7, 0xc, 0xf, 0xc, 0x7, 0xf, 0x7, 0xf, 0x7 },
+ { 0x1, 0x1, 0xf, 0x7, 0x1, 0x9, 0xf, 0x7, 0xf, 0xf, 0xf, 0x7, 0xf, 0x7, 0xf, 0x7 },
+ { 0x2, 0x1, 0xc, 0x7, 0x1, 0x9, 0xf, 0x7, 0xc, 0xf, 0xc, 0x7, 0xf, 0x7, 0xf, 0x7 },
+ { 0x3, 0x1, 0xf, 0x7, 0x1, 0x9, 0xf, 0x7, 0xf, 0xf, 0xf, 0x7, 0xf, 0x7, 0xf, 0x7 },
+
+ { 0xc, 0xf, 0xc, 0x7, 0xf, 0xf, 0xf, 0x7, 0xc, 0xf, 0xc, 0x7, 0xf, 0x7, 0xf, 0x7 },
+ { 0xd, 0xf, 0xf, 0x7, 0xf, 0xf, 0xf, 0x7, 0xf, 0xf, 0xf, 0x7, 0xf, 0x7, 0xf, 0x7 },
+ { 0xe, 0xf, 0xc, 0x7, 0xf, 0xf, 0xf, 0x7, 0xc, 0xf, 0xc, 0x7, 0xf, 0x7, 0xf, 0x7 },
+ { 0xf, 0xf, 0xf, 0x7, 0xf, 0xf, 0xf, 0x7, 0xf, 0xf, 0xf, 0x7, 0xf, 0x7, 0xf, 0x7 },
+};
+
+
+/**
+******************************************************************************
+* @brief look up table to compute neighbour availability of 8x8 blocks
+* input : subblk idx, mb neighbor availability
+* output : sub blk neighbor availability
+* @remarks row = mb availability, column = 8x8 sub-block index; each entry packs the neighbour flags in its low nibble (bit order as consumed by intra prediction - verify against callers)
+******************************************************************************
+*/
+const UWORD8 gau1_ih264_8x8_ngbr_avbl[16][4] =
+{
+ { 0x0, 0x1, 0xc, 0x7 },
+ { 0x1, 0x1, 0xf, 0x7 },
+ { 0x2, 0x1, 0xc, 0x7 },
+ { 0x3, 0x1, 0xf, 0x7 },
+
+ { 0xc, 0x7, 0xc, 0x7 },
+ { 0xd, 0x7, 0xf, 0x7 },
+ { 0xe, 0x7, 0xc, 0x7 },
+ { 0xf, 0x7, 0xf, 0x7 },
+
+ { 0x0, 0x9, 0xc, 0x7 },
+ { 0x1, 0x9, 0xf, 0x7 },
+ { 0x2, 0x9, 0xc, 0x7 },
+ { 0x3, 0x9, 0xf, 0x7 },
+
+ { 0xc, 0xf, 0xc, 0x7 },
+ { 0xd, 0xf, 0xf, 0x7 },
+ { 0xe, 0xf, 0xc, 0x7 },
+ { 0xf, 0xf, 0xf, 0x7 },
+};
+
+/** Table 7-3 Default intra 4x4 scaling list (stored in scan order) */
+const UWORD16 gau2_ih264_default_intra4x4_scaling_list[] =
+{
+ 6, 13, 13, 20,
+ 20, 20, 28, 28,
+ 28, 28, 32, 32,
+ 32, 37, 37, 42
+};
+
+/** Table 7-3 Default inter 4x4 scaling list (stored in scan order) */
+const UWORD16 gau2_ih264_default_inter4x4_scaling_list[] =
+{
+ 10, 14, 14, 20,
+ 20, 20, 24, 24,
+ 24, 24, 27, 27,
+ 27, 30, 30, 34
+};
+
+/* Inverse scanned (raster order) output of gau2_ih264_default_intra4x4_scaling_list */
+const UWORD16 gau2_ih264_default_intra4x4_weight_scale[] =
+{
+ 6, 13, 20, 28,
+ 13, 20, 28, 32,
+ 20, 28, 32, 37,
+ 28, 32, 37, 42
+};
+
+/* Inverse scanned (raster order) output of gau2_ih264_default_inter4x4_scaling_list */
+const UWORD16 gau2_ih264_default_inter4x4_weight_scale[] =
+{
+ 10, 14, 20, 24,
+ 14, 20, 24, 27,
+ 20, 24, 27, 30,
+ 24, 27, 30, 34
+};
+
+/** Table 7-4 Default intra 8x8 scaling list (stored in scan order) */
+const UWORD16 gau2_ih264_default_intra8x8_scaling_list[] =
+{
+ 6, 10, 10, 13, 11, 13, 16, 16,
+ 16, 16, 18, 18, 18, 18, 18, 23,
+ 23, 23, 23, 23, 23, 25, 25, 25,
+ 25, 25, 25, 25, 27, 27, 27, 27,
+ 27, 27, 27, 27, 29, 29, 29, 29,
+ 29, 29, 29, 31, 31, 31, 31, 31,
+ 31, 33, 33, 33, 33, 33, 36, 36,
+ 36, 36, 38, 38, 38, 40, 40, 42
+};
+
+/** Table 7-4 Default inter 8x8 scaling list (stored in scan order) */
+const UWORD16 gau2_ih264_default_inter8x8_scaling_list[] =
+{
+ 9, 13, 13, 15, 13, 15, 17, 17,
+ 17, 17, 19, 19, 19, 19, 19, 21,
+ 21, 21, 21, 21, 21, 22, 22, 22,
+ 22, 22, 22, 22, 24, 24, 24, 24,
+ 24, 24, 24, 24, 25, 25, 25, 25,
+ 25, 25, 25, 27, 27, 27, 27, 27,
+ 27, 28, 28, 28, 28, 28, 30, 30,
+ 30, 30, 32, 32, 32, 33, 33, 35
+};
+
+/* Inverse scanned (raster order) output of gau2_ih264_default_intra8x8_scaling_list */
+const UWORD16 gau2_ih264_default_intra8x8_weight_scale[] =
+{
+ 6, 10, 13, 16, 18, 23, 25, 27,
+ 10, 11, 16, 18, 23, 25, 27, 29,
+ 13, 16, 18, 23, 25, 27, 29, 31,
+ 16, 18, 23, 25, 27, 29, 31, 33,
+ 18, 23, 25, 27, 29, 31, 33, 36,
+ 23, 25, 27, 29, 31, 33, 36, 38,
+ 25, 27, 29, 31, 33, 36, 38, 40,
+ 27, 29, 31, 33, 36, 38, 40, 42
+};
+
+/* Inverse scanned (raster order) output of gau2_ih264_default_inter8x8_scaling_list */
+const UWORD16 gau2_ih264_default_inter8x8_weight_scale[] =
+{
+ 9, 13, 15, 17, 19, 21, 22, 24,
+ 13, 13, 17, 19, 21, 22, 24, 25,
+ 15, 17, 19, 21, 22, 24, 25, 27,
+ 17, 19, 21, 22, 24, 25, 27, 28,
+ 19, 21, 22, 24, 25, 27, 28, 30,
+ 21, 22, 24, 25, 27, 28, 30, 32,
+ 22, 24, 25, 27, 28, 30, 32, 33,
+ 24, 25, 27, 28, 30, 32, 33, 35
+};
+/* Eq 7-8 Flat (all entries 16) scaling matrix for 4x4 */
+const UWORD16 gau2_ih264_flat_4x4_weight_scale[] =
+{
+ 16, 16, 16, 16,
+ 16, 16, 16, 16,
+ 16, 16, 16, 16,
+ 16, 16, 16, 16
+};
+
+/* Eq 7-9 Flat (all entries 16) scaling matrix for 8x8 */
+const UWORD16 gau2_ih264_flat_8x8_weight_scale[] =
+{
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16
+};
+
+
+/**
+ ******************************************************************************
+ * @brief Scale Table for inverse quantizing 4x4 subblock. To inverse quantize
+ * a given 4x4 quantized block, the coefficient at index location (i,j) is scaled
+ * by one of the constants in this table and right shift the result by abs(4 -
+ * floor(qp/6)), here qp is the quantization parameter used to quantize the mb.
+ *
+ * input : row offset 16 * (qp % 6), index location (i,j)
+ * output : scale constant.
+ *
+ * @remarks 16 constants for each index position of the subblock, one group of
+ * 16 for each qp%6 in the range 0-5 inclusive.
+ ******************************************************************************
+ */
+
+const UWORD16 gau2_ih264_iquant_scale_matrix_4x4[96] =
+{
+ 10, 13, 10, 13,
+ 13, 16, 13, 16,
+ 10, 13, 10, 13,
+ 13, 16, 13, 16,
+
+ 11, 14, 11, 14,
+ 14, 18, 14, 18,
+ 11, 14, 11, 14,
+ 14, 18, 14, 18,
+
+ 13, 16, 13, 16,
+ 16, 20, 16, 20,
+ 13, 16, 13, 16,
+ 16, 20, 16, 20,
+
+ 14, 18, 14, 18,
+ 18, 23, 18, 23,
+ 14, 18, 14, 18,
+ 18, 23, 18, 23,
+
+ 16, 20, 16, 20,
+ 20, 25, 20, 25,
+ 16, 20, 16, 20,
+ 20, 25, 20, 25,
+
+ 18, 23, 18, 23,
+ 23, 29, 23, 29,
+ 18, 23, 18, 23,
+ 23, 29, 23, 29,
+
+};
+
+/**
+ ******************************************************************************
+ * @brief Scale Table for inverse quantizing 8x8 subblock. To inverse quantize
+ * a given 8x8 quantized block, the coefficient at index location (i,j) is scaled
+ * by one of the constants in this table and right shift the result by abs(4 -
+ * floor(qp/6)), here qp is the quantization parameter used to quantize the mb.
+ *
+ * input : row offset 64 * (qp % 6), index location (i,j)
+ * output : scale constant.
+ *
+ * @remarks 64 constants for each index position of the subblock, one group of
+ * 64 for each qp%6 in the range 0-5 inclusive.
+ ******************************************************************************
+ */
+const UWORD16 gau2_ih264_iquant_scale_matrix_8x8 [384] =
+{
+ 20, 19, 25, 19, 20, 19, 25, 19,
+ 19, 18, 24, 18, 19, 18, 24, 18,
+ 25, 24, 32, 24, 25, 24, 32, 24,
+ 19, 18, 24, 18, 19, 18, 24, 18,
+ 20, 19, 25, 19, 20, 19, 25, 19,
+ 19, 18, 24, 18, 19, 18, 24, 18,
+ 25, 24, 32, 24, 25, 24, 32, 24,
+ 19, 18, 24, 18, 19, 18, 24, 18,
+
+ 22, 21, 28, 21, 22, 21, 28, 21,
+ 21, 19, 26, 19, 21, 19, 26, 19,
+ 28, 26, 35, 26, 28, 26, 35, 26,
+ 21, 19, 26, 19, 21, 19, 26, 19,
+ 22, 21, 28, 21, 22, 21, 28, 21,
+ 21, 19, 26, 19, 21, 19, 26, 19,
+ 28, 26, 35, 26, 28, 26, 35, 26,
+ 21, 19, 26, 19, 21, 19, 26, 19,
+
+ 26, 24, 33, 24, 26, 24, 33, 24,
+ 24, 23, 31, 23, 24, 23, 31, 23,
+ 33, 31, 42, 31, 33, 31, 42, 31,
+ 24, 23, 31, 23, 24, 23, 31, 23,
+ 26, 24, 33, 24, 26, 24, 33, 24,
+ 24, 23, 31, 23, 24, 23, 31, 23,
+ 33, 31, 42, 31, 33, 31, 42, 31,
+ 24, 23, 31, 23, 24, 23, 31, 23,
+
+ 28, 26, 35, 26, 28, 26, 35, 26,
+ 26, 25, 33, 25, 26, 25, 33, 25,
+ 35, 33, 45, 33, 35, 33, 45, 33,
+ 26, 25, 33, 25, 26, 25, 33, 25,
+ 28, 26, 35, 26, 28, 26, 35, 26,
+ 26, 25, 33, 25, 26, 25, 33, 25,
+ 35, 33, 45, 33, 35, 33, 45, 33,
+ 26, 25, 33, 25, 26, 25, 33, 25,
+
+ 32, 30, 40, 30, 32, 30, 40, 30,
+ 30, 28, 38, 28, 30, 28, 38, 28,
+ 40, 38, 51, 38, 40, 38, 51, 38,
+ 30, 28, 38, 28, 30, 28, 38, 28,
+ 32, 30, 40, 30, 32, 30, 40, 30,
+ 30, 28, 38, 28, 30, 28, 38, 28,
+ 40, 38, 51, 38, 40, 38, 51, 38,
+ 30, 28, 38, 28, 30, 28, 38, 28,
+
+ 36, 34, 46, 34, 36, 34, 46, 34,
+ 34, 32, 43, 32, 34, 32, 43, 32,
+ 46, 43, 58, 43, 46, 43, 58, 43,
+ 34, 32, 43, 32, 34, 32, 43, 32,
+ 36, 34, 46, 34, 36, 34, 46, 34,
+ 34, 32, 43, 32, 34, 32, 43, 32,
+ 46, 43, 58, 43, 46, 43, 58, 43,
+ 34, 32, 43, 32, 34, 32, 43, 32,
+
+};
diff --git a/common/ih264_common_tables.h b/common/ih264_common_tables.h
new file mode 100755
index 0000000..3127a2c
--- /dev/null
+++ b/common/ih264_common_tables.h
@@ -0,0 +1,136 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_common_tables.h
+*
+* @brief
+* Common tables
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IH264_COMMON_TABLES_H_
+#define _IH264_COMMON_TABLES_H_
+
+
+/*****************************************************************************/
+/* Structures */
+/*****************************************************************************/
+
+/**
+******************************************************************************
+ * @brief per-level limit values (one entry per level; cf. Annex A Table A-1)
+******************************************************************************
+ */
+typedef struct
+{
+ /* level identifier (level_idc) */
+ IH264_LEVEL_T u4_level_idc;
+
+ /* max macroblock processing rate (macroblocks per second) */
+ UWORD32 u4_max_mbps;
+
+ /* max frame size in mbs */
+ UWORD32 u4_max_fs;
+
+ /* max dpb size / 768 (units per Table A-1 - confirm against spec) */
+ UWORD32 u4_max_dpb_size;
+
+ /* max bit rate */
+ UWORD32 u4_max_br;
+
+ /* max cpb size */
+ UWORD32 u4_max_cpb_size;
+
+ /* max vertical MV component range */
+ UWORD32 u4_max_mv_y;
+
+}level_tables_t;
+
+/*****************************************************************************/
+/* Extern global declarations */
+/*****************************************************************************/
+
+/**
+ ******************************************************************************
+ * @brief while encoding, based on the input configuration parameters, the
+ * level of the bitstream is computed using the table below.
+ * input : table_idx
+ * output : level_idc or cpb size
+ * @remarks Table A-1 - level table limits
+ ******************************************************************************
+ */
+extern const level_tables_t gas_ih264_lvl_tbl[16];
+
+extern const WORD32 gai4_ih264_levels[];
+extern const WORD32 gai4_ih264_max_luma_pic_size[];
+extern const WORD32 gai4_ih264_max_wd_ht[];
+extern const WORD32 gai4_ih264_min_wd_ht[];
+
+extern intra_mbtype_info_t gas_ih264_i_mbtype_info[];
+extern inter_mbtype_info_t gas_ih264_p_mbtype_info[];
+extern inter_mbtype_info_t gas_ih264_b_mbtype_info[];
+extern submbtype_info_t gas_ih264_p_submbtype_info[];
+extern submbtype_info_t gas_ih264_b_submbtype_info[];
+
+
+extern const UWORD8 gau1_ih264_inv_scan_prog4x4[];
+extern const UWORD8 gau1_ih264_inv_scan_int4x4[];
+extern const UWORD8 gau1_ih264_inv_scan_prog8x8_cavlc[64];
+extern const UWORD8 gau1_ih264_inv_scan_int8x8_cavlc[64];
+extern const UWORD8 gau1_ih264_inv_scan_prog8x8_cabac[64];
+extern const UWORD8 gau1_ih264_inv_scan_int8x8_cabac[64];
+
+extern const UWORD8 *gpau1_ih264_inv_scan8x8[];
+extern const UWORD8 *gpau1_ih264_inv_scan4x4[];
+
+extern const UWORD8 gau1_ih264_8x8_subblk_idx[];
+
+extern const UWORD8 gau1_ih264_chroma_qp[];
+
+extern const UWORD8 gau1_ih264_4x4_ngbr_avbl[16][16];
+extern const UWORD8 gau1_ih264_8x8_ngbr_avbl[16][4];
+
+
+extern const UWORD16 gau2_ih264_default_inter4x4_weight_scale[];
+extern const UWORD16 gau2_ih264_default_intra4x4_weight_scale[];
+extern const UWORD16 gau2_ih264_default_intra4x4_scaling_list[];
+extern const UWORD16 gau2_ih264_default_inter4x4_scaling_list[];
+extern const UWORD16 gau2_ih264_default_intra8x8_scaling_list[];
+extern const UWORD16 gau2_ih264_default_inter8x8_scaling_list[];
+extern const UWORD16 gau2_ih264_default_intra8x8_weight_scale[];
+extern const UWORD16 gau2_ih264_default_inter8x8_weight_scale[];
+extern const UWORD16 gau2_ih264_flat_4x4_weight_scale[];
+extern const UWORD16 gau2_ih264_flat_8x8_weight_scale[];
+
+extern const UWORD16 gau2_ih264_iquant_scale_matrix_4x4 [96];
+extern const UWORD16 gau2_ih264_iquant_scale_matrix_8x8 [384];
+
+#endif /*_IH264_COMMON_TABLES_H_*/
diff --git a/common/ih264_deblk_edge_filters.c b/common/ih264_deblk_edge_filters.c
new file mode 100755
index 0000000..d2ffefd
--- /dev/null
+++ b/common/ih264_deblk_edge_filters.c
@@ -0,0 +1,2087 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**************************************************************************** */
+/* */
+/* File Name : ih264_deblk_edge_filters.c */
+/* */
+/* Description : Contains function definitions for deblocking */
+/* */
+/* List of Functions : ih264_deblk_luma_vert_bs4() */
+/* ih264_deblk_luma_horz_bs4() */
+/* ih264_deblk_luma_vert_bslt4() */
+/* ih264_deblk_luma_horz_bslt4() */
+/* ih264_deblk_luma_vert_bs4_mbaff() */
+/* ih264_deblk_luma_vert_bslt4_mbaff() */
+/* ih264_deblk_chroma_vert_bs4_bp() */
+/* ih264_deblk_chroma_horz_bs4_bp() */
+/* ih264_deblk_chroma_vert_bslt4_bp() */
+/* ih264_deblk_chroma_horz_bslt4_bp() */
+/* ih264_deblk_chroma_vert_bs4_mbaff_bp() */
+/* ih264_deblk_chroma_vert_bslt4_mbaff_bp() */
+/* ih264_deblk_chroma_vert_bs4() */
+/* ih264_deblk_chroma_horz_bs4() */
+/* ih264_deblk_chroma_vert_bslt4() */
+/* ih264_deblk_chroma_horz_bslt4() */
+/* ih264_deblk_chroma_vert_bs4_mbaff() */
+/* ih264_deblk_chroma_vert_bslt4_mbaff() */
+/* */
+/* Issues / Problems : None */
+/* */
+/* Revision History : */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 28 11 2013 Ittiam Draft */
+/* 29 12 2014 Kaushik Added double-call vertical */
+/* Senthoor deblocking and high profile */
+/* deblocking functions */
+/* */
+/******************************************************************************/
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* System include files */
+#include <stdio.h>
+
+/* User include files */
+#include "ih264_typedefs.h"
+#include "ih264_platform_macros.h"
+#include "ih264_deblk_edge_filters.h"
+#include "ih264_macros.h"
+
+/*****************************************************************************/
+/* Function Definitions */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_luma_vert_bs4() */
+/* */
+/* Description : This function performs filtering of a luma block */
+/* vertical edge when the boundary strength is set to 4. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 */
+/* src_strd - source stride */
+/* alpha - alpha value for the boundary */
+/* beta - beta value for the boundary */
+/* */
+/* Globals : None */
+/* */
+/* Processing : This operation is described in Sec. 8.7.2.4 under the */
+/* title "Filtering process for edges for bS equal to 4" in */
+/* ITU T Rec H.264. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 28 11 2013 Ittiam Draft */
+/* */
+/*****************************************************************************/
+void ih264_deblk_luma_vert_bs4(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha,
+ WORD32 beta)
+{
+ UWORD8 p3, p2, p1, p0, q0, q1, q2, q3;
+ WORD32 pos_p3, pos_p2, pos_p1, pos_p0;
+ WORD32 pos_q0, pos_q1, pos_q2,pos_q3;
+ UWORD8 a_p, a_q; /* side activities |p2 - p0| and |q2 - q0| */
+ WORD32 blk_strd = src_strd << 2; /* block_increment = src_strd * 4 */
+ UWORD8 *pu1_src_temp;
+ WORD8 i = 0, edge;
+
+ /* sample offsets relative to pu1_src: q side to the right, p side left */
+ pos_q0 = 0;
+ pos_q1 = 1;
+ pos_q2 = 2;
+ pos_q3 = 3;
+ pos_p0 = -1;
+ pos_p1 = -2;
+ pos_p2 = -3;
+ pos_p3 = -4;
+
+ for(edge = 0; edge < 4; edge++, pu1_src += blk_strd)
+ {
+ pu1_src_temp = pu1_src;
+ for(i = 0; i < 4; ++i, pu1_src_temp += src_strd)
+ {
+ q0 = pu1_src_temp[pos_q0];
+ q1 = pu1_src_temp[pos_q1];
+ p0 = pu1_src_temp[pos_p0];
+ p1 = pu1_src_temp[pos_p1];
+
+ /* Filter decision: skip rows where the step across the edge
+ exceeds alpha, or either side gradient exceeds beta */
+ if((ABS(p0 - q0) >= alpha) ||
+ (ABS(q1 - q0) >= beta) ||
+ (ABS(p1 - p0) >= beta))
+ continue;
+
+ p2 = pu1_src_temp[pos_p2];
+ p3 = pu1_src_temp[pos_p3];
+ q2 = pu1_src_temp[pos_q2];
+ q3 = pu1_src_temp[pos_q3];
+
+ if(ABS(p0 - q0) < ((alpha >> 2) + 2))
+ {
+ /* small step: strong (3-sample) filtering may be applied */
+ a_p = (UWORD8)ABS(p2 - p0);
+ a_q = (UWORD8)ABS(q2 - q0);
+
+ if(a_p < beta)
+ {
+ /* strong filter, p side: p0', p1', p2' */
+ pu1_src_temp[pos_p0] = ((p2 + X2(p1) + X2(p0) + X2(q0) + q1
+ + 4) >> 3);
+ pu1_src_temp[pos_p1] = ((p2 + p1 + p0 + q0 + 2) >> 2);
+ pu1_src_temp[pos_p2] =
+ ((X2(p3) + X3(p2) + p1 + p0 + q0
+ + 4) >> 3);
+ }
+ else
+ {
+ /* weak filter, p side: p0' only */
+ pu1_src_temp[pos_p0] = ((X2(p1) + p0 + q1 + 2) >> 2);
+ }
+
+ if(a_q < beta)
+ {
+ /* strong filter, q side: q0', q1', q2' */
+ pu1_src_temp[pos_q0] = (p1 + X2(p0) + X2(q0) + X2(q1) + q2
+ + 4) >> 3;
+ pu1_src_temp[pos_q1] = (p0 + q0 + q1 + q2 + 2) >> 2;
+ pu1_src_temp[pos_q2] = (X2(q3) + X3(q2) + q1 + q0 + p0 + 4)
+ >> 3;
+ }
+ else
+ {
+ /* weak filter, q side: q0' only */
+ pu1_src_temp[pos_q0] = (X2(q1) + q0 + p1 + 2) >> 2;
+ }
+ }
+ else
+ {
+ /* larger step (likely a real edge): smooth only p0', q0' */
+ pu1_src_temp[pos_p0] = ((X2(p1) + p0 + q1 + 2) >> 2);
+ pu1_src_temp[pos_q0] = (X2(q1) + q0 + p1 + 2) >> 2;
+ }
+ }
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_luma_horz_bs4() */
+/* */
+/* Description : This function performs filtering of a luma block */
+/* horizontal edge when the boundary strength is set to 4. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 */
+/* src_strd - source stride */
+/* alpha - alpha value for the boundary */
+/* beta - beta value for the boundary */
+/* */
+/* Globals : None */
+/* */
+/* Processing : This operation is described in Sec. 8.7.2.4 under the */
+/* title "Filtering process for edges for bS equal to 4" in */
+/* ITU T Rec H.264. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 28 11 2013 Ittiam Draft */
+/* */
+/*****************************************************************************/
+void ih264_deblk_luma_horz_bs4(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha,
+ WORD32 beta)
+{
+ UWORD8 p3, p2, p1, p0, q0, q1, q2, q3;
+ WORD32 pos_p3, pos_p2, pos_p1, pos_p0, pos_q0, pos_q1,
+ pos_q2, pos_q3;
+ UWORD8 a_p, a_q; /* side activities |p2 - p0| and |q2 - q0| */
+ UWORD8 *pu1_p3; /* points to the row holding p3, 4 rows above pu1_src */
+ UWORD8 *pu1_p3_temp;
+ UWORD8 *pu1_src_temp;
+ WORD8 i = 0, edge;
+
+ pu1_p3 = pu1_src - (src_strd << 2);
+ /* q offsets are relative to pu1_src, p offsets relative to pu1_p3 */
+ pos_q0 = 0;
+ pos_q1 = src_strd;
+ pos_q2 = X2(src_strd);
+ pos_q3 = X3(src_strd);
+ pos_p0 = X3(src_strd);
+ pos_p1 = X2(src_strd);
+ pos_p2 = src_strd;
+ pos_p3 = 0;
+
+ for(edge = 0; edge < 4; edge++, pu1_src += 4, pu1_p3 += 4)
+ {
+ pu1_src_temp = pu1_src;
+ pu1_p3_temp = pu1_p3;
+ for(i = 0; i < 4; ++i, pu1_src_temp++, pu1_p3_temp++)
+ {
+ q0 = pu1_src_temp[pos_q0];
+ q1 = pu1_src_temp[pos_q1];
+ p0 = pu1_p3_temp[pos_p0];
+ p1 = pu1_p3_temp[pos_p1];
+
+ /* Filter decision: skip columns where the step across the edge
+ exceeds alpha, or either side gradient exceeds beta */
+ if((ABS(p0 - q0) >= alpha) ||
+ (ABS(q1 - q0) >= beta) ||
+ (ABS(p1 - p0) >= beta))
+ continue;
+
+ p2 = pu1_p3_temp[pos_p2];
+ p3 = pu1_p3_temp[pos_p3];
+ q2 = pu1_src_temp[pos_q2];
+ q3 = pu1_src_temp[pos_q3];
+
+ if(ABS(p0 - q0) < ((alpha >> 2) + 2))
+ {
+ /* small step: strong (3-sample) filtering may be applied */
+ a_p = ABS(p2 - p0);
+ a_q = ABS(q2 - q0);
+
+ if((a_p < beta))
+ {
+ /* strong filter, p side: p0', p1', p2' */
+ pu1_p3_temp[pos_p0] = (p2 + X2(p1) + X2(p0) + X2(q0) + q1
+ + 4) >> 3;
+ pu1_p3_temp[pos_p1] = (p2 + p1 + p0 + q0 + 2) >> 2;
+ pu1_p3_temp[pos_p2] =
+ (X2(p3) + X3(p2) + p1 + p0 + q0
+ + 4) >> 3;
+ }
+ else
+ {
+ /* weak filter, p side: p0' only */
+ pu1_p3_temp[pos_p0] = (X2(p1) + p0 + q1 + 2) >> 2;
+ }
+
+ if(a_q < beta)
+ {
+ /* strong filter, q side: q0', q1', q2' */
+ pu1_src_temp[pos_q0] = (p1 + X2(p0) + X2(q0) + X2(q1)
+ + q2 + 4) >> 3;
+ pu1_src_temp[pos_q1] = (p0 + q0 + q1 + q2 + 2) >> 2;
+ pu1_src_temp[pos_q2] = (X2(q3) + X3(q2) + q1 + q0 + p0
+ + 4) >> 3;
+ }
+ else
+ {
+ /* weak filter, q side: q0' only */
+ pu1_src_temp[pos_q0] = (X2(q1) + q0 + p1 + 2) >> 2;
+ }
+ }
+ else
+ {
+ /* larger step (likely a real edge): smooth only p0', q0' */
+ pu1_p3_temp[pos_p0] = (X2(p1) + p0 + q1 + 2) >> 2;
+ pu1_src_temp[pos_q0] = (X2(q1) + q0 + p1 + 2) >> 2;
+ }
+ }
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_chroma_vert_bs4_bp() */
+/* */
+/* Description : This function performs filtering of a chroma block */
+/* vertical edge when the boundary strength is set to 4. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 of U */
+/* src_strd - source stride */
+/* alpha - alpha value for the boundary */
+/* beta - beta value for the boundary */
+/* */
+/* Globals : None */
+/* */
+/* Processing : This operation is described in Sec. 8.7.2.4 under the */
+/* title "Filtering process for edges for bS equal to 4" in */
+/* ITU T Rec H.264. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 28 11 2013 Ittiam Draft */
+/* */
+/*****************************************************************************/
+void ih264_deblk_chroma_vert_bs4_bp(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha,
+ WORD32 beta)
+{
+ UWORD8 *pu1_src_u = pu1_src; /* pointer to the src sample q0 of U */
+ UWORD8 *pu1_src_v = pu1_src + 1; /* pointer to the src sample q0 of V */
+ UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v;
+ WORD32 blk_strd = src_strd << 1; /* block_increment = src_strd * 2 */
+ WORD32 pos_p1, pos_p0, pos_q0, pos_q1;
+ UWORD8 *pu1_src_temp_u, *pu1_src_temp_v;
+ WORD8 i = 0, edge;
+
+ /* U and V are interleaved (V at +1), so horizontal steps are 2 bytes */
+ pos_q0 = 0;
+ pos_q1 = 2;
+ pos_p0 = -2;
+ pos_p1 = -4;
+
+ for(edge = 0; edge < 4;
+ edge++, pu1_src_u += blk_strd, pu1_src_v += blk_strd)
+ {
+ pu1_src_temp_u = pu1_src_u;
+ pu1_src_temp_v = pu1_src_v;
+ for(i = 0; i < 2; ++i, pu1_src_temp_u += src_strd, pu1_src_temp_v +=
+ src_strd)
+ {
+ q0_u = pu1_src_temp_u[pos_q0];
+ q1_u = pu1_src_temp_u[pos_q1];
+ p0_u = pu1_src_temp_u[pos_p0];
+ p1_u = pu1_src_temp_u[pos_p1];
+ q0_v = pu1_src_temp_v[pos_q0];
+ q1_v = pu1_src_temp_v[pos_q1];
+ p0_v = pu1_src_temp_v[pos_p0];
+ p1_v = pu1_src_temp_v[pos_p1];
+
+ /* Filter decision for the U plane */
+ if((ABS(p0_u - q0_u) < alpha) &&
+ (ABS(q1_u - q0_u) < beta) &&
+ (ABS(p1_u - p0_u) < beta))
+ {
+ /* p0' */
+ pu1_src_temp_u[pos_p0] = ((X2(p1_u) + p0_u + q1_u + 2) >> 2);
+ /* q0' */
+ pu1_src_temp_u[pos_q0] = (X2(q1_u) + q0_u + p1_u + 2) >> 2;
+ }
+
+ /* Filter decision for the V plane */
+ if((ABS(p0_v - q0_v) < alpha) &&
+ (ABS(q1_v - q0_v) < beta) &&
+ (ABS(p1_v - p0_v) < beta))
+ {
+ /* p0' */
+ pu1_src_temp_v[pos_p0] = ((X2(p1_v) + p0_v + q1_v + 2) >> 2);
+ /* q0' */
+ pu1_src_temp_v[pos_q0] = (X2(q1_v) + q0_v + p1_v + 2) >> 2;
+ }
+ }
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_chroma_horz_bs4_bp() */
+/* */
+/* Description : This function performs filtering of a chroma block */
+/* horizontal edge when the boundary strength is set to 4. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 of U */
+/* src_strd - source stride */
+/* alpha - alpha value for the boundary */
+/* beta - beta value for the boundary */
+/* */
+/* Globals : None */
+/* */
+/* Processing : This operation is described in Sec. 8.7.2.4 under the */
+/* title "Filtering process for edges for bS equal to 4" in */
+/* ITU T Rec H.264. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 28 11 2013 Ittiam Draft */
+/* */
+/*****************************************************************************/
+void ih264_deblk_chroma_horz_bs4_bp(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha,
+ WORD32 beta)
+{
+ UWORD8 *pu1_src_u = pu1_src; /* pointer to the src sample q0 of U */
+ UWORD8 *pu1_src_v = pu1_src + 1; /* pointer to the src sample q0 of V */
+ UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v;
+ WORD32 pos_p1, pos_p0, pos_q0, pos_q1;
+ UWORD8 *pu1_src_temp_u, *pu1_src_temp_v;
+ UWORD8 *pu1_p1_u; /* pointer to the src sample p1 of U */
+ UWORD8 *pu1_p1_v; /* pointer to the src sample p1 of V */
+ UWORD8 *pu1_p1_temp_u, *pu1_p1_temp_v;
+ WORD8 i = 0, edge;
+
+ /* p1 rows sit two lines above the edge; U and V are interleaved */
+ pu1_p1_u = pu1_src_u - (src_strd << 1);
+ pu1_p1_v = pu1_src_v - (src_strd << 1);
+ pos_q0 = 0;
+ pos_q1 = src_strd;
+ pos_p0 = src_strd;
+ pos_p1 = 0;
+
+ for(edge = 0; edge < 4; edge++, pu1_src_u += 4, pu1_p1_u += 4,
+ pu1_src_v += 4, pu1_p1_v += 4)
+ {
+ pu1_src_temp_u = pu1_src_u;
+ pu1_p1_temp_u = pu1_p1_u;
+ pu1_src_temp_v = pu1_src_v;
+ pu1_p1_temp_v = pu1_p1_v;
+ for(i = 0; i < 2; ++i, pu1_src_temp_u += 2, pu1_p1_temp_u += 2,
+ pu1_src_temp_v += 2, pu1_p1_temp_v += 2)
+ {
+ q0_u = pu1_src_temp_u[pos_q0];
+ q1_u = pu1_src_temp_u[pos_q1];
+ p0_u = pu1_p1_temp_u[pos_p0];
+ p1_u = pu1_p1_temp_u[pos_p1];
+
+ q0_v = pu1_src_temp_v[pos_q0];
+ q1_v = pu1_src_temp_v[pos_q1];
+ p0_v = pu1_p1_temp_v[pos_p0];
+ p1_v = pu1_p1_temp_v[pos_p1];
+
+ /* Filter decision for the U plane */
+ if((ABS(p0_u - q0_u) < alpha) &&
+ (ABS(q1_u - q0_u) < beta) &&
+ (ABS(p1_u - p0_u) < beta))
+ {
+ /* p0' */
+ pu1_p1_temp_u[pos_p0] = (X2(p1_u) + p0_u + q1_u + 2) >> 2;
+ /* q0' */
+ pu1_src_temp_u[pos_q0] = (X2(q1_u) + q0_u + p1_u + 2) >> 2;
+ }
+
+ /* Filter decision for the V plane */
+ if((ABS(p0_v - q0_v) < alpha) &&
+ (ABS(q1_v - q0_v) < beta) &&
+ (ABS(p1_v - p0_v) < beta))
+ {
+ /* p0' */
+ pu1_p1_temp_v[pos_p0] = (X2(p1_v) + p0_v + q1_v + 2) >> 2;
+ /* q0' */
+ pu1_src_temp_v[pos_q0] = (X2(q1_v) + q0_v + p1_v + 2) >> 2;
+ }
+ }
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_luma_vert_bslt4() */
+/* */
+/* Description : This function performs filtering of a luma block */
+/* vertical edge when the boundary strength is less than 4. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 */
+/* src_strd - source stride */
+/* alpha - alpha value for the boundary */
+/* beta - beta value for the boundary */
+/* u4_bs - packed Boundary strength array */
+/* pu1_cliptab - tc0_table */
+/* */
+/* Globals : None */
+/* */
+/* Processing : This operation is described in Sec. 8.7.2.3 under the */
+/* title "Filtering process for edges for bS less than 4" */
+/* in ITU T Rec H.264. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 28 11 2013 Ittiam Draft */
+/* */
+/*****************************************************************************/
+void ih264_deblk_luma_vert_bslt4(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha,
+ WORD32 beta,
+ UWORD32 u4_bs,
+ const UWORD8 *pu1_cliptab)
+{
+ WORD8 i = 0, edge;
+ UWORD8 p2, p1, p0, q0, q1, q2;
+ WORD32 pos_p2, pos_p1, pos_p0, pos_q0, pos_q1, pos_q2;
+ UWORD8 a_p, a_q; /* side activities |p2 - p0| and |q2 - q0| */
+ WORD32 blk_strd = src_strd << 2; /* block_increment = src_strd * 4 */
+ UWORD8 *pu1_src_temp;
+ WORD8 delta;
+ WORD8 tc;
+ WORD16 val;
+ UWORD8 tc0, u1_bs;
+
+ /* sample offsets relative to pu1_src: q side to the right, p side left */
+ pos_q0 = 0;
+ pos_q1 = 1;
+ pos_q2 = 2;
+ pos_p0 = -1;
+ pos_p1 = -2;
+ pos_p2 = -3;
+
+ for(edge = 0; edge < 4; edge++, pu1_src += blk_strd)
+ {
+ pu1_src_temp = pu1_src;
+ /* bS for this edge: byte 'edge' of u4_bs, most significant first */
+ u1_bs = (UWORD8)((u4_bs >> ((3 - edge) << 3)) & 0x0ff);
+ if(!u1_bs)
+ continue;
+ /* tc0 from the clip table, indexed by bS */
+ tc0 = pu1_cliptab[u1_bs];
+ for(i = 0; i < 4; ++i, pu1_src_temp += src_strd)
+ {
+ q0 = pu1_src_temp[pos_q0];
+ q1 = pu1_src_temp[pos_q1];
+ p0 = pu1_src_temp[pos_p0];
+ p1 = pu1_src_temp[pos_p1];
+
+ /* Filter decision: skip rows where the step across the edge
+ exceeds alpha, or either side gradient exceeds beta */
+ if((ABS(p0 - q0) >= alpha) ||
+ (ABS(q1 - q0) >= beta) ||
+ (ABS(p1 - p0) >= beta))
+ continue;
+
+ q2 = pu1_src_temp[pos_q2];
+ p2 = pu1_src_temp[pos_p2];
+
+ a_p = ABS(p2 - p0);
+ a_q = ABS(q2 - q0);
+
+ /* tc = tc0 plus one per side whose activity is below beta */
+ tc = tc0 + (a_p < beta) + (a_q < beta);
+
+ val = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3);
+ delta = CLIP3(-tc, tc, val);
+
+ /* p0' */
+ val = p0 + delta;
+ pu1_src_temp[pos_p0] = CLIP_U8(val);
+ /* q0' */
+ val = q0 - delta;
+ pu1_src_temp[pos_q0] = CLIP_U8(val);
+
+ /* luma only: p1/q1 also receive a clipped correction */
+ if(a_p < beta)
+ {
+ /* p1' */
+ val = ((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1);
+ pu1_src_temp[pos_p1] += CLIP3(-tc0, tc0, val);
+ }
+
+ if(a_q < beta)
+ {
+ /* q1' */
+ val = ((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1);
+ pu1_src_temp[pos_q1] += CLIP3(-tc0, tc0, val);
+ }
+ }
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_chroma_vert_bslt4_bp() */
+/* */
+/* Description : This function performs filtering of a chroma block */
+/* vertical edge when the boundary strength is less than 4. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 of U */
+/* src_strd - source stride */
+/* alpha - alpha value for the boundary */
+/* beta - beta value for the boundary */
+/* u4_bs - packed Boundary strength array */
+/* pu1_cliptab - tc0_table */
+/* */
+/* Globals : None */
+/* */
+/* Processing : This operation is described in Sec. 8.7.2.3 under the */
+/* title "Filtering process for edges for bS less than 4" */
+/* in ITU T Rec H.264. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 28 11 2013 Ittiam Draft */
+/* */
+/*****************************************************************************/
+void ih264_deblk_chroma_vert_bslt4_bp(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha,
+ WORD32 beta,
+ UWORD32 u4_bs,
+ const UWORD8 *pu1_cliptab)
+{
+ UWORD8 *pu1_src_u = pu1_src; /* Pointer to the src sample q0 of plane U*/
+ UWORD8 *pu1_src_v = pu1_src + 1; /* Pointer to the src sample q0 of plane V*/
+ UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v;
+ WORD32 blk_strd = src_strd << 1; /* block_increment = src_strd * (4 >> 1)*/
+ WORD32 pos_p1, pos_p0, pos_q0, pos_q1;
+ UWORD8 *pu1_src_temp_u, *pu1_src_temp_v;
+ WORD8 i = 0, edge;
+ WORD8 delta;
+ WORD8 tc;
+ WORD16 val;
+ UWORD8 tc0, u1_bs;
+
+ /* U and V are interleaved (V at +1), so horizontal steps are 2 bytes */
+ pos_q0 = 0;
+ pos_q1 = 2;
+ pos_p0 = -2;
+ pos_p1 = -4;
+
+ for(edge = 0; edge < 4;
+ edge++, pu1_src_u += blk_strd, pu1_src_v += blk_strd)
+ {
+ pu1_src_temp_u = pu1_src_u;
+ pu1_src_temp_v = pu1_src_v;
+ /* bS for this edge: byte 'edge' of u4_bs, most significant first */
+ u1_bs = (UWORD8)((u4_bs >> ((3 - edge) << 3)) & 0x0ff);
+ if(!u1_bs)
+ continue;
+ /* tc0 from the clip table, indexed by bS */
+ tc0 = pu1_cliptab[u1_bs];
+ tc = tc0 + 1; /* chroma: tc is always tc0 + 1 */
+ for(i = 0; i < 2; ++i, pu1_src_temp_u += src_strd, pu1_src_temp_v +=
+ src_strd)
+ {
+ q0_u = pu1_src_temp_u[pos_q0];
+ q1_u = pu1_src_temp_u[pos_q1];
+ p0_u = pu1_src_temp_u[pos_p0];
+ p1_u = pu1_src_temp_u[pos_p1];
+
+ q0_v = pu1_src_temp_v[pos_q0];
+ q1_v = pu1_src_temp_v[pos_q1];
+ p0_v = pu1_src_temp_v[pos_p0];
+ p1_v = pu1_src_temp_v[pos_p1];
+
+ /* Filter decision for the U plane */
+ if((ABS(p0_u - q0_u) < alpha) &&
+ (ABS(q1_u - q0_u) < beta) &&
+ (ABS(p1_u - p0_u) < beta))
+ {
+ val = ((((q0_u - p0_u) << 2) + (p1_u - q1_u) + 4) >> 3);
+ delta = CLIP3(-tc, tc, val);
+ /* p0' */
+ val = p0_u + delta;
+ pu1_src_temp_u[pos_p0] = CLIP_U8(val);
+ /* q0' */
+ val = q0_u - delta;
+ pu1_src_temp_u[pos_q0] = CLIP_U8(val);
+ }
+
+ /* Filter decision for the V plane */
+ if((ABS(p0_v - q0_v) < alpha) &&
+ (ABS(q1_v - q0_v) < beta) &&
+ (ABS(p1_v - p0_v) < beta))
+ {
+ val = ((((q0_v - p0_v) << 2) + (p1_v - q1_v) + 4) >> 3);
+ delta = CLIP3(-tc, tc, val);
+ /* p0' */
+ val = p0_v + delta;
+ pu1_src_temp_v[pos_p0] = CLIP_U8(val);
+ /* q0' */
+ val = q0_v - delta;
+ pu1_src_temp_v[pos_q0] = CLIP_U8(val);
+ }
+ }
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_luma_horz_bslt4() */
+/* */
+/* Description : This function performs filtering of a luma block */
+/* horizontal edge when boundary strength is less than 4. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 */
+/* src_strd - source stride */
+/* alpha - alpha value for the boundary */
+/* beta - beta value for the boundary */
+/* u4_bs - packed Boundary strength array */
+/* pu1_cliptab - tc0_table */
+/* */
+/* Globals : None */
+/* */
+/* Processing : This operation is described in Sec. 8.7.2.3 under the */
+/* title "Filtering process for edges for bS less than 4" */
+/* in ITU T Rec H.264. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 28 11 2013 Ittiam Draft */
+/* */
+/*****************************************************************************/
void ih264_deblk_luma_horz_bslt4(UWORD8 *pu1_src,
                                 WORD32 src_strd,
                                 WORD32 alpha,
                                 WORD32 beta,
                                 UWORD32 u4_bs,
                                 const UWORD8 *pu1_cliptab)
{
    UWORD8 p2, p1, p0, q0, q1, q2;
    WORD32 pos_p2, pos_p1, pos_p0, pos_q0, pos_q1, pos_q2;
    UWORD8 a_p, a_q; /* Threshold variables */
    UWORD8 *pu1_p2; /* Pointer to the src sample p2 */
    UWORD8 *pu1_p2_temp;
    UWORD8 *pu1_src_temp;
    WORD8 i = 0, edge;
    WORD8 delta;
    WORD8 tc;
    WORD16 val;
    UWORD8 tc0, u1_bs;

    /* Two base pointers are used: q-side samples are indexed from        */
    /* pu1_src (row of q0), p-side samples from pu1_p2, which points four */
    /* rows above q0.  Hence pos_p0..pos_p2 are offsets from pu1_p2 and   */
    /* pos_q0..pos_q2 are offsets from pu1_src.                           */
    pu1_p2 = pu1_src - (src_strd << 2);
    pos_q0 = 0;
    pos_q1 = src_strd;
    pos_q2 = X2(src_strd);
    pos_p0 = X3(src_strd);
    pos_p1 = X2(src_strd);
    pos_p2 = src_strd;

    /* 16-pixel edge processed as four 4-pixel segments, one packed bS    */
    /* byte per segment, most significant byte first.                     */
    for(edge = 0; edge < 4; edge++, pu1_src += 4, pu1_p2 += 4)
    {
        pu1_src_temp = pu1_src;
        pu1_p2_temp = pu1_p2;

        /* Filter Decision */
        u1_bs = (UWORD8)((u4_bs >> ((3 - edge) << 3)) & 0x0ff);
        if(!u1_bs)
            continue;
        /* tc0: clipping threshold from the standard's table, indexed by bS */
        tc0 = pu1_cliptab[u1_bs];

        for(i = 0; i < 4; ++i, pu1_src_temp++, pu1_p2_temp++)
        {
            q0 = pu1_src_temp[pos_q0];
            q1 = pu1_src_temp[pos_q1];
            p0 = pu1_p2_temp[pos_p0];
            p1 = pu1_p2_temp[pos_p1];

            /* Filter Decision: skip the column when the edge does not   */
            /* look like a blocking artifact (Sec. 8.7.2.3).             */
            if((ABS(p0 - q0) >= alpha) ||
               (ABS(q1 - q0) >= beta) ||
               (ABS(p1 - p0) >= beta))
                continue;

            q2 = pu1_src_temp[pos_q2];
            p2 = pu1_p2_temp[pos_p2];

            a_p = ABS(p2 - p0);
            a_q = ABS(q2 - q0);

            /* tc: tc0 widened by one for each side that is smooth enough */
            /* to also update p1/q1 below                                 */
            tc = tc0 + (a_p < beta) + (a_q < beta);
            val = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3);
            delta = CLIP3(-tc, tc, val);
            /* p0' */
            val = p0 + delta;
            pu1_p2_temp[pos_p0] = CLIP_U8(val);
            /* q0' */
            val = q0 - delta;
            pu1_src_temp[pos_q0] = CLIP_U8(val);

            /* Luma only: p1/q1 are additionally filtered when the inner  */
            /* sample activity a_p / a_q is below beta.                   */
            if(a_p < beta)
            {
                /* p1' */
                val = ((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1);
                pu1_p2_temp[pos_p1] += CLIP3(-tc0, tc0, val);
            }

            if(a_q < beta)
            {
                /* q1' */
                val = ((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1);
                pu1_src_temp[pos_q1] += CLIP3(-tc0, tc0, val);
            }
        }
    }
}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_chroma_horz_bslt4_bp() */
+/* */
+/* Description : This function performs filtering of a chroma block */
+/* horizontal edge when boundary strength is less than 4. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 of U */
+/* src_strd - source stride */
+/* alpha - alpha value for the boundary */
+/* beta - beta value for the boundary */
+/* u4_bs - packed Boundary strength array */
+/* pu1_cliptab - tc0_table */
+/* */
+/* Globals : None */
+/* */
+/* Processing : This operation is described in Sec. 8.7.2.3 under the */
+/* title "Filtering process for edges for bS less than 4" */
+/* in ITU T Rec H.264. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 28 11 2013 Ittiam Draft */
+/* */
+/*****************************************************************************/
+void ih264_deblk_chroma_horz_bslt4_bp(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha,
+ WORD32 beta,
+ UWORD32 u4_bs,
+ const UWORD8 *pu1_cliptab)
+{
+ UWORD8 *pu1_src_u = pu1_src; /* Pointer to the src sample q0 of plane U*/
+ UWORD8 *pu1_src_v = pu1_src + 1; /* Pointer to the src sample q0 of plane V*/
+ UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v;
+ WORD32 pos_p1, pos_p0, pos_q0, pos_q1;
+ UWORD8 *pu1_src_temp_u, *pu1_src_temp_v;
+ UWORD8 *pu1_p1_u; /* Pointer to the src sample p1 of plane U*/
+ UWORD8 *pu1_p1_v; /* Pointer to the src sample p1 of plane V*/
+ UWORD8 *pu1_p1_temp_u, *pu1_p1_temp_v;
+ WORD8 i = 0, edge;
+ WORD8 delta;
+ WORD8 tc;
+ WORD16 val;
+ UWORD8 u1_bs;
+ UWORD8 tc0;
+
+ pu1_p1_u = pu1_src_u - (src_strd << 1);
+ pu1_p1_v = pu1_src_v - (src_strd << 1);
+ pos_q0 = 0;
+ pos_q1 = src_strd;
+ pos_p0 = src_strd;
+ pos_p1 = 0;
+
+ for(edge = 0; edge < 4; edge++, pu1_src_u += 4, pu1_p1_u += 4,
+ pu1_src_v += 4, pu1_p1_v += 4)
+ {
+ pu1_src_temp_u = pu1_src_u;
+ pu1_p1_temp_u = pu1_p1_u;
+ pu1_src_temp_v = pu1_src_v;
+ pu1_p1_temp_v = pu1_p1_v;
+
+ /* Filter Decision */
+ u1_bs = (UWORD8)((u4_bs >> ((3 - edge) << 3)) & 0x0ff);
+ if(!u1_bs)
+ continue;
+ /* tc0 */
+ tc0 = pu1_cliptab[u1_bs];
+
+ for(i = 0; i < 2; ++i, pu1_src_temp_u += 2, pu1_p1_temp_u += 2,
+ pu1_src_temp_v += 2, pu1_p1_temp_v += 2)
+ {
+ q0_u = pu1_src_temp_u[pos_q0];
+ q1_u = pu1_src_temp_u[pos_q1];
+ p0_u = pu1_p1_temp_u[pos_p0];
+ p1_u = pu1_p1_temp_u[pos_p1];
+
+ q0_v = pu1_src_temp_v[pos_q0];
+ q1_v = pu1_src_temp_v[pos_q1];
+ p0_v = pu1_p1_temp_v[pos_p0];
+ p1_v = pu1_p1_temp_v[pos_p1];
+
+ /* tc */
+ tc = tc0 + 1;
+ /* Filter Decision */
+ if(ABS(p0_u - q0_u) < alpha && ABS(q1_u - q0_u) < beta
+ && ABS(p1_u - p0_u) < beta)
+ {
+ val = ((((q0_u - p0_u) << 2) + (p1_u - q1_u) + 4) >> 3);
+ delta = CLIP3(-tc, tc, val);
+ /* p0' */
+ val = p0_u + delta;
+ pu1_p1_temp_u[pos_p0] = CLIP_U8(val);
+ /* q0' */
+ val = q0_u - delta;
+ pu1_src_temp_u[pos_q0] = CLIP_U8(val);
+ }
+ /* Filter Decision */
+ if(ABS(p0_v - q0_v) < alpha && ABS(q1_v - q0_v) < beta
+ && ABS(p1_v - p0_v) < beta)
+ {
+ val = ((((q0_v - p0_v) << 2) + (p1_v - q1_v) + 4) >> 3);
+ delta = CLIP3(-tc, tc, val);
+ /* p0' */
+ val = p0_v + delta;
+ pu1_p1_temp_v[pos_p0] = CLIP_U8(val);
+ /* q0' */
+ val = q0_v - delta;
+ pu1_src_temp_v[pos_q0] = CLIP_U8(val);
+ }
+ }
+ }
+}
+
+/*****************************************************************************/
+/* Function Definitions for vertical edge deblocking for double-call */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_luma_vert_bs4_mbaff() */
+/* */
+/* Description : This function performs filtering of a luma block */
+/* vertical edge when boundary strength is set to 4. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 */
+/* src_strd - source stride */
+/* alpha - alpha value for the boundary */
+/* beta - beta value for the boundary */
+/* */
+/* Globals : None */
+/* */
+/* Processing : When the function is called twice, this operation is as */
+/* described in Sec. 8.7.2.3 under the title "Filtering */
+/* process for edges for bS equal to 4" in ITU T Rec H.264. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 29 12 2014 Kaushik Draft */
+/* Senthoor */
+/* */
+/*****************************************************************************/
void ih264_deblk_luma_vert_bs4_mbaff(UWORD8 *pu1_src,
                                     WORD32 src_strd,
                                     WORD32 alpha,
                                     WORD32 beta)
{
    UWORD8 p3, p2, p1, p0, q0, q1, q2, q3;
    WORD32 pos_p3, pos_p2, pos_p1, pos_p0;
    WORD32 pos_q0, pos_q1, pos_q2, pos_q3;
    UWORD8 a_p, a_q; /* threshold variables */
    WORD32 blk_strd = src_strd << 1; /* block_increment = src_strd * 2 */
    UWORD8 *pu1_src_temp;
    WORD8 i = 0, edge;

    /* Vertical edge: samples of one filtering line are horizontally      */
    /* adjacent, so the offsets below are simple byte displacements from  */
    /* q0 (negative towards the p side).                                  */
    pos_q0 = 0;
    pos_q1 = 1;
    pos_q2 = 2;
    pos_q3 = 3;
    pos_p0 = -1;
    pos_p1 = -2;
    pos_p2 = -3;
    pos_p3 = -4;

    /* MBAFF double-call variant: each call covers 8 rows, two rows per   */
    /* bS position.                                                       */
    for(edge = 0; edge < 4; edge++, pu1_src += blk_strd)
    {
        pu1_src_temp = pu1_src;
        for(i = 0; i < 2; ++i, pu1_src_temp += src_strd)
        {
            q0 = pu1_src_temp[pos_q0];
            q1 = pu1_src_temp[pos_q1];
            p0 = pu1_src_temp[pos_p0];
            p1 = pu1_src_temp[pos_p1];

            /* Filter Decision: only filter what looks like a blocking    */
            /* artifact (Sec. 8.7.2.3)                                    */
            if((ABS(p0 - q0) >= alpha) ||
               (ABS(q1 - q0) >= beta) ||
               (ABS(p1 - p0) >= beta))
                continue;

            p2 = pu1_src_temp[pos_p2];
            p3 = pu1_src_temp[pos_p3];
            q2 = pu1_src_temp[pos_q2];
            q3 = pu1_src_temp[pos_q3];

            /* Strong-filter gate for bS == 4: small step across the edge */
            if(ABS(p0 - q0) < ((alpha >> 2) + 2))
            {
                /* Threshold Variables: per-side smoothness checks        */
                a_p = (UWORD8)ABS(p2 - p0);
                a_q = (UWORD8)ABS(q2 - q0);

                if(a_p < beta)
                {
                    /* p0', p1', p2': full 4/5-tap strong filtering of    */
                    /* the p side                                         */
                    pu1_src_temp[pos_p0] = ((p2 + X2(p1) + X2(p0) + X2(q0) + q1
                                    + 4) >> 3);
                    pu1_src_temp[pos_p1] = ((p2 + p1 + p0 + q0 + 2) >> 2);
                    pu1_src_temp[pos_p2] =
                                    ((X2(p3) + X3(p2) + p1 + p0 + q0
                                                    + 4) >> 3);
                }
                else
                {
                    /* p0': weaker 3-tap fallback, p1/p2 untouched        */
                    pu1_src_temp[pos_p0] = ((X2(p1) + p0 + q1 + 2) >> 2);
                }

                if(a_q < beta)
                {
                    /* q0', q1', q2': strong filtering of the q side      */
                    pu1_src_temp[pos_q0] = (p1 + X2(p0) + X2(q0) + X2(q1) + q2
                                    + 4) >> 3;
                    pu1_src_temp[pos_q1] = (p0 + q0 + q1 + q2 + 2) >> 2;
                    pu1_src_temp[pos_q2] = (X2(q3) + X3(q2) + q1 + q0 + p0 + 4)
                                    >> 3;
                }
                else
                {
                    /* q0': weaker 3-tap fallback                         */
                    pu1_src_temp[pos_q0] = (X2(q1) + q0 + p1 + 2) >> 2;
                }
            }
            else
            {
                /* p0', q0': large step — likely a real image edge, only  */
                /* the boundary samples are smoothed                      */
                pu1_src_temp[pos_p0] = ((X2(p1) + p0 + q1 + 2) >> 2);
                pu1_src_temp[pos_q0] = (X2(q1) + q0 + p1 + 2) >> 2;
            }
        }
    }
}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_chroma_vert_bs4_mbaff_bp() */
+/* */
+/* Description : This function performs filtering of a chroma block */
+/* vertical edge when boundary strength is set to 4. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 of U */
+/* src_strd - source stride */
+/* alpha - alpha value for the boundary */
+/* beta - beta value for the boundary */
+/* */
+/* Globals : None */
+/* */
+/* Processing : When the function is called twice, this operation is as */
+/* described in Sec. 8.7.2.3 under the title "Filtering */
+/* process for edges for bS equal to 4" in ITU T Rec H.264. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 29 12 2014 Kaushik Draft */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_deblk_chroma_vert_bs4_mbaff_bp(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha,
+ WORD32 beta)
+{
+ UWORD8 *pu1_src_u = pu1_src; /* Pointer to the src sample q0 of U */
+ UWORD8 *pu1_src_v = pu1_src + 1; /* Pointer to the src sample q0 of V */
+ UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v;
+ WORD32 blk_strd = src_strd;
+ WORD32 pos_p1, pos_p0, pos_q0, pos_q1;
+ UWORD8 *pu1_src_temp_u, *pu1_src_temp_v;
+ WORD8 edge;
+
+ pos_q0 = 0;
+ pos_q1 = 2;
+ pos_p0 = -2;
+ pos_p1 = -4;
+
+ for(edge = 0; edge < 4;
+ edge++, pu1_src_u += blk_strd, pu1_src_v += blk_strd)
+ {
+ pu1_src_temp_u = pu1_src_u;
+ pu1_src_temp_v = pu1_src_v;
+
+ q0_u = pu1_src_temp_u[pos_q0];
+ q1_u = pu1_src_temp_u[pos_q1];
+ p0_u = pu1_src_temp_u[pos_p0];
+ p1_u = pu1_src_temp_u[pos_p1];
+ q0_v = pu1_src_temp_v[pos_q0];
+ q1_v = pu1_src_temp_v[pos_q1];
+ p0_v = pu1_src_temp_v[pos_p0];
+ p1_v = pu1_src_temp_v[pos_p1];
+
+ /* Filter Decision */
+ if((ABS(p0_u - q0_u) < alpha) &&
+ (ABS(q1_u - q0_u) < beta) &&
+ (ABS(p1_u - p0_u) < beta))
+ {
+ /* p0' */
+ pu1_src_temp_u[pos_p0] = ((X2(p1_u) + p0_u + q1_u + 2) >> 2);
+ /* q0' */
+ pu1_src_temp_u[pos_q0] = (X2(q1_u) + q0_u + p1_u + 2) >> 2;
+ }
+
+ /* Filter Decision */
+ if(ABS(p0_v - q0_v) < alpha && ABS(q1_v - q0_v) < beta
+ && ABS(p1_v - p0_v) < beta)
+ {
+ /* p0' */
+ pu1_src_temp_v[pos_p0] = ((X2(p1_v) + p0_v + q1_v + 2) >> 2);
+ /* q0' */
+ pu1_src_temp_v[pos_q0] = (X2(q1_v) + q0_v + p1_v + 2) >> 2;
+ }
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_luma_vert_bslt4_mbaff() */
+/* */
+/* Description : This function performs filtering of a luma block */
+/* vertical edge when boundary strength is less than 4. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 */
+/* src_strd - source stride */
+/* alpha - alpha value for the boundary */
+/* beta - beta value for the boundary */
+/* u4_bs - packed Boundary strength array */
+/* pu1_cliptab - tc0_table */
+/* */
+/* Globals : None */
+/* */
+/* Processing : When the function is called twice, this operation is as */
+/* described in Sec. 8.7.2.3 under the title "Filtering */
+/* process for edges for bS less than 4" in ITU T Rec H.264.*/
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 29 12 2014 Kaushik Draft */
+/* Senthoor */
+/* */
+/*****************************************************************************/
void ih264_deblk_luma_vert_bslt4_mbaff(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD32 alpha,
                                       WORD32 beta,
                                       UWORD32 u4_bs,
                                       const UWORD8 *pu1_cliptab)
{
    WORD8 i = 0, edge;
    UWORD8 p2, p1, p0, q0, q1, q2;
    WORD32 pos_p2, pos_p1, pos_p0, pos_q0, pos_q1, pos_q2;
    UWORD8 a_p, a_q; /* Threshold variables */
    WORD32 blk_strd = src_strd << 1; /* block_increment = src_strd * 2 */
    UWORD8 *pu1_src_temp;
    WORD8 delta;
    WORD8 tc;
    WORD16 val;
    UWORD8 tc0, u1_bs;

    /* Vertical edge: samples of one filtering line are horizontally     */
    /* adjacent; offsets are byte displacements from q0.                 */
    pos_q0 = 0;
    pos_q1 = 1;
    pos_q2 = 2;
    pos_p0 = -1;
    pos_p1 = -2;
    pos_p2 = -3;

    /* MBAFF double-call variant: each call covers 8 rows, two rows per  */
    /* bS position (packed MSB first in u4_bs).                          */
    for(edge = 0; edge < 4; edge++, pu1_src += blk_strd)
    {
        pu1_src_temp = pu1_src;
        /* Filter Decision: bS == 0 means no filtering for this segment  */
        u1_bs = (UWORD8)((u4_bs >> ((3 - edge) << 3)) & 0x0ff);
        if(!u1_bs)
            continue;
        /* tc0: clipping threshold from the standard's table             */
        tc0 = pu1_cliptab[u1_bs];
        for(i = 0; i < 2; ++i, pu1_src_temp += src_strd)
        {
            q0 = pu1_src_temp[pos_q0];
            q1 = pu1_src_temp[pos_q1];
            p0 = pu1_src_temp[pos_p0];
            p1 = pu1_src_temp[pos_p1];

            /* Filter Decision: skip rows that do not look like blocking */
            /* artifacts (Sec. 8.7.2.3)                                  */
            if((ABS(p0 - q0) >= alpha) ||
               (ABS(q1 - q0) >= beta) ||
               (ABS(p1 - p0) >= beta))
                continue;

            q2 = pu1_src_temp[pos_q2];
            p2 = pu1_src_temp[pos_p2];

            a_p = ABS(p2 - p0);
            a_q = ABS(q2 - q0);

            /* tc: tc0 widened by one for each side smooth enough to     */
            /* also update p1/q1 below                                   */
            tc = tc0 + (a_p < beta) + (a_q < beta);

            val = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3);
            delta = CLIP3(-tc, tc, val);
            /* p0' */
            val = p0 + delta;
            pu1_src_temp[pos_p0] = CLIP_U8(val);
            /* q0' */
            val = q0 - delta;
            pu1_src_temp[pos_q0] = CLIP_U8(val);

            /* Luma only: p1/q1 get an extra correction when the inner   */
            /* activity is below beta                                    */
            if(a_p < beta)
            {
                /* p1' */
                val = ((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1);
                pu1_src_temp[pos_p1] += CLIP3(-tc0, tc0, val);
            }

            if(a_q < beta)
            {
                /* q1' */
                val = ((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1);
                pu1_src_temp[pos_q1] += CLIP3(-tc0, tc0, val);
            }
        }
    }
}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_chroma_vert_bslt4_mbaff_bp() */
+/* */
+/* Description : This function performs filtering of a chroma block */
+/* vertical edge when boundary strength is less than 4. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 of U */
+/* src_strd - source stride */
+/* alpha - alpha value for the boundary */
+/* beta - beta value for the boundary */
+/* u4_bs - packed Boundary strength array */
+/* pu1_cliptab - tc0_table */
+/* */
+/* Globals : None */
+/* */
+/* Processing : When the function is called twice, this operation is as */
+/* described in Sec. 8.7.2.3 under the title "Filtering */
+/* process for edges for bS less than 4" in ITU T Rec H.264.*/
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 29 12 2014 Kaushik Draft */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_deblk_chroma_vert_bslt4_mbaff_bp(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha,
+ WORD32 beta,
+ UWORD32 u4_bs,
+ const UWORD8 *pu1_cliptab)
+{
+ UWORD8 *pu1_src_u = pu1_src; /* Pointer to the src sample q0 of plane U*/
+ UWORD8 *pu1_src_v = pu1_src + 1; /* Pointer to the src sample q0 of plane V*/
+ UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v;
+ WORD32 blk_strd = src_strd;
+ WORD32 pos_p1, pos_p0, pos_q0, pos_q1;
+ UWORD8 *pu1_src_temp_u, *pu1_src_temp_v;
+ WORD8 edge;
+ WORD8 delta;
+ WORD8 tc;
+ WORD16 val;
+ UWORD8 tc0, u1_bs;
+
+ pos_q0 = 0;
+ pos_q1 = 2;
+ pos_p0 = -2;
+ pos_p1 = -4;
+
+ for(edge = 0; edge < 4;
+ edge++, pu1_src_u += blk_strd, pu1_src_v += blk_strd)
+ {
+ pu1_src_temp_u = pu1_src_u;
+ pu1_src_temp_v = pu1_src_v;
+ /* Filter Decision */
+ u1_bs = (UWORD8)((u4_bs >> ((3 - edge) << 3)) & 0x0ff);
+ if(!u1_bs)
+ continue;
+ /* tc0 */
+ tc0 = pu1_cliptab[u1_bs];
+ tc = tc0 + 1;
+
+ q0_u = pu1_src_temp_u[pos_q0];
+ q1_u = pu1_src_temp_u[pos_q1];
+ p0_u = pu1_src_temp_u[pos_p0];
+ p1_u = pu1_src_temp_u[pos_p1];
+
+ q0_v = pu1_src_temp_v[pos_q0];
+ q1_v = pu1_src_temp_v[pos_q1];
+ p0_v = pu1_src_temp_v[pos_p0];
+ p1_v = pu1_src_temp_v[pos_p1];
+
+ /* Filter Decision */
+ if((ABS(p0_u - q0_u) < alpha) &&
+ (ABS(q1_u - q0_u) < beta) &&
+ (ABS(p1_u - p0_u) < beta))
+ {
+ val = ((((q0_u - p0_u) << 2) + (p1_u - q1_u) + 4) >> 3);
+ delta = CLIP3(-tc, tc, val);
+ /* p0' */
+ val = p0_u + delta;
+ pu1_src_temp_u[pos_p0] = CLIP_U8(val);
+ /* q0' */
+ val = q0_u - delta;
+ pu1_src_temp_u[pos_q0] = CLIP_U8(val);
+ }
+
+ /* Filter Decision */
+ if((ABS(p0_v - q0_v) < alpha) &&
+ (ABS(q1_v - q0_v) < beta) &&
+ (ABS(p1_v - p0_v) < beta))
+ {
+ val = ((((q0_v - p0_v) << 2) + (p1_v - q1_v) + 4) >> 3);
+ delta = CLIP3(-tc, tc, val);
+ /* p0' */
+ val = p0_v + delta;
+ pu1_src_temp_v[pos_p0] = CLIP_U8(val);
+ /* q0' */
+ val = q0_v - delta;
+ pu1_src_temp_v[pos_q0] = CLIP_U8(val);
+ }
+ }
+}
+
+/*****************************************************************************/
+/* Function Definitions for chroma deblocking in high profile */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_chroma_vert_bs4() */
+/* */
+/* Description : This function performs filtering of a chroma block */
+/* vertical edge when the boundary strength is set to 4 in */
+/* high profile. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 of U */
+/* src_strd - source stride */
+/* alpha_cb - alpha value for the boundary in U */
+/* beta_cb - beta value for the boundary in U */
+/* alpha_cr - alpha value for the boundary in V */
+/* beta_cr - beta value for the boundary in V */
+/* */
+/* Globals : None */
+/* */
+/* Processing : This operation is described in Sec. 8.7.2.4 under the */
+/* title "Filtering process for edges for bS equal to 4" in */
+/* ITU T Rec H.264 with alpha and beta values different in */
+/* U and V. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 29 12 2014 Kaushik Draft */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_deblk_chroma_vert_bs4(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha_cb,
+ WORD32 beta_cb,
+ WORD32 alpha_cr,
+ WORD32 beta_cr)
+{
+ UWORD8 *pu1_src_u = pu1_src; /* Pointer to the src sample q0 of U */
+ UWORD8 *pu1_src_v = pu1_src + 1; /* Pointer to the src sample q0 of V */
+ UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v;
+ WORD32 blk_strd = src_strd << 1; /* block_increment = src_strd * 2*/
+ WORD32 pos_p1, pos_p0, pos_q0, pos_q1;
+ UWORD8 *pu1_src_temp_u, *pu1_src_temp_v;
+ WORD8 i = 0, edge;
+
+ pos_q0 = 0;
+ pos_q1 = 2;
+ pos_p0 = -2;
+ pos_p1 = -4;
+
+ for(edge = 0; edge < 4;
+ edge++, pu1_src_u += blk_strd, pu1_src_v += blk_strd)
+ {
+ pu1_src_temp_u = pu1_src_u;
+ pu1_src_temp_v = pu1_src_v;
+ for(i = 0; i < 2; ++i, pu1_src_temp_u += src_strd, pu1_src_temp_v +=
+ src_strd)
+ {
+ q0_u = pu1_src_temp_u[pos_q0];
+ q1_u = pu1_src_temp_u[pos_q1];
+ p0_u = pu1_src_temp_u[pos_p0];
+ p1_u = pu1_src_temp_u[pos_p1];
+ q0_v = pu1_src_temp_v[pos_q0];
+ q1_v = pu1_src_temp_v[pos_q1];
+ p0_v = pu1_src_temp_v[pos_p0];
+ p1_v = pu1_src_temp_v[pos_p1];
+
+ /* Filter Decision */
+ if((ABS(p0_u - q0_u) < alpha_cb) &&
+ (ABS(q1_u - q0_u) < beta_cb) &&
+ (ABS(p1_u - p0_u) < beta_cb))
+ {
+ /* p0' */
+ pu1_src_temp_u[pos_p0] = ((X2(p1_u) + p0_u + q1_u + 2) >> 2);
+ /* q0' */
+ pu1_src_temp_u[pos_q0] = (X2(q1_u) + q0_u + p1_u + 2) >> 2;
+ }
+
+ /* Filter Decision */
+ if((ABS(p0_v - q0_v) < alpha_cr) &&
+ (ABS(q1_v - q0_v) < beta_cr) &&
+ (ABS(p1_v - p0_v) < beta_cr))
+ {
+ /* p0' */
+ pu1_src_temp_v[pos_p0] = ((X2(p1_v) + p0_v + q1_v + 2) >> 2);
+ /* q0' */
+ pu1_src_temp_v[pos_q0] = (X2(q1_v) + q0_v + p1_v + 2) >> 2;
+ }
+ }
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_chroma_horz_bs4() */
+/* */
+/* Description : This function performs filtering of a chroma block */
+/* horizontal edge when the boundary strength is set to 4 */
+/* in high profile. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 of U */
+/* src_strd - source stride */
+/* alpha_cb - alpha value for the boundary in U */
+/* beta_cb - beta value for the boundary in U */
+/* alpha_cr - alpha value for the boundary in V */
+/* beta_cr - beta value for the boundary in V */
+/* */
+/* Globals : None */
+/* */
+/* Processing : This operation is described in Sec. 8.7.2.4 under the */
+/* title "Filtering process for edges for bS equal to 4" in */
+/* ITU T Rec H.264 with alpha and beta values different in */
+/* U and V. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 29 12 2014 Kaushik Draft */
+/* Senthoor */
+/* */
+/*****************************************************************************/
void ih264_deblk_chroma_horz_bs4(UWORD8 *pu1_src,
                                 WORD32 src_strd,
                                 WORD32 alpha_cb,
                                 WORD32 beta_cb,
                                 WORD32 alpha_cr,
                                 WORD32 beta_cr)
{
    UWORD8 *pu1_src_u = pu1_src; /* Pointer to the src sample q0 of U */
    UWORD8 *pu1_src_v = pu1_src + 1; /* Pointer to the src sample q0 of V */
    UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v;
    WORD32 pos_p1, pos_p0, pos_q0, pos_q1;
    UWORD8 *pu1_src_temp_u, *pu1_src_temp_v;
    UWORD8 *pu1_p1_u; /* Pointer to the src sample p1 of U */
    UWORD8 *pu1_p1_v; /* Pointer to the src sample p1 of V */
    UWORD8 *pu1_p1_temp_u, *pu1_p1_temp_v;
    WORD8 i = 0, edge;

    /* p-side samples are indexed from pu1_p1_* (two rows above q0):     */
    /* pos_p0/pos_p1 are offsets from pu1_p1_*, pos_q0/pos_q1 from       */
    /* pu1_src_*.  Cb and Cr use independent alpha/beta (high profile).  */
    pu1_p1_u = pu1_src_u - (src_strd << 1);
    pu1_p1_v = pu1_src_v - (src_strd << 1);
    pos_q0 = 0;
    pos_q1 = src_strd;
    pos_p0 = src_strd;
    pos_p1 = 0;

    /* Each bS position covers 4 interleaved bytes = 2 samples per plane */
    for(edge = 0; edge < 4; edge++, pu1_src_u += 4, pu1_p1_u += 4, pu1_src_v +=
                    4, pu1_p1_v += 4)
    {
        pu1_src_temp_u = pu1_src_u;
        pu1_p1_temp_u = pu1_p1_u;
        pu1_src_temp_v = pu1_src_v;
        pu1_p1_temp_v = pu1_p1_v;
        for(i = 0; i < 2; ++i, pu1_src_temp_u += 2, pu1_p1_temp_u += 2,
                        pu1_src_temp_v += 2, pu1_p1_temp_v += 2)
        {
            q0_u = pu1_src_temp_u[pos_q0];
            q1_u = pu1_src_temp_u[pos_q1];
            p0_u = pu1_p1_temp_u[pos_p0];
            p1_u = pu1_p1_temp_u[pos_p1];

            q0_v = pu1_src_temp_v[pos_q0];
            q1_v = pu1_src_temp_v[pos_q1];
            p0_v = pu1_p1_temp_v[pos_p0];
            p1_v = pu1_p1_temp_v[pos_p1];

            /* Filter Decision for plane U (Cb thresholds) */
            if(ABS(p0_u - q0_u) < alpha_cb && ABS(q1_u - q0_u) < beta_cb
                            && ABS(p1_u - p0_u) < beta_cb)
            {
                /* p0' */
                pu1_p1_temp_u[pos_p0] = (X2(p1_u) + p0_u + q1_u + 2) >> 2;
                /* q0' */
                pu1_src_temp_u[pos_q0] = (X2(q1_u) + q0_u + p1_u + 2) >> 2;
            }

            /* Filter Decision for plane V (Cr thresholds) */
            if(ABS(p0_v - q0_v) < alpha_cr && ABS(q1_v - q0_v) < beta_cr
                            && ABS(p1_v - p0_v) < beta_cr)
            {
                /* p0' */
                pu1_p1_temp_v[pos_p0] = (X2(p1_v) + p0_v + q1_v + 2) >> 2;
                /* q0' */
                pu1_src_temp_v[pos_q0] = (X2(q1_v) + q0_v + p1_v + 2) >> 2;
            }
        }
    }
}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_chroma_vert_bslt4() */
+/* */
+/* Description : This function performs filtering of a chroma block */
+/* vertical edge when the boundary strength is less than 4 */
+/* in high profile. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 of U */
+/* src_strd - source stride */
+/* alpha_cb - alpha value for the boundary in U */
+/* beta_cb - beta value for the boundary in U */
+/* alpha_cr - alpha value for the boundary in V */
+/* beta_cr - beta value for the boundary in V */
+/* u4_bs - packed Boundary strength array */
+/* pu1_cliptab_cb - tc0_table for U */
+/* pu1_cliptab_cr - tc0_table for V */
+/* */
+/* Globals : None */
+/* */
+/* Processing : This operation is described in Sec. 8.7.2.3 under the */
+/* title "Filtering process for edges for bS less than 4" */
+/* in ITU T Rec H.264 with alpha and beta values different */
+/* in U and V. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 29 12 2014 Kaushik Draft */
+/* Senthoor */
+/* */
+/*****************************************************************************/
void ih264_deblk_chroma_vert_bslt4(UWORD8 *pu1_src,
                                   WORD32 src_strd,
                                   WORD32 alpha_cb,
                                   WORD32 beta_cb,
                                   WORD32 alpha_cr,
                                   WORD32 beta_cr,
                                   UWORD32 u4_bs,
                                   const UWORD8 *pu1_cliptab_cb,
                                   const UWORD8 *pu1_cliptab_cr)
{
    UWORD8 *pu1_src_u = pu1_src; /* Pointer to the src sample q0 of plane U*/
    UWORD8 *pu1_src_v = pu1_src + 1; /* Pointer to the src sample q0 of plane V*/
    UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v;
    WORD32 blk_strd = src_strd << 1; /* block_increment = src_strd * 2 */
    WORD32 pos_p1, pos_p0, pos_q0, pos_q1;
    UWORD8 *pu1_src_temp_u, *pu1_src_temp_v;
    WORD8 i = 0, edge;
    WORD8 delta;
    WORD8 tcb, tcr; /* per-plane clipping ranges (high profile) */
    WORD16 val;
    UWORD8 tcb0, tcr0, u1_bs;

    /* Interleaved chroma: one horizontal step inside a plane is 2 bytes, */
    /* so the line offsets from q0 are 0, 2, -2, -4.                      */
    pos_q0 = 0;
    pos_q1 = 2;
    pos_p0 = -2;
    pos_p1 = -4;

    for(edge = 0; edge < 4;
                    edge++, pu1_src_u += blk_strd, pu1_src_v += blk_strd)
    {
        pu1_src_temp_u = pu1_src_u;
        pu1_src_temp_v = pu1_src_v;
        /* Filter Decision: bS == 0 means no filtering for this segment   */
        u1_bs = (UWORD8)((u4_bs >> ((3 - edge) << 3)) & 0x0ff);
        if(!u1_bs)
            continue;
        /* tc0 per plane (Cb and Cr have separate clip tables); chroma    */
        /* always clips with tc0 + 1                                      */
        tcb0 = pu1_cliptab_cb[u1_bs];
        tcr0 = pu1_cliptab_cr[u1_bs];
        tcb = tcb0 + 1;
        tcr = tcr0 + 1;
        for(i = 0; i < 2; ++i, pu1_src_temp_u += src_strd, pu1_src_temp_v +=
                        src_strd)
        {
            q0_u = pu1_src_temp_u[pos_q0];
            q1_u = pu1_src_temp_u[pos_q1];
            p0_u = pu1_src_temp_u[pos_p0];
            p1_u = pu1_src_temp_u[pos_p1];

            q0_v = pu1_src_temp_v[pos_q0];
            q1_v = pu1_src_temp_v[pos_q1];
            p0_v = pu1_src_temp_v[pos_p0];
            p1_v = pu1_src_temp_v[pos_p1];

            /* Filter Decision for plane U (Cb thresholds) */
            if(ABS(p0_u - q0_u) < alpha_cb && ABS(q1_u - q0_u) < beta_cb
                            && ABS(p1_u - p0_u) < beta_cb)
            {
                val = ((((q0_u - p0_u) << 2) + (p1_u - q1_u) + 4) >> 3);
                delta = CLIP3(-tcb, tcb, val);
                /* p0' */
                val = p0_u + delta;
                pu1_src_temp_u[pos_p0] = CLIP_U8(val);
                /* q0' */
                val = q0_u - delta;
                pu1_src_temp_u[pos_q0] = CLIP_U8(val);
            }

            /* Filter Decision for plane V (Cr thresholds) */
            if(ABS(p0_v - q0_v) < alpha_cr && ABS(q1_v - q0_v) < beta_cr
                            && ABS(p1_v - p0_v) < beta_cr)
            {
                val = ((((q0_v - p0_v) << 2) + (p1_v - q1_v) + 4) >> 3);
                delta = CLIP3(-tcr, tcr, val);
                /* p0' */
                val = p0_v + delta;
                pu1_src_temp_v[pos_p0] = CLIP_U8(val);
                /* q0' */
                val = q0_v - delta;
                pu1_src_temp_v[pos_q0] = CLIP_U8(val);
            }
        }
    }
}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_chroma_horz_bslt4() */
+/* */
+/* Description : This function performs filtering of a chroma block */
+/* horizontal edge when the boundary strength is less than */
+/* 4 in high profile. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 of U */
+/* src_strd - source stride */
+/* alpha_cb - alpha value for the boundary in U */
+/* beta_cb - beta value for the boundary in U */
+/* alpha_cr - alpha value for the boundary in V */
+/* beta_cr - beta value for the boundary in V */
+/* u4_bs - packed Boundary strength array */
+/* pu1_cliptab_cb - tc0_table for U */
+/* pu1_cliptab_cr - tc0_table for V */
+/* */
+/* Globals : None */
+/* */
+/* Processing : This operation is described in Sec. 8.7.2.3 under the */
+/* title "Filtering process for edges for bS less than 4" */
+/* in ITU T Rec H.264 with alpha and beta values different */
+/* in U and V. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 29 12 2014 Kaushik Draft */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_deblk_chroma_horz_bslt4(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha_cb,
+ WORD32 beta_cb,
+ WORD32 alpha_cr,
+ WORD32 beta_cr,
+ UWORD32 u4_bs,
+ const UWORD8 *pu1_cliptab_cb,
+ const UWORD8 *pu1_cliptab_cr)
+{
+ UWORD8 *pu1_src_u = pu1_src; /* Pointer to the src sample q0 of plane U*/
+ UWORD8 *pu1_src_v = pu1_src + 1; /* Pointer to the src sample q0 of plane V*/
+ UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v;
+ WORD32 pos_p1, pos_p0, pos_q0, pos_q1;
+ UWORD8 *pu1_src_temp_u, *pu1_src_temp_v;
+ UWORD8 *pu1_p1_u; /* Pointer to the src sample p1 of plane U*/
+ UWORD8 *pu1_p1_v; /* Pointer to the src sample p1 of plane V*/
+ UWORD8 *pu1_p1_temp_u, *pu1_p1_temp_v;
+ WORD8 i = 0, edge;
+ WORD8 delta;
+ WORD8 tcb, tcr;
+ WORD16 val;
+ UWORD8 u1_bs;
+ UWORD8 tcb0, tcr0;
+
+ pu1_p1_u = pu1_src_u - (src_strd << 1);
+ pu1_p1_v = pu1_src_v - (src_strd << 1);
+ pos_q0 = 0;
+ pos_q1 = src_strd;
+ pos_p0 = src_strd;
+ pos_p1 = 0;
+
+ for(edge = 0; edge < 4; edge++, pu1_src_u += 4, pu1_p1_u += 4,
+ pu1_src_v += 4, pu1_p1_v += 4)
+ {
+ pu1_src_temp_u = pu1_src_u;
+ pu1_p1_temp_u = pu1_p1_u;
+ pu1_src_temp_v = pu1_src_v;
+ pu1_p1_temp_v = pu1_p1_v;
+
+ /* Filter Decision */
+ u1_bs = (UWORD8)((u4_bs >> ((3 - edge) << 3)) & 0x0ff);
+ if(!u1_bs)
+ continue;
+ /* tc0 */
+ tcb0 = pu1_cliptab_cb[u1_bs];
+ tcr0 = pu1_cliptab_cr[u1_bs];
+
+ for(i = 0; i < 2; ++i, pu1_src_temp_u += 2, pu1_p1_temp_u += 2,
+ pu1_src_temp_v += 2, pu1_p1_temp_v += 2)
+ {
+ q0_u = pu1_src_temp_u[pos_q0];
+ q1_u = pu1_src_temp_u[pos_q1];
+ p0_u = pu1_p1_temp_u[pos_p0];
+ p1_u = pu1_p1_temp_u[pos_p1];
+
+ q0_v = pu1_src_temp_v[pos_q0];
+ q1_v = pu1_src_temp_v[pos_q1];
+ p0_v = pu1_p1_temp_v[pos_p0];
+ p1_v = pu1_p1_temp_v[pos_p1];
+
+ /* tc */
+ tcb = tcb0 + 1;
+ tcr = tcr0 + 1;
+ /* Filter Decision */
+ if(ABS(p0_u - q0_u) < alpha_cb && ABS(q1_u - q0_u) < beta_cb
+ && ABS(p1_u - p0_u) < beta_cb)
+ {
+ val = ((((q0_u - p0_u) << 2) + (p1_u - q1_u) + 4) >> 3);
+ delta = CLIP3(-tcb, tcb, val);
+ /* p0' */
+ val = p0_u + delta;
+ pu1_p1_temp_u[pos_p0] = CLIP_U8(val);
+ /* q0' */
+ val = q0_u - delta;
+ pu1_src_temp_u[pos_q0] = CLIP_U8(val);
+ }
+ /* Filter Decision */
+ if(ABS(p0_v - q0_v) < alpha_cr && ABS(q1_v - q0_v) < beta_cr
+ && ABS(p1_v - p0_v) < beta_cr)
+ {
+ val = ((((q0_v - p0_v) << 2) + (p1_v - q1_v) + 4) >> 3);
+ delta = CLIP3(-tcr, tcr, val);
+ /* p0' */
+ val = p0_v + delta;
+ pu1_p1_temp_v[pos_p0] = CLIP_U8(val);
+ /* q0' */
+ val = q0_v - delta;
+ pu1_src_temp_v[pos_q0] = CLIP_U8(val);
+ }
+ }
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_chroma_vert_bs4_mbaff() */
+/* */
+/* Description : This function performs filtering of a chroma block */
+/* vertical edge when boundary strength is set to 4 in high */
+/* profile. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 of U */
+/* src_strd - source stride */
+/* alpha_cb - alpha value for the boundary in U */
+/* beta_cb - beta value for the boundary in U */
+/* alpha_cr - alpha value for the boundary in V */
+/* beta_cr - beta value for the boundary in V */
+/* */
+/* Globals : None */
+/* */
+/* Processing : When the function is called twice, this operation is as */
+/* described in Sec. 8.7.2.4 under the title "Filtering */
+/* process for edges for bS equal to 4" in ITU T Rec H.264 */
+/* with alpha and beta values different in U and V. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 29 12 2014 Kaushik Draft */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_deblk_chroma_vert_bs4_mbaff(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha_cb,
+ WORD32 beta_cb,
+ WORD32 alpha_cr,
+ WORD32 beta_cr)
+{
+ UWORD8 *pu1_src_u = pu1_src; /* Pointer to the src sample q0 of U */
+ UWORD8 *pu1_src_v = pu1_src + 1; /* Pointer to the src sample q0 of V */
+ UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v;
+ WORD32 blk_strd = src_strd;
+ WORD32 pos_p1, pos_p0, pos_q0, pos_q1;
+ UWORD8 *pu1_src_temp_u, *pu1_src_temp_v;
+ WORD8 edge;
+
+ /* Chroma is stored interleaved (UVUV...), so horizontally adjacent */
+ /* samples of one plane are 2 bytes apart: q1 at +2, p0 at -2, p1 at -4 */
+ /* relative to q0. */
+ pos_q0 = 0;
+ pos_q1 = 2;
+ pos_p0 = -2;
+ pos_p1 = -4;
+
+ /* Only 4 rows are filtered per call -- presumably half the 8 chroma */
+ /* rows of a frame MB, as the edge of an MBAFF field MB spans */
+ /* alternate lines. NOTE(review): confirm stride handling with caller. */
+ for(edge = 0; edge < 4;
+ edge++, pu1_src_u += blk_strd, pu1_src_v += blk_strd)
+ {
+ pu1_src_temp_u = pu1_src_u;
+ pu1_src_temp_v = pu1_src_v;
+ q0_u = pu1_src_temp_u[pos_q0];
+ q1_u = pu1_src_temp_u[pos_q1];
+ p0_u = pu1_src_temp_u[pos_p0];
+ p1_u = pu1_src_temp_u[pos_p1];
+ q0_v = pu1_src_temp_v[pos_q0];
+ q1_v = pu1_src_temp_v[pos_q1];
+ p0_v = pu1_src_temp_v[pos_p0];
+ p1_v = pu1_src_temp_v[pos_p1];
+
+ /* Filter Decision: edge activity must be below alpha/beta thresholds */
+ if((ABS(p0_u - q0_u) < alpha_cb) &&
+ (ABS(q1_u - q0_u) < beta_cb) &&
+ (ABS(p1_u - p0_u) < beta_cb))
+ {
+ /* p0' = (2*p1 + p0 + q1 + 2) >> 2 (strong chroma filter) */
+ pu1_src_temp_u[pos_p0] = ((X2(p1_u) + p0_u + q1_u + 2) >> 2);
+ /* q0' = (2*q1 + q0 + p1 + 2) >> 2 */
+ pu1_src_temp_u[pos_q0] = (X2(q1_u) + q0_u + p1_u + 2) >> 2;
+ }
+
+ /* Filter Decision for V, with its own alpha/beta thresholds */
+ if((ABS(p0_v - q0_v) < alpha_cr) &&
+ (ABS(q1_v - q0_v) < beta_cr) &&
+ (ABS(p1_v - p0_v) < beta_cr))
+ {
+ /* p0' */
+ pu1_src_temp_v[pos_p0] = ((X2(p1_v) + p0_v + q1_v + 2) >> 2);
+ /* q0' */
+ pu1_src_temp_v[pos_q0] = (X2(q1_v) + q0_v + p1_v + 2) >> 2;
+ }
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_chroma_vert_bslt4_mbaff() */
+/* */
+/* Description : This function performs filtering of a chroma block */
+/* vertical edge when boundary strength is less than 4 in */
+/* high profile. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 of U */
+/* src_strd - source stride */
+/* alpha_cb - alpha value for the boundary in U */
+/* beta_cb - beta value for the boundary in U */
+/* alpha_cr - alpha value for the boundary in V */
+/* beta_cr - beta value for the boundary in V */
+/* u4_bs - packed Boundary strength array */
+/* pu1_cliptab_cb - tc0_table for U */
+/* pu1_cliptab_cr - tc0_table for V */
+/* */
+/* Globals : None */
+/* */
+/* Processing : When the function is called twice, this operation is as */
+/* described in Sec. 8.7.2.4 under the title "Filtering */
+/* process for edges for bS less than 4" in ITU T Rec H.264 */
+/* with alpha and beta values different in U and V. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 29 12 2014 Kaushik Draft */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_deblk_chroma_vert_bslt4_mbaff(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha_cb,
+ WORD32 beta_cb,
+ WORD32 alpha_cr,
+ WORD32 beta_cr,
+ UWORD32 u4_bs,
+ const UWORD8 *pu1_cliptab_cb,
+ const UWORD8 *pu1_cliptab_cr)
+{
+ UWORD8 *pu1_src_u = pu1_src; /* Pointer to the src sample q0 of plane U*/
+ UWORD8 *pu1_src_v = pu1_src + 1; /* Pointer to the src sample q0 of plane V*/
+ UWORD8 p1_u, p0_u, q0_u, q1_u, p1_v, p0_v, q0_v, q1_v;
+ WORD32 blk_strd = src_strd;
+ WORD32 pos_p1, pos_p0, pos_q0, pos_q1;
+ UWORD8 *pu1_src_temp_u, *pu1_src_temp_v;
+ WORD8 edge;
+ WORD8 delta;
+ WORD8 tcb, tcr;
+ WORD16 val;
+ UWORD8 tcb0, tcr0, u1_bs;
+
+ /* Interleaved UV layout: one plane's neighbours are 2 bytes apart, */
+ /* so q1 is at +2, p0 at -2, p1 at -4 relative to q0. */
+ pos_q0 = 0;
+ pos_q1 = 2;
+ pos_p0 = -2;
+ pos_p1 = -4;
+
+ /* 4 rows per call (MBAFF field MB edge); one UV sample pair per row */
+ for(edge = 0; edge < 4;
+ edge++, pu1_src_u += blk_strd, pu1_src_v += blk_strd)
+ {
+ pu1_src_temp_u = pu1_src_u;
+ pu1_src_temp_v = pu1_src_v;
+ /* Filter Decision: unpack this row's bS from the packed word; */
+ /* edge 0 occupies the most significant byte of u4_bs. */
+ u1_bs = (UWORD8)((u4_bs >> ((3 - edge) << 3)) & 0x0ff);
+ if(!u1_bs)
+ continue;
+ /* tc0 lookup, per-plane clip tables */
+ tcb0 = pu1_cliptab_cb[u1_bs];
+ tcr0 = pu1_cliptab_cr[u1_bs];
+ /* chroma clipping threshold: tc = tc0 + 1 */
+ tcb = tcb0 + 1;
+ tcr = tcr0 + 1;
+ q0_u = pu1_src_temp_u[pos_q0];
+ q1_u = pu1_src_temp_u[pos_q1];
+ p0_u = pu1_src_temp_u[pos_p0];
+ p1_u = pu1_src_temp_u[pos_p1];
+
+ q0_v = pu1_src_temp_v[pos_q0];
+ q1_v = pu1_src_temp_v[pos_q1];
+ p0_v = pu1_src_temp_v[pos_p0];
+ p1_v = pu1_src_temp_v[pos_p1];
+
+ /* Filter Decision for U: activity below alpha/beta thresholds */
+ if((ABS(p0_u - q0_u) < alpha_cb) &&
+ (ABS(q1_u - q0_u) < beta_cb) &&
+ (ABS(p1_u - p0_u) < beta_cb))
+ {
+ /* delta = clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, +/-tc) */
+ val = ((((q0_u - p0_u) << 2) + (p1_u - q1_u) + 4) >> 3);
+ delta = CLIP3(-tcb, tcb, val);
+ /* p0' */
+ val = p0_u + delta;
+ pu1_src_temp_u[pos_p0] = CLIP_U8(val);
+ /* q0' */
+ val = q0_u - delta;
+ pu1_src_temp_u[pos_q0] = CLIP_U8(val);
+ }
+
+ /* Filter Decision for V, with its own thresholds and tc */
+ if((ABS(p0_v - q0_v) < alpha_cr) &&
+ (ABS(q1_v - q0_v) < beta_cr) &&
+ (ABS(p1_v - p0_v) < beta_cr))
+ {
+ val = ((((q0_v - p0_v) << 2) + (p1_v - q1_v) + 4) >> 3);
+ delta = CLIP3(-tcr, tcr, val);
+ /* p0' */
+ val = p0_v + delta;
+ pu1_src_temp_v[pos_p0] = CLIP_U8(val);
+ /* q0' */
+ val = q0_v - delta;
+ pu1_src_temp_v[pos_q0] = CLIP_U8(val);
+ }
+ }
+}
diff --git a/common/ih264_deblk_edge_filters.h b/common/ih264_deblk_edge_filters.h
new file mode 100755
index 0000000..4079dd2
--- /dev/null
+++ b/common/ih264_deblk_edge_filters.h
@@ -0,0 +1,195 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_deblk_edge_filters.h
+ *
+ * @brief
+ * This file contains declarations of functions used for deblocking
+ *
+ * @author
+ * Ittiam
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#ifndef IH264_DEBLK_H_
+#define IH264_DEBLK_H_
+
+/*****************************************************************************/
+/* Extern Function Declarations */
+/*****************************************************************************/
+
+/* Edge filter with bS < 4: needs the packed boundary-strength word */
+/* (one byte per edge row/column) and a tc0 clip table. */
+typedef void ih264_deblk_edge_bslt4_ft(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha,
+ WORD32 beta,
+ UWORD32 u4_bs,
+ const UWORD8 *pu1_cliptab );
+
+/* Edge filter with bS = 4 (strong filter): thresholds only, no clip table. */
+typedef void ih264_deblk_edge_bs4_ft(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha,
+ WORD32 beta );
+
+/* High-profile chroma bS < 4 filter: separate alpha/beta thresholds and */
+/* tc0 tables for the U and V planes. */
+typedef void ih264_deblk_chroma_edge_bslt4_ft(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha_cb,
+ WORD32 beta_cb,
+ WORD32 alpha_cr,
+ WORD32 beta_cr,
+ UWORD32 u4_bs,
+ const UWORD8 *pu1_cliptab_cb,
+ const UWORD8 *pu1_cliptab_cr);
+
+/* High-profile chroma bS = 4 filter: per-plane thresholds, no clip table. */
+typedef void ih264_deblk_chroma_edge_bs4_ft(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha_cb,
+ WORD32 beta_cb,
+ WORD32 alpha_cr,
+ WORD32 beta_cr);
+
+
+
+/* Plain C reference implementations */
+ih264_deblk_edge_bs4_ft ih264_deblk_luma_horz_bs4;
+ih264_deblk_edge_bs4_ft ih264_deblk_luma_vert_bs4;
+ih264_deblk_edge_bs4_ft ih264_deblk_luma_vert_bs4_mbaff;
+
+
+ih264_deblk_edge_bs4_ft ih264_deblk_chroma_horz_bs4_bp;
+ih264_deblk_edge_bs4_ft ih264_deblk_chroma_vert_bs4_bp;
+ih264_deblk_edge_bs4_ft ih264_deblk_chroma_vert_bs4_mbaff_bp;
+
+
+ih264_deblk_edge_bslt4_ft ih264_deblk_luma_horz_bslt4;
+ih264_deblk_edge_bslt4_ft ih264_deblk_luma_vert_bslt4;
+ih264_deblk_edge_bslt4_ft ih264_deblk_luma_vert_bslt4_mbaff;
+
+
+ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_bp;
+ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_bp;
+ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_mbaff_bp;
+
+ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_vert_bs4;
+ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_horz_bs4;
+ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_vert_bs4_mbaff;
+ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_horz_bs4_mbaff;
+
+ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4;
+ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4;
+ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_mbaff;
+ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_mbaff;
+
+
+/* ARM Cortex-A9 (NEON) assembly implementations */
+ih264_deblk_edge_bs4_ft ih264_deblk_luma_horz_bs4_a9;
+ih264_deblk_edge_bs4_ft ih264_deblk_luma_vert_bs4_a9;
+ih264_deblk_edge_bs4_ft ih264_deblk_luma_vert_bs4_mbaff_a9;
+
+
+ih264_deblk_edge_bs4_ft ih264_deblk_chroma_horz_bs4_bp_a9;
+ih264_deblk_edge_bs4_ft ih264_deblk_chroma_vert_bs4_bp_a9;
+ih264_deblk_edge_bs4_ft ih264_deblk_chroma_vert_bs4_mbaff_bp_a9;
+
+
+ih264_deblk_edge_bslt4_ft ih264_deblk_luma_horz_bslt4_a9;
+ih264_deblk_edge_bslt4_ft ih264_deblk_luma_vert_bslt4_a9;
+ih264_deblk_edge_bslt4_ft ih264_deblk_luma_vert_bslt4_mbaff_a9;
+
+
+ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_bp_a9;
+ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_bp_a9;
+ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9;
+
+ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_vert_bs4_a9;
+ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_horz_bs4_a9;
+ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_vert_bs4_mbaff_a9;
+ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_horz_bs4_mbaff_a9;
+
+ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_a9;
+ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_a9;
+ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_mbaff_a9;
+ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_mbaff_a9;
+
+/* ARMv8 (AArch64) assembly implementations */
+ih264_deblk_edge_bs4_ft ih264_deblk_luma_horz_bs4_av8;
+ih264_deblk_edge_bs4_ft ih264_deblk_luma_vert_bs4_av8;
+ih264_deblk_edge_bs4_ft ih264_deblk_luma_vert_bs4_mbaff_av8;
+
+
+ih264_deblk_edge_bs4_ft ih264_deblk_chroma_horz_bs4_bp_av8;
+ih264_deblk_edge_bs4_ft ih264_deblk_chroma_vert_bs4_bp_av8;
+ih264_deblk_edge_bs4_ft ih264_deblk_chroma_vert_bs4_mbaff_bp_av8;
+
+
+ih264_deblk_edge_bslt4_ft ih264_deblk_luma_horz_bslt4_av8;
+ih264_deblk_edge_bslt4_ft ih264_deblk_luma_vert_bslt4_av8;
+ih264_deblk_edge_bslt4_ft ih264_deblk_luma_vert_bslt4_mbaff_av8;
+
+
+ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_bp_av8;
+ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_bp_av8;
+ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_mbaff_bp_av8;
+
+ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_vert_bs4_av8;
+ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_horz_bs4_av8;
+ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_vert_bs4_mbaff_av8;
+ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_horz_bs4_mbaff_av8;
+
+ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_av8;
+ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_av8;
+ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_mbaff_av8;
+ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_mbaff_av8;
+
+/* x86 SSSE3 implementations (symbol suffix is _ssse3, not SSE3) */
+ih264_deblk_edge_bs4_ft ih264_deblk_luma_horz_bs4_ssse3;
+ih264_deblk_edge_bs4_ft ih264_deblk_luma_vert_bs4_ssse3;
+ih264_deblk_edge_bs4_ft ih264_deblk_luma_vert_bs4_mbaff_ssse3;
+
+
+ih264_deblk_edge_bs4_ft ih264_deblk_chroma_horz_bs4_bp_ssse3;
+ih264_deblk_edge_bs4_ft ih264_deblk_chroma_vert_bs4_bp_ssse3;
+ih264_deblk_edge_bs4_ft ih264_deblk_chroma_vert_bs4_mbaff_bp_ssse3;
+
+
+ih264_deblk_edge_bslt4_ft ih264_deblk_luma_horz_bslt4_ssse3;
+ih264_deblk_edge_bslt4_ft ih264_deblk_luma_vert_bslt4_ssse3;
+ih264_deblk_edge_bslt4_ft ih264_deblk_luma_vert_bslt4_mbaff_ssse3;
+
+
+ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_bp_ssse3;
+ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_bp_ssse3;
+ih264_deblk_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_mbaff_bp_ssse3;
+
+ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_vert_bs4_ssse3;
+ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_horz_bs4_ssse3;
+ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_vert_bs4_mbaff_ssse3;
+ih264_deblk_chroma_edge_bs4_ft ih264_deblk_chroma_horz_bs4_mbaff_ssse3;
+
+ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_ssse3;
+ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_ssse3;
+ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_mbaff_ssse3;
+ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_mbaff_ssse3;
+
+#endif /* IH264_DEBLK_H_ */
diff --git a/common/ih264_deblk_tables.c b/common/ih264_deblk_tables.c
new file mode 100755
index 0000000..91e28e0
--- /dev/null
+++ b/common/ih264_deblk_tables.c
@@ -0,0 +1,119 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_deblk_tables.c
+*
+* @brief
+* Contains tables used for deblocking
+*
+* @author
+* Ittiam
+*
+* @par List of Tables:
+* - gu1_ih264_alpha_table[]
+* - gu1_ih264_beta_table[]
+* - gu1_ih264_clip_table[][]
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* System include files */
+#include <stdio.h>
+
+/* User include files */
+#include "ih264_typedefs.h"
+#include "ih264_deblk_tables.h"
+
+/*****************************************************************************/
+/* Extern global definitions */
+/*****************************************************************************/
+
+/**
+ ******************************************************************************
+ * @brief alpha & beta tables for deblocking
+ * input : indexA [0-51] & indexB [0-51]
+ * output : alpha & beta
+ *
+ * @remarks Table 8-16 – in H264 Specification,
+ * Derivation of offset dependent threshold variables
+ * alpha and beta from indexA and indexB
+ ******************************************************************************
+ */
+const UWORD8 gu1_ih264_alpha_table[52] =
+{
+ /* indexA :: 0-51 inclusive */
+ /* alpha = 0 for indexA < 16: the |p0-q0| < alpha test can never */
+ /* pass, so edge filtering is effectively disabled at low QP. */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 4, 4, 5, 6, 7, 8, 9, 10,
+ 12, 13, 15, 17, 20, 22, 25, 28,
+ 32, 36, 40, 45, 50, 56, 63, 71,
+ 80, 90, 101, 113, 127, 144, 162, 182,
+ 203, 226, 255, 255,
+};
+
+/* beta thresholds (Table 8-16); zero for indexB < 16 disables filtering */
+const UWORD8 gu1_ih264_beta_table[52] =
+{
+ /* indexB :: 0-51 inclusive */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 2, 2, 2, 3, 3, 3, 3, 4,
+ 4, 4, 6, 6, 7, 7, 8, 8,
+ 9, 9, 10, 10, 11, 11, 12, 12,
+ 13, 13, 14, 14, 15, 15, 16, 16,
+ 17, 17, 18, 18,
+};
+
+/**
+ ******************************************************************************
+ * @brief t'C0 table for deblocking
+ * input : indexA [0-51] and bS [1,3]
+ * output : t'C0
+ *
+ * @remarks Table 8-17 – in H264 Specification,
+ * Value of variable t'C0 as a function of indexA and bS
+ ******************************************************************************
+ */
+const UWORD8 gu1_ih264_clip_table[52][4] =
+{
+ /* indexA :: 0-51 inclusive */
+ /* row = indexA, column = bS (0..3); the bS = 0 column is all zeros */
+ /* since no filtering is performed for bS = 0. */
+ { 0, 0, 0, 0}, { 0, 0, 0, 0}, { 0, 0, 0, 0}, { 0, 0, 0, 0},
+ { 0, 0, 0, 0}, { 0, 0, 0, 0}, { 0, 0, 0, 0}, { 0, 0, 0, 0},
+ { 0, 0, 0, 0}, { 0, 0, 0, 0}, { 0, 0, 0, 0}, { 0, 0, 0, 0},
+ { 0, 0, 0, 0}, { 0, 0, 0, 0}, { 0, 0, 0, 0}, { 0, 0, 0, 0},
+ { 0, 0, 0, 0}, { 0, 0, 0, 1}, { 0, 0, 0, 1}, { 0, 0, 0, 1},
+ { 0, 0, 0, 1}, { 0, 0, 1, 1}, { 0, 0, 1, 1}, { 0, 1, 1, 1},
+ { 0, 1, 1, 1}, { 0, 1, 1, 1}, { 0, 1, 1, 1}, { 0, 1, 1, 2},
+ { 0, 1, 1, 2}, { 0, 1, 1, 2}, { 0, 1, 1, 2}, { 0, 1, 2, 3},
+ { 0, 1, 2, 3}, { 0, 2, 2, 3}, { 0, 2, 2, 4}, { 0, 2, 3, 4},
+ { 0, 2, 3, 4}, { 0, 3, 3, 5}, { 0, 3, 4, 6}, { 0, 3, 4, 6},
+ { 0, 4, 5, 7}, { 0, 4, 5, 8}, { 0, 4, 6, 9}, { 0, 5, 7,10},
+ { 0, 6, 8,11}, { 0, 6, 8,13}, { 0, 7,10,14}, { 0, 8,11,16},
+ { 0, 9,12,18}, { 0,10,13,20}, { 0,11,15,23}, { 0,13,17,25},
+};
diff --git a/common/ih264_deblk_tables.h b/common/ih264_deblk_tables.h
new file mode 100755
index 0000000..3935dcb
--- /dev/null
+++ b/common/ih264_deblk_tables.h
@@ -0,0 +1,73 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_deblk_tables.h
+ *
+ * @brief
+ * This file contains declarations of tables used for deblocking
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#ifndef IH264_DEBLK_TABLES_H_
+#define IH264_DEBLK_TABLES_H_
+
+/*****************************************************************************/
+/* Extern global declarations */
+/*****************************************************************************/
+
+/**
+ ******************************************************************************
+ * @brief alpha & beta tables for deblocking
+ * input : indexA [0-51] & indexB [0-51]
+ * output : alpha & beta
+ *
+ * @remarks Table 8-16 – in H264 Specification,
+ * Derivation of offset dependent threshold variables
+ * alpha and beta from indexA and indexB
+ ******************************************************************************
+ */
+extern const UWORD8 gu1_ih264_alpha_table[52];
+
+extern const UWORD8 gu1_ih264_beta_table[52];
+
+/**
+ ******************************************************************************
+ * @brief t'C0 table for deblocking
+ * input : indexA [0-51] and bS [1,3]
+ * output : t'C0
+ *
+ * @remarks Table 8-17 – in H264 Specification,
+ * Value of variable t'C0 as a function of indexA and bS
+ ******************************************************************************
+ */
+extern const UWORD8 gu1_ih264_clip_table[52][4];
+
+#endif /* IH264_DEBLK_TABLES_H_ */
diff --git a/common/ih264_debug.h b/common/ih264_debug.h
new file mode 100755
index 0000000..96ff2a7
--- /dev/null
+++ b/common/ih264_debug.h
@@ -0,0 +1,61 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_debug.h
+*
+* @brief
+* Definitions for codec debugging
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IH264_DEBUG_H_
+#define _IH264_DEBUG_H_
+
+
+/* DEBUG(...) prints a tagged trace line only in DEBUG_PRINT builds; */
+/* it compiles to an empty block otherwise. */
+/* NOTE(review): DEBUG expands to printf and ASSERT to assert, but this */
+/* header includes neither <stdio.h> nor <assert.h>; every translation */
+/* unit using these macros must include them itself -- consider adding */
+/* the includes here. */
+/* NOTE(review): DEBUG uses a bare brace block instead of the usual */
+/* do { ... } while(0), so `if (x) DEBUG("..."); else ...` would */
+/* misparse -- confirm no such call sites exist. */
+#if DEBUG_PRINT
+
+#define DEBUG(...) \
+{ \
+ printf("\n[H264 DBG] %s/%d:: ", __FUNCTION__, __LINE__); \
+ printf(__VA_ARGS__); \
+}
+
+#else
+
+#define DEBUG(...) {}
+
+#endif
+
+
+#define ASSERT(x) assert((x))
+
+
+#endif /* _IH264_DEBUG_H_ */
+
diff --git a/common/ih264_defs.h b/common/ih264_defs.h
new file mode 100755
index 0000000..8d7e387
--- /dev/null
+++ b/common/ih264_defs.h
@@ -0,0 +1,690 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_defs.h
+*
+* @brief
+* Definitions used in the codec
+*
+* @author
+* Ittiam
+*
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef IH264_DEFS_H_
+#define IH264_DEFS_H_
+
+/*****************************************************************************/
+/* Enums */
+/*****************************************************************************/
+
+
+/*****************************************************************************/
+/* Profile and Levels */
+/*****************************************************************************/
+
+/**
+******************************************************************************
+ * @enum PROFILE_IDC
+ * @brief Defines the set of possible profiles
+******************************************************************************
+*/
+/* Values match the profile_idc syntax element (H.264 Annex A) */
+enum
+{
+ IH264_PROFILE_BASELINE = 66,
+ IH264_PROFILE_MAIN = 77,
+ IH264_PROFILE_EXTENDED = 88,
+ IH264_PROFILE_HIGH = 100,
+ IH264_PROFILE_HIGH10 = 110,
+ IH264_PROFILE_HIGH422 = 122,
+ IH264_PROFILE_HIGH444 = 144,
+};
+
+/**
+******************************************************************************
+ * @enum LEVEL_IDC
+ * @brief Defines the set of possible levels
+******************************************************************************
+*/
+typedef enum
+{
+ IH264_LEVEL_10 = 10,
+ /* NOTE(review): 9 appears to be an internal encoding for level 1b */
+ /* (in the bitstream 1b is level_idc 11 + constraint_set3_flag) -- */
+ /* confirm against the level-signalling code. */
+ IH264_LEVEL_1B = 9,
+ IH264_LEVEL_11 = 11,
+ IH264_LEVEL_12 = 12,
+ IH264_LEVEL_13 = 13,
+ IH264_LEVEL_20 = 20,
+ IH264_LEVEL_21 = 21,
+ IH264_LEVEL_22 = 22,
+ IH264_LEVEL_30 = 30,
+ IH264_LEVEL_31 = 31,
+ IH264_LEVEL_32 = 32,
+ IH264_LEVEL_40 = 40,
+ IH264_LEVEL_41 = 41,
+ IH264_LEVEL_42 = 42,
+ IH264_LEVEL_50 = 50,
+ IH264_LEVEL_51 = 51,
+}IH264_LEVEL_T;
+
+
+/**
+******************************************************************************
+ * @enum PIC TYPES
+ * @brief Defines the set of possible picture type - not signaled in bitstream
+******************************************************************************
+*/
+typedef enum
+{
+ PIC_NA = 0x7FFFFFFF, /* sentinel: picture type not applicable/unknown */
+ PIC_IDR = 0,
+ PIC_I = 1,
+ PIC_P = 2,
+ PIC_B = 3,
+ PIC_P_NONREF = 4, /* P picture not used for reference */
+ PIC_B_NONREF = 5, /* B picture not used for reference */
+ PIC_MAX,
+}PIC_TYPE_T;
+
+/**
+******************************************************************************
+ * @enum FRAME-FIELD types
+ * @brief Defines the set of possible field types.
+******************************************************************************
+*/
+enum
+{
+ TOP_FIELD,
+ BOTTOM_FIELD,
+ FRAME,
+};
+
+/**
+******************************************************************************
+ * @enum SLICE TYPES
+ * @brief Defines the set of possible SLICE TYPES
+******************************************************************************
+*/
+/* Values match slice_type % 5 in the slice header (Table 7-6) */
+enum
+{
+ PSLICE = 0,
+ BSLICE = 1,
+ ISLICE = 2,
+ SPSLICE = 3,
+ SISLICE = 4,
+ MAXSLICE_TYPE,
+};
+
+/**
+******************************************************************************
+ * @enum NAL_UNIT_TYPE
+ * @brief Defines the set of possible nal unit types
+******************************************************************************
+*/
+enum
+{
+ NAL_UNSPEC_0 = 0,
+ NAL_SLICE_NON_IDR = 1,
+ NAL_SLICE_DPA = 2,
+ NAL_SLICE_DPB = 3,
+ NAL_SLICE_DPC = 4,
+ NAL_SLICE_IDR = 5,
+ NAL_SEI = 6,
+ NAL_SPS = 7,
+ NAL_PPS = 8,
+ NAL_AUD = 9,
+ NAL_EOSEQ = 10,
+ NAL_EOSTR = 11,
+ NAL_FILLER = 12,
+ NAL_SPSE = 13,
+ NAL_RES_18 = 14, /* NOTE(review): presumably first of reserved range 14..18 -- confirm */
+ NAL_AUX_PIC = 19,
+ NAL_RES_23 = 20, /* NOTE(review): presumably first of reserved range 20..23 -- confirm */
+ NAL_UNSPEC_31 = 24, /* NOTE(review): presumably first of unspecified range 24..31 -- confirm */
+};
+
+/**
+******************************************************************************
+ * @enum CHROMA_FORMAT_IDC
+ * @brief Defines the set of possible chroma formats
+ * Note Chorma format Do not change enum values
+******************************************************************************
+*/
+/* Values 0-3 match the chroma_format_idc syntax element; value 4 is an */
+/* internal extension for planar 4:4:4 -- do not renumber. */
+enum
+{
+ CHROMA_FMT_IDC_MONOCHROME = 0,
+ CHROMA_FMT_IDC_YUV420 = 1,
+ CHROMA_FMT_IDC_YUV422 = 2,
+ CHROMA_FMT_IDC_YUV444 = 3,
+ CHROMA_FMT_IDC_YUV444_PLANES = 4,
+};
+
+
+/**
+******************************************************************************
+ * @enum MBMODES_I16x16
+ * @brief Defines the set of possible intra 16x16 mb modes
+******************************************************************************
+*/
+/* Intra 16x16 prediction modes; values match Intra16x16PredMode */
+/* (spec Table 8-4) -- do not renumber. */
+typedef enum
+{
+ VERT_I16x16 = 0,
+ HORZ_I16x16 = 1,
+ DC_I16x16 = 2,
+ PLANE_I16x16 = 3,
+ MAX_I16x16 = 4,
+}MBMODES_I16x16;
+
+/**
+******************************************************************************
+ * @enum MBMODES_I4x4
+ * @brief Defines the set of possible intra 4x4 mb modes
+******************************************************************************
+*/
+/* Values match Intra4x4PredMode (spec Table 8-2) -- do not renumber. */
+typedef enum
+{
+ VERT_I4x4 = 0,
+ HORZ_I4x4 = 1,
+ DC_I4x4 = 2,
+ DIAG_DL_I4x4 = 3,
+ DIAG_DR_I4x4 = 4,
+ VERT_R_I4x4 = 5,
+ HORZ_D_I4x4 = 6,
+ VERT_L_I4x4 = 7,
+ HORZ_U_I4x4 = 8,
+ MAX_I4x4 = 9,
+}MBMODES_I4x4;
+
+/**
+******************************************************************************
+ * @enum MBMODES_I8x8
+ * @brief Defines the set of possible intra 8x8 mb modes
+******************************************************************************
+*/
+/* Same mode numbering as the 4x4 modes (spec Table 8-3). */
+typedef enum
+{
+ VERT_I8x8 = 0,
+ HORZ_I8x8 = 1,
+ DC_I8x8 = 2,
+ DIAG_DL_I8x8 = 3,
+ DIAG_DR_I8x8 = 4,
+ VERT_R_I8x8 = 5,
+ HORZ_D_I8x8 = 6,
+ VERT_L_I8x8 = 7,
+ HORZ_U_I8x8 = 8,
+ MAX_I8x8 = 9,
+}MBMODES_I8x8;
+
+/**
+******************************************************************************
+ * @enum MBMODES_CHROMA_I8x8 (Chroma)
+ * @brief Defines the set of possible intra 8x8 mb modes for chroma
+******************************************************************************
+*/
+/* Chroma mode order differs from luma: DC is mode 0 (spec Table 8-5). */
+typedef enum
+{
+ DC_CH_I8x8 = 0,
+ HORZ_CH_I8x8 = 1,
+ VERT_CH_I8x8 = 2,
+ PLANE_CH_I8x8 = 3,
+ MAX_CH_I8x8 = 4,
+}MBMODES_CHROMA_I8x8;
+
+/**
+******************************************************************************
+ * @enum MBTYPES
+ * @brief Defines the set of possible macro block types
+******************************************************************************
+*/
+/* Internal macroblock classification; NOT the bitstream mb_type codes */
+/* (those are in MBTYPE_ISLICE_T / MBTYPE_PSLICE_T / MBTYPE_BSLICE_T). */
+typedef enum
+{
+ I16x16 = 0,
+ I4x4 = 1,
+ I8x8 = 2,
+ P16x16 = 3,
+ P16x8 = 4,
+ P8x16 = 5,
+ P8x8 = 6,
+ PSKIP = 7,
+ IPCM = 8,
+ MAX_MBTYPES,
+}MBTYPES_T;
+
+/* Prediction list */
+/* Do not change enum values */
+enum
+{
+ PRED_L0 = 0,
+ PRED_L1 = 1,
+ PRED_BI = 2
+};
+
+
+/**
+******************************************************************************
+ * @enum ENTROPY_BLK_TYPE
+ * @brief Defines the nature of blocks employed in entropy coding
+******************************************************************************
+*/
+typedef enum
+{
+ ENTROPY_BLK_INVALID = -1,
+ CAVLC_LUMA_4x4_DC = 0,
+ CAVLC_LUMA_4x4_AC = 1,
+ CAVLC_LUMA_4x4 = 2,
+ CAVLC_CHROMA_4x4_DC = 3,
+ CAVLC_CHROMA_4x4_AC = 4,
+} ENTROPY_BLK_TYPE;
+
+/**
+******************************************************************************
+ * @enum ENTROPY_MODE
+ * @brief Entropy coding modes
+******************************************************************************
+*/
+/* Values match the entropy_coding_mode_flag in the PPS */
+typedef enum
+{
+ CAVLC = 0,
+ CABAC = 1,
+} ENTROPY_MODE;
+
+/**
+******************************************************************************
+ * @enum COMPONENT_TYPE
+ * @brief components Y, U & V
+******************************************************************************
+*/
+typedef enum
+{
+ Y,
+ U,
+ V,
+} COMPONENT_TYPE;
+
+
+/**
+******************************************************************************
+ * @enum MBPART_PREDMODE_T
+ * @brief MbPartPredMode Table 7-11 to 7-14
+******************************************************************************
+*/
+/* Prediction mode of a macroblock partition, as returned by the spec's */
+/* MbPartPredMode() / SubMbPredMode() functions. */
+typedef enum
+{
+ MBPART_NA,
+ MBPART_I4x4,
+ MBPART_I8x8,
+ MBPART_I16x16,
+ MBPART_L0,
+ MBPART_L1,
+ MBPART_BI,
+ MBPART_DIRECT,
+ MBPART_IPCM,
+}MBPART_PREDMODE_T;
+
+
+typedef enum
+{
+ I_NxN,
+ I_16x16_0_0_0,
+ I_16x16_1_0_0,
+ I_16x16_2_0_0,
+ I_16x16_3_0_0,
+ I_16x16_0_1_0,
+ I_16x16_1_1_0,
+ I_16x16_2_1_0,
+ I_16x16_3_1_0,
+ I_16x16_0_2_0,
+ I_16x16_1_2_0,
+ I_16x16_2_2_0,
+ I_16x16_3_2_0,
+ I_16x16_0_0_1,
+ I_16x16_1_0_1,
+ I_16x16_2_0_1,
+ I_16x16_3_0_1,
+ I_16x16_0_1_1,
+ I_16x16_1_1_1,
+ I_16x16_2_1_1,
+ I_16x16_3_1_1,
+ I_16x16_0_2_1,
+ I_16x16_1_2_1,
+ I_16x16_2_2_1,
+ I_16x16_3_2_1,
+ I_PCM,
+}MBTYPE_ISLICE_T;
+
+typedef enum
+{
+ P_L0_16x16,
+ P_L0_L0_16x8,
+ P_L0_L0_8x16,
+ P_8x8,
+ P_8x8REF0,
+ P_SKIP
+}MBTYPE_PSLICE_T;
+
+/**
+ * MB types for B slices (H.264 Table 7-14).
+ * Naming: B_<pred of first partition>_<pred of second partition>_<partition size>,
+ * where pred is L0 (list 0), L1 (list 1), BI (bi-predictive) or DIRECT.
+ */
+typedef enum
+{
+    B_DIRECT_16x16,
+    B_L0_16x16,
+    B_L1_16x16,
+    B_BI_16x16,
+    B_L0_L0_16x8,
+    B_L0_L0_8x16,
+    B_L1_L1_16x8,
+    B_L1_L1_8x16,
+    B_L0_L1_16x8,
+    B_L0_L1_8x16,
+    B_L1_L0_16x8,
+    B_L1_L0_8x16,
+    B_L0_BI_16x8,
+    B_L0_BI_8x16,
+    B_L1_BI_16x8,
+    B_L1_BI_8x16,
+    B_BI_L0_16x8,
+    B_BI_L0_8x16,
+    B_BI_L1_16x8,
+    B_BI_L1_8x16,
+    B_BI_BI_16x8,
+    B_BI_BI_8x16,
+    /* Four 8x8 partitions, sub-MB types signaled */
+    B_8x8,
+    /* Skipped MB */
+    B_SKIP,
+}MBTYPE_BSLICE_T;
+
+
+/** Sub-MB types for P slices (H.264 Table 7-17); all predict from list 0 */
+typedef enum
+{
+    P_L0_8x8,
+    P_L0_8x4,
+    P_L0_4x8,
+    P_L0_4x4,
+}SUBMBTYPE_PSLICE_T;
+
+/**
+ * Sub-MB types for B slices (H.264 Table 7-18).
+ * Naming: B_<prediction list>_<sub-partition size>.
+ */
+typedef enum
+{
+    B_DIRECT_8x8,
+    B_L0_8x8,
+    B_L1_8x8,
+    B_BI_8x8,
+    B_L0_8x4,
+    B_L0_4x8,
+    B_L1_8x4,
+    B_L1_4x8,
+    B_BI_8x4,
+    B_BI_4x8,
+    B_L0_4x4,
+    B_L1_4x4,
+    B_BI_4x4,
+}SUBMBTYPE_BSLICE_T;
+
+/**
+ * DC Mode pattern for 4 4x4 sub blocks in an MB row
+ */
+/*
+ * DC mode pattern for 4 4x4 sub blocks in an MB row: DC_I16x16 replicated
+ * into all four bytes of a word. The whole expansion is parenthesized so
+ * the low-precedence '|' cannot rebind when the macro is used inside a
+ * larger expression (e.g. next to '&' or '==').
+ */
+#define DC_I16X16_MB_ROW ((DC_I16x16 << 24) | (DC_I16x16 << 16) | \
+                          (DC_I16x16 << 8) | DC_I16x16)
+
+
+
+/*****************************************************************************/
+/* Constant Macros */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Reference frame defs */
+/*****************************************************************************/
+/* Maximum DPB size */
+#define MAX_DPB_SIZE 16
+
+/* Maximum mmco commands in slice header */
+#define MAX_MMCO_COMMANDS 32
+
+/* Maximum reference reorder idc */
+#define MAX_MODICATION_IDC 32
+
+/*****************************************************************************/
+/* SPS restrictions */
+/*****************************************************************************/
+
+/* Number of SPS allowed */
+/* An extra buffer is allocated to write the parsed data
+ * It is copied to the appropriate location later */
+#define MAX_SPS_CNT (32 + 1)
+
+/* Maximum long term reference pics */
+#define MAX_LTREF_PICS_SPS 16
+
+/* Maximum short term reference pics */
+#define MAX_STREF_PICS_SPS 64
+
+
+/*****************************************************************************/
+/* PPS restrictions */
+/*****************************************************************************/
+
+/* Number of PPS allowed */
+/* An extra buffer is allocated to write the parsed data
+ * It is copied to the appropriate location later */
+#define MAX_PPS_CNT (256 + 1)
+
+/*****************************************************************************/
+/* Macro definitions for sizes of MB, PU, TU, CU */
+/*****************************************************************************/
+#define MB_SIZE 16
+#define BLK8x8SIZE 8
+#define BLK_SIZE 4
+
+
+/* TU Size Range */
+#define MAX_TU_SIZE 8
+#define MIN_TU_SIZE 4
+
+/* Max Transform Size */
+#define MAX_TRANS_SIZE (MAX_TU_SIZE*MAX_TU_SIZE)
+
+/* PU Size Range */
+#define MAX_PU_SIZE 16
+#define MIN_PU_SIZE 4
+
+/* Number of max TU in a MB row */
+#define MAX_TU_IN_MB_ROW ((MB_SIZE / MIN_TU_SIZE))
+
+/* Number of max PU in a CTb row */
+#define MAX_PU_IN_MB_ROW ((MB_SIZE / MIN_PU_SIZE))
+
+
+/* Number of max PU in a MB */
+/*****************************************************************************/
+/* Note though for 64 x 64 MB, Max PU in MB is 128, in order to store */
+/* intra pred info, 256 entries are needed */
+/*****************************************************************************/
+#define MAX_PU_IN_MB ((MB_SIZE / MIN_PU_SIZE) * \
+ (MB_SIZE / MIN_PU_SIZE))
+
+/* Number of max TU in a MB */
+#define MAX_TU_IN_MB ((MB_SIZE / MIN_TU_SIZE) * \
+ (MB_SIZE / MIN_TU_SIZE))
+
+
+
+/**
+ * Maximum transform depths
+ */
+#define MAX_TRAFO_DEPTH 5
+
+#define MAX_DC_4x4_SUBBLK_LUMA 1
+#define MAX_AC_4x4_SUBBLK_LUMA 16
+#define MAX_DC_4x4_SUBBLK_CHROMA 2
+#define MAX_AC_4x4_SUBBLK_CHROMA 8
+
+#define MAX_4x4_SUBBLKS (MAX_DC_4x4_SUBBLK_LUMA + MAX_DC_4x4_SUBBLK_CHROMA +\
+ MAX_AC_4x4_SUBBLK_LUMA + MAX_AC_4x4_SUBBLK_CHROMA)
+
+/* Max number of deblocking edges */
+#define MAX_VERT_DEBLK_EDGES ((MB_SIZE/8) * (MB_SIZE/4))
+#define MAX_HORZ_DEBLK_EDGES ((MB_SIZE/4) * (MB_SIZE/8))
+
+/* Qp can not change below 8x8 level */
+#define MAX_DEBLK_QP_CNT ((MB_SIZE/8) * (MB_SIZE/8))
+
+/*****************************************************************************/
+/* Parsing related macros */
+/*****************************************************************************/
+#define SUBBLK_COEFF_CNT 16
+
+/* Quant and Trans defs */
+
+/*****************************************************************************/
+/* Sizes for Transform functions */
+/*****************************************************************************/
+#define TRANS_SIZE_4 4
+#define TRANS_SIZE_8 8
+#define TRANS_SIZE_16 16
+#define TRANS_SIZE_32 32
+
+
+#define IT_SHIFT_STAGE_1 7
+#define IT_SHIFT_STAGE_2 12
+
+/**
+ * @brief Maximum transform dynamic range (excluding sign bit)
+ */
+#define MAX_TR_DYNAMIC_RANGE 15
+
+/**
+ * @brief Q(QP%6) * IQ(QP%6) = 2^20
+ */
+#define QUANT_IQUANT_SHIFT 20
+
+/**
+ * @brief Q factor for Qp%6 multiplication
+ */
+#define QUANT_SHIFT 14
+
+/**
+ * @brief Q shift factor for flat rescale matrix weights
+ */
+#define FLAT_RESCALE_MAT_Q_SHIFT 11
+
+/**
+ * @brief Scaling matrix is represented in Q15 format
+ */
+#define SCALING_Q_SHIFT 15
+
+/**
+ * @brief rounding factor for quantization represented in Q9 format
+ */
+#define QUANT_ROUND_FACTOR_Q 9
+
+/**
+ * @brief Minimum qp supported in H264 spec
+ */
+#define MIN_H264_QP 0
+
+/**
+ * @brief Maximum qp supported in H264 spec
+ */
+#define MAX_H264_QP 51
+
+/**
+ * @brief Total number of transform sizes
+ * used for sizeID while getting scale matrix
+ */
+#define NUM_UNIQUE_TRANS_SIZE 4
+
+/**
+ * @brief Maximum number of bits in frameNumber signaling
+ */
+#define MAX_BITS_IN_FRAME_NUM 16
+
+/**
+ * @brief Maximum number of bits in POC LSB signaling
+ */
+#define MAX_BITS_IN_POC_LSB 16
+
+
+/**
+ * @brief Maximum PIC Order Count type
+ */
+#define MAX_PIC_ORDER_COUNT_TYPE 2
+
+
+/**
+ * @brief Maximum Weighted bipred idc
+ */
+#define MAX_WEIGHT_BIPRED_IDC 2
+
+/*****************************************************************************/
+/* Number of scaling matrices for each transform size */
+/*****************************************************************************/
+#define SCALE_MAT_CNT_TRANS_SIZE_4 6
+#define SCALE_MAT_CNT_TRANS_SIZE_8 6
+#define SCALE_MAT_CNT_TRANS_SIZE_16 6
+#define SCALE_MAT_CNT_TRANS_SIZE_32 2
+
+/* Maximum number of scale matrices for a given transform size */
+#define SCALE_MAT_CNT_MAX_PER_TRANS_SIZE 6
+
+/* Total number of scale matrices */
+#define TOTAL_SCALE_MAT_COUNT (SCALE_MAT_CNT_TRANS_SIZE_4 + \
+ SCALE_MAT_CNT_TRANS_SIZE_8 + \
+ SCALE_MAT_CNT_TRANS_SIZE_16 + \
+ SCALE_MAT_CNT_TRANS_SIZE_32)
+
+
+/*****************************************************************************/
+/* Intra pred Macros */
+/*****************************************************************************/
+/** Planar Intra prediction mode */
+#define INTRA_PLANAR 0
+
+/** DC Intra prediction mode */
+#define INTRA_DC 1
+
+/** Gives angular mode for intra prediction */
+#define INTRA_ANGULAR(x) (x)
+
+/** Following is used to signal no intra prediction in case of pcm blocks
+ */
+#define INTRA_PRED_NONE 63
+
+
+/** Following is used to signal no intra prediction is needed for first three
+ * 4x4 luma blocks in case of 4x4 TU sizes
+ * Also used in pcm cases
+ */
+#define INTRA_PRED_CHROMA_IDX_NONE 7
+
+
+/**
+******************************************************************************
+ * @brief neighbor availability masks
+******************************************************************************
+ */
+#define LEFT_MB_AVAILABLE_MASK 0x01
+#define TOP_LEFT_MB_AVAILABLE_MASK 0x02
+#define TOP_MB_AVAILABLE_MASK 0x04
+#define TOP_RIGHT_MB_AVAILABLE_MASK 0x08
+
+#endif /* IH264_DEFS_H_ */
diff --git a/common/ih264_disp_mgr.c b/common/ih264_disp_mgr.c
new file mode 100755
index 0000000..2bdb524
--- /dev/null
+++ b/common/ih264_disp_mgr.c
@@ -0,0 +1,186 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_disp_mgr.c
+*
+* @brief
+* Contains function definitions for display management
+*
+* @author
+* Srinivas T
+*
+* @par List of Functions:
+* - ih264_disp_mgr_init()
+* - ih264_disp_mgr_add()
+* - ih264_disp_mgr_get()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#include <stdlib.h>
+#include "ih264_typedefs.h"
+#include "ih264_macros.h"
+#include "ih264_disp_mgr.h"
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Initialization function for display buffer manager
+*
+* @par Description:
+* Initializes the display buffer management structure
+*
+* @param[in] ps_disp_mgr
+* Pointer to the display buffer management structure
+*
+* @returns none
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+void ih264_disp_mgr_init(disp_mgr_t *ps_disp_mgr)
+{
+    WORD32 i;
+
+    /* No picture has been handed out yet */
+    ps_disp_mgr->u4_last_abs_poc = DEFAULT_POC;
+
+    /* Mark every slot empty: sentinel POC and NULL buffer pointer */
+    for(i = 0; i < DISP_MGR_MAX_CNT; i++)
+    {
+        ps_disp_mgr->apv_ptr[i] = NULL;
+        ps_disp_mgr->ai4_abs_poc[i] = DEFAULT_POC;
+    }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Adds a buffer to the display manager
+*
+* @par Description:
+* Adds a buffer to the display buffer manager
+*
+* @param[in] ps_disp_mgr
+* Pointer to the display buffer management structure
+*
+* @param[in] buf_id
+* ID of the display buffer
+*
+* @param[in] abs_poc
+* Absolute POC of the display buffer
+*
+* @param[in] pv_ptr
+* Pointer to the display buffer
+*
+* @returns 0 if success, -1 otherwise
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+WORD32 ih264_disp_mgr_add(disp_mgr_t *ps_disp_mgr,
+                          WORD32 buf_id,
+                          WORD32 abs_poc,
+                          void *pv_ptr)
+{
+    /*
+     * Validate the slot index. A negative buf_id would index before the
+     * arrays (out-of-bounds write), so reject it along with ids past the
+     * end of the table.
+     */
+    if((buf_id < 0) || (buf_id >= DISP_MGR_MAX_CNT))
+    {
+        return (-1);
+    }
+
+    /* Slot already holds a buffer that was not yet consumed: refuse */
+    if(ps_disp_mgr->apv_ptr[buf_id] != NULL)
+    {
+        return (-1);
+    }
+
+    /* Record the buffer and its display order (absolute POC) */
+    ps_disp_mgr->apv_ptr[buf_id] = pv_ptr;
+    ps_disp_mgr->ai4_abs_poc[buf_id] = abs_poc;
+    return 0;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Gets the next buffer
+*
+* @par Description:
+* Gets the next display buffer
+*
+* @param[in] ps_disp_mgr
+* Pointer to the display buffer structure
+*
+* @param[out] pi4_buf_id
+* Pointer to hold buffer id of the display buffer being returned
+*
+* @returns Pointer to the next display buffer
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+void* ih264_disp_mgr_get(disp_mgr_t *ps_disp_mgr, WORD32 *pi4_buf_id)
+{
+    WORD32 i;
+    WORD32 i4_min_poc = 0x7FFFFFFF;
+    WORD32 min_poc_id = -1;
+    void *pv_buf;
+
+    /*
+     * Scan for the slot holding the smallest absolute POC.
+     * The '<=' keeps the original tie-break: on equal POCs the
+     * higher-indexed slot wins.
+     */
+    for(i = 0; i < DISP_MGR_MAX_CNT; i++)
+    {
+        WORD32 i4_poc = ps_disp_mgr->ai4_abs_poc[i];
+        if((DEFAULT_POC != i4_poc) && (i4_poc <= i4_min_poc))
+        {
+            i4_min_poc = i4_poc;
+            min_poc_id = i;
+        }
+    }
+
+    *pi4_buf_id = min_poc_id;
+
+    /* Every slot still at DEFAULT_POC means nothing is queued for display */
+    if(-1 == min_poc_id)
+    {
+        return NULL;
+    }
+
+    pv_buf = ps_disp_mgr->apv_ptr[min_poc_id];
+
+    /* Clear the slot so the same buffer is not returned twice */
+    ps_disp_mgr->apv_ptr[min_poc_id] = NULL;
+    ps_disp_mgr->ai4_abs_poc[min_poc_id] = DEFAULT_POC;
+    return pv_buf;
+}
diff --git a/common/ih264_disp_mgr.h b/common/ih264_disp_mgr.h
new file mode 100755
index 0000000..6f56493
--- /dev/null
+++ b/common/ih264_disp_mgr.h
@@ -0,0 +1,70 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_disp_mgr.h
+*
+* @brief
+* Function declarations used for display management
+*
+* @author
+* Srinivas T
+*
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _DISP_MGR_H_
+#define _DISP_MGR_H_
+
+#define DISP_MGR_MAX_CNT 64
+#define DEFAULT_POC 0x7FFFFFFF
+
+typedef struct
+{
+    /**
+     * Absolute POC bookkeeping; initialized to DEFAULT_POC.
+     * NOTE(review): not updated by any function in ih264_disp_mgr.c
+     * visible here — confirm its use at the call sites.
+     */
+    UWORD32 u4_last_abs_poc;
+
+    /**
+     * Absolute POC of each display buffer slot;
+     * DEFAULT_POC marks an empty slot
+     */
+    WORD32 ai4_abs_poc[DISP_MGR_MAX_CNT];
+
+    /**
+     * Display buffer pointer of each slot; NULL marks an empty slot
+     */
+    void *apv_ptr[DISP_MGR_MAX_CNT];
+}disp_mgr_t;
+
+void ih264_disp_mgr_init(disp_mgr_t *ps_disp_mgr);
+
+WORD32 ih264_disp_mgr_add(disp_mgr_t *ps_disp_mgr,
+ WORD32 id,
+ WORD32 abs_poc,
+ void *pv_ptr);
+
+void* ih264_disp_mgr_get(disp_mgr_t *ps_disp_mgr, WORD32 *pi4_buf_id);
+
+#endif //_DISP_MGR_H_
diff --git a/common/ih264_dpb_mgr.c b/common/ih264_dpb_mgr.c
new file mode 100755
index 0000000..8e087d3
--- /dev/null
+++ b/common/ih264_dpb_mgr.c
@@ -0,0 +1,1176 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_dpb_mgr.c
+ *
+ * @brief
+ * Function definitions used for decoded picture buffer management
+ *
+ * @author
+ * Srinivas T
+ *
+ * @par List of Functions:
+ * - ih264_dpb_mgr_init()
+ * - ih264_dpb_mgr_sort_short_term_fields_by_frame_num()
+ * - ih264_dpb_mgr_sort_short_term_fields_by_poc_l0()
+ * - ih264_dpb_mgr_sort_short_term_fields_by_poc_l1()
+ * - ih264_dpb_mgr_sort_long_term_fields_by_frame_idx()
+ * - ih264_dpb_mgr_alternate_ref_fields()
+ * - ih264_dpb_mgr_insert_ref_field()
+ * - ih264_dpb_mgr_insert_ref_frame()
+ * - ih264_dpb_mgr_count_ref_frames()
+ * - ih264_dpb_mgr_delete_ref_frame()
+ * - ih264_dpb_mgr_delete_long_ref_fields_max_frame_idx()
+ * - ih264_dpb_mgr_delete_short_ref_frame()
+ * - ih264_dpb_mgr_delete_all_ref_frames()
+ * - ih264_dpb_mgr_reset()
+ * - ih264_dpb_mgr_release_pics()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "ih264_typedefs.h"
+#include "ih264_defs.h"
+#include "ih264_macros.h"
+#include "ih264_error.h"
+#include "ih264_structs.h"
+#include "ih264_buf_mgr.h"
+#include "ih264_dpb_mgr.h"
+#include "ih264_debug.h"
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * DPB manager initializer
+ *
+ * @par Description:
+ * Initialises the DPB manager structure
+ *
+ * @param[in] ps_dpb_mgr
+ * Pointer to the DPB manager structure
+ *
+ * @returns
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+void ih264_dpb_mgr_init(dpb_mgr_t *ps_dpb_mgr)
+{
+    UWORD32 i;
+    dpb_info_t *ps_dpb_info = ps_dpb_mgr->as_dpb_info;
+
+    /* Mark every DPB slot and both field-picture arrays as unused */
+    for(i = 0; i < MAX_DPB_BUFS; i++)
+    {
+        ps_dpb_info[i].ps_pic_buf = NULL;
+        ps_dpb_info[i].ps_prev_dpb = NULL;
+
+        ps_dpb_mgr->as_top_field_pics[i].i4_used_as_ref = INVALID;
+        ps_dpb_mgr->as_top_field_pics[i].i1_field_type = INVALID;
+        ps_dpb_mgr->as_top_field_pics[i].i4_long_term_frame_idx = -1;
+
+        ps_dpb_mgr->as_bottom_field_pics[i].i4_used_as_ref = INVALID;
+        ps_dpb_mgr->as_bottom_field_pics[i].i1_field_type = INVALID;
+        ps_dpb_mgr->as_bottom_field_pics[i].i4_long_term_frame_idx = -1;
+    }
+
+    /* Both reference linked lists start out empty */
+    ps_dpb_mgr->ps_dpb_short_term_head = NULL;
+    ps_dpb_mgr->ps_dpb_long_term_head = NULL;
+    ps_dpb_mgr->u1_num_short_term_ref_bufs = 0;
+    ps_dpb_mgr->u1_num_long_term_ref_bufs = 0;
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Function to sort short term pics by frame_num.
+ *
+ * @par Description:
+ * Sorts short term fields by frame_num. For 2 fields having same frame_num,
+ * orders them based on requested first field type.
+ *
+ * @param[in] ps_dpb_mgr
+ * Pointer to the DPB manager structure
+ *
+ * @param[in] curr_frame_num
+ * frame_num of the current pic
+ *
+ * @param[in] first_field_type
+ * For complementary fields, required first field
+ *
+ * @param[in] max_frame_num
+ * Maximum frame_num allowed
+ *
+ * @returns
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+WORD32 ih264_dpb_mgr_sort_short_term_fields_by_frame_num(dpb_mgr_t *ps_dpb_mgr,
+                                                         WORD32 curr_frame_num,
+                                                         WORD32 first_field_type,
+                                                         WORD32 max_frame_num)
+{
+    dpb_info_t *ps_dpb_node1 = ps_dpb_mgr->ps_dpb_short_term_head;
+    dpb_info_t *ps_dpb_node2;
+    WORD32 frame_num_node1;
+    WORD32 frame_num_node2;
+    pic_buf_t *ps_pic_buf;
+
+    /* Empty short-term list: nothing to sort */
+    if(ps_dpb_node1 == NULL)
+        return -1;
+
+    /*
+     * Selection-style sort in decreasing order of (unwrapped) frame_num.
+     * Only the ps_pic_buf payloads are swapped; the link order of the
+     * dpb_info nodes themselves is left untouched.
+     */
+    for (; ps_dpb_node1 != NULL; ps_dpb_node1 = ps_dpb_node1->ps_prev_dpb)
+    {
+        for (ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb; ps_dpb_node2 != NULL; ps_dpb_node2 = ps_dpb_node2->ps_prev_dpb)
+        {
+            frame_num_node1 = ps_dpb_node1->ps_pic_buf->i4_frame_num;
+            frame_num_node2 = ps_dpb_node2->ps_pic_buf->i4_frame_num;
+
+            /* A frame_num larger than the current pic's belongs to a
+             * previous frame_num cycle: unwrap it so the comparison below
+             * reflects true decoding order */
+            if(frame_num_node1 > curr_frame_num)
+                frame_num_node1 = frame_num_node1 - max_frame_num;
+            if(frame_num_node2 > curr_frame_num)
+                frame_num_node2 = frame_num_node2 - max_frame_num;
+
+            /* Keep the larger frame_num closer to the head of the list */
+            if(frame_num_node1 < frame_num_node2)
+            {
+                ps_pic_buf = ps_dpb_node1->ps_pic_buf;
+                ps_dpb_node1->ps_pic_buf = ps_dpb_node2->ps_pic_buf;
+                ps_dpb_node2->ps_pic_buf = ps_pic_buf;
+            }
+        }
+    }
+
+    /**
+     * For frames and complementary field pairs,
+     * ensure first_field_type appears first in the list
+     */
+    ps_dpb_node1 = ps_dpb_mgr->ps_dpb_short_term_head;
+    ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb;
+    while(ps_dpb_node2 != NULL)
+    {
+        pic_buf_t *ps_pic_node1 = ps_dpb_node1->ps_pic_buf;
+        pic_buf_t *ps_pic_node2 = ps_dpb_node2->ps_pic_buf;
+        frame_num_node1 = ps_pic_node1->i4_frame_num;
+        frame_num_node2 = ps_pic_node2->i4_frame_num;
+        /* Adjacent entries with equal frame_num are the two fields of a
+         * complementary pair; swap payloads if the first is the wrong field */
+        if(frame_num_node1 == frame_num_node2)
+        {
+            ASSERT(ps_pic_node1->i1_field_type != ps_pic_node2->i1_field_type);
+            if(ps_pic_node1->i1_field_type != first_field_type)
+            {
+                ps_dpb_node1->ps_pic_buf = ps_pic_node2;
+                ps_dpb_node2->ps_pic_buf = ps_pic_node1;
+            }
+        }
+        ps_dpb_node1 = ps_dpb_node2;
+        ps_dpb_node2 = ps_dpb_node2->ps_prev_dpb;
+    }
+    return 0;
+
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Function to sort short term pics by poc for list 0.
+ *
+ * @par Description:
+ * Orders all the pocs less than current poc in the descending order.
+ * Then orders all the pocs greater than current poc in the ascending order.
+ *
+ * @param[in] ps_dpb_mgr
+ * Pointer to the DPB manager structure
+ *
+ * @param[in] curr_poc
+ * Poc of the current pic
+ *
+ * @param[in] first_field_type
+ * For complementary fields, required first field
+ *
+ * @returns
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+WORD32 ih264_dpb_mgr_sort_short_term_fields_by_poc_l0(dpb_mgr_t *ps_dpb_mgr,
+                                                      WORD32 curr_poc,
+                                                      WORD32 first_field_type)
+{
+    dpb_info_t *ps_dpb_node1 = ps_dpb_mgr->ps_dpb_short_term_head;
+    dpb_info_t *ps_dpb_node2;
+    WORD32 poc_node1;
+    WORD32 poc_node2;
+    WORD32 frame_num_node1;
+    WORD32 frame_num_node2;
+    pic_buf_t *ps_pic_buf;
+
+    /* Empty short-term list: nothing to sort */
+    if(ps_dpb_node1 == NULL)
+        return -1;
+
+    /**
+     * Sort the fields by poc.
+     * All POCs less than current poc are first placed in the descending order.
+     * Then all POCs greater than current poc are placed in the ascending order.
+     */
+    for (; ps_dpb_node1 != NULL; ps_dpb_node1 = ps_dpb_node1->ps_prev_dpb)
+    {
+        for (ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb; ps_dpb_node2 != NULL; ps_dpb_node2 = ps_dpb_node2->ps_prev_dpb)
+        {
+            poc_node1 = ps_dpb_node1->ps_pic_buf->i4_abs_poc;
+            poc_node2 = ps_dpb_node2->ps_pic_buf->i4_abs_poc;
+            /* The current picture itself must not be in the reference list */
+            ASSERT(poc_node1 != curr_poc);
+            ASSERT(poc_node2 != curr_poc);
+            /* Skip the swap when the pair already satisfies the L0 order
+             * described above; otherwise swap the payloads */
+            if(((poc_node1 < curr_poc) && (poc_node2 > curr_poc)) ||
+               ((poc_node1 < curr_poc) && (poc_node2 < curr_poc) && (poc_node1 > poc_node2)) ||
+               ((poc_node1 > curr_poc) && (poc_node2 > curr_poc) && (poc_node1 < poc_node2)))
+                continue;
+
+            /* Out of order: swap the pic_buf payloads (links stay fixed) */
+            ps_pic_buf = ps_dpb_node1->ps_pic_buf;
+            ps_dpb_node1->ps_pic_buf = ps_dpb_node2->ps_pic_buf;
+            ps_dpb_node2->ps_pic_buf = ps_pic_buf;
+        }
+    }
+
+    /* For complementary field pairs (equal frame_num), make sure the
+     * field of first_field_type comes first */
+    ps_dpb_node1 = ps_dpb_mgr->ps_dpb_short_term_head;
+    ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb;
+    while(ps_dpb_node2 != NULL)
+    {
+        pic_buf_t *ps_pic_node1 = ps_dpb_node1->ps_pic_buf;
+        pic_buf_t *ps_pic_node2 = ps_dpb_node2->ps_pic_buf;
+        frame_num_node1 = ps_pic_node1->i4_frame_num;
+        frame_num_node2 = ps_pic_node2->i4_frame_num;
+        if(frame_num_node1 == frame_num_node2)
+        {
+            ASSERT(ps_pic_node1->i1_field_type != ps_pic_node2->i1_field_type);
+            if(ps_pic_node1->i1_field_type != first_field_type)
+            {
+                ps_dpb_node1->ps_pic_buf = ps_pic_node2;
+                ps_dpb_node2->ps_pic_buf = ps_pic_node1;
+            }
+        }
+        ps_dpb_node1 = ps_dpb_node2;
+        ps_dpb_node2 = ps_dpb_node2->ps_prev_dpb;
+    }
+    return 0;
+
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Function to sort short term pics by poc for list 1.
+ *
+ * @par Description:
+ * Orders all the pocs greater than current poc in the ascending order.
+ * Then orders all the pocs less than current poc in the descending order.
+ *
+ * @param[in] ps_dpb_mgr
+ * Pointer to the DPB manager structure
+ *
+ * @param[in] curr_poc
+ * Poc of the current pic
+ *
+ * @param[in] first_field_type
+ * For complementary fields, required first field
+ *
+ * @returns
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+WORD32 ih264_dpb_mgr_sort_short_term_fields_by_poc_l1(dpb_mgr_t *ps_dpb_mgr,
+                                                      WORD32 curr_poc,
+                                                      WORD32 first_field_type)
+{
+    dpb_info_t *ps_dpb_node1 = ps_dpb_mgr->ps_dpb_short_term_head;
+    dpb_info_t *ps_dpb_node2;
+    WORD32 poc_node1;
+    WORD32 poc_node2;
+    WORD32 frame_num_node1;
+    WORD32 frame_num_node2;
+    pic_buf_t *ps_pic_buf;
+
+    /* Empty short-term list: nothing to sort */
+    if(ps_dpb_node1 == NULL)
+        return -1;
+
+    /**
+     * Sort the fields by poc.
+     * All POCs greater than current poc are first placed in the ascending order.
+     * Then all POCs less than current poc are placed in the descending order.
+     */
+    for (; ps_dpb_node1 != NULL; ps_dpb_node1 = ps_dpb_node1->ps_prev_dpb)
+    {
+        for (ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb; ps_dpb_node2 != NULL; ps_dpb_node2 = ps_dpb_node2->ps_prev_dpb)
+        {
+            poc_node1 = ps_dpb_node1->ps_pic_buf->i4_abs_poc;
+            poc_node2 = ps_dpb_node2->ps_pic_buf->i4_abs_poc;
+            /* The current picture itself must not be in the reference list */
+            ASSERT(poc_node1 != curr_poc);
+            ASSERT(poc_node2 != curr_poc);
+            /* Skip the swap when the pair already satisfies the L1 order
+             * (mirror image of the L0 ordering); otherwise swap payloads */
+            if(((poc_node1 > curr_poc) && (poc_node2 < curr_poc)) ||
+               ((poc_node1 < curr_poc) && (poc_node2 < curr_poc) && (poc_node1 > poc_node2)) ||
+               ((poc_node1 > curr_poc) && (poc_node2 > curr_poc) && (poc_node1 < poc_node2)))
+                continue;
+
+            /* Out of order: swap the pic_buf payloads (links stay fixed) */
+            ps_pic_buf = ps_dpb_node1->ps_pic_buf;
+            ps_dpb_node1->ps_pic_buf = ps_dpb_node2->ps_pic_buf;
+            ps_dpb_node2->ps_pic_buf = ps_pic_buf;
+        }
+    }
+
+    /* For complementary field pairs (equal frame_num), make sure the
+     * field of first_field_type comes first */
+    ps_dpb_node1 = ps_dpb_mgr->ps_dpb_short_term_head;
+    ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb;
+    while(ps_dpb_node2 != NULL)
+    {
+        pic_buf_t *ps_pic_node1 = ps_dpb_node1->ps_pic_buf;
+        pic_buf_t *ps_pic_node2 = ps_dpb_node2->ps_pic_buf;
+        frame_num_node1 = ps_pic_node1->i4_frame_num;
+        frame_num_node2 = ps_pic_node2->i4_frame_num;
+        if(frame_num_node1 == frame_num_node2)
+        {
+            ASSERT(ps_pic_node1->i1_field_type != ps_pic_node2->i1_field_type);
+            if(ps_pic_node1->i1_field_type != first_field_type)
+            {
+                ps_dpb_node1->ps_pic_buf = ps_pic_node2;
+                ps_dpb_node2->ps_pic_buf = ps_pic_node1;
+            }
+        }
+        ps_dpb_node1 = ps_dpb_node2;
+        ps_dpb_node2 = ps_dpb_node2->ps_prev_dpb;
+    }
+    return 0;
+}
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Function to sort long term pics by long term frame idx.
+ *
+ * @par Description:
+ * Sorts long term fields by long term frame idx. For 2 fields
+ * having same frame_num, orders them based on requested first field type.
+ *
+ * @param[in] ps_dpb_mgr
+ * Pointer to the DPB manager structure
+ *
+ * @param[in] first_field_type
+ * For complementary fields, required first field
+ *
+ * @returns
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+WORD32 ih264_dpb_mgr_sort_long_term_fields_by_frame_idx(dpb_mgr_t *ps_dpb_mgr,
+                                                        WORD32 first_field_type)
+{
+    dpb_info_t *ps_dpb_node1 = ps_dpb_mgr->ps_dpb_long_term_head;
+    dpb_info_t *ps_dpb_node2;
+    WORD32 frame_idx_node1;
+    WORD32 frame_idx_node2;
+    pic_buf_t *ps_pic_buf;
+
+    /* Empty long-term list: nothing to sort */
+    if(ps_dpb_node1 == NULL)
+        return -1;
+
+    /* Sort the fields by frame idx (ascending); only the pic_buf
+     * payloads are swapped, the node links stay fixed */
+    for (; ps_dpb_node1 != NULL; ps_dpb_node1 = ps_dpb_node1->ps_prev_dpb)
+    {
+        for (ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb; ps_dpb_node2 != NULL; ps_dpb_node2 = ps_dpb_node2->ps_prev_dpb)
+        {
+            frame_idx_node1 = ps_dpb_node1->ps_pic_buf->i4_long_term_frame_idx;
+            frame_idx_node2 = ps_dpb_node2->ps_pic_buf->i4_long_term_frame_idx;
+
+            /* Keep the smaller long_term_frame_idx closer to the head */
+            if(frame_idx_node1 > frame_idx_node2)
+            {
+                ps_pic_buf = ps_dpb_node1->ps_pic_buf;
+                ps_dpb_node1->ps_pic_buf = ps_dpb_node2->ps_pic_buf;
+                ps_dpb_node2->ps_pic_buf = ps_pic_buf;
+            }
+        }
+    }
+
+    /**
+     * For frames and complementary field pairs,
+     * ensure first_field_type appears first in the list
+     */
+    ps_dpb_node1 = ps_dpb_mgr->ps_dpb_long_term_head;
+    ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb;
+    while(ps_dpb_node2 != NULL)
+    {
+        pic_buf_t *ps_pic_node1 = ps_dpb_node1->ps_pic_buf;
+        pic_buf_t *ps_pic_node2 = ps_dpb_node2->ps_pic_buf;
+        frame_idx_node1 = ps_pic_node1->i4_long_term_frame_idx;
+        frame_idx_node2 = ps_pic_node2->i4_long_term_frame_idx;
+        /* Adjacent entries with equal frame idx are a complementary field
+         * pair; swap payloads if the first is the wrong field type */
+        if(frame_idx_node1 == frame_idx_node2)
+        {
+            ASSERT(ps_pic_node1->i1_field_type != ps_pic_node2->i1_field_type);
+            if(ps_pic_node1->i1_field_type != first_field_type)
+            {
+                ps_dpb_node1->ps_pic_buf = ps_pic_node2;
+                ps_dpb_node2->ps_pic_buf = ps_pic_node1;
+            }
+        }
+        ps_dpb_node1 = ps_dpb_node2;
+        ps_dpb_node2 = ps_dpb_node2->ps_prev_dpb;
+    }
+    return 0;
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Function to alternate fields.
+ *
+ * @par Description:
+ * In the ordered list of fields, alternate fields starting with
+ * first_field_type
+ *
+ * @param[in] ps_dpb_mgr
+ * Pointer to the DPB manager structure
+ *
+ * @param[in] reference_type
+ * This is used to select between short-term and long-term linked list.
+ *
+ * @param[in] first_field_type
+ * For complementary fields, required first field
+ *
+ * @returns
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+WORD32 ih264_dpb_mgr_alternate_ref_fields(dpb_mgr_t *ps_dpb_mgr,
+                                          WORD32 reference_type,
+                                          WORD32 first_field_type)
+{
+    dpb_info_t s_dpb_head;
+    dpb_info_t *ps_dpb_head;
+    dpb_info_t *ps_dpb_node1;
+    dpb_info_t *ps_dpb_node2;
+    dpb_info_t *ps_dpb_node3;
+    dpb_info_t *ps_dpb_node4;
+    WORD32 expected_field;
+
+    expected_field = first_field_type;
+
+    /* Dummy head node on the stack, so splicing a node in front of the
+     * real list head needs no special case */
+    ps_dpb_head = &s_dpb_head;
+
+    ps_dpb_head->ps_prev_dpb = (reference_type == SHORT_TERM_REF) ?
+                    ps_dpb_mgr->ps_dpb_short_term_head:
+                    ps_dpb_mgr->ps_dpb_long_term_head;
+
+    /* Walk the list keeping node1 = last correctly-placed node and
+     * node2 = candidate for the next position */
+    ps_dpb_node1 = ps_dpb_head;
+    ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb;
+    while(ps_dpb_node2 != NULL)
+    {
+        pic_buf_t *ps_pic_node2 = ps_dpb_node2->ps_pic_buf;
+        if(ps_pic_node2->i1_field_type != expected_field)
+        {
+            /*
+             * If it is not expected field, loop over the node till
+             * the expected field.
+             */
+            ps_dpb_node3 = ps_dpb_node2;
+            ps_dpb_node4 = ps_dpb_node2->ps_prev_dpb;
+            while((ps_dpb_node4 != NULL) &&
+                  (ps_dpb_node4->ps_pic_buf->i1_field_type != expected_field))
+            {
+                ps_dpb_node3 = ps_dpb_node4;
+                ps_dpb_node4 = ps_dpb_node4->ps_prev_dpb;
+            }
+            if(ps_dpb_node4 != NULL)
+            {
+                /* Unlink node4 (first node of the expected parity) and
+                 * splice it between node1 and node2 */
+                ps_dpb_node1->ps_prev_dpb = ps_dpb_node4;
+                ps_dpb_node3->ps_prev_dpb = ps_dpb_node4->ps_prev_dpb;
+                ps_dpb_node4->ps_prev_dpb = ps_dpb_node2;
+            }
+            else
+            {
+                /* node4 null means we have reached the end */
+                break;
+            }
+        }
+        ps_dpb_node1 = ps_dpb_node1->ps_prev_dpb;
+        ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb;
+        /* The next field must be of opposite parity to the one just placed */
+        expected_field = (ps_dpb_node1->ps_pic_buf->i1_field_type == TOP_FIELD)?
+                BOTTOM_FIELD:TOP_FIELD;
+    }
+
+    /* Write the (possibly new) head of the rearranged list back */
+    if((reference_type == SHORT_TERM_REF))
+    {
+        ps_dpb_mgr->ps_dpb_short_term_head = ps_dpb_head->ps_prev_dpb;
+    }
+    else
+    {
+        ps_dpb_mgr->ps_dpb_long_term_head = ps_dpb_head->ps_prev_dpb;
+    }
+
+    return 0;
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Add a ref field to short-term or long-term linked list.
+ *
+ * @par Description:
+ * This function adds a ref field to either short-term or long-term linked
+ * list. It picks up memory for the link from the array of dpb_info in
+ * dpb_mgr. The field is added to the beginning of the linked list and the
+ * head is set to the field.
+ *
+ * @param[in] ps_dpb_mgr
+ * Pointer to the DPB manager structure
+ *
+ * @param[in] ps_pic_buf
+ * Pic buf structure for the field being added.
+ *
+ * @param[in] reference_type
+ * This is used to select between short-term and long-term linked list.
+ *
+ * @param[in] frame_num
+ * frame_num for the field.
+ *
+ * @param[in] long_term_frame_idx
+ * If the ref being added is long-term, long_term_frame_idx of the field.
+ * Otherwise invalid.
+ *
+ * @returns
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+WORD32 ih264_dpb_mgr_insert_ref_field(dpb_mgr_t *ps_dpb_mgr,
+                                      pic_buf_t *ps_pic_buf,
+                                      WORD32 reference_type,
+                                      UWORD32 frame_num,
+                                      WORD32 long_term_frame_idx)
+{
+    WORD32 i;
+    dpb_info_t *ps_dpb_info;
+    dpb_info_t *ps_dpb_head;
+
+    ps_dpb_info = ps_dpb_mgr->as_dpb_info;
+
+    /* Return error if buffer is already present in the DPB */
+    for(i = 0; i < MAX_DPB_BUFS; i++)
+    {
+        if( (ps_dpb_info[i].ps_pic_buf == ps_pic_buf)
+                        && (ps_dpb_info[i].ps_pic_buf->i4_used_as_ref == reference_type) )
+        {
+            return (-1);
+        }
+    }
+
+    /* Find an unused DPB location */
+    for(i = 0; i < MAX_DPB_BUFS; i++)
+    {
+        if(NULL == ps_dpb_info[i].ps_pic_buf)
+        {
+            break;
+        }
+    }
+    /* All slots occupied: DPB is full */
+    if(i == MAX_DPB_BUFS)
+    {
+        return (-1);
+    }
+
+    /* Current head of the list the new field will be pushed onto */
+    ps_dpb_head = (reference_type == SHORT_TERM_REF)
+                    ?ps_dpb_mgr->ps_dpb_short_term_head
+                    :ps_dpb_mgr->ps_dpb_long_term_head;
+
+    /* long_term_frame_idx is meaningful only for long-term refs */
+    if(reference_type == SHORT_TERM_REF)
+        long_term_frame_idx = -1;
+
+    /* Create DPB info: link the new node in front of the old head */
+    ps_dpb_info[i].ps_pic_buf = ps_pic_buf;
+    ps_dpb_info[i].ps_prev_dpb = ps_dpb_head;
+    ps_dpb_info[i].ps_pic_buf->i4_used_as_ref = reference_type;
+    ps_dpb_info[i].ps_pic_buf->i4_frame_num = frame_num;
+    ps_dpb_info[i].ps_pic_buf->i4_long_term_frame_idx = long_term_frame_idx;
+
+    /* update the head node of linked list to point to the current picture */
+    if(reference_type == SHORT_TERM_REF)
+    {
+        ps_dpb_mgr->ps_dpb_short_term_head = ps_dpb_info + i;
+
+        /* Increment Short term buffer count */
+        ps_dpb_mgr->u1_num_short_term_ref_bufs++;
+
+    }
+    else
+    {
+        ps_dpb_mgr->ps_dpb_long_term_head = ps_dpb_info + i;
+
+        /* Increment Long term buffer count */
+        ps_dpb_mgr->u1_num_long_term_ref_bufs++;
+    }
+
+    return 0;
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Add a ref frame to short-term or long-term linked list.
+ *
+ * @par Description:
+ * This function adds a ref frame to either short-term or long-term linked
+ * list. Internally it calls add ref field twice to add top and bottom field.
+ *
+ * @param[in] ps_dpb_mgr
+ * Pointer to the DPB manager structure
+ *
+ * @param[in] ps_pic_buf
+ * Pic buf structure for the field being added.
+ *
+ * @param[in] reference_type
+ * This is used to select between short-term and long-term linked list.
+ *
+ * @param[in] frame_num
+ * frame_num for the field.
+ *
+ * @param[in] long_term_frame_idx
+ * If the ref being added is long-term, long_term_frame_idx of the field.
+ * Otherwise invalid.
+ *
+ * @returns
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+WORD32 ih264_dpb_mgr_insert_ref_frame(dpb_mgr_t *ps_dpb_mgr,
+                                  pic_buf_t *ps_pic_buf,
+                                  WORD32 reference_type,
+                                  UWORD32 frame_num,
+                                  WORD32 long_term_frame_idx)
+{
+    WORD32 ret;
+    pic_buf_t *ps_pic_bottom;
+
+    /*
+     * ps_pic_buf describes the top field of the frame; the matching bottom
+     * field picture is found in the manager's array via the same buf_id.
+     */
+    ps_pic_bottom = &ps_dpb_mgr->as_bottom_field_pics[ps_pic_buf->i4_buf_id];
+
+    /* Add the top field to the selected reference list */
+    ret = ih264_dpb_mgr_insert_ref_field(ps_dpb_mgr,
+                                         ps_pic_buf,
+                                         reference_type,
+                                         frame_num,
+                                         long_term_frame_idx);
+    if(ret != 0)
+        return ret;
+
+    /* Add the bottom field with identical attributes */
+    ret = ih264_dpb_mgr_insert_ref_field(ps_dpb_mgr,
+                                         ps_pic_bottom,
+                                         reference_type,
+                                         frame_num,
+                                         long_term_frame_idx);
+
+    return ret;
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Returns the number of ref frames in both the linked list.
+ *
+ * @par Description:
+ * Returns the count of number of frames, number of complementary field pairs
+ * and number of unpaired fields.
+ *
+ * @param[in] ps_dpb_mgr
+ * Pointer to the DPB manager structure
+ *
+ * @param[in] curr_frame_num
+ * frame_num for the field.
+ *
+ * @param[in] max_frame_num
+ * Maximum frame_num allowed
+ *
+ * @returns
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+WORD32 ih264_dpb_mgr_count_ref_frames(dpb_mgr_t *ps_dpb_mgr,
+                                      WORD32 curr_frame_num,
+                                      WORD32 max_frame_num)
+{
+    WORD32 num_short_term = 0;
+    WORD32 num_long_term = 0;
+    WORD32 frame_num;
+    WORD32 prev_frame_num = 0;
+    dpb_info_t *ps_node;
+
+    /*
+     * Count short-term frames / complementary field pairs / unpaired
+     * fields.  After sorting, the two fields of a frame are adjacent, so
+     * each distinct frame_num run contributes exactly one to the count.
+     */
+    if(ps_dpb_mgr->ps_dpb_short_term_head != NULL)
+    {
+        /* Sort the short-term list by frame_num */
+        ih264_dpb_mgr_sort_short_term_fields_by_frame_num(ps_dpb_mgr,
+                                                          curr_frame_num,
+                                                          TOP_FIELD,
+                                                          max_frame_num);
+
+        for(ps_node = ps_dpb_mgr->ps_dpb_short_term_head; ps_node != NULL;
+                        ps_node = ps_node->ps_prev_dpb)
+        {
+            frame_num = ps_node->ps_pic_buf->i4_frame_num;
+            /* Head always counts; later nodes only on a frame_num change */
+            if((ps_node == ps_dpb_mgr->ps_dpb_short_term_head)
+                            || (frame_num != prev_frame_num))
+            {
+                num_short_term++;
+            }
+            prev_frame_num = frame_num;
+        }
+    }
+
+    /*
+     * Count long-term frames / complementary field pairs / unpaired
+     * fields in the same manner.
+     */
+    if(ps_dpb_mgr->ps_dpb_long_term_head != NULL)
+    {
+        ih264_dpb_mgr_sort_long_term_fields_by_frame_idx(ps_dpb_mgr,
+                                                         TOP_FIELD);
+
+        for(ps_node = ps_dpb_mgr->ps_dpb_long_term_head; ps_node != NULL;
+                        ps_node = ps_node->ps_prev_dpb)
+        {
+            frame_num = ps_node->ps_pic_buf->i4_frame_num;
+            if((ps_node == ps_dpb_mgr->ps_dpb_long_term_head)
+                            || (frame_num != prev_frame_num))
+            {
+                num_long_term++;
+            }
+            prev_frame_num = frame_num;
+        }
+    }
+    return (num_short_term + num_long_term);
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Deletes the ref frame at the end of the linked list.
+ *
+ * @par Description:
+ * Deletes the ref frame at the end of the linked list. For unpaired fields,
+ * it deletes just the last node. For frame or complementary field pair, it
+ * deletes the last two nodes.
+ *
+ * @param[in] ps_dpb_mgr
+ * Pointer to the DPB manager structure
+ *
+ * @param[in] reference_type
+ * This is used to select between short-term and long-term linked list.
+ *
+ * @returns
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+WORD32 ih264_dpb_mgr_delete_ref_frame(dpb_mgr_t *ps_dpb_mgr,
+                                      WORD32 reference_type)
+{
+    dpb_info_t *ps_dpb_node1;
+    dpb_info_t *ps_dpb_node2;
+    dpb_info_t *ps_dpb_node3;
+
+    /*
+     * Assumption: the nodes are sorted by frame_num, so the two fields of
+     * one frame (equal frame_num) occupy adjacent nodes in the list.
+     */
+
+
+    /* Select between the short-term and the long-term list. */
+    ps_dpb_node1 = (reference_type == SHORT_TERM_REF)
+                        ?ps_dpb_mgr->ps_dpb_short_term_head
+                        :ps_dpb_mgr->ps_dpb_long_term_head;
+    /* If null, no entries in the list. Hence return. */
+    if(ps_dpb_node1 == NULL)
+        return 0;
+
+    /* If only one node in the list, set it as unused for reference and return. */
+    if(ps_dpb_node1->ps_prev_dpb == NULL)
+    {
+        /* Set the picture as unused for reference */
+        ps_dpb_node1->ps_pic_buf->i4_used_as_ref = UNUSED_FOR_REF;
+        ps_dpb_node1->ps_pic_buf = NULL;
+
+        if(reference_type == SHORT_TERM_REF)
+        {
+            ps_dpb_mgr->ps_dpb_short_term_head = NULL;
+
+            /* Reset short term buffer count */
+            ps_dpb_mgr->u1_num_short_term_ref_bufs = 0;
+
+        }
+        else
+        {
+            ps_dpb_mgr->ps_dpb_long_term_head = NULL;
+
+            /* Reset long term buffer count */
+            ps_dpb_mgr->u1_num_long_term_ref_bufs = 0;
+
+        }
+        return 0;
+    }
+
+    /**
+     * If there are only 2 nodes in the list, set second node as unused for reference.
+     * If the frame_num of second node and first node is same, set first node also as
+     * unused for reference and set the corresponding head to NULL.
+     */
+    ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb;
+    if(ps_dpb_node2->ps_prev_dpb == NULL)
+    {
+        /* Both nodes are fields of the same frame: drop the head too */
+        if(ps_dpb_node2->ps_pic_buf->i4_frame_num == ps_dpb_node1->ps_pic_buf->i4_frame_num)
+        {
+            /* Set the picture as unused for reference */
+            ps_dpb_node1->ps_pic_buf->i4_used_as_ref = UNUSED_FOR_REF;
+            ps_dpb_node1->ps_pic_buf = NULL;
+            if(reference_type == SHORT_TERM_REF)
+            {
+                ps_dpb_mgr->ps_dpb_short_term_head = NULL;
+
+                /* Reset short term buffer count */
+                ps_dpb_mgr->u1_num_short_term_ref_bufs = 0;
+
+            }
+            else
+            {
+                ps_dpb_mgr->ps_dpb_long_term_head = NULL;
+
+                /* Reset long term buffer count */
+                ps_dpb_mgr->u1_num_long_term_ref_bufs = 0;
+
+            }
+
+        }
+        ps_dpb_node2->ps_pic_buf->i4_used_as_ref = UNUSED_FOR_REF;
+        ps_dpb_node2->ps_pic_buf = NULL;
+        ps_dpb_node1->ps_prev_dpb = NULL;
+        return 0;
+    }
+    /*
+     * If there are more than 2 nodes, run a loop to get the last 3 nodes.
+     */
+    ps_dpb_node3 = ps_dpb_node2->ps_prev_dpb;
+    while(ps_dpb_node3->ps_prev_dpb != NULL)
+    {
+        ps_dpb_node1 = ps_dpb_node2;
+        ps_dpb_node2 = ps_dpb_node3;
+        ps_dpb_node3 = ps_dpb_node3->ps_prev_dpb;
+    }
+    /*
+     * If node 2 and node 3 frame_nums are same (paired fields), set node 2
+     * also as unused for reference and unlink it from node 1.
+     */
+    if(ps_dpb_node2->ps_pic_buf->i4_frame_num == ps_dpb_node3->ps_pic_buf->i4_frame_num)
+    {
+        ps_dpb_node2->ps_pic_buf->i4_used_as_ref = UNUSED_FOR_REF;
+        ps_dpb_node2->ps_pic_buf = NULL;
+        ps_dpb_node1->ps_prev_dpb = NULL;
+
+    }
+    /* Set the third (tail) node as unused for reference */
+    ps_dpb_node3->ps_pic_buf->i4_used_as_ref = UNUSED_FOR_REF;
+    ps_dpb_node3->ps_pic_buf = NULL;
+    ps_dpb_node2->ps_prev_dpb = NULL;
+
+    return 0;
+}
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Delete long-term ref fields above max frame idx.
+ *
+ * @par Description:
+ * Deletes all the long-term ref fields having idx greater than max_frame_idx
+ *
+ * @param[in] ps_dpb_mgr
+ * Pointer to the DPB manager structure
+ *
+ * @param[in] max_frame_idx
+ * Max long-term frame idx allowed.
+ *
+ * @returns
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+WORD32 ih264_dpb_mgr_delete_long_ref_fields_max_frame_idx(dpb_mgr_t *ps_dpb_mgr,
+                                                          WORD32 max_frame_idx)
+{
+    dpb_info_t *ps_dpb_node1;
+    dpb_info_t *ps_dpb_node2;
+    /*
+     * Drop nodes from the head until the first node that must be kept
+     * (i4_long_term_frame_idx <= max_frame_idx) is encountered.
+     */
+    while(ps_dpb_mgr->ps_dpb_long_term_head != NULL)
+    {
+        if(ps_dpb_mgr->ps_dpb_long_term_head->ps_pic_buf->i4_long_term_frame_idx
+            <= max_frame_idx)
+        {
+            break;
+        }
+        ps_dpb_mgr->ps_dpb_long_term_head->ps_pic_buf->i4_used_as_ref = UNUSED_FOR_REF;
+        ps_dpb_mgr->ps_dpb_long_term_head->ps_pic_buf = NULL;
+        ps_dpb_mgr->ps_dpb_long_term_head = ps_dpb_mgr->ps_dpb_long_term_head->ps_prev_dpb;
+    }
+
+    ps_dpb_node1 = ps_dpb_mgr->ps_dpb_long_term_head;
+    if(ps_dpb_node1 == NULL)
+        return 0;
+    /*
+     * With the node that isn't to be deleted as head, walk node pairs
+     * (node1, node2 = node1->prev) until the end, unlinking every node2
+     * whose frame idx exceeds max_frame_idx.
+     */
+    ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb;
+    while(ps_dpb_node2 != NULL)
+    {
+        if(ps_dpb_node2->ps_pic_buf->i4_long_term_frame_idx > max_frame_idx)
+        {
+            /* Unlink node2; node1->ps_prev_dpb now bypasses it */
+            ps_dpb_node2->ps_pic_buf->i4_used_as_ref = UNUSED_FOR_REF;
+            ps_dpb_node2->ps_pic_buf = NULL;
+            ps_dpb_node1->ps_prev_dpb = ps_dpb_node2->ps_prev_dpb;
+        }
+        /* Advance: if node2 was unlinked this steps onto its successor */
+        ps_dpb_node1 = ps_dpb_node1->ps_prev_dpb;
+        if(ps_dpb_node1 == NULL)
+            break;
+        ps_dpb_node2 = ps_dpb_node1->ps_prev_dpb;
+    }
+    return 0;
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Deletes the short-term with least frame_num
+ *
+ * @par Description:
+ * Deletes the short-term reference with the least frame_num. It first sorts
+ * the short-term linked list by frame_num and then calls the function that
+ * deletes the last frame in the linked list.
+ *
+ * @param[in] ps_dpb_mgr
+ * Pointer to the DPB manager structure
+ *
+ * @param[in] curr_frame_num
+ * frame_num of the current pic
+ *
+ * @param[in] max_frame_num
+ * Maximum frame_num allowed
+ *
+ * @returns
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+WORD32 ih264_dpb_mgr_delete_short_ref_frame(dpb_mgr_t *ps_dpb_mgr,
+                                            WORD32 curr_frame_num,
+                                            WORD32 max_frame_num)
+{
+    WORD32 ret;
+    /* Sort the short-term list by frame_num so that the picture with the
+     * least frame_num ends up at the tail of the list. */
+    ret = ih264_dpb_mgr_sort_short_term_fields_by_frame_num(ps_dpb_mgr,
+                                                            curr_frame_num,
+                                                            TOP_FIELD,
+                                                            max_frame_num);
+    /* Bug fix: the sort status used to be silently overwritten below.
+     * Propagate a sort failure instead of deleting from an unsorted list. */
+    if(ret != 0)
+    {
+        ASSERT(0);
+        return ret;
+    }
+
+    /* Delete the last (least frame_num) reference frame or field */
+    ret = ih264_dpb_mgr_delete_ref_frame(ps_dpb_mgr,SHORT_TERM_REF);
+
+    if(ret != 0)
+    {
+        ASSERT(0);
+    }
+
+    return ret;
+}
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Deletes all the ref frames.
+ *
+ * @par Description:
+ * Deletes all of the ref frames/fields in the short-term and long-term linked
+ * list.
+ *
+ * @param[in] ps_dpb_mgr
+ * Pointer to the DPB manager structure
+ *
+ * @returns
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+WORD32 ih264_dpb_mgr_delete_all_ref_frames(dpb_mgr_t *ps_dpb_mgr)
+{
+    /* Repeatedly delete from the tail of each list until it is empty */
+    while(NULL != ps_dpb_mgr->ps_dpb_short_term_head)
+    {
+        ih264_dpb_mgr_delete_ref_frame(ps_dpb_mgr, SHORT_TERM_REF);
+    }
+
+    while(NULL != ps_dpb_mgr->ps_dpb_long_term_head)
+    {
+        ih264_dpb_mgr_delete_ref_frame(ps_dpb_mgr, LONG_TERM_REF);
+    }
+    return 0;
+}
+
+
+/* Resets the DPB manager: marks every referenced picture as unused,
+ * releases its physical buffer back to ps_buf_mgr and clears both lists. */
+void ih264_dpb_mgr_reset(dpb_mgr_t *ps_dpb_mgr, buf_mgr_t *ps_buf_mgr)
+{
+    WORD32 i;
+    dpb_info_t *ps_dpb_info;
+    /* NOTE(review): ASSERT(0) was in the original -- this path appears not
+     * to be expected to run yet; confirm before removing. */
+    ASSERT(0);
+
+
+    ps_dpb_info = ps_dpb_mgr->as_dpb_info;
+
+    for(i = 0; i < MAX_DPB_BUFS; i++)
+    {
+        /* Bug fix: unused dpb_info entries hold a NULL ps_pic_buf (see the
+         * free-slot scan in ih264_dpb_mgr_insert_ref_field); guard against
+         * dereferencing them. */
+        if((ps_dpb_info[i].ps_pic_buf != NULL)
+                        && (ps_dpb_info[i].ps_pic_buf->i4_used_as_ref))
+        {
+            ps_dpb_info[i].ps_pic_buf->i4_used_as_ref = UNUSED_FOR_REF;
+            ps_dpb_info[i].ps_prev_dpb = NULL;
+            /* Release the physical buffer's reference hold */
+            ih264_buf_mgr_release(ps_buf_mgr, ps_dpb_info[i].ps_pic_buf->i4_buf_id,
+                                  BUF_MGR_REF);
+
+            ps_dpb_info[i].ps_pic_buf = NULL;
+        }
+    }
+    ps_dpb_mgr->u1_num_short_term_ref_bufs = 0;
+    ps_dpb_mgr->u1_num_long_term_ref_bufs = 0;
+    ps_dpb_mgr->ps_dpb_short_term_head = NULL;
+    ps_dpb_mgr->ps_dpb_long_term_head = NULL;
+
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * deletes all pictures from DPB
+ *
+ * @par Description:
+ * Deletes all pictures present in the DPB manager
+ *
+ * @param[in] ps_buf_mgr
+ * Pointer to buffer manager structure
+ *
+ * @param[in] u1_disp_bufs
+ * Number of buffers to be deleted
+ *
+ * @returns
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+/* Releases the reference hold on the first u1_disp_bufs buffers that the
+ * buffer manager still reports as in use. */
+void ih264_dpb_mgr_release_pics(buf_mgr_t *ps_buf_mgr, UWORD8 u1_disp_bufs)
+{
+    /* Bug fix: the index was a WORD8; compared against a UWORD8 count
+     * greater than 127 it could never terminate (signed char overflow is
+     * undefined behavior).  A WORD32 index is always safe. */
+    WORD32 i;
+    UWORD32 buf_status;
+    /* NOTE(review): ASSERT(0) was in the original -- this path appears not
+     * to be expected to run yet; confirm before removing. */
+    ASSERT(0);
+
+    for(i = 0; i < u1_disp_bufs; i++)
+    {
+        buf_status = ih264_buf_mgr_get_status(ps_buf_mgr, i);
+        if(0 != buf_status)
+        {
+            /* Drop the reference hold on this buffer */
+            ih264_buf_mgr_release(ps_buf_mgr, i, BUF_MGR_REF);
+        }
+    }
+}
diff --git a/common/ih264_dpb_mgr.h b/common/ih264_dpb_mgr.h
new file mode 100755
index 0000000..b0cf0fd
--- /dev/null
+++ b/common/ih264_dpb_mgr.h
@@ -0,0 +1,186 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+ *******************************************************************************
+ * @file
+ * ih264_dpb_mgr.h
+ *
+ * @brief
+ * Function declarations used for decoded picture buffer management
+ *
+ * @author
+ * Srinivas T
+ *
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#ifndef _IH264_DPB_MGR_H_
+#define _IH264_DPB_MGR_H_
+
+/* Temporary definitions. Have to be defined later */
+
+#define MAX_DPB_BUFS (MAX_DPB_SIZE * 4)
+
+#define MARK_ST_PICNUM_AS_NONREF 1
+#define MARK_LT_INDEX_AS_NONREF 2
+#define MARK_ST_PICNUM_AS_LT_INDEX 3
+#define RESET_REF_PICTURES 5
+
+typedef struct dpb_info_t dpb_info_t;
+
+/* Reference status values stored in pic_buf_t::i4_used_as_ref */
+enum
+{
+    INVALID = -1,
+    UNUSED_FOR_REF = 0 ,
+    LONG_TERM_REF ,
+    SHORT_TERM_REF ,
+};
+/* Node of a singly linked reference picture list */
+struct dpb_info_t
+{
+    /**
+     * Pointer to picture buffer structure
+     */
+    pic_buf_t *ps_pic_buf;
+
+    /**
+     * Link to the previous DPB node in the list (NULL at the tail)
+     */
+    dpb_info_t *ps_prev_dpb;
+
+};
+
+typedef struct
+{
+    /**
+     * Head of the short-term reference linked list
+     * (most recently inserted picture first)
+     */
+    dpb_info_t *ps_dpb_short_term_head;
+
+    /**
+     * Head of the long-term reference linked list
+     * (most recently inserted picture first)
+     */
+    dpb_info_t *ps_dpb_long_term_head;
+
+    /**
+     * Physical storage for dpbInfo for ref bufs
+     */
+    dpb_info_t as_dpb_info[MAX_DPB_BUFS];
+
+    /**
+     * Array of structures for top field.
+     */
+    pic_buf_t as_top_field_pics[MAX_DPB_BUFS];
+
+    /**
+     * Array of structures for bottom field.
+     */
+    pic_buf_t as_bottom_field_pics[MAX_DPB_BUFS];
+
+    /**
+     * Number of short-term reference buffers
+     */
+    UWORD8 u1_num_short_term_ref_bufs;
+
+    /**
+     * Number of long-term reference buffers
+     */
+    UWORD8 u1_num_long_term_ref_bufs;
+
+    /**
+     * buffer ID of the current frame
+     */
+    WORD32 i4_cur_frame_buf_id;
+
+} dpb_mgr_t;
+
+void ih264_dpb_mgr_init(dpb_mgr_t *ps_dpb_mgr);
+
+WORD32 ih264_dpb_mgr_insert_ref_frame(dpb_mgr_t *ps_dpb_mgr,
+ pic_buf_t *ps_pic_buf,
+ WORD32 reference_type,
+ UWORD32 frame_num,
+ WORD32 long_term_frame_idx);
+
+WORD32 ih264_dpb_mgr_delete_ref_frame(dpb_mgr_t *ps_dpb_mgr,
+ WORD32 reference_type);
+
+WORD32 ih264_dpb_mgr_delete_all_ref_frames(dpb_mgr_t *ps_dpb_mgr);
+
+WORD32 ih264_dpb_mgr_count_ref_frames(dpb_mgr_t *ps_dpb_mgr,
+ WORD32 curr_frame_num,
+ WORD32 max_frame_num);
+
+WORD32 ih264_dpb_mgr_delete_short_ref_frame(dpb_mgr_t *ps_dpb_mgr,
+ WORD32 curr_frame_num,
+ WORD32 max_frame_num);
+
+WORD32 ih264_dpb_mgr_insert_ref_field(dpb_mgr_t *ps_dpb_mgr,
+ pic_buf_t *ps_pic_buf,
+ WORD32 reference_type,
+ UWORD32 frame_num,
+ WORD32 long_term_frame_idx);
+
+WORD32 ih264_dpb_mgr_delete_ref_field(dpb_mgr_t *ps_dpb_mgr,
+ WORD32 reference_type);
+
+WORD32 ih264_dpb_mgr_alternate_ref_fields(dpb_mgr_t *ps_dpb_mgr,
+ WORD32 reference_type,
+ WORD32 first_field_type);
+
+WORD32 ih264_dpb_mgr_sort_short_term_fields_by_frame_num(dpb_mgr_t *ps_dpb_mgr,
+ WORD32 curr_frame_num,
+ WORD32 first_field_type,
+ WORD32 max_frame_num);
+
+WORD32 ih264_dpb_mgr_sort_short_term_fields_by_poc_l0(dpb_mgr_t *ps_dpb_mgr,
+ WORD32 curr_poc,
+ WORD32 first_field_type);
+
+WORD32 ih264_dpb_mgr_sort_short_term_fields_by_poc_l1(dpb_mgr_t *ps_dpb_mgr,
+ WORD32 curr_poc,
+ WORD32 first_field_type);
+
+WORD32 ih264_dpb_mgr_sort_long_term_fields_by_frame_idx(dpb_mgr_t *ps_dpb_mgr,
+ WORD32 first_field_type);
+
+WORD32 ih264_dpb_mgr_delete_long_ref_fields_max_frame_idx(dpb_mgr_t *ps_dpb_mgr,
+ WORD32 max_frame_idx);
+
+void ih264_dpb_mgr_del_ref(dpb_mgr_t *ps_dpb_mgr,
+ buf_mgr_t *ps_buf_mgr,
+ WORD32 u4_abs_poc);
+
+pic_buf_t *ih264_dpb_mgr_get_ref_by_nearest_poc(dpb_mgr_t *ps_dpb_mgr,
+ WORD32 cur_abs_poc);
+
+pic_buf_t *ih264_dpb_mgr_get_ref_by_poc(dpb_mgr_t *ps_dpb_mgr, WORD32 abs_poc);
+
+pic_buf_t *ih264_dpb_mgr_get_ref_by_poc_lsb(dpb_mgr_t *ps_dpb_mgr,
+ WORD32 poc_lsb);
+
+void ih264_dpb_mgr_reset(dpb_mgr_t *ps_dpb_mgr, buf_mgr_t *ps_buf_mgr);
+
+void ih264_dpb_mgr_release_pics(buf_mgr_t *ps_buf_mgr, UWORD8 u1_disp_bufs);
+
+#endif /* _IH264_DPB_MGR_H_ */
diff --git a/common/ih264_error.h b/common/ih264_error.h
new file mode 100755
index 0000000..ff1662d
--- /dev/null
+++ b/common/ih264_error.h
@@ -0,0 +1,68 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_error.h
+*
+* @brief
+* Definitions related to error handling for common modules
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IH264_ERROR_H_
+#define _IH264_ERROR_H_
+
+/**
+ * Enumerations for error codes used in the codec.
+ * Not all these are expected to be returned to the application.
+ * Only select few will be exported
+ */
+typedef enum
+{
+    /**
+     * No error
+     */
+    IH264_SUCCESS = 0,
+    /**
+     * Start error code for decoder errors (decoder codes begin at 0x100)
+     */
+    IH264_DEC_ERROR_START = 0x100,
+
+    /**
+     * Start error code for encoder errors (encoder codes begin at 0x200)
+     */
+    IH264_ENC_ERROR_START = 0x200,
+    /**
+     * Generic failure
+     */
+    IH264_FAIL = 0x7FFFFFFF
+}IH264_ERROR_T;
+
+#endif /* _IH264_ERROR_H_ */
diff --git a/common/ih264_ihadamard_scaling.c b/common/ih264_ihadamard_scaling.c
new file mode 100755
index 0000000..e4729c8
--- /dev/null
+++ b/common/ih264_ihadamard_scaling.c
@@ -0,0 +1,216 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_ihadamard_scaling.c
+ *
+ * @brief
+ * Contains definition of functions for h264 inverse hadamard 4x4 transform and scaling
+ *
+ * @author
+ * Mohit
+ *
+ * @par List of Functions:
+ * - ih264_ihadamard_scaling_4x4()
+ *
+ * @remarks
+ *
+ *******************************************************************************
+ */
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* User include files */
+#include "ih264_typedefs.h"
+#include "ih264_defs.h"
+#include "ih264_trans_macros.h"
+#include "ih264_macros.h"
+#include "ih264_trans_data.h"
+#include "ih264_size_defs.h"
+#include "ih264_structs.h"
+#include "ih264_trans_quant_itrans_iquant.h"
+
+/*
+ ********************************************************************************
+ *
+ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC
+ * coefficients of a 16x16 intra prediction macroblock, and then performs
+ * scaling on the inverse transformed coefficients.
+ *
+ * @par Description:
+ * The DC coefficients pass through a 2-stage inverse hadamard transform.
+ * This inverse transformed content is scaled to based on Qp value.
+ *
+ * @param[in] pi2_src
+ * input 4x4 block of DC coefficients
+ *
+ * @param[out] pi2_out
+ * output 4x4 block
+ *
+ * @param[in] pu2_iscal_mat
+ * pointer to scaling list
+ *
+ * @param[in] pu2_weigh_mat
+ * pointer to weight matrix
+ *
+ * @param[in] u4_qp_div_6
+ * Floor (qp/6)
+ *
+ * @param[in] pi4_tmp
+ * temporary buffer of size 1*16
+ *
+ * @returns none
+ *
+ * @remarks none
+ *
+ *******************************************************************************
+ */
+void ih264_ihadamard_scaling_4x4(WORD16* pi2_src,
+                                 WORD16* pi2_out,
+                                 const UWORD16 *pu2_iscal_mat,
+                                 const UWORD16 *pu2_weigh_mat,
+                                 UWORD32 u4_qp_div_6,
+                                 WORD32* pi4_tmp)
+{
+    WORD32 i;
+    WORD32 a, b, c, d;
+    WORD32 sum0, sum1, diff0, diff1;
+    WORD32 rnd_fact = (u4_qp_div_6 < 6) ? (1 << (5 - u4_qp_div_6)) : 0;
+
+    /* Stage 1: horizontal butterfly, one row of DC terms per iteration */
+    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
+    {
+        a = pi2_src[i * SUB_BLK_WIDTH_4x4 + 0];
+        b = pi2_src[i * SUB_BLK_WIDTH_4x4 + 1];
+        c = pi2_src[i * SUB_BLK_WIDTH_4x4 + 2];
+        d = pi2_src[i * SUB_BLK_WIDTH_4x4 + 3];
+
+        sum0  = a + d;
+        sum1  = b + c;
+        diff0 = b - c;
+        diff1 = a - d;
+
+        pi4_tmp[i * SUB_BLK_WIDTH_4x4 + 0] = sum0 + sum1;
+        pi4_tmp[i * SUB_BLK_WIDTH_4x4 + 1] = diff0 + diff1;
+        pi4_tmp[i * SUB_BLK_WIDTH_4x4 + 2] = sum0 - sum1;
+        pi4_tmp[i * SUB_BLK_WIDTH_4x4 + 3] = diff1 - diff0;
+    }
+
+    /* Stage 2: vertical butterfly, one column per iteration */
+    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
+    {
+        a = pi4_tmp[i];
+        b = pi4_tmp[i + 4];
+        c = pi4_tmp[i + 8];
+        d = pi4_tmp[i + 12];
+
+        sum0  = a + d;
+        sum1  = b + c;
+        diff0 = b - c;
+        diff1 = a - d;
+
+        pi4_tmp[i]      = sum0 + sum1;
+        pi4_tmp[i + 4]  = diff0 + diff1;
+        pi4_tmp[i + 8]  = sum0 - sum1;
+        pi4_tmp[i + 12] = diff1 - diff0;
+    }
+
+    /* Stage 3: inverse quantize all 16 DC values and narrow to WORD16 */
+    for(i = 0; i < (SUB_BLK_WIDTH_4x4 * SUB_BLK_WIDTH_4x4); i++)
+    {
+        INV_QUANT(pi4_tmp[i], pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6,
+                  rnd_fact, 6);
+        pi2_out[i] = pi4_tmp[i];
+    }
+}
+
+void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
+                                    WORD16* pi2_out,
+                                    const UWORD16 *pu2_iscal_mat,
+                                    const UWORD16 *pu2_weigh_mat,
+                                    UWORD32 u4_qp_div_6,
+                                    WORD32* pi4_tmp)
+{
+    WORD32 plane;
+
+    UNUSED(pi4_tmp);
+
+    /* The 2x2 hadamard + scaling is identical for the U plane
+     * (coefficients 0..3) and the V plane (coefficients 4..7). */
+    for(plane = 0; plane < 2; plane++)
+    {
+        WORD16 *pi2_blk_src = pi2_src + (plane << 2);
+        WORD16 *pi2_blk_out = pi2_out + (plane << 2);
+        WORD32 i4_sum_top, i4_diff_top, i4_sum_bot, i4_diff_bot;
+        WORD32 i4_c0, i4_c1, i4_c2, i4_c3;
+
+        /* Row butterflies */
+        i4_sum_top  = pi2_blk_src[0] + pi2_blk_src[1];
+        i4_diff_top = pi2_blk_src[0] - pi2_blk_src[1];
+        i4_sum_bot  = pi2_blk_src[2] + pi2_blk_src[3];
+        i4_diff_bot = pi2_blk_src[2] - pi2_blk_src[3];
+
+        /* Column butterflies */
+        i4_c0 = i4_sum_top + i4_sum_bot;
+        i4_c1 = i4_diff_top + i4_diff_bot;
+        i4_c2 = i4_sum_top - i4_sum_bot;
+        i4_c3 = i4_diff_top - i4_diff_bot;
+
+        INV_QUANT(i4_c0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, 0, 5);
+        INV_QUANT(i4_c1, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, 0, 5);
+        INV_QUANT(i4_c2, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, 0, 5);
+        INV_QUANT(i4_c3, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, 0, 5);
+
+        pi2_blk_out[0] = i4_c0;
+        pi2_blk_out[1] = i4_c1;
+        pi2_blk_out[2] = i4_c2;
+        pi2_blk_out[3] = i4_c3;
+    }
+}
diff --git a/common/ih264_inter_pred_filters.c b/common/ih264_inter_pred_filters.c
new file mode 100755
index 0000000..7d1e407
--- /dev/null
+++ b/common/ih264_inter_pred_filters.c
@@ -0,0 +1,1042 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_inter_pred_filters.c
+ *
+ * @brief
+ * Contains function definitions for inter prediction interpolation filters
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ * - ih264_inter_pred_luma_copy
+ * - ih264_interleave_copy
+ * - ih264_inter_pred_luma_horz
+ * - ih264_inter_pred_luma_vert
+ * - ih264_inter_pred_luma_horz_hpel_vert_hpel
+ * - ih264_inter_pred_luma_horz_qpel
+ * - ih264_inter_pred_luma_vert_qpel
+ * - ih264_inter_pred_luma_horz_qpel_vert_qpel
+ * - ih264_inter_pred_luma_horz_hpel_vert_qpel
+ * - ih264_inter_pred_luma_horz_qpel_vert_hpel
+ * - ih264_inter_pred_luma_bilinear
+ * - ih264_inter_pred_chroma
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* User include files */
+#include "ih264_typedefs.h"
+#include "ih264_macros.h"
+#include "ih264_platform_macros.h"
+#include "ih264_inter_pred_filters.h"
+
+
+/*****************************************************************************/
+/* Constant Data variables */
+/*****************************************************************************/
+
+/* Coefficients for the symmetric 6-tap half-pel filter (1, -5, 20, 20, -5, 1);
+   only the three distinct values are stored since the taps mirror around the
+   interpolated position */
+const WORD32 ih264_g_six_tap[3] ={1,-5,20};
+
+
+/*****************************************************************************/
+/* Function definitions . */
+/*****************************************************************************/
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Interprediction luma function for copy
+ *
+ * @par Description:
+ * Copies the array of width 'wd' and height 'ht' from the location pointed
+ * by 'src' to the location pointed by 'dst'
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ *
+ * @param[in] ht
+ * integer height of the array
+ *
+ * @param[in] wd
+ * integer width of the array
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ih264_inter_pred_luma_copy(UWORD8 *pu1_src,
+                                UWORD8 *pu1_dst,
+                                WORD32 src_strd,
+                                WORD32 dst_strd,
+                                WORD32 ht,
+                                WORD32 wd,
+                                UWORD8* pu1_tmp,
+                                WORD32 dydx)
+{
+    WORD32 i4_rows_left = ht;
+    UNUSED(pu1_tmp);
+    UNUSED(dydx);
+
+    /* Full-pel position: plain 2-D byte copy, no filtering */
+    while(i4_rows_left--)
+    {
+        WORD32 i4_col;
+        for(i4_col = 0; i4_col < wd; i4_col++)
+        {
+            pu1_dst[i4_col] = pu1_src[i4_col];
+        }
+        pu1_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Function for copying to an interleaved destination
+ *
+ * @par Description:
+ * Copies the array of width 'wd' and height 'ht' from the location pointed
+ * by 'src' to the location pointed by 'dst'
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ht
+ * integer height of the array
+ *
+ * @param[in] wd
+ * integer width of the array
+ *
+ * @returns
+ *
+ * @remarks
+ * The alternate elements of src will be copied to alternate locations in dst
+ * Other locations are not touched
+ *
+ *******************************************************************************
+ */
+void ih264_interleave_copy(UWORD8 *pu1_src,
+                           UWORD8 *pu1_dst,
+                           WORD32 src_strd,
+                           WORD32 dst_strd,
+                           WORD32 ht,
+                           WORD32 wd)
+{
+    WORD32 i4_row, i4_col;
+    /* 'wd' counts interleaved samples; each occupies two bytes in the row */
+    WORD32 i4_row_bytes = wd * 2;
+
+    for(i4_row = 0; i4_row < ht; i4_row++)
+    {
+        /* Copy only the even-indexed bytes; the odd (other-plane) bytes of
+           the destination are left untouched */
+        for(i4_col = 0; i4_col < i4_row_bytes; i4_col += 2)
+        {
+            pu1_dst[i4_col] = pu1_src[i4_col];
+        }
+        pu1_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Interprediction luma filter for horizontal input
+ *
+ * @par Description:
+ * Applies a 6 tap horizontal filter .The output is clipped to 8 bits
+ * sec 8.4.2.2.1 titled "Luma sample interpolation process"
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ht
+ * integer height of the array
+ *
+ * @param[in] wd
+ * integer width of the array
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+void ih264_inter_pred_luma_horz(UWORD8 *pu1_src,
+                                UWORD8 *pu1_dst,
+                                WORD32 src_strd,
+                                WORD32 dst_strd,
+                                WORD32 ht,
+                                WORD32 wd,
+                                UWORD8* pu1_tmp,
+                                WORD32 dydx)
+{
+    WORD32 i4_row, i4_col;
+    UNUSED(pu1_tmp);
+    UNUSED(dydx);
+
+    for(i4_row = 0; i4_row < ht; i4_row++)
+    {
+        for(i4_col = 0; i4_col < wd; i4_col++)
+        {
+            WORD16 i2_sum;
+            /* Pair samples equidistant from the interpolated position and
+               weight with ih264_g_six_tap = {1, -5, 20} */
+            WORD32 i4_p0 = pu1_src[i4_col - 2] + pu1_src[i4_col + 3];
+            WORD32 i4_p1 = pu1_src[i4_col - 1] + pu1_src[i4_col + 2];
+            WORD32 i4_p2 = pu1_src[i4_col] + pu1_src[i4_col + 1];
+
+            i2_sum = ih264_g_six_tap[0] * i4_p0
+                   + ih264_g_six_tap[1] * i4_p1
+                   + ih264_g_six_tap[2] * i4_p2;
+            /* Round, scale down by 32 and clip to [0, 255] */
+            i2_sum = (i2_sum + 16) >> 5;
+            pu1_dst[i4_col] = CLIP_U8(i2_sum);
+        }
+        pu1_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Interprediction luma filter for vertical input
+ *
+ * @par Description:
+ * Applies a 6 tap vertical filter.The output is clipped to 8 bits
+ * sec 8.4.2.2.1 titled "Luma sample interpolation process"
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ht
+ * integer height of the array
+ *
+ * @param[in] wd
+ * integer width of the array
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+void ih264_inter_pred_luma_vert(UWORD8 *pu1_src,
+                                UWORD8 *pu1_dst,
+                                WORD32 src_strd,
+                                WORD32 dst_strd,
+                                WORD32 ht,
+                                WORD32 wd,
+                                UWORD8* pu1_tmp,
+                                WORD32 dydx)
+{
+    WORD32 i4_row, i4_col;
+    /* Precomputed row offsets for the vertical taps */
+    WORD32 i4_off1 = src_strd;
+    WORD32 i4_off2 = src_strd * 2;
+    WORD32 i4_off3 = src_strd * 3;
+    UNUSED(pu1_tmp);
+    UNUSED(dydx);
+
+    for(i4_row = 0; i4_row < ht; i4_row++)
+    {
+        for(i4_col = 0; i4_col < wd; i4_col++)
+        {
+            WORD16 i2_sum;
+            /* Symmetric 6-tap filter down the column, weighted with
+               ih264_g_six_tap = {1, -5, 20} */
+            WORD32 i4_p0 = pu1_src[i4_col - i4_off2] + pu1_src[i4_col + i4_off3];
+            WORD32 i4_p1 = pu1_src[i4_col - i4_off1] + pu1_src[i4_col + i4_off2];
+            WORD32 i4_p2 = pu1_src[i4_col] + pu1_src[i4_col + i4_off1];
+
+            i2_sum = ih264_g_six_tap[0] * i4_p0
+                   + ih264_g_six_tap[1] * i4_p1
+                   + ih264_g_six_tap[2] * i4_p2;
+            /* Round, scale down by 32 and clip to [0, 255] */
+            i2_sum = (i2_sum + 16) >> 5;
+            pu1_dst[i4_col] = CLIP_U8(i2_sum);
+        }
+        pu1_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+}
+
+/*!
+ **************************************************************************
+ * \if Function name : ih264_inter_pred_luma_horz_hpel_vert_hpel \endif
+ *
+ * \brief
+ * This function implements a two stage cascaded six tap filter. It
+ * applies the six tap filter in the horizontal direction on the
+ * predictor values, followed by applying the same filter in the
+ * vertical direction on the output of the first stage. The six tap
+ * filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
+ * interpolation process"
+ *
+ * \param pu1_src: Pointer to the buffer containing the predictor values.
+ * pu1_src could point to the frame buffer or the predictor buffer.
+ * \param pu1_dst: Pointer to the destination buffer where the output of
+ * the six tap filter is stored.
+ * \param ht: Height of the rectangular pixel grid to be interpolated
+ * \param wd: Width of the rectangular pixel grid to be interpolated
+ * \param src_strd: Width of the buffer pointed to by pu1_src.
+ * \param dst_strd: Width of the destination buffer
+ * \param pu1_tmp: temporary buffer.
+ * \param dydx: x and y reference offset for qpel calculations: UNUSED in this function.
+ *
+ * \return
+ * None.
+ *
+ * \note
+ * This function takes the 8 bit predictor values, applies the six tap
+ * filter in the horizontal direction and outputs the result clipped to
+ * 8 bit precision. The input is stored in the buffer pointed to by
+ * pu1_src while the output is stored in the buffer pointed by pu1_dst.
+ * Both pu1_src and pu1_dst could point to the same buffer i.e. the
+ * six tap filter could be done in place.
+ *
+ **************************************************************************
+ */
+void ih264_inter_pred_luma_horz_hpel_vert_hpel(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ht,
+ WORD32 wd,
+ UWORD8* pu1_tmp,
+ WORD32 dydx)
+{
+ WORD32 row, col;
+ WORD32 tmp;
+ WORD16* pi2_pred1_temp;
+ WORD16* pi2_pred1;
+ UNUSED(dydx);
+ /* pu1_tmp holds the 16-bit intermediates of the first (vertical) pass;
+ skip 2 columns so the second pass can read its taps at [col - 2] */
+ pi2_pred1_temp = (WORD16*)pu1_tmp;
+ pi2_pred1_temp += 2;
+ pi2_pred1 = pi2_pred1_temp;
+ /* Stage 1: vertical 6-tap filter over (wd + 5) columns per row (2 extra
+ on the left, 3 on the right), kept at full 16-bit precision -- the
+ combined rounding/scaling of both passes happens after stage 2 */
+ for(row = 0; row < ht; row++)
+ {
+ for(col = -2; col < wd + 3; col++)
+ {
+ tmp = 0;/*ih264_g_six_tap[] is the array containing the filter coeffs*/
+ tmp = ih264_g_six_tap[0] *
+ (pu1_src[col - 2 * src_strd] + pu1_src[col + 3 * src_strd])
+ + ih264_g_six_tap[1] *
+ (pu1_src[col - 1 * src_strd] + pu1_src[col + 2 * src_strd])
+ + ih264_g_six_tap[2] *
+ (pu1_src[col] + pu1_src[col + 1 * src_strd]);
+ pi2_pred1_temp[col] = tmp;
+ }
+ pu1_src += src_strd;
+ /* Each intermediate row is (wd + 5) samples wide */
+ pi2_pred1_temp = pi2_pred1_temp + wd + 5;
+ }
+
+ /* Stage 2: horizontal 6-tap on the 16-bit intermediates; (tmp + 512) >> 10
+ applies the rounding and 1/1024 scaling of the cascaded filters */
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < wd; col++)
+ {
+ tmp = 0;/*ih264_g_six_tap[] is the array containing the filter coeffs*/
+ tmp = ih264_g_six_tap[0] *
+ (pi2_pred1[col - 2] + pi2_pred1[col + 3])
+ + ih264_g_six_tap[1] *
+ (pi2_pred1[col - 1] + pi2_pred1[col + 2])
+ + ih264_g_six_tap[2] * (pi2_pred1[col] + pi2_pred1[col + 1]);
+ tmp = (tmp + 512) >> 10;
+ pu1_dst[col] = CLIP_U8(tmp);
+ }
+ pi2_pred1 += (wd + 5);
+ pu1_dst += dst_strd;
+ }
+}
+
+/*!
+ **************************************************************************
+ * \if Function name : ih264_inter_pred_luma_horz_qpel \endif
+ *
+ * \brief
+ * This routine applies the six tap filter to the predictors in the
+ * horizontal direction. The six tap filtering operation is described in
+ * sec 8.4.2.2.1 titled "Luma sample interpolation process"
+ *
+ * \param pu1_src: Pointer to the buffer containing the predictor values.
+ * pu1_src could point to the frame buffer or the predictor buffer.
+ * \param pu1_dst: Pointer to the destination buffer where the output of
+ * the six tap filter is stored.
+ * \param ht: Height of the rectangular pixel grid to be interpolated
+ * \param wd: Width of the rectangular pixel grid to be interpolated
+ * \param src_strd: Width of the buffer pointed to by pu1_src.
+ * \param dst_strd: Width of the destination buffer
+ * \param pu1_tmp: temporary buffer: UNUSED in this function
+ * \param dydx: x and y reference offset for qpel calculations.
+ *
+ * \return
+ * None.
+ *
+ * \note
+ * This function takes the 8 bit predictor values, applies the six tap
+ * filter in the horizontal direction and outputs the result clipped to
+ * 8 bit precision. The input is stored in the buffer pointed to by
+ * pu1_src while the output is stored in the buffer pointed by pu1_dst.
+ * Both pu1_src and pu1_dst could point to the same buffer i.e. the
+ * six tap filter could be done in place.
+ *
+ **************************************************************************
+ */
+void ih264_inter_pred_luma_horz_qpel(UWORD8 *pu1_src,
+                                     UWORD8 *pu1_dst,
+                                     WORD32 src_strd,
+                                     WORD32 dst_strd,
+                                     WORD32 ht,
+                                     WORD32 wd,
+                                     UWORD8* pu1_tmp,
+                                     WORD32 dydx)
+{
+    WORD32 i4_row, i4_col;
+    UWORD8 *pu1_fpel;
+    /* Low two bits of dydx give the horizontal quarter-pel phase;
+       x_offset >> 1 selects the full-pel neighbour used for averaging */
+    WORD32 i4_x_offset = dydx & 0x3;
+    UNUSED(pu1_tmp);
+    pu1_fpel = pu1_src + (i4_x_offset >> 1);
+
+    for(i4_row = 0; i4_row < ht; i4_row++)
+    {
+        for(i4_col = 0; i4_col < wd; i4_col++)
+        {
+            WORD16 i2_hpel;
+            /* Horizontal half-pel sample: 6-tap (1,-5,20,20,-5,1) filter */
+            i2_hpel = pu1_src[i4_col - 2] + pu1_src[i4_col + 3]
+                    - 5 * (pu1_src[i4_col - 1] + pu1_src[i4_col + 2])
+                    + 20 * (pu1_src[i4_col] + pu1_src[i4_col + 1]);
+            i2_hpel = (i2_hpel + 16) >> 5;
+            i2_hpel = CLIP_U8(i2_hpel);
+            /* Quarter-pel: rounded average of the half-pel value and the
+               nearest full-pel sample */
+            pu1_dst[i4_col] = (i2_hpel + pu1_fpel[i4_col] + 1) >> 1;
+        }
+        pu1_dst += dst_strd;
+        pu1_src += src_strd;
+        pu1_fpel += src_strd;
+    }
+}
+
+/*!
+ **************************************************************************
+ * \if Function name : ih264_inter_pred_luma_vert_qpel \endif
+ *
+ * \brief
+ * This routine applies the six tap filter to the predictors in the
+ * vertical direction and interpolates them to obtain pixels at quarter vertical
+ * positions (0, 1/4) and (0, 3/4). The six tap filtering operation is
+ * described in sec 8.4.2.2.1 titled "Luma sample interpolation process"
+ *
+ * \param pu1_src: Pointer to the buffer containing the predictor values.
+ * pu1_src could point to the frame buffer or the predictor buffer.
+ * \param pu1_dst: Pointer to the destination buffer where the output of
+ * the six tap filter is stored.
+ * \param ht: Height of the rectangular pixel grid to be interpolated
+ * \param wd: Width of the rectangular pixel grid to be interpolated
+ * \param src_strd: Width of the buffer pointed to by puc_pred.
+ * \param dst_strd: Width of the destination buffer
+ * \param pu1_tmp: temporary buffer: UNUSED in this function
+ * \param dydx: x and y reference offset for qpel calculations.
+ *
+ * \return
+ * void
+ *
+ * \note
+ * This function takes the 8 bit predictor values, applies the six tap
+ * filter in the vertical direction and outputs the result clipped to
+ * 8 bit precision. The input is stored in the buffer pointed to by
+ * puc_pred while the output is stored in the buffer pointed by puc_dest.
+ * Both puc_pred and puc_dest could point to the same buffer i.e. the
+ * six tap filter could be done in place.
+ *
+ * \para <title>
+ * <paragraph>
+ * ...
+ **************************************************************************
+ */
+void ih264_inter_pred_luma_vert_qpel(UWORD8 *pu1_src,
+                                     UWORD8 *pu1_dst,
+                                     WORD32 src_strd,
+                                     WORD32 dst_strd,
+                                     WORD32 ht,
+                                     WORD32 wd,
+                                     UWORD8* pu1_tmp,
+                                     WORD32 dydx)
+{
+    WORD32 i4_row, i4_col;
+    /* Bits 2..3 of dydx give the vertical quarter-pel phase */
+    WORD32 i4_y_offset = (dydx >> 2) & 0x3;
+    WORD32 i4_off1, i4_off2, i4_off3;
+    UWORD8 *pu1_fpel;
+    UNUSED(pu1_tmp);
+
+    i4_off1 = src_strd;
+    i4_off2 = src_strd << 1;
+    i4_off3 = i4_off1 + i4_off2;
+
+    /* Full-pel row used for the final quarter-pel averaging */
+    pu1_fpel = pu1_src + (i4_y_offset >> 1) * src_strd;
+
+    for(i4_row = 0; i4_row < ht; i4_row++)
+    {
+        for(i4_col = 0; i4_col < wd; i4_col++)
+        {
+            WORD16 i2_vpel;
+            /* Vertical half-pel sample: 6-tap (1,-5,20,20,-5,1) filter */
+            i2_vpel = pu1_src[i4_col - i4_off2] + pu1_src[i4_col + i4_off3]
+                    - 5 * (pu1_src[i4_col - i4_off1] + pu1_src[i4_col + i4_off2])
+                    + 20 * (pu1_src[i4_col] + pu1_src[i4_col + i4_off1]);
+            i2_vpel = (i2_vpel + 16) >> 5;
+            i2_vpel = CLIP_U8(i2_vpel);
+            /* Quarter-pel: rounded average with the nearest full-pel row */
+            pu1_dst[i4_col] = (i2_vpel + pu1_fpel[i4_col] + 1) >> 1;
+        }
+        pu1_src += src_strd;
+        pu1_fpel += src_strd;
+        pu1_dst += dst_strd;
+    }
+}
+
+/*!
+ **************************************************************************
+ * \if Function name : ih264_inter_pred_luma_horz_qpel_vert_qpel \endif
+ *
+ * \brief
+ * This routine applies the six tap filter to the predictors in the
+ * vertical and horizontal direction and averages them to get pixels at locations
+ * (1/4,1/4), (1/4, 3/4), (3/4, 1/4) & (3/4, 3/4). The six tap filtering operation
+ * is described in sec 8.4.2.2.1 titled "Luma sample interpolation process"
+ *
+ * \param pu1_src: Pointer to the buffer containing the predictor values.
+ * pu1_src could point to the frame buffer or the predictor buffer.
+ * \param pu1_dst: Pointer to the destination buffer where the output of
+ * the six tap filter is stored.
+ * \param wd: Width of the rectangular pixel grid to be interpolated
+ * \param ht: Height of the rectangular pixel grid to be interpolated
+ * \param src_strd: Width of the buffer pointed to by puc_pred.
+ * \param dst_strd: Width of the destination buffer
+ * \param pu1_tmp: temporary buffer, UNUSED in this function
+ * \param dydx: x and y reference offset for qpel calculations.
+ *
+ * \return
+ * void
+ *
+ * \note
+ * This function takes the 8 bit predictor values, applies the six tap
+ * filter in the vertical direction and outputs the result clipped to
+ * 8 bit precision. The input is stored in the buffer pointed to by
+ * puc_pred while the output is stored in the buffer pointed by puc_dest.
+ * Both puc_pred and puc_dest could point to the same buffer i.e. the
+ * six tap filter could be done in place.
+ *
+ * \para <title>
+ * <paragraph>
+ * ...
+ **************************************************************************
+ */
+void ih264_inter_pred_luma_horz_qpel_vert_qpel(UWORD8 *pu1_src,
+                                               UWORD8 *pu1_dst,
+                                               WORD32 src_strd,
+                                               WORD32 dst_strd,
+                                               WORD32 ht,
+                                               WORD32 wd,
+                                               UWORD8* pu1_tmp,
+                                               WORD32 dydx)
+{
+    WORD32 i4_row, i4_col;
+    /* dydx packs the phases: bits 0..1 horizontal, bits 2..3 vertical */
+    WORD32 i4_x_offset = dydx & 0x3;
+    WORD32 i4_y_offset = (dydx >> 2) & 0x3;
+    WORD32 i4_off1, i4_off2, i4_off3;
+    UWORD8 *pu1_vert_src, *pu1_horz_src;
+    UNUSED(pu1_tmp);
+
+    i4_off1 = src_strd;
+    i4_off2 = src_strd << 1;
+    i4_off3 = i4_off1 + i4_off2;
+
+    /* The diagonal quarter-pel positions are the rounded average of a
+       vertically filtered half-pel (taken at the column selected by
+       x_offset) and a horizontally filtered half-pel (taken on the row
+       selected by y_offset) */
+    pu1_horz_src = pu1_src + (i4_y_offset >> 1) * src_strd;
+    pu1_vert_src = pu1_src + (i4_x_offset >> 1);
+
+    for(i4_row = 0; i4_row < ht; i4_row++)
+    {
+        for(i4_col = 0; i4_col < wd; i4_col++)
+        {
+            WORD16 i2_vert, i2_horz;
+
+            /* Vertical 6-tap (1,-5,20,20,-5,1) half-pel */
+            i2_vert = pu1_vert_src[i4_col - i4_off2] + pu1_vert_src[i4_col + i4_off3]
+                    - 5 * (pu1_vert_src[i4_col - i4_off1] + pu1_vert_src[i4_col + i4_off2])
+                    + 20 * (pu1_vert_src[i4_col] + pu1_vert_src[i4_col + i4_off1]);
+            i2_vert = (i2_vert + 16) >> 5;
+            i2_vert = CLIP_U8(i2_vert);
+
+            /* Horizontal 6-tap (1,-5,20,20,-5,1) half-pel */
+            i2_horz = pu1_horz_src[i4_col - 2] + pu1_horz_src[i4_col + 3]
+                    - 5 * (pu1_horz_src[i4_col - 1] + pu1_horz_src[i4_col + 2])
+                    + 20 * (pu1_horz_src[i4_col] + pu1_horz_src[i4_col + 1]);
+            i2_horz = (i2_horz + 16) >> 5;
+            i2_horz = CLIP_U8(i2_horz);
+
+            pu1_dst[i4_col] = (i2_vert + i2_horz + 1) >> 1;
+        }
+        pu1_vert_src += src_strd;
+        pu1_horz_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+}
+
+/*!
+ **************************************************************************
+ * \if Function name : ih264_inter_pred_luma_horz_qpel_vert_hpel \endif
+ *
+ * \brief
+ * This routine applies the six tap filter to the predictors in the vertical
+ * and horizontal direction to obtain the pixel at (1/2,1/2). It then interpolates
+ * pixel at (0,1/2) and (1/2,1/2) to obtain pixel at (1/4,1/2). Similarly for (3/4,1/2).
+ * The six tap filtering operation is described in sec 8.4.2.2.1 titled
+ * "Luma sample interpolation process"
+ *
+ * \param pu1_src: Pointer to the buffer containing the predictor values.
+ * pu1_src could point to the frame buffer or the predictor buffer.
+ * \param pu1_dst: Pointer to the destination buffer where the output of
+ * the six tap filter followed by interpolation is stored.
+ * \param wd: Width of the rectangular pixel grid to be interpolated
+ * \param ht: Height of the rectangular pixel grid to be interpolated
+ * \param src_strd: Width of the buffer pointed to by puc_pred.
+ * \param dst_strd: Width of the destination buffer
+ * \param pu1_tmp: buffer to store temporary output after 1st 6-tap filter.
+ * \param dydx: x and y reference offset for qpel calculations.
+ *
+ * \return
+ * void
+ *
+ * \note
+ * This function takes the 8 bit predictor values, applies the six tap
+ * filter in the vertical direction and outputs the result clipped to
+ * 8 bit precision. The input is stored in the buffer pointed to by
+ * puc_pred while the output is stored in the buffer pointed by puc_dest.
+ * Both puc_pred and puc_dest could point to the same buffer i.e. the
+ * six tap filter could be done in place.
+ *
+ * \para <title>
+ * <paragraph>
+ * ...
+ **************************************************************************
+ */
+void ih264_inter_pred_luma_horz_qpel_vert_hpel(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ht,
+ WORD32 wd,
+ UWORD8* pu1_tmp,
+ WORD32 dydx)
+{
+ WORD32 row, col;
+ WORD32 tmp;
+ WORD16* pi2_pred1_temp, *pi2_pred1;
+ UWORD8* pu1_dst_tmp;
+ /* Low two bits of dydx give the horizontal quarter-pel phase */
+ WORD32 x_offset = dydx & 0x3;
+ WORD16 i2_macro;
+
+ /* pu1_tmp holds the 16-bit vertical-pass intermediates; keep a 2-column
+ left margin so stage 2 can read its taps at [col - 2] */
+ pi2_pred1_temp = (WORD16*)pu1_tmp;
+ pi2_pred1_temp += 2;
+ pi2_pred1 = pi2_pred1_temp;
+ pu1_dst_tmp = pu1_dst;
+
+ /* Stage 1: vertical 6-tap filter over (wd + 5) columns per row, kept at
+ full 16-bit precision (no rounding yet) */
+ for(row = 0; row < ht; row++)
+ {
+ for(col = -2; col < wd + 3; col++)
+ {
+ tmp = 0;/*ih264_g_six_tap[] is the array containing the filter coeffs*/
+ tmp = ih264_g_six_tap[0] *
+ (pu1_src[col - 2 * src_strd] + pu1_src[col + 3 * src_strd])
+ + ih264_g_six_tap[1] *
+ (pu1_src[col - 1 * src_strd] + pu1_src[col + 2 * src_strd])
+ + ih264_g_six_tap[2] *
+ (pu1_src[col] + pu1_src[col + 1 * src_strd]);
+ pi2_pred1_temp[col] = tmp;
+ }
+
+ pu1_src += src_strd;
+ pi2_pred1_temp = pi2_pred1_temp + wd + 5;
+ }
+
+ /* Rewind to the start of the intermediate buffer for use in stage 3 */
+ pi2_pred1_temp = pi2_pred1;
+ /* Stage 2: horizontal 6-tap on the intermediates yields the (1/2,1/2)
+ half-pel in pu1_dst; (tmp + 512) >> 10 combines both passes' scaling */
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < wd; col++)
+ {
+ tmp = 0;/*ih264_g_six_tap[] is the array containing the filter coeffs*/
+ tmp = ih264_g_six_tap[0] *
+ (pi2_pred1[col - 2] + pi2_pred1[col + 3])
+ + ih264_g_six_tap[1] *
+ (pi2_pred1[col - 1] + pi2_pred1[col + 2])
+ + ih264_g_six_tap[2] *
+ (pi2_pred1[col] + pi2_pred1[col + 1]);
+ tmp = (tmp + 512) >> 10;
+ pu1_dst[col] = CLIP_U8(tmp);
+ }
+ pi2_pred1 += (wd + 5);
+ pu1_dst += dst_strd;
+ }
+
+ /* Stage 3: round/clip the vertical-only half-pel from the column picked
+ by x_offset >> 1 (NOTE: the intermediate buffer is modified in place)
+ and average it with the (1/2,1/2) value already in pu1_dst */
+ pu1_dst = pu1_dst_tmp;
+ pi2_pred1_temp += (x_offset >> 1);
+ for(row = ht; row != 0; row--)
+ {
+ for(col = wd; col != 0; col--, pu1_dst++, pi2_pred1_temp++)
+ {
+ UWORD8 uc_temp;
+ /* Clipping the output of the six tap filter obtained from the
+ first stage of the 2d filter stage */
+ *pi2_pred1_temp = (*pi2_pred1_temp + 16) >> 5;
+ i2_macro = (*pi2_pred1_temp);
+ uc_temp = CLIP_U8(i2_macro);
+ *pu1_dst = (*pu1_dst + uc_temp + 1) >> 1;
+ }
+ /* The inner loop advanced by wd; +5 skips the row margin so the
+ effective intermediate row stride stays (wd + 5) */
+ pi2_pred1_temp += 5;
+ pu1_dst += dst_strd - wd;
+ }
+}
+
+/*!
+ **************************************************************************
+ * \if Function name : ih264_inter_pred_luma_horz_hpel_vert_qpel \endif
+ *
+ * \brief
+ * This routine applies the six tap filter to the predictors in the horizontal
+ * and vertical direction to obtain the pixel at (1/2,1/2). It then interpolates
+ * pixel at (1/2,0) and (1/2,1/2) to obtain pixel at (1/2,1/4). Similarly for (1/2,3/4).
+ * The six tap filtering operation is described in sec 8.4.2.2.1 titled
+ * "Luma sample interpolation process"
+ *
+ * \param pu1_src: Pointer to the buffer containing the predictor values.
+ * pu1_src could point to the frame buffer or the predictor buffer.
+ * \param pu1_dst: Pointer to the destination buffer where the output of
+ * the six tap filter followed by interpolation is stored.
+ * \param wd: Width of the rectangular pixel grid to be interpolated
+ * \param ht: Height of the rectangular pixel grid to be interpolated
+ * \param src_strd: Width of the buffer pointed to by puc_pred.
+ * \param dst_strd: Width of the destination buffer
+ * \param pu1_tmp: buffer to store temporary output after 1st 6-tap filter.
+ * \param dydx: x and y reference offset for qpel calculations.
+ *
+ * \return
+ * void
+ *
+ * \note
+ * This function takes the 8 bit predictor values, applies the six tap
+ * filter in the vertical direction and outputs the result clipped to
+ * 8 bit precision. The input is stored in the buffer pointed to by
+ * puc_pred while the output is stored in the buffer pointed by puc_dest.
+ * Both puc_pred and puc_dest could point to the same buffer i.e. the
+ * six tap filter could be done in place.
+ *
+ * \para <title>
+ * <paragraph>
+ * ...
+ **************************************************************************
+ */
+void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ht,
+ WORD32 wd,
+ UWORD8* pu1_tmp,
+ WORD32 dydx)
+{
+
+ WORD32 row, col;
+ WORD32 tmp;
+ /* Bits 2..3 of dydx give the vertical quarter-pel phase */
+ WORD32 y_offset = dydx >> 2;
+ WORD16* pi2_pred1_temp, *pi2_pred1;
+ UWORD8* pu1_dst_tmp;
+ //WORD32 x_offset = dydx & 0x3;
+ WORD16 i2_macro;
+
+ y_offset = y_offset & 0x3;
+
+ /* pu1_tmp holds the 16-bit horizontal-pass intermediates with a row
+ stride of wd; reserve 2 rows on top so stage 2 can read its taps at
+ [col - 2 * wd] */
+ pi2_pred1_temp = (WORD16*)pu1_tmp;
+ pi2_pred1_temp += 2 * wd;
+ pi2_pred1 = pi2_pred1_temp;
+ pu1_dst_tmp = pu1_dst;
+ /* Stage 1: horizontal 6-tap filter over (ht + 5) rows (2 above, 3
+ below), kept at full 16-bit precision (no rounding yet) */
+ pu1_src -= 2 * src_strd;
+ for(row = -2; row < ht + 3; row++)
+ {
+ for(col = 0; col < wd; col++)
+ {
+ tmp = 0;/*ih264_g_six_tap[] is the array containing the filter coeffs*/
+ tmp = ih264_g_six_tap[0] * (pu1_src[col - 2] + pu1_src[col + 3])
+ + ih264_g_six_tap[1] * (pu1_src[col - 1] + pu1_src[col + 2])
+ + ih264_g_six_tap[2] * (pu1_src[col] + pu1_src[col + 1]);
+ pi2_pred1_temp[col - 2 * wd] = tmp;
+ }
+
+ pu1_src += src_strd;
+ pi2_pred1_temp += wd;
+ }
+ /* Rewind to the first intermediate row for use in stage 3 */
+ pi2_pred1_temp = pi2_pred1;
+ /* Stage 2: vertical 6-tap on the intermediates yields the (1/2,1/2)
+ half-pel in pu1_dst; (tmp + 512) >> 10 combines both passes' scaling */
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < wd; col++)
+ {
+ tmp = 0;/*ih264_g_six_tap[] is the array containing the filter coeffs*/
+ tmp = ih264_g_six_tap[0] * (pi2_pred1[col - 2 * wd] + pi2_pred1[col + 3 * wd])
+ + ih264_g_six_tap[1] * (pi2_pred1[col - 1 * wd] + pi2_pred1[col + 2 * wd])
+ + ih264_g_six_tap[2] * (pi2_pred1[col] + pi2_pred1[col + 1 * wd]);
+ tmp = (tmp + 512) >> 10;
+ pu1_dst[col] = CLIP_U8(tmp);
+ }
+ pi2_pred1 += wd;
+ pu1_dst += dst_strd;
+ }
+ /* Stage 3: round/clip the horizontal-only half-pel from the row picked
+ by y_offset >> 1 (NOTE: the intermediate buffer is modified in place)
+ and average it with the (1/2,1/2) value already in pu1_dst */
+ pu1_dst = pu1_dst_tmp;
+ pi2_pred1_temp += (y_offset >> 1) * wd;
+ for(row = ht; row != 0; row--)
+
+ {
+ for(col = wd; col != 0; col--, pu1_dst++, pi2_pred1_temp++)
+ {
+ UWORD8 u1_temp;
+ /* Clipping the output of the six tap filter obtained from the
+ first stage of the 2d filter stage */
+ *pi2_pred1_temp = (*pi2_pred1_temp + 16) >> 5;
+ i2_macro = (*pi2_pred1_temp);
+ u1_temp = CLIP_U8(i2_macro);
+ *pu1_dst = (*pu1_dst + u1_temp + 1) >> 1;
+ }
+ //pi16_pred1_temp += wd;
+ pu1_dst += dst_strd - wd;
+ }
+}
+
+/**
+ *******************************************************************************
+ * function:ih264_inter_pred_luma_bilinear
+ *
+ * @brief
+ * This routine applies the bilinear filter to the predictors .
+ * The filtering operation is described in
+ * sec 8.4.2.2.1 titled "Luma sample interpolation process"
+ *
+ * @par Description:
+\note
+ * This function is called to obtain pixels lying at the following
+ * locations (1/4,1), (3/4,1),(1,1/4), (1,3/4) ,(1/4,1/2), (3/4,1/2),(1/2,1/4), (1/2,3/4),(3/4,1/4),(1/4,3/4),(3/4,3/4)&& (1/4,1/4) .
+ * The function averages the two adjacent values from the two input arrays in horizontal direction.
+ *
+ *
+ * @param[in] pu1_src1:
+ * UWORD8 Pointer to the buffer containing the first input array.
+ *
+ * @param[in] pu1_src2:
+ * UWORD8 Pointer to the buffer containing the second input array.
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination where the output of bilinear filter is stored.
+ *
+ * @param[in] src_strd1
+ * Stride of the first input buffer
+ *
+ * @param[in] src_strd2
+ * Stride of the second input buffer
+ *
+ * @param[in] dst_strd
+ * integer destination stride of pu1_dst
+ *
+ * @param[in] ht
+ * integer height of the array
+ *
+ * @param[in] wd
+ * integer width of the array
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+void ih264_inter_pred_luma_bilinear(UWORD8 *pu1_src1,
+                                    UWORD8 *pu1_src2,
+                                    UWORD8 *pu1_dst,
+                                    WORD32 src_strd1,
+                                    WORD32 src_strd2,
+                                    WORD32 dst_strd,
+                                    WORD32 ht,
+                                    WORD32 wd)
+{
+    WORD32 i4_row, i4_col;
+
+    /* Rounded average of the two predictor arrays, sample by sample */
+    for(i4_row = 0; i4_row < ht; i4_row++)
+    {
+        for(i4_col = 0; i4_col < wd; i4_col++)
+        {
+            WORD16 i2_avg = (pu1_src1[i4_col] + pu1_src2[i4_col] + 1) >> 1;
+            pu1_dst[i4_col] = CLIP_U8(i2_avg);
+        }
+        pu1_src1 += src_strd1;
+        pu1_src2 += src_strd2;
+        pu1_dst += dst_strd;
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Interprediction chroma filter
+ *
+ * @par Description:
+ * Applies filtering to chroma samples as mentioned in
+ * sec 8.4.2.2.2 titled "chroma sample interpolation process"
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source containing alternate U and V samples
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] u1_dx
+ * dx value where the sample is to be produced(refer sec 8.4.2.2.2 )
+ *
+ * @param[in] u1_dy
+ * dy value where the sample is to be produced(refer sec 8.4.2.2.2 )
+ *
+ * @param[in] ht
+ * integer height of the array
+ *
+ * @param[in] wd
+ * integer width of the array
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+void ih264_inter_pred_chroma(UWORD8 *pu1_src,
+                             UWORD8 *pu1_dst,
+                             WORD32 src_strd,
+                             WORD32 dst_strd,
+                             WORD32 dx,
+                             WORD32 dy,
+                             WORD32 ht,
+                             WORD32 wd)
+{
+    WORD32 i4_row, i4_col;
+    /* Bilinear weights of eq (8-266), sec 8.4.2.2.2; they are constant for
+       the whole block, so compute them once */
+    WORD32 i4_w00 = (8 - dx) * (8 - dy);
+    WORD32 i4_w01 = dx * (8 - dy);
+    WORD32 i4_w10 = (8 - dx) * dy;
+    WORD32 i4_w11 = dx * dy;
+    WORD32 i4_row_samples = 2 * wd;
+
+    for(i4_row = 0; i4_row < ht; i4_row++)
+    {
+        UWORD8 *pu1_nxt_row = pu1_src + src_strd;
+        /* U and V samples are interleaved, so the horizontal neighbour of
+           the same plane is two bytes away; a row spans 2 * wd bytes */
+        for(i4_col = 0; i4_col < i4_row_samples; i4_col++)
+        {
+            WORD16 i2_acc = i4_w00 * pu1_src[i4_col]
+                          + i4_w01 * pu1_src[i4_col + 2]
+                          + i4_w10 * pu1_nxt_row[i4_col]
+                          + i4_w11 * pu1_nxt_row[i4_col + 2];
+            i2_acc = (i2_acc + 32) >> 6;
+            pu1_dst[i4_col] = CLIP_U8(i2_acc);
+        }
+        pu1_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+}
diff --git a/common/ih264_inter_pred_filters.h b/common/ih264_inter_pred_filters.h
new file mode 100755
index 0000000..c439ab8
--- /dev/null
+++ b/common/ih264_inter_pred_filters.h
@@ -0,0 +1,241 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+ *******************************************************************************
+ * @file
+ * ih264_inter_pred_filters.h
+ *
+ * @brief
+ * Declarations of functions used for inter prediction
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ * -ih264_inter_pred_luma_copy
+ * -ih264_interleave_copy
+ * -ih264_inter_pred_luma_horz
+ * -ih264_inter_pred_luma_vert
+ * -ih264_inter_pred_luma_horz_hpel_vert_hpel
+ * -ih264_inter_pred_luma_vert_qpel
+ * -ih264_inter_pred_luma_horz_qpel
+ * -ih264_inter_pred_luma_horz_qpel_vert_qpel
+ * -ih264_inter_pred_luma_horz_qpel_vert_hpel
+ * -ih264_inter_pred_luma_horz_hpel_vert_qpel
+ * -ih264_inter_pred_luma_bilinear
+ * -ih264_inter_pred_chroma
+ * -ih264_inter_pred_luma_copy_a9q
+ * -ih264_interleave_copy_a9
+ * -ih264_inter_pred_luma_horz_a9q
+ * -ih264_inter_pred_luma_vert_a9q
+ * -ih264_inter_pred_luma_bilinear_a9q
+ * -ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q
+ * -ih264_inter_pred_luma_horz_qpel_a9q
+ * -ih264_inter_pred_luma_vert_qpel_a9q
+ * -ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q
+ * -ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q
+ * -ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q
+ * -ih264_inter_pred_chroma_a9q
+ * -ih264_inter_pred_luma_copy_av8
+ * -ih264_interleave_copy_av8
+ * -ih264_inter_pred_luma_horz_av8
+ * -ih264_inter_pred_luma_vert_av8
+ * -ih264_inter_pred_luma_bilinear_av8
+ * -ih264_inter_pred_luma_horz_hpel_vert_hpel_av8
+ * -ih264_inter_pred_luma_horz_qpel_av8
+ * -ih264_inter_pred_luma_vert_qpel_av8
+ * -ih264_inter_pred_luma_horz_qpel_vert_qpel_av8
+ * -ih264_inter_pred_luma_horz_qpel_vert_hpel_av8
+ * -ih264_inter_pred_luma_horz_hpel_vert_qpel_av8
+ * -ih264_inter_pred_chroma_av8
+ * -ih264_inter_pred_chroma_dx_zero_av8
+ * -ih264_inter_pred_chroma_dy_zero_av8
+ * -ih264_inter_pred_luma_copy_ssse3
+ * -ih264_inter_pred_luma_copy_ssse3
+ * -ih264_inter_pred_luma_horz_ssse3
+ * -ih264_inter_pred_luma_vert_ssse3
+ * -ih264_inter_pred_luma_bilinear_ssse3
+ * -ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3
+ * -ih264_inter_pred_luma_horz_qpel_ssse3
+ * -ih264_inter_pred_luma_vert_qpel_ssse3
+ * -ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3
+ * -ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3
+ * -ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3
+ * -ih264_inter_pred_chroma_ssse3
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#ifndef _IH264_INTER_PRED_H_
+#define _IH264_INTER_PRED_H_
+
+/* NOTE(review): the guard macro _IH264_INTER_PRED_H_ begins with an
+   underscore followed by an uppercase letter, an identifier pattern
+   reserved to the implementation (C11 7.1.3); consider renaming it,
+   e.g. IH264_INTER_PRED_FILTERS_H_. */
+
+/*****************************************************************************/
+/* Constant Data variables */
+/*****************************************************************************/
+
+extern const WORD32 ih264_g_six_tap[3];/* coefficients for 6 tap filtering*/
+
+/*****************************************************************************/
+/* Extern Function Declarations */
+/*****************************************************************************/
+
+/* Common signature shared by all luma inter-prediction variants.
+   pu1_tmp is caller-provided scratch space and dydx packs the sub-pel
+   offsets -- exact semantics vary per variant; confirm against the
+   corresponding C definitions. */
+typedef void ih264_inter_pred_luma_ft(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ht,
+ WORD32 wd,
+ UWORD8* pu1_tmp,
+ WORD32 dydx);
+
+/* Block copy over interleaved data (see ih264_interleave_copy). */
+typedef void ih264_interleave_copy_ft(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ht,
+ WORD32 wd);
+
+/* Bi-prediction: rounded average of two source blocks into pu1_dst. */
+typedef void ih264_inter_pred_luma_bilinear_ft(UWORD8 *pu1_src1,
+ UWORD8 *pu1_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 height,
+ WORD32 width);
+
+/* Chroma MC; dx/dy are the eighth-pel offsets of sec 8.4.2.2.2. */
+typedef void ih264_inter_pred_chroma_ft(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 dx,
+ WORD32 dy,
+ WORD32 ht,
+ WORD32 wd);
+
+/* No NEON Declarations */
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_copy;
+
+ih264_interleave_copy_ft ih264_interleave_copy;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_vert;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_hpel_vert_hpel;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_vert_qpel;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_qpel;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_hpel;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_hpel_vert_qpel;
+
+ih264_inter_pred_luma_bilinear_ft ih264_inter_pred_luma_bilinear;
+
+ih264_inter_pred_chroma_ft ih264_inter_pred_chroma;
+
+/* A9 NEON Declarations */
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_copy_a9q;
+
+ih264_interleave_copy_ft ih264_interleave_copy_a9;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_a9q;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_vert_a9q;
+
+ih264_inter_pred_luma_bilinear_ft ih264_inter_pred_luma_bilinear_a9q;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_a9q;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_vert_qpel_a9q;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q;
+
+ih264_inter_pred_chroma_ft ih264_inter_pred_chroma_a9q;
+
+/* AV8 NEON Declarations */
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_copy_av8;
+
+ih264_interleave_copy_ft ih264_interleave_copy_av8;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_av8;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_vert_av8;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_hpel_vert_hpel_av8;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_av8;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_vert_qpel_av8;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_qpel_av8;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_hpel_av8;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_hpel_vert_qpel_av8;
+
+ih264_inter_pred_chroma_ft ih264_inter_pred_chroma_av8;
+
+ih264_inter_pred_chroma_ft ih264_inter_pred_chroma_dx_zero_av8;
+
+ih264_inter_pred_chroma_ft ih264_inter_pred_chroma_dy_zero_av8;
+
+
+/* SSSE3 Intrinsic Declarations */
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_copy_ssse3;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_ssse3;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_vert_ssse3;
+
+ih264_inter_pred_luma_bilinear_ft ih264_inter_pred_luma_bilinear_ssse3;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_ssse3;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_vert_qpel_ssse3;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3;
+
+ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3;
+
+ih264_inter_pred_chroma_ft ih264_inter_pred_chroma_ssse3;
+
+#endif
+
+/** Nothing past this point */
diff --git a/common/ih264_intra_pred_filters.h b/common/ih264_intra_pred_filters.h
new file mode 100755
index 0000000..caf6b33
--- /dev/null
+++ b/common/ih264_intra_pred_filters.h
@@ -0,0 +1,331 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_intra_pred_filters.h
+ *
+ * @brief
+ * Declarations of functions used for intra prediction
+ *
+ * @author
+ * Ittiam
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#ifndef IH264_INTRA_PRED_FILTERS_H_
+
+#define IH264_INTRA_PRED_FILTERS_H_
+
+/*****************************************************************************/
+/* Macro Expansion */
+/*****************************************************************************/
+
+/*! Filter (1,2,1) i.e (a + 2b + c) / 4, with rounding.
+    Arguments are fully parenthesized (CERT PRE01-C) so compound
+    expressions such as FILT121(x - y, b, c) expand correctly. */
+#define FILT121(a,b,c) (((a) + ((b) << 1) + (c) + 2) >> 2)
+/*! Filter (1,1) i.e (a + b) / 2, with rounding */
+#define FILT11(a,b) (((a) + (b) + 1) >> 1)
+/*****************************************************************************/
+/* Global Variables */
+/*****************************************************************************/
+
+/* Global variables used only in assembly files*/
+extern const WORD8 ih264_gai1_intrapred_luma_plane_coeffs[];
+extern const WORD8 ih264_gai1_intrapred_chroma_plane_coeffs1[];
+extern const WORD8 ih264_gai1_intrapred_chroma_plane_coeffs2[];
+extern const WORD8 ih264_gai1_intrapred_luma_8x8_horz_u[];
+
+/*****************************************************************************/
+/* Extern Function Declarations */
+/*****************************************************************************/
+
+
+/* Reference-sample filtering ahead of 8x8 intra prediction; ngbr_avail
+   flags which neighbour samples are available -- TODO(review): confirm
+   the flag layout against the C definition. */
+typedef void ih264_intra_pred_ref_filtering_ft(UWORD8 *pu1_left,
+ UWORD8 *pu1_topleft,
+ UWORD8 *pu1_top,
+ UWORD8 *pu1_dst,
+ WORD32 left_strd,
+ WORD32 ngbr_avail);
+
+/* Common signature for every luma/chroma intra prediction mode below. */
+typedef void ih264_intra_pred_luma_ft(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ngbr_avail);
+
+/* No Neon Definitions */
+
+/* Luma 4x4 Intra pred filters */
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_dc;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_diag_dl;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_diag_dr;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_r;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_d;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_l;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_u;
+
+/* Luma 8x8 Intra pred filters */
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_dc;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_diag_dl;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_diag_dr;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_r;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_d;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_l;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_u;
+
+/* Luma 16x16 Intra pred filters */
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_vert;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_horz;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_dc;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_plane;
+
+/* Chroma 8x8 Intra pred filters */
+
+typedef ih264_intra_pred_luma_ft ih264_intra_pred_chroma_ft;
+
+ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_dc;
+
+ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_horz;
+
+ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_vert;
+
+ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_plane;
+
+
+ih264_intra_pred_ref_filtering_ft ih264_intra_pred_luma_8x8_mode_ref_filtering;
+
+/* A9 Definition */
+
+/* Luma 4x4 Intra pred filters */
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_a9q;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_a9q;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_dc_a9q;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_diag_dl_a9q;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_diag_dr_a9q;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_r_a9q;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_d_a9q;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_l_a9q;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_u_a9q;
+
+/* Luma 8x8 Intra pred filters */
+
+ih264_intra_pred_ref_filtering_ft ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_a9q;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_a9q;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_dc_a9q;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_diag_dl_a9q;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_diag_dr_a9q;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_r_a9q;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_d_a9q;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_l_a9q;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_u_a9q;
+
+/* Luma 16x16 Intra pred filters */
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_vert_a9q;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_horz_a9q;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_dc_a9q;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_plane_a9q;
+
+/* Chroma 8x8 Intra pred filters */
+
+ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_dc_a9q;
+
+ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_horz_a9q;
+
+ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_vert_a9q;
+
+ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_plane_a9q;
+
+/* X86 Intrinsic Definitions */
+
+/* Luma 4x4 Intra pred filters */
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_ssse3;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_ssse3;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_dc_ssse3;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_diag_dl_ssse3;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_diag_dr_ssse3;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_r_ssse3;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_d_ssse3;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_l_ssse3;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_u_ssse3;
+
+/* Luma 8x8 Intra pred filters */
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_ssse3;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_ssse3;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_dc_ssse3;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_diag_dl_ssse3;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_diag_dr_ssse3;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_r_ssse3;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_d_ssse3;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_l_ssse3;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_u_ssse3;
+
+/* Luma 16x16 Intra pred filters */
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_vert_ssse3;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_horz_ssse3;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_dc_ssse3;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_plane_ssse3;
+
+/* Chroma 8x8 Intra pred filters */
+
+ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_dc_ssse3;
+
+ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_horz_ssse3;
+
+ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_vert_ssse3;
+
+ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_plane_ssse3;
+
+/* AV8 Definition */
+
+/* Luma 4x4 Intra pred filters */
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_av8;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_av8;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_dc_av8;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_diag_dl_av8;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_diag_dr_av8;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_r_av8;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_d_av8;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_vert_l_av8;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_4x4_mode_horz_u_av8;
+
+/* Luma 8x8 Intra pred filters */
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_av8;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_av8;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_dc_av8;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_diag_dl_av8;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_diag_dr_av8;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_r_av8;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_d_av8;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_vert_l_av8;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_8x8_mode_horz_u_av8;
+
+/* Luma 16x16 Intra pred filters */
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_vert_av8;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_horz_av8;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_dc_av8;
+
+ih264_intra_pred_luma_ft ih264_intra_pred_luma_16x16_mode_plane_av8;
+
+/* Chroma 8x8 Intra pred filters */
+
+ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_dc_av8;
+
+ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_horz_av8;
+
+ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_vert_av8;
+
+ih264_intra_pred_chroma_ft ih264_intra_pred_chroma_8x8_mode_plane_av8;
+
+#endif /* IH264_INTRA_PRED_FILTERS_H_ */
diff --git a/common/ih264_iquant_itrans_recon.c b/common/ih264_iquant_itrans_recon.c
new file mode 100755
index 0000000..3c14046
--- /dev/null
+++ b/common/ih264_iquant_itrans_recon.c
@@ -0,0 +1,873 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_iquant_itrans_recon.c
+ *
+ * @brief
+ * Contains definition of functions for h264 inverse quantization, inverse transformation and recon
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ * - ih264_iquant_itrans_recon_4x4()
+ * - ih264_iquant_itrans_recon_8x8()
+ * - ih264_iquant_itrans_recon_4x4_dc()
+ * - ih264_iquant_itrans_recon_8x8_dc()
+ * - ih264_iquant_itrans_recon_chroma_4x4()
+ * - ih264_iquant_itrans_recon_chroma_4x4_dc()
+ *
+ * @remarks
+ *
+ *******************************************************************************
+ */
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* User include files */
+#include "ih264_typedefs.h"
+#include "ih264_defs.h"
+#include "ih264_trans_macros.h"
+#include "ih264_macros.h"
+#include "ih264_platform_macros.h"
+#include "ih264_trans_data.h"
+#include "ih264_size_defs.h"
+#include "ih264_structs.h"
+#include "ih264_trans_quant_itrans_iquant.h"
+
+/*
+ ********************************************************************************
+ *
+ * @brief This function reconstructs a 4x4 sub block from quantized residue and
+ * prediction buffer
+ *
+ * @par Description:
+ * The quantized residue is first inverse quantized, then inverse transformed.
+ * This inverse transformed content is added to the prediction buffer to recon-
+ * struct the end output
+ *
+ * @param[in] pi2_src
+ * quantized 4x4 block
+ *
+ * @param[in] pu1_pred
+ * prediction 4x4 block
+ *
+ * @param[out] pu1_out
+ * reconstructed 4x4 block
+ *
+ * @param[in] pred_strd
+ * Prediction buffer stride
+ *
+ * @param[in] out_strd
+ * recon buffer stride
+ *
+ * @param[in] pu2_iscal_mat
+ * pointer to inverse scale matrix
+ *
+ * @param[in] pu2_weigh_mat
+ * pointer to scaling list (weight matrix)
+ *
+ * @param[in] u4_qp_div_6
+ * Floor (qp/6)
+ *
+ * @param[in] pi2_tmp
+ * temporary buffer of size 1*16
+ *
+ * @param[in] iq_start_idx
+ * 1 for intra blocks whose DC value is restored from pi2_dc_ld_addr, else 0
+ *
+ * @param[in] pi2_dc_ld_addr
+ * pointer to the decoded DC value used when iq_start_idx == 1
+ *
+ * @returns none
+ *
+ * @remarks none
+ *
+ *******************************************************************************
+ */
+void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_out,
+ WORD32 pred_strd,
+ WORD32 out_strd,
+ const UWORD16 *pu2_iscal_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 u4_qp_div_6,
+ WORD16 *pi2_tmp,
+ WORD32 iq_start_idx,
+ WORD16 *pi2_dc_ld_addr
+)
+{
+ WORD16 *pi2_src_ptr = pi2_src;
+ WORD16 *pi2_tmp_ptr = pi2_tmp;
+ UWORD8 *pu1_pred_ptr = pu1_pred;
+ UWORD8 *pu1_out_ptr = pu1_out;
+ WORD16 x0, x1, x2, x3, i;
+ WORD32 q0, q1, q2, q3;
+ WORD16 i_macro;
+ /* Rounding term for the dequant shift; zero once u4_qp_div_6 >= 4 */
+ /* (INV_QUANT then shifts left rather than right, so no rounding). */
+ WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
+
+ /* inverse quant */
+ /*horizontal inverse transform */
+ for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
+ {
+ q0 = pi2_src_ptr[0];
+ INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact,
+ 4);
+ if (i==0 && iq_start_idx == 1)
+ q0 = pi2_dc_ld_addr[0]; // Restoring dc value for intra case
+
+ q2 = pi2_src_ptr[2];
+ INV_QUANT(q2, pu2_iscal_mat[2], pu2_weigh_mat[2], u4_qp_div_6, rnd_fact,
+ 4);
+
+ /* even butterfly of the 4-point inverse core transform */
+ x0 = q0 + q2;
+ x1 = q0 - q2;
+
+ q1 = pi2_src_ptr[1];
+ INV_QUANT(q1, pu2_iscal_mat[1], pu2_weigh_mat[1], u4_qp_div_6, rnd_fact,
+ 4);
+
+ q3 = pi2_src_ptr[3];
+ INV_QUANT(q3, pu2_iscal_mat[3], pu2_weigh_mat[3], u4_qp_div_6, rnd_fact,
+ 4);
+
+ /* odd butterfly: the >>1 implements the 1/2-weight of the transform */
+ x2 = (q1 >> 1) - q3;
+ x3 = q1 + (q3 >> 1);
+
+ pi2_tmp_ptr[0] = x0 + x3;
+ pi2_tmp_ptr[1] = x1 + x2;
+ pi2_tmp_ptr[2] = x1 - x2;
+ pi2_tmp_ptr[3] = x0 - x3;
+
+ pi2_src_ptr += SUB_BLK_WIDTH_4x4;
+ pi2_tmp_ptr += SUB_BLK_WIDTH_4x4;
+ pu2_iscal_mat += SUB_BLK_WIDTH_4x4;
+ pu2_weigh_mat += SUB_BLK_WIDTH_4x4;
+ }
+
+ /* vertical inverse transform; strides of 4 (indices 0/4/8/12) walk one */
+ /* column of the 4x4 tmp buffer */
+ pi2_tmp_ptr = pi2_tmp;
+ for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
+ {
+ pu1_pred_ptr = pu1_pred;
+ pu1_out = pu1_out_ptr;
+
+ x0 = (pi2_tmp_ptr[0] + pi2_tmp_ptr[8]);
+ x1 = (pi2_tmp_ptr[0] - pi2_tmp_ptr[8]);
+ x2 = (pi2_tmp_ptr[4] >> 1) - pi2_tmp_ptr[12];
+ x3 = pi2_tmp_ptr[4] + (pi2_tmp_ptr[12] >> 1);
+
+ /* inverse prediction: round (x + 32) >> 6, add prediction, clip to 8 bit */
+ i_macro = x0 + x3;
+ i_macro = ((i_macro + 32) >> 6);
+ i_macro += *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(i_macro);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ i_macro = x1 + x2;
+ i_macro = ((i_macro + 32) >> 6);
+ i_macro += *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(i_macro);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ i_macro = x1 - x2;
+ i_macro = ((i_macro + 32) >> 6);
+ i_macro += *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(i_macro);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ i_macro = x0 - x3;
+ i_macro = ((i_macro + 32) >> 6);
+ i_macro += *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(i_macro);
+
+ /* advance to the next column / output pixel */
+ pi2_tmp_ptr++;
+ pu1_out_ptr++;
+ pu1_pred++;
+ }
+
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  Inverse quant and recon for a 4x4 block that has only a DC coefficient
+ *
+ * @par Description:
+ *  With a single non-zero (DC) coefficient the inverse transform degenerates
+ *  to adding one rounded value to every prediction sample. Signature matches
+ *  ih264_iquant_itrans_recon_4x4; pi2_tmp is unused here.
+ *
+ *******************************************************************************
+ */
+void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_out,
+ WORD32 pred_strd,
+ WORD32 out_strd,
+ const UWORD16 *pu2_iscal_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 u4_qp_div_6,
+ WORD16 *pi2_tmp,
+ WORD32 iq_start_idx,
+ WORD16 *pi2_dc_ld_addr)
+{
+ UWORD8 *pu1_pred_ptr = pu1_pred;
+ UWORD8 *pu1_out_ptr = pu1_out;
+ WORD32 q0;
+ WORD16 x, i_macro, i;
+ /* rounding term for the dequant shift; zero once u4_qp_div_6 >= 4 */
+ WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
+ UNUSED(pi2_tmp);
+
+ if (iq_start_idx == 0)
+ {
+ q0 = pi2_src[0];
+ INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
+ }
+ else
+ {
+ q0 = pi2_dc_ld_addr[0]; // Restoring dc value for intra case
+ }
+ /* single rounded residual value shared by all 16 output samples */
+ i_macro = ((q0 + 32) >> 6);
+ for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
+ {
+ pu1_pred_ptr = pu1_pred;
+ pu1_out = pu1_out_ptr;
+
+ /* inverse prediction */
+
+ x = i_macro + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(x);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ x = i_macro + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(x);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ x = i_macro + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(x);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ x = i_macro + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(x);
+
+ pu1_out_ptr++;
+ pu1_pred++;
+ }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse quant and inverse transform for an 8x8 block
+ *
+ * @par Description:
+ * Performs the 8x8 inverse transform Ci8 and adds the residue to the
+ * prediction to get the reconstructed block
+ *
+ * @param[in] pi2_src
+ * Input 8x8 coefficients
+ *
+ * @param[in] pu1_pred
+ * Prediction 8x8 block
+ *
+ * @param[out] pu1_out
+ * Output 8x8 block
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] out_strd
+ * Output stride
+ *
+ * @param[in] pu2_iscale_mat
+ * Pointer to the inverse scaling matrix
+ *
+ * @param[in] pu2_weigh_mat
+ * Pointer to the weight matrix (scaling list)
+ *
+ * @param[in] qp_div
+ * QP/6
+ *
+ * @param[in] pi2_tmp
+ * temporary buffer of size 8x8; we don't need a bigger block since we
+ * reuse the buffer for each block
+ *
+ * @param[in] iq_start_idx, pi2_dc_ld_addr
+ * unused in this variant; kept for a uniform function signature
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_out,
+ WORD32 pred_strd,
+ WORD32 out_strd,
+ const UWORD16 *pu2_iscale_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 qp_div,
+ WORD16 *pi2_tmp,
+ WORD32 iq_start_idx,
+ WORD16 *pi2_dc_ld_addr
+)
+{
+ WORD32 i;
+ WORD16 *pi2_tmp_ptr = pi2_tmp;
+ UWORD8 *pu1_pred_ptr = pu1_pred;
+ UWORD8 *pu1_out_ptr = pu1_out;
+ WORD16 i_z0, i_z1, i_z2, i_z3, i_z4, i_z5, i_z6, i_z7;
+ WORD16 i_y0, i_y1, i_y2, i_y3, i_y4, i_y5, i_y6, i_y7;
+ WORD16 i_macro;
+ WORD32 q;
+ WORD32 rnd_fact = (qp_div < 6) ? (1 << (5 - qp_div)) : 0;
+ UNUSED(iq_start_idx);
+ UNUSED(pi2_dc_ld_addr);
+ /*************************************************************/
+ /* De quantization of coefficients. Will be replaced by SIMD */
+ /* operations on platform. Note : DC coeff is not scaled */
+ /*************************************************************/
+ for(i = 0; i < (SUB_BLK_WIDTH_8x8 * SUB_BLK_WIDTH_8x8); i++)
+ {
+ q = pi2_src[i];
+ INV_QUANT(q, pu2_iscale_mat[i], pu2_weigh_mat[i], qp_div, rnd_fact, 6);
+ pi2_tmp_ptr[i] = q;
+ }
+ /* Perform Inverse transform */
+ /*--------------------------------------------------------------------*/
+ /* IDCT [ Horizontal transformation ] */
+ /*--------------------------------------------------------------------*/
+ for(i = 0; i < SUB_BLK_WIDTH_8x8; i++)
+ {
+ /*------------------------------------------------------------------*/
+ /* y0 = w0 + w4 */
+ /* y1 = -w3 + w5 - w7 - (w7 >> 1) */
+ /* y2 = w0 - w4 */
+ /* y3 = w1 + w7 - w3 - (w3 >> 1) */
+ /* y4 = (w2 >> 1) - w6 */
+ /* y5 = -w1 + w7 + w5 + (w5 >> 1) */
+ /* y6 = w2 + (w6 >> 1) */
+ /* y7 = w3 + w5 + w1 + (w1 >> 1) */
+ /*------------------------------------------------------------------*/
+ i_y0 = (pi2_tmp_ptr[0] + pi2_tmp_ptr[4] );
+
+ i_y1 = ((WORD32)(-pi2_tmp_ptr[3]) + pi2_tmp_ptr[5] - pi2_tmp_ptr[7]
+ - (pi2_tmp_ptr[7] >> 1));
+
+ i_y2 = (pi2_tmp_ptr[0] - pi2_tmp_ptr[4] );
+
+ i_y3 = ((WORD32)pi2_tmp_ptr[1] + pi2_tmp_ptr[7] - pi2_tmp_ptr[3]
+ - (pi2_tmp_ptr[3] >> 1));
+
+ i_y4 = ((pi2_tmp_ptr[2] >> 1) - pi2_tmp_ptr[6] );
+
+ i_y5 = ((WORD32)(-pi2_tmp_ptr[1]) + pi2_tmp_ptr[7] + pi2_tmp_ptr[5]
+ + (pi2_tmp_ptr[5] >> 1));
+
+ i_y6 = (pi2_tmp_ptr[2] + (pi2_tmp_ptr[6] >> 1));
+
+ i_y7 = ((WORD32)pi2_tmp_ptr[3] + pi2_tmp_ptr[5] + pi2_tmp_ptr[1]
+ + (pi2_tmp_ptr[1] >> 1));
+
+ /*------------------------------------------------------------------*/
+ /* z0 = y0 + y6 */
+ /* z1 = y1 + (y7 >> 2) */
+ /* z2 = y2 + y4 */
+ /* z3 = y3 + (y5 >> 2) */
+ /* z4 = y2 - y4 */
+ /* z5 = (y3 >> 2) - y5 */
+ /* z6 = y0 - y6 */
+ /* z7 = y7 - (y1 >> 2) */
+ /*------------------------------------------------------------------*/
+ i_z0 = i_y0 + i_y6;
+ i_z1 = i_y1 + (i_y7 >> 2);
+ i_z2 = i_y2 + i_y4;
+ i_z3 = i_y3 + (i_y5 >> 2);
+ i_z4 = i_y2 - i_y4;
+ i_z5 = (i_y3 >> 2) - i_y5;
+ i_z6 = i_y0 - i_y6;
+ i_z7 = i_y7 - (i_y1 >> 2);
+
+ /*------------------------------------------------------------------*/
+ /* x0 = z0 + z7 */
+ /* x1 = z2 + z5 */
+ /* x2 = z4 + z3 */
+ /* x3 = z6 + z1 */
+ /* x4 = z6 - z1 */
+ /* x5 = z4 - z3 */
+ /* x6 = z2 - z5 */
+ /* x7 = z0 - z7 */
+ /*------------------------------------------------------------------*/
+ pi2_tmp_ptr[0] = i_z0 + i_z7;
+ pi2_tmp_ptr[1] = i_z2 + i_z5;
+ pi2_tmp_ptr[2] = i_z4 + i_z3;
+ pi2_tmp_ptr[3] = i_z6 + i_z1;
+ pi2_tmp_ptr[4] = i_z6 - i_z1;
+ pi2_tmp_ptr[5] = i_z4 - i_z3;
+ pi2_tmp_ptr[6] = i_z2 - i_z5;
+ pi2_tmp_ptr[7] = i_z0 - i_z7;
+
+ /* move to the next row */
+ //pi2_src_ptr += SUB_BLK_WIDTH_8x8;
+ pi2_tmp_ptr += SUB_BLK_WIDTH_8x8;
+ }
+ /*--------------------------------------------------------------------*/
+ /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
+ /* */
+ /* Add the prediction and store it back to reconstructed frame buffer */
+ /* [Prediction buffer itself in this case] */
+ /*--------------------------------------------------------------------*/
+
+ pi2_tmp_ptr = pi2_tmp;
+ for(i = 0; i < SUB_BLK_WIDTH_8x8; i++)
+ {
+ pu1_pred_ptr = pu1_pred;
+ pu1_out = pu1_out_ptr;
+ /*------------------------------------------------------------------*/
+ /* y0j = w0j + w4j */
+ /* y1j = -w3j + w5j -w7j -(w7j >> 1) */
+ /* y2j = w0j -w4j */
+ /* y3j = w1j + w7j -w3j -(w3j >> 1) */
+ /* y4j = ( w2j >> 1 ) -w6j */
+ /* y5j = -w1j + w7j + w5j + (w5j >> 1) */
+ /* y6j = w2j + ( w6j >> 1 ) */
+ /* y7j = w3j + w5j + w1j + (w1j >> 1) */
+ /*------------------------------------------------------------------*/
+ i_y0 = pi2_tmp_ptr[0] + pi2_tmp_ptr[32];
+
+ i_y1 = (WORD32)(-pi2_tmp_ptr[24]) + pi2_tmp_ptr[40] - pi2_tmp_ptr[56]
+ - (pi2_tmp_ptr[56] >> 1);
+
+ i_y2 = pi2_tmp_ptr[0] - pi2_tmp_ptr[32];
+
+ i_y3 = (WORD32)pi2_tmp_ptr[8] + pi2_tmp_ptr[56] - pi2_tmp_ptr[24]
+ - (pi2_tmp_ptr[24] >> 1);
+
+ i_y4 = (pi2_tmp_ptr[16] >> 1) - pi2_tmp_ptr[48];
+
+ i_y5 = (WORD32)(-pi2_tmp_ptr[8]) + pi2_tmp_ptr[56] + pi2_tmp_ptr[40]
+ + (pi2_tmp_ptr[40] >> 1);
+
+ i_y6 = pi2_tmp_ptr[16] + (pi2_tmp_ptr[48] >> 1);
+
+ i_y7 = (WORD32)pi2_tmp_ptr[24] + pi2_tmp_ptr[40] + pi2_tmp_ptr[8]
+ + (pi2_tmp_ptr[8] >> 1);
+
+ /*------------------------------------------------------------------*/
+ /* z0j = y0j + y6j */
+ /* z1j = y1j + (y7j >> 2) */
+ /* z2j = y2j + y4j */
+ /* z3j = y3j + (y5j >> 2) */
+ /* z4j = y2j -y4j */
+ /* z5j = (y3j >> 2) -y5j */
+ /* z6j = y0j -y6j */
+ /* z7j = y7j -(y1j >> 2) */
+ /*------------------------------------------------------------------*/
+ i_z0 = i_y0 + i_y6;
+ i_z1 = i_y1 + (i_y7 >> 2);
+ i_z2 = i_y2 + i_y4;
+ i_z3 = i_y3 + (i_y5 >> 2);
+ i_z4 = i_y2 - i_y4;
+ i_z5 = (i_y3 >> 2) - i_y5;
+ i_z6 = i_y0 - i_y6;
+ i_z7 = i_y7 - (i_y1 >> 2);
+
+ /*------------------------------------------------------------------*/
+ /* x0j = z0j + z7j */
+ /* x1j = z2j + z5j */
+ /* x2j = z4j + z3j */
+ /* x3j = z6j + z1j */
+ /* x4j = z6j -z1j */
+ /* x5j = z4j -z3j */
+ /* x6j = z2j -z5j */
+ /* x7j = z0j -z7j */
+ /*------------------------------------------------------------------*/
+ i_macro = ((i_z0 + i_z7 + 32) >> 6) + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(i_macro);
+ /* Change uc_recBuffer to Point to next element in the same column*/
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ i_macro = ((i_z2 + i_z5 + 32) >> 6) + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(i_macro);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ i_macro = ((i_z4 + i_z3 + 32) >> 6) + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(i_macro);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ i_macro = ((i_z6 + i_z1 + 32) >> 6) + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(i_macro);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ i_macro = ((i_z6 - i_z1 + 32) >> 6) + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(i_macro);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ i_macro = ((i_z4 - i_z3 + 32) >> 6) + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(i_macro);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ i_macro = ((i_z2 - i_z5 + 32) >> 6) + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(i_macro);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ i_macro = ((i_z0 - i_z7 + 32) >> 6) + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(i_macro);
+
+ pi2_tmp_ptr++;
+ pu1_out_ptr++;
+ pu1_pred++;
+ }
+}
+
+void ih264_iquant_itrans_recon_8x8_dc(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_out,
+ WORD32 pred_strd,
+ WORD32 out_strd,
+ const UWORD16 *pu2_iscale_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 qp_div,
+ WORD16 *pi2_tmp,
+ WORD32 iq_start_idx,
+ WORD16 *pi2_dc_ld_addr)
+{
+ UWORD8 *pu1_pred_ptr = pu1_pred;
+ UWORD8 *pu1_out_ptr = pu1_out;
+ WORD16 x, i, i_macro;
+ WORD32 q;
+ WORD32 rnd_fact = (qp_div < 6) ? (1 << (5 - qp_div)) : 0;
+ UNUSED(pi2_tmp);
+ UNUSED(iq_start_idx);
+ UNUSED(pi2_dc_ld_addr);
+ /*************************************************************/
+ /* Dequantization of coefficients. Will be replaced by SIMD */
+ /* operations on platform. Note : DC coeff is not scaled */
+ /*************************************************************/
+ q = pi2_src[0];
+ INV_QUANT(q, pu2_iscale_mat[0], pu2_weigh_mat[0], qp_div, rnd_fact, 6);
+ i_macro = (q + 32) >> 6;
+ /* Perform Inverse transform */
+ /*--------------------------------------------------------------------*/
+ /* IDCT [ Horizontal transformation ] */
+ /*--------------------------------------------------------------------*/
+ /*--------------------------------------------------------------------*/
+ /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
+ /* */
+ /* Add the prediction and store it back to reconstructed frame buffer */
+ /* [Prediction buffer itself in this case] */
+ /*--------------------------------------------------------------------*/
+ for(i = 0; i < SUB_BLK_WIDTH_8x8; i++)
+ {
+ pu1_pred_ptr = pu1_pred;
+ pu1_out = pu1_out_ptr;
+
+ x = i_macro + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(x);
+ /* Change uc_recBuffer to Point to next element in the same column*/
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ x = i_macro + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(x);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ x = i_macro + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(x);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ x = i_macro + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(x);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ x = i_macro + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(x);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ x = i_macro + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(x);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ x = i_macro + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(x);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ x = i_macro + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(x);
+
+ pu1_out_ptr++;
+ pu1_pred++;
+ }
+}
+
+/*
+ ********************************************************************************
+ *
+ * @brief This function reconstructs a 4x4 sub block from quantized residue and
+ * prediction buffer
+ *
+ * @par Description:
+ * The quantized residue is first inverse quantized, then inverse transformed.
+ * This inverse transformed content is added to the prediction buffer to recon-
+ * struct the end output
+ *
+ * @param[in] pi2_src
+ * quantized 4x4 block
+ *
+ * @param[in] pu1_pred
+ * prediction 4x4 block
+ *
+ * @param[out] pu1_out
+ * reconstructed 4x4 block
+ *
+ * @param[in] src_strd
+ * quantization buffer stride
+ *
+ * @param[in] pred_strd,
+ * Prediction buffer stride
+ *
+ * @param[in] out_strd
+ * recon buffer Stride
+ *
+ * @param[in] pu2_scaling_list
+ * pointer to scaling list
+ *
+ * @param[in] pu2_norm_adjust
+ * pointer to inverse scale matrix
+ *
+ * @param[in] u4_qp_div_6
+ * Floor (qp/6)
+ *
+ * @param[in] pi4_tmp
+ * temporary buffer of size 1*16
+ *
+ * @returns none
+ *
+ * @remarks none
+ *
+ *******************************************************************************
+ */
void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src,
                                          UWORD8 *pu1_pred,
                                          UWORD8 *pu1_out,
                                          WORD32 pred_strd,
                                          WORD32 out_strd,
                                          const UWORD16 *pu2_iscal_mat,
                                          const UWORD16 *pu2_weigh_mat,
                                          UWORD32 u4_qp_div_6,
                                          WORD16 *pi2_tmp,
                                          WORD16 *pi2_dc_src)
{
    WORD16 *pi2_src_ptr = pi2_src;
    WORD16 *pi2_tmp_ptr = pi2_tmp;
    UWORD8 *pu1_pred_ptr = pu1_pred;
    UWORD8 *pu1_out_ptr = pu1_out;
    WORD16 x0, x1, x2, x3, i;
    WORD32 q0, q1, q2, q3;
    WORD16 i_macro;
    /* Rounding term for the dequant shift; zero once qp/6 >= 4 because
     * the 4-bit shift inside INV_QUANT no longer discards bits */
    WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;

    /* inverse quant */
    /*horizontal inverse transform */
    /* One iteration per row: dequantize the four coefficients and apply
     * the 1-D butterfly, writing the intermediate row into pi2_tmp */
    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        if(i==0)
        {
            /* DC coefficient is taken from pi2_dc_src without INV_QUANT --
             * presumably already dequantized by the chroma DC (Hadamard)
             * stage; TODO confirm against the caller */
            q0 = pi2_dc_src[0];
        }
        else
        {
            q0 = pi2_src_ptr[0];
            INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
        }

        q2 = pi2_src_ptr[2];
        INV_QUANT(q2, pu2_iscal_mat[2], pu2_weigh_mat[2], u4_qp_div_6, rnd_fact,
                  4);

        /* Even part of the butterfly */
        x0 = q0 + q2;
        x1 = q0 - q2;

        q1 = pi2_src_ptr[1];
        INV_QUANT(q1, pu2_iscal_mat[1], pu2_weigh_mat[1], u4_qp_div_6, rnd_fact,
                  4);

        q3 = pi2_src_ptr[3];
        INV_QUANT(q3, pu2_iscal_mat[3], pu2_weigh_mat[3], u4_qp_div_6, rnd_fact,
                  4);

        /* Odd part: the >>1 halvings are part of the H.264 core transform */
        x2 = (q1 >> 1) - q3;
        x3 = q1 + (q3 >> 1);

        pi2_tmp_ptr[0] = x0 + x3;
        pi2_tmp_ptr[1] = x1 + x2;
        pi2_tmp_ptr[2] = x1 - x2;
        pi2_tmp_ptr[3] = x0 - x3;

        /* Advance to the next row of source, temp and scale matrices */
        pi2_src_ptr += SUB_BLK_WIDTH_4x4;
        pi2_tmp_ptr += SUB_BLK_WIDTH_4x4;
        pu2_iscal_mat += SUB_BLK_WIDTH_4x4;
        pu2_weigh_mat += SUB_BLK_WIDTH_4x4;
    }

    /* vertical inverse transform */
    /* One iteration per column: butterfly down the column of pi2_tmp,
     * round with (x + 32) >> 6, add prediction and clip */
    pi2_tmp_ptr = pi2_tmp;
    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        pu1_pred_ptr = pu1_pred;
        pu1_out = pu1_out_ptr;

        /* Offsets 0/4/8/12 are successive rows; the extra factor of 2
         * relative to 4x4 is because pi2_tmp holds WORD16 rows of 4 but
         * is indexed here as a flat array (row stride 4, read as 8/2) --
         * offsets 0, 8 are rows 0 and 2; 4, 12 are rows 1 and 3 */
        x0 = (pi2_tmp_ptr[0] + pi2_tmp_ptr[8]);
        x1 = (pi2_tmp_ptr[0] - pi2_tmp_ptr[8]);
        x2 = (pi2_tmp_ptr[4] >> 1) - pi2_tmp_ptr[12];
        x3 = pi2_tmp_ptr[4] + (pi2_tmp_ptr[12] >> 1);

        /* inverse prediction */
        i_macro = x0 + x3;
        i_macro = ((i_macro + 32) >> 6);
        i_macro += *pu1_pred_ptr;
        *pu1_out = CLIP_U8(i_macro);
        pu1_pred_ptr += pred_strd;
        pu1_out += out_strd;

        i_macro = x1 + x2;
        i_macro = ((i_macro + 32) >> 6);
        i_macro += *pu1_pred_ptr;
        *pu1_out = CLIP_U8(i_macro);
        pu1_pred_ptr += pred_strd;
        pu1_out += out_strd;

        i_macro = x1 - x2;
        i_macro = ((i_macro + 32) >> 6);
        i_macro += *pu1_pred_ptr;
        *pu1_out = CLIP_U8(i_macro);
        pu1_pred_ptr += pred_strd;
        pu1_out += out_strd;

        i_macro = x0 - x3;
        i_macro = ((i_macro + 32) >> 6);
        i_macro += *pu1_pred_ptr;
        *pu1_out = CLIP_U8(i_macro);

        pi2_tmp_ptr++;
        /* Step of 2: chroma recon/pred buffers are Cb/Cr interleaved */
        pu1_out_ptr+= 2; //Interleaved store for output
        pu1_pred+= 2; //Interleaved load for pred buffer
    }
}
+
+/*
+ ********************************************************************************
+ *
+ * @brief This function reconstructs a 4x4 sub block from quantized residue and
+ * prediction buffer if only dc value is present for residue
+ *
+ * @par Description:
+ * The quantized residue is first inverse quantized,
+ * This inverse quantized content is added to the prediction buffer to recon-
+ * struct the end output
+ *
+ * @param[in] pi2_src
+ * quantized dc coefficient
+ *
+ * @param[in] pu1_pred
+ * prediction 4x4 block in interleaved format
+ *
+ * @param[in] pred_strd,
+ * Prediction buffer stride in interleaved format
+ *
+ * @param[in] out_strd
+ * recon buffer Stride
+ *
+ * @returns none
+ *
+ * @remarks none
+ *
+ *******************************************************************************
+ */
+
+void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_out,
+ WORD32 pred_strd,
+ WORD32 out_strd,
+ const UWORD16 *pu2_iscal_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 u4_qp_div_6,
+ WORD16 *pi2_tmp,
+ WORD16 *pi2_dc_src)
+{
+ UWORD8 *pu1_pred_ptr = pu1_pred;
+ UWORD8 *pu1_out_ptr = pu1_out;
+ WORD32 q0;
+ WORD16 x, i_macro, i;
+ UNUSED(pi2_src);
+ UNUSED(pu2_iscal_mat);
+ UNUSED(pu2_weigh_mat);
+ UNUSED(u4_qp_div_6);
+ UNUSED(pi2_tmp);
+
+ q0 = pi2_dc_src[0]; // Restoring dc value for intra case3
+ i_macro = ((q0 + 32) >> 6);
+
+ for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
+ {
+ pu1_pred_ptr = pu1_pred;
+ pu1_out = pu1_out_ptr;
+
+ /* inverse prediction */
+ x = i_macro + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(x);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ x = i_macro + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(x);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ x = i_macro + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(x);
+ pu1_pred_ptr += pred_strd;
+ pu1_out += out_strd;
+
+ x = i_macro + *pu1_pred_ptr;
+ *pu1_out = CLIP_U8(x);
+
+ pu1_out_ptr+=2;
+ pu1_pred+=2;
+ }
+}
diff --git a/common/ih264_itrans_recon.h b/common/ih264_itrans_recon.h
new file mode 100755
index 0000000..fd1f239
--- /dev/null
+++ b/common/ih264_itrans_recon.h
@@ -0,0 +1,71 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_itrans_recon.h
+*
+* @brief
+* Contains function declarations for inverse transform and reconstruction of
+* the quantized macro blocks
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+* - ih264_itrans_recon_ft
+* - ih264_itrans_recon_4x4
+* - ih264_itrans_recon_8x8
+* - ih264_itrans_recon_4x4_a9
+*
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
#ifndef IH264_ITRANS_RECON_H_
#define IH264_ITRANS_RECON_H_

/*****************************************************************************/
/* Extern Function Declarations                                              */
/*****************************************************************************/

/* Function type shared by all inverse-transform + reconstruction kernels.
 * pi2_src   : transform coefficients of the sub block
 * pu1_pred  : prediction samples
 * pu1_recon : output reconstructed samples
 * src_strd / pred_strd / dst_strd : strides of the three buffers
 * q_lev     : presumably the quantizer level used for scaling -- TODO confirm
 * pi4_tmp   : scratch buffer for the intermediate 1-D transform */
typedef void ih264_itrans_recon_ft(WORD16 *pi2_src,
                                   UWORD8 *pu1_pred,
                                   UWORD8 *pu1_recon,
                                   WORD32 src_strd,
                                   WORD32 pred_strd,
                                   WORD32 dst_strd,
                                   UWORD32 q_lev,
                                   WORD32 *pi4_tmp);

/*C declarations*/

ih264_itrans_recon_ft ih264_itrans_recon_4x4;

ih264_itrans_recon_ft ih264_itrans_recon_8x8;

/*A9 declarations (ARM Cortex-A9 assembly implementations) */

ih264_itrans_recon_ft ih264_itrans_recon_4x4_a9;

#endif /* IH264_ITRANS_RECON_H_ */
diff --git a/common/ih264_list.c b/common/ih264_list.c
new file mode 100755
index 0000000..736b41c
--- /dev/null
+++ b/common/ih264_list.c
@@ -0,0 +1,574 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_list.c
+*
+* @brief
+* Contains functions for buf queue
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+* ih264_list_size()
+* ih264_list_lock()
+* ih264_list_unlock()
+* ih264_list_yield()
+* ih264_list_free()
+* ih264_list_init()
+* ih264_list_reset()
+* ih264_list_deinit()
+* ih264_list_terminate()
+* ih264_list_queue()
+* ih264_list_dequeue()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ih264_typedefs.h"
+#include "ithread.h"
+#include "ih264_platform_macros.h"
+#include "ih264_macros.h"
+#include "ih264_debug.h"
+#include "ih264_error.h"
+#include "ih264_list.h"
+
+/**
+*******************************************************************************
+*
+* @brief Returns size for buf queue context. Does not include buf queue buffer
+* requirements
+*
+* @par Description
+* Returns size for buf queue context. Does not include buf queue buffer
+* requirements. Buffer size required to store the bufs should be allocated in
+* addition to the value returned here.
+*
+* @returns Size of the buf queue context
+*
+* @remarks
+*
+*******************************************************************************
+*/
WORD32 ih264_list_size(WORD32 num_entries, WORD32 entry_size)
{
    WORD32 size;
    WORD32 clz;
    /* Context struct plus the platform mutex it embeds */
    size = sizeof(list_t);
    size += ithread_get_mutex_lock_size();

    /* Use next power of two number of entries*/
    /* NOTE(review): 1 << (32 - CLZ(n)) yields the power of two STRICTLY
     * above n even when n is already a power of two (e.g. 8 -> 16), so
     * sizing is conservative; confirm callers round the same way before
     * calling ih264_list_init(), which asserts a power-of-two count */
    clz = CLZ(num_entries);
    num_entries = 1 << (32 - clz);

    size += num_entries * entry_size;
    return size;
}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Locks the list context
+*
+* @par Description
+* Locks the list context by calling ithread_mutex_lock()
+*
+* @param[in] ps_list
+* Job Queue context
+*
+* @returns IH264_FAIL if mutex lock fails else IH264_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IH264_ERROR_T ih264_list_lock(list_t *ps_list)
+{
+ WORD32 retval;
+ retval = ithread_mutex_lock(ps_list->pv_mutex);
+ if(retval)
+ {
+ return IH264_FAIL;
+ }
+ return IH264_SUCCESS;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Unlocks the list context
+*
+* @par Description
+* Unlocks the list context by calling ithread_mutex_unlock()
+*
+* @param[in] ps_list
+* Job Queue context
+*
+* @returns IH264_FAIL if mutex unlock fails else IH264_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+
+IH264_ERROR_T ih264_list_unlock(list_t *ps_list)
+{
+ WORD32 retval;
+ retval = ithread_mutex_unlock(ps_list->pv_mutex);
+ if(retval)
+ {
+ return IH264_FAIL;
+ }
+ return IH264_SUCCESS;
+
+}
+/**
+*******************************************************************************
+*
+* @brief
+* Yields the thread
+*
+* @par Description
+* Unlocks the list context by calling
+* ih264_list_unlock(), ithread_yield() and then ih264_list_lock()
+* list is unlocked before to ensure the list can be accessed by other threads
+* If unlock is not done before calling yield then no other thread can access
+* the list functions and update list.
+*
+* @param[in] ps_list
+* Job Queue context
+*
+* @returns IH264_FAIL if mutex lock unlock or yield fails else IH264_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IH264_ERROR_T ih264_list_yield(list_t *ps_list)
+{
+
+ IH264_ERROR_T ret = IH264_SUCCESS;
+
+ IH264_ERROR_T rettmp;
+ rettmp = ih264_list_unlock(ps_list);
+ RETURN_IF((rettmp != IH264_SUCCESS), rettmp);
+
+ ithread_yield();
+
+ if(ps_list->i4_yeild_interval_us > 0)
+ ithread_usleep(ps_list->i4_yeild_interval_us);
+
+ rettmp = ih264_list_lock(ps_list);
+ RETURN_IF((rettmp != IH264_SUCCESS), rettmp);
+ return ret;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief free the buf queue pointers
+*
+* @par Description
+* Frees the list context
+*
+* @param[in] pv_buf
+* Memory for buf queue buffer and buf queue context
+*
+* @returns Pointer to buf queue context
+*
+* @remarks
+* Since it will be called only once by master thread this is not thread safe.
+*
+*******************************************************************************
+*/
+IH264_ERROR_T ih264_list_free(list_t *ps_list)
+{
+ WORD32 ret;
+ ret = ithread_mutex_destroy(ps_list->pv_mutex);
+
+ if(0 == ret)
+ return IH264_SUCCESS;
+ else
+ return IH264_FAIL;
+}
+
+/**
+*******************************************************************************
+*
+* @brief Initialize the buf queue
+*
+* @par Description
+* Initializes the list context and sets write and read pointers to start of
+* buf queue buffer
+*
+* @param[in] pv_buf
+* Memory for buf queue buffer and buf queue context
+*
+* @param[in] buf_size
+* Size of the total memory allocated
+*
+* @returns Pointer to buf queue context
+*
+* @remarks
+* Since it will be called only once by master thread this is not thread safe.
+*
+*******************************************************************************
+*/
void* ih264_list_init(void *pv_buf,
                      WORD32 buf_size,
                      WORD32 num_entries,
                      WORD32 entry_size,
                      WORD32 yeild_interval_us)
{
    list_t *ps_list;
    UWORD8 *pu1_buf;

    pu1_buf = (UWORD8 *)pv_buf;

    /* Carve the caller's buffer: context struct first, then the mutex,
     * then the entry storage */
    ps_list = (list_t *)pu1_buf;
    pu1_buf += sizeof(list_t);
    buf_size -= sizeof(list_t);

    ps_list->pv_mutex = pu1_buf;
    pu1_buf += ithread_get_mutex_lock_size();
    buf_size -= ithread_get_mutex_lock_size();

    /* Not enough room even for the bookkeeping */
    if (buf_size <= 0)
        return NULL;

    ithread_mutex_init(ps_list->pv_mutex);

    /* Ensure num_entries is power of two */
    ASSERT(0 == (num_entries & (num_entries - 1)));

    /* Ensure remaining buffer is large enough to hold given number of entries */
    ASSERT((num_entries * entry_size) <= buf_size);

    ps_list->pv_buf_base = pu1_buf;
    ps_list->i4_terminate = 0;
    ps_list->i4_entry_size = entry_size;
    ps_list->i4_buf_rd_idx = 0;
    ps_list->i4_buf_wr_idx = 0;
    /* NOTE(review): for power-of-two n, 32 - CLZ(n) is log2(n) + 1
     * (e.g. n = 8 gives 4, not 3); the field is documented as
     * Log2(buf_max_idx) and appears unused here -- confirm intent */
    ps_list->i4_log2_buf_max_idx = 32 - CLZ(num_entries);
    ps_list->i4_buf_max_idx = num_entries;
    ps_list->i4_yeild_interval_us = yeild_interval_us;

    return ps_list;
}
+/**
+*******************************************************************************
+*
+* @brief
+* Resets the list context
+*
+* @par Description
+* Resets the list context by initializing buf queue context elements
+*
+* @param[in] ps_list
+* Job Queue context
+*
+* @returns IH264_FAIL if lock unlock fails else IH264_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IH264_ERROR_T ih264_list_reset(list_t *ps_list)
+{
+ IH264_ERROR_T ret = IH264_SUCCESS;
+ ret = ih264_list_lock(ps_list);
+ RETURN_IF((ret != IH264_SUCCESS), ret);
+
+ ps_list->i4_terminate = 0;
+ ps_list->i4_buf_rd_idx = 0;
+ ps_list->i4_buf_wr_idx = 0;
+
+ ret = ih264_list_unlock(ps_list);
+ RETURN_IF((ret != IH264_SUCCESS), ret);
+
+ return ret;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Deinitializes the list context
+*
+* @par Description
+* Deinitializes the list context by calling ih264_list_reset()
+* and then destroying the mutex created
+*
+* @param[in] ps_list
+* Job Queue context
+*
+* @returns IH264_FAIL if lock unlock fails else IH264_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IH264_ERROR_T ih264_list_deinit(list_t *ps_list)
+{
+ WORD32 retval;
+ IH264_ERROR_T ret = IH264_SUCCESS;
+
+ ret = ih264_list_reset(ps_list);
+ RETURN_IF((ret != IH264_SUCCESS), ret);
+
+ retval = ithread_mutex_destroy(ps_list->pv_mutex);
+ if(retval)
+ {
+ return IH264_FAIL;
+ }
+
+ return IH264_SUCCESS;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Terminates the list
+*
+* @par Description
+* Terminates the list by setting a flag in context.
+*
+* @param[in] ps_list
+* Job Queue context
+*
+* @returns IH264_FAIL if lock unlock fails else IH264_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+
+IH264_ERROR_T ih264_list_terminate(list_t *ps_list)
+{
+ IH264_ERROR_T ret = IH264_SUCCESS;
+ ret = ih264_list_lock(ps_list);
+ RETURN_IF((ret != IH264_SUCCESS), ret);
+
+ ps_list->i4_terminate = 1;
+
+ ret = ih264_list_unlock(ps_list);
+ RETURN_IF((ret != IH264_SUCCESS), ret);
+ return ret;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief Adds a buf to the queue
+*
+* @par Description
+* Adds a buf to the queue and updates wr address to next location.
+* Format/content of the buf structure is abstracted and hence size of the buf
+* buffer is being passed.
+*
+* @param[in] ps_list
+* Job Queue context
+*
+* @param[in] pv_buf
+* Pointer to the location that contains details of the buf to be added
+*
+* @param[in] buf_size
+* Size of the buf buffer
+*
+* @param[in] blocking
+* To signal if the write is blocking or non-blocking.
+*
+* @returns
+*
+* @remarks
+* Job Queue buffer is assumed to be allocated to handle worst case number of bufs
+* Wrap around is not supported
+*
+*******************************************************************************
+*/
IH264_ERROR_T ih264_list_queue(list_t *ps_list, void *pv_buf, WORD32 blocking)
{
    IH264_ERROR_T ret = IH264_SUCCESS;
    IH264_ERROR_T rettmp;

    WORD32 diff;
    void *pv_buf_wr;

    volatile WORD32 *pi4_wr_idx, *pi4_rd_idx;
    WORD32 buf_size = ps_list->i4_entry_size;


    /* All index reads/writes below happen under the list mutex */
    rettmp = ih264_list_lock(ps_list);
    RETURN_IF((rettmp != IH264_SUCCESS), rettmp);



    while(1)
    {
        /* Ensure wr idx does not go beyond rd idx by more than number of entries
         */
        /* Indices grow monotonically; (wr - rd) is the current fill level */
        pi4_wr_idx = &ps_list->i4_buf_wr_idx;
        pi4_rd_idx = &ps_list->i4_buf_rd_idx;
        diff = *pi4_wr_idx - *pi4_rd_idx;

        if(diff < ps_list->i4_buf_max_idx)
        {
            WORD32 wr_idx;
            /* Mask works because i4_buf_max_idx is a power of two
             * (asserted in ih264_list_init) */
            wr_idx = ps_list->i4_buf_wr_idx & (ps_list->i4_buf_max_idx - 1);
            pv_buf_wr = (UWORD8 *)ps_list->pv_buf_base + wr_idx * buf_size;

            /* Entry is copied by value into the ring buffer */
            memcpy(pv_buf_wr, pv_buf, buf_size);
            ps_list->i4_buf_wr_idx++;
            break;
        }
        else
        {
            /* wr is ahead, so wait for rd to consume */
            /* ih264_list_yield releases and re-acquires the mutex */
            if(blocking)
            {
                ih264_list_yield(ps_list);
            }
            else
            {
                ret = IH264_FAIL;
                break;
            }
        }

    }
    /* NOTE(review): terminate is cleared unconditionally on every queue
     * (even a failed non-blocking one) -- queuing new work presumably
     * revokes a prior ih264_list_terminate(); confirm with callers */
    ps_list->i4_terminate = 0;

    rettmp = ih264_list_unlock(ps_list);
    RETURN_IF((rettmp != IH264_SUCCESS), rettmp);

    return ret;
}
+/**
+*******************************************************************************
+*
+* @brief Gets next from the Job queue
+*
+* @par Description
+* Gets next buf from the buf queue and updates rd address to next location.
+* Format/content of the buf structure is abstracted and hence size of the buf
+* buffer is being passed. If it is a blocking call and if there is no new buf
+* then this functions unlocks the mutex and calls yield and then locks it back.
+* and continues till a buf is available or terminate is set
+*
+* @param[in] ps_list
+* Job Queue context
+*
+* @param[out] pv_buf
+* Pointer to the location that contains details of the buf to be written
+*
+* @param[in] buf_size
+* Size of the buf buffer
+*
+* @param[in] blocking
+* To signal if the read is blocking or non-blocking.
+*
+* @returns
+*
+* @remarks
+* Job Queue buffer is assumed to be allocated to handle worst case number of bufs
+* Wrap around is not supported
+*
+*******************************************************************************
+*/
IH264_ERROR_T ih264_list_dequeue(list_t *ps_list, void *pv_buf, WORD32 blocking)
{
    IH264_ERROR_T ret = IH264_SUCCESS;
    IH264_ERROR_T rettmp;
    WORD32 buf_size = ps_list->i4_entry_size;
    WORD32 diff;

    void *pv_buf_rd;
    volatile WORD32 *pi4_wr_idx, *pi4_rd_idx;

    /* All index reads/writes below happen under the list mutex */
    rettmp = ih264_list_lock(ps_list);
    RETURN_IF((rettmp != IH264_SUCCESS), rettmp);

    while(1)
    {
        /* Ensure wr idx is ahead of rd idx and
         * wr idx does not go beyond rd idx by more than number of entries
         */
        /* Indices grow monotonically; (wr - rd) > 0 means data is pending */
        pi4_wr_idx = &ps_list->i4_buf_wr_idx;
        pi4_rd_idx = &ps_list->i4_buf_rd_idx;
        diff = *pi4_wr_idx - *pi4_rd_idx;


        if(diff > 0)
        {
            WORD32 rd_idx;
            /* Mask works because i4_buf_max_idx is a power of two
             * (asserted in ih264_list_init) */
            rd_idx = ps_list->i4_buf_rd_idx & (ps_list->i4_buf_max_idx - 1);
            pv_buf_rd = (UWORD8 *)ps_list->pv_buf_base + rd_idx * buf_size;

            /* Entry is copied by value out of the ring buffer */
            memcpy(pv_buf, pv_buf_rd, buf_size);
            ps_list->i4_buf_rd_idx++;
            break;
        }
        else
        {
            /* If terminate is signaled then break */
            /* Empty queue + terminate: report failure so the consumer
             * thread can exit its loop */
            if(ps_list->i4_terminate)
            {
                ret = IH264_FAIL;
                break;
            }
            /* wr is ahead, so wait for rd to consume */
            /* ih264_list_yield releases and re-acquires the mutex */
            if(blocking)
            {
                ih264_list_yield(ps_list);
            }
            else
            {
                ret = IH264_FAIL;
                break;
            }
        }

    }


    rettmp = ih264_list_unlock(ps_list);
    RETURN_IF((rettmp != IH264_SUCCESS), rettmp);

    return ret;
}
diff --git a/common/ih264_list.h b/common/ih264_list.h
new file mode 100755
index 0000000..fc59d95
--- /dev/null
+++ b/common/ih264_list.h
@@ -0,0 +1,93 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_list.h
+*
+* @brief
+* Contains functions for buf queue
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IH264_LIST_H_
+#define _IH264_LIST_H_
+
+typedef struct
+{
+ /** Pointer to buffer base which contains the bufs */
+ void *pv_buf_base;
+
+ /** Mutex used to keep the functions thread-safe */
+ void *pv_mutex;
+
+ /** Current write index */
+ volatile WORD32 i4_buf_wr_idx;
+
+ /** Current read index */
+ volatile WORD32 i4_buf_rd_idx;
+
+ /** Maximum index */
+ WORD32 i4_buf_max_idx;
+
+ /** Log2(buf_max_idx) -
+ * To ensure number of entries is power of two
+ * This makes it easier to wrap around by using AND with buf_max_idx - 1
+ * */
+ WORD32 i4_log2_buf_max_idx;
+
+ /** Flag to indicate list has to be terminated */
+ WORD32 i4_terminate;
+
+ /** Size of each entry */
+ WORD32 i4_entry_size;
+
+ /** If the list is to be used frequently send this as zero, else send a large value
+ * to ensure cores are not loaded unnecessarily.
+ * For eg: For picture level queues this can be a large value like 100us
+ * but for jobq this will be zero.
+ */
+ WORD32 i4_yeild_interval_us;
+
+}list_t;
+
+WORD32 ih264_list_size(WORD32 num_entries, WORD32 entry_size);
+void* ih264_list_init(void *pv_buf,
+ WORD32 buf_size,
+ WORD32 num_entries,
+ WORD32 entry_size,
+ WORD32 yeild_interval_us);
+IH264_ERROR_T ih264_list_free(list_t *ps_list);
+IH264_ERROR_T ih264_list_reset(list_t *ps_list);
+IH264_ERROR_T ih264_list_deinit(list_t *ps_list);
+IH264_ERROR_T ih264_list_terminate(list_t *ps_list);
+IH264_ERROR_T ih264_list_queue(list_t *ps_list, void *pv_buf, WORD32 blocking);
+IH264_ERROR_T ih264_list_dequeue(list_t *ps_list, void *pv_buf, WORD32 blocking);
+
+#endif /* _IH264_PROCESS_SLICE_H_ */
diff --git a/common/ih264_luma_intra_pred_filters.c b/common/ih264_luma_intra_pred_filters.c
new file mode 100755
index 0000000..4a5b143
--- /dev/null
+++ b/common/ih264_luma_intra_pred_filters.c
@@ -0,0 +1,1933 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_luma_intra_pred_filters.c
+ *
+ * @brief
+ * Contains function definitions for intra prediction filters
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ * - ih264_intra_pred_luma_4x4_mode_vert
+ * - ih264_intra_pred_luma_4x4_mode_horz
+ * - ih264_intra_pred_luma_4x4_mode_dc
+ * - ih264_intra_pred_luma_4x4_mode_diag_dl
+ * - ih264_intra_pred_luma_4x4_mode_diag_dr
+ * - ih264_intra_pred_luma_4x4_mode_vert_r
+ * - ih264_intra_pred_luma_4x4_mode_horz_d
+ * - ih264_intra_pred_luma_4x4_mode_vert_l
+ * - ih264_intra_pred_luma_4x4_mode_horz_u
+ * - ih264_intra_pred_luma_8x8_mode_ref_filtering
+ * - ih264_intra_pred_luma_8x8_mode_vert
+ * - ih264_intra_pred_luma_8x8_mode_horz
+ * - ih264_intra_pred_luma_8x8_mode_dc
+ * - ih264_intra_pred_luma_8x8_mode_diag_dl
+ * - ih264_intra_pred_luma_8x8_mode_diag_dr
+ * - ih264_intra_pred_luma_8x8_mode_vert_r
+ * - ih264_intra_pred_luma_8x8_mode_horz_d
+ * - ih264_intra_pred_luma_8x8_mode_vert_l
+ * - ih264_intra_pred_luma_8x8_mode_horz_u
+ * - ih264_intra_pred_luma_16x16_mode_vert
+ * - ih264_intra_pred_luma_16x16_mode_horz
+ * - ih264_intra_pred_luma_16x16_mode_dc
+ * - ih264_intra_pred_luma_16x16_mode_plane
+ *
+ *
+ * @remarks
+ * None
+ *
+ ******************************************************************************
+ */
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+/* System include files */
+#include <stdio.h>
+#include <stddef.h>
+#include <string.h>
+
+/* User include files */
+#include "ih264_defs.h"
+#include "ih264_typedefs.h"
+#include "ih264_macros.h"
+#include "ih264_platform_macros.h"
+#include "ih264_intra_pred_filters.h"
+
+/* Global variables used only in assembly files*/
+/* Multiplier constants 1..16 consumed by the ARM plane-mode kernels */
+const WORD8 ih264_gai1_intrapred_luma_plane_coeffs[] =
+{ 0x01, 0x02, 0x03, 0x04,
+ 0x05, 0x06, 0x07, 0x08,
+ 0x09, 0x0A, 0x0B, 0x0C,
+ 0x0D, 0x0E, 0x0F, 0x10, };
+
+/* Byte-shuffle indices for the 8x8 horizontal-up mode (presumably vtbl
+   lane selectors for the a9q assembly -- confirm against the .s file) */
+const WORD8 ih264_gai1_intrapred_luma_8x8_horz_u[] =
+{ 0x06,0x15,0x05,0x14,
+ 0x04,0x13,0x03,0x12,
+ 0x02,0x11,0x01,0x10,
+ 0x00,0x1F,0x0F,0x0F
+};
+
+/******************* LUMA INTRAPREDICTION *******************/
+
+/******************* 4x4 Modes *******************/
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_4x4_mode_vert
+ *
+ * @brief
+ * Perform Intra prediction for luma_4x4 mode:vertical
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_4x4 mode:vertical ,described in sec 8.3.1.2.1
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+void ih264_intra_pred_luma_4x4_mode_vert(UWORD8 *pu1_src,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 src_strd,
+                                         WORD32 dst_strd,
+                                         WORD32 ngbr_avail)
+{
+    /* Top neighbours start one past the top-left sample in the source array */
+    UWORD8 *pu1_top = pu1_src + BLK_SIZE + 1;
+    WORD32 row;
+
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    /* Replicate the four top samples on every row of the 4x4 block */
+    for(row = 0; row < 4; row++)
+        memcpy(pu1_dst + row * dst_strd, pu1_top, 4);
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_4x4_mode_horz
+ *
+ * @brief
+ * Perform Intra prediction for luma_4x4 mode:horizontal
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_4x4 mode:horizontal ,described in sec 8.3.1.2.2
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+void ih264_intra_pred_luma_4x4_mode_horz(UWORD8 *pu1_src,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 src_strd,
+                                         WORD32 dst_strd,
+                                         WORD32 ngbr_avail)
+{
+    /* Left neighbours sit just below the top-left sample; rows 1..3 are at
+       decreasing addresses */
+    UWORD8 *pu1_left = pu1_src + BLK_SIZE - 1;
+    WORD32 row;
+
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    /* Each output row is a solid fill of the corresponding left neighbour */
+    for(row = 0; row < 4; row++)
+        memset(pu1_dst + row * dst_strd, *(pu1_left - row), 4);
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_4x4_mode_dc
+ *
+ * @brief
+ * Perform Intra prediction for luma_4x4 mode:DC
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_4x4 mode:DC ,described in sec 8.3.1.2.3
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_4x4_mode_dc(UWORD8 *pu1_src,
+                                       UWORD8 *pu1_dst,
+                                       WORD32 src_strd,
+                                       WORD32 dst_strd,
+                                       WORD32 ngbr_avail)
+{
+    UWORD8 u1_useleft; /* availability of left predictors (only for DC) */
+    UWORD8 u1_usetop; /* availability of top predictors (only for DC) */
+    UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
+    UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
+    WORD32 val = 0;
+    UNUSED(src_strd);
+    /* NOTE: unlike the other 4x4 modes, ngbr_avail IS consumed below, so the
+       contradictory UNUSED(ngbr_avail) marker has been removed */
+    u1_useleft = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK);
+    u1_usetop = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK);
+    pu1_top = pu1_src + BLK_SIZE + 1;
+    pu1_left = pu1_src + BLK_SIZE - 1;
+
+    /* Sum the available neighbours; each available side adds a rounding
+       term of 2 for the final shift */
+    if(u1_useleft)
+    {
+        val += *pu1_left--;
+        val += *pu1_left--;
+        val += *pu1_left--;
+        val += *pu1_left + 2;
+    }
+    if(u1_usetop)
+    {
+        val += *pu1_top + *(pu1_top + 1) + *(pu1_top + 2) + *(pu1_top + 3)
+                        + 2;
+    }
+    /* Since 2 is added if either left/top pred is there,
+       val still being zero implies both preds are not there */
+    val = (val) ? (val >> (1 + u1_useleft + u1_usetop)) : 128;
+
+    /* Fill the 4x4 block with the DC value */
+    memset(pu1_dst, val, 4);
+    memset(pu1_dst + dst_strd, val, 4);
+    memset(pu1_dst + 2 * dst_strd, val, 4);
+    memset(pu1_dst + 3 * dst_strd, val, 4);
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_4x4_mode_diag_dl
+ *
+ * @brief
+ * Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left ,described in sec 8.3.1.2.4
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_4x4_mode_diag_dl(UWORD8 *pu1_src,
+                                            UWORD8 *pu1_dst,
+                                            WORD32 src_strd,
+                                            WORD32 dst_strd,
+                                            WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_top = pu1_src + BLK_SIZE + 1; /* 8 top/top-right samples */
+    UWORD8 au1_pred[7];
+    WORD32 i;
+
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    /* pred[i] = 121-filter over top[i..i+2]; the last entry reuses the
+       final top sample as its own right neighbour */
+    for(i = 0; i < 6; i++)
+        au1_pred[i] = FILT121(pu1_top[i], pu1_top[i + 1], pu1_top[i + 2]);
+    au1_pred[6] = FILT121(pu1_top[6], pu1_top[7], pu1_top[7]);
+
+    /* Row r of the output is the 4-wide window starting at pred[r] */
+    for(i = 0; i < 4; i++)
+        memcpy(pu1_dst + i * dst_strd, au1_pred + i, 4);
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_4x4_mode_diag_dr
+ *
+ * @brief
+ * Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right ,described in sec 8.3.1.2.5
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_4x4_mode_diag_dr(UWORD8 *pu1_src,
+                                            UWORD8 *pu1_dst,
+                                            WORD32 src_strd,
+                                            WORD32 dst_strd,
+                                            WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_top = pu1_src + BLK_SIZE + 1;  /* A,B,C,D */
+    UWORD8 *pu1_left = pu1_src + BLK_SIZE - 1; /* I (J,K,L at -1,-2,-3) */
+    UWORD32 u4_t0, u4_t1, u4_t2, u4_t3;
+    UWORD32 u4_l0, u4_l1, u4_l2, u4_l3;
+    UWORD32 u4_tl;
+    UWORD8 au1_pred[7];
+    WORD32 row;
+
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    u4_t0 = pu1_top[0];
+    u4_t1 = pu1_top[1];
+    u4_t2 = pu1_top[2];
+    u4_t3 = pu1_top[3];
+    u4_l0 = pu1_left[0];
+    u4_l1 = pu1_left[-1];
+    u4_l2 = pu1_left[-2];
+    u4_l3 = pu1_left[-3];
+    u4_tl = pu1_src[BLK_SIZE];
+
+    /* One diagonal of 121-filtered samples running from the bottom left
+       neighbour up through the top-left corner to the top row */
+    au1_pred[0] = FILT121(u4_l3, u4_l2, u4_l1);
+    au1_pred[1] = FILT121(u4_l2, u4_l1, u4_l0);
+    au1_pred[2] = FILT121(u4_l1, u4_l0, u4_tl);
+    au1_pred[3] = FILT121(u4_l0, u4_tl, u4_t0);
+    au1_pred[4] = FILT121(u4_tl, u4_t0, u4_t1);
+    au1_pred[5] = FILT121(u4_t0, u4_t1, u4_t2);
+    au1_pred[6] = FILT121(u4_t1, u4_t2, u4_t3);
+
+    /* Each row starts one element earlier than the row above it */
+    for(row = 0; row < 4; row++)
+        memcpy(pu1_dst + row * dst_strd, au1_pred + 3 - row, 4);
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_4x4_mode_vert_r
+ *
+ * @brief
+ * Perform Intra prediction for luma_4x4 mode:Vertical_Right
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_4x4 mode:Vertical_Right ,described in sec 8.3.1.2.6
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_4x4_mode_vert_r(UWORD8 *pu1_src,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 src_strd,
+                                           WORD32 dst_strd,
+                                           WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_top = pu1_src + BLK_SIZE + 1;  /* A,B,C,D */
+    UWORD8 *pu1_left = pu1_src + BLK_SIZE - 1; /* I (J,K at -1,-2) */
+    UWORD32 u4_t0, u4_t1, u4_t2, u4_t3;
+    UWORD32 u4_l0, u4_l1, u4_l2;
+    UWORD32 u4_tl;
+    UWORD8 au1_pred[10];
+
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    u4_t0 = pu1_top[0];
+    u4_t1 = pu1_top[1];
+    u4_t2 = pu1_top[2];
+    u4_t3 = pu1_top[3];
+    u4_l0 = pu1_left[0];
+    u4_l1 = pu1_left[-1];
+    u4_l2 = pu1_left[-2];
+    u4_tl = pu1_src[BLK_SIZE];
+
+    /* pred[6..9]: half-pel (FILT11) taps for the even rows;
+       pred[1..4]: 121-filtered taps for the odd rows;
+       pred[0], pred[5]: filtered left samples prepended on rows 3 and 2 */
+    au1_pred[0] = FILT121(u4_l2, u4_l1, u4_l0);
+    au1_pred[1] = FILT121(u4_l0, u4_tl, u4_t0);
+    au1_pred[2] = FILT121(u4_tl, u4_t0, u4_t1);
+    au1_pred[3] = FILT121(u4_t0, u4_t1, u4_t2);
+    au1_pred[4] = FILT121(u4_t1, u4_t2, u4_t3);
+    au1_pred[5] = FILT121(u4_l1, u4_l0, u4_tl);
+    au1_pred[6] = FILT11(u4_tl, u4_t0);
+    au1_pred[7] = FILT11(u4_t0, u4_t1);
+    au1_pred[8] = FILT11(u4_t1, u4_t2);
+    au1_pred[9] = FILT11(u4_t2, u4_t3);
+
+    memcpy(pu1_dst, au1_pred + 6, 4);
+    memcpy(pu1_dst + dst_strd, au1_pred + 1, 4);
+    memcpy(pu1_dst + 2 * dst_strd, au1_pred + 5, 4);
+    memcpy(pu1_dst + 3 * dst_strd, au1_pred, 4);
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_4x4_mode_horz_d
+ *
+ * @brief
+ * Perform Intra prediction for luma_4x4 mode:Horizontal_Down
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_4x4 mode:Horizontal_Down ,described in sec 8.3.1.2.7
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_4x4_mode_horz_d(UWORD8 *pu1_src,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 src_strd,
+                                           WORD32 dst_strd,
+                                           WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_top = pu1_src + BLK_SIZE + 1;  /* A,B,C */
+    UWORD8 *pu1_left = pu1_src + BLK_SIZE - 1; /* I (J,K,L at -1,-2,-3) */
+    UWORD32 u4_t0, u4_t1, u4_t2;
+    UWORD32 u4_l0, u4_l1, u4_l2, u4_l3;
+    UWORD32 u4_tl;
+    UWORD8 au1_pred[10];
+    WORD32 row;
+
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    u4_t0 = pu1_top[0];
+    u4_t1 = pu1_top[1];
+    u4_t2 = pu1_top[2];
+    u4_l0 = pu1_left[0];
+    u4_l1 = pu1_left[-1];
+    u4_l2 = pu1_left[-2];
+    u4_l3 = pu1_left[-3];
+    u4_tl = pu1_src[BLK_SIZE];
+
+    /* Alternating half-pel (FILT11) and 121-filtered taps; each output row
+       is a 4-wide window stepping down this array two entries at a time */
+    au1_pred[0] = FILT11(u4_l3, u4_l2);
+    au1_pred[1] = FILT121(u4_l3, u4_l2, u4_l1);
+    au1_pred[2] = FILT11(u4_l2, u4_l1);
+    au1_pred[3] = FILT121(u4_l2, u4_l1, u4_l0);
+    au1_pred[4] = FILT11(u4_l1, u4_l0);
+    au1_pred[5] = FILT121(u4_l1, u4_l0, u4_tl);
+    au1_pred[6] = FILT11(u4_l0, u4_tl);
+    au1_pred[7] = FILT121(u4_l0, u4_tl, u4_t0);
+    au1_pred[8] = FILT121(u4_tl, u4_t0, u4_t1);
+    au1_pred[9] = FILT121(u4_t0, u4_t1, u4_t2);
+
+    for(row = 0; row < 4; row++)
+        memcpy(pu1_dst + row * dst_strd, au1_pred + 6 - 2 * row, 4);
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_4x4_mode_vert_l
+ *
+ * @brief
+ * Perform Intra prediction for luma_4x4 mode:Vertical_Left
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_4x4 mode:Vertical_Left ,described in sec 8.3.1.2.8
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_4x4_mode_vert_l(UWORD8 *pu1_src,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 src_strd,
+                                           WORD32 dst_strd,
+                                           WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_top = pu1_src + BLK_SIZE + 1; /* A..G */
+    UWORD8 au1_pred[10];
+    WORD32 i;
+
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    /* pred[5..9]: half-pel (FILT11) row, pred[0..4]: 121-filtered row;
+       output rows alternate between the two, shifting right by one */
+    for(i = 0; i < 5; i++)
+    {
+        au1_pred[i] = FILT121(pu1_top[i], pu1_top[i + 1], pu1_top[i + 2]);
+        au1_pred[5 + i] = FILT11(pu1_top[i], pu1_top[i + 1]);
+    }
+
+    memcpy(pu1_dst, au1_pred + 5, 4);
+    memcpy(pu1_dst + dst_strd, au1_pred, 4);
+    memcpy(pu1_dst + 2 * dst_strd, au1_pred + 6, 4);
+    memcpy(pu1_dst + 3 * dst_strd, au1_pred + 1, 4);
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_4x4_mode_horz_u
+ *
+ * @brief
+ * Perform Intra prediction for luma_4x4 mode:Horizontal_Up
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_4x4 mode:Horizontal_Up ,described in sec 8.3.1.2.9
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_4x4_mode_horz_u(UWORD8 *pu1_src,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 src_strd,
+                                           WORD32 dst_strd,
+                                           WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_left = pu1_src + BLK_SIZE - 1; /* I (J,K,L at -1,-2,-3) */
+    UWORD32 u4_l0, u4_l1, u4_l2, u4_l3;
+    UWORD8 au1_pred[10];
+    WORD32 row;
+
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    u4_l0 = pu1_left[0];
+    u4_l1 = pu1_left[-1];
+    u4_l2 = pu1_left[-2];
+    u4_l3 = pu1_left[-3];
+
+    /* Interpolate downwards between the left neighbours; everything past
+       the last left sample is clamped to it */
+    au1_pred[0] = FILT11(u4_l1, u4_l0);
+    au1_pred[1] = FILT121(u4_l2, u4_l1, u4_l0);
+    au1_pred[2] = FILT11(u4_l2, u4_l1);
+    au1_pred[3] = FILT121(u4_l3, u4_l2, u4_l1);
+    au1_pred[4] = FILT11(u4_l3, u4_l2);
+    au1_pred[5] = FILT121(u4_l3, u4_l3, u4_l2);
+    memset(au1_pred + 6, u4_l3, 4);
+
+    /* Row r of the output is the 4-wide window starting at pred[2*r] */
+    for(row = 0; row < 4; row++)
+        memcpy(pu1_dst + row * dst_strd, au1_pred + 2 * row, 4);
+}
+
+/******************* 8x8 Modes *******************/
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_8x8_mode_ref_filtering
+ *
+ * @brief
+ * Reference sample filtering process for Intra_8x8 sample prediction
+ *
+ * @par Description:
+ * Perform Reference sample filtering process for Intra_8x8 sample prediction ,described in sec 8.3.2.2.1
+ *
+ * @param[in] pu1_left
+ * UWORD8 pointer to the left neighbour samples (stride left_strd)
+ *
+ * @param[in] pu1_topleft
+ * UWORD8 pointer to the top-left neighbour sample (NULL if not available)
+ *
+ * @param[in] pu1_top
+ * UWORD8 pointer to the top/top-right neighbour samples
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination for the filtered reference samples
+ *
+ * @param[in] left_strd
+ * integer stride between consecutive left neighbour samples
+ *
+ * @param[in] ngbr_avail
+ * availability flags of the neighbouring pixels
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_8x8_mode_ref_filtering(UWORD8 *pu1_left,
+                                                  UWORD8 *pu1_topleft,
+                                                  UWORD8 *pu1_top,
+                                                  UWORD8 *pu1_dst,
+                                                  WORD32 left_strd,
+                                                  WORD32 ngbr_avail)
+{
+    WORD32 top_avail, left_avail, top_left_avail, top_right_avail;
+
+    left_avail = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK);
+    top_avail = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK);
+    top_left_avail = BOOLEAN(ngbr_avail & TOP_LEFT_MB_AVAILABLE_MASK);
+    top_right_avail = BOOLEAN(ngbr_avail & TOP_RIGHT_MB_AVAILABLE_MASK);
+
+    /* Output layout in pu1_dst: [0..7] = filtered left neighbours (bottom
+       row first), [8] = filtered top-left, [9..24] = filtered top and
+       top-right neighbours */
+    if(top_avail)
+    {
+        WORD32 i;
+        UWORD32 u4_xm1;
+
+        /* Stage the raw top-right samples at pu1_dst[17..24]; if they are
+           unavailable, substitute by replicating the last top sample */
+        if(!top_right_avail)
+        {
+            memset(pu1_dst + 8 + 1 + 8, pu1_top[7], 8);
+            top_right_avail = 1;
+        }
+        else
+        {
+            memcpy(pu1_dst + 8 + 1 + 8, pu1_top + 8, 8);
+        }
+
+        /* First top sample: 121 filter with the top-left neighbour when it
+           exists, else a 3:1 blend with its right neighbour */
+        if(top_left_avail)
+        {
+            pu1_dst[8 + 1 + 0] = FILT121((*pu1_topleft), pu1_top[0],
+                                         pu1_top[1]);
+
+        }
+        else
+        {
+            pu1_dst[8 + 1] = ((3 * pu1_top[0]) + pu1_top[1] + 2) >> 2;
+        }
+
+        /* Interior top samples: straight 121 filter over the raw top row */
+        for(i = 1; i <= 6; i++)
+        {
+            pu1_dst[8 + 1 + i] = FILT121(pu1_top[i - 1], pu1_top[i],
+                                         pu1_top[i + 1]);
+
+        }
+        /* First byte of Top Right input is in pu1_dst[8 + 1 + 8]*/
+        pu1_dst[8 + 1 + 7] = FILT121(pu1_top[6], pu1_top[7],
+                                     pu1_dst[8 + 1 + 8]);
+
+        /* filtered output and source in same buf, to prevent output(x - 1)
+           being over written in process */
+        u4_xm1 = pu1_top[7];
+
+        /* In-place 121 filter over the staged top-right samples; u4_xm1
+           carries the UNfiltered left neighbour across iterations */
+        for(i = 8; i <= 14; i++)
+        {
+            UWORD32 u4_x;
+            u4_x = (u4_xm1 + (pu1_dst[8 + 1 + i] << 1) + pu1_dst[8 + 1 + i + 1]
+                            + 2) >> 2;
+            /* assigning u4_xm1 from the un-filtered values for the next iteration */
+            u4_xm1 = pu1_dst[8 + 1 + i];
+            pu1_dst[8 + 1 + i] = u4_x;
+        }
+
+        /* Last sample: 1:3 blend with its (unfiltered) left neighbour */
+        pu1_dst[8 + 1 + 15] = (u4_xm1 + (3 * pu1_dst[8 + 1 + 15]) + 2) >> 2;
+
+    }
+
+    /* pu1_topleft is overloaded. It is both: */
+    /* a. A pointer for the top left pixel */
+    /* b. An indicator of availability of top left. */
+    /* If it is null then top left not available */
+    if(top_left_avail)
+    {
+        /* Top-left gets a 3-tap filter when both sides exist, otherwise a
+           3:1 blend towards whichever side is present */
+        if((!top_avail) || (!left_avail))
+        {
+            if(top_avail)
+                pu1_dst[8] = (3 * pu1_topleft[0] + pu1_top[0] + 2) >> 2;
+            else if(left_avail)
+                pu1_dst[8] = (3 * pu1_topleft[0] + pu1_left[0] + 2) >> 2;
+        }
+        else
+        {
+            pu1_dst[8] = FILT121(pu1_top[0], (*pu1_topleft), pu1_left[0]);
+        }
+    }
+
+    if(left_avail)
+    {
+        UWORD32 idx;
+        /* NOTE(review): this is a pointer NULL check (see the "overloaded"
+           comment above) -- presumably callers pass NULL for pu1_topleft
+           when top-left is absent; confirm against callers */
+        if(0 != pu1_topleft)
+        {
+            pu1_dst[7] = FILT121((*pu1_topleft), pu1_left[0],
+                                 pu1_left[left_strd]);
+        }
+        else
+        {
+            pu1_dst[7] = ((3 * pu1_left[0]) + pu1_left[left_strd] + 2) >> 2;
+        }
+
+        /* 121 filter down the left column (stored bottom-up in pu1_dst) */
+        for(idx = 1; idx <= 6; idx++)
+        {
+            pu1_dst[7 - idx] = FILT121(pu1_left[(idx - 1) * left_strd],
+                                       pu1_left[idx * left_strd],
+                                       pu1_left[(idx + 1) * left_strd]);
+
+        }
+        /* Bottom-most left sample: 1:3 blend with its upper neighbour */
+        pu1_dst[0] = (pu1_left[6 * left_strd] + 3 * pu1_left[7 * left_strd] + 2)
+                        >> 2;
+
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_8x8_mode_vert
+ *
+ * @brief
+ * Perform Intra prediction for luma_8x8 mode:vertical
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_8x8 mode:vertical ,described in sec 8.3.2.2.2
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+void ih264_intra_pred_luma_8x8_mode_vert(UWORD8 *pu1_src,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 src_strd,
+                                         WORD32 dst_strd,
+                                         WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_top = pu1_src + BLK8x8SIZE + 1;
+    WORD32 row;
+
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    /* Replicate the eight top samples on every row of the 8x8 block */
+    for(row = 0; row < 8; row++)
+        memcpy(pu1_dst + row * dst_strd, pu1_top, 8);
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_8x8_mode_horz
+ *
+ * @brief
+ * Perform Intra prediction for luma_8x8 mode:horizontal
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_8x8 mode:horizontal ,described in sec 8.3.2.2.2
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ih264_intra_pred_luma_8x8_mode_horz(UWORD8 *pu1_src,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 src_strd,
+                                         WORD32 dst_strd,
+                                         WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_left = pu1_src + BLK8x8SIZE - 1;
+    WORD32 row;
+
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    /* Each output row is a solid fill of the corresponding left neighbour */
+    for(row = 0; row < 8; row++)
+        memset(pu1_dst + row * dst_strd, *(pu1_left - row), 8);
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_8x8_mode_dc
+ *
+ * @brief
+ * Perform Intra prediction for luma_8x8 mode:DC
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_8x8 mode:DC ,described in sec 8.3.2.2.4
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_8x8_mode_dc(UWORD8 *pu1_src,
+                                       UWORD8 *pu1_dst,
+                                       WORD32 src_strd,
+                                       WORD32 dst_strd,
+                                       WORD32 ngbr_avail)
+{
+    UWORD8 u1_left_ok;       /* left neighbours usable for the DC sum */
+    UWORD8 u1_top_ok;        /* top neighbours usable for the DC sum */
+    UWORD8 *pu1_left = NULL;
+    UWORD8 *pu1_top = NULL;
+    WORD32 i;
+    WORD32 sum = 0;
+    UNUSED(src_strd);
+
+    u1_left_ok = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK);
+    u1_top_ok = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK);
+    pu1_top = pu1_src + BLK8x8SIZE + 1;
+    pu1_left = pu1_src + BLK8x8SIZE - 1;
+
+    /* Accumulate the available neighbours; each available side contributes
+       its own rounding term of 4 for the final shift */
+    if(u1_left_ok)
+    {
+        for(i = 0; i < BLK8x8SIZE; i++)
+            sum += pu1_left[-i];
+        sum += 4;
+    }
+    if(u1_top_ok)
+    {
+        for(i = 0; i < BLK8x8SIZE; i++)
+            sum += pu1_top[i];
+        sum += 4;
+    }
+
+    /* A zero sum means neither side was available -> mid-grey 128 */
+    sum = (sum) ? (sum >> (2 + u1_left_ok + u1_top_ok)) : 128;
+
+    /* Fill the 8x8 block with the DC value */
+    for(i = 0; i < BLK8x8SIZE; i++)
+        memset(pu1_dst + i * dst_strd, sum, 8);
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_8x8_mode_diag_dl
+ *
+ * @brief
+ * Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left ,described in sec 8.3.2.2.5
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_8x8_mode_diag_dl(UWORD8 *pu1_src,
+                                            UWORD8 *pu1_dst,
+                                            WORD32 src_strd,
+                                            WORD32 dst_strd,
+                                            WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_top = pu1_src + BLK8x8SIZE + 1; /* 16 top/top-right samples */
+    UWORD8 au1_pred[15];
+    WORD32 i;
+
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    /* pred[i] = 121-filter over top[i..i+2]; the final entry reuses the
+       last top sample as its own right neighbour */
+    for(i = 0; i < 14; i++)
+        au1_pred[i] = FILT121(pu1_top[i], pu1_top[i + 1], pu1_top[i + 2]);
+    au1_pred[14] = FILT121(pu1_top[14], pu1_top[15], pu1_top[15]);
+
+    /* Row r of the output is the 8-wide window starting at pred[r] */
+    for(i = 0; i < 8; i++)
+        memcpy(pu1_dst + i * dst_strd, au1_pred + i, 8);
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_8x8_mode_diag_dr
+ *
+ * @brief
+ * Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right ,described in sec 8.3.2.2.6
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_8x8_mode_diag_dr(UWORD8 *pu1_src,
+                                            UWORD8 *pu1_dst,
+                                            WORD32 src_strd,
+                                            WORD32 dst_strd,
+                                            WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_left;     /* Pointer to the first (topmost) left predictor */
+    UWORD8 *pu1_top;      /* Pointer to the first top predictor */
+    UWORD32 ui4_topleft;  /* Top-left corner sample */
+    UWORD32 au4_top[8];   /* Top predictors T0..T7, left to right */
+    UWORD32 au4_left[8];  /* Left predictors L0..L7, top to bottom */
+    UWORD8 predicted_pixels[15]; /* One filtered sample per diagonal */
+    WORD32 i;
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    /* pu1_src packs the neighbours: left samples below index BLK8x8SIZE,
+       the top-left corner at BLK8x8SIZE, top samples after it */
+    pu1_top = pu1_src + BLK8x8SIZE + 1;
+    pu1_left = pu1_src + BLK8x8SIZE - 1;
+    ui4_topleft = pu1_src[BLK8x8SIZE];
+
+    for(i = 0; i < 8; i++)
+    {
+        au4_top[i] = pu1_top[i];
+        au4_left[i] = *(pu1_left - i);
+    }
+
+    /* Diagonals strictly below the main one: 3-tap filtered left samples */
+    for(i = 0; i < 6; i++)
+        predicted_pixels[i] = FILT121(au4_left[5 - i], au4_left[6 - i], au4_left[7 - i]);
+
+    /* Diagonals touching the top-left corner */
+    predicted_pixels[6] = FILT121(ui4_topleft, au4_left[0], au4_left[1]);
+    predicted_pixels[7] = FILT121(au4_top[0], ui4_topleft, au4_left[0]);
+    predicted_pixels[8] = FILT121(ui4_topleft, au4_top[0], au4_top[1]);
+
+    /* Diagonals strictly above the main one: 3-tap filtered top samples */
+    for(i = 9; i < 15; i++)
+        predicted_pixels[i] = FILT121(au4_top[i - 9], au4_top[i - 8], au4_top[i - 7]);
+
+    /* Row r of the prediction is the 8-sample window starting at 7 - r */
+    for(i = 0; i < 8; i++)
+        memcpy(pu1_dst + i * dst_strd, predicted_pixels + (7 - i), 8);
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_8x8_mode_vert_r
+ *
+ * @brief
+ * Perform Intra prediction for luma_8x8 mode:Vertical_Right
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_8x8 mode:Vertical_Right ,described in sec 8.3.2.2.7
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_8x8_mode_vert_r(UWORD8 *pu1_src,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 src_strd,
+                                           WORD32 dst_strd,
+                                           WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
+    UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
+    UWORD8 *pu1_topleft = NULL; /* Pointer to start of top left predictors */
+    UWORD32 ui4_a; /* Top-left corner sample */
+    UWORD32 ui4_b, ui4_c, ui4_d, ui4_e, ui4_f, ui4_g, ui4_h, ui4_i; /* 8 top samples */
+    UWORD32 ui4_j, ui4_k, ui4_l, ui4_m, ui4_n, ui4_o, ui4_p; /* 7 left samples, top to bottom */
+    UWORD8 predicted_pixels[22]; /* Filtered samples; rows copy sliding windows of this */
+
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+    /* pu1_src packs the neighbours: left samples below index BLK8x8SIZE,
+       the top-left corner at BLK8x8SIZE, top samples after it */
+    pu1_top = pu1_src + BLK8x8SIZE + 1;
+    pu1_left = pu1_src + BLK8x8SIZE - 1;
+    pu1_topleft = pu1_src + BLK8x8SIZE;
+
+    ui4_a = *pu1_topleft;
+
+    ui4_b = *pu1_top++;
+    ui4_c = *pu1_top++;
+    ui4_d = *pu1_top++;
+    ui4_e = *pu1_top++;
+    ui4_f = *pu1_top++;
+    ui4_g = *pu1_top++;
+    ui4_h = *pu1_top++;
+    ui4_i = *pu1_top;
+    ui4_j = *pu1_left--;
+    ui4_k = *pu1_left--;
+    ui4_l = *pu1_left--;
+    ui4_m = *pu1_left--;
+    ui4_n = *pu1_left--;
+    ui4_o = *pu1_left--;
+    ui4_p = *pu1_left--;
+
+    /* [0..2]: 3-tap filtered left samples leading rows 6, 4 and 2 */
+    predicted_pixels[0] = FILT121(ui4_o, ui4_n, ui4_m);
+    predicted_pixels[1] = FILT121(ui4_m, ui4_l, ui4_k);
+    predicted_pixels[2] = FILT121(ui4_k, ui4_j, ui4_a);
+    /* [3..10]: half-sample (1,1) filtered top row, shared by the even rows */
+    predicted_pixels[3] = FILT11(ui4_a, ui4_b);
+    predicted_pixels[4] = FILT11(ui4_b, ui4_c);
+    predicted_pixels[5] = FILT11(ui4_c, ui4_d);
+    predicted_pixels[6] = FILT11(ui4_d, ui4_e);
+    predicted_pixels[7] = FILT11(ui4_e, ui4_f);
+    predicted_pixels[8] = FILT11(ui4_f, ui4_g);
+    predicted_pixels[9] = FILT11(ui4_g, ui4_h);
+    predicted_pixels[10] = FILT11(ui4_h, ui4_i);
+    /* [11..13]: 3-tap filtered left samples leading rows 7, 5 and 3 */
+    predicted_pixels[11] = FILT121(ui4_p, ui4_o, ui4_n);
+    predicted_pixels[12] = FILT121(ui4_n, ui4_m, ui4_l);
+    predicted_pixels[13] = FILT121(ui4_l, ui4_k, ui4_j);
+    /* [14..21]: 3-tap (1,2,1) filtered top row, shared by the odd rows */
+    predicted_pixels[14] = FILT121(ui4_b, ui4_a, ui4_j);
+    predicted_pixels[15] = FILT121(ui4_a, ui4_b, ui4_c);
+    predicted_pixels[16] = FILT121(ui4_b, ui4_c, ui4_d);
+    predicted_pixels[17] = FILT121(ui4_c, ui4_d, ui4_e);
+    predicted_pixels[18] = FILT121(ui4_d, ui4_e, ui4_f);
+    predicted_pixels[19] = FILT121(ui4_e, ui4_f, ui4_g);
+    predicted_pixels[20] = FILT121(ui4_f, ui4_g, ui4_h);
+    predicted_pixels[21] = FILT121(ui4_g, ui4_h, ui4_i);
+
+    /* Even rows window into [0..10], odd rows into [11..21]; each pair of
+       rows slides its window one sample left */
+    memcpy(pu1_dst, predicted_pixels + 3, 8);
+    memcpy(pu1_dst + 1 * dst_strd, predicted_pixels + 14, 8);
+    memcpy(pu1_dst + 2 * dst_strd, predicted_pixels + 2, 8);
+    memcpy(pu1_dst + 3 * dst_strd, predicted_pixels + 13, 8);
+    memcpy(pu1_dst + 4 * dst_strd, predicted_pixels + 1, 8);
+    memcpy(pu1_dst + 5 * dst_strd, predicted_pixels + 12, 8);
+    memcpy(pu1_dst + 6 * dst_strd, predicted_pixels, 8);
+    memcpy(pu1_dst + 7 * dst_strd, predicted_pixels + 11, 8);
+
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_8x8_mode_horz_d
+ *
+ * @brief
+ * Perform Intra prediction for luma_8x8 mode:Horizontal_Down
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_8x8 mode:Horizontal_Down ,described in sec 8.3.2.2.8
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+
+void ih264_intra_pred_luma_8x8_mode_horz_d(UWORD8 *pu1_src,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 src_strd,
+                                           WORD32 dst_strd,
+                                           WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
+    UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
+    UWORD8 *pu1_topleft = NULL; /* Pointer to start of top left predictors */
+    UWORD32 ui4_a; /* Top-left corner sample */
+    UWORD32 ui4_b, ui4_c, ui4_d, ui4_e, ui4_f, ui4_g, ui4_h, ui4_i;
+    UWORD32 ui4_j, ui4_k, ui4_l, ui4_m, ui4_n, ui4_o, ui4_p;
+    UWORD8 predicted_pixels[22]; /* Filtered samples; rows copy sliding windows of this */
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+    pu1_top = pu1_src + BLK8x8SIZE + 1;
+    pu1_left = pu1_src + BLK8x8SIZE - 1;
+    pu1_topleft = pu1_src + BLK8x8SIZE;
+
+    /* NOTE: naming is swapped relative to the other 8x8 modes here —
+       ui4_j..ui4_p hold the 7 TOP samples and ui4_b..ui4_i the 8 LEFT
+       samples (top to bottom) */
+    ui4_a = *pu1_topleft;
+    ui4_j = *pu1_top++;
+    ui4_k = *pu1_top++;
+    ui4_l = *pu1_top++;
+    ui4_m = *pu1_top++;
+    ui4_n = *pu1_top++;
+    ui4_o = *pu1_top++;
+    ui4_p = *pu1_top++;
+    ui4_b = *pu1_left--;
+    ui4_c = *pu1_left--;
+    ui4_d = *pu1_left--;
+    ui4_e = *pu1_left--;
+    ui4_f = *pu1_left--;
+    ui4_g = *pu1_left--;
+    ui4_h = *pu1_left--;
+    ui4_i = *pu1_left;
+
+    /* [0..14]: zig-zag over the left samples, deepest first — even indices
+       are half-sample (1,1) values, odd indices (1,2,1) values */
+    predicted_pixels[0] = FILT11(ui4_h, ui4_i);
+    predicted_pixels[1] = FILT121(ui4_g, ui4_h, ui4_i);
+    predicted_pixels[2] = FILT11(ui4_g, ui4_h);
+    predicted_pixels[3] = FILT121(ui4_f, ui4_g, ui4_h);
+    predicted_pixels[4] = FILT11(ui4_f, ui4_g);
+    predicted_pixels[5] = FILT121(ui4_e, ui4_f, ui4_g);
+    predicted_pixels[6] = FILT11(ui4_e, ui4_f);
+    predicted_pixels[7] = FILT121(ui4_d, ui4_e, ui4_f);
+    predicted_pixels[8] = FILT11(ui4_d, ui4_e);
+    predicted_pixels[9] = FILT121(ui4_c, ui4_d, ui4_e);
+    predicted_pixels[10] = FILT11(ui4_c, ui4_d);
+    predicted_pixels[11] = FILT121(ui4_b, ui4_c, ui4_d);
+    predicted_pixels[12] = FILT11(ui4_b, ui4_c);
+    predicted_pixels[13] = FILT121(ui4_a, ui4_b, ui4_c);
+    predicted_pixels[14] = FILT11(ui4_a, ui4_b);
+    /* [15..21]: 3-tap filtered top samples, fanning out from the corner */
+    predicted_pixels[15] = FILT121(ui4_j, ui4_a, ui4_b);
+    predicted_pixels[16] = FILT121(ui4_k, ui4_j, ui4_a);
+    predicted_pixels[17] = FILT121(ui4_l, ui4_k, ui4_j);
+    predicted_pixels[18] = FILT121(ui4_m, ui4_l, ui4_k);
+    predicted_pixels[19] = FILT121(ui4_n, ui4_m, ui4_l);
+    predicted_pixels[20] = FILT121(ui4_o, ui4_n, ui4_m);
+    predicted_pixels[21] = FILT121(ui4_p, ui4_o, ui4_n);
+
+    /* Row r copies the 8-sample window starting at 14 - 2*r */
+    memcpy(pu1_dst, predicted_pixels + 14, 8);
+    memcpy(pu1_dst + dst_strd, predicted_pixels + 12, 8);
+    memcpy(pu1_dst + 2 * dst_strd, predicted_pixels + 10, 8);
+    memcpy(pu1_dst + 3 * dst_strd, predicted_pixels + 8, 8);
+    memcpy(pu1_dst + 4 * dst_strd, predicted_pixels + 6, 8);
+    memcpy(pu1_dst + 5 * dst_strd, predicted_pixels + 4, 8);
+    memcpy(pu1_dst + 6 * dst_strd, predicted_pixels + 2, 8);
+    memcpy(pu1_dst + 7 * dst_strd, predicted_pixels, 8);
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_8x8_mode_vert_l
+ *
+ * @brief
+ * Perform Intra prediction for luma_8x8 mode:Vertical_Left
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_8x8 mode:Vertical_Left ,described in sec 8.3.2.2.9
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+
+void ih264_intra_pred_luma_8x8_mode_vert_l(UWORD8 *pu1_src,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 src_strd,
+                                           WORD32 dst_strd,
+                                           WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_top;     /* Pointer to start of top predictors */
+    UWORD32 au4_top[13]; /* The 13 top neighbour samples this mode uses */
+    UWORD8 predicted_pixels[22];
+    WORD32 i;
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+    pu1_top = pu1_src + BLK8x8SIZE + 1;
+
+    for(i = 0; i < 13; i++)
+        au4_top[i] = pu1_top[i];
+
+    /* [0..10]: half-sample (1,1) filtered top row, feeds the even rows */
+    for(i = 0; i < 11; i++)
+        predicted_pixels[i] = FILT11(au4_top[i], au4_top[i + 1]);
+
+    /* [11..21]: (1,2,1) filtered top row, feeds the odd rows */
+    for(i = 0; i < 11; i++)
+        predicted_pixels[11 + i] = FILT121(au4_top[i], au4_top[i + 1], au4_top[i + 2]);
+
+    /* Even row r copies the window at r/2, odd row r the window at 11 + r/2 */
+    for(i = 0; i < 8; i++)
+    {
+        if(i & 1)
+            memcpy(pu1_dst + i * dst_strd, predicted_pixels + 11 + (i >> 1), 8);
+        else
+            memcpy(pu1_dst + i * dst_strd, predicted_pixels + (i >> 1), 8);
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_8x8_mode_horz_u
+ *
+ * @brief
+ * Perform Intra prediction for luma_8x8 mode:Horizontal_Up
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_8x8 mode:Horizontal_Up ,described in sec 8.3.2.2.10
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+
+void ih264_intra_pred_luma_8x8_mode_horz_u(UWORD8 *pu1_src,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 src_strd,
+                                           WORD32 dst_strd,
+                                           WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
+    UWORD32 ui4_j, ui4_k, ui4_l, ui4_m, ui4_n, ui4_o, ui4_p, ui4_q;
+    UWORD8 predicted_pixels[22]; /* Filtered samples; rows copy sliding windows */
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+    pu1_left = pu1_src + BLK8x8SIZE - 1;
+
+    /* ui4_j..ui4_q are the 8 left neighbour samples, top to bottom */
+    ui4_j = *pu1_left--;
+    ui4_k = *pu1_left--;
+    ui4_l = *pu1_left--;
+    ui4_m = *pu1_left--;
+    ui4_n = *pu1_left--;
+    ui4_o = *pu1_left--;
+    ui4_p = *pu1_left--;
+    ui4_q = *pu1_left;
+
+    /* Zig-zag over the left samples: even indices are half-sample (1,1)
+       values, odd indices (1,2,1) values */
+    predicted_pixels[0] = FILT11(ui4_j, ui4_k);
+    predicted_pixels[1] = FILT121(ui4_j, ui4_k, ui4_l);
+    predicted_pixels[2] = FILT11(ui4_k, ui4_l);
+    predicted_pixels[3] = FILT121(ui4_k, ui4_l, ui4_m);
+    predicted_pixels[4] = FILT11(ui4_l, ui4_m);
+    predicted_pixels[5] = FILT121(ui4_l, ui4_m, ui4_n);
+    predicted_pixels[6] = FILT11(ui4_m, ui4_n);
+    predicted_pixels[7] = FILT121(ui4_m, ui4_n, ui4_o);
+    predicted_pixels[8] = FILT11(ui4_n, ui4_o);
+    predicted_pixels[9] = FILT121(ui4_n, ui4_o, ui4_p);
+    predicted_pixels[10] = FILT11(ui4_o, ui4_p);
+    predicted_pixels[11] = FILT121(ui4_o, ui4_p, ui4_q);
+    predicted_pixels[12] = FILT11(ui4_p, ui4_q);
+    predicted_pixels[13] = FILT121(ui4_p, ui4_q, ui4_q);
+    /* Beyond the last left sample, pad with its value */
+    memset(predicted_pixels + 14, ui4_q, 8);
+
+    /* Row r copies the 8-sample window starting at 2*r */
+    memcpy(pu1_dst, predicted_pixels, 8);
+    memcpy(pu1_dst + 1 * dst_strd, predicted_pixels + 2, 8);
+    memcpy(pu1_dst + 2 * dst_strd, predicted_pixels + 4, 8);
+    memcpy(pu1_dst + 3 * dst_strd, predicted_pixels + 6, 8);
+    memcpy(pu1_dst + 4 * dst_strd, predicted_pixels + 8, 8);
+    memcpy(pu1_dst + 5 * dst_strd, predicted_pixels + 10, 8);
+    memcpy(pu1_dst + 6 * dst_strd, predicted_pixels + 12, 8);
+    memcpy(pu1_dst + 7 * dst_strd, predicted_pixels + 14, 8);
+}
+
+
+/******************* 16x16 Modes *******************/
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_16x16_mode_vert
+ *
+ * @brief
+ * Perform Intra prediction for luma_16x16 mode:Vertical
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_16x16 mode:Vertical, described in sec 8.3.3.1
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels (Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+
+void ih264_intra_pred_luma_16x16_mode_vert(UWORD8 *pu1_src,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 src_strd,
+                                           WORD32 dst_strd,
+                                           WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_top; /* Pointer to start of top predictors */
+    WORD32 row;
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+    pu1_top = pu1_src + MB_SIZE + 1;
+
+    /* Replicate the 16 top neighbour samples into each of the 16 rows */
+    for(row = 0; row < 16; row++, pu1_dst += dst_strd)
+    {
+        memcpy(pu1_dst, pu1_top, 16);
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_16x16_mode_horz
+ *
+ * @brief
+ * Perform Intra prediction for luma_16x16 mode:Horizontal
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_16x16 mode:Horizontal, described in sec 8.3.3.2
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+
+void ih264_intra_pred_luma_16x16_mode_horz(UWORD8 *pu1_src,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 src_strd,
+                                           WORD32 dst_strd,
+                                           WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_left; /* Pointer to start of left predictors */
+    WORD32 row;
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+    pu1_left = pu1_src + MB_SIZE - 1;
+
+    /* Each row is filled with the corresponding left neighbour sample;
+       the left predictors are stored top to bottom at decreasing addresses */
+    for(row = 0; row < 16; row++, pu1_dst += dst_strd, pu1_left--)
+    {
+        memset(pu1_dst, *pu1_left, 16);
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_16x16_mode_dc
+ *
+ * @brief
+ * Perform Intra prediction for luma_16x16 mode:DC
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_16x16 mode:DC, described in sec 8.3.3.3
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+
+void ih264_intra_pred_luma_16x16_mode_dc(UWORD8 *pu1_src,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 src_strd,
+                                         WORD32 dst_strd,
+                                         WORD32 ngbr_avail)
+{
+    WORD32 use_left; /* 1 if left predictors are available, else 0 */
+    WORD32 use_top; /* 1 if top predictors are available, else 0 */
+    UWORD8 *pu1_left; /* Pointer to start of left predictors */
+    UWORD8 *pu1_top; /* Pointer to start of top predictors */
+    WORD32 row;
+    WORD32 val = 0;
+    UNUSED(src_strd);
+
+    use_left = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK);
+    use_top = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK);
+    pu1_top = pu1_src + MB_SIZE + 1;
+    pu1_left = pu1_src + MB_SIZE - 1;
+
+    /* Sum every available neighbour; each contributing side also adds 8
+       so that the shift below rounds to nearest */
+    if(use_left)
+    {
+        for(row = 0; row < 16; row++)
+            val += pu1_left[-row];
+        val += 8;
+    }
+    if(use_top)
+    {
+        for(row = 0; row < 16; row++)
+            val += pu1_top[row];
+        val += 8;
+    }
+    /* Since 8 is added if either left/top pred is there,
+       val still being zero implies both preds are not there */
+    val = (val) ? (val >> (3 + use_left + use_top)) : 128;
+
+    /* Flood-fill the 16x16 block with the DC value */
+    for(row = 0; row < 16; row++, pu1_dst += dst_strd)
+    {
+        memset(pu1_dst, val, 16);
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_16x16_mode_plane
+ *
+ * @brief
+ * Perform Intra prediction for luma_16x16 mode:PLANE
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_16x16 mode:PLANE, described in sec 8.3.3.4
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+
+void ih264_intra_pred_luma_16x16_mode_plane(UWORD8 *pu1_src,
+                                            UWORD8 *pu1_dst,
+                                            WORD32 src_strd,
+                                            WORD32 dst_strd,
+                                            WORD32 ngbr_avail)
+{
+    /*! Written with no multiplications */
+    UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
+    UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
+    UWORD8 *pu1_topleft = NULL; /* Top-left corner predictor */
+    WORD32 a, b, c, tmp; /* Plane parameters: a = DC-like offset, b/c = gradients */
+    UWORD8 *pu1_tmp1, *pu1_tmp2;
+    WORD32 shift;
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+    pu1_top = pu1_src + MB_SIZE + 1;
+    pu1_left = pu1_src + MB_SIZE - 1;
+    pu1_topleft = pu1_src + MB_SIZE;
+
+    {
+        /* a = 16 * (P(15,-1) + P(-1,15)) */
+        a = (*(pu1_top + 15) + *(pu1_left - 15)) << 4;
+
+        /*! Implement Sum(x*(P((x+7),-1) - P((x-7),-1))) x=1...8 */
+        pu1_tmp1 = pu1_top + 8;
+        pu1_tmp2 = pu1_tmp1 - 2;
+
+        /* Pixel diffs are only 9 bits;
+           so sign extension allows shifts to be used even for signed */
+        b = ((*pu1_tmp1++) - (*pu1_tmp2--)); /* x=1 */
+        b += ((*pu1_tmp1++) - (*pu1_tmp2--)) << 1; /* x=2 */
+        tmp = ((*pu1_tmp1++) - (*pu1_tmp2--));
+        b += (tmp << 1) + tmp; /* x=3 */
+        b += ((*pu1_tmp1++) - (*pu1_tmp2--)) << 2; /* x=4 */
+
+        tmp = ((*pu1_tmp1++) - (*pu1_tmp2--));
+        b += (tmp << 2) + tmp; /* x=5 */
+        tmp = ((*pu1_tmp1++) - (*pu1_tmp2--));
+        b += (tmp << 2) + (tmp << 1); /* x=6 */
+        tmp = ((*pu1_tmp1++) - (*pu1_tmp2--));
+        b += (tmp << 3) - tmp; /* x=7 */
+        b += ((*pu1_tmp1) - (*pu1_topleft)) << 3; /* x=8 */
+
+        b = ((b << 2) + b + 32) >> 6; /*! (5*H + 32)>>6 */
+
+        /*! Implement Sum(y*(P(-1,(y+7)) - P(-1,(y-7)))) y=1...8
+            (left predictors are stored at decreasing addresses, hence the
+            opposite pointer directions compared to the H loop above) */
+        pu1_tmp1 = pu1_left - 8;
+        pu1_tmp2 = pu1_tmp1 + 2;
+
+        c = ((*pu1_tmp1) - (*pu1_tmp2)); /* y=1 */
+        pu1_tmp1--;
+        pu1_tmp2++;
+        c += ((*pu1_tmp1) - (*pu1_tmp2)) << 1; /* y=2 */
+        pu1_tmp1--;
+        pu1_tmp2++;
+        tmp = ((*pu1_tmp1) - (*pu1_tmp2));
+        c += (tmp << 1) + tmp; /* y=3 */
+        pu1_tmp1--;
+        pu1_tmp2++;
+        c += ((*pu1_tmp1) - (*pu1_tmp2)) << 2; /* y=4 */
+        pu1_tmp1--;
+        pu1_tmp2++;
+
+        tmp = ((*pu1_tmp1) - (*pu1_tmp2));
+        c += (tmp << 2) + tmp; /* y=5 */
+        pu1_tmp1--;
+        pu1_tmp2++;
+        tmp = ((*pu1_tmp1) - (*pu1_tmp2));
+        c += (tmp << 2) + (tmp << 1); /* y=6 */
+        pu1_tmp1--;
+        pu1_tmp2++;
+        tmp = ((*pu1_tmp1) - (*pu1_tmp2));
+        c += (tmp << 3) - tmp; /* y=7 */
+        pu1_tmp1--; //pu1_tmp2 ++;
+        /* Modified to get (-1,-1) location as *(pu1_top - 1) instead of (pu1_left - ui4_stride) */
+        //c += ((*pu1_tmp1) - (*(pu1_top - 1)))<<3; /* y=8 */
+        c += ((*pu1_tmp1) - (*pu1_topleft)) << 3; /* y=8 */
+
+        c = ((c << 2) + c + 32) >> 6; /*! (5*V + 32)>>6 */
+        shift = 3; /* half the block size: 8 = 1 << 3 for 16x16 */
+    }
+
+    /*! Now from the plane parameters a, b, and c,
+        compute the fitted plane values over the block:
+        pred(x,y) = Clip1((a + b*(x-7) + c*(y-7) + 16) >> 5),
+        built incrementally so only additions are needed per sample */
+    {
+        WORD32 tmp1, tmpx, tmpx_init, j, i;
+
+        tmpx_init = -(b << shift); /* -8b */
+        tmp = a - (c << shift) + 16; /* a-((4or8)*c)+16 */
+        for(i = 0; i < 16; i++)
+        {
+            tmp += c; /*increment every time by c to get c*(y-7or3)*/
+            tmpx = tmpx_init; /* Init to -8b */
+            for(j = 0; j < 16; j++)
+            {
+                tmpx += b; /* increment every time by b to get b*(x-7or3) */
+                tmp1 = (tmp + tmpx) >> 5;
+                *pu1_dst++ = CLIP_U8(tmp1);
+            }
+            pu1_dst += (dst_strd - 16);
+        }
+    }
+}
diff --git a/common/ih264_macros.h b/common/ih264_macros.h
new file mode 100755
index 0000000..6e4cb16
--- /dev/null
+++ b/common/ih264_macros.h
@@ -0,0 +1,110 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/*********************************************************************************
+* @file
+* ih264_macros.h
+*
+* @brief
+* Macro definitions used in the codec
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IH264_MACROS_H_
+#define _IH264_MACROS_H_
+
+/*****************************************************************************/
+/* Function Macros                                                           */
+/*****************************************************************************/
+#define RETURN_IF(cond, retval) if(cond) {return (retval);}
+#define UNUSED(x) ((void)(x))
+
+#define ALIGN128(x) ((((x) + 127) >> 7) << 7)
+#define ALIGN64(x) ((((x) + 63) >> 6) << 6)
+#define ALIGN32(x) ((((x) + 31) >> 5) << 5)
+#define ALIGN16(x) ((((x) + 15) >> 4) << 4)
+#define ALIGN8(x) ((((x) + 7) >> 3) << 3)
+#define ALIGN4(x) ((((x) + 3) >> 2) << 2)
+
+
+/**
+******************************************************************************
+ * @brief Min, Max
+ *        (arguments and expansions fully parenthesised so that arbitrary
+ *        expressions can be passed and results used inside expressions)
+******************************************************************************
+ */
+#define MAX(a,b) (((a) > (b)) ? (a) : (b))
+#define MIN(a,b) (((a) < (b)) ? (a) : (b))
+#define MIN3(a,b,c) (((a) < (b)) ? (((a) < (c)) ? (a) : (c)) : (((b) < (c)) ? (b) : (c)))
+#define MAX3(a,b,c) (((a) > (b)) ? (((a) > (c)) ? (a) : (c)) : (((b) > (c)) ? (b) : (c)))
+/**
+******************************************************************************
+ * @brief Div, Mod
+******************************************************************************
+ */
+#define MOD(x,y) ((x)%(y))
+#define DIV(x,y) ((x)/(y))
+
+/**
+******************************************************************************
+ * @brief Clip
+******************************************************************************
+ */
+#define CLIP3(miny, maxy, y) (((y) < (miny))?(miny):(((y) > (maxy))?(maxy):(y)))
+
+/**
+******************************************************************************
+ * @brief True, False
+******************************************************************************
+ */
+#define BOOLEAN(x) (!!(x))
+
+/**
+******************************************************************************
+ * @brief Frequently used multiplications x2. x3, and x4
+******************************************************************************
+ */
+#define X2(a) ((a) << 1)
+#define X3(a) (((a) << 1) + (a))
+#define X4(a) ((a) << 2)
+
+/**
+******************************************************************************
+ * @brief Misc
+******************************************************************************
+ */
+#define ABS(x) ((x) < 0 ? (-(x)) : (x))
+#define SIGNXY(x,y) (((y) < 0) ? (-1 * (x)) : (x))
+
+#define SIGN(x) (((x) >= 0) ? (((x) > 0) ? 1 : 0) : -1)
+
+/* Bit manipulation helpers; pos and bit are parenthesised so expressions
+   may be passed as arguments */
+#define RESET_BIT(x, pos) (x) = (x) & ~(1 << (pos));
+#define SET_BIT(x, pos) (x) = (x) | (1 << (pos));
+#define GET_BIT(x, pos) (((x) >> (pos)) & 0x1)
+
+#define INSERT_BIT(x, pos, bit) { RESET_BIT(x, pos); (x) = (x) | ((bit) << (pos)); }
+#endif /*_IH264_MACROS_H_*/
+
+
diff --git a/common/ih264_mem_fns.c b/common/ih264_mem_fns.c
new file mode 100755
index 0000000..1c1f328
--- /dev/null
+++ b/common/ih264_mem_fns.c
@@ -0,0 +1,176 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_mem_fns.c
+ *
+ * @brief
+ * Functions used for memory operations
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ * ih264_memcpy()
+ * ih264_memcpy_mul_8()
+ * ih264_memset()
+ * ih264_memset_mul_8()
+ * ih264_memset_16bit()
+ * ih264_memset_16bit_mul_8()
+ *
+ * @remarks
+ * None
+ *
+ ******************************************************************************
+ */
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+/* System include files */
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+
+/* User include files */
+#include "ih264_typedefs.h"
+#include "ih264_mem_fns.h"
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * memcpy of a 8,16 or 32 bytes
+ *
+ * @par Description:
+ * Does memcpy of 8bit data from source to destination for 8,16 or 32 number of bytes
+ *
+ * @param[in] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[in] num_bytes
+ * number of bytes to copy
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ih264_memcpy(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes)
+{
+    /* Plain-C wrapper over libc memcpy, so that copies can be dispatched
+       through the ih264_memcpy_ft function-pointer type (ih264_mem_fns.h) */
+    memcpy(pu1_dst, pu1_src, num_bytes);
+}
+
+
+void ih264_memcpy_mul_8(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes)
+{
+    /* Same as ih264_memcpy; the _mul_8 suffix presumably marks a caller
+       guarantee that num_bytes is a multiple of 8 (usable by SIMD variants)
+       — the C version does not rely on it. TODO(review): confirm contract */
+    memcpy(pu1_dst, pu1_src, num_bytes);
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * memset of a 8,16 or 32 bytes
+ *
+ * @par Description:
+ * Does memset of 8bit data for 8,16 or 32 number of bytes
+ *
+ * @param[in] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] value
+ * UWORD8 value used for memset
+ *
+ * @param[in] num_bytes
+ * number of bytes to set
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ih264_memset(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes)
+{
+    /* Plain-C wrapper over libc memset, dispatchable via ih264_memset_ft */
+    memset(pu1_dst, value, num_bytes);
+}
+
+
+void ih264_memset_mul_8(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes)
+{
+    /* Same as ih264_memset; the _mul_8 suffix presumably marks a caller
+       guarantee that num_bytes is a multiple of 8 — the C version does not
+       rely on it. TODO(review): confirm contract */
+    memset(pu1_dst, value, num_bytes);
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * memset of 16bit data of a 8,16 or 32 bytes
+ *
+ * @par Description:
+ * Does memset of 16bit data for 8,16 or 32 number of bytes
+ *
+ * @param[in] pu2_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] value
+ * UWORD16 value used for memset
+ *
+ * @param[in] num_words
+ * number of words to set
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ih264_memset_16bit(UWORD16 *pu2_dst, UWORD16 value, UWORD32 num_words)
+{
+    /* Fill num_words consecutive 16-bit words with the given value */
+    while(num_words--)
+    {
+        *pu2_dst++ = value;
+    }
+}
+
+void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst,
+                              UWORD16 value,
+                              UWORD32 num_words)
+{
+    /* Identical to ih264_memset_16bit; kept as a separate entry point so a
+       SIMD version can be plugged in when num_words is a multiple of 8 */
+    while(num_words--)
+    {
+        *pu2_dst++ = value;
+    }
+}
+
diff --git a/common/ih264_mem_fns.h b/common/ih264_mem_fns.h
new file mode 100755
index 0000000..e0167f4
--- /dev/null
+++ b/common/ih264_mem_fns.h
@@ -0,0 +1,126 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_mem_fns.h
+*
+* @brief
+* Function declarations used for memory functions
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IH264_MEM_FNS_H_
+#define _IH264_MEM_FNS_H_
+
+typedef void ih264_memcpy_ft(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes);
+
+typedef void ih264_memcpy_mul_8_ft(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes);
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * memset of a 8,16 or 32 bytes
+ *
+ * @par Description:
+ * Does memset of 8bit data for 8,16 or 32 number of bytes
+ *
+ * @param[in] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] value
+ * UWORD8 value used for memset
+ *
+ * @param[in] num_bytes
+ * number of bytes to set
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+typedef void ih264_memset_ft(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes);
+
+typedef void ih264_memset_mul_8_ft(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes);
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * memset of 16bit data of a 8,16 or 32 bytes
+ *
+ * @par Description:
+ * Does memset of 16bit data for 8,16 or 32 number of bytes
+ *
+ * @param[in] pu2_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] value
+ * UWORD16 value used for memset
+ *
+ * @param[in] num_words
+ * number of words to set
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+typedef void ih264_memset_16bit_ft(UWORD16 *pu2_dst, UWORD16 value, UWORD32 num_words);
+
+typedef void ih264_memset_16bit_mul_8_ft(UWORD16 *pu2_dst, UWORD16 value, UWORD32 num_words);
+
+/* C function declarations */
+ih264_memcpy_ft ih264_memcpy;
+ih264_memcpy_mul_8_ft ih264_memcpy_mul_8;
+ih264_memset_ft ih264_memset;
+ih264_memset_mul_8_ft ih264_memset_mul_8;
+ih264_memset_16bit_ft ih264_memset_16bit;
+ih264_memset_16bit_mul_8_ft ih264_memset_16bit_mul_8;
+
+/* A9 Q function declarations */
+ih264_memcpy_ft ih264_memcpy_a9q;
+ih264_memcpy_mul_8_ft ih264_memcpy_mul_8_a9q;
+ih264_memset_ft ih264_memset_a9q;
+ih264_memset_mul_8_ft ih264_memset_mul_8_a9q;
+ih264_memset_16bit_ft ih264_memset_16bit_a9q;
+ih264_memset_16bit_mul_8_ft ih264_memset_16bit_mul_8_a9q;
+
+/* AV8 function declarations */
+ih264_memcpy_ft ih264_memcpy_av8;
+ih264_memcpy_mul_8_ft ih264_memcpy_mul_8_av8;
+ih264_memset_ft ih264_memset_av8;
+ih264_memset_mul_8_ft ih264_memset_mul_8_av8;
+ih264_memset_16bit_ft ih264_memset_16bit_av8;
+ih264_memset_16bit_mul_8_ft ih264_memset_16bit_mul_8_av8;
+
+
+ih264_memcpy_mul_8_ft ih264_memcpy_mul_8_ssse3;
+ih264_memset_mul_8_ft ih264_memset_mul_8_ssse3;
+ih264_memset_16bit_mul_8_ft ih264_memset_16bit_mul_8_ssse3;
+#endif //_MEM_FNS_H_
diff --git a/common/ih264_padding.c b/common/ih264_padding.c
new file mode 100755
index 0000000..8e8f3e2
--- /dev/null
+++ b/common/ih264_padding.c
@@ -0,0 +1,331 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+*******************************************************************************
+* @file
+* ih264_padding.c
+*
+* @brief
+* Contains function definitions for Padding
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+* - ih264_pad_top()
+* - ih264_pad_bottom()
+* - ih264_pad_left_luma()
+* - ih264_pad_left_chroma()
+* - ih264_pad_right_luma()
+* - ih264_pad_right_chroma()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* System include files */
+#include <stddef.h>
+#include <string.h>
+
+/* User include files */
+#include "ih264_typedefs.h"
+#include "ih264_macros.h"
+#include "ih264_padding.h"
+
+
+/*****************************************************************************/
+/* Function Definitions */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief pad at the top of a 2d array
+*
+* @par Description:
+* The top row of a 2d array is replicated for pad_size times at the top
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] wd
+* integer width of the array
+*
+* @param[in] pad_size
+* integer -padding size of the array
+*
+* @returns none
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+void ih264_pad_top(UWORD8 *pu1_src, /* pu1_src points at the first (top) valid row of the picture */
+ WORD32 src_strd,
+ WORD32 wd,
+ WORD32 pad_size)
+{
+ WORD32 row;
+
+ for(row = 1; row <= pad_size; row++) /* replicate the top row into each of the pad_size rows above it */
+ {
+ memcpy(pu1_src - row * src_strd, pu1_src, wd); /* destination is 'row' strides above the picture */
+ }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief pad at the bottom of a 2d array
+*
+* @par Description:
+* The bottom row of a 2d array is replicated for pad_size times at the bottom
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] wd
+* integer width of the array
+*
+* @param[in] pad_size
+* integer -padding size of the array
+*
+* @returns none
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+void ih264_pad_bottom(UWORD8 *pu1_src, /* pu1_src points one row BELOW the last valid row (first pad row) -- deduced from the -1*src_strd source below; confirm with callers */
+ WORD32 src_strd,
+ WORD32 wd,
+ WORD32 pad_size)
+{
+ WORD32 row;
+
+ for(row = 1; row <= pad_size; row++) /* replicate the last valid row into each pad row below it */
+ {
+ memcpy(pu1_src + (row - 1) * src_strd, pu1_src - 1 * src_strd, wd); /* source: last valid row, one stride above pu1_src */
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief pad (luma block) at the left of a 2d array
+*
+* @par Description:
+* The left column of a 2d array is replicated for pad_size times to the left
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] pad_size
+* integer -padding size of the array
+*
+* @returns none
+*
+* @remarks none
+*
+*******************************************************************************
+ */
+void ih264_pad_left_luma(UWORD8 *pu1_src, /* pu1_src points at the leftmost valid pixel of the first row */
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size)
+{
+ WORD32 row;
+
+ for(row = 0; row < ht; row++) /* per row: replicate the leftmost pixel into the pad_size bytes to its left */
+ {
+
+ memset(pu1_src - pad_size, *pu1_src, pad_size); /* fill value is the row's first valid pixel */
+
+ pu1_src += src_strd;
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief pad (chroma block) at the left of a 2d array
+*
+* @par Description:
+* The left column of a 2d array is replicated for pad_size times to the left
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] pad_size
+* integer -padding size of the array
+*
+* @returns none
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+void ih264_pad_left_chroma(UWORD8 *pu1_src, /* chroma is interleaved UV, so padding is done in 16-bit (U,V) pairs */
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size)
+{
+ /* temp var */
+ WORD32 row, col;
+ UWORD16 u2_uv_val;
+
+ /* pointer to src */
+ UWORD16 *pu2_src = (UWORD16 *)pu1_src; /* reinterpret as UV pairs; pu1_src must be 2-byte aligned */
+
+ src_strd >>= 1; /* convert byte stride to UWORD16-element stride */
+ pad_size >>= 1; /* pad_size is given in bytes; halve to count UV pairs */
+
+ for(row = 0; row < ht; row++)
+ {
+ u2_uv_val = pu2_src[0]; /* leftmost valid (U,V) pair of this row */
+
+ for (col = -pad_size; col < 0; col++) /* replicate it into every pair to the left */
+ {
+ pu2_src[col] = u2_uv_val;
+ }
+
+ pu2_src += src_strd;
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief pad (luma block) at the right of a 2d array
+*
+* @par Description:
+* The right column of a 2d array is replicated for pad_size times at the right
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] pad_size
+* integer -padding size of the array
+*
+* @returns none
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+void ih264_pad_right_luma(UWORD8 *pu1_src, /* pu1_src points at the first pad byte, one past the last valid pixel (the fill value is read from pu1_src[-1]) */
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size)
+{
+ WORD32 row;
+
+ for(row = 0; row < ht; row++) /* per row: replicate the rightmost valid pixel into pad_size bytes to its right */
+ {
+ memset(pu1_src, *(pu1_src -1), pad_size); /* fill value is the row's last valid pixel */
+
+ pu1_src += src_strd;
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief pad (chroma block) at the right of a 2d array
+*
+* @par Description:
+* The right column of a 2d array is replicated for pad_size times at the right
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] pad_size
+* integer -padding size of the array
+*
+* @returns none
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+void ih264_pad_right_chroma(UWORD8 *pu1_src, /* interleaved-UV right pad; pu1_src points at the first pad pair (fill pair read from pu2_src[-1]) */
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size)
+{
+ WORD32 row, col;
+ UWORD16 u2_uv_val;
+ UWORD16 *pu2_src = (UWORD16 *)pu1_src; /* reinterpret as UV pairs; pu1_src must be 2-byte aligned */
+
+ src_strd >>= 1; /* byte stride -> UWORD16-element stride */
+ pad_size >>= 1; /* byte count -> UV-pair count */
+
+ for(row = 0; row < ht; row++)
+ {
+ u2_uv_val = pu2_src[-1]; /* rightmost valid (U,V) pair of this row */
+
+ for (col = 0; col < pad_size; col++) /* replicate rightward into the pad region */
+ {
+ pu2_src[col] = u2_uv_val;
+ }
+
+ pu2_src += src_strd;
+ }
+}
+
diff --git a/common/ih264_padding.h b/common/ih264_padding.h
new file mode 100755
index 0000000..e4e18fb
--- /dev/null
+++ b/common/ih264_padding.h
@@ -0,0 +1,74 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+*******************************************************************************
+* @file
+* ih264_padding.h
+*
+* @brief
+* Declarations for padding functions
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IH264_PADDING_H_
+#define _IH264_PADDING_H_
+
+/*****************************************************************************/
+/* Function Declarations */
+/*****************************************************************************/
+
+typedef void ih264_pad(UWORD8 *, WORD32, WORD32, WORD32);
+
+/* C function declarations */
+ih264_pad ih264_pad_top;
+ih264_pad ih264_pad_bottom;
+ih264_pad ih264_pad_left_luma;
+ih264_pad ih264_pad_left_chroma;
+ih264_pad ih264_pad_right_luma;
+ih264_pad ih264_pad_right_chroma;
+
+/* A9 Q function declarations */
+ih264_pad ih264_pad_top_a9q;
+ih264_pad ih264_pad_left_luma_a9q;
+ih264_pad ih264_pad_left_chroma_a9q;
+ih264_pad ih264_pad_right_luma_a9q;
+ih264_pad ih264_pad_right_chroma_a9q;
+
+/* AV8 function declarations */
+ih264_pad ih264_pad_top_av8;
+ih264_pad ih264_pad_left_luma_av8;
+ih264_pad ih264_pad_left_chroma_av8;
+ih264_pad ih264_pad_right_luma_av8;
+ih264_pad ih264_pad_right_chroma_av8;
+
+
+ih264_pad ih264_pad_left_luma_ssse3;
+ih264_pad ih264_pad_left_chroma_ssse3;
+ih264_pad ih264_pad_right_luma_ssse3;
+ih264_pad ih264_pad_right_chroma_ssse3;
+
+#endif /*_IH264_PADDING_H_*/
diff --git a/common/ih264_resi_trans.h b/common/ih264_resi_trans.h
new file mode 100755
index 0000000..ee0add3
--- /dev/null
+++ b/common/ih264_resi_trans.h
@@ -0,0 +1,70 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_resi_trans.h
+*
+* @brief
+* Functions declarations for residue and forward transform
+*
+* @par List of Functions:
+* - ih264_resi_trans_ft
+* - ih264_resi_trans_4x4
+* - ih264_resi_trans_4x4
+* - ih264_resi_trans_4x4_a9
+* - ih264_resi_trans_4x4_a9
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef IH264_RESI_TRANS_H_
+#define IH264_RESI_TRANS_H_
+
+/*****************************************************************************/
+/* Extern Function Declarations */
+/*****************************************************************************/
+
+typedef void ih264_resi_trans_ft(UWORD8 *pu1_src,
+ UWORD8 *pu1_pred,
+ WORD32 *pi4_out,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 out_strd);
+
+/*C functions*/
+
+ih264_resi_trans_ft ih264_resi_trans_4x4;
+
+ih264_resi_trans_ft ih264_resi_trans_8x8;
+
+/*A9 functions*/
+
+ih264_resi_trans_ft ih264_resi_trans_4x4_a9;
+
+ih264_resi_trans_ft ih264_resi_trans_8x8_a9;
+
+#endif /* IH264_RESI_TRANS_H_ */
diff --git a/common/ih264_resi_trans_quant.c b/common/ih264_resi_trans_quant.c
new file mode 100755
index 0000000..cf1d43c
--- /dev/null
+++ b/common/ih264_resi_trans_quant.c
@@ -0,0 +1,814 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_resi_trans_quant.c
+ *
+ * @brief
+ * Contains function definitions single stage forward transform for H.264
+ * It will calculate the residue, do the cf and then do quantization
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ * - ih264_resi_trans_quant_4x4()
+ * - ih264_resi_trans_quant_chroma_4x4
+ * - ih264_hadamard_quant_4x4
+ * - ih264_hadamard_quant_2x2_uv
+ * - ih264_resi_trans_quant_8x8
+ *
+ * @remarks
+ *******************************************************************************
+ */
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* System include files */
+#include <stddef.h>
+
+/* User include files */
+#include "ih264_typedefs.h"
+#include "ih264_defs.h"
+#include "ih264_size_defs.h"
+#include "ih264_macros.h"
+#include "ih264_trans_macros.h"
+#include "ih264_trans_data.h"
+#include "ih264_structs.h"
+#include "ih264_trans_quant_itrans_iquant.h"
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs forward transform and quantization on a 4*4 block
+ *
+ * @par Description:
+ * The function accepts source buffer and estimation buffer. From these, it
+ * computes the residue. This is residue is then transformed and quantized.
+ * The transform and quantization are in placed computed. They use the residue
+ * buffer for this.
+ *
+ * @param[in] pu1_src
+ * Pointer to source sub-block
+ *
+ * @param[in] pu1_pred
+ * Pointer to prediction sub-block
+ *
+ * @param[in] pi2_out
+ * Pointer to residual sub-block
+ *
+ * @param[in] src_strd
+ * Source stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Destination stride
+ *
+ * @param[in] u4_qbits
+ * QP_BITS_h264_4x4 + floor(QP/6)
+ *
+ * @param[in] pu2_threshold_matrix
+ * Pointer to Forward Quant Threshold Matrix
+ *
+ * @param[in] pu2_scale_matrix
+ * Pointer to Forward Quant Scale Matrix
+ *
+ * @param[in] u4_round_factor
+ * Quantization Round factor
+ *
+ * @param[out] pu1_nnz
+ * Total non-zero coefficients in the current sub-block
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+void ih264_resi_trans_quant_4x4(UWORD8 *pu1_src, /* residue + 4x4 forward core transform + quantization, computed in place in pi2_out */
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_out,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ const UWORD16 *pu2_scale_matrix,
+ const UWORD16 *pu2_threshold_matrix,
+ UWORD32 u4_qbits,
+ UWORD32 u4_round_factor,
+ UWORD8 *pu1_nnz,
+ WORD16 *pi2_alt_dc_addr)
+{
+ UWORD32 i;
+ WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
+ WORD32 i4_value, i4_sign;
+ UWORD32 u4_abs_value;
+ WORD16 *pi2_out_tmp = pi2_out;
+ UWORD32 u4_nonzero_coeff = 0;
+
+ for (i = 0; i < SUB_BLK_WIDTH_4x4; i++) /* pass 1: residue + horizontal butterfly, one row at a time */
+ {
+ /* computing prediction error (residue) */
+ x4 = pu1_src[0] - pu1_pred[0];
+ x5 = pu1_src[1] - pu1_pred[1];
+ x6 = pu1_src[2] - pu1_pred[2];
+ x7 = pu1_src[3] - pu1_pred[3];
+
+ /* Horizontal transform */
+ x0 = x4 + x7;
+ x1 = x5 + x6;
+ x2 = x5 - x6;
+ x3 = x4 - x7;
+
+ pi2_out_tmp[0] = x0 + x1;
+ pi2_out_tmp[1] = (x3 <<1) + x2; /* odd-part weights 2,1 of the H.264 integer DCT */
+ pi2_out_tmp[2] = x0 - x1;
+ pi2_out_tmp[3] = x3 - (x2<<1);
+
+ /* pointing to next row; */
+ pu1_src += src_strd;
+ pu1_pred += pred_strd;
+ pi2_out_tmp += 4;
+
+ }
+ pi2_out_tmp = pi2_out;
+ for (i = 0; i < SUB_BLK_WIDTH_4x4; i++) /* pass 2: vertical butterfly + quantization, one column at a time */
+ {
+
+ /* Vertical transform and quantization */
+ x4 = pi2_out_tmp[0]; /* column elements are 4 apart (row stride of the 4x4 block) */
+ x5 = pi2_out_tmp[4];
+ x6 = pi2_out_tmp[8];
+ x7 = pi2_out_tmp[12];
+
+
+ x0 = x4 + x7;
+ x1 = x5 + x6;
+ x2 = x5 - x6;
+ x3 = x4 - x7;
+
+ /* quantization is done in place */
+
+ i4_value = x0 + x1;
+
+ if(i==0)
+ {
+ (*pi2_alt_dc_addr) = i4_value; /* unquantized DC saved for the caller (presumably for a later hadamard DC pass -- TODO confirm) */
+ }
+
+ FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits, u4_nonzero_coeff);
+ pi2_out_tmp[0] = i4_value;
+
+
+ i4_value = (x3 << 1) + x2;
+ FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[4], pu2_scale_matrix[4], u4_round_factor, u4_qbits, u4_nonzero_coeff);
+ pi2_out_tmp[4] = i4_value;
+
+
+ i4_value = x0 - x1;
+ FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[8], pu2_scale_matrix[8], u4_round_factor, u4_qbits, u4_nonzero_coeff);
+ pi2_out_tmp[8] = i4_value;
+
+
+ i4_value = x3 - (x2 << 1);
+ FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[12], pu2_scale_matrix[12], u4_round_factor, u4_qbits, u4_nonzero_coeff);
+ pi2_out_tmp[12] = i4_value;
+
+ pi2_out_tmp ++; /* advance to next column; matrices advance with it (per-position scale/threshold) */
+ pu2_scale_matrix++;
+ pu2_threshold_matrix++;
+ }
+
+ /* Return total nonzero coefficients in the current sub block */
+ *pu1_nnz = u4_nonzero_coeff;
+}
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs forward transform and quantization on a 4*4 chroma block
+ * with interleaved values
+ *
+ * @par Description:
+ * The function accepts source buffer and estimation buffer. From these, it
+ * computes the residue. This is residue is then transformed and quantized.
+ * The transform and quantization are in placed computed. They use the residue
+ * buffer for this.
+ *
+ * @param[in] pu1_src
+ * Pointer to source sub-block
+ *
+ * @param[in] pu1_pred
+ * Pointer to prediction sub-block
+ *
+ * @param[in] pi2_out
+ * Pointer to residual sub-block
+ *
+ * @param[in] src_strd
+ * Source stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Destination stride
+ *
+ * @param[in] u4_qbits
+ * QP_BITS_h264_4x4 + floor(QP/6)
+ *
+ * @param[in] pu2_threshold_matrix
+ * Pointer to Forward Quant Threshold Matrix
+ *
+ * @param[in] pu2_scale_matrix
+ * Pointer to Forward Quant Scale Matrix
+ *
+ * @param[in] u4_round_factor
+ * Quantization Round factor
+ *
+ * @param[out] pu1_nnz
+ * Total non-zero coefficients in the current sub-block
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+void ih264_resi_trans_quant_chroma_4x4(UWORD8 *pu1_src, /* same as the luma 4x4 version, but src/pred are read at stride 2 (interleaved UV: one plane per call) */
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_out,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ const UWORD16 *pu2_scale_matrix,
+ const UWORD16 *pu2_threshold_matrix,
+ UWORD32 u4_qbits,
+ UWORD32 u4_round_factor,
+ UWORD8 *pu1_nnz,
+ WORD16 *pu1_dc_alt_addr)
+{
+ UWORD32 i;
+ WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
+ WORD32 i4_value, i4_sign;
+ UWORD32 u4_abs_value;
+ WORD16 *pi2_out_tmp = pi2_out;
+ UWORD32 u4_nonzero_coeff = 0;
+
+ for (i = 0; i < SUB_BLK_WIDTH_4x4; i++) /* pass 1: residue (every 2nd byte = one chroma plane) + horizontal transform */
+ {
+ /* computing prediction error (residue) */
+ x4 = pu1_src[0] - pu1_pred[0];
+ x5 = pu1_src[2] - pu1_pred[2]; /* indices 0,2,4,6: skip the other interleaved plane */
+ x6 = pu1_src[4] - pu1_pred[4];
+ x7 = pu1_src[6] - pu1_pred[6];
+
+ /* Horizontal transform */
+ x0 = x4 + x7;
+ x1 = x5 + x6;
+ x2 = x5 - x6;
+ x3 = x4 - x7;
+
+ pi2_out_tmp[0] = x0 + x1;
+ pi2_out_tmp[1] = (x3 <<1) + x2;
+ pi2_out_tmp[2] = x0 - x1;
+ pi2_out_tmp[3] = x3 - (x2<<1);
+
+ /* pointing to next row; */
+ pu1_src += src_strd;
+ pu1_pred += pred_strd;
+ pi2_out_tmp += 4; /* output is a packed (non-interleaved) 4x4 block */
+
+ }
+ pi2_out_tmp = pi2_out;
+ for (i = 0; i < SUB_BLK_WIDTH_4x4; i++) /* pass 2: vertical transform + quantization, one column at a time */
+ {
+
+ /* Vertical transform and quantization */
+ x4 = pi2_out_tmp[0];
+ x5 = pi2_out_tmp[4];
+ x6 = pi2_out_tmp[8];
+ x7 = pi2_out_tmp[12];
+
+
+ x0 = x4 + x7;
+ x1 = x5 + x6;
+ x2 = x5 - x6;
+ x3 = x4 - x7;
+
+ /* quantization is done in place */
+
+ i4_value = x0 + x1;
+
+ if(i==0)
+ {
+ *pu1_dc_alt_addr = i4_value; /* unquantized DC saved for the caller (chroma DC hadamard pass -- TODO confirm) */
+ }
+
+ FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
+ pu2_scale_matrix[0], u4_round_factor, u4_qbits,
+ u4_nonzero_coeff);
+ pi2_out_tmp[0] = i4_value;
+
+ i4_value = (x3 << 1) + x2;
+ FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[4],
+ pu2_scale_matrix[4], u4_round_factor, u4_qbits,
+ u4_nonzero_coeff);
+ pi2_out_tmp[4] = i4_value;
+
+ i4_value = x0 - x1;
+ FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[8],
+ pu2_scale_matrix[8], u4_round_factor, u4_qbits,
+ u4_nonzero_coeff);
+ pi2_out_tmp[8] = i4_value;
+
+ i4_value = x3 - (x2 << 1);
+ FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[12],
+ pu2_scale_matrix[12], u4_round_factor, u4_qbits,
+ u4_nonzero_coeff);
+ pi2_out_tmp[12] = i4_value;
+
+ pi2_out_tmp ++; /* next column; per-position matrices advance in step */
+ pu2_scale_matrix++;
+ pu2_threshold_matrix++;
+ }
+
+ /* Return total nonzero coefficients in the current sub block */
+ *pu1_nnz = u4_nonzero_coeff;
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs forward hadamard transform and quantization on a 4*4 block
+ *
+ * @par Description:
+ * The function accepts source buffer and estimation buffer. From these, it
+ * computes the residue. This is residue is then transformed and quantized.
+ * The transform and quantization are in placed computed. They use the residue
+ * buffer for this.
+ *
+ * @param[in] pu1_src
+ * Pointer to source sub-block
+ *
+ * @param[in] pu1_pred
+ * Pointer to prediction sub-block
+ *
+ * @param[in] pi2_out
+ * Pointer to residual sub-block
+ *
+ * @param[in] src_strd
+ * Source stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Destination stride
+ *
+ * @param[in] u4_qbits
+ * QP_BITS_h264_4x4 + floor(QP/6)
+ *
+ * @param[in] pu2_threshold_matrix
+ * Pointer to Forward Quant Threshold Matrix
+ *
+ * @param[in] pu2_scale_matrix
+ * Pointer to Forward Quant Scale Matrix
+ *
+ * @param[in] u4_round_factor
+ * Quantization Round factor
+ *
+ * @param[out] pu1_nnz
+ * Total non-zero coefficients in the current sub-block
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ */
+
+void ih264_hadamard_quant_4x4(WORD16 *pi2_src, /* 4x4 hadamard (no <<1 weights, unlike the core transform) on DC coefficients, then quantization */
+ WORD16 *pi2_dst,
+ const UWORD16 *pu2_scale_matrix,
+ const UWORD16 *pu2_threshold_matrix,
+ UWORD32 u4_qbits,
+ UWORD32 u4_round_factor,
+ UWORD8 *pu1_nnz)
+{
+ WORD32 i;
+ WORD32 x0,x1,x2,x3,x4,x5,x6,x7,i4_value;
+ UWORD32 u4_abs_value;
+ WORD32 i4_sign;
+
+ *pu1_nnz = 0; /* FWD_QUANT below accumulates the nnz count into pu1_nnz[0] */
+
+ for (i = 0; i < SUB_BLK_WIDTH_4x4; i++) /* pass 1: horizontal hadamard butterfly, row by row */
+ {
+ x4 = pi2_src[0];
+ x5 = pi2_src[1];
+ x6 = pi2_src[2];
+ x7 = pi2_src[3];
+
+ x0 = x4 + x7;
+ x1 = x5 + x6;
+ x2 = x5 - x6;
+ x3 = x4 - x7;
+
+ pi2_dst[0] = x0 + x1;
+ pi2_dst[1] = x3 + x2;
+ pi2_dst[2] = x0 - x1;
+ pi2_dst[3] = x3 - x2;
+
+ pi2_src += 4;
+ pi2_dst += 4;
+ }
+
+ /* Vertical transform and quantization */
+ pi2_dst -= SUB_BLK_WIDTH_4x4<<2; /* rewind to start of the 4x4 block (16 elements) */
+
+ for (i = 0; i < SUB_BLK_WIDTH_4x4; i++) /* pass 2: vertical butterfly with >>1 scaling, then quantize each column element */
+ {
+ x4 = pi2_dst[0];
+ x5 = pi2_dst[4];
+ x6 = pi2_dst[8];
+ x7 = pi2_dst[12] ;
+
+ x0 = x4 + x7;
+ x1 = x5 + x6;
+ x2 = x5 - x6;
+ x3 = x4 - x7;
+
+
+ i4_value = (x0 + x1) >> 1; /* >>1: DC-hadamard normalization; only matrix entry [0] is used for all 16 positions (flat DC scaling) */
+ FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
+ pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
+ pi2_dst[0] = i4_value;
+
+ i4_value = (x3 + x2) >> 1;
+ FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
+ pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
+ pi2_dst[4] = i4_value;
+
+ i4_value = (x0 - x1) >> 1;
+ FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
+ pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
+ pi2_dst[8] = i4_value;
+
+ i4_value = (x3 - x2) >> 1;
+ FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
+ pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
+ pi2_dst[12] = i4_value;
+
+ pi2_dst ++;
+ }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs forward hadamard transform and quantization on a 2*2 block
+ * for both U and V planes
+ *
+ * @par Description:
+ * The function accepts source buffer and estimation buffer. From these, it
+ * computes the residue. This is residue is then transformed and quantized.
+ * The transform and quantization are in placed computed. They use the residue
+ * buffer for this.
+ *
+ * @param[in] pu1_src
+ * Pointer to source sub-block
+ *
+ * @param[in] pu1_pred
+ * Pointer to prediction sub-block
+ *
+ * @param[in] pi2_out
+ * Pointer to residual sub-block
+ *
+ * @param[in] src_strd
+ * Source stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Destination stride
+ *
+ * @param[in] u4_qbits
+ * QP_BITS_h264_4x4 + floor(QP/6)
+ *
+ * @param[in] pu2_threshold_matrix
+ * Pointer to Forward Quant Threshold Matrix
+ *
+ * @param[in] pu2_scale_matrix
+ * Pointer to Forward Quant Scale Matrix
+ *
+ * @param[in] u4_round_factor
+ * Quantization Round factor
+ *
+ * @param[out] pu1_nnz
+ * Total non-zero coefficients in the current sub-block
+ *
+ * @returns
+ *
+ * @remarks
+ * NNZ for dc is populated at 0 and 5th position of pu1_nnz
+ *
+ */
+
+void ih264_hadamard_quant_2x2_uv(WORD16 *pi2_src, /* 2x2 hadamard + quant on chroma DC, run twice: plane 0 = U, plane 1 = V (4 coeffs each) */
+ WORD16 *pi2_dst,
+ const UWORD16 *pu2_scale_matrix,
+ const UWORD16 *pu2_threshold_matrix,
+ UWORD32 u4_qbits,
+ UWORD32 u4_round_factor,
+ UWORD8 *pu1_nnz)
+{
+ WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
+ WORD32 i4_value, i4_sign, plane;
+ UWORD32 u4_abs_value;
+
+ for(plane = 0; plane < 2; plane++)
+ {
+ pu1_nnz[plane] = 0; /* NOTE(review): nnz is written at indices 0 and 1 here, but the header comment above says positions 0 and 5 -- confirm caller expectation */
+
+ /* Horizontal transform */
+ x4 = pi2_src[0]; /* source layout: 2x2 DC block stored row-major as 4 consecutive WORD16 */
+ x5 = pi2_src[1];
+ x6 = pi2_src[2];
+ x7 = pi2_src[3];
+
+ x0 = x4 + x5;
+ x1 = x4 - x5;
+ x2 = x6 + x7;
+ x3 = x6 - x7;
+
+ /* Vertical transform and quantization */
+ i4_value = (x0 + x2); /* no >>1 normalization for the 2x2 case; single matrix entry [0] used throughout */
+ FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
+ pu2_scale_matrix[0], u4_round_factor, u4_qbits,
+ pu1_nnz[plane]);
+ pi2_dst[0] = i4_value;
+
+ i4_value = (x0 - x2);
+ FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
+ pu2_scale_matrix[0], u4_round_factor, u4_qbits,
+ pu1_nnz[plane]);
+ pi2_dst[2] = i4_value;
+
+ i4_value = (x1 - x3);
+ FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
+ pu2_scale_matrix[0], u4_round_factor, u4_qbits,
+ pu1_nnz[plane]);
+ pi2_dst[3] = i4_value;
+
+ i4_value = (x1 + x3); /* dst written out of order ([0],[2],[3],[1]) but covers all four positions exactly once */
+ FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
+ pu2_scale_matrix[0], u4_round_factor, u4_qbits,
+ pu1_nnz[plane]);
+ pi2_dst[1] = i4_value;
+
+ pi2_dst += 4; /* next plane's 4 coefficients */
+ pi2_src += 4;
+
+ }
+}
+
+/*
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs Single stage forward transform CF8 and quantization on 8*8 blocks
+ * for h.264
+ *
+ * @par Description:
+ * Performs single stage 8x8 forward transform CF8 after calculating the residue
+ * The result is then quantized
+ *
+ * @param[in] pu1_src
+ *  Pointer to the 8x8 source pixels
+ *
+ * @param[in] pu1_pred
+ *  Pointer to the 8x8 prediction pixels
+ *
+ * @param[out] pi2_out
+ *  Pointer to the 8x8 transformed and quantized output coefficients
+ *
+ * @param[in] src_strd
+ *  Stride of the source buffer
+ *
+ * @param[in] pred_strd
+ *  Stride of the prediction buffer
+ *
+ * @param[in] pu2_scale_matrix
+ *  Pointer to the 8x8 forward quantization scale matrix
+ *
+ * @param[in] pu2_threshold_matrix
+ *  Pointer to the 8x8 threshold matrix; coefficients below the threshold
+ *  are quantized to zero
+ *
+ * @param[in] u4_qbits
+ *  Quantization shift amount
+ *
+ * @param[in] u4_round_factor
+ *  Quantization rounding factor
+ *
+ * @param[out] pu1_nnz
+ *  Total number of non-zero coefficients in the block
+ *
+ * @param[in] pu1_dc_alt_addr
+ *  Unused in the 8x8 path (no separate DC transform)
+ *
+ * @returns Void
+ *
+ *
+ *******************************************************************************
+ */
+void ih264_resi_trans_quant_8x8(UWORD8 *pu1_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_out,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ const UWORD16 *pu2_scale_matrix,
+ const UWORD16 *pu2_threshold_matrix,
+ UWORD32 u4_qbits,
+ UWORD32 u4_round_factor,
+ UWORD8 *pu1_nnz,
+ WORD16 *pu1_dc_alt_addr)
+
+{
+ WORD16 *pi2_out_tmp = pi2_out;
+ UWORD32 i;
+ /* a0-a7: butterfly intermediates; r0-r7: residue row / gathered column */
+ WORD32 a0, a1, a2, a3, a4, a5, a6, a7;
+ WORD32 r0, r1, r2, r3, r4, r5, r6, r7;
+ WORD32 i4_sign;
+ UWORD32 u4_abs_value;
+ /* Running count of non-zero quantized coeffs over the whole 8x8 block */
+ UWORD32 u4_nonzero_coeff = 0;
+
+ /* The 8x8 path has no separate DC transform stage, so the DC address is */
+ /* unused. NOTE(review): parameter is named pu1_* but typed WORD16* -    */
+ /* consider renaming to pi2_* for consistency with the naming convention */
+ UNUSED(pu1_dc_alt_addr);
+
+ /*Horizontal transform */
+ /* we are going to use the a's and r's in a twisted way since */
+ /*i dont want to declare more variables */
+ for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
+ {
+ /* Residue for one row: r[k] = src[k] - pred[k] */
+ r0 = pu1_src[0];
+ r0 -= pu1_pred[0];
+ r1 = pu1_src[1];
+ r1 -= pu1_pred[1];
+ r2 = pu1_src[2];r2 -= pu1_pred[2];
+ r3 = pu1_src[3];r3 -= pu1_pred[3];
+ r4 = pu1_src[4];r4 -= pu1_pred[4];
+ r5 = pu1_src[5];r5 -= pu1_pred[5];
+ r6 = pu1_src[6];r6 -= pu1_pred[6];
+ r7 = pu1_src[7];r7 -= pu1_pred[7];
+
+
+ /* Even part of the 1-D butterfly: produces output positions 0, 2, 4, 6 */
+ a0 = r0 + r7;
+ a1 = r1 + r6;
+ a2 = r2 + r5;
+ a3 = r3 + r4;
+
+ a4 = a0 + a3;
+ a5 = a1 + a2;
+ a6 = a0 - a3;
+ a7 = a1 - a2;
+
+ pi2_out_tmp[0] = a4 + a5;
+
+ pi2_out_tmp[2] = a6 + (a7>>1);
+ pi2_out_tmp[4] = a4 - a5;
+ pi2_out_tmp[6] = (a6>>1) - a7;
+
+ /* Odd part of the 1-D butterfly: produces output positions 1, 3, 5, 7 */
+ a0 = r0 - r7;
+ a1 = r1 - r6;
+ a2 = r2 - r5;
+ a3 = r3 - r4;
+
+ a4 = a1 + a2 + ((a0>>1) + a0);
+ a5 = a0 - a3 - ((a2>>1) + a2);
+ a6 = a0 + a3 - ((a1>>1) + a1);
+ a7 = a1 - a2 + ((a3>>1) + a3);
+
+ pi2_out_tmp[1] = a4 + (a7>>2);
+ pi2_out_tmp[3] = a5 + (a6>>2);
+ pi2_out_tmp[5] = a6 - (a5>>2);
+ pi2_out_tmp[7] = (a4>>2) - a7;
+
+ pu1_src += src_strd;
+ pu1_pred += pred_strd;
+ pi2_out_tmp += 8;
+ }
+
+ /*vertical transform and quant */
+
+ pi2_out_tmp = pi2_out;
+
+ /* One iteration per column; pi2_out_tmp[8*k] walks down column i */
+ for (i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
+ {
+
+ r0 = pi2_out_tmp[0];
+ r1 = pi2_out_tmp[8];
+ r2 = pi2_out_tmp[16];
+ r3 = pi2_out_tmp[24];
+ r4 = pi2_out_tmp[32];
+ r5 = pi2_out_tmp[40];
+ r6 = pi2_out_tmp[48];
+ r7 = pi2_out_tmp[56];
+
+ /* Even part of the vertical butterfly (same structure as horizontal) */
+ a0 = r0 + r7;
+ a1 = r1 + r6;
+ a2 = r2 + r5;
+ a3 = r3 + r4;
+
+ a4 = a0 + a3;
+ a5 = a1 + a2;
+ a6 = a0 - a3;
+ a7 = a1 - a2;
+
+ /* Odd-part differences (a4-a7 of the even part are still live) */
+ a0 = r0 - r7;
+ a1 = r1 - r6;
+ a2 = r2 - r5;
+ a3 = r3 - r4;
+
+ r0 = a4 + a5;
+ r2 = a6 + (a7>>1);
+ r4 = a4 - a5;
+ r6 = (a6>>1) - a7;
+
+ a4 = a1 + a2 + ((a0>>1) + a0);
+ a5 = a0 - a3 - ((a2>>1) + a2);
+ a6 = a0 + a3 - ((a1>>1) + a1);
+ a7 = a1 - a2 + ((a3>>1) + a3);
+
+ r1 = a4 + (a7>>2);
+ r3 = a5 + (a6>>2);
+ r5 = a6 - (a5>>2);
+ r7 = (a4>>2) - a7;
+
+ /* FWD_QUANT (macro defined elsewhere) quantizes the value in place    */
+ /* using the scale/threshold entry for this coeff position and         */
+ /* presumably increments u4_nonzero_coeff for non-zero results -       */
+ /* confirm against the macro definition                                */
+ FWD_QUANT(r0, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
+ pu2_scale_matrix[0], u4_round_factor, u4_qbits,
+ u4_nonzero_coeff);
+ pi2_out_tmp[0] = r0;
+
+ FWD_QUANT(r1, u4_abs_value, i4_sign, pu2_threshold_matrix[8],
+ pu2_scale_matrix[8], u4_round_factor, u4_qbits,
+ u4_nonzero_coeff);
+ pi2_out_tmp[8] = r1;
+
+ FWD_QUANT(r2, u4_abs_value, i4_sign, pu2_threshold_matrix[16],
+ pu2_scale_matrix[16], u4_round_factor, u4_qbits,
+ u4_nonzero_coeff);
+ pi2_out_tmp[16] = r2;
+
+ FWD_QUANT(r3, u4_abs_value, i4_sign, pu2_threshold_matrix[24],
+ pu2_scale_matrix[24], u4_round_factor, u4_qbits,
+ u4_nonzero_coeff);
+ pi2_out_tmp[24] = r3;
+
+ FWD_QUANT(r4, u4_abs_value, i4_sign, pu2_threshold_matrix[32],
+ pu2_scale_matrix[32], u4_round_factor, u4_qbits,
+ u4_nonzero_coeff);
+ pi2_out_tmp[32] = r4;
+
+ FWD_QUANT(r5, u4_abs_value, i4_sign, pu2_threshold_matrix[40],
+ pu2_scale_matrix[40], u4_round_factor, u4_qbits,
+ u4_nonzero_coeff);
+ pi2_out_tmp[40] = r5;
+
+ FWD_QUANT(r6, u4_abs_value, i4_sign, pu2_threshold_matrix[48],
+ pu2_scale_matrix[48], u4_round_factor, u4_qbits,
+ u4_nonzero_coeff);
+ pi2_out_tmp[48] = r6;
+
+ FWD_QUANT(r7, u4_abs_value, i4_sign, pu2_threshold_matrix[56],
+ pu2_scale_matrix[56], u4_round_factor, u4_qbits,
+ u4_nonzero_coeff);
+ pi2_out_tmp[56] = r7;
+
+ /* Advance to the next column of coeffs and matrix entries */
+ pi2_out_tmp++;
+ pu2_scale_matrix++;
+ pu2_threshold_matrix++;
+ }
+ /* Return total nonzero coefficients in the current sub block */
+ /* At most 64 coeffs in an 8x8 block, so the narrowing store to UWORD8 is safe */
+ *pu1_nnz = u4_nonzero_coeff;
+}
diff --git a/common/ih264_size_defs.h b/common/ih264_size_defs.h
new file mode 100755
index 0000000..e2a8b76
--- /dev/null
+++ b/common/ih264_size_defs.h
@@ -0,0 +1,85 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_size_defs.h
+ *
+ * @brief
+ * Contains size-related macro definitions for H264 transform, quantization and inverse quantization
+ *
+ * @author
+ * Ittiam
+ *
+ * @remarks
+ *
+ ********************************************************************************/
+
#ifndef IH264_SIZE_DEFS_H_
#define IH264_SIZE_DEFS_H_

/*****************************************************************************/
/* Constant Macros                                                           */
/*****************************************************************************/

/*-----------------------Primary defs--------------------------*/

/* Width of a 4x4 block */
#define SUB_BLK_WIDTH_4x4 4

/* Width of an 8x8 block */
#define SUB_BLK_WIDTH_8x8 8

/* Number of chroma 4x4 blocks in a row of coeffs (4:2:0) */
#define SUB_BLK_COUNT_CHROMA_4x4_420 2

/* Number of luma 4x4 blocks in a row of coeffs */
#define SUB_BLK_COUNT_LUMA_4x4 4

/* Number of chroma planes (Cb, Cr) */
#define NUM_CHROMA_PLANES 2

/* Constant bit shifts used in quantization */
#define QP_BITS_h264_4x4 15
#define QP_BITS_h264_8x8 16


/*---------------------------Derived defs------------------------*/

/* Number of coefficients in a 4x4 block */
/* NOTE(review): the original definition carried a trailing ';' which would
 * break any use of this macro inside an expression; removed, and the
 * expansion parenthesized. */
#define COFF_CNT_SUB_BLK_4x4 (SUB_BLK_WIDTH_4x4 * SUB_BLK_WIDTH_4x4)

/* Number of luma 4x4 blocks in an MB */
#define SUB_BLK_LUMA_4X4_CNT_MB (SUB_BLK_COUNT_LUMA_4x4 * SUB_BLK_COUNT_LUMA_4x4)

/* Number of chroma 4x4 blocks in an MB (per plane, and for both planes) */
#define SUB_BLK_CHROMA_4X4_CNT_MB (SUB_BLK_COUNT_CHROMA_4x4_420 * SUB_BLK_COUNT_CHROMA_4x4_420)
#define SUB_BLK_CHROMA_4X4_CNT_MB_BIPLANE (SUB_BLK_CHROMA_4X4_CNT_MB * NUM_CHROMA_PLANES)

/* Size of trans buff = 4x4 for DC block + 4x4 coeffs for each 4x4 AC block */
/* NOTE(review): the original expression contained a stray "*+" which made it
 * evaluate to 4096 instead of the documented 16 + 16*16 = 272; fixed to match
 * the stated intent - confirm no caller relies on the larger accidental size. */
#define SIZE_TRANS_BUFF ((SUB_BLK_WIDTH_4x4 * SUB_BLK_WIDTH_4x4) + \
                         (SUB_BLK_WIDTH_4x4 * SUB_BLK_WIDTH_4x4 * \
                          SUB_BLK_COUNT_LUMA_4x4 * SUB_BLK_COUNT_LUMA_4x4))

/* Memory size = 4x4 block of residual coeffs + 4x4 block for DC coeffs */
#define SIZE_TMP_BUFF_ITRANS ((SUB_BLK_WIDTH_4x4 * SUB_BLK_WIDTH_4x4) + \
                              (SUB_BLK_WIDTH_4x4 * SUB_BLK_WIDTH_4x4))

#endif /* IH264_SIZE_DEFS_H_ */
diff --git a/common/ih264_structs.h b/common/ih264_structs.h
new file mode 100755
index 0000000..fa4e142
--- /dev/null
+++ b/common/ih264_structs.h
@@ -0,0 +1,1722 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+ *******************************************************************************
+ * @file
+ * ih264_structs.h
+ *
+ * @brief
+ * Structure definitions used in the code
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#ifndef _IH264_STRUCTS_H_
+#define _IH264_STRUCTS_H_
+
+/** MB Type info for Intra MBs */
+typedef struct
+{
+ /** Number of MB partitions for this mb type */
+ UWORD32 u4_num_mbpart;
+ /** Partition prediction mode (MBPART_PREDMODE_T declared elsewhere) */
+ MBPART_PREDMODE_T e_mbpart_predmode;
+ /** Intra 16x16 prediction mode (MBMODES_I16x16 declared elsewhere) */
+ MBMODES_I16x16 e_intra_predmode;
+ /** Coded block pattern, chroma part - presumably tabulated per mb_type; verify against the users of this table */
+ UWORD32 u4_cpb_chroma;
+ /** Coded block pattern, luma part */
+ UWORD32 u4_cpb_luma;
+}intra_mbtype_info_t;
+
+/** MB Type info for Inter MBs */
+typedef struct
+{
+ /** Number of MB partitions for this mb type */
+ UWORD32 u4_num_mbpart;
+ /** Prediction mode of the first partition */
+ MBPART_PREDMODE_T e_mbpart_predmode_0;
+ /** Prediction mode of the second partition */
+ MBPART_PREDMODE_T e_mbpart_predmode_1;
+ /** Partition width - units not shown here; presumably pixels, confirm at usage site */
+ UWORD32 u4_mbpart_wd;
+ /** Partition height - same units as u4_mbpart_wd */
+ UWORD32 u4_mbpart_ht;
+}inter_mbtype_info_t;
+
+
+/** Sub MB Type info for Inter MBs */
+typedef struct
+{
+ /** Number of sub MB partitions */
+ UWORD32 u4_num_mbpart;
+ /** Prediction mode of the sub MB partitions */
+ MBPART_PREDMODE_T e_mbpart_predmode;
+ /** Sub partition width - units not shown here; confirm at usage site */
+ UWORD32 u4_mbpart_wd;
+ /** Sub partition height - same units as u4_mbpart_wd */
+ UWORD32 u4_mbpart_ht;
+}submbtype_info_t;
+
+/**
+ * Picture buffer
+ */
+typedef struct
+{
+ UWORD8* pu1_luma;
+ UWORD8* pu1_chroma;
+
+ WORD32 i4_abs_poc;
+ WORD32 i4_poc_lsb;
+
+
+ /** Lower 32 bit of time stamp */
+ UWORD32 u4_timestamp_low;
+
+ /** Upper 32 bit of time stamp */
+ UWORD32 u4_timestamp_high;
+
+ WORD32 i4_used_as_ref;
+
+ /**
+ * frame_num in the slice header
+ */
+ WORD32 i4_frame_num;
+
+ /**
+ * Long-term frame idx
+ * TODO: store in frame_num
+ */
+ WORD32 i4_long_term_frame_idx;
+
+ /*
+ * 0: Top Field
+ * 1: Bottom Field
+ */
+ WORD8 i1_field_type;
+
+ /**
+ * buffer ID from frame buffer manager
+ */
+ WORD32 i4_buf_id;
+
+} pic_buf_t;
+
+
+/**
+ * Reference List
+ */
+typedef struct
+{
+ void *pv_pic_buf;
+
+ void *pv_mv_buf;
+
+} ref_list_t;
+
+
+/**
+ * Motion vector
+ */
+typedef struct
+{
+ /**
+ * Horizontal Motion Vector
+ */
+ WORD16 i2_mvx;
+
+ /**
+ * Vertical Motion Vector
+ */
+ WORD16 i2_mvy;
+} mv_t;
+
+/*****************************************************************************/
+/* Following results in packed 48 bit structure. If mv_t included */
+/* ref_pic_buf_id, then 8 bits will be wasted for each mv for aligning. */
+/* Also using mv_t as elements directly instead of a pointer to l0 and l1 */
+/* mvs. Since pointer takes 4 bytes and MV itself is 4 bytes. It does not */
+/* really help using pointers. */
+/*****************************************************************************/
+
+/**
+ * PU Motion Vector info
+ */
+typedef struct
+{
+ /**
+ * L0 Motion Vector
+ */
+ mv_t s_l0_mv;
+
+ /**
+ * L1 Motion Vector
+ */
+ mv_t s_l1_mv;
+
+ /**
+ * L0 Ref index
+ */
+ WORD8 i1_l0_ref_idx;
+
+ /**
+ * L1 Ref index
+ */
+ WORD8 i1_l1_ref_idx;
+
+ /**
+ * L0 Ref Pic Buf ID
+ */
+ WORD8 i1_l0_ref_pic_buf_id;
+
+ /**
+ * L1 Ref Pic Buf ID
+ */
+ WORD8 i1_l1_ref_pic_buf_id;
+
+} pu_mv_t;
+
+/**
+ * PU information
+ */
+typedef struct
+{
+
+ /**
+ * Motion Vectors
+ */
+ pu_mv_t s_mv;
+
+ /**
+ * PU X position in terms of min PU (4x4) units
+ */
+ UWORD32 b2_pos_x : 2;
+
+ /**
+ * PU Y position in terms of min PU (4x4) units
+ */
+ UWORD32 b2_pos_y : 2;
+
+ /**
+ * PU width in pixels = (b2_wd + 1) << 2
+ */
+ UWORD32 b2_wd : 2;
+
+ /**
+ * PU height in pixels = (b2_ht + 1) << 2
+ */
+ UWORD32 b2_ht : 2;
+
+ /**
+ * Intra or Inter flag for each partition - 0 or 1
+ */
+ UWORD32 b1_intra_flag : 1;
+
+ /**
+ * PRED_L0, PRED_L1, PRED_BI
+ */
+ UWORD32 b2_pred_mode : 2;
+
+} pu_t;
+
+
+/**
+ * MB information to be stored for entire frame
+ */
+typedef struct
+{
+ /**
+ * Transform sizes 0: 4x4, 1: 8x8,
+ */
+ UWORD32 b1_trans_size : 1;
+
+ /**
+ * CBP - 4 bits for Y, 1 for U and 1 for V
+ */
+ UWORD32 b6_cbp: 6;
+
+ /**
+ * Intra pred sizes 0: 4x4, 1: 8x8, 2: 16x16
+ */
+ UWORD32 b2_intra_pred_size : 2;
+
+ /**
+ * Flag to signal if the current MB is IPCM
+ */
+ UWORD32 b1_ipcm : 1;
+
+}mb_t;
+
+/*****************************************************************************/
+/* Info from last TU row of MB is stored in a row level neighbour buffer */
+/* , which will be used for Boundary Strength computation */
+/*****************************************************************************/
+/**
+ * MB neighbor info
+ */
+typedef struct
+{
+ /**
+ * Slice index of the mb
+ */
+ UWORD16 u2_slice_idx;
+
+ /*************************************************************************/
+ /* CBF of bottom TU row (replicated in 4 pixel boundary) */
+ /* MSB contains CBF of first TU in the last row and LSB contains CBF */
+ /* of last TU in the last row */
+ /*************************************************************************/
+ /**
+ * CBF of bottom TU row
+ */
+ UWORD16 u2_packed_cbf;
+
+ /*************************************************************************/
+ /* QP of bottom TU row (replicated at 8 pixel boundary (Since QP can */
+ /* not change at less than min CU granularity) */
+ /*************************************************************************/
+ /**
+ * QP of bottom TU row
+ */
+ UWORD8 u1_qp;
+
+} mb_top_ny_info_t;
+
+/**
+ * MB level context
+ */
+typedef struct _mb_ctxt_t
+{
+ /*************************************************************************/
+ /* Tile boundary can be detected by looking at tile start x and tile */
+ /* start y. And based on the tile, slice and frame boundary the */
+ /* following will be initialized. */
+ /*************************************************************************/
+ /**
+ * Pointer to left MB
+ */
+ /* If not available, this will be set to NULL */
+ struct _mb_ctxt_t *ps_mb_left;
+
+ /**
+ * Pointer to top-left MB
+ */
+ /* If not available, this will be set to NULL */
+ mb_top_ny_info_t *ps_mb_ny_topleft;
+
+ /**
+ * Pointer to top MB
+ */
+ /* If not available, this will be set to NULL */
+ mb_top_ny_info_t *ps_mb_ny_top;
+
+ /**
+ * Pointer to top-right MB
+ */
+ /* If not available, this will be set to NULL */
+ mb_top_ny_info_t *ps_mb_ny_topright;
+
+ /*************************************************************************/
+ /* Pointer to PU data. */
+ /* This points to a MV Bank stored at frame level. Though this */
+ /* pointer can be derived by reading offset at frame level, it is */
+ /* stored here for faster access. Can be removed if storage of MB */
+ /* structure is critical */
+ /*************************************************************************/
+ /**
+ * Pointer to PU data
+ */
+ pu_t *ps_pu;
+
+ /*************************************************************************/
+ /* Pointer to a PU map stored at frame level, */
+ /* Though this pointer can be derived by multiplying MB address with */
+ /* number of minTUs in a MB, it is stored here for faster access. */
+ /* Can be removed if storage of MB structure is critical */
+ /*************************************************************************/
+ /**
+ * Pointer to a PU map stored at frame level
+ */
+ UWORD8 *pu1_pu_map;
+
+ /**
+ * Number of TUs filled in as_tu
+ */
+ /*************************************************************************/
+ /* Having the first entry as 32 bit data, helps in keeping each of */
+ /* the structures aligned to 32 bits at MB level */
+ /*************************************************************************/
+ WORD32 i4_tu_cnt;
+
+ /**
+ * Pointer to transform coeff data
+ */
+ /*************************************************************************/
+ /* Following format is repeated for every coded TU */
+ /* Luma Block */
+ /* num_coeffs : 16 bits */
+ /* zero_cols : 8 bits ( 1 bit per 4 columns) */
+ /* sig_coeff_map : ((TU Size * TU Size) + 31) >> 5 number of WORD32s */
+ /* coeff_data : Non zero coefficients */
+ /* Cb Block (only for last TU in 4x4 case else for every luma TU) */
+ /* num_coeffs : 16 bits */
+ /* zero_cols : 8 bits ( 1 bit per 4 columns) */
+ /* sig_coeff_map : ((TU Size * TU Size) + 31) >> 5 number of WORD32s */
+ /* coeff_data : Non zero coefficients */
+ /* Cr Block (only for last TU in 4x4 case else for every luma TU) */
+ /* num_coeffs : 16 bits */
+ /* zero_cols : 8 bits ( 1 bit per 4 columns) */
+ /* sig_coeff_map : ((TU Size * TU Size) + 31) >> 5 number of WORD32s */
+ /* coeff_data : Non zero coefficients */
+ /*************************************************************************/
+ void *pv_coeff_data;
+
+ /**
+ * Slice to which the MB belongs to
+ */
+ WORD32 i4_slice_idx;
+
+ /**
+ * MB column position
+ */
+ WORD32 i4_pos_x;
+
+ /**
+ * MB row position
+ */
+ WORD32 i4_pos_y;
+
+ /**
+ * Number of PUs filled in ps_pu
+ */
+ WORD32 i4_pu_cnt;
+
+ /**
+ * Index of current PU being processed in ps_pu
+ */
+ /* Scratch variable set to 0 at the start of any PU processing function */
+ WORD32 i4_pu_idx;
+
+ /**
+ * Vertical Boundary strength
+ */
+ /* Two bits per edge.
+ Stored in format. BS[15] | BS[14] | .. |BS[0]*/
+ UWORD32 *pu4_vert_bs;
+
+ /**
+ * Horizontal Boundary strength
+ */
+
+ /* Two bits per edge.
+ Stored in format. BS[15] | BS[14] | .. |BS[0]*/
+ UWORD32 *pu4_horz_bs;
+
+ /**
+ * Qp array stored for each 8x8 pixels
+ */
+ UWORD8 *pu1_qp;
+
+ /**
+ * Pointer to current frame's pu_t array
+ */
+ pu_t *ps_frm_pu;
+
+ /**
+ * Pointer to current frame's pu_t index array, which stores starting index
+ * of pu_t for every MB
+ */
+ UWORD32 *pu4_frm_pu_idx;
+
+ /**
+ * Pointer to current frame's pu map array
+ */
+ UWORD8 *pu1_frm_pu_map;
+
+ /*************************************************************************/
+ /* Need to add encoder specific elements for identifying the order of */
+ /* coding for CU, TU and PU if any */
+ /*************************************************************************/
+} mb_ctxt_t;
+
+/*************************************************************************/
+/* The following describes how each of the CU cases are handled */
+/*************************************************************************/
+
+/*************************************************************************/
+/* For SKIP MB */
+/* One Inter PU with appropriate MV */
+/* One TU which says CBP is zero and size is 16x16 */
+/*************************************************************************/
+
+/*************************************************************************/
+/* For Inter MB */
+/* M Inter PU with appropriate MVs (M between 1 to 4) */
+/* Number of TUs derived based on transform size */
+/*************************************************************************/
+
+/*************************************************************************/
+/* For Intra MB */
+/* Number of TUs derived based on transform size */
+/* N Intra Modes are signaled along with coeff data at the start */
+/*************************************************************************/
+
+/*************************************************************************/
+/* For Intra PCM MB */
+/* One TU which says ipcm is 1 */
+/*************************************************************************/
+
+
+
+/**
+ * Structure to hold quantization parameters of an mb
+ */
+typedef struct
+{
+
+ /*
+ * mb qp
+ */
+ UWORD8 u1_mb_qp;
+
+ /*
+ * mb qp / 6
+ */
+ UWORD8 u1_qp_div;
+
+ /*
+ * mb qp mod 6
+ */
+ UWORD8 u1_qp_rem;
+
+ /*
+ * QP bits
+ */
+ UWORD8 u1_qbits;
+
+ /*
+ * forward scale matrix
+ */
+ const UWORD16 *pu2_scale_mat;
+
+ /*
+ * threshold matrix for quantization
+ */
+ UWORD16 *pu2_thres_mat;
+
+ /*
+ * Threshold to compare the sad with
+ */
+ UWORD16 *pu2_sad_thrsh;
+
+ /*
+ * qp dependent rounding constant
+ */
+ UWORD32 u4_dead_zone;
+
+ /*
+ * inverse scale matrix
+ */
+ const UWORD16 *pu2_iscale_mat;
+
+ /*
+ * Weight matrix in iquant
+ */
+ UWORD16 *pu2_weigh_mat;
+
+}quant_params_t;
+
+/**
+ * Structure to hold Profile tier level info for a given layer
+ */
+
+typedef struct
+{
+ /**
+ * NAL unit type
+ */
+ WORD8 i1_nal_unit_type;
+
+ /**
+ * NAL ref idc
+ */
+ WORD8 i1_nal_ref_idc;
+
+
+} nal_header_t;
+
+/**
+ * HRD parameters Info
+ */
+typedef struct
+{
+ /**
+ * Specifies the number of alternative CPB specifications in the
+ * bitstream
+ */
+ UWORD8 u1_cpb_cnt_minus1;
+
+ /**
+ * (together with bit_rate_value_minus1) specifies the
+ * maximum input bit rate of the i-th CPB
+ */
+ UWORD32 u4_bit_rate_scale;
+
+ /**
+ * (together with cpb_size_du_value_minus1) specifies
+ * CPB size of the i-th CPB when the CPB operates
+ * at the access unit level
+ */
+ UWORD32 u4_cpb_size_scale;
+
+ /**
+ * (together with bit_rate_scale) specifies the
+ * maximum input bit rate for the i-th CPB
+ */
+ UWORD32 au4_bit_rate_value_minus1[32];
+ /**
+ * together with cpb_size_scale to specify the
+ * CPB size when the CPB operates at the access unit level.
+ */
+ UWORD32 au4_cpb_size_value_minus1[32];
+
+ /**
+ * if 1, specifies that the HSS operates in a constant bit rate (CBR) mode
+ * if 0, specifies that the HSS operates in an intermittent bit rate (VBR) mode
+ */
+ UWORD8 au1_cbr_flag[32];
+
+
+ /**
+ * specifies the length, in bits for initial cpb delay (nal/vcl)syntax in bp sei
+ */
+ UWORD8 u1_initial_cpb_removal_delay_length_minus1;
+
+ /**
+ * specifies the length, in bits for the cpb delay syntax in pt_sei
+ */
+ UWORD8 u1_cpb_removal_delay_length_minus1;
+
+ /**
+ * specifies the length, in bits, of the pic_dpb_output_delay syntax element in the pt SEI message
+ */
+ UWORD8 u1_dpb_output_delay_length_minus1;
+
+ /**
+ * Specifies length of the time offset parameter
+ */
+ UWORD8 u1_time_offset_length;
+
+}hrd_params_t;
+
+
+/**
+ * Structure to hold VUI parameters Info
+ */
+typedef struct
+{
+ /**
+ * indicates the presence of aspect_ratio
+ */
+ UWORD8 u1_aspect_ratio_info_present_flag;
+
+ /**
+ * specifies the aspect ratio of the luma samples
+ */
+ UWORD8 u1_aspect_ratio_idc;
+
+ /**
+ * width of the luma samples. user dependent
+ */
+ UWORD16 u2_sar_width;
+
+ /**
+ * Height of the luma samples. user dependent
+ */
+ UWORD16 u2_sar_height;
+
+ /**
+ * if 1, specifies that the overscan_appropriate_flag is present
+ * if 0, the preferred display method for the video signal is unspecified
+ */
+ UWORD8 u1_overscan_info_present_flag;
+
+ /**
+ * if 1,indicates that the cropped decoded pictures output
+ * are suitable for display using overscan
+ */
+ UWORD8 u1_overscan_appropriate_flag;
+
+ /**
+ * if 1 specifies that video_format, video_full_range_flag and
+ * colour_description_present_flag are present
+ */
+ UWORD8 u1_video_signal_type_present_flag;
+
+ /**
+ * pal, secam, ntsc, ...
+ */
+ UWORD8 u1_video_format;
+
+ /**
+ * indicates the black level and range of the luma and chroma signals
+ */
+ UWORD8 u1_video_full_range_flag;
+
+ /**
+ * if 1,to 1 specifies that colour_primaries, transfer_characteristics
+ * and matrix_coefficients are present
+ */
+ UWORD8 u1_colour_description_present_flag;
+
+ /**
+ * indicates the chromaticity coordinates of the source primaries
+ */
+ UWORD8 u1_colour_primaries;
+
+ /**
+ * indicates the opto-electronic transfer characteristic of the source picture
+ */
+ UWORD8 u1_transfer_characteristics;
+
+ /**
+ * the matrix coefficients used in deriving luma and chroma signals
+ * from the green, blue, and red primaries
+ */
+ UWORD8 u1_matrix_coefficients;
+
+ /**
+ * if 1, specifies that chroma_sample_loc_type_top_field and
+ * chroma_sample_loc_type_bottom_field are present
+ */
+ UWORD8 u1_chroma_loc_info_present_flag;
+
+ /**
+ * location of chroma samples
+ */
+ UWORD8 u1_chroma_sample_loc_type_top_field;
+
+ UWORD8 u1_chroma_sample_loc_type_bottom_field;
+
+ /**
+ * Indicates the presence of the
+ * num_units_in_ticks, time_scale flag
+ */
+ UWORD8 u1_vui_timing_info_present_flag;
+
+ /**
+ * Number of units that
+ * correspond to one increment of the
+ * clock. Indicates the resolution
+ */
+ UWORD32 u4_vui_num_units_in_tick;
+
+ /**
+ * The number of time units that pass in one second
+ */
+ UWORD32 u4_vui_time_scale;
+
+ /**
+ * Flag indicating that time difference between two frames is a constant
+ */
+ UWORD8 u1_fixed_frame_rate_flag;
+
+ /**
+ * Indicates the presence of NAL HRD parameters
+ */
+ UWORD8 u1_nal_hrd_parameters_present_flag;
+
+ /**
+ * NAL level HRD parameters
+ */
+ hrd_params_t s_nal_hrd_parameters;
+
+ /**
+ * Indicates the presence of VCL HRD parameters
+ */
+ UWORD8 u1_vcl_hrd_parameters_present_flag;
+
+ /**
+ * VCL level HRD parameters
+ */
+ hrd_params_t s_vcl_hrd_parameters;
+
+ /**
+ * Specifies the HRD operational mode
+ */
+ UWORD8 u1_low_delay_hrd_flag;
+
+ /**
+ * Indicates presence of SEI messages which include pic_struct syntax element
+ */
+ UWORD8 u1_pic_struct_present_flag;
+
+ /**
+ * 1, specifies that the following cvs bitstream restriction parameters are present
+ */
+ UWORD8 u1_bitstream_restriction_flag;
+
+ /**
+ * if 0, indicates that no pel outside the pic boundaries and
+ * no sub-pels derived using pels outside the pic boundaries is used for inter prediction
+ */
+ UWORD8 u1_motion_vectors_over_pic_boundaries_flag;
+
+ /**
+ * Indicates a number of bytes not exceeded by the sum of the sizes of the VCL NAL units
+ * associated with any coded picture
+ */
+ UWORD8 u1_max_bytes_per_pic_denom;
+
+ /**
+ * Indicates an upper bound for the number of bits of coding_unit() data
+ */
+ UWORD8 u1_max_bits_per_mb_denom;
+
+ /**
+ * Indicate the maximum absolute value of a decoded horizontal MV component
+ * in quarter-pel luma units
+ */
+ UWORD8 u1_log2_max_mv_length_horizontal;
+
+ /**
+ * Indicate the maximum absolute value of a decoded vertical MV component
+ * in quarter-pel luma units
+ */
+ UWORD8 u1_log2_max_mv_length_vertical;
+
+ /**
+ * Max number of frames that are not synchronized in display and decode order
+ */
+ UWORD8 u1_num_reorder_frames;
+
+ /**
+ * specifies required size of the HRD DPB in units of frame buffers.
+ */
+ UWORD8 u1_max_dec_frame_buffering;
+
+} vui_t;
+
+
+/**
+ * Structure to hold SPS info
+ */
+typedef struct
+{
+ /**
+ * profile_idc
+ */
+ UWORD8 u1_profile_idc;
+
+ /** constraint_set0_flag */
+ UWORD8 u1_constraint_set0_flag;
+
+ /** constraint_set1_flag */
+ UWORD8 u1_constraint_set1_flag;
+
+ /** constraint_set2_flag */
+ UWORD8 u1_constraint_set2_flag;
+
+ /** constraint_set3_flag */
+ UWORD8 u1_constraint_set3_flag;
+
+ /**
+ * level_idc
+ */
+ UWORD8 u1_level_idc;
+
+ /**
+ * seq_parameter_set_id
+ */
+ UWORD8 u1_sps_id;
+
+
+ /**
+ * chroma_format_idc
+ */
+ UWORD8 u1_chroma_format_idc;
+
+ /**
+ * residual_colour_transform_flag
+ */
+ WORD8 i1_residual_colour_transform_flag;
+
+ /**
+ * bit_depth_luma_minus8
+ */
+ WORD8 i1_bit_depth_luma;
+
+ /**
+ * bit_depth_chroma_minus8
+ */
+ WORD8 i1_bit_depth_chroma;
+
+ /**
+ * qpprime_y_zero_transform_bypass_flag
+ */
+ WORD8 i1_qpprime_y_zero_transform_bypass_flag;
+
+ /**
+ * seq_scaling_matrix_present_flag
+ */
+ WORD8 i1_seq_scaling_matrix_present_flag;
+
+ /**
+ * seq_scaling_list_present_flag
+ */
+ WORD8 ai1_seq_scaling_list_present_flag[8];
+
+ /**
+ * log2_max_frame_num_minus4
+ */
+ WORD8 i1_log2_max_frame_num;
+
+ /**
+ * MaxFrameNum in the standard
+ * 1 << i1_log2_max_frame_num
+ */
+ WORD32 i4_max_frame_num;
+
+ /**
+ * pic_order_cnt_type
+ */
+ WORD8 i1_pic_order_cnt_type;
+
+ /**
+ * log2_max_pic_order_cnt_lsb_minus4
+ */
+ WORD8 i1_log2_max_pic_order_cnt_lsb;
+
+ /**
+ * MaxPicOrderCntLsb in the standard.
+ * 1 << log2_max_pic_order_cnt_lsb_minus4
+ */
+ WORD32 i4_max_pic_order_cnt_lsb;
+
+ /**
+ * delta_pic_order_always_zero_flag
+ */
+ WORD8 i1_delta_pic_order_always_zero_flag;
+
+ /**
+ * offset_for_non_ref_pic
+ */
+ WORD32 i4_offset_for_non_ref_pic;
+
+ /**
+ * offset_for_top_to_bottom_field
+ */
+ WORD32 i4_offset_for_top_to_bottom_field;
+
+ /**
+ * num_ref_frames_in_pic_order_cnt_cycle
+ */
+ UWORD8 u1_num_ref_frames_in_pic_order_cnt_cycle;
+
+ /**
+ * Offset_for_ref_frame
+ */
+ WORD32 ai4_offset_for_ref_frame[256];
+
+ /**
+ * max_num_ref_frames
+ */
+ UWORD8 u1_max_num_ref_frames;
+
+ /**
+ * gaps_in_frame_num_value_allowed_flag
+ */
+ WORD8 i1_gaps_in_frame_num_value_allowed_flag;
+
+ /**
+ * pic_width_in_mbs_minus1
+ */
+ WORD16 i2_pic_width_in_mbs_minus1;
+
+ /**
+ * pic_height_in_map_units_minus1
+ */
+ WORD16 i2_pic_height_in_map_units_minus1;
+
+ /**
+ * frame_mbs_only_flag
+ */
+ WORD8 i1_frame_mbs_only_flag;
+
+ /**
+ * mb_adaptive_frame_field_flag
+ */
+ WORD8 i1_mb_adaptive_frame_field_flag;
+
+ /**
+ * direct_8x8_inference_flag
+ */
+ WORD8 i1_direct_8x8_inference_flag;
+
+ /**
+ * frame_cropping_flag
+ */
+ WORD8 i1_frame_cropping_flag;
+
+ /**
+ * frame_crop_left_offset
+ */
+ WORD16 i2_frame_crop_left_offset;
+
+ /**
+ * frame_crop_right_offset
+ */
+ WORD16 i2_frame_crop_right_offset;
+
+ /**
+ * frame_crop_top_offset
+ */
+ WORD16 i2_frame_crop_top_offset;
+
+ /**
+ * frame_crop_bottom_offset
+ */
+ WORD16 i2_frame_crop_bottom_offset;
+
+ /**
+ * vui_parameters_present_flag
+ */
+ WORD8 i1_vui_parameters_present_flag;
+
+ /**
+ * vui_parameters_Structure_info
+ */
+ vui_t s_vui_parameters;
+
+ /**
+ * Flag to give status of SPS structure
+ */
+ WORD8 i1_sps_valid;
+
+ /**
+ * Coded Picture width
+ */
+ WORD32 i2_pic_wd;
+
+ /**
+ * Coded Picture height
+ */
+ WORD32 i2_pic_ht;
+
+ /**
+ * Picture width in MB units
+ */
+
+ WORD16 i2_pic_wd_in_mb;
+
+ /**
+ * Picture height in MB units
+ */
+
+ WORD16 i2_pic_ht_in_mb;
+
+ /**
+ * useDefaultScalingMatrixFlag
+ */
+ WORD8 ai1_use_default_scaling_matrix_flag[8];
+
+ /**
+ * 4x4 Scaling lists after inverse zig zag scan
+ */
+ UWORD16 au2_4x4_weight_scale[6][16];
+
+ /**
+ * 8x8 Scaling lists after inverse zig zag scan
+ */
+ UWORD16 au2_8x8_weight_scale[2][64];
+
+} sps_t;
+
+
+/**
+ * Structure to hold PPS info
+ */
+typedef struct
+{
+ /**
+ * pic_parameter_set_id
+ */
+ UWORD8 u1_pps_id;
+
+ /**
+ * seq_parameter_set_id
+ */
+ UWORD8 u1_sps_id;
+
+ /**
+ * Entropy coding : 0-VLC; 1 - CABAC
+ */
+ UWORD8 u1_entropy_coding_mode_flag;
+
+ /*
+ * Pic order present flag
+ */
+ UWORD8 u1_pic_order_present_flag;
+
+ /*
+ * Number of slice groups
+ */
+ UWORD8 u1_num_slice_groups;
+
+ /*
+ * Slice group map type
+ */
+ UWORD8 u1_slice_group_map_type;
+
+ /*
+ * Maximum reference picture index in the reference list 0 : range [0 - 31]
+ */
+ WORD8 i1_num_ref_idx_l0_default_active;
+
+ /*
+ * Maximum reference picture index in the reference list 1 : range [0 - 31]
+ */
+ WORD8 i1_num_ref_idx_l1_default_active;
+
+ /**
+ * weighted_pred_flag
+ */
+ WORD8 i1_weighted_pred_flag;
+
+ /**
+ * weighted_bipred_flag
+ */
+ WORD8 i1_weighted_bipred_idc;
+
+ /**
+ * pic_init_qp_minus26
+ */
+ WORD8 i1_pic_init_qp;
+
+ /**
+ * pic_init_qs_minus26
+ */
+ WORD8 i1_pic_init_qs;
+
+ /*
+ * Chroma QP offset w.r.t QPY {-12,12}
+ */
+ WORD8 i1_chroma_qp_index_offset;
+
+ /**
+ * deblocking_filter_control_present_flag
+ */
+ WORD8 i1_deblocking_filter_control_present_flag;
+
+ /**
+ * constrained_intra_pred_flag
+ */
+ WORD8 i1_constrained_intra_pred_flag;
+
+ /**
+ * redundant_pic_cnt_present_flag
+ */
+ WORD8 i1_redundant_pic_cnt_present_flag;
+
+ /**
+ * transform_8x8_mode_flag
+ */
+ WORD8 i1_transform_8x8_mode_flag;
+
+ /**
+ * pic_scaling_matrix_present_flag
+ */
+ WORD8 i1_pic_scaling_matrix_present_flag;
+
+ /*
+ * Second chroma QP offset
+ */
+ WORD8 i1_second_chroma_qp_index_offset;
+
+
+ /**
+ * useDefaultScalingMatrixFlag
+ */
+ WORD8 ai1_use_default_scaling_matrix_flag[8];
+
+ /**
+ * 4x4 Scaling lists after inverse zig zag scan
+ */
+ UWORD16 au2_4x4_weight_scale[6][16];
+
+ /**
+ * 8x8 Scaling lists after inverse zig zag scan
+ */
+ UWORD16 au2_8x8_weight_scale[2][64];
+
+
+ /**
+ * pic_scaling_list_present_flag
+ */
+ WORD8 ai1_pic_scaling_list_present_flag[8];
+
+ /**
+ * Flag to give status of PPS structure
+ */
+ WORD8 i1_pps_valid;
+
+
+} pps_t;
+
+/**
+ * MMCO commands and params.
+ */
+typedef struct
+{
+ /* memory management control operation command */
+ UWORD8 u1_memory_management_control_operation;
+
+ /*
+ * Contains difference of pic nums of short-term pic/frame
+ * 1. To signal it as "unused for reference" if mmco = 1
+ * 2. To signal it as "used for long-term reference" if mmco = 3
+ */
+ UWORD32 u4_difference_of_pic_nums_minus1;
+
+ /* Long-term pic num to be set as "unused for reference" */
+ UWORD8 u1_long_term_pic_num;
+
+ /*
+ * Assign a long-term idx to a picture as follows
+ * 1. Assign to a short-term pic if mmco = 3
+ * 2. Assign to the current pic if mmco = 6
+ */
+ UWORD8 u1_long_term_frame_idx;
+
+ /*
+ * The max long-term idx. The long-term pics having idx above
+     * are set as "unused for reference"
+ */
+ UWORD8 u1_max_long_term_frame_idx_plus1;
+
+}mmco_prms_t;
+
+/**
+ * Structure to hold Reference picture list modification info
+ */
+typedef struct
+{
+ /* ref_pic_list_modification_flag_l0 */
+ WORD8 i1_ref_pic_list_modification_flag_l0;
+
+ /* Modification required in list0 */
+ WORD8 i1_modification_of_pic_nums_idc_l0[MAX_MODICATION_IDC];
+
+ /*
+ * The absolute difference between the picture number of
+ * the picture being moved to the current index in
+ * list0 and the picture number prediction value
+ */
+ UWORD32 u4_abs_diff_pic_num_minus1_l0[MAX_MODICATION_IDC];
+
+ /*
+ * The long-term picture number of the picture being moved
+ * to the current index in list0
+ */
+ UWORD8 u1_long_term_pic_num_l0[MAX_MODICATION_IDC];
+
+ /* ref_pic_list_modification_flag_l1 */
+ WORD8 i1_ref_pic_list_modification_flag_l1;
+
+ /* Modification required in list1 */
+ WORD8 i1_modification_of_pic_nums_idc_l1[MAX_MODICATION_IDC];
+
+ /*
+ * The absolute difference between the picture number of
+ * the picture being moved to the current index in
+ * list1 and the picture number prediction value
+ */
+ UWORD32 u4_abs_diff_pic_num_minus1_l1[MAX_MODICATION_IDC];
+
+ /*
+ * The long-term picture number of the picture being moved
+ * to the current index in list1
+ */
+ UWORD8 u1_long_term_pic_num_l1[MAX_MODICATION_IDC];
+}rplm_t;
+
+/**
+ * Structure to hold Slice Header info
+ */
+typedef struct
+{
+
+ /*
+ * nal_unit_type
+ */
+ WORD8 i1_nal_unit_type;
+
+ /*
+ * nal_unit_idc
+ */
+ WORD8 i1_nal_unit_idc;
+
+ /*
+ * first_mb_in_slice
+ */
+ UWORD16 u2_first_mb_in_slice;
+
+ /*
+ * slice_type
+ */
+ UWORD8 u1_slice_type;
+
+ /*
+ * pic_parameter_set_id
+ */
+ UWORD8 u1_pps_id;
+
+ /*
+ * frame_num
+ */
+ WORD32 i4_frame_num;
+
+ /*
+ * field_pic_flag
+ */
+ WORD8 i1_field_pic_flag;
+
+ /*
+ * bottom_field_flag
+ */
+ WORD8 i1_bottom_field_flag;
+
+ /*
+ * second_field
+ */
+ WORD8 i1_second_field_flag;
+
+ /*
+ * idr_pic_id
+ */
+ UWORD16 u2_idr_pic_id ;
+
+ /*
+ * pic_order_cnt_lsb
+ */
+ UWORD16 i4_pic_order_cnt_lsb;
+
+ /*
+ * delta_pic_order_cnt_bottom
+ */
+ WORD32 i4_delta_pic_order_cnt_bottom;
+
+ /*
+ * delta_pic_order_cnt
+ */
+ WORD32 ai4_delta_pic_order_cnt[2];
+
+ /*
+ * redundant_pic_cnt
+ */
+ UWORD8 u1_redundant_pic_cnt;
+
+ /*
+ * direct_spatial_mv_pred_flag
+ */
+ UWORD8 u1_direct_spatial_mv_pred_flag;
+
+ /*
+ * num_ref_idx_active_override_flag
+ */
+ UWORD8 u1_num_ref_idx_active_override_flag;
+
+ /*
+ * num_ref_idx_l0_active
+ */
+ WORD8 i1_num_ref_idx_l0_active;
+
+ /*
+ * num_ref_idx_l1_active_minus1
+ */
+ WORD8 i1_num_ref_idx_l1_active;
+
+ /*
+ * ref_pic_list_reordering_flag_l0
+ */
+ UWORD8 u1_ref_idx_reordering_flag_l0;
+
+ /**
+ * Reference prediction list modification
+ */
+ rplm_t s_rplm;
+
+ /**
+ * L0 Reference pic lists
+ */
+ ref_list_t as_ref_pic_list0[MAX_DPB_SIZE];
+
+ /**
+ * L1 Reference pic lists
+ */
+ ref_list_t as_ref_pic_list1[MAX_DPB_SIZE];
+
+ /*
+ * weighted_bipred_idc
+ */
+ WORD8 u1_weighted_bipred_idc;
+
+ /*
+ * no_output_of_prior_pics_flag
+ */
+ UWORD8 u1_no_output_of_prior_pics_flag;
+
+ /*
+ * long_term_reference_flag
+ */
+ UWORD8 u1_long_term_reference_flag;
+
+ /*
+ * adaptive_ref_pic_marking_mode_flag
+ */
+ UWORD8 u1_adaptive_ref_pic_marking_mode_flag;
+
+ /*
+ * Array to structures to store mmco commands
+ * and parameters.
+ */
+ mmco_prms_t as_mmco_prms[MAX_MMCO_COMMANDS];
+
+ /*
+ * entropy_coding_mode_flag
+ */
+ WORD8 u1_entropy_coding_mode_flag;
+
+ /*
+ * cabac_init_idc
+ */
+ WORD8 i1_cabac_init_idc;
+
+ /*
+ * i1_slice_qp
+ */
+ WORD8 i1_slice_qp;
+
+ /*
+ * sp_for_switch_flag
+ */
+ UWORD8 u1_sp_for_switch_flag;
+
+ /*
+ * slice_qs_delta
+ */
+ UWORD8 u1_slice_qs;
+
+ /*
+ * disable_deblocking_filter_idc
+ */
+ WORD8 u1_disable_deblocking_filter_idc;
+
+ /*
+ * slice_alpha_c0_offset_div2
+ */
+ WORD8 i1_slice_alpha_c0_offset_div2;
+
+ /*
+ * slice_beta_offset_div2
+ */
+ WORD8 i1_slice_beta_offset_div2;
+
+ /*
+ * num_slice_groups_minus1
+ */
+ WORD8 u1_num_slice_groups_minus1;
+
+ /*
+ * slice_group_change_cycle
+ */
+ WORD8 u1_slice_group_change_cycle;
+
+ /**
+ * Start MB X
+ */
+ UWORD16 i2_mb_x;
+
+ /**
+ * Start MB Y
+ */
+ UWORD16 i2_mb_y;
+
+ /**
+ * Absolute POC. Contains minimum of top and bottom POC.
+ */
+ WORD32 i4_abs_pic_order_cnt;
+
+ /**
+ * Absolute top POC. Contains top poc for frame or top
+ * field. Invalid for bottom field.
+ */
+ WORD32 i4_abs_top_pic_order_cnt;
+
+ /**
+     * Absolute bottom POC. Contains bottom poc for frame or bottom
+ * field. Invalid for top field.
+ */
+ WORD32 i4_abs_bottom_pic_order_cnt;
+
+ /** Flag signaling if the current slice is ref slice */
+ UWORD8 i1_nal_ref_idc;
+
+ /** Flag to indicate if the current slice is MBAFF Frame */
+ UWORD8 u1_mbaff_frame_flag;
+
+ /** luma_log2_weight_denom */
+ UWORD8 u1_luma_log2_weight_denom;
+
+ /** chroma_log2_weight_denom */
+ UWORD8 u1_chroma_log2_weight_denom;
+
+ /** luma_weight_l0_flag */
+ UWORD8 au1_luma_weight_l0_flag[MAX_DPB_SIZE];
+
+ /** luma_weight_l0 : (-128, 127 )is the range of weights
+ * when weighted pred is enabled, 128 is default value */
+ WORD16 ai2_luma_weight_l0[MAX_DPB_SIZE];
+
+ /** luma_offset_l0 : (-128, 127 )is the range of offset
+ * when weighted pred is enabled, 0 is default value */
+ WORD8 ai1_luma_offset_l0[MAX_DPB_SIZE];
+
+ /** chroma_weight_l0_flag */
+ UWORD8 au1_chroma_weight_l0_flag[MAX_DPB_SIZE];
+
+ /** chroma_weight_l0 : (-128, 127 )is the range of weights
+ * when weighted pred is enabled, 128 is default value*/
+ WORD16 ai2_chroma_weight_l0[MAX_DPB_SIZE][2];
+
+ /** chroma_offset_l0 : (-128, 127 )is the range of offset
+ * when weighted pred is enabled, 0 is default value*/
+ WORD8 ai1_chroma_offset_l0[MAX_DPB_SIZE][2];
+
+    /** luma_weight_l1_flag */
+ UWORD8 au1_luma_weight_l1_flag[MAX_DPB_SIZE];
+
+ /** luma_weight_l1 : (-128, 127 )is the range of weights
+ * when weighted pred is enabled, 128 is default value */
+ WORD16 ai2_luma_weight_l1[MAX_DPB_SIZE];
+
+ /** luma_offset_l1 : (-128, 127 )is the range of offset
+ * when weighted pred is enabled, 0 is default value */
+ WORD8 ai1_luma_offset_l1[MAX_DPB_SIZE];
+
+ /** chroma_weight_l1_flag */
+ UWORD8 au1_chroma_weight_l1_flag[MAX_DPB_SIZE];
+
+ /** chroma_weight_l1 : (-128, 127 )is the range of weights
+ * when weighted pred is enabled, 128 is default value */
+ WORD16 ai2_chroma_weight_l1[MAX_DPB_SIZE][2];
+
+ /** chroma_offset_l1 :(-128, 127 )is the range of offset
+ * when weighted pred is enabled, 0 is default value */
+ WORD8 ai1_chroma_offset_l1[MAX_DPB_SIZE][2];
+}slice_header_t;
+
+
+/*****************************************************************************/
+/* The following can be used to type cast coefficient data that is stored */
+/* per subblock. Note that though i2_level is shown as an array that */
+/* holds 16 coefficients, only the first few entries will be valid. Next */
+/* subblocks data starts after the valid number of coefficients. Number */
+/* of non-zero coefficients will be derived using number of non-zero bits */
+/* in sig coeff map */
+/*****************************************************************************/
+
+/**
+ * Structure to hold coefficient info for a 2x2 chroma DC transform
+ */
+typedef struct
+{
+ /**
+ * significant coefficient map
+ */
+ UWORD8 u1_sig_coeff_map;
+
+ /**
+ * sub block position
+ */
+ UWORD8 u1_subblk_pos;
+
+ /**
+ * holds coefficients
+ */
+ WORD16 ai2_level[2 * 2];
+}tu_sblk2x2_coeff_data_t;
+
+/**
+ * Structure to hold coefficient info for a 4x4 transform
+ */
+typedef struct
+{
+ /**
+ * significant coefficient map
+ */
+ UWORD16 u2_sig_coeff_map;
+
+ /**
+ * sub block position
+ */
+ UWORD16 u2_subblk_pos;
+
+ /**
+ * holds coefficients
+ */
+ WORD16 ai2_level[SUBBLK_COEFF_CNT];
+}tu_sblk4x4_coeff_data_t;
+
+/**
+ * Structure to hold coefficient info for a 8x8 transform
+ */
+typedef struct
+{
+
+ /**
+ * significant coefficient map
+ */
+ UWORD32 au4_sig_coeff_map[2];
+
+ /**
+ * sub block position
+ */
+ UWORD16 u2_subblk_pos;
+
+ /**
+ * holds coefficients
+ */
+ WORD16 ai2_level[TRANS_SIZE_8 * TRANS_SIZE_8];
+}tu_blk8x8_coeff_data_t;
+
+
+/**
+ * Structure to hold coefficient info for a 16x16 IPCM MB
+ */
+typedef struct
+{
+ /**
+ * holds coefficients
+ */
+ UWORD8 au1_level[MB_SIZE * MB_SIZE * 3 / 2];
+}tu_ipcm_coeff_data_t;
+
+
+typedef struct
+{
+ /**
+ * Transform sizes 0: 4x4, 1: 8x8,
+ */
+ UWORD32 b1_trans_size : 1;
+
+ /**
+ * Flag to signal if the current MB is IPCM
+ */
+ UWORD32 b1_ipcm : 1;
+
+ /**
+ * Intra pred sizes 0: 4x4, 1: 8x8, 2: 16x16
+ */
+ UWORD32 b2_intra_pred_size : 2;
+
+ /**
+ * Chroma intra mode
+ */
+ UWORD32 b2_intra_chroma_pred_mode: 2;
+
+ /**
+ * Number of coded subblocks in the current MB, for which
+ * tu data is sent. Maximum of 27 subblocks in the following
+ * order.
+ * 1 4x4 luma DC(for intra16x16),
+ * 16 4x4 luma,
+ * 2 2x2 chroma DC,
+ * 8 4x4 chroma,
+ */
+ WORD32 b5_num_coded_sblks: 5;
+
+ /**
+ * Flag to signal if 4x4 subblock for DC values (in INTRA 16x16 MB)
+ * is coded
+ */
+ UWORD32 b1_luma_dc_coded: 1;
+
+ /**
+ * Flag to signal if 4x4 subblock for DC values (in INTRA 16x16 MB)
+ * is coded
+ */
+ UWORD32 b1_chroma_dc_coded: 1;
+
+ /**
+ * CSBP - 16 bits, 1 bit for each 4x4
+ * for intra16x16 mb_type only ac coefficients are
+ */
+ UWORD32 b16_luma_csbp: 16;
+
+ /**
+ * CSBP - 16 bits, 1 bit for each 4x4
+ * for intra16x16 mb_type only ac coefficients are
+ */
+ UWORD32 b8_chroma_csbp: 8;
+
+ /**
+ * Luma Intra pred modes,
+ * Based on intra pred size either 16, 4 or 1 entry will be
+ * populated below.
+ */
+ UWORD8 au1_luma_intra_modes[16];
+
+}intra_mb_t;
+
+
+typedef struct
+{
+ /**
+ * Transform sizes 0: 4x4, 1: 8x8,
+ */
+ UWORD8 b1_trans_size : 1;
+
+
+ /**
+ * Skip flag
+ */
+ UWORD8 b1_skip : 1;
+
+
+ /**
+ * Number of coded subblocks in the current MB, for which
+ * tu data is sent. Maximum of 26 subblocks in the following
+ * order.
+ * 16 4x4 luma,
+ * 2 2x2 chroma DC,
+ * 8 4x4 chroma,
+ */
+ WORD32 b5_num_coded_sblks: 5;
+
+ /**
+ * CSBP - 16 bits, 1 bit for each 4x4
+ * for intra16x16 mb_type only ac coefficients are
+ */
+ UWORD32 b16_luma_csbp: 16;
+
+ /**
+ * CSBP - 16 bits, 1 bit for each 4x4
+ * for intra16x16 mb_type only ac coefficients are
+ */
+ UWORD32 b16_chroma_csbp: 8;
+}inter_mb_t;
+
+#endif /* _IH264_STRUCTS_H_ */
diff --git a/common/ih264_trans_data.c b/common/ih264_trans_data.c
new file mode 100755
index 0000000..a1231e6
--- /dev/null
+++ b/common/ih264_trans_data.c
@@ -0,0 +1,312 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_trans_data.c
+ *
+ * @brief
+ * Contains definition of global variables for H264 encoder
+ *
+ * @author
+ * Ittiam
+ *
+ * @remarks
+ *
+ *******************************************************************************
+ */
+
+#include "ih264_typedefs.h"
+#include "ih264_trans_data.h"
+
+/*****************************************************************************/
+/* Extern global definitions */
+/*****************************************************************************/
+
+/*
+ * Since we don't have a division operation in neon
+ * we will multiply by LCM of 16,6,10 and scale accordingly
+ * so care that to get the actual transform you need to divide by LCM
+ * LCM = 240
+ */
+
+const UWORD16 g_scal_coff_h264_4x4[16] ={
+ 15,40,40,40,
+ 40,24,40,24,
+ 15,40,40,15,
+ 40,24,40,24};
+
+
+
+const UWORD16 g_scal_coff_h264_8x8[16]=
+ {
+ 16, 15, 20, 15,
+ 15, 14, 19, 14,
+ 20, 19, 25, 19,
+ 15, 14, 19, 14
+ };
+/*
+ * The scaling is by an 8x8 matrix, but due its 4x4 symmetry we can use
+ * a 4x4 matrix for scaling
+ * now since divide is to be avoided, we will compute 1/ values and scale them up
+ * to preserve information. Since our data is max 10 bits + 1 sign bit, we can shift up a maximum of 21 bits;
+ * hence multiply the matrix as such
+{16.000 15.059 20.227 15.059
+15.059 14.173 19.051 14.173
+20.227 19.051 25.600 19.051
+15.059 14.173 19.051 14.173};
+{512, 544, 405, 544,
+544, 578, 430, 578,
+405, 430, 320, 430,
+544, 578, 430, 578};*/
+
+
+/**
+ ******************************************************************************
+ * @brief Scale Table for quantizing 4x4 subblock. To quantize a given 4x4 DCT
+ * transformed block, the coefficient at index location (i,j) is scaled by one of
+ * the constants in this table and right shift the result by (QP_BITS_h264_4x4 +
+ * floor(qp/6)), here qp is the quantization parameter used to quantize the mb.
+ *
+ * input : qp%6, index location (i,j)
+ * output : scale constant.
+ *
+ * @remarks 16 constants for each index position of the subblock and 6 for each
+ * qp%6 in the range 0-5 inclusive.
+ ******************************************************************************
+ */
+const UWORD16 gu2_quant_scale_matrix_4x4[96] =
+{
+ 13107, 8066, 13107, 8066,
+ 8066, 5243, 8066, 5243,
+ 13107, 8066, 13107, 8066,
+ 8066, 5243, 8066, 5243,
+
+ 11916, 7490, 11916, 7490,
+ 7490, 4660, 7490, 4660,
+ 11916, 7490, 11916, 7490,
+ 7490, 4660, 7490, 4660,
+
+ 10082, 6554, 10082, 6554,
+ 6554, 4194, 6554, 4194,
+ 10082, 6554, 10082, 6554,
+ 6554, 4194, 6554, 4194,
+
+ 9362, 5825, 9362, 5825,
+ 5825, 3647, 5825, 3647,
+ 9362, 5825, 9362, 5825,
+ 5825, 3647, 5825, 3647,
+
+ 8192, 5243, 8192, 5243,
+ 5243, 3355, 5243, 3355,
+ 8192, 5243, 8192, 5243,
+ 5243, 3355, 5243, 3355,
+
+ 7282, 4559, 7282, 4559,
+ 4559, 2893, 4559, 2893,
+ 7282, 4559, 7282, 4559,
+ 4559, 2893, 4559, 2893,
+
+};
+
+/**
+ ******************************************************************************
+ * @brief Round Factor for quantizing subblock. While quantizing a given 4x4 DCT
+ * transformed block, the coefficient at index location (i,j) is scaled by one of
+ * the constants in the table gu2_forward_quant_scalar_4x4 and then right shift
+ * the result by (QP_BITS_h264_4x4 + floor(qp/6)).
+ * Before right shifting a round factor is added.
+ * The round factor can be any value [a * (1 << (QP_BITS_h264_4x4 + floor(qp/6)))]
+ * for 'a' lies in the range 0-0.5.
+ * Here qp is the quantization parameter used to quantize the mb.
+ *
+ * input : qp/6
+ * output : round factor.
+ *
+ * @remarks The round factor is constructed by setting a = 1/3
+ *
+ * round factor constructed by setting a = 1/3
+ {
+ 10922, 21845, 43690, 87381,
+ 174762, 349525, 699050, 1398101,
+ 2796202,
+ }
+ *
+ * round factor constructed by setting a = 0.49
+ *{
+ 16056, 32112, 64225,
+ 128450, 256901, 513802,
+ 1027604, 2055208, 4110417,
+ };
+
+ * round factor constructed by setting a = 0.5
+ 16384, 32768, 65536,
+ 131072, 262144, 524288,
+ 1048576, 2097152, 4194304,
+
+ ******************************************************************************
+ */
+const UWORD32 gu4_forward_quant_round_factor_4x4[9] =
+{
+ 10922, 21845, 43690, 87381,
+ 174762, 349525, 699050, 1398101,
+ 2796202,
+};
+
+
+
+/**
+ ******************************************************************************
+ * @brief Threshold Table. Quantizing the given DCT coefficient is done only if
+ * it exceeds the threshold value presented in this table.
+ *
+ * input : qp/6, qp%6, index location (i,j)
+ * output : Threshold constant.
+ *
+ * @remarks 16 constants for each index position of the subblock and 6 for each
+ * qp%6 in the range 0-5 inclusive and 9 for each qp/6 in the range 0-51.
+ ******************************************************************************
+ */
+const UWORD16 gu2_forward_quant_threshold_4x4[96] =
+{
+ 426, 693, 426, 693,
+ 693, 1066, 693, 1066,
+ 426, 693, 426, 693,
+ 693, 1066, 693, 1066,
+
+ 469, 746, 469, 746,
+ 746, 1200, 746, 1200,
+ 469, 746, 469, 746,
+ 746, 1200, 746, 1200,
+
+ 554, 853, 554, 853,
+ 853, 1333, 853, 1333,
+ 554, 853, 554, 853,
+ 853, 1333, 853, 1333,
+
+ 597, 960, 597, 960,
+ 960, 1533, 960, 1533,
+ 597, 960, 597, 960,
+ 960, 1533, 960, 1533,
+
+ 682, 1066, 682, 1066,
+ 1066, 1666, 1066, 1666,
+ 682, 1066, 682, 1066,
+ 1066, 1666, 1066, 1666,
+
+ 767, 1226, 767, 1226,
+ 1226, 1933, 1226, 1933,
+ 767, 1226, 767, 1226,
+ 1226, 1933, 1226, 1933,
+};
+
+/**
+ ******************************************************************************
+ * @brief Scale Table for quantizing 8x8 subblock. To quantize a given 8x8 DCT
+ * transformed block, the coefficient at index location (i,j) is scaled by one of
+ * the constants in this table and right shift the result by (QP_BITS_h264_8x8 +
+ * floor(qp/6)), here qp is the quantization parameter used to quantize the mb.
+ *
+ * input : qp%6, index location (i,j)
+ * output : scale constant.
+ *
+ * @remarks 64 constants for each index position of the subblock and 6 for each
+ * qp%6 in the range 0-5 inclusive.
+ ******************************************************************************
+ */
+const UWORD16 gu2_quant_scale_matrix_8x8 [384] =
+{
+ 13107, 12222, 16777, 12222, 13107, 12222, 16777, 12222,
+ 12222, 11428, 15481, 11428, 12222, 11428, 15481, 11428,
+ 16777, 15481, 20972, 15481, 16777, 15481, 20972, 15481,
+ 12222, 11428, 15481, 11428, 12222, 11428, 15481, 11428,
+ 13107, 12222, 16777, 12222, 13107, 12222, 16777, 12222,
+ 12222, 11428, 15481, 11428, 12222, 11428, 15481, 11428,
+ 16777, 15481, 20972, 15481, 16777, 15481, 20972, 15481,
+ 12222, 11428, 15481, 11428, 12222, 11428, 15481, 11428,
+
+ 11916, 11058, 14980, 11058, 11916, 11058, 14980, 11058,
+ 11058, 10826, 14290, 10826, 11058, 10826, 14290, 10826,
+ 14980, 14290, 19174, 14290, 14980, 14290, 19174, 14290,
+ 11058, 10826, 14290, 10826, 11058, 10826, 14290, 10826,
+ 11916, 11058, 14980, 11058, 11916, 11058, 14980, 11058,
+ 11058, 10826, 14290, 10826, 11058, 10826, 14290, 10826,
+ 14980, 14290, 19174, 14290, 14980, 14290, 19174, 14290,
+ 11058, 10826, 14290, 10826, 11058, 10826, 14290, 10826,
+
+ 10082, 9675, 12710, 9675, 10082, 9675, 12710, 9675,
+ 9675, 8943, 11985, 8943, 9675, 8943, 11985, 8943,
+ 12710, 11985, 15978, 11985, 12710, 11985, 15978, 11985,
+ 9675, 8943, 11985, 8943, 9675, 8943, 11985, 8943,
+ 10082, 9675, 12710, 9675, 10082, 9675, 12710, 9675,
+ 9675, 8943, 11985, 8943, 9675, 8943, 11985, 8943,
+ 12710, 11985, 15978, 11985, 12710, 11985, 15978, 11985,
+ 9675, 8943, 11985, 8943, 9675, 8943, 11985, 8943,
+
+ 9362, 8931, 11984, 8931, 9362, 8931, 11984, 8931,
+ 8931, 8228, 11259, 8228, 8931, 8228, 11259, 8228,
+ 11984, 11259, 14913, 11259, 11984, 11259, 14913, 11259,
+ 8931, 8228, 11259, 8228, 8931, 8228, 11259, 8228,
+ 9362, 8931, 11984, 8931, 9362, 8931, 11984, 8931,
+ 8931, 8228, 11259, 8228, 8931, 8228, 11259, 8228,
+ 11984, 11259, 14913, 11259, 11984, 11259, 14913, 11259,
+ 8931, 8228, 11259, 8228, 8931, 8228, 11259, 8228,
+
+ 8192, 7740, 10486, 7740, 8192, 7740, 10486, 7740,
+ 7740, 7346, 9777, 7346, 7740, 7346, 9777, 7346,
+ 10486, 9777, 13159, 9777, 10486, 9777, 13159, 9777,
+ 7740, 7346, 9777, 7346, 7740, 7346, 9777, 7346,
+ 8192, 7740, 10486, 7740, 8192, 7740, 10486, 7740,
+ 7740, 7346, 9777, 7346, 7740, 7346, 9777, 7346,
+ 10486, 9777, 13159, 9777, 10486, 9777, 13159, 9777,
+ 7740, 7346, 9777, 7346, 7740, 7346, 9777, 7346,
+
+ 7282, 6830, 9118, 6830, 7282, 6830, 9118, 6830,
+ 6830, 6428, 8640, 6428, 6830, 6428, 8640, 6428,
+ 9118, 8640, 11570, 8640, 9118, 8640, 11570, 8640,
+ 6830, 6428, 8640, 6428, 6830, 6428, 8640, 6428,
+ 7282, 6830, 9118, 6830, 7282, 6830, 9118, 6830,
+ 6830, 6428, 8640, 6428, 6830, 6428, 8640, 6428,
+ 9118, 8640, 11570, 8640, 9118, 8640, 11570, 8640,
+ 6830, 6428, 8640, 6428, 6830, 6428, 8640, 6428,
+
+};
+
+
+/**
+ ******************************************************************************
+ * @brief Specification of QPc as a function of qPi
+ *
+ * input : qp luma
+ * output : qp chroma.
+ *
+ * @remarks Refer Table 8-15 of h264 specification.
+ ******************************************************************************
+ */
+const UWORD8 gu1_qpc_fqpi[52] =
+{
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 29, 30,
+ 31, 32, 32, 33, 34, 34, 35, 35,
+ 36, 36, 37, 37, 37, 38, 38, 38,
+ 39, 39, 39, 39,
+};
diff --git a/common/ih264_trans_data.h b/common/ih264_trans_data.h
new file mode 100755
index 0000000..dc77ae7
--- /dev/null
+++ b/common/ih264_trans_data.h
@@ -0,0 +1,125 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_trans_data.h
+ *
+ * @brief
+ *  Contains declarations of global variables for H264 transform, quant and inverse quant
+ *
+ * @author
+ * Ittiam
+ *
+ * @remarks
+ *
+ *******************************************************************************
+ */
+#ifndef IH264_GLOBAL_DATA_H_
+#define IH264_GLOBAL_DATA_H_
+
+/*****************************************************************************/
+/* Extern global declarations */
+/*****************************************************************************/
+
+/* Scaling matrices for h264 quantization */
+extern const UWORD16 g_scal_coff_h264_4x4[16];
+extern const UWORD16 g_scal_coff_h264_8x8[16];
+
+
+/**
+ ******************************************************************************
+ * @brief Scale Table for quantizing 4x4 subblock. To quantize a given 4x4 DCT
+ * transformed block, the coefficient at index location (i,j) is scaled by one of
+ * the constants in this table and right shift the result by (QP_BITS_h264_4x4 +
+ * floor(qp/6)), here qp is the quantization parameter used to quantize the mb.
+ *
+ * input : qp%6, index location (i,j)
+ * output : scale constant.
+ *
+ * @remarks 16 constants for each index position of the subblock and 6 for each
+ * qp%6 in the range 0-5 inclusive.
+ ******************************************************************************
+ */
+extern const UWORD16 gu2_quant_scale_matrix_4x4[96];
+
+/**
+ ******************************************************************************
+ * @brief Round Factor for quantizing subblock. While quantizing a given 4x4 DCT
+ * transformed block, the coefficient at index location (i,j) is scaled by one of
+ * the constants in the table gu2_forward_quant_scalar_4x4 and then right shift
+ * the result by (QP_BITS_h264_4x4 + floor(qp/6)).
+ * Before right shifting a round factor is added.
+ * The round factor can be any value [a * (1 << (QP_BITS_h264_4x4 + floor(qp/6)))]
+ * for 'a' lies in the range 0-0.5.
+ * Here qp is the quantization parameter used to quantize the mb.
+ *
+ * input : qp/6
+ * output : round factor.
+ *
+ * @remarks The round factor is constructed by setting a = 1/3
+ ******************************************************************************
+ */
+extern const UWORD32 gu4_forward_quant_round_factor_4x4[9];
+
+/**
+ ******************************************************************************
+ * @brief Threshold Table. Quantizing the given DCT coefficient is done only if
+ * it exceeds the threshold value presented in this table.
+ *
+ * input : qp/6, qp%6, index location (i,j)
+ * output : Threshold constant.
+ *
+ * @remarks 16 constants for each index position of the subblock and 6 for each
+ * qp%6 in the range 0-5 inclusive and 9 for each qp/6 in the range 0-51.
+ ******************************************************************************
+ */
+extern const UWORD16 gu2_forward_quant_threshold_4x4[96];
+
+/**
+ ******************************************************************************
+ * @brief Scale Table for quantizing 8x8 subblock. To quantize a given 8x8 DCT
+ * transformed block, the coefficient at index location (i,j) is scaled by one of
+ * the constants in this table and right shift the result by (QP_BITS_h264_8x8 +
+ * floor(qp/6)), here qp is the quantization parameter used to quantize the mb.
+ *
+ * input : qp%6, index location (i,j)
+ * output : scale constant.
+ *
+ * @remarks 64 constants for each index position of the subblock and 6 for each
+ * qp%6 in the range 0-5 inclusive.
+ ******************************************************************************
+ */
+extern const UWORD16 gu2_quant_scale_matrix_8x8 [384];
+
+/**
+ ******************************************************************************
+ * @brief Specification of QPc as a function of qPi
+ *
+ * input : qp luma
+ * output : qp chroma.
+ *
+ * @remarks Refer Table 8-15 of h264 specification.
+ ******************************************************************************
+ */
+extern const UWORD8 gu1_qpc_fqpi[52];
+
+
+#endif /* IH264_GLOBAL_DATA_H_ */
diff --git a/common/ih264_trans_macros.h b/common/ih264_trans_macros.h
new file mode 100755
index 0000000..f114d0e
--- /dev/null
+++ b/common/ih264_trans_macros.h
@@ -0,0 +1,124 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_trans_macros.h
+*
+* @brief
+* The file contains definitions of macros that perform forward and inverse
+* quantization
+*
+* @author
+* Ittiam
+*
+* @remark
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef IH264_TRANS_MACROS_H_
+#define IH264_TRANS_MACROS_H_
+
+/*****************************************************************************/
+/* Function Macros */
+/*****************************************************************************/
+
+/**
+******************************************************************************
+ * @brief Macro to perform forward quantization.
+ * @description The value to be quantized is first compared with a threshold.
+ * If the value is less than the threshold, the quantization value is returned
+ * as zero else the value is quantized traditionally as per the rules of
+ * h264 specification
+******************************************************************************
+ */
+#define FWD_QUANT(i4_value, u4_abs_value, i4_sign, threshold, scale, rndfactor, qbits, u4_nnz) \
+ {\
+ if (i4_value < 0)\
+ {\
+ u4_abs_value = -i4_value;\
+ i4_sign = -1;\
+ }\
+ else\
+ {\
+ u4_abs_value = i4_value;\
+ i4_sign = 1;\
+ }\
+ if (u4_abs_value < threshold)\
+ {\
+ i4_value = 0;\
+ }\
+ else\
+ {\
+ u4_abs_value *= scale;\
+ u4_abs_value += rndfactor;\
+ u4_abs_value >>= qbits;\
+ i4_value = u4_abs_value * i4_sign;\
+ if (i4_value)\
+ {\
+ u4_nnz++;\
+ }\
+ }\
+ }
+
+/**
+******************************************************************************
+ * @brief Macro to perform inverse quantization.
+ * @remarks The value can also be de-quantized as
+ * if (u4_qp_div_6 < 4)
+ * {
+ * i4_value = (quant_scale * weight_scale * i4_value + (1 << (3-u4_qp_div_6)))
+ * i4_value >>= (4 - u4_qp_div_6)
+ * }
+ * else
+ * {
+ * i4_value = (quant_scale * weight_scale * i4_value) << (u4_qp_div_6 -4)
+ * }
+******************************************************************************
+ */
+#define INV_QUANT(i4_value, quant_scale, weight_scale, u4_qp_div_6, rndfactor, qbits)\
+ {\
+ i4_value *= quant_scale;\
+ i4_value *= weight_scale;\
+ i4_value += rndfactor;\
+ i4_value <<= u4_qp_div_6;\
+ i4_value >>= qbits;\
+ }
+
+#define QUANT_H264(x,y,w,z,shft) (shft = ABS(x),\
+ shft *= y,\
+ shft += z,\
+ shft = shft>>w,\
+ shft = SIGNXY(shft,x))
+
+#define IQUANT_H264(x,y,wscal,w,shft) (shft = x, \
+ shft *=y, \
+ shft *=wscal, \
+ shft = shft<<w)
+
+#define IQUANT_lev_H264(x,y,wscal,add_f,w,shft) (shft = x, \
+ shft *=y, \
+ shft *=wscal, \
+ shft+= add_f, \
+ shft = shft>>w)
+
+#endif /* IH264_TRANS_MACROS_H_ */
diff --git a/common/ih264_trans_quant_itrans_iquant.h b/common/ih264_trans_quant_itrans_iquant.h
new file mode 100755
index 0000000..83551aa
--- /dev/null
+++ b/common/ih264_trans_quant_itrans_iquant.h
@@ -0,0 +1,232 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_trans_quant_itrans_iquant.h
+ *
+ * @brief
+ * Contains declarations for forward and inverse transform paths for H264
+ *
+ * @author
+ * Ittiam
+ *
+ * @remarks
+ *
+ *******************************************************************************
+ */
+
+#ifndef IH264_TRANS_QUANT_H_
+#define IH264_TRANS_QUANT_H_
+
+/*****************************************************************************/
+/* Extern Function Declarations */
+/*****************************************************************************/
+
+
+/* Function-pointer signature: forward residue computation + transform +
+ * DC transform + quantization; per-architecture implementations are
+ * declared with this signature further below. */
+typedef void ih264_resi_trans_dctrans_quant_ft(UWORD8*pu1_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_out,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ const UWORD16 *pu2_scale_mat,
+ const UWORD16 *pu2_thresh_mat,
+ UWORD32 u4_qbit,
+ UWORD32 u4_round_fact,
+ UWORD8 *pu1_nnz);
+
+/* Function-pointer signature: inverse DC transform + inverse quantization +
+ * inverse transform + reconstruction into pu1_out. */
+typedef void ih264_idctrans_iquant_itrans_recon_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_out,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 out_strd,
+ const UWORD16 *pu2_iscale_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 qp_div,
+ UWORD32 pi4_cntrl,
+ WORD32 *pi4_tmp);
+
+
+/*Function prototype declarations*/
+/* Forward residue transform + quantization of one block. */
+typedef void ih264_resi_trans_quant_ft(UWORD8*pu1_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_out,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ const UWORD16 *pu2_scale_mat,
+ const UWORD16 *pu2_thresh_mat,
+ UWORD32 u4_qbit,
+ UWORD32 u4_round_fact,
+ UWORD8 *pu1_nnz,
+ WORD16 *pi2_alt_dc_addr);
+
+/* Forward path for a whole 16x16 luma macroblock (optionally with DC
+ * transform, controlled by u4_dc_flag). */
+typedef void ih264_luma_16x16_resi_trans_dctrans_quant_ft(UWORD8 *pu1_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_out,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ const UWORD16 *pu2_scale_matrix,
+ const UWORD16 *pu2_threshold_matrix,
+ UWORD32 u4_qbits,
+ UWORD32 u4_round_factor,
+ UWORD8 *pu1_nnz,
+ UWORD32 u4_dc_flag);
+
+/* Forward path for an 8x8 chroma block. */
+typedef void ih264_chroma_8x8_resi_trans_dctrans_quant_ft(UWORD8 *pu1_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_out,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ const UWORD16 *pu2_scale_matrix,
+ const UWORD16 *pu2_threshold_matrix,
+ UWORD32 u4_qbits,
+ UWORD32 u4_round_factor,
+ UWORD8 *pu1_nnz);
+
+/* Inverse quantization + inverse transform + reconstruction of one block. */
+typedef void ih264_iquant_itrans_recon_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_out,
+ WORD32 pred_strd,
+ WORD32 out_strd,
+ const UWORD16 *pu2_iscale_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 qp_div,
+ WORD16 *pi2_tmp,
+ WORD32 iq_start_idx,
+ WORD16 *pi2_dc_ld_addr);
+
+
+/* Chroma variant: DC level is read separately from pi2_dc_src. */
+typedef void ih264_iquant_itrans_recon_chroma_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_out,
+ WORD32 pred_strd,
+ WORD32 out_strd,
+ const UWORD16 *pu2_iscal_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 u4_qp_div_6,
+ WORD16 *pi2_tmp,
+ WORD16 *pi2_dc_src);
+
+
+/* Full inverse path for a 16x16 luma macroblock. */
+typedef void ih264_luma_16x16_idctrans_iquant_itrans_recon_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_out,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 out_strd,
+ const UWORD16 *pu2_iscale_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 qp_div,
+ UWORD32 pi4_cntrl,
+ UWORD32 u4_dc_trans_flag,
+ WORD32 *pi4_tmp);
+
+/* Full inverse path for an 8x8 chroma block. */
+typedef void ih264_chroma_8x8_idctrans_iquant_itrans_recon_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_out,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 out_strd,
+ const UWORD16 *pu2_iscale_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 qp_div,
+ UWORD32 pi4_cntrl,
+ WORD32 *pi4_tmp);
+
+/* Inverse Hadamard transform + scaling of the DC coefficient block. */
+typedef void ih264_ihadamard_scaling_ft(WORD16* pi2_src,
+ WORD16* pi2_out,
+ const UWORD16 *pu2_iscal_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 u4_qp_div_6,
+ WORD32* pi4_tmp);
+
+/* Forward Hadamard transform + quantization of the DC coefficient block. */
+typedef void ih264_hadamard_quant_ft(WORD16 *pi2_src, WORD16 *pi2_dst,
+ const UWORD16 *pu2_scale_matrix,
+ const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
+ const UWORD32 u4_round_factor,UWORD8 *pu1_nnz);
+
+/* Plain C reference declarations */
+ih264_resi_trans_quant_ft ih264_resi_trans_quant_4x4;
+ih264_resi_trans_quant_ft ih264_resi_trans_quant_chroma_4x4;
+ih264_resi_trans_quant_ft ih264_resi_trans_quant_8x8;
+ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4;
+ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_8x8;
+ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4_dc;
+ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_8x8_dc;
+ih264_iquant_itrans_recon_chroma_ft ih264_iquant_itrans_recon_chroma_4x4;
+ih264_iquant_itrans_recon_chroma_ft ih264_iquant_itrans_recon_chroma_4x4_dc;
+ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_4x4;
+ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_2x2_uv;
+ih264_hadamard_quant_ft ih264_hadamard_quant_4x4;
+ih264_hadamard_quant_ft ih264_hadamard_quant_2x2_uv;
+
+/*A9 Declarations*/
+ih264_resi_trans_quant_ft ih264_resi_trans_quant_4x4_a9;
+ih264_resi_trans_quant_ft ih264_resi_trans_quant_chroma_4x4_a9;
+ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4_a9;
+ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_8x8_a9;
+ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4_dc_a9;
+ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_8x8_dc_a9;
+ih264_iquant_itrans_recon_chroma_ft ih264_iquant_itrans_recon_chroma_4x4_a9;
+ih264_iquant_itrans_recon_chroma_ft ih264_iquant_itrans_recon_chroma_4x4_dc_a9;
+ih264_luma_16x16_resi_trans_dctrans_quant_ft ih264_luma_16x16_resi_trans_dctrans_quant_a9;
+ih264_chroma_8x8_resi_trans_dctrans_quant_ft ih264_chroma_8x8_resi_trans_dctrans_quant_a9;
+ih264_luma_16x16_idctrans_iquant_itrans_recon_ft ih264_luma_16x16_idctrans_iquant_itrans_recon_a9;
+ih264_chroma_8x8_idctrans_iquant_itrans_recon_ft ih264_chroma_8x8_idctrans_iquant_itrans_recon_a9;
+ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_4x4_a9;
+ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_2x2_uv_a9;
+ih264_hadamard_quant_ft ih264_hadamard_quant_4x4_a9;
+ih264_hadamard_quant_ft ih264_hadamard_quant_2x2_uv_a9;
+
+/*Av8 Declarations*/
+ih264_resi_trans_quant_ft ih264_resi_trans_quant_4x4_av8;
+ih264_resi_trans_quant_ft ih264_resi_trans_quant_chroma_4x4_av8;
+ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4_av8;
+ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_8x8_av8;
+ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4_dc_av8;
+ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_8x8_dc_av8;
+ih264_iquant_itrans_recon_chroma_ft ih264_iquant_itrans_recon_chroma_4x4_av8;
+ih264_iquant_itrans_recon_chroma_ft ih264_iquant_itrans_recon_chroma_4x4_dc_av8;
+ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_4x4_av8;
+ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_2x2_uv_av8;
+ih264_hadamard_quant_ft ih264_hadamard_quant_4x4_av8;
+ih264_hadamard_quant_ft ih264_hadamard_quant_2x2_uv_av8;
+
+/*SSSE3 Declarations*/
+ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4_ssse3;
+ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_8x8_ssse3;
+ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4_dc_ssse3;
+ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_8x8_dc_ssse3;
+ih264_iquant_itrans_recon_chroma_ft ih264_iquant_itrans_recon_chroma_4x4_dc_ssse3;
+ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_4x4_ssse3;
+ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_2x2_uv_ssse3;
+/*SSE42 Declarations*/
+ih264_resi_trans_quant_ft ih264_resi_trans_quant_4x4_sse42;
+ih264_resi_trans_quant_ft ih264_resi_trans_quant_chroma_4x4_sse42;
+ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4_sse42;
+ih264_iquant_itrans_recon_chroma_ft ih264_iquant_itrans_recon_chroma_4x4_sse42;
+ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_4x4_sse42;
+ih264_hadamard_quant_ft ih264_hadamard_quant_4x4_sse42;
+ih264_hadamard_quant_ft ih264_hadamard_quant_2x2_uv_sse42;
+
+#endif /* IH264_TRANS_QUANT_H_ */
diff --git a/common/ih264_typedefs.h b/common/ih264_typedefs.h
new file mode 100755
index 0000000..8e4685a
--- /dev/null
+++ b/common/ih264_typedefs.h
@@ -0,0 +1,64 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_typedefs.h
+*
+* @brief
+* Type definitions used in the code
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IH264_TYPEDEFS_H_
+#define _IH264_TYPEDEFS_H_
+
+
+/*****************************************************************************/
+/* Unsigned data types */
+/*****************************************************************************/
+/* NOTE(review): these widths assume char=8, short=16, int=32 and
+ * long long=64 bits on every target ABI; <stdint.h> fixed-width types
+ * would make the assumption explicit — confirm against supported ports. */
+typedef unsigned char UWORD8;
+typedef unsigned short UWORD16;
+typedef unsigned int UWORD32;
+typedef unsigned long long UWORD64;
+
+
+/*****************************************************************************/
+/* Signed data types */
+/*****************************************************************************/
+typedef signed char WORD8;
+typedef short WORD16;
+typedef int WORD32;
+
+
+/*****************************************************************************/
+/* Miscellaneous data types */
+/*****************************************************************************/
+typedef char CHAR;
+typedef double DOUBLE;
+
+#endif /* _IH264_TYPEDEFS_H_ */
diff --git a/common/ih264_weighted_pred.c b/common/ih264_weighted_pred.c
new file mode 100755
index 0000000..d5d73f2
--- /dev/null
+++ b/common/ih264_weighted_pred.c
@@ -0,0 +1,495 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/*****************************************************************************/
+/* */
+/* File Name : ih264_weighted_pred.c */
+/* */
+/* Description : Contains function definitions for weighted */
+/* prediction functions */
+/* */
+/* List of Functions : ih264_default_weighted_pred_luma() */
+/* ih264_default_weighted_pred_chroma() */
+/* ih264_weighted_pred_luma() */
+/* ih264_weighted_pred_chroma() */
+/* ih264_weighted_bipred_luma() */
+/* ih264_weighted_bipred_chroma() */
+/* */
+/* Issues / Problems : None */
+/* */
+/* Revision History : */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 01 2015 Kaushik Initial version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* User include files */
+#include "ih264_typedefs.h"
+#include "ih264_macros.h"
+#include "ih264_platform_macros.h"
+#include "ih264_weighted_pred.h"
+
+/*****************************************************************************/
+/* Function definitions . */
+/*****************************************************************************/
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_default_weighted_pred_luma */
+/* */
+/* Description : This function performs the default weighted prediction */
+/* as described in sec 8.4.2.3.1 titled "Default weighted */
+/* sample prediction process" for luma. The function gets */
+/* two ht x wd blocks, calculates their rounded-average and */
+/* stores it in the destination block. (ht,wd) can be */
+/* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */
+/* */
+/* Inputs : puc_src1 - Pointer to source 1 */
+/* puc_src2 - Pointer to source 2 */
+/* puc_dst - Pointer to destination */
+/* src_strd1 - stride for source 1 */
+/* src_strd2 - stride for source 2 */
+/* dst_strd - stride for destination */
+/* ht - height of the block */
+/* wd - width of the block */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 01 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_default_weighted_pred_luma(UWORD8 *pu1_src1,
+ UWORD8 *pu1_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 i, j;
+
+ /* Convert the full strides into end-of-row increments (stride - wd),
+ * since the inner loop already advances the pointers by wd bytes. */
+ src_strd1 -= wd;
+ src_strd2 -= wd;
+ dst_strd -= wd;
+
+ for(i = 0; i < ht; i++)
+ {
+ /* Rounded average of the two predictors: (a + b + 1) >> 1. */
+ for(j = 0; j < wd; j++, pu1_src1++, pu1_src2++, pu1_dst++)
+ *pu1_dst = (*pu1_src1 + *pu1_src2 + 1) >> 1;
+
+ pu1_src1 += src_strd1;
+ pu1_src2 += src_strd2;
+ pu1_dst += dst_strd;
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_default_weighted_pred_chroma */
+/* */
+/* Description : This function performs the default weighted prediction */
+/* as described in sec 8.4.2.3.1 titled "Default weighted */
+/* sample prediction process" for chroma. The function gets */
+/* two ht x wd blocks, calculates their rounded-average and */
+/* stores it in the destination block. (ht,wd) can be */
+/* (2,2), (4,2) , (2,4), (4,4), (8,4), (4,8) or (8,8). */
+/* */
+/* Inputs : puc_src1 - Pointer to source 1 */
+/* puc_src2 - Pointer to source 2 */
+/* puc_dst - Pointer to destination */
+/* src_strd1 - stride for source 1 */
+/* src_strd2 - stride for source 2 */
+/* dst_strd - stride for destination */
+/* ht - height of the block */
+/* wd - width of the block */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 01 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_default_weighted_pred_chroma(UWORD8 *pu1_src1,
+ UWORD8 *pu1_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 i, j;
+
+ /* wd is given in chroma pixels; the buffers hold interleaved U/V
+ * (two bytes per chroma pixel), so double it for byte addressing. */
+ wd = wd << 1;
+
+ /* Convert strides into end-of-row increments (see luma variant). */
+ src_strd1 -= wd;
+ src_strd2 -= wd;
+ dst_strd -= wd;
+
+ for(i = 0; i < ht; i++)
+ {
+ /* Rounded average of the two predictors: (a + b + 1) >> 1. */
+ for(j = 0; j < wd; j++, pu1_src1++, pu1_src2++, pu1_dst++)
+ *pu1_dst = (*pu1_src1 + *pu1_src2 + 1) >> 1;
+
+ pu1_src1 += src_strd1;
+ pu1_src2 += src_strd2;
+ pu1_dst += dst_strd;
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_weighted_pred_luma */
+/* */
+/* Description : This function performs the weighted prediction as */
+/* described in sec 8.4.2.3.2 titled "Weighted sample */
+/* prediction process" for luma. The function gets one */
+/* ht x wd block, weights it, rounds it off, offsets it, */
+/* saturates it to unsigned 8-bit and stores it in the */
+/* destination block. (ht,wd) can be (4,4), (8,4), (4,8), */
+/* (8,8), (16,8), (8,16) or (16,16). */
+/* */
+/* Inputs : puc_src - Pointer to source */
+/* puc_dst - Pointer to destination */
+/* src_strd - stride for source */
+/* dst_strd - stride for destination */
+/* log_wd - number of bits to be rounded off */
+/* wt - weight value */
+/* ofst - offset value */
+/* ht - height of the block */
+/* wd - width of the block */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 01 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_weighted_pred_luma(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 log_wd,
+ WORD32 wt,
+ WORD32 ofst,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 i, j;
+
+ /* The weight and offset arrive packed in the low bits; sign-extend
+ * them to 16-bit and 8-bit signed values respectively. */
+ wt = (WORD16)(wt & 0xffff);
+ ofst = (WORD8)(ofst & 0xff);
+
+ /* Convert strides into end-of-row increments. */
+ src_strd -= wd;
+ dst_strd -= wd;
+
+ if(log_wd >= 1)
+ {
+ /* Fold the rounding term 2^(log_wd-1) and the offset (pre-scaled
+ * so the >> log_wd leaves it intact) into a single constant. */
+ WORD32 i_ofst = (1 << (log_wd - 1)) + (ofst << log_wd);
+ for(i = 0; i < ht; i++)
+ {
+ for(j = 0; j < wd; j++, pu1_src++, pu1_dst++)
+ *pu1_dst = CLIP_U8((wt * (*pu1_src) + i_ofst) >> log_wd);
+
+ pu1_src += src_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ else
+ {
+ /* log_wd == 0: no rounding shift, plain weight-and-offset. */
+ for(i = 0; i < ht; i++)
+ {
+ for(j = 0; j < wd; j++, pu1_src++, pu1_dst++)
+ *pu1_dst = CLIP_U8(wt * (*pu1_src) + ofst);
+
+ pu1_src += src_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_weighted_pred_chroma */
+/* */
+/* Description : This function performs the weighted prediction as */
+/* described in sec 8.4.2.3.2 titled "Weighted sample */
+/* prediction process" for chroma. The function gets one */
+/* ht x wd block, weights it, rounds it off, offsets it, */
+/* saturates it to unsigned 8-bit and stores it in the */
+/* destination block. (ht,wd) can be (2,2), (4,2), (2,4), */
+/* (4,4), (8,4), (4,8) or (8,8). */
+/* */
+/* Inputs : puc_src - Pointer to source */
+/* puc_dst - Pointer to destination */
+/* src_strd - stride for source */
+/* dst_strd - stride for destination */
+/* log_wd - number of bits to be rounded off */
+/* wt - weight values for u and v */
+/* ofst - offset values for u and v */
+/* ht - height of the block */
+/* wd - width of the block */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 01 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_weighted_pred_chroma(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 log_wd,
+ WORD32 wt,
+ WORD32 ofst,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 i, j;
+ WORD32 wt_u, wt_v;
+ WORD32 ofst_u, ofst_v;
+
+ /* Unpack the per-component weights (16 bits each in wt) and
+ * offsets (8 bits each in ofst), sign-extending both. */
+ wt_u = (WORD16)(wt & 0xffff);
+ wt_v = (WORD16)(wt >> 16);
+
+ ofst_u = (WORD8)(ofst & 0xff);
+ ofst_v = (WORD8)(ofst >> 8);
+
+ /* Interleaved U/V: each chroma pixel is 2 bytes wide. */
+ src_strd -= wd << 1;
+ dst_strd -= wd << 1;
+
+ if(log_wd >= 1)
+ {
+ /* Fold the rounding term and pre-scaled offsets into single
+ * per-component constants (see luma variant). */
+ ofst_u = (1 << (log_wd - 1)) + (ofst_u << log_wd);
+ ofst_v = (1 << (log_wd - 1)) + (ofst_v << log_wd);
+
+ for(i = 0; i < ht; i++)
+ {
+ /* Each iteration handles one U byte then one V byte. */
+ for(j = 0; j < wd; j++, pu1_src++, pu1_dst++)
+ {
+ *pu1_dst = CLIP_U8((wt_u * (*pu1_src) + ofst_u) >> log_wd);
+ pu1_src++;
+ pu1_dst++;
+ *pu1_dst = CLIP_U8((wt_v * (*pu1_src) + ofst_v) >> log_wd);
+ }
+ pu1_src += src_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ else
+ {
+ /* log_wd == 0: no rounding shift, plain weight-and-offset. */
+ for(i = 0; i < ht; i++)
+ {
+ for(j = 0; j < wd; j++, pu1_src++, pu1_dst++)
+ {
+ *pu1_dst = CLIP_U8(wt_u * (*pu1_src) + ofst_u);
+ pu1_src++;
+ pu1_dst++;
+ *pu1_dst = CLIP_U8(wt_v * (*pu1_src) + ofst_v);
+ }
+ pu1_src += src_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_weighted_bi_pred_luma */
+/* */
+/* Description : This function performs the weighted biprediction as */
+/* described in sec 8.4.2.3.2 titled "Weighted sample */
+/* prediction process" for luma. The function gets two */
+/* ht x wd blocks, weights them, adds them, rounds off the */
+/* sum, offsets it, saturates it to unsigned 8-bit and */
+/* stores it in the destination block. (ht,wd) can be */
+/* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */
+/* */
+/* Inputs : puc_src1 - Pointer to source 1 */
+/* puc_src2 - Pointer to source 2 */
+/* puc_dst - Pointer to destination */
+/* src_strd1 - stride for source 1 */
+/* src_strd2 - stride for source 2 */
+/* dst_strd - stride for destination */
+/* log_wd - number of bits to be rounded off */
+/* wt1 - weight value for source 1 */
+/* wt2 - weight value for source 2 */
+/* ofst1 - offset value for source 1 */
+/* ofst2 - offset value for source 2 */
+/* ht - height of the block */
+/* wd - width of the block */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 01 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_weighted_bi_pred_luma(UWORD8 *pu1_src1,
+ UWORD8 *pu1_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 log_wd,
+ WORD32 wt1,
+ WORD32 wt2,
+ WORD32 ofst1,
+ WORD32 ofst2,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 i, j;
+ WORD32 shft, ofst;
+
+ /* Sign-extend the packed offsets (8-bit) and weights (16-bit). */
+ ofst1 = (WORD8)(ofst1 & 0xff);
+ ofst2 = (WORD8)(ofst2 & 0xff);
+ wt1 = (WORD16)(wt1 & 0xffff);
+ wt2 = (WORD16)(wt2 & 0xffff);
+ /* Combined offset per spec: (o1 + o2 + 1) >> 1. */
+ ofst = (ofst1 + ofst2 + 1) >> 1;
+
+ /* Fold rounding (2^log_wd) and the pre-scaled combined offset into
+ * one constant so a single shift by (log_wd + 1) finishes the sum. */
+ shft = log_wd + 1;
+ ofst = (1 << log_wd) + (ofst << shft);
+
+ /* Convert strides into end-of-row increments. */
+ src_strd1 -= wd;
+ src_strd2 -= wd;
+ dst_strd -= wd;
+
+ for(i = 0; i < ht; i++)
+ {
+ for(j = 0; j < wd; j++, pu1_src1++, pu1_src2++, pu1_dst++)
+ *pu1_dst = CLIP_U8((wt1 * (*pu1_src1) + wt2 * (*pu1_src2) + ofst) >> shft);
+
+ pu1_src1 += src_strd1;
+ pu1_src2 += src_strd2;
+ pu1_dst += dst_strd;
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_weighted_bi_pred_chroma */
+/* */
+/* Description : This function performs the weighted biprediction as */
+/* described in sec 8.4.2.3.2 titled "Weighted sample */
+/* prediction process" for chroma. The function gets two */
+/* ht x wd blocks, weights them, adds them, rounds off the */
+/* sum, offsets it, saturates it to unsigned 8-bit and */
+/* stores it in the destination block. (ht,wd) can be */
+/* (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8). */
+/* */
+/* Inputs : puc_src1 - Pointer to source 1 */
+/* puc_src2 - Pointer to source 2 */
+/* puc_dst - Pointer to destination */
+/* src_strd1 - stride for source 1 */
+/* src_strd2 - stride for source 2 */
+/* dst_strd - stride for destination */
+/* log_wd - number of bits to be rounded off */
+/* wt1 - weight values for u and v in source 1 */
+/* wt2 - weight values for u and v in source 2 */
+/* ofst1 - offset value for u and v in source 1 */
+/* ofst2 - offset value for u and v in source 2 */
+/* ht - height of the block */
+/* wd - width of the block */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 01 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_weighted_bi_pred_chroma(UWORD8 *pu1_src1,
+ UWORD8 *pu1_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 log_wd,
+ WORD32 wt1,
+ WORD32 wt2,
+ WORD32 ofst1,
+ WORD32 ofst2,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 i, j;
+ WORD32 wt1_u, wt1_v, wt2_u, wt2_v;
+ WORD32 ofst1_u, ofst1_v, ofst2_u, ofst2_v;
+ WORD32 ofst_u, ofst_v;
+ WORD32 shft;
+
+ /* Unpack and sign-extend per-component offsets (8-bit each) and
+ * weights (16-bit each) for U and V. */
+ ofst1_u = (WORD8)(ofst1 & 0xff);
+ ofst1_v = (WORD8)(ofst1 >> 8);
+ ofst2_u = (WORD8)(ofst2 & 0xff);
+ ofst2_v = (WORD8)(ofst2 >> 8);
+ wt1_u = (WORD16)(wt1 & 0xffff);
+ wt1_v = (WORD16)(wt1 >> 16);
+ wt2_u = (WORD16)(wt2 & 0xffff);
+ wt2_v = (WORD16)(wt2 >> 16);
+ /* Combined offsets per spec: (o1 + o2 + 1) >> 1, per component. */
+ ofst_u = (ofst1_u + ofst2_u + 1) >> 1;
+ ofst_v = (ofst1_v + ofst2_v + 1) >> 1;
+
+ /* Interleaved U/V: each chroma pixel is 2 bytes wide. */
+ src_strd1 -= wd << 1;
+ src_strd2 -= wd << 1;
+ dst_strd -= wd << 1;
+
+ /* Fold rounding (2^log_wd) and the pre-scaled combined offsets into
+ * single constants; one shift by (log_wd + 1) finishes each sum. */
+ shft = log_wd + 1;
+ ofst_u = (1 << log_wd) + (ofst_u << shft);
+ ofst_v = (1 << log_wd) + (ofst_v << shft);
+
+ for(i = 0; i < ht; i++)
+ {
+ /* Each iteration handles one U byte then one V byte. */
+ for(j = 0; j < wd; j++, pu1_src1++, pu1_src2++, pu1_dst++)
+ {
+ *pu1_dst = CLIP_U8((wt1_u * (*pu1_src1) + wt2_u * (*pu1_src2) + ofst_u) >> shft);
+ pu1_src1++;
+ pu1_src2++;
+ pu1_dst++;
+ *pu1_dst = CLIP_U8((wt1_v * (*pu1_src1) + wt2_v * (*pu1_src2) + ofst_v) >> shft);
+ }
+ pu1_src1 += src_strd1;
+ pu1_src2 += src_strd2;
+ pu1_dst += dst_strd;
+ }
+}
diff --git a/common/ih264_weighted_pred.h b/common/ih264_weighted_pred.h
new file mode 100755
index 0000000..f9b93b0
--- /dev/null
+++ b/common/ih264_weighted_pred.h
@@ -0,0 +1,164 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+*******************************************************************************
+* @file
+* ih264_weighted_pred.h
+*
+* @brief
+* Declarations of functions used for weighted prediction
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+* -ih264_default_weighted_pred_luma
+* -ih264_default_weighted_pred_chroma
+* -ih264_weighted_pred_luma
+* -ih264_weighted_pred_chroma
+* -ih264_weighted_bi_pred_luma
+* -ih264_weighted_bi_pred_chroma
+* -ih264_default_weighted_pred_luma_a9q
+* -ih264_default_weighted_pred_chroma_a9q
+* -ih264_weighted_pred_luma_a9q
+* -ih264_weighted_pred_chroma_a9q
+* -ih264_weighted_bi_pred_luma_a9q
+* -ih264_weighted_bi_pred_chroma_a9q
+* -ih264_default_weighted_pred_luma_av8
+* -ih264_default_weighted_pred_chroma_av8
+* -ih264_weighted_pred_luma_av8
+* -ih264_weighted_pred_chroma_av8
+* -ih264_weighted_bi_pred_luma_av8
+* -ih264_weighted_bi_pred_chroma_av8
+* -ih264_default_weighted_pred_luma_sse42
+* -ih264_default_weighted_pred_chroma_sse42
+* -ih264_weighted_pred_luma_sse42
+* -ih264_weighted_pred_chroma_sse42
+* -ih264_weighted_bi_pred_luma_sse42
+* -ih264_weighted_bi_pred_chroma_sse42
+*
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef IH264_WEIGHTED_PRED_H_
+#define IH264_WEIGHTED_PRED_H_
+
+/*****************************************************************************/
+/* Extern Function Declarations */
+/*****************************************************************************/
+/* Signature shared by default (unweighted-average) prediction functions. */
+typedef void ih264_default_weighted_pred_ft(UWORD8 *puc_src1,
+ UWORD8 *puc_src2,
+ UWORD8 *puc_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 ht,
+ WORD32 wd);
+
+/* Signature shared by uni-directional weighted prediction functions. */
+typedef void ih264_weighted_pred_ft(UWORD8 *puc_src,
+ UWORD8 *puc_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 log_wd,
+ WORD32 wt,
+ WORD32 ofst,
+ WORD32 ht,
+ WORD32 wd);
+
+/* Signature shared by bi-directional weighted prediction functions. */
+typedef void ih264_weighted_bi_pred_ft(UWORD8 *puc_src1,
+ UWORD8 *puc_src2,
+ UWORD8 *puc_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 log_wd,
+ WORD32 wt1,
+ WORD32 wt2,
+ WORD32 ofst1,
+ WORD32 ofst2,
+ WORD32 ht,
+ WORD32 wd);
+
+/* No NEON Declarations */
+/* Per-architecture variants below all share the signatures typedef'ed
+ * above; this first group is the plain C reference implementation. */
+
+ih264_default_weighted_pred_ft ih264_default_weighted_pred_luma;
+
+ih264_default_weighted_pred_ft ih264_default_weighted_pred_chroma;
+
+ih264_weighted_pred_ft ih264_weighted_pred_luma;
+
+ih264_weighted_pred_ft ih264_weighted_pred_chroma;
+
+ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_luma;
+
+ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_chroma;
+
+/* A9 NEON Declarations */
+
+ih264_default_weighted_pred_ft ih264_default_weighted_pred_luma_a9q;
+
+ih264_default_weighted_pred_ft ih264_default_weighted_pred_chroma_a9q;
+
+ih264_weighted_pred_ft ih264_weighted_pred_luma_a9q;
+
+ih264_weighted_pred_ft ih264_weighted_pred_chroma_a9q;
+
+ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_luma_a9q;
+
+ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_chroma_a9q;
+
+
+/* AV8 NEON Declarations */
+
+ih264_default_weighted_pred_ft ih264_default_weighted_pred_luma_av8;
+
+ih264_default_weighted_pred_ft ih264_default_weighted_pred_chroma_av8;
+
+ih264_weighted_pred_ft ih264_weighted_pred_luma_av8;
+
+ih264_weighted_pred_ft ih264_weighted_pred_chroma_av8;
+
+ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_luma_av8;
+
+ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_chroma_av8;
+
+
+/* SSE42 Intrinsic Declarations */
+
+ih264_default_weighted_pred_ft ih264_default_weighted_pred_luma_sse42;
+
+ih264_default_weighted_pred_ft ih264_default_weighted_pred_chroma_sse42;
+
+ih264_weighted_pred_ft ih264_weighted_pred_luma_sse42;
+
+ih264_weighted_pred_ft ih264_weighted_pred_chroma_sse42;
+
+ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_luma_sse42;
+
+ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_chroma_sse42;
+
+#endif /* IH264_WEIGHTED_PRED_H_ */
+
+/** Nothing past this point */
+
+#endif /* IH264_WEIGHTED_PRED_H_ */
+
+/** Nothing past this point */
diff --git a/common/ithread.c b/common/ithread.c
new file mode 100755
index 0000000..4ffb98a
--- /dev/null
+++ b/common/ithread.c
@@ -0,0 +1,604 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/*****************************************************************************/
+/* */
+/* File Name : ithread.c */
+/* */
+/* Description : Contains abstraction for threads, mutex and semaphores*/
+/* */
+/* List of Functions : */
+/* */
+/* Issues / Problems : None */
+/* */
+/* Revision History : */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 09 2012 Harish Initial Version */
+/*****************************************************************************/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <string.h>
+#include "ih264_typedefs.h"
+
+/*
+ * If the end target is bare metal, then there shall be no OS.
+ * In this case, the functions ithread_* used inside the h264 encoder library to assist multicore
+ * will no longer be functional. To resolve link issues, the functions are re-defined with no body.
+ */
+#ifndef BAREMETAL
+
+
+#include "ithread.h"
+#include <sys/types.h>
+
+
+#define UNUSED(x) ((void)(x))
+
+#ifndef X86_MSVC
+//#define PTHREAD_AFFINITY
+//#define SYSCALL_AFFINITY
+
+#ifdef PTHREAD_AFFINITY
+#define _GNU_SOURCE
+#define __USE_GNU
+#endif
+
+#include <pthread.h>
+#include <sched.h>
+#include <semaphore.h>
+#include <unistd.h>
+#ifdef PTHREAD_AFFINITY
+#include <sys/prctl.h>
+#endif
+
+#endif
+
+#ifdef X86_MSVC
+
+#include <windows.h>
+#define SEM_MAX_COUNT 100
+#define SEM_INCREMENT_COUNT 1
+
+/* Storage size a caller must allocate for a thread handle
+   (a Win32 HANDLE on this X86_MSVC path). */
+UWORD32 ithread_get_handle_size(void)
+{
+    return (sizeof(HANDLE));
+}
+
+/* Storage size a caller must allocate for a mutex lock (also a HANDLE:
+   the mutex is emulated with a binary Win32 semaphore below). */
+UWORD32 ithread_get_mutex_lock_size(void)
+{
+    return (sizeof(HANDLE));
+}
+
+/* Creates a thread running strt(argument) and stores its handle in
+   *thread_handle. 'attribute' is accepted for interface parity with the
+   pthread path and ignored here. Returns 0 on success, -1 on failure. */
+WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument)
+{
+    HANDLE *ppv_thread_handle;
+    HANDLE thread_handle_value;
+
+    UNUSED(attribute);
+
+    if(0 == thread_handle)
+        return -1;
+
+    ppv_thread_handle = (HANDLE *)thread_handle;
+    thread_handle_value = CreateThread
+                    (NULL,                          /* Attributes */
+                     1024 * 128,                    /* Stack i4_size */
+                     (LPTHREAD_START_ROUTINE)strt,  /* Thread function */
+                     argument,                      /* Parameters */
+                     0,                             /* Creation flags */
+                     NULL);                         /* Thread ID */
+
+    /* CreateThread returns NULL on failure; report it instead of handing
+       the caller an invalid handle (the original stored it unchecked). */
+    if(NULL == thread_handle_value)
+        return -1;
+
+    *ppv_thread_handle = thread_handle_value;
+
+    return 0;
+}
+
+/* Blocks until the thread identified by *thread_handle terminates, then
+   closes its handle. val_ptr (the joined thread's return-value slot in
+   the pthread API) is ignored on this path. Returns -1 only for a NULL
+   handle pointer.
+   NOTE(review): also returns 0 when WaitForSingleObject fails, in which
+   case the handle is leaked -- confirm callers tolerate this. */
+WORD32 ithread_join(void *thread_handle, void ** val_ptr)
+{
+    HANDLE *ppv_thread_handle;
+    HANDLE thread_handle_value;
+
+    UNUSED(val_ptr);
+
+    if(0 == thread_handle)
+        return -1;
+
+    ppv_thread_handle = (HANDLE *)thread_handle;
+    thread_handle_value = *ppv_thread_handle;
+
+    if(WAIT_OBJECT_0 == WaitForSingleObject(thread_handle_value, INFINITE))
+    {
+        CloseHandle(thread_handle_value);
+    }
+
+    return 0;
+}
+
+/* Forcibly terminates the thread identified by *thread_handle. Note the
+   parameter differs from the pthread path, where ithread_exit() takes
+   the exiting thread's return value instead.
+   NOTE(review): GetExitCodeThread() returning non-zero only means the
+   query succeeded (the code may be STILL_ACTIVE); the thread is then
+   killed with TerminateThread, which runs no cleanup in the target and
+   can leak locks/stack -- verify callers only use this on threads that
+   are finished with shared state. */
+void ithread_exit(void *thread_handle)
+{
+    HANDLE *ppv_thread_handle;
+    HANDLE thread_handle_value;
+    DWORD thread_exit_code;
+
+    if(0 == thread_handle)
+        return;
+
+    ppv_thread_handle = (HANDLE *)thread_handle;
+    thread_handle_value = *ppv_thread_handle;
+    /* Get exit code for thread. If the return value is 0, means thread is busy */
+    if( 0 != GetExitCodeThread(thread_handle_value, &thread_exit_code))
+    {
+        TerminateThread(thread_handle_value, thread_exit_code);
+    }
+
+    return;
+}
+
+/* Storage size of the mutex object (a HANDLE: mutexes are emulated with
+   a single-count Win32 semaphore on this path). */
+WORD32 ithread_get_mutex_struct_size(void)
+{
+    return (sizeof(HANDLE));
+}
+
+/* Initialises a mutex as a binary (initial count 1, max 1) Win32
+   semaphore. Returns 0 on success, -1 on a NULL pointer or creation
+   failure. */
+WORD32 ithread_mutex_init(void *mutex)
+{
+    HANDLE *ppv_mutex_handle;
+    HANDLE mutex_handle_value;
+
+    if(0 == mutex)
+        return -1;
+
+    ppv_mutex_handle = (HANDLE *)mutex;
+    mutex_handle_value = CreateSemaphore(NULL, 1, 1, NULL);
+
+    /* CreateSemaphore returns NULL on failure; do not hand back an
+       unusable mutex (the original stored it unchecked). */
+    if(NULL == mutex_handle_value)
+        return -1;
+
+    *ppv_mutex_handle = mutex_handle_value;
+    return 0;
+}
+
+/* Destroys a mutex created by ithread_mutex_init() by closing its
+   semaphore handle. Returns 0, or -1 for a NULL pointer.
+   NOTE(review): the CloseHandle() result is ignored. */
+WORD32 ithread_mutex_destroy(void *mutex)
+{
+    HANDLE *ppv_mutex_handle;
+    HANDLE mutex_handle_value;
+
+    if(0 == mutex)
+        return -1;
+
+    ppv_mutex_handle = (HANDLE *)mutex;
+    mutex_handle_value = *ppv_mutex_handle;
+    CloseHandle(mutex_handle_value);
+    return 0;
+}
+
+/* Acquires the mutex by waiting (without timeout) on its semaphore.
+   Returns 0 on success, -1 for a NULL pointer.
+   NOTE(review): a failed wait returns 1, while every other failure in
+   this file returns -1 -- confirm callers only test for == 0. */
+WORD32 ithread_mutex_lock(void *mutex)
+{
+    HANDLE *ppv_mutex_handle;
+    HANDLE mutex_handle_value;
+    DWORD result = 0;
+
+    if(0 == mutex)
+        return -1;
+
+    ppv_mutex_handle = (HANDLE *)mutex;
+    mutex_handle_value = *ppv_mutex_handle;
+    result = WaitForSingleObject(mutex_handle_value, INFINITE);
+
+    if(WAIT_OBJECT_0 == result)
+        return 0;
+
+    return 1;
+
+}
+
+/* Releases the mutex by incrementing its semaphore count by 1.
+   Returns 0 on success, -1 on a NULL pointer or if ReleaseSemaphore
+   fails (e.g. the mutex was not held). */
+WORD32 ithread_mutex_unlock(void *mutex)
+{
+    HANDLE *ppv_mutex_handle;
+    HANDLE mutex_handle_value;
+    DWORD result = 0;
+
+    if(0 == mutex)
+        return -1;
+
+    ppv_mutex_handle = (HANDLE *)mutex;
+    mutex_handle_value = *ppv_mutex_handle;
+    result = ReleaseSemaphore(mutex_handle_value, 1, NULL);
+
+    if(0 == result)
+        return -1;
+
+    return 0;
+}
+
+/* Yielding is a no-op on this path (no SwitchToThread() call). */
+void ithread_yield(void) { }
+
+/* Sleeps for approximately u4_time_us microseconds. Win32 Sleep() has
+   millisecond resolution, so sub-millisecond requests round down to 0
+   (yield-like behaviour). */
+void ithread_usleep(UWORD32 u4_time_us)
+{
+    UWORD32 u4_time_ms = u4_time_us / 1000;
+    Sleep(u4_time_ms);
+}
+
+/* Sleeps for u4_time_ms milliseconds. */
+void ithread_msleep(UWORD32 u4_time_ms)
+{
+    Sleep(u4_time_ms);
+}
+
+/* Sleeps for u4_time seconds. */
+void ithread_sleep(UWORD32 u4_time)
+{
+    UWORD32 u4_time_ms = u4_time * 1000;
+    Sleep(u4_time_ms);
+}
+
+/* Storage size a caller must allocate for a semaphore (a HANDLE). */
+UWORD32 ithread_get_sem_struct_size(void)
+{
+    return (sizeof(HANDLE));
+}
+
+/* Initialises an unnamed Win32 semaphore with initial count 'value' and
+   maximum SEM_MAX_COUNT. 'pshared' (process sharing, a pthread concept)
+   is ignored here. Returns 0 on success, -1 on a NULL pointer or
+   creation failure. */
+WORD32 ithread_sem_init(void *sem,WORD32 pshared,UWORD32 value)
+{
+    HANDLE *sem_handle = (HANDLE *)sem;
+    HANDLE sem_handle_value;
+
+    /* Suppress the unreferenced-parameter warning, matching the file's
+       UNUSED() convention used elsewhere. */
+    UNUSED(pshared);
+
+    if(0 == sem)
+        return -1;
+
+    sem_handle_value = CreateSemaphore(NULL,          /* Security Attribute*/
+                                       value,         /* Initial count */
+                                       SEM_MAX_COUNT, /* Max value */
+                                       NULL);         /* Name, not used */
+
+    /* CreateSemaphore returns NULL on failure */
+    if(NULL == sem_handle_value)
+        return -1;
+
+    *sem_handle = sem_handle_value;
+    return 0;
+}
+
+/* Signals the semaphore (increments its count by SEM_INCREMENT_COUNT).
+   Returns 0 on success, -1 on a NULL pointer or release failure. */
+WORD32 ithread_sem_post(void *sem)
+{
+    HANDLE *sem_handle = (HANDLE *)sem;
+    HANDLE sem_handle_value;
+
+    if(0 == sem)
+        return -1;
+
+    sem_handle_value = *sem_handle;
+
+    /* Post on Semaphore by releasing the lock on mutex */
+    if(ReleaseSemaphore(sem_handle_value, SEM_INCREMENT_COUNT, NULL))
+        return 0;
+
+    return -1;
+}
+
+/* Waits (without timeout) until the semaphore is signalled.
+   Returns 0 on success, -1 on a NULL pointer or timeout.
+   NOTE(review): a WAIT_FAILED result falls through to the final
+   'return 0' and is reported as success -- confirm this is intended. */
+WORD32 ithread_sem_wait(void *sem)
+{
+    DWORD result = 0;
+    HANDLE *sem_handle = (HANDLE *)sem;
+    HANDLE sem_handle_value;
+
+    if(0 == sem)
+        return -1;
+
+    sem_handle_value = *sem_handle;
+
+    /* Wait on Semaphore object infinitely */
+    result = WaitForSingleObject(sem_handle_value, INFINITE);
+
+    /* If lock on semaphore is acquired, return SUCCESS */
+    if(WAIT_OBJECT_0 == result)
+        return 0;
+
+    /* If call timeouts, return FAILURE */
+    if(WAIT_TIMEOUT == result)
+        return -1;
+
+    return 0;
+}
+
+/* Destroys a semaphore created by ithread_sem_init() by closing its
+   handle. Returns 0 on success, -1 on a NULL pointer or close failure. */
+WORD32 ithread_sem_destroy(void *sem)
+{
+    HANDLE *sem_handle = (HANDLE *)sem;
+    HANDLE sem_handle_value;
+
+    if(0 == sem)
+        return -1;
+
+    sem_handle_value = *sem_handle;
+
+    if(FALSE == CloseHandle(sem_handle_value) )
+    {
+        return -1;
+    }
+    return 0;
+}
+
+/* Core affinity is not implemented on the X86_MSVC path; the request is
+   ignored. Returns 1 for interface parity with the pthread version. */
+WORD32 ithread_set_affinity(WORD32 core_id)
+{
+    /* Suppress the unreferenced-parameter warning (MSVC C4100),
+       matching the file's UNUSED() convention. */
+    UNUSED(core_id);
+    return 1;
+}
+
+#else
+
+/* Storage size a caller must allocate for a thread handle (pthread_t
+   on this POSIX path). */
+UWORD32 ithread_get_handle_size(void)
+{
+    return sizeof(pthread_t);
+}
+
+/* Storage size a caller must allocate for a mutex lock
+   (pthread_mutex_t). */
+UWORD32 ithread_get_mutex_lock_size(void)
+{
+    return sizeof(pthread_mutex_t);
+}
+
+
+/* Creates a thread running strt(argument), storing its id in
+   *thread_handle. 'attribute' is ignored (default attributes are used).
+   Returns pthread_create()'s result: 0 on success, an error number
+   otherwise. */
+WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument)
+{
+    UNUSED(attribute);
+    return pthread_create((pthread_t *)thread_handle, NULL,(void *(*)(void *)) strt, argument);
+}
+
+/* Waits for the given thread to finish; the thread's return value is
+   discarded (val_ptr is unused). Returns pthread_join()'s result:
+   0 on success, an error number otherwise. */
+WORD32 ithread_join(void *thread_handle, void ** val_ptr)
+{
+    /* Declaration first: the original placed UNUSED() before it, mixing
+       statements and declarations, which breaks C89 builds and differs
+       from every other function in this file. */
+    pthread_t *pthread_handle = (pthread_t *)thread_handle;
+
+    UNUSED(val_ptr);
+
+    return pthread_join(*pthread_handle, NULL);
+}
+
+/* Terminates the calling thread, making val_ptr available to a joiner.
+   pthread_exit() has return type void and never returns; the original
+   'return pthread_exit(val_ptr);' returned a void expression from a
+   void function, which is a constraint violation in strict C (accepted
+   only as a compiler extension), so it is called as a plain statement. */
+void ithread_exit(void *val_ptr)
+{
+    pthread_exit(val_ptr);
+}
+
+/* Storage size of a mutex object (pthread_mutex_t). */
+WORD32 ithread_get_mutex_struct_size(void)
+{
+    return(sizeof(pthread_mutex_t));
+}
+
+/* Initialises the mutex with default attributes; returns 0 on success. */
+WORD32 ithread_mutex_init(void *mutex)
+{
+    return pthread_mutex_init((pthread_mutex_t *) mutex, NULL);
+}
+
+/* Destroys the mutex; returns 0 on success. */
+WORD32 ithread_mutex_destroy(void *mutex)
+{
+    return pthread_mutex_destroy((pthread_mutex_t *) mutex);
+}
+
+/* Blocks until the mutex is acquired; returns 0 on success. */
+WORD32 ithread_mutex_lock(void *mutex)
+{
+    return pthread_mutex_lock((pthread_mutex_t *)mutex);
+}
+
+/* Releases the mutex; returns 0 on success. */
+WORD32 ithread_mutex_unlock(void *mutex)
+{
+    return pthread_mutex_unlock((pthread_mutex_t *)mutex);
+}
+
+/* Gives up the processor to another runnable thread. */
+void ithread_yield(void)
+{
+    sched_yield();
+}
+
+/* Sleeps for u4_time seconds. Implemented with sleep() rather than the
+   original usleep(u4_time * 1000 * 1000): that product overflows
+   UWORD32 for u4_time > 4294, and POSIX permits usleep() to fail
+   outright for arguments >= 1000000 microseconds. */
+void ithread_sleep(UWORD32 u4_time)
+{
+    sleep(u4_time);
+}
+
+/* Sleeps for u4_time_ms milliseconds.
+   NOTE(review): strict-POSIX usleep() may reject arguments >= 1000000,
+   i.e. u4_time_ms >= 1000 -- confirm target platforms accept it. */
+void ithread_msleep(UWORD32 u4_time_ms)
+{
+    usleep(u4_time_ms * 1000);
+}
+
+/* Sleeps for u4_time_us microseconds. */
+void ithread_usleep(UWORD32 u4_time_us)
+{
+    usleep(u4_time_us);
+}
+
+/* Storage size a caller must allocate for a semaphore (sem_t). */
+UWORD32 ithread_get_sem_struct_size(void)
+{
+    return(sizeof(sem_t));
+}
+
+
+/* Initialises an unnamed semaphore with initial count 'value';
+   'pshared' selects process sharing as in sem_init(). Returns 0 on
+   success, -1 on error. */
+WORD32 ithread_sem_init(void *sem,WORD32 pshared,UWORD32 value)
+{
+    return sem_init((sem_t *)sem,pshared,value);
+}
+
+/* Signals (increments) the semaphore; returns 0 on success. */
+WORD32 ithread_sem_post(void *sem)
+{
+    return sem_post((sem_t *)sem);
+}
+
+
+/* Blocks until the semaphore can be decremented; returns 0 on success. */
+WORD32 ithread_sem_wait(void *sem)
+{
+    return sem_wait((sem_t *)sem);
+}
+
+
+/* Destroys the semaphore; returns 0 on success. */
+WORD32 ithread_sem_destroy(void *sem)
+{
+    return sem_destroy((sem_t *)sem);
+}
+
+/* Intended to set the calling thread's name for debugging; the prctl()
+   call is commented out, so this is currently a no-op on every
+   platform. */
+void ithread_set_name(CHAR *pc_thread_name)
+{
+
+#ifndef WIN32
+#ifndef QNX
+#ifndef IOS
+    UNUSED(pc_thread_name);
+//prctl(PR_SET_NAME, (unsigned long)pu1_thread_name, 0, 0, 0);
+#endif
+#endif
+#endif
+
+}
+/* Pins the calling thread to 'core_id'. Only active when one of the
+   affinity schemes is compiled in (both are disabled by default above);
+   otherwise the request is ignored. Returns the scheme's result in the
+   active branches (-1 on failure), 1 otherwise. */
+WORD32 ithread_set_affinity(WORD32 core_id)
+{
+#ifdef PTHREAD_AFFINITY
+    cpu_set_t cpuset;
+    int num_cores = sysconf(_SC_NPROCESSORS_ONLN);
+    pthread_t cur_thread = pthread_self();
+
+    if (core_id >= num_cores)
+        return -1;
+
+    CPU_ZERO(&cpuset);
+    CPU_SET(core_id, &cpuset);
+
+    return pthread_setaffinity_np(cur_thread, sizeof(cpu_set_t), &cpuset);
+
+#elif defined(SYSCALL_AFFINITY)
+    /* Original read '#elif SYSCALL_AFFINITY', which errors out when the
+       macro is defined empty (as the commented '#define' above would),
+       and referenced an undeclared 'i4_mask'; both made this branch
+       uncompilable when enabled. */
+    WORD32 i4_sys_res;
+    WORD32 i4_mask = 1 << core_id;   /* affinity mask for the requested core */
+
+    pid_t pid = gettid();
+
+    i4_sys_res = syscall(__NR_sched_setaffinity, pid, sizeof(i4_mask), &i4_mask);
+    if (i4_sys_res)
+    {
+        //perror("Error in setaffinity syscall PERROR : ");
+        return -1;
+    }
+#else
+    UNUSED(core_id);
+#endif
+    return 1;
+
+}
+#endif
+
+#else
+
+/* Bare-metal build: no OS, so handles/mutexes/conditions are dummy
+   int-sized slots and every operation below is a stub. */
+UWORD32 ithread_get_handle_size(void)
+{
+    return sizeof(int);
+}
+
+UWORD32 ithread_get_mutex_lock_size(void)
+{
+    return sizeof(int);
+}
+
+/* NOTE(review): this symbol (and the condition-variable stubs below)
+   has no prototype in ithread.h -- confirm who calls it. */
+UWORD32 ithread_get_cond_size(void)
+{
+    return(sizeof(int));
+}
+/* All stubs below ignore their parameters and report success so that a
+   single-threaded bare-metal build links and runs.
+   NOTE(review): the unreferenced parameters will draw compiler
+   warnings (UNUSED() is not defined in this branch); also ithread.h
+   declares ithread_msleep() and ithread_get_mutex_struct_size(), for
+   which no bare-metal stubs exist -- confirm nothing links them. */
+WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument)
+{
+    return 0;
+}
+
+WORD32 ithread_join(void *thread_handle, void ** val_ptr)
+{
+    return 0;
+}
+
+void ithread_exit(void *val_ptr)
+{
+    return;
+}
+
+WORD32 ithread_mutex_init(void *mutex)
+{
+    return 0;
+}
+
+WORD32 ithread_mutex_destroy(void *mutex)
+{
+    return 0;
+}
+
+WORD32 ithread_mutex_lock(void *mutex)
+{
+    return 0;
+}
+
+WORD32 ithread_mutex_unlock(void *mutex)
+{
+    return 0;
+}
+
+void ithread_yield(void)
+{
+    return;
+}
+
+void ithread_sleep(UWORD32 u4_time_in_us)
+{
+    return;
+}
+
+void ithread_usleep(UWORD32 u4_time_us)
+{
+    return;
+}
+
+/* Storage size of the (dummy) semaphore object on bare metal.
+   Renamed from the misspelled 'ithread_get_sem_strcut_size': ithread.h
+   and the other build paths define ithread_get_sem_struct_size(), so
+   the typo left that symbol undefined in BAREMETAL builds. */
+UWORD32 ithread_get_sem_struct_size(void)
+{
+    return(sizeof(int));
+}
+
+
+/* Semaphore stubs: no-ops that report success on bare metal. */
+WORD32 ithread_sem_init(void *sem,WORD32 pshared,UWORD32 value)
+{
+    return 0;
+}
+
+WORD32 ithread_sem_post(void *sem)
+{
+    return 0;
+}
+
+
+WORD32 ithread_sem_wait(void *sem)
+{
+    return 0;
+}
+
+WORD32 ithread_sem_destroy(void *sem)
+{
+    return 0;
+}
+
+/* Thread naming is a no-op on bare metal. Parameter type changed from
+   UWORD8* to CHAR* so the definition matches the ithread_set_name()
+   prototype in ithread.h and the pthread-path definition (the original
+   mismatched signature is a conflicting declaration for any caller
+   compiled against the header). */
+void ithread_set_name(CHAR *pc_thread_name)
+{
+    (void)pc_thread_name;
+    return;
+}
+
+/* Condition-variable stubs (not declared in ithread.h -- see note on
+   ithread_get_cond_size() above). */
+void ithread_condition_init(void *condition)
+{
+    return;
+}
+
+void ithread_condition_signal(void * condition)
+{
+    return;
+}
+
+
+
+void ithread_condition_wait(void *condition,void *mutex)
+{
+    return;
+}
+
+/* Affinity is meaningless without an OS; report success (1), matching
+   the other build paths. */
+WORD32 ithread_set_affinity(WORD32 core_id)
+{
+    return 1;
+}
+#endif
diff --git a/common/ithread.h b/common/ithread.h
new file mode 100755
index 0000000..f926f83
--- /dev/null
+++ b/common/ithread.h
@@ -0,0 +1,104 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/*****************************************************************************/
+/* */
+/* File Name : ithread.h */
+/* */
+/* Description : This file contains all the necessary structure and */
+/* enumeration definitions needed for the Application */
+/* Program Interface(API) of the */
+/* Thread Abstraction Layer */
+/* */
+/* List of Functions : ithread_get_handle_size */
+/* ithread_get_mutex_lock_size */
+/* ithread_create */
+/* ithread_exit */
+/* ithread_join */
+/* ithread_get_mutex_struct_size */
+/* ithread_mutex_init */
+/* ithread_mutex_destroy */
+/* ithread_mutex_lock */
+/* ithread_mutex_unlock */
+/* ithread_yield */
+/* ithread_sleep */
+/* ithread_msleep */
+/* ithread_usleep */
+/* ithread_get_sem_struct_size */
+/* ithread_sem_init */
+/* ithread_sem_post */
+/* ithread_sem_wait */
+/* ithread_sem_destroy */
+/* ithread_set_affinity */
+/* */
+/* Issues / Problems : None */
+/* */
+/* Revision History : */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 06 09 2012 Harish Initial Version */
+/* */
+/*****************************************************************************/
+
+#ifndef _ITHREAD_H_
+#define _ITHREAD_H_
+
+UWORD32 ithread_get_handle_size(void);
+
+UWORD32 ithread_get_mutex_lock_size(void);
+
+WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument);
+
+void ithread_exit(void *val_ptr);
+
+WORD32 ithread_join(void *thread_id, void ** val_ptr);
+
+WORD32 ithread_get_mutex_struct_size(void);
+
+WORD32 ithread_mutex_init(void *mutex);
+
+WORD32 ithread_mutex_destroy(void *mutex);
+
+WORD32 ithread_mutex_lock(void *mutex);
+
+WORD32 ithread_mutex_unlock(void *mutex);
+
+void ithread_yield(void);
+
+void ithread_sleep(UWORD32 u4_time);
+
+void ithread_msleep(UWORD32 u4_time_ms);
+
+void ithread_usleep(UWORD32 u4_time_us);
+
+UWORD32 ithread_get_sem_struct_size(void);
+
+WORD32 ithread_sem_init(void *sem,WORD32 pshared,UWORD32 value);
+
+WORD32 ithread_sem_post(void *sem);
+
+WORD32 ithread_sem_wait(void *sem);
+
+WORD32 ithread_sem_destroy(void *sem);
+
+WORD32 ithread_set_affinity(WORD32 core_id);
+
+void ithread_set_name(CHAR *pc_thread_name);
+
+#endif /* _ITHREAD_H_ */
diff --git a/common/mips/ih264_platform_macros.h b/common/mips/ih264_platform_macros.h
new file mode 100755
index 0000000..d098372
--- /dev/null
+++ b/common/mips/ih264_platform_macros.h
@@ -0,0 +1,102 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_platform_macros.h
+*
+* @brief
+* Platform specific Macro definitions used in the codec
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+#ifndef _IH264_PLATFORM_MACROS_H_
+#define _IH264_PLATFORM_MACROS_H_
+
+#define CLIP_U8(x) CLIP3(0, 255, (x))
+#define CLIP_S8(x) CLIP3(-128, 127, (x))
+
+#define CLIP_U10(x) CLIP3(0, 1023, (x))
+#define CLIP_S10(x) CLIP3(-512, 511, (x))
+
+#define CLIP_U12(x) CLIP3(0, 4095, (x))
+#define CLIP_S12(x) CLIP3(-2048, 2047, (x))
+
+#define CLIP_U16(x) CLIP3(0, 65535, (x))
+#define CLIP_S16(x) CLIP3(-32768, 32767, (x))
+
+#define MEM_ALIGN16 __attribute__ ((aligned (16)))
+
+#define SHL(x,y) (((y) < 32) ? ((x) << (y)) : 0)
+#define SHR(x,y) (((y) < 32) ? ((x) >> (y)) : 0)
+
+#define SHR_NEG(val,shift) ((shift>0)?(val>>shift):(val<<(-shift)))
+#define SHL_NEG(val,shift) ((shift<0)?(val>>(-shift)):(val<<shift))
+
+
+#define ITT_BIG_ENDIAN(x) ((x << 24)) | \
+ ((x & 0x0000ff00) << 8) | \
+ ((x & 0x00ff0000) >> 8) | \
+ ((UWORD32)x >> 24);
+
+
+#define NOP(nop_cnt) {UWORD32 nop_i; for (nop_i = 0; nop_i < nop_cnt; nop_i++);}
+
+#define PLD(a)
+
+/* Count of leading zero bits in u4_word. __builtin_clz() is undefined
+   for 0, so that case is special-cased to 32 (the full word width). */
+static __inline UWORD32 CLZ(UWORD32 u4_word)
+{
+    if(u4_word)
+        return(__builtin_clz(u4_word));
+    else
+        return 32;
+}
+
+/* Count of trailing zero bits in u4_word. __builtin_ctz() is undefined
+   for 0, so that case is special-cased.
+   NOTE(review): returns 31 for an input of 0, whereas CLZ() above
+   returns 32 in the analogous case -- confirm the asymmetry is intended
+   (it matches the value a 31-bit scan would produce). */
+static __inline UWORD32 CTZ(UWORD32 u4_word)
+{
+    if(0 == u4_word)
+        return 31;
+    else
+    {
+        unsigned int index;
+        index = __builtin_ctz(u4_word);
+        return (UWORD32)index;
+    }
+}
+
+#define DATA_SYNC()
+
+#define INLINE
+
+#define PREFETCH(ptr, type)
+
+#define MEM_ALIGN8 __attribute__ ((aligned (8)))
+#define MEM_ALIGN16 __attribute__ ((aligned (16)))
+#define MEM_ALIGN32 __attribute__ ((aligned (32)))
+
+#endif /* _IH264_PLATFORM_MACROS_H_ */
diff --git a/common/x86/ih264_chroma_intra_pred_filters_ssse3.c b/common/x86/ih264_chroma_intra_pred_filters_ssse3.c
new file mode 100755
index 0000000..45101a4
--- /dev/null
+++ b/common/x86/ih264_chroma_intra_pred_filters_ssse3.c
@@ -0,0 +1,433 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_chroma_intra_pred_filters_ssse3.c
+*
+* @brief
+* Contains function definitions for chroma intra prediction filters in x86
+* intrinsics
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+* -ih264_intra_pred_chroma_8x8_mode_horz_ssse3
+* -ih264_intra_pred_chroma_8x8_mode_vert_ssse3
+* -ih264_intra_pred_chroma_8x8_mode_plane_ssse3
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* System include files */
+#include <stdio.h>
+#include <stddef.h>
+#include <string.h>
+
+/* User include files */
+#include "ih264_defs.h"
+#include "ih264_typedefs.h"
+#include "ih264_macros.h"
+#include "ih264_platform_macros.h"
+#include "ih264_intra_pred_filters.h"
+
+
+/*****************************************************************************/
+/* Chroma Intra prediction 8x8 filters */
+/*****************************************************************************/
+/**
+*******************************************************************************
+*
+* ih264_intra_pred_chroma_8x8_mode_horz_ssse3
+*
+* @brief
+* Perform Intra prediction for chroma_8x8 mode:Horizontal
+*
+* @par Description:
+* Perform Intra prediction for chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source containing alternate U and V samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination with alternate U and V samples
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] ngbr_avail
+* availability of neighbouring pixels(Not used in this function)
+*
+* @returns
+*
+* @remarks
+* None
+*
+******************************************************************************
+*/
+void ih264_intra_pred_chroma_8x8_mode_horz_ssse3(UWORD8 *pu1_src,
+                                                 UWORD8 *pu1_dst,
+                                                 WORD32 src_strd,
+                                                 WORD32 dst_strd,
+                                                 WORD32 ngbr_avail)
+{
+    /* Horizontal chroma prediction: each output row repeats the left
+       neighbour's interleaved (U,V) pair across all 8 pixel positions
+       (16 bytes). Rows are produced two at a time from one 16-byte load
+       of the left column, shifting the source left by one (U,V) pair
+       per row and broadcasting bytes 14..15 with pshufb. */
+
+    UWORD8 *pu1_left; /* Pointer to start of top predictors */
+    WORD32 dst_strd2;
+
+    __m128i left_16x8b, left_sh_16x8b;
+    __m128i row1_16x8b, row2_16x8b;
+    __m128i const_14_15_16x8b;
+
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    pu1_left = pu1_src + 2 * BLK8x8SIZE - 2;
+
+    /* 16 left-neighbour bytes; the pair for row 0 ends up in lanes 14,15 */
+    left_16x8b = _mm_loadu_si128((__m128i *)(pu1_left - 14));
+
+    /* pshufb mask replicating byte 14 (U) and byte 15 (V) across the row */
+    const_14_15_16x8b = _mm_set1_epi16(0x0f0e);
+
+    /* rows 0, 1 */
+    dst_strd2 = dst_strd << 1;
+    left_sh_16x8b = _mm_slli_si128(left_16x8b, 2);
+    row1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);
+    row2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);
+    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
+
+    /* rows 2, 3: shift two more (U,V) pairs into the broadcast lanes */
+    left_16x8b = _mm_slli_si128(left_16x8b, 4);
+    left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
+    pu1_dst += dst_strd2;
+    row1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);
+    row2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);
+    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
+
+    /* rows 4, 5 */
+    left_16x8b = _mm_slli_si128(left_16x8b, 4);
+    left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
+    pu1_dst += dst_strd2;
+    row1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);
+    row2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);
+    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
+
+    /* rows 6, 7 */
+    left_16x8b = _mm_slli_si128(left_16x8b, 4);
+    left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
+    pu1_dst += dst_strd2;
+    row1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);
+    row2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);
+    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
+}
+
+/**
+*******************************************************************************
+*
+* ih264_intra_pred_chroma_8x8_mode_vert_ssse3
+*
+* @brief
+* Perform Intra prediction for chroma_8x8 mode:vertical
+*
+* @par Description:
+* Perform Intra prediction for chroma_8x8 mode:vertical ,described in sec 8.3.4.3
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source containing alternate U and V samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination with alternate U and V samples
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] ngbr_avail
+* availability of neighbouring pixels(Not used in this function)
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+void ih264_intra_pred_chroma_8x8_mode_vert_ssse3(UWORD8 *pu1_src,
+                                                 UWORD8 *pu1_dst,
+                                                 WORD32 src_strd,
+                                                 WORD32 dst_strd,
+                                                 WORD32 ngbr_avail)
+{
+    /* Vertical chroma prediction: the 16 top-neighbour bytes (8
+       interleaved U,V pairs) are copied unchanged into each of the 8
+       output rows, two rows per store pair. */
+
+    UWORD8 *pu1_top; /* Pointer to start of top predictors */
+    WORD32 dst_strd2;
+
+    __m128i top_16x8b;
+
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    pu1_top = pu1_src + 2 * BLK8x8SIZE + 2;
+
+    top_16x8b = _mm_loadu_si128((__m128i *)pu1_top);
+
+    dst_strd2 = dst_strd << 1;
+    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
+
+    pu1_dst += dst_strd2;
+    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
+
+    pu1_dst += dst_strd2;
+    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
+
+    pu1_dst += dst_strd2;
+    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
+}
+
+/**
+*******************************************************************************
+*
+* ih264_intra_pred_chroma_8x8_mode_plane_ssse3
+*
+* @brief
+* Perform Intra prediction for chroma_8x8 mode:PLANE
+*
+* @par Description:
+* Perform Intra prediction for chroma_8x8 mode:PLANE ,described in sec 8.3.4.4
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source containing alternate U and V samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination with alternate U and V samples
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] ngbr_avail
+* availability of neighbouring pixels(Not used in this function)
+*
+* @returns
+*
+* @remarks
+* None
+*
+******************************************************************************
+*/
+void ih264_intra_pred_chroma_8x8_mode_plane_ssse3(UWORD8 *pu1_src,
+                                                  UWORD8 *pu1_dst,
+                                                  WORD32 src_strd,
+                                                  WORD32 dst_strd,
+                                                  WORD32 ngbr_avail)
+{
+    /* Plane chroma prediction: fits pred(x,y) = (a + b*(x-3) + c*(y-3)
+       + 16) >> 5 per plane (U and V interleaved throughout), with a, b,
+       c derived from the top and left neighbours as in the spec's plane
+       mode. The first section computes a/b/c for both planes; the
+       second evaluates the plane two rows at a time, carrying the +c
+       row increment in c2_8x16b. */
+
+    UWORD8 *pu1_left, *pu1_top;
+    WORD32 a_u, a_v, b_u, b_v, c_u, c_v;
+
+    __m128i mul_8x16b, shuffle_8x16b;
+
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    pu1_top = pu1_src + MB_SIZE + 2;
+    pu1_left = pu1_src + MB_SIZE - 2;
+
+    /* weights 1..4 for the H/V gradient sums, and a pshufb mask that
+       de-interleaves U/V while widening bytes to 16-bit lanes */
+    mul_8x16b = _mm_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4);
+    shuffle_8x16b = _mm_setr_epi16(0xff00, 0xff02, 0xff04, 0xff06,
+                                   0xff01, 0xff03, 0xff05, 0xff07);
+
+    //calculating a, b and c
+    {
+        WORD32 h_u, h_v, v_u, v_v;
+        WORD32 temp1, temp2;
+
+        __m128i h_val1_16x8b, h_val2_16x8b;
+        __m128i h_val1_8x16b, h_val2_8x16b, h_val_4x32b;
+        __m128i v_val1_16x8b, v_val2_16x8b;
+        __m128i v_val1_8x16b, v_val2_8x16b, v_val_4x32b;
+        __m128i hv_val_4x32b;
+
+        h_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top + 8));
+        h_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top - 2));
+        v_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 14));
+        v_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 4));
+
+        // reversing the order
+        h_val2_16x8b = _mm_shufflelo_epi16(h_val2_16x8b, 0x1b);
+        v_val1_16x8b = _mm_shufflelo_epi16(v_val1_16x8b, 0x1b);
+
+        // separating u and v and 8-bit to 16-bit conversion
+        h_val1_8x16b = _mm_shuffle_epi8(h_val1_16x8b, shuffle_8x16b);
+        h_val2_8x16b = _mm_shuffle_epi8(h_val2_16x8b, shuffle_8x16b);
+        v_val1_8x16b = _mm_shuffle_epi8(v_val1_16x8b, shuffle_8x16b);
+        v_val2_8x16b = _mm_shuffle_epi8(v_val2_16x8b, shuffle_8x16b);
+
+        /* pairwise differences, then the weighted sums H and V via madd */
+        h_val1_8x16b = _mm_sub_epi16(h_val1_8x16b, h_val2_8x16b);
+        v_val1_8x16b = _mm_sub_epi16(v_val1_8x16b, v_val2_8x16b);
+
+        h_val_4x32b = _mm_madd_epi16(mul_8x16b, h_val1_8x16b);
+        v_val_4x32b = _mm_madd_epi16(mul_8x16b, v_val1_8x16b);
+
+        /* corner samples used for the DC term 'a' */
+        temp1 = _mm_extract_epi16(h_val1_16x8b, 3);
+        temp2 = _mm_extract_epi16(v_val1_16x8b, 3);
+
+        hv_val_4x32b = _mm_hadd_epi32(h_val_4x32b, v_val_4x32b);
+
+        /* a = 16 * (top-right + bottom-left), per plane */
+        a_u = ((temp1 & 0xff) + (temp2 & 0xff)) << 4;
+        a_v = ((temp1 >> 8) + (temp2 >> 8)) << 4;
+
+        h_u = _mm_extract_epi16(hv_val_4x32b, 0);
+        h_v = _mm_extract_epi16(hv_val_4x32b, 2);
+        v_u = _mm_extract_epi16(hv_val_4x32b, 4);
+        v_v = _mm_extract_epi16(hv_val_4x32b, 6);
+
+        h_u = (h_u << 16) >> 15; // sign-extension and multiplication by 2
+        h_v = (h_v << 16) >> 15;
+        v_u = (v_u << 16) >> 15;
+        v_v = (v_v << 16) >> 15;
+
+        /* b = (17*H + 16) >> 5 and c = (17*V + 16) >> 5, folded as
+           (34*H + 32) >> 6 on the already-doubled values above */
+        b_u = ((h_u << 4) + h_u + 32) >> 6;
+        b_v = ((h_v << 4) + h_v + 32) >> 6;
+        c_u = ((v_u << 4) + v_u + 32) >> 6;
+        c_v = ((v_v << 4) + v_v + 32) >> 6;
+    }
+    //using a, b and c to compute the fitted plane values
+    {
+        __m128i const_8x16b, c2_8x16b;
+        __m128i res1_l_8x16b, res1_h_8x16b;
+        __m128i res2_l_8x16b, res2_h_8x16b;
+        __m128i res1_sh_l_8x16b, res1_sh_h_8x16b, res1_16x8b;
+        __m128i res2_sh_l_8x16b, res2_sh_h_8x16b, res2_16x8b;
+
+        WORD32 b_u2, b_v2, b_u3, b_v3;
+        WORD32 const_u, const_v;
+        WORD32 dst_strd2;
+
+        /* row-0 base term: a - 3*c + 16 (rounding folded in) */
+        const_u = a_u - (c_u << 1) - c_u + 16;
+        const_v = a_v - (c_v << 1) - c_v + 16;
+
+        b_u2 = b_u << 1;
+        b_v2 = b_v << 1;
+        b_u3 = b_u + b_u2;
+        b_v3 = b_v + b_v2;
+
+        const_8x16b = _mm_setr_epi16(const_u, const_v, const_u, const_v, const_u, const_v, const_u, const_v);
+        res1_l_8x16b = _mm_setr_epi16(-b_u3, -b_v3, -b_u2, -b_v2, -b_u, -b_v, 0, 0);
+        //contains {-b*3, -b*2, -b*1, b*0}
+        res1_h_8x16b = _mm_setr_epi16(b_u, b_v, b_u2, b_v2, b_u3, b_v3, b_u << 2, b_v << 2);
+        //contains {b*1, b*2, b*3, b*4}
+        c2_8x16b = _mm_setr_epi16(c_u, c_v, c_u, c_v, c_u, c_v, c_u, c_v);
+
+        // rows 1, 2
+        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, const_8x16b);
+        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, const_8x16b);
+        res2_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
+        res2_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
+
+        res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
+        res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
+        res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
+        res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
+
+        dst_strd2 = dst_strd << 1;
+        /* from here on c2_8x16b holds 2*c: the per-iteration (two-row)
+           vertical increment */
+        c2_8x16b = _mm_slli_epi16(c2_8x16b, 1);
+
+        /* pack with unsigned saturation = clip to [0,255] */
+        res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
+        res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
+
+        _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
+        _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
+
+        // rows 3, 4
+        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
+        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
+        res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
+        res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
+
+        res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
+        res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
+        res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
+        res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
+
+        pu1_dst += dst_strd2;
+
+        res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
+        res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
+
+        _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
+        _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
+
+        // rows 5, 6
+        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
+        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
+        res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
+        res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
+
+        res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
+        res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
+        res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
+        res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
+
+        pu1_dst += dst_strd2;
+
+        res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
+        res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
+
+        _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
+        _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
+
+        // rows 7, 8
+        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
+        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
+        res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
+        res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
+
+        res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
+        res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
+        res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
+        res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
+
+        pu1_dst += dst_strd2;
+
+        res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
+        res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
+
+        _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
+        _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
+
+    }
+}
diff --git a/common/x86/ih264_deblk_chroma_ssse3.c b/common/x86/ih264_deblk_chroma_ssse3.c
new file mode 100755
index 0000000..a36447a
--- /dev/null
+++ b/common/x86/ih264_deblk_chroma_ssse3.c
@@ -0,0 +1,1087 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/*****************************************************************************/
+/* */
+/* File Name : ih264_deblk_chroma_ssse3.c */
+/* */
+/* Description : Contains function definitions for deblocking */
+/* */
+/* List of Functions : ih264_deblk_chroma_vert_bs4_ssse3() */
+/* ih264_deblk_chroma_horz_bs4_ssse3() */
+/* ih264_deblk_chroma_vert_bslt4_ssse3() */
+/* ih264_deblk_chroma_horz_bslt4_ssse3() */
+/* ih264_deblk_chroma_vert_bs4_mbaff_ssse3() */
+/* ih264_deblk_chroma_vert_bslt4_mbaff_ssse3() */
+/* */
+/* Issues / Problems : None */
+/* */
+/* Revision History : */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 12 02 2015 Naveen Kumar P Added chroma deblocking ssse3 */
+/* intrinsics */
+/* */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* System include files */
+#include <stdio.h>
+
+/* User include files */
+#include "ih264_typedefs.h"
+#include "ih264_platform_macros.h"
+#include "ih264_deblk_edge_filters.h"
+#include "ih264_macros.h"
+
+/*****************************************************************************/
+/* Function Definitions */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_chroma_vert_bs4_ssse3() */
+/* */
+/* Description : This function performs filtering of a chroma block */
+/* vertical edge when the boundary strength is set to 4 in */
+/* high profile. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 of U */
+/* src_strd - source stride */
+/* alpha_cb - alpha value for the boundary in U */
+/* beta_cb - beta value for the boundary in U */
+/* alpha_cr - alpha value for the boundary in V */
+/* beta_cr - beta value for the boundary in V */
+/* */
+/* Globals : None */
+/* */
+/* Processing : This operation is described in Sec. 8.7.2.4 under the */
+/* title "Filtering process for edges for bS equal to 4" in */
+/* ITU T Rec H.264 with alpha and beta values different in */
+/* U and V. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 12 02 2015 Naveen Kumar P Initial version */
+/* */
+/*****************************************************************************/
+void ih264_deblk_chroma_vert_bs4_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha_cb,
+ WORD32 beta_cb,
+ WORD32 alpha_cr,
+ WORD32 beta_cr)
+{
+ UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
+ /* Pack U and V thresholds into one 32-bit word; _mm_set1_epi32 on it */
+ /* gives alternating cb/cr 16-bit lanes, matching the interleaved UV data */
+ WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
+ WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
+ __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
+ __m128i temp1, temp2, temp3, temp4;
+
+ __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
+ __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
+ __m128i flag1, flag2;
+ __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
+ __m128i zero = _mm_setzero_si128();
+ __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
+
+ /* Load and transpose the pixel values */
+ /* Each row load picks up the 8 bytes straddling the vertical edge */
+ /* (p1 p0 | q0 q1, each an interleaved UV pair); after the transpose */
+ /* p1/p0/q0/q1 each fill one register covering all 8 rows */
+ linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
+ lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
+ linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
+ lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
+ linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
+ linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
+ lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
+ lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));
+
+ temp1 = _mm_unpacklo_epi16(linea, lineb);
+ temp2 = _mm_unpacklo_epi16(linec, lined);
+ temp3 = _mm_unpacklo_epi16(linee, linef);
+ temp4 = _mm_unpacklo_epi16(lineg, lineh);
+
+ p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
+ p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
+ q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
+ q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);
+
+ p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
+ p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
+ q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
+ q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
+ /* End of transpose */
+
+ /* Widen the low 8 bytes (rows 0-3) of each sample to 16 bits */
+ q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
+ q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
+ p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
+ p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
+
+ diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1: |p0 - q0| < alpha
+ diff = _mm_abs_epi16(diff);
+ alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
+ flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
+
+ diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2: |q1 - q0| < beta
+ diff = _mm_abs_epi16(diff);
+ beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
+ flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
+
+ diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3: |p1 - p0| < beta
+ diff = _mm_abs_epi16(diff);
+ flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
+
+ /* p0' = (2*p1 + p0 + q1 + 2) >> 2 */
+ temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
+ temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
+ temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
+ temp1 = _mm_add_epi16(temp1, temp2);
+ p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
+
+ /* q0' = (2*q1 + q0 + p1 + 2) >> 2 */
+ temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
+ temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
+ temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
+ temp1 = _mm_add_epi16(temp1, temp2);
+ q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
+
+ /* Repeat the conditions and the filter on the high 8 bytes (rows 4-7) */
+ q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
+ q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
+ p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
+ p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
+
+ diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1: |p0 - q0| < alpha
+ diff = _mm_abs_epi16(diff);
+ alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
+ flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
+
+ diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2: |q1 - q0| < beta
+ diff = _mm_abs_epi16(diff);
+ beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
+ flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
+
+ diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3: |p1 - p0| < beta
+ diff = _mm_abs_epi16(diff);
+ flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
+
+ /* p0' = (2*p1 + p0 + q1 + 2) >> 2 */
+ temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
+ temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
+ temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
+ temp1 = _mm_add_epi16(temp1, temp2);
+ p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
+
+ /* q0' = (2*q1 + q0 + p1 + 2) >> 2 */
+ temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
+ temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
+ temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
+ temp1 = _mm_add_epi16(temp1, temp2);
+ q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
+
+ /* Narrow both filtered halves back to bytes, and both 16-bit masks */
+ /* into a single byte mask */
+ p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
+ q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
+
+ flag1 = _mm_packs_epi16(flag1, flag2);
+
+ /* Byte-wise select: filtered value where the mask is set, original */
+ /* pixel where it is clear */
+ p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
+ _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
+ p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
+ p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
+
+ q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
+ _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
+ q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
+ q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
+
+ /* Inverse-transpose and store back */
+ temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
+ temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
+ temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
+ temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);
+
+ linea = _mm_unpacklo_epi32(temp1, temp3);
+ lineb = _mm_srli_si128(linea, 8);
+ linec = _mm_unpackhi_epi32(temp1, temp3);
+ lined = _mm_srli_si128(linec, 8);
+ linee = _mm_unpacklo_epi32(temp2, temp4);
+ linef = _mm_srli_si128(linee, 8);
+ lineg = _mm_unpackhi_epi32(temp2, temp4);
+ lineh = _mm_srli_si128(lineg, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
+
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_chroma_horz_bs4_ssse3() */
+/* */
+/* Description : This function performs filtering of a chroma block */
+/* horizontal edge when the boundary strength is set to 4 */
+/* in high profile. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 of U */
+/* src_strd - source stride */
+/* alpha_cb - alpha value for the boundary in U */
+/* beta_cb - beta value for the boundary in U */
+/* alpha_cr - alpha value for the boundary in V */
+/* beta_cr - beta value for the boundary in V */
+/* */
+/* Globals : None */
+/* */
+/* Processing : This operation is described in Sec. 8.7.2.4 under the */
+/* title "Filtering process for edges for bS equal to 4" in */
+/* ITU T Rec H.264 with alpha and beta values different in */
+/* U and V. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 12 02 2015 Naveen Kumar P Initial version */
+/* */
+/*****************************************************************************/
+void ih264_deblk_chroma_horz_bs4_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha_cb,
+ WORD32 beta_cb,
+ WORD32 alpha_cr,
+ WORD32 beta_cr)
+{
+ UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
+ WORD16 i16_posP1, i16_posP0, i16_posQ1;
+
+ UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
+ /* Pack U and V thresholds into one 32-bit word; _mm_set1_epi32 on it */
+ /* gives alternating cb/cr 16-bit lanes, matching the interleaved UV data */
+ WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
+ WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
+ __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
+ __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
+ __m128i flag1, flag2;
+ __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
+ __m128i zero = _mm_setzero_si128();
+ __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
+ __m128i temp1, temp2;
+
+ /* Horizontal edge: p1 sits two rows above pu1_src, p0 one row above, */
+ /* q0 at pu1_src and q1 one row below. No transpose is needed. */
+ pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
+
+ i16_posQ1 = src_strd;
+ i16_posP0 = src_strd;
+ i16_posP1 = 0;
+
+ q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
+ q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
+ p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
+ p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));
+
+ /* Widen the low 8 bytes of each sample row to 16 bits */
+ q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
+ q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
+ p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
+ p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
+
+ diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1: |p0 - q0| < alpha
+ diff = _mm_abs_epi16(diff);
+ alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
+ flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
+
+ diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2: |q1 - q0| < beta
+ diff = _mm_abs_epi16(diff);
+ beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
+ flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
+
+ diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3: |p1 - p0| < beta
+ diff = _mm_abs_epi16(diff);
+ flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
+
+ /* p0' = (2*p1 + p0 + q1 + 2) >> 2 */
+ temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
+ temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
+ temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
+ temp1 = _mm_add_epi16(temp1, temp2);
+ p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
+
+ /* q0' = (2*q1 + q0 + p1 + 2) >> 2 */
+ temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
+ temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
+ temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
+ temp1 = _mm_add_epi16(temp1, temp2);
+ q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
+
+ /* Repeat the conditions and the filter on the high 8 bytes */
+ q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
+ q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
+ p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
+ p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
+
+ diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1: |p0 - q0| < alpha
+ diff = _mm_abs_epi16(diff);
+ alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
+ flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
+
+ diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2: |q1 - q0| < beta
+ diff = _mm_abs_epi16(diff);
+ beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
+ flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
+
+ diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3: |p1 - p0| < beta
+ diff = _mm_abs_epi16(diff);
+ flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
+
+ /* p0' = (2*p1 + p0 + q1 + 2) >> 2 */
+ temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
+ temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
+ temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
+ temp1 = _mm_add_epi16(temp1, temp2);
+ p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
+
+ /* q0' = (2*q1 + q0 + p1 + 2) >> 2 */
+ temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
+ temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
+ temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
+ temp1 = _mm_add_epi16(temp1, temp2);
+ q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
+
+ /* Narrow both filtered halves and both masks back to bytes */
+ p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
+ q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
+
+ flag1 = _mm_packs_epi16(flag1, flag2);
+
+ /* Byte-wise select (filtered where mask set, original elsewhere), */
+ /* then store the p0 and q0 rows back in place */
+ p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
+ _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
+ p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
+ p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
+ _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);
+
+ q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
+ _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
+ q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
+ q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
+ _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);
+
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_chroma_vert_bslt4_ssse3() */
+/* */
+/* Description : This function performs filtering of a chroma block */
+/* vertical edge when the boundary strength is less than 4 */
+/* in high profile. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 of U */
+/* src_strd - source stride */
+/* alpha_cb - alpha value for the boundary in U */
+/* beta_cb - beta value for the boundary in U */
+/* alpha_cr - alpha value for the boundary in V */
+/* beta_cr - beta value for the boundary in V */
+/* u4_bs - packed Boundary strength array */
+/* pu1_cliptab_cb - tc0_table for U */
+/* pu1_cliptab_cr - tc0_table for V */
+/* */
+/* Globals : None */
+/* */
+/* Processing : This operation is described in Sec. 8.7.2.3 under the */
+/* title "Filtering process for edges for bS less than 4" */
+/* in ITU T Rec H.264 with alpha and beta values different */
+/* in U and V. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 12 02 2015 Naveen Kumar P Initial version */
+/* */
+/*****************************************************************************/
+void ih264_deblk_chroma_vert_bslt4_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha_cb,
+ WORD32 beta_cb,
+ WORD32 alpha_cr,
+ WORD32 beta_cr,
+ UWORD32 u4_bs,
+ const UWORD8 *pu1_cliptab_cb,
+ const UWORD8 *pu1_cliptab_cr)
+{
+ UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
+ UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
+ /* Pack U and V thresholds into one 32-bit word; _mm_set1_epi32 on it */
+ /* gives alternating cb/cr 16-bit lanes, matching the interleaved UV data */
+ WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
+ WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
+ __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
+ __m128i temp1, temp2, temp3, temp4;
+
+ __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
+ __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
+ __m128i flag_bs, flag1, flag2;
+ __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
+ __m128i zero = _mm_setzero_si128();
+ __m128i C0_uv_8x16;
+ __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
+
+ /* Unpack the four boundary strengths; Bs0 comes from the MS byte and */
+ /* applies to the first two rows (4 interleaved-UV bytes per Bs value) */
+ u1_Bs0 = (u4_bs >> 24) & 0xff;
+ u1_Bs1 = (u4_bs >> 16) & 0xff;
+ u1_Bs2 = (u4_bs >> 8) & 0xff;
+ u1_Bs3 = (u4_bs >> 0) & 0xff;
+
+ flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
+ u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
+ u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
+ flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
+ flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask (filter only where Bs != 0)
+
+ /* Load and transpose the pixel values */
+ /* Each row load picks up the 8 bytes straddling the vertical edge */
+ /* (p1 p0 | q0 q1, each an interleaved UV pair) */
+ linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
+ lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
+ linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
+ lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
+ linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
+ linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
+ lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
+ lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));
+
+ temp1 = _mm_unpacklo_epi16(linea, lineb);
+ temp2 = _mm_unpacklo_epi16(linec, lined);
+ temp3 = _mm_unpacklo_epi16(linee, linef);
+ temp4 = _mm_unpacklo_epi16(lineg, lineh);
+
+ p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
+ p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
+ q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
+ q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);
+
+ p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
+ p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
+ q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
+ q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
+ /* End of transpose */
+
+ /* Widen the low 8 bytes (rows 0-3) of each sample to 16 bits */
+ q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
+ q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
+ p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
+ p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
+
+ diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1: |p0 - q0| < alpha
+ diff = _mm_abs_epi16(diff);
+ alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
+ flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
+
+ diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2: |q1 - q0| < beta
+ diff = _mm_abs_epi16(diff);
+ beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
+ flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
+
+ diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3: |p1 - p0| < beta
+ diff = _mm_abs_epi16(diff);
+ flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
+
+ /* delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */
+ diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
+ diff = _mm_slli_epi16(diff, 2);
+ diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
+ diff = _mm_add_epi16(diff, diff1);
+ diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
+ in_macro = _mm_srai_epi16(diff, 3);
+
+ /* Per-lane tc0 from the U/V clip tables, indexed by the row's Bs */
+ C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
+ pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
+ pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
+ pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
+
+ /* Chroma clip threshold is tc0 + 1 */
+ C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
+
+ in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3(-C0, C0, delta)
+ C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
+ in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
+
+ /* p0' = p0 + delta, q0' = q0 - delta */
+ p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
+ q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
+
+ /* Repeat the conditions and the filter on the high 8 bytes (rows 4-7) */
+ q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
+ q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
+ p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
+ p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
+
+ diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1: |p0 - q0| < alpha
+ diff = _mm_abs_epi16(diff);
+ alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
+ flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
+
+ diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2: |q1 - q0| < beta
+ diff = _mm_abs_epi16(diff);
+ beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
+ flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
+
+ diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3: |p1 - p0| < beta
+ diff = _mm_abs_epi16(diff);
+ flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
+
+ /* delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */
+ diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
+ diff = _mm_slli_epi16(diff, 2);
+ diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
+ diff = _mm_add_epi16(diff, diff1);
+ diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
+ in_macro = _mm_srai_epi16(diff, 3);
+
+ C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
+ pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
+ pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
+ pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
+
+ C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
+
+ in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3(-C0, C0, delta)
+ C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
+ in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
+
+ p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
+ q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);
+
+ /* Narrow both filtered halves and both masks back to bytes */
+ p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
+ q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
+
+ flag1 = _mm_packs_epi16(flag1, flag2);
+ flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
+
+ /* Byte-wise select: filtered value where the mask is set, original */
+ /* pixel where it is clear */
+ p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
+ _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)))
+ p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
+ p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
+
+ q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
+ _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
+ q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
+ q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
+
+ /* Inverse-transpose and store back */
+ temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
+ temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
+ temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
+ temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);
+
+ linea = _mm_unpacklo_epi32(temp1, temp3);
+ lineb = _mm_srli_si128(linea, 8);
+ linec = _mm_unpackhi_epi32(temp1, temp3);
+ lined = _mm_srli_si128(linec, 8);
+ linee = _mm_unpacklo_epi32(temp2, temp4);
+ linef = _mm_srli_si128(linee, 8);
+ lineg = _mm_unpackhi_epi32(temp2, temp4);
+ lineh = _mm_srli_si128(lineg, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
+
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_chroma_horz_bslt4_ssse3() */
+/* */
+/* Description : This function performs filtering of a chroma block */
+/* horizontal edge when the boundary strength is less than */
+/* 4 in high profile. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 of U */
+/* src_strd - source stride */
+/* alpha_cb - alpha value for the boundary in U */
+/* beta_cb - beta value for the boundary in U */
+/* alpha_cr - alpha value for the boundary in V */
+/* beta_cr - beta value for the boundary in V */
+/* u4_bs - packed Boundary strength array */
+/* pu1_cliptab_cb - tc0_table for U */
+/* pu1_cliptab_cr - tc0_table for V */
+/* */
+/* Globals : None */
+/* */
+/* Processing : This operation is described in Sec. 8.7.2.3 under the */
+/* title "Filtering process for edges for bS less than 4" */
+/* in ITU T Rec H.264 with alpha and beta values different */
+/* in U and V. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 12 02 2015 Naveen Kumar P Initial version */
+/* */
+/*****************************************************************************/
+void ih264_deblk_chroma_horz_bslt4_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha_cb,
+ WORD32 beta_cb,
+ WORD32 alpha_cr,
+ WORD32 beta_cr,
+ UWORD32 u4_bs,
+ const UWORD8 *pu1_cliptab_cb,
+ const UWORD8 *pu1_cliptab_cr)
+{
+ UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
+ WORD16 i16_posP1, i16_posP0, i16_posQ1;
+ UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
+
+ UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
+ /* Pack U and V thresholds into one 32-bit word; _mm_set1_epi32 on it */
+ /* gives alternating cb/cr 16-bit lanes, matching the interleaved UV data */
+ WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
+ WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
+ __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
+ __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
+ __m128i flag_bs, flag1, flag2;
+ __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
+ __m128i zero = _mm_setzero_si128();
+ __m128i C0_uv_8x16;
+ __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
+
+ /* Horizontal edge: p1 sits two rows above pu1_src, p0 one row above, */
+ /* q0 at pu1_src and q1 one row below. No transpose is needed. */
+ pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
+
+ i16_posQ1 = src_strd;
+ i16_posP0 = src_strd;
+ i16_posP1 = 0;
+
+ /* Unpack the four boundary strengths; Bs0 comes from the MS byte and */
+ /* applies to the first four interleaved-UV bytes of the row */
+ u1_Bs0 = (u4_bs >> 24) & 0xff;
+ u1_Bs1 = (u4_bs >> 16) & 0xff;
+ u1_Bs2 = (u4_bs >> 8) & 0xff;
+ u1_Bs3 = (u4_bs >> 0) & 0xff;
+
+ flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
+ u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
+ u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
+ flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
+ flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask (filter only where Bs != 0)
+
+ q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
+ q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
+ p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
+ p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));
+
+ /* Widen the low 8 bytes of each sample row to 16 bits */
+ q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
+ q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
+ p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
+ p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
+
+ diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1: |p0 - q0| < alpha
+ diff = _mm_abs_epi16(diff);
+ alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
+ flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
+
+ diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2: |q1 - q0| < beta
+ diff = _mm_abs_epi16(diff);
+ beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
+ flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
+
+ diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3: |p1 - p0| < beta
+ diff = _mm_abs_epi16(diff);
+ flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
+
+ /* delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */
+ diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
+ diff = _mm_slli_epi16(diff, 2);
+ diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
+ diff = _mm_add_epi16(diff, diff1);
+ diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
+ in_macro = _mm_srai_epi16(diff, 3);
+
+ /* Per-lane tc0 from the U/V clip tables, indexed by the lane's Bs */
+ C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
+ pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
+ pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
+ pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
+
+ /* Chroma clip threshold is tc0 + 1 */
+ C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
+
+ in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3(-C0, C0, delta)
+ C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
+ in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
+
+ /* p0' = p0 + delta, q0' = q0 - delta */
+ p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
+ q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
+
+ /* Repeat the conditions and the filter on the high 8 bytes */
+ q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
+ q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
+ p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
+ p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
+
+ diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1: |p0 - q0| < alpha
+ diff = _mm_abs_epi16(diff);
+ alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
+ flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
+
+ diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2: |q1 - q0| < beta
+ diff = _mm_abs_epi16(diff);
+ beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
+ flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
+
+ diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3: |p1 - p0| < beta
+ diff = _mm_abs_epi16(diff);
+ flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
+
+ /* delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */
+ diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
+ diff = _mm_slli_epi16(diff, 2);
+ diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
+ diff = _mm_add_epi16(diff, diff1);
+ diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
+ in_macro = _mm_srai_epi16(diff, 3);
+
+ C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
+ pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
+ pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
+ pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
+
+ C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
+
+ in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3(-C0, C0, delta)
+ C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
+ in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
+
+ p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
+ q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);
+
+ /* Narrow both filtered halves and both masks back to bytes */
+ p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
+ q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
+
+ flag1 = _mm_packs_epi16(flag1, flag2);
+ flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
+
+ /* Byte-wise select (filtered where mask set, original elsewhere), */
+ /* then store the p0 and q0 rows back in place */
+ p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
+ _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
+ p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
+ p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
+ _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);
+
+ q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
+ _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
+ q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
+ q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
+ _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);
+
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_chroma_vert_bs4_mbaff_ssse3() */
+/* */
+/* Description : This function performs filtering of a chroma block */
+/* vertical edge when boundary strength is set to 4 in high */
+/* profile. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 of U */
+/* src_strd - source stride */
+/* alpha_cb - alpha value for the boundary in U */
+/* beta_cb - beta value for the boundary in U */
+/* alpha_cr - alpha value for the boundary in V */
+/* beta_cr - beta value for the boundary in V */
+/* u4_bs - packed Boundary strength array */
+/* pu1_cliptab_cb - tc0_table for U */
+/* pu1_cliptab_cr - tc0_table for V */
+/* */
+/* Globals : None */
+/* */
+/* Processing : When the function is called twice, this operation is as */
+/* described in Sec. 8.7.2.4 under the title "Filtering */
+/* process for edges for bS equal to 4" in ITU T Rec H.264 */
+/* with alpha and beta values different in U and V. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 12 02 2015 Naveen Kumar P Initial version */
+/* */
+/*****************************************************************************/
+void ih264_deblk_chroma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha_cb,
+ WORD32 beta_cb,
+ WORD32 alpha_cr,
+ WORD32 beta_cr)
+{
+ UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
+ /* Pack the Cb threshold in the low 16 bits and the Cr threshold in the */
+ /* high 16 bits; after _mm_set1_epi32 the 16-bit lanes alternate Cb,Cr, */
+ /* matching the interleaved UV samples produced by the transpose below. */
+ WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
+ WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
+ __m128i linea, lineb, linec, lined;
+ __m128i temp1, temp2;
+
+ __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
+ __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
+ __m128i flag1;
+ __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
+ __m128i zero = _mm_setzero_si128();
+ __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
+
+ /* Load and transpose the pixel values */
+ /* Each row holds four interleaved UV pairs around the vertical edge: */
+ /* p1 p0 | q0 q1 (2 bytes per pair, 4 MBAFF rows). */
+ linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
+ lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
+ linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
+ lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
+
+ temp1 = _mm_unpacklo_epi16(linea, lineb);
+ temp2 = _mm_unpacklo_epi16(linec, lined);
+
+ p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
+ p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
+ q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
+ q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
+ /* End of transpose */
+
+ /* Widen the 8 relevant bytes of each column to 16-bit lanes. */
+ q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
+ q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
+ p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
+ p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
+
+ diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1: ABS(p0 - q0) < alpha
+ diff = _mm_abs_epi16(diff);
+ alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
+ flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
+
+ diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2: ABS(q1 - q0) < beta
+ diff = _mm_abs_epi16(diff);
+ beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
+ flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
+
+ diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3: ABS(p1 - p0) < beta
+ diff = _mm_abs_epi16(diff);
+ flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
+
+ /* p0' = (2*p1 + p0 + q1 + 2) >> 2 (chroma bS=4 strong filter) */
+ temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
+ temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
+ temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
+ temp1 = _mm_add_epi16(temp1, temp2);
+ p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
+
+ /* q0' = (2*q1 + q0 + p1 + 2) >> 2 */
+ temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
+ temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
+ temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
+ temp1 = _mm_add_epi16(temp1, temp2);
+ q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
+
+ /* Narrow the filtered 16-bit values back to unsigned bytes. */
+ p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
+ q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);
+
+ flag1 = _mm_packs_epi16(flag1, flag1);
+
+ /* Blend: keep the original byte where the mask is 0, filtered where 0xFF. */
+ p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
+ _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
+ p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
+ p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
+
+ q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
+ _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
+ q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
+ q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
+
+ /* Inverse-transpose and store back */
+ temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
+ temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
+
+ linea = _mm_unpacklo_epi32(temp1, temp2);
+ lineb = _mm_srli_si128(linea, 8);
+ linec = _mm_unpackhi_epi32(temp1, temp2);
+ lined = _mm_srli_si128(linec, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
+
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_chroma_vert_bslt4_mbaff_ssse3() */
+/* */
+/* Description : This function performs filtering of a chroma block */
+/* vertical edge when boundary strength is less than 4 in */
+/* high profile. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 of U */
+/* src_strd - source stride */
+/* alpha_cb - alpha value for the boundary in U */
+/* beta_cb - beta value for the boundary in U */
+/* alpha_cr - alpha value for the boundary in V */
+/* beta_cr - beta value for the boundary in V */
+/* u4_bs - packed Boundary strength array */
+/* pu1_cliptab_cb - tc0_table for U */
+/* pu1_cliptab_cr - tc0_table for V */
+/* */
+/* Globals : None */
+/* */
+/* Processing : When the function is called twice, this operation is as */
+/* described in Sec. 8.7.2.4 under the title "Filtering */
+/* process for edges for bS less than 4" in ITU T Rec H.264 */
+/* with alpha and beta values different in U and V. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 12 02 2015 Naveen Kumar P Initial version */
+/* */
+/*****************************************************************************/
+void ih264_deblk_chroma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha_cb,
+ WORD32 beta_cb,
+ WORD32 alpha_cr,
+ WORD32 beta_cr,
+ UWORD32 u4_bs,
+ const UWORD8 *pu1_cliptab_cb,
+ const UWORD8 *pu1_cliptab_cr)
+{
+ UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
+ UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
+ /* Cb threshold in the low half-word, Cr in the high half-word; the */
+ /* _mm_set1_epi32 below makes the 16-bit lanes alternate Cb,Cr to match */
+ /* the interleaved UV samples. */
+ WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
+ WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
+ __m128i linea, lineb, linec, lined;
+ __m128i temp1, temp2;
+
+ __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
+ __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
+ __m128i flag_bs, flag1;
+ __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
+ __m128i zero = _mm_setzero_si128();
+ __m128i C0_uv_8x16;
+ __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
+
+ /* Unpack the four per-edge boundary strengths; Bs0 is taken from the */
+ /* most-significant byte of u4_bs. */
+ u1_Bs0 = (u4_bs >> 24) & 0xff;
+ u1_Bs1 = (u4_bs >> 16) & 0xff;
+ u1_Bs2 = (u4_bs >> 8) & 0xff;
+ u1_Bs3 = (u4_bs >> 0) & 0xff;
+
+ /* Duplicate each BS so it masks both the U and V byte of its row. */
+ flag_bs = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
+ u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);
+ flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
+ flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask: 0xFF where BS != 0
+
+ /* Load and transpose the pixel values */
+ /* Each row holds four interleaved UV pairs: p1 p0 | q0 q1. */
+ linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
+ lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
+ linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
+ lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
+
+ temp1 = _mm_unpacklo_epi16(linea, lineb);
+ temp2 = _mm_unpacklo_epi16(linec, lined);
+
+ p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
+ p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
+ q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
+ q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
+ /* End of transpose */
+
+ /* Widen the 8 relevant bytes of each column to 16-bit lanes. */
+ q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
+ q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
+ p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
+ p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
+
+ diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1: ABS(p0 - q0) < alpha
+ diff = _mm_abs_epi16(diff);
+ alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
+ flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
+
+ diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2: ABS(q1 - q0) < beta
+ diff = _mm_abs_epi16(diff);
+ beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
+ flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
+
+ diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3: ABS(p1 - p0) < beta
+ diff = _mm_abs_epi16(diff);
+ flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
+
+ /* delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, clipped to +/-C0 below */
+ diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
+ diff = _mm_slli_epi16(diff, 2);
+ diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
+ diff = _mm_add_epi16(diff, diff1);
+ diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
+ in_macro = _mm_srai_epi16(diff, 3);
+
+ /* Per-row tc0 lookup, separate tables for Cb and Cr, interleaved to */
+ /* match the UV lane order. */
+ C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
+ pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
+ pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
+ pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
+
+ /* Chroma clipping bound is tc0 + 1 (bS < 4 chroma filter). */
+ C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
+
+ in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
+ C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
+ in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
+
+ /* p0' = p0 + delta, q0' = q0 - delta */
+ p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
+ q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
+
+ p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
+ q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);
+
+ flag1 = _mm_packs_epi16(flag1, flag1);
+ flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
+
+ /* Blend: keep the original byte where the mask is 0, filtered where 0xFF. */
+ p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
+ _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
+ p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
+ p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
+
+ q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
+ _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
+ q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
+ q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
+
+ /* Inverse-transpose and store back */
+ temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
+ temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
+
+ linea = _mm_unpacklo_epi32(temp1, temp2);
+ lineb = _mm_srli_si128(linea, 8);
+ linec = _mm_unpackhi_epi32(temp1, temp2);
+ lined = _mm_srli_si128(linec, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
+ _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
+
+}
+
diff --git a/common/x86/ih264_deblk_luma_ssse3.c b/common/x86/ih264_deblk_luma_ssse3.c
new file mode 100755
index 0000000..440d5f0
--- /dev/null
+++ b/common/x86/ih264_deblk_luma_ssse3.c
@@ -0,0 +1,2012 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/*****************************************************************************/
+/* */
+/* File Name : ih264_deblk_luma_ssse3.c */
+/* */
+/* Description : Contains function definitions for deblocking */
+/* */
+/* List of Functions : ih264_deblk_luma_vert_bs4_ssse3() */
+/* ih264_deblk_luma_horz_bs4_ssse3() */
+/* ih264_deblk_luma_vert_bslt4_ssse3() */
+/* ih264_deblk_luma_horz_bslt4_ssse3() */
+/* ih264_deblk_luma_vert_bs4_mbaff_ssse3() */
+/* ih264_deblk_luma_vert_bslt4_mbaff_ssse3() */
+/* */
+/* Issues / Problems : None */
+/* */
+/* Revision History : */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 12 02 2015 Naveen Kumar P Added luma deblocking ssse3 */
+/* intrinsics */
+/* */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* System include files */
+#include <stdio.h>
+
+/* User include files */
+#include "ih264_typedefs.h"
+#include "ih264_platform_macros.h"
+#include "ih264_deblk_edge_filters.h"
+#include "ih264_macros.h"
+
+/*****************************************************************************/
+/* Function Definitions */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_luma_vert_bs4_ssse3() */
+/* */
+/* Description : This function performs filtering of a luma block */
+/* vertical edge when the boundary strength is set to 4. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 */
+/* src_strd - source stride */
+/* alpha - alpha value for the boundary */
+/* beta - beta value for the boundary */
+/* */
+/* Globals : None */
+/* */
+/* Processing : This operation is described in Sec. 8.7.2.4 under the */
+/* title "Filtering process for edges for bS equal to 4" in */
+/* ITU T Rec H.264. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 12 02 2015 Naveen Kumar P Initial version */
+/* */
+/*****************************************************************************/
+void ih264_deblk_luma_vert_bs4_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha,
+ WORD32 beta)
+{
+ __m128i zero = _mm_setzero_si128();
+ __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
+ __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
+ __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
+ __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
+ __m128i q0_16x8_1;
+ __m128i p0_16x8_1;
+ __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
+ __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
+ __m128i temp1, temp2, temp3, temp4, temp5, temp6;
+ __m128i Alpha_8x16, Beta_8x16;
+ __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
+ __m128i const_val2_16x8 = _mm_set1_epi16(2);
+ __m128i line1, line2, line3, line4, line5, line6, line7, line8;
+
+ Alpha_8x16 = _mm_set1_epi16(alpha);
+ Beta_8x16 = _mm_set1_epi16(beta);
+
+ /* Load rows 0-7 (8 bytes straddling the vertical edge) and begin */
+ /* transposing so each of p3..q3 holds one column across all 16 rows. */
+ line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
+ line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
+ line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
+ line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
+ line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
+ line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
+ line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
+ line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
+
+ temp1 = _mm_unpacklo_epi8(line1, line2);
+ temp2 = _mm_unpacklo_epi8(line3, line4);
+ temp3 = _mm_unpacklo_epi8(line5, line6);
+ temp4 = _mm_unpacklo_epi8(line7, line8);
+
+ line1 = _mm_unpacklo_epi16(temp1, temp2);
+ line2 = _mm_unpackhi_epi16(temp1, temp2);
+ line3 = _mm_unpacklo_epi16(temp3, temp4);
+ line4 = _mm_unpackhi_epi16(temp3, temp4);
+
+ p1_8x16 = _mm_unpacklo_epi32(line1, line3);
+ p0_8x16 = _mm_unpackhi_epi32(line1, line3);
+ q0_8x16 = _mm_unpacklo_epi32(line2, line4);
+ q1_8x16 = _mm_unpackhi_epi32(line2, line4);
+
+ /* Load rows 8-15 and finish the 16x8 transpose. */
+ line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd));
+ line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd));
+ line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd));
+ line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd));
+ line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd));
+ line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd));
+ line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd));
+ line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd));
+
+ temp1 = _mm_unpacklo_epi8(line1, line2);
+ temp2 = _mm_unpacklo_epi8(line3, line4);
+ temp3 = _mm_unpacklo_epi8(line5, line6);
+ temp4 = _mm_unpacklo_epi8(line7, line8);
+
+ line1 = _mm_unpacklo_epi16(temp1, temp2);
+ line2 = _mm_unpackhi_epi16(temp1, temp2);
+ line3 = _mm_unpacklo_epi16(temp3, temp4);
+ line4 = _mm_unpackhi_epi16(temp3, temp4);
+
+ temp1 = _mm_unpacklo_epi32(line1, line3);
+ temp2 = _mm_unpackhi_epi32(line1, line3);
+ temp3 = _mm_unpacklo_epi32(line2, line4);
+ temp4 = _mm_unpackhi_epi32(line2, line4);
+
+ p3_16x8 = _mm_unpacklo_epi64(p1_8x16, temp1);
+ p2_16x8 = _mm_unpackhi_epi64(p1_8x16, temp1);
+ q2_16x8 = _mm_unpacklo_epi64(q1_8x16, temp4);
+ q3_16x8 = _mm_unpackhi_epi64(q1_8x16, temp4);
+ p1_16x8 = _mm_unpacklo_epi64(p0_8x16, temp2);
+ p0_16x8 = _mm_unpackhi_epi64(p0_8x16, temp2);
+ q0_16x8 = _mm_unpacklo_epi64(q0_8x16, temp3);
+ q1_16x8 = _mm_unpackhi_epi64(q0_8x16, temp3);
+
+ //Cond1 (ABS(p0 - q0) < alpha)
+ /* saturating subtractions in both directions sum to |p0 - q0| */
+ temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
+ temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+
+ temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
+
+ flag1_16x8 = _mm_packs_epi16(temp2, temp1);
+
+ //Cond2 (ABS(q1 - q0) < beta)
+ temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
+ temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+
+ temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
+
+ flag2_16x8 = _mm_packs_epi16(temp2, temp1);
+
+ flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
+
+ //Cond3 (ABS(p1 - p0) < beta)
+ temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
+ temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+
+ temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
+
+ flag2_16x8 = _mm_packs_epi16(temp2, temp1);
+
+ // flag1 = (ABS(p0 - q0) < alpha) && (ABS(q1 - q0) < beta) && (ABS(p1 - p0) < beta)
+ flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
+
+ // (ABS(p0 - q0) < ((alpha >> 2) + 2)) : strong-filter eligibility
+ temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
+ temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+ Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
+ Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+ temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
+
+ flag2_16x8 = _mm_packs_epi16(temp2, temp1);
+ flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
+
+ // (ABS(p2 - p0) < beta) : flag3 selects the strong P-side filter
+ temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
+ temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+ temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
+
+ flag3_16x8 = _mm_packs_epi16(temp2, temp1);
+ flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
+
+ // (ABS(q2 - q0) < beta) : flag4 selects the strong Q-side filter
+ temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
+ temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+ temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
+
+ flag4_16x8 = _mm_packs_epi16(temp2, temp1);
+ flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
+
+ // First 8 pixels
+ p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
+ p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
+ p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
+ p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
+ q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
+ q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
+ q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
+ q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
+
+ // p0_1 and q0_1 : weak filter, p0' = (2*p1 + p0 + q1 + 2) >> 2 (q mirrored)
+ temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
+ temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
+ temp5 = _mm_add_epi16(temp1, const_val2_16x8);
+ temp6 = _mm_add_epi16(temp2, const_val2_16x8);
+ temp3 = _mm_slli_epi16(p1_8x16, 1);
+ temp4 = _mm_slli_epi16(q1_8x16, 1);
+ temp1 = _mm_add_epi16(temp5, temp3);
+ temp2 = _mm_add_epi16(temp6, temp4);
+ p0_16x8_1 = _mm_srai_epi16(temp1, 2);
+ q0_16x8_1 = _mm_srai_epi16(temp2, 2);
+
+ // p1_2 and q1_2 : p1' = (p2 + p1 + p0 + q0 + 2) >> 2 (q mirrored)
+ temp6 = _mm_add_epi16(temp6, p0_8x16);
+ temp5 = _mm_add_epi16(temp5, q0_8x16);
+ temp1 = _mm_add_epi16(temp6, p2_8x16);
+ temp2 = _mm_add_epi16(temp5, q2_8x16);
+ p1_16x8_2 = _mm_srai_epi16(temp1, 2);
+ q1_16x8_2 = _mm_srai_epi16(temp2, 2);
+
+ // p0_2 and q0_2 : strong, p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
+ temp1 = _mm_add_epi16(temp3, p2_8x16);
+ temp2 = _mm_add_epi16(temp4, q2_8x16);
+ temp1 = _mm_add_epi16(temp1, q1_8x16);
+ temp2 = _mm_add_epi16(temp2, p1_8x16);
+ temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
+ temp3 = _mm_slli_epi16(temp3, 1);
+ temp1 = _mm_add_epi16(temp1, temp3);
+ temp2 = _mm_add_epi16(temp2, temp3);
+ temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
+ temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
+ p0_16x8_2 = _mm_srai_epi16(temp1, 3);
+ q0_16x8_2 = _mm_srai_epi16(temp2, 3);
+
+ // p2_2 and q2_2 : p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3 (q mirrored)
+ temp1 = _mm_add_epi16(temp6, const_val2_16x8);
+ temp2 = _mm_add_epi16(temp5, const_val2_16x8);
+ temp3 = _mm_slli_epi16(p2_8x16, 1);
+ temp4 = _mm_slli_epi16(q2_8x16, 1);
+ temp3 = _mm_add_epi16(p2_8x16, temp3);
+ temp4 = _mm_add_epi16(q2_8x16, temp4);
+ temp5 = _mm_slli_epi16(p3_8x16, 1);
+ temp6 = _mm_slli_epi16(q3_8x16, 1);
+ temp1 = _mm_add_epi16(temp1, temp3);
+ temp2 = _mm_add_epi16(temp2, temp4);
+ temp1 = _mm_add_epi16(temp1, temp5);
+ temp2 = _mm_add_epi16(temp2, temp6);
+ p2_16x8_2 = _mm_srai_epi16(temp1, 3);
+ q2_16x8_2 = _mm_srai_epi16(temp2, 3);
+
+ // Second 8 pixels and packing with first 8 pixels
+ p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
+ p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
+ p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
+ p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
+ q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
+ q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
+ q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
+ q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);
+
+ // p0_1 and q0_1
+ temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
+ temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
+ temp5 = _mm_add_epi16(temp1, const_val2_16x8);
+ temp6 = _mm_add_epi16(temp2, const_val2_16x8);
+ temp3 = _mm_slli_epi16(p1_8x16, 1);
+ temp4 = _mm_slli_epi16(q1_8x16, 1);
+ temp1 = _mm_add_epi16(temp5, temp3);
+ temp2 = _mm_add_epi16(temp6, temp4);
+ temp1 = _mm_srai_epi16(temp1, 2);
+ temp2 = _mm_srai_epi16(temp2, 2);
+ p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
+ q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);
+
+ // p1_2 and q1_2
+ temp6 = _mm_add_epi16(temp6, p0_8x16);
+ temp5 = _mm_add_epi16(temp5, q0_8x16);
+ temp1 = _mm_add_epi16(temp6, p2_8x16);
+ temp2 = _mm_add_epi16(temp5, q2_8x16);
+ temp1 = _mm_srai_epi16(temp1, 2);
+ temp2 = _mm_srai_epi16(temp2, 2);
+ p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
+ q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);
+
+ // p0_2 and q0_2
+ temp1 = _mm_add_epi16(temp3, p2_8x16);
+ temp2 = _mm_add_epi16(temp4, q2_8x16);
+ temp1 = _mm_add_epi16(temp1, q1_8x16);
+ temp2 = _mm_add_epi16(temp2, p1_8x16);
+ temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
+ temp3 = _mm_slli_epi16(temp3, 1);
+ temp1 = _mm_add_epi16(temp1, temp3);
+ temp2 = _mm_add_epi16(temp2, temp3);
+ temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
+ temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
+ temp1 = _mm_srai_epi16(temp1, 3);
+ temp2 = _mm_srai_epi16(temp2, 3);
+ p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
+ q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);
+
+ // p2_2 and q2_2
+ temp1 = _mm_add_epi16(temp6, const_val2_16x8);
+ temp2 = _mm_add_epi16(temp5, const_val2_16x8);
+ temp3 = _mm_slli_epi16(p2_8x16, 1);
+ temp4 = _mm_slli_epi16(q2_8x16, 1);
+ temp3 = _mm_add_epi16(p2_8x16, temp3);
+ temp4 = _mm_add_epi16(q2_8x16, temp4);
+ temp5 = _mm_slli_epi16(p3_8x16, 1);
+ temp6 = _mm_slli_epi16(q3_8x16, 1);
+ temp1 = _mm_add_epi16(temp1, temp3);
+ temp2 = _mm_add_epi16(temp2, temp4);
+ temp1 = _mm_add_epi16(temp1, temp5);
+ temp2 = _mm_add_epi16(temp2, temp6);
+ temp1 = _mm_srai_epi16(temp1, 3);
+ temp2 = _mm_srai_epi16(temp2, 3);
+ p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
+ q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);
+
+ // p0 and q0 : apply weak-filter result where flag1 is set
+ p0_16x8 = _mm_and_si128(p0_16x8,
+ _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
+ p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
+ p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
+ q0_16x8 = _mm_and_si128(q0_16x8,
+ _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
+ q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
+ q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
+
+ // p0 and q0 : strong-filter results override where flag3/flag4 are set
+ p0_16x8 = _mm_and_si128(p0_16x8,
+ _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
+ p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
+ p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
+ q0_16x8 = _mm_and_si128(q0_16x8,
+ _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
+ q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
+ q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
+
+ // p1 and q1
+ p1_16x8 = _mm_and_si128(p1_16x8,
+ _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
+ p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
+ p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
+ q1_16x8 = _mm_and_si128(q1_16x8,
+ _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
+ q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
+ q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
+
+ // p2 and q2
+ p2_16x8 = _mm_and_si128(p2_16x8,
+ _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
+ p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
+ p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
+ q2_16x8 = _mm_and_si128(q2_16x8,
+ _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
+ q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
+ q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
+
+ /* Transpose back to row order and store rows 0-7. */
+ temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
+ temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
+ temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
+ temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
+
+ p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
+ p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
+ q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
+ q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
+
+ line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
+ line2 = _mm_srli_si128(line1, 8);
+ line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
+ line4 = _mm_srli_si128(line3, 8);
+ line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
+ line6 = _mm_srli_si128(line5, 8);
+ line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
+ line8 = _mm_srli_si128(line7, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
+
+ /* Transpose back and store rows 8-15. */
+ temp1 = _mm_unpackhi_epi8(p3_16x8, p2_16x8);
+ temp2 = _mm_unpackhi_epi8(p1_16x8, p0_16x8);
+ temp3 = _mm_unpackhi_epi8(q0_16x8, q1_16x8);
+ temp4 = _mm_unpackhi_epi8(q2_16x8, q3_16x8);
+
+ p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
+ p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
+ q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
+ q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
+
+ line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
+ line2 = _mm_srli_si128(line1, 8);
+ line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
+ line4 = _mm_srli_si128(line3, 8);
+ line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
+ line6 = _mm_srli_si128(line5, 8);
+ line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
+ line8 = _mm_srli_si128(line7, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd), line1);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd), line2);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd), line3);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd), line4);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd), line5);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd), line6);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd), line7);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd), line8);
+
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_luma_horz_bs4_ssse3() */
+/* */
+/* Description : This function performs filtering of a luma block */
+/* horizontal edge when the boundary strength is set to 4. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 */
+/* src_strd - source stride */
+/* alpha - alpha value for the boundary */
+/* beta - beta value for the boundary */
+/* */
+/* Globals : None */
+/* */
+/* Processing : This operation is described in Sec. 8.7.2.4 under the */
+/* title "Filtering process for edges for bS equal to 4" in */
+/* ITU T Rec H.264. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 12 02 2015 Naveen Kumar P Initial version */
+/* */
+/*****************************************************************************/
+void ih264_deblk_luma_horz_bs4_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha,
+ WORD32 beta)
+{
+ WORD16 i16_posP3, i16_posP2, i16_posP1, i16_posP0;
+ WORD16 i16_posQ1, i16_posQ2, i16_posQ3;
+ UWORD8 *pu1_HorzPixel;
+ __m128i zero = _mm_setzero_si128();
+ __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
+ __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
+ __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
+ __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
+ __m128i q0_16x8_1;
+ __m128i p0_16x8_1;
+ __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
+ __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
+ __m128i temp1, temp2, temp3, temp4, temp5, temp6;
+ __m128i Alpha_8x16, Beta_8x16;
+ __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
+ __m128i const_val2_16x8 = _mm_set1_epi16(2);
+
+ pu1_HorzPixel = pu1_src - (src_strd << 2);
+
+ i16_posQ1 = src_strd;
+ i16_posQ2 = X2(src_strd);
+ i16_posQ3 = X3(src_strd);
+ i16_posP0 = X3(src_strd);
+ i16_posP1 = X2(src_strd);
+ i16_posP2 = src_strd;
+ i16_posP3 = 0;
+
+ Alpha_8x16 = _mm_set1_epi16(alpha);
+ Beta_8x16 = _mm_set1_epi16(beta);
+
+ p3_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP3));
+ p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
+ p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
+ p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
+ q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
+ q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));
+ q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));
+ q3_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ3));
+
+ //Cond1 (ABS(p0 - q0) < alpha)
+ temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
+ temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+
+ temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
+
+ flag1_16x8 = _mm_packs_epi16(temp2, temp1);
+
+ //Cond2 (ABS(q1 - q0) < beta)
+ temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
+ temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+
+ temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
+
+ flag2_16x8 = _mm_packs_epi16(temp2, temp1);
+
+ flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
+
+ //Cond3 (ABS(p1 - p0) < beta)
+ temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
+ temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+
+ temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
+
+ flag2_16x8 = _mm_packs_epi16(temp2, temp1);
+
+ // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
+ flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
+
+ // (ABS(p0 - q0) < ((alpha >> 2) + 2))
+ temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
+ temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+ Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
+ Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+ temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
+
+ flag2_16x8 = _mm_packs_epi16(temp2, temp1);
+ flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
+
+ // (ABS(p2 - p0) < beta)
+ temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
+ temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+ temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
+
+ flag3_16x8 = _mm_packs_epi16(temp2, temp1);
+ flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
+
+ // (ABS(q2 - q0) < beta)
+ temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
+ temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+ temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
+
+ flag4_16x8 = _mm_packs_epi16(temp2, temp1);
+ flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
+
+ // First 8 pixels
+ p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
+ p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
+ p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
+ p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
+ q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
+ q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
+ q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
+ q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
+
+ // p0_1 and q0_1
+ temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
+ temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
+ temp5 = _mm_add_epi16(temp1, const_val2_16x8);
+ temp6 = _mm_add_epi16(temp2, const_val2_16x8);
+ temp3 = _mm_slli_epi16(p1_8x16, 1);
+ temp4 = _mm_slli_epi16(q1_8x16, 1);
+ temp1 = _mm_add_epi16(temp5, temp3);
+ temp2 = _mm_add_epi16(temp6, temp4);
+ p0_16x8_1 = _mm_srai_epi16(temp1, 2);
+ q0_16x8_1 = _mm_srai_epi16(temp2, 2);
+
+ // p1_2 and q1_2
+ temp6 = _mm_add_epi16(temp6, p0_8x16);
+ temp5 = _mm_add_epi16(temp5, q0_8x16);
+ temp1 = _mm_add_epi16(temp6, p2_8x16);
+ temp2 = _mm_add_epi16(temp5, q2_8x16);
+ p1_16x8_2 = _mm_srai_epi16(temp1, 2);
+ q1_16x8_2 = _mm_srai_epi16(temp2, 2);
+
+ // p0_2 and q0_2
+ temp1 = _mm_add_epi16(temp3, p2_8x16);
+ temp2 = _mm_add_epi16(temp4, q2_8x16);
+ temp1 = _mm_add_epi16(temp1, q1_8x16);
+ temp2 = _mm_add_epi16(temp2, p1_8x16);
+ temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
+ temp3 = _mm_slli_epi16(temp3, 1);
+ temp1 = _mm_add_epi16(temp1, temp3);
+ temp2 = _mm_add_epi16(temp2, temp3);
+ temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
+ temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
+ p0_16x8_2 = _mm_srai_epi16(temp1, 3);
+ q0_16x8_2 = _mm_srai_epi16(temp2, 3);
+
+ // p2_2 and q2_2
+ temp1 = _mm_add_epi16(temp6, const_val2_16x8);
+ temp2 = _mm_add_epi16(temp5, const_val2_16x8);
+ temp3 = _mm_slli_epi16(p2_8x16, 1);
+ temp4 = _mm_slli_epi16(q2_8x16, 1);
+ temp3 = _mm_add_epi16(p2_8x16, temp3);
+ temp4 = _mm_add_epi16(q2_8x16, temp4);
+ temp5 = _mm_slli_epi16(p3_8x16, 1);
+ temp6 = _mm_slli_epi16(q3_8x16, 1);
+ temp1 = _mm_add_epi16(temp1, temp3);
+ temp2 = _mm_add_epi16(temp2, temp4);
+ temp1 = _mm_add_epi16(temp1, temp5);
+ temp2 = _mm_add_epi16(temp2, temp6);
+ p2_16x8_2 = _mm_srai_epi16(temp1, 3);
+ q2_16x8_2 = _mm_srai_epi16(temp2, 3);
+
+ // Second 8 pixels and packing with first 8 pixels
+ p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
+ p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
+ p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
+ p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
+ q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
+ q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
+ q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
+ q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);
+
+ // p0_1 and q0_1
+ temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
+ temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
+ temp5 = _mm_add_epi16(temp1, const_val2_16x8);
+ temp6 = _mm_add_epi16(temp2, const_val2_16x8);
+ temp3 = _mm_slli_epi16(p1_8x16, 1);
+ temp4 = _mm_slli_epi16(q1_8x16, 1);
+ temp1 = _mm_add_epi16(temp5, temp3);
+ temp2 = _mm_add_epi16(temp6, temp4);
+ temp1 = _mm_srai_epi16(temp1, 2);
+ temp2 = _mm_srai_epi16(temp2, 2);
+ p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
+ q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);
+
+ // p1_2 and q1_2
+ temp6 = _mm_add_epi16(temp6, p0_8x16);
+ temp5 = _mm_add_epi16(temp5, q0_8x16);
+ temp1 = _mm_add_epi16(temp6, p2_8x16);
+ temp2 = _mm_add_epi16(temp5, q2_8x16);
+ temp1 = _mm_srai_epi16(temp1, 2);
+ temp2 = _mm_srai_epi16(temp2, 2);
+ p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
+ q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);
+
+ // p0_2 and q0_2
+ temp1 = _mm_add_epi16(temp3, p2_8x16);
+ temp2 = _mm_add_epi16(temp4, q2_8x16);
+ temp1 = _mm_add_epi16(temp1, q1_8x16);
+ temp2 = _mm_add_epi16(temp2, p1_8x16);
+ temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
+ temp3 = _mm_slli_epi16(temp3, 1);
+ temp1 = _mm_add_epi16(temp1, temp3);
+ temp2 = _mm_add_epi16(temp2, temp3);
+ temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
+ temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
+ temp1 = _mm_srai_epi16(temp1, 3);
+ temp2 = _mm_srai_epi16(temp2, 3);
+ p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
+ q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);
+
+ // p2_2 and q2_2
+ temp1 = _mm_add_epi16(temp6, const_val2_16x8);
+ temp2 = _mm_add_epi16(temp5, const_val2_16x8);
+ temp3 = _mm_slli_epi16(p2_8x16, 1);
+ temp4 = _mm_slli_epi16(q2_8x16, 1);
+ temp3 = _mm_add_epi16(p2_8x16, temp3);
+ temp4 = _mm_add_epi16(q2_8x16, temp4);
+ temp5 = _mm_slli_epi16(p3_8x16, 1);
+ temp6 = _mm_slli_epi16(q3_8x16, 1);
+ temp1 = _mm_add_epi16(temp1, temp3);
+ temp2 = _mm_add_epi16(temp2, temp4);
+ temp1 = _mm_add_epi16(temp1, temp5);
+ temp2 = _mm_add_epi16(temp2, temp6);
+ temp1 = _mm_srai_epi16(temp1, 3);
+ temp2 = _mm_srai_epi16(temp2, 3);
+ p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
+ q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);
+
+ // p0 and q0
+ p0_16x8 = _mm_and_si128(p0_16x8,
+ _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
+ p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
+ p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
+ q0_16x8 = _mm_and_si128(q0_16x8,
+ _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
+ q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
+ q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
+
+ // p0 and q0
+ p0_16x8 = _mm_and_si128(p0_16x8,
+ _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
+ p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
+ p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
+ q0_16x8 = _mm_and_si128(q0_16x8,
+ _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
+ q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
+ q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
+
+ // p1 and q1
+ p1_16x8 = _mm_and_si128(p1_16x8,
+ _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
+ p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
+ p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
+ q1_16x8 = _mm_and_si128(q1_16x8,
+ _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
+ q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
+ q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
+
+ // p2 and q2
+ p2_16x8 = _mm_and_si128(p2_16x8,
+ _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
+ p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
+ p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
+ q2_16x8 = _mm_and_si128(q2_16x8,
+ _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
+ q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
+ q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
+
+ _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP2), p2_16x8);
+ _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), p1_16x8);
+ _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), p0_16x8);
+
+ _mm_storeu_si128((__m128i *)(pu1_src), q0_16x8);
+ _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), q1_16x8);
+ _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ2), q2_16x8);
+
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_luma_vert_bslt4_ssse3() */
+/* */
+/* Description : This function performs filtering of a luma block */
+/* vertical edge when the boundary strength is less than 4. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 */
+/* src_strd - source stride */
+/* alpha - alpha value for the boundary */
+/* beta - beta value for the boundary */
+/* u4_bs - packed Boundary strength array */
+/* pu1_cliptab - tc0_table */
+/* */
+/* Globals : None */
+/* */
+/* Processing : This operation is described in Sec. 8.7.2.3 under the */
+/* title "Filtering process for edges for bS less than 4" */
+/* in ITU T Rec H.264. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 12 02 2015 Naveen Kumar P Initial version */
+/* */
+/*****************************************************************************/
+void ih264_deblk_luma_vert_bslt4_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha,
+ WORD32 beta,
+ UWORD32 u4_bs,
+ const UWORD8 *pu1_cliptab)
+{
+ UWORD8 u1_Bs, u1_Bs1; /* two boundary-strength bytes consumed per 8-row pass */
+
+ UWORD32 j = 0;
+
+ __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
+ __m128i int1, int2, int3, int4, high1, high2;
+ __m128i flag, flag1, i_C, i_C0;
+ __m128i i_Ap, i_Aq, diff, const1, const2, in_macro, in_macrotemp, temp,
+ temp1;
+ __m128i zero = _mm_setzero_si128();
+
+ /* The 16-row vertical edge is filtered in two passes of 8 rows each
+ (j = 0 and j = 8 * src_strd). Each pass transposes an 8x8 pixel block
+ straddling the edge into 16-bit lanes so all 8 rows filter in parallel. */
+ for(j = 0; j <= 8 * src_strd; j += 8 * src_strd)
+ {
+ //Transpose
+ /* Each row load starts at column -3, i.e. bytes p2 p1 p0 q0 q1 q2
+ plus two extra columns that are carried through unmodified. */
+ linea = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + j));
+ lineb = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + src_strd + j));
+ linec = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j));
+ lined = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j));
+
+ /* Widen bytes to 16-bit lanes before transposing. */
+ linea = _mm_unpacklo_epi8(linea, zero);
+ lineb = _mm_unpacklo_epi8(lineb, zero);
+ linec = _mm_unpacklo_epi8(linec, zero);
+ lined = _mm_unpacklo_epi8(lined, zero);
+
+ int1 = _mm_unpacklo_epi16(linea, lineb);
+ lineb = _mm_unpackhi_epi16(linea, lineb);
+
+ int2 = _mm_unpacklo_epi16(linec, lined);
+ lined = _mm_unpackhi_epi16(linec, lined);
+
+ linea = _mm_unpacklo_epi16(int1, int2);
+ int1 = _mm_unpackhi_epi16(int1, int2);
+
+ linec = _mm_unpacklo_epi16(lineb, lined);
+ high1 = _mm_unpackhi_epi16(lineb, lined);
+
+ linee = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j));
+ linef = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j));
+ lineg = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j));
+ lineh = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j));
+
+ linee = _mm_unpacklo_epi8(linee, zero);
+ linef = _mm_unpacklo_epi8(linef, zero);
+ lineg = _mm_unpacklo_epi8(lineg, zero);
+ lineh = _mm_unpacklo_epi8(lineh, zero);
+
+ int2 = _mm_unpacklo_epi16(linee, linef);
+ linef = _mm_unpackhi_epi16(linee, linef);
+
+ int3 = _mm_unpacklo_epi16(lineg, lineh);
+ lineh = _mm_unpackhi_epi16(lineg, lineh);
+
+ linee = _mm_unpacklo_epi16(int2, int3);
+ int2 = _mm_unpackhi_epi16(int2, int3);
+
+ lineg = _mm_unpacklo_epi16(linef, lineh);
+ high2 = _mm_unpackhi_epi16(linef, lineh);
+
+ int4 = _mm_unpacklo_epi16(linea, linee);
+ lineb = _mm_unpackhi_epi16(linea, linee);
+
+ int3 = _mm_unpacklo_epi16(int1, int2);
+ lined = _mm_unpackhi_epi16(int1, int2);
+
+ int2 = _mm_unpacklo_epi16(linec, lineg);
+ linef = _mm_unpackhi_epi16(linec, lineg);
+
+ linea = int4;
+ linec = int3;
+ linee = int2;
+
+ lineg = _mm_unpacklo_epi16(high1, high2);
+ lineh = _mm_unpackhi_epi16(high1, high2);
+
+ //end of transpose
+ /* After the transpose (per the Condn 1/2/3 usage below):
+ linea = p2, lineb = p1, linec = p0, lined = q0, linee = q1,
+ linef = q2; lineg/lineh hold the two extra loaded columns. */
+
+ /* Consume the top two Bs bytes for this pass; shift so the next
+ pass sees the remaining two. */
+ u1_Bs = (u4_bs >> 24) & 0xff;
+ u1_Bs1 = (u4_bs >> 16) & 0xff;
+ u4_bs <<= 16;
+
+ /* The two Bs values alternate across the 8 lanes; NOTE(review):
+ this lane interleave is assumed to match the row order produced
+ by the transpose above — verify against the transpose network. */
+ flag1 = _mm_set_epi16(u1_Bs1, u1_Bs, u1_Bs1, u1_Bs, u1_Bs1, u1_Bs,
+ u1_Bs1, u1_Bs);
+ flag1 = _mm_cmpeq_epi16(flag1, zero); //Set flag to 1s and 0s
+ flag1 = _mm_xor_si128(flag1, _mm_set1_epi16(0xFFFF)); //Invert for required mask
+
+ /* tc0 clip values looked up per-lane from the table, same
+ alternation as flag1. */
+ i_C0 = _mm_set_epi16(pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
+ pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
+ pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
+ pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs]);
+
+ /* Condn 1: ABS(p0 - q0) < alpha */
+ diff = _mm_subs_epi16(linec, lined); //Condn 1
+ diff = _mm_abs_epi16(diff);
+ const1 = _mm_set1_epi16(alpha);
+ flag = _mm_cmpgt_epi16(const1, diff);
+
+ /* Condn 2: ABS(q1 - q0) < beta */
+ diff = _mm_subs_epi16(linee, lined); //Condtn 2
+ diff = _mm_abs_epi16(diff);
+ const1 = _mm_set1_epi16(beta);
+ flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff));
+
+ /* Condn 3: ABS(p1 - p0) < beta */
+ diff = _mm_subs_epi16(lineb, linec); //Condtn 3
+ diff = _mm_abs_epi16(diff);
+ flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff)); //Const 1= Beta from now on
+
+ flag = _mm_and_si128(flag, flag1); //Final flag (ui_B condition + other 3 conditions)
+
+ //Adding Ap<Beta and Aq<Beta
+ /* C = C0 + (Ap < beta) + (Aq < beta): each cmpgt mask (0xFFFF)
+ is turned into +1 by subtracting it from zero. */
+ i_Ap = _mm_subs_epi16(linea, linec);
+ i_Ap = _mm_abs_epi16(i_Ap);
+ const2 = _mm_cmpgt_epi16(const1, i_Ap);
+ const2 = _mm_subs_epi16(zero, const2); //Make FFFF=1 and 0000=0
+ i_C = _mm_add_epi16(i_C0, const2);
+
+ i_Aq = _mm_subs_epi16(linef, lined);
+ i_Aq = _mm_abs_epi16(i_Aq);
+ const2 = _mm_cmpgt_epi16(const1, i_Aq);
+ const2 = _mm_subs_epi16(zero, const2);
+ i_C = _mm_add_epi16(i_C, const2);
+
+ //Calculate in_macro
+ /* delta = ((q0 - p0) << 2 + (p1 - q1) + 4) >> 3 */
+ diff = _mm_subs_epi16(lined, linec);
+ diff = _mm_slli_epi16(diff, 2);
+ const2 = _mm_subs_epi16(lineb, linee);
+ diff = _mm_add_epi16(diff, const2);
+ const2 = _mm_set1_epi16(4);
+ diff = _mm_add_epi16(diff, const2);
+ in_macro = _mm_srai_epi16(diff, 3);
+
+ /* Clamp delta to [-C, C] via min with C then max with -C. */
+ in_macro = _mm_min_epi16(i_C, in_macro); //CLIP3
+ i_C = _mm_subs_epi16(zero, i_C);
+ in_macro = _mm_max_epi16(i_C, in_macro);
+
+ //Compute and store
+ /* Blend: filtered p0 where flag set, original p0 elsewhere.
+ Kept in temp so the unfiltered p0 is still available below. */
+ in_macrotemp = _mm_add_epi16(linec, in_macro);
+ in_macrotemp = _mm_and_si128(in_macrotemp, flag);
+ temp = _mm_and_si128(linec,
+ _mm_xor_si128(flag, _mm_set1_epi16(0xFFFF)));
+ temp = _mm_add_epi16(temp, in_macrotemp);
+ //temp= _mm_packus_epi16 (temp, zero);
+ //_mm_storel_epi64(uc_HorzPixel+i16_posP0+i, in_macrotemp);
+
+ /* Same blend for q0 = q0 - delta, kept in temp1. */
+ in_macrotemp = _mm_subs_epi16(lined, in_macro);
+ in_macrotemp = _mm_and_si128(in_macrotemp, flag);
+ temp1 = _mm_and_si128(lined,
+ _mm_xor_si128(flag, _mm_set1_epi16(0xFFFF)));
+ temp1 = _mm_add_epi16(temp1, in_macrotemp);
+ //temp1= _mm_packus_epi16 (temp1, zero);
+ //_mm_storel_epi64(pu1_src+i, in_macrotemp);
+
+ //If Ap<Beta
+ /* p1 += clip3(-C0, C0, (p2 + ((p0 + q0 + 1) >> 1) - 2*p1) >> 1),
+ applied only where (Ap < beta) && flag. */
+ flag1 = _mm_cmpgt_epi16(const1, i_Ap);
+ flag1 = _mm_and_si128(flag, flag1);
+ in_macrotemp = _mm_add_epi16(linec, lined);
+ in_macrotemp = _mm_add_epi16(in_macrotemp, _mm_set1_epi16(1));
+ in_macrotemp = _mm_srai_epi16(in_macrotemp, 1);
+ in_macro = _mm_add_epi16(in_macrotemp, linea);
+ in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(lineb, 1));
+ in_macro = _mm_srai_epi16(in_macro, 1);
+
+ in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3
+ i_C0 = _mm_subs_epi16(zero, i_C0); /* i_C0 is now -C0 */
+ in_macro = _mm_max_epi16(i_C0, in_macro);
+
+ in_macro = _mm_and_si128(in_macro, flag1);
+ lineb = _mm_add_epi16(lineb, in_macro);
+ //in_macro= _mm_packus_epi16 (i_p1, zero);
+ //_mm_storel_epi64(uc_HorzPixel+i16_posP1+i, in_macro);
+
+ /* Symmetric q1 update, gated on (Aq < beta) && flag. */
+ flag1 = _mm_cmpgt_epi16(const1, i_Aq);
+ flag1 = _mm_and_si128(flag, flag1);
+ in_macro = _mm_add_epi16(in_macrotemp, linef);
+ in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(linee, 1));
+ in_macro = _mm_srai_epi16(in_macro, 1);
+
+ i_C0 = _mm_abs_epi16(i_C0); /* restore +C0 (was negated above) */
+ in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3
+ i_C0 = _mm_subs_epi16(zero, i_C0);
+ in_macro = _mm_max_epi16(i_C0, in_macro);
+
+ in_macro = _mm_and_si128(in_macro, flag1);
+ linee = _mm_add_epi16(linee, in_macro);
+ //in_macro= _mm_packus_epi16 (i_q1, zero);
+ //_mm_storel_epi64(pu1_src+i16_posQ1+i, in_macro);
+ /* Commit the blended p0/q0 computed earlier. */
+ linec = temp;
+ lined = temp1;
+ //End of filtering
+
+ /* Inverse transpose: lanes back to row-major order for store. */
+ int1 = _mm_unpacklo_epi16(linea, linee);
+ linee = _mm_unpackhi_epi16(linea, linee);
+
+ int2 = _mm_unpacklo_epi16(linec, lineg);
+ lineg = _mm_unpackhi_epi16(linec, lineg);
+
+ linea = _mm_unpacklo_epi16(int1, int2);
+ int3 = _mm_unpackhi_epi16(int1, int2);
+
+ linec = _mm_unpacklo_epi16(linee, lineg);
+ lineg = _mm_unpackhi_epi16(linee, lineg);
+
+ int1 = _mm_unpacklo_epi16(lineb, linef);
+ linef = _mm_unpackhi_epi16(lineb, linef);
+
+ int2 = _mm_unpacklo_epi16(lined, lineh);
+ lineh = _mm_unpackhi_epi16(lined, lineh);
+
+ lineb = _mm_unpacklo_epi16(int1, int2);
+ int4 = _mm_unpackhi_epi16(int1, int2);
+
+ lined = _mm_unpacklo_epi16(linef, lineh);
+ lineh = _mm_unpackhi_epi16(linef, lineh);
+
+ int1 = _mm_unpackhi_epi16(linea, lineb);
+ linea = _mm_unpacklo_epi16(linea, lineb);
+
+ int2 = _mm_unpacklo_epi16(int3, int4);
+ high1 = _mm_unpackhi_epi16(int3, int4);
+
+ lineb = _mm_unpacklo_epi16(linec, lined);
+ linef = _mm_unpackhi_epi16(linec, lined);
+
+ lined = _mm_unpacklo_epi16(lineg, lineh);
+ lineh = _mm_unpackhi_epi16(lineg, lineh);
+
+ linee = int1;
+ lineg = high1;
+ linec = int2;
+ //End of inverse transpose
+
+ //Packs and stores
+ /* Narrow 16-bit lanes back to bytes and write the 8 rows back. */
+ linea = _mm_packus_epi16(linea, zero);
+ _mm_storel_epi64((__m128i *)(pu1_src - 3 + j), linea);
+
+ lineb = _mm_packus_epi16(lineb, zero);
+ _mm_storel_epi64((__m128i *)(pu1_src - 3 + src_strd + j), lineb);
+
+ linec = _mm_packus_epi16(linec, zero);
+ _mm_storel_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j), linec);
+
+ lined = _mm_packus_epi16(lined, zero);
+ _mm_storel_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j), lined);
+
+ linee = _mm_packus_epi16(linee, zero);
+ _mm_storel_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j), linee);
+
+ linef = _mm_packus_epi16(linef, zero);
+ _mm_storel_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j), linef);
+
+ lineg = _mm_packus_epi16(lineg, zero);
+ _mm_storel_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j), lineg);
+
+ lineh = _mm_packus_epi16(lineh, zero);
+ _mm_storel_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j), lineh);
+
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_luma_horz_bslt4_ssse3() */
+/* */
+/* Description : This function performs filtering of a luma block */
+/* horizontal edge when boundary strength is less than 4. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 */
+/* src_strd - source stride */
+/* alpha - alpha value for the boundary */
+/* beta - beta value for the boundary */
+/* u4_bs - packed Boundary strength array */
+/* pu1_cliptab - tc0_table */
+/* */
+/* Globals : None */
+/* */
+/* Processing : This operation is described in Sec. 8.7.2.3 under the */
+/* title "Filtering process for edges for bS less than 4" */
+/* in ITU T Rec H.264. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 12 02 2015 Naveen Kumar P Initial version */
+/* */
+/*****************************************************************************/
+void ih264_deblk_luma_horz_bslt4_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha,
+ WORD32 beta,
+ UWORD32 u4_bs,
+ const UWORD8 *pu1_cliptab)
+{
+ WORD16 i16_posP2, i16_posP1, i16_posP0, i16_posQ1, i16_posQ2;
+ UWORD8 *pu1_HorzPixel;
+ __m128i zero = _mm_setzero_si128();
+ __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C0_hi_8x16, C_8x16, C_hi_8x16;
+ __m128i q0_16x8, q1_16x8, q2_16x8, p0_16x8, p1_16x8, p2_16x8;
+ __m128i temp1, temp2;
+ __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8;
+ __m128i in_macro_16x8, in_macro_hi_16x8;
+ __m128i const_val4_8x16;
+ UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
+ UWORD8 clip0, clip1, clip2, clip3;
+
+ /* All 16 columns of the horizontal edge are filtered in one shot.
+ pu1_src points at row q0; pu1_HorzPixel at 4 rows above (p3). */
+ pu1_HorzPixel = pu1_src - (src_strd << 2);
+
+ /* Row offsets: q-side relative to pu1_src, p-side relative to
+ pu1_HorzPixel (so p0 is at +3*stride, p2 at +1*stride). */
+ i16_posQ1 = src_strd;
+ i16_posQ2 = X2(src_strd);
+ i16_posP0 = X3(src_strd);
+ i16_posP1 = X2(src_strd);
+ i16_posP2 = src_strd;
+
+ q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
+ q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));
+
+ /* Unpack the four Bs bytes (MSB first) and their tc0 clip values. */
+ u1_Bs0 = (u4_bs >> 24) & 0xff;
+ u1_Bs1 = (u4_bs >> 16) & 0xff;
+ u1_Bs2 = (u4_bs >> 8) & 0xff;
+ u1_Bs3 = (u4_bs >> 0) & 0xff;
+ clip0 = pu1_cliptab[u1_Bs0];
+ clip1 = pu1_cliptab[u1_Bs1];
+ clip2 = pu1_cliptab[u1_Bs2];
+ clip3 = pu1_cliptab[u1_Bs3];
+
+ Alpha_8x16 = _mm_set1_epi16(alpha);
+ Beta_8x16 = _mm_set1_epi16(beta);
+
+ /* Each Bs / clip value covers 4 adjacent columns: Bs0 the lowest
+ 4 bytes (first 4 pixels), Bs3 the highest 4. */
+ bs_flag_16x8b = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
+ u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
+ u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
+
+ C0_16x8 = _mm_set_epi8(clip3, clip3, clip3, clip3, clip2, clip2, clip2,
+ clip2, clip1, clip1, clip1, clip1, clip0, clip0,
+ clip0, clip0);
+
+ /* Mask of pixels with Bs != 0 (only those are filtered). */
+ bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero);
+ bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
+ /* tc0 widened to 16-bit lanes, low and high 8 pixels separately. */
+ C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero);
+ C0_hi_8x16 = _mm_unpackhi_epi8(C0_16x8, zero);
+
+ p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
+ p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
+ p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
+ q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));
+
+ //Cond1 (ABS(p0 - q0) < alpha)
+ /* |p0 - q0| via two saturating unsigned subtractions (one is 0). */
+ temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
+ temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+
+ temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
+
+ flag1_16x8 = _mm_packs_epi16(temp2, temp1);
+ flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b);
+
+ //Cond2 (ABS(q1 - q0) < beta)
+ temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
+ temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+
+ temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
+
+ flag2_16x8 = _mm_packs_epi16(temp2, temp1);
+
+ flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
+
+ //Cond3 (ABS(p1 - p0) < beta)
+ temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
+ temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+
+ temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
+
+ flag2_16x8 = _mm_packs_epi16(temp2, temp1);
+
+ // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
+ /* flag1 = Bs!=0 && cond1 && cond2 && cond3: the per-pixel filter gate. */
+ flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
+
+ // (ABS(p2 - p0) < beta)
+ temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
+ temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+ temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
+
+ /* flag2 = filter gate && (Ap < beta): controls the p1 update below. */
+ flag2_16x8 = _mm_packs_epi16(temp2, temp1);
+ flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
+
+ /* C = C0 + (Ap < beta): negate the 0xFFFF cmp mask to get +1. */
+ temp2 = _mm_subs_epi16(zero, temp2);
+ temp1 = _mm_subs_epi16(zero, temp1);
+
+ C_8x16 = _mm_add_epi16(C0_8x16, temp2);
+ C_hi_8x16 = _mm_add_epi16(C0_hi_8x16, temp1);
+
+ // (ABS(q2 - q0) < beta)
+ temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
+ temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+ temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
+
+ /* flag3 = filter gate && (Aq < beta): controls the q1 update below. */
+ flag3_16x8 = _mm_packs_epi16(temp2, temp1);
+ flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8);
+
+ /* C += (Aq < beta), completing C = C0 + (Ap<beta) + (Aq<beta). */
+ temp2 = _mm_subs_epi16(zero, temp2);
+ temp1 = _mm_subs_epi16(zero, temp1);
+
+ C_8x16 = _mm_add_epi16(C_8x16, temp2);
+ C_hi_8x16 = _mm_add_epi16(C_hi_8x16, temp1);
+
+ /* delta = ((q0 - p0) << 2 + (p1 - q1) + 4) >> 3, low 8 pixels... */
+ const_val4_8x16 = _mm_set1_epi16(4);
+ temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero),
+ _mm_unpacklo_epi8(p0_16x8, zero));
+ temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero),
+ _mm_unpacklo_epi8(q1_16x8, zero));
+ temp1 = _mm_slli_epi16(temp1, 2);
+ temp1 = _mm_add_epi16(temp1, temp2);
+ temp1 = _mm_add_epi16(temp1, const_val4_8x16);
+ in_macro_16x8 = _mm_srai_epi16(temp1, 3);
+
+ /* ...and high 8 pixels. */
+ temp1 = _mm_subs_epi16(_mm_unpackhi_epi8(q0_16x8, zero),
+ _mm_unpackhi_epi8(p0_16x8, zero));
+ temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p1_16x8, zero),
+ _mm_unpackhi_epi8(q1_16x8, zero));
+ temp1 = _mm_slli_epi16(temp1, 2);
+ temp1 = _mm_add_epi16(temp1, temp2);
+ temp1 = _mm_add_epi16(temp1, const_val4_8x16);
+ in_macro_hi_16x8 = _mm_srai_epi16(temp1, 3);
+
+ /* Clamp delta to [-C, C]: min with C, then max with -C. */
+ in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3
+ in_macro_hi_16x8 = _mm_min_epi16(C_hi_8x16, in_macro_hi_16x8); //CLIP3
+ C_8x16 = _mm_subs_epi16(zero, C_8x16);
+ C_hi_8x16 = _mm_subs_epi16(zero, C_hi_8x16);
+ in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3
+ in_macro_hi_16x8 = _mm_max_epi16(C_hi_8x16, in_macro_hi_16x8); //CLIP3
+
+ /* p0' = p0 + delta, blended with original p0 under flag1, stored. */
+ temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8);
+ temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p0_16x8, zero), in_macro_hi_16x8);
+
+ temp1 = _mm_packus_epi16(temp1, temp2);
+
+ temp1 = _mm_and_si128(temp1, flag1_16x8);
+ temp2 = _mm_and_si128(p0_16x8,
+ _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
+
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), temp1);
+
+ /* q0' = q0 - delta, same blend and store. */
+ temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8);
+ temp2 = _mm_sub_epi16(_mm_unpackhi_epi8(q0_16x8, zero), in_macro_hi_16x8);
+
+ temp1 = _mm_packus_epi16(temp1, temp2);
+
+ temp1 = _mm_and_si128(temp1, flag1_16x8);
+ temp2 = _mm_and_si128(q0_16x8,
+ _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
+
+ temp1 = _mm_add_epi8(temp1, temp2);
+ _mm_storeu_si128((__m128i *)(pu1_src), temp1);
+
+ //if(Ap < Beta)
+ /* p1 delta = (p2 + ((p0 + q0 + 1) >> 1) - 2*p1) >> 1
+ (avg_epu16 computes the rounded average). */
+ temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
+ _mm_unpacklo_epi8(p0_16x8, zero));
+ temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1);
+ //temp2 = _mm_subs_epi16(zero,temp2);
+ temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2);
+ temp2 = _mm_add_epi16(temp1, temp2);
+ in_macro_16x8 = _mm_srai_epi16(temp2, 1);
+
+ temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero),
+ _mm_unpackhi_epi8(p0_16x8, zero));
+ temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(p1_16x8, zero), 1);
+ //temp2 = _mm_subs_epi16(zero,temp2);
+ temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p2_16x8, zero), temp2);
+ temp2 = _mm_add_epi16(temp1, temp2);
+ in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1);
+
+ /* Clip p1 delta to [-C0, C0]; C0 registers are left NEGATED here. */
+ in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
+ in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
+ C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
+ C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16);
+ in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
+ in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
+
+ temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8);
+ temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p1_16x8, zero), in_macro_hi_16x8);
+
+ temp1 = _mm_packus_epi16(temp1, temp2);
+
+ /* Blend p1' under flag2 (Ap < beta) and store. */
+ temp1 = _mm_and_si128(temp1, flag2_16x8);
+ temp2 = _mm_and_si128(p1_16x8,
+ _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF)));
+ temp1 = _mm_add_epi8(temp1, temp2);
+ _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), temp1);
+
+ //if(Aq < Beta)
+ /* q1 delta = (q2 + ((p0 + q0 + 1) >> 1) - 2*q1) >> 1. */
+ temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
+ _mm_unpacklo_epi8(p0_16x8, zero));
+ temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1);
+ //temp2 = _mm_slli_epi16 (temp2, 1);
+ temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2);
+ temp2 = _mm_add_epi16(temp1, temp2);
+ in_macro_16x8 = _mm_srai_epi16(temp2, 1);
+
+ temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero),
+ _mm_unpackhi_epi8(p0_16x8, zero));
+ temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(q1_16x8, zero), 1);
+ //temp2 = _mm_slli_epi16 (temp2, 1);
+ temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(q2_16x8, zero), temp2);
+ temp2 = _mm_add_epi16(temp1, temp2);
+ in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1);
+
+ /* C0 registers entered this section negated (-C0), so the clip
+ order is inverted: max with -C0 first, re-negate to +C0, then min. */
+ in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
+ in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
+ C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
+ C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16);
+ in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
+ in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
+
+ temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8);
+ temp2 = _mm_add_epi16(_mm_unpackhi_epi8(q1_16x8, zero), in_macro_hi_16x8);
+
+ temp1 = _mm_packus_epi16(temp1, temp2);
+
+ /* Blend q1' under flag3 (Aq < beta) and store. */
+ temp1 = _mm_and_si128(temp1, flag3_16x8);
+ temp2 = _mm_and_si128(q1_16x8,
+ _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF)));
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), temp1);
+
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_luma_vert_bs4_mbaff_ssse3() */
+/* */
+/* Description : This function performs filtering of a luma block */
+/* vertical edge when boundary strength is set to 4. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 */
+/* src_strd - source stride */
+/* alpha - alpha value for the boundary */
+/* beta - beta value for the boundary */
+/* */
+/* Globals : None */
+/* */
+/* Processing : When the function is called twice, this operation is as */
+/* described in Sec. 8.7.2.3 under the title "Filtering */
+/* process for edges for bS equal to 4" in ITU T Rec H.264. */
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 12 02 2015 Naveen Kumar P Initial version */
+/* */
+/*****************************************************************************/
+void ih264_deblk_luma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha,
+ WORD32 beta)
+{
+ /* MBAFF variant of the bS=4 vertical luma edge filter: processes only */
+ /* 8 rows (lines 0..7) instead of 16. pu1_src points at sample q0 of */
+ /* row 0; p3..p0 live at pu1_src-4 .. pu1_src-1 on each row. */
+ __m128i zero = _mm_setzero_si128();
+ __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
+ __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
+ __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
+ __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
+ __m128i q0_16x8_1;
+ __m128i p0_16x8_1;
+ __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
+ __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
+ __m128i temp1, temp2, temp3, temp4, temp5, temp6;
+ __m128i Alpha_8x16, Beta_8x16;
+ __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
+ __m128i const_val2_16x8 = _mm_set1_epi16(2);
+ __m128i line1, line2, line3, line4, line5, line6, line7, line8;
+
+ Alpha_8x16 = _mm_set1_epi16(alpha);
+ Beta_8x16 = _mm_set1_epi16(beta);
+
+ // Load 8 rows of 8 bytes straddling the edge: p3 p2 p1 p0 | q0 q1 q2 q3
+ line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
+ line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
+ line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
+ line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
+ line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
+ line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
+ line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
+ line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
+
+ // 8x8 byte transpose (rows -> columns) so that each register gathers one
+ // sample position (p3..q3) across all 8 rows
+ temp1 = _mm_unpacklo_epi8(line1, line2);
+ temp2 = _mm_unpacklo_epi8(line3, line4);
+ temp3 = _mm_unpacklo_epi8(line5, line6);
+ temp4 = _mm_unpacklo_epi8(line7, line8);
+
+ line1 = _mm_unpacklo_epi16(temp1, temp2);
+ line2 = _mm_unpackhi_epi16(temp1, temp2);
+ line3 = _mm_unpacklo_epi16(temp3, temp4);
+ line4 = _mm_unpackhi_epi16(temp3, temp4);
+
+ p1_8x16 = _mm_unpacklo_epi32(line1, line3);
+ p0_8x16 = _mm_unpackhi_epi32(line1, line3);
+ q0_8x16 = _mm_unpacklo_epi32(line2, line4);
+ q1_8x16 = _mm_unpackhi_epi32(line2, line4);
+
+ // Only the low 8 bytes of each of these vectors carry valid pixels
+ p3_16x8 = _mm_unpacklo_epi64(p1_8x16, zero);
+ p2_16x8 = _mm_unpackhi_epi64(p1_8x16, zero);
+ q2_16x8 = _mm_unpacklo_epi64(q1_8x16, zero);
+ q3_16x8 = _mm_unpackhi_epi64(q1_8x16, zero);
+ p1_16x8 = _mm_unpacklo_epi64(p0_8x16, zero);
+ p0_16x8 = _mm_unpackhi_epi64(p0_8x16, zero);
+ q0_16x8 = _mm_unpacklo_epi64(q0_8x16, zero);
+ q1_16x8 = _mm_unpackhi_epi64(q0_8x16, zero);
+
+ // ABS(a - b) is built as subs_epu8(a,b) + subs_epu8(b,a) (one term is 0),
+ // then widened to 16 bit so the signed compare against alpha/beta is safe
+ //Cond1 (ABS(p0 - q0) < alpha)
+ temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
+ temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+
+ temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
+
+ flag1_16x8 = _mm_packs_epi16(temp2, temp1);
+
+ //Cond2 (ABS(q1 - q0) < beta)
+ temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
+ temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+
+ temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
+
+ flag2_16x8 = _mm_packs_epi16(temp2, temp1);
+
+ flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
+
+ //Cond3 (ABS(p1 - p0) < beta)
+ temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
+ temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+
+ temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
+
+ flag2_16x8 = _mm_packs_epi16(temp2, temp1);
+
+ // flag1 = (ABS(p0 - q0) < alpha) && (ABS(q1 - q0) < beta) && (ABS(p1 - p0) < beta)
+ flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
+
+ // Strong-filter eligibility: (ABS(p0 - q0) < ((alpha >> 2) + 2))
+ temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
+ temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+ Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
+ Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+ temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
+
+ flag2_16x8 = _mm_packs_epi16(temp2, temp1);
+ flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
+
+ // (ABS(p2 - p0) < beta) -> flag3 selects the strong p-side filter
+ temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
+ temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+ temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
+
+ flag3_16x8 = _mm_packs_epi16(temp2, temp1);
+ flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
+
+ // (ABS(q2 - q0) < beta) -> flag4 selects the strong q-side filter
+ temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
+ temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp1 = _mm_unpackhi_epi8(temp1, zero);
+ temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
+ temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
+
+ flag4_16x8 = _mm_packs_epi16(temp2, temp1);
+ flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
+
+ // First 8 pixels
+ // Widen the 8 valid bytes of each vector to 16 bit for filter arithmetic
+ p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
+ p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
+ p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
+ p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
+ q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
+ q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
+ q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
+ q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
+
+ // p0_1 = (2*p1 + p0 + q1 + 2) >> 2 ; q0_1 = (2*q1 + q0 + p1 + 2) >> 2
+ temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
+ temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
+ temp5 = _mm_add_epi16(temp1, const_val2_16x8);
+ temp6 = _mm_add_epi16(temp2, const_val2_16x8);
+ temp3 = _mm_slli_epi16(p1_8x16, 1);
+ temp4 = _mm_slli_epi16(q1_8x16, 1);
+ temp1 = _mm_add_epi16(temp5, temp3);
+ temp2 = _mm_add_epi16(temp6, temp4);
+ p0_16x8_1 = _mm_srai_epi16(temp1, 2);
+ q0_16x8_1 = _mm_srai_epi16(temp2, 2);
+
+ // p1_2 = (p2 + p1 + p0 + q0 + 2) >> 2 ; q1_2 = (q2 + q1 + q0 + p0 + 2) >> 2
+ temp6 = _mm_add_epi16(temp6, p0_8x16);
+ temp5 = _mm_add_epi16(temp5, q0_8x16);
+ temp1 = _mm_add_epi16(temp6, p2_8x16);
+ temp2 = _mm_add_epi16(temp5, q2_8x16);
+ p1_16x8_2 = _mm_srai_epi16(temp1, 2);
+ q1_16x8_2 = _mm_srai_epi16(temp2, 2);
+
+ // p0_2 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3 (and mirrored for q0_2)
+ temp1 = _mm_add_epi16(temp3, p2_8x16);
+ temp2 = _mm_add_epi16(temp4, q2_8x16);
+ temp1 = _mm_add_epi16(temp1, q1_8x16);
+ temp2 = _mm_add_epi16(temp2, p1_8x16);
+ temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
+ temp3 = _mm_slli_epi16(temp3, 1);
+ temp1 = _mm_add_epi16(temp1, temp3);
+ temp2 = _mm_add_epi16(temp2, temp3);
+ temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
+ temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
+ p0_16x8_2 = _mm_srai_epi16(temp1, 3);
+ q0_16x8_2 = _mm_srai_epi16(temp2, 3);
+
+ // p2_2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3 (and mirrored for q2_2)
+ temp1 = _mm_add_epi16(temp6, const_val2_16x8);
+ temp2 = _mm_add_epi16(temp5, const_val2_16x8);
+ temp3 = _mm_slli_epi16(p2_8x16, 1);
+ temp4 = _mm_slli_epi16(q2_8x16, 1);
+ temp3 = _mm_add_epi16(p2_8x16, temp3);
+ temp4 = _mm_add_epi16(q2_8x16, temp4);
+ temp5 = _mm_slli_epi16(p3_8x16, 1);
+ temp6 = _mm_slli_epi16(q3_8x16, 1);
+ temp1 = _mm_add_epi16(temp1, temp3);
+ temp2 = _mm_add_epi16(temp2, temp4);
+ temp1 = _mm_add_epi16(temp1, temp5);
+ temp2 = _mm_add_epi16(temp2, temp6);
+ p2_16x8_2 = _mm_srai_epi16(temp1, 3);
+ q2_16x8_2 = _mm_srai_epi16(temp2, 3);
+
+ // Narrow all filtered values back to unsigned bytes (saturating)
+ // p0_1 and q0_1
+ p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, zero);
+ q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, zero);
+
+ // p1_2 and q1_2
+ p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, zero);
+ q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, zero);
+
+ // p0_2 and q0_2
+ p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, zero);
+ q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, zero);
+
+ // p2_2 and q2_2
+ p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, zero);
+ q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, zero);
+
+ // Blend: XOR with 0xFF is bitwise NOT of the flag; keep the original
+ // where the flag is clear, the filtered value where it is set.
+ // p0 and q0 (basic 3-tap result where flag1 holds)
+ p0_16x8 = _mm_and_si128(p0_16x8,
+ _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
+ p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
+ p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
+ q0_16x8 = _mm_and_si128(q0_16x8,
+ _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
+ q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
+ q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
+
+ // p0 and q0 (override with strong 5-tap result where flag3/flag4 hold)
+ p0_16x8 = _mm_and_si128(p0_16x8,
+ _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
+ p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
+ p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
+ q0_16x8 = _mm_and_si128(q0_16x8,
+ _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
+ q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
+ q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
+
+ // p1 and q1
+ p1_16x8 = _mm_and_si128(p1_16x8,
+ _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
+ p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
+ p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
+ q1_16x8 = _mm_and_si128(q1_16x8,
+ _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
+ q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
+ q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
+
+ // p2 and q2
+ p2_16x8 = _mm_and_si128(p2_16x8,
+ _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
+ p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
+ p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
+ q2_16x8 = _mm_and_si128(q2_16x8,
+ _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
+ q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
+ q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
+
+ // Transpose the filtered columns back to row order and store the 8 rows
+ temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
+ temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
+ temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
+ temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
+
+ p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
+ p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
+ q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
+ q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
+
+ line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
+ line2 = _mm_srli_si128(line1, 8);
+ line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
+ line4 = _mm_srli_si128(line3, 8);
+ line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
+ line6 = _mm_srli_si128(line5, 8);
+ line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
+ line8 = _mm_srli_si128(line7, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
+
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_deblk_luma_vert_bslt4_mbaff_ssse3() */
+/* */
+/* Description : This function performs filtering of a luma block */
+/* vertical edge when boundary strength is less than 4. */
+/* */
+/* Inputs : pu1_src - pointer to the src sample q0 */
+/* src_strd - source stride */
+/* alpha - alpha value for the boundary */
+/* beta - beta value for the boundary */
+/* u4_bs - packed Boundary strength array */
+/* pu1_cliptab - tc0_table */
+/* */
+/* Globals : None */
+/* */
+/* Processing : When the function is called twice, this operation is as */
+/* described in Sec. 8.7.2.3 under the title "Filtering */
+/* process for edges for bS less than 4" in ITU T Rec H.264.*/
+/* */
+/* Outputs : None */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 12 02 2015 Naveen Kumar P Initial version */
+/* */
+/*****************************************************************************/
+void ih264_deblk_luma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 alpha,
+ WORD32 beta,
+ UWORD32 u4_bs,
+ const UWORD8 *pu1_cliptab)
+{
+ /* MBAFF variant of the bS<4 vertical luma edge filter (8 rows). */
+ /* u4_bs packs four boundary-strength values, MSB first; pu1_cliptab */
+ /* maps each bS to its tc0 clipping threshold. */
+ __m128i zero = _mm_setzero_si128();
+ __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C_8x16;
+ __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
+ __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
+ __m128i temp1, temp2, temp3, temp4;
+ __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8;
+ __m128i in_macro_16x8;
+ __m128i const_val4_8x16;
+ UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
+ UWORD8 clip0, clip1, clip2, clip3;
+ __m128i line1, line2, line3, line4, line5, line6, line7, line8;
+ __m128i q0_16x8_1, q1_16x8_1, q0_16x8_2;
+ __m128i p0_16x8_1, p1_16x8_1, p0_16x8_2;
+
+ // Load 8 rows of 8 bytes straddling the edge: p3 p2 p1 p0 | q0 q1 q2 q3
+ line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
+ line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
+ line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
+ line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
+ line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
+ line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
+ line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
+ line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
+
+ // 8x8 byte transpose (rows -> columns) so that each register gathers one
+ // sample position (p3..q3) across all 8 rows
+ temp1 = _mm_unpacklo_epi8(line1, line2);
+ temp2 = _mm_unpacklo_epi8(line3, line4);
+ temp3 = _mm_unpacklo_epi8(line5, line6);
+ temp4 = _mm_unpacklo_epi8(line7, line8);
+
+ line1 = _mm_unpacklo_epi16(temp1, temp2);
+ line2 = _mm_unpackhi_epi16(temp1, temp2);
+ line3 = _mm_unpacklo_epi16(temp3, temp4);
+ line4 = _mm_unpackhi_epi16(temp3, temp4);
+
+ temp1 = _mm_unpacklo_epi32(line1, line3);
+ temp2 = _mm_unpackhi_epi32(line1, line3);
+ temp3 = _mm_unpacklo_epi32(line2, line4);
+ temp4 = _mm_unpackhi_epi32(line2, line4);
+
+ p3_16x8 = _mm_unpacklo_epi64(temp1, zero);
+ p2_16x8 = _mm_unpackhi_epi64(temp1, zero);
+ q2_16x8 = _mm_unpacklo_epi64(temp4, zero);
+ q3_16x8 = _mm_unpackhi_epi64(temp4, zero);
+ p1_16x8 = _mm_unpacklo_epi64(temp2, zero);
+ p0_16x8 = _mm_unpackhi_epi64(temp2, zero);
+ q0_16x8 = _mm_unpacklo_epi64(temp3, zero);
+ q1_16x8 = _mm_unpackhi_epi64(temp3, zero);
+
+ // Unpack the four packed bS values (MSB = first rows) and look up tc0
+ u1_Bs0 = (u4_bs >> 24) & 0xff;
+ u1_Bs1 = (u4_bs >> 16) & 0xff;
+ u1_Bs2 = (u4_bs >> 8) & 0xff;
+ u1_Bs3 = (u4_bs >> 0) & 0xff;
+ clip0 = pu1_cliptab[u1_Bs0];
+ clip1 = pu1_cliptab[u1_Bs1];
+ clip2 = pu1_cliptab[u1_Bs2];
+ clip3 = pu1_cliptab[u1_Bs3];
+
+ Alpha_8x16 = _mm_set1_epi16(alpha);
+ Beta_8x16 = _mm_set1_epi16(beta);
+
+ // Duplicate each bS / clip value so every pair of rows shares it
+ bs_flag_16x8b = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
+ u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);
+
+ C0_16x8 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, clip3, clip3, clip2, clip2,
+ clip1, clip1, clip0, clip0);
+
+ bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero);
+ bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
+ C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero);
+
+ //Cond1 (ABS(p0 - q0) < alpha)
+ temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
+ temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
+
+ flag1_16x8 = _mm_packs_epi16(temp2, zero);
+ flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b);
+
+ //Cond2 (ABS(q1 - q0) < beta)
+ temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
+ temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
+
+ flag2_16x8 = _mm_packs_epi16(temp2, zero);
+ flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
+
+ //Cond3 (ABS(p1 - p0) < beta)
+ temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
+ temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
+
+ flag2_16x8 = _mm_packs_epi16(temp2, zero);
+
+ // flag1 = bS!=0 && (ABS(p0-q0) < alpha) && (ABS(q1-q0) < beta) && (ABS(p1-p0) < beta)
+ flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
+
+ // Ap = (ABS(p2 - p0) < beta) -> flag2 enables the p1 adjustment
+ temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
+ temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
+
+ flag2_16x8 = _mm_packs_epi16(temp2, zero);
+ flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
+
+ // temp2 lanes are 0 / -1; subtracting from zero turns -1 into +1, so
+ // C = C0 + (Ap < beta) + (Aq < beta)
+ temp2 = _mm_subs_epi16(zero, temp2);
+
+ C_8x16 = _mm_add_epi16(C0_8x16, temp2);
+
+ // Aq = (ABS(q2 - q0) < beta) -> flag3 enables the q1 adjustment
+ temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
+ temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
+ temp1 = _mm_add_epi8(temp1, temp2);
+
+ temp2 = _mm_unpacklo_epi8(temp1, zero);
+ temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
+
+ flag3_16x8 = _mm_packs_epi16(temp2, zero);
+ flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8);
+
+ temp2 = _mm_subs_epi16(zero, temp2);
+
+ C_8x16 = _mm_add_epi16(C_8x16, temp2);
+
+ // delta = CLIP3(-C, C, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
+ const_val4_8x16 = _mm_set1_epi16(4);
+ temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero),
+ _mm_unpacklo_epi8(p0_16x8, zero));
+ temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero),
+ _mm_unpacklo_epi8(q1_16x8, zero));
+ temp1 = _mm_slli_epi16(temp1, 2);
+ temp1 = _mm_add_epi16(temp1, temp2);
+ temp1 = _mm_add_epi16(temp1, const_val4_8x16);
+ in_macro_16x8 = _mm_srai_epi16(temp1, 3);
+
+ in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3
+ C_8x16 = _mm_subs_epi16(zero, C_8x16);
+ in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3
+
+ // p0 = p0 + delta (where flag1 set; blended below via mask AND + add)
+ temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8);
+
+ temp1 = _mm_packus_epi16(temp1, zero);
+
+ p0_16x8_1 = _mm_and_si128(temp1, flag1_16x8)
+ p0_16x8_2 = _mm_and_si128(
+ p0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
+
+ p0_16x8_1 = _mm_add_epi8(p0_16x8_1, p0_16x8_2);
+
+ // q0 = q0 - delta
+ temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8);
+
+ temp1 = _mm_packus_epi16(temp1, zero);
+
+ q0_16x8_1 = _mm_and_si128(temp1, flag1_16x8);
+ q0_16x8_2 = _mm_and_si128(
+ q0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
+
+ q0_16x8_1 = _mm_add_epi8(q0_16x8_1, q0_16x8_2);
+
+ //if(Ap < Beta)
+ // p1 += CLIP3(-C0, C0, (p2 + ((p0 + q0 + 1) >> 1) - 2*p1) >> 1);
+ // avg_epu16 supplies the rounded average (p0 + q0 + 1) >> 1
+ temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
+ _mm_unpacklo_epi8(p0_16x8, zero));
+ temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1);
+ //temp2 = _mm_subs_epi16(zero,temp2);
+ temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2);
+ temp2 = _mm_add_epi16(temp1, temp2);
+ in_macro_16x8 = _mm_srai_epi16(temp2, 1);
+
+ in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
+ C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
+ in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
+
+ // p1
+ temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8);
+
+ temp1 = _mm_packus_epi16(temp1, zero);
+
+ p1_16x8_1 = _mm_and_si128(temp1, flag2_16x8);
+ p1_16x8 = _mm_and_si128(p1_16x8,
+ _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF)));
+ p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_1);
+
+ //if(Aq < Beta)
+ // q1 += CLIP3(-C0, C0, (q2 + ((p0 + q0 + 1) >> 1) - 2*q1) >> 1);
+ temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
+ _mm_unpacklo_epi8(p0_16x8, zero));
+ temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1);
+ //temp2 = _mm_slli_epi16 (temp2, 1);
+ temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2);
+ temp2 = _mm_add_epi16(temp1, temp2);
+ in_macro_16x8 = _mm_srai_epi16(temp2, 1);
+
+ // C0_8x16 still holds -C0 from the p1 clip above, hence max first,
+ // then re-negate to +C0 for the min
+ in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
+ C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
+ in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
+
+ temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8);
+
+ // q1
+ temp1 = _mm_packus_epi16(temp1, zero);
+
+ q1_16x8_1 = _mm_and_si128(temp1, flag3_16x8);
+ q1_16x8 = _mm_and_si128(q1_16x8,
+ _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF)));
+ q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_1);
+
+ // Transpose the filtered columns back to row order and store the 8 rows
+ temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
+ temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8_1);
+ temp3 = _mm_unpacklo_epi8(q0_16x8_1, q1_16x8);
+ temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
+
+ line7 = _mm_unpacklo_epi16(temp1, temp2);
+ temp1 = _mm_unpackhi_epi16(temp1, temp2);
+ line8 = _mm_unpacklo_epi16(temp3, temp4);
+ temp2 = _mm_unpackhi_epi16(temp3, temp4);
+
+ line1 = _mm_unpacklo_epi32(line7, line8);
+ line2 = _mm_srli_si128(line1, 8);
+ line3 = _mm_unpackhi_epi32(line7, line8);
+ line4 = _mm_srli_si128(line3, 8);
+ line5 = _mm_unpacklo_epi32(temp1, temp2);
+ line6 = _mm_srli_si128(line5, 8);
+ line7 = _mm_unpackhi_epi32(temp1, temp2);
+ line8 = _mm_srli_si128(line7, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
+ _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
+}
+
diff --git a/common/x86/ih264_ihadamard_scaling_sse42.c b/common/x86/ih264_ihadamard_scaling_sse42.c
new file mode 100755
index 0000000..895291b
--- /dev/null
+++ b/common/x86/ih264_ihadamard_scaling_sse42.c
@@ -0,0 +1,238 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_ihadamard_scaling_sse42.c
+ *
+ * @brief
+ * Contains definition of functions for h264 inverse hadamard 4x4 transform and scaling
+ *
+ * @author
+ * Mohit
+ *
+ * @par List of Functions:
+ * - ih264_ihadamard_scaling_4x4_sse42()
+ * - ih264_ihadamard_scaling_2x2_uv_sse42()
+ *
+ * @remarks
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* User include files */
+#include "ih264_typedefs.h"
+#include "ih264_defs.h"
+#include "ih264_trans_macros.h"
+#include "ih264_macros.h"
+#include "ih264_trans_data.h"
+#include "ih264_size_defs.h"
+#include "ih264_structs.h"
+#include "ih264_trans_quant_itrans_iquant.h"
+#include <immintrin.h>
+
+/*
+ ********************************************************************************
+ *
+ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
+ * of a 16x16 intra prediction macroblock, and then performs scaling.
+ *
+ * @par Description:
+ * The DC coefficients pass through a 2-stage inverse hadamard transform.
+ * This inverse transformed content is scaled based on the Qp value.
+ *
+ * @param[in] pi2_src
+ * input 4x4 block of DC coefficients
+ *
+ * @param[out] pi2_out
+ * output 4x4 block
+ *
+ * @param[in] pu2_iscal_mat
+ * pointer to scaling list
+ *
+ * @param[in] pu2_weigh_mat
+ * pointer to weight matrix
+ *
+ * @param[in] u4_qp_div_6
+ * Floor (qp/6)
+ *
+ * @param[in] pi4_tmp
+ * temporary buffer of size 1*16
+ *
+ * @returns none
+ *
+ * @remarks none
+ *
+ *******************************************************************************
+ */
+void ih264_ihadamard_scaling_4x4_sse42(WORD16* pi2_src, WORD16* pi2_out,
+ const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
+ UWORD32 u4_qp_div_6, WORD32* pi4_tmp) {
+ /* Inverse 4x4 Hadamard transform of the luma DC block, followed by */
+ /* inverse-quantization scaling with iscal[0] * weight[0]. */
+ /* NOTE(review): pi4_tmp is accepted for interface parity with the C */
+ /* reference but is never used in this SIMD version. */
+ __m128i src_r0_r1, src_r2_r3;
+ __m128i src_r0, src_r1, src_r2, src_r3;
+ __m128i temp0, temp1, temp2, temp3;
+ // Rounding term for the (6 - qp/6) right shift on the low-QP path.
+ // NOTE(review): when u4_qp_div_6 > 5 the unsigned expression
+ // (5 - u4_qp_div_6) underflows and this shift is undefined behavior,
+ // even though add_rshift is unused on that path; computing it inside
+ // the else branch below would be safer -- confirm intended QP range.
+ __m128i add_rshift = _mm_set1_epi32((1 << (5 - u4_qp_div_6)));
+ __m128i mult_val = _mm_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]);
+
+ // Load the 4x4 block of DC coefficients and widen each row to 32 bit
+ src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
+ src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
+ //sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1);
+ src_r0 = _mm_cvtepi16_epi32(src_r0_r1);
+ src_r0_r1 = _mm_srli_si128(src_r0_r1, 8);
+ src_r1 = _mm_cvtepi16_epi32(src_r0_r1);
+
+ src_r2 = _mm_cvtepi16_epi32(src_r2_r3);
+ src_r2_r3 = _mm_srli_si128(src_r2_r3, 8);
+ src_r3 = _mm_cvtepi16_epi32(src_r2_r3);
+
+ /* Perform Inverse transform */
+ /*-------------------------------------------------------------*/
+ /* IDCT [ Horizontal transformation ] */
+ /*-------------------------------------------------------------*/
+ // Matrix transpose
+ /*
+ * a0 a1 a2 a3
+ * b0 b1 b2 b3
+ * c0 c1 c2 c3
+ * d0 d1 d2 d3
+ */
+ temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 b0 a1 b1
+ temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //c0 d0 c1 d1
+ temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //a2 b2 a3 b3
+ temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 d2 c3 d3
+ src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 b0 c0 d0
+ src_r1 = _mm_unpackhi_epi64(temp0, temp2); //a1 b1 c1 d1
+ src_r2 = _mm_unpacklo_epi64(temp1, temp3); //a2 b2 c2 d2
+ src_r3 = _mm_unpackhi_epi64(temp1, temp3); //a3 b3 c3 d3
+
+ // Hadamard butterfly: x0=(r0+r3)+(r1+r2), x1=(r1-r2)+(r0-r3),
+ // x2=(r0+r3)-(r1+r2), x3=(r0-r3)-(r1-r2)
+ temp0 = _mm_add_epi32(src_r0, src_r3);
+ temp1 = _mm_add_epi32(src_r1, src_r2);
+ temp2 = _mm_sub_epi32(src_r1, src_r2);
+ temp3 = _mm_sub_epi32(src_r0, src_r3);
+
+ src_r0 = _mm_add_epi32(temp0, temp1);
+ src_r1 = _mm_add_epi32(temp2, temp3);
+ src_r2 = _mm_sub_epi32(temp0, temp1);
+ src_r3 = _mm_sub_epi32(temp3, temp2);
+
+ /*-------------------------------------------------------------*/
+ /* IDCT [ Vertical transformation ] */
+ /*-------------------------------------------------------------*/
+ // Matrix transpose
+ /*
+ * a0 b0 c0 d0
+ * a1 b1 c1 d1
+ * a2 b2 c2 d2
+ * a3 b3 c3 d3
+ */
+ temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 a1 b0 b1
+ temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //a2 a3 b2 b3
+ temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //c0 c1 d0 d1
+ temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 c3 d2 d3
+ src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 a1 a2 a3
+ src_r1 = _mm_unpackhi_epi64(temp0, temp2); //b0 b1 b2 b3
+ src_r2 = _mm_unpacklo_epi64(temp1, temp3); //c0 c1 c2 c3
+ src_r3 = _mm_unpackhi_epi64(temp1, temp3); //d0 d1 d2 d3
+
+ // Second (vertical) Hadamard butterfly, same structure as above
+ temp0 = _mm_add_epi32(src_r0, src_r3);
+ temp1 = _mm_add_epi32(src_r1, src_r2);
+ temp2 = _mm_sub_epi32(src_r1, src_r2);
+ temp3 = _mm_sub_epi32(src_r0, src_r3);
+
+ src_r0 = _mm_add_epi32(temp0, temp1);
+ src_r1 = _mm_add_epi32(temp2, temp3);
+ src_r2 = _mm_sub_epi32(temp0, temp1);
+ src_r3 = _mm_sub_epi32(temp3, temp2);
+
+ // Multiply every coefficient by the DC scale (iscal[0] * weight[0])
+ src_r0 = _mm_mullo_epi32(src_r0, mult_val);
+ src_r1 = _mm_mullo_epi32(src_r1, mult_val);
+ src_r2 = _mm_mullo_epi32(src_r2, mult_val);
+ src_r3 = _mm_mullo_epi32(src_r3, mult_val);
+
+ //Scaling: left shift for qp/6 >= 6, else rounded right shift by 6 - qp/6
+ if (u4_qp_div_6 >= 6) {
+ src_r0 = _mm_slli_epi32(src_r0, u4_qp_div_6 - 6);
+ src_r1 = _mm_slli_epi32(src_r1, u4_qp_div_6 - 6);
+ src_r2 = _mm_slli_epi32(src_r2, u4_qp_div_6 - 6);
+ src_r3 = _mm_slli_epi32(src_r3, u4_qp_div_6 - 6);
+ } else {
+ temp0 = _mm_add_epi32(src_r0, add_rshift);
+ temp1 = _mm_add_epi32(src_r1, add_rshift);
+ temp2 = _mm_add_epi32(src_r2, add_rshift);
+ temp3 = _mm_add_epi32(src_r3, add_rshift);
+ src_r0 = _mm_srai_epi32(temp0, 6 - u4_qp_div_6);
+ src_r1 = _mm_srai_epi32(temp1, 6 - u4_qp_div_6);
+ src_r2 = _mm_srai_epi32(temp2, 6 - u4_qp_div_6);
+ src_r3 = _mm_srai_epi32(temp3, 6 - u4_qp_div_6);
+ }
+ // Saturate back to 16 bit and store the 4x4 result
+ src_r0_r1 = _mm_packs_epi32(src_r0, src_r1);
+ src_r2_r3 = _mm_packs_epi32(src_r2, src_r3);
+
+ _mm_storeu_si128((__m128i *) (&pi2_out[0]), src_r0_r1);
+ _mm_storeu_si128((__m128i *) (&pi2_out[8]), src_r2_r3);
+}
+
+void ih264_ihadamard_scaling_2x2_uv_sse42(WORD16* pi2_src,
+ WORD16* pi2_out,
+ const UWORD16 *pu2_iscal_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 u4_qp_div_6,
+ WORD32* pi4_tmp)
+{
+ /* Inverse 2x2 Hadamard of two chroma DC blocks in one shot: plane 0 */
+ /* occupies the low 4 coefficients of pi2_src, plane 1 the high 4. */
+ /* Scaling is (val * iscal[0] * weight[0]) << (qp/6), then >> 5 with */
+ /* no rounding term. */
+ UNUSED(pi4_tmp);
+ __m128i src, plane_0, plane_1, temp0, temp1, sign_reg;
+ __m128i zero_8x16b = _mm_setzero_si128();
+ __m128i scale_val = _mm_set1_epi32((WORD32)(pu2_iscal_mat[0] * pu2_weigh_mat[0]));
+ src = _mm_loadu_si128((__m128i *) pi2_src); //a0 a1 a2 a3 b0 b1 b2 b3
+ // Sign-extend the 16-bit coefficients to 32 bit via a computed sign mask
+ sign_reg = _mm_cmpgt_epi16(zero_8x16b, src);
+ plane_0 = _mm_unpacklo_epi16(src, sign_reg); //a0 a1 a2 a3 -- 32 bits
+ plane_1 = _mm_unpackhi_epi16(src, sign_reg); //b0 b1 b2 b3 -- 32 bits
+
+ // 2x2 Hadamard butterfly realized with horizontal add/sub pairs
+ temp0 = _mm_hadd_epi32(plane_0, plane_1); //a0+a1 a2+a3 b0+b1 b2+b3
+ temp1 = _mm_hsub_epi32(plane_0, plane_1); //a0-a1 a2-a3 b0-b1 b2-b3
+ plane_0 = _mm_hadd_epi32(temp0, temp1); //a0+a1+a2+a3 b0+b1+b2+b3 a0-a1+a2-a3 b0-b1+b2-b3
+ plane_1 = _mm_hsub_epi32(temp0, temp1); //a0+a1-a2-a3 b0+b1-b2-b3 a0-a1-a2+a3 b0-b1-b2+b3
+ temp0 = _mm_unpacklo_epi32(plane_0, plane_1); //a0+a1+a2+a3 a0+a1-a2-a3 b0+b1+b2+b3 b0+b1-b2-b3
+ temp1 = _mm_unpackhi_epi32(plane_0, plane_1); //a0-a1+a2-a3 a0-a1-a2+a3 b0-b1+b2-b3 b0-b1-b2+b3
+
+ plane_0 = _mm_unpacklo_epi64(temp0, temp1); //a0+a1+a2+a3 a0+a1-a2-a3 a0-a1+a2-a3 a0-a1-a2+a3
+ plane_1 = _mm_unpackhi_epi64(temp0, temp1); //b0+b1+b2+b3 b0+b1-b2-b3 b0-b1+b2-b3 b0-b1-b2+b3
+
+ // Reorder each plane into raster order of the 2x2 block
+ plane_0 = _mm_shuffle_epi32(plane_0, 0xd8); //a0+a1+a2+a3 a0-a1+a2-a3 a0+a1-a2-a3 a0-a1-a2+a3
+ plane_1 = _mm_shuffle_epi32(plane_1, 0xd8); //b0+b1+b2+b3 b0-b1+b2-b3 b0+b1-b2-b3 b0-b1-b2+b3
+
+ temp0 = _mm_mullo_epi32(scale_val, plane_0); //multiply by pu2_iscal_mat[0] * pu2_weigh_mat[0]
+ temp1 = _mm_mullo_epi32(scale_val, plane_1); //multiply by pu2_iscal_mat[0] * pu2_weigh_mat[0]
+
+ // << (qp/6) then arithmetic >> 5; no rounding term is added
+ temp0 = _mm_slli_epi32(temp0, u4_qp_div_6);
+ temp1 = _mm_slli_epi32(temp1, u4_qp_div_6);
+
+ temp0 = _mm_srai_epi32(temp0, 5);
+ temp1 = _mm_srai_epi32(temp1, 5);
+
+ temp0 = _mm_packs_epi32(temp0, temp1); //Final values are 16-bits only.
+
+ _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
+
+}
diff --git a/common/x86/ih264_ihadamard_scaling_ssse3.c b/common/x86/ih264_ihadamard_scaling_ssse3.c
new file mode 100755
index 0000000..232d9fa
--- /dev/null
+++ b/common/x86/ih264_ihadamard_scaling_ssse3.c
@@ -0,0 +1,200 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_ihadamard_scaling_ssse3.c
+ *
+ * @brief
+ * Contains definition of functions for h264 inverse hadamard 4x4 transform and scaling
+ *
+ * @author
+ * Mohit
+ *
+ * @par List of Functions:
+ * - ih264_ihadamard_scaling_4x4_ssse3()
+ *
+ * @remarks
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* User include files */
+#include "ih264_typedefs.h"
+#include "ih264_defs.h"
+#include "ih264_trans_macros.h"
+#include "ih264_macros.h"
+#include "ih264_trans_data.h"
+#include "ih264_size_defs.h"
+#include "ih264_structs.h"
+#include "ih264_trans_quant_itrans_iquant.h"
+#include <immintrin.h>
+
+/*
+ ********************************************************************************
+ *
+ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
+ * of a 16x16 intra prediction macroblock, and then performs scaling.
+ *
+ * @par Description:
+ * The DC coefficients pass through a 2-stage inverse hadamard transform.
+ * This inverse transformed content is then scaled based on the Qp value.
+ *
+ * @param[in] pi2_src
+ * input 4x4 block of DC coefficients
+ *
+ * @param[out] pi2_out
+ * output 4x4 block
+ *
+ * @param[in] pu2_iscal_mat
+ * pointer to scaling list
+ *
+ * @param[in] pu2_weigh_mat
+ * pointer to weight matrix
+ *
+ * @param[in] u4_qp_div_6
+ * Floor (qp/6)
+ *
+ * @param[in] pi4_tmp
+ * temporary buffer of size 1*16
+ *
+ * @returns none
+ *
+ * @remarks none
+ *
+ *******************************************************************************
+ */
+void ih264_ihadamard_scaling_4x4_ssse3(WORD16* pi2_src, WORD16* pi2_out,
+        const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
+        UWORD32 u4_qp_div_6, WORD32* pi4_tmp) {
+    int val = 0xFFFF;
+    __m128i src_r0_r1, src_r2_r3, sign_reg, zero_8x16b = _mm_setzero_si128();
+    __m128i src_r0, src_r1, src_r2, src_r3;
+    __m128i temp0, temp1, temp2, temp3;
+    /* Rounding term for the (6 - u4_qp_div_6) down-shift used below.
+       For u4_qp_div_6 >= 6 the expression (5 - u4_qp_div_6) wraps around
+       as UWORD32 and the shift would be undefined behaviour, even though
+       that branch never reads the value; compute it only when valid. */
+    __m128i add_rshift = _mm_set1_epi32(
+            (u4_qp_div_6 < 6) ? (1 << (5 - u4_qp_div_6)) : 0);
+    __m128i mult_val = _mm_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]);
+
+    /* Keep only the low 16 bits of the scale factor in each 32-bit lane so
+       that _mm_madd_epi16 sees a clean 16-bit multiplier. */
+    __m128i mask = _mm_set1_epi32(val);
+
+    UNUSED(pi4_tmp); /* scratch buffer is not needed by the SIMD path */
+
+    mult_val = _mm_and_si128(mult_val, mask);
+
+    src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
+    src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
+    /* Sign-extend the 16-bit DC coefficients to 32 bits */
+    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1);
+    src_r0 = _mm_unpacklo_epi16(src_r0_r1, sign_reg);
+    src_r1 = _mm_unpackhi_epi16(src_r0_r1, sign_reg);
+    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r2_r3);
+    src_r2 = _mm_unpacklo_epi16(src_r2_r3, sign_reg);
+    src_r3 = _mm_unpackhi_epi16(src_r2_r3, sign_reg);
+
+    /* Perform Inverse transform */
+    /*-------------------------------------------------------------*/
+    /* IDCT [ Horizontal transformation ]                          */
+    /*-------------------------------------------------------------*/
+    // Matrix transpose
+    /*
+     *  a0 a1 a2 a3
+     *  b0 b1 b2 b3
+     *  c0 c1 c2 c3
+     *  d0 d1 d2 d3
+     */
+    temp0 = _mm_unpacklo_epi32(src_r0, src_r1);                  //a0 b0 a1 b1
+    temp2 = _mm_unpacklo_epi32(src_r2, src_r3);                  //c0 d0 c1 d1
+    temp1 = _mm_unpackhi_epi32(src_r0, src_r1);                  //a2 b2 a3 b3
+    temp3 = _mm_unpackhi_epi32(src_r2, src_r3);                  //c2 d2 c3 d3
+    src_r0 = _mm_unpacklo_epi64(temp0, temp2);                   //a0 b0 c0 d0
+    src_r1 = _mm_unpackhi_epi64(temp0, temp2);                   //a1 b1 c1 d1
+    src_r2 = _mm_unpacklo_epi64(temp1, temp3);                   //a2 b2 c2 d2
+    src_r3 = _mm_unpackhi_epi64(temp1, temp3);                   //a3 b3 c3 d3
+
+    /* Hadamard butterfly: all-plus / inner-minus combinations */
+    temp0 = _mm_add_epi32(src_r0, src_r3);
+    temp1 = _mm_add_epi32(src_r1, src_r2);
+    temp2 = _mm_sub_epi32(src_r1, src_r2);
+    temp3 = _mm_sub_epi32(src_r0, src_r3);
+
+    src_r0 = _mm_add_epi32(temp0, temp1);
+    src_r1 = _mm_add_epi32(temp2, temp3);
+    src_r2 = _mm_sub_epi32(temp0, temp1);
+    src_r3 = _mm_sub_epi32(temp3, temp2);
+
+    /*-------------------------------------------------------------*/
+    /* IDCT [ Vertical transformation ]                            */
+    /*-------------------------------------------------------------*/
+    // Matrix transpose
+    /*
+     *  a0 b0 c0 d0
+     *  a1 b1 c1 d1
+     *  a2 b2 c2 d2
+     *  a3 b3 c3 d3
+     */
+    temp0 = _mm_unpacklo_epi32(src_r0, src_r1);                  //a0 a1 b0 b1
+    temp2 = _mm_unpacklo_epi32(src_r2, src_r3);                  //a2 a3 b2 b3
+    temp1 = _mm_unpackhi_epi32(src_r0, src_r1);                  //c0 c1 d0 d1
+    temp3 = _mm_unpackhi_epi32(src_r2, src_r3);                  //c2 c3 d2 d3
+    src_r0 = _mm_unpacklo_epi64(temp0, temp2);                   //a0 a1 a2 a3
+    src_r1 = _mm_unpackhi_epi64(temp0, temp2);                   //b0 b1 b2 b3
+    src_r2 = _mm_unpacklo_epi64(temp1, temp3);                   //c0 c1 c2 c3
+    src_r3 = _mm_unpackhi_epi64(temp1, temp3);                   //d0 d1 d2 d3
+
+    temp0 = _mm_add_epi32(src_r0, src_r3);
+    temp1 = _mm_add_epi32(src_r1, src_r2);
+    temp2 = _mm_sub_epi32(src_r1, src_r2);
+    temp3 = _mm_sub_epi32(src_r0, src_r3);
+
+    src_r0 = _mm_add_epi32(temp0, temp1);
+    src_r1 = _mm_add_epi32(temp2, temp3);
+    src_r2 = _mm_sub_epi32(temp0, temp1);
+    src_r3 = _mm_sub_epi32(temp3, temp2);
+
+    /* Multiply every 32-bit result by the (16-bit) scale via madd */
+    src_r0 = _mm_and_si128(src_r0, mask);
+    src_r1 = _mm_and_si128(src_r1, mask);
+    src_r2 = _mm_and_si128(src_r2, mask);
+    src_r3 = _mm_and_si128(src_r3, mask);
+
+    src_r0 = _mm_madd_epi16(src_r0, mult_val);
+    src_r1 = _mm_madd_epi16(src_r1, mult_val);
+    src_r2 = _mm_madd_epi16(src_r2, mult_val);
+    src_r3 = _mm_madd_epi16(src_r3, mult_val);
+
+    //Scaling: left-shift for qp/6 >= 6, rounded right-shift otherwise
+    if (u4_qp_div_6 >= 6) {
+        src_r0 = _mm_slli_epi32(src_r0, u4_qp_div_6 - 6);
+        src_r1 = _mm_slli_epi32(src_r1, u4_qp_div_6 - 6);
+        src_r2 = _mm_slli_epi32(src_r2, u4_qp_div_6 - 6);
+        src_r3 = _mm_slli_epi32(src_r3, u4_qp_div_6 - 6);
+    } else {
+        temp0 = _mm_add_epi32(src_r0, add_rshift);
+        temp1 = _mm_add_epi32(src_r1, add_rshift);
+        temp2 = _mm_add_epi32(src_r2, add_rshift);
+        temp3 = _mm_add_epi32(src_r3, add_rshift);
+        src_r0 = _mm_srai_epi32(temp0, 6 - u4_qp_div_6);
+        src_r1 = _mm_srai_epi32(temp1, 6 - u4_qp_div_6);
+        src_r2 = _mm_srai_epi32(temp2, 6 - u4_qp_div_6);
+        src_r3 = _mm_srai_epi32(temp3, 6 - u4_qp_div_6);
+    }
+    /* Saturating pack back to 16 bits and store the 4x4 output block */
+    src_r0_r1 = _mm_packs_epi32(src_r0, src_r1);
+    src_r2_r3 = _mm_packs_epi32(src_r2, src_r3);
+
+    _mm_storeu_si128((__m128i *) (&pi2_out[0]), src_r0_r1);
+    _mm_storeu_si128((__m128i *) (&pi2_out[8]), src_r2_r3);
+}
diff --git a/common/x86/ih264_inter_pred_filters_ssse3.c b/common/x86/ih264_inter_pred_filters_ssse3.c
new file mode 100755
index 0000000..64e364e
--- /dev/null
+++ b/common/x86/ih264_inter_pred_filters_ssse3.c
@@ -0,0 +1,4375 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/*****************************************************************************/
+/* */
+/* File Name : ih264_inter_pred_filters_intr_ssse3.c */
+/* */
+/*  Description           : Contains function definitions for inter         */
+/*                          prediction (motion compensation) filter         */
+/*                          functions in x86 ssse3 intrinsics               */
+/* */
+/* List of Functions : ih264_inter_pred_luma_copy_ssse3() */
+/* ih264_inter_pred_luma_horz_ssse3() */
+/* ih264_inter_pred_luma_vert_ssse3() */
+/* ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3() */
+/* ih264_inter_pred_luma_horz_qpel_ssse3() */
+/* ih264_inter_pred_luma_vert_qpel_ssse3() */
+/* ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3() */
+/* ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3() */
+/* ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3() */
+/* ih264_inter_pred_chroma_ssse3() */
+/* */
+/* Issues / Problems : None */
+/* */
+/* Revision History : */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 13 02 2015 Kaushik Initial version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+#include <immintrin.h>
+#include "ih264_typedefs.h"
+#include "ih264_macros.h"
+#include "ih264_platform_macros.h"
+#include "ih264_inter_pred_filters.h"
+
+/*****************************************************************************/
+/* Constant Data variables */
+/*****************************************************************************/
+
+/* coefficients for 6 tap filtering*/
+//const WORD32 ih264_g_six_tap[3] ={1,-5,20};
+/*****************************************************************************/
+/* Function definitions . */
+/*****************************************************************************/
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_inter_pred_luma_copy_ssse3 */
+/* */
+/* Description : This function copies the contents of ht x wd block from */
+/* source to destination. (ht,wd) can be (4,4), (8,4), */
+/* (4,8), (8,8), (16,8), (8,16) or (16,16). */
+/* */
+/* Inputs : puc_src - pointer to source */
+/* puc_dst - pointer to destination */
+/* src_strd - stride for source */
+/* dst_strd - stride for destination */
+/* ht - height of the block */
+/* wd - width of the block */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 13 02 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_inter_pred_luma_copy_ssse3(UWORD8 *pu1_src,
+                                      UWORD8 *pu1_dst,
+                                      WORD32 src_strd,
+                                      WORD32 dst_strd,
+                                      WORD32 ht,
+                                      WORD32 wd,
+                                      UWORD8* pu1_tmp,
+                                      WORD32 dydx)
+{
+    __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;
+
+    /* All declarations precede the first statement so that the file also
+       builds with compilers enforcing C89/C90 declaration placement. */
+    WORD32 src_strd2, src_strd3, src_strd4, dst_strd2, dst_strd3, dst_strd4;
+
+    UNUSED(pu1_tmp); /* no scratch buffer is needed for a plain copy     */
+    UNUSED(dydx);    /* full-pel copy; the sub-pel offset is irrelevant  */
+
+    /* Precomputed stride multiples for row addressing inside the loops */
+    src_strd2 = src_strd << 1;
+    dst_strd2 = dst_strd << 1;
+    src_strd4 = src_strd << 2;
+    dst_strd4 = dst_strd << 2;
+    src_strd3 = src_strd2 + src_strd;
+    dst_strd3 = dst_strd2 + dst_strd;
+
+    if(wd == 4)
+    {
+        /* 4 rows per iteration; only the low 4 bytes of each row are
+           written, via a masked store. */
+        __m128i mask_full_128b, mask_low_32b;
+
+        mask_full_128b = _mm_set1_epi8(0xff);
+        mask_low_32b = _mm_srli_si128(mask_full_128b, 12);
+        // mask for first four bytes
+
+        do
+        {
+            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd2));
+            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd3));
+
+            _mm_maskmoveu_si128(y_0_16x8b, mask_low_32b, (char*)pu1_dst);
+            _mm_maskmoveu_si128(y_1_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
+            _mm_maskmoveu_si128(y_2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
+            _mm_maskmoveu_si128(y_3_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+
+            ht -= 4;
+            pu1_src += src_strd4;
+            pu1_dst += dst_strd4;
+        }
+        while(ht > 0);
+    }
+    else if(wd == 8)
+    {
+        /* 4 rows per iteration; 8-byte loads and stores per row */
+        do
+        {
+            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd2));
+            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd3));
+
+            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
+            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
+            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd2), y_2_16x8b);
+            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd3), y_3_16x8b);
+
+            ht -= 4;
+            pu1_src += src_strd4;
+            pu1_dst += dst_strd4;
+        }
+        while(ht > 0);
+    }
+    else // wd == 16
+    {
+        /* 8 rows per iteration; full 16-byte (unaligned) transfers */
+        WORD32 src_strd5, src_strd6, src_strd7, src_strd8;
+        WORD32 dst_strd5, dst_strd6, dst_strd7, dst_strd8;
+
+        __m128i y_4_16x8b, y_5_16x8b, y_6_16x8b, y_7_16x8b;
+
+        src_strd5 = src_strd2 + src_strd3;
+        dst_strd5 = dst_strd2 + dst_strd3;
+        src_strd6 = src_strd3 << 1;
+        dst_strd6 = dst_strd3 << 1;
+        src_strd7 = src_strd3 + src_strd4;
+        dst_strd7 = dst_strd3 + dst_strd4;
+        src_strd8 = src_strd << 3;
+        dst_strd8 = dst_strd << 3;
+
+        do
+        {
+            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
+            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd2));
+            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd3));
+            y_4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd4));
+            y_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd5));
+            y_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd6));
+            y_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd7));
+
+            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), y_2_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), y_3_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd4), y_4_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd5), y_5_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd6), y_6_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd7), y_7_16x8b);
+
+            ht -= 8;
+            pu1_src += src_strd8;
+            pu1_dst += dst_strd8;
+        }
+        while(ht > 0);
+    }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_inter_pred_luma_horz_ssse3 */
+/* */
+/* Description : This function applies a horizontal 6-tap filter on */
+/* ht x wd block as mentioned in sec. 8.4.2.2.1 titled */
+/* "Luma sample interpolation process". (ht,wd) can be */
+/* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */
+/* */
+/* Inputs : puc_src - pointer to source */
+/* puc_dst - pointer to destination */
+/* src_strd - stride for source */
+/* dst_strd - stride for destination */
+/* ht - height of the block */
+/* wd - width of the block */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 13 02 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_inter_pred_luma_horz_ssse3(UWORD8 *pu1_src,
+                                      UWORD8 *pu1_dst,
+                                      WORD32 src_strd,
+                                      WORD32 dst_strd,
+                                      WORD32 ht,
+                                      WORD32 wd,
+                                      UWORD8* pu1_tmp,
+                                      WORD32 dydx)
+{
+    /* Strategy: the 6-tap coefficients (1,-5,20,20,-5,1) are packed as three
+       signed byte pairs for _mm_maddubs_epi16; each lane accumulates
+       src[i]*c(2k) + src[i+1]*c(2k+1). The three partial sums plus the
+       rounding constant 16 are added and arithmetically shifted right by 5,
+       then saturated to 8 bits with packus. */
+    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
+    __m128i const_val16_8x16b;
+
+    UNUSED(pu1_tmp);
+    UNUSED(dydx);
+
+    pu1_src -= 2; // the filter input starts from x[-2] (till x[3])
+
+    coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
+    coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
+    coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
+                                                  //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
+    const_val16_8x16b = _mm_set1_epi16(16);
+
+    if(wd == 4)
+    {
+        __m128i src_r0_16x8b, src_r1_16x8b, src_r0r1_16x8b;
+        __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
+
+        __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
+        __m128i res_r0r1_16x8b;
+
+        __m128i mask_full_16x8b, mask_low32b;
+
+        mask_full_16x8b = _mm_set1_epi8(0xff);
+        mask_low32b = _mm_srli_si128(mask_full_16x8b, 12); // mask for first four bytes
+
+        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
+        //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
+
+        do
+        {
+            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                   //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
+            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));      //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
+
+            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                   //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
+            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                   //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
+
+            src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
+            src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
+
+            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);      //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
+            res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
+                                                                                   //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
+
+            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                       //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0
+            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                       //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0
+
+            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);      //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
+            res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
+                                                                                   //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
+
+            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                       //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0
+            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                       //b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0
+
+            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);      //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
+            res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
+                                                                                   //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
+
+            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
+            res_r0r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0r1_t3_8x16b);
+            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b); //a0*c0+a1*c1+a2*c2+a3*c3+a4*c4+a5*c5 + 16;
+                                                                                     //a1*c0+a2*c1+a3*c2+a4*c3+a5*c4+a6*c5 + 16;
+                                                                                     //a2*c0+a3*c1+a4*c2+a5*c3+a6*c4+a7*c5 + 16;
+                                                                                     //a3*c0+a4*c1+a5*c2+a6*c3+a7*c4+a8*c5 + 16;
+                                                                                     //b0*c0+b1*c1+b2*c2+b3*c3+b4*c4+b5*c5 + 16;
+                                                                                     //b1*c0+b2*c1+b3*c2+b4*c3+b5*c4+b6*c5 + 16;
+                                                                                     //b2*c0+b3*c1+b4*c2+b5*c3+b6*c4+b7*c5 + 16;
+                                                                                     //b3*c0+b4*c1+b5*c2+b6*c3+b7*c4+b8*c5 + 16;
+
+            res_r0r1_t1_8x16b = _mm_srai_epi16(res_r0r1_t1_8x16b, 5);             //shifting right by 5 bits.
+
+            res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b, res_r0r1_t1_8x16b);
+
+            _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)pu1_dst);
+            res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);
+            _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));
+
+            ht -= 2;
+            pu1_src += src_strd << 1;
+            pu1_dst += dst_strd << 1;
+        }
+        while(ht > 0);
+    }
+    else if(wd == 8)
+    {
+        __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
+        __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
+
+        __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
+        __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
+
+        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
+        //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
+
+        do
+        {
+            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                   //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
+            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));      //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
+
+            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                   //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
+            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                   //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
+
+            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
+            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
+
+            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
+                                                                                  //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
+            res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
+                                                                                  //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
+
+            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                       //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
+            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                       //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
+
+            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);               //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
+            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);               //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0
+
+            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
+            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10
+
+            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
+                                                                                  //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
+            res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
+                                                                                  //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
+
+            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                       //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
+            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                       //b4 b5 b6 b7 b8 b9....b15 0 0 0 0
+
+            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);               //a5 a6 a7 a8 a9....a15 0 0 0 0 0
+            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);               //b5 b6 b7 b8 b9....b15 0 0 0 0 0
+
+            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
+            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
+
+            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
+                                                                                  //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
+            res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
+                                                                                  //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
+            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
+            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
+            res_r0_t3_8x16b = _mm_add_epi16(res_r0_t3_8x16b, const_val16_8x16b);
+            res_r1_t3_8x16b = _mm_add_epi16(res_r1_t3_8x16b, const_val16_8x16b);
+            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
+            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
+
+            res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);                 //shifting right by 5 bits.
+            res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);
+
+            src_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r0_t1_8x16b);
+            src_r1_16x8b = _mm_packus_epi16(res_r1_t1_8x16b, res_r1_t1_8x16b);
+
+            _mm_storel_epi64((__m128i *)pu1_dst, src_r0_16x8b);
+            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_r1_16x8b);
+
+            ht -= 2;
+            pu1_src += src_strd << 1;
+            pu1_dst += dst_strd << 1;
+        }
+        while(ht > 0);
+    }
+    else // wd == 16
+    {
+        __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
+        __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
+
+        __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
+        __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
+
+        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
+        //Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
+        //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels.
+        //(The "b" row is the same picture row, loaded at an 8-pixel offset.)
+
+        do
+        {
+            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                   //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
+            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));             //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
+
+            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                   //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
+            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                   //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
+
+            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
+            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
+
+            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
+                                                                                  //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
+            res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
+                                                                                  //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
+
+            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                       //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
+            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                       //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
+
+            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);               //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
+            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);               //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0
+
+            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
+            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10
+
+            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
+                                                                                  //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
+            res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
+                                                                                  //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
+
+            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                       //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
+            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                       //b4 b5 b6 b7 b8 b9....b15 0 0 0 0
+
+            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);               //a5 a6 a7 a8 a9....a15 0 0 0 0 0
+            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);               //b5 b6 b7 b8 b9....b15 0 0 0 0 0
+
+            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
+            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
+
+            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
+                                                                                  //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
+            res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
+                                                                                  //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
+            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
+            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
+            res_r0_t3_8x16b = _mm_add_epi16(res_r0_t3_8x16b, const_val16_8x16b);
+            res_r1_t3_8x16b = _mm_add_epi16(res_r1_t3_8x16b, const_val16_8x16b);
+            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
+            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
+
+            res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);                 //shifting right by 5 bits.
+            res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);
+
+            src_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b);
+            _mm_storeu_si128((__m128i *)pu1_dst, src_r0_16x8b);
+
+            ht--;
+            pu1_src += src_strd;
+            pu1_dst += dst_strd;
+        }
+        while(ht > 0);
+    }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_inter_pred_luma_vert_ssse3 */
+/* */
+/* Description : This function applies a vertical 6-tap filter on */
+/* ht x wd block as mentioned in sec. 8.4.2.2.1 titled */
+/* "Luma sample interpolation process". (ht,wd) can be */
+/* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */
+/* */
+/* Inputs : puc_src - pointer to source */
+/* puc_dst - pointer to destination */
+/* src_strd - stride for source */
+/* dst_strd - stride for destination */
+/* ht - height of the block */
+/* wd - width of the block */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 13 02 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_inter_pred_luma_vert_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ht,
+ WORD32 wd,
+ UWORD8* pu1_tmp,
+ WORD32 dydx)
+{
+ __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
+ __m128i src_r5_16x8b, src_r6_16x8b;
+ __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
+
+ __m128i res_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
+
+ __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
+ __m128i const_val16_8x16b;
+
+ UNUSED(pu1_tmp);
+ UNUSED(dydx);
+
+ coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
+ coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
+ coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
+ //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
+ const_val16_8x16b = _mm_set1_epi16(16);
+
+ pu1_src -= src_strd << 1; // the filter input starts from x[-2] (till x[3])
+
+ if(wd == 4)
+ {
+ __m128i mask_low32b;
+
+ mask_low32b = _mm_set1_epi8(0xff);
+
+ //Epilogue: Load all the pred rows except sixth and seventh row
+ // for the first and second row processing.
+ src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ pu1_src += src_strd;
+
+ mask_low32b = _mm_srli_si128(mask_low32b, 12); // mask for first four bytes
+
+ src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b);
+ src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b);
+ src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b);
+ src_r3_16x8b = _mm_unpacklo_epi32(src_r3_16x8b, src_r4_16x8b);
+
+ do
+ {
+ src_r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+
+ src_r4_16x8b = _mm_unpacklo_epi32(src_r4_16x8b, src_r5_16x8b);
+ src_r5_16x8b = _mm_unpacklo_epi32(src_r5_16x8b, src_r6_16x8b);
+
+ src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
+ src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
+ src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+
+ res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+ res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
+
+ _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)pu1_dst);
+ res_16x8b = _mm_srli_si128(res_16x8b, 4);
+ _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));
+
+ src_r0_16x8b = src_r2_16x8b;
+ src_r1_16x8b = src_r3_16x8b;
+ src_r2_16x8b = src_r4_16x8b;
+ src_r3_16x8b = src_r5_16x8b;
+ src_r4_16x8b = src_r6_16x8b;
+
+ ht -= 2;
+ pu1_src += src_strd << 1;
+ pu1_dst += dst_strd << 1;
+ }
+ while(ht > 0);
+ }
+
+ else if(wd == 8)
+ {
+ //Epilogue: Load all the pred rows except sixth and seventh row
+ // for the first and second row processing.
+ src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ pu1_src += src_strd;
+
+ src_r0_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);
+ src_r1_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b);
+ src_r2_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b);
+ src_r3_16x8b = _mm_unpacklo_epi64(src_r3_16x8b, src_r4_16x8b);
+
+ do
+ {
+ src_r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+ src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+
+ src_r4_16x8b = _mm_unpacklo_epi64(src_r4_16x8b, src_r5_16x8b);
+ src_r5_16x8b = _mm_unpacklo_epi64(src_r5_16x8b, src_r6_16x8b);
+
+ src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
+ src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
+ src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+
+ res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+ res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
+
+ _mm_storel_epi64((__m128i *)pu1_dst, res_16x8b);
+
+ src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
+ src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
+ src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+
+ res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+ res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);
+
+ src_r0_16x8b = src_r2_16x8b;
+ src_r1_16x8b = src_r3_16x8b;
+ src_r2_16x8b = src_r4_16x8b;
+ src_r3_16x8b = src_r5_16x8b;
+ src_r4_16x8b = src_r6_16x8b;
+
+ ht -= 2;
+ pu1_src += src_strd << 1;
+ pu1_dst += dst_strd << 1;
+ }
+ while(ht > 0);
+ }
+ else // wd == 16
+ {
+ __m128i res_t0_8x16b;
+
+ //Epilogue: Load all the pred rows except sixth and seventh row
+ // for the first and second row processing.
+ src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r3_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ pu1_src += src_strd;
+ src_r4_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ pu1_src += src_strd;
+
+ do
+ {
+ src_r5_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
+
+ src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
+ src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
+ src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+ res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+ src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
+ src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
+ src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+ res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+ res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);
+
+ src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
+ src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
+ src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+ res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+ src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b);
+ src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b);
+ src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+ res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+ res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res_16x8b);
+
+ src_r0_16x8b = src_r2_16x8b;
+ src_r1_16x8b = src_r3_16x8b;
+ src_r2_16x8b = src_r4_16x8b;
+ src_r3_16x8b = src_r5_16x8b;
+ src_r4_16x8b = src_r6_16x8b;
+
+ ht -= 2;
+ pu1_src += src_strd << 1;
+ pu1_dst += dst_strd << 1;
+ }
+ while(ht > 0);
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3 */
+/* */
+/* Description : This function implements a two stage cascaded six tap */
+/* filter, horizontally and then vertically on ht x wd */
+/* block as mentioned in sec. 8.4.2.2.1 titled "Luma sample */
+/* interpolation process". (ht,wd) can be (4,4), (8,4), */
+/* (4,8), (8,8), (16,8), (8,16) or (16,16). */
+/* */
+/*  Inputs            : pu1_src - pointer to source                          */
+/*                      pu1_dst - pointer to destination                     */
+/* src_strd - stride for source */
+/* dst_strd - stride for destination */
+/* ht - height of the block */
+/* wd - width of the block */
+/* pu1_tmp - pointer to temporary buffer */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 13 02 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ht,
+ WORD32 wd,
+ UWORD8* pu1_tmp,
+ WORD32 dydx)
+{
+ UNUSED(dydx);
+
+ if(wd == 4)
+ {
+ WORD16 *pi2_temp;
+
+ pu1_tmp += 4;
+ pu1_src -= src_strd << 1;
+ pi2_temp = (WORD16 *)pu1_tmp;
+ pu1_src -= 2; // the filter input starts from x[-2] (till x[3])
+
+ // Horizontal 6-tap filtering
+ {
+ WORD32 ht_tmp = ht + 4;
+
+ __m128i src_r0_16x8b, src_r1_16x8b;
+ __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
+ __m128i src_r0r1_t1_16x8b;
+ __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
+ __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
+
+ coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
+ coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
+        coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
+ //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
+ //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
+ //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
+
+ do
+ {
+ src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
+ src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
+
+ src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
+ src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
+
+ src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
+ src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
+
+ src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
+ res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
+ //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
+
+ src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0
+ src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0
+
+ src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
+ res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
+ //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
+
+ src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0
+ src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0
+
+ src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
+ res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
+                                                                            //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
+ res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
+ res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t3_8x16b, res_r0r1_t1_8x16b);
+
+ _mm_storeu_si128((__m128i *)pi2_temp, res_r0r1_t1_8x16b);
+
+ ht_tmp -= 2;
+ pu1_src += src_strd << 1;
+ pi2_temp += 8;
+ }
+ while(ht_tmp > 0);
+
+ src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
+ src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
+
+ src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
+ res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
+
+ src_r0_16x8b = _mm_srli_si128(src_r0_16x8b,4); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0
+ res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
+
+ src_r0_16x8b = _mm_srli_si128(src_r0_16x8b,4); //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0
+ res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
+
+ res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
+ res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t3_8x16b, res_r0r1_t1_8x16b);
+
+ _mm_storel_epi64((__m128i *)pi2_temp, res_r0r1_t1_8x16b);
+ }
+
+ pi2_temp = (WORD16 *)pu1_tmp;
+
+ // Vertical 6-tap filtering
+ {
+ __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b,
+ src_r4_8x16b;
+ __m128i src_r5_8x16b, src_r6_8x16b;
+ __m128i src_t1_8x16b, src_t2_8x16b;
+
+ __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
+ __m128i res_8x16b, res_16x8b;
+
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
+ __m128i const_val512_4x32b, mask_low32b;
+
+ mask_low32b = _mm_set1_epi8(0xff);
+
+ coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
+ coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
+ coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
+
+ mask_low32b = _mm_srli_si128(mask_low32b, 12);
+ const_val512_4x32b = _mm_set1_epi32(512);
+
+ src_r0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp));
+ src_r1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 4));
+ src_r2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 8));
+ src_r3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 12));
+ src_r4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 16));
+ pi2_temp += 20;
+
+ do
+ {
+ src_r5_8x16b = _mm_loadl_epi64((__m128i *)pi2_temp);
+ src_r6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 4));
+
+ src_r0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_t1_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_t2_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_t1_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_t2_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+
+ res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ src_r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
+ src_t1_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
+ src_t2_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_t1_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_t2_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+
+ res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
+ res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
+
+ _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)pu1_dst);
+ res_16x8b = _mm_srli_si128(res_16x8b, 4);
+ _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));
+
+ src_r0_8x16b = src_r2_8x16b;
+ src_r1_8x16b = src_r3_8x16b;
+ src_r2_8x16b = src_r4_8x16b;
+ src_r3_8x16b = src_r5_8x16b;
+ src_r4_8x16b = src_r6_8x16b;
+
+ ht -= 2;
+ pi2_temp += 8;
+ pu1_dst += dst_strd << 1;
+ }
+ while(ht > 0);
+ }
+ }
+ else if(wd == 8)
+ {
+ WORD16 *pi2_temp;
+
+ pu1_tmp += 4;
+ pu1_src -= src_strd << 1;
+ pi2_temp = (WORD16 *)pu1_tmp;
+ pu1_src -= 2; // the filter input starts from x[-2] (till x[3])
+
+ // Horizontal 6-tap filtering
+ {
+ WORD32 ht_tmp = ht + 4;
+
+ __m128i src_r0_16x8b, src_r1_16x8b;
+ __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
+ __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
+ __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
+ __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
+ __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
+
+ coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
+ coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
+        coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
+ //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
+ //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
+ //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
+
+ do
+ {
+ src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15
+ src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15
+
+ src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
+ src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
+
+ src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
+ src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
+
+ res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
+ //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
+ res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
+ //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
+
+ src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
+ src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
+
+ src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
+ src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0
+
+ src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
+ src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10
+
+ res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
+ //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
+ res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3
+ //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
+
+ src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
+ src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0
+
+ src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0
+ src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0
+
+ src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
+ src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
+
+ res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
+ //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
+ res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
+ //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
+ res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
+ res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
+
+ res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
+ res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
+
+ _mm_storeu_si128((__m128i *)pi2_temp, res_r0_t1_8x16b);
+ _mm_storeu_si128((__m128i *)(pi2_temp + 8), res_r1_t1_8x16b);
+
+ ht_tmp -= 2;
+ pu1_src += src_strd << 1;
+ pi2_temp += 16;
+ }
+ while(ht_tmp > 0);
+
+ src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15
+ src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
+
+ src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b,src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
+ res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b,coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
+ //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
+
+ src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
+ src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
+
+ src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
+ res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
+ //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
+
+ src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
+ src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0
+
+ src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
+ res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
+ //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
+ res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
+ res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
+
+ _mm_storeu_si128((__m128i *)pi2_temp, res_r0_t1_8x16b);
+ }
+
+ pi2_temp = (WORD16 *)pu1_tmp;
+
+ // Vertical 6-tap filtering
+ {
+ __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b,
+ src_r4_8x16b;
+ __m128i src_r5_8x16b, src_r6_8x16b;
+ __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
+
+ __m128i res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
+ __m128i res_c0_4x32b, res_c1_4x32b;
+ __m128i res_8x16b, res_16x8b;
+
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
+ __m128i const_val512_4x32b;
+
+ coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
+ coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
+ coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
+
+ const_val512_4x32b = _mm_set1_epi32(512);
+
+ src_r0_8x16b = _mm_loadu_si128((__m128i *)pi2_temp);
+ src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 8));
+ src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 16));
+ src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 24));
+ src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 32));
+ pi2_temp += 40;
+
+ do
+ {
+ src_r5_8x16b = _mm_loadu_si128((__m128i *)pi2_temp);
+ src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 8));
+
+ src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
+ res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
+
+ _mm_storel_epi64((__m128i *)pu1_dst, res_16x8b);
+
+ src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
+ src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
+ src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
+ src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
+ src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
+ res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);
+
+ src_r0_8x16b = src_r2_8x16b;
+ src_r1_8x16b = src_r3_8x16b;
+ src_r2_8x16b = src_r4_8x16b;
+ src_r3_8x16b = src_r5_8x16b;
+ src_r4_8x16b = src_r6_8x16b;
+
+ ht -= 2;
+ pi2_temp += 16;
+ pu1_dst += dst_strd << 1;
+ }
+ while(ht > 0);
+ }
+ }
+ else // wd == 16
+ {
+ WORD16 *pi2_temp;
+ WORD32 ht_tmp;
+
+ pu1_tmp += 4;
+ pu1_src -= src_strd << 1;
+ pi2_temp = (WORD16 *)pu1_tmp;
+ pu1_src -= 2; // the filter input starts from x[-2] (till x[3])
+
+ // Horizontal 6-tap filtering
+ {
+ ht_tmp = ht + 5;
+
+ __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
+ __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
+
+ __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
+ __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
+
+ __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
+
+ coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
+ coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
+        coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
+ //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
+ //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
+ //Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
+ //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels.
+
+ do
+ {
+ src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
+ src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
+
+ src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
+ src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
+
+ src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
+ src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
+
+ res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
+ //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
+ res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
+ //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
+
+ src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
+ src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
+
+ src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
+ src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0
+
+ src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
+ src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10
+
+ res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
+ //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
+ res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3
+ //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
+
+ src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
+ src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0
+
+ src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0
+ src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0
+
+ src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
+ src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
+
+ res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
+ //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
+ res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
+ //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
+ res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
+ res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
+
+ res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
+ res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
+
+ _mm_storeu_si128((__m128i *)pi2_temp, res_r0_t1_8x16b);
+ _mm_storeu_si128((__m128i *)(pi2_temp + 8), res_r1_t1_8x16b);
+
+ ht_tmp--;
+ pu1_src += src_strd;
+ pi2_temp += 16;
+ }
+ while(ht_tmp > 0);
+ }
+
+ pi2_temp = (WORD16 *)pu1_tmp;
+
+ // Vertical 6-tap filtering
+ {
+ WORD16 *pi2_temp2;
+ UWORD8 *pu1_dst2;
+ WORD32 ht_tmp;
+
+ __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b, src_r4_8x16b;
+ __m128i src_r5_8x16b, src_r6_8x16b;
+ __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
+
+ __m128i res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
+ __m128i res_c0_4x32b, res_c1_4x32b;
+ __m128i res_8x16b, res_16x8b;
+
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
+ __m128i const_val512_4x32b;
+
+ coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
+ coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
+ coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
+
+ const_val512_4x32b = _mm_set1_epi32(512);
+
+ pi2_temp2 = pi2_temp + 8;
+ pu1_dst2 = pu1_dst + 8;
+ ht_tmp = ht;
+
+ /**********************************************************/
+ /* Do first height x 8 block */
+ /**********************************************************/
+ src_r0_8x16b = _mm_loadu_si128((__m128i *)pi2_temp);
+ src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 16));
+ src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 32));
+ src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 48));
+ src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 64));
+ pi2_temp += 80;
+
+ do
+ {
+ src_r5_8x16b = _mm_loadu_si128((__m128i *)pi2_temp);
+ src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 16));
+
+ src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
+ res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
+
+ _mm_storel_epi64((__m128i *)pu1_dst, res_16x8b);
+
+ src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
+ src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
+ src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
+ src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
+ src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
+ res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);
+
+ src_r0_8x16b = src_r2_8x16b;
+ src_r1_8x16b = src_r3_8x16b;
+ src_r2_8x16b = src_r4_8x16b;
+ src_r3_8x16b = src_r5_8x16b;
+ src_r4_8x16b = src_r6_8x16b;
+
+ ht_tmp -= 2;
+ pi2_temp += 32;
+ pu1_dst += dst_strd << 1;
+ }
+ while(ht_tmp > 0);
+
+ /**********************************************************/
+ /* Do second ht x 8 block */
+ /**********************************************************/
+ src_r0_8x16b = _mm_loadu_si128((__m128i *)pi2_temp2);
+ src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16));
+ src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 32));
+ src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 48));
+ src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 64));
+ pi2_temp2 += 80;
+
+ do
+ {
+ src_r5_8x16b = _mm_loadu_si128((__m128i *)pi2_temp2);
+ src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16));
+
+ src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
+ res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
+
+ _mm_storel_epi64((__m128i *)pu1_dst2, res_16x8b);
+
+ src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
+ src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
+ src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
+ src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
+ src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
+ res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst2 + dst_strd), res_16x8b);
+
+ src_r0_8x16b = src_r2_8x16b;
+ src_r1_8x16b = src_r3_8x16b;
+ src_r2_8x16b = src_r4_8x16b;
+ src_r3_8x16b = src_r5_8x16b;
+ src_r4_8x16b = src_r6_8x16b;
+
+ ht -= 2;
+ pi2_temp2 += 32;
+ pu1_dst2 += dst_strd << 1;
+ }
+ while(ht > 0);
+ }
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_inter_pred_luma_horz_qpel_ssse3 */
+/* */
+/* Description : This function implements a six-tap filter horizontally */
+/* on ht x wd block and averages the values with the source */
+/* pixels to calculate horizontal quarter-pel as mentioned */
+/* in sec. 8.4.2.2.1 titled "Luma sample interpolation */
+/* process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8), */
+/* (16,8), (8,16) or (16,16). */
+/* */
+/*  Inputs        : pu1_src  - pointer to source                            */
+/*                  pu1_dst  - pointer to destination                       */
+/* src_strd - stride for source */
+/* dst_strd - stride for destination */
+/* ht - height of the block */
+/* wd - width of the block */
+/* pu1_tmp - pointer to temporary buffer */
+/* dydx - x and y reference offset for q-pel */
+/* calculations */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 13 02 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_inter_pred_luma_horz_qpel_ssse3(UWORD8 *pu1_src,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 src_strd,
+                                           WORD32 dst_strd,
+                                           WORD32 ht,
+                                           WORD32 wd,
+                                           UWORD8* pu1_tmp,
+                                           WORD32 dydx)
+{
+    WORD32 x_offset;
+    UWORD8 *pu1_pred1;
+
+    __m128i src_r0_16x8b, src_r1_16x8b;
+    __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
+    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
+    __m128i const_val16_8x16b;
+
+    UNUSED(pu1_tmp);
+
+    x_offset = dydx & 3; // horizontal q-pel phase (1 or 3 for this function)
+
+    coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
+    coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
+    coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
+                                                  //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
+    // Nearest integer-pel column (x_offset == 1 -> current pixel,
+    // x_offset == 3 -> pixel to the right); averaged with the half-pel
+    // filter output at the end to form the quarter-pel sample.
+    pu1_pred1 = pu1_src + (x_offset >> 1);
+
+    const_val16_8x16b = _mm_set1_epi16(16); // rounding term for the >> 5 below
+
+    pu1_src -= 2; // the filter input starts from x[-2] (till x[3])
+
+    if(wd == 4)
+    {
+        __m128i src_r0r1_16x8b;
+
+        __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
+        __m128i res_r0r1_16x8b;
+
+        __m128i mask_full_16x8b, mask_low32b;
+
+        mask_full_16x8b = _mm_set1_epi8(0xff);
+        mask_low32b = _mm_srli_si128(mask_full_16x8b, 12); // mask for first four bytes
+
+        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
+        //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
+
+        do
+        {
+            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);              //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
+            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
+
+            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);              //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
+            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);              //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
+
+            src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
+            src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
+
+            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);        //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
+            res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);  //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
+                                                                                    //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
+
+            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                  //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0
+            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                  //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0
+
+            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);        //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
+            res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff2_3_16x8b);  //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
+                                                                                    //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
+
+            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                  //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0
+            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                  //b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0
+
+            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);        //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
+            res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff4_5_16x8b);  //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
+                                                                                    //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
+            src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred1);               // nearest full-pel rows for the final averaging
+            src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred1 + src_strd));
+
+            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
+            res_r0r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0r1_t3_8x16b);
+            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b); //a0*c0+a1*c1+a2*c2+a3*c3+a4*c4+a5*c5 + 16;
+                                                                                     //a1*c0+a2*c1+a3*c2+a4*c3+a5*c4+a6*c5 + 16;
+                                                                                     //a2*c0+a3*c1+a4*c2+a5*c3+a6*c4+a7*c5 + 16;
+                                                                                     //a3*c0+a4*c1+a5*c2+a6*c3+a7*c4+a8*c5 + 16;
+                                                                                     //b0*c0+b1*c1+b2*c2+b3*c3+b4*c4+b5*c5 + 16;
+                                                                                     //b1*c0+b2*c1+b3*c2+b4*c3+b5*c4+b6*c5 + 16;
+                                                                                     //b2*c0+b3*c1+b4*c2+b5*c3+b6*c4+b7*c5 + 16;
+                                                                                     //b3*c0+b4*c1+b5*c2+b6*c3+b7*c4+b8*c5 + 16;
+            src_r0r1_16x8b = _mm_unpacklo_epi32(src_r0_16x8b,src_r1_16x8b);
+
+            res_r0r1_t1_8x16b = _mm_srai_epi16(res_r0r1_t1_8x16b, 5); //shifting right by 5 bits.
+
+            res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b, res_r0r1_t1_8x16b);
+            res_r0r1_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_r0r1_16x8b); //computing q-pel
+
+            _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)pu1_dst);
+            res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);
+            _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));
+
+            ht -= 2;
+            pu1_src += src_strd << 1;
+            pu1_pred1 += src_strd << 1;
+            pu1_dst += dst_strd << 1;
+        }
+        while(ht > 0);
+    }
+    else if(wd == 8)
+    {
+        __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
+
+        __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
+        __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
+        __m128i res_r0_16x8b, res_r1_16x8b;
+
+        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
+        //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
+
+        do
+        {
+            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);              //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
+            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
+
+            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);              //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
+            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);              //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
+
+            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
+            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
+
+            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
+                                                                                  //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
+            res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
+                                                                                  //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
+
+            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                  //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
+            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                  //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
+
+            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);          //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
+            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);          //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0
+
+            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
+            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10
+
+            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
+                                                                                  //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
+            res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
+                                                                                  //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
+
+            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                  //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
+            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                  //b4 b5 b6 b7 b8 b9....b15 0 0 0 0
+
+            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);          //a5 a6 a7 a8 a9....a15 0 0 0 0 0
+            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);          //b5 b6 b7 b8 b9....b15 0 0 0 0 0
+
+            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
+            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
+
+            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
+                                                                                  //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
+            res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
+                                                                                  //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
+            src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred1);               // nearest full-pel rows for the final averaging
+            src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred1 + src_strd));
+
+            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
+            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
+            res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b);
+            res_r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r1_t3_8x16b);
+            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
+            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
+
+            res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);
+            res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5); //shifting right by 5 bits.
+
+            res_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r0_t1_8x16b);
+            res_r1_16x8b = _mm_packus_epi16(res_r1_t1_8x16b, res_r1_t1_8x16b);
+
+            res_r0_16x8b = _mm_avg_epu8(src_r0_16x8b, res_r0_16x8b);
+            res_r1_16x8b = _mm_avg_epu8(src_r1_16x8b, res_r1_16x8b); //computing q-pel
+
+            _mm_storel_epi64((__m128i *)pu1_dst, res_r0_16x8b);
+            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_r1_16x8b);
+
+            ht -= 2;
+            pu1_src += src_strd << 1;
+            pu1_pred1 += src_strd << 1;
+            pu1_dst += dst_strd << 1;
+        }
+        while(ht > 0);
+    }
+    else // wd == 16
+    {
+        __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
+
+        __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
+        __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
+        __m128i res_16x8b;
+
+        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
+        //Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
+        //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels.
+        //(One 16-wide row is processed as two overlapping 8-wide halves.)
+
+        do
+        {
+            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);              //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
+            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));        //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
+
+            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);              //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
+            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);              //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
+
+            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
+            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
+
+            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
+                                                                                  //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
+            res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
+                                                                                  //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
+
+            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                  //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
+            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                  //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
+
+            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);          //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
+            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);          //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0
+
+            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
+            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10
+
+            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
+                                                                                  //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
+            res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
+                                                                                  //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
+
+            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                  //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
+            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                  //b4 b5 b6 b7 b8 b9....b15 0 0 0 0
+
+            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);          //a5 a6 a7 a8 a9....a15 0 0 0 0 0
+            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);          //b5 b6 b7 b8 b9....b15 0 0 0 0 0
+
+            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
+            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
+
+            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
+                                                                                  //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
+            res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
+                                                                                  //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
+            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_pred1);               // nearest full-pel row for the final averaging
+
+            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
+            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
+            res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b);
+            res_r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r1_t3_8x16b);
+            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
+            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
+
+            res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);
+            res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5); //shifting right by 5 bits
+
+            res_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b);
+            res_16x8b = _mm_avg_epu8(src_r0_16x8b, res_16x8b); //computing q-pel
+
+            _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);
+
+            ht--;
+            pu1_src += src_strd;
+            pu1_pred1 += src_strd;
+            pu1_dst += dst_strd;
+        }
+        while(ht > 0);
+    }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_inter_pred_luma_vert_qpel_ssse3 */
+/* */
+/* Description : This function implements a six-tap filter vertically on */
+/* ht x wd block and averages the values with the source */
+/* pixels to calculate vertical quarter-pel as mentioned in */
+/* sec. 8.4.2.2.1 titled "Luma sample interpolation */
+/* process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8), */
+/* (16,8), (8,16) or (16,16). */
+/* */
+/*  Inputs        : pu1_src  - pointer to source                            */
+/*                  pu1_dst  - pointer to destination                       */
+/* src_strd - stride for source */
+/* dst_strd - stride for destination */
+/* ht - height of the block */
+/* wd - width of the block */
+/* pu1_tmp - pointer to temporary buffer */
+/* dydx - x and y reference offset for q-pel */
+/* calculations */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 13 02 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_inter_pred_luma_vert_qpel_ssse3(UWORD8 *pu1_src,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 src_strd,
+                                           WORD32 dst_strd,
+                                           WORD32 ht,
+                                           WORD32 wd,
+                                           UWORD8* pu1_tmp,
+                                           WORD32 dydx)
+{
+    WORD32 y_offset;
+    UWORD8 *pu1_pred1;
+
+    UNUSED(pu1_tmp);
+
+    __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
+    __m128i src_r5_16x8b, src_r6_16x8b;
+    __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
+    __m128i res_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
+
+    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
+    __m128i const_val16_8x16b;
+
+    y_offset = dydx & 0xf; // NOTE(review): assumes y is packed in bits 2..3 of dydx (y = 1 or 3 here) -- confirm against caller
+
+    coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
+    coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
+    coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
+                                                  //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
+
+    // Nearest integer-pel row ((y_offset >> 3) is 0 or 1, i.e. the row above
+    // or below the half-pel position); averaged with the half-pel filter
+    // output at the end to form the quarter-pel sample.
+    pu1_pred1 = pu1_src + (y_offset >> 3) * src_strd;
+
+    const_val16_8x16b = _mm_set1_epi16(16); // rounding term for the >> 5 below
+
+    pu1_src -= src_strd << 1; // the filter input starts from x[-2] (till x[3])
+
+    if(wd == 4)
+    {
+        __m128i mask_low32b;
+
+        mask_low32b = _mm_set1_epi8(0xff);
+
+        //Prologue: Load all the pred rows except sixth and seventh row
+        //          for the first and second row processing.
+        src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+        pu1_src += src_strd;
+        src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+        pu1_src += src_strd;
+        src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+        pu1_src += src_strd;
+        src_r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+        pu1_src += src_strd;
+        src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+        pu1_src += src_strd;
+
+        mask_low32b = _mm_srli_si128(mask_low32b, 12); // mask for first four bytes
+
+        // Pair consecutive rows into the low 8 bytes of each register so two
+        // output rows can be filtered per iteration.
+        src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b);
+        src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b);
+        src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b);
+        src_r3_16x8b = _mm_unpacklo_epi32(src_r3_16x8b, src_r4_16x8b);
+
+        do
+        {
+            src_r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+            src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+
+            src_r4_16x8b = _mm_unpacklo_epi32(src_r4_16x8b, src_r5_16x8b);
+            src_r5_16x8b = _mm_unpacklo_epi32(src_r5_16x8b, src_r6_16x8b);
+
+            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
+            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
+            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
+
+            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+            src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred1);               // nearest full-pel rows for the final averaging
+            src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred1 + src_strd));
+
+            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+            res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+
+            src_r0r1_16x8b = _mm_unpacklo_epi32(src_r0_16x8b,src_r1_16x8b);
+
+            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+            res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
+
+            res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel
+
+            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)pu1_dst);
+            res_16x8b = _mm_srli_si128(res_16x8b, 4);
+            _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd));
+
+            // Slide the 6-tap window down by two rows.
+            src_r0_16x8b = src_r2_16x8b;
+            src_r1_16x8b = src_r3_16x8b;
+            src_r2_16x8b = src_r4_16x8b;
+            src_r3_16x8b = src_r5_16x8b;
+            src_r4_16x8b = src_r6_16x8b;
+
+            ht -= 2;
+            pu1_src += src_strd << 1;
+            pu1_pred1 += src_strd << 1;
+            pu1_dst += dst_strd << 1;
+        }
+        while(ht > 0);
+    }
+
+    else if(wd == 8)
+    {
+        //Prologue: Load all the pred rows except sixth and seventh row
+        //          for the first and second row processing.
+        src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+        pu1_src += src_strd;
+        src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+        pu1_src += src_strd;
+        src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+        pu1_src += src_strd;
+        src_r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+        pu1_src += src_strd;
+        src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+        pu1_src += src_strd;
+
+        // Pair consecutive rows (8 bytes each) into one register.
+        src_r0_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);
+        src_r1_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b);
+        src_r2_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b);
+        src_r3_16x8b = _mm_unpacklo_epi64(src_r3_16x8b, src_r4_16x8b);
+
+        do
+        {
+            src_r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+            src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+
+            src_r4_16x8b = _mm_unpacklo_epi64(src_r4_16x8b, src_r5_16x8b);
+            src_r5_16x8b = _mm_unpacklo_epi64(src_r5_16x8b, src_r6_16x8b);
+
+            // First output row: taps come from the low halves.
+            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
+            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
+            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
+
+            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+            src_r0r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred1); // nearest full-pel row for averaging
+
+            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+            res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+
+            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+            res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
+            res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel
+
+            _mm_storel_epi64((__m128i *)pu1_dst, res_16x8b);
+
+            // Second output row: taps come from the high halves.
+            src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
+            src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
+            src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
+
+            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+            src_r0r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred1 + src_strd));
+
+            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+            res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+
+            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+            res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
+            res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel
+
+            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);
+
+            // Slide the 6-tap window down by two rows.
+            src_r0_16x8b = src_r2_16x8b;
+            src_r1_16x8b = src_r3_16x8b;
+            src_r2_16x8b = src_r4_16x8b;
+            src_r3_16x8b = src_r5_16x8b;
+            src_r4_16x8b = src_r6_16x8b;
+
+            ht -= 2;
+            pu1_src += src_strd << 1;
+            pu1_pred1 += src_strd << 1;
+            pu1_dst += dst_strd << 1;
+        }
+        while(ht > 0);
+    }
+    else // wd == 16
+    {
+        __m128i res_t0_8x16b;
+
+        //Prologue: Load all the pred rows except sixth and seventh row
+        //          for the first and second row processing.
+        src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+        pu1_src += src_strd;
+        src_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+        pu1_src += src_strd;
+        src_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+        pu1_src += src_strd;
+        src_r3_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+        pu1_src += src_strd;
+        src_r4_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+        pu1_src += src_strd;
+
+        do
+        {
+            src_r5_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+            src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
+
+            // First output row, left 8 columns.
+            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
+            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
+            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
+
+            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+            res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+
+            res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+            // First output row, right 8 columns.
+            src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
+            src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
+            src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
+
+            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+            src_r0r1_16x8b = _mm_loadu_si128((__m128i *)pu1_pred1); // nearest full-pel row for averaging
+
+            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+            res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+
+            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+            res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
+            res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel
+
+            _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);
+
+            // Second output row, left 8 columns (window advanced by one row).
+            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
+            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
+            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);
+
+            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+            res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+
+            res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+            // Second output row, right 8 columns.
+            src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b);
+            src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b);
+            src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b);
+
+            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+            src_r0r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred1 + src_strd));
+
+            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+            res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+
+            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+            res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
+            res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel
+
+            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res_16x8b);
+
+            // Slide the 6-tap window down by two rows.
+            src_r0_16x8b = src_r2_16x8b;
+            src_r1_16x8b = src_r3_16x8b;
+            src_r2_16x8b = src_r4_16x8b;
+            src_r3_16x8b = src_r5_16x8b;
+            src_r4_16x8b = src_r6_16x8b;
+
+            ht -= 2;
+            pu1_src += src_strd << 1;
+            pu1_pred1 += src_strd << 1;
+            pu1_dst += dst_strd << 1;
+        }
+        while(ht > 0);
+    }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3 */
+/* */
+/* Description : This function implements a six-tap filter vertically and */
+/* horizontally on ht x wd block separately and averages */
+/* the two sets of values to calculate values at (1/4,1/4), */
+/* (1/4, 3/4), (3/4, 1/4) or (3/4, 3/4) as mentioned in */
+/* sec. 8.4.2.2.1 titled "Luma sample interpolation */
+/* process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8), */
+/* (16,8), (8,16) or (16,16). */
+/* */
+/* Inputs : puc_src - pointer to source */
+/* puc_dst - pointer to destination */
+/* src_strd - stride for source */
+/* dst_strd - stride for destination */
+/* ht - height of the block */
+/* wd - width of the block */
+/* pu1_tmp - pointer to temporary buffer */
+/* dydx - x and y reference offset for q-pel */
+/* calculations */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 13 02 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3(UWORD8 *pu1_src,
+                                                     UWORD8 *pu1_dst,
+                                                     WORD32 src_strd,
+                                                     WORD32 dst_strd,
+                                                     WORD32 ht,
+                                                     WORD32 wd,
+                                                     UWORD8* pu1_tmp,
+                                                     WORD32 dydx)
+{
+    WORD32 ht_temp;
+    UWORD8 *pu1_pred_vert,*pu1_pred_horiz;
+    UWORD8 *pu1_tmp1, *pu1_tmp2;
+    WORD32 x_offset, y_offset;
+
+    //Two-pass scheme: the vertical six-tap (half-pel) output for the whole
+    //block is computed first into pu1_tmp (written through pu1_tmp1, read
+    //back through pu1_tmp2); the horizontal six-tap output is then computed
+    //row by row and averaged (_mm_avg_epu8) with the buffered vertical
+    //output to produce the quarter-pel prediction.
+    //NOTE(review): pu1_tmp must hold at least wd*ht bytes - confirm caller
+    //allocation.
+    pu1_tmp1 = pu1_tmp;
+
+    dydx &= 0xf;
+    ht_temp = ht;
+    x_offset = dydx & 0x3;                          //horizontal q-pel offset (0..3)
+    y_offset = dydx >> 2;                           //vertical q-pel offset (0..3)
+    pu1_tmp2 = pu1_tmp1;
+
+    //(x_offset >> 1) / (y_offset >> 1) pick the nearer full-pel column/row
+    //that feeds the vertical/horizontal filter respectively.
+    pu1_pred_vert = pu1_src + (x_offset >> 1) - 2*src_strd;
+    pu1_pred_horiz = pu1_src + (y_offset >> 1) * src_strd - 2;
+    //the filter input starts from x[-2] (till x[3])
+
+    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
+    __m128i const_val16_8x16b;
+
+    coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
+    coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
+    coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
+                                                  //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
+    const_val16_8x16b = _mm_set1_epi16(16);       //rounding term before >> 5
+
+    if(wd == 4)
+    {
+        //vertical q-pel filter
+        {
+            __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
+            __m128i src_r5_16x8b, src_r6_16x8b;
+            __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
+
+            __m128i res_r0r1_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
+
+            //prologue: Load all the pred rows except sixth and seventh row for the
+            //first and second row processing.
+            src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
+            pu1_pred_vert = pu1_pred_vert + src_strd;
+
+            src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
+            pu1_pred_vert = pu1_pred_vert + src_strd;
+            src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b);
+
+            src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
+            pu1_pred_vert = pu1_pred_vert + src_strd;
+            src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b);
+
+            src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
+            pu1_pred_vert = pu1_pred_vert + src_strd;
+            src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b);
+
+            src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
+            pu1_pred_vert = pu1_pred_vert + src_strd;
+            src_r3_16x8b = _mm_unpacklo_epi32(src_r3_16x8b, src_r4_16x8b);
+
+            //Core Loop: Process all the rows. Two 4-pixel rows are filtered
+            //per iteration (rows paired in the low 8 bytes of each register).
+            do
+            {
+                src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
+                src_r4_16x8b = _mm_unpacklo_epi32(src_r4_16x8b, src_r5_16x8b);
+
+                src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert + src_strd));
+                src_r5_16x8b = _mm_unpacklo_epi32(src_r5_16x8b, src_r6_16x8b);
+
+                src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
+                src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
+                src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
+
+                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+                res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
+
+                res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+                res_r0r1_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
+
+                _mm_storel_epi64((__m128i *)pu1_tmp1, res_r0r1_16x8b);
+
+                src_r0_16x8b = src_r2_16x8b;
+                src_r1_16x8b = src_r3_16x8b;
+                src_r2_16x8b = src_r4_16x8b;
+                src_r3_16x8b = src_r5_16x8b;
+                src_r4_16x8b = src_r6_16x8b;
+
+                ht_temp -= 2;
+                pu1_pred_vert += src_strd << 1;
+                pu1_tmp1 += 8;
+            }
+            while(ht_temp > 0);
+        }
+
+        //horizontal q-pel filter; averages with the buffered vertical output
+        {
+            __m128i src_r0_16x8b, src_r1_16x8b;
+            __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
+            __m128i src_r0r1_vpel_16x8b, src_r0r1_t1_16x8b;
+
+            __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
+            __m128i res_r0r1_16x8b;
+
+            __m128i mask_low32b;
+
+            mask_low32b = _mm_set1_epi8(0xff);
+            mask_low32b = _mm_srli_si128(mask_low32b, 12);      //store-mask for the low 4 bytes only
+
+            //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
+            //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
+
+            do
+            {
+                src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_pred_horiz);              //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
+                src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
+
+                src_r0r1_vpel_16x8b = _mm_loadl_epi64((__m128i *)pu1_tmp2);             //two rows of the vertical q-pel output
+
+                src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                     //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
+                src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                     //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
+
+                src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);       //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
+                src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);       //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
+
+                src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);     //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
+                res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
+                                                                                          //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
+
+                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                         //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0
+                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                         //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0
+
+                src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);     //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
+                res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
+                                                                                          //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
+
+                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                         //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0
+                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                         //b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0
+
+                src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);     //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
+                res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
+                                                                                          //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
+
+                res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
+                res_r0r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0r1_t3_8x16b);
+                res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b); //a0*c0+a1*c1+a2*c2+a3*c3+a4*c4+a5*c5 + 16;
+                                                                                         //a1*c0+a2*c1+a3*c2+a4*c3+a5*c4+a6*c5 + 16;
+                                                                                         //a2*c0+a3*c1+a4*c2+a5*c3+a6*c4+a7*c5 + 16;
+                                                                                         //a3*c0+a4*c1+a5*c2+a6*c3+a7*c4+a8*c5 + 16;
+                                                                                         //b0*c0+b1*c1+b2*c2+b3*c3+b4*c4+b5*c5 + 16;
+                                                                                         //b1*c0+b2*c1+b3*c2+b4*c3+b5*c4+b6*c5 + 16;
+                                                                                         //b2*c0+b3*c1+b4*c2+b5*c3+b6*c4+b7*c5 + 16;
+                                                                                         //b3*c0+b4*c1+b5*c2+b6*c3+b7*c4+b8*c5 + 16;
+
+                res_r0r1_t1_8x16b = _mm_srai_epi16(res_r0r1_t1_8x16b, 5); //shifting right by 5 bits.
+
+                res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b,res_r0r1_t1_8x16b);
+
+                res_r0r1_16x8b = _mm_avg_epu8(res_r0r1_16x8b,src_r0r1_vpel_16x8b);      //average of the two filtered paths = q-pel
+
+                _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)pu1_dst);       //store 4 bytes of row 0
+                res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);
+                _mm_maskmoveu_si128(res_r0r1_16x8b, mask_low32b, (char*)(pu1_dst + dst_strd)); //store 4 bytes of row 1
+
+                ht -= 2;
+                pu1_pred_horiz += src_strd << 1;
+                pu1_tmp2 += 8;
+                pu1_dst += dst_strd << 1;
+            }
+            while(ht > 0);
+        }
+    }
+    else if(wd == 8)
+    {
+        //vertical q-pel filter
+        {
+            __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
+            __m128i src_r4_16x8b, src_r5_16x8b, src_r6_16x8b;
+            __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
+
+            __m128i res_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
+
+            //prologue: Load all the pred rows except sixth and seventh row for the
+            //first and second row processing.
+            src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
+            pu1_pred_vert = pu1_pred_vert + src_strd;
+
+            src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
+            pu1_pred_vert = pu1_pred_vert + src_strd;
+            src_r0_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);
+
+            src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
+            pu1_pred_vert = pu1_pred_vert + src_strd;
+            src_r1_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b);
+
+            src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
+            pu1_pred_vert = pu1_pred_vert + src_strd;
+            src_r2_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b);
+
+            src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
+            pu1_pred_vert = pu1_pred_vert + src_strd;
+            src_r3_16x8b = _mm_unpacklo_epi64(src_r3_16x8b, src_r4_16x8b);
+
+            //Core Loop: Process all the rows. Two 8-pixel rows per iteration;
+            //low/high unpack halves give the two rows' interleaved samples.
+            do
+            {
+                src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
+                src_r4_16x8b = _mm_unpacklo_epi64(src_r4_16x8b, src_r5_16x8b);
+
+                src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert + src_strd));
+                src_r5_16x8b = _mm_unpacklo_epi64(src_r5_16x8b, src_r6_16x8b);
+
+                src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
+                src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
+                src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
+
+                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+                res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
+
+                res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+                res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
+
+                _mm_storel_epi64((__m128i *)(pu1_tmp1), res_16x8b);
+
+                src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
+                src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
+                src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
+
+                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+                res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+                res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
+
+                res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+                res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
+
+                _mm_storel_epi64((__m128i *)(pu1_tmp1 + 8), res_16x8b);
+
+                src_r0_16x8b = src_r2_16x8b;
+                src_r1_16x8b = src_r3_16x8b;
+                src_r2_16x8b = src_r4_16x8b;
+                src_r3_16x8b = src_r5_16x8b;
+                src_r4_16x8b = src_r6_16x8b;
+
+                ht_temp -= 2;
+                pu1_pred_vert += src_strd << 1;
+                pu1_tmp1 += 16;
+            }
+            while(ht_temp > 0);
+        }
+
+        //horizontal q-pel filter; averages with the buffered vertical output
+        {
+            __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
+            __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
+            __m128i src_r0_vpel_16x8b, src_r1_vpel_16x8b;
+
+            __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
+            __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b, res_16x8b;
+
+            //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
+            //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
+
+            do
+            {
+                src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz));            //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
+                src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
+
+                src_r0_vpel_16x8b = _mm_loadl_epi64((__m128i *)(pu1_tmp2));             //row 0 of the vertical q-pel output
+                src_r1_vpel_16x8b = _mm_loadl_epi64((__m128i *)(pu1_tmp2 + 8));         //row 1 of the vertical q-pel output
+
+                src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                     //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
+                src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                     //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
+
+                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);    //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
+                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);    //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
+
+                res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);   //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
+                                                                                        //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
+                res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);   //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
+                                                                                        //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
+
+                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                         //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
+                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                         //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
+
+                src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                 //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
+                src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                 //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0
+
+                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);    //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
+                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);    //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10
+
+                res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);   //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
+                                                                                        //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
+                res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);   //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
+                                                                                        //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
+
+                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                         //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
+                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                         //b4 b5 b6 b7 b8 b9....b15 0 0 0 0
+
+                src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                 //a5 a6 a7 a8 a9....a15 0 0 0 0 0
+                src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                 //b5 b6 b7 b8 b9....b15 0 0 0 0 0
+
+                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);    //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
+                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);    //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
+
+                res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);   //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
+                                                                                        //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
+                res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);   //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
+                                                                                        //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
+                res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
+                res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b);
+                res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
+                res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5); //shifting right by 5 bits.
+
+                res_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r0_t1_8x16b);
+                res_16x8b = _mm_avg_epu8(res_16x8b, src_r0_vpel_16x8b);                 //average of the two filtered paths = q-pel
+
+                _mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b);
+
+                res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
+                res_r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r1_t3_8x16b);
+                res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
+                res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5); //shifting right by 5 bits.
+
+                res_16x8b = _mm_packus_epi16(res_r1_t1_8x16b, res_r1_t1_8x16b);
+                res_16x8b = _mm_avg_epu8(res_16x8b,src_r1_vpel_16x8b);
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);
+
+                ht -= 2;
+                pu1_pred_horiz += src_strd << 1;
+                pu1_dst += dst_strd << 1;
+                pu1_tmp2 += 16;
+            }
+            while(ht > 0);
+        }
+    }
+    else // wd == 16
+    {
+        //vertical q-pel filter
+        {
+            __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
+            __m128i src_r4_16x8b, src_r5_16x8b, src_r6_16x8b;
+            __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
+
+            __m128i res_t0_8x16b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
+            __m128i res_16x8b;
+
+            //prologue: Load all the pred rows except sixth and seventh row for the
+            //first and second row processing.
+            src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
+            pu1_pred_vert = pu1_pred_vert + src_strd;
+            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
+            pu1_pred_vert = pu1_pred_vert + src_strd;
+            src_r2_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
+            pu1_pred_vert = pu1_pred_vert + src_strd;
+            src_r3_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
+            pu1_pred_vert = pu1_pred_vert + src_strd;
+            src_r4_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
+            pu1_pred_vert = pu1_pred_vert + src_strd;
+
+            //Core Loop: Process all the rows. Two full 16-pixel rows per
+            //iteration; each row needs a low-half and a high-half filter pass.
+            do
+            {
+                src_r5_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
+                src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert + src_strd));
+
+                src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
+                src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
+                src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
+
+                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+                res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+                res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+                src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
+                src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
+                src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
+
+                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+                res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+                res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+                res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
+
+                _mm_storeu_si128((__m128i *)(pu1_tmp1), res_16x8b);
+
+                src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
+                src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
+                src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);
+
+                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+                res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+                res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+                src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b);
+                src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b);
+                src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b);
+
+                res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+                res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+                res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+                res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
+                res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
+                res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
+
+                res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
+
+                _mm_storeu_si128((__m128i *)(pu1_tmp1 + 16), res_16x8b);
+
+                src_r0_16x8b = src_r2_16x8b;
+                src_r1_16x8b = src_r3_16x8b;
+                src_r2_16x8b = src_r4_16x8b;
+                src_r3_16x8b = src_r5_16x8b;
+                src_r4_16x8b = src_r6_16x8b;
+
+                ht_temp -= 2;
+                pu1_pred_vert += src_strd << 1;
+                pu1_tmp1 += 32;
+            }
+            while(ht_temp > 0);
+        }
+        //horizontal q-pel filter; averages with the buffered vertical output
+        {
+            __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
+            __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
+            __m128i src_vpel_16x8b;
+
+            __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
+            __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
+            __m128i res_16x8b;
+
+            //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
+            //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
+            //Here "Row1" is the same row loaded 8 bytes in: b0 is the same as
+            //a8, and in general bn is the same as a(n+8). One 16-pixel row is
+            //processed per iteration as two overlapping 8-output halves.
+
+            do
+            {
+                src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz));     //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
+                src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz + 8)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
+                src_vpel_16x8b = _mm_loadu_si128((__m128i *)(pu1_tmp2));         //one row of the vertical q-pel output
+
+                src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);              //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
+                src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);              //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
+
+                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
+                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
+
+                res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
+                                                                                      //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
+                res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
+                                                                                      //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
+
+                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                  //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
+                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                  //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
+
+                src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);          //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
+                src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);          //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0
+
+                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
+                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10
+
+                res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
+                                                                                      //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
+                res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
+                                                                                      //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
+
+                src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                  //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
+                src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                  //b4 b5 b6 b7 b8 b9....b15 0 0 0 0
+
+                src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);          //a5 a6 a7 a8 a9....a15 0 0 0 0 0
+                src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);          //b5 b6 b7 b8 b9....b15 0 0 0 0 0
+
+                src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
+                src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
+
+                res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
+                                                                                      //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
+                res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
+                                                                                      //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
+                res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
+                res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b);
+                res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
+                res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5); //shifting right by 5 bits.
+
+                //same sum as the r0 path; the rounding constant is merely added
+                //last here, which is equivalent for 16-bit adds.
+                res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
+                res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
+                res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, const_val16_8x16b);
+                res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5); //shifting right by 5 bits.
+
+                res_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b);
+
+                res_16x8b = _mm_avg_epu8(res_16x8b, src_vpel_16x8b);             //average of the two filtered paths = q-pel
+                _mm_storeu_si128((__m128i *)(pu1_dst), res_16x8b);
+
+                ht --;
+                pu1_pred_horiz += src_strd;
+                pu1_dst += dst_strd;
+                pu1_tmp2 += 16;
+            }
+            while(ht > 0);
+        }
+    }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3 */
+/* */
+/* Description : This function implements a six-tap filter vertically and */
+/* horizontally on ht x wd block separately and averages */
+/* the two sets of values to calculate values at (1/4,1/2), */
+/* or (3/4, 1/2) as mentioned in sec. 8.4.2.2.1 titled */
+/* "Luma sample interpolation process". (ht,wd) can be */
+/* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */
+/* */
+/* Inputs : puc_src - pointer to source */
+/* puc_dst - pointer to destination */
+/* src_strd - stride for source */
+/* dst_strd - stride for destination */
+/* ht - height of the block */
+/* wd - width of the block */
+/* pu1_tmp - pointer to temporary buffer */
+/* dydx - x and y reference offset for q-pel */
+/* calculations */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 13 02 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ht,
+ WORD32 wd,
+ UWORD8* pu1_tmp,
+ WORD32 dydx)
+{
+ WORD32 ht_temp;
+ WORD32 x_offset;
+ WORD32 off0,off1, off2, off3, off4, off5;
+ WORD16 *pi2_temp1,*pi2_temp2,*pi2_temp3;
+
+ ht_temp = ht;
+ x_offset = dydx & 0x3;
+ pi2_temp1 = (WORD16 *)pu1_tmp;
+ pi2_temp2 = pi2_temp1;
+ pi2_temp3 = pi2_temp1 + (x_offset >> 1);
+
+ pu1_src -= 2 * src_strd;
+ pu1_src -= 2;
+ pi2_temp3 += 2;
+ //the filter input starts from x[-2] (till x[3])
+
+ if(wd == 4)
+ {
+ //vertical half-pel
+ {
+ __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
+ __m128i src_r5_16x8b, src_r6_16x8b;
+ __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
+
+ __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
+
+ __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
+
+ coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
+ coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
+ coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
+ //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
+ off0 = -((src_strd << 2) + src_strd) + 8;
+ off1 = -(src_strd << 2) + 8;
+ off2 = -((src_strd << 1) + src_strd) + 8;
+ off3 = -(src_strd << 1) + 8;
+ off4 = -src_strd + 8;
+ off5 = 8;
+
+ //epilogue: Load all the pred rows except sixth and seventh row for the
+ //first and second row processing.
+ src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
+ pu1_src = pu1_src + src_strd;
+
+ src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
+ pu1_src = pu1_src + src_strd;
+
+ src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
+ pu1_src = pu1_src + src_strd;
+
+ src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
+ pu1_src = pu1_src + src_strd;
+
+ src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
+ pu1_src = pu1_src + src_strd;
+
+ //Core Loop: Process all the rows.
+ do
+ {
+ src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
+
+ src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
+ src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
+ src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ res_t1_8x16b = _mm_add_epi16(res_t2_8x16b, res_t1_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pi2_temp1), res_t1_8x16b);
+
+ pi2_temp1[8] = pu1_src[off0] + pu1_src[off5]
+ - (pu1_src[off1] + pu1_src[off4])
+ + ((pu1_src[off2] + pu1_src[off3] - pu1_src[off1] - pu1_src[off4]) << 2)
+ + ((pu1_src[off2] + pu1_src[off3]) << 4);
+
+ pu1_src = pu1_src + src_strd;
+ pi2_temp1 = pi2_temp1 + 9;
+
+ src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
+
+ src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
+ src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
+ src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ res_t1_8x16b = _mm_add_epi16(res_t2_8x16b, res_t1_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pi2_temp1), res_t1_8x16b);
+
+ pi2_temp1[8] = pu1_src[off0] + pu1_src[off5]
+ - (pu1_src[off1] + pu1_src[off4])
+ + ((pu1_src[off2] + pu1_src[off3] - pu1_src[off1] - pu1_src[off4]) << 2)
+ + ((pu1_src[off2] + pu1_src[off3]) << 4);
+
+ ht_temp -= 2;
+ pu1_src = pu1_src + src_strd;
+ pi2_temp1 = pi2_temp1 + 9;
+
+ src_r0_16x8b = src_r2_16x8b;
+ src_r1_16x8b = src_r3_16x8b;
+ src_r2_16x8b = src_r4_16x8b;
+ src_r3_16x8b = src_r5_16x8b;
+ src_r4_16x8b = src_r6_16x8b;
+ }
+ while(ht_temp > 0);
+ }
+
+ //horizontal q-pel
+ {
+ __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b;
+ __m128i src_r3_8x16b, src_r4_8x16b, src_r5_8x16b;
+ __m128i src_r0r1_c0_8x16b, src_r2r3_c0_8x16b, src_r4r5_c0_8x16b;
+ __m128i src_hpel_16x8b, src_hpel_8x16b;
+
+ __m128i res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
+ __m128i res_8x16b, res_16x8b;
+
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
+ __m128i const_val512_4x32b, const_val16_8x16b;
+ __m128i mask_low32b;
+
+ mask_low32b = _mm_set1_epi8(0xff);
+
+ coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
+ coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
+ coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
+
+ mask_low32b = _mm_srli_si128(mask_low32b, 12);
+
+ const_val512_4x32b = _mm_set1_epi32(512);
+ const_val16_8x16b = _mm_set1_epi16(16);
+
+ do
+ {
+ src_r0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2));
+ src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 1));
+ src_r2_8x16b = _mm_srli_si128(src_r1_8x16b, 2);
+ src_r3_8x16b = _mm_srli_si128(src_r1_8x16b, 4);
+ src_r4_8x16b = _mm_srli_si128(src_r1_8x16b, 6);
+ src_r5_8x16b = _mm_srli_si128(src_r1_8x16b, 8);
+
+ src_r0r1_c0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_r2r3_c0_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_r4r5_c0_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_c0_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_c0_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_c0_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(const_val512_4x32b, res_t3_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ res_8x16b = _mm_packs_epi32(res_t1_4x32b, res_t1_4x32b);
+ res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
+
+ src_hpel_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp3));
+ src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
+ src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
+ src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);
+
+ res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
+
+ _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char*)pu1_dst);
+
+ ht--;
+ pi2_temp2 = pi2_temp2 + 4 + 5;
+ pi2_temp3 = pi2_temp3 + 4 + 5;
+ pu1_dst = pu1_dst + dst_strd;
+ }
+ while(ht > 0);
+ }
+ }
+ else if(wd == 8)
+ {
+ // vertical half-pel
+ {
+ __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
+ __m128i src_r5_16x8b, src_r6_16x8b;
+ __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
+
+ __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
+
+ __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
+
+ coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
+ coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
+ coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
+
+ //epilogue: Load all the pred rows except sixth and seventh row for the
+ //first and second row processing.
+ src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
+ pu1_src = pu1_src + src_strd;
+
+ src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
+ pu1_src = pu1_src + src_strd;
+
+ src_r2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
+ pu1_src = pu1_src + src_strd;
+
+ src_r3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
+ pu1_src = pu1_src + src_strd;
+
+ src_r4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
+ pu1_src = pu1_src + src_strd;
+
+ //Core Loop: Process all the rows.
+ do
+ {
+ src_r5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
+ src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
+
+ src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
+ src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
+ src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pi2_temp1), res_t1_8x16b);
+
+ src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
+ src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
+ src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pi2_temp1 + 8), res_t1_8x16b);
+
+ src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
+ src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
+ src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pi2_temp1 + 8 + 5), res_t1_8x16b);
+
+ src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b);
+ src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b);
+ src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pi2_temp1 + 8 + 5 + 8), res_t1_8x16b);
+
+ src_r0_16x8b = src_r2_16x8b;
+ src_r1_16x8b = src_r3_16x8b;
+ src_r2_16x8b = src_r4_16x8b;
+ src_r3_16x8b = src_r5_16x8b;
+ src_r4_16x8b = src_r6_16x8b;
+
+ ht_temp -= 2;
+ pu1_src = pu1_src + (src_strd << 1);
+ pi2_temp1 = pi2_temp1 + (13 << 1);
+ }
+ while(ht_temp > 0);
+ }
+ // horizontal q-pel
+ {
+ __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b;
+ __m128i src_r4_8x16b, src_r5_8x16b;
+ __m128i src_r0r1_c0_8x16b, src_r2r3_c0_8x16b, src_r4r5_c0_8x16b;
+ __m128i src_r0r1_c1_8x16b, src_r2r3_c1_8x16b, src_r4r5_c1_8x16b;
+ __m128i src_hpel_8x16b, src_hpel_16x8b;
+
+ __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
+ __m128i res_8x16b, res_16x8b;
+
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
+ __m128i const_val512_4x32b, const_val16_8x16b;
+
+ coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
+ coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
+ coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
+
+ const_val512_4x32b = _mm_set1_epi32(512);
+ const_val16_8x16b = _mm_set1_epi16(16);
+
+ do
+ {
+ src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2));
+ src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 1));
+ src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 2));
+ src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 3));
+ src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 4));
+ src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 5));
+
+ src_r0r1_c0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_r2r3_c0_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_r4r5_c0_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ src_r0r1_c1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_r2r3_c1_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_r4r5_c1_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_c0_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_c0_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_c0_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+
+ res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_c1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_c1_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_c1_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+
+ res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
+ res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
+
+ src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3));
+ src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
+ src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
+ src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);
+
+ res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b);
+
+ ht--;
+ pi2_temp2 = pi2_temp2 + 8 + 5;
+ pi2_temp3 = pi2_temp3 + 8 + 5;
+ pu1_dst = pu1_dst + dst_strd;
+ }
+ while(ht > 0);
+ }
+ }
+ else // wd == 16
+ {
+ // vertical half-pel
+ {
+ __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
+ __m128i src_r4_16x8b, src_r5_16x8b;
+ __m128i src_r0_c2_16x8b, src_r1_c2_16x8b, src_r2_c2_16x8b, src_r3_c2_16x8b;
+ __m128i src_r4_c2_16x8b, src_r5_c2_16x8b;
+ __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
+
+ __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
+
+ __m128i coeff0_1_16x8b,coeff2_3_16x8b,coeff4_5_16x8b;
+
+ coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
+ coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
+ coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
+
+ src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
+ src_r0_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
+ pu1_src = pu1_src + src_strd;
+ src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
+ src_r1_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
+ pu1_src = pu1_src + src_strd;
+ src_r2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
+ src_r2_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
+ pu1_src = pu1_src + src_strd;
+ src_r3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
+ src_r3_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
+ pu1_src = pu1_src + src_strd;
+ src_r4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
+ src_r4_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
+ pu1_src = pu1_src + src_strd;
+
+ //Core Loop: Process all the rows.
+ do
+ {
+ src_r5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
+ src_r5_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
+
+ src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
+ src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
+ src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pi2_temp1), res_t1_8x16b);
+
+ src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
+ src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
+ src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pi2_temp1 + 8), res_t1_8x16b);
+
+ src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_c2_16x8b, src_r1_c2_16x8b);
+ src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_c2_16x8b, src_r3_c2_16x8b);
+ src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_c2_16x8b, src_r5_c2_16x8b);
+
+ res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
+ res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
+ res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
+
+ res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
+ res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pi2_temp1 + 16), res_t1_8x16b);
+
+ src_r0_16x8b = src_r1_16x8b;
+ src_r1_16x8b = src_r2_16x8b;
+ src_r2_16x8b = src_r3_16x8b;
+ src_r3_16x8b = src_r4_16x8b;
+ src_r4_16x8b = src_r5_16x8b;
+
+ src_r0_c2_16x8b = src_r1_c2_16x8b;
+ src_r1_c2_16x8b = src_r2_c2_16x8b;
+ src_r2_c2_16x8b = src_r3_c2_16x8b;
+ src_r3_c2_16x8b = src_r4_c2_16x8b;
+ src_r4_c2_16x8b = src_r5_c2_16x8b;
+
+ ht_temp--;
+ pu1_src = pu1_src + src_strd;
+ pi2_temp1 = pi2_temp1 + 16 + 5;
+ }
+ while(ht_temp > 0);
+ }
+ // horizontal q-pel
+ {
+ __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b;
+ __m128i src_r4_8x16b, src_r5_8x16b;
+ __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
+ __m128i src_hpel1_8x16b, src_hpel2_8x16b, src_hpel_16x8b;
+
+ __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
+ __m128i res_c0_8x16b, res_c1_8x16b, res_16x8b;
+
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
+ __m128i const_val512_4x32b, const_val16_8x16b;
+
+ coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
+ coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
+ coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
+
+ const_val512_4x32b = _mm_set1_epi32(512);
+ const_val16_8x16b = _mm_set1_epi16(16);
+
+ do
+ {
+ src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2));
+ src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 1));
+ src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 2));
+ src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 3));
+ src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 4));
+ src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 5));
+
+ src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(const_val512_4x32b, res_t3_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ res_c0_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
+
+ src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8));
+ src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 1));
+ src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 2));
+ src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 3));
+ src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 4));
+ src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 5));
+
+ src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(const_val512_4x32b, res_t3_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b ,10);
+
+ src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(const_val512_4x32b, res_t3_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ res_c1_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
+ res_16x8b = _mm_packus_epi16(res_c0_8x16b, res_c1_8x16b);
+
+ src_hpel1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3));
+ src_hpel1_8x16b = _mm_add_epi16(src_hpel1_8x16b, const_val16_8x16b);
+ src_hpel1_8x16b = _mm_srai_epi16(src_hpel1_8x16b, 5); //shifting right by 5 bits.
+
+ src_hpel2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3 + 8));
+ src_hpel2_8x16b = _mm_add_epi16(src_hpel2_8x16b, const_val16_8x16b);
+ src_hpel2_8x16b = _mm_srai_epi16(src_hpel2_8x16b, 5); //shifting right by 5 bits.
+
+ src_hpel_16x8b = _mm_packus_epi16(src_hpel1_8x16b, src_hpel2_8x16b);
+ res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst), res_16x8b);
+
+ ht--;
+ pi2_temp2 = pi2_temp2 + 16 + 5;
+ pi2_temp3 = pi2_temp3 + 16 + 5;
+ pu1_dst = pu1_dst + dst_strd;
+ }
+ while(ht > 0);
+ }
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3 */
+/* */
+/* Description : This function implements a six-tap filter vertically and */
+/* horizontally on ht x wd block separately and averages */
+/* the two sets of values to calculate values at (1/2,1/4), */
+/* or (1/2, 3/4) as mentioned in sec. 8.4.2.2.1 titled */
+/* "Luma sample interpolation process". (ht,wd) can be */
+/* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */
+/* */
+/*  Inputs        : pu1_src  - pointer to source                            */
+/*                  pu1_dst  - pointer to destination                       */
+/* src_strd - stride for source */
+/* dst_strd - stride for destination */
+/* ht - height of the block */
+/* wd - width of the block */
+/* pu1_tmp - pointer to temporary buffer */
+/* dydx - x and y reference offset for q-pel */
+/* calculations */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 13 02 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ht,
+ WORD32 wd,
+ UWORD8* pu1_tmp,
+ WORD32 dydx)
+{
+ WORD32 ht_temp;
+ WORD32 y_offset;
+ WORD16 *pi2_temp1,*pi2_temp2,*pi2_temp3;
+
+ y_offset = (dydx & 0xf) >> 2;
+ pi2_temp1 = (WORD16 *)pu1_tmp;
+ pi2_temp2 = pi2_temp1;
+ pi2_temp3 = pi2_temp1 + (y_offset >> 1) * wd;
+
+ ht_temp = ht + 5;
+ pu1_src -= src_strd << 1;
+ pu1_src -= 2;
+ pi2_temp3 += wd << 1;
+ //the filter input starts from x[-2] (till x[3])
+
+ if(wd == 4)
+ {
+ // horizontal half-pel
+ {
+ __m128i src_r0_16x8b, src_r1_16x8b, src_r0r1_t1_16x8b;
+ __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
+ __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
+ __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
+
+ coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
+ coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
+ coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
+
+ //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
+ //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
+
+ do
+ {
+ src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
+ src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
+
+ src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
+ src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
+
+ src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
+ src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
+
+ src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
+ res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
+ //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
+
+ src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0
+ src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0
+
+ src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
+ res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
+ //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
+
+ src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0
+ src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0
+
+ src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
+ res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
+ //b4*c4+b5*c5 b5*c4+b6*c5 b4*c6+b7*c5 b7*c4+b8*c5
+
+ res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
+ res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b);
+
+
+ _mm_storeu_si128((__m128i *)(pi2_temp1), res_r0r1_t1_8x16b);
+
+ ht_temp -= 2;
+ pu1_src = pu1_src + (src_strd << 1);
+ pi2_temp1 = pi2_temp1 + (4 << 1);
+ }
+ while(ht_temp > 0);
+ }
+ // vertical q-pel
+ {
+ __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b;
+ __m128i src_r4_8x16b, src_r5_8x16b, src_r6_8x16b;
+ __m128i src_r0r1_c0_8x16b, src_r2r3_c0_8x16b, src_r4r5_c0_8x16b;
+ __m128i src_hpel_16x8b, src_hpel_8x16b;
+
+ __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
+ __m128i res_8x16b, res_16x8b;
+
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
+ __m128i const_val512_4x32b, const_val16_8x16b;
+ __m128i mask_low32b;
+
+ mask_low32b = _mm_set1_epi8(0xff);
+ const_val512_4x32b = _mm_set1_epi32(512);
+ const_val16_8x16b = _mm_set1_epi16(16);
+ mask_low32b = _mm_srli_si128(mask_low32b, 12);
+
+ coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
+ coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
+ coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
+
+ src_r0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2));
+ src_r1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 4));
+ src_r2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 8));
+ src_r3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 12));
+ src_r4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 16));
+ pi2_temp2 += 20;
+
+ do
+ {
+ src_r5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2));
+ src_r6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 4));
+
+ src_r0r1_c0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_r2r3_c0_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_r4r5_c0_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_c0_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_c0_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_c0_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ src_r0r1_c0_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
+ src_r2r3_c0_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
+ src_r4r5_c0_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_c0_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_c0_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_c0_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
+ res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
+
+ src_hpel_8x16b = _mm_loadu_si128((__m128i *)pi2_temp3);
+ src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
+ src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
+ src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);
+
+ res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
+
+ _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char *)(pu1_dst));
+ res_16x8b = _mm_srli_si128(res_16x8b, 4);
+ _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char *)(pu1_dst + dst_strd));
+
+ src_r0_8x16b = src_r2_8x16b;
+ src_r1_8x16b = src_r3_8x16b;
+ src_r2_8x16b = src_r4_8x16b;
+ src_r3_8x16b = src_r5_8x16b;
+ src_r4_8x16b = src_r6_8x16b;
+
+ ht -= 2;
+ pi2_temp2 = pi2_temp2 + (4 << 1);
+ pi2_temp3 = pi2_temp3 + (4 << 1);
+ pu1_dst = pu1_dst + (dst_strd << 1);
+ }
+ while(ht > 0);
+ }
+ }
+ else if(wd == 8)
+ {
+ // horizontal half-pel
+ {
+ __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
+ __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
+
+ __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
+ __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
+
+ __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
+
+ coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
+ coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
+ coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
+
+ //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
+ //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
+
+ do
+ {
+ src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
+ src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
+
+ src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
+ src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
+
+ src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
+ src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
+
+ res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
+ //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
+ res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
+ //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
+
+ src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
+ src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
+
+ src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
+ src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0
+
+ src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
+ src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10
+
+ res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
+ //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
+ res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3
+ //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
+
+ src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
+ src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0
+
+ src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0
+ src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0
+
+ src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
+ src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
+
+ res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
+ //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
+ res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
+ //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
+ res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
+ res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
+
+ res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
+ res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pi2_temp1), res_r0_t1_8x16b);
+ _mm_storeu_si128((__m128i *)(pi2_temp1 + 8), res_r1_t1_8x16b);
+
+ ht_temp -= 2;
+ pu1_src = pu1_src + (src_strd << 1);
+ pi2_temp1 = pi2_temp1 + (8 << 1);
+ }
+ while(ht_temp > 0);
+ }
+ // vertical q-pel
+ {
+ __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b;
+ __m128i src_r4_8x16b, src_r5_8x16b, src_r6_8x16b;
+ __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
+ __m128i src_hpel_8x16b, src_hpel_16x8b;
+
+ __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
+ __m128i res_8x16b, res_16x8b;
+
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
+ __m128i const_val512_4x32b, const_val16_8x16b;
+
+ coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
+ coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
+ coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
+
+ const_val512_4x32b = _mm_set1_epi32(512);
+ const_val16_8x16b = _mm_set1_epi16(16);
+
+ src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2));
+ src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8));
+ src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16));
+ src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 24));
+ src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 32));
+ pi2_temp2 += 40;
+
+ do
+ {
+ src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2));
+ src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8));
+
+ src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
+ res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
+
+ src_hpel_8x16b = _mm_loadu_si128((__m128i *)pi2_temp3);
+ src_hpel_8x16b = _mm_add_epi16(const_val16_8x16b, src_hpel_8x16b);
+ src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
+ src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);
+
+ res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b);
+
+ src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
+ src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
+ src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
+ src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
+ src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
+ res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
+
+ src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3 + 8));
+ src_hpel_8x16b = _mm_add_epi16(const_val16_8x16b, src_hpel_8x16b);
+ src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
+ src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);
+
+ res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);
+
+ src_r0_8x16b = src_r2_8x16b;
+ src_r1_8x16b = src_r3_8x16b;
+ src_r2_8x16b = src_r4_8x16b;
+ src_r3_8x16b = src_r5_8x16b;
+ src_r4_8x16b = src_r6_8x16b;
+
+ ht -= 2;
+ pi2_temp2 = pi2_temp2 + (8 << 1);
+ pi2_temp3 = pi2_temp3 + (8 << 1);
+ pu1_dst = pu1_dst + (dst_strd << 1);
+ }
+ while(ht > 0);
+ }
+ }
+ else // wd == 16
+ {
+ UWORD8 *pu1_dst1;
+ WORD16 *pi2_temp4,*pi2_temp5;
+
+ pu1_dst1 = pu1_dst + 8;
+ pi2_temp4 = pi2_temp2 + 8;
+ pi2_temp5 = pi2_temp3 + 8;
+
+ // horizontal half-pel
+ {
+ __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
+ __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
+
+ __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
+ __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
+
+ __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
+
+ coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
+ coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
+ coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
+
+ //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
+ //Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
+ //b0 is the same as a8. Similarly, every bn pixel is the same as the a(n+8) pixel.
+
+ do
+ {
+ src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
+ src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
+
+ src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
+ src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
+
+ src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
+ src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
+
+ res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
+ //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
+ res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
+ //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
+
+ src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
+ src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
+
+ src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
+ src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0
+
+ src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
+ src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10
+
+ res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
+ //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
+ res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
+ //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
+
+ src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
+ src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0
+
+ src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0
+ src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0
+
+ src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
+ src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
+
+ res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
+ //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
+ res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
+ //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
+ res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
+ res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
+
+ res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
+ res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pi2_temp1), res_r0_t1_8x16b);
+ _mm_storeu_si128((__m128i *)(pi2_temp1 + 8), res_r1_t1_8x16b);
+
+ ht_temp--;
+ pu1_src = pu1_src + src_strd;
+ pi2_temp1 = pi2_temp1 + 16;
+ }
+ while(ht_temp > 0);
+ }
+ // vertical q-pel
+ {
+ __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b, src_r4_8x16b;
+ __m128i src_r5_8x16b, src_r6_8x16b;
+ __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
+ __m128i src_hpel_8x16b, src_hpel_16x8b;
+
+ __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
+ __m128i res_8x16b, res_16x8b;
+
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
+ __m128i const_val512_4x32b, const_val16_8x16b;
+
+ coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
+ coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
+ coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
+
+ const_val512_4x32b = _mm_set1_epi32(512);
+ const_val16_8x16b = _mm_set1_epi16(16);
+
+ /**********************************************************/
+ /* Do first height x 8 block */
+ /**********************************************************/
+ src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2));
+ src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16));
+ src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 32));
+ src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 48));
+ src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 64));
+ pi2_temp2 += 80;
+
+ ht_temp = ht;
+ do
+ {
+ src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2));
+ src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16));
+
+ src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
+ res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
+
+ src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3));
+ src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
+ src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
+ src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);
+
+ res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b);
+
+ src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
+ src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
+ src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
+ src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
+ src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
+ res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
+
+ src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3 + 16));
+ src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
+ src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
+ src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);
+
+ res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);
+
+ src_r0_8x16b = src_r2_8x16b;
+ src_r1_8x16b = src_r3_8x16b;
+ src_r2_8x16b = src_r4_8x16b;
+ src_r3_8x16b = src_r5_8x16b;
+ src_r4_8x16b = src_r6_8x16b;
+
+ ht_temp -= 2;
+ pi2_temp3 = pi2_temp3 + (16 << 1);
+ pi2_temp2 = pi2_temp2 + (16 << 1);
+ pu1_dst = pu1_dst + (dst_strd << 1);
+ }
+ while(ht_temp > 0);
+
+ /**********************************************************/
+ /* Do second height * 8 block */
+ /**********************************************************/
+ src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4));
+ src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 16));
+ src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 32));
+ src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 48));
+ src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 64));
+ pi2_temp4 += 80;
+
+ do
+ {
+ src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4));
+ src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 16));
+
+ src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
+ src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
+ src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
+ res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
+
+ src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp5));
+ src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
+ src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
+ src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);
+
+ res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst1), res_16x8b);
+
+ src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
+ src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
+ src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
+ src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
+ src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);
+
+ res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
+ res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
+ res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
+
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
+ res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
+ res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
+ res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
+
+ res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
+ res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
+
+ src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp5 + 16));
+ src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
+ src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
+ src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);
+
+ res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst1 + dst_strd), res_16x8b);
+
+ src_r0_8x16b = src_r2_8x16b;
+ src_r1_8x16b = src_r3_8x16b;
+ src_r2_8x16b = src_r4_8x16b;
+ src_r3_8x16b = src_r5_8x16b;
+ src_r4_8x16b = src_r6_8x16b;
+
+ ht -= 2;
+ pi2_temp5 = pi2_temp5 + (16 << 1);
+ pi2_temp4 = pi2_temp4 + (16 << 1);
+ pu1_dst1 = pu1_dst1 + (dst_strd << 1);
+ }
+ while(ht > 0);
+ }
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_inter_pred_chroma_ssse3 */
+/* */
+/* Description : This function implements a four-tap 2D filter as */
+/* mentioned in sec. 8.4.2.2.2 titled "Chroma sample */
+/* interpolation process". (ht,wd) can be (2,2), (4,2), */
+/* (2,4), (4,4), (8,4), (4,8) or (8,8). */
+/* */
+/* Inputs : puc_src - pointer to source */
+/* puc_dst - pointer to destination */
+/* src_strd - stride for source */
+/* dst_strd - stride for destination */
+/* dx - x position of destination value */
+/* dy - y position of destination value */
+/* ht - height of the block */
+/* wd - width of the block */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 13 02 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_inter_pred_chroma_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 dx,
+ WORD32 dy,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 i, j, A, B, C, D;
+
+ // Bilinear weights of the 2D four-tap chroma filter (H.264 sec. 8.4.2.2.2):
+ // A = (8-dx)*(8-dy), B = dx*(8-dy), C = (8-dx)*dy, D = dx*dy.
+ // They always sum to 64, which matches the (val + 32) >> 6 rounding below.
+ i = 8 - dx;
+ j = 8 - dy;
+
+ A = i * j;
+ B = dx * j;
+ C = i * dy;
+ D = dx * dy;
+
+ if(wd == 2)
+ {
+ WORD32 tmp1, tmp2, tmp3, tmp4;
+
+ // Scalar path: the source is interleaved UV, so a sample's horizontal
+ // neighbour lives 2 bytes away (hence the +2/+4 offsets). Two output
+ // rows are filtered per iteration; ht is even for all (ht, 2) cases.
+ do
+ {
+ //U
+ tmp1 = A * pu1_src[0] + B * pu1_src[2] + C * pu1_src[src_strd] + D * pu1_src[src_strd + 2];
+ tmp2 = A * pu1_src[2] + B * pu1_src[4] + C * pu1_src[src_strd + 2] + D * pu1_src[src_strd + 4];
+ //V
+ tmp3 = A * pu1_src[1] + B * pu1_src[3] + C * pu1_src[src_strd + 1] + D * pu1_src[src_strd + 3];
+ tmp4 = A * pu1_src[3] + B * pu1_src[5] + C * pu1_src[src_strd + 3] + D * pu1_src[src_strd + 5];
+
+ tmp1 = (tmp1 + 32) >> 6;
+ tmp2 = (tmp2 + 32) >> 6;
+ tmp3 = (tmp3 + 32) >> 6;
+ tmp4 = (tmp4 + 32) >> 6;
+
+ pu1_dst[0] = CLIP_U8(tmp1);
+ pu1_dst[2] = CLIP_U8(tmp2);
+ pu1_dst[1] = CLIP_U8(tmp3);
+ pu1_dst[3] = CLIP_U8(tmp4);
+
+ pu1_src += src_strd;
+ pu1_dst += dst_strd;
+
+ // Second row of the pair: identical filtering, one stride further down.
+ tmp1 = A * pu1_src[0] + B * pu1_src[2] + C * pu1_src[src_strd] + D * pu1_src[src_strd + 2];
+ tmp2 = A * pu1_src[2] + B * pu1_src[4] + C * pu1_src[src_strd + 2] + D * pu1_src[src_strd + 4];
+ tmp3 = A * pu1_src[1] + B * pu1_src[3] + C * pu1_src[src_strd + 1] + D * pu1_src[src_strd + 3];
+ tmp4 = A * pu1_src[3] + B * pu1_src[5] + C * pu1_src[src_strd + 3] + D * pu1_src[src_strd + 5];
+
+ tmp1 = (tmp1 + 32) >> 6;
+ tmp2 = (tmp2 + 32) >> 6;
+ tmp3 = (tmp3 + 32) >> 6;
+ tmp4 = (tmp4 + 32) >> 6;
+
+ pu1_dst[0] = CLIP_U8(tmp1);
+ pu1_dst[2] = CLIP_U8(tmp2);
+ pu1_dst[1] = CLIP_U8(tmp3);
+ pu1_dst[3] = CLIP_U8(tmp4);
+
+ ht -= 2;
+ pu1_src += src_strd;
+ pu1_dst += dst_strd;
+ }
+ while(ht > 0);
+
+ // NOTE(review): disabled SSSE3 variant of the wd == 2 path, kept below
+ // for reference; the scalar loop above is the live implementation.
+ /*
+ WORD32 AB, CD;
+
+ __m128i src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
+ __m128i src_r1r2_16x8b, src_r2r3_16x8b;
+ __m128i res_AB_8x16b, res_CD_8x16b, res_8x16b, res_16x8b;
+ __m128i mask_low32b;
+
+ __m128i coeffAB_16x8b, coeffCD_16x8b, round_add32_8x16b;
+ __m128i const_shuff_16x8b;
+
+ AB = (B << 8) + A;
+ CD = (D << 8) + C;
+
+ coeffAB_16x8b = _mm_set1_epi16(AB);
+ coeffCD_16x8b = _mm_set1_epi16(CD);
+
+ round_add32_8x16b = _mm_set1_epi16(32);
+
+ mask_low32b = _mm_set1_epi8(0xff);
+ src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); //u1[0] v1[0] u1[1] v1[1] u1[2] v1[2] u1[3] v1[3]
+ pu1_src += src_strd;
+
+ const_shuff_16x8b = _mm_setr_epi32(0x03010200, 0x05030402, 0x0b090a08, 0x0d0b0c0a);
+ mask_low32b = _mm_srli_si128(mask_low32b, 12);
+
+ do
+ {
+ src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); //u2[0] v2[0] u2[1] v2[1] u1[2] v2[2] u2[3] v2[3]
+ src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); //u3[0] v3[0] u3[1] v3[1] u3[2] v3[2] u3[3] v3[3]
+
+ src_r1r2_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b);
+ src_r2r3_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b);
+
+ src_r1r2_16x8b = _mm_shuffle_epi8(src_r1r2_16x8b, const_shuff_16x8b); //u1[0] u1[1] v1[0] v1[1] u1[1] u1[2] v1[1] v1[2]
+ //u2[0] u2[1] v2[0] v2[1] u2[1] u2[2] v2[1] v2[2]
+ src_r2r3_16x8b = _mm_shuffle_epi8(src_r2r3_16x8b, const_shuff_16x8b); //u2[0] u2[1] v2[0] v2[1] u2[1] u2[2] v2[1] v2[2]
+ //u3[0] u3[1] v3[0] v3[1] u3[1] u3[2] v3[1] v3[2]
+ res_AB_8x16b = _mm_maddubs_epi16(src_r1r2_16x8b, coeffAB_16x8b);
+ res_CD_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeffCD_16x8b);
+
+ res_8x16b = _mm_add_epi16(res_AB_8x16b, round_add32_8x16b);
+ res_8x16b = _mm_add_epi16(res_8x16b, res_CD_8x16b);
+ res_8x16b = _mm_srai_epi16(res_8x16b, 6);
+ res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
+
+ _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char *)pu1_dst);
+
+ ht -= 2;
+ pu1_src += src_strd << 1;
+ res_16x8b = _mm_srli_si128(res_16x8b, 4);
+ src_r1_16x8b = src_r3_16x8b;
+
+ _mm_maskmoveu_si128(res_16x8b, mask_low32b, (char *)(pu1_dst + dst_strd));
+
+ pu1_dst += dst_strd << 1;
+ }
+ while(ht > 0);
+ */
+ }
+ else if(wd == 4)
+ {
+ WORD32 AB, CD;
+
+ __m128i src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
+ __m128i res1_AB_8x16b, res1_CD_8x16b, res1_8x16b, res1_16x8b;
+ __m128i res2_AB_8x16b, res2_CD_8x16b, res2_8x16b, res2_16x8b;
+
+ __m128i coeffAB_16x8b, coeffCD_16x8b, round_add32_8x16b;
+ __m128i const_shuff_16x8b;
+
+ // Weights packed as byte pairs for _mm_maddubs_epi16: each 16-bit lane
+ // computes A*p + B*p_next on the current row and C*p + D*p_next on the
+ // row below.
+ AB = (B << 8) + A;
+ CD = (D << 8) + C;
+
+ coeffAB_16x8b = _mm_set1_epi16(AB);
+ coeffCD_16x8b = _mm_set1_epi16(CD);
+
+ round_add32_8x16b = _mm_set1_epi16(32);
+
+ // The shuffle pairs every byte with the byte 2 positions ahead, i.e.
+ // each chroma sample with its horizontal neighbour of the same plane
+ // (U with next U, V with next V) in the interleaved UV stream.
+ const_shuff_16x8b = _mm_setr_epi32(0x03010200, 0x05030402, 0x07050604, 0x09070806);
+
+ src_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ src_r1_16x8b = _mm_shuffle_epi8(src_r1_16x8b, const_shuff_16x8b);
+ pu1_src += src_strd;
+
+ // Two output rows per iteration; the bottom source row (r3) is carried
+ // over as the next iteration's top row (r1).
+ do
+ {
+ src_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ src_r3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
+
+ src_r2_16x8b = _mm_shuffle_epi8(src_r2_16x8b, const_shuff_16x8b);
+ src_r3_16x8b = _mm_shuffle_epi8(src_r3_16x8b, const_shuff_16x8b);
+
+ res1_AB_8x16b = _mm_maddubs_epi16(src_r1_16x8b, coeffAB_16x8b);
+ res1_CD_8x16b = _mm_maddubs_epi16(src_r2_16x8b, coeffCD_16x8b);
+ res2_AB_8x16b = _mm_maddubs_epi16(src_r2_16x8b, coeffAB_16x8b);
+ res2_CD_8x16b = _mm_maddubs_epi16(src_r3_16x8b, coeffCD_16x8b);
+
+ res1_8x16b = _mm_add_epi16(res1_AB_8x16b, res1_CD_8x16b);
+ res2_8x16b = _mm_add_epi16(res2_AB_8x16b, res2_CD_8x16b);
+ res1_8x16b = _mm_add_epi16(res1_8x16b, round_add32_8x16b);
+ res2_8x16b = _mm_add_epi16(res2_8x16b, round_add32_8x16b);
+
+ res1_8x16b = _mm_srai_epi16(res1_8x16b, 6);
+ res2_8x16b = _mm_srai_epi16(res2_8x16b, 6);
+
+ res1_16x8b = _mm_packus_epi16(res1_8x16b, res1_8x16b);
+ res2_16x8b = _mm_packus_epi16(res2_8x16b, res2_8x16b);
+
+ _mm_storel_epi64((__m128i *)pu1_dst, res1_16x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
+
+ src_r1_16x8b = src_r3_16x8b;
+
+ ht -= 2;
+ pu1_src += src_strd << 1;
+ pu1_dst += dst_strd << 1;
+ }
+ while(ht > 0);
+ }
+ else // wd == 8
+ {
+ WORD32 AB, CD;
+
+ __m128i src_r1l_16x8b, src_r2l_16x8b;
+ __m128i src_r1h_16x8b, src_r2h_16x8b;
+
+ __m128i res_l_AB_8x16b, res_l_CD_8x16b;
+ __m128i res_h_AB_8x16b, res_h_CD_8x16b;
+ __m128i res_l_8x16b, res_h_8x16b, res_16x8b;
+
+ __m128i coeffAB_16x8b, coeffCD_16x8b, round_add32_8x16b;
+ __m128i const_shuff_16x8b;
+
+ // Same weight packing as the wd == 4 path: (A | B<<8) and (C | D<<8)
+ // byte pairs for _mm_maddubs_epi16.
+ AB = (B << 8) + A;
+ CD = (D << 8) + C;
+
+ coeffAB_16x8b = _mm_set1_epi16(AB);
+ coeffCD_16x8b = _mm_set1_epi16(CD);
+
+ round_add32_8x16b = _mm_set1_epi16(32);
+
+ const_shuff_16x8b = _mm_setr_epi32(0x03010200, 0x05030402, 0x07050604, 0x09070806);
+
+ // Prime the loop with the first source row, processed as low (l) and
+ // high (h) 8-byte halves of the 16-byte-wide interleaved UV row.
+ src_r1l_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ src_r1h_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));
+
+ src_r1l_16x8b = _mm_shuffle_epi8(src_r1l_16x8b, const_shuff_16x8b);
+ src_r1h_16x8b = _mm_shuffle_epi8(src_r1h_16x8b, const_shuff_16x8b);
+
+ pu1_src += src_strd;
+
+ // Four output rows per iteration, with the r1/r2 buffers swapping roles
+ // each row: the freshly loaded row is the C/D (bottom) operand of the
+ // current output row and the A/B (top) operand of the next. ht is a
+ // multiple of 4 here ((4,8) and (8,8) are the only wd == 8 cases).
+ do
+ {
+ //row 1
+ src_r2l_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ src_r2h_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));
+
+ src_r2l_16x8b = _mm_shuffle_epi8(src_r2l_16x8b, const_shuff_16x8b);
+ src_r2h_16x8b = _mm_shuffle_epi8(src_r2h_16x8b, const_shuff_16x8b);
+
+ res_l_AB_8x16b = _mm_maddubs_epi16(src_r1l_16x8b, coeffAB_16x8b);
+ res_h_AB_8x16b = _mm_maddubs_epi16(src_r1h_16x8b, coeffAB_16x8b);
+ res_l_CD_8x16b = _mm_maddubs_epi16(src_r2l_16x8b, coeffCD_16x8b);
+ res_h_CD_8x16b = _mm_maddubs_epi16(src_r2h_16x8b, coeffCD_16x8b);
+
+ res_l_8x16b = _mm_add_epi16(res_l_AB_8x16b, round_add32_8x16b);
+ res_h_8x16b = _mm_add_epi16(res_h_AB_8x16b, round_add32_8x16b);
+ res_l_8x16b = _mm_add_epi16(res_l_8x16b, res_l_CD_8x16b);
+ res_h_8x16b = _mm_add_epi16(res_h_8x16b, res_h_CD_8x16b);
+
+ res_l_8x16b = _mm_srai_epi16(res_l_8x16b, 6);
+ res_h_8x16b = _mm_srai_epi16(res_h_8x16b, 6);
+
+ res_16x8b = _mm_packus_epi16(res_l_8x16b, res_h_8x16b);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);
+
+ pu1_src += src_strd;
+ pu1_dst += dst_strd;
+
+ //row 2
+ src_r1l_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ src_r1h_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));
+
+ src_r1l_16x8b = _mm_shuffle_epi8(src_r1l_16x8b, const_shuff_16x8b);
+ src_r1h_16x8b = _mm_shuffle_epi8(src_r1h_16x8b, const_shuff_16x8b);
+
+ res_l_AB_8x16b = _mm_maddubs_epi16(src_r2l_16x8b, coeffAB_16x8b);
+ res_h_AB_8x16b = _mm_maddubs_epi16(src_r2h_16x8b, coeffAB_16x8b);
+ res_l_CD_8x16b = _mm_maddubs_epi16(src_r1l_16x8b, coeffCD_16x8b);
+ res_h_CD_8x16b = _mm_maddubs_epi16(src_r1h_16x8b, coeffCD_16x8b);
+
+ res_l_8x16b = _mm_add_epi16(res_l_AB_8x16b, round_add32_8x16b);
+ res_h_8x16b = _mm_add_epi16(res_h_AB_8x16b, round_add32_8x16b);
+ res_l_8x16b = _mm_add_epi16(res_l_8x16b, res_l_CD_8x16b);
+ res_h_8x16b = _mm_add_epi16(res_h_8x16b, res_h_CD_8x16b);
+
+ res_l_8x16b = _mm_srai_epi16(res_l_8x16b, 6);
+ res_h_8x16b = _mm_srai_epi16(res_h_8x16b, 6);
+
+ res_16x8b = _mm_packus_epi16(res_l_8x16b, res_h_8x16b);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);
+
+ pu1_src += src_strd;
+ pu1_dst += dst_strd;
+
+ //row 3
+ src_r2l_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ src_r2h_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));
+
+ src_r2l_16x8b = _mm_shuffle_epi8(src_r2l_16x8b, const_shuff_16x8b);
+ src_r2h_16x8b = _mm_shuffle_epi8(src_r2h_16x8b, const_shuff_16x8b);
+
+ res_l_AB_8x16b = _mm_maddubs_epi16(src_r1l_16x8b, coeffAB_16x8b);
+ res_h_AB_8x16b = _mm_maddubs_epi16(src_r1h_16x8b, coeffAB_16x8b);
+ res_l_CD_8x16b = _mm_maddubs_epi16(src_r2l_16x8b, coeffCD_16x8b);
+ res_h_CD_8x16b = _mm_maddubs_epi16(src_r2h_16x8b, coeffCD_16x8b);
+
+ res_l_8x16b = _mm_add_epi16(res_l_AB_8x16b, round_add32_8x16b);
+ res_h_8x16b = _mm_add_epi16(res_h_AB_8x16b, round_add32_8x16b);
+ res_l_8x16b = _mm_add_epi16(res_l_8x16b, res_l_CD_8x16b);
+ res_h_8x16b = _mm_add_epi16(res_h_8x16b, res_h_CD_8x16b);
+
+ res_l_8x16b = _mm_srai_epi16(res_l_8x16b, 6);
+ res_h_8x16b = _mm_srai_epi16(res_h_8x16b, 6);
+
+ res_16x8b = _mm_packus_epi16(res_l_8x16b, res_h_8x16b);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);
+
+ pu1_src += src_strd;
+ pu1_dst += dst_strd;
+
+ //row 4
+ src_r1l_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ src_r1h_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));
+
+ src_r1l_16x8b = _mm_shuffle_epi8(src_r1l_16x8b, const_shuff_16x8b);
+ src_r1h_16x8b = _mm_shuffle_epi8(src_r1h_16x8b, const_shuff_16x8b);
+
+ res_l_AB_8x16b = _mm_maddubs_epi16(src_r2l_16x8b, coeffAB_16x8b);
+ res_h_AB_8x16b = _mm_maddubs_epi16(src_r2h_16x8b, coeffAB_16x8b);
+ res_l_CD_8x16b = _mm_maddubs_epi16(src_r1l_16x8b, coeffCD_16x8b);
+ res_h_CD_8x16b = _mm_maddubs_epi16(src_r1h_16x8b, coeffCD_16x8b);
+
+ res_l_8x16b = _mm_add_epi16(res_l_AB_8x16b, round_add32_8x16b);
+ res_h_8x16b = _mm_add_epi16(res_h_AB_8x16b, round_add32_8x16b);
+ res_l_8x16b = _mm_add_epi16(res_l_8x16b, res_l_CD_8x16b);
+ res_h_8x16b = _mm_add_epi16(res_h_8x16b, res_h_CD_8x16b);
+
+ res_l_8x16b = _mm_srai_epi16(res_l_8x16b, 6);
+ res_h_8x16b = _mm_srai_epi16(res_h_8x16b, 6);
+
+ res_16x8b = _mm_packus_epi16(res_l_8x16b, res_h_8x16b);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);
+
+ ht -= 4;
+ pu1_src += src_strd;
+ pu1_dst += dst_strd;
+ }
+ while(ht > 0);
+ }
+}
diff --git a/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c b/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c
new file mode 100755
index 0000000..d43c8e2
--- /dev/null
+++ b/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c
@@ -0,0 +1,437 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_iquant_itrans_recon_dc_ssse3.c
+ *
+ * @brief
+ * Contains function definitions for inverse quantization, inverse
+ * transform and reconstruction
+ *
+ * @author
+ * Mohit [100664]
+ *
+ * @par List of Functions:
+ *  - ih264_iquant_itrans_recon_4x4_dc_ssse3()
+ *  - ih264_iquant_itrans_recon_8x8_dc_ssse3()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+/* User include files */
+#include "ih264_typedefs.h"
+#include "ih264_defs.h"
+#include "ih264_trans_macros.h"
+#include "ih264_macros.h"
+#include "ih264_platform_macros.h"
+#include "ih264_trans_data.h"
+#include "ih264_size_defs.h"
+#include "ih264_structs.h"
+#include "ih264_trans_quant_itrans_iquant.h"
+#include <immintrin.h>
+
+/*
+ ********************************************************************************
+ *
+ * @brief This function reconstructs a 4x4 sub block from quantized residue and
+ * prediction buffer for dc input pattern only, i.e. only the (0,0) element of the input
+ * 4x4 block is non-zero. For complete function, refer ih264_iquant_itrans_recon_ssse3.c
+ *
+ * @par Description:
+ * The quantized residue is first inverse quantized, then inverse transformed.
+ * This inverse transformed content is added to the prediction buffer to recon-
+ * struct the end output
+ *
+ * @param[in] pi2_src
+ * quantized 4x4 block
+ *
+ * @param[in] pu1_pred
+ * prediction 4x4 block
+ *
+ * @param[out] pu1_out
+ * reconstructed 4x4 block
+ *
+ * @param[in] src_strd
+ * quantization buffer stride
+ *
+ * @param[in] pred_strd,
+ * Prediction buffer stride
+ *
+ * @param[in] out_strd
+ * recon buffer Stride
+ *
+ * @param[in] pu2_scaling_list
+ * pointer to scaling list
+ *
+ * @param[in] pu2_norm_adjust
+ * pointer to inverse scale matrix
+ *
+ * @param[in] u4_qp_div_6
+ * Floor (qp/6)
+ *
+ * @param[in] pi4_tmp
+ * temporary buffer of size 1*16
+ *
+ * @returns none
+ *
+ * @remarks none
+ *
+ *******************************************************************************
+ */
+void ih264_iquant_itrans_recon_4x4_dc_ssse3(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_out,
+ WORD32 pred_strd,
+ WORD32 out_strd,
+ const UWORD16 *pu2_iscal_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 u4_qp_div_6,
+ WORD16 *pi2_tmp,
+ WORD32 iq_start_idx,
+ WORD16 *pi2_dc_ld_addr)
+{
+ UWORD32 *pu4_out = (UWORD32 *)pu1_out;
+ WORD32 q0 = pi2_src[0];
+ WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
+ INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
+
+ if (iq_start_idx != 0 )
+ q0 = pi2_dc_ld_addr[0]; // Restoring dc value for intra case
+
+ i_macro = ((q0 + 32) >> 6);
+
+ __m128i predload_r,pred_r0, pred_r1, pred_r2, pred_r3;
+ __m128i sign_reg;
+ __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
+ __m128i temp4, temp5, temp6, temp7;
+ __m128i value_add = _mm_set1_epi16(i_macro);
+
+ zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
+ //Load pred buffer
+ predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r0 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p00 p01 p02 p03 0 0 0 0 -- all 16 bits
+ predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p10 p11 p12 p13 0 0 0 0 -- all 16 bits
+ predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[2*pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r2 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p20 p21 p22 p23 0 0 0 0 -- all 16 bits
+ predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[3*pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r3 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p30 p31 p32 p33 0 0 0 0 -- all 16 bits
+
+ pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1); //p00 p01 p02 p03 p10 p11 p12 p13
+ pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3); //p20 p21 p22p p23 p30 p31 p32 p33
+
+ temp4 = _mm_add_epi16(value_add, pred_r0);
+ temp5 = _mm_add_epi16(value_add, pred_r2);
+ /*------------------------------------------------------------------*/
+ //Clipping the results to 8 bits
+ sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b); // sign check
+ temp4 = _mm_and_si128(temp4, sign_reg);
+ sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b); // sign check
+ temp5 = _mm_and_si128(temp5, sign_reg);
+
+ temp4 = _mm_packus_epi16(temp4,temp5);
+ temp5 = _mm_srli_si128(temp4,4);
+ temp6 = _mm_srli_si128(temp5,4);
+ temp7 = _mm_srli_si128(temp6,4);
+
+ *pu4_out = _mm_cvtsi128_si32(temp4);
+ pu1_out += out_strd;
+ pu4_out = (UWORD32 *)(pu1_out);
+ *(pu4_out) = _mm_cvtsi128_si32(temp5);
+ pu1_out += out_strd;
+ pu4_out = (UWORD32 *)(pu1_out);
+ *(pu4_out) = _mm_cvtsi128_si32(temp6);
+ pu1_out += out_strd;
+ pu4_out = (UWORD32 *)(pu1_out);
+ *(pu4_out) = _mm_cvtsi128_si32(temp7);
+}
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse quant and Inverse transform type Ci4 for 8x8 block
+ * for dc input pattern only, i.e. only the (0,0) element of the input 8x8 block is
+ * non-zero. For complete function, refer ih264_iquant_itrans_recon_ssse3.c
+ *
+ * @par Description:
+ * Performs inverse transform Ci8 and adds the residue to get the
+ * reconstructed block
+ *
+ * @param[in] pi2_src
+ * Input 8x8coefficients
+ *
+ * @param[in] pu1_pred
+ * Prediction 8x8 block
+ *
+ * @param[out] pu1_recon
+ * Output 8x8 block
+ *
+ * @param[in] q_div
+ * QP/6
+ *
+ * @param[in] q_rem
+ * QP%6
+ *
+ * @param[in] q_lev
+ * Quantizer level
+ *
+ * @param[in] u4_src_stride
+ * Input stride
+ *
+ * @param[in] u4_pred_stride,
+ * Prediction stride
+ *
+ * @param[in] u4_out_stride
+ * Output Stride
+ *
+ * @param[in] pi4_tmp
+ * temporary buffer of size 1*64
+ * the tmp for each block
+ *
+ * @param[in] pu4_iquant_mat
+ * Pointer to the inverse quantization matrix
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ih264_iquant_itrans_recon_8x8_dc_ssse3 (WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_out,
+ WORD32 pred_strd,
+ WORD32 out_strd,
+ const UWORD16 *pu2_iscale_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 qp_div,
+ WORD16 *pi2_tmp,
+ WORD32 iq_start_idx,
+ WORD16 *pi2_dc_ld_addr)
+{
+ WORD32 q0 = pi2_src[0];
+ WORD16 i_macro, rnd_fact = (qp_div < 6) ? 1 << (5 - qp_div) : 0;
+ INV_QUANT(q0, pu2_iscale_mat[0], pu2_weigh_mat[0], qp_div, rnd_fact, 6);
+ i_macro = ((q0 + 32) >> 6);
+
+ __m128i predload_r,pred_r0, pred_r1, pred_r2, pred_r3,pred_r4,pred_r5,pred_r6,pred_r7;
+ __m128i sign_reg;
+ __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
+ __m128i temp1,temp2,temp3,temp4, temp5, temp6, temp7,temp8;
+ __m128i value_add = _mm_set1_epi16(i_macro);
+
+ //Load pred buffer row 0
+ predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[0])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r0 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
+ //Load pred buffer row 1
+ predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
+ //Load pred buffer row 2
+ predload_r = _mm_loadl_epi64(
+ (__m128i *)(&pu1_pred[2 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r2 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
+ //Load pred buffer row 3
+ predload_r = _mm_loadl_epi64(
+ (__m128i *)(&pu1_pred[3 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r3 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
+ //Load pred buffer row 4
+ predload_r = _mm_loadl_epi64(
+ (__m128i *)(&pu1_pred[4 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r4 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
+ //Load pred buffer row 5
+ predload_r = _mm_loadl_epi64(
+ (__m128i *)(&pu1_pred[5 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bit
+ pred_r5 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
+ //Load pred buffer row 6
+ predload_r = _mm_loadl_epi64(
+ (__m128i *)(&pu1_pred[6 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r6 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
+ //Load pred buffer row 7
+ predload_r = _mm_loadl_epi64(
+ (__m128i *)(&pu1_pred[7 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r7 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
+
+ temp1 = _mm_add_epi16(value_add, pred_r0);
+
+ temp2 = _mm_add_epi16(value_add, pred_r1);
+
+ temp3 = _mm_add_epi16(value_add, pred_r2);
+
+ temp4 = _mm_add_epi16(value_add, pred_r3);
+
+ temp5 = _mm_add_epi16(value_add, pred_r4);
+
+ temp6 = _mm_add_epi16(value_add, pred_r5);
+
+ temp7 = _mm_add_epi16(value_add, pred_r6);
+
+ temp8 = _mm_add_epi16(value_add, pred_r7);
+ /*------------------------------------------------------------------*/
+ //Clipping the results to 8 bits
+ sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b); // sign check
+ temp1 = _mm_and_si128(temp1, sign_reg);
+ sign_reg = _mm_cmpgt_epi16(temp2, zero_8x16b); // sign check
+ temp2 = _mm_and_si128(temp2, sign_reg);
+ sign_reg = _mm_cmpgt_epi16(temp3, zero_8x16b); // sign check
+ temp3 = _mm_and_si128(temp3, sign_reg);
+ sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b); // sign check
+ temp4 = _mm_and_si128(temp4, sign_reg);
+ sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b); // sign check
+ temp5 = _mm_and_si128(temp5, sign_reg);
+ sign_reg = _mm_cmpgt_epi16(temp6, zero_8x16b); // sign check
+ temp6 = _mm_and_si128(temp6, sign_reg);
+ sign_reg = _mm_cmpgt_epi16(temp7, zero_8x16b); // sign check
+ temp7 = _mm_and_si128(temp7, sign_reg);
+ sign_reg = _mm_cmpgt_epi16(temp8, zero_8x16b); // sign check
+ temp8 = _mm_and_si128(temp8, sign_reg);
+
+ temp1 = _mm_packus_epi16(temp1, zero_8x16b);
+ temp2 = _mm_packus_epi16(temp2, zero_8x16b);
+ temp3 = _mm_packus_epi16(temp3, zero_8x16b);
+ temp4 = _mm_packus_epi16(temp4, zero_8x16b);
+ temp5 = _mm_packus_epi16(temp5, zero_8x16b);
+ temp6 = _mm_packus_epi16(temp6, zero_8x16b);
+ temp7 = _mm_packus_epi16(temp7, zero_8x16b);
+ temp8 = _mm_packus_epi16(temp8, zero_8x16b);
+
+ _mm_storel_epi64((__m128i *)(&pu1_out[0]), temp1);
+ _mm_storel_epi64((__m128i *)(&pu1_out[out_strd]), temp2);
+ _mm_storel_epi64((__m128i *)(&pu1_out[2 * out_strd]), temp3);
+ _mm_storel_epi64((__m128i *)(&pu1_out[3 * out_strd]), temp4);
+ _mm_storel_epi64((__m128i *)(&pu1_out[4 * out_strd]), temp5);
+ _mm_storel_epi64((__m128i *)(&pu1_out[5 * out_strd]), temp6);
+ _mm_storel_epi64((__m128i *)(&pu1_out[6 * out_strd]), temp7);
+ _mm_storel_epi64((__m128i *)(&pu1_out[7 * out_strd]), temp8);
+}
+
+/*
+ ********************************************************************************
+ *
+ * @brief This function reconstructs a 4x4 sub block from quantized chroma residue and
+ * prediction buffer
+ *
+ * @par Description:
+ * The quantized residue is first inverse quantized, then inverse transformed.
+ * This inverse transformed content is added to the prediction buffer to recon-
+ * struct the end output
+ *
+ * @param[in] pi2_src
+ * quantized 4x4 block
+ *
+ * @param[in] pu1_pred
+ * prediction 4x4 block
+ *
+ * @param[out] pu1_out
+ * reconstructed 4x4 block
+ *
+ * @param[in] src_strd
+ * quantization buffer stride
+ *
+ * @param[in] pred_strd,
+ * Prediction buffer stride
+ *
+ * @param[in] out_strd
+ * recon buffer Stride
+ *
+ * @param[in] pu2_scaling_list
+ * pointer to scaling list
+ *
+ * @param[in] pu2_norm_adjust
+ * pointer to inverse scale matrix
+ *
+ * @param[in] u4_qp_div_6
+ * Floor (qp/6)
+ *
+ * @param[in] pi4_tmp
+ * temporary buffer of size 1*16
+ *
+ * @returns none
+ *
+ * @remarks none
+ *
+ *******************************************************************************
+ */
+void ih264_iquant_itrans_recon_chroma_4x4_dc_ssse3(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_out,
+ WORD32 pred_strd,
+ WORD32 out_strd,
+ const UWORD16 *pu2_iscal_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 u4_qp_div_6,
+ WORD16 *pi2_tmp,
+ WORD16 *pi2_dc_src)
+ {
+ WORD16 q0 = pi2_dc_src[0]; // DC value won't be dequantized for chroma inverse transform
+ WORD16 i_macro = ((q0 + 32) >> 6);
+
+ __m128i pred_r0, pred_r1, pred_r2, pred_r3, sign_reg;
+ __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
+ __m128i chroma_mask = _mm_set1_epi16 (0xFF);
+ __m128i value_add = _mm_set1_epi16(i_macro);
+
+ //Load pred buffer
+ pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits
+
+ pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
+ pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
+ pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
+ pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
+
+ pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1); //p00 p01 p02 p03 p10 p11 p12 p13
+ pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3); //p20 p21 p22p p23 p30 p31 p32 p33
+
+ pred_r0 = _mm_add_epi16(value_add, pred_r0);
+ pred_r2 = _mm_add_epi16(value_add, pred_r2);
+
+ /*------------------------------------------------------------------*/
+ //Clipping the results to 8 bits
+ sign_reg = _mm_cmpgt_epi16(pred_r0, zero_8x16b); // sign check
+ pred_r0 = _mm_and_si128(pred_r0, sign_reg);
+ sign_reg = _mm_cmpgt_epi16(pred_r2, zero_8x16b);
+ pred_r2 = _mm_and_si128(pred_r2, sign_reg);
+
+ pred_r0 = _mm_packus_epi16(pred_r0, pred_r2);
+ pred_r1 = _mm_srli_si128(pred_r0, 4);
+ pred_r2 = _mm_srli_si128(pred_r1, 4);
+ pred_r3 = _mm_srli_si128(pred_r2, 4);
+
+ pred_r0 = _mm_unpacklo_epi8(pred_r0, zero_8x16b); //p00 p01 p02 p03 -- all 16 bits
+ pred_r1 = _mm_unpacklo_epi8(pred_r1, zero_8x16b); //p10 p11 p12 p13 -- all 16 bits
+ pred_r2 = _mm_unpacklo_epi8(pred_r2, zero_8x16b); //p20 p21 p22 p23 -- all 16 bits
+ pred_r3 = _mm_unpacklo_epi8(pred_r3, zero_8x16b); //p30 p31 p32 p33 -- all 16 bits
+
+ chroma_mask = _mm_unpacklo_epi64(chroma_mask, zero_8x16b); //1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 -- 8 bits
+
+ _mm_maskmoveu_si128(pred_r0, chroma_mask, (char *)(&pu1_out[0]));
+ _mm_maskmoveu_si128(pred_r1, chroma_mask, (char *)(&pu1_out[out_strd]));
+ _mm_maskmoveu_si128(pred_r2, chroma_mask, (char *)(&pu1_out[2*out_strd]));
+ _mm_maskmoveu_si128(pred_r3, chroma_mask, (char *)(&pu1_out[3*out_strd]));
+}
+
+
diff --git a/common/x86/ih264_iquant_itrans_recon_sse42.c b/common/x86/ih264_iquant_itrans_recon_sse42.c
new file mode 100755
index 0000000..2a4ea3f
--- /dev/null
+++ b/common/x86/ih264_iquant_itrans_recon_sse42.c
@@ -0,0 +1,554 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_iquant_itrans_recon_sse42.c
+ *
+ * @brief
+ * Contains function definitions for inverse quantization, inverse
+ * transform and reconstruction
+ *
+ * @author
+ * Mohit [100664]
+ *
+ * @par List of Functions:
+ *  - ih264_iquant_itrans_recon_4x4_sse42()
+ *  - ih264_iquant_itrans_recon_chroma_4x4_sse42()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+/* User include files */
+#include "ih264_typedefs.h"
+#include "ih264_defs.h"
+#include "ih264_trans_macros.h"
+#include "ih264_macros.h"
+#include "ih264_platform_macros.h"
+#include "ih264_trans_data.h"
+#include "ih264_size_defs.h"
+#include "ih264_structs.h"
+#include "ih264_trans_quant_itrans_iquant.h"
+#include <immintrin.h>
+
+/*
+ ********************************************************************************
+ *
+ * @brief This function reconstructs a 4x4 sub block from quantized residue and
+ * prediction buffer
+ *
+ * @par Description:
+ * The quantized residue is first inverse quantized, then inverse transformed.
+ * This inverse transformed content is added to the prediction buffer to recon-
+ * struct the end output
+ *
+ * @param[in] pi2_src
+ * quantized 4x4 block
+ *
+ * @param[in] pu1_pred
+ * prediction 4x4 block
+ *
+ * @param[out] pu1_out
+ * reconstructed 4x4 block
+ *
+ * @param[in] src_strd
+ * quantization buffer stride
+ *
+ * @param[in] pred_strd,
+ * Prediction buffer stride
+ *
+ * @param[in] out_strd
+ * recon buffer Stride
+ *
+ * @param[in] pu2_scaling_list
+ * pointer to scaling list
+ *
+ * @param[in] pu2_norm_adjust
+ * pointer to inverse scale matrix
+ *
+ * @param[in] u4_qp_div_6
+ * Floor (qp/6)
+ *
+ * @param[in] pi4_tmp
+ * temporary buffer of size 1*16
+ *
+ * @returns none
+ *
+ * @remarks none
+ *
+ *******************************************************************************
+ */
+void ih264_iquant_itrans_recon_4x4_sse42(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_out,
+ WORD32 pred_strd,
+ WORD32 out_strd,
+ const UWORD16 *pu2_iscal_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 u4_qp_div_6,
+ WORD16 *pi2_tmp,
+ WORD32 iq_start_idx,
+ WORD16 *pi2_dc_ld_addr)
+ {
+ UWORD32 *pu4_out = (UWORD32 *) pu1_out;
+ __m128i src_r0_r1, src_r2_r3;
+ __m128i src_r0, src_r1, src_r2, src_r3;
+ __m128i scalemat_r0_r1, scalemat_r2_r3;
+ __m128i pred_r0, pred_r1, pred_r2, pred_r3;
+ __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
+ __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
+ __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ __m128i resq_r0, resq_r1, resq_r2, resq_r3;
+ __m128i add_rshift = _mm_set1_epi32((1 << (3 - u4_qp_div_6)));
+ __m128i value_32 = _mm_set1_epi32(32);
+
+ /*************************************************************/
+ /* Dequantization of coefficients. Will be replaced by SIMD */
+ /* operations on platform */
+ /*************************************************************/
+ src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
+ src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
+ scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
+ scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
+ dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat)); //q00 q01 q02 q03 q10 q11 q12 q13 -- all 16 bits
+ dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8)); //q20 q21 q22 q23 q30 q31 q32 q33 -- all 16 bits
+
+ temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1); //b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11 b12*q12 b13*q13 -- 16 bit result
+ temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3); //b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11 b12*q12 b13*q13 -- 16 bit result
+
+ temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b); // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long
+ temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b); // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long
+ temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b); // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long
+ temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b); // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long
+
+ src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
+ src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b); // a10 0 a11 0 a12 0 a13 0 -- 16 bit long
+ src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b); // a20 0 a21 0 a22 0 a23 0 -- 16 bit long
+ src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b); // a30 0 a31 0 a32 0 a33 0 -- 16 bit long
+
+ temp4 = _mm_madd_epi16(src_r0, temp4); //a00*b00*q00 a10*b10*q10 a20*b20*q20 a30*b30 q30 -- 32 bits long
+ temp5 = _mm_madd_epi16(src_r1, temp5);
+ temp6 = _mm_madd_epi16(src_r2, temp6);
+ temp7 = _mm_madd_epi16(src_r3, temp7);
+
+ if (u4_qp_div_6 >= 4) {
+ resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
+ resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
+ resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
+ resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
+ } else {
+ temp4 = _mm_add_epi32(temp4, add_rshift);
+ temp5 = _mm_add_epi32(temp5, add_rshift);
+ temp6 = _mm_add_epi32(temp6, add_rshift);
+ temp7 = _mm_add_epi32(temp7, add_rshift);
+ resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
+ resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
+ resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
+ resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
+ }
+
+ if (iq_start_idx == 1)
+ resq_r0 = _mm_insert_epi32(resq_r0,(WORD32)pi2_dc_ld_addr[0],0);
+ /* Perform Inverse transform */
+ /*-------------------------------------------------------------*/
+ /* IDCT [ Horizontal transformation ] */
+ /*-------------------------------------------------------------*/
+ // Matrix transpose
+ /*
+ * a0 a1 a2 a3
+ * b0 b1 b2 b3
+ * c0 c1 c2 c3
+ * d0 d1 d2 d3
+ */
+ temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1); //a0 b0 a1 b1
+ temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3); //c0 d0 c1 d1
+ temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1); //a2 b2 a3 b3
+ temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3); //c2 d2 c3 d3
+ resq_r0 = _mm_unpacklo_epi64(temp1, temp3); //a0 b0 c0 d0
+ resq_r1 = _mm_unpackhi_epi64(temp1, temp3); //a1 b1 c1 d1
+ resq_r2 = _mm_unpacklo_epi64(temp2, temp4); //a2 b2 c2 d2
+ resq_r3 = _mm_unpackhi_epi64(temp2, temp4); //a3 b3 c3 d3
+ //Transform starts -- horizontal transform
+ /*------------------------------------------------------------------*/
+ /* z0 = w0 + w2 */
+ temp0 = _mm_add_epi32(resq_r0, resq_r2);
+ /* z1 = w0 - w2 */
+ temp1 = _mm_sub_epi32(resq_r0, resq_r2);
+ /* z2 = (w1 >> 1) - w3 */
+ temp2 = _mm_srai_epi32(resq_r1, 1); //(w1>>1)
+ temp2 = _mm_sub_epi32(temp2, resq_r3); //(w1>>1) - w3
+ /* z3 = w1 + (w3 >> 1) */
+ temp3 = _mm_srai_epi32(resq_r3, 1); //(w3>>1) + w1
+ temp3 = _mm_add_epi32(temp3, resq_r1);
+ /*----------------------------------------------------------*/
+ /* x0 = z0 + z3 */
+ resq_r0 = _mm_add_epi32(temp0, temp3);
+ /* x1 = z1 + z2 */
+ resq_r1 = _mm_add_epi32(temp1, temp2);
+ /* x2 = z1 - z2 */
+ resq_r2 = _mm_sub_epi32(temp1, temp2);
+ /* x3 = z0 - z3 */
+ resq_r3 = _mm_sub_epi32(temp0, temp3);
+ // Matrix transpose
+ /*
+ * a0 b0 c0 d0
+ * a1 b1 c1 d1
+ * a2 b2 c2 d2
+ * a3 b3 c3 d3
+ */
+ temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1); //a0 a1 b0 b1
+ temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3); //a2 a3 b2 b3
+ temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1); //c0 c1 d0 d1
+ temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3); //c2 c3 d2 d3
+ resq_r0 = _mm_unpacklo_epi64(temp1, temp3); //a0 a1 a2 a3
+ resq_r1 = _mm_unpackhi_epi64(temp1, temp3); //b0 b1 b2 b3
+ resq_r2 = _mm_unpacklo_epi64(temp2, temp4); //c0 c1 c2 c3
+ resq_r3 = _mm_unpackhi_epi64(temp2, temp4); //d0 d1 d2 d3
+ //Transform ends -- horizontal transform
+
+ //Load pred buffer
+ pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits
+
+ pred_r0 = _mm_cvtepu8_epi32(pred_r0); //p00 p01 p02 p03 -- all 32 bits
+ pred_r1 = _mm_cvtepu8_epi32(pred_r1); //p10 p11 p12 p13 -- all 32 bits
+ pred_r2 = _mm_cvtepu8_epi32(pred_r2); //p20 p21 p22 p23 -- all 32 bits
+ pred_r3 = _mm_cvtepu8_epi32(pred_r3); //p30 p31 p32 p33 -- all 32 bits
+
+ /*--------------------------------------------------------------*/
+ /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
+ /* */
+ /* Add the prediction and store it back to same buffer */
+ /*--------------------------------------------------------------*/
+ /* z0j = y0j + y2j */
+ temp0 = _mm_add_epi32(resq_r0, resq_r2);
+ /* z1j = y0j - y2j */
+ temp1 = _mm_sub_epi32(resq_r0, resq_r2);
+ /* z2j = (y1j>>1) - y3j */
+ temp2 = _mm_srai_epi32(resq_r1, 1); //(y1j>>1)
+ temp2 = _mm_sub_epi32(temp2, resq_r3);
+ /* z3j = y1j + (y3j>>1) */
+ temp3 = _mm_srai_epi32(resq_r3, 1); //(y3j>>1)
+ temp3 = _mm_add_epi32(temp3, resq_r1);
+
+ /* x0j = z0j + z3j */
+ temp4 = _mm_add_epi32(temp0, temp3);
+ temp4 = _mm_add_epi32(temp4, value_32);
+ temp4 = _mm_srai_epi32(temp4, 6);
+ temp4 = _mm_add_epi32(temp4, pred_r0);
+ /* x1j = z1j + z2j */
+ temp5 = _mm_add_epi32(temp1, temp2);
+ temp5 = _mm_add_epi32(temp5, value_32);
+ temp5 = _mm_srai_epi32(temp5, 6);
+ temp5 = _mm_add_epi32(temp5, pred_r1);
+ /* x2j = z1j - z2j */
+ temp6 = _mm_sub_epi32(temp1, temp2);
+ temp6 = _mm_add_epi32(temp6, value_32);
+ temp6 = _mm_srai_epi32(temp6, 6);
+ temp6 = _mm_add_epi32(temp6, pred_r2);
+ /* x3j = z0j - z3j */
+ temp7 = _mm_sub_epi32(temp0, temp3);
+ temp7 = _mm_add_epi32(temp7, value_32);
+ temp7 = _mm_srai_epi32(temp7, 6);
+ temp7 = _mm_add_epi32(temp7, pred_r3);
+
+ // 32-bit to 16-bit conversion
+ temp0 = _mm_packs_epi32(temp4, temp5);
+ temp1 = _mm_packs_epi32(temp6, temp7);
+ /*------------------------------------------------------------------*/
+ //Clipping the results to 8 bits
+ sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b); // sign check
+ temp0 = _mm_and_si128(temp0, sign_reg);
+ sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
+ temp1 = _mm_and_si128(temp1, sign_reg);
+
+ resq_r0 = _mm_packus_epi16(temp0, temp1);
+ resq_r1 = _mm_srli_si128(resq_r0, 4);
+ resq_r2 = _mm_srli_si128(resq_r1, 4);
+ resq_r3 = _mm_srli_si128(resq_r2, 4);
+
+ *pu4_out = _mm_cvtsi128_si32(resq_r0);
+ pu1_out += out_strd;
+ pu4_out = (UWORD32 *) (pu1_out);
+ *(pu4_out) = _mm_cvtsi128_si32(resq_r1);
+ pu1_out += out_strd;
+ pu4_out = (UWORD32 *) (pu1_out);
+ *(pu4_out) = _mm_cvtsi128_si32(resq_r2);
+ pu1_out += out_strd;
+ pu4_out = (UWORD32 *) (pu1_out);
+ *(pu4_out) = _mm_cvtsi128_si32(resq_r3);
+}
+
+/*
+ ********************************************************************************
+ *
+ * @brief This function reconstructs a 4x4 sub block from quantized chroma residue and
+ * prediction buffer
+ *
+ * @par Description:
+ * The quantized residue is first inverse quantized, then inverse transformed.
+ * This inverse transformed content is added to the prediction buffer to recon-
+ * struct the end output
+ *
+ * @param[in] pi2_src
+ * quantized 4x4 block
+ *
+ * @param[in] pu1_pred
+ * prediction 4x4 block
+ *
+ * @param[out] pu1_out
+ * reconstructed 4x4 block
+ *
+ * @param[in] src_strd
+ * quantization buffer stride
+ *
+ * @param[in] pred_strd,
+ * Prediction buffer stride
+ *
+ * @param[in] out_strd
+ * recon buffer Stride
+ *
+ * @param[in] pu2_scaling_list
+ * pointer to scaling list
+ *
+ * @param[in] pu2_norm_adjust
+ * pointer to inverse scale matrix
+ *
+ * @param[in] u4_qp_div_6
+ * Floor (qp/6)
+ *
+ * @param[in] pi4_tmp
+ * temporary buffer of size 1*16
+ *
+ * @returns none
+ *
+ * @remarks none
+ *
+ *******************************************************************************
+ */
+void ih264_iquant_itrans_recon_chroma_4x4_sse42(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_out,
+ WORD32 pred_strd,
+ WORD32 out_strd,
+ const UWORD16 *pu2_iscal_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 u4_qp_div_6,
+ WORD16 *pi2_tmp,
+ WORD16 *pi2_dc_ld_addr)
+ {
+ __m128i src_r0_r1, src_r2_r3;
+ __m128i src_r0, src_r1, src_r2, src_r3;
+ __m128i scalemat_r0_r1, scalemat_r2_r3;
+ __m128i pred_r0, pred_r1, pred_r2, pred_r3;
+ __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
+ __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
+ __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ __m128i resq_r0, resq_r1, resq_r2, resq_r3;
+ __m128i add_rshift = _mm_set1_epi32((1 << (3 - u4_qp_div_6)));
+ __m128i value_32 = _mm_set1_epi32(32);
+ __m128i chroma_mask = _mm_set1_epi16 (0xFF);
+ /*************************************************************/
+ /* Dequantization of coefficients. Will be replaced by SIMD */
+ /* operations on platform */
+ /*************************************************************/
+ src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
+ src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
+ scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
+ scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
+ dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat)); //q00 q01 q02 q03 q10 q11 q12 q13 -- all 16 bits
+ dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8)); //q20 q21 q22 q23 q30 q31 q32 q33 -- all 16 bits
+
+ temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1); //b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11 b12*q12 b13*q13 -- 16 bit result
+ temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3); //b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11 b12*q12 b13*q13 -- 16 bit result
+
+ temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b); // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long
+ temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b); // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long
+ temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b); // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long
+ temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b); // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long
+
+ src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
+ src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b); // a10 0 a11 0 a12 0 a13 0 -- 16 bit long
+ src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b); // a20 0 a21 0 a22 0 a23 0 -- 16 bit long
+ src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b); // a30 0 a31 0 a32 0 a33 0 -- 16 bit long
+
+ temp4 = _mm_madd_epi16(src_r0, temp4); //a00*b00*q00 a10*b10*q10 a20*b20*q20 a30*b30 q30 -- 32 bits long
+ temp5 = _mm_madd_epi16(src_r1, temp5);
+ temp6 = _mm_madd_epi16(src_r2, temp6);
+ temp7 = _mm_madd_epi16(src_r3, temp7);
+
+ if (u4_qp_div_6 >= 4) {
+ resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
+ resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
+ resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
+ resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
+ } else {
+ temp4 = _mm_add_epi32(temp4, add_rshift);
+ temp5 = _mm_add_epi32(temp5, add_rshift);
+ temp6 = _mm_add_epi32(temp6, add_rshift);
+ temp7 = _mm_add_epi32(temp7, add_rshift);
+ resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
+ resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
+ resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
+ resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
+ }
+
+ resq_r0 = _mm_insert_epi32(resq_r0,(WORD32)pi2_dc_ld_addr[0],0);
+ /* Perform Inverse transform */
+ /*-------------------------------------------------------------*/
+ /* IDCT [ Horizontal transformation ] */
+ /*-------------------------------------------------------------*/
+ // Matrix transpose
+ /*
+ * a0 a1 a2 a3
+ * b0 b1 b2 b3
+ * c0 c1 c2 c3
+ * d0 d1 d2 d3
+ */
+ temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1); //a0 b0 a1 b1
+ temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3); //c0 d0 c1 d1
+ temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1); //a2 b2 a3 b3
+ temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3); //c2 d2 c3 d3
+ resq_r0 = _mm_unpacklo_epi64(temp1, temp3); //a0 b0 c0 d0
+ resq_r1 = _mm_unpackhi_epi64(temp1, temp3); //a1 b1 c1 d1
+ resq_r2 = _mm_unpacklo_epi64(temp2, temp4); //a2 b2 c2 d2
+ resq_r3 = _mm_unpackhi_epi64(temp2, temp4); //a3 b3 c3 d3
+ //Transform starts -- horizontal transform
+ /*------------------------------------------------------------------*/
+ /* z0 = w0 + w2 */
+ temp0 = _mm_add_epi32(resq_r0, resq_r2);
+ /* z1 = w0 - w2 */
+ temp1 = _mm_sub_epi32(resq_r0, resq_r2);
+ /* z2 = (w1 >> 1) - w3 */
+ temp2 = _mm_srai_epi32(resq_r1, 1); //(w1>>1)
+ temp2 = _mm_sub_epi32(temp2, resq_r3); //(w1>>1) - w3
+ /* z3 = w1 + (w3 >> 1) */
+ temp3 = _mm_srai_epi32(resq_r3, 1); //(w3>>1) + w1
+ temp3 = _mm_add_epi32(temp3, resq_r1);
+ /*----------------------------------------------------------*/
+ /* x0 = z0 + z3 */
+ resq_r0 = _mm_add_epi32(temp0, temp3);
+ /* x1 = z1 + z2 */
+ resq_r1 = _mm_add_epi32(temp1, temp2);
+ /* x2 = z1 - z2 */
+ resq_r2 = _mm_sub_epi32(temp1, temp2);
+ /* x3 = z0 - z3 */
+ resq_r3 = _mm_sub_epi32(temp0, temp3);
+ // Matrix transpose
+ /*
+ * a0 b0 c0 d0
+ * a1 b1 c1 d1
+ * a2 b2 c2 d2
+ * a3 b3 c3 d3
+ */
+ temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1); //a0 a1 b0 b1
+ temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3); //a2 a3 b2 b3
+ temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1); //c0 c1 d0 d1
+ temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3); //c2 c3 d2 d3
+ resq_r0 = _mm_unpacklo_epi64(temp1, temp3); //a0 a1 a2 a3
+ resq_r1 = _mm_unpackhi_epi64(temp1, temp3); //b0 b1 b2 b3
+ resq_r2 = _mm_unpacklo_epi64(temp2, temp4); //c0 c1 c2 c3
+ resq_r3 = _mm_unpackhi_epi64(temp2, temp4); //d0 d1 d2 d3
+ //Transform ends -- horizontal transform
+
+ //Load pred buffer
+ pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits
+
+ pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
+ pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
+ pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
+ pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
+
+ pred_r0 = _mm_cvtepu16_epi32(pred_r0); //p00 p01 p02 p03 -- all 32 bits
+ pred_r1 = _mm_cvtepu16_epi32(pred_r1); //p10 p11 p12 p13 -- all 32 bits
+ pred_r2 = _mm_cvtepu16_epi32(pred_r2); //p20 p21 p22 p23 -- all 32 bits
+ pred_r3 = _mm_cvtepu16_epi32(pred_r3); //p30 p31 p32 p33 -- all 32 bits
+
+ /*--------------------------------------------------------------*/
+ /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
+ /* */
+ /* Add the prediction and store it back to same buffer */
+ /*--------------------------------------------------------------*/
+ /* z0j = y0j + y2j */
+ temp0 = _mm_add_epi32(resq_r0, resq_r2);
+ /* z1j = y0j - y2j */
+ temp1 = _mm_sub_epi32(resq_r0, resq_r2);
+ /* z2j = (y1j>>1) - y3j */
+ temp2 = _mm_srai_epi32(resq_r1, 1); //(y1j>>1)
+ temp2 = _mm_sub_epi32(temp2, resq_r3);
+ /* z3j = y1j + (y3j>>1) */
+ temp3 = _mm_srai_epi32(resq_r3, 1); //(y3j>>1)
+ temp3 = _mm_add_epi32(temp3, resq_r1);
+
+ /* x0j = z0j + z3j */
+ temp4 = _mm_add_epi32(temp0, temp3);
+ temp4 = _mm_add_epi32(temp4, value_32);
+ temp4 = _mm_srai_epi32(temp4, 6);
+ temp4 = _mm_add_epi32(temp4, pred_r0);
+ /* x1j = z1j + z2j */
+ temp5 = _mm_add_epi32(temp1, temp2);
+ temp5 = _mm_add_epi32(temp5, value_32);
+ temp5 = _mm_srai_epi32(temp5, 6);
+ temp5 = _mm_add_epi32(temp5, pred_r1);
+ /* x2j = z1j - z2j */
+ temp6 = _mm_sub_epi32(temp1, temp2);
+ temp6 = _mm_add_epi32(temp6, value_32);
+ temp6 = _mm_srai_epi32(temp6, 6);
+ temp6 = _mm_add_epi32(temp6, pred_r2);
+ /* x3j = z0j - z3j */
+ temp7 = _mm_sub_epi32(temp0, temp3);
+ temp7 = _mm_add_epi32(temp7, value_32);
+ temp7 = _mm_srai_epi32(temp7, 6);
+ temp7 = _mm_add_epi32(temp7, pred_r3);
+
+ // 32-bit to 16-bit conversion
+ temp0 = _mm_packs_epi32(temp4, temp5);
+ temp1 = _mm_packs_epi32(temp6, temp7);
+ /*------------------------------------------------------------------*/
+ //Clipping the results to 8 bits
+ sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b); // sign check
+ temp0 = _mm_and_si128(temp0, sign_reg);
+ sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
+ temp1 = _mm_and_si128(temp1, sign_reg);
+
+ resq_r0 = _mm_packus_epi16(temp0, temp1);
+ resq_r1 = _mm_srli_si128(resq_r0, 4);
+ resq_r2 = _mm_srli_si128(resq_r1, 4);
+ resq_r3 = _mm_srli_si128(resq_r2, 4);
+
+ resq_r0 = _mm_cvtepu8_epi16(resq_r0); //p00 p01 p02 p03 -- all 16 bits
+ resq_r1 = _mm_cvtepu8_epi16(resq_r1); //p10 p11 p12 p13 -- all 16 bits
+ resq_r2 = _mm_cvtepu8_epi16(resq_r2); //p20 p21 p22 p23 -- all 16 bits
+ resq_r3 = _mm_cvtepu8_epi16(resq_r3); //p30 p31 p32 p33 -- all 16 bits
+
+ chroma_mask = _mm_unpacklo_epi64(chroma_mask, zero_8x16b);
+
+ _mm_maskmoveu_si128(resq_r0, chroma_mask, (char *)(&pu1_out[0]));
+ _mm_maskmoveu_si128(resq_r1, chroma_mask, (char *)(&pu1_out[out_strd]));
+ _mm_maskmoveu_si128(resq_r2, chroma_mask, (char *)(&pu1_out[2*out_strd]));
+ _mm_maskmoveu_si128(resq_r3, chroma_mask, (char *)(&pu1_out[3*out_strd]));
+}
diff --git a/common/x86/ih264_iquant_itrans_recon_ssse3.c b/common/x86/ih264_iquant_itrans_recon_ssse3.c
new file mode 100755
index 0000000..ca1397e
--- /dev/null
+++ b/common/x86/ih264_iquant_itrans_recon_ssse3.c
@@ -0,0 +1,1035 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_iquant_itrans_recon_ssse3.c
+ *
+ * @brief
+ * Contains function definitions for inverse quantization, inverse
+ * transform and reconstruction
+ *
+ * @author
+ * Mohit [100664]
+ *
+ * @par List of Functions:
+ *  - ih264_iquant_itrans_recon_4x4_ssse3()
+ *  - ih264_iquant_itrans_recon_8x8_ssse3()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+/* User include files */
+#include "ih264_typedefs.h"
+#include "ih264_defs.h"
+#include "ih264_trans_macros.h"
+#include "ih264_macros.h"
+#include "ih264_platform_macros.h"
+#include "ih264_trans_data.h"
+#include "ih264_size_defs.h"
+#include "ih264_structs.h"
+#include "ih264_trans_quant_itrans_iquant.h"
+#include <immintrin.h>
+
+/*
+ ********************************************************************************
+ *
+ * @brief This function reconstructs a 4x4 sub block from quantized resiude and
+ * prediction buffer
+ *
+ * @par Description:
+ * The quantized residue is first inverse quantized, then inverse transformed.
+ * This inverse transformed content is added to the prediction buffer to recon-
+ * struct the end output
+ *
+ * @param[in] pi2_src
+ * quantized 4x4 block
+ *
+ * @param[in] pu1_pred
+ * prediction 4x4 block
+ *
+ * @param[out] pu1_out
+ * reconstructed 4x4 block
+ *
+ * @param[in] pred_strd,
+ *  Prediction buffer stride
+ *
+ * @param[in] out_strd
+ *  recon buffer Stride
+ *
+ * @param[in] pu2_iscal_mat
+ *  pointer to inverse scaling matrix
+ *
+ * @param[in] pu2_weigh_mat
+ *  pointer to dequantization weight matrix
+ *
+ * @param[in] u4_qp_div_6
+ *  Floor (qp/6)
+ *
+ * @param[in] pi2_tmp
+ *  temporary buffer of size 1*16
+ *
+ * @returns none
+ *
+ * @remarks none
+ *
+ *******************************************************************************
+ */
+void ih264_iquant_itrans_recon_4x4_ssse3(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_out,
+ WORD32 pred_strd,
+ WORD32 out_strd,
+ const UWORD16 *pu2_iscal_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 u4_qp_div_6,
+ WORD16 *pi2_tmp,
+ WORD32 iq_start_idx,
+ WORD16 *pi2_dc_ld_addr)
+{
+ UWORD32 *pu4_out = (UWORD32 *) pu1_out;
+ __m128i src_r0_r1, src_r2_r3;
+ __m128i src_r0, src_r1, src_r2, src_r3;
+ __m128i scalemat_r0_r1, scalemat_r2_r3, predload_r;
+ __m128i pred_r0, pred_r1, pred_r2, pred_r3;
+ __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
+ __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
+ __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ __m128i resq_r0, resq_r1, resq_r2, resq_r3;
+ __m128i add_rshift = _mm_set1_epi32((1 << (3 - u4_qp_div_6)));
+ __m128i value_32 = _mm_set1_epi32(32);
+
+ /*************************************************************/
+ /* Dequantization of coefficients. Will be replaced by SIMD */
+ /* operations on platform */
+ /*************************************************************/
+ src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
+ src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
+ scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
+ scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
+ dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat)); //q00 q01 q02 q03 q10 q11 q12 q13 -- all 16 bits
+ dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8)); //q20 q21 q22 q23 q30 q31 q32 q33 -- all 16 bits
+
+ temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1); //b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11 b12*q12 b13*q13 -- 16 bit result
+ temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3); //b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11 b12*q12 b13*q13 -- 16 bit result
+
+ temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b); // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long
+ temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b); // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long
+ temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b); // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long
+ temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b); // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long
+
+ src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
+ src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b); // a10 0 a11 0 a12 0 a13 0 -- 16 bit long
+ src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b); // a20 0 a21 0 a22 0 a23 0 -- 16 bit long
+ src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b); // a30 0 a31 0 a32 0 a33 0 -- 16 bit long
+
+ temp4 = _mm_madd_epi16(src_r0, temp4); //a00*b00*q00 a10*b10*q10 a20*b20*q20 a30*b30 q30 -- 32 bits long
+ temp5 = _mm_madd_epi16(src_r1, temp5);
+ temp6 = _mm_madd_epi16(src_r2, temp6);
+ temp7 = _mm_madd_epi16(src_r3, temp7);
+
+ if (u4_qp_div_6 >= 4) {
+ resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
+ resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
+ resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
+ resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
+ } else {
+ temp4 = _mm_add_epi32(temp4, add_rshift);
+ temp5 = _mm_add_epi32(temp5, add_rshift);
+ temp6 = _mm_add_epi32(temp6, add_rshift);
+ temp7 = _mm_add_epi32(temp7, add_rshift);
+ resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
+ resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
+ resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
+ resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
+ }
+
+ if (iq_start_idx == 1)
+ {
+ resq_r0 = _mm_insert_epi16(resq_r0,(WORD32)pi2_src[0],0);
+ if (pi2_src[0] >= 0)
+ resq_r0 = _mm_insert_epi16(resq_r0,0,1);
+ else
+ resq_r0 = _mm_insert_epi16(resq_r0,-1,1);
+ }
+ /* Perform Inverse transform */
+ /*-------------------------------------------------------------*/
+ /* IDCT [ Horizontal transformation ] */
+ /*-------------------------------------------------------------*/
+ // Matrix transpose
+ /*
+ * a0 a1 a2 a3
+ * b0 b1 b2 b3
+ * c0 c1 c2 c3
+ * d0 d1 d2 d3
+ */
+ temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1); //a0 b0 a1 b1
+ temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3); //c0 d0 c1 d1
+ temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1); //a2 b2 a3 b3
+ temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3); //c2 d2 c3 d3
+ resq_r0 = _mm_unpacklo_epi64(temp1, temp3); //a0 b0 c0 d0
+ resq_r1 = _mm_unpackhi_epi64(temp1, temp3); //a1 b1 c1 d1
+ resq_r2 = _mm_unpacklo_epi64(temp2, temp4); //a2 b2 c2 d2
+ resq_r3 = _mm_unpackhi_epi64(temp2, temp4); //a3 b3 c3 d3
+ //Transform starts -- horizontal transform
+ /*------------------------------------------------------------------*/
+ /* z0 = w0 + w2 */
+ temp0 = _mm_add_epi32(resq_r0, resq_r2);
+ /* z1 = w0 - w2 */
+ temp1 = _mm_sub_epi32(resq_r0, resq_r2);
+ /* z2 = (w1 >> 1) - w3 */
+ temp2 = _mm_srai_epi32(resq_r1, 1); //(w1>>1)
+ temp2 = _mm_sub_epi32(temp2, resq_r3); //(w1>>1) - w3
+ /* z3 = w1 + (w3 >> 1) */
+ temp3 = _mm_srai_epi32(resq_r3, 1); //(w3>>1) + w1
+ temp3 = _mm_add_epi32(temp3, resq_r1);
+ /*----------------------------------------------------------*/
+ /* x0 = z0 + z3 */
+ resq_r0 = _mm_add_epi32(temp0, temp3);
+ /* x1 = z1 + z2 */
+ resq_r1 = _mm_add_epi32(temp1, temp2);
+ /* x2 = z1 - z2 */
+ resq_r2 = _mm_sub_epi32(temp1, temp2);
+ /* x3 = z0 - z3 */
+ resq_r3 = _mm_sub_epi32(temp0, temp3);
+ // Matrix transpose
+ /*
+ * a0 b0 c0 d0
+ * a1 b1 c1 d1
+ * a2 b2 c2 d2
+ * a3 b3 c3 d3
+ */
+ temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1); //a0 a1 b0 b1
+ temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3); //a2 a3 b2 b3
+ temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1); //c0 c1 d0 d1
+ temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3); //c2 c3 d2 d3
+ resq_r0 = _mm_unpacklo_epi64(temp1, temp3); //a0 a1 a2 a3
+ resq_r1 = _mm_unpackhi_epi64(temp1, temp3); //b0 b1 b2 b3
+ resq_r2 = _mm_unpacklo_epi64(temp2, temp4); //c0 c1 c2 c3
+ resq_r3 = _mm_unpackhi_epi64(temp2, temp4); //d0 d1 d2 d3
+ //Transform ends -- horizontal transform
+
+ zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
+ //Load pred buffer
+ predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r0 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p00 p01 p02 p03 0 0 0 0 -- all 16 bits
+
+ predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p10 p11 p12 p13 0 0 0 0 -- all 16 bits
+
+ predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r2 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p20 p21 p22 p23 0 0 0 0 -- all 16 bits
+
+ predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r3 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p30 p31 p32 p33 0 0 0 0 -- all 16 bits
+ pred_r0 = _mm_unpacklo_epi16(pred_r0, zero_8x16b); //p00 p01 p02 p03 -- 32 bits sign extended
+ pred_r1 = _mm_unpacklo_epi16(pred_r1, zero_8x16b); //p10 p11 p12 p13 -- 32 bits sign extended
+ pred_r2 = _mm_unpacklo_epi16(pred_r2, zero_8x16b); //p20 p21 p22 p23 -- 32 bits sign extended
+ pred_r3 = _mm_unpacklo_epi16(pred_r3, zero_8x16b); //p30 p31 p32 p33 -- 32 bits sign extended
+
+ /*--------------------------------------------------------------*/
+ /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
+ /* */
+ /* Add the prediction and store it back to same buffer */
+ /*--------------------------------------------------------------*/
+ /* z0j = y0j + y2j */
+ temp0 = _mm_add_epi32(resq_r0, resq_r2);
+ /* z1j = y0j - y2j */
+ temp1 = _mm_sub_epi32(resq_r0, resq_r2);
+ /* z2j = (y1j>>1) - y3j */
+ temp2 = _mm_srai_epi32(resq_r1, 1); //(y1j>>1)
+ temp2 = _mm_sub_epi32(temp2, resq_r3);
+ /* z3j = y1j + (y3j>>1) */
+ temp3 = _mm_srai_epi32(resq_r3, 1); //(y3j>>1)
+ temp3 = _mm_add_epi32(temp3, resq_r1);
+
+ /* x0j = z0j + z3j */
+ temp4 = _mm_add_epi32(temp0, temp3);
+ temp4 = _mm_add_epi32(temp4, value_32);
+ temp4 = _mm_srai_epi32(temp4, 6);
+ temp4 = _mm_add_epi32(temp4, pred_r0);
+ /* x1j = z1j + z2j */
+ temp5 = _mm_add_epi32(temp1, temp2);
+ temp5 = _mm_add_epi32(temp5, value_32);
+ temp5 = _mm_srai_epi32(temp5, 6);
+ temp5 = _mm_add_epi32(temp5, pred_r1);
+ /* x2j = z1j - z2j */
+ temp6 = _mm_sub_epi32(temp1, temp2);
+ temp6 = _mm_add_epi32(temp6, value_32);
+ temp6 = _mm_srai_epi32(temp6, 6);
+ temp6 = _mm_add_epi32(temp6, pred_r2);
+ /* x3j = z0j - z3j */
+ temp7 = _mm_sub_epi32(temp0, temp3);
+ temp7 = _mm_add_epi32(temp7, value_32);
+ temp7 = _mm_srai_epi32(temp7, 6);
+ temp7 = _mm_add_epi32(temp7, pred_r3);
+
+ // 32-bit to 16-bit conversion
+ temp0 = _mm_packs_epi32(temp4, temp5);
+ temp1 = _mm_packs_epi32(temp6, temp7);
+ /*------------------------------------------------------------------*/
+ //Clipping the results to 8 bits
+ sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b); // sign check
+ temp0 = _mm_and_si128(temp0, sign_reg);
+ sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
+ temp1 = _mm_and_si128(temp1, sign_reg);
+
+ resq_r0 = _mm_packus_epi16(temp0, temp1);
+ resq_r1 = _mm_srli_si128(resq_r0, 4);
+ resq_r2 = _mm_srli_si128(resq_r1, 4);
+ resq_r3 = _mm_srli_si128(resq_r2, 4);
+
+ *pu4_out = _mm_cvtsi128_si32(resq_r0);
+ pu1_out += out_strd;
+ pu4_out = (UWORD32 *) (pu1_out);
+ *(pu4_out) = _mm_cvtsi128_si32(resq_r1);
+ pu1_out += out_strd;
+ pu4_out = (UWORD32 *) (pu1_out);
+ *(pu4_out) = _mm_cvtsi128_si32(resq_r2);
+ pu1_out += out_strd;
+ pu4_out = (UWORD32 *) (pu1_out);
+ *(pu4_out) = _mm_cvtsi128_si32(resq_r3);
+}
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse quant and Inverse transform type Ci4 for 8x8 block
+ *
+ * @par Description:
+ * Performs inverse transform Ci8 and adds the residue to get the
+ * reconstructed block
+ *
+ * @param[in] pi2_src
+ * Input 8x8coefficients
+ *
+ * @param[in] pu1_pred
+ * Prediction 8x8 block
+ *
+ * @param[out] pu1_recon
+ * Output 8x8 block
+ *
+ * @param[in] pred_strd,
+ *  Prediction stride
+ *
+ * @param[in] out_strd
+ *  Output Stride
+ *
+ * @param[in] pu2_iscale_mat
+ *  Pointer to the inverse scaling matrix
+ *
+ * @param[in] pu2_weigh_mat
+ *  Pointer to the dequantization weight matrix
+ *
+ * @param[in] qp_div
+ *  QP/6
+ *
+ * @param[in] pi2_tmp
+ *  temporary buffer of size 1*64
+ *  the tmp for each block
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ih264_iquant_itrans_recon_8x8_ssse3(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_out,
+ WORD32 pred_strd,
+ WORD32 out_strd,
+ const UWORD16 *pu2_iscale_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 qp_div,
+ WORD16 *pi2_tmp,
+ WORD32 iq_start_idx,
+ WORD16 *pi2_dc_ld_addr)
+{
+ __m128i src_r0;
+ __m128i scalemat_r0;
+ __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
+ // __m128i one_8x16b = _mm_set1_epi8(255); // all bits set to 1
+ // __m128i one_zero_mask = _mm_unpacklo_epi16(one_8x16b, zero_8x16b); // 1 0 1 0 1 0 1 0 --- 16 bits size
+ __m128i value_32 = _mm_set1_epi32(32);
+ __m128i add_rshift = _mm_set1_epi32((1 << (5 - qp_div)));
+ __m128i dequant_r0;
+ __m128i predload_r;
+ __m128i pred_r0_1, pred_r1_1, pred_r2_1, pred_r3_1, pred_r4_1, pred_r5_1,
+ pred_r6_1, pred_r7_1;
+ __m128i sign_reg;
+ __m128i src_r0_1, src_r0_2;
+ __m128i scalemat_r0_1, scalemat_r0_2;
+ __m128i temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+ __m128i temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17,
+ temp18, temp19, temp20;
+ // To store dequantization results
+ __m128i resq_r0_1, resq_r0_2, resq_r1_1, resq_r1_2, resq_r2_1, resq_r2_2,
+ resq_r3_1, resq_r3_2, resq_r4_1, resq_r4_2, resq_r5_1, resq_r5_2,
+ resq_r6_1, resq_r6_2, resq_r7_1, resq_r7_2;
+
+ /*************************************************************/
+ /* Dequantization of coefficients. Will be replaced by SIMD */
+ /* operations on platform. Note : DC coeff is not scaled */
+ /*************************************************************/
+
+ // Row 0 processing
+ src_r0 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a04 a05 a06 a07 -- the source matrix 0th row
+ scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat)); //b00 b01 b02 b03 b04 b05 b06 b07 -- the scaling matrix 0th row
+ dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[0])); //q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
+ src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); //a00 0 a01 0 a02 0 a03 0 -- 16 bit long
+ src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
+ temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0); //b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result
+ scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b); // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
+ scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b); // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
+
+ temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1); // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long
+ temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2); // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long
+
+ if (qp_div >= 6) {
+ resq_r0_1 = _mm_slli_epi32(temp5, qp_div - 6);
+ resq_r0_2 = _mm_slli_epi32(temp7, qp_div - 6);
+ } else {
+ temp5 = _mm_add_epi32(temp5, add_rshift);
+ temp7 = _mm_add_epi32(temp7, add_rshift);
+ resq_r0_1 = _mm_srai_epi32(temp5, 6 - qp_div);
+ resq_r0_2 = _mm_srai_epi32(temp7, 6 - qp_div);
+ }
+ resq_r0_1 = _mm_packs_epi32(resq_r0_1, resq_r0_2); //a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
+ // Row 1 processing
+ src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a00 a01 a02 a03 a04 a05 a06 a07 a08 -- the source matrix 1st row
+ scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 8)); //b00 b01 b02 b03 b04 b05 b06 b07 b08 -- the scaling matrix 1st row
+ dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[8])); //q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
+ src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); //a00 0 a01 0 a02 0 a03 0 -- 16 bit long
+ src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
+ temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0); //b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result
+ scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b); // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
+ scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b); // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
+ temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1); // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long
+ temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2); // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long
+ if (qp_div >= 6) {
+ resq_r1_1 = _mm_slli_epi32(temp5, qp_div - 6);
+ resq_r1_2 = _mm_slli_epi32(temp7, qp_div - 6);
+ } else {
+ temp5 = _mm_add_epi32(temp5, add_rshift);
+ temp7 = _mm_add_epi32(temp7, add_rshift);
+ resq_r1_1 = _mm_srai_epi32(temp5, 6 - qp_div);
+ resq_r1_2 = _mm_srai_epi32(temp7, 6 - qp_div);
+ }
+ resq_r1_1 = _mm_packs_epi32(resq_r1_1, resq_r1_2); //a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
+ // Row 2 processing
+ src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 16)); //a00 a01 a02 a03 a04 a05 a06 a07 a08 -- the source matrix 2nd row
+ scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 16)); //b00 b01 b02 b03 b04 b05 b06 b07 b08 -- the scaling matrix 2nd row
+ dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[16])); //q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
+ src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); //a00 0 a01 0 a02 0 a03 0 -- 16 bit long
+ src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
+ temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0); //b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result
+ scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b); // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
+ scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b); // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
+ temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1); // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long
+ temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2); // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long
+ if (qp_div >= 6) {
+ resq_r2_1 = _mm_slli_epi32(temp5, qp_div - 6);
+ resq_r2_2 = _mm_slli_epi32(temp7, qp_div - 6);
+ } else {
+ temp5 = _mm_add_epi32(temp5, add_rshift);
+ temp7 = _mm_add_epi32(temp7, add_rshift);
+ resq_r2_1 = _mm_srai_epi32(temp5, 6 - qp_div);
+ resq_r2_2 = _mm_srai_epi32(temp7, 6 - qp_div);
+ }
+ resq_r2_1 = _mm_packs_epi32(resq_r2_1, resq_r2_2); //a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
+ // Row 3 processing
+ src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 24)); //a00 a01 a02 a03 a04 a05 a06 a07 a08 -- the source matrix 3rd row
+ scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 24)); //b00 b01 b02 b03 b04 b05 b06 b07 b08 -- the scaling matrix 3rd row
+ dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[24])); //q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
+ src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); //a00 0 a01 0 a02 0 a03 0 -- 16 bit long
+ src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
+ temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0); //b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result
+ scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b); // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
+ scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b); // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
+ temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1); // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 - 32 bits long
+ temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2); // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long
+ if (qp_div >= 6) {
+ resq_r3_1 = _mm_slli_epi32(temp5, qp_div - 6);
+ resq_r3_2 = _mm_slli_epi32(temp7, qp_div - 6);
+ } else {
+ temp5 = _mm_add_epi32(temp5, add_rshift);
+ temp7 = _mm_add_epi32(temp7, add_rshift);
+ resq_r3_1 = _mm_srai_epi32(temp5, 6 - qp_div);
+ resq_r3_2 = _mm_srai_epi32(temp7, 6 - qp_div);
+ }
+ resq_r3_1 = _mm_packs_epi32(resq_r3_1, resq_r3_2); //a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
+ // Row 4 processing
+ src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 32)); //a00 a01 a02 a03 a04 a05 a06 a07 a08 -- the source matrix 4th row
+ scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 32)); //b00 b01 b02 b03 b04 b05 b06 b07 b08 -- the scaling matrix 4th row
+ dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[32])); //q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
+ src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); //a00 0 a01 0 a02 0 a03 0 -- 16 bit long
+ src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
+ temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0); //b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result
+ scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b); // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
+ scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b); // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
+ temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1); // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long
+ temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2); // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long
+ if (qp_div >= 6) {
+ resq_r4_1 = _mm_slli_epi32(temp5, qp_div - 6);
+ resq_r4_2 = _mm_slli_epi32(temp7, qp_div - 6);
+
+ } else {
+ temp5 = _mm_add_epi32(temp5, add_rshift);
+ temp7 = _mm_add_epi32(temp7, add_rshift);
+ resq_r4_1 = _mm_srai_epi32(temp5, 6 - qp_div);
+ resq_r4_2 = _mm_srai_epi32(temp7, 6 - qp_div);
+ }
+ resq_r4_1 = _mm_packs_epi32(resq_r4_1, resq_r4_2); //a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
+ // Row 5 processing
+ src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 40)); //a00 a01 a02 a03 a04 a05 a06 a07 a08 -- the source matrix 5th row
+ scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 40)); //b00 b01 b02 b03 b04 b05 b06 b07 b08 -- the scaling matrix 5th row
+ dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[40])); //q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
+ src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); //a00 0 a01 0 a02 0 a03 0 -- 16 bit long
+ src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
+ temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0); //b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result
+ scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b); // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
+ scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b); // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
+ temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1); // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long
+ temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2); // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long
+ if (qp_div >= 6) {
+ resq_r5_1 = _mm_slli_epi32(temp5, qp_div - 6);
+ resq_r5_2 = _mm_slli_epi32(temp7, qp_div - 6);
+ //resq_r5_1 = _mm_and_si128(resq_r5_1,one_zero_mask);
+ //resq_r5_2 = _mm_and_si128(resq_r5_2,one_zero_mask);
+ } else {
+ temp5 = _mm_add_epi32(temp5, add_rshift);
+ temp7 = _mm_add_epi32(temp7, add_rshift);
+ resq_r5_1 = _mm_srai_epi32(temp5, 6 - qp_div);
+ resq_r5_2 = _mm_srai_epi32(temp7, 6 - qp_div);
+ }
+ resq_r5_1 = _mm_packs_epi32(resq_r5_1, resq_r5_2); //a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
+ // Row 6 processing
+ src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 48)); //a00 a01 a02 a03 a04 a05 a06 a07 a08 -- the source matrix 6th row
+ scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 48)); //b00 b01 b02 b03 b04 b05 b06 b07 b08 -- the scaling matrix 6th row
+ dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[48])); //q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
+ src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); //a00 0 a01 0 a02 0 a03 0 -- 16 bit long
+ src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
+ temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0); //b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result
+ scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b); // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
+ scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b); // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
+ temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1); // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long
+ temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2); // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long
+ if (qp_div >= 6) {
+ resq_r6_1 = _mm_slli_epi32(temp5, qp_div - 6);
+ resq_r6_2 = _mm_slli_epi32(temp7, qp_div - 6);
+ //resq_r6_1 = _mm_and_si128(resq_r6_1,one_zero_mask);
+ //resq_r6_2 = _mm_and_si128(resq_r6_2,one_zero_mask);
+ } else {
+ temp5 = _mm_add_epi32(temp5, add_rshift);
+ temp7 = _mm_add_epi32(temp7, add_rshift);
+ resq_r6_1 = _mm_srai_epi32(temp5, 6 - qp_div);
+ resq_r6_2 = _mm_srai_epi32(temp7, 6 - qp_div);
+ //resq_r6_1 = _mm_and_si128(resq_r6_1,one_zero_mask);
+ //resq_r6_2 = _mm_and_si128(resq_r6_2,one_zero_mask);
+ }
+ resq_r6_1 = _mm_packs_epi32(resq_r6_1, resq_r6_2); //a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
+ // Row 7 processing
+ src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 56)); //a00 a01 a02 a03 a04 a05 a06 a07 a08 -- the source matrix 7th row
+ scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 56)); //b00 b01 b02 b03 b04 b05 b06 b07 b08 -- the scaling matrix 7th row
+ dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[56])); //q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
+ src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); //a00 0 a01 0 a02 0 a03 0 -- 16 bit long
+ src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
+ temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0); //b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result
+ scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b); // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
+ scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b); // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
+ temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1); // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long
+ temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2); // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long
+ if (qp_div >= 6) {
+ resq_r7_1 = _mm_slli_epi32(temp5, qp_div - 6);
+ resq_r7_2 = _mm_slli_epi32(temp7, qp_div - 6);
+ } else {
+ temp5 = _mm_add_epi32(temp5, add_rshift);
+ temp7 = _mm_add_epi32(temp7, add_rshift);
+ resq_r7_1 = _mm_srai_epi32(temp5, 6 - qp_div);
+ resq_r7_2 = _mm_srai_epi32(temp7, 6 - qp_div);
+ }
+ resq_r7_1 = _mm_packs_epi32(resq_r7_1, resq_r7_2); //a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
+ /* Perform Inverse transform */
+ /*--------------------------------------------------------------------*/
+ /* IDCT [ Horizontal transformation ] */
+ /*--------------------------------------------------------------------*/
+ // Matrix transpose
+ /*
+ * a0 a1 a2 a3 a4 a5 a6 a7
+ * b0 b1 b2 b3 b4 b5 b6 b7
+ * c0 c1 c2 c3 c4 c5 c6 c7
+ * d0 d1 d2 d3 d4 d5 d6 d7
+ */
+ temp1 = _mm_unpacklo_epi16(resq_r0_1, resq_r1_1); //a0 b0 a1 b1 a2 b2 a3 b3
+ temp3 = _mm_unpacklo_epi16(resq_r2_1, resq_r3_1); //c0 d0 c1 d1 c2 d2 c3 d3
+ temp2 = _mm_unpackhi_epi16(resq_r0_1, resq_r1_1); //a4 b4 a5 b5 a6 b6 a7 b7
+ temp4 = _mm_unpackhi_epi16(resq_r2_1, resq_r3_1); //c4 d4 c5 d5 c6 d6 c7 d7
+ resq_r0_1 = _mm_unpacklo_epi32(temp1, temp3); //a0 b0 c0 d0 a1 b1 c1 d1
+ resq_r1_1 = _mm_unpackhi_epi32(temp1, temp3); //a2 b2 c2 d2 a3 b3 c3 d3
+ resq_r2_1 = _mm_unpacklo_epi32(temp2, temp4); //a4 b4 c4 d4 a5 b5 c5 d5
+ resq_r3_1 = _mm_unpackhi_epi32(temp2, temp4); //a6 b6 c6 d6 a7 b7 c7 d7
+ /*
+ * e0 e1 e2 e3 e4 e5 e6 e7
+ * f0 f1 f2 f3 f4 f5 f6 f7
+ * g0 g1 g2 g3 g4 g5 g6 g7
+ * h0 h1 h2 h3 h4 h5 h6 h7
+ */
+ temp1 = _mm_unpacklo_epi16(resq_r4_1, resq_r5_1); //e0 f0 e1 f1 e2 f2 e3 f3
+ temp3 = _mm_unpacklo_epi16(resq_r6_1, resq_r7_1); //g0 h0 g1 h1 g2 h2 g3 h3
+ temp2 = _mm_unpackhi_epi16(resq_r4_1, resq_r5_1); //e4 f4 e5 f5 e6 f6 e7 f7
+ temp4 = _mm_unpackhi_epi16(resq_r6_1, resq_r7_1); //g4 h4 g5 h5 g6 h6 g7 h7
+ resq_r4_1 = _mm_unpacklo_epi32(temp1, temp3); //e0 f0 g0 h0 e1 f1 g1 h1
+ resq_r5_1 = _mm_unpackhi_epi32(temp1, temp3); //e2 f2 g2 h2 e3 f3 g3 h3
+ resq_r6_1 = _mm_unpacklo_epi32(temp2, temp4); //e4 f4 g4 h4 e5 f5 g5 h5
+ resq_r7_1 = _mm_unpackhi_epi32(temp2, temp4); //e6 f6 g6 h6 e7 f7 g7 h7
+ /*
+ * a0 b0 c0 d0 a1 b1 c1 d1
+ * a2 b2 c2 d2 a3 b3 c3 d3
+ * a4 b4 c4 d4 a5 b5 c5 d5
+ * a6 b6 c6 d6 a7 b7 c7 d7
+ * e0 f0 g0 h0 e1 f1 g1 h1
+ * e2 f2 g2 h2 e3 f3 g3 h3
+ * e4 f4 g4 h4 e5 f5 g5 h5
+ * e6 f6 g6 h6 e7 f7 g7 h7
+ */
+ resq_r0_2 = _mm_unpacklo_epi64(resq_r0_1, resq_r4_1); //a0 b0 c0 d0 e0 f0 g0 h0
+ resq_r1_2 = _mm_unpackhi_epi64(resq_r0_1, resq_r4_1); //a1 b1 c1 d1 e1 f1 g1 h1
+ resq_r2_2 = _mm_unpacklo_epi64(resq_r1_1, resq_r5_1); //a2 b2 c2 d2 e2 f2 g2 h2
+ resq_r3_2 = _mm_unpackhi_epi64(resq_r1_1, resq_r5_1); //a3 b3 c3 d3 e3 f3 g3 h3
+ resq_r4_2 = _mm_unpacklo_epi64(resq_r2_1, resq_r6_1); //a4 b4 c4 d4 e4 f4 g4 h4
+ resq_r5_2 = _mm_unpackhi_epi64(resq_r2_1, resq_r6_1); //a5 b5 c5 d5 e5 f5 g5 h5
+ resq_r6_2 = _mm_unpacklo_epi64(resq_r3_1, resq_r7_1); //a6 b6 c6 d6 e6 f6 g6 h6
+ resq_r7_2 = _mm_unpackhi_epi64(resq_r3_1, resq_r7_1); //a7 b7 c7 d7 e7 f7 g7 h7
+
+ sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r1_2);
+ resq_r1_1 = _mm_unpacklo_epi16(resq_r1_2, sign_reg); //a1 b1 c1 d1 -- 32 bit
+ resq_r1_2 = _mm_unpackhi_epi16(resq_r1_2, sign_reg); //e1 f1 g1 h1 -- 32 bit
+ sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r3_2);
+ resq_r3_1 = _mm_unpacklo_epi16(resq_r3_2, sign_reg); //a3 b3 c3 d3 -- 32 bit
+ resq_r3_2 = _mm_unpackhi_epi16(resq_r3_2, sign_reg); //e3 f3 g3 h3 -- 32 bit
+ sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r5_2);
+ resq_r5_1 = _mm_unpacklo_epi16(resq_r5_2, sign_reg); //a5 b5 c5 d5 -- 32 bit
+ resq_r5_2 = _mm_unpackhi_epi16(resq_r5_2, sign_reg); //e5 f5 g5 h5 -- 32 bit
+ sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r7_2);
+ resq_r7_1 = _mm_unpacklo_epi16(resq_r7_2, sign_reg); //a7 b7 c7 d7 -- 32 bit
+ resq_r7_2 = _mm_unpackhi_epi16(resq_r7_2, sign_reg); //e7 f7 g7 h7 -- 32 bit
+ //Transform starts -- horizontal transform
+ /*------------------------------------------------------------------*/
+ /* y0 = w0 + w4 */
+ temp1 = _mm_add_epi16(resq_r0_2, resq_r4_2);
+ /* y2 = w0 - w4 */
+ temp3 = _mm_sub_epi16(resq_r0_2, resq_r4_2);
+ /* y1 = -w3 + w5 - w7 - (w7 >> 1) */
+ temp2 = _mm_sub_epi32(resq_r5_1, resq_r3_1); //-w3+w5
+ temp10 = _mm_sub_epi32(resq_r5_2, resq_r3_2);
+ temp4 = _mm_sub_epi32(temp2, resq_r7_1); //-w3+w5-w7
+ temp12 = _mm_sub_epi32(temp10, resq_r7_2);
+ temp5 = _mm_srai_epi32(resq_r7_1, 1); //w7>>1
+ temp13 = _mm_srai_epi32(resq_r7_2, 1);
+ temp2 = _mm_sub_epi32(temp4, temp5); //-w3+w5-w7 -(w7>>1)
+ temp10 = _mm_sub_epi32(temp12, temp13);
+ temp2 = _mm_packs_epi32(temp2, temp10);
+ /* y3 = w1 + w7 - w3 - (w3 >> 1) */
+ temp4 = _mm_add_epi32(resq_r1_1, resq_r7_1); //w1+w7
+ temp12 = _mm_add_epi32(resq_r1_2, resq_r7_2);
+ temp4 = _mm_sub_epi32(temp4, resq_r3_1); //w1+w7-w3
+ temp12 = _mm_sub_epi32(temp12, resq_r3_2);
+ temp5 = _mm_srai_epi32(resq_r3_1, 1); //w3>>1
+ temp13 = _mm_srai_epi32(resq_r3_2, 1);
+ temp4 = _mm_sub_epi32(temp4, temp5); //w1+w7-w3-(w3>>1)
+ temp12 = _mm_sub_epi32(temp12, temp13);
+ temp4 = _mm_packs_epi32(temp4, temp12);
+ /* y4 = (w2 >> 1) - w6 */
+ temp5 = _mm_srai_epi16(resq_r2_2, 1); //w2>>1
+ temp5 = _mm_sub_epi16(temp5, resq_r6_2); //(w2>>1)-w6
+ /* y5 = -w1 + w7 + w5 + (w5 >> 1) */
+ temp6 = _mm_sub_epi32(resq_r7_1, resq_r1_1); //w7-w1
+ temp14 = _mm_sub_epi32(resq_r7_2, resq_r1_2);
+ temp6 = _mm_add_epi32(temp6, resq_r5_1); //w7-w1+w5
+ temp14 = _mm_add_epi32(temp14, resq_r5_2);
+ temp7 = _mm_srai_epi32(resq_r5_1, 1); //w5>>1
+ temp15 = _mm_srai_epi32(resq_r5_2, 1);
+ temp6 = _mm_add_epi32(temp6, temp7); //w7-w1+w5+(w5>>1)
+ temp14 = _mm_add_epi32(temp14, temp15);
+ temp6 = _mm_packs_epi32(temp6, temp14);
+ /* y6 = w2 + (w6 >> 1) */
+ temp7 = _mm_srai_epi16(resq_r6_2, 1); //w6>>1
+ temp7 = _mm_add_epi16(temp7, resq_r2_2); //(w6>>1)+w2
+ /* y7 = w3 + w5 + w1 + (w1 >> 1) */
+ temp8 = _mm_add_epi32(resq_r3_1, resq_r5_1); //w3+w5
+ temp16 = _mm_add_epi32(resq_r3_2, resq_r5_2);
+ temp8 = _mm_add_epi32(temp8, resq_r1_1); //w3+w5+w1
+ temp16 = _mm_add_epi32(temp16, resq_r1_2);
+ temp17 = _mm_srai_epi32(resq_r1_1, 1); //w1>>1
+ temp18 = _mm_srai_epi32(resq_r1_2, 1);
+ temp8 = _mm_add_epi32(temp8, temp17); //w3+w5+w1+(w1>>1)
+ temp16 = _mm_add_epi32(temp16, temp18);
+ temp8 = _mm_packs_epi32(temp8, temp16);
+ /*------------------------------------------------------------------*/
+ /*------------------------------------------------------------------*/
+ /* z0 = y0 + y6 */
+ resq_r0_1 = _mm_add_epi16(temp1, temp7);
+ /* z1 = y1 + (y7 >> 2) */
+ resq_r1_1 = _mm_srai_epi16(temp8, 2);
+ resq_r1_1 = _mm_add_epi16(resq_r1_1, temp2);
+ /* z2 = y2 + y4 */
+ resq_r2_1 = _mm_add_epi16(temp3, temp5);
+ /* z3 = y3 + (y5 >> 2) */
+ resq_r3_1 = _mm_srai_epi16(temp6, 2);
+ resq_r3_1 = _mm_add_epi16(resq_r3_1, temp4);
+ /* z4 = y2 - y4 */
+ resq_r4_1 = _mm_sub_epi16(temp3, temp5);
+ /* z5 = (y3 >> 2) - y5 */
+ resq_r5_1 = _mm_srai_epi16(temp4, 2);
+ resq_r5_1 = _mm_sub_epi16(resq_r5_1, temp6);
+ /* z6 = y0 - y6 */
+ resq_r6_1 = _mm_sub_epi16(temp1, temp7);
+ /* z7 = y7 - (y1 >> 2) */
+ resq_r7_1 = _mm_srai_epi16(temp2, 2);
+ resq_r7_1 = _mm_sub_epi16(temp8, resq_r7_1);
+ /*------------------------------------------------------------------*/
+ /*------------------------------------------------------------------*/
+ /* x0 = z0 + z7 */
+ temp1 = _mm_add_epi16(resq_r0_1, resq_r7_1);
+ /* x1 = z2 + z5 */
+ temp2 = _mm_add_epi16(resq_r2_1, resq_r5_1);
+ /* x2 = z4 + z3 */
+ temp3 = _mm_add_epi16(resq_r4_1, resq_r3_1);
+ /* x3 = z6 + z1 */
+ temp4 = _mm_add_epi16(resq_r6_1, resq_r1_1);
+ /* x4 = z6 - z1 */
+ temp5 = _mm_sub_epi16(resq_r6_1, resq_r1_1);
+ /* x5 = z4 - z3 */
+ temp6 = _mm_sub_epi16(resq_r4_1, resq_r3_1);
+ /* x6 = z2 - z5 */
+ temp7 = _mm_sub_epi16(resq_r2_1, resq_r5_1);
+ /* x7 = z0 - z7 */
+ temp8 = _mm_sub_epi16(resq_r0_1, resq_r7_1);
+ /*------------------------------------------------------------------*/
+ // Matrix transpose
+ /*
+ * a0 b0 c0 d0 e0 f0 g0 h0
+ * a1 b1 c1 d1 e1 f1 g1 h1
+ * a2 b2 c2 d2 e2 f2 g2 h2
+ * a3 b3 c3 d3 e3 f3 g3 h3
+ */
+ temp17 = _mm_unpacklo_epi16(temp1, temp2); //a0 a1 b0 b1 c0 c1 d0 d1
+ temp19 = _mm_unpacklo_epi16(temp3, temp4); //a2 a3 b2 b3 c2 c3 d2 d3
+ temp18 = _mm_unpackhi_epi16(temp1, temp2); //e0 e1 f0 f1 g0 g1 h0 h1
+ temp20 = _mm_unpackhi_epi16(temp3, temp4); //e2 e3 f2 f3 g2 g3 h2 h3
+
+ resq_r0_1 = _mm_unpacklo_epi32(temp17, temp19); //a0 a1 a2 a3 b0 b1 b2 b3
+ resq_r1_1 = _mm_unpackhi_epi32(temp17, temp19); //c0 c1 c2 c3 d0 d1 d2 d3
+ resq_r2_1 = _mm_unpacklo_epi32(temp18, temp20); //e0 e1 e2 e3 f0 f1 f2 f3
+ resq_r3_1 = _mm_unpackhi_epi32(temp18, temp20); //g0 g1 g2 g3 h0 h1 h2 h3
+ /*
+ * a4 b4 c4 d4 e4 f4 g4 h4
+ * a5 b5 c5 d5 e5 f5 g5 h5
+ * a6 b6 c6 d6 e6 f6 g6 h6
+ * a7 b7 c7 d7 e7 f7 g7 h7
+ */
+ temp17 = _mm_unpacklo_epi16(temp5, temp6); //a4 a5 b4 b5 c4 c5 d4 d5
+ temp19 = _mm_unpacklo_epi16(temp7, temp8); //a6 a7 b6 b7 c6 c7 d6 d7
+ temp18 = _mm_unpackhi_epi16(temp5, temp6); //e4 e5 f4 f5 g4 g5 h4 h5
+ temp20 = _mm_unpackhi_epi16(temp7, temp8); //e6 e7 f6 f7 g6 g7 h6 h7
+
+ resq_r4_1 = _mm_unpacklo_epi32(temp17, temp19); //a4 a5 a6 a7 b4 b5 b6 b7
+ resq_r5_1 = _mm_unpackhi_epi32(temp17, temp19); //c4 c5 c6 c7 d4 d5 d6 d7
+ resq_r6_1 = _mm_unpacklo_epi32(temp18, temp20); //e4 e5 e6 e7 f4 f5 f6 f7
+ resq_r7_1 = _mm_unpackhi_epi32(temp18, temp20); //g4 g5 g6 g7 h4 h5 h6 h7
+ /* a0 a1 a2 a3 b0 b1 b2 b3
+ * c0 c1 c2 c3 d0 d1 d2 d3
+ * e0 e1 e2 e3 f0 f1 f2 f3
+ * g0 g1 g2 g3 h0 h1 h2 h3
+ * a4 a5 a6 a7 b4 b5 b6 b7
+ * c4 c5 c6 c7 d4 d5 d6 d7
+ * e4 e5 e6 e7 f4 f5 f6 f7
+ * g4 g5 g6 g7 h4 h5 h6 h7
+ */
+ resq_r0_2 = _mm_unpacklo_epi64(resq_r0_1, resq_r4_1); //a0 a1 a2 a3 a4 a5 a6 a7
+ resq_r1_2 = _mm_unpackhi_epi64(resq_r0_1, resq_r4_1); //b0 b1 b2 b3 b4 b5 b6 b7
+ resq_r2_2 = _mm_unpacklo_epi64(resq_r1_1, resq_r5_1); //c0 c1 c2 c3 c4 c5 c6 c7
+ resq_r3_2 = _mm_unpackhi_epi64(resq_r1_1, resq_r5_1); //d0 d1 d2 d3 d4 d5 d6 d7
+ resq_r4_2 = _mm_unpacklo_epi64(resq_r2_1, resq_r6_1); //e0 e1 e2 e3 e4 e5 e6 e7
+ resq_r5_2 = _mm_unpackhi_epi64(resq_r2_1, resq_r6_1); //f0 f1 f2 f3 f4 f5 f6 f7
+ resq_r6_2 = _mm_unpacklo_epi64(resq_r3_1, resq_r7_1); //g0 g1 g2 g3 g4 g5 g6 g7
+ resq_r7_2 = _mm_unpackhi_epi64(resq_r3_1, resq_r7_1); //h0 h1 h2 h3 h4 h5 h6 h7
+
+ sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r1_2);
+ resq_r1_1 = _mm_unpacklo_epi16(resq_r1_2, sign_reg); //a1 b1 c1 d1 -- 32 bit
+ resq_r1_2 = _mm_unpackhi_epi16(resq_r1_2, sign_reg); //e1 f1 g1 h1 -- 32 bit
+ sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r3_2);
+ resq_r3_1 = _mm_unpacklo_epi16(resq_r3_2, sign_reg); //a3 b3 c3 d3 -- 32 bit
+ resq_r3_2 = _mm_unpackhi_epi16(resq_r3_2, sign_reg); //e3 f3 g3 h3 -- 32 bit
+ sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r5_2);
+ resq_r5_1 = _mm_unpacklo_epi16(resq_r5_2, sign_reg); //a5 b5 c5 d5 -- 32 bit
+ resq_r5_2 = _mm_unpackhi_epi16(resq_r5_2, sign_reg); //e5 f5 g5 h5 -- 32 bit
+ sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r7_2);
+ resq_r7_1 = _mm_unpacklo_epi16(resq_r7_2, sign_reg); //a7 b7 c7 d7 -- 32 bit
+ resq_r7_2 = _mm_unpackhi_epi16(resq_r7_2, sign_reg); //e7 f7 g7 h7 -- 32 bit
+
+ zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
+ //Load pred buffer row 0
+ predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r0_1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
+ //Load pred buffer row 1
+ predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r1_1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
+ //Load pred buffer row 2
+ predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r2_1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
+ //Load pred buffer row 3
+ predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r3_1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
+ //Load pred buffer row 4
+ predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[4 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r4_1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
+ //Load pred buffer row 5
+ predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[5 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r5_1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
+ //Load pred buffer row 6
+ predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[6 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r6_1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
+ //Load pred buffer row 7
+ predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[7 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r7_1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
+
+ /*--------------------------------------------------------------------*/
+ /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
+ /* */
+ /* Add the prediction and store it back to reconstructed frame buffer */
+ /* [Prediction buffer itself in this case] */
+ /*--------------------------------------------------------------------*/
+
+ /* y0j = w0j + w4j */
+ temp1 = _mm_add_epi16(resq_r0_2, resq_r4_2);
+ /* y2j = w0j - w4j */
+ temp3 = _mm_sub_epi16(resq_r0_2, resq_r4_2);
+ /* y1j = -w3j + w5j - w7j - (w7j >> 1) */
+ temp2 = _mm_sub_epi32(resq_r5_1, resq_r3_1); //-w3+w5
+ temp10 = _mm_sub_epi32(resq_r5_2, resq_r3_2);
+ temp4 = _mm_sub_epi32(temp2, resq_r7_1); //-w3+w5-w7
+ temp12 = _mm_sub_epi32(temp10, resq_r7_2);
+ temp5 = _mm_srai_epi32(resq_r7_1, 1); //w7>>1
+ temp13 = _mm_srai_epi32(resq_r7_2, 1);
+ temp2 = _mm_sub_epi32(temp4, temp5); //-w3+w5-w7 -(w7>>1)
+ temp10 = _mm_sub_epi32(temp12, temp13);
+ temp2 = _mm_packs_epi32(temp2, temp10);
+ /* y3j = w1j + w7j - w3j - (w3j >> 1) */
+ temp4 = _mm_add_epi32(resq_r1_1, resq_r7_1); //w1+w7
+ temp12 = _mm_add_epi32(resq_r1_2, resq_r7_2);
+ temp4 = _mm_sub_epi32(temp4, resq_r3_1); //w1+w7-w3
+ temp12 = _mm_sub_epi32(temp12, resq_r3_2);
+ temp5 = _mm_srai_epi32(resq_r3_1, 1); //w3>>1
+ temp13 = _mm_srai_epi32(resq_r3_2, 1);
+ temp4 = _mm_sub_epi32(temp4, temp5); //w1+w7-w3-(w3>>1)
+ temp12 = _mm_sub_epi32(temp12, temp13);
+ temp4 = _mm_packs_epi32(temp4, temp12);
+ /* y4j = (w2j >> 1) - w6j */
+ temp5 = _mm_srai_epi16(resq_r2_2, 1); //w2>>1
+ temp5 = _mm_sub_epi16(temp5, resq_r6_2); //(w2>>1)-w6
+ /* y5j = -w1j + w7j + w5j + (w5j >> 1) */
+ temp6 = _mm_sub_epi32(resq_r7_1, resq_r1_1); //w7-w1
+ temp14 = _mm_sub_epi32(resq_r7_2, resq_r1_2);
+ temp6 = _mm_add_epi32(temp6, resq_r5_1); //w7-w1+w5
+ temp14 = _mm_add_epi32(temp14, resq_r5_2);
+ temp7 = _mm_srai_epi32(resq_r5_1, 1); //w5>>1
+ temp15 = _mm_srai_epi32(resq_r5_2, 1);
+ temp6 = _mm_add_epi32(temp6, temp7); //w7-w1+w5+(w5>>1)
+ temp14 = _mm_add_epi32(temp14, temp15);
+ temp6 = _mm_packs_epi32(temp6, temp14);
+ /* y6j = w2j + (w6j >> 1) */
+ temp7 = _mm_srai_epi16(resq_r6_2, 1); //w6>>1
+ temp7 = _mm_add_epi16(temp7, resq_r2_2); //(w6>>1)+w2
+ /* y7j = w3j + w5j + w1j + (w1j >> 1) */
+ temp8 = _mm_add_epi32(resq_r3_1, resq_r5_1); //w3+w5
+ temp16 = _mm_add_epi32(resq_r3_2, resq_r5_2);
+ temp8 = _mm_add_epi32(temp8, resq_r1_1); //w3+w5+w1
+ temp16 = _mm_add_epi32(temp16, resq_r1_2);
+ temp17 = _mm_srai_epi32(resq_r1_1, 1); //w1>>1
+ temp18 = _mm_srai_epi32(resq_r1_2, 1);
+ temp8 = _mm_add_epi32(temp8, temp17); //w3+w5+w1+(w1>>1)
+ temp16 = _mm_add_epi32(temp16, temp18);
+ temp8 = _mm_packs_epi32(temp8, temp16);
+ /*------------------------------------------------------------------*/
+ /*------------------------------------------------------------------*/
+ /* z0j = y0j + y6j */
+ resq_r0_1 = _mm_add_epi16(temp1, temp7);
+ /* z1j = y1j + (y7j >> 2) */
+ resq_r1_1 = _mm_srai_epi16(temp8, 2);
+ resq_r1_1 = _mm_add_epi16(resq_r1_1, temp2);
+ /* z2j = y2j + y4j */
+ resq_r2_1 = _mm_add_epi16(temp3, temp5);
+ /* z3j = y3j + (y5j >> 2) */
+ resq_r3_1 = _mm_srai_epi16(temp6, 2);
+ resq_r3_1 = _mm_add_epi16(resq_r3_1, temp4);
+ /* z4j = y2j - y4j */
+ resq_r4_1 = _mm_sub_epi16(temp3, temp5);
+ /* z5j = (y3j >> 2) - y5j */
+ resq_r5_1 = _mm_srai_epi16(temp4, 2);
+ resq_r5_1 = _mm_sub_epi16(resq_r5_1, temp6);
+ /* z6j = y0j - y6j */
+ resq_r6_1 = _mm_sub_epi16(temp1, temp7);
+ /* z7j = y7j - (y1j >> 2) */
+ resq_r7_1 = _mm_srai_epi16(temp2, 2);
+ resq_r7_1 = _mm_sub_epi16(temp8, resq_r7_1);
+ /*------------------------------------------------------------------*/
+
+ /*------------------------------------------------------------------*/
+ /* x0j = z0j + z7j */
+ temp1 = _mm_add_epi16(resq_r0_1, resq_r7_1);
+ sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp1);
+ temp10 = _mm_unpacklo_epi16(temp1, sign_reg);
+ temp11 = _mm_unpackhi_epi16(temp1, sign_reg);
+ temp10 = _mm_add_epi32(temp10, value_32);
+ temp11 = _mm_add_epi32(temp11, value_32);
+ temp10 = _mm_srai_epi32(temp10, 6);
+ temp11 = _mm_srai_epi32(temp11, 6);
+ temp10 = _mm_packs_epi32(temp10, temp11);
+ temp1 = _mm_add_epi16(temp10, pred_r0_1);
+ /* x1j = z2j + z5j */
+ temp2 = _mm_add_epi16(resq_r2_1, resq_r5_1);
+ sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp2);
+ temp10 = _mm_unpacklo_epi16(temp2, sign_reg);
+ temp11 = _mm_unpackhi_epi16(temp2, sign_reg);
+ temp10 = _mm_add_epi32(temp10, value_32);
+ temp11 = _mm_add_epi32(temp11, value_32);
+ temp10 = _mm_srai_epi32(temp10, 6);
+ temp11 = _mm_srai_epi32(temp11, 6);
+ temp10 = _mm_packs_epi32(temp10, temp11);
+ temp2 = _mm_add_epi16(temp10, pred_r1_1);
+ /* x2j = z4j + z3j */
+ temp3 = _mm_add_epi16(resq_r4_1, resq_r3_1);
+ sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp3);
+ temp10 = _mm_unpacklo_epi16(temp3, sign_reg);
+ temp11 = _mm_unpackhi_epi16(temp3, sign_reg);
+ temp10 = _mm_add_epi32(temp10, value_32);
+ temp11 = _mm_add_epi32(temp11, value_32);
+ temp10 = _mm_srai_epi32(temp10, 6);
+ temp11 = _mm_srai_epi32(temp11, 6);
+ temp10 = _mm_packs_epi32(temp10, temp11);
+ temp3 = _mm_add_epi16(temp10, pred_r2_1);
+ /* x3j = z6j + z1j */
+ temp4 = _mm_add_epi16(resq_r6_1, resq_r1_1);
+ sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp4);
+ temp10 = _mm_unpacklo_epi16(temp4, sign_reg);
+ temp11 = _mm_unpackhi_epi16(temp4, sign_reg);
+ temp10 = _mm_add_epi32(temp10, value_32);
+ temp11 = _mm_add_epi32(temp11, value_32);
+ temp10 = _mm_srai_epi32(temp10, 6);
+ temp11 = _mm_srai_epi32(temp11, 6);
+ temp10 = _mm_packs_epi32(temp10, temp11);
+ temp4 = _mm_add_epi16(temp10, pred_r3_1);
+ /* x4j = z6j - z1j */
+ temp5 = _mm_sub_epi16(resq_r6_1, resq_r1_1);
+ sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp5);
+ temp10 = _mm_unpacklo_epi16(temp5, sign_reg);
+ temp11 = _mm_unpackhi_epi16(temp5, sign_reg);
+ temp10 = _mm_add_epi32(temp10, value_32);
+ temp11 = _mm_add_epi32(temp11, value_32);
+ temp10 = _mm_srai_epi32(temp10, 6);
+ temp11 = _mm_srai_epi32(temp11, 6);
+ temp10 = _mm_packs_epi32(temp10, temp11);
+ temp5 = _mm_add_epi16(temp10, pred_r4_1);
+ /* x5j = z4j - z3j */
+ temp6 = _mm_sub_epi16(resq_r4_1, resq_r3_1);
+ sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp6);
+ temp10 = _mm_unpacklo_epi16(temp6, sign_reg);
+ temp11 = _mm_unpackhi_epi16(temp6, sign_reg);
+ temp10 = _mm_add_epi32(temp10, value_32);
+ temp11 = _mm_add_epi32(temp11, value_32);
+ temp10 = _mm_srai_epi32(temp10, 6);
+ temp11 = _mm_srai_epi32(temp11, 6);
+ temp10 = _mm_packs_epi32(temp10, temp11);
+ temp6 = _mm_add_epi16(temp10, pred_r5_1);
+ /* x6j = z2j - z5j */
+ temp7 = _mm_sub_epi16(resq_r2_1, resq_r5_1);
+ sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp7);
+ temp10 = _mm_unpacklo_epi16(temp7, sign_reg);
+ temp11 = _mm_unpackhi_epi16(temp7, sign_reg);
+ temp10 = _mm_add_epi32(temp10, value_32);
+ temp11 = _mm_add_epi32(temp11, value_32);
+ temp10 = _mm_srai_epi32(temp10, 6);
+ temp11 = _mm_srai_epi32(temp11, 6);
+ temp10 = _mm_packs_epi32(temp10, temp11);
+ temp7 = _mm_add_epi16(temp10, pred_r6_1);
+ /* x7j = z0j - z7j */
+ temp8 = _mm_sub_epi16(resq_r0_1, resq_r7_1);
+ sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp8);
+ temp10 = _mm_unpacklo_epi16(temp8, sign_reg);
+ temp11 = _mm_unpackhi_epi16(temp8, sign_reg);
+ temp10 = _mm_add_epi32(temp10, value_32);
+ temp11 = _mm_add_epi32(temp11, value_32);
+ temp10 = _mm_srai_epi32(temp10, 6);
+ temp11 = _mm_srai_epi32(temp11, 6);
+ temp10 = _mm_packs_epi32(temp10, temp11);
+ temp8 = _mm_add_epi16(temp10, pred_r7_1);
+ /*------------------------------------------------------------------*/
+ //Clipping the results to 8 bits
+ sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b); // sign check
+ temp1 = _mm_and_si128(temp1, sign_reg);
+ sign_reg = _mm_cmpgt_epi16(temp2, zero_8x16b); // sign check
+ temp2 = _mm_and_si128(temp2, sign_reg);
+ sign_reg = _mm_cmpgt_epi16(temp3, zero_8x16b); // sign check
+ temp3 = _mm_and_si128(temp3, sign_reg);
+ sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b); // sign check
+ temp4 = _mm_and_si128(temp4, sign_reg);
+ sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b); // sign check
+ temp5 = _mm_and_si128(temp5, sign_reg);
+ sign_reg = _mm_cmpgt_epi16(temp6, zero_8x16b); // sign check
+ temp6 = _mm_and_si128(temp6, sign_reg);
+ sign_reg = _mm_cmpgt_epi16(temp7, zero_8x16b); // sign check
+ temp7 = _mm_and_si128(temp7, sign_reg);
+ sign_reg = _mm_cmpgt_epi16(temp8, zero_8x16b); // sign check
+ temp8 = _mm_and_si128(temp8, sign_reg);
+
+ resq_r0_2 = _mm_packus_epi16(temp1, zero_8x16b);
+ resq_r1_2 = _mm_packus_epi16(temp2, zero_8x16b);
+ resq_r2_2 = _mm_packus_epi16(temp3, zero_8x16b);
+ resq_r3_2 = _mm_packus_epi16(temp4, zero_8x16b);
+ resq_r4_2 = _mm_packus_epi16(temp5, zero_8x16b);
+ resq_r5_2 = _mm_packus_epi16(temp6, zero_8x16b);
+ resq_r6_2 = _mm_packus_epi16(temp7, zero_8x16b);
+ resq_r7_2 = _mm_packus_epi16(temp8, zero_8x16b);
+
+ _mm_storel_epi64((__m128i *) (&pu1_out[0]), resq_r0_2);
+ _mm_storel_epi64((__m128i *) (&pu1_out[out_strd]), resq_r1_2);
+ _mm_storel_epi64((__m128i *) (&pu1_out[2 * out_strd]), resq_r2_2);
+ _mm_storel_epi64((__m128i *) (&pu1_out[3 * out_strd]), resq_r3_2);
+ _mm_storel_epi64((__m128i *) (&pu1_out[4 * out_strd]), resq_r4_2);
+ _mm_storel_epi64((__m128i *) (&pu1_out[5 * out_strd]), resq_r5_2);
+ _mm_storel_epi64((__m128i *) (&pu1_out[6 * out_strd]), resq_r6_2);
+ _mm_storel_epi64((__m128i *) (&pu1_out[7 * out_strd]), resq_r7_2);
+}
+
diff --git a/common/x86/ih264_luma_intra_pred_filters_ssse3.c b/common/x86/ih264_luma_intra_pred_filters_ssse3.c
new file mode 100755
index 0000000..5a35372
--- /dev/null
+++ b/common/x86/ih264_luma_intra_pred_filters_ssse3.c
@@ -0,0 +1,2282 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_luma_intra_pred_filters_ssse3.c
+ *
+ * @brief
+ * Contains function definitions for luma intra prediction filters in x86
+ * intrinsics
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ * - ih264_intra_pred_luma_4x4_mode_vert_ssse3
+ * - ih264_intra_pred_luma_4x4_mode_horz_ssse3
+ * - ih264_intra_pred_luma_4x4_mode_dc_ssse3
+ * - ih264_intra_pred_luma_4x4_mode_diag_dl_ssse3
+ * - ih264_intra_pred_luma_4x4_mode_diag_dr_ssse3
+ * - ih264_intra_pred_luma_4x4_mode_vert_r_ssse3
+ * - ih264_intra_pred_luma_4x4_mode_horz_d_ssse3
+ * - ih264_intra_pred_luma_4x4_mode_vert_l_ssse3
+ * - ih264_intra_pred_luma_4x4_mode_horz_u_ssse3
+ * - ih264_intra_pred_luma_8x8_mode_vert_ssse3
+ * - ih264_intra_pred_luma_8x8_mode_horz_ssse3
+ * - ih264_intra_pred_luma_8x8_mode_dc_ssse3
+ * - ih264_intra_pred_luma_8x8_mode_diag_dl_ssse3
+ * - ih264_intra_pred_luma_8x8_mode_diag_dr_ssse3
+ * - ih264_intra_pred_luma_8x8_mode_vert_r_ssse3
+ * - ih264_intra_pred_luma_8x8_mode_horz_d_ssse3
+ * - ih264_intra_pred_luma_8x8_mode_vert_l_ssse3
+ * - ih264_intra_pred_luma_8x8_mode_horz_u_ssse3
+ * - ih264_intra_pred_luma_16x16_mode_vert_ssse3
+ * - ih264_intra_pred_luma_16x16_mode_horz_ssse3
+ * - ih264_intra_pred_luma_16x16_mode_dc_ssse3
+ * - ih264_intra_pred_luma_16x16_mode_plane_ssse3
+ *
+ * @remarks
+ * None
+ *
+ ******************************************************************************
+ */
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+/* System include files */
+#include <stdio.h>
+#include <stddef.h>
+#include <string.h>
+#include <immintrin.h>
+
+/* User include files */
+#include "ih264_defs.h"
+#include "ih264_typedefs.h"
+#include "ih264_macros.h"
+#include "ih264_platform_macros.h"
+#include "ih264_intra_pred_filters.h"
+
+
+
+/******************* LUMA INTRAPREDICTION *******************/
+
+/******************* 4x4 Modes *******************/
+
+/**
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_4x4_mode_vert_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_4x4 mode:vertical
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_4x4 mode:vertical ,described in sec 8.3.1.2.1
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+void ih264_intra_pred_luma_4x4_mode_vert_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ngbr_avail)
+{
+ UWORD8 *pu1_top;
+ WORD32 dst_strd2, dst_strd3;
+
+ __m128i top_16x8b;
+ __m128i mask_full_128b, mask_low_32b;
+
+ UNUSED(src_strd);
+ UNUSED(ngbr_avail);
+
+ mask_full_128b = _mm_set1_epi8(0xff);
+
+ pu1_top = pu1_src + BLK_SIZE + 1; // top neighbours start after the left samples and top-left in pu1_src
+
+ mask_low_32b = _mm_srli_si128(mask_full_128b, 12); // byte mask 0x000...0ffffffff: store only the low 4 bytes
+
+ top_16x8b = _mm_loadl_epi64((__m128i *)pu1_top); // loads t0..t7; only t0..t3 are used
+
+ dst_strd2 = dst_strd << 1;
+ dst_strd3 = dst_strd + dst_strd2;
+
+ _mm_maskmoveu_si128(top_16x8b, mask_low_32b, (char*)pu1_dst); // each row is a copy of t0..t3 (vertical prediction)
+ _mm_maskmoveu_si128(top_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
+ _mm_maskmoveu_si128(top_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
+ _mm_maskmoveu_si128(top_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+}
+
+/**
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_4x4_mode_horz_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_4x4 mode:horizontal
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_4x4 mode:horizontal ,described in sec 8.3.1.2.2
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+void ih264_intra_pred_luma_4x4_mode_horz_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ngbr_avail)
+{
+ UWORD8 *pu1_left;
+ WORD32 dst_strd2, dst_strd3;
+ WORD32 val1, val2;
+
+ __m128i left_16x8b;
+ __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
+ __m128i mask_full_128b, mask_low_32b;
+
+ UNUSED(src_strd);
+ UNUSED(ngbr_avail);
+
+ mask_full_128b = _mm_set1_epi8(0xff);
+
+ pu1_left = pu1_src + BLK_SIZE - 1; // pu1_left points at l0; left samples are stored in reverse (l3 at pu1_left-3)
+
+ mask_low_32b = _mm_srli_si128(mask_full_128b, 12); // store mask for the low 4 bytes only
+ left_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 3)); // bytes: l3 l2 l1 l0 ...
+
+ val1 = _mm_extract_epi16(left_16x8b, 1); // 16-bit lane holding (l1 | l0<<8)
+ val2 = _mm_extract_epi16(left_16x8b, 0); // 16-bit lane holding (l3 | l2<<8)
+
+ row1_16x8b = _mm_set1_epi8(val1 >> 8); // row 0 = l0 replicated
+ row2_16x8b = _mm_set1_epi8(val1 & 0xff); // row 1 = l1 replicated
+ row3_16x8b = _mm_set1_epi8(val2 >> 8); // row 2 = l2 replicated
+ row4_16x8b = _mm_set1_epi8(val2 & 0xff); // row 3 = l3 replicated
+
+ dst_strd2 = dst_strd << 1;
+ dst_strd3 = dst_strd + dst_strd2;
+
+ _mm_maskmoveu_si128(row1_16x8b, mask_low_32b, (char*)pu1_dst);
+ _mm_maskmoveu_si128(row2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
+ _mm_maskmoveu_si128(row3_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
+ _mm_maskmoveu_si128(row4_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+}
+
+/**
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_4x4_mode_dc_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_4x4 mode:DC
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_4x4 mode:DC ,described in sec 8.3.1.2.3
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_4x4_mode_dc_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ngbr_avail)
+{
+ UWORD8 u1_useleft; /* availability of left predictors (only for DC) */
+ UWORD8 u1_usetop; /* availability of top predictors (only for DC) */
+ UWORD8 *pu1_left, *pu1_top;
+ WORD32 dc_val, flag;
+ WORD32 dst_strd2, dst_strd3;
+
+ __m128i mask_full_128b, mask_low_32b;
+ __m128i dcval_16x8b;
+
+ UNUSED(src_strd);
+ UNUSED(ngbr_avail); // NOTE(review): misleading - ngbr_avail IS read just below; the (void) cast is harmless but should be removed
+
+ mask_full_128b = _mm_set1_epi8(0xff);
+
+ u1_useleft = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK);
+ u1_usetop = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK);
+
+ pu1_left = pu1_src + BLK_SIZE - 1; // left samples stored in reverse order ending at pu1_left
+ pu1_top = pu1_src + BLK_SIZE + 1; // top samples t0..t3 follow the top-left sample
+
+ mask_low_32b = _mm_srli_si128(mask_full_128b, 12); // store mask for the low 4 bytes only
+
+ flag = u1_useleft + u1_usetop; // number of available neighbour sides (0, 1 or 2)
+
+ if(flag)
+ {
+ WORD32 shft, ofst = 0;
+
+ __m128i left_16x8b, top_16x8b, val_16x8b, tmp_8x16b, zero_vector;
+
+ if(u1_useleft)
+ {
+ left_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 3)); // l3 l2 l1 l0 ...
+ ofst += 2; // rounding offset contribution for 4 left samples
+ }
+ else
+ left_16x8b = _mm_setzero_si128();
+
+ zero_vector = _mm_setzero_si128();
+
+ if(u1_usetop)
+ {
+ top_16x8b = _mm_loadl_epi64((__m128i *)pu1_top); // t0..t3 in the low 4 bytes
+ ofst += 2; // rounding offset contribution for 4 top samples
+ }
+ else
+ top_16x8b = _mm_setzero_si128();
+
+ shft = flag + 1; // one side: (sum+2)>>2; both sides: (sum+4)>>3
+ val_16x8b = _mm_unpacklo_epi32(left_16x8b, top_16x8b); // pack left dword and top dword into the low 8 bytes
+ tmp_8x16b = _mm_sad_epu8(val_16x8b, zero_vector); // horizontal sum of the 8 bytes (absent side contributes 0)
+
+ dc_val = _mm_extract_epi16(tmp_8x16b, 0);
+ dc_val = (dc_val + ofst) >> shft; // rounded average of the available neighbours
+ }
+ else
+ dc_val = 128; // no neighbours available: mid-grey default per spec
+
+ dst_strd2 = dst_strd << 1;
+ dst_strd3 = dst_strd + dst_strd2;
+
+ dcval_16x8b = _mm_set1_epi8(dc_val);
+
+ _mm_maskmoveu_si128(dcval_16x8b, mask_low_32b, (char*)pu1_dst); // fill all 4 rows with the DC value
+ _mm_maskmoveu_si128(dcval_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
+ _mm_maskmoveu_si128(dcval_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
+ _mm_maskmoveu_si128(dcval_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+}
+
+/**
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_4x4_mode_diag_dl_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left ,described in sec 8.3.1.2.4
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_4x4_mode_diag_dl_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ngbr_avail)
+{
+ UWORD8 *pu1_top;
+ WORD32 dst_strd2, dst_strd3;
+
+ __m128i top_16x8b, top_8x16b, top_sh_8x16b;
+ __m128i res1_8x16b, res2_8x16b, res_16x8b;
+ __m128i zero_vector, const_2_8x16b;
+ __m128i mask_full_128b, mask_low_32b;
+
+ UNUSED(src_strd);
+ UNUSED(ngbr_avail);
+
+ pu1_top = pu1_src + BLK_SIZE + 1; // t0..t7 (8 top neighbours needed for down-left)
+
+ top_16x8b = _mm_loadl_epi64((__m128i *)pu1_top);
+ zero_vector = _mm_setzero_si128();
+ top_8x16b = _mm_unpacklo_epi8(top_16x8b, zero_vector); //t0 t1 t2 t3 t4 t5 t6 t7 widened to 16 bits
+
+ mask_full_128b = _mm_set1_epi8(0xff);
+ top_sh_8x16b = _mm_srli_si128(top_8x16b, 2); //t1 t2 t3 t4 t5 t6 t7 0
+ const_2_8x16b = _mm_set1_epi16(2); // rounding term for the (a+2b+c+2)>>2 filter
+
+ top_sh_8x16b = _mm_shufflehi_epi16(top_sh_8x16b, 0xa4); //t1 t2 t3 t4 t5 t6 t7 t7 (duplicate last sample per spec)
+ res1_8x16b = _mm_add_epi16(top_8x16b, top_sh_8x16b); // pairwise sums t(i)+t(i+1)
+ mask_low_32b = _mm_srli_si128(mask_full_128b, 12); // store mask for the low 4 bytes
+ res2_8x16b = _mm_srli_si128(res1_8x16b, 2); // shifted pair sums t(i+1)+t(i+2)
+
+ res1_8x16b = _mm_add_epi16(res1_8x16b, const_2_8x16b);
+ res1_8x16b = _mm_add_epi16(res2_8x16b, res1_8x16b); // t(i)+2*t(i+1)+t(i+2)+2
+ res1_8x16b = _mm_srai_epi16(res1_8x16b, 2); // 3-tap smoothed diagonal samples
+
+ dst_strd2 = dst_strd << 1;
+ dst_strd3 = dst_strd + dst_strd2;
+
+ res_16x8b = _mm_packus_epi16(res1_8x16b, res1_8x16b);
+ _mm_maskmoveu_si128(res_16x8b, mask_low_32b, (char*)pu1_dst); // each row is the previous row shifted one sample left
+ res_16x8b = _mm_srli_si128(res_16x8b, 1);
+ _mm_maskmoveu_si128(res_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
+ res_16x8b = _mm_srli_si128(res_16x8b, 1);
+ _mm_maskmoveu_si128(res_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
+ res_16x8b = _mm_srli_si128(res_16x8b, 1);
+ _mm_maskmoveu_si128(res_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+}
+
+/**
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_4x4_mode_diag_dr_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right ,described in sec 8.3.1.2.5
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_4x4_mode_diag_dr_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ngbr_avail)
+{
+ UWORD8 *pu1_left;
+ WORD32 dst_strd2, dst_strd3;
+
+ __m128i top_left_16x8b, top_left_8x16b;
+ __m128i top_left_sh_16x8b, top_left_sh_8x16b;
+ __m128i res1_8x16b, res2_8x16b;
+ __m128i res1_16x8b, res2_16x8b;
+ __m128i zero_vector, const_2_8x16b;
+ __m128i mask_full_128b, mask_low_32b;
+
+ UNUSED(src_strd);
+ UNUSED(ngbr_avail);
+
+ pu1_left = pu1_src + BLK_SIZE - 1; // left samples stored in reverse; tl and t0..t3 follow
+
+ top_left_16x8b = _mm_loadu_si128((__m128i *)(pu1_left - 3)); //l3 l2 l1 l0 tl t0 t1 t2...
+ zero_vector = _mm_setzero_si128();
+ top_left_sh_16x8b = _mm_srli_si128(top_left_16x8b, 1); //l2 l1 l0 tl t0 t1 t2 t3...
+
+ top_left_8x16b = _mm_unpacklo_epi8(top_left_16x8b, zero_vector); // widen to 16 bits for the 3-tap filter
+ top_left_sh_8x16b = _mm_unpacklo_epi8(top_left_sh_16x8b, zero_vector);
+
+ mask_full_128b = _mm_set1_epi8(0xff);
+ res1_8x16b = _mm_add_epi16(top_left_8x16b, top_left_sh_8x16b); //l3+l2 l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2+t3...
+ const_2_8x16b = _mm_set1_epi16(2); // rounding term for (a+2b+c+2)>>2
+ res2_8x16b = _mm_srli_si128(res1_8x16b, 2); //l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2+t3...
+
+ res1_8x16b = _mm_add_epi16(res1_8x16b, const_2_8x16b);
+ mask_low_32b = _mm_srli_si128(mask_full_128b, 12); // store mask for the low 4 bytes
+ res1_8x16b = _mm_add_epi16(res2_8x16b, res1_8x16b); //l3+2*l2+l1+2 l2+2*l1+l0+2...
+ res1_8x16b = _mm_srai_epi16(res1_8x16b, 2); // smoothed diagonal: filtered l2 l1 l0 tl t0 t1 t2
+ res1_16x8b = _mm_packus_epi16(res1_8x16b, res1_8x16b);
+
+ dst_strd2 = dst_strd << 1;
+ dst_strd3 = dst_strd + dst_strd2;
+
+ res2_16x8b = _mm_srli_si128(res1_16x8b, 3); // row 0 starts at filtered tl; each lower row shifts one sample right
+ _mm_maskmoveu_si128(res2_16x8b, mask_low_32b, (char*)pu1_dst);
+ res2_16x8b = _mm_srli_si128(res1_16x8b, 2);
+ _mm_maskmoveu_si128(res2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
+ res2_16x8b = _mm_srli_si128(res1_16x8b, 1);
+ _mm_maskmoveu_si128(res2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
+ _mm_maskmoveu_si128(res1_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+}
+
+/**
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_4x4_mode_vert_r_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_4x4 mode:Vertical_Right
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_4x4 mode:Vertical_Right ,described in sec 8.3.1.2.6
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_4x4_mode_vert_r_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ngbr_avail)
+{
+ UWORD8 *pu1_left;
+ WORD32 dst_strd2, dst_strd3;
+
+ __m128i val_16x8b, temp_16x8b;
+ __m128i w11_a1_16x8b, w11_a2_16x8b;
+ __m128i w121_a1_8x16b, w121_a2_8x16b, w121_sh_8x16b;
+ __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
+ __m128i zero_vector, const_2_8x16b;
+ __m128i mask_full_128b, mask_low_32b;
+
+ UNUSED(src_strd);
+ UNUSED(ngbr_avail);
+
+ mask_full_128b = _mm_set1_epi8(0xff);
+ mask_low_32b = _mm_srli_si128(mask_full_128b, 12); // store mask for the low 4 bytes
+
+ pu1_left = pu1_src + BLK_SIZE - 1;
+
+ val_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 2)); // bytes: l2 l1 l0 tl t0 t1 t2 t3
+ zero_vector = _mm_setzero_si128();
+
+ w121_a1_8x16b = _mm_unpacklo_epi8(val_16x8b, zero_vector); //l2 l1 l0 tl t0 t1 t2 t3
+ w11_a1_16x8b = _mm_srli_si128(val_16x8b, 3); // tl t0 t1 t2 t3 (operands for the 2-tap average)
+ w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2); //l1 l0 tl t0 t1 t2 t3 0
+ w11_a2_16x8b = _mm_srli_si128(val_16x8b, 4); // t0 t1 t2 t3
+
+ w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2+t3 t3
+ row1_16x8b = _mm_avg_epu8(w11_a1_16x8b, w11_a2_16x8b); // row 0: (tl+t0+1)>>1, (t0+t1+1)>>1, ...
+ w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2); //l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2+t3 t3 0
+
+ const_2_8x16b = _mm_set1_epi16(2); // rounding term for (a+2b+c+2)>>2
+ w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //l2+2*l1+l0 l1+2*l0+tl ...
+ w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, const_2_8x16b);
+ w121_a1_8x16b = _mm_srai_epi16(w121_a1_8x16b, 2); // 3-tap filtered samples
+
+ w121_sh_8x16b = _mm_shufflelo_epi16(w121_a1_8x16b, 0xe1); // swap the two lowest 16-bit lanes
+ w121_sh_8x16b = _mm_srli_si128(w121_sh_8x16b, 2); // drop one lane to line up row 3's samples
+
+ row4_16x8b = _mm_packus_epi16(w121_sh_8x16b, w121_sh_8x16b);
+ temp_16x8b = _mm_slli_si128(w121_a1_8x16b, 13); // places filtered l1-term byte at lane 15 for the alignr below
+ row2_16x8b = _mm_srli_si128(row4_16x8b, 1);
+ row3_16x8b = _mm_alignr_epi8(row1_16x8b, temp_16x8b, 15); // row 2 = row 0 prefixed with one 3-tap sample
+
+ dst_strd2 = dst_strd << 1;
+ dst_strd3 = dst_strd + dst_strd2;
+
+ _mm_maskmoveu_si128(row1_16x8b, mask_low_32b, (char*)pu1_dst);
+ _mm_maskmoveu_si128(row2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
+ _mm_maskmoveu_si128(row3_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
+ _mm_maskmoveu_si128(row4_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+}
+
+/*
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_4x4_mode_horz_d_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_4x4 mode:Horizontal_Down
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_4x4 mode:Horizontal_Down ,described in sec 8.3.1.2.7
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_4x4_mode_horz_d_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ngbr_avail)
+{
+ UWORD8 *pu1_left;
+ WORD32 dst_strd2, dst_strd3;
+ WORD32 val_121_t0t1;
+
+ __m128i val_16x8b, val_sh_16x8b;
+ __m128i w11_16x8b;
+ __m128i w121_a1_8x16b, w121_a2_8x16b, w121_16x8b;
+ __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
+
+ __m128i zero_vector, const_2_8x16b;
+ __m128i mask_full_128b, mask_low_32b;
+
+ UNUSED(src_strd);
+ UNUSED(ngbr_avail);
+
+ mask_full_128b = _mm_set1_epi8(0xff);
+ mask_low_32b = _mm_srli_si128(mask_full_128b, 12); // store mask for the low 4 bytes
+
+ pu1_left = pu1_src + BLK_SIZE - 1;
+
+ val_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 3)); // l3 l2 l1 l0 tl t0 t1 t2
+ zero_vector = _mm_setzero_si128();
+ val_sh_16x8b = _mm_srli_si128(val_16x8b, 1);
+ w11_16x8b = _mm_avg_epu8(val_16x8b, val_sh_16x8b); // 2-tap averages (l3+l2+1)>>1, (l2+l1+1)>>1, ...
+
+ w121_a1_8x16b = _mm_unpacklo_epi8(val_16x8b, zero_vector); //l3 l2 l1 l0 tl t0 t1 t2
+ w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2); //l2 l1 l0 tl t0 t1 t2 0
+ w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //l3+l2 l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2
+ w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2); //l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2 0
+
+ zero_vector = _mm_setzero_si128(); // NOTE(review): redundant re-initialisation; zero_vector is already zero
+ const_2_8x16b = _mm_set1_epi16(2); // rounding term for (a+2b+c+2)>>2
+
+ w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //l3+2*l2+l1 l2+2*l1+l0 l1+2*l0+tl ...
+ w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, const_2_8x16b);
+ w121_a1_8x16b = _mm_srai_epi16(w121_a1_8x16b, 2); // 3-tap filtered samples
+
+ w121_16x8b = _mm_packus_epi16(w121_a1_8x16b, w121_a1_8x16b);
+
+ row4_16x8b = _mm_unpacklo_epi8(w11_16x8b, w121_16x8b); // interleave 2-tap and 3-tap results (zig-zag sample order)
+ val_121_t0t1 = _mm_extract_epi16(w121_16x8b, 2); // filtered t0/t1 pair needed at the top-right of the block
+ row4_16x8b = _mm_insert_epi16(row4_16x8b, val_121_t0t1, 4);
+
+ dst_strd2 = dst_strd << 1;
+ dst_strd3 = dst_strd + dst_strd2;
+
+ row1_16x8b = _mm_srli_si128(row4_16x8b, 6); // each higher row skips one interleaved sample pair
+ row2_16x8b = _mm_srli_si128(row4_16x8b, 4);
+ row3_16x8b = _mm_srli_si128(row4_16x8b, 2);
+
+ _mm_maskmoveu_si128(row1_16x8b, mask_low_32b, (char*)pu1_dst);
+ _mm_maskmoveu_si128(row2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
+ _mm_maskmoveu_si128(row3_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
+ _mm_maskmoveu_si128(row4_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+}
+
+/**
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_4x4_mode_vert_l_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_4x4 mode:Vertical_Left
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_4x4 mode:Vertical_Left ,described in sec 8.3.1.2.8
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_4x4_mode_vert_l_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ngbr_avail)
+{
+ UWORD8 *pu1_top;
+ WORD32 dst_strd2, dst_strd3;
+
+ __m128i val_16x8b, val_sh_16x8b;
+ __m128i w121_a1_8x16b, w121_a2_8x16b;
+ __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
+
+ __m128i zero_vector, const_2_8x16b;
+ __m128i mask_full_128b, mask_low_32b;
+
+ UNUSED(src_strd);
+ UNUSED(ngbr_avail);
+
+ mask_full_128b = _mm_set1_epi8(0xff);
+ mask_low_32b = _mm_srli_si128(mask_full_128b, 12); // store mask for the low 4 bytes
+
+ pu1_top = pu1_src +BLK_SIZE + 1; // t0..t7 top neighbours
+
+ val_16x8b = _mm_loadl_epi64((__m128i *)pu1_top);
+ zero_vector = _mm_setzero_si128();
+ val_sh_16x8b = _mm_srli_si128(val_16x8b, 1);
+ row1_16x8b = _mm_avg_epu8(val_16x8b, val_sh_16x8b); // even rows use 2-tap averages (t(i)+t(i+1)+1)>>1
+
+ w121_a1_8x16b = _mm_unpacklo_epi8(val_16x8b, zero_vector); //t0 t1 t2 t3 t4 t5...
+ w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2); //t1 t2 t3 t4 t5 t6...
+ w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //t0+t1 t1+t2 t2+t3 t3+t4 t4+t5...
+ w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2); //t1+t2 t2+t3 t3+t4 t4+t5 t5+t6...
+
+ zero_vector = _mm_setzero_si128(); // NOTE(review): redundant re-initialisation; zero_vector is already zero
+ const_2_8x16b = _mm_set1_epi16(2); // rounding term for (a+2b+c+2)>>2
+
+ w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //t0+2*t1+t2 t1+2*t2+t3 t2+2*t3+t4...
+ w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, const_2_8x16b);
+ w121_a1_8x16b = _mm_srai_epi16(w121_a1_8x16b, 2); // odd rows use the 3-tap filtered samples
+
+ row2_16x8b = _mm_packus_epi16(w121_a1_8x16b, w121_a1_8x16b);
+
+ dst_strd2 = dst_strd << 1;
+ dst_strd3 = dst_strd + dst_strd2;
+
+ row3_16x8b = _mm_srli_si128(row1_16x8b, 1); // rows 2/3 are rows 0/1 shifted one sample left
+ row4_16x8b = _mm_srli_si128(row2_16x8b, 1);
+
+ _mm_maskmoveu_si128(row1_16x8b, mask_low_32b, (char*)pu1_dst);
+ _mm_maskmoveu_si128(row2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
+ _mm_maskmoveu_si128(row3_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
+ _mm_maskmoveu_si128(row4_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+}
+
+/**
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_4x4_mode_horz_u_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_4x4 mode:Horizontal_Up
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_4x4 mode:Horizontal_Up ,described in sec 8.3.1.2.9
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_4x4_mode_horz_u_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ngbr_avail)
+{
+ UWORD8 *pu1_left;
+ WORD32 dst_strd2, dst_strd3;
+
+ __m128i val_16x8b, val_sh_16x8b;
+ __m128i w11_16x8b;
+ __m128i w121_a1_8x16b, w121_a2_8x16b, w121_16x8b;
+ __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
+
+ __m128i zero_vector, const_2_8x16b, rev_16x8b;
+ __m128i mask_full_128b, mask_low_32b;
+
+ UNUSED(src_strd);
+ UNUSED(ngbr_avail);
+
+ mask_full_128b = _mm_set1_epi8(0xff);
+ mask_low_32b = _mm_srli_si128(mask_full_128b, 12); // store mask for the low 4 bytes
+
+ pu1_left = pu1_src + BLK_SIZE - 1;
+
+ zero_vector = _mm_setzero_si128();
+ rev_16x8b = _mm_setr_epi8(3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // shuffle to reverse the 4 left samples
+
+ val_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 3)); //l3 l2 l1 l0 0 0 0...
+ val_16x8b = _mm_shuffle_epi8(val_16x8b, rev_16x8b); //l0 l1 l2 l3 l3 l3 l3... (l3 repeated past the end per spec)
+
+ val_sh_16x8b = _mm_srli_si128(val_16x8b, 1);
+ w11_16x8b = _mm_avg_epu8(val_16x8b, val_sh_16x8b); // 2-tap averages (l(i)+l(i+1)+1)>>1
+
+ w121_a1_8x16b = _mm_unpacklo_epi8(val_16x8b, zero_vector); //l0 l1 l2 l3 l3 l3...
+ w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2); //l1 l2 l3 l3 l3 l3...
+
+ w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //l0+l1 l1+l2 l2+l3 2*l3 2*l3...
+ w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2); //l1+l2 l2+l3 2*l3 2*l3 2*l3...
+
+ zero_vector = _mm_setzero_si128(); // NOTE(review): redundant re-initialisation; zero_vector is already zero
+ const_2_8x16b = _mm_set1_epi16(2); // rounding term for (a+2b+c+2)>>2
+
+ w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //l0+2*l1+l2 l1+2*l2+l3 l2+3*l3 4*l3 4*l3...
+ w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, const_2_8x16b);
+ w121_a1_8x16b = _mm_srai_epi16(w121_a1_8x16b, 2); // 3-tap filtered samples
+
+ w121_16x8b = _mm_packus_epi16(w121_a1_8x16b, w121_a1_8x16b);
+
+ dst_strd2 = dst_strd << 1;
+ dst_strd3 = dst_strd + dst_strd2;
+
+ row1_16x8b = _mm_unpacklo_epi8(w11_16x8b, w121_16x8b); // interleave 2-tap/3-tap pairs: the horizontal-up zig-zag order
+ row2_16x8b = _mm_srli_si128(row1_16x8b, 2); // each lower row advances by one sample pair
+ row3_16x8b = _mm_srli_si128(row1_16x8b, 4);
+ row4_16x8b = _mm_srli_si128(row1_16x8b, 6);
+
+ _mm_maskmoveu_si128(row1_16x8b, mask_low_32b, (char*)pu1_dst);
+ _mm_maskmoveu_si128(row2_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
+ _mm_maskmoveu_si128(row3_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
+ _mm_maskmoveu_si128(row4_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
+}
+
+/******************* 8x8 Modes *******************/
+
+/**
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_8x8_mode_vert_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_8x8 mode:vertical
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_8x8 mode:vertical ,described in sec 8.3.2.2.2
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+void ih264_intra_pred_luma_8x8_mode_vert_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ngbr_avail)
+{
+ UWORD8 *pu1_top = NULL;
+ __m128i top_8x8b;
+ UNUSED(src_strd);
+ UNUSED(ngbr_avail);
+ pu1_top = pu1_src + BLK8x8SIZE + 1; // top neighbours follow the left samples and top-left in pu1_src
+
+ top_8x8b = _mm_loadl_epi64((__m128i *)pu1_top); // t0..t7
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), top_8x8b); // every row is a copy of the top neighbours
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), top_8x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), top_8x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), top_8x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), top_8x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), top_8x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), top_8x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), top_8x8b);
+}
+
+/**
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_8x8_mode_horz_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_8x8 mode:horizontal
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_8x8 mode:horizontal ,described in sec 8.3.2.2.2
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+void ih264_intra_pred_luma_8x8_mode_horz_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ngbr_avail)
+{
+ UWORD8 *pu1_left = pu1_src + BLK8x8SIZE - 1; // left samples stored in reverse: l0 at [0], l7 at [-7]
+ __m128i row1_8x8b, row2_8x8b, row3_8x8b, row4_8x8b;
+ __m128i row5_8x8b, row6_8x8b, row7_8x8b, row8_8x8b;
+
+ UNUSED(src_strd);
+ UNUSED(ngbr_avail);
+
+ row1_8x8b = _mm_set1_epi8(pu1_left[0]); // row i is left neighbour l(i) replicated across the row
+ row2_8x8b = _mm_set1_epi8(pu1_left[-1]);
+ row3_8x8b = _mm_set1_epi8(pu1_left[-2]);
+ row4_8x8b = _mm_set1_epi8(pu1_left[-3]);
+ row5_8x8b = _mm_set1_epi8(pu1_left[-4]);
+ row6_8x8b = _mm_set1_epi8(pu1_left[-5]);
+ row7_8x8b = _mm_set1_epi8(pu1_left[-6]);
+ row8_8x8b = _mm_set1_epi8(pu1_left[-7]);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), row1_8x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), row2_8x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), row3_8x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), row4_8x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), row5_8x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), row6_8x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), row7_8x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), row8_8x8b);
+}
+
+/**
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_8x8_mode_dc_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_8x8 mode:DC
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_8x8 mode:DC ,described in sec 8.3.2.2.4
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_8x8_mode_dc_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ngbr_avail)
+{
+ UWORD8 u1_useleft; /* availability of left predictors (only for DC) */
+ UWORD8 u1_usetop; /* availability of top predictors (only for DC) */
+ UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
+ UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
+ __m128i dc_val_8x8b;
+ WORD32 dc_val = 0;
+ UNUSED(src_strd);
+
+ u1_useleft = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK);
+ u1_usetop = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK);
+ pu1_top = pu1_src + BLK8x8SIZE + 1; // t0..t7 follow the top-left sample
+ pu1_left = pu1_src + BLK8x8SIZE - 1; // left samples stored in reverse, l7 at pu1_left-7
+
+ if(u1_useleft || u1_usetop)
+ {
+ WORD32 shft = 2; // becomes 3 for one side (8 samples) or 4 for both (16 samples)
+ __m128i val_8x8b, zero_8x8b, sum_8x16b;
+
+ zero_8x8b = _mm_setzero_si128();
+
+ if(u1_useleft)
+ {
+ val_8x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 7)); // l7..l0
+ sum_8x16b = _mm_sad_epu8(zero_8x8b, val_8x8b); // horizontal sum of the 8 left samples
+
+ shft++;
+ dc_val += 4; // rounding offset contribution for this side
+ dc_val += _mm_extract_epi16(sum_8x16b, 0);
+ }
+ if(u1_usetop)
+ {
+ val_8x8b = _mm_loadl_epi64((__m128i *)pu1_top); // t0..t7
+ sum_8x16b = _mm_sad_epu8(zero_8x8b, val_8x8b); // horizontal sum of the 8 top samples
+
+ shft++;
+ dc_val += 4; // rounding offset contribution for this side
+ dc_val += _mm_extract_epi16(sum_8x16b, 0);
+ }
+ dc_val = dc_val >> shft; // rounded average of all available neighbours
+ }
+ else
+ dc_val = 128; // no neighbours available: mid-grey default per spec
+
+ dc_val_8x8b = _mm_set1_epi8(dc_val);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), dc_val_8x8b); // fill all 8 rows with the DC value
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), dc_val_8x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), dc_val_8x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), dc_val_8x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), dc_val_8x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), dc_val_8x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), dc_val_8x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), dc_val_8x8b);
+}
+
+/**
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_8x8_mode_diag_dl_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left ,described in sec 8.3.2.2.5
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_8x8_mode_diag_dl_ssse3(UWORD8 *pu1_src,
+                                                  UWORD8 *pu1_dst,
+                                                  WORD32 src_strd,
+                                                  WORD32 dst_strd,
+                                                  WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
+    __m128i top_16x8;
+    __m128i out_15x16;
+    __m128i a0_8x16, a1_8x16, a2_8x16;
+    __m128i temp1, temp2;
+    __m128i res1_8x16, res2_8x16;
+    __m128i zero = _mm_setzero_si128();
+    __m128i const_val2_8x16 = _mm_set1_epi16(2);
+
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    pu1_top = pu1_src + BLK8x8SIZE + 1;
+
+    top_16x8 = _mm_loadu_si128((__m128i *)(pu1_top));
+
+    /* a0/a1/a2 hold p[x], p[x+1], p[x+2] widened to 16 bits (low 8 taps) */
+    temp1 = _mm_srli_si128(top_16x8, 1);
+    temp2 = _mm_srli_si128(top_16x8, 2);
+    a0_8x16 = _mm_unpacklo_epi8(top_16x8, zero);
+    a1_8x16 = _mm_unpacklo_epi8(temp1, zero);
+    a2_8x16 = _mm_unpacklo_epi8(temp2, zero);
+
+    /* 3-tap smoothing: (p[x] + 2*p[x+1] + p[x+2] + 2) >> 2 */
+    a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
+    a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
+    res1_8x16 = _mm_srai_epi16(a0_8x16, 2);
+
+    /* same filter on the high 8 taps */
+    temp2 = _mm_srli_si128(top_16x8, 2);
+    temp1 = _mm_srli_si128(top_16x8, 1);
+    a2_8x16 = _mm_unpackhi_epi8(temp2, zero);
+    a0_8x16 = _mm_unpackhi_epi8(top_16x8, zero);
+    /* rearrange so the last top sample stands in for the out-of-range taps
+       (edge-clamp rule for diagonal-down-left) */
+    a2_8x16 = _mm_shufflehi_epi16(a2_8x16, 0x14);
+    a1_8x16 = _mm_unpackhi_epi8(temp1, zero);
+
+    a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
+    a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
+    res2_8x16 = _mm_srai_epi16(a0_8x16, 2);
+
+    out_15x16 = _mm_packus_epi16(res1_8x16, res2_8x16);
+
+    /* row y of the prediction is filtered samples y..y+7: shift one byte
+       per row and store the low 8 bytes */
+    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), out_15x16);
+    out_15x16 = _mm_srli_si128(out_15x16, 1);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), out_15x16);
+    out_15x16 = _mm_srli_si128(out_15x16, 1);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), out_15x16);
+    out_15x16 = _mm_srli_si128(out_15x16, 1);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), out_15x16);
+    out_15x16 = _mm_srli_si128(out_15x16, 1);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), out_15x16);
+    out_15x16 = _mm_srli_si128(out_15x16, 1);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), out_15x16);
+    out_15x16 = _mm_srli_si128(out_15x16, 1);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), out_15x16);
+    out_15x16 = _mm_srli_si128(out_15x16, 1);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), out_15x16);
+}
+
+/**
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_8x8_mode_diag_dr_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right ,described in sec 8.3.2.2.6
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_8x8_mode_diag_dr_ssse3(UWORD8 *pu1_src,
+                                                  UWORD8 *pu1_dst,
+                                                  WORD32 src_strd,
+                                                  WORD32 dst_strd,
+                                                  WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
+    UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
+    __m128i top_8x8, left_16x8;
+    __m128i out_15x16;
+    __m128i a0_8x16, a1_8x16, a2_8x16;
+    __m128i temp1, temp2;
+    __m128i res1_8x16, res2_8x16;
+    __m128i zero = _mm_setzero_si128();
+    __m128i const_val2_8x16 = _mm_set1_epi16(2);
+    __m128i str_8x8;
+
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    pu1_left = pu1_src + BLK8x8SIZE - 1;
+    pu1_top = pu1_src + BLK8x8SIZE + 1;
+
+    /* load the 8 left samples (bottom-most first) plus the corner and top
+       run-on; filter them with the 3-tap smoother for the left diagonal */
+    left_16x8 = _mm_loadu_si128((__m128i *)(pu1_left - 7));
+
+    temp1 = _mm_srli_si128(left_16x8, 1);
+    temp2 = _mm_srli_si128(left_16x8, 2);
+    a0_8x16 = _mm_unpacklo_epi8(left_16x8, zero);
+    a1_8x16 = _mm_unpacklo_epi8(temp1, zero);
+    a2_8x16 = _mm_unpacklo_epi8(temp2, zero);
+
+    /* 3-tap smoothing: (p[x] + 2*p[x+1] + p[x+2] + 2) >> 2 */
+    a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
+    a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
+    res1_8x16 = _mm_srai_epi16(a0_8x16, 2);
+
+    /* filter the corner + top samples the same way */
+    top_8x8 = _mm_loadu_si128((__m128i *)(pu1_top - 1));
+
+    temp1 = _mm_srli_si128(top_8x8, 1);
+    temp2 = _mm_srli_si128(top_8x8, 2);
+    a0_8x16 = _mm_unpacklo_epi8(top_8x8, zero);
+    a1_8x16 = _mm_unpacklo_epi8(temp1, zero);
+    a2_8x16 = _mm_unpacklo_epi8(temp2, zero);
+
+    a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
+    a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
+    res2_8x16 = _mm_srai_epi16(a0_8x16, 2);
+
+    /* 15 usable filtered samples: left diagonal in the low half, top
+       diagonal in the high half */
+    out_15x16 = _mm_packus_epi16(res1_8x16, res2_8x16);
+
+    /* row y is an 8-byte window starting at offset (7 - y): the window
+       slides one sample toward the left diagonal per row */
+    str_8x8 = _mm_srli_si128(out_15x16, 7);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), str_8x8);
+    str_8x8 = _mm_srli_si128(out_15x16, 6);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), str_8x8);
+    str_8x8 = _mm_srli_si128(out_15x16, 5);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), str_8x8);
+    str_8x8 = _mm_srli_si128(out_15x16, 4);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), str_8x8);
+    str_8x8 = _mm_srli_si128(out_15x16, 3);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), str_8x8);
+    str_8x8 = _mm_srli_si128(out_15x16, 2);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), str_8x8);
+    str_8x8 = _mm_srli_si128(out_15x16, 1);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), str_8x8);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), out_15x16);
+}
+
+/**
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_8x8_mode_vert_r_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_8x8 mode:Vertical_Right
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_8x8 mode:Vertical_Right ,described in sec 8.3.2.2.7
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_8x8_mode_vert_r_ssse3(UWORD8 *pu1_src,
+                                                 UWORD8 *pu1_dst,
+                                                 WORD32 src_strd,
+                                                 WORD32 dst_strd,
+                                                 WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
+    UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
+    __m128i top_8x8, left_16x8;
+    __m128i out1_16x16, out2_16x16;
+    __m128i a0_8x16, a1_8x16, a2_8x16;
+    __m128i temp1, temp2;
+    __m128i res1_8x16, res2_8x16, res3_8x16;
+    __m128i zero = _mm_setzero_si128();
+    __m128i const_val2_8x16 = _mm_set1_epi16(2);
+    __m128i str_8x8;
+    /* keeps only every other 16-bit lane when ANDed (used to pick
+       alternate left-derived samples) */
+    __m128i mask = _mm_set1_epi32(0xFFFF);
+
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    pu1_left = pu1_src + BLK8x8SIZE - 1;
+    pu1_top = pu1_src + BLK8x8SIZE + 1;
+
+    /* 3-tap smooth the left samples (and corner) for the column that leaks
+       in from the left edge on the lower rows */
+    left_16x8 = _mm_loadu_si128((__m128i *)(pu1_left - 6));
+
+    temp1 = _mm_srli_si128(left_16x8, 1);
+    temp2 = _mm_srli_si128(left_16x8, 2);
+    a0_8x16 = _mm_unpacklo_epi8(left_16x8, zero);
+    a1_8x16 = _mm_unpacklo_epi8(temp1, zero);
+    a2_8x16 = _mm_unpacklo_epi8(temp2, zero);
+
+    a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
+    a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
+    res1_8x16 = _mm_srai_epi16(a0_8x16, 2);
+
+    top_8x8 = _mm_loadu_si128((__m128i *)(pu1_top - 1));
+
+    temp1 = _mm_srli_si128(top_8x8, 1);
+    temp2 = _mm_srli_si128(top_8x8, 2);
+    a0_8x16 = _mm_unpacklo_epi8(top_8x8, zero);
+    a1_8x16 = _mm_unpacklo_epi8(temp1, zero);
+    a2_8x16 = _mm_unpacklo_epi8(temp2, zero);
+
+    /* 2-tap rounded average (a + b + 1) >> 1 of adjacent top samples:
+       used by the even rows */
+    res3_8x16 = _mm_avg_epu16(a0_8x16, a1_8x16);
+
+    /* 3-tap filtered top samples: used by the odd rows */
+    a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
+    a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
+    res2_8x16 = _mm_srai_epi16(a0_8x16, 2);
+
+    /* row 0 comes straight from the 2-tap averages */
+    str_8x8 = _mm_packus_epi16(res3_8x16, zero);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), str_8x8);
+
+    /* prepend alternate left-derived samples ahead of the top-derived rows;
+       the lower a row sits, the more of these samples slide into view */
+    temp1 = _mm_and_si128(res1_8x16, mask);
+    temp1 = _mm_packs_epi32(temp1, temp1);
+    out1_16x16 = _mm_packus_epi16(temp1, res2_8x16);
+
+    res1_8x16 = _mm_slli_si128(res1_8x16, 2);
+    temp1 = _mm_and_si128(res1_8x16, mask);
+    temp1 = _mm_packs_epi32(temp1, temp1);
+    out2_16x16 = _mm_packus_epi16(temp1, res3_8x16);
+
+    /* odd rows window out1 (3-tap row), even rows window out2 (2-tap row);
+       the window start moves one byte left every two rows */
+    str_8x8 = _mm_srli_si128(out1_16x16, 7);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), str_8x8);
+
+    str_8x8 = _mm_srli_si128(out2_16x16, 7);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), str_8x8);
+
+    str_8x8 = _mm_srli_si128(out1_16x16, 6);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), str_8x8);
+
+    str_8x8 = _mm_srli_si128(out2_16x16, 6);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), str_8x8);
+
+    str_8x8 = _mm_srli_si128(out1_16x16, 5);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), str_8x8);
+
+    str_8x8 = _mm_srli_si128(out2_16x16, 5);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), str_8x8);
+
+    str_8x8 = _mm_srli_si128(out1_16x16, 4);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), str_8x8);
+}
+
+/**
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_8x8_mode_horz_d_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_8x8 mode:Horizontal_Down
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_8x8 mode:Horizontal_Down ,described in sec 8.3.2.2.8
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_8x8_mode_horz_d_ssse3(UWORD8 *pu1_src,
+                                                 UWORD8 *pu1_dst,
+                                                 WORD32 src_strd,
+                                                 WORD32 dst_strd,
+                                                 WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
+    __m128i pels_16x16;
+    __m128i temp1, temp2, temp3, temp4;
+    __m128i a0_8x16, a1_8x16, a2_8x16;
+    __m128i zero = _mm_setzero_si128();
+    __m128i const_val2_8x16 = _mm_set1_epi16(2);
+    __m128i res1_8x16, res2_8x16;
+    __m128i out1_16x16, out2_16x16;
+    __m128i str_8x8;
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    pu1_left = pu1_src + BLK8x8SIZE - 1;
+
+    /* one load covers the 8 left samples, the corner and the top run-on */
+    pels_16x16 = _mm_loadu_si128((__m128i *)(pu1_left - 7));
+
+    temp1 = _mm_srli_si128(pels_16x16, 1);
+    temp2 = _mm_srli_si128(pels_16x16, 2);
+    a0_8x16 = _mm_unpacklo_epi8(pels_16x16, zero);
+    a1_8x16 = _mm_unpacklo_epi8(temp1, zero);
+    a2_8x16 = _mm_unpacklo_epi8(temp2, zero);
+
+    /* 2-tap rounded average (a + b + 1) >> 1 of adjacent left samples */
+    res1_8x16 = _mm_avg_epu16(a0_8x16, a1_8x16);
+
+    /* 3-tap smoothing: (p[x] + 2*p[x+1] + p[x+2] + 2) >> 2 */
+    a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
+    a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
+    res2_8x16 = _mm_srai_epi16(a0_8x16, 2);
+
+    /* interleave (2-tap, 3-tap) pairs: the zig-zag sample order used by
+       horizontal-down */
+    temp3 = _mm_unpacklo_epi16(res1_8x16, res2_8x16);
+    temp4 = _mm_unpackhi_epi16(res1_8x16, res2_8x16);
+    out2_16x16 = _mm_packus_epi16(temp3, temp4);
+
+    /* 3-tap filter the corner/top samples for the upper-right part */
+    a0_8x16 = _mm_unpackhi_epi8(pels_16x16, zero);
+    a1_8x16 = _mm_unpackhi_epi8(temp1, zero);
+    a2_8x16 = _mm_unpackhi_epi8(temp2, zero);
+
+    a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
+    a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
+    res2_8x16 = _mm_srai_epi16(a0_8x16, 2);
+
+    /* splice the top-derived samples after the upper half of the zig-zag */
+    out1_16x16 = _mm_packus_epi16(res2_8x16, zero);
+    temp1 = _mm_srli_si128(out2_16x16, 8);
+    out1_16x16 = _mm_unpacklo_epi64(temp1, out1_16x16);
+
+    /* each row steps two bytes (one zig-zag pair) back along the buffer */
+    str_8x8 = _mm_srli_si128(out1_16x16, 6);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), str_8x8);
+    str_8x8 = _mm_srli_si128(out1_16x16, 4);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), str_8x8);
+    str_8x8 = _mm_srli_si128(out1_16x16, 2);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), str_8x8);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), out1_16x16);
+
+    str_8x8 = _mm_srli_si128(out2_16x16, 6);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), str_8x8);
+    str_8x8 = _mm_srli_si128(out2_16x16, 4);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), str_8x8);
+    str_8x8 = _mm_srli_si128(out2_16x16, 2);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), str_8x8);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), out2_16x16);
+}
+
+/**
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_8x8_mode_vert_l_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_8x8 mode:Vertical_Left
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_8x8 mode:Vertical_Left ,described in sec 8.3.2.2.9
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+
+void ih264_intra_pred_luma_8x8_mode_vert_l_ssse3(UWORD8 *pu1_src,
+                                                 UWORD8 *pu1_dst,
+                                                 WORD32 src_strd,
+                                                 WORD32 dst_strd,
+                                                 WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
+    __m128i top_16x16;
+    __m128i temp1, temp2;
+    __m128i a0_8x16, a1_8x16, a2_8x16;
+    __m128i zero = _mm_setzero_si128();
+    __m128i const_val2_8x16 = _mm_set1_epi16(2);
+    __m128i res1_8x16, res2_8x16, res3_8x16, res4_8x16;
+    __m128i out1_16x16, out2_16x16;
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+    pu1_top = pu1_src + BLK8x8SIZE + 1;
+
+    /* a0/a1/a2 hold p[x], p[x+1], p[x+2] of the top row, widened */
+    top_16x16 = _mm_loadu_si128((__m128i *)(pu1_top));
+    temp1 = _mm_srli_si128(top_16x16, 1);
+    temp2 = _mm_srli_si128(top_16x16, 2);
+    a0_8x16 = _mm_unpacklo_epi8(top_16x16, zero);
+    a1_8x16 = _mm_unpacklo_epi8(temp1, zero);
+    a2_8x16 = _mm_unpacklo_epi8(temp2, zero);
+
+    /* even rows: 2-tap rounded average (a + b + 1) >> 1 */
+    res1_8x16 = _mm_avg_epu16(a0_8x16, a1_8x16);
+
+    /* odd rows: 3-tap smoothing (p[x] + 2*p[x+1] + p[x+2] + 2) >> 2 */
+    a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
+    a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
+    res2_8x16 = _mm_srai_epi16(a0_8x16, 2);
+
+    /* repeat both filters on the high 8 top samples */
+    a0_8x16 = _mm_unpackhi_epi8(top_16x16, zero);
+    a1_8x16 = _mm_unpackhi_epi8(temp1, zero);
+    a2_8x16 = _mm_unpackhi_epi8(temp2, zero);
+
+    res3_8x16 = _mm_avg_epu16(a0_8x16, a1_8x16);
+
+    a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
+    a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
+    res4_8x16 = _mm_srai_epi16(a0_8x16, 2);
+
+    out1_16x16 = _mm_packus_epi16(res1_8x16, res3_8x16); /* even rows */
+    out2_16x16 = _mm_packus_epi16(res2_8x16, res4_8x16); /* odd rows */
+
+    /* even/odd rows alternate the two buffers; both windows advance one
+       byte every two rows */
+    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), out1_16x16);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), out2_16x16);
+    out1_16x16 = _mm_srli_si128(out1_16x16, 1);
+    out2_16x16 = _mm_srli_si128(out2_16x16, 1);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), out1_16x16);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), out2_16x16);
+    out1_16x16 = _mm_srli_si128(out1_16x16, 1);
+    out2_16x16 = _mm_srli_si128(out2_16x16, 1);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), out1_16x16);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), out2_16x16);
+    out1_16x16 = _mm_srli_si128(out1_16x16, 1);
+    out2_16x16 = _mm_srli_si128(out2_16x16, 1);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), out1_16x16);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), out2_16x16);
+}
+
+/**
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_8x8_mode_horz_u_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_8x8 mode:Horizontal_Up
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_8x8 mode:Horizontal_Up ,described in sec 8.3.2.2.10
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_8x8_mode_horz_u_ssse3(UWORD8 *pu1_src,
+                                                 UWORD8 *pu1_dst,
+                                                 WORD32 src_strd,
+                                                 WORD32 dst_strd,
+                                                 WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
+    __m128i left_16x16;
+    __m128i temp1, temp2;
+    __m128i a0_8x16, a1_8x16, a2_8x16;
+    __m128i zero = _mm_setzero_si128();
+    __m128i const_val2_8x16 = _mm_set1_epi16(2);
+    __m128i res1_8x16, res2_8x16;
+    __m128i out1_16x16;
+    __m128i str_8x8;
+    __m128i shuffle_16x16;
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    pu1_left = pu1_src + BLK8x8SIZE - 1;
+    /* control vector that reverses the 16 bytes of a register */
+    shuffle_16x16 = _mm_set_epi8(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E,
+                                 0x0F);
+
+    /* left samples loaded bottom-most first (byte 0 = pu1_left[-7]) */
+    left_16x16 = _mm_loadu_si128((__m128i *)(pu1_left - 7));
+    temp1 = _mm_srli_si128(left_16x16, 1);
+    a0_8x16 = _mm_unpacklo_epi8(left_16x16, zero);
+    a0_8x16 = _mm_slli_si128(a0_8x16, 2);
+    a1_8x16 = _mm_unpacklo_epi8(left_16x16, zero);
+    /* duplicate the bottom-most sample into lane 0 so the taps are clamped
+       at the bottom edge */
+    a0_8x16 = _mm_shufflelo_epi16(a0_8x16, 0xE5);
+    a2_8x16 = _mm_unpacklo_epi8(temp1, zero);
+
+    /* 2-tap rounded average of vertically adjacent left samples */
+    res1_8x16 = _mm_avg_epu16(a0_8x16, a1_8x16);
+
+    /* 3-tap smoothing of the same samples */
+    a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
+    a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
+    a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
+    res2_8x16 = _mm_srai_epi16(a0_8x16, 2);
+
+    /* interleave (2-tap, 3-tap) pairs and reverse into top-to-bottom
+       zig-zag order */
+    temp1 = _mm_unpacklo_epi16(res1_8x16, res2_8x16);
+    temp2 = _mm_unpackhi_epi16(res1_8x16, res2_8x16);
+    out1_16x16 = _mm_packus_epi16(temp1, temp2);
+    out1_16x16 = _mm_shuffle_epi8(out1_16x16, shuffle_16x16);
+
+    /* each row steps two bytes (one zig-zag pair) down the buffer */
+    str_8x8 = _mm_srli_si128(out1_16x16, 1);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), str_8x8);
+    str_8x8 = _mm_srli_si128(out1_16x16, 3);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), str_8x8);
+    str_8x8 = _mm_srli_si128(out1_16x16, 5);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), str_8x8);
+    str_8x8 = _mm_srli_si128(out1_16x16, 7);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), str_8x8);
+    /* rows 4-7 run past the data: pad with the bottom-most left sample */
+    temp1 = _mm_set1_epi8(pu1_left[-7]);
+    str_8x8 = _mm_unpacklo_epi64(str_8x8, temp1);
+    str_8x8 = _mm_srli_si128(str_8x8, 2);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), str_8x8);
+    str_8x8 = _mm_srli_si128(str_8x8, 2);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), str_8x8);
+    str_8x8 = _mm_srli_si128(str_8x8, 2);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), str_8x8);
+    str_8x8 = _mm_srli_si128(str_8x8, 2);
+    _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), str_8x8);
+
+}
+
+
+/******************* 16x16 Modes *******************/
+
+/**
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_16x16_mode_vert_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_16x16 mode:Vertical
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_16x16 mode:Vertical, described in sec 8.3.3.1
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels (Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_16x16_mode_vert_ssse3(UWORD8 *pu1_src,
+                                                 UWORD8 *pu1_dst,
+                                                 WORD32 src_strd,
+                                                 WORD32 dst_strd,
+                                                 WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_top;
+    WORD32 dst_strd2, dst_strd3, dst_strd4;
+
+    __m128i top_16x8b;
+
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    /* top predictors start one past the top-left corner sample */
+    pu1_top = pu1_src + MB_SIZE + 1;
+
+    dst_strd2 = dst_strd << 1;
+    dst_strd4 = dst_strd << 2;
+
+    top_16x8b = _mm_loadu_si128((__m128i *)pu1_top);
+
+    dst_strd3 = dst_strd + dst_strd2;
+
+    /* replicate the 16 top samples down all 16 rows, 4 rows per batch */
+    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), top_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), top_16x8b);
+    pu1_dst += dst_strd4;
+
+    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), top_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), top_16x8b);
+    pu1_dst += dst_strd4;
+
+    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), top_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), top_16x8b);
+    pu1_dst += dst_strd4;
+
+    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), top_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), top_16x8b);
+}
+
+/**
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_16x16_mode_horz_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_16x16 mode:Horizontal
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_16x16 mode:Horizontal, described in sec 8.3.3.2
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_16x16_mode_horz_ssse3(UWORD8 *pu1_src,
+                                                 UWORD8 *pu1_dst,
+                                                 WORD32 src_strd,
+                                                 WORD32 dst_strd,
+                                                 WORD32 ngbr_avail)
+{
+    UWORD8 *pu1_left;
+    WORD32 dst_strd2, dst_strd3, dst_strd4;
+    WORD32 val1, val2;
+
+    __m128i val_16x8b;
+    __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
+
+    UNUSED(src_strd);
+    UNUSED(ngbr_avail);
+
+    pu1_left = pu1_src + MB_SIZE - 1;
+
+    dst_strd4 = dst_strd << 2;
+
+    /* left samples loaded bottom-most first: byte 15 is the row-0 sample,
+       byte 0 the row-15 sample */
+    val_16x8b = _mm_loadu_si128((__m128i *)(pu1_left - 15));
+
+    dst_strd2 = dst_strd << 1;
+    dst_strd3 = dst_strd4 - dst_strd;
+
+    /* each extracted 16-bit word packs two row samples: the high byte is
+       the upper row, the low byte the row below it */
+    val1 = _mm_extract_epi16(val_16x8b, 7);
+    val2 = _mm_extract_epi16(val_16x8b, 6);
+
+    row1_16x8b = _mm_set1_epi8(val1 >> 8);
+    row2_16x8b = _mm_set1_epi8(val1 & 0xff);
+    row3_16x8b = _mm_set1_epi8(val2 >> 8);
+    row4_16x8b = _mm_set1_epi8(val2 & 0xff);
+
+    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b);
+
+    /* rows 4-7 */
+    val1 = _mm_extract_epi16(val_16x8b, 5);
+    val2 = _mm_extract_epi16(val_16x8b, 4);
+
+    pu1_dst += dst_strd4;
+    row1_16x8b = _mm_set1_epi8(val1 >> 8);
+    row2_16x8b = _mm_set1_epi8(val1 & 0xff);
+    row3_16x8b = _mm_set1_epi8(val2 >> 8);
+    row4_16x8b = _mm_set1_epi8(val2 & 0xff);
+
+    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b);
+
+    /* rows 8-11 */
+    val1 = _mm_extract_epi16(val_16x8b, 3);
+    val2 = _mm_extract_epi16(val_16x8b, 2);
+
+    pu1_dst += dst_strd4;
+    row1_16x8b = _mm_set1_epi8(val1 >> 8);
+    row2_16x8b = _mm_set1_epi8(val1 & 0xff);
+    row3_16x8b = _mm_set1_epi8(val2 >> 8);
+    row4_16x8b = _mm_set1_epi8(val2 & 0xff);
+
+    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b);
+
+    /* rows 12-15 */
+    val1 = _mm_extract_epi16(val_16x8b, 1);
+    val2 = _mm_extract_epi16(val_16x8b, 0);
+
+    pu1_dst += dst_strd4;
+    row1_16x8b = _mm_set1_epi8(val1 >> 8);
+    row2_16x8b = _mm_set1_epi8(val1 & 0xff);
+    row3_16x8b = _mm_set1_epi8(val2 >> 8);
+    row4_16x8b = _mm_set1_epi8(val2 & 0xff);
+
+    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b);
+}
+
+/**
+ *******************************************************************************
+ *
+ * ih264_intra_pred_luma_16x16_mode_dc_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_16x16 mode:DC
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_16x16 mode:DC, described in sec 8.3.3.3
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_16x16_mode_dc_ssse3(UWORD8 *pu1_src,
+                                               UWORD8 *pu1_dst,
+                                               WORD32 src_strd,
+                                               WORD32 dst_strd,
+                                               WORD32 ngbr_avail)
+{
+    WORD8 u1_useleft, u1_usetop; /* availability flags for DC */
+    WORD32 dc_val;
+
+    WORD32 dst_strd2, dst_strd3, dst_strd4;
+
+    __m128i dc_val_16x8b;
+
+    UNUSED(src_strd);
+
+    u1_useleft = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK);
+    u1_usetop = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK);
+
+    if(u1_useleft || u1_usetop)
+    {
+        WORD32 shft;
+        __m128i val_16x8b, zero_16x8b, sum_8x16b;
+
+        dc_val = 0;
+        shft = 3;
+
+        zero_16x8b = _mm_setzero_si128();
+
+        if(u1_useleft)
+        {
+            UWORD8 *pu1_left;
+
+            pu1_left = pu1_src + MB_SIZE - 1;
+
+            val_16x8b = _mm_loadu_si128((__m128i *)(pu1_left - 15));
+            /* SAD against zero: words 0 and 4 hold the two 8-byte sums */
+            sum_8x16b = _mm_sad_epu8(zero_16x8b, val_16x8b);
+
+            shft++;
+            /* each available side adds 8, so the total equals the rounding
+               offset 1 << (shft - 1) once shft is final (4 or 5) */
+            dc_val += 8;
+            dc_val += _mm_extract_epi16(sum_8x16b, 0);
+            dc_val += _mm_extract_epi16(sum_8x16b, 4);
+        }
+        if(u1_usetop)
+        {
+            UWORD8 *pu1_top;
+
+            pu1_top = pu1_src + MB_SIZE + 1;
+
+            val_16x8b = _mm_loadu_si128((__m128i *)pu1_top);
+            sum_8x16b = _mm_sad_epu8(zero_16x8b, val_16x8b);
+
+            shft++;
+            dc_val += 8;
+            dc_val += _mm_extract_epi16(sum_8x16b, 0);
+            dc_val += _mm_extract_epi16(sum_8x16b, 4);
+        }
+        dc_val = dc_val >> shft;
+    }
+    else
+        dc_val = 128; /* neither neighbour available: mid-grey default */
+
+    dc_val_16x8b = _mm_set1_epi8(dc_val);
+
+    dst_strd2 = dst_strd << 1;
+    dst_strd4 = dst_strd << 2;
+    dst_strd3 = dst_strd + dst_strd2;
+
+    /* fill all 16 rows with the DC value, 4 rows per batch */
+    _mm_storeu_si128((__m128i *)pu1_dst, dc_val_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), dc_val_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), dc_val_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), dc_val_16x8b);
+    pu1_dst += dst_strd4;
+
+    _mm_storeu_si128((__m128i *)pu1_dst, dc_val_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), dc_val_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), dc_val_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), dc_val_16x8b);
+    pu1_dst += dst_strd4;
+
+    _mm_storeu_si128((__m128i *)pu1_dst, dc_val_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), dc_val_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), dc_val_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), dc_val_16x8b);
+    pu1_dst += dst_strd4;
+
+    _mm_storeu_si128((__m128i *)pu1_dst, dc_val_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), dc_val_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), dc_val_16x8b);
+    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), dc_val_16x8b);
+}
+
+/**
+ *******************************************************************************
+ *
+ *ih264_intra_pred_luma_16x16_mode_plane_ssse3
+ *
+ * @brief
+ * Perform Intra prediction for luma_16x16 mode:PLANE
+ *
+ * @par Description:
+ * Perform Intra prediction for luma_16x16 mode:PLANE, described in sec 8.3.3.4
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] ngbr_avail
+ * availability of neighbouring pixels(Not used in this function)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************/
+void ih264_intra_pred_luma_16x16_mode_plane_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 ngbr_avail)
+{
+ UWORD8 *pu1_left, *pu1_top;
+ WORD32 a, b, c;
+
+ /* Plane prediction: pred[y][x] = clip((a + b*(x-7) + c*(y-7) + 16) >> 5).
+ Each 16-pixel row is computed as two 8-wide 16-bit halves, two rows per
+ step, with a per-row-pair increment of c. */
+ __m128i rev_8x16b, mul_8x16b, zero_16x8b;
+
+ UNUSED(src_strd);
+ UNUSED(ngbr_avail);
+
+ /* pu1_src holds the packed neighbours: left column (reversed) then top row */
+ pu1_top = pu1_src + MB_SIZE + 1;
+ pu1_left = pu1_src + MB_SIZE - 1;
+
+ rev_8x16b = _mm_setr_epi16(0x0f0e, 0x0d0c, 0x0b0a, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
+ //used to reverse the order of 16-bit values in a vector
+
+ mul_8x16b = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ zero_16x8b = _mm_setzero_si128();
+
+ //calculating a, b and c
+ {
+ WORD32 h, v;
+
+ __m128i h_val1_16x8b, h_val2_16x8b;
+ __m128i h_val1_8x16b, h_val2_8x16b, h_val_4x32b;
+ __m128i v_val1_16x8b, v_val2_16x8b;
+ __m128i v_val1_8x16b, v_val2_8x16b, v_val_4x32b;
+ __m128i hv_val_4x32b;
+
+ /* a = 16 * (top-right neighbour + bottom-left neighbour) */
+ a = (pu1_top[15] + pu1_left[-15]) << 4;
+
+ h_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top + 8));
+ h_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top - 1));
+ v_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 15));
+ v_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 6));
+
+ h_val1_8x16b = _mm_unpacklo_epi8(h_val1_16x8b, zero_16x8b);
+ h_val2_8x16b = _mm_unpacklo_epi8(h_val2_16x8b, zero_16x8b);
+ v_val1_8x16b = _mm_unpacklo_epi8(v_val1_16x8b, zero_16x8b);
+ v_val2_8x16b = _mm_unpacklo_epi8(v_val2_16x8b, zero_16x8b);
+
+ /* reverse one operand so the subtraction pairs top[7+i] with top[7-i]
+ (and likewise for the left column) */
+ h_val2_8x16b = _mm_shuffle_epi8(h_val2_8x16b, rev_8x16b);
+ v_val1_8x16b = _mm_shuffle_epi8(v_val1_8x16b, rev_8x16b);
+
+ h_val1_8x16b = _mm_sub_epi16(h_val1_8x16b, h_val2_8x16b);
+ v_val1_8x16b = _mm_sub_epi16(v_val1_8x16b, v_val2_8x16b);
+
+ /* H = sum over i=1..8 of i*(top[7+i] - top[7-i]); V analogously,
+ via weighted multiply-add then two horizontal adds */
+ h_val_4x32b = _mm_madd_epi16(mul_8x16b, h_val1_8x16b);
+ v_val_4x32b = _mm_madd_epi16(mul_8x16b, v_val1_8x16b);
+
+ hv_val_4x32b = _mm_hadd_epi32(h_val_4x32b, v_val_4x32b);
+ hv_val_4x32b = _mm_hadd_epi32(hv_val_4x32b, hv_val_4x32b);
+
+ h = _mm_extract_epi16(hv_val_4x32b, 0);
+ v = _mm_extract_epi16(hv_val_4x32b, 2);
+ /* _mm_extract_epi16 zero-extends; shift pair sign-extends the low
+ 16 bits back to a signed 32-bit value */
+ h = (h << 16) >> 16;
+ v = (v << 16) >> 16;
+
+ /* b = (5*H + 32) >> 6, c = (5*V + 32) >> 6 */
+ b = ((h << 2) + h + 32) >> 6;
+ c = ((v << 2) + v + 32) >> 6;
+ }
+
+ //using a, b and c to compute the fitted plane values
+ {
+ __m128i const_8x16b, b_8x16b, c_8x16b, c2_8x16b;
+ __m128i res1_l_8x16b, res1_h_8x16b;
+ __m128i res2_l_8x16b, res2_h_8x16b;
+ __m128i res1_sh_l_8x16b, res1_sh_h_8x16b, res1_16x8b;
+ __m128i res2_sh_l_8x16b, res2_sh_h_8x16b, res2_16x8b;
+
+ b_8x16b = _mm_set1_epi16(b);
+ c_8x16b = _mm_set1_epi16(c);
+ c2_8x16b = _mm_set1_epi16(c << 1);
+ /* row-0 constant term: a + c*(0-7) + 16; each later row adds c */
+ const_8x16b = _mm_set1_epi16(a - c*7 + 16);
+
+ res1_h_8x16b = _mm_mullo_epi16(mul_8x16b, b_8x16b);
+ //contains {b*1, b*2, b*3,... b*8}
+
+ res1_l_8x16b = _mm_shuffle_epi8(res1_h_8x16b, rev_8x16b);
+ res1_l_8x16b = _mm_srli_si128(res1_l_8x16b, 2);
+ res1_l_8x16b = _mm_sub_epi16(zero_16x8b, res1_l_8x16b);
+ //contains {-b*7, -b*6,... -b*1, b*0}
+
+ /* res1_* is the even row of a pair, res2_* = res1_* + c is the odd row;
+ after each stored pair both accumulators advance by 2*c */
+ // rows 1, 2
+ res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, const_8x16b);
+ res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, const_8x16b);
+ res2_h_8x16b = _mm_add_epi16(res1_h_8x16b, c_8x16b);
+ res2_l_8x16b = _mm_add_epi16(res1_l_8x16b, c_8x16b);
+
+ res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
+ res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
+ res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
+ res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
+
+ /* packus saturates to [0,255], providing the final clip */
+ res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
+ res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
+ _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
+
+ // rows 3, 4
+ res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
+ res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
+ res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
+ res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
+
+ res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
+ res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
+ res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
+ res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
+
+ pu1_dst += dst_strd << 1;
+
+ res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
+ res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
+ _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
+
+ // rows 5, 6
+ res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
+ res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
+ res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
+ res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
+
+ res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
+ res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
+ res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
+ res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
+
+ pu1_dst += dst_strd << 1;
+
+ res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
+ res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
+ _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
+
+ // rows 7, 8
+ res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
+ res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
+ res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
+ res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
+
+ res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
+ res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
+ res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
+ res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
+
+ pu1_dst += dst_strd << 1;
+
+ res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
+ res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
+ _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
+
+ // rows 9, 10
+ res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
+ res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
+ res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
+ res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
+
+ res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
+ res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
+ res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
+ res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
+
+ pu1_dst += dst_strd << 1;
+
+ res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
+ res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
+ _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
+
+ // rows 11, 12
+ res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
+ res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
+ res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
+ res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
+
+ res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
+ res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
+ res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
+ res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
+
+ pu1_dst += dst_strd << 1;
+
+ res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
+ res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
+ _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
+
+ // rows 13, 14
+ res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
+ res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
+ res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
+ res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
+
+ res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
+ res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
+ res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
+ res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
+
+ pu1_dst += dst_strd << 1;
+
+ res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
+ res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
+ _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
+
+ // rows 15, 16
+ res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
+ res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
+ res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
+ res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
+
+ res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
+ res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
+ res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
+ res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
+
+ pu1_dst += dst_strd << 1;
+
+ res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
+ res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
+ _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
+ }
+}
diff --git a/common/x86/ih264_mem_fns_ssse3.c b/common/x86/ih264_mem_fns_ssse3.c
new file mode 100755
index 0000000..8ca1f3e
--- /dev/null
+++ b/common/x86/ih264_mem_fns_ssse3.c
@@ -0,0 +1,169 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_mem_fns_atom_intr.c
+ *
+ * @brief
+ * Functions used for memory operations
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ih264_typedefs.h"
+#include "ih264_mem_fns.h"
+
+#include <immintrin.h>
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * memcpy of a 8,16 or 32 bytes
+ *
+ * @par Description:
+ * Does memcpy of 8bit data from source to destination for 8,16 or 32 number of bytes
+ *
+ * @param[in] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[in] num_bytes
+ * number of bytes to copy
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+
+
+/* Copies num_bytes bytes in 8-byte chunks via 64-bit SIMD load/store pairs.
+ Any remainder of num_bytes % 8 bytes is NOT copied - callers must pass a
+ multiple of 8 (as the function name implies). Regions must not overlap. */
+void ih264_memcpy_mul_8_ssse3(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes)
+{
+ int col;
+ for(col = num_bytes; col >= 8; col -= 8)
+ {
+ __m128i src_temp16x8b;
+ src_temp16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* low 8 bytes */
+ pu1_src += 8;
+ _mm_storel_epi64((__m128i *)(pu1_dst), src_temp16x8b);
+ pu1_dst += 8;
+ }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * memset of a 8,16 or 32 bytes
+ *
+ * @par Description:
+ * Does memset of 8bit data for 8,16 or 32 number of bytes
+ *
+ * @param[in] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] value
+ * UWORD8 value used for memset
+ *
+ * @param[in] num_bytes
+ * number of bytes to set
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+/* Sets num_bytes bytes to 'value', 8 bytes per store; a remainder of
+ num_bytes % 8 bytes is left unwritten - num_bytes must be a multiple of 8. */
+void ih264_memset_mul_8_ssse3(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes)
+{
+ int col;
+ __m128i src_temp16x8b;
+ src_temp16x8b = _mm_set1_epi8(value); /* broadcast 'value' to all lanes */
+ for(col = num_bytes; col >= 8; col -= 8)
+ {
+ _mm_storel_epi64((__m128i *)(pu1_dst), src_temp16x8b);
+ pu1_dst += 8;
+ }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * memset of 16bit data of a 8,16 or 32 bytes
+ *
+ * @par Description:
+ * Does memset of 16bit data for 8,16 or 32 number of bytes
+ *
+ * @param[in] pu2_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] value
+ * UWORD16 value used for memset
+ *
+ * @param[in] num_words
+ * number of words to set
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+/* Sets num_words 16-bit words to 'value', 8 words (one unaligned 128-bit
+ store) per iteration; num_words must be a multiple of 8 - any remainder
+ of num_words % 8 words is left unwritten. */
+void ih264_memset_16bit_mul_8_ssse3(UWORD16 *pu2_dst, UWORD16 value, UWORD32 num_words)
+{
+ int col;
+ __m128i src_temp16x8b;
+ src_temp16x8b = _mm_set1_epi16(value); /* broadcast 'value' to all 8 lanes */
+ for(col = num_words; col >= 8; col -= 8)
+ {
+ _mm_storeu_si128((__m128i *)(pu2_dst), src_temp16x8b);
+ pu2_dst += 8;
+ }
+}
+
diff --git a/common/x86/ih264_padding_ssse3.c b/common/x86/ih264_padding_ssse3.c
new file mode 100755
index 0000000..6dadd39
--- /dev/null
+++ b/common/x86/ih264_padding_ssse3.c
@@ -0,0 +1,335 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_padding_atom_intr.c
+*
+* @brief
+* Contains function definitions for Padding
+*
+* @author
+* Srinivas T
+*
+* @par List of Functions:
+* - ih264_pad_left_luma_ssse3()
+* - ih264_pad_left_chroma_ssse3()
+* - ih264_pad_right_luma_ssse3()
+* - ih264_pad_right_chroma_ssse3()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#include <string.h>
+#include <assert.h>
+#include "ih264_typedefs.h"
+#include "ih264_platform_macros.h"
+#include "ih264_mem_fns.h"
+#include "ih264_debug.h"
+
+#include <immintrin.h>
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Padding (luma block) at the left of a 2d array
+*
+* @par Description:
+* The left column of a 2d array is replicated for pad_size times at the left
+*
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] pad_size
+* integer padding size of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ih264_pad_left_luma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size)
+{
+ WORD32 row;
+ WORD32 i;
+ UWORD8 *pu1_dst;
+ __m128i const0_16x8b;
+
+ /* all-zero pshufb mask: selects byte 0 into every lane, i.e. broadcast */
+ const0_16x8b = _mm_setzero_si128();
+
+ ASSERT(pad_size % 8 == 0);
+
+ for(row = 0; row < ht; row++)
+ {
+ __m128i src_temp0_16x8b;
+
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ pu1_dst = pu1_src - pad_size;
+ /* replicate the left-edge pixel pu1_src[0] across all 16 bytes */
+ src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b);
+ for(i = 0; i < pad_size; i += 8)
+ {
+ _mm_storel_epi64((__m128i *)(pu1_dst + i), src_temp0_16x8b);
+ }
+ pu1_src += src_strd;
+ }
+
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Padding (chroma block) at the left of a 2d array
+*
+* @par Description:
+* The left column of a 2d array is replicated for pad_size times at the left
+*
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] pad_size
+* integer padding size of the array, in bytes (both chroma components
+* interleaved)
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ih264_pad_left_chroma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size)
+{
+ WORD32 row;
+ WORD32 col;
+ UWORD8 *pu1_dst;
+ __m128i const0_16x8b, const1_16x8b;
+ /* build pshufb mask {0,1,0,1,...}: replicates the first interleaved
+ Cb/Cr byte pair across the whole vector */
+ const0_16x8b = _mm_setzero_si128();
+ const1_16x8b = _mm_set1_epi8(1);
+ const0_16x8b = _mm_unpacklo_epi8(const0_16x8b, const1_16x8b);
+
+ ASSERT(pad_size % 8 == 0);
+ for(row = 0; row < ht; row++)
+ {
+ __m128i src_temp0_16x8b;
+
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ pu1_dst = pu1_src - pad_size;
+ /* broadcast the left-edge chroma pair pu1_src[0..1] */
+ src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b);
+
+ for(col = 0; col < pad_size; col += 8)
+ {
+ _mm_storel_epi64((__m128i *)(pu1_dst + col), src_temp0_16x8b);
+ }
+ pu1_src += src_strd;
+ }
+
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Padding (luma block) at the right of a 2d array
+*
+* @par Description:
+* The right column of a 2d array is replicated for pad_size times at the right
+*
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] pad_size
+* integer padding size of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ih264_pad_right_luma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size)
+{
+ WORD32 row;
+ WORD32 col;
+ UWORD8 *pu1_dst;
+ __m128i const0_16x8b;
+
+ ASSERT(pad_size % 8 == 0);
+
+ for(row = 0; row < ht; row++)
+ {
+ __m128i src_temp0_16x8b;
+
+ /* load starting at the last valid pixel of the row (pu1_src[-1]) */
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 1));
+ /* NOTE(review): loop-invariant; could be hoisted above the row loop
+ as the left-pad variants do */
+ const0_16x8b = _mm_setzero_si128();
+ pu1_dst = pu1_src;
+ /* zero mask broadcasts byte 0 = the right-edge pixel */
+ src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b);
+ for(col = 0; col < pad_size; col += 8)
+ {
+ _mm_storel_epi64((__m128i *)(pu1_dst + col), src_temp0_16x8b);
+ }
+ pu1_src += src_strd;
+ }
+
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Padding (chroma block) at the right of a 2d array
+*
+* @par Description:
+* The right column of a 2d array is replicated for pad_size times at the right
+*
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] pad_size
+* integer padding size of the array, in bytes (both chroma components
+* interleaved)
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ih264_pad_right_chroma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size)
+{
+ WORD32 row;
+ WORD32 col;
+ UWORD8 *pu1_dst;
+ __m128i const0_16x8b, const1_16x8b;
+ /* pshufb mask {0,1,0,1,...}: replicates a 2-byte Cb/Cr pair */
+ const0_16x8b = _mm_setzero_si128();
+ const1_16x8b = _mm_set1_epi8(1);
+ const0_16x8b = _mm_unpacklo_epi8(const0_16x8b, const1_16x8b);
+
+ ASSERT(pad_size % 8 == 0);
+
+ for(row = 0; row < ht; row++)
+ {
+ __m128i src_temp0_16x8b;
+
+ /* load starting at the last valid chroma pair (pu1_src[-2..-1]) */
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2));
+ pu1_dst = pu1_src;
+ src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b);
+ for(col = 0; col < pad_size; col += 8)
+ {
+ _mm_storel_epi64((__m128i *)(pu1_dst + col), src_temp0_16x8b);
+ }
+
+ pu1_src += src_strd;
+ }
+}
+
diff --git a/common/x86/ih264_platform_macros.h b/common/x86/ih264_platform_macros.h
new file mode 100755
index 0000000..e4b9821
--- /dev/null
+++ b/common/x86/ih264_platform_macros.h
@@ -0,0 +1,114 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ih264_platform_macros.h
+*
+* @brief
+* Platform specific Macro definitions used in the codec
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+#ifndef _IH264_PLATFORM_MACROS_H_
+#define _IH264_PLATFORM_MACROS_H_
+/* NOTE(review): identifiers starting with '_' + uppercase are reserved for
+ the implementation (C11 7.1.3); a guard like IH264_PLATFORM_MACROS_H
+ would be safer. */
+
+#include <immintrin.h>
+
+
+/* Saturation helpers; CLIP3 is presumably defined in ih264_macros.h -
+ confirm it is included before this header. */
+#define CLIP_U8(x) CLIP3(0, 255, (x))
+#define CLIP_S8(x) CLIP3(-128, 127, (x))
+
+#define CLIP_U10(x) CLIP3(0, 1023, (x))
+#define CLIP_S10(x) CLIP3(-512, 511, (x))
+
+#define CLIP_U12(x) CLIP3(0, 4095, (x))
+#define CLIP_S12(x) CLIP3(-2048, 2047, (x))
+
+#define CLIP_U16(x) CLIP3(0, 65535, (x))
+#define CLIP_S16(x) CLIP3(-32768, 32767, (x))
+
+#define MEM_ALIGN16 __attribute__ ((aligned (16)))
+
+/* Shifts guarded against UB for counts >= 32 */
+#define SHL(x,y) (((y) < 32) ? ((x) << (y)) : 0)
+#define SHR(x,y) (((y) < 32) ? ((x) >> (y)) : 0)
+
+#define SHR_NEG(val,shift) ((shift>0)?(val>>shift):(val<<(-shift)))
+#define SHL_NEG(val,shift) ((shift<0)?(val>>(-shift)):(val<<shift))
+
+
+/* 32-bit byte swap.
+ NOTE(review): the expansion is not parenthesized as a whole, 'x' appears
+ unparenthesized, and it ends with ';' - it cannot be used safely inside
+ a larger expression. Consider ((UWORD32)(...)) without the semicolon. */
+#define ITT_BIG_ENDIAN(x) ((x << 24)) | \
+ ((x & 0x0000ff00) << 8) | \
+ ((x & 0x00ff0000) >> 8) | \
+ ((UWORD32)x >> 24);
+
+
+/* Busy-wait; NOTE(review): an optimizing compiler may remove the empty
+ loop entirely. */
+#define NOP(nop_cnt) {UWORD32 nop_i; for (nop_i = 0; nop_i < nop_cnt; nop_i++);}
+
+/* ARM-style preload hint - intentionally a no-op on x86 */
+#define PLD(a)
+
+/* Count leading zeros of u4_word; returns 32 (full word width) for a zero
+ input, since __builtin_clz(0) is undefined. */
+static __inline UWORD32 CLZ(UWORD32 u4_word)
+{
+ if(u4_word)
+ return(__builtin_clz(u4_word));
+ else
+ return 32;
+}
+
+/* Count trailing zeros of u4_word (__builtin_ctz(0) is undefined, hence
+ the guard).
+ NOTE(review): returns 31 - not 32 - for a zero input, asymmetric with
+ CLZ above; confirm callers depend on this before changing. */
+static __inline UWORD32 CTZ(UWORD32 u4_word)
+{
+ if(0 == u4_word)
+ return 31;
+ else
+ {
+ unsigned int index;
+ index = __builtin_ctz(u4_word);
+ return (UWORD32)index;
+ }
+}
+
+/* Full hardware memory barrier (GCC builtin) */
+#define DATA_SYNC() __sync_synchronize()
+
+
+
+//#define INLINE __inline
+#define INLINE
+
+#define PREFETCH_ENABLE 1
+
+/* Software prefetch wrapper; 'type' is an _MM_HINT_* locality constant */
+#if PREFETCH_ENABLE
+#define PREFETCH(ptr, type) _mm_prefetch(ptr, type);
+#else
+#define PREFETCH(ptr, type)
+#endif
+
+#define MEM_ALIGN8 __attribute__ ((aligned (8)))
+/* NOTE(review): MEM_ALIGN16 is already defined earlier in this header;
+ the identical redefinition is legal but redundant. */
+#define MEM_ALIGN16 __attribute__ ((aligned (16)))
+#define MEM_ALIGN32 __attribute__ ((aligned (32)))
+
+#endif /* _IH264_PLATFORM_MACROS_H_ */
diff --git a/common/x86/ih264_resi_trans_quant_sse42.c b/common/x86/ih264_resi_trans_quant_sse42.c
new file mode 100755
index 0000000..c267651
--- /dev/null
+++ b/common/x86/ih264_resi_trans_quant_sse42.c
@@ -0,0 +1,984 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+ *******************************************************************************
+ * @file
+ * ih264_resi_trans_quant_sse42.c
+ *
+ * @brief
+ * Contains function definitions single stage forward transform for H.264
+ * It will calculate the residue, do the cf and then do quantization
+ *
+ * @author
+ * Mohit [100664]
+ *
+ * @par List of Functions:
+ * - ih264_resi_trans_quant_4x4_sse42()
+ * - ih264_resi_trans_quant_chroma_4x4_sse42()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+/* System include files */
+#include <stddef.h>
+
+/* User include files */
+#include "ih264_typedefs.h"
+#include "ih264_defs.h"
+#include "ih264_size_defs.h"
+#include "ih264_macros.h"
+#include "ih264_trans_macros.h"
+#include "ih264_trans_data.h"
+#include "ih264_structs.h"
+#include "ih264_trans_quant_itrans_iquant.h"
+#include <immintrin.h>
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs forward transform and quantization on a 4*4 block
+ *
+ * @par Description:
+ * The function accepts source buffer and estimation buffer. From these, it
+ * computes the residue. This is residue is then transformed and quantized.
+ * The transform and quantization are in placed computed. They use the residue
+ * buffer for this.
+ *
+ * @param[in] pu1_src
+ * Pointer to source sub-block
+ *
+ * @param[in] pu1_pred
+ * Pointer to prediction sub-block
+ *
+ * @param[in] pi2_out
+ * Pointer to residual sub-block
+ *
+ * @param[in] src_strd
+ * Source stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Destination stride
+ *
+ * @param[in] u4_qbits
+ * QP_BITS_h264_4x4 + floor(QP/6)
+ *
+ * @param[in] pu2_threshold_matrix
+ * Pointer to Forward Quant Threshold Matrix
+ *
+ * @param[in] pu2_scale_matrix
+ * Pointer to Forward Quant Scale Matrix
+ *
+ * @param[in] u4_round_factor
+ * Quantization Round factor
+ *
+ * @param[out] pu1_nnz
+ * Total non-zero coefficients in the current sub-block
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+void ih264_resi_trans_quant_4x4_sse42(UWORD8 *pu1_src, UWORD8 *pu1_pred,
+ WORD16 *pi2_out, WORD32 src_strd, WORD32 pred_strd,
+ const UWORD16 *pu2_scale_matrix, const UWORD16 *pu2_threshold_matrix,
+ UWORD32 u4_qbits, UWORD32 u4_round_factor, UWORD8 *pu1_nnz,
+ WORD16 *pi2_alt_dc_addr)
+{
+ WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
+ WORD32 mask0, mask1;
+ __m128i sum0, sum1, sum2, cmp0, cmp1;
+ __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
+ __m128i temp_2 = _mm_set1_epi16(2);
+ __m128i temp_1 = _mm_set1_epi16(1);
+ __m128i src_r0, src_r1, src_r2, src_r3;
+ __m128i pred_r0, pred_r1, pred_r2, pred_r3;
+ __m128i temp0, temp1, temp2, temp3;
+ __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
+ __m128i sign_reg0, sign_reg2;
+ __m128i scalemat_r0_r1, scalemat_r2_r3;
+ scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
+ scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
+ src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits
+ src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[src_strd])); //a10 a11 a12 a13 0 0 0 0 0 0 0 0 -- all 8 bits
+ src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * src_strd])); //a20 a21 a22 a23 0 0 0 0 0 0 0 0 -- all 8 bits
+ src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * src_strd])); //a30 a31 a32 a33 0 0 0 0 0 0 0 0 -- all 8 bits
+
+ src_r0 = _mm_cvtepu8_epi16(src_r0);
+ src_r1 = _mm_cvtepu8_epi16(src_r1);
+ src_r2 = _mm_cvtepu8_epi16(src_r2);
+ src_r3 = _mm_cvtepu8_epi16(src_r3);
+
+ pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits
+
+ pred_r0 = _mm_cvtepu8_epi16(pred_r0); //p00 p01 p02 p03 -- all 16 bits
+ pred_r1 = _mm_cvtepu8_epi16(pred_r1); //p10 p11 p12 p13 -- all 16 bits
+ pred_r2 = _mm_cvtepu8_epi16(pred_r2); //p20 p21 p22 p23 -- all 16 bits
+ pred_r3 = _mm_cvtepu8_epi16(pred_r3); //p30 p31 p32 p33 -- all 16 bits
+
+ src_r0 = _mm_sub_epi16(src_r0, pred_r0);
+ src_r1 = _mm_sub_epi16(src_r1, pred_r1);
+ src_r2 = _mm_sub_epi16(src_r2, pred_r2);
+ src_r3 = _mm_sub_epi16(src_r3, pred_r3);
+
+ /* Perform Forward transform */
+ /*-------------------------------------------------------------*/
+ /* DCT [ Horizontal transformation ] */
+ /*-------------------------------------------------------------*/
+ // Matrix transpose
+ /*
+ * a0 a1 a2 a3
+ * b0 b1 b2 b3
+ * c0 c1 c2 c3
+ * d0 d1 d2 d3
+ */
+ temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 b0 a1 b1 a2 b2 a3 b3
+ temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //c0 d0 c1 d1 c2 d2 c3 d3
+ temp1 = _mm_unpacklo_epi32(temp0, temp2); //a0 b0 c0 d0 a1 b1 c1 d1
+ temp3 = _mm_unpackhi_epi32(temp0, temp2); //a2 b2 c2 d2 a3 b3 c3 d3
+
+ src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 b0 c0 d0
+ src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //a1 b1 c1 d1
+ src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //a2 b2 c2 d2
+ src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //a3 b3 c3 d3
+
+ /*----------------------------------------------------------*/
+ /* x0 = z0 + z3 */
+ temp0 = _mm_add_epi16(src_r0, src_r3);
+ /* x1 = z1 + z2 */
+ temp1 = _mm_add_epi16(src_r1, src_r2);
+ /* x2 = z1 - z2 */
+ temp2 = _mm_sub_epi16(src_r1, src_r2);
+ /* x3 = z0 - z3 */
+ temp3 = _mm_sub_epi16(src_r0, src_r3);
+
+ /* z0 = x0 + x1 */
+ src_r0 = _mm_add_epi16(temp0, temp1);
+ /* z1 = (x3 << 1) + x2 */
+ src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1)
+ src_r1 = _mm_add_epi16(src_r1, temp2);
+ /* z2 = x0 - x1 */
+ src_r2 = _mm_sub_epi16(temp0, temp1);
+ /* z3 = x3 - (x2 << 1) */
+ src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1)
+ src_r3 = _mm_sub_epi16(temp3, src_r3);
+
+ // Matrix transpose
+ /*
+ * a0 b0 c0 d0
+ * a1 b1 c1 d1
+ * a2 b2 c2 d2
+ * a3 b3 c3 d3
+ */
+ temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 a1 b0 b1 c0 c1 d0 d1
+ temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //a2 a3 b2 b3 c2 c3 d2 d3
+ temp1 = _mm_unpacklo_epi32(temp0, temp2); //a0 a1 a2 a3 b0 b1 b2 b3
+ temp3 = _mm_unpackhi_epi32(temp0, temp2); //c0 c1 c2 c3 d0 d1 d2 d3
+
+ src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 a1 a2 a3
+ src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //b0 b1 b2 b3
+ src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //c0 c1 c2 c3
+ src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //d0 d1 d2 d3
+
+ /*----------------------------------------------------------*/
+ /* x0 = z0 + z3 */
+ temp0 = _mm_add_epi16(src_r0, src_r3);
+ /* x1 = z1 + z2 */
+ temp1 = _mm_add_epi16(src_r1, src_r2);
+ /* x2 = z1 - z2 */
+ temp2 = _mm_sub_epi16(src_r1, src_r2);
+ /* x3 = z0 - z3 */
+ temp3 = _mm_sub_epi16(src_r0, src_r3);
+
+ /* z0 = x0 + x1 */
+ src_r0 = _mm_add_epi16(temp0, temp1);
+ /* z1 = (x3 << 1) + x2 */
+ src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1)
+ src_r1 = _mm_add_epi16(src_r1, temp2);
+ /* z2 = x0 - x1 */
+ src_r2 = _mm_sub_epi16(temp0, temp1);
+ /* z3 = x3 - (x2 << 1) */
+ src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1)
+ src_r3 = _mm_sub_epi16(temp3, src_r3);
+
+ tmp_dc = _mm_extract_epi16(src_r0,0); //a0
+ *pi2_alt_dc_addr = tmp_dc;
+
+ src_r0 = _mm_unpacklo_epi64(src_r0, src_r1); //a0 a1 a2 a3 b0 b1 b2 b3
+ src_r2 = _mm_unpacklo_epi64(src_r2, src_r3); //c0 c1 c2 c3 d0 d1 d2 d3
+ sign_reg0 = _mm_cmpgt_epi16(zero_8x16b,src_r0);
+ sign_reg2 = _mm_cmpgt_epi16(zero_8x16b,src_r2);
+
+ sign_reg0 = _mm_mullo_epi16(temp_2,sign_reg0);
+ sign_reg2 = _mm_mullo_epi16(temp_2,sign_reg2);
+
+ sign_reg0 = _mm_add_epi16(temp_1,sign_reg0);
+ sign_reg2 = _mm_add_epi16(temp_1,sign_reg2);
+
+ src_r0 = _mm_abs_epi16(src_r0);
+ src_r2 = _mm_abs_epi16(src_r2);
+
+ src_r1 = _mm_srli_si128(src_r0, 8);
+ src_r0 = _mm_cvtepu16_epi32(src_r0);
+ src_r1 = _mm_cvtepu16_epi32(src_r1);
+ src_r3 = _mm_srli_si128(src_r2, 8);
+ src_r2 = _mm_cvtepu16_epi32(src_r2);
+ src_r3 = _mm_cvtepu16_epi32(src_r3);
+
+ temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
+ scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
+ temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
+ scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
+ temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
+ temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);
+
+ temp0 = _mm_mullo_epi32(temp0, src_r0);
+ temp1 = _mm_mullo_epi32(temp1, src_r1);
+ temp2 = _mm_mullo_epi32(temp2, src_r2);
+ temp3 = _mm_mullo_epi32(temp3, src_r3);
+
+ temp0 = _mm_add_epi32(temp0,rnd_fact);
+ temp1 = _mm_add_epi32(temp1,rnd_fact);
+ temp2 = _mm_add_epi32(temp2,rnd_fact);
+ temp3 = _mm_add_epi32(temp3,rnd_fact);
+
+ temp0 = _mm_srli_epi32(temp0,u4_qbits);
+ temp1 = _mm_srli_epi32(temp1,u4_qbits);
+ temp2 = _mm_srli_epi32(temp2,u4_qbits);
+ temp3 = _mm_srli_epi32(temp3,u4_qbits);
+
+ temp0 = _mm_packs_epi32 (temp0,temp1);
+ temp2 = _mm_packs_epi32 (temp2,temp3);
+
+ temp0 = _mm_sign_epi16(temp0, sign_reg0);
+ temp2 = _mm_sign_epi16(temp2, sign_reg2);
+
+ _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
+ _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);
+
+ cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
+ cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);
+
+ mask0 = _mm_movemask_epi8(cmp0);
+ mask1 = _mm_movemask_epi8(cmp1);
+ u4_zero_coeff = 0;
+ if(mask0)
+ {
+ if(mask0 == 0xffff)
+ u4_zero_coeff+=8;
+ else
+ {
+ cmp0 = _mm_and_si128(temp_1, cmp0);
+ sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
+ sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
+ sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
+ u4_zero_coeff += _mm_cvtsi128_si32(sum2);
+ }
+ }
+ if(mask1)
+ {
+ if(mask1 == 0xffff)
+ u4_zero_coeff+=8;
+ else
+ {
+ cmp1 = _mm_and_si128(temp_1, cmp1);
+ sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
+ sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
+ sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
+ u4_zero_coeff += _mm_cvtsi128_si32(sum2);
+ }
+ }
+
+ /* Return total nonzero coefficients in the current sub block */
+ u4_nonzero_coeff = 16 - u4_zero_coeff;
+ *pu1_nnz = u4_nonzero_coeff;
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs forward transform and quantization on a 4*4 chroma block
+ *
+ * @par Description:
+ * The function accepts a source buffer and a prediction buffer. From these,
+ * it computes the residue. This residue is then transformed and quantized.
+ * The transform and quantization are computed in place, using the residue
+ * buffer as working storage.
+ *
+ * @param[in] pu1_src
+ * Pointer to source sub-block
+ *
+ * @param[in] pu1_pred
+ * Pointer to prediction sub-block
+ *
+ * @param[in] pi2_out
+ * Pointer to residual sub-block
+ *
+ * @param[in] src_strd
+ * Source stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[out] pi2_alt_dc_addr
+ * Address at which the unquantized DC coefficient of the block is stored
+ *
+ * @param[in] u4_qbits
+ * QP_BITS_h264_4x4 + floor(QP/6)
+ *
+ * @param[in] pu2_threshold_matrix
+ * Pointer to Forward Quant Threshold Matrix
+ *
+ * @param[in] pu2_scale_matrix
+ * Pointer to Forward Quant Scale Matrix
+ *
+ * @param[in] u4_round_factor
+ * Quantization Round factor
+ *
+ * @param[out] pu1_nnz
+ * Total non-zero coefficients in the current sub-block
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+/* Compute the chroma residue (pu1_src - pu1_pred) for one 4x4 sub-block, */
+/* forward-transform it (4x4 core transform: horizontal pass, transpose, */
+/* vertical pass), quantize the result into pi2_out, and report the count */
+/* of non-zero quantized coefficients through *pu1_nnz. The unquantized DC */
+/* term is additionally written to *pi2_alt_dc_addr for the separate */
+/* chroma DC path. */
+/* NOTE(review): pu2_threshold_matrix is accepted but never referenced in */
+/* this implementation - confirm against the C reference version. */
+void ih264_resi_trans_quant_chroma_4x4_sse42(UWORD8 *pu1_src,UWORD8 *pu1_pred,WORD16 *pi2_out,
+ WORD32 src_strd,WORD32 pred_strd,
+ const UWORD16 *pu2_scale_matrix,
+ const UWORD16 *pu2_threshold_matrix,
+ UWORD32 u4_qbits,UWORD32 u4_round_factor,
+ UWORD8 *pu1_nnz, WORD16 *pi2_alt_dc_addr)
+{
+ WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
+ WORD32 mask0, mask1;
+ __m128i cmp0, cmp1, sum0, sum1, sum2;
+ __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
+ __m128i temp_2 = _mm_set1_epi16(2);
+ __m128i temp_1 = _mm_set1_epi16(1);
+ __m128i src_r0, src_r1, src_r2, src_r3;
+ __m128i pred_r0, pred_r1, pred_r2, pred_r3;
+ __m128i temp0, temp1, temp2, temp3;
+ __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
+ __m128i sign_reg0, sign_reg2;
+ __m128i scalemat_r0_r1, scalemat_r2_r3;
+ /* 0x00FF in every 16-bit lane: keeps the low byte of each byte pair, */
+ /* i.e. selects one chroma component from byte-interleaved data - */
+ /* presumably Cb/Cr deinterleaving; confirm expected layout with caller. */
+ __m128i chroma_mask = _mm_set1_epi16 (0xFF);
+
+ scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
+ scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
+ src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits
+ src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[src_strd])); //a10 a11 a12 a13 0 0 0 0 0 0 0 0 -- all 8 bits
+ src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * src_strd])); //a20 a21 a22 a23 0 0 0 0 0 0 0 0 -- all 8 bits
+ src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * src_strd])); //a30 a31 a32 a33 0 0 0 0 0 0 0 0 -- all 8 bits
+
+ /* Masking the low byte of each 16-bit lane both selects the component */
+ /* and widens the bytes to 16-bit lanes (replaces the cvtepu8 variant */
+ /* kept below for reference). */
+ src_r0 = _mm_and_si128(src_r0, chroma_mask);
+ src_r1 = _mm_and_si128(src_r1, chroma_mask);
+ src_r2 = _mm_and_si128(src_r2, chroma_mask);
+ src_r3 = _mm_and_si128(src_r3, chroma_mask);
+// src_r0 = _mm_cvtepu8_epi16(src_r0);
+// src_r1 = _mm_cvtepu8_epi16(src_r1);
+// src_r2 = _mm_cvtepu8_epi16(src_r2);
+// src_r3 = _mm_cvtepu8_epi16(src_r3);
+
+ pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
+ pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits
+
+ pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
+ pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
+ pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
+ pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
+// pred_r0 = _mm_cvtepu8_epi16(pred_r0); //p00 p01 p02 p03 -- all 16 bits
+// pred_r1 = _mm_cvtepu8_epi16(pred_r1); //p10 p11 p12 p13 -- all 16 bits
+// pred_r2 = _mm_cvtepu8_epi16(pred_r2); //p20 p21 p22 p23 -- all 16 bits
+// pred_r3 = _mm_cvtepu8_epi16(pred_r3); //p30 p31 p32 p33 -- all 16 bits
+
+ /* 16-bit residue: source minus prediction, one row per register */
+ src_r0 = _mm_sub_epi16(src_r0, pred_r0);
+ src_r1 = _mm_sub_epi16(src_r1, pred_r1);
+ src_r2 = _mm_sub_epi16(src_r2, pred_r2);
+ src_r3 = _mm_sub_epi16(src_r3, pred_r3);
+
+ /* Perform Forward transform */
+ /*-------------------------------------------------------------*/
+ /* DCT [ Horizontal transformation ] */
+ /*-------------------------------------------------------------*/
+ // Matrix transpose
+ /*
+ * a0 a1 a2 a3
+ * b0 b1 b2 b3
+ * c0 c1 c2 c3
+ * d0 d1 d2 d3
+ */
+ temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 b0 a1 b1 a2 b2 a3 b3
+ temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //c0 d0 c1 d1 c2 d2 c3 d3
+ temp1 = _mm_unpacklo_epi32(temp0, temp2); //a0 b0 c0 d0 a1 b1 c1 d1
+ temp3 = _mm_unpackhi_epi32(temp0, temp2); //a2 b2 c2 d2 a3 b3 c3 d3
+
+ src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 b0 c0 d0
+ src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //a1 b1 c1 d1
+ src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //a2 b2 c2 d2
+ src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //a3 b3 c3 d3
+
+ /* Butterfly stage: sums/differences of the outer and inner pairs */
+ /*----------------------------------------------------------*/
+ /* x0 = z0 + z3 */
+ temp0 = _mm_add_epi16(src_r0, src_r3);
+ /* x1 = z1 + z2 */
+ temp1 = _mm_add_epi16(src_r1, src_r2);
+ /* x2 = z1 - z2 */
+ temp2 = _mm_sub_epi16(src_r1, src_r2);
+ /* x3 = z0 - z3 */
+ temp3 = _mm_sub_epi16(src_r0, src_r3);
+
+ /* z0 = x0 + x1 */
+ src_r0 = _mm_add_epi16(temp0, temp1);
+ /* z1 = (x3 << 1) + x2 */
+ src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1)
+ src_r1 = _mm_add_epi16(src_r1, temp2);
+ /* z2 = x0 - x1 */
+ src_r2 = _mm_sub_epi16(temp0, temp1);
+ /* z3 = x3 - (x2 << 1) */
+ src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1)
+ src_r3 = _mm_sub_epi16(temp3, src_r3);
+
+ /* Transpose back, then repeat the same butterfly for the vertical pass */
+ // Matrix transpose
+ /*
+ * a0 b0 c0 d0
+ * a1 b1 c1 d1
+ * a2 b2 c2 d2
+ * a3 b3 c3 d3
+ */
+ temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 a1 b0 b1 c0 c1 d0 d1
+ temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //a2 a3 b2 b3 c2 c3 d2 d3
+ temp1 = _mm_unpacklo_epi32(temp0, temp2); //a0 a1 a2 a3 b0 b1 b2 b3
+ temp3 = _mm_unpackhi_epi32(temp0, temp2); //c0 c1 c2 c3 d0 d1 d2 d3
+
+ src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 a1 a2 a3
+ src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //b0 b1 b2 b3
+ src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //c0 c1 c2 c3
+ src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //d0 d1 d2 d3
+
+ /*----------------------------------------------------------*/
+ /* x0 = z0 + z3 */
+ temp0 = _mm_add_epi16(src_r0, src_r3);
+ /* x1 = z1 + z2 */
+ temp1 = _mm_add_epi16(src_r1, src_r2);
+ /* x2 = z1 - z2 */
+ temp2 = _mm_sub_epi16(src_r1, src_r2);
+ /* x3 = z0 - z3 */
+ temp3 = _mm_sub_epi16(src_r0, src_r3);
+
+ /* z0 = x0 + x1 */
+ src_r0 = _mm_add_epi16(temp0, temp1);
+ /* z1 = (x3 << 1) + x2 */
+ src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1)
+ src_r1 = _mm_add_epi16(src_r1, temp2);
+ /* z2 = x0 - x1 */
+ src_r2 = _mm_sub_epi16(temp0, temp1);
+ /* z3 = x3 - (x2 << 1) */
+ src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1)
+ src_r3 = _mm_sub_epi16(temp3, src_r3);
+
+ /* Export the unquantized DC term (position 0,0) for the separate */
+ /* chroma DC processing stage. */
+ tmp_dc = _mm_extract_epi16(src_r0,0); //a0
+ *pi2_alt_dc_addr = tmp_dc;
+
+ src_r0 = _mm_unpacklo_epi64(src_r0, src_r1); //a0 a1 a2 a3 b0 b1 b2 b3
+ src_r2 = _mm_unpacklo_epi64(src_r2, src_r3); //c0 c1 c2 c3 d0 d1 d2 d3
+ /* Build a per-lane sign of +1/-1: cmpgt yields -1 for negative lanes, */
+ /* then 1 + 2*(-1) = -1, 1 + 2*0 = +1. Used to restore signs after */
+ /* quantizing absolute values. */
+ sign_reg0 = _mm_cmpgt_epi16(zero_8x16b,src_r0);
+ sign_reg2 = _mm_cmpgt_epi16(zero_8x16b,src_r2);
+
+ sign_reg0 = _mm_mullo_epi16(temp_2,sign_reg0);
+ sign_reg2 = _mm_mullo_epi16(temp_2,sign_reg2);
+
+ sign_reg0 = _mm_add_epi16(temp_1,sign_reg0);
+ sign_reg2 = _mm_add_epi16(temp_1,sign_reg2);
+
+ src_r0 = _mm_abs_epi16(src_r0);
+ src_r2 = _mm_abs_epi16(src_r2);
+
+ /* Widen the 16-bit magnitudes to 32-bit lanes for the scale multiply */
+ src_r1 = _mm_srli_si128(src_r0, 8);
+ src_r0 = _mm_cvtepu16_epi32(src_r0);
+ src_r1 = _mm_cvtepu16_epi32(src_r1);
+ src_r3 = _mm_srli_si128(src_r2, 8);
+ src_r2 = _mm_cvtepu16_epi32(src_r2);
+ src_r3 = _mm_cvtepu16_epi32(src_r3);
+
+ temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
+ scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
+ temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
+ scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
+ temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
+ temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);
+
+ /* Quantize: (|coeff| * scale + round_factor) >> u4_qbits */
+ temp0 = _mm_mullo_epi32(temp0, src_r0);
+ temp1 = _mm_mullo_epi32(temp1, src_r1);
+ temp2 = _mm_mullo_epi32(temp2, src_r2);
+ temp3 = _mm_mullo_epi32(temp3, src_r3);
+
+ temp0 = _mm_add_epi32(temp0,rnd_fact);
+ temp1 = _mm_add_epi32(temp1,rnd_fact);
+ temp2 = _mm_add_epi32(temp2,rnd_fact);
+ temp3 = _mm_add_epi32(temp3,rnd_fact);
+
+ temp0 = _mm_srli_epi32(temp0,u4_qbits);
+ temp1 = _mm_srli_epi32(temp1,u4_qbits);
+ temp2 = _mm_srli_epi32(temp2,u4_qbits);
+ temp3 = _mm_srli_epi32(temp3,u4_qbits);
+
+ temp0 = _mm_packs_epi32 (temp0,temp1);
+ temp2 = _mm_packs_epi32 (temp2,temp3);
+
+ /* Reapply the saved signs to the quantized magnitudes */
+ temp0 = _mm_sign_epi16(temp0, sign_reg0);
+ temp2 = _mm_sign_epi16(temp2, sign_reg2);
+
+ //temp0 = _mm_insert_epi16(temp0, tmp_dc, 0);
+
+ _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
+ _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);
+
+ /* Count zero coefficients: movemask is 0xffff when all eight 16-bit */
+ /* lanes of a register are zero; otherwise horizontally sum the */
+ /* per-lane compare results (each contributes 1 when zero). */
+ cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
+ cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);
+
+ mask0 = _mm_movemask_epi8(cmp0);
+ mask1 = _mm_movemask_epi8(cmp1);
+ u4_zero_coeff = 0;
+ if(mask0)
+ {
+ if(mask0 == 0xffff)
+ u4_zero_coeff+=8;
+ else
+ {
+ cmp0 = _mm_and_si128(temp_1, cmp0);
+ sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
+ sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
+ sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
+ u4_zero_coeff += _mm_cvtsi128_si32(sum2);
+ }
+ }
+ if(mask1)
+ {
+ if(mask1 == 0xffff)
+ u4_zero_coeff+=8;
+ else
+ {
+ cmp1 = _mm_and_si128(temp_1, cmp1);
+ sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
+ sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
+ sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
+ u4_zero_coeff += _mm_cvtsi128_si32(sum2);
+ }
+ }
+
+ /* Return total nonzero coefficients in the current sub block */
+ u4_nonzero_coeff = 16 - u4_zero_coeff;
+ *pu1_nnz = u4_nonzero_coeff;
+
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs forward hadamard transform and quantization on a 4*4 block
+ *
+ * @par Description:
+ * The function accepts a 4x4 block of 16-bit coefficients (typically the
+ * DC terms of the sixteen 4x4 luma transforms), applies a 4x4 Hadamard
+ * transform to it and quantizes the result into the destination buffer.
+ *
+ * @param[in] pi2_src
+ * Pointer to the source coefficient sub-block
+ *
+ * @param[out] pi2_dst
+ * Pointer to the destination (quantized) sub-block
+ *
+ * @param[in] u4_qbits
+ * QP_BITS_h264_4x4 + floor(QP/6)
+ *
+ * @param[in] pu2_threshold_matrix
+ * Pointer to Forward Quant Threshold Matrix
+ *
+ * @param[in] pu2_scale_matrix
+ * Pointer to Forward Quant Scale Matrix
+ *
+ * @param[in] u4_round_factor
+ * Quantization Round factor
+ *
+ * @param[out] pu1_nnz
+ * Total non-zero coefficients in the current sub-block
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ */
+
+/* Apply a 4x4 Hadamard transform to the 16-bit input coefficients, halve */
+/* the results, quantize them into pi2_dst with a single flat scale */
+/* (pu2_scale_matrix[0]), and report the non-zero count via pu1_nnz[0]. */
+/* NOTE(review): pu2_threshold_matrix is accepted but never referenced in */
+/* this implementation - confirm against the C reference version. */
+void ih264_hadamard_quant_4x4_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
+ const UWORD16 *pu2_scale_matrix,
+ const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
+ UWORD32 u4_round_factor,UWORD8 *pu1_nnz
+ )
+{
+ WORD32 u4_zero_coeff,u4_nonzero_coeff=0;
+ __m128i cmp0, cmp1, sum0, sum1, sum2;
+ WORD32 mask0, mask1;
+ __m128i src_r0_r1, src_r2_r3, sign_reg;
+ __m128i src_r0, src_r1, src_r2, src_r3;
+ __m128i zero_8x16b = _mm_setzero_si128();
+ __m128i temp0, temp1, temp2, temp3;
+ __m128i sign_reg0, sign_reg1, sign_reg2, sign_reg3;
+ __m128i temp_1 = _mm_set1_epi16(1);
+ __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
+ __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]);
+
+ src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
+ src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
+ /* Sign-extend each 16-bit row to 32-bit lanes: unpacking with the */
+ /* cmpgt mask supplies the high (sign) halves. */
+ sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1)
+ src_r0 = _mm_unpacklo_epi16(src_r0_r1, sign_reg); //a0 a1 a2 a3
+ src_r1 = _mm_unpackhi_epi16(src_r0_r1, sign_reg); //b0 b1 b2 b3
+ sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r2_r3);
+ src_r2 = _mm_unpacklo_epi16(src_r2_r3, sign_reg); //c0 c1 c2 c3
+ src_r3 = _mm_unpackhi_epi16(src_r2_r3, sign_reg); //d0 d1 d2 d3
+
+ /* Perform Forward transform */
+ /*-------------------------------------------------------------*/
+ /* Forward DC transform [ Horizontal transformation ] */
+ /*-------------------------------------------------------------*/
+ // Matrix transpose
+ /*
+ * a0 a1 a2 a3
+ * b0 b1 b2 b3
+ * c0 c1 c2 c3
+ * d0 d1 d2 d3
+ */
+ temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 b0 a1 b1
+ temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //c0 d0 c1 d1
+ temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //a2 b2 a3 b3
+ temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 d2 c3 d3
+ src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 b0 c0 d0
+ src_r1 = _mm_unpackhi_epi64(temp0, temp2); //a1 b1 c1 d1
+ src_r2 = _mm_unpacklo_epi64(temp1, temp3); //a2 b2 c2 d2
+ src_r3 = _mm_unpackhi_epi64(temp1, tem3); //a3 b3 c3 d3
+
+ /* Hadamard butterfly: all coefficients are +/-1 (no <<1 terms) */
+ temp0 = _mm_add_epi32(src_r0, src_r3);
+ temp1 = _mm_add_epi32(src_r1, src_r2);
+ temp2 = _mm_sub_epi32(src_r1, src_r2);
+ temp3 = _mm_sub_epi32(src_r0, src_r3);
+
+ src_r0 = _mm_add_epi32(temp0, temp1);
+ src_r1 = _mm_add_epi32(temp2, temp3);
+ src_r2 = _mm_sub_epi32(temp0, temp1);
+ src_r3 = _mm_sub_epi32(temp3, temp2);
+
+ /*-------------------------------------------------------------*/
+ /* Forward DC transform [ Vertical transformation ] */
+ /*-------------------------------------------------------------*/
+ // Matrix transpose
+ /*
+ * a0 b0 c0 d0
+ * a1 b1 c1 d1
+ * a2 b2 c2 d2
+ * a3 b3 c3 d3
+ */
+ temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 a1 b0 b1
+ temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //a2 a3 b2 b3
+ temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //c0 c1 d0 d1
+ temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 c3 d2 d3
+ src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 a1 a2 a3
+ src_r1 = _mm_unpackhi_epi64(temp0, temp2); //b0 b1 b2 b3
+ src_r2 = _mm_unpacklo_epi64(temp1, temp3); //c0 c1 c2 c3
+ src_r3 = _mm_unpackhi_epi64(temp1, temp3); //d0 d1 d2 d3
+
+ temp0 = _mm_add_epi32(src_r0, src_r3);
+ temp1 = _mm_add_epi32(src_r1, src_r2);
+ temp2 = _mm_sub_epi32(src_r1, src_r2);
+ temp3 = _mm_sub_epi32(src_r0, src_r3);
+
+ src_r0 = _mm_add_epi32(temp0, temp1);
+ src_r1 = _mm_add_epi32(temp2, temp3);
+ src_r2 = _mm_sub_epi32(temp0, temp1);
+ src_r3 = _mm_sub_epi32(temp3, temp2);
+
+ /* Arithmetic >>1 after the second pass (done in 32-bit, so no */
+ /* intermediate overflow) */
+ src_r0 = _mm_srai_epi32(src_r0, 1);
+ src_r1 = _mm_srai_epi32(src_r1, 1);
+ src_r2 = _mm_srai_epi32(src_r2, 1);
+ src_r3 = _mm_srai_epi32(src_r3, 1);
+
+ // Quantization
+ sign_reg0 = _mm_cmpgt_epi32(zero_8x16b, src_r0); //Find sign of each value for later restoration
+ sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, src_r1);
+ sign_reg2 = _mm_cmpgt_epi32(zero_8x16b, src_r2);
+ sign_reg3 = _mm_cmpgt_epi32(zero_8x16b, src_r3);
+
+ sign_reg0 = _mm_packs_epi32(sign_reg0, sign_reg1); //Sign = -1 or 0 depending on <0 or >0 respectively
+ sign_reg2 = _mm_packs_epi32(sign_reg2, sign_reg3);
+
+ sign_reg0 = _mm_slli_epi16(sign_reg0, 1); //Sign = -2 or 0 depending on <0 or >0 respectively
+ sign_reg2 = _mm_slli_epi16(sign_reg2, 1);
+
+ sign_reg0 = _mm_add_epi16(temp_1,sign_reg0); //Sign = -1 or 1 depending on <0 or >0 respectively
+ sign_reg2 = _mm_add_epi16(temp_1,sign_reg2);
+
+ src_r0 = _mm_abs_epi32(src_r0); //Absolute values
+ src_r1 = _mm_abs_epi32(src_r1);
+ src_r2 = _mm_abs_epi32(src_r2);
+ src_r3 = _mm_abs_epi32(src_r3);
+
+ temp0 = _mm_mullo_epi32(scale_val, src_r0); //multiply by pu2_scale_matrix[0]
+ temp1 = _mm_mullo_epi32(scale_val, src_r1);
+ temp2 = _mm_mullo_epi32(scale_val, src_r2);
+ temp3 = _mm_mullo_epi32(scale_val, src_r3);
+
+ temp0 = _mm_add_epi32(temp0,rnd_fact); //Add round factor
+ temp1 = _mm_add_epi32(temp1,rnd_fact);
+ temp2 = _mm_add_epi32(temp2,rnd_fact);
+ temp3 = _mm_add_epi32(temp3,rnd_fact);
+
+ temp0 = _mm_srli_epi32(temp0,u4_qbits); //Right shift by qbits; values are non-negative here, so logical shift works
+ temp1 = _mm_srli_epi32(temp1,u4_qbits);
+ temp2 = _mm_srli_epi32(temp2,u4_qbits);
+ temp3 = _mm_srli_epi32(temp3,u4_qbits);
+
+ temp0 = _mm_packs_epi32 (temp0,temp1); //Final values are 16-bits only.
+ temp2 = _mm_packs_epi32 (temp2,temp3);
+
+ temp0 = _mm_sign_epi16(temp0, sign_reg0); //Sign restoration
+ temp2 = _mm_sign_epi16(temp2, sign_reg2);
+
+ _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0);
+ _mm_storeu_si128((__m128i *) (&pi2_dst[8]), temp2);
+
+ /* Count zero coefficients: movemask is 0xffff when all eight lanes of */
+ /* a register are zero; otherwise horizontally sum the compare results. */
+ cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
+ cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);
+
+ mask0 = _mm_movemask_epi8(cmp0);
+ mask1 = _mm_movemask_epi8(cmp1);
+ u4_zero_coeff = 0;
+ if(mask0)
+ {
+ if(mask0 == 0xffff)
+ u4_zero_coeff+=8;
+ else
+ {
+ cmp0 = _mm_and_si128(temp_1, cmp0);
+ sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
+ sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
+ sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
+ u4_zero_coeff += _mm_cvtsi128_si32(sum2);
+ }
+ }
+ if(mask1)
+ {
+ if(mask1 == 0xffff)
+ u4_zero_coeff+=8;
+ else
+ {
+ cmp1 = _mm_and_si128(temp_1, cmp1);
+ sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
+ sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
+ sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
+ u4_zero_coeff += _mm_cvtsi128_si32(sum2);
+ }
+ }
+
+ /* Return total nonzero coefficients in the current sub block */
+ u4_nonzero_coeff = 16 - u4_zero_coeff;
+ pu1_nnz[0] = u4_nonzero_coeff;
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs forward hadamard transform and quantization on a 2*2 block
+ * for both U and V planes
+ *
+ * @par Description:
+ * The function accepts source buffer and estimation buffer. From these, it
+ * computes the residue. This is residue is then transformed and quantized.
+ * The transform and quantization are in placed computed. They use the residue
+ * buffer for this.
+ *
+ * @param[in] pi2_src
+ * Pointer to the source coefficient sub-block (U plane followed by V plane)
+ *
+ * @param[out] pi2_dst
+ * Pointer to the destination (quantized) sub-block
+ *
+ * @param[in] u4_qbits
+ * QP_BITS_h264_4x4 + floor(QP/6)
+ *
+ * @param[in] pu2_threshold_matrix
+ * Pointer to Forward Quant Threshold Matrix
+ *
+ * @param[in] pu2_scale_matrix
+ * Pointer to Forward Quant Scale Matrix
+ *
+ * @param[in] u4_round_factor
+ * Quantization Round factor
+ *
+ * @param[out] pu1_nnz
+ * Total non-zero coefficients in the current sub-block
+ *
+ * @returns
+ *
+ * @remarks
+ * NNZ for dc is populated at 0 and 5th position of pu1_nnz
+ *
+ */
+
+/* Apply a 2x2 Hadamard transform to the chroma DC coefficients of the U */
+/* and V planes (four 16-bit values each, U first in pi2_src), quantize */
+/* the results into pi2_dst with a single flat scale (pu2_scale_matrix[0]) */
+/* and write the per-plane non-zero counts to pu1_nnz[0] / pu1_nnz[1]. */
+/* NOTE(review): pu2_threshold_matrix is accepted but never referenced in */
+/* this implementation - confirm against the C reference version. */
+void ih264_hadamard_quant_2x2_uv_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
+ const UWORD16 *pu2_scale_matrix,
+ const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
+ UWORD32 u4_round_factor,UWORD8 *pu1_nnz)
+{
+ /* These counters accumulate the number of ZERO coefficients per plane */
+ /* (nnz is derived as 4 - count at the end). Initialized in the */
+ /* declaration itself so no statement precedes the declarations */
+ /* (portable to C90 and consistent with the rest of the file). */
+ WORD32 val, zero_coeff_0 = 0, zero_coeff_1 = 0;
+ __m128i cmp, cmp0, cmp1;
+ __m128i sum0, sum1;
+ WORD32 mask, mask0, mask1;
+ __m128i src, plane_0, plane_1, temp0, temp1, sign_reg;
+ __m128i zero_8x16b = _mm_setzero_si128();
+ __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]);
+ __m128i sign_reg0, sign_reg1;
+ __m128i temp_1 = _mm_set1_epi16(1);
+ __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
+
+ src = _mm_loadu_si128((__m128i *)pi2_src); //a0 a1 a2 a3 b0 b1 b2 b3
+ /* Sign-extend the eight 16-bit inputs to 32-bit lanes */
+ sign_reg = _mm_cmpgt_epi16(zero_8x16b, src);
+ plane_0 = _mm_unpacklo_epi16(src, sign_reg); //a0 a1 a2 a3 -- 32 bits
+ plane_1 = _mm_unpackhi_epi16(src, sign_reg); //b0 b1 b2 b3 -- 32 bits
+
+ /* 2x2 Hadamard on both planes at once via horizontal add/sub pairs */
+ temp0 = _mm_hadd_epi32(plane_0, plane_1); //a0+a1 a2+a3 b0+b1 b2+b3
+ temp1 = _mm_hsub_epi32(plane_0, plane_1); //a0-a1 a2-a3 b0-b1 b2-b3
+
+ plane_0 = _mm_hadd_epi32(temp0, temp1); //a0+a1+a2+a3 b0+b1+b2+b3 a0-a1+a2-a3 b0-b1+b2-b3
+ plane_1 = _mm_hsub_epi32(temp0, temp1); //a0+a1-a2-a3 b0+b1-b2-b3 a0-a1-a2+a3 b0-b1-b2+b3
+
+ temp0 = _mm_unpacklo_epi32(plane_0, plane_1); //a0+a1+a2+a3 a0+a1-a2-a3 b0+b1+b2+b3 b0+b1-b2-b3
+ temp1 = _mm_unpackhi_epi32(plane_0, plane_1); //a0-a1+a2-a3 a0-a1-a2+a3 b0-b1+b2-b3 b0-b1-b2+b3
+
+ plane_0 = _mm_unpacklo_epi64(temp0, temp1); //a0+a1+a2+a3 a0+a1-a2-a3 a0-a1+a2-a3 a0-a1-a2+a3
+ plane_1 = _mm_unpackhi_epi64(temp0, temp1); //b0+b1+b2+b3 b0+b1-b2-b3 b0-b1+b2-b3 b0-b1-b2+b3
+
+ plane_0 = _mm_shuffle_epi32(plane_0, 0xd8); //a0+a1+a2+a3 a0-a1+a2-a3 a0+a1-a2-a3 a0-a1-a2+a3
+ plane_1 = _mm_shuffle_epi32(plane_1, 0xd8); //b0+b1+b2+b3 b0-b1+b2-b3 b0+b1-b2-b3 b0-b1-b2+b3
+ // Quantization
+ sign_reg0 = _mm_cmpgt_epi32(zero_8x16b, plane_0); //Find sign of each value for later restoration
+ sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, plane_1);
+
+ sign_reg0 = _mm_packs_epi32(sign_reg0, sign_reg1); //Sign = -1 or 0 depending on <0 or >0 respectively
+ sign_reg0 = _mm_slli_epi16(sign_reg0, 1); //Sign = -2 or 0 depending on <0 or >0 respectively
+ sign_reg0 = _mm_add_epi16(temp_1,sign_reg0); //Sign = -1 or 1 depending on <0 or >0 respectively
+
+ plane_0 = _mm_abs_epi32(plane_0); //Absolute values
+ plane_1 = _mm_abs_epi32(plane_1);
+
+ temp0 = _mm_mullo_epi32(scale_val, plane_0); //multiply by pu2_scale_matrix[0]
+ temp1 = _mm_mullo_epi32(scale_val, plane_1); //multiply by pu2_scale_matrix[0]
+
+ temp0 = _mm_add_epi32(temp0,rnd_fact); //Add round factor
+ temp1 = _mm_add_epi32(temp1,rnd_fact);
+
+ temp0 = _mm_srli_epi32(temp0,u4_qbits); //Right shift by qbits; values are non-negative here, so logical shift works
+ temp1 = _mm_srli_epi32(temp1,u4_qbits);
+
+ temp0 = _mm_packs_epi32 (temp0,temp1); //Final values are 16-bits only.
+ temp0 = _mm_sign_epi16(temp0, sign_reg0); //Sign restoration
+
+ _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0);
+
+ /* Zero-count per plane: the low 8 movemask bits cover the U-plane */
+ /* lanes, the high 8 bits the V-plane lanes. */
+ cmp = _mm_cmpeq_epi16(temp0, zero_8x16b);
+ mask = _mm_movemask_epi8(cmp);
+ mask0 = mask & 0xff;
+ mask1 = mask>>8;
+ if(mask0)
+ {
+ if(mask0 == 0xff)
+ zero_coeff_0 += 4;
+ else
+ {
+ cmp0 = _mm_and_si128(temp_1, cmp);
+ sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
+ sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
+ val = _mm_cvtsi128_si32(sum1);
+ val = val & 0xffff; //keep only the U-plane partial sum
+ zero_coeff_0 += val;
+ }
+ }
+ if(mask1)
+ {
+ if(mask1 == 0xff)
+ zero_coeff_1 += 4;
+ else
+ {
+ cmp1 = _mm_srli_si128(cmp, 8);
+ cmp1 = _mm_and_si128(temp_1, cmp1);
+ sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
+ sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
+ zero_coeff_1 += _mm_cvtsi128_si32(sum1);
+ }
+ }
+
+ /* Per-plane non-zero counts (NNZ for DC goes to index 0 and 1 here; */
+ /* the caller maps them to its own nnz layout) */
+ pu1_nnz[0] = 4 - zero_coeff_0;
+ pu1_nnz[1] = 4 - zero_coeff_1;
+
+}
diff --git a/common/x86/ih264_weighted_pred_sse42.c b/common/x86/ih264_weighted_pred_sse42.c
new file mode 100755
index 0000000..b1684b7
--- /dev/null
+++ b/common/x86/ih264_weighted_pred_sse42.c
@@ -0,0 +1,1349 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/*****************************************************************************/
+/* */
+/* File Name : ih264_weighted_pred_intr_sse42.c */
+/* */
+/* Description : Contains function definitions for weighted */
+/* prediction functions in x86 sse4 intrinsics */
+/* */
+/* List of Functions : ih264_default_weighted_pred_luma_sse42() */
+/* ih264_default_weighted_pred_chroma_sse42() */
+/* ih264_weighted_pred_luma_sse42() */
+/* ih264_weighted_pred_chroma_sse42() */
+/*                         ih264_weighted_bi_pred_luma_sse42()              */
+/*                         ih264_weighted_bi_pred_chroma_sse42()            */
+/* */
+/* Issues / Problems : None */
+/* */
+/* Revision History : */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 30 01 2015 Kaushik Initial version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+#include <immintrin.h>
+#include "ih264_typedefs.h"
+#include "ih264_macros.h"
+#include "ih264_platform_macros.h"
+#include "ih264_weighted_pred.h"
+
+/*****************************************************************************/
+/* Function definitions . */
+/*****************************************************************************/
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_default_weighted_pred_luma_sse42 */
+/* */
+/* Description : This function performs the default weighted prediction */
+/* as described in sec 8.4.2.3.1 titled "Default weighted */
+/* sample prediction process" for luma. The function gets */
+/* two ht x wd blocks, calculates their rounded-average and */
+/* stores it in the destination block. (ht,wd) can be */
+/* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */
+/* */
+/* Inputs : pu1_src1 - Pointer to source 1 */
+/* pu1_src2 - Pointer to source 2 */
+/* pu1_dst - Pointer to destination */
+/* src_strd1 - stride for source 1 */
+/*                  src_strd2 - stride for source 2                        */
+/*                  dst_strd  - stride for destination                     */
+/* ht - height of the block */
+/* wd - width of the block */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 04 02 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_default_weighted_pred_luma_sse42(UWORD8 *pu1_src1,
+                                            UWORD8 *pu1_src2,
+                                            UWORD8 *pu1_dst,
+                                            WORD32 src_strd1,
+                                            WORD32 src_strd2,
+                                            WORD32 dst_strd,
+                                            WORD32 ht,
+                                            WORD32 wd)
+{
+    /* dst = (src1 + src2 + 1) >> 1 per pixel; _mm_avg_epu8 performs exactly
+       this rounded unsigned average, so no widening to 16 bits is needed. */
+    __m128i y0_0_16x8b, y0_1_16x8b, y0_2_16x8b, y0_3_16x8b;
+    __m128i y1_0_16x8b, y1_1_16x8b, y1_2_16x8b, y1_3_16x8b;
+
+    if(wd == 4)
+    {
+        __m128i mask_full_16x8b, mask_ll4B_16x8b;
+
+        mask_full_16x8b = _mm_set1_epi8(0xff);
+        mask_ll4B_16x8b = _mm_srli_si128(mask_full_16x8b, 12);
+        // mask for first four bytes: only 4 output bytes per row are valid
+
+        /* four rows per iteration; header documents ht as a multiple of 4 */
+        do
+        {
+            y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
+            y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
+            y0_2_16x8b = _mm_loadl_epi64(
+                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
+            y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));
+
+            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
+            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
+            y1_2_16x8b = _mm_loadl_epi64(
+                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
+            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));
+
+            // rounded average of the two predictions
+            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
+            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
+            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
+            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
+
+            // masked store writes only the 4 valid bytes of each row
+            _mm_maskmoveu_si128(y0_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst);
+            _mm_maskmoveu_si128(y0_1_16x8b, mask_ll4B_16x8b,
+                                (char*)(pu1_dst + dst_strd));
+            _mm_maskmoveu_si128(y0_2_16x8b, mask_ll4B_16x8b,
+                                (char*)(pu1_dst + (dst_strd << 1)));
+            _mm_maskmoveu_si128(y0_3_16x8b, mask_ll4B_16x8b,
+                                (char*)(pu1_dst + dst_strd * 3));
+
+            ht -= 4;
+            pu1_src1 += src_strd1 << 2;
+            pu1_src2 += src_strd2 << 2;
+            pu1_dst += dst_strd << 2;
+        }
+        while(ht > 0);
+    }
+    else if(wd == 8)
+    {
+        /* 8 bytes per row: 64-bit loads/stores, four rows per iteration */
+        do
+        {
+            y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
+            y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
+            y0_2_16x8b = _mm_loadl_epi64(
+                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
+            y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));
+
+            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
+            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
+            y1_2_16x8b = _mm_loadl_epi64(
+                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
+            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));
+
+            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
+            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
+            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
+            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
+
+            _mm_storel_epi64((__m128i *)pu1_dst, y0_0_16x8b);
+            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b);
+            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b);
+
+            ht -= 4;
+            pu1_src1 += src_strd1 << 2;
+            pu1_src2 += src_strd2 << 2;
+            pu1_dst += dst_strd << 2;
+        }
+        while(ht > 0);
+    }
+    else // wd == 16
+    {
+        /* full 16-byte rows: unaligned 128-bit ops, eight rows per iteration
+           (16-wide luma blocks are at least 8 tall per the header) */
+        __m128i y0_4_16x8b, y0_5_16x8b, y0_6_16x8b, y0_7_16x8b;
+        __m128i y1_4_16x8b, y1_5_16x8b, y1_6_16x8b, y1_7_16x8b;
+
+        do
+        {
+            y0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
+            y0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
+            y0_2_16x8b = _mm_loadu_si128(
+                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
+            y0_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 3));
+            y0_4_16x8b = _mm_loadu_si128(
+                            (__m128i *)(pu1_src1 + (src_strd1 << 2)));
+            y0_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 5));
+            y0_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 6));
+            y0_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 7));
+
+            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
+            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
+            y1_2_16x8b = _mm_loadu_si128(
+                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
+            y1_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 3));
+            y1_4_16x8b = _mm_loadu_si128(
+                            (__m128i *)(pu1_src2 + (src_strd2 << 2)));
+            y1_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 5));
+            y1_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 6));
+            y1_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 7));
+
+            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
+            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
+            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
+            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
+            y0_4_16x8b = _mm_avg_epu8(y0_4_16x8b, y1_4_16x8b);
+            y0_5_16x8b = _mm_avg_epu8(y0_5_16x8b, y1_5_16x8b);
+            y0_6_16x8b = _mm_avg_epu8(y0_6_16x8b, y1_6_16x8b);
+            y0_7_16x8b = _mm_avg_epu8(y0_7_16x8b, y1_7_16x8b);
+
+            _mm_storeu_si128((__m128i *)pu1_dst, y0_0_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 2)), y0_4_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 5), y0_5_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 6), y0_6_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 7), y0_7_16x8b);
+
+            ht -= 8;
+            pu1_src1 += src_strd1 << 3;
+            pu1_src2 += src_strd2 << 3;
+            pu1_dst += dst_strd << 3;
+        }
+        while(ht > 0);
+    }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_default_weighted_pred_chroma_sse42 */
+/* */
+/* Description : This function performs the default weighted prediction */
+/* as described in sec 8.4.2.3.1 titled "Default weighted */
+/* sample prediction process" for chroma. The function gets */
+/* two ht x wd blocks, calculates their rounded-average and */
+/* stores it in the destination block. (ht,wd) can be */
+/* (2,2), (4,2) , (2,4), (4,4), (8,4), (4,8) or (8,8). */
+/* */
+/* Inputs : pu1_src1 - Pointer to source 1 */
+/* pu1_src2 - Pointer to source 2 */
+/* pu1_dst - Pointer to destination */
+/* src_strd1 - stride for source 1 */
+/*                  src_strd2 - stride for source 2                        */
+/*                  dst_strd  - stride for destination                     */
+/* ht - height of the block */
+/* wd - width of the block */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 04 02 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_default_weighted_pred_chroma_sse42(UWORD8 *pu1_src1,
+                                              UWORD8 *pu1_src2,
+                                              UWORD8 *pu1_dst,
+                                              WORD32 src_strd1,
+                                              WORD32 src_strd2,
+                                              WORD32 dst_strd,
+                                              WORD32 ht,
+                                              WORD32 wd)
+{
+    /* Chroma variant of the default weighted prediction. wd counts UV pairs,
+       so each row holds 2*wd interleaved bytes; _mm_avg_epu8 gives the
+       rounded average (src1 + src2 + 1) >> 1 directly on bytes. */
+    __m128i uv0_0_16x8b, uv0_1_16x8b;
+    __m128i uv1_0_16x8b, uv1_1_16x8b;
+
+    if(wd == 2)
+    {
+        __m128i mask_full_16x8b, mask_ll4B_16x8b;
+
+        mask_full_16x8b = _mm_set1_epi8(0xff);
+        mask_ll4B_16x8b = _mm_srli_si128(mask_full_16x8b, 12);
+        // mask for first four bytes: 2 UV pairs = 4 valid bytes per row
+
+        /* two rows per iteration; header documents ht as a multiple of 2 */
+        do
+        {
+            uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
+            uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
+
+            uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
+            uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
+
+            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
+            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
+
+            // masked store writes only the 4 valid bytes of each row
+            _mm_maskmoveu_si128(uv0_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst);
+            _mm_maskmoveu_si128(uv0_1_16x8b, mask_ll4B_16x8b,
+                                (char*)(pu1_dst + dst_strd));
+
+            ht -= 2;
+            pu1_src1 += src_strd1 << 1;
+            pu1_src2 += src_strd2 << 1;
+            pu1_dst += dst_strd << 1;
+        }
+        while(ht > 0);
+    }
+    else if(wd == 4)
+    {
+        /* 4 UV pairs = 8 bytes per row: 64-bit loads/stores suffice */
+        do
+        {
+            uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
+            uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
+
+            uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
+            uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
+
+            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
+            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
+
+            _mm_storel_epi64((__m128i *)pu1_dst, uv0_0_16x8b);
+            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);
+
+            ht -= 2;
+            pu1_src1 += src_strd1 << 1;
+            pu1_src2 += src_strd2 << 1;
+            pu1_dst += dst_strd << 1;
+        }
+        while(ht > 0);
+    }
+    else // wd == 8
+    {
+        /* 8 UV pairs = 16 bytes per row: full 128-bit unaligned ops,
+           four rows per iteration (8-wide chroma blocks are 4 or 8 tall) */
+        __m128i uv0_2_16x8b, uv0_3_16x8b;
+        __m128i uv1_2_16x8b, uv1_3_16x8b;
+
+        do
+        {
+            uv0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
+            uv0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
+            uv0_2_16x8b = _mm_loadu_si128(
+                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
+            uv0_3_16x8b = _mm_loadu_si128(
+                            (__m128i *)(pu1_src1 + src_strd1 * 3));
+
+            uv1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
+            uv1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
+            uv1_2_16x8b = _mm_loadu_si128(
+                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
+            uv1_3_16x8b = _mm_loadu_si128(
+                            (__m128i *)(pu1_src2 + src_strd2 * 3));
+
+            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
+            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
+            uv0_2_16x8b = _mm_avg_epu8(uv0_2_16x8b, uv1_2_16x8b);
+            uv0_3_16x8b = _mm_avg_epu8(uv0_3_16x8b, uv1_3_16x8b);
+
+            _mm_storeu_si128((__m128i *)pu1_dst, uv0_0_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);
+            _mm_storeu_si128(
+                            (__m128i *)(pu1_dst + (dst_strd << 1)), uv0_2_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), uv0_3_16x8b);
+
+            ht -= 4;
+            pu1_src1 += src_strd1 << 2;
+            pu1_src2 += src_strd2 << 2;
+            pu1_dst += dst_strd << 2;
+        }
+        while(ht > 0);
+    }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_weighted_pred_luma_sse42 */
+/* */
+/* Description : This function performs the weighted prediction as */
+/* described in sec 8.4.2.3.2 titled "Weighted sample */
+/* prediction process" for luma. The function gets one */
+/* ht x wd block, weights it, rounds it off, offsets it, */
+/* saturates it to unsigned 8-bit and stores it in the */
+/* destination block. (ht,wd) can be (4,4), (8,4), (4,8), */
+/* (8,8), (16,8), (8,16) or (16,16). */
+/* */
+/* Inputs : pu1_src - Pointer to source */
+/* pu1_dst - Pointer to destination */
+/* src_strd - stride for source */
+/* dst_strd - stride for destination */
+/* log_wd - number of bits to be rounded off */
+/* wt - weight value */
+/* ofst - offset value */
+/* ht - height of the block */
+/* wd - width of the block */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 04 02 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_weighted_pred_luma_sse42(UWORD8 *pu1_src,
+                                    UWORD8 *pu1_dst,
+                                    WORD32 src_strd,
+                                    WORD32 dst_strd,
+                                    WORD32 log_wd,
+                                    WORD32 wt,
+                                    WORD32 ofst,
+                                    WORD32 ht,
+                                    WORD32 wd)
+{
+    /* Per pixel: dst = clip8(((src * wt + round) >> log_wd) + ofst),
+       computed on sign-extended 16-bit lanes and repacked with
+       unsigned saturation (_mm_packus_epi16). */
+    __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;
+
+    __m128i wt_8x16b, round_8x16b, ofst_8x16b;
+
+    WORD32 round_val;
+
+    wt = (WORD16)(wt & 0xffff);       // sign-extend the 16-bit weight
+    round_val = 1 << (log_wd - 1);
+    // NOTE(review): shift by -1 if log_wd == 0 is UB — confirm callers
+    // guarantee log_wd >= 1 on this path
+    ofst = (WORD8)(ofst & 0xff);      // sign-extend the 8-bit offset
+
+    wt_8x16b = _mm_set1_epi16(wt);
+    round_8x16b = _mm_set1_epi16(round_val);
+    ofst_8x16b = _mm_set1_epi16(ofst);
+
+    if(wd == 4)
+    {
+        __m128i y_0_8x16b, y_2_8x16b;
+
+        __m128i mask_full_16x8b, mask_ll4B_16x8b;
+
+        mask_full_16x8b = _mm_set1_epi8(0xff);
+        mask_ll4B_16x8b = _mm_srli_si128(mask_full_16x8b, 12);
+        // mask for first four bytes
+
+        do
+        {
+            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1)));
+            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3));
+
+            // pack two 4-byte rows per register so one widen covers both
+            y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b);
+            y_2_16x8b = _mm_unpacklo_epi32(y_2_16x8b, y_3_16x8b);
+
+            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
+            y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
+
+            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
+            y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b)
;
+
+            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
+            y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b);
+
+            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
+            y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd);
+
+            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
+            y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b);
+
+            // repack all four rows, then peel each row off by byte shifts
+            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_2_8x16b);
+            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);
+            y_2_16x8b = _mm_srli_si128(y_0_16x8b, 8);
+            y_3_16x8b = _mm_srli_si128(y_0_16x8b, 12);
+
+            _mm_maskmoveu_si128(y_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst);
+            _mm_maskmoveu_si128(y_1_16x8b, mask_ll4B_16x8b,
+                                (char*)(pu1_dst + dst_strd));
+            _mm_maskmoveu_si128(y_2_16x8b, mask_ll4B_16x8b,
+                                (char*)(pu1_dst + (dst_strd << 1)));
+            _mm_maskmoveu_si128(y_3_16x8b, mask_ll4B_16x8b,
+                                (char*)(pu1_dst + dst_strd * 3));
+
+            ht -= 4;
+            pu1_src += src_strd << 2;
+            pu1_dst += dst_strd << 2;
+        }
+        while(ht > 0);
+    }
+    else if(wd == 8)
+    {
+        __m128i y_0_8x16b, y_1_8x16b, y_2_8x16b, y_3_8x16b;
+
+        do
+        {
+            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1)));
+            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3));
+
+            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
+            y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
+            y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
+            y_3_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
+
+            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
+            y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b);
+            y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b);
+            y_3_8x16b = _mm_mullo_epi16(y_3_8x16b, wt_8x16b);
+
+            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
+            y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b);
+            y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b);
+            y_3_8x16b = _mm_adds_epi16(round_8x16b, y_3_8x16b);
+
+            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
+            y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd);
+            y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd);
+            y_3_8x16b = _mm_srai_epi16(y_3_8x16b, log_wd);
+
+            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
+            y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b);
+            y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b);
+            y_3_8x16b = _mm_adds_epi16(ofst_8x16b, y_3_8x16b);
+
+            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b);
+            y_2_16x8b = _mm_packus_epi16(y_2_8x16b, y_3_8x16b);
+            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8);
+            y_3_16x8b = _mm_srli_si128(y_2_16x8b, 8);
+
+            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
+            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
+            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
+            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);
+
+            ht -= 4;
+            pu1_src += src_strd << 2;
+            pu1_dst += dst_strd << 2;
+        }
+        while(ht > 0);
+    }
+    else // wd == 16
+    {
+        /* 16-wide rows need two 8-lane halves (L = low 8 bytes widened via
+           cvtepu8, H = high 8 bytes widened via unpackhi with zero) */
+        __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b;
+        __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b;
+
+        __m128i zero_16x8b;
+        zero_16x8b = _mm_set1_epi8(0);
+
+        do
+        {
+            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
+            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1)));
+            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3));
+
+            y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
+            y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b);
+            y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
+            y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b);
+            y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
+            y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b);
+            y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
+            y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b);
+
+            y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b);
+            y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b);
+            y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b);
+            y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b);
+            y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b);
+            y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b);
+            y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b);
+            y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b);
+
+            y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b);
+            y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b);
+            y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b);
+            y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b);
+            y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b);
+            y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b);
+            y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b);
+            y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b);
+
+            y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd);
+            y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd);
+            y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd);
+            y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd);
+            y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd);
+            y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd);
+            y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd);
+            y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd);
+
+            y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b);
+            y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b);
+            y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b);
+            y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b);
+            y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b);
+            y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b);
+            y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b);
+            y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b);
+
+            y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b);
+            y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b);
+            y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b);
+            y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b);
+
+            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);
+
+            ht -= 4;
+            pu1_src += src_strd << 2;
+            pu1_dst += dst_strd << 2;
+        }
+        while(ht > 0);
+    }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_weighted_pred_chroma_sse42 */
+/* */
+/* Description : This function performs the weighted prediction as */
+/* described in sec 8.4.2.3.2 titled "Weighted sample */
+/* prediction process" for chroma. The function gets one */
+/* ht x wd block, weights it, rounds it off, offsets it, */
+/* saturates it to unsigned 8-bit and stores it in the */
+/* destination block. (ht,wd) can be (2,2), (4,2), (2,4), */
+/* (4,4), (8,4), (4,8) or (8,8). */
+/* */
+/* Inputs : pu1_src - Pointer to source */
+/* pu1_dst - Pointer to destination */
+/* src_strd - stride for source */
+/* dst_strd - stride for destination */
+/* log_wd - number of bits to be rounded off */
+/* wt - weight values for u and v */
+/* ofst - offset values for u and v */
+/* ht - height of the block */
+/* wd - width of the block */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 04 02 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_weighted_pred_chroma_sse42(UWORD8 *pu1_src,
+                                      UWORD8 *pu1_dst,
+                                      WORD32 src_strd,
+                                      WORD32 dst_strd,
+                                      WORD32 log_wd,
+                                      WORD32 wt,
+                                      WORD32 ofst,
+                                      WORD32 ht,
+                                      WORD32 wd)
+{
+    /* Chroma weighted prediction on interleaved UV rows. The 32-bit 'wt'
+       is broadcast with set1_epi32, so alternate 16-bit lanes carry the
+       U and V weights — presumably U in the low half and V in the high
+       half, mirroring the ofst packing below (TODO confirm with caller). */
+    __m128i y_0_16x8b, y_1_16x8b;
+
+    __m128i wt_8x16b, round_8x16b, ofst_8x16b;
+
+    WORD32 ofst_u, ofst_v;
+    WORD32 round_val;
+
+    ofst_u = (WORD8)(ofst & 0xff);    // sign-extended U offset (low byte)
+    ofst_v = (WORD8)(ofst >> 8);      // sign-extended V offset (next byte)
+    round_val = 1 << (log_wd - 1);
+    // NOTE(review): shift by -1 if log_wd == 0 is UB — confirm callers
+    // guarantee log_wd >= 1 on this path
+    ofst = (ofst_u & 0xffff) | (ofst_v << 16);  // repack as {ofst_u, ofst_v} 16-bit pair
+
+    wt_8x16b = _mm_set1_epi32(wt);    // alternating U/V weights per lane
+    round_8x16b = _mm_set1_epi16(round_val);
+    ofst_8x16b = _mm_set1_epi32(ofst);  // alternating U/V offsets per lane
+
+    if(wd == 2)
+    {
+        __m128i y_0_8x16b;
+
+        __m128i mask_full_16x8b, mask_ll4B_16x8b;
+
+        mask_full_16x8b = _mm_set1_epi8(0xff);
+        mask_ll4B_16x8b = _mm_srli_si128(mask_full_16x8b, 12);
+        // mask for first four bytes: 2 UV pairs = 4 valid bytes per row
+
+        do
+        {
+            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+
+            // combine both 4-byte rows so one widen/mul pass covers them
+            y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b);
+
+            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
+
+            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
+
+            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
+
+            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
+
+            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
+
+            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_0_8x16b);
+            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);   // second row's 4 bytes
+
+            _mm_maskmoveu_si128(y_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst);
+            _mm_maskmoveu_si128(y_1_16x8b, mask_ll4B_16x8b,
+                                (char*)(pu1_dst + dst_strd));
+
+            ht -= 2;
+            pu1_src += src_strd << 1;
+            pu1_dst += dst_strd << 1;
+        }
+        while(ht > 0);
+    }
+    else if(wd == 4)
+    {
+        /* 4 UV pairs = 8 bytes per row, two rows per iteration */
+        __m128i y_0_8x16b, y_1_8x16b;
+
+        do
+        {
+            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
+            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+
+            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
+            y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
+
+            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
+            y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b);
+
+            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
+            y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b);
+
+            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
+            y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd);
+
+            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
+            y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b);
+
+            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b);
+            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8);
+
+            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
+            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
+
+            ht -= 2;
+            pu1_src += src_strd << 1;
+            pu1_dst += dst_strd << 1;
+        }
+        while(ht > 0);
+    }
+    else // wd == 8 (8 UV pairs = 16 interleaved bytes per row)
+    {
+        /* full 16-byte rows, split into low/high 8-lane halves as in the
+           16-wide luma path; four rows per iteration */
+        __m128i y_2_16x8b, y_3_16x8b;
+        __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b;
+        __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b;
+
+        __m128i zero_16x8b;
+        zero_16x8b = _mm_set1_epi8(0);
+
+        do
+        {
+            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
+            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1)));
+            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3));
+
+            y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
+            y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b);
+            y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
+            y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b);
+            y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
+            y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b);
+            y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
+            y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b);
+
+            y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b);
+            y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b);
+            y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b);
+            y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b);
+            y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b);
+            y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b);
+            y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b);
+            y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b);
+
+            y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b);
+            y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b);
+            y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b);
+            y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b);
+            y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b);
+            y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b);
+            y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b);
+            y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b);
+
+            y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd);
+            y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd);
+            y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd);
+            y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd);
+            y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd);
+            y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd);
+            y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd);
+            y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd);
+
+            y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b);
+            y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b);
+            y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b);
+            y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b);
+            y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b);
+            y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b);
+            y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b);
+            y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b);
+
+            y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b);
+            y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b);
+            y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b);
+            y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b);
+
+            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
+            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);
+
+            ht -= 4;
+            pu1_src += src_strd << 2;
+            pu1_dst += dst_strd << 2;
+        }
+        while(ht > 0);
+    }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_weighted_bi_pred_luma_sse42 */
+/* */
+/* Description : This function performs the weighted biprediction as */
+/* described in sec 8.4.2.3.2 titled "Weighted sample */
+/* prediction process" for luma. The function gets two */
+/* ht x wd blocks, weights them, adds them, rounds off the */
+/* sum, offsets it, saturates it to unsigned 8-bit and */
+/* stores it in the destination block. (ht,wd) can be */
+/* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */
+/* */
+/* Inputs : pu1_src1 - Pointer to source 1 */
+/* pu1_src2 - Pointer to source 2 */
+/* pu1_dst - Pointer to destination */
+/* src_strd1 - stride for source 1 */
+/* src_strd2 - stride for source 2 */
+/*                  dst_strd - stride for destination                      */
+/* log_wd - number of bits to be rounded off */
+/* wt1 - weight value for source 1 */
+/* wt2 - weight value for source 2 */
+/* ofst1 - offset value for source 1 */
+/* ofst2 - offset value for source 2 */
+/* ht - height of the block */
+/* wd - width of the block */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 04 02 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_weighted_bi_pred_luma_sse42(UWORD8 *pu1_src1,
+ UWORD8 *pu1_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 log_wd,
+ WORD32 wt1,
+ WORD32 wt2,
+ WORD32 ofst1,
+ WORD32 ofst2,
+ WORD32 ht,
+ WORD32 wd)
+{
+ __m128i y1_0_16x8b, y1_1_16x8b;
+ __m128i y2_0_16x8b, y2_1_16x8b;
+
+ __m128i wt1_8x16b, wt2_8x16b;
+ __m128i ofst_8x16b, round_8x16b;
+
+ WORD32 ofst;
+ WORD32 round_val, shft;
+
+ wt1 = (WORD16)(wt1 & 0xffff);
+ wt2 = (WORD16)(wt2 & 0xffff);
+ round_val = 1 << log_wd;
+ shft = log_wd + 1;
+ ofst1 = (WORD8)(ofst1 & 0xff);
+ ofst2 = (WORD8)(ofst2 & 0xff);
+ ofst = (ofst1 + ofst2 + 1) >> 1;
+
+ wt1_8x16b = _mm_set1_epi16(wt1);
+ wt2_8x16b = _mm_set1_epi16(wt2);
+ round_8x16b = _mm_set1_epi16(round_val);
+ ofst_8x16b = _mm_set1_epi16(ofst);
+
+ if(wd == 4)
+ {
+ __m128i y1_2_16x8b, y1_3_16x8b;
+ __m128i y2_2_16x8b, y2_3_16x8b;
+
+ __m128i y1_0_8x16b, y1_2_8x16b;
+ __m128i y2_0_8x16b, y2_2_8x16b;
+
+ __m128i mask_ll4B_16x8b;
+
+ mask_ll4B_16x8b = _mm_set1_epi8(0xff);
+ mask_ll4B_16x8b = _mm_srli_si128(mask_ll4B_16x8b, 12);
+ // mask for first four bytes
+
+ do
+ {
+ y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
+ y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
+ y1_2_16x8b = _mm_loadl_epi64(
+ (__m128i *)(pu1_src1 + (src_strd1 << 1)));
+ y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));
+
+ y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
+ y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
+ y2_2_16x8b = _mm_loadl_epi64(
+ (__m128i *)(pu1_src2 + (src_strd2 << 1)));
+ y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));
+
+ y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
+ y1_2_16x8b = _mm_unpacklo_epi32(y1_2_16x8b, y1_3_16x8b);
+ y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);
+ y2_2_16x8b = _mm_unpacklo_epi32(y2_2_16x8b, y2_3_16x8b);
+
+ y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
+ y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
+ y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
+ y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);
+
+ y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
+ y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
+ y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
+ y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);
+
+ y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
+ y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);
+
+ y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
+ y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);
+
+ y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
+ y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);
+
+ y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
+ y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);
+
+ y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_2_8x16b);
+ y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
+ y1_2_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
+ y1_3_16x8b = _mm_srli_si128(y1_0_16x8b, 12);
+
+ _mm_maskmoveu_si128(y1_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst);
+ _mm_maskmoveu_si128(y1_1_16x8b, mask_ll4B_16x8b,
+ (char*)(pu1_dst + dst_strd));
+ _mm_maskmoveu_si128(y1_2_16x8b, mask_ll4B_16x8b,
+ (char*)(pu1_dst + (dst_strd << 1)));
+ _mm_maskmoveu_si128(y1_3_16x8b, mask_ll4B_16x8b,
+ (char*)(pu1_dst + dst_strd * 3));
+
+ ht -= 4;
+ pu1_src1 += src_strd1 << 2;
+ pu1_src2 += src_strd2 << 2;
+ pu1_dst += dst_strd << 2;
+ }
+ while(ht > 0);
+ }
+ else if(wd == 8)
+ {
+ __m128i y1_2_16x8b, y1_3_16x8b;
+ __m128i y2_2_16x8b, y2_3_16x8b;
+
+ __m128i y1_0_8x16b, y1_1_8x16b, y1_2_8x16b, y1_3_8x16b;
+ __m128i y2_0_8x16b, y2_1_8x16b, y2_2_8x16b, y2_3_8x16b;
+
+ do
+ {
+ y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
+ y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
+ y1_2_16x8b = _mm_loadl_epi64(
+ (__m128i *)(pu1_src1 + (src_strd1 << 1)));
+ y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));
+
+ y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
+ y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
+ y2_2_16x8b = _mm_loadl_epi64(
+ (__m128i *)(pu1_src2 + (src_strd2 << 1)));
+ y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));
+
+ y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
+ y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
+ y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
+ y1_3_8x16b = _mm_cvtepu8_epi16(y1_3_16x8b);
+
+ y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
+ y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
+ y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);
+ y2_3_8x16b = _mm_cvtepu8_epi16(y2_3_16x8b);
+
+ y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
+ y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
+ y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
+ y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);
+
+ y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
+ y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);
+ y1_3_8x16b = _mm_mullo_epi16(y1_3_8x16b, wt1_8x16b);
+ y2_3_8x16b = _mm_mullo_epi16(y2_3_8x16b, wt2_8x16b);
+
+ y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
+ y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);
+ y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);
+ y1_3_8x16b = _mm_adds_epi16(y1_3_8x16b, y2_3_8x16b);
+
+ y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
+ y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);
+ y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);
+ y1_3_8x16b = _mm_adds_epi16(round_8x16b, y1_3_8x16b);
+
+ y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
+ y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);
+ y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);
+ y1_3_8x16b = _mm_srai_epi16(y1_3_8x16b, shft);
+
+ y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
+ y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);
+ y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);
+ y1_3_8x16b = _mm_adds_epi16(ofst_8x16b, y1_3_8x16b);
+
+ y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
+ y1_2_16x8b = _mm_packus_epi16(y1_2_8x16b, y1_3_8x16b);
+ y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
+ y1_3_16x8b = _mm_srli_si128(y1_2_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y1_2_16x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y1_3_16x8b);
+
+ ht -= 4;
+ pu1_src1 += src_strd1 << 2;
+ pu1_src2 += src_strd2 << 2;
+ pu1_dst += dst_strd << 2;
+ }
+ while(ht > 0);
+ }
+ else // wd == 16
+ {
+ __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
+ __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;
+
+ __m128i zero_16x8b;
+ zero_16x8b = _mm_set1_epi8(0);
+
+ do
+ {
+ y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
+ y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
+ y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
+ y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
+
+ y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
+ y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
+ y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
+ y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);
+
+ y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
+ y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
+ y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
+ y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);
+
+ y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
+ y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
+ y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
+ y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);
+
+ y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
+ y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
+ y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
+ y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);
+
+ y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
+ y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
+ y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
+ y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);
+
+ y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
+ y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
+ y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
+ y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);
+
+ y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
+ y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
+ y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
+ y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);
+
+ y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
+ y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
+ y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
+ y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);
+
+ y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
+ y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
+ _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
+
+ ht -= 2;
+ pu1_src1 += src_strd1 << 1;
+ pu1_src2 += src_strd2 << 1;
+ pu1_dst += dst_strd << 1;
+ }
+ while(ht > 0);
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : ih264_weighted_bi_pred_chroma_sse42 */
+/* */
+/* Description : This function performs the weighted biprediction as */
+/* described in sec 8.4.2.3.2 titled "Weighted sample */
+/* prediction process" for chroma. The function gets two */
+/* ht x wd blocks, weights them, adds them, rounds off the */
+/* sum, offsets it, saturates it to unsigned 8-bit and */
+/* stores it in the destination block. (ht,wd) can be */
+/* (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8). */
+/* */
+/* Inputs : pu1_src1 - Pointer to source 1 */
+/* pu1_src2 - Pointer to source 2 */
+/* pu1_dst - Pointer to destination */
+/* src_strd1 - stride for source 1 */
+/* src_strd2 - stride for source 2 */
/*                    dst_strd - stride for destination                      */
+/* log_wd - number of bits to be rounded off */
+/* wt1 - weight values for u and v in source 1 */
+/* wt2 - weight values for u and v in source 2 */
+/* ofst1 - offset value for u and v in source 1 */
+/* ofst2 - offset value for u and v in source 2 */
+/* ht - height of the block */
+/* wd - width of the block */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 04 02 2015 Kaushik Initial Version */
+/* Senthoor */
+/* */
+/*****************************************************************************/
+void ih264_weighted_bi_pred_chroma_sse42(UWORD8 *pu1_src1,
+ UWORD8 *pu1_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 log_wd,
+ WORD32 wt1,
+ WORD32 wt2,
+ WORD32 ofst1,
+ WORD32 ofst2,
+ WORD32 ht,
+ WORD32 wd)
+{
+ __m128i y1_0_16x8b, y1_1_16x8b;
+ __m128i y2_0_16x8b, y2_1_16x8b;
+
+ __m128i wt1_8x16b, wt2_8x16b;
+ __m128i ofst_8x16b, round_8x16b;
+
+ WORD32 ofst1_u, ofst2_u, ofst_u;
+ WORD32 ofst1_v, ofst2_v, ofst_v;
+ WORD32 round_val, shft, ofst_val;
+
+ round_val = 1 << log_wd;
+ shft = log_wd + 1;
+
+ ofst1_u = (WORD8)(ofst1 & 0xff);
+ ofst1_v = (WORD8)(ofst1 >> 8);
+ ofst2_u = (WORD8)(ofst2 & 0xff);
+ ofst2_v = (WORD8)(ofst2 >> 8);
+
+ wt1_8x16b = _mm_set1_epi32(wt1);
+ wt2_8x16b = _mm_set1_epi32(wt2);
+
+ ofst_u = (ofst1_u + ofst2_u + 1) >> 1;
+ ofst_v = (ofst1_v + ofst2_v + 1) >> 1;
+ ofst_val = (ofst_u & 0xffff) | (ofst_v << 16);
+
+ round_8x16b = _mm_set1_epi16(round_val);
+ ofst_8x16b = _mm_set1_epi32(ofst_val);
+
+ if(wd == 2)
+ {
+ __m128i y1_0_8x16b, y2_0_8x16b;
+
+ __m128i mask_full_16x8b, mask_ll4B_16x8b;
+
+ mask_full_16x8b = _mm_set1_epi8(0xff);
+ mask_ll4B_16x8b = _mm_srli_si128(mask_full_16x8b, 12);
+
+ do
+ {
+ y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
+ y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
+
+ y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
+ y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
+
+ y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
+ y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);
+
+ y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
+ y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
+
+ y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
+ y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
+
+ y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
+ y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
+
+ y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
+ y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
+
+ y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b);
+ y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
+
+ _mm_maskmoveu_si128(y1_0_16x8b, mask_ll4B_16x8b, (char*)pu1_dst);
+ _mm_maskmoveu_si128(y1_1_16x8b, mask_ll4B_16x8b,
+ (char*)(pu1_dst + dst_strd));
+
+ ht -= 2;
+ pu1_src1 += src_strd1 << 1;
+ pu1_src2 += src_strd2 << 1;
+ pu1_dst += dst_strd << 1;
+ }
+ while(ht > 0);
+ }
+ else if(wd == 4)
+ {
+ __m128i y1_0_8x16b, y1_1_8x16b;
+ __m128i y2_0_8x16b, y2_1_8x16b;
+
+ do
+ {
+ y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
+ y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
+
+ y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
+ y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
+
+ y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
+ y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
+
+ y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
+ y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
+
+ y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
+ y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
+ y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
+ y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);
+
+ y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
+ y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);
+
+ y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
+ y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);
+
+ y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
+ y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);
+
+ y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
+ y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);
+
+ y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
+ y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
+
+ ht -= 2;
+ pu1_src1 += src_strd1 << 1;
+ pu1_src2 += src_strd2 << 1;
+ pu1_dst += dst_strd << 1;
+ }
+ while(ht > 0);
+ }
+ else // wd == 8
+ {
+ __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
+ __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;
+
+ __m128i zero_16x8b;
+ zero_16x8b = _mm_set1_epi8(0);
+
+ do
+ {
+ y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
+ y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
+ y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
+ y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
+
+ y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
+ y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
+ y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
+ y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);
+
+ y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
+ y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
+ y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
+ y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);
+
+ y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
+ y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
+ y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
+ y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);
+
+ y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
+ y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
+ y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
+ y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);
+
+ y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
+ y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
+ y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
+ y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);
+
+ y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
+ y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
+ y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
+ y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);
+
+ y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
+ y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
+ y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
+ y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);
+
+ y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
+ y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
+ y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
+ y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);
+
+ y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
+ y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
+ _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
+
+ ht -= 2;
+ pu1_src1 += src_strd1 << 1;
+ pu1_src2 += src_strd2 << 1;
+ pu1_dst += dst_strd << 1;
+ }
+ while(ht > 0);
+ }
+}