Initial version

Change-Id: I7efe9a589cd24edf86e8d086b40c27cbbf8b4017
author: Hamsalekha S <hamsalekha.s@ittiam.com> 2015-03-13 21:24:58 +0530
committer: Hamsalekha S <hamsalekha.s@ittiam.com> 2015-04-02 15:59:02 +0530
commit: 8d3d303c7942ced6a987a52db8977d768dc3605f (patch)
tree: cc806c96794356996b13ba9970941d0aed74a97e /encoder/ih264e_core_coding.c
parent: 3956d913d37327dcb340f836e604b04bd478b158 (diff)
download: android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.tar.gz
android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.tar.bz2
android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.zip
1 files changed, 2365 insertions, 0 deletions
diff --git a/encoder/ih264e_core_coding.c b/encoder/ih264e_core_coding.c
new file mode 100755
index 0000000..5ba18de
--- /dev/null
+++ b/encoder/ih264e_core_coding.c
@@ -0,0 +1,2365 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+ *******************************************************************************
+ * @file
+ *  ih264e_core_coding.c
+ *
+ * @brief
+ *  This file contains routines that perform luma and chroma core coding for
+ *  intra macroblocks
+ *
+ * @author
+ *  ittiam
+ *
+ * @par List of Functions:
+ *  - ih264e_pack_l_mb_i16()
+ *  - ih264e_pack_c_mb_i8()
+ *  - ih264e_code_luma_intra_macroblock_16x16()
+ *  - ih264e_code_luma_intra_macroblock_4x4()
+ *  - ih264e_code_chroma_intra_macroblock_8x8()
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+
+/* System include files */
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+/* User include files */
+#include "ih264e_config.h"
+#include "ih264_typedefs.h"
+#include "ih264_platform_macros.h"
+#include "iv2.h"
+#include "ive2.h"
+#include "ih264_defs.h"
+#include "ih264e_defs.h"
+#include "ih264_trans_data.h"
+#include "ih264e_error.h"
+#include "ih264e_bitstream.h"
+#include "ime_distortion_metrics.h"
+#include "ime_structs.h"
+#include "ih264_structs.h"
+#include "ih264_trans_quant_itrans_iquant.h"
+#include "ih264_inter_pred_filters.h"
+#include "ih264_mem_fns.h"
+#include "ih264_padding.h"
+#include "ih264_intra_pred_filters.h"
+#include "ih264_deblk_edge_filters.h"
+#include "irc_cntrl_param.h"
+#include "irc_frame_info_collector.h"
+#include "ih264e_rate_control.h"
+#include "ih264e_structs.h"
+#include "ih264e_globals.h"
+#include "ih264e_core_coding.h"
+#include "ih264e_mc.h"
+
+
+/*****************************************************************************/
+/* Function Definitions                                                      */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief
+*  This function performs the DCT transform, then the Hadamard transform
+*  and quantization for a macroblock when the mb mode is intra 16x16 mode
+*
+* @par Description:
+*  First  cf4 is done on all 16 4x4 blocks of the 16x16 input block.
+*  Then hadamard transform is done on the DC coefficients
+*  Quantization is then performed on the 16x16 block, 4x4 wise
+*
+* @param[in] pu1_src
+*  Pointer to source sub-block
+*
+* @param[in] pu1_pred
+*  Pointer to prediction sub-block
+*
+* @param[out] pi2_out
+*  Pointer to residual sub-block
+*  The output will be in linear format
+*  The first 16 continuous locations will contain the values of Dc block
+*  After DC block and a stride 1st AC block will follow
+*  After one more stride next AC block will follow
+*  The blocks will be in raster scan order
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] pred_strd
+*  Prediction stride
+*
+* @param[in] dst_strd
+*  Destination stride
+*
+* @param[in] pu2_scale_matrix
+*  The quantization matrix for 4x4 transform
+*
+* @param[in] pu2_threshold_matrix
+*  Threshold matrix
+*
+* @param[in] u4_qbits
+*  15+QP/6
+*
+* @param[in] u4_round_factor
+*  Round factor for quant
+*
+* @param[out] pu1_nnz
+*  Memory to store the non-zeros after transform
+*  The first byte will be the nnz of DC block
+*  From the next byte the AC nnzs will be stored in raster scan order
+*
+* @param u4_dc_flag
+*  Signals if Dc transform is to be done or not
+*   1 -> Dc transform will be done
+*   0 -> Dc transform will not be done
+*
+* @remarks
+*
+*******************************************************************************
+*/
+void ih264e_luma_16x16_resi_trans_dctrans_quant(codec_t *ps_codec,
+                                                UWORD8 *pu1_src,
+                                                UWORD8 *pu1_pred,
+                                                WORD16 *pi2_out,
+                                                WORD32 src_strd,
+                                                WORD32 pred_strd,
+                                                WORD32 dst_strd,
+                                                const UWORD16 *pu2_scale_matrix,
+                                                const UWORD16 *pu2_threshold_matrix,
+                                                UWORD32 u4_qbits,
+                                                UWORD32 u4_round_factor,
+                                                UWORD8 *pu1_nnz,
+                                                UWORD32 u4_dc_flag)
+
+{
+    WORD32 blk_cntr;
+    WORD32 i4_offsetx, i4_offsety;
+    UWORD8 *pu1_curr_src, *pu1_curr_pred;
+
+    WORD16 *pi2_dc_str = pi2_out; /* row 0 of pi2_out: one dc slot per 4x4 block */
+
+    /* Move to the ac addresses: nnz slot 0 and row 0 of pi2_out are kept for dc */
+    pu1_nnz++;
+    pi2_out += dst_strd;
+
+    for (blk_cntr = 0; blk_cntr < NUM_LUMA4x4_BLOCKS_IN_MB; blk_cntr++)
+    {
+        IND2SUB_LUMA_MB(blk_cntr, i4_offsetx, i4_offsety); /* offsets of this 4x4 block, used below */
+
+        pu1_curr_src = pu1_src + i4_offsetx + i4_offsety * src_strd;
+        pu1_curr_pred = pu1_pred + i4_offsetx + i4_offsety * pred_strd;
+
+        ps_codec->pf_resi_trans_quant_4x4(pu1_curr_src, pu1_curr_pred,
+                                          pi2_out + blk_cntr * dst_strd,
+                                          src_strd, pred_strd, pu2_scale_matrix,
+                                          pu2_threshold_matrix, u4_qbits,
+                                          u4_round_factor, &pu1_nnz[blk_cntr],
+                                          &pi2_dc_str[blk_cntr]);
+
+    }
+
+    if (!u4_dc_flag) /* caller asked for no Dc transform: the per block results are final */
+        return;
+
+    /*
+     * In case of i16x16, we need to remove the contribution of dc coeffs into
+     * nnz of each block. We are doing that in the packing function
+     */
+
+    /* Adjust pointers to point to dc values */
+    pi2_out -= dst_strd;
+    pu1_nnz--;
+
+    u4_qbits++; /* the dc block is quantized with one more qbit ... */
+    u4_round_factor <<= 1; /* ... and a round factor doubled to match */
+
+    ps_codec->pf_hadamard_quant_4x4(pi2_dc_str, pi2_out, pu2_scale_matrix,
+                                    pu2_threshold_matrix, u4_qbits,
+                                    u4_round_factor, &pu1_nnz[0]);
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  This function performs the intra 16x16 inverse transform process for H264
+*  it includes inverse Dc transform, inverse quant and then inverse transform
+*
+* @par Description:
+*
+* @param[in] pi2_src
+*  Input data, 16x16 size
+*  First 16 mem locations will have the Dc coeffs in raster scan order in linear fashion
+*  after a stride 1st AC block will be present again in raster scan order
+*  Then each AC block of the 16x16 block will follow in raster scan order
+*
+* @param[in] pu1_pred
+*  The predicted data, 16x16 size
+*  Block by block form
+*
+* @param[out] pu1_out
+*  Output 16x16
+*  In block by block form
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] pred_strd
+*  input stride for prediction buffer
+*
+* @param[in] out_strd
+*  input stride for output buffer
+*
+* @param[in] pu2_iscale_mat
+*  Inverse quantization matrix for 4x4 transform
+*
+* @param[in] pu2_weigh_mat
+*  weight matrix of 4x4 transform
+*
+* @param[in] qp_div
+*  QP/6
+*
+* @param[in] pi4_tmp
+*  Input temporary buffer
+*  needs to be at least COFF_CNT_SUB_BLK_4x4+COFF_CNT_SUB_BLK_4x4 size (also noted as at least 20)
+*
+* @param[in] u4_cntrl
+*  Controls the transform path
+*  In total the upper 17 bits (31-15) are used
+*  bit 15 will correspond to DC block
+*  and bits 31-16 will correspond to the ac blocks in raster scan order
+*  bit equaling zero indicates that the entire 4x4 block is zero for DC
+*  For AC blocks a bit equaling zero will mean that all 15 AC coeffs of the block are zero
+*
+* @param[in] u4_dc_trans_flag
+*  Non zero: inverse hadamard is done on the Dc coeffs and inverse quant of each
+*  4x4 block starts from index 1; zero: no hadamard, inverse quant starts from 0
+*
+* @returns
+*  none
+*
+* @remarks
+*  The all zero case must be taken care outside
+*
+*******************************************************************************
+*/
+void ih264e_luma_16x16_idctrans_iquant_itrans_recon(codec_t *ps_codec,
+                                                    WORD16 *pi2_src,
+                                                    UWORD8 *pu1_pred,
+                                                    UWORD8 *pu1_out,
+                                                    WORD32 src_strd,
+                                                    WORD32 pred_strd,
+                                                    WORD32 out_strd,
+                                                    const UWORD16 *pu2_iscale_mat,
+                                                    const UWORD16 *pu2_weigh_mat,
+                                                    UWORD32 qp_div,
+                                                    UWORD32 u4_cntrl,
+                                                    UWORD32 u4_dc_trans_flag,
+                                                    WORD32 *pi4_tmp)
+{
+    /* Start index for inverse quant in a 4x4 block */
+    WORD32 iq_start_idx = (u4_dc_trans_flag == 0) ? 0 : 1;
+
+    /* Cntrl bits for 4x4 transforms
+     * u4_blk_cntrl       : controls if a 4x4 block should be processed in ac path
+     * u4_dc_cntrl        : controls if a 4x4 block is to be processed in dc path
+     *                    : dc block must contain only single dc coefficient
+     * u4_empty_blk_cntrl : control for 4x4 block with no coeffs, ie no dc and ac
+     *                    : ie not (ac or dc)
+     */
+    UWORD32 u4_blk_cntrl, u4_dc_cntrl, u4_empty_blk_cntrl;
+
+    /* tmp registers for block ids */
+    UWORD32 u4_blk_id;
+
+    /* Subscripts */
+    WORD32 i4_offset_x, i4_offset_y;
+
+    UWORD8 *pu1_cur_prd_blk, *pu1_cur_out_blk;
+
+    /* Src and stride for dc coeffs */
+    UWORD32 u4_dc_inc;
+    WORD16 *pi2_dc_src;
+
+    /*
+     * For intra blocks we need to do inverse dc transform
+     * In case of intra blocks, it is here that we populate the dc bits in cntrl
+     * as they cannot be populated any earlier
+     */
+    if (u4_dc_trans_flag)
+    {
+        UWORD32 cntr, u4_dc_cntrl; /* NOTE: this u4_dc_cntrl shadows the function level one */
+        /* Do inv hadamard in place on the first DC_COEFF_CNT_LUMA_MB entries of pi2_src */
+        ps_codec->pf_ihadamard_scaling_4x4(pi2_src, pi2_src, pu2_iscale_mat,
+                                           pu2_weigh_mat, qp_div, pi4_tmp);
+
+        /* Update the cntrl flag: bit (15 - n) is set if dc coeff n is non zero */
+        u4_dc_cntrl = 0;
+        for (cntr = 0; cntr < DC_COEFF_CNT_LUMA_MB; cntr++)
+        {
+            u4_dc_cntrl |= ((pi2_src[cntr] != 0) << (15 - cntr));
+        }
+        /* Keep a dc bit only if the corresponding ac bit is 0 */
+        u4_dc_cntrl = (~(u4_cntrl >> 16) & u4_dc_cntrl);
+        /* Combine both ac and dc bits */
+        u4_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_LUMA)
+                        | (u4_dc_cntrl & CNTRL_FLAG_DC_MASK_LUMA);
+    }
+
+    /* Source for dc coeffs
+     * If the block is intra, we have to read dc values from first row of src
+     * then stride for each block is 1, otherwise it is src stride
+     */
+    pi2_dc_src = (iq_start_idx == 0) ? (pi2_src + src_strd) : pi2_src;
+    u4_dc_inc = (iq_start_idx == 0) ? src_strd : 1;
+
+    /* The AC blocks start from 2nd row */
+    pi2_src += src_strd;
+
+    /* Get the block bits: dc bits are moved up so all three words use bits 31-16 */
+    u4_blk_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_LUMA);
+    u4_dc_cntrl = (u4_cntrl & CNTRL_FLAG_DC_MASK_LUMA) << 16;
+    u4_empty_blk_cntrl = (~(u4_dc_cntrl | u4_blk_cntrl)) & 0xFFFF0000;
+
+    /* Get first block to process */
+    DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
+    while (u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB)
+    {
+        /* Compute address of src blocks */
+        WORD32 i4_src_offset = u4_dc_inc * u4_blk_id;
+
+        IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
+
+        /* Compute address of out and pred blocks */
+        pu1_cur_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
+        pu1_cur_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
+
+        /* Do inv dc transform */
+        ps_codec->pf_iquant_itrans_recon_4x4_dc(pi2_dc_src + i4_src_offset,
+                                                pu1_cur_prd_blk,
+                                                pu1_cur_out_blk, pred_strd,
+                                                out_strd, pu2_iscale_mat,
+                                                pu2_weigh_mat, qp_div, NULL,
+                                                iq_start_idx,
+                                                pi2_dc_src + i4_src_offset);
+        /* Get next DC block to process */
+        DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
+    }
+
+    /* now process ac/mixed blocks */
+    DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
+    while (u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB)
+    {
+
+        WORD32 i4_src_offset = src_strd * u4_blk_id; /* ac blocks are one src_strd apart */
+
+        IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
+
+        pu1_cur_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
+        pu1_cur_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
+
+        ps_codec->pf_iquant_itrans_recon_4x4(pi2_src + i4_src_offset,
+                                             pu1_cur_prd_blk, pu1_cur_out_blk,
+                                             pred_strd, out_strd,
+                                             pu2_iscale_mat, pu2_weigh_mat,
+                                             qp_div, (WORD16*) pi4_tmp,
+                                             iq_start_idx,
+                                             pi2_dc_src + u4_blk_id);
+
+        DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
+    }
+
+    /* Now process empty blocks: prediction is copied to the output unchanged */
+    DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
+    while (u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB)
+    {
+        IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
+
+        pu1_cur_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
+        pu1_cur_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
+
+        ps_codec->pf_inter_pred_luma_copy(pu1_cur_prd_blk, pu1_cur_out_blk,
+                                          pred_strd, out_strd, SIZE_4X4_BLK_HRZ,
+                                          SIZE_4X4_BLK_VERT, 0, 0);
+
+        DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
+    }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  This function performs the DCT transform, then the Hadamard transform
+*  and quantization for a chroma macroblock
+*
+* @par Description:
+*  First  cf4 is done on all the 4x4 blocks of the 8x8 input block
+*  Then hadamard transform is done on the DC coefficients
+*  Quantization is then performed on the 8x8 block, 4x4 wise
+*
+* @param[in] pu1_src
+*  Pointer to source sub-block
+*  The input is in interleaved format for two chroma planes
+*
+* @param[in] pu1_pred
+*  Pointer to prediction sub-block
+*  Prediction is in inter leaved format
+*
+* @param[out] pi2_out
+*  Pointer to residual sub-block
+*  The output will be in linear format
+*  The first 4 continuous locations will contain the values of DC block for U
+*  and then next 4 will contain for V.
+*  After DC block and a stride 1st AC block of U plane will follow
+*  After one more stride next AC block of U plane will follow
+*  The blocks will be in raster scan order
+*
+*  After all the AC blocks of U plane AC blocks of V plane will follow in exact
+*  same way
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] pred_strd
+*  Prediction stride
+*
+* @param[in] out_strd
+*  Destination stride
+*
+* @param[in] pu2_scale_matrix
+*  The quantization matrix for 4x4 transform
+*
+* @param[in] pu2_threshold_matrix
+*  Threshold matrix
+*
+* @param[in] u4_qbits
+*  15+QP/6
+*
+* @param[in] u4_round_factor
+*  Round factor for quant
+*
+* @param[out] pu1_nnz_c
+*  Memory to store the non-zeros after transform
+*  The first byte will be the nnz of DC block for U plane
+*  From the next byte the AC nnzs will be stored in raster scan order
+*  The byte at index 5 will be nnz of Dc block of V plane
+*  Then Ac blocks will follow
+*
+* @par Dc transform:
+*  Unlike the luma 16x16 routine, this function has no u4_dc_flag argument
+*  The Dc transform (pf_hadamard_quant_2x2_uv) is always done
+*  on the dc values gathered while processing the 4x4 blocks
+*
+* @remarks
+*
+*******************************************************************************
+*/
+void ih264e_chroma_8x8_resi_trans_dctrans_quant(codec_t *ps_codec,
+                                                UWORD8 *pu1_src,
+                                                UWORD8 *pu1_pred,
+                                                WORD16 *pi2_out,
+                                                WORD32 src_strd,
+                                                WORD32 pred_strd,
+                                                WORD32 out_strd,
+                                                const UWORD16 *pu2_scale_matrix,
+                                                const UWORD16 *pu2_threshold_matrix,
+                                                UWORD32 u4_qbits,
+                                                UWORD32 u4_round_factor,
+                                                UWORD8 *pu1_nnz_c)
+{
+    WORD32 blk_cntr;
+    WORD32 i4_offsetx, i4_offsety;
+    UWORD8 *pu1_curr_src, *pu1_curr_pred;
+
+    WORD16 pi2_dc_str[8]; /* local dc slots, one per 4x4 block, fed to the 2x2 hadamard below */
+    UWORD8 au1_dcnnz[2]; /* dc nnz from the hadamard: [0] is stored for U, [1] for V */
+
+    /* Move to the ac addresses */
+    pu1_nnz_c++;
+    pi2_out += out_strd;
+
+    for (blk_cntr = 0; blk_cntr < NUM_CHROMA4x4_BLOCKS_IN_MB; blk_cntr++)
+    {
+        IND2SUB_CHROMA_MB(blk_cntr, i4_offsetx, i4_offsety); /* offsets of this 4x4 block, used below */
+
+        pu1_curr_src = pu1_src + i4_offsetx + i4_offsety * src_strd;
+        pu1_curr_pred = pu1_pred + i4_offsetx + i4_offsety * pred_strd;
+
+        /* For chroma, v plane nnz is populated from position 5: blocks above 3 skip one slot */
+        ps_codec->pf_resi_trans_quant_chroma_4x4(
+                        pu1_curr_src, pu1_curr_pred,
+                        pi2_out + blk_cntr * out_strd, src_strd, pred_strd,
+                        pu2_scale_matrix, pu2_threshold_matrix, u4_qbits,
+                        u4_round_factor, &pu1_nnz_c[blk_cntr + (blk_cntr > 3)],
+                        &pi2_dc_str[blk_cntr]);
+    }
+
+    /* Adjust pointers to point to dc values */
+    pi2_out -= out_strd;
+    pu1_nnz_c--;
+
+    u4_qbits++; /* the dc block is quantized with one more qbit ... */
+    u4_round_factor <<= 1; /* ... and a round factor doubled to match */
+
+    ps_codec->pf_hadamard_quant_2x2_uv(pi2_dc_str, pi2_out, pu2_scale_matrix,
+                                       pu2_threshold_matrix, u4_qbits,
+                                       u4_round_factor, au1_dcnnz);
+
+    /* Copy the dc nnzs: index 0 is the U dc slot, index 5 the V dc slot skipped above */
+    pu1_nnz_c[0] = au1_dcnnz[0];
+    pu1_nnz_c[5] = au1_dcnnz[1];
+
+}
+
+/**
+*******************************************************************************
+* @brief
+*  This function performs the inverse transform process for chroma MB of H264
+*
+* @par Description:
+*  Does inverse DC transform ,inverse quantization inverse transform
+*
+* @param[in] pi2_src
+*  Input data, 16x16 size
+*  The input is in the form of, first 4 locations will contain DC coeffs of
+*  U plane, next 4 will contain DC coeffs of V plane, then AC blocks of U plane
+*  in raster scan order will follow, each block as linear array in raster scan order.
+*  After a stride next AC block will follow. After all AC blocks of U plane
+*  V plane AC blocks will follow in exact same order.
+*
+* @param[in] pu1_pred
+*  The predicted data, 8x16 size, U and V interleaved
+*
+* @param[out] pu1_out
+*  Output 8x16, U and V interleaved
+*
+* @param[in] src_strd
+*  Source stride
+*
+* @param[in] pred_strd
+*  input stride for prediction buffer
+*
+* @param[in] out_strd
+*  input stride for output buffer
+*
+* @param[in] pu2_iscale_mat
+*  Inverse quantization matrix for 4x4 transform
+*
+* @param[in] pu2_weigh_mat
+*  weight matrix of 4x4 transform
+*
+* @param[in] qp_div
+*  QP/6
+*
+* @param[in] pi4_tmp
+*  Input temporary buffer
+*  needs to be at least COFF_CNT_SUB_BLK_4x4 + Number of Dc coeffs for chroma * number of planes
+*  in size
+*
+* @param[in] u4_cntrl
+*  Controls the transform path
+*  the 15 th bit will correspond to DC block of U plane , 14th will indicate the V plane Dc block
+*  bits 31-28 will indicate AC blocks of U plane in raster scan order
+*  bits 27-24 will indicate AC blocks of V plane in raster scan order
+*  The bit 1 implies that there is at least one non zero coeff in a block
+*
+* @returns
+*  none
+*
+* @remarks
+*******************************************************************************
+*/
+void ih264e_chroma_8x8_idctrans_iquant_itrans_recon(codec_t *ps_codec,
+                                                    WORD16 *pi2_src,
+                                                    UWORD8 *pu1_pred,
+                                                    UWORD8 *pu1_out,
+                                                    WORD32 src_strd,
+                                                    WORD32 pred_strd,
+                                                    WORD32 out_strd,
+                                                    const UWORD16 *pu2_iscale_mat,
+                                                    const UWORD16 *pu2_weigh_mat,
+                                                    UWORD32 qp_div,
+                                                    UWORD32 u4_cntrl,
+                                                    WORD32 *pi4_tmp)
+{
+    /* Cntrl bits for 4x4 transforms
+     * u4_blk_cntrl       : controls if a 4x4 block should be processed in ac path
+     * u4_dc_cntrl        : controls if a 4x4 block is to be processed in dc path
+     *                    : dc block must contain only single dc coefficient
+     * u4_empty_blk_cntrl : control for 4x4 block with no coeffs, ie no dc and ac
+     *                    : ie not (ac or dc)
+     */
+
+    UWORD32 u4_blk_cntrl, u4_dc_cntrl, u4_empty_blk_cntrl;
+
+    /* tmp registers for block ids */
+    WORD32 u4_blk_id; /* NOTE(review): signed here, the luma routine uses UWORD32 for the same role */
+
+    /* Offsets for pointers */
+    WORD32 i4_offset_x, i4_offset_y;
+
+    /* Pointer to 4x4 blocks */
+    UWORD8 *pu1_cur_4x4_prd_blk, *pu1_cur_4x4_out_blk;
+
+    /* Tmp register for pointer to dc coeffs */
+    WORD16 *pi2_dc_src;
+
+    WORD16 i2_zero = 0; /* stand-in dc value used when the dc blocks are not coded */
+
+    /* Increment for dc block */
+    WORD32 i4_dc_inc;
+
+    /*
+     * Lets do the inverse transform for dc coeffs in chroma
+     */
+    if (u4_cntrl & CNTRL_FLAG_DCBLK_MASK_CHROMA)
+    {
+        UWORD32 cntr, u4_dc_cntrl; /* NOTE: this u4_dc_cntrl shadows the function level one */
+        /* Do inv hadamard for u and v block */
+
+        ps_codec->pf_ihadamard_scaling_2x2_uv(pi2_src, pi2_src, pu2_iscale_mat,
+                                              pu2_weigh_mat, qp_div, NULL);
+        /*
+         * Update the cntrl flag
+         * Flag is updated as follows: bit (15 - n) is set if dc coeff n is non zero
+         */
+        u4_dc_cntrl = 0;
+        for (cntr = 0; cntr < 8; cntr++)
+        {
+            u4_dc_cntrl |= ((pi2_src[cntr] != 0) << (15 - cntr));
+        }
+
+        /* Keep a dc bit only if the corresponding ac bit is 0 */
+        u4_dc_cntrl = (~(u4_cntrl >> 16) & u4_dc_cntrl);
+        /* Combine both ac and dc bits */
+        u4_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA)
+                        | (u4_dc_cntrl & CNTRL_FLAG_DC_MASK_CHROMA);
+
+        /* Since we populated the dc coeffs, we have to read them from there */
+        pi2_dc_src = pi2_src;
+        i4_dc_inc = 1;
+    }
+    else
+    {
+        u4_cntrl = u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA; /* no dc coded: keep the ac bits only */
+        pi2_dc_src = &i2_zero; /* every block reads the same zero dc ... */
+        i4_dc_inc = 0; /* ... because the per block dc offset collapses to 0 */
+    }
+
+    /* Get the block bits: dc bits are moved up so all three words use bits 31-24 */
+    u4_blk_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA);
+    u4_dc_cntrl = (u4_cntrl & CNTRL_FLAG_DC_MASK_CHROMA) << 16;
+    u4_empty_blk_cntrl = (~(u4_dc_cntrl | u4_blk_cntrl)) & 0xFF000000;
+
+    /* The AC blocks start from 2nd row */
+    pi2_src += src_strd;
+
+    DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
+    while (u4_blk_id < 8)
+    {
+        WORD32 dc_src_offset = u4_blk_id * i4_dc_inc;
+
+        IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
+
+        pu1_cur_4x4_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
+        pu1_cur_4x4_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
+
+        ps_codec->pf_iquant_itrans_recon_chroma_4x4_dc(
+                        pi2_dc_src + dc_src_offset, pu1_cur_4x4_prd_blk,
+                        pu1_cur_4x4_out_blk, pred_strd, out_strd, NULL, NULL, 0,
+                        NULL, pi2_dc_src + dc_src_offset);
+        /* Get next DC block to process */
+        DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
+    }
+
+    /* now process ac/mixed blocks */
+    DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
+    while (u4_blk_id < 8)
+    {
+        WORD32 i4_src_offset = src_strd * u4_blk_id; /* ac blocks are one src_strd apart */
+        WORD32 dc_src_offset = i4_dc_inc * u4_blk_id;
+
+        IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
+
+        pu1_cur_4x4_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
+        pu1_cur_4x4_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
+
+        ps_codec->pf_iquant_itrans_recon_chroma_4x4(pi2_src + i4_src_offset,
+                                                    pu1_cur_4x4_prd_blk,
+                                                    pu1_cur_4x4_out_blk,
+                                                    pred_strd, out_strd,
+                                                    pu2_iscale_mat,
+                                                    pu2_weigh_mat, qp_div,
+                                                    (WORD16 *) pi4_tmp,
+                                                    pi2_dc_src + dc_src_offset);
+
+        DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
+    }
+
+    /* Now process empty blocks: prediction is copied to the output unchanged */
+    DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
+    while (u4_blk_id < 8)
+    {
+        IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
+
+        pu1_cur_4x4_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
+        pu1_cur_4x4_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
+
+        ps_codec->pf_interleave_copy(pu1_cur_4x4_prd_blk, pu1_cur_4x4_out_blk,
+                                     pred_strd, out_strd, SIZE_4X4_BLK_VERT,
+                                     SIZE_4X4_BLK_HRZ);
+
+        DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
+    }
+}
+
+/**
+******************************************************************************
+*
+* @brief  This function packs residue of an i16x16 luma mb for entropy coding
+*
+* @par   Description
+*  An i16 macro block contains two classes of units, dc 4x4 block and
+*  4x4 ac blocks. while packing the mb, the dc block is sent first, and
+*  the 16 ac blocks are sent next in scan order. Each and every block is
+*  represented by 3 parameters (nnz, significant coefficient map and the
+*  residue coefficients itself). If a 4x4 unit does not have any coefficients
+*  then only nnz is sent. Inside a 4x4 block the individual coefficients are
+*  sent in scan order.
+*
+*  The first byte of each block will be nnz of the block, if it is non zero,
+*  a 2 byte significance map is sent. This is followed by nonzero coefficients.
+*  This is repeated for 1 dc + 16 ac blocks.
+*
+* @param[in]  pi2_res_mb
+*  pointer to residue mb
+*
+* @param[in, out]  pv_mb_coeff_data
+*  buffer pointing to packed residue coefficients
+*
+* @param[in]  i4_res_strd
+*  residual block stride
+*
+* @param[in, out]  u1_cbp_l
+*  coded block pattern luma; set to 15 when any ac block has coefficients, else left unchanged
+*
+* @param[in]   pu1_nnz
+*  number of non zero coefficients in each 4x4 unit
+*
+* @param[out]  pu4_cntrl
+*  Control signal for inverse transform of 16x16 blocks
+*
+* @return none
+*
+* @remarks
+*
+******************************************************************************
+*/
+void ih264e_pack_l_mb_i16(WORD16 *pi2_res_mb,
+                          void **pv_mb_coeff_data,
+                          WORD32 i4_res_strd,
+                          UWORD8 *u1_cbp_l,
+                          UWORD8 *pu1_nnz,
+                          UWORD32 *pu4_cntrl)
+{
+    /* pointer to packed sub block buffer space; _ac remembers where the ac data starts */
+    tu_sblk_coeff_data_t *ps_mb_coeff_data = (*pv_mb_coeff_data), *ps_mb_coeff_data_ac;
+
+    /* no of non zero coefficients packed so far in the current sub block */
+    UWORD32 u4_nnz_cnt;
+
+    /* significant coefficient map */
+    UWORD32 u4_s_map;
+
+    /* pointer to scanning matrix */
+    const UWORD8 *pu1_scan_order;
+
+    /* number of non zeros in sub block */
+    UWORD32 u4_nnz;
+
+    /* order in which the 4x4 blocks are visited (indexes pu1_nnz and the residue rows) */
+    const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
+
+    /* temp var */
+    UWORD32 coeff_cnt, mask, b4,u4_cntrl=0;
+
+    /*DC and AC coeff pointers*/
+    WORD16 *pi2_res_mb_ac,*pi2_res_mb_dc;
+
+    /********************************************************/
+    /*  pack dc coeff data for entropy coding               */
+    /********************************************************/
+
+    pi2_res_mb_dc = pi2_res_mb;
+    pu1_scan_order = gu1_luma_scan_order_dc;
+
+    u4_nnz = *pu1_nnz; /* the first nnz entry belongs to the dc block */
+    u4_cntrl = 0;
+
+    /* write number of non zero coefficients */
+    ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
+
+    if (u4_nnz)
+    {
+        for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
+        {
+            if (pi2_res_mb_dc[pu1_scan_order[coeff_cnt]])
+            {
+                /* write residue */
+                ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb_dc[pu1_scan_order[coeff_cnt]];
+                u4_s_map |= mask;
+            }
+            mask <<= 1;
+        }
+        /* write significant coeff map into the upper 16 bits, nnz stays in the lower bits */
+        ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
+        (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
+
+        u4_cntrl = 0x00008000;// Set DC bit in ctrl code
+    }
+    else
+    {
+        (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue; /* only the nnz word is kept */
+    }
+
+    /********************************************************/
+    /*  pack ac coeff data for entropy coding               */
+    /********************************************************/
+
+    pu1_nnz ++;
+    pu1_scan_order = gu1_luma_scan_order;
+    pi2_res_mb += i4_res_strd; /*Move to AC block*/
+
+    ps_mb_coeff_data_ac = (*pv_mb_coeff_data);
+
+    for (b4 = 0; b4 < 16; b4++)
+    {
+        ps_mb_coeff_data = (*pv_mb_coeff_data);
+
+        u4_nnz = pu1_nnz[u1_scan_order[b4]];
+
+        /* Jump according to the scan order */
+        pi2_res_mb_ac = pi2_res_mb + (i4_res_strd * u1_scan_order[b4]);
+
+        /*
+         * Since this is a i16x16 block, we should not count dc coeff on
+         * individual 4x4 blocks to nnz. But due to the implementation of 16x16
+         * trans function, we add dc's nnz to u4_nnz too. Hence we adjust that
+         * here
+         */
+        u4_nnz -= (pi2_res_mb_ac[0] != 0);
+
+        /* write number of non zero coefficients */
+        ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
+
+        if (u4_nnz)
+        {
+            for (u4_nnz_cnt = 0, coeff_cnt = 1, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
+            {
+                if (pi2_res_mb_ac[pu1_scan_order[coeff_cnt]])
+                {
+                    /* write residue */
+                    ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb_ac[pu1_scan_order[coeff_cnt]];
+                    u4_s_map |= mask;
+                }
+                mask <<= 1;
+            }
+            /* write significant coeff map into the upper 16 bits, nnz stays in the lower bits */
+            ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
+            (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
+            *u1_cbp_l = 15;
+
+            u4_cntrl |= (1 << (31 - u1_scan_order[b4])); /* ac bit of block n is bit (31 - n) */
+        }
+        else
+        {
+            (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue; /* only the nnz word is kept */
+        }
+
+    }
+
+    if (!(*u1_cbp_l)) /* reads the caller's value when no ac block set it to 15 above */
+    {
+        (*pv_mb_coeff_data) = ps_mb_coeff_data_ac; /* rewind: drop the ac nnz words written above */
+    }
+
+    /* Store the cntrl signal */
+    (*pu4_cntrl) = u4_cntrl;
+    return;
+}
+
+/**
+******************************************************************************
+*
+* @brief  This function packs residue of a p16x16 luma mb for entropy coding
+*
+* @par   Description
+*  The mb is packed as sixteen 4x4 blocks. No separate dc block is packed by
+*  this function: the first row of the residue buffer and the first nnz entry
+*  are skipped before packing starts. The blocks are visited in the order
+*  given by u1_scan_order, i.e. 8x8 unit by 8x8 unit, four 4x4 blocks each.
+*
+*  For every 4x4 block one header word (i4_sig_map_nnz) is written. Its lower
+*  16 bits hold the number of non zero coefficients and, when that number is
+*  non zero, its upper 16 bits hold the significant coefficient map. The
+*  header is followed by the non zero coefficients, taken in the order of
+*  gu1_luma_scan_order. A block without coefficients contributes only its
+*  header. If none of the four blocks of an 8x8 unit is coded, the output
+*  pointer is moved back to where that unit started, so nothing is kept for
+*  it.
+*
+*  When u4_thres_resi is non zero a coefficient cost is accumulated. An 8x8
+*  unit whose cost is <= LUMA_SUB_BLOCK_SKIP_THRESHOLD is dropped, and the
+*  whole mb is dropped when the summed cost of the retained 8x8 units is
+*  <= LUMA_BLOCK_SKIP_THRESHOLD. Dropping clears the matching cbp bits,
+*  control bits and nnz entries.
+*
+* @param[in]  pi2_res_mb
+*  pointer to residue mb; block k is read from row (k + 1) of this buffer
+*
+* @param[in, out]  pv_mb_coeff_data
+*  buffer pointing to packed residue coefficients; advanced past the data
+*  that is retained
+*
+* @param[in]  i4_res_strd
+*  residual block stride
+*
+* @param[in, out]  u1_cbp_l
+*  coded block pattern luma, one bit per 8x8 unit. Bits are only or-ed in or
+*  cleared here, so the caller has to pass it initialised
+*
+* @param[in, out]   pu1_nnz
+*  number of non zero coefficients in each 4x4 unit. Entry 0 is ignored,
+*  entries 1..16 are read and are zeroed for blocks that get dropped
+*
+* @param[in]  u4_thres_resi
+*  non zero enables the cost based dropping of 8x8 units / of the mb
+*
+* @param[out] pu4_cntrl
+*  Control signal for inverse transform
+*
+* @return none
+*
+* @remarks none
+*
+******************************************************************************
+*/
+void ih264e_pack_l_mb(WORD16 *pi2_res_mb,
+                      void **pv_mb_coeff_data,
+                      WORD32 i4_res_strd,
+                      UWORD8 *u1_cbp_l,
+                      UWORD8 *pu1_nnz,
+                      UWORD32 u4_thres_resi,
+                      UWORD32 *pu4_cntrl)
+{
+    /* pointer to packed sub block buffer space: current 4x4, start of the
+     * current 8x8 unit, start of the mb */
+    tu_sblk_coeff_data_t *ps_mb_coeff_data, *ps_mb_coeff_data_b8, *ps_mb_coeff_data_mb;
+
+    /* no of non zero coefficients written so far for the current sub block */
+    UWORD32 u4_nnz_cnt;
+
+    /* significant coefficient map */
+    UWORD32 u4_s_map;
+
+    /* pointer to scanning matrix (order of coefficients inside a 4x4) */
+    const UWORD8 *pu1_scan_order = gu1_luma_scan_order;
+
+    /* number of non zeros in sub block */
+    UWORD32 u4_nnz;
+
+    /* pointer to residual sub block */
+    WORD16  *pi2_res_sb;
+
+    /* order in which the 4x4 blocks are visited; the values are the raster
+     * indices used to address pu1_nnz, the residue rows and the cntrl bits */
+    const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
+
+    /* coeff cost */
+    const UWORD8  *pu1_coeff_cost = gu1_coeff_cost;
+
+    /* temp var */
+    UWORD32 u4_mb_coeff_cost = 0, u4_b8_coeff_cost = 0, coeff_cnt, mask, u4_cntrl = 0, b4, b8;
+
+    /* temp var */
+    WORD32 i4_res_val, i4_run = -1, dcac_block;
+
+    /* When Hadamard transform is disabled, first row values are dont care, ignore them */
+    pi2_res_mb += i4_res_strd;
+
+    /* When Hadamard transform is disabled, first unit value is dont care, ignore this */
+    pu1_nnz ++;
+
+    ps_mb_coeff_data_mb = ps_mb_coeff_data_b8 = (*pv_mb_coeff_data);
+
+    /********************************************************/
+    /*  pack coeff data for entropy coding                  */
+    /********************************************************/
+
+    for (b4 = 0; b4 < 16; b4++)
+    {
+        ps_mb_coeff_data = (*pv_mb_coeff_data);
+
+        /* index of the 8x8 unit the current 4x4 block belongs to */
+        b8 = b4 >> 2;
+
+        u4_nnz = pu1_nnz[u1_scan_order[b4]];
+
+        /* Jump according to the scan order */
+        pi2_res_sb = pi2_res_mb + (i4_res_strd * u1_scan_order[b4]);
+
+        /* write number of non zero coefficients */
+        ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
+
+        if (u4_nnz)
+        {
+            /* the loop ends once u4_nnz non zero values were found, it
+             * relies on pu1_nnz being consistent with the residue data */
+            for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
+            {
+                /* number of runs of zero before, this is used to compute coeff cost */
+                i4_run++;
+
+                i4_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]];
+
+                if (i4_res_val)
+                {
+                    /* write residue */
+                    ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i4_res_val;
+                    u4_s_map |= mask;
+
+                    if (u4_thres_resi)
+                    {
+                        /* compute coeff cost: +/-1 costs according to the
+                         * preceding run (nothing for runs of 6 or more),
+                         * any larger level costs 9 */
+                        if (i4_res_val == 1 || i4_res_val == -1)
+                        {
+                            if (i4_run < 6)
+                                u4_b8_coeff_cost += pu1_coeff_cost[i4_run];
+                        }
+                        else
+                            u4_b8_coeff_cost += 9;
+
+                        i4_run = -1;
+                    }
+                }
+
+                mask <<= 1;
+            }
+
+            /* write significant coeff map */
+            ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
+
+            /* NOTE(review): with an odd u4_nnz_cnt the next header presumably
+             * starts on a 2 byte boundary - confirm the reader of this
+             * buffer copes with that */
+            (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
+
+            /* cbp */
+            *u1_cbp_l |= (1 << b8);
+
+            /* Cntrl map for inverse transform computation
+             *
+             * The intent documented here originally was to set bit
+             * (16 - u1_scan_order[b4]) instead of (31 - u1_scan_order[b4])
+             * when the only non zero coefficient is the dc coefficient.
+             *
+             * NOTE(review): coeff_cnt has been incremented at least once by
+             * the loop above, so it is never 0 here and bit (31 - index) is
+             * always the one that gets set. A dc only block leaves the loop
+             * with coeff_cnt == 1. The test is left untouched because it
+             * changes what the inverse transform stage is asked to do -
+             * confirm the intended bit layout before changing it.
+             */
+            dcac_block = (coeff_cnt == 0)?16:31;
+
+            /* unsigned constant: the shift count reaches 31 for index 0 and
+             * shifting a signed 1 into the sign bit is undefined */
+            u4_cntrl |= (1u << (dcac_block - u1_scan_order[b4]));
+        }
+        else
+        {
+            (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
+        }
+
+        /* Decide if the 8x8 unit has to be sent for entropy coding? */
+        if ((b4+1) % 4 == 0)
+        {
+            if ( u4_thres_resi && (u4_b8_coeff_cost <= LUMA_SUB_BLOCK_SKIP_THRESHOLD) &&
+                            (*u1_cbp_l & (1 << b8)) )
+            {
+                /*
+                 * When we want to reset the full 8x8 block, we have to reset
+                 * both the dc and ac coeff bits hence we have the symmetric
+                 * arrangement of bits
+                 */
+                const UWORD32 cntrl_mask_map[4] = {0xcc00cc00, 0x33003300, 0x00cc00cc, 0x00330033};
+
+                /* restore cbp */
+                *u1_cbp_l = (*u1_cbp_l & (~(1 << b8)));
+
+                /* correct cntrl flag */
+                u4_cntrl = u4_cntrl & (~cntrl_mask_map[b8]);
+
+                /* correct nnz of the four 4x4 blocks of this 8x8 unit */
+                pu1_nnz[u1_scan_order[b4 - 3]] = 0;
+                pu1_nnz[u1_scan_order[b4 - 2]] = 0;
+                pu1_nnz[u1_scan_order[b4 - 1]] = 0;
+                pu1_nnz[u1_scan_order[b4]] = 0;
+
+                /* reset blk cost, a dropped unit must not add to the mb cost */
+                u4_b8_coeff_cost = 0;
+            }
+
+            /* nothing coded in this 8x8 unit, discard what was written for it */
+            if (!(*u1_cbp_l & (1 << b8)))
+            {
+                (*pv_mb_coeff_data) = ps_mb_coeff_data_b8;
+            }
+
+            u4_mb_coeff_cost += u4_b8_coeff_cost;
+
+            /* start afresh for the next 8x8 unit */
+            u4_b8_coeff_cost = 0;
+            i4_run = -1;
+            ps_mb_coeff_data_b8 = (*pv_mb_coeff_data);
+        }
+    }
+
+    /* drop the complete mb if what is left of it is cheap enough */
+    if (u4_thres_resi && (u4_mb_coeff_cost <= LUMA_BLOCK_SKIP_THRESHOLD)
+                    && (*u1_cbp_l))
+    {
+        (*pv_mb_coeff_data) = ps_mb_coeff_data_mb;
+        *u1_cbp_l = 0;
+        u4_cntrl = 0;
+        memset(pu1_nnz, 0, 16);
+    }
+
+    (*pu4_cntrl) = u4_cntrl;
+
+    return;
+}
+
+/**
+******************************************************************************
+*
+* @brief  This function packs residue of an i8x8 chroma mb for entropy coding
+*
+* @par   Description
+*  An i8 chroma macro block contains two classes of units, dc 2x2 blocks and
+*  4x4 ac blocks. While packing the mb, the dc blocks of both planes are sent
+*  first, and the 4 ac blocks of each plane are sent next. By default U is
+*  sent before V, u4_swap_uv reverses that.
+*
+*  For every block one header word (i4_sig_map_nnz) is written. Its lower
+*  16 bits hold the number of non zero coefficients and, when that number is
+*  non zero, its upper 16 bits hold the significant coefficient map. The
+*  header is followed by the non zero coefficients in scan order. A block
+*  without coefficients contributes only its header.
+*
+*  As read by this function, the first row of pi2_res_mb holds the dc
+*  coefficients (U at offset 0, V at offset 4), rows 1..4 hold the U blocks
+*  and rows 5..8 hold the V blocks. pu1_nnz[0] / pu1_nnz[5] are the dc nnz of
+*  U / V, pu1_nnz[1..4] / pu1_nnz[6..9] are the nnz of the U / V 4x4 blocks.
+*
+*  In the control word, bit 15 / bit 14 flag the U / V dc block, bits 31..28
+*  flag the U ac blocks and bits 27..24 flag the V ac blocks.
+*
+* @param[in]  pi2_res_mb
+*  pointer to residue mb
+*
+* @param[in, out]  pv_mb_coeff_data
+*  buffer pointing to packed residue coefficients; advanced past the data
+*  that is retained
+*
+* @param[in]  i4_res_strd
+*  residual block stride
+*
+* @param[in, out]  u1_cbp_c
+*  coded block pattern chroma. Set to 1 when a dc block has coefficients and
+*  to 2 when ac coefficients of a plane are retained. It is never cleared
+*  here, so the caller has to pass it initialised
+*
+* @param[in, out]   pu1_nnz
+*  number of non zero coefficients in each unit; the 4x4 entries of a plane
+*  are zeroed when the ac data of that plane is dropped
+*
+* @param[in]   u4_thres_resi
+*  non zero enables dropping the ac data of a plane whose coefficient cost is
+*  below CHROMA_BLOCK_SKIP_THRESHOLD
+*
+* @param[out]   pu4_cntrl
+*  Control signal for inverse transform
+*
+* @param[in]   u4_swap_uv
+*  Swaps the order of U and V planes in entropy bitstream; any non zero value
+*  selects the swapped order
+*
+* @return none
+*
+* @remarks none
+*
+******************************************************************************
+*/
+void ih264e_pack_c_mb(WORD16 *pi2_res_mb,
+                      void **pv_mb_coeff_data,
+                      WORD32 i4_res_strd,
+                      UWORD8 *u1_cbp_c,
+                      UWORD8 *pu1_nnz,
+                      UWORD32 u4_thres_resi,
+                      UWORD32 *pu4_cntrl,
+                      UWORD32 u4_swap_uv)
+{
+    /* pointer to packed sub block buffer space */
+    tu_sblk_coeff_data_t *ps_mb_coeff_data = (*pv_mb_coeff_data);
+
+    /* start of the dc data and start of the ac data in the packed buffer */
+    tu_sblk_coeff_data_t *ps_mb_coeff_data_dc, *ps_mb_coeff_data_ac;
+
+    /* nnz pointer */
+    UWORD8 *pu1_nnz_ac, *pu1_nnz_dc;
+
+    /* nnz counter */
+    UWORD32 u4_nnz_cnt;
+
+    /* significant coefficient map */
+    UWORD32 u4_s_map;
+
+    /* pointer to scanning matrix */
+    const UWORD8 *pu1_scan_order;
+
+    /* no of non zero coefficients in the current sub block */
+    UWORD32 u4_nnz;
+
+    /* pointer to residual sub block, res val */
+    WORD16 *pi2_res_sb, i2_res_val;
+
+    /* temp var */
+    UWORD32 coeff_cnt, mask, b4,plane;
+
+    /* temp var */
+    UWORD32 u4_coeff_cost;
+    WORD32 i4_run;
+
+    /* coeff cost */
+    const UWORD8 *pu1_coeff_cost = gu1_coeff_cost;
+
+    /* start of the ac data of the current plane in the packed buffer */
+    UWORD32 *pu4_mb_coeff_data = NULL;
+
+    /* ac coded block pattern */
+    UWORD8 u1_cbp_ac;
+
+    /* Variable to store the current bit pos in cntrl variable*/
+    UWORD32 cntrl_pos = 0;
+
+    /* non zero while the plane being packed is the U plane */
+    UWORD32 u4_is_u_plane;
+
+    /********************************************************/
+    /*  pack dc coeff data for entropy coding               */
+    /********************************************************/
+    pu1_scan_order = gu1_chroma_scan_order_dc;
+    pi2_res_sb = pi2_res_mb;
+    pu1_nnz_dc = pu1_nnz;
+    (*pu4_cntrl) = 0;
+    cntrl_pos = 15; /* Control bit for U plane */
+    ps_mb_coeff_data_dc = (*pv_mb_coeff_data);
+
+    /* Color space conversion between SP_UV and SP_VU
+     * We always assume SP_UV for all the processing
+     * Hence to get proper stream output we need to swap U and V channels here
+     *
+     * For that there are two paths we need to look for
+     * One is the path to bitstream, these variables should have the proper
+     * input configured UV or VU
+     * For the other path the inverse transform variables should have whatever
+     * ordering the input had
+     */
+
+    if (u4_swap_uv)
+    {
+        pu1_nnz_dc += 5;/* Move to NNZ of V plane */
+        pi2_res_sb += 4;/* Move to DC coeff of V plane */
+
+        cntrl_pos = 14; /* Control bit for V plane */
+    }
+
+    for (plane = 0; plane < 2; plane++)
+    {
+        ps_mb_coeff_data = (*pv_mb_coeff_data);
+
+        u4_nnz = *pu1_nnz_dc;
+        /* write number of non zero coefficients U/V */
+        ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
+
+        if (u4_nnz)
+        {
+            for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
+            {
+                i2_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]];
+                if (i2_res_val)
+                {
+                    /* write residue U/V */
+                    ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i2_res_val;
+                    u4_s_map |= mask;
+                }
+                mask <<= 1;
+            }
+            /* write significant coeff map U/V */
+            ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
+            (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
+            *u1_cbp_c = 1;
+
+            (*pu4_cntrl) |= (1u << cntrl_pos);
+        }
+        else
+        {
+            (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
+        }
+
+        /* Step to the other plane. This is done after the first plane only:
+         * stepping again after the second one would, in the swapped case,
+         * form pointers in front of the buffers handed in by the caller. */
+        if (plane == 0)
+        {
+            if (u4_swap_uv)
+            {
+                cntrl_pos++; /* Control bit for U plane */
+                pu1_nnz_dc -= 5; /* Move to NNZ of U plane */
+                pi2_res_sb -= 4; /* Move to DC coeff of U plane */
+            }
+            else
+            {
+                cntrl_pos--; /* Control bit for V plane */
+                pu1_nnz_dc += 5; /* 4 for AC NNZ and 1 for DC */
+                pi2_res_sb += 4; /* Move to DC coeff of V plane */
+            }
+        }
+    }
+
+    /********************************************************/
+    /*  pack ac coeff data for entropy coding               */
+    /********************************************************/
+
+    /* NOTE(review): the ac loop below starts at entry 0 of this table,
+     * presumably it lists the ac positions only - confirm */
+    pu1_scan_order = gu1_chroma_scan_order;
+    ps_mb_coeff_data_ac = (*pv_mb_coeff_data);
+
+    if (u4_swap_uv)
+    {
+        pi2_res_sb = pi2_res_mb + i4_res_strd * 5; /* Move to V plane, ie 1 dc row + 4 ac rows */
+        cntrl_pos = 27; /* The control bits of the V blocks start at bit 31 - 4 */
+        pu1_nnz_ac = pu1_nnz + 6; /* Move to V block NNZ, ie 1 dc + 4 ac + 1 dc */
+    }
+    else
+    {
+        pi2_res_sb = pi2_res_mb + i4_res_strd; /* Move to U plane, ie 1 dc row */
+        cntrl_pos = 31; /* The control bits of the U blocks start at bit 31 */
+        pu1_nnz_ac = pu1_nnz + 1; /* Move to U block NNZ, ie 1 dc */
+    }
+
+    for (plane = 0; plane < 2; plane++)
+    {
+        pu4_mb_coeff_data = (*pv_mb_coeff_data);
+
+        u4_coeff_cost = 0;
+        i4_run = -1;
+
+        /* get the current cbp, so that it automatically
+         * gets reverted in case of zero ac values */
+        u1_cbp_ac = *u1_cbp_c;
+
+        for (b4 = 0; b4 < 4; b4++)
+        {
+            ps_mb_coeff_data = (*pv_mb_coeff_data);
+
+            u4_nnz = *pu1_nnz_ac;
+
+            /*
+             * We are scanning only ac coeffs, but the nnz is for the
+             * complete 4x4 block. Hence we have to discount the nnz contributed
+             * by the dc coefficient
+             */
+            u4_nnz -= (pi2_res_sb[0]!=0);
+
+            /* write number of non zero coefficients U/V */
+            ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
+
+            if (u4_nnz)
+            {
+                for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
+                {
+                    i2_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]];
+
+                    i4_run++;
+
+                    if (i2_res_val)
+                    {
+                        /* write residue U/V */
+                        ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i2_res_val;
+                        u4_s_map |= mask;
+
+                        /* the cost is not tracked any further once it has
+                         * reached the threshold */
+                        if ( u4_thres_resi && (u4_coeff_cost < CHROMA_BLOCK_SKIP_THRESHOLD) )
+                        {
+                            /* compute coeff cost: +/-1 costs according to
+                             * the preceding run (nothing for runs of 6 or
+                             * more), any larger level costs 9 */
+                            if (i2_res_val == 1 || i2_res_val == -1)
+                            {
+                                if (i4_run < 6)
+                                    u4_coeff_cost += pu1_coeff_cost[i4_run];
+                            }
+                            else
+                                u4_coeff_cost += 9;
+
+                            i4_run = -1;
+                        }
+                    }
+                    mask <<= 1;
+                }
+
+                /* write significant coeff map U/V */
+                ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
+                (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
+                u1_cbp_ac = 2;
+
+                /* unsigned constant: cntrl_pos is 31 for the first U block
+                 * and shifting a signed 1 into the sign bit is undefined */
+                (*pu4_cntrl) |= 1u << cntrl_pos;
+            }
+            else
+            {
+                (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
+            }
+
+            pu1_nnz_ac++;
+            pi2_res_sb += i4_res_strd;
+            cntrl_pos--;
+        }
+
+        /* reset block */
+        if (u4_thres_resi && (u4_coeff_cost < CHROMA_BLOCK_SKIP_THRESHOLD))
+        {
+            /* replace whatever was packed for this plane by four headers
+             * that carry nnz = 0 */
+            pu4_mb_coeff_data[0] = 0;
+            pu4_mb_coeff_data[1] = 0;
+            pu4_mb_coeff_data[2] = 0;
+            pu4_mb_coeff_data[3] = 0;
+            (*pv_mb_coeff_data) = pu4_mb_coeff_data + 4;
+
+            /* Generate the control signal */
+            /* Zero out the current plane's AC coefficients. U is packed
+             * first unless the planes are swapped; u4_swap_uv is reduced to
+             * 0/1 so that any non zero value behaves like 1, as it does in
+             * the other tests of this function */
+            u4_is_u_plane = ((plane == 0) != (u4_swap_uv != 0));
+            (*pu4_cntrl) &= (u4_is_u_plane ? 0x0FFFFFFF : 0xF0FFFFFF);
+
+            /* Similarly do for the NNZ also */
+            *(pu1_nnz_ac - 4) = 0;
+            *(pu1_nnz_ac - 3) = 0;
+            *(pu1_nnz_ac - 2) = 0;
+            *(pu1_nnz_ac - 1) = 0;
+        }
+        else
+        {
+            *u1_cbp_c = u1_cbp_ac;
+        }
+
+        if (u4_swap_uv)
+        {
+            pi2_res_sb = pi2_res_mb + i4_res_strd; /* Move to U plane, ie 1 dc row */
+            cntrl_pos = 31; /* The control bits of the U blocks start at bit 31 */
+            pu1_nnz_ac = pu1_nnz + 1; /* Move to U block NNZ, ie 1 dc */
+        }
+        else
+        {
+            /* pi2_res_sb and cntrl_pos already point at the V plane */
+            pu1_nnz_ac = pu1_nnz + 6; /* Go to nnz of V plane */
+        }
+    }
+
+    /* restore the ptr basing on cbp: nothing is kept when neither dc nor ac
+     * is coded, only the dc part is kept when no ac is coded */
+    if (*u1_cbp_c == 0)
+    {
+        (*pv_mb_coeff_data) = ps_mb_coeff_data_dc;
+    }
+    else if (*u1_cbp_c == 1)
+    {
+        (*pv_mb_coeff_data) = ps_mb_coeff_data_ac;
+    }
+
+    return ;
+}
+
+/**
+*******************************************************************************
+*
+* @brief performs luma core coding when intra mode is i16x16
+*
+* @par Description:
+*  The prediction buffer matching the chosen i16x16 mode is selected first.
+*  The error between the source mb and this prediction is then transformed
+*  (with the dc transform enabled) and quantized, and the quantized
+*  coefficients are packed for entropy coding. Finally the mb is
+*  reconstructed: through the inverse transform path when the control word
+*  returned by the packing stage requests it, by copying the prediction
+*  otherwise.
+*
+* @param[in] ps_proc
+*  pointer to the current macro block context
+*
+* @returns u1_cbp_l
+*  coded block pattern luma
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+
+UWORD8 ih264e_code_luma_intra_macroblock_16x16(process_ctxt_t *ps_proc)
+{
+    /* codec context and the quantization parameter set at index 0 */
+    codec_t *ps_codec = ps_proc->ps_codec;
+    quant_params_t *ps_qp = ps_proc->ps_qp_params[0];
+
+    /* source, reconstruction and residue buffers of the current mb */
+    UWORD8 *pu1_src = ps_proc->pu1_src_buf_luma;
+    UWORD8 *pu1_rec = ps_proc->pu1_rec_buf_luma;
+    WORD16 *pi2_res = ps_proc->pi2_res_buf;
+
+    /* prediction buffer, the plane mode has a buffer of its own */
+    UWORD8 *pu1_pred = (ps_proc->u1_l_i16_mode == PLANE_I16x16) ?
+                    ps_proc->pu1_pred_mb_intra_16x16_plane :
+                    ps_proc->pu1_pred_mb_intra_16x16;
+
+    /* strides of the buffers above */
+    WORD32 i4_strd_src = ps_proc->i4_src_strd;
+    WORD32 i4_strd_rec = ps_proc->i4_rec_strd;
+    WORD32 i4_strd_pred = ps_proc->i4_pred_strd;
+    WORD32 i4_strd_res = ps_proc->i4_res_strd;
+
+    /* coded block pattern that is returned */
+    UWORD8 u1_cbp_l = 0;
+
+    /* nnz storage, zeroed up front and handed on as a byte array */
+    UWORD32 au4_nnz[5] = {0, 0, 0, 0, 0};
+    UWORD8 *pu1_nnz = (UWORD8 *)au4_nnz;
+
+    /* control word for the inverse transform, filled in while packing */
+    UWORD32 u4_cntrl;
+
+    /* location of the pointer to the packed mb coeff data */
+    void **ppv_coeff_data = &(ps_proc->pv_mb_coeff_data);
+
+    /* residue computation, forward transform and quantization */
+    ih264e_luma_16x16_resi_trans_dctrans_quant(ps_codec, pu1_src, pu1_pred,
+                                               pi2_res, i4_strd_src,
+                                               i4_strd_pred, i4_strd_res,
+                                               ps_qp->pu2_scale_mat,
+                                               ps_qp->pu2_thres_mat,
+                                               ps_qp->u1_qbits,
+                                               ps_qp->u4_dead_zone,
+                                               pu1_nnz, ENABLE_DC_TRANSFORM);
+
+    /* pack the coefficients, this also yields cbp and the control word */
+    ih264e_pack_l_mb_i16(pi2_res, ppv_coeff_data, i4_strd_res, &u1_cbp_l,
+                         pu1_nnz, &u4_cntrl);
+
+    /*
+     * If the reference frame is not to be computed, only the right and bottom
+     * border 4x4 blocks are needed to predict the next intra blocks. The mask
+     * keeps bit 15 and bits 28, 24, 20, 19, 18, 17 and 16 of the control word.
+     */
+    if (!ps_proc->u4_compute_recon)
+    {
+        u4_cntrl &= 0x111F8000;
+    }
+
+    /* nothing left to inverse transform, the prediction is the recon */
+    if (!u4_cntrl)
+    {
+        ps_codec->pf_inter_pred_luma_copy(pu1_pred, pu1_rec, i4_strd_pred,
+                                          i4_strd_rec, MB_SIZE, MB_SIZE, NULL,
+                                          0);
+
+        return (u1_cbp_l);
+    }
+
+    /* inverse quantization, inverse transform and reconstruction */
+    ih264e_luma_16x16_idctrans_iquant_itrans_recon(
+                    ps_codec, pi2_res, pu1_pred, pu1_rec,
+                    i4_strd_res, i4_strd_pred, i4_strd_rec,
+                    ps_qp->pu2_iscale_mat,
+                    ps_qp->pu2_weigh_mat, ps_qp->u1_qp_div,
+                    u4_cntrl, ENABLE_DC_TRANSFORM,
+                    ps_proc->pv_scratch_buff);
+
+    return (u1_cbp_l);
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief performs luma core coding when intra mode is i4x4
+*
+* @par Description:
+*  If the current mb is to be coded as intra of mb type i4x4, the mb is first
+*  predicted using one of i4x4 prediction filters, basing on the intra mode
+*  chosen. Then, error is computed between the input blk and the estimated blk.
+*  This error is dct transformed and quantized. The quantized coefficients are
+*  packed in scan order for entropy coding.
+*
+* @param[in] ps_proc
+*  pointer to the current macro block context
+*
+* @returns u1_cbp_l
+*  coded block pattern luma
+*
+* @remarks
+*  The traversal of 4x4 subblocks in the 16x16 macroblock is as per the scan order
+*  mentioned in h.264 specification
+*
+*******************************************************************************
+*/
+UWORD8 ih264e_code_luma_intra_macroblock_4x4(process_ctxt_t *ps_proc)
+{
+    /* Codec Context */
+    codec_t *ps_codec = ps_proc->ps_codec;
+
+    /* pointer to ref macro block */
+    UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_luma;
+
+    /* pointer to src macro block */
+    UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
+
+    /* pointer to prediction macro block */
+    UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
+
+    /* pointer to residual macro block */
+    WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
+
+    /* strides */
+    WORD32 i4_src_strd = ps_proc->i4_src_strd;
+    WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
+    WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
+
+    /* pointer to neighbors: left, top, top-left */
+    UWORD8 *pu1_mb_a;
+    UWORD8 *pu1_mb_b;
+    UWORD8 *pu1_mb_c;
+    UWORD8 *pu1_mb_d;
+
+    /* intra mode */
+    UWORD8 u1_intra_mode = ps_proc->u1_l_i16_mode;
+
+    /* neighbor availability */
+    WORD32 i4_ngbr_avbl;
+
+    /* neighbor pels for intra prediction */
+    UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
+
+    /* coded block pattern */
+    UWORD8 u1_cbp_l = 0;
+
+    /* number of non zero coeffs*/
+    UWORD8  u1_nnz;
+
+    /* quantization parameters */
+    quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
+
+    /* pointer to packed mb coeff data */
+    void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
+
+    /* pointer to packed mb coeff data */
+    tu_sblk_coeff_data_t *ps_mb_coeff_data, *ps_mb_coeff_data_b8;
+
+    /* no of non zero coefficients in the current sub block */
+    UWORD32 u4_nnz_cnt;
+
+    /* significant coefficient map */
+    UWORD32 u4_s_map;
+
+    /* pointer to scanning matrix */
+    const UWORD8 *pu1_scan_order = gu1_luma_scan_order;
+
+    /*Dummy variable for 4x4 trans fucntion*/
+    WORD16 i2_dc_dummy;
+
+    /* temp var */
+    UWORD32 i, b8, b4, u1_blk_x, u1_blk_y, u1_pix_x, u1_pix_y, coeff_cnt, mask;
+
+    /* Process 16 4x4 lum sub-blocks of the MB in scan order */
+    for (b8 = 0; b8 < 4; b8++)
+    {
+        u1_blk_x = GET_BLK_RASTER_POS_X(b8) << 3;
+        u1_blk_y = GET_BLK_RASTER_POS_Y(b8) << 3;
+
+        /* if in case cbp for the 8x8 block is zero, send no residue */
+        ps_mb_coeff_data_b8 = *pv_mb_coeff_data;
+
+        for (b4 = 0; b4 < 4; b4++)
+        {
+            /* index of pel in MB */
+            u1_pix_x = u1_blk_x + (GET_SUB_BLK_RASTER_POS_X(b4) << 2);
+            u1_pix_y = u1_blk_y + (GET_SUB_BLK_RASTER_POS_Y(b4) << 2);
+
+            /* Initialize source and reference pointers */
+            pu1_curr_mb = ps_proc->pu1_src_buf_luma + u1_pix_x + (u1_pix_y * i4_src_strd);
+            pu1_ref_mb = ps_proc->pu1_rec_buf_luma + u1_pix_x + (u1_pix_y * i4_rec_strd);
+
+            /* pointer to left of ref macro block */
+            pu1_mb_a = pu1_ref_mb - 1;
+            /* pointer to top of ref macro block */
+            pu1_mb_b = pu1_ref_mb - i4_rec_strd;
+            /* pointer to topright of ref macro block */
+            pu1_mb_c = pu1_mb_b + 4;
+            /* pointer to topleft macro block */
+            pu1_mb_d = pu1_mb_b - 1;
+
+            /* compute neighbor availability */
+            i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
+
+            /* sub block intra mode */
+            u1_intra_mode = ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4];
+
+            /********************************************************/
+            /* gather prediction pels from neighbors for prediction */
+            /********************************************************/
+            /* left pels */
+            if (i4_ngbr_avbl & LEFT_MB_AVAILABLE_MASK)
+            {
+                for (i = 0; i < 4; i++)
+                    pu1_ngbr_pels_i4[4 - 1 - i] = pu1_mb_a[i * i4_rec_strd];
+            }
+            else
+            {
+                memset(pu1_ngbr_pels_i4, 0, 4);
+            }
+
+            /* top pels */
+            if (i4_ngbr_avbl & TOP_MB_AVAILABLE_MASK)
+            {
+                memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
+            }
+            else
+            {
+                memset(pu1_ngbr_pels_i4 + 5, 0, 4);
+            }
+            /* top left pels */
+            if (i4_ngbr_avbl & TOP_LEFT_MB_AVAILABLE_MASK)
+            {
+                pu1_ngbr_pels_i4[4] = *pu1_mb_d;
+            }
+            else
+            {
+                pu1_ngbr_pels_i4[4] = 0;
+            }
+            /* top right pels */
+            if (i4_ngbr_avbl & TOP_RIGHT_MB_AVAILABLE_MASK)
+            {
+                memcpy(pu1_ngbr_pels_i4+8+1,pu1_mb_c,4);
+            }
+            else if (i4_ngbr_avbl & TOP_MB_AVAILABLE_MASK)
+            {
+                memset(pu1_ngbr_pels_i4+8+1,pu1_ngbr_pels_i4[8],4);
+            }
+
+            /********************************************************/
+            /*  prediction                                          */
+            /********************************************************/
+            (ps_codec->apf_intra_pred_4_l)[u1_intra_mode](pu1_ngbr_pels_i4,
+                                                          pu1_pred_mb, 0,
+                                                          i4_pred_strd,
+                                                          i4_ngbr_avbl);
+
+            /********************************************************/
+            /*  error estimation,                                   */
+            /*  transform                                           */
+            /*  quantization                                        */
+            /********************************************************/
+            ps_codec->pf_resi_trans_quant_4x4(pu1_curr_mb, pu1_pred_mb,
+                                              pi2_res_mb, i4_src_strd,
+                                              i4_pred_strd,
+                                              ps_qp_params->pu2_scale_mat,
+                                              ps_qp_params->pu2_thres_mat,
+                                              ps_qp_params->u1_qbits,
+                                              ps_qp_params->u4_dead_zone,
+                                              &u1_nnz, &i2_dc_dummy);
+
+            /********************************************************/
+            /*  pack coeff data for entropy coding                  */
+            /********************************************************/
+            ps_mb_coeff_data = *pv_mb_coeff_data;
+
+            /* write number of non zero coefficients */
+            ps_mb_coeff_data->i4_sig_map_nnz = u1_nnz;
+
+            if (u1_nnz)
+            {
+                for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u1_nnz; coeff_cnt++)
+                {
+                    if (pi2_res_mb[pu1_scan_order[coeff_cnt]])
+                    {
+                        /* write residue */
+                        ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb[pu1_scan_order[coeff_cnt]];
+                        u4_s_map |= mask;
+                    }
+                    mask <<= 1;
+                }
+                /* write significant coeff map */
+                ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
+
+                /* update ptr to coeff data */
+                (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
+
+                /* cbp */
+                u1_cbp_l |= (1 << b8);
+            }
+            else
+            {
+                (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
+            }
+
+            /********************************************************/
+            /*  ierror estimation,                                  */
+            /*  itransform                                          */
+            /*  iquantization                                       */
+            /********************************************************/
+            /* If the frame is not to be used for P frame reference or dumping recon
+             * we only will use the recon for only predicting intra Mbs
+             * This will need only right and bottom edge 4x4 blocks recon
+             * Hence we selectively enable them
+             */
+            if (ps_proc->u4_compute_recon || (0xF888 & (1 << ((b8 << 2) + b4))))
+            {
+                if (u1_nnz)
+                    ps_codec->pf_iquant_itrans_recon_4x4(
+                                    pi2_res_mb, pu1_pred_mb, pu1_ref_mb,
+                                    /*No input stride,*/i4_pred_strd,
+                                    i4_rec_strd, ps_qp_params->pu2_iscale_mat,
+                                    ps_qp_params->pu2_weigh_mat,
+                                    ps_qp_params->u1_qp_div,
+                                    ps_proc->pv_scratch_buff, 0, 0);
+                else
+                    ps_codec->pf_inter_pred_luma_copy(pu1_pred_mb, pu1_ref_mb,
+                                                      i4_pred_strd, i4_rec_strd,
+                                                      BLK_SIZE, BLK_SIZE, NULL,
+                                                      0);
+            }
+
+        }
+
+        /* if the 8x8 block has no residue, nothing needs to be sent to entropy */
+        if (!(u1_cbp_l & (1 << b8)))
+        {
+            *pv_mb_coeff_data = ps_mb_coeff_data_b8;
+        }
+    }
+
+    return (u1_cbp_l);
+}
+
+/**
+*******************************************************************************
+*
+* @brief luma core coding for an i4x4 intra mb (rdopt on variant)
+*
+* @par Description:
+*  The residue and the nnz of every 4x4 sub-block are read from the process
+*  context (pi2_res_buf_intra_4x4, au4_nnz_intra_4x4); no prediction, transform
+*  or quantization is performed in this function. The non zero coefficients are
+*  packed in scan order for entropy coding and the mb at pu1_ref_mb_intra_4x4
+*  is copied to the luma recon buffer.
+*
+* @param[in] ps_proc
+*  pointer to the process context of the current macro block
+*
+* @returns u1_cbp_l
+*  coded block pattern luma (bit b8 is set when 8x8 block b8 has residue)
+*
+* @remarks
+*  The traversal of 4x4 subblocks in the 16x16 macroblock is as per the scan order
+*  mentioned in h.264 specification
+*
+*******************************************************************************
+*/
+UWORD8 ih264e_code_luma_intra_macroblock_4x4_rdopt_on(process_ctxt_t *ps_proc)
+{
+    /* Codec Context */
+    codec_t *ps_codec = ps_proc->ps_codec;
+
+    /* pointer to the intra 4x4 ref macro block that is copied to recon at the end */
+    UWORD8 *pu1_ref_mb_intra_4x4 = ps_proc->pu1_ref_mb_intra_4x4;
+
+    /* pointer to luma recon buffer */
+    UWORD8 *pu1_rec_mb = ps_proc->pu1_rec_buf_luma;
+
+    /* pointer to the intra 4x4 residue; advanced by MB_SIZE per 4x4 sub-block */
+    WORD16 *pi2_res_mb = ps_proc->pi2_res_buf_intra_4x4;
+
+    /* recon stride */
+    WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
+
+    /* number of non zero coeffs, read as one byte per 4x4 sub-block */
+    UWORD8  *pu1_nnz = (UWORD8 *)ps_proc->au4_nnz_intra_4x4;
+
+    /* coded block pattern */
+    UWORD8 u1_cbp_l = 0;
+
+    /* address of the context's packed mb coeff data write pointer */
+    void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
+
+    /* packed record of the current 4x4; write pointer saved at the start of the 8x8 */
+    tu_sblk_coeff_data_t *ps_mb_coeff_data, *ps_mb_coeff_data_b8;
+
+    /* no of non zero coefficients packed so far in the current sub block */
+    UWORD32 u4_nnz_cnt;
+
+    /* significant coefficient map */
+    UWORD32 u4_s_map;
+
+    /* pointer to scanning matrix */
+    const UWORD8 *pu1_scan_order = gu1_luma_scan_order;
+
+    /* 8x8 index, 4x4 index within the 8x8, scan position, sig map bit */
+    UWORD32 b8, b4, coeff_cnt, mask;
+
+    /* Process 16 4x4 luma sub-blocks of the MB in scan order */
+    for (b8 = 0; b8 < 4; b8++)
+    {
+        /* remember the write pointer; it is restored if this 8x8 block has no residue */
+        ps_mb_coeff_data_b8 = *pv_mb_coeff_data;
+
+        for (b4 = 0; b4 < 4; b4++, pu1_nnz++, pi2_res_mb += MB_SIZE)
+        {
+            /********************************************************/
+            /*  pack coeff data for entropy coding                  */
+            /********************************************************/
+            ps_mb_coeff_data = *pv_mb_coeff_data;
+
+            /* write number of non zero coefficients (the sig map is OR-ed in later) */
+            ps_mb_coeff_data->i4_sig_map_nnz = *pu1_nnz;
+
+            if (*pu1_nnz)
+            {
+                for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < *pu1_nnz; coeff_cnt++)
+                { /* NOTE(review): coeff_cnt is unbounded; relies on *pu1_nnz matching the residue */
+                    if (pi2_res_mb[pu1_scan_order[coeff_cnt]])
+                    {
+                        /* write residue and mark its scan position in the map */
+                        ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb[pu1_scan_order[coeff_cnt]];
+                        u4_s_map |= mask;
+                    }
+                    mask <<= 1;
+                }
+                /* write significant coeff map into bits 16 and above; nnz stays below */
+                ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
+
+                /* update ptr to coeff data: next record starts after the residues written */
+                (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
+
+                /* cbp: flag the enclosing 8x8 block */
+                u1_cbp_l |= (1 << b8);
+            }
+            else /* no residue: move the write pointer to ai2_residue only */
+            {
+                (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
+            }
+        }
+
+        /* if the 8x8 block has no residue, nothing needs to be sent to entropy */
+        if (!(u1_cbp_l & (1 << b8)))
+        {
+            *pv_mb_coeff_data = ps_mb_coeff_data_b8;
+        }
+    }
+
+    /* copy the intra 4x4 ref mb (MB_SIZE x MB_SIZE, stride MB_SIZE) into the luma recon buffer */
+    ps_codec->pf_inter_pred_luma_copy(pu1_ref_mb_intra_4x4, pu1_rec_mb, MB_SIZE, i4_rec_strd, MB_SIZE, MB_SIZE, NULL, 0);
+
+    return (u1_cbp_l);
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief performs chroma core coding for intra macro blocks
+*
+* @par Description:
+*  The prediction is not computed here; a prediction buffer of the process
+*  context is selected according to the chroma intra mode (the plane buffer for
+*  PLANE_CH_I8x8, the common intra chroma buffer otherwise). The helpers called
+*  below then handle error estimation, transform and quantization, packing of
+*  the coefficients for entropy coding, and inverse quantization, inverse
+*  transform and reconstruction into the chroma recon buffer.
+*
+* @param[in] ps_proc
+*  pointer to the process context of the current macro block
+*
+* @returns u1_cbp_c
+*  coded block pattern chroma
+*
+* @remarks
+*  The traversal of 4x4 subblocks in the 8x8 macroblock is as per the scan order
+*  mentioned in h.264 specification
+*
+*******************************************************************************
+*/
+UWORD8 ih264e_code_chroma_intra_macroblock_8x8(process_ctxt_t *ps_proc)
+{
+    /* Codec Context */
+    codec_t *ps_codec = ps_proc->ps_codec;
+
+    /* pointer to chroma recon buffer (destination of the reconstruction) */
+    UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_chroma;
+
+    /* pointer to src macro block */
+    UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma;
+
+    /* pointer to prediction macro block, selected below from the intra mode */
+    UWORD8 *pu1_pred_mb = NULL;
+
+    /* pointer to residual macro block */
+    WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
+
+    /* strides */
+    WORD32 i4_src_strd = ps_proc->i4_src_strd;
+    WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
+    WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
+    WORD32 i4_res_strd = ps_proc->i4_res_strd;
+
+    /* intra mode */
+    UWORD8 u1_intra_mode = ps_proc->u1_c_i8_mode;
+
+    /* coded block pattern */
+    UWORD8 u1_cbp_c = 0;
+
+    /* number of non zero coeffs; NOTE(review): the inter chroma path uses 10 entries - confirm size */
+    UWORD8 au1_nnz[18] = {0};
+
+    /* quantization parameters (index 1; the luma inter path uses index 0) */
+    quant_params_t *ps_qp_params = ps_proc->ps_qp_params[1];
+
+    /* Control signal for inverse transform; written by ih264e_pack_c_mb through its address */
+    UWORD32 u4_cntrl;
+
+    /* address of the context's packed mb coeff data write pointer */
+    void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
+
+    /* See if we need to swap U and V planes for entropy */
+    UWORD32 u4_swap_uv = ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_420SP_VU;
+
+    if (PLANE_CH_I8x8 == u1_intra_mode)
+    {
+        pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma_plane;
+    }
+    else
+    {
+        pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma;
+    }
+
+    /********************************************************/
+    /*  error estimation,                                   */
+    /*  transform                                           */
+    /*  quantization                                        */
+    /********************************************************/
+    ih264e_chroma_8x8_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb,
+                                               pu1_pred_mb, pi2_res_mb,
+                                               i4_src_strd, i4_pred_strd,
+                                               i4_res_strd,
+                                               ps_qp_params->pu2_scale_mat,
+                                               ps_qp_params->pu2_thres_mat,
+                                               ps_qp_params->u1_qbits,
+                                               ps_qp_params->u4_dead_zone,
+                                               au1_nnz);
+
+    /********************************************************/
+    /*  pack coeff data for entropy coding                  */
+    /********************************************************/
+    ih264e_pack_c_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_c,
+                     au1_nnz, ps_codec->u4_thres_resi, &u4_cntrl, u4_swap_uv);
+
+    /********************************************************/
+    /*  inverse error estimation,                           */
+    /*  inverse transform                                   */
+    /*  inverse quantization (u4_cntrl is passed unmasked)  */
+    /********************************************************/
+    ih264e_chroma_8x8_idctrans_iquant_itrans_recon(ps_codec, pi2_res_mb,
+                                                   pu1_pred_mb, pu1_ref_mb,
+                                                   i4_res_strd, i4_pred_strd,
+                                                   i4_rec_strd,
+                                                   ps_qp_params->pu2_iscale_mat,
+                                                   ps_qp_params->pu2_weigh_mat,
+                                                   ps_qp_params->u1_qp_div,
+                                                   u4_cntrl,
+                                                   ps_proc->pv_scratch_buff);
+    return (u1_cbp_c);
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief performs luma core coding when mode is inter
+*
+* @par Description:
+*  If the current mb is to be coded as inter the mb is predicted based on the
+*  sub mb partitions and corresponding motion vectors generated by ME. Then,
+*  error is computed between the input blk and the estimated blk. This error is
+*  transformed, quantized. The quantized coefficients are packed in scan order
+*  for entropy coding
+*
+* @param[in] ps_proc
+*  pointer to the process context of the current macro block
+*
+* @returns u1_cbp_l
+*  coded block pattern luma
+*
+* @remarks residue coding is skipped when u4_min_sad_reached != 0 and u4_min_sad == 0
+*
+*******************************************************************************
+*/
+
+UWORD8 ih264e_code_luma_inter_macroblock_16x16(process_ctxt_t *ps_proc)
+{
+    /* Codec Context */
+    codec_t *ps_codec = ps_proc->ps_codec;
+
+    /* pointer to luma recon buffer */
+    UWORD8 *pu1_rec_mb = ps_proc->pu1_rec_buf_luma;
+
+    /* pointer to src macro block */
+    UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
+
+    /* pointer to prediction macro block */
+    UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
+
+    /* pointer to residual macro block */
+    WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
+
+    /* strides */
+    WORD32 i4_src_strd = ps_proc->i4_src_strd;
+    WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
+    WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
+    WORD32 i4_res_strd = ps_proc->i4_res_strd;
+
+    /* coded block pattern */
+    UWORD8 u1_cbp_l = 0;
+
+    /* Control signal of itrans; set by ih264e_pack_l_mb or cleared in the skip branch */
+    UWORD32 u4_cntrl;
+
+    /* number of non zero coeffs, byte-wise view of au4_nnz */
+    UWORD8  *pu1_nnz = (UWORD8 *)ps_proc->au4_nnz;
+
+    /* quantization parameters (index 0; the chroma paths use index 1) */
+    quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
+
+    /* address of the context's packed mb coeff data write pointer */
+    void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
+
+    /* pseudo pred buffer; passed by address to motion comp, which may redirect it */
+    UWORD8 *pu1_pseudo_pred = pu1_pred_mb;
+
+    /* pseudo pred buffer stride; passed by address to motion comp as well */
+    WORD32 i4_pseudo_pred_strd = i4_pred_strd;
+
+    /* init nnz: clear the five words that pu1_nnz aliases */
+    ps_proc->au4_nnz[0] = 0;
+    ps_proc->au4_nnz[1] = 0;
+    ps_proc->au4_nnz[2] = 0;
+    ps_proc->au4_nnz[3] = 0;
+    ps_proc->au4_nnz[4] = 0;
+
+    /********************************************************/
+    /*  prediction                                          */
+    /********************************************************/
+    ih264e_motion_comp_luma(ps_proc, &pu1_pseudo_pred, &i4_pseudo_pred_strd);
+
+    /********************************************************/
+    /*  error estimation,                                   */
+    /*  transform                                           */
+    /*  quantization                                        */
+    /********************************************************/
+    if (ps_proc->u4_min_sad_reached == 0 || ps_proc->u4_min_sad != 0)
+    {
+        ih264e_luma_16x16_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb,
+                                                   pu1_pseudo_pred, pi2_res_mb,
+                                                   i4_src_strd,
+                                                   i4_pseudo_pred_strd,
+                                                   i4_res_strd,
+                                                   ps_qp_params->pu2_scale_mat,
+                                                   ps_qp_params->pu2_thres_mat,
+                                                   ps_qp_params->u1_qbits,
+                                                   ps_qp_params->u4_dead_zone,
+                                                   pu1_nnz,
+                                                   DISABLE_DC_TRANSFORM);
+
+        /********************************************************/
+        /*  pack coeff data for entropy coding                  */
+        /********************************************************/
+        ih264e_pack_l_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_l,
+                         pu1_nnz, ps_codec->u4_thres_resi, &u4_cntrl);
+    }
+    else /* min sad reached with a zero sad: nothing is coded or packed */
+    {
+        u1_cbp_l = 0;
+        u4_cntrl = 0;
+    }
+
+    /********************************************************/
+    /*  inverse error estimation,                           */
+    /*  inverse transform                                   */
+    /*  inverse quantization                                */
+    /********************************************************/
+
+    /* If the frame is not to be used for P frame reference or for dumping recon,
+     * the recon is only used for predicting intra MBs.
+     * This needs the recon of only the right and bottom edge 4x4 blocks.
+     * Hence we selectively enable them using the control signal (including DC)
+     */
+    if (ps_proc->u4_compute_recon != 1) /* NOTE(review): chroma inter tests !u4_compute_recon - confirm */
+    {
+        u4_cntrl &= 0x111F0000;
+    }
+
+    if (u4_cntrl)
+    {
+        ih264e_luma_16x16_idctrans_iquant_itrans_recon(
+                        ps_codec, pi2_res_mb, pu1_pseudo_pred, pu1_rec_mb,
+                        i4_res_strd, i4_pseudo_pred_strd, i4_rec_strd,
+                        ps_qp_params->pu2_iscale_mat,
+                        ps_qp_params->pu2_weigh_mat, ps_qp_params->u1_qp_div,
+                        u4_cntrl /*Cntrl*/, DISABLE_DC_TRANSFORM,
+                        ps_proc->pv_scratch_buff);
+    }
+    else /* no block enabled in the control signal: recon is a copy of the prediction */
+    {
+        ps_codec->pf_inter_pred_luma_copy(pu1_pseudo_pred, pu1_rec_mb,
+                                          i4_pseudo_pred_strd, i4_rec_strd,
+                                          MB_SIZE, MB_SIZE, NULL, 0);
+    }
+
+
+    return (u1_cbp_l);
+}
+
+/**
+*******************************************************************************
+*
+* @brief performs chroma core coding for inter macro blocks
+*
+* @par Description:
+*  If the current mb is to be coded as an inter predicted mb, prediction is done
+*  based on the sub mb partitions and corresponding motion vectors generated by ME.
+*  Then, error is computed between the input blk and the estimated blk.
+*  This error is transformed and quantized. The quantized coefficients
+*  are packed in scan order for
+*  entropy coding.
+*
+* @param[in] ps_proc
+*  pointer to the process context of the current macro block
+*
+* @returns u1_cbp_c
+*  coded block pattern chroma
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+UWORD8 ih264e_code_chroma_inter_macroblock_8x8(process_ctxt_t *ps_proc)
+{
+    /* Codec Context */
+    codec_t *ps_codec = ps_proc->ps_codec;
+
+    /* pointer to chroma recon buffer */
+    UWORD8 *pu1_rec_mb = ps_proc->pu1_rec_buf_chroma;
+
+    /* pointer to src macro block */
+    UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma;
+
+    /* pointer to prediction macro block; presumably filled by ih264e_motion_comp_chroma - confirm */
+    UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
+
+    /* pointer to residual macro block */
+    WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
+
+    /* strides */
+    WORD32 i4_src_strd = ps_proc->i4_src_strd;
+    WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
+    WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
+    WORD32 i4_res_strd = ps_proc->i4_res_strd;
+
+    /* coded block pattern */
+    UWORD8 u1_cbp_c = 0;
+
+    /* Control signal for inverse transform; written by ih264e_pack_c_mb through its address */
+    UWORD32 u4_cntrl;
+
+    /* number of non zero coeffs */
+    UWORD8 au1_nnz[10] = {0};
+
+    /* quantization parameters (index 1; the luma inter path uses index 0) */
+    quant_params_t *ps_qp_params = ps_proc->ps_qp_params[1];
+
+    /* address of the context's packed mb coeff data write pointer */
+    void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
+
+    /* See if we need to swap U and V planes for entropy */
+    UWORD32 u4_swap_uv = ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_420SP_VU;
+
+    /********************************************************/
+    /*  prediction                                          */
+    /********************************************************/
+    ih264e_motion_comp_chroma(ps_proc);
+
+    /********************************************************/
+    /*  error estimation,                                   */
+    /*  transform                                           */
+    /*  quantization                                        */
+    /********************************************************/
+    ih264e_chroma_8x8_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb,
+                                               pu1_pred_mb, pi2_res_mb,
+                                               i4_src_strd, i4_pred_strd,
+                                               i4_res_strd,
+                                               ps_qp_params->pu2_scale_mat,
+                                               ps_qp_params->pu2_thres_mat,
+                                               ps_qp_params->u1_qbits,
+                                               ps_qp_params->u4_dead_zone,
+                                               au1_nnz);
+
+    /********************************************************/
+    /*  pack coeff data for entropy coding                  */
+    /********************************************************/
+    ih264e_pack_c_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_c,
+                     au1_nnz, ps_codec->u4_thres_resi, &u4_cntrl, u4_swap_uv);
+
+    /********************************************************/
+    /*  inverse error estimation,                           */
+    /*  inverse transform                                   */
+    /*  inverse quantization                                */
+    /********************************************************/
+
+    /* If the frame is not to be used for P frame reference or for dumping recon,
+     * the recon is only used for predicting intra MBs.
+     * This needs the recon of only the right and bottom edge 4x4 blocks.
+     * Hence we selectively enable them using the control signal (including DC)
+     */
+    if (!ps_proc->u4_compute_recon)
+    {
+        u4_cntrl &= 0x7700C000;
+    }
+
+    if (u4_cntrl)
+    {
+        ih264e_chroma_8x8_idctrans_iquant_itrans_recon(
+                        ps_codec, pi2_res_mb, pu1_pred_mb, pu1_rec_mb,
+                        i4_res_strd, i4_pred_strd, i4_rec_strd,
+                        ps_qp_params->pu2_iscale_mat,
+                        ps_qp_params->pu2_weigh_mat, ps_qp_params->u1_qp_div,
+                        u4_cntrl, ps_proc->pv_scratch_buff);
+    }
+    else /* no block enabled: recon is a copy of the prediction; dims presumably ht, wd - confirm */
+    {
+        ps_codec->pf_inter_pred_luma_copy(pu1_pred_mb, pu1_rec_mb, i4_pred_strd,
+                                          i4_rec_strd, MB_SIZE >> 1, MB_SIZE,
+                                          NULL, 0);
+    }
+
+    return (u1_cbp_c);
+}
author	Hamsalekha S <hamsalekha.s@ittiam.com>	2015-03-13 21:24:58 +0530
committer	Hamsalekha S <hamsalekha.s@ittiam.com>	2015-04-02 15:59:02 +0530
commit	8d3d303c7942ced6a987a52db8977d768dc3605f (patch)
tree	cc806c96794356996b13ba9970941d0aed74a97e /encoder/ih264e_core_coding.c
parent	3956d913d37327dcb340f836e604b04bd478b158 (diff)
download	android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.tar.gz android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.tar.bz2 android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.zip